diff --git "a/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-156526/trainer_state.json" "b/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-156526/trainer_state.json" new file mode 100644--- /dev/null +++ "b/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-156526/trainer_state.json" @@ -0,0 +1,109597 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 156526, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.388714973870156e-05, + "grad_norm": 0.3715410828590393, + "learning_rate": 9.999999974822838e-05, + "loss": 1.0516, + "step": 10 + }, + { + "epoch": 0.00012777429947740312, + "grad_norm": 0.5814082026481628, + "learning_rate": 9.99999989929135e-05, + "loss": 1.1442, + "step": 20 + }, + { + "epoch": 0.00019166144921610467, + "grad_norm": 0.5080631375312805, + "learning_rate": 9.999999773405537e-05, + "loss": 0.786, + "step": 30 + }, + { + "epoch": 0.00025554859895480624, + "grad_norm": 0.9986467361450195, + "learning_rate": 9.999999636441773e-05, + "loss": 0.9332, + "step": 40 + }, + { + "epoch": 0.00031943574869350776, + "grad_norm": 0.9622060060501099, + "learning_rate": 9.99999941991818e-05, + "loss": 1.0711, + "step": 50 + }, + { + "epoch": 0.00038332289843220934, + "grad_norm": 0.7242945432662964, + "learning_rate": 9.999999153040267e-05, + "loss": 1.0533, + "step": 60 + }, + { + "epoch": 0.0004472100481709109, + "grad_norm": 0.6160323619842529, + "learning_rate": 9.999998835808037e-05, + "loss": 1.0065, + "step": 70 + }, + { + "epoch": 0.0005110971979096125, + "grad_norm": 1.1578069925308228, + "learning_rate": 9.999998468221492e-05, + "loss": 1.0088, + "step": 80 + }, + { + "epoch": 0.0005749843476483141, + "grad_norm": 0.7251982092857361, + "learning_rate": 9.999998050280638e-05, + "loss": 0.9858, + "step": 90 + }, + { + "epoch": 0.0006388714973870155, + "grad_norm": 0.9950217604637146, + "learning_rate": 9.999997581985477e-05, + "loss": 1.0171, + "step": 100 + }, + { + "epoch": 0.0007027586471257171, + "grad_norm": 1.055307388305664, + "learning_rate": 9.999997063336015e-05, + "loss": 0.993, + "step": 110 + }, + { + "epoch": 0.0007666457968644187, + "grad_norm": 0.8479915857315063, + "learning_rate": 9.999996494332258e-05, + "loss": 0.9431, + "step": 120 + }, + { + "epoch": 0.0008305329466031203, + "grad_norm": 0.8658767938613892, + "learning_rate": 9.999995874974209e-05, + "loss": 0.8251, + "step": 130 + }, + { + "epoch": 0.0008944200963418218, + "grad_norm": 2.8936755657196045, + "learning_rate": 9.999995205261878e-05, + "loss": 1.2926, + "step": 140 + }, + { + "epoch": 0.0009583072460805234, + "grad_norm": 0.5815268158912659, + "learning_rate": 9.999994485195268e-05, + "loss": 1.0419, + "step": 150 + }, + { + "epoch": 0.001022194395819225, + "grad_norm": 1.1052826642990112, + "learning_rate": 9.999993714774389e-05, + "loss": 1.1736, + "step": 160 + }, + { + "epoch": 0.0010860815455579266, + "grad_norm": 1.4575506448745728, + "learning_rate": 9.999992893999246e-05, + "loss": 1.129, + "step": 170 + }, + { + "epoch": 0.0011499686952966281, + "grad_norm": 0.6741073131561279, + "learning_rate": 9.99999202286985e-05, + "loss": 1.0822, + "step": 180 + }, + { + "epoch": 0.0012138558450353297, + "grad_norm": 0.7011894583702087, + "learning_rate": 9.99999110138621e-05, + "loss": 0.8165, + "step": 190 + }, + { + "epoch": 0.001277742994774031, + "grad_norm": 0.812254011631012, + "learning_rate": 9.999990129548333e-05, + "loss": 0.901, + 
"step": 200 + }, + { + "epoch": 0.0013416301445127326, + "grad_norm": 2.268691301345825, + "learning_rate": 9.99998910735623e-05, + "loss": 1.0642, + "step": 210 + }, + { + "epoch": 0.0014055172942514342, + "grad_norm": 1.1439971923828125, + "learning_rate": 9.999988034809911e-05, + "loss": 1.0481, + "step": 220 + }, + { + "epoch": 0.0014694044439901358, + "grad_norm": 0.953459620475769, + "learning_rate": 9.999986911909385e-05, + "loss": 0.9192, + "step": 230 + }, + { + "epoch": 0.0015332915937288374, + "grad_norm": 1.2826868295669556, + "learning_rate": 9.999985738654666e-05, + "loss": 0.9682, + "step": 240 + }, + { + "epoch": 0.001597178743467539, + "grad_norm": 0.9371192455291748, + "learning_rate": 9.999984515045768e-05, + "loss": 1.0604, + "step": 250 + }, + { + "epoch": 0.0016610658932062405, + "grad_norm": 1.085310459136963, + "learning_rate": 9.999983241082698e-05, + "loss": 0.9281, + "step": 260 + }, + { + "epoch": 0.001724953042944942, + "grad_norm": 0.906535267829895, + "learning_rate": 9.99998191676547e-05, + "loss": 0.9776, + "step": 270 + }, + { + "epoch": 0.0017888401926836437, + "grad_norm": 1.267155647277832, + "learning_rate": 9.9999805420941e-05, + "loss": 1.0792, + "step": 280 + }, + { + "epoch": 0.0018527273424223452, + "grad_norm": 0.6740846037864685, + "learning_rate": 9.9999791170686e-05, + "loss": 0.8078, + "step": 290 + }, + { + "epoch": 0.0019166144921610468, + "grad_norm": 1.0244088172912598, + "learning_rate": 9.999977641688985e-05, + "loss": 0.8445, + "step": 300 + }, + { + "epoch": 0.001980501641899748, + "grad_norm": 1.1094986200332642, + "learning_rate": 9.99997611595527e-05, + "loss": 0.9578, + "step": 310 + }, + { + "epoch": 0.00204438879163845, + "grad_norm": 1.1098616123199463, + "learning_rate": 9.99997453986747e-05, + "loss": 0.968, + "step": 320 + }, + { + "epoch": 0.0021082759413771513, + "grad_norm": 0.9596286416053772, + "learning_rate": 9.9999729134256e-05, + "loss": 1.0052, + "step": 330 + }, + { + "epoch": 0.002172163091115853, + "grad_norm": 1.476741075515747, + "learning_rate": 9.999971236629676e-05, + "loss": 1.0505, + "step": 340 + }, + { + "epoch": 0.0022360502408545545, + "grad_norm": 1.221603274345398, + "learning_rate": 9.999969509479718e-05, + "loss": 0.7985, + "step": 350 + }, + { + "epoch": 0.0022999373905932563, + "grad_norm": 1.42596435546875, + "learning_rate": 9.99996773197574e-05, + "loss": 1.0444, + "step": 360 + }, + { + "epoch": 0.0023638245403319576, + "grad_norm": 2.004958152770996, + "learning_rate": 9.999965904117762e-05, + "loss": 0.9776, + "step": 370 + }, + { + "epoch": 0.0024277116900706594, + "grad_norm": 0.6965352892875671, + "learning_rate": 9.999964025905801e-05, + "loss": 0.8532, + "step": 380 + }, + { + "epoch": 0.0024915988398093608, + "grad_norm": 0.995827853679657, + "learning_rate": 9.999962097339879e-05, + "loss": 1.1039, + "step": 390 + }, + { + "epoch": 0.002555485989548062, + "grad_norm": 4.0594635009765625, + "learning_rate": 9.999960118420011e-05, + "loss": 0.954, + "step": 400 + }, + { + "epoch": 0.002619373139286764, + "grad_norm": 1.0161226987838745, + "learning_rate": 9.99995808914622e-05, + "loss": 0.9513, + "step": 410 + }, + { + "epoch": 0.0026832602890254653, + "grad_norm": 1.1862547397613525, + "learning_rate": 9.999956009518525e-05, + "loss": 0.8693, + "step": 420 + }, + { + "epoch": 0.002747147438764167, + "grad_norm": 1.0898480415344238, + "learning_rate": 9.999953879536947e-05, + "loss": 1.0837, + "step": 430 + }, + { + "epoch": 0.0028110345885028684, + "grad_norm": 0.5398057103157043, + 
"learning_rate": 9.999951699201509e-05, + "loss": 0.8603, + "step": 440 + }, + { + "epoch": 0.00287492173824157, + "grad_norm": 0.825309157371521, + "learning_rate": 9.999949468512231e-05, + "loss": 1.1857, + "step": 450 + }, + { + "epoch": 0.0029388088879802716, + "grad_norm": 1.3066387176513672, + "learning_rate": 9.999947187469137e-05, + "loss": 0.918, + "step": 460 + }, + { + "epoch": 0.0030026960377189734, + "grad_norm": 1.3756886720657349, + "learning_rate": 9.999944856072248e-05, + "loss": 0.9851, + "step": 470 + }, + { + "epoch": 0.0030665831874576747, + "grad_norm": 1.39496648311615, + "learning_rate": 9.99994247432159e-05, + "loss": 0.9551, + "step": 480 + }, + { + "epoch": 0.0031304703371963765, + "grad_norm": 1.3940093517303467, + "learning_rate": 9.999940042217184e-05, + "loss": 0.9794, + "step": 490 + }, + { + "epoch": 0.003194357486935078, + "grad_norm": 0.708533763885498, + "learning_rate": 9.999937559759059e-05, + "loss": 0.8929, + "step": 500 + }, + { + "epoch": 0.0032582446366737792, + "grad_norm": 1.2305490970611572, + "learning_rate": 9.999935026947235e-05, + "loss": 0.8819, + "step": 510 + }, + { + "epoch": 0.003322131786412481, + "grad_norm": 1.3446779251098633, + "learning_rate": 9.99993244378174e-05, + "loss": 1.5002, + "step": 520 + }, + { + "epoch": 0.0033860189361511824, + "grad_norm": 1.0329649448394775, + "learning_rate": 9.9999298102626e-05, + "loss": 0.9671, + "step": 530 + }, + { + "epoch": 0.003449906085889884, + "grad_norm": 1.0138870477676392, + "learning_rate": 9.99992712638984e-05, + "loss": 0.9673, + "step": 540 + }, + { + "epoch": 0.0035137932356285855, + "grad_norm": 0.76459139585495, + "learning_rate": 9.999924392163491e-05, + "loss": 1.1123, + "step": 550 + }, + { + "epoch": 0.0035776803853672873, + "grad_norm": 1.9452675580978394, + "learning_rate": 9.999921607583576e-05, + "loss": 0.8708, + "step": 560 + }, + { + "epoch": 0.0036415675351059887, + "grad_norm": 0.7392802834510803, + "learning_rate": 9.999918772650126e-05, + "loss": 1.0164, + "step": 570 + }, + { + "epoch": 0.0037054546848446905, + "grad_norm": 1.3913438320159912, + "learning_rate": 9.999915887363167e-05, + "loss": 1.0721, + "step": 580 + }, + { + "epoch": 0.003769341834583392, + "grad_norm": 0.5592684745788574, + "learning_rate": 9.99991295172273e-05, + "loss": 1.0308, + "step": 590 + }, + { + "epoch": 0.0038332289843220936, + "grad_norm": 1.1413140296936035, + "learning_rate": 9.999909965728845e-05, + "loss": 0.8317, + "step": 600 + }, + { + "epoch": 0.003897116134060795, + "grad_norm": 1.4501404762268066, + "learning_rate": 9.99990692938154e-05, + "loss": 0.8625, + "step": 610 + }, + { + "epoch": 0.003961003283799496, + "grad_norm": 0.7926478981971741, + "learning_rate": 9.999903842680846e-05, + "loss": 1.0851, + "step": 620 + }, + { + "epoch": 0.004024890433538198, + "grad_norm": 2.0299394130706787, + "learning_rate": 9.999900705626797e-05, + "loss": 0.8804, + "step": 630 + }, + { + "epoch": 0.0040887775832769, + "grad_norm": 0.8396856188774109, + "learning_rate": 9.99989751821942e-05, + "loss": 0.9552, + "step": 640 + }, + { + "epoch": 0.004152664733015601, + "grad_norm": 1.275235891342163, + "learning_rate": 9.999894280458752e-05, + "loss": 0.9358, + "step": 650 + }, + { + "epoch": 0.004216551882754303, + "grad_norm": 0.8721204400062561, + "learning_rate": 9.999890992344821e-05, + "loss": 0.8874, + "step": 660 + }, + { + "epoch": 0.004280439032493004, + "grad_norm": 0.6353357434272766, + "learning_rate": 9.999887653877663e-05, + "loss": 1.1176, + "step": 670 + }, + { + 
"epoch": 0.004344326182231706, + "grad_norm": 1.0310698747634888, + "learning_rate": 9.999884265057311e-05, + "loss": 0.9272, + "step": 680 + }, + { + "epoch": 0.004408213331970407, + "grad_norm": 0.8742356896400452, + "learning_rate": 9.999880825883798e-05, + "loss": 0.9773, + "step": 690 + }, + { + "epoch": 0.004472100481709109, + "grad_norm": 0.9229012131690979, + "learning_rate": 9.99987733635716e-05, + "loss": 1.0118, + "step": 700 + }, + { + "epoch": 0.004535987631447811, + "grad_norm": 1.0641270875930786, + "learning_rate": 9.999873796477433e-05, + "loss": 0.9236, + "step": 710 + }, + { + "epoch": 0.0045998747811865125, + "grad_norm": 1.2784768342971802, + "learning_rate": 9.99987020624465e-05, + "loss": 1.3926, + "step": 720 + }, + { + "epoch": 0.004663761930925213, + "grad_norm": 0.919906497001648, + "learning_rate": 9.999866565658848e-05, + "loss": 0.9255, + "step": 730 + }, + { + "epoch": 0.004727649080663915, + "grad_norm": 1.3947570323944092, + "learning_rate": 9.999862874720065e-05, + "loss": 0.9953, + "step": 740 + }, + { + "epoch": 0.004791536230402617, + "grad_norm": 1.0191991329193115, + "learning_rate": 9.999859133428338e-05, + "loss": 0.9042, + "step": 750 + }, + { + "epoch": 0.004855423380141319, + "grad_norm": 5.101995944976807, + "learning_rate": 9.999855341783703e-05, + "loss": 1.0483, + "step": 760 + }, + { + "epoch": 0.00491931052988002, + "grad_norm": 1.350167155265808, + "learning_rate": 9.9998514997862e-05, + "loss": 1.0955, + "step": 770 + }, + { + "epoch": 0.0049831976796187215, + "grad_norm": 2.278700351715088, + "learning_rate": 9.999847607435866e-05, + "loss": 0.9322, + "step": 780 + }, + { + "epoch": 0.005047084829357423, + "grad_norm": 0.9818449020385742, + "learning_rate": 9.999843664732743e-05, + "loss": 0.8905, + "step": 790 + }, + { + "epoch": 0.005110971979096124, + "grad_norm": 1.0081336498260498, + "learning_rate": 9.999839671676865e-05, + "loss": 1.0194, + "step": 800 + }, + { + "epoch": 0.005174859128834826, + "grad_norm": 1.2605959177017212, + "learning_rate": 9.999835628268279e-05, + "loss": 0.9553, + "step": 810 + }, + { + "epoch": 0.005238746278573528, + "grad_norm": 0.9298542737960815, + "learning_rate": 9.999831534507022e-05, + "loss": 0.7657, + "step": 820 + }, + { + "epoch": 0.00530263342831223, + "grad_norm": 1.2965129613876343, + "learning_rate": 9.999827390393136e-05, + "loss": 0.7605, + "step": 830 + }, + { + "epoch": 0.0053665205780509305, + "grad_norm": 0.6737092137336731, + "learning_rate": 9.999823195926663e-05, + "loss": 1.3731, + "step": 840 + }, + { + "epoch": 0.005430407727789632, + "grad_norm": 1.1260855197906494, + "learning_rate": 9.999818951107644e-05, + "loss": 1.1665, + "step": 850 + }, + { + "epoch": 0.005494294877528334, + "grad_norm": 0.9080353379249573, + "learning_rate": 9.999814655936123e-05, + "loss": 1.053, + "step": 860 + }, + { + "epoch": 0.005558182027267036, + "grad_norm": 0.7714121341705322, + "learning_rate": 9.999810310412146e-05, + "loss": 1.0622, + "step": 870 + }, + { + "epoch": 0.005622069177005737, + "grad_norm": 1.5367814302444458, + "learning_rate": 9.99980591453575e-05, + "loss": 0.832, + "step": 880 + }, + { + "epoch": 0.005685956326744439, + "grad_norm": 0.8397789597511292, + "learning_rate": 9.999801468306984e-05, + "loss": 0.8085, + "step": 890 + }, + { + "epoch": 0.00574984347648314, + "grad_norm": 1.5233057737350464, + "learning_rate": 9.999796971725892e-05, + "loss": 0.7896, + "step": 900 + }, + { + "epoch": 0.005813730626221841, + "grad_norm": 0.987886369228363, + "learning_rate": 
9.99979242479252e-05, + "loss": 1.2048, + "step": 910 + }, + { + "epoch": 0.005877617775960543, + "grad_norm": 0.8574057817459106, + "learning_rate": 9.999787827506911e-05, + "loss": 1.1049, + "step": 920 + }, + { + "epoch": 0.005941504925699245, + "grad_norm": 0.8249441385269165, + "learning_rate": 9.999783179869114e-05, + "loss": 0.8109, + "step": 930 + }, + { + "epoch": 0.006005392075437947, + "grad_norm": 1.311522364616394, + "learning_rate": 9.999778481879175e-05, + "loss": 1.0632, + "step": 940 + }, + { + "epoch": 0.006069279225176648, + "grad_norm": 0.7848984599113464, + "learning_rate": 9.999773733537141e-05, + "loss": 0.8783, + "step": 950 + }, + { + "epoch": 0.006133166374915349, + "grad_norm": 1.3800158500671387, + "learning_rate": 9.999768934843062e-05, + "loss": 1.0681, + "step": 960 + }, + { + "epoch": 0.006197053524654051, + "grad_norm": 2.259437322616577, + "learning_rate": 9.999764085796981e-05, + "loss": 1.0596, + "step": 970 + }, + { + "epoch": 0.006260940674392753, + "grad_norm": 0.876724123954773, + "learning_rate": 9.999759186398951e-05, + "loss": 0.9664, + "step": 980 + }, + { + "epoch": 0.006324827824131454, + "grad_norm": 0.6863592267036438, + "learning_rate": 9.999754236649023e-05, + "loss": 0.9189, + "step": 990 + }, + { + "epoch": 0.006388714973870156, + "grad_norm": 1.7007731199264526, + "learning_rate": 9.999749236547242e-05, + "loss": 1.3958, + "step": 1000 + }, + { + "epoch": 0.0064526021236088575, + "grad_norm": 1.7878336906433105, + "learning_rate": 9.999744186093662e-05, + "loss": 0.689, + "step": 1010 + }, + { + "epoch": 0.0065164892733475584, + "grad_norm": 0.8811324238777161, + "learning_rate": 9.999739085288333e-05, + "loss": 1.0409, + "step": 1020 + }, + { + "epoch": 0.00658037642308626, + "grad_norm": 0.7681977152824402, + "learning_rate": 9.999733934131305e-05, + "loss": 0.6836, + "step": 1030 + }, + { + "epoch": 0.006644263572824962, + "grad_norm": 0.9528589844703674, + "learning_rate": 9.999728732622631e-05, + "loss": 0.8524, + "step": 1040 + }, + { + "epoch": 0.006708150722563664, + "grad_norm": 0.8264364004135132, + "learning_rate": 9.999723480762365e-05, + "loss": 1.2183, + "step": 1050 + }, + { + "epoch": 0.006772037872302365, + "grad_norm": 0.740313708782196, + "learning_rate": 9.999718178550556e-05, + "loss": 1.0717, + "step": 1060 + }, + { + "epoch": 0.0068359250220410665, + "grad_norm": 1.0919981002807617, + "learning_rate": 9.99971282598726e-05, + "loss": 0.8355, + "step": 1070 + }, + { + "epoch": 0.006899812171779768, + "grad_norm": 1.0758978128433228, + "learning_rate": 9.999707423072531e-05, + "loss": 0.853, + "step": 1080 + }, + { + "epoch": 0.00696369932151847, + "grad_norm": 1.2707561254501343, + "learning_rate": 9.999701969806424e-05, + "loss": 1.0517, + "step": 1090 + }, + { + "epoch": 0.007027586471257171, + "grad_norm": 0.8416491150856018, + "learning_rate": 9.99969646618899e-05, + "loss": 0.9035, + "step": 1100 + }, + { + "epoch": 0.007091473620995873, + "grad_norm": 1.4568763971328735, + "learning_rate": 9.99969091222029e-05, + "loss": 0.8714, + "step": 1110 + }, + { + "epoch": 0.007155360770734575, + "grad_norm": 1.4576863050460815, + "learning_rate": 9.999685307900376e-05, + "loss": 0.8144, + "step": 1120 + }, + { + "epoch": 0.007219247920473276, + "grad_norm": 1.0689126253128052, + "learning_rate": 9.999679653229304e-05, + "loss": 0.925, + "step": 1130 + }, + { + "epoch": 0.007283135070211977, + "grad_norm": 1.1141548156738281, + "learning_rate": 9.999673948207134e-05, + "loss": 1.1567, + "step": 1140 + }, + { + "epoch": 
0.007347022219950679, + "grad_norm": 0.8566306829452515, + "learning_rate": 9.999668192833922e-05, + "loss": 0.9069, + "step": 1150 + }, + { + "epoch": 0.007410909369689381, + "grad_norm": 0.7586050629615784, + "learning_rate": 9.999662387109728e-05, + "loss": 0.9713, + "step": 1160 + }, + { + "epoch": 0.007474796519428082, + "grad_norm": 0.6867004036903381, + "learning_rate": 9.999656531034604e-05, + "loss": 0.9686, + "step": 1170 + }, + { + "epoch": 0.007538683669166784, + "grad_norm": 0.9020546078681946, + "learning_rate": 9.999650624608617e-05, + "loss": 0.8857, + "step": 1180 + }, + { + "epoch": 0.007602570818905485, + "grad_norm": 0.6556907892227173, + "learning_rate": 9.999644667831822e-05, + "loss": 0.7392, + "step": 1190 + }, + { + "epoch": 0.007666457968644187, + "grad_norm": 0.8906095027923584, + "learning_rate": 9.99963866070428e-05, + "loss": 0.9946, + "step": 1200 + }, + { + "epoch": 0.007730345118382888, + "grad_norm": 0.776619017124176, + "learning_rate": 9.99963260322605e-05, + "loss": 0.8289, + "step": 1210 + }, + { + "epoch": 0.00779423226812159, + "grad_norm": 0.7643131613731384, + "learning_rate": 9.999626495397197e-05, + "loss": 1.1158, + "step": 1220 + }, + { + "epoch": 0.007858119417860292, + "grad_norm": 0.6919121742248535, + "learning_rate": 9.999620337217778e-05, + "loss": 0.713, + "step": 1230 + }, + { + "epoch": 0.007922006567598993, + "grad_norm": 0.9334784150123596, + "learning_rate": 9.999614128687857e-05, + "loss": 1.1716, + "step": 1240 + }, + { + "epoch": 0.007985893717337695, + "grad_norm": 1.4314568042755127, + "learning_rate": 9.999607869807496e-05, + "loss": 1.1029, + "step": 1250 + }, + { + "epoch": 0.008049780867076396, + "grad_norm": 1.5527832508087158, + "learning_rate": 9.99960156057676e-05, + "loss": 0.9211, + "step": 1260 + }, + { + "epoch": 0.008113668016815097, + "grad_norm": 0.7879507541656494, + "learning_rate": 9.999595200995711e-05, + "loss": 1.0019, + "step": 1270 + }, + { + "epoch": 0.0081775551665538, + "grad_norm": 1.07510244846344, + "learning_rate": 9.999588791064412e-05, + "loss": 0.9774, + "step": 1280 + }, + { + "epoch": 0.0082414423162925, + "grad_norm": 0.6843255162239075, + "learning_rate": 9.999582330782928e-05, + "loss": 0.9584, + "step": 1290 + }, + { + "epoch": 0.008305329466031202, + "grad_norm": 1.3522765636444092, + "learning_rate": 9.999575820151326e-05, + "loss": 0.8153, + "step": 1300 + }, + { + "epoch": 0.008369216615769904, + "grad_norm": 0.546192467212677, + "learning_rate": 9.99956925916967e-05, + "loss": 1.19, + "step": 1310 + }, + { + "epoch": 0.008433103765508605, + "grad_norm": 0.7880367636680603, + "learning_rate": 9.999562647838026e-05, + "loss": 1.017, + "step": 1320 + }, + { + "epoch": 0.008496990915247308, + "grad_norm": 0.9877641201019287, + "learning_rate": 9.999555986156461e-05, + "loss": 1.1224, + "step": 1330 + }, + { + "epoch": 0.008560878064986009, + "grad_norm": 1.541818618774414, + "learning_rate": 9.999549274125042e-05, + "loss": 0.8995, + "step": 1340 + }, + { + "epoch": 0.00862476521472471, + "grad_norm": 0.7599831223487854, + "learning_rate": 9.999542511743836e-05, + "loss": 1.0069, + "step": 1350 + }, + { + "epoch": 0.008688652364463412, + "grad_norm": 1.1491132974624634, + "learning_rate": 9.999535699012912e-05, + "loss": 0.882, + "step": 1360 + }, + { + "epoch": 0.008752539514202113, + "grad_norm": 0.8400082588195801, + "learning_rate": 9.999528835932339e-05, + "loss": 0.9932, + "step": 1370 + }, + { + "epoch": 0.008816426663940814, + "grad_norm": 0.897087037563324, + "learning_rate": 
9.999521922502185e-05, + "loss": 0.8435, + "step": 1380 + }, + { + "epoch": 0.008880313813679517, + "grad_norm": 1.469058632850647, + "learning_rate": 9.99951495872252e-05, + "loss": 1.1746, + "step": 1390 + }, + { + "epoch": 0.008944200963418218, + "grad_norm": 0.7353557348251343, + "learning_rate": 9.999507944593413e-05, + "loss": 0.8944, + "step": 1400 + }, + { + "epoch": 0.009008088113156919, + "grad_norm": 0.8597003817558289, + "learning_rate": 9.999500880114938e-05, + "loss": 1.1237, + "step": 1410 + }, + { + "epoch": 0.009071975262895621, + "grad_norm": 1.2774052619934082, + "learning_rate": 9.999493765287164e-05, + "loss": 1.0234, + "step": 1420 + }, + { + "epoch": 0.009135862412634322, + "grad_norm": 1.0299676656723022, + "learning_rate": 9.99948660011016e-05, + "loss": 1.2595, + "step": 1430 + }, + { + "epoch": 0.009199749562373025, + "grad_norm": 0.6526196002960205, + "learning_rate": 9.999479384584003e-05, + "loss": 0.9003, + "step": 1440 + }, + { + "epoch": 0.009263636712111726, + "grad_norm": 0.9184065461158752, + "learning_rate": 9.999472118708763e-05, + "loss": 0.8782, + "step": 1450 + }, + { + "epoch": 0.009327523861850427, + "grad_norm": 1.0141165256500244, + "learning_rate": 9.999464802484513e-05, + "loss": 0.8616, + "step": 1460 + }, + { + "epoch": 0.00939141101158913, + "grad_norm": 1.9449567794799805, + "learning_rate": 9.999457435911328e-05, + "loss": 0.9921, + "step": 1470 + }, + { + "epoch": 0.00945529816132783, + "grad_norm": 0.9585944414138794, + "learning_rate": 9.99945001898928e-05, + "loss": 0.9861, + "step": 1480 + }, + { + "epoch": 0.009519185311066531, + "grad_norm": 1.201170802116394, + "learning_rate": 9.999442551718448e-05, + "loss": 1.3192, + "step": 1490 + }, + { + "epoch": 0.009583072460805234, + "grad_norm": 0.8674155473709106, + "learning_rate": 9.999435034098901e-05, + "loss": 0.8345, + "step": 1500 + }, + { + "epoch": 0.009646959610543935, + "grad_norm": 1.0349905490875244, + "learning_rate": 9.999427466130721e-05, + "loss": 1.1643, + "step": 1510 + }, + { + "epoch": 0.009710846760282638, + "grad_norm": 0.8286603689193726, + "learning_rate": 9.99941984781398e-05, + "loss": 1.0253, + "step": 1520 + }, + { + "epoch": 0.009774733910021339, + "grad_norm": 1.2230565547943115, + "learning_rate": 9.999412179148756e-05, + "loss": 1.1343, + "step": 1530 + }, + { + "epoch": 0.00983862105976004, + "grad_norm": 0.7413927912712097, + "learning_rate": 9.999404460135126e-05, + "loss": 0.9257, + "step": 1540 + }, + { + "epoch": 0.009902508209498742, + "grad_norm": 1.2482092380523682, + "learning_rate": 9.999396690773169e-05, + "loss": 1.2573, + "step": 1550 + }, + { + "epoch": 0.009966395359237443, + "grad_norm": 1.8260524272918701, + "learning_rate": 9.99938887106296e-05, + "loss": 0.9978, + "step": 1560 + }, + { + "epoch": 0.010030282508976144, + "grad_norm": 0.7294577956199646, + "learning_rate": 9.999381001004582e-05, + "loss": 0.8249, + "step": 1570 + }, + { + "epoch": 0.010094169658714847, + "grad_norm": 0.8026980757713318, + "learning_rate": 9.999373080598112e-05, + "loss": 0.9687, + "step": 1580 + }, + { + "epoch": 0.010158056808453548, + "grad_norm": 0.9354428052902222, + "learning_rate": 9.99936510984363e-05, + "loss": 1.0309, + "step": 1590 + }, + { + "epoch": 0.010221943958192248, + "grad_norm": 1.3766313791275024, + "learning_rate": 9.999357088741216e-05, + "loss": 1.0006, + "step": 1600 + }, + { + "epoch": 0.010285831107930951, + "grad_norm": 0.6556980609893799, + "learning_rate": 9.999349017290951e-05, + "loss": 0.9616, + "step": 1610 + }, + { + 
"epoch": 0.010349718257669652, + "grad_norm": 0.6386595368385315, + "learning_rate": 9.999340895492917e-05, + "loss": 0.9685, + "step": 1620 + }, + { + "epoch": 0.010413605407408355, + "grad_norm": 0.859089195728302, + "learning_rate": 9.999332723347194e-05, + "loss": 1.1593, + "step": 1630 + }, + { + "epoch": 0.010477492557147056, + "grad_norm": 0.8099786043167114, + "learning_rate": 9.999324500853866e-05, + "loss": 1.1586, + "step": 1640 + }, + { + "epoch": 0.010541379706885757, + "grad_norm": 0.8301547169685364, + "learning_rate": 9.999316228013016e-05, + "loss": 0.9436, + "step": 1650 + }, + { + "epoch": 0.01060526685662446, + "grad_norm": 1.2509781122207642, + "learning_rate": 9.999307904824725e-05, + "loss": 0.8458, + "step": 1660 + }, + { + "epoch": 0.01066915400636316, + "grad_norm": 1.0006517171859741, + "learning_rate": 9.99929953128908e-05, + "loss": 0.9221, + "step": 1670 + }, + { + "epoch": 0.010733041156101861, + "grad_norm": 0.8921092748641968, + "learning_rate": 9.999291107406163e-05, + "loss": 0.918, + "step": 1680 + }, + { + "epoch": 0.010796928305840564, + "grad_norm": 0.8920373916625977, + "learning_rate": 9.999282633176059e-05, + "loss": 1.059, + "step": 1690 + }, + { + "epoch": 0.010860815455579265, + "grad_norm": 0.7163852453231812, + "learning_rate": 9.999274108598854e-05, + "loss": 1.1965, + "step": 1700 + }, + { + "epoch": 0.010924702605317966, + "grad_norm": 0.7184985876083374, + "learning_rate": 9.999265533674635e-05, + "loss": 1.1157, + "step": 1710 + }, + { + "epoch": 0.010988589755056668, + "grad_norm": 1.6878572702407837, + "learning_rate": 9.999256908403485e-05, + "loss": 0.7872, + "step": 1720 + }, + { + "epoch": 0.01105247690479537, + "grad_norm": 2.4965457916259766, + "learning_rate": 9.999248232785494e-05, + "loss": 0.7284, + "step": 1730 + }, + { + "epoch": 0.011116364054534072, + "grad_norm": 0.6647805571556091, + "learning_rate": 9.999239506820749e-05, + "loss": 1.0634, + "step": 1740 + }, + { + "epoch": 0.011180251204272773, + "grad_norm": 1.112949252128601, + "learning_rate": 9.999230730509337e-05, + "loss": 1.0865, + "step": 1750 + }, + { + "epoch": 0.011244138354011474, + "grad_norm": 0.7501624822616577, + "learning_rate": 9.999221903851346e-05, + "loss": 1.0212, + "step": 1760 + }, + { + "epoch": 0.011308025503750176, + "grad_norm": 0.6178969144821167, + "learning_rate": 9.999213026846865e-05, + "loss": 0.9825, + "step": 1770 + }, + { + "epoch": 0.011371912653488877, + "grad_norm": 0.7546608448028564, + "learning_rate": 9.999204099495984e-05, + "loss": 0.8365, + "step": 1780 + }, + { + "epoch": 0.011435799803227578, + "grad_norm": 0.6355531811714172, + "learning_rate": 9.999195121798795e-05, + "loss": 1.1684, + "step": 1790 + }, + { + "epoch": 0.01149968695296628, + "grad_norm": 1.0356401205062866, + "learning_rate": 9.999186093755385e-05, + "loss": 1.0752, + "step": 1800 + }, + { + "epoch": 0.011563574102704982, + "grad_norm": 0.9333721399307251, + "learning_rate": 9.999177015365844e-05, + "loss": 0.9288, + "step": 1810 + }, + { + "epoch": 0.011627461252443683, + "grad_norm": 0.9251835942268372, + "learning_rate": 9.999167886630269e-05, + "loss": 0.748, + "step": 1820 + }, + { + "epoch": 0.011691348402182385, + "grad_norm": 1.0885391235351562, + "learning_rate": 9.999158707548745e-05, + "loss": 1.1773, + "step": 1830 + }, + { + "epoch": 0.011755235551921086, + "grad_norm": 1.2786647081375122, + "learning_rate": 9.99914947812137e-05, + "loss": 1.1812, + "step": 1840 + }, + { + "epoch": 0.011819122701659789, + "grad_norm": 0.6569780111312866, + 
"learning_rate": 9.999140198348236e-05, + "loss": 0.989, + "step": 1850 + }, + { + "epoch": 0.01188300985139849, + "grad_norm": 1.241723656654358, + "learning_rate": 9.999130868229434e-05, + "loss": 1.0771, + "step": 1860 + }, + { + "epoch": 0.01194689700113719, + "grad_norm": 1.3552509546279907, + "learning_rate": 9.999121487765058e-05, + "loss": 1.0246, + "step": 1870 + }, + { + "epoch": 0.012010784150875893, + "grad_norm": 0.6326724290847778, + "learning_rate": 9.999112056955205e-05, + "loss": 0.9514, + "step": 1880 + }, + { + "epoch": 0.012074671300614594, + "grad_norm": 2.2786476612091064, + "learning_rate": 9.99910257579997e-05, + "loss": 0.6857, + "step": 1890 + }, + { + "epoch": 0.012138558450353295, + "grad_norm": 1.0282983779907227, + "learning_rate": 9.999093044299446e-05, + "loss": 0.7788, + "step": 1900 + }, + { + "epoch": 0.012202445600091998, + "grad_norm": 1.1858989000320435, + "learning_rate": 9.999083462453728e-05, + "loss": 0.9619, + "step": 1910 + }, + { + "epoch": 0.012266332749830699, + "grad_norm": 0.6922428011894226, + "learning_rate": 9.999073830262918e-05, + "loss": 1.1683, + "step": 1920 + }, + { + "epoch": 0.0123302198995694, + "grad_norm": 0.6754278540611267, + "learning_rate": 9.999064147727109e-05, + "loss": 0.7358, + "step": 1930 + }, + { + "epoch": 0.012394107049308102, + "grad_norm": 0.7409210205078125, + "learning_rate": 9.999054414846398e-05, + "loss": 1.1866, + "step": 1940 + }, + { + "epoch": 0.012457994199046803, + "grad_norm": 0.8322914242744446, + "learning_rate": 9.999044631620887e-05, + "loss": 0.7945, + "step": 1950 + }, + { + "epoch": 0.012521881348785506, + "grad_norm": 1.1325633525848389, + "learning_rate": 9.999034798050668e-05, + "loss": 0.9324, + "step": 1960 + }, + { + "epoch": 0.012585768498524207, + "grad_norm": 0.9204065203666687, + "learning_rate": 9.999024914135846e-05, + "loss": 0.8747, + "step": 1970 + }, + { + "epoch": 0.012649655648262908, + "grad_norm": 1.3020517826080322, + "learning_rate": 9.999014979876517e-05, + "loss": 1.0649, + "step": 1980 + }, + { + "epoch": 0.01271354279800161, + "grad_norm": 1.0476547479629517, + "learning_rate": 9.999004995272785e-05, + "loss": 1.0729, + "step": 1990 + }, + { + "epoch": 0.012777429947740311, + "grad_norm": 0.8980121612548828, + "learning_rate": 9.998994960324746e-05, + "loss": 0.8566, + "step": 2000 + }, + { + "epoch": 0.012841317097479012, + "grad_norm": 2.678067684173584, + "learning_rate": 9.998984875032503e-05, + "loss": 1.01, + "step": 2010 + }, + { + "epoch": 0.012905204247217715, + "grad_norm": 1.1093647480010986, + "learning_rate": 9.998974739396159e-05, + "loss": 0.886, + "step": 2020 + }, + { + "epoch": 0.012969091396956416, + "grad_norm": 0.5292948484420776, + "learning_rate": 9.998964553415813e-05, + "loss": 0.9973, + "step": 2030 + }, + { + "epoch": 0.013032978546695117, + "grad_norm": 1.0876336097717285, + "learning_rate": 9.998954317091568e-05, + "loss": 1.09, + "step": 2040 + }, + { + "epoch": 0.01309686569643382, + "grad_norm": 1.7153469324111938, + "learning_rate": 9.998944030423531e-05, + "loss": 0.8236, + "step": 2050 + }, + { + "epoch": 0.01316075284617252, + "grad_norm": 0.5473589897155762, + "learning_rate": 9.998933693411802e-05, + "loss": 0.8271, + "step": 2060 + }, + { + "epoch": 0.013224639995911223, + "grad_norm": 0.930847704410553, + "learning_rate": 9.998923306056487e-05, + "loss": 1.0062, + "step": 2070 + }, + { + "epoch": 0.013288527145649924, + "grad_norm": 1.016547679901123, + "learning_rate": 9.998912868357688e-05, + "loss": 0.9092, + "step": 2080 + 
}, + { + "epoch": 0.013352414295388625, + "grad_norm": 0.8655534386634827, + "learning_rate": 9.99890238031551e-05, + "loss": 0.8901, + "step": 2090 + }, + { + "epoch": 0.013416301445127328, + "grad_norm": 0.7575225830078125, + "learning_rate": 9.998891841930064e-05, + "loss": 1.1021, + "step": 2100 + }, + { + "epoch": 0.013480188594866029, + "grad_norm": 0.8108758330345154, + "learning_rate": 9.998881253201452e-05, + "loss": 1.0897, + "step": 2110 + }, + { + "epoch": 0.01354407574460473, + "grad_norm": 1.2894190549850464, + "learning_rate": 9.998870614129781e-05, + "loss": 1.1317, + "step": 2120 + }, + { + "epoch": 0.013607962894343432, + "grad_norm": 1.173697590827942, + "learning_rate": 9.998859924715157e-05, + "loss": 0.7373, + "step": 2130 + }, + { + "epoch": 0.013671850044082133, + "grad_norm": 0.7047708034515381, + "learning_rate": 9.998849184957689e-05, + "loss": 0.7464, + "step": 2140 + }, + { + "epoch": 0.013735737193820836, + "grad_norm": 0.7167409062385559, + "learning_rate": 9.998838394857486e-05, + "loss": 0.9529, + "step": 2150 + }, + { + "epoch": 0.013799624343559537, + "grad_norm": 0.8524914383888245, + "learning_rate": 9.998827554414656e-05, + "loss": 1.0282, + "step": 2160 + }, + { + "epoch": 0.013863511493298238, + "grad_norm": 0.7894335389137268, + "learning_rate": 9.998816663629307e-05, + "loss": 1.0432, + "step": 2170 + }, + { + "epoch": 0.01392739864303694, + "grad_norm": 0.7883844971656799, + "learning_rate": 9.99880572250155e-05, + "loss": 1.1062, + "step": 2180 + }, + { + "epoch": 0.013991285792775641, + "grad_norm": 1.115862250328064, + "learning_rate": 9.998794731031494e-05, + "loss": 1.0593, + "step": 2190 + }, + { + "epoch": 0.014055172942514342, + "grad_norm": 0.5956576466560364, + "learning_rate": 9.998783689219251e-05, + "loss": 0.8832, + "step": 2200 + }, + { + "epoch": 0.014119060092253045, + "grad_norm": 1.0389795303344727, + "learning_rate": 9.998773708545755e-05, + "loss": 0.9955, + "step": 2210 + }, + { + "epoch": 0.014182947241991746, + "grad_norm": 1.3539459705352783, + "learning_rate": 9.998762571083662e-05, + "loss": 0.9878, + "step": 2220 + }, + { + "epoch": 0.014246834391730447, + "grad_norm": 1.031422734260559, + "learning_rate": 9.998751383279706e-05, + "loss": 0.9666, + "step": 2230 + }, + { + "epoch": 0.01431072154146915, + "grad_norm": 0.5059804320335388, + "learning_rate": 9.998740145134e-05, + "loss": 0.7655, + "step": 2240 + }, + { + "epoch": 0.01437460869120785, + "grad_norm": 0.7834402322769165, + "learning_rate": 9.998728856646656e-05, + "loss": 1.0262, + "step": 2250 + }, + { + "epoch": 0.014438495840946553, + "grad_norm": 0.7399794459342957, + "learning_rate": 9.998717517817786e-05, + "loss": 0.982, + "step": 2260 + }, + { + "epoch": 0.014502382990685254, + "grad_norm": 0.7037153840065002, + "learning_rate": 9.998706128647508e-05, + "loss": 0.7902, + "step": 2270 + }, + { + "epoch": 0.014566270140423955, + "grad_norm": 0.8694613575935364, + "learning_rate": 9.998694689135934e-05, + "loss": 1.0659, + "step": 2280 + }, + { + "epoch": 0.014630157290162657, + "grad_norm": 1.4297699928283691, + "learning_rate": 9.99868319928318e-05, + "loss": 1.0498, + "step": 2290 + }, + { + "epoch": 0.014694044439901358, + "grad_norm": 1.0179654359817505, + "learning_rate": 9.998671659089361e-05, + "loss": 0.9041, + "step": 2300 + }, + { + "epoch": 0.01475793158964006, + "grad_norm": 0.9118665456771851, + "learning_rate": 9.998660068554596e-05, + "loss": 1.0452, + "step": 2310 + }, + { + "epoch": 0.014821818739378762, + "grad_norm": 
1.0615768432617188, + "learning_rate": 9.998649594031891e-05, + "loss": 0.9364, + "step": 2320 + }, + { + "epoch": 0.014885705889117463, + "grad_norm": 1.8446980714797974, + "learning_rate": 9.998637907849646e-05, + "loss": 1.0038, + "step": 2330 + }, + { + "epoch": 0.014949593038856164, + "grad_norm": 1.1372798681259155, + "learning_rate": 9.998626171326792e-05, + "loss": 1.0814, + "step": 2340 + }, + { + "epoch": 0.015013480188594866, + "grad_norm": 1.2520413398742676, + "learning_rate": 9.998614384463449e-05, + "loss": 0.9373, + "step": 2350 + }, + { + "epoch": 0.015077367338333567, + "grad_norm": 0.7592064738273621, + "learning_rate": 9.998602547259734e-05, + "loss": 1.0605, + "step": 2360 + }, + { + "epoch": 0.01514125448807227, + "grad_norm": 0.8538485169410706, + "learning_rate": 9.998590659715766e-05, + "loss": 0.8727, + "step": 2370 + }, + { + "epoch": 0.01520514163781097, + "grad_norm": 0.7715469002723694, + "learning_rate": 9.998578721831666e-05, + "loss": 1.0918, + "step": 2380 + }, + { + "epoch": 0.015269028787549672, + "grad_norm": 1.0266464948654175, + "learning_rate": 9.998566733607554e-05, + "loss": 1.0816, + "step": 2390 + }, + { + "epoch": 0.015332915937288374, + "grad_norm": 0.564927339553833, + "learning_rate": 9.998554695043552e-05, + "loss": 0.8394, + "step": 2400 + }, + { + "epoch": 0.015396803087027075, + "grad_norm": 1.2067440748214722, + "learning_rate": 9.998542606139779e-05, + "loss": 1.1371, + "step": 2410 + }, + { + "epoch": 0.015460690236765776, + "grad_norm": 1.1786682605743408, + "learning_rate": 9.998530466896357e-05, + "loss": 0.9845, + "step": 2420 + }, + { + "epoch": 0.015524577386504479, + "grad_norm": 3.820138454437256, + "learning_rate": 9.99851827731341e-05, + "loss": 0.9517, + "step": 2430 + }, + { + "epoch": 0.01558846453624318, + "grad_norm": 0.8492526412010193, + "learning_rate": 9.998506037391058e-05, + "loss": 0.989, + "step": 2440 + }, + { + "epoch": 0.01565235168598188, + "grad_norm": 1.1744376420974731, + "learning_rate": 9.998493747129428e-05, + "loss": 0.8713, + "step": 2450 + }, + { + "epoch": 0.015716238835720583, + "grad_norm": 1.1239817142486572, + "learning_rate": 9.99848140652864e-05, + "loss": 0.801, + "step": 2460 + }, + { + "epoch": 0.015780125985459286, + "grad_norm": 0.8037886023521423, + "learning_rate": 9.99846901558882e-05, + "loss": 0.8116, + "step": 2470 + }, + { + "epoch": 0.015844013135197985, + "grad_norm": 0.9169192314147949, + "learning_rate": 9.998456574310094e-05, + "loss": 1.0343, + "step": 2480 + }, + { + "epoch": 0.015907900284936688, + "grad_norm": 0.7503566145896912, + "learning_rate": 9.998444082692585e-05, + "loss": 1.0077, + "step": 2490 + }, + { + "epoch": 0.01597178743467539, + "grad_norm": 1.1476398706436157, + "learning_rate": 9.99843154073642e-05, + "loss": 0.8427, + "step": 2500 + }, + { + "epoch": 0.01603567458441409, + "grad_norm": 0.7474212646484375, + "learning_rate": 9.998418948441726e-05, + "loss": 0.7488, + "step": 2510 + }, + { + "epoch": 0.016099561734152792, + "grad_norm": 0.9779971837997437, + "learning_rate": 9.998406305808627e-05, + "loss": 0.8778, + "step": 2520 + }, + { + "epoch": 0.016163448883891495, + "grad_norm": 1.0902825593948364, + "learning_rate": 9.998393612837254e-05, + "loss": 1.2649, + "step": 2530 + }, + { + "epoch": 0.016227336033630194, + "grad_norm": 0.9004558324813843, + "learning_rate": 9.998380869527732e-05, + "loss": 0.7976, + "step": 2540 + }, + { + "epoch": 0.016291223183368897, + "grad_norm": 0.8847173452377319, + "learning_rate": 9.998368075880192e-05, + "loss": 
0.9168, + "step": 2550 + }, + { + "epoch": 0.0163551103331076, + "grad_norm": 1.1703412532806396, + "learning_rate": 9.99835523189476e-05, + "loss": 0.9307, + "step": 2560 + }, + { + "epoch": 0.0164189974828463, + "grad_norm": 0.7630004286766052, + "learning_rate": 9.998342337571565e-05, + "loss": 0.8969, + "step": 2570 + }, + { + "epoch": 0.016482884632585, + "grad_norm": 0.9424830079078674, + "learning_rate": 9.998329392910741e-05, + "loss": 1.0097, + "step": 2580 + }, + { + "epoch": 0.016546771782323704, + "grad_norm": 0.891345739364624, + "learning_rate": 9.998316397912415e-05, + "loss": 0.9626, + "step": 2590 + }, + { + "epoch": 0.016610658932062403, + "grad_norm": 0.7180986404418945, + "learning_rate": 9.998303352576719e-05, + "loss": 0.9426, + "step": 2600 + }, + { + "epoch": 0.016674546081801106, + "grad_norm": 1.2385119199752808, + "learning_rate": 9.998290256903784e-05, + "loss": 0.7992, + "step": 2610 + }, + { + "epoch": 0.01673843323153981, + "grad_norm": 0.9304938316345215, + "learning_rate": 9.998277110893741e-05, + "loss": 1.1183, + "step": 2620 + }, + { + "epoch": 0.01680232038127851, + "grad_norm": 4.244834899902344, + "learning_rate": 9.998263914546724e-05, + "loss": 1.1446, + "step": 2630 + }, + { + "epoch": 0.01686620753101721, + "grad_norm": 1.0744621753692627, + "learning_rate": 9.998250667862868e-05, + "loss": 0.7592, + "step": 2640 + }, + { + "epoch": 0.016930094680755913, + "grad_norm": 1.1547142267227173, + "learning_rate": 9.9982373708423e-05, + "loss": 0.9098, + "step": 2650 + }, + { + "epoch": 0.016993981830494616, + "grad_norm": 0.8676884770393372, + "learning_rate": 9.998224023485159e-05, + "loss": 1.1234, + "step": 2660 + }, + { + "epoch": 0.017057868980233315, + "grad_norm": 1.3594059944152832, + "learning_rate": 9.998210625791578e-05, + "loss": 0.7285, + "step": 2670 + }, + { + "epoch": 0.017121756129972018, + "grad_norm": 0.9443914294242859, + "learning_rate": 9.998197177761692e-05, + "loss": 1.0057, + "step": 2680 + }, + { + "epoch": 0.01718564327971072, + "grad_norm": 0.7387935519218445, + "learning_rate": 9.998183679395636e-05, + "loss": 0.8873, + "step": 2690 + }, + { + "epoch": 0.01724953042944942, + "grad_norm": 0.9435983896255493, + "learning_rate": 9.998170130693545e-05, + "loss": 0.8891, + "step": 2700 + }, + { + "epoch": 0.017313417579188122, + "grad_norm": 1.0034334659576416, + "learning_rate": 9.998156531655557e-05, + "loss": 1.0039, + "step": 2710 + }, + { + "epoch": 0.017377304728926825, + "grad_norm": 1.2125136852264404, + "learning_rate": 9.99814288228181e-05, + "loss": 0.7617, + "step": 2720 + }, + { + "epoch": 0.017441191878665524, + "grad_norm": 0.9862277507781982, + "learning_rate": 9.998129182572442e-05, + "loss": 0.8764, + "step": 2730 + }, + { + "epoch": 0.017505079028404227, + "grad_norm": 1.1421021223068237, + "learning_rate": 9.998115432527586e-05, + "loss": 0.9241, + "step": 2740 + }, + { + "epoch": 0.01756896617814293, + "grad_norm": 0.8746705651283264, + "learning_rate": 9.998101632147385e-05, + "loss": 0.9238, + "step": 2750 + }, + { + "epoch": 0.01763285332788163, + "grad_norm": 0.6663450002670288, + "learning_rate": 9.998087781431977e-05, + "loss": 1.0525, + "step": 2760 + }, + { + "epoch": 0.01769674047762033, + "grad_norm": 1.4795788526535034, + "learning_rate": 9.9980738803815e-05, + "loss": 0.9025, + "step": 2770 + }, + { + "epoch": 0.017760627627359034, + "grad_norm": 0.7279462218284607, + "learning_rate": 9.998059928996095e-05, + "loss": 1.1858, + "step": 2780 + }, + { + "epoch": 0.017824514777097733, + "grad_norm": 
0.7917711138725281, + "learning_rate": 9.998045927275903e-05, + "loss": 0.9119, + "step": 2790 + }, + { + "epoch": 0.017888401926836436, + "grad_norm": 1.2472501993179321, + "learning_rate": 9.998031875221065e-05, + "loss": 0.839, + "step": 2800 + }, + { + "epoch": 0.01795228907657514, + "grad_norm": 0.9328956604003906, + "learning_rate": 9.998017772831723e-05, + "loss": 0.9749, + "step": 2810 + }, + { + "epoch": 0.018016176226313838, + "grad_norm": 0.732351541519165, + "learning_rate": 9.998003620108017e-05, + "loss": 1.0359, + "step": 2820 + }, + { + "epoch": 0.01808006337605254, + "grad_norm": 0.8829627633094788, + "learning_rate": 9.99798941705009e-05, + "loss": 0.9921, + "step": 2830 + }, + { + "epoch": 0.018143950525791243, + "grad_norm": 0.7300599813461304, + "learning_rate": 9.997975163658086e-05, + "loss": 0.9041, + "step": 2840 + }, + { + "epoch": 0.018207837675529946, + "grad_norm": 1.0057677030563354, + "learning_rate": 9.997960859932148e-05, + "loss": 1.1656, + "step": 2850 + }, + { + "epoch": 0.018271724825268645, + "grad_norm": 0.6405202746391296, + "learning_rate": 9.997946505872421e-05, + "loss": 0.7273, + "step": 2860 + }, + { + "epoch": 0.018335611975007347, + "grad_norm": 1.383867621421814, + "learning_rate": 9.997932101479049e-05, + "loss": 1.0818, + "step": 2870 + }, + { + "epoch": 0.01839949912474605, + "grad_norm": 2.046144723892212, + "learning_rate": 9.997917646752175e-05, + "loss": 1.0075, + "step": 2880 + }, + { + "epoch": 0.01846338627448475, + "grad_norm": 0.6531755924224854, + "learning_rate": 9.99790314169195e-05, + "loss": 0.978, + "step": 2890 + }, + { + "epoch": 0.018527273424223452, + "grad_norm": 0.8605973720550537, + "learning_rate": 9.997888586298514e-05, + "loss": 1.0424, + "step": 2900 + }, + { + "epoch": 0.018591160573962155, + "grad_norm": 1.2451750040054321, + "learning_rate": 9.997873980572017e-05, + "loss": 0.9909, + "step": 2910 + }, + { + "epoch": 0.018655047723700854, + "grad_norm": 1.1829801797866821, + "learning_rate": 9.997859324512604e-05, + "loss": 0.8175, + "step": 2920 + }, + { + "epoch": 0.018718934873439556, + "grad_norm": 1.987342357635498, + "learning_rate": 9.997844618120424e-05, + "loss": 1.1086, + "step": 2930 + }, + { + "epoch": 0.01878282202317826, + "grad_norm": 1.5796905755996704, + "learning_rate": 9.997829861395627e-05, + "loss": 0.9863, + "step": 2940 + }, + { + "epoch": 0.018846709172916958, + "grad_norm": 0.5378701686859131, + "learning_rate": 9.997815054338357e-05, + "loss": 0.7471, + "step": 2950 + }, + { + "epoch": 0.01891059632265566, + "grad_norm": 1.4551935195922852, + "learning_rate": 9.997800196948768e-05, + "loss": 1.1466, + "step": 2960 + }, + { + "epoch": 0.018974483472394364, + "grad_norm": 0.5287359356880188, + "learning_rate": 9.997785289227007e-05, + "loss": 0.8842, + "step": 2970 + }, + { + "epoch": 0.019038370622133063, + "grad_norm": 0.6062310338020325, + "learning_rate": 9.997770331173221e-05, + "loss": 0.8015, + "step": 2980 + }, + { + "epoch": 0.019102257771871765, + "grad_norm": 0.9560365676879883, + "learning_rate": 9.997755322787568e-05, + "loss": 1.1405, + "step": 2990 + }, + { + "epoch": 0.019166144921610468, + "grad_norm": 0.7935013175010681, + "learning_rate": 9.997740264070194e-05, + "loss": 0.8133, + "step": 3000 + }, + { + "epoch": 0.019230032071349167, + "grad_norm": 0.8417540788650513, + "learning_rate": 9.997725155021253e-05, + "loss": 0.8547, + "step": 3010 + }, + { + "epoch": 0.01929391922108787, + "grad_norm": 0.5501998662948608, + "learning_rate": 9.997709995640894e-05, + "loss": 
0.9299, + "step": 3020 + }, + { + "epoch": 0.019357806370826573, + "grad_norm": 0.821506917476654, + "learning_rate": 9.997694785929273e-05, + "loss": 0.9835, + "step": 3030 + }, + { + "epoch": 0.019421693520565275, + "grad_norm": 0.817926287651062, + "learning_rate": 9.997679525886541e-05, + "loss": 1.2224, + "step": 3040 + }, + { + "epoch": 0.019485580670303974, + "grad_norm": 2.5229651927948, + "learning_rate": 9.997664215512854e-05, + "loss": 1.0535, + "step": 3050 + }, + { + "epoch": 0.019549467820042677, + "grad_norm": 0.8168900609016418, + "learning_rate": 9.997648854808364e-05, + "loss": 1.0088, + "step": 3060 + }, + { + "epoch": 0.01961335496978138, + "grad_norm": 0.522985577583313, + "learning_rate": 9.997633443773226e-05, + "loss": 0.9106, + "step": 3070 + }, + { + "epoch": 0.01967724211952008, + "grad_norm": 0.5633349418640137, + "learning_rate": 9.997617982407595e-05, + "loss": 0.9174, + "step": 3080 + }, + { + "epoch": 0.01974112926925878, + "grad_norm": 2.293459892272949, + "learning_rate": 9.997602470711628e-05, + "loss": 0.8805, + "step": 3090 + }, + { + "epoch": 0.019805016418997484, + "grad_norm": 0.6353404521942139, + "learning_rate": 9.997586908685481e-05, + "loss": 0.9407, + "step": 3100 + }, + { + "epoch": 0.019868903568736183, + "grad_norm": 0.6325660943984985, + "learning_rate": 9.997571296329312e-05, + "loss": 0.6832, + "step": 3110 + }, + { + "epoch": 0.019932790718474886, + "grad_norm": 0.7705810070037842, + "learning_rate": 9.997555633643274e-05, + "loss": 0.7368, + "step": 3120 + }, + { + "epoch": 0.01999667786821359, + "grad_norm": 0.601768434047699, + "learning_rate": 9.997539920627527e-05, + "loss": 1.0854, + "step": 3130 + }, + { + "epoch": 0.020060565017952288, + "grad_norm": 1.055450439453125, + "learning_rate": 9.997524157282231e-05, + "loss": 1.1712, + "step": 3140 + }, + { + "epoch": 0.02012445216769099, + "grad_norm": 0.5919578671455383, + "learning_rate": 9.997508343607542e-05, + "loss": 0.9698, + "step": 3150 + }, + { + "epoch": 0.020188339317429693, + "grad_norm": 1.1966851949691772, + "learning_rate": 9.997492479603623e-05, + "loss": 0.9131, + "step": 3160 + }, + { + "epoch": 0.020252226467168392, + "grad_norm": 0.5245844721794128, + "learning_rate": 9.997476565270629e-05, + "loss": 0.9533, + "step": 3170 + }, + { + "epoch": 0.020316113616907095, + "grad_norm": 0.6640262603759766, + "learning_rate": 9.997460600608723e-05, + "loss": 0.954, + "step": 3180 + }, + { + "epoch": 0.020380000766645798, + "grad_norm": 1.1632764339447021, + "learning_rate": 9.997444585618066e-05, + "loss": 0.9693, + "step": 3190 + }, + { + "epoch": 0.020443887916384497, + "grad_norm": 0.8746532797813416, + "learning_rate": 9.997428520298817e-05, + "loss": 0.9353, + "step": 3200 + }, + { + "epoch": 0.0205077750661232, + "grad_norm": 0.7248082756996155, + "learning_rate": 9.997412404651141e-05, + "loss": 1.0746, + "step": 3210 + }, + { + "epoch": 0.020571662215861902, + "grad_norm": 1.0290027856826782, + "learning_rate": 9.997396238675198e-05, + "loss": 1.013, + "step": 3220 + }, + { + "epoch": 0.0206355493656006, + "grad_norm": 1.3203686475753784, + "learning_rate": 9.997380022371153e-05, + "loss": 0.9819, + "step": 3230 + }, + { + "epoch": 0.020699436515339304, + "grad_norm": 1.1412265300750732, + "learning_rate": 9.997363755739166e-05, + "loss": 0.756, + "step": 3240 + }, + { + "epoch": 0.020763323665078007, + "grad_norm": 1.012272596359253, + "learning_rate": 9.997347438779403e-05, + "loss": 0.8896, + "step": 3250 + }, + { + "epoch": 0.02082721081481671, + "grad_norm": 
0.6581144332885742, + "learning_rate": 9.997331071492028e-05, + "loss": 0.9093, + "step": 3260 + }, + { + "epoch": 0.02089109796455541, + "grad_norm": 0.6292199492454529, + "learning_rate": 9.997314653877206e-05, + "loss": 0.8898, + "step": 3270 + }, + { + "epoch": 0.02095498511429411, + "grad_norm": 0.8514048457145691, + "learning_rate": 9.997298185935102e-05, + "loss": 1.0219, + "step": 3280 + }, + { + "epoch": 0.021018872264032814, + "grad_norm": 0.8251546621322632, + "learning_rate": 9.99728166766588e-05, + "loss": 1.1417, + "step": 3290 + }, + { + "epoch": 0.021082759413771513, + "grad_norm": 0.7164210081100464, + "learning_rate": 9.997265099069712e-05, + "loss": 0.8025, + "step": 3300 + }, + { + "epoch": 0.021146646563510216, + "grad_norm": 0.6162307858467102, + "learning_rate": 9.997248480146759e-05, + "loss": 1.1907, + "step": 3310 + }, + { + "epoch": 0.02121053371324892, + "grad_norm": 0.9600428938865662, + "learning_rate": 9.997231810897191e-05, + "loss": 1.0066, + "step": 3320 + }, + { + "epoch": 0.021274420862987618, + "grad_norm": 1.1238371133804321, + "learning_rate": 9.997215091321178e-05, + "loss": 0.8551, + "step": 3330 + }, + { + "epoch": 0.02133830801272632, + "grad_norm": 1.7699466943740845, + "learning_rate": 9.997198321418881e-05, + "loss": 1.049, + "step": 3340 + }, + { + "epoch": 0.021402195162465023, + "grad_norm": 1.0499175786972046, + "learning_rate": 9.997181501190478e-05, + "loss": 1.11, + "step": 3350 + }, + { + "epoch": 0.021466082312203722, + "grad_norm": 0.9096553325653076, + "learning_rate": 9.997164630636132e-05, + "loss": 0.9371, + "step": 3360 + }, + { + "epoch": 0.021529969461942425, + "grad_norm": 0.8059217929840088, + "learning_rate": 9.997147709756014e-05, + "loss": 0.8638, + "step": 3370 + }, + { + "epoch": 0.021593856611681127, + "grad_norm": 0.6484128832817078, + "learning_rate": 9.997130738550298e-05, + "loss": 0.9817, + "step": 3380 + }, + { + "epoch": 0.021657743761419827, + "grad_norm": 1.1222511529922485, + "learning_rate": 9.997113717019151e-05, + "loss": 0.7598, + "step": 3390 + }, + { + "epoch": 0.02172163091115853, + "grad_norm": 1.0018411874771118, + "learning_rate": 9.997096645162745e-05, + "loss": 0.9593, + "step": 3400 + }, + { + "epoch": 0.021785518060897232, + "grad_norm": 0.6298023462295532, + "learning_rate": 9.997079522981254e-05, + "loss": 0.8118, + "step": 3410 + }, + { + "epoch": 0.02184940521063593, + "grad_norm": 0.5194735527038574, + "learning_rate": 9.997062350474849e-05, + "loss": 0.8344, + "step": 3420 + }, + { + "epoch": 0.021913292360374634, + "grad_norm": 0.7458469271659851, + "learning_rate": 9.997045127643703e-05, + "loss": 1.1305, + "step": 3430 + }, + { + "epoch": 0.021977179510113336, + "grad_norm": 1.092467188835144, + "learning_rate": 9.997027854487988e-05, + "loss": 0.7839, + "step": 3440 + }, + { + "epoch": 0.022041066659852036, + "grad_norm": 0.5377646088600159, + "learning_rate": 9.997010531007879e-05, + "loss": 0.9457, + "step": 3450 + }, + { + "epoch": 0.02210495380959074, + "grad_norm": 0.8158820271492004, + "learning_rate": 9.996993157203554e-05, + "loss": 1.0827, + "step": 3460 + }, + { + "epoch": 0.02216884095932944, + "grad_norm": 0.9033936858177185, + "learning_rate": 9.996975733075184e-05, + "loss": 0.8901, + "step": 3470 + }, + { + "epoch": 0.022232728109068144, + "grad_norm": 0.6493645310401917, + "learning_rate": 9.996958258622944e-05, + "loss": 1.0609, + "step": 3480 + }, + { + "epoch": 0.022296615258806843, + "grad_norm": 1.416635274887085, + "learning_rate": 9.996940733847013e-05, + "loss": 
0.9017, + "step": 3490 + }, + { + "epoch": 0.022360502408545545, + "grad_norm": 0.9830083847045898, + "learning_rate": 9.996923158747564e-05, + "loss": 0.8952, + "step": 3500 + }, + { + "epoch": 0.022424389558284248, + "grad_norm": 1.130096197128296, + "learning_rate": 9.996905533324777e-05, + "loss": 0.8779, + "step": 3510 + }, + { + "epoch": 0.022488276708022947, + "grad_norm": 0.7025210857391357, + "learning_rate": 9.996887857578828e-05, + "loss": 1.0576, + "step": 3520 + }, + { + "epoch": 0.02255216385776165, + "grad_norm": 0.7813702821731567, + "learning_rate": 9.996870131509897e-05, + "loss": 1.1396, + "step": 3530 + }, + { + "epoch": 0.022616051007500353, + "grad_norm": 0.9451877474784851, + "learning_rate": 9.996852355118158e-05, + "loss": 0.8531, + "step": 3540 + }, + { + "epoch": 0.022679938157239052, + "grad_norm": 0.9123436212539673, + "learning_rate": 9.996834528403795e-05, + "loss": 0.8832, + "step": 3550 + }, + { + "epoch": 0.022743825306977754, + "grad_norm": 3.4489307403564453, + "learning_rate": 9.996816651366985e-05, + "loss": 0.8413, + "step": 3560 + }, + { + "epoch": 0.022807712456716457, + "grad_norm": 1.0235848426818848, + "learning_rate": 9.996798724007907e-05, + "loss": 0.9275, + "step": 3570 + }, + { + "epoch": 0.022871599606455156, + "grad_norm": 0.7772485017776489, + "learning_rate": 9.996780746326743e-05, + "loss": 1.0924, + "step": 3580 + }, + { + "epoch": 0.02293548675619386, + "grad_norm": 0.7384485006332397, + "learning_rate": 9.996762718323677e-05, + "loss": 0.8231, + "step": 3590 + }, + { + "epoch": 0.02299937390593256, + "grad_norm": 0.9038792848587036, + "learning_rate": 9.996744639998885e-05, + "loss": 0.7318, + "step": 3600 + }, + { + "epoch": 0.02306326105567126, + "grad_norm": 0.7685703039169312, + "learning_rate": 9.996726511352553e-05, + "loss": 0.7945, + "step": 3610 + }, + { + "epoch": 0.023127148205409963, + "grad_norm": 0.9612904787063599, + "learning_rate": 9.996708332384862e-05, + "loss": 0.7389, + "step": 3620 + }, + { + "epoch": 0.023191035355148666, + "grad_norm": 0.6820782423019409, + "learning_rate": 9.996690103095995e-05, + "loss": 0.7208, + "step": 3630 + }, + { + "epoch": 0.023254922504887365, + "grad_norm": 0.7813957333564758, + "learning_rate": 9.996671823486135e-05, + "loss": 1.1023, + "step": 3640 + }, + { + "epoch": 0.023318809654626068, + "grad_norm": 0.41932976245880127, + "learning_rate": 9.996653493555469e-05, + "loss": 0.8274, + "step": 3650 + }, + { + "epoch": 0.02338269680436477, + "grad_norm": 1.1898959875106812, + "learning_rate": 9.996635113304178e-05, + "loss": 0.862, + "step": 3660 + }, + { + "epoch": 0.023446583954103473, + "grad_norm": 1.4429035186767578, + "learning_rate": 9.99661668273245e-05, + "loss": 0.9036, + "step": 3670 + }, + { + "epoch": 0.023510471103842173, + "grad_norm": 0.9616169929504395, + "learning_rate": 9.996598201840469e-05, + "loss": 0.9577, + "step": 3680 + }, + { + "epoch": 0.023574358253580875, + "grad_norm": 0.8261591196060181, + "learning_rate": 9.99657967062842e-05, + "loss": 1.0401, + "step": 3690 + }, + { + "epoch": 0.023638245403319578, + "grad_norm": 0.8811150789260864, + "learning_rate": 9.996561089096493e-05, + "loss": 0.8111, + "step": 3700 + }, + { + "epoch": 0.023702132553058277, + "grad_norm": 0.5696326494216919, + "learning_rate": 9.996542457244871e-05, + "loss": 0.7984, + "step": 3710 + }, + { + "epoch": 0.02376601970279698, + "grad_norm": 0.9691576361656189, + "learning_rate": 9.996523775073746e-05, + "loss": 0.9321, + "step": 3720 + }, + { + "epoch": 0.023829906852535682, + 
"grad_norm": 0.7957014441490173, + "learning_rate": 9.996505042583303e-05, + "loss": 0.9805, + "step": 3730 + }, + { + "epoch": 0.02389379400227438, + "grad_norm": 0.920781135559082, + "learning_rate": 9.996486259773732e-05, + "loss": 0.83, + "step": 3740 + }, + { + "epoch": 0.023957681152013084, + "grad_norm": 1.661712646484375, + "learning_rate": 9.996467426645221e-05, + "loss": 0.7208, + "step": 3750 + }, + { + "epoch": 0.024021568301751787, + "grad_norm": 0.6871623396873474, + "learning_rate": 9.99644854319796e-05, + "loss": 0.7284, + "step": 3760 + }, + { + "epoch": 0.024085455451490486, + "grad_norm": 0.9017264246940613, + "learning_rate": 9.99642960943214e-05, + "loss": 1.1523, + "step": 3770 + }, + { + "epoch": 0.02414934260122919, + "grad_norm": 0.894895613193512, + "learning_rate": 9.996410625347953e-05, + "loss": 0.7732, + "step": 3780 + }, + { + "epoch": 0.02421322975096789, + "grad_norm": 0.8452061414718628, + "learning_rate": 9.996391590945588e-05, + "loss": 0.913, + "step": 3790 + }, + { + "epoch": 0.02427711690070659, + "grad_norm": 0.7204217910766602, + "learning_rate": 9.996372506225235e-05, + "loss": 0.8552, + "step": 3800 + }, + { + "epoch": 0.024341004050445293, + "grad_norm": 2.9905290603637695, + "learning_rate": 9.996353371187091e-05, + "loss": 0.8771, + "step": 3810 + }, + { + "epoch": 0.024404891200183996, + "grad_norm": 0.9556611180305481, + "learning_rate": 9.996334185831346e-05, + "loss": 0.8103, + "step": 3820 + }, + { + "epoch": 0.024468778349922695, + "grad_norm": 0.775848925113678, + "learning_rate": 9.996314950158192e-05, + "loss": 0.8078, + "step": 3830 + }, + { + "epoch": 0.024532665499661398, + "grad_norm": 0.9693676829338074, + "learning_rate": 9.996295664167824e-05, + "loss": 0.9336, + "step": 3840 + }, + { + "epoch": 0.0245965526494001, + "grad_norm": 1.195697546005249, + "learning_rate": 9.996276327860436e-05, + "loss": 1.2527, + "step": 3850 + }, + { + "epoch": 0.0246604397991388, + "grad_norm": 0.8424214124679565, + "learning_rate": 9.996256941236223e-05, + "loss": 0.966, + "step": 3860 + }, + { + "epoch": 0.024724326948877502, + "grad_norm": 0.6259729266166687, + "learning_rate": 9.996237504295382e-05, + "loss": 0.9363, + "step": 3870 + }, + { + "epoch": 0.024788214098616205, + "grad_norm": 0.7807269096374512, + "learning_rate": 9.996218017038106e-05, + "loss": 0.6411, + "step": 3880 + }, + { + "epoch": 0.024852101248354908, + "grad_norm": 0.6563220024108887, + "learning_rate": 9.996198479464591e-05, + "loss": 0.8191, + "step": 3890 + }, + { + "epoch": 0.024915988398093607, + "grad_norm": 0.831295371055603, + "learning_rate": 9.996178891575037e-05, + "loss": 0.8589, + "step": 3900 + }, + { + "epoch": 0.02497987554783231, + "grad_norm": 1.157340168952942, + "learning_rate": 9.996159253369638e-05, + "loss": 0.9202, + "step": 3910 + }, + { + "epoch": 0.025043762697571012, + "grad_norm": 0.7473374009132385, + "learning_rate": 9.996139564848594e-05, + "loss": 0.829, + "step": 3920 + }, + { + "epoch": 0.02510764984730971, + "grad_norm": 1.1940234899520874, + "learning_rate": 9.996119826012101e-05, + "loss": 0.9879, + "step": 3930 + }, + { + "epoch": 0.025171536997048414, + "grad_norm": 0.7762036323547363, + "learning_rate": 9.99610003686036e-05, + "loss": 0.9162, + "step": 3940 + }, + { + "epoch": 0.025235424146787117, + "grad_norm": 1.1545424461364746, + "learning_rate": 9.996080197393569e-05, + "loss": 0.9567, + "step": 3950 + }, + { + "epoch": 0.025299311296525816, + "grad_norm": 0.6979715824127197, + "learning_rate": 9.996060307611927e-05, + 
"loss": 0.9685, + "step": 3960 + }, + { + "epoch": 0.02536319844626452, + "grad_norm": 0.9557220339775085, + "learning_rate": 9.996040367515638e-05, + "loss": 1.0768, + "step": 3970 + }, + { + "epoch": 0.02542708559600322, + "grad_norm": 0.8868962526321411, + "learning_rate": 9.996020377104898e-05, + "loss": 1.0351, + "step": 3980 + }, + { + "epoch": 0.02549097274574192, + "grad_norm": 0.5406913757324219, + "learning_rate": 9.996000336379913e-05, + "loss": 0.9042, + "step": 3990 + }, + { + "epoch": 0.025554859895480623, + "grad_norm": 0.64485764503479, + "learning_rate": 9.995980245340881e-05, + "loss": 1.1883, + "step": 4000 + }, + { + "epoch": 0.025618747045219326, + "grad_norm": 1.2904107570648193, + "learning_rate": 9.995960103988005e-05, + "loss": 0.936, + "step": 4010 + }, + { + "epoch": 0.025682634194958025, + "grad_norm": 1.247886061668396, + "learning_rate": 9.99593991232149e-05, + "loss": 0.8806, + "step": 4020 + }, + { + "epoch": 0.025746521344696727, + "grad_norm": 0.9545615911483765, + "learning_rate": 9.995919670341538e-05, + "loss": 1.1493, + "step": 4030 + }, + { + "epoch": 0.02581040849443543, + "grad_norm": 1.999590277671814, + "learning_rate": 9.995899378048352e-05, + "loss": 0.6754, + "step": 4040 + }, + { + "epoch": 0.02587429564417413, + "grad_norm": 0.7333373427391052, + "learning_rate": 9.995879035442138e-05, + "loss": 0.8109, + "step": 4050 + }, + { + "epoch": 0.025938182793912832, + "grad_norm": 0.7739579081535339, + "learning_rate": 9.995858642523099e-05, + "loss": 0.8638, + "step": 4060 + }, + { + "epoch": 0.026002069943651535, + "grad_norm": 1.069405198097229, + "learning_rate": 9.995838199291443e-05, + "loss": 0.9313, + "step": 4070 + }, + { + "epoch": 0.026065957093390234, + "grad_norm": 1.366487979888916, + "learning_rate": 9.995817705747372e-05, + "loss": 1.0205, + "step": 4080 + }, + { + "epoch": 0.026129844243128936, + "grad_norm": 1.6458861827850342, + "learning_rate": 9.995797161891097e-05, + "loss": 0.9609, + "step": 4090 + }, + { + "epoch": 0.02619373139286764, + "grad_norm": 1.0026328563690186, + "learning_rate": 9.995776567722822e-05, + "loss": 1.0618, + "step": 4100 + }, + { + "epoch": 0.02625761854260634, + "grad_norm": 1.0415229797363281, + "learning_rate": 9.995755923242754e-05, + "loss": 0.761, + "step": 4110 + }, + { + "epoch": 0.02632150569234504, + "grad_norm": 1.169027328491211, + "learning_rate": 9.995735228451103e-05, + "loss": 0.92, + "step": 4120 + }, + { + "epoch": 0.026385392842083744, + "grad_norm": 1.2535079717636108, + "learning_rate": 9.995714483348076e-05, + "loss": 0.8859, + "step": 4130 + }, + { + "epoch": 0.026449279991822446, + "grad_norm": 0.6948879957199097, + "learning_rate": 9.995693687933883e-05, + "loss": 0.7189, + "step": 4140 + }, + { + "epoch": 0.026513167141561145, + "grad_norm": 0.7670521140098572, + "learning_rate": 9.995672842208731e-05, + "loss": 1.0072, + "step": 4150 + }, + { + "epoch": 0.026577054291299848, + "grad_norm": 0.8560011982917786, + "learning_rate": 9.995651946172833e-05, + "loss": 1.1125, + "step": 4160 + }, + { + "epoch": 0.02664094144103855, + "grad_norm": 0.762663722038269, + "learning_rate": 9.995630999826397e-05, + "loss": 0.9922, + "step": 4170 + }, + { + "epoch": 0.02670482859077725, + "grad_norm": 1.432151198387146, + "learning_rate": 9.995610003169635e-05, + "loss": 1.0305, + "step": 4180 + }, + { + "epoch": 0.026768715740515953, + "grad_norm": 1.0463693141937256, + "learning_rate": 9.99558895620276e-05, + "loss": 0.9721, + "step": 4190 + }, + { + "epoch": 0.026832602890254655, + 
"grad_norm": 0.6497074961662292, + "learning_rate": 9.99556785892598e-05, + "loss": 0.8886, + "step": 4200 + }, + { + "epoch": 0.026896490039993354, + "grad_norm": 0.8750442266464233, + "learning_rate": 9.995546711339512e-05, + "loss": 1.1452, + "step": 4210 + }, + { + "epoch": 0.026960377189732057, + "grad_norm": 0.5352575778961182, + "learning_rate": 9.995525513443566e-05, + "loss": 1.3216, + "step": 4220 + }, + { + "epoch": 0.02702426433947076, + "grad_norm": 0.7286153435707092, + "learning_rate": 9.995504265238357e-05, + "loss": 0.9927, + "step": 4230 + }, + { + "epoch": 0.02708815148920946, + "grad_norm": 1.133766770362854, + "learning_rate": 9.995482966724098e-05, + "loss": 0.9198, + "step": 4240 + }, + { + "epoch": 0.02715203863894816, + "grad_norm": 1.060925006866455, + "learning_rate": 9.995461617901004e-05, + "loss": 0.984, + "step": 4250 + }, + { + "epoch": 0.027215925788686864, + "grad_norm": 0.8017410039901733, + "learning_rate": 9.995440218769288e-05, + "loss": 0.8302, + "step": 4260 + }, + { + "epoch": 0.027279812938425563, + "grad_norm": 0.6474617719650269, + "learning_rate": 9.995418769329171e-05, + "loss": 0.8526, + "step": 4270 + }, + { + "epoch": 0.027343700088164266, + "grad_norm": 0.7051038146018982, + "learning_rate": 9.995397269580862e-05, + "loss": 0.6267, + "step": 4280 + }, + { + "epoch": 0.02740758723790297, + "grad_norm": 0.8523268699645996, + "learning_rate": 9.995375719524582e-05, + "loss": 0.7513, + "step": 4290 + }, + { + "epoch": 0.02747147438764167, + "grad_norm": 0.5515130162239075, + "learning_rate": 9.995354119160546e-05, + "loss": 0.8045, + "step": 4300 + }, + { + "epoch": 0.02753536153738037, + "grad_norm": 0.6105387806892395, + "learning_rate": 9.995332468488974e-05, + "loss": 0.9739, + "step": 4310 + }, + { + "epoch": 0.027599248687119073, + "grad_norm": 0.9270747303962708, + "learning_rate": 9.99531076751008e-05, + "loss": 0.8789, + "step": 4320 + }, + { + "epoch": 0.027663135836857776, + "grad_norm": 0.46213430166244507, + "learning_rate": 9.995289016224087e-05, + "loss": 0.8914, + "step": 4330 + }, + { + "epoch": 0.027727022986596475, + "grad_norm": 0.8763656616210938, + "learning_rate": 9.995267214631213e-05, + "loss": 0.9085, + "step": 4340 + }, + { + "epoch": 0.027790910136335178, + "grad_norm": 1.6064941883087158, + "learning_rate": 9.995245362731676e-05, + "loss": 1.0047, + "step": 4350 + }, + { + "epoch": 0.02785479728607388, + "grad_norm": 1.2199528217315674, + "learning_rate": 9.995223460525696e-05, + "loss": 0.749, + "step": 4360 + }, + { + "epoch": 0.02791868443581258, + "grad_norm": 0.9066464304924011, + "learning_rate": 9.995201508013494e-05, + "loss": 1.0363, + "step": 4370 + }, + { + "epoch": 0.027982571585551282, + "grad_norm": 0.8760823011398315, + "learning_rate": 9.995179505195291e-05, + "loss": 1.1568, + "step": 4380 + }, + { + "epoch": 0.028046458735289985, + "grad_norm": 0.6646769046783447, + "learning_rate": 9.99515745207131e-05, + "loss": 1.3106, + "step": 4390 + }, + { + "epoch": 0.028110345885028684, + "grad_norm": 0.7811892032623291, + "learning_rate": 9.995135348641771e-05, + "loss": 0.8003, + "step": 4400 + }, + { + "epoch": 0.028174233034767387, + "grad_norm": 1.2583142518997192, + "learning_rate": 9.995113194906899e-05, + "loss": 0.934, + "step": 4410 + }, + { + "epoch": 0.02823812018450609, + "grad_norm": 1.4330214262008667, + "learning_rate": 9.995090990866915e-05, + "loss": 0.8924, + "step": 4420 + }, + { + "epoch": 0.02830200733424479, + "grad_norm": 0.7987727522850037, + "learning_rate": 9.995068736522044e-05, 
+ "loss": 1.257, + "step": 4430 + }, + { + "epoch": 0.02836589448398349, + "grad_norm": 0.90681391954422, + "learning_rate": 9.995046431872507e-05, + "loss": 0.9746, + "step": 4440 + }, + { + "epoch": 0.028429781633722194, + "grad_norm": 1.1222659349441528, + "learning_rate": 9.995024076918534e-05, + "loss": 0.8702, + "step": 4450 + }, + { + "epoch": 0.028493668783460893, + "grad_norm": 1.4470833539962769, + "learning_rate": 9.995001671660347e-05, + "loss": 0.9072, + "step": 4460 + }, + { + "epoch": 0.028557555933199596, + "grad_norm": 0.9265400767326355, + "learning_rate": 9.994979216098171e-05, + "loss": 0.9651, + "step": 4470 + }, + { + "epoch": 0.0286214430829383, + "grad_norm": 0.40936312079429626, + "learning_rate": 9.994956710232232e-05, + "loss": 0.9576, + "step": 4480 + }, + { + "epoch": 0.028685330232676998, + "grad_norm": 0.7994583249092102, + "learning_rate": 9.99493415406276e-05, + "loss": 0.8773, + "step": 4490 + }, + { + "epoch": 0.0287492173824157, + "grad_norm": 0.8965862989425659, + "learning_rate": 9.994911547589979e-05, + "loss": 0.9247, + "step": 4500 + }, + { + "epoch": 0.028813104532154403, + "grad_norm": 0.5341432690620422, + "learning_rate": 9.994888890814116e-05, + "loss": 0.9735, + "step": 4510 + }, + { + "epoch": 0.028876991681893106, + "grad_norm": 0.796406090259552, + "learning_rate": 9.994866183735403e-05, + "loss": 1.0474, + "step": 4520 + }, + { + "epoch": 0.028940878831631805, + "grad_norm": 0.6537384986877441, + "learning_rate": 9.994843426354064e-05, + "loss": 0.7858, + "step": 4530 + }, + { + "epoch": 0.029004765981370507, + "grad_norm": 0.7321698665618896, + "learning_rate": 9.994820618670332e-05, + "loss": 1.017, + "step": 4540 + }, + { + "epoch": 0.02906865313110921, + "grad_norm": 0.9634839296340942, + "learning_rate": 9.994797760684435e-05, + "loss": 0.9671, + "step": 4550 + }, + { + "epoch": 0.02913254028084791, + "grad_norm": 0.7006617784500122, + "learning_rate": 9.994774852396603e-05, + "loss": 1.053, + "step": 4560 + }, + { + "epoch": 0.029196427430586612, + "grad_norm": 0.7608281373977661, + "learning_rate": 9.994751893807068e-05, + "loss": 0.7445, + "step": 4570 + }, + { + "epoch": 0.029260314580325315, + "grad_norm": 1.0257230997085571, + "learning_rate": 9.99472888491606e-05, + "loss": 0.7304, + "step": 4580 + }, + { + "epoch": 0.029324201730064014, + "grad_norm": 0.6806319355964661, + "learning_rate": 9.994705825723811e-05, + "loss": 1.1287, + "step": 4590 + }, + { + "epoch": 0.029388088879802717, + "grad_norm": 1.2967884540557861, + "learning_rate": 9.994682716230552e-05, + "loss": 0.948, + "step": 4600 + }, + { + "epoch": 0.02945197602954142, + "grad_norm": 1.0324482917785645, + "learning_rate": 9.994659556436518e-05, + "loss": 1.0642, + "step": 4610 + }, + { + "epoch": 0.02951586317928012, + "grad_norm": 0.5615150928497314, + "learning_rate": 9.994636346341943e-05, + "loss": 0.6903, + "step": 4620 + }, + { + "epoch": 0.02957975032901882, + "grad_norm": 0.6164289712905884, + "learning_rate": 9.994613085947058e-05, + "loss": 0.8748, + "step": 4630 + }, + { + "epoch": 0.029643637478757524, + "grad_norm": 0.9414746761322021, + "learning_rate": 9.994589775252097e-05, + "loss": 0.9157, + "step": 4640 + }, + { + "epoch": 0.029707524628496223, + "grad_norm": 0.8447662591934204, + "learning_rate": 9.994566414257297e-05, + "loss": 1.1894, + "step": 4650 + }, + { + "epoch": 0.029771411778234926, + "grad_norm": 0.8695082664489746, + "learning_rate": 9.994543002962892e-05, + "loss": 1.173, + "step": 4660 + }, + { + "epoch": 0.029835298927973628, + 
"grad_norm": 1.3696662187576294, + "learning_rate": 9.994519541369119e-05, + "loss": 0.8384, + "step": 4670 + }, + { + "epoch": 0.029899186077712327, + "grad_norm": 0.6377172470092773, + "learning_rate": 9.994496029476213e-05, + "loss": 0.8018, + "step": 4680 + }, + { + "epoch": 0.02996307322745103, + "grad_norm": 1.396103858947754, + "learning_rate": 9.99447246728441e-05, + "loss": 0.8777, + "step": 4690 + }, + { + "epoch": 0.030026960377189733, + "grad_norm": 0.741669774055481, + "learning_rate": 9.99444885479395e-05, + "loss": 1.1431, + "step": 4700 + }, + { + "epoch": 0.030090847526928435, + "grad_norm": 0.8591098189353943, + "learning_rate": 9.994425192005067e-05, + "loss": 1.0976, + "step": 4710 + }, + { + "epoch": 0.030154734676667135, + "grad_norm": 0.6573971509933472, + "learning_rate": 9.994401478918003e-05, + "loss": 0.9112, + "step": 4720 + }, + { + "epoch": 0.030218621826405837, + "grad_norm": 0.7204700708389282, + "learning_rate": 9.994377715532996e-05, + "loss": 0.8728, + "step": 4730 + }, + { + "epoch": 0.03028250897614454, + "grad_norm": 1.0097802877426147, + "learning_rate": 9.994353901850283e-05, + "loss": 1.0269, + "step": 4740 + }, + { + "epoch": 0.03034639612588324, + "grad_norm": 1.4376720190048218, + "learning_rate": 9.994330037870107e-05, + "loss": 0.8102, + "step": 4750 + }, + { + "epoch": 0.03041028327562194, + "grad_norm": 0.7325295209884644, + "learning_rate": 9.994306123592704e-05, + "loss": 0.9336, + "step": 4760 + }, + { + "epoch": 0.030474170425360644, + "grad_norm": 0.7449788451194763, + "learning_rate": 9.994282159018323e-05, + "loss": 0.8539, + "step": 4770 + }, + { + "epoch": 0.030538057575099344, + "grad_norm": 0.7632824778556824, + "learning_rate": 9.994258144147195e-05, + "loss": 0.6919, + "step": 4780 + }, + { + "epoch": 0.030601944724838046, + "grad_norm": 0.9371885657310486, + "learning_rate": 9.99423407897957e-05, + "loss": 0.8518, + "step": 4790 + }, + { + "epoch": 0.03066583187457675, + "grad_norm": 0.9703862071037292, + "learning_rate": 9.994209963515684e-05, + "loss": 0.8882, + "step": 4800 + }, + { + "epoch": 0.030729719024315448, + "grad_norm": 0.6255933046340942, + "learning_rate": 9.994185797755787e-05, + "loss": 0.9969, + "step": 4810 + }, + { + "epoch": 0.03079360617405415, + "grad_norm": 2.328423261642456, + "learning_rate": 9.994161581700115e-05, + "loss": 0.8677, + "step": 4820 + }, + { + "epoch": 0.030857493323792853, + "grad_norm": 0.8444818258285522, + "learning_rate": 9.994137315348917e-05, + "loss": 0.9273, + "step": 4830 + }, + { + "epoch": 0.030921380473531553, + "grad_norm": 0.6778904795646667, + "learning_rate": 9.994112998702434e-05, + "loss": 1.136, + "step": 4840 + }, + { + "epoch": 0.030985267623270255, + "grad_norm": 0.7254196405410767, + "learning_rate": 9.994088631760914e-05, + "loss": 0.9659, + "step": 4850 + }, + { + "epoch": 0.031049154773008958, + "grad_norm": 2.3594653606414795, + "learning_rate": 9.994064214524602e-05, + "loss": 0.9981, + "step": 4860 + }, + { + "epoch": 0.031113041922747657, + "grad_norm": 0.9346766471862793, + "learning_rate": 9.994039746993742e-05, + "loss": 1.2296, + "step": 4870 + }, + { + "epoch": 0.03117692907248636, + "grad_norm": 1.1860244274139404, + "learning_rate": 9.994015229168581e-05, + "loss": 1.0124, + "step": 4880 + }, + { + "epoch": 0.031240816222225062, + "grad_norm": 0.977857232093811, + "learning_rate": 9.993990661049366e-05, + "loss": 0.8632, + "step": 4890 + }, + { + "epoch": 0.03130470337196376, + "grad_norm": 0.9144421815872192, + "learning_rate": 9.993966042636345e-05, 
+ "loss": 0.9927, + "step": 4900 + }, + { + "epoch": 0.03136859052170247, + "grad_norm": 1.034429669380188, + "learning_rate": 9.993941373929764e-05, + "loss": 0.8818, + "step": 4910 + }, + { + "epoch": 0.03143247767144117, + "grad_norm": 0.4996863007545471, + "learning_rate": 9.993916654929876e-05, + "loss": 0.6711, + "step": 4920 + }, + { + "epoch": 0.031496364821179866, + "grad_norm": 0.6924141049385071, + "learning_rate": 9.993891885636925e-05, + "loss": 0.9002, + "step": 4930 + }, + { + "epoch": 0.03156025197091857, + "grad_norm": 0.7536648511886597, + "learning_rate": 9.993867066051163e-05, + "loss": 1.0268, + "step": 4940 + }, + { + "epoch": 0.03162413912065727, + "grad_norm": 1.059717059135437, + "learning_rate": 9.993842196172838e-05, + "loss": 1.4731, + "step": 4950 + }, + { + "epoch": 0.03168802627039597, + "grad_norm": 0.9447365999221802, + "learning_rate": 9.993817276002203e-05, + "loss": 0.8936, + "step": 4960 + }, + { + "epoch": 0.03175191342013468, + "grad_norm": 2.9407436847686768, + "learning_rate": 9.993792305539507e-05, + "loss": 0.9535, + "step": 4970 + }, + { + "epoch": 0.031815800569873376, + "grad_norm": 0.9434256553649902, + "learning_rate": 9.993767284785003e-05, + "loss": 0.9241, + "step": 4980 + }, + { + "epoch": 0.031879687719612075, + "grad_norm": 0.5843566060066223, + "learning_rate": 9.993742213738942e-05, + "loss": 1.1005, + "step": 4990 + }, + { + "epoch": 0.03194357486935078, + "grad_norm": 0.5183364748954773, + "learning_rate": 9.993717092401577e-05, + "loss": 1.0861, + "step": 5000 + }, + { + "epoch": 0.03200746201908948, + "grad_norm": 0.716195821762085, + "learning_rate": 9.99369192077316e-05, + "loss": 1.0468, + "step": 5010 + }, + { + "epoch": 0.03207134916882818, + "grad_norm": 0.6783444285392761, + "learning_rate": 9.993666698853946e-05, + "loss": 0.9456, + "step": 5020 + }, + { + "epoch": 0.032135236318566886, + "grad_norm": 0.8905858397483826, + "learning_rate": 9.99364142664419e-05, + "loss": 0.9607, + "step": 5030 + }, + { + "epoch": 0.032199123468305585, + "grad_norm": 1.1394882202148438, + "learning_rate": 9.993616104144141e-05, + "loss": 0.7845, + "step": 5040 + }, + { + "epoch": 0.032263010618044284, + "grad_norm": 0.9417553544044495, + "learning_rate": 9.99359073135406e-05, + "loss": 0.9869, + "step": 5050 + }, + { + "epoch": 0.03232689776778299, + "grad_norm": 0.6557328104972839, + "learning_rate": 9.993565308274199e-05, + "loss": 1.132, + "step": 5060 + }, + { + "epoch": 0.03239078491752169, + "grad_norm": 1.505283236503601, + "learning_rate": 9.993539834904816e-05, + "loss": 0.6938, + "step": 5070 + }, + { + "epoch": 0.03245467206726039, + "grad_norm": 0.7740111947059631, + "learning_rate": 9.993514311246166e-05, + "loss": 0.9475, + "step": 5080 + }, + { + "epoch": 0.032518559216999095, + "grad_norm": 1.1379529237747192, + "learning_rate": 9.993488737298509e-05, + "loss": 0.7626, + "step": 5090 + }, + { + "epoch": 0.032582446366737794, + "grad_norm": 0.5552259683609009, + "learning_rate": 9.993463113062099e-05, + "loss": 0.9058, + "step": 5100 + }, + { + "epoch": 0.03264633351647649, + "grad_norm": 0.7772766351699829, + "learning_rate": 9.993437438537194e-05, + "loss": 1.0914, + "step": 5110 + }, + { + "epoch": 0.0327102206662152, + "grad_norm": 0.7294765114784241, + "learning_rate": 9.993411713724056e-05, + "loss": 0.9447, + "step": 5120 + }, + { + "epoch": 0.0327741078159539, + "grad_norm": 0.8332342505455017, + "learning_rate": 9.993385938622942e-05, + "loss": 0.7607, + "step": 5130 + }, + { + "epoch": 0.0328379949656926, + 
"grad_norm": 0.759425163269043, + "learning_rate": 9.993360113234111e-05, + "loss": 0.8551, + "step": 5140 + }, + { + "epoch": 0.032901882115431304, + "grad_norm": 0.8883112668991089, + "learning_rate": 9.993334237557825e-05, + "loss": 0.815, + "step": 5150 + }, + { + "epoch": 0.03296576926517, + "grad_norm": 0.5959163308143616, + "learning_rate": 9.993308311594343e-05, + "loss": 1.0528, + "step": 5160 + }, + { + "epoch": 0.0330296564149087, + "grad_norm": 1.0523767471313477, + "learning_rate": 9.993282335343925e-05, + "loss": 1.0073, + "step": 5170 + }, + { + "epoch": 0.03309354356464741, + "grad_norm": 0.8208662271499634, + "learning_rate": 9.993256308806835e-05, + "loss": 0.8802, + "step": 5180 + }, + { + "epoch": 0.03315743071438611, + "grad_norm": 0.7097920775413513, + "learning_rate": 9.993230231983334e-05, + "loss": 1.0191, + "step": 5190 + }, + { + "epoch": 0.03322131786412481, + "grad_norm": 0.7505048513412476, + "learning_rate": 9.993204104873686e-05, + "loss": 1.0811, + "step": 5200 + }, + { + "epoch": 0.03328520501386351, + "grad_norm": 0.9009354710578918, + "learning_rate": 9.993177927478152e-05, + "loss": 0.9172, + "step": 5210 + }, + { + "epoch": 0.03334909216360221, + "grad_norm": 0.681164562702179, + "learning_rate": 9.993151699796996e-05, + "loss": 0.8789, + "step": 5220 + }, + { + "epoch": 0.03341297931334091, + "grad_norm": 0.9279341101646423, + "learning_rate": 9.993125421830484e-05, + "loss": 0.7841, + "step": 5230 + }, + { + "epoch": 0.03347686646307962, + "grad_norm": 0.8030073642730713, + "learning_rate": 9.993099093578879e-05, + "loss": 1.1084, + "step": 5240 + }, + { + "epoch": 0.033540753612818316, + "grad_norm": 0.8783805966377258, + "learning_rate": 9.993072715042447e-05, + "loss": 0.9935, + "step": 5250 + }, + { + "epoch": 0.03360464076255702, + "grad_norm": 1.2054526805877686, + "learning_rate": 9.99304628622145e-05, + "loss": 1.0037, + "step": 5260 + }, + { + "epoch": 0.03366852791229572, + "grad_norm": 0.7649316787719727, + "learning_rate": 9.99301980711616e-05, + "loss": 0.7857, + "step": 5270 + }, + { + "epoch": 0.03373241506203442, + "grad_norm": 1.0451691150665283, + "learning_rate": 9.992993277726841e-05, + "loss": 1.0657, + "step": 5280 + }, + { + "epoch": 0.03379630221177313, + "grad_norm": 1.1677067279815674, + "learning_rate": 9.99296669805376e-05, + "loss": 0.8225, + "step": 5290 + }, + { + "epoch": 0.033860189361511826, + "grad_norm": 0.8038674592971802, + "learning_rate": 9.992940068097184e-05, + "loss": 0.8793, + "step": 5300 + }, + { + "epoch": 0.033924076511250525, + "grad_norm": 0.8285770416259766, + "learning_rate": 9.992913387857383e-05, + "loss": 1.175, + "step": 5310 + }, + { + "epoch": 0.03398796366098923, + "grad_norm": 1.8478131294250488, + "learning_rate": 9.992886657334624e-05, + "loss": 1.1025, + "step": 5320 + }, + { + "epoch": 0.03405185081072793, + "grad_norm": 0.6567774415016174, + "learning_rate": 9.992859876529177e-05, + "loss": 0.979, + "step": 5330 + }, + { + "epoch": 0.03411573796046663, + "grad_norm": 1.635343074798584, + "learning_rate": 9.992833045441312e-05, + "loss": 0.9373, + "step": 5340 + }, + { + "epoch": 0.034179625110205336, + "grad_norm": 0.6428894400596619, + "learning_rate": 9.992806164071298e-05, + "loss": 0.9726, + "step": 5350 + }, + { + "epoch": 0.034243512259944035, + "grad_norm": 0.9768702983856201, + "learning_rate": 9.992779232419407e-05, + "loss": 1.1691, + "step": 5360 + }, + { + "epoch": 0.034307399409682734, + "grad_norm": 0.9969322681427002, + "learning_rate": 9.99275225048591e-05, + "loss": 
0.9453, + "step": 5370 + }, + { + "epoch": 0.03437128655942144, + "grad_norm": 1.498533010482788, + "learning_rate": 9.992725218271078e-05, + "loss": 0.9161, + "step": 5380 + }, + { + "epoch": 0.03443517370916014, + "grad_norm": 0.6910355687141418, + "learning_rate": 9.992698135775185e-05, + "loss": 0.8751, + "step": 5390 + }, + { + "epoch": 0.03449906085889884, + "grad_norm": 0.7530591487884521, + "learning_rate": 9.992671002998502e-05, + "loss": 1.0573, + "step": 5400 + }, + { + "epoch": 0.034562948008637545, + "grad_norm": 0.9451344013214111, + "learning_rate": 9.992643819941301e-05, + "loss": 0.8682, + "step": 5410 + }, + { + "epoch": 0.034626835158376244, + "grad_norm": 1.7209718227386475, + "learning_rate": 9.992616586603859e-05, + "loss": 0.8826, + "step": 5420 + }, + { + "epoch": 0.034690722308114944, + "grad_norm": 0.7069958448410034, + "learning_rate": 9.992589302986448e-05, + "loss": 0.8965, + "step": 5430 + }, + { + "epoch": 0.03475460945785365, + "grad_norm": 0.6233651041984558, + "learning_rate": 9.992561969089345e-05, + "loss": 0.9789, + "step": 5440 + }, + { + "epoch": 0.03481849660759235, + "grad_norm": 0.7849096655845642, + "learning_rate": 9.992534584912823e-05, + "loss": 1.0208, + "step": 5450 + }, + { + "epoch": 0.03488238375733105, + "grad_norm": 0.7504194378852844, + "learning_rate": 9.992507150457158e-05, + "loss": 0.7951, + "step": 5460 + }, + { + "epoch": 0.034946270907069754, + "grad_norm": 1.141536831855774, + "learning_rate": 9.992479665722627e-05, + "loss": 0.7366, + "step": 5470 + }, + { + "epoch": 0.03501015805680845, + "grad_norm": 0.8907060623168945, + "learning_rate": 9.992452130709507e-05, + "loss": 1.1784, + "step": 5480 + }, + { + "epoch": 0.03507404520654715, + "grad_norm": 0.9252203106880188, + "learning_rate": 9.992424545418074e-05, + "loss": 0.9195, + "step": 5490 + }, + { + "epoch": 0.03513793235628586, + "grad_norm": 0.9670997262001038, + "learning_rate": 9.992396909848608e-05, + "loss": 0.8106, + "step": 5500 + }, + { + "epoch": 0.03520181950602456, + "grad_norm": 0.9867545962333679, + "learning_rate": 9.992369224001386e-05, + "loss": 0.8976, + "step": 5510 + }, + { + "epoch": 0.03526570665576326, + "grad_norm": 1.0230097770690918, + "learning_rate": 9.992341487876686e-05, + "loss": 0.8986, + "step": 5520 + }, + { + "epoch": 0.03532959380550196, + "grad_norm": 0.7679455876350403, + "learning_rate": 9.99231370147479e-05, + "loss": 0.9554, + "step": 5530 + }, + { + "epoch": 0.03539348095524066, + "grad_norm": 0.6599009037017822, + "learning_rate": 9.992285864795974e-05, + "loss": 0.8623, + "step": 5540 + }, + { + "epoch": 0.03545736810497936, + "grad_norm": 1.114585041999817, + "learning_rate": 9.992257977840521e-05, + "loss": 1.0822, + "step": 5550 + }, + { + "epoch": 0.03552125525471807, + "grad_norm": 0.6967979073524475, + "learning_rate": 9.992230040608713e-05, + "loss": 1.0806, + "step": 5560 + }, + { + "epoch": 0.03558514240445677, + "grad_norm": 2.6597609519958496, + "learning_rate": 9.992202053100826e-05, + "loss": 0.958, + "step": 5570 + }, + { + "epoch": 0.035649029554195466, + "grad_norm": 0.7488609552383423, + "learning_rate": 9.992174015317148e-05, + "loss": 0.6722, + "step": 5580 + }, + { + "epoch": 0.03571291670393417, + "grad_norm": 1.290249228477478, + "learning_rate": 9.992145927257958e-05, + "loss": 1.1259, + "step": 5590 + }, + { + "epoch": 0.03577680385367287, + "grad_norm": 0.7017959952354431, + "learning_rate": 9.99211778892354e-05, + "loss": 0.9599, + "step": 5600 + }, + { + "epoch": 0.03584069100341157, + "grad_norm": 
0.6516076922416687, + "learning_rate": 9.992089600314179e-05, + "loss": 1.0698, + "step": 5610 + }, + { + "epoch": 0.03590457815315028, + "grad_norm": 0.860114336013794, + "learning_rate": 9.992061361430153e-05, + "loss": 0.8568, + "step": 5620 + }, + { + "epoch": 0.035968465302888976, + "grad_norm": 0.6573166847229004, + "learning_rate": 9.992033072271754e-05, + "loss": 0.9076, + "step": 5630 + }, + { + "epoch": 0.036032352452627675, + "grad_norm": 1.0699505805969238, + "learning_rate": 9.992004732839261e-05, + "loss": 0.8982, + "step": 5640 + }, + { + "epoch": 0.03609623960236638, + "grad_norm": 0.8025882840156555, + "learning_rate": 9.991976343132963e-05, + "loss": 0.9928, + "step": 5650 + }, + { + "epoch": 0.03616012675210508, + "grad_norm": 0.7112436294555664, + "learning_rate": 9.991947903153143e-05, + "loss": 1.0748, + "step": 5660 + }, + { + "epoch": 0.036224013901843787, + "grad_norm": 0.8061192631721497, + "learning_rate": 9.991919412900091e-05, + "loss": 1.0776, + "step": 5670 + }, + { + "epoch": 0.036287901051582486, + "grad_norm": 3.550689220428467, + "learning_rate": 9.99189087237409e-05, + "loss": 0.8897, + "step": 5680 + }, + { + "epoch": 0.036351788201321185, + "grad_norm": 0.6956158876419067, + "learning_rate": 9.991862281575431e-05, + "loss": 0.9601, + "step": 5690 + }, + { + "epoch": 0.03641567535105989, + "grad_norm": 2.4917612075805664, + "learning_rate": 9.991833640504397e-05, + "loss": 1.2047, + "step": 5700 + }, + { + "epoch": 0.03647956250079859, + "grad_norm": 0.8588683009147644, + "learning_rate": 9.991804949161284e-05, + "loss": 0.8791, + "step": 5710 + }, + { + "epoch": 0.03654344965053729, + "grad_norm": 1.8225440979003906, + "learning_rate": 9.991776207546373e-05, + "loss": 1.1723, + "step": 5720 + }, + { + "epoch": 0.036607336800275996, + "grad_norm": 0.6750584244728088, + "learning_rate": 9.991747415659959e-05, + "loss": 1.0424, + "step": 5730 + }, + { + "epoch": 0.036671223950014695, + "grad_norm": 1.0814725160598755, + "learning_rate": 9.99171857350233e-05, + "loss": 0.7245, + "step": 5740 + }, + { + "epoch": 0.036735111099753394, + "grad_norm": 0.6731589436531067, + "learning_rate": 9.991689681073776e-05, + "loss": 0.7107, + "step": 5750 + }, + { + "epoch": 0.0367989982494921, + "grad_norm": 1.090672492980957, + "learning_rate": 9.991660738374589e-05, + "loss": 1.1092, + "step": 5760 + }, + { + "epoch": 0.0368628853992308, + "grad_norm": 0.9638064503669739, + "learning_rate": 9.991631745405059e-05, + "loss": 1.0152, + "step": 5770 + }, + { + "epoch": 0.0369267725489695, + "grad_norm": 0.6535985469818115, + "learning_rate": 9.99160270216548e-05, + "loss": 0.9731, + "step": 5780 + }, + { + "epoch": 0.036990659698708205, + "grad_norm": 0.8303619623184204, + "learning_rate": 9.991573608656144e-05, + "loss": 1.0109, + "step": 5790 + }, + { + "epoch": 0.037054546848446904, + "grad_norm": 0.8238627910614014, + "learning_rate": 9.991544464877342e-05, + "loss": 1.1488, + "step": 5800 + }, + { + "epoch": 0.0371184339981856, + "grad_norm": 0.7430026531219482, + "learning_rate": 9.991515270829369e-05, + "loss": 0.9808, + "step": 5810 + }, + { + "epoch": 0.03718232114792431, + "grad_norm": 1.1487149000167847, + "learning_rate": 9.99148602651252e-05, + "loss": 0.8169, + "step": 5820 + }, + { + "epoch": 0.03724620829766301, + "grad_norm": 0.8699382543563843, + "learning_rate": 9.991456731927087e-05, + "loss": 0.9892, + "step": 5830 + }, + { + "epoch": 0.03731009544740171, + "grad_norm": 0.92801833152771, + "learning_rate": 9.991427387073367e-05, + "loss": 1.1314, + 
"step": 5840 + }, + { + "epoch": 0.037373982597140414, + "grad_norm": 0.9899303913116455, + "learning_rate": 9.991397991951656e-05, + "loss": 0.7899, + "step": 5850 + }, + { + "epoch": 0.03743786974687911, + "grad_norm": 0.6273317933082581, + "learning_rate": 9.991368546562249e-05, + "loss": 1.0946, + "step": 5860 + }, + { + "epoch": 0.03750175689661781, + "grad_norm": 1.1781492233276367, + "learning_rate": 9.991339050905442e-05, + "loss": 0.9631, + "step": 5870 + }, + { + "epoch": 0.03756564404635652, + "grad_norm": 1.5557823181152344, + "learning_rate": 9.991309504981533e-05, + "loss": 0.8755, + "step": 5880 + }, + { + "epoch": 0.03762953119609522, + "grad_norm": 1.418256402015686, + "learning_rate": 9.991279908790818e-05, + "loss": 1.0737, + "step": 5890 + }, + { + "epoch": 0.037693418345833916, + "grad_norm": 1.275620460510254, + "learning_rate": 9.991250262333597e-05, + "loss": 0.7169, + "step": 5900 + }, + { + "epoch": 0.03775730549557262, + "grad_norm": 0.9257436394691467, + "learning_rate": 9.991220565610169e-05, + "loss": 1.0117, + "step": 5910 + }, + { + "epoch": 0.03782119264531132, + "grad_norm": 0.6086337566375732, + "learning_rate": 9.99119081862083e-05, + "loss": 0.9319, + "step": 5920 + }, + { + "epoch": 0.03788507979505002, + "grad_norm": 1.3489453792572021, + "learning_rate": 9.991161021365882e-05, + "loss": 1.1381, + "step": 5930 + }, + { + "epoch": 0.03794896694478873, + "grad_norm": 0.7379159927368164, + "learning_rate": 9.991131173845624e-05, + "loss": 1.1553, + "step": 5940 + }, + { + "epoch": 0.038012854094527426, + "grad_norm": 0.8401197195053101, + "learning_rate": 9.991101276060358e-05, + "loss": 0.8074, + "step": 5950 + }, + { + "epoch": 0.038076741244266125, + "grad_norm": 1.0958367586135864, + "learning_rate": 9.991071328010384e-05, + "loss": 1.1319, + "step": 5960 + }, + { + "epoch": 0.03814062839400483, + "grad_norm": 0.9215190410614014, + "learning_rate": 9.991041329696005e-05, + "loss": 1.1632, + "step": 5970 + }, + { + "epoch": 0.03820451554374353, + "grad_norm": 1.5827072858810425, + "learning_rate": 9.991011281117521e-05, + "loss": 0.9153, + "step": 5980 + }, + { + "epoch": 0.03826840269348223, + "grad_norm": 0.67779141664505, + "learning_rate": 9.990981182275236e-05, + "loss": 0.968, + "step": 5990 + }, + { + "epoch": 0.038332289843220936, + "grad_norm": 1.1568547487258911, + "learning_rate": 9.990951033169451e-05, + "loss": 0.9781, + "step": 6000 + }, + { + "epoch": 0.038396176992959635, + "grad_norm": 0.7177845239639282, + "learning_rate": 9.990920833800472e-05, + "loss": 0.9362, + "step": 6010 + }, + { + "epoch": 0.038460064142698334, + "grad_norm": 0.7867560982704163, + "learning_rate": 9.990890584168604e-05, + "loss": 0.8053, + "step": 6020 + }, + { + "epoch": 0.03852395129243704, + "grad_norm": 0.9753761887550354, + "learning_rate": 9.990860284274148e-05, + "loss": 0.9772, + "step": 6030 + }, + { + "epoch": 0.03858783844217574, + "grad_norm": 1.043918490409851, + "learning_rate": 9.990829934117413e-05, + "loss": 1.0062, + "step": 6040 + }, + { + "epoch": 0.03865172559191444, + "grad_norm": 0.6653173565864563, + "learning_rate": 9.990799533698703e-05, + "loss": 0.946, + "step": 6050 + }, + { + "epoch": 0.038715612741653145, + "grad_norm": 0.6706075072288513, + "learning_rate": 9.990769083018322e-05, + "loss": 0.9202, + "step": 6060 + }, + { + "epoch": 0.038779499891391844, + "grad_norm": 1.005500078201294, + "learning_rate": 9.99073858207658e-05, + "loss": 1.2583, + "step": 6070 + }, + { + "epoch": 0.03884338704113055, + "grad_norm": 
0.9135782122612, + "learning_rate": 9.990708030873783e-05, + "loss": 1.1592, + "step": 6080 + }, + { + "epoch": 0.03890727419086925, + "grad_norm": 0.8927890658378601, + "learning_rate": 9.990677429410237e-05, + "loss": 1.0624, + "step": 6090 + }, + { + "epoch": 0.03897116134060795, + "grad_norm": 1.1654282808303833, + "learning_rate": 9.990646777686255e-05, + "loss": 0.8439, + "step": 6100 + }, + { + "epoch": 0.039035048490346655, + "grad_norm": 0.5983591079711914, + "learning_rate": 9.99061607570214e-05, + "loss": 0.8542, + "step": 6110 + }, + { + "epoch": 0.039098935640085354, + "grad_norm": 0.9841302633285522, + "learning_rate": 9.990585323458204e-05, + "loss": 1.0852, + "step": 6120 + }, + { + "epoch": 0.03916282278982405, + "grad_norm": 1.078748106956482, + "learning_rate": 9.990554520954755e-05, + "loss": 0.8696, + "step": 6130 + }, + { + "epoch": 0.03922670993956276, + "grad_norm": 0.9046047925949097, + "learning_rate": 9.990523668192106e-05, + "loss": 0.9837, + "step": 6140 + }, + { + "epoch": 0.03929059708930146, + "grad_norm": 0.6112083196640015, + "learning_rate": 9.990492765170567e-05, + "loss": 1.445, + "step": 6150 + }, + { + "epoch": 0.03935448423904016, + "grad_norm": 0.8192219138145447, + "learning_rate": 9.990461811890447e-05, + "loss": 0.7521, + "step": 6160 + }, + { + "epoch": 0.039418371388778864, + "grad_norm": 1.2310230731964111, + "learning_rate": 9.99043080835206e-05, + "loss": 0.7873, + "step": 6170 + }, + { + "epoch": 0.03948225853851756, + "grad_norm": 0.5166013836860657, + "learning_rate": 9.990399754555717e-05, + "loss": 1.0726, + "step": 6180 + }, + { + "epoch": 0.03954614568825626, + "grad_norm": 0.8496847748756409, + "learning_rate": 9.990368650501731e-05, + "loss": 0.8312, + "step": 6190 + }, + { + "epoch": 0.03961003283799497, + "grad_norm": 1.445300579071045, + "learning_rate": 9.990337496190416e-05, + "loss": 0.8953, + "step": 6200 + }, + { + "epoch": 0.03967391998773367, + "grad_norm": 2.797938108444214, + "learning_rate": 9.990306291622085e-05, + "loss": 0.8305, + "step": 6210 + }, + { + "epoch": 0.03973780713747237, + "grad_norm": 0.5867908596992493, + "learning_rate": 9.990275036797054e-05, + "loss": 0.7997, + "step": 6220 + }, + { + "epoch": 0.03980169428721107, + "grad_norm": 0.5474823713302612, + "learning_rate": 9.990243731715634e-05, + "loss": 1.3339, + "step": 6230 + }, + { + "epoch": 0.03986558143694977, + "grad_norm": 1.1061484813690186, + "learning_rate": 9.990212376378143e-05, + "loss": 0.8513, + "step": 6240 + }, + { + "epoch": 0.03992946858668847, + "grad_norm": 1.0674853324890137, + "learning_rate": 9.990180970784897e-05, + "loss": 1.0124, + "step": 6250 + }, + { + "epoch": 0.03999335573642718, + "grad_norm": 0.7848487496376038, + "learning_rate": 9.99014951493621e-05, + "loss": 0.9535, + "step": 6260 + }, + { + "epoch": 0.04005724288616588, + "grad_norm": 0.7292889356613159, + "learning_rate": 9.9901180088324e-05, + "loss": 1.1792, + "step": 6270 + }, + { + "epoch": 0.040121130035904576, + "grad_norm": 0.7035486698150635, + "learning_rate": 9.990086452473785e-05, + "loss": 0.8471, + "step": 6280 + }, + { + "epoch": 0.04018501718564328, + "grad_norm": 0.6115634441375732, + "learning_rate": 9.990054845860683e-05, + "loss": 1.1244, + "step": 6290 + }, + { + "epoch": 0.04024890433538198, + "grad_norm": 2.171461582183838, + "learning_rate": 9.990023188993412e-05, + "loss": 1.0045, + "step": 6300 + }, + { + "epoch": 0.04031279148512068, + "grad_norm": 0.8362821936607361, + "learning_rate": 9.989991481872292e-05, + "loss": 1.0352, + "step": 
6310 + }, + { + "epoch": 0.040376678634859386, + "grad_norm": 0.8392160534858704, + "learning_rate": 9.989959724497638e-05, + "loss": 0.785, + "step": 6320 + }, + { + "epoch": 0.040440565784598086, + "grad_norm": 0.4593855142593384, + "learning_rate": 9.989927916869773e-05, + "loss": 0.8819, + "step": 6330 + }, + { + "epoch": 0.040504452934336785, + "grad_norm": 0.6949111223220825, + "learning_rate": 9.98989605898902e-05, + "loss": 0.9824, + "step": 6340 + }, + { + "epoch": 0.04056834008407549, + "grad_norm": 0.6681846976280212, + "learning_rate": 9.989864150855693e-05, + "loss": 0.7795, + "step": 6350 + }, + { + "epoch": 0.04063222723381419, + "grad_norm": 0.9278548359870911, + "learning_rate": 9.989832192470118e-05, + "loss": 0.9975, + "step": 6360 + }, + { + "epoch": 0.04069611438355289, + "grad_norm": 0.7522639632225037, + "learning_rate": 9.989800183832616e-05, + "loss": 1.0204, + "step": 6370 + }, + { + "epoch": 0.040760001533291595, + "grad_norm": 0.9609561562538147, + "learning_rate": 9.98976812494351e-05, + "loss": 1.0157, + "step": 6380 + }, + { + "epoch": 0.040823888683030295, + "grad_norm": 0.7092857956886292, + "learning_rate": 9.989736015803123e-05, + "loss": 0.9443, + "step": 6390 + }, + { + "epoch": 0.040887775832768994, + "grad_norm": 4.257565498352051, + "learning_rate": 9.989703856411776e-05, + "loss": 1.134, + "step": 6400 + }, + { + "epoch": 0.0409516629825077, + "grad_norm": 1.1755651235580444, + "learning_rate": 9.989671646769796e-05, + "loss": 1.1108, + "step": 6410 + }, + { + "epoch": 0.0410155501322464, + "grad_norm": 0.8459087610244751, + "learning_rate": 9.989639386877505e-05, + "loss": 1.0194, + "step": 6420 + }, + { + "epoch": 0.0410794372819851, + "grad_norm": 1.175000786781311, + "learning_rate": 9.989607076735229e-05, + "loss": 0.8072, + "step": 6430 + }, + { + "epoch": 0.041143324431723804, + "grad_norm": 1.2269272804260254, + "learning_rate": 9.989574716343294e-05, + "loss": 1.1758, + "step": 6440 + }, + { + "epoch": 0.041207211581462504, + "grad_norm": 0.7292816042900085, + "learning_rate": 9.989542305702022e-05, + "loss": 0.9037, + "step": 6450 + }, + { + "epoch": 0.0412710987312012, + "grad_norm": 1.1013445854187012, + "learning_rate": 9.989509844811745e-05, + "loss": 0.7594, + "step": 6460 + }, + { + "epoch": 0.04133498588093991, + "grad_norm": 1.5162911415100098, + "learning_rate": 9.989477333672787e-05, + "loss": 0.8458, + "step": 6470 + }, + { + "epoch": 0.04139887303067861, + "grad_norm": 0.5727777481079102, + "learning_rate": 9.989444772285475e-05, + "loss": 1.0281, + "step": 6480 + }, + { + "epoch": 0.041462760180417314, + "grad_norm": 0.940905749797821, + "learning_rate": 9.989412160650137e-05, + "loss": 0.8714, + "step": 6490 + }, + { + "epoch": 0.041526647330156014, + "grad_norm": 1.0898019075393677, + "learning_rate": 9.989379498767104e-05, + "loss": 0.8905, + "step": 6500 + }, + { + "epoch": 0.04159053447989471, + "grad_norm": 1.05965256690979, + "learning_rate": 9.989346786636701e-05, + "loss": 1.0419, + "step": 6510 + }, + { + "epoch": 0.04165442162963342, + "grad_norm": 1.0670409202575684, + "learning_rate": 9.989314024259262e-05, + "loss": 0.7306, + "step": 6520 + }, + { + "epoch": 0.04171830877937212, + "grad_norm": 0.9134021401405334, + "learning_rate": 9.989281211635114e-05, + "loss": 0.9002, + "step": 6530 + }, + { + "epoch": 0.04178219592911082, + "grad_norm": 0.9163311719894409, + "learning_rate": 9.989248348764586e-05, + "loss": 0.9131, + "step": 6540 + }, + { + "epoch": 0.04184608307884952, + "grad_norm": 0.6874496936798096, + 
"learning_rate": 9.989215435648011e-05, + "loss": 0.9497, + "step": 6550 + }, + { + "epoch": 0.04190997022858822, + "grad_norm": 0.9504197239875793, + "learning_rate": 9.989182472285721e-05, + "loss": 1.06, + "step": 6560 + }, + { + "epoch": 0.04197385737832692, + "grad_norm": 0.794982373714447, + "learning_rate": 9.989149458678046e-05, + "loss": 0.8137, + "step": 6570 + }, + { + "epoch": 0.04203774452806563, + "grad_norm": 0.9030359983444214, + "learning_rate": 9.989116394825322e-05, + "loss": 0.7989, + "step": 6580 + }, + { + "epoch": 0.04210163167780433, + "grad_norm": 0.7701511979103088, + "learning_rate": 9.989083280727878e-05, + "loss": 1.0566, + "step": 6590 + }, + { + "epoch": 0.042165518827543026, + "grad_norm": 0.8130073547363281, + "learning_rate": 9.98905011638605e-05, + "loss": 0.9397, + "step": 6600 + }, + { + "epoch": 0.04222940597728173, + "grad_norm": 0.6246233582496643, + "learning_rate": 9.989016901800171e-05, + "loss": 0.8776, + "step": 6610 + }, + { + "epoch": 0.04229329312702043, + "grad_norm": 0.7861520648002625, + "learning_rate": 9.988983636970576e-05, + "loss": 1.0794, + "step": 6620 + }, + { + "epoch": 0.04235718027675913, + "grad_norm": 1.3345977067947388, + "learning_rate": 9.988950321897599e-05, + "loss": 0.8676, + "step": 6630 + }, + { + "epoch": 0.04242106742649784, + "grad_norm": 0.56337571144104, + "learning_rate": 9.988916956581577e-05, + "loss": 0.8426, + "step": 6640 + }, + { + "epoch": 0.042484954576236536, + "grad_norm": 1.3534024953842163, + "learning_rate": 9.988883541022844e-05, + "loss": 0.7897, + "step": 6650 + }, + { + "epoch": 0.042548841725975235, + "grad_norm": 1.3062078952789307, + "learning_rate": 9.988850075221738e-05, + "loss": 1.1495, + "step": 6660 + }, + { + "epoch": 0.04261272887571394, + "grad_norm": 0.8563300967216492, + "learning_rate": 9.988816559178597e-05, + "loss": 0.7691, + "step": 6670 + }, + { + "epoch": 0.04267661602545264, + "grad_norm": 0.6267048120498657, + "learning_rate": 9.988782992893757e-05, + "loss": 0.9558, + "step": 6680 + }, + { + "epoch": 0.04274050317519134, + "grad_norm": 1.3723206520080566, + "learning_rate": 9.988749376367556e-05, + "loss": 0.9185, + "step": 6690 + }, + { + "epoch": 0.042804390324930046, + "grad_norm": 1.9447133541107178, + "learning_rate": 9.988715709600332e-05, + "loss": 1.0383, + "step": 6700 + }, + { + "epoch": 0.042868277474668745, + "grad_norm": 0.8852369785308838, + "learning_rate": 9.988681992592426e-05, + "loss": 0.8813, + "step": 6710 + }, + { + "epoch": 0.042932164624407444, + "grad_norm": 2.174041986465454, + "learning_rate": 9.988648225344177e-05, + "loss": 0.9662, + "step": 6720 + }, + { + "epoch": 0.04299605177414615, + "grad_norm": 1.9878665208816528, + "learning_rate": 9.988614407855924e-05, + "loss": 0.9924, + "step": 6730 + }, + { + "epoch": 0.04305993892388485, + "grad_norm": 0.9836265444755554, + "learning_rate": 9.988580540128008e-05, + "loss": 1.2755, + "step": 6740 + }, + { + "epoch": 0.04312382607362355, + "grad_norm": 0.999160647392273, + "learning_rate": 9.98854662216077e-05, + "loss": 0.9726, + "step": 6750 + }, + { + "epoch": 0.043187713223362255, + "grad_norm": 1.9516860246658325, + "learning_rate": 9.988512653954552e-05, + "loss": 0.7816, + "step": 6760 + }, + { + "epoch": 0.043251600373100954, + "grad_norm": 0.7745450735092163, + "learning_rate": 9.988478635509696e-05, + "loss": 0.7726, + "step": 6770 + }, + { + "epoch": 0.04331548752283965, + "grad_norm": 0.8929428458213806, + "learning_rate": 9.988444566826544e-05, + "loss": 1.0001, + "step": 6780 + }, + { 
+ "epoch": 0.04337937467257836, + "grad_norm": 0.895820140838623, + "learning_rate": 9.98841044790544e-05, + "loss": 0.8765, + "step": 6790 + }, + { + "epoch": 0.04344326182231706, + "grad_norm": 0.6711694598197937, + "learning_rate": 9.988376278746727e-05, + "loss": 0.9975, + "step": 6800 + }, + { + "epoch": 0.04350714897205576, + "grad_norm": 0.9492961764335632, + "learning_rate": 9.988342059350751e-05, + "loss": 1.0356, + "step": 6810 + }, + { + "epoch": 0.043571036121794464, + "grad_norm": 0.7187815308570862, + "learning_rate": 9.988307789717853e-05, + "loss": 0.8538, + "step": 6820 + }, + { + "epoch": 0.04363492327153316, + "grad_norm": 0.9014946222305298, + "learning_rate": 9.98827346984838e-05, + "loss": 1.0214, + "step": 6830 + }, + { + "epoch": 0.04369881042127186, + "grad_norm": 0.5608994960784912, + "learning_rate": 9.98823909974268e-05, + "loss": 0.8462, + "step": 6840 + }, + { + "epoch": 0.04376269757101057, + "grad_norm": 0.8809041976928711, + "learning_rate": 9.988204679401094e-05, + "loss": 0.813, + "step": 6850 + }, + { + "epoch": 0.04382658472074927, + "grad_norm": 0.7527191638946533, + "learning_rate": 9.988170208823972e-05, + "loss": 1.0194, + "step": 6860 + }, + { + "epoch": 0.04389047187048797, + "grad_norm": 0.7817595601081848, + "learning_rate": 9.988135688011662e-05, + "loss": 0.8165, + "step": 6870 + }, + { + "epoch": 0.04395435902022667, + "grad_norm": 0.8186140656471252, + "learning_rate": 9.988101116964508e-05, + "loss": 0.8789, + "step": 6880 + }, + { + "epoch": 0.04401824616996537, + "grad_norm": 0.6612401008605957, + "learning_rate": 9.988066495682863e-05, + "loss": 0.8621, + "step": 6890 + }, + { + "epoch": 0.04408213331970407, + "grad_norm": 0.8166273832321167, + "learning_rate": 9.988031824167073e-05, + "loss": 1.0722, + "step": 6900 + }, + { + "epoch": 0.04414602046944278, + "grad_norm": 1.0065597295761108, + "learning_rate": 9.987997102417486e-05, + "loss": 1.082, + "step": 6910 + }, + { + "epoch": 0.04420990761918148, + "grad_norm": 1.0010764598846436, + "learning_rate": 9.987962330434452e-05, + "loss": 0.8206, + "step": 6920 + }, + { + "epoch": 0.04427379476892018, + "grad_norm": 0.7217119932174683, + "learning_rate": 9.987927508218324e-05, + "loss": 0.8516, + "step": 6930 + }, + { + "epoch": 0.04433768191865888, + "grad_norm": 1.464766502380371, + "learning_rate": 9.987892635769449e-05, + "loss": 1.1353, + "step": 6940 + }, + { + "epoch": 0.04440156906839758, + "grad_norm": 0.887629508972168, + "learning_rate": 9.987857713088182e-05, + "loss": 0.8636, + "step": 6950 + }, + { + "epoch": 0.04446545621813629, + "grad_norm": 1.562030553817749, + "learning_rate": 9.987822740174871e-05, + "loss": 1.2412, + "step": 6960 + }, + { + "epoch": 0.044529343367874986, + "grad_norm": 0.6418665647506714, + "learning_rate": 9.987787717029871e-05, + "loss": 1.1301, + "step": 6970 + }, + { + "epoch": 0.044593230517613686, + "grad_norm": 0.7377752065658569, + "learning_rate": 9.987752643653533e-05, + "loss": 0.89, + "step": 6980 + }, + { + "epoch": 0.04465711766735239, + "grad_norm": 0.709084689617157, + "learning_rate": 9.987717520046211e-05, + "loss": 0.9194, + "step": 6990 + }, + { + "epoch": 0.04472100481709109, + "grad_norm": 0.7699615359306335, + "learning_rate": 9.98768234620826e-05, + "loss": 0.995, + "step": 7000 + }, + { + "epoch": 0.04478489196682979, + "grad_norm": 0.8531057238578796, + "learning_rate": 9.987647122140031e-05, + "loss": 0.8096, + "step": 7010 + }, + { + "epoch": 0.044848779116568496, + "grad_norm": 1.1459274291992188, + "learning_rate": 
9.987611847841883e-05, + "loss": 0.9038, + "step": 7020 + }, + { + "epoch": 0.044912666266307195, + "grad_norm": 0.966291606426239, + "learning_rate": 9.987576523314167e-05, + "loss": 0.9996, + "step": 7030 + }, + { + "epoch": 0.044976553416045895, + "grad_norm": 1.0549588203430176, + "learning_rate": 9.987541148557238e-05, + "loss": 0.7135, + "step": 7040 + }, + { + "epoch": 0.0450404405657846, + "grad_norm": 0.8475518226623535, + "learning_rate": 9.987505723571458e-05, + "loss": 0.7685, + "step": 7050 + }, + { + "epoch": 0.0451043277155233, + "grad_norm": 0.8754829168319702, + "learning_rate": 9.98747024835718e-05, + "loss": 0.9184, + "step": 7060 + }, + { + "epoch": 0.045168214865262, + "grad_norm": 0.8908385038375854, + "learning_rate": 9.987434722914762e-05, + "loss": 1.0456, + "step": 7070 + }, + { + "epoch": 0.045232102015000705, + "grad_norm": 0.9609813094139099, + "learning_rate": 9.987399147244562e-05, + "loss": 1.1562, + "step": 7080 + }, + { + "epoch": 0.045295989164739404, + "grad_norm": 0.681609034538269, + "learning_rate": 9.987363521346937e-05, + "loss": 0.8802, + "step": 7090 + }, + { + "epoch": 0.045359876314478104, + "grad_norm": 0.6809660792350769, + "learning_rate": 9.987327845222246e-05, + "loss": 0.9104, + "step": 7100 + }, + { + "epoch": 0.04542376346421681, + "grad_norm": 0.5972456932067871, + "learning_rate": 9.98729211887085e-05, + "loss": 0.9686, + "step": 7110 + }, + { + "epoch": 0.04548765061395551, + "grad_norm": 2.145796537399292, + "learning_rate": 9.987256342293108e-05, + "loss": 0.8764, + "step": 7120 + }, + { + "epoch": 0.04555153776369421, + "grad_norm": 1.2157313823699951, + "learning_rate": 9.98722051548938e-05, + "loss": 0.8955, + "step": 7130 + }, + { + "epoch": 0.045615424913432914, + "grad_norm": 0.8759172558784485, + "learning_rate": 9.987184638460026e-05, + "loss": 0.8679, + "step": 7140 + }, + { + "epoch": 0.04567931206317161, + "grad_norm": 1.0199391841888428, + "learning_rate": 9.987148711205408e-05, + "loss": 0.7592, + "step": 7150 + }, + { + "epoch": 0.04574319921291031, + "grad_norm": 0.7216569185256958, + "learning_rate": 9.98711273372589e-05, + "loss": 0.7954, + "step": 7160 + }, + { + "epoch": 0.04580708636264902, + "grad_norm": 1.0680534839630127, + "learning_rate": 9.98707670602183e-05, + "loss": 1.0779, + "step": 7170 + }, + { + "epoch": 0.04587097351238772, + "grad_norm": 0.9365562796592712, + "learning_rate": 9.987040628093594e-05, + "loss": 1.0918, + "step": 7180 + }, + { + "epoch": 0.04593486066212642, + "grad_norm": 1.0162864923477173, + "learning_rate": 9.987004499941545e-05, + "loss": 0.791, + "step": 7190 + }, + { + "epoch": 0.04599874781186512, + "grad_norm": 0.9427816271781921, + "learning_rate": 9.986968321566045e-05, + "loss": 0.8263, + "step": 7200 + }, + { + "epoch": 0.04606263496160382, + "grad_norm": 0.9530696868896484, + "learning_rate": 9.98693209296746e-05, + "loss": 1.0719, + "step": 7210 + }, + { + "epoch": 0.04612652211134252, + "grad_norm": 0.687778890132904, + "learning_rate": 9.986895814146156e-05, + "loss": 0.8541, + "step": 7220 + }, + { + "epoch": 0.04619040926108123, + "grad_norm": 0.8100598454475403, + "learning_rate": 9.986859485102495e-05, + "loss": 1.0194, + "step": 7230 + }, + { + "epoch": 0.04625429641081993, + "grad_norm": 0.5516176819801331, + "learning_rate": 9.986823105836847e-05, + "loss": 0.8347, + "step": 7240 + }, + { + "epoch": 0.046318183560558626, + "grad_norm": 0.8812345862388611, + "learning_rate": 9.986786676349573e-05, + "loss": 1.0472, + "step": 7250 + }, + { + "epoch": 
0.04638207071029733, + "grad_norm": 1.0025354623794556, + "learning_rate": 9.986750196641047e-05, + "loss": 1.0196, + "step": 7260 + }, + { + "epoch": 0.04644595786003603, + "grad_norm": 1.2470890283584595, + "learning_rate": 9.986713666711629e-05, + "loss": 0.7237, + "step": 7270 + }, + { + "epoch": 0.04650984500977473, + "grad_norm": 0.7719841599464417, + "learning_rate": 9.986677086561691e-05, + "loss": 0.9012, + "step": 7280 + }, + { + "epoch": 0.04657373215951344, + "grad_norm": 0.5865141749382019, + "learning_rate": 9.9866404561916e-05, + "loss": 0.7885, + "step": 7290 + }, + { + "epoch": 0.046637619309252136, + "grad_norm": 0.8722718954086304, + "learning_rate": 9.986603775601728e-05, + "loss": 0.9654, + "step": 7300 + }, + { + "epoch": 0.046701506458990835, + "grad_norm": 0.9440786838531494, + "learning_rate": 9.98656704479244e-05, + "loss": 1.1144, + "step": 7310 + }, + { + "epoch": 0.04676539360872954, + "grad_norm": 0.8505666851997375, + "learning_rate": 9.986530263764108e-05, + "loss": 0.9502, + "step": 7320 + }, + { + "epoch": 0.04682928075846824, + "grad_norm": 0.7318026423454285, + "learning_rate": 9.986493432517103e-05, + "loss": 0.6851, + "step": 7330 + }, + { + "epoch": 0.04689316790820695, + "grad_norm": 1.4378130435943604, + "learning_rate": 9.986456551051795e-05, + "loss": 0.8454, + "step": 7340 + }, + { + "epoch": 0.046957055057945646, + "grad_norm": 0.9807822704315186, + "learning_rate": 9.986419619368554e-05, + "loss": 1.0638, + "step": 7350 + }, + { + "epoch": 0.047020942207684345, + "grad_norm": 1.2284691333770752, + "learning_rate": 9.986382637467757e-05, + "loss": 0.9615, + "step": 7360 + }, + { + "epoch": 0.04708482935742305, + "grad_norm": 0.7769535183906555, + "learning_rate": 9.986345605349769e-05, + "loss": 0.8708, + "step": 7370 + }, + { + "epoch": 0.04714871650716175, + "grad_norm": 1.48138427734375, + "learning_rate": 9.98630852301497e-05, + "loss": 0.7993, + "step": 7380 + }, + { + "epoch": 0.04721260365690045, + "grad_norm": 0.605939507484436, + "learning_rate": 9.986271390463728e-05, + "loss": 0.8898, + "step": 7390 + }, + { + "epoch": 0.047276490806639156, + "grad_norm": 0.7884547710418701, + "learning_rate": 9.986234207696421e-05, + "loss": 0.9975, + "step": 7400 + }, + { + "epoch": 0.047340377956377855, + "grad_norm": 0.9767579436302185, + "learning_rate": 9.986196974713422e-05, + "loss": 0.9493, + "step": 7410 + }, + { + "epoch": 0.047404265106116554, + "grad_norm": 0.9091633558273315, + "learning_rate": 9.986159691515105e-05, + "loss": 0.7876, + "step": 7420 + }, + { + "epoch": 0.04746815225585526, + "grad_norm": 0.6155557036399841, + "learning_rate": 9.986122358101847e-05, + "loss": 0.5978, + "step": 7430 + }, + { + "epoch": 0.04753203940559396, + "grad_norm": 0.8261324763298035, + "learning_rate": 9.986084974474024e-05, + "loss": 0.9533, + "step": 7440 + }, + { + "epoch": 0.04759592655533266, + "grad_norm": 0.5973717570304871, + "learning_rate": 9.98604754063201e-05, + "loss": 0.8045, + "step": 7450 + }, + { + "epoch": 0.047659813705071365, + "grad_norm": 1.0176916122436523, + "learning_rate": 9.986010056576184e-05, + "loss": 1.0215, + "step": 7460 + }, + { + "epoch": 0.047723700854810064, + "grad_norm": 0.5865172147750854, + "learning_rate": 9.985972522306923e-05, + "loss": 0.7648, + "step": 7470 + }, + { + "epoch": 0.04778758800454876, + "grad_norm": 1.0286486148834229, + "learning_rate": 9.985934937824605e-05, + "loss": 0.8718, + "step": 7480 + }, + { + "epoch": 0.04785147515428747, + "grad_norm": 1.0322641134262085, + "learning_rate": 
9.98589730312961e-05, + "loss": 0.9538, + "step": 7490 + }, + { + "epoch": 0.04791536230402617, + "grad_norm": 0.8804035782814026, + "learning_rate": 9.985859618222316e-05, + "loss": 0.7283, + "step": 7500 + }, + { + "epoch": 0.04797924945376487, + "grad_norm": 0.7622368931770325, + "learning_rate": 9.985821883103102e-05, + "loss": 0.7618, + "step": 7510 + }, + { + "epoch": 0.048043136603503574, + "grad_norm": 1.1401050090789795, + "learning_rate": 9.985784097772347e-05, + "loss": 1.0667, + "step": 7520 + }, + { + "epoch": 0.04810702375324227, + "grad_norm": 0.6780824661254883, + "learning_rate": 9.985746262230433e-05, + "loss": 0.9327, + "step": 7530 + }, + { + "epoch": 0.04817091090298097, + "grad_norm": 1.0564121007919312, + "learning_rate": 9.985708376477743e-05, + "loss": 0.857, + "step": 7540 + }, + { + "epoch": 0.04823479805271968, + "grad_norm": 0.45248645544052124, + "learning_rate": 9.985670440514654e-05, + "loss": 0.7797, + "step": 7550 + }, + { + "epoch": 0.04829868520245838, + "grad_norm": 0.9228289127349854, + "learning_rate": 9.985632454341551e-05, + "loss": 1.2661, + "step": 7560 + }, + { + "epoch": 0.04836257235219708, + "grad_norm": 0.665448784828186, + "learning_rate": 9.985594417958816e-05, + "loss": 0.8736, + "step": 7570 + }, + { + "epoch": 0.04842645950193578, + "grad_norm": 0.7093620896339417, + "learning_rate": 9.985556331366832e-05, + "loss": 0.9296, + "step": 7580 + }, + { + "epoch": 0.04849034665167448, + "grad_norm": 1.1496485471725464, + "learning_rate": 9.985518194565983e-05, + "loss": 1.0429, + "step": 7590 + }, + { + "epoch": 0.04855423380141318, + "grad_norm": 0.8305206298828125, + "learning_rate": 9.985480007556653e-05, + "loss": 0.9499, + "step": 7600 + }, + { + "epoch": 0.04861812095115189, + "grad_norm": 0.8451396822929382, + "learning_rate": 9.985441770339226e-05, + "loss": 0.9502, + "step": 7610 + }, + { + "epoch": 0.048682008100890586, + "grad_norm": 1.2433000802993774, + "learning_rate": 9.985403482914087e-05, + "loss": 0.6543, + "step": 7620 + }, + { + "epoch": 0.048745895250629286, + "grad_norm": 0.8674241304397583, + "learning_rate": 9.985365145281622e-05, + "loss": 1.1627, + "step": 7630 + }, + { + "epoch": 0.04880978240036799, + "grad_norm": 0.5980839133262634, + "learning_rate": 9.985326757442217e-05, + "loss": 1.1205, + "step": 7640 + }, + { + "epoch": 0.04887366955010669, + "grad_norm": 1.4166803359985352, + "learning_rate": 9.98528831939626e-05, + "loss": 0.8682, + "step": 7650 + }, + { + "epoch": 0.04893755669984539, + "grad_norm": 0.8415298461914062, + "learning_rate": 9.985249831144135e-05, + "loss": 0.9133, + "step": 7660 + }, + { + "epoch": 0.049001443849584096, + "grad_norm": 1.0600535869598389, + "learning_rate": 9.985211292686231e-05, + "loss": 0.9593, + "step": 7670 + }, + { + "epoch": 0.049065330999322795, + "grad_norm": 0.5692518353462219, + "learning_rate": 9.985172704022939e-05, + "loss": 1.1105, + "step": 7680 + }, + { + "epoch": 0.049129218149061495, + "grad_norm": 1.1608545780181885, + "learning_rate": 9.985134065154643e-05, + "loss": 0.9287, + "step": 7690 + }, + { + "epoch": 0.0491931052988002, + "grad_norm": 0.9091508984565735, + "learning_rate": 9.985095376081734e-05, + "loss": 0.8312, + "step": 7700 + }, + { + "epoch": 0.0492569924485389, + "grad_norm": 0.8366988897323608, + "learning_rate": 9.985056636804604e-05, + "loss": 1.0451, + "step": 7710 + }, + { + "epoch": 0.0493208795982776, + "grad_norm": 1.0978457927703857, + "learning_rate": 9.98501784732364e-05, + "loss": 0.8821, + "step": 7720 + }, + { + "epoch": 
0.049384766748016305, + "grad_norm": 1.7002284526824951, + "learning_rate": 9.984979007639233e-05, + "loss": 0.7092, + "step": 7730 + }, + { + "epoch": 0.049448653897755004, + "grad_norm": 1.77642023563385, + "learning_rate": 9.984940117751773e-05, + "loss": 1.0623, + "step": 7740 + }, + { + "epoch": 0.04951254104749371, + "grad_norm": 0.800308883190155, + "learning_rate": 9.984901177661656e-05, + "loss": 1.3445, + "step": 7750 + }, + { + "epoch": 0.04957642819723241, + "grad_norm": 0.9408762454986572, + "learning_rate": 9.98486218736927e-05, + "loss": 0.8466, + "step": 7760 + }, + { + "epoch": 0.04964031534697111, + "grad_norm": 0.7024977207183838, + "learning_rate": 9.98482314687501e-05, + "loss": 0.7186, + "step": 7770 + }, + { + "epoch": 0.049704202496709815, + "grad_norm": 0.7420535087585449, + "learning_rate": 9.98478405617927e-05, + "loss": 1.0408, + "step": 7780 + }, + { + "epoch": 0.049768089646448514, + "grad_norm": 1.0378546714782715, + "learning_rate": 9.98474491528244e-05, + "loss": 0.8885, + "step": 7790 + }, + { + "epoch": 0.04983197679618721, + "grad_norm": 1.380505919456482, + "learning_rate": 9.984705724184917e-05, + "loss": 1.113, + "step": 7800 + }, + { + "epoch": 0.04989586394592592, + "grad_norm": 1.8946232795715332, + "learning_rate": 9.984666482887096e-05, + "loss": 0.8355, + "step": 7810 + }, + { + "epoch": 0.04995975109566462, + "grad_norm": 1.4878778457641602, + "learning_rate": 9.98462719138937e-05, + "loss": 0.992, + "step": 7820 + }, + { + "epoch": 0.05002363824540332, + "grad_norm": 0.7730852365493774, + "learning_rate": 9.984587849692136e-05, + "loss": 0.7539, + "step": 7830 + }, + { + "epoch": 0.050087525395142024, + "grad_norm": 0.83015376329422, + "learning_rate": 9.984548457795791e-05, + "loss": 0.8696, + "step": 7840 + }, + { + "epoch": 0.05015141254488072, + "grad_norm": 0.7511310577392578, + "learning_rate": 9.98450901570073e-05, + "loss": 0.9013, + "step": 7850 + }, + { + "epoch": 0.05021529969461942, + "grad_norm": 0.9059261679649353, + "learning_rate": 9.984469523407349e-05, + "loss": 0.8444, + "step": 7860 + }, + { + "epoch": 0.05027918684435813, + "grad_norm": 0.9825949668884277, + "learning_rate": 9.98442998091605e-05, + "loss": 0.8864, + "step": 7870 + }, + { + "epoch": 0.05034307399409683, + "grad_norm": 0.904929518699646, + "learning_rate": 9.984390388227228e-05, + "loss": 0.7628, + "step": 7880 + }, + { + "epoch": 0.05040696114383553, + "grad_norm": 0.736785888671875, + "learning_rate": 9.984350745341284e-05, + "loss": 0.6913, + "step": 7890 + }, + { + "epoch": 0.05047084829357423, + "grad_norm": 0.7877079248428345, + "learning_rate": 9.984311052258615e-05, + "loss": 1.2899, + "step": 7900 + }, + { + "epoch": 0.05053473544331293, + "grad_norm": 3.8321728706359863, + "learning_rate": 9.984271308979622e-05, + "loss": 0.9465, + "step": 7910 + }, + { + "epoch": 0.05059862259305163, + "grad_norm": 0.729813277721405, + "learning_rate": 9.984231515504705e-05, + "loss": 1.1176, + "step": 7920 + }, + { + "epoch": 0.05066250974279034, + "grad_norm": 1.07712984085083, + "learning_rate": 9.984191671834264e-05, + "loss": 0.821, + "step": 7930 + }, + { + "epoch": 0.05072639689252904, + "grad_norm": 0.6421816349029541, + "learning_rate": 9.984151777968701e-05, + "loss": 0.8634, + "step": 7940 + }, + { + "epoch": 0.050790284042267736, + "grad_norm": 1.0871955156326294, + "learning_rate": 9.984111833908419e-05, + "loss": 0.9175, + "step": 7950 + }, + { + "epoch": 0.05085417119200644, + "grad_norm": 0.9562147855758667, + "learning_rate": 
9.984071839653817e-05, + "loss": 0.8648, + "step": 7960 + }, + { + "epoch": 0.05091805834174514, + "grad_norm": 0.8465697169303894, + "learning_rate": 9.9840317952053e-05, + "loss": 0.7018, + "step": 7970 + }, + { + "epoch": 0.05098194549148384, + "grad_norm": 0.4053485095500946, + "learning_rate": 9.983991700563273e-05, + "loss": 0.8683, + "step": 7980 + }, + { + "epoch": 0.05104583264122255, + "grad_norm": 0.7025613188743591, + "learning_rate": 9.983951555728135e-05, + "loss": 0.9431, + "step": 7990 + }, + { + "epoch": 0.051109719790961246, + "grad_norm": 0.7401816248893738, + "learning_rate": 9.983911360700296e-05, + "loss": 1.1364, + "step": 8000 + }, + { + "epoch": 0.051173606940699945, + "grad_norm": 0.41972461342811584, + "learning_rate": 9.983871115480155e-05, + "loss": 0.9925, + "step": 8010 + }, + { + "epoch": 0.05123749409043865, + "grad_norm": 0.577347457408905, + "learning_rate": 9.983830820068123e-05, + "loss": 0.7687, + "step": 8020 + }, + { + "epoch": 0.05130138124017735, + "grad_norm": 0.8155549764633179, + "learning_rate": 9.983790474464601e-05, + "loss": 0.9115, + "step": 8030 + }, + { + "epoch": 0.05136526838991605, + "grad_norm": 0.9730279445648193, + "learning_rate": 9.983750078669998e-05, + "loss": 1.1313, + "step": 8040 + }, + { + "epoch": 0.051429155539654756, + "grad_norm": 0.8205385208129883, + "learning_rate": 9.98370963268472e-05, + "loss": 0.9971, + "step": 8050 + }, + { + "epoch": 0.051493042689393455, + "grad_norm": 0.5464890599250793, + "learning_rate": 9.983669136509175e-05, + "loss": 0.7868, + "step": 8060 + }, + { + "epoch": 0.051556929839132154, + "grad_norm": 1.3623446226119995, + "learning_rate": 9.98362859014377e-05, + "loss": 0.9343, + "step": 8070 + }, + { + "epoch": 0.05162081698887086, + "grad_norm": 0.8901773691177368, + "learning_rate": 9.983587993588914e-05, + "loss": 0.7135, + "step": 8080 + }, + { + "epoch": 0.05168470413860956, + "grad_norm": 0.7160339951515198, + "learning_rate": 9.983547346845015e-05, + "loss": 1.2925, + "step": 8090 + }, + { + "epoch": 0.05174859128834826, + "grad_norm": 0.6623441576957703, + "learning_rate": 9.983506649912482e-05, + "loss": 0.923, + "step": 8100 + }, + { + "epoch": 0.051812478438086965, + "grad_norm": 0.469149112701416, + "learning_rate": 9.983465902791726e-05, + "loss": 0.94, + "step": 8110 + }, + { + "epoch": 0.051876365587825664, + "grad_norm": 0.5665640234947205, + "learning_rate": 9.98342510548316e-05, + "loss": 1.0792, + "step": 8120 + }, + { + "epoch": 0.05194025273756436, + "grad_norm": 1.4578264951705933, + "learning_rate": 9.983384257987189e-05, + "loss": 1.0587, + "step": 8130 + }, + { + "epoch": 0.05200413988730307, + "grad_norm": 0.8157141804695129, + "learning_rate": 9.983343360304227e-05, + "loss": 1.2347, + "step": 8140 + }, + { + "epoch": 0.05206802703704177, + "grad_norm": 0.9772050976753235, + "learning_rate": 9.983302412434688e-05, + "loss": 1.1827, + "step": 8150 + }, + { + "epoch": 0.05213191418678047, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.983261414378982e-05, + "loss": 0.8998, + "step": 8160 + }, + { + "epoch": 0.052195801336519174, + "grad_norm": 1.1656652688980103, + "learning_rate": 9.983220366137522e-05, + "loss": 0.9351, + "step": 8170 + }, + { + "epoch": 0.05225968848625787, + "grad_norm": 1.1599071025848389, + "learning_rate": 9.983179267710721e-05, + "loss": 0.8263, + "step": 8180 + }, + { + "epoch": 0.05232357563599658, + "grad_norm": 1.0255035161972046, + "learning_rate": 9.983138119098993e-05, + "loss": 1.1271, + "step": 8190 + }, + { + "epoch": 
0.05238746278573528, + "grad_norm": 0.7129418849945068, + "learning_rate": 9.983096920302755e-05, + "loss": 0.7378, + "step": 8200 + }, + { + "epoch": 0.05245134993547398, + "grad_norm": 1.268712043762207, + "learning_rate": 9.983055671322421e-05, + "loss": 0.7541, + "step": 8210 + }, + { + "epoch": 0.05251523708521268, + "grad_norm": 0.8625295758247375, + "learning_rate": 9.983014372158403e-05, + "loss": 0.7441, + "step": 8220 + }, + { + "epoch": 0.05257912423495138, + "grad_norm": 3.3473124504089355, + "learning_rate": 9.982973022811122e-05, + "loss": 1.0331, + "step": 8230 + }, + { + "epoch": 0.05264301138469008, + "grad_norm": 1.0163989067077637, + "learning_rate": 9.982931623280989e-05, + "loss": 0.7206, + "step": 8240 + }, + { + "epoch": 0.05270689853442879, + "grad_norm": 0.6889442205429077, + "learning_rate": 9.982890173568426e-05, + "loss": 0.7788, + "step": 8250 + }, + { + "epoch": 0.05277078568416749, + "grad_norm": 0.7175592184066772, + "learning_rate": 9.982848673673846e-05, + "loss": 0.847, + "step": 8260 + }, + { + "epoch": 0.052834672833906186, + "grad_norm": 1.019832730293274, + "learning_rate": 9.98280712359767e-05, + "loss": 0.8542, + "step": 8270 + }, + { + "epoch": 0.05289855998364489, + "grad_norm": 0.9718403220176697, + "learning_rate": 9.982765523340316e-05, + "loss": 1.0609, + "step": 8280 + }, + { + "epoch": 0.05296244713338359, + "grad_norm": 2.7732856273651123, + "learning_rate": 9.982723872902202e-05, + "loss": 0.9938, + "step": 8290 + }, + { + "epoch": 0.05302633428312229, + "grad_norm": 0.6997831463813782, + "learning_rate": 9.982682172283748e-05, + "loss": 0.7695, + "step": 8300 + }, + { + "epoch": 0.053090221432861, + "grad_norm": 1.2381385564804077, + "learning_rate": 9.982640421485374e-05, + "loss": 0.8043, + "step": 8310 + }, + { + "epoch": 0.053154108582599696, + "grad_norm": 1.2460087537765503, + "learning_rate": 9.9825986205075e-05, + "loss": 0.9461, + "step": 8320 + }, + { + "epoch": 0.053217995732338395, + "grad_norm": 0.7866740822792053, + "learning_rate": 9.982556769350549e-05, + "loss": 0.88, + "step": 8330 + }, + { + "epoch": 0.0532818828820771, + "grad_norm": 1.1013973951339722, + "learning_rate": 9.982514868014938e-05, + "loss": 0.7032, + "step": 8340 + }, + { + "epoch": 0.0533457700318158, + "grad_norm": 0.7456531524658203, + "learning_rate": 9.982472916501093e-05, + "loss": 0.8763, + "step": 8350 + }, + { + "epoch": 0.0534096571815545, + "grad_norm": 0.6022664904594421, + "learning_rate": 9.982430914809437e-05, + "loss": 1.0766, + "step": 8360 + }, + { + "epoch": 0.053473544331293206, + "grad_norm": 0.6867753267288208, + "learning_rate": 9.982388862940389e-05, + "loss": 0.8823, + "step": 8370 + }, + { + "epoch": 0.053537431481031905, + "grad_norm": 1.045599102973938, + "learning_rate": 9.982346760894375e-05, + "loss": 1.0784, + "step": 8380 + }, + { + "epoch": 0.053601318630770604, + "grad_norm": 1.3521573543548584, + "learning_rate": 9.982304608671819e-05, + "loss": 1.1522, + "step": 8390 + }, + { + "epoch": 0.05366520578050931, + "grad_norm": 0.6618836522102356, + "learning_rate": 9.982262406273146e-05, + "loss": 0.863, + "step": 8400 + }, + { + "epoch": 0.05372909293024801, + "grad_norm": 0.6689035892486572, + "learning_rate": 9.98222015369878e-05, + "loss": 0.9005, + "step": 8410 + }, + { + "epoch": 0.05379298007998671, + "grad_norm": 1.0590460300445557, + "learning_rate": 9.982177850949147e-05, + "loss": 1.0022, + "step": 8420 + }, + { + "epoch": 0.053856867229725415, + "grad_norm": 0.6324277520179749, + "learning_rate": 
9.982135498024673e-05, + "loss": 0.7492, + "step": 8430 + }, + { + "epoch": 0.053920754379464114, + "grad_norm": 0.5392162203788757, + "learning_rate": 9.982093094925784e-05, + "loss": 0.991, + "step": 8440 + }, + { + "epoch": 0.05398464152920281, + "grad_norm": 0.6738571524620056, + "learning_rate": 9.982050641652908e-05, + "loss": 1.0112, + "step": 8450 + }, + { + "epoch": 0.05404852867894152, + "grad_norm": 0.8277943730354309, + "learning_rate": 9.98200813820647e-05, + "loss": 0.6247, + "step": 8460 + }, + { + "epoch": 0.05411241582868022, + "grad_norm": 1.3968684673309326, + "learning_rate": 9.981965584586901e-05, + "loss": 1.0051, + "step": 8470 + }, + { + "epoch": 0.05417630297841892, + "grad_norm": 1.391640543937683, + "learning_rate": 9.981922980794629e-05, + "loss": 0.9332, + "step": 8480 + }, + { + "epoch": 0.054240190128157624, + "grad_norm": 2.0874507427215576, + "learning_rate": 9.981880326830083e-05, + "loss": 1.135, + "step": 8490 + }, + { + "epoch": 0.05430407727789632, + "grad_norm": 1.9418469667434692, + "learning_rate": 9.981837622693692e-05, + "loss": 0.8689, + "step": 8500 + }, + { + "epoch": 0.05436796442763502, + "grad_norm": 0.9285494089126587, + "learning_rate": 9.981794868385886e-05, + "loss": 0.8521, + "step": 8510 + }, + { + "epoch": 0.05443185157737373, + "grad_norm": 1.062789797782898, + "learning_rate": 9.981752063907096e-05, + "loss": 1.0655, + "step": 8520 + }, + { + "epoch": 0.05449573872711243, + "grad_norm": 0.6997897624969482, + "learning_rate": 9.981709209257752e-05, + "loss": 0.9636, + "step": 8530 + }, + { + "epoch": 0.05455962587685113, + "grad_norm": 0.8409900665283203, + "learning_rate": 9.981666304438286e-05, + "loss": 0.9073, + "step": 8540 + }, + { + "epoch": 0.05462351302658983, + "grad_norm": 0.7529276013374329, + "learning_rate": 9.981623349449131e-05, + "loss": 0.695, + "step": 8550 + }, + { + "epoch": 0.05468740017632853, + "grad_norm": 0.6798946261405945, + "learning_rate": 9.981580344290722e-05, + "loss": 0.9083, + "step": 8560 + }, + { + "epoch": 0.05475128732606723, + "grad_norm": 0.537013828754425, + "learning_rate": 9.981537288963487e-05, + "loss": 0.9872, + "step": 8570 + }, + { + "epoch": 0.05481517447580594, + "grad_norm": 0.9144914150238037, + "learning_rate": 9.981494183467861e-05, + "loss": 0.9987, + "step": 8580 + }, + { + "epoch": 0.05487906162554464, + "grad_norm": 1.6605632305145264, + "learning_rate": 9.98145102780428e-05, + "loss": 0.9811, + "step": 8590 + }, + { + "epoch": 0.05494294877528334, + "grad_norm": 0.8611153960227966, + "learning_rate": 9.981407821973176e-05, + "loss": 1.0801, + "step": 8600 + }, + { + "epoch": 0.05500683592502204, + "grad_norm": 0.9995184540748596, + "learning_rate": 9.981364565974988e-05, + "loss": 0.9886, + "step": 8610 + }, + { + "epoch": 0.05507072307476074, + "grad_norm": 1.9788289070129395, + "learning_rate": 9.981321259810149e-05, + "loss": 0.8339, + "step": 8620 + }, + { + "epoch": 0.05513461022449945, + "grad_norm": 0.6516178250312805, + "learning_rate": 9.981277903479095e-05, + "loss": 0.87, + "step": 8630 + }, + { + "epoch": 0.05519849737423815, + "grad_norm": 0.6122477054595947, + "learning_rate": 9.981234496982262e-05, + "loss": 0.9143, + "step": 8640 + }, + { + "epoch": 0.055262384523976846, + "grad_norm": 0.6674822568893433, + "learning_rate": 9.98119104032009e-05, + "loss": 0.9851, + "step": 8650 + }, + { + "epoch": 0.05532627167371555, + "grad_norm": 0.7896667122840881, + "learning_rate": 9.981147533493013e-05, + "loss": 0.9507, + "step": 8660 + }, + { + "epoch": 
0.05539015882345425, + "grad_norm": 0.5288309454917908, + "learning_rate": 9.981103976501474e-05, + "loss": 0.8592, + "step": 8670 + }, + { + "epoch": 0.05545404597319295, + "grad_norm": 1.2801772356033325, + "learning_rate": 9.981060369345905e-05, + "loss": 0.799, + "step": 8680 + }, + { + "epoch": 0.055517933122931656, + "grad_norm": 1.178462266921997, + "learning_rate": 9.981016712026752e-05, + "loss": 0.8998, + "step": 8690 + }, + { + "epoch": 0.055581820272670356, + "grad_norm": 0.5843381285667419, + "learning_rate": 9.98097300454445e-05, + "loss": 0.9922, + "step": 8700 + }, + { + "epoch": 0.055645707422409055, + "grad_norm": 1.011044979095459, + "learning_rate": 9.980929246899441e-05, + "loss": 0.8379, + "step": 8710 + }, + { + "epoch": 0.05570959457214776, + "grad_norm": 0.9214301109313965, + "learning_rate": 9.980885439092165e-05, + "loss": 0.6383, + "step": 8720 + }, + { + "epoch": 0.05577348172188646, + "grad_norm": 0.6694325804710388, + "learning_rate": 9.980841581123064e-05, + "loss": 1.1735, + "step": 8730 + }, + { + "epoch": 0.05583736887162516, + "grad_norm": 0.6404210329055786, + "learning_rate": 9.98079767299258e-05, + "loss": 0.7791, + "step": 8740 + }, + { + "epoch": 0.055901256021363865, + "grad_norm": 1.0678333044052124, + "learning_rate": 9.980753714701152e-05, + "loss": 0.8481, + "step": 8750 + }, + { + "epoch": 0.055965143171102565, + "grad_norm": 2.0661401748657227, + "learning_rate": 9.980709706249227e-05, + "loss": 1.0899, + "step": 8760 + }, + { + "epoch": 0.056029030320841264, + "grad_norm": 1.161922812461853, + "learning_rate": 9.980665647637246e-05, + "loss": 0.7383, + "step": 8770 + }, + { + "epoch": 0.05609291747057997, + "grad_norm": 0.5117482542991638, + "learning_rate": 9.980621538865654e-05, + "loss": 0.9479, + "step": 8780 + }, + { + "epoch": 0.05615680462031867, + "grad_norm": 1.124691367149353, + "learning_rate": 9.980581798085118e-05, + "loss": 1.0286, + "step": 8790 + }, + { + "epoch": 0.05622069177005737, + "grad_norm": 0.9648489952087402, + "learning_rate": 9.980537594011486e-05, + "loss": 0.7825, + "step": 8800 + }, + { + "epoch": 0.056284578919796074, + "grad_norm": 0.9334906339645386, + "learning_rate": 9.980493339779533e-05, + "loss": 0.8359, + "step": 8810 + }, + { + "epoch": 0.056348466069534774, + "grad_norm": 0.8304029107093811, + "learning_rate": 9.980449035389702e-05, + "loss": 0.7827, + "step": 8820 + }, + { + "epoch": 0.05641235321927347, + "grad_norm": 0.8497231006622314, + "learning_rate": 9.980404680842441e-05, + "loss": 1.0369, + "step": 8830 + }, + { + "epoch": 0.05647624036901218, + "grad_norm": 0.7895182371139526, + "learning_rate": 9.980360276138196e-05, + "loss": 0.8317, + "step": 8840 + }, + { + "epoch": 0.05654012751875088, + "grad_norm": 2.521169900894165, + "learning_rate": 9.980315821277415e-05, + "loss": 1.1953, + "step": 8850 + }, + { + "epoch": 0.05660401466848958, + "grad_norm": 0.8677668571472168, + "learning_rate": 9.980271316260544e-05, + "loss": 0.6768, + "step": 8860 + }, + { + "epoch": 0.05666790181822828, + "grad_norm": 0.6117026805877686, + "learning_rate": 9.980226761088033e-05, + "loss": 0.8991, + "step": 8870 + }, + { + "epoch": 0.05673178896796698, + "grad_norm": 0.5636959075927734, + "learning_rate": 9.98018215576033e-05, + "loss": 0.9742, + "step": 8880 + }, + { + "epoch": 0.05679567611770568, + "grad_norm": 1.0202407836914062, + "learning_rate": 9.980137500277885e-05, + "loss": 0.8069, + "step": 8890 + }, + { + "epoch": 0.05685956326744439, + "grad_norm": 0.7063365578651428, + "learning_rate": 
9.980092794641144e-05, + "loss": 0.7919, + "step": 8900 + }, + { + "epoch": 0.05692345041718309, + "grad_norm": 0.6419750452041626, + "learning_rate": 9.980048038850564e-05, + "loss": 1.0765, + "step": 8910 + }, + { + "epoch": 0.056987337566921786, + "grad_norm": 0.8232806921005249, + "learning_rate": 9.98000323290659e-05, + "loss": 0.9938, + "step": 8920 + }, + { + "epoch": 0.05705122471666049, + "grad_norm": 0.846300482749939, + "learning_rate": 9.979958376809675e-05, + "loss": 0.9364, + "step": 8930 + }, + { + "epoch": 0.05711511186639919, + "grad_norm": 0.9861621856689453, + "learning_rate": 9.979913470560271e-05, + "loss": 0.9568, + "step": 8940 + }, + { + "epoch": 0.05717899901613789, + "grad_norm": 1.035204529762268, + "learning_rate": 9.97986851415883e-05, + "loss": 1.1155, + "step": 8950 + }, + { + "epoch": 0.0572428861658766, + "grad_norm": 0.901535153388977, + "learning_rate": 9.979823507605806e-05, + "loss": 0.9014, + "step": 8960 + }, + { + "epoch": 0.057306773315615296, + "grad_norm": 0.7851259708404541, + "learning_rate": 9.97977845090165e-05, + "loss": 0.9278, + "step": 8970 + }, + { + "epoch": 0.057370660465353995, + "grad_norm": 0.8578255772590637, + "learning_rate": 9.979733344046818e-05, + "loss": 1.0668, + "step": 8980 + }, + { + "epoch": 0.0574345476150927, + "grad_norm": 0.5631706714630127, + "learning_rate": 9.979688187041761e-05, + "loss": 0.7958, + "step": 8990 + }, + { + "epoch": 0.0574984347648314, + "grad_norm": 0.9356205463409424, + "learning_rate": 9.979642979886938e-05, + "loss": 0.9709, + "step": 9000 + }, + { + "epoch": 0.05756232191457011, + "grad_norm": 1.1016316413879395, + "learning_rate": 9.979597722582801e-05, + "loss": 1.0941, + "step": 9010 + }, + { + "epoch": 0.057626209064308806, + "grad_norm": 0.7269836068153381, + "learning_rate": 9.979552415129806e-05, + "loss": 0.8328, + "step": 9020 + }, + { + "epoch": 0.057690096214047505, + "grad_norm": 1.0700838565826416, + "learning_rate": 9.979507057528412e-05, + "loss": 1.0288, + "step": 9030 + }, + { + "epoch": 0.05775398336378621, + "grad_norm": 0.9521405100822449, + "learning_rate": 9.979461649779074e-05, + "loss": 0.8238, + "step": 9040 + }, + { + "epoch": 0.05781787051352491, + "grad_norm": 1.3190817832946777, + "learning_rate": 9.97941619188225e-05, + "loss": 0.9142, + "step": 9050 + }, + { + "epoch": 0.05788175766326361, + "grad_norm": 0.7254020571708679, + "learning_rate": 9.979370683838396e-05, + "loss": 0.952, + "step": 9060 + }, + { + "epoch": 0.057945644813002316, + "grad_norm": 0.6510186195373535, + "learning_rate": 9.979325125647972e-05, + "loss": 0.9684, + "step": 9070 + }, + { + "epoch": 0.058009531962741015, + "grad_norm": 0.8847187757492065, + "learning_rate": 9.979279517311435e-05, + "loss": 0.867, + "step": 9080 + }, + { + "epoch": 0.058073419112479714, + "grad_norm": 1.1535148620605469, + "learning_rate": 9.979233858829246e-05, + "loss": 1.1381, + "step": 9090 + }, + { + "epoch": 0.05813730626221842, + "grad_norm": 0.8919582366943359, + "learning_rate": 9.979188150201866e-05, + "loss": 0.9911, + "step": 9100 + }, + { + "epoch": 0.05820119341195712, + "grad_norm": 0.8089918494224548, + "learning_rate": 9.979142391429753e-05, + "loss": 1.0435, + "step": 9110 + }, + { + "epoch": 0.05826508056169582, + "grad_norm": 1.1607420444488525, + "learning_rate": 9.979096582513366e-05, + "loss": 0.8656, + "step": 9120 + }, + { + "epoch": 0.058328967711434525, + "grad_norm": 0.8984375596046448, + "learning_rate": 9.979050723453171e-05, + "loss": 0.7627, + "step": 9130 + }, + { + "epoch": 
0.058392854861173224, + "grad_norm": 1.1916580200195312, + "learning_rate": 9.979004814249629e-05, + "loss": 0.8041, + "step": 9140 + }, + { + "epoch": 0.05845674201091192, + "grad_norm": 1.0592631101608276, + "learning_rate": 9.978958854903198e-05, + "loss": 0.8423, + "step": 9150 + }, + { + "epoch": 0.05852062916065063, + "grad_norm": 0.8369486331939697, + "learning_rate": 9.978912845414347e-05, + "loss": 0.7743, + "step": 9160 + }, + { + "epoch": 0.05858451631038933, + "grad_norm": 0.8720589280128479, + "learning_rate": 9.978866785783533e-05, + "loss": 0.8537, + "step": 9170 + }, + { + "epoch": 0.05864840346012803, + "grad_norm": 1.644795298576355, + "learning_rate": 9.978820676011227e-05, + "loss": 0.7972, + "step": 9180 + }, + { + "epoch": 0.058712290609866734, + "grad_norm": 2.7408289909362793, + "learning_rate": 9.978774516097886e-05, + "loss": 1.3147, + "step": 9190 + }, + { + "epoch": 0.05877617775960543, + "grad_norm": 0.5846157670021057, + "learning_rate": 9.97872830604398e-05, + "loss": 0.948, + "step": 9200 + }, + { + "epoch": 0.05884006490934413, + "grad_norm": 0.5981711149215698, + "learning_rate": 9.978682045849975e-05, + "loss": 0.9317, + "step": 9210 + }, + { + "epoch": 0.05890395205908284, + "grad_norm": 0.8747972249984741, + "learning_rate": 9.97863573551633e-05, + "loss": 1.0936, + "step": 9220 + }, + { + "epoch": 0.05896783920882154, + "grad_norm": 0.8239877223968506, + "learning_rate": 9.978589375043519e-05, + "loss": 0.9512, + "step": 9230 + }, + { + "epoch": 0.05903172635856024, + "grad_norm": 0.48801517486572266, + "learning_rate": 9.978542964432005e-05, + "loss": 0.8562, + "step": 9240 + }, + { + "epoch": 0.05909561350829894, + "grad_norm": 2.152211904525757, + "learning_rate": 9.978496503682258e-05, + "loss": 1.0353, + "step": 9250 + }, + { + "epoch": 0.05915950065803764, + "grad_norm": 0.9108942747116089, + "learning_rate": 9.978449992794742e-05, + "loss": 0.9188, + "step": 9260 + }, + { + "epoch": 0.05922338780777634, + "grad_norm": 0.41772526502609253, + "learning_rate": 9.978403431769927e-05, + "loss": 0.8177, + "step": 9270 + }, + { + "epoch": 0.05928727495751505, + "grad_norm": 0.5694353580474854, + "learning_rate": 9.978356820608284e-05, + "loss": 0.8956, + "step": 9280 + }, + { + "epoch": 0.059351162107253747, + "grad_norm": 0.620496928691864, + "learning_rate": 9.978310159310282e-05, + "loss": 0.631, + "step": 9290 + }, + { + "epoch": 0.059415049256992446, + "grad_norm": 3.017289638519287, + "learning_rate": 9.978263447876388e-05, + "loss": 0.7887, + "step": 9300 + }, + { + "epoch": 0.05947893640673115, + "grad_norm": 1.065492868423462, + "learning_rate": 9.978216686307075e-05, + "loss": 0.8404, + "step": 9310 + }, + { + "epoch": 0.05954282355646985, + "grad_norm": 0.5826980471611023, + "learning_rate": 9.978169874602813e-05, + "loss": 0.8956, + "step": 9320 + }, + { + "epoch": 0.05960671070620855, + "grad_norm": 0.9797850847244263, + "learning_rate": 9.978123012764074e-05, + "loss": 1.0606, + "step": 9330 + }, + { + "epoch": 0.059670597855947256, + "grad_norm": 0.6139065027236938, + "learning_rate": 9.97807610079133e-05, + "loss": 0.9635, + "step": 9340 + }, + { + "epoch": 0.059734485005685956, + "grad_norm": 0.7059098482131958, + "learning_rate": 9.978029138685052e-05, + "loss": 0.861, + "step": 9350 + }, + { + "epoch": 0.059798372155424655, + "grad_norm": 0.8045505285263062, + "learning_rate": 9.977982126445712e-05, + "loss": 0.923, + "step": 9360 + }, + { + "epoch": 0.05986225930516336, + "grad_norm": NaN, + "learning_rate": 9.977939772566934e-05, + 
"loss": 1.1034, + "step": 9370 + }, + { + "epoch": 0.05992614645490206, + "grad_norm": 0.8647670745849609, + "learning_rate": 9.977892665076088e-05, + "loss": 0.9245, + "step": 9380 + }, + { + "epoch": 0.05999003360464076, + "grad_norm": 1.1494901180267334, + "learning_rate": 9.977845507453554e-05, + "loss": 0.9954, + "step": 9390 + }, + { + "epoch": 0.060053920754379465, + "grad_norm": 0.6723154783248901, + "learning_rate": 9.977798299699811e-05, + "loss": 0.7105, + "step": 9400 + }, + { + "epoch": 0.060117807904118165, + "grad_norm": 1.0932044982910156, + "learning_rate": 9.977751041815333e-05, + "loss": 0.994, + "step": 9410 + }, + { + "epoch": 0.06018169505385687, + "grad_norm": 1.036632776260376, + "learning_rate": 9.977703733800594e-05, + "loss": 0.9975, + "step": 9420 + }, + { + "epoch": 0.06024558220359557, + "grad_norm": 0.9420500993728638, + "learning_rate": 9.977656375656072e-05, + "loss": 0.8054, + "step": 9430 + }, + { + "epoch": 0.06030946935333427, + "grad_norm": 0.8750972747802734, + "learning_rate": 9.977608967382246e-05, + "loss": 1.233, + "step": 9440 + }, + { + "epoch": 0.060373356503072975, + "grad_norm": 2.4813504219055176, + "learning_rate": 9.977561508979591e-05, + "loss": 1.0237, + "step": 9450 + }, + { + "epoch": 0.060437243652811674, + "grad_norm": 0.8663612604141235, + "learning_rate": 9.977514000448584e-05, + "loss": 0.9739, + "step": 9460 + }, + { + "epoch": 0.060501130802550374, + "grad_norm": 0.5622289776802063, + "learning_rate": 9.977466441789707e-05, + "loss": 0.6195, + "step": 9470 + }, + { + "epoch": 0.06056501795228908, + "grad_norm": 0.7465640902519226, + "learning_rate": 9.977418833003436e-05, + "loss": 0.6977, + "step": 9480 + }, + { + "epoch": 0.06062890510202778, + "grad_norm": 0.8643200993537903, + "learning_rate": 9.97737117409025e-05, + "loss": 1.1056, + "step": 9490 + }, + { + "epoch": 0.06069279225176648, + "grad_norm": 0.8004162311553955, + "learning_rate": 9.977323465050631e-05, + "loss": 0.8349, + "step": 9500 + }, + { + "epoch": 0.060756679401505184, + "grad_norm": 0.7937789559364319, + "learning_rate": 9.977275705885058e-05, + "loss": 1.0755, + "step": 9510 + }, + { + "epoch": 0.06082056655124388, + "grad_norm": 0.7888356447219849, + "learning_rate": 9.977227896594014e-05, + "loss": 1.242, + "step": 9520 + }, + { + "epoch": 0.06088445370098258, + "grad_norm": 0.6252095103263855, + "learning_rate": 9.977180037177979e-05, + "loss": 1.1968, + "step": 9530 + }, + { + "epoch": 0.06094834085072129, + "grad_norm": 0.6318356990814209, + "learning_rate": 9.977132127637434e-05, + "loss": 1.0921, + "step": 9540 + }, + { + "epoch": 0.06101222800045999, + "grad_norm": 0.6336533427238464, + "learning_rate": 9.977084167972863e-05, + "loss": 0.7744, + "step": 9550 + }, + { + "epoch": 0.06107611515019869, + "grad_norm": 0.7241688966751099, + "learning_rate": 9.97703615818475e-05, + "loss": 0.7843, + "step": 9560 + }, + { + "epoch": 0.06114000229993739, + "grad_norm": 1.5715322494506836, + "learning_rate": 9.976988098273576e-05, + "loss": 1.1104, + "step": 9570 + }, + { + "epoch": 0.06120388944967609, + "grad_norm": 0.5444793105125427, + "learning_rate": 9.976939988239826e-05, + "loss": 0.9894, + "step": 9580 + }, + { + "epoch": 0.06126777659941479, + "grad_norm": 0.785284698009491, + "learning_rate": 9.976891828083985e-05, + "loss": 0.9782, + "step": 9590 + }, + { + "epoch": 0.0613316637491535, + "grad_norm": 1.1315600872039795, + "learning_rate": 9.976843617806538e-05, + "loss": 0.9443, + "step": 9600 + }, + { + "epoch": 0.0613955508988922, + 
"grad_norm": 5.169201850891113, + "learning_rate": 9.97679535740797e-05, + "loss": 0.9371, + "step": 9610 + }, + { + "epoch": 0.061459438048630896, + "grad_norm": 0.6818580031394958, + "learning_rate": 9.976747046888767e-05, + "loss": 0.8102, + "step": 9620 + }, + { + "epoch": 0.0615233251983696, + "grad_norm": 0.8099622130393982, + "learning_rate": 9.976698686249416e-05, + "loss": 1.0892, + "step": 9630 + }, + { + "epoch": 0.0615872123481083, + "grad_norm": 0.8413625955581665, + "learning_rate": 9.976650275490404e-05, + "loss": 0.9822, + "step": 9640 + }, + { + "epoch": 0.061651099497847, + "grad_norm": 1.4564588069915771, + "learning_rate": 9.976601814612217e-05, + "loss": 1.1034, + "step": 9650 + }, + { + "epoch": 0.06171498664758571, + "grad_norm": 0.897906482219696, + "learning_rate": 9.976553303615346e-05, + "loss": 0.9956, + "step": 9660 + }, + { + "epoch": 0.061778873797324406, + "grad_norm": 0.5349118113517761, + "learning_rate": 9.976504742500277e-05, + "loss": 0.9361, + "step": 9670 + }, + { + "epoch": 0.061842760947063105, + "grad_norm": 0.8950748443603516, + "learning_rate": 9.9764561312675e-05, + "loss": 0.8486, + "step": 9680 + }, + { + "epoch": 0.06190664809680181, + "grad_norm": 1.0401899814605713, + "learning_rate": 9.976407469917504e-05, + "loss": 0.855, + "step": 9690 + }, + { + "epoch": 0.06197053524654051, + "grad_norm": 1.0119845867156982, + "learning_rate": 9.976358758450781e-05, + "loss": 0.8117, + "step": 9700 + }, + { + "epoch": 0.06203442239627921, + "grad_norm": 1.028308629989624, + "learning_rate": 9.976309996867819e-05, + "loss": 1.0832, + "step": 9710 + }, + { + "epoch": 0.062098309546017916, + "grad_norm": 0.6654931902885437, + "learning_rate": 9.976261185169111e-05, + "loss": 0.9543, + "step": 9720 + }, + { + "epoch": 0.062162196695756615, + "grad_norm": 0.7170969843864441, + "learning_rate": 9.976212323355148e-05, + "loss": 0.7589, + "step": 9730 + }, + { + "epoch": 0.062226083845495314, + "grad_norm": 0.7951648831367493, + "learning_rate": 9.97616341142642e-05, + "loss": 0.7643, + "step": 9740 + }, + { + "epoch": 0.06228997099523402, + "grad_norm": 0.8642029166221619, + "learning_rate": 9.976114449383422e-05, + "loss": 0.7792, + "step": 9750 + }, + { + "epoch": 0.06235385814497272, + "grad_norm": 0.7159494757652283, + "learning_rate": 9.976065437226648e-05, + "loss": 0.7695, + "step": 9760 + }, + { + "epoch": 0.06241774529471142, + "grad_norm": 0.8568373918533325, + "learning_rate": 9.976016374956589e-05, + "loss": 0.9835, + "step": 9770 + }, + { + "epoch": 0.062481632444450125, + "grad_norm": 0.7960609793663025, + "learning_rate": 9.97596726257374e-05, + "loss": 0.7966, + "step": 9780 + }, + { + "epoch": 0.06254551959418883, + "grad_norm": 0.9307446479797363, + "learning_rate": 9.975918100078598e-05, + "loss": 0.9899, + "step": 9790 + }, + { + "epoch": 0.06260940674392752, + "grad_norm": 0.6713595986366272, + "learning_rate": 9.975868887471654e-05, + "loss": 0.9225, + "step": 9800 + }, + { + "epoch": 0.06267329389366623, + "grad_norm": 0.8115236163139343, + "learning_rate": 9.975819624753405e-05, + "loss": 1.0275, + "step": 9810 + }, + { + "epoch": 0.06273718104340494, + "grad_norm": 1.7901041507720947, + "learning_rate": 9.975770311924348e-05, + "loss": 1.0027, + "step": 9820 + }, + { + "epoch": 0.06280106819314363, + "grad_norm": 1.7000713348388672, + "learning_rate": 9.975720948984981e-05, + "loss": 0.7821, + "step": 9830 + }, + { + "epoch": 0.06286495534288233, + "grad_norm": 0.7414657473564148, + "learning_rate": 9.975671535935797e-05, + "loss": 
1.1558, + "step": 9840 + }, + { + "epoch": 0.06292884249262104, + "grad_norm": 1.6867907047271729, + "learning_rate": 9.975622072777299e-05, + "loss": 0.8346, + "step": 9850 + }, + { + "epoch": 0.06299272964235973, + "grad_norm": 0.8193734884262085, + "learning_rate": 9.97557255950998e-05, + "loss": 1.0878, + "step": 9860 + }, + { + "epoch": 0.06305661679209844, + "grad_norm": 0.8677065968513489, + "learning_rate": 9.975522996134341e-05, + "loss": 1.0119, + "step": 9870 + }, + { + "epoch": 0.06312050394183714, + "grad_norm": 0.9119147658348083, + "learning_rate": 9.975473382650882e-05, + "loss": 0.9826, + "step": 9880 + }, + { + "epoch": 0.06318439109157584, + "grad_norm": 0.5988776683807373, + "learning_rate": 9.9754237190601e-05, + "loss": 0.9762, + "step": 9890 + }, + { + "epoch": 0.06324827824131454, + "grad_norm": 0.7673864364624023, + "learning_rate": 9.9753740053625e-05, + "loss": 1.0638, + "step": 9900 + }, + { + "epoch": 0.06331216539105325, + "grad_norm": 0.7230051159858704, + "learning_rate": 9.975324241558577e-05, + "loss": 1.005, + "step": 9910 + }, + { + "epoch": 0.06337605254079194, + "grad_norm": 0.7979596257209778, + "learning_rate": 9.975274427648834e-05, + "loss": 0.816, + "step": 9920 + }, + { + "epoch": 0.06343993969053065, + "grad_norm": 0.8814641237258911, + "learning_rate": 9.975224563633774e-05, + "loss": 0.808, + "step": 9930 + }, + { + "epoch": 0.06350382684026935, + "grad_norm": 1.0135507583618164, + "learning_rate": 9.975174649513899e-05, + "loss": 0.6825, + "step": 9940 + }, + { + "epoch": 0.06356771399000805, + "grad_norm": 2.055793046951294, + "learning_rate": 9.97512468528971e-05, + "loss": 0.9164, + "step": 9950 + }, + { + "epoch": 0.06363160113974675, + "grad_norm": 1.3944244384765625, + "learning_rate": 9.975074670961712e-05, + "loss": 0.6302, + "step": 9960 + }, + { + "epoch": 0.06369548828948546, + "grad_norm": 0.828702986240387, + "learning_rate": 9.97502460653041e-05, + "loss": 0.8697, + "step": 9970 + }, + { + "epoch": 0.06375937543922415, + "grad_norm": 0.6198043823242188, + "learning_rate": 9.974974491996303e-05, + "loss": 1.019, + "step": 9980 + }, + { + "epoch": 0.06382326258896286, + "grad_norm": 1.0051112174987793, + "learning_rate": 9.9749243273599e-05, + "loss": 0.8931, + "step": 9990 + }, + { + "epoch": 0.06388714973870156, + "grad_norm": 0.7894333004951477, + "learning_rate": 9.974874112621706e-05, + "loss": 0.729, + "step": 10000 + }, + { + "epoch": 0.06395103688844025, + "grad_norm": 1.0666780471801758, + "learning_rate": 9.974823847782226e-05, + "loss": 0.8405, + "step": 10010 + }, + { + "epoch": 0.06401492403817896, + "grad_norm": 0.8409984111785889, + "learning_rate": 9.974773532841965e-05, + "loss": 0.7593, + "step": 10020 + }, + { + "epoch": 0.06407881118791767, + "grad_norm": 0.7679229974746704, + "learning_rate": 9.97472316780143e-05, + "loss": 0.9023, + "step": 10030 + }, + { + "epoch": 0.06414269833765636, + "grad_norm": 0.893464207649231, + "learning_rate": 9.97467275266113e-05, + "loss": 0.9048, + "step": 10040 + }, + { + "epoch": 0.06420658548739507, + "grad_norm": 0.8160121440887451, + "learning_rate": 9.974622287421571e-05, + "loss": 0.7204, + "step": 10050 + }, + { + "epoch": 0.06427047263713377, + "grad_norm": 1.0811116695404053, + "learning_rate": 9.974571772083264e-05, + "loss": 1.0378, + "step": 10060 + }, + { + "epoch": 0.06433435978687246, + "grad_norm": 2.037599802017212, + "learning_rate": 9.974521206646714e-05, + "loss": 1.205, + "step": 10070 + }, + { + "epoch": 0.06439824693661117, + "grad_norm": 
1.153348445892334, + "learning_rate": 9.974470591112431e-05, + "loss": 1.1017, + "step": 10080 + }, + { + "epoch": 0.06446213408634988, + "grad_norm": 0.8410546183586121, + "learning_rate": 9.974419925480927e-05, + "loss": 0.9647, + "step": 10090 + }, + { + "epoch": 0.06452602123608857, + "grad_norm": 1.0550462007522583, + "learning_rate": 9.97436920975271e-05, + "loss": 0.6909, + "step": 10100 + }, + { + "epoch": 0.06458990838582727, + "grad_norm": 0.7067312598228455, + "learning_rate": 9.974318443928292e-05, + "loss": 1.0198, + "step": 10110 + }, + { + "epoch": 0.06465379553556598, + "grad_norm": 0.9904884696006775, + "learning_rate": 9.974267628008184e-05, + "loss": 0.853, + "step": 10120 + }, + { + "epoch": 0.06471768268530467, + "grad_norm": 0.9248889684677124, + "learning_rate": 9.974216761992899e-05, + "loss": 0.8722, + "step": 10130 + }, + { + "epoch": 0.06478156983504338, + "grad_norm": 0.9939360618591309, + "learning_rate": 9.974165845882946e-05, + "loss": 0.7184, + "step": 10140 + }, + { + "epoch": 0.06484545698478208, + "grad_norm": 0.7473933696746826, + "learning_rate": 9.97411487967884e-05, + "loss": 1.1064, + "step": 10150 + }, + { + "epoch": 0.06490934413452078, + "grad_norm": 0.6957441568374634, + "learning_rate": 9.974063863381093e-05, + "loss": 1.0598, + "step": 10160 + }, + { + "epoch": 0.06497323128425948, + "grad_norm": 0.5153073072433472, + "learning_rate": 9.974012796990222e-05, + "loss": 1.0821, + "step": 10170 + }, + { + "epoch": 0.06503711843399819, + "grad_norm": 0.6289156675338745, + "learning_rate": 9.973961680506736e-05, + "loss": 0.7954, + "step": 10180 + }, + { + "epoch": 0.06510100558373688, + "grad_norm": 0.8114803433418274, + "learning_rate": 9.973910513931155e-05, + "loss": 1.0314, + "step": 10190 + }, + { + "epoch": 0.06516489273347559, + "grad_norm": 0.9270540475845337, + "learning_rate": 9.973859297263992e-05, + "loss": 1.0626, + "step": 10200 + }, + { + "epoch": 0.0652287798832143, + "grad_norm": 0.7939660549163818, + "learning_rate": 9.973808030505762e-05, + "loss": 1.0844, + "step": 10210 + }, + { + "epoch": 0.06529266703295299, + "grad_norm": 0.7727285027503967, + "learning_rate": 9.973756713656983e-05, + "loss": 1.1614, + "step": 10220 + }, + { + "epoch": 0.06535655418269169, + "grad_norm": 0.628436803817749, + "learning_rate": 9.973705346718172e-05, + "loss": 1.0243, + "step": 10230 + }, + { + "epoch": 0.0654204413324304, + "grad_norm": 0.6849284172058105, + "learning_rate": 9.973653929689843e-05, + "loss": 0.9389, + "step": 10240 + }, + { + "epoch": 0.06548432848216909, + "grad_norm": 1.1843525171279907, + "learning_rate": 9.973602462572517e-05, + "loss": 1.1492, + "step": 10250 + }, + { + "epoch": 0.0655482156319078, + "grad_norm": 0.8269469141960144, + "learning_rate": 9.973550945366713e-05, + "loss": 1.2698, + "step": 10260 + }, + { + "epoch": 0.0656121027816465, + "grad_norm": 1.4048844575881958, + "learning_rate": 9.973499378072945e-05, + "loss": 0.9471, + "step": 10270 + }, + { + "epoch": 0.0656759899313852, + "grad_norm": 0.4660175144672394, + "learning_rate": 9.973447760691738e-05, + "loss": 1.0006, + "step": 10280 + }, + { + "epoch": 0.0657398770811239, + "grad_norm": 0.5896976590156555, + "learning_rate": 9.973396093223609e-05, + "loss": 0.9568, + "step": 10290 + }, + { + "epoch": 0.06580376423086261, + "grad_norm": 0.8697670102119446, + "learning_rate": 9.973344375669078e-05, + "loss": 1.0002, + "step": 10300 + }, + { + "epoch": 0.0658676513806013, + "grad_norm": 0.6355106234550476, + "learning_rate": 9.973292608028667e-05, + 
"loss": 0.847, + "step": 10310 + }, + { + "epoch": 0.06593153853034, + "grad_norm": 0.8912832736968994, + "learning_rate": 9.973240790302898e-05, + "loss": 0.9665, + "step": 10320 + }, + { + "epoch": 0.06599542568007871, + "grad_norm": 0.662343442440033, + "learning_rate": 9.97318892249229e-05, + "loss": 0.8517, + "step": 10330 + }, + { + "epoch": 0.0660593128298174, + "grad_norm": 0.7244953513145447, + "learning_rate": 9.973137004597368e-05, + "loss": 0.8731, + "step": 10340 + }, + { + "epoch": 0.06612319997955611, + "grad_norm": 0.9315572381019592, + "learning_rate": 9.973085036618655e-05, + "loss": 0.8918, + "step": 10350 + }, + { + "epoch": 0.06618708712929482, + "grad_norm": 0.7289429306983948, + "learning_rate": 9.973033018556671e-05, + "loss": 0.8263, + "step": 10360 + }, + { + "epoch": 0.06625097427903351, + "grad_norm": 1.0025968551635742, + "learning_rate": 9.972980950411944e-05, + "loss": 0.8438, + "step": 10370 + }, + { + "epoch": 0.06631486142877221, + "grad_norm": 1.0387686491012573, + "learning_rate": 9.972928832184996e-05, + "loss": 1.0417, + "step": 10380 + }, + { + "epoch": 0.06637874857851092, + "grad_norm": 1.1046053171157837, + "learning_rate": 9.972876663876352e-05, + "loss": 1.0033, + "step": 10390 + }, + { + "epoch": 0.06644263572824961, + "grad_norm": 0.7507469058036804, + "learning_rate": 9.972824445486539e-05, + "loss": 0.7265, + "step": 10400 + }, + { + "epoch": 0.06650652287798832, + "grad_norm": 0.8238981366157532, + "learning_rate": 9.972772177016081e-05, + "loss": 1.0, + "step": 10410 + }, + { + "epoch": 0.06657041002772703, + "grad_norm": 0.9973478317260742, + "learning_rate": 9.972719858465504e-05, + "loss": 1.1175, + "step": 10420 + }, + { + "epoch": 0.06663429717746572, + "grad_norm": 1.0181374549865723, + "learning_rate": 9.972667489835338e-05, + "loss": 0.9529, + "step": 10430 + }, + { + "epoch": 0.06669818432720442, + "grad_norm": 0.5428194403648376, + "learning_rate": 9.972615071126108e-05, + "loss": 0.6749, + "step": 10440 + }, + { + "epoch": 0.06676207147694313, + "grad_norm": 1.1994624137878418, + "learning_rate": 9.972562602338341e-05, + "loss": 0.8246, + "step": 10450 + }, + { + "epoch": 0.06682595862668182, + "grad_norm": 1.502936601638794, + "learning_rate": 9.972510083472569e-05, + "loss": 0.9699, + "step": 10460 + }, + { + "epoch": 0.06688984577642053, + "grad_norm": 0.9399340748786926, + "learning_rate": 9.972457514529316e-05, + "loss": 0.8597, + "step": 10470 + }, + { + "epoch": 0.06695373292615923, + "grad_norm": 1.0776817798614502, + "learning_rate": 9.972404895509116e-05, + "loss": 1.0443, + "step": 10480 + }, + { + "epoch": 0.06701762007589794, + "grad_norm": 1.5870468616485596, + "learning_rate": 9.972352226412495e-05, + "loss": 1.1327, + "step": 10490 + }, + { + "epoch": 0.06708150722563663, + "grad_norm": 0.8504364490509033, + "learning_rate": 9.972299507239988e-05, + "loss": 0.9158, + "step": 10500 + }, + { + "epoch": 0.06714539437537534, + "grad_norm": 0.7087526321411133, + "learning_rate": 9.972246737992122e-05, + "loss": 0.9687, + "step": 10510 + }, + { + "epoch": 0.06720928152511405, + "grad_norm": 0.9799100756645203, + "learning_rate": 9.972193918669429e-05, + "loss": 1.1421, + "step": 10520 + }, + { + "epoch": 0.06727316867485274, + "grad_norm": 0.6044210195541382, + "learning_rate": 9.972141049272444e-05, + "loss": 1.1096, + "step": 10530 + }, + { + "epoch": 0.06733705582459144, + "grad_norm": 0.8850777745246887, + "learning_rate": 9.972088129801693e-05, + "loss": 0.8467, + "step": 10540 + }, + { + "epoch": 
0.06740094297433015, + "grad_norm": 0.8483796715736389, + "learning_rate": 9.972035160257717e-05, + "loss": 0.9819, + "step": 10550 + }, + { + "epoch": 0.06746483012406884, + "grad_norm": 1.1407147645950317, + "learning_rate": 9.971982140641043e-05, + "loss": 0.9107, + "step": 10560 + }, + { + "epoch": 0.06752871727380755, + "grad_norm": 0.834553599357605, + "learning_rate": 9.971929070952209e-05, + "loss": 1.1262, + "step": 10570 + }, + { + "epoch": 0.06759260442354625, + "grad_norm": 1.0828417539596558, + "learning_rate": 9.971875951191747e-05, + "loss": 0.9017, + "step": 10580 + }, + { + "epoch": 0.06765649157328495, + "grad_norm": 0.5860454440116882, + "learning_rate": 9.971822781360194e-05, + "loss": 0.7191, + "step": 10590 + }, + { + "epoch": 0.06772037872302365, + "grad_norm": 0.767382025718689, + "learning_rate": 9.971769561458084e-05, + "loss": 1.048, + "step": 10600 + }, + { + "epoch": 0.06778426587276236, + "grad_norm": 0.6914779543876648, + "learning_rate": 9.971716291485953e-05, + "loss": 0.949, + "step": 10610 + }, + { + "epoch": 0.06784815302250105, + "grad_norm": 1.306636929512024, + "learning_rate": 9.971662971444338e-05, + "loss": 0.8191, + "step": 10620 + }, + { + "epoch": 0.06791204017223976, + "grad_norm": 1.0141420364379883, + "learning_rate": 9.971609601333776e-05, + "loss": 0.9747, + "step": 10630 + }, + { + "epoch": 0.06797592732197846, + "grad_norm": 0.7582118511199951, + "learning_rate": 9.971556181154802e-05, + "loss": 0.7757, + "step": 10640 + }, + { + "epoch": 0.06803981447171716, + "grad_norm": 0.6744197010993958, + "learning_rate": 9.971502710907958e-05, + "loss": 0.7907, + "step": 10650 + }, + { + "epoch": 0.06810370162145586, + "grad_norm": 1.1960172653198242, + "learning_rate": 9.971449190593782e-05, + "loss": 0.9023, + "step": 10660 + }, + { + "epoch": 0.06816758877119457, + "grad_norm": 1.0107911825180054, + "learning_rate": 9.971395620212811e-05, + "loss": 0.918, + "step": 10670 + }, + { + "epoch": 0.06823147592093326, + "grad_norm": 0.6501746773719788, + "learning_rate": 9.971341999765585e-05, + "loss": 0.9352, + "step": 10680 + }, + { + "epoch": 0.06829536307067197, + "grad_norm": 0.9184291362762451, + "learning_rate": 9.971288329252644e-05, + "loss": 1.1747, + "step": 10690 + }, + { + "epoch": 0.06835925022041067, + "grad_norm": 0.5910547971725464, + "learning_rate": 9.971234608674529e-05, + "loss": 0.7598, + "step": 10700 + }, + { + "epoch": 0.06842313737014936, + "grad_norm": 0.8851799964904785, + "learning_rate": 9.97118083803178e-05, + "loss": 0.9643, + "step": 10710 + }, + { + "epoch": 0.06848702451988807, + "grad_norm": 0.6597937941551208, + "learning_rate": 9.97112701732494e-05, + "loss": 0.9897, + "step": 10720 + }, + { + "epoch": 0.06855091166962678, + "grad_norm": 0.6581412553787231, + "learning_rate": 9.97107314655455e-05, + "loss": 1.1446, + "step": 10730 + }, + { + "epoch": 0.06861479881936547, + "grad_norm": 0.5868738293647766, + "learning_rate": 9.971019225721153e-05, + "loss": 1.0789, + "step": 10740 + }, + { + "epoch": 0.06867868596910418, + "grad_norm": 0.6730684041976929, + "learning_rate": 9.970965254825292e-05, + "loss": 0.9802, + "step": 10750 + }, + { + "epoch": 0.06874257311884288, + "grad_norm": 0.8661940097808838, + "learning_rate": 9.970911233867511e-05, + "loss": 1.0777, + "step": 10760 + }, + { + "epoch": 0.06880646026858157, + "grad_norm": 1.0571337938308716, + "learning_rate": 9.970857162848352e-05, + "loss": 1.0175, + "step": 10770 + }, + { + "epoch": 0.06887034741832028, + "grad_norm": 1.2184176445007324, + 
"learning_rate": 9.970803041768362e-05, + "loss": 0.9196, + "step": 10780 + }, + { + "epoch": 0.06893423456805899, + "grad_norm": 0.6517652869224548, + "learning_rate": 9.970748870628083e-05, + "loss": 0.9498, + "step": 10790 + }, + { + "epoch": 0.06899812171779768, + "grad_norm": 1.2037395238876343, + "learning_rate": 9.970694649428065e-05, + "loss": 0.785, + "step": 10800 + }, + { + "epoch": 0.06906200886753638, + "grad_norm": 0.8196636438369751, + "learning_rate": 9.97064037816885e-05, + "loss": 0.9136, + "step": 10810 + }, + { + "epoch": 0.06912589601727509, + "grad_norm": 0.9403445720672607, + "learning_rate": 9.970586056850988e-05, + "loss": 0.847, + "step": 10820 + }, + { + "epoch": 0.06918978316701378, + "grad_norm": 0.5096237659454346, + "learning_rate": 9.970531685475024e-05, + "loss": 0.8693, + "step": 10830 + }, + { + "epoch": 0.06925367031675249, + "grad_norm": 0.5676767230033875, + "learning_rate": 9.970477264041505e-05, + "loss": 0.9367, + "step": 10840 + }, + { + "epoch": 0.0693175574664912, + "grad_norm": 0.9769662618637085, + "learning_rate": 9.970422792550978e-05, + "loss": 0.9091, + "step": 10850 + }, + { + "epoch": 0.06938144461622989, + "grad_norm": 0.6873984932899475, + "learning_rate": 9.970368271003995e-05, + "loss": 0.9392, + "step": 10860 + }, + { + "epoch": 0.0694453317659686, + "grad_norm": 1.1281991004943848, + "learning_rate": 9.970313699401104e-05, + "loss": 0.8311, + "step": 10870 + }, + { + "epoch": 0.0695092189157073, + "grad_norm": 0.8184236288070679, + "learning_rate": 9.970259077742855e-05, + "loss": 0.7781, + "step": 10880 + }, + { + "epoch": 0.06957310606544599, + "grad_norm": 0.7411293983459473, + "learning_rate": 9.970204406029796e-05, + "loss": 0.8319, + "step": 10890 + }, + { + "epoch": 0.0696369932151847, + "grad_norm": 0.8405719995498657, + "learning_rate": 9.97014968426248e-05, + "loss": 1.1157, + "step": 10900 + }, + { + "epoch": 0.0697008803649234, + "grad_norm": 0.8236634731292725, + "learning_rate": 9.970094912441454e-05, + "loss": 0.8209, + "step": 10910 + }, + { + "epoch": 0.0697647675146621, + "grad_norm": 0.7503064870834351, + "learning_rate": 9.970040090567275e-05, + "loss": 1.1207, + "step": 10920 + }, + { + "epoch": 0.0698286546644008, + "grad_norm": 1.037656545639038, + "learning_rate": 9.969985218640492e-05, + "loss": 1.0938, + "step": 10930 + }, + { + "epoch": 0.06989254181413951, + "grad_norm": 2.2834203243255615, + "learning_rate": 9.969930296661658e-05, + "loss": 1.0299, + "step": 10940 + }, + { + "epoch": 0.0699564289638782, + "grad_norm": 0.47441643476486206, + "learning_rate": 9.969875324631327e-05, + "loss": 0.8998, + "step": 10950 + }, + { + "epoch": 0.0700203161136169, + "grad_norm": 0.8986606597900391, + "learning_rate": 9.969820302550051e-05, + "loss": 0.8735, + "step": 10960 + }, + { + "epoch": 0.07008420326335561, + "grad_norm": 0.6057919263839722, + "learning_rate": 9.969765230418386e-05, + "loss": 0.8311, + "step": 10970 + }, + { + "epoch": 0.0701480904130943, + "grad_norm": 0.9726822972297668, + "learning_rate": 9.969710108236885e-05, + "loss": 1.0337, + "step": 10980 + }, + { + "epoch": 0.07021197756283301, + "grad_norm": 0.875328779220581, + "learning_rate": 9.969654936006102e-05, + "loss": 0.978, + "step": 10990 + }, + { + "epoch": 0.07027586471257172, + "grad_norm": 1.4699301719665527, + "learning_rate": 9.969599713726599e-05, + "loss": 0.709, + "step": 11000 + }, + { + "epoch": 0.07033975186231041, + "grad_norm": 0.9150874614715576, + "learning_rate": 9.969544441398924e-05, + "loss": 0.7534, + "step": 11010 + 
}, + { + "epoch": 0.07040363901204912, + "grad_norm": 0.9999013543128967, + "learning_rate": 9.969489119023638e-05, + "loss": 1.0469, + "step": 11020 + }, + { + "epoch": 0.07046752616178782, + "grad_norm": 1.0596497058868408, + "learning_rate": 9.969433746601298e-05, + "loss": 0.8638, + "step": 11030 + }, + { + "epoch": 0.07053141331152651, + "grad_norm": 0.5560715198516846, + "learning_rate": 9.96937832413246e-05, + "loss": 0.8614, + "step": 11040 + }, + { + "epoch": 0.07059530046126522, + "grad_norm": 0.7285141944885254, + "learning_rate": 9.969322851617684e-05, + "loss": 0.7894, + "step": 11050 + }, + { + "epoch": 0.07065918761100393, + "grad_norm": 0.8218443393707275, + "learning_rate": 9.969267329057526e-05, + "loss": 1.116, + "step": 11060 + }, + { + "epoch": 0.07072307476074262, + "grad_norm": 0.7729995250701904, + "learning_rate": 9.96921175645255e-05, + "loss": 0.9044, + "step": 11070 + }, + { + "epoch": 0.07078696191048132, + "grad_norm": 0.719794511795044, + "learning_rate": 9.96915613380331e-05, + "loss": 1.182, + "step": 11080 + }, + { + "epoch": 0.07085084906022003, + "grad_norm": 0.9527838230133057, + "learning_rate": 9.96910046111037e-05, + "loss": 1.0074, + "step": 11090 + }, + { + "epoch": 0.07091473620995872, + "grad_norm": 0.7101008892059326, + "learning_rate": 9.969044738374289e-05, + "loss": 1.0559, + "step": 11100 + }, + { + "epoch": 0.07097862335969743, + "grad_norm": 0.4492223560810089, + "learning_rate": 9.968988965595629e-05, + "loss": 0.727, + "step": 11110 + }, + { + "epoch": 0.07104251050943614, + "grad_norm": 0.6947804093360901, + "learning_rate": 9.968933142774952e-05, + "loss": 0.9424, + "step": 11120 + }, + { + "epoch": 0.07110639765917483, + "grad_norm": 1.0676300525665283, + "learning_rate": 9.968877269912819e-05, + "loss": 0.7982, + "step": 11130 + }, + { + "epoch": 0.07117028480891353, + "grad_norm": 0.7446919679641724, + "learning_rate": 9.968821347009792e-05, + "loss": 0.9773, + "step": 11140 + }, + { + "epoch": 0.07123417195865224, + "grad_norm": 1.2251659631729126, + "learning_rate": 9.968765374066437e-05, + "loss": 0.8226, + "step": 11150 + }, + { + "epoch": 0.07129805910839093, + "grad_norm": 0.635826051235199, + "learning_rate": 9.968709351083315e-05, + "loss": 0.7913, + "step": 11160 + }, + { + "epoch": 0.07136194625812964, + "grad_norm": 1.43468177318573, + "learning_rate": 9.968653278060992e-05, + "loss": 0.7686, + "step": 11170 + }, + { + "epoch": 0.07142583340786834, + "grad_norm": 6.540151596069336, + "learning_rate": 9.968597155000033e-05, + "loss": 0.8114, + "step": 11180 + }, + { + "epoch": 0.07148972055760704, + "grad_norm": 1.3150196075439453, + "learning_rate": 9.968540981901e-05, + "loss": 0.6353, + "step": 11190 + }, + { + "epoch": 0.07155360770734574, + "grad_norm": 0.5050914883613586, + "learning_rate": 9.968484758764462e-05, + "loss": 0.6865, + "step": 11200 + }, + { + "epoch": 0.07161749485708445, + "grad_norm": 0.9180815815925598, + "learning_rate": 9.968428485590983e-05, + "loss": 0.9142, + "step": 11210 + }, + { + "epoch": 0.07168138200682314, + "grad_norm": 1.4517556428909302, + "learning_rate": 9.968372162381133e-05, + "loss": 0.7999, + "step": 11220 + }, + { + "epoch": 0.07174526915656185, + "grad_norm": 0.6034737229347229, + "learning_rate": 9.968315789135475e-05, + "loss": 1.23, + "step": 11230 + }, + { + "epoch": 0.07180915630630055, + "grad_norm": 0.9869849681854248, + "learning_rate": 9.96825936585458e-05, + "loss": 1.0731, + "step": 11240 + }, + { + "epoch": 0.07187304345603925, + "grad_norm": 0.6998457908630371, + 
"learning_rate": 9.968202892539014e-05, + "loss": 1.1126, + "step": 11250 + }, + { + "epoch": 0.07193693060577795, + "grad_norm": 0.7587766647338867, + "learning_rate": 9.968146369189349e-05, + "loss": 0.9376, + "step": 11260 + }, + { + "epoch": 0.07200081775551666, + "grad_norm": 0.9407736659049988, + "learning_rate": 9.96808979580615e-05, + "loss": 0.7904, + "step": 11270 + }, + { + "epoch": 0.07206470490525535, + "grad_norm": 1.7557258605957031, + "learning_rate": 9.968033172389989e-05, + "loss": 0.8119, + "step": 11280 + }, + { + "epoch": 0.07212859205499406, + "grad_norm": 0.6084944605827332, + "learning_rate": 9.967976498941436e-05, + "loss": 0.7708, + "step": 11290 + }, + { + "epoch": 0.07219247920473276, + "grad_norm": 0.7556819915771484, + "learning_rate": 9.967919775461063e-05, + "loss": 0.7996, + "step": 11300 + }, + { + "epoch": 0.07225636635447147, + "grad_norm": 0.7954988479614258, + "learning_rate": 9.967863001949438e-05, + "loss": 1.1191, + "step": 11310 + }, + { + "epoch": 0.07232025350421016, + "grad_norm": 0.7278555631637573, + "learning_rate": 9.967806178407135e-05, + "loss": 0.8343, + "step": 11320 + }, + { + "epoch": 0.07238414065394887, + "grad_norm": 0.7036782503128052, + "learning_rate": 9.967749304834728e-05, + "loss": 0.8194, + "step": 11330 + }, + { + "epoch": 0.07244802780368757, + "grad_norm": 1.3781989812850952, + "learning_rate": 9.967692381232786e-05, + "loss": 0.8285, + "step": 11340 + }, + { + "epoch": 0.07251191495342627, + "grad_norm": 0.885075569152832, + "learning_rate": 9.967635407601886e-05, + "loss": 1.0042, + "step": 11350 + }, + { + "epoch": 0.07257580210316497, + "grad_norm": 0.6959792375564575, + "learning_rate": 9.967578383942597e-05, + "loss": 0.8172, + "step": 11360 + }, + { + "epoch": 0.07263968925290368, + "grad_norm": 1.800525188446045, + "learning_rate": 9.967521310255498e-05, + "loss": 0.8708, + "step": 11370 + }, + { + "epoch": 0.07270357640264237, + "grad_norm": 0.6853658556938171, + "learning_rate": 9.96746418654116e-05, + "loss": 0.8518, + "step": 11380 + }, + { + "epoch": 0.07276746355238108, + "grad_norm": 0.7943517565727234, + "learning_rate": 9.967407012800163e-05, + "loss": 0.7797, + "step": 11390 + }, + { + "epoch": 0.07283135070211978, + "grad_norm": 0.7777195572853088, + "learning_rate": 9.967349789033078e-05, + "loss": 0.7811, + "step": 11400 + }, + { + "epoch": 0.07289523785185847, + "grad_norm": 0.9152284860610962, + "learning_rate": 9.967292515240486e-05, + "loss": 0.7322, + "step": 11410 + }, + { + "epoch": 0.07295912500159718, + "grad_norm": 1.2940709590911865, + "learning_rate": 9.967235191422957e-05, + "loss": 0.7784, + "step": 11420 + }, + { + "epoch": 0.07302301215133589, + "grad_norm": 1.4273176193237305, + "learning_rate": 9.967177817581075e-05, + "loss": 1.334, + "step": 11430 + }, + { + "epoch": 0.07308689930107458, + "grad_norm": 0.9415301084518433, + "learning_rate": 9.967120393715414e-05, + "loss": 1.33, + "step": 11440 + }, + { + "epoch": 0.07315078645081328, + "grad_norm": 1.6769905090332031, + "learning_rate": 9.967062919826552e-05, + "loss": 0.8804, + "step": 11450 + }, + { + "epoch": 0.07321467360055199, + "grad_norm": 0.8233237266540527, + "learning_rate": 9.967005395915072e-05, + "loss": 0.9747, + "step": 11460 + }, + { + "epoch": 0.07327856075029068, + "grad_norm": 0.793849527835846, + "learning_rate": 9.966947821981551e-05, + "loss": 0.736, + "step": 11470 + }, + { + "epoch": 0.07334244790002939, + "grad_norm": 0.8288117051124573, + "learning_rate": 9.966890198026566e-05, + "loss": 0.9165, + "step": 
11480 + }, + { + "epoch": 0.0734063350497681, + "grad_norm": 0.7047694325447083, + "learning_rate": 9.966832524050702e-05, + "loss": 0.8662, + "step": 11490 + }, + { + "epoch": 0.07347022219950679, + "grad_norm": 0.6443949937820435, + "learning_rate": 9.966774800054535e-05, + "loss": 1.0167, + "step": 11500 + }, + { + "epoch": 0.0735341093492455, + "grad_norm": 0.6362110376358032, + "learning_rate": 9.966717026038651e-05, + "loss": 1.0175, + "step": 11510 + }, + { + "epoch": 0.0735979964989842, + "grad_norm": 0.7651115655899048, + "learning_rate": 9.96665920200363e-05, + "loss": 0.914, + "step": 11520 + }, + { + "epoch": 0.07366188364872289, + "grad_norm": 0.7375466823577881, + "learning_rate": 9.966601327950052e-05, + "loss": 0.9936, + "step": 11530 + }, + { + "epoch": 0.0737257707984616, + "grad_norm": 0.7288793325424194, + "learning_rate": 9.966543403878503e-05, + "loss": 1.1943, + "step": 11540 + }, + { + "epoch": 0.0737896579482003, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.966485429789565e-05, + "loss": 1.0228, + "step": 11550 + }, + { + "epoch": 0.073853545097939, + "grad_norm": 1.1143486499786377, + "learning_rate": 9.966427405683823e-05, + "loss": 0.8327, + "step": 11560 + }, + { + "epoch": 0.0739174322476777, + "grad_norm": 0.9701015949249268, + "learning_rate": 9.96636933156186e-05, + "loss": 0.8488, + "step": 11570 + }, + { + "epoch": 0.07398131939741641, + "grad_norm": 0.8440617322921753, + "learning_rate": 9.966311207424261e-05, + "loss": 1.1248, + "step": 11580 + }, + { + "epoch": 0.0740452065471551, + "grad_norm": 1.1028122901916504, + "learning_rate": 9.96625303327161e-05, + "loss": 0.941, + "step": 11590 + }, + { + "epoch": 0.07410909369689381, + "grad_norm": 0.8367504477500916, + "learning_rate": 9.966194809104498e-05, + "loss": 1.0069, + "step": 11600 + }, + { + "epoch": 0.07417298084663251, + "grad_norm": 0.6582353115081787, + "learning_rate": 9.966136534923507e-05, + "loss": 1.0914, + "step": 11610 + }, + { + "epoch": 0.0742368679963712, + "grad_norm": 0.720551609992981, + "learning_rate": 9.966078210729224e-05, + "loss": 0.8932, + "step": 11620 + }, + { + "epoch": 0.07430075514610991, + "grad_norm": 1.5726115703582764, + "learning_rate": 9.966019836522235e-05, + "loss": 0.666, + "step": 11630 + }, + { + "epoch": 0.07436464229584862, + "grad_norm": 0.8888491988182068, + "learning_rate": 9.965961412303133e-05, + "loss": 0.8511, + "step": 11640 + }, + { + "epoch": 0.07442852944558731, + "grad_norm": 0.9958298206329346, + "learning_rate": 9.965902938072503e-05, + "loss": 0.8403, + "step": 11650 + }, + { + "epoch": 0.07449241659532602, + "grad_norm": 0.9258823394775391, + "learning_rate": 9.965844413830934e-05, + "loss": 0.9406, + "step": 11660 + }, + { + "epoch": 0.07455630374506472, + "grad_norm": 0.6303139328956604, + "learning_rate": 9.965785839579016e-05, + "loss": 0.8162, + "step": 11670 + }, + { + "epoch": 0.07462019089480341, + "grad_norm": 0.8224695920944214, + "learning_rate": 9.965727215317338e-05, + "loss": 0.8578, + "step": 11680 + }, + { + "epoch": 0.07468407804454212, + "grad_norm": 0.7703375816345215, + "learning_rate": 9.965668541046491e-05, + "loss": 0.9871, + "step": 11690 + }, + { + "epoch": 0.07474796519428083, + "grad_norm": 0.5986992716789246, + "learning_rate": 9.965609816767066e-05, + "loss": 0.793, + "step": 11700 + }, + { + "epoch": 0.07481185234401952, + "grad_norm": 0.7556684613227844, + "learning_rate": 9.965551042479655e-05, + "loss": 0.9343, + "step": 11710 + }, + { + "epoch": 0.07487573949375823, + "grad_norm": 
0.7659729719161987, + "learning_rate": 9.965492218184848e-05, + "loss": 0.8594, + "step": 11720 + }, + { + "epoch": 0.07493962664349693, + "grad_norm": 0.7803331017494202, + "learning_rate": 9.965433343883239e-05, + "loss": 0.7292, + "step": 11730 + }, + { + "epoch": 0.07500351379323562, + "grad_norm": 0.9800279140472412, + "learning_rate": 9.96537441957542e-05, + "loss": 0.7982, + "step": 11740 + }, + { + "epoch": 0.07506740094297433, + "grad_norm": 1.3977315425872803, + "learning_rate": 9.965315445261986e-05, + "loss": 0.7011, + "step": 11750 + }, + { + "epoch": 0.07513128809271304, + "grad_norm": 0.6457341313362122, + "learning_rate": 9.965256420943529e-05, + "loss": 0.8958, + "step": 11760 + }, + { + "epoch": 0.07519517524245173, + "grad_norm": 0.789249062538147, + "learning_rate": 9.965197346620645e-05, + "loss": 0.8956, + "step": 11770 + }, + { + "epoch": 0.07525906239219043, + "grad_norm": 0.8489546179771423, + "learning_rate": 9.965138222293928e-05, + "loss": 0.8684, + "step": 11780 + }, + { + "epoch": 0.07532294954192914, + "grad_norm": 0.7303208112716675, + "learning_rate": 9.965079047963974e-05, + "loss": 0.9646, + "step": 11790 + }, + { + "epoch": 0.07538683669166783, + "grad_norm": 3.839034080505371, + "learning_rate": 9.965019823631378e-05, + "loss": 0.8553, + "step": 11800 + }, + { + "epoch": 0.07545072384140654, + "grad_norm": 1.2064359188079834, + "learning_rate": 9.964960549296736e-05, + "loss": 1.0195, + "step": 11810 + }, + { + "epoch": 0.07551461099114525, + "grad_norm": 0.7502697706222534, + "learning_rate": 9.964901224960647e-05, + "loss": 0.9259, + "step": 11820 + }, + { + "epoch": 0.07557849814088394, + "grad_norm": 0.5781645774841309, + "learning_rate": 9.964841850623709e-05, + "loss": 0.8668, + "step": 11830 + }, + { + "epoch": 0.07564238529062264, + "grad_norm": 0.8652671575546265, + "learning_rate": 9.964782426286516e-05, + "loss": 0.8489, + "step": 11840 + }, + { + "epoch": 0.07570627244036135, + "grad_norm": 0.9653028845787048, + "learning_rate": 9.96472295194967e-05, + "loss": 0.9514, + "step": 11850 + }, + { + "epoch": 0.07577015959010004, + "grad_norm": 2.5349843502044678, + "learning_rate": 9.964663427613769e-05, + "loss": 1.0536, + "step": 11860 + }, + { + "epoch": 0.07583404673983875, + "grad_norm": 1.0257644653320312, + "learning_rate": 9.96460385327941e-05, + "loss": 1.05, + "step": 11870 + }, + { + "epoch": 0.07589793388957745, + "grad_norm": 0.6599146723747253, + "learning_rate": 9.964544228947199e-05, + "loss": 0.9347, + "step": 11880 + }, + { + "epoch": 0.07596182103931615, + "grad_norm": 1.0453253984451294, + "learning_rate": 9.96448455461773e-05, + "loss": 0.9054, + "step": 11890 + }, + { + "epoch": 0.07602570818905485, + "grad_norm": 0.5662599802017212, + "learning_rate": 9.964424830291607e-05, + "loss": 0.9117, + "step": 11900 + }, + { + "epoch": 0.07608959533879356, + "grad_norm": 0.6186836361885071, + "learning_rate": 9.964365055969431e-05, + "loss": 0.9725, + "step": 11910 + }, + { + "epoch": 0.07615348248853225, + "grad_norm": 0.8609874844551086, + "learning_rate": 9.964305231651804e-05, + "loss": 0.9634, + "step": 11920 + }, + { + "epoch": 0.07621736963827096, + "grad_norm": 0.8729275465011597, + "learning_rate": 9.96424535733933e-05, + "loss": 0.9384, + "step": 11930 + }, + { + "epoch": 0.07628125678800966, + "grad_norm": 0.9938400387763977, + "learning_rate": 9.964185433032609e-05, + "loss": 0.8695, + "step": 11940 + }, + { + "epoch": 0.07634514393774836, + "grad_norm": 0.836526095867157, + "learning_rate": 9.964125458732247e-05, + 
"loss": 0.9405, + "step": 11950 + }, + { + "epoch": 0.07640903108748706, + "grad_norm": 0.7302273511886597, + "learning_rate": 9.964065434438846e-05, + "loss": 1.0793, + "step": 11960 + }, + { + "epoch": 0.07647291823722577, + "grad_norm": 0.49212926626205444, + "learning_rate": 9.964005360153013e-05, + "loss": 0.8772, + "step": 11970 + }, + { + "epoch": 0.07653680538696446, + "grad_norm": 0.6889157295227051, + "learning_rate": 9.963945235875351e-05, + "loss": 0.9, + "step": 11980 + }, + { + "epoch": 0.07660069253670317, + "grad_norm": 0.9073895215988159, + "learning_rate": 9.963885061606466e-05, + "loss": 1.2127, + "step": 11990 + }, + { + "epoch": 0.07666457968644187, + "grad_norm": 0.8105494976043701, + "learning_rate": 9.963824837346963e-05, + "loss": 0.8683, + "step": 12000 + }, + { + "epoch": 0.07672846683618056, + "grad_norm": 0.9559453129768372, + "learning_rate": 9.963764563097451e-05, + "loss": 0.8229, + "step": 12010 + }, + { + "epoch": 0.07679235398591927, + "grad_norm": 0.7197737693786621, + "learning_rate": 9.963704238858535e-05, + "loss": 1.0417, + "step": 12020 + }, + { + "epoch": 0.07685624113565798, + "grad_norm": 1.704092025756836, + "learning_rate": 9.963643864630823e-05, + "loss": 0.8046, + "step": 12030 + }, + { + "epoch": 0.07692012828539667, + "grad_norm": 0.7579613327980042, + "learning_rate": 9.963583440414923e-05, + "loss": 0.9269, + "step": 12040 + }, + { + "epoch": 0.07698401543513538, + "grad_norm": 1.0408282279968262, + "learning_rate": 9.963522966211444e-05, + "loss": 1.0785, + "step": 12050 + }, + { + "epoch": 0.07704790258487408, + "grad_norm": 0.5655786991119385, + "learning_rate": 9.963462442020994e-05, + "loss": 0.8481, + "step": 12060 + }, + { + "epoch": 0.07711178973461277, + "grad_norm": 0.6558650732040405, + "learning_rate": 9.963401867844184e-05, + "loss": 0.9213, + "step": 12070 + }, + { + "epoch": 0.07717567688435148, + "grad_norm": 0.9138306975364685, + "learning_rate": 9.963341243681623e-05, + "loss": 0.8109, + "step": 12080 + }, + { + "epoch": 0.07723956403409019, + "grad_norm": 0.8476769924163818, + "learning_rate": 9.963280569533923e-05, + "loss": 0.8877, + "step": 12090 + }, + { + "epoch": 0.07730345118382888, + "grad_norm": 1.9213597774505615, + "learning_rate": 9.963219845401692e-05, + "loss": 0.8959, + "step": 12100 + }, + { + "epoch": 0.07736733833356758, + "grad_norm": 0.6933993697166443, + "learning_rate": 9.963159071285544e-05, + "loss": 0.8968, + "step": 12110 + }, + { + "epoch": 0.07743122548330629, + "grad_norm": 0.6891202926635742, + "learning_rate": 9.963098247186091e-05, + "loss": 1.2008, + "step": 12120 + }, + { + "epoch": 0.07749511263304498, + "grad_norm": 0.7064499855041504, + "learning_rate": 9.963037373103944e-05, + "loss": 0.9018, + "step": 12130 + }, + { + "epoch": 0.07755899978278369, + "grad_norm": 0.7487188577651978, + "learning_rate": 9.962976449039717e-05, + "loss": 1.0011, + "step": 12140 + }, + { + "epoch": 0.0776228869325224, + "grad_norm": 0.8367332816123962, + "learning_rate": 9.962915474994023e-05, + "loss": 0.9068, + "step": 12150 + }, + { + "epoch": 0.0776867740822611, + "grad_norm": 1.0736783742904663, + "learning_rate": 9.962854450967478e-05, + "loss": 0.9293, + "step": 12160 + }, + { + "epoch": 0.0777506612319998, + "grad_norm": 0.715390682220459, + "learning_rate": 9.962793376960695e-05, + "loss": 0.9036, + "step": 12170 + }, + { + "epoch": 0.0778145483817385, + "grad_norm": 1.1531165838241577, + "learning_rate": 9.962732252974289e-05, + "loss": 0.7847, + "step": 12180 + }, + { + "epoch": 
0.0778784355314772, + "grad_norm": 0.6619348526000977, + "learning_rate": 9.962671079008876e-05, + "loss": 1.0075, + "step": 12190 + }, + { + "epoch": 0.0779423226812159, + "grad_norm": 1.0544220209121704, + "learning_rate": 9.962609855065072e-05, + "loss": 0.9982, + "step": 12200 + }, + { + "epoch": 0.0780062098309546, + "grad_norm": 0.6626638174057007, + "learning_rate": 9.962548581143494e-05, + "loss": 1.0559, + "step": 12210 + }, + { + "epoch": 0.07807009698069331, + "grad_norm": 1.291588544845581, + "learning_rate": 9.962487257244757e-05, + "loss": 1.0497, + "step": 12220 + }, + { + "epoch": 0.078133984130432, + "grad_norm": 0.7503036260604858, + "learning_rate": 9.962425883369481e-05, + "loss": 0.9837, + "step": 12230 + }, + { + "epoch": 0.07819787128017071, + "grad_norm": 0.789021909236908, + "learning_rate": 9.962364459518283e-05, + "loss": 0.8779, + "step": 12240 + }, + { + "epoch": 0.07826175842990941, + "grad_norm": 1.2305183410644531, + "learning_rate": 9.962302985691783e-05, + "loss": 0.9292, + "step": 12250 + }, + { + "epoch": 0.0783256455796481, + "grad_norm": 1.5961018800735474, + "learning_rate": 9.962241461890598e-05, + "loss": 0.9467, + "step": 12260 + }, + { + "epoch": 0.07838953272938681, + "grad_norm": 0.5835550427436829, + "learning_rate": 9.962179888115348e-05, + "loss": 1.0957, + "step": 12270 + }, + { + "epoch": 0.07845341987912552, + "grad_norm": 1.0020620822906494, + "learning_rate": 9.962118264366655e-05, + "loss": 0.9427, + "step": 12280 + }, + { + "epoch": 0.07851730702886421, + "grad_norm": 0.6819837689399719, + "learning_rate": 9.962056590645136e-05, + "loss": 1.0855, + "step": 12290 + }, + { + "epoch": 0.07858119417860292, + "grad_norm": 1.3488112688064575, + "learning_rate": 9.961994866951416e-05, + "loss": 0.6407, + "step": 12300 + }, + { + "epoch": 0.07864508132834162, + "grad_norm": 0.8530036807060242, + "learning_rate": 9.961933093286115e-05, + "loss": 1.0095, + "step": 12310 + }, + { + "epoch": 0.07870896847808032, + "grad_norm": 0.7318217158317566, + "learning_rate": 9.961871269649854e-05, + "loss": 0.8607, + "step": 12320 + }, + { + "epoch": 0.07877285562781902, + "grad_norm": 0.5192087292671204, + "learning_rate": 9.96180939604326e-05, + "loss": 0.7035, + "step": 12330 + }, + { + "epoch": 0.07883674277755773, + "grad_norm": 0.8365872502326965, + "learning_rate": 9.961747472466949e-05, + "loss": 1.4109, + "step": 12340 + }, + { + "epoch": 0.07890062992729642, + "grad_norm": 0.9271693229675293, + "learning_rate": 9.96168549892155e-05, + "loss": 0.842, + "step": 12350 + }, + { + "epoch": 0.07896451707703513, + "grad_norm": 1.00367271900177, + "learning_rate": 9.961623475407684e-05, + "loss": 1.0556, + "step": 12360 + }, + { + "epoch": 0.07902840422677383, + "grad_norm": 1.339418888092041, + "learning_rate": 9.96156140192598e-05, + "loss": 0.8171, + "step": 12370 + }, + { + "epoch": 0.07909229137651252, + "grad_norm": 1.03416109085083, + "learning_rate": 9.961499278477058e-05, + "loss": 0.8902, + "step": 12380 + }, + { + "epoch": 0.07915617852625123, + "grad_norm": 0.847169041633606, + "learning_rate": 9.961437105061546e-05, + "loss": 0.9201, + "step": 12390 + }, + { + "epoch": 0.07922006567598994, + "grad_norm": 1.1525788307189941, + "learning_rate": 9.961374881680072e-05, + "loss": 1.054, + "step": 12400 + }, + { + "epoch": 0.07928395282572863, + "grad_norm": 0.7588199973106384, + "learning_rate": 9.96131260833326e-05, + "loss": 0.9179, + "step": 12410 + }, + { + "epoch": 0.07934783997546734, + "grad_norm": 1.2406294345855713, + "learning_rate": 
9.961250285021737e-05, + "loss": 1.1218, + "step": 12420 + }, + { + "epoch": 0.07941172712520604, + "grad_norm": 0.7575234174728394, + "learning_rate": 9.961187911746133e-05, + "loss": 1.0122, + "step": 12430 + }, + { + "epoch": 0.07947561427494473, + "grad_norm": 0.7496919631958008, + "learning_rate": 9.961125488507072e-05, + "loss": 1.0282, + "step": 12440 + }, + { + "epoch": 0.07953950142468344, + "grad_norm": 0.8383338451385498, + "learning_rate": 9.961063015305188e-05, + "loss": 0.9828, + "step": 12450 + }, + { + "epoch": 0.07960338857442215, + "grad_norm": 1.0005531311035156, + "learning_rate": 9.961000492141106e-05, + "loss": 1.061, + "step": 12460 + }, + { + "epoch": 0.07966727572416084, + "grad_norm": 0.9767794013023376, + "learning_rate": 9.960937919015458e-05, + "loss": 1.0097, + "step": 12470 + }, + { + "epoch": 0.07973116287389954, + "grad_norm": 0.7348878383636475, + "learning_rate": 9.960875295928874e-05, + "loss": 0.8203, + "step": 12480 + }, + { + "epoch": 0.07979505002363825, + "grad_norm": 0.7473248243331909, + "learning_rate": 9.960812622881982e-05, + "loss": 0.8261, + "step": 12490 + }, + { + "epoch": 0.07985893717337694, + "grad_norm": 0.6296994686126709, + "learning_rate": 9.960749899875417e-05, + "loss": 0.9531, + "step": 12500 + }, + { + "epoch": 0.07992282432311565, + "grad_norm": 0.48655831813812256, + "learning_rate": 9.960687126909807e-05, + "loss": 0.8131, + "step": 12510 + }, + { + "epoch": 0.07998671147285435, + "grad_norm": 0.8312428593635559, + "learning_rate": 9.960624303985787e-05, + "loss": 0.7988, + "step": 12520 + }, + { + "epoch": 0.08005059862259305, + "grad_norm": 0.7593886256217957, + "learning_rate": 9.96056143110399e-05, + "loss": 0.6993, + "step": 12530 + }, + { + "epoch": 0.08011448577233175, + "grad_norm": 0.9787190556526184, + "learning_rate": 9.960498508265046e-05, + "loss": 1.1168, + "step": 12540 + }, + { + "epoch": 0.08017837292207046, + "grad_norm": 1.374013066291809, + "learning_rate": 9.960435535469591e-05, + "loss": 0.959, + "step": 12550 + }, + { + "epoch": 0.08024226007180915, + "grad_norm": 0.632503867149353, + "learning_rate": 9.960372512718258e-05, + "loss": 0.9161, + "step": 12560 + }, + { + "epoch": 0.08030614722154786, + "grad_norm": 0.7403663992881775, + "learning_rate": 9.960309440011685e-05, + "loss": 0.5914, + "step": 12570 + }, + { + "epoch": 0.08037003437128656, + "grad_norm": 0.691646158695221, + "learning_rate": 9.960246317350503e-05, + "loss": 0.8991, + "step": 12580 + }, + { + "epoch": 0.08043392152102526, + "grad_norm": 0.5965979099273682, + "learning_rate": 9.960183144735348e-05, + "loss": 0.81, + "step": 12590 + }, + { + "epoch": 0.08049780867076396, + "grad_norm": 0.9545162320137024, + "learning_rate": 9.960119922166859e-05, + "loss": 1.0659, + "step": 12600 + }, + { + "epoch": 0.08056169582050267, + "grad_norm": 2.2266764640808105, + "learning_rate": 9.960056649645673e-05, + "loss": 1.2056, + "step": 12610 + }, + { + "epoch": 0.08062558297024136, + "grad_norm": 1.257367730140686, + "learning_rate": 9.959993327172423e-05, + "loss": 1.0144, + "step": 12620 + }, + { + "epoch": 0.08068947011998007, + "grad_norm": 0.8366072177886963, + "learning_rate": 9.959929954747751e-05, + "loss": 0.896, + "step": 12630 + }, + { + "epoch": 0.08075335726971877, + "grad_norm": 0.71613609790802, + "learning_rate": 9.959866532372292e-05, + "loss": 0.7121, + "step": 12640 + }, + { + "epoch": 0.08081724441945747, + "grad_norm": 0.678428053855896, + "learning_rate": 9.959803060046687e-05, + "loss": 0.8114, + "step": 12650 + }, + { + 
"epoch": 0.08088113156919617, + "grad_norm": 0.8528268337249756, + "learning_rate": 9.959739537771573e-05, + "loss": 0.9052, + "step": 12660 + }, + { + "epoch": 0.08094501871893488, + "grad_norm": 0.8090612292289734, + "learning_rate": 9.959675965547592e-05, + "loss": 0.9429, + "step": 12670 + }, + { + "epoch": 0.08100890586867357, + "grad_norm": 1.0413676500320435, + "learning_rate": 9.959612343375385e-05, + "loss": 0.9671, + "step": 12680 + }, + { + "epoch": 0.08107279301841228, + "grad_norm": 0.6349504590034485, + "learning_rate": 9.959548671255588e-05, + "loss": 1.0272, + "step": 12690 + }, + { + "epoch": 0.08113668016815098, + "grad_norm": 1.0371969938278198, + "learning_rate": 9.959484949188846e-05, + "loss": 0.7439, + "step": 12700 + }, + { + "epoch": 0.08120056731788967, + "grad_norm": 0.7047412991523743, + "learning_rate": 9.9594211771758e-05, + "loss": 0.8986, + "step": 12710 + }, + { + "epoch": 0.08126445446762838, + "grad_norm": 0.659905195236206, + "learning_rate": 9.959357355217093e-05, + "loss": 0.7917, + "step": 12720 + }, + { + "epoch": 0.08132834161736709, + "grad_norm": 0.7714025378227234, + "learning_rate": 9.959293483313368e-05, + "loss": 0.826, + "step": 12730 + }, + { + "epoch": 0.08139222876710578, + "grad_norm": 1.3492543697357178, + "learning_rate": 9.959229561465266e-05, + "loss": 1.0079, + "step": 12740 + }, + { + "epoch": 0.08145611591684448, + "grad_norm": 0.7474777698516846, + "learning_rate": 9.959165589673432e-05, + "loss": 0.8973, + "step": 12750 + }, + { + "epoch": 0.08152000306658319, + "grad_norm": 0.6047500371932983, + "learning_rate": 9.959101567938509e-05, + "loss": 0.8909, + "step": 12760 + }, + { + "epoch": 0.08158389021632188, + "grad_norm": 0.7488225698471069, + "learning_rate": 9.959037496261146e-05, + "loss": 0.9554, + "step": 12770 + }, + { + "epoch": 0.08164777736606059, + "grad_norm": 1.0440471172332764, + "learning_rate": 9.958973374641982e-05, + "loss": 0.7622, + "step": 12780 + }, + { + "epoch": 0.0817116645157993, + "grad_norm": 0.6892119646072388, + "learning_rate": 9.958909203081668e-05, + "loss": 0.9316, + "step": 12790 + }, + { + "epoch": 0.08177555166553799, + "grad_norm": 0.7813330292701721, + "learning_rate": 9.958844981580847e-05, + "loss": 1.0202, + "step": 12800 + }, + { + "epoch": 0.0818394388152767, + "grad_norm": 0.926389217376709, + "learning_rate": 9.958780710140167e-05, + "loss": 1.0061, + "step": 12810 + }, + { + "epoch": 0.0819033259650154, + "grad_norm": 0.7981832027435303, + "learning_rate": 9.958716388760277e-05, + "loss": 0.9619, + "step": 12820 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.7643110752105713, + "learning_rate": 9.958652017441822e-05, + "loss": 1.0358, + "step": 12830 + }, + { + "epoch": 0.0820311002644928, + "grad_norm": 2.3932769298553467, + "learning_rate": 9.958587596185451e-05, + "loss": 0.8638, + "step": 12840 + }, + { + "epoch": 0.0820949874142315, + "grad_norm": 0.6485501527786255, + "learning_rate": 9.958523124991814e-05, + "loss": 0.8252, + "step": 12850 + }, + { + "epoch": 0.0821588745639702, + "grad_norm": 1.1081517934799194, + "learning_rate": 9.958458603861559e-05, + "loss": 0.6834, + "step": 12860 + }, + { + "epoch": 0.0822227617137089, + "grad_norm": 0.6985851526260376, + "learning_rate": 9.958394032795335e-05, + "loss": 0.8498, + "step": 12870 + }, + { + "epoch": 0.08228664886344761, + "grad_norm": 0.9049435257911682, + "learning_rate": 9.958329411793796e-05, + "loss": 0.832, + "step": 12880 + }, + { + "epoch": 0.0823505360131863, + "grad_norm": 1.0366233587265015, + 
"learning_rate": 9.958264740857588e-05, + "loss": 0.6583, + "step": 12890 + }, + { + "epoch": 0.08241442316292501, + "grad_norm": 0.5812174081802368, + "learning_rate": 9.958200019987364e-05, + "loss": 0.8656, + "step": 12900 + }, + { + "epoch": 0.08247831031266371, + "grad_norm": 0.5848665237426758, + "learning_rate": 9.95813524918378e-05, + "loss": 0.9233, + "step": 12910 + }, + { + "epoch": 0.0825421974624024, + "grad_norm": 0.8434141278266907, + "learning_rate": 9.958070428447481e-05, + "loss": 0.8677, + "step": 12920 + }, + { + "epoch": 0.08260608461214111, + "grad_norm": 0.6627490520477295, + "learning_rate": 9.958005557779125e-05, + "loss": 1.0076, + "step": 12930 + }, + { + "epoch": 0.08266997176187982, + "grad_norm": 0.5368894934654236, + "learning_rate": 9.957940637179364e-05, + "loss": 1.0226, + "step": 12940 + }, + { + "epoch": 0.08273385891161851, + "grad_norm": 0.9018540978431702, + "learning_rate": 9.95787566664885e-05, + "loss": 0.9392, + "step": 12950 + }, + { + "epoch": 0.08279774606135722, + "grad_norm": 0.9104921817779541, + "learning_rate": 9.957810646188242e-05, + "loss": 0.8816, + "step": 12960 + }, + { + "epoch": 0.08286163321109592, + "grad_norm": 1.005777359008789, + "learning_rate": 9.957745575798189e-05, + "loss": 0.9567, + "step": 12970 + }, + { + "epoch": 0.08292552036083463, + "grad_norm": 0.9677864909172058, + "learning_rate": 9.957680455479348e-05, + "loss": 0.865, + "step": 12980 + }, + { + "epoch": 0.08298940751057332, + "grad_norm": 0.5736163854598999, + "learning_rate": 9.957615285232379e-05, + "loss": 0.9897, + "step": 12990 + }, + { + "epoch": 0.08305329466031203, + "grad_norm": 1.2024660110473633, + "learning_rate": 9.957550065057932e-05, + "loss": 0.8672, + "step": 13000 + }, + { + "epoch": 0.08311718181005073, + "grad_norm": 0.7755523920059204, + "learning_rate": 9.95748479495667e-05, + "loss": 0.833, + "step": 13010 + }, + { + "epoch": 0.08318106895978943, + "grad_norm": 2.249293088912964, + "learning_rate": 9.957419474929246e-05, + "loss": 0.9011, + "step": 13020 + }, + { + "epoch": 0.08324495610952813, + "grad_norm": 1.1623985767364502, + "learning_rate": 9.957354104976317e-05, + "loss": 0.9665, + "step": 13030 + }, + { + "epoch": 0.08330884325926684, + "grad_norm": 0.9698325395584106, + "learning_rate": 9.957288685098547e-05, + "loss": 0.581, + "step": 13040 + }, + { + "epoch": 0.08337273040900553, + "grad_norm": 1.5064680576324463, + "learning_rate": 9.957223215296589e-05, + "loss": 0.993, + "step": 13050 + }, + { + "epoch": 0.08343661755874424, + "grad_norm": 0.9795089960098267, + "learning_rate": 9.957157695571106e-05, + "loss": 0.8646, + "step": 13060 + }, + { + "epoch": 0.08350050470848294, + "grad_norm": 1.1535509824752808, + "learning_rate": 9.957092125922755e-05, + "loss": 0.9196, + "step": 13070 + }, + { + "epoch": 0.08356439185822163, + "grad_norm": 0.5842729210853577, + "learning_rate": 9.957026506352198e-05, + "loss": 1.1444, + "step": 13080 + }, + { + "epoch": 0.08362827900796034, + "grad_norm": 1.164316177368164, + "learning_rate": 9.956960836860096e-05, + "loss": 0.8979, + "step": 13090 + }, + { + "epoch": 0.08369216615769905, + "grad_norm": 1.0627108812332153, + "learning_rate": 9.956895117447112e-05, + "loss": 1.1704, + "step": 13100 + }, + { + "epoch": 0.08375605330743774, + "grad_norm": 0.5449188947677612, + "learning_rate": 9.956829348113903e-05, + "loss": 0.9608, + "step": 13110 + }, + { + "epoch": 0.08381994045717645, + "grad_norm": 0.8680428862571716, + "learning_rate": 9.956763528861135e-05, + "loss": 0.9228, + "step": 
13120 + }, + { + "epoch": 0.08388382760691515, + "grad_norm": 0.9110902547836304, + "learning_rate": 9.95669765968947e-05, + "loss": 1.4213, + "step": 13130 + }, + { + "epoch": 0.08394771475665384, + "grad_norm": 2.3549108505249023, + "learning_rate": 9.956631740599571e-05, + "loss": 0.9036, + "step": 13140 + }, + { + "epoch": 0.08401160190639255, + "grad_norm": 0.9437476992607117, + "learning_rate": 9.956565771592103e-05, + "loss": 0.9577, + "step": 13150 + }, + { + "epoch": 0.08407548905613126, + "grad_norm": 0.5156351923942566, + "learning_rate": 9.956499752667729e-05, + "loss": 1.0223, + "step": 13160 + }, + { + "epoch": 0.08413937620586995, + "grad_norm": 0.6962876915931702, + "learning_rate": 9.956433683827115e-05, + "loss": 0.7827, + "step": 13170 + }, + { + "epoch": 0.08420326335560865, + "grad_norm": 1.191227912902832, + "learning_rate": 9.956367565070927e-05, + "loss": 0.7738, + "step": 13180 + }, + { + "epoch": 0.08426715050534736, + "grad_norm": 0.9918831586837769, + "learning_rate": 9.956301396399829e-05, + "loss": 1.2268, + "step": 13190 + }, + { + "epoch": 0.08433103765508605, + "grad_norm": 1.3545849323272705, + "learning_rate": 9.956235177814488e-05, + "loss": 0.9728, + "step": 13200 + }, + { + "epoch": 0.08439492480482476, + "grad_norm": 0.8052165508270264, + "learning_rate": 9.956168909315571e-05, + "loss": 0.8022, + "step": 13210 + }, + { + "epoch": 0.08445881195456346, + "grad_norm": 1.1841431856155396, + "learning_rate": 9.956102590903744e-05, + "loss": 0.8663, + "step": 13220 + }, + { + "epoch": 0.08452269910430216, + "grad_norm": 1.1858928203582764, + "learning_rate": 9.956036222579679e-05, + "loss": 0.8862, + "step": 13230 + }, + { + "epoch": 0.08458658625404086, + "grad_norm": 0.6900216937065125, + "learning_rate": 9.955969804344039e-05, + "loss": 0.7973, + "step": 13240 + }, + { + "epoch": 0.08465047340377957, + "grad_norm": 0.737177848815918, + "learning_rate": 9.955903336197497e-05, + "loss": 0.6908, + "step": 13250 + }, + { + "epoch": 0.08471436055351826, + "grad_norm": 1.1123918294906616, + "learning_rate": 9.955836818140721e-05, + "loss": 0.8086, + "step": 13260 + }, + { + "epoch": 0.08477824770325697, + "grad_norm": 0.9774020910263062, + "learning_rate": 9.95577025017438e-05, + "loss": 0.8224, + "step": 13270 + }, + { + "epoch": 0.08484213485299567, + "grad_norm": 1.0861930847167969, + "learning_rate": 9.955703632299144e-05, + "loss": 1.216, + "step": 13280 + }, + { + "epoch": 0.08490602200273437, + "grad_norm": 0.6377803683280945, + "learning_rate": 9.955636964515688e-05, + "loss": 1.0431, + "step": 13290 + }, + { + "epoch": 0.08496990915247307, + "grad_norm": 0.799303412437439, + "learning_rate": 9.95557024682468e-05, + "loss": 0.8008, + "step": 13300 + }, + { + "epoch": 0.08503379630221178, + "grad_norm": 0.6764736175537109, + "learning_rate": 9.955503479226791e-05, + "loss": 0.856, + "step": 13310 + }, + { + "epoch": 0.08509768345195047, + "grad_norm": 0.7718757390975952, + "learning_rate": 9.955436661722696e-05, + "loss": 1.1674, + "step": 13320 + }, + { + "epoch": 0.08516157060168918, + "grad_norm": 0.8467085957527161, + "learning_rate": 9.955369794313066e-05, + "loss": 0.7126, + "step": 13330 + }, + { + "epoch": 0.08522545775142788, + "grad_norm": 0.7613494992256165, + "learning_rate": 9.955302876998576e-05, + "loss": 0.8779, + "step": 13340 + }, + { + "epoch": 0.08528934490116657, + "grad_norm": 1.5320026874542236, + "learning_rate": 9.955235909779898e-05, + "loss": 0.92, + "step": 13350 + }, + { + "epoch": 0.08535323205090528, + "grad_norm": 
0.9841747879981995, + "learning_rate": 9.955168892657709e-05, + "loss": 1.195, + "step": 13360 + }, + { + "epoch": 0.08541711920064399, + "grad_norm": 0.9456724524497986, + "learning_rate": 9.955101825632681e-05, + "loss": 0.8966, + "step": 13370 + }, + { + "epoch": 0.08548100635038268, + "grad_norm": 0.6288855671882629, + "learning_rate": 9.95503470870549e-05, + "loss": 1.018, + "step": 13380 + }, + { + "epoch": 0.08554489350012139, + "grad_norm": 0.6074085831642151, + "learning_rate": 9.954967541876816e-05, + "loss": 1.1021, + "step": 13390 + }, + { + "epoch": 0.08560878064986009, + "grad_norm": 0.6871976852416992, + "learning_rate": 9.954900325147329e-05, + "loss": 0.7936, + "step": 13400 + }, + { + "epoch": 0.08567266779959878, + "grad_norm": 1.1917479038238525, + "learning_rate": 9.954833058517712e-05, + "loss": 1.0316, + "step": 13410 + }, + { + "epoch": 0.08573655494933749, + "grad_norm": 0.8669334650039673, + "learning_rate": 9.954765741988638e-05, + "loss": 0.7559, + "step": 13420 + }, + { + "epoch": 0.0858004420990762, + "grad_norm": 1.0920523405075073, + "learning_rate": 9.954698375560786e-05, + "loss": 1.0566, + "step": 13430 + }, + { + "epoch": 0.08586432924881489, + "grad_norm": 0.6692205667495728, + "learning_rate": 9.954630959234835e-05, + "loss": 1.1381, + "step": 13440 + }, + { + "epoch": 0.0859282163985536, + "grad_norm": 0.435250461101532, + "learning_rate": 9.954563493011464e-05, + "loss": 0.656, + "step": 13450 + }, + { + "epoch": 0.0859921035482923, + "grad_norm": 0.719704806804657, + "learning_rate": 9.954495976891354e-05, + "loss": 0.9106, + "step": 13460 + }, + { + "epoch": 0.08605599069803099, + "grad_norm": 1.0210596323013306, + "learning_rate": 9.95442841087518e-05, + "loss": 1.0513, + "step": 13470 + }, + { + "epoch": 0.0861198778477697, + "grad_norm": 0.8312535881996155, + "learning_rate": 9.954360794963629e-05, + "loss": 0.9642, + "step": 13480 + }, + { + "epoch": 0.0861837649975084, + "grad_norm": 0.7173671126365662, + "learning_rate": 9.954299897983244e-05, + "loss": 0.9963, + "step": 13490 + }, + { + "epoch": 0.0862476521472471, + "grad_norm": 0.8660849928855896, + "learning_rate": 9.954232187272345e-05, + "loss": 0.7152, + "step": 13500 + }, + { + "epoch": 0.0863115392969858, + "grad_norm": 0.757793664932251, + "learning_rate": 9.954164426668044e-05, + "loss": 1.0053, + "step": 13510 + }, + { + "epoch": 0.08637542644672451, + "grad_norm": 0.6356269717216492, + "learning_rate": 9.954096616171018e-05, + "loss": 1.0546, + "step": 13520 + }, + { + "epoch": 0.0864393135964632, + "grad_norm": 0.6191072463989258, + "learning_rate": 9.954028755781956e-05, + "loss": 0.8486, + "step": 13530 + }, + { + "epoch": 0.08650320074620191, + "grad_norm": 1.0523960590362549, + "learning_rate": 9.953960845501537e-05, + "loss": 0.8107, + "step": 13540 + }, + { + "epoch": 0.08656708789594061, + "grad_norm": 0.7796614170074463, + "learning_rate": 9.953892885330447e-05, + "loss": 0.8723, + "step": 13550 + }, + { + "epoch": 0.0866309750456793, + "grad_norm": 0.7295846939086914, + "learning_rate": 9.953824875269369e-05, + "loss": 0.913, + "step": 13560 + }, + { + "epoch": 0.08669486219541801, + "grad_norm": 1.0830540657043457, + "learning_rate": 9.95375681531899e-05, + "loss": 0.8378, + "step": 13570 + }, + { + "epoch": 0.08675874934515672, + "grad_norm": 1.3589000701904297, + "learning_rate": 9.953688705479994e-05, + "loss": 0.9502, + "step": 13580 + }, + { + "epoch": 0.08682263649489541, + "grad_norm": 0.796097993850708, + "learning_rate": 9.953620545753067e-05, + "loss": 
0.6924, + "step": 13590 + }, + { + "epoch": 0.08688652364463412, + "grad_norm": 0.933182954788208, + "learning_rate": 9.953552336138896e-05, + "loss": 1.0789, + "step": 13600 + }, + { + "epoch": 0.08695041079437282, + "grad_norm": 0.830242395401001, + "learning_rate": 9.953484076638166e-05, + "loss": 0.7949, + "step": 13610 + }, + { + "epoch": 0.08701429794411152, + "grad_norm": 0.751649022102356, + "learning_rate": 9.953415767251568e-05, + "loss": 0.8326, + "step": 13620 + }, + { + "epoch": 0.08707818509385022, + "grad_norm": 3.0472450256347656, + "learning_rate": 9.953347407979788e-05, + "loss": 0.7271, + "step": 13630 + }, + { + "epoch": 0.08714207224358893, + "grad_norm": 0.4621819853782654, + "learning_rate": 9.953278998823513e-05, + "loss": 1.0762, + "step": 13640 + }, + { + "epoch": 0.08720595939332762, + "grad_norm": 0.8232291340827942, + "learning_rate": 9.953210539783434e-05, + "loss": 0.7763, + "step": 13650 + }, + { + "epoch": 0.08726984654306633, + "grad_norm": 1.4312729835510254, + "learning_rate": 9.953142030860238e-05, + "loss": 0.8253, + "step": 13660 + }, + { + "epoch": 0.08733373369280503, + "grad_norm": 0.9248529672622681, + "learning_rate": 9.95307347205462e-05, + "loss": 1.0337, + "step": 13670 + }, + { + "epoch": 0.08739762084254372, + "grad_norm": 0.6953186392784119, + "learning_rate": 9.953004863367264e-05, + "loss": 0.93, + "step": 13680 + }, + { + "epoch": 0.08746150799228243, + "grad_norm": 0.8455583453178406, + "learning_rate": 9.952936204798866e-05, + "loss": 0.8386, + "step": 13690 + }, + { + "epoch": 0.08752539514202114, + "grad_norm": 1.119112253189087, + "learning_rate": 9.952867496350115e-05, + "loss": 0.8611, + "step": 13700 + }, + { + "epoch": 0.08758928229175983, + "grad_norm": 0.8125833868980408, + "learning_rate": 9.952798738021703e-05, + "loss": 0.9875, + "step": 13710 + }, + { + "epoch": 0.08765316944149854, + "grad_norm": 0.6726895570755005, + "learning_rate": 9.952729929814323e-05, + "loss": 1.0715, + "step": 13720 + }, + { + "epoch": 0.08771705659123724, + "grad_norm": 0.6909586787223816, + "learning_rate": 9.952661071728669e-05, + "loss": 0.9544, + "step": 13730 + }, + { + "epoch": 0.08778094374097593, + "grad_norm": 0.983298122882843, + "learning_rate": 9.952592163765432e-05, + "loss": 1.0024, + "step": 13740 + }, + { + "epoch": 0.08784483089071464, + "grad_norm": 1.025319218635559, + "learning_rate": 9.952523205925309e-05, + "loss": 0.7382, + "step": 13750 + }, + { + "epoch": 0.08790871804045335, + "grad_norm": 2.100965976715088, + "learning_rate": 9.952454198208991e-05, + "loss": 0.8063, + "step": 13760 + }, + { + "epoch": 0.08797260519019204, + "grad_norm": 0.9277796149253845, + "learning_rate": 9.952385140617174e-05, + "loss": 0.963, + "step": 13770 + }, + { + "epoch": 0.08803649233993074, + "grad_norm": 1.1502079963684082, + "learning_rate": 9.952316033150556e-05, + "loss": 0.857, + "step": 13780 + }, + { + "epoch": 0.08810037948966945, + "grad_norm": 0.5683081746101379, + "learning_rate": 9.952246875809831e-05, + "loss": 0.8632, + "step": 13790 + }, + { + "epoch": 0.08816426663940814, + "grad_norm": 0.7985507249832153, + "learning_rate": 9.952177668595695e-05, + "loss": 0.693, + "step": 13800 + }, + { + "epoch": 0.08822815378914685, + "grad_norm": 0.9982643723487854, + "learning_rate": 9.952108411508845e-05, + "loss": 0.7695, + "step": 13810 + }, + { + "epoch": 0.08829204093888555, + "grad_norm": 0.8026944994926453, + "learning_rate": 9.952039104549981e-05, + "loss": 0.9093, + "step": 13820 + }, + { + "epoch": 0.08835592808862426, + 
"grad_norm": 0.9833221435546875, + "learning_rate": 9.951969747719798e-05, + "loss": 1.1041, + "step": 13830 + }, + { + "epoch": 0.08841981523836295, + "grad_norm": 0.7445331811904907, + "learning_rate": 9.951900341018996e-05, + "loss": 1.1706, + "step": 13840 + }, + { + "epoch": 0.08848370238810166, + "grad_norm": 0.7326770424842834, + "learning_rate": 9.951830884448274e-05, + "loss": 1.1022, + "step": 13850 + }, + { + "epoch": 0.08854758953784037, + "grad_norm": 1.3713650703430176, + "learning_rate": 9.95176137800833e-05, + "loss": 0.812, + "step": 13860 + }, + { + "epoch": 0.08861147668757906, + "grad_norm": 0.8719102740287781, + "learning_rate": 9.951691821699864e-05, + "loss": 1.037, + "step": 13870 + }, + { + "epoch": 0.08867536383731776, + "grad_norm": 0.7241623997688293, + "learning_rate": 9.951622215523579e-05, + "loss": 0.9797, + "step": 13880 + }, + { + "epoch": 0.08873925098705647, + "grad_norm": 0.9998733401298523, + "learning_rate": 9.951552559480176e-05, + "loss": 1.0036, + "step": 13890 + }, + { + "epoch": 0.08880313813679516, + "grad_norm": 1.31692373752594, + "learning_rate": 9.951482853570353e-05, + "loss": 1.0621, + "step": 13900 + }, + { + "epoch": 0.08886702528653387, + "grad_norm": 0.509678840637207, + "learning_rate": 9.951413097794816e-05, + "loss": 0.7828, + "step": 13910 + }, + { + "epoch": 0.08893091243627257, + "grad_norm": 0.6443775296211243, + "learning_rate": 9.951343292154263e-05, + "loss": 0.8265, + "step": 13920 + }, + { + "epoch": 0.08899479958601127, + "grad_norm": 1.014041781425476, + "learning_rate": 9.9512734366494e-05, + "loss": 1.0371, + "step": 13930 + }, + { + "epoch": 0.08905868673574997, + "grad_norm": 0.8309150338172913, + "learning_rate": 9.951203531280931e-05, + "loss": 0.9042, + "step": 13940 + }, + { + "epoch": 0.08912257388548868, + "grad_norm": 0.6780155897140503, + "learning_rate": 9.951133576049558e-05, + "loss": 0.8917, + "step": 13950 + }, + { + "epoch": 0.08918646103522737, + "grad_norm": 0.7868662476539612, + "learning_rate": 9.951063570955988e-05, + "loss": 0.9667, + "step": 13960 + }, + { + "epoch": 0.08925034818496608, + "grad_norm": 0.6636529564857483, + "learning_rate": 9.950993516000924e-05, + "loss": 0.8601, + "step": 13970 + }, + { + "epoch": 0.08931423533470478, + "grad_norm": 0.8302227854728699, + "learning_rate": 9.950923411185071e-05, + "loss": 0.9081, + "step": 13980 + }, + { + "epoch": 0.08937812248444348, + "grad_norm": 0.9507797360420227, + "learning_rate": 9.950853256509138e-05, + "loss": 0.7923, + "step": 13990 + }, + { + "epoch": 0.08944200963418218, + "grad_norm": 0.5564282536506653, + "learning_rate": 9.950783051973828e-05, + "loss": 0.9981, + "step": 14000 + }, + { + "epoch": 0.08950589678392089, + "grad_norm": 1.1084082126617432, + "learning_rate": 9.950712797579849e-05, + "loss": 0.7917, + "step": 14010 + }, + { + "epoch": 0.08956978393365958, + "grad_norm": 1.2243750095367432, + "learning_rate": 9.950642493327911e-05, + "loss": 1.0782, + "step": 14020 + }, + { + "epoch": 0.08963367108339829, + "grad_norm": 1.1874489784240723, + "learning_rate": 9.950572139218719e-05, + "loss": 0.9879, + "step": 14030 + }, + { + "epoch": 0.08969755823313699, + "grad_norm": 0.6582461595535278, + "learning_rate": 9.950501735252984e-05, + "loss": 0.8992, + "step": 14040 + }, + { + "epoch": 0.08976144538287568, + "grad_norm": 0.945318341255188, + "learning_rate": 9.950431281431413e-05, + "loss": 0.9753, + "step": 14050 + }, + { + "epoch": 0.08982533253261439, + "grad_norm": 1.02214777469635, + "learning_rate": 
9.950360777754716e-05, + "loss": 0.8625, + "step": 14060 + }, + { + "epoch": 0.0898892196823531, + "grad_norm": 0.6554903388023376, + "learning_rate": 9.950290224223604e-05, + "loss": 0.7558, + "step": 14070 + }, + { + "epoch": 0.08995310683209179, + "grad_norm": 0.9139891266822815, + "learning_rate": 9.950219620838786e-05, + "loss": 0.9843, + "step": 14080 + }, + { + "epoch": 0.0900169939818305, + "grad_norm": 0.6926449537277222, + "learning_rate": 9.950148967600974e-05, + "loss": 0.6626, + "step": 14090 + }, + { + "epoch": 0.0900808811315692, + "grad_norm": 1.608420968055725, + "learning_rate": 9.95007826451088e-05, + "loss": 1.0037, + "step": 14100 + }, + { + "epoch": 0.0901447682813079, + "grad_norm": 0.9414392113685608, + "learning_rate": 9.950007511569214e-05, + "loss": 0.9188, + "step": 14110 + }, + { + "epoch": 0.0902086554310466, + "grad_norm": 0.8587938547134399, + "learning_rate": 9.949936708776691e-05, + "loss": 0.9312, + "step": 14120 + }, + { + "epoch": 0.0902725425807853, + "grad_norm": 1.4284396171569824, + "learning_rate": 9.949865856134024e-05, + "loss": 1.1385, + "step": 14130 + }, + { + "epoch": 0.090336429730524, + "grad_norm": 0.7485639452934265, + "learning_rate": 9.949794953641925e-05, + "loss": 0.9224, + "step": 14140 + }, + { + "epoch": 0.0904003168802627, + "grad_norm": 0.7703597545623779, + "learning_rate": 9.949724001301108e-05, + "loss": 0.8031, + "step": 14150 + }, + { + "epoch": 0.09046420403000141, + "grad_norm": 0.6931461095809937, + "learning_rate": 9.949652999112289e-05, + "loss": 0.8585, + "step": 14160 + }, + { + "epoch": 0.0905280911797401, + "grad_norm": 0.9867964386940002, + "learning_rate": 9.94958194707618e-05, + "loss": 0.9662, + "step": 14170 + }, + { + "epoch": 0.09059197832947881, + "grad_norm": 0.7029063105583191, + "learning_rate": 9.949510845193501e-05, + "loss": 0.9446, + "step": 14180 + }, + { + "epoch": 0.09065586547921752, + "grad_norm": 0.6712666153907776, + "learning_rate": 9.949439693464965e-05, + "loss": 0.7581, + "step": 14190 + }, + { + "epoch": 0.09071975262895621, + "grad_norm": 0.8002526760101318, + "learning_rate": 9.94936849189129e-05, + "loss": 1.1783, + "step": 14200 + }, + { + "epoch": 0.09078363977869491, + "grad_norm": 1.9806957244873047, + "learning_rate": 9.949297240473192e-05, + "loss": 0.8167, + "step": 14210 + }, + { + "epoch": 0.09084752692843362, + "grad_norm": 0.6431198716163635, + "learning_rate": 9.949225939211391e-05, + "loss": 1.1454, + "step": 14220 + }, + { + "epoch": 0.09091141407817231, + "grad_norm": 1.083142638206482, + "learning_rate": 9.9491545881066e-05, + "loss": 1.2549, + "step": 14230 + }, + { + "epoch": 0.09097530122791102, + "grad_norm": 0.520418643951416, + "learning_rate": 9.949083187159542e-05, + "loss": 0.7501, + "step": 14240 + }, + { + "epoch": 0.09103918837764972, + "grad_norm": 1.0432296991348267, + "learning_rate": 9.949011736370935e-05, + "loss": 0.8595, + "step": 14250 + }, + { + "epoch": 0.09110307552738842, + "grad_norm": 0.8031591773033142, + "learning_rate": 9.948940235741499e-05, + "loss": 0.7955, + "step": 14260 + }, + { + "epoch": 0.09116696267712712, + "grad_norm": 0.7311345934867859, + "learning_rate": 9.948868685271952e-05, + "loss": 0.9517, + "step": 14270 + }, + { + "epoch": 0.09123084982686583, + "grad_norm": 1.3706258535385132, + "learning_rate": 9.948797084963016e-05, + "loss": 0.9347, + "step": 14280 + }, + { + "epoch": 0.09129473697660452, + "grad_norm": 0.5846802592277527, + "learning_rate": 9.948725434815413e-05, + "loss": 0.7575, + "step": 14290 + }, + { + "epoch": 
0.09135862412634323, + "grad_norm": 0.7384892702102661, + "learning_rate": 9.948653734829863e-05, + "loss": 0.9603, + "step": 14300 + }, + { + "epoch": 0.09142251127608193, + "grad_norm": 2.91487717628479, + "learning_rate": 9.948581985007089e-05, + "loss": 1.0739, + "step": 14310 + }, + { + "epoch": 0.09148639842582063, + "grad_norm": 0.6311538815498352, + "learning_rate": 9.948510185347813e-05, + "loss": 1.0676, + "step": 14320 + }, + { + "epoch": 0.09155028557555933, + "grad_norm": 0.6362346410751343, + "learning_rate": 9.948438335852759e-05, + "loss": 1.1728, + "step": 14330 + }, + { + "epoch": 0.09161417272529804, + "grad_norm": 0.6874721646308899, + "learning_rate": 9.94836643652265e-05, + "loss": 1.2022, + "step": 14340 + }, + { + "epoch": 0.09167805987503673, + "grad_norm": 0.721106231212616, + "learning_rate": 9.948294487358208e-05, + "loss": 1.097, + "step": 14350 + }, + { + "epoch": 0.09174194702477544, + "grad_norm": 1.0813249349594116, + "learning_rate": 9.948222488360162e-05, + "loss": 1.2724, + "step": 14360 + }, + { + "epoch": 0.09180583417451414, + "grad_norm": 0.8952019810676575, + "learning_rate": 9.948150439529233e-05, + "loss": 0.8907, + "step": 14370 + }, + { + "epoch": 0.09186972132425283, + "grad_norm": 0.8344172835350037, + "learning_rate": 9.94807834086615e-05, + "loss": 0.8321, + "step": 14380 + }, + { + "epoch": 0.09193360847399154, + "grad_norm": 0.9786416888237, + "learning_rate": 9.948006192371635e-05, + "loss": 0.7653, + "step": 14390 + }, + { + "epoch": 0.09199749562373025, + "grad_norm": 1.2197997570037842, + "learning_rate": 9.947933994046419e-05, + "loss": 0.9922, + "step": 14400 + }, + { + "epoch": 0.09206138277346894, + "grad_norm": 0.915473222732544, + "learning_rate": 9.947861745891227e-05, + "loss": 0.921, + "step": 14410 + }, + { + "epoch": 0.09212526992320764, + "grad_norm": 0.9322916865348816, + "learning_rate": 9.947789447906785e-05, + "loss": 1.0827, + "step": 14420 + }, + { + "epoch": 0.09218915707294635, + "grad_norm": 1.073462963104248, + "learning_rate": 9.947717100093825e-05, + "loss": 0.9149, + "step": 14430 + }, + { + "epoch": 0.09225304422268504, + "grad_norm": 1.7424027919769287, + "learning_rate": 9.947644702453072e-05, + "loss": 0.9262, + "step": 14440 + }, + { + "epoch": 0.09231693137242375, + "grad_norm": 0.5602442026138306, + "learning_rate": 9.947572254985258e-05, + "loss": 0.8065, + "step": 14450 + }, + { + "epoch": 0.09238081852216246, + "grad_norm": 0.7667688727378845, + "learning_rate": 9.94749975769111e-05, + "loss": 0.9672, + "step": 14460 + }, + { + "epoch": 0.09244470567190115, + "grad_norm": 0.8217202425003052, + "learning_rate": 9.947427210571359e-05, + "loss": 0.8194, + "step": 14470 + }, + { + "epoch": 0.09250859282163985, + "grad_norm": 0.7690846920013428, + "learning_rate": 9.947354613626737e-05, + "loss": 0.6602, + "step": 14480 + }, + { + "epoch": 0.09257247997137856, + "grad_norm": 0.7123977541923523, + "learning_rate": 9.947281966857973e-05, + "loss": 0.9875, + "step": 14490 + }, + { + "epoch": 0.09263636712111725, + "grad_norm": 1.3590373992919922, + "learning_rate": 9.947209270265801e-05, + "loss": 1.0355, + "step": 14500 + }, + { + "epoch": 0.09270025427085596, + "grad_norm": 2.0925168991088867, + "learning_rate": 9.947136523850949e-05, + "loss": 0.9441, + "step": 14510 + }, + { + "epoch": 0.09276414142059466, + "grad_norm": 0.7630490064620972, + "learning_rate": 9.947063727614155e-05, + "loss": 0.7035, + "step": 14520 + }, + { + "epoch": 0.09282802857033336, + "grad_norm": 0.5995486378669739, + 
"learning_rate": 9.946990881556148e-05, + "loss": 0.8794, + "step": 14530 + }, + { + "epoch": 0.09289191572007206, + "grad_norm": 0.5936999917030334, + "learning_rate": 9.946917985677664e-05, + "loss": 0.8, + "step": 14540 + }, + { + "epoch": 0.09295580286981077, + "grad_norm": 2.0189425945281982, + "learning_rate": 9.946845039979436e-05, + "loss": 0.9379, + "step": 14550 + }, + { + "epoch": 0.09301969001954946, + "grad_norm": 0.9083710312843323, + "learning_rate": 9.946772044462197e-05, + "loss": 1.1928, + "step": 14560 + }, + { + "epoch": 0.09308357716928817, + "grad_norm": 0.7872990965843201, + "learning_rate": 9.946698999126686e-05, + "loss": 0.9303, + "step": 14570 + }, + { + "epoch": 0.09314746431902687, + "grad_norm": 0.9097589254379272, + "learning_rate": 9.946625903973636e-05, + "loss": 0.8706, + "step": 14580 + }, + { + "epoch": 0.09321135146876557, + "grad_norm": 1.2268530130386353, + "learning_rate": 9.946552759003783e-05, + "loss": 0.7452, + "step": 14590 + }, + { + "epoch": 0.09327523861850427, + "grad_norm": 0.7525649070739746, + "learning_rate": 9.946479564217866e-05, + "loss": 1.1206, + "step": 14600 + }, + { + "epoch": 0.09333912576824298, + "grad_norm": 0.9777686595916748, + "learning_rate": 9.946406319616619e-05, + "loss": 0.9522, + "step": 14610 + }, + { + "epoch": 0.09340301291798167, + "grad_norm": 0.7327966690063477, + "learning_rate": 9.946333025200781e-05, + "loss": 0.7119, + "step": 14620 + }, + { + "epoch": 0.09346690006772038, + "grad_norm": 0.8345320820808411, + "learning_rate": 9.946259680971091e-05, + "loss": 0.9164, + "step": 14630 + }, + { + "epoch": 0.09353078721745908, + "grad_norm": 1.128624439239502, + "learning_rate": 9.946186286928288e-05, + "loss": 0.8583, + "step": 14640 + }, + { + "epoch": 0.09359467436719779, + "grad_norm": 0.753193199634552, + "learning_rate": 9.946112843073107e-05, + "loss": 1.0453, + "step": 14650 + }, + { + "epoch": 0.09365856151693648, + "grad_norm": 0.9466274380683899, + "learning_rate": 9.946039349406294e-05, + "loss": 1.1494, + "step": 14660 + }, + { + "epoch": 0.09372244866667519, + "grad_norm": 0.8753125667572021, + "learning_rate": 9.945965805928583e-05, + "loss": 0.7926, + "step": 14670 + }, + { + "epoch": 0.0937863358164139, + "grad_norm": 0.7783929109573364, + "learning_rate": 9.94589221264072e-05, + "loss": 1.0401, + "step": 14680 + }, + { + "epoch": 0.09385022296615259, + "grad_norm": 1.0802748203277588, + "learning_rate": 9.945818569543441e-05, + "loss": 0.7928, + "step": 14690 + }, + { + "epoch": 0.09391411011589129, + "grad_norm": 0.8250336647033691, + "learning_rate": 9.945744876637491e-05, + "loss": 0.9204, + "step": 14700 + }, + { + "epoch": 0.09397799726563, + "grad_norm": 0.6883922815322876, + "learning_rate": 9.945671133923614e-05, + "loss": 0.8513, + "step": 14710 + }, + { + "epoch": 0.09404188441536869, + "grad_norm": 0.4683299958705902, + "learning_rate": 9.945597341402547e-05, + "loss": 0.6514, + "step": 14720 + }, + { + "epoch": 0.0941057715651074, + "grad_norm": 0.6585717797279358, + "learning_rate": 9.945523499075037e-05, + "loss": 0.9824, + "step": 14730 + }, + { + "epoch": 0.0941696587148461, + "grad_norm": 0.5519923567771912, + "learning_rate": 9.945449606941826e-05, + "loss": 1.007, + "step": 14740 + }, + { + "epoch": 0.0942335458645848, + "grad_norm": 0.6457942128181458, + "learning_rate": 9.945375665003661e-05, + "loss": 0.6664, + "step": 14750 + }, + { + "epoch": 0.0942974330143235, + "grad_norm": 0.906104326248169, + "learning_rate": 9.945301673261285e-05, + "loss": 0.8221, + "step": 14760 
+ }, + { + "epoch": 0.0943613201640622, + "grad_norm": 0.8347557187080383, + "learning_rate": 9.945227631715442e-05, + "loss": 0.8833, + "step": 14770 + }, + { + "epoch": 0.0944252073138009, + "grad_norm": 0.6181365847587585, + "learning_rate": 9.945153540366877e-05, + "loss": 1.0287, + "step": 14780 + }, + { + "epoch": 0.0944890944635396, + "grad_norm": 0.6475619077682495, + "learning_rate": 9.945079399216339e-05, + "loss": 0.8144, + "step": 14790 + }, + { + "epoch": 0.09455298161327831, + "grad_norm": 0.6462060809135437, + "learning_rate": 9.945005208264572e-05, + "loss": 0.8489, + "step": 14800 + }, + { + "epoch": 0.094616868763017, + "grad_norm": 0.6881303787231445, + "learning_rate": 9.944938393828552e-05, + "loss": 0.8207, + "step": 14810 + }, + { + "epoch": 0.09468075591275571, + "grad_norm": 0.6425718069076538, + "learning_rate": 9.944864108256513e-05, + "loss": 0.91, + "step": 14820 + }, + { + "epoch": 0.09474464306249442, + "grad_norm": 1.9519624710083008, + "learning_rate": 9.944789772885414e-05, + "loss": 0.8698, + "step": 14830 + }, + { + "epoch": 0.09480853021223311, + "grad_norm": 0.887140154838562, + "learning_rate": 9.944715387716004e-05, + "loss": 0.909, + "step": 14840 + }, + { + "epoch": 0.09487241736197181, + "grad_norm": 0.7273536920547485, + "learning_rate": 9.944640952749033e-05, + "loss": 1.1605, + "step": 14850 + }, + { + "epoch": 0.09493630451171052, + "grad_norm": 0.928715169429779, + "learning_rate": 9.944566467985249e-05, + "loss": 1.0493, + "step": 14860 + }, + { + "epoch": 0.09500019166144921, + "grad_norm": 0.5552724003791809, + "learning_rate": 9.944491933425403e-05, + "loss": 1.1027, + "step": 14870 + }, + { + "epoch": 0.09506407881118792, + "grad_norm": 0.8260436058044434, + "learning_rate": 9.944417349070247e-05, + "loss": 0.7093, + "step": 14880 + }, + { + "epoch": 0.09512796596092662, + "grad_norm": 2.4791147708892822, + "learning_rate": 9.944342714920529e-05, + "loss": 1.1502, + "step": 14890 + }, + { + "epoch": 0.09519185311066532, + "grad_norm": 0.8212199211120605, + "learning_rate": 9.944268030977003e-05, + "loss": 1.0912, + "step": 14900 + }, + { + "epoch": 0.09525574026040402, + "grad_norm": 0.8238768577575684, + "learning_rate": 9.94419329724042e-05, + "loss": 0.8248, + "step": 14910 + }, + { + "epoch": 0.09531962741014273, + "grad_norm": 1.0283452272415161, + "learning_rate": 9.944118513711535e-05, + "loss": 1.0666, + "step": 14920 + }, + { + "epoch": 0.09538351455988142, + "grad_norm": 0.7515852451324463, + "learning_rate": 9.944043680391098e-05, + "loss": 0.798, + "step": 14930 + }, + { + "epoch": 0.09544740170962013, + "grad_norm": 0.8797821998596191, + "learning_rate": 9.943968797279864e-05, + "loss": 0.8629, + "step": 14940 + }, + { + "epoch": 0.09551128885935883, + "grad_norm": 0.8942396640777588, + "learning_rate": 9.943893864378587e-05, + "loss": 0.8589, + "step": 14950 + }, + { + "epoch": 0.09557517600909753, + "grad_norm": 0.7868557572364807, + "learning_rate": 9.943818881688023e-05, + "loss": 0.7879, + "step": 14960 + }, + { + "epoch": 0.09563906315883623, + "grad_norm": 0.766189694404602, + "learning_rate": 9.943743849208924e-05, + "loss": 1.0051, + "step": 14970 + }, + { + "epoch": 0.09570295030857494, + "grad_norm": 0.7284533381462097, + "learning_rate": 9.943668766942049e-05, + "loss": 0.6991, + "step": 14980 + }, + { + "epoch": 0.09576683745831363, + "grad_norm": 1.0945543050765991, + "learning_rate": 9.943593634888151e-05, + "loss": 0.8595, + "step": 14990 + }, + { + "epoch": 0.09583072460805234, + "grad_norm": 
1.704253077507019, + "learning_rate": 9.943518453047988e-05, + "loss": 1.0841, + "step": 15000 + }, + { + "epoch": 0.09589461175779104, + "grad_norm": 0.537315309047699, + "learning_rate": 9.943443221422319e-05, + "loss": 1.0965, + "step": 15010 + }, + { + "epoch": 0.09595849890752974, + "grad_norm": 1.1799222230911255, + "learning_rate": 9.9433679400119e-05, + "loss": 0.9025, + "step": 15020 + }, + { + "epoch": 0.09602238605726844, + "grad_norm": 3.8464369773864746, + "learning_rate": 9.943292608817489e-05, + "loss": 0.8995, + "step": 15030 + }, + { + "epoch": 0.09608627320700715, + "grad_norm": 1.1854133605957031, + "learning_rate": 9.943217227839845e-05, + "loss": 1.2093, + "step": 15040 + }, + { + "epoch": 0.09615016035674584, + "grad_norm": 1.119036078453064, + "learning_rate": 9.943141797079727e-05, + "loss": 0.6415, + "step": 15050 + }, + { + "epoch": 0.09621404750648455, + "grad_norm": 0.9091972708702087, + "learning_rate": 9.943066316537895e-05, + "loss": 0.7339, + "step": 15060 + }, + { + "epoch": 0.09627793465622325, + "grad_norm": 2.1518936157226562, + "learning_rate": 9.942990786215107e-05, + "loss": 0.7829, + "step": 15070 + }, + { + "epoch": 0.09634182180596194, + "grad_norm": 0.8024427890777588, + "learning_rate": 9.942915206112126e-05, + "loss": 0.9612, + "step": 15080 + }, + { + "epoch": 0.09640570895570065, + "grad_norm": 0.90773606300354, + "learning_rate": 9.942839576229714e-05, + "loss": 1.0113, + "step": 15090 + }, + { + "epoch": 0.09646959610543936, + "grad_norm": 1.2031515836715698, + "learning_rate": 9.942763896568632e-05, + "loss": 0.929, + "step": 15100 + }, + { + "epoch": 0.09653348325517805, + "grad_norm": 1.1134458780288696, + "learning_rate": 9.942688167129639e-05, + "loss": 1.0391, + "step": 15110 + }, + { + "epoch": 0.09659737040491675, + "grad_norm": 1.0063025951385498, + "learning_rate": 9.942612387913501e-05, + "loss": 0.8559, + "step": 15120 + }, + { + "epoch": 0.09666125755465546, + "grad_norm": 0.737177848815918, + "learning_rate": 9.94253655892098e-05, + "loss": 1.0731, + "step": 15130 + }, + { + "epoch": 0.09672514470439415, + "grad_norm": 0.8199975490570068, + "learning_rate": 9.942460680152842e-05, + "loss": 0.8919, + "step": 15140 + }, + { + "epoch": 0.09678903185413286, + "grad_norm": 0.9995172023773193, + "learning_rate": 9.942384751609848e-05, + "loss": 0.9533, + "step": 15150 + }, + { + "epoch": 0.09685291900387157, + "grad_norm": 1.6807196140289307, + "learning_rate": 9.942308773292764e-05, + "loss": 1.2186, + "step": 15160 + }, + { + "epoch": 0.09691680615361026, + "grad_norm": 0.6781327724456787, + "learning_rate": 9.942232745202353e-05, + "loss": 0.9126, + "step": 15170 + }, + { + "epoch": 0.09698069330334896, + "grad_norm": 0.8096178770065308, + "learning_rate": 9.942156667339385e-05, + "loss": 0.8445, + "step": 15180 + }, + { + "epoch": 0.09704458045308767, + "grad_norm": 0.4493632912635803, + "learning_rate": 9.942080539704621e-05, + "loss": 0.9263, + "step": 15190 + }, + { + "epoch": 0.09710846760282636, + "grad_norm": 1.0077593326568604, + "learning_rate": 9.942004362298834e-05, + "loss": 0.8551, + "step": 15200 + }, + { + "epoch": 0.09717235475256507, + "grad_norm": 0.7614121437072754, + "learning_rate": 9.941928135122784e-05, + "loss": 0.9088, + "step": 15210 + }, + { + "epoch": 0.09723624190230377, + "grad_norm": 1.770782470703125, + "learning_rate": 9.941851858177244e-05, + "loss": 0.8671, + "step": 15220 + }, + { + "epoch": 0.09730012905204247, + "grad_norm": 0.8057569861412048, + "learning_rate": 9.941775531462982e-05, + 
"loss": 0.8172, + "step": 15230 + }, + { + "epoch": 0.09736401620178117, + "grad_norm": 0.6936876177787781, + "learning_rate": 9.941699154980763e-05, + "loss": 0.8575, + "step": 15240 + }, + { + "epoch": 0.09742790335151988, + "grad_norm": 0.6702722311019897, + "learning_rate": 9.941622728731359e-05, + "loss": 1.004, + "step": 15250 + }, + { + "epoch": 0.09749179050125857, + "grad_norm": 1.0262168645858765, + "learning_rate": 9.94154625271554e-05, + "loss": 0.9267, + "step": 15260 + }, + { + "epoch": 0.09755567765099728, + "grad_norm": 1.287480115890503, + "learning_rate": 9.941469726934074e-05, + "loss": 0.8412, + "step": 15270 + }, + { + "epoch": 0.09761956480073598, + "grad_norm": 1.0471506118774414, + "learning_rate": 9.941393151387734e-05, + "loss": 0.9556, + "step": 15280 + }, + { + "epoch": 0.09768345195047468, + "grad_norm": 0.948810875415802, + "learning_rate": 9.941316526077289e-05, + "loss": 1.0511, + "step": 15290 + }, + { + "epoch": 0.09774733910021338, + "grad_norm": 0.6042103171348572, + "learning_rate": 9.941239851003511e-05, + "loss": 0.858, + "step": 15300 + }, + { + "epoch": 0.09781122624995209, + "grad_norm": 0.7108423113822937, + "learning_rate": 9.941163126167175e-05, + "loss": 1.0698, + "step": 15310 + }, + { + "epoch": 0.09787511339969078, + "grad_norm": 0.8583425283432007, + "learning_rate": 9.94108635156905e-05, + "loss": 0.9262, + "step": 15320 + }, + { + "epoch": 0.09793900054942949, + "grad_norm": 1.3478715419769287, + "learning_rate": 9.941009527209911e-05, + "loss": 0.8279, + "step": 15330 + }, + { + "epoch": 0.09800288769916819, + "grad_norm": 0.7297415137290955, + "learning_rate": 9.940932653090532e-05, + "loss": 0.7739, + "step": 15340 + }, + { + "epoch": 0.09806677484890688, + "grad_norm": 0.6165359616279602, + "learning_rate": 9.940855729211687e-05, + "loss": 0.9152, + "step": 15350 + }, + { + "epoch": 0.09813066199864559, + "grad_norm": 0.6644479632377625, + "learning_rate": 9.940778755574149e-05, + "loss": 0.8523, + "step": 15360 + }, + { + "epoch": 0.0981945491483843, + "grad_norm": 0.9046561121940613, + "learning_rate": 9.940701732178695e-05, + "loss": 0.9418, + "step": 15370 + }, + { + "epoch": 0.09825843629812299, + "grad_norm": 0.6211059093475342, + "learning_rate": 9.9406246590261e-05, + "loss": 0.7587, + "step": 15380 + }, + { + "epoch": 0.0983223234478617, + "grad_norm": 1.164886236190796, + "learning_rate": 9.940547536117142e-05, + "loss": 1.2, + "step": 15390 + }, + { + "epoch": 0.0983862105976004, + "grad_norm": 0.9881723523139954, + "learning_rate": 9.940470363452596e-05, + "loss": 0.7596, + "step": 15400 + }, + { + "epoch": 0.0984500977473391, + "grad_norm": 6.108283042907715, + "learning_rate": 9.940393141033238e-05, + "loss": 1.05, + "step": 15410 + }, + { + "epoch": 0.0985139848970778, + "grad_norm": 0.5831863880157471, + "learning_rate": 9.940315868859847e-05, + "loss": 1.2292, + "step": 15420 + }, + { + "epoch": 0.0985778720468165, + "grad_norm": 1.4908435344696045, + "learning_rate": 9.940238546933203e-05, + "loss": 0.99, + "step": 15430 + }, + { + "epoch": 0.0986417591965552, + "grad_norm": 0.80536949634552, + "learning_rate": 9.940161175254082e-05, + "loss": 0.9417, + "step": 15440 + }, + { + "epoch": 0.0987056463462939, + "grad_norm": 0.6706516146659851, + "learning_rate": 9.940083753823263e-05, + "loss": 1.249, + "step": 15450 + }, + { + "epoch": 0.09876953349603261, + "grad_norm": 0.6131950616836548, + "learning_rate": 9.940006282641527e-05, + "loss": 0.7975, + "step": 15460 + }, + { + "epoch": 0.0988334206457713, + "grad_norm": 
0.9210124611854553, + "learning_rate": 9.939928761709655e-05, + "loss": 0.7322, + "step": 15470 + }, + { + "epoch": 0.09889730779551001, + "grad_norm": 0.8976283669471741, + "learning_rate": 9.939851191028426e-05, + "loss": 0.9391, + "step": 15480 + }, + { + "epoch": 0.09896119494524871, + "grad_norm": 0.7244909405708313, + "learning_rate": 9.939773570598623e-05, + "loss": 0.7818, + "step": 15490 + }, + { + "epoch": 0.09902508209498742, + "grad_norm": 1.1001940965652466, + "learning_rate": 9.939695900421024e-05, + "loss": 0.9527, + "step": 15500 + }, + { + "epoch": 0.09908896924472611, + "grad_norm": 0.7406299114227295, + "learning_rate": 9.939618180496417e-05, + "loss": 0.8922, + "step": 15510 + }, + { + "epoch": 0.09915285639446482, + "grad_norm": 1.2300517559051514, + "learning_rate": 9.93954041082558e-05, + "loss": 1.2021, + "step": 15520 + }, + { + "epoch": 0.09921674354420353, + "grad_norm": 0.9667423963546753, + "learning_rate": 9.9394625914093e-05, + "loss": 1.1348, + "step": 15530 + }, + { + "epoch": 0.09928063069394222, + "grad_norm": 0.8901247382164001, + "learning_rate": 9.939384722248355e-05, + "loss": 1.2461, + "step": 15540 + }, + { + "epoch": 0.09934451784368092, + "grad_norm": 0.8347676992416382, + "learning_rate": 9.939306803343533e-05, + "loss": 0.7845, + "step": 15550 + }, + { + "epoch": 0.09940840499341963, + "grad_norm": 0.9552205801010132, + "learning_rate": 9.93922883469562e-05, + "loss": 0.6641, + "step": 15560 + }, + { + "epoch": 0.09947229214315832, + "grad_norm": 0.8416782021522522, + "learning_rate": 9.939150816305399e-05, + "loss": 0.9133, + "step": 15570 + }, + { + "epoch": 0.09953617929289703, + "grad_norm": 1.2031623125076294, + "learning_rate": 9.939072748173656e-05, + "loss": 0.7874, + "step": 15580 + }, + { + "epoch": 0.09960006644263573, + "grad_norm": 1.0405542850494385, + "learning_rate": 9.938994630301179e-05, + "loss": 1.0763, + "step": 15590 + }, + { + "epoch": 0.09966395359237443, + "grad_norm": 0.7080594301223755, + "learning_rate": 9.938916462688753e-05, + "loss": 1.1229, + "step": 15600 + }, + { + "epoch": 0.09972784074211313, + "grad_norm": 0.6351432204246521, + "learning_rate": 9.938838245337163e-05, + "loss": 0.8626, + "step": 15610 + }, + { + "epoch": 0.09979172789185184, + "grad_norm": 1.3848146200180054, + "learning_rate": 9.938759978247201e-05, + "loss": 0.8473, + "step": 15620 + }, + { + "epoch": 0.09985561504159053, + "grad_norm": 0.9175819754600525, + "learning_rate": 9.938681661419654e-05, + "loss": 0.8902, + "step": 15630 + }, + { + "epoch": 0.09991950219132924, + "grad_norm": 0.729713499546051, + "learning_rate": 9.938603294855309e-05, + "loss": 0.8599, + "step": 15640 + }, + { + "epoch": 0.09998338934106794, + "grad_norm": 0.8896664381027222, + "learning_rate": 9.938524878554956e-05, + "loss": 1.2631, + "step": 15650 + }, + { + "epoch": 0.10004727649080664, + "grad_norm": 1.1083167791366577, + "learning_rate": 9.938446412519387e-05, + "loss": 0.8752, + "step": 15660 + }, + { + "epoch": 0.10011116364054534, + "grad_norm": 0.9350288510322571, + "learning_rate": 9.938367896749388e-05, + "loss": 0.891, + "step": 15670 + }, + { + "epoch": 0.10017505079028405, + "grad_norm": 0.8491414785385132, + "learning_rate": 9.938289331245753e-05, + "loss": 0.9962, + "step": 15680 + }, + { + "epoch": 0.10023893794002274, + "grad_norm": 1.3653219938278198, + "learning_rate": 9.938210716009272e-05, + "loss": 0.8964, + "step": 15690 + }, + { + "epoch": 0.10030282508976145, + "grad_norm": 1.137112021446228, + "learning_rate": 9.938132051040736e-05, + 
"loss": 0.9982, + "step": 15700 + }, + { + "epoch": 0.10036671223950015, + "grad_norm": 0.8561280965805054, + "learning_rate": 9.93805333634094e-05, + "loss": 0.7421, + "step": 15710 + }, + { + "epoch": 0.10043059938923884, + "grad_norm": 1.1215713024139404, + "learning_rate": 9.937974571910674e-05, + "loss": 0.926, + "step": 15720 + }, + { + "epoch": 0.10049448653897755, + "grad_norm": 0.6843059659004211, + "learning_rate": 9.937895757750733e-05, + "loss": 0.934, + "step": 15730 + }, + { + "epoch": 0.10055837368871626, + "grad_norm": 0.8098707795143127, + "learning_rate": 9.937816893861909e-05, + "loss": 0.8128, + "step": 15740 + }, + { + "epoch": 0.10062226083845495, + "grad_norm": 0.8894488215446472, + "learning_rate": 9.937737980244997e-05, + "loss": 0.9038, + "step": 15750 + }, + { + "epoch": 0.10068614798819366, + "grad_norm": 1.0936787128448486, + "learning_rate": 9.937659016900791e-05, + "loss": 0.8245, + "step": 15760 + }, + { + "epoch": 0.10075003513793236, + "grad_norm": 1.0727956295013428, + "learning_rate": 9.937580003830088e-05, + "loss": 0.8693, + "step": 15770 + }, + { + "epoch": 0.10081392228767105, + "grad_norm": 1.4079822301864624, + "learning_rate": 9.937500941033682e-05, + "loss": 0.6185, + "step": 15780 + }, + { + "epoch": 0.10087780943740976, + "grad_norm": 1.4234700202941895, + "learning_rate": 9.937421828512371e-05, + "loss": 0.7397, + "step": 15790 + }, + { + "epoch": 0.10094169658714847, + "grad_norm": 0.8071795701980591, + "learning_rate": 9.937342666266951e-05, + "loss": 1.1495, + "step": 15800 + }, + { + "epoch": 0.10100558373688716, + "grad_norm": 2.0237574577331543, + "learning_rate": 9.937263454298217e-05, + "loss": 1.0899, + "step": 15810 + }, + { + "epoch": 0.10106947088662586, + "grad_norm": 0.6750722527503967, + "learning_rate": 9.93718419260697e-05, + "loss": 0.7826, + "step": 15820 + }, + { + "epoch": 0.10113335803636457, + "grad_norm": 0.9621725678443909, + "learning_rate": 9.937104881194008e-05, + "loss": 1.0047, + "step": 15830 + }, + { + "epoch": 0.10119724518610326, + "grad_norm": 0.7849874496459961, + "learning_rate": 9.937025520060127e-05, + "loss": 0.8557, + "step": 15840 + }, + { + "epoch": 0.10126113233584197, + "grad_norm": 0.6543164253234863, + "learning_rate": 9.936946109206129e-05, + "loss": 0.8556, + "step": 15850 + }, + { + "epoch": 0.10132501948558068, + "grad_norm": 0.8516491651535034, + "learning_rate": 9.936866648632811e-05, + "loss": 0.8175, + "step": 15860 + }, + { + "epoch": 0.10138890663531937, + "grad_norm": 0.7480735778808594, + "learning_rate": 9.936787138340976e-05, + "loss": 0.9132, + "step": 15870 + }, + { + "epoch": 0.10145279378505807, + "grad_norm": 0.7891073822975159, + "learning_rate": 9.936707578331423e-05, + "loss": 0.9786, + "step": 15880 + }, + { + "epoch": 0.10151668093479678, + "grad_norm": 0.6075239181518555, + "learning_rate": 9.936627968604955e-05, + "loss": 0.7032, + "step": 15890 + }, + { + "epoch": 0.10158056808453547, + "grad_norm": 2.8596110343933105, + "learning_rate": 9.93654830916237e-05, + "loss": 0.9138, + "step": 15900 + }, + { + "epoch": 0.10164445523427418, + "grad_norm": 1.1982015371322632, + "learning_rate": 9.936468600004477e-05, + "loss": 0.7194, + "step": 15910 + }, + { + "epoch": 0.10170834238401288, + "grad_norm": 0.6473510265350342, + "learning_rate": 9.936388841132071e-05, + "loss": 0.7809, + "step": 15920 + }, + { + "epoch": 0.10177222953375158, + "grad_norm": 1.089911937713623, + "learning_rate": 9.936309032545961e-05, + "loss": 0.846, + "step": 15930 + }, + { + "epoch": 
0.10183611668349028, + "grad_norm": 0.9146657586097717, + "learning_rate": 9.936229174246947e-05, + "loss": 0.8369, + "step": 15940 + }, + { + "epoch": 0.10190000383322899, + "grad_norm": 0.599389374256134, + "learning_rate": 9.936149266235835e-05, + "loss": 0.9457, + "step": 15950 + }, + { + "epoch": 0.10196389098296768, + "grad_norm": 0.5718626976013184, + "learning_rate": 9.93606930851343e-05, + "loss": 1.1508, + "step": 15960 + }, + { + "epoch": 0.10202777813270639, + "grad_norm": 0.5820611715316772, + "learning_rate": 9.935989301080535e-05, + "loss": 0.5636, + "step": 15970 + }, + { + "epoch": 0.1020916652824451, + "grad_norm": 0.9194528460502625, + "learning_rate": 9.935909243937959e-05, + "loss": 0.9002, + "step": 15980 + }, + { + "epoch": 0.10215555243218379, + "grad_norm": 1.094212293624878, + "learning_rate": 9.935829137086508e-05, + "loss": 1.2759, + "step": 15990 + }, + { + "epoch": 0.10221943958192249, + "grad_norm": 0.8695144653320312, + "learning_rate": 9.935748980526986e-05, + "loss": 0.6543, + "step": 16000 + }, + { + "epoch": 0.1022833267316612, + "grad_norm": 1.7058948278427124, + "learning_rate": 9.935668774260202e-05, + "loss": 0.8703, + "step": 16010 + }, + { + "epoch": 0.10234721388139989, + "grad_norm": 2.493241310119629, + "learning_rate": 9.935588518286963e-05, + "loss": 1.282, + "step": 16020 + }, + { + "epoch": 0.1024111010311386, + "grad_norm": 0.7929388284683228, + "learning_rate": 9.935508212608078e-05, + "loss": 0.6585, + "step": 16030 + }, + { + "epoch": 0.1024749881808773, + "grad_norm": 0.8106563091278076, + "learning_rate": 9.935427857224356e-05, + "loss": 0.9955, + "step": 16040 + }, + { + "epoch": 0.102538875330616, + "grad_norm": 2.413360357284546, + "learning_rate": 9.935347452136606e-05, + "loss": 1.0771, + "step": 16050 + }, + { + "epoch": 0.1026027624803547, + "grad_norm": 0.7023759484291077, + "learning_rate": 9.935266997345636e-05, + "loss": 1.0192, + "step": 16060 + }, + { + "epoch": 0.1026666496300934, + "grad_norm": 1.3818843364715576, + "learning_rate": 9.935186492852258e-05, + "loss": 1.1104, + "step": 16070 + }, + { + "epoch": 0.1027305367798321, + "grad_norm": 1.1903809309005737, + "learning_rate": 9.935105938657283e-05, + "loss": 0.8756, + "step": 16080 + }, + { + "epoch": 0.1027944239295708, + "grad_norm": 0.610237181186676, + "learning_rate": 9.935025334761523e-05, + "loss": 0.9345, + "step": 16090 + }, + { + "epoch": 0.10285831107930951, + "grad_norm": 0.8631981015205383, + "learning_rate": 9.934944681165786e-05, + "loss": 1.0976, + "step": 16100 + }, + { + "epoch": 0.1029221982290482, + "grad_norm": 0.5845250487327576, + "learning_rate": 9.934863977870889e-05, + "loss": 0.8406, + "step": 16110 + }, + { + "epoch": 0.10298608537878691, + "grad_norm": 0.5269205570220947, + "learning_rate": 9.93478322487764e-05, + "loss": 0.9238, + "step": 16120 + }, + { + "epoch": 0.10304997252852562, + "grad_norm": 0.6796483397483826, + "learning_rate": 9.934702422186857e-05, + "loss": 0.9912, + "step": 16130 + }, + { + "epoch": 0.10311385967826431, + "grad_norm": 0.9061000347137451, + "learning_rate": 9.93462156979935e-05, + "loss": 1.0622, + "step": 16140 + }, + { + "epoch": 0.10317774682800301, + "grad_norm": 0.5684584379196167, + "learning_rate": 9.934540667715936e-05, + "loss": 0.8797, + "step": 16150 + }, + { + "epoch": 0.10324163397774172, + "grad_norm": 0.8343471884727478, + "learning_rate": 9.934459715937428e-05, + "loss": 0.8628, + "step": 16160 + }, + { + "epoch": 0.10330552112748041, + "grad_norm": 0.9811477065086365, + "learning_rate": 
9.934378714464642e-05, + "loss": 1.1671, + "step": 16170 + }, + { + "epoch": 0.10336940827721912, + "grad_norm": 0.9283135533332825, + "learning_rate": 9.934297663298393e-05, + "loss": 0.7027, + "step": 16180 + }, + { + "epoch": 0.10343329542695782, + "grad_norm": 0.7332042455673218, + "learning_rate": 9.934216562439498e-05, + "loss": 0.8026, + "step": 16190 + }, + { + "epoch": 0.10349718257669652, + "grad_norm": 3.353732109069824, + "learning_rate": 9.934135411888773e-05, + "loss": 1.1843, + "step": 16200 + }, + { + "epoch": 0.10356106972643522, + "grad_norm": 1.056642770767212, + "learning_rate": 9.934054211647036e-05, + "loss": 0.8445, + "step": 16210 + }, + { + "epoch": 0.10362495687617393, + "grad_norm": 0.6340813636779785, + "learning_rate": 9.933972961715104e-05, + "loss": 1.0407, + "step": 16220 + }, + { + "epoch": 0.10368884402591262, + "grad_norm": 0.823939859867096, + "learning_rate": 9.933891662093797e-05, + "loss": 0.9409, + "step": 16230 + }, + { + "epoch": 0.10375273117565133, + "grad_norm": 1.3675154447555542, + "learning_rate": 9.933810312783932e-05, + "loss": 0.7627, + "step": 16240 + }, + { + "epoch": 0.10381661832539003, + "grad_norm": 2.952162742614746, + "learning_rate": 9.933728913786328e-05, + "loss": 0.7343, + "step": 16250 + }, + { + "epoch": 0.10388050547512873, + "grad_norm": 0.5602843165397644, + "learning_rate": 9.933647465101807e-05, + "loss": 0.9949, + "step": 16260 + }, + { + "epoch": 0.10394439262486743, + "grad_norm": 1.035836935043335, + "learning_rate": 9.933565966731187e-05, + "loss": 0.733, + "step": 16270 + }, + { + "epoch": 0.10400827977460614, + "grad_norm": 0.599962055683136, + "learning_rate": 9.93348441867529e-05, + "loss": 0.8972, + "step": 16280 + }, + { + "epoch": 0.10407216692434483, + "grad_norm": 1.3323990106582642, + "learning_rate": 9.933402820934936e-05, + "loss": 0.8854, + "step": 16290 + }, + { + "epoch": 0.10413605407408354, + "grad_norm": 1.9497777223587036, + "learning_rate": 9.933321173510949e-05, + "loss": 0.9189, + "step": 16300 + }, + { + "epoch": 0.10419994122382224, + "grad_norm": 1.2453469038009644, + "learning_rate": 9.933239476404149e-05, + "loss": 0.9895, + "step": 16310 + }, + { + "epoch": 0.10426382837356094, + "grad_norm": 1.1778478622436523, + "learning_rate": 9.933157729615359e-05, + "loss": 0.9034, + "step": 16320 + }, + { + "epoch": 0.10432771552329964, + "grad_norm": 0.7370180487632751, + "learning_rate": 9.933075933145404e-05, + "loss": 0.9827, + "step": 16330 + }, + { + "epoch": 0.10439160267303835, + "grad_norm": 0.992669403553009, + "learning_rate": 9.932994086995107e-05, + "loss": 0.696, + "step": 16340 + }, + { + "epoch": 0.10445548982277705, + "grad_norm": 0.8469734191894531, + "learning_rate": 9.93291219116529e-05, + "loss": 0.7605, + "step": 16350 + }, + { + "epoch": 0.10451937697251575, + "grad_norm": 1.4844669103622437, + "learning_rate": 9.932830245656782e-05, + "loss": 0.8848, + "step": 16360 + }, + { + "epoch": 0.10458326412225445, + "grad_norm": 0.7089157104492188, + "learning_rate": 9.932748250470403e-05, + "loss": 0.8722, + "step": 16370 + }, + { + "epoch": 0.10464715127199316, + "grad_norm": 0.6361833214759827, + "learning_rate": 9.932666205606984e-05, + "loss": 1.0907, + "step": 16380 + }, + { + "epoch": 0.10471103842173185, + "grad_norm": 1.060922384262085, + "learning_rate": 9.932584111067348e-05, + "loss": 0.9377, + "step": 16390 + }, + { + "epoch": 0.10477492557147056, + "grad_norm": 1.2127258777618408, + "learning_rate": 9.932501966852323e-05, + "loss": 1.1433, + "step": 16400 + }, + { + 
"epoch": 0.10483881272120926, + "grad_norm": 0.6231849193572998, + "learning_rate": 9.932419772962735e-05, + "loss": 0.925, + "step": 16410 + }, + { + "epoch": 0.10490269987094795, + "grad_norm": 0.5481915473937988, + "learning_rate": 9.932337529399415e-05, + "loss": 0.8031, + "step": 16420 + }, + { + "epoch": 0.10496658702068666, + "grad_norm": 0.5232637524604797, + "learning_rate": 9.932255236163187e-05, + "loss": 0.8512, + "step": 16430 + }, + { + "epoch": 0.10503047417042537, + "grad_norm": 0.6596049666404724, + "learning_rate": 9.932172893254884e-05, + "loss": 0.7366, + "step": 16440 + }, + { + "epoch": 0.10509436132016406, + "grad_norm": 0.826575517654419, + "learning_rate": 9.932090500675331e-05, + "loss": 0.7942, + "step": 16450 + }, + { + "epoch": 0.10515824846990277, + "grad_norm": 0.6646784543991089, + "learning_rate": 9.932008058425359e-05, + "loss": 1.1065, + "step": 16460 + }, + { + "epoch": 0.10522213561964147, + "grad_norm": 0.6288832426071167, + "learning_rate": 9.931925566505802e-05, + "loss": 0.9242, + "step": 16470 + }, + { + "epoch": 0.10528602276938016, + "grad_norm": 0.7876302003860474, + "learning_rate": 9.931843024917484e-05, + "loss": 0.9227, + "step": 16480 + }, + { + "epoch": 0.10534990991911887, + "grad_norm": 0.6333622336387634, + "learning_rate": 9.931760433661244e-05, + "loss": 0.9783, + "step": 16490 + }, + { + "epoch": 0.10541379706885758, + "grad_norm": 1.2118867635726929, + "learning_rate": 9.931677792737907e-05, + "loss": 0.727, + "step": 16500 + }, + { + "epoch": 0.10547768421859627, + "grad_norm": 0.8063325881958008, + "learning_rate": 9.931595102148309e-05, + "loss": 1.2654, + "step": 16510 + }, + { + "epoch": 0.10554157136833497, + "grad_norm": 0.5137673020362854, + "learning_rate": 9.931512361893283e-05, + "loss": 0.7905, + "step": 16520 + }, + { + "epoch": 0.10560545851807368, + "grad_norm": 1.0696414709091187, + "learning_rate": 9.93142957197366e-05, + "loss": 1.0821, + "step": 16530 + }, + { + "epoch": 0.10566934566781237, + "grad_norm": 1.1155736446380615, + "learning_rate": 9.931346732390274e-05, + "loss": 0.7375, + "step": 16540 + }, + { + "epoch": 0.10573323281755108, + "grad_norm": 0.784761369228363, + "learning_rate": 9.931263843143962e-05, + "loss": 0.7859, + "step": 16550 + }, + { + "epoch": 0.10579711996728978, + "grad_norm": 0.9071635007858276, + "learning_rate": 9.931180904235557e-05, + "loss": 1.0189, + "step": 16560 + }, + { + "epoch": 0.10586100711702848, + "grad_norm": 0.6615142822265625, + "learning_rate": 9.931097915665892e-05, + "loss": 0.9826, + "step": 16570 + }, + { + "epoch": 0.10592489426676718, + "grad_norm": 1.0913355350494385, + "learning_rate": 9.931014877435806e-05, + "loss": 1.2501, + "step": 16580 + }, + { + "epoch": 0.10598878141650589, + "grad_norm": 0.7185521125793457, + "learning_rate": 9.930931789546136e-05, + "loss": 0.9584, + "step": 16590 + }, + { + "epoch": 0.10605266856624458, + "grad_norm": 0.9962629079818726, + "learning_rate": 9.930848651997716e-05, + "loss": 1.2084, + "step": 16600 + }, + { + "epoch": 0.10611655571598329, + "grad_norm": 0.5388261079788208, + "learning_rate": 9.930765464791383e-05, + "loss": 0.7474, + "step": 16610 + }, + { + "epoch": 0.106180442865722, + "grad_norm": 0.963033139705658, + "learning_rate": 9.930682227927978e-05, + "loss": 0.8856, + "step": 16620 + }, + { + "epoch": 0.10624433001546069, + "grad_norm": 0.8740180730819702, + "learning_rate": 9.930598941408335e-05, + "loss": 0.9665, + "step": 16630 + }, + { + "epoch": 0.10630821716519939, + "grad_norm": 0.7706631422042847, + 
"learning_rate": 9.930515605233297e-05, + "loss": 0.9538, + "step": 16640 + }, + { + "epoch": 0.1063721043149381, + "grad_norm": 1.0172282457351685, + "learning_rate": 9.930432219403702e-05, + "loss": 0.9451, + "step": 16650 + }, + { + "epoch": 0.10643599146467679, + "grad_norm": 1.1416665315628052, + "learning_rate": 9.930348783920387e-05, + "loss": 0.812, + "step": 16660 + }, + { + "epoch": 0.1064998786144155, + "grad_norm": 1.248719573020935, + "learning_rate": 9.930265298784196e-05, + "loss": 1.0079, + "step": 16670 + }, + { + "epoch": 0.1065637657641542, + "grad_norm": 0.8804942965507507, + "learning_rate": 9.930181763995968e-05, + "loss": 1.0038, + "step": 16680 + }, + { + "epoch": 0.1066276529138929, + "grad_norm": 0.9898728132247925, + "learning_rate": 9.930098179556543e-05, + "loss": 0.9694, + "step": 16690 + }, + { + "epoch": 0.1066915400636316, + "grad_norm": 1.1314060688018799, + "learning_rate": 9.930014545466765e-05, + "loss": 1.0318, + "step": 16700 + }, + { + "epoch": 0.10675542721337031, + "grad_norm": 1.0899930000305176, + "learning_rate": 9.929930861727476e-05, + "loss": 1.1298, + "step": 16710 + }, + { + "epoch": 0.106819314363109, + "grad_norm": 1.2332922220230103, + "learning_rate": 9.929847128339517e-05, + "loss": 0.9744, + "step": 16720 + }, + { + "epoch": 0.1068832015128477, + "grad_norm": 1.1803171634674072, + "learning_rate": 9.929763345303733e-05, + "loss": 0.9733, + "step": 16730 + }, + { + "epoch": 0.10694708866258641, + "grad_norm": 0.8435320854187012, + "learning_rate": 9.929679512620969e-05, + "loss": 0.8418, + "step": 16740 + }, + { + "epoch": 0.1070109758123251, + "grad_norm": 0.68702632188797, + "learning_rate": 9.929595630292066e-05, + "loss": 1.0078, + "step": 16750 + }, + { + "epoch": 0.10707486296206381, + "grad_norm": 0.8807457089424133, + "learning_rate": 9.92951169831787e-05, + "loss": 0.8933, + "step": 16760 + }, + { + "epoch": 0.10713875011180252, + "grad_norm": 0.922346293926239, + "learning_rate": 9.929427716699227e-05, + "loss": 0.765, + "step": 16770 + }, + { + "epoch": 0.10720263726154121, + "grad_norm": 0.6668721437454224, + "learning_rate": 9.929343685436982e-05, + "loss": 0.7723, + "step": 16780 + }, + { + "epoch": 0.10726652441127991, + "grad_norm": 1.0509366989135742, + "learning_rate": 9.929259604531981e-05, + "loss": 0.9128, + "step": 16790 + }, + { + "epoch": 0.10733041156101862, + "grad_norm": 0.9233303070068359, + "learning_rate": 9.929175473985073e-05, + "loss": 0.8772, + "step": 16800 + }, + { + "epoch": 0.10739429871075731, + "grad_norm": 0.5858426094055176, + "learning_rate": 9.929091293797102e-05, + "loss": 0.9377, + "step": 16810 + }, + { + "epoch": 0.10745818586049602, + "grad_norm": 0.7452363967895508, + "learning_rate": 9.929007063968919e-05, + "loss": 0.9821, + "step": 16820 + }, + { + "epoch": 0.10752207301023473, + "grad_norm": 0.8996424078941345, + "learning_rate": 9.92892278450137e-05, + "loss": 0.969, + "step": 16830 + }, + { + "epoch": 0.10758596015997342, + "grad_norm": 0.9038456082344055, + "learning_rate": 9.928838455395304e-05, + "loss": 0.9136, + "step": 16840 + }, + { + "epoch": 0.10764984730971212, + "grad_norm": 2.0651540756225586, + "learning_rate": 9.928754076651571e-05, + "loss": 0.9447, + "step": 16850 + }, + { + "epoch": 0.10771373445945083, + "grad_norm": 0.9123902916908264, + "learning_rate": 9.928669648271021e-05, + "loss": 1.0723, + "step": 16860 + }, + { + "epoch": 0.10777762160918952, + "grad_norm": 0.7702105641365051, + "learning_rate": 9.928585170254503e-05, + "loss": 1.0555, + "step": 16870 + 
}, + { + "epoch": 0.10784150875892823, + "grad_norm": 0.8191667795181274, + "learning_rate": 9.928500642602869e-05, + "loss": 0.952, + "step": 16880 + }, + { + "epoch": 0.10790539590866693, + "grad_norm": 0.8521249890327454, + "learning_rate": 9.928416065316969e-05, + "loss": 0.9182, + "step": 16890 + }, + { + "epoch": 0.10796928305840563, + "grad_norm": 0.4355503022670746, + "learning_rate": 9.928331438397655e-05, + "loss": 0.7828, + "step": 16900 + }, + { + "epoch": 0.10803317020814433, + "grad_norm": 0.8053306937217712, + "learning_rate": 9.928246761845782e-05, + "loss": 1.073, + "step": 16910 + }, + { + "epoch": 0.10809705735788304, + "grad_norm": 1.1718153953552246, + "learning_rate": 9.928162035662199e-05, + "loss": 0.8979, + "step": 16920 + }, + { + "epoch": 0.10816094450762173, + "grad_norm": 0.8112810850143433, + "learning_rate": 9.928077259847761e-05, + "loss": 1.0277, + "step": 16930 + }, + { + "epoch": 0.10822483165736044, + "grad_norm": 0.5165520906448364, + "learning_rate": 9.927992434403322e-05, + "loss": 1.0714, + "step": 16940 + }, + { + "epoch": 0.10828871880709914, + "grad_norm": 0.9523488283157349, + "learning_rate": 9.927907559329736e-05, + "loss": 0.9623, + "step": 16950 + }, + { + "epoch": 0.10835260595683784, + "grad_norm": 0.5549238324165344, + "learning_rate": 9.927822634627857e-05, + "loss": 0.7777, + "step": 16960 + }, + { + "epoch": 0.10841649310657654, + "grad_norm": 0.8362735509872437, + "learning_rate": 9.927737660298541e-05, + "loss": 0.9429, + "step": 16970 + }, + { + "epoch": 0.10848038025631525, + "grad_norm": 1.324947714805603, + "learning_rate": 9.927652636342645e-05, + "loss": 0.9626, + "step": 16980 + }, + { + "epoch": 0.10854426740605394, + "grad_norm": 0.8219287395477295, + "learning_rate": 9.927567562761021e-05, + "loss": 1.1411, + "step": 16990 + }, + { + "epoch": 0.10860815455579265, + "grad_norm": 0.7673150897026062, + "learning_rate": 9.927482439554532e-05, + "loss": 0.9758, + "step": 17000 + }, + { + "epoch": 0.10867204170553135, + "grad_norm": 0.7057496905326843, + "learning_rate": 9.92739726672403e-05, + "loss": 0.8181, + "step": 17010 + }, + { + "epoch": 0.10873592885527004, + "grad_norm": 1.2595868110656738, + "learning_rate": 9.927312044270375e-05, + "loss": 0.9396, + "step": 17020 + }, + { + "epoch": 0.10879981600500875, + "grad_norm": 0.8270642161369324, + "learning_rate": 9.927226772194426e-05, + "loss": 0.9074, + "step": 17030 + }, + { + "epoch": 0.10886370315474746, + "grad_norm": 0.5199679732322693, + "learning_rate": 9.927141450497039e-05, + "loss": 0.9427, + "step": 17040 + }, + { + "epoch": 0.10892759030448615, + "grad_norm": 0.7724682688713074, + "learning_rate": 9.927056079179076e-05, + "loss": 0.8286, + "step": 17050 + }, + { + "epoch": 0.10899147745422486, + "grad_norm": 0.614035964012146, + "learning_rate": 9.926970658241397e-05, + "loss": 0.8915, + "step": 17060 + }, + { + "epoch": 0.10905536460396356, + "grad_norm": 1.045047402381897, + "learning_rate": 9.926885187684859e-05, + "loss": 0.8422, + "step": 17070 + }, + { + "epoch": 0.10911925175370225, + "grad_norm": 0.7779353857040405, + "learning_rate": 9.926799667510326e-05, + "loss": 0.8882, + "step": 17080 + }, + { + "epoch": 0.10918313890344096, + "grad_norm": 0.9462752938270569, + "learning_rate": 9.926714097718657e-05, + "loss": 0.923, + "step": 17090 + }, + { + "epoch": 0.10924702605317967, + "grad_norm": 0.5807504057884216, + "learning_rate": 9.926628478310715e-05, + "loss": 0.8799, + "step": 17100 + }, + { + "epoch": 0.10931091320291836, + "grad_norm": 
0.8692427277565002, + "learning_rate": 9.926542809287364e-05, + "loss": 0.9051, + "step": 17110 + }, + { + "epoch": 0.10937480035265706, + "grad_norm": 0.8406835794448853, + "learning_rate": 9.926457090649462e-05, + "loss": 0.7788, + "step": 17120 + }, + { + "epoch": 0.10943868750239577, + "grad_norm": 0.8116185665130615, + "learning_rate": 9.926371322397877e-05, + "loss": 0.7086, + "step": 17130 + }, + { + "epoch": 0.10950257465213446, + "grad_norm": 1.1767171621322632, + "learning_rate": 9.92628550453347e-05, + "loss": 0.9115, + "step": 17140 + }, + { + "epoch": 0.10956646180187317, + "grad_norm": 0.8342850804328918, + "learning_rate": 9.926199637057108e-05, + "loss": 0.8859, + "step": 17150 + }, + { + "epoch": 0.10963034895161188, + "grad_norm": 1.0540907382965088, + "learning_rate": 9.926113719969652e-05, + "loss": 1.1537, + "step": 17160 + }, + { + "epoch": 0.10969423610135058, + "grad_norm": 1.0921380519866943, + "learning_rate": 9.926027753271969e-05, + "loss": 0.6981, + "step": 17170 + }, + { + "epoch": 0.10975812325108927, + "grad_norm": 1.9400385618209839, + "learning_rate": 9.925941736964925e-05, + "loss": 0.953, + "step": 17180 + }, + { + "epoch": 0.10982201040082798, + "grad_norm": 0.9200677275657654, + "learning_rate": 9.925855671049387e-05, + "loss": 1.0603, + "step": 17190 + }, + { + "epoch": 0.10988589755056669, + "grad_norm": 0.8739213347434998, + "learning_rate": 9.92576955552622e-05, + "loss": 0.9993, + "step": 17200 + }, + { + "epoch": 0.10994978470030538, + "grad_norm": 0.4887886345386505, + "learning_rate": 9.925683390396292e-05, + "loss": 0.9623, + "step": 17210 + }, + { + "epoch": 0.11001367185004408, + "grad_norm": 0.7912802696228027, + "learning_rate": 9.925597175660472e-05, + "loss": 0.9074, + "step": 17220 + }, + { + "epoch": 0.11007755899978279, + "grad_norm": 0.8125321865081787, + "learning_rate": 9.925510911319626e-05, + "loss": 0.8537, + "step": 17230 + }, + { + "epoch": 0.11014144614952148, + "grad_norm": 0.5782091617584229, + "learning_rate": 9.925424597374626e-05, + "loss": 0.8458, + "step": 17240 + }, + { + "epoch": 0.11020533329926019, + "grad_norm": 0.777730405330658, + "learning_rate": 9.925338233826338e-05, + "loss": 0.8778, + "step": 17250 + }, + { + "epoch": 0.1102692204489989, + "grad_norm": 0.8471282124519348, + "learning_rate": 9.925251820675633e-05, + "loss": 0.8727, + "step": 17260 + }, + { + "epoch": 0.11033310759873759, + "grad_norm": 0.7456023097038269, + "learning_rate": 9.92516535792338e-05, + "loss": 0.9105, + "step": 17270 + }, + { + "epoch": 0.1103969947484763, + "grad_norm": 1.1746282577514648, + "learning_rate": 9.925078845570452e-05, + "loss": 0.9972, + "step": 17280 + }, + { + "epoch": 0.110460881898215, + "grad_norm": 1.0486959218978882, + "learning_rate": 9.92499228361772e-05, + "loss": 0.923, + "step": 17290 + }, + { + "epoch": 0.11052476904795369, + "grad_norm": 0.8411831259727478, + "learning_rate": 9.924905672066054e-05, + "loss": 0.8309, + "step": 17300 + }, + { + "epoch": 0.1105886561976924, + "grad_norm": 0.8965527415275574, + "learning_rate": 9.924819010916328e-05, + "loss": 0.66, + "step": 17310 + }, + { + "epoch": 0.1106525433474311, + "grad_norm": 0.8932517170906067, + "learning_rate": 9.924732300169414e-05, + "loss": 0.9388, + "step": 17320 + }, + { + "epoch": 0.1107164304971698, + "grad_norm": 0.4896504878997803, + "learning_rate": 9.924645539826184e-05, + "loss": 0.9554, + "step": 17330 + }, + { + "epoch": 0.1107803176469085, + "grad_norm": 1.2872638702392578, + "learning_rate": 9.924558729887514e-05, + "loss": 
1.0668, + "step": 17340 + }, + { + "epoch": 0.11084420479664721, + "grad_norm": 0.7142483592033386, + "learning_rate": 9.924471870354277e-05, + "loss": 0.9363, + "step": 17350 + }, + { + "epoch": 0.1109080919463859, + "grad_norm": 0.8296705484390259, + "learning_rate": 9.924384961227348e-05, + "loss": 0.7258, + "step": 17360 + }, + { + "epoch": 0.1109719790961246, + "grad_norm": 0.609883189201355, + "learning_rate": 9.924298002507602e-05, + "loss": 1.0268, + "step": 17370 + }, + { + "epoch": 0.11103586624586331, + "grad_norm": 1.5724817514419556, + "learning_rate": 9.924210994195915e-05, + "loss": 0.9622, + "step": 17380 + }, + { + "epoch": 0.111099753395602, + "grad_norm": 0.8032723069190979, + "learning_rate": 9.924123936293164e-05, + "loss": 0.7995, + "step": 17390 + }, + { + "epoch": 0.11116364054534071, + "grad_norm": 0.6989961266517639, + "learning_rate": 9.924036828800223e-05, + "loss": 0.8276, + "step": 17400 + }, + { + "epoch": 0.11122752769507942, + "grad_norm": 0.6586987972259521, + "learning_rate": 9.923949671717973e-05, + "loss": 0.9443, + "step": 17410 + }, + { + "epoch": 0.11129141484481811, + "grad_norm": 1.005934715270996, + "learning_rate": 9.923862465047291e-05, + "loss": 0.7672, + "step": 17420 + }, + { + "epoch": 0.11135530199455682, + "grad_norm": 0.7518536448478699, + "learning_rate": 9.923775208789053e-05, + "loss": 0.9946, + "step": 17430 + }, + { + "epoch": 0.11141918914429552, + "grad_norm": 0.8329457640647888, + "learning_rate": 9.923687902944138e-05, + "loss": 0.8019, + "step": 17440 + }, + { + "epoch": 0.11148307629403421, + "grad_norm": 0.5968138575553894, + "learning_rate": 9.923600547513427e-05, + "loss": 0.779, + "step": 17450 + }, + { + "epoch": 0.11154696344377292, + "grad_norm": 0.5740717053413391, + "learning_rate": 9.9235131424978e-05, + "loss": 1.1269, + "step": 17460 + }, + { + "epoch": 0.11161085059351163, + "grad_norm": 0.4525648355484009, + "learning_rate": 9.923425687898135e-05, + "loss": 0.9606, + "step": 17470 + }, + { + "epoch": 0.11167473774325032, + "grad_norm": 0.7562941312789917, + "learning_rate": 9.923338183715314e-05, + "loss": 1.3565, + "step": 17480 + }, + { + "epoch": 0.11173862489298902, + "grad_norm": 0.8217481970787048, + "learning_rate": 9.923250629950218e-05, + "loss": 0.9787, + "step": 17490 + }, + { + "epoch": 0.11180251204272773, + "grad_norm": 1.1421339511871338, + "learning_rate": 9.92316302660373e-05, + "loss": 1.118, + "step": 17500 + }, + { + "epoch": 0.11186639919246642, + "grad_norm": 1.1751115322113037, + "learning_rate": 9.92307537367673e-05, + "loss": 0.8283, + "step": 17510 + }, + { + "epoch": 0.11193028634220513, + "grad_norm": 0.8353852033615112, + "learning_rate": 9.922987671170103e-05, + "loss": 1.1464, + "step": 17520 + }, + { + "epoch": 0.11199417349194384, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.92289991908473e-05, + "loss": 0.8993, + "step": 17530 + }, + { + "epoch": 0.11205806064168253, + "grad_norm": 0.7143790125846863, + "learning_rate": 9.922812117421496e-05, + "loss": 0.8293, + "step": 17540 + }, + { + "epoch": 0.11212194779142123, + "grad_norm": 0.6704200506210327, + "learning_rate": 9.922724266181286e-05, + "loss": 0.9002, + "step": 17550 + }, + { + "epoch": 0.11218583494115994, + "grad_norm": 0.6758965253829956, + "learning_rate": 9.922636365364984e-05, + "loss": 0.868, + "step": 17560 + }, + { + "epoch": 0.11224972209089863, + "grad_norm": 1.4119186401367188, + "learning_rate": 9.922548414973473e-05, + "loss": 0.8967, + "step": 17570 + }, + { + "epoch": 0.11231360924063734, + 
"grad_norm": 0.7103084921836853, + "learning_rate": 9.922460415007644e-05, + "loss": 0.7774, + "step": 17580 + }, + { + "epoch": 0.11237749639037604, + "grad_norm": 1.5748227834701538, + "learning_rate": 9.922372365468378e-05, + "loss": 0.8543, + "step": 17590 + }, + { + "epoch": 0.11244138354011474, + "grad_norm": 0.8554244637489319, + "learning_rate": 9.922284266356565e-05, + "loss": 0.9862, + "step": 17600 + }, + { + "epoch": 0.11250527068985344, + "grad_norm": 0.9203200936317444, + "learning_rate": 9.92219611767309e-05, + "loss": 0.8462, + "step": 17610 + }, + { + "epoch": 0.11256915783959215, + "grad_norm": 0.8570156097412109, + "learning_rate": 9.922107919418842e-05, + "loss": 0.9768, + "step": 17620 + }, + { + "epoch": 0.11263304498933084, + "grad_norm": 0.8079208135604858, + "learning_rate": 9.92201967159471e-05, + "loss": 0.7745, + "step": 17630 + }, + { + "epoch": 0.11269693213906955, + "grad_norm": 0.8128913640975952, + "learning_rate": 9.92193137420158e-05, + "loss": 0.7183, + "step": 17640 + }, + { + "epoch": 0.11276081928880825, + "grad_norm": 1.0222535133361816, + "learning_rate": 9.921843027240345e-05, + "loss": 0.762, + "step": 17650 + }, + { + "epoch": 0.11282470643854695, + "grad_norm": 0.782536506652832, + "learning_rate": 9.921754630711891e-05, + "loss": 1.0573, + "step": 17660 + }, + { + "epoch": 0.11288859358828565, + "grad_norm": 0.7294056415557861, + "learning_rate": 9.921666184617111e-05, + "loss": 1.1262, + "step": 17670 + }, + { + "epoch": 0.11295248073802436, + "grad_norm": 0.7423584461212158, + "learning_rate": 9.921577688956893e-05, + "loss": 0.9985, + "step": 17680 + }, + { + "epoch": 0.11301636788776305, + "grad_norm": 0.7123269438743591, + "learning_rate": 9.921489143732133e-05, + "loss": 0.9849, + "step": 17690 + }, + { + "epoch": 0.11308025503750176, + "grad_norm": 0.9806658625602722, + "learning_rate": 9.921400548943718e-05, + "loss": 0.8499, + "step": 17700 + }, + { + "epoch": 0.11314414218724046, + "grad_norm": 2.702582359313965, + "learning_rate": 9.921311904592541e-05, + "loss": 0.9368, + "step": 17710 + }, + { + "epoch": 0.11320802933697915, + "grad_norm": 0.627751350402832, + "learning_rate": 9.921223210679495e-05, + "loss": 1.0154, + "step": 17720 + }, + { + "epoch": 0.11327191648671786, + "grad_norm": 1.1272038221359253, + "learning_rate": 9.921134467205477e-05, + "loss": 1.0128, + "step": 17730 + }, + { + "epoch": 0.11333580363645657, + "grad_norm": 1.0452537536621094, + "learning_rate": 9.921045674171374e-05, + "loss": 0.9581, + "step": 17740 + }, + { + "epoch": 0.11339969078619526, + "grad_norm": 0.6000169515609741, + "learning_rate": 9.920956831578086e-05, + "loss": 1.0127, + "step": 17750 + }, + { + "epoch": 0.11346357793593397, + "grad_norm": 0.8441605567932129, + "learning_rate": 9.920867939426505e-05, + "loss": 1.0766, + "step": 17760 + }, + { + "epoch": 0.11352746508567267, + "grad_norm": 1.0325100421905518, + "learning_rate": 9.920778997717527e-05, + "loss": 0.673, + "step": 17770 + }, + { + "epoch": 0.11359135223541136, + "grad_norm": 0.8646054863929749, + "learning_rate": 9.920690006452047e-05, + "loss": 0.8475, + "step": 17780 + }, + { + "epoch": 0.11365523938515007, + "grad_norm": 1.1158571243286133, + "learning_rate": 9.920600965630962e-05, + "loss": 0.7743, + "step": 17790 + }, + { + "epoch": 0.11371912653488878, + "grad_norm": 0.790447473526001, + "learning_rate": 9.920511875255168e-05, + "loss": 0.8564, + "step": 17800 + }, + { + "epoch": 0.11378301368462747, + "grad_norm": 0.6469011902809143, + "learning_rate": 
9.920422735325561e-05, + "loss": 0.9071, + "step": 17810 + }, + { + "epoch": 0.11384690083436617, + "grad_norm": 0.8129775524139404, + "learning_rate": 9.920333545843042e-05, + "loss": 0.9754, + "step": 17820 + }, + { + "epoch": 0.11391078798410488, + "grad_norm": 1.0118224620819092, + "learning_rate": 9.920244306808509e-05, + "loss": 0.8034, + "step": 17830 + }, + { + "epoch": 0.11397467513384357, + "grad_norm": 0.8558486104011536, + "learning_rate": 9.920155018222857e-05, + "loss": 1.0181, + "step": 17840 + }, + { + "epoch": 0.11403856228358228, + "grad_norm": 1.0910837650299072, + "learning_rate": 9.920065680086988e-05, + "loss": 0.9216, + "step": 17850 + }, + { + "epoch": 0.11410244943332098, + "grad_norm": 0.6649434566497803, + "learning_rate": 9.9199762924018e-05, + "loss": 0.7545, + "step": 17860 + }, + { + "epoch": 0.11416633658305968, + "grad_norm": 2.092512369155884, + "learning_rate": 9.919886855168196e-05, + "loss": 1.0409, + "step": 17870 + }, + { + "epoch": 0.11423022373279838, + "grad_norm": 1.0226621627807617, + "learning_rate": 9.919797368387073e-05, + "loss": 0.9839, + "step": 17880 + }, + { + "epoch": 0.11429411088253709, + "grad_norm": 0.9362402558326721, + "learning_rate": 9.919707832059337e-05, + "loss": 0.9349, + "step": 17890 + }, + { + "epoch": 0.11435799803227578, + "grad_norm": 0.6043878793716431, + "learning_rate": 9.919618246185886e-05, + "loss": 0.9667, + "step": 17900 + }, + { + "epoch": 0.11442188518201449, + "grad_norm": 0.7030009031295776, + "learning_rate": 9.919528610767622e-05, + "loss": 0.8868, + "step": 17910 + }, + { + "epoch": 0.1144857723317532, + "grad_norm": 0.54000324010849, + "learning_rate": 9.919438925805451e-05, + "loss": 0.9966, + "step": 17920 + }, + { + "epoch": 0.11454965948149189, + "grad_norm": 0.7529541254043579, + "learning_rate": 9.919349191300272e-05, + "loss": 0.8292, + "step": 17930 + }, + { + "epoch": 0.11461354663123059, + "grad_norm": 1.047979712486267, + "learning_rate": 9.919259407252992e-05, + "loss": 0.722, + "step": 17940 + }, + { + "epoch": 0.1146774337809693, + "grad_norm": 0.6364821195602417, + "learning_rate": 9.919169573664513e-05, + "loss": 1.0382, + "step": 17950 + }, + { + "epoch": 0.11474132093070799, + "grad_norm": 1.298886775970459, + "learning_rate": 9.919079690535742e-05, + "loss": 1.0275, + "step": 17960 + }, + { + "epoch": 0.1148052080804467, + "grad_norm": 0.9060257077217102, + "learning_rate": 9.918989757867583e-05, + "loss": 0.8959, + "step": 17970 + }, + { + "epoch": 0.1148690952301854, + "grad_norm": 1.0557827949523926, + "learning_rate": 9.91889977566094e-05, + "loss": 0.8224, + "step": 17980 + }, + { + "epoch": 0.1149329823799241, + "grad_norm": 1.0880374908447266, + "learning_rate": 9.918809743916722e-05, + "loss": 1.0926, + "step": 17990 + }, + { + "epoch": 0.1149968695296628, + "grad_norm": 0.9131140112876892, + "learning_rate": 9.918719662635834e-05, + "loss": 0.8125, + "step": 18000 + }, + { + "epoch": 0.11506075667940151, + "grad_norm": 0.956883430480957, + "learning_rate": 9.918629531819184e-05, + "loss": 0.7358, + "step": 18010 + }, + { + "epoch": 0.11512464382914021, + "grad_norm": 1.1593812704086304, + "learning_rate": 9.91853935146768e-05, + "loss": 0.9325, + "step": 18020 + }, + { + "epoch": 0.1151885309788789, + "grad_norm": 0.8647767901420593, + "learning_rate": 9.918449121582228e-05, + "loss": 0.889, + "step": 18030 + }, + { + "epoch": 0.11525241812861761, + "grad_norm": 0.875560462474823, + "learning_rate": 9.91835884216374e-05, + "loss": 0.7376, + "step": 18040 + }, + { + "epoch": 
0.11531630527835632, + "grad_norm": 1.0609110593795776, + "learning_rate": 9.918268513213123e-05, + "loss": 0.9935, + "step": 18050 + }, + { + "epoch": 0.11538019242809501, + "grad_norm": 0.7033603191375732, + "learning_rate": 9.918178134731286e-05, + "loss": 0.9307, + "step": 18060 + }, + { + "epoch": 0.11544407957783372, + "grad_norm": 0.7909555435180664, + "learning_rate": 9.918087706719141e-05, + "loss": 1.0967, + "step": 18070 + }, + { + "epoch": 0.11550796672757242, + "grad_norm": 1.5477937459945679, + "learning_rate": 9.917997229177597e-05, + "loss": 0.921, + "step": 18080 + }, + { + "epoch": 0.11557185387731111, + "grad_norm": 1.373567819595337, + "learning_rate": 9.91790670210757e-05, + "loss": 0.7096, + "step": 18090 + }, + { + "epoch": 0.11563574102704982, + "grad_norm": 0.5353577136993408, + "learning_rate": 9.917816125509965e-05, + "loss": 0.8476, + "step": 18100 + }, + { + "epoch": 0.11569962817678853, + "grad_norm": 0.6826961040496826, + "learning_rate": 9.917725499385698e-05, + "loss": 1.0802, + "step": 18110 + }, + { + "epoch": 0.11576351532652722, + "grad_norm": 0.9268578290939331, + "learning_rate": 9.917634823735678e-05, + "loss": 1.0728, + "step": 18120 + }, + { + "epoch": 0.11582740247626593, + "grad_norm": 0.9943346381187439, + "learning_rate": 9.917544098560824e-05, + "loss": 1.2018, + "step": 18130 + }, + { + "epoch": 0.11589128962600463, + "grad_norm": 1.2347413301467896, + "learning_rate": 9.917453323862046e-05, + "loss": 0.8933, + "step": 18140 + }, + { + "epoch": 0.11595517677574332, + "grad_norm": 0.7425234913825989, + "learning_rate": 9.91736249964026e-05, + "loss": 0.7152, + "step": 18150 + }, + { + "epoch": 0.11601906392548203, + "grad_norm": 1.068671703338623, + "learning_rate": 9.917271625896377e-05, + "loss": 0.9737, + "step": 18160 + }, + { + "epoch": 0.11608295107522074, + "grad_norm": 0.9823939204216003, + "learning_rate": 9.917180702631316e-05, + "loss": 0.9365, + "step": 18170 + }, + { + "epoch": 0.11614683822495943, + "grad_norm": 0.7012134194374084, + "learning_rate": 9.917089729845991e-05, + "loss": 0.9741, + "step": 18180 + }, + { + "epoch": 0.11621072537469813, + "grad_norm": 0.8662933111190796, + "learning_rate": 9.916998707541319e-05, + "loss": 0.9238, + "step": 18190 + }, + { + "epoch": 0.11627461252443684, + "grad_norm": 0.5047873258590698, + "learning_rate": 9.916907635718216e-05, + "loss": 0.772, + "step": 18200 + }, + { + "epoch": 0.11633849967417553, + "grad_norm": 1.2148154973983765, + "learning_rate": 9.916816514377598e-05, + "loss": 0.8872, + "step": 18210 + }, + { + "epoch": 0.11640238682391424, + "grad_norm": 0.6862503886222839, + "learning_rate": 9.916725343520386e-05, + "loss": 0.9914, + "step": 18220 + }, + { + "epoch": 0.11646627397365295, + "grad_norm": 0.7228761315345764, + "learning_rate": 9.916634123147495e-05, + "loss": 1.3034, + "step": 18230 + }, + { + "epoch": 0.11653016112339164, + "grad_norm": 0.5457968711853027, + "learning_rate": 9.916542853259848e-05, + "loss": 0.8272, + "step": 18240 + }, + { + "epoch": 0.11659404827313034, + "grad_norm": 2.023207187652588, + "learning_rate": 9.916451533858358e-05, + "loss": 0.7746, + "step": 18250 + }, + { + "epoch": 0.11665793542286905, + "grad_norm": 0.9167050123214722, + "learning_rate": 9.916360164943947e-05, + "loss": 0.9439, + "step": 18260 + }, + { + "epoch": 0.11672182257260774, + "grad_norm": 0.7956591248512268, + "learning_rate": 9.916268746517537e-05, + "loss": 1.0798, + "step": 18270 + }, + { + "epoch": 0.11678570972234645, + "grad_norm": 0.8357956409454346, + 
"learning_rate": 9.916177278580047e-05, + "loss": 1.0405, + "step": 18280 + }, + { + "epoch": 0.11684959687208515, + "grad_norm": 0.7955309152603149, + "learning_rate": 9.9160857611324e-05, + "loss": 0.7501, + "step": 18290 + }, + { + "epoch": 0.11691348402182385, + "grad_norm": 0.8821001052856445, + "learning_rate": 9.915994194175516e-05, + "loss": 0.916, + "step": 18300 + }, + { + "epoch": 0.11697737117156255, + "grad_norm": 0.7497395873069763, + "learning_rate": 9.915902577710318e-05, + "loss": 0.9209, + "step": 18310 + }, + { + "epoch": 0.11704125832130126, + "grad_norm": 0.7040755152702332, + "learning_rate": 9.915810911737727e-05, + "loss": 0.807, + "step": 18320 + }, + { + "epoch": 0.11710514547103995, + "grad_norm": 0.640442430973053, + "learning_rate": 9.915719196258668e-05, + "loss": 0.8374, + "step": 18330 + }, + { + "epoch": 0.11716903262077866, + "grad_norm": 0.8393665552139282, + "learning_rate": 9.915627431274064e-05, + "loss": 1.118, + "step": 18340 + }, + { + "epoch": 0.11723291977051736, + "grad_norm": 0.9538019895553589, + "learning_rate": 9.915535616784838e-05, + "loss": 0.6021, + "step": 18350 + }, + { + "epoch": 0.11729680692025606, + "grad_norm": 0.5672876238822937, + "learning_rate": 9.915443752791917e-05, + "loss": 0.7703, + "step": 18360 + }, + { + "epoch": 0.11736069406999476, + "grad_norm": 0.6178574562072754, + "learning_rate": 9.915351839296225e-05, + "loss": 1.1465, + "step": 18370 + }, + { + "epoch": 0.11742458121973347, + "grad_norm": 0.9924026131629944, + "learning_rate": 9.915259876298688e-05, + "loss": 0.9693, + "step": 18380 + }, + { + "epoch": 0.11748846836947216, + "grad_norm": 0.9154996275901794, + "learning_rate": 9.91516786380023e-05, + "loss": 1.5368, + "step": 18390 + }, + { + "epoch": 0.11755235551921087, + "grad_norm": 0.8077566623687744, + "learning_rate": 9.91507580180178e-05, + "loss": 0.9521, + "step": 18400 + }, + { + "epoch": 0.11761624266894957, + "grad_norm": 0.8165660500526428, + "learning_rate": 9.914983690304266e-05, + "loss": 0.977, + "step": 18410 + }, + { + "epoch": 0.11768012981868826, + "grad_norm": 0.46091389656066895, + "learning_rate": 9.914891529308614e-05, + "loss": 0.9916, + "step": 18420 + }, + { + "epoch": 0.11774401696842697, + "grad_norm": 0.8578134775161743, + "learning_rate": 9.914799318815751e-05, + "loss": 0.8512, + "step": 18430 + }, + { + "epoch": 0.11780790411816568, + "grad_norm": 1.149581789970398, + "learning_rate": 9.914707058826607e-05, + "loss": 1.0471, + "step": 18440 + }, + { + "epoch": 0.11787179126790437, + "grad_norm": 1.0105202198028564, + "learning_rate": 9.91461474934211e-05, + "loss": 0.8365, + "step": 18450 + }, + { + "epoch": 0.11793567841764308, + "grad_norm": 0.5020955801010132, + "learning_rate": 9.914522390363194e-05, + "loss": 0.9565, + "step": 18460 + }, + { + "epoch": 0.11799956556738178, + "grad_norm": 0.5407631993293762, + "learning_rate": 9.914429981890783e-05, + "loss": 0.885, + "step": 18470 + }, + { + "epoch": 0.11806345271712047, + "grad_norm": 0.5676096081733704, + "learning_rate": 9.914337523925812e-05, + "loss": 0.9684, + "step": 18480 + }, + { + "epoch": 0.11812733986685918, + "grad_norm": 0.7046330571174622, + "learning_rate": 9.91424501646921e-05, + "loss": 1.2259, + "step": 18490 + }, + { + "epoch": 0.11819122701659789, + "grad_norm": 0.501208484172821, + "learning_rate": 9.914152459521909e-05, + "loss": 0.8909, + "step": 18500 + }, + { + "epoch": 0.11825511416633658, + "grad_norm": 1.0521641969680786, + "learning_rate": 9.914059853084842e-05, + "loss": 0.8647, + "step": 
18510 + }, + { + "epoch": 0.11831900131607528, + "grad_norm": 1.0477256774902344, + "learning_rate": 9.913967197158942e-05, + "loss": 0.8122, + "step": 18520 + }, + { + "epoch": 0.11838288846581399, + "grad_norm": 1.0611625909805298, + "learning_rate": 9.913874491745138e-05, + "loss": 1.0057, + "step": 18530 + }, + { + "epoch": 0.11844677561555268, + "grad_norm": 0.833010733127594, + "learning_rate": 9.91378173684437e-05, + "loss": 0.8437, + "step": 18540 + }, + { + "epoch": 0.11851066276529139, + "grad_norm": 0.7416166067123413, + "learning_rate": 9.913688932457567e-05, + "loss": 0.9933, + "step": 18550 + }, + { + "epoch": 0.1185745499150301, + "grad_norm": 1.7045838832855225, + "learning_rate": 9.913596078585667e-05, + "loss": 0.7402, + "step": 18560 + }, + { + "epoch": 0.11863843706476879, + "grad_norm": 1.204579472541809, + "learning_rate": 9.913503175229603e-05, + "loss": 0.73, + "step": 18570 + }, + { + "epoch": 0.11870232421450749, + "grad_norm": 2.575094223022461, + "learning_rate": 9.91341022239031e-05, + "loss": 1.0376, + "step": 18580 + }, + { + "epoch": 0.1187662113642462, + "grad_norm": 1.0583864450454712, + "learning_rate": 9.913317220068728e-05, + "loss": 0.9919, + "step": 18590 + }, + { + "epoch": 0.11883009851398489, + "grad_norm": 1.465122938156128, + "learning_rate": 9.913224168265788e-05, + "loss": 0.8039, + "step": 18600 + }, + { + "epoch": 0.1188939856637236, + "grad_norm": 0.8531835675239563, + "learning_rate": 9.913131066982431e-05, + "loss": 0.846, + "step": 18610 + }, + { + "epoch": 0.1189578728134623, + "grad_norm": 0.6930166482925415, + "learning_rate": 9.913037916219594e-05, + "loss": 1.1698, + "step": 18620 + }, + { + "epoch": 0.119021759963201, + "grad_norm": 0.8985093832015991, + "learning_rate": 9.912944715978215e-05, + "loss": 1.0585, + "step": 18630 + }, + { + "epoch": 0.1190856471129397, + "grad_norm": 0.7134751677513123, + "learning_rate": 9.912851466259232e-05, + "loss": 0.8098, + "step": 18640 + }, + { + "epoch": 0.11914953426267841, + "grad_norm": 1.5828766822814941, + "learning_rate": 9.912758167063585e-05, + "loss": 1.116, + "step": 18650 + }, + { + "epoch": 0.1192134214124171, + "grad_norm": 0.602565586566925, + "learning_rate": 9.912664818392213e-05, + "loss": 1.0292, + "step": 18660 + }, + { + "epoch": 0.1192773085621558, + "grad_norm": 0.9910022616386414, + "learning_rate": 9.912571420246057e-05, + "loss": 1.0432, + "step": 18670 + }, + { + "epoch": 0.11934119571189451, + "grad_norm": 0.8652639389038086, + "learning_rate": 9.912477972626055e-05, + "loss": 0.941, + "step": 18680 + }, + { + "epoch": 0.1194050828616332, + "grad_norm": 0.6660580039024353, + "learning_rate": 9.912384475533152e-05, + "loss": 0.8312, + "step": 18690 + }, + { + "epoch": 0.11946897001137191, + "grad_norm": 1.2698357105255127, + "learning_rate": 9.912290928968286e-05, + "loss": 0.6955, + "step": 18700 + }, + { + "epoch": 0.11953285716111062, + "grad_norm": 0.7728399634361267, + "learning_rate": 9.9121973329324e-05, + "loss": 0.6761, + "step": 18710 + }, + { + "epoch": 0.11959674431084931, + "grad_norm": 1.1762244701385498, + "learning_rate": 9.91210368742644e-05, + "loss": 0.9973, + "step": 18720 + }, + { + "epoch": 0.11966063146058802, + "grad_norm": 0.9727983474731445, + "learning_rate": 9.912009992451343e-05, + "loss": 1.0287, + "step": 18730 + }, + { + "epoch": 0.11972451861032672, + "grad_norm": 1.166279673576355, + "learning_rate": 9.911916248008058e-05, + "loss": 0.9455, + "step": 18740 + }, + { + "epoch": 0.11978840576006541, + "grad_norm": 1.3871594667434692, + 
"learning_rate": 9.911822454097526e-05, + "loss": 0.8691, + "step": 18750 + }, + { + "epoch": 0.11985229290980412, + "grad_norm": 0.7483668923377991, + "learning_rate": 9.911728610720693e-05, + "loss": 0.7596, + "step": 18760 + }, + { + "epoch": 0.11991618005954283, + "grad_norm": 0.7471362352371216, + "learning_rate": 9.911634717878505e-05, + "loss": 0.7925, + "step": 18770 + }, + { + "epoch": 0.11998006720928152, + "grad_norm": 0.7331792712211609, + "learning_rate": 9.911540775571903e-05, + "loss": 0.7732, + "step": 18780 + }, + { + "epoch": 0.12004395435902022, + "grad_norm": 0.8485783934593201, + "learning_rate": 9.911446783801839e-05, + "loss": 1.0558, + "step": 18790 + }, + { + "epoch": 0.12010784150875893, + "grad_norm": 0.63601154088974, + "learning_rate": 9.911352742569255e-05, + "loss": 0.8409, + "step": 18800 + }, + { + "epoch": 0.12017172865849762, + "grad_norm": 1.0364725589752197, + "learning_rate": 9.911258651875102e-05, + "loss": 1.1726, + "step": 18810 + }, + { + "epoch": 0.12023561580823633, + "grad_norm": 1.1578558683395386, + "learning_rate": 9.911164511720324e-05, + "loss": 1.0072, + "step": 18820 + }, + { + "epoch": 0.12029950295797504, + "grad_norm": 0.622075617313385, + "learning_rate": 9.911070322105871e-05, + "loss": 0.7986, + "step": 18830 + }, + { + "epoch": 0.12036339010771374, + "grad_norm": 0.9480080604553223, + "learning_rate": 9.91097608303269e-05, + "loss": 0.7835, + "step": 18840 + }, + { + "epoch": 0.12042727725745243, + "grad_norm": 0.6373130679130554, + "learning_rate": 9.910881794501734e-05, + "loss": 1.2013, + "step": 18850 + }, + { + "epoch": 0.12049116440719114, + "grad_norm": 1.1628334522247314, + "learning_rate": 9.910787456513948e-05, + "loss": 0.8801, + "step": 18860 + }, + { + "epoch": 0.12055505155692985, + "grad_norm": 1.2941060066223145, + "learning_rate": 9.910693069070285e-05, + "loss": 0.8426, + "step": 18870 + }, + { + "epoch": 0.12061893870666854, + "grad_norm": 1.0892646312713623, + "learning_rate": 9.910598632171692e-05, + "loss": 0.7019, + "step": 18880 + }, + { + "epoch": 0.12068282585640724, + "grad_norm": 1.0153416395187378, + "learning_rate": 9.910504145819124e-05, + "loss": 0.9361, + "step": 18890 + }, + { + "epoch": 0.12074671300614595, + "grad_norm": 0.8913525342941284, + "learning_rate": 9.910409610013531e-05, + "loss": 1.2171, + "step": 18900 + }, + { + "epoch": 0.12081060015588464, + "grad_norm": 1.518178105354309, + "learning_rate": 9.910315024755866e-05, + "loss": 0.8538, + "step": 18910 + }, + { + "epoch": 0.12087448730562335, + "grad_norm": 0.8142111301422119, + "learning_rate": 9.910220390047081e-05, + "loss": 0.9446, + "step": 18920 + }, + { + "epoch": 0.12093837445536205, + "grad_norm": 0.6663020849227905, + "learning_rate": 9.910125705888127e-05, + "loss": 0.9821, + "step": 18930 + }, + { + "epoch": 0.12100226160510075, + "grad_norm": 0.7732610106468201, + "learning_rate": 9.91003097227996e-05, + "loss": 0.7072, + "step": 18940 + }, + { + "epoch": 0.12106614875483945, + "grad_norm": 0.8097338080406189, + "learning_rate": 9.909936189223533e-05, + "loss": 1.0208, + "step": 18950 + }, + { + "epoch": 0.12113003590457816, + "grad_norm": 1.0220088958740234, + "learning_rate": 9.909841356719802e-05, + "loss": 0.7898, + "step": 18960 + }, + { + "epoch": 0.12119392305431685, + "grad_norm": 2.5410892963409424, + "learning_rate": 9.909746474769718e-05, + "loss": 0.884, + "step": 18970 + }, + { + "epoch": 0.12125781020405556, + "grad_norm": 1.1213639974594116, + "learning_rate": 9.909651543374243e-05, + "loss": 0.9554, + 
"step": 18980 + }, + { + "epoch": 0.12132169735379426, + "grad_norm": 0.8598119020462036, + "learning_rate": 9.909556562534327e-05, + "loss": 1.0724, + "step": 18990 + }, + { + "epoch": 0.12138558450353296, + "grad_norm": 0.89163738489151, + "learning_rate": 9.90946153225093e-05, + "loss": 0.9145, + "step": 19000 + }, + { + "epoch": 0.12144947165327166, + "grad_norm": 0.8153218030929565, + "learning_rate": 9.909366452525009e-05, + "loss": 0.8033, + "step": 19010 + }, + { + "epoch": 0.12151335880301037, + "grad_norm": 0.8267776966094971, + "learning_rate": 9.90927132335752e-05, + "loss": 0.9408, + "step": 19020 + }, + { + "epoch": 0.12157724595274906, + "grad_norm": 0.794154942035675, + "learning_rate": 9.909176144749421e-05, + "loss": 0.9167, + "step": 19030 + }, + { + "epoch": 0.12164113310248777, + "grad_norm": 0.9239640831947327, + "learning_rate": 9.909080916701672e-05, + "loss": 0.9062, + "step": 19040 + }, + { + "epoch": 0.12170502025222647, + "grad_norm": 1.426063060760498, + "learning_rate": 9.908995169188589e-05, + "loss": 0.9635, + "step": 19050 + }, + { + "epoch": 0.12176890740196517, + "grad_norm": 0.840755820274353, + "learning_rate": 9.908899847208145e-05, + "loss": 0.6732, + "step": 19060 + }, + { + "epoch": 0.12183279455170387, + "grad_norm": 1.245961308479309, + "learning_rate": 9.908804475790834e-05, + "loss": 1.0316, + "step": 19070 + }, + { + "epoch": 0.12189668170144258, + "grad_norm": 0.5957521796226501, + "learning_rate": 9.908709054937615e-05, + "loss": 0.7994, + "step": 19080 + }, + { + "epoch": 0.12196056885118127, + "grad_norm": 0.753171980381012, + "learning_rate": 9.908613584649447e-05, + "loss": 0.7903, + "step": 19090 + }, + { + "epoch": 0.12202445600091998, + "grad_norm": 0.5334873199462891, + "learning_rate": 9.908518064927297e-05, + "loss": 0.8806, + "step": 19100 + }, + { + "epoch": 0.12208834315065868, + "grad_norm": 0.7774950265884399, + "learning_rate": 9.908422495772121e-05, + "loss": 0.785, + "step": 19110 + }, + { + "epoch": 0.12215223030039737, + "grad_norm": 1.0679373741149902, + "learning_rate": 9.908326877184885e-05, + "loss": 1.1829, + "step": 19120 + }, + { + "epoch": 0.12221611745013608, + "grad_norm": 0.9180088043212891, + "learning_rate": 9.908231209166552e-05, + "loss": 0.943, + "step": 19130 + }, + { + "epoch": 0.12228000459987479, + "grad_norm": 2.2565629482269287, + "learning_rate": 9.908135491718082e-05, + "loss": 0.7051, + "step": 19140 + }, + { + "epoch": 0.12234389174961348, + "grad_norm": 0.5851088762283325, + "learning_rate": 9.908039724840444e-05, + "loss": 0.8, + "step": 19150 + }, + { + "epoch": 0.12240777889935218, + "grad_norm": 1.1300508975982666, + "learning_rate": 9.9079439085346e-05, + "loss": 1.0469, + "step": 19160 + }, + { + "epoch": 0.12247166604909089, + "grad_norm": 1.3692076206207275, + "learning_rate": 9.907848042801514e-05, + "loss": 0.8056, + "step": 19170 + }, + { + "epoch": 0.12253555319882958, + "grad_norm": 0.7391330599784851, + "learning_rate": 9.907752127642151e-05, + "loss": 1.0543, + "step": 19180 + }, + { + "epoch": 0.12259944034856829, + "grad_norm": 1.7373781204223633, + "learning_rate": 9.90765616305748e-05, + "loss": 1.1021, + "step": 19190 + }, + { + "epoch": 0.122663327498307, + "grad_norm": 1.2597390413284302, + "learning_rate": 9.907560149048465e-05, + "loss": 1.0209, + "step": 19200 + }, + { + "epoch": 0.12272721464804569, + "grad_norm": 0.7740830183029175, + "learning_rate": 9.907464085616073e-05, + "loss": 0.8195, + "step": 19210 + }, + { + "epoch": 0.1227911017977844, + "grad_norm": 
0.8929482698440552, + "learning_rate": 9.907367972761273e-05, + "loss": 0.8193, + "step": 19220 + }, + { + "epoch": 0.1228549889475231, + "grad_norm": 0.854239821434021, + "learning_rate": 9.907271810485033e-05, + "loss": 0.9699, + "step": 19230 + }, + { + "epoch": 0.12291887609726179, + "grad_norm": 1.0040228366851807, + "learning_rate": 9.907175598788319e-05, + "loss": 0.8653, + "step": 19240 + }, + { + "epoch": 0.1229827632470005, + "grad_norm": 0.9501043558120728, + "learning_rate": 9.907079337672102e-05, + "loss": 1.2441, + "step": 19250 + }, + { + "epoch": 0.1230466503967392, + "grad_norm": 0.9891424179077148, + "learning_rate": 9.90698302713735e-05, + "loss": 1.0697, + "step": 19260 + }, + { + "epoch": 0.1231105375464779, + "grad_norm": 0.7450829148292542, + "learning_rate": 9.906886667185034e-05, + "loss": 0.883, + "step": 19270 + }, + { + "epoch": 0.1231744246962166, + "grad_norm": 0.9859048128128052, + "learning_rate": 9.906790257816125e-05, + "loss": 1.0223, + "step": 19280 + }, + { + "epoch": 0.12323831184595531, + "grad_norm": 0.6718336343765259, + "learning_rate": 9.906693799031593e-05, + "loss": 0.7721, + "step": 19290 + }, + { + "epoch": 0.123302198995694, + "grad_norm": 0.9734120965003967, + "learning_rate": 9.90659729083241e-05, + "loss": 1.1092, + "step": 19300 + }, + { + "epoch": 0.12336608614543271, + "grad_norm": 0.5610973238945007, + "learning_rate": 9.906500733219545e-05, + "loss": 0.8074, + "step": 19310 + }, + { + "epoch": 0.12342997329517141, + "grad_norm": 0.9786707162857056, + "learning_rate": 9.906404126193976e-05, + "loss": 0.8548, + "step": 19320 + }, + { + "epoch": 0.1234938604449101, + "grad_norm": 0.71066814661026, + "learning_rate": 9.90630746975667e-05, + "loss": 0.9489, + "step": 19330 + }, + { + "epoch": 0.12355774759464881, + "grad_norm": 1.0802106857299805, + "learning_rate": 9.906210763908606e-05, + "loss": 1.0818, + "step": 19340 + }, + { + "epoch": 0.12362163474438752, + "grad_norm": 0.8210603594779968, + "learning_rate": 9.906114008650753e-05, + "loss": 1.1651, + "step": 19350 + }, + { + "epoch": 0.12368552189412621, + "grad_norm": 1.066074252128601, + "learning_rate": 9.906017203984089e-05, + "loss": 1.1113, + "step": 19360 + }, + { + "epoch": 0.12374940904386492, + "grad_norm": 1.0652400255203247, + "learning_rate": 9.905920349909587e-05, + "loss": 0.8688, + "step": 19370 + }, + { + "epoch": 0.12381329619360362, + "grad_norm": 0.6207056045532227, + "learning_rate": 9.905823446428222e-05, + "loss": 0.7867, + "step": 19380 + }, + { + "epoch": 0.12387718334334231, + "grad_norm": 0.508903443813324, + "learning_rate": 9.905726493540972e-05, + "loss": 0.7805, + "step": 19390 + }, + { + "epoch": 0.12394107049308102, + "grad_norm": 1.3334448337554932, + "learning_rate": 9.905629491248812e-05, + "loss": 1.1862, + "step": 19400 + }, + { + "epoch": 0.12400495764281973, + "grad_norm": 0.6775515675544739, + "learning_rate": 9.905532439552718e-05, + "loss": 1.0348, + "step": 19410 + }, + { + "epoch": 0.12406884479255842, + "grad_norm": 0.628044605255127, + "learning_rate": 9.905435338453668e-05, + "loss": 0.8879, + "step": 19420 + }, + { + "epoch": 0.12413273194229713, + "grad_norm": 0.4216572940349579, + "learning_rate": 9.905338187952642e-05, + "loss": 0.9814, + "step": 19430 + }, + { + "epoch": 0.12419661909203583, + "grad_norm": 0.9256001710891724, + "learning_rate": 9.905240988050616e-05, + "loss": 0.834, + "step": 19440 + }, + { + "epoch": 0.12426050624177452, + "grad_norm": 1.2580517530441284, + "learning_rate": 9.90514373874857e-05, + "loss": 
0.8597, + "step": 19450 + }, + { + "epoch": 0.12432439339151323, + "grad_norm": 0.9285855889320374, + "learning_rate": 9.905046440047483e-05, + "loss": 0.8476, + "step": 19460 + }, + { + "epoch": 0.12438828054125194, + "grad_norm": 4.265834808349609, + "learning_rate": 9.904949091948335e-05, + "loss": 0.7808, + "step": 19470 + }, + { + "epoch": 0.12445216769099063, + "grad_norm": 1.1027454137802124, + "learning_rate": 9.904851694452105e-05, + "loss": 0.9509, + "step": 19480 + }, + { + "epoch": 0.12451605484072933, + "grad_norm": 0.7222440838813782, + "learning_rate": 9.904754247559776e-05, + "loss": 1.19, + "step": 19490 + }, + { + "epoch": 0.12457994199046804, + "grad_norm": 0.9820877909660339, + "learning_rate": 9.904656751272328e-05, + "loss": 1.1383, + "step": 19500 + }, + { + "epoch": 0.12464382914020673, + "grad_norm": 0.521395742893219, + "learning_rate": 9.904559205590744e-05, + "loss": 0.7945, + "step": 19510 + }, + { + "epoch": 0.12470771628994544, + "grad_norm": 0.819299042224884, + "learning_rate": 9.904461610516006e-05, + "loss": 0.9847, + "step": 19520 + }, + { + "epoch": 0.12477160343968415, + "grad_norm": 0.7167036533355713, + "learning_rate": 9.904363966049098e-05, + "loss": 0.9058, + "step": 19530 + }, + { + "epoch": 0.12483549058942284, + "grad_norm": 0.9135296940803528, + "learning_rate": 9.904266272190999e-05, + "loss": 1.0799, + "step": 19540 + }, + { + "epoch": 0.12489937773916154, + "grad_norm": 0.9460045695304871, + "learning_rate": 9.904168528942696e-05, + "loss": 0.8938, + "step": 19550 + }, + { + "epoch": 0.12496326488890025, + "grad_norm": 0.8096686601638794, + "learning_rate": 9.904070736305176e-05, + "loss": 0.9354, + "step": 19560 + }, + { + "epoch": 0.12502715203863896, + "grad_norm": 0.8548075556755066, + "learning_rate": 9.903972894279419e-05, + "loss": 1.0604, + "step": 19570 + }, + { + "epoch": 0.12509103918837766, + "grad_norm": 0.9655779600143433, + "learning_rate": 9.903875002866412e-05, + "loss": 0.9133, + "step": 19580 + }, + { + "epoch": 0.12515492633811634, + "grad_norm": 0.6967488527297974, + "learning_rate": 9.903777062067142e-05, + "loss": 0.9566, + "step": 19590 + }, + { + "epoch": 0.12521881348785505, + "grad_norm": 0.5470744967460632, + "learning_rate": 9.903679071882594e-05, + "loss": 1.1614, + "step": 19600 + }, + { + "epoch": 0.12528270063759375, + "grad_norm": 0.44721782207489014, + "learning_rate": 9.903581032313757e-05, + "loss": 0.8072, + "step": 19610 + }, + { + "epoch": 0.12534658778733246, + "grad_norm": 0.6788471937179565, + "learning_rate": 9.903482943361616e-05, + "loss": 0.7861, + "step": 19620 + }, + { + "epoch": 0.12541047493707116, + "grad_norm": 1.1422935724258423, + "learning_rate": 9.90338480502716e-05, + "loss": 0.7205, + "step": 19630 + }, + { + "epoch": 0.12547436208680987, + "grad_norm": 0.9505433440208435, + "learning_rate": 9.903286617311375e-05, + "loss": 0.9314, + "step": 19640 + }, + { + "epoch": 0.12553824923654855, + "grad_norm": 1.1754378080368042, + "learning_rate": 9.903188380215254e-05, + "loss": 1.0777, + "step": 19650 + }, + { + "epoch": 0.12560213638628726, + "grad_norm": 0.8856581449508667, + "learning_rate": 9.903090093739784e-05, + "loss": 0.8573, + "step": 19660 + }, + { + "epoch": 0.12566602353602596, + "grad_norm": 0.9377738237380981, + "learning_rate": 9.902991757885955e-05, + "loss": 1.1693, + "step": 19670 + }, + { + "epoch": 0.12572991068576467, + "grad_norm": 1.0631327629089355, + "learning_rate": 9.902893372654755e-05, + "loss": 1.0915, + "step": 19680 + }, + { + "epoch": 
0.12579379783550337, + "grad_norm": 0.9726115465164185, + "learning_rate": 9.902794938047179e-05, + "loss": 0.8837, + "step": 19690 + }, + { + "epoch": 0.12585768498524208, + "grad_norm": 1.0716935396194458, + "learning_rate": 9.902696454064218e-05, + "loss": 0.9323, + "step": 19700 + }, + { + "epoch": 0.12592157213498076, + "grad_norm": 0.6799229383468628, + "learning_rate": 9.90259792070686e-05, + "loss": 1.0071, + "step": 19710 + }, + { + "epoch": 0.12598545928471946, + "grad_norm": 0.6276800632476807, + "learning_rate": 9.9024993379761e-05, + "loss": 0.7668, + "step": 19720 + }, + { + "epoch": 0.12604934643445817, + "grad_norm": 0.6206457018852234, + "learning_rate": 9.902400705872931e-05, + "loss": 0.9062, + "step": 19730 + }, + { + "epoch": 0.12611323358419688, + "grad_norm": 0.8407792448997498, + "learning_rate": 9.902302024398344e-05, + "loss": 0.9027, + "step": 19740 + }, + { + "epoch": 0.12617712073393558, + "grad_norm": 1.0708434581756592, + "learning_rate": 9.902203293553337e-05, + "loss": 1.1047, + "step": 19750 + }, + { + "epoch": 0.1262410078836743, + "grad_norm": 0.7790530920028687, + "learning_rate": 9.902104513338901e-05, + "loss": 0.9325, + "step": 19760 + }, + { + "epoch": 0.12630489503341297, + "grad_norm": 0.8316869139671326, + "learning_rate": 9.90200568375603e-05, + "loss": 0.7061, + "step": 19770 + }, + { + "epoch": 0.12636878218315167, + "grad_norm": 1.0187642574310303, + "learning_rate": 9.901906804805723e-05, + "loss": 1.0777, + "step": 19780 + }, + { + "epoch": 0.12643266933289038, + "grad_norm": 0.5136988759040833, + "learning_rate": 9.901807876488973e-05, + "loss": 0.9242, + "step": 19790 + }, + { + "epoch": 0.12649655648262909, + "grad_norm": 0.8000445365905762, + "learning_rate": 9.901708898806777e-05, + "loss": 0.9573, + "step": 19800 + }, + { + "epoch": 0.1265604436323678, + "grad_norm": 0.6587111353874207, + "learning_rate": 9.901609871760132e-05, + "loss": 1.0622, + "step": 19810 + }, + { + "epoch": 0.1266243307821065, + "grad_norm": 0.7772683501243591, + "learning_rate": 9.901510795350035e-05, + "loss": 0.9968, + "step": 19820 + }, + { + "epoch": 0.12668821793184518, + "grad_norm": 0.578628659248352, + "learning_rate": 9.901411669577484e-05, + "loss": 0.8025, + "step": 19830 + }, + { + "epoch": 0.12675210508158388, + "grad_norm": 0.5878568887710571, + "learning_rate": 9.901312494443477e-05, + "loss": 0.9639, + "step": 19840 + }, + { + "epoch": 0.1268159922313226, + "grad_norm": 1.2923487424850464, + "learning_rate": 9.901213269949013e-05, + "loss": 0.8744, + "step": 19850 + }, + { + "epoch": 0.1268798793810613, + "grad_norm": 0.8328975439071655, + "learning_rate": 9.90111399609509e-05, + "loss": 0.8774, + "step": 19860 + }, + { + "epoch": 0.1269437665308, + "grad_norm": 0.5888987183570862, + "learning_rate": 9.901024607425051e-05, + "loss": 0.8943, + "step": 19870 + }, + { + "epoch": 0.1270076536805387, + "grad_norm": 0.5450535416603088, + "learning_rate": 9.900925239790913e-05, + "loss": 0.8265, + "step": 19880 + }, + { + "epoch": 0.12707154083027739, + "grad_norm": 1.1033037900924683, + "learning_rate": 9.90082582280022e-05, + "loss": 1.0133, + "step": 19890 + }, + { + "epoch": 0.1271354279800161, + "grad_norm": 0.7691605687141418, + "learning_rate": 9.90072635645397e-05, + "loss": 1.0807, + "step": 19900 + }, + { + "epoch": 0.1271993151297548, + "grad_norm": 0.5714837908744812, + "learning_rate": 9.900626840753167e-05, + "loss": 0.8473, + "step": 19910 + }, + { + "epoch": 0.1272632022794935, + "grad_norm": 0.5955528020858765, + "learning_rate": 
9.90052727569881e-05, + "loss": 0.9808, + "step": 19920 + }, + { + "epoch": 0.1273270894292322, + "grad_norm": 0.6563436388969421, + "learning_rate": 9.900427661291904e-05, + "loss": 0.9406, + "step": 19930 + }, + { + "epoch": 0.12739097657897092, + "grad_norm": 1.398422360420227, + "learning_rate": 9.900327997533454e-05, + "loss": 1.1866, + "step": 19940 + }, + { + "epoch": 0.1274548637287096, + "grad_norm": 0.8855098485946655, + "learning_rate": 9.900228284424459e-05, + "loss": 1.167, + "step": 19950 + }, + { + "epoch": 0.1275187508784483, + "grad_norm": 0.8752385973930359, + "learning_rate": 9.900128521965927e-05, + "loss": 0.9714, + "step": 19960 + }, + { + "epoch": 0.127582638028187, + "grad_norm": 0.7587289810180664, + "learning_rate": 9.900028710158865e-05, + "loss": 1.1985, + "step": 19970 + }, + { + "epoch": 0.1276465251779257, + "grad_norm": 0.683338463306427, + "learning_rate": 9.899928849004269e-05, + "loss": 1.0779, + "step": 19980 + }, + { + "epoch": 0.12771041232766442, + "grad_norm": 0.738228440284729, + "learning_rate": 9.899828938503155e-05, + "loss": 0.8112, + "step": 19990 + }, + { + "epoch": 0.12777429947740312, + "grad_norm": 1.1224406957626343, + "learning_rate": 9.899728978656521e-05, + "loss": 0.707, + "step": 20000 + }, + { + "epoch": 0.1278381866271418, + "grad_norm": 1.0595028400421143, + "learning_rate": 9.89962896946538e-05, + "loss": 1.296, + "step": 20010 + }, + { + "epoch": 0.1279020737768805, + "grad_norm": 0.972698986530304, + "learning_rate": 9.899528910930736e-05, + "loss": 0.9258, + "step": 20020 + }, + { + "epoch": 0.12796596092661922, + "grad_norm": 0.7331506609916687, + "learning_rate": 9.899428803053597e-05, + "loss": 0.8608, + "step": 20030 + }, + { + "epoch": 0.12802984807635792, + "grad_norm": 0.9206950664520264, + "learning_rate": 9.899328645834971e-05, + "loss": 0.9087, + "step": 20040 + }, + { + "epoch": 0.12809373522609663, + "grad_norm": 2.2389299869537354, + "learning_rate": 9.899228439275867e-05, + "loss": 0.9422, + "step": 20050 + }, + { + "epoch": 0.12815762237583533, + "grad_norm": 1.7067959308624268, + "learning_rate": 9.899128183377294e-05, + "loss": 0.8746, + "step": 20060 + }, + { + "epoch": 0.128221509525574, + "grad_norm": 0.6370442509651184, + "learning_rate": 9.899027878140264e-05, + "loss": 1.1108, + "step": 20070 + }, + { + "epoch": 0.12828539667531272, + "grad_norm": 0.7334869503974915, + "learning_rate": 9.898927523565782e-05, + "loss": 0.7668, + "step": 20080 + }, + { + "epoch": 0.12834928382505142, + "grad_norm": 0.948521077632904, + "learning_rate": 9.898827119654864e-05, + "loss": 0.9522, + "step": 20090 + }, + { + "epoch": 0.12841317097479013, + "grad_norm": 1.9327528476715088, + "learning_rate": 9.898726666408516e-05, + "loss": 0.8717, + "step": 20100 + }, + { + "epoch": 0.12847705812452884, + "grad_norm": 0.8920581936836243, + "learning_rate": 9.898626163827755e-05, + "loss": 0.956, + "step": 20110 + }, + { + "epoch": 0.12854094527426754, + "grad_norm": 0.7983399033546448, + "learning_rate": 9.89852561191359e-05, + "loss": 0.8899, + "step": 20120 + }, + { + "epoch": 0.12860483242400625, + "grad_norm": 0.9559574723243713, + "learning_rate": 9.898425010667035e-05, + "loss": 0.9156, + "step": 20130 + }, + { + "epoch": 0.12866871957374493, + "grad_norm": 0.5370156764984131, + "learning_rate": 9.898324360089099e-05, + "loss": 0.8623, + "step": 20140 + }, + { + "epoch": 0.12873260672348363, + "grad_norm": 1.084375262260437, + "learning_rate": 9.898223660180802e-05, + "loss": 0.7424, + "step": 20150 + }, + { + "epoch": 
0.12879649387322234, + "grad_norm": 0.6435216069221497, + "learning_rate": 9.898122910943155e-05, + "loss": 0.7816, + "step": 20160 + }, + { + "epoch": 0.12886038102296105, + "grad_norm": 0.8738903999328613, + "learning_rate": 9.898022112377172e-05, + "loss": 0.7824, + "step": 20170 + }, + { + "epoch": 0.12892426817269975, + "grad_norm": 0.943022608757019, + "learning_rate": 9.89792126448387e-05, + "loss": 1.1974, + "step": 20180 + }, + { + "epoch": 0.12898815532243846, + "grad_norm": 0.9258697032928467, + "learning_rate": 9.897820367264262e-05, + "loss": 0.9837, + "step": 20190 + }, + { + "epoch": 0.12905204247217714, + "grad_norm": 0.8255495429039001, + "learning_rate": 9.897719420719367e-05, + "loss": 0.8147, + "step": 20200 + }, + { + "epoch": 0.12911592962191584, + "grad_norm": 0.5483478307723999, + "learning_rate": 9.897618424850199e-05, + "loss": 0.9607, + "step": 20210 + }, + { + "epoch": 0.12917981677165455, + "grad_norm": 0.976705014705658, + "learning_rate": 9.897517379657778e-05, + "loss": 0.9184, + "step": 20220 + }, + { + "epoch": 0.12924370392139325, + "grad_norm": 0.66350257396698, + "learning_rate": 9.89741628514312e-05, + "loss": 0.8475, + "step": 20230 + }, + { + "epoch": 0.12930759107113196, + "grad_norm": 0.9961204528808594, + "learning_rate": 9.897315141307242e-05, + "loss": 0.9149, + "step": 20240 + }, + { + "epoch": 0.12937147822087067, + "grad_norm": 0.8872457146644592, + "learning_rate": 9.897213948151165e-05, + "loss": 0.8368, + "step": 20250 + }, + { + "epoch": 0.12943536537060935, + "grad_norm": 1.1536744832992554, + "learning_rate": 9.897112705675906e-05, + "loss": 0.8775, + "step": 20260 + }, + { + "epoch": 0.12949925252034805, + "grad_norm": 0.835328221321106, + "learning_rate": 9.897011413882484e-05, + "loss": 0.8357, + "step": 20270 + }, + { + "epoch": 0.12956313967008676, + "grad_norm": 0.5641841292381287, + "learning_rate": 9.896910072771924e-05, + "loss": 0.9148, + "step": 20280 + }, + { + "epoch": 0.12962702681982546, + "grad_norm": 0.9598913192749023, + "learning_rate": 9.89680868234524e-05, + "loss": 0.8754, + "step": 20290 + }, + { + "epoch": 0.12969091396956417, + "grad_norm": 0.7789944410324097, + "learning_rate": 9.896707242603457e-05, + "loss": 0.8845, + "step": 20300 + }, + { + "epoch": 0.12975480111930288, + "grad_norm": 1.1389309167861938, + "learning_rate": 9.896605753547596e-05, + "loss": 1.028, + "step": 20310 + }, + { + "epoch": 0.12981868826904155, + "grad_norm": 0.8242889642715454, + "learning_rate": 9.896504215178681e-05, + "loss": 0.7889, + "step": 20320 + }, + { + "epoch": 0.12988257541878026, + "grad_norm": 1.3238638639450073, + "learning_rate": 9.89640262749773e-05, + "loss": 0.8451, + "step": 20330 + }, + { + "epoch": 0.12994646256851897, + "grad_norm": 1.0306720733642578, + "learning_rate": 9.896300990505768e-05, + "loss": 0.9655, + "step": 20340 + }, + { + "epoch": 0.13001034971825767, + "grad_norm": 0.7990890145301819, + "learning_rate": 9.896199304203821e-05, + "loss": 0.9537, + "step": 20350 + }, + { + "epoch": 0.13007423686799638, + "grad_norm": 0.8819360136985779, + "learning_rate": 9.89609756859291e-05, + "loss": 0.9703, + "step": 20360 + }, + { + "epoch": 0.13013812401773509, + "grad_norm": 0.8472315669059753, + "learning_rate": 9.895995783674061e-05, + "loss": 1.1459, + "step": 20370 + }, + { + "epoch": 0.13020201116747376, + "grad_norm": 0.8132781386375427, + "learning_rate": 9.895893949448301e-05, + "loss": 1.2826, + "step": 20380 + }, + { + "epoch": 0.13026589831721247, + "grad_norm": 1.0438861846923828, + 
"learning_rate": 9.89579206591665e-05, + "loss": 0.846, + "step": 20390 + }, + { + "epoch": 0.13032978546695118, + "grad_norm": 1.2839152812957764, + "learning_rate": 9.89569013308014e-05, + "loss": 0.798, + "step": 20400 + }, + { + "epoch": 0.13039367261668988, + "grad_norm": 0.7642764449119568, + "learning_rate": 9.895588150939794e-05, + "loss": 1.2106, + "step": 20410 + }, + { + "epoch": 0.1304575597664286, + "grad_norm": 1.4906141757965088, + "learning_rate": 9.89548611949664e-05, + "loss": 1.0197, + "step": 20420 + }, + { + "epoch": 0.1305214469161673, + "grad_norm": 1.0365071296691895, + "learning_rate": 9.895384038751705e-05, + "loss": 0.793, + "step": 20430 + }, + { + "epoch": 0.13058533406590597, + "grad_norm": 0.7034469842910767, + "learning_rate": 9.895281908706018e-05, + "loss": 1.0824, + "step": 20440 + }, + { + "epoch": 0.13064922121564468, + "grad_norm": 0.8058176636695862, + "learning_rate": 9.895179729360606e-05, + "loss": 0.9053, + "step": 20450 + }, + { + "epoch": 0.13071310836538338, + "grad_norm": 1.0343101024627686, + "learning_rate": 9.8950775007165e-05, + "loss": 1.0945, + "step": 20460 + }, + { + "epoch": 0.1307769955151221, + "grad_norm": 0.7652077674865723, + "learning_rate": 9.89497522277473e-05, + "loss": 0.8568, + "step": 20470 + }, + { + "epoch": 0.1308408826648608, + "grad_norm": 0.6593330502510071, + "learning_rate": 9.894872895536325e-05, + "loss": 0.9574, + "step": 20480 + }, + { + "epoch": 0.1309047698145995, + "grad_norm": 1.9122685194015503, + "learning_rate": 9.894770519002314e-05, + "loss": 0.8306, + "step": 20490 + }, + { + "epoch": 0.13096865696433818, + "grad_norm": 0.57440185546875, + "learning_rate": 9.894668093173729e-05, + "loss": 0.6806, + "step": 20500 + }, + { + "epoch": 0.1310325441140769, + "grad_norm": 0.5228521227836609, + "learning_rate": 9.894565618051603e-05, + "loss": 0.9544, + "step": 20510 + }, + { + "epoch": 0.1310964312638156, + "grad_norm": 0.6962705850601196, + "learning_rate": 9.894463093636966e-05, + "loss": 0.7487, + "step": 20520 + }, + { + "epoch": 0.1311603184135543, + "grad_norm": 0.92603999376297, + "learning_rate": 9.89436051993085e-05, + "loss": 0.7536, + "step": 20530 + }, + { + "epoch": 0.131224205563293, + "grad_norm": 1.5977349281311035, + "learning_rate": 9.894257896934292e-05, + "loss": 1.0637, + "step": 20540 + }, + { + "epoch": 0.1312880927130317, + "grad_norm": 1.1071442365646362, + "learning_rate": 9.894155224648322e-05, + "loss": 0.7441, + "step": 20550 + }, + { + "epoch": 0.1313519798627704, + "grad_norm": 0.576611340045929, + "learning_rate": 9.894052503073973e-05, + "loss": 0.7077, + "step": 20560 + }, + { + "epoch": 0.1314158670125091, + "grad_norm": 0.7525666356086731, + "learning_rate": 9.893949732212284e-05, + "loss": 0.9049, + "step": 20570 + }, + { + "epoch": 0.1314797541622478, + "grad_norm": 0.7113981246948242, + "learning_rate": 9.893846912064287e-05, + "loss": 1.0453, + "step": 20580 + }, + { + "epoch": 0.1315436413119865, + "grad_norm": 0.9703547358512878, + "learning_rate": 9.893744042631016e-05, + "loss": 0.903, + "step": 20590 + }, + { + "epoch": 0.13160752846172522, + "grad_norm": 0.8187039494514465, + "learning_rate": 9.89364112391351e-05, + "loss": 0.7462, + "step": 20600 + }, + { + "epoch": 0.13167141561146392, + "grad_norm": 0.6756948232650757, + "learning_rate": 9.893538155912804e-05, + "loss": 0.8157, + "step": 20610 + }, + { + "epoch": 0.1317353027612026, + "grad_norm": 1.0830146074295044, + "learning_rate": 9.893435138629936e-05, + "loss": 0.7643, + "step": 20620 + }, + { + 
"epoch": 0.1317991899109413, + "grad_norm": 1.8327852487564087, + "learning_rate": 9.893332072065942e-05, + "loss": 0.9862, + "step": 20630 + }, + { + "epoch": 0.13186307706068, + "grad_norm": 2.034275770187378, + "learning_rate": 9.893228956221861e-05, + "loss": 0.8296, + "step": 20640 + }, + { + "epoch": 0.13192696421041872, + "grad_norm": 0.7762085795402527, + "learning_rate": 9.893125791098729e-05, + "loss": 0.8986, + "step": 20650 + }, + { + "epoch": 0.13199085136015742, + "grad_norm": 1.0018727779388428, + "learning_rate": 9.89302257669759e-05, + "loss": 0.9179, + "step": 20660 + }, + { + "epoch": 0.13205473850989613, + "grad_norm": 1.3458504676818848, + "learning_rate": 9.89291931301948e-05, + "loss": 0.7677, + "step": 20670 + }, + { + "epoch": 0.1321186256596348, + "grad_norm": 0.7849268913269043, + "learning_rate": 9.89281600006544e-05, + "loss": 1.1385, + "step": 20680 + }, + { + "epoch": 0.13218251280937351, + "grad_norm": 0.9244788289070129, + "learning_rate": 9.892712637836507e-05, + "loss": 0.8751, + "step": 20690 + }, + { + "epoch": 0.13224639995911222, + "grad_norm": 0.7756919860839844, + "learning_rate": 9.892609226333728e-05, + "loss": 0.8581, + "step": 20700 + }, + { + "epoch": 0.13231028710885093, + "grad_norm": 0.7075464129447937, + "learning_rate": 9.89250576555814e-05, + "loss": 0.8242, + "step": 20710 + }, + { + "epoch": 0.13237417425858963, + "grad_norm": 0.8638562560081482, + "learning_rate": 9.892402255510786e-05, + "loss": 0.9992, + "step": 20720 + }, + { + "epoch": 0.13243806140832834, + "grad_norm": 0.9571630954742432, + "learning_rate": 9.89229869619271e-05, + "loss": 0.9963, + "step": 20730 + }, + { + "epoch": 0.13250194855806702, + "grad_norm": 1.0435787439346313, + "learning_rate": 9.892195087604954e-05, + "loss": 0.8632, + "step": 20740 + }, + { + "epoch": 0.13256583570780572, + "grad_norm": 1.1710478067398071, + "learning_rate": 9.89209142974856e-05, + "loss": 0.8433, + "step": 20750 + }, + { + "epoch": 0.13262972285754443, + "grad_norm": 0.6886267066001892, + "learning_rate": 9.891987722624574e-05, + "loss": 1.0012, + "step": 20760 + }, + { + "epoch": 0.13269361000728314, + "grad_norm": 1.172371745109558, + "learning_rate": 9.89188396623404e-05, + "loss": 0.9275, + "step": 20770 + }, + { + "epoch": 0.13275749715702184, + "grad_norm": 0.8536580204963684, + "learning_rate": 9.891780160577999e-05, + "loss": 1.0204, + "step": 20780 + }, + { + "epoch": 0.13282138430676055, + "grad_norm": 0.8853366374969482, + "learning_rate": 9.891676305657502e-05, + "loss": 0.913, + "step": 20790 + }, + { + "epoch": 0.13288527145649923, + "grad_norm": 0.9350702166557312, + "learning_rate": 9.891572401473594e-05, + "loss": 0.8241, + "step": 20800 + }, + { + "epoch": 0.13294915860623793, + "grad_norm": 0.7683811783790588, + "learning_rate": 9.891468448027318e-05, + "loss": 0.7284, + "step": 20810 + }, + { + "epoch": 0.13301304575597664, + "grad_norm": 1.978036880493164, + "learning_rate": 9.891364445319723e-05, + "loss": 0.9082, + "step": 20820 + }, + { + "epoch": 0.13307693290571534, + "grad_norm": 0.828632652759552, + "learning_rate": 9.891260393351858e-05, + "loss": 0.7259, + "step": 20830 + }, + { + "epoch": 0.13314082005545405, + "grad_norm": 0.8856496810913086, + "learning_rate": 9.891156292124768e-05, + "loss": 0.8231, + "step": 20840 + }, + { + "epoch": 0.13320470720519276, + "grad_norm": 0.9500540494918823, + "learning_rate": 9.891052141639505e-05, + "loss": 0.8454, + "step": 20850 + }, + { + "epoch": 0.13326859435493144, + "grad_norm": 0.6504539251327515, + 
"learning_rate": 9.890947941897113e-05, + "loss": 0.8924, + "step": 20860 + }, + { + "epoch": 0.13333248150467014, + "grad_norm": 0.9036272168159485, + "learning_rate": 9.890843692898644e-05, + "loss": 1.2291, + "step": 20870 + }, + { + "epoch": 0.13339636865440885, + "grad_norm": 0.487404465675354, + "learning_rate": 9.890739394645149e-05, + "loss": 0.9082, + "step": 20880 + }, + { + "epoch": 0.13346025580414755, + "grad_norm": 0.6194189786911011, + "learning_rate": 9.890635047137678e-05, + "loss": 0.8234, + "step": 20890 + }, + { + "epoch": 0.13352414295388626, + "grad_norm": 1.1986579895019531, + "learning_rate": 9.890530650377279e-05, + "loss": 1.0975, + "step": 20900 + }, + { + "epoch": 0.13358803010362497, + "grad_norm": 0.5254888534545898, + "learning_rate": 9.890426204365006e-05, + "loss": 0.9337, + "step": 20910 + }, + { + "epoch": 0.13365191725336364, + "grad_norm": 2.1500959396362305, + "learning_rate": 9.890321709101911e-05, + "loss": 0.9268, + "step": 20920 + }, + { + "epoch": 0.13371580440310235, + "grad_norm": 1.653495192527771, + "learning_rate": 9.890217164589044e-05, + "loss": 0.805, + "step": 20930 + }, + { + "epoch": 0.13377969155284106, + "grad_norm": 0.9262358546257019, + "learning_rate": 9.890112570827461e-05, + "loss": 0.7364, + "step": 20940 + }, + { + "epoch": 0.13384357870257976, + "grad_norm": 0.7505791187286377, + "learning_rate": 9.890007927818214e-05, + "loss": 0.836, + "step": 20950 + }, + { + "epoch": 0.13390746585231847, + "grad_norm": 0.7554075717926025, + "learning_rate": 9.889903235562357e-05, + "loss": 1.0677, + "step": 20960 + }, + { + "epoch": 0.13397135300205718, + "grad_norm": 1.8679813146591187, + "learning_rate": 9.889798494060942e-05, + "loss": 0.818, + "step": 20970 + }, + { + "epoch": 0.13403524015179588, + "grad_norm": 1.584902286529541, + "learning_rate": 9.889693703315029e-05, + "loss": 1.1151, + "step": 20980 + }, + { + "epoch": 0.13409912730153456, + "grad_norm": 0.8589569330215454, + "learning_rate": 9.889588863325667e-05, + "loss": 0.9884, + "step": 20990 + }, + { + "epoch": 0.13416301445127327, + "grad_norm": 0.9949905872344971, + "learning_rate": 9.889483974093917e-05, + "loss": 0.925, + "step": 21000 + }, + { + "epoch": 0.13422690160101197, + "grad_norm": 0.6873974204063416, + "learning_rate": 9.889379035620833e-05, + "loss": 0.9067, + "step": 21010 + }, + { + "epoch": 0.13429078875075068, + "grad_norm": 2.3519535064697266, + "learning_rate": 9.889274047907472e-05, + "loss": 0.9542, + "step": 21020 + }, + { + "epoch": 0.13435467590048938, + "grad_norm": 0.6520812511444092, + "learning_rate": 9.889169010954892e-05, + "loss": 0.9918, + "step": 21030 + }, + { + "epoch": 0.1344185630502281, + "grad_norm": 0.6458450555801392, + "learning_rate": 9.88906392476415e-05, + "loss": 1.0032, + "step": 21040 + }, + { + "epoch": 0.13448245019996677, + "grad_norm": 1.0632940530776978, + "learning_rate": 9.888958789336304e-05, + "loss": 1.0281, + "step": 21050 + }, + { + "epoch": 0.13454633734970547, + "grad_norm": 0.8738301992416382, + "learning_rate": 9.888853604672415e-05, + "loss": 0.8943, + "step": 21060 + }, + { + "epoch": 0.13461022449944418, + "grad_norm": 0.8271169662475586, + "learning_rate": 9.88874837077354e-05, + "loss": 0.8891, + "step": 21070 + }, + { + "epoch": 0.1346741116491829, + "grad_norm": 0.7805771827697754, + "learning_rate": 9.888643087640739e-05, + "loss": 0.9641, + "step": 21080 + }, + { + "epoch": 0.1347379987989216, + "grad_norm": 1.1134415864944458, + "learning_rate": 9.888537755275073e-05, + "loss": 0.8162, + "step": 
21090 + }, + { + "epoch": 0.1348018859486603, + "grad_norm": 0.9039101004600525, + "learning_rate": 9.888432373677602e-05, + "loss": 1.0201, + "step": 21100 + }, + { + "epoch": 0.13486577309839898, + "grad_norm": 0.8428747653961182, + "learning_rate": 9.888326942849389e-05, + "loss": 0.8404, + "step": 21110 + }, + { + "epoch": 0.13492966024813768, + "grad_norm": 0.8015506267547607, + "learning_rate": 9.888221462791493e-05, + "loss": 1.0085, + "step": 21120 + }, + { + "epoch": 0.1349935473978764, + "grad_norm": 1.1341489553451538, + "learning_rate": 9.88811593350498e-05, + "loss": 0.9912, + "step": 21130 + }, + { + "epoch": 0.1350574345476151, + "grad_norm": 0.8308176398277283, + "learning_rate": 9.888010354990911e-05, + "loss": 0.7831, + "step": 21140 + }, + { + "epoch": 0.1351213216973538, + "grad_norm": 0.8086538910865784, + "learning_rate": 9.887904727250348e-05, + "loss": 0.8645, + "step": 21150 + }, + { + "epoch": 0.1351852088470925, + "grad_norm": 0.5411624908447266, + "learning_rate": 9.887799050284355e-05, + "loss": 1.0745, + "step": 21160 + }, + { + "epoch": 0.1352490959968312, + "grad_norm": 0.8454309105873108, + "learning_rate": 9.887693324093998e-05, + "loss": 0.8502, + "step": 21170 + }, + { + "epoch": 0.1353129831465699, + "grad_norm": 0.8707975149154663, + "learning_rate": 9.88758754868034e-05, + "loss": 0.9231, + "step": 21180 + }, + { + "epoch": 0.1353768702963086, + "grad_norm": 0.819693386554718, + "learning_rate": 9.887481724044447e-05, + "loss": 0.8355, + "step": 21190 + }, + { + "epoch": 0.1354407574460473, + "grad_norm": 0.7734857201576233, + "learning_rate": 9.887375850187386e-05, + "loss": 1.1568, + "step": 21200 + }, + { + "epoch": 0.135504644595786, + "grad_norm": 0.6396207809448242, + "learning_rate": 9.887269927110222e-05, + "loss": 0.9182, + "step": 21210 + }, + { + "epoch": 0.13556853174552472, + "grad_norm": 1.281610369682312, + "learning_rate": 9.88716395481402e-05, + "loss": 0.9132, + "step": 21220 + }, + { + "epoch": 0.1356324188952634, + "grad_norm": 1.1592093706130981, + "learning_rate": 9.88705793329985e-05, + "loss": 0.9757, + "step": 21230 + }, + { + "epoch": 0.1356963060450021, + "grad_norm": 0.7820732593536377, + "learning_rate": 9.88695186256878e-05, + "loss": 0.9314, + "step": 21240 + }, + { + "epoch": 0.1357601931947408, + "grad_norm": 0.7652541399002075, + "learning_rate": 9.886845742621876e-05, + "loss": 1.0022, + "step": 21250 + }, + { + "epoch": 0.13582408034447951, + "grad_norm": 0.7700982689857483, + "learning_rate": 9.886739573460207e-05, + "loss": 1.0373, + "step": 21260 + }, + { + "epoch": 0.13588796749421822, + "grad_norm": 1.0912948846817017, + "learning_rate": 9.886633355084843e-05, + "loss": 0.9453, + "step": 21270 + }, + { + "epoch": 0.13595185464395693, + "grad_norm": 0.6350242495536804, + "learning_rate": 9.886527087496853e-05, + "loss": 0.7426, + "step": 21280 + }, + { + "epoch": 0.1360157417936956, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.886420770697309e-05, + "loss": 0.823, + "step": 21290 + }, + { + "epoch": 0.1360796289434343, + "grad_norm": 0.8976541757583618, + "learning_rate": 9.88631440468728e-05, + "loss": 0.9737, + "step": 21300 + }, + { + "epoch": 0.13614351609317302, + "grad_norm": 2.363358974456787, + "learning_rate": 9.886207989467837e-05, + "loss": 0.9999, + "step": 21310 + }, + { + "epoch": 0.13620740324291172, + "grad_norm": 1.8028829097747803, + "learning_rate": 9.886101525040055e-05, + "loss": 0.8832, + "step": 21320 + }, + { + "epoch": 0.13627129039265043, + "grad_norm": 1.440885305404663, + 
"learning_rate": 9.885995011405e-05, + "loss": 0.922, + "step": 21330 + }, + { + "epoch": 0.13633517754238914, + "grad_norm": 0.9806457161903381, + "learning_rate": 9.88588844856375e-05, + "loss": 0.9399, + "step": 21340 + }, + { + "epoch": 0.1363990646921278, + "grad_norm": 0.8839708566665649, + "learning_rate": 9.885781836517377e-05, + "loss": 0.7167, + "step": 21350 + }, + { + "epoch": 0.13646295184186652, + "grad_norm": 1.2500883340835571, + "learning_rate": 9.885675175266953e-05, + "loss": 1.177, + "step": 21360 + }, + { + "epoch": 0.13652683899160523, + "grad_norm": 1.0007693767547607, + "learning_rate": 9.885568464813554e-05, + "loss": 0.9975, + "step": 21370 + }, + { + "epoch": 0.13659072614134393, + "grad_norm": 0.8086827397346497, + "learning_rate": 9.885461705158254e-05, + "loss": 0.8139, + "step": 21380 + }, + { + "epoch": 0.13665461329108264, + "grad_norm": 0.947471022605896, + "learning_rate": 9.885354896302128e-05, + "loss": 1.1116, + "step": 21390 + }, + { + "epoch": 0.13671850044082134, + "grad_norm": 0.8959566950798035, + "learning_rate": 9.885248038246251e-05, + "loss": 1.1009, + "step": 21400 + }, + { + "epoch": 0.13678238759056002, + "grad_norm": 1.1560317277908325, + "learning_rate": 9.8851411309917e-05, + "loss": 0.6756, + "step": 21410 + }, + { + "epoch": 0.13684627474029873, + "grad_norm": 1.5213913917541504, + "learning_rate": 9.885034174539552e-05, + "loss": 0.7979, + "step": 21420 + }, + { + "epoch": 0.13691016189003744, + "grad_norm": 0.5531548261642456, + "learning_rate": 9.884927168890884e-05, + "loss": 0.9408, + "step": 21430 + }, + { + "epoch": 0.13697404903977614, + "grad_norm": 0.7810382843017578, + "learning_rate": 9.884820114046774e-05, + "loss": 0.8515, + "step": 21440 + }, + { + "epoch": 0.13703793618951485, + "grad_norm": 1.0958387851715088, + "learning_rate": 9.884713010008298e-05, + "loss": 0.869, + "step": 21450 + }, + { + "epoch": 0.13710182333925355, + "grad_norm": 0.4343324899673462, + "learning_rate": 9.884605856776537e-05, + "loss": 0.8596, + "step": 21460 + }, + { + "epoch": 0.13716571048899223, + "grad_norm": 0.9415945410728455, + "learning_rate": 9.884498654352567e-05, + "loss": 0.7679, + "step": 21470 + }, + { + "epoch": 0.13722959763873094, + "grad_norm": 0.6814182996749878, + "learning_rate": 9.884391402737473e-05, + "loss": 0.9849, + "step": 21480 + }, + { + "epoch": 0.13729348478846964, + "grad_norm": 0.8244829177856445, + "learning_rate": 9.88428410193233e-05, + "loss": 0.9204, + "step": 21490 + }, + { + "epoch": 0.13735737193820835, + "grad_norm": 0.5591076612472534, + "learning_rate": 9.884176751938222e-05, + "loss": 1.0907, + "step": 21500 + }, + { + "epoch": 0.13742125908794706, + "grad_norm": 0.6328865885734558, + "learning_rate": 9.884069352756228e-05, + "loss": 0.7108, + "step": 21510 + }, + { + "epoch": 0.13748514623768576, + "grad_norm": 0.6090789437294006, + "learning_rate": 9.883961904387431e-05, + "loss": 0.8593, + "step": 21520 + }, + { + "epoch": 0.13754903338742444, + "grad_norm": 1.0573042631149292, + "learning_rate": 9.88385440683291e-05, + "loss": 1.2391, + "step": 21530 + }, + { + "epoch": 0.13761292053716315, + "grad_norm": 1.0376691818237305, + "learning_rate": 9.883746860093752e-05, + "loss": 0.8013, + "step": 21540 + }, + { + "epoch": 0.13767680768690185, + "grad_norm": 0.9400094747543335, + "learning_rate": 9.883639264171038e-05, + "loss": 0.8789, + "step": 21550 + }, + { + "epoch": 0.13774069483664056, + "grad_norm": 2.4618563652038574, + "learning_rate": 9.88353161906585e-05, + "loss": 1.1602, + "step": 
21560 + }, + { + "epoch": 0.13780458198637927, + "grad_norm": 1.2091678380966187, + "learning_rate": 9.883423924779277e-05, + "loss": 0.7947, + "step": 21570 + }, + { + "epoch": 0.13786846913611797, + "grad_norm": 0.7721507549285889, + "learning_rate": 9.883316181312398e-05, + "loss": 1.0147, + "step": 21580 + }, + { + "epoch": 0.13793235628585665, + "grad_norm": 0.9228678941726685, + "learning_rate": 9.8832083886663e-05, + "loss": 0.8414, + "step": 21590 + }, + { + "epoch": 0.13799624343559536, + "grad_norm": 0.6696807742118835, + "learning_rate": 9.883100546842071e-05, + "loss": 1.0162, + "step": 21600 + }, + { + "epoch": 0.13806013058533406, + "grad_norm": 0.8186768889427185, + "learning_rate": 9.882992655840793e-05, + "loss": 0.8442, + "step": 21610 + }, + { + "epoch": 0.13812401773507277, + "grad_norm": 1.444062352180481, + "learning_rate": 9.882884715663557e-05, + "loss": 1.2117, + "step": 21620 + }, + { + "epoch": 0.13818790488481147, + "grad_norm": 0.7770470380783081, + "learning_rate": 9.882776726311445e-05, + "loss": 0.7657, + "step": 21630 + }, + { + "epoch": 0.13825179203455018, + "grad_norm": 1.0606368780136108, + "learning_rate": 9.882668687785548e-05, + "loss": 0.8434, + "step": 21640 + }, + { + "epoch": 0.13831567918428886, + "grad_norm": 1.0077322721481323, + "learning_rate": 9.882560600086954e-05, + "loss": 0.8445, + "step": 21650 + }, + { + "epoch": 0.13837956633402757, + "grad_norm": 0.9700446128845215, + "learning_rate": 9.882452463216749e-05, + "loss": 0.7823, + "step": 21660 + }, + { + "epoch": 0.13844345348376627, + "grad_norm": 0.7618522644042969, + "learning_rate": 9.882344277176025e-05, + "loss": 1.0858, + "step": 21670 + }, + { + "epoch": 0.13850734063350498, + "grad_norm": 0.5642924904823303, + "learning_rate": 9.882236041965871e-05, + "loss": 0.9753, + "step": 21680 + }, + { + "epoch": 0.13857122778324368, + "grad_norm": 0.6261829733848572, + "learning_rate": 9.882127757587377e-05, + "loss": 0.773, + "step": 21690 + }, + { + "epoch": 0.1386351149329824, + "grad_norm": 0.48715344071388245, + "learning_rate": 9.882019424041629e-05, + "loss": 0.7998, + "step": 21700 + }, + { + "epoch": 0.13869900208272107, + "grad_norm": 0.850307285785675, + "learning_rate": 9.881911041329726e-05, + "loss": 0.9011, + "step": 21710 + }, + { + "epoch": 0.13876288923245977, + "grad_norm": 0.7470149993896484, + "learning_rate": 9.881802609452753e-05, + "loss": 0.9515, + "step": 21720 + }, + { + "epoch": 0.13882677638219848, + "grad_norm": 0.6368236541748047, + "learning_rate": 9.881694128411804e-05, + "loss": 1.0206, + "step": 21730 + }, + { + "epoch": 0.1388906635319372, + "grad_norm": 1.1505577564239502, + "learning_rate": 9.881585598207973e-05, + "loss": 1.0826, + "step": 21740 + }, + { + "epoch": 0.1389545506816759, + "grad_norm": 2.4669744968414307, + "learning_rate": 9.881477018842352e-05, + "loss": 1.018, + "step": 21750 + }, + { + "epoch": 0.1390184378314146, + "grad_norm": 1.7851297855377197, + "learning_rate": 9.881368390316033e-05, + "loss": 0.7395, + "step": 21760 + }, + { + "epoch": 0.1390823249811533, + "grad_norm": 0.6467908620834351, + "learning_rate": 9.881259712630113e-05, + "loss": 0.9388, + "step": 21770 + }, + { + "epoch": 0.13914621213089198, + "grad_norm": 1.0224095582962036, + "learning_rate": 9.881150985785683e-05, + "loss": 0.9804, + "step": 21780 + }, + { + "epoch": 0.1392100992806307, + "grad_norm": 0.8423238396644592, + "learning_rate": 9.881042209783842e-05, + "loss": 0.8013, + "step": 21790 + }, + { + "epoch": 0.1392739864303694, + "grad_norm": 
0.8437933325767517, + "learning_rate": 9.880933384625681e-05, + "loss": 1.0403, + "step": 21800 + }, + { + "epoch": 0.1393378735801081, + "grad_norm": 0.8127179145812988, + "learning_rate": 9.880824510312301e-05, + "loss": 1.0857, + "step": 21810 + }, + { + "epoch": 0.1394017607298468, + "grad_norm": 0.7408185005187988, + "learning_rate": 9.880715586844793e-05, + "loss": 0.8628, + "step": 21820 + }, + { + "epoch": 0.1394656478795855, + "grad_norm": 0.8337761759757996, + "learning_rate": 9.880606614224256e-05, + "loss": 0.9279, + "step": 21830 + }, + { + "epoch": 0.1395295350293242, + "grad_norm": 0.7604190707206726, + "learning_rate": 9.880497592451791e-05, + "loss": 0.6789, + "step": 21840 + }, + { + "epoch": 0.1395934221790629, + "grad_norm": 0.8677889704704285, + "learning_rate": 9.880388521528491e-05, + "loss": 1.2008, + "step": 21850 + }, + { + "epoch": 0.1396573093288016, + "grad_norm": 0.6006574630737305, + "learning_rate": 9.880279401455459e-05, + "loss": 0.9166, + "step": 21860 + }, + { + "epoch": 0.1397211964785403, + "grad_norm": 1.9674246311187744, + "learning_rate": 9.880170232233789e-05, + "loss": 0.7204, + "step": 21870 + }, + { + "epoch": 0.13978508362827902, + "grad_norm": 1.377977967262268, + "learning_rate": 9.880061013864583e-05, + "loss": 0.9785, + "step": 21880 + }, + { + "epoch": 0.13984897077801772, + "grad_norm": 0.8513831496238708, + "learning_rate": 9.879951746348942e-05, + "loss": 1.1253, + "step": 21890 + }, + { + "epoch": 0.1399128579277564, + "grad_norm": 0.7017676830291748, + "learning_rate": 9.879842429687964e-05, + "loss": 0.6319, + "step": 21900 + }, + { + "epoch": 0.1399767450774951, + "grad_norm": 0.8190149068832397, + "learning_rate": 9.87973306388275e-05, + "loss": 1.0802, + "step": 21910 + }, + { + "epoch": 0.1400406322272338, + "grad_norm": 0.7550898790359497, + "learning_rate": 9.879623648934404e-05, + "loss": 0.8998, + "step": 21920 + }, + { + "epoch": 0.14010451937697252, + "grad_norm": 0.8115261793136597, + "learning_rate": 9.879514184844027e-05, + "loss": 0.6637, + "step": 21930 + }, + { + "epoch": 0.14016840652671123, + "grad_norm": 0.6252816319465637, + "learning_rate": 9.87940467161272e-05, + "loss": 0.9156, + "step": 21940 + }, + { + "epoch": 0.14023229367644993, + "grad_norm": 2.5343711376190186, + "learning_rate": 9.879295109241587e-05, + "loss": 1.0213, + "step": 21950 + }, + { + "epoch": 0.1402961808261886, + "grad_norm": 0.9597296714782715, + "learning_rate": 9.87918549773173e-05, + "loss": 0.7637, + "step": 21960 + }, + { + "epoch": 0.14036006797592732, + "grad_norm": 0.613199770450592, + "learning_rate": 9.879075837084255e-05, + "loss": 0.9528, + "step": 21970 + }, + { + "epoch": 0.14042395512566602, + "grad_norm": 1.902039885520935, + "learning_rate": 9.878966127300264e-05, + "loss": 0.7262, + "step": 21980 + }, + { + "epoch": 0.14048784227540473, + "grad_norm": 0.6681598424911499, + "learning_rate": 9.878856368380864e-05, + "loss": 0.9852, + "step": 21990 + }, + { + "epoch": 0.14055172942514343, + "grad_norm": 0.593425989151001, + "learning_rate": 9.87874656032716e-05, + "loss": 0.8866, + "step": 22000 + }, + { + "epoch": 0.14061561657488214, + "grad_norm": 0.8456883430480957, + "learning_rate": 9.878636703140257e-05, + "loss": 0.7837, + "step": 22010 + }, + { + "epoch": 0.14067950372462082, + "grad_norm": 0.719262421131134, + "learning_rate": 9.878526796821261e-05, + "loss": 1.0117, + "step": 22020 + }, + { + "epoch": 0.14074339087435953, + "grad_norm": 0.740960419178009, + "learning_rate": 9.878416841371282e-05, + "loss": 1.0046, 
+ "step": 22030 + }, + { + "epoch": 0.14080727802409823, + "grad_norm": 1.0368300676345825, + "learning_rate": 9.878306836791423e-05, + "loss": 0.8077, + "step": 22040 + }, + { + "epoch": 0.14087116517383694, + "grad_norm": 1.3289177417755127, + "learning_rate": 9.878196783082793e-05, + "loss": 0.777, + "step": 22050 + }, + { + "epoch": 0.14093505232357564, + "grad_norm": 0.9713805913925171, + "learning_rate": 9.878086680246504e-05, + "loss": 0.8098, + "step": 22060 + }, + { + "epoch": 0.14099893947331435, + "grad_norm": 0.5410668253898621, + "learning_rate": 9.877976528283661e-05, + "loss": 0.9304, + "step": 22070 + }, + { + "epoch": 0.14106282662305303, + "grad_norm": 0.8843280673027039, + "learning_rate": 9.877866327195373e-05, + "loss": 0.8307, + "step": 22080 + }, + { + "epoch": 0.14112671377279173, + "grad_norm": 1.040749430656433, + "learning_rate": 9.877756076982751e-05, + "loss": 0.8895, + "step": 22090 + }, + { + "epoch": 0.14119060092253044, + "grad_norm": 0.8764167428016663, + "learning_rate": 9.877645777646907e-05, + "loss": 0.9634, + "step": 22100 + }, + { + "epoch": 0.14125448807226915, + "grad_norm": 0.5217092633247375, + "learning_rate": 9.87753542918895e-05, + "loss": 1.0052, + "step": 22110 + }, + { + "epoch": 0.14131837522200785, + "grad_norm": 0.6405453681945801, + "learning_rate": 9.87742503160999e-05, + "loss": 0.9599, + "step": 22120 + }, + { + "epoch": 0.14138226237174656, + "grad_norm": 0.7412799000740051, + "learning_rate": 9.877314584911143e-05, + "loss": 0.7852, + "step": 22130 + }, + { + "epoch": 0.14144614952148524, + "grad_norm": 1.6060749292373657, + "learning_rate": 9.877204089093516e-05, + "loss": 1.2637, + "step": 22140 + }, + { + "epoch": 0.14151003667122394, + "grad_norm": 1.0910207033157349, + "learning_rate": 9.877093544158227e-05, + "loss": 0.8333, + "step": 22150 + }, + { + "epoch": 0.14157392382096265, + "grad_norm": 1.1602824926376343, + "learning_rate": 9.876982950106384e-05, + "loss": 0.9858, + "step": 22160 + }, + { + "epoch": 0.14163781097070136, + "grad_norm": 0.8228923082351685, + "learning_rate": 9.876872306939105e-05, + "loss": 1.0867, + "step": 22170 + }, + { + "epoch": 0.14170169812044006, + "grad_norm": 0.825602650642395, + "learning_rate": 9.876761614657504e-05, + "loss": 0.8261, + "step": 22180 + }, + { + "epoch": 0.14176558527017877, + "grad_norm": 0.7997944355010986, + "learning_rate": 9.876650873262692e-05, + "loss": 0.8914, + "step": 22190 + }, + { + "epoch": 0.14182947241991745, + "grad_norm": 0.6990909576416016, + "learning_rate": 9.876540082755788e-05, + "loss": 1.1852, + "step": 22200 + }, + { + "epoch": 0.14189335956965615, + "grad_norm": 0.7908971309661865, + "learning_rate": 9.876429243137906e-05, + "loss": 0.7917, + "step": 22210 + }, + { + "epoch": 0.14195724671939486, + "grad_norm": 1.2012591361999512, + "learning_rate": 9.876318354410163e-05, + "loss": 0.9249, + "step": 22220 + }, + { + "epoch": 0.14202113386913356, + "grad_norm": 0.7461243867874146, + "learning_rate": 9.876207416573677e-05, + "loss": 0.9312, + "step": 22230 + }, + { + "epoch": 0.14208502101887227, + "grad_norm": 0.8374635577201843, + "learning_rate": 9.876096429629563e-05, + "loss": 0.8613, + "step": 22240 + }, + { + "epoch": 0.14214890816861098, + "grad_norm": 0.6775134801864624, + "learning_rate": 9.875985393578938e-05, + "loss": 0.9284, + "step": 22250 + }, + { + "epoch": 0.14221279531834966, + "grad_norm": 1.069081425666809, + "learning_rate": 9.875874308422923e-05, + "loss": 0.809, + "step": 22260 + }, + { + "epoch": 0.14227668246808836, + 
"grad_norm": 0.8016782402992249, + "learning_rate": 9.875763174162635e-05, + "loss": 0.8151, + "step": 22270 + }, + { + "epoch": 0.14234056961782707, + "grad_norm": 0.7888844609260559, + "learning_rate": 9.875651990799196e-05, + "loss": 0.8556, + "step": 22280 + }, + { + "epoch": 0.14240445676756577, + "grad_norm": 0.8360929489135742, + "learning_rate": 9.875540758333721e-05, + "loss": 0.7994, + "step": 22290 + }, + { + "epoch": 0.14246834391730448, + "grad_norm": 0.520611584186554, + "learning_rate": 9.875429476767333e-05, + "loss": 0.8767, + "step": 22300 + }, + { + "epoch": 0.14253223106704319, + "grad_norm": 0.47477564215660095, + "learning_rate": 9.875318146101151e-05, + "loss": 0.7093, + "step": 22310 + }, + { + "epoch": 0.14259611821678186, + "grad_norm": 0.7633807063102722, + "learning_rate": 9.8752067663363e-05, + "loss": 1.1947, + "step": 22320 + }, + { + "epoch": 0.14266000536652057, + "grad_norm": 0.7206790447235107, + "learning_rate": 9.875095337473899e-05, + "loss": 0.8928, + "step": 22330 + }, + { + "epoch": 0.14272389251625928, + "grad_norm": 0.7361767888069153, + "learning_rate": 9.874983859515069e-05, + "loss": 0.8716, + "step": 22340 + }, + { + "epoch": 0.14278777966599798, + "grad_norm": 0.8034409880638123, + "learning_rate": 9.874872332460934e-05, + "loss": 0.9446, + "step": 22350 + }, + { + "epoch": 0.1428516668157367, + "grad_norm": 0.8999035954475403, + "learning_rate": 9.874760756312617e-05, + "loss": 1.0096, + "step": 22360 + }, + { + "epoch": 0.1429155539654754, + "grad_norm": 0.8220607042312622, + "learning_rate": 9.874649131071244e-05, + "loss": 0.9535, + "step": 22370 + }, + { + "epoch": 0.14297944111521407, + "grad_norm": 1.6880701780319214, + "learning_rate": 9.874537456737936e-05, + "loss": 0.9347, + "step": 22380 + }, + { + "epoch": 0.14304332826495278, + "grad_norm": 2.4227957725524902, + "learning_rate": 9.874425733313819e-05, + "loss": 0.9415, + "step": 22390 + }, + { + "epoch": 0.14310721541469149, + "grad_norm": 0.665111243724823, + "learning_rate": 9.874313960800017e-05, + "loss": 0.8991, + "step": 22400 + }, + { + "epoch": 0.1431711025644302, + "grad_norm": 1.0277364253997803, + "learning_rate": 9.874202139197657e-05, + "loss": 1.0399, + "step": 22410 + }, + { + "epoch": 0.1432349897141689, + "grad_norm": 0.8064048290252686, + "learning_rate": 9.874090268507866e-05, + "loss": 0.9161, + "step": 22420 + }, + { + "epoch": 0.1432988768639076, + "grad_norm": 0.5607860684394836, + "learning_rate": 9.873978348731767e-05, + "loss": 0.8696, + "step": 22430 + }, + { + "epoch": 0.14336276401364628, + "grad_norm": 0.6954864263534546, + "learning_rate": 9.873866379870492e-05, + "loss": 0.6301, + "step": 22440 + }, + { + "epoch": 0.143426651163385, + "grad_norm": 0.675815999507904, + "learning_rate": 9.873754361925162e-05, + "loss": 0.9119, + "step": 22450 + }, + { + "epoch": 0.1434905383131237, + "grad_norm": 1.266095757484436, + "learning_rate": 9.873642294896913e-05, + "loss": 0.9423, + "step": 22460 + }, + { + "epoch": 0.1435544254628624, + "grad_norm": 0.8914671540260315, + "learning_rate": 9.873530178786868e-05, + "loss": 1.034, + "step": 22470 + }, + { + "epoch": 0.1436183126126011, + "grad_norm": 0.953437864780426, + "learning_rate": 9.873418013596159e-05, + "loss": 0.9487, + "step": 22480 + }, + { + "epoch": 0.1436821997623398, + "grad_norm": 0.6912809014320374, + "learning_rate": 9.873305799325914e-05, + "loss": 1.1522, + "step": 22490 + }, + { + "epoch": 0.1437460869120785, + "grad_norm": 0.6595206260681152, + "learning_rate": 9.873193535977263e-05, + 
"loss": 0.977, + "step": 22500 + }, + { + "epoch": 0.1438099740618172, + "grad_norm": 0.9730925559997559, + "learning_rate": 9.873081223551338e-05, + "loss": 0.7952, + "step": 22510 + }, + { + "epoch": 0.1438738612115559, + "grad_norm": 4.339688777923584, + "learning_rate": 9.872968862049268e-05, + "loss": 0.9139, + "step": 22520 + }, + { + "epoch": 0.1439377483612946, + "grad_norm": 0.939578652381897, + "learning_rate": 9.872856451472188e-05, + "loss": 1.0464, + "step": 22530 + }, + { + "epoch": 0.14400163551103332, + "grad_norm": 0.8998389840126038, + "learning_rate": 9.872743991821227e-05, + "loss": 0.9492, + "step": 22540 + }, + { + "epoch": 0.14406552266077202, + "grad_norm": 0.7495961785316467, + "learning_rate": 9.872631483097518e-05, + "loss": 0.8357, + "step": 22550 + }, + { + "epoch": 0.1441294098105107, + "grad_norm": 0.7158836126327515, + "learning_rate": 9.872518925302195e-05, + "loss": 0.6346, + "step": 22560 + }, + { + "epoch": 0.1441932969602494, + "grad_norm": 1.3562219142913818, + "learning_rate": 9.872406318436391e-05, + "loss": 0.7683, + "step": 22570 + }, + { + "epoch": 0.1442571841099881, + "grad_norm": 2.4515798091888428, + "learning_rate": 9.872293662501239e-05, + "loss": 0.93, + "step": 22580 + }, + { + "epoch": 0.14432107125972682, + "grad_norm": 0.6932923197746277, + "learning_rate": 9.872180957497876e-05, + "loss": 0.8557, + "step": 22590 + }, + { + "epoch": 0.14438495840946552, + "grad_norm": 0.8083714842796326, + "learning_rate": 9.872068203427434e-05, + "loss": 0.9603, + "step": 22600 + }, + { + "epoch": 0.14444884555920423, + "grad_norm": 0.6430138945579529, + "learning_rate": 9.871955400291052e-05, + "loss": 1.0151, + "step": 22610 + }, + { + "epoch": 0.14451273270894294, + "grad_norm": 0.5157865881919861, + "learning_rate": 9.871842548089864e-05, + "loss": 1.0402, + "step": 22620 + }, + { + "epoch": 0.14457661985868162, + "grad_norm": 0.7073084115982056, + "learning_rate": 9.871729646825008e-05, + "loss": 1.1601, + "step": 22630 + }, + { + "epoch": 0.14464050700842032, + "grad_norm": 0.8356124758720398, + "learning_rate": 9.871616696497618e-05, + "loss": 0.7882, + "step": 22640 + }, + { + "epoch": 0.14470439415815903, + "grad_norm": 0.7543877959251404, + "learning_rate": 9.871503697108833e-05, + "loss": 1.1977, + "step": 22650 + }, + { + "epoch": 0.14476828130789773, + "grad_norm": 0.5048431158065796, + "learning_rate": 9.871390648659793e-05, + "loss": 0.6942, + "step": 22660 + }, + { + "epoch": 0.14483216845763644, + "grad_norm": 0.8877227306365967, + "learning_rate": 9.871277551151635e-05, + "loss": 1.0161, + "step": 22670 + }, + { + "epoch": 0.14489605560737515, + "grad_norm": 1.6515774726867676, + "learning_rate": 9.871164404585496e-05, + "loss": 0.7984, + "step": 22680 + }, + { + "epoch": 0.14495994275711382, + "grad_norm": 0.7503309845924377, + "learning_rate": 9.871051208962518e-05, + "loss": 1.4356, + "step": 22690 + }, + { + "epoch": 0.14502382990685253, + "grad_norm": 0.5918260216712952, + "learning_rate": 9.87093796428384e-05, + "loss": 0.8047, + "step": 22700 + }, + { + "epoch": 0.14508771705659124, + "grad_norm": 0.7670891880989075, + "learning_rate": 9.870824670550603e-05, + "loss": 1.0355, + "step": 22710 + }, + { + "epoch": 0.14515160420632994, + "grad_norm": 0.7030889987945557, + "learning_rate": 9.870711327763947e-05, + "loss": 0.9419, + "step": 22720 + }, + { + "epoch": 0.14521549135606865, + "grad_norm": 1.9804078340530396, + "learning_rate": 9.870597935925016e-05, + "loss": 1.0519, + "step": 22730 + }, + { + "epoch": 
0.14527937850580736, + "grad_norm": 0.5866715312004089, + "learning_rate": 9.870484495034948e-05, + "loss": 0.8467, + "step": 22740 + }, + { + "epoch": 0.14534326565554603, + "grad_norm": 1.0047521591186523, + "learning_rate": 9.87037100509489e-05, + "loss": 0.9234, + "step": 22750 + }, + { + "epoch": 0.14540715280528474, + "grad_norm": 0.8460586667060852, + "learning_rate": 9.87025746610598e-05, + "loss": 0.98, + "step": 22760 + }, + { + "epoch": 0.14547103995502345, + "grad_norm": 0.6952506303787231, + "learning_rate": 9.870143878069364e-05, + "loss": 0.8913, + "step": 22770 + }, + { + "epoch": 0.14553492710476215, + "grad_norm": 0.8370442986488342, + "learning_rate": 9.870030240986188e-05, + "loss": 0.7564, + "step": 22780 + }, + { + "epoch": 0.14559881425450086, + "grad_norm": 2.1772940158843994, + "learning_rate": 9.869916554857593e-05, + "loss": 1.0058, + "step": 22790 + }, + { + "epoch": 0.14566270140423956, + "grad_norm": 1.9751546382904053, + "learning_rate": 9.869802819684726e-05, + "loss": 0.8494, + "step": 22800 + }, + { + "epoch": 0.14572658855397824, + "grad_norm": 1.1138042211532593, + "learning_rate": 9.86968903546873e-05, + "loss": 0.745, + "step": 22810 + }, + { + "epoch": 0.14579047570371695, + "grad_norm": 0.9470332264900208, + "learning_rate": 9.869575202210754e-05, + "loss": 0.9222, + "step": 22820 + }, + { + "epoch": 0.14585436285345565, + "grad_norm": 0.6957728862762451, + "learning_rate": 9.869461319911944e-05, + "loss": 1.0055, + "step": 22830 + }, + { + "epoch": 0.14591825000319436, + "grad_norm": 0.7304112911224365, + "learning_rate": 9.869347388573443e-05, + "loss": 0.8063, + "step": 22840 + }, + { + "epoch": 0.14598213715293307, + "grad_norm": 0.4859442710876465, + "learning_rate": 9.869233408196403e-05, + "loss": 0.7749, + "step": 22850 + }, + { + "epoch": 0.14604602430267177, + "grad_norm": 0.6382431387901306, + "learning_rate": 9.86911937878197e-05, + "loss": 0.9488, + "step": 22860 + }, + { + "epoch": 0.14610991145241045, + "grad_norm": 0.6626219153404236, + "learning_rate": 9.869005300331291e-05, + "loss": 0.6605, + "step": 22870 + }, + { + "epoch": 0.14617379860214916, + "grad_norm": 0.9865225553512573, + "learning_rate": 9.868891172845519e-05, + "loss": 1.0758, + "step": 22880 + }, + { + "epoch": 0.14623768575188786, + "grad_norm": 0.7838436365127563, + "learning_rate": 9.868776996325799e-05, + "loss": 1.0838, + "step": 22890 + }, + { + "epoch": 0.14630157290162657, + "grad_norm": 0.7881513833999634, + "learning_rate": 9.868662770773282e-05, + "loss": 0.8395, + "step": 22900 + }, + { + "epoch": 0.14636546005136528, + "grad_norm": 0.6249982118606567, + "learning_rate": 9.86854849618912e-05, + "loss": 1.0855, + "step": 22910 + }, + { + "epoch": 0.14642934720110398, + "grad_norm": 0.7879114151000977, + "learning_rate": 9.868434172574462e-05, + "loss": 1.0791, + "step": 22920 + }, + { + "epoch": 0.14649323435084266, + "grad_norm": 0.872688353061676, + "learning_rate": 9.86831979993046e-05, + "loss": 1.189, + "step": 22930 + }, + { + "epoch": 0.14655712150058137, + "grad_norm": 0.6431063413619995, + "learning_rate": 9.868205378258266e-05, + "loss": 1.0102, + "step": 22940 + }, + { + "epoch": 0.14662100865032007, + "grad_norm": 0.9336161017417908, + "learning_rate": 9.868090907559033e-05, + "loss": 1.1622, + "step": 22950 + }, + { + "epoch": 0.14668489580005878, + "grad_norm": 1.0055698156356812, + "learning_rate": 9.867976387833913e-05, + "loss": 0.8623, + "step": 22960 + }, + { + "epoch": 0.14674878294979748, + "grad_norm": 1.0225908756256104, + 
"learning_rate": 9.867861819084059e-05, + "loss": 0.7738, + "step": 22970 + }, + { + "epoch": 0.1468126700995362, + "grad_norm": 0.9196385741233826, + "learning_rate": 9.867747201310626e-05, + "loss": 0.8153, + "step": 22980 + }, + { + "epoch": 0.14687655724927487, + "grad_norm": 1.0798165798187256, + "learning_rate": 9.867632534514766e-05, + "loss": 0.9407, + "step": 22990 + }, + { + "epoch": 0.14694044439901358, + "grad_norm": 0.8176427483558655, + "learning_rate": 9.867517818697636e-05, + "loss": 0.9316, + "step": 23000 + }, + { + "epoch": 0.14700433154875228, + "grad_norm": 1.2678016424179077, + "learning_rate": 9.867403053860391e-05, + "loss": 0.7385, + "step": 23010 + }, + { + "epoch": 0.147068218698491, + "grad_norm": 1.1173145771026611, + "learning_rate": 9.867288240004185e-05, + "loss": 0.9177, + "step": 23020 + }, + { + "epoch": 0.1471321058482297, + "grad_norm": 0.6615016460418701, + "learning_rate": 9.867173377130177e-05, + "loss": 0.9355, + "step": 23030 + }, + { + "epoch": 0.1471959929979684, + "grad_norm": 0.5626130104064941, + "learning_rate": 9.867058465239522e-05, + "loss": 0.73, + "step": 23040 + }, + { + "epoch": 0.14725988014770708, + "grad_norm": 0.9644745588302612, + "learning_rate": 9.866943504333377e-05, + "loss": 0.8876, + "step": 23050 + }, + { + "epoch": 0.14732376729744578, + "grad_norm": 1.4023088216781616, + "learning_rate": 9.866828494412901e-05, + "loss": 0.8923, + "step": 23060 + }, + { + "epoch": 0.1473876544471845, + "grad_norm": 0.6760227680206299, + "learning_rate": 9.866713435479252e-05, + "loss": 0.8072, + "step": 23070 + }, + { + "epoch": 0.1474515415969232, + "grad_norm": 0.9531158804893494, + "learning_rate": 9.866598327533589e-05, + "loss": 0.8004, + "step": 23080 + }, + { + "epoch": 0.1475154287466619, + "grad_norm": 0.6163201928138733, + "learning_rate": 9.866483170577069e-05, + "loss": 0.9639, + "step": 23090 + }, + { + "epoch": 0.1475793158964006, + "grad_norm": 0.6841567158699036, + "learning_rate": 9.866367964610854e-05, + "loss": 1.0902, + "step": 23100 + }, + { + "epoch": 0.1476432030461393, + "grad_norm": 0.8613043427467346, + "learning_rate": 9.866252709636104e-05, + "loss": 0.8745, + "step": 23110 + }, + { + "epoch": 0.147707090195878, + "grad_norm": 0.9095843434333801, + "learning_rate": 9.86613740565398e-05, + "loss": 0.8784, + "step": 23120 + }, + { + "epoch": 0.1477709773456167, + "grad_norm": 0.6751396059989929, + "learning_rate": 9.86602205266564e-05, + "loss": 1.0972, + "step": 23130 + }, + { + "epoch": 0.1478348644953554, + "grad_norm": 0.7569636106491089, + "learning_rate": 9.86590665067225e-05, + "loss": 1.0461, + "step": 23140 + }, + { + "epoch": 0.1478987516450941, + "grad_norm": 1.0290982723236084, + "learning_rate": 9.86579119967497e-05, + "loss": 1.112, + "step": 23150 + }, + { + "epoch": 0.14796263879483282, + "grad_norm": 0.6011145114898682, + "learning_rate": 9.865675699674964e-05, + "loss": 1.0506, + "step": 23160 + }, + { + "epoch": 0.1480265259445715, + "grad_norm": 0.8810587525367737, + "learning_rate": 9.865560150673392e-05, + "loss": 0.9679, + "step": 23170 + }, + { + "epoch": 0.1480904130943102, + "grad_norm": 0.7942286133766174, + "learning_rate": 9.865444552671422e-05, + "loss": 0.8441, + "step": 23180 + }, + { + "epoch": 0.1481543002440489, + "grad_norm": 1.2883180379867554, + "learning_rate": 9.865328905670215e-05, + "loss": 0.9123, + "step": 23190 + }, + { + "epoch": 0.14821818739378761, + "grad_norm": 0.9160734415054321, + "learning_rate": 9.865213209670939e-05, + "loss": 0.8103, + "step": 23200 + }, + { 
+ "epoch": 0.14828207454352632, + "grad_norm": 0.5292953848838806, + "learning_rate": 9.865097464674754e-05, + "loss": 0.7631, + "step": 23210 + }, + { + "epoch": 0.14834596169326503, + "grad_norm": 1.5886908769607544, + "learning_rate": 9.86498167068283e-05, + "loss": 0.9782, + "step": 23220 + }, + { + "epoch": 0.1484098488430037, + "grad_norm": 1.2354532480239868, + "learning_rate": 9.864865827696333e-05, + "loss": 1.0666, + "step": 23230 + }, + { + "epoch": 0.1484737359927424, + "grad_norm": 0.902732789516449, + "learning_rate": 9.864749935716427e-05, + "loss": 0.8587, + "step": 23240 + }, + { + "epoch": 0.14853762314248112, + "grad_norm": 0.9489061236381531, + "learning_rate": 9.86463399474428e-05, + "loss": 0.9015, + "step": 23250 + }, + { + "epoch": 0.14860151029221982, + "grad_norm": 1.0594868659973145, + "learning_rate": 9.86451800478106e-05, + "loss": 0.9639, + "step": 23260 + }, + { + "epoch": 0.14866539744195853, + "grad_norm": 0.9709058403968811, + "learning_rate": 9.864401965827936e-05, + "loss": 0.9575, + "step": 23270 + }, + { + "epoch": 0.14872928459169724, + "grad_norm": 0.7420225143432617, + "learning_rate": 9.864285877886076e-05, + "loss": 0.8139, + "step": 23280 + }, + { + "epoch": 0.14879317174143591, + "grad_norm": 1.2411167621612549, + "learning_rate": 9.86416974095665e-05, + "loss": 0.9154, + "step": 23290 + }, + { + "epoch": 0.14885705889117462, + "grad_norm": 0.9969791769981384, + "learning_rate": 9.864053555040826e-05, + "loss": 0.7712, + "step": 23300 + }, + { + "epoch": 0.14892094604091333, + "grad_norm": 0.7000773549079895, + "learning_rate": 9.863937320139774e-05, + "loss": 0.9034, + "step": 23310 + }, + { + "epoch": 0.14898483319065203, + "grad_norm": 0.8266654014587402, + "learning_rate": 9.863821036254666e-05, + "loss": 0.9289, + "step": 23320 + }, + { + "epoch": 0.14904872034039074, + "grad_norm": 0.5291149616241455, + "learning_rate": 9.863704703386671e-05, + "loss": 0.8965, + "step": 23330 + }, + { + "epoch": 0.14911260749012945, + "grad_norm": 1.1645135879516602, + "learning_rate": 9.863588321536964e-05, + "loss": 1.0616, + "step": 23340 + }, + { + "epoch": 0.14917649463986812, + "grad_norm": 0.7084513902664185, + "learning_rate": 9.863471890706714e-05, + "loss": 1.0098, + "step": 23350 + }, + { + "epoch": 0.14924038178960683, + "grad_norm": 0.6941312551498413, + "learning_rate": 9.863355410897095e-05, + "loss": 0.9369, + "step": 23360 + }, + { + "epoch": 0.14930426893934554, + "grad_norm": 1.0156537294387817, + "learning_rate": 9.863238882109278e-05, + "loss": 1.1076, + "step": 23370 + }, + { + "epoch": 0.14936815608908424, + "grad_norm": 0.8023911714553833, + "learning_rate": 9.863122304344439e-05, + "loss": 0.8709, + "step": 23380 + }, + { + "epoch": 0.14943204323882295, + "grad_norm": 0.8865915536880493, + "learning_rate": 9.863005677603752e-05, + "loss": 0.8393, + "step": 23390 + }, + { + "epoch": 0.14949593038856165, + "grad_norm": 1.4520982503890991, + "learning_rate": 9.86288900188839e-05, + "loss": 1.138, + "step": 23400 + }, + { + "epoch": 0.14955981753830033, + "grad_norm": 1.1401234865188599, + "learning_rate": 9.862772277199529e-05, + "loss": 1.1788, + "step": 23410 + }, + { + "epoch": 0.14962370468803904, + "grad_norm": 0.632628858089447, + "learning_rate": 9.862655503538344e-05, + "loss": 0.8879, + "step": 23420 + }, + { + "epoch": 0.14968759183777774, + "grad_norm": 0.6416946649551392, + "learning_rate": 9.862538680906012e-05, + "loss": 0.8936, + "step": 23430 + }, + { + "epoch": 0.14975147898751645, + "grad_norm": 0.6808968186378479, + 
"learning_rate": 9.862421809303708e-05, + "loss": 0.8778, + "step": 23440 + }, + { + "epoch": 0.14981536613725516, + "grad_norm": 0.9920696020126343, + "learning_rate": 9.86230488873261e-05, + "loss": 0.8278, + "step": 23450 + }, + { + "epoch": 0.14987925328699386, + "grad_norm": 0.8314083218574524, + "learning_rate": 9.862187919193895e-05, + "loss": 0.9445, + "step": 23460 + }, + { + "epoch": 0.14994314043673257, + "grad_norm": 0.7839555740356445, + "learning_rate": 9.862070900688742e-05, + "loss": 0.9105, + "step": 23470 + }, + { + "epoch": 0.15000702758647125, + "grad_norm": 0.7194756865501404, + "learning_rate": 9.861953833218329e-05, + "loss": 0.8104, + "step": 23480 + }, + { + "epoch": 0.15007091473620995, + "grad_norm": 0.8320297002792358, + "learning_rate": 9.861836716783834e-05, + "loss": 0.9076, + "step": 23490 + }, + { + "epoch": 0.15013480188594866, + "grad_norm": 0.744303822517395, + "learning_rate": 9.861719551386437e-05, + "loss": 0.7775, + "step": 23500 + }, + { + "epoch": 0.15019868903568737, + "grad_norm": 1.1499621868133545, + "learning_rate": 9.861602337027318e-05, + "loss": 1.0126, + "step": 23510 + }, + { + "epoch": 0.15026257618542607, + "grad_norm": 0.893481969833374, + "learning_rate": 9.861485073707658e-05, + "loss": 0.9876, + "step": 23520 + }, + { + "epoch": 0.15032646333516478, + "grad_norm": 1.0423784255981445, + "learning_rate": 9.861367761428638e-05, + "loss": 0.831, + "step": 23530 + }, + { + "epoch": 0.15039035048490346, + "grad_norm": 0.7774150371551514, + "learning_rate": 9.861250400191438e-05, + "loss": 0.8752, + "step": 23540 + }, + { + "epoch": 0.15045423763464216, + "grad_norm": 0.9276893138885498, + "learning_rate": 9.861132989997242e-05, + "loss": 0.815, + "step": 23550 + }, + { + "epoch": 0.15051812478438087, + "grad_norm": 1.5479460954666138, + "learning_rate": 9.86101553084723e-05, + "loss": 1.1705, + "step": 23560 + }, + { + "epoch": 0.15058201193411958, + "grad_norm": 1.3702467679977417, + "learning_rate": 9.860898022742587e-05, + "loss": 1.1229, + "step": 23570 + }, + { + "epoch": 0.15064589908385828, + "grad_norm": 0.8833318948745728, + "learning_rate": 9.860780465684497e-05, + "loss": 0.8501, + "step": 23580 + }, + { + "epoch": 0.150709786233597, + "grad_norm": 0.8857479691505432, + "learning_rate": 9.860662859674139e-05, + "loss": 0.9028, + "step": 23590 + }, + { + "epoch": 0.15077367338333567, + "grad_norm": 0.9464370608329773, + "learning_rate": 9.860545204712703e-05, + "loss": 0.8605, + "step": 23600 + }, + { + "epoch": 0.15083756053307437, + "grad_norm": 0.9219076037406921, + "learning_rate": 9.860427500801372e-05, + "loss": 0.8217, + "step": 23610 + }, + { + "epoch": 0.15090144768281308, + "grad_norm": 2.4392945766448975, + "learning_rate": 9.860309747941333e-05, + "loss": 0.8927, + "step": 23620 + }, + { + "epoch": 0.15096533483255178, + "grad_norm": 1.1871190071105957, + "learning_rate": 9.860191946133766e-05, + "loss": 1.1577, + "step": 23630 + }, + { + "epoch": 0.1510292219822905, + "grad_norm": 1.2772961854934692, + "learning_rate": 9.860074095379863e-05, + "loss": 0.9204, + "step": 23640 + }, + { + "epoch": 0.1510931091320292, + "grad_norm": 0.6214377284049988, + "learning_rate": 9.859956195680811e-05, + "loss": 0.8562, + "step": 23650 + }, + { + "epoch": 0.15115699628176787, + "grad_norm": 0.7957346439361572, + "learning_rate": 9.859838247037794e-05, + "loss": 0.7878, + "step": 23660 + }, + { + "epoch": 0.15122088343150658, + "grad_norm": 0.7047122716903687, + "learning_rate": 9.859720249452003e-05, + "loss": 0.9215, + "step": 
23670 + }, + { + "epoch": 0.1512847705812453, + "grad_norm": 0.8219524025917053, + "learning_rate": 9.859602202924623e-05, + "loss": 0.884, + "step": 23680 + }, + { + "epoch": 0.151348657730984, + "grad_norm": 0.844274640083313, + "learning_rate": 9.859484107456846e-05, + "loss": 0.8565, + "step": 23690 + }, + { + "epoch": 0.1514125448807227, + "grad_norm": 0.8894696831703186, + "learning_rate": 9.859365963049858e-05, + "loss": 0.8738, + "step": 23700 + }, + { + "epoch": 0.1514764320304614, + "grad_norm": 1.032109260559082, + "learning_rate": 9.859247769704854e-05, + "loss": 0.7034, + "step": 23710 + }, + { + "epoch": 0.15154031918020008, + "grad_norm": 0.8953695297241211, + "learning_rate": 9.859129527423019e-05, + "loss": 0.9061, + "step": 23720 + }, + { + "epoch": 0.1516042063299388, + "grad_norm": 0.7908507585525513, + "learning_rate": 9.859011236205547e-05, + "loss": 0.9427, + "step": 23730 + }, + { + "epoch": 0.1516680934796775, + "grad_norm": 0.7494611144065857, + "learning_rate": 9.858892896053626e-05, + "loss": 0.7095, + "step": 23740 + }, + { + "epoch": 0.1517319806294162, + "grad_norm": 0.7644729614257812, + "learning_rate": 9.858774506968451e-05, + "loss": 0.9053, + "step": 23750 + }, + { + "epoch": 0.1517958677791549, + "grad_norm": 1.1524786949157715, + "learning_rate": 9.858656068951215e-05, + "loss": 0.7965, + "step": 23760 + }, + { + "epoch": 0.15185975492889361, + "grad_norm": 0.8188411593437195, + "learning_rate": 9.858537582003107e-05, + "loss": 0.986, + "step": 23770 + }, + { + "epoch": 0.1519236420786323, + "grad_norm": 0.9521570801734924, + "learning_rate": 9.858419046125322e-05, + "loss": 0.791, + "step": 23780 + }, + { + "epoch": 0.151987529228371, + "grad_norm": 1.1801695823669434, + "learning_rate": 9.858300461319057e-05, + "loss": 0.8084, + "step": 23790 + }, + { + "epoch": 0.1520514163781097, + "grad_norm": 0.66313236951828, + "learning_rate": 9.8581818275855e-05, + "loss": 1.0134, + "step": 23800 + }, + { + "epoch": 0.1521153035278484, + "grad_norm": 0.7492579817771912, + "learning_rate": 9.85806314492585e-05, + "loss": 0.892, + "step": 23810 + }, + { + "epoch": 0.15217919067758712, + "grad_norm": 0.7110322713851929, + "learning_rate": 9.857944413341304e-05, + "loss": 1.1158, + "step": 23820 + }, + { + "epoch": 0.15224307782732582, + "grad_norm": 1.010519027709961, + "learning_rate": 9.857825632833053e-05, + "loss": 0.9537, + "step": 23830 + }, + { + "epoch": 0.1523069649770645, + "grad_norm": 0.8604142069816589, + "learning_rate": 9.857706803402294e-05, + "loss": 0.936, + "step": 23840 + }, + { + "epoch": 0.1523708521268032, + "grad_norm": 0.5838251113891602, + "learning_rate": 9.857587925050226e-05, + "loss": 0.9363, + "step": 23850 + }, + { + "epoch": 0.15243473927654191, + "grad_norm": 0.7778534889221191, + "learning_rate": 9.857468997778046e-05, + "loss": 1.0045, + "step": 23860 + }, + { + "epoch": 0.15249862642628062, + "grad_norm": 1.257494568824768, + "learning_rate": 9.85735002158695e-05, + "loss": 0.847, + "step": 23870 + }, + { + "epoch": 0.15256251357601933, + "grad_norm": 0.7079510688781738, + "learning_rate": 9.857230996478137e-05, + "loss": 1.0672, + "step": 23880 + }, + { + "epoch": 0.15262640072575803, + "grad_norm": 2.4514129161834717, + "learning_rate": 9.857111922452807e-05, + "loss": 0.7693, + "step": 23890 + }, + { + "epoch": 0.1526902878754967, + "grad_norm": 0.5904504060745239, + "learning_rate": 9.856992799512157e-05, + "loss": 0.9016, + "step": 23900 + }, + { + "epoch": 0.15275417502523542, + "grad_norm": 0.7344809770584106, + 
"learning_rate": 9.856873627657387e-05, + "loss": 0.7255, + "step": 23910 + }, + { + "epoch": 0.15281806217497412, + "grad_norm": 1.4561502933502197, + "learning_rate": 9.856754406889698e-05, + "loss": 0.9038, + "step": 23920 + }, + { + "epoch": 0.15288194932471283, + "grad_norm": 0.8599551916122437, + "learning_rate": 9.85663513721029e-05, + "loss": 0.7992, + "step": 23930 + }, + { + "epoch": 0.15294583647445154, + "grad_norm": 2.2323386669158936, + "learning_rate": 9.856515818620367e-05, + "loss": 0.9681, + "step": 23940 + }, + { + "epoch": 0.15300972362419024, + "grad_norm": 1.3280889987945557, + "learning_rate": 9.856396451121125e-05, + "loss": 0.6727, + "step": 23950 + }, + { + "epoch": 0.15307361077392892, + "grad_norm": 1.3691116571426392, + "learning_rate": 9.856277034713772e-05, + "loss": 0.8038, + "step": 23960 + }, + { + "epoch": 0.15313749792366763, + "grad_norm": 1.1116257905960083, + "learning_rate": 9.856157569399507e-05, + "loss": 0.749, + "step": 23970 + }, + { + "epoch": 0.15320138507340633, + "grad_norm": 1.1849030256271362, + "learning_rate": 9.856038055179535e-05, + "loss": 0.9773, + "step": 23980 + }, + { + "epoch": 0.15326527222314504, + "grad_norm": 0.88172447681427, + "learning_rate": 9.855918492055057e-05, + "loss": 1.1426, + "step": 23990 + }, + { + "epoch": 0.15332915937288374, + "grad_norm": 1.102968454360962, + "learning_rate": 9.855798880027279e-05, + "loss": 0.9212, + "step": 24000 + }, + { + "epoch": 0.15339304652262245, + "grad_norm": 1.179286003112793, + "learning_rate": 9.855679219097407e-05, + "loss": 0.9407, + "step": 24010 + }, + { + "epoch": 0.15345693367236113, + "grad_norm": 0.7198648452758789, + "learning_rate": 9.855559509266644e-05, + "loss": 0.9663, + "step": 24020 + }, + { + "epoch": 0.15352082082209983, + "grad_norm": 0.9259359240531921, + "learning_rate": 9.855439750536195e-05, + "loss": 1.0747, + "step": 24030 + }, + { + "epoch": 0.15358470797183854, + "grad_norm": 0.9067502021789551, + "learning_rate": 9.855319942907268e-05, + "loss": 0.7373, + "step": 24040 + }, + { + "epoch": 0.15364859512157725, + "grad_norm": 0.6593869924545288, + "learning_rate": 9.855200086381068e-05, + "loss": 0.9685, + "step": 24050 + }, + { + "epoch": 0.15371248227131595, + "grad_norm": 0.810939610004425, + "learning_rate": 9.855080180958803e-05, + "loss": 0.7862, + "step": 24060 + }, + { + "epoch": 0.15377636942105466, + "grad_norm": 0.8420569896697998, + "learning_rate": 9.854960226641681e-05, + "loss": 0.8562, + "step": 24070 + }, + { + "epoch": 0.15384025657079334, + "grad_norm": 0.7327421307563782, + "learning_rate": 9.854840223430909e-05, + "loss": 0.852, + "step": 24080 + }, + { + "epoch": 0.15390414372053204, + "grad_norm": 0.8360452055931091, + "learning_rate": 9.854720171327696e-05, + "loss": 0.9425, + "step": 24090 + }, + { + "epoch": 0.15396803087027075, + "grad_norm": 0.6557414531707764, + "learning_rate": 9.854600070333251e-05, + "loss": 0.754, + "step": 24100 + }, + { + "epoch": 0.15403191802000946, + "grad_norm": 0.9082469940185547, + "learning_rate": 9.854479920448782e-05, + "loss": 0.9427, + "step": 24110 + }, + { + "epoch": 0.15409580516974816, + "grad_norm": 0.7796029448509216, + "learning_rate": 9.854359721675503e-05, + "loss": 0.8438, + "step": 24120 + }, + { + "epoch": 0.15415969231948687, + "grad_norm": 0.6190805435180664, + "learning_rate": 9.85423947401462e-05, + "loss": 0.8237, + "step": 24130 + }, + { + "epoch": 0.15422357946922555, + "grad_norm": 0.813653290271759, + "learning_rate": 9.854119177467347e-05, + "loss": 0.8553, + "step": 
24140 + }, + { + "epoch": 0.15428746661896425, + "grad_norm": 0.8362258672714233, + "learning_rate": 9.853998832034894e-05, + "loss": 0.9488, + "step": 24150 + }, + { + "epoch": 0.15435135376870296, + "grad_norm": 1.0680490732192993, + "learning_rate": 9.853878437718473e-05, + "loss": 0.9838, + "step": 24160 + }, + { + "epoch": 0.15441524091844167, + "grad_norm": 1.0183037519454956, + "learning_rate": 9.853757994519299e-05, + "loss": 0.6685, + "step": 24170 + }, + { + "epoch": 0.15447912806818037, + "grad_norm": 0.7617247700691223, + "learning_rate": 9.853637502438582e-05, + "loss": 0.8784, + "step": 24180 + }, + { + "epoch": 0.15454301521791908, + "grad_norm": 0.633660614490509, + "learning_rate": 9.853516961477535e-05, + "loss": 0.8068, + "step": 24190 + }, + { + "epoch": 0.15460690236765776, + "grad_norm": 0.8987011313438416, + "learning_rate": 9.853396371637374e-05, + "loss": 0.6322, + "step": 24200 + }, + { + "epoch": 0.15467078951739646, + "grad_norm": 0.8973355889320374, + "learning_rate": 9.853275732919314e-05, + "loss": 1.2822, + "step": 24210 + }, + { + "epoch": 0.15473467666713517, + "grad_norm": 1.284421682357788, + "learning_rate": 9.853155045324567e-05, + "loss": 0.9294, + "step": 24220 + }, + { + "epoch": 0.15479856381687387, + "grad_norm": 1.0189619064331055, + "learning_rate": 9.85303430885435e-05, + "loss": 0.7897, + "step": 24230 + }, + { + "epoch": 0.15486245096661258, + "grad_norm": 0.8572905659675598, + "learning_rate": 9.85291352350988e-05, + "loss": 0.9204, + "step": 24240 + }, + { + "epoch": 0.1549263381163513, + "grad_norm": 1.0044801235198975, + "learning_rate": 9.852792689292373e-05, + "loss": 1.0265, + "step": 24250 + }, + { + "epoch": 0.15499022526608996, + "grad_norm": 0.8651962280273438, + "learning_rate": 9.852671806203045e-05, + "loss": 0.6892, + "step": 24260 + }, + { + "epoch": 0.15505411241582867, + "grad_norm": 1.309009075164795, + "learning_rate": 9.852550874243111e-05, + "loss": 1.0858, + "step": 24270 + }, + { + "epoch": 0.15511799956556738, + "grad_norm": 0.9584972262382507, + "learning_rate": 9.852429893413795e-05, + "loss": 0.9216, + "step": 24280 + }, + { + "epoch": 0.15518188671530608, + "grad_norm": 0.6010156869888306, + "learning_rate": 9.852308863716311e-05, + "loss": 0.8739, + "step": 24290 + }, + { + "epoch": 0.1552457738650448, + "grad_norm": 0.8952304124832153, + "learning_rate": 9.852187785151879e-05, + "loss": 0.9147, + "step": 24300 + }, + { + "epoch": 0.1553096610147835, + "grad_norm": 0.6536133885383606, + "learning_rate": 9.85206665772172e-05, + "loss": 0.8771, + "step": 24310 + }, + { + "epoch": 0.1553735481645222, + "grad_norm": 0.7753522992134094, + "learning_rate": 9.851945481427048e-05, + "loss": 1.0301, + "step": 24320 + }, + { + "epoch": 0.15543743531426088, + "grad_norm": 1.4516469240188599, + "learning_rate": 9.851824256269092e-05, + "loss": 1.0265, + "step": 24330 + }, + { + "epoch": 0.1555013224639996, + "grad_norm": 0.934195339679718, + "learning_rate": 9.851702982249065e-05, + "loss": 0.995, + "step": 24340 + }, + { + "epoch": 0.1555652096137383, + "grad_norm": 0.7957481741905212, + "learning_rate": 9.851581659368192e-05, + "loss": 0.8226, + "step": 24350 + }, + { + "epoch": 0.155629096763477, + "grad_norm": 0.7475680708885193, + "learning_rate": 9.851460287627695e-05, + "loss": 1.0825, + "step": 24360 + }, + { + "epoch": 0.1556929839132157, + "grad_norm": 0.65959233045578, + "learning_rate": 9.851338867028797e-05, + "loss": 1.1795, + "step": 24370 + }, + { + "epoch": 0.1557568710629544, + "grad_norm": 
0.6770491600036621, + "learning_rate": 9.851217397572718e-05, + "loss": 0.9308, + "step": 24380 + }, + { + "epoch": 0.1558207582126931, + "grad_norm": 0.6056157946586609, + "learning_rate": 9.851095879260684e-05, + "loss": 1.0731, + "step": 24390 + }, + { + "epoch": 0.1558846453624318, + "grad_norm": 0.8914613127708435, + "learning_rate": 9.850974312093918e-05, + "loss": 0.7644, + "step": 24400 + }, + { + "epoch": 0.1559485325121705, + "grad_norm": 0.8289129137992859, + "learning_rate": 9.850852696073643e-05, + "loss": 1.1423, + "step": 24410 + }, + { + "epoch": 0.1560124196619092, + "grad_norm": 1.1932592391967773, + "learning_rate": 9.850731031201084e-05, + "loss": 0.7908, + "step": 24420 + }, + { + "epoch": 0.1560763068116479, + "grad_norm": 0.8615885376930237, + "learning_rate": 9.850609317477468e-05, + "loss": 0.7105, + "step": 24430 + }, + { + "epoch": 0.15614019396138662, + "grad_norm": 0.647098958492279, + "learning_rate": 9.85048755490402e-05, + "loss": 1.0009, + "step": 24440 + }, + { + "epoch": 0.1562040811111253, + "grad_norm": 0.6660744547843933, + "learning_rate": 9.850365743481965e-05, + "loss": 0.8714, + "step": 24450 + }, + { + "epoch": 0.156267968260864, + "grad_norm": 0.84688800573349, + "learning_rate": 9.850243883212531e-05, + "loss": 0.942, + "step": 24460 + }, + { + "epoch": 0.1563318554106027, + "grad_norm": 0.48218655586242676, + "learning_rate": 9.850121974096946e-05, + "loss": 1.0805, + "step": 24470 + }, + { + "epoch": 0.15639574256034142, + "grad_norm": 0.9218449592590332, + "learning_rate": 9.850000016136437e-05, + "loss": 1.0481, + "step": 24480 + }, + { + "epoch": 0.15645962971008012, + "grad_norm": 0.584633469581604, + "learning_rate": 9.849878009332231e-05, + "loss": 0.8474, + "step": 24490 + }, + { + "epoch": 0.15652351685981883, + "grad_norm": 0.8491461873054504, + "learning_rate": 9.849755953685557e-05, + "loss": 0.9905, + "step": 24500 + }, + { + "epoch": 0.1565874040095575, + "grad_norm": 0.961509644985199, + "learning_rate": 9.849633849197649e-05, + "loss": 1.1605, + "step": 24510 + }, + { + "epoch": 0.1566512911592962, + "grad_norm": 0.8623896241188049, + "learning_rate": 9.849511695869728e-05, + "loss": 0.7161, + "step": 24520 + }, + { + "epoch": 0.15671517830903492, + "grad_norm": 0.6448975205421448, + "learning_rate": 9.84938949370303e-05, + "loss": 0.9754, + "step": 24530 + }, + { + "epoch": 0.15677906545877363, + "grad_norm": 0.5791314244270325, + "learning_rate": 9.849267242698785e-05, + "loss": 0.7836, + "step": 24540 + }, + { + "epoch": 0.15684295260851233, + "grad_norm": 0.5874826312065125, + "learning_rate": 9.849144942858224e-05, + "loss": 0.8067, + "step": 24550 + }, + { + "epoch": 0.15690683975825104, + "grad_norm": 0.7695150375366211, + "learning_rate": 9.849022594182577e-05, + "loss": 1.153, + "step": 24560 + }, + { + "epoch": 0.15697072690798972, + "grad_norm": 0.7399982213973999, + "learning_rate": 9.848900196673079e-05, + "loss": 1.2349, + "step": 24570 + }, + { + "epoch": 0.15703461405772842, + "grad_norm": 0.8517500758171082, + "learning_rate": 9.848777750330961e-05, + "loss": 1.01, + "step": 24580 + }, + { + "epoch": 0.15709850120746713, + "grad_norm": 0.6582129001617432, + "learning_rate": 9.848655255157456e-05, + "loss": 0.71, + "step": 24590 + }, + { + "epoch": 0.15716238835720583, + "grad_norm": 0.5711886286735535, + "learning_rate": 9.848532711153797e-05, + "loss": 0.9785, + "step": 24600 + }, + { + "epoch": 0.15722627550694454, + "grad_norm": 0.7866716980934143, + "learning_rate": 9.848410118321221e-05, + "loss": 0.8093, + 
"step": 24610 + }, + { + "epoch": 0.15729016265668325, + "grad_norm": 0.6282891631126404, + "learning_rate": 9.848287476660958e-05, + "loss": 0.8937, + "step": 24620 + }, + { + "epoch": 0.15735404980642193, + "grad_norm": 1.6044594049453735, + "learning_rate": 9.848164786174248e-05, + "loss": 1.0449, + "step": 24630 + }, + { + "epoch": 0.15741793695616063, + "grad_norm": 1.279166579246521, + "learning_rate": 9.848042046862322e-05, + "loss": 1.2909, + "step": 24640 + }, + { + "epoch": 0.15748182410589934, + "grad_norm": 1.3262732028961182, + "learning_rate": 9.847919258726421e-05, + "loss": 0.9336, + "step": 24650 + }, + { + "epoch": 0.15754571125563804, + "grad_norm": 0.7303173542022705, + "learning_rate": 9.847796421767777e-05, + "loss": 0.8935, + "step": 24660 + }, + { + "epoch": 0.15760959840537675, + "grad_norm": 0.8746846914291382, + "learning_rate": 9.84767353598763e-05, + "loss": 0.8438, + "step": 24670 + }, + { + "epoch": 0.15767348555511546, + "grad_norm": 1.244907259941101, + "learning_rate": 9.847550601387217e-05, + "loss": 0.672, + "step": 24680 + }, + { + "epoch": 0.15773737270485413, + "grad_norm": 0.7882753610610962, + "learning_rate": 9.847427617967775e-05, + "loss": 0.8104, + "step": 24690 + }, + { + "epoch": 0.15780125985459284, + "grad_norm": 0.5869142413139343, + "learning_rate": 9.847304585730544e-05, + "loss": 0.9445, + "step": 24700 + }, + { + "epoch": 0.15786514700433155, + "grad_norm": 0.8743402361869812, + "learning_rate": 9.847181504676761e-05, + "loss": 1.0129, + "step": 24710 + }, + { + "epoch": 0.15792903415407025, + "grad_norm": 0.8246279358863831, + "learning_rate": 9.847058374807669e-05, + "loss": 0.8171, + "step": 24720 + }, + { + "epoch": 0.15799292130380896, + "grad_norm": 0.7410875558853149, + "learning_rate": 9.846935196124504e-05, + "loss": 0.9308, + "step": 24730 + }, + { + "epoch": 0.15805680845354766, + "grad_norm": 0.9520349502563477, + "learning_rate": 9.846811968628509e-05, + "loss": 1.0484, + "step": 24740 + }, + { + "epoch": 0.15812069560328634, + "grad_norm": 1.908379316329956, + "learning_rate": 9.846688692320925e-05, + "loss": 0.9074, + "step": 24750 + }, + { + "epoch": 0.15818458275302505, + "grad_norm": 1.148059368133545, + "learning_rate": 9.846565367202992e-05, + "loss": 1.0573, + "step": 24760 + }, + { + "epoch": 0.15824846990276376, + "grad_norm": 0.6221771836280823, + "learning_rate": 9.846441993275952e-05, + "loss": 0.9355, + "step": 24770 + }, + { + "epoch": 0.15831235705250246, + "grad_norm": 0.7107810974121094, + "learning_rate": 9.84631857054105e-05, + "loss": 0.8245, + "step": 24780 + }, + { + "epoch": 0.15837624420224117, + "grad_norm": 2.3203704357147217, + "learning_rate": 9.846195098999527e-05, + "loss": 0.7197, + "step": 24790 + }, + { + "epoch": 0.15844013135197987, + "grad_norm": 0.8047979474067688, + "learning_rate": 9.846071578652627e-05, + "loss": 1.0095, + "step": 24800 + }, + { + "epoch": 0.15850401850171855, + "grad_norm": 0.848024845123291, + "learning_rate": 9.845948009501593e-05, + "loss": 0.9665, + "step": 24810 + }, + { + "epoch": 0.15856790565145726, + "grad_norm": 0.5435264706611633, + "learning_rate": 9.845824391547671e-05, + "loss": 0.7763, + "step": 24820 + }, + { + "epoch": 0.15863179280119596, + "grad_norm": 0.6636167764663696, + "learning_rate": 9.845700724792104e-05, + "loss": 0.966, + "step": 24830 + }, + { + "epoch": 0.15869567995093467, + "grad_norm": 0.9921244382858276, + "learning_rate": 9.84557700923614e-05, + "loss": 0.8407, + "step": 24840 + }, + { + "epoch": 0.15875956710067338, + 
"grad_norm": 0.6068295836448669, + "learning_rate": 9.845453244881022e-05, + "loss": 0.7625, + "step": 24850 + }, + { + "epoch": 0.15882345425041208, + "grad_norm": 0.5496127605438232, + "learning_rate": 9.845329431728e-05, + "loss": 0.8734, + "step": 24860 + }, + { + "epoch": 0.15888734140015076, + "grad_norm": 1.1657304763793945, + "learning_rate": 9.845205569778316e-05, + "loss": 0.8215, + "step": 24870 + }, + { + "epoch": 0.15895122854988947, + "grad_norm": 0.6050916910171509, + "learning_rate": 9.845081659033221e-05, + "loss": 0.7701, + "step": 24880 + }, + { + "epoch": 0.15901511569962817, + "grad_norm": 0.7160899043083191, + "learning_rate": 9.844957699493964e-05, + "loss": 1.0013, + "step": 24890 + }, + { + "epoch": 0.15907900284936688, + "grad_norm": 0.8572732210159302, + "learning_rate": 9.84483369116179e-05, + "loss": 0.8973, + "step": 24900 + }, + { + "epoch": 0.15914288999910559, + "grad_norm": 0.8619921803474426, + "learning_rate": 9.84470963403795e-05, + "loss": 0.9491, + "step": 24910 + }, + { + "epoch": 0.1592067771488443, + "grad_norm": 0.6899974942207336, + "learning_rate": 9.844585528123692e-05, + "loss": 0.8375, + "step": 24920 + }, + { + "epoch": 0.15927066429858297, + "grad_norm": 0.7540447115898132, + "learning_rate": 9.844461373420267e-05, + "loss": 0.8525, + "step": 24930 + }, + { + "epoch": 0.15933455144832168, + "grad_norm": 0.8030637502670288, + "learning_rate": 9.844337169928926e-05, + "loss": 0.8833, + "step": 24940 + }, + { + "epoch": 0.15939843859806038, + "grad_norm": 0.8504492044448853, + "learning_rate": 9.844212917650917e-05, + "loss": 0.9273, + "step": 24950 + }, + { + "epoch": 0.1594623257477991, + "grad_norm": 1.3353928327560425, + "learning_rate": 9.844088616587493e-05, + "loss": 0.8097, + "step": 24960 + }, + { + "epoch": 0.1595262128975378, + "grad_norm": 1.6527575254440308, + "learning_rate": 9.843964266739907e-05, + "loss": 0.7699, + "step": 24970 + }, + { + "epoch": 0.1595901000472765, + "grad_norm": 0.6608484387397766, + "learning_rate": 9.84383986810941e-05, + "loss": 0.9171, + "step": 24980 + }, + { + "epoch": 0.15965398719701518, + "grad_norm": 0.8177617788314819, + "learning_rate": 9.843715420697254e-05, + "loss": 0.9391, + "step": 24990 + }, + { + "epoch": 0.15971787434675389, + "grad_norm": 0.8526275753974915, + "learning_rate": 9.843590924504696e-05, + "loss": 0.9272, + "step": 25000 + }, + { + "epoch": 0.1597817614964926, + "grad_norm": 0.753639817237854, + "learning_rate": 9.843466379532985e-05, + "loss": 0.6739, + "step": 25010 + }, + { + "epoch": 0.1598456486462313, + "grad_norm": 0.8092784881591797, + "learning_rate": 9.843341785783377e-05, + "loss": 0.7158, + "step": 25020 + }, + { + "epoch": 0.15990953579597, + "grad_norm": 1.0467857122421265, + "learning_rate": 9.843217143257126e-05, + "loss": 0.7562, + "step": 25030 + }, + { + "epoch": 0.1599734229457087, + "grad_norm": 0.5774504542350769, + "learning_rate": 9.843092451955491e-05, + "loss": 0.7832, + "step": 25040 + }, + { + "epoch": 0.1600373100954474, + "grad_norm": 1.2460720539093018, + "learning_rate": 9.842967711879725e-05, + "loss": 0.7436, + "step": 25050 + }, + { + "epoch": 0.1601011972451861, + "grad_norm": 1.1241377592086792, + "learning_rate": 9.842842923031084e-05, + "loss": 0.7252, + "step": 25060 + }, + { + "epoch": 0.1601650843949248, + "grad_norm": 0.565549910068512, + "learning_rate": 9.842718085410823e-05, + "loss": 0.8209, + "step": 25070 + }, + { + "epoch": 0.1602289715446635, + "grad_norm": 0.5020076036453247, + "learning_rate": 9.842593199020203e-05, + 
"loss": 0.661, + "step": 25080 + }, + { + "epoch": 0.1602928586944022, + "grad_norm": 0.7566838264465332, + "learning_rate": 9.842480759571027e-05, + "loss": 0.9231, + "step": 25090 + }, + { + "epoch": 0.16035674584414092, + "grad_norm": 0.9458664655685425, + "learning_rate": 9.842355780520187e-05, + "loss": 0.7723, + "step": 25100 + }, + { + "epoch": 0.1604206329938796, + "grad_norm": 1.0208630561828613, + "learning_rate": 9.842230752702635e-05, + "loss": 0.7883, + "step": 25110 + }, + { + "epoch": 0.1604845201436183, + "grad_norm": 0.7197948098182678, + "learning_rate": 9.84210567611963e-05, + "loss": 0.8673, + "step": 25120 + }, + { + "epoch": 0.160548407293357, + "grad_norm": 0.9319686889648438, + "learning_rate": 9.841980550772433e-05, + "loss": 0.7893, + "step": 25130 + }, + { + "epoch": 0.16061229444309572, + "grad_norm": 0.8447830677032471, + "learning_rate": 9.841855376662302e-05, + "loss": 1.0086, + "step": 25140 + }, + { + "epoch": 0.16067618159283442, + "grad_norm": 1.1380891799926758, + "learning_rate": 9.841730153790499e-05, + "loss": 0.7411, + "step": 25150 + }, + { + "epoch": 0.16074006874257313, + "grad_norm": 0.988677442073822, + "learning_rate": 9.841604882158285e-05, + "loss": 0.8238, + "step": 25160 + }, + { + "epoch": 0.16080395589231183, + "grad_norm": 0.6261546611785889, + "learning_rate": 9.84147956176692e-05, + "loss": 0.9153, + "step": 25170 + }, + { + "epoch": 0.1608678430420505, + "grad_norm": 1.1242022514343262, + "learning_rate": 9.841354192617667e-05, + "loss": 0.8479, + "step": 25180 + }, + { + "epoch": 0.16093173019178922, + "grad_norm": 0.8760757446289062, + "learning_rate": 9.84122877471179e-05, + "loss": 1.0954, + "step": 25190 + }, + { + "epoch": 0.16099561734152792, + "grad_norm": 0.8859489560127258, + "learning_rate": 9.841103308050552e-05, + "loss": 0.6732, + "step": 25200 + }, + { + "epoch": 0.16105950449126663, + "grad_norm": 1.3529788255691528, + "learning_rate": 9.840977792635215e-05, + "loss": 1.0534, + "step": 25210 + }, + { + "epoch": 0.16112339164100534, + "grad_norm": 0.721413254737854, + "learning_rate": 9.840852228467041e-05, + "loss": 0.9705, + "step": 25220 + }, + { + "epoch": 0.16118727879074404, + "grad_norm": 0.9626721739768982, + "learning_rate": 9.8407266155473e-05, + "loss": 0.8799, + "step": 25230 + }, + { + "epoch": 0.16125116594048272, + "grad_norm": 0.5856235027313232, + "learning_rate": 9.840600953877253e-05, + "loss": 1.2152, + "step": 25240 + }, + { + "epoch": 0.16131505309022143, + "grad_norm": 1.074049711227417, + "learning_rate": 9.840475243458167e-05, + "loss": 0.8506, + "step": 25250 + }, + { + "epoch": 0.16137894023996013, + "grad_norm": 0.7193922400474548, + "learning_rate": 9.840349484291308e-05, + "loss": 0.7899, + "step": 25260 + }, + { + "epoch": 0.16144282738969884, + "grad_norm": 1.0390762090682983, + "learning_rate": 9.840223676377942e-05, + "loss": 0.9389, + "step": 25270 + }, + { + "epoch": 0.16150671453943755, + "grad_norm": 1.7726080417633057, + "learning_rate": 9.840097819719336e-05, + "loss": 0.9474, + "step": 25280 + }, + { + "epoch": 0.16157060168917625, + "grad_norm": 0.6403753757476807, + "learning_rate": 9.839971914316757e-05, + "loss": 0.8837, + "step": 25290 + }, + { + "epoch": 0.16163448883891493, + "grad_norm": 0.8878451585769653, + "learning_rate": 9.839845960171475e-05, + "loss": 0.9911, + "step": 25300 + }, + { + "epoch": 0.16169837598865364, + "grad_norm": 0.9376581907272339, + "learning_rate": 9.839719957284756e-05, + "loss": 1.0247, + "step": 25310 + }, + { + "epoch": 0.16176226313839234, 
+ "grad_norm": 0.6702033877372742, + "learning_rate": 9.839593905657871e-05, + "loss": 0.9453, + "step": 25320 + }, + { + "epoch": 0.16182615028813105, + "grad_norm": 0.4987049400806427, + "learning_rate": 9.839467805292089e-05, + "loss": 0.7227, + "step": 25330 + }, + { + "epoch": 0.16189003743786975, + "grad_norm": 0.6735382080078125, + "learning_rate": 9.839341656188677e-05, + "loss": 0.8046, + "step": 25340 + }, + { + "epoch": 0.16195392458760846, + "grad_norm": 0.8256925344467163, + "learning_rate": 9.839215458348909e-05, + "loss": 1.342, + "step": 25350 + }, + { + "epoch": 0.16201781173734714, + "grad_norm": 1.0099321603775024, + "learning_rate": 9.839089211774056e-05, + "loss": 0.9079, + "step": 25360 + }, + { + "epoch": 0.16208169888708585, + "grad_norm": 0.9464432597160339, + "learning_rate": 9.838962916465388e-05, + "loss": 1.0935, + "step": 25370 + }, + { + "epoch": 0.16214558603682455, + "grad_norm": 1.0927412509918213, + "learning_rate": 9.838836572424176e-05, + "loss": 0.6795, + "step": 25380 + }, + { + "epoch": 0.16220947318656326, + "grad_norm": 0.6880885362625122, + "learning_rate": 9.838710179651694e-05, + "loss": 0.9407, + "step": 25390 + }, + { + "epoch": 0.16227336033630196, + "grad_norm": 0.9150338768959045, + "learning_rate": 9.838583738149215e-05, + "loss": 0.9107, + "step": 25400 + }, + { + "epoch": 0.16233724748604067, + "grad_norm": 1.087501049041748, + "learning_rate": 9.838457247918012e-05, + "loss": 0.7319, + "step": 25410 + }, + { + "epoch": 0.16240113463577935, + "grad_norm": 0.7410935163497925, + "learning_rate": 9.838330708959358e-05, + "loss": 0.755, + "step": 25420 + }, + { + "epoch": 0.16246502178551805, + "grad_norm": 0.7320923209190369, + "learning_rate": 9.838204121274527e-05, + "loss": 0.9022, + "step": 25430 + }, + { + "epoch": 0.16252890893525676, + "grad_norm": 0.7874162793159485, + "learning_rate": 9.838077484864796e-05, + "loss": 0.8658, + "step": 25440 + }, + { + "epoch": 0.16259279608499547, + "grad_norm": 0.6988115310668945, + "learning_rate": 9.83795079973144e-05, + "loss": 0.8592, + "step": 25450 + }, + { + "epoch": 0.16265668323473417, + "grad_norm": 1.1536266803741455, + "learning_rate": 9.837824065875733e-05, + "loss": 0.9147, + "step": 25460 + }, + { + "epoch": 0.16272057038447288, + "grad_norm": 0.8450478911399841, + "learning_rate": 9.837697283298952e-05, + "loss": 0.8379, + "step": 25470 + }, + { + "epoch": 0.16278445753421156, + "grad_norm": 0.545207679271698, + "learning_rate": 9.837570452002375e-05, + "loss": 0.8029, + "step": 25480 + }, + { + "epoch": 0.16284834468395026, + "grad_norm": 0.8712167739868164, + "learning_rate": 9.837443571987277e-05, + "loss": 0.9546, + "step": 25490 + }, + { + "epoch": 0.16291223183368897, + "grad_norm": 1.3103466033935547, + "learning_rate": 9.837316643254938e-05, + "loss": 0.8578, + "step": 25500 + }, + { + "epoch": 0.16297611898342768, + "grad_norm": 0.6110614538192749, + "learning_rate": 9.837189665806637e-05, + "loss": 0.9893, + "step": 25510 + }, + { + "epoch": 0.16304000613316638, + "grad_norm": 0.8567008972167969, + "learning_rate": 9.83706263964365e-05, + "loss": 0.7717, + "step": 25520 + }, + { + "epoch": 0.1631038932829051, + "grad_norm": 0.844247579574585, + "learning_rate": 9.836935564767257e-05, + "loss": 1.0621, + "step": 25530 + }, + { + "epoch": 0.16316778043264377, + "grad_norm": 0.8914816379547119, + "learning_rate": 9.836808441178739e-05, + "loss": 0.8287, + "step": 25540 + }, + { + "epoch": 0.16323166758238247, + "grad_norm": 0.6251090168952942, + "learning_rate": 
9.836681268879377e-05, + "loss": 1.051, + "step": 25550 + }, + { + "epoch": 0.16329555473212118, + "grad_norm": 0.6964147090911865, + "learning_rate": 9.836554047870447e-05, + "loss": 0.9595, + "step": 25560 + }, + { + "epoch": 0.16335944188185988, + "grad_norm": 0.5562779307365417, + "learning_rate": 9.836426778153236e-05, + "loss": 0.8304, + "step": 25570 + }, + { + "epoch": 0.1634233290315986, + "grad_norm": 0.6539714336395264, + "learning_rate": 9.836299459729023e-05, + "loss": 0.9026, + "step": 25580 + }, + { + "epoch": 0.1634872161813373, + "grad_norm": 0.789167582988739, + "learning_rate": 9.836172092599089e-05, + "loss": 0.806, + "step": 25590 + }, + { + "epoch": 0.16355110333107598, + "grad_norm": 0.7832333445549011, + "learning_rate": 9.83604467676472e-05, + "loss": 0.8309, + "step": 25600 + }, + { + "epoch": 0.16361499048081468, + "grad_norm": 0.9938201308250427, + "learning_rate": 9.835917212227197e-05, + "loss": 0.9, + "step": 25610 + }, + { + "epoch": 0.1636788776305534, + "grad_norm": 0.7347666621208191, + "learning_rate": 9.835789698987802e-05, + "loss": 0.7665, + "step": 25620 + }, + { + "epoch": 0.1637427647802921, + "grad_norm": 0.7416117191314697, + "learning_rate": 9.835662137047824e-05, + "loss": 0.8239, + "step": 25630 + }, + { + "epoch": 0.1638066519300308, + "grad_norm": 0.6439573764801025, + "learning_rate": 9.835534526408543e-05, + "loss": 0.9106, + "step": 25640 + }, + { + "epoch": 0.1638705390797695, + "grad_norm": 1.0646562576293945, + "learning_rate": 9.835406867071247e-05, + "loss": 0.7518, + "step": 25650 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.135383129119873, + "learning_rate": 9.83527915903722e-05, + "loss": 0.723, + "step": 25660 + }, + { + "epoch": 0.1639983133792469, + "grad_norm": 0.9141467213630676, + "learning_rate": 9.83515140230775e-05, + "loss": 1.104, + "step": 25670 + }, + { + "epoch": 0.1640622005289856, + "grad_norm": 0.7846889495849609, + "learning_rate": 9.83502359688412e-05, + "loss": 0.8969, + "step": 25680 + }, + { + "epoch": 0.1641260876787243, + "grad_norm": 0.8037777543067932, + "learning_rate": 9.834895742767622e-05, + "loss": 0.9751, + "step": 25690 + }, + { + "epoch": 0.164189974828463, + "grad_norm": 1.0449095964431763, + "learning_rate": 9.83476783995954e-05, + "loss": 1.0799, + "step": 25700 + }, + { + "epoch": 0.16425386197820172, + "grad_norm": 0.6123198866844177, + "learning_rate": 9.834639888461162e-05, + "loss": 0.8884, + "step": 25710 + }, + { + "epoch": 0.1643177491279404, + "grad_norm": 0.7933758497238159, + "learning_rate": 9.834511888273778e-05, + "loss": 0.9816, + "step": 25720 + }, + { + "epoch": 0.1643816362776791, + "grad_norm": 1.233192801475525, + "learning_rate": 9.83438383939868e-05, + "loss": 0.9493, + "step": 25730 + }, + { + "epoch": 0.1644455234274178, + "grad_norm": 0.9002760052680969, + "learning_rate": 9.834255741837151e-05, + "loss": 1.0682, + "step": 25740 + }, + { + "epoch": 0.1645094105771565, + "grad_norm": 0.6131082773208618, + "learning_rate": 9.834127595590485e-05, + "loss": 0.964, + "step": 25750 + }, + { + "epoch": 0.16457329772689522, + "grad_norm": 1.53384530544281, + "learning_rate": 9.833999400659972e-05, + "loss": 0.9393, + "step": 25760 + }, + { + "epoch": 0.16463718487663392, + "grad_norm": 0.8691433072090149, + "learning_rate": 9.833871157046904e-05, + "loss": 0.708, + "step": 25770 + }, + { + "epoch": 0.1647010720263726, + "grad_norm": 0.6749919652938843, + "learning_rate": 9.833742864752571e-05, + "loss": 1.1174, + "step": 25780 + }, + { + "epoch": 
0.1647649591761113, + "grad_norm": 0.6683396100997925, + "learning_rate": 9.833614523778266e-05, + "loss": 0.8302, + "step": 25790 + }, + { + "epoch": 0.16482884632585001, + "grad_norm": 0.8051975965499878, + "learning_rate": 9.833486134125281e-05, + "loss": 1.2393, + "step": 25800 + }, + { + "epoch": 0.16489273347558872, + "grad_norm": 0.6575607657432556, + "learning_rate": 9.833357695794909e-05, + "loss": 1.0257, + "step": 25810 + }, + { + "epoch": 0.16495662062532743, + "grad_norm": 0.9496917128562927, + "learning_rate": 9.833229208788443e-05, + "loss": 0.8261, + "step": 25820 + }, + { + "epoch": 0.16502050777506613, + "grad_norm": 0.7231150269508362, + "learning_rate": 9.833100673107179e-05, + "loss": 0.6341, + "step": 25830 + }, + { + "epoch": 0.1650843949248048, + "grad_norm": 1.2418237924575806, + "learning_rate": 9.832972088752407e-05, + "loss": 0.803, + "step": 25840 + }, + { + "epoch": 0.16514828207454352, + "grad_norm": 0.6519736051559448, + "learning_rate": 9.832843455725427e-05, + "loss": 0.918, + "step": 25850 + }, + { + "epoch": 0.16521216922428222, + "grad_norm": 0.6396727561950684, + "learning_rate": 9.832714774027534e-05, + "loss": 1.0144, + "step": 25860 + }, + { + "epoch": 0.16527605637402093, + "grad_norm": 1.0266163349151611, + "learning_rate": 9.832586043660019e-05, + "loss": 0.7874, + "step": 25870 + }, + { + "epoch": 0.16533994352375964, + "grad_norm": 0.9573850035667419, + "learning_rate": 9.832457264624184e-05, + "loss": 0.8346, + "step": 25880 + }, + { + "epoch": 0.16540383067349834, + "grad_norm": 0.7382820844650269, + "learning_rate": 9.832328436921324e-05, + "loss": 0.7884, + "step": 25890 + }, + { + "epoch": 0.16546771782323702, + "grad_norm": 0.8257744908332825, + "learning_rate": 9.832199560552734e-05, + "loss": 0.9137, + "step": 25900 + }, + { + "epoch": 0.16553160497297573, + "grad_norm": 0.4377366006374359, + "learning_rate": 9.832070635519715e-05, + "loss": 0.7715, + "step": 25910 + }, + { + "epoch": 0.16559549212271443, + "grad_norm": 0.6788588166236877, + "learning_rate": 9.831941661823564e-05, + "loss": 0.8829, + "step": 25920 + }, + { + "epoch": 0.16565937927245314, + "grad_norm": 0.7223168611526489, + "learning_rate": 9.831812639465581e-05, + "loss": 0.9969, + "step": 25930 + }, + { + "epoch": 0.16572326642219185, + "grad_norm": 0.5885007977485657, + "learning_rate": 9.831683568447064e-05, + "loss": 0.8589, + "step": 25940 + }, + { + "epoch": 0.16578715357193055, + "grad_norm": 0.4553689956665039, + "learning_rate": 9.831554448769314e-05, + "loss": 1.0332, + "step": 25950 + }, + { + "epoch": 0.16585104072166926, + "grad_norm": 0.8313513398170471, + "learning_rate": 9.831425280433631e-05, + "loss": 0.8301, + "step": 25960 + }, + { + "epoch": 0.16591492787140794, + "grad_norm": 0.8566281795501709, + "learning_rate": 9.831296063441315e-05, + "loss": 0.8196, + "step": 25970 + }, + { + "epoch": 0.16597881502114664, + "grad_norm": 0.9477049708366394, + "learning_rate": 9.831166797793668e-05, + "loss": 1.0331, + "step": 25980 + }, + { + "epoch": 0.16604270217088535, + "grad_norm": 0.9263975620269775, + "learning_rate": 9.831037483491991e-05, + "loss": 1.0746, + "step": 25990 + }, + { + "epoch": 0.16610658932062405, + "grad_norm": 0.5900850296020508, + "learning_rate": 9.83090812053759e-05, + "loss": 0.9527, + "step": 26000 + }, + { + "epoch": 0.16617047647036276, + "grad_norm": 1.1755515336990356, + "learning_rate": 9.830778708931762e-05, + "loss": 0.8315, + "step": 26010 + }, + { + "epoch": 0.16623436362010147, + "grad_norm": 0.9088423848152161, + 
"learning_rate": 9.830649248675814e-05, + "loss": 0.9244, + "step": 26020 + }, + { + "epoch": 0.16629825076984014, + "grad_norm": 0.7324301600456238, + "learning_rate": 9.83051973977105e-05, + "loss": 0.904, + "step": 26030 + }, + { + "epoch": 0.16636213791957885, + "grad_norm": 0.8652370572090149, + "learning_rate": 9.830390182218771e-05, + "loss": 0.9646, + "step": 26040 + }, + { + "epoch": 0.16642602506931756, + "grad_norm": 1.0668914318084717, + "learning_rate": 9.830260576020286e-05, + "loss": 1.0844, + "step": 26050 + }, + { + "epoch": 0.16648991221905626, + "grad_norm": 1.8865516185760498, + "learning_rate": 9.830130921176898e-05, + "loss": 0.9959, + "step": 26060 + }, + { + "epoch": 0.16655379936879497, + "grad_norm": 0.7931020855903625, + "learning_rate": 9.830001217689913e-05, + "loss": 0.8263, + "step": 26070 + }, + { + "epoch": 0.16661768651853368, + "grad_norm": 1.674660325050354, + "learning_rate": 9.829871465560637e-05, + "loss": 0.8527, + "step": 26080 + }, + { + "epoch": 0.16668157366827235, + "grad_norm": 1.0010359287261963, + "learning_rate": 9.829741664790376e-05, + "loss": 0.7847, + "step": 26090 + }, + { + "epoch": 0.16674546081801106, + "grad_norm": 1.0310410261154175, + "learning_rate": 9.829611815380439e-05, + "loss": 0.8471, + "step": 26100 + }, + { + "epoch": 0.16680934796774977, + "grad_norm": 0.8554787039756775, + "learning_rate": 9.829481917332132e-05, + "loss": 0.8849, + "step": 26110 + }, + { + "epoch": 0.16687323511748847, + "grad_norm": 1.3090757131576538, + "learning_rate": 9.829351970646764e-05, + "loss": 0.921, + "step": 26120 + }, + { + "epoch": 0.16693712226722718, + "grad_norm": 1.068739414215088, + "learning_rate": 9.829221975325644e-05, + "loss": 0.9898, + "step": 26130 + }, + { + "epoch": 0.16700100941696588, + "grad_norm": 0.8239749073982239, + "learning_rate": 9.829091931370082e-05, + "loss": 1.1161, + "step": 26140 + }, + { + "epoch": 0.16706489656670456, + "grad_norm": 1.0234822034835815, + "learning_rate": 9.828961838781385e-05, + "loss": 1.0181, + "step": 26150 + }, + { + "epoch": 0.16712878371644327, + "grad_norm": 1.0589160919189453, + "learning_rate": 9.828831697560865e-05, + "loss": 1.0243, + "step": 26160 + }, + { + "epoch": 0.16719267086618197, + "grad_norm": 0.8593624234199524, + "learning_rate": 9.828701507709832e-05, + "loss": 1.3933, + "step": 26170 + }, + { + "epoch": 0.16725655801592068, + "grad_norm": 1.0242136716842651, + "learning_rate": 9.828571269229598e-05, + "loss": 0.9601, + "step": 26180 + }, + { + "epoch": 0.1673204451656594, + "grad_norm": 0.47015684843063354, + "learning_rate": 9.828440982121473e-05, + "loss": 1.2651, + "step": 26190 + }, + { + "epoch": 0.1673843323153981, + "grad_norm": 0.9608283042907715, + "learning_rate": 9.828310646386772e-05, + "loss": 0.7508, + "step": 26200 + }, + { + "epoch": 0.16744821946513677, + "grad_norm": 0.8850206136703491, + "learning_rate": 9.828180262026805e-05, + "loss": 0.9822, + "step": 26210 + }, + { + "epoch": 0.16751210661487548, + "grad_norm": 1.7484781742095947, + "learning_rate": 9.828049829042884e-05, + "loss": 0.9558, + "step": 26220 + }, + { + "epoch": 0.16757599376461418, + "grad_norm": 0.625224769115448, + "learning_rate": 9.827919347436328e-05, + "loss": 0.8881, + "step": 26230 + }, + { + "epoch": 0.1676398809143529, + "grad_norm": 0.576524555683136, + "learning_rate": 9.827788817208444e-05, + "loss": 0.9399, + "step": 26240 + }, + { + "epoch": 0.1677037680640916, + "grad_norm": 1.0603713989257812, + "learning_rate": 9.827658238360553e-05, + "loss": 0.8588, + "step": 
26250 + }, + { + "epoch": 0.1677676552138303, + "grad_norm": 0.7877979278564453, + "learning_rate": 9.827527610893964e-05, + "loss": 0.8973, + "step": 26260 + }, + { + "epoch": 0.16783154236356898, + "grad_norm": 1.2610846757888794, + "learning_rate": 9.827396934809997e-05, + "loss": 0.7684, + "step": 26270 + }, + { + "epoch": 0.1678954295133077, + "grad_norm": 0.49026232957839966, + "learning_rate": 9.827266210109967e-05, + "loss": 1.061, + "step": 26280 + }, + { + "epoch": 0.1679593166630464, + "grad_norm": 1.0687637329101562, + "learning_rate": 9.827135436795189e-05, + "loss": 0.8798, + "step": 26290 + }, + { + "epoch": 0.1680232038127851, + "grad_norm": 0.9565626978874207, + "learning_rate": 9.827004614866981e-05, + "loss": 0.8781, + "step": 26300 + }, + { + "epoch": 0.1680870909625238, + "grad_norm": 1.148451566696167, + "learning_rate": 9.826873744326661e-05, + "loss": 0.7915, + "step": 26310 + }, + { + "epoch": 0.1681509781122625, + "grad_norm": 0.6154188513755798, + "learning_rate": 9.826742825175547e-05, + "loss": 0.8317, + "step": 26320 + }, + { + "epoch": 0.1682148652620012, + "grad_norm": 0.9438403844833374, + "learning_rate": 9.826611857414957e-05, + "loss": 0.8347, + "step": 26330 + }, + { + "epoch": 0.1682787524117399, + "grad_norm": 0.6729276776313782, + "learning_rate": 9.82648084104621e-05, + "loss": 0.969, + "step": 26340 + }, + { + "epoch": 0.1683426395614786, + "grad_norm": 0.6888546943664551, + "learning_rate": 9.826349776070625e-05, + "loss": 1.0223, + "step": 26350 + }, + { + "epoch": 0.1684065267112173, + "grad_norm": 0.8470525741577148, + "learning_rate": 9.826218662489521e-05, + "loss": 0.8919, + "step": 26360 + }, + { + "epoch": 0.16847041386095601, + "grad_norm": 0.653862714767456, + "learning_rate": 9.826087500304222e-05, + "loss": 1.0743, + "step": 26370 + }, + { + "epoch": 0.16853430101069472, + "grad_norm": 0.7015219926834106, + "learning_rate": 9.825956289516046e-05, + "loss": 1.1053, + "step": 26380 + }, + { + "epoch": 0.1685981881604334, + "grad_norm": 1.116733193397522, + "learning_rate": 9.825825030126315e-05, + "loss": 1.199, + "step": 26390 + }, + { + "epoch": 0.1686620753101721, + "grad_norm": 0.8197908401489258, + "learning_rate": 9.825693722136351e-05, + "loss": 0.9155, + "step": 26400 + }, + { + "epoch": 0.1687259624599108, + "grad_norm": 0.9840227365493774, + "learning_rate": 9.825562365547477e-05, + "loss": 0.9655, + "step": 26410 + }, + { + "epoch": 0.16878984960964952, + "grad_norm": 0.6856445074081421, + "learning_rate": 9.825430960361015e-05, + "loss": 0.7135, + "step": 26420 + }, + { + "epoch": 0.16885373675938822, + "grad_norm": 1.0108433961868286, + "learning_rate": 9.825299506578288e-05, + "loss": 1.1918, + "step": 26430 + }, + { + "epoch": 0.16891762390912693, + "grad_norm": 0.7306868433952332, + "learning_rate": 9.82516800420062e-05, + "loss": 0.9422, + "step": 26440 + }, + { + "epoch": 0.1689815110588656, + "grad_norm": 0.8072736859321594, + "learning_rate": 9.825036453229336e-05, + "loss": 0.8563, + "step": 26450 + }, + { + "epoch": 0.1690453982086043, + "grad_norm": 0.9829355478286743, + "learning_rate": 9.824904853665764e-05, + "loss": 1.2103, + "step": 26460 + }, + { + "epoch": 0.16910928535834302, + "grad_norm": 0.7728550434112549, + "learning_rate": 9.824773205511222e-05, + "loss": 0.986, + "step": 26470 + }, + { + "epoch": 0.16917317250808173, + "grad_norm": 0.7675033211708069, + "learning_rate": 9.824641508767042e-05, + "loss": 1.1175, + "step": 26480 + }, + { + "epoch": 0.16923705965782043, + "grad_norm": 1.7961393594741821, 
+ "learning_rate": 9.824509763434548e-05, + "loss": 0.8564, + "step": 26490 + }, + { + "epoch": 0.16930094680755914, + "grad_norm": 0.782536506652832, + "learning_rate": 9.824377969515065e-05, + "loss": 1.0492, + "step": 26500 + }, + { + "epoch": 0.16936483395729782, + "grad_norm": 0.6397303342819214, + "learning_rate": 9.824246127009924e-05, + "loss": 0.922, + "step": 26510 + }, + { + "epoch": 0.16942872110703652, + "grad_norm": 0.7447948455810547, + "learning_rate": 9.82411423592045e-05, + "loss": 0.9926, + "step": 26520 + }, + { + "epoch": 0.16949260825677523, + "grad_norm": 0.7400467991828918, + "learning_rate": 9.823982296247972e-05, + "loss": 1.0191, + "step": 26530 + }, + { + "epoch": 0.16955649540651394, + "grad_norm": 0.6189865469932556, + "learning_rate": 9.82385030799382e-05, + "loss": 1.0518, + "step": 26540 + }, + { + "epoch": 0.16962038255625264, + "grad_norm": 0.8793081641197205, + "learning_rate": 9.823718271159321e-05, + "loss": 0.8839, + "step": 26550 + }, + { + "epoch": 0.16968426970599135, + "grad_norm": 0.6479794979095459, + "learning_rate": 9.823586185745808e-05, + "loss": 1.1906, + "step": 26560 + }, + { + "epoch": 0.16974815685573003, + "grad_norm": 0.9083991646766663, + "learning_rate": 9.823454051754605e-05, + "loss": 0.8276, + "step": 26570 + }, + { + "epoch": 0.16981204400546873, + "grad_norm": 0.7456206679344177, + "learning_rate": 9.823321869187051e-05, + "loss": 1.6253, + "step": 26580 + }, + { + "epoch": 0.16987593115520744, + "grad_norm": 0.7797310948371887, + "learning_rate": 9.823189638044473e-05, + "loss": 0.9139, + "step": 26590 + }, + { + "epoch": 0.16993981830494614, + "grad_norm": 0.6889947056770325, + "learning_rate": 9.8230573583282e-05, + "loss": 0.8341, + "step": 26600 + }, + { + "epoch": 0.17000370545468485, + "grad_norm": 0.9696791172027588, + "learning_rate": 9.822925030039567e-05, + "loss": 0.8444, + "step": 26610 + }, + { + "epoch": 0.17006759260442356, + "grad_norm": 0.5877872705459595, + "learning_rate": 9.822792653179908e-05, + "loss": 1.012, + "step": 26620 + }, + { + "epoch": 0.17013147975416223, + "grad_norm": 0.7431389093399048, + "learning_rate": 9.822660227750554e-05, + "loss": 0.7642, + "step": 26630 + }, + { + "epoch": 0.17019536690390094, + "grad_norm": 0.7920153737068176, + "learning_rate": 9.822527753752839e-05, + "loss": 0.8715, + "step": 26640 + }, + { + "epoch": 0.17025925405363965, + "grad_norm": 0.8526118397712708, + "learning_rate": 9.822395231188099e-05, + "loss": 0.816, + "step": 26650 + }, + { + "epoch": 0.17032314120337835, + "grad_norm": 0.8121978640556335, + "learning_rate": 9.822262660057666e-05, + "loss": 0.9923, + "step": 26660 + }, + { + "epoch": 0.17038702835311706, + "grad_norm": 1.0887260437011719, + "learning_rate": 9.822130040362875e-05, + "loss": 0.9544, + "step": 26670 + }, + { + "epoch": 0.17045091550285577, + "grad_norm": 0.5011045336723328, + "learning_rate": 9.821997372105065e-05, + "loss": 0.7011, + "step": 26680 + }, + { + "epoch": 0.17051480265259444, + "grad_norm": 1.1520075798034668, + "learning_rate": 9.821864655285569e-05, + "loss": 0.963, + "step": 26690 + }, + { + "epoch": 0.17057868980233315, + "grad_norm": 0.7860487699508667, + "learning_rate": 9.821731889905722e-05, + "loss": 0.8835, + "step": 26700 + }, + { + "epoch": 0.17064257695207186, + "grad_norm": 0.7170895934104919, + "learning_rate": 9.821599075966868e-05, + "loss": 0.8771, + "step": 26710 + }, + { + "epoch": 0.17070646410181056, + "grad_norm": 1.2707265615463257, + "learning_rate": 9.821466213470337e-05, + "loss": 1.134, + 
"step": 26720 + }, + { + "epoch": 0.17077035125154927, + "grad_norm": 0.8907286524772644, + "learning_rate": 9.82133330241747e-05, + "loss": 0.8502, + "step": 26730 + }, + { + "epoch": 0.17083423840128797, + "grad_norm": 0.6497828960418701, + "learning_rate": 9.821200342809606e-05, + "loss": 1.2541, + "step": 26740 + }, + { + "epoch": 0.17089812555102665, + "grad_norm": 0.7036256194114685, + "learning_rate": 9.821067334648084e-05, + "loss": 0.9958, + "step": 26750 + }, + { + "epoch": 0.17096201270076536, + "grad_norm": 0.8180755376815796, + "learning_rate": 9.820934277934243e-05, + "loss": 1.0885, + "step": 26760 + }, + { + "epoch": 0.17102589985050407, + "grad_norm": 0.9146037697792053, + "learning_rate": 9.820801172669425e-05, + "loss": 0.8732, + "step": 26770 + }, + { + "epoch": 0.17108978700024277, + "grad_norm": 0.9962326884269714, + "learning_rate": 9.820668018854966e-05, + "loss": 0.711, + "step": 26780 + }, + { + "epoch": 0.17115367414998148, + "grad_norm": 0.9202134609222412, + "learning_rate": 9.82053481649221e-05, + "loss": 1.0103, + "step": 26790 + }, + { + "epoch": 0.17121756129972018, + "grad_norm": 1.4406436681747437, + "learning_rate": 9.820401565582498e-05, + "loss": 1.1804, + "step": 26800 + }, + { + "epoch": 0.1712814484494589, + "grad_norm": 0.8345924019813538, + "learning_rate": 9.820268266127173e-05, + "loss": 0.7762, + "step": 26810 + }, + { + "epoch": 0.17134533559919757, + "grad_norm": 1.7090119123458862, + "learning_rate": 9.820134918127576e-05, + "loss": 0.867, + "step": 26820 + }, + { + "epoch": 0.17140922274893627, + "grad_norm": 0.7631736397743225, + "learning_rate": 9.82000152158505e-05, + "loss": 1.1823, + "step": 26830 + }, + { + "epoch": 0.17147310989867498, + "grad_norm": 0.5878487825393677, + "learning_rate": 9.81986807650094e-05, + "loss": 0.8455, + "step": 26840 + }, + { + "epoch": 0.1715369970484137, + "grad_norm": 1.0740894079208374, + "learning_rate": 9.819734582876587e-05, + "loss": 0.8497, + "step": 26850 + }, + { + "epoch": 0.1716008841981524, + "grad_norm": 0.8244208097457886, + "learning_rate": 9.819601040713337e-05, + "loss": 0.7606, + "step": 26860 + }, + { + "epoch": 0.1716647713478911, + "grad_norm": 0.9550793170928955, + "learning_rate": 9.819467450012536e-05, + "loss": 0.8171, + "step": 26870 + }, + { + "epoch": 0.17172865849762978, + "grad_norm": 0.7170982360839844, + "learning_rate": 9.819333810775528e-05, + "loss": 0.823, + "step": 26880 + }, + { + "epoch": 0.17179254564736848, + "grad_norm": 0.8252397775650024, + "learning_rate": 9.81920012300366e-05, + "loss": 0.7653, + "step": 26890 + }, + { + "epoch": 0.1718564327971072, + "grad_norm": 1.1286877393722534, + "learning_rate": 9.819066386698277e-05, + "loss": 0.991, + "step": 26900 + }, + { + "epoch": 0.1719203199468459, + "grad_norm": 0.7603797912597656, + "learning_rate": 9.818932601860727e-05, + "loss": 0.9141, + "step": 26910 + }, + { + "epoch": 0.1719842070965846, + "grad_norm": 0.7588580250740051, + "learning_rate": 9.818798768492354e-05, + "loss": 0.8255, + "step": 26920 + }, + { + "epoch": 0.1720480942463233, + "grad_norm": 0.9968806505203247, + "learning_rate": 9.81866488659451e-05, + "loss": 0.6958, + "step": 26930 + }, + { + "epoch": 0.17211198139606199, + "grad_norm": 0.7764785885810852, + "learning_rate": 9.818530956168543e-05, + "loss": 1.1488, + "step": 26940 + }, + { + "epoch": 0.1721758685458007, + "grad_norm": 0.6332468390464783, + "learning_rate": 9.818396977215801e-05, + "loss": 0.6837, + "step": 26950 + }, + { + "epoch": 0.1722397556955394, + "grad_norm": 
0.8513321876525879, + "learning_rate": 9.818262949737632e-05, + "loss": 0.7871, + "step": 26960 + }, + { + "epoch": 0.1723036428452781, + "grad_norm": 0.6733559370040894, + "learning_rate": 9.818128873735386e-05, + "loss": 0.8591, + "step": 26970 + }, + { + "epoch": 0.1723675299950168, + "grad_norm": 1.0465015172958374, + "learning_rate": 9.817994749210415e-05, + "loss": 0.8665, + "step": 26980 + }, + { + "epoch": 0.17243141714475552, + "grad_norm": 0.6963700652122498, + "learning_rate": 9.817860576164069e-05, + "loss": 0.8684, + "step": 26990 + }, + { + "epoch": 0.1724953042944942, + "grad_norm": 1.0664374828338623, + "learning_rate": 9.817726354597699e-05, + "loss": 0.6893, + "step": 27000 + }, + { + "epoch": 0.1725591914442329, + "grad_norm": 0.7583040595054626, + "learning_rate": 9.817592084512655e-05, + "loss": 0.9267, + "step": 27010 + }, + { + "epoch": 0.1726230785939716, + "grad_norm": 0.8282020092010498, + "learning_rate": 9.817457765910292e-05, + "loss": 0.7665, + "step": 27020 + }, + { + "epoch": 0.1726869657437103, + "grad_norm": 0.8650298118591309, + "learning_rate": 9.817323398791961e-05, + "loss": 1.0732, + "step": 27030 + }, + { + "epoch": 0.17275085289344902, + "grad_norm": 0.5665771961212158, + "learning_rate": 9.817188983159016e-05, + "loss": 0.92, + "step": 27040 + }, + { + "epoch": 0.17281474004318773, + "grad_norm": 1.4481645822525024, + "learning_rate": 9.817054519012811e-05, + "loss": 0.8976, + "step": 27050 + }, + { + "epoch": 0.1728786271929264, + "grad_norm": 0.7741625308990479, + "learning_rate": 9.8169200063547e-05, + "loss": 1.0959, + "step": 27060 + }, + { + "epoch": 0.1729425143426651, + "grad_norm": 0.7932523488998413, + "learning_rate": 9.816785445186036e-05, + "loss": 0.9241, + "step": 27070 + }, + { + "epoch": 0.17300640149240382, + "grad_norm": 0.6542154550552368, + "learning_rate": 9.816650835508177e-05, + "loss": 0.9807, + "step": 27080 + }, + { + "epoch": 0.17307028864214252, + "grad_norm": 0.7726758718490601, + "learning_rate": 9.816516177322477e-05, + "loss": 0.8918, + "step": 27090 + }, + { + "epoch": 0.17313417579188123, + "grad_norm": 0.8398792743682861, + "learning_rate": 9.81638147063029e-05, + "loss": 1.1571, + "step": 27100 + }, + { + "epoch": 0.17319806294161993, + "grad_norm": 3.0609123706817627, + "learning_rate": 9.816246715432977e-05, + "loss": 1.0103, + "step": 27110 + }, + { + "epoch": 0.1732619500913586, + "grad_norm": 0.8899956941604614, + "learning_rate": 9.816111911731892e-05, + "loss": 0.878, + "step": 27120 + }, + { + "epoch": 0.17332583724109732, + "grad_norm": 1.076644778251648, + "learning_rate": 9.815977059528393e-05, + "loss": 1.0136, + "step": 27130 + }, + { + "epoch": 0.17338972439083603, + "grad_norm": 2.1969175338745117, + "learning_rate": 9.81584215882384e-05, + "loss": 0.7375, + "step": 27140 + }, + { + "epoch": 0.17345361154057473, + "grad_norm": 0.9302259087562561, + "learning_rate": 9.815707209619589e-05, + "loss": 0.802, + "step": 27150 + }, + { + "epoch": 0.17351749869031344, + "grad_norm": 0.6798985004425049, + "learning_rate": 9.815572211917001e-05, + "loss": 0.7363, + "step": 27160 + }, + { + "epoch": 0.17358138584005214, + "grad_norm": 0.7445381879806519, + "learning_rate": 9.815437165717435e-05, + "loss": 1.024, + "step": 27170 + }, + { + "epoch": 0.17364527298979082, + "grad_norm": 0.7571766972541809, + "learning_rate": 9.81530207102225e-05, + "loss": 0.8216, + "step": 27180 + }, + { + "epoch": 0.17370916013952953, + "grad_norm": 1.2653508186340332, + "learning_rate": 9.815166927832809e-05, + "loss": 
0.8769, + "step": 27190 + }, + { + "epoch": 0.17377304728926823, + "grad_norm": 1.0241389274597168, + "learning_rate": 9.815031736150468e-05, + "loss": 0.8065, + "step": 27200 + }, + { + "epoch": 0.17383693443900694, + "grad_norm": 0.6065948605537415, + "learning_rate": 9.814896495976595e-05, + "loss": 0.8726, + "step": 27210 + }, + { + "epoch": 0.17390082158874565, + "grad_norm": 0.7081197500228882, + "learning_rate": 9.814761207312547e-05, + "loss": 0.9101, + "step": 27220 + }, + { + "epoch": 0.17396470873848435, + "grad_norm": 1.0318403244018555, + "learning_rate": 9.814625870159688e-05, + "loss": 0.9142, + "step": 27230 + }, + { + "epoch": 0.17402859588822303, + "grad_norm": 1.1322126388549805, + "learning_rate": 9.814490484519384e-05, + "loss": 0.8966, + "step": 27240 + }, + { + "epoch": 0.17409248303796174, + "grad_norm": 1.0569275617599487, + "learning_rate": 9.814355050392993e-05, + "loss": 0.8479, + "step": 27250 + }, + { + "epoch": 0.17415637018770044, + "grad_norm": 0.6752243041992188, + "learning_rate": 9.814219567781882e-05, + "loss": 0.8054, + "step": 27260 + }, + { + "epoch": 0.17422025733743915, + "grad_norm": 2.970486640930176, + "learning_rate": 9.814084036687417e-05, + "loss": 0.7318, + "step": 27270 + }, + { + "epoch": 0.17428414448717786, + "grad_norm": 1.1387560367584229, + "learning_rate": 9.813948457110957e-05, + "loss": 0.7659, + "step": 27280 + }, + { + "epoch": 0.17434803163691656, + "grad_norm": 0.7417890429496765, + "learning_rate": 9.813812829053874e-05, + "loss": 0.7819, + "step": 27290 + }, + { + "epoch": 0.17441191878665524, + "grad_norm": 1.0214507579803467, + "learning_rate": 9.813677152517533e-05, + "loss": 0.8779, + "step": 27300 + }, + { + "epoch": 0.17447580593639395, + "grad_norm": 0.9005577564239502, + "learning_rate": 9.813541427503296e-05, + "loss": 0.826, + "step": 27310 + }, + { + "epoch": 0.17453969308613265, + "grad_norm": 0.5254817008972168, + "learning_rate": 9.813405654012533e-05, + "loss": 0.7745, + "step": 27320 + }, + { + "epoch": 0.17460358023587136, + "grad_norm": 0.8588125109672546, + "learning_rate": 9.813269832046612e-05, + "loss": 0.8896, + "step": 27330 + }, + { + "epoch": 0.17466746738561006, + "grad_norm": 0.9681766033172607, + "learning_rate": 9.813133961606899e-05, + "loss": 0.9978, + "step": 27340 + }, + { + "epoch": 0.17473135453534877, + "grad_norm": 0.6579704880714417, + "learning_rate": 9.812998042694762e-05, + "loss": 0.9591, + "step": 27350 + }, + { + "epoch": 0.17479524168508745, + "grad_norm": 1.3134688138961792, + "learning_rate": 9.812862075311572e-05, + "loss": 0.9493, + "step": 27360 + }, + { + "epoch": 0.17485912883482616, + "grad_norm": 1.0650473833084106, + "learning_rate": 9.812726059458697e-05, + "loss": 0.6251, + "step": 27370 + }, + { + "epoch": 0.17492301598456486, + "grad_norm": 0.9300364851951599, + "learning_rate": 9.812589995137507e-05, + "loss": 0.8485, + "step": 27380 + }, + { + "epoch": 0.17498690313430357, + "grad_norm": 0.8550617098808289, + "learning_rate": 9.812453882349373e-05, + "loss": 0.9799, + "step": 27390 + }, + { + "epoch": 0.17505079028404227, + "grad_norm": 1.0517045259475708, + "learning_rate": 9.812317721095662e-05, + "loss": 1.092, + "step": 27400 + }, + { + "epoch": 0.17511467743378098, + "grad_norm": 0.8268793821334839, + "learning_rate": 9.812181511377752e-05, + "loss": 0.8651, + "step": 27410 + }, + { + "epoch": 0.17517856458351966, + "grad_norm": 1.0271008014678955, + "learning_rate": 9.81204525319701e-05, + "loss": 1.0251, + "step": 27420 + }, + { + "epoch": 
0.17524245173325836, + "grad_norm": 1.1085052490234375, + "learning_rate": 9.811908946554809e-05, + "loss": 0.897, + "step": 27430 + }, + { + "epoch": 0.17530633888299707, + "grad_norm": 0.9341952204704285, + "learning_rate": 9.811772591452521e-05, + "loss": 0.9069, + "step": 27440 + }, + { + "epoch": 0.17537022603273578, + "grad_norm": 1.8567582368850708, + "learning_rate": 9.811636187891521e-05, + "loss": 0.8957, + "step": 27450 + }, + { + "epoch": 0.17543411318247448, + "grad_norm": 0.8161446452140808, + "learning_rate": 9.811499735873182e-05, + "loss": 0.9018, + "step": 27460 + }, + { + "epoch": 0.1754980003322132, + "grad_norm": 0.8577879667282104, + "learning_rate": 9.811363235398878e-05, + "loss": 0.9191, + "step": 27470 + }, + { + "epoch": 0.17556188748195187, + "grad_norm": 1.067243218421936, + "learning_rate": 9.811226686469985e-05, + "loss": 0.6471, + "step": 27480 + }, + { + "epoch": 0.17562577463169057, + "grad_norm": 1.763016939163208, + "learning_rate": 9.811090089087875e-05, + "loss": 0.7081, + "step": 27490 + }, + { + "epoch": 0.17568966178142928, + "grad_norm": 1.0972936153411865, + "learning_rate": 9.810953443253927e-05, + "loss": 0.821, + "step": 27500 + }, + { + "epoch": 0.17575354893116799, + "grad_norm": 0.691754937171936, + "learning_rate": 9.810816748969516e-05, + "loss": 0.9142, + "step": 27510 + }, + { + "epoch": 0.1758174360809067, + "grad_norm": 0.7978219389915466, + "learning_rate": 9.810680006236017e-05, + "loss": 1.0896, + "step": 27520 + }, + { + "epoch": 0.1758813232306454, + "grad_norm": 0.5945133566856384, + "learning_rate": 9.81054321505481e-05, + "loss": 1.1876, + "step": 27530 + }, + { + "epoch": 0.17594521038038408, + "grad_norm": 0.7158066034317017, + "learning_rate": 9.81040637542727e-05, + "loss": 0.8112, + "step": 27540 + }, + { + "epoch": 0.17600909753012278, + "grad_norm": 0.7002230882644653, + "learning_rate": 9.810269487354777e-05, + "loss": 0.7471, + "step": 27550 + }, + { + "epoch": 0.1760729846798615, + "grad_norm": 1.0922120809555054, + "learning_rate": 9.810132550838709e-05, + "loss": 0.6824, + "step": 27560 + }, + { + "epoch": 0.1761368718296002, + "grad_norm": 0.7432847023010254, + "learning_rate": 9.809995565880443e-05, + "loss": 0.9265, + "step": 27570 + }, + { + "epoch": 0.1762007589793389, + "grad_norm": 0.5448877215385437, + "learning_rate": 9.809858532481362e-05, + "loss": 0.8096, + "step": 27580 + }, + { + "epoch": 0.1762646461290776, + "grad_norm": 0.7894873023033142, + "learning_rate": 9.809721450642844e-05, + "loss": 0.9688, + "step": 27590 + }, + { + "epoch": 0.17632853327881629, + "grad_norm": 1.4557750225067139, + "learning_rate": 9.80958432036627e-05, + "loss": 0.8877, + "step": 27600 + }, + { + "epoch": 0.176392420428555, + "grad_norm": 0.8581323623657227, + "learning_rate": 9.809447141653022e-05, + "loss": 0.8595, + "step": 27610 + }, + { + "epoch": 0.1764563075782937, + "grad_norm": 1.0392162799835205, + "learning_rate": 9.809309914504479e-05, + "loss": 0.9148, + "step": 27620 + }, + { + "epoch": 0.1765201947280324, + "grad_norm": 0.5153777003288269, + "learning_rate": 9.809172638922024e-05, + "loss": 0.9317, + "step": 27630 + }, + { + "epoch": 0.1765840818777711, + "grad_norm": 0.6191779971122742, + "learning_rate": 9.809035314907043e-05, + "loss": 0.7501, + "step": 27640 + }, + { + "epoch": 0.17664796902750982, + "grad_norm": 1.2180255651474, + "learning_rate": 9.808897942460912e-05, + "loss": 0.9112, + "step": 27650 + }, + { + "epoch": 0.17671185617724852, + "grad_norm": 0.8534625768661499, + "learning_rate": 
9.808760521585021e-05, + "loss": 1.1213, + "step": 27660 + }, + { + "epoch": 0.1767757433269872, + "grad_norm": 0.7606062889099121, + "learning_rate": 9.808623052280752e-05, + "loss": 0.9272, + "step": 27670 + }, + { + "epoch": 0.1768396304767259, + "grad_norm": 0.8535296320915222, + "learning_rate": 9.808485534549488e-05, + "loss": 0.9289, + "step": 27680 + }, + { + "epoch": 0.1769035176264646, + "grad_norm": 0.9565229415893555, + "learning_rate": 9.808347968392613e-05, + "loss": 1.1181, + "step": 27690 + }, + { + "epoch": 0.17696740477620332, + "grad_norm": 0.8111469149589539, + "learning_rate": 9.808210353811516e-05, + "loss": 0.9397, + "step": 27700 + }, + { + "epoch": 0.17703129192594202, + "grad_norm": 0.8592471480369568, + "learning_rate": 9.808072690807582e-05, + "loss": 0.9435, + "step": 27710 + }, + { + "epoch": 0.17709517907568073, + "grad_norm": 0.4907069206237793, + "learning_rate": 9.807934979382194e-05, + "loss": 1.0336, + "step": 27720 + }, + { + "epoch": 0.1771590662254194, + "grad_norm": 1.013027310371399, + "learning_rate": 9.807797219536743e-05, + "loss": 0.8746, + "step": 27730 + }, + { + "epoch": 0.17722295337515812, + "grad_norm": 0.910508394241333, + "learning_rate": 9.807659411272614e-05, + "loss": 0.7623, + "step": 27740 + }, + { + "epoch": 0.17728684052489682, + "grad_norm": 1.0840027332305908, + "learning_rate": 9.807521554591194e-05, + "loss": 1.2327, + "step": 27750 + }, + { + "epoch": 0.17735072767463553, + "grad_norm": 0.9532760977745056, + "learning_rate": 9.807383649493875e-05, + "loss": 0.8192, + "step": 27760 + }, + { + "epoch": 0.17741461482437423, + "grad_norm": 1.1489735841751099, + "learning_rate": 9.807245695982044e-05, + "loss": 0.7777, + "step": 27770 + }, + { + "epoch": 0.17747850197411294, + "grad_norm": 0.6683622598648071, + "learning_rate": 9.807107694057089e-05, + "loss": 0.6466, + "step": 27780 + }, + { + "epoch": 0.17754238912385162, + "grad_norm": 1.4319005012512207, + "learning_rate": 9.806969643720401e-05, + "loss": 1.1009, + "step": 27790 + }, + { + "epoch": 0.17760627627359032, + "grad_norm": 1.017777919769287, + "learning_rate": 9.80683154497337e-05, + "loss": 0.9284, + "step": 27800 + }, + { + "epoch": 0.17767016342332903, + "grad_norm": 0.8920938968658447, + "learning_rate": 9.806693397817386e-05, + "loss": 0.8675, + "step": 27810 + }, + { + "epoch": 0.17773405057306774, + "grad_norm": 1.0226699113845825, + "learning_rate": 9.806555202253842e-05, + "loss": 1.0085, + "step": 27820 + }, + { + "epoch": 0.17779793772280644, + "grad_norm": 0.841672956943512, + "learning_rate": 9.806416958284127e-05, + "loss": 0.9486, + "step": 27830 + }, + { + "epoch": 0.17786182487254515, + "grad_norm": 0.7303531765937805, + "learning_rate": 9.806278665909638e-05, + "loss": 0.9338, + "step": 27840 + }, + { + "epoch": 0.17792571202228383, + "grad_norm": 0.723166823387146, + "learning_rate": 9.806140325131763e-05, + "loss": 0.9934, + "step": 27850 + }, + { + "epoch": 0.17798959917202253, + "grad_norm": 1.413759469985962, + "learning_rate": 9.806001935951899e-05, + "loss": 1.061, + "step": 27860 + }, + { + "epoch": 0.17805348632176124, + "grad_norm": 0.8165162205696106, + "learning_rate": 9.805863498371435e-05, + "loss": 0.9142, + "step": 27870 + }, + { + "epoch": 0.17811737347149995, + "grad_norm": 0.6334624886512756, + "learning_rate": 9.805725012391768e-05, + "loss": 0.9758, + "step": 27880 + }, + { + "epoch": 0.17818126062123865, + "grad_norm": 0.7921863794326782, + "learning_rate": 9.805586478014294e-05, + "loss": 1.4444, + "step": 27890 + }, + { + 
"epoch": 0.17824514777097736, + "grad_norm": 0.94256192445755, + "learning_rate": 9.805447895240407e-05, + "loss": 0.7907, + "step": 27900 + }, + { + "epoch": 0.17830903492071604, + "grad_norm": 0.948287844657898, + "learning_rate": 9.805309264071502e-05, + "loss": 0.9496, + "step": 27910 + }, + { + "epoch": 0.17837292207045474, + "grad_norm": 0.5825172066688538, + "learning_rate": 9.805170584508976e-05, + "loss": 1.1519, + "step": 27920 + }, + { + "epoch": 0.17843680922019345, + "grad_norm": 1.1197121143341064, + "learning_rate": 9.80504573152731e-05, + "loss": 0.866, + "step": 27930 + }, + { + "epoch": 0.17850069636993215, + "grad_norm": 0.8723785877227783, + "learning_rate": 9.804906960020751e-05, + "loss": 0.7456, + "step": 27940 + }, + { + "epoch": 0.17856458351967086, + "grad_norm": 0.6122041940689087, + "learning_rate": 9.804768140124621e-05, + "loss": 0.7238, + "step": 27950 + }, + { + "epoch": 0.17862847066940957, + "grad_norm": 0.7413936853408813, + "learning_rate": 9.80462927184032e-05, + "loss": 0.863, + "step": 27960 + }, + { + "epoch": 0.17869235781914825, + "grad_norm": 0.7080979943275452, + "learning_rate": 9.804490355169246e-05, + "loss": 0.8773, + "step": 27970 + }, + { + "epoch": 0.17875624496888695, + "grad_norm": 0.9712502956390381, + "learning_rate": 9.804351390112799e-05, + "loss": 0.9399, + "step": 27980 + }, + { + "epoch": 0.17882013211862566, + "grad_norm": 1.720031499862671, + "learning_rate": 9.804212376672375e-05, + "loss": 1.4551, + "step": 27990 + }, + { + "epoch": 0.17888401926836436, + "grad_norm": 3.504847526550293, + "learning_rate": 9.804073314849375e-05, + "loss": 1.1386, + "step": 28000 + }, + { + "epoch": 0.17894790641810307, + "grad_norm": 0.8636149168014526, + "learning_rate": 9.803934204645202e-05, + "loss": 0.8948, + "step": 28010 + }, + { + "epoch": 0.17901179356784178, + "grad_norm": 1.0400105714797974, + "learning_rate": 9.803795046061257e-05, + "loss": 0.8915, + "step": 28020 + }, + { + "epoch": 0.17907568071758045, + "grad_norm": 0.6742110848426819, + "learning_rate": 9.803655839098938e-05, + "loss": 1.0636, + "step": 28030 + }, + { + "epoch": 0.17913956786731916, + "grad_norm": 1.9153518676757812, + "learning_rate": 9.80351658375965e-05, + "loss": 0.8614, + "step": 28040 + }, + { + "epoch": 0.17920345501705787, + "grad_norm": 0.7775312662124634, + "learning_rate": 9.803377280044794e-05, + "loss": 0.869, + "step": 28050 + }, + { + "epoch": 0.17926734216679657, + "grad_norm": 0.558363676071167, + "learning_rate": 9.803237927955772e-05, + "loss": 0.7641, + "step": 28060 + }, + { + "epoch": 0.17933122931653528, + "grad_norm": 0.7154206037521362, + "learning_rate": 9.80309852749399e-05, + "loss": 0.7582, + "step": 28070 + }, + { + "epoch": 0.17939511646627399, + "grad_norm": 0.7916398048400879, + "learning_rate": 9.802959078660851e-05, + "loss": 1.0197, + "step": 28080 + }, + { + "epoch": 0.17945900361601266, + "grad_norm": 1.3828551769256592, + "learning_rate": 9.802819581457758e-05, + "loss": 0.9683, + "step": 28090 + }, + { + "epoch": 0.17952289076575137, + "grad_norm": 1.6986253261566162, + "learning_rate": 9.802680035886118e-05, + "loss": 1.0508, + "step": 28100 + }, + { + "epoch": 0.17958677791549008, + "grad_norm": 0.576038658618927, + "learning_rate": 9.802540441947334e-05, + "loss": 0.6362, + "step": 28110 + }, + { + "epoch": 0.17965066506522878, + "grad_norm": 0.8584470748901367, + "learning_rate": 9.802400799642814e-05, + "loss": 0.8484, + "step": 28120 + }, + { + "epoch": 0.1797145522149675, + "grad_norm": 0.6002673506736755, + 
"learning_rate": 9.802261108973962e-05, + "loss": 0.6569, + "step": 28130 + }, + { + "epoch": 0.1797784393647062, + "grad_norm": 1.76115083694458, + "learning_rate": 9.802121369942188e-05, + "loss": 1.0472, + "step": 28140 + }, + { + "epoch": 0.17984232651444487, + "grad_norm": 0.6964778304100037, + "learning_rate": 9.801981582548896e-05, + "loss": 1.0831, + "step": 28150 + }, + { + "epoch": 0.17990621366418358, + "grad_norm": 0.6689683198928833, + "learning_rate": 9.801841746795495e-05, + "loss": 1.013, + "step": 28160 + }, + { + "epoch": 0.17997010081392228, + "grad_norm": 1.5421873331069946, + "learning_rate": 9.801701862683393e-05, + "loss": 0.7561, + "step": 28170 + }, + { + "epoch": 0.180033987963661, + "grad_norm": 0.8853926062583923, + "learning_rate": 9.801561930214001e-05, + "loss": 0.7668, + "step": 28180 + }, + { + "epoch": 0.1800978751133997, + "grad_norm": 0.7320166826248169, + "learning_rate": 9.801421949388723e-05, + "loss": 0.8514, + "step": 28190 + }, + { + "epoch": 0.1801617622631384, + "grad_norm": 1.5770325660705566, + "learning_rate": 9.801281920208976e-05, + "loss": 1.2304, + "step": 28200 + }, + { + "epoch": 0.18022564941287708, + "grad_norm": 0.8628795146942139, + "learning_rate": 9.801141842676164e-05, + "loss": 0.999, + "step": 28210 + }, + { + "epoch": 0.1802895365626158, + "grad_norm": 1.4478768110275269, + "learning_rate": 9.801001716791701e-05, + "loss": 0.7788, + "step": 28220 + }, + { + "epoch": 0.1803534237123545, + "grad_norm": 1.1721216440200806, + "learning_rate": 9.800861542556998e-05, + "loss": 0.6793, + "step": 28230 + }, + { + "epoch": 0.1804173108620932, + "grad_norm": 1.0601638555526733, + "learning_rate": 9.800721319973465e-05, + "loss": 0.914, + "step": 28240 + }, + { + "epoch": 0.1804811980118319, + "grad_norm": 1.330712914466858, + "learning_rate": 9.800581049042515e-05, + "loss": 0.9251, + "step": 28250 + }, + { + "epoch": 0.1805450851615706, + "grad_norm": 1.723365306854248, + "learning_rate": 9.80044072976556e-05, + "loss": 0.8571, + "step": 28260 + }, + { + "epoch": 0.1806089723113093, + "grad_norm": 1.0684921741485596, + "learning_rate": 9.800300362144015e-05, + "loss": 0.8753, + "step": 28270 + }, + { + "epoch": 0.180672859461048, + "grad_norm": 0.870155394077301, + "learning_rate": 9.800159946179292e-05, + "loss": 0.8745, + "step": 28280 + }, + { + "epoch": 0.1807367466107867, + "grad_norm": 0.8147633075714111, + "learning_rate": 9.800019481872807e-05, + "loss": 0.8873, + "step": 28290 + }, + { + "epoch": 0.1808006337605254, + "grad_norm": 0.8370197415351868, + "learning_rate": 9.799878969225971e-05, + "loss": 0.7692, + "step": 28300 + }, + { + "epoch": 0.18086452091026411, + "grad_norm": 0.695644199848175, + "learning_rate": 9.799738408240202e-05, + "loss": 1.0125, + "step": 28310 + }, + { + "epoch": 0.18092840806000282, + "grad_norm": 0.8963587284088135, + "learning_rate": 9.799597798916915e-05, + "loss": 0.9593, + "step": 28320 + }, + { + "epoch": 0.1809922952097415, + "grad_norm": 0.9512690305709839, + "learning_rate": 9.799457141257527e-05, + "loss": 0.9553, + "step": 28330 + }, + { + "epoch": 0.1810561823594802, + "grad_norm": 0.8540796637535095, + "learning_rate": 9.799316435263452e-05, + "loss": 0.8412, + "step": 28340 + }, + { + "epoch": 0.1811200695092189, + "grad_norm": 0.7773367762565613, + "learning_rate": 9.799175680936109e-05, + "loss": 0.8601, + "step": 28350 + }, + { + "epoch": 0.18118395665895762, + "grad_norm": 2.9732205867767334, + "learning_rate": 9.799034878276916e-05, + "loss": 0.8188, + "step": 28360 + }, + { + 
"epoch": 0.18124784380869632, + "grad_norm": 1.0311912298202515, + "learning_rate": 9.798894027287289e-05, + "loss": 0.6879, + "step": 28370 + }, + { + "epoch": 0.18131173095843503, + "grad_norm": 1.366125464439392, + "learning_rate": 9.798753127968647e-05, + "loss": 0.7352, + "step": 28380 + }, + { + "epoch": 0.1813756181081737, + "grad_norm": 0.7077022790908813, + "learning_rate": 9.79861218032241e-05, + "loss": 0.9083, + "step": 28390 + }, + { + "epoch": 0.18143950525791241, + "grad_norm": 0.9163293242454529, + "learning_rate": 9.798471184349997e-05, + "loss": 1.0788, + "step": 28400 + }, + { + "epoch": 0.18150339240765112, + "grad_norm": 0.7429232001304626, + "learning_rate": 9.798330140052829e-05, + "loss": 1.2201, + "step": 28410 + }, + { + "epoch": 0.18156727955738983, + "grad_norm": 0.7430415749549866, + "learning_rate": 9.798189047432323e-05, + "loss": 0.7114, + "step": 28420 + }, + { + "epoch": 0.18163116670712853, + "grad_norm": 0.9560526013374329, + "learning_rate": 9.798047906489905e-05, + "loss": 0.9409, + "step": 28430 + }, + { + "epoch": 0.18169505385686724, + "grad_norm": 1.2373318672180176, + "learning_rate": 9.797906717226992e-05, + "loss": 0.9829, + "step": 28440 + }, + { + "epoch": 0.18175894100660592, + "grad_norm": 1.158624291419983, + "learning_rate": 9.797765479645007e-05, + "loss": 0.8655, + "step": 28450 + }, + { + "epoch": 0.18182282815634462, + "grad_norm": 0.6600698232650757, + "learning_rate": 9.797624193745374e-05, + "loss": 0.9877, + "step": 28460 + }, + { + "epoch": 0.18188671530608333, + "grad_norm": 0.8037683963775635, + "learning_rate": 9.797482859529514e-05, + "loss": 0.7506, + "step": 28470 + }, + { + "epoch": 0.18195060245582204, + "grad_norm": 0.7499133348464966, + "learning_rate": 9.797341476998853e-05, + "loss": 0.7967, + "step": 28480 + }, + { + "epoch": 0.18201448960556074, + "grad_norm": 0.6633144617080688, + "learning_rate": 9.797200046154811e-05, + "loss": 0.8313, + "step": 28490 + }, + { + "epoch": 0.18207837675529945, + "grad_norm": 1.5353120565414429, + "learning_rate": 9.797058566998816e-05, + "loss": 0.7916, + "step": 28500 + }, + { + "epoch": 0.18214226390503815, + "grad_norm": 1.1196563243865967, + "learning_rate": 9.79691703953229e-05, + "loss": 0.8152, + "step": 28510 + }, + { + "epoch": 0.18220615105477683, + "grad_norm": 0.7169744968414307, + "learning_rate": 9.79677546375666e-05, + "loss": 0.9211, + "step": 28520 + }, + { + "epoch": 0.18227003820451554, + "grad_norm": 0.8993495106697083, + "learning_rate": 9.796633839673352e-05, + "loss": 1.0358, + "step": 28530 + }, + { + "epoch": 0.18233392535425424, + "grad_norm": 0.600199282169342, + "learning_rate": 9.79649216728379e-05, + "loss": 0.9579, + "step": 28540 + }, + { + "epoch": 0.18239781250399295, + "grad_norm": 0.901833176612854, + "learning_rate": 9.796350446589404e-05, + "loss": 0.8611, + "step": 28550 + }, + { + "epoch": 0.18246169965373166, + "grad_norm": 0.5698120594024658, + "learning_rate": 9.796208677591619e-05, + "loss": 0.6931, + "step": 28560 + }, + { + "epoch": 0.18252558680347036, + "grad_norm": 0.9085325598716736, + "learning_rate": 9.796066860291861e-05, + "loss": 0.7067, + "step": 28570 + }, + { + "epoch": 0.18258947395320904, + "grad_norm": 0.8795328140258789, + "learning_rate": 9.795924994691564e-05, + "loss": 0.938, + "step": 28580 + }, + { + "epoch": 0.18265336110294775, + "grad_norm": 0.7105121612548828, + "learning_rate": 9.795783080792151e-05, + "loss": 1.0905, + "step": 28590 + }, + { + "epoch": 0.18271724825268645, + "grad_norm": 1.094942569732666, + 
"learning_rate": 9.795641118595053e-05, + "loss": 0.9418, + "step": 28600 + }, + { + "epoch": 0.18278113540242516, + "grad_norm": 1.1036394834518433, + "learning_rate": 9.795499108101702e-05, + "loss": 0.7659, + "step": 28610 + }, + { + "epoch": 0.18284502255216387, + "grad_norm": 0.9667114019393921, + "learning_rate": 9.795357049313526e-05, + "loss": 0.7326, + "step": 28620 + }, + { + "epoch": 0.18290890970190257, + "grad_norm": 0.8882653713226318, + "learning_rate": 9.795214942231956e-05, + "loss": 0.9086, + "step": 28630 + }, + { + "epoch": 0.18297279685164125, + "grad_norm": 0.941718339920044, + "learning_rate": 9.795072786858421e-05, + "loss": 0.8087, + "step": 28640 + }, + { + "epoch": 0.18303668400137996, + "grad_norm": 0.749993622303009, + "learning_rate": 9.794930583194357e-05, + "loss": 0.8691, + "step": 28650 + }, + { + "epoch": 0.18310057115111866, + "grad_norm": 0.9505361318588257, + "learning_rate": 9.794788331241193e-05, + "loss": 0.9151, + "step": 28660 + }, + { + "epoch": 0.18316445830085737, + "grad_norm": 0.7003071308135986, + "learning_rate": 9.794646031000363e-05, + "loss": 0.9178, + "step": 28670 + }, + { + "epoch": 0.18322834545059608, + "grad_norm": 0.7516195178031921, + "learning_rate": 9.7945036824733e-05, + "loss": 0.9932, + "step": 28680 + }, + { + "epoch": 0.18329223260033478, + "grad_norm": 0.737834095954895, + "learning_rate": 9.794361285661435e-05, + "loss": 1.0445, + "step": 28690 + }, + { + "epoch": 0.18335611975007346, + "grad_norm": 1.5142183303833008, + "learning_rate": 9.794218840566205e-05, + "loss": 0.9432, + "step": 28700 + }, + { + "epoch": 0.18342000689981217, + "grad_norm": 0.9545480012893677, + "learning_rate": 9.794076347189045e-05, + "loss": 1.0564, + "step": 28710 + }, + { + "epoch": 0.18348389404955087, + "grad_norm": 1.4519827365875244, + "learning_rate": 9.793933805531387e-05, + "loss": 1.0927, + "step": 28720 + }, + { + "epoch": 0.18354778119928958, + "grad_norm": 1.179065465927124, + "learning_rate": 9.793791215594669e-05, + "loss": 0.8412, + "step": 28730 + }, + { + "epoch": 0.18361166834902828, + "grad_norm": 0.5378461480140686, + "learning_rate": 9.793648577380325e-05, + "loss": 0.9532, + "step": 28740 + }, + { + "epoch": 0.183675555498767, + "grad_norm": 0.9860353469848633, + "learning_rate": 9.793505890889795e-05, + "loss": 0.7965, + "step": 28750 + }, + { + "epoch": 0.18373944264850567, + "grad_norm": 0.7210092544555664, + "learning_rate": 9.793363156124513e-05, + "loss": 0.9562, + "step": 28760 + }, + { + "epoch": 0.18380332979824437, + "grad_norm": 0.9851694703102112, + "learning_rate": 9.793220373085917e-05, + "loss": 0.9258, + "step": 28770 + }, + { + "epoch": 0.18386721694798308, + "grad_norm": 1.2864528894424438, + "learning_rate": 9.793077541775444e-05, + "loss": 0.8495, + "step": 28780 + }, + { + "epoch": 0.1839311040977218, + "grad_norm": 0.5326701402664185, + "learning_rate": 9.792934662194534e-05, + "loss": 0.7886, + "step": 28790 + }, + { + "epoch": 0.1839949912474605, + "grad_norm": 0.9040879011154175, + "learning_rate": 9.792791734344627e-05, + "loss": 0.7028, + "step": 28800 + }, + { + "epoch": 0.1840588783971992, + "grad_norm": 0.7170331478118896, + "learning_rate": 9.792648758227159e-05, + "loss": 0.957, + "step": 28810 + }, + { + "epoch": 0.18412276554693788, + "grad_norm": 1.0186604261398315, + "learning_rate": 9.792505733843573e-05, + "loss": 0.8086, + "step": 28820 + }, + { + "epoch": 0.18418665269667658, + "grad_norm": 1.5476514101028442, + "learning_rate": 9.792362661195307e-05, + "loss": 0.9259, + "step": 
28830 + }, + { + "epoch": 0.1842505398464153, + "grad_norm": 0.7610865235328674, + "learning_rate": 9.792219540283804e-05, + "loss": 0.867, + "step": 28840 + }, + { + "epoch": 0.184314426996154, + "grad_norm": 0.6964796185493469, + "learning_rate": 9.792076371110503e-05, + "loss": 0.8641, + "step": 28850 + }, + { + "epoch": 0.1843783141458927, + "grad_norm": 1.106491208076477, + "learning_rate": 9.791933153676849e-05, + "loss": 0.9952, + "step": 28860 + }, + { + "epoch": 0.1844422012956314, + "grad_norm": 1.025023102760315, + "learning_rate": 9.791789887984282e-05, + "loss": 0.9773, + "step": 28870 + }, + { + "epoch": 0.1845060884453701, + "grad_norm": 0.7797799706459045, + "learning_rate": 9.791646574034245e-05, + "loss": 0.7025, + "step": 28880 + }, + { + "epoch": 0.1845699755951088, + "grad_norm": 0.6405588984489441, + "learning_rate": 9.791503211828182e-05, + "loss": 0.7509, + "step": 28890 + }, + { + "epoch": 0.1846338627448475, + "grad_norm": 1.01836097240448, + "learning_rate": 9.791359801367536e-05, + "loss": 0.7725, + "step": 28900 + }, + { + "epoch": 0.1846977498945862, + "grad_norm": 1.2316473722457886, + "learning_rate": 9.791216342653751e-05, + "loss": 0.8199, + "step": 28910 + }, + { + "epoch": 0.1847616370443249, + "grad_norm": 1.0237054824829102, + "learning_rate": 9.791072835688274e-05, + "loss": 0.7915, + "step": 28920 + }, + { + "epoch": 0.18482552419406362, + "grad_norm": 0.6611847877502441, + "learning_rate": 9.790929280472547e-05, + "loss": 0.8064, + "step": 28930 + }, + { + "epoch": 0.1848894113438023, + "grad_norm": 0.6756503582000732, + "learning_rate": 9.790785677008018e-05, + "loss": 0.7544, + "step": 28940 + }, + { + "epoch": 0.184953298493541, + "grad_norm": 1.1825060844421387, + "learning_rate": 9.790642025296134e-05, + "loss": 1.0022, + "step": 28950 + }, + { + "epoch": 0.1850171856432797, + "grad_norm": 1.195821762084961, + "learning_rate": 9.790498325338339e-05, + "loss": 1.0366, + "step": 28960 + }, + { + "epoch": 0.18508107279301841, + "grad_norm": 3.405341863632202, + "learning_rate": 9.790354577136083e-05, + "loss": 0.978, + "step": 28970 + }, + { + "epoch": 0.18514495994275712, + "grad_norm": 1.0382331609725952, + "learning_rate": 9.790210780690811e-05, + "loss": 0.9581, + "step": 28980 + }, + { + "epoch": 0.18520884709249583, + "grad_norm": 0.6907293200492859, + "learning_rate": 9.790066936003972e-05, + "loss": 0.9692, + "step": 28990 + }, + { + "epoch": 0.1852727342422345, + "grad_norm": 0.8222552537918091, + "learning_rate": 9.789923043077015e-05, + "loss": 1.1995, + "step": 29000 + }, + { + "epoch": 0.1853366213919732, + "grad_norm": 0.5325214862823486, + "learning_rate": 9.78977910191139e-05, + "loss": 0.9084, + "step": 29010 + }, + { + "epoch": 0.18540050854171192, + "grad_norm": 0.6033929586410522, + "learning_rate": 9.789635112508544e-05, + "loss": 0.9668, + "step": 29020 + }, + { + "epoch": 0.18546439569145062, + "grad_norm": 0.604171872138977, + "learning_rate": 9.78949107486993e-05, + "loss": 0.8394, + "step": 29030 + }, + { + "epoch": 0.18552828284118933, + "grad_norm": 0.6410810947418213, + "learning_rate": 9.789346988996997e-05, + "loss": 1.072, + "step": 29040 + }, + { + "epoch": 0.18559216999092804, + "grad_norm": 0.8470253348350525, + "learning_rate": 9.789202854891198e-05, + "loss": 0.9716, + "step": 29050 + }, + { + "epoch": 0.1856560571406667, + "grad_norm": 0.9727482199668884, + "learning_rate": 9.789058672553982e-05, + "loss": 0.9176, + "step": 29060 + }, + { + "epoch": 0.18571994429040542, + "grad_norm": 0.9362789988517761, + 
"learning_rate": 9.7889144419868e-05, + "loss": 0.8712, + "step": 29070 + }, + { + "epoch": 0.18578383144014413, + "grad_norm": 0.6700981259346008, + "learning_rate": 9.788770163191108e-05, + "loss": 0.6975, + "step": 29080 + }, + { + "epoch": 0.18584771858988283, + "grad_norm": 0.863276481628418, + "learning_rate": 9.788625836168359e-05, + "loss": 1.2225, + "step": 29090 + }, + { + "epoch": 0.18591160573962154, + "grad_norm": 0.9833418130874634, + "learning_rate": 9.788481460920003e-05, + "loss": 0.92, + "step": 29100 + }, + { + "epoch": 0.18597549288936024, + "grad_norm": 1.1162675619125366, + "learning_rate": 9.788337037447497e-05, + "loss": 0.765, + "step": 29110 + }, + { + "epoch": 0.18603938003909892, + "grad_norm": 0.8579927086830139, + "learning_rate": 9.788192565752294e-05, + "loss": 0.8593, + "step": 29120 + }, + { + "epoch": 0.18610326718883763, + "grad_norm": 0.6623185276985168, + "learning_rate": 9.788048045835851e-05, + "loss": 0.9438, + "step": 29130 + }, + { + "epoch": 0.18616715433857633, + "grad_norm": 1.0203254222869873, + "learning_rate": 9.78790347769962e-05, + "loss": 0.9786, + "step": 29140 + }, + { + "epoch": 0.18623104148831504, + "grad_norm": 1.5538065433502197, + "learning_rate": 9.78775886134506e-05, + "loss": 0.7127, + "step": 29150 + }, + { + "epoch": 0.18629492863805375, + "grad_norm": 0.5423676371574402, + "learning_rate": 9.787614196773627e-05, + "loss": 0.9467, + "step": 29160 + }, + { + "epoch": 0.18635881578779245, + "grad_norm": 0.8626308441162109, + "learning_rate": 9.787469483986775e-05, + "loss": 1.054, + "step": 29170 + }, + { + "epoch": 0.18642270293753113, + "grad_norm": 2.267576217651367, + "learning_rate": 9.787324722985966e-05, + "loss": 0.8235, + "step": 29180 + }, + { + "epoch": 0.18648659008726984, + "grad_norm": 1.2194722890853882, + "learning_rate": 9.787179913772653e-05, + "loss": 1.046, + "step": 29190 + }, + { + "epoch": 0.18655047723700854, + "grad_norm": 1.2716878652572632, + "learning_rate": 9.787035056348298e-05, + "loss": 1.0831, + "step": 29200 + }, + { + "epoch": 0.18661436438674725, + "grad_norm": 0.5902767181396484, + "learning_rate": 9.786890150714359e-05, + "loss": 0.9125, + "step": 29210 + }, + { + "epoch": 0.18667825153648596, + "grad_norm": 0.6737661361694336, + "learning_rate": 9.786745196872295e-05, + "loss": 0.8752, + "step": 29220 + }, + { + "epoch": 0.18674213868622466, + "grad_norm": 0.7880046367645264, + "learning_rate": 9.786600194823565e-05, + "loss": 0.7642, + "step": 29230 + }, + { + "epoch": 0.18680602583596334, + "grad_norm": 1.327628254890442, + "learning_rate": 9.78645514456963e-05, + "loss": 0.668, + "step": 29240 + }, + { + "epoch": 0.18686991298570205, + "grad_norm": 1.034236192703247, + "learning_rate": 9.786310046111951e-05, + "loss": 0.9501, + "step": 29250 + }, + { + "epoch": 0.18693380013544075, + "grad_norm": 0.7702693939208984, + "learning_rate": 9.78616489945199e-05, + "loss": 0.9508, + "step": 29260 + }, + { + "epoch": 0.18699768728517946, + "grad_norm": 0.8737154603004456, + "learning_rate": 9.786019704591206e-05, + "loss": 0.8081, + "step": 29270 + }, + { + "epoch": 0.18706157443491817, + "grad_norm": 0.7933652400970459, + "learning_rate": 9.785874461531064e-05, + "loss": 0.8241, + "step": 29280 + }, + { + "epoch": 0.18712546158465687, + "grad_norm": 1.6798765659332275, + "learning_rate": 9.785729170273026e-05, + "loss": 0.8096, + "step": 29290 + }, + { + "epoch": 0.18718934873439558, + "grad_norm": 0.7516373991966248, + "learning_rate": 9.785583830818554e-05, + "loss": 0.8489, + "step": 29300 
+ }, + { + "epoch": 0.18725323588413426, + "grad_norm": 0.5362650752067566, + "learning_rate": 9.785438443169115e-05, + "loss": 0.8583, + "step": 29310 + }, + { + "epoch": 0.18731712303387296, + "grad_norm": 0.5288386940956116, + "learning_rate": 9.785293007326169e-05, + "loss": 0.8078, + "step": 29320 + }, + { + "epoch": 0.18738101018361167, + "grad_norm": 0.7445020079612732, + "learning_rate": 9.785147523291183e-05, + "loss": 0.9432, + "step": 29330 + }, + { + "epoch": 0.18744489733335037, + "grad_norm": 0.8663593530654907, + "learning_rate": 9.78500199106562e-05, + "loss": 0.8345, + "step": 29340 + }, + { + "epoch": 0.18750878448308908, + "grad_norm": 1.6068364381790161, + "learning_rate": 9.784856410650951e-05, + "loss": 1.0205, + "step": 29350 + }, + { + "epoch": 0.1875726716328278, + "grad_norm": 0.7024542689323425, + "learning_rate": 9.784710782048636e-05, + "loss": 0.891, + "step": 29360 + }, + { + "epoch": 0.18763655878256646, + "grad_norm": 0.6852838397026062, + "learning_rate": 9.784565105260145e-05, + "loss": 0.7938, + "step": 29370 + }, + { + "epoch": 0.18770044593230517, + "grad_norm": 0.5752915740013123, + "learning_rate": 9.784419380286944e-05, + "loss": 0.9839, + "step": 29380 + }, + { + "epoch": 0.18776433308204388, + "grad_norm": 1.396058201789856, + "learning_rate": 9.784273607130501e-05, + "loss": 0.8067, + "step": 29390 + }, + { + "epoch": 0.18782822023178258, + "grad_norm": 0.9546979665756226, + "learning_rate": 9.784127785792283e-05, + "loss": 0.8, + "step": 29400 + }, + { + "epoch": 0.1878921073815213, + "grad_norm": 1.17519211769104, + "learning_rate": 9.783981916273758e-05, + "loss": 1.1313, + "step": 29410 + }, + { + "epoch": 0.18795599453126, + "grad_norm": 2.2271242141723633, + "learning_rate": 9.783835998576398e-05, + "loss": 0.8251, + "step": 29420 + }, + { + "epoch": 0.18801988168099867, + "grad_norm": 1.1907005310058594, + "learning_rate": 9.78369003270167e-05, + "loss": 0.7798, + "step": 29430 + }, + { + "epoch": 0.18808376883073738, + "grad_norm": 1.2218221426010132, + "learning_rate": 9.783544018651048e-05, + "loss": 0.9479, + "step": 29440 + }, + { + "epoch": 0.1881476559804761, + "grad_norm": 0.7123143076896667, + "learning_rate": 9.783397956425997e-05, + "loss": 0.8228, + "step": 29450 + }, + { + "epoch": 0.1882115431302148, + "grad_norm": 1.4676718711853027, + "learning_rate": 9.78325184602799e-05, + "loss": 0.9251, + "step": 29460 + }, + { + "epoch": 0.1882754302799535, + "grad_norm": 0.7313151359558105, + "learning_rate": 9.783105687458499e-05, + "loss": 0.9859, + "step": 29470 + }, + { + "epoch": 0.1883393174296922, + "grad_norm": 0.7722935080528259, + "learning_rate": 9.782959480718997e-05, + "loss": 0.7907, + "step": 29480 + }, + { + "epoch": 0.18840320457943088, + "grad_norm": 1.3157824277877808, + "learning_rate": 9.782813225810953e-05, + "loss": 1.0648, + "step": 29490 + }, + { + "epoch": 0.1884670917291696, + "grad_norm": 1.3221862316131592, + "learning_rate": 9.782666922735843e-05, + "loss": 0.7726, + "step": 29500 + }, + { + "epoch": 0.1885309788789083, + "grad_norm": 0.5356481671333313, + "learning_rate": 9.78252057149514e-05, + "loss": 0.8444, + "step": 29510 + }, + { + "epoch": 0.188594866028647, + "grad_norm": 1.3000450134277344, + "learning_rate": 9.782374172090318e-05, + "loss": 0.7855, + "step": 29520 + }, + { + "epoch": 0.1886587531783857, + "grad_norm": 0.7123465538024902, + "learning_rate": 9.78222772452285e-05, + "loss": 0.8986, + "step": 29530 + }, + { + "epoch": 0.1887226403281244, + "grad_norm": 0.8324477076530457, + 
"learning_rate": 9.78208122879421e-05, + "loss": 0.795, + "step": 29540 + }, + { + "epoch": 0.1887865274778631, + "grad_norm": 1.0922014713287354, + "learning_rate": 9.781934684905879e-05, + "loss": 0.8251, + "step": 29550 + }, + { + "epoch": 0.1888504146276018, + "grad_norm": 0.6796879768371582, + "learning_rate": 9.781788092859326e-05, + "loss": 0.8954, + "step": 29560 + }, + { + "epoch": 0.1889143017773405, + "grad_norm": 0.6543946862220764, + "learning_rate": 9.78164145265603e-05, + "loss": 0.9359, + "step": 29570 + }, + { + "epoch": 0.1889781889270792, + "grad_norm": 0.7796209454536438, + "learning_rate": 9.781494764297468e-05, + "loss": 0.7721, + "step": 29580 + }, + { + "epoch": 0.18904207607681792, + "grad_norm": 1.0429221391677856, + "learning_rate": 9.781348027785116e-05, + "loss": 1.3679, + "step": 29590 + }, + { + "epoch": 0.18910596322655662, + "grad_norm": 1.09304940700531, + "learning_rate": 9.781201243120455e-05, + "loss": 1.1277, + "step": 29600 + }, + { + "epoch": 0.1891698503762953, + "grad_norm": 0.9372734427452087, + "learning_rate": 9.781054410304959e-05, + "loss": 0.7567, + "step": 29610 + }, + { + "epoch": 0.189233737526034, + "grad_norm": 0.9679316282272339, + "learning_rate": 9.780907529340111e-05, + "loss": 0.8106, + "step": 29620 + }, + { + "epoch": 0.1892976246757727, + "grad_norm": 1.304903268814087, + "learning_rate": 9.780760600227388e-05, + "loss": 0.9488, + "step": 29630 + }, + { + "epoch": 0.18936151182551142, + "grad_norm": 1.0478878021240234, + "learning_rate": 9.780613622968269e-05, + "loss": 0.8575, + "step": 29640 + }, + { + "epoch": 0.18942539897525013, + "grad_norm": 1.2268606424331665, + "learning_rate": 9.780466597564235e-05, + "loss": 1.0457, + "step": 29650 + }, + { + "epoch": 0.18948928612498883, + "grad_norm": 0.8506630659103394, + "learning_rate": 9.780319524016767e-05, + "loss": 0.7606, + "step": 29660 + }, + { + "epoch": 0.1895531732747275, + "grad_norm": 1.1287379264831543, + "learning_rate": 9.780172402327346e-05, + "loss": 1.0102, + "step": 29670 + }, + { + "epoch": 0.18961706042446622, + "grad_norm": 0.9983859062194824, + "learning_rate": 9.780025232497452e-05, + "loss": 0.7572, + "step": 29680 + }, + { + "epoch": 0.18968094757420492, + "grad_norm": 0.6691607236862183, + "learning_rate": 9.77987801452857e-05, + "loss": 1.0665, + "step": 29690 + }, + { + "epoch": 0.18974483472394363, + "grad_norm": 1.1289949417114258, + "learning_rate": 9.779730748422181e-05, + "loss": 0.9657, + "step": 29700 + }, + { + "epoch": 0.18980872187368233, + "grad_norm": 0.8309307098388672, + "learning_rate": 9.779583434179769e-05, + "loss": 1.1482, + "step": 29710 + }, + { + "epoch": 0.18987260902342104, + "grad_norm": 0.9599489569664001, + "learning_rate": 9.779436071802815e-05, + "loss": 0.8744, + "step": 29720 + }, + { + "epoch": 0.18993649617315972, + "grad_norm": 1.1437122821807861, + "learning_rate": 9.779288661292807e-05, + "loss": 0.9947, + "step": 29730 + }, + { + "epoch": 0.19000038332289843, + "grad_norm": 0.6847367882728577, + "learning_rate": 9.779141202651225e-05, + "loss": 0.6783, + "step": 29740 + }, + { + "epoch": 0.19006427047263713, + "grad_norm": 0.7857696413993835, + "learning_rate": 9.778993695879559e-05, + "loss": 0.9785, + "step": 29750 + }, + { + "epoch": 0.19012815762237584, + "grad_norm": 0.6318495273590088, + "learning_rate": 9.778846140979292e-05, + "loss": 0.8373, + "step": 29760 + }, + { + "epoch": 0.19019204477211454, + "grad_norm": 1.0563832521438599, + "learning_rate": 9.778698537951908e-05, + "loss": 0.7032, + "step": 29770 
+ }, + { + "epoch": 0.19025593192185325, + "grad_norm": 0.8542010188102722, + "learning_rate": 9.778550886798898e-05, + "loss": 0.9274, + "step": 29780 + }, + { + "epoch": 0.19031981907159193, + "grad_norm": 0.6458016633987427, + "learning_rate": 9.778403187521746e-05, + "loss": 0.8418, + "step": 29790 + }, + { + "epoch": 0.19038370622133063, + "grad_norm": 1.4049769639968872, + "learning_rate": 9.778255440121937e-05, + "loss": 0.9105, + "step": 29800 + }, + { + "epoch": 0.19044759337106934, + "grad_norm": 1.6816697120666504, + "learning_rate": 9.778107644600964e-05, + "loss": 0.9616, + "step": 29810 + }, + { + "epoch": 0.19051148052080805, + "grad_norm": 0.8408365249633789, + "learning_rate": 9.777959800960314e-05, + "loss": 0.9771, + "step": 29820 + }, + { + "epoch": 0.19057536767054675, + "grad_norm": 0.9713007211685181, + "learning_rate": 9.777811909201476e-05, + "loss": 0.8812, + "step": 29830 + }, + { + "epoch": 0.19063925482028546, + "grad_norm": 0.5639253258705139, + "learning_rate": 9.777663969325938e-05, + "loss": 0.9724, + "step": 29840 + }, + { + "epoch": 0.19070314197002414, + "grad_norm": 1.0495178699493408, + "learning_rate": 9.77751598133519e-05, + "loss": 1.005, + "step": 29850 + }, + { + "epoch": 0.19076702911976284, + "grad_norm": 1.3950402736663818, + "learning_rate": 9.777367945230722e-05, + "loss": 0.6716, + "step": 29860 + }, + { + "epoch": 0.19083091626950155, + "grad_norm": 1.0976344347000122, + "learning_rate": 9.777219861014028e-05, + "loss": 0.7201, + "step": 29870 + }, + { + "epoch": 0.19089480341924026, + "grad_norm": 0.6188146471977234, + "learning_rate": 9.777071728686595e-05, + "loss": 0.8153, + "step": 29880 + }, + { + "epoch": 0.19095869056897896, + "grad_norm": 0.748587965965271, + "learning_rate": 9.776923548249919e-05, + "loss": 1.0403, + "step": 29890 + }, + { + "epoch": 0.19102257771871767, + "grad_norm": 0.6070273518562317, + "learning_rate": 9.776775319705488e-05, + "loss": 0.7215, + "step": 29900 + }, + { + "epoch": 0.19108646486845635, + "grad_norm": 1.0740474462509155, + "learning_rate": 9.776627043054799e-05, + "loss": 0.9513, + "step": 29910 + }, + { + "epoch": 0.19115035201819505, + "grad_norm": 0.5291925072669983, + "learning_rate": 9.776478718299343e-05, + "loss": 0.6963, + "step": 29920 + }, + { + "epoch": 0.19121423916793376, + "grad_norm": 1.0318714380264282, + "learning_rate": 9.776330345440613e-05, + "loss": 0.7995, + "step": 29930 + }, + { + "epoch": 0.19127812631767246, + "grad_norm": 0.8970870971679688, + "learning_rate": 9.776181924480105e-05, + "loss": 0.9622, + "step": 29940 + }, + { + "epoch": 0.19134201346741117, + "grad_norm": 1.631463885307312, + "learning_rate": 9.776033455419313e-05, + "loss": 0.7972, + "step": 29950 + }, + { + "epoch": 0.19140590061714988, + "grad_norm": 1.0540581941604614, + "learning_rate": 9.775884938259732e-05, + "loss": 1.0735, + "step": 29960 + }, + { + "epoch": 0.19146978776688856, + "grad_norm": 2.7128796577453613, + "learning_rate": 9.775736373002858e-05, + "loss": 0.7245, + "step": 29970 + }, + { + "epoch": 0.19153367491662726, + "grad_norm": 3.253152847290039, + "learning_rate": 9.775587759650186e-05, + "loss": 1.0705, + "step": 29980 + }, + { + "epoch": 0.19159756206636597, + "grad_norm": 0.7138085961341858, + "learning_rate": 9.775439098203216e-05, + "loss": 1.0778, + "step": 29990 + }, + { + "epoch": 0.19166144921610467, + "grad_norm": 1.333784580230713, + "learning_rate": 9.775290388663443e-05, + "loss": 0.7873, + "step": 30000 + }, + { + "epoch": 0.19172533636584338, + "grad_norm": 
1.4584836959838867, + "learning_rate": 9.775141631032362e-05, + "loss": 0.827, + "step": 30010 + }, + { + "epoch": 0.19178922351558209, + "grad_norm": 0.7264024019241333, + "learning_rate": 9.774992825311476e-05, + "loss": 0.7283, + "step": 30020 + }, + { + "epoch": 0.19185311066532076, + "grad_norm": 0.9471032619476318, + "learning_rate": 9.774843971502282e-05, + "loss": 0.8963, + "step": 30030 + }, + { + "epoch": 0.19191699781505947, + "grad_norm": 0.9348069429397583, + "learning_rate": 9.774695069606275e-05, + "loss": 0.9005, + "step": 30040 + }, + { + "epoch": 0.19198088496479818, + "grad_norm": 0.7033948302268982, + "learning_rate": 9.774546119624961e-05, + "loss": 0.7593, + "step": 30050 + }, + { + "epoch": 0.19204477211453688, + "grad_norm": 0.7773811221122742, + "learning_rate": 9.774397121559836e-05, + "loss": 0.9053, + "step": 30060 + }, + { + "epoch": 0.1921086592642756, + "grad_norm": 1.9900609254837036, + "learning_rate": 9.7742480754124e-05, + "loss": 0.9314, + "step": 30070 + }, + { + "epoch": 0.1921725464140143, + "grad_norm": 0.6554052233695984, + "learning_rate": 9.774098981184158e-05, + "loss": 0.6362, + "step": 30080 + }, + { + "epoch": 0.19223643356375297, + "grad_norm": 1.0862607955932617, + "learning_rate": 9.773949838876608e-05, + "loss": 0.7648, + "step": 30090 + }, + { + "epoch": 0.19230032071349168, + "grad_norm": 0.7586400508880615, + "learning_rate": 9.773800648491252e-05, + "loss": 0.769, + "step": 30100 + }, + { + "epoch": 0.19236420786323039, + "grad_norm": 0.8479837775230408, + "learning_rate": 9.773651410029594e-05, + "loss": 0.802, + "step": 30110 + }, + { + "epoch": 0.1924280950129691, + "grad_norm": 0.5918093323707581, + "learning_rate": 9.773502123493139e-05, + "loss": 0.7993, + "step": 30120 + }, + { + "epoch": 0.1924919821627078, + "grad_norm": 1.4376020431518555, + "learning_rate": 9.773352788883385e-05, + "loss": 0.9593, + "step": 30130 + }, + { + "epoch": 0.1925558693124465, + "grad_norm": 0.9727760553359985, + "learning_rate": 9.77320340620184e-05, + "loss": 0.7962, + "step": 30140 + }, + { + "epoch": 0.1926197564621852, + "grad_norm": 0.9183517098426819, + "learning_rate": 9.773053975450009e-05, + "loss": 0.755, + "step": 30150 + }, + { + "epoch": 0.1926836436119239, + "grad_norm": 1.3329063653945923, + "learning_rate": 9.772904496629391e-05, + "loss": 0.747, + "step": 30160 + }, + { + "epoch": 0.1927475307616626, + "grad_norm": 0.7893358469009399, + "learning_rate": 9.7727549697415e-05, + "loss": 0.9337, + "step": 30170 + }, + { + "epoch": 0.1928114179114013, + "grad_norm": 1.3940712213516235, + "learning_rate": 9.772605394787834e-05, + "loss": 0.924, + "step": 30180 + }, + { + "epoch": 0.19287530506114, + "grad_norm": 1.1371750831604004, + "learning_rate": 9.772455771769905e-05, + "loss": 0.7126, + "step": 30190 + }, + { + "epoch": 0.1929391922108787, + "grad_norm": 0.8628626465797424, + "learning_rate": 9.772306100689216e-05, + "loss": 0.965, + "step": 30200 + }, + { + "epoch": 0.19300307936061742, + "grad_norm": 0.5869954228401184, + "learning_rate": 9.772156381547277e-05, + "loss": 0.7079, + "step": 30210 + }, + { + "epoch": 0.1930669665103561, + "grad_norm": 0.6862210035324097, + "learning_rate": 9.772006614345594e-05, + "loss": 0.8432, + "step": 30220 + }, + { + "epoch": 0.1931308536600948, + "grad_norm": 0.9875562191009521, + "learning_rate": 9.771856799085678e-05, + "loss": 1.3028, + "step": 30230 + }, + { + "epoch": 0.1931947408098335, + "grad_norm": 1.2262318134307861, + "learning_rate": 9.771706935769034e-05, + "loss": 0.9413, + 
"step": 30240 + }, + { + "epoch": 0.19325862795957222, + "grad_norm": 1.6821092367172241, + "learning_rate": 9.771557024397173e-05, + "loss": 0.8401, + "step": 30250 + }, + { + "epoch": 0.19332251510931092, + "grad_norm": 0.5990639925003052, + "learning_rate": 9.771407064971605e-05, + "loss": 0.918, + "step": 30260 + }, + { + "epoch": 0.19338640225904963, + "grad_norm": 0.7293832898139954, + "learning_rate": 9.771257057493841e-05, + "loss": 0.8454, + "step": 30270 + }, + { + "epoch": 0.1934502894087883, + "grad_norm": 0.7124828100204468, + "learning_rate": 9.77110700196539e-05, + "loss": 0.7466, + "step": 30280 + }, + { + "epoch": 0.193514176558527, + "grad_norm": 0.7515029311180115, + "learning_rate": 9.770956898387764e-05, + "loss": 0.8641, + "step": 30290 + }, + { + "epoch": 0.19357806370826572, + "grad_norm": 0.7060081958770752, + "learning_rate": 9.770806746762473e-05, + "loss": 0.8651, + "step": 30300 + }, + { + "epoch": 0.19364195085800442, + "grad_norm": 0.7407328486442566, + "learning_rate": 9.770656547091033e-05, + "loss": 1.1405, + "step": 30310 + }, + { + "epoch": 0.19370583800774313, + "grad_norm": 1.009606122970581, + "learning_rate": 9.770506299374953e-05, + "loss": 0.9224, + "step": 30320 + }, + { + "epoch": 0.19376972515748184, + "grad_norm": 1.1087229251861572, + "learning_rate": 9.770356003615749e-05, + "loss": 0.9545, + "step": 30330 + }, + { + "epoch": 0.19383361230722052, + "grad_norm": 0.6406879425048828, + "learning_rate": 9.770205659814931e-05, + "loss": 0.9398, + "step": 30340 + }, + { + "epoch": 0.19389749945695922, + "grad_norm": 0.5704166889190674, + "learning_rate": 9.770055267974017e-05, + "loss": 0.6516, + "step": 30350 + }, + { + "epoch": 0.19396138660669793, + "grad_norm": 0.5956087112426758, + "learning_rate": 9.769904828094519e-05, + "loss": 1.0608, + "step": 30360 + }, + { + "epoch": 0.19402527375643663, + "grad_norm": 1.1136138439178467, + "learning_rate": 9.769754340177953e-05, + "loss": 0.7172, + "step": 30370 + }, + { + "epoch": 0.19408916090617534, + "grad_norm": 0.4953550696372986, + "learning_rate": 9.769603804225833e-05, + "loss": 0.9855, + "step": 30380 + }, + { + "epoch": 0.19415304805591405, + "grad_norm": 1.3780313730239868, + "learning_rate": 9.769453220239677e-05, + "loss": 0.8654, + "step": 30390 + }, + { + "epoch": 0.19421693520565272, + "grad_norm": 1.0662996768951416, + "learning_rate": 9.769302588221002e-05, + "loss": 1.1878, + "step": 30400 + }, + { + "epoch": 0.19428082235539143, + "grad_norm": 0.896293044090271, + "learning_rate": 9.769151908171324e-05, + "loss": 0.9222, + "step": 30410 + }, + { + "epoch": 0.19434470950513014, + "grad_norm": 1.046999454498291, + "learning_rate": 9.769001180092159e-05, + "loss": 0.9972, + "step": 30420 + }, + { + "epoch": 0.19440859665486884, + "grad_norm": 0.9748583436012268, + "learning_rate": 9.768850403985028e-05, + "loss": 0.7333, + "step": 30430 + }, + { + "epoch": 0.19447248380460755, + "grad_norm": 1.3169922828674316, + "learning_rate": 9.768699579851446e-05, + "loss": 0.7077, + "step": 30440 + }, + { + "epoch": 0.19453637095434625, + "grad_norm": 0.8591229319572449, + "learning_rate": 9.768548707692935e-05, + "loss": 0.7176, + "step": 30450 + }, + { + "epoch": 0.19460025810408493, + "grad_norm": 1.1447664499282837, + "learning_rate": 9.768397787511012e-05, + "loss": 0.7956, + "step": 30460 + }, + { + "epoch": 0.19466414525382364, + "grad_norm": 0.8214355111122131, + "learning_rate": 9.768246819307199e-05, + "loss": 0.9318, + "step": 30470 + }, + { + "epoch": 0.19472803240356235, + 
"grad_norm": 0.6454271078109741, + "learning_rate": 9.768095803083015e-05, + "loss": 0.9187, + "step": 30480 + }, + { + "epoch": 0.19479191955330105, + "grad_norm": 0.8612026572227478, + "learning_rate": 9.767944738839983e-05, + "loss": 0.8895, + "step": 30490 + }, + { + "epoch": 0.19485580670303976, + "grad_norm": 0.7116665244102478, + "learning_rate": 9.76779362657962e-05, + "loss": 0.9473, + "step": 30500 + }, + { + "epoch": 0.19491969385277846, + "grad_norm": 0.4623630940914154, + "learning_rate": 9.767642466303452e-05, + "loss": 1.0248, + "step": 30510 + }, + { + "epoch": 0.19498358100251714, + "grad_norm": 0.7994482517242432, + "learning_rate": 9.767491258013e-05, + "loss": 1.0697, + "step": 30520 + }, + { + "epoch": 0.19504746815225585, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.767340001709785e-05, + "loss": 1.0001, + "step": 30530 + }, + { + "epoch": 0.19511135530199455, + "grad_norm": 0.8972348570823669, + "learning_rate": 9.767188697395333e-05, + "loss": 0.9495, + "step": 30540 + }, + { + "epoch": 0.19517524245173326, + "grad_norm": 0.664193868637085, + "learning_rate": 9.767037345071166e-05, + "loss": 0.9913, + "step": 30550 + }, + { + "epoch": 0.19523912960147197, + "grad_norm": 1.2621389627456665, + "learning_rate": 9.766885944738808e-05, + "loss": 0.7485, + "step": 30560 + }, + { + "epoch": 0.19530301675121067, + "grad_norm": 0.7015652656555176, + "learning_rate": 9.766734496399786e-05, + "loss": 0.7023, + "step": 30570 + }, + { + "epoch": 0.19536690390094935, + "grad_norm": 1.066769003868103, + "learning_rate": 9.766583000055625e-05, + "loss": 1.0337, + "step": 30580 + }, + { + "epoch": 0.19543079105068806, + "grad_norm": 1.1455520391464233, + "learning_rate": 9.766431455707847e-05, + "loss": 1.0366, + "step": 30590 + }, + { + "epoch": 0.19549467820042676, + "grad_norm": 1.4529062509536743, + "learning_rate": 9.766279863357982e-05, + "loss": 0.9134, + "step": 30600 + }, + { + "epoch": 0.19555856535016547, + "grad_norm": 0.7042234539985657, + "learning_rate": 9.766128223007556e-05, + "loss": 1.032, + "step": 30610 + }, + { + "epoch": 0.19562245249990418, + "grad_norm": 0.7277450561523438, + "learning_rate": 9.765991705652953e-05, + "loss": 0.9731, + "step": 30620 + }, + { + "epoch": 0.19568633964964288, + "grad_norm": 0.7915880084037781, + "learning_rate": 9.765839974105665e-05, + "loss": 1.0449, + "step": 30630 + }, + { + "epoch": 0.19575022679938156, + "grad_norm": 1.1217659711837769, + "learning_rate": 9.765688194562249e-05, + "loss": 0.816, + "step": 30640 + }, + { + "epoch": 0.19581411394912027, + "grad_norm": 0.7037495374679565, + "learning_rate": 9.765536367024229e-05, + "loss": 0.9901, + "step": 30650 + }, + { + "epoch": 0.19587800109885897, + "grad_norm": 0.8996081352233887, + "learning_rate": 9.765384491493132e-05, + "loss": 0.9512, + "step": 30660 + }, + { + "epoch": 0.19594188824859768, + "grad_norm": 0.543251633644104, + "learning_rate": 9.765232567970493e-05, + "loss": 0.8288, + "step": 30670 + }, + { + "epoch": 0.19600577539833638, + "grad_norm": 0.7527588605880737, + "learning_rate": 9.76508059645784e-05, + "loss": 0.9027, + "step": 30680 + }, + { + "epoch": 0.1960696625480751, + "grad_norm": 0.8170384764671326, + "learning_rate": 9.764928576956703e-05, + "loss": 0.8716, + "step": 30690 + }, + { + "epoch": 0.19613354969781377, + "grad_norm": 0.8016200661659241, + "learning_rate": 9.764776509468611e-05, + "loss": 0.9099, + "step": 30700 + }, + { + "epoch": 0.19619743684755248, + "grad_norm": 1.191615343093872, + "learning_rate": 
9.764624393995098e-05, + "loss": 0.9785, + "step": 30710 + }, + { + "epoch": 0.19626132399729118, + "grad_norm": 1.0004390478134155, + "learning_rate": 9.764472230537697e-05, + "loss": 1.06, + "step": 30720 + }, + { + "epoch": 0.1963252111470299, + "grad_norm": 0.5032203197479248, + "learning_rate": 9.764320019097938e-05, + "loss": 0.7955, + "step": 30730 + }, + { + "epoch": 0.1963890982967686, + "grad_norm": 1.1866439580917358, + "learning_rate": 9.764167759677354e-05, + "loss": 0.7862, + "step": 30740 + }, + { + "epoch": 0.1964529854465073, + "grad_norm": 0.934973955154419, + "learning_rate": 9.764015452277479e-05, + "loss": 0.9502, + "step": 30750 + }, + { + "epoch": 0.19651687259624598, + "grad_norm": 1.0195708274841309, + "learning_rate": 9.763863096899847e-05, + "loss": 1.0983, + "step": 30760 + }, + { + "epoch": 0.19658075974598468, + "grad_norm": 0.8169684410095215, + "learning_rate": 9.763710693545993e-05, + "loss": 0.7333, + "step": 30770 + }, + { + "epoch": 0.1966446468957234, + "grad_norm": 1.0230990648269653, + "learning_rate": 9.763558242217452e-05, + "loss": 1.1088, + "step": 30780 + }, + { + "epoch": 0.1967085340454621, + "grad_norm": 1.095651388168335, + "learning_rate": 9.763405742915756e-05, + "loss": 0.8304, + "step": 30790 + }, + { + "epoch": 0.1967724211952008, + "grad_norm": 0.717144787311554, + "learning_rate": 9.763253195642446e-05, + "loss": 0.9346, + "step": 30800 + }, + { + "epoch": 0.1968363083449395, + "grad_norm": 3.6631853580474854, + "learning_rate": 9.763100600399053e-05, + "loss": 1.0039, + "step": 30810 + }, + { + "epoch": 0.1969001954946782, + "grad_norm": 0.7753827571868896, + "learning_rate": 9.762947957187117e-05, + "loss": 0.986, + "step": 30820 + }, + { + "epoch": 0.1969640826444169, + "grad_norm": 1.0646581649780273, + "learning_rate": 9.762795266008175e-05, + "loss": 0.743, + "step": 30830 + }, + { + "epoch": 0.1970279697941556, + "grad_norm": 0.9290790557861328, + "learning_rate": 9.762642526863765e-05, + "loss": 0.9802, + "step": 30840 + }, + { + "epoch": 0.1970918569438943, + "grad_norm": 1.0001217126846313, + "learning_rate": 9.762489739755423e-05, + "loss": 0.8519, + "step": 30850 + }, + { + "epoch": 0.197155744093633, + "grad_norm": 0.9493054151535034, + "learning_rate": 9.76233690468469e-05, + "loss": 0.844, + "step": 30860 + }, + { + "epoch": 0.19721963124337172, + "grad_norm": 0.775419294834137, + "learning_rate": 9.762184021653104e-05, + "loss": 0.6618, + "step": 30870 + }, + { + "epoch": 0.1972835183931104, + "grad_norm": 0.6491733193397522, + "learning_rate": 9.762031090662205e-05, + "loss": 0.8618, + "step": 30880 + }, + { + "epoch": 0.1973474055428491, + "grad_norm": 0.7320391535758972, + "learning_rate": 9.761878111713534e-05, + "loss": 1.1604, + "step": 30890 + }, + { + "epoch": 0.1974112926925878, + "grad_norm": 0.6711703538894653, + "learning_rate": 9.761725084808629e-05, + "loss": 0.9965, + "step": 30900 + }, + { + "epoch": 0.19747517984232651, + "grad_norm": 0.5033368468284607, + "learning_rate": 9.761572009949035e-05, + "loss": 1.0613, + "step": 30910 + }, + { + "epoch": 0.19753906699206522, + "grad_norm": 0.8021765947341919, + "learning_rate": 9.76141888713629e-05, + "loss": 0.8296, + "step": 30920 + }, + { + "epoch": 0.19760295414180393, + "grad_norm": 0.6978395581245422, + "learning_rate": 9.761265716371938e-05, + "loss": 0.8845, + "step": 30930 + }, + { + "epoch": 0.1976668412915426, + "grad_norm": 0.741265594959259, + "learning_rate": 9.761112497657522e-05, + "loss": 0.8021, + "step": 30940 + }, + { + "epoch": 
0.1977307284412813, + "grad_norm": 0.6882484555244446, + "learning_rate": 9.760959230994583e-05, + "loss": 0.7808, + "step": 30950 + }, + { + "epoch": 0.19779461559102002, + "grad_norm": 1.2899192571640015, + "learning_rate": 9.760805916384666e-05, + "loss": 0.9258, + "step": 30960 + }, + { + "epoch": 0.19785850274075872, + "grad_norm": 0.7548243999481201, + "learning_rate": 9.760652553829314e-05, + "loss": 0.749, + "step": 30970 + }, + { + "epoch": 0.19792238989049743, + "grad_norm": 0.8977358341217041, + "learning_rate": 9.760499143330075e-05, + "loss": 0.8827, + "step": 30980 + }, + { + "epoch": 0.19798627704023614, + "grad_norm": 0.8583622574806213, + "learning_rate": 9.760345684888489e-05, + "loss": 0.7604, + "step": 30990 + }, + { + "epoch": 0.19805016418997484, + "grad_norm": 0.930568516254425, + "learning_rate": 9.760192178506104e-05, + "loss": 0.8838, + "step": 31000 + }, + { + "epoch": 0.19811405133971352, + "grad_norm": 0.7296523451805115, + "learning_rate": 9.760038624184466e-05, + "loss": 0.997, + "step": 31010 + }, + { + "epoch": 0.19817793848945223, + "grad_norm": 0.5813782215118408, + "learning_rate": 9.75988502192512e-05, + "loss": 0.873, + "step": 31020 + }, + { + "epoch": 0.19824182563919093, + "grad_norm": 1.0174976587295532, + "learning_rate": 9.759731371729614e-05, + "loss": 0.9311, + "step": 31030 + }, + { + "epoch": 0.19830571278892964, + "grad_norm": 0.6261200904846191, + "learning_rate": 9.759577673599497e-05, + "loss": 0.8865, + "step": 31040 + }, + { + "epoch": 0.19836959993866835, + "grad_norm": 0.5916396975517273, + "learning_rate": 9.759423927536316e-05, + "loss": 0.7946, + "step": 31050 + }, + { + "epoch": 0.19843348708840705, + "grad_norm": 1.060449242591858, + "learning_rate": 9.759270133541616e-05, + "loss": 1.2101, + "step": 31060 + }, + { + "epoch": 0.19849737423814573, + "grad_norm": 0.5522297620773315, + "learning_rate": 9.759116291616948e-05, + "loss": 0.86, + "step": 31070 + }, + { + "epoch": 0.19856126138788444, + "grad_norm": 1.0017218589782715, + "learning_rate": 9.758962401763863e-05, + "loss": 0.8776, + "step": 31080 + }, + { + "epoch": 0.19862514853762314, + "grad_norm": 0.8278487920761108, + "learning_rate": 9.758808463983911e-05, + "loss": 1.0988, + "step": 31090 + }, + { + "epoch": 0.19868903568736185, + "grad_norm": 0.8800287246704102, + "learning_rate": 9.758654478278638e-05, + "loss": 0.8976, + "step": 31100 + }, + { + "epoch": 0.19875292283710055, + "grad_norm": 0.7034065127372742, + "learning_rate": 9.758500444649598e-05, + "loss": 1.0156, + "step": 31110 + }, + { + "epoch": 0.19881680998683926, + "grad_norm": 1.02751886844635, + "learning_rate": 9.758346363098344e-05, + "loss": 0.9064, + "step": 31120 + }, + { + "epoch": 0.19888069713657794, + "grad_norm": 0.8063342571258545, + "learning_rate": 9.758192233626425e-05, + "loss": 0.9177, + "step": 31130 + }, + { + "epoch": 0.19894458428631664, + "grad_norm": 2.721904754638672, + "learning_rate": 9.758038056235393e-05, + "loss": 0.9505, + "step": 31140 + }, + { + "epoch": 0.19900847143605535, + "grad_norm": 1.0083937644958496, + "learning_rate": 9.757883830926801e-05, + "loss": 1.1092, + "step": 31150 + }, + { + "epoch": 0.19907235858579406, + "grad_norm": 0.841985821723938, + "learning_rate": 9.757729557702202e-05, + "loss": 0.7708, + "step": 31160 + }, + { + "epoch": 0.19913624573553276, + "grad_norm": 0.6755800843238831, + "learning_rate": 9.757575236563152e-05, + "loss": 0.7743, + "step": 31170 + }, + { + "epoch": 0.19920013288527147, + "grad_norm": 0.7885231971740723, + 
"learning_rate": 9.757420867511202e-05, + "loss": 0.9718, + "step": 31180 + }, + { + "epoch": 0.19926402003501015, + "grad_norm": 1.2030565738677979, + "learning_rate": 9.75726645054791e-05, + "loss": 0.9473, + "step": 31190 + }, + { + "epoch": 0.19932790718474885, + "grad_norm": 1.3136283159255981, + "learning_rate": 9.757111985674828e-05, + "loss": 1.0381, + "step": 31200 + }, + { + "epoch": 0.19939179433448756, + "grad_norm": 0.6797472834587097, + "learning_rate": 9.756957472893513e-05, + "loss": 1.0419, + "step": 31210 + }, + { + "epoch": 0.19945568148422627, + "grad_norm": 0.7219412922859192, + "learning_rate": 9.756802912205522e-05, + "loss": 1.0792, + "step": 31220 + }, + { + "epoch": 0.19951956863396497, + "grad_norm": 0.8325220346450806, + "learning_rate": 9.756648303612409e-05, + "loss": 0.7956, + "step": 31230 + }, + { + "epoch": 0.19958345578370368, + "grad_norm": 0.9289294481277466, + "learning_rate": 9.756493647115734e-05, + "loss": 0.7096, + "step": 31240 + }, + { + "epoch": 0.19964734293344236, + "grad_norm": 0.908420205116272, + "learning_rate": 9.756338942717051e-05, + "loss": 0.8602, + "step": 31250 + }, + { + "epoch": 0.19971123008318106, + "grad_norm": 2.6106882095336914, + "learning_rate": 9.756184190417921e-05, + "loss": 0.9356, + "step": 31260 + }, + { + "epoch": 0.19977511723291977, + "grad_norm": 0.8880581259727478, + "learning_rate": 9.756029390219901e-05, + "loss": 0.7862, + "step": 31270 + }, + { + "epoch": 0.19983900438265847, + "grad_norm": 0.6896887421607971, + "learning_rate": 9.755874542124551e-05, + "loss": 0.9089, + "step": 31280 + }, + { + "epoch": 0.19990289153239718, + "grad_norm": 1.0063308477401733, + "learning_rate": 9.75571964613343e-05, + "loss": 1.0273, + "step": 31290 + }, + { + "epoch": 0.1999667786821359, + "grad_norm": 1.277763843536377, + "learning_rate": 9.755564702248099e-05, + "loss": 1.0963, + "step": 31300 + }, + { + "epoch": 0.20003066583187457, + "grad_norm": 0.6711148023605347, + "learning_rate": 9.755409710470116e-05, + "loss": 1.016, + "step": 31310 + }, + { + "epoch": 0.20009455298161327, + "grad_norm": 1.1493245363235474, + "learning_rate": 9.755254670801042e-05, + "loss": 0.6895, + "step": 31320 + }, + { + "epoch": 0.20015844013135198, + "grad_norm": 0.5734307765960693, + "learning_rate": 9.755099583242442e-05, + "loss": 0.9138, + "step": 31330 + }, + { + "epoch": 0.20022232728109068, + "grad_norm": 0.6204320788383484, + "learning_rate": 9.754944447795874e-05, + "loss": 0.8158, + "step": 31340 + }, + { + "epoch": 0.2002862144308294, + "grad_norm": 1.0882511138916016, + "learning_rate": 9.754789264462902e-05, + "loss": 0.7919, + "step": 31350 + }, + { + "epoch": 0.2003501015805681, + "grad_norm": 1.2842504978179932, + "learning_rate": 9.754634033245089e-05, + "loss": 1.1121, + "step": 31360 + }, + { + "epoch": 0.20041398873030677, + "grad_norm": 0.7042865753173828, + "learning_rate": 9.754478754143998e-05, + "loss": 0.8598, + "step": 31370 + }, + { + "epoch": 0.20047787588004548, + "grad_norm": 0.7466055154800415, + "learning_rate": 9.754323427161191e-05, + "loss": 0.7496, + "step": 31380 + }, + { + "epoch": 0.2005417630297842, + "grad_norm": 1.2161649465560913, + "learning_rate": 9.754168052298237e-05, + "loss": 0.9603, + "step": 31390 + }, + { + "epoch": 0.2006056501795229, + "grad_norm": 0.871167778968811, + "learning_rate": 9.754012629556696e-05, + "loss": 1.0315, + "step": 31400 + }, + { + "epoch": 0.2006695373292616, + "grad_norm": 0.853158175945282, + "learning_rate": 9.753857158938135e-05, + "loss": 0.9701, + "step": 
31410 + }, + { + "epoch": 0.2007334244790003, + "grad_norm": 0.5134825706481934, + "learning_rate": 9.753701640444121e-05, + "loss": 0.9838, + "step": 31420 + }, + { + "epoch": 0.20079731162873898, + "grad_norm": 0.7412970662117004, + "learning_rate": 9.753546074076217e-05, + "loss": 0.802, + "step": 31430 + }, + { + "epoch": 0.2008611987784777, + "grad_norm": 1.8500874042510986, + "learning_rate": 9.753390459835993e-05, + "loss": 0.711, + "step": 31440 + }, + { + "epoch": 0.2009250859282164, + "grad_norm": 0.685453474521637, + "learning_rate": 9.753234797725015e-05, + "loss": 0.9091, + "step": 31450 + }, + { + "epoch": 0.2009889730779551, + "grad_norm": 1.9982002973556519, + "learning_rate": 9.75307908774485e-05, + "loss": 0.8882, + "step": 31460 + }, + { + "epoch": 0.2010528602276938, + "grad_norm": 1.9767764806747437, + "learning_rate": 9.752923329897066e-05, + "loss": 0.8807, + "step": 31470 + }, + { + "epoch": 0.20111674737743251, + "grad_norm": 0.6557339429855347, + "learning_rate": 9.752767524183233e-05, + "loss": 0.8447, + "step": 31480 + }, + { + "epoch": 0.2011806345271712, + "grad_norm": 0.6405972838401794, + "learning_rate": 9.752611670604919e-05, + "loss": 0.8889, + "step": 31490 + }, + { + "epoch": 0.2012445216769099, + "grad_norm": 0.8593305349349976, + "learning_rate": 9.752455769163693e-05, + "loss": 1.1378, + "step": 31500 + }, + { + "epoch": 0.2013084088266486, + "grad_norm": 0.6940191984176636, + "learning_rate": 9.752299819861127e-05, + "loss": 0.9958, + "step": 31510 + }, + { + "epoch": 0.2013722959763873, + "grad_norm": 0.8981072306632996, + "learning_rate": 9.752143822698789e-05, + "loss": 0.8305, + "step": 31520 + }, + { + "epoch": 0.20143618312612602, + "grad_norm": 1.0248847007751465, + "learning_rate": 9.751987777678253e-05, + "loss": 0.91, + "step": 31530 + }, + { + "epoch": 0.20150007027586472, + "grad_norm": 0.8903045654296875, + "learning_rate": 9.751831684801089e-05, + "loss": 0.8491, + "step": 31540 + }, + { + "epoch": 0.2015639574256034, + "grad_norm": 1.1542670726776123, + "learning_rate": 9.75167554406887e-05, + "loss": 0.8601, + "step": 31550 + }, + { + "epoch": 0.2016278445753421, + "grad_norm": 0.7678368091583252, + "learning_rate": 9.751519355483166e-05, + "loss": 0.8247, + "step": 31560 + }, + { + "epoch": 0.20169173172508081, + "grad_norm": 0.9471594095230103, + "learning_rate": 9.75136311904555e-05, + "loss": 0.9038, + "step": 31570 + }, + { + "epoch": 0.20175561887481952, + "grad_norm": 0.8465635180473328, + "learning_rate": 9.7512068347576e-05, + "loss": 0.788, + "step": 31580 + }, + { + "epoch": 0.20181950602455823, + "grad_norm": 0.9624682664871216, + "learning_rate": 9.751050502620885e-05, + "loss": 1.0697, + "step": 31590 + }, + { + "epoch": 0.20188339317429693, + "grad_norm": 0.569759726524353, + "learning_rate": 9.750894122636982e-05, + "loss": 1.0777, + "step": 31600 + }, + { + "epoch": 0.2019472803240356, + "grad_norm": 3.1683318614959717, + "learning_rate": 9.750737694807464e-05, + "loss": 0.9169, + "step": 31610 + }, + { + "epoch": 0.20201116747377432, + "grad_norm": 0.7441072463989258, + "learning_rate": 9.75058121913391e-05, + "loss": 0.771, + "step": 31620 + }, + { + "epoch": 0.20207505462351302, + "grad_norm": 1.1185020208358765, + "learning_rate": 9.75042469561789e-05, + "loss": 0.8128, + "step": 31630 + }, + { + "epoch": 0.20213894177325173, + "grad_norm": 0.7714232206344604, + "learning_rate": 9.750268124260987e-05, + "loss": 0.8612, + "step": 31640 + }, + { + "epoch": 0.20220282892299044, + "grad_norm": 0.6368833184242249, + 
"learning_rate": 9.75011150506477e-05, + "loss": 1.076, + "step": 31650 + }, + { + "epoch": 0.20226671607272914, + "grad_norm": 1.164900779724121, + "learning_rate": 9.749954838030824e-05, + "loss": 0.9611, + "step": 31660 + }, + { + "epoch": 0.20233060322246782, + "grad_norm": 0.66245436668396, + "learning_rate": 9.749798123160723e-05, + "loss": 0.8932, + "step": 31670 + }, + { + "epoch": 0.20239449037220653, + "grad_norm": 0.7968323826789856, + "learning_rate": 9.749641360456045e-05, + "loss": 1.0375, + "step": 31680 + }, + { + "epoch": 0.20245837752194523, + "grad_norm": 1.1304694414138794, + "learning_rate": 9.749484549918371e-05, + "loss": 0.843, + "step": 31690 + }, + { + "epoch": 0.20252226467168394, + "grad_norm": 1.4667329788208008, + "learning_rate": 9.749327691549277e-05, + "loss": 0.8078, + "step": 31700 + }, + { + "epoch": 0.20258615182142264, + "grad_norm": 0.8826027512550354, + "learning_rate": 9.749170785350344e-05, + "loss": 1.0263, + "step": 31710 + }, + { + "epoch": 0.20265003897116135, + "grad_norm": 0.7443497180938721, + "learning_rate": 9.749013831323154e-05, + "loss": 0.7889, + "step": 31720 + }, + { + "epoch": 0.20271392612090003, + "grad_norm": 0.5433924198150635, + "learning_rate": 9.748856829469287e-05, + "loss": 0.9073, + "step": 31730 + }, + { + "epoch": 0.20277781327063873, + "grad_norm": 0.5322934985160828, + "learning_rate": 9.74869977979032e-05, + "loss": 0.7924, + "step": 31740 + }, + { + "epoch": 0.20284170042037744, + "grad_norm": 1.0181642770767212, + "learning_rate": 9.748542682287841e-05, + "loss": 0.7738, + "step": 31750 + }, + { + "epoch": 0.20290558757011615, + "grad_norm": 0.8533402681350708, + "learning_rate": 9.74838553696343e-05, + "loss": 1.1269, + "step": 31760 + }, + { + "epoch": 0.20296947471985485, + "grad_norm": 0.6573584079742432, + "learning_rate": 9.748228343818666e-05, + "loss": 0.9684, + "step": 31770 + }, + { + "epoch": 0.20303336186959356, + "grad_norm": 1.141799807548523, + "learning_rate": 9.748071102855135e-05, + "loss": 1.0159, + "step": 31780 + }, + { + "epoch": 0.20309724901933224, + "grad_norm": 2.4994301795959473, + "learning_rate": 9.747913814074421e-05, + "loss": 0.7334, + "step": 31790 + }, + { + "epoch": 0.20316113616907094, + "grad_norm": 1.0525953769683838, + "learning_rate": 9.747756477478108e-05, + "loss": 0.9094, + "step": 31800 + }, + { + "epoch": 0.20322502331880965, + "grad_norm": 0.6493773460388184, + "learning_rate": 9.747599093067779e-05, + "loss": 0.7929, + "step": 31810 + }, + { + "epoch": 0.20328891046854836, + "grad_norm": 1.622753381729126, + "learning_rate": 9.747441660845021e-05, + "loss": 1.3227, + "step": 31820 + }, + { + "epoch": 0.20335279761828706, + "grad_norm": 0.932706356048584, + "learning_rate": 9.747284180811417e-05, + "loss": 1.2109, + "step": 31830 + }, + { + "epoch": 0.20341668476802577, + "grad_norm": 0.706366240978241, + "learning_rate": 9.747126652968554e-05, + "loss": 1.0372, + "step": 31840 + }, + { + "epoch": 0.20348057191776447, + "grad_norm": 0.5680680871009827, + "learning_rate": 9.74696907731802e-05, + "loss": 0.8115, + "step": 31850 + }, + { + "epoch": 0.20354445906750315, + "grad_norm": 0.7886488437652588, + "learning_rate": 9.7468114538614e-05, + "loss": 0.8516, + "step": 31860 + }, + { + "epoch": 0.20360834621724186, + "grad_norm": 1.467068076133728, + "learning_rate": 9.746653782600284e-05, + "loss": 0.9352, + "step": 31870 + }, + { + "epoch": 0.20367223336698057, + "grad_norm": 0.6054574251174927, + "learning_rate": 9.746496063536254e-05, + "loss": 0.738, + "step": 31880 + 
}, + { + "epoch": 0.20373612051671927, + "grad_norm": 0.8049781322479248, + "learning_rate": 9.746338296670906e-05, + "loss": 0.9212, + "step": 31890 + }, + { + "epoch": 0.20380000766645798, + "grad_norm": 2.8067591190338135, + "learning_rate": 9.746180482005825e-05, + "loss": 1.1401, + "step": 31900 + }, + { + "epoch": 0.20386389481619668, + "grad_norm": 0.5604707598686218, + "learning_rate": 9.746022619542599e-05, + "loss": 0.7448, + "step": 31910 + }, + { + "epoch": 0.20392778196593536, + "grad_norm": 0.6594801545143127, + "learning_rate": 9.745864709282819e-05, + "loss": 1.0038, + "step": 31920 + }, + { + "epoch": 0.20399166911567407, + "grad_norm": 0.7814098000526428, + "learning_rate": 9.745706751228076e-05, + "loss": 1.0487, + "step": 31930 + }, + { + "epoch": 0.20405555626541277, + "grad_norm": 0.892376184463501, + "learning_rate": 9.745548745379961e-05, + "loss": 1.006, + "step": 31940 + }, + { + "epoch": 0.20411944341515148, + "grad_norm": 0.4663401246070862, + "learning_rate": 9.745390691740064e-05, + "loss": 0.9555, + "step": 31950 + }, + { + "epoch": 0.2041833305648902, + "grad_norm": 0.9874062538146973, + "learning_rate": 9.745232590309978e-05, + "loss": 1.0092, + "step": 31960 + }, + { + "epoch": 0.2042472177146289, + "grad_norm": 0.5330253839492798, + "learning_rate": 9.745074441091294e-05, + "loss": 1.0081, + "step": 31970 + }, + { + "epoch": 0.20431110486436757, + "grad_norm": 1.0687589645385742, + "learning_rate": 9.744916244085606e-05, + "loss": 0.8934, + "step": 31980 + }, + { + "epoch": 0.20437499201410628, + "grad_norm": 0.6077286601066589, + "learning_rate": 9.744757999294506e-05, + "loss": 0.8938, + "step": 31990 + }, + { + "epoch": 0.20443887916384498, + "grad_norm": 0.6717079281806946, + "learning_rate": 9.744599706719588e-05, + "loss": 0.9467, + "step": 32000 + }, + { + "epoch": 0.2045027663135837, + "grad_norm": 1.032605767250061, + "learning_rate": 9.744441366362447e-05, + "loss": 0.9648, + "step": 32010 + }, + { + "epoch": 0.2045666534633224, + "grad_norm": 0.6703940629959106, + "learning_rate": 9.744282978224677e-05, + "loss": 0.7152, + "step": 32020 + }, + { + "epoch": 0.2046305406130611, + "grad_norm": 1.4983042478561401, + "learning_rate": 9.744124542307871e-05, + "loss": 0.9562, + "step": 32030 + }, + { + "epoch": 0.20469442776279978, + "grad_norm": 0.7340278029441833, + "learning_rate": 9.743966058613629e-05, + "loss": 0.9512, + "step": 32040 + }, + { + "epoch": 0.20475831491253849, + "grad_norm": 2.0036234855651855, + "learning_rate": 9.743807527143544e-05, + "loss": 1.1441, + "step": 32050 + }, + { + "epoch": 0.2048222020622772, + "grad_norm": 0.8495148420333862, + "learning_rate": 9.743648947899214e-05, + "loss": 1.0051, + "step": 32060 + }, + { + "epoch": 0.2048860892120159, + "grad_norm": 1.4452283382415771, + "learning_rate": 9.743490320882234e-05, + "loss": 0.8346, + "step": 32070 + }, + { + "epoch": 0.2049499763617546, + "grad_norm": 0.7870922684669495, + "learning_rate": 9.743331646094202e-05, + "loss": 0.8006, + "step": 32080 + }, + { + "epoch": 0.2050138635114933, + "grad_norm": 0.8627803325653076, + "learning_rate": 9.743172923536718e-05, + "loss": 0.947, + "step": 32090 + }, + { + "epoch": 0.205077750661232, + "grad_norm": 1.4049910306930542, + "learning_rate": 9.74301415321138e-05, + "loss": 1.1061, + "step": 32100 + }, + { + "epoch": 0.2051416378109707, + "grad_norm": 0.6403430700302124, + "learning_rate": 9.742855335119785e-05, + "loss": 0.8099, + "step": 32110 + }, + { + "epoch": 0.2052055249607094, + "grad_norm": 1.0958514213562012, + 
"learning_rate": 9.742696469263533e-05, + "loss": 0.9353, + "step": 32120 + }, + { + "epoch": 0.2052694121104481, + "grad_norm": 0.828372597694397, + "learning_rate": 9.742537555644225e-05, + "loss": 0.7152, + "step": 32130 + }, + { + "epoch": 0.2053332992601868, + "grad_norm": 0.743424654006958, + "learning_rate": 9.742378594263461e-05, + "loss": 0.8165, + "step": 32140 + }, + { + "epoch": 0.20539718640992552, + "grad_norm": 1.0300029516220093, + "learning_rate": 9.742219585122843e-05, + "loss": 1.0059, + "step": 32150 + }, + { + "epoch": 0.2054610735596642, + "grad_norm": 0.9428716897964478, + "learning_rate": 9.74206052822397e-05, + "loss": 0.9221, + "step": 32160 + }, + { + "epoch": 0.2055249607094029, + "grad_norm": 0.9042668342590332, + "learning_rate": 9.741901423568446e-05, + "loss": 0.8297, + "step": 32170 + }, + { + "epoch": 0.2055888478591416, + "grad_norm": 1.2374792098999023, + "learning_rate": 9.741742271157872e-05, + "loss": 0.8647, + "step": 32180 + }, + { + "epoch": 0.20565273500888032, + "grad_norm": 0.9123538136482239, + "learning_rate": 9.74158307099385e-05, + "loss": 0.6822, + "step": 32190 + }, + { + "epoch": 0.20571662215861902, + "grad_norm": 0.725796103477478, + "learning_rate": 9.741423823077986e-05, + "loss": 1.1005, + "step": 32200 + }, + { + "epoch": 0.20578050930835773, + "grad_norm": 0.8962036371231079, + "learning_rate": 9.741264527411881e-05, + "loss": 1.0891, + "step": 32210 + }, + { + "epoch": 0.2058443964580964, + "grad_norm": 0.9846658110618591, + "learning_rate": 9.741105183997141e-05, + "loss": 1.0041, + "step": 32220 + }, + { + "epoch": 0.2059082836078351, + "grad_norm": 0.4427562654018402, + "learning_rate": 9.74094579283537e-05, + "loss": 0.8606, + "step": 32230 + }, + { + "epoch": 0.20597217075757382, + "grad_norm": 0.8591815829277039, + "learning_rate": 9.740786353928173e-05, + "loss": 1.0499, + "step": 32240 + }, + { + "epoch": 0.20603605790731253, + "grad_norm": 0.5261662602424622, + "learning_rate": 9.740626867277157e-05, + "loss": 0.9264, + "step": 32250 + }, + { + "epoch": 0.20609994505705123, + "grad_norm": 1.2539498805999756, + "learning_rate": 9.740467332883926e-05, + "loss": 1.0337, + "step": 32260 + }, + { + "epoch": 0.20616383220678994, + "grad_norm": 0.6254390478134155, + "learning_rate": 9.740307750750088e-05, + "loss": 1.0999, + "step": 32270 + }, + { + "epoch": 0.20622771935652862, + "grad_norm": 0.6762027144432068, + "learning_rate": 9.740148120877251e-05, + "loss": 0.9724, + "step": 32280 + }, + { + "epoch": 0.20629160650626732, + "grad_norm": 0.9390422105789185, + "learning_rate": 9.73998844326702e-05, + "loss": 0.8626, + "step": 32290 + }, + { + "epoch": 0.20635549365600603, + "grad_norm": 0.8526495695114136, + "learning_rate": 9.739828717921006e-05, + "loss": 0.911, + "step": 32300 + }, + { + "epoch": 0.20641938080574473, + "grad_norm": 1.050434947013855, + "learning_rate": 9.739668944840817e-05, + "loss": 1.0802, + "step": 32310 + }, + { + "epoch": 0.20648326795548344, + "grad_norm": 0.8968641757965088, + "learning_rate": 9.739509124028062e-05, + "loss": 1.0353, + "step": 32320 + }, + { + "epoch": 0.20654715510522215, + "grad_norm": 0.9247165322303772, + "learning_rate": 9.739349255484346e-05, + "loss": 1.1142, + "step": 32330 + }, + { + "epoch": 0.20661104225496082, + "grad_norm": 0.7122106552124023, + "learning_rate": 9.739189339211286e-05, + "loss": 1.0356, + "step": 32340 + }, + { + "epoch": 0.20667492940469953, + "grad_norm": 0.5841015577316284, + "learning_rate": 9.739029375210489e-05, + "loss": 0.9243, + "step": 32350 
+ }, + { + "epoch": 0.20673881655443824, + "grad_norm": 0.7304105758666992, + "learning_rate": 9.738869363483565e-05, + "loss": 0.8895, + "step": 32360 + }, + { + "epoch": 0.20680270370417694, + "grad_norm": 0.9879099726676941, + "learning_rate": 9.738709304032128e-05, + "loss": 1.0733, + "step": 32370 + }, + { + "epoch": 0.20686659085391565, + "grad_norm": 1.092883586883545, + "learning_rate": 9.738549196857789e-05, + "loss": 0.9595, + "step": 32380 + }, + { + "epoch": 0.20693047800365436, + "grad_norm": 0.8127654194831848, + "learning_rate": 9.738389041962159e-05, + "loss": 0.6739, + "step": 32390 + }, + { + "epoch": 0.20699436515339303, + "grad_norm": 0.60942542552948, + "learning_rate": 9.738228839346853e-05, + "loss": 0.829, + "step": 32400 + }, + { + "epoch": 0.20705825230313174, + "grad_norm": 1.1465409994125366, + "learning_rate": 9.738068589013483e-05, + "loss": 0.7232, + "step": 32410 + }, + { + "epoch": 0.20712213945287045, + "grad_norm": 0.6177552342414856, + "learning_rate": 9.737908290963663e-05, + "loss": 0.8286, + "step": 32420 + }, + { + "epoch": 0.20718602660260915, + "grad_norm": 0.5419365763664246, + "learning_rate": 9.737747945199009e-05, + "loss": 0.8722, + "step": 32430 + }, + { + "epoch": 0.20724991375234786, + "grad_norm": 1.0209770202636719, + "learning_rate": 9.737587551721132e-05, + "loss": 0.9187, + "step": 32440 + }, + { + "epoch": 0.20731380090208656, + "grad_norm": 0.7830290198326111, + "learning_rate": 9.737427110531652e-05, + "loss": 1.1599, + "step": 32450 + }, + { + "epoch": 0.20737768805182524, + "grad_norm": 1.0259994268417358, + "learning_rate": 9.737266621632182e-05, + "loss": 1.1211, + "step": 32460 + }, + { + "epoch": 0.20744157520156395, + "grad_norm": 0.6848270893096924, + "learning_rate": 9.73710608502434e-05, + "loss": 0.7989, + "step": 32470 + }, + { + "epoch": 0.20750546235130266, + "grad_norm": 0.779099702835083, + "learning_rate": 9.736945500709737e-05, + "loss": 0.9887, + "step": 32480 + }, + { + "epoch": 0.20756934950104136, + "grad_norm": 0.7140209078788757, + "learning_rate": 9.736784868689999e-05, + "loss": 1.075, + "step": 32490 + }, + { + "epoch": 0.20763323665078007, + "grad_norm": 0.7910488247871399, + "learning_rate": 9.736624188966738e-05, + "loss": 1.1467, + "step": 32500 + }, + { + "epoch": 0.20769712380051877, + "grad_norm": 0.8852772116661072, + "learning_rate": 9.736463461541574e-05, + "loss": 1.0504, + "step": 32510 + }, + { + "epoch": 0.20776101095025745, + "grad_norm": 1.6205745935440063, + "learning_rate": 9.736302686416126e-05, + "loss": 0.8582, + "step": 32520 + }, + { + "epoch": 0.20782489809999616, + "grad_norm": 0.9984052777290344, + "learning_rate": 9.736141863592012e-05, + "loss": 1.0526, + "step": 32530 + }, + { + "epoch": 0.20788878524973486, + "grad_norm": 0.7698317170143127, + "learning_rate": 9.735980993070852e-05, + "loss": 0.8745, + "step": 32540 + }, + { + "epoch": 0.20795267239947357, + "grad_norm": 1.8012065887451172, + "learning_rate": 9.735820074854265e-05, + "loss": 0.9542, + "step": 32550 + }, + { + "epoch": 0.20801655954921228, + "grad_norm": 0.7188138365745544, + "learning_rate": 9.735659108943876e-05, + "loss": 0.6682, + "step": 32560 + }, + { + "epoch": 0.20808044669895098, + "grad_norm": 0.7604565620422363, + "learning_rate": 9.7354980953413e-05, + "loss": 0.7246, + "step": 32570 + }, + { + "epoch": 0.20814433384868966, + "grad_norm": 0.6722016334533691, + "learning_rate": 9.735337034048162e-05, + "loss": 0.8719, + "step": 32580 + }, + { + "epoch": 0.20820822099842837, + "grad_norm": 
0.5613377690315247, + "learning_rate": 9.735175925066082e-05, + "loss": 0.8531, + "step": 32590 + }, + { + "epoch": 0.20827210814816707, + "grad_norm": 1.168945550918579, + "learning_rate": 9.735014768396686e-05, + "loss": 1.1047, + "step": 32600 + }, + { + "epoch": 0.20833599529790578, + "grad_norm": 0.7283167243003845, + "learning_rate": 9.734853564041595e-05, + "loss": 0.7414, + "step": 32610 + }, + { + "epoch": 0.20839988244764449, + "grad_norm": 0.8897091150283813, + "learning_rate": 9.734692312002431e-05, + "loss": 0.9406, + "step": 32620 + }, + { + "epoch": 0.2084637695973832, + "grad_norm": 0.6193281412124634, + "learning_rate": 9.734531012280821e-05, + "loss": 0.8429, + "step": 32630 + }, + { + "epoch": 0.20852765674712187, + "grad_norm": 1.2287752628326416, + "learning_rate": 9.734369664878387e-05, + "loss": 0.9993, + "step": 32640 + }, + { + "epoch": 0.20859154389686058, + "grad_norm": 1.4086371660232544, + "learning_rate": 9.734208269796754e-05, + "loss": 0.6823, + "step": 32650 + }, + { + "epoch": 0.20865543104659928, + "grad_norm": 0.9113640785217285, + "learning_rate": 9.734046827037548e-05, + "loss": 1.1112, + "step": 32660 + }, + { + "epoch": 0.208719318196338, + "grad_norm": 0.7698211073875427, + "learning_rate": 9.733885336602396e-05, + "loss": 0.7977, + "step": 32670 + }, + { + "epoch": 0.2087832053460767, + "grad_norm": 2.5170323848724365, + "learning_rate": 9.733723798492921e-05, + "loss": 0.8861, + "step": 32680 + }, + { + "epoch": 0.2088470924958154, + "grad_norm": 0.5907607078552246, + "learning_rate": 9.733562212710755e-05, + "loss": 0.8325, + "step": 32690 + }, + { + "epoch": 0.2089109796455541, + "grad_norm": 0.7293870449066162, + "learning_rate": 9.733400579257521e-05, + "loss": 0.8956, + "step": 32700 + }, + { + "epoch": 0.20897486679529279, + "grad_norm": 0.9861850738525391, + "learning_rate": 9.733238898134848e-05, + "loss": 0.8441, + "step": 32710 + }, + { + "epoch": 0.2090387539450315, + "grad_norm": 0.8502741456031799, + "learning_rate": 9.733077169344366e-05, + "loss": 0.7623, + "step": 32720 + }, + { + "epoch": 0.2091026410947702, + "grad_norm": 0.6573517322540283, + "learning_rate": 9.7329153928877e-05, + "loss": 1.134, + "step": 32730 + }, + { + "epoch": 0.2091665282445089, + "grad_norm": 1.0283352136611938, + "learning_rate": 9.732753568766482e-05, + "loss": 1.1342, + "step": 32740 + }, + { + "epoch": 0.2092304153942476, + "grad_norm": 0.9217149019241333, + "learning_rate": 9.732591696982343e-05, + "loss": 0.7505, + "step": 32750 + }, + { + "epoch": 0.20929430254398632, + "grad_norm": 1.5344794988632202, + "learning_rate": 9.732429777536909e-05, + "loss": 0.8524, + "step": 32760 + }, + { + "epoch": 0.209358189693725, + "grad_norm": 0.6569311022758484, + "learning_rate": 9.732267810431814e-05, + "loss": 0.9557, + "step": 32770 + }, + { + "epoch": 0.2094220768434637, + "grad_norm": 1.269944190979004, + "learning_rate": 9.732105795668689e-05, + "loss": 0.8407, + "step": 32780 + }, + { + "epoch": 0.2094859639932024, + "grad_norm": 0.914414644241333, + "learning_rate": 9.731943733249164e-05, + "loss": 0.7725, + "step": 32790 + }, + { + "epoch": 0.2095498511429411, + "grad_norm": 0.5438032746315002, + "learning_rate": 9.731781623174871e-05, + "loss": 0.9418, + "step": 32800 + }, + { + "epoch": 0.20961373829267982, + "grad_norm": 0.9533820152282715, + "learning_rate": 9.731619465447445e-05, + "loss": 0.7887, + "step": 32810 + }, + { + "epoch": 0.20967762544241852, + "grad_norm": 0.9719078540802002, + "learning_rate": 9.731457260068517e-05, + "loss": 1.0511, 
+ "step": 32820 + }, + { + "epoch": 0.2097415125921572, + "grad_norm": 0.8131768107414246, + "learning_rate": 9.73129500703972e-05, + "loss": 0.7443, + "step": 32830 + }, + { + "epoch": 0.2098053997418959, + "grad_norm": 0.9436559081077576, + "learning_rate": 9.731132706362692e-05, + "loss": 0.7655, + "step": 32840 + }, + { + "epoch": 0.20986928689163462, + "grad_norm": 0.6353892683982849, + "learning_rate": 9.730970358039062e-05, + "loss": 0.9139, + "step": 32850 + }, + { + "epoch": 0.20993317404137332, + "grad_norm": 1.7300466299057007, + "learning_rate": 9.730807962070467e-05, + "loss": 0.8533, + "step": 32860 + }, + { + "epoch": 0.20999706119111203, + "grad_norm": 0.9070175886154175, + "learning_rate": 9.730645518458545e-05, + "loss": 1.0384, + "step": 32870 + }, + { + "epoch": 0.21006094834085073, + "grad_norm": 1.630418300628662, + "learning_rate": 9.73048302720493e-05, + "loss": 1.1833, + "step": 32880 + }, + { + "epoch": 0.2101248354905894, + "grad_norm": 0.6094731092453003, + "learning_rate": 9.730320488311258e-05, + "loss": 0.8528, + "step": 32890 + }, + { + "epoch": 0.21018872264032812, + "grad_norm": 0.9163777828216553, + "learning_rate": 9.730157901779165e-05, + "loss": 0.986, + "step": 32900 + }, + { + "epoch": 0.21025260979006682, + "grad_norm": 0.885759174823761, + "learning_rate": 9.729995267610293e-05, + "loss": 1.0211, + "step": 32910 + }, + { + "epoch": 0.21031649693980553, + "grad_norm": 0.6660359501838684, + "learning_rate": 9.729832585806273e-05, + "loss": 0.8855, + "step": 32920 + }, + { + "epoch": 0.21038038408954424, + "grad_norm": 0.9728102087974548, + "learning_rate": 9.729669856368748e-05, + "loss": 0.9548, + "step": 32930 + }, + { + "epoch": 0.21044427123928294, + "grad_norm": 0.8899286985397339, + "learning_rate": 9.729507079299359e-05, + "loss": 0.98, + "step": 32940 + }, + { + "epoch": 0.21050815838902162, + "grad_norm": 0.8630788326263428, + "learning_rate": 9.729344254599738e-05, + "loss": 0.8842, + "step": 32950 + }, + { + "epoch": 0.21057204553876033, + "grad_norm": 1.159555435180664, + "learning_rate": 9.72918138227153e-05, + "loss": 1.0565, + "step": 32960 + }, + { + "epoch": 0.21063593268849903, + "grad_norm": 0.9720593690872192, + "learning_rate": 9.729018462316375e-05, + "loss": 0.8663, + "step": 32970 + }, + { + "epoch": 0.21069981983823774, + "grad_norm": 1.0807291269302368, + "learning_rate": 9.728855494735914e-05, + "loss": 0.7609, + "step": 32980 + }, + { + "epoch": 0.21076370698797645, + "grad_norm": 0.9693974852561951, + "learning_rate": 9.728692479531784e-05, + "loss": 0.9466, + "step": 32990 + }, + { + "epoch": 0.21082759413771515, + "grad_norm": 1.1828261613845825, + "learning_rate": 9.728529416705632e-05, + "loss": 1.17, + "step": 33000 + }, + { + "epoch": 0.21089148128745383, + "grad_norm": 0.8070554733276367, + "learning_rate": 9.728366306259098e-05, + "loss": 0.999, + "step": 33010 + }, + { + "epoch": 0.21095536843719254, + "grad_norm": 0.6054061651229858, + "learning_rate": 9.728203148193824e-05, + "loss": 0.7462, + "step": 33020 + }, + { + "epoch": 0.21101925558693124, + "grad_norm": 0.9334638714790344, + "learning_rate": 9.728039942511453e-05, + "loss": 0.8478, + "step": 33030 + }, + { + "epoch": 0.21108314273666995, + "grad_norm": 0.692486584186554, + "learning_rate": 9.727876689213631e-05, + "loss": 0.9051, + "step": 33040 + }, + { + "epoch": 0.21114702988640865, + "grad_norm": 0.7370048761367798, + "learning_rate": 9.727713388302e-05, + "loss": 1.131, + "step": 33050 + }, + { + "epoch": 0.21121091703614736, + "grad_norm": 
0.8169997930526733, + "learning_rate": 9.727550039778205e-05, + "loss": 0.762, + "step": 33060 + }, + { + "epoch": 0.21127480418588604, + "grad_norm": 1.1108886003494263, + "learning_rate": 9.727386643643891e-05, + "loss": 0.8818, + "step": 33070 + }, + { + "epoch": 0.21133869133562475, + "grad_norm": 2.2037575244903564, + "learning_rate": 9.727223199900704e-05, + "loss": 0.9574, + "step": 33080 + }, + { + "epoch": 0.21140257848536345, + "grad_norm": 0.820559024810791, + "learning_rate": 9.72705970855029e-05, + "loss": 1.0319, + "step": 33090 + }, + { + "epoch": 0.21146646563510216, + "grad_norm": 0.6320390701293945, + "learning_rate": 9.726896169594295e-05, + "loss": 0.8773, + "step": 33100 + }, + { + "epoch": 0.21153035278484086, + "grad_norm": 0.6292109489440918, + "learning_rate": 9.726732583034365e-05, + "loss": 0.7979, + "step": 33110 + }, + { + "epoch": 0.21159423993457957, + "grad_norm": 1.0046201944351196, + "learning_rate": 9.72656894887215e-05, + "loss": 0.7807, + "step": 33120 + }, + { + "epoch": 0.21165812708431825, + "grad_norm": 0.8816448450088501, + "learning_rate": 9.726405267109297e-05, + "loss": 0.7321, + "step": 33130 + }, + { + "epoch": 0.21172201423405695, + "grad_norm": 0.9356503486633301, + "learning_rate": 9.726241537747454e-05, + "loss": 0.791, + "step": 33140 + }, + { + "epoch": 0.21178590138379566, + "grad_norm": 0.8952210545539856, + "learning_rate": 9.72607776078827e-05, + "loss": 0.9033, + "step": 33150 + }, + { + "epoch": 0.21184978853353437, + "grad_norm": 0.6787972450256348, + "learning_rate": 9.725913936233393e-05, + "loss": 0.8994, + "step": 33160 + }, + { + "epoch": 0.21191367568327307, + "grad_norm": 1.112884759902954, + "learning_rate": 9.725750064084476e-05, + "loss": 0.8439, + "step": 33170 + }, + { + "epoch": 0.21197756283301178, + "grad_norm": 1.08254873752594, + "learning_rate": 9.725586144343166e-05, + "loss": 0.8901, + "step": 33180 + }, + { + "epoch": 0.21204144998275046, + "grad_norm": 0.7427080273628235, + "learning_rate": 9.725422177011116e-05, + "loss": 0.9528, + "step": 33190 + }, + { + "epoch": 0.21210533713248916, + "grad_norm": 0.6845873594284058, + "learning_rate": 9.725274565723552e-05, + "loss": 1.1284, + "step": 33200 + }, + { + "epoch": 0.21216922428222787, + "grad_norm": 2.6716866493225098, + "learning_rate": 9.725110507973644e-05, + "loss": 0.9867, + "step": 33210 + }, + { + "epoch": 0.21223311143196658, + "grad_norm": 1.6081085205078125, + "learning_rate": 9.724946402637786e-05, + "loss": 0.6687, + "step": 33220 + }, + { + "epoch": 0.21229699858170528, + "grad_norm": 0.7291703820228577, + "learning_rate": 9.724782249717628e-05, + "loss": 0.8611, + "step": 33230 + }, + { + "epoch": 0.212360885731444, + "grad_norm": 0.6999391317367554, + "learning_rate": 9.724618049214828e-05, + "loss": 0.8015, + "step": 33240 + }, + { + "epoch": 0.21242477288118267, + "grad_norm": 0.7499661445617676, + "learning_rate": 9.724453801131035e-05, + "loss": 0.8521, + "step": 33250 + }, + { + "epoch": 0.21248866003092137, + "grad_norm": 1.027510404586792, + "learning_rate": 9.724289505467906e-05, + "loss": 1.0125, + "step": 33260 + }, + { + "epoch": 0.21255254718066008, + "grad_norm": 1.0336750745773315, + "learning_rate": 9.724125162227095e-05, + "loss": 0.8207, + "step": 33270 + }, + { + "epoch": 0.21261643433039878, + "grad_norm": 0.8094274401664734, + "learning_rate": 9.723960771410256e-05, + "loss": 0.7034, + "step": 33280 + }, + { + "epoch": 0.2126803214801375, + "grad_norm": 0.9066417813301086, + "learning_rate": 9.723796333019044e-05, + "loss": 
0.8273, + "step": 33290 + }, + { + "epoch": 0.2127442086298762, + "grad_norm": 1.2769392728805542, + "learning_rate": 9.723631847055119e-05, + "loss": 0.792, + "step": 33300 + }, + { + "epoch": 0.21280809577961488, + "grad_norm": 0.751732349395752, + "learning_rate": 9.723467313520133e-05, + "loss": 0.8004, + "step": 33310 + }, + { + "epoch": 0.21287198292935358, + "grad_norm": 0.7040248513221741, + "learning_rate": 9.723302732415745e-05, + "loss": 1.0993, + "step": 33320 + }, + { + "epoch": 0.2129358700790923, + "grad_norm": 0.6100977063179016, + "learning_rate": 9.723138103743612e-05, + "loss": 0.7998, + "step": 33330 + }, + { + "epoch": 0.212999757228831, + "grad_norm": 1.0050344467163086, + "learning_rate": 9.722973427505391e-05, + "loss": 0.8967, + "step": 33340 + }, + { + "epoch": 0.2130636443785697, + "grad_norm": 0.5379306674003601, + "learning_rate": 9.722808703702743e-05, + "loss": 0.7652, + "step": 33350 + }, + { + "epoch": 0.2131275315283084, + "grad_norm": 0.6813077330589294, + "learning_rate": 9.722643932337327e-05, + "loss": 1.2678, + "step": 33360 + }, + { + "epoch": 0.21319141867804708, + "grad_norm": 1.1152585744857788, + "learning_rate": 9.722479113410799e-05, + "loss": 0.9101, + "step": 33370 + }, + { + "epoch": 0.2132553058277858, + "grad_norm": 0.8351494073867798, + "learning_rate": 9.722314246924822e-05, + "loss": 0.8285, + "step": 33380 + }, + { + "epoch": 0.2133191929775245, + "grad_norm": 0.7308449149131775, + "learning_rate": 9.722149332881054e-05, + "loss": 1.1201, + "step": 33390 + }, + { + "epoch": 0.2133830801272632, + "grad_norm": 1.078356385231018, + "learning_rate": 9.721984371281158e-05, + "loss": 0.9609, + "step": 33400 + }, + { + "epoch": 0.2134469672770019, + "grad_norm": 1.385568380355835, + "learning_rate": 9.721819362126793e-05, + "loss": 0.9715, + "step": 33410 + }, + { + "epoch": 0.21351085442674061, + "grad_norm": 0.8912048935890198, + "learning_rate": 9.721654305419623e-05, + "loss": 0.7701, + "step": 33420 + }, + { + "epoch": 0.2135747415764793, + "grad_norm": 0.7083896994590759, + "learning_rate": 9.721489201161309e-05, + "loss": 0.8202, + "step": 33430 + }, + { + "epoch": 0.213638628726218, + "grad_norm": 0.6518615484237671, + "learning_rate": 9.721324049353515e-05, + "loss": 0.7974, + "step": 33440 + }, + { + "epoch": 0.2137025158759567, + "grad_norm": 0.7615000605583191, + "learning_rate": 9.721158849997903e-05, + "loss": 0.9024, + "step": 33450 + }, + { + "epoch": 0.2137664030256954, + "grad_norm": 0.6199432611465454, + "learning_rate": 9.720993603096136e-05, + "loss": 1.0076, + "step": 33460 + }, + { + "epoch": 0.21383029017543412, + "grad_norm": 0.6537955403327942, + "learning_rate": 9.720828308649879e-05, + "loss": 0.9644, + "step": 33470 + }, + { + "epoch": 0.21389417732517282, + "grad_norm": 0.8364148139953613, + "learning_rate": 9.720662966660799e-05, + "loss": 0.7734, + "step": 33480 + }, + { + "epoch": 0.21395806447491153, + "grad_norm": 0.8252184391021729, + "learning_rate": 9.720497577130557e-05, + "loss": 0.9241, + "step": 33490 + }, + { + "epoch": 0.2140219516246502, + "grad_norm": 1.0425599813461304, + "learning_rate": 9.72033214006082e-05, + "loss": 0.6872, + "step": 33500 + }, + { + "epoch": 0.21408583877438891, + "grad_norm": 0.7613168358802795, + "learning_rate": 9.720166655453256e-05, + "loss": 0.8292, + "step": 33510 + }, + { + "epoch": 0.21414972592412762, + "grad_norm": 0.7358224391937256, + "learning_rate": 9.72000112330953e-05, + "loss": 0.7993, + "step": 33520 + }, + { + "epoch": 0.21421361307386633, + "grad_norm": 
1.4351872205734253, + "learning_rate": 9.71983554363131e-05, + "loss": 0.9427, + "step": 33530 + }, + { + "epoch": 0.21427750022360503, + "grad_norm": 0.9211145043373108, + "learning_rate": 9.719669916420262e-05, + "loss": 0.7403, + "step": 33540 + }, + { + "epoch": 0.21434138737334374, + "grad_norm": 0.5790296792984009, + "learning_rate": 9.719504241678054e-05, + "loss": 0.77, + "step": 33550 + }, + { + "epoch": 0.21440527452308242, + "grad_norm": 1.3659369945526123, + "learning_rate": 9.719338519406358e-05, + "loss": 0.9941, + "step": 33560 + }, + { + "epoch": 0.21446916167282112, + "grad_norm": 0.6189954876899719, + "learning_rate": 9.719172749606838e-05, + "loss": 0.8592, + "step": 33570 + }, + { + "epoch": 0.21453304882255983, + "grad_norm": 0.8214682936668396, + "learning_rate": 9.719006932281167e-05, + "loss": 0.7411, + "step": 33580 + }, + { + "epoch": 0.21459693597229854, + "grad_norm": 0.5750226974487305, + "learning_rate": 9.718841067431013e-05, + "loss": 0.7238, + "step": 33590 + }, + { + "epoch": 0.21466082312203724, + "grad_norm": 1.5233280658721924, + "learning_rate": 9.718675155058046e-05, + "loss": 0.7061, + "step": 33600 + }, + { + "epoch": 0.21472471027177595, + "grad_norm": 0.5941923260688782, + "learning_rate": 9.718509195163939e-05, + "loss": 1.0065, + "step": 33610 + }, + { + "epoch": 0.21478859742151463, + "grad_norm": 0.8326600790023804, + "learning_rate": 9.718343187750363e-05, + "loss": 0.8198, + "step": 33620 + }, + { + "epoch": 0.21485248457125333, + "grad_norm": 0.6903313994407654, + "learning_rate": 9.718177132818988e-05, + "loss": 0.8067, + "step": 33630 + }, + { + "epoch": 0.21491637172099204, + "grad_norm": 1.647194266319275, + "learning_rate": 9.71801103037149e-05, + "loss": 0.8966, + "step": 33640 + }, + { + "epoch": 0.21498025887073074, + "grad_norm": 0.6679027080535889, + "learning_rate": 9.717844880409537e-05, + "loss": 0.7546, + "step": 33650 + }, + { + "epoch": 0.21504414602046945, + "grad_norm": 0.8270406723022461, + "learning_rate": 9.717678682934803e-05, + "loss": 1.068, + "step": 33660 + }, + { + "epoch": 0.21510803317020816, + "grad_norm": 0.6147032976150513, + "learning_rate": 9.717512437948966e-05, + "loss": 0.747, + "step": 33670 + }, + { + "epoch": 0.21517192031994684, + "grad_norm": 1.2196052074432373, + "learning_rate": 9.717346145453696e-05, + "loss": 0.7214, + "step": 33680 + }, + { + "epoch": 0.21523580746968554, + "grad_norm": 1.0216395854949951, + "learning_rate": 9.717179805450671e-05, + "loss": 0.8437, + "step": 33690 + }, + { + "epoch": 0.21529969461942425, + "grad_norm": 0.7304588556289673, + "learning_rate": 9.717013417941563e-05, + "loss": 0.6288, + "step": 33700 + }, + { + "epoch": 0.21536358176916295, + "grad_norm": 1.711125135421753, + "learning_rate": 9.716846982928049e-05, + "loss": 0.8811, + "step": 33710 + }, + { + "epoch": 0.21542746891890166, + "grad_norm": 0.868000864982605, + "learning_rate": 9.716680500411805e-05, + "loss": 0.857, + "step": 33720 + }, + { + "epoch": 0.21549135606864037, + "grad_norm": 0.7319660186767578, + "learning_rate": 9.716513970394509e-05, + "loss": 0.8252, + "step": 33730 + }, + { + "epoch": 0.21555524321837904, + "grad_norm": 0.9054515361785889, + "learning_rate": 9.716347392877836e-05, + "loss": 0.8681, + "step": 33740 + }, + { + "epoch": 0.21561913036811775, + "grad_norm": 1.218607783317566, + "learning_rate": 9.716180767863465e-05, + "loss": 0.9609, + "step": 33750 + }, + { + "epoch": 0.21568301751785646, + "grad_norm": 0.9217560291290283, + "learning_rate": 9.716014095353075e-05, + 
"loss": 0.8119, + "step": 33760 + }, + { + "epoch": 0.21574690466759516, + "grad_norm": 0.7078598141670227, + "learning_rate": 9.715847375348342e-05, + "loss": 0.9151, + "step": 33770 + }, + { + "epoch": 0.21581079181733387, + "grad_norm": 0.7617483139038086, + "learning_rate": 9.715680607850945e-05, + "loss": 0.9346, + "step": 33780 + }, + { + "epoch": 0.21587467896707258, + "grad_norm": 0.7594091892242432, + "learning_rate": 9.715513792862565e-05, + "loss": 0.9478, + "step": 33790 + }, + { + "epoch": 0.21593856611681125, + "grad_norm": 0.9850571155548096, + "learning_rate": 9.715346930384882e-05, + "loss": 0.7815, + "step": 33800 + }, + { + "epoch": 0.21600245326654996, + "grad_norm": 0.8838279843330383, + "learning_rate": 9.715180020419576e-05, + "loss": 1.0338, + "step": 33810 + }, + { + "epoch": 0.21606634041628867, + "grad_norm": 0.7649998068809509, + "learning_rate": 9.715013062968328e-05, + "loss": 0.839, + "step": 33820 + }, + { + "epoch": 0.21613022756602737, + "grad_norm": 0.8073322176933289, + "learning_rate": 9.71484605803282e-05, + "loss": 1.0359, + "step": 33830 + }, + { + "epoch": 0.21619411471576608, + "grad_norm": 1.914969563484192, + "learning_rate": 9.714679005614733e-05, + "loss": 0.972, + "step": 33840 + }, + { + "epoch": 0.21625800186550478, + "grad_norm": 0.781913161277771, + "learning_rate": 9.714511905715749e-05, + "loss": 1.2603, + "step": 33850 + }, + { + "epoch": 0.21632188901524346, + "grad_norm": 0.5499342083930969, + "learning_rate": 9.714344758337553e-05, + "loss": 1.0211, + "step": 33860 + }, + { + "epoch": 0.21638577616498217, + "grad_norm": 2.390815496444702, + "learning_rate": 9.714177563481824e-05, + "loss": 1.1886, + "step": 33870 + }, + { + "epoch": 0.21644966331472087, + "grad_norm": 2.6002392768859863, + "learning_rate": 9.71401032115025e-05, + "loss": 1.0595, + "step": 33880 + }, + { + "epoch": 0.21651355046445958, + "grad_norm": 0.8145592212677002, + "learning_rate": 9.713843031344515e-05, + "loss": 0.8558, + "step": 33890 + }, + { + "epoch": 0.2165774376141983, + "grad_norm": 0.7605422139167786, + "learning_rate": 9.713675694066302e-05, + "loss": 0.79, + "step": 33900 + }, + { + "epoch": 0.216641324763937, + "grad_norm": 0.9282397031784058, + "learning_rate": 9.713508309317296e-05, + "loss": 0.8963, + "step": 33910 + }, + { + "epoch": 0.21670521191367567, + "grad_norm": 0.6586880683898926, + "learning_rate": 9.713340877099183e-05, + "loss": 0.7421, + "step": 33920 + }, + { + "epoch": 0.21676909906341438, + "grad_norm": 0.9235056042671204, + "learning_rate": 9.713173397413652e-05, + "loss": 0.8292, + "step": 33930 + }, + { + "epoch": 0.21683298621315308, + "grad_norm": 0.7915987372398376, + "learning_rate": 9.713005870262386e-05, + "loss": 0.8096, + "step": 33940 + }, + { + "epoch": 0.2168968733628918, + "grad_norm": 0.5287061333656311, + "learning_rate": 9.712838295647074e-05, + "loss": 0.7746, + "step": 33950 + }, + { + "epoch": 0.2169607605126305, + "grad_norm": 0.7330449819564819, + "learning_rate": 9.712670673569403e-05, + "loss": 0.9486, + "step": 33960 + }, + { + "epoch": 0.2170246476623692, + "grad_norm": 0.7698398232460022, + "learning_rate": 9.712503004031061e-05, + "loss": 0.9407, + "step": 33970 + }, + { + "epoch": 0.21708853481210788, + "grad_norm": 5.071091651916504, + "learning_rate": 9.712335287033739e-05, + "loss": 1.046, + "step": 33980 + }, + { + "epoch": 0.2171524219618466, + "grad_norm": 0.8342990875244141, + "learning_rate": 9.712167522579121e-05, + "loss": 0.7953, + "step": 33990 + }, + { + "epoch": 0.2172163091115853, + 
"grad_norm": 1.221957802772522, + "learning_rate": 9.7119997106689e-05, + "loss": 0.8411, + "step": 34000 + }, + { + "epoch": 0.217280196261324, + "grad_norm": 1.421647548675537, + "learning_rate": 9.711831851304767e-05, + "loss": 0.8459, + "step": 34010 + }, + { + "epoch": 0.2173440834110627, + "grad_norm": 1.0833210945129395, + "learning_rate": 9.71166394448841e-05, + "loss": 0.9682, + "step": 34020 + }, + { + "epoch": 0.2174079705608014, + "grad_norm": 0.7942554354667664, + "learning_rate": 9.71149599022152e-05, + "loss": 0.9468, + "step": 34030 + }, + { + "epoch": 0.2174718577105401, + "grad_norm": 0.5950953364372253, + "learning_rate": 9.71132798850579e-05, + "loss": 0.9885, + "step": 34040 + }, + { + "epoch": 0.2175357448602788, + "grad_norm": 1.1501030921936035, + "learning_rate": 9.711159939342911e-05, + "loss": 0.7241, + "step": 34050 + }, + { + "epoch": 0.2175996320100175, + "grad_norm": 0.8352699875831604, + "learning_rate": 9.710991842734577e-05, + "loss": 0.9376, + "step": 34060 + }, + { + "epoch": 0.2176635191597562, + "grad_norm": 1.2237290143966675, + "learning_rate": 9.710823698682478e-05, + "loss": 0.8397, + "step": 34070 + }, + { + "epoch": 0.21772740630949491, + "grad_norm": 1.1886348724365234, + "learning_rate": 9.71065550718831e-05, + "loss": 0.8056, + "step": 34080 + }, + { + "epoch": 0.21779129345923362, + "grad_norm": 0.954849362373352, + "learning_rate": 9.710487268253765e-05, + "loss": 0.9837, + "step": 34090 + }, + { + "epoch": 0.2178551806089723, + "grad_norm": 0.7035555243492126, + "learning_rate": 9.710318981880539e-05, + "loss": 0.7794, + "step": 34100 + }, + { + "epoch": 0.217919067758711, + "grad_norm": 1.048746109008789, + "learning_rate": 9.710150648070325e-05, + "loss": 0.8262, + "step": 34110 + }, + { + "epoch": 0.2179829549084497, + "grad_norm": 0.8809221386909485, + "learning_rate": 9.70998226682482e-05, + "loss": 1.0516, + "step": 34120 + }, + { + "epoch": 0.21804684205818842, + "grad_norm": 1.0661201477050781, + "learning_rate": 9.709813838145718e-05, + "loss": 1.0833, + "step": 34130 + }, + { + "epoch": 0.21811072920792712, + "grad_norm": 1.1189355850219727, + "learning_rate": 9.709645362034716e-05, + "loss": 1.1234, + "step": 34140 + }, + { + "epoch": 0.21817461635766583, + "grad_norm": 0.872307538986206, + "learning_rate": 9.709476838493511e-05, + "loss": 1.0436, + "step": 34150 + }, + { + "epoch": 0.2182385035074045, + "grad_norm": 0.6649029850959778, + "learning_rate": 9.709308267523801e-05, + "loss": 0.8959, + "step": 34160 + }, + { + "epoch": 0.2183023906571432, + "grad_norm": 0.6744316220283508, + "learning_rate": 9.70913964912728e-05, + "loss": 0.9037, + "step": 34170 + }, + { + "epoch": 0.21836627780688192, + "grad_norm": 1.0881192684173584, + "learning_rate": 9.708970983305652e-05, + "loss": 0.8183, + "step": 34180 + }, + { + "epoch": 0.21843016495662063, + "grad_norm": 0.9044772386550903, + "learning_rate": 9.70880227006061e-05, + "loss": 0.9755, + "step": 34190 + }, + { + "epoch": 0.21849405210635933, + "grad_norm": 0.9986025094985962, + "learning_rate": 9.708633509393856e-05, + "loss": 0.9058, + "step": 34200 + }, + { + "epoch": 0.21855793925609804, + "grad_norm": 0.7025921940803528, + "learning_rate": 9.70846470130709e-05, + "loss": 1.0454, + "step": 34210 + }, + { + "epoch": 0.21862182640583672, + "grad_norm": 0.6166189312934875, + "learning_rate": 9.70829584580201e-05, + "loss": 0.9536, + "step": 34220 + }, + { + "epoch": 0.21868571355557542, + "grad_norm": 1.0105708837509155, + "learning_rate": 9.708126942880318e-05, + "loss": 
0.7328, + "step": 34230 + }, + { + "epoch": 0.21874960070531413, + "grad_norm": 0.7658517956733704, + "learning_rate": 9.707957992543714e-05, + "loss": 0.753, + "step": 34240 + }, + { + "epoch": 0.21881348785505284, + "grad_norm": 0.8330119252204895, + "learning_rate": 9.707788994793901e-05, + "loss": 0.9129, + "step": 34250 + }, + { + "epoch": 0.21887737500479154, + "grad_norm": 1.216202735900879, + "learning_rate": 9.707619949632578e-05, + "loss": 0.7501, + "step": 34260 + }, + { + "epoch": 0.21894126215453025, + "grad_norm": 0.7274483442306519, + "learning_rate": 9.707450857061452e-05, + "loss": 0.8814, + "step": 34270 + }, + { + "epoch": 0.21900514930426893, + "grad_norm": 0.7238608598709106, + "learning_rate": 9.707281717082222e-05, + "loss": 0.7132, + "step": 34280 + }, + { + "epoch": 0.21906903645400763, + "grad_norm": 0.736379861831665, + "learning_rate": 9.707112529696594e-05, + "loss": 0.8236, + "step": 34290 + }, + { + "epoch": 0.21913292360374634, + "grad_norm": 0.8833523988723755, + "learning_rate": 9.706943294906268e-05, + "loss": 1.0377, + "step": 34300 + }, + { + "epoch": 0.21919681075348504, + "grad_norm": 0.7226671576499939, + "learning_rate": 9.706774012712953e-05, + "loss": 0.9242, + "step": 34310 + }, + { + "epoch": 0.21926069790322375, + "grad_norm": 1.3238605260849, + "learning_rate": 9.706604683118353e-05, + "loss": 0.9551, + "step": 34320 + }, + { + "epoch": 0.21932458505296246, + "grad_norm": 2.324223279953003, + "learning_rate": 9.706435306124169e-05, + "loss": 1.0707, + "step": 34330 + }, + { + "epoch": 0.21938847220270116, + "grad_norm": 0.6457687020301819, + "learning_rate": 9.70626588173211e-05, + "loss": 0.8856, + "step": 34340 + }, + { + "epoch": 0.21945235935243984, + "grad_norm": 0.7554599642753601, + "learning_rate": 9.706096409943883e-05, + "loss": 0.8271, + "step": 34350 + }, + { + "epoch": 0.21951624650217855, + "grad_norm": 1.154531478881836, + "learning_rate": 9.705926890761195e-05, + "loss": 1.2138, + "step": 34360 + }, + { + "epoch": 0.21958013365191725, + "grad_norm": 0.8493779897689819, + "learning_rate": 9.705757324185751e-05, + "loss": 0.7191, + "step": 34370 + }, + { + "epoch": 0.21964402080165596, + "grad_norm": 1.1541070938110352, + "learning_rate": 9.705587710219259e-05, + "loss": 0.9184, + "step": 34380 + }, + { + "epoch": 0.21970790795139467, + "grad_norm": 2.6271910667419434, + "learning_rate": 9.705418048863429e-05, + "loss": 1.0036, + "step": 34390 + }, + { + "epoch": 0.21977179510113337, + "grad_norm": 0.7804545164108276, + "learning_rate": 9.705248340119968e-05, + "loss": 1.1445, + "step": 34400 + }, + { + "epoch": 0.21983568225087205, + "grad_norm": 0.515604555606842, + "learning_rate": 9.705078583990586e-05, + "loss": 0.729, + "step": 34410 + }, + { + "epoch": 0.21989956940061076, + "grad_norm": 0.9133629202842712, + "learning_rate": 9.704908780476991e-05, + "loss": 1.0537, + "step": 34420 + }, + { + "epoch": 0.21996345655034946, + "grad_norm": 1.274163842201233, + "learning_rate": 9.704738929580896e-05, + "loss": 1.0591, + "step": 34430 + }, + { + "epoch": 0.22002734370008817, + "grad_norm": 0.45899906754493713, + "learning_rate": 9.704569031304009e-05, + "loss": 0.6701, + "step": 34440 + }, + { + "epoch": 0.22009123084982687, + "grad_norm": 0.942436933517456, + "learning_rate": 9.704399085648041e-05, + "loss": 0.9153, + "step": 34450 + }, + { + "epoch": 0.22015511799956558, + "grad_norm": 1.0042204856872559, + "learning_rate": 9.704229092614705e-05, + "loss": 0.8758, + "step": 34460 + }, + { + "epoch": 0.22021900514930426, + 
"grad_norm": 0.4676646292209625, + "learning_rate": 9.704059052205712e-05, + "loss": 0.7552, + "step": 34470 + }, + { + "epoch": 0.22028289229904296, + "grad_norm": 0.8477068543434143, + "learning_rate": 9.703888964422775e-05, + "loss": 0.8348, + "step": 34480 + }, + { + "epoch": 0.22034677944878167, + "grad_norm": 1.006347417831421, + "learning_rate": 9.703718829267607e-05, + "loss": 0.9339, + "step": 34490 + }, + { + "epoch": 0.22041066659852038, + "grad_norm": 0.8507176637649536, + "learning_rate": 9.703548646741923e-05, + "loss": 0.9948, + "step": 34500 + }, + { + "epoch": 0.22047455374825908, + "grad_norm": 0.9493306279182434, + "learning_rate": 9.703378416847431e-05, + "loss": 0.7232, + "step": 34510 + }, + { + "epoch": 0.2205384408979978, + "grad_norm": 0.7349863052368164, + "learning_rate": 9.703208139585851e-05, + "loss": 0.7541, + "step": 34520 + }, + { + "epoch": 0.22060232804773647, + "grad_norm": 0.8959886431694031, + "learning_rate": 9.703037814958898e-05, + "loss": 0.9639, + "step": 34530 + }, + { + "epoch": 0.22066621519747517, + "grad_norm": 0.6771888136863708, + "learning_rate": 9.702867442968283e-05, + "loss": 0.9092, + "step": 34540 + }, + { + "epoch": 0.22073010234721388, + "grad_norm": 0.784125804901123, + "learning_rate": 9.702697023615726e-05, + "loss": 0.8621, + "step": 34550 + }, + { + "epoch": 0.2207939894969526, + "grad_norm": 1.009945273399353, + "learning_rate": 9.70252655690294e-05, + "loss": 0.7527, + "step": 34560 + }, + { + "epoch": 0.2208578766466913, + "grad_norm": 1.0403534173965454, + "learning_rate": 9.702356042831643e-05, + "loss": 1.0779, + "step": 34570 + }, + { + "epoch": 0.22092176379643, + "grad_norm": 0.9144579172134399, + "learning_rate": 9.702185481403555e-05, + "loss": 0.9942, + "step": 34580 + }, + { + "epoch": 0.22098565094616868, + "grad_norm": 1.012250542640686, + "learning_rate": 9.702014872620388e-05, + "loss": 0.8412, + "step": 34590 + }, + { + "epoch": 0.22104953809590738, + "grad_norm": 1.3977776765823364, + "learning_rate": 9.701844216483866e-05, + "loss": 0.9844, + "step": 34600 + }, + { + "epoch": 0.2211134252456461, + "grad_norm": 0.8186967372894287, + "learning_rate": 9.701673512995704e-05, + "loss": 0.8303, + "step": 34610 + }, + { + "epoch": 0.2211773123953848, + "grad_norm": 0.7828638553619385, + "learning_rate": 9.701502762157623e-05, + "loss": 0.9695, + "step": 34620 + }, + { + "epoch": 0.2212411995451235, + "grad_norm": 0.9973053336143494, + "learning_rate": 9.701331963971341e-05, + "loss": 0.8977, + "step": 34630 + }, + { + "epoch": 0.2213050866948622, + "grad_norm": 1.1445131301879883, + "learning_rate": 9.70116111843858e-05, + "loss": 0.8871, + "step": 34640 + }, + { + "epoch": 0.22136897384460089, + "grad_norm": 0.8758741617202759, + "learning_rate": 9.700990225561058e-05, + "loss": 0.81, + "step": 34650 + }, + { + "epoch": 0.2214328609943396, + "grad_norm": 0.49622881412506104, + "learning_rate": 9.700819285340497e-05, + "loss": 0.8899, + "step": 34660 + }, + { + "epoch": 0.2214967481440783, + "grad_norm": 0.9389495253562927, + "learning_rate": 9.700648297778621e-05, + "loss": 0.867, + "step": 34670 + }, + { + "epoch": 0.221560635293817, + "grad_norm": 2.2437360286712646, + "learning_rate": 9.700477262877149e-05, + "loss": 1.0428, + "step": 34680 + }, + { + "epoch": 0.2216245224435557, + "grad_norm": 1.3925631046295166, + "learning_rate": 9.700306180637804e-05, + "loss": 0.91, + "step": 34690 + }, + { + "epoch": 0.22168840959329442, + "grad_norm": 1.310964822769165, + "learning_rate": 9.700135051062312e-05, + "loss": 
0.8114, + "step": 34700 + }, + { + "epoch": 0.2217522967430331, + "grad_norm": 1.04167902469635, + "learning_rate": 9.699963874152392e-05, + "loss": 0.7845, + "step": 34710 + }, + { + "epoch": 0.2218161838927718, + "grad_norm": 0.9633674621582031, + "learning_rate": 9.699792649909768e-05, + "loss": 0.6929, + "step": 34720 + }, + { + "epoch": 0.2218800710425105, + "grad_norm": 0.6973922252655029, + "learning_rate": 9.699621378336168e-05, + "loss": 0.7923, + "step": 34730 + }, + { + "epoch": 0.2219439581922492, + "grad_norm": 0.6631523370742798, + "learning_rate": 9.699450059433314e-05, + "loss": 0.8096, + "step": 34740 + }, + { + "epoch": 0.22200784534198792, + "grad_norm": 1.064477801322937, + "learning_rate": 9.699278693202933e-05, + "loss": 0.9907, + "step": 34750 + }, + { + "epoch": 0.22207173249172663, + "grad_norm": 1.0626312494277954, + "learning_rate": 9.699107279646751e-05, + "loss": 0.7736, + "step": 34760 + }, + { + "epoch": 0.2221356196414653, + "grad_norm": 0.5820396542549133, + "learning_rate": 9.698935818766493e-05, + "loss": 0.7869, + "step": 34770 + }, + { + "epoch": 0.222199506791204, + "grad_norm": 0.7940320372581482, + "learning_rate": 9.698764310563885e-05, + "loss": 0.8672, + "step": 34780 + }, + { + "epoch": 0.22226339394094272, + "grad_norm": 0.9088238477706909, + "learning_rate": 9.698592755040657e-05, + "loss": 0.8374, + "step": 34790 + }, + { + "epoch": 0.22232728109068142, + "grad_norm": 1.1797140836715698, + "learning_rate": 9.698421152198533e-05, + "loss": 1.074, + "step": 34800 + }, + { + "epoch": 0.22239116824042013, + "grad_norm": 0.7393913269042969, + "learning_rate": 9.698249502039243e-05, + "loss": 0.7102, + "step": 34810 + }, + { + "epoch": 0.22245505539015883, + "grad_norm": 1.2401602268218994, + "learning_rate": 9.698077804564519e-05, + "loss": 1.0855, + "step": 34820 + }, + { + "epoch": 0.2225189425398975, + "grad_norm": 0.7187434434890747, + "learning_rate": 9.697906059776085e-05, + "loss": 1.02, + "step": 34830 + }, + { + "epoch": 0.22258282968963622, + "grad_norm": 0.719468355178833, + "learning_rate": 9.697734267675674e-05, + "loss": 0.932, + "step": 34840 + }, + { + "epoch": 0.22264671683937493, + "grad_norm": 0.8819088935852051, + "learning_rate": 9.697562428265012e-05, + "loss": 0.8238, + "step": 34850 + }, + { + "epoch": 0.22271060398911363, + "grad_norm": 0.49491390585899353, + "learning_rate": 9.697390541545834e-05, + "loss": 0.9514, + "step": 34860 + }, + { + "epoch": 0.22277449113885234, + "grad_norm": 1.5479438304901123, + "learning_rate": 9.697218607519871e-05, + "loss": 0.9275, + "step": 34870 + }, + { + "epoch": 0.22283837828859104, + "grad_norm": 0.763923704624176, + "learning_rate": 9.697046626188852e-05, + "loss": 0.8258, + "step": 34880 + }, + { + "epoch": 0.22290226543832972, + "grad_norm": 1.1767523288726807, + "learning_rate": 9.696874597554509e-05, + "loss": 0.7937, + "step": 34890 + }, + { + "epoch": 0.22296615258806843, + "grad_norm": 1.3667820692062378, + "learning_rate": 9.696702521618576e-05, + "loss": 0.8892, + "step": 34900 + }, + { + "epoch": 0.22303003973780713, + "grad_norm": 0.7159459590911865, + "learning_rate": 9.696530398382786e-05, + "loss": 0.9855, + "step": 34910 + }, + { + "epoch": 0.22309392688754584, + "grad_norm": 0.6876511573791504, + "learning_rate": 9.69635822784887e-05, + "loss": 1.0461, + "step": 34920 + }, + { + "epoch": 0.22315781403728455, + "grad_norm": 0.6138442158699036, + "learning_rate": 9.696186010018566e-05, + "loss": 0.8192, + "step": 34930 + }, + { + "epoch": 0.22322170118702325, + 
"grad_norm": 0.6763925552368164, + "learning_rate": 9.696013744893604e-05, + "loss": 0.8746, + "step": 34940 + }, + { + "epoch": 0.22328558833676193, + "grad_norm": 0.807370126247406, + "learning_rate": 9.695841432475723e-05, + "loss": 0.7289, + "step": 34950 + }, + { + "epoch": 0.22334947548650064, + "grad_norm": 0.7103719711303711, + "learning_rate": 9.695669072766655e-05, + "loss": 0.8883, + "step": 34960 + }, + { + "epoch": 0.22341336263623934, + "grad_norm": 0.6593259572982788, + "learning_rate": 9.695496665768138e-05, + "loss": 0.845, + "step": 34970 + }, + { + "epoch": 0.22347724978597805, + "grad_norm": 0.7919392585754395, + "learning_rate": 9.695324211481907e-05, + "loss": 0.7294, + "step": 34980 + }, + { + "epoch": 0.22354113693571676, + "grad_norm": 1.0960744619369507, + "learning_rate": 9.695151709909698e-05, + "loss": 0.8352, + "step": 34990 + }, + { + "epoch": 0.22360502408545546, + "grad_norm": 0.9134578704833984, + "learning_rate": 9.69497916105325e-05, + "loss": 0.8196, + "step": 35000 + }, + { + "epoch": 0.22366891123519414, + "grad_norm": 0.7842540144920349, + "learning_rate": 9.6948065649143e-05, + "loss": 1.0348, + "step": 35010 + }, + { + "epoch": 0.22373279838493285, + "grad_norm": 0.6312137842178345, + "learning_rate": 9.694633921494588e-05, + "loss": 0.62, + "step": 35020 + }, + { + "epoch": 0.22379668553467155, + "grad_norm": 0.6972392797470093, + "learning_rate": 9.69446123079585e-05, + "loss": 0.8615, + "step": 35030 + }, + { + "epoch": 0.22386057268441026, + "grad_norm": 0.7970590591430664, + "learning_rate": 9.694288492819825e-05, + "loss": 0.925, + "step": 35040 + }, + { + "epoch": 0.22392445983414896, + "grad_norm": 1.2083357572555542, + "learning_rate": 9.694115707568254e-05, + "loss": 0.7092, + "step": 35050 + }, + { + "epoch": 0.22398834698388767, + "grad_norm": 0.585113525390625, + "learning_rate": 9.693942875042878e-05, + "loss": 0.9887, + "step": 35060 + }, + { + "epoch": 0.22405223413362635, + "grad_norm": 1.90079665184021, + "learning_rate": 9.693769995245437e-05, + "loss": 0.9447, + "step": 35070 + }, + { + "epoch": 0.22411612128336506, + "grad_norm": 0.8119843602180481, + "learning_rate": 9.69359706817767e-05, + "loss": 0.7767, + "step": 35080 + }, + { + "epoch": 0.22418000843310376, + "grad_norm": 0.5535334348678589, + "learning_rate": 9.69342409384132e-05, + "loss": 1.0211, + "step": 35090 + }, + { + "epoch": 0.22424389558284247, + "grad_norm": 0.5760706663131714, + "learning_rate": 9.69325107223813e-05, + "loss": 0.7181, + "step": 35100 + }, + { + "epoch": 0.22430778273258117, + "grad_norm": 0.7716420292854309, + "learning_rate": 9.69307800336984e-05, + "loss": 0.9217, + "step": 35110 + }, + { + "epoch": 0.22437166988231988, + "grad_norm": 1.1675033569335938, + "learning_rate": 9.692904887238195e-05, + "loss": 1.1387, + "step": 35120 + }, + { + "epoch": 0.22443555703205856, + "grad_norm": 0.8765130043029785, + "learning_rate": 9.692731723844939e-05, + "loss": 0.7809, + "step": 35130 + }, + { + "epoch": 0.22449944418179726, + "grad_norm": 2.621401786804199, + "learning_rate": 9.692558513191812e-05, + "loss": 0.9963, + "step": 35140 + }, + { + "epoch": 0.22456333133153597, + "grad_norm": 1.051527976989746, + "learning_rate": 9.692385255280564e-05, + "loss": 1.0771, + "step": 35150 + }, + { + "epoch": 0.22462721848127468, + "grad_norm": 0.8392159342765808, + "learning_rate": 9.692211950112936e-05, + "loss": 0.9217, + "step": 35160 + }, + { + "epoch": 0.22469110563101338, + "grad_norm": 0.7495473027229309, + "learning_rate": 9.692038597690674e-05, + 
"loss": 1.0467, + "step": 35170 + }, + { + "epoch": 0.2247549927807521, + "grad_norm": 0.6959127187728882, + "learning_rate": 9.691865198015524e-05, + "loss": 1.0204, + "step": 35180 + }, + { + "epoch": 0.2248188799304908, + "grad_norm": 1.0118756294250488, + "learning_rate": 9.691691751089234e-05, + "loss": 0.9488, + "step": 35190 + }, + { + "epoch": 0.22488276708022947, + "grad_norm": 1.1415350437164307, + "learning_rate": 9.691518256913547e-05, + "loss": 1.2746, + "step": 35200 + }, + { + "epoch": 0.22494665422996818, + "grad_norm": 0.9474114179611206, + "learning_rate": 9.691344715490213e-05, + "loss": 0.9522, + "step": 35210 + }, + { + "epoch": 0.22501054137970689, + "grad_norm": 1.113400936126709, + "learning_rate": 9.69117112682098e-05, + "loss": 0.6583, + "step": 35220 + }, + { + "epoch": 0.2250744285294456, + "grad_norm": 1.0649466514587402, + "learning_rate": 9.690997490907594e-05, + "loss": 0.9882, + "step": 35230 + }, + { + "epoch": 0.2251383156791843, + "grad_norm": 0.7435470819473267, + "learning_rate": 9.690823807751807e-05, + "loss": 0.7211, + "step": 35240 + }, + { + "epoch": 0.225202202828923, + "grad_norm": 1.1411978006362915, + "learning_rate": 9.690650077355364e-05, + "loss": 0.8664, + "step": 35250 + }, + { + "epoch": 0.22526608997866168, + "grad_norm": 0.888380765914917, + "learning_rate": 9.690476299720018e-05, + "loss": 0.9631, + "step": 35260 + }, + { + "epoch": 0.2253299771284004, + "grad_norm": 0.8436518907546997, + "learning_rate": 9.690302474847516e-05, + "loss": 1.0347, + "step": 35270 + }, + { + "epoch": 0.2253938642781391, + "grad_norm": 0.8739194869995117, + "learning_rate": 9.690128602739613e-05, + "loss": 1.0825, + "step": 35280 + }, + { + "epoch": 0.2254577514278778, + "grad_norm": 0.9203752875328064, + "learning_rate": 9.689954683398057e-05, + "loss": 0.8836, + "step": 35290 + }, + { + "epoch": 0.2255216385776165, + "grad_norm": 0.7080454230308533, + "learning_rate": 9.6897807168246e-05, + "loss": 0.7775, + "step": 35300 + }, + { + "epoch": 0.2255855257273552, + "grad_norm": 0.8330931067466736, + "learning_rate": 9.689606703020993e-05, + "loss": 0.948, + "step": 35310 + }, + { + "epoch": 0.2256494128770939, + "grad_norm": 0.9416504502296448, + "learning_rate": 9.689432641988988e-05, + "loss": 0.8721, + "step": 35320 + }, + { + "epoch": 0.2257133000268326, + "grad_norm": 0.7803798317909241, + "learning_rate": 9.689258533730341e-05, + "loss": 0.8416, + "step": 35330 + }, + { + "epoch": 0.2257771871765713, + "grad_norm": 0.6242881417274475, + "learning_rate": 9.689084378246804e-05, + "loss": 0.7793, + "step": 35340 + }, + { + "epoch": 0.22584107432631, + "grad_norm": 0.8477808833122253, + "learning_rate": 9.68891017554013e-05, + "loss": 0.84, + "step": 35350 + }, + { + "epoch": 0.22590496147604872, + "grad_norm": 0.8038986325263977, + "learning_rate": 9.688735925612075e-05, + "loss": 0.7162, + "step": 35360 + }, + { + "epoch": 0.22596884862578742, + "grad_norm": 0.6531451940536499, + "learning_rate": 9.688561628464391e-05, + "loss": 0.8058, + "step": 35370 + }, + { + "epoch": 0.2260327357755261, + "grad_norm": 0.8681033849716187, + "learning_rate": 9.688387284098837e-05, + "loss": 1.0791, + "step": 35380 + }, + { + "epoch": 0.2260966229252648, + "grad_norm": 1.3325775861740112, + "learning_rate": 9.688212892517167e-05, + "loss": 0.6875, + "step": 35390 + }, + { + "epoch": 0.2261605100750035, + "grad_norm": 0.5260213017463684, + "learning_rate": 9.688038453721137e-05, + "loss": 0.8236, + "step": 35400 + }, + { + "epoch": 0.22622439722474222, + "grad_norm": 
1.0699787139892578, + "learning_rate": 9.687863967712503e-05, + "loss": 0.8972, + "step": 35410 + }, + { + "epoch": 0.22628828437448092, + "grad_norm": 0.6690873503684998, + "learning_rate": 9.687689434493025e-05, + "loss": 0.8042, + "step": 35420 + }, + { + "epoch": 0.22635217152421963, + "grad_norm": 0.6598352789878845, + "learning_rate": 9.687514854064458e-05, + "loss": 1.0096, + "step": 35430 + }, + { + "epoch": 0.2264160586739583, + "grad_norm": 0.839152455329895, + "learning_rate": 9.68735769131643e-05, + "loss": 1.0554, + "step": 35440 + }, + { + "epoch": 0.22647994582369702, + "grad_norm": 1.03608238697052, + "learning_rate": 9.68718302119544e-05, + "loss": 1.0627, + "step": 35450 + }, + { + "epoch": 0.22654383297343572, + "grad_norm": 0.8941081762313843, + "learning_rate": 9.687008303870461e-05, + "loss": 0.663, + "step": 35460 + }, + { + "epoch": 0.22660772012317443, + "grad_norm": 0.5950977802276611, + "learning_rate": 9.686833539343256e-05, + "loss": 0.9636, + "step": 35470 + }, + { + "epoch": 0.22667160727291313, + "grad_norm": 0.5966373085975647, + "learning_rate": 9.686658727615581e-05, + "loss": 0.8674, + "step": 35480 + }, + { + "epoch": 0.22673549442265184, + "grad_norm": 0.8043856620788574, + "learning_rate": 9.686483868689198e-05, + "loss": 0.9328, + "step": 35490 + }, + { + "epoch": 0.22679938157239052, + "grad_norm": 1.025963306427002, + "learning_rate": 9.686308962565869e-05, + "loss": 0.7796, + "step": 35500 + }, + { + "epoch": 0.22686326872212922, + "grad_norm": 0.4956408143043518, + "learning_rate": 9.686134009247354e-05, + "loss": 0.7355, + "step": 35510 + }, + { + "epoch": 0.22692715587186793, + "grad_norm": 0.9197072386741638, + "learning_rate": 9.685959008735414e-05, + "loss": 0.7268, + "step": 35520 + }, + { + "epoch": 0.22699104302160664, + "grad_norm": 0.9792423248291016, + "learning_rate": 9.685783961031814e-05, + "loss": 0.8215, + "step": 35530 + }, + { + "epoch": 0.22705493017134534, + "grad_norm": 1.209794282913208, + "learning_rate": 9.685608866138316e-05, + "loss": 0.7836, + "step": 35540 + }, + { + "epoch": 0.22711881732108405, + "grad_norm": 0.8678392767906189, + "learning_rate": 9.685433724056683e-05, + "loss": 0.862, + "step": 35550 + }, + { + "epoch": 0.22718270447082273, + "grad_norm": 1.0202693939208984, + "learning_rate": 9.685258534788679e-05, + "loss": 0.8804, + "step": 35560 + }, + { + "epoch": 0.22724659162056143, + "grad_norm": 0.8867144584655762, + "learning_rate": 9.685083298336068e-05, + "loss": 0.8365, + "step": 35570 + }, + { + "epoch": 0.22731047877030014, + "grad_norm": 0.7046698927879333, + "learning_rate": 9.684908014700616e-05, + "loss": 1.1958, + "step": 35580 + }, + { + "epoch": 0.22737436592003885, + "grad_norm": 0.7776816487312317, + "learning_rate": 9.684732683884085e-05, + "loss": 0.8462, + "step": 35590 + }, + { + "epoch": 0.22743825306977755, + "grad_norm": 0.9116525650024414, + "learning_rate": 9.684557305888245e-05, + "loss": 0.744, + "step": 35600 + }, + { + "epoch": 0.22750214021951626, + "grad_norm": 1.0605876445770264, + "learning_rate": 9.684381880714858e-05, + "loss": 0.9077, + "step": 35610 + }, + { + "epoch": 0.22756602736925494, + "grad_norm": 1.1371787786483765, + "learning_rate": 9.684206408365695e-05, + "loss": 1.1714, + "step": 35620 + }, + { + "epoch": 0.22762991451899364, + "grad_norm": 1.0647424459457397, + "learning_rate": 9.684030888842521e-05, + "loss": 0.8973, + "step": 35630 + }, + { + "epoch": 0.22769380166873235, + "grad_norm": 1.0106362104415894, + "learning_rate": 9.683855322147103e-05, + 
"loss": 0.7832, + "step": 35640 + }, + { + "epoch": 0.22775768881847105, + "grad_norm": 1.509164571762085, + "learning_rate": 9.68367970828121e-05, + "loss": 1.0245, + "step": 35650 + }, + { + "epoch": 0.22782157596820976, + "grad_norm": 0.6998576521873474, + "learning_rate": 9.68350404724661e-05, + "loss": 0.9198, + "step": 35660 + }, + { + "epoch": 0.22788546311794847, + "grad_norm": 0.7818799614906311, + "learning_rate": 9.683328339045073e-05, + "loss": 0.9013, + "step": 35670 + }, + { + "epoch": 0.22794935026768715, + "grad_norm": 0.9192219972610474, + "learning_rate": 9.683152583678367e-05, + "loss": 0.8992, + "step": 35680 + }, + { + "epoch": 0.22801323741742585, + "grad_norm": 0.7720584273338318, + "learning_rate": 9.682976781148265e-05, + "loss": 1.0002, + "step": 35690 + }, + { + "epoch": 0.22807712456716456, + "grad_norm": 1.023474097251892, + "learning_rate": 9.682800931456534e-05, + "loss": 0.8645, + "step": 35700 + }, + { + "epoch": 0.22814101171690326, + "grad_norm": 0.7522472143173218, + "learning_rate": 9.682625034604946e-05, + "loss": 0.9877, + "step": 35710 + }, + { + "epoch": 0.22820489886664197, + "grad_norm": 0.7929263710975647, + "learning_rate": 9.682449090595274e-05, + "loss": 0.9654, + "step": 35720 + }, + { + "epoch": 0.22826878601638068, + "grad_norm": 0.8946601152420044, + "learning_rate": 9.682273099429288e-05, + "loss": 1.1321, + "step": 35730 + }, + { + "epoch": 0.22833267316611935, + "grad_norm": 0.884692907333374, + "learning_rate": 9.682097061108761e-05, + "loss": 0.7554, + "step": 35740 + }, + { + "epoch": 0.22839656031585806, + "grad_norm": 0.6156822443008423, + "learning_rate": 9.681920975635467e-05, + "loss": 0.8625, + "step": 35750 + }, + { + "epoch": 0.22846044746559677, + "grad_norm": 0.6044219732284546, + "learning_rate": 9.681744843011177e-05, + "loss": 0.758, + "step": 35760 + }, + { + "epoch": 0.22852433461533547, + "grad_norm": 0.835270345211029, + "learning_rate": 9.681568663237668e-05, + "loss": 0.8325, + "step": 35770 + }, + { + "epoch": 0.22858822176507418, + "grad_norm": 0.9461874961853027, + "learning_rate": 9.68139243631671e-05, + "loss": 0.8916, + "step": 35780 + }, + { + "epoch": 0.22865210891481288, + "grad_norm": 1.3007314205169678, + "learning_rate": 9.681216162250082e-05, + "loss": 1.1537, + "step": 35790 + }, + { + "epoch": 0.22871599606455156, + "grad_norm": 1.0743658542633057, + "learning_rate": 9.681039841039557e-05, + "loss": 0.7409, + "step": 35800 + }, + { + "epoch": 0.22877988321429027, + "grad_norm": 2.3741660118103027, + "learning_rate": 9.680863472686911e-05, + "loss": 0.8093, + "step": 35810 + }, + { + "epoch": 0.22884377036402898, + "grad_norm": 0.9726037383079529, + "learning_rate": 9.68068705719392e-05, + "loss": 1.1677, + "step": 35820 + }, + { + "epoch": 0.22890765751376768, + "grad_norm": 0.7922230958938599, + "learning_rate": 9.680510594562362e-05, + "loss": 0.9944, + "step": 35830 + }, + { + "epoch": 0.2289715446635064, + "grad_norm": 0.8513554930686951, + "learning_rate": 9.680334084794011e-05, + "loss": 0.8125, + "step": 35840 + }, + { + "epoch": 0.2290354318132451, + "grad_norm": 1.046993374824524, + "learning_rate": 9.680157527890649e-05, + "loss": 0.9013, + "step": 35850 + }, + { + "epoch": 0.22909931896298377, + "grad_norm": 0.6349254250526428, + "learning_rate": 9.679980923854051e-05, + "loss": 0.903, + "step": 35860 + }, + { + "epoch": 0.22916320611272248, + "grad_norm": 0.4237905740737915, + "learning_rate": 9.679804272685995e-05, + "loss": 0.7127, + "step": 35870 + }, + { + "epoch": 
0.22922709326246118, + "grad_norm": 0.7686927914619446, + "learning_rate": 9.679627574388264e-05, + "loss": 0.8212, + "step": 35880 + }, + { + "epoch": 0.2292909804121999, + "grad_norm": 1.274295687675476, + "learning_rate": 9.679450828962633e-05, + "loss": 0.7458, + "step": 35890 + }, + { + "epoch": 0.2293548675619386, + "grad_norm": 0.8231094479560852, + "learning_rate": 9.679274036410884e-05, + "loss": 0.851, + "step": 35900 + }, + { + "epoch": 0.2294187547116773, + "grad_norm": 0.5917838215827942, + "learning_rate": 9.679097196734797e-05, + "loss": 1.0595, + "step": 35910 + }, + { + "epoch": 0.22948264186141598, + "grad_norm": 0.9595643877983093, + "learning_rate": 9.678920309936155e-05, + "loss": 0.8143, + "step": 35920 + }, + { + "epoch": 0.2295465290111547, + "grad_norm": 0.9315831661224365, + "learning_rate": 9.678743376016736e-05, + "loss": 0.8278, + "step": 35930 + }, + { + "epoch": 0.2296104161608934, + "grad_norm": 0.8110885620117188, + "learning_rate": 9.678566394978323e-05, + "loss": 0.9624, + "step": 35940 + }, + { + "epoch": 0.2296743033106321, + "grad_norm": 0.8156410455703735, + "learning_rate": 9.6783893668227e-05, + "loss": 0.7916, + "step": 35950 + }, + { + "epoch": 0.2297381904603708, + "grad_norm": 0.7576091885566711, + "learning_rate": 9.678212291551649e-05, + "loss": 1.2787, + "step": 35960 + }, + { + "epoch": 0.2298020776101095, + "grad_norm": 0.6447461843490601, + "learning_rate": 9.678035169166953e-05, + "loss": 0.8515, + "step": 35970 + }, + { + "epoch": 0.2298659647598482, + "grad_norm": 0.6504492163658142, + "learning_rate": 9.677857999670394e-05, + "loss": 1.0268, + "step": 35980 + }, + { + "epoch": 0.2299298519095869, + "grad_norm": 1.1412609815597534, + "learning_rate": 9.677680783063761e-05, + "loss": 1.1179, + "step": 35990 + }, + { + "epoch": 0.2299937390593256, + "grad_norm": 0.7995015382766724, + "learning_rate": 9.677503519348834e-05, + "loss": 0.9593, + "step": 36000 + }, + { + "epoch": 0.2300576262090643, + "grad_norm": 1.159679889678955, + "learning_rate": 9.677326208527399e-05, + "loss": 0.9146, + "step": 36010 + }, + { + "epoch": 0.23012151335880301, + "grad_norm": 0.721098780632019, + "learning_rate": 9.677148850601243e-05, + "loss": 1.0502, + "step": 36020 + }, + { + "epoch": 0.23018540050854172, + "grad_norm": 0.4577333927154541, + "learning_rate": 9.676971445572152e-05, + "loss": 0.9092, + "step": 36030 + }, + { + "epoch": 0.23024928765828043, + "grad_norm": 0.8602834343910217, + "learning_rate": 9.676793993441913e-05, + "loss": 0.7162, + "step": 36040 + }, + { + "epoch": 0.2303131748080191, + "grad_norm": 0.8518884181976318, + "learning_rate": 9.676616494212314e-05, + "loss": 0.9275, + "step": 36050 + }, + { + "epoch": 0.2303770619577578, + "grad_norm": 1.1824616193771362, + "learning_rate": 9.676438947885138e-05, + "loss": 0.6779, + "step": 36060 + }, + { + "epoch": 0.23044094910749652, + "grad_norm": 1.0840277671813965, + "learning_rate": 9.676261354462177e-05, + "loss": 0.8189, + "step": 36070 + }, + { + "epoch": 0.23050483625723522, + "grad_norm": 0.7747464776039124, + "learning_rate": 9.67608371394522e-05, + "loss": 1.0298, + "step": 36080 + }, + { + "epoch": 0.23056872340697393, + "grad_norm": 0.7132411003112793, + "learning_rate": 9.675906026336053e-05, + "loss": 0.823, + "step": 36090 + }, + { + "epoch": 0.23063261055671264, + "grad_norm": 1.1659483909606934, + "learning_rate": 9.675728291636467e-05, + "loss": 0.8323, + "step": 36100 + }, + { + "epoch": 0.23069649770645131, + "grad_norm": 0.7727037072181702, + "learning_rate": 
9.675550509848253e-05, + "loss": 0.8996, + "step": 36110 + }, + { + "epoch": 0.23076038485619002, + "grad_norm": 0.511026918888092, + "learning_rate": 9.6753726809732e-05, + "loss": 0.9119, + "step": 36120 + }, + { + "epoch": 0.23082427200592873, + "grad_norm": 1.2003488540649414, + "learning_rate": 9.6751948050131e-05, + "loss": 0.9831, + "step": 36130 + }, + { + "epoch": 0.23088815915566743, + "grad_norm": 0.9001702070236206, + "learning_rate": 9.675016881969743e-05, + "loss": 1.0382, + "step": 36140 + }, + { + "epoch": 0.23095204630540614, + "grad_norm": 0.8864395618438721, + "learning_rate": 9.674838911844923e-05, + "loss": 0.8401, + "step": 36150 + }, + { + "epoch": 0.23101593345514485, + "grad_norm": 0.8258879780769348, + "learning_rate": 9.674660894640429e-05, + "loss": 0.9833, + "step": 36160 + }, + { + "epoch": 0.23107982060488352, + "grad_norm": 0.8250300884246826, + "learning_rate": 9.674482830358056e-05, + "loss": 0.8936, + "step": 36170 + }, + { + "epoch": 0.23114370775462223, + "grad_norm": 0.9559470415115356, + "learning_rate": 9.674304718999598e-05, + "loss": 1.2631, + "step": 36180 + }, + { + "epoch": 0.23120759490436094, + "grad_norm": 2.168290853500366, + "learning_rate": 9.674126560566846e-05, + "loss": 0.9825, + "step": 36190 + }, + { + "epoch": 0.23127148205409964, + "grad_norm": 0.775067150592804, + "learning_rate": 9.673948355061597e-05, + "loss": 0.8517, + "step": 36200 + }, + { + "epoch": 0.23133536920383835, + "grad_norm": 1.186092495918274, + "learning_rate": 9.673770102485644e-05, + "loss": 0.8136, + "step": 36210 + }, + { + "epoch": 0.23139925635357705, + "grad_norm": 1.8314769268035889, + "learning_rate": 9.673591802840782e-05, + "loss": 1.0137, + "step": 36220 + }, + { + "epoch": 0.23146314350331573, + "grad_norm": 0.9208132028579712, + "learning_rate": 9.673413456128808e-05, + "loss": 0.8576, + "step": 36230 + }, + { + "epoch": 0.23152703065305444, + "grad_norm": 0.8547564148902893, + "learning_rate": 9.673235062351517e-05, + "loss": 1.1041, + "step": 36240 + }, + { + "epoch": 0.23159091780279314, + "grad_norm": 0.6247135400772095, + "learning_rate": 9.673056621510707e-05, + "loss": 0.8918, + "step": 36250 + }, + { + "epoch": 0.23165480495253185, + "grad_norm": 1.1294952630996704, + "learning_rate": 9.672878133608174e-05, + "loss": 1.1331, + "step": 36260 + }, + { + "epoch": 0.23171869210227056, + "grad_norm": 1.048307180404663, + "learning_rate": 9.672699598645716e-05, + "loss": 1.0438, + "step": 36270 + }, + { + "epoch": 0.23178257925200926, + "grad_norm": 0.8274295926094055, + "learning_rate": 9.672521016625128e-05, + "loss": 1.0533, + "step": 36280 + }, + { + "epoch": 0.23184646640174794, + "grad_norm": 0.6973618268966675, + "learning_rate": 9.672342387548215e-05, + "loss": 1.0061, + "step": 36290 + }, + { + "epoch": 0.23191035355148665, + "grad_norm": 0.6850184798240662, + "learning_rate": 9.672163711416768e-05, + "loss": 0.9715, + "step": 36300 + }, + { + "epoch": 0.23197424070122535, + "grad_norm": 0.9231820702552795, + "learning_rate": 9.671984988232593e-05, + "loss": 0.8866, + "step": 36310 + }, + { + "epoch": 0.23203812785096406, + "grad_norm": 1.0431686639785767, + "learning_rate": 9.671806217997485e-05, + "loss": 0.8008, + "step": 36320 + }, + { + "epoch": 0.23210201500070277, + "grad_norm": 0.5410827398300171, + "learning_rate": 9.67162740071325e-05, + "loss": 0.9067, + "step": 36330 + }, + { + "epoch": 0.23216590215044147, + "grad_norm": 0.6281831860542297, + "learning_rate": 9.671448536381683e-05, + "loss": 0.9372, + "step": 36340 + }, + { + 
"epoch": 0.23222978930018015, + "grad_norm": 0.8428774476051331, + "learning_rate": 9.671269625004589e-05, + "loss": 0.9881, + "step": 36350 + }, + { + "epoch": 0.23229367644991886, + "grad_norm": 0.5876288414001465, + "learning_rate": 9.671090666583769e-05, + "loss": 0.7809, + "step": 36360 + }, + { + "epoch": 0.23235756359965756, + "grad_norm": 0.904808521270752, + "learning_rate": 9.670911661121023e-05, + "loss": 0.9961, + "step": 36370 + }, + { + "epoch": 0.23242145074939627, + "grad_norm": 0.6523864269256592, + "learning_rate": 9.670732608618157e-05, + "loss": 0.9395, + "step": 36380 + }, + { + "epoch": 0.23248533789913498, + "grad_norm": 0.8728864192962646, + "learning_rate": 9.670553509076972e-05, + "loss": 0.7788, + "step": 36390 + }, + { + "epoch": 0.23254922504887368, + "grad_norm": 0.7656633257865906, + "learning_rate": 9.670374362499274e-05, + "loss": 0.9066, + "step": 36400 + }, + { + "epoch": 0.23261311219861236, + "grad_norm": 0.7706246972084045, + "learning_rate": 9.670195168886866e-05, + "loss": 1.1319, + "step": 36410 + }, + { + "epoch": 0.23267699934835107, + "grad_norm": 0.6671524047851562, + "learning_rate": 9.670015928241551e-05, + "loss": 1.1416, + "step": 36420 + }, + { + "epoch": 0.23274088649808977, + "grad_norm": 0.8740767240524292, + "learning_rate": 9.669836640565136e-05, + "loss": 0.9599, + "step": 36430 + }, + { + "epoch": 0.23280477364782848, + "grad_norm": 0.8602978587150574, + "learning_rate": 9.669657305859425e-05, + "loss": 0.7909, + "step": 36440 + }, + { + "epoch": 0.23286866079756718, + "grad_norm": 0.7806286215782166, + "learning_rate": 9.669477924126226e-05, + "loss": 0.7961, + "step": 36450 + }, + { + "epoch": 0.2329325479473059, + "grad_norm": 0.5049117803573608, + "learning_rate": 9.669298495367345e-05, + "loss": 0.742, + "step": 36460 + }, + { + "epoch": 0.23299643509704457, + "grad_norm": 0.719462513923645, + "learning_rate": 9.669119019584589e-05, + "loss": 0.8667, + "step": 36470 + }, + { + "epoch": 0.23306032224678327, + "grad_norm": 0.8203737735748291, + "learning_rate": 9.668939496779763e-05, + "loss": 1.0432, + "step": 36480 + }, + { + "epoch": 0.23312420939652198, + "grad_norm": 0.7739396691322327, + "learning_rate": 9.668759926954679e-05, + "loss": 0.7717, + "step": 36490 + }, + { + "epoch": 0.2331880965462607, + "grad_norm": 0.5877523422241211, + "learning_rate": 9.668580310111142e-05, + "loss": 0.8282, + "step": 36500 + }, + { + "epoch": 0.2332519836959994, + "grad_norm": 0.7117794156074524, + "learning_rate": 9.668400646250963e-05, + "loss": 0.8262, + "step": 36510 + }, + { + "epoch": 0.2333158708457381, + "grad_norm": 0.6126281023025513, + "learning_rate": 9.668220935375953e-05, + "loss": 0.7877, + "step": 36520 + }, + { + "epoch": 0.23337975799547678, + "grad_norm": 1.1325799226760864, + "learning_rate": 9.668041177487917e-05, + "loss": 1.2053, + "step": 36530 + }, + { + "epoch": 0.23344364514521548, + "grad_norm": 0.8727070689201355, + "learning_rate": 9.667861372588669e-05, + "loss": 0.9724, + "step": 36540 + }, + { + "epoch": 0.2335075322949542, + "grad_norm": 0.87961345911026, + "learning_rate": 9.667681520680017e-05, + "loss": 0.8785, + "step": 36550 + }, + { + "epoch": 0.2335714194446929, + "grad_norm": 0.9073530435562134, + "learning_rate": 9.667501621763777e-05, + "loss": 0.7719, + "step": 36560 + }, + { + "epoch": 0.2336353065944316, + "grad_norm": 0.7770230770111084, + "learning_rate": 9.667321675841754e-05, + "loss": 0.9077, + "step": 36570 + }, + { + "epoch": 0.2336991937441703, + "grad_norm": 1.0296423435211182, + 
"learning_rate": 9.667141682915765e-05, + "loss": 0.867, + "step": 36580 + }, + { + "epoch": 0.233763080893909, + "grad_norm": 0.7076445817947388, + "learning_rate": 9.666961642987624e-05, + "loss": 0.7565, + "step": 36590 + }, + { + "epoch": 0.2338269680436477, + "grad_norm": 1.4758923053741455, + "learning_rate": 9.66678155605914e-05, + "loss": 1.0654, + "step": 36600 + }, + { + "epoch": 0.2338908551933864, + "grad_norm": 0.8394945859909058, + "learning_rate": 9.666601422132129e-05, + "loss": 0.6541, + "step": 36610 + }, + { + "epoch": 0.2339547423431251, + "grad_norm": 0.946808934211731, + "learning_rate": 9.666421241208404e-05, + "loss": 0.9308, + "step": 36620 + }, + { + "epoch": 0.2340186294928638, + "grad_norm": 0.8768804669380188, + "learning_rate": 9.666241013289781e-05, + "loss": 0.7125, + "step": 36630 + }, + { + "epoch": 0.23408251664260252, + "grad_norm": 0.9706554412841797, + "learning_rate": 9.666060738378072e-05, + "loss": 0.8804, + "step": 36640 + }, + { + "epoch": 0.2341464037923412, + "grad_norm": 1.6427329778671265, + "learning_rate": 9.665880416475097e-05, + "loss": 0.9644, + "step": 36650 + }, + { + "epoch": 0.2342102909420799, + "grad_norm": 0.792389988899231, + "learning_rate": 9.665700047582667e-05, + "loss": 0.8932, + "step": 36660 + }, + { + "epoch": 0.2342741780918186, + "grad_norm": 0.6772669553756714, + "learning_rate": 9.665519631702605e-05, + "loss": 0.8973, + "step": 36670 + }, + { + "epoch": 0.23433806524155731, + "grad_norm": 0.8175477385520935, + "learning_rate": 9.66533916883672e-05, + "loss": 0.9906, + "step": 36680 + }, + { + "epoch": 0.23440195239129602, + "grad_norm": 1.3049653768539429, + "learning_rate": 9.665158658986835e-05, + "loss": 0.9246, + "step": 36690 + }, + { + "epoch": 0.23446583954103473, + "grad_norm": 0.7505981922149658, + "learning_rate": 9.664978102154766e-05, + "loss": 0.9096, + "step": 36700 + }, + { + "epoch": 0.2345297266907734, + "grad_norm": 0.8786876797676086, + "learning_rate": 9.664797498342333e-05, + "loss": 0.9795, + "step": 36710 + }, + { + "epoch": 0.2345936138405121, + "grad_norm": 1.1042776107788086, + "learning_rate": 9.664616847551354e-05, + "loss": 0.871, + "step": 36720 + }, + { + "epoch": 0.23465750099025082, + "grad_norm": 0.5629504919052124, + "learning_rate": 9.664436149783647e-05, + "loss": 0.7445, + "step": 36730 + }, + { + "epoch": 0.23472138813998952, + "grad_norm": 0.7298271656036377, + "learning_rate": 9.664255405041031e-05, + "loss": 0.9827, + "step": 36740 + }, + { + "epoch": 0.23478527528972823, + "grad_norm": 0.6317089200019836, + "learning_rate": 9.66407461332533e-05, + "loss": 0.8363, + "step": 36750 + }, + { + "epoch": 0.23484916243946694, + "grad_norm": 0.8942947387695312, + "learning_rate": 9.663893774638362e-05, + "loss": 0.9289, + "step": 36760 + }, + { + "epoch": 0.2349130495892056, + "grad_norm": 1.3955134153366089, + "learning_rate": 9.663712888981949e-05, + "loss": 0.9013, + "step": 36770 + }, + { + "epoch": 0.23497693673894432, + "grad_norm": 0.84214186668396, + "learning_rate": 9.663531956357912e-05, + "loss": 0.8152, + "step": 36780 + }, + { + "epoch": 0.23504082388868303, + "grad_norm": 0.4366759955883026, + "learning_rate": 9.663350976768074e-05, + "loss": 0.7441, + "step": 36790 + }, + { + "epoch": 0.23510471103842173, + "grad_norm": 0.7600962519645691, + "learning_rate": 9.663169950214257e-05, + "loss": 0.9543, + "step": 36800 + }, + { + "epoch": 0.23516859818816044, + "grad_norm": 1.2092550992965698, + "learning_rate": 9.662988876698285e-05, + "loss": 1.0359, + "step": 36810 + }, 
+ { + "epoch": 0.23523248533789914, + "grad_norm": 0.6062434911727905, + "learning_rate": 9.662807756221981e-05, + "loss": 0.6755, + "step": 36820 + }, + { + "epoch": 0.23529637248763785, + "grad_norm": 0.9666545987129211, + "learning_rate": 9.662626588787168e-05, + "loss": 0.7634, + "step": 36830 + }, + { + "epoch": 0.23536025963737653, + "grad_norm": 0.9782662987709045, + "learning_rate": 9.662445374395672e-05, + "loss": 0.9015, + "step": 36840 + }, + { + "epoch": 0.23542414678711523, + "grad_norm": 0.6901407241821289, + "learning_rate": 9.662264113049318e-05, + "loss": 0.8262, + "step": 36850 + }, + { + "epoch": 0.23548803393685394, + "grad_norm": 0.6084008812904358, + "learning_rate": 9.66208280474993e-05, + "loss": 0.7851, + "step": 36860 + }, + { + "epoch": 0.23555192108659265, + "grad_norm": 1.5526678562164307, + "learning_rate": 9.661901449499336e-05, + "loss": 0.9491, + "step": 36870 + }, + { + "epoch": 0.23561580823633135, + "grad_norm": 0.6443691849708557, + "learning_rate": 9.66172004729936e-05, + "loss": 0.8368, + "step": 36880 + }, + { + "epoch": 0.23567969538607006, + "grad_norm": 1.0201776027679443, + "learning_rate": 9.661538598151831e-05, + "loss": 0.9269, + "step": 36890 + }, + { + "epoch": 0.23574358253580874, + "grad_norm": 1.2530359029769897, + "learning_rate": 9.661357102058577e-05, + "loss": 0.9521, + "step": 36900 + }, + { + "epoch": 0.23580746968554744, + "grad_norm": 0.675190269947052, + "learning_rate": 9.661175559021423e-05, + "loss": 0.8889, + "step": 36910 + }, + { + "epoch": 0.23587135683528615, + "grad_norm": 1.3392939567565918, + "learning_rate": 9.660993969042197e-05, + "loss": 1.0459, + "step": 36920 + }, + { + "epoch": 0.23593524398502486, + "grad_norm": 0.7173458337783813, + "learning_rate": 9.66081233212273e-05, + "loss": 0.8731, + "step": 36930 + }, + { + "epoch": 0.23599913113476356, + "grad_norm": 1.142118215560913, + "learning_rate": 9.660630648264852e-05, + "loss": 0.9468, + "step": 36940 + }, + { + "epoch": 0.23606301828450227, + "grad_norm": 0.6740077137947083, + "learning_rate": 9.66044891747039e-05, + "loss": 0.8258, + "step": 36950 + }, + { + "epoch": 0.23612690543424095, + "grad_norm": 0.7697812914848328, + "learning_rate": 9.660267139741177e-05, + "loss": 0.9605, + "step": 36960 + }, + { + "epoch": 0.23619079258397965, + "grad_norm": 0.9170047044754028, + "learning_rate": 9.660085315079041e-05, + "loss": 0.8237, + "step": 36970 + }, + { + "epoch": 0.23625467973371836, + "grad_norm": 1.0468403100967407, + "learning_rate": 9.659903443485816e-05, + "loss": 0.7339, + "step": 36980 + }, + { + "epoch": 0.23631856688345707, + "grad_norm": 0.7569143176078796, + "learning_rate": 9.659721524963331e-05, + "loss": 0.9094, + "step": 36990 + }, + { + "epoch": 0.23638245403319577, + "grad_norm": 2.99776291847229, + "learning_rate": 9.659539559513418e-05, + "loss": 0.7063, + "step": 37000 + }, + { + "epoch": 0.23644634118293448, + "grad_norm": 0.6073469519615173, + "learning_rate": 9.659357547137912e-05, + "loss": 0.6927, + "step": 37010 + }, + { + "epoch": 0.23651022833267316, + "grad_norm": 0.9018070101737976, + "learning_rate": 9.659175487838643e-05, + "loss": 0.6648, + "step": 37020 + }, + { + "epoch": 0.23657411548241186, + "grad_norm": 1.5573745965957642, + "learning_rate": 9.658993381617447e-05, + "loss": 0.866, + "step": 37030 + }, + { + "epoch": 0.23663800263215057, + "grad_norm": 0.9631299376487732, + "learning_rate": 9.658811228476158e-05, + "loss": 0.8542, + "step": 37040 + }, + { + "epoch": 0.23670188978188927, + "grad_norm": 
0.5931088924407959, + "learning_rate": 9.658629028416608e-05, + "loss": 1.0986, + "step": 37050 + }, + { + "epoch": 0.23676577693162798, + "grad_norm": 1.1451070308685303, + "learning_rate": 9.658446781440635e-05, + "loss": 1.026, + "step": 37060 + }, + { + "epoch": 0.2368296640813667, + "grad_norm": 0.9093202352523804, + "learning_rate": 9.65826448755007e-05, + "loss": 1.0764, + "step": 37070 + }, + { + "epoch": 0.23689355123110536, + "grad_norm": 0.6607868075370789, + "learning_rate": 9.658082146746754e-05, + "loss": 0.8786, + "step": 37080 + }, + { + "epoch": 0.23695743838084407, + "grad_norm": 1.8870525360107422, + "learning_rate": 9.65789975903252e-05, + "loss": 0.7795, + "step": 37090 + }, + { + "epoch": 0.23702132553058278, + "grad_norm": 0.9815956354141235, + "learning_rate": 9.657717324409207e-05, + "loss": 0.9482, + "step": 37100 + }, + { + "epoch": 0.23708521268032148, + "grad_norm": 0.7396382689476013, + "learning_rate": 9.65753484287865e-05, + "loss": 1.2419, + "step": 37110 + }, + { + "epoch": 0.2371490998300602, + "grad_norm": 1.3282475471496582, + "learning_rate": 9.657352314442688e-05, + "loss": 1.0154, + "step": 37120 + }, + { + "epoch": 0.2372129869797989, + "grad_norm": 0.5715224742889404, + "learning_rate": 9.65716973910316e-05, + "loss": 0.8772, + "step": 37130 + }, + { + "epoch": 0.23727687412953757, + "grad_norm": 0.646783173084259, + "learning_rate": 9.656987116861902e-05, + "loss": 0.9359, + "step": 37140 + }, + { + "epoch": 0.23734076127927628, + "grad_norm": 0.9318345189094543, + "learning_rate": 9.656804447720755e-05, + "loss": 0.7484, + "step": 37150 + }, + { + "epoch": 0.23740464842901499, + "grad_norm": 0.9858495593070984, + "learning_rate": 9.65662173168156e-05, + "loss": 0.9772, + "step": 37160 + }, + { + "epoch": 0.2374685355787537, + "grad_norm": 0.8943020701408386, + "learning_rate": 9.656438968746153e-05, + "loss": 0.9814, + "step": 37170 + }, + { + "epoch": 0.2375324227284924, + "grad_norm": 0.7488458752632141, + "learning_rate": 9.656256158916379e-05, + "loss": 0.8101, + "step": 37180 + }, + { + "epoch": 0.2375963098782311, + "grad_norm": 1.547443151473999, + "learning_rate": 9.656073302194078e-05, + "loss": 0.7346, + "step": 37190 + }, + { + "epoch": 0.23766019702796978, + "grad_norm": 0.7410275340080261, + "learning_rate": 9.655890398581088e-05, + "loss": 0.8636, + "step": 37200 + }, + { + "epoch": 0.2377240841777085, + "grad_norm": 1.3418773412704468, + "learning_rate": 9.655707448079256e-05, + "loss": 0.9634, + "step": 37210 + }, + { + "epoch": 0.2377879713274472, + "grad_norm": 1.0941447019577026, + "learning_rate": 9.655524450690423e-05, + "loss": 1.0396, + "step": 37220 + }, + { + "epoch": 0.2378518584771859, + "grad_norm": 0.6817768216133118, + "learning_rate": 9.65534140641643e-05, + "loss": 1.1446, + "step": 37230 + }, + { + "epoch": 0.2379157456269246, + "grad_norm": 1.0512549877166748, + "learning_rate": 9.65515831525912e-05, + "loss": 0.8289, + "step": 37240 + }, + { + "epoch": 0.2379796327766633, + "grad_norm": 0.6401187777519226, + "learning_rate": 9.654975177220341e-05, + "loss": 1.045, + "step": 37250 + }, + { + "epoch": 0.238043519926402, + "grad_norm": 1.0263795852661133, + "learning_rate": 9.654791992301935e-05, + "loss": 1.0096, + "step": 37260 + }, + { + "epoch": 0.2381074070761407, + "grad_norm": 0.7788522839546204, + "learning_rate": 9.654608760505745e-05, + "loss": 1.0715, + "step": 37270 + }, + { + "epoch": 0.2381712942258794, + "grad_norm": 0.7468205094337463, + "learning_rate": 9.654425481833618e-05, + "loss": 1.0105, + 
"step": 37280 + }, + { + "epoch": 0.2382351813756181, + "grad_norm": 0.6502282619476318, + "learning_rate": 9.6542421562874e-05, + "loss": 0.9838, + "step": 37290 + }, + { + "epoch": 0.23829906852535682, + "grad_norm": 0.6235799193382263, + "learning_rate": 9.654058783868938e-05, + "loss": 0.8914, + "step": 37300 + }, + { + "epoch": 0.23836295567509552, + "grad_norm": 0.6103238463401794, + "learning_rate": 9.653875364580077e-05, + "loss": 0.864, + "step": 37310 + }, + { + "epoch": 0.2384268428248342, + "grad_norm": 0.9452196955680847, + "learning_rate": 9.653691898422666e-05, + "loss": 0.8753, + "step": 37320 + }, + { + "epoch": 0.2384907299745729, + "grad_norm": 0.8040950298309326, + "learning_rate": 9.653508385398549e-05, + "loss": 0.8442, + "step": 37330 + }, + { + "epoch": 0.2385546171243116, + "grad_norm": 1.0032446384429932, + "learning_rate": 9.65332482550958e-05, + "loss": 0.9091, + "step": 37340 + }, + { + "epoch": 0.23861850427405032, + "grad_norm": 1.0294917821884155, + "learning_rate": 9.653141218757602e-05, + "loss": 0.7559, + "step": 37350 + }, + { + "epoch": 0.23868239142378903, + "grad_norm": 0.6536062955856323, + "learning_rate": 9.652957565144465e-05, + "loss": 0.7608, + "step": 37360 + }, + { + "epoch": 0.23874627857352773, + "grad_norm": 0.7073416113853455, + "learning_rate": 9.652773864672022e-05, + "loss": 1.0675, + "step": 37370 + }, + { + "epoch": 0.2388101657232664, + "grad_norm": 0.8172992467880249, + "learning_rate": 9.652590117342122e-05, + "loss": 0.8483, + "step": 37380 + }, + { + "epoch": 0.23887405287300512, + "grad_norm": 0.7354963421821594, + "learning_rate": 9.652406323156613e-05, + "loss": 0.9358, + "step": 37390 + }, + { + "epoch": 0.23893794002274382, + "grad_norm": 0.6794359683990479, + "learning_rate": 9.652222482117347e-05, + "loss": 0.9437, + "step": 37400 + }, + { + "epoch": 0.23900182717248253, + "grad_norm": 0.7249003648757935, + "learning_rate": 9.652038594226177e-05, + "loss": 0.8782, + "step": 37410 + }, + { + "epoch": 0.23906571432222123, + "grad_norm": 0.8355563282966614, + "learning_rate": 9.651854659484954e-05, + "loss": 0.6612, + "step": 37420 + }, + { + "epoch": 0.23912960147195994, + "grad_norm": 0.7103647589683533, + "learning_rate": 9.651670677895529e-05, + "loss": 1.1142, + "step": 37430 + }, + { + "epoch": 0.23919348862169862, + "grad_norm": 0.5884954929351807, + "learning_rate": 9.651486649459755e-05, + "loss": 0.9896, + "step": 37440 + }, + { + "epoch": 0.23925737577143733, + "grad_norm": 0.7389781475067139, + "learning_rate": 9.651302574179489e-05, + "loss": 0.8372, + "step": 37450 + }, + { + "epoch": 0.23932126292117603, + "grad_norm": 0.5792128443717957, + "learning_rate": 9.651118452056582e-05, + "loss": 0.7093, + "step": 37460 + }, + { + "epoch": 0.23938515007091474, + "grad_norm": 0.7699292898178101, + "learning_rate": 9.650934283092887e-05, + "loss": 1.0111, + "step": 37470 + }, + { + "epoch": 0.23944903722065344, + "grad_norm": 0.7070481181144714, + "learning_rate": 9.65075006729026e-05, + "loss": 1.084, + "step": 37480 + }, + { + "epoch": 0.23951292437039215, + "grad_norm": 1.0527695417404175, + "learning_rate": 9.650565804650556e-05, + "loss": 0.8554, + "step": 37490 + }, + { + "epoch": 0.23957681152013083, + "grad_norm": 0.7435452342033386, + "learning_rate": 9.650381495175633e-05, + "loss": 0.8564, + "step": 37500 + }, + { + "epoch": 0.23964069866986953, + "grad_norm": 0.9343265295028687, + "learning_rate": 9.650197138867343e-05, + "loss": 1.2956, + "step": 37510 + }, + { + "epoch": 0.23970458581960824, + "grad_norm": 
1.0350561141967773, + "learning_rate": 9.650012735727546e-05, + "loss": 0.9515, + "step": 37520 + }, + { + "epoch": 0.23976847296934695, + "grad_norm": 1.1967248916625977, + "learning_rate": 9.649828285758098e-05, + "loss": 0.9391, + "step": 37530 + }, + { + "epoch": 0.23983236011908565, + "grad_norm": 1.7346086502075195, + "learning_rate": 9.649643788960856e-05, + "loss": 0.8431, + "step": 37540 + }, + { + "epoch": 0.23989624726882436, + "grad_norm": 0.7352771162986755, + "learning_rate": 9.649459245337679e-05, + "loss": 0.6744, + "step": 37550 + }, + { + "epoch": 0.23996013441856304, + "grad_norm": 0.6544600129127502, + "learning_rate": 9.649293116042181e-05, + "loss": 1.2871, + "step": 37560 + }, + { + "epoch": 0.24002402156830174, + "grad_norm": 0.4782470464706421, + "learning_rate": 9.649108483454848e-05, + "loss": 1.022, + "step": 37570 + }, + { + "epoch": 0.24008790871804045, + "grad_norm": 0.9200822710990906, + "learning_rate": 9.648923804046968e-05, + "loss": 0.737, + "step": 37580 + }, + { + "epoch": 0.24015179586777916, + "grad_norm": 1.8405570983886719, + "learning_rate": 9.648739077820405e-05, + "loss": 0.832, + "step": 37590 + }, + { + "epoch": 0.24021568301751786, + "grad_norm": 1.054779052734375, + "learning_rate": 9.648554304777017e-05, + "loss": 0.8275, + "step": 37600 + }, + { + "epoch": 0.24027957016725657, + "grad_norm": 0.8630744814872742, + "learning_rate": 9.648369484918667e-05, + "loss": 0.8602, + "step": 37610 + }, + { + "epoch": 0.24034345731699525, + "grad_norm": 1.0110766887664795, + "learning_rate": 9.648184618247214e-05, + "loss": 0.8669, + "step": 37620 + }, + { + "epoch": 0.24040734446673395, + "grad_norm": 1.0114331245422363, + "learning_rate": 9.64799970476452e-05, + "loss": 1.0707, + "step": 37630 + }, + { + "epoch": 0.24047123161647266, + "grad_norm": 0.8818547129631042, + "learning_rate": 9.64781474447245e-05, + "loss": 0.9434, + "step": 37640 + }, + { + "epoch": 0.24053511876621136, + "grad_norm": 1.12362802028656, + "learning_rate": 9.647629737372863e-05, + "loss": 1.1379, + "step": 37650 + }, + { + "epoch": 0.24059900591595007, + "grad_norm": 0.696323812007904, + "learning_rate": 9.647444683467623e-05, + "loss": 0.951, + "step": 37660 + }, + { + "epoch": 0.24066289306568878, + "grad_norm": 0.8041189312934875, + "learning_rate": 9.647259582758597e-05, + "loss": 0.9218, + "step": 37670 + }, + { + "epoch": 0.24072678021542748, + "grad_norm": 0.45444270968437195, + "learning_rate": 9.647074435247644e-05, + "loss": 0.7025, + "step": 37680 + }, + { + "epoch": 0.24079066736516616, + "grad_norm": 0.6444490551948547, + "learning_rate": 9.646889240936632e-05, + "loss": 1.011, + "step": 37690 + }, + { + "epoch": 0.24085455451490487, + "grad_norm": 0.9339631199836731, + "learning_rate": 9.646703999827426e-05, + "loss": 1.0486, + "step": 37700 + }, + { + "epoch": 0.24091844166464357, + "grad_norm": 1.2948579788208008, + "learning_rate": 9.64651871192189e-05, + "loss": 1.2862, + "step": 37710 + }, + { + "epoch": 0.24098232881438228, + "grad_norm": 0.8452892899513245, + "learning_rate": 9.64633337722189e-05, + "loss": 0.8614, + "step": 37720 + }, + { + "epoch": 0.24104621596412099, + "grad_norm": 0.8650469183921814, + "learning_rate": 9.646147995729294e-05, + "loss": 1.0569, + "step": 37730 + }, + { + "epoch": 0.2411101031138597, + "grad_norm": 0.8053631782531738, + "learning_rate": 9.645962567445969e-05, + "loss": 0.9448, + "step": 37740 + }, + { + "epoch": 0.24117399026359837, + "grad_norm": 0.6854358315467834, + "learning_rate": 9.64577709237378e-05, + "loss": 
0.9552, + "step": 37750 + }, + { + "epoch": 0.24123787741333708, + "grad_norm": 1.0860304832458496, + "learning_rate": 9.645591570514598e-05, + "loss": 1.111, + "step": 37760 + }, + { + "epoch": 0.24130176456307578, + "grad_norm": 0.7687236070632935, + "learning_rate": 9.64540600187029e-05, + "loss": 0.8652, + "step": 37770 + }, + { + "epoch": 0.2413656517128145, + "grad_norm": 1.2046473026275635, + "learning_rate": 9.645220386442724e-05, + "loss": 0.7453, + "step": 37780 + }, + { + "epoch": 0.2414295388625532, + "grad_norm": 0.9802344441413879, + "learning_rate": 9.64503472423377e-05, + "loss": 0.8819, + "step": 37790 + }, + { + "epoch": 0.2414934260122919, + "grad_norm": 0.7101196646690369, + "learning_rate": 9.644849015245296e-05, + "loss": 0.8814, + "step": 37800 + }, + { + "epoch": 0.24155731316203058, + "grad_norm": 1.215147852897644, + "learning_rate": 9.644663259479177e-05, + "loss": 0.9877, + "step": 37810 + }, + { + "epoch": 0.24162120031176929, + "grad_norm": 0.9594703316688538, + "learning_rate": 9.64447745693728e-05, + "loss": 0.8849, + "step": 37820 + }, + { + "epoch": 0.241685087461508, + "grad_norm": 0.6628295183181763, + "learning_rate": 9.644291607621476e-05, + "loss": 0.6372, + "step": 37830 + }, + { + "epoch": 0.2417489746112467, + "grad_norm": 0.7092610001564026, + "learning_rate": 9.644105711533638e-05, + "loss": 0.9584, + "step": 37840 + }, + { + "epoch": 0.2418128617609854, + "grad_norm": 1.3304320573806763, + "learning_rate": 9.643919768675637e-05, + "loss": 1.075, + "step": 37850 + }, + { + "epoch": 0.2418767489107241, + "grad_norm": 0.8040294051170349, + "learning_rate": 9.643733779049349e-05, + "loss": 0.9873, + "step": 37860 + }, + { + "epoch": 0.2419406360604628, + "grad_norm": 0.6643669009208679, + "learning_rate": 9.643547742656643e-05, + "loss": 0.9092, + "step": 37870 + }, + { + "epoch": 0.2420045232102015, + "grad_norm": 0.8764951229095459, + "learning_rate": 9.643361659499392e-05, + "loss": 0.7729, + "step": 37880 + }, + { + "epoch": 0.2420684103599402, + "grad_norm": 6.183263778686523, + "learning_rate": 9.643175529579475e-05, + "loss": 1.238, + "step": 37890 + }, + { + "epoch": 0.2421322975096789, + "grad_norm": 1.3563274145126343, + "learning_rate": 9.642989352898762e-05, + "loss": 0.8639, + "step": 37900 + }, + { + "epoch": 0.2421961846594176, + "grad_norm": 0.8023094534873962, + "learning_rate": 9.64280312945913e-05, + "loss": 1.1368, + "step": 37910 + }, + { + "epoch": 0.24226007180915632, + "grad_norm": 0.6188109517097473, + "learning_rate": 9.642616859262455e-05, + "loss": 1.0875, + "step": 37920 + }, + { + "epoch": 0.242323958958895, + "grad_norm": 0.7044292688369751, + "learning_rate": 9.64243054231061e-05, + "loss": 0.9899, + "step": 37930 + }, + { + "epoch": 0.2423878461086337, + "grad_norm": 0.8876643776893616, + "learning_rate": 9.642244178605473e-05, + "loss": 0.9804, + "step": 37940 + }, + { + "epoch": 0.2424517332583724, + "grad_norm": 0.7421206831932068, + "learning_rate": 9.642057768148922e-05, + "loss": 0.8828, + "step": 37950 + }, + { + "epoch": 0.24251562040811112, + "grad_norm": 0.8457249402999878, + "learning_rate": 9.641871310942832e-05, + "loss": 0.9491, + "step": 37960 + }, + { + "epoch": 0.24257950755784982, + "grad_norm": 1.0700315237045288, + "learning_rate": 9.641684806989084e-05, + "loss": 0.8752, + "step": 37970 + }, + { + "epoch": 0.24264339470758853, + "grad_norm": 0.7216569781303406, + "learning_rate": 9.641498256289552e-05, + "loss": 1.1564, + "step": 37980 + }, + { + "epoch": 0.2427072818573272, + "grad_norm": 
0.847780704498291, + "learning_rate": 9.641311658846119e-05, + "loss": 0.8944, + "step": 37990 + }, + { + "epoch": 0.2427711690070659, + "grad_norm": 0.9553901553153992, + "learning_rate": 9.641125014660662e-05, + "loss": 0.9013, + "step": 38000 + }, + { + "epoch": 0.24283505615680462, + "grad_norm": 1.37058424949646, + "learning_rate": 9.64093832373506e-05, + "loss": 0.9315, + "step": 38010 + }, + { + "epoch": 0.24289894330654332, + "grad_norm": 0.7296909689903259, + "learning_rate": 9.640751586071195e-05, + "loss": 0.8648, + "step": 38020 + }, + { + "epoch": 0.24296283045628203, + "grad_norm": 1.1600792407989502, + "learning_rate": 9.640564801670948e-05, + "loss": 0.7834, + "step": 38030 + }, + { + "epoch": 0.24302671760602074, + "grad_norm": 1.0222969055175781, + "learning_rate": 9.640377970536197e-05, + "loss": 1.0175, + "step": 38040 + }, + { + "epoch": 0.24309060475575942, + "grad_norm": 1.1712769269943237, + "learning_rate": 9.640191092668825e-05, + "loss": 1.0173, + "step": 38050 + }, + { + "epoch": 0.24315449190549812, + "grad_norm": 1.4743320941925049, + "learning_rate": 9.640004168070716e-05, + "loss": 1.2532, + "step": 38060 + }, + { + "epoch": 0.24321837905523683, + "grad_norm": 0.7207898497581482, + "learning_rate": 9.639817196743749e-05, + "loss": 0.9538, + "step": 38070 + }, + { + "epoch": 0.24328226620497553, + "grad_norm": 1.0338420867919922, + "learning_rate": 9.639630178689809e-05, + "loss": 0.8653, + "step": 38080 + }, + { + "epoch": 0.24334615335471424, + "grad_norm": 0.7284950017929077, + "learning_rate": 9.639443113910781e-05, + "loss": 0.7094, + "step": 38090 + }, + { + "epoch": 0.24341004050445295, + "grad_norm": 1.0017796754837036, + "learning_rate": 9.639256002408545e-05, + "loss": 0.7997, + "step": 38100 + }, + { + "epoch": 0.24347392765419162, + "grad_norm": 1.1406546831130981, + "learning_rate": 9.639068844184989e-05, + "loss": 0.9456, + "step": 38110 + }, + { + "epoch": 0.24353781480393033, + "grad_norm": 0.7113826870918274, + "learning_rate": 9.638881639241996e-05, + "loss": 0.8586, + "step": 38120 + }, + { + "epoch": 0.24360170195366904, + "grad_norm": 0.8635872602462769, + "learning_rate": 9.638694387581453e-05, + "loss": 0.8049, + "step": 38130 + }, + { + "epoch": 0.24366558910340774, + "grad_norm": 0.882631242275238, + "learning_rate": 9.638507089205242e-05, + "loss": 0.7798, + "step": 38140 + }, + { + "epoch": 0.24372947625314645, + "grad_norm": 0.9470729231834412, + "learning_rate": 9.638319744115254e-05, + "loss": 0.875, + "step": 38150 + }, + { + "epoch": 0.24379336340288515, + "grad_norm": 0.5995595455169678, + "learning_rate": 9.638132352313371e-05, + "loss": 0.7982, + "step": 38160 + }, + { + "epoch": 0.24385725055262383, + "grad_norm": 0.702936589717865, + "learning_rate": 9.637944913801485e-05, + "loss": 0.8089, + "step": 38170 + }, + { + "epoch": 0.24392113770236254, + "grad_norm": 0.9595651626586914, + "learning_rate": 9.63775742858148e-05, + "loss": 0.9638, + "step": 38180 + }, + { + "epoch": 0.24398502485210125, + "grad_norm": 0.706251323223114, + "learning_rate": 9.637569896655245e-05, + "loss": 0.7911, + "step": 38190 + }, + { + "epoch": 0.24404891200183995, + "grad_norm": 0.6460245251655579, + "learning_rate": 9.63740107798937e-05, + "loss": 1.1869, + "step": 38200 + }, + { + "epoch": 0.24411279915157866, + "grad_norm": 0.7056565880775452, + "learning_rate": 9.637213457326503e-05, + "loss": 0.874, + "step": 38210 + }, + { + "epoch": 0.24417668630131736, + "grad_norm": 1.0978292226791382, + "learning_rate": 9.637025789962885e-05, + 
"loss": 1.059, + "step": 38220 + }, + { + "epoch": 0.24424057345105604, + "grad_norm": 1.170605182647705, + "learning_rate": 9.636838075900405e-05, + "loss": 0.7483, + "step": 38230 + }, + { + "epoch": 0.24430446060079475, + "grad_norm": 0.9686828255653381, + "learning_rate": 9.636650315140955e-05, + "loss": 1.0386, + "step": 38240 + }, + { + "epoch": 0.24436834775053345, + "grad_norm": 0.9862095713615417, + "learning_rate": 9.636462507686425e-05, + "loss": 0.8925, + "step": 38250 + }, + { + "epoch": 0.24443223490027216, + "grad_norm": 0.8100789189338684, + "learning_rate": 9.636274653538707e-05, + "loss": 0.8097, + "step": 38260 + }, + { + "epoch": 0.24449612205001087, + "grad_norm": 0.9674224257469177, + "learning_rate": 9.636086752699691e-05, + "loss": 0.9992, + "step": 38270 + }, + { + "epoch": 0.24456000919974957, + "grad_norm": 0.8364433646202087, + "learning_rate": 9.63589880517127e-05, + "loss": 0.7888, + "step": 38280 + }, + { + "epoch": 0.24462389634948825, + "grad_norm": 0.8828948140144348, + "learning_rate": 9.63571081095534e-05, + "loss": 0.9031, + "step": 38290 + }, + { + "epoch": 0.24468778349922696, + "grad_norm": 0.7281533479690552, + "learning_rate": 9.63552277005379e-05, + "loss": 0.7772, + "step": 38300 + }, + { + "epoch": 0.24475167064896566, + "grad_norm": 1.225823998451233, + "learning_rate": 9.635334682468516e-05, + "loss": 0.7892, + "step": 38310 + }, + { + "epoch": 0.24481555779870437, + "grad_norm": 0.8330084681510925, + "learning_rate": 9.63514654820141e-05, + "loss": 0.8299, + "step": 38320 + }, + { + "epoch": 0.24487944494844308, + "grad_norm": 1.0153292417526245, + "learning_rate": 9.63495836725437e-05, + "loss": 1.0699, + "step": 38330 + }, + { + "epoch": 0.24494333209818178, + "grad_norm": 1.185373306274414, + "learning_rate": 9.634770139629288e-05, + "loss": 0.9322, + "step": 38340 + }, + { + "epoch": 0.24500721924792046, + "grad_norm": 0.7242724895477295, + "learning_rate": 9.634581865328062e-05, + "loss": 0.7414, + "step": 38350 + }, + { + "epoch": 0.24507110639765917, + "grad_norm": 0.8044312596321106, + "learning_rate": 9.634393544352589e-05, + "loss": 1.1166, + "step": 38360 + }, + { + "epoch": 0.24513499354739787, + "grad_norm": 0.9795138239860535, + "learning_rate": 9.63420517670476e-05, + "loss": 0.7423, + "step": 38370 + }, + { + "epoch": 0.24519888069713658, + "grad_norm": 0.7290918231010437, + "learning_rate": 9.634016762386478e-05, + "loss": 0.9426, + "step": 38380 + }, + { + "epoch": 0.24526276784687528, + "grad_norm": 0.7532154321670532, + "learning_rate": 9.633828301399639e-05, + "loss": 0.9311, + "step": 38390 + }, + { + "epoch": 0.245326654996614, + "grad_norm": 1.0642715692520142, + "learning_rate": 9.633639793746139e-05, + "loss": 0.83, + "step": 38400 + }, + { + "epoch": 0.24539054214635267, + "grad_norm": 0.5684540271759033, + "learning_rate": 9.633451239427877e-05, + "loss": 0.9418, + "step": 38410 + }, + { + "epoch": 0.24545442929609138, + "grad_norm": 0.827085554599762, + "learning_rate": 9.633262638446753e-05, + "loss": 0.8866, + "step": 38420 + }, + { + "epoch": 0.24551831644583008, + "grad_norm": 1.217444896697998, + "learning_rate": 9.633073990804666e-05, + "loss": 0.8359, + "step": 38430 + }, + { + "epoch": 0.2455822035955688, + "grad_norm": 0.964013397693634, + "learning_rate": 9.632885296503515e-05, + "loss": 0.9809, + "step": 38440 + }, + { + "epoch": 0.2456460907453075, + "grad_norm": 0.5672999024391174, + "learning_rate": 9.632696555545203e-05, + "loss": 0.7156, + "step": 38450 + }, + { + "epoch": 0.2457099778950462, + 
"grad_norm": 0.6509802341461182, + "learning_rate": 9.632507767931626e-05, + "loss": 0.7118, + "step": 38460 + }, + { + "epoch": 0.24577386504478488, + "grad_norm": 1.3255314826965332, + "learning_rate": 9.63231893366469e-05, + "loss": 0.9581, + "step": 38470 + }, + { + "epoch": 0.24583775219452358, + "grad_norm": 0.7225618958473206, + "learning_rate": 9.632130052746296e-05, + "loss": 0.9634, + "step": 38480 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.1634318828582764, + "learning_rate": 9.631941125178343e-05, + "loss": 1.0248, + "step": 38490 + }, + { + "epoch": 0.245965526494001, + "grad_norm": 0.75383460521698, + "learning_rate": 9.631752150962736e-05, + "loss": 0.8206, + "step": 38500 + }, + { + "epoch": 0.2460294136437397, + "grad_norm": 1.0557365417480469, + "learning_rate": 9.631563130101377e-05, + "loss": 0.8587, + "step": 38510 + }, + { + "epoch": 0.2460933007934784, + "grad_norm": 0.8488501906394958, + "learning_rate": 9.631374062596172e-05, + "loss": 0.9565, + "step": 38520 + }, + { + "epoch": 0.24615718794321712, + "grad_norm": 0.7064526081085205, + "learning_rate": 9.631184948449023e-05, + "loss": 0.8974, + "step": 38530 + }, + { + "epoch": 0.2462210750929558, + "grad_norm": 1.123842716217041, + "learning_rate": 9.630995787661834e-05, + "loss": 1.0052, + "step": 38540 + }, + { + "epoch": 0.2462849622426945, + "grad_norm": 0.8303399085998535, + "learning_rate": 9.630806580236512e-05, + "loss": 0.6941, + "step": 38550 + }, + { + "epoch": 0.2463488493924332, + "grad_norm": 0.6929298043251038, + "learning_rate": 9.630617326174962e-05, + "loss": 0.9629, + "step": 38560 + }, + { + "epoch": 0.2464127365421719, + "grad_norm": 0.9439372420310974, + "learning_rate": 9.630428025479088e-05, + "loss": 0.8717, + "step": 38570 + }, + { + "epoch": 0.24647662369191062, + "grad_norm": 0.9138436317443848, + "learning_rate": 9.630238678150799e-05, + "loss": 0.8803, + "step": 38580 + }, + { + "epoch": 0.24654051084164932, + "grad_norm": 0.8482638597488403, + "learning_rate": 9.630049284192001e-05, + "loss": 0.9979, + "step": 38590 + }, + { + "epoch": 0.246604397991388, + "grad_norm": 0.779525637626648, + "learning_rate": 9.6298598436046e-05, + "loss": 0.7542, + "step": 38600 + }, + { + "epoch": 0.2466682851411267, + "grad_norm": 0.8518264293670654, + "learning_rate": 9.629670356390505e-05, + "loss": 0.9886, + "step": 38610 + }, + { + "epoch": 0.24673217229086541, + "grad_norm": 1.1973893642425537, + "learning_rate": 9.629480822551627e-05, + "loss": 1.0507, + "step": 38620 + }, + { + "epoch": 0.24679605944060412, + "grad_norm": 0.8994162678718567, + "learning_rate": 9.629291242089869e-05, + "loss": 0.8537, + "step": 38630 + }, + { + "epoch": 0.24685994659034283, + "grad_norm": 0.5772508978843689, + "learning_rate": 9.629101615007145e-05, + "loss": 0.8669, + "step": 38640 + }, + { + "epoch": 0.24692383374008153, + "grad_norm": 0.7069337368011475, + "learning_rate": 9.628911941305361e-05, + "loss": 1.0522, + "step": 38650 + }, + { + "epoch": 0.2469877208898202, + "grad_norm": 0.722184956073761, + "learning_rate": 9.62872222098643e-05, + "loss": 1.1426, + "step": 38660 + }, + { + "epoch": 0.24705160803955892, + "grad_norm": 1.3423835039138794, + "learning_rate": 9.628532454052263e-05, + "loss": 0.7297, + "step": 38670 + }, + { + "epoch": 0.24711549518929762, + "grad_norm": 0.7924500703811646, + "learning_rate": 9.628342640504769e-05, + "loss": 0.7804, + "step": 38680 + }, + { + "epoch": 0.24717938233903633, + "grad_norm": 0.9021787047386169, + "learning_rate": 9.628152780345861e-05, + 
"loss": 0.8003, + "step": 38690 + }, + { + "epoch": 0.24724326948877504, + "grad_norm": 0.991362452507019, + "learning_rate": 9.62796287357745e-05, + "loss": 0.6662, + "step": 38700 + }, + { + "epoch": 0.24730715663851374, + "grad_norm": 0.8103492856025696, + "learning_rate": 9.62777292020145e-05, + "loss": 0.8117, + "step": 38710 + }, + { + "epoch": 0.24737104378825242, + "grad_norm": 0.6966975331306458, + "learning_rate": 9.627582920219773e-05, + "loss": 0.8408, + "step": 38720 + }, + { + "epoch": 0.24743493093799113, + "grad_norm": 0.7350102663040161, + "learning_rate": 9.627392873634332e-05, + "loss": 0.7777, + "step": 38730 + }, + { + "epoch": 0.24749881808772983, + "grad_norm": 0.7632319927215576, + "learning_rate": 9.627202780447041e-05, + "loss": 1.0788, + "step": 38740 + }, + { + "epoch": 0.24756270523746854, + "grad_norm": 0.6083953976631165, + "learning_rate": 9.627012640659816e-05, + "loss": 0.8096, + "step": 38750 + }, + { + "epoch": 0.24762659238720724, + "grad_norm": 0.7463345527648926, + "learning_rate": 9.62682245427457e-05, + "loss": 0.7349, + "step": 38760 + }, + { + "epoch": 0.24769047953694595, + "grad_norm": 0.7767286896705627, + "learning_rate": 9.626632221293219e-05, + "loss": 1.1039, + "step": 38770 + }, + { + "epoch": 0.24775436668668463, + "grad_norm": 1.4006277322769165, + "learning_rate": 9.626441941717678e-05, + "loss": 0.7395, + "step": 38780 + }, + { + "epoch": 0.24781825383642334, + "grad_norm": 0.9676879048347473, + "learning_rate": 9.626251615549867e-05, + "loss": 1.1208, + "step": 38790 + }, + { + "epoch": 0.24788214098616204, + "grad_norm": 0.8642779588699341, + "learning_rate": 9.626061242791699e-05, + "loss": 0.8964, + "step": 38800 + }, + { + "epoch": 0.24794602813590075, + "grad_norm": 1.1887127161026, + "learning_rate": 9.625870823445092e-05, + "loss": 0.8361, + "step": 38810 + }, + { + "epoch": 0.24800991528563945, + "grad_norm": 0.7598790526390076, + "learning_rate": 9.625680357511962e-05, + "loss": 0.9854, + "step": 38820 + }, + { + "epoch": 0.24807380243537816, + "grad_norm": 1.8052411079406738, + "learning_rate": 9.625489844994231e-05, + "loss": 0.867, + "step": 38830 + }, + { + "epoch": 0.24813768958511684, + "grad_norm": 0.9150733947753906, + "learning_rate": 9.625299285893816e-05, + "loss": 0.9337, + "step": 38840 + }, + { + "epoch": 0.24820157673485554, + "grad_norm": 0.8813467621803284, + "learning_rate": 9.625108680212633e-05, + "loss": 0.6746, + "step": 38850 + }, + { + "epoch": 0.24826546388459425, + "grad_norm": 0.7594160437583923, + "learning_rate": 9.624918027952607e-05, + "loss": 1.0736, + "step": 38860 + }, + { + "epoch": 0.24832935103433296, + "grad_norm": 0.7469112873077393, + "learning_rate": 9.624727329115655e-05, + "loss": 0.8787, + "step": 38870 + }, + { + "epoch": 0.24839323818407166, + "grad_norm": 0.7725271582603455, + "learning_rate": 9.624536583703697e-05, + "loss": 0.768, + "step": 38880 + }, + { + "epoch": 0.24845712533381037, + "grad_norm": 2.877929210662842, + "learning_rate": 9.624345791718656e-05, + "loss": 1.0134, + "step": 38890 + }, + { + "epoch": 0.24852101248354905, + "grad_norm": 0.7709558606147766, + "learning_rate": 9.62415495316245e-05, + "loss": 0.922, + "step": 38900 + }, + { + "epoch": 0.24858489963328775, + "grad_norm": 1.2846732139587402, + "learning_rate": 9.623964068037006e-05, + "loss": 1.2037, + "step": 38910 + }, + { + "epoch": 0.24864878678302646, + "grad_norm": 0.6465185284614563, + "learning_rate": 9.62377313634424e-05, + "loss": 0.9262, + "step": 38920 + }, + { + "epoch": 
0.24871267393276517, + "grad_norm": 0.7968388199806213, + "learning_rate": 9.623582158086081e-05, + "loss": 0.7902, + "step": 38930 + }, + { + "epoch": 0.24877656108250387, + "grad_norm": 0.9779314994812012, + "learning_rate": 9.62339113326445e-05, + "loss": 0.9249, + "step": 38940 + }, + { + "epoch": 0.24884044823224258, + "grad_norm": 1.0651602745056152, + "learning_rate": 9.62320006188127e-05, + "loss": 0.8328, + "step": 38950 + }, + { + "epoch": 0.24890433538198126, + "grad_norm": 0.6496372222900391, + "learning_rate": 9.623008943938466e-05, + "loss": 0.8704, + "step": 38960 + }, + { + "epoch": 0.24896822253171996, + "grad_norm": 1.1484968662261963, + "learning_rate": 9.62281777943796e-05, + "loss": 0.8183, + "step": 38970 + }, + { + "epoch": 0.24903210968145867, + "grad_norm": 1.39591383934021, + "learning_rate": 9.622626568381684e-05, + "loss": 1.0141, + "step": 38980 + }, + { + "epoch": 0.24909599683119737, + "grad_norm": 0.6642056107521057, + "learning_rate": 9.622435310771556e-05, + "loss": 0.8481, + "step": 38990 + }, + { + "epoch": 0.24915988398093608, + "grad_norm": 1.48539400100708, + "learning_rate": 9.622244006609506e-05, + "loss": 0.8486, + "step": 39000 + }, + { + "epoch": 0.2492237711306748, + "grad_norm": 0.6797450184822083, + "learning_rate": 9.62205265589746e-05, + "loss": 0.8783, + "step": 39010 + }, + { + "epoch": 0.24928765828041347, + "grad_norm": 0.6741315126419067, + "learning_rate": 9.621861258637345e-05, + "loss": 0.8605, + "step": 39020 + }, + { + "epoch": 0.24935154543015217, + "grad_norm": 0.8567194938659668, + "learning_rate": 9.621669814831089e-05, + "loss": 1.1903, + "step": 39030 + }, + { + "epoch": 0.24941543257989088, + "grad_norm": 0.665440022945404, + "learning_rate": 9.62147832448062e-05, + "loss": 0.9027, + "step": 39040 + }, + { + "epoch": 0.24947931972962958, + "grad_norm": 0.7882826924324036, + "learning_rate": 9.621286787587866e-05, + "loss": 0.7315, + "step": 39050 + }, + { + "epoch": 0.2495432068793683, + "grad_norm": 0.6440451145172119, + "learning_rate": 9.621095204154756e-05, + "loss": 1.0849, + "step": 39060 + }, + { + "epoch": 0.249607094029107, + "grad_norm": 0.7116972804069519, + "learning_rate": 9.62090357418322e-05, + "loss": 0.9145, + "step": 39070 + }, + { + "epoch": 0.24967098117884567, + "grad_norm": 0.895585298538208, + "learning_rate": 9.620711897675185e-05, + "loss": 0.9243, + "step": 39080 + }, + { + "epoch": 0.24973486832858438, + "grad_norm": 0.730792224407196, + "learning_rate": 9.620520174632585e-05, + "loss": 0.8114, + "step": 39090 + }, + { + "epoch": 0.2497987554783231, + "grad_norm": 1.1607691049575806, + "learning_rate": 9.620328405057352e-05, + "loss": 0.9737, + "step": 39100 + }, + { + "epoch": 0.2498626426280618, + "grad_norm": 0.927515983581543, + "learning_rate": 9.620136588951412e-05, + "loss": 1.0355, + "step": 39110 + }, + { + "epoch": 0.2499265297778005, + "grad_norm": 1.267722487449646, + "learning_rate": 9.6199447263167e-05, + "loss": 1.0208, + "step": 39120 + }, + { + "epoch": 0.2499904169275392, + "grad_norm": 0.9854876399040222, + "learning_rate": 9.619752817155149e-05, + "loss": 1.2538, + "step": 39130 + }, + { + "epoch": 0.2500543040772779, + "grad_norm": 0.8375557065010071, + "learning_rate": 9.61956086146869e-05, + "loss": 0.8444, + "step": 39140 + }, + { + "epoch": 0.2501181912270166, + "grad_norm": 0.7425163388252258, + "learning_rate": 9.619368859259255e-05, + "loss": 0.6912, + "step": 39150 + }, + { + "epoch": 0.2501820783767553, + "grad_norm": 0.9362971782684326, + "learning_rate": 
9.61917681052878e-05, + "loss": 1.0123, + "step": 39160 + }, + { + "epoch": 0.250245965526494, + "grad_norm": 0.7394902110099792, + "learning_rate": 9.6189847152792e-05, + "loss": 0.7578, + "step": 39170 + }, + { + "epoch": 0.2503098526762327, + "grad_norm": 2.326955556869507, + "learning_rate": 9.618792573512447e-05, + "loss": 0.9821, + "step": 39180 + }, + { + "epoch": 0.2503737398259714, + "grad_norm": 0.5567811727523804, + "learning_rate": 9.618600385230456e-05, + "loss": 1.543, + "step": 39190 + }, + { + "epoch": 0.2504376269757101, + "grad_norm": 0.8062513470649719, + "learning_rate": 9.618408150435165e-05, + "loss": 1.0742, + "step": 39200 + }, + { + "epoch": 0.2505015141254488, + "grad_norm": 0.8506273031234741, + "learning_rate": 9.618215869128507e-05, + "loss": 0.8167, + "step": 39210 + }, + { + "epoch": 0.2505654012751875, + "grad_norm": 1.3575971126556396, + "learning_rate": 9.61802354131242e-05, + "loss": 1.0804, + "step": 39220 + }, + { + "epoch": 0.2506292884249262, + "grad_norm": 0.6114894151687622, + "learning_rate": 9.617831166988842e-05, + "loss": 0.7265, + "step": 39230 + }, + { + "epoch": 0.2506931755746649, + "grad_norm": 1.1617469787597656, + "learning_rate": 9.617638746159709e-05, + "loss": 1.1414, + "step": 39240 + }, + { + "epoch": 0.2507570627244036, + "grad_norm": 1.004840612411499, + "learning_rate": 9.617446278826958e-05, + "loss": 0.8523, + "step": 39250 + }, + { + "epoch": 0.25082094987414233, + "grad_norm": 0.69590824842453, + "learning_rate": 9.617253764992529e-05, + "loss": 0.6603, + "step": 39260 + }, + { + "epoch": 0.25088483702388104, + "grad_norm": 2.461747169494629, + "learning_rate": 9.61706120465836e-05, + "loss": 0.973, + "step": 39270 + }, + { + "epoch": 0.25094872417361974, + "grad_norm": 1.718680500984192, + "learning_rate": 9.616868597826389e-05, + "loss": 0.9792, + "step": 39280 + }, + { + "epoch": 0.2510126113233584, + "grad_norm": 0.9190512299537659, + "learning_rate": 9.616675944498559e-05, + "loss": 1.0048, + "step": 39290 + }, + { + "epoch": 0.2510764984730971, + "grad_norm": 0.6358333230018616, + "learning_rate": 9.616483244676809e-05, + "loss": 0.8014, + "step": 39300 + }, + { + "epoch": 0.2511403856228358, + "grad_norm": 0.8349801301956177, + "learning_rate": 9.616290498363076e-05, + "loss": 0.9359, + "step": 39310 + }, + { + "epoch": 0.2512042727725745, + "grad_norm": 0.715552568435669, + "learning_rate": 9.616097705559306e-05, + "loss": 0.9922, + "step": 39320 + }, + { + "epoch": 0.2512681599223132, + "grad_norm": 0.7606783509254456, + "learning_rate": 9.615904866267438e-05, + "loss": 0.8454, + "step": 39330 + }, + { + "epoch": 0.2513320470720519, + "grad_norm": 0.4417531192302704, + "learning_rate": 9.615711980489415e-05, + "loss": 0.7793, + "step": 39340 + }, + { + "epoch": 0.25139593422179063, + "grad_norm": 0.9691148400306702, + "learning_rate": 9.615519048227178e-05, + "loss": 1.0334, + "step": 39350 + }, + { + "epoch": 0.25145982137152934, + "grad_norm": 0.9279484748840332, + "learning_rate": 9.615326069482673e-05, + "loss": 0.76, + "step": 39360 + }, + { + "epoch": 0.25152370852126804, + "grad_norm": 0.8048676252365112, + "learning_rate": 9.61513304425784e-05, + "loss": 0.7468, + "step": 39370 + }, + { + "epoch": 0.25158759567100675, + "grad_norm": 0.460409939289093, + "learning_rate": 9.614939972554626e-05, + "loss": 0.9653, + "step": 39380 + }, + { + "epoch": 0.25165148282074545, + "grad_norm": 0.5549013018608093, + "learning_rate": 9.614746854374972e-05, + "loss": 0.6636, + "step": 39390 + }, + { + "epoch": 
0.25171536997048416, + "grad_norm": 0.5946542024612427, + "learning_rate": 9.614553689720827e-05, + "loss": 1.0487, + "step": 39400 + }, + { + "epoch": 0.25177925712022287, + "grad_norm": 0.4577612578868866, + "learning_rate": 9.614360478594133e-05, + "loss": 0.7034, + "step": 39410 + }, + { + "epoch": 0.2518431442699615, + "grad_norm": 1.0320554971694946, + "learning_rate": 9.614167220996838e-05, + "loss": 1.0302, + "step": 39420 + }, + { + "epoch": 0.2519070314197002, + "grad_norm": 0.5677948594093323, + "learning_rate": 9.613973916930887e-05, + "loss": 0.7853, + "step": 39430 + }, + { + "epoch": 0.25197091856943893, + "grad_norm": 0.8020201325416565, + "learning_rate": 9.613780566398227e-05, + "loss": 0.9151, + "step": 39440 + }, + { + "epoch": 0.25203480571917763, + "grad_norm": 0.966617226600647, + "learning_rate": 9.613587169400805e-05, + "loss": 0.8891, + "step": 39450 + }, + { + "epoch": 0.25209869286891634, + "grad_norm": 1.3065134286880493, + "learning_rate": 9.613393725940568e-05, + "loss": 0.8974, + "step": 39460 + }, + { + "epoch": 0.25216258001865505, + "grad_norm": 0.45482340455055237, + "learning_rate": 9.613200236019466e-05, + "loss": 0.8328, + "step": 39470 + }, + { + "epoch": 0.25222646716839375, + "grad_norm": 1.2391636371612549, + "learning_rate": 9.613006699639446e-05, + "loss": 1.1481, + "step": 39480 + }, + { + "epoch": 0.25229035431813246, + "grad_norm": 0.6843194365501404, + "learning_rate": 9.612813116802459e-05, + "loss": 0.9104, + "step": 39490 + }, + { + "epoch": 0.25235424146787117, + "grad_norm": 0.9102997779846191, + "learning_rate": 9.612619487510452e-05, + "loss": 0.9072, + "step": 39500 + }, + { + "epoch": 0.25241812861760987, + "grad_norm": 1.0311905145645142, + "learning_rate": 9.612425811765376e-05, + "loss": 0.7641, + "step": 39510 + }, + { + "epoch": 0.2524820157673486, + "grad_norm": 0.7546457648277283, + "learning_rate": 9.612232089569183e-05, + "loss": 0.7963, + "step": 39520 + }, + { + "epoch": 0.2525459029170873, + "grad_norm": 0.6303521990776062, + "learning_rate": 9.612038320923822e-05, + "loss": 0.7462, + "step": 39530 + }, + { + "epoch": 0.25260979006682593, + "grad_norm": 0.8027763366699219, + "learning_rate": 9.611844505831245e-05, + "loss": 1.104, + "step": 39540 + }, + { + "epoch": 0.25267367721656464, + "grad_norm": 0.7128822803497314, + "learning_rate": 9.611650644293404e-05, + "loss": 0.8728, + "step": 39550 + }, + { + "epoch": 0.25273756436630335, + "grad_norm": 0.6497736573219299, + "learning_rate": 9.611456736312252e-05, + "loss": 0.7607, + "step": 39560 + }, + { + "epoch": 0.25280145151604205, + "grad_norm": 0.8743992447853088, + "learning_rate": 9.61126278188974e-05, + "loss": 0.7501, + "step": 39570 + }, + { + "epoch": 0.25286533866578076, + "grad_norm": 0.9536669254302979, + "learning_rate": 9.611068781027824e-05, + "loss": 0.9285, + "step": 39580 + }, + { + "epoch": 0.25292922581551947, + "grad_norm": 0.5790122747421265, + "learning_rate": 9.610874733728455e-05, + "loss": 0.9496, + "step": 39590 + }, + { + "epoch": 0.25299311296525817, + "grad_norm": 0.9158995151519775, + "learning_rate": 9.61068063999359e-05, + "loss": 0.6402, + "step": 39600 + }, + { + "epoch": 0.2530570001149969, + "grad_norm": 0.5689387321472168, + "learning_rate": 9.61048649982518e-05, + "loss": 0.8452, + "step": 39610 + }, + { + "epoch": 0.2531208872647356, + "grad_norm": 0.7655090093612671, + "learning_rate": 9.610292313225184e-05, + "loss": 0.8777, + "step": 39620 + }, + { + "epoch": 0.2531847744144743, + "grad_norm": 0.9562214612960815, + 
"learning_rate": 9.610098080195555e-05, + "loss": 0.8012, + "step": 39630 + }, + { + "epoch": 0.253248661564213, + "grad_norm": 0.9904442429542542, + "learning_rate": 9.609903800738251e-05, + "loss": 0.7636, + "step": 39640 + }, + { + "epoch": 0.2533125487139517, + "grad_norm": 1.2793811559677124, + "learning_rate": 9.609709474855226e-05, + "loss": 0.7783, + "step": 39650 + }, + { + "epoch": 0.25337643586369035, + "grad_norm": 1.0388842821121216, + "learning_rate": 9.60951510254844e-05, + "loss": 0.9002, + "step": 39660 + }, + { + "epoch": 0.25344032301342906, + "grad_norm": 0.5834752321243286, + "learning_rate": 9.60932068381985e-05, + "loss": 0.8864, + "step": 39670 + }, + { + "epoch": 0.25350421016316776, + "grad_norm": 0.7528826594352722, + "learning_rate": 9.609126218671411e-05, + "loss": 0.921, + "step": 39680 + }, + { + "epoch": 0.25356809731290647, + "grad_norm": 0.8979531526565552, + "learning_rate": 9.608931707105085e-05, + "loss": 0.7506, + "step": 39690 + }, + { + "epoch": 0.2536319844626452, + "grad_norm": 0.8471142649650574, + "learning_rate": 9.608737149122829e-05, + "loss": 0.7908, + "step": 39700 + }, + { + "epoch": 0.2536958716123839, + "grad_norm": 1.1389135122299194, + "learning_rate": 9.608542544726603e-05, + "loss": 0.6934, + "step": 39710 + }, + { + "epoch": 0.2537597587621226, + "grad_norm": 0.7484509944915771, + "learning_rate": 9.608347893918366e-05, + "loss": 0.8213, + "step": 39720 + }, + { + "epoch": 0.2538236459118613, + "grad_norm": 1.0670719146728516, + "learning_rate": 9.608153196700078e-05, + "loss": 0.8251, + "step": 39730 + }, + { + "epoch": 0.2538875330616, + "grad_norm": 0.6782344579696655, + "learning_rate": 9.607958453073702e-05, + "loss": 1.1657, + "step": 39740 + }, + { + "epoch": 0.2539514202113387, + "grad_norm": 1.3708243370056152, + "learning_rate": 9.607763663041198e-05, + "loss": 0.815, + "step": 39750 + }, + { + "epoch": 0.2540153073610774, + "grad_norm": 0.6719133853912354, + "learning_rate": 9.607568826604528e-05, + "loss": 0.8005, + "step": 39760 + }, + { + "epoch": 0.2540791945108161, + "grad_norm": 0.8270106315612793, + "learning_rate": 9.607373943765652e-05, + "loss": 0.9914, + "step": 39770 + }, + { + "epoch": 0.25414308166055477, + "grad_norm": 0.9956563711166382, + "learning_rate": 9.607179014526535e-05, + "loss": 0.8496, + "step": 39780 + }, + { + "epoch": 0.2542069688102935, + "grad_norm": 1.2813925743103027, + "learning_rate": 9.60698403888914e-05, + "loss": 0.8953, + "step": 39790 + }, + { + "epoch": 0.2542708559600322, + "grad_norm": 0.9512122273445129, + "learning_rate": 9.60678901685543e-05, + "loss": 1.2809, + "step": 39800 + }, + { + "epoch": 0.2543347431097709, + "grad_norm": 0.9539186954498291, + "learning_rate": 9.60659394842737e-05, + "loss": 1.0062, + "step": 39810 + }, + { + "epoch": 0.2543986302595096, + "grad_norm": 0.963093101978302, + "learning_rate": 9.606398833606923e-05, + "loss": 0.9276, + "step": 39820 + }, + { + "epoch": 0.2544625174092483, + "grad_norm": 0.8398544192314148, + "learning_rate": 9.606203672396055e-05, + "loss": 1.2115, + "step": 39830 + }, + { + "epoch": 0.254526404558987, + "grad_norm": 0.7242417335510254, + "learning_rate": 9.60600846479673e-05, + "loss": 0.7489, + "step": 39840 + }, + { + "epoch": 0.2545902917087257, + "grad_norm": 1.4944490194320679, + "learning_rate": 9.605813210810917e-05, + "loss": 1.3959, + "step": 39850 + }, + { + "epoch": 0.2546541788584644, + "grad_norm": 0.8167847394943237, + "learning_rate": 9.605617910440579e-05, + "loss": 0.8212, + "step": 39860 + }, + { + 
"epoch": 0.2547180660082031, + "grad_norm": 0.7867516279220581, + "learning_rate": 9.605422563687684e-05, + "loss": 1.1342, + "step": 39870 + }, + { + "epoch": 0.25478195315794183, + "grad_norm": 1.030428171157837, + "learning_rate": 9.605227170554201e-05, + "loss": 1.0195, + "step": 39880 + }, + { + "epoch": 0.25484584030768054, + "grad_norm": 0.6590962409973145, + "learning_rate": 9.605031731042094e-05, + "loss": 0.928, + "step": 39890 + }, + { + "epoch": 0.2549097274574192, + "grad_norm": 1.0177749395370483, + "learning_rate": 9.604836245153334e-05, + "loss": 0.8701, + "step": 39900 + }, + { + "epoch": 0.2549736146071579, + "grad_norm": 0.6760947108268738, + "learning_rate": 9.604640712889891e-05, + "loss": 0.75, + "step": 39910 + }, + { + "epoch": 0.2550375017568966, + "grad_norm": 0.6182072758674622, + "learning_rate": 9.604445134253731e-05, + "loss": 0.7936, + "step": 39920 + }, + { + "epoch": 0.2551013889066353, + "grad_norm": 2.0102100372314453, + "learning_rate": 9.604249509246826e-05, + "loss": 0.7663, + "step": 39930 + }, + { + "epoch": 0.255165276056374, + "grad_norm": 0.7477763295173645, + "learning_rate": 9.604053837871145e-05, + "loss": 0.7996, + "step": 39940 + }, + { + "epoch": 0.2552291632061127, + "grad_norm": 0.8741987943649292, + "learning_rate": 9.603858120128658e-05, + "loss": 0.6339, + "step": 39950 + }, + { + "epoch": 0.2552930503558514, + "grad_norm": 0.9012942314147949, + "learning_rate": 9.603662356021337e-05, + "loss": 1.0624, + "step": 39960 + }, + { + "epoch": 0.25535693750559013, + "grad_norm": 0.9270766973495483, + "learning_rate": 9.603466545551155e-05, + "loss": 0.8575, + "step": 39970 + }, + { + "epoch": 0.25542082465532884, + "grad_norm": 1.3213611841201782, + "learning_rate": 9.603270688720081e-05, + "loss": 1.0033, + "step": 39980 + }, + { + "epoch": 0.25548471180506754, + "grad_norm": 1.1009562015533447, + "learning_rate": 9.603074785530088e-05, + "loss": 1.2523, + "step": 39990 + }, + { + "epoch": 0.25554859895480625, + "grad_norm": 0.6423203945159912, + "learning_rate": 9.602878835983151e-05, + "loss": 1.3048, + "step": 40000 + }, + { + "epoch": 0.25561248610454496, + "grad_norm": 0.747044026851654, + "learning_rate": 9.60268284008124e-05, + "loss": 0.8828, + "step": 40010 + }, + { + "epoch": 0.2556763732542836, + "grad_norm": 2.6469714641571045, + "learning_rate": 9.602486797826333e-05, + "loss": 1.0208, + "step": 40020 + }, + { + "epoch": 0.2557402604040223, + "grad_norm": 1.1847596168518066, + "learning_rate": 9.602290709220403e-05, + "loss": 1.0072, + "step": 40030 + }, + { + "epoch": 0.255804147553761, + "grad_norm": 2.8336246013641357, + "learning_rate": 9.602094574265421e-05, + "loss": 1.1539, + "step": 40040 + }, + { + "epoch": 0.2558680347034997, + "grad_norm": 0.8645704388618469, + "learning_rate": 9.601898392963368e-05, + "loss": 0.9056, + "step": 40050 + }, + { + "epoch": 0.25593192185323843, + "grad_norm": 1.049857258796692, + "learning_rate": 9.601702165316216e-05, + "loss": 0.9048, + "step": 40060 + }, + { + "epoch": 0.25599580900297714, + "grad_norm": 0.4633677005767822, + "learning_rate": 9.601505891325941e-05, + "loss": 0.8304, + "step": 40070 + }, + { + "epoch": 0.25605969615271584, + "grad_norm": 0.9349238872528076, + "learning_rate": 9.601309570994522e-05, + "loss": 0.8373, + "step": 40080 + }, + { + "epoch": 0.25612358330245455, + "grad_norm": 0.8270478844642639, + "learning_rate": 9.601113204323935e-05, + "loss": 0.9072, + "step": 40090 + }, + { + "epoch": 0.25618747045219326, + "grad_norm": 0.9788760542869568, + 
"learning_rate": 9.600916791316157e-05, + "loss": 0.9037, + "step": 40100 + }, + { + "epoch": 0.25625135760193196, + "grad_norm": 0.9276544451713562, + "learning_rate": 9.600720331973167e-05, + "loss": 0.855, + "step": 40110 + }, + { + "epoch": 0.25631524475167067, + "grad_norm": 0.6395992636680603, + "learning_rate": 9.600523826296943e-05, + "loss": 0.7903, + "step": 40120 + }, + { + "epoch": 0.2563791319014094, + "grad_norm": 0.8007004261016846, + "learning_rate": 9.600327274289464e-05, + "loss": 0.8177, + "step": 40130 + }, + { + "epoch": 0.256443019051148, + "grad_norm": 1.1678056716918945, + "learning_rate": 9.60013067595271e-05, + "loss": 0.8526, + "step": 40140 + }, + { + "epoch": 0.25650690620088673, + "grad_norm": 0.8026590347290039, + "learning_rate": 9.59993403128866e-05, + "loss": 0.9358, + "step": 40150 + }, + { + "epoch": 0.25657079335062544, + "grad_norm": 1.020652174949646, + "learning_rate": 9.599737340299294e-05, + "loss": 1.0027, + "step": 40160 + }, + { + "epoch": 0.25663468050036414, + "grad_norm": 0.8998063206672668, + "learning_rate": 9.599540602986594e-05, + "loss": 0.9003, + "step": 40170 + }, + { + "epoch": 0.25669856765010285, + "grad_norm": 1.1373684406280518, + "learning_rate": 9.599343819352542e-05, + "loss": 0.9378, + "step": 40180 + }, + { + "epoch": 0.25676245479984156, + "grad_norm": 0.9886625409126282, + "learning_rate": 9.599146989399117e-05, + "loss": 0.7858, + "step": 40190 + }, + { + "epoch": 0.25682634194958026, + "grad_norm": 0.899255633354187, + "learning_rate": 9.598950113128304e-05, + "loss": 0.829, + "step": 40200 + }, + { + "epoch": 0.25689022909931897, + "grad_norm": 1.3470916748046875, + "learning_rate": 9.598753190542086e-05, + "loss": 1.1775, + "step": 40210 + }, + { + "epoch": 0.2569541162490577, + "grad_norm": 0.6345730423927307, + "learning_rate": 9.598556221642443e-05, + "loss": 0.9636, + "step": 40220 + }, + { + "epoch": 0.2570180033987964, + "grad_norm": 0.9716305136680603, + "learning_rate": 9.598359206431362e-05, + "loss": 0.7459, + "step": 40230 + }, + { + "epoch": 0.2570818905485351, + "grad_norm": 0.5304000377655029, + "learning_rate": 9.598162144910824e-05, + "loss": 1.0725, + "step": 40240 + }, + { + "epoch": 0.2571457776982738, + "grad_norm": 0.5957240462303162, + "learning_rate": 9.597965037082817e-05, + "loss": 0.6653, + "step": 40250 + }, + { + "epoch": 0.2572096648480125, + "grad_norm": 0.4943158030509949, + "learning_rate": 9.597767882949322e-05, + "loss": 0.86, + "step": 40260 + }, + { + "epoch": 0.25727355199775115, + "grad_norm": 0.5531061291694641, + "learning_rate": 9.59757068251233e-05, + "loss": 1.0038, + "step": 40270 + }, + { + "epoch": 0.25733743914748985, + "grad_norm": 0.747992753982544, + "learning_rate": 9.59737343577382e-05, + "loss": 0.7299, + "step": 40280 + }, + { + "epoch": 0.25740132629722856, + "grad_norm": 0.6985594034194946, + "learning_rate": 9.597176142735784e-05, + "loss": 0.7868, + "step": 40290 + }, + { + "epoch": 0.25746521344696727, + "grad_norm": 0.7488262057304382, + "learning_rate": 9.596978803400207e-05, + "loss": 0.8295, + "step": 40300 + }, + { + "epoch": 0.257529100596706, + "grad_norm": 1.0540517568588257, + "learning_rate": 9.596781417769076e-05, + "loss": 0.9902, + "step": 40310 + }, + { + "epoch": 0.2575929877464447, + "grad_norm": 0.7886415123939514, + "learning_rate": 9.596583985844381e-05, + "loss": 1.1803, + "step": 40320 + }, + { + "epoch": 0.2576568748961834, + "grad_norm": 0.8558555841445923, + "learning_rate": 9.596386507628108e-05, + "loss": 0.7431, + "step": 40330 + }, + 
{ + "epoch": 0.2577207620459221, + "grad_norm": 1.4910976886749268, + "learning_rate": 9.596188983122246e-05, + "loss": 0.804, + "step": 40340 + }, + { + "epoch": 0.2577846491956608, + "grad_norm": 0.7117838859558105, + "learning_rate": 9.595991412328784e-05, + "loss": 1.1232, + "step": 40350 + }, + { + "epoch": 0.2578485363453995, + "grad_norm": 1.4588252305984497, + "learning_rate": 9.595793795249714e-05, + "loss": 0.8111, + "step": 40360 + }, + { + "epoch": 0.2579124234951382, + "grad_norm": 1.322583794593811, + "learning_rate": 9.595596131887024e-05, + "loss": 0.6423, + "step": 40370 + }, + { + "epoch": 0.2579763106448769, + "grad_norm": 1.1012037992477417, + "learning_rate": 9.595398422242702e-05, + "loss": 1.0775, + "step": 40380 + }, + { + "epoch": 0.25804019779461557, + "grad_norm": 0.7852954864501953, + "learning_rate": 9.595200666318746e-05, + "loss": 0.9674, + "step": 40390 + }, + { + "epoch": 0.2581040849443543, + "grad_norm": 0.9846484661102295, + "learning_rate": 9.595002864117144e-05, + "loss": 1.0256, + "step": 40400 + }, + { + "epoch": 0.258167972094093, + "grad_norm": 0.7954578399658203, + "learning_rate": 9.594805015639887e-05, + "loss": 0.8524, + "step": 40410 + }, + { + "epoch": 0.2582318592438317, + "grad_norm": 1.4870191812515259, + "learning_rate": 9.594607120888968e-05, + "loss": 1.0345, + "step": 40420 + }, + { + "epoch": 0.2582957463935704, + "grad_norm": 0.8756714463233948, + "learning_rate": 9.594409179866382e-05, + "loss": 0.7956, + "step": 40430 + }, + { + "epoch": 0.2583596335433091, + "grad_norm": 0.9294307231903076, + "learning_rate": 9.594211192574119e-05, + "loss": 1.2266, + "step": 40440 + }, + { + "epoch": 0.2584235206930478, + "grad_norm": 0.6904338598251343, + "learning_rate": 9.594013159014174e-05, + "loss": 1.1333, + "step": 40450 + }, + { + "epoch": 0.2584874078427865, + "grad_norm": 0.9498498439788818, + "learning_rate": 9.593815079188544e-05, + "loss": 0.8015, + "step": 40460 + }, + { + "epoch": 0.2585512949925252, + "grad_norm": 0.686410665512085, + "learning_rate": 9.593616953099222e-05, + "loss": 1.0482, + "step": 40470 + }, + { + "epoch": 0.2586151821422639, + "grad_norm": 0.8870381116867065, + "learning_rate": 9.593418780748203e-05, + "loss": 0.9889, + "step": 40480 + }, + { + "epoch": 0.25867906929200263, + "grad_norm": 0.6513268947601318, + "learning_rate": 9.593220562137481e-05, + "loss": 0.8747, + "step": 40490 + }, + { + "epoch": 0.25874295644174133, + "grad_norm": 0.8651543855667114, + "learning_rate": 9.593022297269056e-05, + "loss": 0.8722, + "step": 40500 + }, + { + "epoch": 0.25880684359148, + "grad_norm": 0.8488206267356873, + "learning_rate": 9.592823986144923e-05, + "loss": 0.8462, + "step": 40510 + }, + { + "epoch": 0.2588707307412187, + "grad_norm": 1.4681724309921265, + "learning_rate": 9.592625628767079e-05, + "loss": 0.8677, + "step": 40520 + }, + { + "epoch": 0.2589346178909574, + "grad_norm": 4.392560958862305, + "learning_rate": 9.592427225137521e-05, + "loss": 0.8277, + "step": 40530 + }, + { + "epoch": 0.2589985050406961, + "grad_norm": 0.7526928782463074, + "learning_rate": 9.59222877525825e-05, + "loss": 0.9594, + "step": 40540 + }, + { + "epoch": 0.2590623921904348, + "grad_norm": 0.8426737785339355, + "learning_rate": 9.59203027913126e-05, + "loss": 0.9697, + "step": 40550 + }, + { + "epoch": 0.2591262793401735, + "grad_norm": 1.0020760297775269, + "learning_rate": 9.591831736758553e-05, + "loss": 1.0801, + "step": 40560 + }, + { + "epoch": 0.2591901664899122, + "grad_norm": 0.746593177318573, + "learning_rate": 
9.591633148142129e-05, + "loss": 0.9383, + "step": 40570 + }, + { + "epoch": 0.2592540536396509, + "grad_norm": 0.7140881419181824, + "learning_rate": 9.591434513283986e-05, + "loss": 0.8904, + "step": 40580 + }, + { + "epoch": 0.25931794078938963, + "grad_norm": 0.9984216690063477, + "learning_rate": 9.591235832186125e-05, + "loss": 0.8797, + "step": 40590 + }, + { + "epoch": 0.25938182793912834, + "grad_norm": 0.9869062304496765, + "learning_rate": 9.591037104850546e-05, + "loss": 0.8081, + "step": 40600 + }, + { + "epoch": 0.25944571508886705, + "grad_norm": 1.0706651210784912, + "learning_rate": 9.590838331279255e-05, + "loss": 1.1206, + "step": 40610 + }, + { + "epoch": 0.25950960223860575, + "grad_norm": 1.028882384300232, + "learning_rate": 9.590639511474248e-05, + "loss": 0.9107, + "step": 40620 + }, + { + "epoch": 0.2595734893883444, + "grad_norm": 1.1108314990997314, + "learning_rate": 9.590440645437529e-05, + "loss": 0.8844, + "step": 40630 + }, + { + "epoch": 0.2596373765380831, + "grad_norm": 0.9321774840354919, + "learning_rate": 9.590241733171104e-05, + "loss": 0.8686, + "step": 40640 + }, + { + "epoch": 0.2597012636878218, + "grad_norm": 1.029826283454895, + "learning_rate": 9.59004277467697e-05, + "loss": 1.1136, + "step": 40650 + }, + { + "epoch": 0.2597651508375605, + "grad_norm": 0.908687949180603, + "learning_rate": 9.589843769957138e-05, + "loss": 0.7381, + "step": 40660 + }, + { + "epoch": 0.2598290379872992, + "grad_norm": 0.565542459487915, + "learning_rate": 9.589644719013607e-05, + "loss": 0.9248, + "step": 40670 + }, + { + "epoch": 0.25989292513703793, + "grad_norm": 0.7926294803619385, + "learning_rate": 9.589445621848384e-05, + "loss": 0.8401, + "step": 40680 + }, + { + "epoch": 0.25995681228677664, + "grad_norm": 0.6880931854248047, + "learning_rate": 9.589246478463471e-05, + "loss": 1.0261, + "step": 40690 + }, + { + "epoch": 0.26002069943651535, + "grad_norm": 1.0573099851608276, + "learning_rate": 9.589047288860876e-05, + "loss": 0.7308, + "step": 40700 + }, + { + "epoch": 0.26008458658625405, + "grad_norm": 1.2168048620224, + "learning_rate": 9.588848053042605e-05, + "loss": 0.9018, + "step": 40710 + }, + { + "epoch": 0.26014847373599276, + "grad_norm": 0.883126437664032, + "learning_rate": 9.588648771010666e-05, + "loss": 1.0485, + "step": 40720 + }, + { + "epoch": 0.26021236088573146, + "grad_norm": 0.7025336623191833, + "learning_rate": 9.588449442767062e-05, + "loss": 0.8312, + "step": 40730 + }, + { + "epoch": 0.26027624803547017, + "grad_norm": 1.2748278379440308, + "learning_rate": 9.588250068313803e-05, + "loss": 1.1546, + "step": 40740 + }, + { + "epoch": 0.2603401351852088, + "grad_norm": 0.5718688368797302, + "learning_rate": 9.588050647652898e-05, + "loss": 1.0319, + "step": 40750 + }, + { + "epoch": 0.2604040223349475, + "grad_norm": 0.8465229868888855, + "learning_rate": 9.587851180786351e-05, + "loss": 0.9557, + "step": 40760 + }, + { + "epoch": 0.26046790948468623, + "grad_norm": 0.9384807348251343, + "learning_rate": 9.587651667716175e-05, + "loss": 0.9037, + "step": 40770 + }, + { + "epoch": 0.26053179663442494, + "grad_norm": 0.8962541222572327, + "learning_rate": 9.58745210844438e-05, + "loss": 0.782, + "step": 40780 + }, + { + "epoch": 0.26059568378416365, + "grad_norm": 1.1555324792861938, + "learning_rate": 9.58725250297297e-05, + "loss": 0.9214, + "step": 40790 + }, + { + "epoch": 0.26065957093390235, + "grad_norm": 0.5348967909812927, + "learning_rate": 9.587052851303961e-05, + "loss": 0.9402, + "step": 40800 + }, + { + "epoch": 
0.26072345808364106, + "grad_norm": 0.9009259343147278, + "learning_rate": 9.586853153439359e-05, + "loss": 0.8024, + "step": 40810 + }, + { + "epoch": 0.26078734523337976, + "grad_norm": 0.7219264507293701, + "learning_rate": 9.58665340938118e-05, + "loss": 0.6523, + "step": 40820 + }, + { + "epoch": 0.26085123238311847, + "grad_norm": 0.5624069571495056, + "learning_rate": 9.586453619131432e-05, + "loss": 0.8817, + "step": 40830 + }, + { + "epoch": 0.2609151195328572, + "grad_norm": 0.8910852074623108, + "learning_rate": 9.586253782692129e-05, + "loss": 0.9517, + "step": 40840 + }, + { + "epoch": 0.2609790066825959, + "grad_norm": 0.9586001038551331, + "learning_rate": 9.586053900065282e-05, + "loss": 0.788, + "step": 40850 + }, + { + "epoch": 0.2610428938323346, + "grad_norm": 0.696487307548523, + "learning_rate": 9.585853971252905e-05, + "loss": 0.8417, + "step": 40860 + }, + { + "epoch": 0.26110678098207324, + "grad_norm": 0.5967971682548523, + "learning_rate": 9.585653996257011e-05, + "loss": 0.6904, + "step": 40870 + }, + { + "epoch": 0.26117066813181194, + "grad_norm": 0.7355442643165588, + "learning_rate": 9.585453975079615e-05, + "loss": 0.8139, + "step": 40880 + }, + { + "epoch": 0.26123455528155065, + "grad_norm": 1.0505903959274292, + "learning_rate": 9.585253907722729e-05, + "loss": 0.9211, + "step": 40890 + }, + { + "epoch": 0.26129844243128936, + "grad_norm": 0.8949944376945496, + "learning_rate": 9.58505379418837e-05, + "loss": 0.7938, + "step": 40900 + }, + { + "epoch": 0.26136232958102806, + "grad_norm": 0.9528142809867859, + "learning_rate": 9.584853634478553e-05, + "loss": 1.1102, + "step": 40910 + }, + { + "epoch": 0.26142621673076677, + "grad_norm": 1.1576639413833618, + "learning_rate": 9.584653428595294e-05, + "loss": 1.1977, + "step": 40920 + }, + { + "epoch": 0.2614901038805055, + "grad_norm": 0.895746648311615, + "learning_rate": 9.584453176540607e-05, + "loss": 0.9543, + "step": 40930 + }, + { + "epoch": 0.2615539910302442, + "grad_norm": 0.5793939828872681, + "learning_rate": 9.58425287831651e-05, + "loss": 0.7111, + "step": 40940 + }, + { + "epoch": 0.2616178781799829, + "grad_norm": 0.8070379495620728, + "learning_rate": 9.584052533925023e-05, + "loss": 0.9304, + "step": 40950 + }, + { + "epoch": 0.2616817653297216, + "grad_norm": 0.7948583364486694, + "learning_rate": 9.583852143368159e-05, + "loss": 0.8819, + "step": 40960 + }, + { + "epoch": 0.2617456524794603, + "grad_norm": 0.9934036731719971, + "learning_rate": 9.58365170664794e-05, + "loss": 0.9324, + "step": 40970 + }, + { + "epoch": 0.261809539629199, + "grad_norm": 0.47576090693473816, + "learning_rate": 9.583451223766382e-05, + "loss": 0.686, + "step": 40980 + }, + { + "epoch": 0.26187342677893766, + "grad_norm": 0.9381804466247559, + "learning_rate": 9.583250694725505e-05, + "loss": 1.1989, + "step": 40990 + }, + { + "epoch": 0.26193731392867636, + "grad_norm": 0.6319658756256104, + "learning_rate": 9.58305011952733e-05, + "loss": 0.9446, + "step": 41000 + }, + { + "epoch": 0.26200120107841507, + "grad_norm": 0.8187490701675415, + "learning_rate": 9.582849498173873e-05, + "loss": 1.0167, + "step": 41010 + }, + { + "epoch": 0.2620650882281538, + "grad_norm": 0.682817816734314, + "learning_rate": 9.582648830667157e-05, + "loss": 0.8066, + "step": 41020 + }, + { + "epoch": 0.2621289753778925, + "grad_norm": 1.3718403577804565, + "learning_rate": 9.582448117009205e-05, + "loss": 0.6587, + "step": 41030 + }, + { + "epoch": 0.2621928625276312, + "grad_norm": 0.7690182328224182, + "learning_rate": 
9.582247357202035e-05, + "loss": 0.8507, + "step": 41040 + }, + { + "epoch": 0.2622567496773699, + "grad_norm": 1.0088491439819336, + "learning_rate": 9.58204655124767e-05, + "loss": 0.907, + "step": 41050 + }, + { + "epoch": 0.2623206368271086, + "grad_norm": 0.912486732006073, + "learning_rate": 9.581845699148132e-05, + "loss": 1.0735, + "step": 41060 + }, + { + "epoch": 0.2623845239768473, + "grad_norm": 0.9121546149253845, + "learning_rate": 9.581644800905442e-05, + "loss": 0.8556, + "step": 41070 + }, + { + "epoch": 0.262448411126586, + "grad_norm": 0.8414210677146912, + "learning_rate": 9.581443856521628e-05, + "loss": 1.1905, + "step": 41080 + }, + { + "epoch": 0.2625122982763247, + "grad_norm": 0.5232017040252686, + "learning_rate": 9.58124286599871e-05, + "loss": 0.8749, + "step": 41090 + }, + { + "epoch": 0.2625761854260634, + "grad_norm": 1.9335732460021973, + "learning_rate": 9.581041829338712e-05, + "loss": 0.7256, + "step": 41100 + }, + { + "epoch": 0.26264007257580213, + "grad_norm": 0.7388540506362915, + "learning_rate": 9.58084074654366e-05, + "loss": 0.8657, + "step": 41110 + }, + { + "epoch": 0.2627039597255408, + "grad_norm": 1.9808521270751953, + "learning_rate": 9.580639617615579e-05, + "loss": 0.7139, + "step": 41120 + }, + { + "epoch": 0.2627678468752795, + "grad_norm": 1.5845258235931396, + "learning_rate": 9.580438442556494e-05, + "loss": 0.9972, + "step": 41130 + }, + { + "epoch": 0.2628317340250182, + "grad_norm": 0.8241519331932068, + "learning_rate": 9.580237221368431e-05, + "loss": 0.7273, + "step": 41140 + }, + { + "epoch": 0.2628956211747569, + "grad_norm": 0.8821679353713989, + "learning_rate": 9.580035954053418e-05, + "loss": 0.8493, + "step": 41150 + }, + { + "epoch": 0.2629595083244956, + "grad_norm": 0.7632741332054138, + "learning_rate": 9.57983464061348e-05, + "loss": 1.0987, + "step": 41160 + }, + { + "epoch": 0.2630233954742343, + "grad_norm": 0.6200475692749023, + "learning_rate": 9.579633281050644e-05, + "loss": 0.7774, + "step": 41170 + }, + { + "epoch": 0.263087282623973, + "grad_norm": 0.7186120748519897, + "learning_rate": 9.57943187536694e-05, + "loss": 1.0146, + "step": 41180 + }, + { + "epoch": 0.2631511697737117, + "grad_norm": 1.4124023914337158, + "learning_rate": 9.579230423564395e-05, + "loss": 0.9421, + "step": 41190 + }, + { + "epoch": 0.26321505692345043, + "grad_norm": 1.0203825235366821, + "learning_rate": 9.579028925645038e-05, + "loss": 0.7599, + "step": 41200 + }, + { + "epoch": 0.26327894407318914, + "grad_norm": 0.9980260729789734, + "learning_rate": 9.578827381610899e-05, + "loss": 0.7085, + "step": 41210 + }, + { + "epoch": 0.26334283122292784, + "grad_norm": 0.6271802186965942, + "learning_rate": 9.578625791464006e-05, + "loss": 0.7378, + "step": 41220 + }, + { + "epoch": 0.26340671837266655, + "grad_norm": 0.8588720560073853, + "learning_rate": 9.578424155206392e-05, + "loss": 1.1045, + "step": 41230 + }, + { + "epoch": 0.2634706055224052, + "grad_norm": 0.9197202920913696, + "learning_rate": 9.578222472840083e-05, + "loss": 0.7892, + "step": 41240 + }, + { + "epoch": 0.2635344926721439, + "grad_norm": 1.5513139963150024, + "learning_rate": 9.578020744367115e-05, + "loss": 0.9384, + "step": 41250 + }, + { + "epoch": 0.2635983798218826, + "grad_norm": 0.952202320098877, + "learning_rate": 9.577818969789516e-05, + "loss": 1.0154, + "step": 41260 + }, + { + "epoch": 0.2636622669716213, + "grad_norm": 0.7039241790771484, + "learning_rate": 9.577617149109322e-05, + "loss": 0.8493, + "step": 41270 + }, + { + "epoch": 
0.26372615412136, + "grad_norm": 0.8046781420707703, + "learning_rate": 9.577415282328561e-05, + "loss": 0.8281, + "step": 41280 + }, + { + "epoch": 0.26379004127109873, + "grad_norm": 1.6643345355987549, + "learning_rate": 9.57721336944927e-05, + "loss": 0.9271, + "step": 41290 + }, + { + "epoch": 0.26385392842083744, + "grad_norm": 0.9182053208351135, + "learning_rate": 9.577011410473477e-05, + "loss": 0.9575, + "step": 41300 + }, + { + "epoch": 0.26391781557057614, + "grad_norm": 1.2427566051483154, + "learning_rate": 9.576809405403222e-05, + "loss": 1.1546, + "step": 41310 + }, + { + "epoch": 0.26398170272031485, + "grad_norm": 1.6159720420837402, + "learning_rate": 9.576607354240536e-05, + "loss": 0.8444, + "step": 41320 + }, + { + "epoch": 0.26404558987005355, + "grad_norm": 1.222095012664795, + "learning_rate": 9.576405256987456e-05, + "loss": 1.0805, + "step": 41330 + }, + { + "epoch": 0.26410947701979226, + "grad_norm": 0.5747536420822144, + "learning_rate": 9.576203113646015e-05, + "loss": 0.9668, + "step": 41340 + }, + { + "epoch": 0.26417336416953097, + "grad_norm": 0.505827009677887, + "learning_rate": 9.576000924218249e-05, + "loss": 0.812, + "step": 41350 + }, + { + "epoch": 0.2642372513192696, + "grad_norm": 0.9075201749801636, + "learning_rate": 9.575798688706196e-05, + "loss": 0.907, + "step": 41360 + }, + { + "epoch": 0.2643011384690083, + "grad_norm": 0.6353416442871094, + "learning_rate": 9.575596407111891e-05, + "loss": 0.9169, + "step": 41370 + }, + { + "epoch": 0.26436502561874703, + "grad_norm": 0.8017897009849548, + "learning_rate": 9.575394079437372e-05, + "loss": 0.8862, + "step": 41380 + }, + { + "epoch": 0.26442891276848574, + "grad_norm": 0.6533048748970032, + "learning_rate": 9.575191705684676e-05, + "loss": 1.004, + "step": 41390 + }, + { + "epoch": 0.26449279991822444, + "grad_norm": 0.6979532837867737, + "learning_rate": 9.574989285855842e-05, + "loss": 0.7255, + "step": 41400 + }, + { + "epoch": 0.26455668706796315, + "grad_norm": 1.3084895610809326, + "learning_rate": 9.574786819952908e-05, + "loss": 0.9328, + "step": 41410 + }, + { + "epoch": 0.26462057421770185, + "grad_norm": 0.8081639409065247, + "learning_rate": 9.574584307977912e-05, + "loss": 1.1026, + "step": 41420 + }, + { + "epoch": 0.26468446136744056, + "grad_norm": 0.6545292735099792, + "learning_rate": 9.574381749932894e-05, + "loss": 0.8194, + "step": 41430 + }, + { + "epoch": 0.26474834851717927, + "grad_norm": 1.0248223543167114, + "learning_rate": 9.574179145819898e-05, + "loss": 0.9576, + "step": 41440 + }, + { + "epoch": 0.26481223566691797, + "grad_norm": 1.235369086265564, + "learning_rate": 9.573976495640958e-05, + "loss": 0.758, + "step": 41450 + }, + { + "epoch": 0.2648761228166567, + "grad_norm": 0.8014651536941528, + "learning_rate": 9.573773799398116e-05, + "loss": 0.7959, + "step": 41460 + }, + { + "epoch": 0.2649400099663954, + "grad_norm": 0.6294002532958984, + "learning_rate": 9.573571057093418e-05, + "loss": 0.9757, + "step": 41470 + }, + { + "epoch": 0.26500389711613404, + "grad_norm": 0.9327560663223267, + "learning_rate": 9.573368268728901e-05, + "loss": 1.1375, + "step": 41480 + }, + { + "epoch": 0.26506778426587274, + "grad_norm": 1.0414352416992188, + "learning_rate": 9.57316543430661e-05, + "loss": 1.0202, + "step": 41490 + }, + { + "epoch": 0.26513167141561145, + "grad_norm": 1.695721983909607, + "learning_rate": 9.572962553828586e-05, + "loss": 0.8127, + "step": 41500 + }, + { + "epoch": 0.26519555856535015, + "grad_norm": 0.8339122533798218, + "learning_rate": 
9.572759627296872e-05, + "loss": 0.7414, + "step": 41510 + }, + { + "epoch": 0.26525944571508886, + "grad_norm": 0.8159180283546448, + "learning_rate": 9.572556654713514e-05, + "loss": 0.811, + "step": 41520 + }, + { + "epoch": 0.26532333286482757, + "grad_norm": 0.9597871899604797, + "learning_rate": 9.572353636080555e-05, + "loss": 1.089, + "step": 41530 + }, + { + "epoch": 0.26538722001456627, + "grad_norm": 0.5867129564285278, + "learning_rate": 9.572150571400038e-05, + "loss": 0.9204, + "step": 41540 + }, + { + "epoch": 0.265451107164305, + "grad_norm": 0.706292450428009, + "learning_rate": 9.57194746067401e-05, + "loss": 0.8317, + "step": 41550 + }, + { + "epoch": 0.2655149943140437, + "grad_norm": 1.7897143363952637, + "learning_rate": 9.571744303904515e-05, + "loss": 0.9645, + "step": 41560 + }, + { + "epoch": 0.2655788814637824, + "grad_norm": 0.9594964981079102, + "learning_rate": 9.571541101093602e-05, + "loss": 0.9729, + "step": 41570 + }, + { + "epoch": 0.2656427686135211, + "grad_norm": 0.7055974006652832, + "learning_rate": 9.571337852243313e-05, + "loss": 0.8736, + "step": 41580 + }, + { + "epoch": 0.2657066557632598, + "grad_norm": 0.8249185681343079, + "learning_rate": 9.571134557355697e-05, + "loss": 0.9665, + "step": 41590 + }, + { + "epoch": 0.26577054291299845, + "grad_norm": 0.5963894128799438, + "learning_rate": 9.570931216432801e-05, + "loss": 0.8331, + "step": 41600 + }, + { + "epoch": 0.26583443006273716, + "grad_norm": 0.5887112617492676, + "learning_rate": 9.570727829476676e-05, + "loss": 0.8705, + "step": 41610 + }, + { + "epoch": 0.26589831721247587, + "grad_norm": 0.7210960388183594, + "learning_rate": 9.570524396489365e-05, + "loss": 0.7763, + "step": 41620 + }, + { + "epoch": 0.26596220436221457, + "grad_norm": 0.8247012495994568, + "learning_rate": 9.570320917472919e-05, + "loss": 0.8148, + "step": 41630 + }, + { + "epoch": 0.2660260915119533, + "grad_norm": 0.8511599898338318, + "learning_rate": 9.57011739242939e-05, + "loss": 0.9971, + "step": 41640 + }, + { + "epoch": 0.266089978661692, + "grad_norm": 0.6672869324684143, + "learning_rate": 9.569913821360824e-05, + "loss": 0.9768, + "step": 41650 + }, + { + "epoch": 0.2661538658114307, + "grad_norm": 1.1974848508834839, + "learning_rate": 9.569710204269271e-05, + "loss": 0.991, + "step": 41660 + }, + { + "epoch": 0.2662177529611694, + "grad_norm": 0.8013436198234558, + "learning_rate": 9.569506541156784e-05, + "loss": 0.8786, + "step": 41670 + }, + { + "epoch": 0.2662816401109081, + "grad_norm": 0.7496063113212585, + "learning_rate": 9.569302832025413e-05, + "loss": 0.858, + "step": 41680 + }, + { + "epoch": 0.2663455272606468, + "grad_norm": 0.8424622416496277, + "learning_rate": 9.569099076877208e-05, + "loss": 0.6938, + "step": 41690 + }, + { + "epoch": 0.2664094144103855, + "grad_norm": 0.8976988196372986, + "learning_rate": 9.568895275714225e-05, + "loss": 0.9233, + "step": 41700 + }, + { + "epoch": 0.2664733015601242, + "grad_norm": 1.4451935291290283, + "learning_rate": 9.568691428538512e-05, + "loss": 1.0334, + "step": 41710 + }, + { + "epoch": 0.26653718870986287, + "grad_norm": 1.2237658500671387, + "learning_rate": 9.568487535352124e-05, + "loss": 0.8801, + "step": 41720 + }, + { + "epoch": 0.2666010758596016, + "grad_norm": 2.1000301837921143, + "learning_rate": 9.568283596157115e-05, + "loss": 1.0871, + "step": 41730 + }, + { + "epoch": 0.2666649630093403, + "grad_norm": 0.8955364227294922, + "learning_rate": 9.568079610955539e-05, + "loss": 0.8636, + "step": 41740 + }, + { + "epoch": 
0.266728850159079, + "grad_norm": 0.9859808087348938, + "learning_rate": 9.567875579749447e-05, + "loss": 0.9954, + "step": 41750 + }, + { + "epoch": 0.2667927373088177, + "grad_norm": 1.0908890962600708, + "learning_rate": 9.567671502540897e-05, + "loss": 0.8674, + "step": 41760 + }, + { + "epoch": 0.2668566244585564, + "grad_norm": 0.48835402727127075, + "learning_rate": 9.567467379331943e-05, + "loss": 1.2828, + "step": 41770 + }, + { + "epoch": 0.2669205116082951, + "grad_norm": 0.7387278079986572, + "learning_rate": 9.567263210124641e-05, + "loss": 0.9535, + "step": 41780 + }, + { + "epoch": 0.2669843987580338, + "grad_norm": 0.813470184803009, + "learning_rate": 9.567058994921049e-05, + "loss": 0.9851, + "step": 41790 + }, + { + "epoch": 0.2670482859077725, + "grad_norm": 0.8729559779167175, + "learning_rate": 9.566854733723221e-05, + "loss": 0.8694, + "step": 41800 + }, + { + "epoch": 0.2671121730575112, + "grad_norm": 0.9543429613113403, + "learning_rate": 9.566650426533214e-05, + "loss": 0.8436, + "step": 41810 + }, + { + "epoch": 0.26717606020724993, + "grad_norm": 1.1418914794921875, + "learning_rate": 9.566446073353089e-05, + "loss": 0.9252, + "step": 41820 + }, + { + "epoch": 0.26723994735698864, + "grad_norm": 0.9289169907569885, + "learning_rate": 9.566241674184898e-05, + "loss": 0.9306, + "step": 41830 + }, + { + "epoch": 0.2673038345067273, + "grad_norm": 1.3721492290496826, + "learning_rate": 9.566037229030704e-05, + "loss": 1.0788, + "step": 41840 + }, + { + "epoch": 0.267367721656466, + "grad_norm": 0.7210074067115784, + "learning_rate": 9.565832737892566e-05, + "loss": 0.7705, + "step": 41850 + }, + { + "epoch": 0.2674316088062047, + "grad_norm": 0.6854256391525269, + "learning_rate": 9.565628200772542e-05, + "loss": 0.8524, + "step": 41860 + }, + { + "epoch": 0.2674954959559434, + "grad_norm": 1.1195999383926392, + "learning_rate": 9.565423617672691e-05, + "loss": 1.0449, + "step": 41870 + }, + { + "epoch": 0.2675593831056821, + "grad_norm": 1.1100611686706543, + "learning_rate": 9.565218988595077e-05, + "loss": 0.9949, + "step": 41880 + }, + { + "epoch": 0.2676232702554208, + "grad_norm": 0.6192795038223267, + "learning_rate": 9.565014313541756e-05, + "loss": 1.0524, + "step": 41890 + }, + { + "epoch": 0.2676871574051595, + "grad_norm": 1.138809084892273, + "learning_rate": 9.564809592514793e-05, + "loss": 1.0164, + "step": 41900 + }, + { + "epoch": 0.26775104455489823, + "grad_norm": 0.6620566844940186, + "learning_rate": 9.564604825516248e-05, + "loss": 0.9091, + "step": 41910 + }, + { + "epoch": 0.26781493170463694, + "grad_norm": 0.8734396696090698, + "learning_rate": 9.564400012548183e-05, + "loss": 1.1338, + "step": 41920 + }, + { + "epoch": 0.26787881885437564, + "grad_norm": 0.5580737590789795, + "learning_rate": 9.56419515361266e-05, + "loss": 0.8139, + "step": 41930 + }, + { + "epoch": 0.26794270600411435, + "grad_norm": 0.8338034152984619, + "learning_rate": 9.563990248711745e-05, + "loss": 0.8791, + "step": 41940 + }, + { + "epoch": 0.26800659315385306, + "grad_norm": 0.557685136795044, + "learning_rate": 9.563785297847501e-05, + "loss": 0.7298, + "step": 41950 + }, + { + "epoch": 0.26807048030359176, + "grad_norm": 0.8459478616714478, + "learning_rate": 9.563580301021988e-05, + "loss": 0.9614, + "step": 41960 + }, + { + "epoch": 0.2681343674533304, + "grad_norm": 0.918144166469574, + "learning_rate": 9.563375258237275e-05, + "loss": 0.9374, + "step": 41970 + }, + { + "epoch": 0.2681982546030691, + "grad_norm": 0.9142857193946838, + "learning_rate": 
9.563170169495424e-05, + "loss": 0.8053, + "step": 41980 + }, + { + "epoch": 0.2682621417528078, + "grad_norm": 0.9095722436904907, + "learning_rate": 9.562965034798502e-05, + "loss": 0.9772, + "step": 41990 + }, + { + "epoch": 0.26832602890254653, + "grad_norm": 1.0715844631195068, + "learning_rate": 9.562759854148575e-05, + "loss": 0.8129, + "step": 42000 + }, + { + "epoch": 0.26838991605228524, + "grad_norm": 0.8338362574577332, + "learning_rate": 9.562554627547709e-05, + "loss": 0.9305, + "step": 42010 + }, + { + "epoch": 0.26845380320202394, + "grad_norm": 0.709132969379425, + "learning_rate": 9.562349354997971e-05, + "loss": 0.7656, + "step": 42020 + }, + { + "epoch": 0.26851769035176265, + "grad_norm": 1.1715177297592163, + "learning_rate": 9.562144036501428e-05, + "loss": 0.9993, + "step": 42030 + }, + { + "epoch": 0.26858157750150136, + "grad_norm": 1.6266652345657349, + "learning_rate": 9.561938672060147e-05, + "loss": 0.8534, + "step": 42040 + }, + { + "epoch": 0.26864546465124006, + "grad_norm": 0.8818448781967163, + "learning_rate": 9.561733261676196e-05, + "loss": 0.8328, + "step": 42050 + }, + { + "epoch": 0.26870935180097877, + "grad_norm": 1.15716552734375, + "learning_rate": 9.561527805351646e-05, + "loss": 0.7874, + "step": 42060 + }, + { + "epoch": 0.2687732389507175, + "grad_norm": 0.913690984249115, + "learning_rate": 9.561322303088565e-05, + "loss": 0.7483, + "step": 42070 + }, + { + "epoch": 0.2688371261004562, + "grad_norm": 0.644812822341919, + "learning_rate": 9.561116754889022e-05, + "loss": 1.011, + "step": 42080 + }, + { + "epoch": 0.26890101325019483, + "grad_norm": 0.7774586081504822, + "learning_rate": 9.560911160755088e-05, + "loss": 1.0542, + "step": 42090 + }, + { + "epoch": 0.26896490039993354, + "grad_norm": 0.4670238494873047, + "learning_rate": 9.56070552068883e-05, + "loss": 0.9087, + "step": 42100 + }, + { + "epoch": 0.26902878754967224, + "grad_norm": 1.1221692562103271, + "learning_rate": 9.560499834692325e-05, + "loss": 0.8704, + "step": 42110 + }, + { + "epoch": 0.26909267469941095, + "grad_norm": 1.4995726346969604, + "learning_rate": 9.56029410276764e-05, + "loss": 0.9453, + "step": 42120 + }, + { + "epoch": 0.26915656184914966, + "grad_norm": 0.8239421844482422, + "learning_rate": 9.56008832491685e-05, + "loss": 0.8527, + "step": 42130 + }, + { + "epoch": 0.26922044899888836, + "grad_norm": 0.7687192559242249, + "learning_rate": 9.559882501142024e-05, + "loss": 1.0542, + "step": 42140 + }, + { + "epoch": 0.26928433614862707, + "grad_norm": 0.8161906003952026, + "learning_rate": 9.559676631445236e-05, + "loss": 0.8796, + "step": 42150 + }, + { + "epoch": 0.2693482232983658, + "grad_norm": 0.7608240842819214, + "learning_rate": 9.559470715828559e-05, + "loss": 0.8504, + "step": 42160 + }, + { + "epoch": 0.2694121104481045, + "grad_norm": 1.2632617950439453, + "learning_rate": 9.559264754294068e-05, + "loss": 0.6786, + "step": 42170 + }, + { + "epoch": 0.2694759975978432, + "grad_norm": 1.0786528587341309, + "learning_rate": 9.55907934965501e-05, + "loss": 1.0747, + "step": 42180 + }, + { + "epoch": 0.2695398847475819, + "grad_norm": 0.9386286735534668, + "learning_rate": 9.558873300882385e-05, + "loss": 0.6627, + "step": 42190 + }, + { + "epoch": 0.2696037718973206, + "grad_norm": 0.9134590029716492, + "learning_rate": 9.558667206197964e-05, + "loss": 0.7985, + "step": 42200 + }, + { + "epoch": 0.26966765904705925, + "grad_norm": 0.6581794619560242, + "learning_rate": 9.55846106560382e-05, + "loss": 0.7476, + "step": 42210 + }, + { + "epoch": 
0.26973154619679796, + "grad_norm": 0.7346853017807007, + "learning_rate": 9.558254879102028e-05, + "loss": 1.1158, + "step": 42220 + }, + { + "epoch": 0.26979543334653666, + "grad_norm": 0.7923113107681274, + "learning_rate": 9.558048646694668e-05, + "loss": 1.0275, + "step": 42230 + }, + { + "epoch": 0.26985932049627537, + "grad_norm": 0.9055522680282593, + "learning_rate": 9.557842368383813e-05, + "loss": 0.7192, + "step": 42240 + }, + { + "epoch": 0.2699232076460141, + "grad_norm": 0.5908991694450378, + "learning_rate": 9.557636044171542e-05, + "loss": 0.9693, + "step": 42250 + }, + { + "epoch": 0.2699870947957528, + "grad_norm": 0.636661946773529, + "learning_rate": 9.557429674059935e-05, + "loss": 1.0553, + "step": 42260 + }, + { + "epoch": 0.2700509819454915, + "grad_norm": 0.9865610599517822, + "learning_rate": 9.557223258051069e-05, + "loss": 1.0789, + "step": 42270 + }, + { + "epoch": 0.2701148690952302, + "grad_norm": 0.9444893598556519, + "learning_rate": 9.557016796147021e-05, + "loss": 0.8252, + "step": 42280 + }, + { + "epoch": 0.2701787562449689, + "grad_norm": 0.7374017238616943, + "learning_rate": 9.556810288349871e-05, + "loss": 0.9914, + "step": 42290 + }, + { + "epoch": 0.2702426433947076, + "grad_norm": 0.772415041923523, + "learning_rate": 9.5566037346617e-05, + "loss": 0.7827, + "step": 42300 + }, + { + "epoch": 0.2703065305444463, + "grad_norm": 1.0962374210357666, + "learning_rate": 9.556397135084587e-05, + "loss": 0.995, + "step": 42310 + }, + { + "epoch": 0.270370417694185, + "grad_norm": 0.7097411751747131, + "learning_rate": 9.556190489620612e-05, + "loss": 0.8302, + "step": 42320 + }, + { + "epoch": 0.27043430484392367, + "grad_norm": 0.7932478785514832, + "learning_rate": 9.555983798271859e-05, + "loss": 0.9678, + "step": 42330 + }, + { + "epoch": 0.2704981919936624, + "grad_norm": 0.6816592812538147, + "learning_rate": 9.555777061040407e-05, + "loss": 1.0183, + "step": 42340 + }, + { + "epoch": 0.2705620791434011, + "grad_norm": 0.6527500152587891, + "learning_rate": 9.555570277928338e-05, + "loss": 0.8971, + "step": 42350 + }, + { + "epoch": 0.2706259662931398, + "grad_norm": 0.6478419899940491, + "learning_rate": 9.555363448937735e-05, + "loss": 0.8146, + "step": 42360 + }, + { + "epoch": 0.2706898534428785, + "grad_norm": 0.5460163354873657, + "learning_rate": 9.555156574070681e-05, + "loss": 0.7972, + "step": 42370 + }, + { + "epoch": 0.2707537405926172, + "grad_norm": 0.5501124262809753, + "learning_rate": 9.554949653329262e-05, + "loss": 0.6, + "step": 42380 + }, + { + "epoch": 0.2708176277423559, + "grad_norm": 0.9783973693847656, + "learning_rate": 9.554742686715557e-05, + "loss": 0.9689, + "step": 42390 + }, + { + "epoch": 0.2708815148920946, + "grad_norm": 1.4098743200302124, + "learning_rate": 9.554535674231652e-05, + "loss": 0.8839, + "step": 42400 + }, + { + "epoch": 0.2709454020418333, + "grad_norm": 0.9374015927314758, + "learning_rate": 9.554328615879636e-05, + "loss": 1.1531, + "step": 42410 + }, + { + "epoch": 0.271009289191572, + "grad_norm": 2.437901258468628, + "learning_rate": 9.554121511661587e-05, + "loss": 0.8329, + "step": 42420 + }, + { + "epoch": 0.27107317634131073, + "grad_norm": 0.5805662870407104, + "learning_rate": 9.553914361579597e-05, + "loss": 0.8364, + "step": 42430 + }, + { + "epoch": 0.27113706349104943, + "grad_norm": 0.8254538178443909, + "learning_rate": 9.553707165635747e-05, + "loss": 0.5683, + "step": 42440 + }, + { + "epoch": 0.2712009506407881, + "grad_norm": 1.0397802591323853, + "learning_rate": 
9.55349992383213e-05, + "loss": 0.9475, + "step": 42450 + }, + { + "epoch": 0.2712648377905268, + "grad_norm": 1.111701488494873, + "learning_rate": 9.553292636170827e-05, + "loss": 0.9378, + "step": 42460 + }, + { + "epoch": 0.2713287249402655, + "grad_norm": 1.4257961511611938, + "learning_rate": 9.553085302653929e-05, + "loss": 0.828, + "step": 42470 + }, + { + "epoch": 0.2713926120900042, + "grad_norm": 1.0189239978790283, + "learning_rate": 9.552877923283522e-05, + "loss": 1.1691, + "step": 42480 + }, + { + "epoch": 0.2714564992397429, + "grad_norm": 1.3065085411071777, + "learning_rate": 9.552670498061697e-05, + "loss": 1.0535, + "step": 42490 + }, + { + "epoch": 0.2715203863894816, + "grad_norm": 0.6838773488998413, + "learning_rate": 9.55246302699054e-05, + "loss": 0.938, + "step": 42500 + }, + { + "epoch": 0.2715842735392203, + "grad_norm": 0.9010002613067627, + "learning_rate": 9.552255510072142e-05, + "loss": 0.7285, + "step": 42510 + }, + { + "epoch": 0.27164816068895903, + "grad_norm": 0.7863100171089172, + "learning_rate": 9.552047947308593e-05, + "loss": 0.8349, + "step": 42520 + }, + { + "epoch": 0.27171204783869773, + "grad_norm": 1.3251279592514038, + "learning_rate": 9.551840338701983e-05, + "loss": 0.8618, + "step": 42530 + }, + { + "epoch": 0.27177593498843644, + "grad_norm": 1.1294409036636353, + "learning_rate": 9.551632684254405e-05, + "loss": 0.9233, + "step": 42540 + }, + { + "epoch": 0.27183982213817515, + "grad_norm": 1.4269248247146606, + "learning_rate": 9.551424983967946e-05, + "loss": 0.8823, + "step": 42550 + }, + { + "epoch": 0.27190370928791385, + "grad_norm": 0.7050525546073914, + "learning_rate": 9.551217237844701e-05, + "loss": 0.8103, + "step": 42560 + }, + { + "epoch": 0.2719675964376525, + "grad_norm": 1.4217504262924194, + "learning_rate": 9.551009445886759e-05, + "loss": 0.7929, + "step": 42570 + }, + { + "epoch": 0.2720314835873912, + "grad_norm": 0.6999850869178772, + "learning_rate": 9.550801608096216e-05, + "loss": 1.3094, + "step": 42580 + }, + { + "epoch": 0.2720953707371299, + "grad_norm": 0.5494612455368042, + "learning_rate": 9.550593724475163e-05, + "loss": 0.9256, + "step": 42590 + }, + { + "epoch": 0.2721592578868686, + "grad_norm": 0.5456877946853638, + "learning_rate": 9.550385795025696e-05, + "loss": 0.8309, + "step": 42600 + }, + { + "epoch": 0.27222314503660733, + "grad_norm": 0.6689077615737915, + "learning_rate": 9.550177819749905e-05, + "loss": 1.158, + "step": 42610 + }, + { + "epoch": 0.27228703218634603, + "grad_norm": 0.811871349811554, + "learning_rate": 9.54996979864989e-05, + "loss": 0.6157, + "step": 42620 + }, + { + "epoch": 0.27235091933608474, + "grad_norm": 0.5832274556159973, + "learning_rate": 9.549761731727741e-05, + "loss": 0.9875, + "step": 42630 + }, + { + "epoch": 0.27241480648582345, + "grad_norm": 0.874345064163208, + "learning_rate": 9.549553618985556e-05, + "loss": 0.8906, + "step": 42640 + }, + { + "epoch": 0.27247869363556215, + "grad_norm": 2.064990282058716, + "learning_rate": 9.54934546042543e-05, + "loss": 0.8612, + "step": 42650 + }, + { + "epoch": 0.27254258078530086, + "grad_norm": 0.5960216522216797, + "learning_rate": 9.549137256049459e-05, + "loss": 0.8631, + "step": 42660 + }, + { + "epoch": 0.27260646793503956, + "grad_norm": 1.0062336921691895, + "learning_rate": 9.548929005859739e-05, + "loss": 0.786, + "step": 42670 + }, + { + "epoch": 0.27267035508477827, + "grad_norm": 0.9856522679328918, + "learning_rate": 9.548720709858371e-05, + "loss": 0.8347, + "step": 42680 + }, + { + "epoch": 
0.272734242234517, + "grad_norm": 1.2544548511505127, + "learning_rate": 9.548512368047448e-05, + "loss": 1.0405, + "step": 42690 + }, + { + "epoch": 0.2727981293842556, + "grad_norm": 0.45234474539756775, + "learning_rate": 9.548303980429072e-05, + "loss": 0.8274, + "step": 42700 + }, + { + "epoch": 0.27286201653399433, + "grad_norm": 0.7926174402236938, + "learning_rate": 9.54809554700534e-05, + "loss": 0.9903, + "step": 42710 + }, + { + "epoch": 0.27292590368373304, + "grad_norm": 0.9858782291412354, + "learning_rate": 9.547887067778352e-05, + "loss": 0.8354, + "step": 42720 + }, + { + "epoch": 0.27298979083347175, + "grad_norm": 0.5488384962081909, + "learning_rate": 9.547678542750204e-05, + "loss": 0.9663, + "step": 42730 + }, + { + "epoch": 0.27305367798321045, + "grad_norm": 0.7685027122497559, + "learning_rate": 9.547469971923001e-05, + "loss": 0.8943, + "step": 42740 + }, + { + "epoch": 0.27311756513294916, + "grad_norm": 0.910285472869873, + "learning_rate": 9.54726135529884e-05, + "loss": 0.7713, + "step": 42750 + }, + { + "epoch": 0.27318145228268786, + "grad_norm": 1.956381916999817, + "learning_rate": 9.547052692879825e-05, + "loss": 0.7784, + "step": 42760 + }, + { + "epoch": 0.27324533943242657, + "grad_norm": 0.7288005352020264, + "learning_rate": 9.546843984668055e-05, + "loss": 0.8306, + "step": 42770 + }, + { + "epoch": 0.2733092265821653, + "grad_norm": 0.8818903565406799, + "learning_rate": 9.54663523066563e-05, + "loss": 0.8758, + "step": 42780 + }, + { + "epoch": 0.273373113731904, + "grad_norm": 1.380008339881897, + "learning_rate": 9.546426430874658e-05, + "loss": 0.8951, + "step": 42790 + }, + { + "epoch": 0.2734370008816427, + "grad_norm": 1.8795280456542969, + "learning_rate": 9.546217585297236e-05, + "loss": 0.8414, + "step": 42800 + }, + { + "epoch": 0.2735008880313814, + "grad_norm": 1.4341343641281128, + "learning_rate": 9.546008693935473e-05, + "loss": 0.7366, + "step": 42810 + }, + { + "epoch": 0.27356477518112005, + "grad_norm": 1.095348596572876, + "learning_rate": 9.545799756791467e-05, + "loss": 0.8262, + "step": 42820 + }, + { + "epoch": 0.27362866233085875, + "grad_norm": 0.8619642853736877, + "learning_rate": 9.545590773867325e-05, + "loss": 0.9742, + "step": 42830 + }, + { + "epoch": 0.27369254948059746, + "grad_norm": 0.8517597317695618, + "learning_rate": 9.545381745165154e-05, + "loss": 0.9214, + "step": 42840 + }, + { + "epoch": 0.27375643663033616, + "grad_norm": 0.5588153600692749, + "learning_rate": 9.545172670687053e-05, + "loss": 0.7431, + "step": 42850 + }, + { + "epoch": 0.27382032378007487, + "grad_norm": 0.8848083019256592, + "learning_rate": 9.544963550435133e-05, + "loss": 0.7999, + "step": 42860 + }, + { + "epoch": 0.2738842109298136, + "grad_norm": 0.7159634232521057, + "learning_rate": 9.544754384411499e-05, + "loss": 0.9701, + "step": 42870 + }, + { + "epoch": 0.2739480980795523, + "grad_norm": 0.8267273306846619, + "learning_rate": 9.544545172618255e-05, + "loss": 0.8219, + "step": 42880 + }, + { + "epoch": 0.274011985229291, + "grad_norm": 1.47396981716156, + "learning_rate": 9.54433591505751e-05, + "loss": 1.1159, + "step": 42890 + }, + { + "epoch": 0.2740758723790297, + "grad_norm": 0.6588977575302124, + "learning_rate": 9.54412661173137e-05, + "loss": 0.9557, + "step": 42900 + }, + { + "epoch": 0.2741397595287684, + "grad_norm": 0.9745913743972778, + "learning_rate": 9.543917262641944e-05, + "loss": 0.9112, + "step": 42910 + }, + { + "epoch": 0.2742036466785071, + "grad_norm": 0.8897466063499451, + "learning_rate": 
9.543707867791342e-05, + "loss": 0.9167, + "step": 42920 + }, + { + "epoch": 0.2742675338282458, + "grad_norm": 0.7316814064979553, + "learning_rate": 9.543498427181669e-05, + "loss": 0.8055, + "step": 42930 + }, + { + "epoch": 0.27433142097798446, + "grad_norm": 1.263555645942688, + "learning_rate": 9.543288940815036e-05, + "loss": 0.9486, + "step": 42940 + }, + { + "epoch": 0.27439530812772317, + "grad_norm": 0.6425541639328003, + "learning_rate": 9.543079408693554e-05, + "loss": 0.7572, + "step": 42950 + }, + { + "epoch": 0.2744591952774619, + "grad_norm": 0.5203328728675842, + "learning_rate": 9.542869830819332e-05, + "loss": 0.7523, + "step": 42960 + }, + { + "epoch": 0.2745230824272006, + "grad_norm": 0.5347851514816284, + "learning_rate": 9.542660207194481e-05, + "loss": 0.8578, + "step": 42970 + }, + { + "epoch": 0.2745869695769393, + "grad_norm": 0.863381564617157, + "learning_rate": 9.542450537821111e-05, + "loss": 0.87, + "step": 42980 + }, + { + "epoch": 0.274650856726678, + "grad_norm": 0.608485996723175, + "learning_rate": 9.542240822701333e-05, + "loss": 1.1265, + "step": 42990 + }, + { + "epoch": 0.2747147438764167, + "grad_norm": 0.8107671737670898, + "learning_rate": 9.542031061837262e-05, + "loss": 1.0411, + "step": 43000 + }, + { + "epoch": 0.2747786310261554, + "grad_norm": 0.8316226601600647, + "learning_rate": 9.541821255231009e-05, + "loss": 0.7935, + "step": 43010 + }, + { + "epoch": 0.2748425181758941, + "grad_norm": 0.741036593914032, + "learning_rate": 9.541611402884685e-05, + "loss": 0.9842, + "step": 43020 + }, + { + "epoch": 0.2749064053256328, + "grad_norm": 1.501214861869812, + "learning_rate": 9.541401504800407e-05, + "loss": 1.4551, + "step": 43030 + }, + { + "epoch": 0.2749702924753715, + "grad_norm": 0.6897192001342773, + "learning_rate": 9.541191560980287e-05, + "loss": 0.7029, + "step": 43040 + }, + { + "epoch": 0.27503417962511023, + "grad_norm": 0.555748701095581, + "learning_rate": 9.540981571426437e-05, + "loss": 0.8156, + "step": 43050 + }, + { + "epoch": 0.2750980667748489, + "grad_norm": 0.5355345010757446, + "learning_rate": 9.540771536140976e-05, + "loss": 1.1097, + "step": 43060 + }, + { + "epoch": 0.2751619539245876, + "grad_norm": 0.839185357093811, + "learning_rate": 9.540561455126018e-05, + "loss": 0.7823, + "step": 43070 + }, + { + "epoch": 0.2752258410743263, + "grad_norm": 1.0240132808685303, + "learning_rate": 9.540351328383676e-05, + "loss": 0.825, + "step": 43080 + }, + { + "epoch": 0.275289728224065, + "grad_norm": 0.6717436909675598, + "learning_rate": 9.54014115591607e-05, + "loss": 0.8841, + "step": 43090 + }, + { + "epoch": 0.2753536153738037, + "grad_norm": 0.7767571806907654, + "learning_rate": 9.539930937725313e-05, + "loss": 0.8338, + "step": 43100 + }, + { + "epoch": 0.2754175025235424, + "grad_norm": 0.9243952035903931, + "learning_rate": 9.539720673813526e-05, + "loss": 0.8565, + "step": 43110 + }, + { + "epoch": 0.2754813896732811, + "grad_norm": 0.7475656270980835, + "learning_rate": 9.539510364182822e-05, + "loss": 0.8373, + "step": 43120 + }, + { + "epoch": 0.2755452768230198, + "grad_norm": 0.7929537296295166, + "learning_rate": 9.539300008835323e-05, + "loss": 1.1913, + "step": 43130 + }, + { + "epoch": 0.27560916397275853, + "grad_norm": 0.9647955298423767, + "learning_rate": 9.539089607773145e-05, + "loss": 0.8969, + "step": 43140 + }, + { + "epoch": 0.27567305112249724, + "grad_norm": 0.7929497957229614, + "learning_rate": 9.538879160998408e-05, + "loss": 0.8129, + "step": 43150 + }, + { + "epoch": 
0.27573693827223594, + "grad_norm": 0.7888079881668091, + "learning_rate": 9.538668668513232e-05, + "loss": 1.0804, + "step": 43160 + }, + { + "epoch": 0.27580082542197465, + "grad_norm": 1.1037369966506958, + "learning_rate": 9.538458130319736e-05, + "loss": 0.7396, + "step": 43170 + }, + { + "epoch": 0.2758647125717133, + "grad_norm": 1.0101234912872314, + "learning_rate": 9.538247546420038e-05, + "loss": 0.899, + "step": 43180 + }, + { + "epoch": 0.275928599721452, + "grad_norm": 1.06256902217865, + "learning_rate": 9.538036916816264e-05, + "loss": 0.8693, + "step": 43190 + }, + { + "epoch": 0.2759924868711907, + "grad_norm": 1.1790313720703125, + "learning_rate": 9.53782624151053e-05, + "loss": 0.8821, + "step": 43200 + }, + { + "epoch": 0.2760563740209294, + "grad_norm": 1.0005087852478027, + "learning_rate": 9.537615520504961e-05, + "loss": 0.8263, + "step": 43210 + }, + { + "epoch": 0.2761202611706681, + "grad_norm": 0.965392529964447, + "learning_rate": 9.537404753801679e-05, + "loss": 0.7032, + "step": 43220 + }, + { + "epoch": 0.27618414832040683, + "grad_norm": 1.0671719312667847, + "learning_rate": 9.537193941402805e-05, + "loss": 0.6795, + "step": 43230 + }, + { + "epoch": 0.27624803547014554, + "grad_norm": 0.8210242390632629, + "learning_rate": 9.536983083310463e-05, + "loss": 0.8035, + "step": 43240 + }, + { + "epoch": 0.27631192261988424, + "grad_norm": 1.3356382846832275, + "learning_rate": 9.536772179526774e-05, + "loss": 0.8635, + "step": 43250 + }, + { + "epoch": 0.27637580976962295, + "grad_norm": 0.6056402325630188, + "learning_rate": 9.536561230053866e-05, + "loss": 1.1843, + "step": 43260 + }, + { + "epoch": 0.27643969691936165, + "grad_norm": 0.674022912979126, + "learning_rate": 9.536350234893863e-05, + "loss": 1.2542, + "step": 43270 + }, + { + "epoch": 0.27650358406910036, + "grad_norm": 1.186140537261963, + "learning_rate": 9.536139194048888e-05, + "loss": 0.884, + "step": 43280 + }, + { + "epoch": 0.27656747121883907, + "grad_norm": 0.8582965731620789, + "learning_rate": 9.535928107521067e-05, + "loss": 0.7672, + "step": 43290 + }, + { + "epoch": 0.2766313583685777, + "grad_norm": 0.8430611491203308, + "learning_rate": 9.535716975312524e-05, + "loss": 0.8817, + "step": 43300 + }, + { + "epoch": 0.2766952455183164, + "grad_norm": 0.9493967294692993, + "learning_rate": 9.535505797425388e-05, + "loss": 0.9554, + "step": 43310 + }, + { + "epoch": 0.27675913266805513, + "grad_norm": 0.9920264482498169, + "learning_rate": 9.535294573861786e-05, + "loss": 0.9554, + "step": 43320 + }, + { + "epoch": 0.27682301981779384, + "grad_norm": 0.8129369616508484, + "learning_rate": 9.535083304623844e-05, + "loss": 0.839, + "step": 43330 + }, + { + "epoch": 0.27688690696753254, + "grad_norm": 1.0524061918258667, + "learning_rate": 9.534871989713688e-05, + "loss": 0.8456, + "step": 43340 + }, + { + "epoch": 0.27695079411727125, + "grad_norm": 0.5314822793006897, + "learning_rate": 9.53466062913345e-05, + "loss": 0.8378, + "step": 43350 + }, + { + "epoch": 0.27701468126700995, + "grad_norm": 0.574448823928833, + "learning_rate": 9.534449222885254e-05, + "loss": 0.8479, + "step": 43360 + }, + { + "epoch": 0.27707856841674866, + "grad_norm": 0.622386634349823, + "learning_rate": 9.534237770971233e-05, + "loss": 1.2532, + "step": 43370 + }, + { + "epoch": 0.27714245556648737, + "grad_norm": 0.7160522937774658, + "learning_rate": 9.534026273393515e-05, + "loss": 0.8913, + "step": 43380 + }, + { + "epoch": 0.2772063427162261, + "grad_norm": 0.8195508122444153, + "learning_rate": 
9.533814730154229e-05, + "loss": 0.8407, + "step": 43390 + }, + { + "epoch": 0.2772702298659648, + "grad_norm": 2.375558614730835, + "learning_rate": 9.533603141255508e-05, + "loss": 0.8774, + "step": 43400 + }, + { + "epoch": 0.2773341170157035, + "grad_norm": 1.008970022201538, + "learning_rate": 9.533391506699481e-05, + "loss": 0.8729, + "step": 43410 + }, + { + "epoch": 0.27739800416544214, + "grad_norm": 1.062031865119934, + "learning_rate": 9.533179826488278e-05, + "loss": 0.7358, + "step": 43420 + }, + { + "epoch": 0.27746189131518084, + "grad_norm": 1.110970139503479, + "learning_rate": 9.532968100624034e-05, + "loss": 0.9176, + "step": 43430 + }, + { + "epoch": 0.27752577846491955, + "grad_norm": 0.7017518877983093, + "learning_rate": 9.532756329108879e-05, + "loss": 0.7531, + "step": 43440 + }, + { + "epoch": 0.27758966561465825, + "grad_norm": 0.7108795046806335, + "learning_rate": 9.532544511944945e-05, + "loss": 1.2516, + "step": 43450 + }, + { + "epoch": 0.27765355276439696, + "grad_norm": 1.1942083835601807, + "learning_rate": 9.532332649134368e-05, + "loss": 0.8122, + "step": 43460 + }, + { + "epoch": 0.27771743991413567, + "grad_norm": 1.6900832653045654, + "learning_rate": 9.53212074067928e-05, + "loss": 0.9352, + "step": 43470 + }, + { + "epoch": 0.2777813270638744, + "grad_norm": 0.6773011088371277, + "learning_rate": 9.531908786581816e-05, + "loss": 0.7606, + "step": 43480 + }, + { + "epoch": 0.2778452142136131, + "grad_norm": 0.9404903054237366, + "learning_rate": 9.53169678684411e-05, + "loss": 0.8783, + "step": 43490 + }, + { + "epoch": 0.2779091013633518, + "grad_norm": 0.9946535229682922, + "learning_rate": 9.531484741468296e-05, + "loss": 0.8917, + "step": 43500 + }, + { + "epoch": 0.2779729885130905, + "grad_norm": 0.9458416104316711, + "learning_rate": 9.531272650456508e-05, + "loss": 1.1044, + "step": 43510 + }, + { + "epoch": 0.2780368756628292, + "grad_norm": 0.5456779599189758, + "learning_rate": 9.531060513810887e-05, + "loss": 0.7003, + "step": 43520 + }, + { + "epoch": 0.2781007628125679, + "grad_norm": 0.7244671583175659, + "learning_rate": 9.530848331533569e-05, + "loss": 0.8803, + "step": 43530 + }, + { + "epoch": 0.2781646499623066, + "grad_norm": 0.7101406455039978, + "learning_rate": 9.530636103626684e-05, + "loss": 0.9651, + "step": 43540 + }, + { + "epoch": 0.27822853711204526, + "grad_norm": 0.8346617221832275, + "learning_rate": 9.530423830092376e-05, + "loss": 1.1236, + "step": 43550 + }, + { + "epoch": 0.27829242426178397, + "grad_norm": 1.0931973457336426, + "learning_rate": 9.530211510932781e-05, + "loss": 1.1875, + "step": 43560 + }, + { + "epoch": 0.27835631141152267, + "grad_norm": 0.7076020240783691, + "learning_rate": 9.529999146150037e-05, + "loss": 0.9529, + "step": 43570 + }, + { + "epoch": 0.2784201985612614, + "grad_norm": 0.8255470991134644, + "learning_rate": 9.529786735746281e-05, + "loss": 0.9156, + "step": 43580 + }, + { + "epoch": 0.2784840857110001, + "grad_norm": 1.8320345878601074, + "learning_rate": 9.529574279723655e-05, + "loss": 1.1068, + "step": 43590 + }, + { + "epoch": 0.2785479728607388, + "grad_norm": 0.8600050806999207, + "learning_rate": 9.529361778084297e-05, + "loss": 0.7782, + "step": 43600 + }, + { + "epoch": 0.2786118600104775, + "grad_norm": 0.5279654860496521, + "learning_rate": 9.529149230830348e-05, + "loss": 0.6292, + "step": 43610 + }, + { + "epoch": 0.2786757471602162, + "grad_norm": 1.0602446794509888, + "learning_rate": 9.528936637963948e-05, + "loss": 0.8949, + "step": 43620 + }, + { + "epoch": 
0.2787396343099549, + "grad_norm": 0.687105655670166, + "learning_rate": 9.528723999487236e-05, + "loss": 0.9358, + "step": 43630 + }, + { + "epoch": 0.2788035214596936, + "grad_norm": 0.6915990114212036, + "learning_rate": 9.528511315402358e-05, + "loss": 1.011, + "step": 43640 + }, + { + "epoch": 0.2788674086094323, + "grad_norm": 1.5329688787460327, + "learning_rate": 9.528298585711453e-05, + "loss": 0.8036, + "step": 43650 + }, + { + "epoch": 0.278931295759171, + "grad_norm": 0.8263123035430908, + "learning_rate": 9.528085810416663e-05, + "loss": 0.8864, + "step": 43660 + }, + { + "epoch": 0.2789951829089097, + "grad_norm": 1.231269121170044, + "learning_rate": 9.52787298952013e-05, + "loss": 0.8268, + "step": 43670 + }, + { + "epoch": 0.2790590700586484, + "grad_norm": 0.7919381260871887, + "learning_rate": 9.527660123024e-05, + "loss": 1.0224, + "step": 43680 + }, + { + "epoch": 0.2791229572083871, + "grad_norm": 0.850471556186676, + "learning_rate": 9.527447210930417e-05, + "loss": 0.7467, + "step": 43690 + }, + { + "epoch": 0.2791868443581258, + "grad_norm": 1.2980278730392456, + "learning_rate": 9.527234253241522e-05, + "loss": 0.7765, + "step": 43700 + }, + { + "epoch": 0.2792507315078645, + "grad_norm": 1.044519066810608, + "learning_rate": 9.527021249959462e-05, + "loss": 1.0149, + "step": 43710 + }, + { + "epoch": 0.2793146186576032, + "grad_norm": 0.9248097538948059, + "learning_rate": 9.526808201086382e-05, + "loss": 1.0461, + "step": 43720 + }, + { + "epoch": 0.2793785058073419, + "grad_norm": 0.7465457320213318, + "learning_rate": 9.526595106624428e-05, + "loss": 0.8779, + "step": 43730 + }, + { + "epoch": 0.2794423929570806, + "grad_norm": 1.1304700374603271, + "learning_rate": 9.526381966575744e-05, + "loss": 0.9451, + "step": 43740 + }, + { + "epoch": 0.2795062801068193, + "grad_norm": 0.8045159578323364, + "learning_rate": 9.526168780942477e-05, + "loss": 0.851, + "step": 43750 + }, + { + "epoch": 0.27957016725655803, + "grad_norm": 0.8568775057792664, + "learning_rate": 9.525955549726776e-05, + "loss": 1.2149, + "step": 43760 + }, + { + "epoch": 0.27963405440629674, + "grad_norm": 0.8600241541862488, + "learning_rate": 9.525742272930787e-05, + "loss": 1.2311, + "step": 43770 + }, + { + "epoch": 0.27969794155603545, + "grad_norm": 0.9878765940666199, + "learning_rate": 9.525528950556657e-05, + "loss": 0.7023, + "step": 43780 + }, + { + "epoch": 0.2797618287057741, + "grad_norm": 1.1209038496017456, + "learning_rate": 9.525315582606537e-05, + "loss": 0.7451, + "step": 43790 + }, + { + "epoch": 0.2798257158555128, + "grad_norm": 0.8326137065887451, + "learning_rate": 9.525102169082573e-05, + "loss": 1.139, + "step": 43800 + }, + { + "epoch": 0.2798896030052515, + "grad_norm": 0.7747043371200562, + "learning_rate": 9.524888709986914e-05, + "loss": 1.1918, + "step": 43810 + }, + { + "epoch": 0.2799534901549902, + "grad_norm": 0.7023658752441406, + "learning_rate": 9.524675205321713e-05, + "loss": 1.1406, + "step": 43820 + }, + { + "epoch": 0.2800173773047289, + "grad_norm": 0.9086745977401733, + "learning_rate": 9.524461655089119e-05, + "loss": 1.0247, + "step": 43830 + }, + { + "epoch": 0.2800812644544676, + "grad_norm": 0.850036084651947, + "learning_rate": 9.52424805929128e-05, + "loss": 0.9891, + "step": 43840 + }, + { + "epoch": 0.28014515160420633, + "grad_norm": 0.7269537448883057, + "learning_rate": 9.52403441793035e-05, + "loss": 0.917, + "step": 43850 + }, + { + "epoch": 0.28020903875394504, + "grad_norm": 0.6691316962242126, + "learning_rate": 
9.523820731008479e-05, + "loss": 0.8787, + "step": 43860 + }, + { + "epoch": 0.28027292590368375, + "grad_norm": 0.9154207110404968, + "learning_rate": 9.52360699852782e-05, + "loss": 1.1603, + "step": 43870 + }, + { + "epoch": 0.28033681305342245, + "grad_norm": 0.8188737034797668, + "learning_rate": 9.523393220490526e-05, + "loss": 1.0814, + "step": 43880 + }, + { + "epoch": 0.28040070020316116, + "grad_norm": 0.873671293258667, + "learning_rate": 9.523179396898748e-05, + "loss": 0.8897, + "step": 43890 + }, + { + "epoch": 0.28046458735289986, + "grad_norm": 1.1229972839355469, + "learning_rate": 9.52296552775464e-05, + "loss": 0.9826, + "step": 43900 + }, + { + "epoch": 0.2805284745026385, + "grad_norm": 0.664357602596283, + "learning_rate": 9.522751613060356e-05, + "loss": 0.8484, + "step": 43910 + }, + { + "epoch": 0.2805923616523772, + "grad_norm": 0.6779937148094177, + "learning_rate": 9.522537652818051e-05, + "loss": 0.8464, + "step": 43920 + }, + { + "epoch": 0.2806562488021159, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.522323647029879e-05, + "loss": 1.005, + "step": 43930 + }, + { + "epoch": 0.28072013595185463, + "grad_norm": 0.6373327970504761, + "learning_rate": 9.522109595697997e-05, + "loss": 0.7112, + "step": 43940 + }, + { + "epoch": 0.28078402310159334, + "grad_norm": 1.3876614570617676, + "learning_rate": 9.521895498824558e-05, + "loss": 0.7226, + "step": 43950 + }, + { + "epoch": 0.28084791025133204, + "grad_norm": 1.1054093837738037, + "learning_rate": 9.521681356411718e-05, + "loss": 0.8765, + "step": 43960 + }, + { + "epoch": 0.28091179740107075, + "grad_norm": 0.7157889008522034, + "learning_rate": 9.521467168461637e-05, + "loss": 0.9107, + "step": 43970 + }, + { + "epoch": 0.28097568455080946, + "grad_norm": 0.7243021726608276, + "learning_rate": 9.521252934976469e-05, + "loss": 0.6519, + "step": 43980 + }, + { + "epoch": 0.28103957170054816, + "grad_norm": 1.0799028873443604, + "learning_rate": 9.521038655958373e-05, + "loss": 0.9358, + "step": 43990 + }, + { + "epoch": 0.28110345885028687, + "grad_norm": 0.8918923139572144, + "learning_rate": 9.520824331409506e-05, + "loss": 1.0831, + "step": 44000 + }, + { + "epoch": 0.2811673460000256, + "grad_norm": 0.6121041774749756, + "learning_rate": 9.520609961332027e-05, + "loss": 0.9407, + "step": 44010 + }, + { + "epoch": 0.2812312331497643, + "grad_norm": 0.8242114186286926, + "learning_rate": 9.520395545728096e-05, + "loss": 0.6712, + "step": 44020 + }, + { + "epoch": 0.28129512029950293, + "grad_norm": 0.8655091524124146, + "learning_rate": 9.52018108459987e-05, + "loss": 0.8633, + "step": 44030 + }, + { + "epoch": 0.28135900744924164, + "grad_norm": 0.7352428436279297, + "learning_rate": 9.51996657794951e-05, + "loss": 1.0033, + "step": 44040 + }, + { + "epoch": 0.28142289459898034, + "grad_norm": 1.2268953323364258, + "learning_rate": 9.519752025779177e-05, + "loss": 1.0435, + "step": 44050 + }, + { + "epoch": 0.28148678174871905, + "grad_norm": 0.766212522983551, + "learning_rate": 9.51953742809103e-05, + "loss": 0.8992, + "step": 44060 + }, + { + "epoch": 0.28155066889845776, + "grad_norm": 0.9193620681762695, + "learning_rate": 9.51932278488723e-05, + "loss": 0.9638, + "step": 44070 + }, + { + "epoch": 0.28161455604819646, + "grad_norm": 0.7695852518081665, + "learning_rate": 9.519108096169943e-05, + "loss": 0.875, + "step": 44080 + }, + { + "epoch": 0.28167844319793517, + "grad_norm": 0.7172717452049255, + "learning_rate": 9.518893361941326e-05, + "loss": 1.106, + "step": 44090 + }, + { + 
"epoch": 0.2817423303476739, + "grad_norm": 0.7750954031944275, + "learning_rate": 9.518678582203542e-05, + "loss": 1.0095, + "step": 44100 + }, + { + "epoch": 0.2818062174974126, + "grad_norm": 1.0420329570770264, + "learning_rate": 9.518463756958758e-05, + "loss": 0.6998, + "step": 44110 + }, + { + "epoch": 0.2818701046471513, + "grad_norm": 1.6733392477035522, + "learning_rate": 9.518248886209134e-05, + "loss": 0.733, + "step": 44120 + }, + { + "epoch": 0.28193399179689, + "grad_norm": 0.9372019171714783, + "learning_rate": 9.518033969956834e-05, + "loss": 0.8278, + "step": 44130 + }, + { + "epoch": 0.2819978789466287, + "grad_norm": 0.6449085474014282, + "learning_rate": 9.517819008204025e-05, + "loss": 0.8307, + "step": 44140 + }, + { + "epoch": 0.28206176609636735, + "grad_norm": 1.0656044483184814, + "learning_rate": 9.517604000952869e-05, + "loss": 0.7596, + "step": 44150 + }, + { + "epoch": 0.28212565324610606, + "grad_norm": 1.2960087060928345, + "learning_rate": 9.517388948205532e-05, + "loss": 0.7277, + "step": 44160 + }, + { + "epoch": 0.28218954039584476, + "grad_norm": 0.7332021594047546, + "learning_rate": 9.517173849964181e-05, + "loss": 0.9438, + "step": 44170 + }, + { + "epoch": 0.28225342754558347, + "grad_norm": 1.0289632081985474, + "learning_rate": 9.516958706230981e-05, + "loss": 1.1855, + "step": 44180 + }, + { + "epoch": 0.2823173146953222, + "grad_norm": 0.628814697265625, + "learning_rate": 9.516743517008099e-05, + "loss": 1.0023, + "step": 44190 + }, + { + "epoch": 0.2823812018450609, + "grad_norm": 0.8182355165481567, + "learning_rate": 9.516528282297703e-05, + "loss": 0.924, + "step": 44200 + }, + { + "epoch": 0.2824450889947996, + "grad_norm": 0.7950728535652161, + "learning_rate": 9.51631300210196e-05, + "loss": 0.6805, + "step": 44210 + }, + { + "epoch": 0.2825089761445383, + "grad_norm": 0.902574360370636, + "learning_rate": 9.516097676423037e-05, + "loss": 0.9775, + "step": 44220 + }, + { + "epoch": 0.282572863294277, + "grad_norm": 0.8031740784645081, + "learning_rate": 9.515882305263104e-05, + "loss": 1.0566, + "step": 44230 + }, + { + "epoch": 0.2826367504440157, + "grad_norm": 0.8170803189277649, + "learning_rate": 9.515666888624329e-05, + "loss": 0.9942, + "step": 44240 + }, + { + "epoch": 0.2827006375937544, + "grad_norm": 0.6290708184242249, + "learning_rate": 9.515451426508882e-05, + "loss": 0.7682, + "step": 44250 + }, + { + "epoch": 0.2827645247434931, + "grad_norm": 0.9795184135437012, + "learning_rate": 9.515235918918932e-05, + "loss": 0.7583, + "step": 44260 + }, + { + "epoch": 0.28282841189323177, + "grad_norm": 0.7491911053657532, + "learning_rate": 9.515020365856651e-05, + "loss": 0.9229, + "step": 44270 + }, + { + "epoch": 0.2828922990429705, + "grad_norm": 1.1259350776672363, + "learning_rate": 9.51480476732421e-05, + "loss": 1.0074, + "step": 44280 + }, + { + "epoch": 0.2829561861927092, + "grad_norm": 1.4809681177139282, + "learning_rate": 9.514589123323777e-05, + "loss": 0.7639, + "step": 44290 + }, + { + "epoch": 0.2830200733424479, + "grad_norm": 1.034775733947754, + "learning_rate": 9.514373433857527e-05, + "loss": 0.9323, + "step": 44300 + }, + { + "epoch": 0.2830839604921866, + "grad_norm": 0.6214499473571777, + "learning_rate": 9.51415769892763e-05, + "loss": 0.7838, + "step": 44310 + }, + { + "epoch": 0.2831478476419253, + "grad_norm": 3.2163054943084717, + "learning_rate": 9.51394191853626e-05, + "loss": 1.0207, + "step": 44320 + }, + { + "epoch": 0.283211734791664, + "grad_norm": 0.6460834741592407, + "learning_rate": 
9.513726092685591e-05, + "loss": 1.1491, + "step": 44330 + }, + { + "epoch": 0.2832756219414027, + "grad_norm": 0.7531580924987793, + "learning_rate": 9.513510221377793e-05, + "loss": 0.7412, + "step": 44340 + }, + { + "epoch": 0.2833395090911414, + "grad_norm": 1.0614440441131592, + "learning_rate": 9.513294304615044e-05, + "loss": 0.8966, + "step": 44350 + }, + { + "epoch": 0.2834033962408801, + "grad_norm": 0.6449925303459167, + "learning_rate": 9.513078342399517e-05, + "loss": 0.9988, + "step": 44360 + }, + { + "epoch": 0.28346728339061883, + "grad_norm": 0.7040312886238098, + "learning_rate": 9.512862334733386e-05, + "loss": 0.8929, + "step": 44370 + }, + { + "epoch": 0.28353117054035754, + "grad_norm": 1.03850257396698, + "learning_rate": 9.512646281618828e-05, + "loss": 0.5284, + "step": 44380 + }, + { + "epoch": 0.28359505769009624, + "grad_norm": 1.0077382326126099, + "learning_rate": 9.512430183058016e-05, + "loss": 0.8976, + "step": 44390 + }, + { + "epoch": 0.2836589448398349, + "grad_norm": 1.8333910703659058, + "learning_rate": 9.512214039053131e-05, + "loss": 0.9276, + "step": 44400 + }, + { + "epoch": 0.2837228319895736, + "grad_norm": 1.7474950551986694, + "learning_rate": 9.511997849606344e-05, + "loss": 0.9906, + "step": 44410 + }, + { + "epoch": 0.2837867191393123, + "grad_norm": 0.6216913461685181, + "learning_rate": 9.511781614719838e-05, + "loss": 0.8697, + "step": 44420 + }, + { + "epoch": 0.283850606289051, + "grad_norm": 0.8137566447257996, + "learning_rate": 9.511565334395786e-05, + "loss": 1.1427, + "step": 44430 + }, + { + "epoch": 0.2839144934387897, + "grad_norm": 0.8825230598449707, + "learning_rate": 9.51134900863637e-05, + "loss": 0.8255, + "step": 44440 + }, + { + "epoch": 0.2839783805885284, + "grad_norm": 1.51393461227417, + "learning_rate": 9.511132637443765e-05, + "loss": 0.9725, + "step": 44450 + }, + { + "epoch": 0.28404226773826713, + "grad_norm": 1.1853009462356567, + "learning_rate": 9.510916220820152e-05, + "loss": 0.9037, + "step": 44460 + }, + { + "epoch": 0.28410615488800584, + "grad_norm": 1.493323802947998, + "learning_rate": 9.510699758767709e-05, + "loss": 0.9787, + "step": 44470 + }, + { + "epoch": 0.28417004203774454, + "grad_norm": 0.8198840022087097, + "learning_rate": 9.510483251288619e-05, + "loss": 0.8874, + "step": 44480 + }, + { + "epoch": 0.28423392918748325, + "grad_norm": 0.7507383227348328, + "learning_rate": 9.51026669838506e-05, + "loss": 1.0607, + "step": 44490 + }, + { + "epoch": 0.28429781633722195, + "grad_norm": 0.698621928691864, + "learning_rate": 9.510050100059214e-05, + "loss": 0.9481, + "step": 44500 + }, + { + "epoch": 0.28436170348696066, + "grad_norm": 1.1146284341812134, + "learning_rate": 9.50983345631326e-05, + "loss": 1.0293, + "step": 44510 + }, + { + "epoch": 0.2844255906366993, + "grad_norm": 0.7054140567779541, + "learning_rate": 9.509616767149383e-05, + "loss": 0.8634, + "step": 44520 + }, + { + "epoch": 0.284489477786438, + "grad_norm": 0.7089869976043701, + "learning_rate": 9.509400032569763e-05, + "loss": 1.0544, + "step": 44530 + }, + { + "epoch": 0.2845533649361767, + "grad_norm": 0.817032516002655, + "learning_rate": 9.509183252576583e-05, + "loss": 0.8768, + "step": 44540 + }, + { + "epoch": 0.28461725208591543, + "grad_norm": 0.9883630275726318, + "learning_rate": 9.508966427172028e-05, + "loss": 0.8952, + "step": 44550 + }, + { + "epoch": 0.28468113923565413, + "grad_norm": 0.8523919582366943, + "learning_rate": 9.50874955635828e-05, + "loss": 0.9583, + "step": 44560 + }, + { + "epoch": 
0.28474502638539284, + "grad_norm": 0.5503199696540833, + "learning_rate": 9.508532640137522e-05, + "loss": 0.986, + "step": 44570 + }, + { + "epoch": 0.28480891353513155, + "grad_norm": 0.6643738150596619, + "learning_rate": 9.50831567851194e-05, + "loss": 0.9774, + "step": 44580 + }, + { + "epoch": 0.28487280068487025, + "grad_norm": 0.8332018256187439, + "learning_rate": 9.50809867148372e-05, + "loss": 0.8159, + "step": 44590 + }, + { + "epoch": 0.28493668783460896, + "grad_norm": 0.8186637759208679, + "learning_rate": 9.507881619055046e-05, + "loss": 0.8421, + "step": 44600 + }, + { + "epoch": 0.28500057498434767, + "grad_norm": 0.5218867659568787, + "learning_rate": 9.507664521228106e-05, + "loss": 0.6236, + "step": 44610 + }, + { + "epoch": 0.28506446213408637, + "grad_norm": 0.8833245038986206, + "learning_rate": 9.507447378005083e-05, + "loss": 0.7893, + "step": 44620 + }, + { + "epoch": 0.2851283492838251, + "grad_norm": 0.9100516438484192, + "learning_rate": 9.507230189388164e-05, + "loss": 0.8338, + "step": 44630 + }, + { + "epoch": 0.28519223643356373, + "grad_norm": 1.299561858177185, + "learning_rate": 9.50701295537954e-05, + "loss": 0.7506, + "step": 44640 + }, + { + "epoch": 0.28525612358330243, + "grad_norm": 0.8499222993850708, + "learning_rate": 9.506795675981394e-05, + "loss": 0.8838, + "step": 44650 + }, + { + "epoch": 0.28532001073304114, + "grad_norm": 0.7215855121612549, + "learning_rate": 9.506578351195918e-05, + "loss": 1.1284, + "step": 44660 + }, + { + "epoch": 0.28538389788277985, + "grad_norm": 0.9105709791183472, + "learning_rate": 9.5063609810253e-05, + "loss": 0.7879, + "step": 44670 + }, + { + "epoch": 0.28544778503251855, + "grad_norm": 1.2917571067810059, + "learning_rate": 9.506165309069255e-05, + "loss": 1.0344, + "step": 44680 + }, + { + "epoch": 0.28551167218225726, + "grad_norm": 0.7351425886154175, + "learning_rate": 9.505947852672896e-05, + "loss": 0.9027, + "step": 44690 + }, + { + "epoch": 0.28557555933199597, + "grad_norm": 1.4496943950653076, + "learning_rate": 9.505730350897745e-05, + "loss": 0.7425, + "step": 44700 + }, + { + "epoch": 0.28563944648173467, + "grad_norm": 0.9381955862045288, + "learning_rate": 9.505512803745991e-05, + "loss": 1.0261, + "step": 44710 + }, + { + "epoch": 0.2857033336314734, + "grad_norm": 1.0335243940353394, + "learning_rate": 9.505295211219824e-05, + "loss": 1.0252, + "step": 44720 + }, + { + "epoch": 0.2857672207812121, + "grad_norm": 0.9310586452484131, + "learning_rate": 9.505077573321438e-05, + "loss": 0.947, + "step": 44730 + }, + { + "epoch": 0.2858311079309508, + "grad_norm": 1.818731665611267, + "learning_rate": 9.504859890053023e-05, + "loss": 0.9851, + "step": 44740 + }, + { + "epoch": 0.2858949950806895, + "grad_norm": 2.0556581020355225, + "learning_rate": 9.504642161416773e-05, + "loss": 0.7945, + "step": 44750 + }, + { + "epoch": 0.28595888223042815, + "grad_norm": 0.9168753027915955, + "learning_rate": 9.504424387414876e-05, + "loss": 0.7114, + "step": 44760 + }, + { + "epoch": 0.28602276938016685, + "grad_norm": 0.7781484723091125, + "learning_rate": 9.504206568049532e-05, + "loss": 0.8891, + "step": 44770 + }, + { + "epoch": 0.28608665652990556, + "grad_norm": 2.3256382942199707, + "learning_rate": 9.503988703322928e-05, + "loss": 0.8655, + "step": 44780 + }, + { + "epoch": 0.28615054367964426, + "grad_norm": 0.826259970664978, + "learning_rate": 9.503770793237263e-05, + "loss": 0.9931, + "step": 44790 + }, + { + "epoch": 0.28621443082938297, + "grad_norm": 1.0867620706558228, + 
"learning_rate": 9.50355283779473e-05, + "loss": 0.9931, + "step": 44800 + }, + { + "epoch": 0.2862783179791217, + "grad_norm": 0.6833561658859253, + "learning_rate": 9.503334836997524e-05, + "loss": 0.7724, + "step": 44810 + }, + { + "epoch": 0.2863422051288604, + "grad_norm": 0.8544519543647766, + "learning_rate": 9.503116790847839e-05, + "loss": 0.8207, + "step": 44820 + }, + { + "epoch": 0.2864060922785991, + "grad_norm": 0.5061067342758179, + "learning_rate": 9.502898699347873e-05, + "loss": 0.8357, + "step": 44830 + }, + { + "epoch": 0.2864699794283378, + "grad_norm": 0.91792231798172, + "learning_rate": 9.502680562499821e-05, + "loss": 1.0274, + "step": 44840 + }, + { + "epoch": 0.2865338665780765, + "grad_norm": 0.8766928911209106, + "learning_rate": 9.502462380305881e-05, + "loss": 0.8878, + "step": 44850 + }, + { + "epoch": 0.2865977537278152, + "grad_norm": 0.5164894461631775, + "learning_rate": 9.50224415276825e-05, + "loss": 0.961, + "step": 44860 + }, + { + "epoch": 0.2866616408775539, + "grad_norm": 0.7407921552658081, + "learning_rate": 9.502025879889125e-05, + "loss": 0.8303, + "step": 44870 + }, + { + "epoch": 0.28672552802729256, + "grad_norm": 0.8378937244415283, + "learning_rate": 9.501807561670703e-05, + "loss": 0.831, + "step": 44880 + }, + { + "epoch": 0.28678941517703127, + "grad_norm": 2.5379931926727295, + "learning_rate": 9.501589198115186e-05, + "loss": 0.8583, + "step": 44890 + }, + { + "epoch": 0.28685330232677, + "grad_norm": 0.8692481517791748, + "learning_rate": 9.501370789224772e-05, + "loss": 0.8469, + "step": 44900 + }, + { + "epoch": 0.2869171894765087, + "grad_norm": 0.7590452432632446, + "learning_rate": 9.501152335001658e-05, + "loss": 0.7567, + "step": 44910 + }, + { + "epoch": 0.2869810766262474, + "grad_norm": 0.6347168684005737, + "learning_rate": 9.500933835448047e-05, + "loss": 0.898, + "step": 44920 + }, + { + "epoch": 0.2870449637759861, + "grad_norm": 0.975532054901123, + "learning_rate": 9.500715290566138e-05, + "loss": 1.1958, + "step": 44930 + }, + { + "epoch": 0.2871088509257248, + "grad_norm": 0.670184850692749, + "learning_rate": 9.500496700358132e-05, + "loss": 0.9185, + "step": 44940 + }, + { + "epoch": 0.2871727380754635, + "grad_norm": 0.5493016839027405, + "learning_rate": 9.500278064826232e-05, + "loss": 0.9177, + "step": 44950 + }, + { + "epoch": 0.2872366252252022, + "grad_norm": 1.360520839691162, + "learning_rate": 9.500059383972638e-05, + "loss": 0.9026, + "step": 44960 + }, + { + "epoch": 0.2873005123749409, + "grad_norm": 0.6873490214347839, + "learning_rate": 9.499840657799553e-05, + "loss": 0.7064, + "step": 44970 + }, + { + "epoch": 0.2873643995246796, + "grad_norm": 0.814471423625946, + "learning_rate": 9.49962188630918e-05, + "loss": 0.8996, + "step": 44980 + }, + { + "epoch": 0.28742828667441833, + "grad_norm": 0.7156900763511658, + "learning_rate": 9.49940306950372e-05, + "loss": 0.841, + "step": 44990 + }, + { + "epoch": 0.287492173824157, + "grad_norm": 0.6915486454963684, + "learning_rate": 9.499184207385381e-05, + "loss": 0.8996, + "step": 45000 + }, + { + "epoch": 0.2875560609738957, + "grad_norm": 1.0259060859680176, + "learning_rate": 9.498965299956364e-05, + "loss": 0.9954, + "step": 45010 + }, + { + "epoch": 0.2876199481236344, + "grad_norm": 0.5235810875892639, + "learning_rate": 9.498746347218873e-05, + "loss": 1.1643, + "step": 45020 + }, + { + "epoch": 0.2876838352733731, + "grad_norm": 0.7001626491546631, + "learning_rate": 9.498527349175115e-05, + "loss": 1.0269, + "step": 45030 + }, + { + "epoch": 
0.2877477224231118, + "grad_norm": 1.0902423858642578, + "learning_rate": 9.498308305827294e-05, + "loss": 0.9768, + "step": 45040 + }, + { + "epoch": 0.2878116095728505, + "grad_norm": 0.6482483744621277, + "learning_rate": 9.49808921717762e-05, + "loss": 1.0191, + "step": 45050 + }, + { + "epoch": 0.2878754967225892, + "grad_norm": 1.0290073156356812, + "learning_rate": 9.497870083228292e-05, + "loss": 1.0096, + "step": 45060 + }, + { + "epoch": 0.2879393838723279, + "grad_norm": 0.8370404243469238, + "learning_rate": 9.497650903981524e-05, + "loss": 1.0161, + "step": 45070 + }, + { + "epoch": 0.28800327102206663, + "grad_norm": 0.8315509557723999, + "learning_rate": 9.497431679439519e-05, + "loss": 0.7909, + "step": 45080 + }, + { + "epoch": 0.28806715817180534, + "grad_norm": 0.7571452856063843, + "learning_rate": 9.497212409604487e-05, + "loss": 0.9372, + "step": 45090 + }, + { + "epoch": 0.28813104532154404, + "grad_norm": 0.9375543594360352, + "learning_rate": 9.496993094478634e-05, + "loss": 0.6588, + "step": 45100 + }, + { + "epoch": 0.28819493247128275, + "grad_norm": 0.8192710876464844, + "learning_rate": 9.496773734064171e-05, + "loss": 0.9545, + "step": 45110 + }, + { + "epoch": 0.2882588196210214, + "grad_norm": 0.8890470862388611, + "learning_rate": 9.496554328363307e-05, + "loss": 0.9824, + "step": 45120 + }, + { + "epoch": 0.2883227067707601, + "grad_norm": 0.8460478186607361, + "learning_rate": 9.49633487737825e-05, + "loss": 0.66, + "step": 45130 + }, + { + "epoch": 0.2883865939204988, + "grad_norm": 1.1381182670593262, + "learning_rate": 9.496115381111211e-05, + "loss": 0.788, + "step": 45140 + }, + { + "epoch": 0.2884504810702375, + "grad_norm": 0.7069154977798462, + "learning_rate": 9.495895839564401e-05, + "loss": 0.7456, + "step": 45150 + }, + { + "epoch": 0.2885143682199762, + "grad_norm": 0.9826921820640564, + "learning_rate": 9.495676252740029e-05, + "loss": 1.0517, + "step": 45160 + }, + { + "epoch": 0.28857825536971493, + "grad_norm": 0.9633061289787292, + "learning_rate": 9.495456620640308e-05, + "loss": 1.0595, + "step": 45170 + }, + { + "epoch": 0.28864214251945364, + "grad_norm": 0.8172567486763, + "learning_rate": 9.495236943267451e-05, + "loss": 0.676, + "step": 45180 + }, + { + "epoch": 0.28870602966919234, + "grad_norm": 0.9763637185096741, + "learning_rate": 9.495017220623669e-05, + "loss": 0.7928, + "step": 45190 + }, + { + "epoch": 0.28876991681893105, + "grad_norm": 1.778890609741211, + "learning_rate": 9.494797452711174e-05, + "loss": 0.699, + "step": 45200 + }, + { + "epoch": 0.28883380396866976, + "grad_norm": 0.7678098678588867, + "learning_rate": 9.49457763953218e-05, + "loss": 0.8654, + "step": 45210 + }, + { + "epoch": 0.28889769111840846, + "grad_norm": 0.7432067394256592, + "learning_rate": 9.494357781088901e-05, + "loss": 0.7378, + "step": 45220 + }, + { + "epoch": 0.28896157826814717, + "grad_norm": 0.9834187030792236, + "learning_rate": 9.494137877383551e-05, + "loss": 0.9317, + "step": 45230 + }, + { + "epoch": 0.2890254654178859, + "grad_norm": 0.6653081178665161, + "learning_rate": 9.493917928418345e-05, + "loss": 0.7968, + "step": 45240 + }, + { + "epoch": 0.2890893525676245, + "grad_norm": 0.7635434865951538, + "learning_rate": 9.493697934195499e-05, + "loss": 0.8611, + "step": 45250 + }, + { + "epoch": 0.28915323971736323, + "grad_norm": 0.9517902135848999, + "learning_rate": 9.493477894717224e-05, + "loss": 0.7536, + "step": 45260 + }, + { + "epoch": 0.28921712686710194, + "grad_norm": 0.7886881232261658, + "learning_rate": 
9.49325780998574e-05, + "loss": 0.9113, + "step": 45270 + }, + { + "epoch": 0.28928101401684064, + "grad_norm": 0.7776336669921875, + "learning_rate": 9.493037680003264e-05, + "loss": 0.8193, + "step": 45280 + }, + { + "epoch": 0.28934490116657935, + "grad_norm": 0.8685764670372009, + "learning_rate": 9.492817504772012e-05, + "loss": 0.9521, + "step": 45290 + }, + { + "epoch": 0.28940878831631806, + "grad_norm": 1.247955322265625, + "learning_rate": 9.492597284294198e-05, + "loss": 0.9216, + "step": 45300 + }, + { + "epoch": 0.28947267546605676, + "grad_norm": 0.9125822186470032, + "learning_rate": 9.492377018572046e-05, + "loss": 1.003, + "step": 45310 + }, + { + "epoch": 0.28953656261579547, + "grad_norm": 0.7521454095840454, + "learning_rate": 9.492156707607769e-05, + "loss": 0.806, + "step": 45320 + }, + { + "epoch": 0.2896004497655342, + "grad_norm": 0.8048921823501587, + "learning_rate": 9.491936351403588e-05, + "loss": 1.0567, + "step": 45330 + }, + { + "epoch": 0.2896643369152729, + "grad_norm": 0.6293105483055115, + "learning_rate": 9.491715949961721e-05, + "loss": 0.6753, + "step": 45340 + }, + { + "epoch": 0.2897282240650116, + "grad_norm": 0.7665662169456482, + "learning_rate": 9.491495503284391e-05, + "loss": 1.1162, + "step": 45350 + }, + { + "epoch": 0.2897921112147503, + "grad_norm": 1.389918327331543, + "learning_rate": 9.491275011373813e-05, + "loss": 0.8464, + "step": 45360 + }, + { + "epoch": 0.28985599836448894, + "grad_norm": 0.5303570628166199, + "learning_rate": 9.491054474232212e-05, + "loss": 0.8697, + "step": 45370 + }, + { + "epoch": 0.28991988551422765, + "grad_norm": 0.7896818518638611, + "learning_rate": 9.490833891861806e-05, + "loss": 0.8274, + "step": 45380 + }, + { + "epoch": 0.28998377266396635, + "grad_norm": 1.085740566253662, + "learning_rate": 9.490613264264818e-05, + "loss": 1.2138, + "step": 45390 + }, + { + "epoch": 0.29004765981370506, + "grad_norm": 0.9836480617523193, + "learning_rate": 9.490392591443469e-05, + "loss": 1.0133, + "step": 45400 + }, + { + "epoch": 0.29011154696344377, + "grad_norm": 1.2857035398483276, + "learning_rate": 9.490171873399982e-05, + "loss": 0.7627, + "step": 45410 + }, + { + "epoch": 0.2901754341131825, + "grad_norm": 0.7839180827140808, + "learning_rate": 9.489951110136581e-05, + "loss": 0.8626, + "step": 45420 + }, + { + "epoch": 0.2902393212629212, + "grad_norm": 0.6946144104003906, + "learning_rate": 9.489730301655486e-05, + "loss": 0.8252, + "step": 45430 + }, + { + "epoch": 0.2903032084126599, + "grad_norm": 0.7816669344902039, + "learning_rate": 9.489509447958924e-05, + "loss": 0.8103, + "step": 45440 + }, + { + "epoch": 0.2903670955623986, + "grad_norm": 1.0374782085418701, + "learning_rate": 9.489288549049118e-05, + "loss": 0.9284, + "step": 45450 + }, + { + "epoch": 0.2904309827121373, + "grad_norm": 1.0329042673110962, + "learning_rate": 9.489067604928293e-05, + "loss": 1.0252, + "step": 45460 + }, + { + "epoch": 0.290494869861876, + "grad_norm": 1.062635898590088, + "learning_rate": 9.488846615598671e-05, + "loss": 0.8388, + "step": 45470 + }, + { + "epoch": 0.2905587570116147, + "grad_norm": 1.2873570919036865, + "learning_rate": 9.488625581062483e-05, + "loss": 0.8721, + "step": 45480 + }, + { + "epoch": 0.29062264416135336, + "grad_norm": 1.4806243181228638, + "learning_rate": 9.48840450132195e-05, + "loss": 1.1295, + "step": 45490 + }, + { + "epoch": 0.29068653131109207, + "grad_norm": 0.9083405137062073, + "learning_rate": 9.488183376379302e-05, + "loss": 0.782, + "step": 45500 + }, + { + "epoch": 
0.2907504184608308, + "grad_norm": 0.43349987268447876, + "learning_rate": 9.487962206236765e-05, + "loss": 0.9368, + "step": 45510 + }, + { + "epoch": 0.2908143056105695, + "grad_norm": 0.6599463224411011, + "learning_rate": 9.487740990896564e-05, + "loss": 0.7841, + "step": 45520 + }, + { + "epoch": 0.2908781927603082, + "grad_norm": 0.6311991810798645, + "learning_rate": 9.48751973036093e-05, + "loss": 0.9138, + "step": 45530 + }, + { + "epoch": 0.2909420799100469, + "grad_norm": 0.5348168015480042, + "learning_rate": 9.487298424632089e-05, + "loss": 0.7043, + "step": 45540 + }, + { + "epoch": 0.2910059670597856, + "grad_norm": 0.8502787947654724, + "learning_rate": 9.487077073712273e-05, + "loss": 1.0872, + "step": 45550 + }, + { + "epoch": 0.2910698542095243, + "grad_norm": 0.8174751400947571, + "learning_rate": 9.486855677603707e-05, + "loss": 0.9294, + "step": 45560 + }, + { + "epoch": 0.291133741359263, + "grad_norm": 0.7692357897758484, + "learning_rate": 9.486634236308624e-05, + "loss": 0.9752, + "step": 45570 + }, + { + "epoch": 0.2911976285090017, + "grad_norm": 0.43835484981536865, + "learning_rate": 9.486412749829251e-05, + "loss": 0.8376, + "step": 45580 + }, + { + "epoch": 0.2912615156587404, + "grad_norm": 1.0766656398773193, + "learning_rate": 9.486191218167823e-05, + "loss": 0.95, + "step": 45590 + }, + { + "epoch": 0.29132540280847913, + "grad_norm": 0.7746816277503967, + "learning_rate": 9.485969641326566e-05, + "loss": 1.114, + "step": 45600 + }, + { + "epoch": 0.2913892899582178, + "grad_norm": 0.8561303019523621, + "learning_rate": 9.485748019307716e-05, + "loss": 1.125, + "step": 45610 + }, + { + "epoch": 0.2914531771079565, + "grad_norm": 0.7320988774299622, + "learning_rate": 9.4855263521135e-05, + "loss": 0.8814, + "step": 45620 + }, + { + "epoch": 0.2915170642576952, + "grad_norm": 0.9033998847007751, + "learning_rate": 9.485304639746155e-05, + "loss": 0.8253, + "step": 45630 + }, + { + "epoch": 0.2915809514074339, + "grad_norm": 1.71013605594635, + "learning_rate": 9.485082882207911e-05, + "loss": 0.632, + "step": 45640 + }, + { + "epoch": 0.2916448385571726, + "grad_norm": 0.5615208148956299, + "learning_rate": 9.484861079501003e-05, + "loss": 0.7828, + "step": 45650 + }, + { + "epoch": 0.2917087257069113, + "grad_norm": 1.236778974533081, + "learning_rate": 9.484639231627664e-05, + "loss": 0.9695, + "step": 45660 + }, + { + "epoch": 0.29177261285665, + "grad_norm": 0.9685949683189392, + "learning_rate": 9.484417338590127e-05, + "loss": 0.6323, + "step": 45670 + }, + { + "epoch": 0.2918365000063887, + "grad_norm": 1.1921647787094116, + "learning_rate": 9.484195400390629e-05, + "loss": 1.1795, + "step": 45680 + }, + { + "epoch": 0.2919003871561274, + "grad_norm": 1.2076773643493652, + "learning_rate": 9.483973417031404e-05, + "loss": 0.9146, + "step": 45690 + }, + { + "epoch": 0.29196427430586613, + "grad_norm": 1.2398484945297241, + "learning_rate": 9.483751388514685e-05, + "loss": 0.9077, + "step": 45700 + }, + { + "epoch": 0.29202816145560484, + "grad_norm": 0.9221826195716858, + "learning_rate": 9.483529314842715e-05, + "loss": 0.6419, + "step": 45710 + }, + { + "epoch": 0.29209204860534355, + "grad_norm": 0.9597351551055908, + "learning_rate": 9.483307196017722e-05, + "loss": 0.8304, + "step": 45720 + }, + { + "epoch": 0.2921559357550822, + "grad_norm": 0.8210084438323975, + "learning_rate": 9.483085032041949e-05, + "loss": 0.5964, + "step": 45730 + }, + { + "epoch": 0.2922198229048209, + "grad_norm": 0.8063592910766602, + "learning_rate": 
9.48286282291763e-05, + "loss": 1.1034, + "step": 45740 + }, + { + "epoch": 0.2922837100545596, + "grad_norm": 1.1515041589736938, + "learning_rate": 9.482640568647006e-05, + "loss": 1.1599, + "step": 45750 + }, + { + "epoch": 0.2923475972042983, + "grad_norm": 0.5701981782913208, + "learning_rate": 9.482418269232311e-05, + "loss": 0.8986, + "step": 45760 + }, + { + "epoch": 0.292411484354037, + "grad_norm": 1.5599360466003418, + "learning_rate": 9.482195924675789e-05, + "loss": 1.0177, + "step": 45770 + }, + { + "epoch": 0.2924753715037757, + "grad_norm": 0.7682929635047913, + "learning_rate": 9.481973534979674e-05, + "loss": 1.104, + "step": 45780 + }, + { + "epoch": 0.29253925865351443, + "grad_norm": 0.5851641297340393, + "learning_rate": 9.481751100146209e-05, + "loss": 1.0031, + "step": 45790 + }, + { + "epoch": 0.29260314580325314, + "grad_norm": 0.8387541174888611, + "learning_rate": 9.481528620177633e-05, + "loss": 0.868, + "step": 45800 + }, + { + "epoch": 0.29266703295299185, + "grad_norm": 0.9967763423919678, + "learning_rate": 9.481306095076188e-05, + "loss": 0.7826, + "step": 45810 + }, + { + "epoch": 0.29273092010273055, + "grad_norm": 0.5963412523269653, + "learning_rate": 9.481083524844113e-05, + "loss": 0.8424, + "step": 45820 + }, + { + "epoch": 0.29279480725246926, + "grad_norm": 0.5736973285675049, + "learning_rate": 9.480860909483649e-05, + "loss": 0.8342, + "step": 45830 + }, + { + "epoch": 0.29285869440220796, + "grad_norm": 1.3335403203964233, + "learning_rate": 9.480638248997039e-05, + "loss": 1.086, + "step": 45840 + }, + { + "epoch": 0.2929225815519466, + "grad_norm": 0.5996566414833069, + "learning_rate": 9.480415543386528e-05, + "loss": 0.8788, + "step": 45850 + }, + { + "epoch": 0.2929864687016853, + "grad_norm": 0.8202914595603943, + "learning_rate": 9.480192792654355e-05, + "loss": 0.9448, + "step": 45860 + }, + { + "epoch": 0.293050355851424, + "grad_norm": 1.0648213624954224, + "learning_rate": 9.479969996802763e-05, + "loss": 1.1049, + "step": 45870 + }, + { + "epoch": 0.29311424300116273, + "grad_norm": 0.8735106587409973, + "learning_rate": 9.479747155833999e-05, + "loss": 0.7924, + "step": 45880 + }, + { + "epoch": 0.29317813015090144, + "grad_norm": 0.7611486315727234, + "learning_rate": 9.479524269750306e-05, + "loss": 0.7604, + "step": 45890 + }, + { + "epoch": 0.29324201730064015, + "grad_norm": 1.0947805643081665, + "learning_rate": 9.479301338553927e-05, + "loss": 0.8645, + "step": 45900 + }, + { + "epoch": 0.29330590445037885, + "grad_norm": 1.0103657245635986, + "learning_rate": 9.479078362247109e-05, + "loss": 1.0796, + "step": 45910 + }, + { + "epoch": 0.29336979160011756, + "grad_norm": 0.4850558042526245, + "learning_rate": 9.478855340832097e-05, + "loss": 0.9143, + "step": 45920 + }, + { + "epoch": 0.29343367874985626, + "grad_norm": 1.094140648841858, + "learning_rate": 9.478632274311137e-05, + "loss": 0.8126, + "step": 45930 + }, + { + "epoch": 0.29349756589959497, + "grad_norm": 0.43484973907470703, + "learning_rate": 9.478409162686475e-05, + "loss": 0.8784, + "step": 45940 + }, + { + "epoch": 0.2935614530493337, + "grad_norm": 0.7616863250732422, + "learning_rate": 9.478186005960359e-05, + "loss": 0.9753, + "step": 45950 + }, + { + "epoch": 0.2936253401990724, + "grad_norm": 0.625148594379425, + "learning_rate": 9.477962804135037e-05, + "loss": 0.7565, + "step": 45960 + }, + { + "epoch": 0.29368922734881103, + "grad_norm": 0.5480995178222656, + "learning_rate": 9.477739557212753e-05, + "loss": 0.9026, + "step": 45970 + }, + { + 
"epoch": 0.29375311449854974, + "grad_norm": 2.2931747436523438, + "learning_rate": 9.477516265195759e-05, + "loss": 0.7589, + "step": 45980 + }, + { + "epoch": 0.29381700164828845, + "grad_norm": 0.734191358089447, + "learning_rate": 9.477292928086303e-05, + "loss": 0.9262, + "step": 45990 + }, + { + "epoch": 0.29388088879802715, + "grad_norm": 0.8451043367385864, + "learning_rate": 9.477069545886633e-05, + "loss": 0.9716, + "step": 46000 + }, + { + "epoch": 0.29394477594776586, + "grad_norm": 0.7809146046638489, + "learning_rate": 9.476846118599e-05, + "loss": 1.0449, + "step": 46010 + }, + { + "epoch": 0.29400866309750456, + "grad_norm": 0.930077314376831, + "learning_rate": 9.476622646225653e-05, + "loss": 0.7448, + "step": 46020 + }, + { + "epoch": 0.29407255024724327, + "grad_norm": 0.7774382829666138, + "learning_rate": 9.476399128768845e-05, + "loss": 0.9692, + "step": 46030 + }, + { + "epoch": 0.294136437396982, + "grad_norm": 0.678877592086792, + "learning_rate": 9.476175566230822e-05, + "loss": 0.8851, + "step": 46040 + }, + { + "epoch": 0.2942003245467207, + "grad_norm": 0.7753483653068542, + "learning_rate": 9.475951958613842e-05, + "loss": 0.7935, + "step": 46050 + }, + { + "epoch": 0.2942642116964594, + "grad_norm": 1.3318039178848267, + "learning_rate": 9.475728305920151e-05, + "loss": 0.9516, + "step": 46060 + }, + { + "epoch": 0.2943280988461981, + "grad_norm": 3.3444128036499023, + "learning_rate": 9.475504608152005e-05, + "loss": 0.9883, + "step": 46070 + }, + { + "epoch": 0.2943919859959368, + "grad_norm": 1.5764853954315186, + "learning_rate": 9.475280865311656e-05, + "loss": 0.8666, + "step": 46080 + }, + { + "epoch": 0.2944558731456755, + "grad_norm": 1.4029241800308228, + "learning_rate": 9.475057077401356e-05, + "loss": 0.9805, + "step": 46090 + }, + { + "epoch": 0.29451976029541416, + "grad_norm": 0.5334384441375732, + "learning_rate": 9.47483324442336e-05, + "loss": 0.8014, + "step": 46100 + }, + { + "epoch": 0.29458364744515286, + "grad_norm": 0.8421732783317566, + "learning_rate": 9.474609366379923e-05, + "loss": 0.8272, + "step": 46110 + }, + { + "epoch": 0.29464753459489157, + "grad_norm": 0.5735695362091064, + "learning_rate": 9.474385443273296e-05, + "loss": 0.9271, + "step": 46120 + }, + { + "epoch": 0.2947114217446303, + "grad_norm": 0.8526939749717712, + "learning_rate": 9.47416147510574e-05, + "loss": 0.9938, + "step": 46130 + }, + { + "epoch": 0.294775308894369, + "grad_norm": 0.6834962964057922, + "learning_rate": 9.473937461879505e-05, + "loss": 1.1168, + "step": 46140 + }, + { + "epoch": 0.2948391960441077, + "grad_norm": 1.1148639917373657, + "learning_rate": 9.47371340359685e-05, + "loss": 0.9541, + "step": 46150 + }, + { + "epoch": 0.2949030831938464, + "grad_norm": 0.8598116040229797, + "learning_rate": 9.47348930026003e-05, + "loss": 0.9374, + "step": 46160 + }, + { + "epoch": 0.2949669703435851, + "grad_norm": 0.6423646211624146, + "learning_rate": 9.473265151871304e-05, + "loss": 0.8231, + "step": 46170 + }, + { + "epoch": 0.2950308574933238, + "grad_norm": 2.0000832080841064, + "learning_rate": 9.473040958432927e-05, + "loss": 0.8936, + "step": 46180 + }, + { + "epoch": 0.2950947446430625, + "grad_norm": 1.143376350402832, + "learning_rate": 9.472816719947159e-05, + "loss": 0.6661, + "step": 46190 + }, + { + "epoch": 0.2951586317928012, + "grad_norm": 0.7327792048454285, + "learning_rate": 9.472592436416255e-05, + "loss": 0.8819, + "step": 46200 + }, + { + "epoch": 0.2952225189425399, + "grad_norm": 0.8125030994415283, + "learning_rate": 
9.472368107842477e-05, + "loss": 0.9795, + "step": 46210 + }, + { + "epoch": 0.2952864060922786, + "grad_norm": 0.8501039743423462, + "learning_rate": 9.472143734228083e-05, + "loss": 1.1246, + "step": 46220 + }, + { + "epoch": 0.2953502932420173, + "grad_norm": 0.4900776743888855, + "learning_rate": 9.471919315575333e-05, + "loss": 0.6896, + "step": 46230 + }, + { + "epoch": 0.295414180391756, + "grad_norm": 1.3538086414337158, + "learning_rate": 9.471694851886487e-05, + "loss": 0.961, + "step": 46240 + }, + { + "epoch": 0.2954780675414947, + "grad_norm": 0.9380719661712646, + "learning_rate": 9.471470343163804e-05, + "loss": 1.1836, + "step": 46250 + }, + { + "epoch": 0.2955419546912334, + "grad_norm": 0.9986345767974854, + "learning_rate": 9.471245789409548e-05, + "loss": 0.8949, + "step": 46260 + }, + { + "epoch": 0.2956058418409721, + "grad_norm": 0.35391414165496826, + "learning_rate": 9.471021190625977e-05, + "loss": 0.8161, + "step": 46270 + }, + { + "epoch": 0.2956697289907108, + "grad_norm": 0.7981874942779541, + "learning_rate": 9.470796546815354e-05, + "loss": 0.9282, + "step": 46280 + }, + { + "epoch": 0.2957336161404495, + "grad_norm": 0.6027029752731323, + "learning_rate": 9.470571857979945e-05, + "loss": 0.9214, + "step": 46290 + }, + { + "epoch": 0.2957975032901882, + "grad_norm": 0.973746657371521, + "learning_rate": 9.470347124122008e-05, + "loss": 0.8735, + "step": 46300 + }, + { + "epoch": 0.29586139043992693, + "grad_norm": 0.944004476070404, + "learning_rate": 9.470122345243809e-05, + "loss": 0.9898, + "step": 46310 + }, + { + "epoch": 0.29592527758966564, + "grad_norm": 0.9042976498603821, + "learning_rate": 9.469897521347609e-05, + "loss": 1.0455, + "step": 46320 + }, + { + "epoch": 0.29598916473940434, + "grad_norm": 0.7813184857368469, + "learning_rate": 9.469672652435675e-05, + "loss": 0.879, + "step": 46330 + }, + { + "epoch": 0.296053051889143, + "grad_norm": 1.1560348272323608, + "learning_rate": 9.469447738510269e-05, + "loss": 0.8168, + "step": 46340 + }, + { + "epoch": 0.2961169390388817, + "grad_norm": 0.8251795768737793, + "learning_rate": 9.46922277957366e-05, + "loss": 0.9762, + "step": 46350 + }, + { + "epoch": 0.2961808261886204, + "grad_norm": 1.086754560470581, + "learning_rate": 9.46899777562811e-05, + "loss": 0.9877, + "step": 46360 + }, + { + "epoch": 0.2962447133383591, + "grad_norm": 1.2580642700195312, + "learning_rate": 9.468772726675887e-05, + "loss": 0.808, + "step": 46370 + }, + { + "epoch": 0.2963086004880978, + "grad_norm": 0.946445107460022, + "learning_rate": 9.468547632719255e-05, + "loss": 0.7862, + "step": 46380 + }, + { + "epoch": 0.2963724876378365, + "grad_norm": 1.1934231519699097, + "learning_rate": 9.468322493760484e-05, + "loss": 1.1795, + "step": 46390 + }, + { + "epoch": 0.29643637478757523, + "grad_norm": 0.9049299955368042, + "learning_rate": 9.46809730980184e-05, + "loss": 0.8541, + "step": 46400 + }, + { + "epoch": 0.29650026193731394, + "grad_norm": 1.3336893320083618, + "learning_rate": 9.46787208084559e-05, + "loss": 1.0398, + "step": 46410 + }, + { + "epoch": 0.29656414908705264, + "grad_norm": 0.9916601181030273, + "learning_rate": 9.467646806894001e-05, + "loss": 0.8907, + "step": 46420 + }, + { + "epoch": 0.29662803623679135, + "grad_norm": 0.9866839051246643, + "learning_rate": 9.467421487949347e-05, + "loss": 1.1556, + "step": 46430 + }, + { + "epoch": 0.29669192338653005, + "grad_norm": 0.7323521971702576, + "learning_rate": 9.467196124013893e-05, + "loss": 0.9656, + "step": 46440 + }, + { + "epoch": 
0.29675581053626876, + "grad_norm": 1.1069689989089966, + "learning_rate": 9.466970715089907e-05, + "loss": 0.6297, + "step": 46450 + }, + { + "epoch": 0.2968196976860074, + "grad_norm": 0.5628019571304321, + "learning_rate": 9.466745261179664e-05, + "loss": 0.806, + "step": 46460 + }, + { + "epoch": 0.2968835848357461, + "grad_norm": 1.0032429695129395, + "learning_rate": 9.466519762285431e-05, + "loss": 0.9214, + "step": 46470 + }, + { + "epoch": 0.2969474719854848, + "grad_norm": 0.7182255983352661, + "learning_rate": 9.466294218409479e-05, + "loss": 0.9303, + "step": 46480 + }, + { + "epoch": 0.29701135913522353, + "grad_norm": 1.8324652910232544, + "learning_rate": 9.466068629554082e-05, + "loss": 1.1856, + "step": 46490 + }, + { + "epoch": 0.29707524628496224, + "grad_norm": 0.7303147315979004, + "learning_rate": 9.46584299572151e-05, + "loss": 0.7481, + "step": 46500 + }, + { + "epoch": 0.29713913343470094, + "grad_norm": 2.1872732639312744, + "learning_rate": 9.465617316914033e-05, + "loss": 0.7029, + "step": 46510 + }, + { + "epoch": 0.29720302058443965, + "grad_norm": 1.0133579969406128, + "learning_rate": 9.46539159313393e-05, + "loss": 0.8906, + "step": 46520 + }, + { + "epoch": 0.29726690773417835, + "grad_norm": 0.7005990147590637, + "learning_rate": 9.465165824383468e-05, + "loss": 0.725, + "step": 46530 + }, + { + "epoch": 0.29733079488391706, + "grad_norm": 1.0312696695327759, + "learning_rate": 9.464940010664925e-05, + "loss": 0.9634, + "step": 46540 + }, + { + "epoch": 0.29739468203365577, + "grad_norm": 0.36605343222618103, + "learning_rate": 9.464714151980571e-05, + "loss": 0.7082, + "step": 46550 + }, + { + "epoch": 0.2974585691833945, + "grad_norm": 1.1739599704742432, + "learning_rate": 9.464488248332685e-05, + "loss": 1.4234, + "step": 46560 + }, + { + "epoch": 0.2975224563331332, + "grad_norm": 0.8871263265609741, + "learning_rate": 9.464262299723539e-05, + "loss": 0.7826, + "step": 46570 + }, + { + "epoch": 0.29758634348287183, + "grad_norm": 0.652490496635437, + "learning_rate": 9.46403630615541e-05, + "loss": 0.9345, + "step": 46580 + }, + { + "epoch": 0.29765023063261054, + "grad_norm": 0.8714577555656433, + "learning_rate": 9.463810267630573e-05, + "loss": 1.096, + "step": 46590 + }, + { + "epoch": 0.29771411778234924, + "grad_norm": 0.48764970898628235, + "learning_rate": 9.463584184151305e-05, + "loss": 0.9833, + "step": 46600 + }, + { + "epoch": 0.29777800493208795, + "grad_norm": 0.5805774331092834, + "learning_rate": 9.463358055719883e-05, + "loss": 0.6249, + "step": 46610 + }, + { + "epoch": 0.29784189208182665, + "grad_norm": 1.5289901494979858, + "learning_rate": 9.463131882338583e-05, + "loss": 1.0411, + "step": 46620 + }, + { + "epoch": 0.29790577923156536, + "grad_norm": 0.9983165264129639, + "learning_rate": 9.462905664009685e-05, + "loss": 0.9297, + "step": 46630 + }, + { + "epoch": 0.29796966638130407, + "grad_norm": 0.5943264961242676, + "learning_rate": 9.462679400735466e-05, + "loss": 1.0207, + "step": 46640 + }, + { + "epoch": 0.29803355353104277, + "grad_norm": 0.9419231414794922, + "learning_rate": 9.462453092518204e-05, + "loss": 0.8072, + "step": 46650 + }, + { + "epoch": 0.2980974406807815, + "grad_norm": 0.6155195832252502, + "learning_rate": 9.46222673936018e-05, + "loss": 0.7672, + "step": 46660 + }, + { + "epoch": 0.2981613278305202, + "grad_norm": 0.5580214858055115, + "learning_rate": 9.462000341263671e-05, + "loss": 0.9204, + "step": 46670 + }, + { + "epoch": 0.2982252149802589, + "grad_norm": 1.5612927675247192, + 
"learning_rate": 9.46177389823096e-05, + "loss": 1.2065, + "step": 46680 + }, + { + "epoch": 0.2982891021299976, + "grad_norm": 1.112136960029602, + "learning_rate": 9.461547410264324e-05, + "loss": 0.7437, + "step": 46690 + }, + { + "epoch": 0.29835298927973625, + "grad_norm": 0.646933913230896, + "learning_rate": 9.461320877366047e-05, + "loss": 0.8512, + "step": 46700 + }, + { + "epoch": 0.29841687642947495, + "grad_norm": 0.7455711960792542, + "learning_rate": 9.461094299538408e-05, + "loss": 1.0632, + "step": 46710 + }, + { + "epoch": 0.29848076357921366, + "grad_norm": 1.7939437627792358, + "learning_rate": 9.460867676783691e-05, + "loss": 0.9103, + "step": 46720 + }, + { + "epoch": 0.29854465072895237, + "grad_norm": 0.8458738923072815, + "learning_rate": 9.460641009104177e-05, + "loss": 0.8318, + "step": 46730 + }, + { + "epoch": 0.29860853787869107, + "grad_norm": 1.1365669965744019, + "learning_rate": 9.460414296502149e-05, + "loss": 0.94, + "step": 46740 + }, + { + "epoch": 0.2986724250284298, + "grad_norm": 0.8920236229896545, + "learning_rate": 9.46018753897989e-05, + "loss": 0.8648, + "step": 46750 + }, + { + "epoch": 0.2987363121781685, + "grad_norm": 1.0041251182556152, + "learning_rate": 9.459960736539683e-05, + "loss": 0.6963, + "step": 46760 + }, + { + "epoch": 0.2988001993279072, + "grad_norm": 0.6039364337921143, + "learning_rate": 9.459733889183815e-05, + "loss": 0.8719, + "step": 46770 + }, + { + "epoch": 0.2988640864776459, + "grad_norm": 1.0676556825637817, + "learning_rate": 9.459506996914568e-05, + "loss": 0.7705, + "step": 46780 + }, + { + "epoch": 0.2989279736273846, + "grad_norm": 1.1080639362335205, + "learning_rate": 9.459280059734226e-05, + "loss": 0.9965, + "step": 46790 + }, + { + "epoch": 0.2989918607771233, + "grad_norm": 1.0551854372024536, + "learning_rate": 9.459053077645077e-05, + "loss": 0.8556, + "step": 46800 + }, + { + "epoch": 0.299055747926862, + "grad_norm": 0.8783060908317566, + "learning_rate": 9.458826050649407e-05, + "loss": 1.329, + "step": 46810 + }, + { + "epoch": 0.29911963507660067, + "grad_norm": 0.8185521960258484, + "learning_rate": 9.4585989787495e-05, + "loss": 1.0131, + "step": 46820 + }, + { + "epoch": 0.29918352222633937, + "grad_norm": 0.5932868719100952, + "learning_rate": 9.458371861947645e-05, + "loss": 1.0617, + "step": 46830 + }, + { + "epoch": 0.2992474093760781, + "grad_norm": 1.5922162532806396, + "learning_rate": 9.458144700246127e-05, + "loss": 0.8565, + "step": 46840 + }, + { + "epoch": 0.2993112965258168, + "grad_norm": 0.603920042514801, + "learning_rate": 9.457917493647235e-05, + "loss": 0.7725, + "step": 46850 + }, + { + "epoch": 0.2993751836755555, + "grad_norm": 0.9906972646713257, + "learning_rate": 9.457690242153258e-05, + "loss": 1.0442, + "step": 46860 + }, + { + "epoch": 0.2994390708252942, + "grad_norm": 0.756675124168396, + "learning_rate": 9.457462945766484e-05, + "loss": 0.8007, + "step": 46870 + }, + { + "epoch": 0.2995029579750329, + "grad_norm": 0.6027681827545166, + "learning_rate": 9.4572356044892e-05, + "loss": 1.1144, + "step": 46880 + }, + { + "epoch": 0.2995668451247716, + "grad_norm": 0.542198896408081, + "learning_rate": 9.457008218323699e-05, + "loss": 0.9496, + "step": 46890 + }, + { + "epoch": 0.2996307322745103, + "grad_norm": 1.004642367362976, + "learning_rate": 9.45678078727227e-05, + "loss": 0.8916, + "step": 46900 + }, + { + "epoch": 0.299694619424249, + "grad_norm": 0.5443822741508484, + "learning_rate": 9.456553311337202e-05, + "loss": 1.0492, + "step": 46910 + }, + { + 
"epoch": 0.2997585065739877, + "grad_norm": 0.562498927116394, + "learning_rate": 9.456325790520789e-05, + "loss": 1.0578, + "step": 46920 + }, + { + "epoch": 0.29982239372372643, + "grad_norm": 0.7859065532684326, + "learning_rate": 9.456098224825316e-05, + "loss": 1.046, + "step": 46930 + }, + { + "epoch": 0.29988628087346514, + "grad_norm": 0.7627017498016357, + "learning_rate": 9.455870614253081e-05, + "loss": 0.8198, + "step": 46940 + }, + { + "epoch": 0.2999501680232038, + "grad_norm": 0.6485910415649414, + "learning_rate": 9.455642958806374e-05, + "loss": 0.9887, + "step": 46950 + }, + { + "epoch": 0.3000140551729425, + "grad_norm": 1.4447276592254639, + "learning_rate": 9.455415258487487e-05, + "loss": 0.7989, + "step": 46960 + }, + { + "epoch": 0.3000779423226812, + "grad_norm": 0.9059609770774841, + "learning_rate": 9.455187513298714e-05, + "loss": 0.9545, + "step": 46970 + }, + { + "epoch": 0.3001418294724199, + "grad_norm": 1.1355173587799072, + "learning_rate": 9.454959723242349e-05, + "loss": 0.825, + "step": 46980 + }, + { + "epoch": 0.3002057166221586, + "grad_norm": 1.0973711013793945, + "learning_rate": 9.454731888320684e-05, + "loss": 0.9209, + "step": 46990 + }, + { + "epoch": 0.3002696037718973, + "grad_norm": 0.9574286341667175, + "learning_rate": 9.454504008536017e-05, + "loss": 0.8564, + "step": 47000 + }, + { + "epoch": 0.300333490921636, + "grad_norm": 1.527851939201355, + "learning_rate": 9.454276083890641e-05, + "loss": 1.3292, + "step": 47010 + }, + { + "epoch": 0.30039737807137473, + "grad_norm": 0.8139092326164246, + "learning_rate": 9.454048114386848e-05, + "loss": 0.8496, + "step": 47020 + }, + { + "epoch": 0.30046126522111344, + "grad_norm": 1.5546993017196655, + "learning_rate": 9.453820100026942e-05, + "loss": 1.1378, + "step": 47030 + }, + { + "epoch": 0.30052515237085214, + "grad_norm": 0.7080782055854797, + "learning_rate": 9.45359204081321e-05, + "loss": 0.7672, + "step": 47040 + }, + { + "epoch": 0.30058903952059085, + "grad_norm": 0.6548307538032532, + "learning_rate": 9.453363936747957e-05, + "loss": 1.0312, + "step": 47050 + }, + { + "epoch": 0.30065292667032956, + "grad_norm": 0.640304684638977, + "learning_rate": 9.453135787833473e-05, + "loss": 0.9947, + "step": 47060 + }, + { + "epoch": 0.3007168138200682, + "grad_norm": 0.9930755496025085, + "learning_rate": 9.452907594072062e-05, + "loss": 0.7912, + "step": 47070 + }, + { + "epoch": 0.3007807009698069, + "grad_norm": 0.8189347386360168, + "learning_rate": 9.452679355466018e-05, + "loss": 1.1204, + "step": 47080 + }, + { + "epoch": 0.3008445881195456, + "grad_norm": 0.8146405220031738, + "learning_rate": 9.45245107201764e-05, + "loss": 1.0947, + "step": 47090 + }, + { + "epoch": 0.3009084752692843, + "grad_norm": 0.9201721549034119, + "learning_rate": 9.45222274372923e-05, + "loss": 0.7746, + "step": 47100 + }, + { + "epoch": 0.30097236241902303, + "grad_norm": 0.7247973680496216, + "learning_rate": 9.451994370603084e-05, + "loss": 0.8073, + "step": 47110 + }, + { + "epoch": 0.30103624956876174, + "grad_norm": 3.4702842235565186, + "learning_rate": 9.451765952641502e-05, + "loss": 0.8341, + "step": 47120 + }, + { + "epoch": 0.30110013671850044, + "grad_norm": 2.2234277725219727, + "learning_rate": 9.451537489846787e-05, + "loss": 0.8992, + "step": 47130 + }, + { + "epoch": 0.30116402386823915, + "grad_norm": 0.8516297340393066, + "learning_rate": 9.451308982221238e-05, + "loss": 1.1016, + "step": 47140 + }, + { + "epoch": 0.30122791101797786, + "grad_norm": 0.6456612348556519, + 
"learning_rate": 9.451080429767157e-05, + "loss": 0.878, + "step": 47150 + }, + { + "epoch": 0.30129179816771656, + "grad_norm": 1.235134482383728, + "learning_rate": 9.450851832486844e-05, + "loss": 0.8274, + "step": 47160 + }, + { + "epoch": 0.30135568531745527, + "grad_norm": 1.2965903282165527, + "learning_rate": 9.450623190382604e-05, + "loss": 1.0011, + "step": 47170 + }, + { + "epoch": 0.301419572467194, + "grad_norm": 0.6325692534446716, + "learning_rate": 9.450394503456739e-05, + "loss": 0.8392, + "step": 47180 + }, + { + "epoch": 0.3014834596169326, + "grad_norm": 0.9124320149421692, + "learning_rate": 9.45016577171155e-05, + "loss": 1.0633, + "step": 47190 + }, + { + "epoch": 0.30154734676667133, + "grad_norm": 0.5959859490394592, + "learning_rate": 9.44993699514934e-05, + "loss": 0.993, + "step": 47200 + }, + { + "epoch": 0.30161123391641004, + "grad_norm": 0.5984769463539124, + "learning_rate": 9.449708173772417e-05, + "loss": 0.8204, + "step": 47210 + }, + { + "epoch": 0.30167512106614874, + "grad_norm": 1.2055346965789795, + "learning_rate": 9.449479307583082e-05, + "loss": 1.1527, + "step": 47220 + }, + { + "epoch": 0.30173900821588745, + "grad_norm": 0.8771409392356873, + "learning_rate": 9.449250396583642e-05, + "loss": 0.7836, + "step": 47230 + }, + { + "epoch": 0.30180289536562616, + "grad_norm": 1.1012285947799683, + "learning_rate": 9.4490214407764e-05, + "loss": 0.8842, + "step": 47240 + }, + { + "epoch": 0.30186678251536486, + "grad_norm": 2.031371831893921, + "learning_rate": 9.448792440163664e-05, + "loss": 0.8747, + "step": 47250 + }, + { + "epoch": 0.30193066966510357, + "grad_norm": 1.0956002473831177, + "learning_rate": 9.44856339474774e-05, + "loss": 0.9346, + "step": 47260 + }, + { + "epoch": 0.3019945568148423, + "grad_norm": 1.060286521911621, + "learning_rate": 9.448334304530932e-05, + "loss": 0.9462, + "step": 47270 + }, + { + "epoch": 0.302058443964581, + "grad_norm": 0.9044703841209412, + "learning_rate": 9.448105169515551e-05, + "loss": 0.8297, + "step": 47280 + }, + { + "epoch": 0.3021223311143197, + "grad_norm": 0.7445279955863953, + "learning_rate": 9.447875989703902e-05, + "loss": 0.9671, + "step": 47290 + }, + { + "epoch": 0.3021862182640584, + "grad_norm": 0.9023739099502563, + "learning_rate": 9.447646765098294e-05, + "loss": 0.9307, + "step": 47300 + }, + { + "epoch": 0.30225010541379704, + "grad_norm": 1.161218523979187, + "learning_rate": 9.447417495701036e-05, + "loss": 0.8708, + "step": 47310 + }, + { + "epoch": 0.30231399256353575, + "grad_norm": 0.9403544068336487, + "learning_rate": 9.447188181514437e-05, + "loss": 1.0087, + "step": 47320 + }, + { + "epoch": 0.30237787971327446, + "grad_norm": 1.1664180755615234, + "learning_rate": 9.446958822540803e-05, + "loss": 0.9059, + "step": 47330 + }, + { + "epoch": 0.30244176686301316, + "grad_norm": 1.02223539352417, + "learning_rate": 9.446729418782448e-05, + "loss": 1.0916, + "step": 47340 + }, + { + "epoch": 0.30250565401275187, + "grad_norm": 0.7959775924682617, + "learning_rate": 9.446499970241682e-05, + "loss": 0.9342, + "step": 47350 + }, + { + "epoch": 0.3025695411624906, + "grad_norm": 0.938345730304718, + "learning_rate": 9.446270476920813e-05, + "loss": 1.0355, + "step": 47360 + }, + { + "epoch": 0.3026334283122293, + "grad_norm": 0.8003197908401489, + "learning_rate": 9.446040938822154e-05, + "loss": 0.6568, + "step": 47370 + }, + { + "epoch": 0.302697315461968, + "grad_norm": 1.0039125680923462, + "learning_rate": 9.445811355948016e-05, + "loss": 0.7738, + "step": 47380 + }, + { + 
"epoch": 0.3027612026117067, + "grad_norm": 0.7357406616210938, + "learning_rate": 9.44558172830071e-05, + "loss": 0.9009, + "step": 47390 + }, + { + "epoch": 0.3028250897614454, + "grad_norm": 0.9000012874603271, + "learning_rate": 9.445352055882552e-05, + "loss": 0.6797, + "step": 47400 + }, + { + "epoch": 0.3028889769111841, + "grad_norm": 0.7020642161369324, + "learning_rate": 9.445122338695853e-05, + "loss": 1.0587, + "step": 47410 + }, + { + "epoch": 0.3029528640609228, + "grad_norm": 0.9596700668334961, + "learning_rate": 9.444892576742927e-05, + "loss": 0.8238, + "step": 47420 + }, + { + "epoch": 0.30301675121066146, + "grad_norm": 0.9670388698577881, + "learning_rate": 9.444662770026087e-05, + "loss": 1.0262, + "step": 47430 + }, + { + "epoch": 0.30308063836040017, + "grad_norm": 0.6952800154685974, + "learning_rate": 9.444432918547648e-05, + "loss": 0.6862, + "step": 47440 + }, + { + "epoch": 0.3031445255101389, + "grad_norm": 1.1551501750946045, + "learning_rate": 9.444203022309923e-05, + "loss": 0.8036, + "step": 47450 + }, + { + "epoch": 0.3032084126598776, + "grad_norm": 0.7595165371894836, + "learning_rate": 9.44397308131523e-05, + "loss": 0.7412, + "step": 47460 + }, + { + "epoch": 0.3032722998096163, + "grad_norm": 1.8203938007354736, + "learning_rate": 9.443743095565882e-05, + "loss": 0.8287, + "step": 47470 + }, + { + "epoch": 0.303336186959355, + "grad_norm": 0.9286074042320251, + "learning_rate": 9.443513065064198e-05, + "loss": 0.7826, + "step": 47480 + }, + { + "epoch": 0.3034000741090937, + "grad_norm": 0.6284856200218201, + "learning_rate": 9.443282989812495e-05, + "loss": 0.8863, + "step": 47490 + }, + { + "epoch": 0.3034639612588324, + "grad_norm": 0.5591778755187988, + "learning_rate": 9.443052869813085e-05, + "loss": 0.8473, + "step": 47500 + }, + { + "epoch": 0.3035278484085711, + "grad_norm": 0.854895293712616, + "learning_rate": 9.44282270506829e-05, + "loss": 0.9199, + "step": 47510 + }, + { + "epoch": 0.3035917355583098, + "grad_norm": 0.6344938278198242, + "learning_rate": 9.442592495580427e-05, + "loss": 0.8679, + "step": 47520 + }, + { + "epoch": 0.3036556227080485, + "grad_norm": 0.9397995471954346, + "learning_rate": 9.442362241351815e-05, + "loss": 0.8879, + "step": 47530 + }, + { + "epoch": 0.30371950985778723, + "grad_norm": 0.9806991219520569, + "learning_rate": 9.442131942384769e-05, + "loss": 1.0208, + "step": 47540 + }, + { + "epoch": 0.3037833970075259, + "grad_norm": 0.7757532000541687, + "learning_rate": 9.441901598681615e-05, + "loss": 1.0113, + "step": 47550 + }, + { + "epoch": 0.3038472841572646, + "grad_norm": 1.0031111240386963, + "learning_rate": 9.441671210244667e-05, + "loss": 0.8251, + "step": 47560 + }, + { + "epoch": 0.3039111713070033, + "grad_norm": 0.7999134659767151, + "learning_rate": 9.441440777076248e-05, + "loss": 0.836, + "step": 47570 + }, + { + "epoch": 0.303975058456742, + "grad_norm": 1.0625855922698975, + "learning_rate": 9.441210299178677e-05, + "loss": 0.87, + "step": 47580 + }, + { + "epoch": 0.3040389456064807, + "grad_norm": 1.3165303468704224, + "learning_rate": 9.440979776554278e-05, + "loss": 0.7388, + "step": 47590 + }, + { + "epoch": 0.3041028327562194, + "grad_norm": 1.166496992111206, + "learning_rate": 9.44074920920537e-05, + "loss": 0.7745, + "step": 47600 + }, + { + "epoch": 0.3041667199059581, + "grad_norm": 0.9159298539161682, + "learning_rate": 9.440518597134275e-05, + "loss": 0.932, + "step": 47610 + }, + { + "epoch": 0.3042306070556968, + "grad_norm": 1.0953913927078247, + "learning_rate": 
9.440287940343317e-05, + "loss": 0.9399, + "step": 47620 + }, + { + "epoch": 0.30429449420543553, + "grad_norm": 0.7524245977401733, + "learning_rate": 9.440057238834816e-05, + "loss": 0.7907, + "step": 47630 + }, + { + "epoch": 0.30435838135517423, + "grad_norm": 0.6022171974182129, + "learning_rate": 9.4398264926111e-05, + "loss": 0.9259, + "step": 47640 + }, + { + "epoch": 0.30442226850491294, + "grad_norm": 0.8347755074501038, + "learning_rate": 9.439595701674488e-05, + "loss": 1.0379, + "step": 47650 + }, + { + "epoch": 0.30448615565465165, + "grad_norm": 0.7987663149833679, + "learning_rate": 9.43936486602731e-05, + "loss": 0.926, + "step": 47660 + }, + { + "epoch": 0.3045500428043903, + "grad_norm": 0.762823760509491, + "learning_rate": 9.439133985671884e-05, + "loss": 1.031, + "step": 47670 + }, + { + "epoch": 0.304613929954129, + "grad_norm": 5.046191692352295, + "learning_rate": 9.438903060610539e-05, + "loss": 0.8839, + "step": 47680 + }, + { + "epoch": 0.3046778171038677, + "grad_norm": 0.8193703293800354, + "learning_rate": 9.438672090845599e-05, + "loss": 0.9656, + "step": 47690 + }, + { + "epoch": 0.3047417042536064, + "grad_norm": 1.1891075372695923, + "learning_rate": 9.438441076379395e-05, + "loss": 0.8131, + "step": 47700 + }, + { + "epoch": 0.3048055914033451, + "grad_norm": 0.6901410222053528, + "learning_rate": 9.438210017214245e-05, + "loss": 0.9052, + "step": 47710 + }, + { + "epoch": 0.30486947855308383, + "grad_norm": 0.8975858092308044, + "learning_rate": 9.437978913352483e-05, + "loss": 0.9107, + "step": 47720 + }, + { + "epoch": 0.30493336570282253, + "grad_norm": 0.71076500415802, + "learning_rate": 9.437747764796432e-05, + "loss": 0.9106, + "step": 47730 + }, + { + "epoch": 0.30499725285256124, + "grad_norm": 0.6818621158599854, + "learning_rate": 9.437516571548424e-05, + "loss": 1.0038, + "step": 47740 + }, + { + "epoch": 0.30506114000229995, + "grad_norm": 1.3536254167556763, + "learning_rate": 9.437285333610784e-05, + "loss": 1.0431, + "step": 47750 + }, + { + "epoch": 0.30512502715203865, + "grad_norm": 0.7278540730476379, + "learning_rate": 9.437054050985842e-05, + "loss": 0.67, + "step": 47760 + }, + { + "epoch": 0.30518891430177736, + "grad_norm": 0.8322495222091675, + "learning_rate": 9.436822723675926e-05, + "loss": 0.8593, + "step": 47770 + }, + { + "epoch": 0.30525280145151606, + "grad_norm": 0.8993383646011353, + "learning_rate": 9.436591351683368e-05, + "loss": 0.9672, + "step": 47780 + }, + { + "epoch": 0.30531668860125477, + "grad_norm": 0.9851189851760864, + "learning_rate": 9.436359935010498e-05, + "loss": 0.775, + "step": 47790 + }, + { + "epoch": 0.3053805757509934, + "grad_norm": 0.7736380696296692, + "learning_rate": 9.436128473659644e-05, + "loss": 0.838, + "step": 47800 + }, + { + "epoch": 0.3054444629007321, + "grad_norm": 0.7815120816230774, + "learning_rate": 9.43589696763314e-05, + "loss": 0.6829, + "step": 47810 + }, + { + "epoch": 0.30550835005047083, + "grad_norm": 0.9725918769836426, + "learning_rate": 9.435665416933315e-05, + "loss": 0.7912, + "step": 47820 + }, + { + "epoch": 0.30557223720020954, + "grad_norm": 0.9616202712059021, + "learning_rate": 9.4354338215625e-05, + "loss": 0.8527, + "step": 47830 + }, + { + "epoch": 0.30563612434994825, + "grad_norm": 0.7812166810035706, + "learning_rate": 9.435202181523031e-05, + "loss": 0.5296, + "step": 47840 + }, + { + "epoch": 0.30570001149968695, + "grad_norm": 0.5943650603294373, + "learning_rate": 9.43497049681724e-05, + "loss": 0.982, + "step": 47850 + }, + { + "epoch": 
0.30576389864942566, + "grad_norm": 1.0416995286941528, + "learning_rate": 9.434738767447458e-05, + "loss": 1.0534, + "step": 47860 + }, + { + "epoch": 0.30582778579916436, + "grad_norm": 0.5415847301483154, + "learning_rate": 9.43450699341602e-05, + "loss": 0.939, + "step": 47870 + }, + { + "epoch": 0.30589167294890307, + "grad_norm": 1.0292586088180542, + "learning_rate": 9.43427517472526e-05, + "loss": 1.0046, + "step": 47880 + }, + { + "epoch": 0.3059555600986418, + "grad_norm": 0.9215097427368164, + "learning_rate": 9.434043311377513e-05, + "loss": 0.977, + "step": 47890 + }, + { + "epoch": 0.3060194472483805, + "grad_norm": 0.45443835854530334, + "learning_rate": 9.433811403375114e-05, + "loss": 0.8409, + "step": 47900 + }, + { + "epoch": 0.3060833343981192, + "grad_norm": 1.0631471872329712, + "learning_rate": 9.433579450720398e-05, + "loss": 0.9141, + "step": 47910 + }, + { + "epoch": 0.30614722154785784, + "grad_norm": 1.0696526765823364, + "learning_rate": 9.433347453415702e-05, + "loss": 1.0058, + "step": 47920 + }, + { + "epoch": 0.30621110869759655, + "grad_norm": 1.055720329284668, + "learning_rate": 9.433115411463361e-05, + "loss": 0.7988, + "step": 47930 + }, + { + "epoch": 0.30627499584733525, + "grad_norm": 0.7248536944389343, + "learning_rate": 9.432883324865713e-05, + "loss": 0.9809, + "step": 47940 + }, + { + "epoch": 0.30633888299707396, + "grad_norm": 0.7505012154579163, + "learning_rate": 9.432651193625095e-05, + "loss": 1.0935, + "step": 47950 + }, + { + "epoch": 0.30640277014681266, + "grad_norm": 1.8567789793014526, + "learning_rate": 9.432419017743845e-05, + "loss": 0.8664, + "step": 47960 + }, + { + "epoch": 0.30646665729655137, + "grad_norm": 0.6086595058441162, + "learning_rate": 9.432186797224301e-05, + "loss": 1.1033, + "step": 47970 + }, + { + "epoch": 0.3065305444462901, + "grad_norm": 1.0013806819915771, + "learning_rate": 9.431954532068801e-05, + "loss": 0.8595, + "step": 47980 + }, + { + "epoch": 0.3065944315960288, + "grad_norm": 0.9920240640640259, + "learning_rate": 9.431722222279684e-05, + "loss": 0.9719, + "step": 47990 + }, + { + "epoch": 0.3066583187457675, + "grad_norm": 1.207126498222351, + "learning_rate": 9.43148986785929e-05, + "loss": 0.8834, + "step": 48000 + }, + { + "epoch": 0.3067222058955062, + "grad_norm": 0.6420753598213196, + "learning_rate": 9.431257468809961e-05, + "loss": 1.1191, + "step": 48010 + }, + { + "epoch": 0.3067860930452449, + "grad_norm": 0.9169009327888489, + "learning_rate": 9.431025025134036e-05, + "loss": 1.0277, + "step": 48020 + }, + { + "epoch": 0.3068499801949836, + "grad_norm": 1.1910425424575806, + "learning_rate": 9.430792536833855e-05, + "loss": 0.7912, + "step": 48030 + }, + { + "epoch": 0.30691386734472226, + "grad_norm": 1.0869101285934448, + "learning_rate": 9.43056000391176e-05, + "loss": 1.1964, + "step": 48040 + }, + { + "epoch": 0.30697775449446096, + "grad_norm": 0.8634042739868164, + "learning_rate": 9.430327426370091e-05, + "loss": 0.7144, + "step": 48050 + }, + { + "epoch": 0.30704164164419967, + "grad_norm": 1.0796973705291748, + "learning_rate": 9.430094804211195e-05, + "loss": 0.8565, + "step": 48060 + }, + { + "epoch": 0.3071055287939384, + "grad_norm": 0.871731698513031, + "learning_rate": 9.42986213743741e-05, + "loss": 0.9328, + "step": 48070 + }, + { + "epoch": 0.3071694159436771, + "grad_norm": 1.139355182647705, + "learning_rate": 9.429629426051081e-05, + "loss": 0.8468, + "step": 48080 + }, + { + "epoch": 0.3072333030934158, + "grad_norm": 0.9230227470397949, + "learning_rate": 
9.429396670054551e-05, + "loss": 1.0648, + "step": 48090 + }, + { + "epoch": 0.3072971902431545, + "grad_norm": 0.9664996862411499, + "learning_rate": 9.429163869450166e-05, + "loss": 0.9543, + "step": 48100 + }, + { + "epoch": 0.3073610773928932, + "grad_norm": 0.7057569026947021, + "learning_rate": 9.428931024240267e-05, + "loss": 0.978, + "step": 48110 + }, + { + "epoch": 0.3074249645426319, + "grad_norm": 0.6868560314178467, + "learning_rate": 9.428698134427202e-05, + "loss": 0.8156, + "step": 48120 + }, + { + "epoch": 0.3074888516923706, + "grad_norm": 1.3215396404266357, + "learning_rate": 9.428465200013317e-05, + "loss": 1.176, + "step": 48130 + }, + { + "epoch": 0.3075527388421093, + "grad_norm": 0.767733633518219, + "learning_rate": 9.428232221000954e-05, + "loss": 1.0589, + "step": 48140 + }, + { + "epoch": 0.307616625991848, + "grad_norm": 0.9023085832595825, + "learning_rate": 9.427999197392463e-05, + "loss": 0.861, + "step": 48150 + }, + { + "epoch": 0.3076805131415867, + "grad_norm": 0.7275156378746033, + "learning_rate": 9.427766129190189e-05, + "loss": 0.9598, + "step": 48160 + }, + { + "epoch": 0.3077444002913254, + "grad_norm": 1.1125576496124268, + "learning_rate": 9.427533016396479e-05, + "loss": 0.7333, + "step": 48170 + }, + { + "epoch": 0.3078082874410641, + "grad_norm": 2.008270025253296, + "learning_rate": 9.427299859013682e-05, + "loss": 1.134, + "step": 48180 + }, + { + "epoch": 0.3078721745908028, + "grad_norm": 0.5112677216529846, + "learning_rate": 9.427066657044144e-05, + "loss": 0.7755, + "step": 48190 + }, + { + "epoch": 0.3079360617405415, + "grad_norm": 0.8975897431373596, + "learning_rate": 9.426833410490215e-05, + "loss": 0.791, + "step": 48200 + }, + { + "epoch": 0.3079999488902802, + "grad_norm": 0.7356785535812378, + "learning_rate": 9.426600119354245e-05, + "loss": 0.8472, + "step": 48210 + }, + { + "epoch": 0.3080638360400189, + "grad_norm": 0.8324338793754578, + "learning_rate": 9.426366783638582e-05, + "loss": 0.9064, + "step": 48220 + }, + { + "epoch": 0.3081277231897576, + "grad_norm": 0.9921901226043701, + "learning_rate": 9.426133403345576e-05, + "loss": 0.9901, + "step": 48230 + }, + { + "epoch": 0.3081916103394963, + "grad_norm": 1.4877877235412598, + "learning_rate": 9.425899978477577e-05, + "loss": 0.7397, + "step": 48240 + }, + { + "epoch": 0.30825549748923503, + "grad_norm": 1.276802897453308, + "learning_rate": 9.425666509036936e-05, + "loss": 0.9455, + "step": 48250 + }, + { + "epoch": 0.30831938463897374, + "grad_norm": 1.7324568033218384, + "learning_rate": 9.425432995026005e-05, + "loss": 0.8014, + "step": 48260 + }, + { + "epoch": 0.30838327178871244, + "grad_norm": 1.0053337812423706, + "learning_rate": 9.425199436447135e-05, + "loss": 1.1581, + "step": 48270 + }, + { + "epoch": 0.3084471589384511, + "grad_norm": 1.1063930988311768, + "learning_rate": 9.424965833302679e-05, + "loss": 1.1048, + "step": 48280 + }, + { + "epoch": 0.3085110460881898, + "grad_norm": 1.1341273784637451, + "learning_rate": 9.424732185594989e-05, + "loss": 1.0374, + "step": 48290 + }, + { + "epoch": 0.3085749332379285, + "grad_norm": 0.6320347785949707, + "learning_rate": 9.424498493326417e-05, + "loss": 0.9549, + "step": 48300 + }, + { + "epoch": 0.3086388203876672, + "grad_norm": 0.48150819540023804, + "learning_rate": 9.424264756499317e-05, + "loss": 0.8902, + "step": 48310 + }, + { + "epoch": 0.3087027075374059, + "grad_norm": 0.6222485899925232, + "learning_rate": 9.424030975116045e-05, + "loss": 0.8407, + "step": 48320 + }, + { + "epoch": 
0.3087665946871446, + "grad_norm": 0.6241324543952942, + "learning_rate": 9.423797149178952e-05, + "loss": 0.8781, + "step": 48330 + }, + { + "epoch": 0.30883048183688333, + "grad_norm": 0.8804172277450562, + "learning_rate": 9.423563278690397e-05, + "loss": 1.0311, + "step": 48340 + }, + { + "epoch": 0.30889436898662204, + "grad_norm": 1.6444405317306519, + "learning_rate": 9.423329363652731e-05, + "loss": 0.7865, + "step": 48350 + }, + { + "epoch": 0.30895825613636074, + "grad_norm": 0.8753210306167603, + "learning_rate": 9.423095404068312e-05, + "loss": 1.0464, + "step": 48360 + }, + { + "epoch": 0.30902214328609945, + "grad_norm": 0.8216173052787781, + "learning_rate": 9.422861399939495e-05, + "loss": 0.8443, + "step": 48370 + }, + { + "epoch": 0.30908603043583815, + "grad_norm": 1.1134603023529053, + "learning_rate": 9.422627351268638e-05, + "loss": 1.1639, + "step": 48380 + }, + { + "epoch": 0.30914991758557686, + "grad_norm": 0.8974233269691467, + "learning_rate": 9.422393258058098e-05, + "loss": 0.8378, + "step": 48390 + }, + { + "epoch": 0.3092138047353155, + "grad_norm": 0.8469827175140381, + "learning_rate": 9.422159120310232e-05, + "loss": 0.8669, + "step": 48400 + }, + { + "epoch": 0.3092776918850542, + "grad_norm": 0.701692521572113, + "learning_rate": 9.421924938027397e-05, + "loss": 0.9278, + "step": 48410 + }, + { + "epoch": 0.3093415790347929, + "grad_norm": 0.7484457492828369, + "learning_rate": 9.421690711211952e-05, + "loss": 0.9045, + "step": 48420 + }, + { + "epoch": 0.30940546618453163, + "grad_norm": 0.7951037883758545, + "learning_rate": 9.421456439866257e-05, + "loss": 0.7149, + "step": 48430 + }, + { + "epoch": 0.30946935333427034, + "grad_norm": 0.6220124363899231, + "learning_rate": 9.421222123992671e-05, + "loss": 0.8828, + "step": 48440 + }, + { + "epoch": 0.30953324048400904, + "grad_norm": 0.8480835556983948, + "learning_rate": 9.420987763593554e-05, + "loss": 0.8722, + "step": 48450 + }, + { + "epoch": 0.30959712763374775, + "grad_norm": 0.8057517409324646, + "learning_rate": 9.420753358671264e-05, + "loss": 1.0229, + "step": 48460 + }, + { + "epoch": 0.30966101478348645, + "grad_norm": 0.7954405546188354, + "learning_rate": 9.420518909228164e-05, + "loss": 0.7711, + "step": 48470 + }, + { + "epoch": 0.30972490193322516, + "grad_norm": 1.0141898393630981, + "learning_rate": 9.420284415266613e-05, + "loss": 0.7272, + "step": 48480 + }, + { + "epoch": 0.30978878908296387, + "grad_norm": 0.8430118560791016, + "learning_rate": 9.420049876788974e-05, + "loss": 0.9584, + "step": 48490 + }, + { + "epoch": 0.3098526762327026, + "grad_norm": 0.703395426273346, + "learning_rate": 9.419815293797611e-05, + "loss": 0.7518, + "step": 48500 + }, + { + "epoch": 0.3099165633824413, + "grad_norm": 0.6851431727409363, + "learning_rate": 9.419580666294883e-05, + "loss": 0.6678, + "step": 48510 + }, + { + "epoch": 0.30998045053217993, + "grad_norm": 0.6793634295463562, + "learning_rate": 9.419345994283153e-05, + "loss": 0.902, + "step": 48520 + }, + { + "epoch": 0.31004433768191864, + "grad_norm": 0.7479285597801208, + "learning_rate": 9.419111277764788e-05, + "loss": 0.8441, + "step": 48530 + }, + { + "epoch": 0.31010822483165734, + "grad_norm": 1.3613170385360718, + "learning_rate": 9.418876516742148e-05, + "loss": 1.0158, + "step": 48540 + }, + { + "epoch": 0.31017211198139605, + "grad_norm": 1.1104499101638794, + "learning_rate": 9.418665193772571e-05, + "loss": 1.0577, + "step": 48550 + }, + { + "epoch": 0.31023599913113475, + "grad_norm": 1.6109135150909424, + 
"learning_rate": 9.418430348198326e-05, + "loss": 0.8694, + "step": 48560 + }, + { + "epoch": 0.31029988628087346, + "grad_norm": 1.1885582208633423, + "learning_rate": 9.418195458126664e-05, + "loss": 0.8233, + "step": 48570 + }, + { + "epoch": 0.31036377343061217, + "grad_norm": 1.3938848972320557, + "learning_rate": 9.41796052355995e-05, + "loss": 0.824, + "step": 48580 + }, + { + "epoch": 0.3104276605803509, + "grad_norm": 0.72528076171875, + "learning_rate": 9.417725544500552e-05, + "loss": 0.912, + "step": 48590 + }, + { + "epoch": 0.3104915477300896, + "grad_norm": 0.7403073310852051, + "learning_rate": 9.417490520950838e-05, + "loss": 0.8565, + "step": 48600 + }, + { + "epoch": 0.3105554348798283, + "grad_norm": 1.263451337814331, + "learning_rate": 9.417255452913171e-05, + "loss": 0.9227, + "step": 48610 + }, + { + "epoch": 0.310619322029567, + "grad_norm": 1.1956753730773926, + "learning_rate": 9.417020340389922e-05, + "loss": 0.9026, + "step": 48620 + }, + { + "epoch": 0.3106832091793057, + "grad_norm": 0.985579252243042, + "learning_rate": 9.416785183383454e-05, + "loss": 0.9717, + "step": 48630 + }, + { + "epoch": 0.3107470963290444, + "grad_norm": 1.1130268573760986, + "learning_rate": 9.416549981896141e-05, + "loss": 0.6567, + "step": 48640 + }, + { + "epoch": 0.31081098347878305, + "grad_norm": 0.6500244736671448, + "learning_rate": 9.416314735930347e-05, + "loss": 1.0039, + "step": 48650 + }, + { + "epoch": 0.31087487062852176, + "grad_norm": 1.0111130475997925, + "learning_rate": 9.416079445488444e-05, + "loss": 1.0116, + "step": 48660 + }, + { + "epoch": 0.31093875777826047, + "grad_norm": 0.8683274984359741, + "learning_rate": 9.4158441105728e-05, + "loss": 0.9201, + "step": 48670 + }, + { + "epoch": 0.3110026449279992, + "grad_norm": 0.9501914381980896, + "learning_rate": 9.415608731185786e-05, + "loss": 1.0651, + "step": 48680 + }, + { + "epoch": 0.3110665320777379, + "grad_norm": 1.045398473739624, + "learning_rate": 9.415373307329771e-05, + "loss": 1.1339, + "step": 48690 + }, + { + "epoch": 0.3111304192274766, + "grad_norm": 0.6909173727035522, + "learning_rate": 9.415137839007127e-05, + "loss": 0.8049, + "step": 48700 + }, + { + "epoch": 0.3111943063772153, + "grad_norm": 0.929655909538269, + "learning_rate": 9.414902326220225e-05, + "loss": 0.8028, + "step": 48710 + }, + { + "epoch": 0.311258193526954, + "grad_norm": 0.9743746519088745, + "learning_rate": 9.414666768971438e-05, + "loss": 0.8393, + "step": 48720 + }, + { + "epoch": 0.3113220806766927, + "grad_norm": 0.9275550246238708, + "learning_rate": 9.414431167263139e-05, + "loss": 0.9987, + "step": 48730 + }, + { + "epoch": 0.3113859678264314, + "grad_norm": 0.9921037554740906, + "learning_rate": 9.414195521097697e-05, + "loss": 0.87, + "step": 48740 + }, + { + "epoch": 0.3114498549761701, + "grad_norm": 0.6033504605293274, + "learning_rate": 9.413959830477488e-05, + "loss": 1.0159, + "step": 48750 + }, + { + "epoch": 0.3115137421259088, + "grad_norm": 0.709073543548584, + "learning_rate": 9.413724095404884e-05, + "loss": 0.8106, + "step": 48760 + }, + { + "epoch": 0.31157762927564747, + "grad_norm": 0.8493859171867371, + "learning_rate": 9.413488315882261e-05, + "loss": 0.7047, + "step": 48770 + }, + { + "epoch": 0.3116415164253862, + "grad_norm": 1.3952586650848389, + "learning_rate": 9.413252491911993e-05, + "loss": 0.8132, + "step": 48780 + }, + { + "epoch": 0.3117054035751249, + "grad_norm": 1.9814708232879639, + "learning_rate": 9.413016623496452e-05, + "loss": 0.709, + "step": 48790 + }, + { + 
"epoch": 0.3117692907248636, + "grad_norm": 0.6453489661216736, + "learning_rate": 9.412780710638017e-05, + "loss": 0.6557, + "step": 48800 + }, + { + "epoch": 0.3118331778746023, + "grad_norm": 0.954566478729248, + "learning_rate": 9.412544753339063e-05, + "loss": 0.9015, + "step": 48810 + }, + { + "epoch": 0.311897065024341, + "grad_norm": 1.347273588180542, + "learning_rate": 9.412308751601967e-05, + "loss": 0.7975, + "step": 48820 + }, + { + "epoch": 0.3119609521740797, + "grad_norm": 1.4250802993774414, + "learning_rate": 9.412072705429103e-05, + "loss": 0.9073, + "step": 48830 + }, + { + "epoch": 0.3120248393238184, + "grad_norm": 0.8338466286659241, + "learning_rate": 9.41183661482285e-05, + "loss": 1.0889, + "step": 48840 + }, + { + "epoch": 0.3120887264735571, + "grad_norm": 0.8401825428009033, + "learning_rate": 9.411600479785586e-05, + "loss": 0.9772, + "step": 48850 + }, + { + "epoch": 0.3121526136232958, + "grad_norm": 0.7773457169532776, + "learning_rate": 9.411364300319688e-05, + "loss": 0.6319, + "step": 48860 + }, + { + "epoch": 0.31221650077303453, + "grad_norm": 0.6651679873466492, + "learning_rate": 9.411128076427536e-05, + "loss": 0.7987, + "step": 48870 + }, + { + "epoch": 0.31228038792277324, + "grad_norm": 1.0372742414474487, + "learning_rate": 9.410891808111508e-05, + "loss": 0.8707, + "step": 48880 + }, + { + "epoch": 0.3123442750725119, + "grad_norm": 1.1569018363952637, + "learning_rate": 9.410655495373983e-05, + "loss": 0.875, + "step": 48890 + }, + { + "epoch": 0.3124081622222506, + "grad_norm": 0.7065810561180115, + "learning_rate": 9.41041913821734e-05, + "loss": 1.1122, + "step": 48900 + }, + { + "epoch": 0.3124720493719893, + "grad_norm": 0.7592551112174988, + "learning_rate": 9.410182736643964e-05, + "loss": 0.9027, + "step": 48910 + }, + { + "epoch": 0.312535936521728, + "grad_norm": 0.6045570969581604, + "learning_rate": 9.40994629065623e-05, + "loss": 1.0845, + "step": 48920 + }, + { + "epoch": 0.3125998236714667, + "grad_norm": 1.0101828575134277, + "learning_rate": 9.409709800256523e-05, + "loss": 0.7681, + "step": 48930 + }, + { + "epoch": 0.3126637108212054, + "grad_norm": 0.5947315692901611, + "learning_rate": 9.409473265447224e-05, + "loss": 0.8502, + "step": 48940 + }, + { + "epoch": 0.3127275979709441, + "grad_norm": 0.9828523397445679, + "learning_rate": 9.409236686230713e-05, + "loss": 0.8704, + "step": 48950 + }, + { + "epoch": 0.31279148512068283, + "grad_norm": 0.731641948223114, + "learning_rate": 9.409000062609374e-05, + "loss": 1.0281, + "step": 48960 + }, + { + "epoch": 0.31285537227042154, + "grad_norm": 0.9199566841125488, + "learning_rate": 9.408763394585592e-05, + "loss": 0.95, + "step": 48970 + }, + { + "epoch": 0.31291925942016025, + "grad_norm": 1.1518223285675049, + "learning_rate": 9.408526682161746e-05, + "loss": 0.8526, + "step": 48980 + }, + { + "epoch": 0.31298314656989895, + "grad_norm": 0.8787213563919067, + "learning_rate": 9.408289925340224e-05, + "loss": 0.9255, + "step": 48990 + }, + { + "epoch": 0.31304703371963766, + "grad_norm": 0.9983569979667664, + "learning_rate": 9.408053124123408e-05, + "loss": 1.1062, + "step": 49000 + }, + { + "epoch": 0.3131109208693763, + "grad_norm": 1.392701268196106, + "learning_rate": 9.407816278513683e-05, + "loss": 0.6401, + "step": 49010 + }, + { + "epoch": 0.313174808019115, + "grad_norm": 0.5991133451461792, + "learning_rate": 9.407579388513434e-05, + "loss": 0.8792, + "step": 49020 + }, + { + "epoch": 0.3132386951688537, + "grad_norm": 0.7458668947219849, + "learning_rate": 
9.40734245412505e-05, + "loss": 1.1298, + "step": 49030 + }, + { + "epoch": 0.3133025823185924, + "grad_norm": 1.072922706604004, + "learning_rate": 9.407105475350914e-05, + "loss": 0.8286, + "step": 49040 + }, + { + "epoch": 0.31336646946833113, + "grad_norm": 2.292825698852539, + "learning_rate": 9.406868452193411e-05, + "loss": 1.0391, + "step": 49050 + }, + { + "epoch": 0.31343035661806984, + "grad_norm": 0.8459042906761169, + "learning_rate": 9.406631384654934e-05, + "loss": 0.8123, + "step": 49060 + }, + { + "epoch": 0.31349424376780854, + "grad_norm": 1.0259894132614136, + "learning_rate": 9.406394272737863e-05, + "loss": 0.8378, + "step": 49070 + }, + { + "epoch": 0.31355813091754725, + "grad_norm": 0.6724763512611389, + "learning_rate": 9.406157116444592e-05, + "loss": 0.7612, + "step": 49080 + }, + { + "epoch": 0.31362201806728596, + "grad_norm": 0.6208413243293762, + "learning_rate": 9.405919915777506e-05, + "loss": 0.8256, + "step": 49090 + }, + { + "epoch": 0.31368590521702466, + "grad_norm": 0.5845530033111572, + "learning_rate": 9.405682670738995e-05, + "loss": 0.8746, + "step": 49100 + }, + { + "epoch": 0.31374979236676337, + "grad_norm": 1.0131624937057495, + "learning_rate": 9.405445381331449e-05, + "loss": 0.7601, + "step": 49110 + }, + { + "epoch": 0.3138136795165021, + "grad_norm": 0.8902072310447693, + "learning_rate": 9.405208047557255e-05, + "loss": 1.0531, + "step": 49120 + }, + { + "epoch": 0.3138775666662407, + "grad_norm": 1.196207880973816, + "learning_rate": 9.404970669418804e-05, + "loss": 1.023, + "step": 49130 + }, + { + "epoch": 0.31394145381597943, + "grad_norm": 0.9805600643157959, + "learning_rate": 9.404733246918489e-05, + "loss": 0.7877, + "step": 49140 + }, + { + "epoch": 0.31400534096571814, + "grad_norm": 0.7303978204727173, + "learning_rate": 9.404495780058701e-05, + "loss": 0.8739, + "step": 49150 + }, + { + "epoch": 0.31406922811545684, + "grad_norm": 0.754725456237793, + "learning_rate": 9.404258268841827e-05, + "loss": 0.8963, + "step": 49160 + }, + { + "epoch": 0.31413311526519555, + "grad_norm": 0.8432728052139282, + "learning_rate": 9.404020713270265e-05, + "loss": 0.8278, + "step": 49170 + }, + { + "epoch": 0.31419700241493426, + "grad_norm": 1.7309197187423706, + "learning_rate": 9.403783113346402e-05, + "loss": 0.7055, + "step": 49180 + }, + { + "epoch": 0.31426088956467296, + "grad_norm": 1.0485713481903076, + "learning_rate": 9.403545469072636e-05, + "loss": 1.0848, + "step": 49190 + }, + { + "epoch": 0.31432477671441167, + "grad_norm": 1.0555837154388428, + "learning_rate": 9.403307780451356e-05, + "loss": 0.9272, + "step": 49200 + }, + { + "epoch": 0.3143886638641504, + "grad_norm": 0.642327070236206, + "learning_rate": 9.403070047484957e-05, + "loss": 1.0311, + "step": 49210 + }, + { + "epoch": 0.3144525510138891, + "grad_norm": 1.0337790250778198, + "learning_rate": 9.402832270175833e-05, + "loss": 0.6988, + "step": 49220 + }, + { + "epoch": 0.3145164381636278, + "grad_norm": 0.6804295182228088, + "learning_rate": 9.40259444852638e-05, + "loss": 0.9084, + "step": 49230 + }, + { + "epoch": 0.3145803253133665, + "grad_norm": 0.9639153480529785, + "learning_rate": 9.402356582538991e-05, + "loss": 0.9531, + "step": 49240 + }, + { + "epoch": 0.31464421246310514, + "grad_norm": 1.2512791156768799, + "learning_rate": 9.402118672216064e-05, + "loss": 0.8388, + "step": 49250 + }, + { + "epoch": 0.31470809961284385, + "grad_norm": 1.065996527671814, + "learning_rate": 9.401880717559993e-05, + "loss": 1.0074, + "step": 49260 + }, + { + 
"epoch": 0.31477198676258256, + "grad_norm": 0.7025091052055359, + "learning_rate": 9.401642718573175e-05, + "loss": 0.9072, + "step": 49270 + }, + { + "epoch": 0.31483587391232126, + "grad_norm": 0.6397770047187805, + "learning_rate": 9.401404675258006e-05, + "loss": 0.9634, + "step": 49280 + }, + { + "epoch": 0.31489976106205997, + "grad_norm": 0.938177764415741, + "learning_rate": 9.401166587616885e-05, + "loss": 1.015, + "step": 49290 + }, + { + "epoch": 0.3149636482117987, + "grad_norm": 0.6089216470718384, + "learning_rate": 9.400928455652209e-05, + "loss": 0.8248, + "step": 49300 + }, + { + "epoch": 0.3150275353615374, + "grad_norm": 0.5597997307777405, + "learning_rate": 9.400690279366377e-05, + "loss": 0.7582, + "step": 49310 + }, + { + "epoch": 0.3150914225112761, + "grad_norm": 1.5323448181152344, + "learning_rate": 9.400452058761784e-05, + "loss": 0.8959, + "step": 49320 + }, + { + "epoch": 0.3151553096610148, + "grad_norm": 0.6819522976875305, + "learning_rate": 9.400213793840835e-05, + "loss": 0.8379, + "step": 49330 + }, + { + "epoch": 0.3152191968107535, + "grad_norm": 1.2610814571380615, + "learning_rate": 9.399975484605925e-05, + "loss": 1.0078, + "step": 49340 + }, + { + "epoch": 0.3152830839604922, + "grad_norm": 0.9983810782432556, + "learning_rate": 9.399737131059454e-05, + "loss": 0.9125, + "step": 49350 + }, + { + "epoch": 0.3153469711102309, + "grad_norm": 0.8259413838386536, + "learning_rate": 9.399498733203827e-05, + "loss": 0.921, + "step": 49360 + }, + { + "epoch": 0.31541085825996956, + "grad_norm": 1.0133284330368042, + "learning_rate": 9.399260291041439e-05, + "loss": 0.9468, + "step": 49370 + }, + { + "epoch": 0.31547474540970827, + "grad_norm": 0.9398267269134521, + "learning_rate": 9.399021804574694e-05, + "loss": 0.8747, + "step": 49380 + }, + { + "epoch": 0.315538632559447, + "grad_norm": 0.7082047462463379, + "learning_rate": 9.398783273805995e-05, + "loss": 0.8867, + "step": 49390 + }, + { + "epoch": 0.3156025197091857, + "grad_norm": 0.8589842915534973, + "learning_rate": 9.398544698737743e-05, + "loss": 0.713, + "step": 49400 + }, + { + "epoch": 0.3156664068589244, + "grad_norm": 1.1471587419509888, + "learning_rate": 9.398306079372339e-05, + "loss": 0.9788, + "step": 49410 + }, + { + "epoch": 0.3157302940086631, + "grad_norm": 0.49813616275787354, + "learning_rate": 9.398067415712188e-05, + "loss": 0.7366, + "step": 49420 + }, + { + "epoch": 0.3157941811584018, + "grad_norm": 1.774720311164856, + "learning_rate": 9.397828707759695e-05, + "loss": 0.685, + "step": 49430 + }, + { + "epoch": 0.3158580683081405, + "grad_norm": 1.121861219406128, + "learning_rate": 9.397589955517261e-05, + "loss": 0.9251, + "step": 49440 + }, + { + "epoch": 0.3159219554578792, + "grad_norm": 0.8228227496147156, + "learning_rate": 9.397351158987293e-05, + "loss": 0.9838, + "step": 49450 + }, + { + "epoch": 0.3159858426076179, + "grad_norm": 0.9918887615203857, + "learning_rate": 9.397112318172192e-05, + "loss": 0.8684, + "step": 49460 + }, + { + "epoch": 0.3160497297573566, + "grad_norm": 0.8907731175422668, + "learning_rate": 9.396873433074367e-05, + "loss": 0.8603, + "step": 49470 + }, + { + "epoch": 0.31611361690709533, + "grad_norm": 0.9690269827842712, + "learning_rate": 9.396634503696225e-05, + "loss": 0.931, + "step": 49480 + }, + { + "epoch": 0.31617750405683404, + "grad_norm": 0.7284924983978271, + "learning_rate": 9.396395530040167e-05, + "loss": 0.8224, + "step": 49490 + }, + { + "epoch": 0.3162413912065727, + "grad_norm": 0.8061665296554565, + "learning_rate": 
9.396156512108603e-05, + "loss": 0.8418, + "step": 49500 + }, + { + "epoch": 0.3163052783563114, + "grad_norm": 0.7680371999740601, + "learning_rate": 9.39591744990394e-05, + "loss": 0.5664, + "step": 49510 + }, + { + "epoch": 0.3163691655060501, + "grad_norm": 0.8830310702323914, + "learning_rate": 9.395678343428586e-05, + "loss": 0.7123, + "step": 49520 + }, + { + "epoch": 0.3164330526557888, + "grad_norm": 0.9915767908096313, + "learning_rate": 9.395439192684947e-05, + "loss": 0.9406, + "step": 49530 + }, + { + "epoch": 0.3164969398055275, + "grad_norm": 1.3470020294189453, + "learning_rate": 9.395199997675435e-05, + "loss": 0.8661, + "step": 49540 + }, + { + "epoch": 0.3165608269552662, + "grad_norm": 0.934509813785553, + "learning_rate": 9.394960758402455e-05, + "loss": 1.0339, + "step": 49550 + }, + { + "epoch": 0.3166247141050049, + "grad_norm": 0.5762020349502563, + "learning_rate": 9.394721474868418e-05, + "loss": 0.9045, + "step": 49560 + }, + { + "epoch": 0.31668860125474363, + "grad_norm": 0.8978357315063477, + "learning_rate": 9.394482147075734e-05, + "loss": 0.8767, + "step": 49570 + }, + { + "epoch": 0.31675248840448234, + "grad_norm": 0.7672891020774841, + "learning_rate": 9.394242775026812e-05, + "loss": 0.9131, + "step": 49580 + }, + { + "epoch": 0.31681637555422104, + "grad_norm": 0.9509188532829285, + "learning_rate": 9.394003358724067e-05, + "loss": 0.9731, + "step": 49590 + }, + { + "epoch": 0.31688026270395975, + "grad_norm": 0.6911554336547852, + "learning_rate": 9.393763898169903e-05, + "loss": 0.7586, + "step": 49600 + }, + { + "epoch": 0.31694414985369845, + "grad_norm": 0.9574739933013916, + "learning_rate": 9.393524393366737e-05, + "loss": 0.9106, + "step": 49610 + }, + { + "epoch": 0.3170080370034371, + "grad_norm": 0.7374638915061951, + "learning_rate": 9.393284844316979e-05, + "loss": 0.7652, + "step": 49620 + }, + { + "epoch": 0.3170719241531758, + "grad_norm": 0.971748948097229, + "learning_rate": 9.393045251023042e-05, + "loss": 0.9203, + "step": 49630 + }, + { + "epoch": 0.3171358113029145, + "grad_norm": 0.8712042570114136, + "learning_rate": 9.392805613487339e-05, + "loss": 0.6788, + "step": 49640 + }, + { + "epoch": 0.3171996984526532, + "grad_norm": 0.782733142375946, + "learning_rate": 9.392565931712282e-05, + "loss": 0.936, + "step": 49650 + }, + { + "epoch": 0.31726358560239193, + "grad_norm": 0.6605043411254883, + "learning_rate": 9.392326205700288e-05, + "loss": 0.8659, + "step": 49660 + }, + { + "epoch": 0.31732747275213063, + "grad_norm": 0.831889808177948, + "learning_rate": 9.392086435453769e-05, + "loss": 0.8596, + "step": 49670 + }, + { + "epoch": 0.31739135990186934, + "grad_norm": 0.6261256337165833, + "learning_rate": 9.391846620975139e-05, + "loss": 0.9768, + "step": 49680 + }, + { + "epoch": 0.31745524705160805, + "grad_norm": 0.8194597959518433, + "learning_rate": 9.391606762266814e-05, + "loss": 0.7694, + "step": 49690 + }, + { + "epoch": 0.31751913420134675, + "grad_norm": 0.9790688157081604, + "learning_rate": 9.39136685933121e-05, + "loss": 1.0057, + "step": 49700 + }, + { + "epoch": 0.31758302135108546, + "grad_norm": 0.8990379571914673, + "learning_rate": 9.391126912170742e-05, + "loss": 0.9249, + "step": 49710 + }, + { + "epoch": 0.31764690850082417, + "grad_norm": 0.9338327050209045, + "learning_rate": 9.390886920787828e-05, + "loss": 0.99, + "step": 49720 + }, + { + "epoch": 0.31771079565056287, + "grad_norm": 0.7837555408477783, + "learning_rate": 9.390646885184884e-05, + "loss": 0.966, + "step": 49730 + }, + { + "epoch": 
0.3177746828003015, + "grad_norm": 0.7973876595497131, + "learning_rate": 9.390406805364327e-05, + "loss": 0.9153, + "step": 49740 + }, + { + "epoch": 0.31783856995004023, + "grad_norm": 0.8525822758674622, + "learning_rate": 9.390166681328575e-05, + "loss": 0.8138, + "step": 49750 + }, + { + "epoch": 0.31790245709977893, + "grad_norm": 1.3615291118621826, + "learning_rate": 9.389926513080047e-05, + "loss": 0.887, + "step": 49760 + }, + { + "epoch": 0.31796634424951764, + "grad_norm": 1.1276623010635376, + "learning_rate": 9.389686300621162e-05, + "loss": 0.9642, + "step": 49770 + }, + { + "epoch": 0.31803023139925635, + "grad_norm": 0.7759473323822021, + "learning_rate": 9.389446043954336e-05, + "loss": 0.7954, + "step": 49780 + }, + { + "epoch": 0.31809411854899505, + "grad_norm": 0.9552074074745178, + "learning_rate": 9.389205743081992e-05, + "loss": 0.8348, + "step": 49790 + }, + { + "epoch": 0.31815800569873376, + "grad_norm": 0.6165658831596375, + "learning_rate": 9.388965398006549e-05, + "loss": 0.9181, + "step": 49800 + }, + { + "epoch": 0.31822189284847247, + "grad_norm": 0.7126082181930542, + "learning_rate": 9.388725008730428e-05, + "loss": 1.1795, + "step": 49810 + }, + { + "epoch": 0.31828577999821117, + "grad_norm": 0.8872388005256653, + "learning_rate": 9.388484575256049e-05, + "loss": 0.9206, + "step": 49820 + }, + { + "epoch": 0.3183496671479499, + "grad_norm": 0.8513489365577698, + "learning_rate": 9.388244097585835e-05, + "loss": 1.0587, + "step": 49830 + }, + { + "epoch": 0.3184135542976886, + "grad_norm": 0.6428020596504211, + "learning_rate": 9.388003575722204e-05, + "loss": 0.7731, + "step": 49840 + }, + { + "epoch": 0.3184774414474273, + "grad_norm": 1.0599162578582764, + "learning_rate": 9.387763009667583e-05, + "loss": 0.8681, + "step": 49850 + }, + { + "epoch": 0.31854132859716594, + "grad_norm": 0.6444383859634399, + "learning_rate": 9.387522399424391e-05, + "loss": 0.79, + "step": 49860 + }, + { + "epoch": 0.31860521574690465, + "grad_norm": 0.6530598998069763, + "learning_rate": 9.387281744995053e-05, + "loss": 1.1558, + "step": 49870 + }, + { + "epoch": 0.31866910289664335, + "grad_norm": 0.7804102897644043, + "learning_rate": 9.387041046381994e-05, + "loss": 0.9238, + "step": 49880 + }, + { + "epoch": 0.31873299004638206, + "grad_norm": 0.8872278332710266, + "learning_rate": 9.386800303587635e-05, + "loss": 0.7599, + "step": 49890 + }, + { + "epoch": 0.31879687719612076, + "grad_norm": 0.5427918434143066, + "learning_rate": 9.386559516614401e-05, + "loss": 0.8869, + "step": 49900 + }, + { + "epoch": 0.31886076434585947, + "grad_norm": 0.8014676570892334, + "learning_rate": 9.386318685464719e-05, + "loss": 0.885, + "step": 49910 + }, + { + "epoch": 0.3189246514955982, + "grad_norm": 0.7238976359367371, + "learning_rate": 9.386077810141013e-05, + "loss": 0.8402, + "step": 49920 + }, + { + "epoch": 0.3189885386453369, + "grad_norm": 1.0209499597549438, + "learning_rate": 9.385836890645708e-05, + "loss": 0.7001, + "step": 49930 + }, + { + "epoch": 0.3190524257950756, + "grad_norm": 0.978877067565918, + "learning_rate": 9.385595926981232e-05, + "loss": 0.8989, + "step": 49940 + }, + { + "epoch": 0.3191163129448143, + "grad_norm": 1.336305856704712, + "learning_rate": 9.385354919150011e-05, + "loss": 1.0186, + "step": 49950 + }, + { + "epoch": 0.319180200094553, + "grad_norm": 0.9015640616416931, + "learning_rate": 9.385113867154473e-05, + "loss": 0.9387, + "step": 49960 + }, + { + "epoch": 0.3192440872442917, + "grad_norm": 1.013190507888794, + "learning_rate": 
9.384872770997043e-05, + "loss": 1.1722, + "step": 49970 + }, + { + "epoch": 0.31930797439403036, + "grad_norm": 1.667493224143982, + "learning_rate": 9.384631630680152e-05, + "loss": 0.9188, + "step": 49980 + }, + { + "epoch": 0.31937186154376906, + "grad_norm": 0.7360647320747375, + "learning_rate": 9.384390446206226e-05, + "loss": 0.8126, + "step": 49990 + }, + { + "epoch": 0.31943574869350777, + "grad_norm": 1.0479674339294434, + "learning_rate": 9.384149217577695e-05, + "loss": 0.8874, + "step": 50000 + }, + { + "epoch": 0.3194996358432465, + "grad_norm": 0.7030632495880127, + "learning_rate": 9.38390794479699e-05, + "loss": 1.0218, + "step": 50010 + }, + { + "epoch": 0.3195635229929852, + "grad_norm": 0.8451624512672424, + "learning_rate": 9.383666627866539e-05, + "loss": 0.8403, + "step": 50020 + }, + { + "epoch": 0.3196274101427239, + "grad_norm": 0.5618483424186707, + "learning_rate": 9.383425266788772e-05, + "loss": 0.7409, + "step": 50030 + }, + { + "epoch": 0.3196912972924626, + "grad_norm": 0.7848330736160278, + "learning_rate": 9.383183861566121e-05, + "loss": 0.9107, + "step": 50040 + }, + { + "epoch": 0.3197551844422013, + "grad_norm": 0.7237081527709961, + "learning_rate": 9.382942412201016e-05, + "loss": 0.6871, + "step": 50050 + }, + { + "epoch": 0.31981907159194, + "grad_norm": 1.093504786491394, + "learning_rate": 9.382700918695889e-05, + "loss": 0.8252, + "step": 50060 + }, + { + "epoch": 0.3198829587416787, + "grad_norm": 0.733311653137207, + "learning_rate": 9.382459381053173e-05, + "loss": 0.7627, + "step": 50070 + }, + { + "epoch": 0.3199468458914174, + "grad_norm": 0.8467728495597839, + "learning_rate": 9.3822177992753e-05, + "loss": 0.8656, + "step": 50080 + }, + { + "epoch": 0.3200107330411561, + "grad_norm": 1.3215608596801758, + "learning_rate": 9.381976173364702e-05, + "loss": 0.7438, + "step": 50090 + }, + { + "epoch": 0.3200746201908948, + "grad_norm": 0.7115028500556946, + "learning_rate": 9.381734503323812e-05, + "loss": 1.1696, + "step": 50100 + }, + { + "epoch": 0.3201385073406335, + "grad_norm": 0.7195971608161926, + "learning_rate": 9.381492789155066e-05, + "loss": 0.7597, + "step": 50110 + }, + { + "epoch": 0.3202023944903722, + "grad_norm": 0.7704067230224609, + "learning_rate": 9.381251030860896e-05, + "loss": 0.8963, + "step": 50120 + }, + { + "epoch": 0.3202662816401109, + "grad_norm": 0.7331963181495667, + "learning_rate": 9.381009228443737e-05, + "loss": 0.7832, + "step": 50130 + }, + { + "epoch": 0.3203301687898496, + "grad_norm": 0.8845292329788208, + "learning_rate": 9.380767381906029e-05, + "loss": 0.8378, + "step": 50140 + }, + { + "epoch": 0.3203940559395883, + "grad_norm": 0.714884340763092, + "learning_rate": 9.380525491250201e-05, + "loss": 0.9028, + "step": 50150 + }, + { + "epoch": 0.320457943089327, + "grad_norm": 0.6993659138679504, + "learning_rate": 9.380283556478691e-05, + "loss": 1.1563, + "step": 50160 + }, + { + "epoch": 0.3205218302390657, + "grad_norm": 1.0421202182769775, + "learning_rate": 9.380041577593937e-05, + "loss": 0.9022, + "step": 50170 + }, + { + "epoch": 0.3205857173888044, + "grad_norm": 1.632599115371704, + "learning_rate": 9.379799554598374e-05, + "loss": 0.8848, + "step": 50180 + }, + { + "epoch": 0.32064960453854313, + "grad_norm": 0.9513861536979675, + "learning_rate": 9.379557487494442e-05, + "loss": 1.0339, + "step": 50190 + }, + { + "epoch": 0.32071349168828184, + "grad_norm": 0.803616464138031, + "learning_rate": 9.379315376284576e-05, + "loss": 0.852, + "step": 50200 + }, + { + "epoch": 
0.32077737883802054, + "grad_norm": 0.938457727432251, + "learning_rate": 9.379073220971215e-05, + "loss": 0.9651, + "step": 50210 + }, + { + "epoch": 0.3208412659877592, + "grad_norm": 1.7576130628585815, + "learning_rate": 9.3788310215568e-05, + "loss": 0.8734, + "step": 50220 + }, + { + "epoch": 0.3209051531374979, + "grad_norm": 0.8921381831169128, + "learning_rate": 9.378588778043766e-05, + "loss": 0.9362, + "step": 50230 + }, + { + "epoch": 0.3209690402872366, + "grad_norm": 0.6942434906959534, + "learning_rate": 9.378346490434558e-05, + "loss": 0.8118, + "step": 50240 + }, + { + "epoch": 0.3210329274369753, + "grad_norm": 0.9267338514328003, + "learning_rate": 9.378104158731611e-05, + "loss": 0.8815, + "step": 50250 + }, + { + "epoch": 0.321096814586714, + "grad_norm": 0.8614099621772766, + "learning_rate": 9.377861782937369e-05, + "loss": 0.8871, + "step": 50260 + }, + { + "epoch": 0.3211607017364527, + "grad_norm": 0.944468080997467, + "learning_rate": 9.37761936305427e-05, + "loss": 1.0129, + "step": 50270 + }, + { + "epoch": 0.32122458888619143, + "grad_norm": 1.3810163736343384, + "learning_rate": 9.377376899084757e-05, + "loss": 0.6677, + "step": 50280 + }, + { + "epoch": 0.32128847603593014, + "grad_norm": 0.8450731039047241, + "learning_rate": 9.377134391031272e-05, + "loss": 1.0518, + "step": 50290 + }, + { + "epoch": 0.32135236318566884, + "grad_norm": 1.011494755744934, + "learning_rate": 9.376891838896258e-05, + "loss": 0.7787, + "step": 50300 + }, + { + "epoch": 0.32141625033540755, + "grad_norm": 0.8098331093788147, + "learning_rate": 9.376649242682154e-05, + "loss": 0.9035, + "step": 50310 + }, + { + "epoch": 0.32148013748514626, + "grad_norm": 0.7809407711029053, + "learning_rate": 9.376406602391407e-05, + "loss": 0.8507, + "step": 50320 + }, + { + "epoch": 0.32154402463488496, + "grad_norm": 1.0858904123306274, + "learning_rate": 9.376163918026461e-05, + "loss": 1.109, + "step": 50330 + }, + { + "epoch": 0.32160791178462367, + "grad_norm": 1.2475218772888184, + "learning_rate": 9.375921189589756e-05, + "loss": 0.7929, + "step": 50340 + }, + { + "epoch": 0.3216717989343623, + "grad_norm": 0.9990367293357849, + "learning_rate": 9.375678417083741e-05, + "loss": 0.9663, + "step": 50350 + }, + { + "epoch": 0.321735686084101, + "grad_norm": 0.7379415035247803, + "learning_rate": 9.375435600510858e-05, + "loss": 0.8639, + "step": 50360 + }, + { + "epoch": 0.32179957323383973, + "grad_norm": 0.7513096928596497, + "learning_rate": 9.375192739873553e-05, + "loss": 0.9025, + "step": 50370 + }, + { + "epoch": 0.32186346038357844, + "grad_norm": 1.1147832870483398, + "learning_rate": 9.374949835174273e-05, + "loss": 1.0522, + "step": 50380 + }, + { + "epoch": 0.32192734753331714, + "grad_norm": 1.0712778568267822, + "learning_rate": 9.374706886415462e-05, + "loss": 0.8807, + "step": 50390 + }, + { + "epoch": 0.32199123468305585, + "grad_norm": 1.0778201818466187, + "learning_rate": 9.374463893599568e-05, + "loss": 1.0531, + "step": 50400 + }, + { + "epoch": 0.32205512183279456, + "grad_norm": 0.7178623676300049, + "learning_rate": 9.374220856729039e-05, + "loss": 0.7271, + "step": 50410 + }, + { + "epoch": 0.32211900898253326, + "grad_norm": 0.9530071020126343, + "learning_rate": 9.373977775806321e-05, + "loss": 0.7916, + "step": 50420 + }, + { + "epoch": 0.32218289613227197, + "grad_norm": 0.8260478377342224, + "learning_rate": 9.373734650833862e-05, + "loss": 0.9391, + "step": 50430 + }, + { + "epoch": 0.3222467832820107, + "grad_norm": 0.9254810810089111, + "learning_rate": 
9.373491481814114e-05, + "loss": 1.0402, + "step": 50440 + }, + { + "epoch": 0.3223106704317494, + "grad_norm": 1.080959439277649, + "learning_rate": 9.373248268749521e-05, + "loss": 0.8513, + "step": 50450 + }, + { + "epoch": 0.3223745575814881, + "grad_norm": 1.447561264038086, + "learning_rate": 9.373005011642534e-05, + "loss": 0.8074, + "step": 50460 + }, + { + "epoch": 0.32243844473122674, + "grad_norm": 0.7180037498474121, + "learning_rate": 9.372761710495605e-05, + "loss": 0.9142, + "step": 50470 + }, + { + "epoch": 0.32250233188096544, + "grad_norm": 0.8241226077079773, + "learning_rate": 9.372518365311183e-05, + "loss": 0.7491, + "step": 50480 + }, + { + "epoch": 0.32256621903070415, + "grad_norm": 0.647534966468811, + "learning_rate": 9.372274976091718e-05, + "loss": 0.988, + "step": 50490 + }, + { + "epoch": 0.32263010618044285, + "grad_norm": 0.953140139579773, + "learning_rate": 9.372031542839658e-05, + "loss": 1.0637, + "step": 50500 + }, + { + "epoch": 0.32269399333018156, + "grad_norm": 1.1263874769210815, + "learning_rate": 9.371788065557463e-05, + "loss": 0.8773, + "step": 50510 + }, + { + "epoch": 0.32275788047992027, + "grad_norm": 0.9044056534767151, + "learning_rate": 9.371544544247577e-05, + "loss": 0.8032, + "step": 50520 + }, + { + "epoch": 0.322821767629659, + "grad_norm": 0.7417857646942139, + "learning_rate": 9.371300978912456e-05, + "loss": 0.8008, + "step": 50530 + }, + { + "epoch": 0.3228856547793977, + "grad_norm": 0.6616377830505371, + "learning_rate": 9.371057369554552e-05, + "loss": 0.9739, + "step": 50540 + }, + { + "epoch": 0.3229495419291364, + "grad_norm": 0.9354878664016724, + "learning_rate": 9.370813716176321e-05, + "loss": 0.9562, + "step": 50550 + }, + { + "epoch": 0.3230134290788751, + "grad_norm": 0.9551581740379333, + "learning_rate": 9.370570018780213e-05, + "loss": 0.9345, + "step": 50560 + }, + { + "epoch": 0.3230773162286138, + "grad_norm": 0.7437955737113953, + "learning_rate": 9.370326277368684e-05, + "loss": 0.8398, + "step": 50570 + }, + { + "epoch": 0.3231412033783525, + "grad_norm": 0.7190837264060974, + "learning_rate": 9.370082491944188e-05, + "loss": 0.978, + "step": 50580 + }, + { + "epoch": 0.32320509052809115, + "grad_norm": 1.749056100845337, + "learning_rate": 9.36983866250918e-05, + "loss": 1.241, + "step": 50590 + }, + { + "epoch": 0.32326897767782986, + "grad_norm": 0.8506900072097778, + "learning_rate": 9.369594789066119e-05, + "loss": 1.1666, + "step": 50600 + }, + { + "epoch": 0.32333286482756857, + "grad_norm": 0.8432791829109192, + "learning_rate": 9.369350871617454e-05, + "loss": 1.1055, + "step": 50610 + }, + { + "epoch": 0.3233967519773073, + "grad_norm": 0.8245983123779297, + "learning_rate": 9.36910691016565e-05, + "loss": 0.8282, + "step": 50620 + }, + { + "epoch": 0.323460639127046, + "grad_norm": 0.7552667856216431, + "learning_rate": 9.368862904713158e-05, + "loss": 0.8247, + "step": 50630 + }, + { + "epoch": 0.3235245262767847, + "grad_norm": 1.2741241455078125, + "learning_rate": 9.368618855262437e-05, + "loss": 0.8029, + "step": 50640 + }, + { + "epoch": 0.3235884134265234, + "grad_norm": 0.47164657711982727, + "learning_rate": 9.368374761815943e-05, + "loss": 1.1075, + "step": 50650 + }, + { + "epoch": 0.3236523005762621, + "grad_norm": 0.9931887984275818, + "learning_rate": 9.368130624376139e-05, + "loss": 0.7937, + "step": 50660 + }, + { + "epoch": 0.3237161877260008, + "grad_norm": 0.6441866755485535, + "learning_rate": 9.36788644294548e-05, + "loss": 0.748, + "step": 50670 + }, + { + "epoch": 
0.3237800748757395, + "grad_norm": 1.0310367345809937, + "learning_rate": 9.367642217526423e-05, + "loss": 0.7168, + "step": 50680 + }, + { + "epoch": 0.3238439620254782, + "grad_norm": 1.0950703620910645, + "learning_rate": 9.367397948121433e-05, + "loss": 0.722, + "step": 50690 + }, + { + "epoch": 0.3239078491752169, + "grad_norm": 0.7349279522895813, + "learning_rate": 9.367153634732966e-05, + "loss": 0.9114, + "step": 50700 + }, + { + "epoch": 0.3239717363249556, + "grad_norm": 0.6076371073722839, + "learning_rate": 9.366909277363484e-05, + "loss": 0.9256, + "step": 50710 + }, + { + "epoch": 0.3240356234746943, + "grad_norm": 0.8350464105606079, + "learning_rate": 9.366664876015448e-05, + "loss": 0.7421, + "step": 50720 + }, + { + "epoch": 0.324099510624433, + "grad_norm": 0.6974174976348877, + "learning_rate": 9.36642043069132e-05, + "loss": 0.9282, + "step": 50730 + }, + { + "epoch": 0.3241633977741717, + "grad_norm": 0.8667670488357544, + "learning_rate": 9.36617594139356e-05, + "loss": 1.0113, + "step": 50740 + }, + { + "epoch": 0.3242272849239104, + "grad_norm": 0.8927708864212036, + "learning_rate": 9.365931408124631e-05, + "loss": 1.0419, + "step": 50750 + }, + { + "epoch": 0.3242911720736491, + "grad_norm": 0.6961429119110107, + "learning_rate": 9.365686830886995e-05, + "loss": 0.9287, + "step": 50760 + }, + { + "epoch": 0.3243550592233878, + "grad_norm": 0.638645350933075, + "learning_rate": 9.365442209683116e-05, + "loss": 0.8599, + "step": 50770 + }, + { + "epoch": 0.3244189463731265, + "grad_norm": 1.2358146905899048, + "learning_rate": 9.365197544515456e-05, + "loss": 0.7256, + "step": 50780 + }, + { + "epoch": 0.3244828335228652, + "grad_norm": 0.6738364696502686, + "learning_rate": 9.364952835386482e-05, + "loss": 1.0181, + "step": 50790 + }, + { + "epoch": 0.32454672067260393, + "grad_norm": 1.1015185117721558, + "learning_rate": 9.364708082298656e-05, + "loss": 0.6716, + "step": 50800 + }, + { + "epoch": 0.32461060782234263, + "grad_norm": 0.45196080207824707, + "learning_rate": 9.364463285254446e-05, + "loss": 0.8246, + "step": 50810 + }, + { + "epoch": 0.32467449497208134, + "grad_norm": 0.5970223546028137, + "learning_rate": 9.364218444256312e-05, + "loss": 1.08, + "step": 50820 + }, + { + "epoch": 0.32473838212182, + "grad_norm": 0.6557901501655579, + "learning_rate": 9.363973559306724e-05, + "loss": 0.8581, + "step": 50830 + }, + { + "epoch": 0.3248022692715587, + "grad_norm": 1.3220319747924805, + "learning_rate": 9.363728630408146e-05, + "loss": 0.8748, + "step": 50840 + }, + { + "epoch": 0.3248661564212974, + "grad_norm": 0.6746674180030823, + "learning_rate": 9.363483657563046e-05, + "loss": 0.9018, + "step": 50850 + }, + { + "epoch": 0.3249300435710361, + "grad_norm": 0.8742882013320923, + "learning_rate": 9.363238640773891e-05, + "loss": 0.8107, + "step": 50860 + }, + { + "epoch": 0.3249939307207748, + "grad_norm": 0.8248798847198486, + "learning_rate": 9.362993580043148e-05, + "loss": 0.9382, + "step": 50870 + }, + { + "epoch": 0.3250578178705135, + "grad_norm": 0.7204467058181763, + "learning_rate": 9.362748475373284e-05, + "loss": 1.0355, + "step": 50880 + }, + { + "epoch": 0.3251217050202522, + "grad_norm": 2.2558605670928955, + "learning_rate": 9.36250332676677e-05, + "loss": 1.1596, + "step": 50890 + }, + { + "epoch": 0.32518559216999093, + "grad_norm": 0.6045457124710083, + "learning_rate": 9.362258134226074e-05, + "loss": 0.905, + "step": 50900 + }, + { + "epoch": 0.32524947931972964, + "grad_norm": 0.6380605101585388, + "learning_rate": 
9.362012897753662e-05, + "loss": 0.969, + "step": 50910 + }, + { + "epoch": 0.32531336646946835, + "grad_norm": 1.1410925388336182, + "learning_rate": 9.361767617352008e-05, + "loss": 0.7254, + "step": 50920 + }, + { + "epoch": 0.32537725361920705, + "grad_norm": 1.1019296646118164, + "learning_rate": 9.361522293023581e-05, + "loss": 0.9216, + "step": 50930 + }, + { + "epoch": 0.32544114076894576, + "grad_norm": 1.174682378768921, + "learning_rate": 9.361276924770853e-05, + "loss": 0.8368, + "step": 50940 + }, + { + "epoch": 0.3255050279186844, + "grad_norm": 0.5832127928733826, + "learning_rate": 9.36103151259629e-05, + "loss": 0.7741, + "step": 50950 + }, + { + "epoch": 0.3255689150684231, + "grad_norm": 0.9537053108215332, + "learning_rate": 9.360786056502367e-05, + "loss": 0.9866, + "step": 50960 + }, + { + "epoch": 0.3256328022181618, + "grad_norm": 1.0229930877685547, + "learning_rate": 9.360540556491558e-05, + "loss": 0.8135, + "step": 50970 + }, + { + "epoch": 0.3256966893679005, + "grad_norm": 1.187549352645874, + "learning_rate": 9.360295012566332e-05, + "loss": 0.9386, + "step": 50980 + }, + { + "epoch": 0.32576057651763923, + "grad_norm": 1.09153151512146, + "learning_rate": 9.360049424729162e-05, + "loss": 0.8265, + "step": 50990 + }, + { + "epoch": 0.32582446366737794, + "grad_norm": 0.7309248447418213, + "learning_rate": 9.359803792982525e-05, + "loss": 0.8341, + "step": 51000 + }, + { + "epoch": 0.32588835081711665, + "grad_norm": 0.45413196086883545, + "learning_rate": 9.359558117328891e-05, + "loss": 0.9438, + "step": 51010 + }, + { + "epoch": 0.32595223796685535, + "grad_norm": 0.8382761478424072, + "learning_rate": 9.359312397770733e-05, + "loss": 0.8004, + "step": 51020 + }, + { + "epoch": 0.32601612511659406, + "grad_norm": 0.8672879338264465, + "learning_rate": 9.359066634310529e-05, + "loss": 0.9812, + "step": 51030 + }, + { + "epoch": 0.32608001226633276, + "grad_norm": 1.0492173433303833, + "learning_rate": 9.358820826950754e-05, + "loss": 0.6533, + "step": 51040 + }, + { + "epoch": 0.32614389941607147, + "grad_norm": 0.9843623042106628, + "learning_rate": 9.358574975693882e-05, + "loss": 0.8744, + "step": 51050 + }, + { + "epoch": 0.3262077865658102, + "grad_norm": 0.9067795872688293, + "learning_rate": 9.358329080542389e-05, + "loss": 0.8468, + "step": 51060 + }, + { + "epoch": 0.3262716737155489, + "grad_norm": 0.6147485971450806, + "learning_rate": 9.358083141498751e-05, + "loss": 0.9103, + "step": 51070 + }, + { + "epoch": 0.32633556086528753, + "grad_norm": 0.8605659008026123, + "learning_rate": 9.357837158565446e-05, + "loss": 0.7366, + "step": 51080 + }, + { + "epoch": 0.32639944801502624, + "grad_norm": 0.6923385858535767, + "learning_rate": 9.357591131744952e-05, + "loss": 1.0148, + "step": 51090 + }, + { + "epoch": 0.32646333516476495, + "grad_norm": 0.8151227235794067, + "learning_rate": 9.357345061039745e-05, + "loss": 0.9406, + "step": 51100 + }, + { + "epoch": 0.32652722231450365, + "grad_norm": 0.7649495005607605, + "learning_rate": 9.357098946452301e-05, + "loss": 0.8339, + "step": 51110 + }, + { + "epoch": 0.32659110946424236, + "grad_norm": 1.2730705738067627, + "learning_rate": 9.356852787985105e-05, + "loss": 1.0306, + "step": 51120 + }, + { + "epoch": 0.32665499661398106, + "grad_norm": 0.6461998820304871, + "learning_rate": 9.35660658564063e-05, + "loss": 0.7584, + "step": 51130 + }, + { + "epoch": 0.32671888376371977, + "grad_norm": 0.999034583568573, + "learning_rate": 9.356360339421357e-05, + "loss": 0.9405, + "step": 51140 + }, + { + 
"epoch": 0.3267827709134585, + "grad_norm": 1.0120235681533813, + "learning_rate": 9.356114049329767e-05, + "loss": 0.8825, + "step": 51150 + }, + { + "epoch": 0.3268466580631972, + "grad_norm": 0.9906069040298462, + "learning_rate": 9.35586771536834e-05, + "loss": 0.8241, + "step": 51160 + }, + { + "epoch": 0.3269105452129359, + "grad_norm": 0.6129631996154785, + "learning_rate": 9.355621337539558e-05, + "loss": 0.8977, + "step": 51170 + }, + { + "epoch": 0.3269744323626746, + "grad_norm": 0.9414940476417542, + "learning_rate": 9.3553749158459e-05, + "loss": 0.9349, + "step": 51180 + }, + { + "epoch": 0.3270383195124133, + "grad_norm": 0.7236936092376709, + "learning_rate": 9.35512845028985e-05, + "loss": 0.8081, + "step": 51190 + }, + { + "epoch": 0.32710220666215195, + "grad_norm": 1.0342822074890137, + "learning_rate": 9.354881940873888e-05, + "loss": 0.8802, + "step": 51200 + }, + { + "epoch": 0.32716609381189066, + "grad_norm": 0.7799363136291504, + "learning_rate": 9.354635387600497e-05, + "loss": 0.8126, + "step": 51210 + }, + { + "epoch": 0.32722998096162936, + "grad_norm": 0.5983444452285767, + "learning_rate": 9.35438879047216e-05, + "loss": 0.909, + "step": 51220 + }, + { + "epoch": 0.32729386811136807, + "grad_norm": 0.9435120820999146, + "learning_rate": 9.35414214949136e-05, + "loss": 0.8125, + "step": 51230 + }, + { + "epoch": 0.3273577552611068, + "grad_norm": 0.7243615388870239, + "learning_rate": 9.353895464660585e-05, + "loss": 0.9006, + "step": 51240 + }, + { + "epoch": 0.3274216424108455, + "grad_norm": 0.7836112976074219, + "learning_rate": 9.353648735982312e-05, + "loss": 0.8456, + "step": 51250 + }, + { + "epoch": 0.3274855295605842, + "grad_norm": 0.8060475587844849, + "learning_rate": 9.353401963459032e-05, + "loss": 0.8903, + "step": 51260 + }, + { + "epoch": 0.3275494167103229, + "grad_norm": 0.49010974168777466, + "learning_rate": 9.353155147093228e-05, + "loss": 0.7631, + "step": 51270 + }, + { + "epoch": 0.3276133038600616, + "grad_norm": 1.089375615119934, + "learning_rate": 9.352908286887385e-05, + "loss": 0.8264, + "step": 51280 + }, + { + "epoch": 0.3276771910098003, + "grad_norm": 0.7374083399772644, + "learning_rate": 9.35266138284399e-05, + "loss": 0.7873, + "step": 51290 + }, + { + "epoch": 0.327741078159539, + "grad_norm": 0.8920226693153381, + "learning_rate": 9.352414434965531e-05, + "loss": 0.8032, + "step": 51300 + }, + { + "epoch": 0.3278049653092777, + "grad_norm": 0.8124125599861145, + "learning_rate": 9.35216744325449e-05, + "loss": 0.7542, + "step": 51310 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.7306677103042603, + "learning_rate": 9.35192040771336e-05, + "loss": 0.7191, + "step": 51320 + }, + { + "epoch": 0.3279327396087551, + "grad_norm": 1.0035959482192993, + "learning_rate": 9.351673328344626e-05, + "loss": 0.8754, + "step": 51330 + }, + { + "epoch": 0.3279966267584938, + "grad_norm": 1.0333647727966309, + "learning_rate": 9.351426205150777e-05, + "loss": 1.091, + "step": 51340 + }, + { + "epoch": 0.3280605139082325, + "grad_norm": 0.8857297897338867, + "learning_rate": 9.351179038134301e-05, + "loss": 0.8794, + "step": 51350 + }, + { + "epoch": 0.3281244010579712, + "grad_norm": 0.7112529873847961, + "learning_rate": 9.350931827297689e-05, + "loss": 0.9096, + "step": 51360 + }, + { + "epoch": 0.3281882882077099, + "grad_norm": 1.19603431224823, + "learning_rate": 9.350684572643427e-05, + "loss": 1.207, + "step": 51370 + }, + { + "epoch": 0.3282521753574486, + "grad_norm": 0.8583690524101257, + "learning_rate": 
9.350437274174009e-05, + "loss": 0.9503, + "step": 51380 + }, + { + "epoch": 0.3283160625071873, + "grad_norm": 1.2817461490631104, + "learning_rate": 9.350189931891925e-05, + "loss": 0.8277, + "step": 51390 + }, + { + "epoch": 0.328379949656926, + "grad_norm": 1.0474090576171875, + "learning_rate": 9.349942545799664e-05, + "loss": 0.6875, + "step": 51400 + }, + { + "epoch": 0.3284438368066647, + "grad_norm": 2.7098753452301025, + "learning_rate": 9.349695115899717e-05, + "loss": 1.1116, + "step": 51410 + }, + { + "epoch": 0.32850772395640343, + "grad_norm": 0.9274638295173645, + "learning_rate": 9.349447642194578e-05, + "loss": 0.9108, + "step": 51420 + }, + { + "epoch": 0.32857161110614214, + "grad_norm": 0.9244821667671204, + "learning_rate": 9.34920012468674e-05, + "loss": 1.0516, + "step": 51430 + }, + { + "epoch": 0.3286354982558808, + "grad_norm": 0.7755671143531799, + "learning_rate": 9.348952563378693e-05, + "loss": 0.9229, + "step": 51440 + }, + { + "epoch": 0.3286993854056195, + "grad_norm": 0.8693557381629944, + "learning_rate": 9.348704958272931e-05, + "loss": 1.0101, + "step": 51450 + }, + { + "epoch": 0.3287632725553582, + "grad_norm": 1.1465810537338257, + "learning_rate": 9.348457309371948e-05, + "loss": 0.6397, + "step": 51460 + }, + { + "epoch": 0.3288271597050969, + "grad_norm": 1.0010018348693848, + "learning_rate": 9.348209616678238e-05, + "loss": 0.6065, + "step": 51470 + }, + { + "epoch": 0.3288910468548356, + "grad_norm": 0.7339697480201721, + "learning_rate": 9.347961880194296e-05, + "loss": 0.8538, + "step": 51480 + }, + { + "epoch": 0.3289549340045743, + "grad_norm": 0.5943730473518372, + "learning_rate": 9.347714099922616e-05, + "loss": 0.7904, + "step": 51490 + }, + { + "epoch": 0.329018821154313, + "grad_norm": 0.6852931380271912, + "learning_rate": 9.347466275865694e-05, + "loss": 0.8648, + "step": 51500 + }, + { + "epoch": 0.32908270830405173, + "grad_norm": 0.9251651167869568, + "learning_rate": 9.347218408026025e-05, + "loss": 0.9211, + "step": 51510 + }, + { + "epoch": 0.32914659545379044, + "grad_norm": 0.7798057794570923, + "learning_rate": 9.346970496406105e-05, + "loss": 0.6644, + "step": 51520 + }, + { + "epoch": 0.32921048260352914, + "grad_norm": 0.7447190880775452, + "learning_rate": 9.346722541008432e-05, + "loss": 0.7424, + "step": 51530 + }, + { + "epoch": 0.32927436975326785, + "grad_norm": 0.6202836036682129, + "learning_rate": 9.346474541835504e-05, + "loss": 1.0315, + "step": 51540 + }, + { + "epoch": 0.32933825690300655, + "grad_norm": 0.8937282562255859, + "learning_rate": 9.346226498889817e-05, + "loss": 0.8993, + "step": 51550 + }, + { + "epoch": 0.3294021440527452, + "grad_norm": 0.7935136556625366, + "learning_rate": 9.345978412173866e-05, + "loss": 0.8778, + "step": 51560 + }, + { + "epoch": 0.3294660312024839, + "grad_norm": 0.5474129319190979, + "learning_rate": 9.345730281690156e-05, + "loss": 0.9576, + "step": 51570 + }, + { + "epoch": 0.3295299183522226, + "grad_norm": 0.7357134819030762, + "learning_rate": 9.345482107441182e-05, + "loss": 0.7586, + "step": 51580 + }, + { + "epoch": 0.3295938055019613, + "grad_norm": 1.1119588613510132, + "learning_rate": 9.345233889429442e-05, + "loss": 0.9351, + "step": 51590 + }, + { + "epoch": 0.32965769265170003, + "grad_norm": 1.2632428407669067, + "learning_rate": 9.344985627657439e-05, + "loss": 0.7127, + "step": 51600 + }, + { + "epoch": 0.32972157980143874, + "grad_norm": 0.9762067198753357, + "learning_rate": 9.344737322127671e-05, + "loss": 1.2152, + "step": 51610 + }, + { + 
"epoch": 0.32978546695117744, + "grad_norm": 0.6457685232162476, + "learning_rate": 9.34448897284264e-05, + "loss": 0.7824, + "step": 51620 + }, + { + "epoch": 0.32984935410091615, + "grad_norm": 0.8155549764633179, + "learning_rate": 9.344240579804846e-05, + "loss": 1.0339, + "step": 51630 + }, + { + "epoch": 0.32991324125065485, + "grad_norm": 0.7448208332061768, + "learning_rate": 9.343992143016791e-05, + "loss": 0.8618, + "step": 51640 + }, + { + "epoch": 0.32997712840039356, + "grad_norm": 1.0054072141647339, + "learning_rate": 9.343743662480977e-05, + "loss": 1.0246, + "step": 51650 + }, + { + "epoch": 0.33004101555013227, + "grad_norm": 0.7192854285240173, + "learning_rate": 9.343495138199907e-05, + "loss": 0.8324, + "step": 51660 + }, + { + "epoch": 0.330104902699871, + "grad_norm": 1.023016095161438, + "learning_rate": 9.343246570176083e-05, + "loss": 0.9012, + "step": 51670 + }, + { + "epoch": 0.3301687898496096, + "grad_norm": 0.9066780209541321, + "learning_rate": 9.34299795841201e-05, + "loss": 0.7939, + "step": 51680 + }, + { + "epoch": 0.33023267699934833, + "grad_norm": 0.9442875385284424, + "learning_rate": 9.342749302910188e-05, + "loss": 0.8326, + "step": 51690 + }, + { + "epoch": 0.33029656414908704, + "grad_norm": 0.7106100916862488, + "learning_rate": 9.342500603673125e-05, + "loss": 0.9531, + "step": 51700 + }, + { + "epoch": 0.33036045129882574, + "grad_norm": 1.2981065511703491, + "learning_rate": 9.342251860703324e-05, + "loss": 0.9319, + "step": 51710 + }, + { + "epoch": 0.33042433844856445, + "grad_norm": 0.6678254008293152, + "learning_rate": 9.34200307400329e-05, + "loss": 0.7245, + "step": 51720 + }, + { + "epoch": 0.33048822559830315, + "grad_norm": 0.716221809387207, + "learning_rate": 9.341754243575528e-05, + "loss": 0.9938, + "step": 51730 + }, + { + "epoch": 0.33055211274804186, + "grad_norm": 0.9788273572921753, + "learning_rate": 9.341505369422546e-05, + "loss": 0.9796, + "step": 51740 + }, + { + "epoch": 0.33061599989778057, + "grad_norm": 0.6634423136711121, + "learning_rate": 9.341256451546848e-05, + "loss": 0.9807, + "step": 51750 + }, + { + "epoch": 0.33067988704751927, + "grad_norm": 1.2706855535507202, + "learning_rate": 9.341007489950942e-05, + "loss": 0.6877, + "step": 51760 + }, + { + "epoch": 0.330743774197258, + "grad_norm": 2.7023708820343018, + "learning_rate": 9.340758484637334e-05, + "loss": 0.7223, + "step": 51770 + }, + { + "epoch": 0.3308076613469967, + "grad_norm": 0.543978214263916, + "learning_rate": 9.340509435608534e-05, + "loss": 0.9397, + "step": 51780 + }, + { + "epoch": 0.3308715484967354, + "grad_norm": 0.8344589471817017, + "learning_rate": 9.340260342867049e-05, + "loss": 0.884, + "step": 51790 + }, + { + "epoch": 0.33093543564647404, + "grad_norm": 0.6055552959442139, + "learning_rate": 9.340011206415386e-05, + "loss": 0.6553, + "step": 51800 + }, + { + "epoch": 0.33099932279621275, + "grad_norm": 1.077162265777588, + "learning_rate": 9.339762026256058e-05, + "loss": 0.7583, + "step": 51810 + }, + { + "epoch": 0.33106320994595145, + "grad_norm": 1.1166653633117676, + "learning_rate": 9.33951280239157e-05, + "loss": 0.8469, + "step": 51820 + }, + { + "epoch": 0.33112709709569016, + "grad_norm": 0.7520068287849426, + "learning_rate": 9.339263534824436e-05, + "loss": 0.8934, + "step": 51830 + }, + { + "epoch": 0.33119098424542887, + "grad_norm": 0.8291226029396057, + "learning_rate": 9.339014223557163e-05, + "loss": 0.9665, + "step": 51840 + }, + { + "epoch": 0.33125487139516757, + "grad_norm": 0.8782137036323547, + 
"learning_rate": 9.338764868592262e-05, + "loss": 0.8251, + "step": 51850 + }, + { + "epoch": 0.3313187585449063, + "grad_norm": 0.6978154182434082, + "learning_rate": 9.338515469932246e-05, + "loss": 0.7853, + "step": 51860 + }, + { + "epoch": 0.331382645694645, + "grad_norm": 0.7604345679283142, + "learning_rate": 9.338266027579626e-05, + "loss": 0.8233, + "step": 51870 + }, + { + "epoch": 0.3314465328443837, + "grad_norm": 0.9156827926635742, + "learning_rate": 9.338016541536914e-05, + "loss": 0.7708, + "step": 51880 + }, + { + "epoch": 0.3315104199941224, + "grad_norm": 0.6973231434822083, + "learning_rate": 9.337767011806622e-05, + "loss": 0.82, + "step": 51890 + }, + { + "epoch": 0.3315743071438611, + "grad_norm": 0.7553335428237915, + "learning_rate": 9.337517438391263e-05, + "loss": 0.9323, + "step": 51900 + }, + { + "epoch": 0.3316381942935998, + "grad_norm": 1.0353292226791382, + "learning_rate": 9.337267821293351e-05, + "loss": 1.0569, + "step": 51910 + }, + { + "epoch": 0.3317020814433385, + "grad_norm": 0.6070806980133057, + "learning_rate": 9.3370181605154e-05, + "loss": 0.7626, + "step": 51920 + }, + { + "epoch": 0.33176596859307717, + "grad_norm": 1.3806092739105225, + "learning_rate": 9.336768456059925e-05, + "loss": 0.8368, + "step": 51930 + }, + { + "epoch": 0.33182985574281587, + "grad_norm": 0.8327397108078003, + "learning_rate": 9.33651870792944e-05, + "loss": 0.8303, + "step": 51940 + }, + { + "epoch": 0.3318937428925546, + "grad_norm": 0.916780948638916, + "learning_rate": 9.33626891612646e-05, + "loss": 0.8053, + "step": 51950 + }, + { + "epoch": 0.3319576300422933, + "grad_norm": 0.7326523065567017, + "learning_rate": 9.3360190806535e-05, + "loss": 0.8407, + "step": 51960 + }, + { + "epoch": 0.332021517192032, + "grad_norm": 1.0814404487609863, + "learning_rate": 9.335769201513075e-05, + "loss": 1.0026, + "step": 51970 + }, + { + "epoch": 0.3320854043417707, + "grad_norm": 0.9184064865112305, + "learning_rate": 9.335519278707705e-05, + "loss": 1.1877, + "step": 51980 + }, + { + "epoch": 0.3321492914915094, + "grad_norm": 0.7729029655456543, + "learning_rate": 9.335269312239904e-05, + "loss": 1.0875, + "step": 51990 + }, + { + "epoch": 0.3322131786412481, + "grad_norm": 1.2618939876556396, + "learning_rate": 9.335019302112193e-05, + "loss": 0.9594, + "step": 52000 + }, + { + "epoch": 0.3322770657909868, + "grad_norm": 0.6286314725875854, + "learning_rate": 9.334769248327085e-05, + "loss": 0.8619, + "step": 52010 + }, + { + "epoch": 0.3323409529407255, + "grad_norm": 1.7984399795532227, + "learning_rate": 9.334519150887103e-05, + "loss": 0.9147, + "step": 52020 + }, + { + "epoch": 0.3324048400904642, + "grad_norm": 1.0144270658493042, + "learning_rate": 9.33426900979476e-05, + "loss": 0.8399, + "step": 52030 + }, + { + "epoch": 0.33246872724020293, + "grad_norm": 1.9516681432724, + "learning_rate": 9.33401882505258e-05, + "loss": 0.8073, + "step": 52040 + }, + { + "epoch": 0.3325326143899416, + "grad_norm": 0.5465503931045532, + "learning_rate": 9.333768596663082e-05, + "loss": 0.8589, + "step": 52050 + }, + { + "epoch": 0.3325965015396803, + "grad_norm": 0.9213358759880066, + "learning_rate": 9.333518324628783e-05, + "loss": 0.6787, + "step": 52060 + }, + { + "epoch": 0.332660388689419, + "grad_norm": 0.7872808575630188, + "learning_rate": 9.333268008952206e-05, + "loss": 0.8307, + "step": 52070 + }, + { + "epoch": 0.3327242758391577, + "grad_norm": 0.9161990284919739, + "learning_rate": 9.333017649635871e-05, + "loss": 1.1748, + "step": 52080 + }, + { + "epoch": 
0.3327881629888964, + "grad_norm": 0.7564883232116699, + "learning_rate": 9.332767246682301e-05, + "loss": 0.735, + "step": 52090 + }, + { + "epoch": 0.3328520501386351, + "grad_norm": 0.7654510140419006, + "learning_rate": 9.332516800094015e-05, + "loss": 0.5545, + "step": 52100 + }, + { + "epoch": 0.3329159372883738, + "grad_norm": 0.5725670456886292, + "learning_rate": 9.332266309873538e-05, + "loss": 1.294, + "step": 52110 + }, + { + "epoch": 0.3329798244381125, + "grad_norm": 1.0625219345092773, + "learning_rate": 9.332015776023391e-05, + "loss": 0.9125, + "step": 52120 + }, + { + "epoch": 0.33304371158785123, + "grad_norm": 0.9181973934173584, + "learning_rate": 9.331765198546097e-05, + "loss": 0.8822, + "step": 52130 + }, + { + "epoch": 0.33310759873758994, + "grad_norm": 1.5193865299224854, + "learning_rate": 9.33151457744418e-05, + "loss": 0.8638, + "step": 52140 + }, + { + "epoch": 0.33317148588732864, + "grad_norm": 1.0229812860488892, + "learning_rate": 9.331263912720165e-05, + "loss": 0.8326, + "step": 52150 + }, + { + "epoch": 0.33323537303706735, + "grad_norm": 1.2124236822128296, + "learning_rate": 9.331013204376573e-05, + "loss": 0.7525, + "step": 52160 + }, + { + "epoch": 0.333299260186806, + "grad_norm": 0.8149605393409729, + "learning_rate": 9.330762452415934e-05, + "loss": 0.9571, + "step": 52170 + }, + { + "epoch": 0.3333631473365447, + "grad_norm": 1.2210596799850464, + "learning_rate": 9.330511656840768e-05, + "loss": 1.0235, + "step": 52180 + }, + { + "epoch": 0.3334270344862834, + "grad_norm": 0.6121252775192261, + "learning_rate": 9.330260817653604e-05, + "loss": 0.8165, + "step": 52190 + }, + { + "epoch": 0.3334909216360221, + "grad_norm": 0.770204484462738, + "learning_rate": 9.330009934856967e-05, + "loss": 1.0588, + "step": 52200 + }, + { + "epoch": 0.3335548087857608, + "grad_norm": 0.6882258057594299, + "learning_rate": 9.329759008453385e-05, + "loss": 0.7148, + "step": 52210 + }, + { + "epoch": 0.33361869593549953, + "grad_norm": 1.0399905443191528, + "learning_rate": 9.329508038445382e-05, + "loss": 0.7462, + "step": 52220 + }, + { + "epoch": 0.33368258308523824, + "grad_norm": 1.3440240621566772, + "learning_rate": 9.32925702483549e-05, + "loss": 0.8031, + "step": 52230 + }, + { + "epoch": 0.33374647023497694, + "grad_norm": 0.5900636911392212, + "learning_rate": 9.329005967626234e-05, + "loss": 0.8395, + "step": 52240 + }, + { + "epoch": 0.33381035738471565, + "grad_norm": 0.8768534660339355, + "learning_rate": 9.328754866820142e-05, + "loss": 1.068, + "step": 52250 + }, + { + "epoch": 0.33387424453445436, + "grad_norm": 1.3895585536956787, + "learning_rate": 9.328503722419744e-05, + "loss": 0.9927, + "step": 52260 + }, + { + "epoch": 0.33393813168419306, + "grad_norm": 0.9054799675941467, + "learning_rate": 9.328252534427568e-05, + "loss": 0.7075, + "step": 52270 + }, + { + "epoch": 0.33400201883393177, + "grad_norm": 0.9763078689575195, + "learning_rate": 9.328001302846145e-05, + "loss": 1.0979, + "step": 52280 + }, + { + "epoch": 0.3340659059836704, + "grad_norm": 0.9339504241943359, + "learning_rate": 9.327750027678005e-05, + "loss": 0.8496, + "step": 52290 + }, + { + "epoch": 0.3341297931334091, + "grad_norm": 1.0181572437286377, + "learning_rate": 9.327498708925677e-05, + "loss": 0.9902, + "step": 52300 + }, + { + "epoch": 0.33419368028314783, + "grad_norm": 0.7839872241020203, + "learning_rate": 9.327247346591694e-05, + "loss": 0.9367, + "step": 52310 + }, + { + "epoch": 0.33425756743288654, + "grad_norm": 0.5195721387863159, + "learning_rate": 
9.326995940678587e-05, + "loss": 0.8934, + "step": 52320 + }, + { + "epoch": 0.33432145458262524, + "grad_norm": 0.8356695175170898, + "learning_rate": 9.326744491188888e-05, + "loss": 0.9758, + "step": 52330 + }, + { + "epoch": 0.33438534173236395, + "grad_norm": 1.7605299949645996, + "learning_rate": 9.326492998125128e-05, + "loss": 0.7481, + "step": 52340 + }, + { + "epoch": 0.33444922888210266, + "grad_norm": 1.1781415939331055, + "learning_rate": 9.326241461489839e-05, + "loss": 0.8391, + "step": 52350 + }, + { + "epoch": 0.33451311603184136, + "grad_norm": 1.0908046960830688, + "learning_rate": 9.325989881285559e-05, + "loss": 0.6456, + "step": 52360 + }, + { + "epoch": 0.33457700318158007, + "grad_norm": 0.6815122961997986, + "learning_rate": 9.325738257514816e-05, + "loss": 0.7305, + "step": 52370 + }, + { + "epoch": 0.3346408903313188, + "grad_norm": 0.8093428611755371, + "learning_rate": 9.325486590180149e-05, + "loss": 0.778, + "step": 52380 + }, + { + "epoch": 0.3347047774810575, + "grad_norm": 0.8250554800033569, + "learning_rate": 9.325234879284086e-05, + "loss": 0.6891, + "step": 52390 + }, + { + "epoch": 0.3347686646307962, + "grad_norm": 0.8145758509635925, + "learning_rate": 9.324983124829169e-05, + "loss": 0.925, + "step": 52400 + }, + { + "epoch": 0.33483255178053484, + "grad_norm": 0.7351551651954651, + "learning_rate": 9.324731326817928e-05, + "loss": 1.0542, + "step": 52410 + }, + { + "epoch": 0.33489643893027354, + "grad_norm": 0.9697402119636536, + "learning_rate": 9.324479485252904e-05, + "loss": 0.9113, + "step": 52420 + }, + { + "epoch": 0.33496032608001225, + "grad_norm": 0.8043109774589539, + "learning_rate": 9.324227600136628e-05, + "loss": 0.9284, + "step": 52430 + }, + { + "epoch": 0.33502421322975096, + "grad_norm": 0.6603406667709351, + "learning_rate": 9.32397567147164e-05, + "loss": 0.9909, + "step": 52440 + }, + { + "epoch": 0.33508810037948966, + "grad_norm": 0.5201127529144287, + "learning_rate": 9.323723699260476e-05, + "loss": 0.7742, + "step": 52450 + }, + { + "epoch": 0.33515198752922837, + "grad_norm": 1.1055912971496582, + "learning_rate": 9.323471683505674e-05, + "loss": 0.8968, + "step": 52460 + }, + { + "epoch": 0.3352158746789671, + "grad_norm": 1.6916980743408203, + "learning_rate": 9.323219624209772e-05, + "loss": 0.8835, + "step": 52470 + }, + { + "epoch": 0.3352797618287058, + "grad_norm": 0.889218270778656, + "learning_rate": 9.322967521375307e-05, + "loss": 0.8445, + "step": 52480 + }, + { + "epoch": 0.3353436489784445, + "grad_norm": 0.9384592175483704, + "learning_rate": 9.32271537500482e-05, + "loss": 0.701, + "step": 52490 + }, + { + "epoch": 0.3354075361281832, + "grad_norm": 1.5930566787719727, + "learning_rate": 9.322463185100849e-05, + "loss": 0.8175, + "step": 52500 + }, + { + "epoch": 0.3354714232779219, + "grad_norm": 0.6644344925880432, + "learning_rate": 9.322210951665935e-05, + "loss": 1.0025, + "step": 52510 + }, + { + "epoch": 0.3355353104276606, + "grad_norm": 0.9203514456748962, + "learning_rate": 9.321958674702617e-05, + "loss": 0.7474, + "step": 52520 + }, + { + "epoch": 0.33559919757739926, + "grad_norm": 0.5946767330169678, + "learning_rate": 9.321706354213438e-05, + "loss": 0.7824, + "step": 52530 + }, + { + "epoch": 0.33566308472713796, + "grad_norm": 0.7163698077201843, + "learning_rate": 9.321453990200935e-05, + "loss": 0.778, + "step": 52540 + }, + { + "epoch": 0.33572697187687667, + "grad_norm": 0.8819127678871155, + "learning_rate": 9.321201582667653e-05, + "loss": 0.8468, + "step": 52550 + }, + { + 
"epoch": 0.3357908590266154, + "grad_norm": 0.8515467643737793, + "learning_rate": 9.320949131616132e-05, + "loss": 0.9673, + "step": 52560 + }, + { + "epoch": 0.3358547461763541, + "grad_norm": 0.7722886204719543, + "learning_rate": 9.320696637048915e-05, + "loss": 0.8924, + "step": 52570 + }, + { + "epoch": 0.3359186333260928, + "grad_norm": 0.7204701900482178, + "learning_rate": 9.320444098968545e-05, + "loss": 0.9221, + "step": 52580 + }, + { + "epoch": 0.3359825204758315, + "grad_norm": 1.010270595550537, + "learning_rate": 9.320191517377566e-05, + "loss": 1.2194, + "step": 52590 + }, + { + "epoch": 0.3360464076255702, + "grad_norm": 0.7149573564529419, + "learning_rate": 9.319938892278519e-05, + "loss": 0.9444, + "step": 52600 + }, + { + "epoch": 0.3361102947753089, + "grad_norm": 0.908594012260437, + "learning_rate": 9.31968622367395e-05, + "loss": 0.8676, + "step": 52610 + }, + { + "epoch": 0.3361741819250476, + "grad_norm": 0.9813511371612549, + "learning_rate": 9.319433511566406e-05, + "loss": 1.0841, + "step": 52620 + }, + { + "epoch": 0.3362380690747863, + "grad_norm": 0.6576645374298096, + "learning_rate": 9.31918075595843e-05, + "loss": 0.968, + "step": 52630 + }, + { + "epoch": 0.336301956224525, + "grad_norm": 0.9117244482040405, + "learning_rate": 9.318927956852566e-05, + "loss": 0.9304, + "step": 52640 + }, + { + "epoch": 0.3363658433742637, + "grad_norm": 0.8400249481201172, + "learning_rate": 9.318675114251361e-05, + "loss": 1.0644, + "step": 52650 + }, + { + "epoch": 0.3364297305240024, + "grad_norm": 0.950006365776062, + "learning_rate": 9.31842222815736e-05, + "loss": 0.8254, + "step": 52660 + }, + { + "epoch": 0.3364936176737411, + "grad_norm": 0.9387775659561157, + "learning_rate": 9.318169298573112e-05, + "loss": 0.9178, + "step": 52670 + }, + { + "epoch": 0.3365575048234798, + "grad_norm": 0.5320703387260437, + "learning_rate": 9.317916325501165e-05, + "loss": 0.5815, + "step": 52680 + }, + { + "epoch": 0.3366213919732185, + "grad_norm": 1.047491192817688, + "learning_rate": 9.317663308944064e-05, + "loss": 1.2488, + "step": 52690 + }, + { + "epoch": 0.3366852791229572, + "grad_norm": 2.0185956954956055, + "learning_rate": 9.317410248904358e-05, + "loss": 0.8944, + "step": 52700 + }, + { + "epoch": 0.3367491662726959, + "grad_norm": 0.8261764049530029, + "learning_rate": 9.317157145384596e-05, + "loss": 0.818, + "step": 52710 + }, + { + "epoch": 0.3368130534224346, + "grad_norm": 0.9799476265907288, + "learning_rate": 9.316903998387326e-05, + "loss": 0.7601, + "step": 52720 + }, + { + "epoch": 0.3368769405721733, + "grad_norm": 0.8541726469993591, + "learning_rate": 9.3166508079151e-05, + "loss": 0.9305, + "step": 52730 + }, + { + "epoch": 0.33694082772191203, + "grad_norm": 0.7811595797538757, + "learning_rate": 9.316397573970464e-05, + "loss": 0.82, + "step": 52740 + }, + { + "epoch": 0.33700471487165073, + "grad_norm": 0.7470584511756897, + "learning_rate": 9.316144296555971e-05, + "loss": 0.8631, + "step": 52750 + }, + { + "epoch": 0.33706860202138944, + "grad_norm": 0.8616728782653809, + "learning_rate": 9.315890975674169e-05, + "loss": 0.8319, + "step": 52760 + }, + { + "epoch": 0.33713248917112815, + "grad_norm": 0.6505323052406311, + "learning_rate": 9.315637611327614e-05, + "loss": 0.9409, + "step": 52770 + }, + { + "epoch": 0.3371963763208668, + "grad_norm": 1.1408954858779907, + "learning_rate": 9.315384203518853e-05, + "loss": 0.8325, + "step": 52780 + }, + { + "epoch": 0.3372602634706055, + "grad_norm": 0.8268606066703796, + "learning_rate": 
9.31513075225044e-05, + "loss": 0.947, + "step": 52790 + }, + { + "epoch": 0.3373241506203442, + "grad_norm": 1.4688328504562378, + "learning_rate": 9.314877257524928e-05, + "loss": 0.9942, + "step": 52800 + }, + { + "epoch": 0.3373880377700829, + "grad_norm": 0.8979589343070984, + "learning_rate": 9.314623719344869e-05, + "loss": 0.8571, + "step": 52810 + }, + { + "epoch": 0.3374519249198216, + "grad_norm": 0.6567512154579163, + "learning_rate": 9.314370137712816e-05, + "loss": 0.6655, + "step": 52820 + }, + { + "epoch": 0.33751581206956033, + "grad_norm": 0.8439179062843323, + "learning_rate": 9.314116512631324e-05, + "loss": 0.8662, + "step": 52830 + }, + { + "epoch": 0.33757969921929903, + "grad_norm": 0.7378790378570557, + "learning_rate": 9.313862844102946e-05, + "loss": 0.8929, + "step": 52840 + }, + { + "epoch": 0.33764358636903774, + "grad_norm": 0.5747960209846497, + "learning_rate": 9.313609132130235e-05, + "loss": 0.8469, + "step": 52850 + }, + { + "epoch": 0.33770747351877645, + "grad_norm": 2.188962459564209, + "learning_rate": 9.313355376715751e-05, + "loss": 0.7715, + "step": 52860 + }, + { + "epoch": 0.33777136066851515, + "grad_norm": 1.6319129467010498, + "learning_rate": 9.313101577862046e-05, + "loss": 0.851, + "step": 52870 + }, + { + "epoch": 0.33783524781825386, + "grad_norm": 0.8833417892456055, + "learning_rate": 9.312847735571676e-05, + "loss": 0.9841, + "step": 52880 + }, + { + "epoch": 0.33789913496799256, + "grad_norm": 0.771787703037262, + "learning_rate": 9.312593849847198e-05, + "loss": 0.6755, + "step": 52890 + }, + { + "epoch": 0.3379630221177312, + "grad_norm": 1.289760947227478, + "learning_rate": 9.31233992069117e-05, + "loss": 1.1848, + "step": 52900 + }, + { + "epoch": 0.3380269092674699, + "grad_norm": 0.8547393083572388, + "learning_rate": 9.312085948106148e-05, + "loss": 1.0867, + "step": 52910 + }, + { + "epoch": 0.3380907964172086, + "grad_norm": 1.357723593711853, + "learning_rate": 9.311831932094691e-05, + "loss": 0.821, + "step": 52920 + }, + { + "epoch": 0.33815468356694733, + "grad_norm": 0.9254101514816284, + "learning_rate": 9.311577872659355e-05, + "loss": 1.188, + "step": 52930 + }, + { + "epoch": 0.33821857071668604, + "grad_norm": 0.9655906558036804, + "learning_rate": 9.311323769802701e-05, + "loss": 1.1519, + "step": 52940 + }, + { + "epoch": 0.33828245786642475, + "grad_norm": 0.9837827682495117, + "learning_rate": 9.311069623527285e-05, + "loss": 0.9612, + "step": 52950 + }, + { + "epoch": 0.33834634501616345, + "grad_norm": 0.7545758485794067, + "learning_rate": 9.310840854758487e-05, + "loss": 1.1672, + "step": 52960 + }, + { + "epoch": 0.33841023216590216, + "grad_norm": 0.9971650838851929, + "learning_rate": 9.31058662599448e-05, + "loss": 0.9516, + "step": 52970 + }, + { + "epoch": 0.33847411931564086, + "grad_norm": 0.8151521682739258, + "learning_rate": 9.310332353819136e-05, + "loss": 0.6755, + "step": 52980 + }, + { + "epoch": 0.33853800646537957, + "grad_norm": 1.5848335027694702, + "learning_rate": 9.310078038235014e-05, + "loss": 0.8026, + "step": 52990 + }, + { + "epoch": 0.3386018936151183, + "grad_norm": 1.3594563007354736, + "learning_rate": 9.30982367924468e-05, + "loss": 0.6692, + "step": 53000 + }, + { + "epoch": 0.338665780764857, + "grad_norm": 1.4335222244262695, + "learning_rate": 9.309569276850692e-05, + "loss": 0.8874, + "step": 53010 + }, + { + "epoch": 0.33872966791459563, + "grad_norm": 1.4923986196517944, + "learning_rate": 9.309314831055615e-05, + "loss": 1.0218, + "step": 53020 + }, + { + "epoch": 
0.33879355506433434, + "grad_norm": 0.6935365796089172, + "learning_rate": 9.309060341862008e-05, + "loss": 0.8023, + "step": 53030 + }, + { + "epoch": 0.33885744221407305, + "grad_norm": 1.7542939186096191, + "learning_rate": 9.308805809272434e-05, + "loss": 0.7334, + "step": 53040 + }, + { + "epoch": 0.33892132936381175, + "grad_norm": 2.078371286392212, + "learning_rate": 9.30855123328946e-05, + "loss": 0.9812, + "step": 53050 + }, + { + "epoch": 0.33898521651355046, + "grad_norm": 0.6690249443054199, + "learning_rate": 9.308296613915647e-05, + "loss": 0.9794, + "step": 53060 + }, + { + "epoch": 0.33904910366328916, + "grad_norm": 0.8142697215080261, + "learning_rate": 9.30804195115356e-05, + "loss": 0.9776, + "step": 53070 + }, + { + "epoch": 0.33911299081302787, + "grad_norm": 0.7654648423194885, + "learning_rate": 9.307787245005764e-05, + "loss": 0.842, + "step": 53080 + }, + { + "epoch": 0.3391768779627666, + "grad_norm": 0.5504037141799927, + "learning_rate": 9.307532495474822e-05, + "loss": 0.7776, + "step": 53090 + }, + { + "epoch": 0.3392407651125053, + "grad_norm": 1.0997267961502075, + "learning_rate": 9.307277702563302e-05, + "loss": 0.685, + "step": 53100 + }, + { + "epoch": 0.339304652262244, + "grad_norm": 0.9791783690452576, + "learning_rate": 9.307022866273771e-05, + "loss": 1.1581, + "step": 53110 + }, + { + "epoch": 0.3393685394119827, + "grad_norm": 0.6219057440757751, + "learning_rate": 9.306767986608791e-05, + "loss": 0.9069, + "step": 53120 + }, + { + "epoch": 0.3394324265617214, + "grad_norm": 0.5955463647842407, + "learning_rate": 9.306513063570933e-05, + "loss": 1.112, + "step": 53130 + }, + { + "epoch": 0.33949631371146005, + "grad_norm": 0.7455695867538452, + "learning_rate": 9.306258097162763e-05, + "loss": 0.7857, + "step": 53140 + }, + { + "epoch": 0.33956020086119876, + "grad_norm": 0.9764438271522522, + "learning_rate": 9.306003087386848e-05, + "loss": 0.9552, + "step": 53150 + }, + { + "epoch": 0.33962408801093746, + "grad_norm": 0.6675849556922913, + "learning_rate": 9.305748034245756e-05, + "loss": 0.7883, + "step": 53160 + }, + { + "epoch": 0.33968797516067617, + "grad_norm": 0.9111708998680115, + "learning_rate": 9.305492937742057e-05, + "loss": 0.8918, + "step": 53170 + }, + { + "epoch": 0.3397518623104149, + "grad_norm": 2.7284460067749023, + "learning_rate": 9.30523779787832e-05, + "loss": 0.8114, + "step": 53180 + }, + { + "epoch": 0.3398157494601536, + "grad_norm": 0.586710512638092, + "learning_rate": 9.304982614657114e-05, + "loss": 0.856, + "step": 53190 + }, + { + "epoch": 0.3398796366098923, + "grad_norm": 0.644350528717041, + "learning_rate": 9.304727388081007e-05, + "loss": 0.8175, + "step": 53200 + }, + { + "epoch": 0.339943523759631, + "grad_norm": 0.6203905940055847, + "learning_rate": 9.304472118152572e-05, + "loss": 1.0128, + "step": 53210 + }, + { + "epoch": 0.3400074109093697, + "grad_norm": 0.840505063533783, + "learning_rate": 9.304216804874379e-05, + "loss": 0.8672, + "step": 53220 + }, + { + "epoch": 0.3400712980591084, + "grad_norm": 0.750717282295227, + "learning_rate": 9.303961448248998e-05, + "loss": 0.8607, + "step": 53230 + }, + { + "epoch": 0.3401351852088471, + "grad_norm": 0.7886949181556702, + "learning_rate": 9.303706048279004e-05, + "loss": 1.2132, + "step": 53240 + }, + { + "epoch": 0.3401990723585858, + "grad_norm": 0.9253231883049011, + "learning_rate": 9.303450604966966e-05, + "loss": 1.0289, + "step": 53250 + }, + { + "epoch": 0.34026295950832447, + "grad_norm": 1.0587670803070068, + "learning_rate": 
9.303195118315455e-05, + "loss": 0.9249, + "step": 53260 + }, + { + "epoch": 0.3403268466580632, + "grad_norm": 1.1579573154449463, + "learning_rate": 9.302939588327048e-05, + "loss": 0.8702, + "step": 53270 + }, + { + "epoch": 0.3403907338078019, + "grad_norm": 1.4637956619262695, + "learning_rate": 9.302684015004318e-05, + "loss": 0.8417, + "step": 53280 + }, + { + "epoch": 0.3404546209575406, + "grad_norm": 2.425816774368286, + "learning_rate": 9.302428398349836e-05, + "loss": 0.8657, + "step": 53290 + }, + { + "epoch": 0.3405185081072793, + "grad_norm": 0.530267596244812, + "learning_rate": 9.30217273836618e-05, + "loss": 0.9432, + "step": 53300 + }, + { + "epoch": 0.340582395257018, + "grad_norm": 1.081075668334961, + "learning_rate": 9.30191703505592e-05, + "loss": 1.262, + "step": 53310 + }, + { + "epoch": 0.3406462824067567, + "grad_norm": 0.7147884964942932, + "learning_rate": 9.301661288421636e-05, + "loss": 0.8376, + "step": 53320 + }, + { + "epoch": 0.3407101695564954, + "grad_norm": 0.8092734217643738, + "learning_rate": 9.301405498465901e-05, + "loss": 0.8306, + "step": 53330 + }, + { + "epoch": 0.3407740567062341, + "grad_norm": 1.257656216621399, + "learning_rate": 9.30114966519129e-05, + "loss": 0.6576, + "step": 53340 + }, + { + "epoch": 0.3408379438559728, + "grad_norm": 0.7588216066360474, + "learning_rate": 9.30089378860038e-05, + "loss": 0.8001, + "step": 53350 + }, + { + "epoch": 0.34090183100571153, + "grad_norm": 2.2834153175354004, + "learning_rate": 9.300637868695752e-05, + "loss": 0.7371, + "step": 53360 + }, + { + "epoch": 0.34096571815545024, + "grad_norm": 1.2148463726043701, + "learning_rate": 9.300381905479978e-05, + "loss": 0.7611, + "step": 53370 + }, + { + "epoch": 0.3410296053051889, + "grad_norm": 0.7011250853538513, + "learning_rate": 9.300125898955639e-05, + "loss": 0.7491, + "step": 53380 + }, + { + "epoch": 0.3410934924549276, + "grad_norm": 0.9669275879859924, + "learning_rate": 9.299869849125311e-05, + "loss": 0.9306, + "step": 53390 + }, + { + "epoch": 0.3411573796046663, + "grad_norm": 0.8897387981414795, + "learning_rate": 9.299613755991573e-05, + "loss": 1.0307, + "step": 53400 + }, + { + "epoch": 0.341221266754405, + "grad_norm": 0.9630199670791626, + "learning_rate": 9.299357619557005e-05, + "loss": 1.1292, + "step": 53410 + }, + { + "epoch": 0.3412851539041437, + "grad_norm": 0.8969447016716003, + "learning_rate": 9.299101439824188e-05, + "loss": 0.9029, + "step": 53420 + }, + { + "epoch": 0.3413490410538824, + "grad_norm": 1.110783338546753, + "learning_rate": 9.298845216795699e-05, + "loss": 0.9651, + "step": 53430 + }, + { + "epoch": 0.3414129282036211, + "grad_norm": 0.8235384225845337, + "learning_rate": 9.29858895047412e-05, + "loss": 0.5702, + "step": 53440 + }, + { + "epoch": 0.34147681535335983, + "grad_norm": 1.1357210874557495, + "learning_rate": 9.298332640862032e-05, + "loss": 0.8345, + "step": 53450 + }, + { + "epoch": 0.34154070250309854, + "grad_norm": 0.7951391935348511, + "learning_rate": 9.298076287962016e-05, + "loss": 0.7113, + "step": 53460 + }, + { + "epoch": 0.34160458965283724, + "grad_norm": 0.9098735451698303, + "learning_rate": 9.297819891776651e-05, + "loss": 0.9365, + "step": 53470 + }, + { + "epoch": 0.34166847680257595, + "grad_norm": 0.6273751854896545, + "learning_rate": 9.297563452308525e-05, + "loss": 0.7352, + "step": 53480 + }, + { + "epoch": 0.34173236395231466, + "grad_norm": 0.4580266773700714, + "learning_rate": 9.297306969560213e-05, + "loss": 0.9588, + "step": 53490 + }, + { + "epoch": 
0.3417962511020533, + "grad_norm": 1.1689975261688232, + "learning_rate": 9.297050443534305e-05, + "loss": 0.7314, + "step": 53500 + }, + { + "epoch": 0.341860138251792, + "grad_norm": 0.8858540058135986, + "learning_rate": 9.29679387423338e-05, + "loss": 0.8758, + "step": 53510 + }, + { + "epoch": 0.3419240254015307, + "grad_norm": 0.7352036833763123, + "learning_rate": 9.296537261660026e-05, + "loss": 0.9193, + "step": 53520 + }, + { + "epoch": 0.3419879125512694, + "grad_norm": 1.1787981986999512, + "learning_rate": 9.296280605816823e-05, + "loss": 0.853, + "step": 53530 + }, + { + "epoch": 0.34205179970100813, + "grad_norm": 0.8490791320800781, + "learning_rate": 9.296023906706357e-05, + "loss": 1.2468, + "step": 53540 + }, + { + "epoch": 0.34211568685074684, + "grad_norm": 1.1873284578323364, + "learning_rate": 9.295767164331215e-05, + "loss": 1.1106, + "step": 53550 + }, + { + "epoch": 0.34217957400048554, + "grad_norm": 1.3740506172180176, + "learning_rate": 9.29551037869398e-05, + "loss": 0.8165, + "step": 53560 + }, + { + "epoch": 0.34224346115022425, + "grad_norm": 1.074511170387268, + "learning_rate": 9.295253549797241e-05, + "loss": 0.8433, + "step": 53570 + }, + { + "epoch": 0.34230734829996295, + "grad_norm": 0.9406700134277344, + "learning_rate": 9.294996677643581e-05, + "loss": 0.844, + "step": 53580 + }, + { + "epoch": 0.34237123544970166, + "grad_norm": 0.9452252984046936, + "learning_rate": 9.294739762235589e-05, + "loss": 0.7768, + "step": 53590 + }, + { + "epoch": 0.34243512259944037, + "grad_norm": 0.8128929734230042, + "learning_rate": 9.294482803575853e-05, + "loss": 0.622, + "step": 53600 + }, + { + "epoch": 0.3424990097491791, + "grad_norm": 0.825412392616272, + "learning_rate": 9.294225801666959e-05, + "loss": 1.0291, + "step": 53610 + }, + { + "epoch": 0.3425628968989178, + "grad_norm": 1.06623113155365, + "learning_rate": 9.293968756511496e-05, + "loss": 1.0841, + "step": 53620 + }, + { + "epoch": 0.34262678404865643, + "grad_norm": 0.981828510761261, + "learning_rate": 9.293711668112054e-05, + "loss": 0.8458, + "step": 53630 + }, + { + "epoch": 0.34269067119839514, + "grad_norm": 1.0561970472335815, + "learning_rate": 9.29345453647122e-05, + "loss": 0.7624, + "step": 53640 + }, + { + "epoch": 0.34275455834813384, + "grad_norm": 0.7628150582313538, + "learning_rate": 9.293197361591586e-05, + "loss": 0.8328, + "step": 53650 + }, + { + "epoch": 0.34281844549787255, + "grad_norm": 0.9464593529701233, + "learning_rate": 9.292940143475737e-05, + "loss": 0.9501, + "step": 53660 + }, + { + "epoch": 0.34288233264761125, + "grad_norm": 2.0435502529144287, + "learning_rate": 9.292682882126272e-05, + "loss": 0.864, + "step": 53670 + }, + { + "epoch": 0.34294621979734996, + "grad_norm": 1.0263941287994385, + "learning_rate": 9.292425577545772e-05, + "loss": 0.8141, + "step": 53680 + }, + { + "epoch": 0.34301010694708867, + "grad_norm": 0.7042751908302307, + "learning_rate": 9.292168229736836e-05, + "loss": 0.7909, + "step": 53690 + }, + { + "epoch": 0.3430739940968274, + "grad_norm": 1.1945339441299438, + "learning_rate": 9.29191083870205e-05, + "loss": 0.9799, + "step": 53700 + }, + { + "epoch": 0.3431378812465661, + "grad_norm": 0.965678870677948, + "learning_rate": 9.29165340444401e-05, + "loss": 0.8093, + "step": 53710 + }, + { + "epoch": 0.3432017683963048, + "grad_norm": 2.425915241241455, + "learning_rate": 9.291395926965307e-05, + "loss": 1.0032, + "step": 53720 + }, + { + "epoch": 0.3432656555460435, + "grad_norm": 0.5332554578781128, + "learning_rate": 
9.291138406268536e-05, + "loss": 0.7977, + "step": 53730 + }, + { + "epoch": 0.3433295426957822, + "grad_norm": 2.8045296669006348, + "learning_rate": 9.290880842356287e-05, + "loss": 0.9274, + "step": 53740 + }, + { + "epoch": 0.34339342984552085, + "grad_norm": 0.7845577597618103, + "learning_rate": 9.290623235231157e-05, + "loss": 1.0535, + "step": 53750 + }, + { + "epoch": 0.34345731699525955, + "grad_norm": 0.9177809953689575, + "learning_rate": 9.290365584895739e-05, + "loss": 0.9278, + "step": 53760 + }, + { + "epoch": 0.34352120414499826, + "grad_norm": 0.9220765829086304, + "learning_rate": 9.290107891352628e-05, + "loss": 0.945, + "step": 53770 + }, + { + "epoch": 0.34358509129473697, + "grad_norm": 0.8571166396141052, + "learning_rate": 9.289850154604417e-05, + "loss": 0.903, + "step": 53780 + }, + { + "epoch": 0.3436489784444757, + "grad_norm": 0.8738123178482056, + "learning_rate": 9.289592374653708e-05, + "loss": 0.9928, + "step": 53790 + }, + { + "epoch": 0.3437128655942144, + "grad_norm": 0.7225977778434753, + "learning_rate": 9.28933455150309e-05, + "loss": 0.8828, + "step": 53800 + }, + { + "epoch": 0.3437767527439531, + "grad_norm": 1.3303672075271606, + "learning_rate": 9.289076685155162e-05, + "loss": 1.0604, + "step": 53810 + }, + { + "epoch": 0.3438406398936918, + "grad_norm": 0.8628764152526855, + "learning_rate": 9.28881877561252e-05, + "loss": 1.1439, + "step": 53820 + }, + { + "epoch": 0.3439045270434305, + "grad_norm": 0.6281081438064575, + "learning_rate": 9.288560822877765e-05, + "loss": 0.9286, + "step": 53830 + }, + { + "epoch": 0.3439684141931692, + "grad_norm": 0.6044685244560242, + "learning_rate": 9.288302826953492e-05, + "loss": 1.1626, + "step": 53840 + }, + { + "epoch": 0.3440323013429079, + "grad_norm": 0.847324788570404, + "learning_rate": 9.288044787842298e-05, + "loss": 0.7661, + "step": 53850 + }, + { + "epoch": 0.3440961884926466, + "grad_norm": 0.9134111404418945, + "learning_rate": 9.287786705546785e-05, + "loss": 0.7944, + "step": 53860 + }, + { + "epoch": 0.34416007564238527, + "grad_norm": 1.3941556215286255, + "learning_rate": 9.287528580069551e-05, + "loss": 1.2369, + "step": 53870 + }, + { + "epoch": 0.34422396279212397, + "grad_norm": 0.8589109182357788, + "learning_rate": 9.287270411413194e-05, + "loss": 0.6585, + "step": 53880 + }, + { + "epoch": 0.3442878499418627, + "grad_norm": 0.49347206950187683, + "learning_rate": 9.287012199580315e-05, + "loss": 0.8574, + "step": 53890 + }, + { + "epoch": 0.3443517370916014, + "grad_norm": 0.9737316370010376, + "learning_rate": 9.286753944573514e-05, + "loss": 0.6949, + "step": 53900 + }, + { + "epoch": 0.3444156242413401, + "grad_norm": 1.0737287998199463, + "learning_rate": 9.286495646395392e-05, + "loss": 0.9367, + "step": 53910 + }, + { + "epoch": 0.3444795113910788, + "grad_norm": 0.9133766293525696, + "learning_rate": 9.28623730504855e-05, + "loss": 0.7619, + "step": 53920 + }, + { + "epoch": 0.3445433985408175, + "grad_norm": 0.784355640411377, + "learning_rate": 9.285978920535592e-05, + "loss": 1.0223, + "step": 53930 + }, + { + "epoch": 0.3446072856905562, + "grad_norm": 0.7285311818122864, + "learning_rate": 9.285720492859118e-05, + "loss": 0.9259, + "step": 53940 + }, + { + "epoch": 0.3446711728402949, + "grad_norm": 0.8762960433959961, + "learning_rate": 9.28546202202173e-05, + "loss": 0.8949, + "step": 53950 + }, + { + "epoch": 0.3447350599900336, + "grad_norm": 0.8869500756263733, + "learning_rate": 9.285203508026032e-05, + "loss": 0.8522, + "step": 53960 + }, + { + "epoch": 
0.3447989471397723, + "grad_norm": 0.7807958722114563, + "learning_rate": 9.284944950874628e-05, + "loss": 1.0629, + "step": 53970 + }, + { + "epoch": 0.34486283428951103, + "grad_norm": 2.027085542678833, + "learning_rate": 9.284686350570121e-05, + "loss": 0.9566, + "step": 53980 + }, + { + "epoch": 0.3449267214392497, + "grad_norm": 1.2319154739379883, + "learning_rate": 9.284427707115116e-05, + "loss": 0.718, + "step": 53990 + }, + { + "epoch": 0.3449906085889884, + "grad_norm": 0.5686825513839722, + "learning_rate": 9.284169020512217e-05, + "loss": 0.7659, + "step": 54000 + }, + { + "epoch": 0.3450544957387271, + "grad_norm": 0.9346210956573486, + "learning_rate": 9.283910290764029e-05, + "loss": 0.8805, + "step": 54010 + }, + { + "epoch": 0.3451183828884658, + "grad_norm": 1.0254408121109009, + "learning_rate": 9.28365151787316e-05, + "loss": 1.0447, + "step": 54020 + }, + { + "epoch": 0.3451822700382045, + "grad_norm": 1.1026064157485962, + "learning_rate": 9.283392701842213e-05, + "loss": 1.11, + "step": 54030 + }, + { + "epoch": 0.3452461571879432, + "grad_norm": 1.1791328191757202, + "learning_rate": 9.283133842673797e-05, + "loss": 0.9846, + "step": 54040 + }, + { + "epoch": 0.3453100443376819, + "grad_norm": 0.6459341049194336, + "learning_rate": 9.282874940370517e-05, + "loss": 0.8446, + "step": 54050 + }, + { + "epoch": 0.3453739314874206, + "grad_norm": 0.7654846906661987, + "learning_rate": 9.282615994934982e-05, + "loss": 1.1735, + "step": 54060 + }, + { + "epoch": 0.34543781863715933, + "grad_norm": 1.2747883796691895, + "learning_rate": 9.282357006369798e-05, + "loss": 0.9468, + "step": 54070 + }, + { + "epoch": 0.34550170578689804, + "grad_norm": 0.5862970352172852, + "learning_rate": 9.282097974677574e-05, + "loss": 0.5708, + "step": 54080 + }, + { + "epoch": 0.34556559293663675, + "grad_norm": 1.1748859882354736, + "learning_rate": 9.28183889986092e-05, + "loss": 0.9329, + "step": 54090 + }, + { + "epoch": 0.34562948008637545, + "grad_norm": 0.7171411514282227, + "learning_rate": 9.281579781922442e-05, + "loss": 1.0105, + "step": 54100 + }, + { + "epoch": 0.3456933672361141, + "grad_norm": 0.8102126717567444, + "learning_rate": 9.281320620864754e-05, + "loss": 0.8918, + "step": 54110 + }, + { + "epoch": 0.3457572543858528, + "grad_norm": 1.1540294885635376, + "learning_rate": 9.281061416690462e-05, + "loss": 0.794, + "step": 54120 + }, + { + "epoch": 0.3458211415355915, + "grad_norm": 0.4848040044307709, + "learning_rate": 9.280802169402178e-05, + "loss": 0.7435, + "step": 54130 + }, + { + "epoch": 0.3458850286853302, + "grad_norm": 1.207207202911377, + "learning_rate": 9.280542879002512e-05, + "loss": 0.9234, + "step": 54140 + }, + { + "epoch": 0.3459489158350689, + "grad_norm": 0.7210013270378113, + "learning_rate": 9.280283545494077e-05, + "loss": 0.942, + "step": 54150 + }, + { + "epoch": 0.34601280298480763, + "grad_norm": 2.0840461254119873, + "learning_rate": 9.280024168879482e-05, + "loss": 0.828, + "step": 54160 + }, + { + "epoch": 0.34607669013454634, + "grad_norm": 0.8813756704330444, + "learning_rate": 9.279764749161344e-05, + "loss": 0.8051, + "step": 54170 + }, + { + "epoch": 0.34614057728428504, + "grad_norm": 0.606823205947876, + "learning_rate": 9.27950528634227e-05, + "loss": 1.0424, + "step": 54180 + }, + { + "epoch": 0.34620446443402375, + "grad_norm": 0.9201170206069946, + "learning_rate": 9.279245780424876e-05, + "loss": 0.772, + "step": 54190 + }, + { + "epoch": 0.34626835158376246, + "grad_norm": 0.9958915710449219, + "learning_rate": 
9.278986231411776e-05, + "loss": 0.6918, + "step": 54200 + }, + { + "epoch": 0.34633223873350116, + "grad_norm": 0.7050455212593079, + "learning_rate": 9.278726639305581e-05, + "loss": 0.7851, + "step": 54210 + }, + { + "epoch": 0.34639612588323987, + "grad_norm": 0.9351766109466553, + "learning_rate": 9.27846700410891e-05, + "loss": 0.796, + "step": 54220 + }, + { + "epoch": 0.3464600130329785, + "grad_norm": 2.2169976234436035, + "learning_rate": 9.278207325824373e-05, + "loss": 0.9384, + "step": 54230 + }, + { + "epoch": 0.3465239001827172, + "grad_norm": 1.0841212272644043, + "learning_rate": 9.277947604454587e-05, + "loss": 0.996, + "step": 54240 + }, + { + "epoch": 0.34658778733245593, + "grad_norm": 1.3543506860733032, + "learning_rate": 9.277687840002167e-05, + "loss": 0.7683, + "step": 54250 + }, + { + "epoch": 0.34665167448219464, + "grad_norm": 0.7349464297294617, + "learning_rate": 9.277428032469731e-05, + "loss": 0.9666, + "step": 54260 + }, + { + "epoch": 0.34671556163193334, + "grad_norm": 1.0820789337158203, + "learning_rate": 9.277168181859893e-05, + "loss": 0.6931, + "step": 54270 + }, + { + "epoch": 0.34677944878167205, + "grad_norm": 0.6938410997390747, + "learning_rate": 9.276908288175272e-05, + "loss": 0.8715, + "step": 54280 + }, + { + "epoch": 0.34684333593141076, + "grad_norm": 0.7121148109436035, + "learning_rate": 9.276648351418484e-05, + "loss": 0.8973, + "step": 54290 + }, + { + "epoch": 0.34690722308114946, + "grad_norm": 0.8023224472999573, + "learning_rate": 9.276388371592149e-05, + "loss": 0.8985, + "step": 54300 + }, + { + "epoch": 0.34697111023088817, + "grad_norm": 1.1684279441833496, + "learning_rate": 9.276128348698881e-05, + "loss": 0.8147, + "step": 54310 + }, + { + "epoch": 0.3470349973806269, + "grad_norm": 1.3102762699127197, + "learning_rate": 9.275868282741303e-05, + "loss": 0.8545, + "step": 54320 + }, + { + "epoch": 0.3470988845303656, + "grad_norm": 0.8226547837257385, + "learning_rate": 9.27560817372203e-05, + "loss": 0.6908, + "step": 54330 + }, + { + "epoch": 0.3471627716801043, + "grad_norm": 1.1584205627441406, + "learning_rate": 9.275348021643686e-05, + "loss": 0.8704, + "step": 54340 + }, + { + "epoch": 0.34722665882984294, + "grad_norm": 0.852271556854248, + "learning_rate": 9.275087826508887e-05, + "loss": 0.8696, + "step": 54350 + }, + { + "epoch": 0.34729054597958164, + "grad_norm": 2.0320937633514404, + "learning_rate": 9.274827588320257e-05, + "loss": 0.6919, + "step": 54360 + }, + { + "epoch": 0.34735443312932035, + "grad_norm": 0.7250359058380127, + "learning_rate": 9.274567307080412e-05, + "loss": 0.8589, + "step": 54370 + }, + { + "epoch": 0.34741832027905906, + "grad_norm": 1.2491799592971802, + "learning_rate": 9.27430698279198e-05, + "loss": 0.8009, + "step": 54380 + }, + { + "epoch": 0.34748220742879776, + "grad_norm": 0.9660385251045227, + "learning_rate": 9.274046615457577e-05, + "loss": 1.0643, + "step": 54390 + }, + { + "epoch": 0.34754609457853647, + "grad_norm": 0.9620506167411804, + "learning_rate": 9.273786205079826e-05, + "loss": 0.8099, + "step": 54400 + }, + { + "epoch": 0.3476099817282752, + "grad_norm": 0.6800320744514465, + "learning_rate": 9.273525751661353e-05, + "loss": 1.039, + "step": 54410 + }, + { + "epoch": 0.3476738688780139, + "grad_norm": 0.6459980010986328, + "learning_rate": 9.273265255204778e-05, + "loss": 0.94, + "step": 54420 + }, + { + "epoch": 0.3477377560277526, + "grad_norm": 0.5387960076332092, + "learning_rate": 9.273004715712723e-05, + "loss": 0.9139, + "step": 54430 + }, + { + 
"epoch": 0.3478016431774913, + "grad_norm": 0.9442420601844788, + "learning_rate": 9.272744133187816e-05, + "loss": 0.813, + "step": 54440 + }, + { + "epoch": 0.34786553032723, + "grad_norm": 0.6634787321090698, + "learning_rate": 9.272483507632676e-05, + "loss": 0.6832, + "step": 54450 + }, + { + "epoch": 0.3479294174769687, + "grad_norm": 0.7288976311683655, + "learning_rate": 9.272222839049933e-05, + "loss": 0.873, + "step": 54460 + }, + { + "epoch": 0.3479933046267074, + "grad_norm": 1.1111667156219482, + "learning_rate": 9.27196212744221e-05, + "loss": 1.1114, + "step": 54470 + }, + { + "epoch": 0.34805719177644606, + "grad_norm": 1.427985668182373, + "learning_rate": 9.271701372812134e-05, + "loss": 0.8317, + "step": 54480 + }, + { + "epoch": 0.34812107892618477, + "grad_norm": 0.5816881656646729, + "learning_rate": 9.271440575162328e-05, + "loss": 0.9996, + "step": 54490 + }, + { + "epoch": 0.3481849660759235, + "grad_norm": 1.1159511804580688, + "learning_rate": 9.27117973449542e-05, + "loss": 0.8265, + "step": 54500 + }, + { + "epoch": 0.3482488532256622, + "grad_norm": 1.1096454858779907, + "learning_rate": 9.270918850814037e-05, + "loss": 0.9171, + "step": 54510 + }, + { + "epoch": 0.3483127403754009, + "grad_norm": 0.5924681425094604, + "learning_rate": 9.270657924120808e-05, + "loss": 0.9957, + "step": 54520 + }, + { + "epoch": 0.3483766275251396, + "grad_norm": 3.2207529544830322, + "learning_rate": 9.270396954418357e-05, + "loss": 1.1604, + "step": 54530 + }, + { + "epoch": 0.3484405146748783, + "grad_norm": 1.6686917543411255, + "learning_rate": 9.270135941709315e-05, + "loss": 0.7374, + "step": 54540 + }, + { + "epoch": 0.348504401824617, + "grad_norm": 0.8895770907402039, + "learning_rate": 9.26987488599631e-05, + "loss": 0.8122, + "step": 54550 + }, + { + "epoch": 0.3485682889743557, + "grad_norm": 0.9306029081344604, + "learning_rate": 9.26961378728197e-05, + "loss": 0.7176, + "step": 54560 + }, + { + "epoch": 0.3486321761240944, + "grad_norm": 1.2075837850570679, + "learning_rate": 9.269352645568927e-05, + "loss": 0.8263, + "step": 54570 + }, + { + "epoch": 0.3486960632738331, + "grad_norm": 0.7406107187271118, + "learning_rate": 9.269091460859807e-05, + "loss": 0.8766, + "step": 54580 + }, + { + "epoch": 0.34875995042357183, + "grad_norm": 0.717327892780304, + "learning_rate": 9.268830233157245e-05, + "loss": 1.2179, + "step": 54590 + }, + { + "epoch": 0.3488238375733105, + "grad_norm": 0.9631721377372742, + "learning_rate": 9.268568962463868e-05, + "loss": 0.9515, + "step": 54600 + }, + { + "epoch": 0.3488877247230492, + "grad_norm": 0.9041351675987244, + "learning_rate": 9.26830764878231e-05, + "loss": 1.1696, + "step": 54610 + }, + { + "epoch": 0.3489516118727879, + "grad_norm": 0.8273685574531555, + "learning_rate": 9.2680462921152e-05, + "loss": 1.1219, + "step": 54620 + }, + { + "epoch": 0.3490154990225266, + "grad_norm": 0.5111979842185974, + "learning_rate": 9.267784892465172e-05, + "loss": 0.8967, + "step": 54630 + }, + { + "epoch": 0.3490793861722653, + "grad_norm": 0.8033791184425354, + "learning_rate": 9.267523449834858e-05, + "loss": 0.8957, + "step": 54640 + }, + { + "epoch": 0.349143273322004, + "grad_norm": 0.8571832776069641, + "learning_rate": 9.267261964226892e-05, + "loss": 1.0502, + "step": 54650 + }, + { + "epoch": 0.3492071604717427, + "grad_norm": 0.4170287549495697, + "learning_rate": 9.267000435643904e-05, + "loss": 0.8696, + "step": 54660 + }, + { + "epoch": 0.3492710476214814, + "grad_norm": 0.687233567237854, + "learning_rate": 
9.266738864088533e-05, + "loss": 0.787, + "step": 54670 + }, + { + "epoch": 0.34933493477122013, + "grad_norm": 0.8800210356712341, + "learning_rate": 9.266477249563408e-05, + "loss": 0.8221, + "step": 54680 + }, + { + "epoch": 0.34939882192095884, + "grad_norm": 0.8803540468215942, + "learning_rate": 9.266215592071167e-05, + "loss": 1.0652, + "step": 54690 + }, + { + "epoch": 0.34946270907069754, + "grad_norm": 0.5410533547401428, + "learning_rate": 9.265953891614445e-05, + "loss": 0.9378, + "step": 54700 + }, + { + "epoch": 0.34952659622043625, + "grad_norm": 0.5955383777618408, + "learning_rate": 9.265692148195875e-05, + "loss": 0.8833, + "step": 54710 + }, + { + "epoch": 0.3495904833701749, + "grad_norm": 0.633705735206604, + "learning_rate": 9.265430361818096e-05, + "loss": 1.004, + "step": 54720 + }, + { + "epoch": 0.3496543705199136, + "grad_norm": 0.9979560971260071, + "learning_rate": 9.265168532483744e-05, + "loss": 0.8923, + "step": 54730 + }, + { + "epoch": 0.3497182576696523, + "grad_norm": 0.5315431952476501, + "learning_rate": 9.264906660195453e-05, + "loss": 0.7914, + "step": 54740 + }, + { + "epoch": 0.349782144819391, + "grad_norm": 1.4878370761871338, + "learning_rate": 9.264644744955863e-05, + "loss": 1.0878, + "step": 54750 + }, + { + "epoch": 0.3498460319691297, + "grad_norm": 0.9964064359664917, + "learning_rate": 9.264382786767612e-05, + "loss": 1.1167, + "step": 54760 + }, + { + "epoch": 0.34990991911886843, + "grad_norm": 0.8638894557952881, + "learning_rate": 9.264120785633335e-05, + "loss": 0.7903, + "step": 54770 + }, + { + "epoch": 0.34997380626860713, + "grad_norm": 0.7577997446060181, + "learning_rate": 9.263858741555674e-05, + "loss": 0.86, + "step": 54780 + }, + { + "epoch": 0.35003769341834584, + "grad_norm": 0.9834237694740295, + "learning_rate": 9.263596654537265e-05, + "loss": 0.8051, + "step": 54790 + }, + { + "epoch": 0.35010158056808455, + "grad_norm": 0.9026603102684021, + "learning_rate": 9.263334524580751e-05, + "loss": 0.9596, + "step": 54800 + }, + { + "epoch": 0.35016546771782325, + "grad_norm": 1.9557400941848755, + "learning_rate": 9.26307235168877e-05, + "loss": 0.9289, + "step": 54810 + }, + { + "epoch": 0.35022935486756196, + "grad_norm": 0.5680462718009949, + "learning_rate": 9.262810135863962e-05, + "loss": 1.1719, + "step": 54820 + }, + { + "epoch": 0.35029324201730067, + "grad_norm": 1.077825665473938, + "learning_rate": 9.26254787710897e-05, + "loss": 0.8691, + "step": 54830 + }, + { + "epoch": 0.3503571291670393, + "grad_norm": 1.1171085834503174, + "learning_rate": 9.262285575426431e-05, + "loss": 0.7501, + "step": 54840 + }, + { + "epoch": 0.350421016316778, + "grad_norm": 0.8400352001190186, + "learning_rate": 9.262023230818987e-05, + "loss": 0.8568, + "step": 54850 + }, + { + "epoch": 0.35048490346651673, + "grad_norm": 0.8587310910224915, + "learning_rate": 9.261760843289284e-05, + "loss": 1.1126, + "step": 54860 + }, + { + "epoch": 0.35054879061625543, + "grad_norm": 0.979992687702179, + "learning_rate": 9.261498412839963e-05, + "loss": 0.7968, + "step": 54870 + }, + { + "epoch": 0.35061267776599414, + "grad_norm": 1.4268198013305664, + "learning_rate": 9.261235939473665e-05, + "loss": 0.9709, + "step": 54880 + }, + { + "epoch": 0.35067656491573285, + "grad_norm": 0.8531477451324463, + "learning_rate": 9.260973423193036e-05, + "loss": 1.1384, + "step": 54890 + }, + { + "epoch": 0.35074045206547155, + "grad_norm": 0.8192383050918579, + "learning_rate": 9.260710864000718e-05, + "loss": 0.9567, + "step": 54900 + }, + { + "epoch": 
0.35080433921521026, + "grad_norm": 0.6545119881629944, + "learning_rate": 9.260448261899355e-05, + "loss": 0.8276, + "step": 54910 + }, + { + "epoch": 0.35086822636494897, + "grad_norm": 1.2469779253005981, + "learning_rate": 9.260185616891592e-05, + "loss": 0.9101, + "step": 54920 + }, + { + "epoch": 0.35093211351468767, + "grad_norm": 1.0227653980255127, + "learning_rate": 9.259922928980075e-05, + "loss": 1.0185, + "step": 54930 + }, + { + "epoch": 0.3509960006644264, + "grad_norm": 0.8625701665878296, + "learning_rate": 9.259660198167449e-05, + "loss": 1.0336, + "step": 54940 + }, + { + "epoch": 0.3510598878141651, + "grad_norm": 1.399640679359436, + "learning_rate": 9.259397424456359e-05, + "loss": 0.9261, + "step": 54950 + }, + { + "epoch": 0.35112377496390373, + "grad_norm": 1.6561399698257446, + "learning_rate": 9.259134607849451e-05, + "loss": 0.8661, + "step": 54960 + }, + { + "epoch": 0.35118766211364244, + "grad_norm": 0.7466694712638855, + "learning_rate": 9.258871748349375e-05, + "loss": 0.8944, + "step": 54970 + }, + { + "epoch": 0.35125154926338115, + "grad_norm": 2.0133652687072754, + "learning_rate": 9.258608845958774e-05, + "loss": 0.7284, + "step": 54980 + }, + { + "epoch": 0.35131543641311985, + "grad_norm": 0.8402307629585266, + "learning_rate": 9.258345900680299e-05, + "loss": 1.1441, + "step": 54990 + }, + { + "epoch": 0.35137932356285856, + "grad_norm": 0.6770734190940857, + "learning_rate": 9.258082912516597e-05, + "loss": 0.9305, + "step": 55000 + }, + { + "epoch": 0.35144321071259726, + "grad_norm": 1.0294511318206787, + "learning_rate": 9.257819881470315e-05, + "loss": 0.7655, + "step": 55010 + }, + { + "epoch": 0.35150709786233597, + "grad_norm": 0.6236374974250793, + "learning_rate": 9.257556807544106e-05, + "loss": 0.6974, + "step": 55020 + }, + { + "epoch": 0.3515709850120747, + "grad_norm": 0.7847385406494141, + "learning_rate": 9.257293690740614e-05, + "loss": 1.0462, + "step": 55030 + }, + { + "epoch": 0.3516348721618134, + "grad_norm": 0.6366947293281555, + "learning_rate": 9.257030531062492e-05, + "loss": 0.9091, + "step": 55040 + }, + { + "epoch": 0.3516987593115521, + "grad_norm": 0.9689487218856812, + "learning_rate": 9.25676732851239e-05, + "loss": 1.14, + "step": 55050 + }, + { + "epoch": 0.3517626464612908, + "grad_norm": 0.7967630624771118, + "learning_rate": 9.256504083092959e-05, + "loss": 0.7999, + "step": 55060 + }, + { + "epoch": 0.3518265336110295, + "grad_norm": 0.6108505725860596, + "learning_rate": 9.256240794806847e-05, + "loss": 1.1205, + "step": 55070 + }, + { + "epoch": 0.35189042076076815, + "grad_norm": 1.0797632932662964, + "learning_rate": 9.25597746365671e-05, + "loss": 0.8593, + "step": 55080 + }, + { + "epoch": 0.35195430791050686, + "grad_norm": 0.7324128150939941, + "learning_rate": 9.255714089645198e-05, + "loss": 0.9101, + "step": 55090 + }, + { + "epoch": 0.35201819506024556, + "grad_norm": 0.6534935235977173, + "learning_rate": 9.255450672774964e-05, + "loss": 1.2862, + "step": 55100 + }, + { + "epoch": 0.35208208220998427, + "grad_norm": 0.7674654722213745, + "learning_rate": 9.255187213048658e-05, + "loss": 1.0429, + "step": 55110 + }, + { + "epoch": 0.352145969359723, + "grad_norm": 0.8261142373085022, + "learning_rate": 9.254923710468937e-05, + "loss": 0.8614, + "step": 55120 + }, + { + "epoch": 0.3522098565094617, + "grad_norm": 1.2243504524230957, + "learning_rate": 9.254660165038453e-05, + "loss": 0.9836, + "step": 55130 + }, + { + "epoch": 0.3522737436592004, + "grad_norm": 0.9247923493385315, + "learning_rate": 
9.254396576759861e-05, + "loss": 0.9118, + "step": 55140 + }, + { + "epoch": 0.3523376308089391, + "grad_norm": 1.049172043800354, + "learning_rate": 9.254132945635814e-05, + "loss": 1.1066, + "step": 55150 + }, + { + "epoch": 0.3524015179586778, + "grad_norm": 0.9203839302062988, + "learning_rate": 9.253869271668967e-05, + "loss": 1.0225, + "step": 55160 + }, + { + "epoch": 0.3524654051084165, + "grad_norm": 0.4765165448188782, + "learning_rate": 9.253605554861978e-05, + "loss": 0.7226, + "step": 55170 + }, + { + "epoch": 0.3525292922581552, + "grad_norm": 1.0200433731079102, + "learning_rate": 9.2533417952175e-05, + "loss": 0.9903, + "step": 55180 + }, + { + "epoch": 0.3525931794078939, + "grad_norm": 1.3597415685653687, + "learning_rate": 9.253077992738192e-05, + "loss": 0.7764, + "step": 55190 + }, + { + "epoch": 0.35265706655763257, + "grad_norm": 0.7081646919250488, + "learning_rate": 9.252814147426708e-05, + "loss": 0.9052, + "step": 55200 + }, + { + "epoch": 0.3527209537073713, + "grad_norm": 0.5674062967300415, + "learning_rate": 9.252550259285707e-05, + "loss": 0.8937, + "step": 55210 + }, + { + "epoch": 0.35278484085711, + "grad_norm": 0.8797856569290161, + "learning_rate": 9.252286328317846e-05, + "loss": 0.6981, + "step": 55220 + }, + { + "epoch": 0.3528487280068487, + "grad_norm": 0.6591719388961792, + "learning_rate": 9.252022354525783e-05, + "loss": 0.7734, + "step": 55230 + }, + { + "epoch": 0.3529126151565874, + "grad_norm": 0.9455986022949219, + "learning_rate": 9.251758337912174e-05, + "loss": 0.7539, + "step": 55240 + }, + { + "epoch": 0.3529765023063261, + "grad_norm": 0.6497638821601868, + "learning_rate": 9.251494278479682e-05, + "loss": 0.8169, + "step": 55250 + }, + { + "epoch": 0.3530403894560648, + "grad_norm": 0.9514163136482239, + "learning_rate": 9.251230176230965e-05, + "loss": 1.2422, + "step": 55260 + }, + { + "epoch": 0.3531042766058035, + "grad_norm": 1.0354559421539307, + "learning_rate": 9.250966031168682e-05, + "loss": 0.8663, + "step": 55270 + }, + { + "epoch": 0.3531681637555422, + "grad_norm": 0.6657097935676575, + "learning_rate": 9.250701843295492e-05, + "loss": 1.169, + "step": 55280 + }, + { + "epoch": 0.3532320509052809, + "grad_norm": 0.6656765937805176, + "learning_rate": 9.25043761261406e-05, + "loss": 0.722, + "step": 55290 + }, + { + "epoch": 0.35329593805501963, + "grad_norm": 0.7539229989051819, + "learning_rate": 9.250173339127042e-05, + "loss": 0.8882, + "step": 55300 + }, + { + "epoch": 0.35335982520475834, + "grad_norm": 2.1997039318084717, + "learning_rate": 9.249909022837102e-05, + "loss": 0.8417, + "step": 55310 + }, + { + "epoch": 0.35342371235449704, + "grad_norm": 0.5912847518920898, + "learning_rate": 9.249644663746901e-05, + "loss": 0.8431, + "step": 55320 + }, + { + "epoch": 0.3534875995042357, + "grad_norm": 1.1441770792007446, + "learning_rate": 9.249380261859103e-05, + "loss": 0.6843, + "step": 55330 + }, + { + "epoch": 0.3535514866539744, + "grad_norm": 1.2015843391418457, + "learning_rate": 9.249115817176368e-05, + "loss": 0.825, + "step": 55340 + }, + { + "epoch": 0.3536153738037131, + "grad_norm": 0.9341386556625366, + "learning_rate": 9.248851329701362e-05, + "loss": 1.0235, + "step": 55350 + }, + { + "epoch": 0.3536792609534518, + "grad_norm": 0.8819360733032227, + "learning_rate": 9.248586799436747e-05, + "loss": 0.8604, + "step": 55360 + }, + { + "epoch": 0.3537431481031905, + "grad_norm": 0.8615573048591614, + "learning_rate": 9.248322226385187e-05, + "loss": 0.9667, + "step": 55370 + }, + { + "epoch": 
0.3538070352529292, + "grad_norm": 1.185778021812439, + "learning_rate": 9.248057610549348e-05, + "loss": 1.0003, + "step": 55380 + }, + { + "epoch": 0.35387092240266793, + "grad_norm": 0.9160196781158447, + "learning_rate": 9.247792951931893e-05, + "loss": 0.8687, + "step": 55390 + }, + { + "epoch": 0.35393480955240664, + "grad_norm": 0.6795194745063782, + "learning_rate": 9.247528250535487e-05, + "loss": 0.7333, + "step": 55400 + }, + { + "epoch": 0.35399869670214534, + "grad_norm": 0.5489585399627686, + "learning_rate": 9.247263506362798e-05, + "loss": 0.8638, + "step": 55410 + }, + { + "epoch": 0.35406258385188405, + "grad_norm": 1.2006055116653442, + "learning_rate": 9.246998719416491e-05, + "loss": 0.9143, + "step": 55420 + }, + { + "epoch": 0.35412647100162276, + "grad_norm": 1.0024096965789795, + "learning_rate": 9.246733889699233e-05, + "loss": 0.9047, + "step": 55430 + }, + { + "epoch": 0.35419035815136146, + "grad_norm": 0.5763610005378723, + "learning_rate": 9.24646901721369e-05, + "loss": 0.9754, + "step": 55440 + }, + { + "epoch": 0.3542542453011001, + "grad_norm": 1.1366212368011475, + "learning_rate": 9.24620410196253e-05, + "loss": 0.891, + "step": 55450 + }, + { + "epoch": 0.3543181324508388, + "grad_norm": 1.1361256837844849, + "learning_rate": 9.245939143948424e-05, + "loss": 1.0441, + "step": 55460 + }, + { + "epoch": 0.3543820196005775, + "grad_norm": 0.7863855361938477, + "learning_rate": 9.245674143174034e-05, + "loss": 0.7866, + "step": 55470 + }, + { + "epoch": 0.35444590675031623, + "grad_norm": 0.8668807744979858, + "learning_rate": 9.245409099642033e-05, + "loss": 0.9319, + "step": 55480 + }, + { + "epoch": 0.35450979390005494, + "grad_norm": 0.6587684750556946, + "learning_rate": 9.245144013355092e-05, + "loss": 0.8019, + "step": 55490 + }, + { + "epoch": 0.35457368104979364, + "grad_norm": 1.1338073015213013, + "learning_rate": 9.244878884315876e-05, + "loss": 0.8598, + "step": 55500 + }, + { + "epoch": 0.35463756819953235, + "grad_norm": 0.4027159512042999, + "learning_rate": 9.244613712527057e-05, + "loss": 0.7706, + "step": 55510 + }, + { + "epoch": 0.35470145534927106, + "grad_norm": 1.0326690673828125, + "learning_rate": 9.244348497991306e-05, + "loss": 0.9883, + "step": 55520 + }, + { + "epoch": 0.35476534249900976, + "grad_norm": 2.621795415878296, + "learning_rate": 9.244083240711297e-05, + "loss": 1.087, + "step": 55530 + }, + { + "epoch": 0.35482922964874847, + "grad_norm": 0.8886315822601318, + "learning_rate": 9.243817940689694e-05, + "loss": 0.7566, + "step": 55540 + }, + { + "epoch": 0.3548931167984872, + "grad_norm": 0.7971783876419067, + "learning_rate": 9.243552597929174e-05, + "loss": 0.7039, + "step": 55550 + }, + { + "epoch": 0.3549570039482259, + "grad_norm": 0.7734363675117493, + "learning_rate": 9.243287212432409e-05, + "loss": 0.9843, + "step": 55560 + }, + { + "epoch": 0.35502089109796453, + "grad_norm": 0.9685491919517517, + "learning_rate": 9.24302178420207e-05, + "loss": 0.9583, + "step": 55570 + }, + { + "epoch": 0.35508477824770324, + "grad_norm": 1.15921950340271, + "learning_rate": 9.242756313240833e-05, + "loss": 0.7942, + "step": 55580 + }, + { + "epoch": 0.35514866539744194, + "grad_norm": 1.1534897089004517, + "learning_rate": 9.242490799551366e-05, + "loss": 0.8079, + "step": 55590 + }, + { + "epoch": 0.35521255254718065, + "grad_norm": 0.9609005451202393, + "learning_rate": 9.242225243136348e-05, + "loss": 0.9695, + "step": 55600 + }, + { + "epoch": 0.35527643969691935, + "grad_norm": 0.6478775143623352, + 
"learning_rate": 9.241959643998453e-05, + "loss": 0.8381, + "step": 55610 + }, + { + "epoch": 0.35534032684665806, + "grad_norm": 0.9925094246864319, + "learning_rate": 9.241694002140354e-05, + "loss": 0.6593, + "step": 55620 + }, + { + "epoch": 0.35540421399639677, + "grad_norm": 0.9142459630966187, + "learning_rate": 9.241428317564725e-05, + "loss": 0.934, + "step": 55630 + }, + { + "epoch": 0.3554681011461355, + "grad_norm": 0.6951974034309387, + "learning_rate": 9.241162590274244e-05, + "loss": 0.9468, + "step": 55640 + }, + { + "epoch": 0.3555319882958742, + "grad_norm": 0.8623539209365845, + "learning_rate": 9.240896820271588e-05, + "loss": 0.8084, + "step": 55650 + }, + { + "epoch": 0.3555958754456129, + "grad_norm": 0.7138127684593201, + "learning_rate": 9.240631007559432e-05, + "loss": 0.8162, + "step": 55660 + }, + { + "epoch": 0.3556597625953516, + "grad_norm": 0.8145920634269714, + "learning_rate": 9.240365152140451e-05, + "loss": 1.0244, + "step": 55670 + }, + { + "epoch": 0.3557236497450903, + "grad_norm": 0.9237201809883118, + "learning_rate": 9.240099254017327e-05, + "loss": 0.8636, + "step": 55680 + }, + { + "epoch": 0.35578753689482895, + "grad_norm": 0.9301193356513977, + "learning_rate": 9.239833313192734e-05, + "loss": 1.1658, + "step": 55690 + }, + { + "epoch": 0.35585142404456765, + "grad_norm": 0.6827517151832581, + "learning_rate": 9.239567329669352e-05, + "loss": 1.1023, + "step": 55700 + }, + { + "epoch": 0.35591531119430636, + "grad_norm": 1.0909185409545898, + "learning_rate": 9.239301303449859e-05, + "loss": 0.8033, + "step": 55710 + }, + { + "epoch": 0.35597919834404507, + "grad_norm": 0.4835173189640045, + "learning_rate": 9.239035234536934e-05, + "loss": 0.8785, + "step": 55720 + }, + { + "epoch": 0.3560430854937838, + "grad_norm": 0.862131655216217, + "learning_rate": 9.238769122933257e-05, + "loss": 1.0392, + "step": 55730 + }, + { + "epoch": 0.3561069726435225, + "grad_norm": 1.5188207626342773, + "learning_rate": 9.238502968641509e-05, + "loss": 1.1016, + "step": 55740 + }, + { + "epoch": 0.3561708597932612, + "grad_norm": 0.6719252467155457, + "learning_rate": 9.238236771664369e-05, + "loss": 1.0367, + "step": 55750 + }, + { + "epoch": 0.3562347469429999, + "grad_norm": 0.8751115798950195, + "learning_rate": 9.237970532004516e-05, + "loss": 0.9716, + "step": 55760 + }, + { + "epoch": 0.3562986340927386, + "grad_norm": 0.8691346049308777, + "learning_rate": 9.237704249664637e-05, + "loss": 0.8428, + "step": 55770 + }, + { + "epoch": 0.3563625212424773, + "grad_norm": 0.7232783436775208, + "learning_rate": 9.237437924647408e-05, + "loss": 0.7021, + "step": 55780 + }, + { + "epoch": 0.356426408392216, + "grad_norm": 1.281238317489624, + "learning_rate": 9.237171556955513e-05, + "loss": 0.9095, + "step": 55790 + }, + { + "epoch": 0.3564902955419547, + "grad_norm": 1.1289631128311157, + "learning_rate": 9.236905146591635e-05, + "loss": 0.9427, + "step": 55800 + }, + { + "epoch": 0.35655418269169337, + "grad_norm": 0.8392340540885925, + "learning_rate": 9.236638693558456e-05, + "loss": 0.7125, + "step": 55810 + }, + { + "epoch": 0.3566180698414321, + "grad_norm": 1.3441346883773804, + "learning_rate": 9.23637219785866e-05, + "loss": 0.8185, + "step": 55820 + }, + { + "epoch": 0.3566819569911708, + "grad_norm": 0.7084068059921265, + "learning_rate": 9.236105659494933e-05, + "loss": 0.8048, + "step": 55830 + }, + { + "epoch": 0.3567458441409095, + "grad_norm": 0.8866279125213623, + "learning_rate": 9.235839078469956e-05, + "loss": 1.0885, + "step": 55840 + }, 
+ { + "epoch": 0.3568097312906482, + "grad_norm": 0.9575055837631226, + "learning_rate": 9.235572454786414e-05, + "loss": 0.8621, + "step": 55850 + }, + { + "epoch": 0.3568736184403869, + "grad_norm": 0.7449828386306763, + "learning_rate": 9.235305788446995e-05, + "loss": 0.902, + "step": 55860 + }, + { + "epoch": 0.3569375055901256, + "grad_norm": 0.5956260561943054, + "learning_rate": 9.235039079454382e-05, + "loss": 0.9419, + "step": 55870 + }, + { + "epoch": 0.3570013927398643, + "grad_norm": 0.7238242030143738, + "learning_rate": 9.23477232781126e-05, + "loss": 1.07, + "step": 55880 + }, + { + "epoch": 0.357065279889603, + "grad_norm": 1.0870457887649536, + "learning_rate": 9.234505533520319e-05, + "loss": 0.9432, + "step": 55890 + }, + { + "epoch": 0.3571291670393417, + "grad_norm": 0.9857404232025146, + "learning_rate": 9.234238696584244e-05, + "loss": 1.1723, + "step": 55900 + }, + { + "epoch": 0.35719305418908043, + "grad_norm": 0.9271001815795898, + "learning_rate": 9.233971817005722e-05, + "loss": 0.9523, + "step": 55910 + }, + { + "epoch": 0.35725694133881913, + "grad_norm": 1.4764975309371948, + "learning_rate": 9.23370489478744e-05, + "loss": 1.0909, + "step": 55920 + }, + { + "epoch": 0.3573208284885578, + "grad_norm": 1.0458935499191284, + "learning_rate": 9.233437929932087e-05, + "loss": 0.8501, + "step": 55930 + }, + { + "epoch": 0.3573847156382965, + "grad_norm": 0.8124297857284546, + "learning_rate": 9.233170922442353e-05, + "loss": 1.0442, + "step": 55940 + }, + { + "epoch": 0.3574486027880352, + "grad_norm": 1.1618013381958008, + "learning_rate": 9.232903872320924e-05, + "loss": 1.0649, + "step": 55950 + }, + { + "epoch": 0.3575124899377739, + "grad_norm": 0.8419058918952942, + "learning_rate": 9.232636779570491e-05, + "loss": 0.8909, + "step": 55960 + }, + { + "epoch": 0.3575763770875126, + "grad_norm": 0.9706215262413025, + "learning_rate": 9.232369644193746e-05, + "loss": 0.9632, + "step": 55970 + }, + { + "epoch": 0.3576402642372513, + "grad_norm": 1.0121309757232666, + "learning_rate": 9.232102466193375e-05, + "loss": 0.7683, + "step": 55980 + }, + { + "epoch": 0.35770415138699, + "grad_norm": 0.8496699929237366, + "learning_rate": 9.231835245572072e-05, + "loss": 0.7359, + "step": 55990 + }, + { + "epoch": 0.3577680385367287, + "grad_norm": 0.6330521702766418, + "learning_rate": 9.231567982332528e-05, + "loss": 0.7402, + "step": 56000 + }, + { + "epoch": 0.35783192568646743, + "grad_norm": 0.8351934552192688, + "learning_rate": 9.23130067647743e-05, + "loss": 0.8587, + "step": 56010 + }, + { + "epoch": 0.35789581283620614, + "grad_norm": 0.8029147386550903, + "learning_rate": 9.231033328009477e-05, + "loss": 0.7748, + "step": 56020 + }, + { + "epoch": 0.35795969998594485, + "grad_norm": 0.6627703905105591, + "learning_rate": 9.230765936931355e-05, + "loss": 0.8785, + "step": 56030 + }, + { + "epoch": 0.35802358713568355, + "grad_norm": 1.2232366800308228, + "learning_rate": 9.230498503245764e-05, + "loss": 0.8073, + "step": 56040 + }, + { + "epoch": 0.3580874742854222, + "grad_norm": 0.783330500125885, + "learning_rate": 9.23023102695539e-05, + "loss": 1.0156, + "step": 56050 + }, + { + "epoch": 0.3581513614351609, + "grad_norm": 0.8045901656150818, + "learning_rate": 9.229963508062931e-05, + "loss": 0.9699, + "step": 56060 + }, + { + "epoch": 0.3582152485848996, + "grad_norm": 1.1084731817245483, + "learning_rate": 9.229695946571079e-05, + "loss": 1.0626, + "step": 56070 + }, + { + "epoch": 0.3582791357346383, + "grad_norm": 0.9448803067207336, + 
"learning_rate": 9.229428342482531e-05, + "loss": 0.873, + "step": 56080 + }, + { + "epoch": 0.358343022884377, + "grad_norm": 1.1229417324066162, + "learning_rate": 9.229160695799981e-05, + "loss": 0.9604, + "step": 56090 + }, + { + "epoch": 0.35840691003411573, + "grad_norm": 0.8668258190155029, + "learning_rate": 9.228893006526122e-05, + "loss": 1.0267, + "step": 56100 + }, + { + "epoch": 0.35847079718385444, + "grad_norm": 1.0289602279663086, + "learning_rate": 9.228625274663653e-05, + "loss": 0.8669, + "step": 56110 + }, + { + "epoch": 0.35853468433359315, + "grad_norm": 0.9492294192314148, + "learning_rate": 9.22835750021527e-05, + "loss": 0.8236, + "step": 56120 + }, + { + "epoch": 0.35859857148333185, + "grad_norm": 0.9482596516609192, + "learning_rate": 9.228116466802996e-05, + "loss": 0.9974, + "step": 56130 + }, + { + "epoch": 0.35866245863307056, + "grad_norm": 0.8407812118530273, + "learning_rate": 9.227848611448803e-05, + "loss": 0.8215, + "step": 56140 + }, + { + "epoch": 0.35872634578280926, + "grad_norm": 0.8675394654273987, + "learning_rate": 9.227580713516519e-05, + "loss": 1.0991, + "step": 56150 + }, + { + "epoch": 0.35879023293254797, + "grad_norm": 0.9417056441307068, + "learning_rate": 9.227312773008838e-05, + "loss": 1.1657, + "step": 56160 + }, + { + "epoch": 0.3588541200822867, + "grad_norm": 0.6891525983810425, + "learning_rate": 9.22704478992846e-05, + "loss": 0.8083, + "step": 56170 + }, + { + "epoch": 0.3589180072320253, + "grad_norm": 0.8754307627677917, + "learning_rate": 9.226776764278087e-05, + "loss": 0.8525, + "step": 56180 + }, + { + "epoch": 0.35898189438176403, + "grad_norm": 0.7058795094490051, + "learning_rate": 9.226508696060412e-05, + "loss": 0.9577, + "step": 56190 + }, + { + "epoch": 0.35904578153150274, + "grad_norm": 0.7938945889472961, + "learning_rate": 9.22624058527814e-05, + "loss": 0.9518, + "step": 56200 + }, + { + "epoch": 0.35910966868124145, + "grad_norm": 1.1744211912155151, + "learning_rate": 9.225972431933968e-05, + "loss": 0.9626, + "step": 56210 + }, + { + "epoch": 0.35917355583098015, + "grad_norm": 0.6415241360664368, + "learning_rate": 9.225704236030597e-05, + "loss": 0.8535, + "step": 56220 + }, + { + "epoch": 0.35923744298071886, + "grad_norm": 0.8061168789863586, + "learning_rate": 9.225435997570731e-05, + "loss": 0.9465, + "step": 56230 + }, + { + "epoch": 0.35930133013045756, + "grad_norm": 1.0841360092163086, + "learning_rate": 9.225167716557066e-05, + "loss": 0.8539, + "step": 56240 + }, + { + "epoch": 0.35936521728019627, + "grad_norm": 0.8104168772697449, + "learning_rate": 9.22489939299231e-05, + "loss": 0.9817, + "step": 56250 + }, + { + "epoch": 0.359429104429935, + "grad_norm": 0.5234248042106628, + "learning_rate": 9.22463102687916e-05, + "loss": 0.9769, + "step": 56260 + }, + { + "epoch": 0.3594929915796737, + "grad_norm": 0.9442692995071411, + "learning_rate": 9.224362618220321e-05, + "loss": 0.7631, + "step": 56270 + }, + { + "epoch": 0.3595568787294124, + "grad_norm": 0.7581874132156372, + "learning_rate": 9.224094167018496e-05, + "loss": 0.9655, + "step": 56280 + }, + { + "epoch": 0.3596207658791511, + "grad_norm": 0.9377095699310303, + "learning_rate": 9.223825673276387e-05, + "loss": 0.8839, + "step": 56290 + }, + { + "epoch": 0.35968465302888974, + "grad_norm": 1.3251482248306274, + "learning_rate": 9.2235571369967e-05, + "loss": 1.07, + "step": 56300 + }, + { + "epoch": 0.35974854017862845, + "grad_norm": 0.6887118220329285, + "learning_rate": 9.223288558182141e-05, + "loss": 0.8927, + "step": 56310 + 
}, + { + "epoch": 0.35981242732836716, + "grad_norm": 1.181915044784546, + "learning_rate": 9.22301993683541e-05, + "loss": 0.8667, + "step": 56320 + }, + { + "epoch": 0.35987631447810586, + "grad_norm": 0.9099196195602417, + "learning_rate": 9.222751272959216e-05, + "loss": 0.853, + "step": 56330 + }, + { + "epoch": 0.35994020162784457, + "grad_norm": 0.717306911945343, + "learning_rate": 9.222482566556263e-05, + "loss": 0.6981, + "step": 56340 + }, + { + "epoch": 0.3600040887775833, + "grad_norm": 0.9583873748779297, + "learning_rate": 9.222213817629258e-05, + "loss": 0.9945, + "step": 56350 + }, + { + "epoch": 0.360067975927322, + "grad_norm": 0.770458459854126, + "learning_rate": 9.221945026180907e-05, + "loss": 1.0296, + "step": 56360 + }, + { + "epoch": 0.3601318630770607, + "grad_norm": 1.2659205198287964, + "learning_rate": 9.221676192213918e-05, + "loss": 1.0096, + "step": 56370 + }, + { + "epoch": 0.3601957502267994, + "grad_norm": 1.0208081007003784, + "learning_rate": 9.221407315730997e-05, + "loss": 1.0415, + "step": 56380 + }, + { + "epoch": 0.3602596373765381, + "grad_norm": 0.8561046719551086, + "learning_rate": 9.22113839673485e-05, + "loss": 1.052, + "step": 56390 + }, + { + "epoch": 0.3603235245262768, + "grad_norm": 0.8812417984008789, + "learning_rate": 9.22086943522819e-05, + "loss": 0.9035, + "step": 56400 + }, + { + "epoch": 0.3603874116760155, + "grad_norm": 0.9357554316520691, + "learning_rate": 9.220600431213721e-05, + "loss": 1.052, + "step": 56410 + }, + { + "epoch": 0.36045129882575416, + "grad_norm": 0.5159763693809509, + "learning_rate": 9.220331384694157e-05, + "loss": 0.857, + "step": 56420 + }, + { + "epoch": 0.36051518597549287, + "grad_norm": 0.6961538791656494, + "learning_rate": 9.220062295672203e-05, + "loss": 0.7773, + "step": 56430 + }, + { + "epoch": 0.3605790731252316, + "grad_norm": 0.8022356033325195, + "learning_rate": 9.219793164150572e-05, + "loss": 0.9277, + "step": 56440 + }, + { + "epoch": 0.3606429602749703, + "grad_norm": 0.7829380631446838, + "learning_rate": 9.219523990131972e-05, + "loss": 1.2579, + "step": 56450 + }, + { + "epoch": 0.360706847424709, + "grad_norm": 0.705920934677124, + "learning_rate": 9.219254773619118e-05, + "loss": 0.7642, + "step": 56460 + }, + { + "epoch": 0.3607707345744477, + "grad_norm": 0.6244139075279236, + "learning_rate": 9.218985514614715e-05, + "loss": 0.8506, + "step": 56470 + }, + { + "epoch": 0.3608346217241864, + "grad_norm": 0.9292709231376648, + "learning_rate": 9.218716213121479e-05, + "loss": 0.8007, + "step": 56480 + }, + { + "epoch": 0.3608985088739251, + "grad_norm": 1.1093422174453735, + "learning_rate": 9.218446869142121e-05, + "loss": 0.948, + "step": 56490 + }, + { + "epoch": 0.3609623960236638, + "grad_norm": 0.9102591872215271, + "learning_rate": 9.218177482679354e-05, + "loss": 0.9274, + "step": 56500 + }, + { + "epoch": 0.3610262831734025, + "grad_norm": 0.8324138522148132, + "learning_rate": 9.217908053735889e-05, + "loss": 0.7481, + "step": 56510 + }, + { + "epoch": 0.3610901703231412, + "grad_norm": 0.6961225867271423, + "learning_rate": 9.217638582314442e-05, + "loss": 0.9775, + "step": 56520 + }, + { + "epoch": 0.36115405747287993, + "grad_norm": 0.9226143956184387, + "learning_rate": 9.217369068417726e-05, + "loss": 0.849, + "step": 56530 + }, + { + "epoch": 0.3612179446226186, + "grad_norm": 0.7031887769699097, + "learning_rate": 9.217099512048454e-05, + "loss": 0.8807, + "step": 56540 + }, + { + "epoch": 0.3612818317723573, + "grad_norm": 1.840198278427124, + "learning_rate": 
9.216829913209342e-05, + "loss": 0.8067, + "step": 56550 + }, + { + "epoch": 0.361345718922096, + "grad_norm": 0.6737743020057678, + "learning_rate": 9.216560271903105e-05, + "loss": 0.992, + "step": 56560 + }, + { + "epoch": 0.3614096060718347, + "grad_norm": 1.441496729850769, + "learning_rate": 9.216290588132457e-05, + "loss": 0.9385, + "step": 56570 + }, + { + "epoch": 0.3614734932215734, + "grad_norm": 0.6251863837242126, + "learning_rate": 9.216020861900117e-05, + "loss": 0.6162, + "step": 56580 + }, + { + "epoch": 0.3615373803713121, + "grad_norm": 1.0875921249389648, + "learning_rate": 9.215751093208798e-05, + "loss": 0.631, + "step": 56590 + }, + { + "epoch": 0.3616012675210508, + "grad_norm": 0.7786641716957092, + "learning_rate": 9.215481282061221e-05, + "loss": 1.1934, + "step": 56600 + }, + { + "epoch": 0.3616651546707895, + "grad_norm": 0.6881313323974609, + "learning_rate": 9.215211428460098e-05, + "loss": 1.0602, + "step": 56610 + }, + { + "epoch": 0.36172904182052823, + "grad_norm": 1.173574686050415, + "learning_rate": 9.21494153240815e-05, + "loss": 0.8231, + "step": 56620 + }, + { + "epoch": 0.36179292897026694, + "grad_norm": 0.581283450126648, + "learning_rate": 9.214671593908092e-05, + "loss": 0.6751, + "step": 56630 + }, + { + "epoch": 0.36185681612000564, + "grad_norm": 0.7389190196990967, + "learning_rate": 9.214401612962649e-05, + "loss": 0.7668, + "step": 56640 + }, + { + "epoch": 0.36192070326974435, + "grad_norm": 0.6907786130905151, + "learning_rate": 9.214131589574534e-05, + "loss": 0.8037, + "step": 56650 + }, + { + "epoch": 0.361984590419483, + "grad_norm": 0.8607721328735352, + "learning_rate": 9.213861523746467e-05, + "loss": 0.8867, + "step": 56660 + }, + { + "epoch": 0.3620484775692217, + "grad_norm": 1.4309948682785034, + "learning_rate": 9.213591415481172e-05, + "loss": 0.9099, + "step": 56670 + }, + { + "epoch": 0.3621123647189604, + "grad_norm": 0.4009925425052643, + "learning_rate": 9.213321264781363e-05, + "loss": 0.6807, + "step": 56680 + }, + { + "epoch": 0.3621762518686991, + "grad_norm": 1.2572836875915527, + "learning_rate": 9.213051071649766e-05, + "loss": 0.9303, + "step": 56690 + }, + { + "epoch": 0.3622401390184378, + "grad_norm": 0.9721736311912537, + "learning_rate": 9.212780836089098e-05, + "loss": 0.8034, + "step": 56700 + }, + { + "epoch": 0.36230402616817653, + "grad_norm": 0.7228071093559265, + "learning_rate": 9.212510558102083e-05, + "loss": 1.0872, + "step": 56710 + }, + { + "epoch": 0.36236791331791524, + "grad_norm": 1.043264389038086, + "learning_rate": 9.212240237691443e-05, + "loss": 0.8663, + "step": 56720 + }, + { + "epoch": 0.36243180046765394, + "grad_norm": 0.8477384448051453, + "learning_rate": 9.211969874859898e-05, + "loss": 1.0247, + "step": 56730 + }, + { + "epoch": 0.36249568761739265, + "grad_norm": 1.0123629570007324, + "learning_rate": 9.211699469610174e-05, + "loss": 0.886, + "step": 56740 + }, + { + "epoch": 0.36255957476713135, + "grad_norm": 1.2255103588104248, + "learning_rate": 9.211429021944993e-05, + "loss": 0.8577, + "step": 56750 + }, + { + "epoch": 0.36262346191687006, + "grad_norm": 1.0467153787612915, + "learning_rate": 9.211158531867078e-05, + "loss": 1.0881, + "step": 56760 + }, + { + "epoch": 0.36268734906660877, + "grad_norm": 0.876595139503479, + "learning_rate": 9.210887999379153e-05, + "loss": 1.1139, + "step": 56770 + }, + { + "epoch": 0.3627512362163474, + "grad_norm": 0.9058437943458557, + "learning_rate": 9.210617424483943e-05, + "loss": 1.2917, + "step": 56780 + }, + { + "epoch": 
0.3628151233660861, + "grad_norm": 0.755662202835083, + "learning_rate": 9.210346807184174e-05, + "loss": 0.899, + "step": 56790 + }, + { + "epoch": 0.36287901051582483, + "grad_norm": 0.5830435156822205, + "learning_rate": 9.210076147482567e-05, + "loss": 0.748, + "step": 56800 + }, + { + "epoch": 0.36294289766556354, + "grad_norm": 0.7086238861083984, + "learning_rate": 9.209805445381854e-05, + "loss": 1.1404, + "step": 56810 + }, + { + "epoch": 0.36300678481530224, + "grad_norm": 0.8161737322807312, + "learning_rate": 9.209534700884758e-05, + "loss": 0.6793, + "step": 56820 + }, + { + "epoch": 0.36307067196504095, + "grad_norm": 0.9982635974884033, + "learning_rate": 9.209263913994004e-05, + "loss": 0.865, + "step": 56830 + }, + { + "epoch": 0.36313455911477965, + "grad_norm": 0.8346578478813171, + "learning_rate": 9.208993084712322e-05, + "loss": 1.001, + "step": 56840 + }, + { + "epoch": 0.36319844626451836, + "grad_norm": 0.7267649173736572, + "learning_rate": 9.20872221304244e-05, + "loss": 0.6267, + "step": 56850 + }, + { + "epoch": 0.36326233341425707, + "grad_norm": 0.8349143266677856, + "learning_rate": 9.208451298987082e-05, + "loss": 0.9017, + "step": 56860 + }, + { + "epoch": 0.36332622056399577, + "grad_norm": 4.069967269897461, + "learning_rate": 9.20818034254898e-05, + "loss": 0.9907, + "step": 56870 + }, + { + "epoch": 0.3633901077137345, + "grad_norm": 1.0109018087387085, + "learning_rate": 9.20790934373086e-05, + "loss": 0.7675, + "step": 56880 + }, + { + "epoch": 0.3634539948634732, + "grad_norm": 0.7021664977073669, + "learning_rate": 9.207638302535452e-05, + "loss": 0.8808, + "step": 56890 + }, + { + "epoch": 0.36351788201321183, + "grad_norm": 1.4066033363342285, + "learning_rate": 9.207367218965487e-05, + "loss": 1.1123, + "step": 56900 + }, + { + "epoch": 0.36358176916295054, + "grad_norm": 0.5992746353149414, + "learning_rate": 9.207096093023694e-05, + "loss": 0.7128, + "step": 56910 + }, + { + "epoch": 0.36364565631268925, + "grad_norm": 0.9940372109413147, + "learning_rate": 9.206824924712805e-05, + "loss": 1.0003, + "step": 56920 + }, + { + "epoch": 0.36370954346242795, + "grad_norm": 0.7933813333511353, + "learning_rate": 9.206553714035549e-05, + "loss": 0.9643, + "step": 56930 + }, + { + "epoch": 0.36377343061216666, + "grad_norm": 0.7373024225234985, + "learning_rate": 9.206282460994657e-05, + "loss": 0.8773, + "step": 56940 + }, + { + "epoch": 0.36383731776190537, + "grad_norm": 1.27448570728302, + "learning_rate": 9.206011165592863e-05, + "loss": 0.909, + "step": 56950 + }, + { + "epoch": 0.36390120491164407, + "grad_norm": 0.7734085917472839, + "learning_rate": 9.205739827832895e-05, + "loss": 0.9389, + "step": 56960 + }, + { + "epoch": 0.3639650920613828, + "grad_norm": 0.5840640068054199, + "learning_rate": 9.205468447717491e-05, + "loss": 1.01, + "step": 56970 + }, + { + "epoch": 0.3640289792111215, + "grad_norm": 1.308883547782898, + "learning_rate": 9.205197025249382e-05, + "loss": 0.8717, + "step": 56980 + }, + { + "epoch": 0.3640928663608602, + "grad_norm": 0.5307298898696899, + "learning_rate": 9.2049255604313e-05, + "loss": 0.998, + "step": 56990 + }, + { + "epoch": 0.3641567535105989, + "grad_norm": 0.6630975604057312, + "learning_rate": 9.20465405326598e-05, + "loss": 0.9668, + "step": 57000 + }, + { + "epoch": 0.3642206406603376, + "grad_norm": 0.6394637823104858, + "learning_rate": 9.204382503756154e-05, + "loss": 0.8324, + "step": 57010 + }, + { + "epoch": 0.3642845278100763, + "grad_norm": 1.0292775630950928, + "learning_rate": 
9.204110911904562e-05, + "loss": 0.8907, + "step": 57020 + }, + { + "epoch": 0.36434841495981496, + "grad_norm": 1.187157392501831, + "learning_rate": 9.203839277713935e-05, + "loss": 1.0058, + "step": 57030 + }, + { + "epoch": 0.36441230210955367, + "grad_norm": 1.1334859132766724, + "learning_rate": 9.20356760118701e-05, + "loss": 0.9049, + "step": 57040 + }, + { + "epoch": 0.36447618925929237, + "grad_norm": 1.5810905694961548, + "learning_rate": 9.203295882326521e-05, + "loss": 0.8885, + "step": 57050 + }, + { + "epoch": 0.3645400764090311, + "grad_norm": 0.981046736240387, + "learning_rate": 9.203024121135209e-05, + "loss": 0.8166, + "step": 57060 + }, + { + "epoch": 0.3646039635587698, + "grad_norm": 0.9694949388504028, + "learning_rate": 9.202752317615805e-05, + "loss": 0.7219, + "step": 57070 + }, + { + "epoch": 0.3646678507085085, + "grad_norm": 1.3031917810440063, + "learning_rate": 9.202480471771052e-05, + "loss": 0.9798, + "step": 57080 + }, + { + "epoch": 0.3647317378582472, + "grad_norm": 0.9253140687942505, + "learning_rate": 9.202208583603683e-05, + "loss": 0.7253, + "step": 57090 + }, + { + "epoch": 0.3647956250079859, + "grad_norm": 1.0539807081222534, + "learning_rate": 9.201936653116439e-05, + "loss": 0.8563, + "step": 57100 + }, + { + "epoch": 0.3648595121577246, + "grad_norm": 0.8437415361404419, + "learning_rate": 9.201664680312057e-05, + "loss": 0.955, + "step": 57110 + }, + { + "epoch": 0.3649233993074633, + "grad_norm": 0.7053326368331909, + "learning_rate": 9.201392665193276e-05, + "loss": 0.8577, + "step": 57120 + }, + { + "epoch": 0.364987286457202, + "grad_norm": 0.5430055856704712, + "learning_rate": 9.201120607762837e-05, + "loss": 0.8196, + "step": 57130 + }, + { + "epoch": 0.3650511736069407, + "grad_norm": 0.6964545845985413, + "learning_rate": 9.20084850802348e-05, + "loss": 0.8877, + "step": 57140 + }, + { + "epoch": 0.3651150607566794, + "grad_norm": 2.833962917327881, + "learning_rate": 9.200576365977943e-05, + "loss": 1.1258, + "step": 57150 + }, + { + "epoch": 0.3651789479064181, + "grad_norm": 0.9289480447769165, + "learning_rate": 9.200304181628968e-05, + "loss": 0.8065, + "step": 57160 + }, + { + "epoch": 0.3652428350561568, + "grad_norm": 0.6666757464408875, + "learning_rate": 9.200031954979297e-05, + "loss": 0.876, + "step": 57170 + }, + { + "epoch": 0.3653067222058955, + "grad_norm": 0.9867071509361267, + "learning_rate": 9.19975968603167e-05, + "loss": 0.9151, + "step": 57180 + }, + { + "epoch": 0.3653706093556342, + "grad_norm": 0.6142376661300659, + "learning_rate": 9.19948737478883e-05, + "loss": 0.7852, + "step": 57190 + }, + { + "epoch": 0.3654344965053729, + "grad_norm": 1.6434037685394287, + "learning_rate": 9.199215021253518e-05, + "loss": 0.8127, + "step": 57200 + }, + { + "epoch": 0.3654983836551116, + "grad_norm": 1.2232186794281006, + "learning_rate": 9.198942625428479e-05, + "loss": 0.9223, + "step": 57210 + }, + { + "epoch": 0.3655622708048503, + "grad_norm": 1.112564206123352, + "learning_rate": 9.198670187316456e-05, + "loss": 1.0382, + "step": 57220 + }, + { + "epoch": 0.365626157954589, + "grad_norm": 0.8125051259994507, + "learning_rate": 9.19839770692019e-05, + "loss": 0.8196, + "step": 57230 + }, + { + "epoch": 0.36569004510432773, + "grad_norm": 3.3364717960357666, + "learning_rate": 9.198125184242427e-05, + "loss": 1.0401, + "step": 57240 + }, + { + "epoch": 0.36575393225406644, + "grad_norm": 0.8038178086280823, + "learning_rate": 9.197852619285913e-05, + "loss": 1.2333, + "step": 57250 + }, + { + "epoch": 
0.36581781940380514, + "grad_norm": 0.9946600198745728, + "learning_rate": 9.19758001205339e-05, + "loss": 0.8022, + "step": 57260 + }, + { + "epoch": 0.3658817065535438, + "grad_norm": 2.188892126083374, + "learning_rate": 9.197307362547607e-05, + "loss": 0.8886, + "step": 57270 + }, + { + "epoch": 0.3659455937032825, + "grad_norm": 0.699150025844574, + "learning_rate": 9.197034670771306e-05, + "loss": 0.8193, + "step": 57280 + }, + { + "epoch": 0.3660094808530212, + "grad_norm": 0.9230877757072449, + "learning_rate": 9.196761936727235e-05, + "loss": 0.9072, + "step": 57290 + }, + { + "epoch": 0.3660733680027599, + "grad_norm": 1.229096531867981, + "learning_rate": 9.19648916041814e-05, + "loss": 0.883, + "step": 57300 + }, + { + "epoch": 0.3661372551524986, + "grad_norm": 0.6960686445236206, + "learning_rate": 9.196216341846771e-05, + "loss": 1.1022, + "step": 57310 + }, + { + "epoch": 0.3662011423022373, + "grad_norm": 0.7202252745628357, + "learning_rate": 9.195943481015872e-05, + "loss": 0.9708, + "step": 57320 + }, + { + "epoch": 0.36626502945197603, + "grad_norm": 0.6151859760284424, + "learning_rate": 9.19567057792819e-05, + "loss": 1.1355, + "step": 57330 + }, + { + "epoch": 0.36632891660171474, + "grad_norm": 0.6116877198219299, + "learning_rate": 9.195397632586478e-05, + "loss": 0.7314, + "step": 57340 + }, + { + "epoch": 0.36639280375145344, + "grad_norm": 1.559106707572937, + "learning_rate": 9.195124644993483e-05, + "loss": 0.9246, + "step": 57350 + }, + { + "epoch": 0.36645669090119215, + "grad_norm": 0.8441659808158875, + "learning_rate": 9.194851615151951e-05, + "loss": 0.8061, + "step": 57360 + }, + { + "epoch": 0.36652057805093086, + "grad_norm": 0.8084182143211365, + "learning_rate": 9.194578543064635e-05, + "loss": 1.0054, + "step": 57370 + }, + { + "epoch": 0.36658446520066956, + "grad_norm": 0.9725624918937683, + "learning_rate": 9.194305428734285e-05, + "loss": 0.9369, + "step": 57380 + }, + { + "epoch": 0.3666483523504082, + "grad_norm": 0.7019644975662231, + "learning_rate": 9.19403227216365e-05, + "loss": 1.0316, + "step": 57390 + }, + { + "epoch": 0.3667122395001469, + "grad_norm": 0.84947669506073, + "learning_rate": 9.193759073355482e-05, + "loss": 1.0048, + "step": 57400 + }, + { + "epoch": 0.3667761266498856, + "grad_norm": 0.8308162689208984, + "learning_rate": 9.193485832312532e-05, + "loss": 1.2343, + "step": 57410 + }, + { + "epoch": 0.36684001379962433, + "grad_norm": 1.0776206254959106, + "learning_rate": 9.193212549037551e-05, + "loss": 1.2088, + "step": 57420 + }, + { + "epoch": 0.36690390094936304, + "grad_norm": 1.5936800241470337, + "learning_rate": 9.192939223533292e-05, + "loss": 1.0616, + "step": 57430 + }, + { + "epoch": 0.36696778809910174, + "grad_norm": 0.9751461148262024, + "learning_rate": 9.192665855802509e-05, + "loss": 0.9944, + "step": 57440 + }, + { + "epoch": 0.36703167524884045, + "grad_norm": 0.893570601940155, + "learning_rate": 9.192392445847953e-05, + "loss": 0.6853, + "step": 57450 + }, + { + "epoch": 0.36709556239857916, + "grad_norm": 1.0084480047225952, + "learning_rate": 9.192118993672378e-05, + "loss": 1.1445, + "step": 57460 + }, + { + "epoch": 0.36715944954831786, + "grad_norm": 0.9631890654563904, + "learning_rate": 9.191845499278539e-05, + "loss": 0.9025, + "step": 57470 + }, + { + "epoch": 0.36722333669805657, + "grad_norm": 1.809931993484497, + "learning_rate": 9.191571962669187e-05, + "loss": 0.8336, + "step": 57480 + }, + { + "epoch": 0.3672872238477953, + "grad_norm": 1.0318517684936523, + "learning_rate": 
9.191298383847083e-05, + "loss": 0.9237, + "step": 57490 + }, + { + "epoch": 0.367351110997534, + "grad_norm": 1.242576003074646, + "learning_rate": 9.191024762814975e-05, + "loss": 0.9736, + "step": 57500 + }, + { + "epoch": 0.36741499814727263, + "grad_norm": 0.8778398036956787, + "learning_rate": 9.190751099575623e-05, + "loss": 0.7765, + "step": 57510 + }, + { + "epoch": 0.36747888529701134, + "grad_norm": 1.108216643333435, + "learning_rate": 9.19047739413178e-05, + "loss": 1.0051, + "step": 57520 + }, + { + "epoch": 0.36754277244675004, + "grad_norm": 0.9173517227172852, + "learning_rate": 9.190203646486206e-05, + "loss": 0.9958, + "step": 57530 + }, + { + "epoch": 0.36760665959648875, + "grad_norm": 0.8545486330986023, + "learning_rate": 9.189929856641657e-05, + "loss": 0.8174, + "step": 57540 + }, + { + "epoch": 0.36767054674622746, + "grad_norm": 0.8391945362091064, + "learning_rate": 9.18965602460089e-05, + "loss": 0.9733, + "step": 57550 + }, + { + "epoch": 0.36773443389596616, + "grad_norm": 0.6733419895172119, + "learning_rate": 9.189382150366662e-05, + "loss": 0.8057, + "step": 57560 + }, + { + "epoch": 0.36779832104570487, + "grad_norm": 2.302520513534546, + "learning_rate": 9.189108233941729e-05, + "loss": 0.9927, + "step": 57570 + }, + { + "epoch": 0.3678622081954436, + "grad_norm": 0.8115237355232239, + "learning_rate": 9.188834275328853e-05, + "loss": 0.9236, + "step": 57580 + }, + { + "epoch": 0.3679260953451823, + "grad_norm": 1.0810778141021729, + "learning_rate": 9.188560274530793e-05, + "loss": 1.1711, + "step": 57590 + }, + { + "epoch": 0.367989982494921, + "grad_norm": 0.9964002966880798, + "learning_rate": 9.188286231550307e-05, + "loss": 1.013, + "step": 57600 + }, + { + "epoch": 0.3680538696446597, + "grad_norm": 1.2044520378112793, + "learning_rate": 9.188012146390155e-05, + "loss": 0.798, + "step": 57610 + }, + { + "epoch": 0.3681177567943984, + "grad_norm": 2.0826616287231445, + "learning_rate": 9.187738019053098e-05, + "loss": 0.7468, + "step": 57620 + }, + { + "epoch": 0.36818164394413705, + "grad_norm": 0.8267672657966614, + "learning_rate": 9.187463849541895e-05, + "loss": 0.911, + "step": 57630 + }, + { + "epoch": 0.36824553109387576, + "grad_norm": 0.7479827404022217, + "learning_rate": 9.18718963785931e-05, + "loss": 0.7645, + "step": 57640 + }, + { + "epoch": 0.36830941824361446, + "grad_norm": 0.7282874584197998, + "learning_rate": 9.186915384008103e-05, + "loss": 1.1665, + "step": 57650 + }, + { + "epoch": 0.36837330539335317, + "grad_norm": 0.6607038974761963, + "learning_rate": 9.186641087991034e-05, + "loss": 0.9505, + "step": 57660 + }, + { + "epoch": 0.3684371925430919, + "grad_norm": 0.7457965612411499, + "learning_rate": 9.186366749810869e-05, + "loss": 0.7571, + "step": 57670 + }, + { + "epoch": 0.3685010796928306, + "grad_norm": 0.9892933368682861, + "learning_rate": 9.186092369470368e-05, + "loss": 0.8002, + "step": 57680 + }, + { + "epoch": 0.3685649668425693, + "grad_norm": 1.134438395500183, + "learning_rate": 9.185817946972296e-05, + "loss": 0.9039, + "step": 57690 + }, + { + "epoch": 0.368628853992308, + "grad_norm": 0.7737533450126648, + "learning_rate": 9.185543482319417e-05, + "loss": 0.8303, + "step": 57700 + }, + { + "epoch": 0.3686927411420467, + "grad_norm": 2.6117820739746094, + "learning_rate": 9.185268975514491e-05, + "loss": 0.9566, + "step": 57710 + }, + { + "epoch": 0.3687566282917854, + "grad_norm": 1.5219961404800415, + "learning_rate": 9.184994426560289e-05, + "loss": 0.976, + "step": 57720 + }, + { + "epoch": 
0.3688205154415241, + "grad_norm": 0.796924889087677, + "learning_rate": 9.18471983545957e-05, + "loss": 0.8359, + "step": 57730 + }, + { + "epoch": 0.3688844025912628, + "grad_norm": 0.5662297606468201, + "learning_rate": 9.184445202215104e-05, + "loss": 0.9518, + "step": 57740 + }, + { + "epoch": 0.36894828974100147, + "grad_norm": 1.4038177728652954, + "learning_rate": 9.184170526829654e-05, + "loss": 0.8367, + "step": 57750 + }, + { + "epoch": 0.3690121768907402, + "grad_norm": 1.051730990409851, + "learning_rate": 9.183895809305987e-05, + "loss": 0.8319, + "step": 57760 + }, + { + "epoch": 0.3690760640404789, + "grad_norm": 0.6114339232444763, + "learning_rate": 9.183621049646869e-05, + "loss": 0.8821, + "step": 57770 + }, + { + "epoch": 0.3691399511902176, + "grad_norm": 0.7710915803909302, + "learning_rate": 9.18334624785507e-05, + "loss": 0.7208, + "step": 57780 + }, + { + "epoch": 0.3692038383399563, + "grad_norm": 1.5859193801879883, + "learning_rate": 9.183071403933353e-05, + "loss": 0.8121, + "step": 57790 + }, + { + "epoch": 0.369267725489695, + "grad_norm": 0.8052014708518982, + "learning_rate": 9.182796517884487e-05, + "loss": 0.9727, + "step": 57800 + }, + { + "epoch": 0.3693316126394337, + "grad_norm": 0.7919948101043701, + "learning_rate": 9.182521589711244e-05, + "loss": 1.0669, + "step": 57810 + }, + { + "epoch": 0.3693954997891724, + "grad_norm": 0.9116694927215576, + "learning_rate": 9.182246619416388e-05, + "loss": 0.7669, + "step": 57820 + }, + { + "epoch": 0.3694593869389111, + "grad_norm": 0.7370694875717163, + "learning_rate": 9.181971607002693e-05, + "loss": 0.6573, + "step": 57830 + }, + { + "epoch": 0.3695232740886498, + "grad_norm": 2.3321750164031982, + "learning_rate": 9.181696552472924e-05, + "loss": 1.0031, + "step": 57840 + }, + { + "epoch": 0.36958716123838853, + "grad_norm": 1.1017699241638184, + "learning_rate": 9.181421455829852e-05, + "loss": 0.9181, + "step": 57850 + }, + { + "epoch": 0.36965104838812723, + "grad_norm": 1.479238510131836, + "learning_rate": 9.181146317076252e-05, + "loss": 1.0418, + "step": 57860 + }, + { + "epoch": 0.36971493553786594, + "grad_norm": 0.8015438914299011, + "learning_rate": 9.180871136214889e-05, + "loss": 0.8837, + "step": 57870 + }, + { + "epoch": 0.3697788226876046, + "grad_norm": 0.7428931593894958, + "learning_rate": 9.180595913248537e-05, + "loss": 0.9252, + "step": 57880 + }, + { + "epoch": 0.3698427098373433, + "grad_norm": 1.2738107442855835, + "learning_rate": 9.180320648179968e-05, + "loss": 0.8249, + "step": 57890 + }, + { + "epoch": 0.369906596987082, + "grad_norm": 1.0015136003494263, + "learning_rate": 9.180045341011953e-05, + "loss": 0.9307, + "step": 57900 + }, + { + "epoch": 0.3699704841368207, + "grad_norm": 0.7193623185157776, + "learning_rate": 9.179769991747264e-05, + "loss": 0.8081, + "step": 57910 + }, + { + "epoch": 0.3700343712865594, + "grad_norm": 0.964747428894043, + "learning_rate": 9.179494600388677e-05, + "loss": 0.9367, + "step": 57920 + }, + { + "epoch": 0.3700982584362981, + "grad_norm": 0.8497000932693481, + "learning_rate": 9.179219166938963e-05, + "loss": 0.7509, + "step": 57930 + }, + { + "epoch": 0.37016214558603683, + "grad_norm": 1.6816493272781372, + "learning_rate": 9.178943691400896e-05, + "loss": 0.7834, + "step": 57940 + }, + { + "epoch": 0.37022603273577553, + "grad_norm": 0.8694002032279968, + "learning_rate": 9.178668173777252e-05, + "loss": 0.9374, + "step": 57950 + }, + { + "epoch": 0.37028991988551424, + "grad_norm": 0.6682251691818237, + "learning_rate": 
9.178392614070803e-05, + "loss": 0.9475, + "step": 57960 + }, + { + "epoch": 0.37035380703525295, + "grad_norm": 0.610185980796814, + "learning_rate": 9.178117012284326e-05, + "loss": 0.8925, + "step": 57970 + }, + { + "epoch": 0.37041769418499165, + "grad_norm": 0.5272064805030823, + "learning_rate": 9.177841368420596e-05, + "loss": 0.8726, + "step": 57980 + }, + { + "epoch": 0.37048158133473036, + "grad_norm": 0.878194272518158, + "learning_rate": 9.17756568248239e-05, + "loss": 0.8307, + "step": 57990 + }, + { + "epoch": 0.370545468484469, + "grad_norm": 0.5838222503662109, + "learning_rate": 9.177289954472483e-05, + "loss": 1.0879, + "step": 58000 + }, + { + "epoch": 0.3706093556342077, + "grad_norm": 1.0944530963897705, + "learning_rate": 9.177014184393654e-05, + "loss": 0.8774, + "step": 58010 + }, + { + "epoch": 0.3706732427839464, + "grad_norm": 0.8681952953338623, + "learning_rate": 9.176738372248675e-05, + "loss": 0.9085, + "step": 58020 + }, + { + "epoch": 0.37073712993368513, + "grad_norm": 1.0131874084472656, + "learning_rate": 9.176462518040328e-05, + "loss": 1.0068, + "step": 58030 + }, + { + "epoch": 0.37080101708342383, + "grad_norm": 1.0025804042816162, + "learning_rate": 9.176186621771392e-05, + "loss": 0.8304, + "step": 58040 + }, + { + "epoch": 0.37086490423316254, + "grad_norm": 0.5216399431228638, + "learning_rate": 9.175910683444641e-05, + "loss": 1.0596, + "step": 58050 + }, + { + "epoch": 0.37092879138290125, + "grad_norm": 1.1650744676589966, + "learning_rate": 9.17563470306286e-05, + "loss": 0.9596, + "step": 58060 + }, + { + "epoch": 0.37099267853263995, + "grad_norm": 0.738498866558075, + "learning_rate": 9.175358680628825e-05, + "loss": 1.0937, + "step": 58070 + }, + { + "epoch": 0.37105656568237866, + "grad_norm": 1.8002482652664185, + "learning_rate": 9.175082616145314e-05, + "loss": 1.0585, + "step": 58080 + }, + { + "epoch": 0.37112045283211736, + "grad_norm": 0.9968917369842529, + "learning_rate": 9.17480650961511e-05, + "loss": 0.8607, + "step": 58090 + }, + { + "epoch": 0.37118433998185607, + "grad_norm": 0.6830025911331177, + "learning_rate": 9.174530361040992e-05, + "loss": 0.909, + "step": 58100 + }, + { + "epoch": 0.3712482271315948, + "grad_norm": 0.8409507870674133, + "learning_rate": 9.174254170425742e-05, + "loss": 0.7824, + "step": 58110 + }, + { + "epoch": 0.3713121142813334, + "grad_norm": 2.4945926666259766, + "learning_rate": 9.173977937772143e-05, + "loss": 1.0359, + "step": 58120 + }, + { + "epoch": 0.37137600143107213, + "grad_norm": 2.378359317779541, + "learning_rate": 9.173701663082972e-05, + "loss": 0.9768, + "step": 58130 + }, + { + "epoch": 0.37143988858081084, + "grad_norm": 1.1033055782318115, + "learning_rate": 9.173425346361017e-05, + "loss": 0.8963, + "step": 58140 + }, + { + "epoch": 0.37150377573054955, + "grad_norm": 0.982449471950531, + "learning_rate": 9.173148987609057e-05, + "loss": 0.9571, + "step": 58150 + }, + { + "epoch": 0.37156766288028825, + "grad_norm": 0.8938246965408325, + "learning_rate": 9.172872586829878e-05, + "loss": 1.0901, + "step": 58160 + }, + { + "epoch": 0.37163155003002696, + "grad_norm": 1.218361258506775, + "learning_rate": 9.17259614402626e-05, + "loss": 0.9947, + "step": 58170 + }, + { + "epoch": 0.37169543717976566, + "grad_norm": 0.7889940142631531, + "learning_rate": 9.17231965920099e-05, + "loss": 0.8841, + "step": 58180 + }, + { + "epoch": 0.37175932432950437, + "grad_norm": 0.9611823558807373, + "learning_rate": 9.17204313235685e-05, + "loss": 0.8782, + "step": 58190 + }, + { + "epoch": 
0.3718232114792431, + "grad_norm": 1.1690157651901245, + "learning_rate": 9.171766563496628e-05, + "loss": 0.7884, + "step": 58200 + }, + { + "epoch": 0.3718870986289818, + "grad_norm": 1.155748963356018, + "learning_rate": 9.171489952623109e-05, + "loss": 1.0516, + "step": 58210 + }, + { + "epoch": 0.3719509857787205, + "grad_norm": 0.927507758140564, + "learning_rate": 9.171213299739075e-05, + "loss": 1.0492, + "step": 58220 + }, + { + "epoch": 0.3720148729284592, + "grad_norm": 1.9883320331573486, + "learning_rate": 9.170936604847315e-05, + "loss": 1.0933, + "step": 58230 + }, + { + "epoch": 0.37207876007819785, + "grad_norm": 0.8213433623313904, + "learning_rate": 9.170659867950615e-05, + "loss": 0.8121, + "step": 58240 + }, + { + "epoch": 0.37214264722793655, + "grad_norm": 1.8754318952560425, + "learning_rate": 9.170383089051762e-05, + "loss": 1.0397, + "step": 58250 + }, + { + "epoch": 0.37220653437767526, + "grad_norm": 1.1018773317337036, + "learning_rate": 9.170106268153543e-05, + "loss": 0.9177, + "step": 58260 + }, + { + "epoch": 0.37227042152741396, + "grad_norm": 0.6226816773414612, + "learning_rate": 9.169829405258747e-05, + "loss": 0.8247, + "step": 58270 + }, + { + "epoch": 0.37233430867715267, + "grad_norm": 0.615023136138916, + "learning_rate": 9.169552500370161e-05, + "loss": 1.0718, + "step": 58280 + }, + { + "epoch": 0.3723981958268914, + "grad_norm": 0.7454681396484375, + "learning_rate": 9.169275553490573e-05, + "loss": 0.9678, + "step": 58290 + }, + { + "epoch": 0.3724620829766301, + "grad_norm": 0.9580934047698975, + "learning_rate": 9.168998564622774e-05, + "loss": 1.2206, + "step": 58300 + }, + { + "epoch": 0.3725259701263688, + "grad_norm": 1.0605610609054565, + "learning_rate": 9.168721533769556e-05, + "loss": 0.8984, + "step": 58310 + }, + { + "epoch": 0.3725898572761075, + "grad_norm": 1.1907299757003784, + "learning_rate": 9.168444460933702e-05, + "loss": 0.8531, + "step": 58320 + }, + { + "epoch": 0.3726537444258462, + "grad_norm": 0.995368480682373, + "learning_rate": 9.168167346118006e-05, + "loss": 0.6946, + "step": 58330 + }, + { + "epoch": 0.3727176315755849, + "grad_norm": 1.1580376625061035, + "learning_rate": 9.167890189325261e-05, + "loss": 0.7377, + "step": 58340 + }, + { + "epoch": 0.3727815187253236, + "grad_norm": 1.3204131126403809, + "learning_rate": 9.167612990558254e-05, + "loss": 0.8134, + "step": 58350 + }, + { + "epoch": 0.37284540587506226, + "grad_norm": 1.4517935514450073, + "learning_rate": 9.167335749819781e-05, + "loss": 0.7879, + "step": 58360 + }, + { + "epoch": 0.37290929302480097, + "grad_norm": 0.6076644659042358, + "learning_rate": 9.167058467112629e-05, + "loss": 0.9626, + "step": 58370 + }, + { + "epoch": 0.3729731801745397, + "grad_norm": 0.8815622925758362, + "learning_rate": 9.166781142439595e-05, + "loss": 0.8204, + "step": 58380 + }, + { + "epoch": 0.3730370673242784, + "grad_norm": 1.5476758480072021, + "learning_rate": 9.16650377580347e-05, + "loss": 1.0324, + "step": 58390 + }, + { + "epoch": 0.3731009544740171, + "grad_norm": 1.1092950105667114, + "learning_rate": 9.166226367207047e-05, + "loss": 0.7715, + "step": 58400 + }, + { + "epoch": 0.3731648416237558, + "grad_norm": 0.8686773777008057, + "learning_rate": 9.16594891665312e-05, + "loss": 0.8466, + "step": 58410 + }, + { + "epoch": 0.3732287287734945, + "grad_norm": 0.948353111743927, + "learning_rate": 9.165671424144484e-05, + "loss": 0.7348, + "step": 58420 + }, + { + "epoch": 0.3732926159232332, + "grad_norm": 0.8880228400230408, + "learning_rate": 
9.165393889683933e-05, + "loss": 0.8305, + "step": 58430 + }, + { + "epoch": 0.3733565030729719, + "grad_norm": 0.6419790983200073, + "learning_rate": 9.165116313274262e-05, + "loss": 0.8744, + "step": 58440 + }, + { + "epoch": 0.3734203902227106, + "grad_norm": 0.6578751802444458, + "learning_rate": 9.164838694918266e-05, + "loss": 1.0893, + "step": 58450 + }, + { + "epoch": 0.3734842773724493, + "grad_norm": 0.6613552570343018, + "learning_rate": 9.16456103461874e-05, + "loss": 1.0446, + "step": 58460 + }, + { + "epoch": 0.37354816452218803, + "grad_norm": 0.7612963318824768, + "learning_rate": 9.164283332378483e-05, + "loss": 0.9673, + "step": 58470 + }, + { + "epoch": 0.3736120516719267, + "grad_norm": 1.8602917194366455, + "learning_rate": 9.16400558820029e-05, + "loss": 1.1563, + "step": 58480 + }, + { + "epoch": 0.3736759388216654, + "grad_norm": 0.7156874537467957, + "learning_rate": 9.163755582585293e-05, + "loss": 0.9562, + "step": 58490 + }, + { + "epoch": 0.3737398259714041, + "grad_norm": 0.8070692420005798, + "learning_rate": 9.163477758732727e-05, + "loss": 0.8344, + "step": 58500 + }, + { + "epoch": 0.3738037131211428, + "grad_norm": 0.8404533267021179, + "learning_rate": 9.163199892950341e-05, + "loss": 0.9861, + "step": 58510 + }, + { + "epoch": 0.3738676002708815, + "grad_norm": 0.8748318552970886, + "learning_rate": 9.162921985240928e-05, + "loss": 0.9779, + "step": 58520 + }, + { + "epoch": 0.3739314874206202, + "grad_norm": 0.8599054217338562, + "learning_rate": 9.16264403560729e-05, + "loss": 0.8124, + "step": 58530 + }, + { + "epoch": 0.3739953745703589, + "grad_norm": 0.7923135161399841, + "learning_rate": 9.162366044052226e-05, + "loss": 0.6135, + "step": 58540 + }, + { + "epoch": 0.3740592617200976, + "grad_norm": 0.6415694952011108, + "learning_rate": 9.162088010578535e-05, + "loss": 0.9293, + "step": 58550 + }, + { + "epoch": 0.37412314886983633, + "grad_norm": 2.256666898727417, + "learning_rate": 9.161809935189016e-05, + "loss": 1.1138, + "step": 58560 + }, + { + "epoch": 0.37418703601957504, + "grad_norm": 1.2693225145339966, + "learning_rate": 9.161531817886471e-05, + "loss": 0.6599, + "step": 58570 + }, + { + "epoch": 0.37425092316931374, + "grad_norm": 0.8467420935630798, + "learning_rate": 9.1612536586737e-05, + "loss": 0.6876, + "step": 58580 + }, + { + "epoch": 0.37431481031905245, + "grad_norm": 0.9001184701919556, + "learning_rate": 9.160975457553504e-05, + "loss": 0.5682, + "step": 58590 + }, + { + "epoch": 0.37437869746879116, + "grad_norm": 0.6269614696502686, + "learning_rate": 9.160697214528687e-05, + "loss": 1.0431, + "step": 58600 + }, + { + "epoch": 0.3744425846185298, + "grad_norm": 1.413830280303955, + "learning_rate": 9.160418929602048e-05, + "loss": 0.7761, + "step": 58610 + }, + { + "epoch": 0.3745064717682685, + "grad_norm": 2.2682693004608154, + "learning_rate": 9.160140602776392e-05, + "loss": 1.1893, + "step": 58620 + }, + { + "epoch": 0.3745703589180072, + "grad_norm": 0.5779352188110352, + "learning_rate": 9.159862234054521e-05, + "loss": 1.035, + "step": 58630 + }, + { + "epoch": 0.3746342460677459, + "grad_norm": 0.7203439474105835, + "learning_rate": 9.15958382343924e-05, + "loss": 0.9331, + "step": 58640 + }, + { + "epoch": 0.37469813321748463, + "grad_norm": 0.8126745223999023, + "learning_rate": 9.159305370933349e-05, + "loss": 0.7504, + "step": 58650 + }, + { + "epoch": 0.37476202036722334, + "grad_norm": 0.7604427337646484, + "learning_rate": 9.159026876539656e-05, + "loss": 0.8239, + "step": 58660 + }, + { + "epoch": 
0.37482590751696204, + "grad_norm": 0.9764753580093384, + "learning_rate": 9.158748340260962e-05, + "loss": 0.8887, + "step": 58670 + }, + { + "epoch": 0.37488979466670075, + "grad_norm": 1.0595623254776, + "learning_rate": 9.158469762100077e-05, + "loss": 0.9124, + "step": 58680 + }, + { + "epoch": 0.37495368181643945, + "grad_norm": 0.8522325158119202, + "learning_rate": 9.158191142059803e-05, + "loss": 0.8533, + "step": 58690 + }, + { + "epoch": 0.37501756896617816, + "grad_norm": 1.0041230916976929, + "learning_rate": 9.157912480142947e-05, + "loss": 0.9559, + "step": 58700 + }, + { + "epoch": 0.37508145611591687, + "grad_norm": 1.3694050312042236, + "learning_rate": 9.157633776352314e-05, + "loss": 0.905, + "step": 58710 + }, + { + "epoch": 0.3751453432656556, + "grad_norm": 0.4370633363723755, + "learning_rate": 9.157355030690714e-05, + "loss": 0.7518, + "step": 58720 + }, + { + "epoch": 0.3752092304153942, + "grad_norm": 0.8439906239509583, + "learning_rate": 9.157076243160951e-05, + "loss": 0.9578, + "step": 58730 + }, + { + "epoch": 0.37527311756513293, + "grad_norm": 0.7714953422546387, + "learning_rate": 9.156797413765834e-05, + "loss": 0.9042, + "step": 58740 + }, + { + "epoch": 0.37533700471487164, + "grad_norm": 0.7417599558830261, + "learning_rate": 9.156518542508172e-05, + "loss": 0.8571, + "step": 58750 + }, + { + "epoch": 0.37540089186461034, + "grad_norm": 1.951737642288208, + "learning_rate": 9.15623962939077e-05, + "loss": 0.9094, + "step": 58760 + }, + { + "epoch": 0.37546477901434905, + "grad_norm": 0.8249172568321228, + "learning_rate": 9.155960674416441e-05, + "loss": 0.7664, + "step": 58770 + }, + { + "epoch": 0.37552866616408775, + "grad_norm": 0.667812705039978, + "learning_rate": 9.155681677587992e-05, + "loss": 0.708, + "step": 58780 + }, + { + "epoch": 0.37559255331382646, + "grad_norm": 0.6393797993659973, + "learning_rate": 9.155402638908235e-05, + "loss": 0.8337, + "step": 58790 + }, + { + "epoch": 0.37565644046356517, + "grad_norm": 0.7899972200393677, + "learning_rate": 9.155123558379976e-05, + "loss": 1.0715, + "step": 58800 + }, + { + "epoch": 0.3757203276133039, + "grad_norm": 2.3867976665496826, + "learning_rate": 9.154844436006029e-05, + "loss": 0.9635, + "step": 58810 + }, + { + "epoch": 0.3757842147630426, + "grad_norm": 0.7886314392089844, + "learning_rate": 9.154565271789206e-05, + "loss": 0.8288, + "step": 58820 + }, + { + "epoch": 0.3758481019127813, + "grad_norm": 0.6438289880752563, + "learning_rate": 9.154286065732313e-05, + "loss": 0.683, + "step": 58830 + }, + { + "epoch": 0.37591198906252, + "grad_norm": 0.8149610161781311, + "learning_rate": 9.154006817838168e-05, + "loss": 0.9502, + "step": 58840 + }, + { + "epoch": 0.37597587621225864, + "grad_norm": 1.0395874977111816, + "learning_rate": 9.15372752810958e-05, + "loss": 0.6418, + "step": 58850 + }, + { + "epoch": 0.37603976336199735, + "grad_norm": 1.5722790956497192, + "learning_rate": 9.153448196549362e-05, + "loss": 0.927, + "step": 58860 + }, + { + "epoch": 0.37610365051173605, + "grad_norm": 1.1867657899856567, + "learning_rate": 9.153168823160327e-05, + "loss": 0.7479, + "step": 58870 + }, + { + "epoch": 0.37616753766147476, + "grad_norm": 0.9400370121002197, + "learning_rate": 9.15288940794529e-05, + "loss": 0.8849, + "step": 58880 + }, + { + "epoch": 0.37623142481121347, + "grad_norm": 0.6055128574371338, + "learning_rate": 9.152609950907062e-05, + "loss": 0.8318, + "step": 58890 + }, + { + "epoch": 0.3762953119609522, + "grad_norm": 0.8164952993392944, + "learning_rate": 
9.152330452048462e-05, + "loss": 0.9452, + "step": 58900 + }, + { + "epoch": 0.3763591991106909, + "grad_norm": 0.4781966507434845, + "learning_rate": 9.152050911372301e-05, + "loss": 1.0144, + "step": 58910 + }, + { + "epoch": 0.3764230862604296, + "grad_norm": 0.7525957822799683, + "learning_rate": 9.151771328881394e-05, + "loss": 1.0175, + "step": 58920 + }, + { + "epoch": 0.3764869734101683, + "grad_norm": 0.9770300388336182, + "learning_rate": 9.151491704578559e-05, + "loss": 1.0909, + "step": 58930 + }, + { + "epoch": 0.376550860559907, + "grad_norm": 0.8200979232788086, + "learning_rate": 9.151212038466612e-05, + "loss": 0.6905, + "step": 58940 + }, + { + "epoch": 0.3766147477096457, + "grad_norm": 0.8204917907714844, + "learning_rate": 9.150932330548367e-05, + "loss": 0.9003, + "step": 58950 + }, + { + "epoch": 0.3766786348593844, + "grad_norm": 0.7505319714546204, + "learning_rate": 9.150652580826642e-05, + "loss": 1.0317, + "step": 58960 + }, + { + "epoch": 0.37674252200912306, + "grad_norm": 0.782026469707489, + "learning_rate": 9.150372789304256e-05, + "loss": 0.8431, + "step": 58970 + }, + { + "epoch": 0.37680640915886177, + "grad_norm": 2.76662278175354, + "learning_rate": 9.150092955984025e-05, + "loss": 1.0264, + "step": 58980 + }, + { + "epoch": 0.37687029630860047, + "grad_norm": 0.9735605716705322, + "learning_rate": 9.149813080868766e-05, + "loss": 1.0035, + "step": 58990 + }, + { + "epoch": 0.3769341834583392, + "grad_norm": 0.6053544282913208, + "learning_rate": 9.149533163961302e-05, + "loss": 0.8895, + "step": 59000 + }, + { + "epoch": 0.3769980706080779, + "grad_norm": 1.281782865524292, + "learning_rate": 9.149253205264448e-05, + "loss": 1.1018, + "step": 59010 + }, + { + "epoch": 0.3770619577578166, + "grad_norm": 0.6073563694953918, + "learning_rate": 9.148973204781023e-05, + "loss": 1.0346, + "step": 59020 + }, + { + "epoch": 0.3771258449075553, + "grad_norm": 0.5802990198135376, + "learning_rate": 9.148693162513851e-05, + "loss": 0.8453, + "step": 59030 + }, + { + "epoch": 0.377189732057294, + "grad_norm": 0.9088721871376038, + "learning_rate": 9.148413078465747e-05, + "loss": 1.0229, + "step": 59040 + }, + { + "epoch": 0.3772536192070327, + "grad_norm": 0.8357219099998474, + "learning_rate": 9.148132952639536e-05, + "loss": 1.133, + "step": 59050 + }, + { + "epoch": 0.3773175063567714, + "grad_norm": 0.7745949029922485, + "learning_rate": 9.147852785038038e-05, + "loss": 0.6222, + "step": 59060 + }, + { + "epoch": 0.3773813935065101, + "grad_norm": 0.5787645578384399, + "learning_rate": 9.147572575664074e-05, + "loss": 0.8277, + "step": 59070 + }, + { + "epoch": 0.3774452806562488, + "grad_norm": 0.5599297881126404, + "learning_rate": 9.147292324520466e-05, + "loss": 0.8404, + "step": 59080 + }, + { + "epoch": 0.3775091678059875, + "grad_norm": 0.6565321087837219, + "learning_rate": 9.147012031610035e-05, + "loss": 0.937, + "step": 59090 + }, + { + "epoch": 0.3775730549557262, + "grad_norm": 0.8938694000244141, + "learning_rate": 9.146731696935606e-05, + "loss": 1.0061, + "step": 59100 + }, + { + "epoch": 0.3776369421054649, + "grad_norm": 0.8118715286254883, + "learning_rate": 9.146451320500001e-05, + "loss": 0.9974, + "step": 59110 + }, + { + "epoch": 0.3777008292552036, + "grad_norm": 0.7012856006622314, + "learning_rate": 9.146170902306045e-05, + "loss": 1.0306, + "step": 59120 + }, + { + "epoch": 0.3777647164049423, + "grad_norm": 0.6307138204574585, + "learning_rate": 9.145890442356561e-05, + "loss": 0.685, + "step": 59130 + }, + { + "epoch": 
0.377828603554681, + "grad_norm": 0.793086588382721, + "learning_rate": 9.145609940654373e-05, + "loss": 1.0748, + "step": 59140 + }, + { + "epoch": 0.3778924907044197, + "grad_norm": 1.0463335514068604, + "learning_rate": 9.145329397202307e-05, + "loss": 0.9517, + "step": 59150 + }, + { + "epoch": 0.3779563778541584, + "grad_norm": 0.8374640345573425, + "learning_rate": 9.145048812003186e-05, + "loss": 0.8408, + "step": 59160 + }, + { + "epoch": 0.3780202650038971, + "grad_norm": 0.8713728189468384, + "learning_rate": 9.144768185059838e-05, + "loss": 1.1013, + "step": 59170 + }, + { + "epoch": 0.37808415215363583, + "grad_norm": 0.706382691860199, + "learning_rate": 9.14448751637509e-05, + "loss": 0.8862, + "step": 59180 + }, + { + "epoch": 0.37814803930337454, + "grad_norm": 0.464167058467865, + "learning_rate": 9.144206805951767e-05, + "loss": 0.6612, + "step": 59190 + }, + { + "epoch": 0.37821192645311325, + "grad_norm": 0.7974499464035034, + "learning_rate": 9.143926053792696e-05, + "loss": 1.1017, + "step": 59200 + }, + { + "epoch": 0.3782758136028519, + "grad_norm": 1.0677493810653687, + "learning_rate": 9.143645259900704e-05, + "loss": 1.0395, + "step": 59210 + }, + { + "epoch": 0.3783397007525906, + "grad_norm": 0.6200050711631775, + "learning_rate": 9.14336442427862e-05, + "loss": 0.8772, + "step": 59220 + }, + { + "epoch": 0.3784035879023293, + "grad_norm": 0.8041068315505981, + "learning_rate": 9.143083546929272e-05, + "loss": 0.8241, + "step": 59230 + }, + { + "epoch": 0.378467475052068, + "grad_norm": 1.050399661064148, + "learning_rate": 9.142802627855487e-05, + "loss": 0.7528, + "step": 59240 + }, + { + "epoch": 0.3785313622018067, + "grad_norm": 0.5964527726173401, + "learning_rate": 9.142521667060098e-05, + "loss": 0.8251, + "step": 59250 + }, + { + "epoch": 0.3785952493515454, + "grad_norm": 0.813062310218811, + "learning_rate": 9.14224066454593e-05, + "loss": 1.0259, + "step": 59260 + }, + { + "epoch": 0.37865913650128413, + "grad_norm": 0.8622726798057556, + "learning_rate": 9.141959620315816e-05, + "loss": 0.8479, + "step": 59270 + }, + { + "epoch": 0.37872302365102284, + "grad_norm": 0.59121173620224, + "learning_rate": 9.141678534372584e-05, + "loss": 0.6244, + "step": 59280 + }, + { + "epoch": 0.37878691080076154, + "grad_norm": 0.7073304653167725, + "learning_rate": 9.141397406719066e-05, + "loss": 0.7587, + "step": 59290 + }, + { + "epoch": 0.37885079795050025, + "grad_norm": 0.5923304557800293, + "learning_rate": 9.141116237358095e-05, + "loss": 0.9219, + "step": 59300 + }, + { + "epoch": 0.37891468510023896, + "grad_norm": 0.909243106842041, + "learning_rate": 9.1408350262925e-05, + "loss": 0.721, + "step": 59310 + }, + { + "epoch": 0.37897857224997766, + "grad_norm": 0.40945374965667725, + "learning_rate": 9.140553773525114e-05, + "loss": 0.9946, + "step": 59320 + }, + { + "epoch": 0.3790424593997163, + "grad_norm": 1.5487751960754395, + "learning_rate": 9.14027247905877e-05, + "loss": 1.0129, + "step": 59330 + }, + { + "epoch": 0.379106346549455, + "grad_norm": 0.946149468421936, + "learning_rate": 9.1399911428963e-05, + "loss": 1.0866, + "step": 59340 + }, + { + "epoch": 0.3791702336991937, + "grad_norm": 1.2105820178985596, + "learning_rate": 9.139709765040537e-05, + "loss": 1.1053, + "step": 59350 + }, + { + "epoch": 0.37923412084893243, + "grad_norm": 0.9297011494636536, + "learning_rate": 9.139428345494316e-05, + "loss": 1.0082, + "step": 59360 + }, + { + "epoch": 0.37929800799867114, + "grad_norm": 1.3490419387817383, + "learning_rate": 
9.139146884260469e-05, + "loss": 0.7593, + "step": 59370 + }, + { + "epoch": 0.37936189514840984, + "grad_norm": 1.027013897895813, + "learning_rate": 9.138865381341835e-05, + "loss": 0.8555, + "step": 59380 + }, + { + "epoch": 0.37942578229814855, + "grad_norm": 0.7934104800224304, + "learning_rate": 9.138583836741243e-05, + "loss": 0.7812, + "step": 59390 + }, + { + "epoch": 0.37948966944788726, + "grad_norm": 0.702707052230835, + "learning_rate": 9.138302250461532e-05, + "loss": 0.9684, + "step": 59400 + }, + { + "epoch": 0.37955355659762596, + "grad_norm": 0.6672869920730591, + "learning_rate": 9.138020622505539e-05, + "loss": 0.7703, + "step": 59410 + }, + { + "epoch": 0.37961744374736467, + "grad_norm": 0.811865508556366, + "learning_rate": 9.137738952876096e-05, + "loss": 0.7615, + "step": 59420 + }, + { + "epoch": 0.3796813308971034, + "grad_norm": 1.041718602180481, + "learning_rate": 9.137457241576044e-05, + "loss": 0.8087, + "step": 59430 + }, + { + "epoch": 0.3797452180468421, + "grad_norm": 0.9935733079910278, + "learning_rate": 9.137175488608217e-05, + "loss": 0.8609, + "step": 59440 + }, + { + "epoch": 0.3798091051965808, + "grad_norm": 0.6558438539505005, + "learning_rate": 9.136893693975455e-05, + "loss": 1.1521, + "step": 59450 + }, + { + "epoch": 0.37987299234631944, + "grad_norm": 1.0106873512268066, + "learning_rate": 9.136611857680593e-05, + "loss": 0.8439, + "step": 59460 + }, + { + "epoch": 0.37993687949605814, + "grad_norm": 0.8947387337684631, + "learning_rate": 9.136329979726472e-05, + "loss": 0.9528, + "step": 59470 + }, + { + "epoch": 0.38000076664579685, + "grad_norm": 1.6661876440048218, + "learning_rate": 9.13604806011593e-05, + "loss": 0.7804, + "step": 59480 + }, + { + "epoch": 0.38006465379553556, + "grad_norm": 0.7552819848060608, + "learning_rate": 9.135766098851803e-05, + "loss": 0.8697, + "step": 59490 + }, + { + "epoch": 0.38012854094527426, + "grad_norm": 1.3484975099563599, + "learning_rate": 9.135484095936937e-05, + "loss": 0.7785, + "step": 59500 + }, + { + "epoch": 0.38019242809501297, + "grad_norm": 0.9297848343849182, + "learning_rate": 9.135202051374167e-05, + "loss": 0.695, + "step": 59510 + }, + { + "epoch": 0.3802563152447517, + "grad_norm": 0.8916332125663757, + "learning_rate": 9.134919965166335e-05, + "loss": 0.8245, + "step": 59520 + }, + { + "epoch": 0.3803202023944904, + "grad_norm": 1.1042640209197998, + "learning_rate": 9.13463783731628e-05, + "loss": 0.8783, + "step": 59530 + }, + { + "epoch": 0.3803840895442291, + "grad_norm": 0.8340087532997131, + "learning_rate": 9.134355667826847e-05, + "loss": 0.7602, + "step": 59540 + }, + { + "epoch": 0.3804479766939678, + "grad_norm": 1.1028873920440674, + "learning_rate": 9.134073456700876e-05, + "loss": 0.9535, + "step": 59550 + }, + { + "epoch": 0.3805118638437065, + "grad_norm": 1.2923681735992432, + "learning_rate": 9.133791203941207e-05, + "loss": 0.9221, + "step": 59560 + }, + { + "epoch": 0.3805757509934452, + "grad_norm": 1.8344556093215942, + "learning_rate": 9.133508909550686e-05, + "loss": 1.1256, + "step": 59570 + }, + { + "epoch": 0.38063963814318386, + "grad_norm": 0.9875249862670898, + "learning_rate": 9.133226573532154e-05, + "loss": 0.7142, + "step": 59580 + }, + { + "epoch": 0.38070352529292256, + "grad_norm": 0.9586598873138428, + "learning_rate": 9.132944195888455e-05, + "loss": 0.7369, + "step": 59590 + }, + { + "epoch": 0.38076741244266127, + "grad_norm": 0.8368619084358215, + "learning_rate": 9.132661776622431e-05, + "loss": 0.7057, + "step": 59600 + }, + { + 
"epoch": 0.3808312995924, + "grad_norm": 0.9391975998878479, + "learning_rate": 9.132379315736928e-05, + "loss": 0.9706, + "step": 59610 + }, + { + "epoch": 0.3808951867421387, + "grad_norm": 0.6700417995452881, + "learning_rate": 9.132096813234792e-05, + "loss": 1.1595, + "step": 59620 + }, + { + "epoch": 0.3809590738918774, + "grad_norm": 4.27878475189209, + "learning_rate": 9.131814269118864e-05, + "loss": 1.0109, + "step": 59630 + }, + { + "epoch": 0.3810229610416161, + "grad_norm": 0.9258844256401062, + "learning_rate": 9.131531683391993e-05, + "loss": 0.979, + "step": 59640 + }, + { + "epoch": 0.3810868481913548, + "grad_norm": 4.915820121765137, + "learning_rate": 9.131249056057023e-05, + "loss": 1.0458, + "step": 59650 + }, + { + "epoch": 0.3811507353410935, + "grad_norm": 2.258350133895874, + "learning_rate": 9.130966387116802e-05, + "loss": 0.7549, + "step": 59660 + }, + { + "epoch": 0.3812146224908322, + "grad_norm": 0.5593277812004089, + "learning_rate": 9.130683676574175e-05, + "loss": 0.8745, + "step": 59670 + }, + { + "epoch": 0.3812785096405709, + "grad_norm": 0.8787796497344971, + "learning_rate": 9.13040092443199e-05, + "loss": 0.8045, + "step": 59680 + }, + { + "epoch": 0.3813423967903096, + "grad_norm": 0.9920330047607422, + "learning_rate": 9.130118130693095e-05, + "loss": 0.9066, + "step": 59690 + }, + { + "epoch": 0.3814062839400483, + "grad_norm": 2.229135513305664, + "learning_rate": 9.129835295360336e-05, + "loss": 0.8905, + "step": 59700 + }, + { + "epoch": 0.381470171089787, + "grad_norm": 0.8204028010368347, + "learning_rate": 9.129552418436563e-05, + "loss": 1.0525, + "step": 59710 + }, + { + "epoch": 0.3815340582395257, + "grad_norm": 0.8208606243133545, + "learning_rate": 9.129269499924626e-05, + "loss": 0.8469, + "step": 59720 + }, + { + "epoch": 0.3815979453892644, + "grad_norm": 0.8647171854972839, + "learning_rate": 9.128986539827371e-05, + "loss": 0.8889, + "step": 59730 + }, + { + "epoch": 0.3816618325390031, + "grad_norm": 1.4753942489624023, + "learning_rate": 9.128703538147651e-05, + "loss": 0.9241, + "step": 59740 + }, + { + "epoch": 0.3817257196887418, + "grad_norm": 6.072597503662109, + "learning_rate": 9.128420494888313e-05, + "loss": 1.3249, + "step": 59750 + }, + { + "epoch": 0.3817896068384805, + "grad_norm": 1.8557090759277344, + "learning_rate": 9.128137410052211e-05, + "loss": 1.0087, + "step": 59760 + }, + { + "epoch": 0.3818534939882192, + "grad_norm": 0.828505277633667, + "learning_rate": 9.127854283642192e-05, + "loss": 0.8843, + "step": 59770 + }, + { + "epoch": 0.3819173811379579, + "grad_norm": 0.6272063851356506, + "learning_rate": 9.127571115661111e-05, + "loss": 0.9136, + "step": 59780 + }, + { + "epoch": 0.38198126828769663, + "grad_norm": 0.683825671672821, + "learning_rate": 9.127287906111817e-05, + "loss": 0.9161, + "step": 59790 + }, + { + "epoch": 0.38204515543743534, + "grad_norm": 0.7828848958015442, + "learning_rate": 9.127004654997163e-05, + "loss": 0.8366, + "step": 59800 + }, + { + "epoch": 0.38210904258717404, + "grad_norm": 2.599881410598755, + "learning_rate": 9.126721362320003e-05, + "loss": 1.0435, + "step": 59810 + }, + { + "epoch": 0.3821729297369127, + "grad_norm": 1.0187602043151855, + "learning_rate": 9.126438028083186e-05, + "loss": 0.9667, + "step": 59820 + }, + { + "epoch": 0.3822368168866514, + "grad_norm": 1.3073110580444336, + "learning_rate": 9.126154652289571e-05, + "loss": 0.7698, + "step": 59830 + }, + { + "epoch": 0.3823007040363901, + "grad_norm": 0.6932925581932068, + "learning_rate": 
9.125871234942008e-05, + "loss": 0.6695, + "step": 59840 + }, + { + "epoch": 0.3823645911861288, + "grad_norm": 1.466614842414856, + "learning_rate": 9.125587776043352e-05, + "loss": 1.0159, + "step": 59850 + }, + { + "epoch": 0.3824284783358675, + "grad_norm": 0.5515915155410767, + "learning_rate": 9.125304275596458e-05, + "loss": 1.0273, + "step": 59860 + }, + { + "epoch": 0.3824923654856062, + "grad_norm": 0.6064876914024353, + "learning_rate": 9.125020733604182e-05, + "loss": 0.8891, + "step": 59870 + }, + { + "epoch": 0.38255625263534493, + "grad_norm": 0.8917511105537415, + "learning_rate": 9.124737150069378e-05, + "loss": 1.1068, + "step": 59880 + }, + { + "epoch": 0.38262013978508363, + "grad_norm": 0.7151978611946106, + "learning_rate": 9.1244535249949e-05, + "loss": 0.8862, + "step": 59890 + }, + { + "epoch": 0.38268402693482234, + "grad_norm": 0.8112443089485168, + "learning_rate": 9.124169858383611e-05, + "loss": 0.897, + "step": 59900 + }, + { + "epoch": 0.38274791408456105, + "grad_norm": 1.089768886566162, + "learning_rate": 9.123886150238361e-05, + "loss": 0.9832, + "step": 59910 + }, + { + "epoch": 0.38281180123429975, + "grad_norm": 0.7794529795646667, + "learning_rate": 9.12360240056201e-05, + "loss": 0.8341, + "step": 59920 + }, + { + "epoch": 0.38287568838403846, + "grad_norm": 0.5675161480903625, + "learning_rate": 9.123318609357417e-05, + "loss": 0.9027, + "step": 59930 + }, + { + "epoch": 0.3829395755337771, + "grad_norm": 0.8330199718475342, + "learning_rate": 9.123034776627437e-05, + "loss": 0.9739, + "step": 59940 + }, + { + "epoch": 0.3830034626835158, + "grad_norm": 1.6454709768295288, + "learning_rate": 9.12275090237493e-05, + "loss": 0.977, + "step": 59950 + }, + { + "epoch": 0.3830673498332545, + "grad_norm": 0.8024013042449951, + "learning_rate": 9.122466986602756e-05, + "loss": 0.9452, + "step": 59960 + }, + { + "epoch": 0.38313123698299323, + "grad_norm": 1.1360933780670166, + "learning_rate": 9.122183029313771e-05, + "loss": 1.0236, + "step": 59970 + }, + { + "epoch": 0.38319512413273193, + "grad_norm": 0.7337785959243774, + "learning_rate": 9.121899030510839e-05, + "loss": 0.9299, + "step": 59980 + }, + { + "epoch": 0.38325901128247064, + "grad_norm": 0.8636689782142639, + "learning_rate": 9.121614990196816e-05, + "loss": 0.7671, + "step": 59990 + }, + { + "epoch": 0.38332289843220935, + "grad_norm": 1.2737140655517578, + "learning_rate": 9.121330908374564e-05, + "loss": 0.8175, + "step": 60000 + }, + { + "epoch": 0.38338678558194805, + "grad_norm": 0.6086975336074829, + "learning_rate": 9.121046785046945e-05, + "loss": 1.1958, + "step": 60010 + }, + { + "epoch": 0.38345067273168676, + "grad_norm": 0.7334972023963928, + "learning_rate": 9.12076262021682e-05, + "loss": 0.8533, + "step": 60020 + }, + { + "epoch": 0.38351455988142547, + "grad_norm": 0.67818683385849, + "learning_rate": 9.12047841388705e-05, + "loss": 0.7376, + "step": 60030 + }, + { + "epoch": 0.38357844703116417, + "grad_norm": 0.5810967683792114, + "learning_rate": 9.120194166060498e-05, + "loss": 0.8313, + "step": 60040 + }, + { + "epoch": 0.3836423341809029, + "grad_norm": 0.7271260619163513, + "learning_rate": 9.119909876740027e-05, + "loss": 0.8529, + "step": 60050 + }, + { + "epoch": 0.38370622133064153, + "grad_norm": 1.0164223909378052, + "learning_rate": 9.119625545928499e-05, + "loss": 0.919, + "step": 60060 + }, + { + "epoch": 0.38377010848038023, + "grad_norm": 1.4784969091415405, + "learning_rate": 9.119341173628777e-05, + "loss": 0.9259, + "step": 60070 + }, + { + "epoch": 
0.38383399563011894, + "grad_norm": 0.8718630075454712, + "learning_rate": 9.119056759843724e-05, + "loss": 0.918, + "step": 60080 + }, + { + "epoch": 0.38389788277985765, + "grad_norm": 0.9398227334022522, + "learning_rate": 9.118772304576209e-05, + "loss": 1.0287, + "step": 60090 + }, + { + "epoch": 0.38396176992959635, + "grad_norm": 0.7162007689476013, + "learning_rate": 9.118487807829093e-05, + "loss": 0.8178, + "step": 60100 + }, + { + "epoch": 0.38402565707933506, + "grad_norm": 1.4307546615600586, + "learning_rate": 9.118203269605242e-05, + "loss": 0.8535, + "step": 60110 + }, + { + "epoch": 0.38408954422907376, + "grad_norm": 1.0519388914108276, + "learning_rate": 9.11791868990752e-05, + "loss": 1.2414, + "step": 60120 + }, + { + "epoch": 0.38415343137881247, + "grad_norm": 0.8539866805076599, + "learning_rate": 9.117634068738794e-05, + "loss": 0.8189, + "step": 60130 + }, + { + "epoch": 0.3842173185285512, + "grad_norm": 0.8897231221199036, + "learning_rate": 9.117349406101931e-05, + "loss": 1.0583, + "step": 60140 + }, + { + "epoch": 0.3842812056782899, + "grad_norm": 0.9356622099876404, + "learning_rate": 9.117064701999797e-05, + "loss": 0.8774, + "step": 60150 + }, + { + "epoch": 0.3843450928280286, + "grad_norm": 0.934384822845459, + "learning_rate": 9.116779956435262e-05, + "loss": 1.0653, + "step": 60160 + }, + { + "epoch": 0.3844089799777673, + "grad_norm": 0.5904353857040405, + "learning_rate": 9.11649516941119e-05, + "loss": 0.8482, + "step": 60170 + }, + { + "epoch": 0.38447286712750595, + "grad_norm": 0.840069055557251, + "learning_rate": 9.116210340930451e-05, + "loss": 1.0966, + "step": 60180 + }, + { + "epoch": 0.38453675427724465, + "grad_norm": 2.140904188156128, + "learning_rate": 9.115925470995912e-05, + "loss": 0.9313, + "step": 60190 + }, + { + "epoch": 0.38460064142698336, + "grad_norm": 1.6145496368408203, + "learning_rate": 9.115640559610444e-05, + "loss": 0.9065, + "step": 60200 + }, + { + "epoch": 0.38466452857672206, + "grad_norm": 0.8971934914588928, + "learning_rate": 9.115355606776913e-05, + "loss": 0.7211, + "step": 60210 + }, + { + "epoch": 0.38472841572646077, + "grad_norm": 0.740960419178009, + "learning_rate": 9.115070612498192e-05, + "loss": 0.7915, + "step": 60220 + }, + { + "epoch": 0.3847923028761995, + "grad_norm": 1.029941201210022, + "learning_rate": 9.114785576777149e-05, + "loss": 0.9746, + "step": 60230 + }, + { + "epoch": 0.3848561900259382, + "grad_norm": 1.0357356071472168, + "learning_rate": 9.114500499616656e-05, + "loss": 0.7439, + "step": 60240 + }, + { + "epoch": 0.3849200771756769, + "grad_norm": 0.823661208152771, + "learning_rate": 9.114215381019584e-05, + "loss": 0.8409, + "step": 60250 + }, + { + "epoch": 0.3849839643254156, + "grad_norm": 1.5275285243988037, + "learning_rate": 9.113930220988804e-05, + "loss": 0.6833, + "step": 60260 + }, + { + "epoch": 0.3850478514751543, + "grad_norm": 0.8406334519386292, + "learning_rate": 9.113645019527187e-05, + "loss": 0.941, + "step": 60270 + }, + { + "epoch": 0.385111738624893, + "grad_norm": 1.2402430772781372, + "learning_rate": 9.113359776637604e-05, + "loss": 0.823, + "step": 60280 + }, + { + "epoch": 0.3851756257746317, + "grad_norm": 0.8033724427223206, + "learning_rate": 9.113074492322933e-05, + "loss": 1.0329, + "step": 60290 + }, + { + "epoch": 0.3852395129243704, + "grad_norm": 0.7544481158256531, + "learning_rate": 9.112789166586041e-05, + "loss": 0.7707, + "step": 60300 + }, + { + "epoch": 0.38530340007410907, + "grad_norm": 1.0110443830490112, + "learning_rate": 
9.112503799429805e-05, + "loss": 0.8752, + "step": 60310 + }, + { + "epoch": 0.3853672872238478, + "grad_norm": 0.9389250874519348, + "learning_rate": 9.112218390857098e-05, + "loss": 1.004, + "step": 60320 + }, + { + "epoch": 0.3854311743735865, + "grad_norm": 0.7335034608840942, + "learning_rate": 9.111932940870793e-05, + "loss": 0.9463, + "step": 60330 + }, + { + "epoch": 0.3854950615233252, + "grad_norm": 0.9130538105964661, + "learning_rate": 9.111647449473766e-05, + "loss": 0.8286, + "step": 60340 + }, + { + "epoch": 0.3855589486730639, + "grad_norm": 1.8311418294906616, + "learning_rate": 9.111361916668894e-05, + "loss": 0.9905, + "step": 60350 + }, + { + "epoch": 0.3856228358228026, + "grad_norm": 0.7370787858963013, + "learning_rate": 9.111076342459051e-05, + "loss": 1.105, + "step": 60360 + }, + { + "epoch": 0.3856867229725413, + "grad_norm": 0.8268787860870361, + "learning_rate": 9.110790726847109e-05, + "loss": 0.779, + "step": 60370 + }, + { + "epoch": 0.38575061012228, + "grad_norm": 0.7258269190788269, + "learning_rate": 9.110505069835952e-05, + "loss": 0.8981, + "step": 60380 + }, + { + "epoch": 0.3858144972720187, + "grad_norm": 1.1114614009857178, + "learning_rate": 9.11021937142845e-05, + "loss": 0.9508, + "step": 60390 + }, + { + "epoch": 0.3858783844217574, + "grad_norm": 0.6973649263381958, + "learning_rate": 9.109933631627485e-05, + "loss": 0.9868, + "step": 60400 + }, + { + "epoch": 0.38594227157149613, + "grad_norm": 0.8535771369934082, + "learning_rate": 9.109647850435931e-05, + "loss": 0.9278, + "step": 60410 + }, + { + "epoch": 0.38600615872123484, + "grad_norm": 0.9913718104362488, + "learning_rate": 9.10936202785667e-05, + "loss": 0.9525, + "step": 60420 + }, + { + "epoch": 0.3860700458709735, + "grad_norm": 0.9371497631072998, + "learning_rate": 9.109076163892577e-05, + "loss": 0.9669, + "step": 60430 + }, + { + "epoch": 0.3861339330207122, + "grad_norm": 0.6546643972396851, + "learning_rate": 9.108790258546533e-05, + "loss": 0.6787, + "step": 60440 + }, + { + "epoch": 0.3861978201704509, + "grad_norm": 0.8154623508453369, + "learning_rate": 9.108504311821416e-05, + "loss": 1.0956, + "step": 60450 + }, + { + "epoch": 0.3862617073201896, + "grad_norm": 0.5797396898269653, + "learning_rate": 9.108218323720104e-05, + "loss": 1.2229, + "step": 60460 + }, + { + "epoch": 0.3863255944699283, + "grad_norm": 1.2264608144760132, + "learning_rate": 9.107932294245483e-05, + "loss": 0.9712, + "step": 60470 + }, + { + "epoch": 0.386389481619667, + "grad_norm": 0.9331986904144287, + "learning_rate": 9.107646223400428e-05, + "loss": 0.8631, + "step": 60480 + }, + { + "epoch": 0.3864533687694057, + "grad_norm": 1.17788827419281, + "learning_rate": 9.107360111187821e-05, + "loss": 0.7527, + "step": 60490 + }, + { + "epoch": 0.38651725591914443, + "grad_norm": 0.9666171073913574, + "learning_rate": 9.107073957610546e-05, + "loss": 0.745, + "step": 60500 + }, + { + "epoch": 0.38658114306888314, + "grad_norm": 0.7744701504707336, + "learning_rate": 9.106787762671483e-05, + "loss": 0.9245, + "step": 60510 + }, + { + "epoch": 0.38664503021862184, + "grad_norm": 0.7567153573036194, + "learning_rate": 9.106501526373514e-05, + "loss": 0.8483, + "step": 60520 + }, + { + "epoch": 0.38670891736836055, + "grad_norm": 1.0141370296478271, + "learning_rate": 9.106215248719522e-05, + "loss": 0.8139, + "step": 60530 + }, + { + "epoch": 0.38677280451809926, + "grad_norm": 0.924473762512207, + "learning_rate": 9.10592892971239e-05, + "loss": 1.1207, + "step": 60540 + }, + { + "epoch": 
0.3868366916678379, + "grad_norm": 0.6461699604988098, + "learning_rate": 9.105642569355002e-05, + "loss": 1.1942, + "step": 60550 + }, + { + "epoch": 0.3869005788175766, + "grad_norm": 0.7070831060409546, + "learning_rate": 9.105356167650241e-05, + "loss": 0.7269, + "step": 60560 + }, + { + "epoch": 0.3869644659673153, + "grad_norm": 1.24761962890625, + "learning_rate": 9.105069724600992e-05, + "loss": 0.9219, + "step": 60570 + }, + { + "epoch": 0.387028353117054, + "grad_norm": 0.9694204330444336, + "learning_rate": 9.104783240210137e-05, + "loss": 1.1463, + "step": 60580 + }, + { + "epoch": 0.38709224026679273, + "grad_norm": 0.7237581014633179, + "learning_rate": 9.104496714480567e-05, + "loss": 1.1098, + "step": 60590 + }, + { + "epoch": 0.38715612741653144, + "grad_norm": 0.9114017486572266, + "learning_rate": 9.104210147415163e-05, + "loss": 1.0888, + "step": 60600 + }, + { + "epoch": 0.38722001456627014, + "grad_norm": 0.5623325705528259, + "learning_rate": 9.103923539016813e-05, + "loss": 0.9529, + "step": 60610 + }, + { + "epoch": 0.38728390171600885, + "grad_norm": 0.6232447624206543, + "learning_rate": 9.1036368892884e-05, + "loss": 1.0587, + "step": 60620 + }, + { + "epoch": 0.38734778886574756, + "grad_norm": 0.9023538827896118, + "learning_rate": 9.103350198232816e-05, + "loss": 1.0181, + "step": 60630 + }, + { + "epoch": 0.38741167601548626, + "grad_norm": 1.999245047569275, + "learning_rate": 9.103063465852945e-05, + "loss": 0.9449, + "step": 60640 + }, + { + "epoch": 0.38747556316522497, + "grad_norm": 1.0726778507232666, + "learning_rate": 9.102776692151675e-05, + "loss": 0.8554, + "step": 60650 + }, + { + "epoch": 0.3875394503149637, + "grad_norm": 0.9312451481819153, + "learning_rate": 9.102489877131894e-05, + "loss": 0.8106, + "step": 60660 + }, + { + "epoch": 0.3876033374647023, + "grad_norm": 0.7528103590011597, + "learning_rate": 9.102203020796491e-05, + "loss": 0.9015, + "step": 60670 + }, + { + "epoch": 0.38766722461444103, + "grad_norm": 0.6276060342788696, + "learning_rate": 9.101916123148356e-05, + "loss": 0.8222, + "step": 60680 + }, + { + "epoch": 0.38773111176417974, + "grad_norm": 0.818074107170105, + "learning_rate": 9.101629184190375e-05, + "loss": 1.1241, + "step": 60690 + }, + { + "epoch": 0.38779499891391844, + "grad_norm": 0.8359874486923218, + "learning_rate": 9.10134220392544e-05, + "loss": 0.9222, + "step": 60700 + }, + { + "epoch": 0.38785888606365715, + "grad_norm": 0.846093475818634, + "learning_rate": 9.101055182356442e-05, + "loss": 0.9757, + "step": 60710 + }, + { + "epoch": 0.38792277321339586, + "grad_norm": 0.7747712731361389, + "learning_rate": 9.100768119486269e-05, + "loss": 0.7789, + "step": 60720 + }, + { + "epoch": 0.38798666036313456, + "grad_norm": 1.336980938911438, + "learning_rate": 9.100481015317814e-05, + "loss": 1.1395, + "step": 60730 + }, + { + "epoch": 0.38805054751287327, + "grad_norm": 1.1585602760314941, + "learning_rate": 9.100193869853968e-05, + "loss": 1.0321, + "step": 60740 + }, + { + "epoch": 0.388114434662612, + "grad_norm": 0.9213445782661438, + "learning_rate": 9.099906683097623e-05, + "loss": 0.9182, + "step": 60750 + }, + { + "epoch": 0.3881783218123507, + "grad_norm": 0.7520207166671753, + "learning_rate": 9.09961945505167e-05, + "loss": 0.6615, + "step": 60760 + }, + { + "epoch": 0.3882422089620894, + "grad_norm": 1.0059177875518799, + "learning_rate": 9.099332185719003e-05, + "loss": 0.7059, + "step": 60770 + }, + { + "epoch": 0.3883060961118281, + "grad_norm": 1.6132454872131348, + "learning_rate": 
9.099044875102513e-05, + "loss": 1.1878, + "step": 60780 + }, + { + "epoch": 0.38836998326156674, + "grad_norm": 0.8192178010940552, + "learning_rate": 9.098757523205097e-05, + "loss": 1.0932, + "step": 60790 + }, + { + "epoch": 0.38843387041130545, + "grad_norm": 0.9005227088928223, + "learning_rate": 9.098470130029645e-05, + "loss": 0.762, + "step": 60800 + }, + { + "epoch": 0.38849775756104415, + "grad_norm": 0.7836887240409851, + "learning_rate": 9.098182695579054e-05, + "loss": 0.8244, + "step": 60810 + }, + { + "epoch": 0.38856164471078286, + "grad_norm": 0.7896131277084351, + "learning_rate": 9.097895219856218e-05, + "loss": 0.7864, + "step": 60820 + }, + { + "epoch": 0.38862553186052157, + "grad_norm": 1.6993827819824219, + "learning_rate": 9.09760770286403e-05, + "loss": 0.7552, + "step": 60830 + }, + { + "epoch": 0.3886894190102603, + "grad_norm": 0.8872599601745605, + "learning_rate": 9.09732014460539e-05, + "loss": 1.1259, + "step": 60840 + }, + { + "epoch": 0.388753306159999, + "grad_norm": 0.8446595072746277, + "learning_rate": 9.097032545083191e-05, + "loss": 0.7728, + "step": 60850 + }, + { + "epoch": 0.3888171933097377, + "grad_norm": 0.7190898656845093, + "learning_rate": 9.09674490430033e-05, + "loss": 1.0357, + "step": 60860 + }, + { + "epoch": 0.3888810804594764, + "grad_norm": 0.8590859770774841, + "learning_rate": 9.096457222259702e-05, + "loss": 0.7801, + "step": 60870 + }, + { + "epoch": 0.3889449676092151, + "grad_norm": 0.994317889213562, + "learning_rate": 9.096169498964206e-05, + "loss": 0.9578, + "step": 60880 + }, + { + "epoch": 0.3890088547589538, + "grad_norm": 1.0959383249282837, + "learning_rate": 9.095881734416742e-05, + "loss": 0.7354, + "step": 60890 + }, + { + "epoch": 0.3890727419086925, + "grad_norm": 1.1300466060638428, + "learning_rate": 9.095593928620203e-05, + "loss": 1.2792, + "step": 60900 + }, + { + "epoch": 0.38913662905843116, + "grad_norm": 0.9118770360946655, + "learning_rate": 9.095306081577491e-05, + "loss": 0.8323, + "step": 60910 + }, + { + "epoch": 0.38920051620816987, + "grad_norm": 0.5770663022994995, + "learning_rate": 9.095018193291504e-05, + "loss": 0.7813, + "step": 60920 + }, + { + "epoch": 0.3892644033579086, + "grad_norm": 1.2142269611358643, + "learning_rate": 9.094730263765141e-05, + "loss": 0.6744, + "step": 60930 + }, + { + "epoch": 0.3893282905076473, + "grad_norm": 0.6319569945335388, + "learning_rate": 9.094442293001301e-05, + "loss": 0.7512, + "step": 60940 + }, + { + "epoch": 0.389392177657386, + "grad_norm": 0.9332210421562195, + "learning_rate": 9.094154281002884e-05, + "loss": 0.9045, + "step": 60950 + }, + { + "epoch": 0.3894560648071247, + "grad_norm": 0.786271870136261, + "learning_rate": 9.093866227772794e-05, + "loss": 1.1151, + "step": 60960 + }, + { + "epoch": 0.3895199519568634, + "grad_norm": 0.8566588163375854, + "learning_rate": 9.093578133313928e-05, + "loss": 0.7992, + "step": 60970 + }, + { + "epoch": 0.3895838391066021, + "grad_norm": 0.7604480385780334, + "learning_rate": 9.093289997629188e-05, + "loss": 0.924, + "step": 60980 + }, + { + "epoch": 0.3896477262563408, + "grad_norm": 1.0149980783462524, + "learning_rate": 9.093001820721479e-05, + "loss": 1.0535, + "step": 60990 + }, + { + "epoch": 0.3897116134060795, + "grad_norm": 1.085911512374878, + "learning_rate": 9.092713602593699e-05, + "loss": 0.7629, + "step": 61000 + }, + { + "epoch": 0.3897755005558182, + "grad_norm": 1.1118038892745972, + "learning_rate": 9.092425343248753e-05, + "loss": 0.8315, + "step": 61010 + }, + { + "epoch": 
0.38983938770555693, + "grad_norm": 0.49953410029411316, + "learning_rate": 9.092137042689542e-05, + "loss": 0.8272, + "step": 61020 + }, + { + "epoch": 0.3899032748552956, + "grad_norm": 0.703426718711853, + "learning_rate": 9.091848700918973e-05, + "loss": 0.8759, + "step": 61030 + }, + { + "epoch": 0.3899671620050343, + "grad_norm": 1.1554392576217651, + "learning_rate": 9.091560317939946e-05, + "loss": 0.9506, + "step": 61040 + }, + { + "epoch": 0.390031049154773, + "grad_norm": 0.745389997959137, + "learning_rate": 9.091271893755367e-05, + "loss": 0.7726, + "step": 61050 + }, + { + "epoch": 0.3900949363045117, + "grad_norm": 0.6152491569519043, + "learning_rate": 9.090983428368141e-05, + "loss": 0.896, + "step": 61060 + }, + { + "epoch": 0.3901588234542504, + "grad_norm": 2.3798322677612305, + "learning_rate": 9.09069492178117e-05, + "loss": 0.9652, + "step": 61070 + }, + { + "epoch": 0.3902227106039891, + "grad_norm": 0.8589335680007935, + "learning_rate": 9.090435230629522e-05, + "loss": 0.772, + "step": 61080 + }, + { + "epoch": 0.3902865977537278, + "grad_norm": 0.7509768009185791, + "learning_rate": 9.090146645771047e-05, + "loss": 0.9196, + "step": 61090 + }, + { + "epoch": 0.3903504849034665, + "grad_norm": 0.9738487005233765, + "learning_rate": 9.089858019721258e-05, + "loss": 0.8836, + "step": 61100 + }, + { + "epoch": 0.3904143720532052, + "grad_norm": 2.166499137878418, + "learning_rate": 9.089569352483061e-05, + "loss": 1.2521, + "step": 61110 + }, + { + "epoch": 0.39047825920294393, + "grad_norm": 0.9337096214294434, + "learning_rate": 9.089280644059361e-05, + "loss": 0.8933, + "step": 61120 + }, + { + "epoch": 0.39054214635268264, + "grad_norm": 1.1011388301849365, + "learning_rate": 9.088991894453069e-05, + "loss": 0.7827, + "step": 61130 + }, + { + "epoch": 0.39060603350242135, + "grad_norm": 1.5726940631866455, + "learning_rate": 9.08870310366709e-05, + "loss": 1.0105, + "step": 61140 + }, + { + "epoch": 0.39066992065216005, + "grad_norm": 0.6980756521224976, + "learning_rate": 9.088414271704334e-05, + "loss": 1.1352, + "step": 61150 + }, + { + "epoch": 0.3907338078018987, + "grad_norm": 0.9901998043060303, + "learning_rate": 9.088125398567708e-05, + "loss": 0.8634, + "step": 61160 + }, + { + "epoch": 0.3907976949516374, + "grad_norm": 0.7848410606384277, + "learning_rate": 9.087836484260125e-05, + "loss": 0.968, + "step": 61170 + }, + { + "epoch": 0.3908615821013761, + "grad_norm": 2.4346492290496826, + "learning_rate": 9.08754752878449e-05, + "loss": 0.8797, + "step": 61180 + }, + { + "epoch": 0.3909254692511148, + "grad_norm": 0.5621653199195862, + "learning_rate": 9.087258532143716e-05, + "loss": 0.8708, + "step": 61190 + }, + { + "epoch": 0.3909893564008535, + "grad_norm": 0.6077272891998291, + "learning_rate": 9.086969494340714e-05, + "loss": 0.9137, + "step": 61200 + }, + { + "epoch": 0.39105324355059223, + "grad_norm": 1.679137945175171, + "learning_rate": 9.08668041537839e-05, + "loss": 1.0507, + "step": 61210 + }, + { + "epoch": 0.39111713070033094, + "grad_norm": 0.7337985634803772, + "learning_rate": 9.086391295259662e-05, + "loss": 0.7978, + "step": 61220 + }, + { + "epoch": 0.39118101785006965, + "grad_norm": 0.8496336340904236, + "learning_rate": 9.086102133987436e-05, + "loss": 1.0827, + "step": 61230 + }, + { + "epoch": 0.39124490499980835, + "grad_norm": 1.5202635526657104, + "learning_rate": 9.085812931564627e-05, + "loss": 0.7946, + "step": 61240 + }, + { + "epoch": 0.39130879214954706, + "grad_norm": 1.236046314239502, + "learning_rate": 
9.085523687994148e-05, + "loss": 0.6731, + "step": 61250 + }, + { + "epoch": 0.39137267929928576, + "grad_norm": 0.6897780895233154, + "learning_rate": 9.085234403278912e-05, + "loss": 0.9761, + "step": 61260 + }, + { + "epoch": 0.39143656644902447, + "grad_norm": 2.731182098388672, + "learning_rate": 9.08494507742183e-05, + "loss": 0.6203, + "step": 61270 + }, + { + "epoch": 0.3915004535987631, + "grad_norm": 1.2654629945755005, + "learning_rate": 9.084655710425817e-05, + "loss": 0.9412, + "step": 61280 + }, + { + "epoch": 0.3915643407485018, + "grad_norm": 0.9175102114677429, + "learning_rate": 9.084366302293787e-05, + "loss": 0.7672, + "step": 61290 + }, + { + "epoch": 0.39162822789824053, + "grad_norm": 0.8177767395973206, + "learning_rate": 9.084076853028656e-05, + "loss": 0.9016, + "step": 61300 + }, + { + "epoch": 0.39169211504797924, + "grad_norm": 0.6506124138832092, + "learning_rate": 9.083787362633336e-05, + "loss": 0.7544, + "step": 61310 + }, + { + "epoch": 0.39175600219771795, + "grad_norm": 0.7509700655937195, + "learning_rate": 9.083497831110745e-05, + "loss": 0.6952, + "step": 61320 + }, + { + "epoch": 0.39181988934745665, + "grad_norm": 0.8444516658782959, + "learning_rate": 9.0832082584638e-05, + "loss": 0.7465, + "step": 61330 + }, + { + "epoch": 0.39188377649719536, + "grad_norm": 0.7589380145072937, + "learning_rate": 9.082918644695413e-05, + "loss": 0.7664, + "step": 61340 + }, + { + "epoch": 0.39194766364693406, + "grad_norm": 4.895397186279297, + "learning_rate": 9.082628989808504e-05, + "loss": 1.0256, + "step": 61350 + }, + { + "epoch": 0.39201155079667277, + "grad_norm": 0.7632289528846741, + "learning_rate": 9.082339293805988e-05, + "loss": 1.2474, + "step": 61360 + }, + { + "epoch": 0.3920754379464115, + "grad_norm": 0.765465259552002, + "learning_rate": 9.082049556690786e-05, + "loss": 0.9572, + "step": 61370 + }, + { + "epoch": 0.3921393250961502, + "grad_norm": 0.6176701188087463, + "learning_rate": 9.081759778465811e-05, + "loss": 0.8701, + "step": 61380 + }, + { + "epoch": 0.3922032122458889, + "grad_norm": 1.1706359386444092, + "learning_rate": 9.081469959133986e-05, + "loss": 0.8748, + "step": 61390 + }, + { + "epoch": 0.39226709939562754, + "grad_norm": 1.032160758972168, + "learning_rate": 9.081180098698225e-05, + "loss": 0.8505, + "step": 61400 + }, + { + "epoch": 0.39233098654536624, + "grad_norm": 1.0283243656158447, + "learning_rate": 9.080890197161452e-05, + "loss": 0.7096, + "step": 61410 + }, + { + "epoch": 0.39239487369510495, + "grad_norm": 1.100449800491333, + "learning_rate": 9.080600254526583e-05, + "loss": 0.9363, + "step": 61420 + }, + { + "epoch": 0.39245876084484366, + "grad_norm": 1.9551182985305786, + "learning_rate": 9.080310270796539e-05, + "loss": 0.795, + "step": 61430 + }, + { + "epoch": 0.39252264799458236, + "grad_norm": 1.056577205657959, + "learning_rate": 9.080020245974241e-05, + "loss": 0.8075, + "step": 61440 + }, + { + "epoch": 0.39258653514432107, + "grad_norm": 0.6849813461303711, + "learning_rate": 9.07973018006261e-05, + "loss": 0.9542, + "step": 61450 + }, + { + "epoch": 0.3926504222940598, + "grad_norm": 0.8313121199607849, + "learning_rate": 9.079440073064567e-05, + "loss": 1.0857, + "step": 61460 + }, + { + "epoch": 0.3927143094437985, + "grad_norm": 0.7464626431465149, + "learning_rate": 9.079149924983031e-05, + "loss": 0.6962, + "step": 61470 + }, + { + "epoch": 0.3927781965935372, + "grad_norm": 1.59227454662323, + "learning_rate": 9.078859735820928e-05, + "loss": 0.8309, + "step": 61480 + }, + { + "epoch": 
0.3928420837432759, + "grad_norm": 0.6378403306007385, + "learning_rate": 9.078569505581178e-05, + "loss": 0.7235, + "step": 61490 + }, + { + "epoch": 0.3929059708930146, + "grad_norm": 0.43592649698257446, + "learning_rate": 9.078279234266705e-05, + "loss": 0.8301, + "step": 61500 + }, + { + "epoch": 0.3929698580427533, + "grad_norm": 0.9196266531944275, + "learning_rate": 9.077988921880431e-05, + "loss": 0.8455, + "step": 61510 + }, + { + "epoch": 0.39303374519249196, + "grad_norm": 0.7270370721817017, + "learning_rate": 9.077698568425283e-05, + "loss": 0.8118, + "step": 61520 + }, + { + "epoch": 0.39309763234223066, + "grad_norm": 0.693191647529602, + "learning_rate": 9.07740817390418e-05, + "loss": 0.9402, + "step": 61530 + }, + { + "epoch": 0.39316151949196937, + "grad_norm": 0.7091450691223145, + "learning_rate": 9.077117738320051e-05, + "loss": 0.7799, + "step": 61540 + }, + { + "epoch": 0.3932254066417081, + "grad_norm": 1.02108633518219, + "learning_rate": 9.07682726167582e-05, + "loss": 0.9665, + "step": 61550 + }, + { + "epoch": 0.3932892937914468, + "grad_norm": 1.1987274885177612, + "learning_rate": 9.07653674397441e-05, + "loss": 0.9019, + "step": 61560 + }, + { + "epoch": 0.3933531809411855, + "grad_norm": 0.7170557379722595, + "learning_rate": 9.076246185218747e-05, + "loss": 0.9895, + "step": 61570 + }, + { + "epoch": 0.3934170680909242, + "grad_norm": 1.2851723432540894, + "learning_rate": 9.07595558541176e-05, + "loss": 0.8972, + "step": 61580 + }, + { + "epoch": 0.3934809552406629, + "grad_norm": 0.7113538384437561, + "learning_rate": 9.075664944556374e-05, + "loss": 0.8101, + "step": 61590 + }, + { + "epoch": 0.3935448423904016, + "grad_norm": 1.113052487373352, + "learning_rate": 9.075374262655516e-05, + "loss": 0.8718, + "step": 61600 + }, + { + "epoch": 0.3936087295401403, + "grad_norm": 0.9161044955253601, + "learning_rate": 9.075083539712113e-05, + "loss": 0.8209, + "step": 61610 + }, + { + "epoch": 0.393672616689879, + "grad_norm": 0.9524838328361511, + "learning_rate": 9.074792775729096e-05, + "loss": 0.9234, + "step": 61620 + }, + { + "epoch": 0.3937365038396177, + "grad_norm": 1.2486933469772339, + "learning_rate": 9.074501970709385e-05, + "loss": 0.7753, + "step": 61630 + }, + { + "epoch": 0.3938003909893564, + "grad_norm": 0.8280370831489563, + "learning_rate": 9.07421112465592e-05, + "loss": 1.0375, + "step": 61640 + }, + { + "epoch": 0.3938642781390951, + "grad_norm": 0.9013057947158813, + "learning_rate": 9.07392023757162e-05, + "loss": 0.9043, + "step": 61650 + }, + { + "epoch": 0.3939281652888338, + "grad_norm": 0.9092079401016235, + "learning_rate": 9.073629309459422e-05, + "loss": 0.9026, + "step": 61660 + }, + { + "epoch": 0.3939920524385725, + "grad_norm": 1.4664134979248047, + "learning_rate": 9.07333834032225e-05, + "loss": 0.6136, + "step": 61670 + }, + { + "epoch": 0.3940559395883112, + "grad_norm": 1.106016755104065, + "learning_rate": 9.07304733016304e-05, + "loss": 1.0314, + "step": 61680 + }, + { + "epoch": 0.3941198267380499, + "grad_norm": 0.8790785670280457, + "learning_rate": 9.072756278984717e-05, + "loss": 1.0497, + "step": 61690 + }, + { + "epoch": 0.3941837138877886, + "grad_norm": 1.431808590888977, + "learning_rate": 9.072465186790215e-05, + "loss": 0.9975, + "step": 61700 + }, + { + "epoch": 0.3942476010375273, + "grad_norm": 0.8433964252471924, + "learning_rate": 9.072174053582468e-05, + "loss": 0.6958, + "step": 61710 + }, + { + "epoch": 0.394311488187266, + "grad_norm": 0.829806923866272, + "learning_rate": 
9.071882879364402e-05, + "loss": 1.0986, + "step": 61720 + }, + { + "epoch": 0.39437537533700473, + "grad_norm": 0.8924597501754761, + "learning_rate": 9.071591664138954e-05, + "loss": 0.9314, + "step": 61730 + }, + { + "epoch": 0.39443926248674344, + "grad_norm": 0.7619827389717102, + "learning_rate": 9.071300407909056e-05, + "loss": 0.9549, + "step": 61740 + }, + { + "epoch": 0.39450314963648214, + "grad_norm": 0.6050899028778076, + "learning_rate": 9.07100911067764e-05, + "loss": 0.9636, + "step": 61750 + }, + { + "epoch": 0.3945670367862208, + "grad_norm": 1.1481192111968994, + "learning_rate": 9.070717772447641e-05, + "loss": 0.743, + "step": 61760 + }, + { + "epoch": 0.3946309239359595, + "grad_norm": 1.505147099494934, + "learning_rate": 9.070426393221993e-05, + "loss": 0.7202, + "step": 61770 + }, + { + "epoch": 0.3946948110856982, + "grad_norm": 1.0512402057647705, + "learning_rate": 9.070134973003628e-05, + "loss": 0.8743, + "step": 61780 + }, + { + "epoch": 0.3947586982354369, + "grad_norm": 0.7054274082183838, + "learning_rate": 9.069843511795484e-05, + "loss": 0.9366, + "step": 61790 + }, + { + "epoch": 0.3948225853851756, + "grad_norm": 0.6536909937858582, + "learning_rate": 9.069552009600494e-05, + "loss": 0.7258, + "step": 61800 + }, + { + "epoch": 0.3948864725349143, + "grad_norm": 0.7718044519424438, + "learning_rate": 9.069260466421596e-05, + "loss": 1.0622, + "step": 61810 + }, + { + "epoch": 0.39495035968465303, + "grad_norm": 0.991255521774292, + "learning_rate": 9.068968882261723e-05, + "loss": 1.0272, + "step": 61820 + }, + { + "epoch": 0.39501424683439174, + "grad_norm": 5.583859443664551, + "learning_rate": 9.068677257123815e-05, + "loss": 0.8138, + "step": 61830 + }, + { + "epoch": 0.39507813398413044, + "grad_norm": 3.004866123199463, + "learning_rate": 9.068385591010805e-05, + "loss": 1.1612, + "step": 61840 + }, + { + "epoch": 0.39514202113386915, + "grad_norm": 0.7518250346183777, + "learning_rate": 9.068093883925633e-05, + "loss": 0.8184, + "step": 61850 + }, + { + "epoch": 0.39520590828360785, + "grad_norm": 1.5399583578109741, + "learning_rate": 9.067802135871237e-05, + "loss": 0.8756, + "step": 61860 + }, + { + "epoch": 0.39526979543334656, + "grad_norm": 2.1497974395751953, + "learning_rate": 9.067510346850554e-05, + "loss": 1.1971, + "step": 61870 + }, + { + "epoch": 0.3953336825830852, + "grad_norm": 0.8201958537101746, + "learning_rate": 9.067218516866523e-05, + "loss": 1.0288, + "step": 61880 + }, + { + "epoch": 0.3953975697328239, + "grad_norm": 1.203514575958252, + "learning_rate": 9.066926645922084e-05, + "loss": 1.0717, + "step": 61890 + }, + { + "epoch": 0.3954614568825626, + "grad_norm": 0.8252068161964417, + "learning_rate": 9.066634734020174e-05, + "loss": 0.8844, + "step": 61900 + }, + { + "epoch": 0.39552534403230133, + "grad_norm": 0.7639890313148499, + "learning_rate": 9.066342781163733e-05, + "loss": 0.907, + "step": 61910 + }, + { + "epoch": 0.39558923118204004, + "grad_norm": 0.8897015452384949, + "learning_rate": 9.066050787355704e-05, + "loss": 0.7727, + "step": 61920 + }, + { + "epoch": 0.39565311833177874, + "grad_norm": 0.7301774024963379, + "learning_rate": 9.065758752599026e-05, + "loss": 0.9699, + "step": 61930 + }, + { + "epoch": 0.39571700548151745, + "grad_norm": 1.1246193647384644, + "learning_rate": 9.065466676896639e-05, + "loss": 0.7621, + "step": 61940 + }, + { + "epoch": 0.39578089263125615, + "grad_norm": 0.7929351329803467, + "learning_rate": 9.065174560251487e-05, + "loss": 0.8905, + "step": 61950 + }, + { + 
"epoch": 0.39584477978099486, + "grad_norm": 1.0358200073242188, + "learning_rate": 9.064882402666508e-05, + "loss": 0.7801, + "step": 61960 + }, + { + "epoch": 0.39590866693073357, + "grad_norm": 0.7412776947021484, + "learning_rate": 9.064590204144647e-05, + "loss": 0.9169, + "step": 61970 + }, + { + "epoch": 0.39597255408047227, + "grad_norm": 0.7912229895591736, + "learning_rate": 9.064297964688848e-05, + "loss": 0.7336, + "step": 61980 + }, + { + "epoch": 0.396036441230211, + "grad_norm": 0.7048013210296631, + "learning_rate": 9.064005684302051e-05, + "loss": 0.7669, + "step": 61990 + }, + { + "epoch": 0.3961003283799497, + "grad_norm": 0.594420850276947, + "learning_rate": 9.063713362987201e-05, + "loss": 0.858, + "step": 62000 + }, + { + "epoch": 0.39616421552968833, + "grad_norm": 0.6770893931388855, + "learning_rate": 9.063421000747243e-05, + "loss": 0.931, + "step": 62010 + }, + { + "epoch": 0.39622810267942704, + "grad_norm": 0.9604712128639221, + "learning_rate": 9.06312859758512e-05, + "loss": 0.8397, + "step": 62020 + }, + { + "epoch": 0.39629198982916575, + "grad_norm": 0.693006157875061, + "learning_rate": 9.062836153503775e-05, + "loss": 0.9519, + "step": 62030 + }, + { + "epoch": 0.39635587697890445, + "grad_norm": 0.6312511563301086, + "learning_rate": 9.062543668506156e-05, + "loss": 0.9113, + "step": 62040 + }, + { + "epoch": 0.39641976412864316, + "grad_norm": 0.7017596364021301, + "learning_rate": 9.062251142595208e-05, + "loss": 0.6917, + "step": 62050 + }, + { + "epoch": 0.39648365127838187, + "grad_norm": 0.5928127765655518, + "learning_rate": 9.061958575773876e-05, + "loss": 0.9722, + "step": 62060 + }, + { + "epoch": 0.39654753842812057, + "grad_norm": 1.742937445640564, + "learning_rate": 9.06166596804511e-05, + "loss": 0.9624, + "step": 62070 + }, + { + "epoch": 0.3966114255778593, + "grad_norm": 0.7170588374137878, + "learning_rate": 9.06137331941185e-05, + "loss": 1.2726, + "step": 62080 + }, + { + "epoch": 0.396675312727598, + "grad_norm": 1.1127740144729614, + "learning_rate": 9.06108062987705e-05, + "loss": 1.0994, + "step": 62090 + }, + { + "epoch": 0.3967391998773367, + "grad_norm": 1.1925806999206543, + "learning_rate": 9.060787899443652e-05, + "loss": 0.8594, + "step": 62100 + }, + { + "epoch": 0.3968030870270754, + "grad_norm": 0.9429210424423218, + "learning_rate": 9.060495128114607e-05, + "loss": 0.8756, + "step": 62110 + }, + { + "epoch": 0.3968669741768141, + "grad_norm": 0.5963910222053528, + "learning_rate": 9.060202315892866e-05, + "loss": 0.9714, + "step": 62120 + }, + { + "epoch": 0.39693086132655275, + "grad_norm": 1.1170175075531006, + "learning_rate": 9.059909462781373e-05, + "loss": 0.8631, + "step": 62130 + }, + { + "epoch": 0.39699474847629146, + "grad_norm": 0.8226674199104309, + "learning_rate": 9.05961656878308e-05, + "loss": 0.8723, + "step": 62140 + }, + { + "epoch": 0.39705863562603017, + "grad_norm": 0.8132166862487793, + "learning_rate": 9.059323633900936e-05, + "loss": 0.8302, + "step": 62150 + }, + { + "epoch": 0.39712252277576887, + "grad_norm": 1.661969542503357, + "learning_rate": 9.059030658137892e-05, + "loss": 0.6955, + "step": 62160 + }, + { + "epoch": 0.3971864099255076, + "grad_norm": 1.1347659826278687, + "learning_rate": 9.058737641496896e-05, + "loss": 0.8491, + "step": 62170 + }, + { + "epoch": 0.3972502970752463, + "grad_norm": 0.8600234985351562, + "learning_rate": 9.058444583980901e-05, + "loss": 0.8166, + "step": 62180 + }, + { + "epoch": 0.397314184224985, + "grad_norm": 1.037380337715149, + "learning_rate": 
9.058151485592858e-05, + "loss": 1.0938, + "step": 62190 + }, + { + "epoch": 0.3973780713747237, + "grad_norm": 0.8460977673530579, + "learning_rate": 9.057858346335719e-05, + "loss": 0.7327, + "step": 62200 + }, + { + "epoch": 0.3974419585244624, + "grad_norm": 1.0163371562957764, + "learning_rate": 9.057565166212436e-05, + "loss": 0.8889, + "step": 62210 + }, + { + "epoch": 0.3975058456742011, + "grad_norm": 0.8460572361946106, + "learning_rate": 9.057271945225962e-05, + "loss": 0.9403, + "step": 62220 + }, + { + "epoch": 0.3975697328239398, + "grad_norm": 0.8837966322898865, + "learning_rate": 9.056978683379249e-05, + "loss": 0.8961, + "step": 62230 + }, + { + "epoch": 0.3976336199736785, + "grad_norm": 2.452587127685547, + "learning_rate": 9.056685380675251e-05, + "loss": 0.7814, + "step": 62240 + }, + { + "epoch": 0.39769750712341717, + "grad_norm": 1.0550041198730469, + "learning_rate": 9.056392037116922e-05, + "loss": 0.944, + "step": 62250 + }, + { + "epoch": 0.3977613942731559, + "grad_norm": 0.6195732951164246, + "learning_rate": 9.056098652707215e-05, + "loss": 0.8217, + "step": 62260 + }, + { + "epoch": 0.3978252814228946, + "grad_norm": 0.9907122254371643, + "learning_rate": 9.055805227449086e-05, + "loss": 0.9645, + "step": 62270 + }, + { + "epoch": 0.3978891685726333, + "grad_norm": 0.8403761386871338, + "learning_rate": 9.05551176134549e-05, + "loss": 0.8914, + "step": 62280 + }, + { + "epoch": 0.397953055722372, + "grad_norm": 0.880027174949646, + "learning_rate": 9.055218254399382e-05, + "loss": 0.8041, + "step": 62290 + }, + { + "epoch": 0.3980169428721107, + "grad_norm": 1.1135274171829224, + "learning_rate": 9.054924706613716e-05, + "loss": 0.9497, + "step": 62300 + }, + { + "epoch": 0.3980808300218494, + "grad_norm": 0.6633336544036865, + "learning_rate": 9.054631117991453e-05, + "loss": 0.9701, + "step": 62310 + }, + { + "epoch": 0.3981447171715881, + "grad_norm": 0.9444105625152588, + "learning_rate": 9.054337488535546e-05, + "loss": 0.7332, + "step": 62320 + }, + { + "epoch": 0.3982086043213268, + "grad_norm": 1.4530045986175537, + "learning_rate": 9.054043818248952e-05, + "loss": 0.7233, + "step": 62330 + }, + { + "epoch": 0.3982724914710655, + "grad_norm": 0.8733393549919128, + "learning_rate": 9.053750107134631e-05, + "loss": 0.9144, + "step": 62340 + }, + { + "epoch": 0.39833637862080423, + "grad_norm": 0.6644203662872314, + "learning_rate": 9.053456355195537e-05, + "loss": 0.874, + "step": 62350 + }, + { + "epoch": 0.39840026577054294, + "grad_norm": 0.9878085851669312, + "learning_rate": 9.053162562434633e-05, + "loss": 1.2423, + "step": 62360 + }, + { + "epoch": 0.3984641529202816, + "grad_norm": 1.3864879608154297, + "learning_rate": 9.052868728854876e-05, + "loss": 0.9242, + "step": 62370 + }, + { + "epoch": 0.3985280400700203, + "grad_norm": 0.777885377407074, + "learning_rate": 9.052574854459223e-05, + "loss": 0.9149, + "step": 62380 + }, + { + "epoch": 0.398591927219759, + "grad_norm": 0.8761781454086304, + "learning_rate": 9.052280939250636e-05, + "loss": 1.0996, + "step": 62390 + }, + { + "epoch": 0.3986558143694977, + "grad_norm": 0.5955054759979248, + "learning_rate": 9.051986983232073e-05, + "loss": 0.8387, + "step": 62400 + }, + { + "epoch": 0.3987197015192364, + "grad_norm": 0.9722325205802917, + "learning_rate": 9.051692986406496e-05, + "loss": 1.0697, + "step": 62410 + }, + { + "epoch": 0.3987835886689751, + "grad_norm": 0.6643452048301697, + "learning_rate": 9.051398948776868e-05, + "loss": 1.2275, + "step": 62420 + }, + { + "epoch": 
0.3988474758187138, + "grad_norm": 1.9424246549606323, + "learning_rate": 9.051104870346146e-05, + "loss": 0.8924, + "step": 62430 + }, + { + "epoch": 0.39891136296845253, + "grad_norm": 0.771974503993988, + "learning_rate": 9.050810751117292e-05, + "loss": 0.7818, + "step": 62440 + }, + { + "epoch": 0.39897525011819124, + "grad_norm": 1.0370486974716187, + "learning_rate": 9.05051659109327e-05, + "loss": 0.9449, + "step": 62450 + }, + { + "epoch": 0.39903913726792994, + "grad_norm": 1.289140224456787, + "learning_rate": 9.050222390277041e-05, + "loss": 0.7505, + "step": 62460 + }, + { + "epoch": 0.39910302441766865, + "grad_norm": 0.7696613669395447, + "learning_rate": 9.049928148671569e-05, + "loss": 0.9424, + "step": 62470 + }, + { + "epoch": 0.39916691156740736, + "grad_norm": 2.606376886367798, + "learning_rate": 9.049633866279819e-05, + "loss": 0.9175, + "step": 62480 + }, + { + "epoch": 0.399230798717146, + "grad_norm": 0.9909952282905579, + "learning_rate": 9.049339543104751e-05, + "loss": 0.6305, + "step": 62490 + }, + { + "epoch": 0.3992946858668847, + "grad_norm": 0.9084514379501343, + "learning_rate": 9.04904517914933e-05, + "loss": 0.8609, + "step": 62500 + }, + { + "epoch": 0.3993585730166234, + "grad_norm": 1.2386984825134277, + "learning_rate": 9.048750774416521e-05, + "loss": 0.863, + "step": 62510 + }, + { + "epoch": 0.3994224601663621, + "grad_norm": 1.0682573318481445, + "learning_rate": 9.04845632890929e-05, + "loss": 0.9319, + "step": 62520 + }, + { + "epoch": 0.39948634731610083, + "grad_norm": 0.7610236406326294, + "learning_rate": 9.048161842630602e-05, + "loss": 0.7901, + "step": 62530 + }, + { + "epoch": 0.39955023446583954, + "grad_norm": 0.8096383810043335, + "learning_rate": 9.04786731558342e-05, + "loss": 0.8542, + "step": 62540 + }, + { + "epoch": 0.39961412161557824, + "grad_norm": 0.6308041214942932, + "learning_rate": 9.047572747770713e-05, + "loss": 0.9005, + "step": 62550 + }, + { + "epoch": 0.39967800876531695, + "grad_norm": 1.0608285665512085, + "learning_rate": 9.047278139195447e-05, + "loss": 0.9082, + "step": 62560 + }, + { + "epoch": 0.39974189591505566, + "grad_norm": 1.0790623426437378, + "learning_rate": 9.04698348986059e-05, + "loss": 0.9912, + "step": 62570 + }, + { + "epoch": 0.39980578306479436, + "grad_norm": 0.5858973860740662, + "learning_rate": 9.046688799769107e-05, + "loss": 0.7241, + "step": 62580 + }, + { + "epoch": 0.39986967021453307, + "grad_norm": 0.7396795153617859, + "learning_rate": 9.046394068923967e-05, + "loss": 0.8767, + "step": 62590 + }, + { + "epoch": 0.3999335573642718, + "grad_norm": 0.5871313214302063, + "learning_rate": 9.046099297328138e-05, + "loss": 0.6491, + "step": 62600 + }, + { + "epoch": 0.3999974445140104, + "grad_norm": 1.0462760925292969, + "learning_rate": 9.045804484984588e-05, + "loss": 0.9854, + "step": 62610 + }, + { + "epoch": 0.40006133166374913, + "grad_norm": 1.0738905668258667, + "learning_rate": 9.045509631896287e-05, + "loss": 0.8662, + "step": 62620 + }, + { + "epoch": 0.40012521881348784, + "grad_norm": 0.7057567834854126, + "learning_rate": 9.045214738066206e-05, + "loss": 0.736, + "step": 62630 + }, + { + "epoch": 0.40018910596322654, + "grad_norm": 0.9611753225326538, + "learning_rate": 9.044919803497312e-05, + "loss": 0.794, + "step": 62640 + }, + { + "epoch": 0.40025299311296525, + "grad_norm": 0.9139066934585571, + "learning_rate": 9.044624828192573e-05, + "loss": 0.7416, + "step": 62650 + }, + { + "epoch": 0.40031688026270396, + "grad_norm": 0.7299910187721252, + "learning_rate": 
9.044329812154966e-05, + "loss": 1.1855, + "step": 62660 + }, + { + "epoch": 0.40038076741244266, + "grad_norm": 1.0329594612121582, + "learning_rate": 9.04403475538746e-05, + "loss": 0.7538, + "step": 62670 + }, + { + "epoch": 0.40044465456218137, + "grad_norm": 0.8256815671920776, + "learning_rate": 9.043739657893025e-05, + "loss": 0.8794, + "step": 62680 + }, + { + "epoch": 0.4005085417119201, + "grad_norm": 0.7287086248397827, + "learning_rate": 9.043444519674631e-05, + "loss": 1.0395, + "step": 62690 + }, + { + "epoch": 0.4005724288616588, + "grad_norm": 0.7934675216674805, + "learning_rate": 9.043149340735253e-05, + "loss": 0.7567, + "step": 62700 + }, + { + "epoch": 0.4006363160113975, + "grad_norm": 0.5604273676872253, + "learning_rate": 9.042854121077865e-05, + "loss": 1.0449, + "step": 62710 + }, + { + "epoch": 0.4007002031611362, + "grad_norm": 1.5315760374069214, + "learning_rate": 9.042558860705436e-05, + "loss": 1.1271, + "step": 62720 + }, + { + "epoch": 0.40076409031087484, + "grad_norm": 1.0402827262878418, + "learning_rate": 9.042263559620945e-05, + "loss": 0.8949, + "step": 62730 + }, + { + "epoch": 0.40082797746061355, + "grad_norm": 0.6324530243873596, + "learning_rate": 9.041968217827363e-05, + "loss": 0.7596, + "step": 62740 + }, + { + "epoch": 0.40089186461035226, + "grad_norm": 0.6953981518745422, + "learning_rate": 9.041672835327661e-05, + "loss": 0.6683, + "step": 62750 + }, + { + "epoch": 0.40095575176009096, + "grad_norm": 0.8645387887954712, + "learning_rate": 9.04137741212482e-05, + "loss": 1.1533, + "step": 62760 + }, + { + "epoch": 0.40101963890982967, + "grad_norm": 0.8752760291099548, + "learning_rate": 9.04108194822181e-05, + "loss": 0.9688, + "step": 62770 + }, + { + "epoch": 0.4010835260595684, + "grad_norm": 1.0620766878128052, + "learning_rate": 9.040786443621609e-05, + "loss": 0.9483, + "step": 62780 + }, + { + "epoch": 0.4011474132093071, + "grad_norm": 0.9805885553359985, + "learning_rate": 9.040490898327194e-05, + "loss": 0.9889, + "step": 62790 + }, + { + "epoch": 0.4012113003590458, + "grad_norm": 1.2980453968048096, + "learning_rate": 9.04019531234154e-05, + "loss": 1.0088, + "step": 62800 + }, + { + "epoch": 0.4012751875087845, + "grad_norm": 0.6901305913925171, + "learning_rate": 9.039899685667624e-05, + "loss": 0.841, + "step": 62810 + }, + { + "epoch": 0.4013390746585232, + "grad_norm": 0.6811827421188354, + "learning_rate": 9.039604018308423e-05, + "loss": 0.7313, + "step": 62820 + }, + { + "epoch": 0.4014029618082619, + "grad_norm": 1.0031507015228271, + "learning_rate": 9.039308310266914e-05, + "loss": 0.9193, + "step": 62830 + }, + { + "epoch": 0.4014668489580006, + "grad_norm": 0.66957688331604, + "learning_rate": 9.039012561546076e-05, + "loss": 0.9917, + "step": 62840 + }, + { + "epoch": 0.4015307361077393, + "grad_norm": 1.3045806884765625, + "learning_rate": 9.038716772148888e-05, + "loss": 0.8695, + "step": 62850 + }, + { + "epoch": 0.40159462325747797, + "grad_norm": 0.8219857811927795, + "learning_rate": 9.038420942078327e-05, + "loss": 0.913, + "step": 62860 + }, + { + "epoch": 0.4016585104072167, + "grad_norm": 1.7274596691131592, + "learning_rate": 9.038125071337374e-05, + "loss": 0.9524, + "step": 62870 + }, + { + "epoch": 0.4017223975569554, + "grad_norm": 1.0028507709503174, + "learning_rate": 9.037829159929008e-05, + "loss": 0.8358, + "step": 62880 + }, + { + "epoch": 0.4017862847066941, + "grad_norm": 1.3326009511947632, + "learning_rate": 9.03753320785621e-05, + "loss": 1.1543, + "step": 62890 + }, + { + "epoch": 
0.4018501718564328, + "grad_norm": 0.4937954246997833, + "learning_rate": 9.037237215121958e-05, + "loss": 1.0826, + "step": 62900 + }, + { + "epoch": 0.4019140590061715, + "grad_norm": 1.1819350719451904, + "learning_rate": 9.036941181729236e-05, + "loss": 0.8164, + "step": 62910 + }, + { + "epoch": 0.4019779461559102, + "grad_norm": 0.710355281829834, + "learning_rate": 9.036645107681023e-05, + "loss": 0.8995, + "step": 62920 + }, + { + "epoch": 0.4020418333056489, + "grad_norm": 0.6797157526016235, + "learning_rate": 9.036348992980301e-05, + "loss": 0.9323, + "step": 62930 + }, + { + "epoch": 0.4021057204553876, + "grad_norm": 0.6142218112945557, + "learning_rate": 9.036052837630054e-05, + "loss": 1.0316, + "step": 62940 + }, + { + "epoch": 0.4021696076051263, + "grad_norm": 0.5623320937156677, + "learning_rate": 9.035756641633264e-05, + "loss": 1.0354, + "step": 62950 + }, + { + "epoch": 0.40223349475486503, + "grad_norm": 0.6780490279197693, + "learning_rate": 9.03546040499291e-05, + "loss": 0.8209, + "step": 62960 + }, + { + "epoch": 0.40229738190460373, + "grad_norm": 0.8299171328544617, + "learning_rate": 9.035164127711981e-05, + "loss": 0.9596, + "step": 62970 + }, + { + "epoch": 0.4023612690543424, + "grad_norm": 0.6555722951889038, + "learning_rate": 9.03486780979346e-05, + "loss": 1.0825, + "step": 62980 + }, + { + "epoch": 0.4024251562040811, + "grad_norm": 1.324913501739502, + "learning_rate": 9.034571451240325e-05, + "loss": 0.9062, + "step": 62990 + }, + { + "epoch": 0.4024890433538198, + "grad_norm": 1.155165672302246, + "learning_rate": 9.034275052055568e-05, + "loss": 0.9358, + "step": 63000 + }, + { + "epoch": 0.4025529305035585, + "grad_norm": 0.9214060306549072, + "learning_rate": 9.03397861224217e-05, + "loss": 0.6413, + "step": 63010 + }, + { + "epoch": 0.4026168176532972, + "grad_norm": 0.9040579199790955, + "learning_rate": 9.033682131803119e-05, + "loss": 0.9746, + "step": 63020 + }, + { + "epoch": 0.4026807048030359, + "grad_norm": 0.9403018355369568, + "learning_rate": 9.033385610741398e-05, + "loss": 0.8279, + "step": 63030 + }, + { + "epoch": 0.4027445919527746, + "grad_norm": 0.9676703810691833, + "learning_rate": 9.033089049059996e-05, + "loss": 1.2033, + "step": 63040 + }, + { + "epoch": 0.40280847910251333, + "grad_norm": 3.454418420791626, + "learning_rate": 9.032792446761896e-05, + "loss": 0.8704, + "step": 63050 + }, + { + "epoch": 0.40287236625225203, + "grad_norm": 3.037147283554077, + "learning_rate": 9.032495803850088e-05, + "loss": 0.8457, + "step": 63060 + }, + { + "epoch": 0.40293625340199074, + "grad_norm": 0.6617047190666199, + "learning_rate": 9.032199120327558e-05, + "loss": 0.8883, + "step": 63070 + }, + { + "epoch": 0.40300014055172945, + "grad_norm": 0.987816333770752, + "learning_rate": 9.031902396197296e-05, + "loss": 0.791, + "step": 63080 + }, + { + "epoch": 0.40306402770146815, + "grad_norm": 0.7392496466636658, + "learning_rate": 9.031605631462288e-05, + "loss": 0.8478, + "step": 63090 + }, + { + "epoch": 0.4031279148512068, + "grad_norm": 1.0428085327148438, + "learning_rate": 9.031308826125524e-05, + "loss": 0.9056, + "step": 63100 + }, + { + "epoch": 0.4031918020009455, + "grad_norm": 1.2448642253875732, + "learning_rate": 9.031011980189992e-05, + "loss": 0.8957, + "step": 63110 + }, + { + "epoch": 0.4032556891506842, + "grad_norm": 0.9294689893722534, + "learning_rate": 9.030715093658681e-05, + "loss": 0.6793, + "step": 63120 + }, + { + "epoch": 0.4033195763004229, + "grad_norm": 1.0956008434295654, + "learning_rate": 
9.030418166534585e-05, + "loss": 1.106, + "step": 63130 + }, + { + "epoch": 0.40338346345016163, + "grad_norm": 0.8035675883293152, + "learning_rate": 9.030121198820688e-05, + "loss": 0.8668, + "step": 63140 + }, + { + "epoch": 0.40344735059990033, + "grad_norm": 1.7835280895233154, + "learning_rate": 9.029824190519986e-05, + "loss": 1.0177, + "step": 63150 + }, + { + "epoch": 0.40351123774963904, + "grad_norm": 1.057437777519226, + "learning_rate": 9.029527141635467e-05, + "loss": 0.8812, + "step": 63160 + }, + { + "epoch": 0.40357512489937775, + "grad_norm": 0.8696292638778687, + "learning_rate": 9.029230052170123e-05, + "loss": 0.8662, + "step": 63170 + }, + { + "epoch": 0.40363901204911645, + "grad_norm": 0.9994892477989197, + "learning_rate": 9.02893292212695e-05, + "loss": 1.1962, + "step": 63180 + }, + { + "epoch": 0.40370289919885516, + "grad_norm": 1.2672826051712036, + "learning_rate": 9.028635751508933e-05, + "loss": 0.95, + "step": 63190 + }, + { + "epoch": 0.40376678634859386, + "grad_norm": 0.6766789555549622, + "learning_rate": 9.02833854031907e-05, + "loss": 0.7107, + "step": 63200 + }, + { + "epoch": 0.40383067349833257, + "grad_norm": 1.4297183752059937, + "learning_rate": 9.028041288560354e-05, + "loss": 0.9061, + "step": 63210 + }, + { + "epoch": 0.4038945606480712, + "grad_norm": 0.7802338004112244, + "learning_rate": 9.027743996235775e-05, + "loss": 0.858, + "step": 63220 + }, + { + "epoch": 0.4039584477978099, + "grad_norm": 1.5008245706558228, + "learning_rate": 9.027446663348333e-05, + "loss": 1.1954, + "step": 63230 + }, + { + "epoch": 0.40402233494754863, + "grad_norm": 0.9021018743515015, + "learning_rate": 9.027149289901016e-05, + "loss": 0.8044, + "step": 63240 + }, + { + "epoch": 0.40408622209728734, + "grad_norm": 0.7308499217033386, + "learning_rate": 9.026851875896822e-05, + "loss": 1.0717, + "step": 63250 + }, + { + "epoch": 0.40415010924702605, + "grad_norm": 0.8657183051109314, + "learning_rate": 9.026554421338748e-05, + "loss": 1.0214, + "step": 63260 + }, + { + "epoch": 0.40421399639676475, + "grad_norm": 0.9111654162406921, + "learning_rate": 9.026256926229786e-05, + "loss": 0.863, + "step": 63270 + }, + { + "epoch": 0.40427788354650346, + "grad_norm": 0.9648974537849426, + "learning_rate": 9.025959390572933e-05, + "loss": 0.7586, + "step": 63280 + }, + { + "epoch": 0.40434177069624216, + "grad_norm": 0.8858680725097656, + "learning_rate": 9.025661814371187e-05, + "loss": 1.1628, + "step": 63290 + }, + { + "epoch": 0.40440565784598087, + "grad_norm": 0.7507526874542236, + "learning_rate": 9.025364197627543e-05, + "loss": 0.7555, + "step": 63300 + }, + { + "epoch": 0.4044695449957196, + "grad_norm": 0.7680438160896301, + "learning_rate": 9.025066540345e-05, + "loss": 1.0276, + "step": 63310 + }, + { + "epoch": 0.4045334321454583, + "grad_norm": 1.0163004398345947, + "learning_rate": 9.024768842526554e-05, + "loss": 0.9563, + "step": 63320 + }, + { + "epoch": 0.404597319295197, + "grad_norm": 0.7688309550285339, + "learning_rate": 9.024471104175203e-05, + "loss": 0.9156, + "step": 63330 + }, + { + "epoch": 0.40466120644493564, + "grad_norm": 1.04693603515625, + "learning_rate": 9.024173325293949e-05, + "loss": 0.8006, + "step": 63340 + }, + { + "epoch": 0.40472509359467435, + "grad_norm": 0.6474732160568237, + "learning_rate": 9.023875505885786e-05, + "loss": 0.5947, + "step": 63350 + }, + { + "epoch": 0.40478898074441305, + "grad_norm": 0.6486718058586121, + "learning_rate": 9.023577645953718e-05, + "loss": 0.8847, + "step": 63360 + }, + { + "epoch": 
0.40485286789415176, + "grad_norm": 0.8913542628288269, + "learning_rate": 9.023279745500738e-05, + "loss": 1.0172, + "step": 63370 + }, + { + "epoch": 0.40491675504389046, + "grad_norm": 0.629562497138977, + "learning_rate": 9.022981804529853e-05, + "loss": 0.9124, + "step": 63380 + }, + { + "epoch": 0.40498064219362917, + "grad_norm": 0.6215500831604004, + "learning_rate": 9.022683823044061e-05, + "loss": 0.7679, + "step": 63390 + }, + { + "epoch": 0.4050445293433679, + "grad_norm": 1.2027158737182617, + "learning_rate": 9.022385801046363e-05, + "loss": 0.9516, + "step": 63400 + }, + { + "epoch": 0.4051084164931066, + "grad_norm": 1.0377594232559204, + "learning_rate": 9.02208773853976e-05, + "loss": 0.791, + "step": 63410 + }, + { + "epoch": 0.4051723036428453, + "grad_norm": 0.5519084334373474, + "learning_rate": 9.021789635527252e-05, + "loss": 1.0029, + "step": 63420 + }, + { + "epoch": 0.405236190792584, + "grad_norm": 0.8896793723106384, + "learning_rate": 9.021491492011844e-05, + "loss": 0.9838, + "step": 63430 + }, + { + "epoch": 0.4053000779423227, + "grad_norm": 0.8860163688659668, + "learning_rate": 9.021193307996538e-05, + "loss": 1.2256, + "step": 63440 + }, + { + "epoch": 0.4053639650920614, + "grad_norm": 1.1644326448440552, + "learning_rate": 9.020895083484337e-05, + "loss": 0.8291, + "step": 63450 + }, + { + "epoch": 0.40542785224180006, + "grad_norm": 0.8265649676322937, + "learning_rate": 9.020596818478244e-05, + "loss": 1.0556, + "step": 63460 + }, + { + "epoch": 0.40549173939153876, + "grad_norm": 1.3576620817184448, + "learning_rate": 9.020298512981262e-05, + "loss": 0.9018, + "step": 63470 + }, + { + "epoch": 0.40555562654127747, + "grad_norm": 0.8418384194374084, + "learning_rate": 9.020000166996397e-05, + "loss": 0.949, + "step": 63480 + }, + { + "epoch": 0.4056195136910162, + "grad_norm": 0.9804365634918213, + "learning_rate": 9.01970178052665e-05, + "loss": 1.0751, + "step": 63490 + }, + { + "epoch": 0.4056834008407549, + "grad_norm": 1.0201619863510132, + "learning_rate": 9.01940335357503e-05, + "loss": 0.9828, + "step": 63500 + }, + { + "epoch": 0.4057472879904936, + "grad_norm": 0.6420082449913025, + "learning_rate": 9.019104886144543e-05, + "loss": 0.7166, + "step": 63510 + }, + { + "epoch": 0.4058111751402323, + "grad_norm": 0.6462534070014954, + "learning_rate": 9.01880637823819e-05, + "loss": 0.883, + "step": 63520 + }, + { + "epoch": 0.405875062289971, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.018507829858981e-05, + "loss": 0.8291, + "step": 63530 + }, + { + "epoch": 0.4059389494397097, + "grad_norm": 0.8670223951339722, + "learning_rate": 9.018209241009921e-05, + "loss": 1.0204, + "step": 63540 + }, + { + "epoch": 0.4060028365894484, + "grad_norm": 1.4019068479537964, + "learning_rate": 9.017910611694018e-05, + "loss": 0.6407, + "step": 63550 + }, + { + "epoch": 0.4060667237391871, + "grad_norm": 0.8712426424026489, + "learning_rate": 9.01761194191428e-05, + "loss": 0.7192, + "step": 63560 + }, + { + "epoch": 0.4061306108889258, + "grad_norm": 1.283201813697815, + "learning_rate": 9.017313231673714e-05, + "loss": 0.9573, + "step": 63570 + }, + { + "epoch": 0.4061944980386645, + "grad_norm": 0.5912864208221436, + "learning_rate": 9.017014480975327e-05, + "loss": 0.6543, + "step": 63580 + }, + { + "epoch": 0.4062583851884032, + "grad_norm": 0.8346386551856995, + "learning_rate": 9.01671568982213e-05, + "loss": 0.9464, + "step": 63590 + }, + { + "epoch": 0.4063222723381419, + "grad_norm": 0.7532115578651428, + "learning_rate": 
9.016416858217131e-05, + "loss": 0.7063, + "step": 63600 + }, + { + "epoch": 0.4063861594878806, + "grad_norm": 0.8477169275283813, + "learning_rate": 9.016117986163339e-05, + "loss": 1.3564, + "step": 63610 + }, + { + "epoch": 0.4064500466376193, + "grad_norm": 0.8860613703727722, + "learning_rate": 9.015819073663765e-05, + "loss": 0.9798, + "step": 63620 + }, + { + "epoch": 0.406513933787358, + "grad_norm": 1.1103487014770508, + "learning_rate": 9.015520120721419e-05, + "loss": 1.222, + "step": 63630 + }, + { + "epoch": 0.4065778209370967, + "grad_norm": 0.9043728709220886, + "learning_rate": 9.015221127339311e-05, + "loss": 1.0145, + "step": 63640 + }, + { + "epoch": 0.4066417080868354, + "grad_norm": 1.244024634361267, + "learning_rate": 9.01492209352045e-05, + "loss": 0.8443, + "step": 63650 + }, + { + "epoch": 0.4067055952365741, + "grad_norm": 0.982548713684082, + "learning_rate": 9.014623019267853e-05, + "loss": 0.9263, + "step": 63660 + }, + { + "epoch": 0.40676948238631283, + "grad_norm": 0.7706730961799622, + "learning_rate": 9.01432390458453e-05, + "loss": 1.0515, + "step": 63670 + }, + { + "epoch": 0.40683336953605154, + "grad_norm": 1.0974234342575073, + "learning_rate": 9.014024749473491e-05, + "loss": 0.8391, + "step": 63680 + }, + { + "epoch": 0.40689725668579024, + "grad_norm": 0.8479081392288208, + "learning_rate": 9.01372555393775e-05, + "loss": 1.1546, + "step": 63690 + }, + { + "epoch": 0.40696114383552895, + "grad_norm": 0.672386884689331, + "learning_rate": 9.01342631798032e-05, + "loss": 0.7925, + "step": 63700 + }, + { + "epoch": 0.4070250309852676, + "grad_norm": 0.6552578806877136, + "learning_rate": 9.013127041604217e-05, + "loss": 0.8039, + "step": 63710 + }, + { + "epoch": 0.4070889181350063, + "grad_norm": 0.7471428513526917, + "learning_rate": 9.01282772481245e-05, + "loss": 0.8832, + "step": 63720 + }, + { + "epoch": 0.407152805284745, + "grad_norm": 0.871847927570343, + "learning_rate": 9.012528367608037e-05, + "loss": 0.87, + "step": 63730 + }, + { + "epoch": 0.4072166924344837, + "grad_norm": 0.8684267401695251, + "learning_rate": 9.012228969993992e-05, + "loss": 1.0718, + "step": 63740 + }, + { + "epoch": 0.4072805795842224, + "grad_norm": 1.1163896322250366, + "learning_rate": 9.01192953197333e-05, + "loss": 0.8171, + "step": 63750 + }, + { + "epoch": 0.40734446673396113, + "grad_norm": 0.8885432481765747, + "learning_rate": 9.011630053549069e-05, + "loss": 0.7901, + "step": 63760 + }, + { + "epoch": 0.40740835388369984, + "grad_norm": 0.9396836161613464, + "learning_rate": 9.011330534724221e-05, + "loss": 0.8013, + "step": 63770 + }, + { + "epoch": 0.40747224103343854, + "grad_norm": 1.0267233848571777, + "learning_rate": 9.011030975501804e-05, + "loss": 0.8062, + "step": 63780 + }, + { + "epoch": 0.40753612818317725, + "grad_norm": 0.8495022654533386, + "learning_rate": 9.010731375884835e-05, + "loss": 0.7481, + "step": 63790 + }, + { + "epoch": 0.40760001533291595, + "grad_norm": 0.9368879199028015, + "learning_rate": 9.010431735876332e-05, + "loss": 1.0601, + "step": 63800 + }, + { + "epoch": 0.40766390248265466, + "grad_norm": 0.8383790254592896, + "learning_rate": 9.010132055479313e-05, + "loss": 0.7068, + "step": 63810 + }, + { + "epoch": 0.40772778963239337, + "grad_norm": 0.7061938643455505, + "learning_rate": 9.009832334696792e-05, + "loss": 1.0569, + "step": 63820 + }, + { + "epoch": 0.407791676782132, + "grad_norm": 0.8540278673171997, + "learning_rate": 9.009532573531793e-05, + "loss": 1.0359, + "step": 63830 + }, + { + "epoch": 
0.4078555639318707, + "grad_norm": 1.0595225095748901, + "learning_rate": 9.009232771987331e-05, + "loss": 0.8767, + "step": 63840 + }, + { + "epoch": 0.40791945108160943, + "grad_norm": 0.6768961548805237, + "learning_rate": 9.008932930066428e-05, + "loss": 1.0288, + "step": 63850 + }, + { + "epoch": 0.40798333823134814, + "grad_norm": 0.642488420009613, + "learning_rate": 9.0086330477721e-05, + "loss": 0.9504, + "step": 63860 + }, + { + "epoch": 0.40804722538108684, + "grad_norm": 0.9785758852958679, + "learning_rate": 9.008333125107371e-05, + "loss": 0.9287, + "step": 63870 + }, + { + "epoch": 0.40811111253082555, + "grad_norm": 0.949464738368988, + "learning_rate": 9.008033162075259e-05, + "loss": 0.9448, + "step": 63880 + }, + { + "epoch": 0.40817499968056425, + "grad_norm": 1.017958164215088, + "learning_rate": 9.007733158678787e-05, + "loss": 0.8016, + "step": 63890 + }, + { + "epoch": 0.40823888683030296, + "grad_norm": 3.1285202503204346, + "learning_rate": 9.007433114920972e-05, + "loss": 0.9056, + "step": 63900 + }, + { + "epoch": 0.40830277398004167, + "grad_norm": 0.8148319721221924, + "learning_rate": 9.00713303080484e-05, + "loss": 1.0595, + "step": 63910 + }, + { + "epoch": 0.4083666611297804, + "grad_norm": 0.773764967918396, + "learning_rate": 9.006832906333411e-05, + "loss": 0.8172, + "step": 63920 + }, + { + "epoch": 0.4084305482795191, + "grad_norm": 0.8310877084732056, + "learning_rate": 9.00653274150971e-05, + "loss": 0.9826, + "step": 63930 + }, + { + "epoch": 0.4084944354292578, + "grad_norm": 0.904560923576355, + "learning_rate": 9.006232536336756e-05, + "loss": 1.1305, + "step": 63940 + }, + { + "epoch": 0.40855832257899644, + "grad_norm": 0.9716793298721313, + "learning_rate": 9.005932290817576e-05, + "loss": 0.7498, + "step": 63950 + }, + { + "epoch": 0.40862220972873514, + "grad_norm": 1.2290844917297363, + "learning_rate": 9.005632004955192e-05, + "loss": 0.8095, + "step": 63960 + }, + { + "epoch": 0.40868609687847385, + "grad_norm": 0.5113322734832764, + "learning_rate": 9.005331678752629e-05, + "loss": 0.9543, + "step": 63970 + }, + { + "epoch": 0.40874998402821255, + "grad_norm": 0.6806952357292175, + "learning_rate": 9.00503131221291e-05, + "loss": 1.0935, + "step": 63980 + }, + { + "epoch": 0.40881387117795126, + "grad_norm": 1.0288503170013428, + "learning_rate": 9.00473090533906e-05, + "loss": 0.8507, + "step": 63990 + }, + { + "epoch": 0.40887775832768997, + "grad_norm": 0.6882049441337585, + "learning_rate": 9.004430458134107e-05, + "loss": 1.0034, + "step": 64000 + }, + { + "epoch": 0.4089416454774287, + "grad_norm": 1.0259922742843628, + "learning_rate": 9.004129970601074e-05, + "loss": 1.11, + "step": 64010 + }, + { + "epoch": 0.4090055326271674, + "grad_norm": 0.7232376337051392, + "learning_rate": 9.003829442742989e-05, + "loss": 1.0349, + "step": 64020 + }, + { + "epoch": 0.4090694197769061, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.003528874562875e-05, + "loss": 0.8303, + "step": 64030 + }, + { + "epoch": 0.4091333069266448, + "grad_norm": 0.7340204119682312, + "learning_rate": 9.003228266063765e-05, + "loss": 0.8455, + "step": 64040 + }, + { + "epoch": 0.4091971940763835, + "grad_norm": 0.618037760257721, + "learning_rate": 9.00292761724868e-05, + "loss": 0.8546, + "step": 64050 + }, + { + "epoch": 0.4092610812261222, + "grad_norm": 0.4729726314544678, + "learning_rate": 9.002626928120654e-05, + "loss": 0.8056, + "step": 64060 + }, + { + "epoch": 0.40932496837586085, + "grad_norm": 4.078080654144287, + "learning_rate": 
9.002326198682712e-05, + "loss": 1.038, + "step": 64070 + }, + { + "epoch": 0.40938885552559956, + "grad_norm": 0.8431742787361145, + "learning_rate": 9.002025428937879e-05, + "loss": 0.9887, + "step": 64080 + }, + { + "epoch": 0.40945274267533827, + "grad_norm": 1.1777523756027222, + "learning_rate": 9.00172461888919e-05, + "loss": 0.729, + "step": 64090 + }, + { + "epoch": 0.40951662982507697, + "grad_norm": 0.9393613338470459, + "learning_rate": 9.001423768539672e-05, + "loss": 0.9377, + "step": 64100 + }, + { + "epoch": 0.4095805169748157, + "grad_norm": 0.6365240812301636, + "learning_rate": 9.001122877892356e-05, + "loss": 0.6823, + "step": 64110 + }, + { + "epoch": 0.4096444041245544, + "grad_norm": 0.9078708291053772, + "learning_rate": 9.00082194695027e-05, + "loss": 0.8199, + "step": 64120 + }, + { + "epoch": 0.4097082912742931, + "grad_norm": 2.3477323055267334, + "learning_rate": 9.000520975716445e-05, + "loss": 0.955, + "step": 64130 + }, + { + "epoch": 0.4097721784240318, + "grad_norm": 0.5948389768600464, + "learning_rate": 9.000219964193914e-05, + "loss": 0.8302, + "step": 64140 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.6397002935409546, + "learning_rate": 8.999918912385708e-05, + "loss": 0.8583, + "step": 64150 + }, + { + "epoch": 0.4098999527235092, + "grad_norm": 1.9273061752319336, + "learning_rate": 8.999617820294857e-05, + "loss": 0.9524, + "step": 64160 + }, + { + "epoch": 0.4099638398732479, + "grad_norm": 0.8526675701141357, + "learning_rate": 8.999316687924395e-05, + "loss": 0.9583, + "step": 64170 + }, + { + "epoch": 0.4100277270229866, + "grad_norm": 0.9683032035827637, + "learning_rate": 8.999015515277352e-05, + "loss": 1.1449, + "step": 64180 + }, + { + "epoch": 0.41009161417272527, + "grad_norm": 0.6847209930419922, + "learning_rate": 8.998714302356766e-05, + "loss": 0.8517, + "step": 64190 + }, + { + "epoch": 0.410155501322464, + "grad_norm": 0.6209728717803955, + "learning_rate": 8.998413049165666e-05, + "loss": 0.8522, + "step": 64200 + }, + { + "epoch": 0.4102193884722027, + "grad_norm": 0.6670452952384949, + "learning_rate": 8.998111755707088e-05, + "loss": 0.9287, + "step": 64210 + }, + { + "epoch": 0.4102832756219414, + "grad_norm": 1.0811097621917725, + "learning_rate": 8.997810421984065e-05, + "loss": 0.9053, + "step": 64220 + }, + { + "epoch": 0.4103471627716801, + "grad_norm": 0.8516783118247986, + "learning_rate": 8.997509047999634e-05, + "loss": 0.9318, + "step": 64230 + }, + { + "epoch": 0.4104110499214188, + "grad_norm": 0.880136251449585, + "learning_rate": 8.997207633756828e-05, + "loss": 0.8013, + "step": 64240 + }, + { + "epoch": 0.4104749370711575, + "grad_norm": 0.804787278175354, + "learning_rate": 8.996906179258681e-05, + "loss": 1.0225, + "step": 64250 + }, + { + "epoch": 0.4105388242208962, + "grad_norm": 0.7522739171981812, + "learning_rate": 8.996604684508234e-05, + "loss": 0.9695, + "step": 64260 + }, + { + "epoch": 0.4106027113706349, + "grad_norm": 0.702925443649292, + "learning_rate": 8.996303149508518e-05, + "loss": 0.7292, + "step": 64270 + }, + { + "epoch": 0.4106665985203736, + "grad_norm": 1.3208075761795044, + "learning_rate": 8.996001574262574e-05, + "loss": 0.7754, + "step": 64280 + }, + { + "epoch": 0.41073048567011233, + "grad_norm": 0.8626548051834106, + "learning_rate": 8.995699958773435e-05, + "loss": 0.8081, + "step": 64290 + }, + { + "epoch": 0.41079437281985104, + "grad_norm": 0.8688634037971497, + "learning_rate": 8.995398303044142e-05, + "loss": 0.8649, + "step": 64300 + }, + { + "epoch": 
0.4108582599695897, + "grad_norm": 0.9219092130661011, + "learning_rate": 8.995096607077731e-05, + "loss": 0.8441, + "step": 64310 + }, + { + "epoch": 0.4109221471193284, + "grad_norm": 1.0303871631622314, + "learning_rate": 8.994794870877241e-05, + "loss": 0.9238, + "step": 64320 + }, + { + "epoch": 0.4109860342690671, + "grad_norm": 0.938761830329895, + "learning_rate": 8.994493094445711e-05, + "loss": 0.6409, + "step": 64330 + }, + { + "epoch": 0.4110499214188058, + "grad_norm": 0.8566682934761047, + "learning_rate": 8.99419127778618e-05, + "loss": 0.9025, + "step": 64340 + }, + { + "epoch": 0.4111138085685445, + "grad_norm": 0.8266015648841858, + "learning_rate": 8.993889420901687e-05, + "loss": 0.811, + "step": 64350 + }, + { + "epoch": 0.4111776957182832, + "grad_norm": 0.9890789985656738, + "learning_rate": 8.993587523795271e-05, + "loss": 0.921, + "step": 64360 + }, + { + "epoch": 0.4112415828680219, + "grad_norm": 0.8247410655021667, + "learning_rate": 8.993285586469976e-05, + "loss": 1.0017, + "step": 64370 + }, + { + "epoch": 0.41130547001776063, + "grad_norm": 1.1178898811340332, + "learning_rate": 8.992983608928839e-05, + "loss": 0.9229, + "step": 64380 + }, + { + "epoch": 0.41136935716749934, + "grad_norm": 0.6633570194244385, + "learning_rate": 8.992681591174903e-05, + "loss": 0.7906, + "step": 64390 + }, + { + "epoch": 0.41143324431723804, + "grad_norm": 0.9946048855781555, + "learning_rate": 8.99237953321121e-05, + "loss": 0.8584, + "step": 64400 + }, + { + "epoch": 0.41149713146697675, + "grad_norm": 1.1637663841247559, + "learning_rate": 8.992077435040799e-05, + "loss": 0.7197, + "step": 64410 + }, + { + "epoch": 0.41156101861671546, + "grad_norm": 0.9917969703674316, + "learning_rate": 8.991775296666717e-05, + "loss": 0.9235, + "step": 64420 + }, + { + "epoch": 0.4116249057664541, + "grad_norm": 0.6718196868896484, + "learning_rate": 8.991473118092003e-05, + "loss": 0.891, + "step": 64430 + }, + { + "epoch": 0.4116887929161928, + "grad_norm": 0.6841692924499512, + "learning_rate": 8.991170899319702e-05, + "loss": 1.2224, + "step": 64440 + }, + { + "epoch": 0.4117526800659315, + "grad_norm": 0.8956950306892395, + "learning_rate": 8.990868640352857e-05, + "loss": 0.7681, + "step": 64450 + }, + { + "epoch": 0.4118165672156702, + "grad_norm": 0.8539284467697144, + "learning_rate": 8.990566341194513e-05, + "loss": 0.9718, + "step": 64460 + }, + { + "epoch": 0.41188045436540893, + "grad_norm": 1.2791920900344849, + "learning_rate": 8.990294237590787e-05, + "loss": 0.885, + "step": 64470 + }, + { + "epoch": 0.41194434151514764, + "grad_norm": 0.7224549651145935, + "learning_rate": 8.989991862076981e-05, + "loss": 0.8548, + "step": 64480 + }, + { + "epoch": 0.41200822866488634, + "grad_norm": 0.8494675159454346, + "learning_rate": 8.989689446380503e-05, + "loss": 1.2577, + "step": 64490 + }, + { + "epoch": 0.41207211581462505, + "grad_norm": 1.3127714395523071, + "learning_rate": 8.989386990504402e-05, + "loss": 0.9295, + "step": 64500 + }, + { + "epoch": 0.41213600296436376, + "grad_norm": 1.0279886722564697, + "learning_rate": 8.989084494451725e-05, + "loss": 1.096, + "step": 64510 + }, + { + "epoch": 0.41219989011410246, + "grad_norm": 1.2837331295013428, + "learning_rate": 8.988781958225515e-05, + "loss": 0.744, + "step": 64520 + }, + { + "epoch": 0.41226377726384117, + "grad_norm": 0.7983502745628357, + "learning_rate": 8.988479381828817e-05, + "loss": 0.7822, + "step": 64530 + }, + { + "epoch": 0.4123276644135799, + "grad_norm": 1.4588627815246582, + "learning_rate": 
8.988176765264684e-05, + "loss": 0.9902, + "step": 64540 + }, + { + "epoch": 0.4123915515633186, + "grad_norm": 0.8244463205337524, + "learning_rate": 8.98787410853616e-05, + "loss": 0.7243, + "step": 64550 + }, + { + "epoch": 0.41245543871305723, + "grad_norm": 0.6158170104026794, + "learning_rate": 8.987571411646292e-05, + "loss": 0.9385, + "step": 64560 + }, + { + "epoch": 0.41251932586279594, + "grad_norm": 1.447361707687378, + "learning_rate": 8.987268674598133e-05, + "loss": 0.8133, + "step": 64570 + }, + { + "epoch": 0.41258321301253464, + "grad_norm": 0.8146925568580627, + "learning_rate": 8.986965897394728e-05, + "loss": 0.8676, + "step": 64580 + }, + { + "epoch": 0.41264710016227335, + "grad_norm": 0.798591136932373, + "learning_rate": 8.986663080039126e-05, + "loss": 0.9362, + "step": 64590 + }, + { + "epoch": 0.41271098731201206, + "grad_norm": 0.6926212906837463, + "learning_rate": 8.986360222534377e-05, + "loss": 0.7352, + "step": 64600 + }, + { + "epoch": 0.41277487446175076, + "grad_norm": 0.8038867115974426, + "learning_rate": 8.986057324883535e-05, + "loss": 1.2618, + "step": 64610 + }, + { + "epoch": 0.41283876161148947, + "grad_norm": 1.9613544940948486, + "learning_rate": 8.985754387089647e-05, + "loss": 0.7561, + "step": 64620 + }, + { + "epoch": 0.4129026487612282, + "grad_norm": 0.8577353358268738, + "learning_rate": 8.985451409155762e-05, + "loss": 0.8224, + "step": 64630 + }, + { + "epoch": 0.4129665359109669, + "grad_norm": 0.5765991806983948, + "learning_rate": 8.985148391084934e-05, + "loss": 0.8005, + "step": 64640 + }, + { + "epoch": 0.4130304230607056, + "grad_norm": 0.4820258319377899, + "learning_rate": 8.984845332880213e-05, + "loss": 0.8121, + "step": 64650 + }, + { + "epoch": 0.4130943102104443, + "grad_norm": 1.0253220796585083, + "learning_rate": 8.984542234544656e-05, + "loss": 0.667, + "step": 64660 + }, + { + "epoch": 0.413158197360183, + "grad_norm": 0.6785098910331726, + "learning_rate": 8.984239096081308e-05, + "loss": 0.9457, + "step": 64670 + }, + { + "epoch": 0.41322208450992165, + "grad_norm": 0.9571899771690369, + "learning_rate": 8.983935917493227e-05, + "loss": 1.1104, + "step": 64680 + }, + { + "epoch": 0.41328597165966036, + "grad_norm": 0.9146550893783569, + "learning_rate": 8.983632698783463e-05, + "loss": 1.0036, + "step": 64690 + }, + { + "epoch": 0.41334985880939906, + "grad_norm": 1.2323453426361084, + "learning_rate": 8.983329439955075e-05, + "loss": 0.8704, + "step": 64700 + }, + { + "epoch": 0.41341374595913777, + "grad_norm": 1.2397221326828003, + "learning_rate": 8.98302614101111e-05, + "loss": 0.9207, + "step": 64710 + }, + { + "epoch": 0.4134776331088765, + "grad_norm": 0.7488781213760376, + "learning_rate": 8.982722801954627e-05, + "loss": 0.7842, + "step": 64720 + }, + { + "epoch": 0.4135415202586152, + "grad_norm": 2.860487461090088, + "learning_rate": 8.98241942278868e-05, + "loss": 0.7993, + "step": 64730 + }, + { + "epoch": 0.4136054074083539, + "grad_norm": 0.9510349035263062, + "learning_rate": 8.982116003516324e-05, + "loss": 0.9748, + "step": 64740 + }, + { + "epoch": 0.4136692945580926, + "grad_norm": 0.7281776666641235, + "learning_rate": 8.981812544140615e-05, + "loss": 0.8138, + "step": 64750 + }, + { + "epoch": 0.4137331817078313, + "grad_norm": 1.0613374710083008, + "learning_rate": 8.981509044664608e-05, + "loss": 0.7931, + "step": 64760 + }, + { + "epoch": 0.41379706885757, + "grad_norm": 0.5681692361831665, + "learning_rate": 8.981205505091363e-05, + "loss": 0.7882, + "step": 64770 + }, + { + "epoch": 
0.4138609560073087, + "grad_norm": 0.9813446998596191, + "learning_rate": 8.980901925423932e-05, + "loss": 0.8314, + "step": 64780 + }, + { + "epoch": 0.4139248431570474, + "grad_norm": 0.9293444156646729, + "learning_rate": 8.980598305665375e-05, + "loss": 0.6781, + "step": 64790 + }, + { + "epoch": 0.41398873030678607, + "grad_norm": 0.8885220289230347, + "learning_rate": 8.98029464581875e-05, + "loss": 0.8954, + "step": 64800 + }, + { + "epoch": 0.4140526174565248, + "grad_norm": 0.7418517470359802, + "learning_rate": 8.979990945887114e-05, + "loss": 0.8223, + "step": 64810 + }, + { + "epoch": 0.4141165046062635, + "grad_norm": 0.9399496912956238, + "learning_rate": 8.979687205873526e-05, + "loss": 0.9294, + "step": 64820 + }, + { + "epoch": 0.4141803917560022, + "grad_norm": 0.7863808274269104, + "learning_rate": 8.979383425781046e-05, + "loss": 0.7421, + "step": 64830 + }, + { + "epoch": 0.4142442789057409, + "grad_norm": 0.8157973289489746, + "learning_rate": 8.97907960561273e-05, + "loss": 0.7436, + "step": 64840 + }, + { + "epoch": 0.4143081660554796, + "grad_norm": 0.8442412614822388, + "learning_rate": 8.978775745371642e-05, + "loss": 0.8952, + "step": 64850 + }, + { + "epoch": 0.4143720532052183, + "grad_norm": 1.1635627746582031, + "learning_rate": 8.978471845060838e-05, + "loss": 0.9926, + "step": 64860 + }, + { + "epoch": 0.414435940354957, + "grad_norm": 0.975397527217865, + "learning_rate": 8.978167904683383e-05, + "loss": 0.814, + "step": 64870 + }, + { + "epoch": 0.4144998275046957, + "grad_norm": 0.6589390635490417, + "learning_rate": 8.977863924242335e-05, + "loss": 0.8215, + "step": 64880 + }, + { + "epoch": 0.4145637146544344, + "grad_norm": 1.0602787733078003, + "learning_rate": 8.977559903740756e-05, + "loss": 0.7723, + "step": 64890 + }, + { + "epoch": 0.41462760180417313, + "grad_norm": 0.7909052968025208, + "learning_rate": 8.977255843181707e-05, + "loss": 0.9256, + "step": 64900 + }, + { + "epoch": 0.41469148895391184, + "grad_norm": 3.674098491668701, + "learning_rate": 8.976951742568249e-05, + "loss": 0.822, + "step": 64910 + }, + { + "epoch": 0.4147553761036505, + "grad_norm": 1.0264021158218384, + "learning_rate": 8.97664760190345e-05, + "loss": 0.9519, + "step": 64920 + }, + { + "epoch": 0.4148192632533892, + "grad_norm": 0.8971536755561829, + "learning_rate": 8.976343421190367e-05, + "loss": 0.8374, + "step": 64930 + }, + { + "epoch": 0.4148831504031279, + "grad_norm": 0.4284789562225342, + "learning_rate": 8.976039200432067e-05, + "loss": 0.7519, + "step": 64940 + }, + { + "epoch": 0.4149470375528666, + "grad_norm": 0.5948058366775513, + "learning_rate": 8.975734939631612e-05, + "loss": 0.847, + "step": 64950 + }, + { + "epoch": 0.4150109247026053, + "grad_norm": 1.361910104751587, + "learning_rate": 8.975430638792066e-05, + "loss": 0.8689, + "step": 64960 + }, + { + "epoch": 0.415074811852344, + "grad_norm": 1.295660376548767, + "learning_rate": 8.975126297916495e-05, + "loss": 0.8311, + "step": 64970 + }, + { + "epoch": 0.4151386990020827, + "grad_norm": 1.1786941289901733, + "learning_rate": 8.974821917007962e-05, + "loss": 1.0239, + "step": 64980 + }, + { + "epoch": 0.41520258615182143, + "grad_norm": 1.0737582445144653, + "learning_rate": 8.974517496069536e-05, + "loss": 0.9261, + "step": 64990 + }, + { + "epoch": 0.41526647330156014, + "grad_norm": 0.853832483291626, + "learning_rate": 8.97421303510428e-05, + "loss": 0.9344, + "step": 65000 + }, + { + "epoch": 0.41533036045129884, + "grad_norm": 2.251110553741455, + "learning_rate": 
8.973908534115259e-05, + "loss": 1.0998, + "step": 65010 + }, + { + "epoch": 0.41539424760103755, + "grad_norm": 0.8067615628242493, + "learning_rate": 8.973603993105542e-05, + "loss": 0.9716, + "step": 65020 + }, + { + "epoch": 0.41545813475077625, + "grad_norm": 0.5543524622917175, + "learning_rate": 8.973299412078194e-05, + "loss": 0.7536, + "step": 65030 + }, + { + "epoch": 0.4155220219005149, + "grad_norm": 1.8256611824035645, + "learning_rate": 8.972994791036284e-05, + "loss": 1.0459, + "step": 65040 + }, + { + "epoch": 0.4155859090502536, + "grad_norm": 0.8897950053215027, + "learning_rate": 8.97269012998288e-05, + "loss": 0.9819, + "step": 65050 + }, + { + "epoch": 0.4156497961999923, + "grad_norm": 0.9131116271018982, + "learning_rate": 8.97238542892105e-05, + "loss": 0.8402, + "step": 65060 + }, + { + "epoch": 0.415713683349731, + "grad_norm": 1.1356046199798584, + "learning_rate": 8.972080687853861e-05, + "loss": 0.841, + "step": 65070 + }, + { + "epoch": 0.41577757049946973, + "grad_norm": 0.8197834491729736, + "learning_rate": 8.971775906784383e-05, + "loss": 0.7874, + "step": 65080 + }, + { + "epoch": 0.41584145764920843, + "grad_norm": 0.6989075541496277, + "learning_rate": 8.971471085715686e-05, + "loss": 0.8665, + "step": 65090 + }, + { + "epoch": 0.41590534479894714, + "grad_norm": 0.732832670211792, + "learning_rate": 8.97116622465084e-05, + "loss": 0.6657, + "step": 65100 + }, + { + "epoch": 0.41596923194868585, + "grad_norm": 0.8469944000244141, + "learning_rate": 8.970861323592913e-05, + "loss": 0.8977, + "step": 65110 + }, + { + "epoch": 0.41603311909842455, + "grad_norm": 1.300403118133545, + "learning_rate": 8.970556382544978e-05, + "loss": 1.0034, + "step": 65120 + }, + { + "epoch": 0.41609700624816326, + "grad_norm": 1.2970237731933594, + "learning_rate": 8.970251401510107e-05, + "loss": 1.0144, + "step": 65130 + }, + { + "epoch": 0.41616089339790197, + "grad_norm": 1.0652505159378052, + "learning_rate": 8.969946380491367e-05, + "loss": 0.7254, + "step": 65140 + }, + { + "epoch": 0.41622478054764067, + "grad_norm": 0.9304187297821045, + "learning_rate": 8.969641319491833e-05, + "loss": 0.884, + "step": 65150 + }, + { + "epoch": 0.4162886676973793, + "grad_norm": 0.8894677758216858, + "learning_rate": 8.969336218514579e-05, + "loss": 1.1695, + "step": 65160 + }, + { + "epoch": 0.41635255484711803, + "grad_norm": 0.7384070158004761, + "learning_rate": 8.969031077562673e-05, + "loss": 0.8618, + "step": 65170 + }, + { + "epoch": 0.41641644199685673, + "grad_norm": 0.8503040671348572, + "learning_rate": 8.968725896639189e-05, + "loss": 1.1173, + "step": 65180 + }, + { + "epoch": 0.41648032914659544, + "grad_norm": 1.213909387588501, + "learning_rate": 8.968420675747204e-05, + "loss": 0.5525, + "step": 65190 + }, + { + "epoch": 0.41654421629633415, + "grad_norm": 0.8109204769134521, + "learning_rate": 8.968115414889791e-05, + "loss": 0.8147, + "step": 65200 + }, + { + "epoch": 0.41660810344607285, + "grad_norm": 0.9055116772651672, + "learning_rate": 8.967810114070022e-05, + "loss": 0.8597, + "step": 65210 + }, + { + "epoch": 0.41667199059581156, + "grad_norm": 0.7332736849784851, + "learning_rate": 8.96750477329097e-05, + "loss": 0.8135, + "step": 65220 + }, + { + "epoch": 0.41673587774555026, + "grad_norm": 1.6133145093917847, + "learning_rate": 8.967199392555714e-05, + "loss": 0.5944, + "step": 65230 + }, + { + "epoch": 0.41679976489528897, + "grad_norm": 1.0285025835037231, + "learning_rate": 8.966893971867329e-05, + "loss": 1.2345, + "step": 65240 + }, + { + 
"epoch": 0.4168636520450277, + "grad_norm": 0.7970749139785767, + "learning_rate": 8.966588511228888e-05, + "loss": 0.9716, + "step": 65250 + }, + { + "epoch": 0.4169275391947664, + "grad_norm": 0.9936865568161011, + "learning_rate": 8.96628301064347e-05, + "loss": 0.6991, + "step": 65260 + }, + { + "epoch": 0.4169914263445051, + "grad_norm": 0.6631901264190674, + "learning_rate": 8.965977470114151e-05, + "loss": 1.0921, + "step": 65270 + }, + { + "epoch": 0.41705531349424374, + "grad_norm": 1.0194664001464844, + "learning_rate": 8.965671889644007e-05, + "loss": 0.8967, + "step": 65280 + }, + { + "epoch": 0.41711920064398245, + "grad_norm": 1.145621657371521, + "learning_rate": 8.965366269236117e-05, + "loss": 0.9233, + "step": 65290 + }, + { + "epoch": 0.41718308779372115, + "grad_norm": 0.7853092551231384, + "learning_rate": 8.965060608893559e-05, + "loss": 0.8627, + "step": 65300 + }, + { + "epoch": 0.41724697494345986, + "grad_norm": 0.7077251672744751, + "learning_rate": 8.96475490861941e-05, + "loss": 1.0426, + "step": 65310 + }, + { + "epoch": 0.41731086209319856, + "grad_norm": 0.9070340394973755, + "learning_rate": 8.964449168416749e-05, + "loss": 0.9206, + "step": 65320 + }, + { + "epoch": 0.41737474924293727, + "grad_norm": 1.0521044731140137, + "learning_rate": 8.964143388288653e-05, + "loss": 0.886, + "step": 65330 + }, + { + "epoch": 0.417438636392676, + "grad_norm": 0.46310827136039734, + "learning_rate": 8.963837568238205e-05, + "loss": 0.8873, + "step": 65340 + }, + { + "epoch": 0.4175025235424147, + "grad_norm": 0.7745985388755798, + "learning_rate": 8.963531708268485e-05, + "loss": 0.9885, + "step": 65350 + }, + { + "epoch": 0.4175664106921534, + "grad_norm": 0.7876570820808411, + "learning_rate": 8.96322580838257e-05, + "loss": 0.6772, + "step": 65360 + }, + { + "epoch": 0.4176302978418921, + "grad_norm": 1.1240822076797485, + "learning_rate": 8.962919868583544e-05, + "loss": 0.9992, + "step": 65370 + }, + { + "epoch": 0.4176941849916308, + "grad_norm": 1.488118290901184, + "learning_rate": 8.962613888874485e-05, + "loss": 1.1016, + "step": 65380 + }, + { + "epoch": 0.4177580721413695, + "grad_norm": 1.2619564533233643, + "learning_rate": 8.962307869258476e-05, + "loss": 1.1059, + "step": 65390 + }, + { + "epoch": 0.4178219592911082, + "grad_norm": 3.796415328979492, + "learning_rate": 8.962001809738599e-05, + "loss": 1.0094, + "step": 65400 + }, + { + "epoch": 0.41788584644084686, + "grad_norm": 0.6639039516448975, + "learning_rate": 8.961695710317936e-05, + "loss": 0.9859, + "step": 65410 + }, + { + "epoch": 0.41794973359058557, + "grad_norm": 1.1306976079940796, + "learning_rate": 8.961389570999573e-05, + "loss": 0.6482, + "step": 65420 + }, + { + "epoch": 0.4180136207403243, + "grad_norm": 1.1172826290130615, + "learning_rate": 8.961083391786585e-05, + "loss": 0.9486, + "step": 65430 + }, + { + "epoch": 0.418077507890063, + "grad_norm": 0.6498112678527832, + "learning_rate": 8.960777172682063e-05, + "loss": 0.7618, + "step": 65440 + }, + { + "epoch": 0.4181413950398017, + "grad_norm": 0.7367339134216309, + "learning_rate": 8.960470913689088e-05, + "loss": 0.9973, + "step": 65450 + }, + { + "epoch": 0.4182052821895404, + "grad_norm": 0.7694912552833557, + "learning_rate": 8.960164614810744e-05, + "loss": 0.8996, + "step": 65460 + }, + { + "epoch": 0.4182691693392791, + "grad_norm": 0.9336798191070557, + "learning_rate": 8.959858276050118e-05, + "loss": 0.9093, + "step": 65470 + }, + { + "epoch": 0.4183330564890178, + "grad_norm": 0.7847772836685181, + 
"learning_rate": 8.959551897410292e-05, + "loss": 0.7734, + "step": 65480 + }, + { + "epoch": 0.4183969436387565, + "grad_norm": 0.8503950834274292, + "learning_rate": 8.959245478894353e-05, + "loss": 0.9816, + "step": 65490 + }, + { + "epoch": 0.4184608307884952, + "grad_norm": 0.9452194571495056, + "learning_rate": 8.958939020505388e-05, + "loss": 0.9682, + "step": 65500 + }, + { + "epoch": 0.4185247179382339, + "grad_norm": 0.7661617398262024, + "learning_rate": 8.95863252224648e-05, + "loss": 0.6985, + "step": 65510 + }, + { + "epoch": 0.41858860508797263, + "grad_norm": 0.4529026448726654, + "learning_rate": 8.958325984120718e-05, + "loss": 0.6564, + "step": 65520 + }, + { + "epoch": 0.4186524922377113, + "grad_norm": 2.9925787448883057, + "learning_rate": 8.958019406131191e-05, + "loss": 0.6851, + "step": 65530 + }, + { + "epoch": 0.41871637938745, + "grad_norm": 0.726306676864624, + "learning_rate": 8.957712788280982e-05, + "loss": 0.8031, + "step": 65540 + }, + { + "epoch": 0.4187802665371887, + "grad_norm": 0.6993584036827087, + "learning_rate": 8.957406130573183e-05, + "loss": 0.8313, + "step": 65550 + }, + { + "epoch": 0.4188441536869274, + "grad_norm": 0.9798833131790161, + "learning_rate": 8.957099433010881e-05, + "loss": 0.9016, + "step": 65560 + }, + { + "epoch": 0.4189080408366661, + "grad_norm": 0.7420501708984375, + "learning_rate": 8.956792695597163e-05, + "loss": 0.6753, + "step": 65570 + }, + { + "epoch": 0.4189719279864048, + "grad_norm": 0.7620697617530823, + "learning_rate": 8.95648591833512e-05, + "loss": 0.8782, + "step": 65580 + }, + { + "epoch": 0.4190358151361435, + "grad_norm": 1.2457002401351929, + "learning_rate": 8.956179101227842e-05, + "loss": 1.1031, + "step": 65590 + }, + { + "epoch": 0.4190997022858822, + "grad_norm": 1.005566120147705, + "learning_rate": 8.955872244278416e-05, + "loss": 1.1262, + "step": 65600 + }, + { + "epoch": 0.41916358943562093, + "grad_norm": 1.014225959777832, + "learning_rate": 8.955565347489935e-05, + "loss": 0.9578, + "step": 65610 + }, + { + "epoch": 0.41922747658535964, + "grad_norm": 1.1588691473007202, + "learning_rate": 8.955258410865488e-05, + "loss": 1.1571, + "step": 65620 + }, + { + "epoch": 0.41929136373509834, + "grad_norm": 0.5882399678230286, + "learning_rate": 8.954951434408168e-05, + "loss": 0.8187, + "step": 65630 + }, + { + "epoch": 0.41935525088483705, + "grad_norm": 0.5177319645881653, + "learning_rate": 8.954644418121065e-05, + "loss": 0.9707, + "step": 65640 + }, + { + "epoch": 0.4194191380345757, + "grad_norm": 1.1925745010375977, + "learning_rate": 8.954337362007273e-05, + "loss": 0.9326, + "step": 65650 + }, + { + "epoch": 0.4194830251843144, + "grad_norm": 7.771919250488281, + "learning_rate": 8.954030266069882e-05, + "loss": 0.9108, + "step": 65660 + }, + { + "epoch": 0.4195469123340531, + "grad_norm": 0.7512636184692383, + "learning_rate": 8.953723130311984e-05, + "loss": 0.7775, + "step": 65670 + }, + { + "epoch": 0.4196107994837918, + "grad_norm": 1.3332923650741577, + "learning_rate": 8.953415954736675e-05, + "loss": 1.0754, + "step": 65680 + }, + { + "epoch": 0.4196746866335305, + "grad_norm": 0.5721650719642639, + "learning_rate": 8.953108739347047e-05, + "loss": 0.9334, + "step": 65690 + }, + { + "epoch": 0.41973857378326923, + "grad_norm": 0.9686694741249084, + "learning_rate": 8.952801484146194e-05, + "loss": 1.0833, + "step": 65700 + }, + { + "epoch": 0.41980246093300794, + "grad_norm": 0.7527593374252319, + "learning_rate": 8.95249418913721e-05, + "loss": 0.9329, + "step": 65710 + }, + { 
+ "epoch": 0.41986634808274664, + "grad_norm": 0.8230323195457458, + "learning_rate": 8.95218685432319e-05, + "loss": 1.1346, + "step": 65720 + }, + { + "epoch": 0.41993023523248535, + "grad_norm": 2.3285200595855713, + "learning_rate": 8.95187947970723e-05, + "loss": 1.1473, + "step": 65730 + }, + { + "epoch": 0.41999412238222406, + "grad_norm": 0.7577224969863892, + "learning_rate": 8.951572065292424e-05, + "loss": 0.9537, + "step": 65740 + }, + { + "epoch": 0.42005800953196276, + "grad_norm": 0.5602964758872986, + "learning_rate": 8.95126461108187e-05, + "loss": 0.9735, + "step": 65750 + }, + { + "epoch": 0.42012189668170147, + "grad_norm": 0.63779217004776, + "learning_rate": 8.950957117078662e-05, + "loss": 0.7456, + "step": 65760 + }, + { + "epoch": 0.4201857838314401, + "grad_norm": 1.0952467918395996, + "learning_rate": 8.950649583285898e-05, + "loss": 0.906, + "step": 65770 + }, + { + "epoch": 0.4202496709811788, + "grad_norm": 0.687418520450592, + "learning_rate": 8.950342009706675e-05, + "loss": 0.7934, + "step": 65780 + }, + { + "epoch": 0.42031355813091753, + "grad_norm": 1.1235476732254028, + "learning_rate": 8.95003439634409e-05, + "loss": 0.9785, + "step": 65790 + }, + { + "epoch": 0.42037744528065624, + "grad_norm": 0.9413420557975769, + "learning_rate": 8.949726743201242e-05, + "loss": 0.8127, + "step": 65800 + }, + { + "epoch": 0.42044133243039494, + "grad_norm": 0.9101980924606323, + "learning_rate": 8.949419050281228e-05, + "loss": 1.1065, + "step": 65810 + }, + { + "epoch": 0.42050521958013365, + "grad_norm": 0.6493207812309265, + "learning_rate": 8.94911131758715e-05, + "loss": 0.7957, + "step": 65820 + }, + { + "epoch": 0.42056910672987236, + "grad_norm": 0.753649890422821, + "learning_rate": 8.9488035451221e-05, + "loss": 1.1482, + "step": 65830 + }, + { + "epoch": 0.42063299387961106, + "grad_norm": 0.9392029047012329, + "learning_rate": 8.948495732889185e-05, + "loss": 0.8979, + "step": 65840 + }, + { + "epoch": 0.42069688102934977, + "grad_norm": 1.159693717956543, + "learning_rate": 8.948187880891501e-05, + "loss": 0.9243, + "step": 65850 + }, + { + "epoch": 0.4207607681790885, + "grad_norm": 0.7073017358779907, + "learning_rate": 8.947879989132151e-05, + "loss": 0.7319, + "step": 65860 + }, + { + "epoch": 0.4208246553288272, + "grad_norm": 0.5303799510002136, + "learning_rate": 8.947572057614231e-05, + "loss": 0.7607, + "step": 65870 + }, + { + "epoch": 0.4208885424785659, + "grad_norm": 0.734471321105957, + "learning_rate": 8.947264086340847e-05, + "loss": 0.8881, + "step": 65880 + }, + { + "epoch": 0.42095242962830454, + "grad_norm": 0.990151047706604, + "learning_rate": 8.9469560753151e-05, + "loss": 1.0643, + "step": 65890 + }, + { + "epoch": 0.42101631677804324, + "grad_norm": 0.6194562315940857, + "learning_rate": 8.94664802454009e-05, + "loss": 1.0016, + "step": 65900 + }, + { + "epoch": 0.42108020392778195, + "grad_norm": 0.8033568263053894, + "learning_rate": 8.946339934018919e-05, + "loss": 0.9705, + "step": 65910 + }, + { + "epoch": 0.42114409107752065, + "grad_norm": 0.9492495656013489, + "learning_rate": 8.946031803754693e-05, + "loss": 0.7755, + "step": 65920 + }, + { + "epoch": 0.42120797822725936, + "grad_norm": 0.7201482057571411, + "learning_rate": 8.945723633750512e-05, + "loss": 0.9378, + "step": 65930 + }, + { + "epoch": 0.42127186537699807, + "grad_norm": 0.9046865701675415, + "learning_rate": 8.945415424009478e-05, + "loss": 0.9119, + "step": 65940 + }, + { + "epoch": 0.4213357525267368, + "grad_norm": 0.8343170881271362, + 
"learning_rate": 8.945107174534699e-05, + "loss": 0.9199, + "step": 65950 + }, + { + "epoch": 0.4213996396764755, + "grad_norm": 1.330496072769165, + "learning_rate": 8.94479888532928e-05, + "loss": 0.7923, + "step": 65960 + }, + { + "epoch": 0.4214635268262142, + "grad_norm": 0.7059889435768127, + "learning_rate": 8.94449055639632e-05, + "loss": 0.9541, + "step": 65970 + }, + { + "epoch": 0.4215274139759529, + "grad_norm": 0.8712106347084045, + "learning_rate": 8.944182187738929e-05, + "loss": 1.1176, + "step": 65980 + }, + { + "epoch": 0.4215913011256916, + "grad_norm": 0.9229239821434021, + "learning_rate": 8.943873779360213e-05, + "loss": 0.7503, + "step": 65990 + }, + { + "epoch": 0.4216551882754303, + "grad_norm": 1.1028259992599487, + "learning_rate": 8.943565331263274e-05, + "loss": 1.1007, + "step": 66000 + }, + { + "epoch": 0.42171907542516895, + "grad_norm": 0.8142592906951904, + "learning_rate": 8.943256843451221e-05, + "loss": 0.9841, + "step": 66010 + }, + { + "epoch": 0.42178296257490766, + "grad_norm": 1.184031367301941, + "learning_rate": 8.94294831592716e-05, + "loss": 0.9047, + "step": 66020 + }, + { + "epoch": 0.42184684972464637, + "grad_norm": 1.0467089414596558, + "learning_rate": 8.9426397486942e-05, + "loss": 1.0965, + "step": 66030 + }, + { + "epoch": 0.4219107368743851, + "grad_norm": 2.1014811992645264, + "learning_rate": 8.942331141755445e-05, + "loss": 0.7972, + "step": 66040 + }, + { + "epoch": 0.4219746240241238, + "grad_norm": 0.9660546183586121, + "learning_rate": 8.942022495114004e-05, + "loss": 0.8632, + "step": 66050 + }, + { + "epoch": 0.4220385111738625, + "grad_norm": 1.027685523033142, + "learning_rate": 8.941713808772986e-05, + "loss": 0.9732, + "step": 66060 + }, + { + "epoch": 0.4221023983236012, + "grad_norm": 1.0367803573608398, + "learning_rate": 8.941405082735503e-05, + "loss": 0.8984, + "step": 66070 + }, + { + "epoch": 0.4221662854733399, + "grad_norm": 0.9707443714141846, + "learning_rate": 8.941096317004658e-05, + "loss": 0.8234, + "step": 66080 + }, + { + "epoch": 0.4222301726230786, + "grad_norm": 0.9538044929504395, + "learning_rate": 8.940787511583567e-05, + "loss": 0.8863, + "step": 66090 + }, + { + "epoch": 0.4222940597728173, + "grad_norm": 0.920036256313324, + "learning_rate": 8.940478666475333e-05, + "loss": 0.9216, + "step": 66100 + }, + { + "epoch": 0.422357946922556, + "grad_norm": 1.042989730834961, + "learning_rate": 8.94016978168307e-05, + "loss": 0.9341, + "step": 66110 + }, + { + "epoch": 0.4224218340722947, + "grad_norm": 0.8969932794570923, + "learning_rate": 8.93986085720989e-05, + "loss": 0.9116, + "step": 66120 + }, + { + "epoch": 0.42248572122203343, + "grad_norm": 1.1264333724975586, + "learning_rate": 8.939551893058902e-05, + "loss": 0.8614, + "step": 66130 + }, + { + "epoch": 0.4225496083717721, + "grad_norm": 0.9502818584442139, + "learning_rate": 8.939242889233219e-05, + "loss": 0.7416, + "step": 66140 + }, + { + "epoch": 0.4226134955215108, + "grad_norm": 1.023710012435913, + "learning_rate": 8.93893384573595e-05, + "loss": 0.6771, + "step": 66150 + }, + { + "epoch": 0.4226773826712495, + "grad_norm": 0.9767735004425049, + "learning_rate": 8.938624762570213e-05, + "loss": 0.8853, + "step": 66160 + }, + { + "epoch": 0.4227412698209882, + "grad_norm": 0.9584904909133911, + "learning_rate": 8.938315639739115e-05, + "loss": 0.8563, + "step": 66170 + }, + { + "epoch": 0.4228051569707269, + "grad_norm": 0.7890828847885132, + "learning_rate": 8.938006477245773e-05, + "loss": 0.6442, + "step": 66180 + }, + { + 
"epoch": 0.4228690441204656, + "grad_norm": 0.7645769119262695, + "learning_rate": 8.937697275093298e-05, + "loss": 0.5916, + "step": 66190 + }, + { + "epoch": 0.4229329312702043, + "grad_norm": 1.5121580362319946, + "learning_rate": 8.937388033284804e-05, + "loss": 0.8618, + "step": 66200 + }, + { + "epoch": 0.422996818419943, + "grad_norm": 0.7250730395317078, + "learning_rate": 8.937078751823406e-05, + "loss": 0.7188, + "step": 66210 + }, + { + "epoch": 0.4230607055696817, + "grad_norm": 1.0403555631637573, + "learning_rate": 8.93676943071222e-05, + "loss": 0.7553, + "step": 66220 + }, + { + "epoch": 0.42312459271942043, + "grad_norm": 0.6218414902687073, + "learning_rate": 8.93646006995436e-05, + "loss": 0.921, + "step": 66230 + }, + { + "epoch": 0.42318847986915914, + "grad_norm": 0.6752848029136658, + "learning_rate": 8.93615066955294e-05, + "loss": 0.9711, + "step": 66240 + }, + { + "epoch": 0.42325236701889785, + "grad_norm": 0.8991808295249939, + "learning_rate": 8.935841229511079e-05, + "loss": 0.8345, + "step": 66250 + }, + { + "epoch": 0.4233162541686365, + "grad_norm": 0.5273988246917725, + "learning_rate": 8.935531749831892e-05, + "loss": 1.0576, + "step": 66260 + }, + { + "epoch": 0.4233801413183752, + "grad_norm": 0.6460761427879333, + "learning_rate": 8.935222230518496e-05, + "loss": 0.7446, + "step": 66270 + }, + { + "epoch": 0.4234440284681139, + "grad_norm": 0.8064502477645874, + "learning_rate": 8.934912671574007e-05, + "loss": 0.9758, + "step": 66280 + }, + { + "epoch": 0.4235079156178526, + "grad_norm": 1.1486152410507202, + "learning_rate": 8.934603073001542e-05, + "loss": 0.9056, + "step": 66290 + }, + { + "epoch": 0.4235718027675913, + "grad_norm": 0.8460824489593506, + "learning_rate": 8.934293434804221e-05, + "loss": 1.0032, + "step": 66300 + }, + { + "epoch": 0.42363568991733, + "grad_norm": 1.1093133687973022, + "learning_rate": 8.933983756985163e-05, + "loss": 0.9909, + "step": 66310 + }, + { + "epoch": 0.42369957706706873, + "grad_norm": 0.9142333269119263, + "learning_rate": 8.933674039547484e-05, + "loss": 0.7974, + "step": 66320 + }, + { + "epoch": 0.42376346421680744, + "grad_norm": 1.893848180770874, + "learning_rate": 8.933364282494304e-05, + "loss": 0.8881, + "step": 66330 + }, + { + "epoch": 0.42382735136654615, + "grad_norm": 0.6643238663673401, + "learning_rate": 8.933054485828742e-05, + "loss": 0.6162, + "step": 66340 + }, + { + "epoch": 0.42389123851628485, + "grad_norm": 0.853939950466156, + "learning_rate": 8.932744649553921e-05, + "loss": 0.8599, + "step": 66350 + }, + { + "epoch": 0.42395512566602356, + "grad_norm": 0.6014842987060547, + "learning_rate": 8.932434773672958e-05, + "loss": 0.7083, + "step": 66360 + }, + { + "epoch": 0.42401901281576226, + "grad_norm": 0.7437098622322083, + "learning_rate": 8.932124858188975e-05, + "loss": 0.8959, + "step": 66370 + }, + { + "epoch": 0.4240828999655009, + "grad_norm": 0.731093168258667, + "learning_rate": 8.931814903105092e-05, + "loss": 0.7875, + "step": 66380 + }, + { + "epoch": 0.4241467871152396, + "grad_norm": 0.7877000570297241, + "learning_rate": 8.931504908424431e-05, + "loss": 0.8525, + "step": 66390 + }, + { + "epoch": 0.4242106742649783, + "grad_norm": 0.929058849811554, + "learning_rate": 8.931194874150116e-05, + "loss": 0.7897, + "step": 66400 + }, + { + "epoch": 0.42427456141471703, + "grad_norm": 1.1878929138183594, + "learning_rate": 8.930884800285266e-05, + "loss": 0.9998, + "step": 66410 + }, + { + "epoch": 0.42433844856445574, + "grad_norm": 0.5432239174842834, + 
"learning_rate": 8.930574686833008e-05, + "loss": 0.8411, + "step": 66420 + }, + { + "epoch": 0.42440233571419445, + "grad_norm": 0.9339645504951477, + "learning_rate": 8.930264533796459e-05, + "loss": 0.9499, + "step": 66430 + }, + { + "epoch": 0.42446622286393315, + "grad_norm": 0.762392520904541, + "learning_rate": 8.929954341178749e-05, + "loss": 0.8893, + "step": 66440 + }, + { + "epoch": 0.42453011001367186, + "grad_norm": 0.6582323908805847, + "learning_rate": 8.929644108982998e-05, + "loss": 0.8409, + "step": 66450 + }, + { + "epoch": 0.42459399716341056, + "grad_norm": 0.8481007218360901, + "learning_rate": 8.92933383721233e-05, + "loss": 0.9136, + "step": 66460 + }, + { + "epoch": 0.42465788431314927, + "grad_norm": 0.6672992706298828, + "learning_rate": 8.929023525869872e-05, + "loss": 0.8445, + "step": 66470 + }, + { + "epoch": 0.424721771462888, + "grad_norm": 1.0847039222717285, + "learning_rate": 8.928713174958748e-05, + "loss": 0.9611, + "step": 66480 + }, + { + "epoch": 0.4247856586126267, + "grad_norm": 0.881767213344574, + "learning_rate": 8.928402784482084e-05, + "loss": 0.9177, + "step": 66490 + }, + { + "epoch": 0.42484954576236533, + "grad_norm": 1.421280026435852, + "learning_rate": 8.928123399227131e-05, + "loss": 0.852, + "step": 66500 + }, + { + "epoch": 0.42491343291210404, + "grad_norm": 0.6856515407562256, + "learning_rate": 8.927812933584552e-05, + "loss": 1.2407, + "step": 66510 + }, + { + "epoch": 0.42497732006184274, + "grad_norm": 0.8232982754707336, + "learning_rate": 8.927502428385498e-05, + "loss": 1.111, + "step": 66520 + }, + { + "epoch": 0.42504120721158145, + "grad_norm": 0.7256129384040833, + "learning_rate": 8.927191883633097e-05, + "loss": 0.9756, + "step": 66530 + }, + { + "epoch": 0.42510509436132016, + "grad_norm": 0.5605076551437378, + "learning_rate": 8.926881299330476e-05, + "loss": 0.8828, + "step": 66540 + }, + { + "epoch": 0.42516898151105886, + "grad_norm": 1.3462023735046387, + "learning_rate": 8.926570675480764e-05, + "loss": 0.9569, + "step": 66550 + }, + { + "epoch": 0.42523286866079757, + "grad_norm": 1.0285893678665161, + "learning_rate": 8.926260012087087e-05, + "loss": 0.9012, + "step": 66560 + }, + { + "epoch": 0.4252967558105363, + "grad_norm": 1.9040067195892334, + "learning_rate": 8.925949309152577e-05, + "loss": 1.0781, + "step": 66570 + }, + { + "epoch": 0.425360642960275, + "grad_norm": 0.7241610288619995, + "learning_rate": 8.925638566680359e-05, + "loss": 0.7973, + "step": 66580 + }, + { + "epoch": 0.4254245301100137, + "grad_norm": 1.2173702716827393, + "learning_rate": 8.925327784673564e-05, + "loss": 1.047, + "step": 66590 + }, + { + "epoch": 0.4254884172597524, + "grad_norm": 0.8426626920700073, + "learning_rate": 8.925016963135324e-05, + "loss": 0.78, + "step": 66600 + }, + { + "epoch": 0.4255523044094911, + "grad_norm": 1.3263126611709595, + "learning_rate": 8.924706102068767e-05, + "loss": 0.7994, + "step": 66610 + }, + { + "epoch": 0.42561619155922975, + "grad_norm": 1.6536914110183716, + "learning_rate": 8.924395201477025e-05, + "loss": 0.8917, + "step": 66620 + }, + { + "epoch": 0.42568007870896846, + "grad_norm": 0.7909001111984253, + "learning_rate": 8.924084261363228e-05, + "loss": 0.7676, + "step": 66630 + }, + { + "epoch": 0.42574396585870716, + "grad_norm": 0.40761637687683105, + "learning_rate": 8.923773281730505e-05, + "loss": 0.7697, + "step": 66640 + }, + { + "epoch": 0.42580785300844587, + "grad_norm": 0.8846787810325623, + "learning_rate": 8.923462262581994e-05, + "loss": 0.8687, + "step": 66650 + 
}, + { + "epoch": 0.4258717401581846, + "grad_norm": 0.4814871847629547, + "learning_rate": 8.923151203920822e-05, + "loss": 0.6312, + "step": 66660 + }, + { + "epoch": 0.4259356273079233, + "grad_norm": 0.6910040378570557, + "learning_rate": 8.922840105750124e-05, + "loss": 0.8927, + "step": 66670 + }, + { + "epoch": 0.425999514457662, + "grad_norm": 1.046462893486023, + "learning_rate": 8.922528968073032e-05, + "loss": 0.7882, + "step": 66680 + }, + { + "epoch": 0.4260634016074007, + "grad_norm": 1.2014803886413574, + "learning_rate": 8.92221779089268e-05, + "loss": 0.9165, + "step": 66690 + }, + { + "epoch": 0.4261272887571394, + "grad_norm": 0.9146868586540222, + "learning_rate": 8.921906574212202e-05, + "loss": 1.0733, + "step": 66700 + }, + { + "epoch": 0.4261911759068781, + "grad_norm": 0.8694166541099548, + "learning_rate": 8.92159531803473e-05, + "loss": 0.9912, + "step": 66710 + }, + { + "epoch": 0.4262550630566168, + "grad_norm": 0.904970645904541, + "learning_rate": 8.92131515370767e-05, + "loss": 1.1384, + "step": 66720 + }, + { + "epoch": 0.4263189502063555, + "grad_norm": 1.0510960817337036, + "learning_rate": 8.92100382249455e-05, + "loss": 0.8948, + "step": 66730 + }, + { + "epoch": 0.42638283735609417, + "grad_norm": 0.8128019571304321, + "learning_rate": 8.920692451793531e-05, + "loss": 0.7585, + "step": 66740 + }, + { + "epoch": 0.4264467245058329, + "grad_norm": 0.7644541263580322, + "learning_rate": 8.920381041607746e-05, + "loss": 1.0066, + "step": 66750 + }, + { + "epoch": 0.4265106116555716, + "grad_norm": 0.6716737151145935, + "learning_rate": 8.920069591940332e-05, + "loss": 0.7818, + "step": 66760 + }, + { + "epoch": 0.4265744988053103, + "grad_norm": 0.5078364610671997, + "learning_rate": 8.919758102794427e-05, + "loss": 1.0828, + "step": 66770 + }, + { + "epoch": 0.426638385955049, + "grad_norm": 1.3749090433120728, + "learning_rate": 8.919446574173165e-05, + "loss": 0.7222, + "step": 66780 + }, + { + "epoch": 0.4267022731047877, + "grad_norm": 0.9173924922943115, + "learning_rate": 8.919135006079686e-05, + "loss": 0.9544, + "step": 66790 + }, + { + "epoch": 0.4267661602545264, + "grad_norm": 2.012134552001953, + "learning_rate": 8.918823398517127e-05, + "loss": 0.805, + "step": 66800 + }, + { + "epoch": 0.4268300474042651, + "grad_norm": 1.9749096632003784, + "learning_rate": 8.918511751488627e-05, + "loss": 0.9767, + "step": 66810 + }, + { + "epoch": 0.4268939345540038, + "grad_norm": 1.5169198513031006, + "learning_rate": 8.918200064997324e-05, + "loss": 1.0532, + "step": 66820 + }, + { + "epoch": 0.4269578217037425, + "grad_norm": 0.8941536545753479, + "learning_rate": 8.917888339046354e-05, + "loss": 0.9049, + "step": 66830 + }, + { + "epoch": 0.42702170885348123, + "grad_norm": 0.7928354144096375, + "learning_rate": 8.917576573638862e-05, + "loss": 0.7091, + "step": 66840 + }, + { + "epoch": 0.42708559600321994, + "grad_norm": 0.7303599119186401, + "learning_rate": 8.917264768777983e-05, + "loss": 0.9175, + "step": 66850 + }, + { + "epoch": 0.4271494831529586, + "grad_norm": 0.6727188229560852, + "learning_rate": 8.91695292446686e-05, + "loss": 0.9231, + "step": 66860 + }, + { + "epoch": 0.4272133703026973, + "grad_norm": 0.5279465913772583, + "learning_rate": 8.91664104070863e-05, + "loss": 0.7118, + "step": 66870 + }, + { + "epoch": 0.427277257452436, + "grad_norm": 0.7597615122795105, + "learning_rate": 8.916329117506439e-05, + "loss": 0.9777, + "step": 66880 + }, + { + "epoch": 0.4273411446021747, + "grad_norm": 1.0109660625457764, + "learning_rate": 
8.916017154863425e-05, + "loss": 0.8774, + "step": 66890 + }, + { + "epoch": 0.4274050317519134, + "grad_norm": 0.7892023324966431, + "learning_rate": 8.91570515278273e-05, + "loss": 1.0988, + "step": 66900 + }, + { + "epoch": 0.4274689189016521, + "grad_norm": 0.7960211038589478, + "learning_rate": 8.915393111267496e-05, + "loss": 0.8625, + "step": 66910 + }, + { + "epoch": 0.4275328060513908, + "grad_norm": 0.9462035894393921, + "learning_rate": 8.915081030320867e-05, + "loss": 0.9255, + "step": 66920 + }, + { + "epoch": 0.42759669320112953, + "grad_norm": 0.6910783648490906, + "learning_rate": 8.914768909945985e-05, + "loss": 0.695, + "step": 66930 + }, + { + "epoch": 0.42766058035086824, + "grad_norm": 0.8994881510734558, + "learning_rate": 8.914456750145991e-05, + "loss": 0.9296, + "step": 66940 + }, + { + "epoch": 0.42772446750060694, + "grad_norm": 0.7058424949645996, + "learning_rate": 8.914144550924034e-05, + "loss": 0.9154, + "step": 66950 + }, + { + "epoch": 0.42778835465034565, + "grad_norm": 0.9544531106948853, + "learning_rate": 8.913832312283254e-05, + "loss": 1.0751, + "step": 66960 + }, + { + "epoch": 0.42785224180008435, + "grad_norm": 1.3822722434997559, + "learning_rate": 8.913520034226797e-05, + "loss": 0.9816, + "step": 66970 + }, + { + "epoch": 0.42791612894982306, + "grad_norm": 0.7017986178398132, + "learning_rate": 8.913207716757807e-05, + "loss": 0.7602, + "step": 66980 + }, + { + "epoch": 0.4279800160995617, + "grad_norm": 0.4731631577014923, + "learning_rate": 8.912895359879431e-05, + "loss": 1.0528, + "step": 66990 + }, + { + "epoch": 0.4280439032493004, + "grad_norm": 0.7982561588287354, + "learning_rate": 8.912582963594813e-05, + "loss": 0.831, + "step": 67000 + }, + { + "epoch": 0.4281077903990391, + "grad_norm": 1.1041196584701538, + "learning_rate": 8.912270527907099e-05, + "loss": 0.8662, + "step": 67010 + }, + { + "epoch": 0.42817167754877783, + "grad_norm": 1.0753505229949951, + "learning_rate": 8.911958052819436e-05, + "loss": 0.8874, + "step": 67020 + }, + { + "epoch": 0.42823556469851654, + "grad_norm": 1.0988258123397827, + "learning_rate": 8.911645538334971e-05, + "loss": 1.1105, + "step": 67030 + }, + { + "epoch": 0.42829945184825524, + "grad_norm": 3.2240488529205322, + "learning_rate": 8.911332984456854e-05, + "loss": 0.8623, + "step": 67040 + }, + { + "epoch": 0.42836333899799395, + "grad_norm": 0.5571461915969849, + "learning_rate": 8.911020391188229e-05, + "loss": 0.8196, + "step": 67050 + }, + { + "epoch": 0.42842722614773265, + "grad_norm": 1.0421580076217651, + "learning_rate": 8.910707758532244e-05, + "loss": 0.8394, + "step": 67060 + }, + { + "epoch": 0.42849111329747136, + "grad_norm": 1.025112509727478, + "learning_rate": 8.91039508649205e-05, + "loss": 0.7116, + "step": 67070 + }, + { + "epoch": 0.42855500044721007, + "grad_norm": 1.0207161903381348, + "learning_rate": 8.910082375070792e-05, + "loss": 1.3015, + "step": 67080 + }, + { + "epoch": 0.42861888759694877, + "grad_norm": 0.5158314108848572, + "learning_rate": 8.909769624271625e-05, + "loss": 0.873, + "step": 67090 + }, + { + "epoch": 0.4286827747466875, + "grad_norm": 2.0511646270751953, + "learning_rate": 8.909456834097693e-05, + "loss": 0.7935, + "step": 67100 + }, + { + "epoch": 0.42874666189642613, + "grad_norm": 0.5745459198951721, + "learning_rate": 8.909144004552148e-05, + "loss": 1.0678, + "step": 67110 + }, + { + "epoch": 0.42881054904616484, + "grad_norm": 1.3202085494995117, + "learning_rate": 8.908831135638143e-05, + "loss": 0.6992, + "step": 67120 + }, + { + 
"epoch": 0.42887443619590354, + "grad_norm": 1.3148435354232788, + "learning_rate": 8.908518227358826e-05, + "loss": 0.7438, + "step": 67130 + }, + { + "epoch": 0.42893832334564225, + "grad_norm": 1.0374261140823364, + "learning_rate": 8.908205279717349e-05, + "loss": 0.9165, + "step": 67140 + }, + { + "epoch": 0.42900221049538095, + "grad_norm": 1.0254135131835938, + "learning_rate": 8.907892292716864e-05, + "loss": 1.087, + "step": 67150 + }, + { + "epoch": 0.42906609764511966, + "grad_norm": 0.7220088839530945, + "learning_rate": 8.907579266360523e-05, + "loss": 1.0477, + "step": 67160 + }, + { + "epoch": 0.42912998479485837, + "grad_norm": 0.8464503884315491, + "learning_rate": 8.907266200651478e-05, + "loss": 0.9686, + "step": 67170 + }, + { + "epoch": 0.42919387194459707, + "grad_norm": 0.6220828890800476, + "learning_rate": 8.906953095592882e-05, + "loss": 0.7929, + "step": 67180 + }, + { + "epoch": 0.4292577590943358, + "grad_norm": 0.8189134001731873, + "learning_rate": 8.906639951187889e-05, + "loss": 0.9221, + "step": 67190 + }, + { + "epoch": 0.4293216462440745, + "grad_norm": 0.879299521446228, + "learning_rate": 8.906326767439651e-05, + "loss": 0.8583, + "step": 67200 + }, + { + "epoch": 0.4293855333938132, + "grad_norm": 0.9902644157409668, + "learning_rate": 8.906013544351323e-05, + "loss": 0.8649, + "step": 67210 + }, + { + "epoch": 0.4294494205435519, + "grad_norm": 0.727925717830658, + "learning_rate": 8.905700281926061e-05, + "loss": 0.8093, + "step": 67220 + }, + { + "epoch": 0.42951330769329055, + "grad_norm": 0.6252252459526062, + "learning_rate": 8.905386980167016e-05, + "loss": 0.7309, + "step": 67230 + }, + { + "epoch": 0.42957719484302925, + "grad_norm": 1.9642329216003418, + "learning_rate": 8.905073639077347e-05, + "loss": 0.9235, + "step": 67240 + }, + { + "epoch": 0.42964108199276796, + "grad_norm": 0.7746663689613342, + "learning_rate": 8.904760258660208e-05, + "loss": 0.9314, + "step": 67250 + }, + { + "epoch": 0.42970496914250667, + "grad_norm": 0.423170804977417, + "learning_rate": 8.904446838918754e-05, + "loss": 0.9009, + "step": 67260 + }, + { + "epoch": 0.42976885629224537, + "grad_norm": 1.2594034671783447, + "learning_rate": 8.904133379856143e-05, + "loss": 0.9342, + "step": 67270 + }, + { + "epoch": 0.4298327434419841, + "grad_norm": 0.9244500994682312, + "learning_rate": 8.903819881475532e-05, + "loss": 0.9128, + "step": 67280 + }, + { + "epoch": 0.4298966305917228, + "grad_norm": 0.9682210683822632, + "learning_rate": 8.903506343780077e-05, + "loss": 0.8821, + "step": 67290 + }, + { + "epoch": 0.4299605177414615, + "grad_norm": 1.104791283607483, + "learning_rate": 8.903192766772936e-05, + "loss": 1.0183, + "step": 67300 + }, + { + "epoch": 0.4300244048912002, + "grad_norm": 1.1504932641983032, + "learning_rate": 8.902879150457269e-05, + "loss": 0.7472, + "step": 67310 + }, + { + "epoch": 0.4300882920409389, + "grad_norm": 0.5592100024223328, + "learning_rate": 8.90256549483623e-05, + "loss": 0.8123, + "step": 67320 + }, + { + "epoch": 0.4301521791906776, + "grad_norm": 1.0708913803100586, + "learning_rate": 8.902251799912981e-05, + "loss": 0.7882, + "step": 67330 + }, + { + "epoch": 0.4302160663404163, + "grad_norm": 0.6294905543327332, + "learning_rate": 8.90193806569068e-05, + "loss": 0.8449, + "step": 67340 + }, + { + "epoch": 0.43027995349015496, + "grad_norm": 1.0562630891799927, + "learning_rate": 8.901624292172488e-05, + "loss": 1.2612, + "step": 67350 + }, + { + "epoch": 0.43034384063989367, + "grad_norm": 0.6391942501068115, + 
"learning_rate": 8.901310479361564e-05, + "loss": 0.9626, + "step": 67360 + }, + { + "epoch": 0.4304077277896324, + "grad_norm": 0.8884569406509399, + "learning_rate": 8.900996627261067e-05, + "loss": 0.9499, + "step": 67370 + }, + { + "epoch": 0.4304716149393711, + "grad_norm": 1.3086752891540527, + "learning_rate": 8.90068273587416e-05, + "loss": 0.9847, + "step": 67380 + }, + { + "epoch": 0.4305355020891098, + "grad_norm": 0.8015036582946777, + "learning_rate": 8.900368805204003e-05, + "loss": 0.9094, + "step": 67390 + }, + { + "epoch": 0.4305993892388485, + "grad_norm": 0.5839217901229858, + "learning_rate": 8.900054835253758e-05, + "loss": 0.9917, + "step": 67400 + }, + { + "epoch": 0.4306632763885872, + "grad_norm": 1.5205440521240234, + "learning_rate": 8.899740826026587e-05, + "loss": 0.7, + "step": 67410 + }, + { + "epoch": 0.4307271635383259, + "grad_norm": 0.9681718349456787, + "learning_rate": 8.899426777525653e-05, + "loss": 0.7742, + "step": 67420 + }, + { + "epoch": 0.4307910506880646, + "grad_norm": 0.8119606375694275, + "learning_rate": 8.899112689754117e-05, + "loss": 0.8792, + "step": 67430 + }, + { + "epoch": 0.4308549378378033, + "grad_norm": 0.8435991406440735, + "learning_rate": 8.898798562715142e-05, + "loss": 1.0099, + "step": 67440 + }, + { + "epoch": 0.430918824987542, + "grad_norm": 0.4675378203392029, + "learning_rate": 8.898484396411894e-05, + "loss": 0.8346, + "step": 67450 + }, + { + "epoch": 0.43098271213728073, + "grad_norm": 0.8612586855888367, + "learning_rate": 8.898170190847535e-05, + "loss": 0.7461, + "step": 67460 + }, + { + "epoch": 0.4310465992870194, + "grad_norm": 0.769745409488678, + "learning_rate": 8.897855946025228e-05, + "loss": 0.9233, + "step": 67470 + }, + { + "epoch": 0.4311104864367581, + "grad_norm": 1.4678987264633179, + "learning_rate": 8.897541661948142e-05, + "loss": 0.7533, + "step": 67480 + }, + { + "epoch": 0.4311743735864968, + "grad_norm": 1.0737018585205078, + "learning_rate": 8.897227338619438e-05, + "loss": 0.6886, + "step": 67490 + }, + { + "epoch": 0.4312382607362355, + "grad_norm": 0.6413093209266663, + "learning_rate": 8.896912976042285e-05, + "loss": 0.9434, + "step": 67500 + }, + { + "epoch": 0.4313021478859742, + "grad_norm": 0.8239714503288269, + "learning_rate": 8.896598574219845e-05, + "loss": 0.904, + "step": 67510 + }, + { + "epoch": 0.4313660350357129, + "grad_norm": 0.8861196041107178, + "learning_rate": 8.896284133155288e-05, + "loss": 0.909, + "step": 67520 + }, + { + "epoch": 0.4314299221854516, + "grad_norm": 0.7210700511932373, + "learning_rate": 8.895969652851778e-05, + "loss": 0.9084, + "step": 67530 + }, + { + "epoch": 0.4314938093351903, + "grad_norm": 1.172956943511963, + "learning_rate": 8.895655133312483e-05, + "loss": 0.9011, + "step": 67540 + }, + { + "epoch": 0.43155769648492903, + "grad_norm": 0.9112328886985779, + "learning_rate": 8.895340574540571e-05, + "loss": 0.7824, + "step": 67550 + }, + { + "epoch": 0.43162158363466774, + "grad_norm": 1.0518110990524292, + "learning_rate": 8.895025976539209e-05, + "loss": 1.2023, + "step": 67560 + }, + { + "epoch": 0.43168547078440644, + "grad_norm": 0.8246524930000305, + "learning_rate": 8.894711339311567e-05, + "loss": 1.0688, + "step": 67570 + }, + { + "epoch": 0.43174935793414515, + "grad_norm": 0.9622389078140259, + "learning_rate": 8.894396662860811e-05, + "loss": 0.6852, + "step": 67580 + }, + { + "epoch": 0.4318132450838838, + "grad_norm": 0.7277495265007019, + "learning_rate": 8.894081947190112e-05, + "loss": 0.8892, + "step": 67590 + }, + { + 
"epoch": 0.4318771322336225, + "grad_norm": 0.5220545530319214, + "learning_rate": 8.893767192302639e-05, + "loss": 0.7688, + "step": 67600 + }, + { + "epoch": 0.4319410193833612, + "grad_norm": 0.7757664322853088, + "learning_rate": 8.893452398201561e-05, + "loss": 0.9166, + "step": 67610 + }, + { + "epoch": 0.4320049065330999, + "grad_norm": 1.4959086179733276, + "learning_rate": 8.89313756489005e-05, + "loss": 1.0149, + "step": 67620 + }, + { + "epoch": 0.4320687936828386, + "grad_norm": 1.3954126834869385, + "learning_rate": 8.892822692371277e-05, + "loss": 0.802, + "step": 67630 + }, + { + "epoch": 0.43213268083257733, + "grad_norm": 0.7430549263954163, + "learning_rate": 8.89250778064841e-05, + "loss": 0.8956, + "step": 67640 + }, + { + "epoch": 0.43219656798231604, + "grad_norm": 1.650481939315796, + "learning_rate": 8.892192829724621e-05, + "loss": 1.1669, + "step": 67650 + }, + { + "epoch": 0.43226045513205474, + "grad_norm": 1.7691149711608887, + "learning_rate": 8.891877839603085e-05, + "loss": 1.1042, + "step": 67660 + }, + { + "epoch": 0.43232434228179345, + "grad_norm": 0.6639987230300903, + "learning_rate": 8.891562810286971e-05, + "loss": 0.8842, + "step": 67670 + }, + { + "epoch": 0.43238822943153216, + "grad_norm": 0.9073365926742554, + "learning_rate": 8.891247741779454e-05, + "loss": 1.1714, + "step": 67680 + }, + { + "epoch": 0.43245211658127086, + "grad_norm": 1.1349682807922363, + "learning_rate": 8.890932634083704e-05, + "loss": 0.9899, + "step": 67690 + }, + { + "epoch": 0.43251600373100957, + "grad_norm": 0.7573813796043396, + "learning_rate": 8.890617487202899e-05, + "loss": 0.8316, + "step": 67700 + }, + { + "epoch": 0.4325798908807482, + "grad_norm": 0.7431557178497314, + "learning_rate": 8.890302301140208e-05, + "loss": 0.8598, + "step": 67710 + }, + { + "epoch": 0.4326437780304869, + "grad_norm": 0.6789889931678772, + "learning_rate": 8.889987075898807e-05, + "loss": 1.1971, + "step": 67720 + }, + { + "epoch": 0.43270766518022563, + "grad_norm": 0.5719479322433472, + "learning_rate": 8.889671811481872e-05, + "loss": 0.6596, + "step": 67730 + }, + { + "epoch": 0.43277155232996434, + "grad_norm": 0.8801824450492859, + "learning_rate": 8.889356507892575e-05, + "loss": 0.8168, + "step": 67740 + }, + { + "epoch": 0.43283543947970304, + "grad_norm": 1.7005552053451538, + "learning_rate": 8.889041165134096e-05, + "loss": 0.8598, + "step": 67750 + }, + { + "epoch": 0.43289932662944175, + "grad_norm": 0.5636479258537292, + "learning_rate": 8.888725783209606e-05, + "loss": 0.7868, + "step": 67760 + }, + { + "epoch": 0.43296321377918046, + "grad_norm": 0.9649848937988281, + "learning_rate": 8.888410362122283e-05, + "loss": 0.8729, + "step": 67770 + }, + { + "epoch": 0.43302710092891916, + "grad_norm": 1.2120856046676636, + "learning_rate": 8.888094901875303e-05, + "loss": 1.1061, + "step": 67780 + }, + { + "epoch": 0.43309098807865787, + "grad_norm": 1.1897577047348022, + "learning_rate": 8.887779402471846e-05, + "loss": 0.8963, + "step": 67790 + }, + { + "epoch": 0.4331548752283966, + "grad_norm": 0.8927859663963318, + "learning_rate": 8.887463863915087e-05, + "loss": 0.985, + "step": 67800 + }, + { + "epoch": 0.4332187623781353, + "grad_norm": 1.1183792352676392, + "learning_rate": 8.887148286208202e-05, + "loss": 1.0094, + "step": 67810 + }, + { + "epoch": 0.433282649527874, + "grad_norm": 1.071887731552124, + "learning_rate": 8.886832669354372e-05, + "loss": 0.8359, + "step": 67820 + }, + { + "epoch": 0.4333465366776127, + "grad_norm": 0.6402618885040283, + 
"learning_rate": 8.886517013356774e-05, + "loss": 1.0026, + "step": 67830 + }, + { + "epoch": 0.43341042382735134, + "grad_norm": 1.0560641288757324, + "learning_rate": 8.886201318218587e-05, + "loss": 0.7045, + "step": 67840 + }, + { + "epoch": 0.43347431097709005, + "grad_norm": 0.9585883021354675, + "learning_rate": 8.88588558394299e-05, + "loss": 0.9572, + "step": 67850 + }, + { + "epoch": 0.43353819812682876, + "grad_norm": 0.7981050610542297, + "learning_rate": 8.885569810533166e-05, + "loss": 0.7819, + "step": 67860 + }, + { + "epoch": 0.43360208527656746, + "grad_norm": 1.467461347579956, + "learning_rate": 8.88525399799229e-05, + "loss": 0.9183, + "step": 67870 + }, + { + "epoch": 0.43366597242630617, + "grad_norm": 0.9360789060592651, + "learning_rate": 8.884938146323546e-05, + "loss": 1.0038, + "step": 67880 + }, + { + "epoch": 0.4337298595760449, + "grad_norm": 1.3165303468704224, + "learning_rate": 8.884622255530116e-05, + "loss": 0.8743, + "step": 67890 + }, + { + "epoch": 0.4337937467257836, + "grad_norm": 1.1677271127700806, + "learning_rate": 8.884306325615174e-05, + "loss": 1.0382, + "step": 67900 + }, + { + "epoch": 0.4338576338755223, + "grad_norm": 1.1823782920837402, + "learning_rate": 8.883990356581911e-05, + "loss": 0.8917, + "step": 67910 + }, + { + "epoch": 0.433921521025261, + "grad_norm": 0.8433313369750977, + "learning_rate": 8.883674348433504e-05, + "loss": 0.8236, + "step": 67920 + }, + { + "epoch": 0.4339854081749997, + "grad_norm": 1.1049748659133911, + "learning_rate": 8.883358301173138e-05, + "loss": 0.7639, + "step": 67930 + }, + { + "epoch": 0.4340492953247384, + "grad_norm": 0.8020467162132263, + "learning_rate": 8.883042214803991e-05, + "loss": 0.9805, + "step": 67940 + }, + { + "epoch": 0.4341131824744771, + "grad_norm": 0.5183336734771729, + "learning_rate": 8.882726089329252e-05, + "loss": 0.9406, + "step": 67950 + }, + { + "epoch": 0.43417706962421576, + "grad_norm": 0.9549485445022583, + "learning_rate": 8.882409924752102e-05, + "loss": 0.7904, + "step": 67960 + }, + { + "epoch": 0.43424095677395447, + "grad_norm": 0.9031966924667358, + "learning_rate": 8.882093721075724e-05, + "loss": 0.6085, + "step": 67970 + }, + { + "epoch": 0.4343048439236932, + "grad_norm": 0.7417629957199097, + "learning_rate": 8.881777478303306e-05, + "loss": 0.979, + "step": 67980 + }, + { + "epoch": 0.4343687310734319, + "grad_norm": 0.742239236831665, + "learning_rate": 8.881461196438027e-05, + "loss": 1.0707, + "step": 67990 + }, + { + "epoch": 0.4344326182231706, + "grad_norm": 1.0497804880142212, + "learning_rate": 8.88114487548308e-05, + "loss": 0.8717, + "step": 68000 + }, + { + "epoch": 0.4344965053729093, + "grad_norm": 0.7527285814285278, + "learning_rate": 8.880828515441643e-05, + "loss": 1.0762, + "step": 68010 + }, + { + "epoch": 0.434560392522648, + "grad_norm": 0.8218625783920288, + "learning_rate": 8.880512116316908e-05, + "loss": 1.0556, + "step": 68020 + }, + { + "epoch": 0.4346242796723867, + "grad_norm": 1.8415364027023315, + "learning_rate": 8.880195678112058e-05, + "loss": 1.1582, + "step": 68030 + }, + { + "epoch": 0.4346881668221254, + "grad_norm": 0.6465769410133362, + "learning_rate": 8.87987920083028e-05, + "loss": 1.0762, + "step": 68040 + }, + { + "epoch": 0.4347520539718641, + "grad_norm": 0.6471286416053772, + "learning_rate": 8.879562684474762e-05, + "loss": 1.2511, + "step": 68050 + }, + { + "epoch": 0.4348159411216028, + "grad_norm": 0.6721779704093933, + "learning_rate": 8.879246129048693e-05, + "loss": 0.8825, + "step": 68060 + }, + { 
+ "epoch": 0.43487982827134153, + "grad_norm": 0.8682761788368225, + "learning_rate": 8.878929534555259e-05, + "loss": 1.0418, + "step": 68070 + }, + { + "epoch": 0.4349437154210802, + "grad_norm": 0.7083001732826233, + "learning_rate": 8.878612900997648e-05, + "loss": 0.9285, + "step": 68080 + }, + { + "epoch": 0.4350076025708189, + "grad_norm": 0.7909469604492188, + "learning_rate": 8.878296228379048e-05, + "loss": 0.9, + "step": 68090 + }, + { + "epoch": 0.4350714897205576, + "grad_norm": 0.7747198939323425, + "learning_rate": 8.877979516702651e-05, + "loss": 0.7877, + "step": 68100 + }, + { + "epoch": 0.4351353768702963, + "grad_norm": 1.1311992406845093, + "learning_rate": 8.877662765971646e-05, + "loss": 0.9031, + "step": 68110 + }, + { + "epoch": 0.435199264020035, + "grad_norm": 0.8452590107917786, + "learning_rate": 8.877345976189223e-05, + "loss": 0.8362, + "step": 68120 + }, + { + "epoch": 0.4352631511697737, + "grad_norm": 1.3919566869735718, + "learning_rate": 8.877029147358571e-05, + "loss": 0.8168, + "step": 68130 + }, + { + "epoch": 0.4353270383195124, + "grad_norm": 1.0793455839157104, + "learning_rate": 8.87671227948288e-05, + "loss": 0.6657, + "step": 68140 + }, + { + "epoch": 0.4353909254692511, + "grad_norm": 0.6547946929931641, + "learning_rate": 8.876395372565344e-05, + "loss": 1.1194, + "step": 68150 + }, + { + "epoch": 0.43545481261898983, + "grad_norm": 1.289340615272522, + "learning_rate": 8.876078426609153e-05, + "loss": 1.0495, + "step": 68160 + }, + { + "epoch": 0.43551869976872853, + "grad_norm": 1.287331223487854, + "learning_rate": 8.875761441617498e-05, + "loss": 0.8023, + "step": 68170 + }, + { + "epoch": 0.43558258691846724, + "grad_norm": 1.054658055305481, + "learning_rate": 8.875444417593574e-05, + "loss": 0.8072, + "step": 68180 + }, + { + "epoch": 0.43564647406820595, + "grad_norm": 1.5471371412277222, + "learning_rate": 8.87512735454057e-05, + "loss": 0.8984, + "step": 68190 + }, + { + "epoch": 0.4357103612179446, + "grad_norm": 0.9853270649909973, + "learning_rate": 8.874810252461683e-05, + "loss": 0.8457, + "step": 68200 + }, + { + "epoch": 0.4357742483676833, + "grad_norm": 0.8379093408584595, + "learning_rate": 8.874493111360103e-05, + "loss": 1.0092, + "step": 68210 + }, + { + "epoch": 0.435838135517422, + "grad_norm": 0.6254721879959106, + "learning_rate": 8.874175931239026e-05, + "loss": 0.792, + "step": 68220 + }, + { + "epoch": 0.4359020226671607, + "grad_norm": 0.5673577189445496, + "learning_rate": 8.873858712101645e-05, + "loss": 0.7041, + "step": 68230 + }, + { + "epoch": 0.4359659098168994, + "grad_norm": 0.8581469058990479, + "learning_rate": 8.873541453951157e-05, + "loss": 1.118, + "step": 68240 + }, + { + "epoch": 0.43602979696663813, + "grad_norm": 0.7700116634368896, + "learning_rate": 8.873224156790754e-05, + "loss": 0.9587, + "step": 68250 + }, + { + "epoch": 0.43609368411637683, + "grad_norm": 1.4901466369628906, + "learning_rate": 8.872906820623634e-05, + "loss": 0.9082, + "step": 68260 + }, + { + "epoch": 0.43615757126611554, + "grad_norm": 1.1333754062652588, + "learning_rate": 8.872589445452991e-05, + "loss": 0.8202, + "step": 68270 + }, + { + "epoch": 0.43622145841585425, + "grad_norm": 0.4992083013057709, + "learning_rate": 8.872272031282022e-05, + "loss": 0.8428, + "step": 68280 + }, + { + "epoch": 0.43628534556559295, + "grad_norm": 0.7288440465927124, + "learning_rate": 8.871954578113925e-05, + "loss": 0.7839, + "step": 68290 + }, + { + "epoch": 0.43634923271533166, + "grad_norm": 1.4860522747039795, + 
"learning_rate": 8.871637085951894e-05, + "loss": 0.9678, + "step": 68300 + }, + { + "epoch": 0.43641311986507036, + "grad_norm": 0.8923503756523132, + "learning_rate": 8.87131955479913e-05, + "loss": 0.9322, + "step": 68310 + }, + { + "epoch": 0.436477007014809, + "grad_norm": 1.1527504920959473, + "learning_rate": 8.871001984658826e-05, + "loss": 1.0341, + "step": 68320 + }, + { + "epoch": 0.4365408941645477, + "grad_norm": 0.9049966931343079, + "learning_rate": 8.870684375534185e-05, + "loss": 1.0123, + "step": 68330 + }, + { + "epoch": 0.4366047813142864, + "grad_norm": 0.6281135678291321, + "learning_rate": 8.870366727428404e-05, + "loss": 0.9563, + "step": 68340 + }, + { + "epoch": 0.43666866846402513, + "grad_norm": 0.6897270679473877, + "learning_rate": 8.870049040344682e-05, + "loss": 0.8434, + "step": 68350 + }, + { + "epoch": 0.43673255561376384, + "grad_norm": 1.3322041034698486, + "learning_rate": 8.869731314286215e-05, + "loss": 1.0403, + "step": 68360 + }, + { + "epoch": 0.43679644276350255, + "grad_norm": 0.9318044781684875, + "learning_rate": 8.869413549256209e-05, + "loss": 0.8422, + "step": 68370 + }, + { + "epoch": 0.43686032991324125, + "grad_norm": 0.8586065769195557, + "learning_rate": 8.86909574525786e-05, + "loss": 0.8878, + "step": 68380 + }, + { + "epoch": 0.43692421706297996, + "grad_norm": 1.9818271398544312, + "learning_rate": 8.86877790229437e-05, + "loss": 0.7168, + "step": 68390 + }, + { + "epoch": 0.43698810421271866, + "grad_norm": 0.7556184530258179, + "learning_rate": 8.868460020368941e-05, + "loss": 1.0074, + "step": 68400 + }, + { + "epoch": 0.43705199136245737, + "grad_norm": 0.5547859072685242, + "learning_rate": 8.868142099484771e-05, + "loss": 0.9824, + "step": 68410 + }, + { + "epoch": 0.4371158785121961, + "grad_norm": 0.8270901441574097, + "learning_rate": 8.867824139645063e-05, + "loss": 0.6677, + "step": 68420 + }, + { + "epoch": 0.4371797656619348, + "grad_norm": 0.7375511527061462, + "learning_rate": 8.867506140853021e-05, + "loss": 0.8542, + "step": 68430 + }, + { + "epoch": 0.43724365281167343, + "grad_norm": 0.879522979259491, + "learning_rate": 8.867188103111845e-05, + "loss": 0.8551, + "step": 68440 + }, + { + "epoch": 0.43730753996141214, + "grad_norm": 1.1079013347625732, + "learning_rate": 8.866870026424741e-05, + "loss": 1.1122, + "step": 68450 + }, + { + "epoch": 0.43737142711115085, + "grad_norm": 1.25412917137146, + "learning_rate": 8.86655191079491e-05, + "loss": 0.8394, + "step": 68460 + }, + { + "epoch": 0.43743531426088955, + "grad_norm": 0.7833040952682495, + "learning_rate": 8.866233756225555e-05, + "loss": 0.8275, + "step": 68470 + }, + { + "epoch": 0.43749920141062826, + "grad_norm": 1.0346733331680298, + "learning_rate": 8.865915562719882e-05, + "loss": 0.8503, + "step": 68480 + }, + { + "epoch": 0.43756308856036696, + "grad_norm": 0.9302981495857239, + "learning_rate": 8.865597330281096e-05, + "loss": 0.7965, + "step": 68490 + }, + { + "epoch": 0.43762697571010567, + "grad_norm": 0.8941460251808167, + "learning_rate": 8.8652790589124e-05, + "loss": 0.7823, + "step": 68500 + }, + { + "epoch": 0.4376908628598444, + "grad_norm": 0.7403380870819092, + "learning_rate": 8.864960748617e-05, + "loss": 0.9164, + "step": 68510 + }, + { + "epoch": 0.4377547500095831, + "grad_norm": 1.2985106706619263, + "learning_rate": 8.8646423993981e-05, + "loss": 0.9005, + "step": 68520 + }, + { + "epoch": 0.4378186371593218, + "grad_norm": 0.5682730078697205, + "learning_rate": 8.864324011258908e-05, + "loss": 0.8248, + "step": 68530 + }, + 
{ + "epoch": 0.4378825243090605, + "grad_norm": 1.3618555068969727, + "learning_rate": 8.864005584202632e-05, + "loss": 1.1664, + "step": 68540 + }, + { + "epoch": 0.4379464114587992, + "grad_norm": 0.6019179224967957, + "learning_rate": 8.863687118232475e-05, + "loss": 0.8097, + "step": 68550 + }, + { + "epoch": 0.43801029860853785, + "grad_norm": 1.4094189405441284, + "learning_rate": 8.863368613351648e-05, + "loss": 0.7467, + "step": 68560 + }, + { + "epoch": 0.43807418575827656, + "grad_norm": 1.509199857711792, + "learning_rate": 8.863050069563355e-05, + "loss": 0.9534, + "step": 68570 + }, + { + "epoch": 0.43813807290801526, + "grad_norm": 1.1251524686813354, + "learning_rate": 8.862731486870808e-05, + "loss": 0.9511, + "step": 68580 + }, + { + "epoch": 0.43820196005775397, + "grad_norm": 0.9050050973892212, + "learning_rate": 8.862412865277211e-05, + "loss": 0.9554, + "step": 68590 + }, + { + "epoch": 0.4382658472074927, + "grad_norm": 0.6649369597434998, + "learning_rate": 8.862094204785776e-05, + "loss": 0.8778, + "step": 68600 + }, + { + "epoch": 0.4383297343572314, + "grad_norm": 0.7536949515342712, + "learning_rate": 8.86177550539971e-05, + "loss": 0.8764, + "step": 68610 + }, + { + "epoch": 0.4383936215069701, + "grad_norm": 0.898378312587738, + "learning_rate": 8.861456767122226e-05, + "loss": 1.0107, + "step": 68620 + }, + { + "epoch": 0.4384575086567088, + "grad_norm": 2.269949436187744, + "learning_rate": 8.861137989956529e-05, + "loss": 0.7672, + "step": 68630 + }, + { + "epoch": 0.4385213958064475, + "grad_norm": 6.667402267456055, + "learning_rate": 8.860819173905835e-05, + "loss": 0.7432, + "step": 68640 + }, + { + "epoch": 0.4385852829561862, + "grad_norm": 0.7865056395530701, + "learning_rate": 8.860500318973351e-05, + "loss": 0.9813, + "step": 68650 + }, + { + "epoch": 0.4386491701059249, + "grad_norm": 2.528974771499634, + "learning_rate": 8.860181425162287e-05, + "loss": 0.8215, + "step": 68660 + }, + { + "epoch": 0.4387130572556636, + "grad_norm": 0.5087980031967163, + "learning_rate": 8.859862492475858e-05, + "loss": 1.2262, + "step": 68670 + }, + { + "epoch": 0.4387769444054023, + "grad_norm": 1.1823939085006714, + "learning_rate": 8.859543520917275e-05, + "loss": 0.7388, + "step": 68680 + }, + { + "epoch": 0.438840831555141, + "grad_norm": 0.7431660294532776, + "learning_rate": 8.859224510489747e-05, + "loss": 0.6863, + "step": 68690 + }, + { + "epoch": 0.4389047187048797, + "grad_norm": 1.038490653038025, + "learning_rate": 8.858905461196492e-05, + "loss": 0.853, + "step": 68700 + }, + { + "epoch": 0.4389686058546184, + "grad_norm": 0.9958590269088745, + "learning_rate": 8.85858637304072e-05, + "loss": 0.8119, + "step": 68710 + }, + { + "epoch": 0.4390324930043571, + "grad_norm": 0.6802636384963989, + "learning_rate": 8.858267246025645e-05, + "loss": 1.0443, + "step": 68720 + }, + { + "epoch": 0.4390963801540958, + "grad_norm": 1.3269374370574951, + "learning_rate": 8.857948080154481e-05, + "loss": 0.8071, + "step": 68730 + }, + { + "epoch": 0.4391602673038345, + "grad_norm": 0.7724654078483582, + "learning_rate": 8.857628875430444e-05, + "loss": 0.7978, + "step": 68740 + }, + { + "epoch": 0.4392241544535732, + "grad_norm": 0.7245962023735046, + "learning_rate": 8.857309631856745e-05, + "loss": 0.8891, + "step": 68750 + }, + { + "epoch": 0.4392880416033119, + "grad_norm": 0.9760708808898926, + "learning_rate": 8.8569903494366e-05, + "loss": 0.9933, + "step": 68760 + }, + { + "epoch": 0.4393519287530506, + "grad_norm": 0.828381359577179, + "learning_rate": 
8.856671028173227e-05, + "loss": 0.949, + "step": 68770 + }, + { + "epoch": 0.43941581590278933, + "grad_norm": 0.948192834854126, + "learning_rate": 8.85635166806984e-05, + "loss": 0.785, + "step": 68780 + }, + { + "epoch": 0.43947970305252804, + "grad_norm": 1.3965764045715332, + "learning_rate": 8.856032269129655e-05, + "loss": 0.6816, + "step": 68790 + }, + { + "epoch": 0.43954359020226674, + "grad_norm": 0.6188552975654602, + "learning_rate": 8.85571283135589e-05, + "loss": 0.7448, + "step": 68800 + }, + { + "epoch": 0.4396074773520054, + "grad_norm": 0.7305311560630798, + "learning_rate": 8.85539335475176e-05, + "loss": 0.8134, + "step": 68810 + }, + { + "epoch": 0.4396713645017441, + "grad_norm": 0.6679476499557495, + "learning_rate": 8.855073839320484e-05, + "loss": 0.8225, + "step": 68820 + }, + { + "epoch": 0.4397352516514828, + "grad_norm": 0.41290047764778137, + "learning_rate": 8.85475428506528e-05, + "loss": 0.8303, + "step": 68830 + }, + { + "epoch": 0.4397991388012215, + "grad_norm": 1.389434814453125, + "learning_rate": 8.854434691989365e-05, + "loss": 1.0117, + "step": 68840 + }, + { + "epoch": 0.4398630259509602, + "grad_norm": 0.970970869064331, + "learning_rate": 8.854115060095958e-05, + "loss": 0.9634, + "step": 68850 + }, + { + "epoch": 0.4399269131006989, + "grad_norm": 0.8621498346328735, + "learning_rate": 8.853795389388277e-05, + "loss": 0.7216, + "step": 68860 + }, + { + "epoch": 0.43999080025043763, + "grad_norm": 0.8945342898368835, + "learning_rate": 8.853475679869545e-05, + "loss": 0.924, + "step": 68870 + }, + { + "epoch": 0.44005468740017634, + "grad_norm": 0.7587364315986633, + "learning_rate": 8.853155931542978e-05, + "loss": 0.9817, + "step": 68880 + }, + { + "epoch": 0.44011857454991504, + "grad_norm": 0.9429205656051636, + "learning_rate": 8.852836144411795e-05, + "loss": 1.1741, + "step": 68890 + }, + { + "epoch": 0.44018246169965375, + "grad_norm": 0.9457645416259766, + "learning_rate": 8.852516318479223e-05, + "loss": 0.8122, + "step": 68900 + }, + { + "epoch": 0.44024634884939245, + "grad_norm": 0.8908385038375854, + "learning_rate": 8.852196453748476e-05, + "loss": 1.0426, + "step": 68910 + }, + { + "epoch": 0.44031023599913116, + "grad_norm": 1.4087450504302979, + "learning_rate": 8.851876550222779e-05, + "loss": 0.6433, + "step": 68920 + }, + { + "epoch": 0.4403741231488698, + "grad_norm": 0.8311522006988525, + "learning_rate": 8.851556607905351e-05, + "loss": 0.8959, + "step": 68930 + }, + { + "epoch": 0.4404380102986085, + "grad_norm": 0.6747666597366333, + "learning_rate": 8.851236626799419e-05, + "loss": 1.1469, + "step": 68940 + }, + { + "epoch": 0.4405018974483472, + "grad_norm": 0.8693909049034119, + "learning_rate": 8.850916606908199e-05, + "loss": 0.7576, + "step": 68950 + }, + { + "epoch": 0.44056578459808593, + "grad_norm": 0.6947962045669556, + "learning_rate": 8.85059654823492e-05, + "loss": 0.9861, + "step": 68960 + }, + { + "epoch": 0.44062967174782464, + "grad_norm": 4.578150749206543, + "learning_rate": 8.850276450782802e-05, + "loss": 1.0223, + "step": 68970 + }, + { + "epoch": 0.44069355889756334, + "grad_norm": 0.8332919478416443, + "learning_rate": 8.849956314555068e-05, + "loss": 0.7311, + "step": 68980 + }, + { + "epoch": 0.44075744604730205, + "grad_norm": 0.8713606595993042, + "learning_rate": 8.849636139554945e-05, + "loss": 0.8488, + "step": 68990 + }, + { + "epoch": 0.44082133319704075, + "grad_norm": 0.8420679569244385, + "learning_rate": 8.849315925785654e-05, + "loss": 0.9619, + "step": 69000 + }, + { + "epoch": 
0.44088522034677946, + "grad_norm": 0.9233155846595764, + "learning_rate": 8.848995673250421e-05, + "loss": 0.9395, + "step": 69010 + }, + { + "epoch": 0.44094910749651817, + "grad_norm": 1.0304968357086182, + "learning_rate": 8.848675381952474e-05, + "loss": 0.7857, + "step": 69020 + }, + { + "epoch": 0.4410129946462569, + "grad_norm": 1.142500638961792, + "learning_rate": 8.848355051895035e-05, + "loss": 0.7173, + "step": 69030 + }, + { + "epoch": 0.4410768817959956, + "grad_norm": 1.1199169158935547, + "learning_rate": 8.848034683081332e-05, + "loss": 0.7658, + "step": 69040 + }, + { + "epoch": 0.44114076894573423, + "grad_norm": 0.6068952679634094, + "learning_rate": 8.84771427551459e-05, + "loss": 0.7444, + "step": 69050 + }, + { + "epoch": 0.44120465609547294, + "grad_norm": 1.1909863948822021, + "learning_rate": 8.847393829198036e-05, + "loss": 1.017, + "step": 69060 + }, + { + "epoch": 0.44126854324521164, + "grad_norm": 0.84711092710495, + "learning_rate": 8.847073344134898e-05, + "loss": 0.7326, + "step": 69070 + }, + { + "epoch": 0.44133243039495035, + "grad_norm": 1.1196755170822144, + "learning_rate": 8.846752820328403e-05, + "loss": 0.9662, + "step": 69080 + }, + { + "epoch": 0.44139631754468905, + "grad_norm": 0.9795490503311157, + "learning_rate": 8.846432257781781e-05, + "loss": 0.976, + "step": 69090 + }, + { + "epoch": 0.44146020469442776, + "grad_norm": 0.7674742341041565, + "learning_rate": 8.846111656498257e-05, + "loss": 0.9718, + "step": 69100 + }, + { + "epoch": 0.44152409184416647, + "grad_norm": 0.8170384764671326, + "learning_rate": 8.845791016481062e-05, + "loss": 0.9278, + "step": 69110 + }, + { + "epoch": 0.4415879789939052, + "grad_norm": 0.8551295399665833, + "learning_rate": 8.845470337733423e-05, + "loss": 0.9096, + "step": 69120 + }, + { + "epoch": 0.4416518661436439, + "grad_norm": 1.4373359680175781, + "learning_rate": 8.845149620258573e-05, + "loss": 1.2263, + "step": 69130 + }, + { + "epoch": 0.4417157532933826, + "grad_norm": 0.746088981628418, + "learning_rate": 8.844828864059738e-05, + "loss": 0.9052, + "step": 69140 + }, + { + "epoch": 0.4417796404431213, + "grad_norm": 0.6683810949325562, + "learning_rate": 8.84450806914015e-05, + "loss": 0.7266, + "step": 69150 + }, + { + "epoch": 0.44184352759286, + "grad_norm": 0.9795920848846436, + "learning_rate": 8.84418723550304e-05, + "loss": 1.1524, + "step": 69160 + }, + { + "epoch": 0.44190741474259865, + "grad_norm": 0.9631989002227783, + "learning_rate": 8.843866363151641e-05, + "loss": 1.059, + "step": 69170 + }, + { + "epoch": 0.44197130189233735, + "grad_norm": 0.7739669680595398, + "learning_rate": 8.84354545208918e-05, + "loss": 0.7757, + "step": 69180 + }, + { + "epoch": 0.44203518904207606, + "grad_norm": 0.5001319646835327, + "learning_rate": 8.843224502318892e-05, + "loss": 0.8741, + "step": 69190 + }, + { + "epoch": 0.44209907619181477, + "grad_norm": 0.739460825920105, + "learning_rate": 8.842903513844008e-05, + "loss": 0.9077, + "step": 69200 + }, + { + "epoch": 0.44216296334155347, + "grad_norm": 0.810375452041626, + "learning_rate": 8.842582486667762e-05, + "loss": 0.7811, + "step": 69210 + }, + { + "epoch": 0.4422268504912922, + "grad_norm": 1.088107705116272, + "learning_rate": 8.842261420793385e-05, + "loss": 0.8338, + "step": 69220 + }, + { + "epoch": 0.4422907376410309, + "grad_norm": 0.6136099100112915, + "learning_rate": 8.841940316224111e-05, + "loss": 0.6255, + "step": 69230 + }, + { + "epoch": 0.4423546247907696, + "grad_norm": 0.7968172430992126, + "learning_rate": 
8.841619172963175e-05, + "loss": 0.8622, + "step": 69240 + }, + { + "epoch": 0.4424185119405083, + "grad_norm": 0.8373786211013794, + "learning_rate": 8.84129799101381e-05, + "loss": 0.9056, + "step": 69250 + }, + { + "epoch": 0.442482399090247, + "grad_norm": 0.47702914476394653, + "learning_rate": 8.840976770379252e-05, + "loss": 0.7324, + "step": 69260 + }, + { + "epoch": 0.4425462862399857, + "grad_norm": 0.8604845404624939, + "learning_rate": 8.840655511062734e-05, + "loss": 0.82, + "step": 69270 + }, + { + "epoch": 0.4426101733897244, + "grad_norm": 1.151459813117981, + "learning_rate": 8.840334213067493e-05, + "loss": 1.1619, + "step": 69280 + }, + { + "epoch": 0.44267406053946307, + "grad_norm": 1.4183622598648071, + "learning_rate": 8.840012876396765e-05, + "loss": 0.9147, + "step": 69290 + }, + { + "epoch": 0.44273794768920177, + "grad_norm": 0.7213853001594543, + "learning_rate": 8.839691501053784e-05, + "loss": 0.9717, + "step": 69300 + }, + { + "epoch": 0.4428018348389405, + "grad_norm": 0.8650780320167542, + "learning_rate": 8.839370087041787e-05, + "loss": 1.0401, + "step": 69310 + }, + { + "epoch": 0.4428657219886792, + "grad_norm": 0.9786863923072815, + "learning_rate": 8.839048634364014e-05, + "loss": 0.8482, + "step": 69320 + }, + { + "epoch": 0.4429296091384179, + "grad_norm": 0.907888650894165, + "learning_rate": 8.838727143023698e-05, + "loss": 1.0272, + "step": 69330 + }, + { + "epoch": 0.4429934962881566, + "grad_norm": 0.5181243419647217, + "learning_rate": 8.83840561302408e-05, + "loss": 1.1205, + "step": 69340 + }, + { + "epoch": 0.4430573834378953, + "grad_norm": 1.089030146598816, + "learning_rate": 8.838084044368396e-05, + "loss": 0.9977, + "step": 69350 + }, + { + "epoch": 0.443121270587634, + "grad_norm": 0.7841888070106506, + "learning_rate": 8.837762437059884e-05, + "loss": 0.9291, + "step": 69360 + }, + { + "epoch": 0.4431851577373727, + "grad_norm": 0.706368088722229, + "learning_rate": 8.837440791101787e-05, + "loss": 0.8566, + "step": 69370 + }, + { + "epoch": 0.4432490448871114, + "grad_norm": 0.8301064968109131, + "learning_rate": 8.83711910649734e-05, + "loss": 0.8693, + "step": 69380 + }, + { + "epoch": 0.4433129320368501, + "grad_norm": 0.95965576171875, + "learning_rate": 8.836797383249784e-05, + "loss": 0.8735, + "step": 69390 + }, + { + "epoch": 0.44337681918658883, + "grad_norm": 0.644489586353302, + "learning_rate": 8.836475621362359e-05, + "loss": 1.1971, + "step": 69400 + }, + { + "epoch": 0.4434407063363275, + "grad_norm": 0.834976851940155, + "learning_rate": 8.836153820838304e-05, + "loss": 0.9157, + "step": 69410 + }, + { + "epoch": 0.4435045934860662, + "grad_norm": 0.6156612634658813, + "learning_rate": 8.835831981680864e-05, + "loss": 0.7013, + "step": 69420 + }, + { + "epoch": 0.4435684806358049, + "grad_norm": 0.5868956446647644, + "learning_rate": 8.835510103893276e-05, + "loss": 1.0189, + "step": 69430 + }, + { + "epoch": 0.4436323677855436, + "grad_norm": 1.2473644018173218, + "learning_rate": 8.835188187478782e-05, + "loss": 0.7598, + "step": 69440 + }, + { + "epoch": 0.4436962549352823, + "grad_norm": 1.8413316011428833, + "learning_rate": 8.834866232440627e-05, + "loss": 0.7408, + "step": 69450 + }, + { + "epoch": 0.443760142085021, + "grad_norm": 1.211452603340149, + "learning_rate": 8.83454423878205e-05, + "loss": 0.9004, + "step": 69460 + }, + { + "epoch": 0.4438240292347597, + "grad_norm": 1.3288507461547852, + "learning_rate": 8.834222206506297e-05, + "loss": 0.9584, + "step": 69470 + }, + { + "epoch": 
0.4438879163844984, + "grad_norm": 1.8809562921524048, + "learning_rate": 8.833900135616608e-05, + "loss": 0.7489, + "step": 69480 + }, + { + "epoch": 0.44395180353423713, + "grad_norm": 0.8909973502159119, + "learning_rate": 8.833578026116228e-05, + "loss": 0.6701, + "step": 69490 + }, + { + "epoch": 0.44401569068397584, + "grad_norm": 0.9204776287078857, + "learning_rate": 8.833255878008402e-05, + "loss": 0.8157, + "step": 69500 + }, + { + "epoch": 0.44407957783371454, + "grad_norm": 0.722482442855835, + "learning_rate": 8.832933691296371e-05, + "loss": 1.1273, + "step": 69510 + }, + { + "epoch": 0.44414346498345325, + "grad_norm": 0.8715541958808899, + "learning_rate": 8.832611465983383e-05, + "loss": 1.2145, + "step": 69520 + }, + { + "epoch": 0.44420735213319196, + "grad_norm": 0.9637245535850525, + "learning_rate": 8.832289202072681e-05, + "loss": 0.973, + "step": 69530 + }, + { + "epoch": 0.4442712392829306, + "grad_norm": 0.8205868005752563, + "learning_rate": 8.831966899567512e-05, + "loss": 0.7592, + "step": 69540 + }, + { + "epoch": 0.4443351264326693, + "grad_norm": 1.58009672164917, + "learning_rate": 8.831644558471122e-05, + "loss": 1.2691, + "step": 69550 + }, + { + "epoch": 0.444399013582408, + "grad_norm": 1.08955717086792, + "learning_rate": 8.831322178786754e-05, + "loss": 0.9777, + "step": 69560 + }, + { + "epoch": 0.4444629007321467, + "grad_norm": 0.9413936138153076, + "learning_rate": 8.830999760517659e-05, + "loss": 1.029, + "step": 69570 + }, + { + "epoch": 0.44452678788188543, + "grad_norm": 1.7037255764007568, + "learning_rate": 8.830677303667081e-05, + "loss": 1.2211, + "step": 69580 + }, + { + "epoch": 0.44459067503162414, + "grad_norm": 1.0269652605056763, + "learning_rate": 8.83035480823827e-05, + "loss": 1.3183, + "step": 69590 + }, + { + "epoch": 0.44465456218136284, + "grad_norm": 0.8793505430221558, + "learning_rate": 8.830032274234472e-05, + "loss": 0.9107, + "step": 69600 + }, + { + "epoch": 0.44471844933110155, + "grad_norm": 1.2114499807357788, + "learning_rate": 8.829709701658934e-05, + "loss": 0.9675, + "step": 69610 + }, + { + "epoch": 0.44478233648084026, + "grad_norm": 1.1938707828521729, + "learning_rate": 8.82938709051491e-05, + "loss": 0.85, + "step": 69620 + }, + { + "epoch": 0.44484622363057896, + "grad_norm": 1.2485358715057373, + "learning_rate": 8.829064440805641e-05, + "loss": 0.8547, + "step": 69630 + }, + { + "epoch": 0.44491011078031767, + "grad_norm": 0.7239115238189697, + "learning_rate": 8.828741752534382e-05, + "loss": 1.1611, + "step": 69640 + }, + { + "epoch": 0.4449739979300564, + "grad_norm": 0.9881543517112732, + "learning_rate": 8.82841902570438e-05, + "loss": 0.668, + "step": 69650 + }, + { + "epoch": 0.445037885079795, + "grad_norm": 1.0397281646728516, + "learning_rate": 8.828096260318888e-05, + "loss": 0.7762, + "step": 69660 + }, + { + "epoch": 0.44510177222953373, + "grad_norm": 1.2399822473526, + "learning_rate": 8.827773456381155e-05, + "loss": 0.7973, + "step": 69670 + }, + { + "epoch": 0.44516565937927244, + "grad_norm": 0.8497468829154968, + "learning_rate": 8.82745061389443e-05, + "loss": 0.9985, + "step": 69680 + }, + { + "epoch": 0.44522954652901114, + "grad_norm": 0.73412024974823, + "learning_rate": 8.827127732861967e-05, + "loss": 0.812, + "step": 69690 + }, + { + "epoch": 0.44529343367874985, + "grad_norm": 0.7889323234558105, + "learning_rate": 8.826804813287017e-05, + "loss": 0.9489, + "step": 69700 + }, + { + "epoch": 0.44535732082848856, + "grad_norm": 0.7215690612792969, + "learning_rate": 
8.826481855172832e-05, + "loss": 1.0469, + "step": 69710 + }, + { + "epoch": 0.44542120797822726, + "grad_norm": 1.6253806352615356, + "learning_rate": 8.826158858522665e-05, + "loss": 0.7258, + "step": 69720 + }, + { + "epoch": 0.44548509512796597, + "grad_norm": 1.0504227876663208, + "learning_rate": 8.825835823339768e-05, + "loss": 0.9111, + "step": 69730 + }, + { + "epoch": 0.4455489822777047, + "grad_norm": 0.9772189855575562, + "learning_rate": 8.825512749627393e-05, + "loss": 0.9676, + "step": 69740 + }, + { + "epoch": 0.4456128694274434, + "grad_norm": 0.7481646537780762, + "learning_rate": 8.825189637388795e-05, + "loss": 0.9435, + "step": 69750 + }, + { + "epoch": 0.4456767565771821, + "grad_norm": 0.6458262801170349, + "learning_rate": 8.824866486627231e-05, + "loss": 0.9124, + "step": 69760 + }, + { + "epoch": 0.4457406437269208, + "grad_norm": 0.9859530925750732, + "learning_rate": 8.824543297345949e-05, + "loss": 1.0758, + "step": 69770 + }, + { + "epoch": 0.44580453087665944, + "grad_norm": 0.8648393750190735, + "learning_rate": 8.82422006954821e-05, + "loss": 1.0258, + "step": 69780 + }, + { + "epoch": 0.44586841802639815, + "grad_norm": 2.013597249984741, + "learning_rate": 8.823896803237264e-05, + "loss": 0.9565, + "step": 69790 + }, + { + "epoch": 0.44593230517613686, + "grad_norm": 0.8398522138595581, + "learning_rate": 8.823573498416371e-05, + "loss": 0.8652, + "step": 69800 + }, + { + "epoch": 0.44599619232587556, + "grad_norm": 0.751560389995575, + "learning_rate": 8.823250155088785e-05, + "loss": 0.948, + "step": 69810 + }, + { + "epoch": 0.44606007947561427, + "grad_norm": 0.7580850124359131, + "learning_rate": 8.82292677325776e-05, + "loss": 0.8804, + "step": 69820 + }, + { + "epoch": 0.446123966625353, + "grad_norm": 2.6924216747283936, + "learning_rate": 8.822603352926558e-05, + "loss": 0.814, + "step": 69830 + }, + { + "epoch": 0.4461878537750917, + "grad_norm": 1.0442085266113281, + "learning_rate": 8.82227989409843e-05, + "loss": 0.9705, + "step": 69840 + }, + { + "epoch": 0.4462517409248304, + "grad_norm": 0.6417388319969177, + "learning_rate": 8.821956396776641e-05, + "loss": 0.8304, + "step": 69850 + }, + { + "epoch": 0.4463156280745691, + "grad_norm": 0.49614080786705017, + "learning_rate": 8.821632860964442e-05, + "loss": 1.1193, + "step": 69860 + }, + { + "epoch": 0.4463795152243078, + "grad_norm": 0.6962358355522156, + "learning_rate": 8.821309286665094e-05, + "loss": 1.02, + "step": 69870 + }, + { + "epoch": 0.4464434023740465, + "grad_norm": 0.9865720868110657, + "learning_rate": 8.820985673881857e-05, + "loss": 1.191, + "step": 69880 + }, + { + "epoch": 0.4465072895237852, + "grad_norm": 0.9626466631889343, + "learning_rate": 8.820662022617987e-05, + "loss": 0.9506, + "step": 69890 + }, + { + "epoch": 0.44657117667352386, + "grad_norm": 0.40864917635917664, + "learning_rate": 8.820338332876745e-05, + "loss": 0.9994, + "step": 69900 + }, + { + "epoch": 0.44663506382326257, + "grad_norm": 0.5569325089454651, + "learning_rate": 8.82001460466139e-05, + "loss": 0.8532, + "step": 69910 + }, + { + "epoch": 0.4466989509730013, + "grad_norm": 0.6157374978065491, + "learning_rate": 8.819690837975185e-05, + "loss": 0.834, + "step": 69920 + }, + { + "epoch": 0.44676283812274, + "grad_norm": 0.9512416124343872, + "learning_rate": 8.819367032821389e-05, + "loss": 0.7586, + "step": 69930 + }, + { + "epoch": 0.4468267252724787, + "grad_norm": 0.6513834595680237, + "learning_rate": 8.819043189203262e-05, + "loss": 1.0077, + "step": 69940 + }, + { + "epoch": 
0.4468906124222174, + "grad_norm": 0.6988425254821777, + "learning_rate": 8.818719307124066e-05, + "loss": 0.9777, + "step": 69950 + }, + { + "epoch": 0.4469544995719561, + "grad_norm": 0.7577906250953674, + "learning_rate": 8.818395386587064e-05, + "loss": 0.8364, + "step": 69960 + }, + { + "epoch": 0.4470183867216948, + "grad_norm": 3.7700507640838623, + "learning_rate": 8.818071427595515e-05, + "loss": 0.9155, + "step": 69970 + }, + { + "epoch": 0.4470822738714335, + "grad_norm": 0.9014910459518433, + "learning_rate": 8.817747430152687e-05, + "loss": 0.9437, + "step": 69980 + }, + { + "epoch": 0.4471461610211722, + "grad_norm": 0.7106698155403137, + "learning_rate": 8.817423394261837e-05, + "loss": 1.2196, + "step": 69990 + }, + { + "epoch": 0.4472100481709109, + "grad_norm": 0.7741692066192627, + "learning_rate": 8.817099319926231e-05, + "loss": 0.8265, + "step": 70000 + }, + { + "epoch": 0.44727393532064963, + "grad_norm": 0.8128407001495361, + "learning_rate": 8.816775207149133e-05, + "loss": 0.7937, + "step": 70010 + }, + { + "epoch": 0.4473378224703883, + "grad_norm": 1.0812875032424927, + "learning_rate": 8.816451055933807e-05, + "loss": 0.9699, + "step": 70020 + }, + { + "epoch": 0.447401709620127, + "grad_norm": 0.8170537948608398, + "learning_rate": 8.816126866283515e-05, + "loss": 0.7516, + "step": 70030 + }, + { + "epoch": 0.4474655967698657, + "grad_norm": 0.8234254121780396, + "learning_rate": 8.815802638201527e-05, + "loss": 0.7975, + "step": 70040 + }, + { + "epoch": 0.4475294839196044, + "grad_norm": 0.5763027667999268, + "learning_rate": 8.815478371691104e-05, + "loss": 0.8927, + "step": 70050 + }, + { + "epoch": 0.4475933710693431, + "grad_norm": 0.6996818780899048, + "learning_rate": 8.815154066755514e-05, + "loss": 0.7487, + "step": 70060 + }, + { + "epoch": 0.4476572582190818, + "grad_norm": 1.1514983177185059, + "learning_rate": 8.814829723398021e-05, + "loss": 0.7932, + "step": 70070 + }, + { + "epoch": 0.4477211453688205, + "grad_norm": 1.56476628780365, + "learning_rate": 8.814505341621892e-05, + "loss": 1.0774, + "step": 70080 + }, + { + "epoch": 0.4477850325185592, + "grad_norm": 1.2454763650894165, + "learning_rate": 8.814180921430395e-05, + "loss": 0.7339, + "step": 70090 + }, + { + "epoch": 0.44784891966829793, + "grad_norm": 0.7148693799972534, + "learning_rate": 8.813856462826794e-05, + "loss": 0.8958, + "step": 70100 + }, + { + "epoch": 0.44791280681803664, + "grad_norm": 1.7030229568481445, + "learning_rate": 8.813531965814363e-05, + "loss": 0.9031, + "step": 70110 + }, + { + "epoch": 0.44797669396777534, + "grad_norm": 0.7845126986503601, + "learning_rate": 8.813207430396365e-05, + "loss": 0.8665, + "step": 70120 + }, + { + "epoch": 0.44804058111751405, + "grad_norm": 0.6932292580604553, + "learning_rate": 8.812882856576066e-05, + "loss": 0.9553, + "step": 70130 + }, + { + "epoch": 0.4481044682672527, + "grad_norm": 1.0173585414886475, + "learning_rate": 8.812558244356742e-05, + "loss": 1.1723, + "step": 70140 + }, + { + "epoch": 0.4481683554169914, + "grad_norm": 0.7353670597076416, + "learning_rate": 8.812233593741655e-05, + "loss": 0.8626, + "step": 70150 + }, + { + "epoch": 0.4482322425667301, + "grad_norm": 0.4959295392036438, + "learning_rate": 8.811908904734079e-05, + "loss": 0.9257, + "step": 70160 + }, + { + "epoch": 0.4482961297164688, + "grad_norm": 0.9090648889541626, + "learning_rate": 8.811584177337281e-05, + "loss": 0.6679, + "step": 70170 + }, + { + "epoch": 0.4483600168662075, + "grad_norm": 0.840734601020813, + "learning_rate": 
8.811259411554536e-05, + "loss": 0.7846, + "step": 70180 + }, + { + "epoch": 0.44842390401594623, + "grad_norm": 0.8319433927536011, + "learning_rate": 8.81093460738911e-05, + "loss": 0.9433, + "step": 70190 + }, + { + "epoch": 0.44848779116568493, + "grad_norm": 0.5484992265701294, + "learning_rate": 8.810609764844276e-05, + "loss": 0.8511, + "step": 70200 + }, + { + "epoch": 0.44855167831542364, + "grad_norm": 0.8629337549209595, + "learning_rate": 8.810284883923304e-05, + "loss": 1.033, + "step": 70210 + }, + { + "epoch": 0.44861556546516235, + "grad_norm": 0.8372594118118286, + "learning_rate": 8.809959964629467e-05, + "loss": 0.6458, + "step": 70220 + }, + { + "epoch": 0.44867945261490105, + "grad_norm": 0.6603564620018005, + "learning_rate": 8.809635006966037e-05, + "loss": 0.9905, + "step": 70230 + }, + { + "epoch": 0.44874333976463976, + "grad_norm": 0.7497221231460571, + "learning_rate": 8.809310010936288e-05, + "loss": 0.9827, + "step": 70240 + }, + { + "epoch": 0.44880722691437847, + "grad_norm": 0.6426061987876892, + "learning_rate": 8.80898497654349e-05, + "loss": 0.9913, + "step": 70250 + }, + { + "epoch": 0.4488711140641171, + "grad_norm": 1.1607120037078857, + "learning_rate": 8.808659903790919e-05, + "loss": 0.9618, + "step": 70260 + }, + { + "epoch": 0.4489350012138558, + "grad_norm": 0.6175957322120667, + "learning_rate": 8.808334792681848e-05, + "loss": 0.7507, + "step": 70270 + }, + { + "epoch": 0.44899888836359453, + "grad_norm": 0.96190345287323, + "learning_rate": 8.80800964321955e-05, + "loss": 0.7085, + "step": 70280 + }, + { + "epoch": 0.44906277551333323, + "grad_norm": 2.022925615310669, + "learning_rate": 8.807684455407301e-05, + "loss": 0.997, + "step": 70290 + }, + { + "epoch": 0.44912666266307194, + "grad_norm": 0.8769704103469849, + "learning_rate": 8.807359229248376e-05, + "loss": 1.0706, + "step": 70300 + }, + { + "epoch": 0.44919054981281065, + "grad_norm": 0.8224455118179321, + "learning_rate": 8.80703396474605e-05, + "loss": 1.1076, + "step": 70310 + }, + { + "epoch": 0.44925443696254935, + "grad_norm": 0.5599127411842346, + "learning_rate": 8.806708661903598e-05, + "loss": 0.7477, + "step": 70320 + }, + { + "epoch": 0.44931832411228806, + "grad_norm": 0.8950123190879822, + "learning_rate": 8.806383320724295e-05, + "loss": 0.9558, + "step": 70330 + }, + { + "epoch": 0.44938221126202677, + "grad_norm": 0.7584883570671082, + "learning_rate": 8.80605794121142e-05, + "loss": 1.051, + "step": 70340 + }, + { + "epoch": 0.44944609841176547, + "grad_norm": 0.7890920042991638, + "learning_rate": 8.805732523368249e-05, + "loss": 0.747, + "step": 70350 + }, + { + "epoch": 0.4495099855615042, + "grad_norm": 0.7396231889724731, + "learning_rate": 8.805407067198059e-05, + "loss": 0.9456, + "step": 70360 + }, + { + "epoch": 0.4495738727112429, + "grad_norm": 1.3219010829925537, + "learning_rate": 8.805081572704128e-05, + "loss": 0.785, + "step": 70370 + }, + { + "epoch": 0.4496377598609816, + "grad_norm": 0.5966509580612183, + "learning_rate": 8.804756039889735e-05, + "loss": 1.1855, + "step": 70380 + }, + { + "epoch": 0.44970164701072024, + "grad_norm": 0.9530605673789978, + "learning_rate": 8.804430468758153e-05, + "loss": 0.8681, + "step": 70390 + }, + { + "epoch": 0.44976553416045895, + "grad_norm": 0.8958638310432434, + "learning_rate": 8.804104859312668e-05, + "loss": 1.0291, + "step": 70400 + }, + { + "epoch": 0.44982942131019765, + "grad_norm": 0.8644607663154602, + "learning_rate": 8.803779211556555e-05, + "loss": 0.7294, + "step": 70410 + }, + { + "epoch": 
0.44989330845993636, + "grad_norm": 0.8499729037284851, + "learning_rate": 8.803453525493096e-05, + "loss": 0.8267, + "step": 70420 + }, + { + "epoch": 0.44995719560967506, + "grad_norm": 0.8581739068031311, + "learning_rate": 8.803127801125568e-05, + "loss": 0.7154, + "step": 70430 + }, + { + "epoch": 0.45002108275941377, + "grad_norm": 0.8350471258163452, + "learning_rate": 8.802802038457253e-05, + "loss": 0.9061, + "step": 70440 + }, + { + "epoch": 0.4500849699091525, + "grad_norm": 0.8576902747154236, + "learning_rate": 8.802476237491433e-05, + "loss": 0.9597, + "step": 70450 + }, + { + "epoch": 0.4501488570588912, + "grad_norm": 0.8955521583557129, + "learning_rate": 8.802150398231387e-05, + "loss": 1.0067, + "step": 70460 + }, + { + "epoch": 0.4502127442086299, + "grad_norm": 0.8049098253250122, + "learning_rate": 8.801824520680397e-05, + "loss": 1.002, + "step": 70470 + }, + { + "epoch": 0.4502766313583686, + "grad_norm": 0.8177332878112793, + "learning_rate": 8.801498604841745e-05, + "loss": 0.9605, + "step": 70480 + }, + { + "epoch": 0.4503405185081073, + "grad_norm": 0.5897266864776611, + "learning_rate": 8.801172650718711e-05, + "loss": 0.9324, + "step": 70490 + }, + { + "epoch": 0.450404405657846, + "grad_norm": 0.7611057758331299, + "learning_rate": 8.800846658314583e-05, + "loss": 1.1378, + "step": 70500 + }, + { + "epoch": 0.45046829280758466, + "grad_norm": 0.9269735813140869, + "learning_rate": 8.80052062763264e-05, + "loss": 0.7767, + "step": 70510 + }, + { + "epoch": 0.45053217995732336, + "grad_norm": 0.7874916791915894, + "learning_rate": 8.800194558676167e-05, + "loss": 1.0075, + "step": 70520 + }, + { + "epoch": 0.45059606710706207, + "grad_norm": 0.5338902473449707, + "learning_rate": 8.799868451448446e-05, + "loss": 0.7581, + "step": 70530 + }, + { + "epoch": 0.4506599542568008, + "grad_norm": 0.6649864315986633, + "learning_rate": 8.799542305952764e-05, + "loss": 1.309, + "step": 70540 + }, + { + "epoch": 0.4507238414065395, + "grad_norm": 0.5478102564811707, + "learning_rate": 8.799216122192402e-05, + "loss": 0.9979, + "step": 70550 + }, + { + "epoch": 0.4507877285562782, + "grad_norm": 0.48872268199920654, + "learning_rate": 8.798889900170648e-05, + "loss": 0.862, + "step": 70560 + }, + { + "epoch": 0.4508516157060169, + "grad_norm": 1.0504260063171387, + "learning_rate": 8.798563639890786e-05, + "loss": 0.8303, + "step": 70570 + }, + { + "epoch": 0.4509155028557556, + "grad_norm": 0.7641623616218567, + "learning_rate": 8.798237341356102e-05, + "loss": 0.8984, + "step": 70580 + }, + { + "epoch": 0.4509793900054943, + "grad_norm": 0.8579826951026917, + "learning_rate": 8.797911004569882e-05, + "loss": 0.8908, + "step": 70590 + }, + { + "epoch": 0.451043277155233, + "grad_norm": 1.6578333377838135, + "learning_rate": 8.797584629535412e-05, + "loss": 1.0401, + "step": 70600 + }, + { + "epoch": 0.4511071643049717, + "grad_norm": 0.901781439781189, + "learning_rate": 8.79725821625598e-05, + "loss": 1.1365, + "step": 70610 + }, + { + "epoch": 0.4511710514547104, + "grad_norm": 1.3552802801132202, + "learning_rate": 8.796931764734873e-05, + "loss": 0.9429, + "step": 70620 + }, + { + "epoch": 0.4512349386044491, + "grad_norm": 0.5758177638053894, + "learning_rate": 8.796605274975377e-05, + "loss": 0.9782, + "step": 70630 + }, + { + "epoch": 0.4512988257541878, + "grad_norm": 0.6553876996040344, + "learning_rate": 8.796278746980782e-05, + "loss": 0.9537, + "step": 70640 + }, + { + "epoch": 0.4513627129039265, + "grad_norm": 0.6024998426437378, + "learning_rate": 
8.795952180754376e-05, + "loss": 0.9083, + "step": 70650 + }, + { + "epoch": 0.4514266000536652, + "grad_norm": 0.521595299243927, + "learning_rate": 8.795625576299447e-05, + "loss": 0.8758, + "step": 70660 + }, + { + "epoch": 0.4514904872034039, + "grad_norm": 0.9571405053138733, + "learning_rate": 8.795298933619284e-05, + "loss": 1.0593, + "step": 70670 + }, + { + "epoch": 0.4515543743531426, + "grad_norm": 1.07502281665802, + "learning_rate": 8.79497225271718e-05, + "loss": 0.8501, + "step": 70680 + }, + { + "epoch": 0.4516182615028813, + "grad_norm": 1.2445697784423828, + "learning_rate": 8.794645533596422e-05, + "loss": 0.8585, + "step": 70690 + }, + { + "epoch": 0.45168214865262, + "grad_norm": 0.8725454211235046, + "learning_rate": 8.794318776260299e-05, + "loss": 1.109, + "step": 70700 + }, + { + "epoch": 0.4517460358023587, + "grad_norm": 0.6738957762718201, + "learning_rate": 8.793991980712103e-05, + "loss": 0.8554, + "step": 70710 + }, + { + "epoch": 0.45180992295209743, + "grad_norm": 0.6663877964019775, + "learning_rate": 8.793665146955127e-05, + "loss": 0.8995, + "step": 70720 + }, + { + "epoch": 0.45187381010183614, + "grad_norm": 1.05771005153656, + "learning_rate": 8.79333827499266e-05, + "loss": 0.8408, + "step": 70730 + }, + { + "epoch": 0.45193769725157484, + "grad_norm": 0.8179357051849365, + "learning_rate": 8.793011364827995e-05, + "loss": 0.9386, + "step": 70740 + }, + { + "epoch": 0.4520015844013135, + "grad_norm": 0.8579227328300476, + "learning_rate": 8.792684416464425e-05, + "loss": 0.9987, + "step": 70750 + }, + { + "epoch": 0.4520654715510522, + "grad_norm": 1.311963438987732, + "learning_rate": 8.79235742990524e-05, + "loss": 0.8236, + "step": 70760 + }, + { + "epoch": 0.4521293587007909, + "grad_norm": 1.2034355401992798, + "learning_rate": 8.792030405153736e-05, + "loss": 1.0315, + "step": 70770 + }, + { + "epoch": 0.4521932458505296, + "grad_norm": 0.9574033617973328, + "learning_rate": 8.791703342213205e-05, + "loss": 0.9112, + "step": 70780 + }, + { + "epoch": 0.4522571330002683, + "grad_norm": 0.6111446022987366, + "learning_rate": 8.791376241086942e-05, + "loss": 0.8271, + "step": 70790 + }, + { + "epoch": 0.452321020150007, + "grad_norm": 0.5406328439712524, + "learning_rate": 8.791049101778239e-05, + "loss": 0.6997, + "step": 70800 + }, + { + "epoch": 0.45238490729974573, + "grad_norm": 0.4767141044139862, + "learning_rate": 8.790721924290393e-05, + "loss": 0.8203, + "step": 70810 + }, + { + "epoch": 0.45244879444948444, + "grad_norm": 1.9145870208740234, + "learning_rate": 8.790394708626697e-05, + "loss": 1.0273, + "step": 70820 + }, + { + "epoch": 0.45251268159922314, + "grad_norm": 1.5563242435455322, + "learning_rate": 8.790067454790447e-05, + "loss": 1.1937, + "step": 70830 + }, + { + "epoch": 0.45257656874896185, + "grad_norm": 0.6038081645965576, + "learning_rate": 8.789740162784939e-05, + "loss": 0.9115, + "step": 70840 + }, + { + "epoch": 0.45264045589870056, + "grad_norm": 0.8606191873550415, + "learning_rate": 8.789412832613468e-05, + "loss": 0.8494, + "step": 70850 + }, + { + "epoch": 0.45270434304843926, + "grad_norm": 0.9102177023887634, + "learning_rate": 8.789085464279334e-05, + "loss": 0.9009, + "step": 70860 + }, + { + "epoch": 0.4527682301981779, + "grad_norm": 0.8726232051849365, + "learning_rate": 8.788758057785828e-05, + "loss": 0.9052, + "step": 70870 + }, + { + "epoch": 0.4528321173479166, + "grad_norm": 0.8285619020462036, + "learning_rate": 8.788430613136254e-05, + "loss": 0.9528, + "step": 70880 + }, + { + "epoch": 
0.4528960044976553, + "grad_norm": 1.2340794801712036, + "learning_rate": 8.788103130333905e-05, + "loss": 0.8517, + "step": 70890 + }, + { + "epoch": 0.45295989164739403, + "grad_norm": 0.5685308575630188, + "learning_rate": 8.787775609382078e-05, + "loss": 0.9504, + "step": 70900 + }, + { + "epoch": 0.45302377879713274, + "grad_norm": 0.7877033948898315, + "learning_rate": 8.787448050284077e-05, + "loss": 0.8238, + "step": 70910 + }, + { + "epoch": 0.45308766594687144, + "grad_norm": 1.047734260559082, + "learning_rate": 8.787120453043196e-05, + "loss": 1.0679, + "step": 70920 + }, + { + "epoch": 0.45315155309661015, + "grad_norm": 0.5385513305664062, + "learning_rate": 8.786792817662737e-05, + "loss": 0.8655, + "step": 70930 + }, + { + "epoch": 0.45321544024634886, + "grad_norm": 0.9814597964286804, + "learning_rate": 8.786465144145996e-05, + "loss": 0.9863, + "step": 70940 + }, + { + "epoch": 0.45327932739608756, + "grad_norm": 0.7968815565109253, + "learning_rate": 8.786137432496278e-05, + "loss": 0.8118, + "step": 70950 + }, + { + "epoch": 0.45334321454582627, + "grad_norm": 1.0466378927230835, + "learning_rate": 8.785809682716879e-05, + "loss": 0.8782, + "step": 70960 + }, + { + "epoch": 0.453407101695565, + "grad_norm": 1.5503062009811401, + "learning_rate": 8.7854818948111e-05, + "loss": 1.0405, + "step": 70970 + }, + { + "epoch": 0.4534709888453037, + "grad_norm": 0.6179012060165405, + "learning_rate": 8.785154068782246e-05, + "loss": 0.8444, + "step": 70980 + }, + { + "epoch": 0.45353487599504233, + "grad_norm": 1.3943589925765991, + "learning_rate": 8.784826204633614e-05, + "loss": 0.8734, + "step": 70990 + }, + { + "epoch": 0.45359876314478104, + "grad_norm": 0.8613284230232239, + "learning_rate": 8.784498302368508e-05, + "loss": 0.7613, + "step": 71000 + }, + { + "epoch": 0.45366265029451974, + "grad_norm": 1.1512913703918457, + "learning_rate": 8.784170361990232e-05, + "loss": 0.7816, + "step": 71010 + }, + { + "epoch": 0.45372653744425845, + "grad_norm": 0.8666269779205322, + "learning_rate": 8.783842383502084e-05, + "loss": 1.043, + "step": 71020 + }, + { + "epoch": 0.45379042459399715, + "grad_norm": 0.900255560874939, + "learning_rate": 8.783514366907371e-05, + "loss": 0.9663, + "step": 71030 + }, + { + "epoch": 0.45385431174373586, + "grad_norm": 1.041473150253296, + "learning_rate": 8.783186312209395e-05, + "loss": 1.0225, + "step": 71040 + }, + { + "epoch": 0.45391819889347457, + "grad_norm": 1.203635811805725, + "learning_rate": 8.78285821941146e-05, + "loss": 0.8201, + "step": 71050 + }, + { + "epoch": 0.4539820860432133, + "grad_norm": 1.2860292196273804, + "learning_rate": 8.782530088516869e-05, + "loss": 1.0791, + "step": 71060 + }, + { + "epoch": 0.454045973192952, + "grad_norm": 0.9229752421379089, + "learning_rate": 8.782201919528929e-05, + "loss": 0.9097, + "step": 71070 + }, + { + "epoch": 0.4541098603426907, + "grad_norm": 0.5410824418067932, + "learning_rate": 8.7819065348727e-05, + "loss": 1.1974, + "step": 71080 + }, + { + "epoch": 0.4541737474924294, + "grad_norm": 1.527845025062561, + "learning_rate": 8.7815782935165e-05, + "loss": 0.9155, + "step": 71090 + }, + { + "epoch": 0.4542376346421681, + "grad_norm": 0.6132227182388306, + "learning_rate": 8.781250014076534e-05, + "loss": 0.9264, + "step": 71100 + }, + { + "epoch": 0.45430152179190675, + "grad_norm": 1.0122579336166382, + "learning_rate": 8.78092169655611e-05, + "loss": 1.126, + "step": 71110 + }, + { + "epoch": 0.45436540894164545, + "grad_norm": 0.9451808333396912, + "learning_rate": 
8.780593340958535e-05, + "loss": 1.0451, + "step": 71120 + }, + { + "epoch": 0.45442929609138416, + "grad_norm": 1.5981924533843994, + "learning_rate": 8.780264947287111e-05, + "loss": 1.1555, + "step": 71130 + }, + { + "epoch": 0.45449318324112287, + "grad_norm": 0.8764825463294983, + "learning_rate": 8.779936515545151e-05, + "loss": 0.8937, + "step": 71140 + }, + { + "epoch": 0.4545570703908616, + "grad_norm": 0.6347659230232239, + "learning_rate": 8.779608045735959e-05, + "loss": 1.2468, + "step": 71150 + }, + { + "epoch": 0.4546209575406003, + "grad_norm": 0.9502388834953308, + "learning_rate": 8.779279537862844e-05, + "loss": 0.791, + "step": 71160 + }, + { + "epoch": 0.454684844690339, + "grad_norm": 0.9341233968734741, + "learning_rate": 8.778950991929114e-05, + "loss": 0.9172, + "step": 71170 + }, + { + "epoch": 0.4547487318400777, + "grad_norm": 0.7763635516166687, + "learning_rate": 8.77862240793808e-05, + "loss": 0.9442, + "step": 71180 + }, + { + "epoch": 0.4548126189898164, + "grad_norm": 1.2328989505767822, + "learning_rate": 8.778293785893048e-05, + "loss": 0.7446, + "step": 71190 + }, + { + "epoch": 0.4548765061395551, + "grad_norm": 0.6243307590484619, + "learning_rate": 8.777965125797329e-05, + "loss": 0.8242, + "step": 71200 + }, + { + "epoch": 0.4549403932892938, + "grad_norm": 0.7185580134391785, + "learning_rate": 8.777636427654234e-05, + "loss": 1.0433, + "step": 71210 + }, + { + "epoch": 0.4550042804390325, + "grad_norm": 0.7410394549369812, + "learning_rate": 8.777307691467072e-05, + "loss": 0.9533, + "step": 71220 + }, + { + "epoch": 0.4550681675887712, + "grad_norm": 0.8406373858451843, + "learning_rate": 8.776978917239153e-05, + "loss": 0.9858, + "step": 71230 + }, + { + "epoch": 0.4551320547385099, + "grad_norm": 1.1634323596954346, + "learning_rate": 8.776650104973789e-05, + "loss": 0.8353, + "step": 71240 + }, + { + "epoch": 0.4551959418882486, + "grad_norm": 0.847737729549408, + "learning_rate": 8.776321254674291e-05, + "loss": 0.6618, + "step": 71250 + }, + { + "epoch": 0.4552598290379873, + "grad_norm": 0.853600025177002, + "learning_rate": 8.77599236634397e-05, + "loss": 1.011, + "step": 71260 + }, + { + "epoch": 0.455323716187726, + "grad_norm": 0.6608572602272034, + "learning_rate": 8.77566343998614e-05, + "loss": 0.988, + "step": 71270 + }, + { + "epoch": 0.4553876033374647, + "grad_norm": 0.8262060284614563, + "learning_rate": 8.775334475604114e-05, + "loss": 1.2176, + "step": 71280 + }, + { + "epoch": 0.4554514904872034, + "grad_norm": 0.7335585355758667, + "learning_rate": 8.775005473201202e-05, + "loss": 0.9556, + "step": 71290 + }, + { + "epoch": 0.4555153776369421, + "grad_norm": 1.2570284605026245, + "learning_rate": 8.774676432780719e-05, + "loss": 1.0209, + "step": 71300 + }, + { + "epoch": 0.4555792647866808, + "grad_norm": 0.6992619037628174, + "learning_rate": 8.774347354345979e-05, + "loss": 0.8241, + "step": 71310 + }, + { + "epoch": 0.4556431519364195, + "grad_norm": 1.3729963302612305, + "learning_rate": 8.774018237900297e-05, + "loss": 0.9433, + "step": 71320 + }, + { + "epoch": 0.4557070390861582, + "grad_norm": 0.9504528045654297, + "learning_rate": 8.773689083446986e-05, + "loss": 0.8976, + "step": 71330 + }, + { + "epoch": 0.45577092623589693, + "grad_norm": 0.9891476035118103, + "learning_rate": 8.773359890989361e-05, + "loss": 0.975, + "step": 71340 + }, + { + "epoch": 0.45583481338563564, + "grad_norm": 1.9540566205978394, + "learning_rate": 8.773030660530736e-05, + "loss": 1.0329, + "step": 71350 + }, + { + "epoch": 
0.4558987005353743, + "grad_norm": 0.9334406852722168, + "learning_rate": 8.77270139207443e-05, + "loss": 0.8753, + "step": 71360 + }, + { + "epoch": 0.455962587685113, + "grad_norm": 0.9712562561035156, + "learning_rate": 8.772372085623756e-05, + "loss": 1.0013, + "step": 71370 + }, + { + "epoch": 0.4560264748348517, + "grad_norm": 1.0713132619857788, + "learning_rate": 8.772042741182034e-05, + "loss": 0.9794, + "step": 71380 + }, + { + "epoch": 0.4560903619845904, + "grad_norm": 1.1477205753326416, + "learning_rate": 8.771713358752575e-05, + "loss": 0.9437, + "step": 71390 + }, + { + "epoch": 0.4561542491343291, + "grad_norm": 0.9125179052352905, + "learning_rate": 8.771383938338702e-05, + "loss": 1.0213, + "step": 71400 + }, + { + "epoch": 0.4562181362840678, + "grad_norm": 0.7508702278137207, + "learning_rate": 8.771054479943728e-05, + "loss": 1.0244, + "step": 71410 + }, + { + "epoch": 0.4562820234338065, + "grad_norm": 0.7568835616111755, + "learning_rate": 8.770724983570974e-05, + "loss": 1.0994, + "step": 71420 + }, + { + "epoch": 0.45634591058354523, + "grad_norm": 1.1442028284072876, + "learning_rate": 8.770395449223758e-05, + "loss": 0.768, + "step": 71430 + }, + { + "epoch": 0.45640979773328394, + "grad_norm": 1.0177857875823975, + "learning_rate": 8.770065876905396e-05, + "loss": 0.762, + "step": 71440 + }, + { + "epoch": 0.45647368488302265, + "grad_norm": 0.8974249362945557, + "learning_rate": 8.76973626661921e-05, + "loss": 0.8523, + "step": 71450 + }, + { + "epoch": 0.45653757203276135, + "grad_norm": 1.9021356105804443, + "learning_rate": 8.769406618368519e-05, + "loss": 1.068, + "step": 71460 + }, + { + "epoch": 0.45660145918250006, + "grad_norm": 0.6922752857208252, + "learning_rate": 8.769076932156642e-05, + "loss": 0.9142, + "step": 71470 + }, + { + "epoch": 0.4566653463322387, + "grad_norm": 0.853961706161499, + "learning_rate": 8.7687472079869e-05, + "loss": 0.8104, + "step": 71480 + }, + { + "epoch": 0.4567292334819774, + "grad_norm": 0.6722909212112427, + "learning_rate": 8.768417445862613e-05, + "loss": 0.8424, + "step": 71490 + }, + { + "epoch": 0.4567931206317161, + "grad_norm": 0.8289230465888977, + "learning_rate": 8.768087645787102e-05, + "loss": 1.0402, + "step": 71500 + }, + { + "epoch": 0.4568570077814548, + "grad_norm": 0.9513195157051086, + "learning_rate": 8.767757807763687e-05, + "loss": 0.8345, + "step": 71510 + }, + { + "epoch": 0.45692089493119353, + "grad_norm": 1.016177773475647, + "learning_rate": 8.767427931795694e-05, + "loss": 1.0672, + "step": 71520 + }, + { + "epoch": 0.45698478208093224, + "grad_norm": 0.815597653388977, + "learning_rate": 8.767098017886442e-05, + "loss": 0.9043, + "step": 71530 + }, + { + "epoch": 0.45704866923067095, + "grad_norm": 0.6468181610107422, + "learning_rate": 8.766768066039252e-05, + "loss": 0.7424, + "step": 71540 + }, + { + "epoch": 0.45711255638040965, + "grad_norm": 1.0520371198654175, + "learning_rate": 8.76643807625745e-05, + "loss": 0.7597, + "step": 71550 + }, + { + "epoch": 0.45717644353014836, + "grad_norm": 0.770429790019989, + "learning_rate": 8.766108048544359e-05, + "loss": 0.9429, + "step": 71560 + }, + { + "epoch": 0.45724033067988706, + "grad_norm": 1.4733335971832275, + "learning_rate": 8.7657779829033e-05, + "loss": 0.9293, + "step": 71570 + }, + { + "epoch": 0.45730421782962577, + "grad_norm": 0.8638445138931274, + "learning_rate": 8.765447879337601e-05, + "loss": 0.9278, + "step": 71580 + }, + { + "epoch": 0.4573681049793645, + "grad_norm": 1.1189749240875244, + "learning_rate": 
8.765117737850584e-05, + "loss": 0.9357, + "step": 71590 + }, + { + "epoch": 0.4574319921291031, + "grad_norm": 0.82524174451828, + "learning_rate": 8.764787558445573e-05, + "loss": 1.1374, + "step": 71600 + }, + { + "epoch": 0.45749587927884183, + "grad_norm": 1.3126778602600098, + "learning_rate": 8.764457341125894e-05, + "loss": 1.1194, + "step": 71610 + }, + { + "epoch": 0.45755976642858054, + "grad_norm": 0.6606014370918274, + "learning_rate": 8.764127085894874e-05, + "loss": 0.815, + "step": 71620 + }, + { + "epoch": 0.45762365357831924, + "grad_norm": 0.8767343163490295, + "learning_rate": 8.763796792755836e-05, + "loss": 1.0984, + "step": 71630 + }, + { + "epoch": 0.45768754072805795, + "grad_norm": 0.6078357100486755, + "learning_rate": 8.763466461712108e-05, + "loss": 0.9051, + "step": 71640 + }, + { + "epoch": 0.45775142787779666, + "grad_norm": 0.7068758010864258, + "learning_rate": 8.763136092767019e-05, + "loss": 0.8346, + "step": 71650 + }, + { + "epoch": 0.45781531502753536, + "grad_norm": 1.372742772102356, + "learning_rate": 8.762805685923894e-05, + "loss": 0.9978, + "step": 71660 + }, + { + "epoch": 0.45787920217727407, + "grad_norm": 0.9161148071289062, + "learning_rate": 8.762475241186059e-05, + "loss": 1.0414, + "step": 71670 + }, + { + "epoch": 0.4579430893270128, + "grad_norm": 0.8733229637145996, + "learning_rate": 8.762144758556846e-05, + "loss": 0.5721, + "step": 71680 + }, + { + "epoch": 0.4580069764767515, + "grad_norm": 0.966275691986084, + "learning_rate": 8.761814238039576e-05, + "loss": 1.0169, + "step": 71690 + }, + { + "epoch": 0.4580708636264902, + "grad_norm": 0.8296000957489014, + "learning_rate": 8.761483679637585e-05, + "loss": 1.0115, + "step": 71700 + }, + { + "epoch": 0.4581347507762289, + "grad_norm": 0.718637228012085, + "learning_rate": 8.761153083354198e-05, + "loss": 0.9809, + "step": 71710 + }, + { + "epoch": 0.45819863792596754, + "grad_norm": 0.6848533153533936, + "learning_rate": 8.760822449192747e-05, + "loss": 0.9013, + "step": 71720 + }, + { + "epoch": 0.45826252507570625, + "grad_norm": 0.9018274545669556, + "learning_rate": 8.760491777156561e-05, + "loss": 1.2462, + "step": 71730 + }, + { + "epoch": 0.45832641222544496, + "grad_norm": 0.8693473935127258, + "learning_rate": 8.760161067248968e-05, + "loss": 1.1541, + "step": 71740 + }, + { + "epoch": 0.45839029937518366, + "grad_norm": 0.7893520593643188, + "learning_rate": 8.759830319473302e-05, + "loss": 0.7432, + "step": 71750 + }, + { + "epoch": 0.45845418652492237, + "grad_norm": 0.704264223575592, + "learning_rate": 8.759499533832889e-05, + "loss": 0.653, + "step": 71760 + }, + { + "epoch": 0.4585180736746611, + "grad_norm": 0.7048154473304749, + "learning_rate": 8.759168710331064e-05, + "loss": 0.7997, + "step": 71770 + }, + { + "epoch": 0.4585819608243998, + "grad_norm": 0.891446590423584, + "learning_rate": 8.75883784897116e-05, + "loss": 0.8512, + "step": 71780 + }, + { + "epoch": 0.4586458479741385, + "grad_norm": 0.7560920715332031, + "learning_rate": 8.758506949756505e-05, + "loss": 1.0229, + "step": 71790 + }, + { + "epoch": 0.4587097351238772, + "grad_norm": 0.9192051887512207, + "learning_rate": 8.758176012690433e-05, + "loss": 0.8684, + "step": 71800 + }, + { + "epoch": 0.4587736222736159, + "grad_norm": 0.9770452380180359, + "learning_rate": 8.757845037776279e-05, + "loss": 0.7901, + "step": 71810 + }, + { + "epoch": 0.4588375094233546, + "grad_norm": 0.8596158623695374, + "learning_rate": 8.757514025017374e-05, + "loss": 1.098, + "step": 71820 + }, + { + "epoch": 
0.4589013965730933, + "grad_norm": 0.8267128467559814, + "learning_rate": 8.757182974417051e-05, + "loss": 1.2681, + "step": 71830 + }, + { + "epoch": 0.45896528372283196, + "grad_norm": 0.850165069103241, + "learning_rate": 8.756851885978646e-05, + "loss": 1.0255, + "step": 71840 + }, + { + "epoch": 0.45902917087257067, + "grad_norm": 0.8577935695648193, + "learning_rate": 8.756520759705494e-05, + "loss": 0.729, + "step": 71850 + }, + { + "epoch": 0.4590930580223094, + "grad_norm": 0.9524192214012146, + "learning_rate": 8.756189595600924e-05, + "loss": 0.7808, + "step": 71860 + }, + { + "epoch": 0.4591569451720481, + "grad_norm": 0.7128868699073792, + "learning_rate": 8.755858393668278e-05, + "loss": 0.9852, + "step": 71870 + }, + { + "epoch": 0.4592208323217868, + "grad_norm": 0.7015582919120789, + "learning_rate": 8.755527153910888e-05, + "loss": 1.0348, + "step": 71880 + }, + { + "epoch": 0.4592847194715255, + "grad_norm": 0.7809120416641235, + "learning_rate": 8.755195876332092e-05, + "loss": 0.9461, + "step": 71890 + }, + { + "epoch": 0.4593486066212642, + "grad_norm": 1.345109224319458, + "learning_rate": 8.754864560935223e-05, + "loss": 0.7244, + "step": 71900 + }, + { + "epoch": 0.4594124937710029, + "grad_norm": 1.41587233543396, + "learning_rate": 8.75453320772362e-05, + "loss": 1.0676, + "step": 71910 + }, + { + "epoch": 0.4594763809207416, + "grad_norm": 0.6541619300842285, + "learning_rate": 8.754201816700619e-05, + "loss": 0.7355, + "step": 71920 + }, + { + "epoch": 0.4595402680704803, + "grad_norm": 0.9459201693534851, + "learning_rate": 8.753870387869558e-05, + "loss": 0.9362, + "step": 71930 + }, + { + "epoch": 0.459604155220219, + "grad_norm": 1.1776293516159058, + "learning_rate": 8.753538921233776e-05, + "loss": 1.0149, + "step": 71940 + }, + { + "epoch": 0.45966804236995773, + "grad_norm": 0.7286693453788757, + "learning_rate": 8.753207416796608e-05, + "loss": 0.8923, + "step": 71950 + }, + { + "epoch": 0.4597319295196964, + "grad_norm": 0.6185081601142883, + "learning_rate": 8.752875874561395e-05, + "loss": 0.7427, + "step": 71960 + }, + { + "epoch": 0.4597958166694351, + "grad_norm": 1.2175235748291016, + "learning_rate": 8.752544294531474e-05, + "loss": 0.8131, + "step": 71970 + }, + { + "epoch": 0.4598597038191738, + "grad_norm": 1.1320040225982666, + "learning_rate": 8.752212676710188e-05, + "loss": 0.9201, + "step": 71980 + }, + { + "epoch": 0.4599235909689125, + "grad_norm": 0.9990871548652649, + "learning_rate": 8.751881021100874e-05, + "loss": 0.7032, + "step": 71990 + }, + { + "epoch": 0.4599874781186512, + "grad_norm": 0.7369240522384644, + "learning_rate": 8.751549327706872e-05, + "loss": 0.7436, + "step": 72000 + }, + { + "epoch": 0.4600513652683899, + "grad_norm": 1.1463274955749512, + "learning_rate": 8.75121759653152e-05, + "loss": 0.8619, + "step": 72010 + }, + { + "epoch": 0.4601152524181286, + "grad_norm": 1.0860117673873901, + "learning_rate": 8.750885827578165e-05, + "loss": 0.8812, + "step": 72020 + }, + { + "epoch": 0.4601791395678673, + "grad_norm": 1.319459319114685, + "learning_rate": 8.750554020850144e-05, + "loss": 0.8778, + "step": 72030 + }, + { + "epoch": 0.46024302671760603, + "grad_norm": 0.8144885301589966, + "learning_rate": 8.750222176350798e-05, + "loss": 0.9735, + "step": 72040 + }, + { + "epoch": 0.46030691386734474, + "grad_norm": 0.9385289549827576, + "learning_rate": 8.749890294083471e-05, + "loss": 0.9461, + "step": 72050 + }, + { + "epoch": 0.46037080101708344, + "grad_norm": 0.8200061321258545, + "learning_rate": 
8.749558374051505e-05, + "loss": 1.1353, + "step": 72060 + }, + { + "epoch": 0.46043468816682215, + "grad_norm": 1.0239852666854858, + "learning_rate": 8.749226416258242e-05, + "loss": 0.8544, + "step": 72070 + }, + { + "epoch": 0.46049857531656085, + "grad_norm": 0.8354944586753845, + "learning_rate": 8.748894420707025e-05, + "loss": 0.9432, + "step": 72080 + }, + { + "epoch": 0.4605624624662995, + "grad_norm": 1.8811396360397339, + "learning_rate": 8.748562387401197e-05, + "loss": 1.0131, + "step": 72090 + }, + { + "epoch": 0.4606263496160382, + "grad_norm": 0.9125493764877319, + "learning_rate": 8.748230316344106e-05, + "loss": 0.8341, + "step": 72100 + }, + { + "epoch": 0.4606902367657769, + "grad_norm": 0.9623830914497375, + "learning_rate": 8.747898207539092e-05, + "loss": 0.8355, + "step": 72110 + }, + { + "epoch": 0.4607541239155156, + "grad_norm": 0.845895528793335, + "learning_rate": 8.747566060989498e-05, + "loss": 0.6371, + "step": 72120 + }, + { + "epoch": 0.46081801106525433, + "grad_norm": 1.4472181797027588, + "learning_rate": 8.747233876698674e-05, + "loss": 0.6948, + "step": 72130 + }, + { + "epoch": 0.46088189821499304, + "grad_norm": 1.0555568933486938, + "learning_rate": 8.746901654669962e-05, + "loss": 0.984, + "step": 72140 + }, + { + "epoch": 0.46094578536473174, + "grad_norm": 0.6194450259208679, + "learning_rate": 8.746569394906709e-05, + "loss": 0.7546, + "step": 72150 + }, + { + "epoch": 0.46100967251447045, + "grad_norm": 0.5184745192527771, + "learning_rate": 8.746237097412262e-05, + "loss": 0.6829, + "step": 72160 + }, + { + "epoch": 0.46107355966420915, + "grad_norm": 0.696123480796814, + "learning_rate": 8.745904762189966e-05, + "loss": 0.9351, + "step": 72170 + }, + { + "epoch": 0.46113744681394786, + "grad_norm": 0.8792576789855957, + "learning_rate": 8.745572389243168e-05, + "loss": 0.8513, + "step": 72180 + }, + { + "epoch": 0.46120133396368657, + "grad_norm": 0.8823778629302979, + "learning_rate": 8.745239978575215e-05, + "loss": 1.1258, + "step": 72190 + }, + { + "epoch": 0.46126522111342527, + "grad_norm": 0.9086830019950867, + "learning_rate": 8.744907530189457e-05, + "loss": 1.0118, + "step": 72200 + }, + { + "epoch": 0.4613291082631639, + "grad_norm": 1.1880900859832764, + "learning_rate": 8.74457504408924e-05, + "loss": 0.8436, + "step": 72210 + }, + { + "epoch": 0.46139299541290263, + "grad_norm": 1.3979392051696777, + "learning_rate": 8.744242520277912e-05, + "loss": 0.8647, + "step": 72220 + }, + { + "epoch": 0.46145688256264134, + "grad_norm": 1.0479652881622314, + "learning_rate": 8.743909958758823e-05, + "loss": 0.8345, + "step": 72230 + }, + { + "epoch": 0.46152076971238004, + "grad_norm": 0.9071272015571594, + "learning_rate": 8.743577359535321e-05, + "loss": 0.9718, + "step": 72240 + }, + { + "epoch": 0.46158465686211875, + "grad_norm": 0.838445246219635, + "learning_rate": 8.743244722610757e-05, + "loss": 0.9786, + "step": 72250 + }, + { + "epoch": 0.46164854401185745, + "grad_norm": 0.6116316914558411, + "learning_rate": 8.742912047988481e-05, + "loss": 1.0287, + "step": 72260 + }, + { + "epoch": 0.46171243116159616, + "grad_norm": 0.7119680047035217, + "learning_rate": 8.742579335671841e-05, + "loss": 0.8999, + "step": 72270 + }, + { + "epoch": 0.46177631831133487, + "grad_norm": 0.8352906107902527, + "learning_rate": 8.74224658566419e-05, + "loss": 0.7239, + "step": 72280 + }, + { + "epoch": 0.46184020546107357, + "grad_norm": 1.3281790018081665, + "learning_rate": 8.741913797968879e-05, + "loss": 0.8587, + "step": 72290 + }, + { + 
"epoch": 0.4619040926108123, + "grad_norm": 1.4477615356445312, + "learning_rate": 8.741580972589258e-05, + "loss": 1.1694, + "step": 72300 + }, + { + "epoch": 0.461967979760551, + "grad_norm": 0.925883948802948, + "learning_rate": 8.741248109528679e-05, + "loss": 0.8509, + "step": 72310 + }, + { + "epoch": 0.4620318669102897, + "grad_norm": 1.4060012102127075, + "learning_rate": 8.740915208790496e-05, + "loss": 0.968, + "step": 72320 + }, + { + "epoch": 0.46209575406002834, + "grad_norm": 0.7903311848640442, + "learning_rate": 8.740582270378061e-05, + "loss": 0.8068, + "step": 72330 + }, + { + "epoch": 0.46215964120976705, + "grad_norm": 0.8597942590713501, + "learning_rate": 8.740249294294727e-05, + "loss": 0.661, + "step": 72340 + }, + { + "epoch": 0.46222352835950575, + "grad_norm": 1.3927608728408813, + "learning_rate": 8.739916280543845e-05, + "loss": 0.9103, + "step": 72350 + }, + { + "epoch": 0.46228741550924446, + "grad_norm": 1.1652497053146362, + "learning_rate": 8.739583229128771e-05, + "loss": 0.726, + "step": 72360 + }, + { + "epoch": 0.46235130265898317, + "grad_norm": 1.1774413585662842, + "learning_rate": 8.739250140052859e-05, + "loss": 0.8478, + "step": 72370 + }, + { + "epoch": 0.46241518980872187, + "grad_norm": 1.1708256006240845, + "learning_rate": 8.738917013319463e-05, + "loss": 0.8796, + "step": 72380 + }, + { + "epoch": 0.4624790769584606, + "grad_norm": 0.6467223763465881, + "learning_rate": 8.738583848931938e-05, + "loss": 0.9981, + "step": 72390 + }, + { + "epoch": 0.4625429641081993, + "grad_norm": 0.9683516621589661, + "learning_rate": 8.73825064689364e-05, + "loss": 0.9423, + "step": 72400 + }, + { + "epoch": 0.462606851257938, + "grad_norm": 1.3207199573516846, + "learning_rate": 8.737917407207922e-05, + "loss": 0.8087, + "step": 72410 + }, + { + "epoch": 0.4626707384076767, + "grad_norm": 0.6273083090782166, + "learning_rate": 8.737584129878145e-05, + "loss": 0.8718, + "step": 72420 + }, + { + "epoch": 0.4627346255574154, + "grad_norm": 0.6665430068969727, + "learning_rate": 8.73725081490766e-05, + "loss": 0.7988, + "step": 72430 + }, + { + "epoch": 0.4627985127071541, + "grad_norm": 0.5622584819793701, + "learning_rate": 8.736917462299827e-05, + "loss": 0.7487, + "step": 72440 + }, + { + "epoch": 0.46286239985689276, + "grad_norm": 2.063795566558838, + "learning_rate": 8.736584072058003e-05, + "loss": 0.828, + "step": 72450 + }, + { + "epoch": 0.46292628700663147, + "grad_norm": 0.960210919380188, + "learning_rate": 8.736250644185545e-05, + "loss": 0.9685, + "step": 72460 + }, + { + "epoch": 0.46299017415637017, + "grad_norm": 0.7123231291770935, + "learning_rate": 8.735917178685807e-05, + "loss": 0.8832, + "step": 72470 + }, + { + "epoch": 0.4630540613061089, + "grad_norm": 1.0152729749679565, + "learning_rate": 8.735583675562154e-05, + "loss": 0.8156, + "step": 72480 + }, + { + "epoch": 0.4631179484558476, + "grad_norm": 0.9183365106582642, + "learning_rate": 8.735250134817942e-05, + "loss": 0.8343, + "step": 72490 + }, + { + "epoch": 0.4631818356055863, + "grad_norm": 0.9543598890304565, + "learning_rate": 8.734916556456528e-05, + "loss": 0.7168, + "step": 72500 + }, + { + "epoch": 0.463245722755325, + "grad_norm": 0.6270715594291687, + "learning_rate": 8.734582940481275e-05, + "loss": 0.9325, + "step": 72510 + }, + { + "epoch": 0.4633096099050637, + "grad_norm": 1.1470279693603516, + "learning_rate": 8.73424928689554e-05, + "loss": 0.898, + "step": 72520 + }, + { + "epoch": 0.4633734970548024, + "grad_norm": 1.4141347408294678, + "learning_rate": 
8.733915595702685e-05, + "loss": 1.1052, + "step": 72530 + }, + { + "epoch": 0.4634373842045411, + "grad_norm": 0.9898306727409363, + "learning_rate": 8.733581866906066e-05, + "loss": 1.1064, + "step": 72540 + }, + { + "epoch": 0.4635012713542798, + "grad_norm": 1.889242172241211, + "learning_rate": 8.733248100509052e-05, + "loss": 0.7938, + "step": 72550 + }, + { + "epoch": 0.4635651585040185, + "grad_norm": 1.8361594676971436, + "learning_rate": 8.732914296514998e-05, + "loss": 0.8884, + "step": 72560 + }, + { + "epoch": 0.4636290456537572, + "grad_norm": 1.3661187887191772, + "learning_rate": 8.732580454927267e-05, + "loss": 0.7056, + "step": 72570 + }, + { + "epoch": 0.4636929328034959, + "grad_norm": 0.8329597115516663, + "learning_rate": 8.732246575749223e-05, + "loss": 0.6071, + "step": 72580 + }, + { + "epoch": 0.4637568199532346, + "grad_norm": 0.803459107875824, + "learning_rate": 8.731912658984227e-05, + "loss": 0.8693, + "step": 72590 + }, + { + "epoch": 0.4638207071029733, + "grad_norm": 0.7911583185195923, + "learning_rate": 8.731578704635642e-05, + "loss": 0.7637, + "step": 72600 + }, + { + "epoch": 0.463884594252712, + "grad_norm": 0.9523131847381592, + "learning_rate": 8.73124471270683e-05, + "loss": 0.9817, + "step": 72610 + }, + { + "epoch": 0.4639484814024507, + "grad_norm": 0.9202901124954224, + "learning_rate": 8.730910683201157e-05, + "loss": 0.7682, + "step": 72620 + }, + { + "epoch": 0.4640123685521894, + "grad_norm": 0.8219014406204224, + "learning_rate": 8.730576616121984e-05, + "loss": 0.8304, + "step": 72630 + }, + { + "epoch": 0.4640762557019281, + "grad_norm": 0.653312087059021, + "learning_rate": 8.73024251147268e-05, + "loss": 0.6409, + "step": 72640 + }, + { + "epoch": 0.4641401428516668, + "grad_norm": 0.7120294570922852, + "learning_rate": 8.729908369256603e-05, + "loss": 0.7654, + "step": 72650 + }, + { + "epoch": 0.46420403000140553, + "grad_norm": 0.9074612259864807, + "learning_rate": 8.729574189477124e-05, + "loss": 0.9235, + "step": 72660 + }, + { + "epoch": 0.46426791715114424, + "grad_norm": 1.1278350353240967, + "learning_rate": 8.729239972137608e-05, + "loss": 0.8423, + "step": 72670 + }, + { + "epoch": 0.46433180430088294, + "grad_norm": 0.8770177960395813, + "learning_rate": 8.728905717241417e-05, + "loss": 0.7709, + "step": 72680 + }, + { + "epoch": 0.4643956914506216, + "grad_norm": 0.8250672817230225, + "learning_rate": 8.728571424791921e-05, + "loss": 0.8168, + "step": 72690 + }, + { + "epoch": 0.4644595786003603, + "grad_norm": 1.0635161399841309, + "learning_rate": 8.728237094792482e-05, + "loss": 0.9243, + "step": 72700 + }, + { + "epoch": 0.464523465750099, + "grad_norm": 1.3261529207229614, + "learning_rate": 8.727902727246473e-05, + "loss": 0.9011, + "step": 72710 + }, + { + "epoch": 0.4645873528998377, + "grad_norm": 0.8366795182228088, + "learning_rate": 8.727568322157259e-05, + "loss": 1.0456, + "step": 72720 + }, + { + "epoch": 0.4646512400495764, + "grad_norm": 1.3640772104263306, + "learning_rate": 8.727233879528204e-05, + "loss": 0.8938, + "step": 72730 + }, + { + "epoch": 0.4647151271993151, + "grad_norm": 0.7236993312835693, + "learning_rate": 8.72689939936268e-05, + "loss": 0.8754, + "step": 72740 + }, + { + "epoch": 0.46477901434905383, + "grad_norm": 0.9828342199325562, + "learning_rate": 8.726564881664056e-05, + "loss": 0.8948, + "step": 72750 + }, + { + "epoch": 0.46484290149879254, + "grad_norm": 0.7750747799873352, + "learning_rate": 8.7262303264357e-05, + "loss": 0.9002, + "step": 72760 + }, + { + "epoch": 
0.46490678864853124, + "grad_norm": 1.2255038022994995, + "learning_rate": 8.725895733680983e-05, + "loss": 0.933, + "step": 72770 + }, + { + "epoch": 0.46497067579826995, + "grad_norm": 3.7937097549438477, + "learning_rate": 8.725561103403267e-05, + "loss": 0.8682, + "step": 72780 + }, + { + "epoch": 0.46503456294800866, + "grad_norm": 0.7408625483512878, + "learning_rate": 8.725226435605934e-05, + "loss": 0.644, + "step": 72790 + }, + { + "epoch": 0.46509845009774736, + "grad_norm": 0.8894087672233582, + "learning_rate": 8.724891730292344e-05, + "loss": 0.826, + "step": 72800 + }, + { + "epoch": 0.465162337247486, + "grad_norm": 1.360103964805603, + "learning_rate": 8.724556987465872e-05, + "loss": 0.9597, + "step": 72810 + }, + { + "epoch": 0.4652262243972247, + "grad_norm": 0.8977581858634949, + "learning_rate": 8.724222207129889e-05, + "loss": 0.7513, + "step": 72820 + }, + { + "epoch": 0.4652901115469634, + "grad_norm": 0.8301047086715698, + "learning_rate": 8.723887389287768e-05, + "loss": 0.7628, + "step": 72830 + }, + { + "epoch": 0.46535399869670213, + "grad_norm": 1.7645938396453857, + "learning_rate": 8.723552533942878e-05, + "loss": 0.8691, + "step": 72840 + }, + { + "epoch": 0.46541788584644084, + "grad_norm": 0.6487802267074585, + "learning_rate": 8.723217641098594e-05, + "loss": 0.7312, + "step": 72850 + }, + { + "epoch": 0.46548177299617954, + "grad_norm": 0.7930013537406921, + "learning_rate": 8.722882710758286e-05, + "loss": 0.9037, + "step": 72860 + }, + { + "epoch": 0.46554566014591825, + "grad_norm": 0.749622106552124, + "learning_rate": 8.722547742925328e-05, + "loss": 1.0156, + "step": 72870 + }, + { + "epoch": 0.46560954729565696, + "grad_norm": 0.8207896947860718, + "learning_rate": 8.722212737603095e-05, + "loss": 0.7249, + "step": 72880 + }, + { + "epoch": 0.46567343444539566, + "grad_norm": 2.454975128173828, + "learning_rate": 8.721877694794958e-05, + "loss": 0.823, + "step": 72890 + }, + { + "epoch": 0.46573732159513437, + "grad_norm": 0.46863147616386414, + "learning_rate": 8.721542614504294e-05, + "loss": 0.9426, + "step": 72900 + }, + { + "epoch": 0.4658012087448731, + "grad_norm": 1.0021847486495972, + "learning_rate": 8.721207496734476e-05, + "loss": 1.1263, + "step": 72910 + }, + { + "epoch": 0.4658650958946118, + "grad_norm": 0.6298357844352722, + "learning_rate": 8.720872341488879e-05, + "loss": 0.8613, + "step": 72920 + }, + { + "epoch": 0.4659289830443505, + "grad_norm": 1.0806231498718262, + "learning_rate": 8.72053714877088e-05, + "loss": 0.7853, + "step": 72930 + }, + { + "epoch": 0.46599287019408914, + "grad_norm": 0.9028376936912537, + "learning_rate": 8.720201918583853e-05, + "loss": 0.8704, + "step": 72940 + }, + { + "epoch": 0.46605675734382784, + "grad_norm": 1.1052665710449219, + "learning_rate": 8.719866650931172e-05, + "loss": 0.7413, + "step": 72950 + }, + { + "epoch": 0.46612064449356655, + "grad_norm": 0.8171069025993347, + "learning_rate": 8.719531345816216e-05, + "loss": 0.9989, + "step": 72960 + }, + { + "epoch": 0.46618453164330526, + "grad_norm": 1.0400487184524536, + "learning_rate": 8.719196003242362e-05, + "loss": 0.8036, + "step": 72970 + }, + { + "epoch": 0.46624841879304396, + "grad_norm": 0.5930902361869812, + "learning_rate": 8.718860623212988e-05, + "loss": 0.9512, + "step": 72980 + }, + { + "epoch": 0.46631230594278267, + "grad_norm": 0.9061450958251953, + "learning_rate": 8.718525205731469e-05, + "loss": 0.909, + "step": 72990 + }, + { + "epoch": 0.4663761930925214, + "grad_norm": 0.9812560081481934, + "learning_rate": 
8.718189750801184e-05, + "loss": 0.9485, + "step": 73000 + }, + { + "epoch": 0.4664400802422601, + "grad_norm": 1.0964970588684082, + "learning_rate": 8.717854258425512e-05, + "loss": 0.6829, + "step": 73010 + }, + { + "epoch": 0.4665039673919988, + "grad_norm": 0.513983964920044, + "learning_rate": 8.717518728607832e-05, + "loss": 0.7848, + "step": 73020 + }, + { + "epoch": 0.4665678545417375, + "grad_norm": 0.647631824016571, + "learning_rate": 8.71718316135152e-05, + "loss": 1.0222, + "step": 73030 + }, + { + "epoch": 0.4666317416914762, + "grad_norm": 0.7781062126159668, + "learning_rate": 8.716847556659961e-05, + "loss": 0.767, + "step": 73040 + }, + { + "epoch": 0.4666956288412149, + "grad_norm": 0.9073989987373352, + "learning_rate": 8.71651191453653e-05, + "loss": 0.7717, + "step": 73050 + }, + { + "epoch": 0.46675951599095356, + "grad_norm": 1.8708226680755615, + "learning_rate": 8.71617623498461e-05, + "loss": 0.902, + "step": 73060 + }, + { + "epoch": 0.46682340314069226, + "grad_norm": 0.9808720350265503, + "learning_rate": 8.715840518007578e-05, + "loss": 0.9002, + "step": 73070 + }, + { + "epoch": 0.46688729029043097, + "grad_norm": 0.9100602865219116, + "learning_rate": 8.715504763608818e-05, + "loss": 1.0678, + "step": 73080 + }, + { + "epoch": 0.4669511774401697, + "grad_norm": 0.6987651586532593, + "learning_rate": 8.71516897179171e-05, + "loss": 1.1864, + "step": 73090 + }, + { + "epoch": 0.4670150645899084, + "grad_norm": 0.9975560903549194, + "learning_rate": 8.714833142559637e-05, + "loss": 1.4101, + "step": 73100 + }, + { + "epoch": 0.4670789517396471, + "grad_norm": 1.2323815822601318, + "learning_rate": 8.714497275915982e-05, + "loss": 0.81, + "step": 73110 + }, + { + "epoch": 0.4671428388893858, + "grad_norm": 1.1183509826660156, + "learning_rate": 8.714161371864124e-05, + "loss": 0.8636, + "step": 73120 + }, + { + "epoch": 0.4672067260391245, + "grad_norm": 0.9153540730476379, + "learning_rate": 8.71382543040745e-05, + "loss": 0.9229, + "step": 73130 + }, + { + "epoch": 0.4672706131888632, + "grad_norm": 0.6757118701934814, + "learning_rate": 8.71348945154934e-05, + "loss": 0.8663, + "step": 73140 + }, + { + "epoch": 0.4673345003386019, + "grad_norm": 0.9965721964836121, + "learning_rate": 8.713153435293178e-05, + "loss": 0.9432, + "step": 73150 + }, + { + "epoch": 0.4673983874883406, + "grad_norm": 3.0663352012634277, + "learning_rate": 8.712817381642348e-05, + "loss": 0.9173, + "step": 73160 + }, + { + "epoch": 0.4674622746380793, + "grad_norm": 0.8566670417785645, + "learning_rate": 8.712481290600235e-05, + "loss": 0.8049, + "step": 73170 + }, + { + "epoch": 0.467526161787818, + "grad_norm": 0.6621735095977783, + "learning_rate": 8.712145162170224e-05, + "loss": 1.0432, + "step": 73180 + }, + { + "epoch": 0.4675900489375567, + "grad_norm": 0.7513931393623352, + "learning_rate": 8.7118089963557e-05, + "loss": 0.999, + "step": 73190 + }, + { + "epoch": 0.4676539360872954, + "grad_norm": 0.6250850558280945, + "learning_rate": 8.711472793160049e-05, + "loss": 1.0574, + "step": 73200 + }, + { + "epoch": 0.4677178232370341, + "grad_norm": 1.0595519542694092, + "learning_rate": 8.711136552586655e-05, + "loss": 0.8253, + "step": 73210 + }, + { + "epoch": 0.4677817103867728, + "grad_norm": 0.5927673578262329, + "learning_rate": 8.71083390411543e-05, + "loss": 0.9964, + "step": 73220 + }, + { + "epoch": 0.4678455975365115, + "grad_norm": 0.9299998879432678, + "learning_rate": 8.710497592533657e-05, + "loss": 0.8753, + "step": 73230 + }, + { + "epoch": 
0.4679094846862502, + "grad_norm": 0.6862097978591919, + "learning_rate": 8.710161243583962e-05, + "loss": 0.7737, + "step": 73240 + }, + { + "epoch": 0.4679733718359889, + "grad_norm": 0.6349765062332153, + "learning_rate": 8.709824857269732e-05, + "loss": 0.7059, + "step": 73250 + }, + { + "epoch": 0.4680372589857276, + "grad_norm": 0.7105598449707031, + "learning_rate": 8.709488433594359e-05, + "loss": 0.9881, + "step": 73260 + }, + { + "epoch": 0.46810114613546633, + "grad_norm": 0.836338996887207, + "learning_rate": 8.709151972561228e-05, + "loss": 0.8385, + "step": 73270 + }, + { + "epoch": 0.46816503328520503, + "grad_norm": 0.8248547911643982, + "learning_rate": 8.708815474173728e-05, + "loss": 0.8845, + "step": 73280 + }, + { + "epoch": 0.46822892043494374, + "grad_norm": 3.468738079071045, + "learning_rate": 8.708478938435246e-05, + "loss": 1.0441, + "step": 73290 + }, + { + "epoch": 0.4682928075846824, + "grad_norm": 0.9611918330192566, + "learning_rate": 8.708142365349173e-05, + "loss": 1.1622, + "step": 73300 + }, + { + "epoch": 0.4683566947344211, + "grad_norm": 1.594110369682312, + "learning_rate": 8.7078057549189e-05, + "loss": 0.7014, + "step": 73310 + }, + { + "epoch": 0.4684205818841598, + "grad_norm": 0.8596274852752686, + "learning_rate": 8.707469107147815e-05, + "loss": 0.9094, + "step": 73320 + }, + { + "epoch": 0.4684844690338985, + "grad_norm": 0.673202395439148, + "learning_rate": 8.707132422039305e-05, + "loss": 1.0132, + "step": 73330 + }, + { + "epoch": 0.4685483561836372, + "grad_norm": 0.6166740655899048, + "learning_rate": 8.706795699596769e-05, + "loss": 0.8095, + "step": 73340 + }, + { + "epoch": 0.4686122433333759, + "grad_norm": 0.7982991337776184, + "learning_rate": 8.706458939823592e-05, + "loss": 0.8268, + "step": 73350 + }, + { + "epoch": 0.46867613048311463, + "grad_norm": 2.1832122802734375, + "learning_rate": 8.706122142723167e-05, + "loss": 0.9482, + "step": 73360 + }, + { + "epoch": 0.46874001763285333, + "grad_norm": 0.5912348031997681, + "learning_rate": 8.705785308298886e-05, + "loss": 0.9366, + "step": 73370 + }, + { + "epoch": 0.46880390478259204, + "grad_norm": 0.9966716766357422, + "learning_rate": 8.705448436554139e-05, + "loss": 1.1344, + "step": 73380 + }, + { + "epoch": 0.46886779193233075, + "grad_norm": 0.6441813111305237, + "learning_rate": 8.705111527492322e-05, + "loss": 1.0889, + "step": 73390 + }, + { + "epoch": 0.46893167908206945, + "grad_norm": 0.8401795029640198, + "learning_rate": 8.704774581116827e-05, + "loss": 1.0559, + "step": 73400 + }, + { + "epoch": 0.46899556623180816, + "grad_norm": 0.7190397381782532, + "learning_rate": 8.704437597431047e-05, + "loss": 0.9705, + "step": 73410 + }, + { + "epoch": 0.4690594533815468, + "grad_norm": 0.8827881217002869, + "learning_rate": 8.704100576438374e-05, + "loss": 0.6985, + "step": 73420 + }, + { + "epoch": 0.4691233405312855, + "grad_norm": 1.544293999671936, + "learning_rate": 8.703763518142205e-05, + "loss": 0.9164, + "step": 73430 + }, + { + "epoch": 0.4691872276810242, + "grad_norm": 1.1441346406936646, + "learning_rate": 8.703426422545934e-05, + "loss": 1.31, + "step": 73440 + }, + { + "epoch": 0.4692511148307629, + "grad_norm": 0.8129305243492126, + "learning_rate": 8.703089289652954e-05, + "loss": 0.8406, + "step": 73450 + }, + { + "epoch": 0.46931500198050163, + "grad_norm": 0.8427706956863403, + "learning_rate": 8.70275211946666e-05, + "loss": 0.873, + "step": 73460 + }, + { + "epoch": 0.46937888913024034, + "grad_norm": 1.3682218790054321, + "learning_rate": 
8.70241491199045e-05, + "loss": 0.7712, + "step": 73470 + }, + { + "epoch": 0.46944277627997905, + "grad_norm": 0.7666106820106506, + "learning_rate": 8.70207766722772e-05, + "loss": 0.8279, + "step": 73480 + }, + { + "epoch": 0.46950666342971775, + "grad_norm": 0.8763406276702881, + "learning_rate": 8.701740385181863e-05, + "loss": 0.7302, + "step": 73490 + }, + { + "epoch": 0.46957055057945646, + "grad_norm": 0.8393523693084717, + "learning_rate": 8.70140306585628e-05, + "loss": 0.6908, + "step": 73500 + }, + { + "epoch": 0.46963443772919516, + "grad_norm": 0.8529371619224548, + "learning_rate": 8.701065709254363e-05, + "loss": 0.7582, + "step": 73510 + }, + { + "epoch": 0.46969832487893387, + "grad_norm": 0.9004521369934082, + "learning_rate": 8.700728315379515e-05, + "loss": 0.8145, + "step": 73520 + }, + { + "epoch": 0.4697622120286726, + "grad_norm": 1.3330974578857422, + "learning_rate": 8.70039088423513e-05, + "loss": 0.7189, + "step": 73530 + }, + { + "epoch": 0.4698260991784112, + "grad_norm": 0.6903400421142578, + "learning_rate": 8.700053415824608e-05, + "loss": 1.0698, + "step": 73540 + }, + { + "epoch": 0.46988998632814993, + "grad_norm": 0.8569963574409485, + "learning_rate": 8.699715910151347e-05, + "loss": 0.8589, + "step": 73550 + }, + { + "epoch": 0.46995387347788864, + "grad_norm": 1.7143669128417969, + "learning_rate": 8.699378367218747e-05, + "loss": 0.9826, + "step": 73560 + }, + { + "epoch": 0.47001776062762735, + "grad_norm": 0.8588539361953735, + "learning_rate": 8.699040787030205e-05, + "loss": 0.7637, + "step": 73570 + }, + { + "epoch": 0.47008164777736605, + "grad_norm": 0.7576454877853394, + "learning_rate": 8.698703169589122e-05, + "loss": 0.8103, + "step": 73580 + }, + { + "epoch": 0.47014553492710476, + "grad_norm": 1.1570996046066284, + "learning_rate": 8.698365514898899e-05, + "loss": 0.9776, + "step": 73590 + }, + { + "epoch": 0.47020942207684346, + "grad_norm": 0.5057058334350586, + "learning_rate": 8.698027822962937e-05, + "loss": 0.9414, + "step": 73600 + }, + { + "epoch": 0.47027330922658217, + "grad_norm": 0.6375735402107239, + "learning_rate": 8.697690093784634e-05, + "loss": 0.7779, + "step": 73610 + }, + { + "epoch": 0.4703371963763209, + "grad_norm": 0.9835091829299927, + "learning_rate": 8.697352327367391e-05, + "loss": 0.8034, + "step": 73620 + }, + { + "epoch": 0.4704010835260596, + "grad_norm": 0.5068366527557373, + "learning_rate": 8.697014523714615e-05, + "loss": 0.7921, + "step": 73630 + }, + { + "epoch": 0.4704649706757983, + "grad_norm": 0.646186351776123, + "learning_rate": 8.696676682829704e-05, + "loss": 0.7929, + "step": 73640 + }, + { + "epoch": 0.470528857825537, + "grad_norm": 0.6051701307296753, + "learning_rate": 8.696338804716058e-05, + "loss": 0.7974, + "step": 73650 + }, + { + "epoch": 0.4705927449752757, + "grad_norm": 0.7829045653343201, + "learning_rate": 8.696000889377085e-05, + "loss": 0.8989, + "step": 73660 + }, + { + "epoch": 0.47065663212501435, + "grad_norm": 0.8022125959396362, + "learning_rate": 8.695662936816185e-05, + "loss": 0.9278, + "step": 73670 + }, + { + "epoch": 0.47072051927475306, + "grad_norm": 1.3843055963516235, + "learning_rate": 8.69532494703676e-05, + "loss": 0.7795, + "step": 73680 + }, + { + "epoch": 0.47078440642449176, + "grad_norm": 0.8143162727355957, + "learning_rate": 8.694986920042218e-05, + "loss": 0.8787, + "step": 73690 + }, + { + "epoch": 0.47084829357423047, + "grad_norm": 0.9179696440696716, + "learning_rate": 8.694648855835961e-05, + "loss": 0.8331, + "step": 73700 + }, + { + 
"epoch": 0.4709121807239692, + "grad_norm": 0.7905839085578918, + "learning_rate": 8.694310754421393e-05, + "loss": 0.8672, + "step": 73710 + }, + { + "epoch": 0.4709760678737079, + "grad_norm": 0.642015278339386, + "learning_rate": 8.69397261580192e-05, + "loss": 0.6751, + "step": 73720 + }, + { + "epoch": 0.4710399550234466, + "grad_norm": 0.7508492469787598, + "learning_rate": 8.693634439980946e-05, + "loss": 1.0497, + "step": 73730 + }, + { + "epoch": 0.4711038421731853, + "grad_norm": 0.7233025431632996, + "learning_rate": 8.693296226961879e-05, + "loss": 0.9594, + "step": 73740 + }, + { + "epoch": 0.471167729322924, + "grad_norm": 0.9499550461769104, + "learning_rate": 8.692957976748124e-05, + "loss": 1.0151, + "step": 73750 + }, + { + "epoch": 0.4712316164726627, + "grad_norm": 2.5865375995635986, + "learning_rate": 8.692619689343087e-05, + "loss": 0.8826, + "step": 73760 + }, + { + "epoch": 0.4712955036224014, + "grad_norm": 1.2711101770401, + "learning_rate": 8.692281364750174e-05, + "loss": 1.1665, + "step": 73770 + }, + { + "epoch": 0.4713593907721401, + "grad_norm": 2.8341193199157715, + "learning_rate": 8.691943002972794e-05, + "loss": 0.7414, + "step": 73780 + }, + { + "epoch": 0.47142327792187877, + "grad_norm": 1.014237880706787, + "learning_rate": 8.691604604014355e-05, + "loss": 1.0644, + "step": 73790 + }, + { + "epoch": 0.4714871650716175, + "grad_norm": 0.9451431632041931, + "learning_rate": 8.691266167878263e-05, + "loss": 0.9587, + "step": 73800 + }, + { + "epoch": 0.4715510522213562, + "grad_norm": 0.7285395264625549, + "learning_rate": 8.690927694567927e-05, + "loss": 0.8145, + "step": 73810 + }, + { + "epoch": 0.4716149393710949, + "grad_norm": 0.9039714932441711, + "learning_rate": 8.690589184086758e-05, + "loss": 0.9801, + "step": 73820 + }, + { + "epoch": 0.4716788265208336, + "grad_norm": 1.3245606422424316, + "learning_rate": 8.690250636438161e-05, + "loss": 0.8307, + "step": 73830 + }, + { + "epoch": 0.4717427136705723, + "grad_norm": 0.8605784773826599, + "learning_rate": 8.689912051625549e-05, + "loss": 0.802, + "step": 73840 + }, + { + "epoch": 0.471806600820311, + "grad_norm": 0.8803051710128784, + "learning_rate": 8.689573429652329e-05, + "loss": 0.9706, + "step": 73850 + }, + { + "epoch": 0.4718704879700497, + "grad_norm": 1.14476478099823, + "learning_rate": 8.689234770521913e-05, + "loss": 1.1188, + "step": 73860 + }, + { + "epoch": 0.4719343751197884, + "grad_norm": 0.6339378356933594, + "learning_rate": 8.688896074237712e-05, + "loss": 1.0567, + "step": 73870 + }, + { + "epoch": 0.4719982622695271, + "grad_norm": 1.278977394104004, + "learning_rate": 8.688557340803135e-05, + "loss": 1.0485, + "step": 73880 + }, + { + "epoch": 0.47206214941926583, + "grad_norm": 0.6915751695632935, + "learning_rate": 8.688218570221596e-05, + "loss": 1.0948, + "step": 73890 + }, + { + "epoch": 0.47212603656900454, + "grad_norm": 0.6988540887832642, + "learning_rate": 8.687879762496504e-05, + "loss": 1.1413, + "step": 73900 + }, + { + "epoch": 0.4721899237187432, + "grad_norm": 0.9477376341819763, + "learning_rate": 8.687540917631273e-05, + "loss": 1.0465, + "step": 73910 + }, + { + "epoch": 0.4722538108684819, + "grad_norm": 0.8210738301277161, + "learning_rate": 8.687202035629314e-05, + "loss": 0.9296, + "step": 73920 + }, + { + "epoch": 0.4723176980182206, + "grad_norm": 1.0816015005111694, + "learning_rate": 8.686863116494042e-05, + "loss": 1.1211, + "step": 73930 + }, + { + "epoch": 0.4723815851679593, + "grad_norm": 0.5747155547142029, + "learning_rate": 
8.686524160228867e-05, + "loss": 0.9563, + "step": 73940 + }, + { + "epoch": 0.472445472317698, + "grad_norm": 0.9445788860321045, + "learning_rate": 8.686185166837206e-05, + "loss": 1.0368, + "step": 73950 + }, + { + "epoch": 0.4725093594674367, + "grad_norm": 0.9299923181533813, + "learning_rate": 8.685846136322471e-05, + "loss": 0.8771, + "step": 73960 + }, + { + "epoch": 0.4725732466171754, + "grad_norm": 0.8922392725944519, + "learning_rate": 8.685507068688075e-05, + "loss": 0.7732, + "step": 73970 + }, + { + "epoch": 0.47263713376691413, + "grad_norm": 1.0317169427871704, + "learning_rate": 8.685167963937437e-05, + "loss": 0.7212, + "step": 73980 + }, + { + "epoch": 0.47270102091665284, + "grad_norm": 0.6838691234588623, + "learning_rate": 8.684828822073967e-05, + "loss": 0.9596, + "step": 73990 + }, + { + "epoch": 0.47276490806639154, + "grad_norm": 0.93050616979599, + "learning_rate": 8.684489643101085e-05, + "loss": 0.9891, + "step": 74000 + }, + { + "epoch": 0.47282879521613025, + "grad_norm": 0.7852534651756287, + "learning_rate": 8.684150427022205e-05, + "loss": 0.8776, + "step": 74010 + }, + { + "epoch": 0.47289268236586895, + "grad_norm": 1.1245160102844238, + "learning_rate": 8.683811173840741e-05, + "loss": 1.0402, + "step": 74020 + }, + { + "epoch": 0.4729565695156076, + "grad_norm": 1.0451771020889282, + "learning_rate": 8.683471883560113e-05, + "loss": 0.8141, + "step": 74030 + }, + { + "epoch": 0.4730204566653463, + "grad_norm": 0.8366501331329346, + "learning_rate": 8.683132556183735e-05, + "loss": 0.9944, + "step": 74040 + }, + { + "epoch": 0.473084343815085, + "grad_norm": 1.0142920017242432, + "learning_rate": 8.682793191715027e-05, + "loss": 0.8935, + "step": 74050 + }, + { + "epoch": 0.4731482309648237, + "grad_norm": 0.7233960032463074, + "learning_rate": 8.682453790157405e-05, + "loss": 0.9414, + "step": 74060 + }, + { + "epoch": 0.47321211811456243, + "grad_norm": 0.8871427178382874, + "learning_rate": 8.682114351514287e-05, + "loss": 0.8502, + "step": 74070 + }, + { + "epoch": 0.47327600526430114, + "grad_norm": 1.854498028755188, + "learning_rate": 8.681774875789095e-05, + "loss": 1.076, + "step": 74080 + }, + { + "epoch": 0.47333989241403984, + "grad_norm": 0.7415865063667297, + "learning_rate": 8.681435362985242e-05, + "loss": 0.8037, + "step": 74090 + }, + { + "epoch": 0.47340377956377855, + "grad_norm": 0.7761117815971375, + "learning_rate": 8.68109581310615e-05, + "loss": 0.7338, + "step": 74100 + }, + { + "epoch": 0.47346766671351725, + "grad_norm": 1.1805341243743896, + "learning_rate": 8.68075622615524e-05, + "loss": 1.1164, + "step": 74110 + }, + { + "epoch": 0.47353155386325596, + "grad_norm": 0.7617985606193542, + "learning_rate": 8.680416602135929e-05, + "loss": 1.064, + "step": 74120 + }, + { + "epoch": 0.47359544101299467, + "grad_norm": 0.5913506746292114, + "learning_rate": 8.68007694105164e-05, + "loss": 1.0071, + "step": 74130 + }, + { + "epoch": 0.4736593281627334, + "grad_norm": 1.0599095821380615, + "learning_rate": 8.679737242905792e-05, + "loss": 0.8347, + "step": 74140 + }, + { + "epoch": 0.473723215312472, + "grad_norm": 1.3343327045440674, + "learning_rate": 8.679397507701806e-05, + "loss": 1.1742, + "step": 74150 + }, + { + "epoch": 0.47378710246221073, + "grad_norm": 0.5925554037094116, + "learning_rate": 8.679057735443104e-05, + "loss": 0.9706, + "step": 74160 + }, + { + "epoch": 0.47385098961194944, + "grad_norm": 0.764336109161377, + "learning_rate": 8.678717926133109e-05, + "loss": 1.2481, + "step": 74170 + }, + { + "epoch": 
0.47391487676168814, + "grad_norm": 0.8975499868392944, + "learning_rate": 8.678378079775241e-05, + "loss": 0.9182, + "step": 74180 + }, + { + "epoch": 0.47397876391142685, + "grad_norm": 0.7242470383644104, + "learning_rate": 8.678038196372925e-05, + "loss": 0.7105, + "step": 74190 + }, + { + "epoch": 0.47404265106116555, + "grad_norm": 0.8966102004051208, + "learning_rate": 8.67769827592958e-05, + "loss": 0.9065, + "step": 74200 + }, + { + "epoch": 0.47410653821090426, + "grad_norm": 0.5318197011947632, + "learning_rate": 8.677358318448633e-05, + "loss": 0.8434, + "step": 74210 + }, + { + "epoch": 0.47417042536064297, + "grad_norm": 1.834756851196289, + "learning_rate": 8.677018323933505e-05, + "loss": 1.1204, + "step": 74220 + }, + { + "epoch": 0.4742343125103817, + "grad_norm": 1.0717896223068237, + "learning_rate": 8.676678292387623e-05, + "loss": 0.8128, + "step": 74230 + }, + { + "epoch": 0.4742981996601204, + "grad_norm": 0.7135387063026428, + "learning_rate": 8.67633822381441e-05, + "loss": 0.8956, + "step": 74240 + }, + { + "epoch": 0.4743620868098591, + "grad_norm": 1.2037732601165771, + "learning_rate": 8.675998118217289e-05, + "loss": 0.833, + "step": 74250 + }, + { + "epoch": 0.4744259739595978, + "grad_norm": 1.4196306467056274, + "learning_rate": 8.675657975599688e-05, + "loss": 1.05, + "step": 74260 + }, + { + "epoch": 0.47448986110933644, + "grad_norm": 0.9658291339874268, + "learning_rate": 8.675317795965031e-05, + "loss": 0.6996, + "step": 74270 + }, + { + "epoch": 0.47455374825907515, + "grad_norm": 1.4640781879425049, + "learning_rate": 8.674977579316745e-05, + "loss": 0.704, + "step": 74280 + }, + { + "epoch": 0.47461763540881385, + "grad_norm": 0.7206962704658508, + "learning_rate": 8.674637325658254e-05, + "loss": 0.6542, + "step": 74290 + }, + { + "epoch": 0.47468152255855256, + "grad_norm": 0.9174501895904541, + "learning_rate": 8.674297034992986e-05, + "loss": 0.904, + "step": 74300 + }, + { + "epoch": 0.47474540970829127, + "grad_norm": 0.7845925092697144, + "learning_rate": 8.673956707324369e-05, + "loss": 0.9329, + "step": 74310 + }, + { + "epoch": 0.47480929685802997, + "grad_norm": 1.1023668050765991, + "learning_rate": 8.67361634265583e-05, + "loss": 0.9712, + "step": 74320 + }, + { + "epoch": 0.4748731840077687, + "grad_norm": 1.1444423198699951, + "learning_rate": 8.673275940990796e-05, + "loss": 0.8582, + "step": 74330 + }, + { + "epoch": 0.4749370711575074, + "grad_norm": 0.9823821783065796, + "learning_rate": 8.672935502332696e-05, + "loss": 0.9597, + "step": 74340 + }, + { + "epoch": 0.4750009583072461, + "grad_norm": 1.3823814392089844, + "learning_rate": 8.672595026684955e-05, + "loss": 0.8345, + "step": 74350 + }, + { + "epoch": 0.4750648454569848, + "grad_norm": 1.1680278778076172, + "learning_rate": 8.672254514051009e-05, + "loss": 1.0284, + "step": 74360 + }, + { + "epoch": 0.4751287326067235, + "grad_norm": 0.6968647241592407, + "learning_rate": 8.67191396443428e-05, + "loss": 0.7296, + "step": 74370 + }, + { + "epoch": 0.4751926197564622, + "grad_norm": 1.0131860971450806, + "learning_rate": 8.671573377838202e-05, + "loss": 0.9332, + "step": 74380 + }, + { + "epoch": 0.47525650690620086, + "grad_norm": 1.0270569324493408, + "learning_rate": 8.671232754266203e-05, + "loss": 0.7498, + "step": 74390 + }, + { + "epoch": 0.47532039405593957, + "grad_norm": 0.8761411905288696, + "learning_rate": 8.670892093721715e-05, + "loss": 0.729, + "step": 74400 + }, + { + "epoch": 0.47538428120567827, + "grad_norm": 0.9730551838874817, + "learning_rate": 
8.670551396208168e-05, + "loss": 0.8988, + "step": 74410 + }, + { + "epoch": 0.475448168355417, + "grad_norm": 0.7609050869941711, + "learning_rate": 8.670210661728992e-05, + "loss": 0.965, + "step": 74420 + }, + { + "epoch": 0.4755120555051557, + "grad_norm": 0.9374824166297913, + "learning_rate": 8.669869890287621e-05, + "loss": 0.8523, + "step": 74430 + }, + { + "epoch": 0.4755759426548944, + "grad_norm": 0.9228322505950928, + "learning_rate": 8.669529081887484e-05, + "loss": 1.0975, + "step": 74440 + }, + { + "epoch": 0.4756398298046331, + "grad_norm": 0.8603367209434509, + "learning_rate": 8.669188236532013e-05, + "loss": 0.8295, + "step": 74450 + }, + { + "epoch": 0.4757037169543718, + "grad_norm": 0.9186978936195374, + "learning_rate": 8.668847354224645e-05, + "loss": 0.9944, + "step": 74460 + }, + { + "epoch": 0.4757676041041105, + "grad_norm": 0.8278791904449463, + "learning_rate": 8.668506434968808e-05, + "loss": 1.077, + "step": 74470 + }, + { + "epoch": 0.4758314912538492, + "grad_norm": 1.0060932636260986, + "learning_rate": 8.66816547876794e-05, + "loss": 1.1607, + "step": 74480 + }, + { + "epoch": 0.4758953784035879, + "grad_norm": 1.0081162452697754, + "learning_rate": 8.667824485625471e-05, + "loss": 0.8542, + "step": 74490 + }, + { + "epoch": 0.4759592655533266, + "grad_norm": 1.362919569015503, + "learning_rate": 8.667483455544835e-05, + "loss": 1.1144, + "step": 74500 + }, + { + "epoch": 0.47602315270306533, + "grad_norm": 0.8849195241928101, + "learning_rate": 8.667142388529467e-05, + "loss": 1.0793, + "step": 74510 + }, + { + "epoch": 0.476087039852804, + "grad_norm": 1.2261072397232056, + "learning_rate": 8.666801284582806e-05, + "loss": 0.6733, + "step": 74520 + }, + { + "epoch": 0.4761509270025427, + "grad_norm": 0.8894041180610657, + "learning_rate": 8.666460143708283e-05, + "loss": 0.8456, + "step": 74530 + }, + { + "epoch": 0.4762148141522814, + "grad_norm": 0.7797572016716003, + "learning_rate": 8.666118965909334e-05, + "loss": 0.7103, + "step": 74540 + }, + { + "epoch": 0.4762787013020201, + "grad_norm": 0.8391841650009155, + "learning_rate": 8.665777751189395e-05, + "loss": 0.9627, + "step": 74550 + }, + { + "epoch": 0.4763425884517588, + "grad_norm": 0.9427254796028137, + "learning_rate": 8.665436499551903e-05, + "loss": 0.8994, + "step": 74560 + }, + { + "epoch": 0.4764064756014975, + "grad_norm": 0.9323469996452332, + "learning_rate": 8.665095211000293e-05, + "loss": 0.9536, + "step": 74570 + }, + { + "epoch": 0.4764703627512362, + "grad_norm": 0.731502890586853, + "learning_rate": 8.664753885538005e-05, + "loss": 0.9477, + "step": 74580 + }, + { + "epoch": 0.4765342499009749, + "grad_norm": 0.7330303192138672, + "learning_rate": 8.664412523168474e-05, + "loss": 0.7534, + "step": 74590 + }, + { + "epoch": 0.47659813705071363, + "grad_norm": 1.0151233673095703, + "learning_rate": 8.664071123895138e-05, + "loss": 1.1555, + "step": 74600 + }, + { + "epoch": 0.47666202420045234, + "grad_norm": 0.7544573545455933, + "learning_rate": 8.663729687721439e-05, + "loss": 0.8015, + "step": 74610 + }, + { + "epoch": 0.47672591135019105, + "grad_norm": 0.5822036862373352, + "learning_rate": 8.66338821465081e-05, + "loss": 0.7977, + "step": 74620 + }, + { + "epoch": 0.47678979849992975, + "grad_norm": 1.627901554107666, + "learning_rate": 8.663046704686692e-05, + "loss": 0.9961, + "step": 74630 + }, + { + "epoch": 0.4768536856496684, + "grad_norm": 0.9120510220527649, + "learning_rate": 8.662705157832527e-05, + "loss": 0.9101, + "step": 74640 + }, + { + "epoch": 
0.4769175727994071, + "grad_norm": 1.2490442991256714, + "learning_rate": 8.662363574091752e-05, + "loss": 0.9137, + "step": 74650 + }, + { + "epoch": 0.4769814599491458, + "grad_norm": 0.5486982464790344, + "learning_rate": 8.662021953467806e-05, + "loss": 1.0668, + "step": 74660 + }, + { + "epoch": 0.4770453470988845, + "grad_norm": 1.0078871250152588, + "learning_rate": 8.661680295964131e-05, + "loss": 0.8362, + "step": 74670 + }, + { + "epoch": 0.4771092342486232, + "grad_norm": 1.1747907400131226, + "learning_rate": 8.661338601584168e-05, + "loss": 0.6677, + "step": 74680 + }, + { + "epoch": 0.47717312139836193, + "grad_norm": 1.0243124961853027, + "learning_rate": 8.660996870331357e-05, + "loss": 0.9675, + "step": 74690 + }, + { + "epoch": 0.47723700854810064, + "grad_norm": 0.8121140599250793, + "learning_rate": 8.66065510220914e-05, + "loss": 0.7801, + "step": 74700 + }, + { + "epoch": 0.47730089569783934, + "grad_norm": 1.2878518104553223, + "learning_rate": 8.660313297220962e-05, + "loss": 0.8109, + "step": 74710 + }, + { + "epoch": 0.47736478284757805, + "grad_norm": 0.9882553219795227, + "learning_rate": 8.65997145537026e-05, + "loss": 0.8357, + "step": 74720 + }, + { + "epoch": 0.47742866999731676, + "grad_norm": 0.9014390110969543, + "learning_rate": 8.659629576660479e-05, + "loss": 1.0185, + "step": 74730 + }, + { + "epoch": 0.47749255714705546, + "grad_norm": 0.8646599054336548, + "learning_rate": 8.659287661095063e-05, + "loss": 0.8571, + "step": 74740 + }, + { + "epoch": 0.47755644429679417, + "grad_norm": 0.6751865744590759, + "learning_rate": 8.658945708677455e-05, + "loss": 0.8639, + "step": 74750 + }, + { + "epoch": 0.4776203314465328, + "grad_norm": 0.6493138074874878, + "learning_rate": 8.658603719411098e-05, + "loss": 0.9801, + "step": 74760 + }, + { + "epoch": 0.4776842185962715, + "grad_norm": 0.7330247163772583, + "learning_rate": 8.658261693299436e-05, + "loss": 0.9075, + "step": 74770 + }, + { + "epoch": 0.47774810574601023, + "grad_norm": 0.8450262546539307, + "learning_rate": 8.657919630345914e-05, + "loss": 0.9218, + "step": 74780 + }, + { + "epoch": 0.47781199289574894, + "grad_norm": 1.399348258972168, + "learning_rate": 8.657577530553977e-05, + "loss": 0.987, + "step": 74790 + }, + { + "epoch": 0.47787588004548764, + "grad_norm": 0.6834306716918945, + "learning_rate": 8.65723539392707e-05, + "loss": 1.0965, + "step": 74800 + }, + { + "epoch": 0.47793976719522635, + "grad_norm": 0.8273354768753052, + "learning_rate": 8.656893220468638e-05, + "loss": 0.9514, + "step": 74810 + }, + { + "epoch": 0.47800365434496506, + "grad_norm": 0.5543147325515747, + "learning_rate": 8.656551010182128e-05, + "loss": 0.8666, + "step": 74820 + }, + { + "epoch": 0.47806754149470376, + "grad_norm": 1.8543487787246704, + "learning_rate": 8.656208763070986e-05, + "loss": 0.8342, + "step": 74830 + }, + { + "epoch": 0.47813142864444247, + "grad_norm": 0.9043295383453369, + "learning_rate": 8.655866479138659e-05, + "loss": 1.0627, + "step": 74840 + }, + { + "epoch": 0.4781953157941812, + "grad_norm": 3.4877755641937256, + "learning_rate": 8.655524158388595e-05, + "loss": 0.9094, + "step": 74850 + }, + { + "epoch": 0.4782592029439199, + "grad_norm": 0.6816970109939575, + "learning_rate": 8.655181800824237e-05, + "loss": 0.6711, + "step": 74860 + }, + { + "epoch": 0.4783230900936586, + "grad_norm": 1.15105402469635, + "learning_rate": 8.654839406449037e-05, + "loss": 0.8039, + "step": 74870 + }, + { + "epoch": 0.47838697724339724, + "grad_norm": 0.43395039439201355, + 
"learning_rate": 8.654496975266445e-05, + "loss": 0.8421, + "step": 74880 + }, + { + "epoch": 0.47845086439313594, + "grad_norm": 0.7869691252708435, + "learning_rate": 8.654154507279904e-05, + "loss": 0.8841, + "step": 74890 + }, + { + "epoch": 0.47851475154287465, + "grad_norm": 1.013023853302002, + "learning_rate": 8.653812002492867e-05, + "loss": 1.1382, + "step": 74900 + }, + { + "epoch": 0.47857863869261336, + "grad_norm": 0.7388662099838257, + "learning_rate": 8.653469460908783e-05, + "loss": 0.8478, + "step": 74910 + }, + { + "epoch": 0.47864252584235206, + "grad_norm": 0.8700296878814697, + "learning_rate": 8.6531268825311e-05, + "loss": 0.8433, + "step": 74920 + }, + { + "epoch": 0.47870641299209077, + "grad_norm": 0.7805728912353516, + "learning_rate": 8.652784267363268e-05, + "loss": 0.8564, + "step": 74930 + }, + { + "epoch": 0.4787703001418295, + "grad_norm": 1.0190261602401733, + "learning_rate": 8.652441615408739e-05, + "loss": 0.7729, + "step": 74940 + }, + { + "epoch": 0.4788341872915682, + "grad_norm": 0.9089486002922058, + "learning_rate": 8.652098926670961e-05, + "loss": 0.8051, + "step": 74950 + }, + { + "epoch": 0.4788980744413069, + "grad_norm": 0.8379830121994019, + "learning_rate": 8.651756201153391e-05, + "loss": 0.7344, + "step": 74960 + }, + { + "epoch": 0.4789619615910456, + "grad_norm": 0.8890141844749451, + "learning_rate": 8.651413438859475e-05, + "loss": 1.0695, + "step": 74970 + }, + { + "epoch": 0.4790258487407843, + "grad_norm": 1.0251997709274292, + "learning_rate": 8.651070639792667e-05, + "loss": 0.9472, + "step": 74980 + }, + { + "epoch": 0.479089735890523, + "grad_norm": 0.7137789130210876, + "learning_rate": 8.650727803956418e-05, + "loss": 0.9155, + "step": 74990 + }, + { + "epoch": 0.47915362304026166, + "grad_norm": 0.6541804671287537, + "learning_rate": 8.650384931354183e-05, + "loss": 0.7172, + "step": 75000 + }, + { + "epoch": 0.47921751019000036, + "grad_norm": 1.1364400386810303, + "learning_rate": 8.650042021989415e-05, + "loss": 0.7023, + "step": 75010 + }, + { + "epoch": 0.47928139733973907, + "grad_norm": 1.3749972581863403, + "learning_rate": 8.649699075865564e-05, + "loss": 0.7755, + "step": 75020 + }, + { + "epoch": 0.4793452844894778, + "grad_norm": 1.0463199615478516, + "learning_rate": 8.649356092986086e-05, + "loss": 0.7507, + "step": 75030 + }, + { + "epoch": 0.4794091716392165, + "grad_norm": 2.42689847946167, + "learning_rate": 8.649013073354434e-05, + "loss": 0.8019, + "step": 75040 + }, + { + "epoch": 0.4794730587889552, + "grad_norm": 0.8399762511253357, + "learning_rate": 8.648670016974067e-05, + "loss": 0.9304, + "step": 75050 + }, + { + "epoch": 0.4795369459386939, + "grad_norm": 0.804482638835907, + "learning_rate": 8.648326923848434e-05, + "loss": 0.963, + "step": 75060 + }, + { + "epoch": 0.4796008330884326, + "grad_norm": 1.2717317342758179, + "learning_rate": 8.647983793980993e-05, + "loss": 0.7493, + "step": 75070 + }, + { + "epoch": 0.4796647202381713, + "grad_norm": 1.063368797302246, + "learning_rate": 8.647640627375199e-05, + "loss": 0.9279, + "step": 75080 + }, + { + "epoch": 0.47972860738791, + "grad_norm": 0.8830692768096924, + "learning_rate": 8.647297424034509e-05, + "loss": 0.887, + "step": 75090 + }, + { + "epoch": 0.4797924945376487, + "grad_norm": 0.9686833620071411, + "learning_rate": 8.646954183962378e-05, + "loss": 0.8693, + "step": 75100 + }, + { + "epoch": 0.4798563816873874, + "grad_norm": 0.8640769124031067, + "learning_rate": 8.646610907162262e-05, + "loss": 0.7796, + "step": 75110 + }, + { + 
"epoch": 0.4799202688371261, + "grad_norm": 0.5140219926834106, + "learning_rate": 8.646267593637621e-05, + "loss": 1.4085, + "step": 75120 + }, + { + "epoch": 0.4799841559868648, + "grad_norm": 0.6936458945274353, + "learning_rate": 8.64592424339191e-05, + "loss": 0.9243, + "step": 75130 + }, + { + "epoch": 0.4800480431366035, + "grad_norm": 0.8202782869338989, + "learning_rate": 8.645580856428588e-05, + "loss": 1.1241, + "step": 75140 + }, + { + "epoch": 0.4801119302863422, + "grad_norm": 1.985823631286621, + "learning_rate": 8.645237432751113e-05, + "loss": 1.0022, + "step": 75150 + }, + { + "epoch": 0.4801758174360809, + "grad_norm": 1.036049723625183, + "learning_rate": 8.644893972362945e-05, + "loss": 1.0675, + "step": 75160 + }, + { + "epoch": 0.4802397045858196, + "grad_norm": 0.7775549292564392, + "learning_rate": 8.644550475267538e-05, + "loss": 0.8857, + "step": 75170 + }, + { + "epoch": 0.4803035917355583, + "grad_norm": 0.7424293160438538, + "learning_rate": 8.644206941468358e-05, + "loss": 1.2084, + "step": 75180 + }, + { + "epoch": 0.480367478885297, + "grad_norm": 2.01617169380188, + "learning_rate": 8.64386337096886e-05, + "loss": 1.284, + "step": 75190 + }, + { + "epoch": 0.4804313660350357, + "grad_norm": 0.6096950173377991, + "learning_rate": 8.643519763772506e-05, + "loss": 0.7568, + "step": 75200 + }, + { + "epoch": 0.48049525318477443, + "grad_norm": 0.817476212978363, + "learning_rate": 8.643176119882755e-05, + "loss": 0.8748, + "step": 75210 + }, + { + "epoch": 0.48055914033451314, + "grad_norm": 0.9491440057754517, + "learning_rate": 8.642832439303067e-05, + "loss": 0.8784, + "step": 75220 + }, + { + "epoch": 0.48062302748425184, + "grad_norm": 0.4414537847042084, + "learning_rate": 8.642488722036908e-05, + "loss": 1.0155, + "step": 75230 + }, + { + "epoch": 0.4806869146339905, + "grad_norm": 0.8975993394851685, + "learning_rate": 8.642144968087735e-05, + "loss": 0.931, + "step": 75240 + }, + { + "epoch": 0.4807508017837292, + "grad_norm": 1.0731254816055298, + "learning_rate": 8.641801177459012e-05, + "loss": 1.1996, + "step": 75250 + }, + { + "epoch": 0.4808146889334679, + "grad_norm": 0.9253545999526978, + "learning_rate": 8.641457350154201e-05, + "loss": 0.6433, + "step": 75260 + }, + { + "epoch": 0.4808785760832066, + "grad_norm": 0.6967938542366028, + "learning_rate": 8.641113486176764e-05, + "loss": 0.7571, + "step": 75270 + }, + { + "epoch": 0.4809424632329453, + "grad_norm": 0.694025993347168, + "learning_rate": 8.640769585530162e-05, + "loss": 1.0296, + "step": 75280 + }, + { + "epoch": 0.481006350382684, + "grad_norm": 0.6931796073913574, + "learning_rate": 8.640425648217863e-05, + "loss": 1.0492, + "step": 75290 + }, + { + "epoch": 0.48107023753242273, + "grad_norm": 0.8335185050964355, + "learning_rate": 8.640081674243326e-05, + "loss": 0.74, + "step": 75300 + }, + { + "epoch": 0.48113412468216143, + "grad_norm": 0.9621481895446777, + "learning_rate": 8.639737663610019e-05, + "loss": 0.9957, + "step": 75310 + }, + { + "epoch": 0.48119801183190014, + "grad_norm": 0.6297350525856018, + "learning_rate": 8.639393616321404e-05, + "loss": 0.6059, + "step": 75320 + }, + { + "epoch": 0.48126189898163885, + "grad_norm": 0.8291562795639038, + "learning_rate": 8.639049532380948e-05, + "loss": 0.7669, + "step": 75330 + }, + { + "epoch": 0.48132578613137755, + "grad_norm": 0.7181857824325562, + "learning_rate": 8.638705411792115e-05, + "loss": 0.9866, + "step": 75340 + }, + { + "epoch": 0.48138967328111626, + "grad_norm": 0.6401185393333435, + "learning_rate": 
8.63836125455837e-05, + "loss": 0.7838, + "step": 75350 + }, + { + "epoch": 0.48145356043085497, + "grad_norm": 0.6353443264961243, + "learning_rate": 8.638017060683179e-05, + "loss": 0.6636, + "step": 75360 + }, + { + "epoch": 0.4815174475805936, + "grad_norm": 0.9812245965003967, + "learning_rate": 8.637672830170009e-05, + "loss": 0.7057, + "step": 75370 + }, + { + "epoch": 0.4815813347303323, + "grad_norm": 0.8041467666625977, + "learning_rate": 8.637328563022327e-05, + "loss": 0.8152, + "step": 75380 + }, + { + "epoch": 0.48164522188007103, + "grad_norm": 0.738399863243103, + "learning_rate": 8.636984259243601e-05, + "loss": 0.8781, + "step": 75390 + }, + { + "epoch": 0.48170910902980973, + "grad_norm": 0.9629417061805725, + "learning_rate": 8.636639918837294e-05, + "loss": 0.9158, + "step": 75400 + }, + { + "epoch": 0.48177299617954844, + "grad_norm": 0.898951530456543, + "learning_rate": 8.636295541806881e-05, + "loss": 0.9504, + "step": 75410 + }, + { + "epoch": 0.48183688332928715, + "grad_norm": 0.8771629929542542, + "learning_rate": 8.635951128155822e-05, + "loss": 0.9677, + "step": 75420 + }, + { + "epoch": 0.48190077047902585, + "grad_norm": 0.7448533177375793, + "learning_rate": 8.635606677887591e-05, + "loss": 0.752, + "step": 75430 + }, + { + "epoch": 0.48196465762876456, + "grad_norm": 0.6516122221946716, + "learning_rate": 8.635262191005656e-05, + "loss": 0.6401, + "step": 75440 + }, + { + "epoch": 0.48202854477850327, + "grad_norm": 0.7587134838104248, + "learning_rate": 8.634917667513486e-05, + "loss": 0.8766, + "step": 75450 + }, + { + "epoch": 0.48209243192824197, + "grad_norm": 0.728209376335144, + "learning_rate": 8.63457310741455e-05, + "loss": 0.9743, + "step": 75460 + }, + { + "epoch": 0.4821563190779807, + "grad_norm": 0.7866697907447815, + "learning_rate": 8.634228510712318e-05, + "loss": 0.9598, + "step": 75470 + }, + { + "epoch": 0.4822202062277194, + "grad_norm": 0.8349552750587463, + "learning_rate": 8.633883877410261e-05, + "loss": 0.7729, + "step": 75480 + }, + { + "epoch": 0.48228409337745803, + "grad_norm": 0.7193264365196228, + "learning_rate": 8.63353920751185e-05, + "loss": 0.714, + "step": 75490 + }, + { + "epoch": 0.48234798052719674, + "grad_norm": 0.9247245192527771, + "learning_rate": 8.633194501020556e-05, + "loss": 0.9012, + "step": 75500 + }, + { + "epoch": 0.48241186767693545, + "grad_norm": 1.0399880409240723, + "learning_rate": 8.632849757939849e-05, + "loss": 0.9669, + "step": 75510 + }, + { + "epoch": 0.48247575482667415, + "grad_norm": 0.7889145016670227, + "learning_rate": 8.632504978273204e-05, + "loss": 1.1164, + "step": 75520 + }, + { + "epoch": 0.48253964197641286, + "grad_norm": 0.8151355981826782, + "learning_rate": 8.63216016202409e-05, + "loss": 0.9048, + "step": 75530 + }, + { + "epoch": 0.48260352912615156, + "grad_norm": 0.9007961750030518, + "learning_rate": 8.631815309195981e-05, + "loss": 0.7891, + "step": 75540 + }, + { + "epoch": 0.48266741627589027, + "grad_norm": 1.8607451915740967, + "learning_rate": 8.631470419792348e-05, + "loss": 1.0807, + "step": 75550 + }, + { + "epoch": 0.482731303425629, + "grad_norm": 0.6548914909362793, + "learning_rate": 8.63112549381667e-05, + "loss": 1.0285, + "step": 75560 + }, + { + "epoch": 0.4827951905753677, + "grad_norm": 0.7430241107940674, + "learning_rate": 8.630780531272414e-05, + "loss": 0.8952, + "step": 75570 + }, + { + "epoch": 0.4828590777251064, + "grad_norm": 0.6701022386550903, + "learning_rate": 8.630435532163059e-05, + "loss": 0.8305, + "step": 75580 + }, + { + "epoch": 
0.4829229648748451, + "grad_norm": 0.8253774046897888, + "learning_rate": 8.630090496492076e-05, + "loss": 1.2012, + "step": 75590 + }, + { + "epoch": 0.4829868520245838, + "grad_norm": 0.7972230911254883, + "learning_rate": 8.629745424262942e-05, + "loss": 1.138, + "step": 75600 + }, + { + "epoch": 0.48305073917432245, + "grad_norm": 1.0207947492599487, + "learning_rate": 8.62940031547913e-05, + "loss": 0.9028, + "step": 75610 + }, + { + "epoch": 0.48311462632406116, + "grad_norm": 0.6902018785476685, + "learning_rate": 8.62905517014412e-05, + "loss": 0.8966, + "step": 75620 + }, + { + "epoch": 0.48317851347379986, + "grad_norm": 1.1125010251998901, + "learning_rate": 8.628709988261381e-05, + "loss": 1.008, + "step": 75630 + }, + { + "epoch": 0.48324240062353857, + "grad_norm": 0.6313163638114929, + "learning_rate": 8.628364769834395e-05, + "loss": 0.8845, + "step": 75640 + }, + { + "epoch": 0.4833062877732773, + "grad_norm": 0.6679086685180664, + "learning_rate": 8.628019514866637e-05, + "loss": 0.9086, + "step": 75650 + }, + { + "epoch": 0.483370174923016, + "grad_norm": 0.7422047853469849, + "learning_rate": 8.627674223361584e-05, + "loss": 0.8719, + "step": 75660 + }, + { + "epoch": 0.4834340620727547, + "grad_norm": 0.7488150596618652, + "learning_rate": 8.627328895322713e-05, + "loss": 0.8072, + "step": 75670 + }, + { + "epoch": 0.4834979492224934, + "grad_norm": 0.5652221441268921, + "learning_rate": 8.627018068854189e-05, + "loss": 1.0794, + "step": 75680 + }, + { + "epoch": 0.4835618363722321, + "grad_norm": 0.8535979986190796, + "learning_rate": 8.626672671410644e-05, + "loss": 0.8991, + "step": 75690 + }, + { + "epoch": 0.4836257235219708, + "grad_norm": 0.8179265260696411, + "learning_rate": 8.62632723744337e-05, + "loss": 1.2132, + "step": 75700 + }, + { + "epoch": 0.4836896106717095, + "grad_norm": 0.7996183037757874, + "learning_rate": 8.625981766955842e-05, + "loss": 0.8212, + "step": 75710 + }, + { + "epoch": 0.4837534978214482, + "grad_norm": 0.671373724937439, + "learning_rate": 8.625636259951542e-05, + "loss": 0.9386, + "step": 75720 + }, + { + "epoch": 0.48381738497118687, + "grad_norm": 1.1768290996551514, + "learning_rate": 8.625290716433947e-05, + "loss": 0.8154, + "step": 75730 + }, + { + "epoch": 0.4838812721209256, + "grad_norm": 0.9020494818687439, + "learning_rate": 8.62494513640654e-05, + "loss": 1.119, + "step": 75740 + }, + { + "epoch": 0.4839451592706643, + "grad_norm": 1.0053081512451172, + "learning_rate": 8.624599519872798e-05, + "loss": 1.0964, + "step": 75750 + }, + { + "epoch": 0.484009046420403, + "grad_norm": 1.5894237756729126, + "learning_rate": 8.624253866836202e-05, + "loss": 0.9744, + "step": 75760 + }, + { + "epoch": 0.4840729335701417, + "grad_norm": 0.5304593443870544, + "learning_rate": 8.623908177300236e-05, + "loss": 0.7911, + "step": 75770 + }, + { + "epoch": 0.4841368207198804, + "grad_norm": 0.8319995403289795, + "learning_rate": 8.623562451268378e-05, + "loss": 1.0109, + "step": 75780 + }, + { + "epoch": 0.4842007078696191, + "grad_norm": 1.3417378664016724, + "learning_rate": 8.623216688744113e-05, + "loss": 0.8693, + "step": 75790 + }, + { + "epoch": 0.4842645950193578, + "grad_norm": 0.7891839742660522, + "learning_rate": 8.622870889730921e-05, + "loss": 0.9214, + "step": 75800 + }, + { + "epoch": 0.4843284821690965, + "grad_norm": 0.7130112648010254, + "learning_rate": 8.622525054232285e-05, + "loss": 0.8262, + "step": 75810 + }, + { + "epoch": 0.4843923693188352, + "grad_norm": 0.9320762157440186, + "learning_rate": 
8.622179182251686e-05, + "loss": 0.8674, + "step": 75820 + }, + { + "epoch": 0.48445625646857393, + "grad_norm": 0.6487066745758057, + "learning_rate": 8.62183327379261e-05, + "loss": 1.126, + "step": 75830 + }, + { + "epoch": 0.48452014361831264, + "grad_norm": 0.6271628737449646, + "learning_rate": 8.62148732885854e-05, + "loss": 0.7827, + "step": 75840 + }, + { + "epoch": 0.4845840307680513, + "grad_norm": 0.7439334988594055, + "learning_rate": 8.621141347452959e-05, + "loss": 1.2293, + "step": 75850 + }, + { + "epoch": 0.48464791791779, + "grad_norm": 0.8553930521011353, + "learning_rate": 8.620795329579354e-05, + "loss": 0.8525, + "step": 75860 + }, + { + "epoch": 0.4847118050675287, + "grad_norm": 0.9168295860290527, + "learning_rate": 8.620449275241205e-05, + "loss": 0.7103, + "step": 75870 + }, + { + "epoch": 0.4847756922172674, + "grad_norm": 0.61861652135849, + "learning_rate": 8.620103184442001e-05, + "loss": 0.796, + "step": 75880 + }, + { + "epoch": 0.4848395793670061, + "grad_norm": 1.4174355268478394, + "learning_rate": 8.619757057185226e-05, + "loss": 0.8479, + "step": 75890 + }, + { + "epoch": 0.4849034665167448, + "grad_norm": 0.9580785036087036, + "learning_rate": 8.619410893474365e-05, + "loss": 0.7067, + "step": 75900 + }, + { + "epoch": 0.4849673536664835, + "grad_norm": 0.7961419820785522, + "learning_rate": 8.619064693312906e-05, + "loss": 1.1983, + "step": 75910 + }, + { + "epoch": 0.48503124081622223, + "grad_norm": 1.8671194314956665, + "learning_rate": 8.618718456704335e-05, + "loss": 1.2858, + "step": 75920 + }, + { + "epoch": 0.48509512796596094, + "grad_norm": 1.4799001216888428, + "learning_rate": 8.618372183652137e-05, + "loss": 0.9962, + "step": 75930 + }, + { + "epoch": 0.48515901511569964, + "grad_norm": 0.6392105221748352, + "learning_rate": 8.6180258741598e-05, + "loss": 0.8089, + "step": 75940 + }, + { + "epoch": 0.48522290226543835, + "grad_norm": 0.8513908982276917, + "learning_rate": 8.617679528230816e-05, + "loss": 0.9247, + "step": 75950 + }, + { + "epoch": 0.48528678941517706, + "grad_norm": 0.6598104238510132, + "learning_rate": 8.617333145868667e-05, + "loss": 0.9169, + "step": 75960 + }, + { + "epoch": 0.4853506765649157, + "grad_norm": 1.3016315698623657, + "learning_rate": 8.616986727076843e-05, + "loss": 0.8606, + "step": 75970 + }, + { + "epoch": 0.4854145637146544, + "grad_norm": 0.950963020324707, + "learning_rate": 8.616640271858835e-05, + "loss": 0.8453, + "step": 75980 + }, + { + "epoch": 0.4854784508643931, + "grad_norm": 0.9443991780281067, + "learning_rate": 8.616293780218131e-05, + "loss": 0.9117, + "step": 75990 + }, + { + "epoch": 0.4855423380141318, + "grad_norm": 0.8694010972976685, + "learning_rate": 8.615947252158219e-05, + "loss": 0.9585, + "step": 76000 + }, + { + "epoch": 0.48560622516387053, + "grad_norm": 1.7652310132980347, + "learning_rate": 8.615600687682591e-05, + "loss": 0.9593, + "step": 76010 + }, + { + "epoch": 0.48567011231360924, + "grad_norm": 0.4394935369491577, + "learning_rate": 8.615254086794735e-05, + "loss": 0.856, + "step": 76020 + }, + { + "epoch": 0.48573399946334794, + "grad_norm": 1.1516753435134888, + "learning_rate": 8.614907449498144e-05, + "loss": 1.2644, + "step": 76030 + }, + { + "epoch": 0.48579788661308665, + "grad_norm": 2.2719500064849854, + "learning_rate": 8.614560775796307e-05, + "loss": 0.7425, + "step": 76040 + }, + { + "epoch": 0.48586177376282536, + "grad_norm": 0.5767148733139038, + "learning_rate": 8.614214065692715e-05, + "loss": 0.8913, + "step": 76050 + }, + { + "epoch": 
0.48592566091256406, + "grad_norm": 0.7121883034706116, + "learning_rate": 8.613867319190861e-05, + "loss": 1.0213, + "step": 76060 + }, + { + "epoch": 0.48598954806230277, + "grad_norm": 0.948017418384552, + "learning_rate": 8.613520536294238e-05, + "loss": 0.8787, + "step": 76070 + }, + { + "epoch": 0.4860534352120415, + "grad_norm": 0.6437438130378723, + "learning_rate": 8.613173717006335e-05, + "loss": 0.9009, + "step": 76080 + }, + { + "epoch": 0.4861173223617801, + "grad_norm": 0.7081766724586487, + "learning_rate": 8.612826861330648e-05, + "loss": 0.9181, + "step": 76090 + }, + { + "epoch": 0.48618120951151883, + "grad_norm": 0.7698941826820374, + "learning_rate": 8.61247996927067e-05, + "loss": 0.8093, + "step": 76100 + }, + { + "epoch": 0.48624509666125754, + "grad_norm": 0.8024051785469055, + "learning_rate": 8.612133040829892e-05, + "loss": 0.98, + "step": 76110 + }, + { + "epoch": 0.48630898381099624, + "grad_norm": 2.622551679611206, + "learning_rate": 8.611786076011809e-05, + "loss": 0.9282, + "step": 76120 + }, + { + "epoch": 0.48637287096073495, + "grad_norm": 1.0354362726211548, + "learning_rate": 8.611439074819917e-05, + "loss": 1.0491, + "step": 76130 + }, + { + "epoch": 0.48643675811047365, + "grad_norm": 0.6621295213699341, + "learning_rate": 8.611092037257709e-05, + "loss": 0.776, + "step": 76140 + }, + { + "epoch": 0.48650064526021236, + "grad_norm": 0.6664482355117798, + "learning_rate": 8.610744963328679e-05, + "loss": 1.0026, + "step": 76150 + }, + { + "epoch": 0.48656453240995107, + "grad_norm": 0.9559485912322998, + "learning_rate": 8.610397853036325e-05, + "loss": 1.0425, + "step": 76160 + }, + { + "epoch": 0.4866284195596898, + "grad_norm": 1.0177385807037354, + "learning_rate": 8.61005070638414e-05, + "loss": 0.9678, + "step": 76170 + }, + { + "epoch": 0.4866923067094285, + "grad_norm": 0.630623459815979, + "learning_rate": 8.60970352337562e-05, + "loss": 0.99, + "step": 76180 + }, + { + "epoch": 0.4867561938591672, + "grad_norm": 0.9502881169319153, + "learning_rate": 8.609356304014264e-05, + "loss": 0.7239, + "step": 76190 + }, + { + "epoch": 0.4868200810089059, + "grad_norm": 2.090254306793213, + "learning_rate": 8.60900904830357e-05, + "loss": 1.2069, + "step": 76200 + }, + { + "epoch": 0.4868839681586446, + "grad_norm": 0.5957566499710083, + "learning_rate": 8.60866175624703e-05, + "loss": 0.8932, + "step": 76210 + }, + { + "epoch": 0.48694785530838325, + "grad_norm": 1.5734295845031738, + "learning_rate": 8.608314427848144e-05, + "loss": 0.7924, + "step": 76220 + }, + { + "epoch": 0.48701174245812195, + "grad_norm": 0.6711301207542419, + "learning_rate": 8.60796706311041e-05, + "loss": 0.952, + "step": 76230 + }, + { + "epoch": 0.48707562960786066, + "grad_norm": 0.6539300084114075, + "learning_rate": 8.607619662037327e-05, + "loss": 0.949, + "step": 76240 + }, + { + "epoch": 0.48713951675759937, + "grad_norm": 1.1970055103302002, + "learning_rate": 8.607272224632393e-05, + "loss": 1.0121, + "step": 76250 + }, + { + "epoch": 0.4872034039073381, + "grad_norm": 0.9336310625076294, + "learning_rate": 8.606924750899106e-05, + "loss": 0.8952, + "step": 76260 + }, + { + "epoch": 0.4872672910570768, + "grad_norm": 0.9023282527923584, + "learning_rate": 8.606577240840968e-05, + "loss": 0.9134, + "step": 76270 + }, + { + "epoch": 0.4873311782068155, + "grad_norm": 0.4293481111526489, + "learning_rate": 8.606229694461476e-05, + "loss": 0.7425, + "step": 76280 + }, + { + "epoch": 0.4873950653565542, + "grad_norm": 0.736682116985321, + "learning_rate": 
8.605882111764132e-05, + "loss": 0.8171, + "step": 76290 + }, + { + "epoch": 0.4874589525062929, + "grad_norm": 1.6317270994186401, + "learning_rate": 8.605534492752434e-05, + "loss": 0.824, + "step": 76300 + }, + { + "epoch": 0.4875228396560316, + "grad_norm": 3.0119450092315674, + "learning_rate": 8.605186837429887e-05, + "loss": 0.859, + "step": 76310 + }, + { + "epoch": 0.4875867268057703, + "grad_norm": 1.0656332969665527, + "learning_rate": 8.604839145799987e-05, + "loss": 0.7387, + "step": 76320 + }, + { + "epoch": 0.487650613955509, + "grad_norm": 0.7559338808059692, + "learning_rate": 8.604491417866238e-05, + "loss": 0.9439, + "step": 76330 + }, + { + "epoch": 0.48771450110524767, + "grad_norm": 0.8888264894485474, + "learning_rate": 8.604143653632144e-05, + "loss": 1.0296, + "step": 76340 + }, + { + "epoch": 0.4877783882549864, + "grad_norm": 0.9546695947647095, + "learning_rate": 8.603795853101204e-05, + "loss": 1.0504, + "step": 76350 + }, + { + "epoch": 0.4878422754047251, + "grad_norm": 2.9092493057250977, + "learning_rate": 8.603448016276924e-05, + "loss": 1.1027, + "step": 76360 + }, + { + "epoch": 0.4879061625544638, + "grad_norm": 0.840092122554779, + "learning_rate": 8.603100143162803e-05, + "loss": 0.7812, + "step": 76370 + }, + { + "epoch": 0.4879700497042025, + "grad_norm": 0.8399893641471863, + "learning_rate": 8.602752233762348e-05, + "loss": 0.9633, + "step": 76380 + }, + { + "epoch": 0.4880339368539412, + "grad_norm": 0.9037623405456543, + "learning_rate": 8.60240428807906e-05, + "loss": 0.9545, + "step": 76390 + }, + { + "epoch": 0.4880978240036799, + "grad_norm": 0.843728244304657, + "learning_rate": 8.602056306116445e-05, + "loss": 0.7823, + "step": 76400 + }, + { + "epoch": 0.4881617111534186, + "grad_norm": 0.9266428351402283, + "learning_rate": 8.601708287878006e-05, + "loss": 0.7908, + "step": 76410 + }, + { + "epoch": 0.4882255983031573, + "grad_norm": 0.7917917966842651, + "learning_rate": 8.60136023336725e-05, + "loss": 1.0549, + "step": 76420 + }, + { + "epoch": 0.488289485452896, + "grad_norm": 0.7976272702217102, + "learning_rate": 8.601012142587678e-05, + "loss": 0.944, + "step": 76430 + }, + { + "epoch": 0.4883533726026347, + "grad_norm": 1.0543662309646606, + "learning_rate": 8.6006640155428e-05, + "loss": 0.8536, + "step": 76440 + }, + { + "epoch": 0.48841725975237343, + "grad_norm": 0.7209562063217163, + "learning_rate": 8.600315852236121e-05, + "loss": 0.7236, + "step": 76450 + }, + { + "epoch": 0.4884811469021121, + "grad_norm": 0.7003374695777893, + "learning_rate": 8.599967652671147e-05, + "loss": 1.1172, + "step": 76460 + }, + { + "epoch": 0.4885450340518508, + "grad_norm": 1.0924787521362305, + "learning_rate": 8.599619416851384e-05, + "loss": 0.7156, + "step": 76470 + }, + { + "epoch": 0.4886089212015895, + "grad_norm": 0.6103460192680359, + "learning_rate": 8.599271144780339e-05, + "loss": 0.9213, + "step": 76480 + }, + { + "epoch": 0.4886728083513282, + "grad_norm": 0.675788938999176, + "learning_rate": 8.59892283646152e-05, + "loss": 0.7218, + "step": 76490 + }, + { + "epoch": 0.4887366955010669, + "grad_norm": 0.5468382835388184, + "learning_rate": 8.598574491898435e-05, + "loss": 0.7851, + "step": 76500 + }, + { + "epoch": 0.4888005826508056, + "grad_norm": 0.9708940982818604, + "learning_rate": 8.59822611109459e-05, + "loss": 0.8894, + "step": 76510 + }, + { + "epoch": 0.4888644698005443, + "grad_norm": 2.2232227325439453, + "learning_rate": 8.597877694053496e-05, + "loss": 1.0381, + "step": 76520 + }, + { + "epoch": 
0.488928356950283, + "grad_norm": 1.9338047504425049, + "learning_rate": 8.597529240778661e-05, + "loss": 0.8914, + "step": 76530 + }, + { + "epoch": 0.48899224410002173, + "grad_norm": 0.842464029788971, + "learning_rate": 8.597180751273595e-05, + "loss": 0.8219, + "step": 76540 + }, + { + "epoch": 0.48905613124976044, + "grad_norm": 0.6262010931968689, + "learning_rate": 8.596832225541806e-05, + "loss": 0.6786, + "step": 76550 + }, + { + "epoch": 0.48912001839949915, + "grad_norm": 0.7342615723609924, + "learning_rate": 8.596483663586804e-05, + "loss": 0.6999, + "step": 76560 + }, + { + "epoch": 0.48918390554923785, + "grad_norm": 0.6208049654960632, + "learning_rate": 8.596135065412101e-05, + "loss": 1.0309, + "step": 76570 + }, + { + "epoch": 0.4892477926989765, + "grad_norm": 0.8948808312416077, + "learning_rate": 8.595786431021207e-05, + "loss": 0.7239, + "step": 76580 + }, + { + "epoch": 0.4893116798487152, + "grad_norm": 0.7227377891540527, + "learning_rate": 8.595437760417633e-05, + "loss": 0.9171, + "step": 76590 + }, + { + "epoch": 0.4893755669984539, + "grad_norm": 0.8162720203399658, + "learning_rate": 8.59508905360489e-05, + "loss": 0.9206, + "step": 76600 + }, + { + "epoch": 0.4894394541481926, + "grad_norm": 1.031140923500061, + "learning_rate": 8.59474031058649e-05, + "loss": 0.8738, + "step": 76610 + }, + { + "epoch": 0.4895033412979313, + "grad_norm": 0.6611879467964172, + "learning_rate": 8.594391531365943e-05, + "loss": 0.8089, + "step": 76620 + }, + { + "epoch": 0.48956722844767003, + "grad_norm": 0.8293446898460388, + "learning_rate": 8.594042715946768e-05, + "loss": 1.0846, + "step": 76630 + }, + { + "epoch": 0.48963111559740874, + "grad_norm": 0.7987895607948303, + "learning_rate": 8.59369386433247e-05, + "loss": 1.0023, + "step": 76640 + }, + { + "epoch": 0.48969500274714745, + "grad_norm": 0.8030225038528442, + "learning_rate": 8.593344976526569e-05, + "loss": 0.6244, + "step": 76650 + }, + { + "epoch": 0.48975888989688615, + "grad_norm": 1.0051014423370361, + "learning_rate": 8.592996052532572e-05, + "loss": 0.8662, + "step": 76660 + }, + { + "epoch": 0.48982277704662486, + "grad_norm": 1.1758030652999878, + "learning_rate": 8.592647092353998e-05, + "loss": 0.8531, + "step": 76670 + }, + { + "epoch": 0.48988666419636356, + "grad_norm": 0.8429425358772278, + "learning_rate": 8.59229809599436e-05, + "loss": 0.9254, + "step": 76680 + }, + { + "epoch": 0.48995055134610227, + "grad_norm": 0.9333186149597168, + "learning_rate": 8.591949063457172e-05, + "loss": 0.9736, + "step": 76690 + }, + { + "epoch": 0.4900144384958409, + "grad_norm": 0.98914635181427, + "learning_rate": 8.59159999474595e-05, + "loss": 0.8071, + "step": 76700 + }, + { + "epoch": 0.4900783256455796, + "grad_norm": 0.6618992686271667, + "learning_rate": 8.591250889864209e-05, + "loss": 1.0296, + "step": 76710 + }, + { + "epoch": 0.49014221279531833, + "grad_norm": 0.5613696575164795, + "learning_rate": 8.590901748815464e-05, + "loss": 0.8095, + "step": 76720 + }, + { + "epoch": 0.49020609994505704, + "grad_norm": 0.6220462322235107, + "learning_rate": 8.590552571603232e-05, + "loss": 0.7297, + "step": 76730 + }, + { + "epoch": 0.49026998709479575, + "grad_norm": 0.5085312128067017, + "learning_rate": 8.590203358231028e-05, + "loss": 0.7892, + "step": 76740 + }, + { + "epoch": 0.49033387424453445, + "grad_norm": 0.7087169885635376, + "learning_rate": 8.589854108702371e-05, + "loss": 0.8027, + "step": 76750 + }, + { + "epoch": 0.49039776139427316, + "grad_norm": 0.7277820110321045, + "learning_rate": 
8.589504823020778e-05, + "loss": 0.8146, + "step": 76760 + }, + { + "epoch": 0.49046164854401186, + "grad_norm": 0.8798472881317139, + "learning_rate": 8.589155501189767e-05, + "loss": 0.8185, + "step": 76770 + }, + { + "epoch": 0.49052553569375057, + "grad_norm": 0.8742108345031738, + "learning_rate": 8.588806143212852e-05, + "loss": 0.9735, + "step": 76780 + }, + { + "epoch": 0.4905894228434893, + "grad_norm": 1.9560281038284302, + "learning_rate": 8.588456749093558e-05, + "loss": 0.9918, + "step": 76790 + }, + { + "epoch": 0.490653309993228, + "grad_norm": 0.953271746635437, + "learning_rate": 8.588107318835398e-05, + "loss": 1.0473, + "step": 76800 + }, + { + "epoch": 0.4907171971429667, + "grad_norm": 0.8690406084060669, + "learning_rate": 8.587757852441893e-05, + "loss": 0.9629, + "step": 76810 + }, + { + "epoch": 0.49078108429270534, + "grad_norm": 1.7574247121810913, + "learning_rate": 8.587408349916564e-05, + "loss": 0.662, + "step": 76820 + }, + { + "epoch": 0.49084497144244404, + "grad_norm": 0.9854816198348999, + "learning_rate": 8.587058811262929e-05, + "loss": 0.86, + "step": 76830 + }, + { + "epoch": 0.49090885859218275, + "grad_norm": 1.1772929430007935, + "learning_rate": 8.586709236484507e-05, + "loss": 0.7821, + "step": 76840 + }, + { + "epoch": 0.49097274574192146, + "grad_norm": 1.0073944330215454, + "learning_rate": 8.586359625584822e-05, + "loss": 0.8854, + "step": 76850 + }, + { + "epoch": 0.49103663289166016, + "grad_norm": 0.7128773927688599, + "learning_rate": 8.586009978567391e-05, + "loss": 0.7433, + "step": 76860 + }, + { + "epoch": 0.49110052004139887, + "grad_norm": 0.663662314414978, + "learning_rate": 8.58566029543574e-05, + "loss": 0.8485, + "step": 76870 + }, + { + "epoch": 0.4911644071911376, + "grad_norm": 0.6211137771606445, + "learning_rate": 8.585310576193384e-05, + "loss": 0.8421, + "step": 76880 + }, + { + "epoch": 0.4912282943408763, + "grad_norm": 1.202216625213623, + "learning_rate": 8.584960820843851e-05, + "loss": 0.7988, + "step": 76890 + }, + { + "epoch": 0.491292181490615, + "grad_norm": 0.7233720421791077, + "learning_rate": 8.584611029390661e-05, + "loss": 0.903, + "step": 76900 + }, + { + "epoch": 0.4913560686403537, + "grad_norm": 0.5349984765052795, + "learning_rate": 8.584261201837337e-05, + "loss": 1.0756, + "step": 76910 + }, + { + "epoch": 0.4914199557900924, + "grad_norm": 0.9443363547325134, + "learning_rate": 8.583911338187401e-05, + "loss": 0.8465, + "step": 76920 + }, + { + "epoch": 0.4914838429398311, + "grad_norm": 0.7299784421920776, + "learning_rate": 8.583561438444379e-05, + "loss": 1.1754, + "step": 76930 + }, + { + "epoch": 0.49154773008956976, + "grad_norm": 1.1991528272628784, + "learning_rate": 8.583211502611792e-05, + "loss": 0.9597, + "step": 76940 + }, + { + "epoch": 0.49161161723930846, + "grad_norm": 1.1621544361114502, + "learning_rate": 8.582861530693165e-05, + "loss": 0.7185, + "step": 76950 + }, + { + "epoch": 0.49167550438904717, + "grad_norm": 1.565203070640564, + "learning_rate": 8.582511522692022e-05, + "loss": 0.8875, + "step": 76960 + }, + { + "epoch": 0.4917393915387859, + "grad_norm": 0.9279037714004517, + "learning_rate": 8.58216147861189e-05, + "loss": 1.1073, + "step": 76970 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 1.2294920682907104, + "learning_rate": 8.581811398456292e-05, + "loss": 0.9467, + "step": 76980 + }, + { + "epoch": 0.4918671658382633, + "grad_norm": 0.6225689053535461, + "learning_rate": 8.581461282228756e-05, + "loss": 0.9234, + "step": 76990 + }, + { + "epoch": 
0.491931052988002, + "grad_norm": 1.2206075191497803, + "learning_rate": 8.581111129932805e-05, + "loss": 1.0835, + "step": 77000 + }, + { + "epoch": 0.4919949401377407, + "grad_norm": 0.7417840957641602, + "learning_rate": 8.580760941571967e-05, + "loss": 0.9111, + "step": 77010 + }, + { + "epoch": 0.4920588272874794, + "grad_norm": 0.6536421775817871, + "learning_rate": 8.580410717149769e-05, + "loss": 0.9932, + "step": 77020 + }, + { + "epoch": 0.4921227144372181, + "grad_norm": 0.8102644681930542, + "learning_rate": 8.580060456669738e-05, + "loss": 1.0341, + "step": 77030 + }, + { + "epoch": 0.4921866015869568, + "grad_norm": 0.9170993566513062, + "learning_rate": 8.579710160135399e-05, + "loss": 1.0178, + "step": 77040 + }, + { + "epoch": 0.4922504887366955, + "grad_norm": 0.4981268644332886, + "learning_rate": 8.579359827550284e-05, + "loss": 0.7805, + "step": 77050 + }, + { + "epoch": 0.49231437588643423, + "grad_norm": 1.8786097764968872, + "learning_rate": 8.579009458917917e-05, + "loss": 0.9201, + "step": 77060 + }, + { + "epoch": 0.4923782630361729, + "grad_norm": 1.0269956588745117, + "learning_rate": 8.57865905424183e-05, + "loss": 0.9258, + "step": 77070 + }, + { + "epoch": 0.4924421501859116, + "grad_norm": 0.959625244140625, + "learning_rate": 8.578308613525549e-05, + "loss": 0.7213, + "step": 77080 + }, + { + "epoch": 0.4925060373356503, + "grad_norm": 0.7318682670593262, + "learning_rate": 8.577958136772608e-05, + "loss": 0.9329, + "step": 77090 + }, + { + "epoch": 0.492569924485389, + "grad_norm": 0.9213690757751465, + "learning_rate": 8.57760762398653e-05, + "loss": 0.8856, + "step": 77100 + }, + { + "epoch": 0.4926338116351277, + "grad_norm": 0.7937483191490173, + "learning_rate": 8.577257075170849e-05, + "loss": 1.2098, + "step": 77110 + }, + { + "epoch": 0.4926976987848664, + "grad_norm": 1.0895425081253052, + "learning_rate": 8.576906490329094e-05, + "loss": 0.8427, + "step": 77120 + }, + { + "epoch": 0.4927615859346051, + "grad_norm": 0.9130338430404663, + "learning_rate": 8.576555869464798e-05, + "loss": 1.2261, + "step": 77130 + }, + { + "epoch": 0.4928254730843438, + "grad_norm": 0.7346659302711487, + "learning_rate": 8.576205212581488e-05, + "loss": 0.6587, + "step": 77140 + }, + { + "epoch": 0.49288936023408253, + "grad_norm": 3.969825267791748, + "learning_rate": 8.575854519682698e-05, + "loss": 1.0008, + "step": 77150 + }, + { + "epoch": 0.49295324738382124, + "grad_norm": 1.1453746557235718, + "learning_rate": 8.575503790771959e-05, + "loss": 1.0563, + "step": 77160 + }, + { + "epoch": 0.49301713453355994, + "grad_norm": 1.0311975479125977, + "learning_rate": 8.575153025852804e-05, + "loss": 0.944, + "step": 77170 + }, + { + "epoch": 0.49308102168329865, + "grad_norm": 0.713505744934082, + "learning_rate": 8.574802224928766e-05, + "loss": 0.8591, + "step": 77180 + }, + { + "epoch": 0.4931449088330373, + "grad_norm": 0.9348772764205933, + "learning_rate": 8.574451388003378e-05, + "loss": 0.7919, + "step": 77190 + }, + { + "epoch": 0.493208795982776, + "grad_norm": 1.0843831300735474, + "learning_rate": 8.57410051508017e-05, + "loss": 0.8934, + "step": 77200 + }, + { + "epoch": 0.4932726831325147, + "grad_norm": 0.8350475430488586, + "learning_rate": 8.573749606162678e-05, + "loss": 0.7806, + "step": 77210 + }, + { + "epoch": 0.4933365702822534, + "grad_norm": 1.9250234365463257, + "learning_rate": 8.573398661254438e-05, + "loss": 0.7523, + "step": 77220 + }, + { + "epoch": 0.4934004574319921, + "grad_norm": 0.9152832627296448, + "learning_rate": 
8.573047680358978e-05, + "loss": 0.8821, + "step": 77230 + }, + { + "epoch": 0.49346434458173083, + "grad_norm": 0.9231581687927246, + "learning_rate": 8.57269666347984e-05, + "loss": 0.7803, + "step": 77240 + }, + { + "epoch": 0.49352823173146954, + "grad_norm": 0.792326807975769, + "learning_rate": 8.572345610620553e-05, + "loss": 0.9124, + "step": 77250 + }, + { + "epoch": 0.49359211888120824, + "grad_norm": 0.986379861831665, + "learning_rate": 8.571994521784659e-05, + "loss": 0.9055, + "step": 77260 + }, + { + "epoch": 0.49365600603094695, + "grad_norm": 1.0129300355911255, + "learning_rate": 8.571643396975688e-05, + "loss": 0.8168, + "step": 77270 + }, + { + "epoch": 0.49371989318068565, + "grad_norm": 0.5476410984992981, + "learning_rate": 8.571292236197178e-05, + "loss": 0.6777, + "step": 77280 + }, + { + "epoch": 0.49378378033042436, + "grad_norm": 1.3759448528289795, + "learning_rate": 8.570941039452665e-05, + "loss": 0.9291, + "step": 77290 + }, + { + "epoch": 0.49384766748016307, + "grad_norm": 1.3601030111312866, + "learning_rate": 8.570589806745687e-05, + "loss": 0.9618, + "step": 77300 + }, + { + "epoch": 0.4939115546299017, + "grad_norm": 1.174814224243164, + "learning_rate": 8.57023853807978e-05, + "loss": 0.8576, + "step": 77310 + }, + { + "epoch": 0.4939754417796404, + "grad_norm": 1.2242119312286377, + "learning_rate": 8.569887233458482e-05, + "loss": 1.1461, + "step": 77320 + }, + { + "epoch": 0.49403932892937913, + "grad_norm": 1.0794601440429688, + "learning_rate": 8.569535892885333e-05, + "loss": 0.8275, + "step": 77330 + }, + { + "epoch": 0.49410321607911784, + "grad_norm": 0.802666962146759, + "learning_rate": 8.569184516363869e-05, + "loss": 1.0711, + "step": 77340 + }, + { + "epoch": 0.49416710322885654, + "grad_norm": 0.9685359001159668, + "learning_rate": 8.568833103897628e-05, + "loss": 0.8529, + "step": 77350 + }, + { + "epoch": 0.49423099037859525, + "grad_norm": 0.9577045440673828, + "learning_rate": 8.568481655490151e-05, + "loss": 0.801, + "step": 77360 + }, + { + "epoch": 0.49429487752833395, + "grad_norm": 0.6817383170127869, + "learning_rate": 8.568130171144975e-05, + "loss": 0.9074, + "step": 77370 + }, + { + "epoch": 0.49435876467807266, + "grad_norm": 1.0045511722564697, + "learning_rate": 8.567778650865643e-05, + "loss": 0.8414, + "step": 77380 + }, + { + "epoch": 0.49442265182781137, + "grad_norm": 1.0986788272857666, + "learning_rate": 8.567427094655693e-05, + "loss": 0.8889, + "step": 77390 + }, + { + "epoch": 0.49448653897755007, + "grad_norm": 0.8840765357017517, + "learning_rate": 8.567075502518667e-05, + "loss": 0.9405, + "step": 77400 + }, + { + "epoch": 0.4945504261272888, + "grad_norm": 0.7125093936920166, + "learning_rate": 8.566723874458102e-05, + "loss": 1.0639, + "step": 77410 + }, + { + "epoch": 0.4946143132770275, + "grad_norm": 0.9904595017433167, + "learning_rate": 8.566372210477544e-05, + "loss": 0.9249, + "step": 77420 + }, + { + "epoch": 0.49467820042676613, + "grad_norm": 0.9218760132789612, + "learning_rate": 8.566020510580532e-05, + "loss": 0.8189, + "step": 77430 + }, + { + "epoch": 0.49474208757650484, + "grad_norm": 0.7512104511260986, + "learning_rate": 8.56566877477061e-05, + "loss": 1.1084, + "step": 77440 + }, + { + "epoch": 0.49480597472624355, + "grad_norm": 1.2798714637756348, + "learning_rate": 8.565317003051316e-05, + "loss": 1.0737, + "step": 77450 + }, + { + "epoch": 0.49486986187598225, + "grad_norm": 0.9428762793540955, + "learning_rate": 8.564965195426197e-05, + "loss": 0.8921, + "step": 77460 + }, + { + 
"epoch": 0.49493374902572096, + "grad_norm": 0.6847555637359619, + "learning_rate": 8.564613351898794e-05, + "loss": 0.6569, + "step": 77470 + }, + { + "epoch": 0.49499763617545967, + "grad_norm": 0.9463028311729431, + "learning_rate": 8.56426147247265e-05, + "loss": 1.0144, + "step": 77480 + }, + { + "epoch": 0.49506152332519837, + "grad_norm": 0.9155146479606628, + "learning_rate": 8.56390955715131e-05, + "loss": 1.0397, + "step": 77490 + }, + { + "epoch": 0.4951254104749371, + "grad_norm": 1.0941190719604492, + "learning_rate": 8.563557605938317e-05, + "loss": 0.9424, + "step": 77500 + }, + { + "epoch": 0.4951892976246758, + "grad_norm": 0.6458025574684143, + "learning_rate": 8.563205618837217e-05, + "loss": 0.9965, + "step": 77510 + }, + { + "epoch": 0.4952531847744145, + "grad_norm": 0.49309995770454407, + "learning_rate": 8.562853595851554e-05, + "loss": 0.9261, + "step": 77520 + }, + { + "epoch": 0.4953170719241532, + "grad_norm": 0.741894543170929, + "learning_rate": 8.562501536984873e-05, + "loss": 0.9298, + "step": 77530 + }, + { + "epoch": 0.4953809590738919, + "grad_norm": 0.9847732782363892, + "learning_rate": 8.562149442240718e-05, + "loss": 0.911, + "step": 77540 + }, + { + "epoch": 0.49544484622363055, + "grad_norm": 0.9211950302124023, + "learning_rate": 8.561797311622637e-05, + "loss": 0.8011, + "step": 77550 + }, + { + "epoch": 0.49550873337336926, + "grad_norm": 1.5273650884628296, + "learning_rate": 8.561445145134177e-05, + "loss": 0.9902, + "step": 77560 + }, + { + "epoch": 0.49557262052310797, + "grad_norm": 0.8541943430900574, + "learning_rate": 8.561092942778882e-05, + "loss": 0.7033, + "step": 77570 + }, + { + "epoch": 0.49563650767284667, + "grad_norm": 1.0141565799713135, + "learning_rate": 8.560740704560299e-05, + "loss": 0.7592, + "step": 77580 + }, + { + "epoch": 0.4957003948225854, + "grad_norm": 1.0680170059204102, + "learning_rate": 8.560388430481979e-05, + "loss": 1.0655, + "step": 77590 + }, + { + "epoch": 0.4957642819723241, + "grad_norm": 0.8860800862312317, + "learning_rate": 8.560036120547468e-05, + "loss": 0.9499, + "step": 77600 + }, + { + "epoch": 0.4958281691220628, + "grad_norm": 1.698761224746704, + "learning_rate": 8.559683774760311e-05, + "loss": 1.088, + "step": 77610 + }, + { + "epoch": 0.4958920562718015, + "grad_norm": 1.741514801979065, + "learning_rate": 8.559331393124059e-05, + "loss": 0.9058, + "step": 77620 + }, + { + "epoch": 0.4959559434215402, + "grad_norm": 2.4504334926605225, + "learning_rate": 8.558978975642262e-05, + "loss": 0.8441, + "step": 77630 + }, + { + "epoch": 0.4960198305712789, + "grad_norm": 1.7975729703903198, + "learning_rate": 8.558626522318467e-05, + "loss": 1.0169, + "step": 77640 + }, + { + "epoch": 0.4960837177210176, + "grad_norm": 0.8959939479827881, + "learning_rate": 8.558274033156224e-05, + "loss": 0.7613, + "step": 77650 + }, + { + "epoch": 0.4961476048707563, + "grad_norm": 1.107818841934204, + "learning_rate": 8.557921508159083e-05, + "loss": 1.1208, + "step": 77660 + }, + { + "epoch": 0.49621149202049497, + "grad_norm": 0.7719281911849976, + "learning_rate": 8.557568947330596e-05, + "loss": 0.932, + "step": 77670 + }, + { + "epoch": 0.4962753791702337, + "grad_norm": 0.7551287412643433, + "learning_rate": 8.557216350674311e-05, + "loss": 0.9016, + "step": 77680 + }, + { + "epoch": 0.4963392663199724, + "grad_norm": 0.8318659067153931, + "learning_rate": 8.556863718193779e-05, + "loss": 0.9607, + "step": 77690 + }, + { + "epoch": 0.4964031534697111, + "grad_norm": 0.799311101436615, + "learning_rate": 
8.556511049892553e-05, + "loss": 1.0467, + "step": 77700 + }, + { + "epoch": 0.4964670406194498, + "grad_norm": 1.174993634223938, + "learning_rate": 8.556158345774184e-05, + "loss": 0.9272, + "step": 77710 + }, + { + "epoch": 0.4965309277691885, + "grad_norm": 0.6065928936004639, + "learning_rate": 8.555805605842224e-05, + "loss": 0.9209, + "step": 77720 + }, + { + "epoch": 0.4965948149189272, + "grad_norm": 0.9526639580726624, + "learning_rate": 8.555452830100226e-05, + "loss": 0.8938, + "step": 77730 + }, + { + "epoch": 0.4966587020686659, + "grad_norm": 0.893748939037323, + "learning_rate": 8.555100018551741e-05, + "loss": 1.0344, + "step": 77740 + }, + { + "epoch": 0.4967225892184046, + "grad_norm": 0.9119165539741516, + "learning_rate": 8.554747171200324e-05, + "loss": 1.1131, + "step": 77750 + }, + { + "epoch": 0.4967864763681433, + "grad_norm": 0.649663507938385, + "learning_rate": 8.554394288049526e-05, + "loss": 0.8, + "step": 77760 + }, + { + "epoch": 0.49685036351788203, + "grad_norm": 0.7879058122634888, + "learning_rate": 8.554041369102904e-05, + "loss": 0.7511, + "step": 77770 + }, + { + "epoch": 0.49691425066762074, + "grad_norm": 1.0487127304077148, + "learning_rate": 8.55368841436401e-05, + "loss": 1.022, + "step": 77780 + }, + { + "epoch": 0.4969781378173594, + "grad_norm": 2.496959686279297, + "learning_rate": 8.553335423836399e-05, + "loss": 1.0092, + "step": 77790 + }, + { + "epoch": 0.4970420249670981, + "grad_norm": 0.8213831186294556, + "learning_rate": 8.552982397523628e-05, + "loss": 0.6308, + "step": 77800 + }, + { + "epoch": 0.4971059121168368, + "grad_norm": 1.1579580307006836, + "learning_rate": 8.55262933542925e-05, + "loss": 1.0219, + "step": 77810 + }, + { + "epoch": 0.4971697992665755, + "grad_norm": 0.927528440952301, + "learning_rate": 8.55227623755682e-05, + "loss": 0.7056, + "step": 77820 + }, + { + "epoch": 0.4972336864163142, + "grad_norm": 1.2623285055160522, + "learning_rate": 8.551923103909896e-05, + "loss": 0.9829, + "step": 77830 + }, + { + "epoch": 0.4972975735660529, + "grad_norm": 0.9464250802993774, + "learning_rate": 8.551569934492032e-05, + "loss": 0.8887, + "step": 77840 + }, + { + "epoch": 0.4973614607157916, + "grad_norm": 2.496879816055298, + "learning_rate": 8.551216729306788e-05, + "loss": 1.054, + "step": 77850 + }, + { + "epoch": 0.49742534786553033, + "grad_norm": 1.2735011577606201, + "learning_rate": 8.550863488357718e-05, + "loss": 0.7108, + "step": 77860 + }, + { + "epoch": 0.49748923501526904, + "grad_norm": 0.8742243647575378, + "learning_rate": 8.550510211648382e-05, + "loss": 1.1427, + "step": 77870 + }, + { + "epoch": 0.49755312216500774, + "grad_norm": 0.5537328720092773, + "learning_rate": 8.550156899182336e-05, + "loss": 0.5181, + "step": 77880 + }, + { + "epoch": 0.49761700931474645, + "grad_norm": 1.0898637771606445, + "learning_rate": 8.54980355096314e-05, + "loss": 1.0092, + "step": 77890 + }, + { + "epoch": 0.49768089646448516, + "grad_norm": 0.8493994474411011, + "learning_rate": 8.549450166994348e-05, + "loss": 0.9335, + "step": 77900 + }, + { + "epoch": 0.49774478361422386, + "grad_norm": 0.8540746569633484, + "learning_rate": 8.549096747279526e-05, + "loss": 0.9631, + "step": 77910 + }, + { + "epoch": 0.4978086707639625, + "grad_norm": 0.9067754745483398, + "learning_rate": 8.548743291822227e-05, + "loss": 0.7435, + "step": 77920 + }, + { + "epoch": 0.4978725579137012, + "grad_norm": 0.9325600862503052, + "learning_rate": 8.548389800626013e-05, + "loss": 0.9721, + "step": 77930 + }, + { + "epoch": 
0.4979364450634399, + "grad_norm": 0.892930805683136, + "learning_rate": 8.548036273694445e-05, + "loss": 1.0944, + "step": 77940 + }, + { + "epoch": 0.49800033221317863, + "grad_norm": 0.8587602376937866, + "learning_rate": 8.54768271103108e-05, + "loss": 1.1267, + "step": 77950 + }, + { + "epoch": 0.49806421936291734, + "grad_norm": 0.6374524831771851, + "learning_rate": 8.547329112639483e-05, + "loss": 0.8988, + "step": 77960 + }, + { + "epoch": 0.49812810651265604, + "grad_norm": 0.7276429533958435, + "learning_rate": 8.546975478523211e-05, + "loss": 0.9158, + "step": 77970 + }, + { + "epoch": 0.49819199366239475, + "grad_norm": 1.0100622177124023, + "learning_rate": 8.546621808685829e-05, + "loss": 0.9022, + "step": 77980 + }, + { + "epoch": 0.49825588081213346, + "grad_norm": 0.7123457193374634, + "learning_rate": 8.546268103130897e-05, + "loss": 1.0161, + "step": 77990 + }, + { + "epoch": 0.49831976796187216, + "grad_norm": 1.1501771211624146, + "learning_rate": 8.545914361861977e-05, + "loss": 0.848, + "step": 78000 + }, + { + "epoch": 0.49838365511161087, + "grad_norm": 1.3650606870651245, + "learning_rate": 8.545560584882632e-05, + "loss": 1.0104, + "step": 78010 + }, + { + "epoch": 0.4984475422613496, + "grad_norm": 0.5080598592758179, + "learning_rate": 8.545206772196425e-05, + "loss": 0.8855, + "step": 78020 + }, + { + "epoch": 0.4985114294110883, + "grad_norm": 0.9266533851623535, + "learning_rate": 8.544852923806918e-05, + "loss": 0.8948, + "step": 78030 + }, + { + "epoch": 0.49857531656082693, + "grad_norm": 0.782556414604187, + "learning_rate": 8.544499039717675e-05, + "loss": 1.0098, + "step": 78040 + }, + { + "epoch": 0.49863920371056564, + "grad_norm": 0.6983265280723572, + "learning_rate": 8.544145119932261e-05, + "loss": 0.8239, + "step": 78050 + }, + { + "epoch": 0.49870309086030434, + "grad_norm": 0.8616853952407837, + "learning_rate": 8.543791164454238e-05, + "loss": 0.8512, + "step": 78060 + }, + { + "epoch": 0.49876697801004305, + "grad_norm": 1.00681471824646, + "learning_rate": 8.543437173287175e-05, + "loss": 0.8171, + "step": 78070 + }, + { + "epoch": 0.49883086515978176, + "grad_norm": 0.7537940144538879, + "learning_rate": 8.543083146434632e-05, + "loss": 0.7415, + "step": 78080 + }, + { + "epoch": 0.49889475230952046, + "grad_norm": 0.8292582631111145, + "learning_rate": 8.542729083900176e-05, + "loss": 0.7361, + "step": 78090 + }, + { + "epoch": 0.49895863945925917, + "grad_norm": 0.6987549066543579, + "learning_rate": 8.542374985687376e-05, + "loss": 1.0473, + "step": 78100 + }, + { + "epoch": 0.4990225266089979, + "grad_norm": 0.7763581275939941, + "learning_rate": 8.542020851799792e-05, + "loss": 0.9915, + "step": 78110 + }, + { + "epoch": 0.4990864137587366, + "grad_norm": 0.857244610786438, + "learning_rate": 8.541666682240996e-05, + "loss": 0.8533, + "step": 78120 + }, + { + "epoch": 0.4991503009084753, + "grad_norm": 0.7238770127296448, + "learning_rate": 8.541312477014551e-05, + "loss": 1.0054, + "step": 78130 + }, + { + "epoch": 0.499214188058214, + "grad_norm": 1.2885125875473022, + "learning_rate": 8.540958236124028e-05, + "loss": 0.9024, + "step": 78140 + }, + { + "epoch": 0.4992780752079527, + "grad_norm": 1.3444433212280273, + "learning_rate": 8.540603959572991e-05, + "loss": 0.8516, + "step": 78150 + }, + { + "epoch": 0.49934196235769135, + "grad_norm": 1.0060087442398071, + "learning_rate": 8.540249647365008e-05, + "loss": 0.8785, + "step": 78160 + }, + { + "epoch": 0.49940584950743006, + "grad_norm": 1.0679055452346802, + "learning_rate": 
8.539895299503648e-05, + "loss": 0.9874, + "step": 78170 + }, + { + "epoch": 0.49946973665716876, + "grad_norm": 0.7113552093505859, + "learning_rate": 8.539540915992482e-05, + "loss": 0.8014, + "step": 78180 + }, + { + "epoch": 0.49953362380690747, + "grad_norm": 1.0472384691238403, + "learning_rate": 8.539186496835077e-05, + "loss": 0.9478, + "step": 78190 + }, + { + "epoch": 0.4995975109566462, + "grad_norm": 0.7270193099975586, + "learning_rate": 8.538832042035e-05, + "loss": 0.9747, + "step": 78200 + }, + { + "epoch": 0.4996613981063849, + "grad_norm": 0.6182805895805359, + "learning_rate": 8.538477551595824e-05, + "loss": 0.9063, + "step": 78210 + }, + { + "epoch": 0.4997252852561236, + "grad_norm": 1.1360803842544556, + "learning_rate": 8.538123025521117e-05, + "loss": 0.942, + "step": 78220 + }, + { + "epoch": 0.4997891724058623, + "grad_norm": 0.7602689862251282, + "learning_rate": 8.537768463814451e-05, + "loss": 0.9089, + "step": 78230 + }, + { + "epoch": 0.499853059555601, + "grad_norm": 0.8490816354751587, + "learning_rate": 8.537413866479396e-05, + "loss": 1.0442, + "step": 78240 + }, + { + "epoch": 0.4999169467053397, + "grad_norm": 0.8993768692016602, + "learning_rate": 8.537059233519522e-05, + "loss": 0.7744, + "step": 78250 + }, + { + "epoch": 0.4999808338550784, + "grad_norm": 1.221891164779663, + "learning_rate": 8.536704564938402e-05, + "loss": 0.9663, + "step": 78260 + }, + { + "epoch": 0.5000447210048171, + "grad_norm": 0.7886923551559448, + "learning_rate": 8.536349860739608e-05, + "loss": 0.9475, + "step": 78270 + }, + { + "epoch": 0.5001086081545558, + "grad_norm": 0.8273355960845947, + "learning_rate": 8.535995120926712e-05, + "loss": 0.9991, + "step": 78280 + }, + { + "epoch": 0.5001724953042945, + "grad_norm": 0.7041333317756653, + "learning_rate": 8.535640345503285e-05, + "loss": 0.7416, + "step": 78290 + }, + { + "epoch": 0.5002363824540332, + "grad_norm": 0.994158148765564, + "learning_rate": 8.535285534472901e-05, + "loss": 0.802, + "step": 78300 + }, + { + "epoch": 0.5003002696037719, + "grad_norm": 1.7603987455368042, + "learning_rate": 8.534930687839134e-05, + "loss": 0.948, + "step": 78310 + }, + { + "epoch": 0.5003641567535106, + "grad_norm": 0.8774569034576416, + "learning_rate": 8.534575805605555e-05, + "loss": 0.9112, + "step": 78320 + }, + { + "epoch": 0.5004280439032494, + "grad_norm": 1.1153593063354492, + "learning_rate": 8.534220887775743e-05, + "loss": 0.8952, + "step": 78330 + }, + { + "epoch": 0.500491931052988, + "grad_norm": 0.6436009407043457, + "learning_rate": 8.533865934353267e-05, + "loss": 0.8265, + "step": 78340 + }, + { + "epoch": 0.5005558182027267, + "grad_norm": 0.8798633217811584, + "learning_rate": 8.533510945341704e-05, + "loss": 0.876, + "step": 78350 + }, + { + "epoch": 0.5006197053524654, + "grad_norm": 0.92572021484375, + "learning_rate": 8.533155920744629e-05, + "loss": 1.0156, + "step": 78360 + }, + { + "epoch": 0.5006835925022041, + "grad_norm": 1.193281650543213, + "learning_rate": 8.532800860565618e-05, + "loss": 0.7637, + "step": 78370 + }, + { + "epoch": 0.5007474796519428, + "grad_norm": 1.7722188234329224, + "learning_rate": 8.532445764808243e-05, + "loss": 0.8904, + "step": 78380 + }, + { + "epoch": 0.5008113668016815, + "grad_norm": 1.1688928604125977, + "learning_rate": 8.532090633476087e-05, + "loss": 0.9443, + "step": 78390 + }, + { + "epoch": 0.5008752539514202, + "grad_norm": 0.7432667016983032, + "learning_rate": 8.531735466572722e-05, + "loss": 0.8921, + "step": 78400 + }, + { + "epoch": 
0.5009391411011589, + "grad_norm": 0.5280702710151672, + "learning_rate": 8.531380264101722e-05, + "loss": 0.6771, + "step": 78410 + }, + { + "epoch": 0.5010030282508976, + "grad_norm": 1.1904703378677368, + "learning_rate": 8.531025026066672e-05, + "loss": 0.9539, + "step": 78420 + }, + { + "epoch": 0.5010669154006363, + "grad_norm": 1.0105900764465332, + "learning_rate": 8.530669752471142e-05, + "loss": 0.7645, + "step": 78430 + }, + { + "epoch": 0.501130802550375, + "grad_norm": 1.2654132843017578, + "learning_rate": 8.530314443318714e-05, + "loss": 0.8891, + "step": 78440 + }, + { + "epoch": 0.5011946897001137, + "grad_norm": 1.8313031196594238, + "learning_rate": 8.529959098612966e-05, + "loss": 0.8518, + "step": 78450 + }, + { + "epoch": 0.5012585768498524, + "grad_norm": 0.9657493829727173, + "learning_rate": 8.529603718357476e-05, + "loss": 0.9087, + "step": 78460 + }, + { + "epoch": 0.5013224639995911, + "grad_norm": 0.9630830883979797, + "learning_rate": 8.529248302555824e-05, + "loss": 0.9349, + "step": 78470 + }, + { + "epoch": 0.5013863511493298, + "grad_norm": 0.7544282674789429, + "learning_rate": 8.528892851211587e-05, + "loss": 0.811, + "step": 78480 + }, + { + "epoch": 0.5014502382990685, + "grad_norm": 1.854946255683899, + "learning_rate": 8.528537364328346e-05, + "loss": 1.142, + "step": 78490 + }, + { + "epoch": 0.5015141254488072, + "grad_norm": 0.9642276763916016, + "learning_rate": 8.528181841909681e-05, + "loss": 0.8864, + "step": 78500 + }, + { + "epoch": 0.501578012598546, + "grad_norm": 0.8221122026443481, + "learning_rate": 8.527826283959173e-05, + "loss": 0.9846, + "step": 78510 + }, + { + "epoch": 0.5016418997482847, + "grad_norm": 1.0337133407592773, + "learning_rate": 8.527470690480403e-05, + "loss": 0.7898, + "step": 78520 + }, + { + "epoch": 0.5017057868980234, + "grad_norm": 0.7045915126800537, + "learning_rate": 8.527115061476951e-05, + "loss": 0.9587, + "step": 78530 + }, + { + "epoch": 0.5017696740477621, + "grad_norm": 0.8976203799247742, + "learning_rate": 8.526759396952398e-05, + "loss": 0.815, + "step": 78540 + }, + { + "epoch": 0.5018335611975008, + "grad_norm": 1.2948665618896484, + "learning_rate": 8.526403696910326e-05, + "loss": 1.1365, + "step": 78550 + }, + { + "epoch": 0.5018974483472395, + "grad_norm": 0.5973215699195862, + "learning_rate": 8.52604796135432e-05, + "loss": 0.934, + "step": 78560 + }, + { + "epoch": 0.5019613354969782, + "grad_norm": 0.8393608927726746, + "learning_rate": 8.52569219028796e-05, + "loss": 0.9389, + "step": 78570 + }, + { + "epoch": 0.5020252226467168, + "grad_norm": 0.8553054332733154, + "learning_rate": 8.525336383714831e-05, + "loss": 0.9821, + "step": 78580 + }, + { + "epoch": 0.5020891097964555, + "grad_norm": 0.43800783157348633, + "learning_rate": 8.524980541638513e-05, + "loss": 0.7432, + "step": 78590 + }, + { + "epoch": 0.5021529969461942, + "grad_norm": 0.6436516046524048, + "learning_rate": 8.524624664062591e-05, + "loss": 0.8488, + "step": 78600 + }, + { + "epoch": 0.5022168840959329, + "grad_norm": 1.9890680313110352, + "learning_rate": 8.524268750990649e-05, + "loss": 0.9869, + "step": 78610 + }, + { + "epoch": 0.5022807712456716, + "grad_norm": 0.5732369422912598, + "learning_rate": 8.523912802426274e-05, + "loss": 0.8985, + "step": 78620 + }, + { + "epoch": 0.5023446583954103, + "grad_norm": 2.491802453994751, + "learning_rate": 8.523556818373047e-05, + "loss": 0.7785, + "step": 78630 + }, + { + "epoch": 0.502408545545149, + "grad_norm": 0.7027126550674438, + "learning_rate": 
8.523200798834555e-05, + "loss": 0.9466, + "step": 78640 + }, + { + "epoch": 0.5024724326948877, + "grad_norm": 0.7287322282791138, + "learning_rate": 8.522844743814382e-05, + "loss": 0.7692, + "step": 78650 + }, + { + "epoch": 0.5025363198446264, + "grad_norm": 0.9365010857582092, + "learning_rate": 8.522488653316117e-05, + "loss": 0.7661, + "step": 78660 + }, + { + "epoch": 0.5026002069943651, + "grad_norm": 0.5464925765991211, + "learning_rate": 8.522132527343342e-05, + "loss": 0.6622, + "step": 78670 + }, + { + "epoch": 0.5026640941441038, + "grad_norm": 1.0046019554138184, + "learning_rate": 8.521776365899645e-05, + "loss": 0.8033, + "step": 78680 + }, + { + "epoch": 0.5027279812938426, + "grad_norm": 1.4342055320739746, + "learning_rate": 8.521420168988615e-05, + "loss": 0.6578, + "step": 78690 + }, + { + "epoch": 0.5027918684435813, + "grad_norm": 0.5104334950447083, + "learning_rate": 8.521063936613835e-05, + "loss": 0.8798, + "step": 78700 + }, + { + "epoch": 0.50285575559332, + "grad_norm": 0.5231984853744507, + "learning_rate": 8.520707668778897e-05, + "loss": 0.709, + "step": 78710 + }, + { + "epoch": 0.5029196427430587, + "grad_norm": 1.10532546043396, + "learning_rate": 8.520351365487387e-05, + "loss": 0.7458, + "step": 78720 + }, + { + "epoch": 0.5029835298927974, + "grad_norm": 1.1135833263397217, + "learning_rate": 8.519995026742892e-05, + "loss": 0.779, + "step": 78730 + }, + { + "epoch": 0.5030474170425361, + "grad_norm": 1.3996037244796753, + "learning_rate": 8.519638652549003e-05, + "loss": 0.8194, + "step": 78740 + }, + { + "epoch": 0.5031113041922748, + "grad_norm": 0.8986942172050476, + "learning_rate": 8.519282242909307e-05, + "loss": 0.876, + "step": 78750 + }, + { + "epoch": 0.5031751913420135, + "grad_norm": 1.100974202156067, + "learning_rate": 8.518925797827394e-05, + "loss": 0.9528, + "step": 78760 + }, + { + "epoch": 0.5032390784917522, + "grad_norm": 0.801201581954956, + "learning_rate": 8.518569317306855e-05, + "loss": 1.0414, + "step": 78770 + }, + { + "epoch": 0.5033029656414909, + "grad_norm": 0.7082595825195312, + "learning_rate": 8.518212801351278e-05, + "loss": 0.9245, + "step": 78780 + }, + { + "epoch": 0.5033668527912296, + "grad_norm": 0.708473265171051, + "learning_rate": 8.517856249964254e-05, + "loss": 1.1068, + "step": 78790 + }, + { + "epoch": 0.5034307399409683, + "grad_norm": 0.6335508823394775, + "learning_rate": 8.517499663149376e-05, + "loss": 0.6662, + "step": 78800 + }, + { + "epoch": 0.503494627090707, + "grad_norm": 0.6749662756919861, + "learning_rate": 8.517143040910231e-05, + "loss": 0.7987, + "step": 78810 + }, + { + "epoch": 0.5035585142404457, + "grad_norm": 0.8133144974708557, + "learning_rate": 8.516786383250415e-05, + "loss": 0.8016, + "step": 78820 + }, + { + "epoch": 0.5036224013901843, + "grad_norm": 2.017829418182373, + "learning_rate": 8.516429690173516e-05, + "loss": 0.9251, + "step": 78830 + }, + { + "epoch": 0.503686288539923, + "grad_norm": 0.5370079278945923, + "learning_rate": 8.516072961683128e-05, + "loss": 0.8484, + "step": 78840 + }, + { + "epoch": 0.5037501756896617, + "grad_norm": 0.9369492530822754, + "learning_rate": 8.515716197782845e-05, + "loss": 0.7588, + "step": 78850 + }, + { + "epoch": 0.5038140628394004, + "grad_norm": 0.9964790344238281, + "learning_rate": 8.515359398476257e-05, + "loss": 1.0391, + "step": 78860 + }, + { + "epoch": 0.5038779499891392, + "grad_norm": 0.9717357158660889, + "learning_rate": 8.51500256376696e-05, + "loss": 0.8133, + "step": 78870 + }, + { + "epoch": 
0.5039418371388779, + "grad_norm": 1.1114850044250488, + "learning_rate": 8.514645693658545e-05, + "loss": 1.0353, + "step": 78880 + }, + { + "epoch": 0.5040057242886166, + "grad_norm": 0.9141243100166321, + "learning_rate": 8.514288788154607e-05, + "loss": 1.0811, + "step": 78890 + }, + { + "epoch": 0.5040696114383553, + "grad_norm": 0.6969479322433472, + "learning_rate": 8.513931847258741e-05, + "loss": 0.7896, + "step": 78900 + }, + { + "epoch": 0.504133498588094, + "grad_norm": 0.8812980651855469, + "learning_rate": 8.513574870974542e-05, + "loss": 0.9231, + "step": 78910 + }, + { + "epoch": 0.5041973857378327, + "grad_norm": 0.9980469942092896, + "learning_rate": 8.513217859305604e-05, + "loss": 0.8142, + "step": 78920 + }, + { + "epoch": 0.5042612728875714, + "grad_norm": 2.051957130432129, + "learning_rate": 8.512860812255523e-05, + "loss": 0.9061, + "step": 78930 + }, + { + "epoch": 0.5043251600373101, + "grad_norm": 1.683716058731079, + "learning_rate": 8.512503729827894e-05, + "loss": 0.8771, + "step": 78940 + }, + { + "epoch": 0.5043890471870488, + "grad_norm": 0.7868318557739258, + "learning_rate": 8.512146612026314e-05, + "loss": 0.8051, + "step": 78950 + }, + { + "epoch": 0.5044529343367875, + "grad_norm": 0.5927671790122986, + "learning_rate": 8.511789458854379e-05, + "loss": 0.8834, + "step": 78960 + }, + { + "epoch": 0.5045168214865262, + "grad_norm": 1.6531774997711182, + "learning_rate": 8.511432270315685e-05, + "loss": 0.7847, + "step": 78970 + }, + { + "epoch": 0.5045807086362649, + "grad_norm": 1.921579360961914, + "learning_rate": 8.511075046413832e-05, + "loss": 0.9616, + "step": 78980 + }, + { + "epoch": 0.5046445957860036, + "grad_norm": 0.9210075736045837, + "learning_rate": 8.510717787152416e-05, + "loss": 0.6717, + "step": 78990 + }, + { + "epoch": 0.5047084829357423, + "grad_norm": 0.7043361663818359, + "learning_rate": 8.510360492535033e-05, + "loss": 0.8636, + "step": 79000 + }, + { + "epoch": 0.504772370085481, + "grad_norm": 0.8452950716018677, + "learning_rate": 8.510003162565283e-05, + "loss": 1.0588, + "step": 79010 + }, + { + "epoch": 0.5048362572352197, + "grad_norm": 0.9324773550033569, + "learning_rate": 8.509645797246766e-05, + "loss": 0.8968, + "step": 79020 + }, + { + "epoch": 0.5049001443849584, + "grad_norm": 0.7496733069419861, + "learning_rate": 8.50928839658308e-05, + "loss": 0.9523, + "step": 79030 + }, + { + "epoch": 0.5049640315346972, + "grad_norm": 1.1798431873321533, + "learning_rate": 8.508930960577821e-05, + "loss": 0.9494, + "step": 79040 + }, + { + "epoch": 0.5050279186844359, + "grad_norm": 0.9668488502502441, + "learning_rate": 8.508573489234594e-05, + "loss": 0.8869, + "step": 79050 + }, + { + "epoch": 0.5050918058341746, + "grad_norm": 0.746605634689331, + "learning_rate": 8.508215982556996e-05, + "loss": 0.8892, + "step": 79060 + }, + { + "epoch": 0.5051556929839132, + "grad_norm": 0.7922160029411316, + "learning_rate": 8.507858440548628e-05, + "loss": 0.772, + "step": 79070 + }, + { + "epoch": 0.5052195801336519, + "grad_norm": 0.7023123502731323, + "learning_rate": 8.50750086321309e-05, + "loss": 1.062, + "step": 79080 + }, + { + "epoch": 0.5052834672833906, + "grad_norm": 0.7416033148765564, + "learning_rate": 8.507143250553985e-05, + "loss": 0.8482, + "step": 79090 + }, + { + "epoch": 0.5053473544331293, + "grad_norm": 0.6974393725395203, + "learning_rate": 8.506785602574914e-05, + "loss": 1.0322, + "step": 79100 + }, + { + "epoch": 0.505411241582868, + "grad_norm": 1.0407123565673828, + "learning_rate": 
8.506427919279478e-05, + "loss": 0.7803, + "step": 79110 + }, + { + "epoch": 0.5054751287326067, + "grad_norm": 0.6520995497703552, + "learning_rate": 8.506070200671277e-05, + "loss": 1.2658, + "step": 79120 + }, + { + "epoch": 0.5055390158823454, + "grad_norm": 1.1513316631317139, + "learning_rate": 8.505712446753918e-05, + "loss": 0.8079, + "step": 79130 + }, + { + "epoch": 0.5056029030320841, + "grad_norm": 0.7983292937278748, + "learning_rate": 8.505354657531001e-05, + "loss": 1.3388, + "step": 79140 + }, + { + "epoch": 0.5056667901818228, + "grad_norm": 0.6515194177627563, + "learning_rate": 8.50499683300613e-05, + "loss": 0.7375, + "step": 79150 + }, + { + "epoch": 0.5057306773315615, + "grad_norm": 0.7249539494514465, + "learning_rate": 8.504638973182908e-05, + "loss": 0.8181, + "step": 79160 + }, + { + "epoch": 0.5057945644813002, + "grad_norm": 1.1405197381973267, + "learning_rate": 8.504281078064942e-05, + "loss": 0.8314, + "step": 79170 + }, + { + "epoch": 0.5058584516310389, + "grad_norm": 0.5889720320701599, + "learning_rate": 8.503923147655832e-05, + "loss": 0.9283, + "step": 79180 + }, + { + "epoch": 0.5059223387807776, + "grad_norm": 1.013061761856079, + "learning_rate": 8.503565181959185e-05, + "loss": 0.7729, + "step": 79190 + }, + { + "epoch": 0.5059862259305163, + "grad_norm": 1.1202266216278076, + "learning_rate": 8.503207180978604e-05, + "loss": 0.888, + "step": 79200 + }, + { + "epoch": 0.506050113080255, + "grad_norm": 1.2008094787597656, + "learning_rate": 8.502849144717698e-05, + "loss": 0.8544, + "step": 79210 + }, + { + "epoch": 0.5061140002299938, + "grad_norm": 0.7154238224029541, + "learning_rate": 8.50249107318007e-05, + "loss": 1.0849, + "step": 79220 + }, + { + "epoch": 0.5061778873797325, + "grad_norm": 0.9151634573936462, + "learning_rate": 8.502132966369327e-05, + "loss": 0.8898, + "step": 79230 + }, + { + "epoch": 0.5062417745294712, + "grad_norm": 0.9326740503311157, + "learning_rate": 8.501774824289076e-05, + "loss": 0.7975, + "step": 79240 + }, + { + "epoch": 0.5063056616792099, + "grad_norm": 0.9655689001083374, + "learning_rate": 8.501416646942922e-05, + "loss": 0.8229, + "step": 79250 + }, + { + "epoch": 0.5063695488289486, + "grad_norm": NaN, + "learning_rate": 8.50109425718202e-05, + "loss": 0.929, + "step": 79260 + }, + { + "epoch": 0.5064334359786873, + "grad_norm": 1.2509207725524902, + "learning_rate": 8.50073601284059e-05, + "loss": 0.8922, + "step": 79270 + }, + { + "epoch": 0.506497323128426, + "grad_norm": 0.7241592407226562, + "learning_rate": 8.50037773324372e-05, + "loss": 1.0239, + "step": 79280 + }, + { + "epoch": 0.5065612102781647, + "grad_norm": 0.7398717999458313, + "learning_rate": 8.500019418395019e-05, + "loss": 1.0812, + "step": 79290 + }, + { + "epoch": 0.5066250974279034, + "grad_norm": 1.2325761318206787, + "learning_rate": 8.499661068298093e-05, + "loss": 0.9234, + "step": 79300 + }, + { + "epoch": 0.506688984577642, + "grad_norm": 0.7128446102142334, + "learning_rate": 8.499302682956554e-05, + "loss": 0.7636, + "step": 79310 + }, + { + "epoch": 0.5067528717273807, + "grad_norm": 1.0909960269927979, + "learning_rate": 8.498944262374009e-05, + "loss": 0.9345, + "step": 79320 + }, + { + "epoch": 0.5068167588771194, + "grad_norm": 0.8504812121391296, + "learning_rate": 8.498585806554069e-05, + "loss": 1.0587, + "step": 79330 + }, + { + "epoch": 0.5068806460268581, + "grad_norm": 1.0179625749588013, + "learning_rate": 8.498227315500343e-05, + "loss": 0.8948, + "step": 79340 + }, + { + "epoch": 0.5069445331765968, + 
"grad_norm": 0.7913358807563782, + "learning_rate": 8.497868789216439e-05, + "loss": 0.9132, + "step": 79350 + }, + { + "epoch": 0.5070084203263355, + "grad_norm": 2.097581148147583, + "learning_rate": 8.497510227705972e-05, + "loss": 1.0746, + "step": 79360 + }, + { + "epoch": 0.5070723074760742, + "grad_norm": 0.8437251448631287, + "learning_rate": 8.497151630972552e-05, + "loss": 0.8626, + "step": 79370 + }, + { + "epoch": 0.5071361946258129, + "grad_norm": 1.6225666999816895, + "learning_rate": 8.496792999019789e-05, + "loss": 0.9101, + "step": 79380 + }, + { + "epoch": 0.5072000817755516, + "grad_norm": 1.98760986328125, + "learning_rate": 8.496434331851295e-05, + "loss": 0.8182, + "step": 79390 + }, + { + "epoch": 0.5072639689252904, + "grad_norm": 0.8181973099708557, + "learning_rate": 8.496075629470683e-05, + "loss": 0.7777, + "step": 79400 + }, + { + "epoch": 0.5073278560750291, + "grad_norm": 0.9031455516815186, + "learning_rate": 8.495716891881564e-05, + "loss": 1.0561, + "step": 79410 + }, + { + "epoch": 0.5073917432247678, + "grad_norm": 1.2834783792495728, + "learning_rate": 8.495358119087553e-05, + "loss": 0.8807, + "step": 79420 + }, + { + "epoch": 0.5074556303745065, + "grad_norm": 0.5575640201568604, + "learning_rate": 8.494999311092262e-05, + "loss": 0.8329, + "step": 79430 + }, + { + "epoch": 0.5075195175242452, + "grad_norm": 1.2049697637557983, + "learning_rate": 8.494640467899303e-05, + "loss": 1.0383, + "step": 79440 + }, + { + "epoch": 0.5075834046739839, + "grad_norm": 1.0265311002731323, + "learning_rate": 8.494281589512292e-05, + "loss": 0.8573, + "step": 79450 + }, + { + "epoch": 0.5076472918237226, + "grad_norm": 1.0250693559646606, + "learning_rate": 8.493922675934842e-05, + "loss": 1.0297, + "step": 79460 + }, + { + "epoch": 0.5077111789734613, + "grad_norm": 0.6825410723686218, + "learning_rate": 8.493563727170569e-05, + "loss": 0.7719, + "step": 79470 + }, + { + "epoch": 0.5077750661232, + "grad_norm": 0.7861701250076294, + "learning_rate": 8.493204743223084e-05, + "loss": 0.915, + "step": 79480 + }, + { + "epoch": 0.5078389532729387, + "grad_norm": 0.7970221638679504, + "learning_rate": 8.492845724096008e-05, + "loss": 0.7341, + "step": 79490 + }, + { + "epoch": 0.5079028404226774, + "grad_norm": 1.6295416355133057, + "learning_rate": 8.492486669792955e-05, + "loss": 0.7568, + "step": 79500 + }, + { + "epoch": 0.5079667275724161, + "grad_norm": 1.207197666168213, + "learning_rate": 8.492127580317536e-05, + "loss": 0.7609, + "step": 79510 + }, + { + "epoch": 0.5080306147221548, + "grad_norm": 0.6028062105178833, + "learning_rate": 8.491768455673373e-05, + "loss": 0.8507, + "step": 79520 + }, + { + "epoch": 0.5080945018718935, + "grad_norm": 0.6845399737358093, + "learning_rate": 8.49140929586408e-05, + "loss": 0.8778, + "step": 79530 + }, + { + "epoch": 0.5081583890216322, + "grad_norm": 0.9547748565673828, + "learning_rate": 8.491050100893276e-05, + "loss": 0.7729, + "step": 79540 + }, + { + "epoch": 0.508222276171371, + "grad_norm": 1.033980369567871, + "learning_rate": 8.490690870764577e-05, + "loss": 0.928, + "step": 79550 + }, + { + "epoch": 0.5082861633211095, + "grad_norm": 0.9669222831726074, + "learning_rate": 8.490331605481602e-05, + "loss": 0.9523, + "step": 79560 + }, + { + "epoch": 0.5083500504708482, + "grad_norm": 0.8368834853172302, + "learning_rate": 8.489972305047968e-05, + "loss": 1.0998, + "step": 79570 + }, + { + "epoch": 0.508413937620587, + "grad_norm": 0.8119040727615356, + "learning_rate": 8.489612969467292e-05, + "loss": 1.0353, + 
"step": 79580 + }, + { + "epoch": 0.5084778247703257, + "grad_norm": 0.9374289512634277, + "learning_rate": 8.489253598743195e-05, + "loss": 1.2263, + "step": 79590 + }, + { + "epoch": 0.5085417119200644, + "grad_norm": 0.6595514416694641, + "learning_rate": 8.488894192879297e-05, + "loss": 0.9092, + "step": 79600 + }, + { + "epoch": 0.5086055990698031, + "grad_norm": 0.5380666851997375, + "learning_rate": 8.488534751879213e-05, + "loss": 1.0118, + "step": 79610 + }, + { + "epoch": 0.5086694862195418, + "grad_norm": 0.6525367498397827, + "learning_rate": 8.488175275746568e-05, + "loss": 0.9371, + "step": 79620 + }, + { + "epoch": 0.5087333733692805, + "grad_norm": 0.5488191246986389, + "learning_rate": 8.487815764484981e-05, + "loss": 0.8618, + "step": 79630 + }, + { + "epoch": 0.5087972605190192, + "grad_norm": 0.7757022380828857, + "learning_rate": 8.487456218098071e-05, + "loss": 0.9421, + "step": 79640 + }, + { + "epoch": 0.5088611476687579, + "grad_norm": 0.7398278117179871, + "learning_rate": 8.48709663658946e-05, + "loss": 1.1401, + "step": 79650 + }, + { + "epoch": 0.5089250348184966, + "grad_norm": 1.6941704750061035, + "learning_rate": 8.486737019962769e-05, + "loss": 0.8624, + "step": 79660 + }, + { + "epoch": 0.5089889219682353, + "grad_norm": 0.8483586311340332, + "learning_rate": 8.486377368221621e-05, + "loss": 0.8825, + "step": 79670 + }, + { + "epoch": 0.509052809117974, + "grad_norm": 0.8225073218345642, + "learning_rate": 8.486017681369636e-05, + "loss": 0.7361, + "step": 79680 + }, + { + "epoch": 0.5091166962677127, + "grad_norm": 0.8197336196899414, + "learning_rate": 8.485657959410436e-05, + "loss": 0.9902, + "step": 79690 + }, + { + "epoch": 0.5091805834174514, + "grad_norm": 0.6956250667572021, + "learning_rate": 8.485298202347646e-05, + "loss": 0.9947, + "step": 79700 + }, + { + "epoch": 0.5092444705671901, + "grad_norm": 1.1216806173324585, + "learning_rate": 8.484938410184888e-05, + "loss": 0.7103, + "step": 79710 + }, + { + "epoch": 0.5093083577169288, + "grad_norm": 1.101396083831787, + "learning_rate": 8.484578582925784e-05, + "loss": 0.7783, + "step": 79720 + }, + { + "epoch": 0.5093722448666675, + "grad_norm": 1.2090519666671753, + "learning_rate": 8.48421872057396e-05, + "loss": 0.7809, + "step": 79730 + }, + { + "epoch": 0.5094361320164063, + "grad_norm": 0.9379667043685913, + "learning_rate": 8.48385882313304e-05, + "loss": 1.0095, + "step": 79740 + }, + { + "epoch": 0.509500019166145, + "grad_norm": 0.6880574822425842, + "learning_rate": 8.483498890606647e-05, + "loss": 0.7678, + "step": 79750 + }, + { + "epoch": 0.5095639063158837, + "grad_norm": 0.9663302898406982, + "learning_rate": 8.483138922998406e-05, + "loss": 0.9895, + "step": 79760 + }, + { + "epoch": 0.5096277934656224, + "grad_norm": 2.0903241634368896, + "learning_rate": 8.482778920311942e-05, + "loss": 0.9586, + "step": 79770 + }, + { + "epoch": 0.5096916806153611, + "grad_norm": 0.723540723323822, + "learning_rate": 8.482418882550882e-05, + "loss": 0.7927, + "step": 79780 + }, + { + "epoch": 0.5097555677650998, + "grad_norm": 0.6735635995864868, + "learning_rate": 8.482058809718852e-05, + "loss": 0.733, + "step": 79790 + }, + { + "epoch": 0.5098194549148384, + "grad_norm": 0.7494048476219177, + "learning_rate": 8.481698701819476e-05, + "loss": 0.7265, + "step": 79800 + }, + { + "epoch": 0.5098833420645771, + "grad_norm": 1.2474843263626099, + "learning_rate": 8.481338558856383e-05, + "loss": 0.9442, + "step": 79810 + }, + { + "epoch": 0.5099472292143158, + "grad_norm": 1.0671770572662354, 
+ "learning_rate": 8.4809783808332e-05, + "loss": 0.6449, + "step": 79820 + }, + { + "epoch": 0.5100111163640545, + "grad_norm": 1.693997859954834, + "learning_rate": 8.480618167753551e-05, + "loss": 0.9382, + "step": 79830 + }, + { + "epoch": 0.5100750035137932, + "grad_norm": 0.8211742639541626, + "learning_rate": 8.480257919621067e-05, + "loss": 0.8989, + "step": 79840 + }, + { + "epoch": 0.5101388906635319, + "grad_norm": 0.6184179186820984, + "learning_rate": 8.479897636439375e-05, + "loss": 0.9222, + "step": 79850 + }, + { + "epoch": 0.5102027778132706, + "grad_norm": 0.9833461046218872, + "learning_rate": 8.479537318212103e-05, + "loss": 0.8038, + "step": 79860 + }, + { + "epoch": 0.5102666649630093, + "grad_norm": 1.01847243309021, + "learning_rate": 8.479176964942879e-05, + "loss": 1.0515, + "step": 79870 + }, + { + "epoch": 0.510330552112748, + "grad_norm": 1.1789108514785767, + "learning_rate": 8.478816576635334e-05, + "loss": 0.7673, + "step": 79880 + }, + { + "epoch": 0.5103944392624867, + "grad_norm": 1.949750542640686, + "learning_rate": 8.478456153293096e-05, + "loss": 0.8108, + "step": 79890 + }, + { + "epoch": 0.5104583264122254, + "grad_norm": 1.040195345878601, + "learning_rate": 8.478095694919797e-05, + "loss": 0.9984, + "step": 79900 + }, + { + "epoch": 0.5105222135619641, + "grad_norm": 0.8911735415458679, + "learning_rate": 8.477735201519063e-05, + "loss": 0.9404, + "step": 79910 + }, + { + "epoch": 0.5105861007117029, + "grad_norm": 0.8057443499565125, + "learning_rate": 8.477374673094526e-05, + "loss": 0.638, + "step": 79920 + }, + { + "epoch": 0.5106499878614416, + "grad_norm": 0.5853357911109924, + "learning_rate": 8.477014109649822e-05, + "loss": 0.8098, + "step": 79930 + }, + { + "epoch": 0.5107138750111803, + "grad_norm": 0.8207983374595642, + "learning_rate": 8.476653511188575e-05, + "loss": 0.9, + "step": 79940 + }, + { + "epoch": 0.510777762160919, + "grad_norm": 0.6729571223258972, + "learning_rate": 8.47629287771442e-05, + "loss": 0.8749, + "step": 79950 + }, + { + "epoch": 0.5108416493106577, + "grad_norm": 0.5826616287231445, + "learning_rate": 8.475932209230987e-05, + "loss": 0.8363, + "step": 79960 + }, + { + "epoch": 0.5109055364603964, + "grad_norm": 0.5043898224830627, + "learning_rate": 8.475571505741912e-05, + "loss": 0.7508, + "step": 79970 + }, + { + "epoch": 0.5109694236101351, + "grad_norm": 0.9225212931632996, + "learning_rate": 8.475210767250823e-05, + "loss": 0.9501, + "step": 79980 + }, + { + "epoch": 0.5110333107598738, + "grad_norm": 1.0718021392822266, + "learning_rate": 8.474849993761357e-05, + "loss": 0.8453, + "step": 79990 + }, + { + "epoch": 0.5110971979096125, + "grad_norm": 0.7842211127281189, + "learning_rate": 8.474489185277143e-05, + "loss": 1.3727, + "step": 80000 + }, + { + "epoch": 0.5111610850593512, + "grad_norm": 0.8479704260826111, + "learning_rate": 8.474128341801819e-05, + "loss": 1.1579, + "step": 80010 + }, + { + "epoch": 0.5112249722090899, + "grad_norm": 0.736724317073822, + "learning_rate": 8.473767463339018e-05, + "loss": 0.8251, + "step": 80020 + }, + { + "epoch": 0.5112888593588286, + "grad_norm": 0.6635915040969849, + "learning_rate": 8.47340654989237e-05, + "loss": 1.2262, + "step": 80030 + }, + { + "epoch": 0.5113527465085672, + "grad_norm": 0.818091869354248, + "learning_rate": 8.473045601465515e-05, + "loss": 0.9825, + "step": 80040 + }, + { + "epoch": 0.5114166336583059, + "grad_norm": 0.8209525346755981, + "learning_rate": 8.472684618062085e-05, + "loss": 1.1992, + "step": 80050 + }, + { + "epoch": 
0.5114805208080446, + "grad_norm": 0.6535345911979675, + "learning_rate": 8.472323599685718e-05, + "loss": 0.6442, + "step": 80060 + }, + { + "epoch": 0.5115444079577833, + "grad_norm": 1.321568489074707, + "learning_rate": 8.471962546340049e-05, + "loss": 1.0123, + "step": 80070 + }, + { + "epoch": 0.511608295107522, + "grad_norm": 1.0992311239242554, + "learning_rate": 8.471601458028713e-05, + "loss": 0.8926, + "step": 80080 + }, + { + "epoch": 0.5116721822572607, + "grad_norm": 0.987280547618866, + "learning_rate": 8.471240334755346e-05, + "loss": 0.943, + "step": 80090 + }, + { + "epoch": 0.5117360694069994, + "grad_norm": 0.7247947454452515, + "learning_rate": 8.470879176523586e-05, + "loss": 0.8531, + "step": 80100 + }, + { + "epoch": 0.5117999565567382, + "grad_norm": 0.8526644706726074, + "learning_rate": 8.470517983337071e-05, + "loss": 0.7333, + "step": 80110 + }, + { + "epoch": 0.5118638437064769, + "grad_norm": 1.081724762916565, + "learning_rate": 8.470156755199436e-05, + "loss": 0.9023, + "step": 80120 + }, + { + "epoch": 0.5119277308562156, + "grad_norm": 0.9575611352920532, + "learning_rate": 8.469795492114321e-05, + "loss": 0.7949, + "step": 80130 + }, + { + "epoch": 0.5119916180059543, + "grad_norm": 0.6004752516746521, + "learning_rate": 8.469434194085364e-05, + "loss": 1.0179, + "step": 80140 + }, + { + "epoch": 0.512055505155693, + "grad_norm": 0.7859931588172913, + "learning_rate": 8.469072861116202e-05, + "loss": 0.9604, + "step": 80150 + }, + { + "epoch": 0.5121193923054317, + "grad_norm": 0.9513803124427795, + "learning_rate": 8.468711493210476e-05, + "loss": 0.8357, + "step": 80160 + }, + { + "epoch": 0.5121832794551704, + "grad_norm": 0.8474782109260559, + "learning_rate": 8.468350090371825e-05, + "loss": 0.9121, + "step": 80170 + }, + { + "epoch": 0.5122471666049091, + "grad_norm": 0.49391424655914307, + "learning_rate": 8.467988652603887e-05, + "loss": 1.0967, + "step": 80180 + }, + { + "epoch": 0.5123110537546478, + "grad_norm": 0.6341314911842346, + "learning_rate": 8.467627179910304e-05, + "loss": 1.1882, + "step": 80190 + }, + { + "epoch": 0.5123749409043865, + "grad_norm": 0.5973122119903564, + "learning_rate": 8.467265672294715e-05, + "loss": 1.372, + "step": 80200 + }, + { + "epoch": 0.5124388280541252, + "grad_norm": 1.0197665691375732, + "learning_rate": 8.46690412976076e-05, + "loss": 0.6034, + "step": 80210 + }, + { + "epoch": 0.5125027152038639, + "grad_norm": 1.1325267553329468, + "learning_rate": 8.466542552312083e-05, + "loss": 0.9992, + "step": 80220 + }, + { + "epoch": 0.5125666023536026, + "grad_norm": 1.2969529628753662, + "learning_rate": 8.466180939952322e-05, + "loss": 0.9412, + "step": 80230 + }, + { + "epoch": 0.5126304895033413, + "grad_norm": 0.804654598236084, + "learning_rate": 8.465819292685121e-05, + "loss": 0.9241, + "step": 80240 + }, + { + "epoch": 0.51269437665308, + "grad_norm": 0.5683889985084534, + "learning_rate": 8.465457610514122e-05, + "loss": 0.9131, + "step": 80250 + }, + { + "epoch": 0.5127582638028187, + "grad_norm": 1.4431538581848145, + "learning_rate": 8.465095893442965e-05, + "loss": 0.8802, + "step": 80260 + }, + { + "epoch": 0.5128221509525575, + "grad_norm": 0.7495303750038147, + "learning_rate": 8.464734141475296e-05, + "loss": 0.7763, + "step": 80270 + }, + { + "epoch": 0.512886038102296, + "grad_norm": 1.0469660758972168, + "learning_rate": 8.464372354614755e-05, + "loss": 0.7827, + "step": 80280 + }, + { + "epoch": 0.5129499252520348, + "grad_norm": 0.8818047046661377, + "learning_rate": 
8.46401053286499e-05, + "loss": 0.7446, + "step": 80290 + }, + { + "epoch": 0.5130138124017735, + "grad_norm": 0.610306441783905, + "learning_rate": 8.463648676229641e-05, + "loss": 0.9616, + "step": 80300 + }, + { + "epoch": 0.5130776995515122, + "grad_norm": 1.0561434030532837, + "learning_rate": 8.463286784712352e-05, + "loss": 0.9341, + "step": 80310 + }, + { + "epoch": 0.5131415867012509, + "grad_norm": 1.1245967149734497, + "learning_rate": 8.46292485831677e-05, + "loss": 1.0593, + "step": 80320 + }, + { + "epoch": 0.5132054738509896, + "grad_norm": 0.8336319327354431, + "learning_rate": 8.462562897046539e-05, + "loss": 0.9832, + "step": 80330 + }, + { + "epoch": 0.5132693610007283, + "grad_norm": 1.2860108613967896, + "learning_rate": 8.462200900905304e-05, + "loss": 1.2113, + "step": 80340 + }, + { + "epoch": 0.513333248150467, + "grad_norm": 0.6594120860099792, + "learning_rate": 8.46183886989671e-05, + "loss": 1.0788, + "step": 80350 + }, + { + "epoch": 0.5133971353002057, + "grad_norm": 1.1738802194595337, + "learning_rate": 8.461476804024405e-05, + "loss": 1.1394, + "step": 80360 + }, + { + "epoch": 0.5134610224499444, + "grad_norm": 0.8349171280860901, + "learning_rate": 8.461114703292032e-05, + "loss": 0.9976, + "step": 80370 + }, + { + "epoch": 0.5135249095996831, + "grad_norm": 0.9331271648406982, + "learning_rate": 8.460752567703242e-05, + "loss": 0.8871, + "step": 80380 + }, + { + "epoch": 0.5135887967494218, + "grad_norm": 1.0842266082763672, + "learning_rate": 8.460390397261679e-05, + "loss": 0.8379, + "step": 80390 + }, + { + "epoch": 0.5136526838991605, + "grad_norm": 0.8020588159561157, + "learning_rate": 8.46002819197099e-05, + "loss": 0.7943, + "step": 80400 + }, + { + "epoch": 0.5137165710488992, + "grad_norm": 1.0201034545898438, + "learning_rate": 8.459665951834825e-05, + "loss": 0.8956, + "step": 80410 + }, + { + "epoch": 0.5137804581986379, + "grad_norm": 0.710241436958313, + "learning_rate": 8.459303676856829e-05, + "loss": 1.1422, + "step": 80420 + }, + { + "epoch": 0.5138443453483766, + "grad_norm": 1.129925012588501, + "learning_rate": 8.458941367040654e-05, + "loss": 1.0028, + "step": 80430 + }, + { + "epoch": 0.5139082324981153, + "grad_norm": 0.9500714540481567, + "learning_rate": 8.458579022389946e-05, + "loss": 0.9935, + "step": 80440 + }, + { + "epoch": 0.513972119647854, + "grad_norm": 1.4013770818710327, + "learning_rate": 8.458216642908357e-05, + "loss": 1.1331, + "step": 80450 + }, + { + "epoch": 0.5140360067975928, + "grad_norm": 1.6361690759658813, + "learning_rate": 8.457854228599533e-05, + "loss": 0.8196, + "step": 80460 + }, + { + "epoch": 0.5140998939473315, + "grad_norm": 1.773687481880188, + "learning_rate": 8.457491779467124e-05, + "loss": 0.6577, + "step": 80470 + }, + { + "epoch": 0.5141637810970702, + "grad_norm": 1.1989527940750122, + "learning_rate": 8.457129295514785e-05, + "loss": 0.9754, + "step": 80480 + }, + { + "epoch": 0.5142276682468089, + "grad_norm": 1.0061672925949097, + "learning_rate": 8.456766776746161e-05, + "loss": 0.7289, + "step": 80490 + }, + { + "epoch": 0.5142915553965476, + "grad_norm": 0.5245055556297302, + "learning_rate": 8.456404223164906e-05, + "loss": 0.8355, + "step": 80500 + }, + { + "epoch": 0.5143554425462863, + "grad_norm": 0.9344064593315125, + "learning_rate": 8.45604163477467e-05, + "loss": 0.9097, + "step": 80510 + }, + { + "epoch": 0.514419329696025, + "grad_norm": 0.8581297993659973, + "learning_rate": 8.455679011579104e-05, + "loss": 0.6422, + "step": 80520 + }, + { + "epoch": 
0.5144832168457636, + "grad_norm": 1.0841580629348755, + "learning_rate": 8.455316353581861e-05, + "loss": 1.1547, + "step": 80530 + }, + { + "epoch": 0.5145471039955023, + "grad_norm": 0.8380923867225647, + "learning_rate": 8.454953660786594e-05, + "loss": 1.1443, + "step": 80540 + }, + { + "epoch": 0.514610991145241, + "grad_norm": 1.0253181457519531, + "learning_rate": 8.454590933196953e-05, + "loss": 1.0363, + "step": 80550 + }, + { + "epoch": 0.5146748782949797, + "grad_norm": 0.8039796948432922, + "learning_rate": 8.454228170816594e-05, + "loss": 0.9947, + "step": 80560 + }, + { + "epoch": 0.5147387654447184, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.453865373649168e-05, + "loss": 0.8205, + "step": 80570 + }, + { + "epoch": 0.5148026525944571, + "grad_norm": 1.2950266599655151, + "learning_rate": 8.45350254169833e-05, + "loss": 0.8778, + "step": 80580 + }, + { + "epoch": 0.5148665397441958, + "grad_norm": 0.5074208974838257, + "learning_rate": 8.453139674967735e-05, + "loss": 0.9512, + "step": 80590 + }, + { + "epoch": 0.5149304268939345, + "grad_norm": 0.974296510219574, + "learning_rate": 8.452776773461035e-05, + "loss": 0.7415, + "step": 80600 + }, + { + "epoch": 0.5149943140436732, + "grad_norm": 0.8522329926490784, + "learning_rate": 8.452413837181886e-05, + "loss": 0.7619, + "step": 80610 + }, + { + "epoch": 0.515058201193412, + "grad_norm": 0.7677290439605713, + "learning_rate": 8.452050866133943e-05, + "loss": 0.7501, + "step": 80620 + }, + { + "epoch": 0.5151220883431507, + "grad_norm": 0.7231885194778442, + "learning_rate": 8.451687860320862e-05, + "loss": 0.8417, + "step": 80630 + }, + { + "epoch": 0.5151859754928894, + "grad_norm": 1.0473037958145142, + "learning_rate": 8.451324819746297e-05, + "loss": 0.7961, + "step": 80640 + }, + { + "epoch": 0.5152498626426281, + "grad_norm": 1.339667558670044, + "learning_rate": 8.450961744413906e-05, + "loss": 0.6476, + "step": 80650 + }, + { + "epoch": 0.5153137497923668, + "grad_norm": 1.0341308116912842, + "learning_rate": 8.450598634327342e-05, + "loss": 0.7599, + "step": 80660 + }, + { + "epoch": 0.5153776369421055, + "grad_norm": 0.5190713405609131, + "learning_rate": 8.450235489490268e-05, + "loss": 1.0512, + "step": 80670 + }, + { + "epoch": 0.5154415240918442, + "grad_norm": 0.6918653845787048, + "learning_rate": 8.449872309906338e-05, + "loss": 0.8157, + "step": 80680 + }, + { + "epoch": 0.5155054112415829, + "grad_norm": 0.6024577617645264, + "learning_rate": 8.449509095579206e-05, + "loss": 0.9064, + "step": 80690 + }, + { + "epoch": 0.5155692983913216, + "grad_norm": 0.7624403238296509, + "learning_rate": 8.449145846512536e-05, + "loss": 0.7265, + "step": 80700 + }, + { + "epoch": 0.5156331855410603, + "grad_norm": 1.1898252964019775, + "learning_rate": 8.448782562709983e-05, + "loss": 0.9, + "step": 80710 + }, + { + "epoch": 0.515697072690799, + "grad_norm": 1.169190526008606, + "learning_rate": 8.448419244175205e-05, + "loss": 0.9871, + "step": 80720 + }, + { + "epoch": 0.5157609598405377, + "grad_norm": 0.9798734188079834, + "learning_rate": 8.448055890911863e-05, + "loss": 0.9437, + "step": 80730 + }, + { + "epoch": 0.5158248469902764, + "grad_norm": 0.6357259154319763, + "learning_rate": 8.447692502923615e-05, + "loss": 0.8561, + "step": 80740 + }, + { + "epoch": 0.5158887341400151, + "grad_norm": 0.7946950793266296, + "learning_rate": 8.447329080214119e-05, + "loss": 0.9226, + "step": 80750 + }, + { + "epoch": 0.5159526212897538, + "grad_norm": 1.21712064743042, + "learning_rate": 
8.446965622787038e-05, + "loss": 0.7975, + "step": 80760 + }, + { + "epoch": 0.5160165084394924, + "grad_norm": 0.6995162963867188, + "learning_rate": 8.446602130646031e-05, + "loss": 0.7762, + "step": 80770 + }, + { + "epoch": 0.5160803955892311, + "grad_norm": 0.9936223030090332, + "learning_rate": 8.44623860379476e-05, + "loss": 0.8434, + "step": 80780 + }, + { + "epoch": 0.5161442827389698, + "grad_norm": 0.8816124796867371, + "learning_rate": 8.445875042236884e-05, + "loss": 0.9124, + "step": 80790 + }, + { + "epoch": 0.5162081698887085, + "grad_norm": 3.8590049743652344, + "learning_rate": 8.445511445976064e-05, + "loss": 0.9158, + "step": 80800 + }, + { + "epoch": 0.5162720570384473, + "grad_norm": 0.8542178273200989, + "learning_rate": 8.445147815015964e-05, + "loss": 0.7654, + "step": 80810 + }, + { + "epoch": 0.516335944188186, + "grad_norm": 1.375125527381897, + "learning_rate": 8.444784149360245e-05, + "loss": 0.8894, + "step": 80820 + }, + { + "epoch": 0.5163998313379247, + "grad_norm": 0.7835062742233276, + "learning_rate": 8.444420449012569e-05, + "loss": 0.6714, + "step": 80830 + }, + { + "epoch": 0.5164637184876634, + "grad_norm": 0.8339881896972656, + "learning_rate": 8.4440567139766e-05, + "loss": 0.9002, + "step": 80840 + }, + { + "epoch": 0.5165276056374021, + "grad_norm": 0.8910601139068604, + "learning_rate": 8.443692944256001e-05, + "loss": 0.9653, + "step": 80850 + }, + { + "epoch": 0.5165914927871408, + "grad_norm": 0.9393212795257568, + "learning_rate": 8.443329139854433e-05, + "loss": 0.9248, + "step": 80860 + }, + { + "epoch": 0.5166553799368795, + "grad_norm": 0.8954446315765381, + "learning_rate": 8.442965300775563e-05, + "loss": 0.8944, + "step": 80870 + }, + { + "epoch": 0.5167192670866182, + "grad_norm": 1.6217565536499023, + "learning_rate": 8.442601427023054e-05, + "loss": 0.9823, + "step": 80880 + }, + { + "epoch": 0.5167831542363569, + "grad_norm": 0.6830261945724487, + "learning_rate": 8.442237518600569e-05, + "loss": 0.8189, + "step": 80890 + }, + { + "epoch": 0.5168470413860956, + "grad_norm": 0.788052499294281, + "learning_rate": 8.441873575511775e-05, + "loss": 0.897, + "step": 80900 + }, + { + "epoch": 0.5169109285358343, + "grad_norm": 0.8000684380531311, + "learning_rate": 8.441509597760336e-05, + "loss": 0.902, + "step": 80910 + }, + { + "epoch": 0.516974815685573, + "grad_norm": 2.1170742511749268, + "learning_rate": 8.441145585349918e-05, + "loss": 0.7763, + "step": 80920 + }, + { + "epoch": 0.5170387028353117, + "grad_norm": 0.7653173208236694, + "learning_rate": 8.440781538284189e-05, + "loss": 0.8674, + "step": 80930 + }, + { + "epoch": 0.5171025899850504, + "grad_norm": 0.8054555654525757, + "learning_rate": 8.44041745656681e-05, + "loss": 1.0094, + "step": 80940 + }, + { + "epoch": 0.5171664771347891, + "grad_norm": 0.9257411956787109, + "learning_rate": 8.440053340201454e-05, + "loss": 0.698, + "step": 80950 + }, + { + "epoch": 0.5172303642845278, + "grad_norm": 0.7227391600608826, + "learning_rate": 8.439689189191783e-05, + "loss": 1.0385, + "step": 80960 + }, + { + "epoch": 0.5172942514342665, + "grad_norm": 0.873188853263855, + "learning_rate": 8.439325003541466e-05, + "loss": 0.8549, + "step": 80970 + }, + { + "epoch": 0.5173581385840053, + "grad_norm": 2.3998446464538574, + "learning_rate": 8.438960783254171e-05, + "loss": 1.0805, + "step": 80980 + }, + { + "epoch": 0.517422025733744, + "grad_norm": 0.8006882071495056, + "learning_rate": 8.438596528333567e-05, + "loss": 1.0806, + "step": 80990 + }, + { + "epoch": 
0.5174859128834827, + "grad_norm": 0.723087728023529, + "learning_rate": 8.438232238783319e-05, + "loss": 0.9784, + "step": 81000 + }, + { + "epoch": 0.5175498000332213, + "grad_norm": 0.805282473564148, + "learning_rate": 8.437867914607099e-05, + "loss": 0.9561, + "step": 81010 + }, + { + "epoch": 0.51761368718296, + "grad_norm": 0.7623560428619385, + "learning_rate": 8.437503555808575e-05, + "loss": 0.9467, + "step": 81020 + }, + { + "epoch": 0.5176775743326987, + "grad_norm": 0.8528174161911011, + "learning_rate": 8.437139162391416e-05, + "loss": 0.8532, + "step": 81030 + }, + { + "epoch": 0.5177414614824374, + "grad_norm": 0.9997398257255554, + "learning_rate": 8.436774734359292e-05, + "loss": 0.9379, + "step": 81040 + }, + { + "epoch": 0.5178053486321761, + "grad_norm": 0.682331383228302, + "learning_rate": 8.436410271715873e-05, + "loss": 0.9537, + "step": 81050 + }, + { + "epoch": 0.5178692357819148, + "grad_norm": 0.829108476638794, + "learning_rate": 8.436045774464831e-05, + "loss": 0.6795, + "step": 81060 + }, + { + "epoch": 0.5179331229316535, + "grad_norm": 0.5814771056175232, + "learning_rate": 8.435681242609834e-05, + "loss": 1.0538, + "step": 81070 + }, + { + "epoch": 0.5179970100813922, + "grad_norm": 0.9756491184234619, + "learning_rate": 8.435316676154557e-05, + "loss": 0.7018, + "step": 81080 + }, + { + "epoch": 0.5180608972311309, + "grad_norm": 0.49684908986091614, + "learning_rate": 8.434952075102665e-05, + "loss": 0.7389, + "step": 81090 + }, + { + "epoch": 0.5181247843808696, + "grad_norm": 0.7988196015357971, + "learning_rate": 8.434587439457837e-05, + "loss": 0.9728, + "step": 81100 + }, + { + "epoch": 0.5181886715306083, + "grad_norm": 0.7685717344284058, + "learning_rate": 8.43422276922374e-05, + "loss": 1.1467, + "step": 81110 + }, + { + "epoch": 0.518252558680347, + "grad_norm": 1.4645850658416748, + "learning_rate": 8.433858064404052e-05, + "loss": 0.9914, + "step": 81120 + }, + { + "epoch": 0.5183164458300857, + "grad_norm": 1.1191790103912354, + "learning_rate": 8.433493325002439e-05, + "loss": 1.0248, + "step": 81130 + }, + { + "epoch": 0.5183803329798244, + "grad_norm": 1.087517499923706, + "learning_rate": 8.43312855102258e-05, + "loss": 0.8725, + "step": 81140 + }, + { + "epoch": 0.5184442201295631, + "grad_norm": 0.9224085807800293, + "learning_rate": 8.432763742468146e-05, + "loss": 1.0876, + "step": 81150 + }, + { + "epoch": 0.5185081072793019, + "grad_norm": 1.3059546947479248, + "learning_rate": 8.432398899342811e-05, + "loss": 1.003, + "step": 81160 + }, + { + "epoch": 0.5185719944290406, + "grad_norm": 1.533253788948059, + "learning_rate": 8.43203402165025e-05, + "loss": 0.8566, + "step": 81170 + }, + { + "epoch": 0.5186358815787793, + "grad_norm": 0.714002251625061, + "learning_rate": 8.431669109394138e-05, + "loss": 0.7685, + "step": 81180 + }, + { + "epoch": 0.518699768728518, + "grad_norm": 0.8544260859489441, + "learning_rate": 8.431304162578148e-05, + "loss": 0.8297, + "step": 81190 + }, + { + "epoch": 0.5187636558782567, + "grad_norm": 0.5248509049415588, + "learning_rate": 8.430939181205957e-05, + "loss": 1.0113, + "step": 81200 + }, + { + "epoch": 0.5188275430279954, + "grad_norm": 0.8331204056739807, + "learning_rate": 8.430574165281239e-05, + "loss": 0.8771, + "step": 81210 + }, + { + "epoch": 0.5188914301777341, + "grad_norm": 0.9952676296234131, + "learning_rate": 8.430209114807675e-05, + "loss": 0.9538, + "step": 81220 + }, + { + "epoch": 0.5189553173274728, + "grad_norm": 5.568673133850098, + "learning_rate": 8.429844029788933e-05, 
+ "loss": 1.1575, + "step": 81230 + }, + { + "epoch": 0.5190192044772115, + "grad_norm": 0.9033558964729309, + "learning_rate": 8.429478910228697e-05, + "loss": 1.1018, + "step": 81240 + }, + { + "epoch": 0.5190830916269502, + "grad_norm": 0.795211911201477, + "learning_rate": 8.42911375613064e-05, + "loss": 0.8717, + "step": 81250 + }, + { + "epoch": 0.5191469787766888, + "grad_norm": 0.7301700115203857, + "learning_rate": 8.428748567498443e-05, + "loss": 0.8484, + "step": 81260 + }, + { + "epoch": 0.5192108659264275, + "grad_norm": 0.9231024384498596, + "learning_rate": 8.428383344335779e-05, + "loss": 0.8684, + "step": 81270 + }, + { + "epoch": 0.5192747530761662, + "grad_norm": 0.681303083896637, + "learning_rate": 8.428018086646333e-05, + "loss": 0.8345, + "step": 81280 + }, + { + "epoch": 0.5193386402259049, + "grad_norm": 1.12642502784729, + "learning_rate": 8.427652794433776e-05, + "loss": 0.8428, + "step": 81290 + }, + { + "epoch": 0.5194025273756436, + "grad_norm": 0.9355636239051819, + "learning_rate": 8.42728746770179e-05, + "loss": 0.7608, + "step": 81300 + }, + { + "epoch": 0.5194664145253823, + "grad_norm": 1.2537355422973633, + "learning_rate": 8.426922106454054e-05, + "loss": 0.8054, + "step": 81310 + }, + { + "epoch": 0.519530301675121, + "grad_norm": 0.8078314065933228, + "learning_rate": 8.42655671069425e-05, + "loss": 0.9239, + "step": 81320 + }, + { + "epoch": 0.5195941888248597, + "grad_norm": 1.214921236038208, + "learning_rate": 8.426191280426052e-05, + "loss": 0.8623, + "step": 81330 + }, + { + "epoch": 0.5196580759745985, + "grad_norm": 1.2319025993347168, + "learning_rate": 8.425825815653145e-05, + "loss": 0.8355, + "step": 81340 + }, + { + "epoch": 0.5197219631243372, + "grad_norm": 0.8140376806259155, + "learning_rate": 8.42546031637921e-05, + "loss": 0.9517, + "step": 81350 + }, + { + "epoch": 0.5197858502740759, + "grad_norm": 1.0668420791625977, + "learning_rate": 8.425094782607925e-05, + "loss": 0.8228, + "step": 81360 + }, + { + "epoch": 0.5198497374238146, + "grad_norm": 0.855273962020874, + "learning_rate": 8.424729214342972e-05, + "loss": 0.8945, + "step": 81370 + }, + { + "epoch": 0.5199136245735533, + "grad_norm": 1.8131141662597656, + "learning_rate": 8.424363611588033e-05, + "loss": 0.9209, + "step": 81380 + }, + { + "epoch": 0.519977511723292, + "grad_norm": 0.7118239402770996, + "learning_rate": 8.42399797434679e-05, + "loss": 1.0615, + "step": 81390 + }, + { + "epoch": 0.5200413988730307, + "grad_norm": 0.7992277145385742, + "learning_rate": 8.423632302622926e-05, + "loss": 0.8819, + "step": 81400 + }, + { + "epoch": 0.5201052860227694, + "grad_norm": 0.9642464518547058, + "learning_rate": 8.423266596420123e-05, + "loss": 0.8717, + "step": 81410 + }, + { + "epoch": 0.5201691731725081, + "grad_norm": 0.8006801605224609, + "learning_rate": 8.422900855742062e-05, + "loss": 0.8487, + "step": 81420 + }, + { + "epoch": 0.5202330603222468, + "grad_norm": 0.7330396771430969, + "learning_rate": 8.422535080592431e-05, + "loss": 0.9406, + "step": 81430 + }, + { + "epoch": 0.5202969474719855, + "grad_norm": 0.8267525434494019, + "learning_rate": 8.422169270974909e-05, + "loss": 0.8308, + "step": 81440 + }, + { + "epoch": 0.5203608346217242, + "grad_norm": 1.518169641494751, + "learning_rate": 8.421803426893182e-05, + "loss": 0.9029, + "step": 81450 + }, + { + "epoch": 0.5204247217714629, + "grad_norm": 1.128998041152954, + "learning_rate": 8.421437548350935e-05, + "loss": 0.8468, + "step": 81460 + }, + { + "epoch": 0.5204886089212016, + "grad_norm": 
1.051698088645935, + "learning_rate": 8.42107163535185e-05, + "loss": 1.12, + "step": 81470 + }, + { + "epoch": 0.5205524960709403, + "grad_norm": 0.7762027382850647, + "learning_rate": 8.420705687899616e-05, + "loss": 0.6149, + "step": 81480 + }, + { + "epoch": 0.520616383220679, + "grad_norm": 1.0097482204437256, + "learning_rate": 8.420339705997915e-05, + "loss": 0.7171, + "step": 81490 + }, + { + "epoch": 0.5206802703704176, + "grad_norm": 1.062819242477417, + "learning_rate": 8.419973689650436e-05, + "loss": 0.9634, + "step": 81500 + }, + { + "epoch": 0.5207441575201563, + "grad_norm": 0.9354637861251831, + "learning_rate": 8.41960763886086e-05, + "loss": 0.8755, + "step": 81510 + }, + { + "epoch": 0.520808044669895, + "grad_norm": 0.5439125299453735, + "learning_rate": 8.41924155363288e-05, + "loss": 0.9437, + "step": 81520 + }, + { + "epoch": 0.5208719318196338, + "grad_norm": 0.7927828431129456, + "learning_rate": 8.418875433970177e-05, + "loss": 0.9138, + "step": 81530 + }, + { + "epoch": 0.5209358189693725, + "grad_norm": 1.90019953250885, + "learning_rate": 8.418509279876444e-05, + "loss": 1.1347, + "step": 81540 + }, + { + "epoch": 0.5209997061191112, + "grad_norm": 1.1833665370941162, + "learning_rate": 8.418179711756595e-05, + "loss": 1.0368, + "step": 81550 + }, + { + "epoch": 0.5210635932688499, + "grad_norm": 0.6297504305839539, + "learning_rate": 8.417813492254057e-05, + "loss": 0.7755, + "step": 81560 + }, + { + "epoch": 0.5211274804185886, + "grad_norm": 1.1400600671768188, + "learning_rate": 8.417447238331177e-05, + "loss": 1.1425, + "step": 81570 + }, + { + "epoch": 0.5211913675683273, + "grad_norm": 0.9770885705947876, + "learning_rate": 8.41708094999165e-05, + "loss": 0.8823, + "step": 81580 + }, + { + "epoch": 0.521255254718066, + "grad_norm": 1.117124080657959, + "learning_rate": 8.41671462723916e-05, + "loss": 1.0206, + "step": 81590 + }, + { + "epoch": 0.5213191418678047, + "grad_norm": 0.7140239477157593, + "learning_rate": 8.416348270077399e-05, + "loss": 0.9016, + "step": 81600 + }, + { + "epoch": 0.5213830290175434, + "grad_norm": 0.8254780769348145, + "learning_rate": 8.415981878510054e-05, + "loss": 0.9917, + "step": 81610 + }, + { + "epoch": 0.5214469161672821, + "grad_norm": 0.7922462821006775, + "learning_rate": 8.415615452540817e-05, + "loss": 0.7269, + "step": 81620 + }, + { + "epoch": 0.5215108033170208, + "grad_norm": 0.5598194003105164, + "learning_rate": 8.415248992173377e-05, + "loss": 0.7201, + "step": 81630 + }, + { + "epoch": 0.5215746904667595, + "grad_norm": 1.1841058731079102, + "learning_rate": 8.414882497411424e-05, + "loss": 0.9925, + "step": 81640 + }, + { + "epoch": 0.5216385776164982, + "grad_norm": 1.0316505432128906, + "learning_rate": 8.414515968258653e-05, + "loss": 0.7948, + "step": 81650 + }, + { + "epoch": 0.5217024647662369, + "grad_norm": 1.1518917083740234, + "learning_rate": 8.41414940471875e-05, + "loss": 1.0671, + "step": 81660 + }, + { + "epoch": 0.5217663519159756, + "grad_norm": 1.1492241621017456, + "learning_rate": 8.413782806795409e-05, + "loss": 0.9031, + "step": 81670 + }, + { + "epoch": 0.5218302390657144, + "grad_norm": 1.136619210243225, + "learning_rate": 8.413416174492323e-05, + "loss": 0.8399, + "step": 81680 + }, + { + "epoch": 0.5218941262154531, + "grad_norm": 0.7834781408309937, + "learning_rate": 8.413049507813182e-05, + "loss": 1.0536, + "step": 81690 + }, + { + "epoch": 0.5219580133651918, + "grad_norm": 0.7433455586433411, + "learning_rate": 8.412682806761681e-05, + "loss": 0.9202, + "step": 81700 + 
}, + { + "epoch": 0.5220219005149305, + "grad_norm": 0.7488266229629517, + "learning_rate": 8.41231607134151e-05, + "loss": 1.1904, + "step": 81710 + }, + { + "epoch": 0.5220857876646692, + "grad_norm": 1.112772822380066, + "learning_rate": 8.411949301556365e-05, + "loss": 0.8893, + "step": 81720 + }, + { + "epoch": 0.5221496748144079, + "grad_norm": 1.1430962085723877, + "learning_rate": 8.411582497409937e-05, + "loss": 0.7654, + "step": 81730 + }, + { + "epoch": 0.5222135619641465, + "grad_norm": 0.9723464250564575, + "learning_rate": 8.411215658905925e-05, + "loss": 0.8554, + "step": 81740 + }, + { + "epoch": 0.5222774491138852, + "grad_norm": 0.7580609321594238, + "learning_rate": 8.410848786048018e-05, + "loss": 0.8596, + "step": 81750 + }, + { + "epoch": 0.5223413362636239, + "grad_norm": 1.1168793439865112, + "learning_rate": 8.410481878839914e-05, + "loss": 0.798, + "step": 81760 + }, + { + "epoch": 0.5224052234133626, + "grad_norm": 0.7281593084335327, + "learning_rate": 8.410114937285308e-05, + "loss": 0.9573, + "step": 81770 + }, + { + "epoch": 0.5224691105631013, + "grad_norm": 0.8710981011390686, + "learning_rate": 8.409747961387892e-05, + "loss": 0.8257, + "step": 81780 + }, + { + "epoch": 0.52253299771284, + "grad_norm": 1.258819818496704, + "learning_rate": 8.409380951151364e-05, + "loss": 0.7601, + "step": 81790 + }, + { + "epoch": 0.5225968848625787, + "grad_norm": 0.5845881700515747, + "learning_rate": 8.409013906579422e-05, + "loss": 0.894, + "step": 81800 + }, + { + "epoch": 0.5226607720123174, + "grad_norm": 0.7453858852386475, + "learning_rate": 8.40864682767576e-05, + "loss": 0.8617, + "step": 81810 + }, + { + "epoch": 0.5227246591620561, + "grad_norm": 0.8870908617973328, + "learning_rate": 8.408279714444076e-05, + "loss": 0.9018, + "step": 81820 + }, + { + "epoch": 0.5227885463117948, + "grad_norm": 0.7710734605789185, + "learning_rate": 8.407912566888068e-05, + "loss": 0.7743, + "step": 81830 + }, + { + "epoch": 0.5228524334615335, + "grad_norm": 1.001556396484375, + "learning_rate": 8.40754538501143e-05, + "loss": 0.9576, + "step": 81840 + }, + { + "epoch": 0.5229163206112722, + "grad_norm": 1.0256420373916626, + "learning_rate": 8.407178168817862e-05, + "loss": 0.9606, + "step": 81850 + }, + { + "epoch": 0.522980207761011, + "grad_norm": 1.1523104906082153, + "learning_rate": 8.406810918311063e-05, + "loss": 0.8513, + "step": 81860 + }, + { + "epoch": 0.5230440949107497, + "grad_norm": 0.8839610815048218, + "learning_rate": 8.40644363349473e-05, + "loss": 1.0848, + "step": 81870 + }, + { + "epoch": 0.5231079820604884, + "grad_norm": 0.863746166229248, + "learning_rate": 8.406076314372564e-05, + "loss": 0.8318, + "step": 81880 + }, + { + "epoch": 0.5231718692102271, + "grad_norm": 1.282516360282898, + "learning_rate": 8.405708960948262e-05, + "loss": 0.9989, + "step": 81890 + }, + { + "epoch": 0.5232357563599658, + "grad_norm": 0.9695531129837036, + "learning_rate": 8.405341573225524e-05, + "loss": 1.042, + "step": 81900 + }, + { + "epoch": 0.5232996435097045, + "grad_norm": 1.1501736640930176, + "learning_rate": 8.40497415120805e-05, + "loss": 0.9179, + "step": 81910 + }, + { + "epoch": 0.5233635306594432, + "grad_norm": 0.8112602829933167, + "learning_rate": 8.404606694899542e-05, + "loss": 0.7767, + "step": 81920 + }, + { + "epoch": 0.5234274178091819, + "grad_norm": 0.7559998631477356, + "learning_rate": 8.404239204303698e-05, + "loss": 0.8375, + "step": 81930 + }, + { + "epoch": 0.5234913049589206, + "grad_norm": 0.6688728332519531, + "learning_rate": 
8.403871679424222e-05, + "loss": 0.8205, + "step": 81940 + }, + { + "epoch": 0.5235551921086593, + "grad_norm": 1.4303487539291382, + "learning_rate": 8.403504120264811e-05, + "loss": 1.0413, + "step": 81950 + }, + { + "epoch": 0.523619079258398, + "grad_norm": 1.0139001607894897, + "learning_rate": 8.403136526829171e-05, + "loss": 0.7122, + "step": 81960 + }, + { + "epoch": 0.5236829664081367, + "grad_norm": 1.1375806331634521, + "learning_rate": 8.402768899121e-05, + "loss": 0.9585, + "step": 81970 + }, + { + "epoch": 0.5237468535578753, + "grad_norm": 0.801270604133606, + "learning_rate": 8.402401237144005e-05, + "loss": 1.0773, + "step": 81980 + }, + { + "epoch": 0.523810740707614, + "grad_norm": 1.0210850238800049, + "learning_rate": 8.402033540901884e-05, + "loss": 0.9391, + "step": 81990 + }, + { + "epoch": 0.5238746278573527, + "grad_norm": 0.44863033294677734, + "learning_rate": 8.401665810398342e-05, + "loss": 0.7008, + "step": 82000 + }, + { + "epoch": 0.5239385150070914, + "grad_norm": 0.6878476738929749, + "learning_rate": 8.401298045637083e-05, + "loss": 1.0242, + "step": 82010 + }, + { + "epoch": 0.5240024021568301, + "grad_norm": 1.221062421798706, + "learning_rate": 8.40093024662181e-05, + "loss": 0.6651, + "step": 82020 + }, + { + "epoch": 0.5240662893065688, + "grad_norm": 0.8801291584968567, + "learning_rate": 8.400562413356228e-05, + "loss": 1.07, + "step": 82030 + }, + { + "epoch": 0.5241301764563076, + "grad_norm": 1.5638190507888794, + "learning_rate": 8.40019454584404e-05, + "loss": 1.0253, + "step": 82040 + }, + { + "epoch": 0.5241940636060463, + "grad_norm": 0.7046545147895813, + "learning_rate": 8.399826644088951e-05, + "loss": 1.0211, + "step": 82050 + }, + { + "epoch": 0.524257950755785, + "grad_norm": 0.7261834740638733, + "learning_rate": 8.399458708094668e-05, + "loss": 1.0918, + "step": 82060 + }, + { + "epoch": 0.5243218379055237, + "grad_norm": 0.7312687635421753, + "learning_rate": 8.399090737864893e-05, + "loss": 1.0264, + "step": 82070 + }, + { + "epoch": 0.5243857250552624, + "grad_norm": 0.7334839105606079, + "learning_rate": 8.398722733403335e-05, + "loss": 0.6618, + "step": 82080 + }, + { + "epoch": 0.5244496122050011, + "grad_norm": 1.9923456907272339, + "learning_rate": 8.398354694713697e-05, + "loss": 0.7325, + "step": 82090 + }, + { + "epoch": 0.5245134993547398, + "grad_norm": 1.5566961765289307, + "learning_rate": 8.397986621799688e-05, + "loss": 1.0462, + "step": 82100 + }, + { + "epoch": 0.5245773865044785, + "grad_norm": 0.5462529063224792, + "learning_rate": 8.397618514665015e-05, + "loss": 0.7883, + "step": 82110 + }, + { + "epoch": 0.5246412736542172, + "grad_norm": 0.7034682631492615, + "learning_rate": 8.397250373313383e-05, + "loss": 0.98, + "step": 82120 + }, + { + "epoch": 0.5247051608039559, + "grad_norm": 2.1927855014801025, + "learning_rate": 8.396882197748501e-05, + "loss": 1.0054, + "step": 82130 + }, + { + "epoch": 0.5247690479536946, + "grad_norm": 0.724446713924408, + "learning_rate": 8.396513987974078e-05, + "loss": 0.8474, + "step": 82140 + }, + { + "epoch": 0.5248329351034333, + "grad_norm": 1.2433834075927734, + "learning_rate": 8.396145743993819e-05, + "loss": 1.0282, + "step": 82150 + }, + { + "epoch": 0.524896822253172, + "grad_norm": 0.5404759645462036, + "learning_rate": 8.395777465811434e-05, + "loss": 0.9458, + "step": 82160 + }, + { + "epoch": 0.5249607094029107, + "grad_norm": 0.5961291790008545, + "learning_rate": 8.395409153430633e-05, + "loss": 0.8767, + "step": 82170 + }, + { + "epoch": 0.5250245965526494, 
+ "grad_norm": 1.5657272338867188, + "learning_rate": 8.395040806855125e-05, + "loss": 0.8355, + "step": 82180 + }, + { + "epoch": 0.5250884837023881, + "grad_norm": 0.761735737323761, + "learning_rate": 8.394672426088618e-05, + "loss": 1.1087, + "step": 82190 + }, + { + "epoch": 0.5251523708521268, + "grad_norm": 0.904757022857666, + "learning_rate": 8.394304011134822e-05, + "loss": 1.1173, + "step": 82200 + }, + { + "epoch": 0.5252162580018656, + "grad_norm": 1.890184998512268, + "learning_rate": 8.39393556199745e-05, + "loss": 0.7589, + "step": 82210 + }, + { + "epoch": 0.5252801451516043, + "grad_norm": 0.8762137293815613, + "learning_rate": 8.39356707868021e-05, + "loss": 0.9525, + "step": 82220 + }, + { + "epoch": 0.5253440323013429, + "grad_norm": 1.153976321220398, + "learning_rate": 8.393198561186814e-05, + "loss": 1.0239, + "step": 82230 + }, + { + "epoch": 0.5254079194510816, + "grad_norm": 1.3719924688339233, + "learning_rate": 8.392830009520972e-05, + "loss": 0.8364, + "step": 82240 + }, + { + "epoch": 0.5254718066008203, + "grad_norm": 0.9596297144889832, + "learning_rate": 8.392461423686397e-05, + "loss": 0.8316, + "step": 82250 + }, + { + "epoch": 0.525535693750559, + "grad_norm": 0.8172164559364319, + "learning_rate": 8.392092803686801e-05, + "loss": 0.8146, + "step": 82260 + }, + { + "epoch": 0.5255995809002977, + "grad_norm": 0.7319055795669556, + "learning_rate": 8.391724149525895e-05, + "loss": 0.8911, + "step": 82270 + }, + { + "epoch": 0.5256634680500364, + "grad_norm": 0.949073314666748, + "learning_rate": 8.391355461207393e-05, + "loss": 0.8981, + "step": 82280 + }, + { + "epoch": 0.5257273551997751, + "grad_norm": 2.1071205139160156, + "learning_rate": 8.390986738735007e-05, + "loss": 0.882, + "step": 82290 + }, + { + "epoch": 0.5257912423495138, + "grad_norm": 0.9172298908233643, + "learning_rate": 8.390617982112452e-05, + "loss": 1.0809, + "step": 82300 + }, + { + "epoch": 0.5258551294992525, + "grad_norm": 1.3219941854476929, + "learning_rate": 8.390249191343442e-05, + "loss": 0.8637, + "step": 82310 + }, + { + "epoch": 0.5259190166489912, + "grad_norm": 0.8916542530059814, + "learning_rate": 8.389880366431687e-05, + "loss": 1.1391, + "step": 82320 + }, + { + "epoch": 0.5259829037987299, + "grad_norm": 0.6826764941215515, + "learning_rate": 8.389511507380905e-05, + "loss": 0.814, + "step": 82330 + }, + { + "epoch": 0.5260467909484686, + "grad_norm": 1.7593846321105957, + "learning_rate": 8.389142614194809e-05, + "loss": 1.1427, + "step": 82340 + }, + { + "epoch": 0.5261106780982073, + "grad_norm": 0.4655475318431854, + "learning_rate": 8.388773686877117e-05, + "loss": 0.6992, + "step": 82350 + }, + { + "epoch": 0.526174565247946, + "grad_norm": 1.185663104057312, + "learning_rate": 8.38840472543154e-05, + "loss": 0.8237, + "step": 82360 + }, + { + "epoch": 0.5262384523976847, + "grad_norm": 0.8329865336418152, + "learning_rate": 8.388035729861797e-05, + "loss": 0.9379, + "step": 82370 + }, + { + "epoch": 0.5263023395474234, + "grad_norm": 0.5856214761734009, + "learning_rate": 8.387666700171603e-05, + "loss": 0.7349, + "step": 82380 + }, + { + "epoch": 0.5263662266971622, + "grad_norm": 0.8730786442756653, + "learning_rate": 8.387297636364675e-05, + "loss": 0.8201, + "step": 82390 + }, + { + "epoch": 0.5264301138469009, + "grad_norm": 0.7338623404502869, + "learning_rate": 8.38692853844473e-05, + "loss": 0.9384, + "step": 82400 + }, + { + "epoch": 0.5264940009966396, + "grad_norm": 0.6729745268821716, + "learning_rate": 8.386559406415481e-05, + "loss": 0.8973, 
+ "step": 82410 + }, + { + "epoch": 0.5265578881463783, + "grad_norm": 1.3873519897460938, + "learning_rate": 8.386190240280652e-05, + "loss": 0.836, + "step": 82420 + }, + { + "epoch": 0.526621775296117, + "grad_norm": 0.808180570602417, + "learning_rate": 8.385821040043958e-05, + "loss": 0.8696, + "step": 82430 + }, + { + "epoch": 0.5266856624458557, + "grad_norm": 1.6018669605255127, + "learning_rate": 8.385451805709116e-05, + "loss": 0.7054, + "step": 82440 + }, + { + "epoch": 0.5267495495955944, + "grad_norm": 0.9540040493011475, + "learning_rate": 8.385082537279846e-05, + "loss": 0.9149, + "step": 82450 + }, + { + "epoch": 0.5268134367453331, + "grad_norm": 1.0204883813858032, + "learning_rate": 8.384713234759866e-05, + "loss": 0.8372, + "step": 82460 + }, + { + "epoch": 0.5268773238950717, + "grad_norm": 0.6929607391357422, + "learning_rate": 8.384343898152896e-05, + "loss": 0.7188, + "step": 82470 + }, + { + "epoch": 0.5269412110448104, + "grad_norm": 0.7308977246284485, + "learning_rate": 8.383974527462655e-05, + "loss": 0.8171, + "step": 82480 + }, + { + "epoch": 0.5270050981945491, + "grad_norm": 0.9977597594261169, + "learning_rate": 8.383605122692861e-05, + "loss": 0.7189, + "step": 82490 + }, + { + "epoch": 0.5270689853442878, + "grad_norm": 1.0899478197097778, + "learning_rate": 8.383235683847238e-05, + "loss": 1.0767, + "step": 82500 + }, + { + "epoch": 0.5271328724940265, + "grad_norm": 1.763649582862854, + "learning_rate": 8.382866210929506e-05, + "loss": 0.9618, + "step": 82510 + }, + { + "epoch": 0.5271967596437652, + "grad_norm": 0.8483105897903442, + "learning_rate": 8.382496703943382e-05, + "loss": 0.9302, + "step": 82520 + }, + { + "epoch": 0.5272606467935039, + "grad_norm": 1.4405713081359863, + "learning_rate": 8.38212716289259e-05, + "loss": 1.0092, + "step": 82530 + }, + { + "epoch": 0.5273245339432426, + "grad_norm": 0.8310216069221497, + "learning_rate": 8.381757587780853e-05, + "loss": 0.9264, + "step": 82540 + }, + { + "epoch": 0.5273884210929813, + "grad_norm": 1.1414271593093872, + "learning_rate": 8.381387978611892e-05, + "loss": 0.8157, + "step": 82550 + }, + { + "epoch": 0.52745230824272, + "grad_norm": 1.1843761205673218, + "learning_rate": 8.381018335389428e-05, + "loss": 1.0638, + "step": 82560 + }, + { + "epoch": 0.5275161953924588, + "grad_norm": 0.6531765460968018, + "learning_rate": 8.380648658117186e-05, + "loss": 0.8616, + "step": 82570 + }, + { + "epoch": 0.5275800825421975, + "grad_norm": 0.6731355786323547, + "learning_rate": 8.380278946798883e-05, + "loss": 0.7789, + "step": 82580 + }, + { + "epoch": 0.5276439696919362, + "grad_norm": 0.7285798788070679, + "learning_rate": 8.37990920143825e-05, + "loss": 0.8369, + "step": 82590 + }, + { + "epoch": 0.5277078568416749, + "grad_norm": 1.3306784629821777, + "learning_rate": 8.379539422039006e-05, + "loss": 0.9274, + "step": 82600 + }, + { + "epoch": 0.5277717439914136, + "grad_norm": 0.7392432689666748, + "learning_rate": 8.379169608604877e-05, + "loss": 0.7246, + "step": 82610 + }, + { + "epoch": 0.5278356311411523, + "grad_norm": 1.119094967842102, + "learning_rate": 8.378799761139587e-05, + "loss": 0.8628, + "step": 82620 + }, + { + "epoch": 0.527899518290891, + "grad_norm": 0.5163264274597168, + "learning_rate": 8.378429879646859e-05, + "loss": 0.6895, + "step": 82630 + }, + { + "epoch": 0.5279634054406297, + "grad_norm": 0.7841934561729431, + "learning_rate": 8.378059964130421e-05, + "loss": 0.9658, + "step": 82640 + }, + { + "epoch": 0.5280272925903684, + "grad_norm": 0.6107231378555298, 
+ "learning_rate": 8.377690014593996e-05, + "loss": 0.8572, + "step": 82650 + }, + { + "epoch": 0.5280911797401071, + "grad_norm": 0.839893102645874, + "learning_rate": 8.377320031041309e-05, + "loss": 1.003, + "step": 82660 + }, + { + "epoch": 0.5281550668898458, + "grad_norm": 0.8523367643356323, + "learning_rate": 8.37695001347609e-05, + "loss": 0.8451, + "step": 82670 + }, + { + "epoch": 0.5282189540395845, + "grad_norm": 0.8957772850990295, + "learning_rate": 8.37657996190206e-05, + "loss": 1.1725, + "step": 82680 + }, + { + "epoch": 0.5282828411893232, + "grad_norm": 1.1203519105911255, + "learning_rate": 8.376209876322952e-05, + "loss": 0.7363, + "step": 82690 + }, + { + "epoch": 0.5283467283390619, + "grad_norm": 0.440065860748291, + "learning_rate": 8.375839756742487e-05, + "loss": 0.9615, + "step": 82700 + }, + { + "epoch": 0.5284106154888005, + "grad_norm": 0.8723582029342651, + "learning_rate": 8.375469603164397e-05, + "loss": 1.0116, + "step": 82710 + }, + { + "epoch": 0.5284745026385392, + "grad_norm": 0.7620988488197327, + "learning_rate": 8.375099415592406e-05, + "loss": 0.74, + "step": 82720 + }, + { + "epoch": 0.5285383897882779, + "grad_norm": 0.6935875415802002, + "learning_rate": 8.374729194030245e-05, + "loss": 0.6662, + "step": 82730 + }, + { + "epoch": 0.5286022769380166, + "grad_norm": 0.8621264696121216, + "learning_rate": 8.374358938481641e-05, + "loss": 0.8284, + "step": 82740 + }, + { + "epoch": 0.5286661640877554, + "grad_norm": 1.0655511617660522, + "learning_rate": 8.373988648950324e-05, + "loss": 0.8265, + "step": 82750 + }, + { + "epoch": 0.5287300512374941, + "grad_norm": 0.6569475531578064, + "learning_rate": 8.373618325440022e-05, + "loss": 0.8138, + "step": 82760 + }, + { + "epoch": 0.5287939383872328, + "grad_norm": 0.7676059007644653, + "learning_rate": 8.373247967954465e-05, + "loss": 0.798, + "step": 82770 + }, + { + "epoch": 0.5288578255369715, + "grad_norm": 0.5665331482887268, + "learning_rate": 8.372877576497383e-05, + "loss": 1.0496, + "step": 82780 + }, + { + "epoch": 0.5289217126867102, + "grad_norm": 0.7789902687072754, + "learning_rate": 8.372507151072506e-05, + "loss": 0.9221, + "step": 82790 + }, + { + "epoch": 0.5289855998364489, + "grad_norm": 0.6825060248374939, + "learning_rate": 8.372136691683563e-05, + "loss": 0.7601, + "step": 82800 + }, + { + "epoch": 0.5290494869861876, + "grad_norm": 0.8659316897392273, + "learning_rate": 8.371766198334288e-05, + "loss": 0.763, + "step": 82810 + }, + { + "epoch": 0.5291133741359263, + "grad_norm": 0.7778592109680176, + "learning_rate": 8.371395671028409e-05, + "loss": 0.8137, + "step": 82820 + }, + { + "epoch": 0.529177261285665, + "grad_norm": 0.7770895957946777, + "learning_rate": 8.37102510976966e-05, + "loss": 1.0609, + "step": 82830 + }, + { + "epoch": 0.5292411484354037, + "grad_norm": 0.8733034729957581, + "learning_rate": 8.370654514561771e-05, + "loss": 0.598, + "step": 82840 + }, + { + "epoch": 0.5293050355851424, + "grad_norm": 0.7436967492103577, + "learning_rate": 8.370283885408474e-05, + "loss": 0.8707, + "step": 82850 + }, + { + "epoch": 0.5293689227348811, + "grad_norm": 0.6810287237167358, + "learning_rate": 8.369913222313504e-05, + "loss": 0.831, + "step": 82860 + }, + { + "epoch": 0.5294328098846198, + "grad_norm": 0.6513703465461731, + "learning_rate": 8.369542525280593e-05, + "loss": 1.0104, + "step": 82870 + }, + { + "epoch": 0.5294966970343585, + "grad_norm": 0.7047162055969238, + "learning_rate": 8.369171794313473e-05, + "loss": 0.93, + "step": 82880 + }, + { + "epoch": 
0.5295605841840972, + "grad_norm": 1.145445704460144, + "learning_rate": 8.368801029415878e-05, + "loss": 0.777, + "step": 82890 + }, + { + "epoch": 0.5296244713338359, + "grad_norm": 0.8920283317565918, + "learning_rate": 8.368430230591542e-05, + "loss": 0.7874, + "step": 82900 + }, + { + "epoch": 0.5296883584835747, + "grad_norm": 1.2382694482803345, + "learning_rate": 8.3680593978442e-05, + "loss": 0.9681, + "step": 82910 + }, + { + "epoch": 0.5297522456333134, + "grad_norm": 1.1094623804092407, + "learning_rate": 8.367688531177586e-05, + "loss": 1.0037, + "step": 82920 + }, + { + "epoch": 0.5298161327830521, + "grad_norm": 0.6824793815612793, + "learning_rate": 8.367317630595434e-05, + "loss": 0.8448, + "step": 82930 + }, + { + "epoch": 0.5298800199327908, + "grad_norm": 0.7402679324150085, + "learning_rate": 8.366946696101483e-05, + "loss": 0.8678, + "step": 82940 + }, + { + "epoch": 0.5299439070825295, + "grad_norm": 0.8517834544181824, + "learning_rate": 8.366575727699464e-05, + "loss": 0.9731, + "step": 82950 + }, + { + "epoch": 0.5300077942322681, + "grad_norm": 0.645380437374115, + "learning_rate": 8.366204725393114e-05, + "loss": 0.7756, + "step": 82960 + }, + { + "epoch": 0.5300716813820068, + "grad_norm": 1.7040249109268188, + "learning_rate": 8.365833689186172e-05, + "loss": 0.8345, + "step": 82970 + }, + { + "epoch": 0.5301355685317455, + "grad_norm": 1.2551957368850708, + "learning_rate": 8.365462619082372e-05, + "loss": 1.1466, + "step": 82980 + }, + { + "epoch": 0.5301994556814842, + "grad_norm": 0.596899688243866, + "learning_rate": 8.365091515085452e-05, + "loss": 0.9004, + "step": 82990 + }, + { + "epoch": 0.5302633428312229, + "grad_norm": 0.7382088899612427, + "learning_rate": 8.36472037719915e-05, + "loss": 0.9212, + "step": 83000 + }, + { + "epoch": 0.5303272299809616, + "grad_norm": 0.587727963924408, + "learning_rate": 8.364349205427203e-05, + "loss": 1.1075, + "step": 83010 + }, + { + "epoch": 0.5303911171307003, + "grad_norm": 0.5298671126365662, + "learning_rate": 8.363977999773347e-05, + "loss": 0.8375, + "step": 83020 + }, + { + "epoch": 0.530455004280439, + "grad_norm": 0.7597865462303162, + "learning_rate": 8.363606760241323e-05, + "loss": 1.0665, + "step": 83030 + }, + { + "epoch": 0.5305188914301777, + "grad_norm": 1.4384864568710327, + "learning_rate": 8.363235486834871e-05, + "loss": 0.8882, + "step": 83040 + }, + { + "epoch": 0.5305827785799164, + "grad_norm": 1.0824155807495117, + "learning_rate": 8.362864179557726e-05, + "loss": 0.9262, + "step": 83050 + }, + { + "epoch": 0.5306466657296551, + "grad_norm": 0.9911159873008728, + "learning_rate": 8.36249283841363e-05, + "loss": 1.0327, + "step": 83060 + }, + { + "epoch": 0.5307105528793938, + "grad_norm": 1.1133580207824707, + "learning_rate": 8.362121463406323e-05, + "loss": 0.9494, + "step": 83070 + }, + { + "epoch": 0.5307744400291325, + "grad_norm": 1.960688591003418, + "learning_rate": 8.361750054539544e-05, + "loss": 0.9712, + "step": 83080 + }, + { + "epoch": 0.5308383271788712, + "grad_norm": 0.710477888584137, + "learning_rate": 8.361378611817033e-05, + "loss": 0.913, + "step": 83090 + }, + { + "epoch": 0.53090221432861, + "grad_norm": 0.9821794629096985, + "learning_rate": 8.36100713524253e-05, + "loss": 0.9127, + "step": 83100 + }, + { + "epoch": 0.5309661014783487, + "grad_norm": 0.5997344851493835, + "learning_rate": 8.360635624819778e-05, + "loss": 0.8929, + "step": 83110 + }, + { + "epoch": 0.5310299886280874, + "grad_norm": 1.1449615955352783, + "learning_rate": 8.36026408055252e-05, + 
"loss": 0.5898, + "step": 83120 + }, + { + "epoch": 0.5310938757778261, + "grad_norm": 1.3157292604446411, + "learning_rate": 8.359892502444494e-05, + "loss": 0.8881, + "step": 83130 + }, + { + "epoch": 0.5311577629275648, + "grad_norm": 0.6515958905220032, + "learning_rate": 8.359520890499443e-05, + "loss": 0.8024, + "step": 83140 + }, + { + "epoch": 0.5312216500773035, + "grad_norm": 0.8860677480697632, + "learning_rate": 8.359149244721112e-05, + "loss": 0.8348, + "step": 83150 + }, + { + "epoch": 0.5312855372270422, + "grad_norm": 0.8517594933509827, + "learning_rate": 8.358777565113242e-05, + "loss": 0.8221, + "step": 83160 + }, + { + "epoch": 0.5313494243767809, + "grad_norm": 0.9374033808708191, + "learning_rate": 8.358405851679574e-05, + "loss": 0.7633, + "step": 83170 + }, + { + "epoch": 0.5314133115265196, + "grad_norm": 0.885487973690033, + "learning_rate": 8.358034104423857e-05, + "loss": 0.926, + "step": 83180 + }, + { + "epoch": 0.5314771986762583, + "grad_norm": 0.6758369207382202, + "learning_rate": 8.357662323349828e-05, + "loss": 0.9676, + "step": 83190 + }, + { + "epoch": 0.5315410858259969, + "grad_norm": 1.0564520359039307, + "learning_rate": 8.357290508461238e-05, + "loss": 1.1299, + "step": 83200 + }, + { + "epoch": 0.5316049729757356, + "grad_norm": 0.7590727210044861, + "learning_rate": 8.356918659761826e-05, + "loss": 0.7102, + "step": 83210 + }, + { + "epoch": 0.5316688601254743, + "grad_norm": 1.157811164855957, + "learning_rate": 8.356546777255339e-05, + "loss": 1.3055, + "step": 83220 + }, + { + "epoch": 0.531732747275213, + "grad_norm": 0.7595437169075012, + "learning_rate": 8.356174860945521e-05, + "loss": 0.6526, + "step": 83230 + }, + { + "epoch": 0.5317966344249517, + "grad_norm": 1.0199005603790283, + "learning_rate": 8.355802910836122e-05, + "loss": 0.8271, + "step": 83240 + }, + { + "epoch": 0.5318605215746904, + "grad_norm": 0.9487647414207458, + "learning_rate": 8.355430926930882e-05, + "loss": 1.0003, + "step": 83250 + }, + { + "epoch": 0.5319244087244291, + "grad_norm": 0.8482071161270142, + "learning_rate": 8.35505890923355e-05, + "loss": 1.0448, + "step": 83260 + }, + { + "epoch": 0.5319882958741678, + "grad_norm": 0.7448784708976746, + "learning_rate": 8.354686857747872e-05, + "loss": 1.0787, + "step": 83270 + }, + { + "epoch": 0.5320521830239066, + "grad_norm": 0.6928815245628357, + "learning_rate": 8.354314772477596e-05, + "loss": 0.7474, + "step": 83280 + }, + { + "epoch": 0.5321160701736453, + "grad_norm": 0.7215262651443481, + "learning_rate": 8.353942653426468e-05, + "loss": 0.8376, + "step": 83290 + }, + { + "epoch": 0.532179957323384, + "grad_norm": 1.3700790405273438, + "learning_rate": 8.353570500598235e-05, + "loss": 0.9953, + "step": 83300 + }, + { + "epoch": 0.5322438444731227, + "grad_norm": 1.3022416830062866, + "learning_rate": 8.353198313996649e-05, + "loss": 0.874, + "step": 83310 + }, + { + "epoch": 0.5323077316228614, + "grad_norm": 0.7596305012702942, + "learning_rate": 8.352826093625453e-05, + "loss": 0.6174, + "step": 83320 + }, + { + "epoch": 0.5323716187726001, + "grad_norm": 0.7313193082809448, + "learning_rate": 8.352453839488397e-05, + "loss": 1.105, + "step": 83330 + }, + { + "epoch": 0.5324355059223388, + "grad_norm": 0.6660478711128235, + "learning_rate": 8.35208155158923e-05, + "loss": 1.0361, + "step": 83340 + }, + { + "epoch": 0.5324993930720775, + "grad_norm": 1.1651302576065063, + "learning_rate": 8.351709229931704e-05, + "loss": 0.8399, + "step": 83350 + }, + { + "epoch": 0.5325632802218162, + "grad_norm": 
1.0588157176971436, + "learning_rate": 8.351336874519564e-05, + "loss": 1.0247, + "step": 83360 + }, + { + "epoch": 0.5326271673715549, + "grad_norm": 1.3601503372192383, + "learning_rate": 8.350964485356562e-05, + "loss": 0.9979, + "step": 83370 + }, + { + "epoch": 0.5326910545212936, + "grad_norm": 1.1261094808578491, + "learning_rate": 8.350592062446451e-05, + "loss": 0.793, + "step": 83380 + }, + { + "epoch": 0.5327549416710323, + "grad_norm": 0.7241072654724121, + "learning_rate": 8.35021960579298e-05, + "loss": 0.965, + "step": 83390 + }, + { + "epoch": 0.532818828820771, + "grad_norm": 0.7780799865722656, + "learning_rate": 8.349847115399896e-05, + "loss": 1.0567, + "step": 83400 + }, + { + "epoch": 0.5328827159705097, + "grad_norm": 0.7408662438392639, + "learning_rate": 8.349474591270957e-05, + "loss": 0.782, + "step": 83410 + }, + { + "epoch": 0.5329466031202484, + "grad_norm": 1.189795732498169, + "learning_rate": 8.349102033409907e-05, + "loss": 0.8716, + "step": 83420 + }, + { + "epoch": 0.5330104902699871, + "grad_norm": 0.7125329375267029, + "learning_rate": 8.348729441820505e-05, + "loss": 0.9386, + "step": 83430 + }, + { + "epoch": 0.5330743774197257, + "grad_norm": 0.9453898668289185, + "learning_rate": 8.3483568165065e-05, + "loss": 0.9899, + "step": 83440 + }, + { + "epoch": 0.5331382645694644, + "grad_norm": 0.7429458498954773, + "learning_rate": 8.347984157471645e-05, + "loss": 0.893, + "step": 83450 + }, + { + "epoch": 0.5332021517192032, + "grad_norm": 0.758669376373291, + "learning_rate": 8.347611464719694e-05, + "loss": 0.9558, + "step": 83460 + }, + { + "epoch": 0.5332660388689419, + "grad_norm": 0.7475212216377258, + "learning_rate": 8.347238738254399e-05, + "loss": 0.7202, + "step": 83470 + }, + { + "epoch": 0.5333299260186806, + "grad_norm": 1.387135624885559, + "learning_rate": 8.346865978079512e-05, + "loss": 0.6861, + "step": 83480 + }, + { + "epoch": 0.5333938131684193, + "grad_norm": 0.9755001068115234, + "learning_rate": 8.346493184198792e-05, + "loss": 0.8146, + "step": 83490 + }, + { + "epoch": 0.533457700318158, + "grad_norm": 0.9414482712745667, + "learning_rate": 8.346120356615989e-05, + "loss": 0.9161, + "step": 83500 + }, + { + "epoch": 0.5335215874678967, + "grad_norm": 0.7464240193367004, + "learning_rate": 8.34574749533486e-05, + "loss": 0.9186, + "step": 83510 + }, + { + "epoch": 0.5335854746176354, + "grad_norm": 1.2203441858291626, + "learning_rate": 8.34537460035916e-05, + "loss": 0.8786, + "step": 83520 + }, + { + "epoch": 0.5336493617673741, + "grad_norm": 1.0309191942214966, + "learning_rate": 8.345001671692641e-05, + "loss": 0.7864, + "step": 83530 + }, + { + "epoch": 0.5337132489171128, + "grad_norm": 1.0020480155944824, + "learning_rate": 8.344628709339063e-05, + "loss": 0.858, + "step": 83540 + }, + { + "epoch": 0.5337771360668515, + "grad_norm": 0.7191622257232666, + "learning_rate": 8.344255713302181e-05, + "loss": 0.699, + "step": 83550 + }, + { + "epoch": 0.5338410232165902, + "grad_norm": 0.4417421817779541, + "learning_rate": 8.343882683585748e-05, + "loss": 0.8309, + "step": 83560 + }, + { + "epoch": 0.5339049103663289, + "grad_norm": 0.5455567240715027, + "learning_rate": 8.343509620193526e-05, + "loss": 0.7101, + "step": 83570 + }, + { + "epoch": 0.5339687975160676, + "grad_norm": 0.7480769753456116, + "learning_rate": 8.343136523129269e-05, + "loss": 1.0079, + "step": 83580 + }, + { + "epoch": 0.5340326846658063, + "grad_norm": 0.569848895072937, + "learning_rate": 8.342763392396735e-05, + "loss": 0.9228, + "step": 83590 + 
}, + { + "epoch": 0.534096571815545, + "grad_norm": 0.8300278782844543, + "learning_rate": 8.342390227999683e-05, + "loss": 0.8459, + "step": 83600 + }, + { + "epoch": 0.5341604589652837, + "grad_norm": 0.7378689050674438, + "learning_rate": 8.342017029941868e-05, + "loss": 0.9019, + "step": 83610 + }, + { + "epoch": 0.5342243461150225, + "grad_norm": 1.1345140933990479, + "learning_rate": 8.34164379822705e-05, + "loss": 0.7472, + "step": 83620 + }, + { + "epoch": 0.5342882332647612, + "grad_norm": 0.5428297519683838, + "learning_rate": 8.341270532858989e-05, + "loss": 0.7232, + "step": 83630 + }, + { + "epoch": 0.5343521204144999, + "grad_norm": 0.8249925971031189, + "learning_rate": 8.340897233841443e-05, + "loss": 0.785, + "step": 83640 + }, + { + "epoch": 0.5344160075642386, + "grad_norm": 0.9514716863632202, + "learning_rate": 8.340523901178173e-05, + "loss": 0.7077, + "step": 83650 + }, + { + "epoch": 0.5344798947139773, + "grad_norm": 1.2342941761016846, + "learning_rate": 8.340150534872934e-05, + "loss": 0.7654, + "step": 83660 + }, + { + "epoch": 0.534543781863716, + "grad_norm": 0.7578923106193542, + "learning_rate": 8.339777134929492e-05, + "loss": 0.8597, + "step": 83670 + }, + { + "epoch": 0.5346076690134546, + "grad_norm": 0.7773808836936951, + "learning_rate": 8.339403701351604e-05, + "loss": 0.9918, + "step": 83680 + }, + { + "epoch": 0.5346715561631933, + "grad_norm": 1.0507415533065796, + "learning_rate": 8.339030234143032e-05, + "loss": 0.7686, + "step": 83690 + }, + { + "epoch": 0.534735443312932, + "grad_norm": 0.6321387887001038, + "learning_rate": 8.338656733307537e-05, + "loss": 0.9376, + "step": 83700 + }, + { + "epoch": 0.5347993304626707, + "grad_norm": 0.705500066280365, + "learning_rate": 8.33828319884888e-05, + "loss": 0.9138, + "step": 83710 + }, + { + "epoch": 0.5348632176124094, + "grad_norm": 0.745877206325531, + "learning_rate": 8.337909630770824e-05, + "loss": 1.2483, + "step": 83720 + }, + { + "epoch": 0.5349271047621481, + "grad_norm": 0.9086830615997314, + "learning_rate": 8.337536029077129e-05, + "loss": 0.6154, + "step": 83730 + }, + { + "epoch": 0.5349909919118868, + "grad_norm": 1.130573034286499, + "learning_rate": 8.337162393771559e-05, + "loss": 0.8188, + "step": 83740 + }, + { + "epoch": 0.5350548790616255, + "grad_norm": 0.8201401829719543, + "learning_rate": 8.336788724857878e-05, + "loss": 1.1955, + "step": 83750 + }, + { + "epoch": 0.5351187662113642, + "grad_norm": 1.0572373867034912, + "learning_rate": 8.336415022339847e-05, + "loss": 1.0602, + "step": 83760 + }, + { + "epoch": 0.5351826533611029, + "grad_norm": 0.4251170754432678, + "learning_rate": 8.33604128622123e-05, + "loss": 0.7514, + "step": 83770 + }, + { + "epoch": 0.5352465405108416, + "grad_norm": 1.0447115898132324, + "learning_rate": 8.335667516505791e-05, + "loss": 0.9743, + "step": 83780 + }, + { + "epoch": 0.5353104276605803, + "grad_norm": 0.8478367924690247, + "learning_rate": 8.335293713197296e-05, + "loss": 1.123, + "step": 83790 + }, + { + "epoch": 0.535374314810319, + "grad_norm": 0.8603829145431519, + "learning_rate": 8.334919876299507e-05, + "loss": 0.9385, + "step": 83800 + }, + { + "epoch": 0.5354382019600578, + "grad_norm": 0.719473659992218, + "learning_rate": 8.334546005816188e-05, + "loss": 0.8173, + "step": 83810 + }, + { + "epoch": 0.5355020891097965, + "grad_norm": 1.2602207660675049, + "learning_rate": 8.334172101751108e-05, + "loss": 1.4649, + "step": 83820 + }, + { + "epoch": 0.5355659762595352, + "grad_norm": 1.0611252784729004, + "learning_rate": 
8.33379816410803e-05, + "loss": 0.922, + "step": 83830 + }, + { + "epoch": 0.5356298634092739, + "grad_norm": 0.7457683682441711, + "learning_rate": 8.33342419289072e-05, + "loss": 0.9272, + "step": 83840 + }, + { + "epoch": 0.5356937505590126, + "grad_norm": 1.1285589933395386, + "learning_rate": 8.333050188102944e-05, + "loss": 0.9133, + "step": 83850 + }, + { + "epoch": 0.5357576377087513, + "grad_norm": 1.1243196725845337, + "learning_rate": 8.33267614974847e-05, + "loss": 0.6372, + "step": 83860 + }, + { + "epoch": 0.53582152485849, + "grad_norm": 1.4428707361221313, + "learning_rate": 8.332302077831065e-05, + "loss": 0.9235, + "step": 83870 + }, + { + "epoch": 0.5358854120082287, + "grad_norm": 0.7449108362197876, + "learning_rate": 8.331927972354492e-05, + "loss": 0.9763, + "step": 83880 + }, + { + "epoch": 0.5359492991579674, + "grad_norm": 0.6374861001968384, + "learning_rate": 8.331553833322526e-05, + "loss": 0.7703, + "step": 83890 + }, + { + "epoch": 0.5360131863077061, + "grad_norm": 1.0096155405044556, + "learning_rate": 8.331179660738927e-05, + "loss": 0.8562, + "step": 83900 + }, + { + "epoch": 0.5360770734574448, + "grad_norm": 0.7320453524589539, + "learning_rate": 8.330805454607468e-05, + "loss": 0.666, + "step": 83910 + }, + { + "epoch": 0.5361409606071835, + "grad_norm": 0.8086037635803223, + "learning_rate": 8.330431214931917e-05, + "loss": 0.8849, + "step": 83920 + }, + { + "epoch": 0.5362048477569221, + "grad_norm": 2.057863235473633, + "learning_rate": 8.330056941716043e-05, + "loss": 0.9219, + "step": 83930 + }, + { + "epoch": 0.5362687349066608, + "grad_norm": 0.6205108761787415, + "learning_rate": 8.329682634963614e-05, + "loss": 1.0129, + "step": 83940 + }, + { + "epoch": 0.5363326220563995, + "grad_norm": 1.2019091844558716, + "learning_rate": 8.3293082946784e-05, + "loss": 1.1293, + "step": 83950 + }, + { + "epoch": 0.5363965092061382, + "grad_norm": 0.6992289423942566, + "learning_rate": 8.328933920864172e-05, + "loss": 0.6816, + "step": 83960 + }, + { + "epoch": 0.536460396355877, + "grad_norm": 0.8456112742424011, + "learning_rate": 8.328559513524699e-05, + "loss": 0.9054, + "step": 83970 + }, + { + "epoch": 0.5365242835056157, + "grad_norm": 0.6546765565872192, + "learning_rate": 8.328185072663752e-05, + "loss": 0.8627, + "step": 83980 + }, + { + "epoch": 0.5365881706553544, + "grad_norm": 0.9863765835762024, + "learning_rate": 8.327810598285102e-05, + "loss": 1.2343, + "step": 83990 + }, + { + "epoch": 0.5366520578050931, + "grad_norm": 0.8402466773986816, + "learning_rate": 8.32743609039252e-05, + "loss": 0.8839, + "step": 84000 + }, + { + "epoch": 0.5367159449548318, + "grad_norm": 0.5946282744407654, + "learning_rate": 8.327061548989778e-05, + "loss": 0.8401, + "step": 84010 + }, + { + "epoch": 0.5367798321045705, + "grad_norm": 0.8258355855941772, + "learning_rate": 8.32668697408065e-05, + "loss": 1.094, + "step": 84020 + }, + { + "epoch": 0.5368437192543092, + "grad_norm": 0.779899537563324, + "learning_rate": 8.326312365668905e-05, + "loss": 1.0426, + "step": 84030 + }, + { + "epoch": 0.5369076064040479, + "grad_norm": 0.9077179431915283, + "learning_rate": 8.325937723758314e-05, + "loss": 0.9158, + "step": 84040 + }, + { + "epoch": 0.5369714935537866, + "grad_norm": 1.139228105545044, + "learning_rate": 8.325563048352655e-05, + "loss": 0.8906, + "step": 84050 + }, + { + "epoch": 0.5370353807035253, + "grad_norm": 0.8066197037696838, + "learning_rate": 8.3251883394557e-05, + "loss": 0.8026, + "step": 84060 + }, + { + "epoch": 0.537099267853264, + 
"grad_norm": 0.8473499417304993, + "learning_rate": 8.32481359707122e-05, + "loss": 0.5615, + "step": 84070 + }, + { + "epoch": 0.5371631550030027, + "grad_norm": 1.1238465309143066, + "learning_rate": 8.324438821202992e-05, + "loss": 0.814, + "step": 84080 + }, + { + "epoch": 0.5372270421527414, + "grad_norm": 0.8760488629341125, + "learning_rate": 8.324064011854789e-05, + "loss": 0.8522, + "step": 84090 + }, + { + "epoch": 0.5372909293024801, + "grad_norm": 1.5137993097305298, + "learning_rate": 8.323689169030384e-05, + "loss": 0.7777, + "step": 84100 + }, + { + "epoch": 0.5373548164522188, + "grad_norm": 1.2992900609970093, + "learning_rate": 8.323314292733556e-05, + "loss": 0.8892, + "step": 84110 + }, + { + "epoch": 0.5374187036019575, + "grad_norm": 0.8411065936088562, + "learning_rate": 8.322939382968077e-05, + "loss": 0.8524, + "step": 84120 + }, + { + "epoch": 0.5374825907516962, + "grad_norm": 0.8992130160331726, + "learning_rate": 8.322564439737723e-05, + "loss": 0.8281, + "step": 84130 + }, + { + "epoch": 0.537546477901435, + "grad_norm": 0.5751587152481079, + "learning_rate": 8.322189463046271e-05, + "loss": 0.749, + "step": 84140 + }, + { + "epoch": 0.5376103650511737, + "grad_norm": 0.6489611268043518, + "learning_rate": 8.321814452897498e-05, + "loss": 0.9997, + "step": 84150 + }, + { + "epoch": 0.5376742522009124, + "grad_norm": 0.6058949828147888, + "learning_rate": 8.321439409295179e-05, + "loss": 1.159, + "step": 84160 + }, + { + "epoch": 0.537738139350651, + "grad_norm": 0.779172420501709, + "learning_rate": 8.321064332243091e-05, + "loss": 0.8733, + "step": 84170 + }, + { + "epoch": 0.5378020265003897, + "grad_norm": 0.8683562278747559, + "learning_rate": 8.320689221745012e-05, + "loss": 0.7102, + "step": 84180 + }, + { + "epoch": 0.5378659136501284, + "grad_norm": 0.6446613073348999, + "learning_rate": 8.32031407780472e-05, + "loss": 0.6436, + "step": 84190 + }, + { + "epoch": 0.5379298007998671, + "grad_norm": 0.7266974449157715, + "learning_rate": 8.319938900425994e-05, + "loss": 0.8872, + "step": 84200 + }, + { + "epoch": 0.5379936879496058, + "grad_norm": 0.8739939332008362, + "learning_rate": 8.319563689612611e-05, + "loss": 0.8188, + "step": 84210 + }, + { + "epoch": 0.5380575750993445, + "grad_norm": 1.239883542060852, + "learning_rate": 8.319188445368349e-05, + "loss": 0.8107, + "step": 84220 + }, + { + "epoch": 0.5381214622490832, + "grad_norm": 1.1432856321334839, + "learning_rate": 8.318813167696987e-05, + "loss": 0.7252, + "step": 84230 + }, + { + "epoch": 0.5381853493988219, + "grad_norm": 1.283229947090149, + "learning_rate": 8.318437856602306e-05, + "loss": 0.8599, + "step": 84240 + }, + { + "epoch": 0.5382492365485606, + "grad_norm": 1.238756775856018, + "learning_rate": 8.318062512088087e-05, + "loss": 0.939, + "step": 84250 + }, + { + "epoch": 0.5383131236982993, + "grad_norm": 0.9360271096229553, + "learning_rate": 8.317687134158106e-05, + "loss": 0.9372, + "step": 84260 + }, + { + "epoch": 0.538377010848038, + "grad_norm": 0.6929467916488647, + "learning_rate": 8.317311722816145e-05, + "loss": 0.8145, + "step": 84270 + }, + { + "epoch": 0.5384408979977767, + "grad_norm": 1.410101294517517, + "learning_rate": 8.316936278065986e-05, + "loss": 0.6732, + "step": 84280 + }, + { + "epoch": 0.5385047851475154, + "grad_norm": 1.029524803161621, + "learning_rate": 8.316560799911408e-05, + "loss": 1.0576, + "step": 84290 + }, + { + "epoch": 0.5385686722972541, + "grad_norm": 1.0988661050796509, + "learning_rate": 8.316185288356194e-05, + "loss": 0.7547, + 
"step": 84300 + }, + { + "epoch": 0.5386325594469928, + "grad_norm": 0.8414357304573059, + "learning_rate": 8.315809743404126e-05, + "loss": 0.9667, + "step": 84310 + }, + { + "epoch": 0.5386964465967315, + "grad_norm": 0.6246783137321472, + "learning_rate": 8.315434165058983e-05, + "loss": 0.7647, + "step": 84320 + }, + { + "epoch": 0.5387603337464703, + "grad_norm": 0.7971277236938477, + "learning_rate": 8.315058553324551e-05, + "loss": 1.2018, + "step": 84330 + }, + { + "epoch": 0.538824220896209, + "grad_norm": 0.7713975310325623, + "learning_rate": 8.314682908204612e-05, + "loss": 0.9313, + "step": 84340 + }, + { + "epoch": 0.5388881080459477, + "grad_norm": 0.6083114147186279, + "learning_rate": 8.314307229702949e-05, + "loss": 0.8577, + "step": 84350 + }, + { + "epoch": 0.5389519951956864, + "grad_norm": 1.0030479431152344, + "learning_rate": 8.313931517823344e-05, + "loss": 0.88, + "step": 84360 + }, + { + "epoch": 0.5390158823454251, + "grad_norm": 0.9634591341018677, + "learning_rate": 8.313555772569581e-05, + "loss": 1.137, + "step": 84370 + }, + { + "epoch": 0.5390797694951638, + "grad_norm": 0.6758565902709961, + "learning_rate": 8.313179993945445e-05, + "loss": 0.8548, + "step": 84380 + }, + { + "epoch": 0.5391436566449025, + "grad_norm": 1.2440674304962158, + "learning_rate": 8.312804181954721e-05, + "loss": 0.9583, + "step": 84390 + }, + { + "epoch": 0.5392075437946412, + "grad_norm": 0.7709629535675049, + "learning_rate": 8.312428336601193e-05, + "loss": 0.9656, + "step": 84400 + }, + { + "epoch": 0.5392714309443798, + "grad_norm": 0.8080304265022278, + "learning_rate": 8.312052457888646e-05, + "loss": 0.8182, + "step": 84410 + }, + { + "epoch": 0.5393353180941185, + "grad_norm": 0.7901466488838196, + "learning_rate": 8.311676545820865e-05, + "loss": 0.8039, + "step": 84420 + }, + { + "epoch": 0.5393992052438572, + "grad_norm": 0.6051963567733765, + "learning_rate": 8.311338196444268e-05, + "loss": 0.9002, + "step": 84430 + }, + { + "epoch": 0.5394630923935959, + "grad_norm": 0.8808472156524658, + "learning_rate": 8.310962221011971e-05, + "loss": 0.8492, + "step": 84440 + }, + { + "epoch": 0.5395269795433346, + "grad_norm": 0.7070138454437256, + "learning_rate": 8.310586212235423e-05, + "loss": 1.134, + "step": 84450 + }, + { + "epoch": 0.5395908666930733, + "grad_norm": 0.7789306640625, + "learning_rate": 8.310210170118406e-05, + "loss": 0.7914, + "step": 84460 + }, + { + "epoch": 0.539654753842812, + "grad_norm": 0.9255892634391785, + "learning_rate": 8.30983409466471e-05, + "loss": 0.8324, + "step": 84470 + }, + { + "epoch": 0.5397186409925507, + "grad_norm": 1.0117281675338745, + "learning_rate": 8.309457985878122e-05, + "loss": 0.9676, + "step": 84480 + }, + { + "epoch": 0.5397825281422894, + "grad_norm": 0.7408267855644226, + "learning_rate": 8.309081843762428e-05, + "loss": 0.829, + "step": 84490 + }, + { + "epoch": 0.5398464152920281, + "grad_norm": 0.6966201663017273, + "learning_rate": 8.308705668321417e-05, + "loss": 0.9113, + "step": 84500 + }, + { + "epoch": 0.5399103024417669, + "grad_norm": 0.7605626583099365, + "learning_rate": 8.308329459558877e-05, + "loss": 0.9392, + "step": 84510 + }, + { + "epoch": 0.5399741895915056, + "grad_norm": 0.7314460277557373, + "learning_rate": 8.307953217478599e-05, + "loss": 0.7721, + "step": 84520 + }, + { + "epoch": 0.5400380767412443, + "grad_norm": 0.8111374974250793, + "learning_rate": 8.30757694208437e-05, + "loss": 0.6695, + "step": 84530 + }, + { + "epoch": 0.540101963890983, + "grad_norm": 0.7169995903968811, + 
"learning_rate": 8.307200633379978e-05, + "loss": 0.8237, + "step": 84540 + }, + { + "epoch": 0.5401658510407217, + "grad_norm": 0.8992086052894592, + "learning_rate": 8.306824291369216e-05, + "loss": 0.7942, + "step": 84550 + }, + { + "epoch": 0.5402297381904604, + "grad_norm": 0.5550522804260254, + "learning_rate": 8.306447916055871e-05, + "loss": 0.8189, + "step": 84560 + }, + { + "epoch": 0.5402936253401991, + "grad_norm": 1.1253445148468018, + "learning_rate": 8.306071507443737e-05, + "loss": 0.8835, + "step": 84570 + }, + { + "epoch": 0.5403575124899378, + "grad_norm": 1.120518684387207, + "learning_rate": 8.305695065536602e-05, + "loss": 1.099, + "step": 84580 + }, + { + "epoch": 0.5404213996396765, + "grad_norm": 1.3610060214996338, + "learning_rate": 8.305318590338258e-05, + "loss": 0.9345, + "step": 84590 + }, + { + "epoch": 0.5404852867894152, + "grad_norm": 0.8917859792709351, + "learning_rate": 8.304942081852496e-05, + "loss": 1.2491, + "step": 84600 + }, + { + "epoch": 0.5405491739391539, + "grad_norm": 0.734668493270874, + "learning_rate": 8.304565540083107e-05, + "loss": 1.0179, + "step": 84610 + }, + { + "epoch": 0.5406130610888926, + "grad_norm": 0.808816134929657, + "learning_rate": 8.304188965033885e-05, + "loss": 0.9507, + "step": 84620 + }, + { + "epoch": 0.5406769482386313, + "grad_norm": 0.8101891875267029, + "learning_rate": 8.303812356708622e-05, + "loss": 0.7707, + "step": 84630 + }, + { + "epoch": 0.54074083538837, + "grad_norm": 0.5614955425262451, + "learning_rate": 8.303435715111111e-05, + "loss": 0.9146, + "step": 84640 + }, + { + "epoch": 0.5408047225381087, + "grad_norm": 1.0607208013534546, + "learning_rate": 8.303059040245144e-05, + "loss": 0.8684, + "step": 84650 + }, + { + "epoch": 0.5408686096878473, + "grad_norm": 0.9058458805084229, + "learning_rate": 8.302682332114515e-05, + "loss": 0.7029, + "step": 84660 + }, + { + "epoch": 0.540932496837586, + "grad_norm": 0.9082807898521423, + "learning_rate": 8.302305590723016e-05, + "loss": 0.8539, + "step": 84670 + }, + { + "epoch": 0.5409963839873247, + "grad_norm": 0.8213421702384949, + "learning_rate": 8.301928816074445e-05, + "loss": 0.9783, + "step": 84680 + }, + { + "epoch": 0.5410602711370635, + "grad_norm": 0.7759522795677185, + "learning_rate": 8.301552008172593e-05, + "loss": 0.6989, + "step": 84690 + }, + { + "epoch": 0.5411241582868022, + "grad_norm": 0.574531078338623, + "learning_rate": 8.301175167021256e-05, + "loss": 0.9258, + "step": 84700 + }, + { + "epoch": 0.5411880454365409, + "grad_norm": 0.8771001100540161, + "learning_rate": 8.300798292624228e-05, + "loss": 1.0307, + "step": 84710 + }, + { + "epoch": 0.5412519325862796, + "grad_norm": 1.086178183555603, + "learning_rate": 8.300421384985309e-05, + "loss": 1.1992, + "step": 84720 + }, + { + "epoch": 0.5413158197360183, + "grad_norm": 1.1887942552566528, + "learning_rate": 8.300044444108288e-05, + "loss": 0.7615, + "step": 84730 + }, + { + "epoch": 0.541379706885757, + "grad_norm": 0.909010648727417, + "learning_rate": 8.299667469996966e-05, + "loss": 0.9202, + "step": 84740 + }, + { + "epoch": 0.5414435940354957, + "grad_norm": 0.6186991930007935, + "learning_rate": 8.299290462655138e-05, + "loss": 0.7071, + "step": 84750 + }, + { + "epoch": 0.5415074811852344, + "grad_norm": 0.7226212620735168, + "learning_rate": 8.2989134220866e-05, + "loss": 1.0302, + "step": 84760 + }, + { + "epoch": 0.5415713683349731, + "grad_norm": 1.6351087093353271, + "learning_rate": 8.298536348295152e-05, + "loss": 0.7616, + "step": 84770 + }, + { + "epoch": 
0.5416352554847118, + "grad_norm": 2.3202216625213623, + "learning_rate": 8.298159241284587e-05, + "loss": 0.8703, + "step": 84780 + }, + { + "epoch": 0.5416991426344505, + "grad_norm": 0.5504477620124817, + "learning_rate": 8.297782101058706e-05, + "loss": 1.0846, + "step": 84790 + }, + { + "epoch": 0.5417630297841892, + "grad_norm": 0.846871554851532, + "learning_rate": 8.297404927621306e-05, + "loss": 0.9876, + "step": 84800 + }, + { + "epoch": 0.5418269169339279, + "grad_norm": 0.9501508474349976, + "learning_rate": 8.297027720976185e-05, + "loss": 0.779, + "step": 84810 + }, + { + "epoch": 0.5418908040836666, + "grad_norm": 0.6770570278167725, + "learning_rate": 8.296650481127144e-05, + "loss": 0.741, + "step": 84820 + }, + { + "epoch": 0.5419546912334053, + "grad_norm": 1.0204015970230103, + "learning_rate": 8.296273208077981e-05, + "loss": 0.8651, + "step": 84830 + }, + { + "epoch": 0.542018578383144, + "grad_norm": 1.2423951625823975, + "learning_rate": 8.295895901832493e-05, + "loss": 0.9325, + "step": 84840 + }, + { + "epoch": 0.5420824655328828, + "grad_norm": 1.19427490234375, + "learning_rate": 8.295518562394484e-05, + "loss": 0.9283, + "step": 84850 + }, + { + "epoch": 0.5421463526826215, + "grad_norm": 0.9197470545768738, + "learning_rate": 8.29514118976775e-05, + "loss": 0.7334, + "step": 84860 + }, + { + "epoch": 0.5422102398323602, + "grad_norm": 1.0136433839797974, + "learning_rate": 8.294763783956096e-05, + "loss": 0.6607, + "step": 84870 + }, + { + "epoch": 0.5422741269820989, + "grad_norm": 1.1331266164779663, + "learning_rate": 8.294386344963319e-05, + "loss": 0.6727, + "step": 84880 + }, + { + "epoch": 0.5423380141318376, + "grad_norm": 1.1634505987167358, + "learning_rate": 8.294008872793222e-05, + "loss": 1.0549, + "step": 84890 + }, + { + "epoch": 0.5424019012815762, + "grad_norm": 0.885592520236969, + "learning_rate": 8.293631367449605e-05, + "loss": 0.9175, + "step": 84900 + }, + { + "epoch": 0.5424657884313149, + "grad_norm": 0.7307121753692627, + "learning_rate": 8.293253828936271e-05, + "loss": 0.8359, + "step": 84910 + }, + { + "epoch": 0.5425296755810536, + "grad_norm": 1.0684562921524048, + "learning_rate": 8.292876257257022e-05, + "loss": 0.9552, + "step": 84920 + }, + { + "epoch": 0.5425935627307923, + "grad_norm": 0.9303468465805054, + "learning_rate": 8.29249865241566e-05, + "loss": 0.8814, + "step": 84930 + }, + { + "epoch": 0.542657449880531, + "grad_norm": 1.3686809539794922, + "learning_rate": 8.292121014415987e-05, + "loss": 1.0071, + "step": 84940 + }, + { + "epoch": 0.5427213370302697, + "grad_norm": 0.9795172214508057, + "learning_rate": 8.29174334326181e-05, + "loss": 0.8684, + "step": 84950 + }, + { + "epoch": 0.5427852241800084, + "grad_norm": 0.7319976687431335, + "learning_rate": 8.29136563895693e-05, + "loss": 0.8488, + "step": 84960 + }, + { + "epoch": 0.5428491113297471, + "grad_norm": 0.7034667730331421, + "learning_rate": 8.290987901505148e-05, + "loss": 0.8596, + "step": 84970 + }, + { + "epoch": 0.5429129984794858, + "grad_norm": 1.3945845365524292, + "learning_rate": 8.290610130910272e-05, + "loss": 0.759, + "step": 84980 + }, + { + "epoch": 0.5429768856292245, + "grad_norm": 0.878729522228241, + "learning_rate": 8.290232327176104e-05, + "loss": 1.0464, + "step": 84990 + }, + { + "epoch": 0.5430407727789632, + "grad_norm": 1.176857590675354, + "learning_rate": 8.289854490306453e-05, + "loss": 0.9618, + "step": 85000 + }, + { + "epoch": 0.5431046599287019, + "grad_norm": 1.061789870262146, + "learning_rate": 8.289476620305118e-05, 
+ "loss": 0.7039, + "step": 85010 + }, + { + "epoch": 0.5431685470784406, + "grad_norm": 1.1933741569519043, + "learning_rate": 8.289098717175909e-05, + "loss": 0.9763, + "step": 85020 + }, + { + "epoch": 0.5432324342281794, + "grad_norm": 1.391781210899353, + "learning_rate": 8.28872078092263e-05, + "loss": 1.044, + "step": 85030 + }, + { + "epoch": 0.5432963213779181, + "grad_norm": 0.5679248571395874, + "learning_rate": 8.288342811549088e-05, + "loss": 1.1061, + "step": 85040 + }, + { + "epoch": 0.5433602085276568, + "grad_norm": 0.892066240310669, + "learning_rate": 8.28796480905909e-05, + "loss": 0.8354, + "step": 85050 + }, + { + "epoch": 0.5434240956773955, + "grad_norm": 0.6071507930755615, + "learning_rate": 8.28758677345644e-05, + "loss": 1.047, + "step": 85060 + }, + { + "epoch": 0.5434879828271342, + "grad_norm": 0.8333146572113037, + "learning_rate": 8.287208704744946e-05, + "loss": 0.7995, + "step": 85070 + }, + { + "epoch": 0.5435518699768729, + "grad_norm": 1.3047791719436646, + "learning_rate": 8.28683060292842e-05, + "loss": 1.1147, + "step": 85080 + }, + { + "epoch": 0.5436157571266116, + "grad_norm": 0.8263481259346008, + "learning_rate": 8.286452468010664e-05, + "loss": 0.9913, + "step": 85090 + }, + { + "epoch": 0.5436796442763503, + "grad_norm": 0.7588023543357849, + "learning_rate": 8.28607429999549e-05, + "loss": 0.8798, + "step": 85100 + }, + { + "epoch": 0.543743531426089, + "grad_norm": 0.6401307582855225, + "learning_rate": 8.285696098886704e-05, + "loss": 1.1625, + "step": 85110 + }, + { + "epoch": 0.5438074185758277, + "grad_norm": 1.0735725164413452, + "learning_rate": 8.285317864688116e-05, + "loss": 0.836, + "step": 85120 + }, + { + "epoch": 0.5438713057255664, + "grad_norm": 0.8113425970077515, + "learning_rate": 8.284939597403533e-05, + "loss": 0.7685, + "step": 85130 + }, + { + "epoch": 0.543935192875305, + "grad_norm": 0.911358654499054, + "learning_rate": 8.28456129703677e-05, + "loss": 0.9308, + "step": 85140 + }, + { + "epoch": 0.5439990800250437, + "grad_norm": 1.186699390411377, + "learning_rate": 8.284182963591631e-05, + "loss": 0.8727, + "step": 85150 + }, + { + "epoch": 0.5440629671747824, + "grad_norm": 1.335977554321289, + "learning_rate": 8.283804597071928e-05, + "loss": 1.0234, + "step": 85160 + }, + { + "epoch": 0.5441268543245211, + "grad_norm": 1.1093186140060425, + "learning_rate": 8.283426197481473e-05, + "loss": 1.1973, + "step": 85170 + }, + { + "epoch": 0.5441907414742598, + "grad_norm": 0.4914005994796753, + "learning_rate": 8.283047764824075e-05, + "loss": 0.9727, + "step": 85180 + }, + { + "epoch": 0.5442546286239985, + "grad_norm": 0.5826841592788696, + "learning_rate": 8.282669299103544e-05, + "loss": 0.9037, + "step": 85190 + }, + { + "epoch": 0.5443185157737372, + "grad_norm": 1.0560849905014038, + "learning_rate": 8.282290800323697e-05, + "loss": 1.0382, + "step": 85200 + }, + { + "epoch": 0.544382402923476, + "grad_norm": 0.6486173272132874, + "learning_rate": 8.28191226848834e-05, + "loss": 0.913, + "step": 85210 + }, + { + "epoch": 0.5444462900732147, + "grad_norm": 1.390495777130127, + "learning_rate": 8.281533703601288e-05, + "loss": 0.9475, + "step": 85220 + }, + { + "epoch": 0.5445101772229534, + "grad_norm": 0.9394730925559998, + "learning_rate": 8.28115510566635e-05, + "loss": 0.8956, + "step": 85230 + }, + { + "epoch": 0.5445740643726921, + "grad_norm": 1.8664871454238892, + "learning_rate": 8.280776474687343e-05, + "loss": 0.8971, + "step": 85240 + }, + { + "epoch": 0.5446379515224308, + "grad_norm": 
1.337372899055481, + "learning_rate": 8.28039781066808e-05, + "loss": 0.9751, + "step": 85250 + }, + { + "epoch": 0.5447018386721695, + "grad_norm": 0.7601255178451538, + "learning_rate": 8.280019113612371e-05, + "loss": 0.8855, + "step": 85260 + }, + { + "epoch": 0.5447657258219082, + "grad_norm": 0.9285007119178772, + "learning_rate": 8.279640383524034e-05, + "loss": 0.6376, + "step": 85270 + }, + { + "epoch": 0.5448296129716469, + "grad_norm": 0.9922348260879517, + "learning_rate": 8.279261620406881e-05, + "loss": 1.2103, + "step": 85280 + }, + { + "epoch": 0.5448935001213856, + "grad_norm": 1.2273433208465576, + "learning_rate": 8.278882824264726e-05, + "loss": 0.7563, + "step": 85290 + }, + { + "epoch": 0.5449573872711243, + "grad_norm": 1.0333365201950073, + "learning_rate": 8.278503995101383e-05, + "loss": 1.0593, + "step": 85300 + }, + { + "epoch": 0.545021274420863, + "grad_norm": 0.8431949615478516, + "learning_rate": 8.278125132920669e-05, + "loss": 1.0156, + "step": 85310 + }, + { + "epoch": 0.5450851615706017, + "grad_norm": 0.8159672617912292, + "learning_rate": 8.277746237726401e-05, + "loss": 0.8822, + "step": 85320 + }, + { + "epoch": 0.5451490487203404, + "grad_norm": 0.8089435696601868, + "learning_rate": 8.27736730952239e-05, + "loss": 0.9965, + "step": 85330 + }, + { + "epoch": 0.5452129358700791, + "grad_norm": 1.1019364595413208, + "learning_rate": 8.276988348312456e-05, + "loss": 0.6972, + "step": 85340 + }, + { + "epoch": 0.5452768230198178, + "grad_norm": 0.755990207195282, + "learning_rate": 8.276609354100414e-05, + "loss": 0.941, + "step": 85350 + }, + { + "epoch": 0.5453407101695565, + "grad_norm": 1.2020833492279053, + "learning_rate": 8.276230326890081e-05, + "loss": 0.8955, + "step": 85360 + }, + { + "epoch": 0.5454045973192952, + "grad_norm": 0.9152243733406067, + "learning_rate": 8.275851266685276e-05, + "loss": 0.9033, + "step": 85370 + }, + { + "epoch": 0.545468484469034, + "grad_norm": 0.9686945080757141, + "learning_rate": 8.275472173489814e-05, + "loss": 0.967, + "step": 85380 + }, + { + "epoch": 0.5455323716187725, + "grad_norm": 0.7973108887672424, + "learning_rate": 8.275093047307511e-05, + "loss": 0.9112, + "step": 85390 + }, + { + "epoch": 0.5455962587685113, + "grad_norm": 0.8229728937149048, + "learning_rate": 8.27471388814219e-05, + "loss": 0.9667, + "step": 85400 + }, + { + "epoch": 0.54566014591825, + "grad_norm": 2.0562517642974854, + "learning_rate": 8.274334695997668e-05, + "loss": 0.8901, + "step": 85410 + }, + { + "epoch": 0.5457240330679887, + "grad_norm": 0.8347399830818176, + "learning_rate": 8.273955470877762e-05, + "loss": 0.6852, + "step": 85420 + }, + { + "epoch": 0.5457879202177274, + "grad_norm": 0.8529654741287231, + "learning_rate": 8.273576212786292e-05, + "loss": 0.7934, + "step": 85430 + }, + { + "epoch": 0.5458518073674661, + "grad_norm": 1.5173248052597046, + "learning_rate": 8.273196921727075e-05, + "loss": 1.0872, + "step": 85440 + }, + { + "epoch": 0.5459156945172048, + "grad_norm": 0.9968129396438599, + "learning_rate": 8.272817597703936e-05, + "loss": 0.831, + "step": 85450 + }, + { + "epoch": 0.5459795816669435, + "grad_norm": 0.980364978313446, + "learning_rate": 8.272438240720692e-05, + "loss": 0.993, + "step": 85460 + }, + { + "epoch": 0.5460434688166822, + "grad_norm": 0.8081022500991821, + "learning_rate": 8.272058850781164e-05, + "loss": 0.8859, + "step": 85470 + }, + { + "epoch": 0.5461073559664209, + "grad_norm": 0.5837751030921936, + "learning_rate": 8.271679427889172e-05, + "loss": 0.8914, + "step": 85480 
+ }, + { + "epoch": 0.5461712431161596, + "grad_norm": 1.0083229541778564, + "learning_rate": 8.271299972048538e-05, + "loss": 1.0343, + "step": 85490 + }, + { + "epoch": 0.5462351302658983, + "grad_norm": 1.3048707246780396, + "learning_rate": 8.270920483263082e-05, + "loss": 1.014, + "step": 85500 + }, + { + "epoch": 0.546299017415637, + "grad_norm": 0.6506006121635437, + "learning_rate": 8.27054096153663e-05, + "loss": 0.941, + "step": 85510 + }, + { + "epoch": 0.5463629045653757, + "grad_norm": 0.6416545510292053, + "learning_rate": 8.270161406872998e-05, + "loss": 1.053, + "step": 85520 + }, + { + "epoch": 0.5464267917151144, + "grad_norm": 0.9509826302528381, + "learning_rate": 8.269781819276015e-05, + "loss": 1.2414, + "step": 85530 + }, + { + "epoch": 0.5464906788648531, + "grad_norm": 0.8895853757858276, + "learning_rate": 8.269402198749496e-05, + "loss": 0.7903, + "step": 85540 + }, + { + "epoch": 0.5465545660145918, + "grad_norm": 0.7762950658798218, + "learning_rate": 8.269022545297272e-05, + "loss": 1.1589, + "step": 85550 + }, + { + "epoch": 0.5466184531643306, + "grad_norm": 1.1687923669815063, + "learning_rate": 8.268642858923161e-05, + "loss": 0.7478, + "step": 85560 + }, + { + "epoch": 0.5466823403140693, + "grad_norm": 0.8369365930557251, + "learning_rate": 8.268263139630989e-05, + "loss": 0.7321, + "step": 85570 + }, + { + "epoch": 0.546746227463808, + "grad_norm": 1.429382085800171, + "learning_rate": 8.26788338742458e-05, + "loss": 0.9934, + "step": 85580 + }, + { + "epoch": 0.5468101146135467, + "grad_norm": 0.798319935798645, + "learning_rate": 8.267503602307758e-05, + "loss": 1.0856, + "step": 85590 + }, + { + "epoch": 0.5468740017632854, + "grad_norm": 0.8369146585464478, + "learning_rate": 8.267123784284348e-05, + "loss": 1.1573, + "step": 85600 + }, + { + "epoch": 0.5469378889130241, + "grad_norm": 0.8850108981132507, + "learning_rate": 8.266743933358176e-05, + "loss": 0.9553, + "step": 85610 + }, + { + "epoch": 0.5470017760627628, + "grad_norm": 1.494284987449646, + "learning_rate": 8.266364049533065e-05, + "loss": 0.8582, + "step": 85620 + }, + { + "epoch": 0.5470656632125014, + "grad_norm": 1.0392504930496216, + "learning_rate": 8.265984132812843e-05, + "loss": 0.9366, + "step": 85630 + }, + { + "epoch": 0.5471295503622401, + "grad_norm": 1.1216447353363037, + "learning_rate": 8.265604183201335e-05, + "loss": 0.7129, + "step": 85640 + }, + { + "epoch": 0.5471934375119788, + "grad_norm": 0.9301803708076477, + "learning_rate": 8.265224200702368e-05, + "loss": 0.6782, + "step": 85650 + }, + { + "epoch": 0.5472573246617175, + "grad_norm": 0.8470297455787659, + "learning_rate": 8.264844185319767e-05, + "loss": 0.9429, + "step": 85660 + }, + { + "epoch": 0.5473212118114562, + "grad_norm": 0.5491040945053101, + "learning_rate": 8.264464137057361e-05, + "loss": 0.896, + "step": 85670 + }, + { + "epoch": 0.5473850989611949, + "grad_norm": 2.296058416366577, + "learning_rate": 8.264084055918979e-05, + "loss": 0.9733, + "step": 85680 + }, + { + "epoch": 0.5474489861109336, + "grad_norm": 0.6744887828826904, + "learning_rate": 8.263703941908445e-05, + "loss": 0.8849, + "step": 85690 + }, + { + "epoch": 0.5475128732606723, + "grad_norm": 0.819434404373169, + "learning_rate": 8.26332379502959e-05, + "loss": 0.9268, + "step": 85700 + }, + { + "epoch": 0.547576760410411, + "grad_norm": 0.725577175617218, + "learning_rate": 8.26294361528624e-05, + "loss": 0.7676, + "step": 85710 + }, + { + "epoch": 0.5476406475601497, + "grad_norm": 0.8266330361366272, + "learning_rate": 
8.262563402682226e-05, + "loss": 0.7853, + "step": 85720 + }, + { + "epoch": 0.5477045347098884, + "grad_norm": 0.8885651230812073, + "learning_rate": 8.262183157221375e-05, + "loss": 0.9026, + "step": 85730 + }, + { + "epoch": 0.5477684218596272, + "grad_norm": 0.5924326777458191, + "learning_rate": 8.261802878907518e-05, + "loss": 0.8313, + "step": 85740 + }, + { + "epoch": 0.5478323090093659, + "grad_norm": 0.5639459490776062, + "learning_rate": 8.261422567744484e-05, + "loss": 0.8063, + "step": 85750 + }, + { + "epoch": 0.5478961961591046, + "grad_norm": 1.0976332426071167, + "learning_rate": 8.261042223736101e-05, + "loss": 0.7802, + "step": 85760 + }, + { + "epoch": 0.5479600833088433, + "grad_norm": 0.8617295622825623, + "learning_rate": 8.260661846886205e-05, + "loss": 0.8656, + "step": 85770 + }, + { + "epoch": 0.548023970458582, + "grad_norm": 0.8581666946411133, + "learning_rate": 8.260281437198622e-05, + "loss": 0.84, + "step": 85780 + }, + { + "epoch": 0.5480878576083207, + "grad_norm": 1.351036548614502, + "learning_rate": 8.259900994677185e-05, + "loss": 0.7633, + "step": 85790 + }, + { + "epoch": 0.5481517447580594, + "grad_norm": 0.803022563457489, + "learning_rate": 8.259520519325725e-05, + "loss": 0.8321, + "step": 85800 + }, + { + "epoch": 0.5482156319077981, + "grad_norm": 1.7103533744812012, + "learning_rate": 8.259140011148073e-05, + "loss": 1.2177, + "step": 85810 + }, + { + "epoch": 0.5482795190575368, + "grad_norm": 0.7438388466835022, + "learning_rate": 8.258759470148061e-05, + "loss": 0.7835, + "step": 85820 + }, + { + "epoch": 0.5483434062072755, + "grad_norm": 0.5970612168312073, + "learning_rate": 8.258378896329521e-05, + "loss": 0.8907, + "step": 85830 + }, + { + "epoch": 0.5484072933570142, + "grad_norm": 1.2347373962402344, + "learning_rate": 8.257998289696289e-05, + "loss": 0.7738, + "step": 85840 + }, + { + "epoch": 0.5484711805067529, + "grad_norm": 0.8404068350791931, + "learning_rate": 8.257617650252194e-05, + "loss": 0.9716, + "step": 85850 + }, + { + "epoch": 0.5485350676564916, + "grad_norm": 0.6082412600517273, + "learning_rate": 8.257236978001071e-05, + "loss": 0.8301, + "step": 85860 + }, + { + "epoch": 0.5485989548062302, + "grad_norm": 1.7358567714691162, + "learning_rate": 8.256856272946756e-05, + "loss": 0.8836, + "step": 85870 + }, + { + "epoch": 0.5486628419559689, + "grad_norm": 1.0207961797714233, + "learning_rate": 8.256475535093077e-05, + "loss": 0.8748, + "step": 85880 + }, + { + "epoch": 0.5487267291057076, + "grad_norm": 0.6568586826324463, + "learning_rate": 8.256094764443876e-05, + "loss": 0.8192, + "step": 85890 + }, + { + "epoch": 0.5487906162554463, + "grad_norm": 1.5999755859375, + "learning_rate": 8.255713961002981e-05, + "loss": 0.8331, + "step": 85900 + }, + { + "epoch": 0.548854503405185, + "grad_norm": 0.7200731635093689, + "learning_rate": 8.255333124774231e-05, + "loss": 0.9032, + "step": 85910 + }, + { + "epoch": 0.5489183905549238, + "grad_norm": 0.9791352152824402, + "learning_rate": 8.254952255761458e-05, + "loss": 1.2814, + "step": 85920 + }, + { + "epoch": 0.5489822777046625, + "grad_norm": 0.7664794921875, + "learning_rate": 8.254571353968504e-05, + "loss": 0.9206, + "step": 85930 + }, + { + "epoch": 0.5490461648544012, + "grad_norm": 0.8306074738502502, + "learning_rate": 8.254190419399197e-05, + "loss": 1.034, + "step": 85940 + }, + { + "epoch": 0.5491100520041399, + "grad_norm": 0.9220788478851318, + "learning_rate": 8.25380945205738e-05, + "loss": 1.0287, + "step": 85950 + }, + { + "epoch": 
0.5491739391538786, + "grad_norm": 0.8465439081192017, + "learning_rate": 8.253428451946885e-05, + "loss": 0.6528, + "step": 85960 + }, + { + "epoch": 0.5492378263036173, + "grad_norm": 1.0008618831634521, + "learning_rate": 8.253047419071551e-05, + "loss": 1.0279, + "step": 85970 + }, + { + "epoch": 0.549301713453356, + "grad_norm": 1.0628058910369873, + "learning_rate": 8.252666353435217e-05, + "loss": 1.0874, + "step": 85980 + }, + { + "epoch": 0.5493656006030947, + "grad_norm": 0.6217834949493408, + "learning_rate": 8.252285255041717e-05, + "loss": 0.9456, + "step": 85990 + }, + { + "epoch": 0.5494294877528334, + "grad_norm": 0.46333763003349304, + "learning_rate": 8.251904123894892e-05, + "loss": 0.8225, + "step": 86000 + }, + { + "epoch": 0.5494933749025721, + "grad_norm": 0.8038020730018616, + "learning_rate": 8.251522959998577e-05, + "loss": 0.8062, + "step": 86010 + }, + { + "epoch": 0.5495572620523108, + "grad_norm": 1.191167950630188, + "learning_rate": 8.251141763356614e-05, + "loss": 0.8599, + "step": 86020 + }, + { + "epoch": 0.5496211492020495, + "grad_norm": 0.8158173561096191, + "learning_rate": 8.25076053397284e-05, + "loss": 0.9257, + "step": 86030 + }, + { + "epoch": 0.5496850363517882, + "grad_norm": 0.7656988501548767, + "learning_rate": 8.250379271851098e-05, + "loss": 0.7153, + "step": 86040 + }, + { + "epoch": 0.5497489235015269, + "grad_norm": 0.7358280420303345, + "learning_rate": 8.249997976995223e-05, + "loss": 1.0252, + "step": 86050 + }, + { + "epoch": 0.5498128106512656, + "grad_norm": 1.4587607383728027, + "learning_rate": 8.249616649409057e-05, + "loss": 0.8815, + "step": 86060 + }, + { + "epoch": 0.5498766978010043, + "grad_norm": 0.657911479473114, + "learning_rate": 8.24923528909644e-05, + "loss": 1.1022, + "step": 86070 + }, + { + "epoch": 0.549940584950743, + "grad_norm": 0.8566390872001648, + "learning_rate": 8.248853896061213e-05, + "loss": 0.7702, + "step": 86080 + }, + { + "epoch": 0.5500044721004818, + "grad_norm": 0.9400178790092468, + "learning_rate": 8.248472470307216e-05, + "loss": 0.811, + "step": 86090 + }, + { + "epoch": 0.5500683592502205, + "grad_norm": 1.0126426219940186, + "learning_rate": 8.24809101183829e-05, + "loss": 0.8001, + "step": 86100 + }, + { + "epoch": 0.5501322463999591, + "grad_norm": 1.513357162475586, + "learning_rate": 8.24770952065828e-05, + "loss": 0.9136, + "step": 86110 + }, + { + "epoch": 0.5501961335496978, + "grad_norm": 1.1465846300125122, + "learning_rate": 8.247327996771024e-05, + "loss": 0.8718, + "step": 86120 + }, + { + "epoch": 0.5502600206994365, + "grad_norm": 0.9992212057113647, + "learning_rate": 8.246946440180365e-05, + "loss": 0.7195, + "step": 86130 + }, + { + "epoch": 0.5503239078491752, + "grad_norm": 0.9585177302360535, + "learning_rate": 8.246564850890148e-05, + "loss": 0.8262, + "step": 86140 + }, + { + "epoch": 0.5503877949989139, + "grad_norm": 0.9354924559593201, + "learning_rate": 8.246183228904212e-05, + "loss": 1.3068, + "step": 86150 + }, + { + "epoch": 0.5504516821486526, + "grad_norm": 0.7876535058021545, + "learning_rate": 8.245801574226403e-05, + "loss": 0.9991, + "step": 86160 + }, + { + "epoch": 0.5505155692983913, + "grad_norm": 1.089166522026062, + "learning_rate": 8.245419886860566e-05, + "loss": 0.8906, + "step": 86170 + }, + { + "epoch": 0.55057945644813, + "grad_norm": 2.4468488693237305, + "learning_rate": 8.245038166810543e-05, + "loss": 0.9857, + "step": 86180 + }, + { + "epoch": 0.5506433435978687, + "grad_norm": 0.6112414598464966, + "learning_rate": 
8.244656414080176e-05, + "loss": 0.8398, + "step": 86190 + }, + { + "epoch": 0.5507072307476074, + "grad_norm": 0.5781684517860413, + "learning_rate": 8.244274628673314e-05, + "loss": 0.8127, + "step": 86200 + }, + { + "epoch": 0.5507711178973461, + "grad_norm": 1.1223702430725098, + "learning_rate": 8.243892810593798e-05, + "loss": 0.8847, + "step": 86210 + }, + { + "epoch": 0.5508350050470848, + "grad_norm": 0.6717120409011841, + "learning_rate": 8.243510959845478e-05, + "loss": 0.8302, + "step": 86220 + }, + { + "epoch": 0.5508988921968235, + "grad_norm": 0.8732984662055969, + "learning_rate": 8.243129076432193e-05, + "loss": 0.6463, + "step": 86230 + }, + { + "epoch": 0.5509627793465622, + "grad_norm": 0.8502248525619507, + "learning_rate": 8.242747160357796e-05, + "loss": 0.8256, + "step": 86240 + }, + { + "epoch": 0.5510266664963009, + "grad_norm": 0.9782050251960754, + "learning_rate": 8.242365211626127e-05, + "loss": 0.9004, + "step": 86250 + }, + { + "epoch": 0.5510905536460396, + "grad_norm": 0.48341086506843567, + "learning_rate": 8.241983230241037e-05, + "loss": 0.7873, + "step": 86260 + }, + { + "epoch": 0.5511544407957784, + "grad_norm": 1.0804799795150757, + "learning_rate": 8.241601216206369e-05, + "loss": 1.1629, + "step": 86270 + }, + { + "epoch": 0.5512183279455171, + "grad_norm": 0.8438146114349365, + "learning_rate": 8.241219169525973e-05, + "loss": 0.7865, + "step": 86280 + }, + { + "epoch": 0.5512822150952558, + "grad_norm": 0.9812383055686951, + "learning_rate": 8.240837090203696e-05, + "loss": 0.8253, + "step": 86290 + }, + { + "epoch": 0.5513461022449945, + "grad_norm": 1.204162836074829, + "learning_rate": 8.240454978243387e-05, + "loss": 1.0205, + "step": 86300 + }, + { + "epoch": 0.5514099893947332, + "grad_norm": 1.9019670486450195, + "learning_rate": 8.240072833648894e-05, + "loss": 0.9302, + "step": 86310 + }, + { + "epoch": 0.5514738765444719, + "grad_norm": 0.8653010129928589, + "learning_rate": 8.239690656424062e-05, + "loss": 0.7381, + "step": 86320 + }, + { + "epoch": 0.5515377636942106, + "grad_norm": 0.9398866891860962, + "learning_rate": 8.239308446572742e-05, + "loss": 0.9952, + "step": 86330 + }, + { + "epoch": 0.5516016508439493, + "grad_norm": 1.2343336343765259, + "learning_rate": 8.238926204098787e-05, + "loss": 0.9225, + "step": 86340 + }, + { + "epoch": 0.551665537993688, + "grad_norm": 0.6204321384429932, + "learning_rate": 8.23854392900604e-05, + "loss": 0.8943, + "step": 86350 + }, + { + "epoch": 0.5517294251434266, + "grad_norm": 1.226536512374878, + "learning_rate": 8.238161621298355e-05, + "loss": 0.7769, + "step": 86360 + }, + { + "epoch": 0.5517933122931653, + "grad_norm": 1.1433632373809814, + "learning_rate": 8.23777928097958e-05, + "loss": 0.9487, + "step": 86370 + }, + { + "epoch": 0.551857199442904, + "grad_norm": 1.344122052192688, + "learning_rate": 8.237396908053567e-05, + "loss": 0.7983, + "step": 86380 + }, + { + "epoch": 0.5519210865926427, + "grad_norm": 0.6882800459861755, + "learning_rate": 8.237014502524168e-05, + "loss": 0.9736, + "step": 86390 + }, + { + "epoch": 0.5519849737423814, + "grad_norm": 0.9313778877258301, + "learning_rate": 8.236632064395231e-05, + "loss": 0.7895, + "step": 86400 + }, + { + "epoch": 0.5520488608921201, + "grad_norm": 1.4271520376205444, + "learning_rate": 8.236249593670609e-05, + "loss": 0.9795, + "step": 86410 + }, + { + "epoch": 0.5521127480418588, + "grad_norm": 2.3434298038482666, + "learning_rate": 8.235867090354153e-05, + "loss": 0.9954, + "step": 86420 + }, + { + "epoch": 
0.5521766351915975, + "grad_norm": 3.081796169281006, + "learning_rate": 8.235484554449718e-05, + "loss": 0.8829, + "step": 86430 + }, + { + "epoch": 0.5522405223413362, + "grad_norm": 1.0064252614974976, + "learning_rate": 8.235101985961154e-05, + "loss": 0.8981, + "step": 86440 + }, + { + "epoch": 0.552304409491075, + "grad_norm": 1.3928056955337524, + "learning_rate": 8.234719384892314e-05, + "loss": 1.0775, + "step": 86450 + }, + { + "epoch": 0.5523682966408137, + "grad_norm": 1.0833989381790161, + "learning_rate": 8.23433675124705e-05, + "loss": 0.782, + "step": 86460 + }, + { + "epoch": 0.5524321837905524, + "grad_norm": 0.7659380435943604, + "learning_rate": 8.233954085029219e-05, + "loss": 0.9672, + "step": 86470 + }, + { + "epoch": 0.5524960709402911, + "grad_norm": 0.788266658782959, + "learning_rate": 8.23357138624267e-05, + "loss": 0.7397, + "step": 86480 + }, + { + "epoch": 0.5525599580900298, + "grad_norm": 0.8975238800048828, + "learning_rate": 8.233188654891262e-05, + "loss": 0.9158, + "step": 86490 + }, + { + "epoch": 0.5526238452397685, + "grad_norm": 1.3429840803146362, + "learning_rate": 8.232805890978845e-05, + "loss": 1.0506, + "step": 86500 + }, + { + "epoch": 0.5526877323895072, + "grad_norm": 1.5317329168319702, + "learning_rate": 8.232423094509278e-05, + "loss": 0.9425, + "step": 86510 + }, + { + "epoch": 0.5527516195392459, + "grad_norm": 0.8036388754844666, + "learning_rate": 8.232040265486413e-05, + "loss": 0.9453, + "step": 86520 + }, + { + "epoch": 0.5528155066889846, + "grad_norm": 0.8902758359909058, + "learning_rate": 8.231657403914107e-05, + "loss": 0.7362, + "step": 86530 + }, + { + "epoch": 0.5528793938387233, + "grad_norm": 2.19069242477417, + "learning_rate": 8.231274509796215e-05, + "loss": 0.9963, + "step": 86540 + }, + { + "epoch": 0.552943280988462, + "grad_norm": 0.8316643238067627, + "learning_rate": 8.230891583136593e-05, + "loss": 0.9057, + "step": 86550 + }, + { + "epoch": 0.5530071681382007, + "grad_norm": 1.3300632238388062, + "learning_rate": 8.230508623939097e-05, + "loss": 0.9182, + "step": 86560 + }, + { + "epoch": 0.5530710552879394, + "grad_norm": 0.6165077686309814, + "learning_rate": 8.230125632207585e-05, + "loss": 0.7956, + "step": 86570 + }, + { + "epoch": 0.5531349424376781, + "grad_norm": 0.7122068405151367, + "learning_rate": 8.229742607945915e-05, + "loss": 0.8135, + "step": 86580 + }, + { + "epoch": 0.5531988295874168, + "grad_norm": 1.0047301054000854, + "learning_rate": 8.229359551157941e-05, + "loss": 0.7511, + "step": 86590 + }, + { + "epoch": 0.5532627167371554, + "grad_norm": 0.6250356435775757, + "learning_rate": 8.228976461847522e-05, + "loss": 0.9118, + "step": 86600 + }, + { + "epoch": 0.5533266038868941, + "grad_norm": 0.7042723894119263, + "learning_rate": 8.228593340018518e-05, + "loss": 1.1653, + "step": 86610 + }, + { + "epoch": 0.5533904910366328, + "grad_norm": 1.037340760231018, + "learning_rate": 8.228210185674784e-05, + "loss": 0.9482, + "step": 86620 + }, + { + "epoch": 0.5534543781863716, + "grad_norm": 0.9349649548530579, + "learning_rate": 8.227826998820183e-05, + "loss": 0.7801, + "step": 86630 + }, + { + "epoch": 0.5535182653361103, + "grad_norm": 0.9645569920539856, + "learning_rate": 8.227443779458572e-05, + "loss": 0.8324, + "step": 86640 + }, + { + "epoch": 0.553582152485849, + "grad_norm": 0.991255521774292, + "learning_rate": 8.227060527593808e-05, + "loss": 0.8347, + "step": 86650 + }, + { + "epoch": 0.5536460396355877, + "grad_norm": 0.6694484353065491, + "learning_rate": 
8.226677243229753e-05, + "loss": 0.8864, + "step": 86660 + }, + { + "epoch": 0.5537099267853264, + "grad_norm": 0.9695293307304382, + "learning_rate": 8.226293926370268e-05, + "loss": 0.8119, + "step": 86670 + }, + { + "epoch": 0.5537738139350651, + "grad_norm": 0.9662237763404846, + "learning_rate": 8.22591057701921e-05, + "loss": 1.2784, + "step": 86680 + }, + { + "epoch": 0.5538377010848038, + "grad_norm": 0.8255831003189087, + "learning_rate": 8.225527195180442e-05, + "loss": 0.7241, + "step": 86690 + }, + { + "epoch": 0.5539015882345425, + "grad_norm": 1.059599757194519, + "learning_rate": 8.225143780857827e-05, + "loss": 0.9431, + "step": 86700 + }, + { + "epoch": 0.5539654753842812, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.224760334055222e-05, + "loss": 0.8522, + "step": 86710 + }, + { + "epoch": 0.5540293625340199, + "grad_norm": 0.5998595952987671, + "learning_rate": 8.22437685477649e-05, + "loss": 0.7598, + "step": 86720 + }, + { + "epoch": 0.5540932496837586, + "grad_norm": 0.9817114472389221, + "learning_rate": 8.223993343025496e-05, + "loss": 0.8681, + "step": 86730 + }, + { + "epoch": 0.5541571368334973, + "grad_norm": 0.8455765843391418, + "learning_rate": 8.223609798806097e-05, + "loss": 1.0724, + "step": 86740 + }, + { + "epoch": 0.554221023983236, + "grad_norm": 0.9038847088813782, + "learning_rate": 8.22322622212216e-05, + "loss": 0.8695, + "step": 86750 + }, + { + "epoch": 0.5542849111329747, + "grad_norm": 1.3185269832611084, + "learning_rate": 8.222842612977545e-05, + "loss": 0.8196, + "step": 86760 + }, + { + "epoch": 0.5543487982827134, + "grad_norm": 1.6181766986846924, + "learning_rate": 8.22245897137612e-05, + "loss": 0.9941, + "step": 86770 + }, + { + "epoch": 0.5544126854324521, + "grad_norm": 0.7883678674697876, + "learning_rate": 8.222075297321742e-05, + "loss": 0.9108, + "step": 86780 + }, + { + "epoch": 0.5544765725821909, + "grad_norm": 1.2159056663513184, + "learning_rate": 8.221691590818281e-05, + "loss": 0.8442, + "step": 86790 + }, + { + "epoch": 0.5545404597319296, + "grad_norm": 0.6376563310623169, + "learning_rate": 8.221307851869597e-05, + "loss": 0.8837, + "step": 86800 + }, + { + "epoch": 0.5546043468816683, + "grad_norm": 0.8084425926208496, + "learning_rate": 8.220924080479558e-05, + "loss": 0.9176, + "step": 86810 + }, + { + "epoch": 0.554668234031407, + "grad_norm": 0.9043588042259216, + "learning_rate": 8.220540276652024e-05, + "loss": 1.3496, + "step": 86820 + }, + { + "epoch": 0.5547321211811457, + "grad_norm": 0.6929380893707275, + "learning_rate": 8.220156440390865e-05, + "loss": 0.8788, + "step": 86830 + }, + { + "epoch": 0.5547960083308843, + "grad_norm": 0.8745282888412476, + "learning_rate": 8.219772571699945e-05, + "loss": 0.6351, + "step": 86840 + }, + { + "epoch": 0.554859895480623, + "grad_norm": 0.9505758285522461, + "learning_rate": 8.21938867058313e-05, + "loss": 0.7184, + "step": 86850 + }, + { + "epoch": 0.5549237826303617, + "grad_norm": 1.9422472715377808, + "learning_rate": 8.219004737044285e-05, + "loss": 0.915, + "step": 86860 + }, + { + "epoch": 0.5549876697801004, + "grad_norm": 1.590299367904663, + "learning_rate": 8.218620771087277e-05, + "loss": 0.8802, + "step": 86870 + }, + { + "epoch": 0.5550515569298391, + "grad_norm": 0.7018561959266663, + "learning_rate": 8.218236772715972e-05, + "loss": 0.8559, + "step": 86880 + }, + { + "epoch": 0.5551154440795778, + "grad_norm": 1.1416157484054565, + "learning_rate": 8.217852741934242e-05, + "loss": 0.792, + "step": 86890 + }, + { + "epoch": 
0.5551793312293165, + "grad_norm": 0.9537761807441711, + "learning_rate": 8.21746867874595e-05, + "loss": 1.0657, + "step": 86900 + }, + { + "epoch": 0.5552432183790552, + "grad_norm": 2.785801887512207, + "learning_rate": 8.217084583154964e-05, + "loss": 1.0421, + "step": 86910 + }, + { + "epoch": 0.5553071055287939, + "grad_norm": 1.0733259916305542, + "learning_rate": 8.216700455165152e-05, + "loss": 1.1064, + "step": 86920 + }, + { + "epoch": 0.5553709926785326, + "grad_norm": 0.9249892830848694, + "learning_rate": 8.216316294780386e-05, + "loss": 0.6912, + "step": 86930 + }, + { + "epoch": 0.5554348798282713, + "grad_norm": 1.2269346714019775, + "learning_rate": 8.215932102004531e-05, + "loss": 0.8911, + "step": 86940 + }, + { + "epoch": 0.55549876697801, + "grad_norm": 1.0599247217178345, + "learning_rate": 8.215547876841459e-05, + "loss": 0.9636, + "step": 86950 + }, + { + "epoch": 0.5555626541277487, + "grad_norm": 1.5449506044387817, + "learning_rate": 8.215163619295036e-05, + "loss": 0.9123, + "step": 86960 + }, + { + "epoch": 0.5556265412774875, + "grad_norm": 0.9864984154701233, + "learning_rate": 8.214779329369134e-05, + "loss": 0.7596, + "step": 86970 + }, + { + "epoch": 0.5556904284272262, + "grad_norm": 0.9634939432144165, + "learning_rate": 8.214395007067624e-05, + "loss": 0.9034, + "step": 86980 + }, + { + "epoch": 0.5557543155769649, + "grad_norm": 1.1555999517440796, + "learning_rate": 8.214010652394376e-05, + "loss": 0.7741, + "step": 86990 + }, + { + "epoch": 0.5558182027267036, + "grad_norm": 1.121670126914978, + "learning_rate": 8.213626265353259e-05, + "loss": 0.926, + "step": 87000 + }, + { + "epoch": 0.5558820898764423, + "grad_norm": 2.0978503227233887, + "learning_rate": 8.213241845948145e-05, + "loss": 0.7041, + "step": 87010 + }, + { + "epoch": 0.555945977026181, + "grad_norm": 0.9516173601150513, + "learning_rate": 8.212857394182906e-05, + "loss": 1.0637, + "step": 87020 + }, + { + "epoch": 0.5560098641759197, + "grad_norm": 1.092417597770691, + "learning_rate": 8.212472910061415e-05, + "loss": 0.8465, + "step": 87030 + }, + { + "epoch": 0.5560737513256584, + "grad_norm": 0.8060597777366638, + "learning_rate": 8.212088393587543e-05, + "loss": 1.2128, + "step": 87040 + }, + { + "epoch": 0.5561376384753971, + "grad_norm": 0.8627819418907166, + "learning_rate": 8.21170384476516e-05, + "loss": 0.9167, + "step": 87050 + }, + { + "epoch": 0.5562015256251358, + "grad_norm": 0.9988081455230713, + "learning_rate": 8.211319263598142e-05, + "loss": 1.0736, + "step": 87060 + }, + { + "epoch": 0.5562654127748745, + "grad_norm": 0.8938575387001038, + "learning_rate": 8.210934650090361e-05, + "loss": 0.9065, + "step": 87070 + }, + { + "epoch": 0.5563292999246132, + "grad_norm": 0.968144953250885, + "learning_rate": 8.210550004245688e-05, + "loss": 1.1044, + "step": 87080 + }, + { + "epoch": 0.5563931870743518, + "grad_norm": 0.6786410212516785, + "learning_rate": 8.210165326068001e-05, + "loss": 0.849, + "step": 87090 + }, + { + "epoch": 0.5564570742240905, + "grad_norm": 0.6535912156105042, + "learning_rate": 8.209780615561172e-05, + "loss": 0.942, + "step": 87100 + }, + { + "epoch": 0.5565209613738292, + "grad_norm": 0.8034776449203491, + "learning_rate": 8.209395872729074e-05, + "loss": 0.8953, + "step": 87110 + }, + { + "epoch": 0.5565848485235679, + "grad_norm": 1.0414639711380005, + "learning_rate": 8.209011097575584e-05, + "loss": 1.1798, + "step": 87120 + }, + { + "epoch": 0.5566487356733066, + "grad_norm": 0.8047749996185303, + "learning_rate": 
8.208626290104577e-05, + "loss": 0.9456, + "step": 87130 + }, + { + "epoch": 0.5567126228230453, + "grad_norm": 0.904681384563446, + "learning_rate": 8.208241450319925e-05, + "loss": 0.8932, + "step": 87140 + }, + { + "epoch": 0.556776509972784, + "grad_norm": 0.9343833327293396, + "learning_rate": 8.207856578225508e-05, + "loss": 0.9492, + "step": 87150 + }, + { + "epoch": 0.5568403971225228, + "grad_norm": 0.9821017384529114, + "learning_rate": 8.207471673825199e-05, + "loss": 0.8225, + "step": 87160 + }, + { + "epoch": 0.5569042842722615, + "grad_norm": 0.9593321681022644, + "learning_rate": 8.207086737122876e-05, + "loss": 1.0382, + "step": 87170 + }, + { + "epoch": 0.5569681714220002, + "grad_norm": 1.1232051849365234, + "learning_rate": 8.206701768122415e-05, + "loss": 0.8715, + "step": 87180 + }, + { + "epoch": 0.5570320585717389, + "grad_norm": 0.7729601860046387, + "learning_rate": 8.206316766827692e-05, + "loss": 0.9942, + "step": 87190 + }, + { + "epoch": 0.5570959457214776, + "grad_norm": 1.0262385606765747, + "learning_rate": 8.205931733242586e-05, + "loss": 1.0279, + "step": 87200 + }, + { + "epoch": 0.5571598328712163, + "grad_norm": 0.9253116846084595, + "learning_rate": 8.205546667370975e-05, + "loss": 1.1048, + "step": 87210 + }, + { + "epoch": 0.557223720020955, + "grad_norm": 1.2374264001846313, + "learning_rate": 8.205161569216735e-05, + "loss": 0.7685, + "step": 87220 + }, + { + "epoch": 0.5572876071706937, + "grad_norm": 0.7475976347923279, + "learning_rate": 8.204776438783745e-05, + "loss": 0.9137, + "step": 87230 + }, + { + "epoch": 0.5573514943204324, + "grad_norm": 0.5616995096206665, + "learning_rate": 8.204391276075882e-05, + "loss": 1.0628, + "step": 87240 + }, + { + "epoch": 0.5574153814701711, + "grad_norm": 0.816423773765564, + "learning_rate": 8.20400608109703e-05, + "loss": 0.9533, + "step": 87250 + }, + { + "epoch": 0.5574792686199098, + "grad_norm": 1.1088494062423706, + "learning_rate": 8.203620853851062e-05, + "loss": 1.115, + "step": 87260 + }, + { + "epoch": 0.5575431557696485, + "grad_norm": 0.9146358370780945, + "learning_rate": 8.203235594341862e-05, + "loss": 0.8036, + "step": 87270 + }, + { + "epoch": 0.5576070429193872, + "grad_norm": 0.7417098879814148, + "learning_rate": 8.202850302573308e-05, + "loss": 0.8677, + "step": 87280 + }, + { + "epoch": 0.5576709300691259, + "grad_norm": 1.2907207012176514, + "learning_rate": 8.202464978549281e-05, + "loss": 0.8702, + "step": 87290 + }, + { + "epoch": 0.5577348172188646, + "grad_norm": 0.6193691492080688, + "learning_rate": 8.202079622273662e-05, + "loss": 0.6505, + "step": 87300 + }, + { + "epoch": 0.5577987043686033, + "grad_norm": 0.8418506979942322, + "learning_rate": 8.20169423375033e-05, + "loss": 1.0309, + "step": 87310 + }, + { + "epoch": 0.557862591518342, + "grad_norm": 0.8040059208869934, + "learning_rate": 8.201308812983165e-05, + "loss": 0.8816, + "step": 87320 + }, + { + "epoch": 0.5579264786680806, + "grad_norm": 0.9239259958267212, + "learning_rate": 8.200923359976055e-05, + "loss": 1.2638, + "step": 87330 + }, + { + "epoch": 0.5579903658178194, + "grad_norm": 1.0745997428894043, + "learning_rate": 8.200537874732876e-05, + "loss": 0.839, + "step": 87340 + }, + { + "epoch": 0.5580542529675581, + "grad_norm": 0.9547412395477295, + "learning_rate": 8.200152357257511e-05, + "loss": 1.0117, + "step": 87350 + }, + { + "epoch": 0.5581181401172968, + "grad_norm": 0.6735034584999084, + "learning_rate": 8.199766807553843e-05, + "loss": 1.0254, + "step": 87360 + }, + { + "epoch": 
0.5581820272670355, + "grad_norm": 0.9394139051437378, + "learning_rate": 8.199381225625755e-05, + "loss": 0.9061, + "step": 87370 + }, + { + "epoch": 0.5582459144167742, + "grad_norm": 0.6589183211326599, + "learning_rate": 8.198995611477132e-05, + "loss": 0.9975, + "step": 87380 + }, + { + "epoch": 0.5583098015665129, + "grad_norm": 0.8695818185806274, + "learning_rate": 8.198609965111854e-05, + "loss": 0.8602, + "step": 87390 + }, + { + "epoch": 0.5583736887162516, + "grad_norm": 0.8587558269500732, + "learning_rate": 8.198224286533807e-05, + "loss": 1.041, + "step": 87400 + }, + { + "epoch": 0.5584375758659903, + "grad_norm": 0.8694300651550293, + "learning_rate": 8.197838575746874e-05, + "loss": 1.0423, + "step": 87410 + }, + { + "epoch": 0.558501463015729, + "grad_norm": 1.0709831714630127, + "learning_rate": 8.197452832754939e-05, + "loss": 1.1133, + "step": 87420 + }, + { + "epoch": 0.5585653501654677, + "grad_norm": 0.8248537182807922, + "learning_rate": 8.19706705756189e-05, + "loss": 0.8726, + "step": 87430 + }, + { + "epoch": 0.5586292373152064, + "grad_norm": 2.853898525238037, + "learning_rate": 8.196681250171609e-05, + "loss": 0.9757, + "step": 87440 + }, + { + "epoch": 0.5586931244649451, + "grad_norm": 2.0060508251190186, + "learning_rate": 8.196295410587982e-05, + "loss": 1.1241, + "step": 87450 + }, + { + "epoch": 0.5587570116146838, + "grad_norm": 1.0533684492111206, + "learning_rate": 8.195909538814895e-05, + "loss": 0.746, + "step": 87460 + }, + { + "epoch": 0.5588208987644225, + "grad_norm": 0.6444224715232849, + "learning_rate": 8.195523634856234e-05, + "loss": 0.8998, + "step": 87470 + }, + { + "epoch": 0.5588847859141612, + "grad_norm": 0.8431418538093567, + "learning_rate": 8.195137698715887e-05, + "loss": 0.9075, + "step": 87480 + }, + { + "epoch": 0.5589486730639, + "grad_norm": 5.22327995300293, + "learning_rate": 8.194751730397738e-05, + "loss": 0.8413, + "step": 87490 + }, + { + "epoch": 0.5590125602136387, + "grad_norm": 0.5308577418327332, + "learning_rate": 8.194365729905675e-05, + "loss": 0.7192, + "step": 87500 + }, + { + "epoch": 0.5590764473633774, + "grad_norm": 1.6924850940704346, + "learning_rate": 8.193979697243586e-05, + "loss": 0.9143, + "step": 87510 + }, + { + "epoch": 0.5591403345131161, + "grad_norm": 0.574246883392334, + "learning_rate": 8.193593632415358e-05, + "loss": 0.9482, + "step": 87520 + }, + { + "epoch": 0.5592042216628548, + "grad_norm": 0.572245180606842, + "learning_rate": 8.19320753542488e-05, + "loss": 0.6802, + "step": 87530 + }, + { + "epoch": 0.5592681088125935, + "grad_norm": 1.4606465101242065, + "learning_rate": 8.192821406276039e-05, + "loss": 1.3266, + "step": 87540 + }, + { + "epoch": 0.5593319959623322, + "grad_norm": 0.8305387496948242, + "learning_rate": 8.192435244972725e-05, + "loss": 0.8321, + "step": 87550 + }, + { + "epoch": 0.5593958831120709, + "grad_norm": 0.7311245799064636, + "learning_rate": 8.192049051518826e-05, + "loss": 0.8271, + "step": 87560 + }, + { + "epoch": 0.5594597702618095, + "grad_norm": 0.702063798904419, + "learning_rate": 8.19166282591823e-05, + "loss": 0.9991, + "step": 87570 + }, + { + "epoch": 0.5595236574115482, + "grad_norm": 0.699548602104187, + "learning_rate": 8.19127656817483e-05, + "loss": 0.9318, + "step": 87580 + }, + { + "epoch": 0.5595875445612869, + "grad_norm": 1.65463387966156, + "learning_rate": 8.190890278292513e-05, + "loss": 1.1159, + "step": 87590 + }, + { + "epoch": 0.5596514317110256, + "grad_norm": 0.7709631323814392, + "learning_rate": 8.190503956275171e-05, + 
"loss": 0.9198, + "step": 87600 + }, + { + "epoch": 0.5597153188607643, + "grad_norm": 0.7824264168739319, + "learning_rate": 8.190117602126694e-05, + "loss": 0.7632, + "step": 87610 + }, + { + "epoch": 0.559779206010503, + "grad_norm": 0.6739460825920105, + "learning_rate": 8.189731215850973e-05, + "loss": 1.0088, + "step": 87620 + }, + { + "epoch": 0.5598430931602417, + "grad_norm": 0.6374611258506775, + "learning_rate": 8.189344797451898e-05, + "loss": 0.8637, + "step": 87630 + }, + { + "epoch": 0.5599069803099804, + "grad_norm": 0.7015722990036011, + "learning_rate": 8.188958346933361e-05, + "loss": 0.7372, + "step": 87640 + }, + { + "epoch": 0.5599708674597191, + "grad_norm": 0.9866139888763428, + "learning_rate": 8.188571864299257e-05, + "loss": 0.8335, + "step": 87650 + }, + { + "epoch": 0.5600347546094578, + "grad_norm": 1.009989619255066, + "learning_rate": 8.188185349553474e-05, + "loss": 0.8123, + "step": 87660 + }, + { + "epoch": 0.5600986417591965, + "grad_norm": 0.5342085957527161, + "learning_rate": 8.187798802699909e-05, + "loss": 0.772, + "step": 87670 + }, + { + "epoch": 0.5601625289089353, + "grad_norm": 0.8871537446975708, + "learning_rate": 8.18741222374245e-05, + "loss": 0.8139, + "step": 87680 + }, + { + "epoch": 0.560226416058674, + "grad_norm": 0.8497464656829834, + "learning_rate": 8.187025612684993e-05, + "loss": 0.8549, + "step": 87690 + }, + { + "epoch": 0.5602903032084127, + "grad_norm": 1.7199604511260986, + "learning_rate": 8.18663896953143e-05, + "loss": 1.0503, + "step": 87700 + }, + { + "epoch": 0.5603541903581514, + "grad_norm": 0.686752200126648, + "learning_rate": 8.186252294285656e-05, + "loss": 0.7221, + "step": 87710 + }, + { + "epoch": 0.5604180775078901, + "grad_norm": 0.6970003247261047, + "learning_rate": 8.185865586951567e-05, + "loss": 0.6405, + "step": 87720 + }, + { + "epoch": 0.5604819646576288, + "grad_norm": 0.8833878040313721, + "learning_rate": 8.185478847533052e-05, + "loss": 0.742, + "step": 87730 + }, + { + "epoch": 0.5605458518073675, + "grad_norm": 2.5486907958984375, + "learning_rate": 8.185092076034012e-05, + "loss": 0.7818, + "step": 87740 + }, + { + "epoch": 0.5606097389571062, + "grad_norm": 0.5542274713516235, + "learning_rate": 8.184705272458338e-05, + "loss": 0.8269, + "step": 87750 + }, + { + "epoch": 0.5606736261068449, + "grad_norm": 0.9123031497001648, + "learning_rate": 8.184318436809927e-05, + "loss": 0.9681, + "step": 87760 + }, + { + "epoch": 0.5607375132565836, + "grad_norm": 1.2762703895568848, + "learning_rate": 8.183931569092676e-05, + "loss": 0.992, + "step": 87770 + }, + { + "epoch": 0.5608014004063223, + "grad_norm": 0.723517656326294, + "learning_rate": 8.183544669310477e-05, + "loss": 0.7665, + "step": 87780 + }, + { + "epoch": 0.560865287556061, + "grad_norm": 0.5567760467529297, + "learning_rate": 8.183157737467229e-05, + "loss": 0.8054, + "step": 87790 + }, + { + "epoch": 0.5609291747057997, + "grad_norm": 1.262997031211853, + "learning_rate": 8.182770773566833e-05, + "loss": 0.9354, + "step": 87800 + }, + { + "epoch": 0.5609930618555383, + "grad_norm": 0.7385443449020386, + "learning_rate": 8.182383777613177e-05, + "loss": 1.0054, + "step": 87810 + }, + { + "epoch": 0.561056949005277, + "grad_norm": 0.8645491600036621, + "learning_rate": 8.181996749610166e-05, + "loss": 1.0764, + "step": 87820 + }, + { + "epoch": 0.5611208361550157, + "grad_norm": 1.3922362327575684, + "learning_rate": 8.181609689561693e-05, + "loss": 1.2028, + "step": 87830 + }, + { + "epoch": 0.5611847233047544, + "grad_norm": 
1.1848599910736084, + "learning_rate": 8.181222597471658e-05, + "loss": 0.7765, + "step": 87840 + }, + { + "epoch": 0.5612486104544931, + "grad_norm": 1.078633189201355, + "learning_rate": 8.18083547334396e-05, + "loss": 0.793, + "step": 87850 + }, + { + "epoch": 0.5613124976042319, + "grad_norm": 0.8541363477706909, + "learning_rate": 8.180448317182498e-05, + "loss": 0.6918, + "step": 87860 + }, + { + "epoch": 0.5613763847539706, + "grad_norm": 1.5451418161392212, + "learning_rate": 8.180061128991168e-05, + "loss": 0.9104, + "step": 87870 + }, + { + "epoch": 0.5614402719037093, + "grad_norm": 0.9246516823768616, + "learning_rate": 8.179673908773872e-05, + "loss": 0.8337, + "step": 87880 + }, + { + "epoch": 0.561504159053448, + "grad_norm": 1.042832851409912, + "learning_rate": 8.179286656534511e-05, + "loss": 1.1324, + "step": 87890 + }, + { + "epoch": 0.5615680462031867, + "grad_norm": 0.7726474404335022, + "learning_rate": 8.17889937227698e-05, + "loss": 1.1108, + "step": 87900 + }, + { + "epoch": 0.5616319333529254, + "grad_norm": 0.8662251234054565, + "learning_rate": 8.178512056005184e-05, + "loss": 0.6078, + "step": 87910 + }, + { + "epoch": 0.5616958205026641, + "grad_norm": 0.7089243531227112, + "learning_rate": 8.178124707723021e-05, + "loss": 0.8552, + "step": 87920 + }, + { + "epoch": 0.5617597076524028, + "grad_norm": 1.1423563957214355, + "learning_rate": 8.177737327434393e-05, + "loss": 0.8504, + "step": 87930 + }, + { + "epoch": 0.5618235948021415, + "grad_norm": 0.934367299079895, + "learning_rate": 8.1773499151432e-05, + "loss": 1.1178, + "step": 87940 + }, + { + "epoch": 0.5618874819518802, + "grad_norm": 1.198276162147522, + "learning_rate": 8.176962470853346e-05, + "loss": 0.9645, + "step": 87950 + }, + { + "epoch": 0.5619513691016189, + "grad_norm": 0.8779740929603577, + "learning_rate": 8.176574994568731e-05, + "loss": 0.9631, + "step": 87960 + }, + { + "epoch": 0.5620152562513576, + "grad_norm": 1.0242273807525635, + "learning_rate": 8.176187486293258e-05, + "loss": 0.8799, + "step": 87970 + }, + { + "epoch": 0.5620791434010963, + "grad_norm": 0.8855654001235962, + "learning_rate": 8.17579994603083e-05, + "loss": 0.8782, + "step": 87980 + }, + { + "epoch": 0.562143030550835, + "grad_norm": 0.9518892168998718, + "learning_rate": 8.175412373785346e-05, + "loss": 0.9524, + "step": 87990 + }, + { + "epoch": 0.5622069177005737, + "grad_norm": 1.5354324579238892, + "learning_rate": 8.175024769560714e-05, + "loss": 0.8593, + "step": 88000 + }, + { + "epoch": 0.5622708048503124, + "grad_norm": 0.7242043018341064, + "learning_rate": 8.174637133360837e-05, + "loss": 0.7903, + "step": 88010 + }, + { + "epoch": 0.5623346920000512, + "grad_norm": 1.2966783046722412, + "learning_rate": 8.174249465189615e-05, + "loss": 0.9194, + "step": 88020 + }, + { + "epoch": 0.5623985791497899, + "grad_norm": 0.7880471348762512, + "learning_rate": 8.173861765050956e-05, + "loss": 0.8203, + "step": 88030 + }, + { + "epoch": 0.5624624662995286, + "grad_norm": 0.8181769251823425, + "learning_rate": 8.173474032948764e-05, + "loss": 0.8065, + "step": 88040 + }, + { + "epoch": 0.5625263534492673, + "grad_norm": 1.0602998733520508, + "learning_rate": 8.173086268886943e-05, + "loss": 1.0891, + "step": 88050 + }, + { + "epoch": 0.5625902405990059, + "grad_norm": 0.9757769107818604, + "learning_rate": 8.172698472869398e-05, + "loss": 0.7662, + "step": 88060 + }, + { + "epoch": 0.5626541277487446, + "grad_norm": 0.7748228907585144, + "learning_rate": 8.172310644900035e-05, + "loss": 1.0889, + "step": 
88070 + }, + { + "epoch": 0.5627180148984833, + "grad_norm": 0.6546492576599121, + "learning_rate": 8.171922784982757e-05, + "loss": 0.7745, + "step": 88080 + }, + { + "epoch": 0.562781902048222, + "grad_norm": 0.9595057368278503, + "learning_rate": 8.171534893121476e-05, + "loss": 0.7195, + "step": 88090 + }, + { + "epoch": 0.5628457891979607, + "grad_norm": 1.4880831241607666, + "learning_rate": 8.171146969320091e-05, + "loss": 0.7606, + "step": 88100 + }, + { + "epoch": 0.5629096763476994, + "grad_norm": 0.6960776448249817, + "learning_rate": 8.17079781059329e-05, + "loss": 0.8305, + "step": 88110 + }, + { + "epoch": 0.5629735634974381, + "grad_norm": 0.6546306610107422, + "learning_rate": 8.17040982611648e-05, + "loss": 0.9146, + "step": 88120 + }, + { + "epoch": 0.5630374506471768, + "grad_norm": 0.9104867577552795, + "learning_rate": 8.170021809710901e-05, + "loss": 0.8428, + "step": 88130 + }, + { + "epoch": 0.5631013377969155, + "grad_norm": 0.8855934143066406, + "learning_rate": 8.169633761380459e-05, + "loss": 1.1035, + "step": 88140 + }, + { + "epoch": 0.5631652249466542, + "grad_norm": 0.8834415674209595, + "learning_rate": 8.169245681129063e-05, + "loss": 0.7963, + "step": 88150 + }, + { + "epoch": 0.5632291120963929, + "grad_norm": 1.0610204935073853, + "learning_rate": 8.168857568960621e-05, + "loss": 0.8703, + "step": 88160 + }, + { + "epoch": 0.5632929992461316, + "grad_norm": 0.7726658582687378, + "learning_rate": 8.168469424879041e-05, + "loss": 0.7914, + "step": 88170 + }, + { + "epoch": 0.5633568863958703, + "grad_norm": 0.835917592048645, + "learning_rate": 8.168081248888236e-05, + "loss": 1.1727, + "step": 88180 + }, + { + "epoch": 0.563420773545609, + "grad_norm": 0.6268913149833679, + "learning_rate": 8.16769304099211e-05, + "loss": 0.8944, + "step": 88190 + }, + { + "epoch": 0.5634846606953477, + "grad_norm": 0.7659708857536316, + "learning_rate": 8.167304801194574e-05, + "loss": 0.6793, + "step": 88200 + }, + { + "epoch": 0.5635485478450865, + "grad_norm": 0.8323150873184204, + "learning_rate": 8.166916529499539e-05, + "loss": 1.0036, + "step": 88210 + }, + { + "epoch": 0.5636124349948252, + "grad_norm": 0.8650678396224976, + "learning_rate": 8.166528225910915e-05, + "loss": 1.0035, + "step": 88220 + }, + { + "epoch": 0.5636763221445639, + "grad_norm": 1.1237086057662964, + "learning_rate": 8.166139890432612e-05, + "loss": 0.863, + "step": 88230 + }, + { + "epoch": 0.5637402092943026, + "grad_norm": 1.4714837074279785, + "learning_rate": 8.165751523068541e-05, + "loss": 0.7476, + "step": 88240 + }, + { + "epoch": 0.5638040964440413, + "grad_norm": 1.561540126800537, + "learning_rate": 8.165363123822613e-05, + "loss": 0.8476, + "step": 88250 + }, + { + "epoch": 0.56386798359378, + "grad_norm": 1.166987657546997, + "learning_rate": 8.164974692698741e-05, + "loss": 0.7393, + "step": 88260 + }, + { + "epoch": 0.5639318707435187, + "grad_norm": 0.8641453385353088, + "learning_rate": 8.164586229700837e-05, + "loss": 0.9397, + "step": 88270 + }, + { + "epoch": 0.5639957578932574, + "grad_norm": 0.8461394906044006, + "learning_rate": 8.164197734832811e-05, + "loss": 0.9712, + "step": 88280 + }, + { + "epoch": 0.5640596450429961, + "grad_norm": 1.124748945236206, + "learning_rate": 8.163809208098573e-05, + "loss": 0.9336, + "step": 88290 + }, + { + "epoch": 0.5641235321927347, + "grad_norm": 0.7399802803993225, + "learning_rate": 8.163420649502044e-05, + "loss": 0.8218, + "step": 88300 + }, + { + "epoch": 0.5641874193424734, + "grad_norm": 0.5592508316040039, + 
"learning_rate": 8.163032059047129e-05, + "loss": 1.0124, + "step": 88310 + }, + { + "epoch": 0.5642513064922121, + "grad_norm": 0.7099899649620056, + "learning_rate": 8.162643436737747e-05, + "loss": 1.0729, + "step": 88320 + }, + { + "epoch": 0.5643151936419508, + "grad_norm": 0.8268618583679199, + "learning_rate": 8.162254782577807e-05, + "loss": 0.9777, + "step": 88330 + }, + { + "epoch": 0.5643790807916895, + "grad_norm": 1.0088896751403809, + "learning_rate": 8.161866096571229e-05, + "loss": 0.7944, + "step": 88340 + }, + { + "epoch": 0.5644429679414282, + "grad_norm": 1.0741251707077026, + "learning_rate": 8.161477378721922e-05, + "loss": 0.6418, + "step": 88350 + }, + { + "epoch": 0.5645068550911669, + "grad_norm": 0.7108768224716187, + "learning_rate": 8.161088629033802e-05, + "loss": 0.792, + "step": 88360 + }, + { + "epoch": 0.5645707422409056, + "grad_norm": 0.7690078020095825, + "learning_rate": 8.160699847510787e-05, + "loss": 1.0006, + "step": 88370 + }, + { + "epoch": 0.5646346293906443, + "grad_norm": 0.9260159134864807, + "learning_rate": 8.160311034156788e-05, + "loss": 0.836, + "step": 88380 + }, + { + "epoch": 0.564698516540383, + "grad_norm": 0.8775709867477417, + "learning_rate": 8.159922188975724e-05, + "loss": 1.0681, + "step": 88390 + }, + { + "epoch": 0.5647624036901218, + "grad_norm": 5.725255489349365, + "learning_rate": 8.159533311971509e-05, + "loss": 0.8525, + "step": 88400 + }, + { + "epoch": 0.5648262908398605, + "grad_norm": 0.7472132444381714, + "learning_rate": 8.15914440314806e-05, + "loss": 0.8077, + "step": 88410 + }, + { + "epoch": 0.5648901779895992, + "grad_norm": 0.733696460723877, + "learning_rate": 8.158755462509294e-05, + "loss": 0.9897, + "step": 88420 + }, + { + "epoch": 0.5649540651393379, + "grad_norm": 0.7739579081535339, + "learning_rate": 8.15836649005913e-05, + "loss": 0.9822, + "step": 88430 + }, + { + "epoch": 0.5650179522890766, + "grad_norm": 0.7440978288650513, + "learning_rate": 8.157977485801481e-05, + "loss": 0.9445, + "step": 88440 + }, + { + "epoch": 0.5650818394388153, + "grad_norm": 1.1585737466812134, + "learning_rate": 8.157588449740268e-05, + "loss": 0.9039, + "step": 88450 + }, + { + "epoch": 0.565145726588554, + "grad_norm": 0.6025387048721313, + "learning_rate": 8.157199381879406e-05, + "loss": 1.0429, + "step": 88460 + }, + { + "epoch": 0.5652096137382927, + "grad_norm": 1.27161705493927, + "learning_rate": 8.156810282222815e-05, + "loss": 0.9338, + "step": 88470 + }, + { + "epoch": 0.5652735008880314, + "grad_norm": 1.4997930526733398, + "learning_rate": 8.156421150774413e-05, + "loss": 0.9103, + "step": 88480 + }, + { + "epoch": 0.5653373880377701, + "grad_norm": 1.060680866241455, + "learning_rate": 8.156031987538121e-05, + "loss": 0.8186, + "step": 88490 + }, + { + "epoch": 0.5654012751875088, + "grad_norm": 0.8524025082588196, + "learning_rate": 8.155642792517854e-05, + "loss": 0.6961, + "step": 88500 + }, + { + "epoch": 0.5654651623372475, + "grad_norm": 0.7399099469184875, + "learning_rate": 8.155253565717538e-05, + "loss": 0.9526, + "step": 88510 + }, + { + "epoch": 0.5655290494869862, + "grad_norm": 0.5844240784645081, + "learning_rate": 8.154864307141086e-05, + "loss": 0.7829, + "step": 88520 + }, + { + "epoch": 0.5655929366367249, + "grad_norm": 1.2243051528930664, + "learning_rate": 8.154475016792422e-05, + "loss": 1.1276, + "step": 88530 + }, + { + "epoch": 0.5656568237864635, + "grad_norm": 0.8951583504676819, + "learning_rate": 8.154085694675465e-05, + "loss": 0.5625, + "step": 88540 + }, + { + 
"epoch": 0.5657207109362022, + "grad_norm": 0.9766507148742676, + "learning_rate": 8.153696340794137e-05, + "loss": 0.8022, + "step": 88550 + }, + { + "epoch": 0.565784598085941, + "grad_norm": 1.5120043754577637, + "learning_rate": 8.153306955152358e-05, + "loss": 1.0359, + "step": 88560 + }, + { + "epoch": 0.5658484852356797, + "grad_norm": 0.8103669881820679, + "learning_rate": 8.15291753775405e-05, + "loss": 0.9112, + "step": 88570 + }, + { + "epoch": 0.5659123723854184, + "grad_norm": 0.9393633604049683, + "learning_rate": 8.152528088603136e-05, + "loss": 1.0055, + "step": 88580 + }, + { + "epoch": 0.5659762595351571, + "grad_norm": 0.7244747281074524, + "learning_rate": 8.152138607703534e-05, + "loss": 0.9209, + "step": 88590 + }, + { + "epoch": 0.5660401466848958, + "grad_norm": 0.9815077185630798, + "learning_rate": 8.151749095059172e-05, + "loss": 1.0403, + "step": 88600 + }, + { + "epoch": 0.5661040338346345, + "grad_norm": 0.7395929098129272, + "learning_rate": 8.151359550673968e-05, + "loss": 0.8846, + "step": 88610 + }, + { + "epoch": 0.5661679209843732, + "grad_norm": 1.4598946571350098, + "learning_rate": 8.150969974551848e-05, + "loss": 0.8897, + "step": 88620 + }, + { + "epoch": 0.5662318081341119, + "grad_norm": 1.4247801303863525, + "learning_rate": 8.150580366696734e-05, + "loss": 0.8266, + "step": 88630 + }, + { + "epoch": 0.5662956952838506, + "grad_norm": 0.9204801321029663, + "learning_rate": 8.150190727112551e-05, + "loss": 0.9737, + "step": 88640 + }, + { + "epoch": 0.5663595824335893, + "grad_norm": 0.9166013598442078, + "learning_rate": 8.149801055803222e-05, + "loss": 0.7175, + "step": 88650 + }, + { + "epoch": 0.566423469583328, + "grad_norm": 0.6634111404418945, + "learning_rate": 8.149411352772672e-05, + "loss": 0.8279, + "step": 88660 + }, + { + "epoch": 0.5664873567330667, + "grad_norm": 0.7929425835609436, + "learning_rate": 8.149021618024823e-05, + "loss": 0.938, + "step": 88670 + }, + { + "epoch": 0.5665512438828054, + "grad_norm": 0.834792971611023, + "learning_rate": 8.148631851563602e-05, + "loss": 0.9483, + "step": 88680 + }, + { + "epoch": 0.5666151310325441, + "grad_norm": 0.9581144452095032, + "learning_rate": 8.148242053392937e-05, + "loss": 0.8688, + "step": 88690 + }, + { + "epoch": 0.5666790181822828, + "grad_norm": 1.2036710977554321, + "learning_rate": 8.147852223516747e-05, + "loss": 0.76, + "step": 88700 + }, + { + "epoch": 0.5667429053320215, + "grad_norm": 0.9013402462005615, + "learning_rate": 8.147462361938965e-05, + "loss": 0.7514, + "step": 88710 + }, + { + "epoch": 0.5668067924817602, + "grad_norm": 1.2912942171096802, + "learning_rate": 8.147072468663514e-05, + "loss": 0.934, + "step": 88720 + }, + { + "epoch": 0.566870679631499, + "grad_norm": 0.8134447336196899, + "learning_rate": 8.146682543694318e-05, + "loss": 1.0641, + "step": 88730 + }, + { + "epoch": 0.5669345667812377, + "grad_norm": 1.032091736793518, + "learning_rate": 8.14629258703531e-05, + "loss": 0.9206, + "step": 88740 + }, + { + "epoch": 0.5669984539309764, + "grad_norm": 0.8850687146186829, + "learning_rate": 8.145902598690411e-05, + "loss": 1.1147, + "step": 88750 + }, + { + "epoch": 0.5670623410807151, + "grad_norm": 1.0859354734420776, + "learning_rate": 8.145512578663553e-05, + "loss": 1.1992, + "step": 88760 + }, + { + "epoch": 0.5671262282304538, + "grad_norm": 1.4345630407333374, + "learning_rate": 8.14512252695866e-05, + "loss": 0.7953, + "step": 88770 + }, + { + "epoch": 0.5671901153801925, + "grad_norm": 0.8256250023841858, + "learning_rate": 
8.144732443579664e-05, + "loss": 0.8639, + "step": 88780 + }, + { + "epoch": 0.5672540025299311, + "grad_norm": 1.0019294023513794, + "learning_rate": 8.14434232853049e-05, + "loss": 0.9049, + "step": 88790 + }, + { + "epoch": 0.5673178896796698, + "grad_norm": 0.7967015504837036, + "learning_rate": 8.14395218181507e-05, + "loss": 1.0006, + "step": 88800 + }, + { + "epoch": 0.5673817768294085, + "grad_norm": 0.729104220867157, + "learning_rate": 8.143562003437331e-05, + "loss": 0.724, + "step": 88810 + }, + { + "epoch": 0.5674456639791472, + "grad_norm": 1.0281226634979248, + "learning_rate": 8.143171793401204e-05, + "loss": 0.7041, + "step": 88820 + }, + { + "epoch": 0.5675095511288859, + "grad_norm": 0.5173068046569824, + "learning_rate": 8.142781551710617e-05, + "loss": 0.7518, + "step": 88830 + }, + { + "epoch": 0.5675734382786246, + "grad_norm": 1.003184199333191, + "learning_rate": 8.1423912783695e-05, + "loss": 0.9193, + "step": 88840 + }, + { + "epoch": 0.5676373254283633, + "grad_norm": 0.8439940214157104, + "learning_rate": 8.142000973381787e-05, + "loss": 0.874, + "step": 88850 + }, + { + "epoch": 0.567701212578102, + "grad_norm": 1.0235188007354736, + "learning_rate": 8.141610636751405e-05, + "loss": 1.3299, + "step": 88860 + }, + { + "epoch": 0.5677650997278407, + "grad_norm": 1.1160435676574707, + "learning_rate": 8.141220268482284e-05, + "loss": 0.9362, + "step": 88870 + }, + { + "epoch": 0.5678289868775794, + "grad_norm": 0.9153828620910645, + "learning_rate": 8.140829868578359e-05, + "loss": 1.0991, + "step": 88880 + }, + { + "epoch": 0.5678928740273181, + "grad_norm": 0.7839625477790833, + "learning_rate": 8.140439437043558e-05, + "loss": 0.8622, + "step": 88890 + }, + { + "epoch": 0.5679567611770568, + "grad_norm": 0.9344704151153564, + "learning_rate": 8.140048973881817e-05, + "loss": 0.9697, + "step": 88900 + }, + { + "epoch": 0.5680206483267956, + "grad_norm": 1.2923128604888916, + "learning_rate": 8.139697529998467e-05, + "loss": 0.9158, + "step": 88910 + }, + { + "epoch": 0.5680845354765343, + "grad_norm": 0.7165302038192749, + "learning_rate": 8.139307006756369e-05, + "loss": 0.9479, + "step": 88920 + }, + { + "epoch": 0.568148422626273, + "grad_norm": 0.9380423426628113, + "learning_rate": 8.138916451898734e-05, + "loss": 1.0259, + "step": 88930 + }, + { + "epoch": 0.5682123097760117, + "grad_norm": 0.9337356686592102, + "learning_rate": 8.138525865429494e-05, + "loss": 0.7998, + "step": 88940 + }, + { + "epoch": 0.5682761969257504, + "grad_norm": 1.1728187799453735, + "learning_rate": 8.138135247352586e-05, + "loss": 0.7984, + "step": 88950 + }, + { + "epoch": 0.5683400840754891, + "grad_norm": 0.7271674871444702, + "learning_rate": 8.137744597671938e-05, + "loss": 0.8608, + "step": 88960 + }, + { + "epoch": 0.5684039712252278, + "grad_norm": 0.8207966089248657, + "learning_rate": 8.137353916391488e-05, + "loss": 0.8993, + "step": 88970 + }, + { + "epoch": 0.5684678583749665, + "grad_norm": 0.8019614815711975, + "learning_rate": 8.136963203515173e-05, + "loss": 0.9974, + "step": 88980 + }, + { + "epoch": 0.5685317455247052, + "grad_norm": 0.8686451315879822, + "learning_rate": 8.136572459046921e-05, + "loss": 0.7662, + "step": 88990 + }, + { + "epoch": 0.5685956326744439, + "grad_norm": 0.7033054232597351, + "learning_rate": 8.136181682990673e-05, + "loss": 0.9693, + "step": 89000 + }, + { + "epoch": 0.5686595198241826, + "grad_norm": 0.5672471523284912, + "learning_rate": 8.135790875350361e-05, + "loss": 0.8768, + "step": 89010 + }, + { + "epoch": 
0.5687234069739213, + "grad_norm": 0.8341061472892761, + "learning_rate": 8.135400036129923e-05, + "loss": 0.8212, + "step": 89020 + }, + { + "epoch": 0.5687872941236599, + "grad_norm": 0.6905550956726074, + "learning_rate": 8.135009165333294e-05, + "loss": 0.8526, + "step": 89030 + }, + { + "epoch": 0.5688511812733986, + "grad_norm": 1.169189453125, + "learning_rate": 8.134618262964409e-05, + "loss": 0.8095, + "step": 89040 + }, + { + "epoch": 0.5689150684231373, + "grad_norm": 0.8723841309547424, + "learning_rate": 8.134227329027208e-05, + "loss": 0.934, + "step": 89050 + }, + { + "epoch": 0.568978955572876, + "grad_norm": 0.7888420224189758, + "learning_rate": 8.133836363525626e-05, + "loss": 0.8145, + "step": 89060 + }, + { + "epoch": 0.5690428427226147, + "grad_norm": 0.774649441242218, + "learning_rate": 8.133445366463601e-05, + "loss": 0.8815, + "step": 89070 + }, + { + "epoch": 0.5691067298723534, + "grad_norm": 0.6284635066986084, + "learning_rate": 8.13305433784507e-05, + "loss": 0.8557, + "step": 89080 + }, + { + "epoch": 0.5691706170220922, + "grad_norm": 0.747380793094635, + "learning_rate": 8.132663277673971e-05, + "loss": 1.0254, + "step": 89090 + }, + { + "epoch": 0.5692345041718309, + "grad_norm": 0.6164722442626953, + "learning_rate": 8.132272185954243e-05, + "loss": 0.8931, + "step": 89100 + }, + { + "epoch": 0.5692983913215696, + "grad_norm": 1.5792529582977295, + "learning_rate": 8.131881062689823e-05, + "loss": 1.1431, + "step": 89110 + }, + { + "epoch": 0.5693622784713083, + "grad_norm": 0.8760352730751038, + "learning_rate": 8.131489907884653e-05, + "loss": 1.0246, + "step": 89120 + }, + { + "epoch": 0.569426165621047, + "grad_norm": 1.4259626865386963, + "learning_rate": 8.13109872154267e-05, + "loss": 0.7266, + "step": 89130 + }, + { + "epoch": 0.5694900527707857, + "grad_norm": 1.4645694494247437, + "learning_rate": 8.130707503667814e-05, + "loss": 0.645, + "step": 89140 + }, + { + "epoch": 0.5695539399205244, + "grad_norm": 1.1326792240142822, + "learning_rate": 8.130316254264024e-05, + "loss": 1.1472, + "step": 89150 + }, + { + "epoch": 0.5696178270702631, + "grad_norm": 0.9139853715896606, + "learning_rate": 8.129924973335243e-05, + "loss": 1.025, + "step": 89160 + }, + { + "epoch": 0.5696817142200018, + "grad_norm": 0.5680515766143799, + "learning_rate": 8.129533660885407e-05, + "loss": 0.9172, + "step": 89170 + }, + { + "epoch": 0.5697456013697405, + "grad_norm": 0.7290217876434326, + "learning_rate": 8.129142316918463e-05, + "loss": 0.9759, + "step": 89180 + }, + { + "epoch": 0.5698094885194792, + "grad_norm": 0.5047475099563599, + "learning_rate": 8.128750941438346e-05, + "loss": 0.9244, + "step": 89190 + }, + { + "epoch": 0.5698733756692179, + "grad_norm": 1.2016278505325317, + "learning_rate": 8.128359534449002e-05, + "loss": 0.6997, + "step": 89200 + }, + { + "epoch": 0.5699372628189566, + "grad_norm": 1.542084813117981, + "learning_rate": 8.127968095954371e-05, + "loss": 0.8523, + "step": 89210 + }, + { + "epoch": 0.5700011499686953, + "grad_norm": 0.9595821499824524, + "learning_rate": 8.127576625958394e-05, + "loss": 0.9706, + "step": 89220 + }, + { + "epoch": 0.570065037118434, + "grad_norm": 0.6322153210639954, + "learning_rate": 8.127185124465016e-05, + "loss": 0.7701, + "step": 89230 + }, + { + "epoch": 0.5701289242681727, + "grad_norm": 1.8529796600341797, + "learning_rate": 8.126793591478177e-05, + "loss": 0.877, + "step": 89240 + }, + { + "epoch": 0.5701928114179114, + "grad_norm": 0.6426035761833191, + "learning_rate": 8.126402027001822e-05, 
+ "loss": 0.9249, + "step": 89250 + }, + { + "epoch": 0.5702566985676502, + "grad_norm": 0.822106659412384, + "learning_rate": 8.126010431039895e-05, + "loss": 0.7386, + "step": 89260 + }, + { + "epoch": 0.5703205857173888, + "grad_norm": 1.577788233757019, + "learning_rate": 8.125618803596338e-05, + "loss": 0.9725, + "step": 89270 + }, + { + "epoch": 0.5703844728671275, + "grad_norm": 0.9563323259353638, + "learning_rate": 8.125227144675096e-05, + "loss": 0.6633, + "step": 89280 + }, + { + "epoch": 0.5704483600168662, + "grad_norm": 0.5990996956825256, + "learning_rate": 8.12483545428011e-05, + "loss": 0.969, + "step": 89290 + }, + { + "epoch": 0.5705122471666049, + "grad_norm": 0.8185253739356995, + "learning_rate": 8.124443732415331e-05, + "loss": 0.8085, + "step": 89300 + }, + { + "epoch": 0.5705761343163436, + "grad_norm": 1.3149315118789673, + "learning_rate": 8.124051979084699e-05, + "loss": 1.0496, + "step": 89310 + }, + { + "epoch": 0.5706400214660823, + "grad_norm": 0.9711151123046875, + "learning_rate": 8.123660194292162e-05, + "loss": 0.8151, + "step": 89320 + }, + { + "epoch": 0.570703908615821, + "grad_norm": 0.8611701726913452, + "learning_rate": 8.123268378041664e-05, + "loss": 1.007, + "step": 89330 + }, + { + "epoch": 0.5707677957655597, + "grad_norm": 0.8310745358467102, + "learning_rate": 8.122876530337151e-05, + "loss": 1.0438, + "step": 89340 + }, + { + "epoch": 0.5708316829152984, + "grad_norm": 1.0816930532455444, + "learning_rate": 8.12248465118257e-05, + "loss": 0.7444, + "step": 89350 + }, + { + "epoch": 0.5708955700650371, + "grad_norm": 0.9173716902732849, + "learning_rate": 8.122092740581867e-05, + "loss": 1.183, + "step": 89360 + }, + { + "epoch": 0.5709594572147758, + "grad_norm": 1.3235098123550415, + "learning_rate": 8.121700798538989e-05, + "loss": 0.9047, + "step": 89370 + }, + { + "epoch": 0.5710233443645145, + "grad_norm": 0.7508029341697693, + "learning_rate": 8.121308825057882e-05, + "loss": 1.086, + "step": 89380 + }, + { + "epoch": 0.5710872315142532, + "grad_norm": 0.840752124786377, + "learning_rate": 8.120916820142498e-05, + "loss": 1.1446, + "step": 89390 + }, + { + "epoch": 0.5711511186639919, + "grad_norm": 1.024584412574768, + "learning_rate": 8.12052478379678e-05, + "loss": 1.057, + "step": 89400 + }, + { + "epoch": 0.5712150058137306, + "grad_norm": 0.9996415972709656, + "learning_rate": 8.120132716024678e-05, + "loss": 0.9998, + "step": 89410 + }, + { + "epoch": 0.5712788929634693, + "grad_norm": 0.9148262143135071, + "learning_rate": 8.11974061683014e-05, + "loss": 1.3289, + "step": 89420 + }, + { + "epoch": 0.571342780113208, + "grad_norm": 1.0694867372512817, + "learning_rate": 8.119348486217116e-05, + "loss": 0.9789, + "step": 89430 + }, + { + "epoch": 0.5714066672629468, + "grad_norm": 0.6613714694976807, + "learning_rate": 8.118956324189553e-05, + "loss": 1.0028, + "step": 89440 + }, + { + "epoch": 0.5714705544126855, + "grad_norm": 0.981604814529419, + "learning_rate": 8.1185641307514e-05, + "loss": 0.9502, + "step": 89450 + }, + { + "epoch": 0.5715344415624242, + "grad_norm": 1.0735020637512207, + "learning_rate": 8.118171905906611e-05, + "loss": 0.7949, + "step": 89460 + }, + { + "epoch": 0.5715983287121629, + "grad_norm": 0.7923375368118286, + "learning_rate": 8.117779649659132e-05, + "loss": 0.7809, + "step": 89470 + }, + { + "epoch": 0.5716622158619016, + "grad_norm": 1.0053727626800537, + "learning_rate": 8.117387362012915e-05, + "loss": 0.772, + "step": 89480 + }, + { + "epoch": 0.5717261030116403, + "grad_norm": 
0.8714725375175476, + "learning_rate": 8.116995042971909e-05, + "loss": 1.0257, + "step": 89490 + }, + { + "epoch": 0.571789990161379, + "grad_norm": 0.8163152933120728, + "learning_rate": 8.116602692540069e-05, + "loss": 0.7977, + "step": 89500 + }, + { + "epoch": 0.5718538773111176, + "grad_norm": 0.49878737330436707, + "learning_rate": 8.116210310721342e-05, + "loss": 0.8128, + "step": 89510 + }, + { + "epoch": 0.5719177644608563, + "grad_norm": 0.6907072067260742, + "learning_rate": 8.115817897519682e-05, + "loss": 0.7639, + "step": 89520 + }, + { + "epoch": 0.571981651610595, + "grad_norm": 0.6646702885627747, + "learning_rate": 8.115425452939039e-05, + "loss": 0.8343, + "step": 89530 + }, + { + "epoch": 0.5720455387603337, + "grad_norm": 0.7955873608589172, + "learning_rate": 8.115032976983368e-05, + "loss": 0.8254, + "step": 89540 + }, + { + "epoch": 0.5721094259100724, + "grad_norm": 1.2668566703796387, + "learning_rate": 8.114640469656619e-05, + "loss": 0.879, + "step": 89550 + }, + { + "epoch": 0.5721733130598111, + "grad_norm": 0.8631924390792847, + "learning_rate": 8.114247930962746e-05, + "loss": 0.9851, + "step": 89560 + }, + { + "epoch": 0.5722372002095498, + "grad_norm": 1.4024226665496826, + "learning_rate": 8.113855360905702e-05, + "loss": 0.9293, + "step": 89570 + }, + { + "epoch": 0.5723010873592885, + "grad_norm": 0.826225996017456, + "learning_rate": 8.113462759489441e-05, + "loss": 0.8176, + "step": 89580 + }, + { + "epoch": 0.5723649745090272, + "grad_norm": 0.7401711344718933, + "learning_rate": 8.113070126717916e-05, + "loss": 0.7405, + "step": 89590 + }, + { + "epoch": 0.5724288616587659, + "grad_norm": 0.8626922369003296, + "learning_rate": 8.112677462595084e-05, + "loss": 0.9589, + "step": 89600 + }, + { + "epoch": 0.5724927488085046, + "grad_norm": 0.7971317172050476, + "learning_rate": 8.112284767124894e-05, + "loss": 0.9191, + "step": 89610 + }, + { + "epoch": 0.5725566359582434, + "grad_norm": 0.7105472683906555, + "learning_rate": 8.111892040311305e-05, + "loss": 1.0642, + "step": 89620 + }, + { + "epoch": 0.5726205231079821, + "grad_norm": 0.8470255732536316, + "learning_rate": 8.111499282158271e-05, + "loss": 0.9904, + "step": 89630 + }, + { + "epoch": 0.5726844102577208, + "grad_norm": 1.0502973794937134, + "learning_rate": 8.111106492669747e-05, + "loss": 0.7748, + "step": 89640 + }, + { + "epoch": 0.5727482974074595, + "grad_norm": 0.9017430543899536, + "learning_rate": 8.11071367184969e-05, + "loss": 1.0352, + "step": 89650 + }, + { + "epoch": 0.5728121845571982, + "grad_norm": 1.4443550109863281, + "learning_rate": 8.110320819702055e-05, + "loss": 0.7986, + "step": 89660 + }, + { + "epoch": 0.5728760717069369, + "grad_norm": 0.8041828274726868, + "learning_rate": 8.109927936230798e-05, + "loss": 0.8368, + "step": 89670 + }, + { + "epoch": 0.5729399588566756, + "grad_norm": 0.8347397446632385, + "learning_rate": 8.109535021439876e-05, + "loss": 1.0824, + "step": 89680 + }, + { + "epoch": 0.5730038460064143, + "grad_norm": 1.0206586122512817, + "learning_rate": 8.109142075333247e-05, + "loss": 0.9412, + "step": 89690 + }, + { + "epoch": 0.573067733156153, + "grad_norm": 1.7678121328353882, + "learning_rate": 8.108749097914867e-05, + "loss": 0.698, + "step": 89700 + }, + { + "epoch": 0.5731316203058917, + "grad_norm": 0.7423132658004761, + "learning_rate": 8.108356089188694e-05, + "loss": 1.1062, + "step": 89710 + }, + { + "epoch": 0.5731955074556304, + "grad_norm": 0.5420845746994019, + "learning_rate": 8.107963049158686e-05, + "loss": 0.9447, + 
"step": 89720 + }, + { + "epoch": 0.5732593946053691, + "grad_norm": 1.7676161527633667, + "learning_rate": 8.107569977828803e-05, + "loss": 0.9414, + "step": 89730 + }, + { + "epoch": 0.5733232817551078, + "grad_norm": 0.9517451524734497, + "learning_rate": 8.107176875203e-05, + "loss": 0.8824, + "step": 89740 + }, + { + "epoch": 0.5733871689048465, + "grad_norm": 3.0669705867767334, + "learning_rate": 8.106783741285237e-05, + "loss": 0.9844, + "step": 89750 + }, + { + "epoch": 0.5734510560545851, + "grad_norm": 1.1198365688323975, + "learning_rate": 8.106390576079477e-05, + "loss": 0.9505, + "step": 89760 + }, + { + "epoch": 0.5735149432043238, + "grad_norm": 0.8078299164772034, + "learning_rate": 8.105997379589675e-05, + "loss": 0.9776, + "step": 89770 + }, + { + "epoch": 0.5735788303540625, + "grad_norm": 2.656707286834717, + "learning_rate": 8.105604151819793e-05, + "loss": 0.6794, + "step": 89780 + }, + { + "epoch": 0.5736427175038012, + "grad_norm": 1.2671973705291748, + "learning_rate": 8.105210892773789e-05, + "loss": 0.7823, + "step": 89790 + }, + { + "epoch": 0.57370660465354, + "grad_norm": 0.7666161060333252, + "learning_rate": 8.104817602455626e-05, + "loss": 0.7727, + "step": 89800 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 1.4913362264633179, + "learning_rate": 8.104424280869263e-05, + "loss": 0.8733, + "step": 89810 + }, + { + "epoch": 0.5738343789530174, + "grad_norm": 1.1301679611206055, + "learning_rate": 8.104030928018662e-05, + "loss": 0.7014, + "step": 89820 + }, + { + "epoch": 0.5738982661027561, + "grad_norm": 1.294581413269043, + "learning_rate": 8.103637543907784e-05, + "loss": 1.1176, + "step": 89830 + }, + { + "epoch": 0.5739621532524948, + "grad_norm": 0.9880439639091492, + "learning_rate": 8.103244128540591e-05, + "loss": 1.1367, + "step": 89840 + }, + { + "epoch": 0.5740260404022335, + "grad_norm": 1.8410000801086426, + "learning_rate": 8.102850681921046e-05, + "loss": 0.9493, + "step": 89850 + }, + { + "epoch": 0.5740899275519722, + "grad_norm": 0.7882397770881653, + "learning_rate": 8.102457204053109e-05, + "loss": 0.9901, + "step": 89860 + }, + { + "epoch": 0.5741538147017109, + "grad_norm": 0.833531379699707, + "learning_rate": 8.102063694940745e-05, + "loss": 0.8304, + "step": 89870 + }, + { + "epoch": 0.5742177018514496, + "grad_norm": 0.7302635908126831, + "learning_rate": 8.101670154587915e-05, + "loss": 0.9241, + "step": 89880 + }, + { + "epoch": 0.5742815890011883, + "grad_norm": 0.7871063947677612, + "learning_rate": 8.101276582998583e-05, + "loss": 0.7141, + "step": 89890 + }, + { + "epoch": 0.574345476150927, + "grad_norm": 1.1040356159210205, + "learning_rate": 8.100882980176712e-05, + "loss": 0.8373, + "step": 89900 + }, + { + "epoch": 0.5744093633006657, + "grad_norm": 0.5926182866096497, + "learning_rate": 8.100489346126268e-05, + "loss": 0.6924, + "step": 89910 + }, + { + "epoch": 0.5744732504504044, + "grad_norm": 1.047753930091858, + "learning_rate": 8.100095680851214e-05, + "loss": 0.8614, + "step": 89920 + }, + { + "epoch": 0.5745371376001431, + "grad_norm": 0.9589722752571106, + "learning_rate": 8.099701984355514e-05, + "loss": 0.796, + "step": 89930 + }, + { + "epoch": 0.5746010247498818, + "grad_norm": 1.2964690923690796, + "learning_rate": 8.099308256643134e-05, + "loss": 0.7672, + "step": 89940 + }, + { + "epoch": 0.5746649118996205, + "grad_norm": 1.9228540658950806, + "learning_rate": 8.09891449771804e-05, + "loss": 0.8787, + "step": 89950 + }, + { + "epoch": 0.5747287990493593, + "grad_norm": 0.5643669962882996, + 
"learning_rate": 8.098520707584195e-05, + "loss": 0.855, + "step": 89960 + }, + { + "epoch": 0.574792686199098, + "grad_norm": 0.9459285736083984, + "learning_rate": 8.098126886245564e-05, + "loss": 0.8887, + "step": 89970 + }, + { + "epoch": 0.5748565733488367, + "grad_norm": 0.8743549585342407, + "learning_rate": 8.097733033706117e-05, + "loss": 1.0167, + "step": 89980 + }, + { + "epoch": 0.5749204604985754, + "grad_norm": 1.0637538433074951, + "learning_rate": 8.097339149969818e-05, + "loss": 0.76, + "step": 89990 + }, + { + "epoch": 0.574984347648314, + "grad_norm": 0.6457778215408325, + "learning_rate": 8.096945235040634e-05, + "loss": 0.8873, + "step": 90000 + }, + { + "epoch": 0.5750482347980527, + "grad_norm": 0.7007945775985718, + "learning_rate": 8.096551288922532e-05, + "loss": 0.6989, + "step": 90010 + }, + { + "epoch": 0.5751121219477914, + "grad_norm": 1.048057198524475, + "learning_rate": 8.096157311619479e-05, + "loss": 0.7702, + "step": 90020 + }, + { + "epoch": 0.5751760090975301, + "grad_norm": 0.8541986346244812, + "learning_rate": 8.095763303135444e-05, + "loss": 0.996, + "step": 90030 + }, + { + "epoch": 0.5752398962472688, + "grad_norm": 1.4196783304214478, + "learning_rate": 8.095369263474396e-05, + "loss": 0.5537, + "step": 90040 + }, + { + "epoch": 0.5753037833970075, + "grad_norm": 0.8951913118362427, + "learning_rate": 8.094975192640299e-05, + "loss": 0.6825, + "step": 90050 + }, + { + "epoch": 0.5753676705467462, + "grad_norm": 0.6169331669807434, + "learning_rate": 8.094581090637127e-05, + "loss": 0.8536, + "step": 90060 + }, + { + "epoch": 0.5754315576964849, + "grad_norm": 0.7136598825454712, + "learning_rate": 8.094186957468843e-05, + "loss": 0.9564, + "step": 90070 + }, + { + "epoch": 0.5754954448462236, + "grad_norm": 1.0115174055099487, + "learning_rate": 8.093792793139421e-05, + "loss": 0.7456, + "step": 90080 + }, + { + "epoch": 0.5755593319959623, + "grad_norm": 0.7271766662597656, + "learning_rate": 8.09339859765283e-05, + "loss": 0.8566, + "step": 90090 + }, + { + "epoch": 0.575623219145701, + "grad_norm": 0.870293140411377, + "learning_rate": 8.093004371013038e-05, + "loss": 1.0552, + "step": 90100 + }, + { + "epoch": 0.5756871062954397, + "grad_norm": 0.8186811208724976, + "learning_rate": 8.092610113224017e-05, + "loss": 0.7465, + "step": 90110 + }, + { + "epoch": 0.5757509934451784, + "grad_norm": 0.918304979801178, + "learning_rate": 8.092215824289735e-05, + "loss": 0.8869, + "step": 90120 + }, + { + "epoch": 0.5758148805949171, + "grad_norm": 0.9499895572662354, + "learning_rate": 8.091821504214166e-05, + "loss": 0.8331, + "step": 90130 + }, + { + "epoch": 0.5758787677446559, + "grad_norm": 1.6564801931381226, + "learning_rate": 8.091427153001278e-05, + "loss": 0.9647, + "step": 90140 + }, + { + "epoch": 0.5759426548943946, + "grad_norm": 1.2745451927185059, + "learning_rate": 8.091032770655048e-05, + "loss": 0.7745, + "step": 90150 + }, + { + "epoch": 0.5760065420441333, + "grad_norm": 1.276982069015503, + "learning_rate": 8.09063835717944e-05, + "loss": 1.0932, + "step": 90160 + }, + { + "epoch": 0.576070429193872, + "grad_norm": 1.1688791513442993, + "learning_rate": 8.09024391257843e-05, + "loss": 0.8497, + "step": 90170 + }, + { + "epoch": 0.5761343163436107, + "grad_norm": 0.6883856058120728, + "learning_rate": 8.089849436855992e-05, + "loss": 0.8468, + "step": 90180 + }, + { + "epoch": 0.5761982034933494, + "grad_norm": 1.2166844606399536, + "learning_rate": 8.089454930016095e-05, + "loss": 0.9501, + "step": 90190 + }, + { + "epoch": 
0.5762620906430881, + "grad_norm": 0.9265638589859009, + "learning_rate": 8.089060392062718e-05, + "loss": 0.828, + "step": 90200 + }, + { + "epoch": 0.5763259777928268, + "grad_norm": 1.2245463132858276, + "learning_rate": 8.088665822999827e-05, + "loss": 1.0158, + "step": 90210 + }, + { + "epoch": 0.5763898649425655, + "grad_norm": 1.1771318912506104, + "learning_rate": 8.088271222831401e-05, + "loss": 0.8838, + "step": 90220 + }, + { + "epoch": 0.5764537520923042, + "grad_norm": 0.6416171193122864, + "learning_rate": 8.08787659156141e-05, + "loss": 0.8364, + "step": 90230 + }, + { + "epoch": 0.5765176392420428, + "grad_norm": 0.7262217998504639, + "learning_rate": 8.087481929193831e-05, + "loss": 0.8772, + "step": 90240 + }, + { + "epoch": 0.5765815263917815, + "grad_norm": 1.0803191661834717, + "learning_rate": 8.08708723573264e-05, + "loss": 0.9563, + "step": 90250 + }, + { + "epoch": 0.5766454135415202, + "grad_norm": 0.8832546472549438, + "learning_rate": 8.086692511181806e-05, + "loss": 0.9067, + "step": 90260 + }, + { + "epoch": 0.5767093006912589, + "grad_norm": 0.8966745138168335, + "learning_rate": 8.086297755545312e-05, + "loss": 1.0014, + "step": 90270 + }, + { + "epoch": 0.5767731878409976, + "grad_norm": 1.051358938217163, + "learning_rate": 8.085902968827128e-05, + "loss": 0.8469, + "step": 90280 + }, + { + "epoch": 0.5768370749907363, + "grad_norm": 0.5282111763954163, + "learning_rate": 8.085508151031232e-05, + "loss": 0.7655, + "step": 90290 + }, + { + "epoch": 0.576900962140475, + "grad_norm": 0.9313019514083862, + "learning_rate": 8.085113302161598e-05, + "loss": 0.8174, + "step": 90300 + }, + { + "epoch": 0.5769648492902137, + "grad_norm": 1.0769340991973877, + "learning_rate": 8.084718422222205e-05, + "loss": 0.6471, + "step": 90310 + }, + { + "epoch": 0.5770287364399524, + "grad_norm": 0.8049689531326294, + "learning_rate": 8.084323511217029e-05, + "loss": 0.9863, + "step": 90320 + }, + { + "epoch": 0.5770926235896912, + "grad_norm": 1.1444233655929565, + "learning_rate": 8.083928569150045e-05, + "loss": 0.7216, + "step": 90330 + }, + { + "epoch": 0.5771565107394299, + "grad_norm": 0.6875047087669373, + "learning_rate": 8.083533596025234e-05, + "loss": 0.8915, + "step": 90340 + }, + { + "epoch": 0.5772203978891686, + "grad_norm": 0.9097625613212585, + "learning_rate": 8.083138591846574e-05, + "loss": 0.7636, + "step": 90350 + }, + { + "epoch": 0.5772842850389073, + "grad_norm": 1.2456170320510864, + "learning_rate": 8.082743556618038e-05, + "loss": 0.8581, + "step": 90360 + }, + { + "epoch": 0.577348172188646, + "grad_norm": 0.8649427890777588, + "learning_rate": 8.082348490343608e-05, + "loss": 1.0361, + "step": 90370 + }, + { + "epoch": 0.5774120593383847, + "grad_norm": 2.0383450984954834, + "learning_rate": 8.081953393027263e-05, + "loss": 0.7167, + "step": 90380 + }, + { + "epoch": 0.5774759464881234, + "grad_norm": 1.4382243156433105, + "learning_rate": 8.081558264672982e-05, + "loss": 0.881, + "step": 90390 + }, + { + "epoch": 0.5775398336378621, + "grad_norm": 0.6351116895675659, + "learning_rate": 8.081163105284741e-05, + "loss": 0.7479, + "step": 90400 + }, + { + "epoch": 0.5776037207876008, + "grad_norm": 0.8547778129577637, + "learning_rate": 8.080767914866523e-05, + "loss": 0.9155, + "step": 90410 + }, + { + "epoch": 0.5776676079373395, + "grad_norm": 0.6154083609580994, + "learning_rate": 8.080372693422307e-05, + "loss": 0.7414, + "step": 90420 + }, + { + "epoch": 0.5777314950870782, + "grad_norm": 0.8668635487556458, + "learning_rate": 
8.079977440956073e-05, + "loss": 0.6981, + "step": 90430 + }, + { + "epoch": 0.5777953822368169, + "grad_norm": 0.6722155809402466, + "learning_rate": 8.079582157471801e-05, + "loss": 0.9792, + "step": 90440 + }, + { + "epoch": 0.5778592693865556, + "grad_norm": 0.5810309648513794, + "learning_rate": 8.079186842973473e-05, + "loss": 0.5862, + "step": 90450 + }, + { + "epoch": 0.5779231565362943, + "grad_norm": 1.0260207653045654, + "learning_rate": 8.07879149746507e-05, + "loss": 0.803, + "step": 90460 + }, + { + "epoch": 0.577987043686033, + "grad_norm": 0.8295899033546448, + "learning_rate": 8.078396120950572e-05, + "loss": 0.955, + "step": 90470 + }, + { + "epoch": 0.5780509308357717, + "grad_norm": 0.7317106127738953, + "learning_rate": 8.078000713433962e-05, + "loss": 1.0108, + "step": 90480 + }, + { + "epoch": 0.5781148179855103, + "grad_norm": 0.925954282283783, + "learning_rate": 8.077605274919224e-05, + "loss": 0.675, + "step": 90490 + }, + { + "epoch": 0.578178705135249, + "grad_norm": 1.5816576480865479, + "learning_rate": 8.077209805410336e-05, + "loss": 0.802, + "step": 90500 + }, + { + "epoch": 0.5782425922849878, + "grad_norm": 1.3754863739013672, + "learning_rate": 8.076814304911285e-05, + "loss": 0.7063, + "step": 90510 + }, + { + "epoch": 0.5783064794347265, + "grad_norm": 1.1097428798675537, + "learning_rate": 8.076418773426051e-05, + "loss": 0.8896, + "step": 90520 + }, + { + "epoch": 0.5783703665844652, + "grad_norm": 0.9202744364738464, + "learning_rate": 8.076023210958618e-05, + "loss": 1.1369, + "step": 90530 + }, + { + "epoch": 0.5784342537342039, + "grad_norm": 0.8386440873146057, + "learning_rate": 8.07562761751297e-05, + "loss": 1.1063, + "step": 90540 + }, + { + "epoch": 0.5784981408839426, + "grad_norm": 1.387734293937683, + "learning_rate": 8.075231993093093e-05, + "loss": 0.8543, + "step": 90550 + }, + { + "epoch": 0.5785620280336813, + "grad_norm": 1.1072419881820679, + "learning_rate": 8.074836337702969e-05, + "loss": 1.1178, + "step": 90560 + }, + { + "epoch": 0.57862591518342, + "grad_norm": 0.7916972041130066, + "learning_rate": 8.074440651346582e-05, + "loss": 1.2825, + "step": 90570 + }, + { + "epoch": 0.5786898023331587, + "grad_norm": 1.407332181930542, + "learning_rate": 8.074044934027918e-05, + "loss": 0.9507, + "step": 90580 + }, + { + "epoch": 0.5787536894828974, + "grad_norm": 0.8653108477592468, + "learning_rate": 8.073649185750962e-05, + "loss": 0.7288, + "step": 90590 + }, + { + "epoch": 0.5788175766326361, + "grad_norm": 1.2476141452789307, + "learning_rate": 8.073253406519699e-05, + "loss": 0.8561, + "step": 90600 + }, + { + "epoch": 0.5788814637823748, + "grad_norm": 0.8226998448371887, + "learning_rate": 8.072857596338116e-05, + "loss": 0.8944, + "step": 90610 + }, + { + "epoch": 0.5789453509321135, + "grad_norm": 0.7982886433601379, + "learning_rate": 8.0724617552102e-05, + "loss": 0.8097, + "step": 90620 + }, + { + "epoch": 0.5790092380818522, + "grad_norm": 0.7011058926582336, + "learning_rate": 8.072065883139935e-05, + "loss": 1.0291, + "step": 90630 + }, + { + "epoch": 0.5790731252315909, + "grad_norm": 1.2704604864120483, + "learning_rate": 8.071669980131307e-05, + "loss": 0.893, + "step": 90640 + }, + { + "epoch": 0.5791370123813296, + "grad_norm": 0.7645861506462097, + "learning_rate": 8.071274046188306e-05, + "loss": 0.6751, + "step": 90650 + }, + { + "epoch": 0.5792008995310683, + "grad_norm": 3.0247256755828857, + "learning_rate": 8.07087808131492e-05, + "loss": 0.7969, + "step": 90660 + }, + { + "epoch": 0.579264786680807, + 
"grad_norm": 1.2994235754013062, + "learning_rate": 8.070482085515134e-05, + "loss": 1.3255, + "step": 90670 + }, + { + "epoch": 0.5793286738305458, + "grad_norm": 1.6243011951446533, + "learning_rate": 8.070086058792937e-05, + "loss": 0.8921, + "step": 90680 + }, + { + "epoch": 0.5793925609802845, + "grad_norm": 0.907557487487793, + "learning_rate": 8.069690001152317e-05, + "loss": 0.8408, + "step": 90690 + }, + { + "epoch": 0.5794564481300232, + "grad_norm": 0.7467745542526245, + "learning_rate": 8.069293912597263e-05, + "loss": 0.881, + "step": 90700 + }, + { + "epoch": 0.5795203352797619, + "grad_norm": 0.7291324734687805, + "learning_rate": 8.068897793131764e-05, + "loss": 1.0837, + "step": 90710 + }, + { + "epoch": 0.5795842224295006, + "grad_norm": 0.7500112056732178, + "learning_rate": 8.068501642759811e-05, + "loss": 0.6602, + "step": 90720 + }, + { + "epoch": 0.5796481095792392, + "grad_norm": 0.3867965042591095, + "learning_rate": 8.068105461485391e-05, + "loss": 0.7279, + "step": 90730 + }, + { + "epoch": 0.5797119967289779, + "grad_norm": 0.6861584186553955, + "learning_rate": 8.067709249312494e-05, + "loss": 0.924, + "step": 90740 + }, + { + "epoch": 0.5797758838787166, + "grad_norm": 0.9725950956344604, + "learning_rate": 8.06731300624511e-05, + "loss": 0.686, + "step": 90750 + }, + { + "epoch": 0.5798397710284553, + "grad_norm": 0.7137267589569092, + "learning_rate": 8.066916732287232e-05, + "loss": 0.7585, + "step": 90760 + }, + { + "epoch": 0.579903658178194, + "grad_norm": 0.8141860961914062, + "learning_rate": 8.06652042744285e-05, + "loss": 1.2448, + "step": 90770 + }, + { + "epoch": 0.5799675453279327, + "grad_norm": 0.9452531337738037, + "learning_rate": 8.066124091715952e-05, + "loss": 0.8939, + "step": 90780 + }, + { + "epoch": 0.5800314324776714, + "grad_norm": 0.8053810596466064, + "learning_rate": 8.065727725110533e-05, + "loss": 0.8234, + "step": 90790 + }, + { + "epoch": 0.5800953196274101, + "grad_norm": 0.8168609142303467, + "learning_rate": 8.065331327630585e-05, + "loss": 1.1362, + "step": 90800 + }, + { + "epoch": 0.5801592067771488, + "grad_norm": 0.7657856941223145, + "learning_rate": 8.064934899280096e-05, + "loss": 0.9269, + "step": 90810 + }, + { + "epoch": 0.5802230939268875, + "grad_norm": 1.230660319328308, + "learning_rate": 8.064538440063063e-05, + "loss": 0.7815, + "step": 90820 + }, + { + "epoch": 0.5802869810766262, + "grad_norm": 0.7919756770133972, + "learning_rate": 8.064141949983476e-05, + "loss": 0.7874, + "step": 90830 + }, + { + "epoch": 0.580350868226365, + "grad_norm": 0.6535720229148865, + "learning_rate": 8.063745429045329e-05, + "loss": 0.7333, + "step": 90840 + }, + { + "epoch": 0.5804147553761037, + "grad_norm": 0.8120725750923157, + "learning_rate": 8.063348877252614e-05, + "loss": 0.7505, + "step": 90850 + }, + { + "epoch": 0.5804786425258424, + "grad_norm": 0.7102304697036743, + "learning_rate": 8.062952294609327e-05, + "loss": 0.6825, + "step": 90860 + }, + { + "epoch": 0.5805425296755811, + "grad_norm": 0.9454075694084167, + "learning_rate": 8.062555681119459e-05, + "loss": 0.6988, + "step": 90870 + }, + { + "epoch": 0.5806064168253198, + "grad_norm": 1.1664451360702515, + "learning_rate": 8.062159036787007e-05, + "loss": 0.8987, + "step": 90880 + }, + { + "epoch": 0.5806703039750585, + "grad_norm": 1.2764151096343994, + "learning_rate": 8.061762361615964e-05, + "loss": 0.9188, + "step": 90890 + }, + { + "epoch": 0.5807341911247972, + "grad_norm": 1.244565725326538, + "learning_rate": 8.061365655610325e-05, + "loss": 
1.1752, + "step": 90900 + }, + { + "epoch": 0.5807980782745359, + "grad_norm": 0.8151182532310486, + "learning_rate": 8.060968918774085e-05, + "loss": 0.8404, + "step": 90910 + }, + { + "epoch": 0.5808619654242746, + "grad_norm": 0.8376042246818542, + "learning_rate": 8.06057215111124e-05, + "loss": 0.8567, + "step": 90920 + }, + { + "epoch": 0.5809258525740133, + "grad_norm": 1.4422600269317627, + "learning_rate": 8.060175352625787e-05, + "loss": 0.9805, + "step": 90930 + }, + { + "epoch": 0.580989739723752, + "grad_norm": 0.6964714527130127, + "learning_rate": 8.05977852332172e-05, + "loss": 0.6621, + "step": 90940 + }, + { + "epoch": 0.5810536268734907, + "grad_norm": 0.9028936624526978, + "learning_rate": 8.059381663203036e-05, + "loss": 0.9439, + "step": 90950 + }, + { + "epoch": 0.5811175140232294, + "grad_norm": 1.128549575805664, + "learning_rate": 8.058984772273733e-05, + "loss": 0.7059, + "step": 90960 + }, + { + "epoch": 0.581181401172968, + "grad_norm": 1.0536413192749023, + "learning_rate": 8.058587850537804e-05, + "loss": 0.8569, + "step": 90970 + }, + { + "epoch": 0.5812452883227067, + "grad_norm": 0.8410016894340515, + "learning_rate": 8.058190897999252e-05, + "loss": 0.9881, + "step": 90980 + }, + { + "epoch": 0.5813091754724454, + "grad_norm": 0.5887959003448486, + "learning_rate": 8.057793914662071e-05, + "loss": 1.2143, + "step": 90990 + }, + { + "epoch": 0.5813730626221841, + "grad_norm": 0.9902825951576233, + "learning_rate": 8.057396900530261e-05, + "loss": 1.0165, + "step": 91000 + }, + { + "epoch": 0.5814369497719228, + "grad_norm": 0.7618111371994019, + "learning_rate": 8.056999855607819e-05, + "loss": 0.9192, + "step": 91010 + }, + { + "epoch": 0.5815008369216615, + "grad_norm": 1.465938925743103, + "learning_rate": 8.056602779898742e-05, + "loss": 1.0972, + "step": 91020 + }, + { + "epoch": 0.5815647240714003, + "grad_norm": 0.6929851770401001, + "learning_rate": 8.056205673407031e-05, + "loss": 0.8941, + "step": 91030 + }, + { + "epoch": 0.581628611221139, + "grad_norm": 0.7662091851234436, + "learning_rate": 8.055808536136687e-05, + "loss": 1.1148, + "step": 91040 + }, + { + "epoch": 0.5816924983708777, + "grad_norm": 1.1561191082000732, + "learning_rate": 8.055411368091706e-05, + "loss": 0.9246, + "step": 91050 + }, + { + "epoch": 0.5817563855206164, + "grad_norm": 1.0664466619491577, + "learning_rate": 8.05501416927609e-05, + "loss": 1.0493, + "step": 91060 + }, + { + "epoch": 0.5818202726703551, + "grad_norm": 1.0375691652297974, + "learning_rate": 8.054616939693837e-05, + "loss": 0.8237, + "step": 91070 + }, + { + "epoch": 0.5818841598200938, + "grad_norm": 1.251013159751892, + "learning_rate": 8.054219679348949e-05, + "loss": 0.9984, + "step": 91080 + }, + { + "epoch": 0.5819480469698325, + "grad_norm": 0.7131451964378357, + "learning_rate": 8.053822388245426e-05, + "loss": 0.8158, + "step": 91090 + }, + { + "epoch": 0.5820119341195712, + "grad_norm": 0.6554450988769531, + "learning_rate": 8.053425066387271e-05, + "loss": 0.988, + "step": 91100 + }, + { + "epoch": 0.5820758212693099, + "grad_norm": 0.5957306623458862, + "learning_rate": 8.053027713778484e-05, + "loss": 0.8844, + "step": 91110 + }, + { + "epoch": 0.5821397084190486, + "grad_norm": 2.745039463043213, + "learning_rate": 8.052630330423066e-05, + "loss": 0.8555, + "step": 91120 + }, + { + "epoch": 0.5822035955687873, + "grad_norm": 1.19644033908844, + "learning_rate": 8.05223291632502e-05, + "loss": 1.1978, + "step": 91130 + }, + { + "epoch": 0.582267482718526, + "grad_norm": 
0.7143746614456177, + "learning_rate": 8.051835471488347e-05, + "loss": 1.0662, + "step": 91140 + }, + { + "epoch": 0.5823313698682647, + "grad_norm": 0.6921029686927795, + "learning_rate": 8.051437995917051e-05, + "loss": 0.8375, + "step": 91150 + }, + { + "epoch": 0.5823952570180034, + "grad_norm": 1.4767210483551025, + "learning_rate": 8.051040489615136e-05, + "loss": 0.9358, + "step": 91160 + }, + { + "epoch": 0.5824591441677421, + "grad_norm": 1.0952929258346558, + "learning_rate": 8.050642952586602e-05, + "loss": 0.8761, + "step": 91170 + }, + { + "epoch": 0.5825230313174808, + "grad_norm": 0.7604880332946777, + "learning_rate": 8.050245384835455e-05, + "loss": 0.9666, + "step": 91180 + }, + { + "epoch": 0.5825869184672195, + "grad_norm": 0.6385078430175781, + "learning_rate": 8.049847786365698e-05, + "loss": 1.0319, + "step": 91190 + }, + { + "epoch": 0.5826508056169583, + "grad_norm": 0.8328523635864258, + "learning_rate": 8.049450157181336e-05, + "loss": 0.6776, + "step": 91200 + }, + { + "epoch": 0.5827146927666969, + "grad_norm": 0.9883635640144348, + "learning_rate": 8.049052497286372e-05, + "loss": 0.8049, + "step": 91210 + }, + { + "epoch": 0.5827785799164356, + "grad_norm": 0.9479039907455444, + "learning_rate": 8.048654806684812e-05, + "loss": 0.8743, + "step": 91220 + }, + { + "epoch": 0.5828424670661743, + "grad_norm": 1.1088539361953735, + "learning_rate": 8.048257085380659e-05, + "loss": 0.9272, + "step": 91230 + }, + { + "epoch": 0.582906354215913, + "grad_norm": 0.6845352053642273, + "learning_rate": 8.047859333377923e-05, + "loss": 0.9951, + "step": 91240 + }, + { + "epoch": 0.5829702413656517, + "grad_norm": 1.5272711515426636, + "learning_rate": 8.047461550680606e-05, + "loss": 0.8264, + "step": 91250 + }, + { + "epoch": 0.5830341285153904, + "grad_norm": 0.7585494518280029, + "learning_rate": 8.047063737292712e-05, + "loss": 0.7845, + "step": 91260 + }, + { + "epoch": 0.5830980156651291, + "grad_norm": 0.8442081809043884, + "learning_rate": 8.046665893218253e-05, + "loss": 0.8805, + "step": 91270 + }, + { + "epoch": 0.5831619028148678, + "grad_norm": 0.7809866666793823, + "learning_rate": 8.046268018461232e-05, + "loss": 0.8469, + "step": 91280 + }, + { + "epoch": 0.5832257899646065, + "grad_norm": 0.5992255806922913, + "learning_rate": 8.045870113025655e-05, + "loss": 0.9114, + "step": 91290 + }, + { + "epoch": 0.5832896771143452, + "grad_norm": 0.9385843873023987, + "learning_rate": 8.045472176915533e-05, + "loss": 0.9167, + "step": 91300 + }, + { + "epoch": 0.5833535642640839, + "grad_norm": 0.6832097172737122, + "learning_rate": 8.04507421013487e-05, + "loss": 1.0817, + "step": 91310 + }, + { + "epoch": 0.5834174514138226, + "grad_norm": 0.5917803049087524, + "learning_rate": 8.044676212687677e-05, + "loss": 0.8989, + "step": 91320 + }, + { + "epoch": 0.5834813385635613, + "grad_norm": 0.6751442551612854, + "learning_rate": 8.04427818457796e-05, + "loss": 0.8093, + "step": 91330 + }, + { + "epoch": 0.5835452257133, + "grad_norm": 0.5237501859664917, + "learning_rate": 8.043880125809727e-05, + "loss": 0.8325, + "step": 91340 + }, + { + "epoch": 0.5836091128630387, + "grad_norm": 1.9701563119888306, + "learning_rate": 8.043482036386989e-05, + "loss": 0.8434, + "step": 91350 + }, + { + "epoch": 0.5836730000127774, + "grad_norm": 1.0748164653778076, + "learning_rate": 8.043083916313752e-05, + "loss": 0.8942, + "step": 91360 + }, + { + "epoch": 0.5837368871625161, + "grad_norm": 0.7747710347175598, + "learning_rate": 8.042685765594029e-05, + "loss": 0.7678, + 
"step": 91370 + }, + { + "epoch": 0.5838007743122549, + "grad_norm": 1.0838667154312134, + "learning_rate": 8.042287584231828e-05, + "loss": 0.9147, + "step": 91380 + }, + { + "epoch": 0.5838646614619936, + "grad_norm": 1.229852557182312, + "learning_rate": 8.041889372231159e-05, + "loss": 1.0037, + "step": 91390 + }, + { + "epoch": 0.5839285486117323, + "grad_norm": 1.2635694742202759, + "learning_rate": 8.041491129596032e-05, + "loss": 0.8378, + "step": 91400 + }, + { + "epoch": 0.583992435761471, + "grad_norm": 1.1819652318954468, + "learning_rate": 8.041092856330457e-05, + "loss": 0.7489, + "step": 91410 + }, + { + "epoch": 0.5840563229112097, + "grad_norm": 1.0836447477340698, + "learning_rate": 8.040694552438448e-05, + "loss": 0.9781, + "step": 91420 + }, + { + "epoch": 0.5841202100609484, + "grad_norm": 2.449270248413086, + "learning_rate": 8.040296217924014e-05, + "loss": 0.9975, + "step": 91430 + }, + { + "epoch": 0.5841840972106871, + "grad_norm": 0.9335359334945679, + "learning_rate": 8.039897852791167e-05, + "loss": 0.8676, + "step": 91440 + }, + { + "epoch": 0.5842479843604258, + "grad_norm": 1.0198067426681519, + "learning_rate": 8.039499457043918e-05, + "loss": 0.9543, + "step": 91450 + }, + { + "epoch": 0.5843118715101644, + "grad_norm": 0.7770729660987854, + "learning_rate": 8.03910103068628e-05, + "loss": 0.8293, + "step": 91460 + }, + { + "epoch": 0.5843757586599031, + "grad_norm": 0.9211755990982056, + "learning_rate": 8.038702573722266e-05, + "loss": 0.9459, + "step": 91470 + }, + { + "epoch": 0.5844396458096418, + "grad_norm": 0.6153119802474976, + "learning_rate": 8.038304086155887e-05, + "loss": 0.9555, + "step": 91480 + }, + { + "epoch": 0.5845035329593805, + "grad_norm": 0.5979563593864441, + "learning_rate": 8.037905567991158e-05, + "loss": 0.906, + "step": 91490 + }, + { + "epoch": 0.5845674201091192, + "grad_norm": 0.7104209065437317, + "learning_rate": 8.037507019232091e-05, + "loss": 0.7133, + "step": 91500 + }, + { + "epoch": 0.5846313072588579, + "grad_norm": 0.8748192191123962, + "learning_rate": 8.037108439882702e-05, + "loss": 0.7568, + "step": 91510 + }, + { + "epoch": 0.5846951944085966, + "grad_norm": 0.9937753081321716, + "learning_rate": 8.036709829947003e-05, + "loss": 0.7123, + "step": 91520 + }, + { + "epoch": 0.5847590815583353, + "grad_norm": 1.2634817361831665, + "learning_rate": 8.036311189429009e-05, + "loss": 0.9345, + "step": 91530 + }, + { + "epoch": 0.584822968708074, + "grad_norm": 0.8244264721870422, + "learning_rate": 8.035912518332733e-05, + "loss": 1.0218, + "step": 91540 + }, + { + "epoch": 0.5848868558578127, + "grad_norm": 1.2409876585006714, + "learning_rate": 8.035513816662194e-05, + "loss": 0.9595, + "step": 91550 + }, + { + "epoch": 0.5849507430075515, + "grad_norm": 0.9279502034187317, + "learning_rate": 8.035115084421404e-05, + "loss": 0.7282, + "step": 91560 + }, + { + "epoch": 0.5850146301572902, + "grad_norm": 1.0995663404464722, + "learning_rate": 8.034716321614377e-05, + "loss": 1.0674, + "step": 91570 + }, + { + "epoch": 0.5850785173070289, + "grad_norm": 1.0315444469451904, + "learning_rate": 8.034317528245134e-05, + "loss": 0.8631, + "step": 91580 + }, + { + "epoch": 0.5851424044567676, + "grad_norm": 0.8768134117126465, + "learning_rate": 8.033918704317686e-05, + "loss": 0.7553, + "step": 91590 + }, + { + "epoch": 0.5852062916065063, + "grad_norm": 1.049591064453125, + "learning_rate": 8.033519849836055e-05, + "loss": 0.979, + "step": 91600 + }, + { + "epoch": 0.585270178756245, + "grad_norm": 0.7832081317901611, + 
"learning_rate": 8.033120964804252e-05, + "loss": 0.8019, + "step": 91610 + }, + { + "epoch": 0.5853340659059837, + "grad_norm": 1.217409372329712, + "learning_rate": 8.0327220492263e-05, + "loss": 0.96, + "step": 91620 + }, + { + "epoch": 0.5853979530557224, + "grad_norm": 1.7218462228775024, + "learning_rate": 8.03232310310621e-05, + "loss": 1.3111, + "step": 91630 + }, + { + "epoch": 0.5854618402054611, + "grad_norm": 0.9196959733963013, + "learning_rate": 8.031924126448005e-05, + "loss": 0.9832, + "step": 91640 + }, + { + "epoch": 0.5855257273551998, + "grad_norm": 0.9768834114074707, + "learning_rate": 8.031525119255701e-05, + "loss": 0.9437, + "step": 91650 + }, + { + "epoch": 0.5855896145049385, + "grad_norm": 0.707797646522522, + "learning_rate": 8.031126081533315e-05, + "loss": 0.7751, + "step": 91660 + }, + { + "epoch": 0.5856535016546772, + "grad_norm": 3.0832972526550293, + "learning_rate": 8.030727013284868e-05, + "loss": 0.8823, + "step": 91670 + }, + { + "epoch": 0.5857173888044159, + "grad_norm": 0.8001325130462646, + "learning_rate": 8.030327914514377e-05, + "loss": 0.9358, + "step": 91680 + }, + { + "epoch": 0.5857812759541546, + "grad_norm": 2.1798999309539795, + "learning_rate": 8.029928785225864e-05, + "loss": 0.8331, + "step": 91690 + }, + { + "epoch": 0.5858451631038932, + "grad_norm": 0.8021385669708252, + "learning_rate": 8.029529625423347e-05, + "loss": 0.9211, + "step": 91700 + }, + { + "epoch": 0.5859090502536319, + "grad_norm": 0.7000755071640015, + "learning_rate": 8.029130435110844e-05, + "loss": 0.8239, + "step": 91710 + }, + { + "epoch": 0.5859729374033706, + "grad_norm": 0.9345866441726685, + "learning_rate": 8.028731214292377e-05, + "loss": 0.8656, + "step": 91720 + }, + { + "epoch": 0.5860368245531093, + "grad_norm": 0.9513382911682129, + "learning_rate": 8.02833196297197e-05, + "loss": 0.8464, + "step": 91730 + }, + { + "epoch": 0.586100711702848, + "grad_norm": 1.2676148414611816, + "learning_rate": 8.027932681153636e-05, + "loss": 0.9357, + "step": 91740 + }, + { + "epoch": 0.5861645988525868, + "grad_norm": 0.7879144549369812, + "learning_rate": 8.027533368841402e-05, + "loss": 1.1884, + "step": 91750 + }, + { + "epoch": 0.5862284860023255, + "grad_norm": 0.6441530585289001, + "learning_rate": 8.027134026039288e-05, + "loss": 1.0553, + "step": 91760 + }, + { + "epoch": 0.5862923731520642, + "grad_norm": 2.0362207889556885, + "learning_rate": 8.026734652751316e-05, + "loss": 0.751, + "step": 91770 + }, + { + "epoch": 0.5863562603018029, + "grad_norm": 0.9429267644882202, + "learning_rate": 8.026335248981506e-05, + "loss": 0.7756, + "step": 91780 + }, + { + "epoch": 0.5864201474515416, + "grad_norm": 1.2869027853012085, + "learning_rate": 8.025935814733883e-05, + "loss": 0.7654, + "step": 91790 + }, + { + "epoch": 0.5864840346012803, + "grad_norm": 1.0939258337020874, + "learning_rate": 8.025536350012468e-05, + "loss": 0.9738, + "step": 91800 + }, + { + "epoch": 0.586547921751019, + "grad_norm": 0.5842766761779785, + "learning_rate": 8.025136854821285e-05, + "loss": 0.7524, + "step": 91810 + }, + { + "epoch": 0.5866118089007577, + "grad_norm": 0.8317599892616272, + "learning_rate": 8.024737329164356e-05, + "loss": 1.1187, + "step": 91820 + }, + { + "epoch": 0.5866756960504964, + "grad_norm": 0.8482229113578796, + "learning_rate": 8.024337773045704e-05, + "loss": 0.6012, + "step": 91830 + }, + { + "epoch": 0.5867395832002351, + "grad_norm": 0.7396560311317444, + "learning_rate": 8.023938186469357e-05, + "loss": 0.9791, + "step": 91840 + }, + { + 
"epoch": 0.5868034703499738, + "grad_norm": 0.7632973790168762, + "learning_rate": 8.023538569439335e-05, + "loss": 0.9775, + "step": 91850 + }, + { + "epoch": 0.5868673574997125, + "grad_norm": 1.459350347518921, + "learning_rate": 8.023138921959665e-05, + "loss": 0.7903, + "step": 91860 + }, + { + "epoch": 0.5869312446494512, + "grad_norm": 0.5813467502593994, + "learning_rate": 8.022739244034369e-05, + "loss": 0.8206, + "step": 91870 + }, + { + "epoch": 0.5869951317991899, + "grad_norm": 0.9439472556114197, + "learning_rate": 8.022339535667476e-05, + "loss": 0.6998, + "step": 91880 + }, + { + "epoch": 0.5870590189489286, + "grad_norm": 1.054968237876892, + "learning_rate": 8.021939796863007e-05, + "loss": 0.8882, + "step": 91890 + }, + { + "epoch": 0.5871229060986674, + "grad_norm": 0.7589655518531799, + "learning_rate": 8.021540027624991e-05, + "loss": 1.0338, + "step": 91900 + }, + { + "epoch": 0.5871867932484061, + "grad_norm": 5.119521617889404, + "learning_rate": 8.021140227957451e-05, + "loss": 1.1161, + "step": 91910 + }, + { + "epoch": 0.5872506803981448, + "grad_norm": 0.9572505354881287, + "learning_rate": 8.020740397864418e-05, + "loss": 0.9184, + "step": 91920 + }, + { + "epoch": 0.5873145675478835, + "grad_norm": 1.0495151281356812, + "learning_rate": 8.020340537349915e-05, + "loss": 0.8544, + "step": 91930 + }, + { + "epoch": 0.5873784546976221, + "grad_norm": 0.8135344386100769, + "learning_rate": 8.019940646417969e-05, + "loss": 0.8227, + "step": 91940 + }, + { + "epoch": 0.5874423418473608, + "grad_norm": 1.2222908735275269, + "learning_rate": 8.019540725072609e-05, + "loss": 1.034, + "step": 91950 + }, + { + "epoch": 0.5875062289970995, + "grad_norm": 0.953247606754303, + "learning_rate": 8.019140773317862e-05, + "loss": 0.9178, + "step": 91960 + }, + { + "epoch": 0.5875701161468382, + "grad_norm": 0.7658291459083557, + "learning_rate": 8.018740791157755e-05, + "loss": 0.7629, + "step": 91970 + }, + { + "epoch": 0.5876340032965769, + "grad_norm": 1.0047904253005981, + "learning_rate": 8.018340778596316e-05, + "loss": 0.9189, + "step": 91980 + }, + { + "epoch": 0.5876978904463156, + "grad_norm": 0.9360259771347046, + "learning_rate": 8.017940735637574e-05, + "loss": 1.0436, + "step": 91990 + }, + { + "epoch": 0.5877617775960543, + "grad_norm": 0.74342942237854, + "learning_rate": 8.017540662285558e-05, + "loss": 0.6901, + "step": 92000 + }, + { + "epoch": 0.587825664745793, + "grad_norm": 0.7133846879005432, + "learning_rate": 8.017140558544299e-05, + "loss": 0.9163, + "step": 92010 + }, + { + "epoch": 0.5878895518955317, + "grad_norm": 1.2013561725616455, + "learning_rate": 8.016740424417822e-05, + "loss": 0.8493, + "step": 92020 + }, + { + "epoch": 0.5879534390452704, + "grad_norm": 1.1158215999603271, + "learning_rate": 8.01634025991016e-05, + "loss": 0.9638, + "step": 92030 + }, + { + "epoch": 0.5880173261950091, + "grad_norm": 0.8271144032478333, + "learning_rate": 8.015940065025343e-05, + "loss": 0.7683, + "step": 92040 + }, + { + "epoch": 0.5880812133447478, + "grad_norm": 1.0541661977767944, + "learning_rate": 8.015539839767399e-05, + "loss": 0.8228, + "step": 92050 + }, + { + "epoch": 0.5881451004944865, + "grad_norm": 1.1830748319625854, + "learning_rate": 8.01513958414036e-05, + "loss": 0.8117, + "step": 92060 + }, + { + "epoch": 0.5882089876442252, + "grad_norm": 0.5275201201438904, + "learning_rate": 8.014739298148258e-05, + "loss": 0.7335, + "step": 92070 + }, + { + "epoch": 0.588272874793964, + "grad_norm": 0.9330576062202454, + "learning_rate": 
8.014338981795122e-05, + "loss": 0.8961, + "step": 92080 + }, + { + "epoch": 0.5883367619437027, + "grad_norm": 1.2388197183609009, + "learning_rate": 8.013938635084983e-05, + "loss": 0.697, + "step": 92090 + }, + { + "epoch": 0.5884006490934414, + "grad_norm": 1.1590933799743652, + "learning_rate": 8.013538258021877e-05, + "loss": 1.1775, + "step": 92100 + }, + { + "epoch": 0.5884645362431801, + "grad_norm": 1.2639012336730957, + "learning_rate": 8.013137850609833e-05, + "loss": 0.7401, + "step": 92110 + }, + { + "epoch": 0.5885284233929188, + "grad_norm": 0.6682813763618469, + "learning_rate": 8.012737412852886e-05, + "loss": 0.8262, + "step": 92120 + }, + { + "epoch": 0.5885923105426575, + "grad_norm": 0.7417098879814148, + "learning_rate": 8.012336944755064e-05, + "loss": 1.0828, + "step": 92130 + }, + { + "epoch": 0.5886561976923962, + "grad_norm": 0.7538353800773621, + "learning_rate": 8.011936446320405e-05, + "loss": 0.647, + "step": 92140 + }, + { + "epoch": 0.5887200848421349, + "grad_norm": 0.7363274097442627, + "learning_rate": 8.01153591755294e-05, + "loss": 1.1043, + "step": 92150 + }, + { + "epoch": 0.5887839719918736, + "grad_norm": 0.6675977110862732, + "learning_rate": 8.011135358456701e-05, + "loss": 0.9313, + "step": 92160 + }, + { + "epoch": 0.5888478591416123, + "grad_norm": 0.7670975923538208, + "learning_rate": 8.010734769035726e-05, + "loss": 0.8746, + "step": 92170 + }, + { + "epoch": 0.588911746291351, + "grad_norm": 1.1923779249191284, + "learning_rate": 8.010334149294045e-05, + "loss": 1.0088, + "step": 92180 + }, + { + "epoch": 0.5889756334410896, + "grad_norm": 0.8132577538490295, + "learning_rate": 8.009933499235698e-05, + "loss": 1.0256, + "step": 92190 + }, + { + "epoch": 0.5890395205908283, + "grad_norm": 0.7252603769302368, + "learning_rate": 8.009532818864714e-05, + "loss": 0.7461, + "step": 92200 + }, + { + "epoch": 0.589103407740567, + "grad_norm": 1.4953992366790771, + "learning_rate": 8.009132108185132e-05, + "loss": 1.3034, + "step": 92210 + }, + { + "epoch": 0.5891672948903057, + "grad_norm": 0.7073407769203186, + "learning_rate": 8.008731367200988e-05, + "loss": 0.9938, + "step": 92220 + }, + { + "epoch": 0.5892311820400444, + "grad_norm": 1.0197674036026, + "learning_rate": 8.008330595916314e-05, + "loss": 0.9457, + "step": 92230 + }, + { + "epoch": 0.5892950691897831, + "grad_norm": 0.8998727798461914, + "learning_rate": 8.00792979433515e-05, + "loss": 0.7637, + "step": 92240 + }, + { + "epoch": 0.5893589563395218, + "grad_norm": 0.5019026398658752, + "learning_rate": 8.007528962461527e-05, + "loss": 0.9488, + "step": 92250 + }, + { + "epoch": 0.5894228434892606, + "grad_norm": 1.0908100605010986, + "learning_rate": 8.007128100299491e-05, + "loss": 1.1868, + "step": 92260 + }, + { + "epoch": 0.5894867306389993, + "grad_norm": 1.2412331104278564, + "learning_rate": 8.006727207853069e-05, + "loss": 0.7634, + "step": 92270 + }, + { + "epoch": 0.589550617788738, + "grad_norm": 0.8074179291725159, + "learning_rate": 8.006326285126305e-05, + "loss": 0.8855, + "step": 92280 + }, + { + "epoch": 0.5896145049384767, + "grad_norm": 1.471113920211792, + "learning_rate": 8.005925332123235e-05, + "loss": 0.7663, + "step": 92290 + }, + { + "epoch": 0.5896783920882154, + "grad_norm": 0.7869247794151306, + "learning_rate": 8.005524348847894e-05, + "loss": 0.9511, + "step": 92300 + }, + { + "epoch": 0.5897422792379541, + "grad_norm": 1.1059610843658447, + "learning_rate": 8.005123335304322e-05, + "loss": 0.9348, + "step": 92310 + }, + { + "epoch": 
0.5898061663876928, + "grad_norm": 1.1069176197052002, + "learning_rate": 8.004722291496562e-05, + "loss": 0.8676, + "step": 92320 + }, + { + "epoch": 0.5898700535374315, + "grad_norm": 0.5974422693252563, + "learning_rate": 8.004321217428647e-05, + "loss": 0.7969, + "step": 92330 + }, + { + "epoch": 0.5899339406871702, + "grad_norm": 1.1670259237289429, + "learning_rate": 8.003920113104618e-05, + "loss": 0.8566, + "step": 92340 + }, + { + "epoch": 0.5899978278369089, + "grad_norm": 0.9760884642601013, + "learning_rate": 8.003518978528515e-05, + "loss": 0.8049, + "step": 92350 + }, + { + "epoch": 0.5900617149866476, + "grad_norm": 0.7791697978973389, + "learning_rate": 8.003117813704378e-05, + "loss": 0.6515, + "step": 92360 + }, + { + "epoch": 0.5901256021363863, + "grad_norm": 0.8998212218284607, + "learning_rate": 8.002716618636245e-05, + "loss": 0.9429, + "step": 92370 + }, + { + "epoch": 0.590189489286125, + "grad_norm": 0.9141538739204407, + "learning_rate": 8.00231539332816e-05, + "loss": 0.8178, + "step": 92380 + }, + { + "epoch": 0.5902533764358637, + "grad_norm": 0.765386164188385, + "learning_rate": 8.001914137784161e-05, + "loss": 0.973, + "step": 92390 + }, + { + "epoch": 0.5903172635856024, + "grad_norm": 0.7694385647773743, + "learning_rate": 8.00151285200829e-05, + "loss": 0.7036, + "step": 92400 + }, + { + "epoch": 0.5903811507353411, + "grad_norm": 1.3476502895355225, + "learning_rate": 8.001111536004586e-05, + "loss": 0.7961, + "step": 92410 + }, + { + "epoch": 0.5904450378850798, + "grad_norm": 1.1224573850631714, + "learning_rate": 8.000710189777094e-05, + "loss": 0.8736, + "step": 92420 + }, + { + "epoch": 0.5905089250348184, + "grad_norm": 0.8447276949882507, + "learning_rate": 8.000308813329855e-05, + "loss": 1.0167, + "step": 92430 + }, + { + "epoch": 0.5905728121845571, + "grad_norm": 2.5802526473999023, + "learning_rate": 7.999907406666909e-05, + "loss": 0.8453, + "step": 92440 + }, + { + "epoch": 0.5906366993342959, + "grad_norm": 0.7821781635284424, + "learning_rate": 7.999505969792302e-05, + "loss": 0.8587, + "step": 92450 + }, + { + "epoch": 0.5907005864840346, + "grad_norm": 0.9936961531639099, + "learning_rate": 7.999104502710074e-05, + "loss": 1.1057, + "step": 92460 + }, + { + "epoch": 0.5907644736337733, + "grad_norm": 0.6793760061264038, + "learning_rate": 7.998703005424268e-05, + "loss": 1.0557, + "step": 92470 + }, + { + "epoch": 0.590828360783512, + "grad_norm": 1.253307819366455, + "learning_rate": 7.99830147793893e-05, + "loss": 0.7205, + "step": 92480 + }, + { + "epoch": 0.5908922479332507, + "grad_norm": 0.8064923882484436, + "learning_rate": 7.997899920258101e-05, + "loss": 0.7813, + "step": 92490 + }, + { + "epoch": 0.5909561350829894, + "grad_norm": 1.2392529249191284, + "learning_rate": 7.997498332385827e-05, + "loss": 0.9077, + "step": 92500 + }, + { + "epoch": 0.5910200222327281, + "grad_norm": 1.3533644676208496, + "learning_rate": 7.997096714326151e-05, + "loss": 0.8915, + "step": 92510 + }, + { + "epoch": 0.5910839093824668, + "grad_norm": 0.8305091261863708, + "learning_rate": 7.99669506608312e-05, + "loss": 1.0454, + "step": 92520 + }, + { + "epoch": 0.5911477965322055, + "grad_norm": 0.884864866733551, + "learning_rate": 7.996293387660776e-05, + "loss": 0.9556, + "step": 92530 + }, + { + "epoch": 0.5912116836819442, + "grad_norm": 0.6807804703712463, + "learning_rate": 7.995891679063165e-05, + "loss": 1.0857, + "step": 92540 + }, + { + "epoch": 0.5912755708316829, + "grad_norm": 1.4398140907287598, + "learning_rate": 
7.995489940294333e-05, + "loss": 0.8875, + "step": 92550 + }, + { + "epoch": 0.5913394579814216, + "grad_norm": 0.9925829768180847, + "learning_rate": 7.995088171358325e-05, + "loss": 1.038, + "step": 92560 + }, + { + "epoch": 0.5914033451311603, + "grad_norm": 0.7312915325164795, + "learning_rate": 7.99468637225919e-05, + "loss": 1.0752, + "step": 92570 + }, + { + "epoch": 0.591467232280899, + "grad_norm": 1.0309982299804688, + "learning_rate": 7.994284543000972e-05, + "loss": 0.8225, + "step": 92580 + }, + { + "epoch": 0.5915311194306377, + "grad_norm": 0.844560980796814, + "learning_rate": 7.993882683587717e-05, + "loss": 0.8288, + "step": 92590 + }, + { + "epoch": 0.5915950065803764, + "grad_norm": 1.377153754234314, + "learning_rate": 7.993480794023473e-05, + "loss": 0.8974, + "step": 92600 + }, + { + "epoch": 0.5916588937301152, + "grad_norm": 1.0687837600708008, + "learning_rate": 7.99307887431229e-05, + "loss": 0.6578, + "step": 92610 + }, + { + "epoch": 0.5917227808798539, + "grad_norm": 1.0184049606323242, + "learning_rate": 7.992676924458212e-05, + "loss": 0.7034, + "step": 92620 + }, + { + "epoch": 0.5917866680295926, + "grad_norm": 1.7974580526351929, + "learning_rate": 7.992274944465287e-05, + "loss": 0.8216, + "step": 92630 + }, + { + "epoch": 0.5918505551793313, + "grad_norm": 0.9643707871437073, + "learning_rate": 7.991872934337568e-05, + "loss": 0.9556, + "step": 92640 + }, + { + "epoch": 0.59191444232907, + "grad_norm": 2.1205992698669434, + "learning_rate": 7.991470894079098e-05, + "loss": 1.1122, + "step": 92650 + }, + { + "epoch": 0.5919783294788087, + "grad_norm": 0.7959349751472473, + "learning_rate": 7.991068823693928e-05, + "loss": 0.892, + "step": 92660 + }, + { + "epoch": 0.5920422166285473, + "grad_norm": 0.8903279900550842, + "learning_rate": 7.990666723186107e-05, + "loss": 0.909, + "step": 92670 + }, + { + "epoch": 0.592106103778286, + "grad_norm": 0.8091008067131042, + "learning_rate": 7.990264592559686e-05, + "loss": 1.0945, + "step": 92680 + }, + { + "epoch": 0.5921699909280247, + "grad_norm": 0.8795812726020813, + "learning_rate": 7.989862431818713e-05, + "loss": 1.0445, + "step": 92690 + }, + { + "epoch": 0.5922338780777634, + "grad_norm": 0.7716434001922607, + "learning_rate": 7.989460240967239e-05, + "loss": 1.0517, + "step": 92700 + }, + { + "epoch": 0.5922977652275021, + "grad_norm": 0.9718101024627686, + "learning_rate": 7.989058020009315e-05, + "loss": 0.9155, + "step": 92710 + }, + { + "epoch": 0.5923616523772408, + "grad_norm": 1.8687045574188232, + "learning_rate": 7.98865576894899e-05, + "loss": 0.8678, + "step": 92720 + }, + { + "epoch": 0.5924255395269795, + "grad_norm": 0.5522985458374023, + "learning_rate": 7.988253487790315e-05, + "loss": 0.9144, + "step": 92730 + }, + { + "epoch": 0.5924894266767182, + "grad_norm": 0.9412902593612671, + "learning_rate": 7.987851176537342e-05, + "loss": 0.7785, + "step": 92740 + }, + { + "epoch": 0.5925533138264569, + "grad_norm": 0.5858872532844543, + "learning_rate": 7.987448835194124e-05, + "loss": 0.7684, + "step": 92750 + }, + { + "epoch": 0.5926172009761956, + "grad_norm": 0.7545718550682068, + "learning_rate": 7.987046463764712e-05, + "loss": 0.8157, + "step": 92760 + }, + { + "epoch": 0.5926810881259343, + "grad_norm": 1.0280770063400269, + "learning_rate": 7.986644062253157e-05, + "loss": 0.8308, + "step": 92770 + }, + { + "epoch": 0.592744975275673, + "grad_norm": 0.6888710260391235, + "learning_rate": 7.986241630663512e-05, + "loss": 0.827, + "step": 92780 + }, + { + "epoch": 
0.5928088624254118, + "grad_norm": 0.7648938298225403, + "learning_rate": 7.985839168999831e-05, + "loss": 0.7851, + "step": 92790 + }, + { + "epoch": 0.5928727495751505, + "grad_norm": 1.144452452659607, + "learning_rate": 7.985436677266166e-05, + "loss": 0.8219, + "step": 92800 + }, + { + "epoch": 0.5929366367248892, + "grad_norm": 0.9473939538002014, + "learning_rate": 7.985034155466572e-05, + "loss": 0.9113, + "step": 92810 + }, + { + "epoch": 0.5930005238746279, + "grad_norm": 1.33318030834198, + "learning_rate": 7.984631603605102e-05, + "loss": 0.7248, + "step": 92820 + }, + { + "epoch": 0.5930644110243666, + "grad_norm": 1.1216987371444702, + "learning_rate": 7.984229021685807e-05, + "loss": 0.7527, + "step": 92830 + }, + { + "epoch": 0.5931282981741053, + "grad_norm": 1.0254566669464111, + "learning_rate": 7.983826409712747e-05, + "loss": 1.0293, + "step": 92840 + }, + { + "epoch": 0.593192185323844, + "grad_norm": 0.7557952404022217, + "learning_rate": 7.983423767689972e-05, + "loss": 0.8593, + "step": 92850 + }, + { + "epoch": 0.5932560724735827, + "grad_norm": 0.8302872180938721, + "learning_rate": 7.983021095621539e-05, + "loss": 0.8756, + "step": 92860 + }, + { + "epoch": 0.5933199596233214, + "grad_norm": 0.7966361045837402, + "learning_rate": 7.982618393511503e-05, + "loss": 0.8578, + "step": 92870 + }, + { + "epoch": 0.5933838467730601, + "grad_norm": 1.1069227457046509, + "learning_rate": 7.982215661363918e-05, + "loss": 0.7703, + "step": 92880 + }, + { + "epoch": 0.5934477339227988, + "grad_norm": 0.5603930354118347, + "learning_rate": 7.981812899182844e-05, + "loss": 0.8665, + "step": 92890 + }, + { + "epoch": 0.5935116210725375, + "grad_norm": 1.1370865106582642, + "learning_rate": 7.981410106972333e-05, + "loss": 0.8621, + "step": 92900 + }, + { + "epoch": 0.5935755082222761, + "grad_norm": 1.1741241216659546, + "learning_rate": 7.981007284736442e-05, + "loss": 1.0893, + "step": 92910 + }, + { + "epoch": 0.5936393953720148, + "grad_norm": 1.065045714378357, + "learning_rate": 7.98060443247923e-05, + "loss": 1.0186, + "step": 92920 + }, + { + "epoch": 0.5937032825217535, + "grad_norm": 0.7145927548408508, + "learning_rate": 7.980201550204753e-05, + "loss": 0.9665, + "step": 92930 + }, + { + "epoch": 0.5937671696714922, + "grad_norm": 0.7131385803222656, + "learning_rate": 7.979798637917068e-05, + "loss": 0.9271, + "step": 92940 + }, + { + "epoch": 0.5938310568212309, + "grad_norm": 2.897143840789795, + "learning_rate": 7.979395695620234e-05, + "loss": 0.9113, + "step": 92950 + }, + { + "epoch": 0.5938949439709696, + "grad_norm": 0.9042292237281799, + "learning_rate": 7.978992723318305e-05, + "loss": 0.869, + "step": 92960 + }, + { + "epoch": 0.5939588311207084, + "grad_norm": 0.5825813412666321, + "learning_rate": 7.978589721015343e-05, + "loss": 1.1739, + "step": 92970 + }, + { + "epoch": 0.5940227182704471, + "grad_norm": 1.031434416770935, + "learning_rate": 7.978186688715406e-05, + "loss": 0.817, + "step": 92980 + }, + { + "epoch": 0.5940866054201858, + "grad_norm": 0.81146639585495, + "learning_rate": 7.977783626422553e-05, + "loss": 1.0523, + "step": 92990 + }, + { + "epoch": 0.5941504925699245, + "grad_norm": 1.5747389793395996, + "learning_rate": 7.977380534140843e-05, + "loss": 0.7111, + "step": 93000 + }, + { + "epoch": 0.5942143797196632, + "grad_norm": 0.8901327848434448, + "learning_rate": 7.976977411874334e-05, + "loss": 0.8639, + "step": 93010 + }, + { + "epoch": 0.5942782668694019, + "grad_norm": 0.6581230163574219, + "learning_rate": 
7.976574259627087e-05, + "loss": 0.7677, + "step": 93020 + }, + { + "epoch": 0.5943421540191406, + "grad_norm": 0.660140335559845, + "learning_rate": 7.976171077403163e-05, + "loss": 1.058, + "step": 93030 + }, + { + "epoch": 0.5944060411688793, + "grad_norm": 1.2601295709609985, + "learning_rate": 7.97576786520662e-05, + "loss": 1.0485, + "step": 93040 + }, + { + "epoch": 0.594469928318618, + "grad_norm": 0.7179937362670898, + "learning_rate": 7.975364623041523e-05, + "loss": 0.7853, + "step": 93050 + }, + { + "epoch": 0.5945338154683567, + "grad_norm": 1.2381994724273682, + "learning_rate": 7.974961350911926e-05, + "loss": 0.8814, + "step": 93060 + }, + { + "epoch": 0.5945977026180954, + "grad_norm": 0.8336678743362427, + "learning_rate": 7.974558048821898e-05, + "loss": 0.8024, + "step": 93070 + }, + { + "epoch": 0.5946615897678341, + "grad_norm": 0.6095098257064819, + "learning_rate": 7.974154716775497e-05, + "loss": 1.0271, + "step": 93080 + }, + { + "epoch": 0.5947254769175728, + "grad_norm": 0.7402802109718323, + "learning_rate": 7.973791692324393e-05, + "loss": 1.0233, + "step": 93090 + }, + { + "epoch": 0.5947893640673115, + "grad_norm": 1.3775721788406372, + "learning_rate": 7.973388303372073e-05, + "loss": 0.8029, + "step": 93100 + }, + { + "epoch": 0.5948532512170502, + "grad_norm": 1.3130244016647339, + "learning_rate": 7.972984884475162e-05, + "loss": 0.7344, + "step": 93110 + }, + { + "epoch": 0.594917138366789, + "grad_norm": 1.00970458984375, + "learning_rate": 7.97258143563772e-05, + "loss": 0.871, + "step": 93120 + }, + { + "epoch": 0.5949810255165277, + "grad_norm": 0.5588665008544922, + "learning_rate": 7.972177956863811e-05, + "loss": 0.8927, + "step": 93130 + }, + { + "epoch": 0.5950449126662664, + "grad_norm": 0.5917448997497559, + "learning_rate": 7.971774448157499e-05, + "loss": 0.922, + "step": 93140 + }, + { + "epoch": 0.5951087998160051, + "grad_norm": 0.8795225620269775, + "learning_rate": 7.971370909522847e-05, + "loss": 1.1024, + "step": 93150 + }, + { + "epoch": 0.5951726869657437, + "grad_norm": 0.7276126146316528, + "learning_rate": 7.97096734096392e-05, + "loss": 0.9061, + "step": 93160 + }, + { + "epoch": 0.5952365741154824, + "grad_norm": 1.1168190240859985, + "learning_rate": 7.970563742484782e-05, + "loss": 0.8808, + "step": 93170 + }, + { + "epoch": 0.5953004612652211, + "grad_norm": 0.7256569266319275, + "learning_rate": 7.970160114089496e-05, + "loss": 0.8881, + "step": 93180 + }, + { + "epoch": 0.5953643484149598, + "grad_norm": 1.2579829692840576, + "learning_rate": 7.969756455782129e-05, + "loss": 0.986, + "step": 93190 + }, + { + "epoch": 0.5954282355646985, + "grad_norm": 1.1053801774978638, + "learning_rate": 7.969352767566744e-05, + "loss": 0.9049, + "step": 93200 + }, + { + "epoch": 0.5954921227144372, + "grad_norm": 1.0768349170684814, + "learning_rate": 7.96894904944741e-05, + "loss": 0.9947, + "step": 93210 + }, + { + "epoch": 0.5955560098641759, + "grad_norm": 0.8521894812583923, + "learning_rate": 7.968545301428188e-05, + "loss": 0.7064, + "step": 93220 + }, + { + "epoch": 0.5956198970139146, + "grad_norm": 0.9622153639793396, + "learning_rate": 7.968141523513149e-05, + "loss": 0.8189, + "step": 93230 + }, + { + "epoch": 0.5956837841636533, + "grad_norm": 1.014563798904419, + "learning_rate": 7.967737715706354e-05, + "loss": 1.0966, + "step": 93240 + }, + { + "epoch": 0.595747671313392, + "grad_norm": 1.3811198472976685, + "learning_rate": 7.967333878011875e-05, + "loss": 1.043, + "step": 93250 + }, + { + "epoch": 0.5958115584631307, 
+ "grad_norm": 0.7607221007347107, + "learning_rate": 7.966930010433777e-05, + "loss": 0.9357, + "step": 93260 + }, + { + "epoch": 0.5958754456128694, + "grad_norm": 0.9977759122848511, + "learning_rate": 7.966526112976126e-05, + "loss": 1.0732, + "step": 93270 + }, + { + "epoch": 0.5959393327626081, + "grad_norm": 1.0473393201828003, + "learning_rate": 7.966122185642992e-05, + "loss": 0.9741, + "step": 93280 + }, + { + "epoch": 0.5960032199123468, + "grad_norm": 1.254073977470398, + "learning_rate": 7.965718228438442e-05, + "loss": 0.7127, + "step": 93290 + }, + { + "epoch": 0.5960671070620855, + "grad_norm": 0.9612134099006653, + "learning_rate": 7.965314241366542e-05, + "loss": 0.8608, + "step": 93300 + }, + { + "epoch": 0.5961309942118242, + "grad_norm": 1.8050369024276733, + "learning_rate": 7.964910224431361e-05, + "loss": 0.6996, + "step": 93310 + }, + { + "epoch": 0.596194881361563, + "grad_norm": 1.1563001871109009, + "learning_rate": 7.96450617763697e-05, + "loss": 0.7266, + "step": 93320 + }, + { + "epoch": 0.5962587685113017, + "grad_norm": 1.396944284439087, + "learning_rate": 7.964102100987439e-05, + "loss": 1.2127, + "step": 93330 + }, + { + "epoch": 0.5963226556610404, + "grad_norm": 1.3693512678146362, + "learning_rate": 7.963697994486834e-05, + "loss": 0.8019, + "step": 93340 + }, + { + "epoch": 0.5963865428107791, + "grad_norm": 0.8561686277389526, + "learning_rate": 7.963293858139227e-05, + "loss": 1.2762, + "step": 93350 + }, + { + "epoch": 0.5964504299605178, + "grad_norm": 1.4842015504837036, + "learning_rate": 7.962889691948687e-05, + "loss": 1.0344, + "step": 93360 + }, + { + "epoch": 0.5965143171102565, + "grad_norm": 0.6768175363540649, + "learning_rate": 7.962485495919285e-05, + "loss": 0.7479, + "step": 93370 + }, + { + "epoch": 0.5965782042599952, + "grad_norm": 0.8603546023368835, + "learning_rate": 7.96208127005509e-05, + "loss": 0.7013, + "step": 93380 + }, + { + "epoch": 0.5966420914097339, + "grad_norm": 0.8126310706138611, + "learning_rate": 7.961677014360174e-05, + "loss": 0.9827, + "step": 93390 + }, + { + "epoch": 0.5967059785594725, + "grad_norm": 0.7931829690933228, + "learning_rate": 7.961272728838609e-05, + "loss": 0.8245, + "step": 93400 + }, + { + "epoch": 0.5967698657092112, + "grad_norm": 0.7296050786972046, + "learning_rate": 7.960868413494465e-05, + "loss": 0.7299, + "step": 93410 + }, + { + "epoch": 0.5968337528589499, + "grad_norm": 1.148210883140564, + "learning_rate": 7.960464068331814e-05, + "loss": 0.9199, + "step": 93420 + }, + { + "epoch": 0.5968976400086886, + "grad_norm": 1.0481257438659668, + "learning_rate": 7.960059693354731e-05, + "loss": 0.8088, + "step": 93430 + }, + { + "epoch": 0.5969615271584273, + "grad_norm": 0.6699005365371704, + "learning_rate": 7.959655288567285e-05, + "loss": 1.055, + "step": 93440 + }, + { + "epoch": 0.597025414308166, + "grad_norm": 1.0113779306411743, + "learning_rate": 7.959250853973549e-05, + "loss": 1.0175, + "step": 93450 + }, + { + "epoch": 0.5970893014579047, + "grad_norm": 1.1636087894439697, + "learning_rate": 7.958846389577597e-05, + "loss": 0.8509, + "step": 93460 + }, + { + "epoch": 0.5971531886076434, + "grad_norm": 0.8707061409950256, + "learning_rate": 7.958441895383503e-05, + "loss": 1.0673, + "step": 93470 + }, + { + "epoch": 0.5972170757573821, + "grad_norm": 0.4944153130054474, + "learning_rate": 7.95803737139534e-05, + "loss": 0.8289, + "step": 93480 + }, + { + "epoch": 0.5972809629071208, + "grad_norm": 0.9866294860839844, + "learning_rate": 7.95763281761718e-05, + "loss": 
0.9177, + "step": 93490 + }, + { + "epoch": 0.5973448500568596, + "grad_norm": 0.7539530992507935, + "learning_rate": 7.957228234053099e-05, + "loss": 0.6764, + "step": 93500 + }, + { + "epoch": 0.5974087372065983, + "grad_norm": 0.793980062007904, + "learning_rate": 7.956823620707172e-05, + "loss": 0.9874, + "step": 93510 + }, + { + "epoch": 0.597472624356337, + "grad_norm": 0.760221004486084, + "learning_rate": 7.956418977583474e-05, + "loss": 0.8189, + "step": 93520 + }, + { + "epoch": 0.5975365115060757, + "grad_norm": 0.4851871728897095, + "learning_rate": 7.956014304686076e-05, + "loss": 0.9784, + "step": 93530 + }, + { + "epoch": 0.5976003986558144, + "grad_norm": 0.6846779584884644, + "learning_rate": 7.95560960201906e-05, + "loss": 0.796, + "step": 93540 + }, + { + "epoch": 0.5976642858055531, + "grad_norm": 0.6320601105690002, + "learning_rate": 7.955204869586497e-05, + "loss": 0.9733, + "step": 93550 + }, + { + "epoch": 0.5977281729552918, + "grad_norm": 1.2107212543487549, + "learning_rate": 7.954800107392463e-05, + "loss": 0.928, + "step": 93560 + }, + { + "epoch": 0.5977920601050305, + "grad_norm": 1.3490923643112183, + "learning_rate": 7.954395315441039e-05, + "loss": 1.0885, + "step": 93570 + }, + { + "epoch": 0.5978559472547692, + "grad_norm": 0.724120020866394, + "learning_rate": 7.953990493736296e-05, + "loss": 0.7475, + "step": 93580 + }, + { + "epoch": 0.5979198344045079, + "grad_norm": 1.5954548120498657, + "learning_rate": 7.953585642282314e-05, + "loss": 1.1069, + "step": 93590 + }, + { + "epoch": 0.5979837215542466, + "grad_norm": 1.4548444747924805, + "learning_rate": 7.953180761083169e-05, + "loss": 1.0161, + "step": 93600 + }, + { + "epoch": 0.5980476087039853, + "grad_norm": 0.6197888851165771, + "learning_rate": 7.952775850142939e-05, + "loss": 0.7571, + "step": 93610 + }, + { + "epoch": 0.598111495853724, + "grad_norm": 0.7104484438896179, + "learning_rate": 7.952370909465702e-05, + "loss": 0.8744, + "step": 93620 + }, + { + "epoch": 0.5981753830034627, + "grad_norm": 0.6192456483840942, + "learning_rate": 7.951965939055535e-05, + "loss": 1.1207, + "step": 93630 + }, + { + "epoch": 0.5982392701532013, + "grad_norm": 1.01494300365448, + "learning_rate": 7.951560938916517e-05, + "loss": 0.7774, + "step": 93640 + }, + { + "epoch": 0.59830315730294, + "grad_norm": 1.5885846614837646, + "learning_rate": 7.951155909052727e-05, + "loss": 0.7792, + "step": 93650 + }, + { + "epoch": 0.5983670444526787, + "grad_norm": 0.9878436326980591, + "learning_rate": 7.950750849468245e-05, + "loss": 0.8927, + "step": 93660 + }, + { + "epoch": 0.5984309316024174, + "grad_norm": 0.7260859608650208, + "learning_rate": 7.950345760167148e-05, + "loss": 0.9279, + "step": 93670 + }, + { + "epoch": 0.5984948187521562, + "grad_norm": 0.9209891557693481, + "learning_rate": 7.949940641153517e-05, + "loss": 0.7823, + "step": 93680 + }, + { + "epoch": 0.5985587059018949, + "grad_norm": 0.6021539568901062, + "learning_rate": 7.949535492431433e-05, + "loss": 0.8444, + "step": 93690 + }, + { + "epoch": 0.5986225930516336, + "grad_norm": 1.3056902885437012, + "learning_rate": 7.949130314004974e-05, + "loss": 0.812, + "step": 93700 + }, + { + "epoch": 0.5986864802013723, + "grad_norm": 0.8002413511276245, + "learning_rate": 7.948725105878221e-05, + "loss": 0.9184, + "step": 93710 + }, + { + "epoch": 0.598750367351111, + "grad_norm": 1.4868652820587158, + "learning_rate": 7.948319868055254e-05, + "loss": 0.8872, + "step": 93720 + }, + { + "epoch": 0.5988142545008497, + "grad_norm": 
0.8618303537368774, + "learning_rate": 7.947914600540158e-05, + "loss": 0.7252, + "step": 93730 + }, + { + "epoch": 0.5988781416505884, + "grad_norm": 0.5396475791931152, + "learning_rate": 7.947509303337009e-05, + "loss": 0.7807, + "step": 93740 + }, + { + "epoch": 0.5989420288003271, + "grad_norm": 0.7746068239212036, + "learning_rate": 7.947103976449892e-05, + "loss": 0.8018, + "step": 93750 + }, + { + "epoch": 0.5990059159500658, + "grad_norm": 1.0696126222610474, + "learning_rate": 7.94669861988289e-05, + "loss": 0.7254, + "step": 93760 + }, + { + "epoch": 0.5990698030998045, + "grad_norm": 0.8595736622810364, + "learning_rate": 7.946293233640082e-05, + "loss": 0.765, + "step": 93770 + }, + { + "epoch": 0.5991336902495432, + "grad_norm": 1.1637159585952759, + "learning_rate": 7.945887817725552e-05, + "loss": 0.8776, + "step": 93780 + }, + { + "epoch": 0.5991975773992819, + "grad_norm": 0.9262639284133911, + "learning_rate": 7.945482372143385e-05, + "loss": 1.2434, + "step": 93790 + }, + { + "epoch": 0.5992614645490206, + "grad_norm": 0.6639724969863892, + "learning_rate": 7.945076896897661e-05, + "loss": 0.9795, + "step": 93800 + }, + { + "epoch": 0.5993253516987593, + "grad_norm": 0.7169008255004883, + "learning_rate": 7.944671391992465e-05, + "loss": 1.1887, + "step": 93810 + }, + { + "epoch": 0.599389238848498, + "grad_norm": 0.9885384440422058, + "learning_rate": 7.944265857431881e-05, + "loss": 0.9445, + "step": 93820 + }, + { + "epoch": 0.5994531259982367, + "grad_norm": 0.639473021030426, + "learning_rate": 7.943860293219993e-05, + "loss": 0.7167, + "step": 93830 + }, + { + "epoch": 0.5995170131479755, + "grad_norm": 0.9396152496337891, + "learning_rate": 7.943454699360884e-05, + "loss": 0.7037, + "step": 93840 + }, + { + "epoch": 0.5995809002977142, + "grad_norm": 1.092712163925171, + "learning_rate": 7.94304907585864e-05, + "loss": 1.2453, + "step": 93850 + }, + { + "epoch": 0.5996447874474529, + "grad_norm": 1.040013313293457, + "learning_rate": 7.942643422717346e-05, + "loss": 0.7103, + "step": 93860 + }, + { + "epoch": 0.5997086745971916, + "grad_norm": 1.5030107498168945, + "learning_rate": 7.942237739941086e-05, + "loss": 1.5542, + "step": 93870 + }, + { + "epoch": 0.5997725617469303, + "grad_norm": 1.024461030960083, + "learning_rate": 7.941832027533948e-05, + "loss": 0.947, + "step": 93880 + }, + { + "epoch": 0.5998364488966689, + "grad_norm": 1.201267957687378, + "learning_rate": 7.941426285500016e-05, + "loss": 0.7646, + "step": 93890 + }, + { + "epoch": 0.5999003360464076, + "grad_norm": 1.2446759939193726, + "learning_rate": 7.941020513843376e-05, + "loss": 0.8437, + "step": 93900 + }, + { + "epoch": 0.5999642231961463, + "grad_norm": 0.7842534780502319, + "learning_rate": 7.940614712568115e-05, + "loss": 0.6985, + "step": 93910 + }, + { + "epoch": 0.600028110345885, + "grad_norm": 1.0466797351837158, + "learning_rate": 7.940208881678322e-05, + "loss": 0.9707, + "step": 93920 + }, + { + "epoch": 0.6000919974956237, + "grad_norm": 0.7793298363685608, + "learning_rate": 7.939803021178078e-05, + "loss": 0.9014, + "step": 93930 + }, + { + "epoch": 0.6001558846453624, + "grad_norm": 0.6594678163528442, + "learning_rate": 7.939397131071478e-05, + "loss": 0.8811, + "step": 93940 + }, + { + "epoch": 0.6002197717951011, + "grad_norm": 0.7732535004615784, + "learning_rate": 7.938991211362602e-05, + "loss": 0.9198, + "step": 93950 + }, + { + "epoch": 0.6002836589448398, + "grad_norm": 0.6953932642936707, + "learning_rate": 7.938585262055546e-05, + "loss": 0.8225, + "step": 
93960 + }, + { + "epoch": 0.6003475460945785, + "grad_norm": 0.4991307556629181, + "learning_rate": 7.938179283154392e-05, + "loss": 0.7595, + "step": 93970 + }, + { + "epoch": 0.6004114332443172, + "grad_norm": 1.582554817199707, + "learning_rate": 7.937773274663231e-05, + "loss": 0.9514, + "step": 93980 + }, + { + "epoch": 0.6004753203940559, + "grad_norm": 0.8112611770629883, + "learning_rate": 7.937367236586153e-05, + "loss": 0.9182, + "step": 93990 + }, + { + "epoch": 0.6005392075437946, + "grad_norm": 0.8769091367721558, + "learning_rate": 7.936961168927244e-05, + "loss": 1.1743, + "step": 94000 + }, + { + "epoch": 0.6006030946935333, + "grad_norm": 0.733625054359436, + "learning_rate": 7.936555071690597e-05, + "loss": 0.9444, + "step": 94010 + }, + { + "epoch": 0.600666981843272, + "grad_norm": 0.9377986192703247, + "learning_rate": 7.936148944880297e-05, + "loss": 0.7817, + "step": 94020 + }, + { + "epoch": 0.6007308689930108, + "grad_norm": 0.7029353380203247, + "learning_rate": 7.935742788500438e-05, + "loss": 0.8874, + "step": 94030 + }, + { + "epoch": 0.6007947561427495, + "grad_norm": 1.3966723680496216, + "learning_rate": 7.93533660255511e-05, + "loss": 1.0347, + "step": 94040 + }, + { + "epoch": 0.6008586432924882, + "grad_norm": 0.5431897044181824, + "learning_rate": 7.934930387048405e-05, + "loss": 0.8037, + "step": 94050 + }, + { + "epoch": 0.6009225304422269, + "grad_norm": 0.6450621485710144, + "learning_rate": 7.93452414198441e-05, + "loss": 0.8367, + "step": 94060 + }, + { + "epoch": 0.6009864175919656, + "grad_norm": 0.6869795322418213, + "learning_rate": 7.93411786736722e-05, + "loss": 1.1556, + "step": 94070 + }, + { + "epoch": 0.6010503047417043, + "grad_norm": 1.2084323167800903, + "learning_rate": 7.93371156320092e-05, + "loss": 0.7216, + "step": 94080 + }, + { + "epoch": 0.601114191891443, + "grad_norm": 0.8116541504859924, + "learning_rate": 7.93330522948961e-05, + "loss": 0.9562, + "step": 94090 + }, + { + "epoch": 0.6011780790411817, + "grad_norm": 1.043238878250122, + "learning_rate": 7.932898866237378e-05, + "loss": 0.7446, + "step": 94100 + }, + { + "epoch": 0.6012419661909204, + "grad_norm": 1.0671076774597168, + "learning_rate": 7.932492473448318e-05, + "loss": 0.952, + "step": 94110 + }, + { + "epoch": 0.6013058533406591, + "grad_norm": 1.1957184076309204, + "learning_rate": 7.932086051126521e-05, + "loss": 0.8455, + "step": 94120 + }, + { + "epoch": 0.6013697404903977, + "grad_norm": 0.6923540830612183, + "learning_rate": 7.931679599276081e-05, + "loss": 0.8397, + "step": 94130 + }, + { + "epoch": 0.6014336276401364, + "grad_norm": 0.699149489402771, + "learning_rate": 7.931273117901091e-05, + "loss": 0.8874, + "step": 94140 + }, + { + "epoch": 0.6014975147898751, + "grad_norm": 2.1960628032684326, + "learning_rate": 7.930866607005643e-05, + "loss": 0.811, + "step": 94150 + }, + { + "epoch": 0.6015614019396138, + "grad_norm": 0.6790569424629211, + "learning_rate": 7.930460066593836e-05, + "loss": 1.0083, + "step": 94160 + }, + { + "epoch": 0.6016252890893525, + "grad_norm": 0.6594065427780151, + "learning_rate": 7.930053496669758e-05, + "loss": 0.9659, + "step": 94170 + }, + { + "epoch": 0.6016891762390912, + "grad_norm": 0.7281797528266907, + "learning_rate": 7.929646897237509e-05, + "loss": 0.8501, + "step": 94180 + }, + { + "epoch": 0.60175306338883, + "grad_norm": 1.0084353685379028, + "learning_rate": 7.929240268301179e-05, + "loss": 1.0269, + "step": 94190 + }, + { + "epoch": 0.6018169505385687, + "grad_norm": 0.6016590595245361, + 
"learning_rate": 7.928833609864867e-05, + "loss": 1.0357, + "step": 94200 + }, + { + "epoch": 0.6018808376883074, + "grad_norm": 0.7156760692596436, + "learning_rate": 7.928426921932665e-05, + "loss": 0.7571, + "step": 94210 + }, + { + "epoch": 0.6019447248380461, + "grad_norm": 0.6921888589859009, + "learning_rate": 7.928020204508673e-05, + "loss": 0.7635, + "step": 94220 + }, + { + "epoch": 0.6020086119877848, + "grad_norm": 0.9318345189094543, + "learning_rate": 7.927613457596983e-05, + "loss": 0.8359, + "step": 94230 + }, + { + "epoch": 0.6020724991375235, + "grad_norm": 1.3739084005355835, + "learning_rate": 7.927206681201693e-05, + "loss": 1.111, + "step": 94240 + }, + { + "epoch": 0.6021363862872622, + "grad_norm": 1.0711864233016968, + "learning_rate": 7.926799875326898e-05, + "loss": 0.941, + "step": 94250 + }, + { + "epoch": 0.6022002734370009, + "grad_norm": 0.7577251195907593, + "learning_rate": 7.926393039976698e-05, + "loss": 0.767, + "step": 94260 + }, + { + "epoch": 0.6022641605867396, + "grad_norm": 1.2173386812210083, + "learning_rate": 7.925986175155188e-05, + "loss": 0.9573, + "step": 94270 + }, + { + "epoch": 0.6023280477364783, + "grad_norm": 0.8785862922668457, + "learning_rate": 7.925579280866465e-05, + "loss": 0.9249, + "step": 94280 + }, + { + "epoch": 0.602391934886217, + "grad_norm": 0.6809684038162231, + "learning_rate": 7.925172357114628e-05, + "loss": 0.7592, + "step": 94290 + }, + { + "epoch": 0.6024558220359557, + "grad_norm": 0.7136757373809814, + "learning_rate": 7.924765403903775e-05, + "loss": 0.8912, + "step": 94300 + }, + { + "epoch": 0.6025197091856944, + "grad_norm": 1.4909855127334595, + "learning_rate": 7.924358421238005e-05, + "loss": 0.7308, + "step": 94310 + }, + { + "epoch": 0.6025835963354331, + "grad_norm": 0.7292258739471436, + "learning_rate": 7.923951409121416e-05, + "loss": 0.8375, + "step": 94320 + }, + { + "epoch": 0.6026474834851718, + "grad_norm": 0.7449434995651245, + "learning_rate": 7.923544367558104e-05, + "loss": 0.7606, + "step": 94330 + }, + { + "epoch": 0.6027113706349105, + "grad_norm": 1.1462843418121338, + "learning_rate": 7.923137296552174e-05, + "loss": 0.9557, + "step": 94340 + }, + { + "epoch": 0.6027752577846492, + "grad_norm": 1.5585012435913086, + "learning_rate": 7.92273019610772e-05, + "loss": 0.9248, + "step": 94350 + }, + { + "epoch": 0.602839144934388, + "grad_norm": 0.7382988929748535, + "learning_rate": 7.922323066228845e-05, + "loss": 0.7405, + "step": 94360 + }, + { + "epoch": 0.6029030320841265, + "grad_norm": 0.7861965894699097, + "learning_rate": 7.92191590691965e-05, + "loss": 0.8206, + "step": 94370 + }, + { + "epoch": 0.6029669192338653, + "grad_norm": 0.8503504395484924, + "learning_rate": 7.921508718184233e-05, + "loss": 0.7665, + "step": 94380 + }, + { + "epoch": 0.603030806383604, + "grad_norm": 1.1574392318725586, + "learning_rate": 7.921101500026695e-05, + "loss": 0.8627, + "step": 94390 + }, + { + "epoch": 0.6030946935333427, + "grad_norm": 1.1855578422546387, + "learning_rate": 7.92069425245114e-05, + "loss": 0.8954, + "step": 94400 + }, + { + "epoch": 0.6031585806830814, + "grad_norm": 0.5956348776817322, + "learning_rate": 7.920286975461665e-05, + "loss": 1.0047, + "step": 94410 + }, + { + "epoch": 0.6032224678328201, + "grad_norm": 1.5525617599487305, + "learning_rate": 7.919879669062376e-05, + "loss": 0.9846, + "step": 94420 + }, + { + "epoch": 0.6032863549825588, + "grad_norm": 0.8425145745277405, + "learning_rate": 7.919472333257369e-05, + "loss": 0.7559, + "step": 94430 + }, + { + 
"epoch": 0.6033502421322975, + "grad_norm": 3.6817235946655273, + "learning_rate": 7.919064968050753e-05, + "loss": 0.8537, + "step": 94440 + }, + { + "epoch": 0.6034141292820362, + "grad_norm": 1.1978166103363037, + "learning_rate": 7.918657573446626e-05, + "loss": 0.7715, + "step": 94450 + }, + { + "epoch": 0.6034780164317749, + "grad_norm": 1.2537955045700073, + "learning_rate": 7.918250149449093e-05, + "loss": 0.9165, + "step": 94460 + }, + { + "epoch": 0.6035419035815136, + "grad_norm": 0.7937589883804321, + "learning_rate": 7.917842696062257e-05, + "loss": 1.1595, + "step": 94470 + }, + { + "epoch": 0.6036057907312523, + "grad_norm": 0.7704179883003235, + "learning_rate": 7.917435213290218e-05, + "loss": 0.7351, + "step": 94480 + }, + { + "epoch": 0.603669677880991, + "grad_norm": 0.5766093730926514, + "learning_rate": 7.917027701137085e-05, + "loss": 0.763, + "step": 94490 + }, + { + "epoch": 0.6037335650307297, + "grad_norm": 1.0193499326705933, + "learning_rate": 7.916620159606958e-05, + "loss": 0.85, + "step": 94500 + }, + { + "epoch": 0.6037974521804684, + "grad_norm": 0.7235758304595947, + "learning_rate": 7.916212588703944e-05, + "loss": 0.9735, + "step": 94510 + }, + { + "epoch": 0.6038613393302071, + "grad_norm": 1.022512435913086, + "learning_rate": 7.915804988432146e-05, + "loss": 0.9828, + "step": 94520 + }, + { + "epoch": 0.6039252264799458, + "grad_norm": 1.0742284059524536, + "learning_rate": 7.915397358795669e-05, + "loss": 0.8061, + "step": 94530 + }, + { + "epoch": 0.6039891136296845, + "grad_norm": 0.8321564793586731, + "learning_rate": 7.914989699798618e-05, + "loss": 0.7689, + "step": 94540 + }, + { + "epoch": 0.6040530007794233, + "grad_norm": 1.0344573259353638, + "learning_rate": 7.9145820114451e-05, + "loss": 0.6514, + "step": 94550 + }, + { + "epoch": 0.604116887929162, + "grad_norm": 0.8881844878196716, + "learning_rate": 7.914174293739221e-05, + "loss": 0.8515, + "step": 94560 + }, + { + "epoch": 0.6041807750789007, + "grad_norm": 0.4412252604961395, + "learning_rate": 7.913766546685083e-05, + "loss": 0.824, + "step": 94570 + }, + { + "epoch": 0.6042446622286394, + "grad_norm": 0.8809495568275452, + "learning_rate": 7.913358770286796e-05, + "loss": 0.7449, + "step": 94580 + }, + { + "epoch": 0.6043085493783781, + "grad_norm": 0.5300993919372559, + "learning_rate": 7.912950964548466e-05, + "loss": 0.7912, + "step": 94590 + }, + { + "epoch": 0.6043724365281168, + "grad_norm": 1.1538459062576294, + "learning_rate": 7.9125431294742e-05, + "loss": 0.9233, + "step": 94600 + }, + { + "epoch": 0.6044363236778555, + "grad_norm": 1.1883444786071777, + "learning_rate": 7.912135265068104e-05, + "loss": 0.9529, + "step": 94610 + }, + { + "epoch": 0.6045002108275941, + "grad_norm": 0.4078890085220337, + "learning_rate": 7.911727371334285e-05, + "loss": 0.8081, + "step": 94620 + }, + { + "epoch": 0.6045640979773328, + "grad_norm": 0.7821246981620789, + "learning_rate": 7.911319448276855e-05, + "loss": 0.71, + "step": 94630 + }, + { + "epoch": 0.6046279851270715, + "grad_norm": 1.401476502418518, + "learning_rate": 7.910911495899919e-05, + "loss": 0.7582, + "step": 94640 + }, + { + "epoch": 0.6046918722768102, + "grad_norm": 0.75636887550354, + "learning_rate": 7.910503514207585e-05, + "loss": 0.8437, + "step": 94650 + }, + { + "epoch": 0.6047557594265489, + "grad_norm": 0.6574771404266357, + "learning_rate": 7.910095503203964e-05, + "loss": 0.701, + "step": 94660 + }, + { + "epoch": 0.6048196465762876, + "grad_norm": 0.7583115100860596, + "learning_rate": 
7.909687462893163e-05, + "loss": 0.6434, + "step": 94670 + }, + { + "epoch": 0.6048835337260263, + "grad_norm": 0.9831967353820801, + "learning_rate": 7.909279393279292e-05, + "loss": 0.8715, + "step": 94680 + }, + { + "epoch": 0.604947420875765, + "grad_norm": 0.7744137048721313, + "learning_rate": 7.908871294366461e-05, + "loss": 0.9322, + "step": 94690 + }, + { + "epoch": 0.6050113080255037, + "grad_norm": 0.7049340605735779, + "learning_rate": 7.90846316615878e-05, + "loss": 0.7142, + "step": 94700 + }, + { + "epoch": 0.6050751951752424, + "grad_norm": 1.377447247505188, + "learning_rate": 7.908055008660358e-05, + "loss": 0.7331, + "step": 94710 + }, + { + "epoch": 0.6051390823249811, + "grad_norm": 0.7816984057426453, + "learning_rate": 7.907646821875305e-05, + "loss": 0.8407, + "step": 94720 + }, + { + "epoch": 0.6052029694747199, + "grad_norm": 0.7210595011711121, + "learning_rate": 7.907238605807734e-05, + "loss": 0.8329, + "step": 94730 + }, + { + "epoch": 0.6052668566244586, + "grad_norm": 0.8540478944778442, + "learning_rate": 7.906830360461757e-05, + "loss": 1.0039, + "step": 94740 + }, + { + "epoch": 0.6053307437741973, + "grad_norm": 1.395521879196167, + "learning_rate": 7.906422085841481e-05, + "loss": 0.8576, + "step": 94750 + }, + { + "epoch": 0.605394630923936, + "grad_norm": 1.551674246788025, + "learning_rate": 7.906013781951022e-05, + "loss": 0.8292, + "step": 94760 + }, + { + "epoch": 0.6054585180736747, + "grad_norm": 0.5742878317832947, + "learning_rate": 7.905605448794489e-05, + "loss": 0.7943, + "step": 94770 + }, + { + "epoch": 0.6055224052234134, + "grad_norm": 1.2160093784332275, + "learning_rate": 7.905197086375995e-05, + "loss": 0.7231, + "step": 94780 + }, + { + "epoch": 0.6055862923731521, + "grad_norm": 0.824417769908905, + "learning_rate": 7.904788694699654e-05, + "loss": 0.7877, + "step": 94790 + }, + { + "epoch": 0.6056501795228908, + "grad_norm": 0.9510350227355957, + "learning_rate": 7.904380273769578e-05, + "loss": 0.9848, + "step": 94800 + }, + { + "epoch": 0.6057140666726295, + "grad_norm": 1.160369634628296, + "learning_rate": 7.90397182358988e-05, + "loss": 0.7315, + "step": 94810 + }, + { + "epoch": 0.6057779538223682, + "grad_norm": 0.949501633644104, + "learning_rate": 7.903563344164673e-05, + "loss": 0.9625, + "step": 94820 + }, + { + "epoch": 0.6058418409721069, + "grad_norm": 1.4712127447128296, + "learning_rate": 7.90315483549807e-05, + "loss": 0.8038, + "step": 94830 + }, + { + "epoch": 0.6059057281218456, + "grad_norm": 1.125845193862915, + "learning_rate": 7.902746297594187e-05, + "loss": 0.9969, + "step": 94840 + }, + { + "epoch": 0.6059696152715843, + "grad_norm": 1.1382551193237305, + "learning_rate": 7.90233773045714e-05, + "loss": 1.1508, + "step": 94850 + }, + { + "epoch": 0.6060335024213229, + "grad_norm": 0.8378157615661621, + "learning_rate": 7.901929134091038e-05, + "loss": 0.8542, + "step": 94860 + }, + { + "epoch": 0.6060973895710616, + "grad_norm": 1.001591682434082, + "learning_rate": 7.9015205085e-05, + "loss": 0.81, + "step": 94870 + }, + { + "epoch": 0.6061612767208003, + "grad_norm": 0.8801831603050232, + "learning_rate": 7.901111853688141e-05, + "loss": 1.3241, + "step": 94880 + }, + { + "epoch": 0.606225163870539, + "grad_norm": 0.739589273929596, + "learning_rate": 7.900703169659574e-05, + "loss": 0.7344, + "step": 94890 + }, + { + "epoch": 0.6062890510202777, + "grad_norm": 0.6854998469352722, + "learning_rate": 7.900294456418418e-05, + "loss": 0.7173, + "step": 94900 + }, + { + "epoch": 0.6063529381700165, + 
"grad_norm": 0.9052863121032715, + "learning_rate": 7.899885713968789e-05, + "loss": 0.8141, + "step": 94910 + }, + { + "epoch": 0.6064168253197552, + "grad_norm": 0.6532935500144958, + "learning_rate": 7.8994769423148e-05, + "loss": 0.8135, + "step": 94920 + }, + { + "epoch": 0.6064807124694939, + "grad_norm": 1.091203212738037, + "learning_rate": 7.89906814146057e-05, + "loss": 0.9978, + "step": 94930 + }, + { + "epoch": 0.6065445996192326, + "grad_norm": 0.9277145266532898, + "learning_rate": 7.898659311410218e-05, + "loss": 0.8818, + "step": 94940 + }, + { + "epoch": 0.6066084867689713, + "grad_norm": 1.1531500816345215, + "learning_rate": 7.898250452167856e-05, + "loss": 0.7621, + "step": 94950 + }, + { + "epoch": 0.60667237391871, + "grad_norm": 0.7892251014709473, + "learning_rate": 7.897841563737605e-05, + "loss": 0.9858, + "step": 94960 + }, + { + "epoch": 0.6067362610684487, + "grad_norm": 0.8957139849662781, + "learning_rate": 7.897432646123583e-05, + "loss": 0.8984, + "step": 94970 + }, + { + "epoch": 0.6068001482181874, + "grad_norm": 1.0226024389266968, + "learning_rate": 7.89702369932991e-05, + "loss": 0.902, + "step": 94980 + }, + { + "epoch": 0.6068640353679261, + "grad_norm": 1.1574631929397583, + "learning_rate": 7.8966147233607e-05, + "loss": 0.9923, + "step": 94990 + }, + { + "epoch": 0.6069279225176648, + "grad_norm": 0.5747475624084473, + "learning_rate": 7.896205718220073e-05, + "loss": 0.8992, + "step": 95000 + }, + { + "epoch": 0.6069918096674035, + "grad_norm": 0.7874606847763062, + "learning_rate": 7.895796683912148e-05, + "loss": 0.9888, + "step": 95010 + }, + { + "epoch": 0.6070556968171422, + "grad_norm": 2.17077898979187, + "learning_rate": 7.895387620441049e-05, + "loss": 0.8266, + "step": 95020 + }, + { + "epoch": 0.6071195839668809, + "grad_norm": 1.0135728120803833, + "learning_rate": 7.894978527810889e-05, + "loss": 0.7683, + "step": 95030 + }, + { + "epoch": 0.6071834711166196, + "grad_norm": 1.4031306505203247, + "learning_rate": 7.894569406025791e-05, + "loss": 0.8458, + "step": 95040 + }, + { + "epoch": 0.6072473582663583, + "grad_norm": 2.591813325881958, + "learning_rate": 7.894160255089876e-05, + "loss": 0.8058, + "step": 95050 + }, + { + "epoch": 0.607311245416097, + "grad_norm": 0.827282726764679, + "learning_rate": 7.893751075007263e-05, + "loss": 1.1121, + "step": 95060 + }, + { + "epoch": 0.6073751325658358, + "grad_norm": 0.9045417904853821, + "learning_rate": 7.893341865782073e-05, + "loss": 0.9837, + "step": 95070 + }, + { + "epoch": 0.6074390197155745, + "grad_norm": 1.0362218618392944, + "learning_rate": 7.892932627418428e-05, + "loss": 0.9518, + "step": 95080 + }, + { + "epoch": 0.6075029068653132, + "grad_norm": 0.9262713193893433, + "learning_rate": 7.892523359920447e-05, + "loss": 0.7016, + "step": 95090 + }, + { + "epoch": 0.6075667940150518, + "grad_norm": 0.7739233374595642, + "learning_rate": 7.892114063292256e-05, + "loss": 0.726, + "step": 95100 + }, + { + "epoch": 0.6076306811647905, + "grad_norm": 0.9387316703796387, + "learning_rate": 7.891704737537972e-05, + "loss": 0.9781, + "step": 95110 + }, + { + "epoch": 0.6076945683145292, + "grad_norm": 0.8391212821006775, + "learning_rate": 7.89129538266172e-05, + "loss": 0.7901, + "step": 95120 + }, + { + "epoch": 0.6077584554642679, + "grad_norm": 1.2274523973464966, + "learning_rate": 7.890885998667623e-05, + "loss": 0.7709, + "step": 95130 + }, + { + "epoch": 0.6078223426140066, + "grad_norm": 2.197125196456909, + "learning_rate": 7.890476585559802e-05, + "loss": 0.8797, + 
"step": 95140 + }, + { + "epoch": 0.6078862297637453, + "grad_norm": 0.8918281197547913, + "learning_rate": 7.890067143342381e-05, + "loss": 0.9798, + "step": 95150 + }, + { + "epoch": 0.607950116913484, + "grad_norm": 0.9237262010574341, + "learning_rate": 7.889657672019483e-05, + "loss": 0.9019, + "step": 95160 + }, + { + "epoch": 0.6080140040632227, + "grad_norm": 2.1953394412994385, + "learning_rate": 7.889248171595235e-05, + "loss": 1.0639, + "step": 95170 + }, + { + "epoch": 0.6080778912129614, + "grad_norm": 1.0482672452926636, + "learning_rate": 7.888838642073757e-05, + "loss": 1.0944, + "step": 95180 + }, + { + "epoch": 0.6081417783627001, + "grad_norm": 0.8295831084251404, + "learning_rate": 7.888429083459175e-05, + "loss": 0.7337, + "step": 95190 + }, + { + "epoch": 0.6082056655124388, + "grad_norm": 0.771742045879364, + "learning_rate": 7.888019495755612e-05, + "loss": 0.8807, + "step": 95200 + }, + { + "epoch": 0.6082695526621775, + "grad_norm": 0.9289408922195435, + "learning_rate": 7.887609878967195e-05, + "loss": 0.8625, + "step": 95210 + }, + { + "epoch": 0.6083334398119162, + "grad_norm": 0.9988054633140564, + "learning_rate": 7.887200233098049e-05, + "loss": 0.992, + "step": 95220 + }, + { + "epoch": 0.6083973269616549, + "grad_norm": 1.2625335454940796, + "learning_rate": 7.8867905581523e-05, + "loss": 0.798, + "step": 95230 + }, + { + "epoch": 0.6084612141113936, + "grad_norm": 0.8305104374885559, + "learning_rate": 7.886421825844037e-05, + "loss": 0.9033, + "step": 95240 + }, + { + "epoch": 0.6085251012611324, + "grad_norm": 0.540634274482727, + "learning_rate": 7.886012095664107e-05, + "loss": 1.109, + "step": 95250 + }, + { + "epoch": 0.6085889884108711, + "grad_norm": 0.7756277918815613, + "learning_rate": 7.885602336419534e-05, + "loss": 1.0592, + "step": 95260 + }, + { + "epoch": 0.6086528755606098, + "grad_norm": 0.8292693495750427, + "learning_rate": 7.885192548114453e-05, + "loss": 0.9055, + "step": 95270 + }, + { + "epoch": 0.6087167627103485, + "grad_norm": 1.1361613273620605, + "learning_rate": 7.884782730752984e-05, + "loss": 0.9744, + "step": 95280 + }, + { + "epoch": 0.6087806498600872, + "grad_norm": 0.6474214792251587, + "learning_rate": 7.88437288433926e-05, + "loss": 0.8828, + "step": 95290 + }, + { + "epoch": 0.6088445370098259, + "grad_norm": 0.8289141058921814, + "learning_rate": 7.883963008877404e-05, + "loss": 0.8699, + "step": 95300 + }, + { + "epoch": 0.6089084241595646, + "grad_norm": 1.024628758430481, + "learning_rate": 7.883553104371547e-05, + "loss": 0.8444, + "step": 95310 + }, + { + "epoch": 0.6089723113093033, + "grad_norm": 0.7327681183815002, + "learning_rate": 7.883143170825815e-05, + "loss": 0.883, + "step": 95320 + }, + { + "epoch": 0.609036198459042, + "grad_norm": 1.0134698152542114, + "learning_rate": 7.882733208244337e-05, + "loss": 0.8742, + "step": 95330 + }, + { + "epoch": 0.6091000856087806, + "grad_norm": 1.6711806058883667, + "learning_rate": 7.882323216631241e-05, + "loss": 1.0554, + "step": 95340 + }, + { + "epoch": 0.6091639727585193, + "grad_norm": 0.8501431345939636, + "learning_rate": 7.881913195990658e-05, + "loss": 0.9567, + "step": 95350 + }, + { + "epoch": 0.609227859908258, + "grad_norm": 1.3267399072647095, + "learning_rate": 7.881503146326714e-05, + "loss": 0.763, + "step": 95360 + }, + { + "epoch": 0.6092917470579967, + "grad_norm": 1.0088346004486084, + "learning_rate": 7.881093067643541e-05, + "loss": 1.0992, + "step": 95370 + }, + { + "epoch": 0.6093556342077354, + "grad_norm": 0.7467685341835022, + 
"learning_rate": 7.88068295994527e-05, + "loss": 0.9101, + "step": 95380 + }, + { + "epoch": 0.6094195213574741, + "grad_norm": 1.0090692043304443, + "learning_rate": 7.880272823236027e-05, + "loss": 0.9231, + "step": 95390 + }, + { + "epoch": 0.6094834085072128, + "grad_norm": 0.705679178237915, + "learning_rate": 7.879862657519948e-05, + "loss": 0.7734, + "step": 95400 + }, + { + "epoch": 0.6095472956569515, + "grad_norm": 1.0415382385253906, + "learning_rate": 7.879452462801158e-05, + "loss": 0.767, + "step": 95410 + }, + { + "epoch": 0.6096111828066902, + "grad_norm": 1.0252981185913086, + "learning_rate": 7.879042239083792e-05, + "loss": 0.775, + "step": 95420 + }, + { + "epoch": 0.609675069956429, + "grad_norm": 1.6463427543640137, + "learning_rate": 7.878631986371978e-05, + "loss": 0.856, + "step": 95430 + }, + { + "epoch": 0.6097389571061677, + "grad_norm": 0.9976633191108704, + "learning_rate": 7.878221704669852e-05, + "loss": 0.8753, + "step": 95440 + }, + { + "epoch": 0.6098028442559064, + "grad_norm": 0.9319173693656921, + "learning_rate": 7.877811393981542e-05, + "loss": 0.7858, + "step": 95450 + }, + { + "epoch": 0.6098667314056451, + "grad_norm": 0.7984362840652466, + "learning_rate": 7.877401054311182e-05, + "loss": 0.7756, + "step": 95460 + }, + { + "epoch": 0.6099306185553838, + "grad_norm": 1.3641668558120728, + "learning_rate": 7.876990685662903e-05, + "loss": 0.8173, + "step": 95470 + }, + { + "epoch": 0.6099945057051225, + "grad_norm": 1.6693613529205322, + "learning_rate": 7.87658028804084e-05, + "loss": 0.7144, + "step": 95480 + }, + { + "epoch": 0.6100583928548612, + "grad_norm": 1.5207191705703735, + "learning_rate": 7.876169861449125e-05, + "loss": 0.8905, + "step": 95490 + }, + { + "epoch": 0.6101222800045999, + "grad_norm": 0.7093350887298584, + "learning_rate": 7.875759405891891e-05, + "loss": 1.0028, + "step": 95500 + }, + { + "epoch": 0.6101861671543386, + "grad_norm": 0.6331580281257629, + "learning_rate": 7.875348921373271e-05, + "loss": 0.699, + "step": 95510 + }, + { + "epoch": 0.6102500543040773, + "grad_norm": 0.6561687588691711, + "learning_rate": 7.8749384078974e-05, + "loss": 0.826, + "step": 95520 + }, + { + "epoch": 0.610313941453816, + "grad_norm": 1.0018503665924072, + "learning_rate": 7.874527865468414e-05, + "loss": 0.8969, + "step": 95530 + }, + { + "epoch": 0.6103778286035547, + "grad_norm": 0.9253289103507996, + "learning_rate": 7.874117294090445e-05, + "loss": 0.8585, + "step": 95540 + }, + { + "epoch": 0.6104417157532934, + "grad_norm": 0.924781858921051, + "learning_rate": 7.873706693767626e-05, + "loss": 1.0908, + "step": 95550 + }, + { + "epoch": 0.6105056029030321, + "grad_norm": 0.8251575231552124, + "learning_rate": 7.873296064504096e-05, + "loss": 0.7825, + "step": 95560 + }, + { + "epoch": 0.6105694900527708, + "grad_norm": 1.3431663513183594, + "learning_rate": 7.87288540630399e-05, + "loss": 0.9041, + "step": 95570 + }, + { + "epoch": 0.6106333772025095, + "grad_norm": 0.9507653117179871, + "learning_rate": 7.872474719171441e-05, + "loss": 0.8849, + "step": 95580 + }, + { + "epoch": 0.6106972643522481, + "grad_norm": 0.655210018157959, + "learning_rate": 7.872064003110589e-05, + "loss": 0.7671, + "step": 95590 + }, + { + "epoch": 0.6107611515019868, + "grad_norm": 0.8236376047134399, + "learning_rate": 7.871653258125564e-05, + "loss": 0.7553, + "step": 95600 + }, + { + "epoch": 0.6108250386517255, + "grad_norm": 1.1168328523635864, + "learning_rate": 7.871242484220509e-05, + "loss": 0.9552, + "step": 95610 + }, + { + "epoch": 
0.6108889258014643, + "grad_norm": 0.9979621767997742, + "learning_rate": 7.870831681399558e-05, + "loss": 0.8804, + "step": 95620 + }, + { + "epoch": 0.610952812951203, + "grad_norm": 0.9995028972625732, + "learning_rate": 7.870420849666847e-05, + "loss": 0.9085, + "step": 95630 + }, + { + "epoch": 0.6110167001009417, + "grad_norm": 0.6226816773414612, + "learning_rate": 7.870009989026516e-05, + "loss": 0.8221, + "step": 95640 + }, + { + "epoch": 0.6110805872506804, + "grad_norm": 0.6372109055519104, + "learning_rate": 7.8695990994827e-05, + "loss": 0.7805, + "step": 95650 + }, + { + "epoch": 0.6111444744004191, + "grad_norm": 0.5445452928543091, + "learning_rate": 7.86918818103954e-05, + "loss": 0.8984, + "step": 95660 + }, + { + "epoch": 0.6112083615501578, + "grad_norm": 1.3520268201828003, + "learning_rate": 7.868777233701174e-05, + "loss": 0.6649, + "step": 95670 + }, + { + "epoch": 0.6112722486998965, + "grad_norm": 3.7762889862060547, + "learning_rate": 7.868366257471737e-05, + "loss": 1.2053, + "step": 95680 + }, + { + "epoch": 0.6113361358496352, + "grad_norm": 0.8047236204147339, + "learning_rate": 7.867955252355371e-05, + "loss": 0.8838, + "step": 95690 + }, + { + "epoch": 0.6114000229993739, + "grad_norm": 0.9412931799888611, + "learning_rate": 7.867544218356215e-05, + "loss": 1.0598, + "step": 95700 + }, + { + "epoch": 0.6114639101491126, + "grad_norm": 0.8420425057411194, + "learning_rate": 7.867133155478408e-05, + "loss": 1.0705, + "step": 95710 + }, + { + "epoch": 0.6115277972988513, + "grad_norm": 3.140885353088379, + "learning_rate": 7.866722063726089e-05, + "loss": 1.1464, + "step": 95720 + }, + { + "epoch": 0.61159168444859, + "grad_norm": 1.0284521579742432, + "learning_rate": 7.866310943103399e-05, + "loss": 1.22, + "step": 95730 + }, + { + "epoch": 0.6116555715983287, + "grad_norm": 0.5975868105888367, + "learning_rate": 7.865899793614478e-05, + "loss": 0.67, + "step": 95740 + }, + { + "epoch": 0.6117194587480674, + "grad_norm": 0.9867643117904663, + "learning_rate": 7.865488615263467e-05, + "loss": 0.8758, + "step": 95750 + }, + { + "epoch": 0.6117833458978061, + "grad_norm": 1.000255823135376, + "learning_rate": 7.865077408054507e-05, + "loss": 0.8155, + "step": 95760 + }, + { + "epoch": 0.6118472330475448, + "grad_norm": 0.6636435389518738, + "learning_rate": 7.864666171991736e-05, + "loss": 0.8283, + "step": 95770 + }, + { + "epoch": 0.6119111201972836, + "grad_norm": 0.43315425515174866, + "learning_rate": 7.864254907079302e-05, + "loss": 0.7155, + "step": 95780 + }, + { + "epoch": 0.6119750073470223, + "grad_norm": 0.7825480103492737, + "learning_rate": 7.863843613321342e-05, + "loss": 0.7891, + "step": 95790 + }, + { + "epoch": 0.612038894496761, + "grad_norm": 1.0271575450897217, + "learning_rate": 7.863432290722e-05, + "loss": 1.0863, + "step": 95800 + }, + { + "epoch": 0.6121027816464997, + "grad_norm": 0.8053982853889465, + "learning_rate": 7.863020939285415e-05, + "loss": 1.0562, + "step": 95810 + }, + { + "epoch": 0.6121666687962384, + "grad_norm": 0.6690106987953186, + "learning_rate": 7.862609559015735e-05, + "loss": 0.9627, + "step": 95820 + }, + { + "epoch": 0.612230555945977, + "grad_norm": 0.6722576022148132, + "learning_rate": 7.862198149917099e-05, + "loss": 0.9642, + "step": 95830 + }, + { + "epoch": 0.6122944430957157, + "grad_norm": 1.5476411581039429, + "learning_rate": 7.86178671199365e-05, + "loss": 0.9018, + "step": 95840 + }, + { + "epoch": 0.6123583302454544, + "grad_norm": 0.9946299195289612, + "learning_rate": 7.861375245249536e-05, + 
"loss": 0.8733, + "step": 95850 + }, + { + "epoch": 0.6124222173951931, + "grad_norm": 0.9345956444740295, + "learning_rate": 7.860963749688897e-05, + "loss": 0.6437, + "step": 95860 + }, + { + "epoch": 0.6124861045449318, + "grad_norm": 0.6753872036933899, + "learning_rate": 7.860552225315877e-05, + "loss": 0.8364, + "step": 95870 + }, + { + "epoch": 0.6125499916946705, + "grad_norm": 0.6145283579826355, + "learning_rate": 7.860140672134622e-05, + "loss": 0.7866, + "step": 95880 + }, + { + "epoch": 0.6126138788444092, + "grad_norm": 1.010877251625061, + "learning_rate": 7.859729090149275e-05, + "loss": 0.9073, + "step": 95890 + }, + { + "epoch": 0.6126777659941479, + "grad_norm": 0.9995620250701904, + "learning_rate": 7.859317479363983e-05, + "loss": 0.8959, + "step": 95900 + }, + { + "epoch": 0.6127416531438866, + "grad_norm": 0.7354846596717834, + "learning_rate": 7.85890583978289e-05, + "loss": 1.1287, + "step": 95910 + }, + { + "epoch": 0.6128055402936253, + "grad_norm": 1.1408978700637817, + "learning_rate": 7.858494171410144e-05, + "loss": 0.9378, + "step": 95920 + }, + { + "epoch": 0.612869427443364, + "grad_norm": 0.7992193698883057, + "learning_rate": 7.858082474249886e-05, + "loss": 0.7438, + "step": 95930 + }, + { + "epoch": 0.6129333145931027, + "grad_norm": 0.8835758566856384, + "learning_rate": 7.857670748306267e-05, + "loss": 0.8875, + "step": 95940 + }, + { + "epoch": 0.6129972017428414, + "grad_norm": 1.195634365081787, + "learning_rate": 7.857258993583429e-05, + "loss": 0.9826, + "step": 95950 + }, + { + "epoch": 0.6130610888925802, + "grad_norm": 0.5840584635734558, + "learning_rate": 7.856847210085523e-05, + "loss": 0.7557, + "step": 95960 + }, + { + "epoch": 0.6131249760423189, + "grad_norm": 0.8779674768447876, + "learning_rate": 7.856435397816693e-05, + "loss": 1.0267, + "step": 95970 + }, + { + "epoch": 0.6131888631920576, + "grad_norm": 0.7890565991401672, + "learning_rate": 7.856023556781087e-05, + "loss": 0.9257, + "step": 95980 + }, + { + "epoch": 0.6132527503417963, + "grad_norm": 0.8011789321899414, + "learning_rate": 7.855611686982854e-05, + "loss": 0.7494, + "step": 95990 + }, + { + "epoch": 0.613316637491535, + "grad_norm": 0.802932620048523, + "learning_rate": 7.85519978842614e-05, + "loss": 0.6817, + "step": 96000 + }, + { + "epoch": 0.6133805246412737, + "grad_norm": 0.604083240032196, + "learning_rate": 7.854787861115093e-05, + "loss": 0.7141, + "step": 96010 + }, + { + "epoch": 0.6134444117910124, + "grad_norm": 0.9903905391693115, + "learning_rate": 7.854375905053866e-05, + "loss": 0.8143, + "step": 96020 + }, + { + "epoch": 0.6135082989407511, + "grad_norm": 1.5344460010528564, + "learning_rate": 7.853963920246601e-05, + "loss": 0.6426, + "step": 96030 + }, + { + "epoch": 0.6135721860904898, + "grad_norm": 0.6864035129547119, + "learning_rate": 7.853551906697452e-05, + "loss": 0.7646, + "step": 96040 + }, + { + "epoch": 0.6136360732402285, + "grad_norm": 0.9132956862449646, + "learning_rate": 7.853139864410565e-05, + "loss": 1.0619, + "step": 96050 + }, + { + "epoch": 0.6136999603899672, + "grad_norm": 1.0728709697723389, + "learning_rate": 7.852727793390094e-05, + "loss": 0.962, + "step": 96060 + }, + { + "epoch": 0.6137638475397058, + "grad_norm": 0.7748542428016663, + "learning_rate": 7.852315693640184e-05, + "loss": 0.7858, + "step": 96070 + }, + { + "epoch": 0.6138277346894445, + "grad_norm": 0.7569340467453003, + "learning_rate": 7.851903565164987e-05, + "loss": 1.1526, + "step": 96080 + }, + { + "epoch": 0.6138916218391832, + "grad_norm": 
1.0770772695541382, + "learning_rate": 7.851491407968655e-05, + "loss": 0.8305, + "step": 96090 + }, + { + "epoch": 0.6139555089889219, + "grad_norm": 0.9833508729934692, + "learning_rate": 7.851079222055338e-05, + "loss": 0.8214, + "step": 96100 + }, + { + "epoch": 0.6140193961386606, + "grad_norm": 0.6459149718284607, + "learning_rate": 7.850667007429187e-05, + "loss": 0.9574, + "step": 96110 + }, + { + "epoch": 0.6140832832883993, + "grad_norm": 0.6652225255966187, + "learning_rate": 7.850254764094351e-05, + "loss": 0.7369, + "step": 96120 + }, + { + "epoch": 0.614147170438138, + "grad_norm": 1.0020723342895508, + "learning_rate": 7.849842492054986e-05, + "loss": 0.8702, + "step": 96130 + }, + { + "epoch": 0.6142110575878768, + "grad_norm": 0.847522497177124, + "learning_rate": 7.84943019131524e-05, + "loss": 0.7167, + "step": 96140 + }, + { + "epoch": 0.6142749447376155, + "grad_norm": 0.7430154085159302, + "learning_rate": 7.849017861879266e-05, + "loss": 0.7177, + "step": 96150 + }, + { + "epoch": 0.6143388318873542, + "grad_norm": 0.7399063110351562, + "learning_rate": 7.84860550375122e-05, + "loss": 1.1364, + "step": 96160 + }, + { + "epoch": 0.6144027190370929, + "grad_norm": 1.0138983726501465, + "learning_rate": 7.84819311693525e-05, + "loss": 0.9364, + "step": 96170 + }, + { + "epoch": 0.6144666061868316, + "grad_norm": 0.8306713104248047, + "learning_rate": 7.847780701435514e-05, + "loss": 0.9233, + "step": 96180 + }, + { + "epoch": 0.6145304933365703, + "grad_norm": 0.8946832418441772, + "learning_rate": 7.84736825725616e-05, + "loss": 1.2262, + "step": 96190 + }, + { + "epoch": 0.614594380486309, + "grad_norm": 0.6287519931793213, + "learning_rate": 7.846955784401345e-05, + "loss": 0.7374, + "step": 96200 + }, + { + "epoch": 0.6146582676360477, + "grad_norm": 0.8874316215515137, + "learning_rate": 7.846543282875222e-05, + "loss": 0.8743, + "step": 96210 + }, + { + "epoch": 0.6147221547857864, + "grad_norm": 0.7302953004837036, + "learning_rate": 7.846130752681946e-05, + "loss": 0.7668, + "step": 96220 + }, + { + "epoch": 0.6147860419355251, + "grad_norm": 0.9165903925895691, + "learning_rate": 7.845718193825671e-05, + "loss": 0.8457, + "step": 96230 + }, + { + "epoch": 0.6148499290852638, + "grad_norm": 0.7827330827713013, + "learning_rate": 7.845305606310552e-05, + "loss": 0.9301, + "step": 96240 + }, + { + "epoch": 0.6149138162350025, + "grad_norm": 1.272709846496582, + "learning_rate": 7.844892990140744e-05, + "loss": 0.6134, + "step": 96250 + }, + { + "epoch": 0.6149777033847412, + "grad_norm": 0.8357341885566711, + "learning_rate": 7.844480345320402e-05, + "loss": 1.3462, + "step": 96260 + }, + { + "epoch": 0.6150415905344799, + "grad_norm": 1.107619047164917, + "learning_rate": 7.844067671853683e-05, + "loss": 1.1149, + "step": 96270 + }, + { + "epoch": 0.6151054776842186, + "grad_norm": 0.9972173571586609, + "learning_rate": 7.843654969744741e-05, + "loss": 1.0077, + "step": 96280 + }, + { + "epoch": 0.6151693648339573, + "grad_norm": 0.5411679148674011, + "learning_rate": 7.843242238997735e-05, + "loss": 0.9392, + "step": 96290 + }, + { + "epoch": 0.615233251983696, + "grad_norm": 0.6733404397964478, + "learning_rate": 7.842829479616818e-05, + "loss": 1.0999, + "step": 96300 + }, + { + "epoch": 0.6152971391334348, + "grad_norm": 1.565991997718811, + "learning_rate": 7.842416691606149e-05, + "loss": 1.13, + "step": 96310 + }, + { + "epoch": 0.6153610262831734, + "grad_norm": 0.7156278491020203, + "learning_rate": 7.842003874969886e-05, + "loss": 1.3407, + "step": 
96320 + }, + { + "epoch": 0.6154249134329121, + "grad_norm": 1.1994881629943848, + "learning_rate": 7.841591029712185e-05, + "loss": 0.9944, + "step": 96330 + }, + { + "epoch": 0.6154888005826508, + "grad_norm": 0.923414945602417, + "learning_rate": 7.841178155837204e-05, + "loss": 0.8468, + "step": 96340 + }, + { + "epoch": 0.6155526877323895, + "grad_norm": 1.0727185010910034, + "learning_rate": 7.8407652533491e-05, + "loss": 0.8268, + "step": 96350 + }, + { + "epoch": 0.6156165748821282, + "grad_norm": 1.3032314777374268, + "learning_rate": 7.840352322252032e-05, + "loss": 0.8223, + "step": 96360 + }, + { + "epoch": 0.6156804620318669, + "grad_norm": 1.055098056793213, + "learning_rate": 7.839939362550161e-05, + "loss": 1.0348, + "step": 96370 + }, + { + "epoch": 0.6157443491816056, + "grad_norm": 1.0928640365600586, + "learning_rate": 7.839526374247642e-05, + "loss": 0.7047, + "step": 96380 + }, + { + "epoch": 0.6158082363313443, + "grad_norm": 1.1519923210144043, + "learning_rate": 7.839113357348637e-05, + "loss": 1.3878, + "step": 96390 + }, + { + "epoch": 0.615872123481083, + "grad_norm": 1.0324209928512573, + "learning_rate": 7.838700311857303e-05, + "loss": 1.0287, + "step": 96400 + }, + { + "epoch": 0.6159360106308217, + "grad_norm": 1.1393169164657593, + "learning_rate": 7.838287237777802e-05, + "loss": 0.7189, + "step": 96410 + }, + { + "epoch": 0.6159998977805604, + "grad_norm": 0.7285189628601074, + "learning_rate": 7.837874135114294e-05, + "loss": 0.8353, + "step": 96420 + }, + { + "epoch": 0.6160637849302991, + "grad_norm": 0.8310270309448242, + "learning_rate": 7.837461003870936e-05, + "loss": 0.9424, + "step": 96430 + }, + { + "epoch": 0.6161276720800378, + "grad_norm": 0.8968883752822876, + "learning_rate": 7.837047844051893e-05, + "loss": 1.1358, + "step": 96440 + }, + { + "epoch": 0.6161915592297765, + "grad_norm": 0.7500566244125366, + "learning_rate": 7.836634655661323e-05, + "loss": 0.9524, + "step": 96450 + }, + { + "epoch": 0.6162554463795152, + "grad_norm": 0.8886957764625549, + "learning_rate": 7.836221438703388e-05, + "loss": 0.6771, + "step": 96460 + }, + { + "epoch": 0.6163193335292539, + "grad_norm": 0.7337315082550049, + "learning_rate": 7.835808193182248e-05, + "loss": 0.7978, + "step": 96470 + }, + { + "epoch": 0.6163832206789926, + "grad_norm": 0.7529950141906738, + "learning_rate": 7.835394919102068e-05, + "loss": 0.8103, + "step": 96480 + }, + { + "epoch": 0.6164471078287314, + "grad_norm": 1.0321061611175537, + "learning_rate": 7.834981616467007e-05, + "loss": 1.1549, + "step": 96490 + }, + { + "epoch": 0.6165109949784701, + "grad_norm": 1.114965558052063, + "learning_rate": 7.83456828528123e-05, + "loss": 0.8008, + "step": 96500 + }, + { + "epoch": 0.6165748821282088, + "grad_norm": 0.8190954923629761, + "learning_rate": 7.834154925548898e-05, + "loss": 1.0127, + "step": 96510 + }, + { + "epoch": 0.6166387692779475, + "grad_norm": 1.0019639730453491, + "learning_rate": 7.833741537274173e-05, + "loss": 0.735, + "step": 96520 + }, + { + "epoch": 0.6167026564276862, + "grad_norm": 0.6965848207473755, + "learning_rate": 7.833328120461219e-05, + "loss": 1.2864, + "step": 96530 + }, + { + "epoch": 0.6167665435774249, + "grad_norm": 1.2692817449569702, + "learning_rate": 7.8329146751142e-05, + "loss": 0.8433, + "step": 96540 + }, + { + "epoch": 0.6168304307271636, + "grad_norm": 1.0348045825958252, + "learning_rate": 7.832501201237279e-05, + "loss": 0.7535, + "step": 96550 + }, + { + "epoch": 0.6168943178769022, + "grad_norm": 0.7855243682861328, + 
"learning_rate": 7.832087698834621e-05, + "loss": 0.8068, + "step": 96560 + }, + { + "epoch": 0.6169582050266409, + "grad_norm": 0.7087273001670837, + "learning_rate": 7.83167416791039e-05, + "loss": 1.015, + "step": 96570 + }, + { + "epoch": 0.6170220921763796, + "grad_norm": 0.6482358574867249, + "learning_rate": 7.83126060846875e-05, + "loss": 0.8353, + "step": 96580 + }, + { + "epoch": 0.6170859793261183, + "grad_norm": 1.1553382873535156, + "learning_rate": 7.830847020513867e-05, + "loss": 0.8833, + "step": 96590 + }, + { + "epoch": 0.617149866475857, + "grad_norm": 1.0533820390701294, + "learning_rate": 7.830433404049904e-05, + "loss": 0.9203, + "step": 96600 + }, + { + "epoch": 0.6172137536255957, + "grad_norm": 0.9476677775382996, + "learning_rate": 7.830019759081028e-05, + "loss": 0.9134, + "step": 96610 + }, + { + "epoch": 0.6172776407753344, + "grad_norm": 1.1443191766738892, + "learning_rate": 7.829606085611408e-05, + "loss": 1.0249, + "step": 96620 + }, + { + "epoch": 0.6173415279250731, + "grad_norm": 1.9042986631393433, + "learning_rate": 7.829192383645203e-05, + "loss": 0.8699, + "step": 96630 + }, + { + "epoch": 0.6174054150748118, + "grad_norm": 0.6363811492919922, + "learning_rate": 7.828778653186586e-05, + "loss": 0.7259, + "step": 96640 + }, + { + "epoch": 0.6174693022245505, + "grad_norm": 0.6034536957740784, + "learning_rate": 7.82836489423972e-05, + "loss": 1.0086, + "step": 96650 + }, + { + "epoch": 0.6175331893742892, + "grad_norm": 0.7996253371238708, + "learning_rate": 7.827951106808771e-05, + "loss": 0.8618, + "step": 96660 + }, + { + "epoch": 0.617597076524028, + "grad_norm": 0.9885534644126892, + "learning_rate": 7.827537290897908e-05, + "loss": 0.8571, + "step": 96670 + }, + { + "epoch": 0.6176609636737667, + "grad_norm": 0.7702460885047913, + "learning_rate": 7.827123446511298e-05, + "loss": 0.8013, + "step": 96680 + }, + { + "epoch": 0.6177248508235054, + "grad_norm": 1.5067464113235474, + "learning_rate": 7.82670957365311e-05, + "loss": 0.8273, + "step": 96690 + }, + { + "epoch": 0.6177887379732441, + "grad_norm": 0.8331496119499207, + "learning_rate": 7.826295672327512e-05, + "loss": 1.0143, + "step": 96700 + }, + { + "epoch": 0.6178526251229828, + "grad_norm": 1.1344146728515625, + "learning_rate": 7.82588174253867e-05, + "loss": 1.0309, + "step": 96710 + }, + { + "epoch": 0.6179165122727215, + "grad_norm": 0.6412261128425598, + "learning_rate": 7.825467784290755e-05, + "loss": 0.8596, + "step": 96720 + }, + { + "epoch": 0.6179803994224602, + "grad_norm": 0.5586232542991638, + "learning_rate": 7.825053797587936e-05, + "loss": 1.0329, + "step": 96730 + }, + { + "epoch": 0.6180442865721989, + "grad_norm": 0.8391451835632324, + "learning_rate": 7.824639782434379e-05, + "loss": 1.0402, + "step": 96740 + }, + { + "epoch": 0.6181081737219376, + "grad_norm": 0.9592933058738708, + "learning_rate": 7.824225738834256e-05, + "loss": 0.9333, + "step": 96750 + }, + { + "epoch": 0.6181720608716763, + "grad_norm": 0.5291448831558228, + "learning_rate": 7.823811666791738e-05, + "loss": 0.6241, + "step": 96760 + }, + { + "epoch": 0.618235948021415, + "grad_norm": 1.1414803266525269, + "learning_rate": 7.823397566310992e-05, + "loss": 0.8571, + "step": 96770 + }, + { + "epoch": 0.6182998351711537, + "grad_norm": 0.7651611566543579, + "learning_rate": 7.822983437396192e-05, + "loss": 0.7009, + "step": 96780 + }, + { + "epoch": 0.6183637223208924, + "grad_norm": 1.016514539718628, + "learning_rate": 7.822569280051505e-05, + "loss": 0.9783, + "step": 96790 + }, + { + 
"epoch": 0.618427609470631, + "grad_norm": 0.9900182485580444, + "learning_rate": 7.822155094281104e-05, + "loss": 0.919, + "step": 96800 + }, + { + "epoch": 0.6184914966203697, + "grad_norm": 0.969688892364502, + "learning_rate": 7.821740880089159e-05, + "loss": 1.054, + "step": 96810 + }, + { + "epoch": 0.6185553837701084, + "grad_norm": 0.9642791748046875, + "learning_rate": 7.821326637479842e-05, + "loss": 0.7227, + "step": 96820 + }, + { + "epoch": 0.6186192709198471, + "grad_norm": 0.9115810394287109, + "learning_rate": 7.820912366457327e-05, + "loss": 0.9451, + "step": 96830 + }, + { + "epoch": 0.6186831580695858, + "grad_norm": 1.4009279012680054, + "learning_rate": 7.820498067025782e-05, + "loss": 1.1126, + "step": 96840 + }, + { + "epoch": 0.6187470452193246, + "grad_norm": 1.6597306728363037, + "learning_rate": 7.820083739189381e-05, + "loss": 0.8138, + "step": 96850 + }, + { + "epoch": 0.6188109323690633, + "grad_norm": 2.7487285137176514, + "learning_rate": 7.819669382952299e-05, + "loss": 0.929, + "step": 96860 + }, + { + "epoch": 0.618874819518802, + "grad_norm": 0.7340418100357056, + "learning_rate": 7.819254998318706e-05, + "loss": 0.9411, + "step": 96870 + }, + { + "epoch": 0.6189387066685407, + "grad_norm": 0.8978639245033264, + "learning_rate": 7.818840585292775e-05, + "loss": 0.755, + "step": 96880 + }, + { + "epoch": 0.6190025938182794, + "grad_norm": 0.8307545781135559, + "learning_rate": 7.818426143878683e-05, + "loss": 0.8567, + "step": 96890 + }, + { + "epoch": 0.6190664809680181, + "grad_norm": 0.6987618803977966, + "learning_rate": 7.818011674080601e-05, + "loss": 0.7964, + "step": 96900 + }, + { + "epoch": 0.6191303681177568, + "grad_norm": 1.2218877077102661, + "learning_rate": 7.817597175902702e-05, + "loss": 1.2578, + "step": 96910 + }, + { + "epoch": 0.6191942552674955, + "grad_norm": 1.1471195220947266, + "learning_rate": 7.817182649349164e-05, + "loss": 0.8193, + "step": 96920 + }, + { + "epoch": 0.6192581424172342, + "grad_norm": 0.7587412595748901, + "learning_rate": 7.816768094424157e-05, + "loss": 0.8189, + "step": 96930 + }, + { + "epoch": 0.6193220295669729, + "grad_norm": 0.5350973010063171, + "learning_rate": 7.81635351113186e-05, + "loss": 0.6833, + "step": 96940 + }, + { + "epoch": 0.6193859167167116, + "grad_norm": 0.5886098146438599, + "learning_rate": 7.815938899476447e-05, + "loss": 0.7651, + "step": 96950 + }, + { + "epoch": 0.6194498038664503, + "grad_norm": 0.8069875240325928, + "learning_rate": 7.815524259462093e-05, + "loss": 0.8622, + "step": 96960 + }, + { + "epoch": 0.619513691016189, + "grad_norm": 0.9382511973381042, + "learning_rate": 7.815109591092973e-05, + "loss": 0.7166, + "step": 96970 + }, + { + "epoch": 0.6195775781659277, + "grad_norm": 1.3142880201339722, + "learning_rate": 7.814694894373263e-05, + "loss": 1.2337, + "step": 96980 + }, + { + "epoch": 0.6196414653156664, + "grad_norm": 0.8636249303817749, + "learning_rate": 7.814280169307142e-05, + "loss": 0.7238, + "step": 96990 + }, + { + "epoch": 0.6197053524654051, + "grad_norm": 0.7896556258201599, + "learning_rate": 7.813865415898785e-05, + "loss": 0.7152, + "step": 97000 + }, + { + "epoch": 0.6197692396151439, + "grad_norm": 0.7915673851966858, + "learning_rate": 7.813450634152369e-05, + "loss": 0.9909, + "step": 97010 + }, + { + "epoch": 0.6198331267648826, + "grad_norm": 0.6319288611412048, + "learning_rate": 7.81303582407207e-05, + "loss": 0.8935, + "step": 97020 + }, + { + "epoch": 0.6198970139146213, + "grad_norm": 0.72498619556427, + "learning_rate": 
7.812620985662066e-05, + "loss": 0.8319, + "step": 97030 + }, + { + "epoch": 0.6199609010643599, + "grad_norm": 0.5626809000968933, + "learning_rate": 7.812206118926539e-05, + "loss": 0.6742, + "step": 97040 + }, + { + "epoch": 0.6200247882140986, + "grad_norm": 1.0397377014160156, + "learning_rate": 7.81179122386966e-05, + "loss": 0.863, + "step": 97050 + }, + { + "epoch": 0.6200886753638373, + "grad_norm": 0.5984945297241211, + "learning_rate": 7.811376300495612e-05, + "loss": 1.0026, + "step": 97060 + }, + { + "epoch": 0.620152562513576, + "grad_norm": 1.6048803329467773, + "learning_rate": 7.810961348808572e-05, + "loss": 0.9494, + "step": 97070 + }, + { + "epoch": 0.6202164496633147, + "grad_norm": 0.6135510206222534, + "learning_rate": 7.810546368812721e-05, + "loss": 0.7765, + "step": 97080 + }, + { + "epoch": 0.6202803368130534, + "grad_norm": 1.2818505764007568, + "learning_rate": 7.810131360512236e-05, + "loss": 0.7591, + "step": 97090 + }, + { + "epoch": 0.6203442239627921, + "grad_norm": 0.8664326071739197, + "learning_rate": 7.809716323911296e-05, + "loss": 0.8345, + "step": 97100 + }, + { + "epoch": 0.6204081111125308, + "grad_norm": 0.9286889433860779, + "learning_rate": 7.809301259014083e-05, + "loss": 1.1506, + "step": 97110 + }, + { + "epoch": 0.6204719982622695, + "grad_norm": 0.7341832518577576, + "learning_rate": 7.808886165824775e-05, + "loss": 0.8342, + "step": 97120 + }, + { + "epoch": 0.6205358854120082, + "grad_norm": 2.2999391555786133, + "learning_rate": 7.808471044347555e-05, + "loss": 0.8378, + "step": 97130 + }, + { + "epoch": 0.6205997725617469, + "grad_norm": 0.6908975839614868, + "learning_rate": 7.808055894586602e-05, + "loss": 1.0413, + "step": 97140 + }, + { + "epoch": 0.6206636597114856, + "grad_norm": 0.7688397765159607, + "learning_rate": 7.807640716546094e-05, + "loss": 0.7239, + "step": 97150 + }, + { + "epoch": 0.6207275468612243, + "grad_norm": 0.830764889717102, + "learning_rate": 7.807225510230216e-05, + "loss": 0.8647, + "step": 97160 + }, + { + "epoch": 0.620791434010963, + "grad_norm": 1.2622300386428833, + "learning_rate": 7.80681027564315e-05, + "loss": 0.8693, + "step": 97170 + }, + { + "epoch": 0.6208553211607017, + "grad_norm": 0.9150146842002869, + "learning_rate": 7.806395012789074e-05, + "loss": 0.854, + "step": 97180 + }, + { + "epoch": 0.6209192083104405, + "grad_norm": 0.8529565334320068, + "learning_rate": 7.805979721672175e-05, + "loss": 0.8418, + "step": 97190 + }, + { + "epoch": 0.6209830954601792, + "grad_norm": 1.1613361835479736, + "learning_rate": 7.80556440229663e-05, + "loss": 0.93, + "step": 97200 + }, + { + "epoch": 0.6210469826099179, + "grad_norm": 1.5015759468078613, + "learning_rate": 7.805149054666626e-05, + "loss": 1.0721, + "step": 97210 + }, + { + "epoch": 0.6211108697596566, + "grad_norm": 0.8608677387237549, + "learning_rate": 7.804733678786345e-05, + "loss": 0.9352, + "step": 97220 + }, + { + "epoch": 0.6211747569093953, + "grad_norm": 0.7824024558067322, + "learning_rate": 7.804318274659967e-05, + "loss": 0.7138, + "step": 97230 + }, + { + "epoch": 0.621238644059134, + "grad_norm": 0.6938091516494751, + "learning_rate": 7.803902842291679e-05, + "loss": 0.8414, + "step": 97240 + }, + { + "epoch": 0.6213025312088727, + "grad_norm": 1.1884207725524902, + "learning_rate": 7.803487381685665e-05, + "loss": 0.846, + "step": 97250 + }, + { + "epoch": 0.6213664183586114, + "grad_norm": 0.9938066005706787, + "learning_rate": 7.803071892846106e-05, + "loss": 1.0066, + "step": 97260 + }, + { + "epoch": 
0.6214303055083501, + "grad_norm": 1.1937052011489868, + "learning_rate": 7.802656375777188e-05, + "loss": 1.042, + "step": 97270 + }, + { + "epoch": 0.6214941926580888, + "grad_norm": 0.7454966306686401, + "learning_rate": 7.802240830483096e-05, + "loss": 0.9139, + "step": 97280 + }, + { + "epoch": 0.6215580798078274, + "grad_norm": 0.8491148948669434, + "learning_rate": 7.801825256968015e-05, + "loss": 0.8569, + "step": 97290 + }, + { + "epoch": 0.6216219669575661, + "grad_norm": 1.283415675163269, + "learning_rate": 7.80140965523613e-05, + "loss": 1.134, + "step": 97300 + }, + { + "epoch": 0.6216858541073048, + "grad_norm": 0.9425275921821594, + "learning_rate": 7.800994025291626e-05, + "loss": 0.8734, + "step": 97310 + }, + { + "epoch": 0.6217497412570435, + "grad_norm": 0.8842566609382629, + "learning_rate": 7.800578367138688e-05, + "loss": 0.9209, + "step": 97320 + }, + { + "epoch": 0.6218136284067822, + "grad_norm": 0.9904354214668274, + "learning_rate": 7.800162680781504e-05, + "loss": 0.7265, + "step": 97330 + }, + { + "epoch": 0.6218775155565209, + "grad_norm": 0.956762969493866, + "learning_rate": 7.79974696622426e-05, + "loss": 0.911, + "step": 97340 + }, + { + "epoch": 0.6219414027062596, + "grad_norm": 0.7186155319213867, + "learning_rate": 7.79933122347114e-05, + "loss": 0.9249, + "step": 97350 + }, + { + "epoch": 0.6220052898559983, + "grad_norm": 0.733720064163208, + "learning_rate": 7.798915452526334e-05, + "loss": 0.9297, + "step": 97360 + }, + { + "epoch": 0.622069177005737, + "grad_norm": 0.8453028202056885, + "learning_rate": 7.798499653394028e-05, + "loss": 0.9505, + "step": 97370 + }, + { + "epoch": 0.6221330641554758, + "grad_norm": 1.2403620481491089, + "learning_rate": 7.798083826078408e-05, + "loss": 1.3309, + "step": 97380 + }, + { + "epoch": 0.6221969513052145, + "grad_norm": 1.1222939491271973, + "learning_rate": 7.797667970583666e-05, + "loss": 1.1289, + "step": 97390 + }, + { + "epoch": 0.6222608384549532, + "grad_norm": 0.633385956287384, + "learning_rate": 7.797252086913984e-05, + "loss": 0.9007, + "step": 97400 + }, + { + "epoch": 0.6223247256046919, + "grad_norm": 0.7996073365211487, + "learning_rate": 7.796877767525162e-05, + "loss": 0.9044, + "step": 97410 + }, + { + "epoch": 0.6223886127544306, + "grad_norm": 0.7875693440437317, + "learning_rate": 7.796461830334642e-05, + "loss": 1.0713, + "step": 97420 + }, + { + "epoch": 0.6224524999041693, + "grad_norm": 1.1441236734390259, + "learning_rate": 7.79604586498133e-05, + "loss": 0.9106, + "step": 97430 + }, + { + "epoch": 0.622516387053908, + "grad_norm": 1.1980715990066528, + "learning_rate": 7.795629871469419e-05, + "loss": 0.9184, + "step": 97440 + }, + { + "epoch": 0.6225802742036467, + "grad_norm": 0.8532522320747375, + "learning_rate": 7.795213849803094e-05, + "loss": 0.827, + "step": 97450 + }, + { + "epoch": 0.6226441613533854, + "grad_norm": 0.9568140506744385, + "learning_rate": 7.794797799986549e-05, + "loss": 0.8368, + "step": 97460 + }, + { + "epoch": 0.6227080485031241, + "grad_norm": 0.8139510750770569, + "learning_rate": 7.794381722023973e-05, + "loss": 1.1102, + "step": 97470 + }, + { + "epoch": 0.6227719356528628, + "grad_norm": 1.3737013339996338, + "learning_rate": 7.793965615919555e-05, + "loss": 1.0294, + "step": 97480 + }, + { + "epoch": 0.6228358228026015, + "grad_norm": 1.1957775354385376, + "learning_rate": 7.793549481677485e-05, + "loss": 0.75, + "step": 97490 + }, + { + "epoch": 0.6228997099523402, + "grad_norm": 0.7739052772521973, + "learning_rate": 7.793133319301956e-05, 
+ "loss": 0.8989, + "step": 97500 + }, + { + "epoch": 0.6229635971020789, + "grad_norm": 1.287320852279663, + "learning_rate": 7.792717128797157e-05, + "loss": 1.1412, + "step": 97510 + }, + { + "epoch": 0.6230274842518176, + "grad_norm": 1.1825543642044067, + "learning_rate": 7.792300910167284e-05, + "loss": 0.7868, + "step": 97520 + }, + { + "epoch": 0.6230913714015562, + "grad_norm": 0.9416884183883667, + "learning_rate": 7.791884663416522e-05, + "loss": 1.0537, + "step": 97530 + }, + { + "epoch": 0.6231552585512949, + "grad_norm": 0.7893606424331665, + "learning_rate": 7.791468388549066e-05, + "loss": 0.9721, + "step": 97540 + }, + { + "epoch": 0.6232191457010336, + "grad_norm": 0.6625798344612122, + "learning_rate": 7.79105208556911e-05, + "loss": 1.0084, + "step": 97550 + }, + { + "epoch": 0.6232830328507724, + "grad_norm": 0.9183120131492615, + "learning_rate": 7.790635754480844e-05, + "loss": 1.2298, + "step": 97560 + }, + { + "epoch": 0.6233469200005111, + "grad_norm": 0.7689588069915771, + "learning_rate": 7.790219395288461e-05, + "loss": 0.8109, + "step": 97570 + }, + { + "epoch": 0.6234108071502498, + "grad_norm": 0.8191707134246826, + "learning_rate": 7.789803007996156e-05, + "loss": 0.9747, + "step": 97580 + }, + { + "epoch": 0.6234746942999885, + "grad_norm": 1.4356540441513062, + "learning_rate": 7.789386592608121e-05, + "loss": 1.053, + "step": 97590 + }, + { + "epoch": 0.6235385814497272, + "grad_norm": 0.5826048851013184, + "learning_rate": 7.78897014912855e-05, + "loss": 0.9845, + "step": 97600 + }, + { + "epoch": 0.6236024685994659, + "grad_norm": 1.0215983390808105, + "learning_rate": 7.788553677561635e-05, + "loss": 1.0139, + "step": 97610 + }, + { + "epoch": 0.6236663557492046, + "grad_norm": 1.153480887413025, + "learning_rate": 7.788137177911573e-05, + "loss": 0.8468, + "step": 97620 + }, + { + "epoch": 0.6237302428989433, + "grad_norm": 1.5136088132858276, + "learning_rate": 7.78772065018256e-05, + "loss": 0.7882, + "step": 97630 + }, + { + "epoch": 0.623794130048682, + "grad_norm": 1.1974624395370483, + "learning_rate": 7.787304094378785e-05, + "loss": 0.9654, + "step": 97640 + }, + { + "epoch": 0.6238580171984207, + "grad_norm": 0.695049524307251, + "learning_rate": 7.786887510504447e-05, + "loss": 0.8364, + "step": 97650 + }, + { + "epoch": 0.6239219043481594, + "grad_norm": 0.7446387410163879, + "learning_rate": 7.786470898563741e-05, + "loss": 0.5882, + "step": 97660 + }, + { + "epoch": 0.6239857914978981, + "grad_norm": 1.169751763343811, + "learning_rate": 7.786054258560863e-05, + "loss": 0.795, + "step": 97670 + }, + { + "epoch": 0.6240496786476368, + "grad_norm": 1.1560198068618774, + "learning_rate": 7.785637590500007e-05, + "loss": 0.7352, + "step": 97680 + }, + { + "epoch": 0.6241135657973755, + "grad_norm": 0.8361658453941345, + "learning_rate": 7.785220894385373e-05, + "loss": 0.7835, + "step": 97690 + }, + { + "epoch": 0.6241774529471142, + "grad_norm": 1.0349642038345337, + "learning_rate": 7.784804170221154e-05, + "loss": 0.7047, + "step": 97700 + }, + { + "epoch": 0.624241340096853, + "grad_norm": 0.7345200181007385, + "learning_rate": 7.784387418011547e-05, + "loss": 0.9272, + "step": 97710 + }, + { + "epoch": 0.6243052272465917, + "grad_norm": 1.210518717765808, + "learning_rate": 7.783970637760751e-05, + "loss": 0.9561, + "step": 97720 + }, + { + "epoch": 0.6243691143963304, + "grad_norm": 1.1094375848770142, + "learning_rate": 7.783553829472962e-05, + "loss": 1.1463, + "step": 97730 + }, + { + "epoch": 0.6244330015460691, + "grad_norm": 
0.9743418097496033, + "learning_rate": 7.783136993152376e-05, + "loss": 0.8341, + "step": 97740 + }, + { + "epoch": 0.6244968886958078, + "grad_norm": 0.6543291211128235, + "learning_rate": 7.782720128803195e-05, + "loss": 0.9606, + "step": 97750 + }, + { + "epoch": 0.6245607758455465, + "grad_norm": 1.24593186378479, + "learning_rate": 7.782303236429614e-05, + "loss": 1.2391, + "step": 97760 + }, + { + "epoch": 0.6246246629952851, + "grad_norm": 1.0866676568984985, + "learning_rate": 7.781886316035834e-05, + "loss": 0.96, + "step": 97770 + }, + { + "epoch": 0.6246885501450238, + "grad_norm": 0.8642030358314514, + "learning_rate": 7.78146936762605e-05, + "loss": 1.1729, + "step": 97780 + }, + { + "epoch": 0.6247524372947625, + "grad_norm": 0.8341190218925476, + "learning_rate": 7.781052391204464e-05, + "loss": 0.8916, + "step": 97790 + }, + { + "epoch": 0.6248163244445012, + "grad_norm": 0.8593606948852539, + "learning_rate": 7.780635386775273e-05, + "loss": 0.7954, + "step": 97800 + }, + { + "epoch": 0.6248802115942399, + "grad_norm": 0.7424865365028381, + "learning_rate": 7.780218354342679e-05, + "loss": 0.8716, + "step": 97810 + }, + { + "epoch": 0.6249440987439786, + "grad_norm": 1.128391146659851, + "learning_rate": 7.779801293910883e-05, + "loss": 0.8336, + "step": 97820 + }, + { + "epoch": 0.6250079858937173, + "grad_norm": 0.8907873630523682, + "learning_rate": 7.779384205484079e-05, + "loss": 1.0559, + "step": 97830 + }, + { + "epoch": 0.625071873043456, + "grad_norm": 0.7362083792686462, + "learning_rate": 7.778967089066474e-05, + "loss": 1.0033, + "step": 97840 + }, + { + "epoch": 0.6251357601931947, + "grad_norm": 0.8434352278709412, + "learning_rate": 7.778549944662266e-05, + "loss": 0.9773, + "step": 97850 + }, + { + "epoch": 0.6251996473429334, + "grad_norm": 2.6534831523895264, + "learning_rate": 7.778132772275657e-05, + "loss": 0.921, + "step": 97860 + }, + { + "epoch": 0.6252635344926721, + "grad_norm": 1.1809990406036377, + "learning_rate": 7.777715571910846e-05, + "loss": 0.9785, + "step": 97870 + }, + { + "epoch": 0.6253274216424108, + "grad_norm": 0.7903746962547302, + "learning_rate": 7.777298343572038e-05, + "loss": 0.8915, + "step": 97880 + }, + { + "epoch": 0.6253913087921495, + "grad_norm": 0.6154451370239258, + "learning_rate": 7.776881087263433e-05, + "loss": 1.0139, + "step": 97890 + }, + { + "epoch": 0.6254551959418883, + "grad_norm": 0.7355427145957947, + "learning_rate": 7.776463802989232e-05, + "loss": 0.8841, + "step": 97900 + }, + { + "epoch": 0.625519083091627, + "grad_norm": 1.5603142976760864, + "learning_rate": 7.776046490753638e-05, + "loss": 0.9459, + "step": 97910 + }, + { + "epoch": 0.6255829702413657, + "grad_norm": 1.0645157098770142, + "learning_rate": 7.775629150560854e-05, + "loss": 0.8107, + "step": 97920 + }, + { + "epoch": 0.6256468573911044, + "grad_norm": 0.7436626553535461, + "learning_rate": 7.775211782415084e-05, + "loss": 0.6884, + "step": 97930 + }, + { + "epoch": 0.6257107445408431, + "grad_norm": 1.6905604600906372, + "learning_rate": 7.774794386320531e-05, + "loss": 0.9719, + "step": 97940 + }, + { + "epoch": 0.6257746316905818, + "grad_norm": 1.0564686059951782, + "learning_rate": 7.774376962281398e-05, + "loss": 0.9414, + "step": 97950 + }, + { + "epoch": 0.6258385188403205, + "grad_norm": 0.7647698521614075, + "learning_rate": 7.773959510301887e-05, + "loss": 1.0905, + "step": 97960 + }, + { + "epoch": 0.6259024059900592, + "grad_norm": 0.8428241610527039, + "learning_rate": 7.773542030386205e-05, + "loss": 1.0266, + "step": 
97970 + }, + { + "epoch": 0.6259662931397979, + "grad_norm": 0.5705221891403198, + "learning_rate": 7.773124522538556e-05, + "loss": 0.8996, + "step": 97980 + }, + { + "epoch": 0.6260301802895366, + "grad_norm": 0.9240884780883789, + "learning_rate": 7.772706986763142e-05, + "loss": 0.6718, + "step": 97990 + }, + { + "epoch": 0.6260940674392753, + "grad_norm": 1.4182459115982056, + "learning_rate": 7.772289423064174e-05, + "loss": 0.9454, + "step": 98000 + }, + { + "epoch": 0.626157954589014, + "grad_norm": 0.46557140350341797, + "learning_rate": 7.77187183144585e-05, + "loss": 0.8053, + "step": 98010 + }, + { + "epoch": 0.6262218417387526, + "grad_norm": 1.070710301399231, + "learning_rate": 7.771454211912378e-05, + "loss": 0.8369, + "step": 98020 + }, + { + "epoch": 0.6262857288884913, + "grad_norm": 1.3407284021377563, + "learning_rate": 7.771036564467967e-05, + "loss": 0.6425, + "step": 98030 + }, + { + "epoch": 0.62634961603823, + "grad_norm": 1.1556596755981445, + "learning_rate": 7.770618889116819e-05, + "loss": 0.7995, + "step": 98040 + }, + { + "epoch": 0.6264135031879687, + "grad_norm": 0.8401532769203186, + "learning_rate": 7.770201185863142e-05, + "loss": 1.0753, + "step": 98050 + }, + { + "epoch": 0.6264773903377074, + "grad_norm": 0.927470862865448, + "learning_rate": 7.769783454711143e-05, + "loss": 0.8812, + "step": 98060 + }, + { + "epoch": 0.6265412774874461, + "grad_norm": 0.7423887252807617, + "learning_rate": 7.769365695665027e-05, + "loss": 0.9529, + "step": 98070 + }, + { + "epoch": 0.6266051646371849, + "grad_norm": 0.5495186448097229, + "learning_rate": 7.768947908729003e-05, + "loss": 0.7396, + "step": 98080 + }, + { + "epoch": 0.6266690517869236, + "grad_norm": 0.8177791833877563, + "learning_rate": 7.768530093907279e-05, + "loss": 0.9336, + "step": 98090 + }, + { + "epoch": 0.6267329389366623, + "grad_norm": 2.721142530441284, + "learning_rate": 7.768112251204061e-05, + "loss": 1.0718, + "step": 98100 + }, + { + "epoch": 0.626796826086401, + "grad_norm": 0.5694549679756165, + "learning_rate": 7.767694380623558e-05, + "loss": 0.6205, + "step": 98110 + }, + { + "epoch": 0.6268607132361397, + "grad_norm": 0.9336040616035461, + "learning_rate": 7.767276482169979e-05, + "loss": 0.8428, + "step": 98120 + }, + { + "epoch": 0.6269246003858784, + "grad_norm": 1.029270052909851, + "learning_rate": 7.766858555847531e-05, + "loss": 0.8425, + "step": 98130 + }, + { + "epoch": 0.6269884875356171, + "grad_norm": 1.2212886810302734, + "learning_rate": 7.766440601660424e-05, + "loss": 1.0028, + "step": 98140 + }, + { + "epoch": 0.6270523746853558, + "grad_norm": 2.6575090885162354, + "learning_rate": 7.766022619612867e-05, + "loss": 0.8225, + "step": 98150 + }, + { + "epoch": 0.6271162618350945, + "grad_norm": 0.7824742197990417, + "learning_rate": 7.765604609709069e-05, + "loss": 0.903, + "step": 98160 + }, + { + "epoch": 0.6271801489848332, + "grad_norm": 0.9830259084701538, + "learning_rate": 7.76518657195324e-05, + "loss": 1.0083, + "step": 98170 + }, + { + "epoch": 0.6272440361345719, + "grad_norm": 0.7284572720527649, + "learning_rate": 7.764768506349589e-05, + "loss": 0.7337, + "step": 98180 + }, + { + "epoch": 0.6273079232843106, + "grad_norm": 1.008009910583496, + "learning_rate": 7.764350412902328e-05, + "loss": 1.0301, + "step": 98190 + }, + { + "epoch": 0.6273718104340493, + "grad_norm": 0.7041063904762268, + "learning_rate": 7.763932291615667e-05, + "loss": 1.0467, + "step": 98200 + }, + { + "epoch": 0.627435697583788, + "grad_norm": 1.120405673980713, + 
"learning_rate": 7.763514142493818e-05, + "loss": 1.0133, + "step": 98210 + }, + { + "epoch": 0.6274995847335267, + "grad_norm": 0.855456531047821, + "learning_rate": 7.76309596554099e-05, + "loss": 1.1737, + "step": 98220 + }, + { + "epoch": 0.6275634718832654, + "grad_norm": 0.8081047534942627, + "learning_rate": 7.762677760761394e-05, + "loss": 0.9021, + "step": 98230 + }, + { + "epoch": 0.6276273590330042, + "grad_norm": 0.7557641267776489, + "learning_rate": 7.762259528159243e-05, + "loss": 0.9765, + "step": 98240 + }, + { + "epoch": 0.6276912461827429, + "grad_norm": 3.023898124694824, + "learning_rate": 7.76184126773875e-05, + "loss": 0.8915, + "step": 98250 + }, + { + "epoch": 0.6277551333324815, + "grad_norm": 1.2447547912597656, + "learning_rate": 7.761422979504128e-05, + "loss": 0.9107, + "step": 98260 + }, + { + "epoch": 0.6278190204822202, + "grad_norm": 1.0318201780319214, + "learning_rate": 7.761004663459584e-05, + "loss": 0.8787, + "step": 98270 + }, + { + "epoch": 0.6278829076319589, + "grad_norm": 0.69561767578125, + "learning_rate": 7.760586319609335e-05, + "loss": 0.8026, + "step": 98280 + }, + { + "epoch": 0.6279467947816976, + "grad_norm": 0.5632861256599426, + "learning_rate": 7.760167947957595e-05, + "loss": 0.7686, + "step": 98290 + }, + { + "epoch": 0.6280106819314363, + "grad_norm": 0.8044828772544861, + "learning_rate": 7.759749548508575e-05, + "loss": 0.8049, + "step": 98300 + }, + { + "epoch": 0.628074569081175, + "grad_norm": 1.2165446281433105, + "learning_rate": 7.759331121266489e-05, + "loss": 0.7037, + "step": 98310 + }, + { + "epoch": 0.6281384562309137, + "grad_norm": 0.5233101844787598, + "learning_rate": 7.758912666235552e-05, + "loss": 0.8435, + "step": 98320 + }, + { + "epoch": 0.6282023433806524, + "grad_norm": 0.7019632458686829, + "learning_rate": 7.758494183419978e-05, + "loss": 0.7907, + "step": 98330 + }, + { + "epoch": 0.6282662305303911, + "grad_norm": 0.7987385988235474, + "learning_rate": 7.758075672823982e-05, + "loss": 1.0906, + "step": 98340 + }, + { + "epoch": 0.6283301176801298, + "grad_norm": 0.8817057013511658, + "learning_rate": 7.757657134451776e-05, + "loss": 0.9428, + "step": 98350 + }, + { + "epoch": 0.6283940048298685, + "grad_norm": 0.7818195819854736, + "learning_rate": 7.757238568307576e-05, + "loss": 0.6948, + "step": 98360 + }, + { + "epoch": 0.6284578919796072, + "grad_norm": 1.5914932489395142, + "learning_rate": 7.756819974395602e-05, + "loss": 0.9719, + "step": 98370 + }, + { + "epoch": 0.6285217791293459, + "grad_norm": 0.9636878967285156, + "learning_rate": 7.756401352720063e-05, + "loss": 0.9189, + "step": 98380 + }, + { + "epoch": 0.6285856662790846, + "grad_norm": 1.070579171180725, + "learning_rate": 7.755982703285178e-05, + "loss": 0.9929, + "step": 98390 + }, + { + "epoch": 0.6286495534288233, + "grad_norm": 0.9646096229553223, + "learning_rate": 7.755564026095164e-05, + "loss": 0.7181, + "step": 98400 + }, + { + "epoch": 0.628713440578562, + "grad_norm": 1.1919089555740356, + "learning_rate": 7.755145321154235e-05, + "loss": 0.956, + "step": 98410 + }, + { + "epoch": 0.6287773277283008, + "grad_norm": 1.0525037050247192, + "learning_rate": 7.754726588466611e-05, + "loss": 0.8289, + "step": 98420 + }, + { + "epoch": 0.6288412148780395, + "grad_norm": 0.8866745233535767, + "learning_rate": 7.754307828036507e-05, + "loss": 1.1507, + "step": 98430 + }, + { + "epoch": 0.6289051020277782, + "grad_norm": 1.4663811922073364, + "learning_rate": 7.753889039868138e-05, + "loss": 0.8841, + "step": 98440 + }, + { + 
"epoch": 0.6289689891775169, + "grad_norm": 0.8468247056007385, + "learning_rate": 7.753470223965726e-05, + "loss": 1.0977, + "step": 98450 + }, + { + "epoch": 0.6290328763272556, + "grad_norm": 0.6286731958389282, + "learning_rate": 7.753051380333485e-05, + "loss": 0.6818, + "step": 98460 + }, + { + "epoch": 0.6290967634769943, + "grad_norm": 1.2429255247116089, + "learning_rate": 7.752632508975636e-05, + "loss": 0.8151, + "step": 98470 + }, + { + "epoch": 0.629160650626733, + "grad_norm": 1.2103321552276611, + "learning_rate": 7.752213609896396e-05, + "loss": 0.7415, + "step": 98480 + }, + { + "epoch": 0.6292245377764717, + "grad_norm": 1.9655529260635376, + "learning_rate": 7.751794683099986e-05, + "loss": 1.1028, + "step": 98490 + }, + { + "epoch": 0.6292884249262103, + "grad_norm": 0.7226641774177551, + "learning_rate": 7.75137572859062e-05, + "loss": 0.9055, + "step": 98500 + }, + { + "epoch": 0.629352312075949, + "grad_norm": 1.1247568130493164, + "learning_rate": 7.750956746372521e-05, + "loss": 1.0251, + "step": 98510 + }, + { + "epoch": 0.6294161992256877, + "grad_norm": 1.818439245223999, + "learning_rate": 7.750537736449908e-05, + "loss": 0.6323, + "step": 98520 + }, + { + "epoch": 0.6294800863754264, + "grad_norm": 0.6944345235824585, + "learning_rate": 7.750118698827e-05, + "loss": 0.9022, + "step": 98530 + }, + { + "epoch": 0.6295439735251651, + "grad_norm": 1.0383299589157104, + "learning_rate": 7.749699633508019e-05, + "loss": 0.9505, + "step": 98540 + }, + { + "epoch": 0.6296078606749038, + "grad_norm": 0.5083116888999939, + "learning_rate": 7.749280540497181e-05, + "loss": 0.7154, + "step": 98550 + }, + { + "epoch": 0.6296717478246425, + "grad_norm": 1.316440224647522, + "learning_rate": 7.748861419798712e-05, + "loss": 0.7183, + "step": 98560 + }, + { + "epoch": 0.6297356349743812, + "grad_norm": 0.9615148901939392, + "learning_rate": 7.74844227141683e-05, + "loss": 1.0438, + "step": 98570 + }, + { + "epoch": 0.6297995221241199, + "grad_norm": 1.223386526107788, + "learning_rate": 7.748023095355756e-05, + "loss": 0.8429, + "step": 98580 + }, + { + "epoch": 0.6298634092738586, + "grad_norm": 0.7648318409919739, + "learning_rate": 7.747603891619712e-05, + "loss": 0.8862, + "step": 98590 + }, + { + "epoch": 0.6299272964235973, + "grad_norm": 0.8979175686836243, + "learning_rate": 7.747184660212918e-05, + "loss": 0.9744, + "step": 98600 + }, + { + "epoch": 0.629991183573336, + "grad_norm": 0.8479979038238525, + "learning_rate": 7.7467654011396e-05, + "loss": 0.9066, + "step": 98610 + }, + { + "epoch": 0.6300550707230748, + "grad_norm": 1.0452567338943481, + "learning_rate": 7.746346114403978e-05, + "loss": 0.717, + "step": 98620 + }, + { + "epoch": 0.6301189578728135, + "grad_norm": 0.5586809515953064, + "learning_rate": 7.745926800010275e-05, + "loss": 0.7231, + "step": 98630 + }, + { + "epoch": 0.6301828450225522, + "grad_norm": 0.9758456945419312, + "learning_rate": 7.745507457962712e-05, + "loss": 0.7899, + "step": 98640 + }, + { + "epoch": 0.6302467321722909, + "grad_norm": 0.8799155354499817, + "learning_rate": 7.745088088265516e-05, + "loss": 0.8026, + "step": 98650 + }, + { + "epoch": 0.6303106193220296, + "grad_norm": 0.7209200263023376, + "learning_rate": 7.744668690922907e-05, + "loss": 0.9363, + "step": 98660 + }, + { + "epoch": 0.6303745064717683, + "grad_norm": 1.0429208278656006, + "learning_rate": 7.74424926593911e-05, + "loss": 1.1868, + "step": 98670 + }, + { + "epoch": 0.630438393621507, + "grad_norm": 0.829575777053833, + "learning_rate": 
7.743829813318349e-05, + "loss": 0.795, + "step": 98680 + }, + { + "epoch": 0.6305022807712457, + "grad_norm": 0.7974848747253418, + "learning_rate": 7.743410333064847e-05, + "loss": 1.0371, + "step": 98690 + }, + { + "epoch": 0.6305661679209844, + "grad_norm": 1.1023069620132446, + "learning_rate": 7.74299082518283e-05, + "loss": 0.8404, + "step": 98700 + }, + { + "epoch": 0.6306300550707231, + "grad_norm": 1.1051509380340576, + "learning_rate": 7.742571289676522e-05, + "loss": 0.9185, + "step": 98710 + }, + { + "epoch": 0.6306939422204618, + "grad_norm": 1.0306414365768433, + "learning_rate": 7.742151726550149e-05, + "loss": 0.784, + "step": 98720 + }, + { + "epoch": 0.6307578293702005, + "grad_norm": 0.6763244271278381, + "learning_rate": 7.741732135807937e-05, + "loss": 0.983, + "step": 98730 + }, + { + "epoch": 0.6308217165199391, + "grad_norm": 0.5767059326171875, + "learning_rate": 7.741312517454109e-05, + "loss": 0.9532, + "step": 98740 + }, + { + "epoch": 0.6308856036696778, + "grad_norm": 0.8445504307746887, + "learning_rate": 7.740892871492894e-05, + "loss": 1.0529, + "step": 98750 + }, + { + "epoch": 0.6309494908194165, + "grad_norm": 0.5848102569580078, + "learning_rate": 7.740473197928513e-05, + "loss": 0.6972, + "step": 98760 + }, + { + "epoch": 0.6310133779691552, + "grad_norm": 1.0560247898101807, + "learning_rate": 7.740053496765199e-05, + "loss": 0.8724, + "step": 98770 + }, + { + "epoch": 0.631077265118894, + "grad_norm": 1.2998313903808594, + "learning_rate": 7.739633768007175e-05, + "loss": 1.0596, + "step": 98780 + }, + { + "epoch": 0.6311411522686327, + "grad_norm": 0.6636534333229065, + "learning_rate": 7.739214011658669e-05, + "loss": 0.9184, + "step": 98790 + }, + { + "epoch": 0.6312050394183714, + "grad_norm": 0.8721036911010742, + "learning_rate": 7.738794227723907e-05, + "loss": 0.9777, + "step": 98800 + }, + { + "epoch": 0.6312689265681101, + "grad_norm": 0.9047155380249023, + "learning_rate": 7.73837441620712e-05, + "loss": 0.8865, + "step": 98810 + }, + { + "epoch": 0.6313328137178488, + "grad_norm": 0.7591509819030762, + "learning_rate": 7.737954577112532e-05, + "loss": 0.8674, + "step": 98820 + }, + { + "epoch": 0.6313967008675875, + "grad_norm": 0.7103126049041748, + "learning_rate": 7.737534710444372e-05, + "loss": 1.0165, + "step": 98830 + }, + { + "epoch": 0.6314605880173262, + "grad_norm": 0.9940080642700195, + "learning_rate": 7.73711481620687e-05, + "loss": 0.8711, + "step": 98840 + }, + { + "epoch": 0.6315244751670649, + "grad_norm": 0.8602542281150818, + "learning_rate": 7.736694894404254e-05, + "loss": 0.8148, + "step": 98850 + }, + { + "epoch": 0.6315883623168036, + "grad_norm": 0.687978208065033, + "learning_rate": 7.736274945040753e-05, + "loss": 1.0732, + "step": 98860 + }, + { + "epoch": 0.6316522494665423, + "grad_norm": 0.7140915989875793, + "learning_rate": 7.735854968120596e-05, + "loss": 1.043, + "step": 98870 + }, + { + "epoch": 0.631716136616281, + "grad_norm": 0.7227775454521179, + "learning_rate": 7.735434963648013e-05, + "loss": 0.8709, + "step": 98880 + }, + { + "epoch": 0.6317800237660197, + "grad_norm": 0.5700997710227966, + "learning_rate": 7.735014931627234e-05, + "loss": 0.7587, + "step": 98890 + }, + { + "epoch": 0.6318439109157584, + "grad_norm": 1.2444158792495728, + "learning_rate": 7.734594872062486e-05, + "loss": 0.7822, + "step": 98900 + }, + { + "epoch": 0.6319077980654971, + "grad_norm": 1.6679047346115112, + "learning_rate": 7.734174784958004e-05, + "loss": 1.1175, + "step": 98910 + }, + { + "epoch": 
0.6319716852152358, + "grad_norm": 0.6713977456092834, + "learning_rate": 7.733754670318016e-05, + "loss": 0.8503, + "step": 98920 + }, + { + "epoch": 0.6320355723649745, + "grad_norm": 1.2853600978851318, + "learning_rate": 7.733334528146753e-05, + "loss": 0.8824, + "step": 98930 + }, + { + "epoch": 0.6320994595147132, + "grad_norm": 1.0082619190216064, + "learning_rate": 7.732914358448448e-05, + "loss": 0.9314, + "step": 98940 + }, + { + "epoch": 0.632163346664452, + "grad_norm": 0.7332447171211243, + "learning_rate": 7.73249416122733e-05, + "loss": 0.7704, + "step": 98950 + }, + { + "epoch": 0.6322272338141907, + "grad_norm": 1.46458101272583, + "learning_rate": 7.732073936487631e-05, + "loss": 0.9249, + "step": 98960 + }, + { + "epoch": 0.6322911209639294, + "grad_norm": 0.8572206497192383, + "learning_rate": 7.731653684233585e-05, + "loss": 1.1214, + "step": 98970 + }, + { + "epoch": 0.6323550081136681, + "grad_norm": 0.780282199382782, + "learning_rate": 7.731233404469424e-05, + "loss": 0.7917, + "step": 98980 + }, + { + "epoch": 0.6324188952634067, + "grad_norm": 0.9528806209564209, + "learning_rate": 7.730813097199379e-05, + "loss": 0.9149, + "step": 98990 + }, + { + "epoch": 0.6324827824131454, + "grad_norm": 0.9242857098579407, + "learning_rate": 7.730392762427683e-05, + "loss": 0.7985, + "step": 99000 + }, + { + "epoch": 0.6325466695628841, + "grad_norm": 0.7129524350166321, + "learning_rate": 7.72997240015857e-05, + "loss": 0.978, + "step": 99010 + }, + { + "epoch": 0.6326105567126228, + "grad_norm": 0.7845136523246765, + "learning_rate": 7.729552010396274e-05, + "loss": 0.8295, + "step": 99020 + }, + { + "epoch": 0.6326744438623615, + "grad_norm": 0.6033239960670471, + "learning_rate": 7.729131593145027e-05, + "loss": 1.1444, + "step": 99030 + }, + { + "epoch": 0.6327383310121002, + "grad_norm": 0.8433771133422852, + "learning_rate": 7.728711148409063e-05, + "loss": 1.1936, + "step": 99040 + }, + { + "epoch": 0.6328022181618389, + "grad_norm": 1.812710165977478, + "learning_rate": 7.728290676192619e-05, + "loss": 0.7792, + "step": 99050 + }, + { + "epoch": 0.6328661053115776, + "grad_norm": 0.8290817737579346, + "learning_rate": 7.727870176499928e-05, + "loss": 0.756, + "step": 99060 + }, + { + "epoch": 0.6329299924613163, + "grad_norm": 1.5934466123580933, + "learning_rate": 7.727449649335222e-05, + "loss": 1.0873, + "step": 99070 + }, + { + "epoch": 0.632993879611055, + "grad_norm": 1.0674597024917603, + "learning_rate": 7.727029094702739e-05, + "loss": 0.7499, + "step": 99080 + }, + { + "epoch": 0.6330577667607937, + "grad_norm": 0.7989637851715088, + "learning_rate": 7.726608512606714e-05, + "loss": 0.926, + "step": 99090 + }, + { + "epoch": 0.6331216539105324, + "grad_norm": 0.6635336875915527, + "learning_rate": 7.726187903051383e-05, + "loss": 0.8383, + "step": 99100 + }, + { + "epoch": 0.6331855410602711, + "grad_norm": 0.4389435648918152, + "learning_rate": 7.725767266040982e-05, + "loss": 0.8101, + "step": 99110 + }, + { + "epoch": 0.6332494282100098, + "grad_norm": 0.8760795593261719, + "learning_rate": 7.725346601579744e-05, + "loss": 1.3585, + "step": 99120 + }, + { + "epoch": 0.6333133153597486, + "grad_norm": 1.2530437707901, + "learning_rate": 7.72492590967191e-05, + "loss": 1.0076, + "step": 99130 + }, + { + "epoch": 0.6333772025094873, + "grad_norm": 1.7212085723876953, + "learning_rate": 7.724505190321714e-05, + "loss": 1.144, + "step": 99140 + }, + { + "epoch": 0.633441089659226, + "grad_norm": 1.0681732892990112, + "learning_rate": 7.724084443533395e-05, + 
"loss": 0.8348, + "step": 99150 + }, + { + "epoch": 0.6335049768089647, + "grad_norm": 0.8044700026512146, + "learning_rate": 7.723663669311188e-05, + "loss": 0.9591, + "step": 99160 + }, + { + "epoch": 0.6335688639587034, + "grad_norm": 1.0960590839385986, + "learning_rate": 7.723242867659331e-05, + "loss": 0.9373, + "step": 99170 + }, + { + "epoch": 0.6336327511084421, + "grad_norm": 1.0857396125793457, + "learning_rate": 7.722822038582062e-05, + "loss": 1.23, + "step": 99180 + }, + { + "epoch": 0.6336966382581808, + "grad_norm": 0.9322388768196106, + "learning_rate": 7.722401182083621e-05, + "loss": 0.8729, + "step": 99190 + }, + { + "epoch": 0.6337605254079195, + "grad_norm": 0.8809077143669128, + "learning_rate": 7.721980298168243e-05, + "loss": 1.1811, + "step": 99200 + }, + { + "epoch": 0.6338244125576582, + "grad_norm": 0.662137508392334, + "learning_rate": 7.721559386840172e-05, + "loss": 0.8042, + "step": 99210 + }, + { + "epoch": 0.6338882997073969, + "grad_norm": 0.7486308813095093, + "learning_rate": 7.72113844810364e-05, + "loss": 0.8313, + "step": 99220 + }, + { + "epoch": 0.6339521868571355, + "grad_norm": 1.1322681903839111, + "learning_rate": 7.720717481962891e-05, + "loss": 0.9974, + "step": 99230 + }, + { + "epoch": 0.6340160740068742, + "grad_norm": 0.9434690475463867, + "learning_rate": 7.720296488422163e-05, + "loss": 0.7033, + "step": 99240 + }, + { + "epoch": 0.6340799611566129, + "grad_norm": 1.0906821489334106, + "learning_rate": 7.719875467485696e-05, + "loss": 0.7467, + "step": 99250 + }, + { + "epoch": 0.6341438483063516, + "grad_norm": 0.9866161942481995, + "learning_rate": 7.71945441915773e-05, + "loss": 0.9562, + "step": 99260 + }, + { + "epoch": 0.6342077354560903, + "grad_norm": 1.0579166412353516, + "learning_rate": 7.719033343442506e-05, + "loss": 0.7671, + "step": 99270 + }, + { + "epoch": 0.634271622605829, + "grad_norm": 0.8389692902565002, + "learning_rate": 7.718612240344264e-05, + "loss": 0.9807, + "step": 99280 + }, + { + "epoch": 0.6343355097555677, + "grad_norm": 0.9779929518699646, + "learning_rate": 7.718191109867244e-05, + "loss": 0.9206, + "step": 99290 + }, + { + "epoch": 0.6343993969053064, + "grad_norm": 1.6635096073150635, + "learning_rate": 7.717769952015687e-05, + "loss": 0.8582, + "step": 99300 + }, + { + "epoch": 0.6344632840550452, + "grad_norm": 1.1410149335861206, + "learning_rate": 7.717348766793837e-05, + "loss": 0.9895, + "step": 99310 + }, + { + "epoch": 0.6345271712047839, + "grad_norm": 0.8290955424308777, + "learning_rate": 7.716927554205935e-05, + "loss": 0.6947, + "step": 99320 + }, + { + "epoch": 0.6345910583545226, + "grad_norm": 0.919790506362915, + "learning_rate": 7.71650631425622e-05, + "loss": 0.8935, + "step": 99330 + }, + { + "epoch": 0.6346549455042613, + "grad_norm": 0.9774859547615051, + "learning_rate": 7.716085046948937e-05, + "loss": 0.9696, + "step": 99340 + }, + { + "epoch": 0.634718832654, + "grad_norm": 0.6959844827651978, + "learning_rate": 7.715663752288328e-05, + "loss": 0.8616, + "step": 99350 + }, + { + "epoch": 0.6347827198037387, + "grad_norm": 0.7934970855712891, + "learning_rate": 7.715242430278636e-05, + "loss": 1.0319, + "step": 99360 + }, + { + "epoch": 0.6348466069534774, + "grad_norm": 0.8985568284988403, + "learning_rate": 7.714821080924102e-05, + "loss": 1.032, + "step": 99370 + }, + { + "epoch": 0.6349104941032161, + "grad_norm": 0.8157596588134766, + "learning_rate": 7.714399704228972e-05, + "loss": 0.8366, + "step": 99380 + }, + { + "epoch": 0.6349743812529548, + "grad_norm": 
0.6816175580024719, + "learning_rate": 7.713978300197488e-05, + "loss": 0.8663, + "step": 99390 + }, + { + "epoch": 0.6350382684026935, + "grad_norm": 1.0564788579940796, + "learning_rate": 7.713556868833896e-05, + "loss": 0.9809, + "step": 99400 + }, + { + "epoch": 0.6351021555524322, + "grad_norm": 1.043823480606079, + "learning_rate": 7.713135410142437e-05, + "loss": 0.7598, + "step": 99410 + }, + { + "epoch": 0.6351660427021709, + "grad_norm": 0.858410120010376, + "learning_rate": 7.71271392412736e-05, + "loss": 0.9618, + "step": 99420 + }, + { + "epoch": 0.6352299298519096, + "grad_norm": 1.3805270195007324, + "learning_rate": 7.712292410792905e-05, + "loss": 0.9138, + "step": 99430 + }, + { + "epoch": 0.6352938170016483, + "grad_norm": 1.437741756439209, + "learning_rate": 7.711870870143321e-05, + "loss": 1.0721, + "step": 99440 + }, + { + "epoch": 0.635357704151387, + "grad_norm": 0.5732793807983398, + "learning_rate": 7.711449302182849e-05, + "loss": 0.8065, + "step": 99450 + }, + { + "epoch": 0.6354215913011257, + "grad_norm": 0.852961003780365, + "learning_rate": 7.711027706915738e-05, + "loss": 0.8523, + "step": 99460 + }, + { + "epoch": 0.6354854784508643, + "grad_norm": 0.7680826783180237, + "learning_rate": 7.710606084346232e-05, + "loss": 1.0963, + "step": 99470 + }, + { + "epoch": 0.635549365600603, + "grad_norm": 0.7236658334732056, + "learning_rate": 7.710184434478577e-05, + "loss": 1.0257, + "step": 99480 + }, + { + "epoch": 0.6356132527503418, + "grad_norm": 0.8568646907806396, + "learning_rate": 7.709762757317021e-05, + "loss": 0.7875, + "step": 99490 + }, + { + "epoch": 0.6356771399000805, + "grad_norm": 1.2610046863555908, + "learning_rate": 7.709341052865811e-05, + "loss": 1.0912, + "step": 99500 + }, + { + "epoch": 0.6357410270498192, + "grad_norm": 0.8206515312194824, + "learning_rate": 7.708919321129192e-05, + "loss": 1.2463, + "step": 99510 + }, + { + "epoch": 0.6358049141995579, + "grad_norm": 1.1320310831069946, + "learning_rate": 7.70849756211141e-05, + "loss": 0.7328, + "step": 99520 + }, + { + "epoch": 0.6358688013492966, + "grad_norm": 0.832253098487854, + "learning_rate": 7.708075775816715e-05, + "loss": 1.0437, + "step": 99530 + }, + { + "epoch": 0.6359326884990353, + "grad_norm": 0.8778328895568848, + "learning_rate": 7.707653962249355e-05, + "loss": 0.748, + "step": 99540 + }, + { + "epoch": 0.635996575648774, + "grad_norm": 0.6165944933891296, + "learning_rate": 7.707232121413577e-05, + "loss": 0.9211, + "step": 99550 + }, + { + "epoch": 0.6360604627985127, + "grad_norm": 0.7879787683486938, + "learning_rate": 7.70681025331363e-05, + "loss": 0.7676, + "step": 99560 + }, + { + "epoch": 0.6361243499482514, + "grad_norm": 1.0421432256698608, + "learning_rate": 7.70638835795376e-05, + "loss": 0.951, + "step": 99570 + }, + { + "epoch": 0.6361882370979901, + "grad_norm": 0.9447425603866577, + "learning_rate": 7.705966435338218e-05, + "loss": 0.9486, + "step": 99580 + }, + { + "epoch": 0.6362521242477288, + "grad_norm": Infinity, + "learning_rate": 7.705586681684145e-05, + "loss": 1.0416, + "step": 99590 + }, + { + "epoch": 0.6363160113974675, + "grad_norm": 0.745619535446167, + "learning_rate": 7.705164707294533e-05, + "loss": 0.7663, + "step": 99600 + }, + { + "epoch": 0.6363798985472062, + "grad_norm": 1.6954360008239746, + "learning_rate": 7.704742705661573e-05, + "loss": 1.0033, + "step": 99610 + }, + { + "epoch": 0.6364437856969449, + "grad_norm": 0.6701345443725586, + "learning_rate": 7.704320676789514e-05, + "loss": 0.7165, + "step": 99620 + }, + { + 
"epoch": 0.6365076728466836, + "grad_norm": 0.7628158926963806, + "learning_rate": 7.703898620682606e-05, + "loss": 0.8939, + "step": 99630 + }, + { + "epoch": 0.6365715599964223, + "grad_norm": 1.4524946212768555, + "learning_rate": 7.7034765373451e-05, + "loss": 0.9185, + "step": 99640 + }, + { + "epoch": 0.636635447146161, + "grad_norm": 1.0932461023330688, + "learning_rate": 7.703054426781246e-05, + "loss": 0.8613, + "step": 99650 + }, + { + "epoch": 0.6366993342958998, + "grad_norm": 0.8748453855514526, + "learning_rate": 7.702632288995297e-05, + "loss": 0.9724, + "step": 99660 + }, + { + "epoch": 0.6367632214456385, + "grad_norm": 0.6235826015472412, + "learning_rate": 7.7022101239915e-05, + "loss": 0.8267, + "step": 99670 + }, + { + "epoch": 0.6368271085953772, + "grad_norm": 0.9133662581443787, + "learning_rate": 7.701787931774111e-05, + "loss": 0.9088, + "step": 99680 + }, + { + "epoch": 0.6368909957451159, + "grad_norm": 0.8982312679290771, + "learning_rate": 7.701365712347379e-05, + "loss": 0.9541, + "step": 99690 + }, + { + "epoch": 0.6369548828948546, + "grad_norm": 0.6857670545578003, + "learning_rate": 7.700943465715557e-05, + "loss": 1.0629, + "step": 99700 + }, + { + "epoch": 0.6370187700445933, + "grad_norm": 0.8805925250053406, + "learning_rate": 7.7005211918829e-05, + "loss": 0.9387, + "step": 99710 + }, + { + "epoch": 0.6370826571943319, + "grad_norm": 0.7308109402656555, + "learning_rate": 7.700098890853658e-05, + "loss": 0.6636, + "step": 99720 + }, + { + "epoch": 0.6371465443440706, + "grad_norm": 0.8776551485061646, + "learning_rate": 7.699676562632084e-05, + "loss": 0.9326, + "step": 99730 + }, + { + "epoch": 0.6372104314938093, + "grad_norm": 0.6855533719062805, + "learning_rate": 7.699254207222429e-05, + "loss": 1.1151, + "step": 99740 + }, + { + "epoch": 0.637274318643548, + "grad_norm": 1.2820175886154175, + "learning_rate": 7.698831824628951e-05, + "loss": 1.0056, + "step": 99750 + }, + { + "epoch": 0.6373382057932867, + "grad_norm": 0.8445931673049927, + "learning_rate": 7.698409414855902e-05, + "loss": 0.9203, + "step": 99760 + }, + { + "epoch": 0.6374020929430254, + "grad_norm": 1.1383895874023438, + "learning_rate": 7.697986977907534e-05, + "loss": 0.9536, + "step": 99770 + }, + { + "epoch": 0.6374659800927641, + "grad_norm": 1.0668944120407104, + "learning_rate": 7.697564513788105e-05, + "loss": 1.3005, + "step": 99780 + }, + { + "epoch": 0.6375298672425028, + "grad_norm": 1.0279992818832397, + "learning_rate": 7.697142022501866e-05, + "loss": 0.9918, + "step": 99790 + }, + { + "epoch": 0.6375937543922415, + "grad_norm": 0.8683011531829834, + "learning_rate": 7.696719504053075e-05, + "loss": 1.1289, + "step": 99800 + }, + { + "epoch": 0.6376576415419802, + "grad_norm": 0.6475001573562622, + "learning_rate": 7.696296958445985e-05, + "loss": 0.9956, + "step": 99810 + }, + { + "epoch": 0.6377215286917189, + "grad_norm": 0.9741643667221069, + "learning_rate": 7.695874385684852e-05, + "loss": 0.8118, + "step": 99820 + }, + { + "epoch": 0.6377854158414576, + "grad_norm": 0.6049178838729858, + "learning_rate": 7.695451785773931e-05, + "loss": 0.9677, + "step": 99830 + }, + { + "epoch": 0.6378493029911964, + "grad_norm": 0.7751360535621643, + "learning_rate": 7.695029158717479e-05, + "loss": 0.7172, + "step": 99840 + }, + { + "epoch": 0.6379131901409351, + "grad_norm": 0.6430035829544067, + "learning_rate": 7.694606504519752e-05, + "loss": 0.9808, + "step": 99850 + }, + { + "epoch": 0.6379770772906738, + "grad_norm": 1.0987446308135986, + "learning_rate": 
7.694183823185005e-05, + "loss": 0.8621, + "step": 99860 + }, + { + "epoch": 0.6380409644404125, + "grad_norm": 0.7962204217910767, + "learning_rate": 7.6937611147175e-05, + "loss": 0.9515, + "step": 99870 + }, + { + "epoch": 0.6381048515901512, + "grad_norm": 0.8740015625953674, + "learning_rate": 7.693338379121486e-05, + "loss": 0.865, + "step": 99880 + }, + { + "epoch": 0.6381687387398899, + "grad_norm": 0.8522897362709045, + "learning_rate": 7.692915616401226e-05, + "loss": 0.7795, + "step": 99890 + }, + { + "epoch": 0.6382326258896286, + "grad_norm": 0.8299471139907837, + "learning_rate": 7.692492826560978e-05, + "loss": 1.0699, + "step": 99900 + }, + { + "epoch": 0.6382965130393673, + "grad_norm": 1.0273027420043945, + "learning_rate": 7.692070009604994e-05, + "loss": 0.8754, + "step": 99910 + }, + { + "epoch": 0.638360400189106, + "grad_norm": 0.8855130672454834, + "learning_rate": 7.69164716553754e-05, + "loss": 0.9245, + "step": 99920 + }, + { + "epoch": 0.6384242873388447, + "grad_norm": 0.9745055437088013, + "learning_rate": 7.691224294362866e-05, + "loss": 0.6921, + "step": 99930 + }, + { + "epoch": 0.6384881744885834, + "grad_norm": 0.7872833609580994, + "learning_rate": 7.690801396085239e-05, + "loss": 0.7978, + "step": 99940 + }, + { + "epoch": 0.6385520616383221, + "grad_norm": 0.8484395742416382, + "learning_rate": 7.690378470708912e-05, + "loss": 0.6428, + "step": 99950 + }, + { + "epoch": 0.6386159487880607, + "grad_norm": 0.9526743292808533, + "learning_rate": 7.689955518238148e-05, + "loss": 0.9055, + "step": 99960 + }, + { + "epoch": 0.6386798359377994, + "grad_norm": 0.7249189615249634, + "learning_rate": 7.689532538677203e-05, + "loss": 0.9418, + "step": 99970 + }, + { + "epoch": 0.6387437230875381, + "grad_norm": 0.695597767829895, + "learning_rate": 7.689109532030339e-05, + "loss": 0.9258, + "step": 99980 + }, + { + "epoch": 0.6388076102372768, + "grad_norm": 1.010576605796814, + "learning_rate": 7.688686498301816e-05, + "loss": 0.8175, + "step": 99990 + }, + { + "epoch": 0.6388714973870155, + "grad_norm": 0.8327822685241699, + "learning_rate": 7.688263437495892e-05, + "loss": 0.8687, + "step": 100000 + }, + { + "epoch": 0.6389353845367542, + "grad_norm": 0.7016774415969849, + "learning_rate": 7.687840349616833e-05, + "loss": 1.0383, + "step": 100010 + }, + { + "epoch": 0.638999271686493, + "grad_norm": 0.9525433778762817, + "learning_rate": 7.687417234668895e-05, + "loss": 0.8909, + "step": 100020 + }, + { + "epoch": 0.6390631588362317, + "grad_norm": 0.8068029284477234, + "learning_rate": 7.686994092656339e-05, + "loss": 0.9069, + "step": 100030 + }, + { + "epoch": 0.6391270459859704, + "grad_norm": 1.108211636543274, + "learning_rate": 7.686570923583429e-05, + "loss": 0.931, + "step": 100040 + }, + { + "epoch": 0.6391909331357091, + "grad_norm": 1.0641669034957886, + "learning_rate": 7.686147727454426e-05, + "loss": 0.8077, + "step": 100050 + }, + { + "epoch": 0.6392548202854478, + "grad_norm": 0.8529702425003052, + "learning_rate": 7.68572450427359e-05, + "loss": 1.153, + "step": 100060 + }, + { + "epoch": 0.6393187074351865, + "grad_norm": 0.6657126545906067, + "learning_rate": 7.685301254045188e-05, + "loss": 0.7854, + "step": 100070 + }, + { + "epoch": 0.6393825945849252, + "grad_norm": 1.6039618253707886, + "learning_rate": 7.684877976773476e-05, + "loss": 0.9493, + "step": 100080 + }, + { + "epoch": 0.6394464817346639, + "grad_norm": 0.7978668212890625, + "learning_rate": 7.684454672462723e-05, + "loss": 1.034, + "step": 100090 + }, + { + "epoch": 
0.6395103688844026, + "grad_norm": 0.9934602379798889, + "learning_rate": 7.684031341117186e-05, + "loss": 1.1376, + "step": 100100 + }, + { + "epoch": 0.6395742560341413, + "grad_norm": 1.048313856124878, + "learning_rate": 7.683607982741132e-05, + "loss": 0.9767, + "step": 100110 + }, + { + "epoch": 0.63963814318388, + "grad_norm": 0.9221808910369873, + "learning_rate": 7.683184597338826e-05, + "loss": 0.836, + "step": 100120 + }, + { + "epoch": 0.6397020303336187, + "grad_norm": 0.9617723822593689, + "learning_rate": 7.682761184914528e-05, + "loss": 0.6673, + "step": 100130 + }, + { + "epoch": 0.6397659174833574, + "grad_norm": 1.2165039777755737, + "learning_rate": 7.682337745472505e-05, + "loss": 1.1207, + "step": 100140 + }, + { + "epoch": 0.6398298046330961, + "grad_norm": 1.1467498540878296, + "learning_rate": 7.681914279017019e-05, + "loss": 0.8988, + "step": 100150 + }, + { + "epoch": 0.6398936917828348, + "grad_norm": 1.0085322856903076, + "learning_rate": 7.681490785552337e-05, + "loss": 0.9632, + "step": 100160 + }, + { + "epoch": 0.6399575789325735, + "grad_norm": 1.1508851051330566, + "learning_rate": 7.681067265082721e-05, + "loss": 0.885, + "step": 100170 + }, + { + "epoch": 0.6400214660823123, + "grad_norm": 1.3940848112106323, + "learning_rate": 7.680643717612441e-05, + "loss": 0.9147, + "step": 100180 + }, + { + "epoch": 0.640085353232051, + "grad_norm": 1.0096964836120605, + "learning_rate": 7.680220143145757e-05, + "loss": 0.9108, + "step": 100190 + }, + { + "epoch": 0.6401492403817896, + "grad_norm": 0.6056742668151855, + "learning_rate": 7.679796541686942e-05, + "loss": 0.9895, + "step": 100200 + }, + { + "epoch": 0.6402131275315283, + "grad_norm": 0.8732916116714478, + "learning_rate": 7.679372913240252e-05, + "loss": 0.8695, + "step": 100210 + }, + { + "epoch": 0.640277014681267, + "grad_norm": 1.1940739154815674, + "learning_rate": 7.678949257809962e-05, + "loss": 0.851, + "step": 100220 + }, + { + "epoch": 0.6403409018310057, + "grad_norm": 0.9139200448989868, + "learning_rate": 7.678525575400335e-05, + "loss": 0.8596, + "step": 100230 + }, + { + "epoch": 0.6404047889807444, + "grad_norm": 0.871724009513855, + "learning_rate": 7.678101866015638e-05, + "loss": 1.1388, + "step": 100240 + }, + { + "epoch": 0.6404686761304831, + "grad_norm": 1.4798542261123657, + "learning_rate": 7.677678129660137e-05, + "loss": 0.8845, + "step": 100250 + }, + { + "epoch": 0.6405325632802218, + "grad_norm": 0.8862691521644592, + "learning_rate": 7.677254366338103e-05, + "loss": 0.8407, + "step": 100260 + }, + { + "epoch": 0.6405964504299605, + "grad_norm": 1.0017880201339722, + "learning_rate": 7.676830576053799e-05, + "loss": 0.9927, + "step": 100270 + }, + { + "epoch": 0.6406603375796992, + "grad_norm": 1.1630281209945679, + "learning_rate": 7.676406758811497e-05, + "loss": 1.0337, + "step": 100280 + }, + { + "epoch": 0.6407242247294379, + "grad_norm": 0.8417305946350098, + "learning_rate": 7.675982914615464e-05, + "loss": 0.799, + "step": 100290 + }, + { + "epoch": 0.6407881118791766, + "grad_norm": 0.5187862515449524, + "learning_rate": 7.675559043469966e-05, + "loss": 0.9353, + "step": 100300 + }, + { + "epoch": 0.6408519990289153, + "grad_norm": 0.6978999376296997, + "learning_rate": 7.675135145379276e-05, + "loss": 0.9321, + "step": 100310 + }, + { + "epoch": 0.640915886178654, + "grad_norm": 0.9272652268409729, + "learning_rate": 7.674711220347659e-05, + "loss": 0.9838, + "step": 100320 + }, + { + "epoch": 0.6409797733283927, + "grad_norm": 0.8608036637306213, + 
"learning_rate": 7.674287268379386e-05, + "loss": 0.9587, + "step": 100330 + }, + { + "epoch": 0.6410436604781314, + "grad_norm": 1.0041062831878662, + "learning_rate": 7.673863289478727e-05, + "loss": 1.0735, + "step": 100340 + }, + { + "epoch": 0.6411075476278701, + "grad_norm": 0.7018103003501892, + "learning_rate": 7.673439283649952e-05, + "loss": 0.9516, + "step": 100350 + }, + { + "epoch": 0.6411714347776089, + "grad_norm": 1.4162198305130005, + "learning_rate": 7.673015250897331e-05, + "loss": 1.0228, + "step": 100360 + }, + { + "epoch": 0.6412353219273476, + "grad_norm": 0.7780821919441223, + "learning_rate": 7.672591191225134e-05, + "loss": 0.9125, + "step": 100370 + }, + { + "epoch": 0.6412992090770863, + "grad_norm": 0.9130464792251587, + "learning_rate": 7.67216710463763e-05, + "loss": 0.779, + "step": 100380 + }, + { + "epoch": 0.641363096226825, + "grad_norm": 1.32298743724823, + "learning_rate": 7.671742991139093e-05, + "loss": 0.7318, + "step": 100390 + }, + { + "epoch": 0.6414269833765637, + "grad_norm": 0.8966230750083923, + "learning_rate": 7.671318850733791e-05, + "loss": 0.8519, + "step": 100400 + }, + { + "epoch": 0.6414908705263024, + "grad_norm": 0.7686439752578735, + "learning_rate": 7.670894683425997e-05, + "loss": 0.796, + "step": 100410 + }, + { + "epoch": 0.6415547576760411, + "grad_norm": 0.48994553089141846, + "learning_rate": 7.670470489219986e-05, + "loss": 0.8914, + "step": 100420 + }, + { + "epoch": 0.6416186448257798, + "grad_norm": 0.9652213454246521, + "learning_rate": 7.670046268120023e-05, + "loss": 0.8382, + "step": 100430 + }, + { + "epoch": 0.6416825319755184, + "grad_norm": 0.8491564989089966, + "learning_rate": 7.669622020130387e-05, + "loss": 0.6973, + "step": 100440 + }, + { + "epoch": 0.6417464191252571, + "grad_norm": 0.8289761543273926, + "learning_rate": 7.669197745255348e-05, + "loss": 1.0381, + "step": 100450 + }, + { + "epoch": 0.6418103062749958, + "grad_norm": 1.043124794960022, + "learning_rate": 7.668773443499176e-05, + "loss": 0.9624, + "step": 100460 + }, + { + "epoch": 0.6418741934247345, + "grad_norm": 0.8697907328605652, + "learning_rate": 7.668349114866149e-05, + "loss": 0.8019, + "step": 100470 + }, + { + "epoch": 0.6419380805744732, + "grad_norm": 2.3505332469940186, + "learning_rate": 7.667924759360537e-05, + "loss": 0.7879, + "step": 100480 + }, + { + "epoch": 0.6420019677242119, + "grad_norm": 0.638027548789978, + "learning_rate": 7.667500376986614e-05, + "loss": 0.8642, + "step": 100490 + }, + { + "epoch": 0.6420658548739506, + "grad_norm": 0.7815401554107666, + "learning_rate": 7.667075967748655e-05, + "loss": 0.9564, + "step": 100500 + }, + { + "epoch": 0.6421297420236893, + "grad_norm": 0.5453735589981079, + "learning_rate": 7.666651531650934e-05, + "loss": 0.9481, + "step": 100510 + }, + { + "epoch": 0.642193629173428, + "grad_norm": 0.6826755404472351, + "learning_rate": 7.666227068697722e-05, + "loss": 0.9086, + "step": 100520 + }, + { + "epoch": 0.6422575163231667, + "grad_norm": 1.5171852111816406, + "learning_rate": 7.665802578893301e-05, + "loss": 0.993, + "step": 100530 + }, + { + "epoch": 0.6423214034729055, + "grad_norm": 1.0722650289535522, + "learning_rate": 7.665378062241939e-05, + "loss": 1.005, + "step": 100540 + }, + { + "epoch": 0.6423852906226442, + "grad_norm": 0.9586762189865112, + "learning_rate": 7.664953518747916e-05, + "loss": 0.8817, + "step": 100550 + }, + { + "epoch": 0.6424491777723829, + "grad_norm": 1.246511459350586, + "learning_rate": 7.664528948415505e-05, + "loss": 0.7576, + "step": 
100560 + }, + { + "epoch": 0.6425130649221216, + "grad_norm": 0.9459572434425354, + "learning_rate": 7.664104351248982e-05, + "loss": 1.0113, + "step": 100570 + }, + { + "epoch": 0.6425769520718603, + "grad_norm": 0.9673700332641602, + "learning_rate": 7.663679727252624e-05, + "loss": 1.2202, + "step": 100580 + }, + { + "epoch": 0.642640839221599, + "grad_norm": 1.0018703937530518, + "learning_rate": 7.663255076430707e-05, + "loss": 0.8106, + "step": 100590 + }, + { + "epoch": 0.6427047263713377, + "grad_norm": 2.040289878845215, + "learning_rate": 7.662830398787506e-05, + "loss": 0.8261, + "step": 100600 + }, + { + "epoch": 0.6427686135210764, + "grad_norm": 1.0974010229110718, + "learning_rate": 7.662405694327302e-05, + "loss": 0.9523, + "step": 100610 + }, + { + "epoch": 0.6428325006708151, + "grad_norm": 0.9285750389099121, + "learning_rate": 7.661980963054366e-05, + "loss": 0.7965, + "step": 100620 + }, + { + "epoch": 0.6428963878205538, + "grad_norm": 0.529984176158905, + "learning_rate": 7.66155620497298e-05, + "loss": 0.6934, + "step": 100630 + }, + { + "epoch": 0.6429602749702925, + "grad_norm": 0.9809777736663818, + "learning_rate": 7.661131420087421e-05, + "loss": 1.0064, + "step": 100640 + }, + { + "epoch": 0.6430241621200312, + "grad_norm": 0.7377033829689026, + "learning_rate": 7.660706608401965e-05, + "loss": 0.9637, + "step": 100650 + }, + { + "epoch": 0.6430880492697699, + "grad_norm": 0.7346864342689514, + "learning_rate": 7.660281769920893e-05, + "loss": 0.8089, + "step": 100660 + }, + { + "epoch": 0.6431519364195086, + "grad_norm": 0.6735924482345581, + "learning_rate": 7.659856904648482e-05, + "loss": 0.7855, + "step": 100670 + }, + { + "epoch": 0.6432158235692473, + "grad_norm": 1.3427221775054932, + "learning_rate": 7.659432012589009e-05, + "loss": 0.7815, + "step": 100680 + }, + { + "epoch": 0.6432797107189859, + "grad_norm": 0.7119907736778259, + "learning_rate": 7.659007093746757e-05, + "loss": 0.8669, + "step": 100690 + }, + { + "epoch": 0.6433435978687246, + "grad_norm": 1.4726430177688599, + "learning_rate": 7.658582148126001e-05, + "loss": 0.9779, + "step": 100700 + }, + { + "epoch": 0.6434074850184633, + "grad_norm": 0.721088171005249, + "learning_rate": 7.658157175731024e-05, + "loss": 0.7114, + "step": 100710 + }, + { + "epoch": 0.643471372168202, + "grad_norm": 0.8141944408416748, + "learning_rate": 7.657732176566105e-05, + "loss": 1.0834, + "step": 100720 + }, + { + "epoch": 0.6435352593179408, + "grad_norm": 1.5644798278808594, + "learning_rate": 7.657307150635524e-05, + "loss": 1.3679, + "step": 100730 + }, + { + "epoch": 0.6435991464676795, + "grad_norm": 1.1040544509887695, + "learning_rate": 7.65688209794356e-05, + "loss": 0.913, + "step": 100740 + }, + { + "epoch": 0.6436630336174182, + "grad_norm": 0.8945556282997131, + "learning_rate": 7.656457018494496e-05, + "loss": 0.9663, + "step": 100750 + }, + { + "epoch": 0.6437269207671569, + "grad_norm": 0.8544933199882507, + "learning_rate": 7.656031912292612e-05, + "loss": 0.7184, + "step": 100760 + }, + { + "epoch": 0.6437908079168956, + "grad_norm": 0.5265579223632812, + "learning_rate": 7.655606779342188e-05, + "loss": 0.9399, + "step": 100770 + }, + { + "epoch": 0.6438546950666343, + "grad_norm": 0.9724834561347961, + "learning_rate": 7.655181619647505e-05, + "loss": 0.919, + "step": 100780 + }, + { + "epoch": 0.643918582216373, + "grad_norm": 0.756826639175415, + "learning_rate": 7.654756433212848e-05, + "loss": 0.7773, + "step": 100790 + }, + { + "epoch": 0.6439824693661117, + "grad_norm": 
0.599709689617157, + "learning_rate": 7.654331220042497e-05, + "loss": 0.8449, + "step": 100800 + }, + { + "epoch": 0.6440463565158504, + "grad_norm": 0.8727756142616272, + "learning_rate": 7.653905980140734e-05, + "loss": 0.8233, + "step": 100810 + }, + { + "epoch": 0.6441102436655891, + "grad_norm": 1.616363286972046, + "learning_rate": 7.653480713511841e-05, + "loss": 0.8221, + "step": 100820 + }, + { + "epoch": 0.6441741308153278, + "grad_norm": 0.8817083239555359, + "learning_rate": 7.653055420160102e-05, + "loss": 0.8405, + "step": 100830 + }, + { + "epoch": 0.6442380179650665, + "grad_norm": 1.3261228799819946, + "learning_rate": 7.6526301000898e-05, + "loss": 0.9731, + "step": 100840 + }, + { + "epoch": 0.6443019051148052, + "grad_norm": 0.7460963726043701, + "learning_rate": 7.652204753305217e-05, + "loss": 0.8105, + "step": 100850 + }, + { + "epoch": 0.6443657922645439, + "grad_norm": 1.7109055519104004, + "learning_rate": 7.651779379810639e-05, + "loss": 1.1521, + "step": 100860 + }, + { + "epoch": 0.6444296794142826, + "grad_norm": 1.031250238418579, + "learning_rate": 7.651353979610348e-05, + "loss": 0.9203, + "step": 100870 + }, + { + "epoch": 0.6444935665640213, + "grad_norm": 1.0401231050491333, + "learning_rate": 7.650928552708628e-05, + "loss": 0.7879, + "step": 100880 + }, + { + "epoch": 0.64455745371376, + "grad_norm": 0.9327844381332397, + "learning_rate": 7.650503099109765e-05, + "loss": 0.9528, + "step": 100890 + }, + { + "epoch": 0.6446213408634988, + "grad_norm": 0.803861677646637, + "learning_rate": 7.650077618818044e-05, + "loss": 0.7373, + "step": 100900 + }, + { + "epoch": 0.6446852280132375, + "grad_norm": 1.1715584993362427, + "learning_rate": 7.649652111837746e-05, + "loss": 0.963, + "step": 100910 + }, + { + "epoch": 0.6447491151629762, + "grad_norm": 0.7066060900688171, + "learning_rate": 7.649226578173161e-05, + "loss": 0.8937, + "step": 100920 + }, + { + "epoch": 0.6448130023127148, + "grad_norm": 0.9723853468894958, + "learning_rate": 7.648801017828571e-05, + "loss": 0.9408, + "step": 100930 + }, + { + "epoch": 0.6448768894624535, + "grad_norm": 0.7268878817558289, + "learning_rate": 7.648375430808264e-05, + "loss": 0.9303, + "step": 100940 + }, + { + "epoch": 0.6449407766121922, + "grad_norm": 0.8822718262672424, + "learning_rate": 7.647949817116525e-05, + "loss": 1.1401, + "step": 100950 + }, + { + "epoch": 0.6450046637619309, + "grad_norm": 0.9792453646659851, + "learning_rate": 7.64752417675764e-05, + "loss": 0.9359, + "step": 100960 + }, + { + "epoch": 0.6450685509116696, + "grad_norm": 0.5390404462814331, + "learning_rate": 7.647098509735897e-05, + "loss": 0.8654, + "step": 100970 + }, + { + "epoch": 0.6451324380614083, + "grad_norm": 0.7712870836257935, + "learning_rate": 7.646672816055583e-05, + "loss": 0.8716, + "step": 100980 + }, + { + "epoch": 0.645196325211147, + "grad_norm": 0.711517333984375, + "learning_rate": 7.646247095720982e-05, + "loss": 0.978, + "step": 100990 + }, + { + "epoch": 0.6452602123608857, + "grad_norm": 0.6369432210922241, + "learning_rate": 7.645821348736383e-05, + "loss": 0.8599, + "step": 101000 + }, + { + "epoch": 0.6453240995106244, + "grad_norm": 1.0040960311889648, + "learning_rate": 7.645395575106075e-05, + "loss": 0.8098, + "step": 101010 + }, + { + "epoch": 0.6453879866603631, + "grad_norm": 0.9577940106391907, + "learning_rate": 7.644969774834348e-05, + "loss": 0.6533, + "step": 101020 + }, + { + "epoch": 0.6454518738101018, + "grad_norm": 0.8677441477775574, + "learning_rate": 7.644543947925483e-05, + "loss": 
0.8457, + "step": 101030 + }, + { + "epoch": 0.6455157609598405, + "grad_norm": 0.7165183424949646, + "learning_rate": 7.644118094383774e-05, + "loss": 0.8876, + "step": 101040 + }, + { + "epoch": 0.6455796481095792, + "grad_norm": 1.3950011730194092, + "learning_rate": 7.643692214213507e-05, + "loss": 1.1486, + "step": 101050 + }, + { + "epoch": 0.645643535259318, + "grad_norm": 0.824797511100769, + "learning_rate": 7.643266307418974e-05, + "loss": 0.8211, + "step": 101060 + }, + { + "epoch": 0.6457074224090567, + "grad_norm": 0.7592344284057617, + "learning_rate": 7.642840374004463e-05, + "loss": 1.0379, + "step": 101070 + }, + { + "epoch": 0.6457713095587954, + "grad_norm": 0.9701903462409973, + "learning_rate": 7.642414413974262e-05, + "loss": 0.6966, + "step": 101080 + }, + { + "epoch": 0.6458351967085341, + "grad_norm": 0.8895474672317505, + "learning_rate": 7.641988427332663e-05, + "loss": 1.1851, + "step": 101090 + }, + { + "epoch": 0.6458990838582728, + "grad_norm": 0.7744872570037842, + "learning_rate": 7.641562414083952e-05, + "loss": 0.8782, + "step": 101100 + }, + { + "epoch": 0.6459629710080115, + "grad_norm": 1.7571711540222168, + "learning_rate": 7.641136374232425e-05, + "loss": 0.9443, + "step": 101110 + }, + { + "epoch": 0.6460268581577502, + "grad_norm": 0.6616213917732239, + "learning_rate": 7.640710307782368e-05, + "loss": 0.7928, + "step": 101120 + }, + { + "epoch": 0.6460907453074889, + "grad_norm": 0.7461645603179932, + "learning_rate": 7.640284214738075e-05, + "loss": 1.0164, + "step": 101130 + }, + { + "epoch": 0.6461546324572276, + "grad_norm": 0.9420298933982849, + "learning_rate": 7.639858095103836e-05, + "loss": 0.7869, + "step": 101140 + }, + { + "epoch": 0.6462185196069663, + "grad_norm": 1.6276788711547852, + "learning_rate": 7.639431948883941e-05, + "loss": 0.6882, + "step": 101150 + }, + { + "epoch": 0.646282406756705, + "grad_norm": 1.1304795742034912, + "learning_rate": 7.639005776082683e-05, + "loss": 0.892, + "step": 101160 + }, + { + "epoch": 0.6463462939064436, + "grad_norm": 0.9252867102622986, + "learning_rate": 7.638579576704355e-05, + "loss": 0.8588, + "step": 101170 + }, + { + "epoch": 0.6464101810561823, + "grad_norm": 1.1409999132156372, + "learning_rate": 7.638153350753246e-05, + "loss": 0.927, + "step": 101180 + }, + { + "epoch": 0.646474068205921, + "grad_norm": 1.0091885328292847, + "learning_rate": 7.637727098233651e-05, + "loss": 0.9428, + "step": 101190 + }, + { + "epoch": 0.6465379553556597, + "grad_norm": 0.8046776652336121, + "learning_rate": 7.637300819149862e-05, + "loss": 1.0972, + "step": 101200 + }, + { + "epoch": 0.6466018425053984, + "grad_norm": 1.0795817375183105, + "learning_rate": 7.636874513506174e-05, + "loss": 1.1528, + "step": 101210 + }, + { + "epoch": 0.6466657296551371, + "grad_norm": 1.0064719915390015, + "learning_rate": 7.636448181306876e-05, + "loss": 0.876, + "step": 101220 + }, + { + "epoch": 0.6467296168048758, + "grad_norm": 0.786211371421814, + "learning_rate": 7.636021822556266e-05, + "loss": 0.9598, + "step": 101230 + }, + { + "epoch": 0.6467935039546145, + "grad_norm": 1.0055882930755615, + "learning_rate": 7.635595437258634e-05, + "loss": 0.8143, + "step": 101240 + }, + { + "epoch": 0.6468573911043533, + "grad_norm": 0.8458549380302429, + "learning_rate": 7.635169025418278e-05, + "loss": 0.89, + "step": 101250 + }, + { + "epoch": 0.646921278254092, + "grad_norm": 0.9229540228843689, + "learning_rate": 7.634742587039489e-05, + "loss": 0.6728, + "step": 101260 + }, + { + "epoch": 0.6469851654038307, + 
"grad_norm": 0.8958204388618469, + "learning_rate": 7.634316122126562e-05, + "loss": 0.7874, + "step": 101270 + }, + { + "epoch": 0.6470490525535694, + "grad_norm": 0.6617315411567688, + "learning_rate": 7.633889630683794e-05, + "loss": 0.8585, + "step": 101280 + }, + { + "epoch": 0.6471129397033081, + "grad_norm": 0.5204321146011353, + "learning_rate": 7.633463112715477e-05, + "loss": 0.8739, + "step": 101290 + }, + { + "epoch": 0.6471768268530468, + "grad_norm": 0.7551961541175842, + "learning_rate": 7.633036568225911e-05, + "loss": 0.7589, + "step": 101300 + }, + { + "epoch": 0.6472407140027855, + "grad_norm": 1.2205754518508911, + "learning_rate": 7.632609997219388e-05, + "loss": 1.1764, + "step": 101310 + }, + { + "epoch": 0.6473046011525242, + "grad_norm": 0.5850151181221008, + "learning_rate": 7.632183399700204e-05, + "loss": 0.7911, + "step": 101320 + }, + { + "epoch": 0.6473684883022629, + "grad_norm": 0.8024903535842896, + "learning_rate": 7.631756775672656e-05, + "loss": 0.9156, + "step": 101330 + }, + { + "epoch": 0.6474323754520016, + "grad_norm": 1.3309029340744019, + "learning_rate": 7.63133012514104e-05, + "loss": 0.8624, + "step": 101340 + }, + { + "epoch": 0.6474962626017403, + "grad_norm": 1.0072933435440063, + "learning_rate": 7.630903448109654e-05, + "loss": 0.7941, + "step": 101350 + }, + { + "epoch": 0.647560149751479, + "grad_norm": 0.6163814663887024, + "learning_rate": 7.630476744582794e-05, + "loss": 1.1017, + "step": 101360 + }, + { + "epoch": 0.6476240369012177, + "grad_norm": 1.1491094827651978, + "learning_rate": 7.630050014564755e-05, + "loss": 0.7671, + "step": 101370 + }, + { + "epoch": 0.6476879240509564, + "grad_norm": 0.9501873254776001, + "learning_rate": 7.62962325805984e-05, + "loss": 0.8288, + "step": 101380 + }, + { + "epoch": 0.6477518112006951, + "grad_norm": 1.3051087856292725, + "learning_rate": 7.629196475072345e-05, + "loss": 0.7035, + "step": 101390 + }, + { + "epoch": 0.6478156983504338, + "grad_norm": 0.8004158139228821, + "learning_rate": 7.628769665606564e-05, + "loss": 0.8295, + "step": 101400 + }, + { + "epoch": 0.6478795855001726, + "grad_norm": 0.8266284465789795, + "learning_rate": 7.628342829666799e-05, + "loss": 0.9296, + "step": 101410 + }, + { + "epoch": 0.6479434726499111, + "grad_norm": 0.7903985977172852, + "learning_rate": 7.627915967257348e-05, + "loss": 0.8982, + "step": 101420 + }, + { + "epoch": 0.6480073597996499, + "grad_norm": 0.9672759771347046, + "learning_rate": 7.62748907838251e-05, + "loss": 0.9396, + "step": 101430 + }, + { + "epoch": 0.6480712469493886, + "grad_norm": 0.5555049777030945, + "learning_rate": 7.627062163046585e-05, + "loss": 0.787, + "step": 101440 + }, + { + "epoch": 0.6481351340991273, + "grad_norm": 1.0060865879058838, + "learning_rate": 7.626635221253871e-05, + "loss": 1.3664, + "step": 101450 + }, + { + "epoch": 0.648199021248866, + "grad_norm": 0.8967821002006531, + "learning_rate": 7.626208253008667e-05, + "loss": 0.6339, + "step": 101460 + }, + { + "epoch": 0.6482629083986047, + "grad_norm": 0.9876995086669922, + "learning_rate": 7.625781258315273e-05, + "loss": 0.8212, + "step": 101470 + }, + { + "epoch": 0.6483267955483434, + "grad_norm": 0.8965706825256348, + "learning_rate": 7.625354237177991e-05, + "loss": 0.9081, + "step": 101480 + }, + { + "epoch": 0.6483906826980821, + "grad_norm": 0.7027506232261658, + "learning_rate": 7.624927189601121e-05, + "loss": 0.7717, + "step": 101490 + }, + { + "epoch": 0.6484545698478208, + "grad_norm": 1.207170844078064, + "learning_rate": 
7.624500115588963e-05, + "loss": 1.0589, + "step": 101500 + }, + { + "epoch": 0.6485184569975595, + "grad_norm": 0.6559182405471802, + "learning_rate": 7.624073015145819e-05, + "loss": 0.9365, + "step": 101510 + }, + { + "epoch": 0.6485823441472982, + "grad_norm": 0.8271816372871399, + "learning_rate": 7.623645888275988e-05, + "loss": 1.0675, + "step": 101520 + }, + { + "epoch": 0.6486462312970369, + "grad_norm": 0.9976779818534851, + "learning_rate": 7.623218734983775e-05, + "loss": 0.9971, + "step": 101530 + }, + { + "epoch": 0.6487101184467756, + "grad_norm": 1.128108263015747, + "learning_rate": 7.622791555273478e-05, + "loss": 0.7268, + "step": 101540 + }, + { + "epoch": 0.6487740055965143, + "grad_norm": 1.8108998537063599, + "learning_rate": 7.622364349149402e-05, + "loss": 0.9972, + "step": 101550 + }, + { + "epoch": 0.648837892746253, + "grad_norm": 1.99355149269104, + "learning_rate": 7.621937116615849e-05, + "loss": 0.844, + "step": 101560 + }, + { + "epoch": 0.6489017798959917, + "grad_norm": 0.9173657894134521, + "learning_rate": 7.62150985767712e-05, + "loss": 0.7644, + "step": 101570 + }, + { + "epoch": 0.6489656670457304, + "grad_norm": 0.8146788477897644, + "learning_rate": 7.62108257233752e-05, + "loss": 0.8551, + "step": 101580 + }, + { + "epoch": 0.6490295541954691, + "grad_norm": 1.06039297580719, + "learning_rate": 7.62065526060135e-05, + "loss": 1.0242, + "step": 101590 + }, + { + "epoch": 0.6490934413452079, + "grad_norm": 0.5522758364677429, + "learning_rate": 7.620227922472914e-05, + "loss": 1.0448, + "step": 101600 + }, + { + "epoch": 0.6491573284949466, + "grad_norm": 1.0754237174987793, + "learning_rate": 7.619800557956516e-05, + "loss": 0.8727, + "step": 101610 + }, + { + "epoch": 0.6492212156446853, + "grad_norm": 0.5872507691383362, + "learning_rate": 7.619373167056461e-05, + "loss": 0.7673, + "step": 101620 + }, + { + "epoch": 0.649285102794424, + "grad_norm": 1.68289053440094, + "learning_rate": 7.618945749777051e-05, + "loss": 1.2075, + "step": 101630 + }, + { + "epoch": 0.6493489899441627, + "grad_norm": 0.673235297203064, + "learning_rate": 7.618518306122593e-05, + "loss": 0.8398, + "step": 101640 + }, + { + "epoch": 0.6494128770939014, + "grad_norm": 0.8392667174339294, + "learning_rate": 7.618090836097389e-05, + "loss": 1.0282, + "step": 101650 + }, + { + "epoch": 0.64947676424364, + "grad_norm": 0.834991991519928, + "learning_rate": 7.617706090531277e-05, + "loss": 0.8716, + "step": 101660 + }, + { + "epoch": 0.6495406513933787, + "grad_norm": 1.0076117515563965, + "learning_rate": 7.617278570413519e-05, + "loss": 1.0367, + "step": 101670 + }, + { + "epoch": 0.6496045385431174, + "grad_norm": 1.0270442962646484, + "learning_rate": 7.616851023937501e-05, + "loss": 0.9725, + "step": 101680 + }, + { + "epoch": 0.6496684256928561, + "grad_norm": 0.7313565611839294, + "learning_rate": 7.61642345110753e-05, + "loss": 0.858, + "step": 101690 + }, + { + "epoch": 0.6497323128425948, + "grad_norm": 0.8768913149833679, + "learning_rate": 7.615995851927911e-05, + "loss": 0.9933, + "step": 101700 + }, + { + "epoch": 0.6497961999923335, + "grad_norm": 1.0162863731384277, + "learning_rate": 7.615568226402951e-05, + "loss": 0.8052, + "step": 101710 + }, + { + "epoch": 0.6498600871420722, + "grad_norm": 0.6026840209960938, + "learning_rate": 7.615140574536956e-05, + "loss": 0.8807, + "step": 101720 + }, + { + "epoch": 0.6499239742918109, + "grad_norm": 1.1195095777511597, + "learning_rate": 7.614712896334233e-05, + "loss": 0.9153, + "step": 101730 + }, + { + "epoch": 
0.6499878614415496, + "grad_norm": 1.266113042831421, + "learning_rate": 7.614285191799088e-05, + "loss": 1.144, + "step": 101740 + }, + { + "epoch": 0.6500517485912883, + "grad_norm": 0.7634327411651611, + "learning_rate": 7.613857460935831e-05, + "loss": 0.7877, + "step": 101750 + }, + { + "epoch": 0.650115635741027, + "grad_norm": 0.8808565139770508, + "learning_rate": 7.613429703748768e-05, + "loss": 0.8561, + "step": 101760 + }, + { + "epoch": 0.6501795228907657, + "grad_norm": 1.3923038244247437, + "learning_rate": 7.613001920242206e-05, + "loss": 0.9596, + "step": 101770 + }, + { + "epoch": 0.6502434100405045, + "grad_norm": 1.1155999898910522, + "learning_rate": 7.612574110420454e-05, + "loss": 0.7707, + "step": 101780 + }, + { + "epoch": 0.6503072971902432, + "grad_norm": 0.7540896534919739, + "learning_rate": 7.612146274287821e-05, + "loss": 1.2077, + "step": 101790 + }, + { + "epoch": 0.6503711843399819, + "grad_norm": 0.7972086071968079, + "learning_rate": 7.611718411848617e-05, + "loss": 0.9242, + "step": 101800 + }, + { + "epoch": 0.6504350714897206, + "grad_norm": 0.8194320797920227, + "learning_rate": 7.611290523107146e-05, + "loss": 1.0412, + "step": 101810 + }, + { + "epoch": 0.6504989586394593, + "grad_norm": 0.8786047101020813, + "learning_rate": 7.610862608067721e-05, + "loss": 0.901, + "step": 101820 + }, + { + "epoch": 0.650562845789198, + "grad_norm": 0.7013679146766663, + "learning_rate": 7.610434666734651e-05, + "loss": 0.9909, + "step": 101830 + }, + { + "epoch": 0.6506267329389367, + "grad_norm": 1.3710912466049194, + "learning_rate": 7.610006699112248e-05, + "loss": 1.1784, + "step": 101840 + }, + { + "epoch": 0.6506906200886754, + "grad_norm": 0.9212914705276489, + "learning_rate": 7.609578705204816e-05, + "loss": 0.8956, + "step": 101850 + }, + { + "epoch": 0.6507545072384141, + "grad_norm": 1.100433588027954, + "learning_rate": 7.609150685016671e-05, + "loss": 0.8263, + "step": 101860 + }, + { + "epoch": 0.6508183943881528, + "grad_norm": 1.0776516199111938, + "learning_rate": 7.60872263855212e-05, + "loss": 0.9641, + "step": 101870 + }, + { + "epoch": 0.6508822815378915, + "grad_norm": 0.9307558536529541, + "learning_rate": 7.608294565815476e-05, + "loss": 0.8491, + "step": 101880 + }, + { + "epoch": 0.6509461686876302, + "grad_norm": 0.9816484451293945, + "learning_rate": 7.60786646681105e-05, + "loss": 0.5917, + "step": 101890 + }, + { + "epoch": 0.6510100558373688, + "grad_norm": 0.8662287592887878, + "learning_rate": 7.607438341543152e-05, + "loss": 0.8599, + "step": 101900 + }, + { + "epoch": 0.6510739429871075, + "grad_norm": 1.7769293785095215, + "learning_rate": 7.607010190016093e-05, + "loss": 0.8963, + "step": 101910 + }, + { + "epoch": 0.6511378301368462, + "grad_norm": 0.9365469813346863, + "learning_rate": 7.606582012234188e-05, + "loss": 0.8052, + "step": 101920 + }, + { + "epoch": 0.6512017172865849, + "grad_norm": 0.8617343902587891, + "learning_rate": 7.606153808201746e-05, + "loss": 0.9196, + "step": 101930 + }, + { + "epoch": 0.6512656044363236, + "grad_norm": 0.689240038394928, + "learning_rate": 7.605725577923081e-05, + "loss": 0.9611, + "step": 101940 + }, + { + "epoch": 0.6513294915860623, + "grad_norm": 0.7218610644340515, + "learning_rate": 7.605297321402504e-05, + "loss": 0.8778, + "step": 101950 + }, + { + "epoch": 0.651393378735801, + "grad_norm": 1.1720585823059082, + "learning_rate": 7.60486903864433e-05, + "loss": 0.8404, + "step": 101960 + }, + { + "epoch": 0.6514572658855398, + "grad_norm": 1.2702221870422363, + 
"learning_rate": 7.60444072965287e-05, + "loss": 1.1349, + "step": 101970 + }, + { + "epoch": 0.6515211530352785, + "grad_norm": 0.8623278141021729, + "learning_rate": 7.60401239443244e-05, + "loss": 0.6295, + "step": 101980 + }, + { + "epoch": 0.6515850401850172, + "grad_norm": 1.1232866048812866, + "learning_rate": 7.603584032987353e-05, + "loss": 0.9372, + "step": 101990 + }, + { + "epoch": 0.6516489273347559, + "grad_norm": 1.010554552078247, + "learning_rate": 7.603155645321921e-05, + "loss": 0.9095, + "step": 102000 + }, + { + "epoch": 0.6517128144844946, + "grad_norm": 0.9294137358665466, + "learning_rate": 7.60272723144046e-05, + "loss": 0.9728, + "step": 102010 + }, + { + "epoch": 0.6517767016342333, + "grad_norm": 0.6776549816131592, + "learning_rate": 7.602298791347284e-05, + "loss": 0.9536, + "step": 102020 + }, + { + "epoch": 0.651840588783972, + "grad_norm": 0.8278113007545471, + "learning_rate": 7.601870325046707e-05, + "loss": 0.691, + "step": 102030 + }, + { + "epoch": 0.6519044759337107, + "grad_norm": 0.7509378790855408, + "learning_rate": 7.601441832543046e-05, + "loss": 0.7495, + "step": 102040 + }, + { + "epoch": 0.6519683630834494, + "grad_norm": 0.6673555374145508, + "learning_rate": 7.601013313840616e-05, + "loss": 1.0675, + "step": 102050 + }, + { + "epoch": 0.6520322502331881, + "grad_norm": 1.0808007717132568, + "learning_rate": 7.600584768943731e-05, + "loss": 0.8746, + "step": 102060 + }, + { + "epoch": 0.6520961373829268, + "grad_norm": 0.9976995587348938, + "learning_rate": 7.600156197856707e-05, + "loss": 0.7481, + "step": 102070 + }, + { + "epoch": 0.6521600245326655, + "grad_norm": 0.7592312693595886, + "learning_rate": 7.599727600583861e-05, + "loss": 0.7296, + "step": 102080 + }, + { + "epoch": 0.6522239116824042, + "grad_norm": 0.7450394034385681, + "learning_rate": 7.599298977129509e-05, + "loss": 0.8024, + "step": 102090 + }, + { + "epoch": 0.6522877988321429, + "grad_norm": 1.1607773303985596, + "learning_rate": 7.598870327497967e-05, + "loss": 1.0051, + "step": 102100 + }, + { + "epoch": 0.6523516859818816, + "grad_norm": 1.8432011604309082, + "learning_rate": 7.598441651693554e-05, + "loss": 0.8561, + "step": 102110 + }, + { + "epoch": 0.6524155731316204, + "grad_norm": 0.6616463661193848, + "learning_rate": 7.598012949720585e-05, + "loss": 0.981, + "step": 102120 + }, + { + "epoch": 0.6524794602813591, + "grad_norm": 0.715186357498169, + "learning_rate": 7.597584221583377e-05, + "loss": 0.8713, + "step": 102130 + }, + { + "epoch": 0.6525433474310978, + "grad_norm": 1.7039902210235596, + "learning_rate": 7.597155467286249e-05, + "loss": 0.8644, + "step": 102140 + }, + { + "epoch": 0.6526072345808364, + "grad_norm": 0.7133430242538452, + "learning_rate": 7.59672668683352e-05, + "loss": 0.8738, + "step": 102150 + }, + { + "epoch": 0.6526711217305751, + "grad_norm": 0.79267418384552, + "learning_rate": 7.596297880229504e-05, + "loss": 0.7017, + "step": 102160 + }, + { + "epoch": 0.6527350088803138, + "grad_norm": 3.0087802410125732, + "learning_rate": 7.595869047478524e-05, + "loss": 0.8555, + "step": 102170 + }, + { + "epoch": 0.6527988960300525, + "grad_norm": 1.167046070098877, + "learning_rate": 7.595440188584897e-05, + "loss": 0.9269, + "step": 102180 + }, + { + "epoch": 0.6528627831797912, + "grad_norm": 0.7473157644271851, + "learning_rate": 7.595011303552941e-05, + "loss": 1.1813, + "step": 102190 + }, + { + "epoch": 0.6529266703295299, + "grad_norm": 2.5482473373413086, + "learning_rate": 7.594582392386977e-05, + "loss": 0.9303, + "step": 
102200 + }, + { + "epoch": 0.6529905574792686, + "grad_norm": 0.5995567440986633, + "learning_rate": 7.594153455091324e-05, + "loss": 0.8351, + "step": 102210 + }, + { + "epoch": 0.6530544446290073, + "grad_norm": 0.8906635046005249, + "learning_rate": 7.593724491670302e-05, + "loss": 1.0707, + "step": 102220 + }, + { + "epoch": 0.653118331778746, + "grad_norm": 1.3267639875411987, + "learning_rate": 7.593295502128229e-05, + "loss": 0.9804, + "step": 102230 + }, + { + "epoch": 0.6531822189284847, + "grad_norm": 0.9197192788124084, + "learning_rate": 7.592866486469427e-05, + "loss": 0.831, + "step": 102240 + }, + { + "epoch": 0.6532461060782234, + "grad_norm": 0.9400060772895813, + "learning_rate": 7.592437444698217e-05, + "loss": 1.1257, + "step": 102250 + }, + { + "epoch": 0.6533099932279621, + "grad_norm": 1.9750057458877563, + "learning_rate": 7.592008376818918e-05, + "loss": 1.1414, + "step": 102260 + }, + { + "epoch": 0.6533738803777008, + "grad_norm": 0.8337990045547485, + "learning_rate": 7.591579282835854e-05, + "loss": 0.9497, + "step": 102270 + }, + { + "epoch": 0.6534377675274395, + "grad_norm": 0.6679349541664124, + "learning_rate": 7.591150162753343e-05, + "loss": 0.859, + "step": 102280 + }, + { + "epoch": 0.6535016546771782, + "grad_norm": 1.1458660364151, + "learning_rate": 7.590721016575709e-05, + "loss": 0.8031, + "step": 102290 + }, + { + "epoch": 0.653565541826917, + "grad_norm": 1.3683353662490845, + "learning_rate": 7.590291844307274e-05, + "loss": 0.7009, + "step": 102300 + }, + { + "epoch": 0.6536294289766557, + "grad_norm": 1.0884777307510376, + "learning_rate": 7.589862645952358e-05, + "loss": 1.1495, + "step": 102310 + }, + { + "epoch": 0.6536933161263944, + "grad_norm": 2.0209991931915283, + "learning_rate": 7.589433421515284e-05, + "loss": 0.7371, + "step": 102320 + }, + { + "epoch": 0.6537572032761331, + "grad_norm": 0.8924853801727295, + "learning_rate": 7.589004171000376e-05, + "loss": 0.8453, + "step": 102330 + }, + { + "epoch": 0.6538210904258718, + "grad_norm": 0.9563447833061218, + "learning_rate": 7.588574894411957e-05, + "loss": 0.9245, + "step": 102340 + }, + { + "epoch": 0.6538849775756105, + "grad_norm": 0.6629459857940674, + "learning_rate": 7.588145591754348e-05, + "loss": 0.8992, + "step": 102350 + }, + { + "epoch": 0.6539488647253492, + "grad_norm": 0.7915505170822144, + "learning_rate": 7.587716263031875e-05, + "loss": 0.6657, + "step": 102360 + }, + { + "epoch": 0.6540127518750879, + "grad_norm": 0.9542189240455627, + "learning_rate": 7.587286908248859e-05, + "loss": 0.8998, + "step": 102370 + }, + { + "epoch": 0.6540766390248266, + "grad_norm": 0.6477545499801636, + "learning_rate": 7.586857527409625e-05, + "loss": 0.828, + "step": 102380 + }, + { + "epoch": 0.6541405261745652, + "grad_norm": 3.1310455799102783, + "learning_rate": 7.586428120518498e-05, + "loss": 0.8608, + "step": 102390 + }, + { + "epoch": 0.6542044133243039, + "grad_norm": 0.7806214094161987, + "learning_rate": 7.585998687579805e-05, + "loss": 0.7727, + "step": 102400 + }, + { + "epoch": 0.6542683004740426, + "grad_norm": 0.8768726587295532, + "learning_rate": 7.585569228597866e-05, + "loss": 0.7605, + "step": 102410 + }, + { + "epoch": 0.6543321876237813, + "grad_norm": 1.5629066228866577, + "learning_rate": 7.585139743577007e-05, + "loss": 0.7757, + "step": 102420 + }, + { + "epoch": 0.65439607477352, + "grad_norm": 0.6218248009681702, + "learning_rate": 7.584710232521558e-05, + "loss": 0.805, + "step": 102430 + }, + { + "epoch": 0.6544599619232587, + "grad_norm": 
0.555633008480072, + "learning_rate": 7.584280695435839e-05, + "loss": 0.9534, + "step": 102440 + }, + { + "epoch": 0.6545238490729974, + "grad_norm": 0.9295303821563721, + "learning_rate": 7.583851132324176e-05, + "loss": 0.9972, + "step": 102450 + }, + { + "epoch": 0.6545877362227361, + "grad_norm": 0.4502405822277069, + "learning_rate": 7.583421543190899e-05, + "loss": 0.8702, + "step": 102460 + }, + { + "epoch": 0.6546516233724748, + "grad_norm": 1.0970765352249146, + "learning_rate": 7.58299192804033e-05, + "loss": 0.6886, + "step": 102470 + }, + { + "epoch": 0.6547155105222136, + "grad_norm": 0.967715859413147, + "learning_rate": 7.5825622868768e-05, + "loss": 0.769, + "step": 102480 + }, + { + "epoch": 0.6547793976719523, + "grad_norm": 1.3791766166687012, + "learning_rate": 7.582132619704632e-05, + "loss": 1.0848, + "step": 102490 + }, + { + "epoch": 0.654843284821691, + "grad_norm": 1.011976718902588, + "learning_rate": 7.581702926528156e-05, + "loss": 0.8708, + "step": 102500 + }, + { + "epoch": 0.6549071719714297, + "grad_norm": 1.187624454498291, + "learning_rate": 7.581273207351696e-05, + "loss": 0.8925, + "step": 102510 + }, + { + "epoch": 0.6549710591211684, + "grad_norm": 1.0658549070358276, + "learning_rate": 7.580843462179583e-05, + "loss": 0.6593, + "step": 102520 + }, + { + "epoch": 0.6550349462709071, + "grad_norm": 0.7370697855949402, + "learning_rate": 7.580413691016144e-05, + "loss": 0.9271, + "step": 102530 + }, + { + "epoch": 0.6550988334206458, + "grad_norm": 1.2200112342834473, + "learning_rate": 7.579983893865704e-05, + "loss": 0.9089, + "step": 102540 + }, + { + "epoch": 0.6551627205703845, + "grad_norm": 0.8738793730735779, + "learning_rate": 7.579554070732597e-05, + "loss": 0.9293, + "step": 102550 + }, + { + "epoch": 0.6552266077201232, + "grad_norm": 0.6876864433288574, + "learning_rate": 7.579124221621148e-05, + "loss": 0.7449, + "step": 102560 + }, + { + "epoch": 0.6552904948698619, + "grad_norm": 0.8214115500450134, + "learning_rate": 7.578694346535686e-05, + "loss": 0.7657, + "step": 102570 + }, + { + "epoch": 0.6553543820196006, + "grad_norm": 2.8567984104156494, + "learning_rate": 7.578264445480543e-05, + "loss": 0.8339, + "step": 102580 + }, + { + "epoch": 0.6554182691693393, + "grad_norm": 1.8069883584976196, + "learning_rate": 7.577834518460046e-05, + "loss": 0.772, + "step": 102590 + }, + { + "epoch": 0.655482156319078, + "grad_norm": 0.7831799387931824, + "learning_rate": 7.577404565478525e-05, + "loss": 1.1675, + "step": 102600 + }, + { + "epoch": 0.6555460434688167, + "grad_norm": 1.1496902704238892, + "learning_rate": 7.576974586540309e-05, + "loss": 0.727, + "step": 102610 + }, + { + "epoch": 0.6556099306185554, + "grad_norm": 0.9092468619346619, + "learning_rate": 7.57654458164973e-05, + "loss": 0.7713, + "step": 102620 + }, + { + "epoch": 0.655673817768294, + "grad_norm": 1.3994219303131104, + "learning_rate": 7.57611455081112e-05, + "loss": 0.9057, + "step": 102630 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.7715085744857788, + "learning_rate": 7.575684494028805e-05, + "loss": 0.9913, + "step": 102640 + }, + { + "epoch": 0.6558015920677714, + "grad_norm": 1.0739322900772095, + "learning_rate": 7.57525441130712e-05, + "loss": 1.0197, + "step": 102650 + }, + { + "epoch": 0.6558654792175102, + "grad_norm": 1.3112781047821045, + "learning_rate": 7.574824302650396e-05, + "loss": 0.819, + "step": 102660 + }, + { + "epoch": 0.6559293663672489, + "grad_norm": 0.8328858017921448, + "learning_rate": 7.574394168062964e-05, + "loss": 
1.1271, + "step": 102670 + }, + { + "epoch": 0.6559932535169876, + "grad_norm": 1.3175311088562012, + "learning_rate": 7.573964007549155e-05, + "loss": 1.0228, + "step": 102680 + }, + { + "epoch": 0.6560571406667263, + "grad_norm": 1.8402165174484253, + "learning_rate": 7.573533821113302e-05, + "loss": 0.6843, + "step": 102690 + }, + { + "epoch": 0.656121027816465, + "grad_norm": 0.6495104432106018, + "learning_rate": 7.573103608759736e-05, + "loss": 0.8759, + "step": 102700 + }, + { + "epoch": 0.6561849149662037, + "grad_norm": 1.0838744640350342, + "learning_rate": 7.572673370492788e-05, + "loss": 0.9824, + "step": 102710 + }, + { + "epoch": 0.6562488021159424, + "grad_norm": 1.348099708557129, + "learning_rate": 7.572243106316798e-05, + "loss": 0.995, + "step": 102720 + }, + { + "epoch": 0.6563126892656811, + "grad_norm": 1.7211371660232544, + "learning_rate": 7.571812816236093e-05, + "loss": 1.0677, + "step": 102730 + }, + { + "epoch": 0.6563765764154198, + "grad_norm": 1.924634337425232, + "learning_rate": 7.571382500255009e-05, + "loss": 0.6923, + "step": 102740 + }, + { + "epoch": 0.6564404635651585, + "grad_norm": 0.7509860992431641, + "learning_rate": 7.570952158377877e-05, + "loss": 1.3095, + "step": 102750 + }, + { + "epoch": 0.6565043507148972, + "grad_norm": 2.123326301574707, + "learning_rate": 7.570521790609033e-05, + "loss": 1.0223, + "step": 102760 + }, + { + "epoch": 0.6565682378646359, + "grad_norm": 0.9180009961128235, + "learning_rate": 7.570091396952811e-05, + "loss": 0.7883, + "step": 102770 + }, + { + "epoch": 0.6566321250143746, + "grad_norm": 0.9530285596847534, + "learning_rate": 7.569660977413546e-05, + "loss": 0.6924, + "step": 102780 + }, + { + "epoch": 0.6566960121641133, + "grad_norm": 0.9887816905975342, + "learning_rate": 7.56923053199557e-05, + "loss": 0.9007, + "step": 102790 + }, + { + "epoch": 0.656759899313852, + "grad_norm": 0.853535532951355, + "learning_rate": 7.568800060703222e-05, + "loss": 0.9581, + "step": 102800 + }, + { + "epoch": 0.6568237864635907, + "grad_norm": 0.716618001461029, + "learning_rate": 7.568369563540834e-05, + "loss": 1.1438, + "step": 102810 + }, + { + "epoch": 0.6568876736133294, + "grad_norm": 0.9339463114738464, + "learning_rate": 7.567939040512742e-05, + "loss": 1.0451, + "step": 102820 + }, + { + "epoch": 0.6569515607630682, + "grad_norm": 1.0368573665618896, + "learning_rate": 7.567508491623283e-05, + "loss": 0.6927, + "step": 102830 + }, + { + "epoch": 0.6570154479128069, + "grad_norm": 0.9054602384567261, + "learning_rate": 7.567077916876793e-05, + "loss": 0.9451, + "step": 102840 + }, + { + "epoch": 0.6570793350625456, + "grad_norm": 0.7542839646339417, + "learning_rate": 7.566647316277607e-05, + "loss": 0.6406, + "step": 102850 + }, + { + "epoch": 0.6571432222122843, + "grad_norm": 0.7233191132545471, + "learning_rate": 7.566216689830061e-05, + "loss": 0.9122, + "step": 102860 + }, + { + "epoch": 0.6572071093620229, + "grad_norm": 0.8938806056976318, + "learning_rate": 7.565786037538492e-05, + "loss": 0.9786, + "step": 102870 + }, + { + "epoch": 0.6572709965117616, + "grad_norm": 0.8927178978919983, + "learning_rate": 7.56535535940724e-05, + "loss": 0.8482, + "step": 102880 + }, + { + "epoch": 0.6573348836615003, + "grad_norm": 0.9564715623855591, + "learning_rate": 7.564924655440639e-05, + "loss": 0.824, + "step": 102890 + }, + { + "epoch": 0.657398770811239, + "grad_norm": 1.4152295589447021, + "learning_rate": 7.564493925643028e-05, + "loss": 0.8507, + "step": 102900 + }, + { + "epoch": 0.6574626579609777, + 
"grad_norm": 2.762669324874878, + "learning_rate": 7.564063170018745e-05, + "loss": 1.0035, + "step": 102910 + }, + { + "epoch": 0.6575265451107164, + "grad_norm": 0.6634849905967712, + "learning_rate": 7.563632388572128e-05, + "loss": 1.004, + "step": 102920 + }, + { + "epoch": 0.6575904322604551, + "grad_norm": 0.7901267409324646, + "learning_rate": 7.563201581307516e-05, + "loss": 1.004, + "step": 102930 + }, + { + "epoch": 0.6576543194101938, + "grad_norm": 1.174972653388977, + "learning_rate": 7.562770748229245e-05, + "loss": 1.0687, + "step": 102940 + }, + { + "epoch": 0.6577182065599325, + "grad_norm": 0.6787839531898499, + "learning_rate": 7.562339889341655e-05, + "loss": 0.6921, + "step": 102950 + }, + { + "epoch": 0.6577820937096712, + "grad_norm": 1.1864277124404907, + "learning_rate": 7.561909004649086e-05, + "loss": 0.8603, + "step": 102960 + }, + { + "epoch": 0.6578459808594099, + "grad_norm": 0.715882420539856, + "learning_rate": 7.561478094155877e-05, + "loss": 0.9814, + "step": 102970 + }, + { + "epoch": 0.6579098680091486, + "grad_norm": 0.7654950618743896, + "learning_rate": 7.561047157866368e-05, + "loss": 0.7236, + "step": 102980 + }, + { + "epoch": 0.6579737551588873, + "grad_norm": 0.8814225196838379, + "learning_rate": 7.560616195784898e-05, + "loss": 1.0984, + "step": 102990 + }, + { + "epoch": 0.658037642308626, + "grad_norm": 0.6822389364242554, + "learning_rate": 7.560185207915808e-05, + "loss": 0.7056, + "step": 103000 + }, + { + "epoch": 0.6581015294583648, + "grad_norm": 0.9107899069786072, + "learning_rate": 7.559754194263438e-05, + "loss": 1.0334, + "step": 103010 + }, + { + "epoch": 0.6581654166081035, + "grad_norm": 1.1239162683486938, + "learning_rate": 7.559323154832128e-05, + "loss": 1.0588, + "step": 103020 + }, + { + "epoch": 0.6582293037578422, + "grad_norm": 0.7621965408325195, + "learning_rate": 7.558892089626222e-05, + "loss": 1.3516, + "step": 103030 + }, + { + "epoch": 0.6582931909075809, + "grad_norm": 0.9127793312072754, + "learning_rate": 7.558460998650056e-05, + "loss": 0.8635, + "step": 103040 + }, + { + "epoch": 0.6583570780573196, + "grad_norm": 1.0723611116409302, + "learning_rate": 7.558029881907977e-05, + "loss": 0.8784, + "step": 103050 + }, + { + "epoch": 0.6584209652070583, + "grad_norm": 1.1511666774749756, + "learning_rate": 7.557598739404322e-05, + "loss": 0.9645, + "step": 103060 + }, + { + "epoch": 0.658484852356797, + "grad_norm": 0.8360834717750549, + "learning_rate": 7.557167571143435e-05, + "loss": 1.2718, + "step": 103070 + }, + { + "epoch": 0.6585487395065357, + "grad_norm": 0.8610426187515259, + "learning_rate": 7.556736377129659e-05, + "loss": 0.6349, + "step": 103080 + }, + { + "epoch": 0.6586126266562744, + "grad_norm": 2.5912959575653076, + "learning_rate": 7.556305157367336e-05, + "loss": 0.9122, + "step": 103090 + }, + { + "epoch": 0.6586765138060131, + "grad_norm": 0.6567677855491638, + "learning_rate": 7.555873911860808e-05, + "loss": 0.7129, + "step": 103100 + }, + { + "epoch": 0.6587404009557518, + "grad_norm": 2.0863733291625977, + "learning_rate": 7.55544264061442e-05, + "loss": 0.9628, + "step": 103110 + }, + { + "epoch": 0.6588042881054904, + "grad_norm": 0.9286092519760132, + "learning_rate": 7.555011343632512e-05, + "loss": 0.8073, + "step": 103120 + }, + { + "epoch": 0.6588681752552291, + "grad_norm": 0.8470326662063599, + "learning_rate": 7.55458002091943e-05, + "loss": 0.7094, + "step": 103130 + }, + { + "epoch": 0.6589320624049678, + "grad_norm": 0.9523374438285828, + "learning_rate": 
7.554148672479518e-05, + "loss": 0.898, + "step": 103140 + }, + { + "epoch": 0.6589959495547065, + "grad_norm": 0.7024726271629333, + "learning_rate": 7.553717298317118e-05, + "loss": 0.8187, + "step": 103150 + }, + { + "epoch": 0.6590598367044452, + "grad_norm": 0.8820728063583374, + "learning_rate": 7.553285898436577e-05, + "loss": 0.9663, + "step": 103160 + }, + { + "epoch": 0.6591237238541839, + "grad_norm": 0.6763830184936523, + "learning_rate": 7.552854472842238e-05, + "loss": 0.7318, + "step": 103170 + }, + { + "epoch": 0.6591876110039226, + "grad_norm": 0.881584644317627, + "learning_rate": 7.552423021538445e-05, + "loss": 1.0267, + "step": 103180 + }, + { + "epoch": 0.6592514981536614, + "grad_norm": 1.0598586797714233, + "learning_rate": 7.551991544529544e-05, + "loss": 0.9321, + "step": 103190 + }, + { + "epoch": 0.6593153853034001, + "grad_norm": 0.879578709602356, + "learning_rate": 7.55156004181988e-05, + "loss": 0.8715, + "step": 103200 + }, + { + "epoch": 0.6593792724531388, + "grad_norm": 0.9016001224517822, + "learning_rate": 7.5511285134138e-05, + "loss": 0.8316, + "step": 103210 + }, + { + "epoch": 0.6594431596028775, + "grad_norm": 0.7828614115715027, + "learning_rate": 7.550696959315647e-05, + "loss": 0.7837, + "step": 103220 + }, + { + "epoch": 0.6595070467526162, + "grad_norm": 0.6147605180740356, + "learning_rate": 7.550265379529771e-05, + "loss": 0.6583, + "step": 103230 + }, + { + "epoch": 0.6595709339023549, + "grad_norm": 0.5749229192733765, + "learning_rate": 7.549833774060515e-05, + "loss": 0.7125, + "step": 103240 + }, + { + "epoch": 0.6596348210520936, + "grad_norm": 0.9546531438827515, + "learning_rate": 7.549402142912228e-05, + "loss": 0.9588, + "step": 103250 + }, + { + "epoch": 0.6596987082018323, + "grad_norm": 0.8251008987426758, + "learning_rate": 7.548970486089255e-05, + "loss": 1.0117, + "step": 103260 + }, + { + "epoch": 0.659762595351571, + "grad_norm": 1.1722609996795654, + "learning_rate": 7.548538803595944e-05, + "loss": 0.6943, + "step": 103270 + }, + { + "epoch": 0.6598264825013097, + "grad_norm": 2.220587730407715, + "learning_rate": 7.548107095436644e-05, + "loss": 0.6568, + "step": 103280 + }, + { + "epoch": 0.6598903696510484, + "grad_norm": 1.1250571012496948, + "learning_rate": 7.547675361615701e-05, + "loss": 0.6475, + "step": 103290 + }, + { + "epoch": 0.6599542568007871, + "grad_norm": 0.6930386424064636, + "learning_rate": 7.547243602137462e-05, + "loss": 1.03, + "step": 103300 + }, + { + "epoch": 0.6600181439505258, + "grad_norm": 0.7208458185195923, + "learning_rate": 7.546811817006275e-05, + "loss": 0.7767, + "step": 103310 + }, + { + "epoch": 0.6600820311002645, + "grad_norm": 0.9552310705184937, + "learning_rate": 7.546380006226493e-05, + "loss": 1.2225, + "step": 103320 + }, + { + "epoch": 0.6601459182500032, + "grad_norm": 0.8683717250823975, + "learning_rate": 7.545948169802458e-05, + "loss": 1.0714, + "step": 103330 + }, + { + "epoch": 0.660209805399742, + "grad_norm": 1.0436851978302002, + "learning_rate": 7.545516307738524e-05, + "loss": 1.001, + "step": 103340 + }, + { + "epoch": 0.6602736925494807, + "grad_norm": 1.0748889446258545, + "learning_rate": 7.545084420039038e-05, + "loss": 0.9228, + "step": 103350 + }, + { + "epoch": 0.6603375796992192, + "grad_norm": 0.716839075088501, + "learning_rate": 7.54465250670835e-05, + "loss": 0.9845, + "step": 103360 + }, + { + "epoch": 0.660401466848958, + "grad_norm": 1.4549845457077026, + "learning_rate": 7.54422056775081e-05, + "loss": 0.8982, + "step": 103370 + }, + { + 
"epoch": 0.6604653539986967, + "grad_norm": 1.124532699584961, + "learning_rate": 7.54378860317077e-05, + "loss": 0.9131, + "step": 103380 + }, + { + "epoch": 0.6605292411484354, + "grad_norm": 0.7380385994911194, + "learning_rate": 7.543356612972575e-05, + "loss": 0.9161, + "step": 103390 + }, + { + "epoch": 0.6605931282981741, + "grad_norm": 1.4843467473983765, + "learning_rate": 7.54292459716058e-05, + "loss": 0.9268, + "step": 103400 + }, + { + "epoch": 0.6606570154479128, + "grad_norm": 0.9203116297721863, + "learning_rate": 7.542492555739135e-05, + "loss": 0.861, + "step": 103410 + }, + { + "epoch": 0.6607209025976515, + "grad_norm": 0.9751471877098083, + "learning_rate": 7.54206048871259e-05, + "loss": 0.7468, + "step": 103420 + }, + { + "epoch": 0.6607847897473902, + "grad_norm": 1.2186683416366577, + "learning_rate": 7.541628396085296e-05, + "loss": 0.967, + "step": 103430 + }, + { + "epoch": 0.6608486768971289, + "grad_norm": 1.1114938259124756, + "learning_rate": 7.541196277861604e-05, + "loss": 1.0421, + "step": 103440 + }, + { + "epoch": 0.6609125640468676, + "grad_norm": 1.0457226037979126, + "learning_rate": 7.540764134045869e-05, + "loss": 0.8752, + "step": 103450 + }, + { + "epoch": 0.6609764511966063, + "grad_norm": 1.4589784145355225, + "learning_rate": 7.540331964642441e-05, + "loss": 0.8713, + "step": 103460 + }, + { + "epoch": 0.661040338346345, + "grad_norm": 1.1460403203964233, + "learning_rate": 7.539899769655672e-05, + "loss": 0.832, + "step": 103470 + }, + { + "epoch": 0.6611042254960837, + "grad_norm": 0.9500607252120972, + "learning_rate": 7.539467549089914e-05, + "loss": 1.0559, + "step": 103480 + }, + { + "epoch": 0.6611681126458224, + "grad_norm": 1.8726199865341187, + "learning_rate": 7.539035302949523e-05, + "loss": 0.7371, + "step": 103490 + }, + { + "epoch": 0.6612319997955611, + "grad_norm": 1.0229368209838867, + "learning_rate": 7.538603031238849e-05, + "loss": 1.2995, + "step": 103500 + }, + { + "epoch": 0.6612958869452998, + "grad_norm": 0.8887920379638672, + "learning_rate": 7.538170733962245e-05, + "loss": 1.0423, + "step": 103510 + }, + { + "epoch": 0.6613597740950385, + "grad_norm": 1.2013378143310547, + "learning_rate": 7.537738411124066e-05, + "loss": 0.8154, + "step": 103520 + }, + { + "epoch": 0.6614236612447773, + "grad_norm": 0.6866891980171204, + "learning_rate": 7.537306062728669e-05, + "loss": 0.9957, + "step": 103530 + }, + { + "epoch": 0.661487548394516, + "grad_norm": 1.4273715019226074, + "learning_rate": 7.536873688780402e-05, + "loss": 0.9109, + "step": 103540 + }, + { + "epoch": 0.6615514355442547, + "grad_norm": 0.7437546253204346, + "learning_rate": 7.536441289283622e-05, + "loss": 0.9392, + "step": 103550 + }, + { + "epoch": 0.6616153226939934, + "grad_norm": 0.8337574005126953, + "learning_rate": 7.536008864242685e-05, + "loss": 1.0836, + "step": 103560 + }, + { + "epoch": 0.6616792098437321, + "grad_norm": 0.6678511500358582, + "learning_rate": 7.535576413661944e-05, + "loss": 0.7588, + "step": 103570 + }, + { + "epoch": 0.6617430969934708, + "grad_norm": 0.8168431520462036, + "learning_rate": 7.535143937545757e-05, + "loss": 0.7712, + "step": 103580 + }, + { + "epoch": 0.6618069841432095, + "grad_norm": 0.7926838994026184, + "learning_rate": 7.534711435898473e-05, + "loss": 0.8549, + "step": 103590 + }, + { + "epoch": 0.6618708712929481, + "grad_norm": 0.8065713047981262, + "learning_rate": 7.534278908724455e-05, + "loss": 0.9805, + "step": 103600 + }, + { + "epoch": 0.6619347584426868, + "grad_norm": 1.288443684577942, + 
"learning_rate": 7.533846356028056e-05, + "loss": 0.9803, + "step": 103610 + }, + { + "epoch": 0.6619986455924255, + "grad_norm": 0.8270924687385559, + "learning_rate": 7.533413777813632e-05, + "loss": 1.0178, + "step": 103620 + }, + { + "epoch": 0.6620625327421642, + "grad_norm": 0.969517707824707, + "learning_rate": 7.532981174085538e-05, + "loss": 0.7647, + "step": 103630 + }, + { + "epoch": 0.6621264198919029, + "grad_norm": 0.8974095582962036, + "learning_rate": 7.532548544848134e-05, + "loss": 0.92, + "step": 103640 + }, + { + "epoch": 0.6621903070416416, + "grad_norm": 0.6502230763435364, + "learning_rate": 7.532115890105776e-05, + "loss": 0.8038, + "step": 103650 + }, + { + "epoch": 0.6622541941913803, + "grad_norm": 2.234053134918213, + "learning_rate": 7.531683209862818e-05, + "loss": 0.6465, + "step": 103660 + }, + { + "epoch": 0.662318081341119, + "grad_norm": 0.8067479729652405, + "learning_rate": 7.531250504123622e-05, + "loss": 0.8337, + "step": 103670 + }, + { + "epoch": 0.6623819684908577, + "grad_norm": 1.0391908884048462, + "learning_rate": 7.530817772892543e-05, + "loss": 0.839, + "step": 103680 + }, + { + "epoch": 0.6624458556405964, + "grad_norm": 0.6917629241943359, + "learning_rate": 7.53038501617394e-05, + "loss": 1.0411, + "step": 103690 + }, + { + "epoch": 0.6625097427903351, + "grad_norm": 0.8440914750099182, + "learning_rate": 7.529952233972169e-05, + "loss": 0.886, + "step": 103700 + }, + { + "epoch": 0.6625736299400738, + "grad_norm": 0.7776359915733337, + "learning_rate": 7.529519426291591e-05, + "loss": 0.8062, + "step": 103710 + }, + { + "epoch": 0.6626375170898126, + "grad_norm": 0.7299126386642456, + "learning_rate": 7.529086593136564e-05, + "loss": 0.9034, + "step": 103720 + }, + { + "epoch": 0.6627014042395513, + "grad_norm": 1.0849508047103882, + "learning_rate": 7.528653734511447e-05, + "loss": 0.8913, + "step": 103730 + }, + { + "epoch": 0.66276529138929, + "grad_norm": 1.4763078689575195, + "learning_rate": 7.5282208504206e-05, + "loss": 1.0506, + "step": 103740 + }, + { + "epoch": 0.6628291785390287, + "grad_norm": 1.22445547580719, + "learning_rate": 7.52778794086838e-05, + "loss": 0.7745, + "step": 103750 + }, + { + "epoch": 0.6628930656887674, + "grad_norm": 0.757455587387085, + "learning_rate": 7.52735500585915e-05, + "loss": 0.7991, + "step": 103760 + }, + { + "epoch": 0.6629569528385061, + "grad_norm": 0.643725574016571, + "learning_rate": 7.526922045397269e-05, + "loss": 0.8059, + "step": 103770 + }, + { + "epoch": 0.6630208399882448, + "grad_norm": 0.8213297724723816, + "learning_rate": 7.526489059487097e-05, + "loss": 0.7859, + "step": 103780 + }, + { + "epoch": 0.6630847271379835, + "grad_norm": 0.90571528673172, + "learning_rate": 7.526056048132993e-05, + "loss": 0.8258, + "step": 103790 + }, + { + "epoch": 0.6631486142877222, + "grad_norm": 1.3528343439102173, + "learning_rate": 7.52562301133932e-05, + "loss": 0.8203, + "step": 103800 + }, + { + "epoch": 0.6632125014374609, + "grad_norm": 0.9805328845977783, + "learning_rate": 7.525189949110438e-05, + "loss": 0.9493, + "step": 103810 + }, + { + "epoch": 0.6632763885871996, + "grad_norm": 1.103614091873169, + "learning_rate": 7.52475686145071e-05, + "loss": 0.6765, + "step": 103820 + }, + { + "epoch": 0.6633402757369383, + "grad_norm": 0.864163875579834, + "learning_rate": 7.524323748364494e-05, + "loss": 0.9175, + "step": 103830 + }, + { + "epoch": 0.663404162886677, + "grad_norm": 0.5981049537658691, + "learning_rate": 7.523890609856157e-05, + "loss": 0.808, + "step": 103840 + }, + { 
+ "epoch": 0.6634680500364156, + "grad_norm": 0.5768615007400513, + "learning_rate": 7.523457445930055e-05, + "loss": 0.9691, + "step": 103850 + }, + { + "epoch": 0.6635319371861543, + "grad_norm": 0.5739576816558838, + "learning_rate": 7.523024256590556e-05, + "loss": 0.852, + "step": 103860 + }, + { + "epoch": 0.663595824335893, + "grad_norm": 0.8774191737174988, + "learning_rate": 7.522591041842018e-05, + "loss": 0.9165, + "step": 103870 + }, + { + "epoch": 0.6636597114856317, + "grad_norm": 1.1826159954071045, + "learning_rate": 7.522157801688807e-05, + "loss": 0.648, + "step": 103880 + }, + { + "epoch": 0.6637235986353704, + "grad_norm": 1.5209389925003052, + "learning_rate": 7.521724536135287e-05, + "loss": 0.8307, + "step": 103890 + }, + { + "epoch": 0.6637874857851092, + "grad_norm": 0.7982348799705505, + "learning_rate": 7.521291245185815e-05, + "loss": 0.8145, + "step": 103900 + }, + { + "epoch": 0.6638513729348479, + "grad_norm": 0.9979506731033325, + "learning_rate": 7.52085792884476e-05, + "loss": 0.9305, + "step": 103910 + }, + { + "epoch": 0.6639152600845866, + "grad_norm": 0.9951682686805725, + "learning_rate": 7.520424587116485e-05, + "loss": 0.8077, + "step": 103920 + }, + { + "epoch": 0.6639791472343253, + "grad_norm": 0.7749238610267639, + "learning_rate": 7.519991220005355e-05, + "loss": 0.7822, + "step": 103930 + }, + { + "epoch": 0.664043034384064, + "grad_norm": 1.0329078435897827, + "learning_rate": 7.519557827515733e-05, + "loss": 0.8315, + "step": 103940 + }, + { + "epoch": 0.6641069215338027, + "grad_norm": 1.0199581384658813, + "learning_rate": 7.519124409651984e-05, + "loss": 0.9086, + "step": 103950 + }, + { + "epoch": 0.6641708086835414, + "grad_norm": 1.0277516841888428, + "learning_rate": 7.518690966418474e-05, + "loss": 1.0459, + "step": 103960 + }, + { + "epoch": 0.6642346958332801, + "grad_norm": 1.1918634176254272, + "learning_rate": 7.518257497819566e-05, + "loss": 1.1006, + "step": 103970 + }, + { + "epoch": 0.6642985829830188, + "grad_norm": 1.1144057512283325, + "learning_rate": 7.517824003859624e-05, + "loss": 0.8974, + "step": 103980 + }, + { + "epoch": 0.6643624701327575, + "grad_norm": 0.5517343282699585, + "learning_rate": 7.517390484543018e-05, + "loss": 0.9436, + "step": 103990 + }, + { + "epoch": 0.6644263572824962, + "grad_norm": 0.7781495451927185, + "learning_rate": 7.516956939874113e-05, + "loss": 0.9474, + "step": 104000 + }, + { + "epoch": 0.6644902444322349, + "grad_norm": 0.9537546634674072, + "learning_rate": 7.516523369857273e-05, + "loss": 0.7145, + "step": 104010 + }, + { + "epoch": 0.6645541315819736, + "grad_norm": 0.9538782238960266, + "learning_rate": 7.516089774496866e-05, + "loss": 0.7486, + "step": 104020 + }, + { + "epoch": 0.6646180187317123, + "grad_norm": 0.8699349164962769, + "learning_rate": 7.515656153797257e-05, + "loss": 0.8378, + "step": 104030 + }, + { + "epoch": 0.664681905881451, + "grad_norm": 0.7079137563705444, + "learning_rate": 7.515222507762815e-05, + "loss": 0.7564, + "step": 104040 + }, + { + "epoch": 0.6647457930311897, + "grad_norm": 0.596549391746521, + "learning_rate": 7.514788836397908e-05, + "loss": 0.9477, + "step": 104050 + }, + { + "epoch": 0.6648096801809285, + "grad_norm": 0.8176950812339783, + "learning_rate": 7.5143551397069e-05, + "loss": 0.883, + "step": 104060 + }, + { + "epoch": 0.6648735673306672, + "grad_norm": 0.5366206765174866, + "learning_rate": 7.51392141769416e-05, + "loss": 0.7585, + "step": 104070 + }, + { + "epoch": 0.6649374544804059, + "grad_norm": 2.3098976612091064, + 
"learning_rate": 7.51348767036406e-05, + "loss": 0.9656, + "step": 104080 + }, + { + "epoch": 0.6650013416301445, + "grad_norm": 1.27628755569458, + "learning_rate": 7.51305389772096e-05, + "loss": 0.8482, + "step": 104090 + }, + { + "epoch": 0.6650652287798832, + "grad_norm": 0.6717961430549622, + "learning_rate": 7.512620099769235e-05, + "loss": 0.94, + "step": 104100 + }, + { + "epoch": 0.6651291159296219, + "grad_norm": 2.8479366302490234, + "learning_rate": 7.512186276513252e-05, + "loss": 0.9146, + "step": 104110 + }, + { + "epoch": 0.6651930030793606, + "grad_norm": 1.3686326742172241, + "learning_rate": 7.51175242795738e-05, + "loss": 0.9382, + "step": 104120 + }, + { + "epoch": 0.6652568902290993, + "grad_norm": 1.0768921375274658, + "learning_rate": 7.511318554105988e-05, + "loss": 0.8419, + "step": 104130 + }, + { + "epoch": 0.665320777378838, + "grad_norm": 0.581240713596344, + "learning_rate": 7.510884654963446e-05, + "loss": 1.4625, + "step": 104140 + }, + { + "epoch": 0.6653846645285767, + "grad_norm": 0.8124034404754639, + "learning_rate": 7.510450730534123e-05, + "loss": 0.8727, + "step": 104150 + }, + { + "epoch": 0.6654485516783154, + "grad_norm": 0.9794655442237854, + "learning_rate": 7.510016780822388e-05, + "loss": 0.9003, + "step": 104160 + }, + { + "epoch": 0.6655124388280541, + "grad_norm": 1.2169163227081299, + "learning_rate": 7.509582805832614e-05, + "loss": 0.9785, + "step": 104170 + }, + { + "epoch": 0.6655763259777928, + "grad_norm": 1.4729397296905518, + "learning_rate": 7.50914880556917e-05, + "loss": 1.0235, + "step": 104180 + }, + { + "epoch": 0.6656402131275315, + "grad_norm": 0.7866071462631226, + "learning_rate": 7.508714780036428e-05, + "loss": 0.7818, + "step": 104190 + }, + { + "epoch": 0.6657041002772702, + "grad_norm": 1.0959784984588623, + "learning_rate": 7.508280729238754e-05, + "loss": 1.0379, + "step": 104200 + }, + { + "epoch": 0.6657679874270089, + "grad_norm": 0.6036289930343628, + "learning_rate": 7.507846653180527e-05, + "loss": 0.7128, + "step": 104210 + }, + { + "epoch": 0.6658318745767476, + "grad_norm": 1.0480402708053589, + "learning_rate": 7.507412551866113e-05, + "loss": 0.8218, + "step": 104220 + }, + { + "epoch": 0.6658957617264863, + "grad_norm": 0.8238396048545837, + "learning_rate": 7.506978425299886e-05, + "loss": 1.0993, + "step": 104230 + }, + { + "epoch": 0.665959648876225, + "grad_norm": 0.6929308176040649, + "learning_rate": 7.506544273486216e-05, + "loss": 0.7918, + "step": 104240 + }, + { + "epoch": 0.6660235360259638, + "grad_norm": 0.9507032036781311, + "learning_rate": 7.506110096429478e-05, + "loss": 1.1289, + "step": 104250 + }, + { + "epoch": 0.6660874231757025, + "grad_norm": 0.6241841316223145, + "learning_rate": 7.505675894134042e-05, + "loss": 0.6933, + "step": 104260 + }, + { + "epoch": 0.6661513103254412, + "grad_norm": 1.4490808248519897, + "learning_rate": 7.505241666604284e-05, + "loss": 0.9477, + "step": 104270 + }, + { + "epoch": 0.6662151974751799, + "grad_norm": 1.379927635192871, + "learning_rate": 7.504807413844573e-05, + "loss": 0.7406, + "step": 104280 + }, + { + "epoch": 0.6662790846249186, + "grad_norm": 0.7105908393859863, + "learning_rate": 7.504373135859283e-05, + "loss": 0.981, + "step": 104290 + }, + { + "epoch": 0.6663429717746573, + "grad_norm": 2.1415674686431885, + "learning_rate": 7.503938832652793e-05, + "loss": 0.804, + "step": 104300 + }, + { + "epoch": 0.666406858924396, + "grad_norm": 1.058493733406067, + "learning_rate": 7.50350450422947e-05, + "loss": 0.8421, + "step": 104310 
+ }, + { + "epoch": 0.6664707460741347, + "grad_norm": 0.8077415823936462, + "learning_rate": 7.503070150593692e-05, + "loss": 1.0033, + "step": 104320 + }, + { + "epoch": 0.6665346332238733, + "grad_norm": 1.5287679433822632, + "learning_rate": 7.502635771749832e-05, + "loss": 0.9708, + "step": 104330 + }, + { + "epoch": 0.666598520373612, + "grad_norm": 1.087836742401123, + "learning_rate": 7.502201367702264e-05, + "loss": 1.0815, + "step": 104340 + }, + { + "epoch": 0.6666624075233507, + "grad_norm": 1.3868272304534912, + "learning_rate": 7.501766938455365e-05, + "loss": 0.8257, + "step": 104350 + }, + { + "epoch": 0.6667262946730894, + "grad_norm": 0.9557937979698181, + "learning_rate": 7.501332484013508e-05, + "loss": 0.9096, + "step": 104360 + }, + { + "epoch": 0.6667901818228281, + "grad_norm": 0.964483916759491, + "learning_rate": 7.50089800438107e-05, + "loss": 0.8804, + "step": 104370 + }, + { + "epoch": 0.6668540689725668, + "grad_norm": 0.955265462398529, + "learning_rate": 7.500463499562423e-05, + "loss": 1.2936, + "step": 104380 + }, + { + "epoch": 0.6669179561223055, + "grad_norm": 0.7339872717857361, + "learning_rate": 7.500028969561947e-05, + "loss": 0.6539, + "step": 104390 + }, + { + "epoch": 0.6669818432720442, + "grad_norm": 0.724774181842804, + "learning_rate": 7.499594414384015e-05, + "loss": 1.0477, + "step": 104400 + }, + { + "epoch": 0.667045730421783, + "grad_norm": 0.9447941184043884, + "learning_rate": 7.499159834033006e-05, + "loss": 0.9432, + "step": 104410 + }, + { + "epoch": 0.6671096175715217, + "grad_norm": 1.0058971643447876, + "learning_rate": 7.498725228513295e-05, + "loss": 0.9858, + "step": 104420 + }, + { + "epoch": 0.6671735047212604, + "grad_norm": 0.9895200133323669, + "learning_rate": 7.49829059782926e-05, + "loss": 0.9344, + "step": 104430 + }, + { + "epoch": 0.6672373918709991, + "grad_norm": 0.8452537655830383, + "learning_rate": 7.497855941985274e-05, + "loss": 0.9845, + "step": 104440 + }, + { + "epoch": 0.6673012790207378, + "grad_norm": 1.1709915399551392, + "learning_rate": 7.497421260985721e-05, + "loss": 0.6073, + "step": 104450 + }, + { + "epoch": 0.6673651661704765, + "grad_norm": 0.6118887662887573, + "learning_rate": 7.496986554834974e-05, + "loss": 0.8184, + "step": 104460 + }, + { + "epoch": 0.6674290533202152, + "grad_norm": 1.082446575164795, + "learning_rate": 7.496551823537414e-05, + "loss": 0.7765, + "step": 104470 + }, + { + "epoch": 0.6674929404699539, + "grad_norm": 1.0023239850997925, + "learning_rate": 7.496117067097416e-05, + "loss": 0.9153, + "step": 104480 + }, + { + "epoch": 0.6675568276196926, + "grad_norm": 0.8243518471717834, + "learning_rate": 7.49568228551936e-05, + "loss": 0.7647, + "step": 104490 + }, + { + "epoch": 0.6676207147694313, + "grad_norm": 1.0458136796951294, + "learning_rate": 7.495247478807624e-05, + "loss": 0.7552, + "step": 104500 + }, + { + "epoch": 0.66768460191917, + "grad_norm": 1.0639128684997559, + "learning_rate": 7.494856131281384e-05, + "loss": 1.1131, + "step": 104510 + }, + { + "epoch": 0.6677484890689087, + "grad_norm": 0.9773111343383789, + "learning_rate": 7.494421276827722e-05, + "loss": 0.9946, + "step": 104520 + }, + { + "epoch": 0.6678123762186474, + "grad_norm": 0.8806718587875366, + "learning_rate": 7.493986397253079e-05, + "loss": 0.79, + "step": 104530 + }, + { + "epoch": 0.6678762633683861, + "grad_norm": 0.7560052275657654, + "learning_rate": 7.493551492561835e-05, + "loss": 0.8424, + "step": 104540 + }, + { + "epoch": 0.6679401505181248, + "grad_norm": 
0.7948547601699829, + "learning_rate": 7.49311656275837e-05, + "loss": 0.992, + "step": 104550 + }, + { + "epoch": 0.6680040376678635, + "grad_norm": 0.8209701776504517, + "learning_rate": 7.492681607847064e-05, + "loss": 0.7234, + "step": 104560 + }, + { + "epoch": 0.6680679248176021, + "grad_norm": 0.6525776386260986, + "learning_rate": 7.492246627832297e-05, + "loss": 0.9871, + "step": 104570 + }, + { + "epoch": 0.6681318119673408, + "grad_norm": 0.7713031768798828, + "learning_rate": 7.491811622718454e-05, + "loss": 1.1684, + "step": 104580 + }, + { + "epoch": 0.6681956991170795, + "grad_norm": 0.7066755890846252, + "learning_rate": 7.49137659250991e-05, + "loss": 1.0549, + "step": 104590 + }, + { + "epoch": 0.6682595862668183, + "grad_norm": 0.9427279829978943, + "learning_rate": 7.490941537211047e-05, + "loss": 1.1706, + "step": 104600 + }, + { + "epoch": 0.668323473416557, + "grad_norm": 1.0990161895751953, + "learning_rate": 7.49050645682625e-05, + "loss": 0.7285, + "step": 104610 + }, + { + "epoch": 0.6683873605662957, + "grad_norm": 0.9260150790214539, + "learning_rate": 7.490071351359896e-05, + "loss": 1.0507, + "step": 104620 + }, + { + "epoch": 0.6684512477160344, + "grad_norm": 0.7509433627128601, + "learning_rate": 7.48963622081637e-05, + "loss": 1.1747, + "step": 104630 + }, + { + "epoch": 0.6685151348657731, + "grad_norm": 0.5178989768028259, + "learning_rate": 7.489201065200055e-05, + "loss": 0.7815, + "step": 104640 + }, + { + "epoch": 0.6685790220155118, + "grad_norm": 0.6780941486358643, + "learning_rate": 7.488765884515331e-05, + "loss": 0.8624, + "step": 104650 + }, + { + "epoch": 0.6686429091652505, + "grad_norm": 0.9320861101150513, + "learning_rate": 7.488330678766581e-05, + "loss": 0.7658, + "step": 104660 + }, + { + "epoch": 0.6687067963149892, + "grad_norm": 1.1950159072875977, + "learning_rate": 7.487895447958189e-05, + "loss": 0.7474, + "step": 104670 + }, + { + "epoch": 0.6687706834647279, + "grad_norm": 1.2028008699417114, + "learning_rate": 7.487460192094538e-05, + "loss": 1.0459, + "step": 104680 + }, + { + "epoch": 0.6688345706144666, + "grad_norm": 0.6161251068115234, + "learning_rate": 7.48702491118001e-05, + "loss": 0.8281, + "step": 104690 + }, + { + "epoch": 0.6688984577642053, + "grad_norm": 0.6632505059242249, + "learning_rate": 7.48658960521899e-05, + "loss": 0.927, + "step": 104700 + }, + { + "epoch": 0.668962344913944, + "grad_norm": 1.3930466175079346, + "learning_rate": 7.48615427421586e-05, + "loss": 0.8673, + "step": 104710 + }, + { + "epoch": 0.6690262320636827, + "grad_norm": 0.6448533535003662, + "learning_rate": 7.485718918175006e-05, + "loss": 1.0228, + "step": 104720 + }, + { + "epoch": 0.6690901192134214, + "grad_norm": 0.997040331363678, + "learning_rate": 7.485283537100813e-05, + "loss": 0.952, + "step": 104730 + }, + { + "epoch": 0.6691540063631601, + "grad_norm": 0.7598833441734314, + "learning_rate": 7.484848130997664e-05, + "loss": 0.7925, + "step": 104740 + }, + { + "epoch": 0.6692178935128988, + "grad_norm": 1.102980375289917, + "learning_rate": 7.484412699869946e-05, + "loss": 0.8564, + "step": 104750 + }, + { + "epoch": 0.6692817806626375, + "grad_norm": 0.7010207772254944, + "learning_rate": 7.483977243722042e-05, + "loss": 0.821, + "step": 104760 + }, + { + "epoch": 0.6693456678123763, + "grad_norm": 0.6999570727348328, + "learning_rate": 7.483541762558338e-05, + "loss": 0.9697, + "step": 104770 + }, + { + "epoch": 0.669409554962115, + "grad_norm": 1.1638367176055908, + "learning_rate": 7.48310625638322e-05, + "loss": 
0.7476, + "step": 104780 + }, + { + "epoch": 0.6694734421118537, + "grad_norm": 1.272621512413025, + "learning_rate": 7.482670725201075e-05, + "loss": 0.9064, + "step": 104790 + }, + { + "epoch": 0.6695373292615924, + "grad_norm": 1.063870906829834, + "learning_rate": 7.482235169016286e-05, + "loss": 1.0145, + "step": 104800 + }, + { + "epoch": 0.6696012164113311, + "grad_norm": 0.9829151630401611, + "learning_rate": 7.481799587833241e-05, + "loss": 0.7884, + "step": 104810 + }, + { + "epoch": 0.6696651035610697, + "grad_norm": 0.8304445147514343, + "learning_rate": 7.481363981656329e-05, + "loss": 0.7588, + "step": 104820 + }, + { + "epoch": 0.6697289907108084, + "grad_norm": 0.8410045504570007, + "learning_rate": 7.480928350489935e-05, + "loss": 0.8533, + "step": 104830 + }, + { + "epoch": 0.6697928778605471, + "grad_norm": 0.8726821541786194, + "learning_rate": 7.480492694338445e-05, + "loss": 0.9978, + "step": 104840 + }, + { + "epoch": 0.6698567650102858, + "grad_norm": 0.6928712129592896, + "learning_rate": 7.480057013206248e-05, + "loss": 0.5947, + "step": 104850 + }, + { + "epoch": 0.6699206521600245, + "grad_norm": 0.9680970907211304, + "learning_rate": 7.479621307097732e-05, + "loss": 1.0667, + "step": 104860 + }, + { + "epoch": 0.6699845393097632, + "grad_norm": 1.0033652782440186, + "learning_rate": 7.479185576017283e-05, + "loss": 0.8343, + "step": 104870 + }, + { + "epoch": 0.6700484264595019, + "grad_norm": 0.9829990863800049, + "learning_rate": 7.478749819969291e-05, + "loss": 0.8276, + "step": 104880 + }, + { + "epoch": 0.6701123136092406, + "grad_norm": 0.591493546962738, + "learning_rate": 7.478314038958144e-05, + "loss": 0.8754, + "step": 104890 + }, + { + "epoch": 0.6701762007589793, + "grad_norm": 0.9688683748245239, + "learning_rate": 7.477878232988231e-05, + "loss": 0.7653, + "step": 104900 + }, + { + "epoch": 0.670240087908718, + "grad_norm": 1.1329195499420166, + "learning_rate": 7.47744240206394e-05, + "loss": 1.1918, + "step": 104910 + }, + { + "epoch": 0.6703039750584567, + "grad_norm": 0.9358948469161987, + "learning_rate": 7.47700654618966e-05, + "loss": 0.9408, + "step": 104920 + }, + { + "epoch": 0.6703678622081954, + "grad_norm": 0.8225820064544678, + "learning_rate": 7.476570665369782e-05, + "loss": 0.7708, + "step": 104930 + }, + { + "epoch": 0.6704317493579341, + "grad_norm": 0.5274181962013245, + "learning_rate": 7.476134759608695e-05, + "loss": 1.0094, + "step": 104940 + }, + { + "epoch": 0.6704956365076729, + "grad_norm": 0.9145537614822388, + "learning_rate": 7.475698828910789e-05, + "loss": 1.132, + "step": 104950 + }, + { + "epoch": 0.6705595236574116, + "grad_norm": 0.9951286315917969, + "learning_rate": 7.475262873280453e-05, + "loss": 0.7787, + "step": 104960 + }, + { + "epoch": 0.6706234108071503, + "grad_norm": 0.9020349979400635, + "learning_rate": 7.47482689272208e-05, + "loss": 0.8919, + "step": 104970 + }, + { + "epoch": 0.670687297956889, + "grad_norm": 1.0705788135528564, + "learning_rate": 7.474390887240058e-05, + "loss": 1.0581, + "step": 104980 + }, + { + "epoch": 0.6707511851066277, + "grad_norm": 0.665863573551178, + "learning_rate": 7.47395485683878e-05, + "loss": 1.1165, + "step": 104990 + }, + { + "epoch": 0.6708150722563664, + "grad_norm": 0.692719042301178, + "learning_rate": 7.473518801522636e-05, + "loss": 0.927, + "step": 105000 + }, + { + "epoch": 0.6708789594061051, + "grad_norm": 1.2955323457717896, + "learning_rate": 7.473082721296017e-05, + "loss": 0.7546, + "step": 105010 + }, + { + "epoch": 0.6709428465558438, + 
"grad_norm": 3.011267900466919, + "learning_rate": 7.472646616163317e-05, + "loss": 0.8958, + "step": 105020 + }, + { + "epoch": 0.6710067337055825, + "grad_norm": 1.2101144790649414, + "learning_rate": 7.472210486128926e-05, + "loss": 0.8662, + "step": 105030 + }, + { + "epoch": 0.6710706208553212, + "grad_norm": 1.1374763250350952, + "learning_rate": 7.471774331197235e-05, + "loss": 0.7575, + "step": 105040 + }, + { + "epoch": 0.6711345080050599, + "grad_norm": 0.7340751886367798, + "learning_rate": 7.47133815137264e-05, + "loss": 0.9587, + "step": 105050 + }, + { + "epoch": 0.6711983951547985, + "grad_norm": 0.9507089853286743, + "learning_rate": 7.470901946659529e-05, + "loss": 0.7413, + "step": 105060 + }, + { + "epoch": 0.6712622823045372, + "grad_norm": 0.9281381368637085, + "learning_rate": 7.470465717062301e-05, + "loss": 0.8137, + "step": 105070 + }, + { + "epoch": 0.6713261694542759, + "grad_norm": 1.0482163429260254, + "learning_rate": 7.470029462585344e-05, + "loss": 0.9596, + "step": 105080 + }, + { + "epoch": 0.6713900566040146, + "grad_norm": 0.8228697180747986, + "learning_rate": 7.469593183233055e-05, + "loss": 0.9596, + "step": 105090 + }, + { + "epoch": 0.6714539437537533, + "grad_norm": 0.9611666798591614, + "learning_rate": 7.469156879009824e-05, + "loss": 0.9952, + "step": 105100 + }, + { + "epoch": 0.671517830903492, + "grad_norm": 0.9121928811073303, + "learning_rate": 7.468720549920049e-05, + "loss": 0.9997, + "step": 105110 + }, + { + "epoch": 0.6715817180532307, + "grad_norm": 0.6626456379890442, + "learning_rate": 7.468284195968122e-05, + "loss": 1.3452, + "step": 105120 + }, + { + "epoch": 0.6716456052029695, + "grad_norm": 1.1006265878677368, + "learning_rate": 7.467847817158438e-05, + "loss": 0.8195, + "step": 105130 + }, + { + "epoch": 0.6717094923527082, + "grad_norm": 0.881610631942749, + "learning_rate": 7.46741141349539e-05, + "loss": 0.7741, + "step": 105140 + }, + { + "epoch": 0.6717733795024469, + "grad_norm": 0.8922167420387268, + "learning_rate": 7.466974984983374e-05, + "loss": 0.6272, + "step": 105150 + }, + { + "epoch": 0.6718372666521856, + "grad_norm": 0.9514210820198059, + "learning_rate": 7.466538531626788e-05, + "loss": 0.8509, + "step": 105160 + }, + { + "epoch": 0.6719011538019243, + "grad_norm": 1.4604068994522095, + "learning_rate": 7.466102053430023e-05, + "loss": 1.0156, + "step": 105170 + }, + { + "epoch": 0.671965040951663, + "grad_norm": 0.619574785232544, + "learning_rate": 7.46566555039748e-05, + "loss": 0.8557, + "step": 105180 + }, + { + "epoch": 0.6720289281014017, + "grad_norm": 0.6358922123908997, + "learning_rate": 7.46522902253355e-05, + "loss": 0.7148, + "step": 105190 + }, + { + "epoch": 0.6720928152511404, + "grad_norm": 1.179234266281128, + "learning_rate": 7.46479246984263e-05, + "loss": 1.074, + "step": 105200 + }, + { + "epoch": 0.6721567024008791, + "grad_norm": 1.14448881149292, + "learning_rate": 7.464355892329119e-05, + "loss": 1.2771, + "step": 105210 + }, + { + "epoch": 0.6722205895506178, + "grad_norm": 1.1141912937164307, + "learning_rate": 7.463919289997413e-05, + "loss": 1.0229, + "step": 105220 + }, + { + "epoch": 0.6722844767003565, + "grad_norm": 1.5927108526229858, + "learning_rate": 7.463482662851904e-05, + "loss": 0.7145, + "step": 105230 + }, + { + "epoch": 0.6723483638500952, + "grad_norm": 0.9684871435165405, + "learning_rate": 7.463046010896996e-05, + "loss": 0.862, + "step": 105240 + }, + { + "epoch": 0.6724122509998339, + "grad_norm": 2.3723561763763428, + "learning_rate": 
7.462609334137085e-05, + "loss": 0.8466, + "step": 105250 + }, + { + "epoch": 0.6724761381495726, + "grad_norm": 1.4940024614334106, + "learning_rate": 7.462172632576566e-05, + "loss": 1.0536, + "step": 105260 + }, + { + "epoch": 0.6725400252993113, + "grad_norm": 2.5333075523376465, + "learning_rate": 7.46173590621984e-05, + "loss": 0.8354, + "step": 105270 + }, + { + "epoch": 0.67260391244905, + "grad_norm": 0.7402299046516418, + "learning_rate": 7.461299155071302e-05, + "loss": 0.8569, + "step": 105280 + }, + { + "epoch": 0.6726677995987888, + "grad_norm": 1.2563143968582153, + "learning_rate": 7.460862379135353e-05, + "loss": 0.9467, + "step": 105290 + }, + { + "epoch": 0.6727316867485273, + "grad_norm": 0.6169116497039795, + "learning_rate": 7.460425578416392e-05, + "loss": 0.8101, + "step": 105300 + }, + { + "epoch": 0.672795573898266, + "grad_norm": 0.9723251461982727, + "learning_rate": 7.459988752918815e-05, + "loss": 0.836, + "step": 105310 + }, + { + "epoch": 0.6728594610480048, + "grad_norm": 0.9852758049964905, + "learning_rate": 7.459551902647023e-05, + "loss": 0.7975, + "step": 105320 + }, + { + "epoch": 0.6729233481977435, + "grad_norm": 0.8491624593734741, + "learning_rate": 7.459115027605416e-05, + "loss": 1.0277, + "step": 105330 + }, + { + "epoch": 0.6729872353474822, + "grad_norm": 0.9614421725273132, + "learning_rate": 7.458678127798394e-05, + "loss": 0.6645, + "step": 105340 + }, + { + "epoch": 0.6730511224972209, + "grad_norm": 0.9115906357765198, + "learning_rate": 7.458241203230355e-05, + "loss": 0.8675, + "step": 105350 + }, + { + "epoch": 0.6731150096469596, + "grad_norm": 0.9010282158851624, + "learning_rate": 7.457804253905701e-05, + "loss": 0.8478, + "step": 105360 + }, + { + "epoch": 0.6731788967966983, + "grad_norm": 1.2737298011779785, + "learning_rate": 7.457367279828833e-05, + "loss": 0.7011, + "step": 105370 + }, + { + "epoch": 0.673242783946437, + "grad_norm": 1.0771639347076416, + "learning_rate": 7.456930281004148e-05, + "loss": 1.0038, + "step": 105380 + }, + { + "epoch": 0.6733066710961757, + "grad_norm": 0.9873582124710083, + "learning_rate": 7.456493257436052e-05, + "loss": 0.858, + "step": 105390 + }, + { + "epoch": 0.6733705582459144, + "grad_norm": 0.7805922031402588, + "learning_rate": 7.456056209128942e-05, + "loss": 0.9136, + "step": 105400 + }, + { + "epoch": 0.6734344453956531, + "grad_norm": 0.9018038511276245, + "learning_rate": 7.455619136087221e-05, + "loss": 1.0227, + "step": 105410 + }, + { + "epoch": 0.6734983325453918, + "grad_norm": 0.6552641987800598, + "learning_rate": 7.455182038315294e-05, + "loss": 1.0684, + "step": 105420 + }, + { + "epoch": 0.6735622196951305, + "grad_norm": 1.088218331336975, + "learning_rate": 7.454744915817557e-05, + "loss": 0.833, + "step": 105430 + }, + { + "epoch": 0.6736261068448692, + "grad_norm": 1.2808659076690674, + "learning_rate": 7.454307768598416e-05, + "loss": 0.6516, + "step": 105440 + }, + { + "epoch": 0.6736899939946079, + "grad_norm": 0.9723607301712036, + "learning_rate": 7.453870596662271e-05, + "loss": 1.0049, + "step": 105450 + }, + { + "epoch": 0.6737538811443466, + "grad_norm": 0.6379223465919495, + "learning_rate": 7.453433400013528e-05, + "loss": 0.9626, + "step": 105460 + }, + { + "epoch": 0.6738177682940854, + "grad_norm": 0.5692765712738037, + "learning_rate": 7.452996178656587e-05, + "loss": 0.7118, + "step": 105470 + }, + { + "epoch": 0.6738816554438241, + "grad_norm": 0.4991033971309662, + "learning_rate": 7.452558932595853e-05, + "loss": 0.8539, + "step": 105480 + }, + { 
+ "epoch": 0.6739455425935628, + "grad_norm": 0.6770216226577759, + "learning_rate": 7.45212166183573e-05, + "loss": 0.797, + "step": 105490 + }, + { + "epoch": 0.6740094297433015, + "grad_norm": 1.0302858352661133, + "learning_rate": 7.451728097037279e-05, + "loss": 1.1371, + "step": 105500 + }, + { + "epoch": 0.6740733168930402, + "grad_norm": 0.986290693283081, + "learning_rate": 7.451290779360444e-05, + "loss": 0.8325, + "step": 105510 + }, + { + "epoch": 0.6741372040427789, + "grad_norm": 0.6854764223098755, + "learning_rate": 7.450853436996992e-05, + "loss": 1.201, + "step": 105520 + }, + { + "epoch": 0.6742010911925176, + "grad_norm": 1.172593116760254, + "learning_rate": 7.450416069951324e-05, + "loss": 0.7934, + "step": 105530 + }, + { + "epoch": 0.6742649783422563, + "grad_norm": 0.6270721554756165, + "learning_rate": 7.44997867822785e-05, + "loss": 0.944, + "step": 105540 + }, + { + "epoch": 0.6743288654919949, + "grad_norm": 1.119352102279663, + "learning_rate": 7.449541261830968e-05, + "loss": 0.9087, + "step": 105550 + }, + { + "epoch": 0.6743927526417336, + "grad_norm": 1.074959397315979, + "learning_rate": 7.449103820765086e-05, + "loss": 0.7586, + "step": 105560 + }, + { + "epoch": 0.6744566397914723, + "grad_norm": 0.7056079506874084, + "learning_rate": 7.44866635503461e-05, + "loss": 0.9738, + "step": 105570 + }, + { + "epoch": 0.674520526941211, + "grad_norm": 0.5963863730430603, + "learning_rate": 7.448228864643947e-05, + "loss": 0.7275, + "step": 105580 + }, + { + "epoch": 0.6745844140909497, + "grad_norm": 0.836320698261261, + "learning_rate": 7.447791349597502e-05, + "loss": 0.9359, + "step": 105590 + }, + { + "epoch": 0.6746483012406884, + "grad_norm": 0.8702114224433899, + "learning_rate": 7.447353809899677e-05, + "loss": 0.8465, + "step": 105600 + }, + { + "epoch": 0.6747121883904271, + "grad_norm": 0.9476937651634216, + "learning_rate": 7.446916245554885e-05, + "loss": 1.0381, + "step": 105610 + }, + { + "epoch": 0.6747760755401658, + "grad_norm": 1.5769599676132202, + "learning_rate": 7.446478656567529e-05, + "loss": 1.1323, + "step": 105620 + }, + { + "epoch": 0.6748399626899045, + "grad_norm": 0.9263478517532349, + "learning_rate": 7.446041042942016e-05, + "loss": 1.2046, + "step": 105630 + }, + { + "epoch": 0.6749038498396432, + "grad_norm": 0.5203749537467957, + "learning_rate": 7.445603404682754e-05, + "loss": 0.9345, + "step": 105640 + }, + { + "epoch": 0.674967736989382, + "grad_norm": 1.511046290397644, + "learning_rate": 7.445165741794149e-05, + "loss": 0.7772, + "step": 105650 + }, + { + "epoch": 0.6750316241391207, + "grad_norm": 0.868693470954895, + "learning_rate": 7.44472805428061e-05, + "loss": 0.9579, + "step": 105660 + }, + { + "epoch": 0.6750955112888594, + "grad_norm": 0.7717391848564148, + "learning_rate": 7.444290342146545e-05, + "loss": 0.911, + "step": 105670 + }, + { + "epoch": 0.6751593984385981, + "grad_norm": 1.1811197996139526, + "learning_rate": 7.443852605396361e-05, + "loss": 0.8844, + "step": 105680 + }, + { + "epoch": 0.6752232855883368, + "grad_norm": 0.9373357892036438, + "learning_rate": 7.443414844034468e-05, + "loss": 1.0509, + "step": 105690 + }, + { + "epoch": 0.6752871727380755, + "grad_norm": 1.0546302795410156, + "learning_rate": 7.442977058065273e-05, + "loss": 0.8802, + "step": 105700 + }, + { + "epoch": 0.6753510598878142, + "grad_norm": 0.8621144890785217, + "learning_rate": 7.442539247493185e-05, + "loss": 0.6729, + "step": 105710 + }, + { + "epoch": 0.6754149470375529, + "grad_norm": 0.6948429942131042, + 
"learning_rate": 7.442101412322613e-05, + "loss": 0.9194, + "step": 105720 + }, + { + "epoch": 0.6754788341872916, + "grad_norm": 1.381230354309082, + "learning_rate": 7.441663552557969e-05, + "loss": 0.7042, + "step": 105730 + }, + { + "epoch": 0.6755427213370303, + "grad_norm": 1.3487558364868164, + "learning_rate": 7.441225668203658e-05, + "loss": 0.8875, + "step": 105740 + }, + { + "epoch": 0.675606608486769, + "grad_norm": 1.0803183317184448, + "learning_rate": 7.440787759264095e-05, + "loss": 1.0096, + "step": 105750 + }, + { + "epoch": 0.6756704956365077, + "grad_norm": 1.2716935873031616, + "learning_rate": 7.440349825743687e-05, + "loss": 0.7651, + "step": 105760 + }, + { + "epoch": 0.6757343827862464, + "grad_norm": 1.0950727462768555, + "learning_rate": 7.439911867646845e-05, + "loss": 0.8156, + "step": 105770 + }, + { + "epoch": 0.6757982699359851, + "grad_norm": 0.8086333870887756, + "learning_rate": 7.43947388497798e-05, + "loss": 0.8297, + "step": 105780 + }, + { + "epoch": 0.6758621570857237, + "grad_norm": 0.7670168876647949, + "learning_rate": 7.439035877741503e-05, + "loss": 0.855, + "step": 105790 + }, + { + "epoch": 0.6759260442354624, + "grad_norm": 0.7938393950462341, + "learning_rate": 7.438597845941824e-05, + "loss": 0.8926, + "step": 105800 + }, + { + "epoch": 0.6759899313852011, + "grad_norm": 0.7349621057510376, + "learning_rate": 7.438159789583354e-05, + "loss": 0.9497, + "step": 105810 + }, + { + "epoch": 0.6760538185349398, + "grad_norm": 0.7684302926063538, + "learning_rate": 7.437721708670508e-05, + "loss": 0.9919, + "step": 105820 + }, + { + "epoch": 0.6761177056846785, + "grad_norm": 1.0219396352767944, + "learning_rate": 7.437283603207693e-05, + "loss": 0.8476, + "step": 105830 + }, + { + "epoch": 0.6761815928344173, + "grad_norm": 0.8535874485969543, + "learning_rate": 7.436845473199325e-05, + "loss": 0.8841, + "step": 105840 + }, + { + "epoch": 0.676245479984156, + "grad_norm": 0.8949576020240784, + "learning_rate": 7.436407318649814e-05, + "loss": 0.8545, + "step": 105850 + }, + { + "epoch": 0.6763093671338947, + "grad_norm": 1.5550041198730469, + "learning_rate": 7.435969139563574e-05, + "loss": 1.1006, + "step": 105860 + }, + { + "epoch": 0.6763732542836334, + "grad_norm": 0.8512755036354065, + "learning_rate": 7.435530935945018e-05, + "loss": 0.9107, + "step": 105870 + }, + { + "epoch": 0.6764371414333721, + "grad_norm": 0.6899836659431458, + "learning_rate": 7.435092707798559e-05, + "loss": 1.1033, + "step": 105880 + }, + { + "epoch": 0.6765010285831108, + "grad_norm": 0.6218075156211853, + "learning_rate": 7.434654455128607e-05, + "loss": 0.8597, + "step": 105890 + }, + { + "epoch": 0.6765649157328495, + "grad_norm": 0.6466425657272339, + "learning_rate": 7.43421617793958e-05, + "loss": 0.798, + "step": 105900 + }, + { + "epoch": 0.6766288028825882, + "grad_norm": 0.6886029243469238, + "learning_rate": 7.43377787623589e-05, + "loss": 0.8296, + "step": 105910 + }, + { + "epoch": 0.6766926900323269, + "grad_norm": 0.6372695565223694, + "learning_rate": 7.433339550021951e-05, + "loss": 0.8947, + "step": 105920 + }, + { + "epoch": 0.6767565771820656, + "grad_norm": 0.8008190393447876, + "learning_rate": 7.43290119930218e-05, + "loss": 0.9288, + "step": 105930 + }, + { + "epoch": 0.6768204643318043, + "grad_norm": 0.8058283925056458, + "learning_rate": 7.432462824080985e-05, + "loss": 0.8823, + "step": 105940 + }, + { + "epoch": 0.676884351481543, + "grad_norm": 0.7353378534317017, + "learning_rate": 7.432024424362789e-05, + "loss": 0.9896, + "step": 
105950 + }, + { + "epoch": 0.6769482386312817, + "grad_norm": 1.0405430793762207, + "learning_rate": 7.431586000152001e-05, + "loss": 1.3537, + "step": 105960 + }, + { + "epoch": 0.6770121257810204, + "grad_norm": 0.5413171052932739, + "learning_rate": 7.431147551453038e-05, + "loss": 0.8819, + "step": 105970 + }, + { + "epoch": 0.6770760129307591, + "grad_norm": 1.0479340553283691, + "learning_rate": 7.430709078270316e-05, + "loss": 0.9891, + "step": 105980 + }, + { + "epoch": 0.6771399000804978, + "grad_norm": 1.1004263162612915, + "learning_rate": 7.430270580608252e-05, + "loss": 0.8119, + "step": 105990 + }, + { + "epoch": 0.6772037872302366, + "grad_norm": 0.6329840421676636, + "learning_rate": 7.42983205847126e-05, + "loss": 0.8123, + "step": 106000 + }, + { + "epoch": 0.6772676743799753, + "grad_norm": 0.8763070702552795, + "learning_rate": 7.429393511863757e-05, + "loss": 0.8839, + "step": 106010 + }, + { + "epoch": 0.677331561529714, + "grad_norm": 1.7734843492507935, + "learning_rate": 7.42895494079016e-05, + "loss": 0.9038, + "step": 106020 + }, + { + "epoch": 0.6773954486794526, + "grad_norm": 1.5639463663101196, + "learning_rate": 7.428516345254886e-05, + "loss": 0.7489, + "step": 106030 + }, + { + "epoch": 0.6774593358291913, + "grad_norm": 0.8904886245727539, + "learning_rate": 7.42807772526235e-05, + "loss": 0.7974, + "step": 106040 + }, + { + "epoch": 0.67752322297893, + "grad_norm": 0.9649606347084045, + "learning_rate": 7.42763908081697e-05, + "loss": 1.0977, + "step": 106050 + }, + { + "epoch": 0.6775871101286687, + "grad_norm": 0.7616420984268188, + "learning_rate": 7.427200411923166e-05, + "loss": 0.9091, + "step": 106060 + }, + { + "epoch": 0.6776509972784074, + "grad_norm": 1.3879841566085815, + "learning_rate": 7.426761718585353e-05, + "loss": 0.9823, + "step": 106070 + }, + { + "epoch": 0.6777148844281461, + "grad_norm": 1.2231416702270508, + "learning_rate": 7.426323000807951e-05, + "loss": 0.8862, + "step": 106080 + }, + { + "epoch": 0.6777787715778848, + "grad_norm": 1.1057007312774658, + "learning_rate": 7.425884258595377e-05, + "loss": 0.9518, + "step": 106090 + }, + { + "epoch": 0.6778426587276235, + "grad_norm": 0.7669041156768799, + "learning_rate": 7.42544549195205e-05, + "loss": 0.8201, + "step": 106100 + }, + { + "epoch": 0.6779065458773622, + "grad_norm": 0.9496064186096191, + "learning_rate": 7.425006700882388e-05, + "loss": 0.747, + "step": 106110 + }, + { + "epoch": 0.6779704330271009, + "grad_norm": 0.8966147899627686, + "learning_rate": 7.424567885390811e-05, + "loss": 0.9232, + "step": 106120 + }, + { + "epoch": 0.6780343201768396, + "grad_norm": 0.8240459561347961, + "learning_rate": 7.424129045481738e-05, + "loss": 0.9572, + "step": 106130 + }, + { + "epoch": 0.6780982073265783, + "grad_norm": 0.9006532430648804, + "learning_rate": 7.423690181159588e-05, + "loss": 1.0682, + "step": 106140 + }, + { + "epoch": 0.678162094476317, + "grad_norm": 0.6999794840812683, + "learning_rate": 7.423251292428782e-05, + "loss": 0.7835, + "step": 106150 + }, + { + "epoch": 0.6782259816260557, + "grad_norm": 0.639180600643158, + "learning_rate": 7.422812379293738e-05, + "loss": 0.9808, + "step": 106160 + }, + { + "epoch": 0.6782898687757944, + "grad_norm": 0.9818177819252014, + "learning_rate": 7.422373441758877e-05, + "loss": 0.9845, + "step": 106170 + }, + { + "epoch": 0.6783537559255332, + "grad_norm": 0.85085529088974, + "learning_rate": 7.421934479828621e-05, + "loss": 1.0079, + "step": 106180 + }, + { + "epoch": 0.6784176430752719, + "grad_norm": 
1.0107144117355347, + "learning_rate": 7.421495493507388e-05, + "loss": 0.71, + "step": 106190 + }, + { + "epoch": 0.6784815302250106, + "grad_norm": 0.8467554450035095, + "learning_rate": 7.421056482799602e-05, + "loss": 0.8878, + "step": 106200 + }, + { + "epoch": 0.6785454173747493, + "grad_norm": 1.0272150039672852, + "learning_rate": 7.42061744770968e-05, + "loss": 1.0742, + "step": 106210 + }, + { + "epoch": 0.678609304524488, + "grad_norm": 0.5289245247840881, + "learning_rate": 7.42017838824205e-05, + "loss": 0.6677, + "step": 106220 + }, + { + "epoch": 0.6786731916742267, + "grad_norm": 0.78628009557724, + "learning_rate": 7.419739304401127e-05, + "loss": 0.9517, + "step": 106230 + }, + { + "epoch": 0.6787370788239654, + "grad_norm": 1.0156890153884888, + "learning_rate": 7.419300196191338e-05, + "loss": 1.0061, + "step": 106240 + }, + { + "epoch": 0.6788009659737041, + "grad_norm": 1.2271900177001953, + "learning_rate": 7.418861063617102e-05, + "loss": 0.8683, + "step": 106250 + }, + { + "epoch": 0.6788648531234428, + "grad_norm": 2.280670404434204, + "learning_rate": 7.41842190668284e-05, + "loss": 0.7783, + "step": 106260 + }, + { + "epoch": 0.6789287402731814, + "grad_norm": 0.7349517345428467, + "learning_rate": 7.41798272539298e-05, + "loss": 0.8688, + "step": 106270 + }, + { + "epoch": 0.6789926274229201, + "grad_norm": 0.9518811702728271, + "learning_rate": 7.417543519751943e-05, + "loss": 0.7962, + "step": 106280 + }, + { + "epoch": 0.6790565145726588, + "grad_norm": 1.090990662574768, + "learning_rate": 7.41710428976415e-05, + "loss": 0.9509, + "step": 106290 + }, + { + "epoch": 0.6791204017223975, + "grad_norm": 0.817570149898529, + "learning_rate": 7.416665035434025e-05, + "loss": 0.8855, + "step": 106300 + }, + { + "epoch": 0.6791842888721362, + "grad_norm": 0.6482291221618652, + "learning_rate": 7.416225756765993e-05, + "loss": 0.7992, + "step": 106310 + }, + { + "epoch": 0.6792481760218749, + "grad_norm": 1.4157582521438599, + "learning_rate": 7.415786453764478e-05, + "loss": 1.0889, + "step": 106320 + }, + { + "epoch": 0.6793120631716136, + "grad_norm": 1.1152769327163696, + "learning_rate": 7.415347126433903e-05, + "loss": 1.1192, + "step": 106330 + }, + { + "epoch": 0.6793759503213523, + "grad_norm": 0.8868082761764526, + "learning_rate": 7.414907774778693e-05, + "loss": 0.7684, + "step": 106340 + }, + { + "epoch": 0.679439837471091, + "grad_norm": 0.8544641137123108, + "learning_rate": 7.414468398803272e-05, + "loss": 0.8531, + "step": 106350 + }, + { + "epoch": 0.6795037246208298, + "grad_norm": 2.2637743949890137, + "learning_rate": 7.414028998512065e-05, + "loss": 0.8975, + "step": 106360 + }, + { + "epoch": 0.6795676117705685, + "grad_norm": 0.912253737449646, + "learning_rate": 7.413589573909498e-05, + "loss": 0.6954, + "step": 106370 + }, + { + "epoch": 0.6796314989203072, + "grad_norm": 0.7223014831542969, + "learning_rate": 7.413150124999997e-05, + "loss": 0.9605, + "step": 106380 + }, + { + "epoch": 0.6796953860700459, + "grad_norm": 0.7518347501754761, + "learning_rate": 7.412710651787986e-05, + "loss": 0.7462, + "step": 106390 + }, + { + "epoch": 0.6797592732197846, + "grad_norm": 0.6483036279678345, + "learning_rate": 7.412271154277891e-05, + "loss": 0.8934, + "step": 106400 + }, + { + "epoch": 0.6798231603695233, + "grad_norm": 1.010314702987671, + "learning_rate": 7.411831632474138e-05, + "loss": 0.724, + "step": 106410 + }, + { + "epoch": 0.679887047519262, + "grad_norm": 0.7592995762825012, + "learning_rate": 7.411392086381154e-05, + "loss": 
0.8157, + "step": 106420 + }, + { + "epoch": 0.6799509346690007, + "grad_norm": 1.2588444948196411, + "learning_rate": 7.410952516003367e-05, + "loss": 0.9328, + "step": 106430 + }, + { + "epoch": 0.6800148218187394, + "grad_norm": 0.7056863903999329, + "learning_rate": 7.410512921345201e-05, + "loss": 1.0183, + "step": 106440 + }, + { + "epoch": 0.6800787089684781, + "grad_norm": 0.7405192255973816, + "learning_rate": 7.410073302411085e-05, + "loss": 0.7652, + "step": 106450 + }, + { + "epoch": 0.6801425961182168, + "grad_norm": 0.6608672738075256, + "learning_rate": 7.409633659205446e-05, + "loss": 0.9101, + "step": 106460 + }, + { + "epoch": 0.6802064832679555, + "grad_norm": 1.030137300491333, + "learning_rate": 7.409193991732711e-05, + "loss": 0.849, + "step": 106470 + }, + { + "epoch": 0.6802703704176942, + "grad_norm": 0.6124225854873657, + "learning_rate": 7.40875429999731e-05, + "loss": 0.758, + "step": 106480 + }, + { + "epoch": 0.6803342575674329, + "grad_norm": 0.8795433640480042, + "learning_rate": 7.408314584003666e-05, + "loss": 1.0669, + "step": 106490 + }, + { + "epoch": 0.6803981447171716, + "grad_norm": 0.7640893459320068, + "learning_rate": 7.407874843756213e-05, + "loss": 0.8179, + "step": 106500 + }, + { + "epoch": 0.6804620318669103, + "grad_norm": 0.6787682771682739, + "learning_rate": 7.407435079259377e-05, + "loss": 0.9895, + "step": 106510 + }, + { + "epoch": 0.6805259190166489, + "grad_norm": 0.8706437349319458, + "learning_rate": 7.406995290517587e-05, + "loss": 0.7269, + "step": 106520 + }, + { + "epoch": 0.6805898061663876, + "grad_norm": 0.6258346438407898, + "learning_rate": 7.406555477535271e-05, + "loss": 1.0131, + "step": 106530 + }, + { + "epoch": 0.6806536933161264, + "grad_norm": 1.0943886041641235, + "learning_rate": 7.406115640316861e-05, + "loss": 0.7417, + "step": 106540 + }, + { + "epoch": 0.6807175804658651, + "grad_norm": 0.7393679618835449, + "learning_rate": 7.405675778866785e-05, + "loss": 0.8613, + "step": 106550 + }, + { + "epoch": 0.6807814676156038, + "grad_norm": 0.8770964741706848, + "learning_rate": 7.40523589318947e-05, + "loss": 1.2317, + "step": 106560 + }, + { + "epoch": 0.6808453547653425, + "grad_norm": 0.980842649936676, + "learning_rate": 7.404795983289351e-05, + "loss": 0.8648, + "step": 106570 + }, + { + "epoch": 0.6809092419150812, + "grad_norm": 0.7715876698493958, + "learning_rate": 7.404356049170856e-05, + "loss": 0.9493, + "step": 106580 + }, + { + "epoch": 0.6809731290648199, + "grad_norm": 0.8744866847991943, + "learning_rate": 7.403916090838414e-05, + "loss": 1.3351, + "step": 106590 + }, + { + "epoch": 0.6810370162145586, + "grad_norm": 2.178861618041992, + "learning_rate": 7.403476108296458e-05, + "loss": 1.0401, + "step": 106600 + }, + { + "epoch": 0.6811009033642973, + "grad_norm": 0.7490164637565613, + "learning_rate": 7.40303610154942e-05, + "loss": 0.8825, + "step": 106610 + }, + { + "epoch": 0.681164790514036, + "grad_norm": 1.3160593509674072, + "learning_rate": 7.402596070601729e-05, + "loss": 0.9475, + "step": 106620 + }, + { + "epoch": 0.6812286776637747, + "grad_norm": 0.7300577163696289, + "learning_rate": 7.402156015457815e-05, + "loss": 0.7993, + "step": 106630 + }, + { + "epoch": 0.6812925648135134, + "grad_norm": 1.1624113321304321, + "learning_rate": 7.401715936122114e-05, + "loss": 1.0644, + "step": 106640 + }, + { + "epoch": 0.6813564519632521, + "grad_norm": 0.6754822134971619, + "learning_rate": 7.401275832599054e-05, + "loss": 0.9375, + "step": 106650 + }, + { + "epoch": 0.6814203391129908, + 
"grad_norm": 0.8442546129226685, + "learning_rate": 7.40083570489307e-05, + "loss": 1.0102, + "step": 106660 + }, + { + "epoch": 0.6814842262627295, + "grad_norm": 0.8470264673233032, + "learning_rate": 7.400395553008593e-05, + "loss": 0.8809, + "step": 106670 + }, + { + "epoch": 0.6815481134124682, + "grad_norm": 1.252909541130066, + "learning_rate": 7.399955376950056e-05, + "loss": 0.9274, + "step": 106680 + }, + { + "epoch": 0.6816120005622069, + "grad_norm": 1.0591319799423218, + "learning_rate": 7.399515176721894e-05, + "loss": 0.7077, + "step": 106690 + }, + { + "epoch": 0.6816758877119456, + "grad_norm": 0.9662178754806519, + "learning_rate": 7.399074952328536e-05, + "loss": 0.9326, + "step": 106700 + }, + { + "epoch": 0.6817397748616844, + "grad_norm": 0.6794439554214478, + "learning_rate": 7.398634703774417e-05, + "loss": 0.9654, + "step": 106710 + }, + { + "epoch": 0.6818036620114231, + "grad_norm": 1.1868617534637451, + "learning_rate": 7.398194431063974e-05, + "loss": 0.6711, + "step": 106720 + }, + { + "epoch": 0.6818675491611618, + "grad_norm": 0.6283101439476013, + "learning_rate": 7.397754134201637e-05, + "loss": 0.7644, + "step": 106730 + }, + { + "epoch": 0.6819314363109005, + "grad_norm": 0.9207131862640381, + "learning_rate": 7.397313813191842e-05, + "loss": 1.1804, + "step": 106740 + }, + { + "epoch": 0.6819953234606392, + "grad_norm": 0.7542859315872192, + "learning_rate": 7.396873468039022e-05, + "loss": 1.1054, + "step": 106750 + }, + { + "epoch": 0.6820592106103778, + "grad_norm": 1.1628599166870117, + "learning_rate": 7.396433098747613e-05, + "loss": 0.7886, + "step": 106760 + }, + { + "epoch": 0.6821230977601165, + "grad_norm": 0.9535654187202454, + "learning_rate": 7.39599270532205e-05, + "loss": 0.8177, + "step": 106770 + }, + { + "epoch": 0.6821869849098552, + "grad_norm": 1.606237769126892, + "learning_rate": 7.395552287766766e-05, + "loss": 0.9816, + "step": 106780 + }, + { + "epoch": 0.6822508720595939, + "grad_norm": 0.8882198333740234, + "learning_rate": 7.395111846086201e-05, + "loss": 0.6792, + "step": 106790 + }, + { + "epoch": 0.6823147592093326, + "grad_norm": 0.7362374067306519, + "learning_rate": 7.394671380284784e-05, + "loss": 0.8806, + "step": 106800 + }, + { + "epoch": 0.6823786463590713, + "grad_norm": 0.7599479556083679, + "learning_rate": 7.394230890366956e-05, + "loss": 0.8613, + "step": 106810 + }, + { + "epoch": 0.68244253350881, + "grad_norm": 0.7655912041664124, + "learning_rate": 7.393790376337153e-05, + "loss": 0.8717, + "step": 106820 + }, + { + "epoch": 0.6825064206585487, + "grad_norm": 1.046034812927246, + "learning_rate": 7.393349838199809e-05, + "loss": 1.1742, + "step": 106830 + }, + { + "epoch": 0.6825703078082874, + "grad_norm": 0.7715229392051697, + "learning_rate": 7.392909275959362e-05, + "loss": 1.078, + "step": 106840 + }, + { + "epoch": 0.6826341949580261, + "grad_norm": 0.7597649097442627, + "learning_rate": 7.39246868962025e-05, + "loss": 0.971, + "step": 106850 + }, + { + "epoch": 0.6826980821077648, + "grad_norm": 0.5466295480728149, + "learning_rate": 7.392028079186906e-05, + "loss": 0.7825, + "step": 106860 + }, + { + "epoch": 0.6827619692575035, + "grad_norm": 4.8443284034729, + "learning_rate": 7.39158744466377e-05, + "loss": 1.1009, + "step": 106870 + }, + { + "epoch": 0.6828258564072422, + "grad_norm": 0.6265544891357422, + "learning_rate": 7.39114678605528e-05, + "loss": 0.8694, + "step": 106880 + }, + { + "epoch": 0.682889743556981, + "grad_norm": 0.917610764503479, + "learning_rate": 7.390706103365873e-05, 
+ "loss": 0.9782, + "step": 106890 + }, + { + "epoch": 0.6829536307067197, + "grad_norm": 0.9550445079803467, + "learning_rate": 7.390265396599987e-05, + "loss": 0.906, + "step": 106900 + }, + { + "epoch": 0.6830175178564584, + "grad_norm": 2.5587947368621826, + "learning_rate": 7.389824665762061e-05, + "loss": 0.8528, + "step": 106910 + }, + { + "epoch": 0.6830814050061971, + "grad_norm": 1.318000078201294, + "learning_rate": 7.389383910856534e-05, + "loss": 0.9362, + "step": 106920 + }, + { + "epoch": 0.6831452921559358, + "grad_norm": 1.0165103673934937, + "learning_rate": 7.388943131887842e-05, + "loss": 0.7795, + "step": 106930 + }, + { + "epoch": 0.6832091793056745, + "grad_norm": 0.9445672631263733, + "learning_rate": 7.388502328860427e-05, + "loss": 0.9833, + "step": 106940 + }, + { + "epoch": 0.6832730664554132, + "grad_norm": 1.0553864240646362, + "learning_rate": 7.388061501778727e-05, + "loss": 0.8713, + "step": 106950 + }, + { + "epoch": 0.6833369536051519, + "grad_norm": 0.913757860660553, + "learning_rate": 7.387620650647182e-05, + "loss": 0.9192, + "step": 106960 + }, + { + "epoch": 0.6834008407548906, + "grad_norm": 0.7922553420066833, + "learning_rate": 7.387179775470232e-05, + "loss": 0.8956, + "step": 106970 + }, + { + "epoch": 0.6834647279046293, + "grad_norm": 0.7192181348800659, + "learning_rate": 7.386738876252315e-05, + "loss": 0.8198, + "step": 106980 + }, + { + "epoch": 0.683528615054368, + "grad_norm": 1.0555779933929443, + "learning_rate": 7.386297952997874e-05, + "loss": 0.7005, + "step": 106990 + }, + { + "epoch": 0.6835925022041066, + "grad_norm": 1.0021594762802124, + "learning_rate": 7.385857005711348e-05, + "loss": 0.8504, + "step": 107000 + }, + { + "epoch": 0.6836563893538453, + "grad_norm": 0.5227010250091553, + "learning_rate": 7.385416034397177e-05, + "loss": 0.8899, + "step": 107010 + }, + { + "epoch": 0.683720276503584, + "grad_norm": 0.47646623849868774, + "learning_rate": 7.384975039059802e-05, + "loss": 1.0871, + "step": 107020 + }, + { + "epoch": 0.6837841636533227, + "grad_norm": 1.0652568340301514, + "learning_rate": 7.384534019703667e-05, + "loss": 0.9768, + "step": 107030 + }, + { + "epoch": 0.6838480508030614, + "grad_norm": 0.7635281682014465, + "learning_rate": 7.384092976333212e-05, + "loss": 0.957, + "step": 107040 + }, + { + "epoch": 0.6839119379528001, + "grad_norm": 0.6990230083465576, + "learning_rate": 7.383651908952877e-05, + "loss": 0.8431, + "step": 107050 + }, + { + "epoch": 0.6839758251025388, + "grad_norm": 1.1831239461898804, + "learning_rate": 7.383210817567104e-05, + "loss": 0.9295, + "step": 107060 + }, + { + "epoch": 0.6840397122522776, + "grad_norm": 0.9544264078140259, + "learning_rate": 7.382769702180339e-05, + "loss": 1.0323, + "step": 107070 + }, + { + "epoch": 0.6841035994020163, + "grad_norm": 0.7274150848388672, + "learning_rate": 7.38232856279702e-05, + "loss": 0.9134, + "step": 107080 + }, + { + "epoch": 0.684167486551755, + "grad_norm": 1.0423110723495483, + "learning_rate": 7.381887399421592e-05, + "loss": 0.9402, + "step": 107090 + }, + { + "epoch": 0.6842313737014937, + "grad_norm": 0.8618479371070862, + "learning_rate": 7.381446212058497e-05, + "loss": 0.7547, + "step": 107100 + }, + { + "epoch": 0.6842952608512324, + "grad_norm": 0.8330484628677368, + "learning_rate": 7.381005000712177e-05, + "loss": 0.8832, + "step": 107110 + }, + { + "epoch": 0.6843591480009711, + "grad_norm": 1.7487927675247192, + "learning_rate": 7.380563765387079e-05, + "loss": 0.8351, + "step": 107120 + }, + { + "epoch": 
0.6844230351507098, + "grad_norm": 1.0328443050384521, + "learning_rate": 7.380122506087644e-05, + "loss": 0.7783, + "step": 107130 + }, + { + "epoch": 0.6844869223004485, + "grad_norm": 1.1022374629974365, + "learning_rate": 7.379681222818314e-05, + "loss": 0.8898, + "step": 107140 + }, + { + "epoch": 0.6845508094501872, + "grad_norm": 1.1118669509887695, + "learning_rate": 7.379239915583538e-05, + "loss": 0.8272, + "step": 107150 + }, + { + "epoch": 0.6846146965999259, + "grad_norm": 1.1815778017044067, + "learning_rate": 7.378798584387756e-05, + "loss": 1.045, + "step": 107160 + }, + { + "epoch": 0.6846785837496646, + "grad_norm": 0.781929612159729, + "learning_rate": 7.378357229235415e-05, + "loss": 1.0828, + "step": 107170 + }, + { + "epoch": 0.6847424708994033, + "grad_norm": 0.8094179630279541, + "learning_rate": 7.37791585013096e-05, + "loss": 0.6823, + "step": 107180 + }, + { + "epoch": 0.684806358049142, + "grad_norm": 0.9121211767196655, + "learning_rate": 7.377474447078835e-05, + "loss": 0.8463, + "step": 107190 + }, + { + "epoch": 0.6848702451988807, + "grad_norm": 0.9199677109718323, + "learning_rate": 7.377033020083485e-05, + "loss": 0.9192, + "step": 107200 + }, + { + "epoch": 0.6849341323486194, + "grad_norm": 1.0086863040924072, + "learning_rate": 7.376591569149356e-05, + "loss": 0.8648, + "step": 107210 + }, + { + "epoch": 0.6849980194983581, + "grad_norm": 0.6935834288597107, + "learning_rate": 7.376150094280894e-05, + "loss": 0.8088, + "step": 107220 + }, + { + "epoch": 0.6850619066480969, + "grad_norm": 1.3548187017440796, + "learning_rate": 7.375708595482544e-05, + "loss": 0.7954, + "step": 107230 + }, + { + "epoch": 0.6851257937978356, + "grad_norm": 2.9168577194213867, + "learning_rate": 7.375267072758753e-05, + "loss": 1.0147, + "step": 107240 + }, + { + "epoch": 0.6851896809475742, + "grad_norm": 0.9866139888763428, + "learning_rate": 7.37482552611397e-05, + "loss": 0.77, + "step": 107250 + }, + { + "epoch": 0.6852535680973129, + "grad_norm": 1.5297490358352661, + "learning_rate": 7.374383955552638e-05, + "loss": 1.2862, + "step": 107260 + }, + { + "epoch": 0.6853174552470516, + "grad_norm": 0.7798259854316711, + "learning_rate": 7.373942361079204e-05, + "loss": 0.7411, + "step": 107270 + }, + { + "epoch": 0.6853813423967903, + "grad_norm": 0.7515537738800049, + "learning_rate": 7.37350074269812e-05, + "loss": 0.6391, + "step": 107280 + }, + { + "epoch": 0.685445229546529, + "grad_norm": 1.7930855751037598, + "learning_rate": 7.373059100413829e-05, + "loss": 0.8938, + "step": 107290 + }, + { + "epoch": 0.6855091166962677, + "grad_norm": 1.0468648672103882, + "learning_rate": 7.372617434230778e-05, + "loss": 0.8846, + "step": 107300 + }, + { + "epoch": 0.6855730038460064, + "grad_norm": 0.9677194952964783, + "learning_rate": 7.372175744153417e-05, + "loss": 0.879, + "step": 107310 + }, + { + "epoch": 0.6856368909957451, + "grad_norm": 0.9054749608039856, + "learning_rate": 7.371734030186195e-05, + "loss": 0.9007, + "step": 107320 + }, + { + "epoch": 0.6857007781454838, + "grad_norm": 1.1012799739837646, + "learning_rate": 7.371292292333559e-05, + "loss": 0.7437, + "step": 107330 + }, + { + "epoch": 0.6857646652952225, + "grad_norm": 0.8656480312347412, + "learning_rate": 7.370850530599959e-05, + "loss": 0.7237, + "step": 107340 + }, + { + "epoch": 0.6858285524449612, + "grad_norm": 0.986134946346283, + "learning_rate": 7.370408744989844e-05, + "loss": 0.9098, + "step": 107350 + }, + { + "epoch": 0.6858924395946999, + "grad_norm": 1.038024663925171, + 
"learning_rate": 7.36996693550766e-05, + "loss": 0.7683, + "step": 107360 + }, + { + "epoch": 0.6859563267444386, + "grad_norm": 0.9421197175979614, + "learning_rate": 7.369525102157861e-05, + "loss": 0.7816, + "step": 107370 + }, + { + "epoch": 0.6860202138941773, + "grad_norm": 0.8556358218193054, + "learning_rate": 7.369083244944893e-05, + "loss": 0.9645, + "step": 107380 + }, + { + "epoch": 0.686084101043916, + "grad_norm": 0.7408592700958252, + "learning_rate": 7.368641363873207e-05, + "loss": 0.8846, + "step": 107390 + }, + { + "epoch": 0.6861479881936547, + "grad_norm": 0.5881041288375854, + "learning_rate": 7.368199458947254e-05, + "loss": 0.7665, + "step": 107400 + }, + { + "epoch": 0.6862118753433935, + "grad_norm": 0.9732454419136047, + "learning_rate": 7.367757530171482e-05, + "loss": 1.018, + "step": 107410 + }, + { + "epoch": 0.6862757624931322, + "grad_norm": 0.4878905415534973, + "learning_rate": 7.367315577550344e-05, + "loss": 1.0164, + "step": 107420 + }, + { + "epoch": 0.6863396496428709, + "grad_norm": 0.9142529368400574, + "learning_rate": 7.366873601088291e-05, + "loss": 0.8166, + "step": 107430 + }, + { + "epoch": 0.6864035367926096, + "grad_norm": 0.7303772568702698, + "learning_rate": 7.366431600789772e-05, + "loss": 0.6688, + "step": 107440 + }, + { + "epoch": 0.6864674239423483, + "grad_norm": 0.7583977580070496, + "learning_rate": 7.36598957665924e-05, + "loss": 0.859, + "step": 107450 + }, + { + "epoch": 0.686531311092087, + "grad_norm": 0.8306979537010193, + "learning_rate": 7.365547528701146e-05, + "loss": 0.9408, + "step": 107460 + }, + { + "epoch": 0.6865951982418257, + "grad_norm": 0.9841431379318237, + "learning_rate": 7.365105456919942e-05, + "loss": 0.9479, + "step": 107470 + }, + { + "epoch": 0.6866590853915644, + "grad_norm": 0.8412874341011047, + "learning_rate": 7.364663361320081e-05, + "loss": 1.2542, + "step": 107480 + }, + { + "epoch": 0.686722972541303, + "grad_norm": 0.9620808362960815, + "learning_rate": 7.364221241906014e-05, + "loss": 1.0792, + "step": 107490 + }, + { + "epoch": 0.6867868596910417, + "grad_norm": 0.8014304637908936, + "learning_rate": 7.363779098682193e-05, + "loss": 1.1819, + "step": 107500 + }, + { + "epoch": 0.6868507468407804, + "grad_norm": 1.1913782358169556, + "learning_rate": 7.36333693165307e-05, + "loss": 1.0777, + "step": 107510 + }, + { + "epoch": 0.6869146339905191, + "grad_norm": 0.6413132548332214, + "learning_rate": 7.362894740823102e-05, + "loss": 0.9969, + "step": 107520 + }, + { + "epoch": 0.6869785211402578, + "grad_norm": 2.0043857097625732, + "learning_rate": 7.362452526196738e-05, + "loss": 0.7761, + "step": 107530 + }, + { + "epoch": 0.6870424082899965, + "grad_norm": 2.9130804538726807, + "learning_rate": 7.362010287778435e-05, + "loss": 0.9517, + "step": 107540 + }, + { + "epoch": 0.6871062954397352, + "grad_norm": 0.6536256670951843, + "learning_rate": 7.361568025572644e-05, + "loss": 0.7987, + "step": 107550 + }, + { + "epoch": 0.6871701825894739, + "grad_norm": 0.8029404878616333, + "learning_rate": 7.36112573958382e-05, + "loss": 0.8677, + "step": 107560 + }, + { + "epoch": 0.6872340697392126, + "grad_norm": 1.2548484802246094, + "learning_rate": 7.360683429816418e-05, + "loss": 0.9721, + "step": 107570 + }, + { + "epoch": 0.6872979568889513, + "grad_norm": 0.6949800848960876, + "learning_rate": 7.360241096274892e-05, + "loss": 0.7863, + "step": 107580 + }, + { + "epoch": 0.68736184403869, + "grad_norm": 0.7144826054573059, + "learning_rate": 7.359798738963694e-05, + "loss": 0.6767, + "step": 
107590 + }, + { + "epoch": 0.6874257311884288, + "grad_norm": 0.7971734404563904, + "learning_rate": 7.359356357887282e-05, + "loss": 0.7645, + "step": 107600 + }, + { + "epoch": 0.6874896183381675, + "grad_norm": 0.6574593186378479, + "learning_rate": 7.35891395305011e-05, + "loss": 1.1075, + "step": 107610 + }, + { + "epoch": 0.6875535054879062, + "grad_norm": 0.8098707795143127, + "learning_rate": 7.358471524456635e-05, + "loss": 0.9526, + "step": 107620 + }, + { + "epoch": 0.6876173926376449, + "grad_norm": 0.7118765711784363, + "learning_rate": 7.35802907211131e-05, + "loss": 0.7041, + "step": 107630 + }, + { + "epoch": 0.6876812797873836, + "grad_norm": 0.8008665442466736, + "learning_rate": 7.357586596018594e-05, + "loss": 0.8071, + "step": 107640 + }, + { + "epoch": 0.6877451669371223, + "grad_norm": 0.9328833222389221, + "learning_rate": 7.357144096182938e-05, + "loss": 1.3249, + "step": 107650 + }, + { + "epoch": 0.687809054086861, + "grad_norm": 0.6230046153068542, + "learning_rate": 7.356701572608806e-05, + "loss": 0.7683, + "step": 107660 + }, + { + "epoch": 0.6878729412365997, + "grad_norm": 0.6966734528541565, + "learning_rate": 7.356259025300646e-05, + "loss": 0.9071, + "step": 107670 + }, + { + "epoch": 0.6879368283863384, + "grad_norm": 0.8863798975944519, + "learning_rate": 7.355816454262923e-05, + "loss": 0.8069, + "step": 107680 + }, + { + "epoch": 0.6880007155360771, + "grad_norm": 0.928939700126648, + "learning_rate": 7.35537385950009e-05, + "loss": 0.8535, + "step": 107690 + }, + { + "epoch": 0.6880646026858158, + "grad_norm": 0.8435116410255432, + "learning_rate": 7.354931241016601e-05, + "loss": 0.798, + "step": 107700 + }, + { + "epoch": 0.6881284898355545, + "grad_norm": 1.1882624626159668, + "learning_rate": 7.35448859881692e-05, + "loss": 1.0083, + "step": 107710 + }, + { + "epoch": 0.6881923769852932, + "grad_norm": 0.8240717053413391, + "learning_rate": 7.3540459329055e-05, + "loss": 0.8051, + "step": 107720 + }, + { + "epoch": 0.6882562641350318, + "grad_norm": 0.9132935404777527, + "learning_rate": 7.353603243286805e-05, + "loss": 0.9164, + "step": 107730 + }, + { + "epoch": 0.6883201512847705, + "grad_norm": 0.9722372889518738, + "learning_rate": 7.353160529965285e-05, + "loss": 0.9007, + "step": 107740 + }, + { + "epoch": 0.6883840384345092, + "grad_norm": 0.7652561068534851, + "learning_rate": 7.352717792945404e-05, + "loss": 0.9988, + "step": 107750 + }, + { + "epoch": 0.6884479255842479, + "grad_norm": 1.0295495986938477, + "learning_rate": 7.352275032231619e-05, + "loss": 1.0438, + "step": 107760 + }, + { + "epoch": 0.6885118127339867, + "grad_norm": 1.0043138265609741, + "learning_rate": 7.351832247828391e-05, + "loss": 0.8635, + "step": 107770 + }, + { + "epoch": 0.6885756998837254, + "grad_norm": 0.9536296129226685, + "learning_rate": 7.351389439740176e-05, + "loss": 0.7845, + "step": 107780 + }, + { + "epoch": 0.6886395870334641, + "grad_norm": 1.182599663734436, + "learning_rate": 7.350946607971436e-05, + "loss": 0.9473, + "step": 107790 + }, + { + "epoch": 0.6887034741832028, + "grad_norm": 0.9443296194076538, + "learning_rate": 7.35050375252663e-05, + "loss": 1.12, + "step": 107800 + }, + { + "epoch": 0.6887673613329415, + "grad_norm": 1.2377766370773315, + "learning_rate": 7.350060873410216e-05, + "loss": 0.6376, + "step": 107810 + }, + { + "epoch": 0.6888312484826802, + "grad_norm": 1.1331062316894531, + "learning_rate": 7.349617970626658e-05, + "loss": 0.8585, + "step": 107820 + }, + { + "epoch": 0.6888951356324189, + "grad_norm": 
0.9837049245834351, + "learning_rate": 7.349175044180414e-05, + "loss": 0.7217, + "step": 107830 + }, + { + "epoch": 0.6889590227821576, + "grad_norm": 0.4539640545845032, + "learning_rate": 7.348732094075942e-05, + "loss": 0.6076, + "step": 107840 + }, + { + "epoch": 0.6890229099318963, + "grad_norm": 0.9993829131126404, + "learning_rate": 7.348289120317709e-05, + "loss": 0.7641, + "step": 107850 + }, + { + "epoch": 0.689086797081635, + "grad_norm": 0.9905250072479248, + "learning_rate": 7.347846122910174e-05, + "loss": 0.6454, + "step": 107860 + }, + { + "epoch": 0.6891506842313737, + "grad_norm": 0.8237646818161011, + "learning_rate": 7.347403101857795e-05, + "loss": 0.8458, + "step": 107870 + }, + { + "epoch": 0.6892145713811124, + "grad_norm": 1.0233882665634155, + "learning_rate": 7.346960057165036e-05, + "loss": 0.8326, + "step": 107880 + }, + { + "epoch": 0.6892784585308511, + "grad_norm": 1.116274356842041, + "learning_rate": 7.34651698883636e-05, + "loss": 0.8269, + "step": 107890 + }, + { + "epoch": 0.6893423456805898, + "grad_norm": 0.6511923670768738, + "learning_rate": 7.346073896876227e-05, + "loss": 1.2329, + "step": 107900 + }, + { + "epoch": 0.6894062328303285, + "grad_norm": 1.0535688400268555, + "learning_rate": 7.345630781289102e-05, + "loss": 0.8788, + "step": 107910 + }, + { + "epoch": 0.6894701199800672, + "grad_norm": 1.045600175857544, + "learning_rate": 7.345187642079443e-05, + "loss": 0.8773, + "step": 107920 + }, + { + "epoch": 0.689534007129806, + "grad_norm": 1.1185442209243774, + "learning_rate": 7.344744479251717e-05, + "loss": 0.7012, + "step": 107930 + }, + { + "epoch": 0.6895978942795447, + "grad_norm": 0.8849347233772278, + "learning_rate": 7.344301292810385e-05, + "loss": 0.8627, + "step": 107940 + }, + { + "epoch": 0.6896617814292834, + "grad_norm": 0.8291599154472351, + "learning_rate": 7.343858082759912e-05, + "loss": 0.8959, + "step": 107950 + }, + { + "epoch": 0.6897256685790221, + "grad_norm": 0.6584329009056091, + "learning_rate": 7.34341484910476e-05, + "loss": 0.7189, + "step": 107960 + }, + { + "epoch": 0.6897895557287607, + "grad_norm": 1.2374427318572998, + "learning_rate": 7.342971591849393e-05, + "loss": 0.9428, + "step": 107970 + }, + { + "epoch": 0.6898534428784994, + "grad_norm": 0.8575314879417419, + "learning_rate": 7.342528310998275e-05, + "loss": 0.759, + "step": 107980 + }, + { + "epoch": 0.6899173300282381, + "grad_norm": 0.7263084650039673, + "learning_rate": 7.34208500655587e-05, + "loss": 0.7271, + "step": 107990 + }, + { + "epoch": 0.6899812171779768, + "grad_norm": 1.145310401916504, + "learning_rate": 7.341641678526643e-05, + "loss": 1.1193, + "step": 108000 + }, + { + "epoch": 0.6900451043277155, + "grad_norm": 1.2653499841690063, + "learning_rate": 7.341198326915057e-05, + "loss": 0.8146, + "step": 108010 + }, + { + "epoch": 0.6901089914774542, + "grad_norm": 0.6225971579551697, + "learning_rate": 7.340754951725582e-05, + "loss": 0.7708, + "step": 108020 + }, + { + "epoch": 0.6901728786271929, + "grad_norm": 2.235273838043213, + "learning_rate": 7.340311552962676e-05, + "loss": 0.8989, + "step": 108030 + }, + { + "epoch": 0.6902367657769316, + "grad_norm": 0.8102111220359802, + "learning_rate": 7.33986813063081e-05, + "loss": 1.1533, + "step": 108040 + }, + { + "epoch": 0.6903006529266703, + "grad_norm": 0.7722830772399902, + "learning_rate": 7.339424684734447e-05, + "loss": 1.0018, + "step": 108050 + }, + { + "epoch": 0.690364540076409, + "grad_norm": 0.7864007949829102, + "learning_rate": 7.338981215278055e-05, + 
"loss": 1.0525, + "step": 108060 + }, + { + "epoch": 0.6904284272261477, + "grad_norm": 0.6729293465614319, + "learning_rate": 7.338537722266097e-05, + "loss": 0.8472, + "step": 108070 + }, + { + "epoch": 0.6904923143758864, + "grad_norm": 0.7282936573028564, + "learning_rate": 7.338094205703043e-05, + "loss": 0.9557, + "step": 108080 + }, + { + "epoch": 0.6905562015256251, + "grad_norm": 1.0277268886566162, + "learning_rate": 7.337650665593355e-05, + "loss": 0.93, + "step": 108090 + }, + { + "epoch": 0.6906200886753638, + "grad_norm": 2.55513334274292, + "learning_rate": 7.337207101941503e-05, + "loss": 0.796, + "step": 108100 + }, + { + "epoch": 0.6906839758251025, + "grad_norm": 0.7738178968429565, + "learning_rate": 7.336763514751954e-05, + "loss": 0.8795, + "step": 108110 + }, + { + "epoch": 0.6907478629748413, + "grad_norm": 0.9889559149742126, + "learning_rate": 7.336319904029176e-05, + "loss": 0.848, + "step": 108120 + }, + { + "epoch": 0.69081175012458, + "grad_norm": 1.2246037721633911, + "learning_rate": 7.335876269777634e-05, + "loss": 0.8715, + "step": 108130 + }, + { + "epoch": 0.6908756372743187, + "grad_norm": 0.899691641330719, + "learning_rate": 7.335432612001798e-05, + "loss": 1.013, + "step": 108140 + }, + { + "epoch": 0.6909395244240574, + "grad_norm": 0.9258847236633301, + "learning_rate": 7.334988930706133e-05, + "loss": 0.8774, + "step": 108150 + }, + { + "epoch": 0.6910034115737961, + "grad_norm": 0.825404167175293, + "learning_rate": 7.334545225895111e-05, + "loss": 0.6752, + "step": 108160 + }, + { + "epoch": 0.6910672987235348, + "grad_norm": 0.6678471565246582, + "learning_rate": 7.334101497573199e-05, + "loss": 0.7239, + "step": 108170 + }, + { + "epoch": 0.6911311858732735, + "grad_norm": 0.8919599056243896, + "learning_rate": 7.333657745744866e-05, + "loss": 0.8604, + "step": 108180 + }, + { + "epoch": 0.6911950730230122, + "grad_norm": 0.4956168234348297, + "learning_rate": 7.333213970414579e-05, + "loss": 0.8364, + "step": 108190 + }, + { + "epoch": 0.6912589601727509, + "grad_norm": 2.8205111026763916, + "learning_rate": 7.332770171586811e-05, + "loss": 0.7711, + "step": 108200 + }, + { + "epoch": 0.6913228473224896, + "grad_norm": 0.9555968046188354, + "learning_rate": 7.332326349266028e-05, + "loss": 0.8765, + "step": 108210 + }, + { + "epoch": 0.6913867344722282, + "grad_norm": 0.928036093711853, + "learning_rate": 7.331882503456701e-05, + "loss": 1.0052, + "step": 108220 + }, + { + "epoch": 0.6914506216219669, + "grad_norm": 0.8674328923225403, + "learning_rate": 7.331438634163298e-05, + "loss": 0.7707, + "step": 108230 + }, + { + "epoch": 0.6915145087717056, + "grad_norm": 0.8306328058242798, + "learning_rate": 7.330994741390293e-05, + "loss": 0.8573, + "step": 108240 + }, + { + "epoch": 0.6915783959214443, + "grad_norm": 1.346864938735962, + "learning_rate": 7.330550825142156e-05, + "loss": 0.7394, + "step": 108250 + }, + { + "epoch": 0.691642283071183, + "grad_norm": 1.4455012083053589, + "learning_rate": 7.330106885423353e-05, + "loss": 0.8614, + "step": 108260 + }, + { + "epoch": 0.6917061702209217, + "grad_norm": 0.7791756391525269, + "learning_rate": 7.32966292223836e-05, + "loss": 0.7627, + "step": 108270 + }, + { + "epoch": 0.6917700573706604, + "grad_norm": 0.8995997905731201, + "learning_rate": 7.329218935591645e-05, + "loss": 0.8276, + "step": 108280 + }, + { + "epoch": 0.6918339445203991, + "grad_norm": 0.9824413657188416, + "learning_rate": 7.328774925487679e-05, + "loss": 0.9905, + "step": 108290 + }, + { + "epoch": 0.6918978316701379, 
+ "grad_norm": 0.9453624486923218, + "learning_rate": 7.328330891930937e-05, + "loss": 0.9079, + "step": 108300 + }, + { + "epoch": 0.6919617188198766, + "grad_norm": 0.9004096388816833, + "learning_rate": 7.327886834925888e-05, + "loss": 0.9236, + "step": 108310 + }, + { + "epoch": 0.6920256059696153, + "grad_norm": 0.7478508353233337, + "learning_rate": 7.327442754477003e-05, + "loss": 0.8575, + "step": 108320 + }, + { + "epoch": 0.692089493119354, + "grad_norm": 2.181452751159668, + "learning_rate": 7.326998650588758e-05, + "loss": 0.7738, + "step": 108330 + }, + { + "epoch": 0.6921533802690927, + "grad_norm": 1.4748575687408447, + "learning_rate": 7.326554523265624e-05, + "loss": 1.3507, + "step": 108340 + }, + { + "epoch": 0.6922172674188314, + "grad_norm": 1.0010013580322266, + "learning_rate": 7.326110372512071e-05, + "loss": 0.8854, + "step": 108350 + }, + { + "epoch": 0.6922811545685701, + "grad_norm": 0.87949138879776, + "learning_rate": 7.325666198332575e-05, + "loss": 0.746, + "step": 108360 + }, + { + "epoch": 0.6923450417183088, + "grad_norm": 0.8844693303108215, + "learning_rate": 7.325222000731609e-05, + "loss": 0.9919, + "step": 108370 + }, + { + "epoch": 0.6924089288680475, + "grad_norm": 1.2705687284469604, + "learning_rate": 7.324777779713644e-05, + "loss": 0.9765, + "step": 108380 + }, + { + "epoch": 0.6924728160177862, + "grad_norm": 0.8071838021278381, + "learning_rate": 7.324333535283157e-05, + "loss": 0.837, + "step": 108390 + }, + { + "epoch": 0.6925367031675249, + "grad_norm": 0.9001646637916565, + "learning_rate": 7.323889267444621e-05, + "loss": 0.846, + "step": 108400 + }, + { + "epoch": 0.6926005903172636, + "grad_norm": 0.9376798272132874, + "learning_rate": 7.323444976202508e-05, + "loss": 0.7456, + "step": 108410 + }, + { + "epoch": 0.6926644774670023, + "grad_norm": 0.8280836939811707, + "learning_rate": 7.323000661561295e-05, + "loss": 0.9753, + "step": 108420 + }, + { + "epoch": 0.692728364616741, + "grad_norm": 1.4879751205444336, + "learning_rate": 7.322556323525456e-05, + "loss": 0.9096, + "step": 108430 + }, + { + "epoch": 0.6927922517664797, + "grad_norm": 1.0255200862884521, + "learning_rate": 7.322111962099465e-05, + "loss": 0.9377, + "step": 108440 + }, + { + "epoch": 0.6928561389162184, + "grad_norm": 0.9533114433288574, + "learning_rate": 7.321667577287799e-05, + "loss": 0.7927, + "step": 108450 + }, + { + "epoch": 0.692920026065957, + "grad_norm": 0.7866392731666565, + "learning_rate": 7.32122316909493e-05, + "loss": 0.9866, + "step": 108460 + }, + { + "epoch": 0.6929839132156957, + "grad_norm": 1.0992743968963623, + "learning_rate": 7.320778737525335e-05, + "loss": 0.7761, + "step": 108470 + }, + { + "epoch": 0.6930478003654345, + "grad_norm": 0.9191528558731079, + "learning_rate": 7.320334282583492e-05, + "loss": 0.8788, + "step": 108480 + }, + { + "epoch": 0.6931116875151732, + "grad_norm": 1.2555981874465942, + "learning_rate": 7.319889804273876e-05, + "loss": 0.9633, + "step": 108490 + }, + { + "epoch": 0.6931755746649119, + "grad_norm": 0.8771397471427917, + "learning_rate": 7.319445302600961e-05, + "loss": 0.615, + "step": 108500 + }, + { + "epoch": 0.6932394618146506, + "grad_norm": 0.714777946472168, + "learning_rate": 7.319000777569226e-05, + "loss": 0.7238, + "step": 108510 + }, + { + "epoch": 0.6933033489643893, + "grad_norm": 1.2296061515808105, + "learning_rate": 7.318556229183146e-05, + "loss": 0.7767, + "step": 108520 + }, + { + "epoch": 0.693367236114128, + "grad_norm": 0.7856013178825378, + "learning_rate": 
7.3181116574472e-05, + "loss": 0.692, + "step": 108530 + }, + { + "epoch": 0.6934311232638667, + "grad_norm": 0.9102780818939209, + "learning_rate": 7.317667062365863e-05, + "loss": 0.9865, + "step": 108540 + }, + { + "epoch": 0.6934950104136054, + "grad_norm": 1.0297400951385498, + "learning_rate": 7.317222443943616e-05, + "loss": 0.9191, + "step": 108550 + }, + { + "epoch": 0.6935588975633441, + "grad_norm": 1.809927225112915, + "learning_rate": 7.316777802184934e-05, + "loss": 1.084, + "step": 108560 + }, + { + "epoch": 0.6936227847130828, + "grad_norm": 2.1884663105010986, + "learning_rate": 7.316333137094294e-05, + "loss": 0.8257, + "step": 108570 + }, + { + "epoch": 0.6936866718628215, + "grad_norm": 0.8382952213287354, + "learning_rate": 7.315888448676175e-05, + "loss": 0.8348, + "step": 108580 + }, + { + "epoch": 0.6937505590125602, + "grad_norm": 0.7834774851799011, + "learning_rate": 7.315443736935056e-05, + "loss": 0.8987, + "step": 108590 + }, + { + "epoch": 0.6938144461622989, + "grad_norm": 0.710081934928894, + "learning_rate": 7.314999001875415e-05, + "loss": 0.6713, + "step": 108600 + }, + { + "epoch": 0.6938783333120376, + "grad_norm": 0.9444938898086548, + "learning_rate": 7.314554243501732e-05, + "loss": 0.9177, + "step": 108610 + }, + { + "epoch": 0.6939422204617763, + "grad_norm": 0.6890098452568054, + "learning_rate": 7.314109461818485e-05, + "loss": 0.9145, + "step": 108620 + }, + { + "epoch": 0.694006107611515, + "grad_norm": 0.9023224115371704, + "learning_rate": 7.313664656830154e-05, + "loss": 1.0199, + "step": 108630 + }, + { + "epoch": 0.6940699947612538, + "grad_norm": 0.6425119638442993, + "learning_rate": 7.31321982854122e-05, + "loss": 0.862, + "step": 108640 + }, + { + "epoch": 0.6941338819109925, + "grad_norm": 1.188393473625183, + "learning_rate": 7.312774976956159e-05, + "loss": 0.801, + "step": 108650 + }, + { + "epoch": 0.6941977690607312, + "grad_norm": 0.7165592908859253, + "learning_rate": 7.312330102079454e-05, + "loss": 1.3727, + "step": 108660 + }, + { + "epoch": 0.6942616562104699, + "grad_norm": 0.6589129567146301, + "learning_rate": 7.311885203915585e-05, + "loss": 0.8308, + "step": 108670 + }, + { + "epoch": 0.6943255433602086, + "grad_norm": 1.0794988870620728, + "learning_rate": 7.31144028246903e-05, + "loss": 0.9766, + "step": 108680 + }, + { + "epoch": 0.6943894305099473, + "grad_norm": 1.6722362041473389, + "learning_rate": 7.310995337744271e-05, + "loss": 0.8217, + "step": 108690 + }, + { + "epoch": 0.6944533176596859, + "grad_norm": 2.617365598678589, + "learning_rate": 7.310550369745793e-05, + "loss": 0.8649, + "step": 108700 + }, + { + "epoch": 0.6945172048094246, + "grad_norm": 1.0052344799041748, + "learning_rate": 7.310105378478071e-05, + "loss": 0.8908, + "step": 108710 + }, + { + "epoch": 0.6945810919591633, + "grad_norm": 0.6017476320266724, + "learning_rate": 7.309660363945592e-05, + "loss": 0.8932, + "step": 108720 + }, + { + "epoch": 0.694644979108902, + "grad_norm": 1.1323217153549194, + "learning_rate": 7.309215326152833e-05, + "loss": 0.9389, + "step": 108730 + }, + { + "epoch": 0.6947088662586407, + "grad_norm": 1.0148589611053467, + "learning_rate": 7.308770265104279e-05, + "loss": 0.8976, + "step": 108740 + }, + { + "epoch": 0.6947727534083794, + "grad_norm": 1.195841670036316, + "learning_rate": 7.30832518080441e-05, + "loss": 1.0468, + "step": 108750 + }, + { + "epoch": 0.6948366405581181, + "grad_norm": 2.77616810798645, + "learning_rate": 7.307880073257711e-05, + "loss": 0.8265, + "step": 108760 + }, + { + 
"epoch": 0.6949005277078568, + "grad_norm": 0.820035457611084, + "learning_rate": 7.30743494246866e-05, + "loss": 1.0138, + "step": 108770 + }, + { + "epoch": 0.6949644148575955, + "grad_norm": 0.768181324005127, + "learning_rate": 7.306989788441747e-05, + "loss": 0.896, + "step": 108780 + }, + { + "epoch": 0.6950283020073342, + "grad_norm": 0.9276620745658875, + "learning_rate": 7.306544611181449e-05, + "loss": 0.9899, + "step": 108790 + }, + { + "epoch": 0.6950921891570729, + "grad_norm": 1.3727481365203857, + "learning_rate": 7.306099410692251e-05, + "loss": 0.9883, + "step": 108800 + }, + { + "epoch": 0.6951560763068116, + "grad_norm": 0.6537569165229797, + "learning_rate": 7.305654186978636e-05, + "loss": 0.7696, + "step": 108810 + }, + { + "epoch": 0.6952199634565503, + "grad_norm": 0.8590995669364929, + "learning_rate": 7.30520894004509e-05, + "loss": 0.9809, + "step": 108820 + }, + { + "epoch": 0.6952838506062891, + "grad_norm": 0.9551057815551758, + "learning_rate": 7.304763669896096e-05, + "loss": 0.9619, + "step": 108830 + }, + { + "epoch": 0.6953477377560278, + "grad_norm": 0.8596848845481873, + "learning_rate": 7.304318376536138e-05, + "loss": 0.8957, + "step": 108840 + }, + { + "epoch": 0.6954116249057665, + "grad_norm": 1.1509318351745605, + "learning_rate": 7.3038730599697e-05, + "loss": 0.6727, + "step": 108850 + }, + { + "epoch": 0.6954755120555052, + "grad_norm": 0.5256636142730713, + "learning_rate": 7.303427720201265e-05, + "loss": 0.7634, + "step": 108860 + }, + { + "epoch": 0.6955393992052439, + "grad_norm": 0.8332456350326538, + "learning_rate": 7.302982357235323e-05, + "loss": 1.3683, + "step": 108870 + }, + { + "epoch": 0.6956032863549826, + "grad_norm": 0.7100444436073303, + "learning_rate": 7.302536971076355e-05, + "loss": 0.8936, + "step": 108880 + }, + { + "epoch": 0.6956671735047213, + "grad_norm": 1.0301616191864014, + "learning_rate": 7.302091561728848e-05, + "loss": 0.784, + "step": 108890 + }, + { + "epoch": 0.69573106065446, + "grad_norm": 0.8167005777359009, + "learning_rate": 7.301646129197289e-05, + "loss": 1.0153, + "step": 108900 + }, + { + "epoch": 0.6957949478041987, + "grad_norm": 0.6708621382713318, + "learning_rate": 7.30120067348616e-05, + "loss": 0.9037, + "step": 108910 + }, + { + "epoch": 0.6958588349539374, + "grad_norm": 1.8930144309997559, + "learning_rate": 7.30075519459995e-05, + "loss": 0.9704, + "step": 108920 + }, + { + "epoch": 0.6959227221036761, + "grad_norm": 0.9844603538513184, + "learning_rate": 7.300309692543145e-05, + "loss": 1.0861, + "step": 108930 + }, + { + "epoch": 0.6959866092534148, + "grad_norm": 0.9566649198532104, + "learning_rate": 7.299864167320232e-05, + "loss": 1.0209, + "step": 108940 + }, + { + "epoch": 0.6960504964031534, + "grad_norm": 0.9092232584953308, + "learning_rate": 7.299418618935695e-05, + "loss": 1.0676, + "step": 108950 + }, + { + "epoch": 0.6961143835528921, + "grad_norm": 0.7573904395103455, + "learning_rate": 7.298973047394025e-05, + "loss": 0.7415, + "step": 108960 + }, + { + "epoch": 0.6961782707026308, + "grad_norm": 1.1252961158752441, + "learning_rate": 7.298527452699708e-05, + "loss": 1.0561, + "step": 108970 + }, + { + "epoch": 0.6962421578523695, + "grad_norm": 0.7041053175926208, + "learning_rate": 7.298081834857229e-05, + "loss": 0.9674, + "step": 108980 + }, + { + "epoch": 0.6963060450021082, + "grad_norm": 0.9071682095527649, + "learning_rate": 7.29763619387108e-05, + "loss": 0.9063, + "step": 108990 + }, + { + "epoch": 0.696369932151847, + "grad_norm": 0.698070228099823, + 
"learning_rate": 7.297190529745746e-05, + "loss": 0.8875, + "step": 109000 + }, + { + "epoch": 0.6964338193015857, + "grad_norm": 0.9515412449836731, + "learning_rate": 7.296744842485715e-05, + "loss": 0.8703, + "step": 109010 + }, + { + "epoch": 0.6964977064513244, + "grad_norm": 1.2427845001220703, + "learning_rate": 7.296299132095478e-05, + "loss": 0.9569, + "step": 109020 + }, + { + "epoch": 0.6965615936010631, + "grad_norm": 0.5841128826141357, + "learning_rate": 7.295853398579521e-05, + "loss": 0.9137, + "step": 109030 + }, + { + "epoch": 0.6966254807508018, + "grad_norm": 0.5396087765693665, + "learning_rate": 7.295407641942334e-05, + "loss": 0.7979, + "step": 109040 + }, + { + "epoch": 0.6966893679005405, + "grad_norm": 0.7131836414337158, + "learning_rate": 7.294961862188407e-05, + "loss": 1.0448, + "step": 109050 + }, + { + "epoch": 0.6967532550502792, + "grad_norm": 1.0554966926574707, + "learning_rate": 7.29451605932223e-05, + "loss": 0.92, + "step": 109060 + }, + { + "epoch": 0.6968171422000179, + "grad_norm": 0.7954362630844116, + "learning_rate": 7.294070233348289e-05, + "loss": 0.841, + "step": 109070 + }, + { + "epoch": 0.6968810293497566, + "grad_norm": 0.8883830308914185, + "learning_rate": 7.293624384271076e-05, + "loss": 0.7748, + "step": 109080 + }, + { + "epoch": 0.6969449164994953, + "grad_norm": 1.4885032176971436, + "learning_rate": 7.293178512095082e-05, + "loss": 0.8115, + "step": 109090 + }, + { + "epoch": 0.697008803649234, + "grad_norm": 0.9093277454376221, + "learning_rate": 7.292732616824797e-05, + "loss": 0.8182, + "step": 109100 + }, + { + "epoch": 0.6970726907989727, + "grad_norm": 0.9241993427276611, + "learning_rate": 7.29228669846471e-05, + "loss": 0.7127, + "step": 109110 + }, + { + "epoch": 0.6971365779487114, + "grad_norm": 0.6447529792785645, + "learning_rate": 7.291840757019314e-05, + "loss": 0.8501, + "step": 109120 + }, + { + "epoch": 0.6972004650984501, + "grad_norm": 0.7052245736122131, + "learning_rate": 7.291394792493098e-05, + "loss": 0.947, + "step": 109130 + }, + { + "epoch": 0.6972643522481888, + "grad_norm": 1.3450639247894287, + "learning_rate": 7.290948804890555e-05, + "loss": 0.7365, + "step": 109140 + }, + { + "epoch": 0.6973282393979275, + "grad_norm": 0.5776755213737488, + "learning_rate": 7.290502794216173e-05, + "loss": 0.7275, + "step": 109150 + }, + { + "epoch": 0.6973921265476662, + "grad_norm": 0.8304409980773926, + "learning_rate": 7.290056760474448e-05, + "loss": 0.9387, + "step": 109160 + }, + { + "epoch": 0.697456013697405, + "grad_norm": 0.8991537690162659, + "learning_rate": 7.289610703669872e-05, + "loss": 0.7778, + "step": 109170 + }, + { + "epoch": 0.6975199008471437, + "grad_norm": 0.8365470170974731, + "learning_rate": 7.289164623806933e-05, + "loss": 0.8706, + "step": 109180 + }, + { + "epoch": 0.6975837879968823, + "grad_norm": 0.855769157409668, + "learning_rate": 7.288718520890127e-05, + "loss": 0.7282, + "step": 109190 + }, + { + "epoch": 0.697647675146621, + "grad_norm": 0.7348789572715759, + "learning_rate": 7.288272394923945e-05, + "loss": 1.0745, + "step": 109200 + }, + { + "epoch": 0.6977115622963597, + "grad_norm": 1.0957111120224, + "learning_rate": 7.287826245912879e-05, + "loss": 0.7343, + "step": 109210 + }, + { + "epoch": 0.6977754494460984, + "grad_norm": 0.8726381063461304, + "learning_rate": 7.287380073861425e-05, + "loss": 1.0231, + "step": 109220 + }, + { + "epoch": 0.6978393365958371, + "grad_norm": 0.6815057992935181, + "learning_rate": 7.286933878774075e-05, + "loss": 0.8475, + "step": 
109230 + }, + { + "epoch": 0.6979032237455758, + "grad_norm": 1.1125048398971558, + "learning_rate": 7.286487660655323e-05, + "loss": 0.8779, + "step": 109240 + }, + { + "epoch": 0.6979671108953145, + "grad_norm": 0.725688636302948, + "learning_rate": 7.28604141950966e-05, + "loss": 0.9028, + "step": 109250 + }, + { + "epoch": 0.6980309980450532, + "grad_norm": 0.8986996412277222, + "learning_rate": 7.285595155341583e-05, + "loss": 0.9237, + "step": 109260 + }, + { + "epoch": 0.6980948851947919, + "grad_norm": 0.9736185073852539, + "learning_rate": 7.285148868155587e-05, + "loss": 0.8967, + "step": 109270 + }, + { + "epoch": 0.6981587723445306, + "grad_norm": 1.0567455291748047, + "learning_rate": 7.284702557956165e-05, + "loss": 1.0126, + "step": 109280 + }, + { + "epoch": 0.6982226594942693, + "grad_norm": 1.032707691192627, + "learning_rate": 7.28425622474781e-05, + "loss": 1.0844, + "step": 109290 + }, + { + "epoch": 0.698286546644008, + "grad_norm": 0.6320337653160095, + "learning_rate": 7.283809868535018e-05, + "loss": 0.745, + "step": 109300 + }, + { + "epoch": 0.6983504337937467, + "grad_norm": 0.7750630974769592, + "learning_rate": 7.283363489322287e-05, + "loss": 1.0077, + "step": 109310 + }, + { + "epoch": 0.6984143209434854, + "grad_norm": 0.7525535821914673, + "learning_rate": 7.282917087114109e-05, + "loss": 0.9631, + "step": 109320 + }, + { + "epoch": 0.6984782080932241, + "grad_norm": 0.903925895690918, + "learning_rate": 7.282470661914982e-05, + "loss": 1.0631, + "step": 109330 + }, + { + "epoch": 0.6985420952429628, + "grad_norm": 0.6858085989952087, + "learning_rate": 7.282024213729399e-05, + "loss": 0.9775, + "step": 109340 + }, + { + "epoch": 0.6986059823927016, + "grad_norm": 1.176261067390442, + "learning_rate": 7.28157774256186e-05, + "loss": 0.8382, + "step": 109350 + }, + { + "epoch": 0.6986698695424403, + "grad_norm": 0.7239077091217041, + "learning_rate": 7.281131248416858e-05, + "loss": 0.8858, + "step": 109360 + }, + { + "epoch": 0.698733756692179, + "grad_norm": 1.3246084451675415, + "learning_rate": 7.280684731298892e-05, + "loss": 0.8572, + "step": 109370 + }, + { + "epoch": 0.6987976438419177, + "grad_norm": 0.7234712839126587, + "learning_rate": 7.280238191212455e-05, + "loss": 0.7359, + "step": 109380 + }, + { + "epoch": 0.6988615309916564, + "grad_norm": 1.1668168306350708, + "learning_rate": 7.27979162816205e-05, + "loss": 0.8897, + "step": 109390 + }, + { + "epoch": 0.6989254181413951, + "grad_norm": 0.9035739302635193, + "learning_rate": 7.279345042152167e-05, + "loss": 0.8598, + "step": 109400 + }, + { + "epoch": 0.6989893052911338, + "grad_norm": 0.9039598107337952, + "learning_rate": 7.278898433187311e-05, + "loss": 0.9865, + "step": 109410 + }, + { + "epoch": 0.6990531924408725, + "grad_norm": 0.9996391534805298, + "learning_rate": 7.278451801271975e-05, + "loss": 0.7356, + "step": 109420 + }, + { + "epoch": 0.6991170795906111, + "grad_norm": 0.8987241983413696, + "learning_rate": 7.27800514641066e-05, + "loss": 0.699, + "step": 109430 + }, + { + "epoch": 0.6991809667403498, + "grad_norm": 0.9513826370239258, + "learning_rate": 7.27755846860786e-05, + "loss": 1.0641, + "step": 109440 + }, + { + "epoch": 0.6992448538900885, + "grad_norm": 1.103652000427246, + "learning_rate": 7.277111767868076e-05, + "loss": 0.7386, + "step": 109450 + }, + { + "epoch": 0.6993087410398272, + "grad_norm": 0.7837316393852234, + "learning_rate": 7.276665044195808e-05, + "loss": 0.8191, + "step": 109460 + }, + { + "epoch": 0.6993726281895659, + "grad_norm": 
0.8951888680458069, + "learning_rate": 7.276218297595553e-05, + "loss": 1.0341, + "step": 109470 + }, + { + "epoch": 0.6994365153393046, + "grad_norm": 0.8686550259590149, + "learning_rate": 7.275771528071811e-05, + "loss": 0.9451, + "step": 109480 + }, + { + "epoch": 0.6995004024890433, + "grad_norm": 0.7066924571990967, + "learning_rate": 7.27532473562908e-05, + "loss": 0.867, + "step": 109490 + }, + { + "epoch": 0.699564289638782, + "grad_norm": 1.1765514612197876, + "learning_rate": 7.274877920271861e-05, + "loss": 0.8446, + "step": 109500 + }, + { + "epoch": 0.6996281767885207, + "grad_norm": 1.2923158407211304, + "learning_rate": 7.274431082004652e-05, + "loss": 0.6812, + "step": 109510 + }, + { + "epoch": 0.6996920639382594, + "grad_norm": 1.4523509740829468, + "learning_rate": 7.273984220831956e-05, + "loss": 0.7639, + "step": 109520 + }, + { + "epoch": 0.6997559510879982, + "grad_norm": 1.0498130321502686, + "learning_rate": 7.273537336758272e-05, + "loss": 0.6902, + "step": 109530 + }, + { + "epoch": 0.6998198382377369, + "grad_norm": 0.7311170697212219, + "learning_rate": 7.273090429788098e-05, + "loss": 0.7766, + "step": 109540 + }, + { + "epoch": 0.6998837253874756, + "grad_norm": 1.5511587858200073, + "learning_rate": 7.272643499925937e-05, + "loss": 0.8909, + "step": 109550 + }, + { + "epoch": 0.6999476125372143, + "grad_norm": 0.810479998588562, + "learning_rate": 7.27219654717629e-05, + "loss": 1.0596, + "step": 109560 + }, + { + "epoch": 0.700011499686953, + "grad_norm": 0.8483265042304993, + "learning_rate": 7.27174957154366e-05, + "loss": 0.5923, + "step": 109570 + }, + { + "epoch": 0.7000753868366917, + "grad_norm": 1.1115506887435913, + "learning_rate": 7.271302573032546e-05, + "loss": 0.7011, + "step": 109580 + }, + { + "epoch": 0.7001392739864304, + "grad_norm": 0.8002986907958984, + "learning_rate": 7.270855551647449e-05, + "loss": 1.025, + "step": 109590 + }, + { + "epoch": 0.7002031611361691, + "grad_norm": 0.8855366110801697, + "learning_rate": 7.270408507392872e-05, + "loss": 0.7358, + "step": 109600 + }, + { + "epoch": 0.7002670482859078, + "grad_norm": 1.6254435777664185, + "learning_rate": 7.269961440273317e-05, + "loss": 1.0024, + "step": 109610 + }, + { + "epoch": 0.7003309354356465, + "grad_norm": 0.809699535369873, + "learning_rate": 7.269514350293287e-05, + "loss": 0.8733, + "step": 109620 + }, + { + "epoch": 0.7003948225853852, + "grad_norm": 1.2254425287246704, + "learning_rate": 7.269111949769275e-05, + "loss": 1.0453, + "step": 109630 + }, + { + "epoch": 0.7004587097351239, + "grad_norm": 0.8027240037918091, + "learning_rate": 7.268664816366747e-05, + "loss": 0.7901, + "step": 109640 + }, + { + "epoch": 0.7005225968848626, + "grad_norm": 0.5763524770736694, + "learning_rate": 7.268217660116801e-05, + "loss": 0.9035, + "step": 109650 + }, + { + "epoch": 0.7005864840346013, + "grad_norm": 0.8503153920173645, + "learning_rate": 7.267770481023941e-05, + "loss": 1.1969, + "step": 109660 + }, + { + "epoch": 0.70065037118434, + "grad_norm": 1.0496258735656738, + "learning_rate": 7.26732327909267e-05, + "loss": 0.8281, + "step": 109670 + }, + { + "epoch": 0.7007142583340786, + "grad_norm": 0.9994955658912659, + "learning_rate": 7.266876054327491e-05, + "loss": 1.0602, + "step": 109680 + }, + { + "epoch": 0.7007781454838173, + "grad_norm": 0.8883937001228333, + "learning_rate": 7.266428806732913e-05, + "loss": 1.1059, + "step": 109690 + }, + { + "epoch": 0.700842032633556, + "grad_norm": 0.8434162735939026, + "learning_rate": 7.265981536313432e-05, + "loss": 
0.7179, + "step": 109700 + }, + { + "epoch": 0.7009059197832948, + "grad_norm": 0.8952722549438477, + "learning_rate": 7.265534243073558e-05, + "loss": 0.831, + "step": 109710 + }, + { + "epoch": 0.7009698069330335, + "grad_norm": 0.49630191922187805, + "learning_rate": 7.265086927017795e-05, + "loss": 0.8177, + "step": 109720 + }, + { + "epoch": 0.7010336940827722, + "grad_norm": 1.5639264583587646, + "learning_rate": 7.264639588150646e-05, + "loss": 0.9209, + "step": 109730 + }, + { + "epoch": 0.7010975812325109, + "grad_norm": 0.8661503791809082, + "learning_rate": 7.264192226476617e-05, + "loss": 0.7273, + "step": 109740 + }, + { + "epoch": 0.7011614683822496, + "grad_norm": 1.6213655471801758, + "learning_rate": 7.263744842000214e-05, + "loss": 1.4111, + "step": 109750 + }, + { + "epoch": 0.7012253555319883, + "grad_norm": 1.2722123861312866, + "learning_rate": 7.263297434725941e-05, + "loss": 0.9177, + "step": 109760 + }, + { + "epoch": 0.701289242681727, + "grad_norm": 0.9025132060050964, + "learning_rate": 7.262850004658308e-05, + "loss": 0.8063, + "step": 109770 + }, + { + "epoch": 0.7013531298314657, + "grad_norm": 0.753537654876709, + "learning_rate": 7.262402551801815e-05, + "loss": 0.7072, + "step": 109780 + }, + { + "epoch": 0.7014170169812044, + "grad_norm": 0.7488612532615662, + "learning_rate": 7.261955076160972e-05, + "loss": 0.9609, + "step": 109790 + }, + { + "epoch": 0.7014809041309431, + "grad_norm": 0.7096448540687561, + "learning_rate": 7.261507577740283e-05, + "loss": 1.1737, + "step": 109800 + }, + { + "epoch": 0.7015447912806818, + "grad_norm": 1.065198540687561, + "learning_rate": 7.261060056544258e-05, + "loss": 0.9114, + "step": 109810 + }, + { + "epoch": 0.7016086784304205, + "grad_norm": 0.7157565951347351, + "learning_rate": 7.260612512577402e-05, + "loss": 0.7947, + "step": 109820 + }, + { + "epoch": 0.7016725655801592, + "grad_norm": 0.6602898240089417, + "learning_rate": 7.260164945844222e-05, + "loss": 0.8586, + "step": 109830 + }, + { + "epoch": 0.7017364527298979, + "grad_norm": 1.0240232944488525, + "learning_rate": 7.259717356349224e-05, + "loss": 0.8433, + "step": 109840 + }, + { + "epoch": 0.7018003398796366, + "grad_norm": 0.7069511413574219, + "learning_rate": 7.25926974409692e-05, + "loss": 0.7815, + "step": 109850 + }, + { + "epoch": 0.7018642270293753, + "grad_norm": 0.8306097984313965, + "learning_rate": 7.258822109091813e-05, + "loss": 0.8288, + "step": 109860 + }, + { + "epoch": 0.701928114179114, + "grad_norm": 0.46350932121276855, + "learning_rate": 7.258374451338415e-05, + "loss": 0.846, + "step": 109870 + }, + { + "epoch": 0.7019920013288528, + "grad_norm": 0.7333908677101135, + "learning_rate": 7.257926770841231e-05, + "loss": 0.863, + "step": 109880 + }, + { + "epoch": 0.7020558884785915, + "grad_norm": 1.8804274797439575, + "learning_rate": 7.25747906760477e-05, + "loss": 1.2467, + "step": 109890 + }, + { + "epoch": 0.7021197756283302, + "grad_norm": 1.2987992763519287, + "learning_rate": 7.257031341633545e-05, + "loss": 0.8424, + "step": 109900 + }, + { + "epoch": 0.7021836627780689, + "grad_norm": 0.5555353164672852, + "learning_rate": 7.25658359293206e-05, + "loss": 0.6798, + "step": 109910 + }, + { + "epoch": 0.7022475499278075, + "grad_norm": 1.0028846263885498, + "learning_rate": 7.256135821504827e-05, + "loss": 0.8265, + "step": 109920 + }, + { + "epoch": 0.7023114370775462, + "grad_norm": 0.8981877565383911, + "learning_rate": 7.255688027356353e-05, + "loss": 1.0722, + "step": 109930 + }, + { + "epoch": 0.7023753242272849, + 
"grad_norm": 0.9332131147384644, + "learning_rate": 7.25524021049115e-05, + "loss": 0.8057, + "step": 109940 + }, + { + "epoch": 0.7024392113770236, + "grad_norm": 0.8092306852340698, + "learning_rate": 7.254792370913728e-05, + "loss": 0.9814, + "step": 109950 + }, + { + "epoch": 0.7025030985267623, + "grad_norm": 0.5844110250473022, + "learning_rate": 7.254344508628594e-05, + "loss": 0.9691, + "step": 109960 + }, + { + "epoch": 0.702566985676501, + "grad_norm": 0.7450523972511292, + "learning_rate": 7.253896623640262e-05, + "loss": 1.0341, + "step": 109970 + }, + { + "epoch": 0.7026308728262397, + "grad_norm": 1.2225341796875, + "learning_rate": 7.253448715953241e-05, + "loss": 0.9289, + "step": 109980 + }, + { + "epoch": 0.7026947599759784, + "grad_norm": 0.8283683061599731, + "learning_rate": 7.25300078557204e-05, + "loss": 0.9794, + "step": 109990 + }, + { + "epoch": 0.7027586471257171, + "grad_norm": 1.274552345275879, + "learning_rate": 7.252552832501174e-05, + "loss": 1.3319, + "step": 110000 + }, + { + "epoch": 0.7028225342754558, + "grad_norm": 1.2938295602798462, + "learning_rate": 7.252104856745153e-05, + "loss": 1.0077, + "step": 110010 + }, + { + "epoch": 0.7028864214251945, + "grad_norm": 0.6301164031028748, + "learning_rate": 7.251656858308484e-05, + "loss": 0.7968, + "step": 110020 + }, + { + "epoch": 0.7029503085749332, + "grad_norm": 1.0087745189666748, + "learning_rate": 7.251208837195686e-05, + "loss": 0.9097, + "step": 110030 + }, + { + "epoch": 0.7030141957246719, + "grad_norm": 1.1381967067718506, + "learning_rate": 7.250760793411265e-05, + "loss": 0.8822, + "step": 110040 + }, + { + "epoch": 0.7030780828744106, + "grad_norm": 1.071937918663025, + "learning_rate": 7.250312726959739e-05, + "loss": 0.8275, + "step": 110050 + }, + { + "epoch": 0.7031419700241494, + "grad_norm": 0.9654282927513123, + "learning_rate": 7.249864637845614e-05, + "loss": 0.8608, + "step": 110060 + }, + { + "epoch": 0.7032058571738881, + "grad_norm": 0.739723801612854, + "learning_rate": 7.249416526073405e-05, + "loss": 0.6858, + "step": 110070 + }, + { + "epoch": 0.7032697443236268, + "grad_norm": 0.9041827917098999, + "learning_rate": 7.248968391647628e-05, + "loss": 0.8474, + "step": 110080 + }, + { + "epoch": 0.7033336314733655, + "grad_norm": 0.9711044430732727, + "learning_rate": 7.248520234572794e-05, + "loss": 0.7781, + "step": 110090 + }, + { + "epoch": 0.7033975186231042, + "grad_norm": 0.8251720666885376, + "learning_rate": 7.248072054853414e-05, + "loss": 1.1387, + "step": 110100 + }, + { + "epoch": 0.7034614057728429, + "grad_norm": 0.7342681288719177, + "learning_rate": 7.247623852494005e-05, + "loss": 0.8043, + "step": 110110 + }, + { + "epoch": 0.7035252929225816, + "grad_norm": 0.8310518264770508, + "learning_rate": 7.247175627499078e-05, + "loss": 0.935, + "step": 110120 + }, + { + "epoch": 0.7035891800723203, + "grad_norm": 0.8513674736022949, + "learning_rate": 7.24672737987315e-05, + "loss": 1.1539, + "step": 110130 + }, + { + "epoch": 0.703653067222059, + "grad_norm": 1.232692003250122, + "learning_rate": 7.246279109620733e-05, + "loss": 0.9609, + "step": 110140 + }, + { + "epoch": 0.7037169543717977, + "grad_norm": 0.6645367741584778, + "learning_rate": 7.245830816746342e-05, + "loss": 0.8672, + "step": 110150 + }, + { + "epoch": 0.7037808415215363, + "grad_norm": 0.7718523740768433, + "learning_rate": 7.245382501254491e-05, + "loss": 0.8136, + "step": 110160 + }, + { + "epoch": 0.703844728671275, + "grad_norm": 0.9974465370178223, + "learning_rate": 
7.244934163149697e-05, + "loss": 0.8419, + "step": 110170 + }, + { + "epoch": 0.7039086158210137, + "grad_norm": 1.4745326042175293, + "learning_rate": 7.244485802436472e-05, + "loss": 0.7839, + "step": 110180 + }, + { + "epoch": 0.7039725029707524, + "grad_norm": 0.7670671343803406, + "learning_rate": 7.244037419119333e-05, + "loss": 0.7499, + "step": 110190 + }, + { + "epoch": 0.7040363901204911, + "grad_norm": 0.8539519906044006, + "learning_rate": 7.243589013202799e-05, + "loss": 0.9251, + "step": 110200 + }, + { + "epoch": 0.7041002772702298, + "grad_norm": 0.8276684284210205, + "learning_rate": 7.24314058469138e-05, + "loss": 1.0614, + "step": 110210 + }, + { + "epoch": 0.7041641644199685, + "grad_norm": 1.3117198944091797, + "learning_rate": 7.242692133589596e-05, + "loss": 0.8776, + "step": 110220 + }, + { + "epoch": 0.7042280515697072, + "grad_norm": 1.0242599248886108, + "learning_rate": 7.242243659901961e-05, + "loss": 0.7777, + "step": 110230 + }, + { + "epoch": 0.704291938719446, + "grad_norm": 0.7861204743385315, + "learning_rate": 7.241795163632994e-05, + "loss": 0.9854, + "step": 110240 + }, + { + "epoch": 0.7043558258691847, + "grad_norm": 1.0489882230758667, + "learning_rate": 7.241346644787208e-05, + "loss": 0.9031, + "step": 110250 + }, + { + "epoch": 0.7044197130189234, + "grad_norm": 1.8665564060211182, + "learning_rate": 7.240898103369124e-05, + "loss": 0.762, + "step": 110260 + }, + { + "epoch": 0.7044836001686621, + "grad_norm": 0.9396441578865051, + "learning_rate": 7.240449539383257e-05, + "loss": 1.0817, + "step": 110270 + }, + { + "epoch": 0.7045474873184008, + "grad_norm": 0.686319887638092, + "learning_rate": 7.240000952834125e-05, + "loss": 1.1644, + "step": 110280 + }, + { + "epoch": 0.7046113744681395, + "grad_norm": 0.7401465773582458, + "learning_rate": 7.239552343726246e-05, + "loss": 0.7316, + "step": 110290 + }, + { + "epoch": 0.7046752616178782, + "grad_norm": 1.3043559789657593, + "learning_rate": 7.239103712064136e-05, + "loss": 1.0058, + "step": 110300 + }, + { + "epoch": 0.7047391487676169, + "grad_norm": 0.9462155699729919, + "learning_rate": 7.238655057852314e-05, + "loss": 0.9211, + "step": 110310 + }, + { + "epoch": 0.7048030359173556, + "grad_norm": 0.9175712466239929, + "learning_rate": 7.238206381095302e-05, + "loss": 0.9743, + "step": 110320 + }, + { + "epoch": 0.7048669230670943, + "grad_norm": 1.9088410139083862, + "learning_rate": 7.237757681797613e-05, + "loss": 0.9467, + "step": 110330 + }, + { + "epoch": 0.704930810216833, + "grad_norm": 0.5652353763580322, + "learning_rate": 7.237308959963769e-05, + "loss": 0.7986, + "step": 110340 + }, + { + "epoch": 0.7049946973665717, + "grad_norm": 0.8433083891868591, + "learning_rate": 7.236860215598288e-05, + "loss": 0.8457, + "step": 110350 + }, + { + "epoch": 0.7050585845163104, + "grad_norm": 0.7066556811332703, + "learning_rate": 7.236411448705689e-05, + "loss": 0.7944, + "step": 110360 + }, + { + "epoch": 0.7051224716660491, + "grad_norm": NaN, + "learning_rate": 7.2360075392454e-05, + "loss": 0.784, + "step": 110370 + }, + { + "epoch": 0.7051863588157878, + "grad_norm": 0.7894417643547058, + "learning_rate": 7.23555872956373e-05, + "loss": 0.6449, + "step": 110380 + }, + { + "epoch": 0.7052502459655265, + "grad_norm": 1.0463151931762695, + "learning_rate": 7.235109897368049e-05, + "loss": 0.8735, + "step": 110390 + }, + { + "epoch": 0.7053141331152651, + "grad_norm": 0.7032953500747681, + "learning_rate": 7.234661042662877e-05, + "loss": 0.8867, + "step": 110400 + }, + { + "epoch": 
0.7053780202650038, + "grad_norm": 0.71566241979599, + "learning_rate": 7.234212165452736e-05, + "loss": 0.7895, + "step": 110410 + }, + { + "epoch": 0.7054419074147426, + "grad_norm": 2.098254680633545, + "learning_rate": 7.233763265742146e-05, + "loss": 1.1398, + "step": 110420 + }, + { + "epoch": 0.7055057945644813, + "grad_norm": 0.8573238253593445, + "learning_rate": 7.233314343535627e-05, + "loss": 0.778, + "step": 110430 + }, + { + "epoch": 0.70556968171422, + "grad_norm": 0.5292240381240845, + "learning_rate": 7.2328653988377e-05, + "loss": 0.7799, + "step": 110440 + }, + { + "epoch": 0.7056335688639587, + "grad_norm": 0.9991651773452759, + "learning_rate": 7.232416431652887e-05, + "loss": 0.9966, + "step": 110450 + }, + { + "epoch": 0.7056974560136974, + "grad_norm": 0.45349401235580444, + "learning_rate": 7.23196744198571e-05, + "loss": 0.6847, + "step": 110460 + }, + { + "epoch": 0.7057613431634361, + "grad_norm": 0.9464260935783386, + "learning_rate": 7.231518429840689e-05, + "loss": 0.979, + "step": 110470 + }, + { + "epoch": 0.7058252303131748, + "grad_norm": 1.0189330577850342, + "learning_rate": 7.231069395222347e-05, + "loss": 0.8901, + "step": 110480 + }, + { + "epoch": 0.7058891174629135, + "grad_norm": 1.015156865119934, + "learning_rate": 7.230620338135205e-05, + "loss": 0.5721, + "step": 110490 + }, + { + "epoch": 0.7059530046126522, + "grad_norm": 2.3729195594787598, + "learning_rate": 7.230171258583788e-05, + "loss": 0.8536, + "step": 110500 + }, + { + "epoch": 0.7060168917623909, + "grad_norm": 0.6549795269966125, + "learning_rate": 7.229722156572616e-05, + "loss": 0.743, + "step": 110510 + }, + { + "epoch": 0.7060807789121296, + "grad_norm": 0.968298614025116, + "learning_rate": 7.229273032106214e-05, + "loss": 0.965, + "step": 110520 + }, + { + "epoch": 0.7061446660618683, + "grad_norm": 1.6288883686065674, + "learning_rate": 7.228823885189103e-05, + "loss": 0.9272, + "step": 110530 + }, + { + "epoch": 0.706208553211607, + "grad_norm": 0.7440875172615051, + "learning_rate": 7.228374715825807e-05, + "loss": 1.0698, + "step": 110540 + }, + { + "epoch": 0.7062724403613457, + "grad_norm": 0.9549334645271301, + "learning_rate": 7.227925524020853e-05, + "loss": 0.8151, + "step": 110550 + }, + { + "epoch": 0.7063363275110844, + "grad_norm": 1.4166960716247559, + "learning_rate": 7.227476309778759e-05, + "loss": 0.8097, + "step": 110560 + }, + { + "epoch": 0.7064002146608231, + "grad_norm": 1.3286716938018799, + "learning_rate": 7.227027073104052e-05, + "loss": 1.0101, + "step": 110570 + }, + { + "epoch": 0.7064641018105619, + "grad_norm": 0.5131798386573792, + "learning_rate": 7.226577814001254e-05, + "loss": 1.0766, + "step": 110580 + }, + { + "epoch": 0.7065279889603006, + "grad_norm": 0.8524615168571472, + "learning_rate": 7.226128532474893e-05, + "loss": 1.2115, + "step": 110590 + }, + { + "epoch": 0.7065918761100393, + "grad_norm": 0.9582625031471252, + "learning_rate": 7.225679228529491e-05, + "loss": 0.8322, + "step": 110600 + }, + { + "epoch": 0.706655763259778, + "grad_norm": 1.0330963134765625, + "learning_rate": 7.225229902169575e-05, + "loss": 0.8503, + "step": 110610 + }, + { + "epoch": 0.7067196504095167, + "grad_norm": 1.450884222984314, + "learning_rate": 7.224780553399667e-05, + "loss": 0.792, + "step": 110620 + }, + { + "epoch": 0.7067835375592554, + "grad_norm": 1.2359329462051392, + "learning_rate": 7.224331182224296e-05, + "loss": 0.9444, + "step": 110630 + }, + { + "epoch": 0.7068474247089941, + "grad_norm": 0.6814751029014587, + "learning_rate": 
7.223881788647984e-05, + "loss": 0.8523, + "step": 110640 + }, + { + "epoch": 0.7069113118587327, + "grad_norm": 0.5562382340431213, + "learning_rate": 7.223432372675258e-05, + "loss": 0.9113, + "step": 110650 + }, + { + "epoch": 0.7069751990084714, + "grad_norm": 1.0418486595153809, + "learning_rate": 7.222982934310645e-05, + "loss": 0.8662, + "step": 110660 + }, + { + "epoch": 0.7070390861582101, + "grad_norm": 1.3430500030517578, + "learning_rate": 7.222533473558671e-05, + "loss": 0.9428, + "step": 110670 + }, + { + "epoch": 0.7071029733079488, + "grad_norm": 0.9535601139068604, + "learning_rate": 7.222083990423863e-05, + "loss": 0.8891, + "step": 110680 + }, + { + "epoch": 0.7071668604576875, + "grad_norm": 0.8033544421195984, + "learning_rate": 7.221634484910746e-05, + "loss": 0.7643, + "step": 110690 + }, + { + "epoch": 0.7072307476074262, + "grad_norm": 0.8913300037384033, + "learning_rate": 7.221184957023848e-05, + "loss": 0.8317, + "step": 110700 + }, + { + "epoch": 0.7072946347571649, + "grad_norm": 0.8205702304840088, + "learning_rate": 7.220735406767696e-05, + "loss": 0.9091, + "step": 110710 + }, + { + "epoch": 0.7073585219069036, + "grad_norm": 0.7440993189811707, + "learning_rate": 7.220285834146816e-05, + "loss": 1.0378, + "step": 110720 + }, + { + "epoch": 0.7074224090566423, + "grad_norm": 1.0705007314682007, + "learning_rate": 7.219836239165737e-05, + "loss": 0.914, + "step": 110730 + }, + { + "epoch": 0.707486296206381, + "grad_norm": 0.6237671375274658, + "learning_rate": 7.219386621828989e-05, + "loss": 0.7851, + "step": 110740 + }, + { + "epoch": 0.7075501833561197, + "grad_norm": 0.8966255784034729, + "learning_rate": 7.218936982141096e-05, + "loss": 0.8537, + "step": 110750 + }, + { + "epoch": 0.7076140705058585, + "grad_norm": 0.8700849413871765, + "learning_rate": 7.218487320106588e-05, + "loss": 1.0906, + "step": 110760 + }, + { + "epoch": 0.7076779576555972, + "grad_norm": 0.9361857771873474, + "learning_rate": 7.218037635729993e-05, + "loss": 1.1403, + "step": 110770 + }, + { + "epoch": 0.7077418448053359, + "grad_norm": 0.9435377717018127, + "learning_rate": 7.21758792901584e-05, + "loss": 1.1844, + "step": 110780 + }, + { + "epoch": 0.7078057319550746, + "grad_norm": 2.09425950050354, + "learning_rate": 7.21713819996866e-05, + "loss": 1.0254, + "step": 110790 + }, + { + "epoch": 0.7078696191048133, + "grad_norm": 1.783277988433838, + "learning_rate": 7.21668844859298e-05, + "loss": 1.2829, + "step": 110800 + }, + { + "epoch": 0.707933506254552, + "grad_norm": 0.7960687279701233, + "learning_rate": 7.216238674893328e-05, + "loss": 0.7562, + "step": 110810 + }, + { + "epoch": 0.7079973934042907, + "grad_norm": 0.8216058611869812, + "learning_rate": 7.215788878874237e-05, + "loss": 0.937, + "step": 110820 + }, + { + "epoch": 0.7080612805540294, + "grad_norm": 0.8827890753746033, + "learning_rate": 7.215339060540231e-05, + "loss": 0.9726, + "step": 110830 + }, + { + "epoch": 0.7081251677037681, + "grad_norm": 1.042328119277954, + "learning_rate": 7.214889219895849e-05, + "loss": 0.8115, + "step": 110840 + }, + { + "epoch": 0.7081890548535068, + "grad_norm": 0.9282635450363159, + "learning_rate": 7.214439356945614e-05, + "loss": 1.0414, + "step": 110850 + }, + { + "epoch": 0.7082529420032455, + "grad_norm": 0.7158064246177673, + "learning_rate": 7.213989471694059e-05, + "loss": 0.9759, + "step": 110860 + }, + { + "epoch": 0.7083168291529842, + "grad_norm": 0.8982157111167908, + "learning_rate": 7.213539564145715e-05, + "loss": 0.7747, + "step": 110870 + }, + { + 
"epoch": 0.7083807163027229, + "grad_norm": 0.5959254503250122, + "learning_rate": 7.213089634305112e-05, + "loss": 0.844, + "step": 110880 + }, + { + "epoch": 0.7084446034524615, + "grad_norm": 1.0131826400756836, + "learning_rate": 7.212639682176782e-05, + "loss": 0.8006, + "step": 110890 + }, + { + "epoch": 0.7085084906022002, + "grad_norm": 3.458534002304077, + "learning_rate": 7.212189707765257e-05, + "loss": 0.8292, + "step": 110900 + }, + { + "epoch": 0.7085723777519389, + "grad_norm": 1.091951847076416, + "learning_rate": 7.211739711075067e-05, + "loss": 0.8682, + "step": 110910 + }, + { + "epoch": 0.7086362649016776, + "grad_norm": 1.005330204963684, + "learning_rate": 7.211289692110746e-05, + "loss": 0.8157, + "step": 110920 + }, + { + "epoch": 0.7087001520514163, + "grad_norm": 0.5316596627235413, + "learning_rate": 7.210839650876824e-05, + "loss": 0.721, + "step": 110930 + }, + { + "epoch": 0.708764039201155, + "grad_norm": 0.904162585735321, + "learning_rate": 7.210389587377833e-05, + "loss": 0.8916, + "step": 110940 + }, + { + "epoch": 0.7088279263508938, + "grad_norm": 0.6802520155906677, + "learning_rate": 7.209939501618308e-05, + "loss": 1.0842, + "step": 110950 + }, + { + "epoch": 0.7088918135006325, + "grad_norm": 0.8692640066146851, + "learning_rate": 7.20948939360278e-05, + "loss": 1.13, + "step": 110960 + }, + { + "epoch": 0.7089557006503712, + "grad_norm": 0.9759564399719238, + "learning_rate": 7.209039263335782e-05, + "loss": 1.1396, + "step": 110970 + }, + { + "epoch": 0.7090195878001099, + "grad_norm": 0.8422264456748962, + "learning_rate": 7.208589110821848e-05, + "loss": 0.9562, + "step": 110980 + }, + { + "epoch": 0.7090834749498486, + "grad_norm": 0.9133360385894775, + "learning_rate": 7.208138936065509e-05, + "loss": 0.9616, + "step": 110990 + }, + { + "epoch": 0.7091473620995873, + "grad_norm": 1.0148824453353882, + "learning_rate": 7.207688739071303e-05, + "loss": 0.8601, + "step": 111000 + }, + { + "epoch": 0.709211249249326, + "grad_norm": 0.9856421947479248, + "learning_rate": 7.207238519843761e-05, + "loss": 0.6616, + "step": 111010 + }, + { + "epoch": 0.7092751363990647, + "grad_norm": 1.0549514293670654, + "learning_rate": 7.206788278387417e-05, + "loss": 0.8587, + "step": 111020 + }, + { + "epoch": 0.7093390235488034, + "grad_norm": 0.8037036657333374, + "learning_rate": 7.206338014706806e-05, + "loss": 0.782, + "step": 111030 + }, + { + "epoch": 0.7094029106985421, + "grad_norm": 0.7747255563735962, + "learning_rate": 7.205887728806463e-05, + "loss": 0.8245, + "step": 111040 + }, + { + "epoch": 0.7094667978482808, + "grad_norm": 0.8603516221046448, + "learning_rate": 7.205437420690922e-05, + "loss": 0.884, + "step": 111050 + }, + { + "epoch": 0.7095306849980195, + "grad_norm": 0.5913922190666199, + "learning_rate": 7.204987090364717e-05, + "loss": 1.1146, + "step": 111060 + }, + { + "epoch": 0.7095945721477582, + "grad_norm": 1.0901986360549927, + "learning_rate": 7.204536737832385e-05, + "loss": 0.99, + "step": 111070 + }, + { + "epoch": 0.7096584592974969, + "grad_norm": 0.8610712885856628, + "learning_rate": 7.204086363098462e-05, + "loss": 1.1021, + "step": 111080 + }, + { + "epoch": 0.7097223464472356, + "grad_norm": 1.4266153573989868, + "learning_rate": 7.203635966167482e-05, + "loss": 0.9517, + "step": 111090 + }, + { + "epoch": 0.7097862335969743, + "grad_norm": 1.2162227630615234, + "learning_rate": 7.203185547043981e-05, + "loss": 0.8217, + "step": 111100 + }, + { + "epoch": 0.709850120746713, + "grad_norm": 1.2490782737731934, + 
"learning_rate": 7.202735105732497e-05, + "loss": 0.7838, + "step": 111110 + }, + { + "epoch": 0.7099140078964518, + "grad_norm": 0.9295443892478943, + "learning_rate": 7.202284642237563e-05, + "loss": 0.8437, + "step": 111120 + }, + { + "epoch": 0.7099778950461904, + "grad_norm": 1.247816562652588, + "learning_rate": 7.201834156563718e-05, + "loss": 0.7268, + "step": 111130 + }, + { + "epoch": 0.7100417821959291, + "grad_norm": 0.8253741264343262, + "learning_rate": 7.201383648715498e-05, + "loss": 0.8062, + "step": 111140 + }, + { + "epoch": 0.7101056693456678, + "grad_norm": 0.7604842185974121, + "learning_rate": 7.200933118697439e-05, + "loss": 0.9545, + "step": 111150 + }, + { + "epoch": 0.7101695564954065, + "grad_norm": 1.0069366693496704, + "learning_rate": 7.200482566514081e-05, + "loss": 0.8775, + "step": 111160 + }, + { + "epoch": 0.7102334436451452, + "grad_norm": 0.5719223022460938, + "learning_rate": 7.20003199216996e-05, + "loss": 0.8167, + "step": 111170 + }, + { + "epoch": 0.7102973307948839, + "grad_norm": 0.9412128925323486, + "learning_rate": 7.199581395669613e-05, + "loss": 0.7928, + "step": 111180 + }, + { + "epoch": 0.7103612179446226, + "grad_norm": 0.8376321196556091, + "learning_rate": 7.199130777017578e-05, + "loss": 0.9514, + "step": 111190 + }, + { + "epoch": 0.7104251050943613, + "grad_norm": 1.0094441175460815, + "learning_rate": 7.198680136218394e-05, + "loss": 0.9398, + "step": 111200 + }, + { + "epoch": 0.7104889922441, + "grad_norm": 2.3433310985565186, + "learning_rate": 7.1982294732766e-05, + "loss": 0.8579, + "step": 111210 + }, + { + "epoch": 0.7105528793938387, + "grad_norm": 0.6974323987960815, + "learning_rate": 7.197778788196732e-05, + "loss": 0.7997, + "step": 111220 + }, + { + "epoch": 0.7106167665435774, + "grad_norm": 0.4892445206642151, + "learning_rate": 7.197328080983331e-05, + "loss": 0.6908, + "step": 111230 + }, + { + "epoch": 0.7106806536933161, + "grad_norm": 0.5663550496101379, + "learning_rate": 7.196877351640934e-05, + "loss": 0.7576, + "step": 111240 + }, + { + "epoch": 0.7107445408430548, + "grad_norm": 1.5886496305465698, + "learning_rate": 7.196426600174083e-05, + "loss": 1.0888, + "step": 111250 + }, + { + "epoch": 0.7108084279927935, + "grad_norm": 1.0448259115219116, + "learning_rate": 7.195975826587315e-05, + "loss": 0.9002, + "step": 111260 + }, + { + "epoch": 0.7108723151425322, + "grad_norm": 1.3552470207214355, + "learning_rate": 7.195525030885173e-05, + "loss": 1.0711, + "step": 111270 + }, + { + "epoch": 0.710936202292271, + "grad_norm": 0.8098346590995789, + "learning_rate": 7.195074213072192e-05, + "loss": 0.7809, + "step": 111280 + }, + { + "epoch": 0.7110000894420097, + "grad_norm": 0.7576483488082886, + "learning_rate": 7.194623373152916e-05, + "loss": 1.0012, + "step": 111290 + }, + { + "epoch": 0.7110639765917484, + "grad_norm": 1.0490570068359375, + "learning_rate": 7.194172511131883e-05, + "loss": 0.8137, + "step": 111300 + }, + { + "epoch": 0.7111278637414871, + "grad_norm": 1.080182671546936, + "learning_rate": 7.193721627013635e-05, + "loss": 1.161, + "step": 111310 + }, + { + "epoch": 0.7111917508912258, + "grad_norm": 0.9876430034637451, + "learning_rate": 7.193270720802713e-05, + "loss": 1.1287, + "step": 111320 + }, + { + "epoch": 0.7112556380409645, + "grad_norm": 0.8091046214103699, + "learning_rate": 7.192819792503656e-05, + "loss": 0.8741, + "step": 111330 + }, + { + "epoch": 0.7113195251907032, + "grad_norm": 0.6781719326972961, + "learning_rate": 7.192368842121008e-05, + "loss": 0.8945, + "step": 
111340 + }, + { + "epoch": 0.7113834123404419, + "grad_norm": 2.1423685550689697, + "learning_rate": 7.191917869659307e-05, + "loss": 0.7859, + "step": 111350 + }, + { + "epoch": 0.7114472994901806, + "grad_norm": 0.6364580988883972, + "learning_rate": 7.191466875123099e-05, + "loss": 0.7873, + "step": 111360 + }, + { + "epoch": 0.7115111866399193, + "grad_norm": 0.8204523921012878, + "learning_rate": 7.191015858516921e-05, + "loss": 0.8482, + "step": 111370 + }, + { + "epoch": 0.7115750737896579, + "grad_norm": 0.5086873173713684, + "learning_rate": 7.19056481984532e-05, + "loss": 0.951, + "step": 111380 + }, + { + "epoch": 0.7116389609393966, + "grad_norm": 0.7575923204421997, + "learning_rate": 7.190113759112837e-05, + "loss": 1.0358, + "step": 111390 + }, + { + "epoch": 0.7117028480891353, + "grad_norm": 0.6204468607902527, + "learning_rate": 7.189662676324012e-05, + "loss": 1.0471, + "step": 111400 + }, + { + "epoch": 0.711766735238874, + "grad_norm": 0.7456675171852112, + "learning_rate": 7.18921157148339e-05, + "loss": 0.7964, + "step": 111410 + }, + { + "epoch": 0.7118306223886127, + "grad_norm": 0.8651039004325867, + "learning_rate": 7.188760444595513e-05, + "loss": 0.596, + "step": 111420 + }, + { + "epoch": 0.7118945095383514, + "grad_norm": 1.0460362434387207, + "learning_rate": 7.188309295664926e-05, + "loss": 0.9561, + "step": 111430 + }, + { + "epoch": 0.7119583966880901, + "grad_norm": 1.1043407917022705, + "learning_rate": 7.187858124696171e-05, + "loss": 0.95, + "step": 111440 + }, + { + "epoch": 0.7120222838378288, + "grad_norm": 1.1724969148635864, + "learning_rate": 7.187406931693791e-05, + "loss": 1.278, + "step": 111450 + }, + { + "epoch": 0.7120861709875675, + "grad_norm": 0.8911068439483643, + "learning_rate": 7.186955716662332e-05, + "loss": 0.9101, + "step": 111460 + }, + { + "epoch": 0.7121500581373063, + "grad_norm": 0.9001386761665344, + "learning_rate": 7.186504479606336e-05, + "loss": 0.6922, + "step": 111470 + }, + { + "epoch": 0.712213945287045, + "grad_norm": 0.7261586785316467, + "learning_rate": 7.186053220530349e-05, + "loss": 0.9494, + "step": 111480 + }, + { + "epoch": 0.7122778324367837, + "grad_norm": 1.1621700525283813, + "learning_rate": 7.185601939438914e-05, + "loss": 1.1188, + "step": 111490 + }, + { + "epoch": 0.7123417195865224, + "grad_norm": 1.0625040531158447, + "learning_rate": 7.185150636336578e-05, + "loss": 0.6796, + "step": 111500 + }, + { + "epoch": 0.7124056067362611, + "grad_norm": 0.5753048062324524, + "learning_rate": 7.184699311227883e-05, + "loss": 0.6958, + "step": 111510 + }, + { + "epoch": 0.7124694938859998, + "grad_norm": 0.8998501896858215, + "learning_rate": 7.184247964117376e-05, + "loss": 0.765, + "step": 111520 + }, + { + "epoch": 0.7125333810357385, + "grad_norm": 1.5378433465957642, + "learning_rate": 7.183796595009604e-05, + "loss": 0.7618, + "step": 111530 + }, + { + "epoch": 0.7125972681854772, + "grad_norm": 0.5962892174720764, + "learning_rate": 7.18334520390911e-05, + "loss": 0.8256, + "step": 111540 + }, + { + "epoch": 0.7126611553352159, + "grad_norm": 0.8945503234863281, + "learning_rate": 7.182893790820441e-05, + "loss": 0.935, + "step": 111550 + }, + { + "epoch": 0.7127250424849546, + "grad_norm": 0.8816238045692444, + "learning_rate": 7.182442355748143e-05, + "loss": 0.6085, + "step": 111560 + }, + { + "epoch": 0.7127889296346933, + "grad_norm": 0.7212803959846497, + "learning_rate": 7.181990898696762e-05, + "loss": 0.7271, + "step": 111570 + }, + { + "epoch": 0.712852816784432, + "grad_norm": 
0.9324487447738647, + "learning_rate": 7.181539419670847e-05, + "loss": 0.8302, + "step": 111580 + }, + { + "epoch": 0.7129167039341707, + "grad_norm": 0.7855546474456787, + "learning_rate": 7.18108791867494e-05, + "loss": 1.1443, + "step": 111590 + }, + { + "epoch": 0.7129805910839094, + "grad_norm": 1.1288433074951172, + "learning_rate": 7.180636395713592e-05, + "loss": 0.77, + "step": 111600 + }, + { + "epoch": 0.7130444782336481, + "grad_norm": 0.8179686665534973, + "learning_rate": 7.18018485079135e-05, + "loss": 0.9342, + "step": 111610 + }, + { + "epoch": 0.7131083653833867, + "grad_norm": 0.5550733804702759, + "learning_rate": 7.179733283912759e-05, + "loss": 1.1414, + "step": 111620 + }, + { + "epoch": 0.7131722525331254, + "grad_norm": 1.0934454202651978, + "learning_rate": 7.179281695082369e-05, + "loss": 1.1329, + "step": 111630 + }, + { + "epoch": 0.7132361396828641, + "grad_norm": 0.8720685243606567, + "learning_rate": 7.178830084304725e-05, + "loss": 0.9261, + "step": 111640 + }, + { + "epoch": 0.7133000268326029, + "grad_norm": 1.734836220741272, + "learning_rate": 7.17837845158438e-05, + "loss": 1.0047, + "step": 111650 + }, + { + "epoch": 0.7133639139823416, + "grad_norm": 0.9459303021430969, + "learning_rate": 7.177926796925877e-05, + "loss": 1.0278, + "step": 111660 + }, + { + "epoch": 0.7134278011320803, + "grad_norm": 1.2539857625961304, + "learning_rate": 7.177475120333767e-05, + "loss": 0.7048, + "step": 111670 + }, + { + "epoch": 0.713491688281819, + "grad_norm": 1.275357723236084, + "learning_rate": 7.177023421812601e-05, + "loss": 0.7448, + "step": 111680 + }, + { + "epoch": 0.7135555754315577, + "grad_norm": 0.5253180265426636, + "learning_rate": 7.176571701366924e-05, + "loss": 0.9166, + "step": 111690 + }, + { + "epoch": 0.7136194625812964, + "grad_norm": 2.3519647121429443, + "learning_rate": 7.176119959001287e-05, + "loss": 0.7995, + "step": 111700 + }, + { + "epoch": 0.7136833497310351, + "grad_norm": 0.8496062755584717, + "learning_rate": 7.17566819472024e-05, + "loss": 0.9215, + "step": 111710 + }, + { + "epoch": 0.7137472368807738, + "grad_norm": 0.6206763386726379, + "learning_rate": 7.175216408528331e-05, + "loss": 0.7705, + "step": 111720 + }, + { + "epoch": 0.7138111240305125, + "grad_norm": 0.9494533538818359, + "learning_rate": 7.174764600430112e-05, + "loss": 0.9813, + "step": 111730 + }, + { + "epoch": 0.7138750111802512, + "grad_norm": 0.609451413154602, + "learning_rate": 7.174312770430131e-05, + "loss": 0.8221, + "step": 111740 + }, + { + "epoch": 0.7139388983299899, + "grad_norm": 0.6117620468139648, + "learning_rate": 7.17386091853294e-05, + "loss": 1.1774, + "step": 111750 + }, + { + "epoch": 0.7140027854797286, + "grad_norm": 0.881416916847229, + "learning_rate": 7.173409044743092e-05, + "loss": 1.1644, + "step": 111760 + }, + { + "epoch": 0.7140666726294673, + "grad_norm": 0.9561392068862915, + "learning_rate": 7.17295714906513e-05, + "loss": 1.0794, + "step": 111770 + }, + { + "epoch": 0.714130559779206, + "grad_norm": 0.9907708764076233, + "learning_rate": 7.172505231503613e-05, + "loss": 0.8979, + "step": 111780 + }, + { + "epoch": 0.7141944469289447, + "grad_norm": 1.2147117853164673, + "learning_rate": 7.172053292063085e-05, + "loss": 0.8886, + "step": 111790 + }, + { + "epoch": 0.7142583340786834, + "grad_norm": 0.8002836108207703, + "learning_rate": 7.171601330748104e-05, + "loss": 0.9329, + "step": 111800 + }, + { + "epoch": 0.7143222212284221, + "grad_norm": 0.9203763604164124, + "learning_rate": 7.171149347563219e-05, + "loss": 
0.7629, + "step": 111810 + }, + { + "epoch": 0.7143861083781609, + "grad_norm": 1.0033005475997925, + "learning_rate": 7.17069734251298e-05, + "loss": 0.5576, + "step": 111820 + }, + { + "epoch": 0.7144499955278996, + "grad_norm": 0.8255144357681274, + "learning_rate": 7.170245315601942e-05, + "loss": 0.8515, + "step": 111830 + }, + { + "epoch": 0.7145138826776383, + "grad_norm": 0.6121490597724915, + "learning_rate": 7.169793266834657e-05, + "loss": 0.7734, + "step": 111840 + }, + { + "epoch": 0.714577769827377, + "grad_norm": 0.9062861204147339, + "learning_rate": 7.169341196215675e-05, + "loss": 0.9229, + "step": 111850 + }, + { + "epoch": 0.7146416569771156, + "grad_norm": 1.0304701328277588, + "learning_rate": 7.168889103749552e-05, + "loss": 0.7766, + "step": 111860 + }, + { + "epoch": 0.7147055441268543, + "grad_norm": 0.49770671129226685, + "learning_rate": 7.168436989440838e-05, + "loss": 0.9304, + "step": 111870 + }, + { + "epoch": 0.714769431276593, + "grad_norm": 2.51269268989563, + "learning_rate": 7.167984853294087e-05, + "loss": 0.9755, + "step": 111880 + }, + { + "epoch": 0.7148333184263317, + "grad_norm": 0.7806798219680786, + "learning_rate": 7.167532695313855e-05, + "loss": 0.8631, + "step": 111890 + }, + { + "epoch": 0.7148972055760704, + "grad_norm": 0.9068145155906677, + "learning_rate": 7.167080515504692e-05, + "loss": 0.8938, + "step": 111900 + }, + { + "epoch": 0.7149610927258091, + "grad_norm": 0.778988778591156, + "learning_rate": 7.166628313871155e-05, + "loss": 1.0556, + "step": 111910 + }, + { + "epoch": 0.7150249798755478, + "grad_norm": 1.0016741752624512, + "learning_rate": 7.166176090417794e-05, + "loss": 0.9363, + "step": 111920 + }, + { + "epoch": 0.7150888670252865, + "grad_norm": 1.0079103708267212, + "learning_rate": 7.165723845149169e-05, + "loss": 0.9302, + "step": 111930 + }, + { + "epoch": 0.7151527541750252, + "grad_norm": 1.5212242603302002, + "learning_rate": 7.165271578069827e-05, + "loss": 0.9895, + "step": 111940 + }, + { + "epoch": 0.7152166413247639, + "grad_norm": 0.8512176871299744, + "learning_rate": 7.16481928918433e-05, + "loss": 0.7161, + "step": 111950 + }, + { + "epoch": 0.7152805284745026, + "grad_norm": 1.4987943172454834, + "learning_rate": 7.16436697849723e-05, + "loss": 0.8515, + "step": 111960 + }, + { + "epoch": 0.7153444156242413, + "grad_norm": 1.0176150798797607, + "learning_rate": 7.163914646013082e-05, + "loss": 0.8803, + "step": 111970 + }, + { + "epoch": 0.71540830277398, + "grad_norm": 0.5044176578521729, + "learning_rate": 7.16346229173644e-05, + "loss": 0.7152, + "step": 111980 + }, + { + "epoch": 0.7154721899237187, + "grad_norm": 2.0591912269592285, + "learning_rate": 7.163009915671863e-05, + "loss": 0.8914, + "step": 111990 + }, + { + "epoch": 0.7155360770734575, + "grad_norm": 0.6870359778404236, + "learning_rate": 7.162557517823904e-05, + "loss": 0.9074, + "step": 112000 + }, + { + "epoch": 0.7155999642231962, + "grad_norm": 0.9231833815574646, + "learning_rate": 7.16210509819712e-05, + "loss": 0.9811, + "step": 112010 + }, + { + "epoch": 0.7156638513729349, + "grad_norm": 1.032814860343933, + "learning_rate": 7.161652656796068e-05, + "loss": 0.8585, + "step": 112020 + }, + { + "epoch": 0.7157277385226736, + "grad_norm": 0.6325913667678833, + "learning_rate": 7.161200193625302e-05, + "loss": 0.8663, + "step": 112030 + }, + { + "epoch": 0.7157916256724123, + "grad_norm": 0.9060227870941162, + "learning_rate": 7.16074770868938e-05, + "loss": 0.904, + "step": 112040 + }, + { + "epoch": 0.715855512822151, + 
"grad_norm": 1.0490593910217285, + "learning_rate": 7.16029520199286e-05, + "loss": 0.7224, + "step": 112050 + }, + { + "epoch": 0.7159193999718897, + "grad_norm": 0.7744579911231995, + "learning_rate": 7.1598426735403e-05, + "loss": 0.953, + "step": 112060 + }, + { + "epoch": 0.7159832871216284, + "grad_norm": 1.1592707633972168, + "learning_rate": 7.159390123336253e-05, + "loss": 0.8888, + "step": 112070 + }, + { + "epoch": 0.7160471742713671, + "grad_norm": 1.0290485620498657, + "learning_rate": 7.158937551385281e-05, + "loss": 0.7144, + "step": 112080 + }, + { + "epoch": 0.7161110614211058, + "grad_norm": 0.7626058459281921, + "learning_rate": 7.15848495769194e-05, + "loss": 0.8806, + "step": 112090 + }, + { + "epoch": 0.7161749485708444, + "grad_norm": 1.230587363243103, + "learning_rate": 7.158032342260787e-05, + "loss": 0.9783, + "step": 112100 + }, + { + "epoch": 0.7162388357205831, + "grad_norm": 1.150781273841858, + "learning_rate": 7.157579705096384e-05, + "loss": 0.8128, + "step": 112110 + }, + { + "epoch": 0.7163027228703218, + "grad_norm": 0.9989941716194153, + "learning_rate": 7.157127046203285e-05, + "loss": 0.982, + "step": 112120 + }, + { + "epoch": 0.7163666100200605, + "grad_norm": 0.5104489922523499, + "learning_rate": 7.15667436558605e-05, + "loss": 1.1264, + "step": 112130 + }, + { + "epoch": 0.7164304971697992, + "grad_norm": 0.8887497186660767, + "learning_rate": 7.156221663249238e-05, + "loss": 0.9689, + "step": 112140 + }, + { + "epoch": 0.7164943843195379, + "grad_norm": 0.9902895092964172, + "learning_rate": 7.155768939197411e-05, + "loss": 0.8488, + "step": 112150 + }, + { + "epoch": 0.7165582714692766, + "grad_norm": 0.9794628024101257, + "learning_rate": 7.155316193435123e-05, + "loss": 0.8772, + "step": 112160 + }, + { + "epoch": 0.7166221586190153, + "grad_norm": 1.3520082235336304, + "learning_rate": 7.154863425966938e-05, + "loss": 1.1314, + "step": 112170 + }, + { + "epoch": 0.716686045768754, + "grad_norm": 0.871411919593811, + "learning_rate": 7.154410636797413e-05, + "loss": 0.9373, + "step": 112180 + }, + { + "epoch": 0.7167499329184928, + "grad_norm": 1.0834548473358154, + "learning_rate": 7.15395782593111e-05, + "loss": 0.7362, + "step": 112190 + }, + { + "epoch": 0.7168138200682315, + "grad_norm": 0.9471587538719177, + "learning_rate": 7.153504993372587e-05, + "loss": 0.8928, + "step": 112200 + }, + { + "epoch": 0.7168777072179702, + "grad_norm": 1.114270806312561, + "learning_rate": 7.153052139126407e-05, + "loss": 0.7417, + "step": 112210 + }, + { + "epoch": 0.7169415943677089, + "grad_norm": 1.393097162246704, + "learning_rate": 7.152599263197128e-05, + "loss": 1.0208, + "step": 112220 + }, + { + "epoch": 0.7170054815174476, + "grad_norm": 0.7526148557662964, + "learning_rate": 7.152146365589313e-05, + "loss": 1.0583, + "step": 112230 + }, + { + "epoch": 0.7170693686671863, + "grad_norm": 1.4979679584503174, + "learning_rate": 7.151693446307524e-05, + "loss": 0.8657, + "step": 112240 + }, + { + "epoch": 0.717133255816925, + "grad_norm": 0.7757830619812012, + "learning_rate": 7.151240505356318e-05, + "loss": 0.7491, + "step": 112250 + }, + { + "epoch": 0.7171971429666637, + "grad_norm": 1.1512292623519897, + "learning_rate": 7.15078754274026e-05, + "loss": 0.818, + "step": 112260 + }, + { + "epoch": 0.7172610301164024, + "grad_norm": 1.370301365852356, + "learning_rate": 7.150334558463911e-05, + "loss": 0.8435, + "step": 112270 + }, + { + "epoch": 0.7173249172661411, + "grad_norm": 1.1701213121414185, + "learning_rate": 7.149881552531832e-05, + 
"loss": 1.1135, + "step": 112280 + }, + { + "epoch": 0.7173888044158798, + "grad_norm": 0.6550779342651367, + "learning_rate": 7.149428524948585e-05, + "loss": 0.8574, + "step": 112290 + }, + { + "epoch": 0.7174526915656185, + "grad_norm": 0.9960959553718567, + "learning_rate": 7.148975475718734e-05, + "loss": 0.9118, + "step": 112300 + }, + { + "epoch": 0.7175165787153572, + "grad_norm": 0.6779653429985046, + "learning_rate": 7.148522404846841e-05, + "loss": 0.9284, + "step": 112310 + }, + { + "epoch": 0.7175804658650959, + "grad_norm": 0.8120177984237671, + "learning_rate": 7.14806931233747e-05, + "loss": 1.0227, + "step": 112320 + }, + { + "epoch": 0.7176443530148346, + "grad_norm": 1.1333248615264893, + "learning_rate": 7.147616198195181e-05, + "loss": 0.7896, + "step": 112330 + }, + { + "epoch": 0.7177082401645734, + "grad_norm": 0.5088018178939819, + "learning_rate": 7.147163062424539e-05, + "loss": 0.6615, + "step": 112340 + }, + { + "epoch": 0.717772127314312, + "grad_norm": 0.9224886894226074, + "learning_rate": 7.146709905030108e-05, + "loss": 0.9036, + "step": 112350 + }, + { + "epoch": 0.7178360144640507, + "grad_norm": 0.9467249512672424, + "learning_rate": 7.146256726016452e-05, + "loss": 0.8896, + "step": 112360 + }, + { + "epoch": 0.7178999016137894, + "grad_norm": 1.132169246673584, + "learning_rate": 7.145803525388132e-05, + "loss": 1.1764, + "step": 112370 + }, + { + "epoch": 0.7179637887635281, + "grad_norm": 0.9718843102455139, + "learning_rate": 7.145350303149715e-05, + "loss": 0.8263, + "step": 112380 + }, + { + "epoch": 0.7180276759132668, + "grad_norm": 0.5153815746307373, + "learning_rate": 7.144897059305764e-05, + "loss": 0.9022, + "step": 112390 + }, + { + "epoch": 0.7180915630630055, + "grad_norm": 1.2004663944244385, + "learning_rate": 7.144443793860845e-05, + "loss": 0.8924, + "step": 112400 + }, + { + "epoch": 0.7181554502127442, + "grad_norm": 0.48930367827415466, + "learning_rate": 7.14399050681952e-05, + "loss": 0.6762, + "step": 112410 + }, + { + "epoch": 0.7182193373624829, + "grad_norm": 0.9120885729789734, + "learning_rate": 7.143537198186356e-05, + "loss": 0.7038, + "step": 112420 + }, + { + "epoch": 0.7182832245122216, + "grad_norm": 0.7880602478981018, + "learning_rate": 7.14308386796592e-05, + "loss": 0.6403, + "step": 112430 + }, + { + "epoch": 0.7183471116619603, + "grad_norm": 0.8461630940437317, + "learning_rate": 7.142630516162774e-05, + "loss": 1.245, + "step": 112440 + }, + { + "epoch": 0.718410998811699, + "grad_norm": 0.7924696803092957, + "learning_rate": 7.142177142781485e-05, + "loss": 0.7825, + "step": 112450 + }, + { + "epoch": 0.7184748859614377, + "grad_norm": 1.4075403213500977, + "learning_rate": 7.14172374782662e-05, + "loss": 1.0038, + "step": 112460 + }, + { + "epoch": 0.7185387731111764, + "grad_norm": 0.6617513298988342, + "learning_rate": 7.141270331302743e-05, + "loss": 0.9936, + "step": 112470 + }, + { + "epoch": 0.7186026602609151, + "grad_norm": 0.9950401186943054, + "learning_rate": 7.140816893214421e-05, + "loss": 0.9455, + "step": 112480 + }, + { + "epoch": 0.7186665474106538, + "grad_norm": 1.0970572233200073, + "learning_rate": 7.140363433566224e-05, + "loss": 0.9689, + "step": 112490 + }, + { + "epoch": 0.7187304345603925, + "grad_norm": 0.8020222783088684, + "learning_rate": 7.139909952362712e-05, + "loss": 0.8451, + "step": 112500 + }, + { + "epoch": 0.7187943217101312, + "grad_norm": 1.0068836212158203, + "learning_rate": 7.139456449608458e-05, + "loss": 1.3056, + "step": 112510 + }, + { + "epoch": 
0.71885820885987, + "grad_norm": 0.8701362013816833, + "learning_rate": 7.139002925308024e-05, + "loss": 0.8939, + "step": 112520 + }, + { + "epoch": 0.7189220960096087, + "grad_norm": 2.2286477088928223, + "learning_rate": 7.138549379465982e-05, + "loss": 1.0509, + "step": 112530 + }, + { + "epoch": 0.7189859831593474, + "grad_norm": 0.22156091034412384, + "learning_rate": 7.138095812086896e-05, + "loss": 0.6598, + "step": 112540 + }, + { + "epoch": 0.7190498703090861, + "grad_norm": 1.0287528038024902, + "learning_rate": 7.137642223175337e-05, + "loss": 0.9708, + "step": 112550 + }, + { + "epoch": 0.7191137574588248, + "grad_norm": 0.772317111492157, + "learning_rate": 7.13718861273587e-05, + "loss": 0.8922, + "step": 112560 + }, + { + "epoch": 0.7191776446085635, + "grad_norm": 1.1389001607894897, + "learning_rate": 7.136734980773066e-05, + "loss": 0.6469, + "step": 112570 + }, + { + "epoch": 0.7192415317583022, + "grad_norm": 0.9356949329376221, + "learning_rate": 7.136281327291491e-05, + "loss": 0.8607, + "step": 112580 + }, + { + "epoch": 0.7193054189080408, + "grad_norm": 0.8384791016578674, + "learning_rate": 7.135827652295715e-05, + "loss": 0.8469, + "step": 112590 + }, + { + "epoch": 0.7193693060577795, + "grad_norm": 1.2862927913665771, + "learning_rate": 7.135373955790308e-05, + "loss": 0.8742, + "step": 112600 + }, + { + "epoch": 0.7194331932075182, + "grad_norm": 0.7916562557220459, + "learning_rate": 7.134920237779837e-05, + "loss": 0.9424, + "step": 112610 + }, + { + "epoch": 0.7194970803572569, + "grad_norm": 1.3547780513763428, + "learning_rate": 7.134466498268872e-05, + "loss": 0.9223, + "step": 112620 + }, + { + "epoch": 0.7195609675069956, + "grad_norm": 0.8931356072425842, + "learning_rate": 7.134012737261985e-05, + "loss": 0.9001, + "step": 112630 + }, + { + "epoch": 0.7196248546567343, + "grad_norm": 1.0962817668914795, + "learning_rate": 7.133558954763741e-05, + "loss": 0.6841, + "step": 112640 + }, + { + "epoch": 0.719688741806473, + "grad_norm": 0.8664074540138245, + "learning_rate": 7.133105150778714e-05, + "loss": 0.84, + "step": 112650 + }, + { + "epoch": 0.7197526289562117, + "grad_norm": 0.7376250624656677, + "learning_rate": 7.132651325311472e-05, + "loss": 1.0094, + "step": 112660 + }, + { + "epoch": 0.7198165161059504, + "grad_norm": 0.8420968055725098, + "learning_rate": 7.132197478366587e-05, + "loss": 1.052, + "step": 112670 + }, + { + "epoch": 0.7198804032556891, + "grad_norm": 1.0015677213668823, + "learning_rate": 7.131743609948628e-05, + "loss": 0.9265, + "step": 112680 + }, + { + "epoch": 0.7199442904054278, + "grad_norm": 0.7853860259056091, + "learning_rate": 7.131289720062167e-05, + "loss": 0.8856, + "step": 112690 + }, + { + "epoch": 0.7200081775551666, + "grad_norm": 1.2557804584503174, + "learning_rate": 7.130835808711773e-05, + "loss": 0.975, + "step": 112700 + }, + { + "epoch": 0.7200720647049053, + "grad_norm": 0.8270767331123352, + "learning_rate": 7.130381875902021e-05, + "loss": 1.0227, + "step": 112710 + }, + { + "epoch": 0.720135951854644, + "grad_norm": 1.5142698287963867, + "learning_rate": 7.12992792163748e-05, + "loss": 0.9033, + "step": 112720 + }, + { + "epoch": 0.7201998390043827, + "grad_norm": 0.9564334750175476, + "learning_rate": 7.129473945922722e-05, + "loss": 0.6761, + "step": 112730 + }, + { + "epoch": 0.7202637261541214, + "grad_norm": 0.5844874382019043, + "learning_rate": 7.129019948762319e-05, + "loss": 0.7452, + "step": 112740 + }, + { + "epoch": 0.7203276133038601, + "grad_norm": 0.7936009764671326, + 
"learning_rate": 7.128565930160844e-05, + "loss": 0.8532, + "step": 112750 + }, + { + "epoch": 0.7203915004535988, + "grad_norm": 1.0036197900772095, + "learning_rate": 7.128111890122868e-05, + "loss": 0.6713, + "step": 112760 + }, + { + "epoch": 0.7204553876033375, + "grad_norm": 0.6564218997955322, + "learning_rate": 7.127657828652964e-05, + "loss": 0.8136, + "step": 112770 + }, + { + "epoch": 0.7205192747530762, + "grad_norm": 0.7329919338226318, + "learning_rate": 7.127203745755705e-05, + "loss": 0.9779, + "step": 112780 + }, + { + "epoch": 0.7205831619028149, + "grad_norm": 0.9217239022254944, + "learning_rate": 7.126749641435664e-05, + "loss": 1.0803, + "step": 112790 + }, + { + "epoch": 0.7206470490525536, + "grad_norm": 1.523088812828064, + "learning_rate": 7.126295515697414e-05, + "loss": 0.8893, + "step": 112800 + }, + { + "epoch": 0.7207109362022923, + "grad_norm": 1.220182180404663, + "learning_rate": 7.125841368545529e-05, + "loss": 0.7783, + "step": 112810 + }, + { + "epoch": 0.720774823352031, + "grad_norm": 0.7349340319633484, + "learning_rate": 7.125387199984583e-05, + "loss": 0.8759, + "step": 112820 + }, + { + "epoch": 0.7208387105017696, + "grad_norm": 1.1350910663604736, + "learning_rate": 7.124933010019148e-05, + "loss": 0.8982, + "step": 112830 + }, + { + "epoch": 0.7209025976515083, + "grad_norm": 1.2227561473846436, + "learning_rate": 7.124478798653801e-05, + "loss": 0.6953, + "step": 112840 + }, + { + "epoch": 0.720966484801247, + "grad_norm": 1.2927758693695068, + "learning_rate": 7.124024565893112e-05, + "loss": 0.8957, + "step": 112850 + }, + { + "epoch": 0.7210303719509857, + "grad_norm": 0.8586512804031372, + "learning_rate": 7.12357031174166e-05, + "loss": 1.1933, + "step": 112860 + }, + { + "epoch": 0.7210942591007244, + "grad_norm": 1.5274994373321533, + "learning_rate": 7.123116036204017e-05, + "loss": 0.8434, + "step": 112870 + }, + { + "epoch": 0.7211581462504632, + "grad_norm": 0.8376038670539856, + "learning_rate": 7.122661739284759e-05, + "loss": 0.7802, + "step": 112880 + }, + { + "epoch": 0.7212220334002019, + "grad_norm": 0.9995211362838745, + "learning_rate": 7.122207420988462e-05, + "loss": 0.7681, + "step": 112890 + }, + { + "epoch": 0.7212859205499406, + "grad_norm": 1.5679831504821777, + "learning_rate": 7.121753081319699e-05, + "loss": 0.9598, + "step": 112900 + }, + { + "epoch": 0.7213498076996793, + "grad_norm": 1.7187330722808838, + "learning_rate": 7.121298720283048e-05, + "loss": 1.0863, + "step": 112910 + }, + { + "epoch": 0.721413694849418, + "grad_norm": 0.8918151259422302, + "learning_rate": 7.120844337883082e-05, + "loss": 1.0098, + "step": 112920 + }, + { + "epoch": 0.7214775819991567, + "grad_norm": 0.8101955056190491, + "learning_rate": 7.120389934124379e-05, + "loss": 1.092, + "step": 112930 + }, + { + "epoch": 0.7215414691488954, + "grad_norm": 0.7652488946914673, + "learning_rate": 7.119935509011516e-05, + "loss": 0.8215, + "step": 112940 + }, + { + "epoch": 0.7216053562986341, + "grad_norm": 1.972680687904358, + "learning_rate": 7.119481062549067e-05, + "loss": 0.8574, + "step": 112950 + }, + { + "epoch": 0.7216692434483728, + "grad_norm": 0.8050053715705872, + "learning_rate": 7.11902659474161e-05, + "loss": 1.347, + "step": 112960 + }, + { + "epoch": 0.7217331305981115, + "grad_norm": 1.1817753314971924, + "learning_rate": 7.118572105593725e-05, + "loss": 0.7859, + "step": 112970 + }, + { + "epoch": 0.7217970177478502, + "grad_norm": 0.73277747631073, + "learning_rate": 7.118117595109984e-05, + "loss": 0.8999, + "step": 
112980 + }, + { + "epoch": 0.7218609048975889, + "grad_norm": 0.7769888639450073, + "learning_rate": 7.117663063294965e-05, + "loss": 0.9172, + "step": 112990 + }, + { + "epoch": 0.7219247920473276, + "grad_norm": 1.4832031726837158, + "learning_rate": 7.117253966426993e-05, + "loss": 1.0567, + "step": 113000 + }, + { + "epoch": 0.7219886791970663, + "grad_norm": 0.9582386612892151, + "learning_rate": 7.11679939409516e-05, + "loss": 0.6723, + "step": 113010 + }, + { + "epoch": 0.722052566346805, + "grad_norm": 0.8009851574897766, + "learning_rate": 7.116344800445327e-05, + "loss": 0.8508, + "step": 113020 + }, + { + "epoch": 0.7221164534965437, + "grad_norm": 0.7712252736091614, + "learning_rate": 7.115890185482071e-05, + "loss": 0.7957, + "step": 113030 + }, + { + "epoch": 0.7221803406462824, + "grad_norm": 0.7053341865539551, + "learning_rate": 7.11543554920997e-05, + "loss": 0.8246, + "step": 113040 + }, + { + "epoch": 0.7222442277960212, + "grad_norm": 0.8619422912597656, + "learning_rate": 7.114980891633602e-05, + "loss": 1.0908, + "step": 113050 + }, + { + "epoch": 0.7223081149457599, + "grad_norm": 0.6670997738838196, + "learning_rate": 7.114526212757549e-05, + "loss": 0.8958, + "step": 113060 + }, + { + "epoch": 0.7223720020954986, + "grad_norm": 0.7335458397865295, + "learning_rate": 7.114071512586385e-05, + "loss": 1.0648, + "step": 113070 + }, + { + "epoch": 0.7224358892452372, + "grad_norm": 1.3357338905334473, + "learning_rate": 7.113616791124694e-05, + "loss": 0.7619, + "step": 113080 + }, + { + "epoch": 0.7224997763949759, + "grad_norm": 0.960849404335022, + "learning_rate": 7.113162048377053e-05, + "loss": 0.9608, + "step": 113090 + }, + { + "epoch": 0.7225636635447146, + "grad_norm": 0.47955894470214844, + "learning_rate": 7.112707284348042e-05, + "loss": 0.7415, + "step": 113100 + }, + { + "epoch": 0.7226275506944533, + "grad_norm": 0.6127536296844482, + "learning_rate": 7.11225249904224e-05, + "loss": 0.8654, + "step": 113110 + }, + { + "epoch": 0.722691437844192, + "grad_norm": 0.9961467385292053, + "learning_rate": 7.11179769246423e-05, + "loss": 0.8188, + "step": 113120 + }, + { + "epoch": 0.7227553249939307, + "grad_norm": 0.8931620121002197, + "learning_rate": 7.11134286461859e-05, + "loss": 0.7527, + "step": 113130 + }, + { + "epoch": 0.7228192121436694, + "grad_norm": 1.2289701700210571, + "learning_rate": 7.1108880155099e-05, + "loss": 0.7977, + "step": 113140 + }, + { + "epoch": 0.7228830992934081, + "grad_norm": 0.7578348517417908, + "learning_rate": 7.110433145142741e-05, + "loss": 1.0557, + "step": 113150 + }, + { + "epoch": 0.7229469864431468, + "grad_norm": 0.5664851069450378, + "learning_rate": 7.109978253521694e-05, + "loss": 0.6821, + "step": 113160 + }, + { + "epoch": 0.7230108735928855, + "grad_norm": 1.7400668859481812, + "learning_rate": 7.109523340651342e-05, + "loss": 0.993, + "step": 113170 + }, + { + "epoch": 0.7230747607426242, + "grad_norm": 0.9611324071884155, + "learning_rate": 7.109068406536265e-05, + "loss": 0.9319, + "step": 113180 + }, + { + "epoch": 0.7231386478923629, + "grad_norm": 0.6872121691703796, + "learning_rate": 7.108613451181043e-05, + "loss": 1.1291, + "step": 113190 + }, + { + "epoch": 0.7232025350421016, + "grad_norm": 0.8210169076919556, + "learning_rate": 7.108158474590261e-05, + "loss": 0.8069, + "step": 113200 + }, + { + "epoch": 0.7232664221918403, + "grad_norm": 1.7412443161010742, + "learning_rate": 7.107703476768497e-05, + "loss": 0.8441, + "step": 113210 + }, + { + "epoch": 0.723330309341579, + "grad_norm": 
0.7331858277320862, + "learning_rate": 7.107248457720337e-05, + "loss": 0.7081, + "step": 113220 + }, + { + "epoch": 0.7233941964913178, + "grad_norm": 1.0155048370361328, + "learning_rate": 7.106793417450362e-05, + "loss": 0.8548, + "step": 113230 + }, + { + "epoch": 0.7234580836410565, + "grad_norm": 0.6159857511520386, + "learning_rate": 7.106338355963155e-05, + "loss": 0.7694, + "step": 113240 + }, + { + "epoch": 0.7235219707907952, + "grad_norm": 0.9287189841270447, + "learning_rate": 7.105883273263298e-05, + "loss": 0.965, + "step": 113250 + }, + { + "epoch": 0.7235858579405339, + "grad_norm": 0.7032301425933838, + "learning_rate": 7.105428169355375e-05, + "loss": 0.7095, + "step": 113260 + }, + { + "epoch": 0.7236497450902726, + "grad_norm": 0.7181684970855713, + "learning_rate": 7.104973044243969e-05, + "loss": 1.1339, + "step": 113270 + }, + { + "epoch": 0.7237136322400113, + "grad_norm": 0.703015148639679, + "learning_rate": 7.104517897933662e-05, + "loss": 0.8453, + "step": 113280 + }, + { + "epoch": 0.72377751938975, + "grad_norm": 2.4534735679626465, + "learning_rate": 7.10406273042904e-05, + "loss": 0.8108, + "step": 113290 + }, + { + "epoch": 0.7238414065394887, + "grad_norm": 0.7906631231307983, + "learning_rate": 7.103607541734688e-05, + "loss": 1.0326, + "step": 113300 + }, + { + "epoch": 0.7239052936892274, + "grad_norm": 0.8502821922302246, + "learning_rate": 7.103152331855187e-05, + "loss": 0.8985, + "step": 113310 + }, + { + "epoch": 0.723969180838966, + "grad_norm": 1.1090575456619263, + "learning_rate": 7.102697100795122e-05, + "loss": 0.8659, + "step": 113320 + }, + { + "epoch": 0.7240330679887047, + "grad_norm": 1.2656605243682861, + "learning_rate": 7.102241848559077e-05, + "loss": 0.7826, + "step": 113330 + }, + { + "epoch": 0.7240969551384434, + "grad_norm": 0.6625283360481262, + "learning_rate": 7.101786575151639e-05, + "loss": 0.7639, + "step": 113340 + }, + { + "epoch": 0.7241608422881821, + "grad_norm": 0.988913357257843, + "learning_rate": 7.101331280577392e-05, + "loss": 1.0447, + "step": 113350 + }, + { + "epoch": 0.7242247294379208, + "grad_norm": 1.8037471771240234, + "learning_rate": 7.100875964840922e-05, + "loss": 1.0335, + "step": 113360 + }, + { + "epoch": 0.7242886165876595, + "grad_norm": 0.8089826107025146, + "learning_rate": 7.100420627946812e-05, + "loss": 0.8345, + "step": 113370 + }, + { + "epoch": 0.7243525037373982, + "grad_norm": 0.894549548625946, + "learning_rate": 7.099965269899648e-05, + "loss": 0.9204, + "step": 113380 + }, + { + "epoch": 0.7244163908871369, + "grad_norm": 0.7039246559143066, + "learning_rate": 7.099509890704019e-05, + "loss": 0.6761, + "step": 113390 + }, + { + "epoch": 0.7244802780368756, + "grad_norm": 0.907172679901123, + "learning_rate": 7.09905449036451e-05, + "loss": 0.6746, + "step": 113400 + }, + { + "epoch": 0.7245441651866144, + "grad_norm": 0.6476519107818604, + "learning_rate": 7.098599068885704e-05, + "loss": 0.9781, + "step": 113410 + }, + { + "epoch": 0.7246080523363531, + "grad_norm": 0.5977177023887634, + "learning_rate": 7.098143626272192e-05, + "loss": 0.856, + "step": 113420 + }, + { + "epoch": 0.7246719394860918, + "grad_norm": 1.3149287700653076, + "learning_rate": 7.097688162528556e-05, + "loss": 0.7815, + "step": 113430 + }, + { + "epoch": 0.7247358266358305, + "grad_norm": 0.7405048608779907, + "learning_rate": 7.097232677659387e-05, + "loss": 0.895, + "step": 113440 + }, + { + "epoch": 0.7247997137855692, + "grad_norm": 0.8446990847587585, + "learning_rate": 7.09677717166927e-05, + "loss": 
0.6864, + "step": 113450 + }, + { + "epoch": 0.7248636009353079, + "grad_norm": 0.761848509311676, + "learning_rate": 7.096321644562793e-05, + "loss": 0.8818, + "step": 113460 + }, + { + "epoch": 0.7249274880850466, + "grad_norm": 0.8329225778579712, + "learning_rate": 7.095866096344544e-05, + "loss": 0.8544, + "step": 113470 + }, + { + "epoch": 0.7249913752347853, + "grad_norm": 1.2276092767715454, + "learning_rate": 7.095410527019111e-05, + "loss": 0.8486, + "step": 113480 + }, + { + "epoch": 0.725055262384524, + "grad_norm": 1.3161121606826782, + "learning_rate": 7.094954936591081e-05, + "loss": 0.7856, + "step": 113490 + }, + { + "epoch": 0.7251191495342627, + "grad_norm": 1.220802664756775, + "learning_rate": 7.09449932506504e-05, + "loss": 0.8293, + "step": 113500 + }, + { + "epoch": 0.7251830366840014, + "grad_norm": 1.623045802116394, + "learning_rate": 7.094043692445581e-05, + "loss": 1.4337, + "step": 113510 + }, + { + "epoch": 0.7252469238337401, + "grad_norm": 1.979931116104126, + "learning_rate": 7.09358803873729e-05, + "loss": 0.7465, + "step": 113520 + }, + { + "epoch": 0.7253108109834788, + "grad_norm": 1.0702683925628662, + "learning_rate": 7.093132363944756e-05, + "loss": 0.7137, + "step": 113530 + }, + { + "epoch": 0.7253746981332175, + "grad_norm": 0.940356969833374, + "learning_rate": 7.092676668072569e-05, + "loss": 0.9097, + "step": 113540 + }, + { + "epoch": 0.7254385852829562, + "grad_norm": 0.7708202004432678, + "learning_rate": 7.092220951125315e-05, + "loss": 1.0073, + "step": 113550 + }, + { + "epoch": 0.7255024724326948, + "grad_norm": 1.922377347946167, + "learning_rate": 7.091765213107589e-05, + "loss": 0.77, + "step": 113560 + }, + { + "epoch": 0.7255663595824335, + "grad_norm": 0.671876847743988, + "learning_rate": 7.091309454023976e-05, + "loss": 0.6341, + "step": 113570 + }, + { + "epoch": 0.7256302467321722, + "grad_norm": 0.9415796995162964, + "learning_rate": 7.090853673879068e-05, + "loss": 0.6898, + "step": 113580 + }, + { + "epoch": 0.725694133881911, + "grad_norm": 0.5804014205932617, + "learning_rate": 7.090397872677455e-05, + "loss": 0.8133, + "step": 113590 + }, + { + "epoch": 0.7257580210316497, + "grad_norm": 1.0379526615142822, + "learning_rate": 7.089942050423725e-05, + "loss": 0.855, + "step": 113600 + }, + { + "epoch": 0.7258219081813884, + "grad_norm": 0.6411370635032654, + "learning_rate": 7.089486207122474e-05, + "loss": 0.7423, + "step": 113610 + }, + { + "epoch": 0.7258857953311271, + "grad_norm": 2.031083106994629, + "learning_rate": 7.089030342778288e-05, + "loss": 0.7442, + "step": 113620 + }, + { + "epoch": 0.7259496824808658, + "grad_norm": 0.9208039045333862, + "learning_rate": 7.088574457395758e-05, + "loss": 0.8881, + "step": 113630 + }, + { + "epoch": 0.7260135696306045, + "grad_norm": 1.0437136888504028, + "learning_rate": 7.088118550979477e-05, + "loss": 1.015, + "step": 113640 + }, + { + "epoch": 0.7260774567803432, + "grad_norm": 1.4243861436843872, + "learning_rate": 7.087662623534036e-05, + "loss": 0.7728, + "step": 113650 + }, + { + "epoch": 0.7261413439300819, + "grad_norm": 1.2670698165893555, + "learning_rate": 7.087206675064026e-05, + "loss": 0.9942, + "step": 113660 + }, + { + "epoch": 0.7262052310798206, + "grad_norm": 0.9617191553115845, + "learning_rate": 7.086750705574038e-05, + "loss": 1.0011, + "step": 113670 + }, + { + "epoch": 0.7262691182295593, + "grad_norm": 1.161468505859375, + "learning_rate": 7.086294715068667e-05, + "loss": 1.22, + "step": 113680 + }, + { + "epoch": 0.726333005379298, + 
"grad_norm": 1.2413885593414307, + "learning_rate": 7.085838703552503e-05, + "loss": 0.9355, + "step": 113690 + }, + { + "epoch": 0.7263968925290367, + "grad_norm": 0.607776403427124, + "learning_rate": 7.085382671030138e-05, + "loss": 1.0288, + "step": 113700 + }, + { + "epoch": 0.7264607796787754, + "grad_norm": 1.1839098930358887, + "learning_rate": 7.084926617506166e-05, + "loss": 1.0052, + "step": 113710 + }, + { + "epoch": 0.7265246668285141, + "grad_norm": 0.5576828718185425, + "learning_rate": 7.084470542985178e-05, + "loss": 1.1483, + "step": 113720 + }, + { + "epoch": 0.7265885539782528, + "grad_norm": 0.6610636711120605, + "learning_rate": 7.084014447471769e-05, + "loss": 0.777, + "step": 113730 + }, + { + "epoch": 0.7266524411279915, + "grad_norm": 0.726379930973053, + "learning_rate": 7.083558330970532e-05, + "loss": 0.7009, + "step": 113740 + }, + { + "epoch": 0.7267163282777303, + "grad_norm": 0.8821578621864319, + "learning_rate": 7.083102193486058e-05, + "loss": 1.1292, + "step": 113750 + }, + { + "epoch": 0.726780215427469, + "grad_norm": 0.8552307486534119, + "learning_rate": 7.082646035022946e-05, + "loss": 0.936, + "step": 113760 + }, + { + "epoch": 0.7268441025772077, + "grad_norm": 0.9332167506217957, + "learning_rate": 7.082189855585784e-05, + "loss": 0.9092, + "step": 113770 + }, + { + "epoch": 0.7269079897269464, + "grad_norm": 0.7814182043075562, + "learning_rate": 7.081733655179171e-05, + "loss": 1.0203, + "step": 113780 + }, + { + "epoch": 0.7269718768766851, + "grad_norm": 1.0449836254119873, + "learning_rate": 7.081277433807697e-05, + "loss": 0.8905, + "step": 113790 + }, + { + "epoch": 0.7270357640264237, + "grad_norm": 0.9948442578315735, + "learning_rate": 7.080821191475962e-05, + "loss": 0.8614, + "step": 113800 + }, + { + "epoch": 0.7270996511761624, + "grad_norm": 0.6735957860946655, + "learning_rate": 7.080364928188555e-05, + "loss": 0.7434, + "step": 113810 + }, + { + "epoch": 0.7271635383259011, + "grad_norm": 1.021897554397583, + "learning_rate": 7.079908643950072e-05, + "loss": 0.8579, + "step": 113820 + }, + { + "epoch": 0.7272274254756398, + "grad_norm": 0.7740781903266907, + "learning_rate": 7.079452338765112e-05, + "loss": 0.9425, + "step": 113830 + }, + { + "epoch": 0.7272913126253785, + "grad_norm": 0.5648607015609741, + "learning_rate": 7.078996012638268e-05, + "loss": 0.8408, + "step": 113840 + }, + { + "epoch": 0.7273551997751172, + "grad_norm": 0.5744165182113647, + "learning_rate": 7.078539665574135e-05, + "loss": 0.8827, + "step": 113850 + }, + { + "epoch": 0.7274190869248559, + "grad_norm": 0.5075027346611023, + "learning_rate": 7.07808329757731e-05, + "loss": 0.741, + "step": 113860 + }, + { + "epoch": 0.7274829740745946, + "grad_norm": 2.1645233631134033, + "learning_rate": 7.077626908652387e-05, + "loss": 0.8138, + "step": 113870 + }, + { + "epoch": 0.7275468612243333, + "grad_norm": 1.198243498802185, + "learning_rate": 7.077170498803964e-05, + "loss": 1.0056, + "step": 113880 + }, + { + "epoch": 0.727610748374072, + "grad_norm": 0.8382686376571655, + "learning_rate": 7.076714068036639e-05, + "loss": 0.8976, + "step": 113890 + }, + { + "epoch": 0.7276746355238107, + "grad_norm": 0.751120388507843, + "learning_rate": 7.076257616355003e-05, + "loss": 0.918, + "step": 113900 + }, + { + "epoch": 0.7277385226735494, + "grad_norm": 0.7622794508934021, + "learning_rate": 7.075801143763658e-05, + "loss": 1.125, + "step": 113910 + }, + { + "epoch": 0.7278024098232881, + "grad_norm": 0.7073959112167358, + "learning_rate": 
7.0753446502672e-05, + "loss": 0.8341, + "step": 113920 + }, + { + "epoch": 0.7278662969730268, + "grad_norm": 0.6458455324172974, + "learning_rate": 7.074888135870227e-05, + "loss": 1.1074, + "step": 113930 + }, + { + "epoch": 0.7279301841227656, + "grad_norm": 0.837853729724884, + "learning_rate": 7.074431600577335e-05, + "loss": 0.8365, + "step": 113940 + }, + { + "epoch": 0.7279940712725043, + "grad_norm": 0.8658714890480042, + "learning_rate": 7.073975044393121e-05, + "loss": 0.9752, + "step": 113950 + }, + { + "epoch": 0.728057958422243, + "grad_norm": 1.0802479982376099, + "learning_rate": 7.073518467322186e-05, + "loss": 0.7144, + "step": 113960 + }, + { + "epoch": 0.7281218455719817, + "grad_norm": 0.8560570478439331, + "learning_rate": 7.073061869369124e-05, + "loss": 0.6664, + "step": 113970 + }, + { + "epoch": 0.7281857327217204, + "grad_norm": 0.9239840507507324, + "learning_rate": 7.072605250538536e-05, + "loss": 1.1588, + "step": 113980 + }, + { + "epoch": 0.7282496198714591, + "grad_norm": 0.7781822085380554, + "learning_rate": 7.07214861083502e-05, + "loss": 1.1213, + "step": 113990 + }, + { + "epoch": 0.7283135070211978, + "grad_norm": 0.6489850282669067, + "learning_rate": 7.071691950263177e-05, + "loss": 0.9027, + "step": 114000 + }, + { + "epoch": 0.7283773941709365, + "grad_norm": 0.8504793047904968, + "learning_rate": 7.071235268827601e-05, + "loss": 1.1383, + "step": 114010 + }, + { + "epoch": 0.7284412813206752, + "grad_norm": 0.9675794243812561, + "learning_rate": 7.070778566532896e-05, + "loss": 1.1451, + "step": 114020 + }, + { + "epoch": 0.7285051684704139, + "grad_norm": 0.9038040041923523, + "learning_rate": 7.070321843383659e-05, + "loss": 0.9998, + "step": 114030 + }, + { + "epoch": 0.7285690556201526, + "grad_norm": 0.6558981537818909, + "learning_rate": 7.06986509938449e-05, + "loss": 0.8363, + "step": 114040 + }, + { + "epoch": 0.7286329427698912, + "grad_norm": 0.9019342660903931, + "learning_rate": 7.069408334539987e-05, + "loss": 0.992, + "step": 114050 + }, + { + "epoch": 0.7286968299196299, + "grad_norm": 0.7221403121948242, + "learning_rate": 7.068951548854755e-05, + "loss": 0.8516, + "step": 114060 + }, + { + "epoch": 0.7287607170693686, + "grad_norm": 1.507217288017273, + "learning_rate": 7.068494742333388e-05, + "loss": 0.7527, + "step": 114070 + }, + { + "epoch": 0.7288246042191073, + "grad_norm": 0.8516684770584106, + "learning_rate": 7.06803791498049e-05, + "loss": 0.6731, + "step": 114080 + }, + { + "epoch": 0.728888491368846, + "grad_norm": 1.2646251916885376, + "learning_rate": 7.067581066800661e-05, + "loss": 0.8718, + "step": 114090 + }, + { + "epoch": 0.7289523785185847, + "grad_norm": 0.7315905094146729, + "learning_rate": 7.067124197798504e-05, + "loss": 1.0181, + "step": 114100 + }, + { + "epoch": 0.7290162656683234, + "grad_norm": 1.3411294221878052, + "learning_rate": 7.066667307978617e-05, + "loss": 0.8343, + "step": 114110 + }, + { + "epoch": 0.7290801528180622, + "grad_norm": 1.737226963043213, + "learning_rate": 7.0662103973456e-05, + "loss": 0.9154, + "step": 114120 + }, + { + "epoch": 0.7291440399678009, + "grad_norm": 0.9302464127540588, + "learning_rate": 7.065753465904059e-05, + "loss": 0.7609, + "step": 114130 + }, + { + "epoch": 0.7292079271175396, + "grad_norm": 0.7784197926521301, + "learning_rate": 7.065296513658594e-05, + "loss": 1.1499, + "step": 114140 + }, + { + "epoch": 0.7292718142672783, + "grad_norm": 1.0966049432754517, + "learning_rate": 7.064839540613805e-05, + "loss": 0.8757, + "step": 114150 + }, + { + 
"epoch": 0.729335701417017, + "grad_norm": 0.7327684760093689, + "learning_rate": 7.064382546774297e-05, + "loss": 0.8503, + "step": 114160 + }, + { + "epoch": 0.7293995885667557, + "grad_norm": 0.817319393157959, + "learning_rate": 7.063925532144668e-05, + "loss": 1.0078, + "step": 114170 + }, + { + "epoch": 0.7294634757164944, + "grad_norm": 0.5275333523750305, + "learning_rate": 7.063468496729526e-05, + "loss": 1.1289, + "step": 114180 + }, + { + "epoch": 0.7295273628662331, + "grad_norm": 0.9112656712532043, + "learning_rate": 7.06301144053347e-05, + "loss": 0.9515, + "step": 114190 + }, + { + "epoch": 0.7295912500159718, + "grad_norm": 0.7202227711677551, + "learning_rate": 7.062554363561105e-05, + "loss": 1.0629, + "step": 114200 + }, + { + "epoch": 0.7296551371657105, + "grad_norm": 0.6695742011070251, + "learning_rate": 7.062097265817031e-05, + "loss": 0.8514, + "step": 114210 + }, + { + "epoch": 0.7297190243154492, + "grad_norm": 1.3881930112838745, + "learning_rate": 7.061640147305856e-05, + "loss": 0.8752, + "step": 114220 + }, + { + "epoch": 0.7297829114651879, + "grad_norm": 0.9712892770767212, + "learning_rate": 7.06118300803218e-05, + "loss": 0.8757, + "step": 114230 + }, + { + "epoch": 0.7298467986149266, + "grad_norm": 0.7274371981620789, + "learning_rate": 7.060725848000607e-05, + "loss": 0.8209, + "step": 114240 + }, + { + "epoch": 0.7299106857646653, + "grad_norm": 1.0749263763427734, + "learning_rate": 7.060268667215743e-05, + "loss": 1.059, + "step": 114250 + }, + { + "epoch": 0.729974572914404, + "grad_norm": 0.828329861164093, + "learning_rate": 7.059811465682192e-05, + "loss": 1.3196, + "step": 114260 + }, + { + "epoch": 0.7300384600641427, + "grad_norm": 1.0756471157073975, + "learning_rate": 7.059354243404555e-05, + "loss": 0.9988, + "step": 114270 + }, + { + "epoch": 0.7301023472138815, + "grad_norm": 1.016433835029602, + "learning_rate": 7.05889700038744e-05, + "loss": 0.6839, + "step": 114280 + }, + { + "epoch": 0.73016623436362, + "grad_norm": 0.9101114273071289, + "learning_rate": 7.058439736635454e-05, + "loss": 1.0044, + "step": 114290 + }, + { + "epoch": 0.7302301215133588, + "grad_norm": 1.0833832025527954, + "learning_rate": 7.057982452153196e-05, + "loss": 1.0722, + "step": 114300 + }, + { + "epoch": 0.7302940086630975, + "grad_norm": 1.2768323421478271, + "learning_rate": 7.057525146945276e-05, + "loss": 1.0266, + "step": 114310 + }, + { + "epoch": 0.7303578958128362, + "grad_norm": 0.776053786277771, + "learning_rate": 7.057067821016297e-05, + "loss": 0.9476, + "step": 114320 + }, + { + "epoch": 0.7304217829625749, + "grad_norm": 0.6913490295410156, + "learning_rate": 7.056610474370865e-05, + "loss": 0.8089, + "step": 114330 + }, + { + "epoch": 0.7304856701123136, + "grad_norm": 0.6486290693283081, + "learning_rate": 7.056153107013588e-05, + "loss": 0.813, + "step": 114340 + }, + { + "epoch": 0.7305495572620523, + "grad_norm": 0.7323374152183533, + "learning_rate": 7.05569571894907e-05, + "loss": 0.9397, + "step": 114350 + }, + { + "epoch": 0.730613444411791, + "grad_norm": 1.247586727142334, + "learning_rate": 7.055238310181915e-05, + "loss": 0.8101, + "step": 114360 + }, + { + "epoch": 0.7306773315615297, + "grad_norm": 1.2062445878982544, + "learning_rate": 7.054780880716733e-05, + "loss": 0.7138, + "step": 114370 + }, + { + "epoch": 0.7307412187112684, + "grad_norm": 2.156268358230591, + "learning_rate": 7.054323430558132e-05, + "loss": 0.8173, + "step": 114380 + }, + { + "epoch": 0.7308051058610071, + "grad_norm": 1.4800580739974976, + 
"learning_rate": 7.053865959710717e-05, + "loss": 0.8313, + "step": 114390 + }, + { + "epoch": 0.7308689930107458, + "grad_norm": 0.7692814469337463, + "learning_rate": 7.053408468179093e-05, + "loss": 0.7696, + "step": 114400 + }, + { + "epoch": 0.7309328801604845, + "grad_norm": 1.617647409439087, + "learning_rate": 7.052950955967869e-05, + "loss": 0.8307, + "step": 114410 + }, + { + "epoch": 0.7309967673102232, + "grad_norm": 1.1893336772918701, + "learning_rate": 7.052493423081655e-05, + "loss": 0.8407, + "step": 114420 + }, + { + "epoch": 0.7310606544599619, + "grad_norm": 1.0456907749176025, + "learning_rate": 7.052035869525053e-05, + "loss": 1.0971, + "step": 114430 + }, + { + "epoch": 0.7311245416097006, + "grad_norm": 0.8813536167144775, + "learning_rate": 7.051578295302676e-05, + "loss": 0.9145, + "step": 114440 + }, + { + "epoch": 0.7311884287594393, + "grad_norm": 0.9212775826454163, + "learning_rate": 7.051120700419131e-05, + "loss": 0.9651, + "step": 114450 + }, + { + "epoch": 0.731252315909178, + "grad_norm": 0.7198718786239624, + "learning_rate": 7.050663084879027e-05, + "loss": 0.844, + "step": 114460 + }, + { + "epoch": 0.7313162030589168, + "grad_norm": 0.91295325756073, + "learning_rate": 7.050205448686971e-05, + "loss": 0.7403, + "step": 114470 + }, + { + "epoch": 0.7313800902086555, + "grad_norm": 1.0438035726547241, + "learning_rate": 7.049747791847574e-05, + "loss": 0.893, + "step": 114480 + }, + { + "epoch": 0.7314439773583942, + "grad_norm": 0.6840182542800903, + "learning_rate": 7.049290114365441e-05, + "loss": 0.7915, + "step": 114490 + }, + { + "epoch": 0.7315078645081329, + "grad_norm": 0.7132487893104553, + "learning_rate": 7.048832416245185e-05, + "loss": 1.0361, + "step": 114500 + }, + { + "epoch": 0.7315717516578716, + "grad_norm": 1.2934064865112305, + "learning_rate": 7.048374697491414e-05, + "loss": 0.9494, + "step": 114510 + }, + { + "epoch": 0.7316356388076103, + "grad_norm": 1.7626994848251343, + "learning_rate": 7.047916958108737e-05, + "loss": 1.0967, + "step": 114520 + }, + { + "epoch": 0.7316995259573489, + "grad_norm": 1.077722191810608, + "learning_rate": 7.047459198101766e-05, + "loss": 0.8701, + "step": 114530 + }, + { + "epoch": 0.7317634131070876, + "grad_norm": 0.6629199981689453, + "learning_rate": 7.047001417475109e-05, + "loss": 0.9575, + "step": 114540 + }, + { + "epoch": 0.7318273002568263, + "grad_norm": 0.9705789089202881, + "learning_rate": 7.046543616233376e-05, + "loss": 0.6918, + "step": 114550 + }, + { + "epoch": 0.731891187406565, + "grad_norm": 0.897789478302002, + "learning_rate": 7.046085794381179e-05, + "loss": 0.9453, + "step": 114560 + }, + { + "epoch": 0.7319550745563037, + "grad_norm": 0.7040274143218994, + "learning_rate": 7.045627951923127e-05, + "loss": 0.9701, + "step": 114570 + }, + { + "epoch": 0.7320189617060424, + "grad_norm": 1.2493051290512085, + "learning_rate": 7.045170088863834e-05, + "loss": 1.0576, + "step": 114580 + }, + { + "epoch": 0.7320828488557811, + "grad_norm": 1.6483665704727173, + "learning_rate": 7.044712205207907e-05, + "loss": 0.9102, + "step": 114590 + }, + { + "epoch": 0.7321467360055198, + "grad_norm": 0.8068165183067322, + "learning_rate": 7.044254300959958e-05, + "loss": 0.9693, + "step": 114600 + }, + { + "epoch": 0.7322106231552585, + "grad_norm": 1.0065369606018066, + "learning_rate": 7.043796376124602e-05, + "loss": 1.0352, + "step": 114610 + }, + { + "epoch": 0.7322745103049972, + "grad_norm": 0.5376549959182739, + "learning_rate": 7.043338430706448e-05, + "loss": 1.0502, + "step": 
114620 + }, + { + "epoch": 0.732338397454736, + "grad_norm": 0.7183248400688171, + "learning_rate": 7.042880464710106e-05, + "loss": 0.6899, + "step": 114630 + }, + { + "epoch": 0.7324022846044747, + "grad_norm": 0.9804075956344604, + "learning_rate": 7.042422478140194e-05, + "loss": 0.9902, + "step": 114640 + }, + { + "epoch": 0.7324661717542134, + "grad_norm": 1.1558316946029663, + "learning_rate": 7.041964471001318e-05, + "loss": 0.8413, + "step": 114650 + }, + { + "epoch": 0.7325300589039521, + "grad_norm": 0.6205730438232422, + "learning_rate": 7.041506443298093e-05, + "loss": 0.8944, + "step": 114660 + }, + { + "epoch": 0.7325939460536908, + "grad_norm": 1.1276192665100098, + "learning_rate": 7.041048395035135e-05, + "loss": 0.7895, + "step": 114670 + }, + { + "epoch": 0.7326578332034295, + "grad_norm": 0.9537439942359924, + "learning_rate": 7.040590326217052e-05, + "loss": 0.875, + "step": 114680 + }, + { + "epoch": 0.7327217203531682, + "grad_norm": 0.7265706062316895, + "learning_rate": 7.040132236848457e-05, + "loss": 1.1706, + "step": 114690 + }, + { + "epoch": 0.7327856075029069, + "grad_norm": 0.9442006945610046, + "learning_rate": 7.039674126933969e-05, + "loss": 0.9093, + "step": 114700 + }, + { + "epoch": 0.7328494946526456, + "grad_norm": 1.7052466869354248, + "learning_rate": 7.039215996478195e-05, + "loss": 0.7726, + "step": 114710 + }, + { + "epoch": 0.7329133818023843, + "grad_norm": 0.691369891166687, + "learning_rate": 7.038757845485754e-05, + "loss": 0.7376, + "step": 114720 + }, + { + "epoch": 0.732977268952123, + "grad_norm": 1.4131755828857422, + "learning_rate": 7.038299673961258e-05, + "loss": 1.1266, + "step": 114730 + }, + { + "epoch": 0.7330411561018617, + "grad_norm": 1.0800511837005615, + "learning_rate": 7.037841481909319e-05, + "loss": 0.8857, + "step": 114740 + }, + { + "epoch": 0.7331050432516004, + "grad_norm": 0.9274218678474426, + "learning_rate": 7.037383269334555e-05, + "loss": 0.9862, + "step": 114750 + }, + { + "epoch": 0.7331689304013391, + "grad_norm": 1.05172860622406, + "learning_rate": 7.036925036241578e-05, + "loss": 1.024, + "step": 114760 + }, + { + "epoch": 0.7332328175510778, + "grad_norm": 1.1139484643936157, + "learning_rate": 7.036466782635003e-05, + "loss": 0.7455, + "step": 114770 + }, + { + "epoch": 0.7332967047008164, + "grad_norm": 0.9190326929092407, + "learning_rate": 7.036008508519446e-05, + "loss": 0.7979, + "step": 114780 + }, + { + "epoch": 0.7333605918505551, + "grad_norm": 0.7148388028144836, + "learning_rate": 7.03555021389952e-05, + "loss": 0.8942, + "step": 114790 + }, + { + "epoch": 0.7334244790002938, + "grad_norm": 1.5679404735565186, + "learning_rate": 7.035091898779846e-05, + "loss": 0.7687, + "step": 114800 + }, + { + "epoch": 0.7334883661500325, + "grad_norm": 1.5291404724121094, + "learning_rate": 7.034633563165034e-05, + "loss": 0.7391, + "step": 114810 + }, + { + "epoch": 0.7335522532997713, + "grad_norm": 1.019393801689148, + "learning_rate": 7.034175207059704e-05, + "loss": 0.7867, + "step": 114820 + }, + { + "epoch": 0.73361614044951, + "grad_norm": 0.5086638331413269, + "learning_rate": 7.033716830468467e-05, + "loss": 0.8308, + "step": 114830 + }, + { + "epoch": 0.7336800275992487, + "grad_norm": 0.7575461268424988, + "learning_rate": 7.033258433395944e-05, + "loss": 0.6507, + "step": 114840 + }, + { + "epoch": 0.7337439147489874, + "grad_norm": 1.0149903297424316, + "learning_rate": 7.032800015846749e-05, + "loss": 0.9033, + "step": 114850 + }, + { + "epoch": 0.7338078018987261, + "grad_norm": 
1.2490330934524536, + "learning_rate": 7.032341577825499e-05, + "loss": 0.7165, + "step": 114860 + }, + { + "epoch": 0.7338716890484648, + "grad_norm": 0.9309149384498596, + "learning_rate": 7.031883119336811e-05, + "loss": 0.7779, + "step": 114870 + }, + { + "epoch": 0.7339355761982035, + "grad_norm": 1.0880928039550781, + "learning_rate": 7.031424640385303e-05, + "loss": 0.8678, + "step": 114880 + }, + { + "epoch": 0.7339994633479422, + "grad_norm": 1.2819446325302124, + "learning_rate": 7.03096614097559e-05, + "loss": 0.9925, + "step": 114890 + }, + { + "epoch": 0.7340633504976809, + "grad_norm": 0.588641345500946, + "learning_rate": 7.030507621112293e-05, + "loss": 0.6344, + "step": 114900 + }, + { + "epoch": 0.7341272376474196, + "grad_norm": 0.7878732085227966, + "learning_rate": 7.030049080800025e-05, + "loss": 0.7343, + "step": 114910 + }, + { + "epoch": 0.7341911247971583, + "grad_norm": 1.4143778085708618, + "learning_rate": 7.029590520043409e-05, + "loss": 0.7973, + "step": 114920 + }, + { + "epoch": 0.734255011946897, + "grad_norm": 2.662449598312378, + "learning_rate": 7.02913193884706e-05, + "loss": 0.8654, + "step": 114930 + }, + { + "epoch": 0.7343188990966357, + "grad_norm": 0.9117518663406372, + "learning_rate": 7.028673337215596e-05, + "loss": 0.8811, + "step": 114940 + }, + { + "epoch": 0.7343827862463744, + "grad_norm": 1.0859434604644775, + "learning_rate": 7.028214715153636e-05, + "loss": 0.8603, + "step": 114950 + }, + { + "epoch": 0.7344466733961131, + "grad_norm": 0.9327182173728943, + "learning_rate": 7.027756072665798e-05, + "loss": 1.0731, + "step": 114960 + }, + { + "epoch": 0.7345105605458518, + "grad_norm": 0.760019838809967, + "learning_rate": 7.027297409756706e-05, + "loss": 0.8329, + "step": 114970 + }, + { + "epoch": 0.7345744476955905, + "grad_norm": 1.3158267736434937, + "learning_rate": 7.026838726430972e-05, + "loss": 1.0469, + "step": 114980 + }, + { + "epoch": 0.7346383348453293, + "grad_norm": 0.855215311050415, + "learning_rate": 7.026380022693219e-05, + "loss": 1.0875, + "step": 114990 + }, + { + "epoch": 0.734702221995068, + "grad_norm": 0.7679759860038757, + "learning_rate": 7.025921298548069e-05, + "loss": 0.7983, + "step": 115000 + }, + { + "epoch": 0.7347661091448067, + "grad_norm": 0.7040578722953796, + "learning_rate": 7.025462554000136e-05, + "loss": 0.8196, + "step": 115010 + }, + { + "epoch": 0.7348299962945453, + "grad_norm": 0.7212196588516235, + "learning_rate": 7.025003789054044e-05, + "loss": 0.9989, + "step": 115020 + }, + { + "epoch": 0.734893883444284, + "grad_norm": 0.892850935459137, + "learning_rate": 7.024545003714411e-05, + "loss": 1.2155, + "step": 115030 + }, + { + "epoch": 0.7349577705940227, + "grad_norm": 0.9270761013031006, + "learning_rate": 7.02408619798586e-05, + "loss": 0.8714, + "step": 115040 + }, + { + "epoch": 0.7350216577437614, + "grad_norm": 0.827022910118103, + "learning_rate": 7.023627371873008e-05, + "loss": 0.7664, + "step": 115050 + }, + { + "epoch": 0.7350855448935001, + "grad_norm": 0.9258151650428772, + "learning_rate": 7.023168525380479e-05, + "loss": 0.8016, + "step": 115060 + }, + { + "epoch": 0.7351494320432388, + "grad_norm": 0.732083797454834, + "learning_rate": 7.022709658512892e-05, + "loss": 0.7217, + "step": 115070 + }, + { + "epoch": 0.7352133191929775, + "grad_norm": 1.1232346296310425, + "learning_rate": 7.02225077127487e-05, + "loss": 0.9083, + "step": 115080 + }, + { + "epoch": 0.7352772063427162, + "grad_norm": 0.9207686185836792, + "learning_rate": 7.021791863671032e-05, + "loss": 
0.9492, + "step": 115090 + }, + { + "epoch": 0.7353410934924549, + "grad_norm": 1.1235870122909546, + "learning_rate": 7.021332935706e-05, + "loss": 1.0061, + "step": 115100 + }, + { + "epoch": 0.7354049806421936, + "grad_norm": 0.5921577215194702, + "learning_rate": 7.020873987384398e-05, + "loss": 1.0309, + "step": 115110 + }, + { + "epoch": 0.7354688677919323, + "grad_norm": 0.9022099375724792, + "learning_rate": 7.020415018710846e-05, + "loss": 0.956, + "step": 115120 + }, + { + "epoch": 0.735532754941671, + "grad_norm": 0.7757014036178589, + "learning_rate": 7.019956029689968e-05, + "loss": 1.2207, + "step": 115130 + }, + { + "epoch": 0.7355966420914097, + "grad_norm": 0.7293660640716553, + "learning_rate": 7.019497020326384e-05, + "loss": 0.8932, + "step": 115140 + }, + { + "epoch": 0.7356605292411484, + "grad_norm": 0.7669858932495117, + "learning_rate": 7.019037990624718e-05, + "loss": 0.8368, + "step": 115150 + }, + { + "epoch": 0.7357244163908871, + "grad_norm": 1.5827726125717163, + "learning_rate": 7.018578940589592e-05, + "loss": 0.7371, + "step": 115160 + }, + { + "epoch": 0.7357883035406259, + "grad_norm": 1.0692715644836426, + "learning_rate": 7.018119870225632e-05, + "loss": 0.9331, + "step": 115170 + }, + { + "epoch": 0.7358521906903646, + "grad_norm": 1.1816247701644897, + "learning_rate": 7.017660779537458e-05, + "loss": 0.7824, + "step": 115180 + }, + { + "epoch": 0.7359160778401033, + "grad_norm": 1.0669434070587158, + "learning_rate": 7.017201668529695e-05, + "loss": 0.8516, + "step": 115190 + }, + { + "epoch": 0.735979964989842, + "grad_norm": 1.2087671756744385, + "learning_rate": 7.016742537206965e-05, + "loss": 1.0443, + "step": 115200 + }, + { + "epoch": 0.7360438521395807, + "grad_norm": 0.8520811200141907, + "learning_rate": 7.016283385573893e-05, + "loss": 0.7544, + "step": 115210 + }, + { + "epoch": 0.7361077392893194, + "grad_norm": 0.6800863146781921, + "learning_rate": 7.015824213635104e-05, + "loss": 1.0469, + "step": 115220 + }, + { + "epoch": 0.7361716264390581, + "grad_norm": 1.1106778383255005, + "learning_rate": 7.01536502139522e-05, + "loss": 1.0884, + "step": 115230 + }, + { + "epoch": 0.7362355135887968, + "grad_norm": 1.0716586112976074, + "learning_rate": 7.014905808858868e-05, + "loss": 0.8972, + "step": 115240 + }, + { + "epoch": 0.7362994007385355, + "grad_norm": 0.961650550365448, + "learning_rate": 7.01444657603067e-05, + "loss": 0.7001, + "step": 115250 + }, + { + "epoch": 0.7363632878882741, + "grad_norm": 0.7025936245918274, + "learning_rate": 7.013987322915252e-05, + "loss": 0.9839, + "step": 115260 + }, + { + "epoch": 0.7364271750380128, + "grad_norm": 0.7885773181915283, + "learning_rate": 7.013528049517241e-05, + "loss": 0.8556, + "step": 115270 + }, + { + "epoch": 0.7364910621877515, + "grad_norm": 0.9782485365867615, + "learning_rate": 7.013068755841258e-05, + "loss": 0.7966, + "step": 115280 + }, + { + "epoch": 0.7365549493374902, + "grad_norm": 1.2593889236450195, + "learning_rate": 7.012609441891934e-05, + "loss": 0.7664, + "step": 115290 + }, + { + "epoch": 0.7366188364872289, + "grad_norm": 0.7124470472335815, + "learning_rate": 7.01215010767389e-05, + "loss": 1.2127, + "step": 115300 + }, + { + "epoch": 0.7366827236369676, + "grad_norm": 1.3822424411773682, + "learning_rate": 7.011690753191754e-05, + "loss": 1.2461, + "step": 115310 + }, + { + "epoch": 0.7367466107867063, + "grad_norm": 1.3084379434585571, + "learning_rate": 7.011231378450152e-05, + "loss": 0.7628, + "step": 115320 + }, + { + "epoch": 0.736810497936445, + 
"grad_norm": 1.2158416509628296, + "learning_rate": 7.01077198345371e-05, + "loss": 0.7993, + "step": 115330 + }, + { + "epoch": 0.7368743850861837, + "grad_norm": 1.6002458333969116, + "learning_rate": 7.010312568207055e-05, + "loss": 0.9804, + "step": 115340 + }, + { + "epoch": 0.7369382722359225, + "grad_norm": 1.1068108081817627, + "learning_rate": 7.009853132714812e-05, + "loss": 0.7037, + "step": 115350 + }, + { + "epoch": 0.7370021593856612, + "grad_norm": 0.4763220548629761, + "learning_rate": 7.00939367698161e-05, + "loss": 0.6586, + "step": 115360 + }, + { + "epoch": 0.7370660465353999, + "grad_norm": 0.9020888805389404, + "learning_rate": 7.008934201012076e-05, + "loss": 1.0595, + "step": 115370 + }, + { + "epoch": 0.7371299336851386, + "grad_norm": 1.2904754877090454, + "learning_rate": 7.008474704810835e-05, + "loss": 0.8614, + "step": 115380 + }, + { + "epoch": 0.7371938208348773, + "grad_norm": 1.1419029235839844, + "learning_rate": 7.008015188382517e-05, + "loss": 0.7458, + "step": 115390 + }, + { + "epoch": 0.737257707984616, + "grad_norm": 0.887784481048584, + "learning_rate": 7.00755565173175e-05, + "loss": 0.7534, + "step": 115400 + }, + { + "epoch": 0.7373215951343547, + "grad_norm": 0.8305013179779053, + "learning_rate": 7.007096094863159e-05, + "loss": 0.9282, + "step": 115410 + }, + { + "epoch": 0.7373854822840934, + "grad_norm": 0.9823849201202393, + "learning_rate": 7.006636517781376e-05, + "loss": 1.0846, + "step": 115420 + }, + { + "epoch": 0.7374493694338321, + "grad_norm": 1.4131437540054321, + "learning_rate": 7.006176920491025e-05, + "loss": 0.7773, + "step": 115430 + }, + { + "epoch": 0.7375132565835708, + "grad_norm": 0.7678616642951965, + "learning_rate": 7.005717302996739e-05, + "loss": 1.084, + "step": 115440 + }, + { + "epoch": 0.7375771437333095, + "grad_norm": 0.655906617641449, + "learning_rate": 7.005257665303142e-05, + "loss": 0.9624, + "step": 115450 + }, + { + "epoch": 0.7376410308830482, + "grad_norm": 0.9372738003730774, + "learning_rate": 7.004798007414867e-05, + "loss": 0.8082, + "step": 115460 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 1.036291480064392, + "learning_rate": 7.004338329336541e-05, + "loss": 1.0762, + "step": 115470 + }, + { + "epoch": 0.7377688051825256, + "grad_norm": 0.8642080426216125, + "learning_rate": 7.003878631072794e-05, + "loss": 0.9366, + "step": 115480 + }, + { + "epoch": 0.7378326923322643, + "grad_norm": 1.0046378374099731, + "learning_rate": 7.003418912628257e-05, + "loss": 0.9384, + "step": 115490 + }, + { + "epoch": 0.7378965794820029, + "grad_norm": 0.8233842849731445, + "learning_rate": 7.002959174007558e-05, + "loss": 0.6712, + "step": 115500 + }, + { + "epoch": 0.7379604666317416, + "grad_norm": 1.354340672492981, + "learning_rate": 7.002499415215325e-05, + "loss": 0.9442, + "step": 115510 + }, + { + "epoch": 0.7380243537814803, + "grad_norm": 1.4941532611846924, + "learning_rate": 7.002039636256192e-05, + "loss": 0.7647, + "step": 115520 + }, + { + "epoch": 0.738088240931219, + "grad_norm": 1.0750895738601685, + "learning_rate": 7.001579837134789e-05, + "loss": 1.0533, + "step": 115530 + }, + { + "epoch": 0.7381521280809578, + "grad_norm": 1.1024094820022583, + "learning_rate": 7.001120017855745e-05, + "loss": 0.959, + "step": 115540 + }, + { + "epoch": 0.7382160152306965, + "grad_norm": 1.5375107526779175, + "learning_rate": 7.000660178423691e-05, + "loss": 0.9801, + "step": 115550 + }, + { + "epoch": 0.7382799023804352, + "grad_norm": 0.8644910454750061, + "learning_rate": 
7.000200318843258e-05, + "loss": 0.9112, + "step": 115560 + }, + { + "epoch": 0.7383437895301739, + "grad_norm": 0.8466353416442871, + "learning_rate": 6.999740439119078e-05, + "loss": 1.028, + "step": 115570 + }, + { + "epoch": 0.7384076766799126, + "grad_norm": 0.8331221342086792, + "learning_rate": 6.99928053925578e-05, + "loss": 0.7816, + "step": 115580 + }, + { + "epoch": 0.7384715638296513, + "grad_norm": 1.330575704574585, + "learning_rate": 6.998820619257999e-05, + "loss": 1.1076, + "step": 115590 + }, + { + "epoch": 0.73853545097939, + "grad_norm": 1.0668553113937378, + "learning_rate": 6.998360679130364e-05, + "loss": 0.7225, + "step": 115600 + }, + { + "epoch": 0.7385993381291287, + "grad_norm": 2.099946975708008, + "learning_rate": 6.997900718877509e-05, + "loss": 0.8623, + "step": 115610 + }, + { + "epoch": 0.7386632252788674, + "grad_norm": 1.1908918619155884, + "learning_rate": 6.997440738504065e-05, + "loss": 0.9463, + "step": 115620 + }, + { + "epoch": 0.7387271124286061, + "grad_norm": 0.8096728920936584, + "learning_rate": 6.996980738014665e-05, + "loss": 0.7725, + "step": 115630 + }, + { + "epoch": 0.7387909995783448, + "grad_norm": 0.7145434617996216, + "learning_rate": 6.996520717413939e-05, + "loss": 0.9579, + "step": 115640 + }, + { + "epoch": 0.7388548867280835, + "grad_norm": 2.0472443103790283, + "learning_rate": 6.996060676706525e-05, + "loss": 1.1305, + "step": 115650 + }, + { + "epoch": 0.7389187738778222, + "grad_norm": 0.7645730376243591, + "learning_rate": 6.995600615897052e-05, + "loss": 1.1646, + "step": 115660 + }, + { + "epoch": 0.7389826610275609, + "grad_norm": 0.8512725234031677, + "learning_rate": 6.995140534990155e-05, + "loss": 0.9598, + "step": 115670 + }, + { + "epoch": 0.7390465481772996, + "grad_norm": 1.5269567966461182, + "learning_rate": 6.994680433990466e-05, + "loss": 0.6954, + "step": 115680 + }, + { + "epoch": 0.7391104353270384, + "grad_norm": 1.0922000408172607, + "learning_rate": 6.99422031290262e-05, + "loss": 0.6855, + "step": 115690 + }, + { + "epoch": 0.7391743224767771, + "grad_norm": 0.9279537796974182, + "learning_rate": 6.993760171731251e-05, + "loss": 1.0426, + "step": 115700 + }, + { + "epoch": 0.7392382096265158, + "grad_norm": 1.6063908338546753, + "learning_rate": 6.993300010480991e-05, + "loss": 0.779, + "step": 115710 + }, + { + "epoch": 0.7393020967762545, + "grad_norm": 2.612882614135742, + "learning_rate": 6.992839829156475e-05, + "loss": 0.9478, + "step": 115720 + }, + { + "epoch": 0.7393659839259932, + "grad_norm": 1.1300225257873535, + "learning_rate": 6.992379627762339e-05, + "loss": 0.8534, + "step": 115730 + }, + { + "epoch": 0.7394298710757319, + "grad_norm": 0.7757554650306702, + "learning_rate": 6.991919406303216e-05, + "loss": 1.0868, + "step": 115740 + }, + { + "epoch": 0.7394937582254705, + "grad_norm": 0.7098391652107239, + "learning_rate": 6.991459164783741e-05, + "loss": 0.8308, + "step": 115750 + }, + { + "epoch": 0.7395576453752092, + "grad_norm": 0.7299323678016663, + "learning_rate": 6.99099890320855e-05, + "loss": 0.9169, + "step": 115760 + }, + { + "epoch": 0.7396215325249479, + "grad_norm": 0.8165879249572754, + "learning_rate": 6.990538621582278e-05, + "loss": 0.8462, + "step": 115770 + }, + { + "epoch": 0.7396854196746866, + "grad_norm": 0.8184595108032227, + "learning_rate": 6.990078319909559e-05, + "loss": 1.0554, + "step": 115780 + }, + { + "epoch": 0.7397493068244253, + "grad_norm": 0.7778357863426208, + "learning_rate": 6.989617998195032e-05, + "loss": 0.9903, + "step": 115790 + }, + { + 
"epoch": 0.739813193974164, + "grad_norm": 0.9583641290664673, + "learning_rate": 6.989157656443327e-05, + "loss": 0.9248, + "step": 115800 + }, + { + "epoch": 0.7398770811239027, + "grad_norm": 1.5333011150360107, + "learning_rate": 6.988697294659085e-05, + "loss": 0.9296, + "step": 115810 + }, + { + "epoch": 0.7399409682736414, + "grad_norm": 0.9242226481437683, + "learning_rate": 6.98823691284694e-05, + "loss": 0.9196, + "step": 115820 + }, + { + "epoch": 0.7400048554233801, + "grad_norm": 1.562110185623169, + "learning_rate": 6.98777651101153e-05, + "loss": 0.8195, + "step": 115830 + }, + { + "epoch": 0.7400687425731188, + "grad_norm": 0.8381466865539551, + "learning_rate": 6.987316089157492e-05, + "loss": 0.757, + "step": 115840 + }, + { + "epoch": 0.7401326297228575, + "grad_norm": 0.7180655002593994, + "learning_rate": 6.986855647289461e-05, + "loss": 0.7828, + "step": 115850 + }, + { + "epoch": 0.7401965168725962, + "grad_norm": 1.6034409999847412, + "learning_rate": 6.986395185412073e-05, + "loss": 1.4035, + "step": 115860 + }, + { + "epoch": 0.740260404022335, + "grad_norm": 1.0359718799591064, + "learning_rate": 6.985934703529969e-05, + "loss": 0.8757, + "step": 115870 + }, + { + "epoch": 0.7403242911720737, + "grad_norm": 0.6328555345535278, + "learning_rate": 6.985474201647784e-05, + "loss": 0.6099, + "step": 115880 + }, + { + "epoch": 0.7403881783218124, + "grad_norm": 0.6946358680725098, + "learning_rate": 6.985013679770156e-05, + "loss": 0.8574, + "step": 115890 + }, + { + "epoch": 0.7404520654715511, + "grad_norm": 1.2363545894622803, + "learning_rate": 6.984553137901722e-05, + "loss": 0.8605, + "step": 115900 + }, + { + "epoch": 0.7405159526212898, + "grad_norm": 1.1174249649047852, + "learning_rate": 6.984092576047123e-05, + "loss": 0.7995, + "step": 115910 + }, + { + "epoch": 0.7405798397710285, + "grad_norm": 0.9834713339805603, + "learning_rate": 6.983631994210994e-05, + "loss": 1.31, + "step": 115920 + }, + { + "epoch": 0.7406437269207672, + "grad_norm": 1.397002935409546, + "learning_rate": 6.983171392397975e-05, + "loss": 0.8187, + "step": 115930 + }, + { + "epoch": 0.7407076140705059, + "grad_norm": 1.3270031213760376, + "learning_rate": 6.982710770612704e-05, + "loss": 1.112, + "step": 115940 + }, + { + "epoch": 0.7407715012202446, + "grad_norm": 0.6648245453834534, + "learning_rate": 6.98225012885982e-05, + "loss": 0.8693, + "step": 115950 + }, + { + "epoch": 0.7408353883699833, + "grad_norm": 0.7026130557060242, + "learning_rate": 6.981789467143965e-05, + "loss": 0.8299, + "step": 115960 + }, + { + "epoch": 0.740899275519722, + "grad_norm": 0.9298969507217407, + "learning_rate": 6.981328785469772e-05, + "loss": 0.8988, + "step": 115970 + }, + { + "epoch": 0.7409631626694607, + "grad_norm": 0.921608567237854, + "learning_rate": 6.980868083841887e-05, + "loss": 0.7417, + "step": 115980 + }, + { + "epoch": 0.7410270498191993, + "grad_norm": 0.960408091545105, + "learning_rate": 6.980407362264945e-05, + "loss": 0.8521, + "step": 115990 + }, + { + "epoch": 0.741090936968938, + "grad_norm": 0.6170063018798828, + "learning_rate": 6.979946620743587e-05, + "loss": 0.8537, + "step": 116000 + }, + { + "epoch": 0.7411548241186767, + "grad_norm": 1.5411709547042847, + "learning_rate": 6.979485859282453e-05, + "loss": 1.0145, + "step": 116010 + }, + { + "epoch": 0.7412187112684154, + "grad_norm": 1.529691457748413, + "learning_rate": 6.979025077886185e-05, + "loss": 0.8084, + "step": 116020 + }, + { + "epoch": 0.7412825984181541, + "grad_norm": 1.5334669351577759, + 
"learning_rate": 6.978564276559423e-05, + "loss": 0.6795, + "step": 116030 + }, + { + "epoch": 0.7413464855678928, + "grad_norm": 0.7906901240348816, + "learning_rate": 6.978103455306808e-05, + "loss": 0.8419, + "step": 116040 + }, + { + "epoch": 0.7414103727176315, + "grad_norm": 1.2242873907089233, + "learning_rate": 6.977642614132979e-05, + "loss": 0.8119, + "step": 116050 + }, + { + "epoch": 0.7414742598673703, + "grad_norm": 0.705220639705658, + "learning_rate": 6.977181753042577e-05, + "loss": 1.2762, + "step": 116060 + }, + { + "epoch": 0.741538147017109, + "grad_norm": 0.8187665343284607, + "learning_rate": 6.976720872040245e-05, + "loss": 0.8109, + "step": 116070 + }, + { + "epoch": 0.7416020341668477, + "grad_norm": 1.5236612558364868, + "learning_rate": 6.976259971130624e-05, + "loss": 0.9235, + "step": 116080 + }, + { + "epoch": 0.7416659213165864, + "grad_norm": 3.65586519241333, + "learning_rate": 6.975799050318355e-05, + "loss": 0.8398, + "step": 116090 + }, + { + "epoch": 0.7417298084663251, + "grad_norm": 0.6140323877334595, + "learning_rate": 6.97533810960808e-05, + "loss": 0.9414, + "step": 116100 + }, + { + "epoch": 0.7417936956160638, + "grad_norm": 0.6180686950683594, + "learning_rate": 6.974877149004441e-05, + "loss": 0.8651, + "step": 116110 + }, + { + "epoch": 0.7418575827658025, + "grad_norm": 0.8425277471542358, + "learning_rate": 6.97441616851208e-05, + "loss": 1.0168, + "step": 116120 + }, + { + "epoch": 0.7419214699155412, + "grad_norm": 0.7344639897346497, + "learning_rate": 6.973955168135642e-05, + "loss": 0.6021, + "step": 116130 + }, + { + "epoch": 0.7419853570652799, + "grad_norm": 0.8152681589126587, + "learning_rate": 6.973494147879767e-05, + "loss": 0.9441, + "step": 116140 + }, + { + "epoch": 0.7420492442150186, + "grad_norm": 0.810077428817749, + "learning_rate": 6.973033107749098e-05, + "loss": 0.7308, + "step": 116150 + }, + { + "epoch": 0.7421131313647573, + "grad_norm": 0.8788096308708191, + "learning_rate": 6.972572047748281e-05, + "loss": 0.7761, + "step": 116160 + }, + { + "epoch": 0.742177018514496, + "grad_norm": 1.0663613080978394, + "learning_rate": 6.972110967881953e-05, + "loss": 1.0319, + "step": 116170 + }, + { + "epoch": 0.7422409056642347, + "grad_norm": 1.0700383186340332, + "learning_rate": 6.971649868154764e-05, + "loss": 0.7867, + "step": 116180 + }, + { + "epoch": 0.7423047928139734, + "grad_norm": 1.0407042503356934, + "learning_rate": 6.971188748571355e-05, + "loss": 1.0002, + "step": 116190 + }, + { + "epoch": 0.7423686799637121, + "grad_norm": 0.9793998599052429, + "learning_rate": 6.97072760913637e-05, + "loss": 1.1765, + "step": 116200 + }, + { + "epoch": 0.7424325671134508, + "grad_norm": 1.643677830696106, + "learning_rate": 6.970266449854452e-05, + "loss": 0.9574, + "step": 116210 + }, + { + "epoch": 0.7424964542631896, + "grad_norm": 0.7675092220306396, + "learning_rate": 6.969805270730248e-05, + "loss": 0.7953, + "step": 116220 + }, + { + "epoch": 0.7425603414129281, + "grad_norm": 1.0452982187271118, + "learning_rate": 6.969344071768398e-05, + "loss": 0.9938, + "step": 116230 + }, + { + "epoch": 0.7426242285626669, + "grad_norm": 1.2801587581634521, + "learning_rate": 6.968882852973553e-05, + "loss": 0.8495, + "step": 116240 + }, + { + "epoch": 0.7426881157124056, + "grad_norm": 0.7818521857261658, + "learning_rate": 6.968421614350352e-05, + "loss": 0.9984, + "step": 116250 + }, + { + "epoch": 0.7427520028621443, + "grad_norm": 1.2182539701461792, + "learning_rate": 6.967960355903442e-05, + "loss": 0.8587, + "step": 
116260 + }, + { + "epoch": 0.742815890011883, + "grad_norm": 0.7654426097869873, + "learning_rate": 6.96749907763747e-05, + "loss": 0.8998, + "step": 116270 + }, + { + "epoch": 0.7428797771616217, + "grad_norm": 4.754692554473877, + "learning_rate": 6.967037779557082e-05, + "loss": 0.8934, + "step": 116280 + }, + { + "epoch": 0.7429436643113604, + "grad_norm": 0.9145668745040894, + "learning_rate": 6.966576461666919e-05, + "loss": 1.082, + "step": 116290 + }, + { + "epoch": 0.7430075514610991, + "grad_norm": 0.6794173717498779, + "learning_rate": 6.96611512397163e-05, + "loss": 0.82, + "step": 116300 + }, + { + "epoch": 0.7430714386108378, + "grad_norm": 1.012447714805603, + "learning_rate": 6.965653766475862e-05, + "loss": 0.9966, + "step": 116310 + }, + { + "epoch": 0.7431353257605765, + "grad_norm": 0.8482702374458313, + "learning_rate": 6.96519238918426e-05, + "loss": 0.8475, + "step": 116320 + }, + { + "epoch": 0.7431992129103152, + "grad_norm": 0.8447071313858032, + "learning_rate": 6.964730992101468e-05, + "loss": 0.9462, + "step": 116330 + }, + { + "epoch": 0.7432631000600539, + "grad_norm": 0.6218491792678833, + "learning_rate": 6.964269575232138e-05, + "loss": 0.8291, + "step": 116340 + }, + { + "epoch": 0.7433269872097926, + "grad_norm": 1.0346623659133911, + "learning_rate": 6.963808138580912e-05, + "loss": 0.9003, + "step": 116350 + }, + { + "epoch": 0.7433908743595313, + "grad_norm": 0.7092266082763672, + "learning_rate": 6.96334668215244e-05, + "loss": 0.906, + "step": 116360 + }, + { + "epoch": 0.74345476150927, + "grad_norm": 2.0959596633911133, + "learning_rate": 6.962885205951369e-05, + "loss": 1.2469, + "step": 116370 + }, + { + "epoch": 0.7435186486590087, + "grad_norm": 0.5212934613227844, + "learning_rate": 6.962423709982345e-05, + "loss": 0.6491, + "step": 116380 + }, + { + "epoch": 0.7435825358087474, + "grad_norm": 0.7486282587051392, + "learning_rate": 6.961962194250017e-05, + "loss": 0.8276, + "step": 116390 + }, + { + "epoch": 0.7436464229584862, + "grad_norm": 0.8907299041748047, + "learning_rate": 6.961500658759033e-05, + "loss": 0.8261, + "step": 116400 + }, + { + "epoch": 0.7437103101082249, + "grad_norm": 1.0088621377944946, + "learning_rate": 6.961039103514039e-05, + "loss": 0.8501, + "step": 116410 + }, + { + "epoch": 0.7437741972579636, + "grad_norm": 0.852279007434845, + "learning_rate": 6.960577528519685e-05, + "loss": 1.0004, + "step": 116420 + }, + { + "epoch": 0.7438380844077023, + "grad_norm": 0.6919979453086853, + "learning_rate": 6.96011593378062e-05, + "loss": 0.9841, + "step": 116430 + }, + { + "epoch": 0.743901971557441, + "grad_norm": 1.0231778621673584, + "learning_rate": 6.959654319301492e-05, + "loss": 0.9452, + "step": 116440 + }, + { + "epoch": 0.7439658587071797, + "grad_norm": 1.6192152500152588, + "learning_rate": 6.959238849396364e-05, + "loss": 0.8734, + "step": 116450 + }, + { + "epoch": 0.7440297458569184, + "grad_norm": 1.1661548614501953, + "learning_rate": 6.958777197423922e-05, + "loss": 0.8547, + "step": 116460 + }, + { + "epoch": 0.7440936330066571, + "grad_norm": 1.9804883003234863, + "learning_rate": 6.958315525724901e-05, + "loss": 0.8883, + "step": 116470 + }, + { + "epoch": 0.7441575201563957, + "grad_norm": 0.9184843897819519, + "learning_rate": 6.957853834303946e-05, + "loss": 1.0803, + "step": 116480 + }, + { + "epoch": 0.7442214073061344, + "grad_norm": 0.9524401426315308, + "learning_rate": 6.957392123165711e-05, + "loss": 0.823, + "step": 116490 + }, + { + "epoch": 0.7442852944558731, + "grad_norm": 
0.7346342206001282, + "learning_rate": 6.956930392314845e-05, + "loss": 0.842, + "step": 116500 + }, + { + "epoch": 0.7443491816056118, + "grad_norm": 0.6176126003265381, + "learning_rate": 6.956468641755994e-05, + "loss": 0.824, + "step": 116510 + }, + { + "epoch": 0.7444130687553505, + "grad_norm": 1.0610926151275635, + "learning_rate": 6.956006871493814e-05, + "loss": 0.8781, + "step": 116520 + }, + { + "epoch": 0.7444769559050892, + "grad_norm": 1.190373420715332, + "learning_rate": 6.95554508153295e-05, + "loss": 0.9477, + "step": 116530 + }, + { + "epoch": 0.7445408430548279, + "grad_norm": 1.2164260149002075, + "learning_rate": 6.955083271878056e-05, + "loss": 0.7542, + "step": 116540 + }, + { + "epoch": 0.7446047302045666, + "grad_norm": 0.9188566207885742, + "learning_rate": 6.954621442533784e-05, + "loss": 0.9441, + "step": 116550 + }, + { + "epoch": 0.7446686173543053, + "grad_norm": 1.2796574831008911, + "learning_rate": 6.954159593504781e-05, + "loss": 0.738, + "step": 116560 + }, + { + "epoch": 0.744732504504044, + "grad_norm": 0.8466264605522156, + "learning_rate": 6.953697724795702e-05, + "loss": 1.0705, + "step": 116570 + }, + { + "epoch": 0.7447963916537828, + "grad_norm": 0.9667829275131226, + "learning_rate": 6.953235836411194e-05, + "loss": 0.9622, + "step": 116580 + }, + { + "epoch": 0.7448602788035215, + "grad_norm": 3.007852792739868, + "learning_rate": 6.952773928355913e-05, + "loss": 0.8599, + "step": 116590 + }, + { + "epoch": 0.7449241659532602, + "grad_norm": 0.6320720314979553, + "learning_rate": 6.95231200063451e-05, + "loss": 0.9257, + "step": 116600 + }, + { + "epoch": 0.7449880531029989, + "grad_norm": 0.8152862787246704, + "learning_rate": 6.951850053251636e-05, + "loss": 0.7299, + "step": 116610 + }, + { + "epoch": 0.7450519402527376, + "grad_norm": 0.709783673286438, + "learning_rate": 6.951388086211943e-05, + "loss": 0.8396, + "step": 116620 + }, + { + "epoch": 0.7451158274024763, + "grad_norm": 0.844637393951416, + "learning_rate": 6.950926099520084e-05, + "loss": 0.5487, + "step": 116630 + }, + { + "epoch": 0.745179714552215, + "grad_norm": 1.2991611957550049, + "learning_rate": 6.95046409318071e-05, + "loss": 0.8171, + "step": 116640 + }, + { + "epoch": 0.7452436017019537, + "grad_norm": 1.085801124572754, + "learning_rate": 6.950002067198475e-05, + "loss": 0.9065, + "step": 116650 + }, + { + "epoch": 0.7453074888516924, + "grad_norm": 0.8042502999305725, + "learning_rate": 6.949540021578034e-05, + "loss": 0.9323, + "step": 116660 + }, + { + "epoch": 0.7453713760014311, + "grad_norm": 0.6853629946708679, + "learning_rate": 6.949077956324038e-05, + "loss": 0.8285, + "step": 116670 + }, + { + "epoch": 0.7454352631511698, + "grad_norm": 0.7794731259346008, + "learning_rate": 6.94861587144114e-05, + "loss": 0.9293, + "step": 116680 + }, + { + "epoch": 0.7454991503009085, + "grad_norm": 0.9132157564163208, + "learning_rate": 6.948153766933995e-05, + "loss": 1.0174, + "step": 116690 + }, + { + "epoch": 0.7455630374506472, + "grad_norm": 1.2286196947097778, + "learning_rate": 6.947691642807256e-05, + "loss": 1.0193, + "step": 116700 + }, + { + "epoch": 0.7456269246003859, + "grad_norm": 0.5221213102340698, + "learning_rate": 6.947229499065578e-05, + "loss": 0.7766, + "step": 116710 + }, + { + "epoch": 0.7456908117501245, + "grad_norm": 1.4980177879333496, + "learning_rate": 6.946767335713613e-05, + "loss": 0.8005, + "step": 116720 + }, + { + "epoch": 0.7457546988998632, + "grad_norm": 0.8484123349189758, + "learning_rate": 6.946305152756017e-05, + "loss": 
0.6168, + "step": 116730 + }, + { + "epoch": 0.7458185860496019, + "grad_norm": 0.9975723028182983, + "learning_rate": 6.945842950197446e-05, + "loss": 0.9931, + "step": 116740 + }, + { + "epoch": 0.7458824731993406, + "grad_norm": 1.0813270807266235, + "learning_rate": 6.945380728042549e-05, + "loss": 0.8407, + "step": 116750 + }, + { + "epoch": 0.7459463603490794, + "grad_norm": 1.2126818895339966, + "learning_rate": 6.944918486295989e-05, + "loss": 0.8995, + "step": 116760 + }, + { + "epoch": 0.7460102474988181, + "grad_norm": 0.9279122948646545, + "learning_rate": 6.944456224962417e-05, + "loss": 0.819, + "step": 116770 + }, + { + "epoch": 0.7460741346485568, + "grad_norm": 0.9613460302352905, + "learning_rate": 6.943993944046487e-05, + "loss": 1.41, + "step": 116780 + }, + { + "epoch": 0.7461380217982955, + "grad_norm": 0.6390883922576904, + "learning_rate": 6.943531643552857e-05, + "loss": 0.7246, + "step": 116790 + }, + { + "epoch": 0.7462019089480342, + "grad_norm": 1.0150036811828613, + "learning_rate": 6.943069323486183e-05, + "loss": 0.7269, + "step": 116800 + }, + { + "epoch": 0.7462657960977729, + "grad_norm": 0.7378376722335815, + "learning_rate": 6.942606983851116e-05, + "loss": 0.9508, + "step": 116810 + }, + { + "epoch": 0.7463296832475116, + "grad_norm": 0.9831222295761108, + "learning_rate": 6.94214462465232e-05, + "loss": 0.9079, + "step": 116820 + }, + { + "epoch": 0.7463935703972503, + "grad_norm": 1.4024206399917603, + "learning_rate": 6.941682245894446e-05, + "loss": 0.8302, + "step": 116830 + }, + { + "epoch": 0.746457457546989, + "grad_norm": 2.6313674449920654, + "learning_rate": 6.94121984758215e-05, + "loss": 0.9034, + "step": 116840 + }, + { + "epoch": 0.7465213446967277, + "grad_norm": 0.7993502616882324, + "learning_rate": 6.940757429720094e-05, + "loss": 0.681, + "step": 116850 + }, + { + "epoch": 0.7465852318464664, + "grad_norm": 1.0172010660171509, + "learning_rate": 6.940294992312932e-05, + "loss": 1.0379, + "step": 116860 + }, + { + "epoch": 0.7466491189962051, + "grad_norm": 0.764042854309082, + "learning_rate": 6.939832535365319e-05, + "loss": 0.8982, + "step": 116870 + }, + { + "epoch": 0.7467130061459438, + "grad_norm": 1.4996678829193115, + "learning_rate": 6.939370058881914e-05, + "loss": 1.0452, + "step": 116880 + }, + { + "epoch": 0.7467768932956825, + "grad_norm": 0.6007105708122253, + "learning_rate": 6.938907562867374e-05, + "loss": 0.8955, + "step": 116890 + }, + { + "epoch": 0.7468407804454212, + "grad_norm": 0.753452718257904, + "learning_rate": 6.93844504732636e-05, + "loss": 0.8096, + "step": 116900 + }, + { + "epoch": 0.7469046675951599, + "grad_norm": 0.8642897009849548, + "learning_rate": 6.937982512263528e-05, + "loss": 0.7916, + "step": 116910 + }, + { + "epoch": 0.7469685547448986, + "grad_norm": 0.8521572947502136, + "learning_rate": 6.937519957683534e-05, + "loss": 0.7633, + "step": 116920 + }, + { + "epoch": 0.7470324418946374, + "grad_norm": 1.336063027381897, + "learning_rate": 6.937057383591037e-05, + "loss": 0.8625, + "step": 116930 + }, + { + "epoch": 0.7470963290443761, + "grad_norm": 2.280611276626587, + "learning_rate": 6.936594789990696e-05, + "loss": 1.225, + "step": 116940 + }, + { + "epoch": 0.7471602161941148, + "grad_norm": 0.5119796991348267, + "learning_rate": 6.936132176887171e-05, + "loss": 0.7675, + "step": 116950 + }, + { + "epoch": 0.7472241033438534, + "grad_norm": 1.0470057725906372, + "learning_rate": 6.93566954428512e-05, + "loss": 0.9467, + "step": 116960 + }, + { + "epoch": 0.7472879904935921, + 
"grad_norm": 0.823755145072937, + "learning_rate": 6.935206892189202e-05, + "loss": 0.7573, + "step": 116970 + }, + { + "epoch": 0.7473518776433308, + "grad_norm": 0.7342879176139832, + "learning_rate": 6.934744220604076e-05, + "loss": 0.926, + "step": 116980 + }, + { + "epoch": 0.7474157647930695, + "grad_norm": 0.5635343790054321, + "learning_rate": 6.934281529534403e-05, + "loss": 0.8136, + "step": 116990 + }, + { + "epoch": 0.7474796519428082, + "grad_norm": 0.515261173248291, + "learning_rate": 6.93381881898484e-05, + "loss": 0.6738, + "step": 117000 + }, + { + "epoch": 0.7475435390925469, + "grad_norm": 1.1176193952560425, + "learning_rate": 6.93335608896005e-05, + "loss": 0.7295, + "step": 117010 + }, + { + "epoch": 0.7476074262422856, + "grad_norm": 0.7528888583183289, + "learning_rate": 6.93289333946469e-05, + "loss": 0.9808, + "step": 117020 + }, + { + "epoch": 0.7476713133920243, + "grad_norm": 0.6792475581169128, + "learning_rate": 6.932430570503423e-05, + "loss": 0.7732, + "step": 117030 + }, + { + "epoch": 0.747735200541763, + "grad_norm": 0.49816229939460754, + "learning_rate": 6.931967782080908e-05, + "loss": 1.0765, + "step": 117040 + }, + { + "epoch": 0.7477990876915017, + "grad_norm": 0.6919913291931152, + "learning_rate": 6.931504974201806e-05, + "loss": 0.9868, + "step": 117050 + }, + { + "epoch": 0.7478629748412404, + "grad_norm": 1.421985387802124, + "learning_rate": 6.931042146870779e-05, + "loss": 1.0408, + "step": 117060 + }, + { + "epoch": 0.7479268619909791, + "grad_norm": 0.6791272163391113, + "learning_rate": 6.930579300092487e-05, + "loss": 1.0113, + "step": 117070 + }, + { + "epoch": 0.7479907491407178, + "grad_norm": 0.6858437657356262, + "learning_rate": 6.93011643387159e-05, + "loss": 0.7507, + "step": 117080 + }, + { + "epoch": 0.7480546362904565, + "grad_norm": 0.5481752753257751, + "learning_rate": 6.92965354821275e-05, + "loss": 0.7424, + "step": 117090 + }, + { + "epoch": 0.7481185234401952, + "grad_norm": 0.6619887948036194, + "learning_rate": 6.929190643120632e-05, + "loss": 0.8167, + "step": 117100 + }, + { + "epoch": 0.748182410589934, + "grad_norm": 1.6470102071762085, + "learning_rate": 6.928727718599893e-05, + "loss": 0.9335, + "step": 117110 + }, + { + "epoch": 0.7482462977396727, + "grad_norm": 0.7294175028800964, + "learning_rate": 6.928264774655198e-05, + "loss": 0.8117, + "step": 117120 + }, + { + "epoch": 0.7483101848894114, + "grad_norm": 1.0997979640960693, + "learning_rate": 6.927801811291209e-05, + "loss": 1.0709, + "step": 117130 + }, + { + "epoch": 0.7483740720391501, + "grad_norm": 0.9277620911598206, + "learning_rate": 6.927338828512588e-05, + "loss": 0.8636, + "step": 117140 + }, + { + "epoch": 0.7484379591888888, + "grad_norm": 1.3911194801330566, + "learning_rate": 6.926875826323997e-05, + "loss": 0.9657, + "step": 117150 + }, + { + "epoch": 0.7485018463386275, + "grad_norm": 0.7593978047370911, + "learning_rate": 6.9264128047301e-05, + "loss": 0.8314, + "step": 117160 + }, + { + "epoch": 0.7485657334883662, + "grad_norm": 0.8902554512023926, + "learning_rate": 6.92594976373556e-05, + "loss": 1.0123, + "step": 117170 + }, + { + "epoch": 0.7486296206381049, + "grad_norm": 0.8889968991279602, + "learning_rate": 6.925486703345038e-05, + "loss": 0.9378, + "step": 117180 + }, + { + "epoch": 0.7486935077878436, + "grad_norm": 0.7080551981925964, + "learning_rate": 6.925023623563201e-05, + "loss": 1.1536, + "step": 117190 + }, + { + "epoch": 0.7487573949375823, + "grad_norm": 0.7877635955810547, + "learning_rate": 
6.924560524394709e-05, + "loss": 0.9396, + "step": 117200 + }, + { + "epoch": 0.7488212820873209, + "grad_norm": 3.540712594985962, + "learning_rate": 6.924097405844227e-05, + "loss": 0.9817, + "step": 117210 + }, + { + "epoch": 0.7488851692370596, + "grad_norm": 0.6759431958198547, + "learning_rate": 6.923634267916422e-05, + "loss": 0.9781, + "step": 117220 + }, + { + "epoch": 0.7489490563867983, + "grad_norm": 0.7441065311431885, + "learning_rate": 6.923171110615954e-05, + "loss": 0.6876, + "step": 117230 + }, + { + "epoch": 0.749012943536537, + "grad_norm": 0.90581214427948, + "learning_rate": 6.92270793394749e-05, + "loss": 0.9416, + "step": 117240 + }, + { + "epoch": 0.7490768306862757, + "grad_norm": 0.991303563117981, + "learning_rate": 6.922244737915692e-05, + "loss": 0.9986, + "step": 117250 + }, + { + "epoch": 0.7491407178360144, + "grad_norm": 0.9509017467498779, + "learning_rate": 6.921781522525229e-05, + "loss": 0.9976, + "step": 117260 + }, + { + "epoch": 0.7492046049857531, + "grad_norm": 1.060160756111145, + "learning_rate": 6.921318287780763e-05, + "loss": 0.9884, + "step": 117270 + }, + { + "epoch": 0.7492684921354918, + "grad_norm": 0.8051804900169373, + "learning_rate": 6.920855033686959e-05, + "loss": 1.0251, + "step": 117280 + }, + { + "epoch": 0.7493323792852306, + "grad_norm": 0.6516896486282349, + "learning_rate": 6.920391760248482e-05, + "loss": 1.0851, + "step": 117290 + }, + { + "epoch": 0.7493962664349693, + "grad_norm": 1.0302486419677734, + "learning_rate": 6.91992846747e-05, + "loss": 0.9056, + "step": 117300 + }, + { + "epoch": 0.749460153584708, + "grad_norm": 0.5640853047370911, + "learning_rate": 6.919465155356177e-05, + "loss": 0.8081, + "step": 117310 + }, + { + "epoch": 0.7495240407344467, + "grad_norm": 0.6302241086959839, + "learning_rate": 6.91900182391168e-05, + "loss": 0.8306, + "step": 117320 + }, + { + "epoch": 0.7495879278841854, + "grad_norm": 1.006842851638794, + "learning_rate": 6.918538473141174e-05, + "loss": 0.7887, + "step": 117330 + }, + { + "epoch": 0.7496518150339241, + "grad_norm": 0.8143184781074524, + "learning_rate": 6.918075103049325e-05, + "loss": 1.0579, + "step": 117340 + }, + { + "epoch": 0.7497157021836628, + "grad_norm": 0.6692333817481995, + "learning_rate": 6.9176117136408e-05, + "loss": 0.7448, + "step": 117350 + }, + { + "epoch": 0.7497795893334015, + "grad_norm": 1.042280912399292, + "learning_rate": 6.917148304920267e-05, + "loss": 0.8737, + "step": 117360 + }, + { + "epoch": 0.7498434764831402, + "grad_norm": 1.1402003765106201, + "learning_rate": 6.916684876892391e-05, + "loss": 0.8044, + "step": 117370 + }, + { + "epoch": 0.7499073636328789, + "grad_norm": 1.2697054147720337, + "learning_rate": 6.916221429561843e-05, + "loss": 0.7297, + "step": 117380 + }, + { + "epoch": 0.7499712507826176, + "grad_norm": 0.7075098156929016, + "learning_rate": 6.915757962933284e-05, + "loss": 0.696, + "step": 117390 + }, + { + "epoch": 0.7500351379323563, + "grad_norm": 1.1511503458023071, + "learning_rate": 6.915294477011389e-05, + "loss": 0.8506, + "step": 117400 + }, + { + "epoch": 0.750099025082095, + "grad_norm": 0.8749009370803833, + "learning_rate": 6.914830971800818e-05, + "loss": 1.041, + "step": 117410 + }, + { + "epoch": 0.7501629122318337, + "grad_norm": 0.7011004686355591, + "learning_rate": 6.914367447306244e-05, + "loss": 0.8774, + "step": 117420 + }, + { + "epoch": 0.7502267993815724, + "grad_norm": 1.408872365951538, + "learning_rate": 6.913903903532334e-05, + "loss": 0.8093, + "step": 117430 + }, + { + "epoch": 
0.7502906865313111, + "grad_norm": 0.79103022813797, + "learning_rate": 6.913440340483755e-05, + "loss": 0.6807, + "step": 117440 + }, + { + "epoch": 0.7503545736810497, + "grad_norm": 1.1001317501068115, + "learning_rate": 6.912976758165177e-05, + "loss": 0.8683, + "step": 117450 + }, + { + "epoch": 0.7504184608307884, + "grad_norm": 0.7966405749320984, + "learning_rate": 6.912513156581267e-05, + "loss": 0.8673, + "step": 117460 + }, + { + "epoch": 0.7504823479805272, + "grad_norm": 0.5426264405250549, + "learning_rate": 6.912049535736697e-05, + "loss": 0.6471, + "step": 117470 + }, + { + "epoch": 0.7505462351302659, + "grad_norm": 0.8461551070213318, + "learning_rate": 6.911585895636132e-05, + "loss": 0.9877, + "step": 117480 + }, + { + "epoch": 0.7506101222800046, + "grad_norm": 0.7813534736633301, + "learning_rate": 6.911122236284244e-05, + "loss": 0.8591, + "step": 117490 + }, + { + "epoch": 0.7506740094297433, + "grad_norm": 2.3332297801971436, + "learning_rate": 6.910658557685701e-05, + "loss": 1.2769, + "step": 117500 + }, + { + "epoch": 0.750737896579482, + "grad_norm": 0.9277547597885132, + "learning_rate": 6.910194859845174e-05, + "loss": 0.7398, + "step": 117510 + }, + { + "epoch": 0.7508017837292207, + "grad_norm": 0.5283322930335999, + "learning_rate": 6.909731142767333e-05, + "loss": 0.8047, + "step": 117520 + }, + { + "epoch": 0.7508656708789594, + "grad_norm": 0.9669355154037476, + "learning_rate": 6.909267406456847e-05, + "loss": 1.0771, + "step": 117530 + }, + { + "epoch": 0.7509295580286981, + "grad_norm": 0.8970743417739868, + "learning_rate": 6.908803650918385e-05, + "loss": 0.9949, + "step": 117540 + }, + { + "epoch": 0.7509934451784368, + "grad_norm": 0.9849328398704529, + "learning_rate": 6.90833987615662e-05, + "loss": 0.6818, + "step": 117550 + }, + { + "epoch": 0.7510573323281755, + "grad_norm": 0.4582323431968689, + "learning_rate": 6.907876082176222e-05, + "loss": 0.8551, + "step": 117560 + }, + { + "epoch": 0.7511212194779142, + "grad_norm": 0.8646737933158875, + "learning_rate": 6.90741226898186e-05, + "loss": 0.8972, + "step": 117570 + }, + { + "epoch": 0.7511851066276529, + "grad_norm": 1.1300363540649414, + "learning_rate": 6.906948436578206e-05, + "loss": 0.8249, + "step": 117580 + }, + { + "epoch": 0.7512489937773916, + "grad_norm": 0.7908068895339966, + "learning_rate": 6.906484584969934e-05, + "loss": 0.8863, + "step": 117590 + }, + { + "epoch": 0.7513128809271303, + "grad_norm": 1.099071741104126, + "learning_rate": 6.906020714161711e-05, + "loss": 0.9949, + "step": 117600 + }, + { + "epoch": 0.751376768076869, + "grad_norm": 0.874218761920929, + "learning_rate": 6.905556824158212e-05, + "loss": 0.8182, + "step": 117610 + }, + { + "epoch": 0.7514406552266077, + "grad_norm": 1.2314951419830322, + "learning_rate": 6.905092914964105e-05, + "loss": 1.1335, + "step": 117620 + }, + { + "epoch": 0.7515045423763465, + "grad_norm": 0.8381962180137634, + "learning_rate": 6.904628986584066e-05, + "loss": 0.8751, + "step": 117630 + }, + { + "epoch": 0.7515684295260852, + "grad_norm": 1.2576111555099487, + "learning_rate": 6.904165039022766e-05, + "loss": 0.8728, + "step": 117640 + }, + { + "epoch": 0.7516323166758239, + "grad_norm": 0.8034125566482544, + "learning_rate": 6.903701072284875e-05, + "loss": 0.8502, + "step": 117650 + }, + { + "epoch": 0.7516962038255626, + "grad_norm": 0.6873534917831421, + "learning_rate": 6.903237086375068e-05, + "loss": 0.8783, + "step": 117660 + }, + { + "epoch": 0.7517600909753013, + "grad_norm": 0.8254218101501465, + 
"learning_rate": 6.902773081298015e-05, + "loss": 0.8267, + "step": 117670 + }, + { + "epoch": 0.75182397812504, + "grad_norm": 2.0020554065704346, + "learning_rate": 6.902309057058393e-05, + "loss": 0.8673, + "step": 117680 + }, + { + "epoch": 0.7518878652747786, + "grad_norm": 1.3021756410598755, + "learning_rate": 6.901845013660873e-05, + "loss": 0.9579, + "step": 117690 + }, + { + "epoch": 0.7519517524245173, + "grad_norm": 0.6825690865516663, + "learning_rate": 6.901380951110128e-05, + "loss": 0.6835, + "step": 117700 + }, + { + "epoch": 0.752015639574256, + "grad_norm": 1.2867560386657715, + "learning_rate": 6.900916869410831e-05, + "loss": 0.9329, + "step": 117710 + }, + { + "epoch": 0.7520795267239947, + "grad_norm": 0.7000182271003723, + "learning_rate": 6.900452768567657e-05, + "loss": 0.7614, + "step": 117720 + }, + { + "epoch": 0.7521434138737334, + "grad_norm": 0.7229273319244385, + "learning_rate": 6.89998864858528e-05, + "loss": 1.082, + "step": 117730 + }, + { + "epoch": 0.7522073010234721, + "grad_norm": 0.7700644135475159, + "learning_rate": 6.899524509468375e-05, + "loss": 1.1019, + "step": 117740 + }, + { + "epoch": 0.7522711881732108, + "grad_norm": 1.3923498392105103, + "learning_rate": 6.899060351221613e-05, + "loss": 0.8848, + "step": 117750 + }, + { + "epoch": 0.7523350753229495, + "grad_norm": 0.6365454792976379, + "learning_rate": 6.898596173849672e-05, + "loss": 1.1148, + "step": 117760 + }, + { + "epoch": 0.7523989624726882, + "grad_norm": 1.0675365924835205, + "learning_rate": 6.898131977357223e-05, + "loss": 0.9215, + "step": 117770 + }, + { + "epoch": 0.7524628496224269, + "grad_norm": 1.0569766759872437, + "learning_rate": 6.897667761748943e-05, + "loss": 0.7153, + "step": 117780 + }, + { + "epoch": 0.7525267367721656, + "grad_norm": 1.0126324892044067, + "learning_rate": 6.897203527029508e-05, + "loss": 0.925, + "step": 117790 + }, + { + "epoch": 0.7525906239219043, + "grad_norm": 0.9633859992027283, + "learning_rate": 6.896739273203592e-05, + "loss": 1.0008, + "step": 117800 + }, + { + "epoch": 0.752654511071643, + "grad_norm": 1.014870047569275, + "learning_rate": 6.896275000275872e-05, + "loss": 0.8294, + "step": 117810 + }, + { + "epoch": 0.7527183982213818, + "grad_norm": 1.1315689086914062, + "learning_rate": 6.895810708251019e-05, + "loss": 0.8227, + "step": 117820 + }, + { + "epoch": 0.7527822853711205, + "grad_norm": 2.6392271518707275, + "learning_rate": 6.895346397133714e-05, + "loss": 0.8076, + "step": 117830 + }, + { + "epoch": 0.7528461725208592, + "grad_norm": 0.7529024481773376, + "learning_rate": 6.89488206692863e-05, + "loss": 0.8401, + "step": 117840 + }, + { + "epoch": 0.7529100596705979, + "grad_norm": 0.9664776921272278, + "learning_rate": 6.894417717640447e-05, + "loss": 1.1103, + "step": 117850 + }, + { + "epoch": 0.7529739468203366, + "grad_norm": 0.732601523399353, + "learning_rate": 6.893953349273836e-05, + "loss": 0.7659, + "step": 117860 + }, + { + "epoch": 0.7530378339700753, + "grad_norm": 0.8885082006454468, + "learning_rate": 6.893488961833477e-05, + "loss": 0.7957, + "step": 117870 + }, + { + "epoch": 0.753101721119814, + "grad_norm": 1.1969749927520752, + "learning_rate": 6.893024555324045e-05, + "loss": 1.0971, + "step": 117880 + }, + { + "epoch": 0.7531656082695527, + "grad_norm": 1.1240543127059937, + "learning_rate": 6.892560129750221e-05, + "loss": 0.7384, + "step": 117890 + }, + { + "epoch": 0.7532294954192914, + "grad_norm": 1.059037208557129, + "learning_rate": 6.892095685116677e-05, + "loss": 0.7629, + "step": 
117900 + }, + { + "epoch": 0.7532933825690301, + "grad_norm": 0.8897120356559753, + "learning_rate": 6.891631221428092e-05, + "loss": 1.11, + "step": 117910 + }, + { + "epoch": 0.7533572697187688, + "grad_norm": 1.0134267807006836, + "learning_rate": 6.891166738689146e-05, + "loss": 0.8046, + "step": 117920 + }, + { + "epoch": 0.7534211568685074, + "grad_norm": 1.6655761003494263, + "learning_rate": 6.890702236904514e-05, + "loss": 0.9587, + "step": 117930 + }, + { + "epoch": 0.7534850440182461, + "grad_norm": 1.1611841917037964, + "learning_rate": 6.890237716078874e-05, + "loss": 1.0593, + "step": 117940 + }, + { + "epoch": 0.7535489311679848, + "grad_norm": 0.8009079098701477, + "learning_rate": 6.889773176216905e-05, + "loss": 0.901, + "step": 117950 + }, + { + "epoch": 0.7536128183177235, + "grad_norm": 1.4743709564208984, + "learning_rate": 6.889308617323286e-05, + "loss": 1.0526, + "step": 117960 + }, + { + "epoch": 0.7536767054674622, + "grad_norm": 0.7194849252700806, + "learning_rate": 6.888844039402695e-05, + "loss": 0.7397, + "step": 117970 + }, + { + "epoch": 0.7537405926172009, + "grad_norm": 1.0453287363052368, + "learning_rate": 6.88837944245981e-05, + "loss": 1.104, + "step": 117980 + }, + { + "epoch": 0.7538044797669397, + "grad_norm": 0.8043333292007446, + "learning_rate": 6.88791482649931e-05, + "loss": 0.798, + "step": 117990 + }, + { + "epoch": 0.7538683669166784, + "grad_norm": 0.8196787238121033, + "learning_rate": 6.887450191525875e-05, + "loss": 0.8353, + "step": 118000 + }, + { + "epoch": 0.7539322540664171, + "grad_norm": 0.6180843114852905, + "learning_rate": 6.886985537544183e-05, + "loss": 1.0143, + "step": 118010 + }, + { + "epoch": 0.7539961412161558, + "grad_norm": 1.4987810850143433, + "learning_rate": 6.886520864558914e-05, + "loss": 0.7404, + "step": 118020 + }, + { + "epoch": 0.7540600283658945, + "grad_norm": 0.7595736384391785, + "learning_rate": 6.886056172574747e-05, + "loss": 1.0103, + "step": 118030 + }, + { + "epoch": 0.7541239155156332, + "grad_norm": 0.7522437572479248, + "learning_rate": 6.885591461596364e-05, + "loss": 1.1119, + "step": 118040 + }, + { + "epoch": 0.7541878026653719, + "grad_norm": 0.7806057333946228, + "learning_rate": 6.885126731628445e-05, + "loss": 0.921, + "step": 118050 + }, + { + "epoch": 0.7542516898151106, + "grad_norm": 0.722546398639679, + "learning_rate": 6.884661982675666e-05, + "loss": 0.9464, + "step": 118060 + }, + { + "epoch": 0.7543155769648493, + "grad_norm": 1.262364149093628, + "learning_rate": 6.884197214742713e-05, + "loss": 0.9972, + "step": 118070 + }, + { + "epoch": 0.754379464114588, + "grad_norm": 0.8939563035964966, + "learning_rate": 6.883732427834263e-05, + "loss": 1.019, + "step": 118080 + }, + { + "epoch": 0.7544433512643267, + "grad_norm": 1.2244893312454224, + "learning_rate": 6.883267621954998e-05, + "loss": 0.965, + "step": 118090 + }, + { + "epoch": 0.7545072384140654, + "grad_norm": 0.7305797338485718, + "learning_rate": 6.882802797109599e-05, + "loss": 0.976, + "step": 118100 + }, + { + "epoch": 0.7545711255638041, + "grad_norm": 0.7178799510002136, + "learning_rate": 6.882337953302747e-05, + "loss": 0.781, + "step": 118110 + }, + { + "epoch": 0.7546350127135428, + "grad_norm": 1.3888294696807861, + "learning_rate": 6.881873090539121e-05, + "loss": 1.1481, + "step": 118120 + }, + { + "epoch": 0.7546988998632815, + "grad_norm": 1.024906873703003, + "learning_rate": 6.881408208823409e-05, + "loss": 0.8872, + "step": 118130 + }, + { + "epoch": 0.7547627870130202, + "grad_norm": 
1.0983256101608276, + "learning_rate": 6.880943308160287e-05, + "loss": 0.7869, + "step": 118140 + }, + { + "epoch": 0.754826674162759, + "grad_norm": 0.6162832975387573, + "learning_rate": 6.880478388554438e-05, + "loss": 0.5942, + "step": 118150 + }, + { + "epoch": 0.7548905613124977, + "grad_norm": 0.8574840426445007, + "learning_rate": 6.880013450010545e-05, + "loss": 0.8025, + "step": 118160 + }, + { + "epoch": 0.7549544484622364, + "grad_norm": 0.7543234825134277, + "learning_rate": 6.87954849253329e-05, + "loss": 0.6696, + "step": 118170 + }, + { + "epoch": 0.755018335611975, + "grad_norm": 0.6971087455749512, + "learning_rate": 6.879083516127356e-05, + "loss": 1.0234, + "step": 118180 + }, + { + "epoch": 0.7550822227617137, + "grad_norm": 1.7899150848388672, + "learning_rate": 6.878618520797424e-05, + "loss": 0.791, + "step": 118190 + }, + { + "epoch": 0.7551461099114524, + "grad_norm": 0.8531982898712158, + "learning_rate": 6.87815350654818e-05, + "loss": 0.9281, + "step": 118200 + }, + { + "epoch": 0.7552099970611911, + "grad_norm": 1.196276068687439, + "learning_rate": 6.877688473384304e-05, + "loss": 0.8512, + "step": 118210 + }, + { + "epoch": 0.7552738842109298, + "grad_norm": 0.9306949377059937, + "learning_rate": 6.877223421310481e-05, + "loss": 1.0111, + "step": 118220 + }, + { + "epoch": 0.7553377713606685, + "grad_norm": 0.6089597940444946, + "learning_rate": 6.876758350331395e-05, + "loss": 1.1216, + "step": 118230 + }, + { + "epoch": 0.7554016585104072, + "grad_norm": 0.9803066253662109, + "learning_rate": 6.876293260451728e-05, + "loss": 0.8555, + "step": 118240 + }, + { + "epoch": 0.7554655456601459, + "grad_norm": 1.0859441757202148, + "learning_rate": 6.875828151676165e-05, + "loss": 0.9888, + "step": 118250 + }, + { + "epoch": 0.7555294328098846, + "grad_norm": 2.5297441482543945, + "learning_rate": 6.875363024009389e-05, + "loss": 1.0641, + "step": 118260 + }, + { + "epoch": 0.7555933199596233, + "grad_norm": 0.7911348938941956, + "learning_rate": 6.874897877456086e-05, + "loss": 0.9526, + "step": 118270 + }, + { + "epoch": 0.755657207109362, + "grad_norm": 0.6605247855186462, + "learning_rate": 6.874432712020938e-05, + "loss": 0.7285, + "step": 118280 + }, + { + "epoch": 0.7557210942591007, + "grad_norm": 1.0286058187484741, + "learning_rate": 6.873967527708633e-05, + "loss": 0.8865, + "step": 118290 + }, + { + "epoch": 0.7557849814088394, + "grad_norm": 0.8798809051513672, + "learning_rate": 6.873502324523852e-05, + "loss": 0.8385, + "step": 118300 + }, + { + "epoch": 0.7558488685585781, + "grad_norm": 0.9660366177558899, + "learning_rate": 6.873037102471283e-05, + "loss": 0.8482, + "step": 118310 + }, + { + "epoch": 0.7559127557083168, + "grad_norm": 0.5325950980186462, + "learning_rate": 6.872571861555609e-05, + "loss": 0.903, + "step": 118320 + }, + { + "epoch": 0.7559766428580555, + "grad_norm": 0.715363085269928, + "learning_rate": 6.872106601781518e-05, + "loss": 1.1637, + "step": 118330 + }, + { + "epoch": 0.7560405300077943, + "grad_norm": 0.5783165097236633, + "learning_rate": 6.871641323153692e-05, + "loss": 1.0579, + "step": 118340 + }, + { + "epoch": 0.756104417157533, + "grad_norm": 0.7666789889335632, + "learning_rate": 6.871176025676818e-05, + "loss": 0.673, + "step": 118350 + }, + { + "epoch": 0.7561683043072717, + "grad_norm": 0.9602919816970825, + "learning_rate": 6.870710709355584e-05, + "loss": 0.795, + "step": 118360 + }, + { + "epoch": 0.7562321914570104, + "grad_norm": 0.9613474011421204, + "learning_rate": 6.870245374194675e-05, + 
"loss": 0.8152, + "step": 118370 + }, + { + "epoch": 0.7562960786067491, + "grad_norm": 0.9112039804458618, + "learning_rate": 6.869780020198777e-05, + "loss": 0.797, + "step": 118380 + }, + { + "epoch": 0.7563599657564878, + "grad_norm": 1.0254278182983398, + "learning_rate": 6.869314647372577e-05, + "loss": 1.0978, + "step": 118390 + }, + { + "epoch": 0.7564238529062265, + "grad_norm": 1.1823453903198242, + "learning_rate": 6.86884925572076e-05, + "loss": 1.1565, + "step": 118400 + }, + { + "epoch": 0.7564877400559652, + "grad_norm": 0.7849447727203369, + "learning_rate": 6.868383845248015e-05, + "loss": 0.949, + "step": 118410 + }, + { + "epoch": 0.7565516272057038, + "grad_norm": 1.0870212316513062, + "learning_rate": 6.867918415959028e-05, + "loss": 1.1851, + "step": 118420 + }, + { + "epoch": 0.7566155143554425, + "grad_norm": 1.099289894104004, + "learning_rate": 6.867452967858487e-05, + "loss": 0.6001, + "step": 118430 + }, + { + "epoch": 0.7566794015051812, + "grad_norm": 0.7557351589202881, + "learning_rate": 6.866987500951079e-05, + "loss": 1.1415, + "step": 118440 + }, + { + "epoch": 0.7567432886549199, + "grad_norm": 1.02070152759552, + "learning_rate": 6.866522015241493e-05, + "loss": 0.8612, + "step": 118450 + }, + { + "epoch": 0.7568071758046586, + "grad_norm": 1.3030376434326172, + "learning_rate": 6.866056510734414e-05, + "loss": 1.0833, + "step": 118460 + }, + { + "epoch": 0.7568710629543973, + "grad_norm": 1.1571980714797974, + "learning_rate": 6.86559098743453e-05, + "loss": 1.0281, + "step": 118470 + }, + { + "epoch": 0.756934950104136, + "grad_norm": 0.5520379543304443, + "learning_rate": 6.865125445346533e-05, + "loss": 0.7683, + "step": 118480 + }, + { + "epoch": 0.7569988372538747, + "grad_norm": 1.1610947847366333, + "learning_rate": 6.864659884475108e-05, + "loss": 0.8432, + "step": 118490 + }, + { + "epoch": 0.7570627244036134, + "grad_norm": 0.5098273754119873, + "learning_rate": 6.864194304824946e-05, + "loss": 0.9143, + "step": 118500 + }, + { + "epoch": 0.7571266115533521, + "grad_norm": 0.9760499596595764, + "learning_rate": 6.863728706400734e-05, + "loss": 0.8953, + "step": 118510 + }, + { + "epoch": 0.7571904987030909, + "grad_norm": 0.7442782521247864, + "learning_rate": 6.863263089207162e-05, + "loss": 1.1744, + "step": 118520 + }, + { + "epoch": 0.7572543858528296, + "grad_norm": 1.004907488822937, + "learning_rate": 6.862797453248918e-05, + "loss": 0.8577, + "step": 118530 + }, + { + "epoch": 0.7573182730025683, + "grad_norm": 1.782226800918579, + "learning_rate": 6.862331798530692e-05, + "loss": 0.8857, + "step": 118540 + }, + { + "epoch": 0.757382160152307, + "grad_norm": 0.9573549032211304, + "learning_rate": 6.861866125057175e-05, + "loss": 0.8601, + "step": 118550 + }, + { + "epoch": 0.7574460473020457, + "grad_norm": 1.518573522567749, + "learning_rate": 6.861400432833053e-05, + "loss": 1.1323, + "step": 118560 + }, + { + "epoch": 0.7575099344517844, + "grad_norm": 0.9608773589134216, + "learning_rate": 6.86093472186302e-05, + "loss": 0.9014, + "step": 118570 + }, + { + "epoch": 0.7575738216015231, + "grad_norm": 1.326219081878662, + "learning_rate": 6.860468992151764e-05, + "loss": 0.8637, + "step": 118580 + }, + { + "epoch": 0.7576377087512618, + "grad_norm": 0.6975284218788147, + "learning_rate": 6.860003243703976e-05, + "loss": 0.8435, + "step": 118590 + }, + { + "epoch": 0.7577015959010005, + "grad_norm": 0.8688485026359558, + "learning_rate": 6.859537476524346e-05, + "loss": 0.7777, + "step": 118600 + }, + { + "epoch": 0.7577654830507392, 
+ "grad_norm": 1.3222057819366455, + "learning_rate": 6.859071690617565e-05, + "loss": 0.8211, + "step": 118610 + }, + { + "epoch": 0.7578293702004779, + "grad_norm": 0.7221174836158752, + "learning_rate": 6.858605885988325e-05, + "loss": 0.9684, + "step": 118620 + }, + { + "epoch": 0.7578932573502166, + "grad_norm": 1.1914112567901611, + "learning_rate": 6.858140062641313e-05, + "loss": 0.7354, + "step": 118630 + }, + { + "epoch": 0.7579571444999553, + "grad_norm": 0.7366169095039368, + "learning_rate": 6.857674220581225e-05, + "loss": 0.8568, + "step": 118640 + }, + { + "epoch": 0.758021031649694, + "grad_norm": 0.91144859790802, + "learning_rate": 6.85720835981275e-05, + "loss": 0.8024, + "step": 118650 + }, + { + "epoch": 0.7580849187994326, + "grad_norm": 1.1600289344787598, + "learning_rate": 6.856742480340581e-05, + "loss": 0.7997, + "step": 118660 + }, + { + "epoch": 0.7581488059491713, + "grad_norm": 1.0003736019134521, + "learning_rate": 6.856276582169408e-05, + "loss": 0.8877, + "step": 118670 + }, + { + "epoch": 0.75821269309891, + "grad_norm": 1.289528489112854, + "learning_rate": 6.855810665303923e-05, + "loss": 0.6617, + "step": 118680 + }, + { + "epoch": 0.7582765802486487, + "grad_norm": 0.9788122773170471, + "learning_rate": 6.85534472974882e-05, + "loss": 0.7778, + "step": 118690 + }, + { + "epoch": 0.7583404673983875, + "grad_norm": 0.6570330858230591, + "learning_rate": 6.854878775508792e-05, + "loss": 0.9725, + "step": 118700 + }, + { + "epoch": 0.7584043545481262, + "grad_norm": 0.911495566368103, + "learning_rate": 6.854412802588528e-05, + "loss": 0.83, + "step": 118710 + }, + { + "epoch": 0.7584682416978649, + "grad_norm": 0.729153037071228, + "learning_rate": 6.853946810992722e-05, + "loss": 1.1369, + "step": 118720 + }, + { + "epoch": 0.7585321288476036, + "grad_norm": 1.3419857025146484, + "learning_rate": 6.853480800726069e-05, + "loss": 0.7661, + "step": 118730 + }, + { + "epoch": 0.7585960159973423, + "grad_norm": 0.9699906706809998, + "learning_rate": 6.853014771793261e-05, + "loss": 0.9556, + "step": 118740 + }, + { + "epoch": 0.758659903147081, + "grad_norm": 0.9490513205528259, + "learning_rate": 6.852548724198992e-05, + "loss": 0.8333, + "step": 118750 + }, + { + "epoch": 0.7587237902968197, + "grad_norm": 0.9197043776512146, + "learning_rate": 6.852082657947953e-05, + "loss": 0.888, + "step": 118760 + }, + { + "epoch": 0.7587876774465584, + "grad_norm": 0.6110111474990845, + "learning_rate": 6.85161657304484e-05, + "loss": 0.8897, + "step": 118770 + }, + { + "epoch": 0.7588515645962971, + "grad_norm": 0.7827733159065247, + "learning_rate": 6.851150469494347e-05, + "loss": 0.6686, + "step": 118780 + }, + { + "epoch": 0.7589154517460358, + "grad_norm": 0.9810250997543335, + "learning_rate": 6.850684347301166e-05, + "loss": 0.9115, + "step": 118790 + }, + { + "epoch": 0.7589793388957745, + "grad_norm": 0.816536545753479, + "learning_rate": 6.850218206469993e-05, + "loss": 1.0305, + "step": 118800 + }, + { + "epoch": 0.7590432260455132, + "grad_norm": 0.9732635617256165, + "learning_rate": 6.849752047005522e-05, + "loss": 0.8531, + "step": 118810 + }, + { + "epoch": 0.7591071131952519, + "grad_norm": 1.23931086063385, + "learning_rate": 6.849285868912448e-05, + "loss": 0.9552, + "step": 118820 + }, + { + "epoch": 0.7591710003449906, + "grad_norm": 1.2935327291488647, + "learning_rate": 6.848819672195466e-05, + "loss": 0.8719, + "step": 118830 + }, + { + "epoch": 0.7592348874947293, + "grad_norm": 1.2548699378967285, + "learning_rate": 6.84835345685927e-05, 
+ "loss": 0.9801, + "step": 118840 + }, + { + "epoch": 0.759298774644468, + "grad_norm": 1.043545126914978, + "learning_rate": 6.847887222908555e-05, + "loss": 1.1205, + "step": 118850 + }, + { + "epoch": 0.7593626617942068, + "grad_norm": 0.5109646320343018, + "learning_rate": 6.847420970348018e-05, + "loss": 0.8246, + "step": 118860 + }, + { + "epoch": 0.7594265489439455, + "grad_norm": 0.5622779130935669, + "learning_rate": 6.846954699182352e-05, + "loss": 0.7426, + "step": 118870 + }, + { + "epoch": 0.7594904360936842, + "grad_norm": 0.672226071357727, + "learning_rate": 6.846488409416256e-05, + "loss": 0.7645, + "step": 118880 + }, + { + "epoch": 0.7595543232434229, + "grad_norm": 0.6986059546470642, + "learning_rate": 6.846022101054422e-05, + "loss": 1.0862, + "step": 118890 + }, + { + "epoch": 0.7596182103931616, + "grad_norm": 1.0945719480514526, + "learning_rate": 6.84555577410155e-05, + "loss": 1.0323, + "step": 118900 + }, + { + "epoch": 0.7596820975429002, + "grad_norm": 0.9224714040756226, + "learning_rate": 6.845089428562336e-05, + "loss": 0.8719, + "step": 118910 + }, + { + "epoch": 0.7597459846926389, + "grad_norm": 0.9247092008590698, + "learning_rate": 6.844623064441473e-05, + "loss": 0.8067, + "step": 118920 + }, + { + "epoch": 0.7598098718423776, + "grad_norm": 0.732523500919342, + "learning_rate": 6.84415668174366e-05, + "loss": 1.2519, + "step": 118930 + }, + { + "epoch": 0.7598737589921163, + "grad_norm": 0.9560425281524658, + "learning_rate": 6.843690280473596e-05, + "loss": 0.903, + "step": 118940 + }, + { + "epoch": 0.759937646141855, + "grad_norm": 0.913837194442749, + "learning_rate": 6.843223860635974e-05, + "loss": 0.9495, + "step": 118950 + }, + { + "epoch": 0.7600015332915937, + "grad_norm": 1.002140998840332, + "learning_rate": 6.842757422235494e-05, + "loss": 0.8853, + "step": 118960 + }, + { + "epoch": 0.7600654204413324, + "grad_norm": 0.9161799550056458, + "learning_rate": 6.842290965276852e-05, + "loss": 0.8618, + "step": 118970 + }, + { + "epoch": 0.7601293075910711, + "grad_norm": 0.6475778222084045, + "learning_rate": 6.841824489764746e-05, + "loss": 0.809, + "step": 118980 + }, + { + "epoch": 0.7601931947408098, + "grad_norm": 1.090684413909912, + "learning_rate": 6.841357995703874e-05, + "loss": 1.0392, + "step": 118990 + }, + { + "epoch": 0.7602570818905485, + "grad_norm": 0.6572669148445129, + "learning_rate": 6.840891483098935e-05, + "loss": 1.0299, + "step": 119000 + }, + { + "epoch": 0.7603209690402872, + "grad_norm": 0.9290599226951599, + "learning_rate": 6.840424951954625e-05, + "loss": 0.9194, + "step": 119010 + }, + { + "epoch": 0.7603848561900259, + "grad_norm": 0.7174366116523743, + "learning_rate": 6.839958402275643e-05, + "loss": 0.8349, + "step": 119020 + }, + { + "epoch": 0.7604487433397646, + "grad_norm": 0.9106315970420837, + "learning_rate": 6.839491834066691e-05, + "loss": 0.8991, + "step": 119030 + }, + { + "epoch": 0.7605126304895033, + "grad_norm": 1.6887893676757812, + "learning_rate": 6.839025247332462e-05, + "loss": 0.8116, + "step": 119040 + }, + { + "epoch": 0.7605765176392421, + "grad_norm": 0.7932513356208801, + "learning_rate": 6.838558642077658e-05, + "loss": 0.952, + "step": 119050 + }, + { + "epoch": 0.7606404047889808, + "grad_norm": 0.733961284160614, + "learning_rate": 6.838092018306979e-05, + "loss": 0.9746, + "step": 119060 + }, + { + "epoch": 0.7607042919387195, + "grad_norm": 0.9534251093864441, + "learning_rate": 6.837625376025123e-05, + "loss": 0.9129, + "step": 119070 + }, + { + "epoch": 
0.7607681790884582, + "grad_norm": 0.867732048034668, + "learning_rate": 6.837158715236789e-05, + "loss": 1.0368, + "step": 119080 + }, + { + "epoch": 0.7608320662381969, + "grad_norm": 0.9323291778564453, + "learning_rate": 6.836692035946677e-05, + "loss": 0.7597, + "step": 119090 + }, + { + "epoch": 0.7608959533879356, + "grad_norm": 1.1437997817993164, + "learning_rate": 6.83622533815949e-05, + "loss": 0.9955, + "step": 119100 + }, + { + "epoch": 0.7609598405376743, + "grad_norm": 0.6462964415550232, + "learning_rate": 6.835758621879922e-05, + "loss": 1.2323, + "step": 119110 + }, + { + "epoch": 0.761023727687413, + "grad_norm": 0.7670947313308716, + "learning_rate": 6.835291887112678e-05, + "loss": 0.762, + "step": 119120 + }, + { + "epoch": 0.7610876148371517, + "grad_norm": 1.1615947484970093, + "learning_rate": 6.834825133862457e-05, + "loss": 0.8781, + "step": 119130 + }, + { + "epoch": 0.7611515019868904, + "grad_norm": 0.6855329871177673, + "learning_rate": 6.834358362133959e-05, + "loss": 0.9, + "step": 119140 + }, + { + "epoch": 0.761215389136629, + "grad_norm": 1.5815876722335815, + "learning_rate": 6.833891571931886e-05, + "loss": 0.6895, + "step": 119150 + }, + { + "epoch": 0.7612792762863677, + "grad_norm": 0.804578423500061, + "learning_rate": 6.833424763260938e-05, + "loss": 0.8916, + "step": 119160 + }, + { + "epoch": 0.7613431634361064, + "grad_norm": 0.6342503428459167, + "learning_rate": 6.832957936125816e-05, + "loss": 1.0142, + "step": 119170 + }, + { + "epoch": 0.7614070505858451, + "grad_norm": 1.194042682647705, + "learning_rate": 6.832491090531223e-05, + "loss": 0.7734, + "step": 119180 + }, + { + "epoch": 0.7614709377355838, + "grad_norm": 0.8138452172279358, + "learning_rate": 6.83202422648186e-05, + "loss": 0.833, + "step": 119190 + }, + { + "epoch": 0.7615348248853225, + "grad_norm": 0.6419638395309448, + "learning_rate": 6.831557343982427e-05, + "loss": 0.8826, + "step": 119200 + }, + { + "epoch": 0.7615987120350612, + "grad_norm": 0.9119747281074524, + "learning_rate": 6.831090443037626e-05, + "loss": 0.8635, + "step": 119210 + }, + { + "epoch": 0.7616625991848, + "grad_norm": 1.2391308546066284, + "learning_rate": 6.83062352365216e-05, + "loss": 1.0166, + "step": 119220 + }, + { + "epoch": 0.7617264863345387, + "grad_norm": 1.1494985818862915, + "learning_rate": 6.830156585830734e-05, + "loss": 1.0373, + "step": 119230 + }, + { + "epoch": 0.7617903734842774, + "grad_norm": 0.8222819566726685, + "learning_rate": 6.829689629578046e-05, + "loss": 0.7228, + "step": 119240 + }, + { + "epoch": 0.7618542606340161, + "grad_norm": 0.60460364818573, + "learning_rate": 6.829222654898799e-05, + "loss": 0.8322, + "step": 119250 + }, + { + "epoch": 0.7619181477837548, + "grad_norm": 1.7040772438049316, + "learning_rate": 6.828755661797699e-05, + "loss": 1.1171, + "step": 119260 + }, + { + "epoch": 0.7619820349334935, + "grad_norm": 0.7591485977172852, + "learning_rate": 6.828288650279448e-05, + "loss": 0.8535, + "step": 119270 + }, + { + "epoch": 0.7620459220832322, + "grad_norm": 1.0769449472427368, + "learning_rate": 6.827821620348749e-05, + "loss": 1.0974, + "step": 119280 + }, + { + "epoch": 0.7621098092329709, + "grad_norm": 0.7819190621376038, + "learning_rate": 6.827354572010303e-05, + "loss": 0.9247, + "step": 119290 + }, + { + "epoch": 0.7621736963827096, + "grad_norm": 0.7512619495391846, + "learning_rate": 6.826887505268818e-05, + "loss": 1.4029, + "step": 119300 + }, + { + "epoch": 0.7622375835324483, + "grad_norm": 0.7051581740379333, + "learning_rate": 
6.826420420128993e-05, + "loss": 0.7934, + "step": 119310 + }, + { + "epoch": 0.762301470682187, + "grad_norm": 2.504819393157959, + "learning_rate": 6.825953316595535e-05, + "loss": 0.9571, + "step": 119320 + }, + { + "epoch": 0.7623653578319257, + "grad_norm": 1.1130508184432983, + "learning_rate": 6.825486194673148e-05, + "loss": 0.8393, + "step": 119330 + }, + { + "epoch": 0.7624292449816644, + "grad_norm": 0.8944107890129089, + "learning_rate": 6.825019054366536e-05, + "loss": 0.7698, + "step": 119340 + }, + { + "epoch": 0.7624931321314031, + "grad_norm": 1.020553469657898, + "learning_rate": 6.824551895680404e-05, + "loss": 0.8631, + "step": 119350 + }, + { + "epoch": 0.7625570192811418, + "grad_norm": 0.9540114402770996, + "learning_rate": 6.824084718619454e-05, + "loss": 0.8874, + "step": 119360 + }, + { + "epoch": 0.7626209064308805, + "grad_norm": 0.6075379252433777, + "learning_rate": 6.823617523188394e-05, + "loss": 0.8942, + "step": 119370 + }, + { + "epoch": 0.7626847935806192, + "grad_norm": 1.7551565170288086, + "learning_rate": 6.823150309391928e-05, + "loss": 1.041, + "step": 119380 + }, + { + "epoch": 0.7627486807303578, + "grad_norm": 0.7924169898033142, + "learning_rate": 6.82268307723476e-05, + "loss": 0.8415, + "step": 119390 + }, + { + "epoch": 0.7628125678800965, + "grad_norm": 0.9569699168205261, + "learning_rate": 6.822215826721597e-05, + "loss": 0.8566, + "step": 119400 + }, + { + "epoch": 0.7628764550298353, + "grad_norm": 0.8898611068725586, + "learning_rate": 6.821748557857144e-05, + "loss": 0.6289, + "step": 119410 + }, + { + "epoch": 0.762940342179574, + "grad_norm": 1.3441417217254639, + "learning_rate": 6.821281270646106e-05, + "loss": 0.7926, + "step": 119420 + }, + { + "epoch": 0.7630042293293127, + "grad_norm": 0.8085065484046936, + "learning_rate": 6.820813965093193e-05, + "loss": 1.0383, + "step": 119430 + }, + { + "epoch": 0.7630681164790514, + "grad_norm": 1.1543138027191162, + "learning_rate": 6.820346641203106e-05, + "loss": 0.769, + "step": 119440 + }, + { + "epoch": 0.7631320036287901, + "grad_norm": 1.0326635837554932, + "learning_rate": 6.819879298980553e-05, + "loss": 0.6679, + "step": 119450 + }, + { + "epoch": 0.7631958907785288, + "grad_norm": 0.7548609972000122, + "learning_rate": 6.819411938430243e-05, + "loss": 0.8651, + "step": 119460 + }, + { + "epoch": 0.7632597779282675, + "grad_norm": 0.6115458607673645, + "learning_rate": 6.818944559556879e-05, + "loss": 0.9125, + "step": 119470 + }, + { + "epoch": 0.7633236650780062, + "grad_norm": 0.8484747409820557, + "learning_rate": 6.818477162365172e-05, + "loss": 0.94, + "step": 119480 + }, + { + "epoch": 0.7633875522277449, + "grad_norm": 0.9800739288330078, + "learning_rate": 6.818009746859823e-05, + "loss": 0.6768, + "step": 119490 + }, + { + "epoch": 0.7634514393774836, + "grad_norm": 1.5265213251113892, + "learning_rate": 6.817542313045547e-05, + "loss": 1.0567, + "step": 119500 + }, + { + "epoch": 0.7635153265272223, + "grad_norm": 1.5931601524353027, + "learning_rate": 6.817074860927045e-05, + "loss": 1.4033, + "step": 119510 + }, + { + "epoch": 0.763579213676961, + "grad_norm": 0.976694643497467, + "learning_rate": 6.816607390509028e-05, + "loss": 0.8279, + "step": 119520 + }, + { + "epoch": 0.7636431008266997, + "grad_norm": 0.9799617528915405, + "learning_rate": 6.816139901796202e-05, + "loss": 1.0672, + "step": 119530 + }, + { + "epoch": 0.7637069879764384, + "grad_norm": 1.1323072910308838, + "learning_rate": 6.815672394793277e-05, + "loss": 0.9828, + "step": 119540 + }, + { + 
"epoch": 0.7637708751261771, + "grad_norm": 1.2014492750167847, + "learning_rate": 6.815204869504961e-05, + "loss": 1.0682, + "step": 119550 + }, + { + "epoch": 0.7638347622759158, + "grad_norm": 3.5408363342285156, + "learning_rate": 6.81473732593596e-05, + "loss": 1.0801, + "step": 119560 + }, + { + "epoch": 0.7638986494256546, + "grad_norm": 0.9492976069450378, + "learning_rate": 6.814269764090986e-05, + "loss": 0.9406, + "step": 119570 + }, + { + "epoch": 0.7639625365753933, + "grad_norm": 0.7474743723869324, + "learning_rate": 6.813802183974745e-05, + "loss": 0.8298, + "step": 119580 + }, + { + "epoch": 0.764026423725132, + "grad_norm": 1.4195810556411743, + "learning_rate": 6.813334585591946e-05, + "loss": 1.1686, + "step": 119590 + }, + { + "epoch": 0.7640903108748707, + "grad_norm": 1.6396797895431519, + "learning_rate": 6.8128669689473e-05, + "loss": 0.7154, + "step": 119600 + }, + { + "epoch": 0.7641541980246094, + "grad_norm": 1.0308012962341309, + "learning_rate": 6.812399334045514e-05, + "loss": 0.6851, + "step": 119610 + }, + { + "epoch": 0.7642180851743481, + "grad_norm": 0.7701680064201355, + "learning_rate": 6.8119316808913e-05, + "loss": 0.987, + "step": 119620 + }, + { + "epoch": 0.7642819723240867, + "grad_norm": 0.8354985117912292, + "learning_rate": 6.811464009489365e-05, + "loss": 0.8276, + "step": 119630 + }, + { + "epoch": 0.7643458594738254, + "grad_norm": 1.0913121700286865, + "learning_rate": 6.810996319844422e-05, + "loss": 0.8687, + "step": 119640 + }, + { + "epoch": 0.7644097466235641, + "grad_norm": 0.845710039138794, + "learning_rate": 6.81052861196118e-05, + "loss": 0.9144, + "step": 119650 + }, + { + "epoch": 0.7644736337733028, + "grad_norm": 0.9504249095916748, + "learning_rate": 6.810060885844346e-05, + "loss": 0.9089, + "step": 119660 + }, + { + "epoch": 0.7645375209230415, + "grad_norm": 0.846555769443512, + "learning_rate": 6.809593141498633e-05, + "loss": 0.9722, + "step": 119670 + }, + { + "epoch": 0.7646014080727802, + "grad_norm": 1.736290693283081, + "learning_rate": 6.809125378928754e-05, + "loss": 0.8593, + "step": 119680 + }, + { + "epoch": 0.7646652952225189, + "grad_norm": 0.732244610786438, + "learning_rate": 6.808657598139416e-05, + "loss": 0.8176, + "step": 119690 + }, + { + "epoch": 0.7647291823722576, + "grad_norm": 0.5221996307373047, + "learning_rate": 6.80818979913533e-05, + "loss": 0.7639, + "step": 119700 + }, + { + "epoch": 0.7647930695219963, + "grad_norm": 0.8514750003814697, + "learning_rate": 6.80772198192121e-05, + "loss": 0.9472, + "step": 119710 + }, + { + "epoch": 0.764856956671735, + "grad_norm": 0.9706042408943176, + "learning_rate": 6.807254146501766e-05, + "loss": 0.8994, + "step": 119720 + }, + { + "epoch": 0.7649208438214737, + "grad_norm": 0.792775571346283, + "learning_rate": 6.806786292881708e-05, + "loss": 0.9944, + "step": 119730 + }, + { + "epoch": 0.7649847309712124, + "grad_norm": 0.786178469657898, + "learning_rate": 6.80631842106575e-05, + "loss": 0.8212, + "step": 119740 + }, + { + "epoch": 0.7650486181209512, + "grad_norm": 0.7634421586990356, + "learning_rate": 6.805850531058604e-05, + "loss": 0.896, + "step": 119750 + }, + { + "epoch": 0.7651125052706899, + "grad_norm": 1.2600396871566772, + "learning_rate": 6.805382622864978e-05, + "loss": 0.8976, + "step": 119760 + }, + { + "epoch": 0.7651763924204286, + "grad_norm": 0.9852913618087769, + "learning_rate": 6.804914696489587e-05, + "loss": 0.807, + "step": 119770 + }, + { + "epoch": 0.7652402795701673, + "grad_norm": 0.7352543473243713, + 
"learning_rate": 6.804446751937146e-05, + "loss": 0.9483, + "step": 119780 + }, + { + "epoch": 0.765304166719906, + "grad_norm": 0.6477217674255371, + "learning_rate": 6.803978789212363e-05, + "loss": 0.7509, + "step": 119790 + }, + { + "epoch": 0.7653680538696447, + "grad_norm": 0.7691764831542969, + "learning_rate": 6.803510808319954e-05, + "loss": 0.9045, + "step": 119800 + }, + { + "epoch": 0.7654319410193834, + "grad_norm": 1.1947227716445923, + "learning_rate": 6.803042809264632e-05, + "loss": 1.0757, + "step": 119810 + }, + { + "epoch": 0.7654958281691221, + "grad_norm": 0.9047258496284485, + "learning_rate": 6.802574792051107e-05, + "loss": 0.8635, + "step": 119820 + }, + { + "epoch": 0.7655597153188608, + "grad_norm": 0.8005874752998352, + "learning_rate": 6.802106756684096e-05, + "loss": 0.7446, + "step": 119830 + }, + { + "epoch": 0.7656236024685995, + "grad_norm": 0.7462660670280457, + "learning_rate": 6.80163870316831e-05, + "loss": 0.6892, + "step": 119840 + }, + { + "epoch": 0.7656874896183382, + "grad_norm": 0.7342929244041443, + "learning_rate": 6.801170631508465e-05, + "loss": 0.8575, + "step": 119850 + }, + { + "epoch": 0.7657513767680769, + "grad_norm": 0.6299241781234741, + "learning_rate": 6.800702541709272e-05, + "loss": 1.0322, + "step": 119860 + }, + { + "epoch": 0.7658152639178156, + "grad_norm": 2.7845346927642822, + "learning_rate": 6.800234433775448e-05, + "loss": 0.9482, + "step": 119870 + }, + { + "epoch": 0.7658791510675542, + "grad_norm": 0.7888658046722412, + "learning_rate": 6.799766307711704e-05, + "loss": 0.9034, + "step": 119880 + }, + { + "epoch": 0.7659430382172929, + "grad_norm": 1.2494713068008423, + "learning_rate": 6.799298163522757e-05, + "loss": 0.7792, + "step": 119890 + }, + { + "epoch": 0.7660069253670316, + "grad_norm": 0.8245709538459778, + "learning_rate": 6.79883000121332e-05, + "loss": 0.7697, + "step": 119900 + }, + { + "epoch": 0.7660708125167703, + "grad_norm": 0.6942414045333862, + "learning_rate": 6.79836182078811e-05, + "loss": 0.9865, + "step": 119910 + }, + { + "epoch": 0.766134699666509, + "grad_norm": 0.8170196413993835, + "learning_rate": 6.797893622251841e-05, + "loss": 0.8344, + "step": 119920 + }, + { + "epoch": 0.7661985868162478, + "grad_norm": 1.3100725412368774, + "learning_rate": 6.797425405609225e-05, + "loss": 0.8997, + "step": 119930 + }, + { + "epoch": 0.7662624739659865, + "grad_norm": 0.8601463437080383, + "learning_rate": 6.796957170864984e-05, + "loss": 0.7987, + "step": 119940 + }, + { + "epoch": 0.7663263611157252, + "grad_norm": 0.9720836877822876, + "learning_rate": 6.796488918023827e-05, + "loss": 0.725, + "step": 119950 + }, + { + "epoch": 0.7663902482654639, + "grad_norm": 0.8214823007583618, + "learning_rate": 6.796020647090472e-05, + "loss": 0.6698, + "step": 119960 + }, + { + "epoch": 0.7664541354152026, + "grad_norm": 1.443429946899414, + "learning_rate": 6.795552358069637e-05, + "loss": 0.9442, + "step": 119970 + }, + { + "epoch": 0.7665180225649413, + "grad_norm": 0.8205900192260742, + "learning_rate": 6.795084050966038e-05, + "loss": 0.9229, + "step": 119980 + }, + { + "epoch": 0.76658190971468, + "grad_norm": 0.7707697153091431, + "learning_rate": 6.794615725784386e-05, + "loss": 1.2311, + "step": 119990 + }, + { + "epoch": 0.7666457968644187, + "grad_norm": 0.9643944501876831, + "learning_rate": 6.794147382529403e-05, + "loss": 0.8979, + "step": 120000 + }, + { + "epoch": 0.7667096840141574, + "grad_norm": 0.6266613006591797, + "learning_rate": 6.793679021205804e-05, + "loss": 0.8486, + "step": 
120010 + }, + { + "epoch": 0.7667735711638961, + "grad_norm": 0.7396105527877808, + "learning_rate": 6.793210641818305e-05, + "loss": 0.6949, + "step": 120020 + }, + { + "epoch": 0.7668374583136348, + "grad_norm": 1.156002402305603, + "learning_rate": 6.792742244371624e-05, + "loss": 0.8869, + "step": 120030 + }, + { + "epoch": 0.7669013454633735, + "grad_norm": 0.8425427079200745, + "learning_rate": 6.792273828870477e-05, + "loss": 0.6861, + "step": 120040 + }, + { + "epoch": 0.7669652326131122, + "grad_norm": 0.8769024610519409, + "learning_rate": 6.791805395319582e-05, + "loss": 0.7712, + "step": 120050 + }, + { + "epoch": 0.7670291197628509, + "grad_norm": 0.7793144583702087, + "learning_rate": 6.791336943723657e-05, + "loss": 0.8332, + "step": 120060 + }, + { + "epoch": 0.7670930069125896, + "grad_norm": 0.6223095655441284, + "learning_rate": 6.790868474087419e-05, + "loss": 0.8934, + "step": 120070 + }, + { + "epoch": 0.7671568940623283, + "grad_norm": 1.2079180479049683, + "learning_rate": 6.790399986415587e-05, + "loss": 1.0223, + "step": 120080 + }, + { + "epoch": 0.767220781212067, + "grad_norm": 3.4866483211517334, + "learning_rate": 6.789931480712876e-05, + "loss": 0.9603, + "step": 120090 + }, + { + "epoch": 0.7672846683618058, + "grad_norm": 0.7265200614929199, + "learning_rate": 6.789462956984008e-05, + "loss": 0.8821, + "step": 120100 + }, + { + "epoch": 0.7673485555115445, + "grad_norm": 0.47647228837013245, + "learning_rate": 6.788994415233699e-05, + "loss": 0.8709, + "step": 120110 + }, + { + "epoch": 0.7674124426612831, + "grad_norm": 0.7587248682975769, + "learning_rate": 6.78852585546667e-05, + "loss": 0.9765, + "step": 120120 + }, + { + "epoch": 0.7674763298110218, + "grad_norm": 1.0275150537490845, + "learning_rate": 6.788057277687638e-05, + "loss": 0.9257, + "step": 120130 + }, + { + "epoch": 0.7675402169607605, + "grad_norm": 2.4082839488983154, + "learning_rate": 6.787588681901321e-05, + "loss": 0.7645, + "step": 120140 + }, + { + "epoch": 0.7676041041104992, + "grad_norm": 0.8140583634376526, + "learning_rate": 6.78712006811244e-05, + "loss": 0.9093, + "step": 120150 + }, + { + "epoch": 0.7676679912602379, + "grad_norm": 0.5556838512420654, + "learning_rate": 6.786651436325715e-05, + "loss": 0.8601, + "step": 120160 + }, + { + "epoch": 0.7677318784099766, + "grad_norm": 0.8413486480712891, + "learning_rate": 6.786182786545863e-05, + "loss": 0.7334, + "step": 120170 + }, + { + "epoch": 0.7677957655597153, + "grad_norm": 0.7864370346069336, + "learning_rate": 6.785714118777607e-05, + "loss": 1.0277, + "step": 120180 + }, + { + "epoch": 0.767859652709454, + "grad_norm": 0.9981165528297424, + "learning_rate": 6.785245433025662e-05, + "loss": 0.7542, + "step": 120190 + }, + { + "epoch": 0.7679235398591927, + "grad_norm": 2.119781255722046, + "learning_rate": 6.784776729294752e-05, + "loss": 0.6569, + "step": 120200 + }, + { + "epoch": 0.7679874270089314, + "grad_norm": 1.0580250024795532, + "learning_rate": 6.784308007589598e-05, + "loss": 0.7881, + "step": 120210 + }, + { + "epoch": 0.7680513141586701, + "grad_norm": 0.9876987338066101, + "learning_rate": 6.783839267914918e-05, + "loss": 0.7479, + "step": 120220 + }, + { + "epoch": 0.7681152013084088, + "grad_norm": 0.8951814770698547, + "learning_rate": 6.783370510275433e-05, + "loss": 0.8872, + "step": 120230 + }, + { + "epoch": 0.7681790884581475, + "grad_norm": 2.5809152126312256, + "learning_rate": 6.782901734675864e-05, + "loss": 0.8542, + "step": 120240 + }, + { + "epoch": 0.7682429756078862, + "grad_norm": 
0.884162962436676, + "learning_rate": 6.782432941120932e-05, + "loss": 0.7915, + "step": 120250 + }, + { + "epoch": 0.7683068627576249, + "grad_norm": 0.7958625555038452, + "learning_rate": 6.781964129615359e-05, + "loss": 0.9709, + "step": 120260 + }, + { + "epoch": 0.7683707499073636, + "grad_norm": 0.8712575435638428, + "learning_rate": 6.781495300163865e-05, + "loss": 0.8752, + "step": 120270 + }, + { + "epoch": 0.7684346370571024, + "grad_norm": 0.8485830426216125, + "learning_rate": 6.781026452771172e-05, + "loss": 1.0295, + "step": 120280 + }, + { + "epoch": 0.7684985242068411, + "grad_norm": 0.7899221777915955, + "learning_rate": 6.780557587442001e-05, + "loss": 0.7579, + "step": 120290 + }, + { + "epoch": 0.7685624113565798, + "grad_norm": 0.5380288362503052, + "learning_rate": 6.780088704181075e-05, + "loss": 1.2273, + "step": 120300 + }, + { + "epoch": 0.7686262985063185, + "grad_norm": 0.8067999482154846, + "learning_rate": 6.779619802993118e-05, + "loss": 0.9209, + "step": 120310 + }, + { + "epoch": 0.7686901856560572, + "grad_norm": 0.7237452268600464, + "learning_rate": 6.779150883882848e-05, + "loss": 1.0752, + "step": 120320 + }, + { + "epoch": 0.7687540728057959, + "grad_norm": 0.6322763562202454, + "learning_rate": 6.77868194685499e-05, + "loss": 0.7969, + "step": 120330 + }, + { + "epoch": 0.7688179599555346, + "grad_norm": 1.1552351713180542, + "learning_rate": 6.778212991914266e-05, + "loss": 0.9154, + "step": 120340 + }, + { + "epoch": 0.7688818471052733, + "grad_norm": 1.1436083316802979, + "learning_rate": 6.777744019065399e-05, + "loss": 0.9167, + "step": 120350 + }, + { + "epoch": 0.7689457342550119, + "grad_norm": 1.0631415843963623, + "learning_rate": 6.77727502831311e-05, + "loss": 0.8223, + "step": 120360 + }, + { + "epoch": 0.7690096214047506, + "grad_norm": 0.9322156310081482, + "learning_rate": 6.776806019662127e-05, + "loss": 0.9355, + "step": 120370 + }, + { + "epoch": 0.7690735085544893, + "grad_norm": 0.9718419909477234, + "learning_rate": 6.776336993117168e-05, + "loss": 0.8536, + "step": 120380 + }, + { + "epoch": 0.769137395704228, + "grad_norm": 1.3241702318191528, + "learning_rate": 6.775867948682959e-05, + "loss": 0.9899, + "step": 120390 + }, + { + "epoch": 0.7692012828539667, + "grad_norm": 1.2391200065612793, + "learning_rate": 6.775398886364224e-05, + "loss": 0.7317, + "step": 120400 + }, + { + "epoch": 0.7692651700037054, + "grad_norm": 0.8078621029853821, + "learning_rate": 6.774929806165686e-05, + "loss": 1.015, + "step": 120410 + }, + { + "epoch": 0.7693290571534441, + "grad_norm": 0.7837865948677063, + "learning_rate": 6.77446070809207e-05, + "loss": 0.8237, + "step": 120420 + }, + { + "epoch": 0.7693929443031828, + "grad_norm": 0.9741398096084595, + "learning_rate": 6.773991592148098e-05, + "loss": 0.8702, + "step": 120430 + }, + { + "epoch": 0.7694568314529215, + "grad_norm": 1.1501209735870361, + "learning_rate": 6.773522458338497e-05, + "loss": 0.9291, + "step": 120440 + }, + { + "epoch": 0.7695207186026602, + "grad_norm": 0.5776450634002686, + "learning_rate": 6.77305330666799e-05, + "loss": 1.0505, + "step": 120450 + }, + { + "epoch": 0.769584605752399, + "grad_norm": 0.7684696316719055, + "learning_rate": 6.772584137141302e-05, + "loss": 0.7328, + "step": 120460 + }, + { + "epoch": 0.7696484929021377, + "grad_norm": 0.680523157119751, + "learning_rate": 6.772114949763158e-05, + "loss": 0.9261, + "step": 120470 + }, + { + "epoch": 0.7697123800518764, + "grad_norm": 0.8536334037780762, + "learning_rate": 6.771645744538284e-05, + 
"loss": 1.0571, + "step": 120480 + }, + { + "epoch": 0.7697762672016151, + "grad_norm": 0.7580819129943848, + "learning_rate": 6.771176521471405e-05, + "loss": 0.8517, + "step": 120490 + }, + { + "epoch": 0.7698401543513538, + "grad_norm": 0.9832444190979004, + "learning_rate": 6.770707280567247e-05, + "loss": 0.9181, + "step": 120500 + }, + { + "epoch": 0.7699040415010925, + "grad_norm": 0.7702086567878723, + "learning_rate": 6.770238021830532e-05, + "loss": 0.9504, + "step": 120510 + }, + { + "epoch": 0.7699679286508312, + "grad_norm": 1.5449334383010864, + "learning_rate": 6.769768745265991e-05, + "loss": 0.7662, + "step": 120520 + }, + { + "epoch": 0.7700318158005699, + "grad_norm": 1.5101966857910156, + "learning_rate": 6.769299450878349e-05, + "loss": 0.8513, + "step": 120530 + }, + { + "epoch": 0.7700957029503086, + "grad_norm": 0.6847185492515564, + "learning_rate": 6.768830138672327e-05, + "loss": 0.8803, + "step": 120540 + }, + { + "epoch": 0.7701595901000473, + "grad_norm": 0.6780611276626587, + "learning_rate": 6.768360808652659e-05, + "loss": 0.7456, + "step": 120550 + }, + { + "epoch": 0.770223477249786, + "grad_norm": 1.0173395872116089, + "learning_rate": 6.767891460824066e-05, + "loss": 0.7139, + "step": 120560 + }, + { + "epoch": 0.7702873643995247, + "grad_norm": 0.6809027791023254, + "learning_rate": 6.767422095191277e-05, + "loss": 1.1505, + "step": 120570 + }, + { + "epoch": 0.7703512515492634, + "grad_norm": 0.9474708437919617, + "learning_rate": 6.766952711759018e-05, + "loss": 0.8336, + "step": 120580 + }, + { + "epoch": 0.7704151386990021, + "grad_norm": 0.5041736960411072, + "learning_rate": 6.766483310532017e-05, + "loss": 0.7056, + "step": 120590 + }, + { + "epoch": 0.7704790258487408, + "grad_norm": 1.975868582725525, + "learning_rate": 6.766013891515e-05, + "loss": 0.9774, + "step": 120600 + }, + { + "epoch": 0.7705429129984794, + "grad_norm": 0.8999959826469421, + "learning_rate": 6.765544454712696e-05, + "loss": 0.8933, + "step": 120610 + }, + { + "epoch": 0.7706068001482181, + "grad_norm": 0.839335560798645, + "learning_rate": 6.765075000129831e-05, + "loss": 0.8328, + "step": 120620 + }, + { + "epoch": 0.7706706872979568, + "grad_norm": 0.8537786602973938, + "learning_rate": 6.764605527771133e-05, + "loss": 0.8143, + "step": 120630 + }, + { + "epoch": 0.7707345744476956, + "grad_norm": 0.7219642996788025, + "learning_rate": 6.764136037641333e-05, + "loss": 0.7989, + "step": 120640 + }, + { + "epoch": 0.7707984615974343, + "grad_norm": 0.6712138056755066, + "learning_rate": 6.763666529745156e-05, + "loss": 1.1548, + "step": 120650 + }, + { + "epoch": 0.770862348747173, + "grad_norm": 0.8392811417579651, + "learning_rate": 6.763197004087331e-05, + "loss": 0.7134, + "step": 120660 + }, + { + "epoch": 0.7709262358969117, + "grad_norm": 0.8870442509651184, + "learning_rate": 6.762727460672586e-05, + "loss": 0.7751, + "step": 120670 + }, + { + "epoch": 0.7709901230466504, + "grad_norm": 1.1646859645843506, + "learning_rate": 6.762257899505653e-05, + "loss": 1.0547, + "step": 120680 + }, + { + "epoch": 0.7710540101963891, + "grad_norm": 1.5529083013534546, + "learning_rate": 6.761788320591257e-05, + "loss": 0.8419, + "step": 120690 + }, + { + "epoch": 0.7711178973461278, + "grad_norm": 2.7997336387634277, + "learning_rate": 6.761318723934128e-05, + "loss": 0.9536, + "step": 120700 + }, + { + "epoch": 0.7711817844958665, + "grad_norm": 1.0143194198608398, + "learning_rate": 6.760849109538996e-05, + "loss": 0.9442, + "step": 120710 + }, + { + "epoch": 
0.7712456716456052, + "grad_norm": 1.0322544574737549, + "learning_rate": 6.76037947741059e-05, + "loss": 0.951, + "step": 120720 + }, + { + "epoch": 0.7713095587953439, + "grad_norm": 0.9332642555236816, + "learning_rate": 6.759956793336986e-05, + "loss": 0.7949, + "step": 120730 + }, + { + "epoch": 0.7713734459450826, + "grad_norm": 0.8280695080757141, + "learning_rate": 6.759487127528388e-05, + "loss": 0.7483, + "step": 120740 + }, + { + "epoch": 0.7714373330948213, + "grad_norm": 0.6056891679763794, + "learning_rate": 6.759017444000235e-05, + "loss": 0.8854, + "step": 120750 + }, + { + "epoch": 0.77150122024456, + "grad_norm": 1.0320556163787842, + "learning_rate": 6.758547742757254e-05, + "loss": 0.9241, + "step": 120760 + }, + { + "epoch": 0.7715651073942987, + "grad_norm": 1.820793867111206, + "learning_rate": 6.758078023804176e-05, + "loss": 1.0678, + "step": 120770 + }, + { + "epoch": 0.7716289945440374, + "grad_norm": 2.8395168781280518, + "learning_rate": 6.757608287145731e-05, + "loss": 0.7885, + "step": 120780 + }, + { + "epoch": 0.7716928816937761, + "grad_norm": 0.7798264026641846, + "learning_rate": 6.75713853278665e-05, + "loss": 0.9502, + "step": 120790 + }, + { + "epoch": 0.7717567688435149, + "grad_norm": 0.8113306164741516, + "learning_rate": 6.756668760731665e-05, + "loss": 0.7035, + "step": 120800 + }, + { + "epoch": 0.7718206559932536, + "grad_norm": 0.7610470056533813, + "learning_rate": 6.756198970985506e-05, + "loss": 0.9429, + "step": 120810 + }, + { + "epoch": 0.7718845431429923, + "grad_norm": 0.8831712603569031, + "learning_rate": 6.755729163552902e-05, + "loss": 0.9622, + "step": 120820 + }, + { + "epoch": 0.771948430292731, + "grad_norm": 0.9428032040596008, + "learning_rate": 6.755259338438588e-05, + "loss": 0.9375, + "step": 120830 + }, + { + "epoch": 0.7720123174424697, + "grad_norm": 1.5266450643539429, + "learning_rate": 6.754789495647293e-05, + "loss": 0.9392, + "step": 120840 + }, + { + "epoch": 0.7720762045922083, + "grad_norm": 1.08087956905365, + "learning_rate": 6.75431963518375e-05, + "loss": 1.0333, + "step": 120850 + }, + { + "epoch": 0.772140091741947, + "grad_norm": 1.0593822002410889, + "learning_rate": 6.75384975705269e-05, + "loss": 0.9502, + "step": 120860 + }, + { + "epoch": 0.7722039788916857, + "grad_norm": 0.902668297290802, + "learning_rate": 6.753379861258846e-05, + "loss": 0.5924, + "step": 120870 + }, + { + "epoch": 0.7722678660414244, + "grad_norm": 1.1227551698684692, + "learning_rate": 6.752909947806951e-05, + "loss": 0.7154, + "step": 120880 + }, + { + "epoch": 0.7723317531911631, + "grad_norm": 0.7121851444244385, + "learning_rate": 6.752440016701736e-05, + "loss": 0.8883, + "step": 120890 + }, + { + "epoch": 0.7723956403409018, + "grad_norm": 0.878093421459198, + "learning_rate": 6.751970067947932e-05, + "loss": 1.0066, + "step": 120900 + }, + { + "epoch": 0.7724595274906405, + "grad_norm": 2.005844831466675, + "learning_rate": 6.751500101550275e-05, + "loss": 0.8736, + "step": 120910 + }, + { + "epoch": 0.7725234146403792, + "grad_norm": 1.369321346282959, + "learning_rate": 6.751030117513497e-05, + "loss": 1.1788, + "step": 120920 + }, + { + "epoch": 0.7725873017901179, + "grad_norm": 0.6035107374191284, + "learning_rate": 6.750560115842332e-05, + "loss": 1.1607, + "step": 120930 + }, + { + "epoch": 0.7726511889398566, + "grad_norm": 1.0282695293426514, + "learning_rate": 6.750090096541511e-05, + "loss": 0.7348, + "step": 120940 + }, + { + "epoch": 0.7727150760895953, + "grad_norm": 0.5575137734413147, + "learning_rate": 
6.749620059615768e-05, + "loss": 0.8886, + "step": 120950 + }, + { + "epoch": 0.772778963239334, + "grad_norm": 0.8261436223983765, + "learning_rate": 6.749150005069838e-05, + "loss": 0.928, + "step": 120960 + }, + { + "epoch": 0.7728428503890727, + "grad_norm": 0.8338256478309631, + "learning_rate": 6.748679932908454e-05, + "loss": 0.646, + "step": 120970 + }, + { + "epoch": 0.7729067375388115, + "grad_norm": 0.7634387612342834, + "learning_rate": 6.74820984313635e-05, + "loss": 0.9501, + "step": 120980 + }, + { + "epoch": 0.7729706246885502, + "grad_norm": 0.8158954977989197, + "learning_rate": 6.747739735758262e-05, + "loss": 0.937, + "step": 120990 + }, + { + "epoch": 0.7730345118382889, + "grad_norm": 0.8353099226951599, + "learning_rate": 6.747269610778922e-05, + "loss": 0.9787, + "step": 121000 + }, + { + "epoch": 0.7730983989880276, + "grad_norm": 1.1733039617538452, + "learning_rate": 6.746799468203064e-05, + "loss": 0.924, + "step": 121010 + }, + { + "epoch": 0.7731622861377663, + "grad_norm": 0.6754859685897827, + "learning_rate": 6.746329308035426e-05, + "loss": 1.1289, + "step": 121020 + }, + { + "epoch": 0.773226173287505, + "grad_norm": 0.7313271164894104, + "learning_rate": 6.745859130280741e-05, + "loss": 0.8438, + "step": 121030 + }, + { + "epoch": 0.7732900604372437, + "grad_norm": 1.6041016578674316, + "learning_rate": 6.745388934943743e-05, + "loss": 0.7458, + "step": 121040 + }, + { + "epoch": 0.7733539475869824, + "grad_norm": 0.7553384900093079, + "learning_rate": 6.744918722029169e-05, + "loss": 0.8966, + "step": 121050 + }, + { + "epoch": 0.7734178347367211, + "grad_norm": 0.4830940365791321, + "learning_rate": 6.744448491541754e-05, + "loss": 0.6584, + "step": 121060 + }, + { + "epoch": 0.7734817218864598, + "grad_norm": 0.8653696179389954, + "learning_rate": 6.743978243486233e-05, + "loss": 1.2337, + "step": 121070 + }, + { + "epoch": 0.7735456090361985, + "grad_norm": 0.8902184963226318, + "learning_rate": 6.743507977867342e-05, + "loss": 0.8364, + "step": 121080 + }, + { + "epoch": 0.7736094961859371, + "grad_norm": 0.9658520817756653, + "learning_rate": 6.74303769468982e-05, + "loss": 0.9397, + "step": 121090 + }, + { + "epoch": 0.7736733833356758, + "grad_norm": 0.7507711052894592, + "learning_rate": 6.742567393958398e-05, + "loss": 0.723, + "step": 121100 + }, + { + "epoch": 0.7737372704854145, + "grad_norm": 0.6307206153869629, + "learning_rate": 6.742097075677815e-05, + "loss": 0.7924, + "step": 121110 + }, + { + "epoch": 0.7738011576351532, + "grad_norm": 1.159859538078308, + "learning_rate": 6.741626739852806e-05, + "loss": 0.8277, + "step": 121120 + }, + { + "epoch": 0.7738650447848919, + "grad_norm": 0.7750802636146545, + "learning_rate": 6.741156386488112e-05, + "loss": 1.0919, + "step": 121130 + }, + { + "epoch": 0.7739289319346306, + "grad_norm": 0.9529350399971008, + "learning_rate": 6.740686015588465e-05, + "loss": 1.0912, + "step": 121140 + }, + { + "epoch": 0.7739928190843693, + "grad_norm": 0.8599395751953125, + "learning_rate": 6.740215627158605e-05, + "loss": 1.0332, + "step": 121150 + }, + { + "epoch": 0.774056706234108, + "grad_norm": 0.9793898463249207, + "learning_rate": 6.739745221203268e-05, + "loss": 0.7607, + "step": 121160 + }, + { + "epoch": 0.7741205933838468, + "grad_norm": 0.8916863799095154, + "learning_rate": 6.739274797727191e-05, + "loss": 1.0146, + "step": 121170 + }, + { + "epoch": 0.7741844805335855, + "grad_norm": 1.1108578443527222, + "learning_rate": 6.738804356735113e-05, + "loss": 0.9828, + "step": 121180 + }, + { + 
"epoch": 0.7742483676833242, + "grad_norm": 1.299629807472229, + "learning_rate": 6.73833389823177e-05, + "loss": 1.1273, + "step": 121190 + }, + { + "epoch": 0.7743122548330629, + "grad_norm": 0.9776250123977661, + "learning_rate": 6.737863422221902e-05, + "loss": 0.7339, + "step": 121200 + }, + { + "epoch": 0.7743761419828016, + "grad_norm": 0.8308240175247192, + "learning_rate": 6.737392928710245e-05, + "loss": 0.9728, + "step": 121210 + }, + { + "epoch": 0.7744400291325403, + "grad_norm": 1.0700846910476685, + "learning_rate": 6.736922417701537e-05, + "loss": 0.8842, + "step": 121220 + }, + { + "epoch": 0.774503916282279, + "grad_norm": 0.8977962732315063, + "learning_rate": 6.736451889200518e-05, + "loss": 0.8738, + "step": 121230 + }, + { + "epoch": 0.7745678034320177, + "grad_norm": 0.8981806039810181, + "learning_rate": 6.735981343211927e-05, + "loss": 0.82, + "step": 121240 + }, + { + "epoch": 0.7746316905817564, + "grad_norm": 1.0636060237884521, + "learning_rate": 6.735510779740502e-05, + "loss": 0.8949, + "step": 121250 + }, + { + "epoch": 0.7746955777314951, + "grad_norm": 0.8184270858764648, + "learning_rate": 6.735040198790982e-05, + "loss": 0.8559, + "step": 121260 + }, + { + "epoch": 0.7747594648812338, + "grad_norm": 1.0047454833984375, + "learning_rate": 6.734569600368105e-05, + "loss": 1.2097, + "step": 121270 + }, + { + "epoch": 0.7748233520309725, + "grad_norm": 1.6020781993865967, + "learning_rate": 6.734098984476612e-05, + "loss": 0.757, + "step": 121280 + }, + { + "epoch": 0.7748872391807112, + "grad_norm": 0.7962193489074707, + "learning_rate": 6.733628351121243e-05, + "loss": 0.8267, + "step": 121290 + }, + { + "epoch": 0.7749511263304499, + "grad_norm": 1.1019634008407593, + "learning_rate": 6.733157700306737e-05, + "loss": 0.869, + "step": 121300 + }, + { + "epoch": 0.7750150134801886, + "grad_norm": 0.6633391976356506, + "learning_rate": 6.732687032037832e-05, + "loss": 0.843, + "step": 121310 + }, + { + "epoch": 0.7750789006299273, + "grad_norm": 1.0275635719299316, + "learning_rate": 6.73221634631927e-05, + "loss": 0.9104, + "step": 121320 + }, + { + "epoch": 0.7751427877796659, + "grad_norm": 0.7791745662689209, + "learning_rate": 6.73174564315579e-05, + "loss": 0.9885, + "step": 121330 + }, + { + "epoch": 0.7752066749294046, + "grad_norm": 1.2673611640930176, + "learning_rate": 6.731274922552135e-05, + "loss": 0.6765, + "step": 121340 + }, + { + "epoch": 0.7752705620791434, + "grad_norm": 0.8525099754333496, + "learning_rate": 6.730804184513044e-05, + "loss": 0.8447, + "step": 121350 + }, + { + "epoch": 0.7753344492288821, + "grad_norm": 0.8787998557090759, + "learning_rate": 6.730333429043256e-05, + "loss": 0.8673, + "step": 121360 + }, + { + "epoch": 0.7753983363786208, + "grad_norm": 0.7278786897659302, + "learning_rate": 6.729862656147514e-05, + "loss": 0.7846, + "step": 121370 + }, + { + "epoch": 0.7754622235283595, + "grad_norm": 1.0714443922042847, + "learning_rate": 6.729391865830559e-05, + "loss": 0.809, + "step": 121380 + }, + { + "epoch": 0.7755261106780982, + "grad_norm": 0.820010244846344, + "learning_rate": 6.72892105809713e-05, + "loss": 0.8847, + "step": 121390 + }, + { + "epoch": 0.7755899978278369, + "grad_norm": 1.3069791793823242, + "learning_rate": 6.728450232951972e-05, + "loss": 0.8478, + "step": 121400 + }, + { + "epoch": 0.7756538849775756, + "grad_norm": 0.992739737033844, + "learning_rate": 6.727979390399825e-05, + "loss": 1.0541, + "step": 121410 + }, + { + "epoch": 0.7757177721273143, + "grad_norm": 0.8804332613945007, + 
"learning_rate": 6.72750853044543e-05, + "loss": 0.9293, + "step": 121420 + }, + { + "epoch": 0.775781659277053, + "grad_norm": 0.9958817958831787, + "learning_rate": 6.72703765309353e-05, + "loss": 0.959, + "step": 121430 + }, + { + "epoch": 0.7758455464267917, + "grad_norm": 0.8248307704925537, + "learning_rate": 6.726566758348867e-05, + "loss": 0.8786, + "step": 121440 + }, + { + "epoch": 0.7759094335765304, + "grad_norm": 0.9788550138473511, + "learning_rate": 6.726095846216181e-05, + "loss": 0.6713, + "step": 121450 + }, + { + "epoch": 0.7759733207262691, + "grad_norm": 1.0201257467269897, + "learning_rate": 6.725624916700218e-05, + "loss": 1.1351, + "step": 121460 + }, + { + "epoch": 0.7760372078760078, + "grad_norm": 0.550969123840332, + "learning_rate": 6.72515396980572e-05, + "loss": 0.9425, + "step": 121470 + }, + { + "epoch": 0.7761010950257465, + "grad_norm": 0.7929662466049194, + "learning_rate": 6.724683005537427e-05, + "loss": 1.0115, + "step": 121480 + }, + { + "epoch": 0.7761649821754852, + "grad_norm": 1.1260933876037598, + "learning_rate": 6.724212023900086e-05, + "loss": 0.8828, + "step": 121490 + }, + { + "epoch": 0.776228869325224, + "grad_norm": 1.8724021911621094, + "learning_rate": 6.723741024898438e-05, + "loss": 1.1478, + "step": 121500 + }, + { + "epoch": 0.7762927564749627, + "grad_norm": 0.7166033387184143, + "learning_rate": 6.723270008537225e-05, + "loss": 0.9861, + "step": 121510 + }, + { + "epoch": 0.7763566436247014, + "grad_norm": 0.6689345240592957, + "learning_rate": 6.722798974821193e-05, + "loss": 1.0694, + "step": 121520 + }, + { + "epoch": 0.7764205307744401, + "grad_norm": 0.7042884230613708, + "learning_rate": 6.722327923755086e-05, + "loss": 1.0029, + "step": 121530 + }, + { + "epoch": 0.7764844179241788, + "grad_norm": 0.6996636390686035, + "learning_rate": 6.721856855343647e-05, + "loss": 0.7168, + "step": 121540 + }, + { + "epoch": 0.7765483050739175, + "grad_norm": 0.7664635181427002, + "learning_rate": 6.721385769591618e-05, + "loss": 1.13, + "step": 121550 + }, + { + "epoch": 0.7766121922236562, + "grad_norm": 1.2732270956039429, + "learning_rate": 6.720914666503746e-05, + "loss": 0.7654, + "step": 121560 + }, + { + "epoch": 0.7766760793733949, + "grad_norm": 1.799688458442688, + "learning_rate": 6.720443546084775e-05, + "loss": 0.9057, + "step": 121570 + }, + { + "epoch": 0.7767399665231335, + "grad_norm": 0.6729174852371216, + "learning_rate": 6.719972408339447e-05, + "loss": 0.8531, + "step": 121580 + }, + { + "epoch": 0.7768038536728722, + "grad_norm": 0.8907155394554138, + "learning_rate": 6.719501253272513e-05, + "loss": 0.6519, + "step": 121590 + }, + { + "epoch": 0.7768677408226109, + "grad_norm": 0.7867339253425598, + "learning_rate": 6.71903008088871e-05, + "loss": 0.7657, + "step": 121600 + }, + { + "epoch": 0.7769316279723496, + "grad_norm": 0.8111919164657593, + "learning_rate": 6.718558891192788e-05, + "loss": 0.89, + "step": 121610 + }, + { + "epoch": 0.7769955151220883, + "grad_norm": 1.4637147188186646, + "learning_rate": 6.718087684189491e-05, + "loss": 0.7084, + "step": 121620 + }, + { + "epoch": 0.777059402271827, + "grad_norm": 0.9606701731681824, + "learning_rate": 6.717616459883564e-05, + "loss": 0.8442, + "step": 121630 + }, + { + "epoch": 0.7771232894215657, + "grad_norm": 0.5550901293754578, + "learning_rate": 6.717145218279755e-05, + "loss": 0.785, + "step": 121640 + }, + { + "epoch": 0.7771871765713044, + "grad_norm": 0.8698827028274536, + "learning_rate": 6.716673959382806e-05, + "loss": 0.9919, + "step": 
121650 + }, + { + "epoch": 0.7772510637210431, + "grad_norm": 1.0764882564544678, + "learning_rate": 6.716202683197468e-05, + "loss": 0.8303, + "step": 121660 + }, + { + "epoch": 0.7773149508707818, + "grad_norm": 0.6857743263244629, + "learning_rate": 6.715731389728484e-05, + "loss": 0.8815, + "step": 121670 + }, + { + "epoch": 0.7773788380205205, + "grad_norm": 1.6733158826828003, + "learning_rate": 6.715260078980599e-05, + "loss": 0.7927, + "step": 121680 + }, + { + "epoch": 0.7774427251702593, + "grad_norm": 0.7211989164352417, + "learning_rate": 6.714788750958561e-05, + "loss": 0.9503, + "step": 121690 + }, + { + "epoch": 0.777506612319998, + "grad_norm": 1.002265453338623, + "learning_rate": 6.714317405667118e-05, + "loss": 1.0882, + "step": 121700 + }, + { + "epoch": 0.7775704994697367, + "grad_norm": 0.9164408445358276, + "learning_rate": 6.713846043111014e-05, + "loss": 0.886, + "step": 121710 + }, + { + "epoch": 0.7776343866194754, + "grad_norm": 0.5526295304298401, + "learning_rate": 6.713374663294999e-05, + "loss": 0.7163, + "step": 121720 + }, + { + "epoch": 0.7776982737692141, + "grad_norm": 1.0541777610778809, + "learning_rate": 6.712903266223818e-05, + "loss": 1.219, + "step": 121730 + }, + { + "epoch": 0.7777621609189528, + "grad_norm": 1.3423256874084473, + "learning_rate": 6.71243185190222e-05, + "loss": 1.2439, + "step": 121740 + }, + { + "epoch": 0.7778260480686915, + "grad_norm": 0.900256335735321, + "learning_rate": 6.711960420334951e-05, + "loss": 0.9215, + "step": 121750 + }, + { + "epoch": 0.7778899352184302, + "grad_norm": 0.7287362813949585, + "learning_rate": 6.71148897152676e-05, + "loss": 0.7935, + "step": 121760 + }, + { + "epoch": 0.7779538223681689, + "grad_norm": 0.6165835857391357, + "learning_rate": 6.711017505482395e-05, + "loss": 0.8651, + "step": 121770 + }, + { + "epoch": 0.7780177095179076, + "grad_norm": 1.222276210784912, + "learning_rate": 6.710546022206603e-05, + "loss": 0.7607, + "step": 121780 + }, + { + "epoch": 0.7780815966676463, + "grad_norm": 0.9571607112884521, + "learning_rate": 6.71007452170413e-05, + "loss": 0.9691, + "step": 121790 + }, + { + "epoch": 0.778145483817385, + "grad_norm": 0.7661402821540833, + "learning_rate": 6.709603003979729e-05, + "loss": 0.8724, + "step": 121800 + }, + { + "epoch": 0.7782093709671237, + "grad_norm": 0.9721023440361023, + "learning_rate": 6.709131469038149e-05, + "loss": 0.6902, + "step": 121810 + }, + { + "epoch": 0.7782732581168623, + "grad_norm": 0.9388052821159363, + "learning_rate": 6.708659916884135e-05, + "loss": 0.8722, + "step": 121820 + }, + { + "epoch": 0.778337145266601, + "grad_norm": 1.3385436534881592, + "learning_rate": 6.708188347522438e-05, + "loss": 0.9258, + "step": 121830 + }, + { + "epoch": 0.7784010324163397, + "grad_norm": 1.1021685600280762, + "learning_rate": 6.707716760957805e-05, + "loss": 1.1272, + "step": 121840 + }, + { + "epoch": 0.7784649195660784, + "grad_norm": 0.7849000096321106, + "learning_rate": 6.707245157194987e-05, + "loss": 0.8731, + "step": 121850 + }, + { + "epoch": 0.7785288067158171, + "grad_norm": 0.7973129153251648, + "learning_rate": 6.706773536238734e-05, + "loss": 1.0259, + "step": 121860 + }, + { + "epoch": 0.7785926938655559, + "grad_norm": 0.8367007970809937, + "learning_rate": 6.706301898093795e-05, + "loss": 0.8157, + "step": 121870 + }, + { + "epoch": 0.7786565810152946, + "grad_norm": 0.8635137677192688, + "learning_rate": 6.70583024276492e-05, + "loss": 0.907, + "step": 121880 + }, + { + "epoch": 0.7787204681650333, + "grad_norm": 
0.8511916399002075, + "learning_rate": 6.705358570256858e-05, + "loss": 0.9413, + "step": 121890 + }, + { + "epoch": 0.778784355314772, + "grad_norm": 0.8517649173736572, + "learning_rate": 6.70488688057436e-05, + "loss": 0.9937, + "step": 121900 + }, + { + "epoch": 0.7788482424645107, + "grad_norm": 0.8689191341400146, + "learning_rate": 6.704415173722176e-05, + "loss": 0.9242, + "step": 121910 + }, + { + "epoch": 0.7789121296142494, + "grad_norm": 0.7940566539764404, + "learning_rate": 6.70394344970506e-05, + "loss": 0.7244, + "step": 121920 + }, + { + "epoch": 0.7789760167639881, + "grad_norm": 1.276955008506775, + "learning_rate": 6.703471708527756e-05, + "loss": 1.1358, + "step": 121930 + }, + { + "epoch": 0.7790399039137268, + "grad_norm": 1.2477837800979614, + "learning_rate": 6.702999950195017e-05, + "loss": 1.0286, + "step": 121940 + }, + { + "epoch": 0.7791037910634655, + "grad_norm": 0.9349541664123535, + "learning_rate": 6.702528174711597e-05, + "loss": 0.9723, + "step": 121950 + }, + { + "epoch": 0.7791676782132042, + "grad_norm": 0.8674134612083435, + "learning_rate": 6.702056382082245e-05, + "loss": 1.0462, + "step": 121960 + }, + { + "epoch": 0.7792315653629429, + "grad_norm": 1.3984166383743286, + "learning_rate": 6.701584572311712e-05, + "loss": 0.9276, + "step": 121970 + }, + { + "epoch": 0.7792954525126816, + "grad_norm": 1.5692760944366455, + "learning_rate": 6.701112745404752e-05, + "loss": 0.7578, + "step": 121980 + }, + { + "epoch": 0.7793593396624203, + "grad_norm": 1.1319806575775146, + "learning_rate": 6.700640901366113e-05, + "loss": 0.927, + "step": 121990 + }, + { + "epoch": 0.779423226812159, + "grad_norm": 1.1491388082504272, + "learning_rate": 6.700169040200551e-05, + "loss": 0.8779, + "step": 122000 + }, + { + "epoch": 0.7794871139618977, + "grad_norm": 0.9306029081344604, + "learning_rate": 6.699697161912815e-05, + "loss": 0.7728, + "step": 122010 + }, + { + "epoch": 0.7795510011116364, + "grad_norm": 2.3695061206817627, + "learning_rate": 6.699225266507658e-05, + "loss": 1.0062, + "step": 122020 + }, + { + "epoch": 0.7796148882613751, + "grad_norm": 0.8340387344360352, + "learning_rate": 6.698753353989831e-05, + "loss": 0.7666, + "step": 122030 + }, + { + "epoch": 0.7796787754111139, + "grad_norm": 1.7601200342178345, + "learning_rate": 6.69828142436409e-05, + "loss": 0.9149, + "step": 122040 + }, + { + "epoch": 0.7797426625608526, + "grad_norm": 0.7318232655525208, + "learning_rate": 6.697809477635187e-05, + "loss": 0.8447, + "step": 122050 + }, + { + "epoch": 0.7798065497105912, + "grad_norm": 1.044084906578064, + "learning_rate": 6.697384710959896e-05, + "loss": 1.3071, + "step": 122060 + }, + { + "epoch": 0.7798704368603299, + "grad_norm": 0.5054813623428345, + "learning_rate": 6.696912731748075e-05, + "loss": 0.8569, + "step": 122070 + }, + { + "epoch": 0.7799343240100686, + "grad_norm": 0.8287333846092224, + "learning_rate": 6.696440735446876e-05, + "loss": 1.0033, + "step": 122080 + }, + { + "epoch": 0.7799982111598073, + "grad_norm": 0.8262792825698853, + "learning_rate": 6.695968722061052e-05, + "loss": 0.8806, + "step": 122090 + }, + { + "epoch": 0.780062098309546, + "grad_norm": 0.9939790964126587, + "learning_rate": 6.695496691595354e-05, + "loss": 1.0262, + "step": 122100 + }, + { + "epoch": 0.7801259854592847, + "grad_norm": 0.7839484810829163, + "learning_rate": 6.695024644054537e-05, + "loss": 1.0321, + "step": 122110 + }, + { + "epoch": 0.7801898726090234, + "grad_norm": 0.7679548859596252, + "learning_rate": 6.694552579443358e-05, + 
"loss": 0.8247, + "step": 122120 + }, + { + "epoch": 0.7802537597587621, + "grad_norm": 1.0078667402267456, + "learning_rate": 6.694080497766567e-05, + "loss": 0.8459, + "step": 122130 + }, + { + "epoch": 0.7803176469085008, + "grad_norm": 0.964644730091095, + "learning_rate": 6.69360839902892e-05, + "loss": 0.9135, + "step": 122140 + }, + { + "epoch": 0.7803815340582395, + "grad_norm": 1.4966347217559814, + "learning_rate": 6.69313628323517e-05, + "loss": 0.756, + "step": 122150 + }, + { + "epoch": 0.7804454212079782, + "grad_norm": 0.8424573540687561, + "learning_rate": 6.692664150390073e-05, + "loss": 0.8184, + "step": 122160 + }, + { + "epoch": 0.7805093083577169, + "grad_norm": 0.6761122345924377, + "learning_rate": 6.692192000498385e-05, + "loss": 1.1223, + "step": 122170 + }, + { + "epoch": 0.7805731955074556, + "grad_norm": 1.4543062448501587, + "learning_rate": 6.69171983356486e-05, + "loss": 0.7813, + "step": 122180 + }, + { + "epoch": 0.7806370826571943, + "grad_norm": 0.830903172492981, + "learning_rate": 6.691247649594251e-05, + "loss": 0.9245, + "step": 122190 + }, + { + "epoch": 0.780700969806933, + "grad_norm": 0.994420051574707, + "learning_rate": 6.690775448591316e-05, + "loss": 0.8715, + "step": 122200 + }, + { + "epoch": 0.7807648569566717, + "grad_norm": 0.7622717618942261, + "learning_rate": 6.69030323056081e-05, + "loss": 0.9868, + "step": 122210 + }, + { + "epoch": 0.7808287441064105, + "grad_norm": 0.8121097683906555, + "learning_rate": 6.689830995507487e-05, + "loss": 1.1996, + "step": 122220 + }, + { + "epoch": 0.7808926312561492, + "grad_norm": 0.6650993227958679, + "learning_rate": 6.689358743436105e-05, + "loss": 0.6315, + "step": 122230 + }, + { + "epoch": 0.7809565184058879, + "grad_norm": 1.005804419517517, + "learning_rate": 6.68888647435142e-05, + "loss": 0.9496, + "step": 122240 + }, + { + "epoch": 0.7810204055556266, + "grad_norm": 0.9154719114303589, + "learning_rate": 6.688414188258185e-05, + "loss": 0.7727, + "step": 122250 + }, + { + "epoch": 0.7810842927053653, + "grad_norm": 0.7355092167854309, + "learning_rate": 6.687941885161158e-05, + "loss": 0.84, + "step": 122260 + }, + { + "epoch": 0.781148179855104, + "grad_norm": 0.5671817064285278, + "learning_rate": 6.687469565065096e-05, + "loss": 0.9975, + "step": 122270 + }, + { + "epoch": 0.7812120670048427, + "grad_norm": 1.7897629737854004, + "learning_rate": 6.686997227974756e-05, + "loss": 0.9906, + "step": 122280 + }, + { + "epoch": 0.7812759541545814, + "grad_norm": 0.9264022707939148, + "learning_rate": 6.686524873894894e-05, + "loss": 0.9234, + "step": 122290 + }, + { + "epoch": 0.7813398413043201, + "grad_norm": 1.2895312309265137, + "learning_rate": 6.68605250283027e-05, + "loss": 0.8688, + "step": 122300 + }, + { + "epoch": 0.7814037284540587, + "grad_norm": 1.5715874433517456, + "learning_rate": 6.685580114785638e-05, + "loss": 0.951, + "step": 122310 + }, + { + "epoch": 0.7814676156037974, + "grad_norm": 0.6482036709785461, + "learning_rate": 6.685107709765755e-05, + "loss": 0.9561, + "step": 122320 + }, + { + "epoch": 0.7815315027535361, + "grad_norm": 0.810217559337616, + "learning_rate": 6.684635287775381e-05, + "loss": 1.0825, + "step": 122330 + }, + { + "epoch": 0.7815953899032748, + "grad_norm": 0.8282271027565002, + "learning_rate": 6.68416284881927e-05, + "loss": 0.8896, + "step": 122340 + }, + { + "epoch": 0.7816592770530135, + "grad_norm": 0.6604433059692383, + "learning_rate": 6.683690392902184e-05, + "loss": 0.8456, + "step": 122350 + }, + { + "epoch": 0.7817231642027522, + 
"grad_norm": 0.8533942699432373, + "learning_rate": 6.683217920028876e-05, + "loss": 0.8331, + "step": 122360 + }, + { + "epoch": 0.7817870513524909, + "grad_norm": 1.0768920183181763, + "learning_rate": 6.68274543020411e-05, + "loss": 0.9044, + "step": 122370 + }, + { + "epoch": 0.7818509385022296, + "grad_norm": 1.2052894830703735, + "learning_rate": 6.682272923432643e-05, + "loss": 0.9181, + "step": 122380 + }, + { + "epoch": 0.7819148256519683, + "grad_norm": 1.163033127784729, + "learning_rate": 6.681800399719229e-05, + "loss": 0.9498, + "step": 122390 + }, + { + "epoch": 0.781978712801707, + "grad_norm": 1.0256508588790894, + "learning_rate": 6.681327859068633e-05, + "loss": 0.9087, + "step": 122400 + }, + { + "epoch": 0.7820425999514458, + "grad_norm": 1.141444444656372, + "learning_rate": 6.680855301485609e-05, + "loss": 0.8819, + "step": 122410 + }, + { + "epoch": 0.7821064871011845, + "grad_norm": 1.1306743621826172, + "learning_rate": 6.680382726974918e-05, + "loss": 0.9268, + "step": 122420 + }, + { + "epoch": 0.7821703742509232, + "grad_norm": 0.9543069005012512, + "learning_rate": 6.67991013554132e-05, + "loss": 0.8443, + "step": 122430 + }, + { + "epoch": 0.7822342614006619, + "grad_norm": 0.49687138199806213, + "learning_rate": 6.679437527189571e-05, + "loss": 0.9003, + "step": 122440 + }, + { + "epoch": 0.7822981485504006, + "grad_norm": 1.3681707382202148, + "learning_rate": 6.678964901924435e-05, + "loss": 1.0405, + "step": 122450 + }, + { + "epoch": 0.7823620357001393, + "grad_norm": 2.7566139698028564, + "learning_rate": 6.678492259750672e-05, + "loss": 1.1397, + "step": 122460 + }, + { + "epoch": 0.782425922849878, + "grad_norm": 1.4756008386611938, + "learning_rate": 6.678019600673037e-05, + "loss": 0.8729, + "step": 122470 + }, + { + "epoch": 0.7824898099996167, + "grad_norm": 1.502285122871399, + "learning_rate": 6.677546924696295e-05, + "loss": 0.7695, + "step": 122480 + }, + { + "epoch": 0.7825536971493554, + "grad_norm": 0.9394874572753906, + "learning_rate": 6.677074231825203e-05, + "loss": 0.8499, + "step": 122490 + }, + { + "epoch": 0.7826175842990941, + "grad_norm": 0.6078043580055237, + "learning_rate": 6.676601522064522e-05, + "loss": 0.7862, + "step": 122500 + }, + { + "epoch": 0.7826814714488328, + "grad_norm": 0.6710939407348633, + "learning_rate": 6.676128795419015e-05, + "loss": 1.0033, + "step": 122510 + }, + { + "epoch": 0.7827453585985715, + "grad_norm": 1.7409946918487549, + "learning_rate": 6.67565605189344e-05, + "loss": 0.7387, + "step": 122520 + }, + { + "epoch": 0.7828092457483102, + "grad_norm": 1.1592122316360474, + "learning_rate": 6.67518329149256e-05, + "loss": 0.8663, + "step": 122530 + }, + { + "epoch": 0.7828731328980489, + "grad_norm": 0.7959754467010498, + "learning_rate": 6.674710514221133e-05, + "loss": 0.9003, + "step": 122540 + }, + { + "epoch": 0.7829370200477875, + "grad_norm": 1.363761305809021, + "learning_rate": 6.674237720083924e-05, + "loss": 0.7244, + "step": 122550 + }, + { + "epoch": 0.7830009071975262, + "grad_norm": 0.603500247001648, + "learning_rate": 6.673764909085692e-05, + "loss": 0.7391, + "step": 122560 + }, + { + "epoch": 0.783064794347265, + "grad_norm": 1.3545539379119873, + "learning_rate": 6.6732920812312e-05, + "loss": 0.8183, + "step": 122570 + }, + { + "epoch": 0.7831286814970037, + "grad_norm": 0.8553158044815063, + "learning_rate": 6.672819236525208e-05, + "loss": 0.8083, + "step": 122580 + }, + { + "epoch": 0.7831925686467424, + "grad_norm": 1.2060186862945557, + "learning_rate": 
6.67234637497248e-05, + "loss": 1.0407, + "step": 122590 + }, + { + "epoch": 0.7832564557964811, + "grad_norm": 0.7966336607933044, + "learning_rate": 6.671873496577777e-05, + "loss": 1.004, + "step": 122600 + }, + { + "epoch": 0.7833203429462198, + "grad_norm": 1.0027638673782349, + "learning_rate": 6.671400601345861e-05, + "loss": 1.013, + "step": 122610 + }, + { + "epoch": 0.7833842300959585, + "grad_norm": 0.8682675957679749, + "learning_rate": 6.670927689281494e-05, + "loss": 1.0311, + "step": 122620 + }, + { + "epoch": 0.7834481172456972, + "grad_norm": 1.4984140396118164, + "learning_rate": 6.670454760389442e-05, + "loss": 0.6875, + "step": 122630 + }, + { + "epoch": 0.7835120043954359, + "grad_norm": 0.7457048892974854, + "learning_rate": 6.669981814674464e-05, + "loss": 0.6236, + "step": 122640 + }, + { + "epoch": 0.7835758915451746, + "grad_norm": 0.7693182826042175, + "learning_rate": 6.669508852141325e-05, + "loss": 0.7395, + "step": 122650 + }, + { + "epoch": 0.7836397786949133, + "grad_norm": 2.3714489936828613, + "learning_rate": 6.669035872794786e-05, + "loss": 1.1034, + "step": 122660 + }, + { + "epoch": 0.783703665844652, + "grad_norm": 0.9617331027984619, + "learning_rate": 6.668562876639614e-05, + "loss": 0.9657, + "step": 122670 + }, + { + "epoch": 0.7837675529943907, + "grad_norm": 1.0759632587432861, + "learning_rate": 6.66808986368057e-05, + "loss": 0.6798, + "step": 122680 + }, + { + "epoch": 0.7838314401441294, + "grad_norm": 0.8608376383781433, + "learning_rate": 6.667616833922416e-05, + "loss": 0.8573, + "step": 122690 + }, + { + "epoch": 0.7838953272938681, + "grad_norm": 0.802783727645874, + "learning_rate": 6.66714378736992e-05, + "loss": 0.8653, + "step": 122700 + }, + { + "epoch": 0.7839592144436068, + "grad_norm": 1.2049453258514404, + "learning_rate": 6.666670724027844e-05, + "loss": 0.8514, + "step": 122710 + }, + { + "epoch": 0.7840231015933455, + "grad_norm": 1.9265187978744507, + "learning_rate": 6.66619764390095e-05, + "loss": 0.8429, + "step": 122720 + }, + { + "epoch": 0.7840869887430842, + "grad_norm": 1.0026494264602661, + "learning_rate": 6.665724546994005e-05, + "loss": 1.0991, + "step": 122730 + }, + { + "epoch": 0.784150875892823, + "grad_norm": 1.0881091356277466, + "learning_rate": 6.665251433311773e-05, + "loss": 0.8073, + "step": 122740 + }, + { + "epoch": 0.7842147630425617, + "grad_norm": 0.8942010998725891, + "learning_rate": 6.664778302859018e-05, + "loss": 0.9164, + "step": 122750 + }, + { + "epoch": 0.7842786501923004, + "grad_norm": 1.0254307985305786, + "learning_rate": 6.664305155640507e-05, + "loss": 0.8306, + "step": 122760 + }, + { + "epoch": 0.7843425373420391, + "grad_norm": 1.6653152704238892, + "learning_rate": 6.663831991661002e-05, + "loss": 1.1331, + "step": 122770 + }, + { + "epoch": 0.7844064244917778, + "grad_norm": 0.8588860034942627, + "learning_rate": 6.663358810925269e-05, + "loss": 0.9881, + "step": 122780 + }, + { + "epoch": 0.7844703116415164, + "grad_norm": 0.5761985778808594, + "learning_rate": 6.662885613438074e-05, + "loss": 0.6635, + "step": 122790 + }, + { + "epoch": 0.7845341987912551, + "grad_norm": 0.762401282787323, + "learning_rate": 6.662412399204182e-05, + "loss": 0.9458, + "step": 122800 + }, + { + "epoch": 0.7845980859409938, + "grad_norm": 1.0655889511108398, + "learning_rate": 6.661939168228359e-05, + "loss": 0.9319, + "step": 122810 + }, + { + "epoch": 0.7846619730907325, + "grad_norm": 0.7457488775253296, + "learning_rate": 6.66146592051537e-05, + "loss": 0.6604, + "step": 122820 + }, + { + 
"epoch": 0.7847258602404712, + "grad_norm": 0.5781500935554504, + "learning_rate": 6.660992656069984e-05, + "loss": 0.9065, + "step": 122830 + }, + { + "epoch": 0.7847897473902099, + "grad_norm": 1.0758085250854492, + "learning_rate": 6.660519374896964e-05, + "loss": 0.8102, + "step": 122840 + }, + { + "epoch": 0.7848536345399486, + "grad_norm": 3.073899745941162, + "learning_rate": 6.660046077001076e-05, + "loss": 0.961, + "step": 122850 + }, + { + "epoch": 0.7849175216896873, + "grad_norm": 1.818791389465332, + "learning_rate": 6.65957276238709e-05, + "loss": 0.7407, + "step": 122860 + }, + { + "epoch": 0.784981408839426, + "grad_norm": 1.0063798427581787, + "learning_rate": 6.65909943105977e-05, + "loss": 0.8685, + "step": 122870 + }, + { + "epoch": 0.7850452959891647, + "grad_norm": 0.8023037314414978, + "learning_rate": 6.658626083023883e-05, + "loss": 0.6783, + "step": 122880 + }, + { + "epoch": 0.7851091831389034, + "grad_norm": 0.8545927405357361, + "learning_rate": 6.658152718284197e-05, + "loss": 0.8662, + "step": 122890 + }, + { + "epoch": 0.7851730702886421, + "grad_norm": 0.9020628929138184, + "learning_rate": 6.657679336845478e-05, + "loss": 0.8088, + "step": 122900 + }, + { + "epoch": 0.7852369574383808, + "grad_norm": 1.1107025146484375, + "learning_rate": 6.657205938712492e-05, + "loss": 0.9401, + "step": 122910 + }, + { + "epoch": 0.7853008445881196, + "grad_norm": 0.7026088833808899, + "learning_rate": 6.656732523890012e-05, + "loss": 0.7771, + "step": 122920 + }, + { + "epoch": 0.7853647317378583, + "grad_norm": 1.3951656818389893, + "learning_rate": 6.656259092382801e-05, + "loss": 1.0259, + "step": 122930 + }, + { + "epoch": 0.785428618887597, + "grad_norm": 1.104836106300354, + "learning_rate": 6.655785644195627e-05, + "loss": 0.9918, + "step": 122940 + }, + { + "epoch": 0.7854925060373357, + "grad_norm": 0.6789342761039734, + "learning_rate": 6.655312179333259e-05, + "loss": 0.8964, + "step": 122950 + }, + { + "epoch": 0.7855563931870744, + "grad_norm": 0.7267434597015381, + "learning_rate": 6.654838697800467e-05, + "loss": 1.1021, + "step": 122960 + }, + { + "epoch": 0.7856202803368131, + "grad_norm": 0.9028590321540833, + "learning_rate": 6.654365199602016e-05, + "loss": 0.9037, + "step": 122970 + }, + { + "epoch": 0.7856841674865518, + "grad_norm": 1.3182995319366455, + "learning_rate": 6.653891684742677e-05, + "loss": 1.2179, + "step": 122980 + }, + { + "epoch": 0.7857480546362905, + "grad_norm": 0.6001270413398743, + "learning_rate": 6.653418153227218e-05, + "loss": 0.7268, + "step": 122990 + }, + { + "epoch": 0.7858119417860292, + "grad_norm": 0.8544936776161194, + "learning_rate": 6.652944605060409e-05, + "loss": 0.8634, + "step": 123000 + }, + { + "epoch": 0.7858758289357679, + "grad_norm": 0.7163207530975342, + "learning_rate": 6.652471040247016e-05, + "loss": 0.7325, + "step": 123010 + }, + { + "epoch": 0.7859397160855066, + "grad_norm": 1.0763355493545532, + "learning_rate": 6.65199745879181e-05, + "loss": 1.0254, + "step": 123020 + }, + { + "epoch": 0.7860036032352452, + "grad_norm": 0.8155850172042847, + "learning_rate": 6.651523860699562e-05, + "loss": 0.9767, + "step": 123030 + }, + { + "epoch": 0.7860674903849839, + "grad_norm": 0.845534086227417, + "learning_rate": 6.651050245975039e-05, + "loss": 0.9393, + "step": 123040 + }, + { + "epoch": 0.7861313775347226, + "grad_norm": 1.392137885093689, + "learning_rate": 6.650576614623012e-05, + "loss": 0.7875, + "step": 123050 + }, + { + "epoch": 0.7861952646844613, + "grad_norm": 0.9796945452690125, + 
"learning_rate": 6.65010296664825e-05, + "loss": 0.7574, + "step": 123060 + }, + { + "epoch": 0.7862591518342, + "grad_norm": 0.8178929686546326, + "learning_rate": 6.649629302055524e-05, + "loss": 0.746, + "step": 123070 + }, + { + "epoch": 0.7863230389839387, + "grad_norm": 0.6431455612182617, + "learning_rate": 6.649155620849605e-05, + "loss": 0.8506, + "step": 123080 + }, + { + "epoch": 0.7863869261336774, + "grad_norm": 0.8767764568328857, + "learning_rate": 6.648681923035261e-05, + "loss": 0.7546, + "step": 123090 + }, + { + "epoch": 0.7864508132834162, + "grad_norm": 2.9324655532836914, + "learning_rate": 6.648208208617262e-05, + "loss": 0.7744, + "step": 123100 + }, + { + "epoch": 0.7865147004331549, + "grad_norm": 1.0613309144973755, + "learning_rate": 6.647734477600383e-05, + "loss": 0.7883, + "step": 123110 + }, + { + "epoch": 0.7865785875828936, + "grad_norm": 1.001646637916565, + "learning_rate": 6.647260729989391e-05, + "loss": 1.2337, + "step": 123120 + }, + { + "epoch": 0.7866424747326323, + "grad_norm": 1.7810345888137817, + "learning_rate": 6.646786965789057e-05, + "loss": 0.8567, + "step": 123130 + }, + { + "epoch": 0.786706361882371, + "grad_norm": 0.8368769288063049, + "learning_rate": 6.646313185004155e-05, + "loss": 0.935, + "step": 123140 + }, + { + "epoch": 0.7867702490321097, + "grad_norm": 0.6469975709915161, + "learning_rate": 6.645839387639456e-05, + "loss": 0.7664, + "step": 123150 + }, + { + "epoch": 0.7868341361818484, + "grad_norm": 0.7839746475219727, + "learning_rate": 6.645365573699729e-05, + "loss": 1.0381, + "step": 123160 + }, + { + "epoch": 0.7868980233315871, + "grad_norm": 1.2214899063110352, + "learning_rate": 6.644891743189749e-05, + "loss": 0.94, + "step": 123170 + }, + { + "epoch": 0.7869619104813258, + "grad_norm": 1.0104191303253174, + "learning_rate": 6.644417896114285e-05, + "loss": 0.9163, + "step": 123180 + }, + { + "epoch": 0.7870257976310645, + "grad_norm": 0.9507836699485779, + "learning_rate": 6.643944032478109e-05, + "loss": 0.9046, + "step": 123190 + }, + { + "epoch": 0.7870896847808032, + "grad_norm": 1.0643188953399658, + "learning_rate": 6.643470152285995e-05, + "loss": 0.9449, + "step": 123200 + }, + { + "epoch": 0.7871535719305419, + "grad_norm": 0.8711426258087158, + "learning_rate": 6.642996255542717e-05, + "loss": 0.7908, + "step": 123210 + }, + { + "epoch": 0.7872174590802806, + "grad_norm": 1.0922789573669434, + "learning_rate": 6.642522342253042e-05, + "loss": 0.8448, + "step": 123220 + }, + { + "epoch": 0.7872813462300193, + "grad_norm": 0.9862490892410278, + "learning_rate": 6.642048412421749e-05, + "loss": 0.758, + "step": 123230 + }, + { + "epoch": 0.787345233379758, + "grad_norm": 0.6439590454101562, + "learning_rate": 6.641574466053607e-05, + "loss": 0.6805, + "step": 123240 + }, + { + "epoch": 0.7874091205294967, + "grad_norm": 0.9654756784439087, + "learning_rate": 6.641100503153388e-05, + "loss": 0.7238, + "step": 123250 + }, + { + "epoch": 0.7874730076792354, + "grad_norm": 0.7384721636772156, + "learning_rate": 6.64062652372587e-05, + "loss": 0.9328, + "step": 123260 + }, + { + "epoch": 0.7875368948289742, + "grad_norm": 1.083748698234558, + "learning_rate": 6.640152527775821e-05, + "loss": 0.9526, + "step": 123270 + }, + { + "epoch": 0.7876007819787127, + "grad_norm": 1.286015510559082, + "learning_rate": 6.63967851530802e-05, + "loss": 0.9245, + "step": 123280 + }, + { + "epoch": 0.7876646691284515, + "grad_norm": 0.5310730338096619, + "learning_rate": 6.639204486327236e-05, + "loss": 0.8381, + "step": 123290 
+ }, + { + "epoch": 0.7877285562781902, + "grad_norm": 1.1618332862854004, + "learning_rate": 6.638730440838244e-05, + "loss": 1.1095, + "step": 123300 + }, + { + "epoch": 0.7877924434279289, + "grad_norm": 0.8446438908576965, + "learning_rate": 6.63825637884582e-05, + "loss": 0.9877, + "step": 123310 + }, + { + "epoch": 0.7878563305776676, + "grad_norm": 0.6772144436836243, + "learning_rate": 6.637782300354737e-05, + "loss": 0.6984, + "step": 123320 + }, + { + "epoch": 0.7879202177274063, + "grad_norm": 0.6928181052207947, + "learning_rate": 6.63730820536977e-05, + "loss": 1.007, + "step": 123330 + }, + { + "epoch": 0.787984104877145, + "grad_norm": 1.1651209592819214, + "learning_rate": 6.63683409389569e-05, + "loss": 0.889, + "step": 123340 + }, + { + "epoch": 0.7880479920268837, + "grad_norm": 1.0764139890670776, + "learning_rate": 6.636359965937278e-05, + "loss": 0.8781, + "step": 123350 + }, + { + "epoch": 0.7881118791766224, + "grad_norm": 0.5415597558021545, + "learning_rate": 6.635885821499304e-05, + "loss": 0.9395, + "step": 123360 + }, + { + "epoch": 0.7881757663263611, + "grad_norm": 1.080687165260315, + "learning_rate": 6.635411660586543e-05, + "loss": 0.9444, + "step": 123370 + }, + { + "epoch": 0.7882396534760998, + "grad_norm": 0.8479616045951843, + "learning_rate": 6.634937483203773e-05, + "loss": 0.914, + "step": 123380 + }, + { + "epoch": 0.7883035406258385, + "grad_norm": 0.7916926145553589, + "learning_rate": 6.634463289355768e-05, + "loss": 1.037, + "step": 123390 + }, + { + "epoch": 0.7883674277755772, + "grad_norm": 0.9710598587989807, + "learning_rate": 6.633989079047306e-05, + "loss": 0.8785, + "step": 123400 + }, + { + "epoch": 0.7884313149253159, + "grad_norm": 1.142552137374878, + "learning_rate": 6.633514852283159e-05, + "loss": 0.8654, + "step": 123410 + }, + { + "epoch": 0.7884952020750546, + "grad_norm": 0.7896512746810913, + "learning_rate": 6.633040609068103e-05, + "loss": 0.755, + "step": 123420 + }, + { + "epoch": 0.7885590892247933, + "grad_norm": 1.4209386110305786, + "learning_rate": 6.632566349406916e-05, + "loss": 0.797, + "step": 123430 + }, + { + "epoch": 0.788622976374532, + "grad_norm": 0.843025267124176, + "learning_rate": 6.632092073304374e-05, + "loss": 1.0926, + "step": 123440 + }, + { + "epoch": 0.7886868635242708, + "grad_norm": 0.7612596750259399, + "learning_rate": 6.631617780765252e-05, + "loss": 0.979, + "step": 123450 + }, + { + "epoch": 0.7887507506740095, + "grad_norm": 1.0904258489608765, + "learning_rate": 6.631143471794328e-05, + "loss": 0.9336, + "step": 123460 + }, + { + "epoch": 0.7888146378237482, + "grad_norm": 1.2260910272598267, + "learning_rate": 6.630669146396376e-05, + "loss": 1.0369, + "step": 123470 + }, + { + "epoch": 0.7888785249734869, + "grad_norm": 1.343691349029541, + "learning_rate": 6.630194804576177e-05, + "loss": 0.7612, + "step": 123480 + }, + { + "epoch": 0.7889424121232256, + "grad_norm": 0.8414347171783447, + "learning_rate": 6.629720446338506e-05, + "loss": 0.788, + "step": 123490 + }, + { + "epoch": 0.7890062992729643, + "grad_norm": 0.814517080783844, + "learning_rate": 6.62924607168814e-05, + "loss": 0.931, + "step": 123500 + }, + { + "epoch": 0.789070186422703, + "grad_norm": 1.7236671447753906, + "learning_rate": 6.628771680629856e-05, + "loss": 0.8326, + "step": 123510 + }, + { + "epoch": 0.7891340735724416, + "grad_norm": 0.738429605960846, + "learning_rate": 6.628297273168433e-05, + "loss": 0.8309, + "step": 123520 + }, + { + "epoch": 0.7891979607221803, + "grad_norm": 0.8377928137779236, + 
"learning_rate": 6.627822849308648e-05, + "loss": 0.8578, + "step": 123530 + }, + { + "epoch": 0.789261847871919, + "grad_norm": 3.6376450061798096, + "learning_rate": 6.627348409055278e-05, + "loss": 0.8637, + "step": 123540 + }, + { + "epoch": 0.7893257350216577, + "grad_norm": 1.038316011428833, + "learning_rate": 6.626873952413102e-05, + "loss": 1.0122, + "step": 123550 + }, + { + "epoch": 0.7893896221713964, + "grad_norm": 0.8108795881271362, + "learning_rate": 6.626399479386898e-05, + "loss": 1.0062, + "step": 123560 + }, + { + "epoch": 0.7894535093211351, + "grad_norm": 0.9439957737922668, + "learning_rate": 6.625924989981444e-05, + "loss": 0.8456, + "step": 123570 + }, + { + "epoch": 0.7895173964708738, + "grad_norm": 0.6934586763381958, + "learning_rate": 6.625450484201519e-05, + "loss": 0.7537, + "step": 123580 + }, + { + "epoch": 0.7895812836206125, + "grad_norm": 0.6399053335189819, + "learning_rate": 6.6249759620519e-05, + "loss": 0.6623, + "step": 123590 + }, + { + "epoch": 0.7896451707703512, + "grad_norm": 0.7249269485473633, + "learning_rate": 6.624501423537368e-05, + "loss": 0.8404, + "step": 123600 + }, + { + "epoch": 0.7897090579200899, + "grad_norm": 0.655685305595398, + "learning_rate": 6.624026868662701e-05, + "loss": 0.8336, + "step": 123610 + }, + { + "epoch": 0.7897729450698286, + "grad_norm": 0.9134299159049988, + "learning_rate": 6.623552297432679e-05, + "loss": 0.6717, + "step": 123620 + }, + { + "epoch": 0.7898368322195674, + "grad_norm": 0.7788447737693787, + "learning_rate": 6.623077709852081e-05, + "loss": 0.8576, + "step": 123630 + }, + { + "epoch": 0.7899007193693061, + "grad_norm": 0.6765179634094238, + "learning_rate": 6.622603105925686e-05, + "loss": 1.0578, + "step": 123640 + }, + { + "epoch": 0.7899646065190448, + "grad_norm": 0.7470600605010986, + "learning_rate": 6.622128485658273e-05, + "loss": 0.7213, + "step": 123650 + }, + { + "epoch": 0.7900284936687835, + "grad_norm": 0.9108386635780334, + "learning_rate": 6.621653849054623e-05, + "loss": 1.0726, + "step": 123660 + }, + { + "epoch": 0.7900923808185222, + "grad_norm": 1.1410408020019531, + "learning_rate": 6.621179196119518e-05, + "loss": 0.8654, + "step": 123670 + }, + { + "epoch": 0.7901562679682609, + "grad_norm": 1.901923656463623, + "learning_rate": 6.620704526857734e-05, + "loss": 1.0014, + "step": 123680 + }, + { + "epoch": 0.7902201551179996, + "grad_norm": 1.0020592212677002, + "learning_rate": 6.620229841274054e-05, + "loss": 0.7709, + "step": 123690 + }, + { + "epoch": 0.7902840422677383, + "grad_norm": 1.4192286729812622, + "learning_rate": 6.619755139373257e-05, + "loss": 0.8418, + "step": 123700 + }, + { + "epoch": 0.790347929417477, + "grad_norm": 0.7686444520950317, + "learning_rate": 6.619280421160125e-05, + "loss": 0.8845, + "step": 123710 + }, + { + "epoch": 0.7904118165672157, + "grad_norm": 1.1990658044815063, + "learning_rate": 6.618805686639439e-05, + "loss": 0.9405, + "step": 123720 + }, + { + "epoch": 0.7904757037169544, + "grad_norm": 1.1820399761199951, + "learning_rate": 6.618330935815979e-05, + "loss": 0.9327, + "step": 123730 + }, + { + "epoch": 0.7905395908666931, + "grad_norm": 1.1175780296325684, + "learning_rate": 6.617856168694526e-05, + "loss": 0.756, + "step": 123740 + }, + { + "epoch": 0.7906034780164318, + "grad_norm": 1.217108130455017, + "learning_rate": 6.617381385279862e-05, + "loss": 0.9991, + "step": 123750 + }, + { + "epoch": 0.7906673651661704, + "grad_norm": 0.42774638533592224, + "learning_rate": 6.616906585576768e-05, + "loss": 0.7166, + 
"step": 123760 + }, + { + "epoch": 0.7907312523159091, + "grad_norm": 0.831774890422821, + "learning_rate": 6.616431769590027e-05, + "loss": 0.9865, + "step": 123770 + }, + { + "epoch": 0.7907951394656478, + "grad_norm": 0.9391918778419495, + "learning_rate": 6.615956937324418e-05, + "loss": 0.8336, + "step": 123780 + }, + { + "epoch": 0.7908590266153865, + "grad_norm": 1.3029203414916992, + "learning_rate": 6.615482088784726e-05, + "loss": 1.0872, + "step": 123790 + }, + { + "epoch": 0.7909229137651252, + "grad_norm": 0.6850435733795166, + "learning_rate": 6.615007223975732e-05, + "loss": 0.6906, + "step": 123800 + }, + { + "epoch": 0.790986800914864, + "grad_norm": 1.2866231203079224, + "learning_rate": 6.614532342902216e-05, + "loss": 0.9472, + "step": 123810 + }, + { + "epoch": 0.7910506880646027, + "grad_norm": 2.2075138092041016, + "learning_rate": 6.614057445568961e-05, + "loss": 0.9311, + "step": 123820 + }, + { + "epoch": 0.7911145752143414, + "grad_norm": 0.7719504833221436, + "learning_rate": 6.613582531980755e-05, + "loss": 0.8272, + "step": 123830 + }, + { + "epoch": 0.7911784623640801, + "grad_norm": 0.922818660736084, + "learning_rate": 6.613107602142376e-05, + "loss": 0.9421, + "step": 123840 + }, + { + "epoch": 0.7912423495138188, + "grad_norm": 0.7339285612106323, + "learning_rate": 6.612632656058608e-05, + "loss": 0.8638, + "step": 123850 + }, + { + "epoch": 0.7913062366635575, + "grad_norm": 0.9554028511047363, + "learning_rate": 6.612157693734233e-05, + "loss": 0.7373, + "step": 123860 + }, + { + "epoch": 0.7913701238132962, + "grad_norm": 1.1785390377044678, + "learning_rate": 6.611682715174036e-05, + "loss": 0.6176, + "step": 123870 + }, + { + "epoch": 0.7914340109630349, + "grad_norm": 0.7205845713615417, + "learning_rate": 6.6112077203828e-05, + "loss": 0.844, + "step": 123880 + }, + { + "epoch": 0.7914978981127736, + "grad_norm": 0.773068368434906, + "learning_rate": 6.610732709365308e-05, + "loss": 0.7221, + "step": 123890 + }, + { + "epoch": 0.7915617852625123, + "grad_norm": 0.8639894127845764, + "learning_rate": 6.610257682126344e-05, + "loss": 0.8949, + "step": 123900 + }, + { + "epoch": 0.791625672412251, + "grad_norm": 0.8907663822174072, + "learning_rate": 6.609782638670692e-05, + "loss": 1.0061, + "step": 123910 + }, + { + "epoch": 0.7916895595619897, + "grad_norm": 0.8259413242340088, + "learning_rate": 6.609307579003136e-05, + "loss": 1.1113, + "step": 123920 + }, + { + "epoch": 0.7917534467117284, + "grad_norm": 0.9798697233200073, + "learning_rate": 6.608832503128461e-05, + "loss": 0.607, + "step": 123930 + }, + { + "epoch": 0.7918173338614671, + "grad_norm": 0.7070125937461853, + "learning_rate": 6.608357411051451e-05, + "loss": 0.7719, + "step": 123940 + }, + { + "epoch": 0.7918812210112058, + "grad_norm": 1.0658955574035645, + "learning_rate": 6.607882302776892e-05, + "loss": 0.9488, + "step": 123950 + }, + { + "epoch": 0.7919451081609445, + "grad_norm": 0.6668451428413391, + "learning_rate": 6.607407178309564e-05, + "loss": 0.8527, + "step": 123960 + }, + { + "epoch": 0.7920089953106833, + "grad_norm": 0.6440159678459167, + "learning_rate": 6.606932037654256e-05, + "loss": 0.9539, + "step": 123970 + }, + { + "epoch": 0.792072882460422, + "grad_norm": 0.7459390759468079, + "learning_rate": 6.606456880815754e-05, + "loss": 0.8549, + "step": 123980 + }, + { + "epoch": 0.7921367696101607, + "grad_norm": 0.9598776698112488, + "learning_rate": 6.60598170779884e-05, + "loss": 1.0577, + "step": 123990 + }, + { + "epoch": 0.7922006567598994, + "grad_norm": 
1.091723084449768, + "learning_rate": 6.6055065186083e-05, + "loss": 0.8536, + "step": 124000 + }, + { + "epoch": 0.792264543909638, + "grad_norm": 1.7928060293197632, + "learning_rate": 6.605031313248922e-05, + "loss": 0.8274, + "step": 124010 + }, + { + "epoch": 0.7923284310593767, + "grad_norm": 1.0446691513061523, + "learning_rate": 6.604556091725489e-05, + "loss": 1.192, + "step": 124020 + }, + { + "epoch": 0.7923923182091154, + "grad_norm": 0.7123937010765076, + "learning_rate": 6.604080854042789e-05, + "loss": 0.9224, + "step": 124030 + }, + { + "epoch": 0.7924562053588541, + "grad_norm": 0.9317911863327026, + "learning_rate": 6.603605600205606e-05, + "loss": 0.8837, + "step": 124040 + }, + { + "epoch": 0.7925200925085928, + "grad_norm": 0.8845491409301758, + "learning_rate": 6.603130330218727e-05, + "loss": 0.9611, + "step": 124050 + }, + { + "epoch": 0.7925839796583315, + "grad_norm": 1.66157865524292, + "learning_rate": 6.60265504408694e-05, + "loss": 0.6554, + "step": 124060 + }, + { + "epoch": 0.7926478668080702, + "grad_norm": 0.9395459294319153, + "learning_rate": 6.60217974181503e-05, + "loss": 0.8756, + "step": 124070 + }, + { + "epoch": 0.7927117539578089, + "grad_norm": 1.385488748550415, + "learning_rate": 6.601704423407784e-05, + "loss": 0.9138, + "step": 124080 + }, + { + "epoch": 0.7927756411075476, + "grad_norm": 0.8289713263511658, + "learning_rate": 6.601229088869988e-05, + "loss": 0.8807, + "step": 124090 + }, + { + "epoch": 0.7928395282572863, + "grad_norm": 0.7335384488105774, + "learning_rate": 6.60075373820643e-05, + "loss": 0.8432, + "step": 124100 + }, + { + "epoch": 0.792903415407025, + "grad_norm": 0.8279372453689575, + "learning_rate": 6.600278371421898e-05, + "loss": 0.9776, + "step": 124110 + }, + { + "epoch": 0.7929673025567637, + "grad_norm": 0.7263229489326477, + "learning_rate": 6.599802988521178e-05, + "loss": 0.7568, + "step": 124120 + }, + { + "epoch": 0.7930311897065024, + "grad_norm": 0.8773966431617737, + "learning_rate": 6.599327589509056e-05, + "loss": 0.7536, + "step": 124130 + }, + { + "epoch": 0.7930950768562411, + "grad_norm": 0.7500774264335632, + "learning_rate": 6.598852174390324e-05, + "loss": 0.8422, + "step": 124140 + }, + { + "epoch": 0.7931589640059798, + "grad_norm": 0.6933243274688721, + "learning_rate": 6.598376743169767e-05, + "loss": 0.8359, + "step": 124150 + }, + { + "epoch": 0.7932228511557186, + "grad_norm": 0.848579466342926, + "learning_rate": 6.597901295852172e-05, + "loss": 0.8249, + "step": 124160 + }, + { + "epoch": 0.7932867383054573, + "grad_norm": 1.0843689441680908, + "learning_rate": 6.59742583244233e-05, + "loss": 0.8095, + "step": 124170 + }, + { + "epoch": 0.793350625455196, + "grad_norm": 0.6765615940093994, + "learning_rate": 6.596950352945026e-05, + "loss": 0.8884, + "step": 124180 + }, + { + "epoch": 0.7934145126049347, + "grad_norm": 0.7143245935440063, + "learning_rate": 6.596474857365052e-05, + "loss": 0.8451, + "step": 124190 + }, + { + "epoch": 0.7934783997546734, + "grad_norm": 0.9047801494598389, + "learning_rate": 6.595999345707195e-05, + "loss": 1.2198, + "step": 124200 + }, + { + "epoch": 0.7935422869044121, + "grad_norm": 0.6656885147094727, + "learning_rate": 6.595523817976243e-05, + "loss": 0.8917, + "step": 124210 + }, + { + "epoch": 0.7936061740541508, + "grad_norm": 0.7534079551696777, + "learning_rate": 6.595048274176986e-05, + "loss": 1.0604, + "step": 124220 + }, + { + "epoch": 0.7936700612038895, + "grad_norm": 1.1482276916503906, + "learning_rate": 6.594572714314213e-05, + "loss": 
0.8904, + "step": 124230 + }, + { + "epoch": 0.7937339483536282, + "grad_norm": 0.8140796422958374, + "learning_rate": 6.594097138392715e-05, + "loss": 0.7813, + "step": 124240 + }, + { + "epoch": 0.7937978355033668, + "grad_norm": 1.066030502319336, + "learning_rate": 6.593621546417279e-05, + "loss": 0.6392, + "step": 124250 + }, + { + "epoch": 0.7938617226531055, + "grad_norm": 0.5749644041061401, + "learning_rate": 6.593145938392694e-05, + "loss": 1.107, + "step": 124260 + }, + { + "epoch": 0.7939256098028442, + "grad_norm": 0.8076421022415161, + "learning_rate": 6.592670314323753e-05, + "loss": 0.7277, + "step": 124270 + }, + { + "epoch": 0.7939894969525829, + "grad_norm": 0.9243035316467285, + "learning_rate": 6.592194674215242e-05, + "loss": 0.8893, + "step": 124280 + }, + { + "epoch": 0.7940533841023216, + "grad_norm": 0.696916401386261, + "learning_rate": 6.591719018071955e-05, + "loss": 0.848, + "step": 124290 + }, + { + "epoch": 0.7941172712520603, + "grad_norm": 1.0850187540054321, + "learning_rate": 6.591243345898679e-05, + "loss": 0.9137, + "step": 124300 + }, + { + "epoch": 0.794181158401799, + "grad_norm": 0.7324548363685608, + "learning_rate": 6.590767657700207e-05, + "loss": 0.9306, + "step": 124310 + }, + { + "epoch": 0.7942450455515377, + "grad_norm": 1.1049119234085083, + "learning_rate": 6.590291953481326e-05, + "loss": 0.9547, + "step": 124320 + }, + { + "epoch": 0.7943089327012764, + "grad_norm": 0.8178719878196716, + "learning_rate": 6.589816233246832e-05, + "loss": 0.9391, + "step": 124330 + }, + { + "epoch": 0.7943728198510152, + "grad_norm": 0.793376088142395, + "learning_rate": 6.589340497001511e-05, + "loss": 0.9583, + "step": 124340 + }, + { + "epoch": 0.7944367070007539, + "grad_norm": 0.9736217856407166, + "learning_rate": 6.588864744750158e-05, + "loss": 0.624, + "step": 124350 + }, + { + "epoch": 0.7945005941504926, + "grad_norm": 1.2875404357910156, + "learning_rate": 6.588388976497563e-05, + "loss": 0.866, + "step": 124360 + }, + { + "epoch": 0.7945644813002313, + "grad_norm": 1.0183568000793457, + "learning_rate": 6.587913192248515e-05, + "loss": 0.8561, + "step": 124370 + }, + { + "epoch": 0.79462836844997, + "grad_norm": 0.6129552125930786, + "learning_rate": 6.587437392007809e-05, + "loss": 0.8037, + "step": 124380 + }, + { + "epoch": 0.7946922555997087, + "grad_norm": 1.3483234643936157, + "learning_rate": 6.586961575780233e-05, + "loss": 0.7024, + "step": 124390 + }, + { + "epoch": 0.7947561427494474, + "grad_norm": 0.7584971189498901, + "learning_rate": 6.586485743570583e-05, + "loss": 0.9416, + "step": 124400 + }, + { + "epoch": 0.7948200298991861, + "grad_norm": 0.6718287467956543, + "learning_rate": 6.58600989538365e-05, + "loss": 0.9615, + "step": 124410 + }, + { + "epoch": 0.7948839170489248, + "grad_norm": 1.4930849075317383, + "learning_rate": 6.585534031224223e-05, + "loss": 1.3212, + "step": 124420 + }, + { + "epoch": 0.7949478041986635, + "grad_norm": 0.9690805673599243, + "learning_rate": 6.585058151097097e-05, + "loss": 0.9029, + "step": 124430 + }, + { + "epoch": 0.7950116913484022, + "grad_norm": 0.528648316860199, + "learning_rate": 6.584582255007065e-05, + "loss": 0.6965, + "step": 124440 + }, + { + "epoch": 0.7950755784981409, + "grad_norm": 0.8390137553215027, + "learning_rate": 6.584106342958917e-05, + "loss": 0.6836, + "step": 124450 + }, + { + "epoch": 0.7951394656478796, + "grad_norm": 0.4811376929283142, + "learning_rate": 6.583630414957449e-05, + "loss": 1.126, + "step": 124460 + }, + { + "epoch": 0.7952033527976183, + 
"grad_norm": 0.8852686882019043, + "learning_rate": 6.583154471007453e-05, + "loss": 0.8745, + "step": 124470 + }, + { + "epoch": 0.795267239947357, + "grad_norm": 1.2516735792160034, + "learning_rate": 6.582678511113722e-05, + "loss": 1.1301, + "step": 124480 + }, + { + "epoch": 0.7953311270970956, + "grad_norm": 0.831717848777771, + "learning_rate": 6.58220253528105e-05, + "loss": 0.8727, + "step": 124490 + }, + { + "epoch": 0.7953950142468343, + "grad_norm": 0.7440255284309387, + "learning_rate": 6.581726543514227e-05, + "loss": 0.8061, + "step": 124500 + }, + { + "epoch": 0.795458901396573, + "grad_norm": 1.1145399808883667, + "learning_rate": 6.581250535818051e-05, + "loss": 1.0605, + "step": 124510 + }, + { + "epoch": 0.7955227885463118, + "grad_norm": 0.678453266620636, + "learning_rate": 6.580774512197314e-05, + "loss": 0.9662, + "step": 124520 + }, + { + "epoch": 0.7955866756960505, + "grad_norm": 1.0213743448257446, + "learning_rate": 6.58029847265681e-05, + "loss": 0.915, + "step": 124530 + }, + { + "epoch": 0.7956505628457892, + "grad_norm": 0.8792978525161743, + "learning_rate": 6.579822417201333e-05, + "loss": 0.8723, + "step": 124540 + }, + { + "epoch": 0.7957144499955279, + "grad_norm": 0.9746803045272827, + "learning_rate": 6.579346345835677e-05, + "loss": 0.8153, + "step": 124550 + }, + { + "epoch": 0.7957783371452666, + "grad_norm": 0.7419812083244324, + "learning_rate": 6.578870258564637e-05, + "loss": 0.7329, + "step": 124560 + }, + { + "epoch": 0.7958422242950053, + "grad_norm": 0.9181807041168213, + "learning_rate": 6.57839415539301e-05, + "loss": 0.8517, + "step": 124570 + }, + { + "epoch": 0.795906111444744, + "grad_norm": 0.7871003746986389, + "learning_rate": 6.577918036325586e-05, + "loss": 0.7752, + "step": 124580 + }, + { + "epoch": 0.7959699985944827, + "grad_norm": 0.8576268553733826, + "learning_rate": 6.577441901367163e-05, + "loss": 0.94, + "step": 124590 + }, + { + "epoch": 0.7960338857442214, + "grad_norm": 1.1811336278915405, + "learning_rate": 6.576965750522534e-05, + "loss": 0.9644, + "step": 124600 + }, + { + "epoch": 0.7960977728939601, + "grad_norm": 1.184383511543274, + "learning_rate": 6.576489583796498e-05, + "loss": 1.1323, + "step": 124610 + }, + { + "epoch": 0.7961616600436988, + "grad_norm": 0.9622499346733093, + "learning_rate": 6.576013401193846e-05, + "loss": 0.8139, + "step": 124620 + }, + { + "epoch": 0.7962255471934375, + "grad_norm": 1.1531530618667603, + "learning_rate": 6.575537202719377e-05, + "loss": 0.7081, + "step": 124630 + }, + { + "epoch": 0.7962894343431762, + "grad_norm": 1.1562443971633911, + "learning_rate": 6.575060988377885e-05, + "loss": 0.9157, + "step": 124640 + }, + { + "epoch": 0.7963533214929149, + "grad_norm": 0.8580940365791321, + "learning_rate": 6.574584758174166e-05, + "loss": 1.0154, + "step": 124650 + }, + { + "epoch": 0.7964172086426536, + "grad_norm": 0.7232387065887451, + "learning_rate": 6.574108512113016e-05, + "loss": 0.8085, + "step": 124660 + }, + { + "epoch": 0.7964810957923923, + "grad_norm": 0.9032987952232361, + "learning_rate": 6.573632250199234e-05, + "loss": 1.0046, + "step": 124670 + }, + { + "epoch": 0.796544982942131, + "grad_norm": 0.9355868697166443, + "learning_rate": 6.57315597243761e-05, + "loss": 0.7971, + "step": 124680 + }, + { + "epoch": 0.7966088700918698, + "grad_norm": 0.8538186550140381, + "learning_rate": 6.572679678832946e-05, + "loss": 0.9047, + "step": 124690 + }, + { + "epoch": 0.7966727572416085, + "grad_norm": 0.8761003017425537, + "learning_rate": 
6.572203369390038e-05, + "loss": 0.7487, + "step": 124700 + }, + { + "epoch": 0.7967366443913472, + "grad_norm": 0.643221914768219, + "learning_rate": 6.571727044113679e-05, + "loss": 0.8214, + "step": 124710 + }, + { + "epoch": 0.7968005315410859, + "grad_norm": 0.8245800137519836, + "learning_rate": 6.571250703008671e-05, + "loss": 0.996, + "step": 124720 + }, + { + "epoch": 0.7968644186908246, + "grad_norm": 0.692182719707489, + "learning_rate": 6.57077434607981e-05, + "loss": 0.8932, + "step": 124730 + }, + { + "epoch": 0.7969283058405632, + "grad_norm": 0.6998267769813538, + "learning_rate": 6.570297973331892e-05, + "loss": 0.8643, + "step": 124740 + }, + { + "epoch": 0.7969921929903019, + "grad_norm": 0.6680889129638672, + "learning_rate": 6.569821584769714e-05, + "loss": 0.8156, + "step": 124750 + }, + { + "epoch": 0.7970560801400406, + "grad_norm": 0.7822675704956055, + "learning_rate": 6.569345180398075e-05, + "loss": 0.8655, + "step": 124760 + }, + { + "epoch": 0.7971199672897793, + "grad_norm": 0.9974295496940613, + "learning_rate": 6.568868760221773e-05, + "loss": 0.6725, + "step": 124770 + }, + { + "epoch": 0.797183854439518, + "grad_norm": 0.6198078989982605, + "learning_rate": 6.568392324245605e-05, + "loss": 0.6848, + "step": 124780 + }, + { + "epoch": 0.7972477415892567, + "grad_norm": 1.088592767715454, + "learning_rate": 6.567915872474368e-05, + "loss": 1.1632, + "step": 124790 + }, + { + "epoch": 0.7973116287389954, + "grad_norm": 0.636913537979126, + "learning_rate": 6.567439404912864e-05, + "loss": 0.8826, + "step": 124800 + }, + { + "epoch": 0.7973755158887341, + "grad_norm": 0.7936016321182251, + "learning_rate": 6.566962921565886e-05, + "loss": 0.737, + "step": 124810 + }, + { + "epoch": 0.7974394030384728, + "grad_norm": 0.8261633515357971, + "learning_rate": 6.566486422438238e-05, + "loss": 0.8341, + "step": 124820 + }, + { + "epoch": 0.7975032901882115, + "grad_norm": 0.8050313591957092, + "learning_rate": 6.566009907534717e-05, + "loss": 0.7059, + "step": 124830 + }, + { + "epoch": 0.7975671773379502, + "grad_norm": 1.183261513710022, + "learning_rate": 6.565533376860121e-05, + "loss": 0.9832, + "step": 124840 + }, + { + "epoch": 0.797631064487689, + "grad_norm": 0.7606709003448486, + "learning_rate": 6.565056830419249e-05, + "loss": 0.9096, + "step": 124850 + }, + { + "epoch": 0.7976949516374277, + "grad_norm": 1.1363462209701538, + "learning_rate": 6.564580268216901e-05, + "loss": 0.6826, + "step": 124860 + }, + { + "epoch": 0.7977588387871664, + "grad_norm": 1.563015103340149, + "learning_rate": 6.564103690257875e-05, + "loss": 1.0033, + "step": 124870 + }, + { + "epoch": 0.7978227259369051, + "grad_norm": 1.6602332592010498, + "learning_rate": 6.563627096546973e-05, + "loss": 0.7878, + "step": 124880 + }, + { + "epoch": 0.7978866130866438, + "grad_norm": 1.8670592308044434, + "learning_rate": 6.563150487088994e-05, + "loss": 1.1101, + "step": 124890 + }, + { + "epoch": 0.7979505002363825, + "grad_norm": 0.8670040369033813, + "learning_rate": 6.562673861888735e-05, + "loss": 1.1995, + "step": 124900 + }, + { + "epoch": 0.7980143873861212, + "grad_norm": 0.7766642570495605, + "learning_rate": 6.562197220951e-05, + "loss": 0.8249, + "step": 124910 + }, + { + "epoch": 0.7980782745358599, + "grad_norm": 1.183617115020752, + "learning_rate": 6.561720564280588e-05, + "loss": 0.8269, + "step": 124920 + }, + { + "epoch": 0.7981421616855986, + "grad_norm": 0.9451962113380432, + "learning_rate": 6.561243891882298e-05, + "loss": 0.8455, + "step": 124930 + }, + { + 
"epoch": 0.7982060488353373, + "grad_norm": 1.272316813468933, + "learning_rate": 6.560767203760932e-05, + "loss": 0.9672, + "step": 124940 + }, + { + "epoch": 0.798269935985076, + "grad_norm": 0.775259256362915, + "learning_rate": 6.560290499921288e-05, + "loss": 0.7095, + "step": 124950 + }, + { + "epoch": 0.7983338231348147, + "grad_norm": 0.8193401098251343, + "learning_rate": 6.559813780368172e-05, + "loss": 0.8389, + "step": 124960 + }, + { + "epoch": 0.7983977102845534, + "grad_norm": 0.6283045411109924, + "learning_rate": 6.55933704510638e-05, + "loss": 0.969, + "step": 124970 + }, + { + "epoch": 0.798461597434292, + "grad_norm": 0.7653422951698303, + "learning_rate": 6.558860294140715e-05, + "loss": 0.9878, + "step": 124980 + }, + { + "epoch": 0.7985254845840307, + "grad_norm": 0.6775907874107361, + "learning_rate": 6.558383527475978e-05, + "loss": 0.8479, + "step": 124990 + }, + { + "epoch": 0.7985893717337694, + "grad_norm": 1.2088565826416016, + "learning_rate": 6.557906745116972e-05, + "loss": 1.0976, + "step": 125000 + }, + { + "epoch": 0.7986532588835081, + "grad_norm": 1.290498971939087, + "learning_rate": 6.557429947068496e-05, + "loss": 1.044, + "step": 125010 + }, + { + "epoch": 0.7987171460332468, + "grad_norm": 1.1517268419265747, + "learning_rate": 6.556953133335353e-05, + "loss": 1.1142, + "step": 125020 + }, + { + "epoch": 0.7987810331829855, + "grad_norm": 0.6305286884307861, + "learning_rate": 6.556476303922344e-05, + "loss": 0.9341, + "step": 125030 + }, + { + "epoch": 0.7988449203327243, + "grad_norm": 1.3918240070343018, + "learning_rate": 6.555999458834273e-05, + "loss": 0.7964, + "step": 125040 + }, + { + "epoch": 0.798908807482463, + "grad_norm": 1.885688304901123, + "learning_rate": 6.555522598075943e-05, + "loss": 0.9877, + "step": 125050 + }, + { + "epoch": 0.7989726946322017, + "grad_norm": 0.8548856973648071, + "learning_rate": 6.555045721652153e-05, + "loss": 0.8515, + "step": 125060 + }, + { + "epoch": 0.7990365817819404, + "grad_norm": 1.1888582706451416, + "learning_rate": 6.554568829567708e-05, + "loss": 0.8533, + "step": 125070 + }, + { + "epoch": 0.7991004689316791, + "grad_norm": 0.8727964162826538, + "learning_rate": 6.554091921827409e-05, + "loss": 0.6094, + "step": 125080 + }, + { + "epoch": 0.7991643560814178, + "grad_norm": 3.3259003162384033, + "learning_rate": 6.55361499843606e-05, + "loss": 1.0581, + "step": 125090 + }, + { + "epoch": 0.7992282432311565, + "grad_norm": 0.6152466535568237, + "learning_rate": 6.553138059398465e-05, + "loss": 0.6899, + "step": 125100 + }, + { + "epoch": 0.7992921303808952, + "grad_norm": 1.0475343465805054, + "learning_rate": 6.552661104719426e-05, + "loss": 0.9867, + "step": 125110 + }, + { + "epoch": 0.7993560175306339, + "grad_norm": 0.5357400178909302, + "learning_rate": 6.552184134403745e-05, + "loss": 0.8811, + "step": 125120 + }, + { + "epoch": 0.7994199046803726, + "grad_norm": 0.9087369441986084, + "learning_rate": 6.551707148456229e-05, + "loss": 0.8129, + "step": 125130 + }, + { + "epoch": 0.7994837918301113, + "grad_norm": 0.9281877279281616, + "learning_rate": 6.551230146881678e-05, + "loss": 0.8924, + "step": 125140 + }, + { + "epoch": 0.79954767897985, + "grad_norm": 1.1757920980453491, + "learning_rate": 6.550753129684897e-05, + "loss": 1.0507, + "step": 125150 + }, + { + "epoch": 0.7996115661295887, + "grad_norm": 1.0355859994888306, + "learning_rate": 6.550276096870692e-05, + "loss": 0.7105, + "step": 125160 + }, + { + "epoch": 0.7996754532793274, + "grad_norm": 1.1129631996154785, + 
"learning_rate": 6.549799048443865e-05, + "loss": 0.9878, + "step": 125170 + }, + { + "epoch": 0.7997393404290661, + "grad_norm": 0.9657943248748779, + "learning_rate": 6.549321984409221e-05, + "loss": 0.8496, + "step": 125180 + }, + { + "epoch": 0.7998032275788048, + "grad_norm": 0.8863055109977722, + "learning_rate": 6.548844904771564e-05, + "loss": 0.8804, + "step": 125190 + }, + { + "epoch": 0.7998671147285435, + "grad_norm": 0.6890332698822021, + "learning_rate": 6.548367809535699e-05, + "loss": 0.766, + "step": 125200 + }, + { + "epoch": 0.7999310018782823, + "grad_norm": 1.1075465679168701, + "learning_rate": 6.54789069870643e-05, + "loss": 0.8441, + "step": 125210 + }, + { + "epoch": 0.7999948890280209, + "grad_norm": 0.981423020362854, + "learning_rate": 6.547413572288564e-05, + "loss": 0.7874, + "step": 125220 + }, + { + "epoch": 0.8000587761777596, + "grad_norm": 0.9776214957237244, + "learning_rate": 6.546936430286903e-05, + "loss": 1.1112, + "step": 125230 + }, + { + "epoch": 0.8001226633274983, + "grad_norm": 0.6384032964706421, + "learning_rate": 6.546459272706254e-05, + "loss": 0.781, + "step": 125240 + }, + { + "epoch": 0.800186550477237, + "grad_norm": 1.0786020755767822, + "learning_rate": 6.545982099551422e-05, + "loss": 0.9097, + "step": 125250 + }, + { + "epoch": 0.8002504376269757, + "grad_norm": 1.0218867063522339, + "learning_rate": 6.545504910827214e-05, + "loss": 0.8466, + "step": 125260 + }, + { + "epoch": 0.8003143247767144, + "grad_norm": 0.5271647572517395, + "learning_rate": 6.545027706538434e-05, + "loss": 0.8278, + "step": 125270 + }, + { + "epoch": 0.8003782119264531, + "grad_norm": 0.5940924286842346, + "learning_rate": 6.544550486689889e-05, + "loss": 0.7146, + "step": 125280 + }, + { + "epoch": 0.8004420990761918, + "grad_norm": 1.223508596420288, + "learning_rate": 6.544073251286383e-05, + "loss": 0.9559, + "step": 125290 + }, + { + "epoch": 0.8005059862259305, + "grad_norm": 0.8251738548278809, + "learning_rate": 6.543596000332724e-05, + "loss": 0.8241, + "step": 125300 + }, + { + "epoch": 0.8005698733756692, + "grad_norm": 1.2967746257781982, + "learning_rate": 6.543118733833719e-05, + "loss": 0.7866, + "step": 125310 + }, + { + "epoch": 0.8006337605254079, + "grad_norm": 1.4892044067382812, + "learning_rate": 6.542641451794172e-05, + "loss": 0.785, + "step": 125320 + }, + { + "epoch": 0.8006976476751466, + "grad_norm": 0.7800642848014832, + "learning_rate": 6.54216415421889e-05, + "loss": 1.0806, + "step": 125330 + }, + { + "epoch": 0.8007615348248853, + "grad_norm": 1.0607541799545288, + "learning_rate": 6.541686841112685e-05, + "loss": 0.8438, + "step": 125340 + }, + { + "epoch": 0.800825421974624, + "grad_norm": 1.6348508596420288, + "learning_rate": 6.541209512480355e-05, + "loss": 0.9009, + "step": 125350 + }, + { + "epoch": 0.8008893091243627, + "grad_norm": 0.7919349670410156, + "learning_rate": 6.540732168326715e-05, + "loss": 0.9583, + "step": 125360 + }, + { + "epoch": 0.8009531962741014, + "grad_norm": 0.8712650537490845, + "learning_rate": 6.540254808656567e-05, + "loss": 0.7806, + "step": 125370 + }, + { + "epoch": 0.8010170834238401, + "grad_norm": 0.9894066452980042, + "learning_rate": 6.539825171690796e-05, + "loss": 0.9936, + "step": 125380 + }, + { + "epoch": 0.8010809705735789, + "grad_norm": 1.3204667568206787, + "learning_rate": 6.539347782552532e-05, + "loss": 0.9534, + "step": 125390 + }, + { + "epoch": 0.8011448577233176, + "grad_norm": 0.7939335107803345, + "learning_rate": 6.538870377911706e-05, + "loss": 0.9815, + "step": 
125400 + }, + { + "epoch": 0.8012087448730563, + "grad_norm": 0.7755239605903625, + "learning_rate": 6.538392957773122e-05, + "loss": 0.8787, + "step": 125410 + }, + { + "epoch": 0.801272632022795, + "grad_norm": 1.006554126739502, + "learning_rate": 6.53791552214159e-05, + "loss": 0.9379, + "step": 125420 + }, + { + "epoch": 0.8013365191725337, + "grad_norm": 0.7018999457359314, + "learning_rate": 6.53743807102192e-05, + "loss": 0.8513, + "step": 125430 + }, + { + "epoch": 0.8014004063222724, + "grad_norm": 0.9612287878990173, + "learning_rate": 6.536960604418918e-05, + "loss": 1.0788, + "step": 125440 + }, + { + "epoch": 0.8014642934720111, + "grad_norm": 0.7687857151031494, + "learning_rate": 6.536483122337391e-05, + "loss": 0.9172, + "step": 125450 + }, + { + "epoch": 0.8015281806217497, + "grad_norm": 1.8492335081100464, + "learning_rate": 6.536005624782152e-05, + "loss": 0.8897, + "step": 125460 + }, + { + "epoch": 0.8015920677714884, + "grad_norm": 0.7783719301223755, + "learning_rate": 6.535528111758006e-05, + "loss": 1.1489, + "step": 125470 + }, + { + "epoch": 0.8016559549212271, + "grad_norm": 1.056986927986145, + "learning_rate": 6.535050583269764e-05, + "loss": 0.7073, + "step": 125480 + }, + { + "epoch": 0.8017198420709658, + "grad_norm": 0.8337403535842896, + "learning_rate": 6.534573039322235e-05, + "loss": 0.8441, + "step": 125490 + }, + { + "epoch": 0.8017837292207045, + "grad_norm": 0.8885868191719055, + "learning_rate": 6.534095479920227e-05, + "loss": 0.7835, + "step": 125500 + }, + { + "epoch": 0.8018476163704432, + "grad_norm": 1.2602735757827759, + "learning_rate": 6.533617905068549e-05, + "loss": 0.7678, + "step": 125510 + }, + { + "epoch": 0.8019115035201819, + "grad_norm": 1.2273060083389282, + "learning_rate": 6.533140314772015e-05, + "loss": 1.3602, + "step": 125520 + }, + { + "epoch": 0.8019753906699206, + "grad_norm": 0.9865765571594238, + "learning_rate": 6.532662709035431e-05, + "loss": 1.0048, + "step": 125530 + }, + { + "epoch": 0.8020392778196593, + "grad_norm": 0.8420624136924744, + "learning_rate": 6.532185087863607e-05, + "loss": 0.7444, + "step": 125540 + }, + { + "epoch": 0.802103164969398, + "grad_norm": 4.6297712326049805, + "learning_rate": 6.531707451261354e-05, + "loss": 0.921, + "step": 125550 + }, + { + "epoch": 0.8021670521191367, + "grad_norm": 0.9108629822731018, + "learning_rate": 6.531229799233482e-05, + "loss": 1.1591, + "step": 125560 + }, + { + "epoch": 0.8022309392688755, + "grad_norm": 1.6921429634094238, + "learning_rate": 6.530752131784801e-05, + "loss": 0.8015, + "step": 125570 + }, + { + "epoch": 0.8022948264186142, + "grad_norm": 0.6055482029914856, + "learning_rate": 6.530274448920122e-05, + "loss": 0.8499, + "step": 125580 + }, + { + "epoch": 0.8023587135683529, + "grad_norm": 1.0739818811416626, + "learning_rate": 6.529796750644255e-05, + "loss": 0.8019, + "step": 125590 + }, + { + "epoch": 0.8024226007180916, + "grad_norm": 1.0607513189315796, + "learning_rate": 6.52931903696201e-05, + "loss": 0.9353, + "step": 125600 + }, + { + "epoch": 0.8024864878678303, + "grad_norm": 0.9030999541282654, + "learning_rate": 6.528841307878201e-05, + "loss": 1.1511, + "step": 125610 + }, + { + "epoch": 0.802550375017569, + "grad_norm": 1.075486183166504, + "learning_rate": 6.528363563397638e-05, + "loss": 0.8964, + "step": 125620 + }, + { + "epoch": 0.8026142621673077, + "grad_norm": 1.8785415887832642, + "learning_rate": 6.527885803525131e-05, + "loss": 0.7627, + "step": 125630 + }, + { + "epoch": 0.8026781493170464, + "grad_norm": 
0.9460232853889465, + "learning_rate": 6.527408028265491e-05, + "loss": 0.8828, + "step": 125640 + }, + { + "epoch": 0.8027420364667851, + "grad_norm": 0.7924548387527466, + "learning_rate": 6.526930237623533e-05, + "loss": 0.7436, + "step": 125650 + }, + { + "epoch": 0.8028059236165238, + "grad_norm": 0.8257904052734375, + "learning_rate": 6.526452431604065e-05, + "loss": 1.0438, + "step": 125660 + }, + { + "epoch": 0.8028698107662625, + "grad_norm": 1.1398284435272217, + "learning_rate": 6.5259746102119e-05, + "loss": 0.8756, + "step": 125670 + }, + { + "epoch": 0.8029336979160012, + "grad_norm": 0.955585777759552, + "learning_rate": 6.52549677345185e-05, + "loss": 1.0752, + "step": 125680 + }, + { + "epoch": 0.8029975850657399, + "grad_norm": 0.8351637721061707, + "learning_rate": 6.525018921328729e-05, + "loss": 0.9187, + "step": 125690 + }, + { + "epoch": 0.8030614722154786, + "grad_norm": 0.9746791124343872, + "learning_rate": 6.524541053847349e-05, + "loss": 0.6942, + "step": 125700 + }, + { + "epoch": 0.8031253593652172, + "grad_norm": 0.697482705116272, + "learning_rate": 6.52406317101252e-05, + "loss": 0.7528, + "step": 125710 + }, + { + "epoch": 0.8031892465149559, + "grad_norm": 0.9149326682090759, + "learning_rate": 6.523585272829056e-05, + "loss": 0.85, + "step": 125720 + }, + { + "epoch": 0.8032531336646946, + "grad_norm": 0.9170807003974915, + "learning_rate": 6.52310735930177e-05, + "loss": 1.1702, + "step": 125730 + }, + { + "epoch": 0.8033170208144333, + "grad_norm": 0.8044551014900208, + "learning_rate": 6.522629430435479e-05, + "loss": 1.0825, + "step": 125740 + }, + { + "epoch": 0.803380907964172, + "grad_norm": 1.228047490119934, + "learning_rate": 6.522151486234989e-05, + "loss": 0.7574, + "step": 125750 + }, + { + "epoch": 0.8034447951139108, + "grad_norm": 0.9429476857185364, + "learning_rate": 6.521673526705116e-05, + "loss": 1.0447, + "step": 125760 + }, + { + "epoch": 0.8035086822636495, + "grad_norm": 1.0148427486419678, + "learning_rate": 6.521195551850676e-05, + "loss": 1.0113, + "step": 125770 + }, + { + "epoch": 0.8035725694133882, + "grad_norm": 0.9460819959640503, + "learning_rate": 6.520717561676481e-05, + "loss": 1.0225, + "step": 125780 + }, + { + "epoch": 0.8036364565631269, + "grad_norm": 1.2216135263442993, + "learning_rate": 6.520239556187345e-05, + "loss": 0.999, + "step": 125790 + }, + { + "epoch": 0.8037003437128656, + "grad_norm": 0.7542139887809753, + "learning_rate": 6.519761535388079e-05, + "loss": 1.1307, + "step": 125800 + }, + { + "epoch": 0.8037642308626043, + "grad_norm": 0.6314334273338318, + "learning_rate": 6.519283499283502e-05, + "loss": 0.8114, + "step": 125810 + }, + { + "epoch": 0.803828118012343, + "grad_norm": 1.1564096212387085, + "learning_rate": 6.518805447878425e-05, + "loss": 0.8931, + "step": 125820 + }, + { + "epoch": 0.8038920051620817, + "grad_norm": 0.7837060689926147, + "learning_rate": 6.518327381177663e-05, + "loss": 0.7861, + "step": 125830 + }, + { + "epoch": 0.8039558923118204, + "grad_norm": 0.8681246042251587, + "learning_rate": 6.51784929918603e-05, + "loss": 1.194, + "step": 125840 + }, + { + "epoch": 0.8040197794615591, + "grad_norm": 1.4381413459777832, + "learning_rate": 6.517371201908342e-05, + "loss": 0.8307, + "step": 125850 + }, + { + "epoch": 0.8040836666112978, + "grad_norm": 1.4342834949493408, + "learning_rate": 6.516893089349414e-05, + "loss": 0.7483, + "step": 125860 + }, + { + "epoch": 0.8041475537610365, + "grad_norm": 0.9879970550537109, + "learning_rate": 6.516414961514059e-05, + "loss": 
1.1164, + "step": 125870 + }, + { + "epoch": 0.8042114409107752, + "grad_norm": 1.0189735889434814, + "learning_rate": 6.515936818407095e-05, + "loss": 0.9046, + "step": 125880 + }, + { + "epoch": 0.8042753280605139, + "grad_norm": 1.1108025312423706, + "learning_rate": 6.515458660033335e-05, + "loss": 0.913, + "step": 125890 + }, + { + "epoch": 0.8043392152102526, + "grad_norm": 0.862022876739502, + "learning_rate": 6.514980486397595e-05, + "loss": 0.913, + "step": 125900 + }, + { + "epoch": 0.8044031023599914, + "grad_norm": 0.8951718807220459, + "learning_rate": 6.51450229750469e-05, + "loss": 0.8194, + "step": 125910 + }, + { + "epoch": 0.8044669895097301, + "grad_norm": 0.9488630890846252, + "learning_rate": 6.514024093359438e-05, + "loss": 0.9198, + "step": 125920 + }, + { + "epoch": 0.8045308766594688, + "grad_norm": 1.038546085357666, + "learning_rate": 6.513545873966654e-05, + "loss": 0.8252, + "step": 125930 + }, + { + "epoch": 0.8045947638092075, + "grad_norm": 0.8957170844078064, + "learning_rate": 6.513067639331151e-05, + "loss": 0.9968, + "step": 125940 + }, + { + "epoch": 0.8046586509589461, + "grad_norm": 0.7613710761070251, + "learning_rate": 6.512589389457751e-05, + "loss": 0.9103, + "step": 125950 + }, + { + "epoch": 0.8047225381086848, + "grad_norm": 1.0033246278762817, + "learning_rate": 6.512111124351265e-05, + "loss": 0.9965, + "step": 125960 + }, + { + "epoch": 0.8047864252584235, + "grad_norm": 0.5386576652526855, + "learning_rate": 6.511632844016512e-05, + "loss": 0.7118, + "step": 125970 + }, + { + "epoch": 0.8048503124081622, + "grad_norm": 0.74485844373703, + "learning_rate": 6.511154548458312e-05, + "loss": 0.7851, + "step": 125980 + }, + { + "epoch": 0.8049141995579009, + "grad_norm": 0.9282761812210083, + "learning_rate": 6.510676237681475e-05, + "loss": 0.9678, + "step": 125990 + }, + { + "epoch": 0.8049780867076396, + "grad_norm": 2.0792996883392334, + "learning_rate": 6.510197911690822e-05, + "loss": 1.4649, + "step": 126000 + }, + { + "epoch": 0.8050419738573783, + "grad_norm": 0.6187208294868469, + "learning_rate": 6.50971957049117e-05, + "loss": 0.9499, + "step": 126010 + }, + { + "epoch": 0.805105861007117, + "grad_norm": 0.8118966221809387, + "learning_rate": 6.509241214087334e-05, + "loss": 0.7766, + "step": 126020 + }, + { + "epoch": 0.8051697481568557, + "grad_norm": 0.8239946365356445, + "learning_rate": 6.508762842484135e-05, + "loss": 0.8789, + "step": 126030 + }, + { + "epoch": 0.8052336353065944, + "grad_norm": 1.1240622997283936, + "learning_rate": 6.508284455686388e-05, + "loss": 0.6257, + "step": 126040 + }, + { + "epoch": 0.8052975224563331, + "grad_norm": 1.1769168376922607, + "learning_rate": 6.507806053698912e-05, + "loss": 0.8184, + "step": 126050 + }, + { + "epoch": 0.8053614096060718, + "grad_norm": 1.071930170059204, + "learning_rate": 6.507327636526526e-05, + "loss": 0.778, + "step": 126060 + }, + { + "epoch": 0.8054252967558105, + "grad_norm": 0.9074715375900269, + "learning_rate": 6.506849204174045e-05, + "loss": 1.0664, + "step": 126070 + }, + { + "epoch": 0.8054891839055492, + "grad_norm": 1.1464279890060425, + "learning_rate": 6.50637075664629e-05, + "loss": 0.9875, + "step": 126080 + }, + { + "epoch": 0.805553071055288, + "grad_norm": 1.2106982469558716, + "learning_rate": 6.505892293948077e-05, + "loss": 0.9013, + "step": 126090 + }, + { + "epoch": 0.8056169582050267, + "grad_norm": 0.763820230960846, + "learning_rate": 6.505413816084227e-05, + "loss": 0.8071, + "step": 126100 + }, + { + "epoch": 0.8056808453547654, + 
"grad_norm": 1.4662244319915771, + "learning_rate": 6.504935323059558e-05, + "loss": 0.7963, + "step": 126110 + }, + { + "epoch": 0.8057447325045041, + "grad_norm": 1.1400254964828491, + "learning_rate": 6.504456814878888e-05, + "loss": 0.8126, + "step": 126120 + }, + { + "epoch": 0.8058086196542428, + "grad_norm": 0.844118595123291, + "learning_rate": 6.503978291547035e-05, + "loss": 0.8508, + "step": 126130 + }, + { + "epoch": 0.8058725068039815, + "grad_norm": 0.9051877856254578, + "learning_rate": 6.50349975306882e-05, + "loss": 0.788, + "step": 126140 + }, + { + "epoch": 0.8059363939537202, + "grad_norm": 0.8042912483215332, + "learning_rate": 6.503021199449063e-05, + "loss": 0.9271, + "step": 126150 + }, + { + "epoch": 0.8060002811034589, + "grad_norm": 0.8122944235801697, + "learning_rate": 6.50254263069258e-05, + "loss": 1.2381, + "step": 126160 + }, + { + "epoch": 0.8060641682531976, + "grad_norm": 0.9089512228965759, + "learning_rate": 6.502064046804193e-05, + "loss": 0.8603, + "step": 126170 + }, + { + "epoch": 0.8061280554029363, + "grad_norm": 0.9631441235542297, + "learning_rate": 6.501585447788724e-05, + "loss": 1.0211, + "step": 126180 + }, + { + "epoch": 0.8061919425526749, + "grad_norm": 0.5467532873153687, + "learning_rate": 6.501106833650989e-05, + "loss": 0.7911, + "step": 126190 + }, + { + "epoch": 0.8062558297024136, + "grad_norm": 0.8144317269325256, + "learning_rate": 6.500628204395809e-05, + "loss": 1.0271, + "step": 126200 + }, + { + "epoch": 0.8063197168521523, + "grad_norm": 0.6521165370941162, + "learning_rate": 6.500149560028005e-05, + "loss": 0.9794, + "step": 126210 + }, + { + "epoch": 0.806383604001891, + "grad_norm": 0.6328021287918091, + "learning_rate": 6.499670900552397e-05, + "loss": 0.8287, + "step": 126220 + }, + { + "epoch": 0.8064474911516297, + "grad_norm": 0.565212607383728, + "learning_rate": 6.499192225973806e-05, + "loss": 0.8106, + "step": 126230 + }, + { + "epoch": 0.8065113783013684, + "grad_norm": 0.8968755602836609, + "learning_rate": 6.498713536297053e-05, + "loss": 0.7972, + "step": 126240 + }, + { + "epoch": 0.8065752654511071, + "grad_norm": 0.9558743834495544, + "learning_rate": 6.498234831526957e-05, + "loss": 0.8166, + "step": 126250 + }, + { + "epoch": 0.8066391526008458, + "grad_norm": 0.8803595900535583, + "learning_rate": 6.497756111668342e-05, + "loss": 0.7788, + "step": 126260 + }, + { + "epoch": 0.8067030397505845, + "grad_norm": 0.6762875914573669, + "learning_rate": 6.497277376726025e-05, + "loss": 0.9889, + "step": 126270 + }, + { + "epoch": 0.8067669269003233, + "grad_norm": 0.7682203650474548, + "learning_rate": 6.496798626704831e-05, + "loss": 1.016, + "step": 126280 + }, + { + "epoch": 0.806830814050062, + "grad_norm": 0.8153054714202881, + "learning_rate": 6.496319861609579e-05, + "loss": 0.8005, + "step": 126290 + }, + { + "epoch": 0.8068947011998007, + "grad_norm": 0.767785906791687, + "learning_rate": 6.495841081445091e-05, + "loss": 0.678, + "step": 126300 + }, + { + "epoch": 0.8069585883495394, + "grad_norm": 1.054632544517517, + "learning_rate": 6.495362286216191e-05, + "loss": 0.8752, + "step": 126310 + }, + { + "epoch": 0.8070224754992781, + "grad_norm": 1.0451246500015259, + "learning_rate": 6.494883475927698e-05, + "loss": 0.8354, + "step": 126320 + }, + { + "epoch": 0.8070863626490168, + "grad_norm": 0.6930572986602783, + "learning_rate": 6.494404650584435e-05, + "loss": 0.9319, + "step": 126330 + }, + { + "epoch": 0.8071502497987555, + "grad_norm": 0.9584304094314575, + "learning_rate": 
6.493925810191226e-05, + "loss": 0.924, + "step": 126340 + }, + { + "epoch": 0.8072141369484942, + "grad_norm": 1.0455424785614014, + "learning_rate": 6.49344695475289e-05, + "loss": 1.0299, + "step": 126350 + }, + { + "epoch": 0.8072780240982329, + "grad_norm": 1.3468433618545532, + "learning_rate": 6.49296808427425e-05, + "loss": 0.7191, + "step": 126360 + }, + { + "epoch": 0.8073419112479716, + "grad_norm": 1.8125096559524536, + "learning_rate": 6.492489198760131e-05, + "loss": 0.8762, + "step": 126370 + }, + { + "epoch": 0.8074057983977103, + "grad_norm": 1.148374080657959, + "learning_rate": 6.492010298215355e-05, + "loss": 0.8672, + "step": 126380 + }, + { + "epoch": 0.807469685547449, + "grad_norm": 0.7599702477455139, + "learning_rate": 6.491531382644744e-05, + "loss": 0.9156, + "step": 126390 + }, + { + "epoch": 0.8075335726971877, + "grad_norm": 1.1603766679763794, + "learning_rate": 6.491052452053123e-05, + "loss": 0.7981, + "step": 126400 + }, + { + "epoch": 0.8075974598469264, + "grad_norm": 0.9405614733695984, + "learning_rate": 6.490573506445312e-05, + "loss": 0.8393, + "step": 126410 + }, + { + "epoch": 0.8076613469966651, + "grad_norm": 1.048951268196106, + "learning_rate": 6.490094545826137e-05, + "loss": 1.0174, + "step": 126420 + }, + { + "epoch": 0.8077252341464038, + "grad_norm": 2.152139663696289, + "learning_rate": 6.48961557020042e-05, + "loss": 1.0473, + "step": 126430 + }, + { + "epoch": 0.8077891212961424, + "grad_norm": 0.6861464977264404, + "learning_rate": 6.489136579572987e-05, + "loss": 0.7224, + "step": 126440 + }, + { + "epoch": 0.8078530084458811, + "grad_norm": 0.8665691018104553, + "learning_rate": 6.48865757394866e-05, + "loss": 1.0713, + "step": 126450 + }, + { + "epoch": 0.8079168955956199, + "grad_norm": 0.651671826839447, + "learning_rate": 6.488178553332262e-05, + "loss": 1.0617, + "step": 126460 + }, + { + "epoch": 0.8079807827453586, + "grad_norm": 1.4242401123046875, + "learning_rate": 6.487699517728621e-05, + "loss": 0.8041, + "step": 126470 + }, + { + "epoch": 0.8080446698950973, + "grad_norm": 0.5821726322174072, + "learning_rate": 6.487220467142556e-05, + "loss": 0.9239, + "step": 126480 + }, + { + "epoch": 0.808108557044836, + "grad_norm": 0.5187436938285828, + "learning_rate": 6.486741401578897e-05, + "loss": 0.7294, + "step": 126490 + }, + { + "epoch": 0.8081724441945747, + "grad_norm": 0.7180354595184326, + "learning_rate": 6.486262321042465e-05, + "loss": 0.8105, + "step": 126500 + }, + { + "epoch": 0.8082363313443134, + "grad_norm": 1.0905053615570068, + "learning_rate": 6.485783225538084e-05, + "loss": 0.7558, + "step": 126510 + }, + { + "epoch": 0.8083002184940521, + "grad_norm": 0.7358648777008057, + "learning_rate": 6.485304115070582e-05, + "loss": 0.7867, + "step": 126520 + }, + { + "epoch": 0.8083641056437908, + "grad_norm": 0.6395271420478821, + "learning_rate": 6.484824989644783e-05, + "loss": 0.8508, + "step": 126530 + }, + { + "epoch": 0.8084279927935295, + "grad_norm": 0.877444863319397, + "learning_rate": 6.48434584926551e-05, + "loss": 1.0844, + "step": 126540 + }, + { + "epoch": 0.8084918799432682, + "grad_norm": 0.5219199061393738, + "learning_rate": 6.483866693937591e-05, + "loss": 0.7814, + "step": 126550 + }, + { + "epoch": 0.8085557670930069, + "grad_norm": 2.268413543701172, + "learning_rate": 6.483387523665852e-05, + "loss": 0.8243, + "step": 126560 + }, + { + "epoch": 0.8086196542427456, + "grad_norm": 0.6467793583869934, + "learning_rate": 6.482908338455113e-05, + "loss": 0.8682, + "step": 126570 + }, + { + 
"epoch": 0.8086835413924843, + "grad_norm": 1.109560489654541, + "learning_rate": 6.48242913831021e-05, + "loss": 0.7325, + "step": 126580 + }, + { + "epoch": 0.808747428542223, + "grad_norm": 2.1238925457000732, + "learning_rate": 6.48194992323596e-05, + "loss": 0.7621, + "step": 126590 + }, + { + "epoch": 0.8088113156919617, + "grad_norm": 0.7818292379379272, + "learning_rate": 6.481470693237193e-05, + "loss": 0.8513, + "step": 126600 + }, + { + "epoch": 0.8088752028417004, + "grad_norm": 0.8651500344276428, + "learning_rate": 6.480991448318735e-05, + "loss": 1.1925, + "step": 126610 + }, + { + "epoch": 0.8089390899914392, + "grad_norm": 1.0120964050292969, + "learning_rate": 6.48051218848541e-05, + "loss": 0.7734, + "step": 126620 + }, + { + "epoch": 0.8090029771411779, + "grad_norm": 0.6055595278739929, + "learning_rate": 6.480032913742047e-05, + "loss": 0.8752, + "step": 126630 + }, + { + "epoch": 0.8090668642909166, + "grad_norm": 0.9761593341827393, + "learning_rate": 6.479553624093473e-05, + "loss": 1.0153, + "step": 126640 + }, + { + "epoch": 0.8091307514406553, + "grad_norm": 0.926140308380127, + "learning_rate": 6.479074319544513e-05, + "loss": 0.7519, + "step": 126650 + }, + { + "epoch": 0.809194638590394, + "grad_norm": 1.0344536304473877, + "learning_rate": 6.478595000099996e-05, + "loss": 0.8786, + "step": 126660 + }, + { + "epoch": 0.8092585257401327, + "grad_norm": 1.2882970571517944, + "learning_rate": 6.478115665764748e-05, + "loss": 0.8631, + "step": 126670 + }, + { + "epoch": 0.8093224128898713, + "grad_norm": 0.98709636926651, + "learning_rate": 6.477636316543596e-05, + "loss": 0.9382, + "step": 126680 + }, + { + "epoch": 0.80938630003961, + "grad_norm": 0.9741780161857605, + "learning_rate": 6.477156952441368e-05, + "loss": 0.7388, + "step": 126690 + }, + { + "epoch": 0.8094501871893487, + "grad_norm": 0.7120775580406189, + "learning_rate": 6.476677573462893e-05, + "loss": 0.8167, + "step": 126700 + }, + { + "epoch": 0.8095140743390874, + "grad_norm": 0.9984919428825378, + "learning_rate": 6.476198179612995e-05, + "loss": 0.897, + "step": 126710 + }, + { + "epoch": 0.8095779614888261, + "grad_norm": 2.221468925476074, + "learning_rate": 6.475718770896505e-05, + "loss": 1.0463, + "step": 126720 + }, + { + "epoch": 0.8096418486385648, + "grad_norm": 0.9233216643333435, + "learning_rate": 6.47523934731825e-05, + "loss": 0.8333, + "step": 126730 + }, + { + "epoch": 0.8097057357883035, + "grad_norm": 0.7584207057952881, + "learning_rate": 6.47475990888306e-05, + "loss": 1.0903, + "step": 126740 + }, + { + "epoch": 0.8097696229380422, + "grad_norm": 0.913167417049408, + "learning_rate": 6.474280455595761e-05, + "loss": 0.9977, + "step": 126750 + }, + { + "epoch": 0.8098335100877809, + "grad_norm": 0.8217071890830994, + "learning_rate": 6.473800987461182e-05, + "loss": 0.8709, + "step": 126760 + }, + { + "epoch": 0.8098973972375196, + "grad_norm": 0.8127371072769165, + "learning_rate": 6.473321504484152e-05, + "loss": 0.9532, + "step": 126770 + }, + { + "epoch": 0.8099612843872583, + "grad_norm": 0.5659823417663574, + "learning_rate": 6.4728420066695e-05, + "loss": 0.9879, + "step": 126780 + }, + { + "epoch": 0.810025171536997, + "grad_norm": 4.89599084854126, + "learning_rate": 6.472362494022055e-05, + "loss": 0.973, + "step": 126790 + }, + { + "epoch": 0.8100890586867358, + "grad_norm": 2.1333658695220947, + "learning_rate": 6.471882966546647e-05, + "loss": 0.6452, + "step": 126800 + }, + { + "epoch": 0.8101529458364745, + "grad_norm": 0.8865774869918823, + 
"learning_rate": 6.471403424248102e-05, + "loss": 0.7954, + "step": 126810 + }, + { + "epoch": 0.8102168329862132, + "grad_norm": 0.8974156975746155, + "learning_rate": 6.470923867131254e-05, + "loss": 0.9977, + "step": 126820 + }, + { + "epoch": 0.8102807201359519, + "grad_norm": 1.3754866123199463, + "learning_rate": 6.47044429520093e-05, + "loss": 0.8972, + "step": 126830 + }, + { + "epoch": 0.8103446072856906, + "grad_norm": 0.8997700214385986, + "learning_rate": 6.469964708461957e-05, + "loss": 0.6717, + "step": 126840 + }, + { + "epoch": 0.8104084944354293, + "grad_norm": 0.7010866403579712, + "learning_rate": 6.469485106919171e-05, + "loss": 0.688, + "step": 126850 + }, + { + "epoch": 0.810472381585168, + "grad_norm": 1.2997609376907349, + "learning_rate": 6.469005490577397e-05, + "loss": 1.0006, + "step": 126860 + }, + { + "epoch": 0.8105362687349067, + "grad_norm": 0.8053306937217712, + "learning_rate": 6.468525859441466e-05, + "loss": 0.8973, + "step": 126870 + }, + { + "epoch": 0.8106001558846454, + "grad_norm": 0.7065293192863464, + "learning_rate": 6.46804621351621e-05, + "loss": 1.0594, + "step": 126880 + }, + { + "epoch": 0.8106640430343841, + "grad_norm": 1.1768196821212769, + "learning_rate": 6.467566552806458e-05, + "loss": 0.9089, + "step": 126890 + }, + { + "epoch": 0.8107279301841228, + "grad_norm": 1.5799227952957153, + "learning_rate": 6.467086877317042e-05, + "loss": 0.7268, + "step": 126900 + }, + { + "epoch": 0.8107918173338615, + "grad_norm": 0.7918219566345215, + "learning_rate": 6.466607187052791e-05, + "loss": 1.016, + "step": 126910 + }, + { + "epoch": 0.8108557044836001, + "grad_norm": 1.188558578491211, + "learning_rate": 6.466127482018538e-05, + "loss": 1.0635, + "step": 126920 + }, + { + "epoch": 0.8109195916333388, + "grad_norm": 0.8027870059013367, + "learning_rate": 6.465647762219113e-05, + "loss": 0.7027, + "step": 126930 + }, + { + "epoch": 0.8109834787830775, + "grad_norm": 0.84566330909729, + "learning_rate": 6.465168027659347e-05, + "loss": 0.8105, + "step": 126940 + }, + { + "epoch": 0.8110473659328162, + "grad_norm": 0.6688374280929565, + "learning_rate": 6.46468827834407e-05, + "loss": 0.7993, + "step": 126950 + }, + { + "epoch": 0.8111112530825549, + "grad_norm": 0.7672613263130188, + "learning_rate": 6.464208514278117e-05, + "loss": 0.9798, + "step": 126960 + }, + { + "epoch": 0.8111751402322936, + "grad_norm": 1.0003461837768555, + "learning_rate": 6.463728735466316e-05, + "loss": 0.8659, + "step": 126970 + }, + { + "epoch": 0.8112390273820324, + "grad_norm": 1.0807254314422607, + "learning_rate": 6.4632489419135e-05, + "loss": 0.9423, + "step": 126980 + }, + { + "epoch": 0.8113029145317711, + "grad_norm": 0.6084434390068054, + "learning_rate": 6.462769133624502e-05, + "loss": 0.6477, + "step": 126990 + }, + { + "epoch": 0.8113668016815098, + "grad_norm": 0.7331100106239319, + "learning_rate": 6.462289310604152e-05, + "loss": 1.0194, + "step": 127000 + }, + { + "epoch": 0.8114306888312485, + "grad_norm": 1.0678889751434326, + "learning_rate": 6.461809472857287e-05, + "loss": 0.7349, + "step": 127010 + }, + { + "epoch": 0.8114945759809872, + "grad_norm": 1.4180760383605957, + "learning_rate": 6.461329620388733e-05, + "loss": 0.7278, + "step": 127020 + }, + { + "epoch": 0.8115584631307259, + "grad_norm": 0.904155433177948, + "learning_rate": 6.460849753203326e-05, + "loss": 0.8996, + "step": 127030 + }, + { + "epoch": 0.8116223502804646, + "grad_norm": 0.8179849982261658, + "learning_rate": 6.460369871305899e-05, + "loss": 0.8029, + "step": 
127040 + }, + { + "epoch": 0.8116862374302033, + "grad_norm": 0.8025046586990356, + "learning_rate": 6.459889974701284e-05, + "loss": 1.1434, + "step": 127050 + }, + { + "epoch": 0.811750124579942, + "grad_norm": 0.9315536022186279, + "learning_rate": 6.459410063394314e-05, + "loss": 0.7199, + "step": 127060 + }, + { + "epoch": 0.8118140117296807, + "grad_norm": 1.0310189723968506, + "learning_rate": 6.458930137389821e-05, + "loss": 0.8107, + "step": 127070 + }, + { + "epoch": 0.8118778988794194, + "grad_norm": 0.897158682346344, + "learning_rate": 6.45845019669264e-05, + "loss": 0.9007, + "step": 127080 + }, + { + "epoch": 0.8119417860291581, + "grad_norm": 0.8485071659088135, + "learning_rate": 6.457970241307603e-05, + "loss": 0.9143, + "step": 127090 + }, + { + "epoch": 0.8120056731788968, + "grad_norm": 1.0846539735794067, + "learning_rate": 6.457490271239546e-05, + "loss": 0.5329, + "step": 127100 + }, + { + "epoch": 0.8120695603286355, + "grad_norm": 1.0660455226898193, + "learning_rate": 6.457010286493299e-05, + "loss": 0.8025, + "step": 127110 + }, + { + "epoch": 0.8121334474783742, + "grad_norm": 0.7800552248954773, + "learning_rate": 6.4565302870737e-05, + "loss": 0.7744, + "step": 127120 + }, + { + "epoch": 0.8121973346281129, + "grad_norm": 1.9460397958755493, + "learning_rate": 6.45605027298558e-05, + "loss": 1.0076, + "step": 127130 + }, + { + "epoch": 0.8122612217778516, + "grad_norm": 0.7242342233657837, + "learning_rate": 6.455570244233774e-05, + "loss": 0.878, + "step": 127140 + }, + { + "epoch": 0.8123251089275904, + "grad_norm": 1.0195945501327515, + "learning_rate": 6.455090200823117e-05, + "loss": 1.0594, + "step": 127150 + }, + { + "epoch": 0.812388996077329, + "grad_norm": 0.4291139841079712, + "learning_rate": 6.454610142758442e-05, + "loss": 0.8271, + "step": 127160 + }, + { + "epoch": 0.8124528832270677, + "grad_norm": 0.8189600110054016, + "learning_rate": 6.454130070044584e-05, + "loss": 1.0931, + "step": 127170 + }, + { + "epoch": 0.8125167703768064, + "grad_norm": 0.6839133501052856, + "learning_rate": 6.45364998268638e-05, + "loss": 0.9161, + "step": 127180 + }, + { + "epoch": 0.8125806575265451, + "grad_norm": 0.835392951965332, + "learning_rate": 6.45316988068866e-05, + "loss": 0.9721, + "step": 127190 + }, + { + "epoch": 0.8126445446762838, + "grad_norm": 0.5460143089294434, + "learning_rate": 6.452689764056265e-05, + "loss": 0.9177, + "step": 127200 + }, + { + "epoch": 0.8127084318260225, + "grad_norm": 1.0494486093521118, + "learning_rate": 6.452209632794027e-05, + "loss": 0.8844, + "step": 127210 + }, + { + "epoch": 0.8127723189757612, + "grad_norm": 0.6247775554656982, + "learning_rate": 6.451729486906781e-05, + "loss": 0.9528, + "step": 127220 + }, + { + "epoch": 0.8128362061254999, + "grad_norm": 1.8002761602401733, + "learning_rate": 6.451249326399364e-05, + "loss": 1.1712, + "step": 127230 + }, + { + "epoch": 0.8129000932752386, + "grad_norm": 0.9478850960731506, + "learning_rate": 6.45076915127661e-05, + "loss": 0.8959, + "step": 127240 + }, + { + "epoch": 0.8129639804249773, + "grad_norm": 0.707378089427948, + "learning_rate": 6.450288961543355e-05, + "loss": 0.8968, + "step": 127250 + }, + { + "epoch": 0.813027867574716, + "grad_norm": 0.9674128890037537, + "learning_rate": 6.449808757204435e-05, + "loss": 0.862, + "step": 127260 + }, + { + "epoch": 0.8130917547244547, + "grad_norm": 1.1867669820785522, + "learning_rate": 6.449328538264687e-05, + "loss": 0.808, + "step": 127270 + }, + { + "epoch": 0.8131556418741934, + "grad_norm": 
1.1251099109649658, + "learning_rate": 6.448848304728949e-05, + "loss": 0.8379, + "step": 127280 + }, + { + "epoch": 0.8132195290239321, + "grad_norm": 0.891304612159729, + "learning_rate": 6.448368056602053e-05, + "loss": 0.9116, + "step": 127290 + }, + { + "epoch": 0.8132834161736708, + "grad_norm": 1.0595531463623047, + "learning_rate": 6.447887793888838e-05, + "loss": 0.8859, + "step": 127300 + }, + { + "epoch": 0.8133473033234095, + "grad_norm": 0.8898464441299438, + "learning_rate": 6.447407516594142e-05, + "loss": 0.982, + "step": 127310 + }, + { + "epoch": 0.8134111904731482, + "grad_norm": 2.1470937728881836, + "learning_rate": 6.446927224722799e-05, + "loss": 0.8127, + "step": 127320 + }, + { + "epoch": 0.813475077622887, + "grad_norm": 1.043031096458435, + "learning_rate": 6.446446918279647e-05, + "loss": 0.8647, + "step": 127330 + }, + { + "epoch": 0.8135389647726257, + "grad_norm": 0.8971779942512512, + "learning_rate": 6.445966597269522e-05, + "loss": 1.0, + "step": 127340 + }, + { + "epoch": 0.8136028519223644, + "grad_norm": 0.8842697739601135, + "learning_rate": 6.445486261697263e-05, + "loss": 0.8011, + "step": 127350 + }, + { + "epoch": 0.8136667390721031, + "grad_norm": 0.8753737211227417, + "learning_rate": 6.445005911567707e-05, + "loss": 0.9761, + "step": 127360 + }, + { + "epoch": 0.8137306262218418, + "grad_norm": 0.7797544598579407, + "learning_rate": 6.444525546885692e-05, + "loss": 0.7789, + "step": 127370 + }, + { + "epoch": 0.8137945133715805, + "grad_norm": 0.49460268020629883, + "learning_rate": 6.444045167656055e-05, + "loss": 0.836, + "step": 127380 + }, + { + "epoch": 0.8138584005213192, + "grad_norm": 1.005393385887146, + "learning_rate": 6.443564773883634e-05, + "loss": 0.8948, + "step": 127390 + }, + { + "epoch": 0.8139222876710579, + "grad_norm": 0.7220799922943115, + "learning_rate": 6.443084365573265e-05, + "loss": 0.8677, + "step": 127400 + }, + { + "epoch": 0.8139861748207965, + "grad_norm": 0.8531742691993713, + "learning_rate": 6.44260394272979e-05, + "loss": 0.7786, + "step": 127410 + }, + { + "epoch": 0.8140500619705352, + "grad_norm": 1.4867233037948608, + "learning_rate": 6.442123505358043e-05, + "loss": 0.8496, + "step": 127420 + }, + { + "epoch": 0.8141139491202739, + "grad_norm": 0.6640691161155701, + "learning_rate": 6.441643053462867e-05, + "loss": 0.847, + "step": 127430 + }, + { + "epoch": 0.8141778362700126, + "grad_norm": 0.5438361763954163, + "learning_rate": 6.441162587049096e-05, + "loss": 0.7101, + "step": 127440 + }, + { + "epoch": 0.8142417234197513, + "grad_norm": 0.879038393497467, + "learning_rate": 6.440682106121574e-05, + "loss": 0.9942, + "step": 127450 + }, + { + "epoch": 0.81430561056949, + "grad_norm": 0.6721540689468384, + "learning_rate": 6.440201610685135e-05, + "loss": 0.6765, + "step": 127460 + }, + { + "epoch": 0.8143694977192287, + "grad_norm": 0.627669095993042, + "learning_rate": 6.43972110074462e-05, + "loss": 0.8154, + "step": 127470 + }, + { + "epoch": 0.8144333848689674, + "grad_norm": 1.5187098979949951, + "learning_rate": 6.439240576304868e-05, + "loss": 1.1218, + "step": 127480 + }, + { + "epoch": 0.8144972720187061, + "grad_norm": 0.519985556602478, + "learning_rate": 6.438760037370719e-05, + "loss": 0.7047, + "step": 127490 + }, + { + "epoch": 0.8145611591684448, + "grad_norm": 0.7375752329826355, + "learning_rate": 6.43827948394701e-05, + "loss": 0.8807, + "step": 127500 + }, + { + "epoch": 0.8146250463181836, + "grad_norm": 0.5669057369232178, + "learning_rate": 6.437798916038584e-05, + "loss": 
0.8591, + "step": 127510 + }, + { + "epoch": 0.8146889334679223, + "grad_norm": 0.9515382051467896, + "learning_rate": 6.437318333650279e-05, + "loss": 0.8639, + "step": 127520 + }, + { + "epoch": 0.814752820617661, + "grad_norm": 0.9715726971626282, + "learning_rate": 6.436837736786934e-05, + "loss": 0.6827, + "step": 127530 + }, + { + "epoch": 0.8148167077673997, + "grad_norm": 1.2894679307937622, + "learning_rate": 6.43635712545339e-05, + "loss": 0.7838, + "step": 127540 + }, + { + "epoch": 0.8148805949171384, + "grad_norm": 0.9113032817840576, + "learning_rate": 6.43587649965449e-05, + "loss": 0.9437, + "step": 127550 + }, + { + "epoch": 0.8149444820668771, + "grad_norm": 0.8050090074539185, + "learning_rate": 6.435395859395068e-05, + "loss": 1.0804, + "step": 127560 + }, + { + "epoch": 0.8150083692166158, + "grad_norm": 1.1734331846237183, + "learning_rate": 6.434915204679969e-05, + "loss": 0.9166, + "step": 127570 + }, + { + "epoch": 0.8150722563663545, + "grad_norm": 1.3602896928787231, + "learning_rate": 6.434434535514031e-05, + "loss": 0.8164, + "step": 127580 + }, + { + "epoch": 0.8151361435160932, + "grad_norm": 0.9085065722465515, + "learning_rate": 6.433953851902097e-05, + "loss": 1.0613, + "step": 127590 + }, + { + "epoch": 0.8152000306658319, + "grad_norm": 0.8501441478729248, + "learning_rate": 6.433473153849007e-05, + "loss": 0.7548, + "step": 127600 + }, + { + "epoch": 0.8152639178155706, + "grad_norm": 1.4549124240875244, + "learning_rate": 6.432992441359605e-05, + "loss": 0.8768, + "step": 127610 + }, + { + "epoch": 0.8153278049653093, + "grad_norm": 0.9004676938056946, + "learning_rate": 6.432511714438727e-05, + "loss": 0.9546, + "step": 127620 + }, + { + "epoch": 0.815391692115048, + "grad_norm": 0.6986418962478638, + "learning_rate": 6.432030973091216e-05, + "loss": 0.8329, + "step": 127630 + }, + { + "epoch": 0.8154555792647867, + "grad_norm": 1.1968231201171875, + "learning_rate": 6.431550217321916e-05, + "loss": 0.8781, + "step": 127640 + }, + { + "epoch": 0.8155194664145253, + "grad_norm": 0.9904518723487854, + "learning_rate": 6.431069447135665e-05, + "loss": 0.7686, + "step": 127650 + }, + { + "epoch": 0.815583353564264, + "grad_norm": 0.846964955329895, + "learning_rate": 6.43058866253731e-05, + "loss": 0.7053, + "step": 127660 + }, + { + "epoch": 0.8156472407140027, + "grad_norm": 0.9893980026245117, + "learning_rate": 6.430107863531685e-05, + "loss": 0.7232, + "step": 127670 + }, + { + "epoch": 0.8157111278637414, + "grad_norm": 0.9716863632202148, + "learning_rate": 6.42962705012364e-05, + "loss": 0.9921, + "step": 127680 + }, + { + "epoch": 0.8157750150134802, + "grad_norm": 0.5748932361602783, + "learning_rate": 6.429146222318013e-05, + "loss": 0.7242, + "step": 127690 + }, + { + "epoch": 0.8158389021632189, + "grad_norm": 0.6904158592224121, + "learning_rate": 6.428665380119648e-05, + "loss": 1.1946, + "step": 127700 + }, + { + "epoch": 0.8159027893129576, + "grad_norm": 0.8382551074028015, + "learning_rate": 6.428184523533384e-05, + "loss": 0.9143, + "step": 127710 + }, + { + "epoch": 0.8159666764626963, + "grad_norm": 1.1233938932418823, + "learning_rate": 6.427703652564067e-05, + "loss": 1.239, + "step": 127720 + }, + { + "epoch": 0.816030563612435, + "grad_norm": 0.6471089720726013, + "learning_rate": 6.42722276721654e-05, + "loss": 0.9955, + "step": 127730 + }, + { + "epoch": 0.8160944507621737, + "grad_norm": 0.796449601650238, + "learning_rate": 6.426741867495645e-05, + "loss": 1.1798, + "step": 127740 + }, + { + "epoch": 0.8161583379119124, + 
"grad_norm": 1.216551423072815, + "learning_rate": 6.426260953406225e-05, + "loss": 0.9472, + "step": 127750 + }, + { + "epoch": 0.8162222250616511, + "grad_norm": 0.4935864508152008, + "learning_rate": 6.425780024953124e-05, + "loss": 1.0413, + "step": 127760 + }, + { + "epoch": 0.8162861122113898, + "grad_norm": 1.2737202644348145, + "learning_rate": 6.425299082141184e-05, + "loss": 1.1372, + "step": 127770 + }, + { + "epoch": 0.8163499993611285, + "grad_norm": 1.7472068071365356, + "learning_rate": 6.424818124975248e-05, + "loss": 0.7832, + "step": 127780 + }, + { + "epoch": 0.8164138865108672, + "grad_norm": 0.7582964897155762, + "learning_rate": 6.424337153460162e-05, + "loss": 0.8762, + "step": 127790 + }, + { + "epoch": 0.8164777736606059, + "grad_norm": 1.1670618057250977, + "learning_rate": 6.42385616760077e-05, + "loss": 1.2763, + "step": 127800 + }, + { + "epoch": 0.8165416608103446, + "grad_norm": 0.6307504773139954, + "learning_rate": 6.423375167401912e-05, + "loss": 0.6937, + "step": 127810 + }, + { + "epoch": 0.8166055479600833, + "grad_norm": 1.9756511449813843, + "learning_rate": 6.422894152868437e-05, + "loss": 0.7304, + "step": 127820 + }, + { + "epoch": 0.816669435109822, + "grad_norm": 1.0273828506469727, + "learning_rate": 6.422413124005185e-05, + "loss": 0.7999, + "step": 127830 + }, + { + "epoch": 0.8167333222595607, + "grad_norm": 1.1379588842391968, + "learning_rate": 6.421932080817003e-05, + "loss": 0.8498, + "step": 127840 + }, + { + "epoch": 0.8167972094092995, + "grad_norm": 0.7161405086517334, + "learning_rate": 6.421451023308735e-05, + "loss": 1.4902, + "step": 127850 + }, + { + "epoch": 0.8168610965590382, + "grad_norm": 1.0441093444824219, + "learning_rate": 6.420969951485225e-05, + "loss": 0.8677, + "step": 127860 + }, + { + "epoch": 0.8169249837087769, + "grad_norm": 0.8484379053115845, + "learning_rate": 6.42048886535132e-05, + "loss": 0.7921, + "step": 127870 + }, + { + "epoch": 0.8169888708585156, + "grad_norm": 0.8539422750473022, + "learning_rate": 6.420007764911861e-05, + "loss": 0.7991, + "step": 127880 + }, + { + "epoch": 0.8170527580082542, + "grad_norm": 0.5614151954650879, + "learning_rate": 6.419526650171697e-05, + "loss": 1.0511, + "step": 127890 + }, + { + "epoch": 0.8171166451579929, + "grad_norm": 0.9628438949584961, + "learning_rate": 6.41904552113567e-05, + "loss": 0.6749, + "step": 127900 + }, + { + "epoch": 0.8171805323077316, + "grad_norm": 0.8185387253761292, + "learning_rate": 6.418564377808627e-05, + "loss": 0.8436, + "step": 127910 + }, + { + "epoch": 0.8172444194574703, + "grad_norm": 0.6179929971694946, + "learning_rate": 6.418083220195414e-05, + "loss": 0.7563, + "step": 127920 + }, + { + "epoch": 0.817308306607209, + "grad_norm": 0.7823129892349243, + "learning_rate": 6.417602048300877e-05, + "loss": 0.8868, + "step": 127930 + }, + { + "epoch": 0.8173721937569477, + "grad_norm": 1.1083999872207642, + "learning_rate": 6.41712086212986e-05, + "loss": 0.8661, + "step": 127940 + }, + { + "epoch": 0.8174360809066864, + "grad_norm": 2.1160151958465576, + "learning_rate": 6.41663966168721e-05, + "loss": 0.8088, + "step": 127950 + }, + { + "epoch": 0.8174999680564251, + "grad_norm": 0.6990865468978882, + "learning_rate": 6.416158446977772e-05, + "loss": 1.0852, + "step": 127960 + }, + { + "epoch": 0.8175638552061638, + "grad_norm": 0.8294287919998169, + "learning_rate": 6.415677218006395e-05, + "loss": 0.8674, + "step": 127970 + }, + { + "epoch": 0.8176277423559025, + "grad_norm": 1.1241607666015625, + "learning_rate": 
6.415195974777923e-05, + "loss": 0.939, + "step": 127980 + }, + { + "epoch": 0.8176916295056412, + "grad_norm": 0.8454298377037048, + "learning_rate": 6.414714717297203e-05, + "loss": 0.8675, + "step": 127990 + }, + { + "epoch": 0.8177555166553799, + "grad_norm": 0.7560257315635681, + "learning_rate": 6.414233445569083e-05, + "loss": 1.0024, + "step": 128000 + }, + { + "epoch": 0.8178194038051186, + "grad_norm": 0.7482271790504456, + "learning_rate": 6.413752159598408e-05, + "loss": 0.8029, + "step": 128010 + }, + { + "epoch": 0.8178832909548573, + "grad_norm": 1.5129786729812622, + "learning_rate": 6.413270859390026e-05, + "loss": 1.0656, + "step": 128020 + }, + { + "epoch": 0.817947178104596, + "grad_norm": 1.2773970365524292, + "learning_rate": 6.412789544948782e-05, + "loss": 0.9819, + "step": 128030 + }, + { + "epoch": 0.8180110652543348, + "grad_norm": 0.8020282983779907, + "learning_rate": 6.41230821627953e-05, + "loss": 0.6803, + "step": 128040 + }, + { + "epoch": 0.8180749524040735, + "grad_norm": 1.2904086112976074, + "learning_rate": 6.411826873387108e-05, + "loss": 1.1785, + "step": 128050 + }, + { + "epoch": 0.8181388395538122, + "grad_norm": 0.6891657710075378, + "learning_rate": 6.41134551627637e-05, + "loss": 0.8503, + "step": 128060 + }, + { + "epoch": 0.8182027267035509, + "grad_norm": 0.9676710963249207, + "learning_rate": 6.41086414495216e-05, + "loss": 0.8886, + "step": 128070 + }, + { + "epoch": 0.8182666138532896, + "grad_norm": 0.881633460521698, + "learning_rate": 6.410382759419328e-05, + "loss": 0.8703, + "step": 128080 + }, + { + "epoch": 0.8183305010030283, + "grad_norm": 2.246070146560669, + "learning_rate": 6.409901359682722e-05, + "loss": 0.9637, + "step": 128090 + }, + { + "epoch": 0.818394388152767, + "grad_norm": 0.9771053194999695, + "learning_rate": 6.409419945747189e-05, + "loss": 0.8403, + "step": 128100 + }, + { + "epoch": 0.8184582753025057, + "grad_norm": 0.9186480641365051, + "learning_rate": 6.408938517617576e-05, + "loss": 0.8156, + "step": 128110 + }, + { + "epoch": 0.8185221624522444, + "grad_norm": 0.9104690551757812, + "learning_rate": 6.408457075298734e-05, + "loss": 0.9832, + "step": 128120 + }, + { + "epoch": 0.8185860496019831, + "grad_norm": 0.849088191986084, + "learning_rate": 6.407975618795514e-05, + "loss": 1.0383, + "step": 128130 + }, + { + "epoch": 0.8186499367517217, + "grad_norm": 0.667122483253479, + "learning_rate": 6.40749414811276e-05, + "loss": 1.0298, + "step": 128140 + }, + { + "epoch": 0.8187138239014604, + "grad_norm": 0.7279898524284363, + "learning_rate": 6.407012663255321e-05, + "loss": 0.8889, + "step": 128150 + }, + { + "epoch": 0.8187777110511991, + "grad_norm": 0.6628199219703674, + "learning_rate": 6.406531164228048e-05, + "loss": 1.0062, + "step": 128160 + }, + { + "epoch": 0.8188415982009378, + "grad_norm": 0.8573051691055298, + "learning_rate": 6.406049651035789e-05, + "loss": 0.8815, + "step": 128170 + }, + { + "epoch": 0.8189054853506765, + "grad_norm": 0.6094196438789368, + "learning_rate": 6.405568123683395e-05, + "loss": 1.0032, + "step": 128180 + }, + { + "epoch": 0.8189693725004152, + "grad_norm": 1.0880091190338135, + "learning_rate": 6.405086582175712e-05, + "loss": 0.7361, + "step": 128190 + }, + { + "epoch": 0.8190332596501539, + "grad_norm": 4.45365571975708, + "learning_rate": 6.404605026517592e-05, + "loss": 0.925, + "step": 128200 + }, + { + "epoch": 0.8190971467998927, + "grad_norm": 1.8221222162246704, + "learning_rate": 6.404123456713884e-05, + "loss": 0.8888, + "step": 128210 + }, + { + 
"epoch": 0.8191610339496314, + "grad_norm": 0.8215370774269104, + "learning_rate": 6.403641872769439e-05, + "loss": 0.806, + "step": 128220 + }, + { + "epoch": 0.8192249210993701, + "grad_norm": 0.7593247890472412, + "learning_rate": 6.403160274689107e-05, + "loss": 0.7808, + "step": 128230 + }, + { + "epoch": 0.8192888082491088, + "grad_norm": 0.8794440031051636, + "learning_rate": 6.402678662477735e-05, + "loss": 0.9681, + "step": 128240 + }, + { + "epoch": 0.8193526953988475, + "grad_norm": 0.9610484838485718, + "learning_rate": 6.402197036140176e-05, + "loss": 0.6683, + "step": 128250 + }, + { + "epoch": 0.8194165825485862, + "grad_norm": 1.84084153175354, + "learning_rate": 6.40171539568128e-05, + "loss": 0.8565, + "step": 128260 + }, + { + "epoch": 0.8194804696983249, + "grad_norm": 0.8622168898582458, + "learning_rate": 6.401233741105898e-05, + "loss": 0.8958, + "step": 128270 + }, + { + "epoch": 0.8195443568480636, + "grad_norm": 0.8887889981269836, + "learning_rate": 6.400752072418878e-05, + "loss": 0.6639, + "step": 128280 + }, + { + "epoch": 0.8196082439978023, + "grad_norm": 1.2179750204086304, + "learning_rate": 6.400270389625075e-05, + "loss": 0.8348, + "step": 128290 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.7495495080947876, + "learning_rate": 6.399788692729337e-05, + "loss": 0.732, + "step": 128300 + }, + { + "epoch": 0.8197360182972797, + "grad_norm": 0.9939058423042297, + "learning_rate": 6.399306981736515e-05, + "loss": 0.9292, + "step": 128310 + }, + { + "epoch": 0.8197999054470184, + "grad_norm": 1.0423085689544678, + "learning_rate": 6.398825256651463e-05, + "loss": 1.0162, + "step": 128320 + }, + { + "epoch": 0.8198637925967571, + "grad_norm": 1.109744668006897, + "learning_rate": 6.398343517479029e-05, + "loss": 0.8332, + "step": 128330 + }, + { + "epoch": 0.8199276797464958, + "grad_norm": 0.8410364389419556, + "learning_rate": 6.397861764224067e-05, + "loss": 0.6665, + "step": 128340 + }, + { + "epoch": 0.8199915668962345, + "grad_norm": 1.0010278224945068, + "learning_rate": 6.397379996891426e-05, + "loss": 0.8106, + "step": 128350 + }, + { + "epoch": 0.8200554540459732, + "grad_norm": 0.9419941902160645, + "learning_rate": 6.396898215485962e-05, + "loss": 0.768, + "step": 128360 + }, + { + "epoch": 0.820119341195712, + "grad_norm": 0.6813206076622009, + "learning_rate": 6.396416420012523e-05, + "loss": 0.7858, + "step": 128370 + }, + { + "epoch": 0.8201832283454505, + "grad_norm": 1.2011756896972656, + "learning_rate": 6.395934610475963e-05, + "loss": 0.8183, + "step": 128380 + }, + { + "epoch": 0.8202471154951892, + "grad_norm": 5.552373886108398, + "learning_rate": 6.395452786881133e-05, + "loss": 1.1759, + "step": 128390 + }, + { + "epoch": 0.820311002644928, + "grad_norm": 0.658540666103363, + "learning_rate": 6.394970949232887e-05, + "loss": 0.7854, + "step": 128400 + }, + { + "epoch": 0.8203748897946667, + "grad_norm": 1.2087280750274658, + "learning_rate": 6.394489097536076e-05, + "loss": 0.9018, + "step": 128410 + }, + { + "epoch": 0.8204387769444054, + "grad_norm": 0.8210545182228088, + "learning_rate": 6.394007231795554e-05, + "loss": 0.809, + "step": 128420 + }, + { + "epoch": 0.8205026640941441, + "grad_norm": 1.1109236478805542, + "learning_rate": 6.393525352016174e-05, + "loss": 0.8139, + "step": 128430 + }, + { + "epoch": 0.8205665512438828, + "grad_norm": 0.5671748518943787, + "learning_rate": 6.393043458202787e-05, + "loss": 0.7467, + "step": 128440 + }, + { + "epoch": 0.8206304383936215, + "grad_norm": 0.7894589304924011, + 
"learning_rate": 6.392561550360247e-05, + "loss": 0.7735, + "step": 128450 + }, + { + "epoch": 0.8206943255433602, + "grad_norm": 1.4838154315948486, + "learning_rate": 6.392079628493407e-05, + "loss": 0.8314, + "step": 128460 + }, + { + "epoch": 0.8207582126930989, + "grad_norm": 0.8588756918907166, + "learning_rate": 6.391597692607121e-05, + "loss": 0.9544, + "step": 128470 + }, + { + "epoch": 0.8208220998428376, + "grad_norm": 1.500656247138977, + "learning_rate": 6.391115742706243e-05, + "loss": 0.7166, + "step": 128480 + }, + { + "epoch": 0.8208859869925763, + "grad_norm": 0.9322123527526855, + "learning_rate": 6.390633778795626e-05, + "loss": 0.8543, + "step": 128490 + }, + { + "epoch": 0.820949874142315, + "grad_norm": 0.8133841753005981, + "learning_rate": 6.390151800880124e-05, + "loss": 0.935, + "step": 128500 + }, + { + "epoch": 0.8210137612920537, + "grad_norm": 1.0100511312484741, + "learning_rate": 6.38966980896459e-05, + "loss": 0.7153, + "step": 128510 + }, + { + "epoch": 0.8210776484417924, + "grad_norm": 1.2132251262664795, + "learning_rate": 6.38918780305388e-05, + "loss": 0.8784, + "step": 128520 + }, + { + "epoch": 0.8211415355915311, + "grad_norm": 0.8456812500953674, + "learning_rate": 6.388705783152846e-05, + "loss": 1.0612, + "step": 128530 + }, + { + "epoch": 0.8212054227412698, + "grad_norm": 0.9363921284675598, + "learning_rate": 6.388223749266344e-05, + "loss": 1.0582, + "step": 128540 + }, + { + "epoch": 0.8212693098910085, + "grad_norm": 2.2319607734680176, + "learning_rate": 6.387741701399228e-05, + "loss": 0.717, + "step": 128550 + }, + { + "epoch": 0.8213331970407473, + "grad_norm": 1.1460880041122437, + "learning_rate": 6.387259639556352e-05, + "loss": 0.8053, + "step": 128560 + }, + { + "epoch": 0.821397084190486, + "grad_norm": 0.7786633372306824, + "learning_rate": 6.386777563742571e-05, + "loss": 0.9295, + "step": 128570 + }, + { + "epoch": 0.8214609713402247, + "grad_norm": 0.8172239661216736, + "learning_rate": 6.38629547396274e-05, + "loss": 0.8758, + "step": 128580 + }, + { + "epoch": 0.8215248584899634, + "grad_norm": 0.6958255171775818, + "learning_rate": 6.385813370221716e-05, + "loss": 0.8538, + "step": 128590 + }, + { + "epoch": 0.8215887456397021, + "grad_norm": 0.9050196409225464, + "learning_rate": 6.38533125252435e-05, + "loss": 0.8245, + "step": 128600 + }, + { + "epoch": 0.8216526327894408, + "grad_norm": 0.9731044769287109, + "learning_rate": 6.384849120875502e-05, + "loss": 0.7545, + "step": 128610 + }, + { + "epoch": 0.8217165199391794, + "grad_norm": 1.5019901990890503, + "learning_rate": 6.384366975280024e-05, + "loss": 0.8528, + "step": 128620 + }, + { + "epoch": 0.8217804070889181, + "grad_norm": 0.9366435408592224, + "learning_rate": 6.383884815742772e-05, + "loss": 0.8599, + "step": 128630 + }, + { + "epoch": 0.8218442942386568, + "grad_norm": 0.6195958852767944, + "learning_rate": 6.383402642268603e-05, + "loss": 0.8433, + "step": 128640 + }, + { + "epoch": 0.8219081813883955, + "grad_norm": 0.6187912225723267, + "learning_rate": 6.382920454862374e-05, + "loss": 0.9079, + "step": 128650 + }, + { + "epoch": 0.8219720685381342, + "grad_norm": 1.1059194803237915, + "learning_rate": 6.382438253528939e-05, + "loss": 0.7321, + "step": 128660 + }, + { + "epoch": 0.8220359556878729, + "grad_norm": 0.8085727095603943, + "learning_rate": 6.381956038273156e-05, + "loss": 0.9373, + "step": 128670 + }, + { + "epoch": 0.8220998428376116, + "grad_norm": 2.0732922554016113, + "learning_rate": 6.381473809099878e-05, + "loss": 0.8538, + "step": 
128680 + }, + { + "epoch": 0.8221637299873503, + "grad_norm": 0.6032936573028564, + "learning_rate": 6.380991566013966e-05, + "loss": 0.7414, + "step": 128690 + }, + { + "epoch": 0.822227617137089, + "grad_norm": 1.1853028535842896, + "learning_rate": 6.380509309020272e-05, + "loss": 0.8288, + "step": 128700 + }, + { + "epoch": 0.8222915042868277, + "grad_norm": 0.6363354325294495, + "learning_rate": 6.380027038123654e-05, + "loss": 0.8664, + "step": 128710 + }, + { + "epoch": 0.8223553914365664, + "grad_norm": 1.0098567008972168, + "learning_rate": 6.379544753328973e-05, + "loss": 1.0163, + "step": 128720 + }, + { + "epoch": 0.8224192785863051, + "grad_norm": 2.520233631134033, + "learning_rate": 6.379062454641081e-05, + "loss": 0.7209, + "step": 128730 + }, + { + "epoch": 0.8224831657360439, + "grad_norm": 1.0344457626342773, + "learning_rate": 6.378580142064838e-05, + "loss": 0.8217, + "step": 128740 + }, + { + "epoch": 0.8225470528857826, + "grad_norm": 0.8147494792938232, + "learning_rate": 6.378097815605099e-05, + "loss": 0.7642, + "step": 128750 + }, + { + "epoch": 0.8226109400355213, + "grad_norm": 1.220113754272461, + "learning_rate": 6.377615475266724e-05, + "loss": 0.9745, + "step": 128760 + }, + { + "epoch": 0.82267482718526, + "grad_norm": 0.7933838367462158, + "learning_rate": 6.377133121054571e-05, + "loss": 0.6175, + "step": 128770 + }, + { + "epoch": 0.8227387143349987, + "grad_norm": 1.4493178129196167, + "learning_rate": 6.376650752973493e-05, + "loss": 0.8895, + "step": 128780 + }, + { + "epoch": 0.8228026014847374, + "grad_norm": 0.9888495802879333, + "learning_rate": 6.376168371028351e-05, + "loss": 0.9483, + "step": 128790 + }, + { + "epoch": 0.8228664886344761, + "grad_norm": 1.066873550415039, + "learning_rate": 6.375685975224004e-05, + "loss": 0.9853, + "step": 128800 + }, + { + "epoch": 0.8229303757842148, + "grad_norm": 0.903079092502594, + "learning_rate": 6.375203565565308e-05, + "loss": 0.9505, + "step": 128810 + }, + { + "epoch": 0.8229942629339535, + "grad_norm": 1.5976263284683228, + "learning_rate": 6.374721142057125e-05, + "loss": 0.9695, + "step": 128820 + }, + { + "epoch": 0.8230581500836922, + "grad_norm": 0.6900495886802673, + "learning_rate": 6.374238704704308e-05, + "loss": 1.0342, + "step": 128830 + }, + { + "epoch": 0.8231220372334309, + "grad_norm": 0.9816431403160095, + "learning_rate": 6.37375625351172e-05, + "loss": 0.9036, + "step": 128840 + }, + { + "epoch": 0.8231859243831696, + "grad_norm": 1.096856713294983, + "learning_rate": 6.373273788484217e-05, + "loss": 0.7925, + "step": 128850 + }, + { + "epoch": 0.8232498115329082, + "grad_norm": 0.6943714022636414, + "learning_rate": 6.37279130962666e-05, + "loss": 0.836, + "step": 128860 + }, + { + "epoch": 0.8233136986826469, + "grad_norm": 1.0958727598190308, + "learning_rate": 6.372308816943908e-05, + "loss": 0.8033, + "step": 128870 + }, + { + "epoch": 0.8233775858323856, + "grad_norm": 0.9813776016235352, + "learning_rate": 6.371826310440816e-05, + "loss": 1.1422, + "step": 128880 + }, + { + "epoch": 0.8234414729821243, + "grad_norm": 1.0429863929748535, + "learning_rate": 6.371343790122249e-05, + "loss": 0.9248, + "step": 128890 + }, + { + "epoch": 0.823505360131863, + "grad_norm": 1.0575244426727295, + "learning_rate": 6.370861255993062e-05, + "loss": 1.0187, + "step": 128900 + }, + { + "epoch": 0.8235692472816017, + "grad_norm": 1.6961395740509033, + "learning_rate": 6.370378708058115e-05, + "loss": 0.7942, + "step": 128910 + }, + { + "epoch": 0.8236331344313405, + "grad_norm": 
1.0016497373580933, + "learning_rate": 6.36989614632227e-05, + "loss": 0.8602, + "step": 128920 + }, + { + "epoch": 0.8236970215810792, + "grad_norm": 1.1897591352462769, + "learning_rate": 6.369413570790386e-05, + "loss": 0.8869, + "step": 128930 + }, + { + "epoch": 0.8237609087308179, + "grad_norm": 1.0436471700668335, + "learning_rate": 6.368930981467323e-05, + "loss": 0.9429, + "step": 128940 + }, + { + "epoch": 0.8238247958805566, + "grad_norm": 0.9144713878631592, + "learning_rate": 6.368448378357941e-05, + "loss": 0.7866, + "step": 128950 + }, + { + "epoch": 0.8238886830302953, + "grad_norm": 1.058947205543518, + "learning_rate": 6.367965761467098e-05, + "loss": 0.8541, + "step": 128960 + }, + { + "epoch": 0.823952570180034, + "grad_norm": 0.9938645958900452, + "learning_rate": 6.367483130799659e-05, + "loss": 0.9613, + "step": 128970 + }, + { + "epoch": 0.8240164573297727, + "grad_norm": 0.8759385943412781, + "learning_rate": 6.36700048636048e-05, + "loss": 1.0366, + "step": 128980 + }, + { + "epoch": 0.8240803444795114, + "grad_norm": 0.4528246223926544, + "learning_rate": 6.366517828154424e-05, + "loss": 1.0067, + "step": 128990 + }, + { + "epoch": 0.8241442316292501, + "grad_norm": 0.7166324257850647, + "learning_rate": 6.36603515618635e-05, + "loss": 0.8826, + "step": 129000 + }, + { + "epoch": 0.8242081187789888, + "grad_norm": 0.7682775259017944, + "learning_rate": 6.365552470461122e-05, + "loss": 0.8674, + "step": 129010 + }, + { + "epoch": 0.8242720059287275, + "grad_norm": 1.0459059476852417, + "learning_rate": 6.3650697709836e-05, + "loss": 0.7448, + "step": 129020 + }, + { + "epoch": 0.8243358930784662, + "grad_norm": 0.9293308854103088, + "learning_rate": 6.364587057758642e-05, + "loss": 1.0024, + "step": 129030 + }, + { + "epoch": 0.8243997802282049, + "grad_norm": 0.9621434807777405, + "learning_rate": 6.364104330791113e-05, + "loss": 0.853, + "step": 129040 + }, + { + "epoch": 0.8244636673779436, + "grad_norm": 0.5361142158508301, + "learning_rate": 6.363621590085873e-05, + "loss": 0.904, + "step": 129050 + }, + { + "epoch": 0.8245275545276823, + "grad_norm": 1.060625672340393, + "learning_rate": 6.363138835647784e-05, + "loss": 0.9679, + "step": 129060 + }, + { + "epoch": 0.824591441677421, + "grad_norm": 0.9136219024658203, + "learning_rate": 6.362656067481708e-05, + "loss": 1.0068, + "step": 129070 + }, + { + "epoch": 0.8246553288271598, + "grad_norm": 0.9109070301055908, + "learning_rate": 6.362173285592507e-05, + "loss": 0.7127, + "step": 129080 + }, + { + "epoch": 0.8247192159768985, + "grad_norm": 0.6838903427124023, + "learning_rate": 6.361690489985041e-05, + "loss": 0.806, + "step": 129090 + }, + { + "epoch": 0.8247831031266372, + "grad_norm": 0.5887237787246704, + "learning_rate": 6.361207680664174e-05, + "loss": 0.6983, + "step": 129100 + }, + { + "epoch": 0.8248469902763758, + "grad_norm": 1.1329760551452637, + "learning_rate": 6.36072485763477e-05, + "loss": 1.0387, + "step": 129110 + }, + { + "epoch": 0.8249108774261145, + "grad_norm": 1.2478314638137817, + "learning_rate": 6.360242020901688e-05, + "loss": 0.8467, + "step": 129120 + }, + { + "epoch": 0.8249747645758532, + "grad_norm": 1.1845048666000366, + "learning_rate": 6.359759170469791e-05, + "loss": 0.9406, + "step": 129130 + }, + { + "epoch": 0.8250386517255919, + "grad_norm": 0.9721893668174744, + "learning_rate": 6.359276306343944e-05, + "loss": 0.9239, + "step": 129140 + }, + { + "epoch": 0.8251025388753306, + "grad_norm": 0.7293797135353088, + "learning_rate": 6.358793428529008e-05, + "loss": 
1.0397, + "step": 129150 + }, + { + "epoch": 0.8251664260250693, + "grad_norm": 1.1368262767791748, + "learning_rate": 6.358310537029847e-05, + "loss": 0.7218, + "step": 129160 + }, + { + "epoch": 0.825230313174808, + "grad_norm": 1.327905535697937, + "learning_rate": 6.357827631851324e-05, + "loss": 0.716, + "step": 129170 + }, + { + "epoch": 0.8252942003245467, + "grad_norm": 1.3848083019256592, + "learning_rate": 6.357344712998302e-05, + "loss": 0.8165, + "step": 129180 + }, + { + "epoch": 0.8253580874742854, + "grad_norm": 0.8067914843559265, + "learning_rate": 6.356861780475645e-05, + "loss": 0.8678, + "step": 129190 + }, + { + "epoch": 0.8254219746240241, + "grad_norm": 0.8428422212600708, + "learning_rate": 6.356378834288216e-05, + "loss": 0.9308, + "step": 129200 + }, + { + "epoch": 0.8254858617737628, + "grad_norm": 0.8563640713691711, + "learning_rate": 6.355895874440878e-05, + "loss": 1.295, + "step": 129210 + }, + { + "epoch": 0.8255497489235015, + "grad_norm": 0.9378432035446167, + "learning_rate": 6.355412900938496e-05, + "loss": 1.1254, + "step": 129220 + }, + { + "epoch": 0.8256136360732402, + "grad_norm": 1.2021596431732178, + "learning_rate": 6.354929913785932e-05, + "loss": 1.1068, + "step": 129230 + }, + { + "epoch": 0.8256775232229789, + "grad_norm": 0.6594062447547913, + "learning_rate": 6.354446912988053e-05, + "loss": 0.7613, + "step": 129240 + }, + { + "epoch": 0.8257414103727176, + "grad_norm": 0.6651197671890259, + "learning_rate": 6.353963898549723e-05, + "loss": 0.684, + "step": 129250 + }, + { + "epoch": 0.8258052975224563, + "grad_norm": 0.7577025890350342, + "learning_rate": 6.353480870475805e-05, + "loss": 0.639, + "step": 129260 + }, + { + "epoch": 0.8258691846721951, + "grad_norm": 1.147277593612671, + "learning_rate": 6.352997828771162e-05, + "loss": 1.1541, + "step": 129270 + }, + { + "epoch": 0.8259330718219338, + "grad_norm": 0.7437235116958618, + "learning_rate": 6.35251477344066e-05, + "loss": 0.9665, + "step": 129280 + }, + { + "epoch": 0.8259969589716725, + "grad_norm": 1.1867269277572632, + "learning_rate": 6.352031704489166e-05, + "loss": 1.0024, + "step": 129290 + }, + { + "epoch": 0.8260608461214112, + "grad_norm": 0.9756374359130859, + "learning_rate": 6.351548621921542e-05, + "loss": 0.9174, + "step": 129300 + }, + { + "epoch": 0.8261247332711499, + "grad_norm": 0.6619752049446106, + "learning_rate": 6.351065525742655e-05, + "loss": 1.0489, + "step": 129310 + }, + { + "epoch": 0.8261886204208886, + "grad_norm": 1.198135256767273, + "learning_rate": 6.350582415957367e-05, + "loss": 0.8207, + "step": 129320 + }, + { + "epoch": 0.8262525075706273, + "grad_norm": 1.183821678161621, + "learning_rate": 6.350099292570547e-05, + "loss": 1.1444, + "step": 129330 + }, + { + "epoch": 0.826316394720366, + "grad_norm": 0.8684186935424805, + "learning_rate": 6.349616155587059e-05, + "loss": 0.7594, + "step": 129340 + }, + { + "epoch": 0.8263802818701046, + "grad_norm": 0.7701572775840759, + "learning_rate": 6.34913300501177e-05, + "loss": 0.7771, + "step": 129350 + }, + { + "epoch": 0.8264441690198433, + "grad_norm": 0.8787345290184021, + "learning_rate": 6.348649840849543e-05, + "loss": 0.9283, + "step": 129360 + }, + { + "epoch": 0.826508056169582, + "grad_norm": 0.7705711126327515, + "learning_rate": 6.348166663105247e-05, + "loss": 0.908, + "step": 129370 + }, + { + "epoch": 0.8265719433193207, + "grad_norm": 0.7562994956970215, + "learning_rate": 6.347683471783744e-05, + "loss": 0.8175, + "step": 129380 + }, + { + "epoch": 0.8266358304690594, + 
"grad_norm": 0.6645038723945618, + "learning_rate": 6.347200266889904e-05, + "loss": 1.1337, + "step": 129390 + }, + { + "epoch": 0.8266997176187981, + "grad_norm": 0.8476589322090149, + "learning_rate": 6.346717048428592e-05, + "loss": 0.8446, + "step": 129400 + }, + { + "epoch": 0.8267636047685368, + "grad_norm": 0.7244347333908081, + "learning_rate": 6.346233816404674e-05, + "loss": 0.8702, + "step": 129410 + }, + { + "epoch": 0.8268274919182755, + "grad_norm": 1.6287295818328857, + "learning_rate": 6.345750570823017e-05, + "loss": 0.7862, + "step": 129420 + }, + { + "epoch": 0.8268913790680142, + "grad_norm": 0.9641634821891785, + "learning_rate": 6.345267311688486e-05, + "loss": 0.8193, + "step": 129430 + }, + { + "epoch": 0.826955266217753, + "grad_norm": 0.7857980728149414, + "learning_rate": 6.344784039005951e-05, + "loss": 1.0366, + "step": 129440 + }, + { + "epoch": 0.8270191533674917, + "grad_norm": 1.0319744348526, + "learning_rate": 6.344300752780277e-05, + "loss": 0.72, + "step": 129450 + }, + { + "epoch": 0.8270830405172304, + "grad_norm": 1.3282662630081177, + "learning_rate": 6.343817453016332e-05, + "loss": 0.7655, + "step": 129460 + }, + { + "epoch": 0.8271469276669691, + "grad_norm": 0.7755960822105408, + "learning_rate": 6.343334139718982e-05, + "loss": 0.7769, + "step": 129470 + }, + { + "epoch": 0.8272108148167078, + "grad_norm": 1.4779821634292603, + "learning_rate": 6.342850812893094e-05, + "loss": 0.7363, + "step": 129480 + }, + { + "epoch": 0.8272747019664465, + "grad_norm": 0.5803090929985046, + "learning_rate": 6.342367472543537e-05, + "loss": 0.7171, + "step": 129490 + }, + { + "epoch": 0.8273385891161852, + "grad_norm": 0.5924243330955505, + "learning_rate": 6.34188411867518e-05, + "loss": 0.8442, + "step": 129500 + }, + { + "epoch": 0.8274024762659239, + "grad_norm": 1.4736064672470093, + "learning_rate": 6.341400751292888e-05, + "loss": 0.8249, + "step": 129510 + }, + { + "epoch": 0.8274663634156626, + "grad_norm": 0.9478740692138672, + "learning_rate": 6.34091737040153e-05, + "loss": 0.9925, + "step": 129520 + }, + { + "epoch": 0.8275302505654013, + "grad_norm": 1.2648018598556519, + "learning_rate": 6.340433976005975e-05, + "loss": 0.8581, + "step": 129530 + }, + { + "epoch": 0.82759413771514, + "grad_norm": 0.4751732349395752, + "learning_rate": 6.339950568111088e-05, + "loss": 0.8622, + "step": 129540 + }, + { + "epoch": 0.8276580248648787, + "grad_norm": 0.8093522191047668, + "learning_rate": 6.339467146721741e-05, + "loss": 0.9958, + "step": 129550 + }, + { + "epoch": 0.8277219120146174, + "grad_norm": 0.8439111709594727, + "learning_rate": 6.3389837118428e-05, + "loss": 0.9307, + "step": 129560 + }, + { + "epoch": 0.8277857991643561, + "grad_norm": 0.615764319896698, + "learning_rate": 6.338500263479136e-05, + "loss": 0.9045, + "step": 129570 + }, + { + "epoch": 0.8278496863140948, + "grad_norm": 0.8498043417930603, + "learning_rate": 6.338016801635615e-05, + "loss": 0.9691, + "step": 129580 + }, + { + "epoch": 0.8279135734638334, + "grad_norm": 1.2179980278015137, + "learning_rate": 6.337533326317108e-05, + "loss": 0.6736, + "step": 129590 + }, + { + "epoch": 0.8279774606135721, + "grad_norm": 0.5712013244628906, + "learning_rate": 6.337049837528483e-05, + "loss": 0.8732, + "step": 129600 + }, + { + "epoch": 0.8280413477633108, + "grad_norm": 1.027066946029663, + "learning_rate": 6.336566335274609e-05, + "loss": 0.8487, + "step": 129610 + }, + { + "epoch": 0.8281052349130495, + "grad_norm": 0.8761221170425415, + "learning_rate": 
6.336082819560357e-05, + "loss": 0.8251, + "step": 129620 + }, + { + "epoch": 0.8281691220627883, + "grad_norm": 0.9286890625953674, + "learning_rate": 6.335599290390595e-05, + "loss": 0.8774, + "step": 129630 + }, + { + "epoch": 0.828233009212527, + "grad_norm": 1.0241472721099854, + "learning_rate": 6.335115747770192e-05, + "loss": 1.0086, + "step": 129640 + }, + { + "epoch": 0.8282968963622657, + "grad_norm": 2.582179069519043, + "learning_rate": 6.334632191704018e-05, + "loss": 1.0137, + "step": 129650 + }, + { + "epoch": 0.8283607835120044, + "grad_norm": 0.8017721176147461, + "learning_rate": 6.334148622196945e-05, + "loss": 0.8961, + "step": 129660 + }, + { + "epoch": 0.8284246706617431, + "grad_norm": 1.0686819553375244, + "learning_rate": 6.33366503925384e-05, + "loss": 1.0203, + "step": 129670 + }, + { + "epoch": 0.8284885578114818, + "grad_norm": 0.6839226484298706, + "learning_rate": 6.333181442879573e-05, + "loss": 1.018, + "step": 129680 + }, + { + "epoch": 0.8285524449612205, + "grad_norm": 1.2881569862365723, + "learning_rate": 6.332697833079017e-05, + "loss": 0.7736, + "step": 129690 + }, + { + "epoch": 0.8286163321109592, + "grad_norm": 1.0879745483398438, + "learning_rate": 6.33221420985704e-05, + "loss": 0.8662, + "step": 129700 + }, + { + "epoch": 0.8286802192606979, + "grad_norm": 0.7041977643966675, + "learning_rate": 6.331730573218514e-05, + "loss": 0.6425, + "step": 129710 + }, + { + "epoch": 0.8287441064104366, + "grad_norm": 1.2785156965255737, + "learning_rate": 6.33124692316831e-05, + "loss": 1.1758, + "step": 129720 + }, + { + "epoch": 0.8288079935601753, + "grad_norm": 1.3215516805648804, + "learning_rate": 6.330763259711295e-05, + "loss": 0.901, + "step": 129730 + }, + { + "epoch": 0.828871880709914, + "grad_norm": 0.8521394729614258, + "learning_rate": 6.330279582852347e-05, + "loss": 0.9565, + "step": 129740 + }, + { + "epoch": 0.8289357678596527, + "grad_norm": 0.8731998205184937, + "learning_rate": 6.32979589259633e-05, + "loss": 0.9697, + "step": 129750 + }, + { + "epoch": 0.8289996550093914, + "grad_norm": 0.8813868761062622, + "learning_rate": 6.329312188948118e-05, + "loss": 1.3262, + "step": 129760 + }, + { + "epoch": 0.8290635421591301, + "grad_norm": 0.9669440984725952, + "learning_rate": 6.328828471912582e-05, + "loss": 1.0292, + "step": 129770 + }, + { + "epoch": 0.8291274293088688, + "grad_norm": 1.7891266345977783, + "learning_rate": 6.328344741494594e-05, + "loss": 1.0843, + "step": 129780 + }, + { + "epoch": 0.8291913164586076, + "grad_norm": 2.168109655380249, + "learning_rate": 6.327860997699025e-05, + "loss": 0.9686, + "step": 129790 + }, + { + "epoch": 0.8292552036083463, + "grad_norm": 1.232346773147583, + "learning_rate": 6.327377240530747e-05, + "loss": 0.8645, + "step": 129800 + }, + { + "epoch": 0.829319090758085, + "grad_norm": 0.9927831292152405, + "learning_rate": 6.326893469994633e-05, + "loss": 1.1181, + "step": 129810 + }, + { + "epoch": 0.8293829779078237, + "grad_norm": 0.6466804146766663, + "learning_rate": 6.326409686095553e-05, + "loss": 0.9161, + "step": 129820 + }, + { + "epoch": 0.8294468650575624, + "grad_norm": 1.5198026895523071, + "learning_rate": 6.325925888838379e-05, + "loss": 1.1528, + "step": 129830 + }, + { + "epoch": 0.829510752207301, + "grad_norm": 0.6202889084815979, + "learning_rate": 6.325442078227986e-05, + "loss": 0.9987, + "step": 129840 + }, + { + "epoch": 0.8295746393570397, + "grad_norm": 0.7182409167289734, + "learning_rate": 6.324958254269243e-05, + "loss": 0.9878, + "step": 129850 + }, + { + 
"epoch": 0.8296385265067784, + "grad_norm": 0.626463770866394, + "learning_rate": 6.324474416967024e-05, + "loss": 1.0748, + "step": 129860 + }, + { + "epoch": 0.8297024136565171, + "grad_norm": 1.071711540222168, + "learning_rate": 6.323990566326203e-05, + "loss": 1.0839, + "step": 129870 + }, + { + "epoch": 0.8297663008062558, + "grad_norm": 0.5460740327835083, + "learning_rate": 6.323506702351651e-05, + "loss": 1.0518, + "step": 129880 + }, + { + "epoch": 0.8298301879559945, + "grad_norm": 0.813046395778656, + "learning_rate": 6.323022825048243e-05, + "loss": 1.0735, + "step": 129890 + }, + { + "epoch": 0.8298940751057332, + "grad_norm": 1.204576849937439, + "learning_rate": 6.322538934420849e-05, + "loss": 0.9745, + "step": 129900 + }, + { + "epoch": 0.8299579622554719, + "grad_norm": 0.6870941519737244, + "learning_rate": 6.322055030474345e-05, + "loss": 0.7515, + "step": 129910 + }, + { + "epoch": 0.8300218494052106, + "grad_norm": 0.5987921357154846, + "learning_rate": 6.321571113213602e-05, + "loss": 0.8308, + "step": 129920 + }, + { + "epoch": 0.8300857365549493, + "grad_norm": 0.8666192889213562, + "learning_rate": 6.321087182643495e-05, + "loss": 1.0491, + "step": 129930 + }, + { + "epoch": 0.830149623704688, + "grad_norm": 0.699948251247406, + "learning_rate": 6.320603238768896e-05, + "loss": 0.8764, + "step": 129940 + }, + { + "epoch": 0.8302135108544267, + "grad_norm": 0.7432700991630554, + "learning_rate": 6.320119281594681e-05, + "loss": 0.7359, + "step": 129950 + }, + { + "epoch": 0.8302773980041654, + "grad_norm": 0.7470651268959045, + "learning_rate": 6.319635311125722e-05, + "loss": 0.9569, + "step": 129960 + }, + { + "epoch": 0.8303412851539042, + "grad_norm": 1.7274110317230225, + "learning_rate": 6.319151327366894e-05, + "loss": 0.6801, + "step": 129970 + }, + { + "epoch": 0.8304051723036429, + "grad_norm": 1.1418910026550293, + "learning_rate": 6.318667330323074e-05, + "loss": 0.9272, + "step": 129980 + }, + { + "epoch": 0.8304690594533816, + "grad_norm": 0.478630930185318, + "learning_rate": 6.31818331999913e-05, + "loss": 0.5905, + "step": 129990 + }, + { + "epoch": 0.8305329466031203, + "grad_norm": 1.0310248136520386, + "learning_rate": 6.317699296399939e-05, + "loss": 1.3088, + "step": 130000 + }, + { + "epoch": 0.830596833752859, + "grad_norm": 1.2773408889770508, + "learning_rate": 6.317215259530377e-05, + "loss": 0.7674, + "step": 130010 + }, + { + "epoch": 0.8306607209025977, + "grad_norm": 0.8546721339225769, + "learning_rate": 6.316731209395318e-05, + "loss": 0.8664, + "step": 130020 + }, + { + "epoch": 0.8307246080523364, + "grad_norm": 1.4127484560012817, + "learning_rate": 6.316247145999636e-05, + "loss": 0.7683, + "step": 130030 + }, + { + "epoch": 0.8307884952020751, + "grad_norm": 1.9064104557037354, + "learning_rate": 6.315763069348208e-05, + "loss": 0.865, + "step": 130040 + }, + { + "epoch": 0.8308523823518138, + "grad_norm": 1.1948989629745483, + "learning_rate": 6.315278979445906e-05, + "loss": 0.9118, + "step": 130050 + }, + { + "epoch": 0.8309162695015525, + "grad_norm": 1.745757818222046, + "learning_rate": 6.314794876297607e-05, + "loss": 0.8204, + "step": 130060 + }, + { + "epoch": 0.8309801566512912, + "grad_norm": 0.840900719165802, + "learning_rate": 6.314310759908187e-05, + "loss": 1.0959, + "step": 130070 + }, + { + "epoch": 0.8310440438010298, + "grad_norm": 1.8418869972229004, + "learning_rate": 6.313826630282521e-05, + "loss": 1.1114, + "step": 130080 + }, + { + "epoch": 0.8311079309507685, + "grad_norm": 1.024966835975647, + 
"learning_rate": 6.313342487425483e-05, + "loss": 0.7167, + "step": 130090 + }, + { + "epoch": 0.8311718181005072, + "grad_norm": 0.7477736473083496, + "learning_rate": 6.312858331341951e-05, + "loss": 0.971, + "step": 130100 + }, + { + "epoch": 0.8312357052502459, + "grad_norm": 0.9230465888977051, + "learning_rate": 6.312374162036798e-05, + "loss": 1.0544, + "step": 130110 + }, + { + "epoch": 0.8312995923999846, + "grad_norm": 0.986064612865448, + "learning_rate": 6.311889979514904e-05, + "loss": 1.332, + "step": 130120 + }, + { + "epoch": 0.8313634795497233, + "grad_norm": 0.894123375415802, + "learning_rate": 6.311405783781141e-05, + "loss": 1.1246, + "step": 130130 + }, + { + "epoch": 0.831427366699462, + "grad_norm": 0.7910020351409912, + "learning_rate": 6.310921574840389e-05, + "loss": 0.7583, + "step": 130140 + }, + { + "epoch": 0.8314912538492008, + "grad_norm": 1.064899206161499, + "learning_rate": 6.310437352697522e-05, + "loss": 0.7065, + "step": 130150 + }, + { + "epoch": 0.8315551409989395, + "grad_norm": 0.8578174710273743, + "learning_rate": 6.309953117357416e-05, + "loss": 0.7956, + "step": 130160 + }, + { + "epoch": 0.8316190281486782, + "grad_norm": 0.7391040325164795, + "learning_rate": 6.30946886882495e-05, + "loss": 0.801, + "step": 130170 + }, + { + "epoch": 0.8316829152984169, + "grad_norm": 0.6705387830734253, + "learning_rate": 6.308984607104999e-05, + "loss": 0.7235, + "step": 130180 + }, + { + "epoch": 0.8317468024481556, + "grad_norm": 0.5788788199424744, + "learning_rate": 6.308500332202443e-05, + "loss": 0.7199, + "step": 130190 + }, + { + "epoch": 0.8318106895978943, + "grad_norm": 0.9009729027748108, + "learning_rate": 6.308016044122153e-05, + "loss": 0.7121, + "step": 130200 + }, + { + "epoch": 0.831874576747633, + "grad_norm": 0.7175112962722778, + "learning_rate": 6.307531742869012e-05, + "loss": 0.7884, + "step": 130210 + }, + { + "epoch": 0.8319384638973717, + "grad_norm": 0.9665817022323608, + "learning_rate": 6.307047428447894e-05, + "loss": 0.9997, + "step": 130220 + }, + { + "epoch": 0.8320023510471104, + "grad_norm": 1.334688663482666, + "learning_rate": 6.306563100863679e-05, + "loss": 0.8198, + "step": 130230 + }, + { + "epoch": 0.8320662381968491, + "grad_norm": 0.5525422096252441, + "learning_rate": 6.306078760121243e-05, + "loss": 0.8491, + "step": 130240 + }, + { + "epoch": 0.8321301253465878, + "grad_norm": 0.7009544968605042, + "learning_rate": 6.305594406225464e-05, + "loss": 0.7649, + "step": 130250 + }, + { + "epoch": 0.8321940124963265, + "grad_norm": 1.0132756233215332, + "learning_rate": 6.305110039181219e-05, + "loss": 0.8507, + "step": 130260 + }, + { + "epoch": 0.8322578996460652, + "grad_norm": 1.0832351446151733, + "learning_rate": 6.304625658993388e-05, + "loss": 0.9625, + "step": 130270 + }, + { + "epoch": 0.8323217867958039, + "grad_norm": 1.351954698562622, + "learning_rate": 6.304141265666846e-05, + "loss": 1.1904, + "step": 130280 + }, + { + "epoch": 0.8323856739455426, + "grad_norm": 0.9040460586547852, + "learning_rate": 6.303656859206475e-05, + "loss": 1.0508, + "step": 130290 + }, + { + "epoch": 0.8324495610952813, + "grad_norm": 2.278919219970703, + "learning_rate": 6.30317243961715e-05, + "loss": 0.8939, + "step": 130300 + }, + { + "epoch": 0.83251344824502, + "grad_norm": 1.002038836479187, + "learning_rate": 6.302688006903753e-05, + "loss": 0.9378, + "step": 130310 + }, + { + "epoch": 0.8325773353947586, + "grad_norm": 1.055732250213623, + "learning_rate": 6.30220356107116e-05, + "loss": 1.185, + "step": 130320 + 
}, + { + "epoch": 0.8326412225444974, + "grad_norm": 0.8173093199729919, + "learning_rate": 6.301719102124251e-05, + "loss": 0.8708, + "step": 130330 + }, + { + "epoch": 0.8327051096942361, + "grad_norm": 0.8317503333091736, + "learning_rate": 6.301234630067902e-05, + "loss": 0.8327, + "step": 130340 + }, + { + "epoch": 0.8327689968439748, + "grad_norm": 0.7437106370925903, + "learning_rate": 6.300750144906997e-05, + "loss": 0.8685, + "step": 130350 + }, + { + "epoch": 0.8328328839937135, + "grad_norm": 0.8775268793106079, + "learning_rate": 6.300265646646413e-05, + "loss": 0.7828, + "step": 130360 + }, + { + "epoch": 0.8328967711434522, + "grad_norm": 0.8213568329811096, + "learning_rate": 6.299781135291028e-05, + "loss": 0.8267, + "step": 130370 + }, + { + "epoch": 0.8329606582931909, + "grad_norm": 0.6800155639648438, + "learning_rate": 6.299296610845721e-05, + "loss": 0.752, + "step": 130380 + }, + { + "epoch": 0.8330245454429296, + "grad_norm": 0.9931345582008362, + "learning_rate": 6.298812073315375e-05, + "loss": 0.7927, + "step": 130390 + }, + { + "epoch": 0.8330884325926683, + "grad_norm": 0.6396980881690979, + "learning_rate": 6.298327522704869e-05, + "loss": 0.8008, + "step": 130400 + }, + { + "epoch": 0.833152319742407, + "grad_norm": 1.1802109479904175, + "learning_rate": 6.29784295901908e-05, + "loss": 0.8763, + "step": 130410 + }, + { + "epoch": 0.8332162068921457, + "grad_norm": 0.64454185962677, + "learning_rate": 6.29735838226289e-05, + "loss": 0.8682, + "step": 130420 + }, + { + "epoch": 0.8332800940418844, + "grad_norm": 0.9595576524734497, + "learning_rate": 6.296873792441179e-05, + "loss": 0.7091, + "step": 130430 + }, + { + "epoch": 0.8333439811916231, + "grad_norm": 0.9316465854644775, + "learning_rate": 6.296389189558825e-05, + "loss": 0.9267, + "step": 130440 + }, + { + "epoch": 0.8334078683413618, + "grad_norm": 0.6795920133590698, + "learning_rate": 6.295904573620712e-05, + "loss": 0.9662, + "step": 130450 + }, + { + "epoch": 0.8334717554911005, + "grad_norm": 0.8197570443153381, + "learning_rate": 6.29541994463172e-05, + "loss": 0.9202, + "step": 130460 + }, + { + "epoch": 0.8335356426408392, + "grad_norm": 1.0630313158035278, + "learning_rate": 6.294935302596727e-05, + "loss": 1.089, + "step": 130470 + }, + { + "epoch": 0.8335995297905779, + "grad_norm": 1.3294517993927002, + "learning_rate": 6.294450647520616e-05, + "loss": 0.9093, + "step": 130480 + }, + { + "epoch": 0.8336634169403166, + "grad_norm": 1.1934936046600342, + "learning_rate": 6.293965979408267e-05, + "loss": 0.7484, + "step": 130490 + }, + { + "epoch": 0.8337273040900554, + "grad_norm": 0.4786630868911743, + "learning_rate": 6.29348129826456e-05, + "loss": 0.8751, + "step": 130500 + }, + { + "epoch": 0.8337911912397941, + "grad_norm": 1.0088316202163696, + "learning_rate": 6.292996604094378e-05, + "loss": 0.9007, + "step": 130510 + }, + { + "epoch": 0.8338550783895328, + "grad_norm": 0.8918789029121399, + "learning_rate": 6.292511896902602e-05, + "loss": 0.625, + "step": 130520 + }, + { + "epoch": 0.8339189655392715, + "grad_norm": 0.9836956262588501, + "learning_rate": 6.292027176694112e-05, + "loss": 1.0436, + "step": 130530 + }, + { + "epoch": 0.8339828526890102, + "grad_norm": 0.6663161516189575, + "learning_rate": 6.29154244347379e-05, + "loss": 1.0225, + "step": 130540 + }, + { + "epoch": 0.8340467398387489, + "grad_norm": 0.9562898278236389, + "learning_rate": 6.29105769724652e-05, + "loss": 1.0247, + "step": 130550 + }, + { + "epoch": 0.8341106269884875, + "grad_norm": 
0.8911415338516235, + "learning_rate": 6.29057293801718e-05, + "loss": 0.7822, + "step": 130560 + }, + { + "epoch": 0.8341745141382262, + "grad_norm": 0.6790681481361389, + "learning_rate": 6.290088165790658e-05, + "loss": 0.8417, + "step": 130570 + }, + { + "epoch": 0.8342384012879649, + "grad_norm": 0.6850691437721252, + "learning_rate": 6.289603380571828e-05, + "loss": 0.808, + "step": 130580 + }, + { + "epoch": 0.8343022884377036, + "grad_norm": 0.5608751773834229, + "learning_rate": 6.289118582365578e-05, + "loss": 0.9613, + "step": 130590 + }, + { + "epoch": 0.8343661755874423, + "grad_norm": 1.013020634651184, + "learning_rate": 6.288633771176789e-05, + "loss": 1.102, + "step": 130600 + }, + { + "epoch": 0.834430062737181, + "grad_norm": 1.1625406742095947, + "learning_rate": 6.288148947010342e-05, + "loss": 0.7877, + "step": 130610 + }, + { + "epoch": 0.8344939498869197, + "grad_norm": 1.0570250749588013, + "learning_rate": 6.287664109871121e-05, + "loss": 0.9154, + "step": 130620 + }, + { + "epoch": 0.8345578370366584, + "grad_norm": 0.5707595348358154, + "learning_rate": 6.287179259764008e-05, + "loss": 0.7431, + "step": 130630 + }, + { + "epoch": 0.8346217241863971, + "grad_norm": 1.400903344154358, + "learning_rate": 6.286694396693888e-05, + "loss": 0.648, + "step": 130640 + }, + { + "epoch": 0.8346856113361358, + "grad_norm": 0.6360903382301331, + "learning_rate": 6.286209520665641e-05, + "loss": 0.9326, + "step": 130650 + }, + { + "epoch": 0.8347494984858745, + "grad_norm": 1.020822286605835, + "learning_rate": 6.285724631684153e-05, + "loss": 0.9831, + "step": 130660 + }, + { + "epoch": 0.8348133856356132, + "grad_norm": 0.9532088041305542, + "learning_rate": 6.285239729754304e-05, + "loss": 0.9734, + "step": 130670 + }, + { + "epoch": 0.834877272785352, + "grad_norm": 1.0500184297561646, + "learning_rate": 6.284754814880979e-05, + "loss": 1.1216, + "step": 130680 + }, + { + "epoch": 0.8349411599350907, + "grad_norm": 0.7444097399711609, + "learning_rate": 6.284269887069061e-05, + "loss": 0.6931, + "step": 130690 + }, + { + "epoch": 0.8350050470848294, + "grad_norm": 1.272265076637268, + "learning_rate": 6.283784946323435e-05, + "loss": 0.9984, + "step": 130700 + }, + { + "epoch": 0.8350689342345681, + "grad_norm": 3.133178949356079, + "learning_rate": 6.283299992648985e-05, + "loss": 1.0528, + "step": 130710 + }, + { + "epoch": 0.8351328213843068, + "grad_norm": 0.9269975423812866, + "learning_rate": 6.282815026050593e-05, + "loss": 0.7731, + "step": 130720 + }, + { + "epoch": 0.8351967085340455, + "grad_norm": 1.0282938480377197, + "learning_rate": 6.282330046533144e-05, + "loss": 0.9324, + "step": 130730 + }, + { + "epoch": 0.8352605956837842, + "grad_norm": 0.8756586909294128, + "learning_rate": 6.281845054101522e-05, + "loss": 0.7738, + "step": 130740 + }, + { + "epoch": 0.8353244828335229, + "grad_norm": 0.8821702599525452, + "learning_rate": 6.28136004876061e-05, + "loss": 0.9636, + "step": 130750 + }, + { + "epoch": 0.8353883699832616, + "grad_norm": 0.8497743010520935, + "learning_rate": 6.280875030515295e-05, + "loss": 0.8596, + "step": 130760 + }, + { + "epoch": 0.8354522571330003, + "grad_norm": 0.6755285263061523, + "learning_rate": 6.28038999937046e-05, + "loss": 0.8027, + "step": 130770 + }, + { + "epoch": 0.835516144282739, + "grad_norm": 1.5908513069152832, + "learning_rate": 6.279904955330991e-05, + "loss": 1.274, + "step": 130780 + }, + { + "epoch": 0.8355800314324777, + "grad_norm": 0.8962165713310242, + "learning_rate": 6.279419898401772e-05, + "loss": 
0.8612, + "step": 130790 + }, + { + "epoch": 0.8356439185822164, + "grad_norm": 1.3172128200531006, + "learning_rate": 6.278934828587686e-05, + "loss": 0.8991, + "step": 130800 + }, + { + "epoch": 0.835707805731955, + "grad_norm": 0.7222731113433838, + "learning_rate": 6.278449745893621e-05, + "loss": 1.0712, + "step": 130810 + }, + { + "epoch": 0.8357716928816937, + "grad_norm": 0.9182828664779663, + "learning_rate": 6.27796465032446e-05, + "loss": 1.2312, + "step": 130820 + }, + { + "epoch": 0.8358355800314324, + "grad_norm": 0.9249597191810608, + "learning_rate": 6.277479541885091e-05, + "loss": 0.9577, + "step": 130830 + }, + { + "epoch": 0.8358994671811711, + "grad_norm": 0.8450763821601868, + "learning_rate": 6.276994420580397e-05, + "loss": 0.8475, + "step": 130840 + }, + { + "epoch": 0.8359633543309098, + "grad_norm": 0.8186871409416199, + "learning_rate": 6.276509286415265e-05, + "loss": 0.677, + "step": 130850 + }, + { + "epoch": 0.8360272414806486, + "grad_norm": 0.7687765955924988, + "learning_rate": 6.276024139394578e-05, + "loss": 0.7434, + "step": 130860 + }, + { + "epoch": 0.8360911286303873, + "grad_norm": 0.8875823616981506, + "learning_rate": 6.275538979523227e-05, + "loss": 1.1177, + "step": 130870 + }, + { + "epoch": 0.836155015780126, + "grad_norm": 1.5310128927230835, + "learning_rate": 6.275053806806093e-05, + "loss": 0.8904, + "step": 130880 + }, + { + "epoch": 0.8362189029298647, + "grad_norm": 0.9724732041358948, + "learning_rate": 6.274568621248065e-05, + "loss": 0.901, + "step": 130890 + }, + { + "epoch": 0.8362827900796034, + "grad_norm": 0.7733339071273804, + "learning_rate": 6.274083422854026e-05, + "loss": 0.8989, + "step": 130900 + }, + { + "epoch": 0.8363466772293421, + "grad_norm": 0.7655275464057922, + "learning_rate": 6.273598211628867e-05, + "loss": 0.9544, + "step": 130910 + }, + { + "epoch": 0.8364105643790808, + "grad_norm": 0.7450165748596191, + "learning_rate": 6.273112987577472e-05, + "loss": 0.5685, + "step": 130920 + }, + { + "epoch": 0.8364744515288195, + "grad_norm": 0.9278712868690491, + "learning_rate": 6.272627750704727e-05, + "loss": 0.9439, + "step": 130930 + }, + { + "epoch": 0.8365383386785582, + "grad_norm": 0.9437885284423828, + "learning_rate": 6.272142501015521e-05, + "loss": 0.8225, + "step": 130940 + }, + { + "epoch": 0.8366022258282969, + "grad_norm": 2.503553628921509, + "learning_rate": 6.27165723851474e-05, + "loss": 1.0931, + "step": 130950 + }, + { + "epoch": 0.8366661129780356, + "grad_norm": 0.7560781836509705, + "learning_rate": 6.271171963207269e-05, + "loss": 0.8088, + "step": 130960 + }, + { + "epoch": 0.8367300001277743, + "grad_norm": 0.9613698124885559, + "learning_rate": 6.270686675097997e-05, + "loss": 0.8562, + "step": 130970 + }, + { + "epoch": 0.836793887277513, + "grad_norm": 1.2895517349243164, + "learning_rate": 6.27020137419181e-05, + "loss": 0.9257, + "step": 130980 + }, + { + "epoch": 0.8368577744272517, + "grad_norm": 0.9719459414482117, + "learning_rate": 6.269716060493597e-05, + "loss": 0.9031, + "step": 130990 + }, + { + "epoch": 0.8369216615769904, + "grad_norm": 1.973030924797058, + "learning_rate": 6.269230734008245e-05, + "loss": 0.8468, + "step": 131000 + }, + { + "epoch": 0.8369855487267291, + "grad_norm": 0.9847891330718994, + "learning_rate": 6.26874539474064e-05, + "loss": 0.7736, + "step": 131010 + }, + { + "epoch": 0.8370494358764679, + "grad_norm": 0.9893288016319275, + "learning_rate": 6.268260042695672e-05, + "loss": 0.7408, + "step": 131020 + }, + { + "epoch": 0.8371133230262066, + 
"grad_norm": 0.742874026298523, + "learning_rate": 6.267823214934596e-05, + "loss": 1.013, + "step": 131030 + }, + { + "epoch": 0.8371772101759453, + "grad_norm": 1.0840404033660889, + "learning_rate": 6.267337838626103e-05, + "loss": 1.0696, + "step": 131040 + }, + { + "epoch": 0.8372410973256839, + "grad_norm": 1.0095386505126953, + "learning_rate": 6.266852449554422e-05, + "loss": 0.9302, + "step": 131050 + }, + { + "epoch": 0.8373049844754226, + "grad_norm": 1.0238926410675049, + "learning_rate": 6.266367047724442e-05, + "loss": 0.6575, + "step": 131060 + }, + { + "epoch": 0.8373688716251613, + "grad_norm": 1.125106930732727, + "learning_rate": 6.265881633141049e-05, + "loss": 0.891, + "step": 131070 + }, + { + "epoch": 0.8374327587749, + "grad_norm": 0.716673731803894, + "learning_rate": 6.265396205809132e-05, + "loss": 0.7425, + "step": 131080 + }, + { + "epoch": 0.8374966459246387, + "grad_norm": 1.2633848190307617, + "learning_rate": 6.264910765733582e-05, + "loss": 0.9292, + "step": 131090 + }, + { + "epoch": 0.8375605330743774, + "grad_norm": 1.3997453451156616, + "learning_rate": 6.264425312919288e-05, + "loss": 0.6964, + "step": 131100 + }, + { + "epoch": 0.8376244202241161, + "grad_norm": 1.081032156944275, + "learning_rate": 6.263939847371134e-05, + "loss": 0.885, + "step": 131110 + }, + { + "epoch": 0.8376883073738548, + "grad_norm": 0.6516674757003784, + "learning_rate": 6.263454369094014e-05, + "loss": 0.7648, + "step": 131120 + }, + { + "epoch": 0.8377521945235935, + "grad_norm": 0.8595327138900757, + "learning_rate": 6.262968878092814e-05, + "loss": 0.7661, + "step": 131130 + }, + { + "epoch": 0.8378160816733322, + "grad_norm": 1.0388331413269043, + "learning_rate": 6.262483374372426e-05, + "loss": 0.6961, + "step": 131140 + }, + { + "epoch": 0.8378799688230709, + "grad_norm": 1.5679370164871216, + "learning_rate": 6.261997857937738e-05, + "loss": 0.8494, + "step": 131150 + }, + { + "epoch": 0.8379438559728096, + "grad_norm": 1.0286293029785156, + "learning_rate": 6.261512328793639e-05, + "loss": 0.8269, + "step": 131160 + }, + { + "epoch": 0.8380077431225483, + "grad_norm": 1.3701919317245483, + "learning_rate": 6.261026786945021e-05, + "loss": 0.7509, + "step": 131170 + }, + { + "epoch": 0.838071630272287, + "grad_norm": 0.7950007915496826, + "learning_rate": 6.260541232396771e-05, + "loss": 1.3381, + "step": 131180 + }, + { + "epoch": 0.8381355174220257, + "grad_norm": 0.7836694717407227, + "learning_rate": 6.26005566515378e-05, + "loss": 0.9361, + "step": 131190 + }, + { + "epoch": 0.8381994045717645, + "grad_norm": 0.7753838300704956, + "learning_rate": 6.259570085220939e-05, + "loss": 0.8256, + "step": 131200 + }, + { + "epoch": 0.8382632917215032, + "grad_norm": 0.8219996690750122, + "learning_rate": 6.259084492603138e-05, + "loss": 0.7421, + "step": 131210 + }, + { + "epoch": 0.8383271788712419, + "grad_norm": 1.0222669839859009, + "learning_rate": 6.258598887305265e-05, + "loss": 0.6588, + "step": 131220 + }, + { + "epoch": 0.8383910660209806, + "grad_norm": 0.9310179352760315, + "learning_rate": 6.258113269332215e-05, + "loss": 0.9836, + "step": 131230 + }, + { + "epoch": 0.8384549531707193, + "grad_norm": 0.8922819495201111, + "learning_rate": 6.257627638688875e-05, + "loss": 0.8864, + "step": 131240 + }, + { + "epoch": 0.838518840320458, + "grad_norm": 1.2268577814102173, + "learning_rate": 6.257141995380136e-05, + "loss": 1.1079, + "step": 131250 + }, + { + "epoch": 0.8385827274701967, + "grad_norm": 0.7743911743164062, + "learning_rate": 
6.25665633941089e-05, + "loss": 0.8269, + "step": 131260 + }, + { + "epoch": 0.8386466146199354, + "grad_norm": 2.4467434883117676, + "learning_rate": 6.256170670786028e-05, + "loss": 1.1068, + "step": 131270 + }, + { + "epoch": 0.8387105017696741, + "grad_norm": 1.4067353010177612, + "learning_rate": 6.25568498951044e-05, + "loss": 0.7124, + "step": 131280 + }, + { + "epoch": 0.8387743889194127, + "grad_norm": 0.9311875700950623, + "learning_rate": 6.255199295589018e-05, + "loss": 1.0478, + "step": 131290 + }, + { + "epoch": 0.8388382760691514, + "grad_norm": 0.8435894250869751, + "learning_rate": 6.254713589026652e-05, + "loss": 0.758, + "step": 131300 + }, + { + "epoch": 0.8389021632188901, + "grad_norm": 1.4599822759628296, + "learning_rate": 6.254227869828237e-05, + "loss": 1.0697, + "step": 131310 + }, + { + "epoch": 0.8389660503686288, + "grad_norm": 0.8313419818878174, + "learning_rate": 6.253742137998661e-05, + "loss": 1.1126, + "step": 131320 + }, + { + "epoch": 0.8390299375183675, + "grad_norm": 0.9159870743751526, + "learning_rate": 6.253256393542817e-05, + "loss": 0.8222, + "step": 131330 + }, + { + "epoch": 0.8390938246681062, + "grad_norm": 1.2503025531768799, + "learning_rate": 6.252770636465597e-05, + "loss": 0.9694, + "step": 131340 + }, + { + "epoch": 0.8391577118178449, + "grad_norm": 0.7653509974479675, + "learning_rate": 6.252284866771894e-05, + "loss": 0.9817, + "step": 131350 + }, + { + "epoch": 0.8392215989675836, + "grad_norm": 0.9600273370742798, + "learning_rate": 6.251799084466596e-05, + "loss": 0.9824, + "step": 131360 + }, + { + "epoch": 0.8392854861173223, + "grad_norm": 0.5949135422706604, + "learning_rate": 6.251313289554601e-05, + "loss": 0.9494, + "step": 131370 + }, + { + "epoch": 0.839349373267061, + "grad_norm": 0.6030875444412231, + "learning_rate": 6.250827482040797e-05, + "loss": 0.7904, + "step": 131380 + }, + { + "epoch": 0.8394132604167998, + "grad_norm": 0.9871885180473328, + "learning_rate": 6.25034166193008e-05, + "loss": 0.7854, + "step": 131390 + }, + { + "epoch": 0.8394771475665385, + "grad_norm": 1.0872247219085693, + "learning_rate": 6.24985582922734e-05, + "loss": 1.0537, + "step": 131400 + }, + { + "epoch": 0.8395410347162772, + "grad_norm": 0.9975283741950989, + "learning_rate": 6.24936998393747e-05, + "loss": 0.8564, + "step": 131410 + }, + { + "epoch": 0.8396049218660159, + "grad_norm": 1.5955655574798584, + "learning_rate": 6.248884126065364e-05, + "loss": 0.7626, + "step": 131420 + }, + { + "epoch": 0.8396688090157546, + "grad_norm": 0.8411442041397095, + "learning_rate": 6.248398255615913e-05, + "loss": 1.0739, + "step": 131430 + }, + { + "epoch": 0.8397326961654933, + "grad_norm": 0.9352008700370789, + "learning_rate": 6.247912372594013e-05, + "loss": 0.7532, + "step": 131440 + }, + { + "epoch": 0.839796583315232, + "grad_norm": 1.1712517738342285, + "learning_rate": 6.247426477004555e-05, + "loss": 0.7926, + "step": 131450 + }, + { + "epoch": 0.8398604704649707, + "grad_norm": 0.6964886784553528, + "learning_rate": 6.246940568852435e-05, + "loss": 0.7795, + "step": 131460 + }, + { + "epoch": 0.8399243576147094, + "grad_norm": 0.7911357283592224, + "learning_rate": 6.246454648142542e-05, + "loss": 1.0756, + "step": 131470 + }, + { + "epoch": 0.8399882447644481, + "grad_norm": 0.9874993562698364, + "learning_rate": 6.245968714879773e-05, + "loss": 1.0366, + "step": 131480 + }, + { + "epoch": 0.8400521319141868, + "grad_norm": 0.5494539141654968, + "learning_rate": 6.245482769069023e-05, + "loss": 0.8724, + "step": 131490 + }, + { 
+ "epoch": 0.8401160190639255, + "grad_norm": 0.962289571762085, + "learning_rate": 6.244996810715183e-05, + "loss": 0.8694, + "step": 131500 + }, + { + "epoch": 0.8401799062136642, + "grad_norm": 0.762205183506012, + "learning_rate": 6.244510839823147e-05, + "loss": 0.8485, + "step": 131510 + }, + { + "epoch": 0.8402437933634029, + "grad_norm": 0.9968754649162292, + "learning_rate": 6.244024856397812e-05, + "loss": 0.7702, + "step": 131520 + }, + { + "epoch": 0.8403076805131416, + "grad_norm": 1.0157196521759033, + "learning_rate": 6.24353886044407e-05, + "loss": 0.7992, + "step": 131530 + }, + { + "epoch": 0.8403715676628802, + "grad_norm": 1.0793529748916626, + "learning_rate": 6.243052851966816e-05, + "loss": 0.8393, + "step": 131540 + }, + { + "epoch": 0.8404354548126189, + "grad_norm": 0.6391186118125916, + "learning_rate": 6.242566830970941e-05, + "loss": 1.1761, + "step": 131550 + }, + { + "epoch": 0.8404993419623576, + "grad_norm": 0.727288544178009, + "learning_rate": 6.242080797461346e-05, + "loss": 1.3424, + "step": 131560 + }, + { + "epoch": 0.8405632291120964, + "grad_norm": 0.7789543271064758, + "learning_rate": 6.241594751442923e-05, + "loss": 0.8127, + "step": 131570 + }, + { + "epoch": 0.8406271162618351, + "grad_norm": 0.8413047790527344, + "learning_rate": 6.241108692920566e-05, + "loss": 0.8366, + "step": 131580 + }, + { + "epoch": 0.8406910034115738, + "grad_norm": 1.12662672996521, + "learning_rate": 6.240622621899173e-05, + "loss": 0.7906, + "step": 131590 + }, + { + "epoch": 0.8407548905613125, + "grad_norm": 1.1771347522735596, + "learning_rate": 6.240136538383635e-05, + "loss": 0.9557, + "step": 131600 + }, + { + "epoch": 0.8408187777110512, + "grad_norm": 1.3687745332717896, + "learning_rate": 6.239650442378848e-05, + "loss": 1.0635, + "step": 131610 + }, + { + "epoch": 0.8408826648607899, + "grad_norm": 0.5779715180397034, + "learning_rate": 6.239164333889711e-05, + "loss": 0.6679, + "step": 131620 + }, + { + "epoch": 0.8409465520105286, + "grad_norm": 0.9042608737945557, + "learning_rate": 6.238678212921115e-05, + "loss": 0.9131, + "step": 131630 + }, + { + "epoch": 0.8410104391602673, + "grad_norm": 0.7568997144699097, + "learning_rate": 6.238192079477959e-05, + "loss": 0.8784, + "step": 131640 + }, + { + "epoch": 0.841074326310006, + "grad_norm": 0.7763069868087769, + "learning_rate": 6.237705933565137e-05, + "loss": 1.0966, + "step": 131650 + }, + { + "epoch": 0.8411382134597447, + "grad_norm": 0.7040805220603943, + "learning_rate": 6.237219775187545e-05, + "loss": 0.9219, + "step": 131660 + }, + { + "epoch": 0.8412021006094834, + "grad_norm": 1.0088971853256226, + "learning_rate": 6.236733604350081e-05, + "loss": 0.8971, + "step": 131670 + }, + { + "epoch": 0.8412659877592221, + "grad_norm": 1.101572036743164, + "learning_rate": 6.236247421057639e-05, + "loss": 0.7638, + "step": 131680 + }, + { + "epoch": 0.8413298749089608, + "grad_norm": 0.8880965709686279, + "learning_rate": 6.235761225315117e-05, + "loss": 1.1101, + "step": 131690 + }, + { + "epoch": 0.8413937620586995, + "grad_norm": 0.6575754284858704, + "learning_rate": 6.235275017127409e-05, + "loss": 0.7849, + "step": 131700 + }, + { + "epoch": 0.8414576492084382, + "grad_norm": 1.0469022989273071, + "learning_rate": 6.234788796499411e-05, + "loss": 1.1646, + "step": 131710 + }, + { + "epoch": 0.841521536358177, + "grad_norm": 0.6429979801177979, + "learning_rate": 6.234302563436024e-05, + "loss": 0.6877, + "step": 131720 + }, + { + "epoch": 0.8415854235079157, + "grad_norm": 0.8447049260139465, + 
"learning_rate": 6.233816317942143e-05, + "loss": 0.836, + "step": 131730 + }, + { + "epoch": 0.8416493106576544, + "grad_norm": 1.907248854637146, + "learning_rate": 6.233330060022662e-05, + "loss": 0.7056, + "step": 131740 + }, + { + "epoch": 0.8417131978073931, + "grad_norm": 0.9654474854469299, + "learning_rate": 6.232843789682483e-05, + "loss": 0.8356, + "step": 131750 + }, + { + "epoch": 0.8417770849571318, + "grad_norm": 1.3447314500808716, + "learning_rate": 6.2323575069265e-05, + "loss": 0.7557, + "step": 131760 + }, + { + "epoch": 0.8418409721068705, + "grad_norm": 2.194685220718384, + "learning_rate": 6.231871211759609e-05, + "loss": 0.885, + "step": 131770 + }, + { + "epoch": 0.8419048592566091, + "grad_norm": 1.1542408466339111, + "learning_rate": 6.23138490418671e-05, + "loss": 0.7755, + "step": 131780 + }, + { + "epoch": 0.8419687464063478, + "grad_norm": 0.7625725865364075, + "learning_rate": 6.2308985842127e-05, + "loss": 1.0261, + "step": 131790 + }, + { + "epoch": 0.8420326335560865, + "grad_norm": 1.0486656427383423, + "learning_rate": 6.230412251842477e-05, + "loss": 0.9115, + "step": 131800 + }, + { + "epoch": 0.8420965207058252, + "grad_norm": 1.7022581100463867, + "learning_rate": 6.229925907080937e-05, + "loss": 0.853, + "step": 131810 + }, + { + "epoch": 0.8421604078555639, + "grad_norm": 0.8428347110748291, + "learning_rate": 6.229439549932979e-05, + "loss": 0.9491, + "step": 131820 + }, + { + "epoch": 0.8422242950053026, + "grad_norm": 1.126781940460205, + "learning_rate": 6.228953180403503e-05, + "loss": 0.7966, + "step": 131830 + }, + { + "epoch": 0.8422881821550413, + "grad_norm": 0.7675802111625671, + "learning_rate": 6.228466798497403e-05, + "loss": 0.8985, + "step": 131840 + }, + { + "epoch": 0.84235206930478, + "grad_norm": 0.7653781771659851, + "learning_rate": 6.227980404219581e-05, + "loss": 0.8768, + "step": 131850 + }, + { + "epoch": 0.8424159564545187, + "grad_norm": 0.9524831175804138, + "learning_rate": 6.227493997574933e-05, + "loss": 0.93, + "step": 131860 + }, + { + "epoch": 0.8424798436042574, + "grad_norm": 0.7941969037055969, + "learning_rate": 6.227007578568358e-05, + "loss": 0.9524, + "step": 131870 + }, + { + "epoch": 0.8425437307539961, + "grad_norm": 1.5713688135147095, + "learning_rate": 6.226521147204757e-05, + "loss": 0.8452, + "step": 131880 + }, + { + "epoch": 0.8426076179037348, + "grad_norm": 0.7123538255691528, + "learning_rate": 6.226034703489025e-05, + "loss": 0.962, + "step": 131890 + }, + { + "epoch": 0.8426715050534735, + "grad_norm": 0.9128240942955017, + "learning_rate": 6.225548247426064e-05, + "loss": 0.6589, + "step": 131900 + }, + { + "epoch": 0.8427353922032123, + "grad_norm": 1.0599886178970337, + "learning_rate": 6.225061779020773e-05, + "loss": 0.7505, + "step": 131910 + }, + { + "epoch": 0.842799279352951, + "grad_norm": 0.7445502877235413, + "learning_rate": 6.224575298278048e-05, + "loss": 0.9809, + "step": 131920 + }, + { + "epoch": 0.8428631665026897, + "grad_norm": 1.1872540712356567, + "learning_rate": 6.224088805202791e-05, + "loss": 0.9065, + "step": 131930 + }, + { + "epoch": 0.8429270536524284, + "grad_norm": 1.1298291683197021, + "learning_rate": 6.2236022997999e-05, + "loss": 1.4187, + "step": 131940 + }, + { + "epoch": 0.8429909408021671, + "grad_norm": 0.8236765265464783, + "learning_rate": 6.223115782074278e-05, + "loss": 0.7978, + "step": 131950 + }, + { + "epoch": 0.8430548279519058, + "grad_norm": 0.953170895576477, + "learning_rate": 6.22262925203082e-05, + "loss": 0.8399, + "step": 131960 + }, 
+ { + "epoch": 0.8431187151016445, + "grad_norm": 0.5813484787940979, + "learning_rate": 6.222142709674428e-05, + "loss": 0.7039, + "step": 131970 + }, + { + "epoch": 0.8431826022513832, + "grad_norm": 0.963085412979126, + "learning_rate": 6.22165615501e-05, + "loss": 0.7868, + "step": 131980 + }, + { + "epoch": 0.8432464894011219, + "grad_norm": 0.8191744089126587, + "learning_rate": 6.22116958804244e-05, + "loss": 0.9115, + "step": 131990 + }, + { + "epoch": 0.8433103765508606, + "grad_norm": 0.9577305912971497, + "learning_rate": 6.220683008776645e-05, + "loss": 0.7448, + "step": 132000 + }, + { + "epoch": 0.8433742637005993, + "grad_norm": 1.459100604057312, + "learning_rate": 6.220196417217516e-05, + "loss": 0.9975, + "step": 132010 + }, + { + "epoch": 0.8434381508503379, + "grad_norm": 0.7961175441741943, + "learning_rate": 6.219709813369953e-05, + "loss": 1.0328, + "step": 132020 + }, + { + "epoch": 0.8435020380000766, + "grad_norm": 0.9462569355964661, + "learning_rate": 6.219223197238858e-05, + "loss": 0.8782, + "step": 132030 + }, + { + "epoch": 0.8435659251498153, + "grad_norm": 1.1649507284164429, + "learning_rate": 6.21873656882913e-05, + "loss": 0.8889, + "step": 132040 + }, + { + "epoch": 0.843629812299554, + "grad_norm": 0.6289653182029724, + "learning_rate": 6.218249928145671e-05, + "loss": 1.052, + "step": 132050 + }, + { + "epoch": 0.8436936994492927, + "grad_norm": 0.8261928558349609, + "learning_rate": 6.21776327519338e-05, + "loss": 0.8439, + "step": 132060 + }, + { + "epoch": 0.8437575865990314, + "grad_norm": 1.1747194528579712, + "learning_rate": 6.21727660997716e-05, + "loss": 1.1163, + "step": 132070 + }, + { + "epoch": 0.8438214737487701, + "grad_norm": 0.8616983294487, + "learning_rate": 6.216789932501912e-05, + "loss": 0.7076, + "step": 132080 + }, + { + "epoch": 0.8438853608985089, + "grad_norm": 1.5313186645507812, + "learning_rate": 6.216303242772535e-05, + "loss": 0.763, + "step": 132090 + }, + { + "epoch": 0.8439492480482476, + "grad_norm": 0.84382164478302, + "learning_rate": 6.215816540793934e-05, + "loss": 0.9195, + "step": 132100 + }, + { + "epoch": 0.8440131351979863, + "grad_norm": 1.5501306056976318, + "learning_rate": 6.215329826571008e-05, + "loss": 1.2185, + "step": 132110 + }, + { + "epoch": 0.844077022347725, + "grad_norm": 1.0050925016403198, + "learning_rate": 6.214843100108659e-05, + "loss": 0.8055, + "step": 132120 + }, + { + "epoch": 0.8441409094974637, + "grad_norm": 1.1478122472763062, + "learning_rate": 6.214356361411788e-05, + "loss": 0.7929, + "step": 132130 + }, + { + "epoch": 0.8442047966472024, + "grad_norm": 1.0138068199157715, + "learning_rate": 6.2138696104853e-05, + "loss": 0.8049, + "step": 132140 + }, + { + "epoch": 0.8442686837969411, + "grad_norm": 3.256178379058838, + "learning_rate": 6.213382847334094e-05, + "loss": 0.8957, + "step": 132150 + }, + { + "epoch": 0.8443325709466798, + "grad_norm": 0.8341447710990906, + "learning_rate": 6.212896071963072e-05, + "loss": 1.0626, + "step": 132160 + }, + { + "epoch": 0.8443964580964185, + "grad_norm": 0.8610829710960388, + "learning_rate": 6.212409284377138e-05, + "loss": 0.8818, + "step": 132170 + }, + { + "epoch": 0.8444603452461572, + "grad_norm": 0.9030302166938782, + "learning_rate": 6.211922484581194e-05, + "loss": 0.9542, + "step": 132180 + }, + { + "epoch": 0.8445242323958959, + "grad_norm": 0.8101795315742493, + "learning_rate": 6.211435672580143e-05, + "loss": 1.069, + "step": 132190 + }, + { + "epoch": 0.8445881195456346, + "grad_norm": 0.8379024267196655, + 
"learning_rate": 6.210948848378884e-05, + "loss": 1.0042, + "step": 132200 + }, + { + "epoch": 0.8446520066953733, + "grad_norm": 1.2797821760177612, + "learning_rate": 6.210462011982325e-05, + "loss": 1.0052, + "step": 132210 + }, + { + "epoch": 0.844715893845112, + "grad_norm": 0.7084786295890808, + "learning_rate": 6.209975163395365e-05, + "loss": 0.9553, + "step": 132220 + }, + { + "epoch": 0.8447797809948507, + "grad_norm": 1.345314860343933, + "learning_rate": 6.209488302622909e-05, + "loss": 1.0062, + "step": 132230 + }, + { + "epoch": 0.8448436681445894, + "grad_norm": 1.0037018060684204, + "learning_rate": 6.209001429669859e-05, + "loss": 0.8894, + "step": 132240 + }, + { + "epoch": 0.8449075552943281, + "grad_norm": 0.7273179292678833, + "learning_rate": 6.208514544541118e-05, + "loss": 0.7736, + "step": 132250 + }, + { + "epoch": 0.8449714424440669, + "grad_norm": 1.832634687423706, + "learning_rate": 6.208027647241591e-05, + "loss": 0.79, + "step": 132260 + }, + { + "epoch": 0.8450353295938055, + "grad_norm": 0.895901083946228, + "learning_rate": 6.207540737776179e-05, + "loss": 0.8047, + "step": 132270 + }, + { + "epoch": 0.8450992167435442, + "grad_norm": 0.9929360151290894, + "learning_rate": 6.207053816149789e-05, + "loss": 0.9768, + "step": 132280 + }, + { + "epoch": 0.8451631038932829, + "grad_norm": 2.9557993412017822, + "learning_rate": 6.206566882367323e-05, + "loss": 0.6881, + "step": 132290 + }, + { + "epoch": 0.8452269910430216, + "grad_norm": 1.2809102535247803, + "learning_rate": 6.206079936433685e-05, + "loss": 0.7503, + "step": 132300 + }, + { + "epoch": 0.8452908781927603, + "grad_norm": 0.7714309692382812, + "learning_rate": 6.205592978353776e-05, + "loss": 0.8082, + "step": 132310 + }, + { + "epoch": 0.845354765342499, + "grad_norm": 1.0882459878921509, + "learning_rate": 6.205106008132505e-05, + "loss": 0.7081, + "step": 132320 + }, + { + "epoch": 0.8454186524922377, + "grad_norm": 1.8441599607467651, + "learning_rate": 6.204619025774774e-05, + "loss": 0.8468, + "step": 132330 + }, + { + "epoch": 0.8454825396419764, + "grad_norm": 0.8650910258293152, + "learning_rate": 6.204132031285485e-05, + "loss": 0.7917, + "step": 132340 + }, + { + "epoch": 0.8455464267917151, + "grad_norm": 0.9829961061477661, + "learning_rate": 6.203645024669548e-05, + "loss": 0.861, + "step": 132350 + }, + { + "epoch": 0.8456103139414538, + "grad_norm": 1.0663328170776367, + "learning_rate": 6.203158005931861e-05, + "loss": 0.9636, + "step": 132360 + }, + { + "epoch": 0.8456742010911925, + "grad_norm": 0.7675907611846924, + "learning_rate": 6.202670975077334e-05, + "loss": 0.985, + "step": 132370 + }, + { + "epoch": 0.8457380882409312, + "grad_norm": 0.8641635179519653, + "learning_rate": 6.20218393211087e-05, + "loss": 0.7433, + "step": 132380 + }, + { + "epoch": 0.8458019753906699, + "grad_norm": 1.2303731441497803, + "learning_rate": 6.201696877037373e-05, + "loss": 0.8609, + "step": 132390 + }, + { + "epoch": 0.8458658625404086, + "grad_norm": 0.7559641599655151, + "learning_rate": 6.201209809861747e-05, + "loss": 0.879, + "step": 132400 + }, + { + "epoch": 0.8459297496901473, + "grad_norm": 0.9814415574073792, + "learning_rate": 6.200722730588901e-05, + "loss": 0.9192, + "step": 132410 + }, + { + "epoch": 0.845993636839886, + "grad_norm": 0.9890975952148438, + "learning_rate": 6.20023563922374e-05, + "loss": 0.9853, + "step": 132420 + }, + { + "epoch": 0.8460575239896247, + "grad_norm": 0.8274144530296326, + "learning_rate": 6.199748535771165e-05, + "loss": 0.8835, + "step": 
132430 + }, + { + "epoch": 0.8461214111393635, + "grad_norm": 0.6956206560134888, + "learning_rate": 6.199261420236086e-05, + "loss": 0.9952, + "step": 132440 + }, + { + "epoch": 0.8461852982891022, + "grad_norm": 0.6252676844596863, + "learning_rate": 6.198774292623406e-05, + "loss": 0.8467, + "step": 132450 + }, + { + "epoch": 0.8462491854388409, + "grad_norm": 1.0025311708450317, + "learning_rate": 6.198287152938031e-05, + "loss": 0.9938, + "step": 132460 + }, + { + "epoch": 0.8463130725885796, + "grad_norm": 1.1646289825439453, + "learning_rate": 6.197800001184869e-05, + "loss": 1.0137, + "step": 132470 + }, + { + "epoch": 0.8463769597383183, + "grad_norm": 0.8148912191390991, + "learning_rate": 6.197312837368825e-05, + "loss": 0.937, + "step": 132480 + }, + { + "epoch": 0.846440846888057, + "grad_norm": 1.7211027145385742, + "learning_rate": 6.196825661494805e-05, + "loss": 0.852, + "step": 132490 + }, + { + "epoch": 0.8465047340377957, + "grad_norm": 0.7814443707466125, + "learning_rate": 6.196338473567714e-05, + "loss": 1.0702, + "step": 132500 + }, + { + "epoch": 0.8465686211875343, + "grad_norm": 1.2225807905197144, + "learning_rate": 6.19585127359246e-05, + "loss": 0.7909, + "step": 132510 + }, + { + "epoch": 0.846632508337273, + "grad_norm": 0.9283073544502258, + "learning_rate": 6.19536406157395e-05, + "loss": 0.8267, + "step": 132520 + }, + { + "epoch": 0.8466963954870117, + "grad_norm": 0.9126421809196472, + "learning_rate": 6.194876837517089e-05, + "loss": 0.7771, + "step": 132530 + }, + { + "epoch": 0.8467602826367504, + "grad_norm": 1.5700596570968628, + "learning_rate": 6.194389601426784e-05, + "loss": 1.0307, + "step": 132540 + }, + { + "epoch": 0.8468241697864891, + "grad_norm": 0.8613380193710327, + "learning_rate": 6.193902353307943e-05, + "loss": 0.7906, + "step": 132550 + }, + { + "epoch": 0.8468880569362278, + "grad_norm": 0.9143043756484985, + "learning_rate": 6.193415093165473e-05, + "loss": 0.9351, + "step": 132560 + }, + { + "epoch": 0.8469519440859665, + "grad_norm": 1.215604543685913, + "learning_rate": 6.192927821004281e-05, + "loss": 0.8207, + "step": 132570 + }, + { + "epoch": 0.8470158312357052, + "grad_norm": 1.1382776498794556, + "learning_rate": 6.192440536829272e-05, + "loss": 0.6806, + "step": 132580 + }, + { + "epoch": 0.8470797183854439, + "grad_norm": 0.8516457080841064, + "learning_rate": 6.191953240645356e-05, + "loss": 0.9752, + "step": 132590 + }, + { + "epoch": 0.8471436055351826, + "grad_norm": 0.6986536383628845, + "learning_rate": 6.191465932457439e-05, + "loss": 0.7651, + "step": 132600 + }, + { + "epoch": 0.8472074926849213, + "grad_norm": 1.2519372701644897, + "learning_rate": 6.19097861227043e-05, + "loss": 0.8898, + "step": 132610 + }, + { + "epoch": 0.84727137983466, + "grad_norm": 0.8474778532981873, + "learning_rate": 6.190491280089236e-05, + "loss": 0.9425, + "step": 132620 + }, + { + "epoch": 0.8473352669843988, + "grad_norm": 0.9403843879699707, + "learning_rate": 6.190003935918766e-05, + "loss": 0.8585, + "step": 132630 + }, + { + "epoch": 0.8473991541341375, + "grad_norm": 0.9674764275550842, + "learning_rate": 6.189516579763925e-05, + "loss": 1.0548, + "step": 132640 + }, + { + "epoch": 0.8474630412838762, + "grad_norm": 1.1590251922607422, + "learning_rate": 6.189029211629625e-05, + "loss": 0.8096, + "step": 132650 + }, + { + "epoch": 0.8475269284336149, + "grad_norm": 1.0220919847488403, + "learning_rate": 6.188541831520772e-05, + "loss": 0.7432, + "step": 132660 + }, + { + "epoch": 0.8475908155833536, + "grad_norm": 
0.7467325329780579, + "learning_rate": 6.188054439442273e-05, + "loss": 0.8427, + "step": 132670 + }, + { + "epoch": 0.8476547027330923, + "grad_norm": 1.00656259059906, + "learning_rate": 6.187567035399038e-05, + "loss": 1.0052, + "step": 132680 + }, + { + "epoch": 0.847718589882831, + "grad_norm": 1.0796363353729248, + "learning_rate": 6.187079619395976e-05, + "loss": 0.7158, + "step": 132690 + }, + { + "epoch": 0.8477824770325697, + "grad_norm": 0.8648567795753479, + "learning_rate": 6.186592191437995e-05, + "loss": 0.8839, + "step": 132700 + }, + { + "epoch": 0.8478463641823084, + "grad_norm": 1.2238774299621582, + "learning_rate": 6.186104751530004e-05, + "loss": 0.9307, + "step": 132710 + }, + { + "epoch": 0.8479102513320471, + "grad_norm": 1.1575216054916382, + "learning_rate": 6.185617299676913e-05, + "loss": 0.9198, + "step": 132720 + }, + { + "epoch": 0.8479741384817858, + "grad_norm": 0.6900240778923035, + "learning_rate": 6.18512983588363e-05, + "loss": 0.9578, + "step": 132730 + }, + { + "epoch": 0.8480380256315245, + "grad_norm": 1.0454217195510864, + "learning_rate": 6.184642360155062e-05, + "loss": 0.7726, + "step": 132740 + }, + { + "epoch": 0.8481019127812631, + "grad_norm": 1.078119158744812, + "learning_rate": 6.184154872496124e-05, + "loss": 0.8945, + "step": 132750 + }, + { + "epoch": 0.8481657999310018, + "grad_norm": 1.2794424295425415, + "learning_rate": 6.18366737291172e-05, + "loss": 0.8144, + "step": 132760 + }, + { + "epoch": 0.8482296870807405, + "grad_norm": 1.1177339553833008, + "learning_rate": 6.18317986140676e-05, + "loss": 0.711, + "step": 132770 + }, + { + "epoch": 0.8482935742304792, + "grad_norm": 0.9638439416885376, + "learning_rate": 6.182692337986157e-05, + "loss": 0.712, + "step": 132780 + }, + { + "epoch": 0.848357461380218, + "grad_norm": 0.7159675359725952, + "learning_rate": 6.18220480265482e-05, + "loss": 0.6036, + "step": 132790 + }, + { + "epoch": 0.8484213485299567, + "grad_norm": 0.9956109523773193, + "learning_rate": 6.181717255417658e-05, + "loss": 0.8205, + "step": 132800 + }, + { + "epoch": 0.8484852356796954, + "grad_norm": 0.7932936549186707, + "learning_rate": 6.18122969627958e-05, + "loss": 0.9146, + "step": 132810 + }, + { + "epoch": 0.8485491228294341, + "grad_norm": 1.288333535194397, + "learning_rate": 6.180742125245497e-05, + "loss": 0.8525, + "step": 132820 + }, + { + "epoch": 0.8486130099791728, + "grad_norm": 0.8285970687866211, + "learning_rate": 6.180254542320319e-05, + "loss": 0.9556, + "step": 132830 + }, + { + "epoch": 0.8486768971289115, + "grad_norm": 0.8148375153541565, + "learning_rate": 6.179766947508957e-05, + "loss": 0.822, + "step": 132840 + }, + { + "epoch": 0.8487407842786502, + "grad_norm": 0.717943549156189, + "learning_rate": 6.17927934081632e-05, + "loss": 0.9766, + "step": 132850 + }, + { + "epoch": 0.8488046714283889, + "grad_norm": 0.7930614948272705, + "learning_rate": 6.178791722247321e-05, + "loss": 0.8497, + "step": 132860 + }, + { + "epoch": 0.8488685585781276, + "grad_norm": 1.0193837881088257, + "learning_rate": 6.17830409180687e-05, + "loss": 0.9573, + "step": 132870 + }, + { + "epoch": 0.8489324457278663, + "grad_norm": 1.764022946357727, + "learning_rate": 6.177816449499878e-05, + "loss": 0.9453, + "step": 132880 + }, + { + "epoch": 0.848996332877605, + "grad_norm": 1.1653591394424438, + "learning_rate": 6.177328795331253e-05, + "loss": 0.7105, + "step": 132890 + }, + { + "epoch": 0.8490602200273437, + "grad_norm": 0.9944359064102173, + "learning_rate": 6.176841129305911e-05, + "loss": 
0.8217, + "step": 132900 + }, + { + "epoch": 0.8491241071770824, + "grad_norm": 0.7695066928863525, + "learning_rate": 6.176353451428758e-05, + "loss": 0.8568, + "step": 132910 + }, + { + "epoch": 0.8491879943268211, + "grad_norm": 0.8422155976295471, + "learning_rate": 6.17586576170471e-05, + "loss": 0.7569, + "step": 132920 + }, + { + "epoch": 0.8492518814765598, + "grad_norm": 0.6993025541305542, + "learning_rate": 6.175378060138674e-05, + "loss": 0.8512, + "step": 132930 + }, + { + "epoch": 0.8493157686262985, + "grad_norm": 0.7745869755744934, + "learning_rate": 6.174890346735566e-05, + "loss": 0.8467, + "step": 132940 + }, + { + "epoch": 0.8493796557760372, + "grad_norm": 0.8811076283454895, + "learning_rate": 6.174402621500297e-05, + "loss": 0.6894, + "step": 132950 + }, + { + "epoch": 0.849443542925776, + "grad_norm": 1.6308493614196777, + "learning_rate": 6.173914884437777e-05, + "loss": 1.1379, + "step": 132960 + }, + { + "epoch": 0.8495074300755147, + "grad_norm": 0.7950766682624817, + "learning_rate": 6.173427135552917e-05, + "loss": 0.8812, + "step": 132970 + }, + { + "epoch": 0.8495713172252534, + "grad_norm": 0.5945727229118347, + "learning_rate": 6.172939374850633e-05, + "loss": 0.7214, + "step": 132980 + }, + { + "epoch": 0.849635204374992, + "grad_norm": 0.9707133173942566, + "learning_rate": 6.172451602335833e-05, + "loss": 0.8997, + "step": 132990 + }, + { + "epoch": 0.8496990915247307, + "grad_norm": 0.5925787091255188, + "learning_rate": 6.17196381801343e-05, + "loss": 0.6871, + "step": 133000 + }, + { + "epoch": 0.8497629786744694, + "grad_norm": 0.6003409624099731, + "learning_rate": 6.171476021888341e-05, + "loss": 0.8482, + "step": 133010 + }, + { + "epoch": 0.8498268658242081, + "grad_norm": 0.6482333540916443, + "learning_rate": 6.170988213965471e-05, + "loss": 0.912, + "step": 133020 + }, + { + "epoch": 0.8498907529739468, + "grad_norm": 0.6910186409950256, + "learning_rate": 6.170500394249739e-05, + "loss": 0.804, + "step": 133030 + }, + { + "epoch": 0.8499546401236855, + "grad_norm": 0.9655617475509644, + "learning_rate": 6.170012562746056e-05, + "loss": 0.8942, + "step": 133040 + }, + { + "epoch": 0.8500185272734242, + "grad_norm": 0.7532824873924255, + "learning_rate": 6.169524719459334e-05, + "loss": 0.6668, + "step": 133050 + }, + { + "epoch": 0.8500824144231629, + "grad_norm": 1.318796992301941, + "learning_rate": 6.169036864394485e-05, + "loss": 0.9058, + "step": 133060 + }, + { + "epoch": 0.8501463015729016, + "grad_norm": 0.89380943775177, + "learning_rate": 6.168548997556425e-05, + "loss": 0.8642, + "step": 133070 + }, + { + "epoch": 0.8502101887226403, + "grad_norm": 1.309441328048706, + "learning_rate": 6.168061118950063e-05, + "loss": 0.9597, + "step": 133080 + }, + { + "epoch": 0.850274075872379, + "grad_norm": 0.8857962489128113, + "learning_rate": 6.167573228580317e-05, + "loss": 0.7761, + "step": 133090 + }, + { + "epoch": 0.8503379630221177, + "grad_norm": 0.7629507780075073, + "learning_rate": 6.167085326452098e-05, + "loss": 0.9627, + "step": 133100 + }, + { + "epoch": 0.8504018501718564, + "grad_norm": 0.7469977140426636, + "learning_rate": 6.16659741257032e-05, + "loss": 0.8373, + "step": 133110 + }, + { + "epoch": 0.8504657373215951, + "grad_norm": 0.9111135005950928, + "learning_rate": 6.166109486939898e-05, + "loss": 0.8188, + "step": 133120 + }, + { + "epoch": 0.8505296244713338, + "grad_norm": 0.9041001796722412, + "learning_rate": 6.165621549565742e-05, + "loss": 1.0147, + "step": 133130 + }, + { + "epoch": 0.8505935116210726, + 
"grad_norm": 0.7008116245269775, + "learning_rate": 6.16513360045277e-05, + "loss": 0.8531, + "step": 133140 + }, + { + "epoch": 0.8506573987708113, + "grad_norm": NaN, + "learning_rate": 6.164694436218468e-05, + "loss": 0.9906, + "step": 133150 + }, + { + "epoch": 0.85072128592055, + "grad_norm": 0.6200425028800964, + "learning_rate": 6.164206464815282e-05, + "loss": 0.9371, + "step": 133160 + }, + { + "epoch": 0.8507851730702887, + "grad_norm": 1.1376943588256836, + "learning_rate": 6.16371848168753e-05, + "loss": 0.7415, + "step": 133170 + }, + { + "epoch": 0.8508490602200274, + "grad_norm": 0.8951854109764099, + "learning_rate": 6.163230486840124e-05, + "loss": 0.9746, + "step": 133180 + }, + { + "epoch": 0.8509129473697661, + "grad_norm": 0.6957682967185974, + "learning_rate": 6.162742480277984e-05, + "loss": 0.8415, + "step": 133190 + }, + { + "epoch": 0.8509768345195048, + "grad_norm": 0.7306455373764038, + "learning_rate": 6.162254462006018e-05, + "loss": 1.1689, + "step": 133200 + }, + { + "epoch": 0.8510407216692435, + "grad_norm": 0.8031535744667053, + "learning_rate": 6.161766432029146e-05, + "loss": 0.9466, + "step": 133210 + }, + { + "epoch": 0.8511046088189822, + "grad_norm": 0.6348420977592468, + "learning_rate": 6.16127839035228e-05, + "loss": 1.0381, + "step": 133220 + }, + { + "epoch": 0.8511684959687209, + "grad_norm": 1.0873243808746338, + "learning_rate": 6.160790336980335e-05, + "loss": 0.9977, + "step": 133230 + }, + { + "epoch": 0.8512323831184595, + "grad_norm": 0.8845553994178772, + "learning_rate": 6.160302271918229e-05, + "loss": 0.8587, + "step": 133240 + }, + { + "epoch": 0.8512962702681982, + "grad_norm": 0.7557221055030823, + "learning_rate": 6.159814195170876e-05, + "loss": 0.7288, + "step": 133250 + }, + { + "epoch": 0.8513601574179369, + "grad_norm": 0.9131662845611572, + "learning_rate": 6.159326106743188e-05, + "loss": 0.8813, + "step": 133260 + }, + { + "epoch": 0.8514240445676756, + "grad_norm": 1.1992850303649902, + "learning_rate": 6.158838006640086e-05, + "loss": 0.8098, + "step": 133270 + }, + { + "epoch": 0.8514879317174143, + "grad_norm": 0.9900951385498047, + "learning_rate": 6.15834989486648e-05, + "loss": 0.8796, + "step": 133280 + }, + { + "epoch": 0.851551818867153, + "grad_norm": 1.0529921054840088, + "learning_rate": 6.15786177142729e-05, + "loss": 1.2292, + "step": 133290 + }, + { + "epoch": 0.8516157060168917, + "grad_norm": 0.8537276983261108, + "learning_rate": 6.15737363632743e-05, + "loss": 0.7625, + "step": 133300 + }, + { + "epoch": 0.8516795931666304, + "grad_norm": 0.9666260480880737, + "learning_rate": 6.156885489571816e-05, + "loss": 1.091, + "step": 133310 + }, + { + "epoch": 0.8517434803163692, + "grad_norm": 0.8413388729095459, + "learning_rate": 6.156397331165364e-05, + "loss": 0.8328, + "step": 133320 + }, + { + "epoch": 0.8518073674661079, + "grad_norm": 0.5855796933174133, + "learning_rate": 6.155909161112992e-05, + "loss": 0.9068, + "step": 133330 + }, + { + "epoch": 0.8518712546158466, + "grad_norm": 1.3463701009750366, + "learning_rate": 6.155420979419612e-05, + "loss": 0.88, + "step": 133340 + }, + { + "epoch": 0.8519351417655853, + "grad_norm": 0.8949428796768188, + "learning_rate": 6.154932786090146e-05, + "loss": 0.9982, + "step": 133350 + }, + { + "epoch": 0.851999028915324, + "grad_norm": 1.747206687927246, + "learning_rate": 6.154444581129506e-05, + "loss": 1.1201, + "step": 133360 + }, + { + "epoch": 0.8520629160650627, + "grad_norm": 2.57167911529541, + "learning_rate": 6.153956364542612e-05, + "loss": 
0.5995, + "step": 133370 + }, + { + "epoch": 0.8521268032148014, + "grad_norm": 0.8123181462287903, + "learning_rate": 6.153468136334377e-05, + "loss": 0.7424, + "step": 133380 + }, + { + "epoch": 0.8521906903645401, + "grad_norm": 0.9312867522239685, + "learning_rate": 6.15297989650972e-05, + "loss": 0.7271, + "step": 133390 + }, + { + "epoch": 0.8522545775142788, + "grad_norm": 0.9896630644798279, + "learning_rate": 6.15249164507356e-05, + "loss": 0.7866, + "step": 133400 + }, + { + "epoch": 0.8523184646640175, + "grad_norm": 1.394911527633667, + "learning_rate": 6.152003382030809e-05, + "loss": 0.9573, + "step": 133410 + }, + { + "epoch": 0.8523823518137562, + "grad_norm": 0.9811311960220337, + "learning_rate": 6.151515107386389e-05, + "loss": 1.0954, + "step": 133420 + }, + { + "epoch": 0.8524462389634949, + "grad_norm": 1.180068016052246, + "learning_rate": 6.15107565029105e-05, + "loss": 1.2012, + "step": 133430 + }, + { + "epoch": 0.8525101261132336, + "grad_norm": 1.4226782321929932, + "learning_rate": 6.150587353617002e-05, + "loss": 0.6689, + "step": 133440 + }, + { + "epoch": 0.8525740132629723, + "grad_norm": 0.7661568522453308, + "learning_rate": 6.150099045355547e-05, + "loss": 1.1808, + "step": 133450 + }, + { + "epoch": 0.852637900412711, + "grad_norm": 1.3695670366287231, + "learning_rate": 6.149610725511597e-05, + "loss": 0.8527, + "step": 133460 + }, + { + "epoch": 0.8527017875624497, + "grad_norm": 1.424774408340454, + "learning_rate": 6.149122394090073e-05, + "loss": 0.7823, + "step": 133470 + }, + { + "epoch": 0.8527656747121883, + "grad_norm": 1.2621248960494995, + "learning_rate": 6.148634051095893e-05, + "loss": 0.9556, + "step": 133480 + }, + { + "epoch": 0.852829561861927, + "grad_norm": 0.7116201519966125, + "learning_rate": 6.148145696533973e-05, + "loss": 0.8582, + "step": 133490 + }, + { + "epoch": 0.8528934490116657, + "grad_norm": 1.219441533088684, + "learning_rate": 6.147657330409234e-05, + "loss": 0.8675, + "step": 133500 + }, + { + "epoch": 0.8529573361614045, + "grad_norm": 0.8610523343086243, + "learning_rate": 6.147168952726593e-05, + "loss": 0.9913, + "step": 133510 + }, + { + "epoch": 0.8530212233111432, + "grad_norm": 1.2600919008255005, + "learning_rate": 6.146680563490968e-05, + "loss": 1.2457, + "step": 133520 + }, + { + "epoch": 0.8530851104608819, + "grad_norm": 0.7178075909614563, + "learning_rate": 6.146192162707275e-05, + "loss": 1.0738, + "step": 133530 + }, + { + "epoch": 0.8531489976106206, + "grad_norm": 0.7833428382873535, + "learning_rate": 6.145703750380439e-05, + "loss": 0.8051, + "step": 133540 + }, + { + "epoch": 0.8532128847603593, + "grad_norm": 0.6498239636421204, + "learning_rate": 6.145215326515375e-05, + "loss": 0.836, + "step": 133550 + }, + { + "epoch": 0.853276771910098, + "grad_norm": 1.1565107107162476, + "learning_rate": 6.144726891117e-05, + "loss": 0.6755, + "step": 133560 + }, + { + "epoch": 0.8533406590598367, + "grad_norm": 0.6840099692344666, + "learning_rate": 6.144238444190236e-05, + "loss": 0.9921, + "step": 133570 + }, + { + "epoch": 0.8534045462095754, + "grad_norm": 0.7863107323646545, + "learning_rate": 6.143749985740001e-05, + "loss": 0.8842, + "step": 133580 + }, + { + "epoch": 0.8534684333593141, + "grad_norm": 1.3972042798995972, + "learning_rate": 6.143261515771214e-05, + "loss": 0.7173, + "step": 133590 + }, + { + "epoch": 0.8535323205090528, + "grad_norm": 0.5784814357757568, + "learning_rate": 6.142773034288794e-05, + "loss": 1.0078, + "step": 133600 + }, + { + "epoch": 0.8535962076587915, + 
"grad_norm": 0.8995794057846069, + "learning_rate": 6.14228454129766e-05, + "loss": 0.7556, + "step": 133610 + }, + { + "epoch": 0.8536600948085302, + "grad_norm": 1.020451307296753, + "learning_rate": 6.141796036802734e-05, + "loss": 0.917, + "step": 133620 + }, + { + "epoch": 0.8537239819582689, + "grad_norm": 0.7589079141616821, + "learning_rate": 6.141307520808934e-05, + "loss": 0.7353, + "step": 133630 + }, + { + "epoch": 0.8537878691080076, + "grad_norm": 0.9039588570594788, + "learning_rate": 6.14081899332118e-05, + "loss": 1.0483, + "step": 133640 + }, + { + "epoch": 0.8538517562577463, + "grad_norm": 0.9937171339988708, + "learning_rate": 6.140330454344391e-05, + "loss": 0.7808, + "step": 133650 + }, + { + "epoch": 0.853915643407485, + "grad_norm": 2.9743402004241943, + "learning_rate": 6.139841903883488e-05, + "loss": 0.6187, + "step": 133660 + }, + { + "epoch": 0.8539795305572238, + "grad_norm": 0.9731509685516357, + "learning_rate": 6.139353341943391e-05, + "loss": 1.0048, + "step": 133670 + }, + { + "epoch": 0.8540434177069625, + "grad_norm": 0.7726428508758545, + "learning_rate": 6.13886476852902e-05, + "loss": 0.8544, + "step": 133680 + }, + { + "epoch": 0.8541073048567012, + "grad_norm": 1.0278613567352295, + "learning_rate": 6.138376183645295e-05, + "loss": 0.8038, + "step": 133690 + }, + { + "epoch": 0.8541711920064399, + "grad_norm": 0.9102327823638916, + "learning_rate": 6.137887587297138e-05, + "loss": 0.8712, + "step": 133700 + }, + { + "epoch": 0.8542350791561786, + "grad_norm": 0.6600117683410645, + "learning_rate": 6.137398979489468e-05, + "loss": 1.0407, + "step": 133710 + }, + { + "epoch": 0.8542989663059172, + "grad_norm": 1.2443808317184448, + "learning_rate": 6.136910360227207e-05, + "loss": 0.9258, + "step": 133720 + }, + { + "epoch": 0.8543628534556559, + "grad_norm": 0.6546837687492371, + "learning_rate": 6.136421729515275e-05, + "loss": 0.8981, + "step": 133730 + }, + { + "epoch": 0.8544267406053946, + "grad_norm": 0.5999804139137268, + "learning_rate": 6.135933087358591e-05, + "loss": 0.8701, + "step": 133740 + }, + { + "epoch": 0.8544906277551333, + "grad_norm": 0.6947192549705505, + "learning_rate": 6.135444433762081e-05, + "loss": 0.8316, + "step": 133750 + }, + { + "epoch": 0.854554514904872, + "grad_norm": 1.474822759628296, + "learning_rate": 6.134955768730663e-05, + "loss": 0.806, + "step": 133760 + }, + { + "epoch": 0.8546184020546107, + "grad_norm": 0.8612034320831299, + "learning_rate": 6.134467092269257e-05, + "loss": 0.7773, + "step": 133770 + }, + { + "epoch": 0.8546822892043494, + "grad_norm": 1.0290131568908691, + "learning_rate": 6.133978404382786e-05, + "loss": 0.8817, + "step": 133780 + }, + { + "epoch": 0.8547461763540881, + "grad_norm": 0.7579601407051086, + "learning_rate": 6.133489705076172e-05, + "loss": 0.9595, + "step": 133790 + }, + { + "epoch": 0.8548100635038268, + "grad_norm": 0.7710822224617004, + "learning_rate": 6.133000994354337e-05, + "loss": 0.8086, + "step": 133800 + }, + { + "epoch": 0.8548739506535655, + "grad_norm": 0.9965303540229797, + "learning_rate": 6.1325122722222e-05, + "loss": 0.8848, + "step": 133810 + }, + { + "epoch": 0.8549378378033042, + "grad_norm": 0.9291059970855713, + "learning_rate": 6.132023538684687e-05, + "loss": 0.7339, + "step": 133820 + }, + { + "epoch": 0.8550017249530429, + "grad_norm": 0.7711076140403748, + "learning_rate": 6.131534793746716e-05, + "loss": 0.9099, + "step": 133830 + }, + { + "epoch": 0.8550656121027816, + "grad_norm": 0.9747552871704102, + "learning_rate": 
6.131046037413211e-05, + "loss": 1.076, + "step": 133840 + }, + { + "epoch": 0.8551294992525204, + "grad_norm": 0.6776683330535889, + "learning_rate": 6.130557269689092e-05, + "loss": 0.9466, + "step": 133850 + }, + { + "epoch": 0.8551933864022591, + "grad_norm": 2.5817158222198486, + "learning_rate": 6.130068490579286e-05, + "loss": 0.7916, + "step": 133860 + }, + { + "epoch": 0.8552572735519978, + "grad_norm": 0.8578413724899292, + "learning_rate": 6.129579700088711e-05, + "loss": 0.8225, + "step": 133870 + }, + { + "epoch": 0.8553211607017365, + "grad_norm": 0.8771921992301941, + "learning_rate": 6.129090898222291e-05, + "loss": 0.8542, + "step": 133880 + }, + { + "epoch": 0.8553850478514752, + "grad_norm": 0.8163079023361206, + "learning_rate": 6.128602084984951e-05, + "loss": 0.7394, + "step": 133890 + }, + { + "epoch": 0.8554489350012139, + "grad_norm": 1.6026225090026855, + "learning_rate": 6.128113260381611e-05, + "loss": 0.7461, + "step": 133900 + }, + { + "epoch": 0.8555128221509526, + "grad_norm": 1.152044415473938, + "learning_rate": 6.127624424417193e-05, + "loss": 0.9446, + "step": 133910 + }, + { + "epoch": 0.8555767093006913, + "grad_norm": 0.7472025156021118, + "learning_rate": 6.127135577096623e-05, + "loss": 0.7461, + "step": 133920 + }, + { + "epoch": 0.85564059645043, + "grad_norm": 0.7701200246810913, + "learning_rate": 6.126646718424822e-05, + "loss": 0.9566, + "step": 133930 + }, + { + "epoch": 0.8557044836001687, + "grad_norm": 0.626395583152771, + "learning_rate": 6.126157848406712e-05, + "loss": 0.6947, + "step": 133940 + }, + { + "epoch": 0.8557683707499074, + "grad_norm": 1.1391795873641968, + "learning_rate": 6.12566896704722e-05, + "loss": 0.8568, + "step": 133950 + }, + { + "epoch": 0.8558322578996461, + "grad_norm": 1.0339782238006592, + "learning_rate": 6.125180074351269e-05, + "loss": 0.9725, + "step": 133960 + }, + { + "epoch": 0.8558961450493847, + "grad_norm": 0.8343575596809387, + "learning_rate": 6.12469117032378e-05, + "loss": 1.1068, + "step": 133970 + }, + { + "epoch": 0.8559600321991234, + "grad_norm": 0.4701806604862213, + "learning_rate": 6.124202254969678e-05, + "loss": 0.7032, + "step": 133980 + }, + { + "epoch": 0.8560239193488621, + "grad_norm": 0.6901923418045044, + "learning_rate": 6.123713328293887e-05, + "loss": 0.9095, + "step": 133990 + }, + { + "epoch": 0.8560878064986008, + "grad_norm": 1.137757658958435, + "learning_rate": 6.123224390301329e-05, + "loss": 0.8695, + "step": 134000 + }, + { + "epoch": 0.8561516936483395, + "grad_norm": 1.1090092658996582, + "learning_rate": 6.122735440996931e-05, + "loss": 1.1309, + "step": 134010 + }, + { + "epoch": 0.8562155807980782, + "grad_norm": 0.6597867608070374, + "learning_rate": 6.122246480385616e-05, + "loss": 1.0033, + "step": 134020 + }, + { + "epoch": 0.856279467947817, + "grad_norm": 0.6848984360694885, + "learning_rate": 6.121757508472308e-05, + "loss": 0.9416, + "step": 134030 + }, + { + "epoch": 0.8563433550975557, + "grad_norm": 0.9123812913894653, + "learning_rate": 6.12126852526193e-05, + "loss": 0.98, + "step": 134040 + }, + { + "epoch": 0.8564072422472944, + "grad_norm": 0.5774307250976562, + "learning_rate": 6.120779530759409e-05, + "loss": 0.7818, + "step": 134050 + }, + { + "epoch": 0.8564711293970331, + "grad_norm": 1.0986335277557373, + "learning_rate": 6.120290524969668e-05, + "loss": 0.7268, + "step": 134060 + }, + { + "epoch": 0.8565350165467718, + "grad_norm": 4.184600830078125, + "learning_rate": 6.119801507897634e-05, + "loss": 1.0409, + "step": 134070 + }, + { + 
"epoch": 0.8565989036965105, + "grad_norm": 0.8018998503684998, + "learning_rate": 6.119312479548229e-05, + "loss": 0.9884, + "step": 134080 + }, + { + "epoch": 0.8566627908462492, + "grad_norm": 0.6833622455596924, + "learning_rate": 6.118823439926379e-05, + "loss": 0.7314, + "step": 134090 + }, + { + "epoch": 0.8567266779959879, + "grad_norm": 3.289335012435913, + "learning_rate": 6.118334389037008e-05, + "loss": 1.3275, + "step": 134100 + }, + { + "epoch": 0.8567905651457266, + "grad_norm": 2.1931798458099365, + "learning_rate": 6.117845326885043e-05, + "loss": 1.194, + "step": 134110 + }, + { + "epoch": 0.8568544522954653, + "grad_norm": 0.985336422920227, + "learning_rate": 6.11735625347541e-05, + "loss": 0.9866, + "step": 134120 + }, + { + "epoch": 0.856918339445204, + "grad_norm": 1.144832968711853, + "learning_rate": 6.116867168813031e-05, + "loss": 0.7581, + "step": 134130 + }, + { + "epoch": 0.8569822265949427, + "grad_norm": 0.8992478251457214, + "learning_rate": 6.116378072902833e-05, + "loss": 0.7862, + "step": 134140 + }, + { + "epoch": 0.8570461137446814, + "grad_norm": 0.4670437276363373, + "learning_rate": 6.115888965749744e-05, + "loss": 1.1326, + "step": 134150 + }, + { + "epoch": 0.8571100008944201, + "grad_norm": 0.688841700553894, + "learning_rate": 6.115399847358685e-05, + "loss": 0.7873, + "step": 134160 + }, + { + "epoch": 0.8571738880441588, + "grad_norm": 1.1751261949539185, + "learning_rate": 6.114910717734586e-05, + "loss": 0.8015, + "step": 134170 + }, + { + "epoch": 0.8572377751938975, + "grad_norm": 0.8896322846412659, + "learning_rate": 6.114421576882372e-05, + "loss": 1.0137, + "step": 134180 + }, + { + "epoch": 0.8573016623436363, + "grad_norm": 0.692790150642395, + "learning_rate": 6.113932424806969e-05, + "loss": 0.8688, + "step": 134190 + }, + { + "epoch": 0.857365549493375, + "grad_norm": 0.8335268497467041, + "learning_rate": 6.113443261513302e-05, + "loss": 0.9654, + "step": 134200 + }, + { + "epoch": 0.8574294366431136, + "grad_norm": 0.880637526512146, + "learning_rate": 6.112954087006297e-05, + "loss": 0.769, + "step": 134210 + }, + { + "epoch": 0.8574933237928523, + "grad_norm": 0.6039393544197083, + "learning_rate": 6.112464901290882e-05, + "loss": 0.9527, + "step": 134220 + }, + { + "epoch": 0.857557210942591, + "grad_norm": 1.0236138105392456, + "learning_rate": 6.111975704371984e-05, + "loss": 1.0147, + "step": 134230 + }, + { + "epoch": 0.8576210980923297, + "grad_norm": 1.451583743095398, + "learning_rate": 6.111486496254528e-05, + "loss": 0.7267, + "step": 134240 + }, + { + "epoch": 0.8576849852420684, + "grad_norm": 0.8510944843292236, + "learning_rate": 6.110997276943442e-05, + "loss": 1.272, + "step": 134250 + }, + { + "epoch": 0.8577488723918071, + "grad_norm": 0.7636389136314392, + "learning_rate": 6.110508046443652e-05, + "loss": 0.7339, + "step": 134260 + }, + { + "epoch": 0.8578127595415458, + "grad_norm": 0.7402799725532532, + "learning_rate": 6.110018804760085e-05, + "loss": 0.7794, + "step": 134270 + }, + { + "epoch": 0.8578766466912845, + "grad_norm": 0.8340638279914856, + "learning_rate": 6.109529551897669e-05, + "loss": 0.8052, + "step": 134280 + }, + { + "epoch": 0.8579405338410232, + "grad_norm": 1.0177710056304932, + "learning_rate": 6.109040287861331e-05, + "loss": 0.9876, + "step": 134290 + }, + { + "epoch": 0.8580044209907619, + "grad_norm": 0.8329386711120605, + "learning_rate": 6.108551012655996e-05, + "loss": 0.8806, + "step": 134300 + }, + { + "epoch": 0.8580683081405006, + "grad_norm": 0.7960025072097778, + 
"learning_rate": 6.108061726286596e-05, + "loss": 0.6743, + "step": 134310 + }, + { + "epoch": 0.8581321952902393, + "grad_norm": 1.751345157623291, + "learning_rate": 6.107572428758053e-05, + "loss": 0.9132, + "step": 134320 + }, + { + "epoch": 0.858196082439978, + "grad_norm": 0.8473448753356934, + "learning_rate": 6.1070831200753e-05, + "loss": 0.8131, + "step": 134330 + }, + { + "epoch": 0.8582599695897167, + "grad_norm": 0.8581190705299377, + "learning_rate": 6.10659380024326e-05, + "loss": 0.8037, + "step": 134340 + }, + { + "epoch": 0.8583238567394554, + "grad_norm": 0.964256227016449, + "learning_rate": 6.106104469266865e-05, + "loss": 0.6935, + "step": 134350 + }, + { + "epoch": 0.8583877438891941, + "grad_norm": 1.3757505416870117, + "learning_rate": 6.105615127151039e-05, + "loss": 0.7474, + "step": 134360 + }, + { + "epoch": 0.8584516310389328, + "grad_norm": 0.7548801898956299, + "learning_rate": 6.105125773900712e-05, + "loss": 0.8156, + "step": 134370 + }, + { + "epoch": 0.8585155181886716, + "grad_norm": 0.4941951036453247, + "learning_rate": 6.104636409520814e-05, + "loss": 0.7144, + "step": 134380 + }, + { + "epoch": 0.8585794053384103, + "grad_norm": 0.6011403799057007, + "learning_rate": 6.10414703401627e-05, + "loss": 1.0064, + "step": 134390 + }, + { + "epoch": 0.858643292488149, + "grad_norm": 1.6302911043167114, + "learning_rate": 6.103657647392012e-05, + "loss": 0.9264, + "step": 134400 + }, + { + "epoch": 0.8587071796378877, + "grad_norm": 0.9661455154418945, + "learning_rate": 6.103168249652966e-05, + "loss": 1.068, + "step": 134410 + }, + { + "epoch": 0.8587710667876264, + "grad_norm": 1.3184993267059326, + "learning_rate": 6.1026788408040616e-05, + "loss": 0.8662, + "step": 134420 + }, + { + "epoch": 0.8588349539373651, + "grad_norm": 0.8495551943778992, + "learning_rate": 6.102189420850226e-05, + "loss": 0.8618, + "step": 134430 + }, + { + "epoch": 0.8588988410871038, + "grad_norm": 0.7962411642074585, + "learning_rate": 6.101699989796391e-05, + "loss": 1.0453, + "step": 134440 + }, + { + "epoch": 0.8589627282368424, + "grad_norm": 0.7486063241958618, + "learning_rate": 6.1012105476474835e-05, + "loss": 0.9581, + "step": 134450 + }, + { + "epoch": 0.8590266153865811, + "grad_norm": 0.8825688362121582, + "learning_rate": 6.100721094408434e-05, + "loss": 0.9215, + "step": 134460 + }, + { + "epoch": 0.8590905025363198, + "grad_norm": 0.9865175485610962, + "learning_rate": 6.100231630084169e-05, + "loss": 0.8866, + "step": 134470 + }, + { + "epoch": 0.8591543896860585, + "grad_norm": 0.7580648064613342, + "learning_rate": 6.099742154679621e-05, + "loss": 0.7733, + "step": 134480 + }, + { + "epoch": 0.8592182768357972, + "grad_norm": 0.9302807450294495, + "learning_rate": 6.099252668199718e-05, + "loss": 0.7856, + "step": 134490 + }, + { + "epoch": 0.8592821639855359, + "grad_norm": 0.8940306305885315, + "learning_rate": 6.098763170649389e-05, + "loss": 1.2527, + "step": 134500 + }, + { + "epoch": 0.8593460511352746, + "grad_norm": 1.2878268957138062, + "learning_rate": 6.0982736620335644e-05, + "loss": 0.8244, + "step": 134510 + }, + { + "epoch": 0.8594099382850133, + "grad_norm": 0.8814017176628113, + "learning_rate": 6.097784142357174e-05, + "loss": 1.0, + "step": 134520 + }, + { + "epoch": 0.859473825434752, + "grad_norm": 0.6660280823707581, + "learning_rate": 6.097294611625147e-05, + "loss": 0.7688, + "step": 134530 + }, + { + "epoch": 0.8595377125844907, + "grad_norm": 1.1975473165512085, + "learning_rate": 6.0968050698424154e-05, + "loss": 1.033, + "step": 
134540 + }, + { + "epoch": 0.8596015997342294, + "grad_norm": 0.822115421295166, + "learning_rate": 6.0963155170139066e-05, + "loss": 0.8708, + "step": 134550 + }, + { + "epoch": 0.8596654868839682, + "grad_norm": 0.9180407524108887, + "learning_rate": 6.095825953144553e-05, + "loss": 0.9883, + "step": 134560 + }, + { + "epoch": 0.8597293740337069, + "grad_norm": 1.0190486907958984, + "learning_rate": 6.095336378239284e-05, + "loss": 0.8116, + "step": 134570 + }, + { + "epoch": 0.8597932611834456, + "grad_norm": 0.6985743045806885, + "learning_rate": 6.094846792303029e-05, + "loss": 0.8544, + "step": 134580 + }, + { + "epoch": 0.8598571483331843, + "grad_norm": 0.9220016598701477, + "learning_rate": 6.0943571953407205e-05, + "loss": 1.1432, + "step": 134590 + }, + { + "epoch": 0.859921035482923, + "grad_norm": 1.1213401556015015, + "learning_rate": 6.093867587357288e-05, + "loss": 1.1309, + "step": 134600 + }, + { + "epoch": 0.8599849226326617, + "grad_norm": 0.7928663492202759, + "learning_rate": 6.093377968357663e-05, + "loss": 0.8537, + "step": 134610 + }, + { + "epoch": 0.8600488097824004, + "grad_norm": 1.3430203199386597, + "learning_rate": 6.092888338346775e-05, + "loss": 0.8679, + "step": 134620 + }, + { + "epoch": 0.8601126969321391, + "grad_norm": 0.7503795027732849, + "learning_rate": 6.0923986973295564e-05, + "loss": 1.0493, + "step": 134630 + }, + { + "epoch": 0.8601765840818778, + "grad_norm": 0.9265238046646118, + "learning_rate": 6.091909045310938e-05, + "loss": 0.9348, + "step": 134640 + }, + { + "epoch": 0.8602404712316165, + "grad_norm": 0.69620680809021, + "learning_rate": 6.091419382295851e-05, + "loss": 0.8313, + "step": 134650 + }, + { + "epoch": 0.8603043583813552, + "grad_norm": 0.8834882974624634, + "learning_rate": 6.090929708289227e-05, + "loss": 0.7323, + "step": 134660 + }, + { + "epoch": 0.8603682455310939, + "grad_norm": 0.9060829281806946, + "learning_rate": 6.0904400232959965e-05, + "loss": 0.8098, + "step": 134670 + }, + { + "epoch": 0.8604321326808326, + "grad_norm": 0.8208954930305481, + "learning_rate": 6.089950327321092e-05, + "loss": 0.9883, + "step": 134680 + }, + { + "epoch": 0.8604960198305712, + "grad_norm": 0.8429823517799377, + "learning_rate": 6.089460620369444e-05, + "loss": 0.9104, + "step": 134690 + }, + { + "epoch": 0.8605599069803099, + "grad_norm": 0.753400981426239, + "learning_rate": 6.088970902445985e-05, + "loss": 0.8429, + "step": 134700 + }, + { + "epoch": 0.8606237941300486, + "grad_norm": 1.0955417156219482, + "learning_rate": 6.088481173555648e-05, + "loss": 1.0817, + "step": 134710 + }, + { + "epoch": 0.8606876812797873, + "grad_norm": 1.585567831993103, + "learning_rate": 6.087991433703363e-05, + "loss": 0.8062, + "step": 134720 + }, + { + "epoch": 0.860751568429526, + "grad_norm": 0.6598794460296631, + "learning_rate": 6.0875016828940635e-05, + "loss": 0.8811, + "step": 134730 + }, + { + "epoch": 0.8608154555792648, + "grad_norm": 0.5704881548881531, + "learning_rate": 6.08701192113268e-05, + "loss": 0.7517, + "step": 134740 + }, + { + "epoch": 0.8608793427290035, + "grad_norm": 0.8759427666664124, + "learning_rate": 6.086522148424148e-05, + "loss": 0.874, + "step": 134750 + }, + { + "epoch": 0.8609432298787422, + "grad_norm": 0.7344384789466858, + "learning_rate": 6.086032364773396e-05, + "loss": 1.0344, + "step": 134760 + }, + { + "epoch": 0.8610071170284809, + "grad_norm": 1.3539067506790161, + "learning_rate": 6.0855425701853596e-05, + "loss": 1.1221, + "step": 134770 + }, + { + "epoch": 0.8610710041782196, + "grad_norm": 
0.85466068983078, + "learning_rate": 6.08505276466497e-05, + "loss": 0.9896, + "step": 134780 + }, + { + "epoch": 0.8611348913279583, + "grad_norm": 1.4604058265686035, + "learning_rate": 6.0845629482171626e-05, + "loss": 0.762, + "step": 134790 + }, + { + "epoch": 0.861198778477697, + "grad_norm": 0.6832066178321838, + "learning_rate": 6.084073120846866e-05, + "loss": 0.8548, + "step": 134800 + }, + { + "epoch": 0.8612626656274357, + "grad_norm": 0.6687494516372681, + "learning_rate": 6.083583282559016e-05, + "loss": 1.1136, + "step": 134810 + }, + { + "epoch": 0.8613265527771744, + "grad_norm": 0.9443023800849915, + "learning_rate": 6.083093433358544e-05, + "loss": 1.0468, + "step": 134820 + }, + { + "epoch": 0.8613904399269131, + "grad_norm": 0.6644616723060608, + "learning_rate": 6.082603573250384e-05, + "loss": 0.7965, + "step": 134830 + }, + { + "epoch": 0.8614543270766518, + "grad_norm": 0.6500226855278015, + "learning_rate": 6.0821137022394705e-05, + "loss": 1.0983, + "step": 134840 + }, + { + "epoch": 0.8615182142263905, + "grad_norm": 1.1436829566955566, + "learning_rate": 6.0816238203307355e-05, + "loss": 1.0032, + "step": 134850 + }, + { + "epoch": 0.8615821013761292, + "grad_norm": 0.5248997211456299, + "learning_rate": 6.081133927529112e-05, + "loss": 0.7732, + "step": 134860 + }, + { + "epoch": 0.8616459885258679, + "grad_norm": 0.8002848625183105, + "learning_rate": 6.0806440238395347e-05, + "loss": 0.8625, + "step": 134870 + }, + { + "epoch": 0.8617098756756066, + "grad_norm": 1.1468842029571533, + "learning_rate": 6.080154109266938e-05, + "loss": 0.6841, + "step": 134880 + }, + { + "epoch": 0.8617737628253453, + "grad_norm": 1.0037697553634644, + "learning_rate": 6.0796641838162546e-05, + "loss": 0.83, + "step": 134890 + }, + { + "epoch": 0.861837649975084, + "grad_norm": 1.227607250213623, + "learning_rate": 6.0791742474924175e-05, + "loss": 0.9535, + "step": 134900 + }, + { + "epoch": 0.8619015371248228, + "grad_norm": 0.9437126517295837, + "learning_rate": 6.0786843003003636e-05, + "loss": 1.2439, + "step": 134910 + }, + { + "epoch": 0.8619654242745615, + "grad_norm": 1.4104220867156982, + "learning_rate": 6.078194342245025e-05, + "loss": 1.1915, + "step": 134920 + }, + { + "epoch": 0.8620293114243002, + "grad_norm": 0.9898942112922668, + "learning_rate": 6.0777043733313375e-05, + "loss": 0.8426, + "step": 134930 + }, + { + "epoch": 0.8620931985740388, + "grad_norm": 1.2425917387008667, + "learning_rate": 6.077214393564234e-05, + "loss": 0.7399, + "step": 134940 + }, + { + "epoch": 0.8621570857237775, + "grad_norm": 0.8567221760749817, + "learning_rate": 6.07672440294865e-05, + "loss": 0.8017, + "step": 134950 + }, + { + "epoch": 0.8622209728735162, + "grad_norm": 0.8979184031486511, + "learning_rate": 6.07623440148952e-05, + "loss": 1.1169, + "step": 134960 + }, + { + "epoch": 0.8622848600232549, + "grad_norm": 0.6733188033103943, + "learning_rate": 6.075744389191778e-05, + "loss": 0.9605, + "step": 134970 + }, + { + "epoch": 0.8623487471729936, + "grad_norm": 1.6359256505966187, + "learning_rate": 6.0752543660603587e-05, + "loss": 0.7852, + "step": 134980 + }, + { + "epoch": 0.8624126343227323, + "grad_norm": 1.0377789735794067, + "learning_rate": 6.074764332100199e-05, + "loss": 1.1418, + "step": 134990 + }, + { + "epoch": 0.862476521472471, + "grad_norm": 0.5584946274757385, + "learning_rate": 6.074274287316232e-05, + "loss": 0.6663, + "step": 135000 + }, + { + "epoch": 0.8625404086222097, + "grad_norm": 0.936705470085144, + "learning_rate": 6.073784231713393e-05, 
+ "loss": 0.9404, + "step": 135010 + }, + { + "epoch": 0.8626042957719484, + "grad_norm": 0.8763816356658936, + "learning_rate": 6.0732941652966194e-05, + "loss": 0.7721, + "step": 135020 + }, + { + "epoch": 0.8626681829216871, + "grad_norm": 2.3810360431671143, + "learning_rate": 6.072804088070844e-05, + "loss": 0.6623, + "step": 135030 + }, + { + "epoch": 0.8627320700714258, + "grad_norm": 1.851711392402649, + "learning_rate": 6.0723140000410036e-05, + "loss": 0.8857, + "step": 135040 + }, + { + "epoch": 0.8627959572211645, + "grad_norm": 0.9787930846214294, + "learning_rate": 6.0718239012120334e-05, + "loss": 0.9424, + "step": 135050 + }, + { + "epoch": 0.8628598443709032, + "grad_norm": 1.1540073156356812, + "learning_rate": 6.071333791588868e-05, + "loss": 0.8247, + "step": 135060 + }, + { + "epoch": 0.862923731520642, + "grad_norm": 0.7594394087791443, + "learning_rate": 6.0708436711764464e-05, + "loss": 1.098, + "step": 135070 + }, + { + "epoch": 0.8629876186703807, + "grad_norm": 0.9598045349121094, + "learning_rate": 6.070353539979702e-05, + "loss": 0.7815, + "step": 135080 + }, + { + "epoch": 0.8630515058201194, + "grad_norm": 0.8562808632850647, + "learning_rate": 6.069863398003571e-05, + "loss": 0.9166, + "step": 135090 + }, + { + "epoch": 0.8631153929698581, + "grad_norm": 1.306075930595398, + "learning_rate": 6.0693732452529906e-05, + "loss": 0.7203, + "step": 135100 + }, + { + "epoch": 0.8631792801195968, + "grad_norm": 0.9534823298454285, + "learning_rate": 6.0688830817328955e-05, + "loss": 0.9033, + "step": 135110 + }, + { + "epoch": 0.8632431672693355, + "grad_norm": 0.8284388184547424, + "learning_rate": 6.068392907448224e-05, + "loss": 0.7617, + "step": 135120 + }, + { + "epoch": 0.8633070544190742, + "grad_norm": 0.9172964096069336, + "learning_rate": 6.067902722403912e-05, + "loss": 0.7122, + "step": 135130 + }, + { + "epoch": 0.8633709415688129, + "grad_norm": 1.129095435142517, + "learning_rate": 6.067412526604894e-05, + "loss": 1.0913, + "step": 135140 + }, + { + "epoch": 0.8634348287185516, + "grad_norm": 0.673621416091919, + "learning_rate": 6.06692232005611e-05, + "loss": 0.9686, + "step": 135150 + }, + { + "epoch": 0.8634987158682903, + "grad_norm": 0.5581203699111938, + "learning_rate": 6.066432102762495e-05, + "loss": 1.0076, + "step": 135160 + }, + { + "epoch": 0.863562603018029, + "grad_norm": 0.8796345591545105, + "learning_rate": 6.0659418747289864e-05, + "loss": 0.8523, + "step": 135170 + }, + { + "epoch": 0.8636264901677676, + "grad_norm": 0.9877477288246155, + "learning_rate": 6.06545163596052e-05, + "loss": 0.8171, + "step": 135180 + }, + { + "epoch": 0.8636903773175063, + "grad_norm": 1.176950216293335, + "learning_rate": 6.0649613864620345e-05, + "loss": 0.7933, + "step": 135190 + }, + { + "epoch": 0.863754264467245, + "grad_norm": 1.119374394416809, + "learning_rate": 6.064471126238467e-05, + "loss": 0.9705, + "step": 135200 + }, + { + "epoch": 0.8638181516169837, + "grad_norm": 0.757959246635437, + "learning_rate": 6.063980855294753e-05, + "loss": 0.9029, + "step": 135210 + }, + { + "epoch": 0.8638820387667224, + "grad_norm": 0.6737072467803955, + "learning_rate": 6.0634905736358326e-05, + "loss": 0.9489, + "step": 135220 + }, + { + "epoch": 0.8639459259164611, + "grad_norm": 0.7668609619140625, + "learning_rate": 6.063000281266641e-05, + "loss": 0.9851, + "step": 135230 + }, + { + "epoch": 0.8640098130661998, + "grad_norm": 0.9595603346824646, + "learning_rate": 6.062509978192118e-05, + "loss": 1.0468, + "step": 135240 + }, + { + "epoch": 
0.8640737002159385, + "grad_norm": 1.4497737884521484, + "learning_rate": 6.062019664417199e-05, + "loss": 0.9572, + "step": 135250 + }, + { + "epoch": 0.8641375873656773, + "grad_norm": 0.8604928851127625, + "learning_rate": 6.061529339946824e-05, + "loss": 0.9146, + "step": 135260 + }, + { + "epoch": 0.864201474515416, + "grad_norm": 0.8765950798988342, + "learning_rate": 6.061039004785929e-05, + "loss": 0.7102, + "step": 135270 + }, + { + "epoch": 0.8642653616651547, + "grad_norm": 0.523587703704834, + "learning_rate": 6.060548658939456e-05, + "loss": 0.7571, + "step": 135280 + }, + { + "epoch": 0.8643292488148934, + "grad_norm": 0.93330979347229, + "learning_rate": 6.0600583024123394e-05, + "loss": 0.859, + "step": 135290 + }, + { + "epoch": 0.8643931359646321, + "grad_norm": 0.9948300123214722, + "learning_rate": 6.059567935209518e-05, + "loss": 0.8673, + "step": 135300 + }, + { + "epoch": 0.8644570231143708, + "grad_norm": 0.9706994295120239, + "learning_rate": 6.059077557335931e-05, + "loss": 0.7599, + "step": 135310 + }, + { + "epoch": 0.8645209102641095, + "grad_norm": 1.1021097898483276, + "learning_rate": 6.058587168796517e-05, + "loss": 0.7995, + "step": 135320 + }, + { + "epoch": 0.8645847974138482, + "grad_norm": 0.8516930937767029, + "learning_rate": 6.058096769596213e-05, + "loss": 0.9658, + "step": 135330 + }, + { + "epoch": 0.8646486845635869, + "grad_norm": 0.8828617930412292, + "learning_rate": 6.0576063597399615e-05, + "loss": 0.7387, + "step": 135340 + }, + { + "epoch": 0.8647125717133256, + "grad_norm": 0.8962679505348206, + "learning_rate": 6.0571159392326974e-05, + "loss": 1.0775, + "step": 135350 + }, + { + "epoch": 0.8647764588630643, + "grad_norm": 0.678126871585846, + "learning_rate": 6.056625508079361e-05, + "loss": 0.9041, + "step": 135360 + }, + { + "epoch": 0.864840346012803, + "grad_norm": 1.034623146057129, + "learning_rate": 6.056135066284893e-05, + "loss": 1.1352, + "step": 135370 + }, + { + "epoch": 0.8649042331625417, + "grad_norm": 0.7552897930145264, + "learning_rate": 6.05564461385423e-05, + "loss": 0.9906, + "step": 135380 + }, + { + "epoch": 0.8649681203122804, + "grad_norm": 1.2172116041183472, + "learning_rate": 6.055154150792313e-05, + "loss": 0.8446, + "step": 135390 + }, + { + "epoch": 0.8650320074620191, + "grad_norm": 0.5961598753929138, + "learning_rate": 6.054663677104081e-05, + "loss": 0.7268, + "step": 135400 + }, + { + "epoch": 0.8650958946117578, + "grad_norm": 0.595866322517395, + "learning_rate": 6.0541731927944734e-05, + "loss": 0.7465, + "step": 135410 + }, + { + "epoch": 0.8651597817614964, + "grad_norm": 0.8760963678359985, + "learning_rate": 6.0536826978684294e-05, + "loss": 0.8472, + "step": 135420 + }, + { + "epoch": 0.8652236689112351, + "grad_norm": 0.7106996774673462, + "learning_rate": 6.0531921923308874e-05, + "loss": 0.979, + "step": 135430 + }, + { + "epoch": 0.8652875560609739, + "grad_norm": 0.6956402063369751, + "learning_rate": 6.052701676186791e-05, + "loss": 0.9007, + "step": 135440 + }, + { + "epoch": 0.8653514432107126, + "grad_norm": 2.090533971786499, + "learning_rate": 6.0522111494410785e-05, + "loss": 0.8266, + "step": 135450 + }, + { + "epoch": 0.8654153303604513, + "grad_norm": 0.7869872450828552, + "learning_rate": 6.051720612098688e-05, + "loss": 0.6341, + "step": 135460 + }, + { + "epoch": 0.86547921751019, + "grad_norm": 0.6018970012664795, + "learning_rate": 6.051230064164561e-05, + "loss": 0.7308, + "step": 135470 + }, + { + "epoch": 0.8655431046599287, + "grad_norm": 0.8730195164680481, + 
"learning_rate": 6.050739505643639e-05, + "loss": 1.086, + "step": 135480 + }, + { + "epoch": 0.8656069918096674, + "grad_norm": 0.8639483451843262, + "learning_rate": 6.050248936540861e-05, + "loss": 1.0237, + "step": 135490 + }, + { + "epoch": 0.8656708789594061, + "grad_norm": 0.7378480434417725, + "learning_rate": 6.0497583568611674e-05, + "loss": 0.8411, + "step": 135500 + }, + { + "epoch": 0.8657347661091448, + "grad_norm": 0.876330554485321, + "learning_rate": 6.049267766609499e-05, + "loss": 1.0058, + "step": 135510 + }, + { + "epoch": 0.8657986532588835, + "grad_norm": 0.6682674884796143, + "learning_rate": 6.0487771657907974e-05, + "loss": 0.8205, + "step": 135520 + }, + { + "epoch": 0.8658625404086222, + "grad_norm": 1.085636019706726, + "learning_rate": 6.048286554410001e-05, + "loss": 0.9317, + "step": 135530 + }, + { + "epoch": 0.8659264275583609, + "grad_norm": 0.9509056210517883, + "learning_rate": 6.047795932472052e-05, + "loss": 0.7042, + "step": 135540 + }, + { + "epoch": 0.8659903147080996, + "grad_norm": 1.0311496257781982, + "learning_rate": 6.0473052999818925e-05, + "loss": 1.0667, + "step": 135550 + }, + { + "epoch": 0.8660542018578383, + "grad_norm": 1.0395874977111816, + "learning_rate": 6.0468146569444615e-05, + "loss": 1.0347, + "step": 135560 + }, + { + "epoch": 0.866118089007577, + "grad_norm": 0.933964729309082, + "learning_rate": 6.0463240033647025e-05, + "loss": 0.9422, + "step": 135570 + }, + { + "epoch": 0.8661819761573157, + "grad_norm": 0.8697935342788696, + "learning_rate": 6.045833339247555e-05, + "loss": 1.1477, + "step": 135580 + }, + { + "epoch": 0.8662458633070544, + "grad_norm": 0.7333647012710571, + "learning_rate": 6.045342664597959e-05, + "loss": 0.8173, + "step": 135590 + }, + { + "epoch": 0.8663097504567931, + "grad_norm": 0.761461079120636, + "learning_rate": 6.04485197942086e-05, + "loss": 0.9989, + "step": 135600 + }, + { + "epoch": 0.8663736376065319, + "grad_norm": 0.7777496576309204, + "learning_rate": 6.0443612837211984e-05, + "loss": 0.9211, + "step": 135610 + }, + { + "epoch": 0.8664375247562706, + "grad_norm": 0.8241527080535889, + "learning_rate": 6.043870577503914e-05, + "loss": 0.8573, + "step": 135620 + }, + { + "epoch": 0.8665014119060093, + "grad_norm": 1.33556067943573, + "learning_rate": 6.04337986077395e-05, + "loss": 0.88, + "step": 135630 + }, + { + "epoch": 0.866565299055748, + "grad_norm": 0.8477666974067688, + "learning_rate": 6.0428891335362484e-05, + "loss": 0.9298, + "step": 135640 + }, + { + "epoch": 0.8666291862054867, + "grad_norm": 1.5744928121566772, + "learning_rate": 6.0423983957957505e-05, + "loss": 0.9288, + "step": 135650 + }, + { + "epoch": 0.8666930733552254, + "grad_norm": 1.1519935131072998, + "learning_rate": 6.041907647557399e-05, + "loss": 0.9664, + "step": 135660 + }, + { + "epoch": 0.866756960504964, + "grad_norm": 0.649913489818573, + "learning_rate": 6.041416888826137e-05, + "loss": 0.9266, + "step": 135670 + }, + { + "epoch": 0.8668208476547027, + "grad_norm": 0.9466597437858582, + "learning_rate": 6.040926119606906e-05, + "loss": 0.8896, + "step": 135680 + }, + { + "epoch": 0.8668847348044414, + "grad_norm": 0.5539588332176208, + "learning_rate": 6.040435339904646e-05, + "loss": 0.9554, + "step": 135690 + }, + { + "epoch": 0.8669486219541801, + "grad_norm": 1.099380373954773, + "learning_rate": 6.039944549724305e-05, + "loss": 0.8488, + "step": 135700 + }, + { + "epoch": 0.8670125091039188, + "grad_norm": 0.6397864818572998, + "learning_rate": 6.0394537490708216e-05, + "loss": 0.9452, + 
"step": 135710 + }, + { + "epoch": 0.8670763962536575, + "grad_norm": 0.7527474164962769, + "learning_rate": 6.0389629379491395e-05, + "loss": 1.1831, + "step": 135720 + }, + { + "epoch": 0.8671402834033962, + "grad_norm": 1.0391385555267334, + "learning_rate": 6.0384721163642024e-05, + "loss": 0.951, + "step": 135730 + }, + { + "epoch": 0.8672041705531349, + "grad_norm": 0.9291607737541199, + "learning_rate": 6.0379812843209515e-05, + "loss": 0.8248, + "step": 135740 + }, + { + "epoch": 0.8672680577028736, + "grad_norm": 0.8267570734024048, + "learning_rate": 6.0374904418243315e-05, + "loss": 0.7667, + "step": 135750 + }, + { + "epoch": 0.8673319448526123, + "grad_norm": 1.5941237211227417, + "learning_rate": 6.0369995888792863e-05, + "loss": 0.8021, + "step": 135760 + }, + { + "epoch": 0.867395832002351, + "grad_norm": 0.5440096259117126, + "learning_rate": 6.036508725490757e-05, + "loss": 0.6538, + "step": 135770 + }, + { + "epoch": 0.8674597191520897, + "grad_norm": 1.1164497137069702, + "learning_rate": 6.036017851663689e-05, + "loss": 0.719, + "step": 135780 + }, + { + "epoch": 0.8675236063018285, + "grad_norm": 0.8377860188484192, + "learning_rate": 6.035526967403023e-05, + "loss": 0.9899, + "step": 135790 + }, + { + "epoch": 0.8675874934515672, + "grad_norm": 0.7305436730384827, + "learning_rate": 6.035036072713707e-05, + "loss": 0.744, + "step": 135800 + }, + { + "epoch": 0.8676513806013059, + "grad_norm": 1.4228670597076416, + "learning_rate": 6.034545167600682e-05, + "loss": 0.8398, + "step": 135810 + }, + { + "epoch": 0.8677152677510446, + "grad_norm": 0.7904695868492126, + "learning_rate": 6.0340542520688904e-05, + "loss": 0.8431, + "step": 135820 + }, + { + "epoch": 0.8677791549007833, + "grad_norm": 1.2709144353866577, + "learning_rate": 6.03356332612328e-05, + "loss": 0.8764, + "step": 135830 + }, + { + "epoch": 0.867843042050522, + "grad_norm": 0.8756301999092102, + "learning_rate": 6.03307238976879e-05, + "loss": 0.782, + "step": 135840 + }, + { + "epoch": 0.8679069292002607, + "grad_norm": 0.5899653434753418, + "learning_rate": 6.03258144301037e-05, + "loss": 0.7944, + "step": 135850 + }, + { + "epoch": 0.8679708163499994, + "grad_norm": 0.976030707359314, + "learning_rate": 6.03209048585296e-05, + "loss": 0.8582, + "step": 135860 + }, + { + "epoch": 0.8680347034997381, + "grad_norm": 1.095521092414856, + "learning_rate": 6.0315995183015064e-05, + "loss": 0.9034, + "step": 135870 + }, + { + "epoch": 0.8680985906494768, + "grad_norm": 0.8074119091033936, + "learning_rate": 6.031108540360954e-05, + "loss": 0.8167, + "step": 135880 + }, + { + "epoch": 0.8681624777992155, + "grad_norm": 1.544575810432434, + "learning_rate": 6.0306175520362454e-05, + "loss": 0.8362, + "step": 135890 + }, + { + "epoch": 0.8682263649489542, + "grad_norm": 0.7311546802520752, + "learning_rate": 6.030126553332327e-05, + "loss": 1.0513, + "step": 135900 + }, + { + "epoch": 0.8682902520986928, + "grad_norm": 1.0786371231079102, + "learning_rate": 6.029635544254143e-05, + "loss": 1.0181, + "step": 135910 + }, + { + "epoch": 0.8683541392484315, + "grad_norm": 1.0580967664718628, + "learning_rate": 6.029144524806638e-05, + "loss": 0.7061, + "step": 135920 + }, + { + "epoch": 0.8684180263981702, + "grad_norm": 0.9200608730316162, + "learning_rate": 6.028653494994757e-05, + "loss": 0.9578, + "step": 135930 + }, + { + "epoch": 0.8684819135479089, + "grad_norm": 1.2529308795928955, + "learning_rate": 6.028162454823446e-05, + "loss": 0.878, + "step": 135940 + }, + { + "epoch": 0.8685458006976476, + 
"grad_norm": 0.6350985765457153, + "learning_rate": 6.0276714042976504e-05, + "loss": 0.8528, + "step": 135950 + }, + { + "epoch": 0.8686096878473863, + "grad_norm": 1.1729838848114014, + "learning_rate": 6.0271803434223115e-05, + "loss": 0.8492, + "step": 135960 + }, + { + "epoch": 0.868673574997125, + "grad_norm": 0.679898738861084, + "learning_rate": 6.02668927220238e-05, + "loss": 1.0879, + "step": 135970 + }, + { + "epoch": 0.8687374621468638, + "grad_norm": 0.9746125936508179, + "learning_rate": 6.0261981906428e-05, + "loss": 0.9354, + "step": 135980 + }, + { + "epoch": 0.8688013492966025, + "grad_norm": 0.8682552576065063, + "learning_rate": 6.0257070987485166e-05, + "loss": 0.9114, + "step": 135990 + }, + { + "epoch": 0.8688652364463412, + "grad_norm": 0.879461944103241, + "learning_rate": 6.025215996524474e-05, + "loss": 0.7752, + "step": 136000 + }, + { + "epoch": 0.8689291235960799, + "grad_norm": 1.0179787874221802, + "learning_rate": 6.024724883975621e-05, + "loss": 0.9302, + "step": 136010 + }, + { + "epoch": 0.8689930107458186, + "grad_norm": 0.7405498623847961, + "learning_rate": 6.024233761106901e-05, + "loss": 0.8306, + "step": 136020 + }, + { + "epoch": 0.8690568978955573, + "grad_norm": 0.9917730689048767, + "learning_rate": 6.023742627923261e-05, + "loss": 0.8827, + "step": 136030 + }, + { + "epoch": 0.869120785045296, + "grad_norm": 1.0026957988739014, + "learning_rate": 6.023251484429647e-05, + "loss": 0.9303, + "step": 136040 + }, + { + "epoch": 0.8691846721950347, + "grad_norm": 0.6799507141113281, + "learning_rate": 6.022760330631005e-05, + "loss": 0.7576, + "step": 136050 + }, + { + "epoch": 0.8692485593447734, + "grad_norm": 0.7701660990715027, + "learning_rate": 6.0222691665322815e-05, + "loss": 0.7309, + "step": 136060 + }, + { + "epoch": 0.8693124464945121, + "grad_norm": 0.7990044355392456, + "learning_rate": 6.0217779921384246e-05, + "loss": 0.9324, + "step": 136070 + }, + { + "epoch": 0.8693763336442508, + "grad_norm": 0.8976256251335144, + "learning_rate": 6.0212868074543785e-05, + "loss": 0.855, + "step": 136080 + }, + { + "epoch": 0.8694402207939895, + "grad_norm": 1.0746898651123047, + "learning_rate": 6.02079561248509e-05, + "loss": 0.8045, + "step": 136090 + }, + { + "epoch": 0.8695041079437282, + "grad_norm": 1.292189359664917, + "learning_rate": 6.0203044072355065e-05, + "loss": 0.8419, + "step": 136100 + }, + { + "epoch": 0.8695679950934669, + "grad_norm": 0.673413097858429, + "learning_rate": 6.019813191710576e-05, + "loss": 0.7643, + "step": 136110 + }, + { + "epoch": 0.8696318822432056, + "grad_norm": 0.834862232208252, + "learning_rate": 6.0193219659152424e-05, + "loss": 0.8355, + "step": 136120 + }, + { + "epoch": 0.8696957693929444, + "grad_norm": 0.6725580096244812, + "learning_rate": 6.018830729854457e-05, + "loss": 1.1082, + "step": 136130 + }, + { + "epoch": 0.8697596565426831, + "grad_norm": 1.442153811454773, + "learning_rate": 6.018339483533163e-05, + "loss": 0.9068, + "step": 136140 + }, + { + "epoch": 0.8698235436924217, + "grad_norm": 1.0553643703460693, + "learning_rate": 6.017848226956311e-05, + "loss": 0.7979, + "step": 136150 + }, + { + "epoch": 0.8698874308421604, + "grad_norm": 1.0946028232574463, + "learning_rate": 6.017356960128846e-05, + "loss": 0.8663, + "step": 136160 + }, + { + "epoch": 0.8699513179918991, + "grad_norm": 1.3556715250015259, + "learning_rate": 6.0168656830557165e-05, + "loss": 0.9034, + "step": 136170 + }, + { + "epoch": 0.8700152051416378, + "grad_norm": 0.47752645611763, + "learning_rate": 
6.016374395741869e-05, + "loss": 0.9813, + "step": 136180 + }, + { + "epoch": 0.8700790922913765, + "grad_norm": 1.1327354907989502, + "learning_rate": 6.0158830981922544e-05, + "loss": 0.838, + "step": 136190 + }, + { + "epoch": 0.8701429794411152, + "grad_norm": 0.8484867811203003, + "learning_rate": 6.0153917904118164e-05, + "loss": 0.8683, + "step": 136200 + }, + { + "epoch": 0.8702068665908539, + "grad_norm": 0.7545785307884216, + "learning_rate": 6.0149004724055046e-05, + "loss": 0.9844, + "step": 136210 + }, + { + "epoch": 0.8702707537405926, + "grad_norm": 0.7116890549659729, + "learning_rate": 6.0144091441782666e-05, + "loss": 0.8028, + "step": 136220 + }, + { + "epoch": 0.8703346408903313, + "grad_norm": 0.5172243714332581, + "learning_rate": 6.013917805735052e-05, + "loss": 0.8459, + "step": 136230 + }, + { + "epoch": 0.87039852804007, + "grad_norm": 0.9006187319755554, + "learning_rate": 6.0134264570808076e-05, + "loss": 1.1869, + "step": 136240 + }, + { + "epoch": 0.8704624151898087, + "grad_norm": 1.2033772468566895, + "learning_rate": 6.012935098220483e-05, + "loss": 0.7985, + "step": 136250 + }, + { + "epoch": 0.8705263023395474, + "grad_norm": 0.8718836903572083, + "learning_rate": 6.012443729159025e-05, + "loss": 0.8137, + "step": 136260 + }, + { + "epoch": 0.8705901894892861, + "grad_norm": 1.3549836874008179, + "learning_rate": 6.011952349901382e-05, + "loss": 1.1543, + "step": 136270 + }, + { + "epoch": 0.8706540766390248, + "grad_norm": 1.342417597770691, + "learning_rate": 6.011460960452503e-05, + "loss": 0.8184, + "step": 136280 + }, + { + "epoch": 0.8707179637887635, + "grad_norm": 0.5869442224502563, + "learning_rate": 6.010969560817338e-05, + "loss": 0.7865, + "step": 136290 + }, + { + "epoch": 0.8707818509385022, + "grad_norm": 1.237336277961731, + "learning_rate": 6.0104781510008345e-05, + "loss": 0.8941, + "step": 136300 + }, + { + "epoch": 0.870845738088241, + "grad_norm": 0.9919825792312622, + "learning_rate": 6.0099867310079416e-05, + "loss": 0.9696, + "step": 136310 + }, + { + "epoch": 0.8709096252379797, + "grad_norm": 0.6661075949668884, + "learning_rate": 6.0094953008436094e-05, + "loss": 0.7271, + "step": 136320 + }, + { + "epoch": 0.8709735123877184, + "grad_norm": 1.1574594974517822, + "learning_rate": 6.009003860512785e-05, + "loss": 0.6527, + "step": 136330 + }, + { + "epoch": 0.8710373995374571, + "grad_norm": 0.8676467537879944, + "learning_rate": 6.0085124100204205e-05, + "loss": 0.7372, + "step": 136340 + }, + { + "epoch": 0.8711012866871958, + "grad_norm": 0.6834307312965393, + "learning_rate": 6.0080209493714626e-05, + "loss": 0.9976, + "step": 136350 + }, + { + "epoch": 0.8711651738369345, + "grad_norm": 0.8142191767692566, + "learning_rate": 6.0075294785708617e-05, + "loss": 0.9738, + "step": 136360 + }, + { + "epoch": 0.8712290609866732, + "grad_norm": 1.038397192955017, + "learning_rate": 6.007037997623567e-05, + "loss": 0.78, + "step": 136370 + }, + { + "epoch": 0.8712929481364119, + "grad_norm": 1.5472460985183716, + "learning_rate": 6.006546506534529e-05, + "loss": 0.6741, + "step": 136380 + }, + { + "epoch": 0.8713568352861505, + "grad_norm": 0.9952694177627563, + "learning_rate": 6.006055005308697e-05, + "loss": 0.7892, + "step": 136390 + }, + { + "epoch": 0.8714207224358892, + "grad_norm": 1.1230021715164185, + "learning_rate": 6.005563493951021e-05, + "loss": 0.9274, + "step": 136400 + }, + { + "epoch": 0.8714846095856279, + "grad_norm": 1.2984684705734253, + "learning_rate": 6.005071972466449e-05, + "loss": 0.8563, + "step": 
136410 + }, + { + "epoch": 0.8715484967353666, + "grad_norm": 0.9113028049468994, + "learning_rate": 6.004580440859934e-05, + "loss": 0.8594, + "step": 136420 + }, + { + "epoch": 0.8716123838851053, + "grad_norm": 1.0389131307601929, + "learning_rate": 6.0040888991364255e-05, + "loss": 1.0333, + "step": 136430 + }, + { + "epoch": 0.871676271034844, + "grad_norm": 0.7670816779136658, + "learning_rate": 6.003597347300872e-05, + "loss": 0.833, + "step": 136440 + }, + { + "epoch": 0.8717401581845827, + "grad_norm": 0.7277560234069824, + "learning_rate": 6.003105785358225e-05, + "loss": 1.0034, + "step": 136450 + }, + { + "epoch": 0.8718040453343214, + "grad_norm": 0.8484509587287903, + "learning_rate": 6.0026142133134354e-05, + "loss": 1.1544, + "step": 136460 + }, + { + "epoch": 0.8718679324840601, + "grad_norm": 1.0106443166732788, + "learning_rate": 6.0021226311714526e-05, + "loss": 0.7397, + "step": 136470 + }, + { + "epoch": 0.8719318196337988, + "grad_norm": 2.2280375957489014, + "learning_rate": 6.0016310389372275e-05, + "loss": 0.987, + "step": 136480 + }, + { + "epoch": 0.8719957067835375, + "grad_norm": 0.9950495958328247, + "learning_rate": 6.001139436615713e-05, + "loss": 0.7946, + "step": 136490 + }, + { + "epoch": 0.8720595939332763, + "grad_norm": 0.8028036952018738, + "learning_rate": 6.000647824211858e-05, + "loss": 0.8753, + "step": 136500 + }, + { + "epoch": 0.872123481083015, + "grad_norm": 1.1068840026855469, + "learning_rate": 6.000156201730614e-05, + "loss": 1.0608, + "step": 136510 + }, + { + "epoch": 0.8721873682327537, + "grad_norm": 0.5927395224571228, + "learning_rate": 5.9996645691769305e-05, + "loss": 0.7914, + "step": 136520 + }, + { + "epoch": 0.8722512553824924, + "grad_norm": 1.3745521306991577, + "learning_rate": 5.9991729265557605e-05, + "loss": 1.0867, + "step": 136530 + }, + { + "epoch": 0.8723151425322311, + "grad_norm": 1.0866520404815674, + "learning_rate": 5.998681273872055e-05, + "loss": 0.7932, + "step": 136540 + }, + { + "epoch": 0.8723790296819698, + "grad_norm": 1.1197307109832764, + "learning_rate": 5.998189611130764e-05, + "loss": 1.0212, + "step": 136550 + }, + { + "epoch": 0.8724429168317085, + "grad_norm": 0.9691267609596252, + "learning_rate": 5.9976979383368414e-05, + "loss": 0.7832, + "step": 136560 + }, + { + "epoch": 0.8725068039814472, + "grad_norm": 2.4745099544525146, + "learning_rate": 5.997206255495237e-05, + "loss": 0.9366, + "step": 136570 + }, + { + "epoch": 0.8725706911311859, + "grad_norm": 1.1133451461791992, + "learning_rate": 5.9967145626109035e-05, + "loss": 1.0052, + "step": 136580 + }, + { + "epoch": 0.8726345782809246, + "grad_norm": 0.8237787485122681, + "learning_rate": 5.996222859688791e-05, + "loss": 0.9783, + "step": 136590 + }, + { + "epoch": 0.8726984654306633, + "grad_norm": 0.960617184638977, + "learning_rate": 5.995731146733853e-05, + "loss": 0.7924, + "step": 136600 + }, + { + "epoch": 0.872762352580402, + "grad_norm": 0.8641276955604553, + "learning_rate": 5.99523942375104e-05, + "loss": 0.9258, + "step": 136610 + }, + { + "epoch": 0.8728262397301407, + "grad_norm": 0.6767961382865906, + "learning_rate": 5.994747690745306e-05, + "loss": 0.8195, + "step": 136620 + }, + { + "epoch": 0.8728901268798794, + "grad_norm": 0.8257598280906677, + "learning_rate": 5.9942559477216024e-05, + "loss": 1.0143, + "step": 136630 + }, + { + "epoch": 0.872954014029618, + "grad_norm": 0.9936842918395996, + "learning_rate": 5.99376419468488e-05, + "loss": 1.14, + "step": 136640 + }, + { + "epoch": 0.8730179011793567, + 
"grad_norm": 1.0033197402954102, + "learning_rate": 5.993272431640093e-05, + "loss": 0.9506, + "step": 136650 + }, + { + "epoch": 0.8730817883290954, + "grad_norm": 0.9221176505088806, + "learning_rate": 5.992780658592193e-05, + "loss": 0.8055, + "step": 136660 + }, + { + "epoch": 0.8731456754788341, + "grad_norm": 1.5162618160247803, + "learning_rate": 5.9922888755461336e-05, + "loss": 0.8351, + "step": 136670 + }, + { + "epoch": 0.8732095626285729, + "grad_norm": 0.7005751132965088, + "learning_rate": 5.991797082506867e-05, + "loss": 0.8902, + "step": 136680 + }, + { + "epoch": 0.8732734497783116, + "grad_norm": 0.8630402684211731, + "learning_rate": 5.9913052794793453e-05, + "loss": 0.9382, + "step": 136690 + }, + { + "epoch": 0.8733373369280503, + "grad_norm": 0.7950804829597473, + "learning_rate": 5.990813466468522e-05, + "loss": 1.0334, + "step": 136700 + }, + { + "epoch": 0.873401224077789, + "grad_norm": 0.9526651501655579, + "learning_rate": 5.9903216434793494e-05, + "loss": 1.1781, + "step": 136710 + }, + { + "epoch": 0.8734651112275277, + "grad_norm": 1.170040488243103, + "learning_rate": 5.989829810516782e-05, + "loss": 1.0522, + "step": 136720 + }, + { + "epoch": 0.8735289983772664, + "grad_norm": 1.0779001712799072, + "learning_rate": 5.9893379675857706e-05, + "loss": 0.8159, + "step": 136730 + }, + { + "epoch": 0.8735928855270051, + "grad_norm": 0.806840181350708, + "learning_rate": 5.9888461146912736e-05, + "loss": 0.7811, + "step": 136740 + }, + { + "epoch": 0.8736567726767438, + "grad_norm": 0.7434895634651184, + "learning_rate": 5.988354251838237e-05, + "loss": 0.8606, + "step": 136750 + }, + { + "epoch": 0.8737206598264825, + "grad_norm": 1.0427266359329224, + "learning_rate": 5.987862379031619e-05, + "loss": 1.006, + "step": 136760 + }, + { + "epoch": 0.8737845469762212, + "grad_norm": 1.1790105104446411, + "learning_rate": 5.987370496276372e-05, + "loss": 0.81, + "step": 136770 + }, + { + "epoch": 0.8738484341259599, + "grad_norm": 1.0513496398925781, + "learning_rate": 5.9868786035774504e-05, + "loss": 0.9206, + "step": 136780 + }, + { + "epoch": 0.8739123212756986, + "grad_norm": 1.0362788438796997, + "learning_rate": 5.986386700939808e-05, + "loss": 0.8835, + "step": 136790 + }, + { + "epoch": 0.8739762084254373, + "grad_norm": 1.3794645071029663, + "learning_rate": 5.985894788368397e-05, + "loss": 0.6992, + "step": 136800 + }, + { + "epoch": 0.874040095575176, + "grad_norm": 0.74660724401474, + "learning_rate": 5.9854028658681724e-05, + "loss": 1.0083, + "step": 136810 + }, + { + "epoch": 0.8741039827249147, + "grad_norm": 0.5785887837409973, + "learning_rate": 5.984910933444089e-05, + "loss": 0.9494, + "step": 136820 + }, + { + "epoch": 0.8741678698746534, + "grad_norm": 0.7972803115844727, + "learning_rate": 5.984418991101101e-05, + "loss": 1.1168, + "step": 136830 + }, + { + "epoch": 0.8742317570243922, + "grad_norm": 0.9159103631973267, + "learning_rate": 5.983927038844162e-05, + "loss": 1.1372, + "step": 136840 + }, + { + "epoch": 0.8742956441741309, + "grad_norm": 1.0228685140609741, + "learning_rate": 5.9834350766782255e-05, + "loss": 0.5965, + "step": 136850 + }, + { + "epoch": 0.8743595313238696, + "grad_norm": 1.0952800512313843, + "learning_rate": 5.982943104608247e-05, + "loss": 0.8082, + "step": 136860 + }, + { + "epoch": 0.8744234184736083, + "grad_norm": 0.7565765380859375, + "learning_rate": 5.982451122639182e-05, + "loss": 0.6736, + "step": 136870 + }, + { + "epoch": 0.8744873056233469, + "grad_norm": 0.7633196115493774, + "learning_rate": 
5.981959130775985e-05, + "loss": 0.8517, + "step": 136880 + }, + { + "epoch": 0.8745511927730856, + "grad_norm": 2.1346609592437744, + "learning_rate": 5.981467129023609e-05, + "loss": 0.854, + "step": 136890 + }, + { + "epoch": 0.8746150799228243, + "grad_norm": 0.8160780668258667, + "learning_rate": 5.98097511738701e-05, + "loss": 1.1138, + "step": 136900 + }, + { + "epoch": 0.874678967072563, + "grad_norm": 0.6529989838600159, + "learning_rate": 5.9804830958711425e-05, + "loss": 0.675, + "step": 136910 + }, + { + "epoch": 0.8747428542223017, + "grad_norm": 0.846062421798706, + "learning_rate": 5.979991064480962e-05, + "loss": 0.763, + "step": 136920 + }, + { + "epoch": 0.8748067413720404, + "grad_norm": 0.8752646446228027, + "learning_rate": 5.9794990232214244e-05, + "loss": 0.7932, + "step": 136930 + }, + { + "epoch": 0.8748706285217791, + "grad_norm": 0.695993959903717, + "learning_rate": 5.979006972097484e-05, + "loss": 0.7567, + "step": 136940 + }, + { + "epoch": 0.8749345156715178, + "grad_norm": 0.825805127620697, + "learning_rate": 5.978514911114096e-05, + "loss": 0.7572, + "step": 136950 + }, + { + "epoch": 0.8749984028212565, + "grad_norm": 1.5052249431610107, + "learning_rate": 5.9780228402762165e-05, + "loss": 0.944, + "step": 136960 + }, + { + "epoch": 0.8750622899709952, + "grad_norm": 1.3028863668441772, + "learning_rate": 5.9775307595888006e-05, + "loss": 1.0014, + "step": 136970 + }, + { + "epoch": 0.8751261771207339, + "grad_norm": 2.27508282661438, + "learning_rate": 5.977038669056805e-05, + "loss": 0.8975, + "step": 136980 + }, + { + "epoch": 0.8751900642704726, + "grad_norm": 0.5962340235710144, + "learning_rate": 5.9765465686851854e-05, + "loss": 0.8318, + "step": 136990 + }, + { + "epoch": 0.8752539514202113, + "grad_norm": 0.7743219137191772, + "learning_rate": 5.976054458478896e-05, + "loss": 0.9495, + "step": 137000 + }, + { + "epoch": 0.87531783856995, + "grad_norm": 0.840707540512085, + "learning_rate": 5.975562338442893e-05, + "loss": 0.8466, + "step": 137010 + }, + { + "epoch": 0.8753817257196888, + "grad_norm": 0.7525313496589661, + "learning_rate": 5.975070208582134e-05, + "loss": 0.5504, + "step": 137020 + }, + { + "epoch": 0.8754456128694275, + "grad_norm": 2.833361864089966, + "learning_rate": 5.974578068901575e-05, + "loss": 0.9305, + "step": 137030 + }, + { + "epoch": 0.8755095000191662, + "grad_norm": 0.896931529045105, + "learning_rate": 5.9740859194061717e-05, + "loss": 1.0519, + "step": 137040 + }, + { + "epoch": 0.8755733871689049, + "grad_norm": 0.6994075179100037, + "learning_rate": 5.97359376010088e-05, + "loss": 0.9204, + "step": 137050 + }, + { + "epoch": 0.8756372743186436, + "grad_norm": 0.8043060898780823, + "learning_rate": 5.9731015909906565e-05, + "loss": 0.7847, + "step": 137060 + }, + { + "epoch": 0.8757011614683823, + "grad_norm": 0.9698672294616699, + "learning_rate": 5.9726094120804585e-05, + "loss": 0.7268, + "step": 137070 + }, + { + "epoch": 0.875765048618121, + "grad_norm": 1.0071710348129272, + "learning_rate": 5.972117223375242e-05, + "loss": 0.7952, + "step": 137080 + }, + { + "epoch": 0.8758289357678597, + "grad_norm": 0.7718594074249268, + "learning_rate": 5.9716250248799644e-05, + "loss": 0.7514, + "step": 137090 + }, + { + "epoch": 0.8758928229175984, + "grad_norm": 0.8059403300285339, + "learning_rate": 5.971132816599583e-05, + "loss": 0.9773, + "step": 137100 + }, + { + "epoch": 0.8759567100673371, + "grad_norm": 0.6279333829879761, + "learning_rate": 5.970640598539052e-05, + "loss": 1.1655, + "step": 137110 + }, + { 
+ "epoch": 0.8760205972170757, + "grad_norm": 0.5626464486122131, + "learning_rate": 5.970148370703332e-05, + "loss": 0.7618, + "step": 137120 + }, + { + "epoch": 0.8760844843668144, + "grad_norm": 0.6805403828620911, + "learning_rate": 5.969656133097379e-05, + "loss": 0.9308, + "step": 137130 + }, + { + "epoch": 0.8761483715165531, + "grad_norm": 1.129631519317627, + "learning_rate": 5.969163885726148e-05, + "loss": 0.8858, + "step": 137140 + }, + { + "epoch": 0.8762122586662918, + "grad_norm": 0.6671173572540283, + "learning_rate": 5.9686716285946e-05, + "loss": 0.9919, + "step": 137150 + }, + { + "epoch": 0.8762761458160305, + "grad_norm": 0.8226957321166992, + "learning_rate": 5.9681793617076895e-05, + "loss": 0.8594, + "step": 137160 + }, + { + "epoch": 0.8763400329657692, + "grad_norm": 0.9677339792251587, + "learning_rate": 5.9676870850703747e-05, + "loss": 0.9001, + "step": 137170 + }, + { + "epoch": 0.8764039201155079, + "grad_norm": 1.0769922733306885, + "learning_rate": 5.967194798687615e-05, + "loss": 0.9104, + "step": 137180 + }, + { + "epoch": 0.8764678072652466, + "grad_norm": 0.9808753728866577, + "learning_rate": 5.966702502564366e-05, + "loss": 0.8969, + "step": 137190 + }, + { + "epoch": 0.8765316944149854, + "grad_norm": 0.8168275356292725, + "learning_rate": 5.9662101967055885e-05, + "loss": 0.8239, + "step": 137200 + }, + { + "epoch": 0.8765955815647241, + "grad_norm": 0.7705772519111633, + "learning_rate": 5.965717881116237e-05, + "loss": 0.7709, + "step": 137210 + }, + { + "epoch": 0.8766594687144628, + "grad_norm": 0.7873682975769043, + "learning_rate": 5.965225555801272e-05, + "loss": 0.8556, + "step": 137220 + }, + { + "epoch": 0.8767233558642015, + "grad_norm": 1.820673942565918, + "learning_rate": 5.9647332207656505e-05, + "loss": 1.1398, + "step": 137230 + }, + { + "epoch": 0.8767872430139402, + "grad_norm": 1.0492981672286987, + "learning_rate": 5.9642408760143296e-05, + "loss": 1.0855, + "step": 137240 + }, + { + "epoch": 0.8768511301636789, + "grad_norm": 1.0693048238754272, + "learning_rate": 5.9637485215522694e-05, + "loss": 1.043, + "step": 137250 + }, + { + "epoch": 0.8769150173134176, + "grad_norm": 1.1092848777770996, + "learning_rate": 5.963256157384427e-05, + "loss": 1.1529, + "step": 137260 + }, + { + "epoch": 0.8769789044631563, + "grad_norm": 1.148908257484436, + "learning_rate": 5.962763783515763e-05, + "loss": 0.7518, + "step": 137270 + }, + { + "epoch": 0.877042791612895, + "grad_norm": 1.0465582609176636, + "learning_rate": 5.9622713999512345e-05, + "loss": 0.8351, + "step": 137280 + }, + { + "epoch": 0.8771066787626337, + "grad_norm": 0.7496880888938904, + "learning_rate": 5.9617790066958e-05, + "loss": 1.0184, + "step": 137290 + }, + { + "epoch": 0.8771705659123724, + "grad_norm": 2.036813259124756, + "learning_rate": 5.96128660375442e-05, + "loss": 1.1939, + "step": 137300 + }, + { + "epoch": 0.8772344530621111, + "grad_norm": 0.8851515054702759, + "learning_rate": 5.9607941911320506e-05, + "loss": 0.8136, + "step": 137310 + }, + { + "epoch": 0.8772983402118498, + "grad_norm": 1.1349178552627563, + "learning_rate": 5.960301768833654e-05, + "loss": 0.8269, + "step": 137320 + }, + { + "epoch": 0.8773622273615885, + "grad_norm": 1.0561522245407104, + "learning_rate": 5.959809336864186e-05, + "loss": 1.0435, + "step": 137330 + }, + { + "epoch": 0.8774261145113272, + "grad_norm": 1.015069842338562, + "learning_rate": 5.959316895228609e-05, + "loss": 0.752, + "step": 137340 + }, + { + "epoch": 0.8774900016610659, + "grad_norm": 
1.3497037887573242, + "learning_rate": 5.958824443931881e-05, + "loss": 0.7568, + "step": 137350 + }, + { + "epoch": 0.8775538888108046, + "grad_norm": 0.6360141038894653, + "learning_rate": 5.958331982978961e-05, + "loss": 0.8139, + "step": 137360 + }, + { + "epoch": 0.8776177759605432, + "grad_norm": 1.0269728899002075, + "learning_rate": 5.957839512374809e-05, + "loss": 0.9107, + "step": 137370 + }, + { + "epoch": 0.877681663110282, + "grad_norm": 0.7632153630256653, + "learning_rate": 5.957347032124384e-05, + "loss": 1.0206, + "step": 137380 + }, + { + "epoch": 0.8777455502600207, + "grad_norm": 0.7974910736083984, + "learning_rate": 5.9568545422326474e-05, + "loss": 0.8792, + "step": 137390 + }, + { + "epoch": 0.8778094374097594, + "grad_norm": 1.0210436582565308, + "learning_rate": 5.956362042704556e-05, + "loss": 1.0714, + "step": 137400 + }, + { + "epoch": 0.8778733245594981, + "grad_norm": 0.8050969839096069, + "learning_rate": 5.955869533545073e-05, + "loss": 0.9401, + "step": 137410 + }, + { + "epoch": 0.8779372117092368, + "grad_norm": 0.888954758644104, + "learning_rate": 5.955377014759156e-05, + "loss": 0.9508, + "step": 137420 + }, + { + "epoch": 0.8780010988589755, + "grad_norm": 0.69648677110672, + "learning_rate": 5.954884486351766e-05, + "loss": 0.9033, + "step": 137430 + }, + { + "epoch": 0.8780649860087142, + "grad_norm": 1.9958151578903198, + "learning_rate": 5.954391948327864e-05, + "loss": 0.9938, + "step": 137440 + }, + { + "epoch": 0.8781288731584529, + "grad_norm": 0.7439517378807068, + "learning_rate": 5.9538994006924085e-05, + "loss": 0.9702, + "step": 137450 + }, + { + "epoch": 0.8781927603081916, + "grad_norm": 0.9544771313667297, + "learning_rate": 5.953406843450361e-05, + "loss": 1.0634, + "step": 137460 + }, + { + "epoch": 0.8782566474579303, + "grad_norm": 0.9266231656074524, + "learning_rate": 5.9529142766066823e-05, + "loss": 0.8061, + "step": 137470 + }, + { + "epoch": 0.878320534607669, + "grad_norm": 0.9102841019630432, + "learning_rate": 5.952421700166333e-05, + "loss": 0.7466, + "step": 137480 + }, + { + "epoch": 0.8783844217574077, + "grad_norm": 0.9724735021591187, + "learning_rate": 5.9519291141342714e-05, + "loss": 0.7188, + "step": 137490 + }, + { + "epoch": 0.8784483089071464, + "grad_norm": 0.6619033217430115, + "learning_rate": 5.951436518515461e-05, + "loss": 0.696, + "step": 137500 + }, + { + "epoch": 0.8785121960568851, + "grad_norm": 1.0394726991653442, + "learning_rate": 5.9509439133148616e-05, + "loss": 0.9148, + "step": 137510 + }, + { + "epoch": 0.8785760832066238, + "grad_norm": 0.9882583618164062, + "learning_rate": 5.950451298537434e-05, + "loss": 0.7845, + "step": 137520 + }, + { + "epoch": 0.8786399703563625, + "grad_norm": 1.2519365549087524, + "learning_rate": 5.94995867418814e-05, + "loss": 0.7793, + "step": 137530 + }, + { + "epoch": 0.8787038575061012, + "grad_norm": 0.7872567772865295, + "learning_rate": 5.9494660402719404e-05, + "loss": 1.0541, + "step": 137540 + }, + { + "epoch": 0.87876774465584, + "grad_norm": 0.8353559970855713, + "learning_rate": 5.948973396793795e-05, + "loss": 1.0608, + "step": 137550 + }, + { + "epoch": 0.8788316318055787, + "grad_norm": 0.48675644397735596, + "learning_rate": 5.948480743758669e-05, + "loss": 0.9564, + "step": 137560 + }, + { + "epoch": 0.8788955189553174, + "grad_norm": 1.1137011051177979, + "learning_rate": 5.9479880811715195e-05, + "loss": 0.7974, + "step": 137570 + }, + { + "epoch": 0.8789594061050561, + "grad_norm": 1.0456700325012207, + "learning_rate": 
5.9474954090373106e-05, + "loss": 0.9706, + "step": 137580 + }, + { + "epoch": 0.8790232932547948, + "grad_norm": 1.0245221853256226, + "learning_rate": 5.947002727361003e-05, + "loss": 0.8457, + "step": 137590 + }, + { + "epoch": 0.8790871804045335, + "grad_norm": 1.006938099861145, + "learning_rate": 5.9465593056979326e-05, + "loss": 0.7818, + "step": 137600 + }, + { + "epoch": 0.8791510675542721, + "grad_norm": 1.0359126329421997, + "learning_rate": 5.946066605905308e-05, + "loss": 0.7724, + "step": 137610 + }, + { + "epoch": 0.8792149547040108, + "grad_norm": 1.068823218345642, + "learning_rate": 5.945573896584974e-05, + "loss": 0.9845, + "step": 137620 + }, + { + "epoch": 0.8792788418537495, + "grad_norm": 0.46609166264533997, + "learning_rate": 5.945081177741892e-05, + "loss": 0.7789, + "step": 137630 + }, + { + "epoch": 0.8793427290034882, + "grad_norm": 1.1455978155136108, + "learning_rate": 5.9445884493810256e-05, + "loss": 0.7966, + "step": 137640 + }, + { + "epoch": 0.8794066161532269, + "grad_norm": 1.5680001974105835, + "learning_rate": 5.944095711507337e-05, + "loss": 0.9451, + "step": 137650 + }, + { + "epoch": 0.8794705033029656, + "grad_norm": 1.8977959156036377, + "learning_rate": 5.943602964125787e-05, + "loss": 0.8331, + "step": 137660 + }, + { + "epoch": 0.8795343904527043, + "grad_norm": 0.6231663227081299, + "learning_rate": 5.943110207241339e-05, + "loss": 0.8725, + "step": 137670 + }, + { + "epoch": 0.879598277602443, + "grad_norm": 0.8517551422119141, + "learning_rate": 5.942617440858955e-05, + "loss": 1.0001, + "step": 137680 + }, + { + "epoch": 0.8796621647521817, + "grad_norm": 0.9704746007919312, + "learning_rate": 5.9421246649835985e-05, + "loss": 1.2601, + "step": 137690 + }, + { + "epoch": 0.8797260519019204, + "grad_norm": 0.6457834839820862, + "learning_rate": 5.941631879620231e-05, + "loss": 0.6845, + "step": 137700 + }, + { + "epoch": 0.8797899390516591, + "grad_norm": 1.3771389722824097, + "learning_rate": 5.941139084773817e-05, + "loss": 1.0085, + "step": 137710 + }, + { + "epoch": 0.8798538262013978, + "grad_norm": 0.8982274532318115, + "learning_rate": 5.940646280449317e-05, + "loss": 0.754, + "step": 137720 + }, + { + "epoch": 0.8799177133511366, + "grad_norm": 1.1403874158859253, + "learning_rate": 5.9401534666516955e-05, + "loss": 0.9035, + "step": 137730 + }, + { + "epoch": 0.8799816005008753, + "grad_norm": 0.8235518932342529, + "learning_rate": 5.939660643385915e-05, + "loss": 0.9339, + "step": 137740 + }, + { + "epoch": 0.880045487650614, + "grad_norm": 0.8350309133529663, + "learning_rate": 5.939167810656939e-05, + "loss": 1.0703, + "step": 137750 + }, + { + "epoch": 0.8801093748003527, + "grad_norm": 1.7924656867980957, + "learning_rate": 5.938674968469731e-05, + "loss": 1.1085, + "step": 137760 + }, + { + "epoch": 0.8801732619500914, + "grad_norm": 1.2257702350616455, + "learning_rate": 5.9381821168292536e-05, + "loss": 0.9338, + "step": 137770 + }, + { + "epoch": 0.8802371490998301, + "grad_norm": 1.0357496738433838, + "learning_rate": 5.9376892557404704e-05, + "loss": 0.8123, + "step": 137780 + }, + { + "epoch": 0.8803010362495688, + "grad_norm": 0.990088164806366, + "learning_rate": 5.937196385208346e-05, + "loss": 1.004, + "step": 137790 + }, + { + "epoch": 0.8803649233993075, + "grad_norm": 0.97257000207901, + "learning_rate": 5.936703505237843e-05, + "loss": 0.6953, + "step": 137800 + }, + { + "epoch": 0.8804288105490462, + "grad_norm": 0.6690786480903625, + "learning_rate": 5.9362106158339245e-05, + "loss": 0.9455, + "step": 137810 
+ }, + { + "epoch": 0.8804926976987849, + "grad_norm": 1.3547656536102295, + "learning_rate": 5.935717717001556e-05, + "loss": 0.856, + "step": 137820 + }, + { + "epoch": 0.8805565848485236, + "grad_norm": 0.7319701313972473, + "learning_rate": 5.9352248087456994e-05, + "loss": 0.9645, + "step": 137830 + }, + { + "epoch": 0.8806204719982623, + "grad_norm": 0.9429260492324829, + "learning_rate": 5.934731891071321e-05, + "loss": 1.0002, + "step": 137840 + }, + { + "epoch": 0.8806843591480009, + "grad_norm": 0.8033540844917297, + "learning_rate": 5.934238963983384e-05, + "loss": 0.676, + "step": 137850 + }, + { + "epoch": 0.8807482462977396, + "grad_norm": 1.9178016185760498, + "learning_rate": 5.933746027486853e-05, + "loss": 0.727, + "step": 137860 + }, + { + "epoch": 0.8808121334474783, + "grad_norm": 0.7014086842536926, + "learning_rate": 5.9333023765997284e-05, + "loss": 1.0007, + "step": 137870 + }, + { + "epoch": 0.880876020597217, + "grad_norm": 1.6544643640518188, + "learning_rate": 5.9328094222405437e-05, + "loss": 0.8551, + "step": 137880 + }, + { + "epoch": 0.8809399077469557, + "grad_norm": 1.3481382131576538, + "learning_rate": 5.932316458487162e-05, + "loss": 0.7606, + "step": 137890 + }, + { + "epoch": 0.8810037948966944, + "grad_norm": 1.015859842300415, + "learning_rate": 5.931823485344545e-05, + "loss": 0.9227, + "step": 137900 + }, + { + "epoch": 0.8810676820464332, + "grad_norm": 1.004987359046936, + "learning_rate": 5.9313305028176606e-05, + "loss": 1.1689, + "step": 137910 + }, + { + "epoch": 0.8811315691961719, + "grad_norm": 0.8518670797348022, + "learning_rate": 5.930837510911471e-05, + "loss": 1.1651, + "step": 137920 + }, + { + "epoch": 0.8811954563459106, + "grad_norm": 0.6853091716766357, + "learning_rate": 5.930344509630943e-05, + "loss": 0.7861, + "step": 137930 + }, + { + "epoch": 0.8812593434956493, + "grad_norm": 1.4543042182922363, + "learning_rate": 5.929851498981041e-05, + "loss": 0.718, + "step": 137940 + }, + { + "epoch": 0.881323230645388, + "grad_norm": 0.791410505771637, + "learning_rate": 5.92935847896673e-05, + "loss": 0.8162, + "step": 137950 + }, + { + "epoch": 0.8813871177951267, + "grad_norm": 0.8567259311676025, + "learning_rate": 5.928865449592976e-05, + "loss": 0.797, + "step": 137960 + }, + { + "epoch": 0.8814510049448654, + "grad_norm": 0.9072690010070801, + "learning_rate": 5.928372410864742e-05, + "loss": 0.9948, + "step": 137970 + }, + { + "epoch": 0.8815148920946041, + "grad_norm": 1.3205629587173462, + "learning_rate": 5.9278793627869955e-05, + "loss": 1.0577, + "step": 137980 + }, + { + "epoch": 0.8815787792443428, + "grad_norm": 0.8285039663314819, + "learning_rate": 5.9273863053647015e-05, + "loss": 0.9573, + "step": 137990 + }, + { + "epoch": 0.8816426663940815, + "grad_norm": 0.7097443342208862, + "learning_rate": 5.926893238602825e-05, + "loss": 0.927, + "step": 138000 + }, + { + "epoch": 0.8817065535438202, + "grad_norm": 0.7954055666923523, + "learning_rate": 5.926400162506331e-05, + "loss": 1.0107, + "step": 138010 + }, + { + "epoch": 0.8817704406935589, + "grad_norm": 0.9735956788063049, + "learning_rate": 5.9259070770801874e-05, + "loss": 0.8408, + "step": 138020 + }, + { + "epoch": 0.8818343278432976, + "grad_norm": 1.4425255060195923, + "learning_rate": 5.925413982329357e-05, + "loss": 1.0734, + "step": 138030 + }, + { + "epoch": 0.8818982149930363, + "grad_norm": 0.9620723724365234, + "learning_rate": 5.9249208782588076e-05, + "loss": 0.9398, + "step": 138040 + }, + { + "epoch": 0.881962102142775, + "grad_norm": 
0.6004499793052673, + "learning_rate": 5.924427764873505e-05, + "loss": 0.9196, + "step": 138050 + }, + { + "epoch": 0.8820259892925137, + "grad_norm": 0.9870150685310364, + "learning_rate": 5.9239346421784135e-05, + "loss": 0.9864, + "step": 138060 + }, + { + "epoch": 0.8820898764422525, + "grad_norm": 0.897495687007904, + "learning_rate": 5.9234415101785026e-05, + "loss": 1.0131, + "step": 138070 + }, + { + "epoch": 0.8821537635919912, + "grad_norm": 0.9317723512649536, + "learning_rate": 5.922948368878736e-05, + "loss": 0.856, + "step": 138080 + }, + { + "epoch": 0.8822176507417298, + "grad_norm": 0.9993261694908142, + "learning_rate": 5.922455218284081e-05, + "loss": 0.9035, + "step": 138090 + }, + { + "epoch": 0.8822815378914685, + "grad_norm": 0.8899745345115662, + "learning_rate": 5.921962058399504e-05, + "loss": 0.9287, + "step": 138100 + }, + { + "epoch": 0.8823454250412072, + "grad_norm": 0.59910649061203, + "learning_rate": 5.921468889229971e-05, + "loss": 1.0332, + "step": 138110 + }, + { + "epoch": 0.8824093121909459, + "grad_norm": 0.8276026248931885, + "learning_rate": 5.92097571078045e-05, + "loss": 0.8867, + "step": 138120 + }, + { + "epoch": 0.8824731993406846, + "grad_norm": 1.0071065425872803, + "learning_rate": 5.9204825230559056e-05, + "loss": 0.8795, + "step": 138130 + }, + { + "epoch": 0.8825370864904233, + "grad_norm": 1.0851963758468628, + "learning_rate": 5.919989326061307e-05, + "loss": 0.9147, + "step": 138140 + }, + { + "epoch": 0.882600973640162, + "grad_norm": 0.9725841879844666, + "learning_rate": 5.9194961198016196e-05, + "loss": 0.7633, + "step": 138150 + }, + { + "epoch": 0.8826648607899007, + "grad_norm": 0.5369303226470947, + "learning_rate": 5.9190029042818105e-05, + "loss": 0.9501, + "step": 138160 + }, + { + "epoch": 0.8827287479396394, + "grad_norm": 0.9604431390762329, + "learning_rate": 5.918509679506847e-05, + "loss": 0.785, + "step": 138170 + }, + { + "epoch": 0.8827926350893781, + "grad_norm": 1.0034009218215942, + "learning_rate": 5.918016445481698e-05, + "loss": 0.9936, + "step": 138180 + }, + { + "epoch": 0.8828565222391168, + "grad_norm": 0.8154608607292175, + "learning_rate": 5.917523202211328e-05, + "loss": 0.7805, + "step": 138190 + }, + { + "epoch": 0.8829204093888555, + "grad_norm": 1.2569918632507324, + "learning_rate": 5.9170299497007053e-05, + "loss": 0.6671, + "step": 138200 + }, + { + "epoch": 0.8829842965385942, + "grad_norm": 0.6132636666297913, + "learning_rate": 5.916536687954798e-05, + "loss": 0.7076, + "step": 138210 + }, + { + "epoch": 0.8830481836883329, + "grad_norm": 2.1591336727142334, + "learning_rate": 5.916043416978574e-05, + "loss": 1.1469, + "step": 138220 + }, + { + "epoch": 0.8831120708380716, + "grad_norm": 0.9249553084373474, + "learning_rate": 5.915550136776999e-05, + "loss": 0.8875, + "step": 138230 + }, + { + "epoch": 0.8831759579878103, + "grad_norm": 1.4961109161376953, + "learning_rate": 5.915056847355043e-05, + "loss": 0.7952, + "step": 138240 + }, + { + "epoch": 0.883239845137549, + "grad_norm": 1.0955626964569092, + "learning_rate": 5.914563548717673e-05, + "loss": 0.8794, + "step": 138250 + }, + { + "epoch": 0.8833037322872878, + "grad_norm": 0.5093604922294617, + "learning_rate": 5.9140702408698554e-05, + "loss": 0.7851, + "step": 138260 + }, + { + "epoch": 0.8833676194370265, + "grad_norm": 1.133516788482666, + "learning_rate": 5.913576923816562e-05, + "loss": 0.699, + "step": 138270 + }, + { + "epoch": 0.8834315065867652, + "grad_norm": 1.351069450378418, + "learning_rate": 5.9130835975627574e-05, 
+ "loss": 0.9823, + "step": 138280 + }, + { + "epoch": 0.8834953937365039, + "grad_norm": 0.741649329662323, + "learning_rate": 5.912590262113411e-05, + "loss": 1.0134, + "step": 138290 + }, + { + "epoch": 0.8835592808862426, + "grad_norm": 0.5988890528678894, + "learning_rate": 5.912096917473491e-05, + "loss": 0.7114, + "step": 138300 + }, + { + "epoch": 0.8836231680359813, + "grad_norm": 0.9725940823554993, + "learning_rate": 5.911603563647966e-05, + "loss": 0.9138, + "step": 138310 + }, + { + "epoch": 0.88368705518572, + "grad_norm": 0.5736109018325806, + "learning_rate": 5.911110200641805e-05, + "loss": 0.7067, + "step": 138320 + }, + { + "epoch": 0.8837509423354587, + "grad_norm": 0.8855761885643005, + "learning_rate": 5.910616828459975e-05, + "loss": 1.2011, + "step": 138330 + }, + { + "epoch": 0.8838148294851973, + "grad_norm": 0.8970593810081482, + "learning_rate": 5.910123447107446e-05, + "loss": 0.9496, + "step": 138340 + }, + { + "epoch": 0.883878716634936, + "grad_norm": 0.8814042806625366, + "learning_rate": 5.909630056589188e-05, + "loss": 0.957, + "step": 138350 + }, + { + "epoch": 0.8839426037846747, + "grad_norm": 1.0015789270401, + "learning_rate": 5.909136656910167e-05, + "loss": 0.8841, + "step": 138360 + }, + { + "epoch": 0.8840064909344134, + "grad_norm": 0.869117796421051, + "learning_rate": 5.908643248075354e-05, + "loss": 0.7216, + "step": 138370 + }, + { + "epoch": 0.8840703780841521, + "grad_norm": 0.8888474702835083, + "learning_rate": 5.9081498300897167e-05, + "loss": 0.8551, + "step": 138380 + }, + { + "epoch": 0.8841342652338908, + "grad_norm": 0.8927051424980164, + "learning_rate": 5.907656402958226e-05, + "loss": 0.9334, + "step": 138390 + }, + { + "epoch": 0.8841981523836295, + "grad_norm": 1.0528945922851562, + "learning_rate": 5.907162966685849e-05, + "loss": 0.7634, + "step": 138400 + }, + { + "epoch": 0.8842620395333682, + "grad_norm": 1.1046907901763916, + "learning_rate": 5.906669521277557e-05, + "loss": 0.715, + "step": 138410 + }, + { + "epoch": 0.8843259266831069, + "grad_norm": 0.8882020711898804, + "learning_rate": 5.906176066738317e-05, + "loss": 1.0122, + "step": 138420 + }, + { + "epoch": 0.8843898138328457, + "grad_norm": 0.9222348928451538, + "learning_rate": 5.905682603073102e-05, + "loss": 0.8114, + "step": 138430 + }, + { + "epoch": 0.8844537009825844, + "grad_norm": 0.5830926895141602, + "learning_rate": 5.905189130286879e-05, + "loss": 1.0322, + "step": 138440 + }, + { + "epoch": 0.8845175881323231, + "grad_norm": 1.4993237257003784, + "learning_rate": 5.904695648384617e-05, + "loss": 0.6984, + "step": 138450 + }, + { + "epoch": 0.8845814752820618, + "grad_norm": 4.328673839569092, + "learning_rate": 5.904202157371288e-05, + "loss": 0.8778, + "step": 138460 + }, + { + "epoch": 0.8846453624318005, + "grad_norm": 0.7862850427627563, + "learning_rate": 5.903708657251861e-05, + "loss": 0.8364, + "step": 138470 + }, + { + "epoch": 0.8847092495815392, + "grad_norm": 1.8233660459518433, + "learning_rate": 5.903215148031307e-05, + "loss": 0.8774, + "step": 138480 + }, + { + "epoch": 0.8847731367312779, + "grad_norm": 0.7515198588371277, + "learning_rate": 5.902721629714595e-05, + "loss": 1.2306, + "step": 138490 + }, + { + "epoch": 0.8848370238810166, + "grad_norm": 1.1745033264160156, + "learning_rate": 5.902228102306695e-05, + "loss": 0.9555, + "step": 138500 + }, + { + "epoch": 0.8849009110307553, + "grad_norm": 1.0315542221069336, + "learning_rate": 5.901734565812577e-05, + "loss": 0.987, + "step": 138510 + }, + { + "epoch": 
0.884964798180494, + "grad_norm": 1.1841830015182495, + "learning_rate": 5.9012410202372114e-05, + "loss": 1.1246, + "step": 138520 + }, + { + "epoch": 0.8850286853302327, + "grad_norm": 1.072008490562439, + "learning_rate": 5.9007474655855696e-05, + "loss": 0.8357, + "step": 138530 + }, + { + "epoch": 0.8850925724799714, + "grad_norm": 1.0678666830062866, + "learning_rate": 5.900253901862621e-05, + "loss": 0.7345, + "step": 138540 + }, + { + "epoch": 0.8851564596297101, + "grad_norm": 0.83828204870224, + "learning_rate": 5.899760329073338e-05, + "loss": 0.8972, + "step": 138550 + }, + { + "epoch": 0.8852203467794488, + "grad_norm": 0.9922822713851929, + "learning_rate": 5.899266747222689e-05, + "loss": 0.7582, + "step": 138560 + }, + { + "epoch": 0.8852842339291875, + "grad_norm": 2.141287088394165, + "learning_rate": 5.8987731563156464e-05, + "loss": 1.1712, + "step": 138570 + }, + { + "epoch": 0.8853481210789261, + "grad_norm": 0.8751981258392334, + "learning_rate": 5.89827955635718e-05, + "loss": 0.9515, + "step": 138580 + }, + { + "epoch": 0.8854120082286648, + "grad_norm": 0.6795740723609924, + "learning_rate": 5.897785947352262e-05, + "loss": 0.7279, + "step": 138590 + }, + { + "epoch": 0.8854758953784035, + "grad_norm": 0.8922616839408875, + "learning_rate": 5.8972923293058636e-05, + "loss": 1.0773, + "step": 138600 + }, + { + "epoch": 0.8855397825281422, + "grad_norm": 0.8627411127090454, + "learning_rate": 5.896798702222953e-05, + "loss": 1.0776, + "step": 138610 + }, + { + "epoch": 0.885603669677881, + "grad_norm": 0.6423478126525879, + "learning_rate": 5.896305066108504e-05, + "loss": 1.0121, + "step": 138620 + }, + { + "epoch": 0.8856675568276197, + "grad_norm": 0.9135613441467285, + "learning_rate": 5.895811420967489e-05, + "loss": 0.7514, + "step": 138630 + }, + { + "epoch": 0.8857314439773584, + "grad_norm": 1.0383354425430298, + "learning_rate": 5.895317766804877e-05, + "loss": 0.9648, + "step": 138640 + }, + { + "epoch": 0.8857953311270971, + "grad_norm": 1.2800050973892212, + "learning_rate": 5.89482410362564e-05, + "loss": 0.7818, + "step": 138650 + }, + { + "epoch": 0.8858592182768358, + "grad_norm": 0.8451805710792542, + "learning_rate": 5.894330431434751e-05, + "loss": 0.9926, + "step": 138660 + }, + { + "epoch": 0.8859231054265745, + "grad_norm": 0.9237948060035706, + "learning_rate": 5.893836750237181e-05, + "loss": 0.9855, + "step": 138670 + }, + { + "epoch": 0.8859869925763132, + "grad_norm": 0.6071786880493164, + "learning_rate": 5.893343060037902e-05, + "loss": 0.8057, + "step": 138680 + }, + { + "epoch": 0.8860508797260519, + "grad_norm": 1.0154786109924316, + "learning_rate": 5.892849360841886e-05, + "loss": 0.9599, + "step": 138690 + }, + { + "epoch": 0.8861147668757906, + "grad_norm": 1.8094230890274048, + "learning_rate": 5.892355652654102e-05, + "loss": 0.9918, + "step": 138700 + }, + { + "epoch": 0.8861786540255293, + "grad_norm": 0.8188693523406982, + "learning_rate": 5.891861935479527e-05, + "loss": 0.857, + "step": 138710 + }, + { + "epoch": 0.886242541175268, + "grad_norm": 0.6113899350166321, + "learning_rate": 5.891368209323129e-05, + "loss": 1.092, + "step": 138720 + }, + { + "epoch": 0.8863064283250067, + "grad_norm": 0.7425000667572021, + "learning_rate": 5.8908744741898846e-05, + "loss": 0.7412, + "step": 138730 + }, + { + "epoch": 0.8863703154747454, + "grad_norm": 1.9018378257751465, + "learning_rate": 5.8903807300847627e-05, + "loss": 0.889, + "step": 138740 + }, + { + "epoch": 0.8864342026244841, + "grad_norm": 1.2525657415390015, + 
"learning_rate": 5.889886977012735e-05, + "loss": 0.901, + "step": 138750 + }, + { + "epoch": 0.8864980897742228, + "grad_norm": 0.6941089630126953, + "learning_rate": 5.8893932149787764e-05, + "loss": 0.9275, + "step": 138760 + }, + { + "epoch": 0.8865619769239615, + "grad_norm": 0.9113277196884155, + "learning_rate": 5.8888994439878584e-05, + "loss": 0.9316, + "step": 138770 + }, + { + "epoch": 0.8866258640737003, + "grad_norm": 1.0078089237213135, + "learning_rate": 5.888405664044953e-05, + "loss": 0.9862, + "step": 138780 + }, + { + "epoch": 0.886689751223439, + "grad_norm": 0.7824812531471252, + "learning_rate": 5.887911875155036e-05, + "loss": 0.8826, + "step": 138790 + }, + { + "epoch": 0.8867536383731777, + "grad_norm": 0.5827013254165649, + "learning_rate": 5.887418077323077e-05, + "loss": 0.6422, + "step": 138800 + }, + { + "epoch": 0.8868175255229164, + "grad_norm": 1.1789437532424927, + "learning_rate": 5.886924270554051e-05, + "loss": 1.123, + "step": 138810 + }, + { + "epoch": 0.886881412672655, + "grad_norm": 0.9129090905189514, + "learning_rate": 5.886430454852929e-05, + "loss": 0.9861, + "step": 138820 + }, + { + "epoch": 0.8869452998223937, + "grad_norm": 0.82326340675354, + "learning_rate": 5.885936630224686e-05, + "loss": 0.8269, + "step": 138830 + }, + { + "epoch": 0.8870091869721324, + "grad_norm": 2.5597972869873047, + "learning_rate": 5.885442796674295e-05, + "loss": 0.9155, + "step": 138840 + }, + { + "epoch": 0.8870730741218711, + "grad_norm": 1.1469552516937256, + "learning_rate": 5.8849489542067296e-05, + "loss": 0.806, + "step": 138850 + }, + { + "epoch": 0.8871369612716098, + "grad_norm": 0.7060733437538147, + "learning_rate": 5.8844551028269625e-05, + "loss": 1.0475, + "step": 138860 + }, + { + "epoch": 0.8872008484213485, + "grad_norm": 0.5902007222175598, + "learning_rate": 5.883961242539966e-05, + "loss": 0.9141, + "step": 138870 + }, + { + "epoch": 0.8872647355710872, + "grad_norm": 1.3339205980300903, + "learning_rate": 5.883467373350716e-05, + "loss": 0.8036, + "step": 138880 + }, + { + "epoch": 0.8873286227208259, + "grad_norm": 0.8260666728019714, + "learning_rate": 5.882973495264186e-05, + "loss": 0.7641, + "step": 138890 + }, + { + "epoch": 0.8873925098705646, + "grad_norm": 1.3517704010009766, + "learning_rate": 5.8824796082853485e-05, + "loss": 0.7486, + "step": 138900 + }, + { + "epoch": 0.8874563970203033, + "grad_norm": 0.7072157859802246, + "learning_rate": 5.8819857124191766e-05, + "loss": 1.1322, + "step": 138910 + }, + { + "epoch": 0.887520284170042, + "grad_norm": 0.7836194634437561, + "learning_rate": 5.881491807670647e-05, + "loss": 1.0416, + "step": 138920 + }, + { + "epoch": 0.8875841713197807, + "grad_norm": 0.8094397783279419, + "learning_rate": 5.880997894044732e-05, + "loss": 0.7803, + "step": 138930 + }, + { + "epoch": 0.8876480584695194, + "grad_norm": 0.9594300985336304, + "learning_rate": 5.880503971546406e-05, + "loss": 0.6825, + "step": 138940 + }, + { + "epoch": 0.8877119456192581, + "grad_norm": 0.7078715562820435, + "learning_rate": 5.8800100401806436e-05, + "loss": 0.7998, + "step": 138950 + }, + { + "epoch": 0.8877758327689969, + "grad_norm": 1.1923208236694336, + "learning_rate": 5.879516099952418e-05, + "loss": 1.1095, + "step": 138960 + }, + { + "epoch": 0.8878397199187356, + "grad_norm": 0.8840433955192566, + "learning_rate": 5.8790221508667045e-05, + "loss": 0.8077, + "step": 138970 + }, + { + "epoch": 0.8879036070684743, + "grad_norm": 1.028594732284546, + "learning_rate": 5.878528192928479e-05, + "loss": 0.8315, + 
"step": 138980 + }, + { + "epoch": 0.887967494218213, + "grad_norm": 0.873859703540802, + "learning_rate": 5.878034226142712e-05, + "loss": 0.8896, + "step": 138990 + }, + { + "epoch": 0.8880313813679517, + "grad_norm": 1.0538140535354614, + "learning_rate": 5.877540250514383e-05, + "loss": 0.7489, + "step": 139000 + }, + { + "epoch": 0.8880952685176904, + "grad_norm": 1.347963571548462, + "learning_rate": 5.8770462660484625e-05, + "loss": 0.836, + "step": 139010 + }, + { + "epoch": 0.8881591556674291, + "grad_norm": 0.8959457874298096, + "learning_rate": 5.876552272749929e-05, + "loss": 0.7588, + "step": 139020 + }, + { + "epoch": 0.8882230428171678, + "grad_norm": 0.6587477922439575, + "learning_rate": 5.876058270623756e-05, + "loss": 0.7995, + "step": 139030 + }, + { + "epoch": 0.8882869299669065, + "grad_norm": 0.7410009503364563, + "learning_rate": 5.8755642596749164e-05, + "loss": 0.8671, + "step": 139040 + }, + { + "epoch": 0.8883508171166452, + "grad_norm": 0.96707683801651, + "learning_rate": 5.875070239908389e-05, + "loss": 0.7018, + "step": 139050 + }, + { + "epoch": 0.8884147042663839, + "grad_norm": 0.7956843972206116, + "learning_rate": 5.8745762113291455e-05, + "loss": 0.7706, + "step": 139060 + }, + { + "epoch": 0.8884785914161225, + "grad_norm": 0.9614824652671814, + "learning_rate": 5.874082173942165e-05, + "loss": 0.8501, + "step": 139070 + }, + { + "epoch": 0.8885424785658612, + "grad_norm": 1.660465121269226, + "learning_rate": 5.8735881277524195e-05, + "loss": 1.0422, + "step": 139080 + }, + { + "epoch": 0.8886063657155999, + "grad_norm": 0.7335018515586853, + "learning_rate": 5.8730940727648864e-05, + "loss": 1.0635, + "step": 139090 + }, + { + "epoch": 0.8886702528653386, + "grad_norm": 0.8188953399658203, + "learning_rate": 5.87260000898454e-05, + "loss": 0.9997, + "step": 139100 + }, + { + "epoch": 0.8887341400150773, + "grad_norm": 0.44217541813850403, + "learning_rate": 5.8721059364163564e-05, + "loss": 0.9746, + "step": 139110 + }, + { + "epoch": 0.888798027164816, + "grad_norm": 1.0299861431121826, + "learning_rate": 5.871611855065313e-05, + "loss": 1.0003, + "step": 139120 + }, + { + "epoch": 0.8888619143145547, + "grad_norm": 0.9896953105926514, + "learning_rate": 5.871117764936382e-05, + "loss": 0.8213, + "step": 139130 + }, + { + "epoch": 0.8889258014642935, + "grad_norm": 1.2226732969284058, + "learning_rate": 5.870623666034544e-05, + "loss": 0.8156, + "step": 139140 + }, + { + "epoch": 0.8889896886140322, + "grad_norm": 0.6431970000267029, + "learning_rate": 5.87012955836477e-05, + "loss": 0.9195, + "step": 139150 + }, + { + "epoch": 0.8890535757637709, + "grad_norm": 1.434300422668457, + "learning_rate": 5.86963544193204e-05, + "loss": 0.7808, + "step": 139160 + }, + { + "epoch": 0.8891174629135096, + "grad_norm": 0.8103228807449341, + "learning_rate": 5.869141316741328e-05, + "loss": 0.9496, + "step": 139170 + }, + { + "epoch": 0.8891813500632483, + "grad_norm": 0.9674835801124573, + "learning_rate": 5.868647182797612e-05, + "loss": 0.9964, + "step": 139180 + }, + { + "epoch": 0.889245237212987, + "grad_norm": 0.9061577916145325, + "learning_rate": 5.868153040105867e-05, + "loss": 0.9738, + "step": 139190 + }, + { + "epoch": 0.8893091243627257, + "grad_norm": 0.7973967790603638, + "learning_rate": 5.8676588886710695e-05, + "loss": 0.8598, + "step": 139200 + }, + { + "epoch": 0.8893730115124644, + "grad_norm": 1.2977885007858276, + "learning_rate": 5.867164728498197e-05, + "loss": 0.9625, + "step": 139210 + }, + { + "epoch": 0.8894368986622031, + 
"grad_norm": 0.897599458694458, + "learning_rate": 5.866670559592226e-05, + "loss": 1.0682, + "step": 139220 + }, + { + "epoch": 0.8895007858119418, + "grad_norm": 0.9015941023826599, + "learning_rate": 5.8661763819581314e-05, + "loss": 0.7841, + "step": 139230 + }, + { + "epoch": 0.8895646729616805, + "grad_norm": 1.129776120185852, + "learning_rate": 5.865682195600892e-05, + "loss": 1.0793, + "step": 139240 + }, + { + "epoch": 0.8896285601114192, + "grad_norm": 0.8388169407844543, + "learning_rate": 5.865188000525484e-05, + "loss": 0.9023, + "step": 139250 + }, + { + "epoch": 0.8896924472611579, + "grad_norm": 0.8551932573318481, + "learning_rate": 5.864693796736884e-05, + "loss": 0.7204, + "step": 139260 + }, + { + "epoch": 0.8897563344108966, + "grad_norm": 0.8071838021278381, + "learning_rate": 5.86419958424007e-05, + "loss": 0.7368, + "step": 139270 + }, + { + "epoch": 0.8898202215606353, + "grad_norm": 0.644696831703186, + "learning_rate": 5.863705363040017e-05, + "loss": 0.8828, + "step": 139280 + }, + { + "epoch": 0.889884108710374, + "grad_norm": 0.895206868648529, + "learning_rate": 5.863211133141705e-05, + "loss": 0.8151, + "step": 139290 + }, + { + "epoch": 0.8899479958601128, + "grad_norm": 1.169389009475708, + "learning_rate": 5.8627168945501096e-05, + "loss": 1.0632, + "step": 139300 + }, + { + "epoch": 0.8900118830098513, + "grad_norm": 0.695212721824646, + "learning_rate": 5.862222647270208e-05, + "loss": 0.884, + "step": 139310 + }, + { + "epoch": 0.89007577015959, + "grad_norm": 0.8649401664733887, + "learning_rate": 5.8617283913069796e-05, + "loss": 0.9877, + "step": 139320 + }, + { + "epoch": 0.8901396573093288, + "grad_norm": 1.0308243036270142, + "learning_rate": 5.8612341266654015e-05, + "loss": 0.9435, + "step": 139330 + }, + { + "epoch": 0.8902035444590675, + "grad_norm": 0.9423206448554993, + "learning_rate": 5.86073985335045e-05, + "loss": 0.8339, + "step": 139340 + }, + { + "epoch": 0.8902674316088062, + "grad_norm": 0.6064127087593079, + "learning_rate": 5.860245571367102e-05, + "loss": 0.8482, + "step": 139350 + }, + { + "epoch": 0.8903313187585449, + "grad_norm": 0.6485791206359863, + "learning_rate": 5.8597512807203393e-05, + "loss": 0.8751, + "step": 139360 + }, + { + "epoch": 0.8903952059082836, + "grad_norm": 4.558902740478516, + "learning_rate": 5.859256981415135e-05, + "loss": 0.9094, + "step": 139370 + }, + { + "epoch": 0.8904590930580223, + "grad_norm": 0.827876091003418, + "learning_rate": 5.858762673456472e-05, + "loss": 1.1908, + "step": 139380 + }, + { + "epoch": 0.890522980207761, + "grad_norm": 0.8407139182090759, + "learning_rate": 5.858268356849325e-05, + "loss": 0.8698, + "step": 139390 + }, + { + "epoch": 0.8905868673574997, + "grad_norm": 1.8288122415542603, + "learning_rate": 5.857774031598673e-05, + "loss": 1.018, + "step": 139400 + }, + { + "epoch": 0.8906507545072384, + "grad_norm": 0.8015510439872742, + "learning_rate": 5.8572796977094936e-05, + "loss": 0.803, + "step": 139410 + }, + { + "epoch": 0.8907146416569771, + "grad_norm": 1.08255934715271, + "learning_rate": 5.856785355186767e-05, + "loss": 0.8632, + "step": 139420 + }, + { + "epoch": 0.8907785288067158, + "grad_norm": 1.29863440990448, + "learning_rate": 5.8562910040354705e-05, + "loss": 0.8677, + "step": 139430 + }, + { + "epoch": 0.8908424159564545, + "grad_norm": 1.1884722709655762, + "learning_rate": 5.855796644260583e-05, + "loss": 0.8054, + "step": 139440 + }, + { + "epoch": 0.8909063031061932, + "grad_norm": 1.657820463180542, + "learning_rate": 
5.8553022758670816e-05, + "loss": 0.866, + "step": 139450 + }, + { + "epoch": 0.8909701902559319, + "grad_norm": 1.095188021659851, + "learning_rate": 5.8548078988599484e-05, + "loss": 0.8458, + "step": 139460 + }, + { + "epoch": 0.8910340774056706, + "grad_norm": 0.8800215125083923, + "learning_rate": 5.8543135132441585e-05, + "loss": 0.7631, + "step": 139470 + }, + { + "epoch": 0.8910979645554093, + "grad_norm": 0.9168996214866638, + "learning_rate": 5.8538191190246924e-05, + "loss": 0.8652, + "step": 139480 + }, + { + "epoch": 0.8911618517051481, + "grad_norm": 0.6553764343261719, + "learning_rate": 5.85332471620653e-05, + "loss": 0.7113, + "step": 139490 + }, + { + "epoch": 0.8912257388548868, + "grad_norm": 1.1859967708587646, + "learning_rate": 5.85283030479465e-05, + "loss": 0.7221, + "step": 139500 + }, + { + "epoch": 0.8912896260046255, + "grad_norm": 0.7039145827293396, + "learning_rate": 5.852335884794029e-05, + "loss": 0.6689, + "step": 139510 + }, + { + "epoch": 0.8913535131543642, + "grad_norm": 2.0129079818725586, + "learning_rate": 5.85184145620965e-05, + "loss": 0.8524, + "step": 139520 + }, + { + "epoch": 0.8914174003041029, + "grad_norm": 0.9877476692199707, + "learning_rate": 5.8513470190464905e-05, + "loss": 0.8462, + "step": 139530 + }, + { + "epoch": 0.8914812874538416, + "grad_norm": 0.46770796179771423, + "learning_rate": 5.8508525733095285e-05, + "loss": 1.1589, + "step": 139540 + }, + { + "epoch": 0.8915451746035802, + "grad_norm": 1.1145647764205933, + "learning_rate": 5.8503581190037474e-05, + "loss": 1.1228, + "step": 139550 + }, + { + "epoch": 0.8916090617533189, + "grad_norm": 1.5091100931167603, + "learning_rate": 5.8498636561341224e-05, + "loss": 0.7566, + "step": 139560 + }, + { + "epoch": 0.8916729489030576, + "grad_norm": 0.8867336511611938, + "learning_rate": 5.849369184705635e-05, + "loss": 0.8833, + "step": 139570 + }, + { + "epoch": 0.8917368360527963, + "grad_norm": 1.0336995124816895, + "learning_rate": 5.8488747047232675e-05, + "loss": 0.9395, + "step": 139580 + }, + { + "epoch": 0.891800723202535, + "grad_norm": 1.3706622123718262, + "learning_rate": 5.848380216191995e-05, + "loss": 0.7776, + "step": 139590 + }, + { + "epoch": 0.8918646103522737, + "grad_norm": 1.2319433689117432, + "learning_rate": 5.8478857191168e-05, + "loss": 0.7916, + "step": 139600 + }, + { + "epoch": 0.8919284975020124, + "grad_norm": 0.5949766039848328, + "learning_rate": 5.847391213502663e-05, + "loss": 0.7991, + "step": 139610 + }, + { + "epoch": 0.8919923846517511, + "grad_norm": 0.7637538313865662, + "learning_rate": 5.846896699354564e-05, + "loss": 0.8839, + "step": 139620 + }, + { + "epoch": 0.8920562718014898, + "grad_norm": 1.117910385131836, + "learning_rate": 5.846402176677481e-05, + "loss": 0.7672, + "step": 139630 + }, + { + "epoch": 0.8921201589512285, + "grad_norm": 1.7316735982894897, + "learning_rate": 5.845907645476397e-05, + "loss": 1.0049, + "step": 139640 + }, + { + "epoch": 0.8921840461009672, + "grad_norm": 1.0572848320007324, + "learning_rate": 5.8454131057562914e-05, + "loss": 1.3189, + "step": 139650 + }, + { + "epoch": 0.892247933250706, + "grad_norm": 1.482458233833313, + "learning_rate": 5.844918557522143e-05, + "loss": 0.8126, + "step": 139660 + }, + { + "epoch": 0.8923118204004447, + "grad_norm": 1.1422396898269653, + "learning_rate": 5.8444240007789343e-05, + "loss": 0.946, + "step": 139670 + }, + { + "epoch": 0.8923757075501834, + "grad_norm": 0.6669201254844666, + "learning_rate": 5.8439294355316455e-05, + "loss": 0.8283, + "step": 139680 
+ }, + { + "epoch": 0.8924395946999221, + "grad_norm": 0.7748156785964966, + "learning_rate": 5.8434348617852566e-05, + "loss": 0.8111, + "step": 139690 + }, + { + "epoch": 0.8925034818496608, + "grad_norm": 0.7147510051727295, + "learning_rate": 5.842940279544751e-05, + "loss": 0.7302, + "step": 139700 + }, + { + "epoch": 0.8925673689993995, + "grad_norm": 0.9036562442779541, + "learning_rate": 5.842445688815106e-05, + "loss": 0.8618, + "step": 139710 + }, + { + "epoch": 0.8926312561491382, + "grad_norm": 1.1501970291137695, + "learning_rate": 5.841951089601304e-05, + "loss": 0.7836, + "step": 139720 + }, + { + "epoch": 0.8926951432988769, + "grad_norm": 1.0131080150604248, + "learning_rate": 5.8414564819083275e-05, + "loss": 0.7891, + "step": 139730 + }, + { + "epoch": 0.8927590304486156, + "grad_norm": 0.6381675601005554, + "learning_rate": 5.8409618657411544e-05, + "loss": 0.9683, + "step": 139740 + }, + { + "epoch": 0.8928229175983543, + "grad_norm": 0.8520289063453674, + "learning_rate": 5.840467241104769e-05, + "loss": 0.8815, + "step": 139750 + }, + { + "epoch": 0.892886804748093, + "grad_norm": 0.7620411515235901, + "learning_rate": 5.8399726080041504e-05, + "loss": 0.8859, + "step": 139760 + }, + { + "epoch": 0.8929506918978317, + "grad_norm": 0.7203412652015686, + "learning_rate": 5.839477966444282e-05, + "loss": 0.863, + "step": 139770 + }, + { + "epoch": 0.8930145790475704, + "grad_norm": 1.159543752670288, + "learning_rate": 5.8389833164301445e-05, + "loss": 0.7974, + "step": 139780 + }, + { + "epoch": 0.8930784661973091, + "grad_norm": 0.6249431371688843, + "learning_rate": 5.838488657966717e-05, + "loss": 0.954, + "step": 139790 + }, + { + "epoch": 0.8931423533470477, + "grad_norm": 0.8362451195716858, + "learning_rate": 5.8379939910589854e-05, + "loss": 0.8083, + "step": 139800 + }, + { + "epoch": 0.8932062404967864, + "grad_norm": 1.5072931051254272, + "learning_rate": 5.8374993157119296e-05, + "loss": 1.3744, + "step": 139810 + }, + { + "epoch": 0.8932701276465251, + "grad_norm": 0.9383344054222107, + "learning_rate": 5.8370046319305296e-05, + "loss": 0.8008, + "step": 139820 + }, + { + "epoch": 0.8933340147962638, + "grad_norm": 0.8047425150871277, + "learning_rate": 5.8365099397197695e-05, + "loss": 1.0529, + "step": 139830 + }, + { + "epoch": 0.8933979019460025, + "grad_norm": 0.8353585600852966, + "learning_rate": 5.8360152390846304e-05, + "loss": 0.6732, + "step": 139840 + }, + { + "epoch": 0.8934617890957413, + "grad_norm": 1.0151777267456055, + "learning_rate": 5.835520530030094e-05, + "loss": 0.7437, + "step": 139850 + }, + { + "epoch": 0.89352567624548, + "grad_norm": 0.9449456930160522, + "learning_rate": 5.8350258125611436e-05, + "loss": 0.8322, + "step": 139860 + }, + { + "epoch": 0.8935895633952187, + "grad_norm": 1.3340734243392944, + "learning_rate": 5.834531086682762e-05, + "loss": 0.9176, + "step": 139870 + }, + { + "epoch": 0.8936534505449574, + "grad_norm": 0.7839272022247314, + "learning_rate": 5.834036352399929e-05, + "loss": 0.8046, + "step": 139880 + }, + { + "epoch": 0.8937173376946961, + "grad_norm": 1.2315632104873657, + "learning_rate": 5.833541609717629e-05, + "loss": 0.9361, + "step": 139890 + }, + { + "epoch": 0.8937812248444348, + "grad_norm": 1.0572025775909424, + "learning_rate": 5.833046858640844e-05, + "loss": 0.7237, + "step": 139900 + }, + { + "epoch": 0.8938451119941735, + "grad_norm": 0.9382676482200623, + "learning_rate": 5.832552099174556e-05, + "loss": 0.8231, + "step": 139910 + }, + { + "epoch": 0.8939089991439122, + 
"grad_norm": 1.3315147161483765, + "learning_rate": 5.832057331323748e-05, + "loss": 0.8058, + "step": 139920 + }, + { + "epoch": 0.8939728862936509, + "grad_norm": 0.7122629284858704, + "learning_rate": 5.8316120330933764e-05, + "loss": 1.0295, + "step": 139930 + }, + { + "epoch": 0.8940367734433896, + "grad_norm": 0.9100248217582703, + "learning_rate": 5.831117249325708e-05, + "loss": 1.1005, + "step": 139940 + }, + { + "epoch": 0.8941006605931283, + "grad_norm": 1.913546085357666, + "learning_rate": 5.830622457187971e-05, + "loss": 0.9199, + "step": 139950 + }, + { + "epoch": 0.894164547742867, + "grad_norm": 0.9733704328536987, + "learning_rate": 5.830127656685145e-05, + "loss": 0.8767, + "step": 139960 + }, + { + "epoch": 0.8942284348926057, + "grad_norm": 1.0809566974639893, + "learning_rate": 5.8296328478222174e-05, + "loss": 0.7217, + "step": 139970 + }, + { + "epoch": 0.8942923220423444, + "grad_norm": 1.1782524585723877, + "learning_rate": 5.8291380306041685e-05, + "loss": 1.0244, + "step": 139980 + }, + { + "epoch": 0.8943562091920831, + "grad_norm": 0.9064955711364746, + "learning_rate": 5.828643205035982e-05, + "loss": 0.8093, + "step": 139990 + }, + { + "epoch": 0.8944200963418218, + "grad_norm": 1.3655163049697876, + "learning_rate": 5.828148371122643e-05, + "loss": 0.9088, + "step": 140000 + }, + { + "epoch": 0.8944839834915606, + "grad_norm": 0.760166585445404, + "learning_rate": 5.8276535288691325e-05, + "loss": 0.8999, + "step": 140010 + }, + { + "epoch": 0.8945478706412993, + "grad_norm": 1.0654029846191406, + "learning_rate": 5.8271586782804344e-05, + "loss": 0.9849, + "step": 140020 + }, + { + "epoch": 0.894611757791038, + "grad_norm": 0.7643817067146301, + "learning_rate": 5.826663819361534e-05, + "loss": 1.1517, + "step": 140030 + }, + { + "epoch": 0.8946756449407766, + "grad_norm": 0.8907740116119385, + "learning_rate": 5.8261689521174136e-05, + "loss": 1.0153, + "step": 140040 + }, + { + "epoch": 0.8947395320905153, + "grad_norm": 0.8669731616973877, + "learning_rate": 5.825674076553056e-05, + "loss": 0.9049, + "step": 140050 + }, + { + "epoch": 0.894803419240254, + "grad_norm": 0.9580491185188293, + "learning_rate": 5.8251791926734464e-05, + "loss": 0.885, + "step": 140060 + }, + { + "epoch": 0.8948673063899927, + "grad_norm": 1.1952874660491943, + "learning_rate": 5.8246843004835695e-05, + "loss": 0.8488, + "step": 140070 + }, + { + "epoch": 0.8949311935397314, + "grad_norm": 0.9386703372001648, + "learning_rate": 5.824189399988408e-05, + "loss": 0.8763, + "step": 140080 + }, + { + "epoch": 0.8949950806894701, + "grad_norm": 0.8566167950630188, + "learning_rate": 5.823694491192947e-05, + "loss": 0.7872, + "step": 140090 + }, + { + "epoch": 0.8950589678392088, + "grad_norm": 0.6722133755683899, + "learning_rate": 5.8231995741021685e-05, + "loss": 0.9128, + "step": 140100 + }, + { + "epoch": 0.8951228549889475, + "grad_norm": 0.688102662563324, + "learning_rate": 5.822704648721059e-05, + "loss": 0.8653, + "step": 140110 + }, + { + "epoch": 0.8951867421386862, + "grad_norm": 0.8262643814086914, + "learning_rate": 5.8222097150545996e-05, + "loss": 0.9295, + "step": 140120 + }, + { + "epoch": 0.8952506292884249, + "grad_norm": 0.7746517658233643, + "learning_rate": 5.821714773107779e-05, + "loss": 0.9269, + "step": 140130 + }, + { + "epoch": 0.8953145164381636, + "grad_norm": 0.5757784247398376, + "learning_rate": 5.82121982288558e-05, + "loss": 0.9843, + "step": 140140 + }, + { + "epoch": 0.8953784035879023, + "grad_norm": 0.8565959334373474, + "learning_rate": 
5.8207248643929854e-05, + "loss": 0.9264, + "step": 140150 + }, + { + "epoch": 0.895442290737641, + "grad_norm": 0.7413806319236755, + "learning_rate": 5.820229897634983e-05, + "loss": 1.0038, + "step": 140160 + }, + { + "epoch": 0.8955061778873797, + "grad_norm": 0.862273633480072, + "learning_rate": 5.8197349226165556e-05, + "loss": 1.0254, + "step": 140170 + }, + { + "epoch": 0.8955700650371184, + "grad_norm": 1.1083346605300903, + "learning_rate": 5.8192399393426874e-05, + "loss": 1.0992, + "step": 140180 + }, + { + "epoch": 0.8956339521868572, + "grad_norm": 0.8349512219429016, + "learning_rate": 5.818744947818367e-05, + "loss": 0.9434, + "step": 140190 + }, + { + "epoch": 0.8956978393365959, + "grad_norm": 0.8543719053268433, + "learning_rate": 5.818249948048573e-05, + "loss": 0.7931, + "step": 140200 + }, + { + "epoch": 0.8957617264863346, + "grad_norm": 0.930448055267334, + "learning_rate": 5.817754940038296e-05, + "loss": 0.9503, + "step": 140210 + }, + { + "epoch": 0.8958256136360733, + "grad_norm": 1.1186769008636475, + "learning_rate": 5.8172599237925195e-05, + "loss": 0.9436, + "step": 140220 + }, + { + "epoch": 0.895889500785812, + "grad_norm": 0.9526256322860718, + "learning_rate": 5.8167648993162285e-05, + "loss": 0.7081, + "step": 140230 + }, + { + "epoch": 0.8959533879355507, + "grad_norm": 1.0328584909439087, + "learning_rate": 5.816269866614408e-05, + "loss": 0.9017, + "step": 140240 + }, + { + "epoch": 0.8960172750852894, + "grad_norm": 0.8597428798675537, + "learning_rate": 5.815774825692044e-05, + "loss": 0.8572, + "step": 140250 + }, + { + "epoch": 0.8960811622350281, + "grad_norm": 1.493808388710022, + "learning_rate": 5.815279776554121e-05, + "loss": 0.728, + "step": 140260 + }, + { + "epoch": 0.8961450493847668, + "grad_norm": 0.6569556593894958, + "learning_rate": 5.814784719205626e-05, + "loss": 0.7934, + "step": 140270 + }, + { + "epoch": 0.8962089365345054, + "grad_norm": 1.5247915983200073, + "learning_rate": 5.814289653651544e-05, + "loss": 0.9852, + "step": 140280 + }, + { + "epoch": 0.8962728236842441, + "grad_norm": 0.8611108064651489, + "learning_rate": 5.8137945798968606e-05, + "loss": 0.7608, + "step": 140290 + }, + { + "epoch": 0.8963367108339828, + "grad_norm": 1.3073726892471313, + "learning_rate": 5.813299497946562e-05, + "loss": 1.0608, + "step": 140300 + }, + { + "epoch": 0.8964005979837215, + "grad_norm": 0.9002431631088257, + "learning_rate": 5.812804407805633e-05, + "loss": 0.9049, + "step": 140310 + }, + { + "epoch": 0.8964644851334602, + "grad_norm": 0.9179620742797852, + "learning_rate": 5.8123093094790603e-05, + "loss": 0.761, + "step": 140320 + }, + { + "epoch": 0.8965283722831989, + "grad_norm": 1.2235110998153687, + "learning_rate": 5.8118142029718303e-05, + "loss": 0.7735, + "step": 140330 + }, + { + "epoch": 0.8965922594329376, + "grad_norm": 0.7875511646270752, + "learning_rate": 5.811319088288931e-05, + "loss": 1.0747, + "step": 140340 + }, + { + "epoch": 0.8966561465826763, + "grad_norm": 1.0100558996200562, + "learning_rate": 5.8108239654353444e-05, + "loss": 0.8439, + "step": 140350 + }, + { + "epoch": 0.896720033732415, + "grad_norm": 0.727079451084137, + "learning_rate": 5.81032883441606e-05, + "loss": 0.6591, + "step": 140360 + }, + { + "epoch": 0.8967839208821538, + "grad_norm": 0.9995219707489014, + "learning_rate": 5.809833695236063e-05, + "loss": 1.0365, + "step": 140370 + }, + { + "epoch": 0.8968478080318925, + "grad_norm": 1.349561095237732, + "learning_rate": 5.80933854790034e-05, + "loss": 0.8023, + "step": 140380 + 
}, + { + "epoch": 0.8969116951816312, + "grad_norm": 0.9812521934509277, + "learning_rate": 5.8088433924138785e-05, + "loss": 0.6831, + "step": 140390 + }, + { + "epoch": 0.8969755823313699, + "grad_norm": 0.8825498223304749, + "learning_rate": 5.808348228781662e-05, + "loss": 0.8826, + "step": 140400 + }, + { + "epoch": 0.8970394694811086, + "grad_norm": 1.0122778415679932, + "learning_rate": 5.807853057008682e-05, + "loss": 0.8666, + "step": 140410 + }, + { + "epoch": 0.8971033566308473, + "grad_norm": 0.6166019439697266, + "learning_rate": 5.807357877099922e-05, + "loss": 1.0452, + "step": 140420 + }, + { + "epoch": 0.897167243780586, + "grad_norm": 0.8858250379562378, + "learning_rate": 5.806862689060369e-05, + "loss": 1.0248, + "step": 140430 + }, + { + "epoch": 0.8972311309303247, + "grad_norm": 0.8427072167396545, + "learning_rate": 5.806367492895011e-05, + "loss": 0.7888, + "step": 140440 + }, + { + "epoch": 0.8972950180800634, + "grad_norm": 0.750184178352356, + "learning_rate": 5.805872288608834e-05, + "loss": 0.7918, + "step": 140450 + }, + { + "epoch": 0.8973589052298021, + "grad_norm": 1.128303050994873, + "learning_rate": 5.805377076206828e-05, + "loss": 0.6939, + "step": 140460 + }, + { + "epoch": 0.8974227923795408, + "grad_norm": 0.7906418442726135, + "learning_rate": 5.804881855693976e-05, + "loss": 0.9361, + "step": 140470 + }, + { + "epoch": 0.8974866795292795, + "grad_norm": 0.7705846428871155, + "learning_rate": 5.804386627075268e-05, + "loss": 1.2284, + "step": 140480 + }, + { + "epoch": 0.8975505666790182, + "grad_norm": 0.898186445236206, + "learning_rate": 5.803891390355691e-05, + "loss": 0.7291, + "step": 140490 + }, + { + "epoch": 0.8976144538287569, + "grad_norm": 0.7422863841056824, + "learning_rate": 5.803396145540232e-05, + "loss": 0.7275, + "step": 140500 + }, + { + "epoch": 0.8976783409784956, + "grad_norm": 1.0972338914871216, + "learning_rate": 5.802900892633879e-05, + "loss": 0.8772, + "step": 140510 + }, + { + "epoch": 0.8977422281282342, + "grad_norm": 1.0843368768692017, + "learning_rate": 5.8024056316416197e-05, + "loss": 0.9729, + "step": 140520 + }, + { + "epoch": 0.8978061152779729, + "grad_norm": 1.3560301065444946, + "learning_rate": 5.801910362568441e-05, + "loss": 1.0922, + "step": 140530 + }, + { + "epoch": 0.8978700024277116, + "grad_norm": 0.6802355647087097, + "learning_rate": 5.801415085419332e-05, + "loss": 0.851, + "step": 140540 + }, + { + "epoch": 0.8979338895774504, + "grad_norm": 1.0492808818817139, + "learning_rate": 5.800919800199279e-05, + "loss": 0.7976, + "step": 140550 + }, + { + "epoch": 0.8979977767271891, + "grad_norm": 0.4455210864543915, + "learning_rate": 5.8004245069132714e-05, + "loss": 1.0168, + "step": 140560 + }, + { + "epoch": 0.8980616638769278, + "grad_norm": 1.461052417755127, + "learning_rate": 5.799929205566296e-05, + "loss": 0.7563, + "step": 140570 + }, + { + "epoch": 0.8981255510266665, + "grad_norm": 0.7567191123962402, + "learning_rate": 5.799433896163342e-05, + "loss": 0.7135, + "step": 140580 + }, + { + "epoch": 0.8981894381764052, + "grad_norm": 1.1161195039749146, + "learning_rate": 5.7989385787093965e-05, + "loss": 1.2382, + "step": 140590 + }, + { + "epoch": 0.8982533253261439, + "grad_norm": 1.0105892419815063, + "learning_rate": 5.798443253209449e-05, + "loss": 0.9698, + "step": 140600 + }, + { + "epoch": 0.8983172124758826, + "grad_norm": 0.8450389504432678, + "learning_rate": 5.797947919668486e-05, + "loss": 1.1442, + "step": 140610 + }, + { + "epoch": 0.8983810996256213, + "grad_norm": 
1.1079736948013306, + "learning_rate": 5.797452578091498e-05, + "loss": 0.9697, + "step": 140620 + }, + { + "epoch": 0.89844498677536, + "grad_norm": 1.0912140607833862, + "learning_rate": 5.796957228483473e-05, + "loss": 0.9725, + "step": 140630 + }, + { + "epoch": 0.8985088739250987, + "grad_norm": 1.1677342653274536, + "learning_rate": 5.7964618708493966e-05, + "loss": 0.996, + "step": 140640 + }, + { + "epoch": 0.8985727610748374, + "grad_norm": 0.8014145493507385, + "learning_rate": 5.7959665051942626e-05, + "loss": 0.8948, + "step": 140650 + }, + { + "epoch": 0.8986366482245761, + "grad_norm": 0.5833203792572021, + "learning_rate": 5.795471131523057e-05, + "loss": 0.8698, + "step": 140660 + }, + { + "epoch": 0.8987005353743148, + "grad_norm": 1.0062291622161865, + "learning_rate": 5.7949757498407686e-05, + "loss": 0.8926, + "step": 140670 + }, + { + "epoch": 0.8987644225240535, + "grad_norm": 0.8668988943099976, + "learning_rate": 5.7944803601523866e-05, + "loss": 0.9216, + "step": 140680 + }, + { + "epoch": 0.8988283096737922, + "grad_norm": 0.9266487956047058, + "learning_rate": 5.793984962462901e-05, + "loss": 0.887, + "step": 140690 + }, + { + "epoch": 0.8988921968235309, + "grad_norm": 0.897591769695282, + "learning_rate": 5.793489556777299e-05, + "loss": 0.815, + "step": 140700 + }, + { + "epoch": 0.8989560839732696, + "grad_norm": 1.11785089969635, + "learning_rate": 5.792994143100571e-05, + "loss": 0.7505, + "step": 140710 + }, + { + "epoch": 0.8990199711230084, + "grad_norm": 0.7704851627349854, + "learning_rate": 5.7924987214377056e-05, + "loss": 0.7002, + "step": 140720 + }, + { + "epoch": 0.8990838582727471, + "grad_norm": 1.0531551837921143, + "learning_rate": 5.7920032917936925e-05, + "loss": 0.8227, + "step": 140730 + }, + { + "epoch": 0.8991477454224858, + "grad_norm": 1.1784263849258423, + "learning_rate": 5.791507854173521e-05, + "loss": 1.0551, + "step": 140740 + }, + { + "epoch": 0.8992116325722245, + "grad_norm": 1.1204239130020142, + "learning_rate": 5.791012408582182e-05, + "loss": 0.7425, + "step": 140750 + }, + { + "epoch": 0.8992755197219632, + "grad_norm": 1.3130213022232056, + "learning_rate": 5.790516955024662e-05, + "loss": 0.6903, + "step": 140760 + }, + { + "epoch": 0.8993394068717018, + "grad_norm": 1.0173828601837158, + "learning_rate": 5.790021493505953e-05, + "loss": 1.0036, + "step": 140770 + }, + { + "epoch": 0.8994032940214405, + "grad_norm": 0.8122161030769348, + "learning_rate": 5.789526024031044e-05, + "loss": 0.909, + "step": 140780 + }, + { + "epoch": 0.8994671811711792, + "grad_norm": 1.2210887670516968, + "learning_rate": 5.7890305466049255e-05, + "loss": 0.9721, + "step": 140790 + }, + { + "epoch": 0.8995310683209179, + "grad_norm": 1.316361665725708, + "learning_rate": 5.788535061232586e-05, + "loss": 0.8888, + "step": 140800 + }, + { + "epoch": 0.8995949554706566, + "grad_norm": 0.7551913857460022, + "learning_rate": 5.788039567919017e-05, + "loss": 1.071, + "step": 140810 + }, + { + "epoch": 0.8996588426203953, + "grad_norm": 1.1682409048080444, + "learning_rate": 5.787544066669207e-05, + "loss": 0.7179, + "step": 140820 + }, + { + "epoch": 0.899722729770134, + "grad_norm": 1.1482545137405396, + "learning_rate": 5.787048557488147e-05, + "loss": 0.8796, + "step": 140830 + }, + { + "epoch": 0.8997866169198727, + "grad_norm": 0.6773545145988464, + "learning_rate": 5.786553040380828e-05, + "loss": 0.8058, + "step": 140840 + }, + { + "epoch": 0.8998505040696114, + "grad_norm": 1.3889111280441284, + "learning_rate": 5.7860575153522375e-05, 
+ "loss": 0.7399, + "step": 140850 + }, + { + "epoch": 0.8999143912193501, + "grad_norm": 1.0887017250061035, + "learning_rate": 5.785561982407371e-05, + "loss": 0.7941, + "step": 140860 + }, + { + "epoch": 0.8999782783690888, + "grad_norm": 0.843041718006134, + "learning_rate": 5.785066441551212e-05, + "loss": 0.9538, + "step": 140870 + }, + { + "epoch": 0.9000421655188275, + "grad_norm": 0.9653436541557312, + "learning_rate": 5.784570892788758e-05, + "loss": 1.5095, + "step": 140880 + }, + { + "epoch": 0.9001060526685662, + "grad_norm": 0.824621319770813, + "learning_rate": 5.7840753361249945e-05, + "loss": 0.9549, + "step": 140890 + }, + { + "epoch": 0.900169939818305, + "grad_norm": 0.7692892551422119, + "learning_rate": 5.783579771564914e-05, + "loss": 0.8578, + "step": 140900 + }, + { + "epoch": 0.9002338269680437, + "grad_norm": 0.9946816563606262, + "learning_rate": 5.7830841991135086e-05, + "loss": 0.8378, + "step": 140910 + }, + { + "epoch": 0.9002977141177824, + "grad_norm": 0.8059331178665161, + "learning_rate": 5.782588618775766e-05, + "loss": 0.8517, + "step": 140920 + }, + { + "epoch": 0.9003616012675211, + "grad_norm": 0.7814688086509705, + "learning_rate": 5.782093030556681e-05, + "loss": 0.8913, + "step": 140930 + }, + { + "epoch": 0.9004254884172598, + "grad_norm": 0.7162201404571533, + "learning_rate": 5.781597434461241e-05, + "loss": 0.7852, + "step": 140940 + }, + { + "epoch": 0.9004893755669985, + "grad_norm": 0.7033975720405579, + "learning_rate": 5.78110183049444e-05, + "loss": 0.8596, + "step": 140950 + }, + { + "epoch": 0.9005532627167372, + "grad_norm": 1.4916331768035889, + "learning_rate": 5.7806062186612666e-05, + "loss": 0.9765, + "step": 140960 + }, + { + "epoch": 0.9006171498664759, + "grad_norm": 0.9655159115791321, + "learning_rate": 5.7801105989667134e-05, + "loss": 0.6591, + "step": 140970 + }, + { + "epoch": 0.9006810370162146, + "grad_norm": 1.061277985572815, + "learning_rate": 5.7796149714157724e-05, + "loss": 1.0504, + "step": 140980 + }, + { + "epoch": 0.9007449241659533, + "grad_norm": 0.7868290543556213, + "learning_rate": 5.779119336013433e-05, + "loss": 0.8025, + "step": 140990 + }, + { + "epoch": 0.900808811315692, + "grad_norm": 0.7589150667190552, + "learning_rate": 5.7786236927646886e-05, + "loss": 0.704, + "step": 141000 + }, + { + "epoch": 0.9008726984654306, + "grad_norm": 0.7982025742530823, + "learning_rate": 5.77812804167453e-05, + "loss": 0.8338, + "step": 141010 + }, + { + "epoch": 0.9009365856151693, + "grad_norm": 1.031693458557129, + "learning_rate": 5.7776323827479484e-05, + "loss": 0.8807, + "step": 141020 + }, + { + "epoch": 0.901000472764908, + "grad_norm": 0.9753697514533997, + "learning_rate": 5.777136715989936e-05, + "loss": 0.7684, + "step": 141030 + }, + { + "epoch": 0.9010643599146467, + "grad_norm": 0.9821022152900696, + "learning_rate": 5.776641041405485e-05, + "loss": 0.7901, + "step": 141040 + }, + { + "epoch": 0.9011282470643854, + "grad_norm": 0.5876692533493042, + "learning_rate": 5.776145358999587e-05, + "loss": 0.8646, + "step": 141050 + }, + { + "epoch": 0.9011921342141241, + "grad_norm": 1.2983272075653076, + "learning_rate": 5.7756496687772346e-05, + "loss": 0.721, + "step": 141060 + }, + { + "epoch": 0.9012560213638628, + "grad_norm": 0.7570874691009521, + "learning_rate": 5.775153970743418e-05, + "loss": 0.776, + "step": 141070 + }, + { + "epoch": 0.9013199085136016, + "grad_norm": 0.7375748157501221, + "learning_rate": 5.77465826490313e-05, + "loss": 0.7279, + "step": 141080 + }, + { + "epoch": 
0.9013837956633403, + "grad_norm": 1.1768232583999634, + "learning_rate": 5.774162551261363e-05, + "loss": 0.8846, + "step": 141090 + }, + { + "epoch": 0.901447682813079, + "grad_norm": 0.6223422884941101, + "learning_rate": 5.7736668298231103e-05, + "loss": 0.8627, + "step": 141100 + }, + { + "epoch": 0.9015115699628177, + "grad_norm": 0.9526621699333191, + "learning_rate": 5.773171100593362e-05, + "loss": 1.0312, + "step": 141110 + }, + { + "epoch": 0.9015754571125564, + "grad_norm": 1.0160198211669922, + "learning_rate": 5.772675363577112e-05, + "loss": 1.0077, + "step": 141120 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.5634738802909851, + "learning_rate": 5.772179618779354e-05, + "loss": 0.8606, + "step": 141130 + }, + { + "epoch": 0.9017032314120338, + "grad_norm": 0.93465656042099, + "learning_rate": 5.7716838662050784e-05, + "loss": 1.0614, + "step": 141140 + }, + { + "epoch": 0.9017671185617725, + "grad_norm": 0.9534331560134888, + "learning_rate": 5.7711881058592786e-05, + "loss": 0.796, + "step": 141150 + }, + { + "epoch": 0.9018310057115112, + "grad_norm": 1.255573034286499, + "learning_rate": 5.7706923377469477e-05, + "loss": 0.7282, + "step": 141160 + }, + { + "epoch": 0.9018948928612499, + "grad_norm": 0.8945760726928711, + "learning_rate": 5.770196561873077e-05, + "loss": 0.7917, + "step": 141170 + }, + { + "epoch": 0.9019587800109886, + "grad_norm": 1.1224944591522217, + "learning_rate": 5.769700778242661e-05, + "loss": 1.3042, + "step": 141180 + }, + { + "epoch": 0.9020226671607273, + "grad_norm": 1.421687126159668, + "learning_rate": 5.769204986860692e-05, + "loss": 0.8137, + "step": 141190 + }, + { + "epoch": 0.902086554310466, + "grad_norm": 0.8872746229171753, + "learning_rate": 5.7687091877321654e-05, + "loss": 0.7287, + "step": 141200 + }, + { + "epoch": 0.9021504414602047, + "grad_norm": 0.8511372804641724, + "learning_rate": 5.7682133808620706e-05, + "loss": 0.6813, + "step": 141210 + }, + { + "epoch": 0.9022143286099434, + "grad_norm": 0.7379246354103088, + "learning_rate": 5.7677175662554025e-05, + "loss": 0.9436, + "step": 141220 + }, + { + "epoch": 0.9022782157596821, + "grad_norm": 0.7902219891548157, + "learning_rate": 5.767221743917155e-05, + "loss": 1.0224, + "step": 141230 + }, + { + "epoch": 0.9023421029094209, + "grad_norm": 1.0444148778915405, + "learning_rate": 5.766725913852321e-05, + "loss": 0.7918, + "step": 141240 + }, + { + "epoch": 0.9024059900591594, + "grad_norm": 0.641941249370575, + "learning_rate": 5.766230076065893e-05, + "loss": 0.749, + "step": 141250 + }, + { + "epoch": 0.9024698772088982, + "grad_norm": 0.4563290476799011, + "learning_rate": 5.7657342305628647e-05, + "loss": 0.6918, + "step": 141260 + }, + { + "epoch": 0.9025337643586369, + "grad_norm": 0.9786915183067322, + "learning_rate": 5.765238377348232e-05, + "loss": 0.6863, + "step": 141270 + }, + { + "epoch": 0.9025976515083756, + "grad_norm": 0.9505366086959839, + "learning_rate": 5.764742516426985e-05, + "loss": 0.8688, + "step": 141280 + }, + { + "epoch": 0.9026615386581143, + "grad_norm": 0.6907140612602234, + "learning_rate": 5.76424664780412e-05, + "loss": 0.6706, + "step": 141290 + }, + { + "epoch": 0.902725425807853, + "grad_norm": 0.9045562148094177, + "learning_rate": 5.7637507714846304e-05, + "loss": 0.9848, + "step": 141300 + }, + { + "epoch": 0.9027893129575917, + "grad_norm": 1.1987589597702026, + "learning_rate": 5.763254887473512e-05, + "loss": 0.8481, + "step": 141310 + }, + { + "epoch": 0.9028532001073304, + "grad_norm": 1.0016857385635376, + 
"learning_rate": 5.7627589957757535e-05, + "loss": 0.9294, + "step": 141320 + }, + { + "epoch": 0.9029170872570691, + "grad_norm": 0.8415845632553101, + "learning_rate": 5.762263096396351e-05, + "loss": 0.7345, + "step": 141330 + }, + { + "epoch": 0.9029809744068078, + "grad_norm": 0.8191704154014587, + "learning_rate": 5.761767189340302e-05, + "loss": 0.9255, + "step": 141340 + }, + { + "epoch": 0.9030448615565465, + "grad_norm": 0.7609559893608093, + "learning_rate": 5.761271274612597e-05, + "loss": 0.7549, + "step": 141350 + }, + { + "epoch": 0.9031087487062852, + "grad_norm": 0.6478745341300964, + "learning_rate": 5.7607753522182326e-05, + "loss": 0.882, + "step": 141360 + }, + { + "epoch": 0.9031726358560239, + "grad_norm": 1.0700238943099976, + "learning_rate": 5.7602794221622024e-05, + "loss": 0.8373, + "step": 141370 + }, + { + "epoch": 0.9032365230057626, + "grad_norm": 0.638687252998352, + "learning_rate": 5.7597834844495005e-05, + "loss": 0.8512, + "step": 141380 + }, + { + "epoch": 0.9033004101555013, + "grad_norm": 0.8440176248550415, + "learning_rate": 5.759287539085121e-05, + "loss": 0.8574, + "step": 141390 + }, + { + "epoch": 0.90336429730524, + "grad_norm": 1.7291699647903442, + "learning_rate": 5.7587915860740596e-05, + "loss": 0.9808, + "step": 141400 + }, + { + "epoch": 0.9034281844549787, + "grad_norm": 1.2312055826187134, + "learning_rate": 5.758295625421311e-05, + "loss": 0.7749, + "step": 141410 + }, + { + "epoch": 0.9034920716047175, + "grad_norm": 0.8590479493141174, + "learning_rate": 5.757799657131868e-05, + "loss": 0.9986, + "step": 141420 + }, + { + "epoch": 0.9035559587544562, + "grad_norm": 0.7374904155731201, + "learning_rate": 5.757303681210728e-05, + "loss": 0.7731, + "step": 141430 + }, + { + "epoch": 0.9036198459041949, + "grad_norm": 1.1177715063095093, + "learning_rate": 5.756807697662885e-05, + "loss": 1.0543, + "step": 141440 + }, + { + "epoch": 0.9036837330539336, + "grad_norm": 0.9571028351783752, + "learning_rate": 5.7563117064933327e-05, + "loss": 0.9526, + "step": 141450 + }, + { + "epoch": 0.9037476202036723, + "grad_norm": 0.8054457902908325, + "learning_rate": 5.755815707707067e-05, + "loss": 1.079, + "step": 141460 + }, + { + "epoch": 0.903811507353411, + "grad_norm": 1.010640263557434, + "learning_rate": 5.755319701309084e-05, + "loss": 0.9706, + "step": 141470 + }, + { + "epoch": 0.9038753945031497, + "grad_norm": 0.7342219948768616, + "learning_rate": 5.7548236873043795e-05, + "loss": 0.6875, + "step": 141480 + }, + { + "epoch": 0.9039392816528884, + "grad_norm": 0.8823431730270386, + "learning_rate": 5.754327665697945e-05, + "loss": 0.8113, + "step": 141490 + }, + { + "epoch": 0.904003168802627, + "grad_norm": 0.9259976744651794, + "learning_rate": 5.75383163649478e-05, + "loss": 0.7433, + "step": 141500 + }, + { + "epoch": 0.9040670559523657, + "grad_norm": 0.7435508370399475, + "learning_rate": 5.753335599699877e-05, + "loss": 0.725, + "step": 141510 + }, + { + "epoch": 0.9041309431021044, + "grad_norm": 1.2026573419570923, + "learning_rate": 5.752839555318235e-05, + "loss": 0.9269, + "step": 141520 + }, + { + "epoch": 0.9041948302518431, + "grad_norm": 0.9192835092544556, + "learning_rate": 5.752343503354844e-05, + "loss": 0.7876, + "step": 141530 + }, + { + "epoch": 0.9042587174015818, + "grad_norm": 0.9267190098762512, + "learning_rate": 5.7518474438147054e-05, + "loss": 0.9516, + "step": 141540 + }, + { + "epoch": 0.9043226045513205, + "grad_norm": 0.72882479429245, + "learning_rate": 5.7513513767028124e-05, + "loss": 0.8018, + 
"step": 141550 + }, + { + "epoch": 0.9043864917010592, + "grad_norm": 0.7397010326385498, + "learning_rate": 5.7508553020241606e-05, + "loss": 0.7555, + "step": 141560 + }, + { + "epoch": 0.9044503788507979, + "grad_norm": 1.5903676748275757, + "learning_rate": 5.750359219783746e-05, + "loss": 0.8326, + "step": 141570 + }, + { + "epoch": 0.9045142660005366, + "grad_norm": 1.2328916788101196, + "learning_rate": 5.749863129986566e-05, + "loss": 1.1779, + "step": 141580 + }, + { + "epoch": 0.9045781531502753, + "grad_norm": 1.1340621709823608, + "learning_rate": 5.7493670326376146e-05, + "loss": 1.0768, + "step": 141590 + }, + { + "epoch": 0.904642040300014, + "grad_norm": 0.6127023100852966, + "learning_rate": 5.74887092774189e-05, + "loss": 0.9385, + "step": 141600 + }, + { + "epoch": 0.9047059274497528, + "grad_norm": 1.0815454721450806, + "learning_rate": 5.748374815304386e-05, + "loss": 0.6736, + "step": 141610 + }, + { + "epoch": 0.9047698145994915, + "grad_norm": 1.3410221338272095, + "learning_rate": 5.7478786953301014e-05, + "loss": 0.8431, + "step": 141620 + }, + { + "epoch": 0.9048337017492302, + "grad_norm": 1.1525499820709229, + "learning_rate": 5.74738256782403e-05, + "loss": 0.872, + "step": 141630 + }, + { + "epoch": 0.9048975888989689, + "grad_norm": 0.8780061602592468, + "learning_rate": 5.74688643279117e-05, + "loss": 0.9206, + "step": 141640 + }, + { + "epoch": 0.9049614760487076, + "grad_norm": 1.385847568511963, + "learning_rate": 5.7463902902365174e-05, + "loss": 0.8192, + "step": 141650 + }, + { + "epoch": 0.9050253631984463, + "grad_norm": 1.0178450345993042, + "learning_rate": 5.745894140165069e-05, + "loss": 0.8582, + "step": 141660 + }, + { + "epoch": 0.905089250348185, + "grad_norm": 1.3906409740447998, + "learning_rate": 5.745397982581822e-05, + "loss": 0.79, + "step": 141670 + }, + { + "epoch": 0.9051531374979237, + "grad_norm": 0.7807207107543945, + "learning_rate": 5.7449018174917726e-05, + "loss": 0.721, + "step": 141680 + }, + { + "epoch": 0.9052170246476624, + "grad_norm": 0.7866250872612, + "learning_rate": 5.744405644899916e-05, + "loss": 0.7746, + "step": 141690 + }, + { + "epoch": 0.9052809117974011, + "grad_norm": 0.7729105949401855, + "learning_rate": 5.74390946481125e-05, + "loss": 0.682, + "step": 141700 + }, + { + "epoch": 0.9053447989471398, + "grad_norm": 1.1368606090545654, + "learning_rate": 5.7434132772307735e-05, + "loss": 1.2501, + "step": 141710 + }, + { + "epoch": 0.9054086860968785, + "grad_norm": 2.2718279361724854, + "learning_rate": 5.742917082163483e-05, + "loss": 0.8217, + "step": 141720 + }, + { + "epoch": 0.9054725732466172, + "grad_norm": 0.683323323726654, + "learning_rate": 5.742420879614373e-05, + "loss": 1.092, + "step": 141730 + }, + { + "epoch": 0.9055364603963558, + "grad_norm": 0.8295241594314575, + "learning_rate": 5.741924669588443e-05, + "loss": 0.6993, + "step": 141740 + }, + { + "epoch": 0.9056003475460945, + "grad_norm": 0.8456636667251587, + "learning_rate": 5.7414284520906905e-05, + "loss": 0.6898, + "step": 141750 + }, + { + "epoch": 0.9056642346958332, + "grad_norm": 0.8255101442337036, + "learning_rate": 5.7409322271261115e-05, + "loss": 0.9425, + "step": 141760 + }, + { + "epoch": 0.9057281218455719, + "grad_norm": 0.7259197235107422, + "learning_rate": 5.740435994699704e-05, + "loss": 0.7357, + "step": 141770 + }, + { + "epoch": 0.9057920089953106, + "grad_norm": 0.8073982000350952, + "learning_rate": 5.739939754816466e-05, + "loss": 1.0821, + "step": 141780 + }, + { + "epoch": 0.9058558961450494, + 
"grad_norm": 0.9572609663009644, + "learning_rate": 5.7394435074813944e-05, + "loss": 0.9942, + "step": 141790 + }, + { + "epoch": 0.9059197832947881, + "grad_norm": 0.7269392013549805, + "learning_rate": 5.738947252699487e-05, + "loss": 0.8068, + "step": 141800 + }, + { + "epoch": 0.9059836704445268, + "grad_norm": 0.710191547870636, + "learning_rate": 5.738450990475741e-05, + "loss": 0.8417, + "step": 141810 + }, + { + "epoch": 0.9060475575942655, + "grad_norm": 1.1868984699249268, + "learning_rate": 5.7379547208151554e-05, + "loss": 0.7552, + "step": 141820 + }, + { + "epoch": 0.9061114447440042, + "grad_norm": 0.8787739872932434, + "learning_rate": 5.737458443722726e-05, + "loss": 0.9598, + "step": 141830 + }, + { + "epoch": 0.9061753318937429, + "grad_norm": 0.5496005415916443, + "learning_rate": 5.736962159203453e-05, + "loss": 0.801, + "step": 141840 + }, + { + "epoch": 0.9062392190434816, + "grad_norm": 1.013084053993225, + "learning_rate": 5.736465867262333e-05, + "loss": 0.9996, + "step": 141850 + }, + { + "epoch": 0.9063031061932203, + "grad_norm": 0.8402630686759949, + "learning_rate": 5.735969567904363e-05, + "loss": 0.7736, + "step": 141860 + }, + { + "epoch": 0.906366993342959, + "grad_norm": 0.7525802850723267, + "learning_rate": 5.735473261134545e-05, + "loss": 0.6874, + "step": 141870 + }, + { + "epoch": 0.9064308804926977, + "grad_norm": 0.8910543918609619, + "learning_rate": 5.734976946957875e-05, + "loss": 0.8928, + "step": 141880 + }, + { + "epoch": 0.9064947676424364, + "grad_norm": 1.0312716960906982, + "learning_rate": 5.7344806253793504e-05, + "loss": 0.8406, + "step": 141890 + }, + { + "epoch": 0.9065586547921751, + "grad_norm": 0.8210422396659851, + "learning_rate": 5.733984296403971e-05, + "loss": 1.0218, + "step": 141900 + }, + { + "epoch": 0.9066225419419138, + "grad_norm": 1.0540539026260376, + "learning_rate": 5.733487960036735e-05, + "loss": 0.9388, + "step": 141910 + }, + { + "epoch": 0.9066864290916525, + "grad_norm": 1.0182119607925415, + "learning_rate": 5.73299161628264e-05, + "loss": 0.9436, + "step": 141920 + }, + { + "epoch": 0.9067503162413912, + "grad_norm": 0.9228383898735046, + "learning_rate": 5.732495265146687e-05, + "loss": 0.7732, + "step": 141930 + }, + { + "epoch": 0.90681420339113, + "grad_norm": 0.8174379467964172, + "learning_rate": 5.731998906633871e-05, + "loss": 1.08, + "step": 141940 + }, + { + "epoch": 0.9068780905408687, + "grad_norm": 0.807985246181488, + "learning_rate": 5.731502540749194e-05, + "loss": 0.9834, + "step": 141950 + }, + { + "epoch": 0.9069419776906074, + "grad_norm": 0.8408271670341492, + "learning_rate": 5.7310061674976526e-05, + "loss": 0.958, + "step": 141960 + }, + { + "epoch": 0.9070058648403461, + "grad_norm": 0.7872259616851807, + "learning_rate": 5.730509786884247e-05, + "loss": 1.0426, + "step": 141970 + }, + { + "epoch": 0.9070697519900847, + "grad_norm": 0.7922796010971069, + "learning_rate": 5.730013398913976e-05, + "loss": 1.0874, + "step": 141980 + }, + { + "epoch": 0.9071336391398234, + "grad_norm": 0.8514977693557739, + "learning_rate": 5.729517003591839e-05, + "loss": 1.1897, + "step": 141990 + }, + { + "epoch": 0.9071975262895621, + "grad_norm": 1.1067577600479126, + "learning_rate": 5.729020600922833e-05, + "loss": 1.1056, + "step": 142000 + }, + { + "epoch": 0.9072614134393008, + "grad_norm": 1.1628286838531494, + "learning_rate": 5.7285241909119606e-05, + "loss": 1.0116, + "step": 142010 + }, + { + "epoch": 0.9073253005890395, + "grad_norm": 1.2236912250518799, + "learning_rate": 
5.7280277735642184e-05, + "loss": 0.7426, + "step": 142020 + }, + { + "epoch": 0.9073891877387782, + "grad_norm": 1.162598729133606, + "learning_rate": 5.727531348884607e-05, + "loss": 0.8706, + "step": 142030 + }, + { + "epoch": 0.9074530748885169, + "grad_norm": 1.4383586645126343, + "learning_rate": 5.7270349168781256e-05, + "loss": 0.8572, + "step": 142040 + }, + { + "epoch": 0.9075169620382556, + "grad_norm": 0.8928152322769165, + "learning_rate": 5.726538477549774e-05, + "loss": 0.8158, + "step": 142050 + }, + { + "epoch": 0.9075808491879943, + "grad_norm": 0.8349987864494324, + "learning_rate": 5.7260420309045507e-05, + "loss": 1.1324, + "step": 142060 + }, + { + "epoch": 0.907644736337733, + "grad_norm": 0.7434611320495605, + "learning_rate": 5.725545576947456e-05, + "loss": 0.9721, + "step": 142070 + }, + { + "epoch": 0.9077086234874717, + "grad_norm": 0.7635177373886108, + "learning_rate": 5.72504911568349e-05, + "loss": 0.8982, + "step": 142080 + }, + { + "epoch": 0.9077725106372104, + "grad_norm": 0.7675304412841797, + "learning_rate": 5.724552647117653e-05, + "loss": 0.9434, + "step": 142090 + }, + { + "epoch": 0.9078363977869491, + "grad_norm": 0.9475330710411072, + "learning_rate": 5.724056171254942e-05, + "loss": 0.6869, + "step": 142100 + }, + { + "epoch": 0.9079002849366878, + "grad_norm": 1.1535788774490356, + "learning_rate": 5.7235596881003604e-05, + "loss": 0.7812, + "step": 142110 + }, + { + "epoch": 0.9079641720864265, + "grad_norm": 0.9977142214775085, + "learning_rate": 5.723063197658907e-05, + "loss": 1.0825, + "step": 142120 + }, + { + "epoch": 0.9080280592361653, + "grad_norm": 0.7173039317131042, + "learning_rate": 5.722566699935581e-05, + "loss": 0.843, + "step": 142130 + }, + { + "epoch": 0.908091946385904, + "grad_norm": 1.8091074228286743, + "learning_rate": 5.7220701949353825e-05, + "loss": 0.8146, + "step": 142140 + }, + { + "epoch": 0.9081558335356427, + "grad_norm": 1.055882453918457, + "learning_rate": 5.7215736826633135e-05, + "loss": 0.7627, + "step": 142150 + }, + { + "epoch": 0.9082197206853814, + "grad_norm": 0.6005893349647522, + "learning_rate": 5.721077163124373e-05, + "loss": 1.0631, + "step": 142160 + }, + { + "epoch": 0.9082836078351201, + "grad_norm": 0.9407183527946472, + "learning_rate": 5.7205806363235616e-05, + "loss": 0.982, + "step": 142170 + }, + { + "epoch": 0.9083474949848588, + "grad_norm": 1.6951545476913452, + "learning_rate": 5.7200841022658804e-05, + "loss": 0.8731, + "step": 142180 + }, + { + "epoch": 0.9084113821345975, + "grad_norm": 1.102827787399292, + "learning_rate": 5.719587560956327e-05, + "loss": 0.9827, + "step": 142190 + }, + { + "epoch": 0.9084752692843362, + "grad_norm": 1.1529837846755981, + "learning_rate": 5.719091012399907e-05, + "loss": 0.8706, + "step": 142200 + }, + { + "epoch": 0.9085391564340749, + "grad_norm": 0.8350210785865784, + "learning_rate": 5.718594456601618e-05, + "loss": 0.8377, + "step": 142210 + }, + { + "epoch": 0.9086030435838135, + "grad_norm": 0.9454380869865417, + "learning_rate": 5.71809789356646e-05, + "loss": 0.893, + "step": 142220 + }, + { + "epoch": 0.9086669307335522, + "grad_norm": 0.8189147710800171, + "learning_rate": 5.7176013232994354e-05, + "loss": 0.9778, + "step": 142230 + }, + { + "epoch": 0.9087308178832909, + "grad_norm": 0.8861716389656067, + "learning_rate": 5.717104745805545e-05, + "loss": 1.0042, + "step": 142240 + }, + { + "epoch": 0.9087947050330296, + "grad_norm": 0.785626232624054, + "learning_rate": 5.716608161089789e-05, + "loss": 0.8079, + "step": 142250 + 
}, + { + "epoch": 0.9088585921827683, + "grad_norm": 1.1369320154190063, + "learning_rate": 5.716111569157169e-05, + "loss": 0.7916, + "step": 142260 + }, + { + "epoch": 0.908922479332507, + "grad_norm": 1.1900125741958618, + "learning_rate": 5.715614970012686e-05, + "loss": 0.9075, + "step": 142270 + }, + { + "epoch": 0.9089863664822457, + "grad_norm": 1.0034444332122803, + "learning_rate": 5.7151183636613425e-05, + "loss": 0.9664, + "step": 142280 + }, + { + "epoch": 0.9090502536319844, + "grad_norm": 0.9154879450798035, + "learning_rate": 5.714621750108138e-05, + "loss": 0.9571, + "step": 142290 + }, + { + "epoch": 0.9091141407817231, + "grad_norm": 0.6897664070129395, + "learning_rate": 5.714125129358072e-05, + "loss": 0.9097, + "step": 142300 + }, + { + "epoch": 0.9091780279314619, + "grad_norm": 0.6371667385101318, + "learning_rate": 5.7136285014161506e-05, + "loss": 0.7985, + "step": 142310 + }, + { + "epoch": 0.9092419150812006, + "grad_norm": 1.1131823062896729, + "learning_rate": 5.713131866287371e-05, + "loss": 0.9823, + "step": 142320 + }, + { + "epoch": 0.9093058022309393, + "grad_norm": 1.771092176437378, + "learning_rate": 5.712635223976738e-05, + "loss": 1.0425, + "step": 142330 + }, + { + "epoch": 0.909369689380678, + "grad_norm": 1.0553698539733887, + "learning_rate": 5.712138574489251e-05, + "loss": 0.9531, + "step": 142340 + }, + { + "epoch": 0.9094335765304167, + "grad_norm": 0.4885224401950836, + "learning_rate": 5.711641917829913e-05, + "loss": 1.0004, + "step": 142350 + }, + { + "epoch": 0.9094974636801554, + "grad_norm": 0.8409444093704224, + "learning_rate": 5.7111452540037245e-05, + "loss": 0.6038, + "step": 142360 + }, + { + "epoch": 0.9095613508298941, + "grad_norm": 0.7915950417518616, + "learning_rate": 5.7106485830156885e-05, + "loss": 0.72, + "step": 142370 + }, + { + "epoch": 0.9096252379796328, + "grad_norm": 0.8521913290023804, + "learning_rate": 5.710151904870806e-05, + "loss": 0.9703, + "step": 142380 + }, + { + "epoch": 0.9096891251293715, + "grad_norm": 0.759997546672821, + "learning_rate": 5.7096552195740797e-05, + "loss": 0.8757, + "step": 142390 + }, + { + "epoch": 0.9097530122791102, + "grad_norm": 1.0440067052841187, + "learning_rate": 5.7091585271305113e-05, + "loss": 0.9393, + "step": 142400 + }, + { + "epoch": 0.9098168994288489, + "grad_norm": 0.5940192341804504, + "learning_rate": 5.7086618275451034e-05, + "loss": 0.6164, + "step": 142410 + }, + { + "epoch": 0.9098807865785876, + "grad_norm": 0.9643594622612, + "learning_rate": 5.708165120822857e-05, + "loss": 0.9354, + "step": 142420 + }, + { + "epoch": 0.9099446737283263, + "grad_norm": 1.3111646175384521, + "learning_rate": 5.707668406968776e-05, + "loss": 0.8814, + "step": 142430 + }, + { + "epoch": 0.910008560878065, + "grad_norm": 0.7643352746963501, + "learning_rate": 5.7071716859878624e-05, + "loss": 0.7996, + "step": 142440 + }, + { + "epoch": 0.9100724480278037, + "grad_norm": 0.6600208878517151, + "learning_rate": 5.7066749578851163e-05, + "loss": 0.9294, + "step": 142450 + }, + { + "epoch": 0.9101363351775424, + "grad_norm": 0.8577162027359009, + "learning_rate": 5.706178222665543e-05, + "loss": 0.8142, + "step": 142460 + }, + { + "epoch": 0.910200222327281, + "grad_norm": 0.6898266077041626, + "learning_rate": 5.7056814803341454e-05, + "loss": 0.7175, + "step": 142470 + }, + { + "epoch": 0.9102641094770197, + "grad_norm": 0.6253551840782166, + "learning_rate": 5.705184730895924e-05, + "loss": 0.8652, + "step": 142480 + }, + { + "epoch": 0.9103279966267585, + "grad_norm": 
1.0927127599716187, + "learning_rate": 5.704687974355881e-05, + "loss": 0.9197, + "step": 142490 + }, + { + "epoch": 0.9103918837764972, + "grad_norm": 1.2770670652389526, + "learning_rate": 5.7041912107190223e-05, + "loss": 0.7483, + "step": 142500 + }, + { + "epoch": 0.9104557709262359, + "grad_norm": 0.7076462507247925, + "learning_rate": 5.703694439990348e-05, + "loss": 0.9537, + "step": 142510 + }, + { + "epoch": 0.9105196580759746, + "grad_norm": 1.3433187007904053, + "learning_rate": 5.703197662174863e-05, + "loss": 0.8636, + "step": 142520 + }, + { + "epoch": 0.9105835452257133, + "grad_norm": 1.6730338335037231, + "learning_rate": 5.702700877277568e-05, + "loss": 0.9131, + "step": 142530 + }, + { + "epoch": 0.910647432375452, + "grad_norm": 1.130669355392456, + "learning_rate": 5.702204085303468e-05, + "loss": 0.9168, + "step": 142540 + }, + { + "epoch": 0.9107113195251907, + "grad_norm": 0.9546806216239929, + "learning_rate": 5.7017072862575626e-05, + "loss": 0.8348, + "step": 142550 + }, + { + "epoch": 0.9107752066749294, + "grad_norm": 0.6770340800285339, + "learning_rate": 5.701210480144861e-05, + "loss": 0.9684, + "step": 142560 + }, + { + "epoch": 0.9108390938246681, + "grad_norm": 1.0904873609542847, + "learning_rate": 5.700713666970361e-05, + "loss": 0.8426, + "step": 142570 + }, + { + "epoch": 0.9109029809744068, + "grad_norm": 0.8043915629386902, + "learning_rate": 5.7002168467390694e-05, + "loss": 1.1309, + "step": 142580 + }, + { + "epoch": 0.9109668681241455, + "grad_norm": 0.7632177472114563, + "learning_rate": 5.699720019455989e-05, + "loss": 0.8359, + "step": 142590 + }, + { + "epoch": 0.9110307552738842, + "grad_norm": 0.8584951758384705, + "learning_rate": 5.699223185126121e-05, + "loss": 0.8166, + "step": 142600 + }, + { + "epoch": 0.9110946424236229, + "grad_norm": 0.704011082649231, + "learning_rate": 5.698726343754472e-05, + "loss": 0.7772, + "step": 142610 + }, + { + "epoch": 0.9111585295733616, + "grad_norm": 0.954410970211029, + "learning_rate": 5.698229495346044e-05, + "loss": 0.9824, + "step": 142620 + }, + { + "epoch": 0.9112224167231003, + "grad_norm": 1.6050761938095093, + "learning_rate": 5.697732639905841e-05, + "loss": 0.986, + "step": 142630 + }, + { + "epoch": 0.911286303872839, + "grad_norm": 0.8036152720451355, + "learning_rate": 5.697235777438866e-05, + "loss": 1.1253, + "step": 142640 + }, + { + "epoch": 0.9113501910225777, + "grad_norm": 1.0591254234313965, + "learning_rate": 5.6967389079501234e-05, + "loss": 0.7433, + "step": 142650 + }, + { + "epoch": 0.9114140781723165, + "grad_norm": 0.63569176197052, + "learning_rate": 5.6962420314446186e-05, + "loss": 0.7921, + "step": 142660 + }, + { + "epoch": 0.9114779653220552, + "grad_norm": 0.6590409278869629, + "learning_rate": 5.6957451479273526e-05, + "loss": 0.7715, + "step": 142670 + }, + { + "epoch": 0.9115418524717939, + "grad_norm": 0.8721647262573242, + "learning_rate": 5.695248257403332e-05, + "loss": 0.9248, + "step": 142680 + }, + { + "epoch": 0.9116057396215326, + "grad_norm": 0.8576902151107788, + "learning_rate": 5.6947513598775605e-05, + "loss": 0.8463, + "step": 142690 + }, + { + "epoch": 0.9116696267712713, + "grad_norm": 1.4896668195724487, + "learning_rate": 5.69425445535504e-05, + "loss": 0.816, + "step": 142700 + }, + { + "epoch": 0.9117335139210099, + "grad_norm": 0.9425134062767029, + "learning_rate": 5.693757543840779e-05, + "loss": 0.9642, + "step": 142710 + }, + { + "epoch": 0.9117974010707486, + "grad_norm": 0.9548724889755249, + "learning_rate": 5.693260625339777e-05, 
+ "loss": 0.6938, + "step": 142720 + }, + { + "epoch": 0.9118612882204873, + "grad_norm": 1.2961103916168213, + "learning_rate": 5.692763699857042e-05, + "loss": 0.993, + "step": 142730 + }, + { + "epoch": 0.911925175370226, + "grad_norm": 0.8667069673538208, + "learning_rate": 5.692266767397576e-05, + "loss": 0.8989, + "step": 142740 + }, + { + "epoch": 0.9119890625199647, + "grad_norm": 0.678893506526947, + "learning_rate": 5.691769827966386e-05, + "loss": 0.7032, + "step": 142750 + }, + { + "epoch": 0.9120529496697034, + "grad_norm": 0.5294567942619324, + "learning_rate": 5.6912728815684744e-05, + "loss": 0.9135, + "step": 142760 + }, + { + "epoch": 0.9121168368194421, + "grad_norm": 0.7864146828651428, + "learning_rate": 5.690775928208848e-05, + "loss": 0.7941, + "step": 142770 + }, + { + "epoch": 0.9121807239691808, + "grad_norm": 0.7031729817390442, + "learning_rate": 5.690278967892511e-05, + "loss": 0.8931, + "step": 142780 + }, + { + "epoch": 0.9122446111189195, + "grad_norm": 0.7443121671676636, + "learning_rate": 5.689782000624466e-05, + "loss": 0.8238, + "step": 142790 + }, + { + "epoch": 0.9123084982686582, + "grad_norm": 0.7489047050476074, + "learning_rate": 5.68928502640972e-05, + "loss": 0.731, + "step": 142800 + }, + { + "epoch": 0.9123723854183969, + "grad_norm": 1.1270229816436768, + "learning_rate": 5.688788045253277e-05, + "loss": 0.9834, + "step": 142810 + }, + { + "epoch": 0.9124362725681356, + "grad_norm": 0.6103757619857788, + "learning_rate": 5.688291057160143e-05, + "loss": 1.0271, + "step": 142820 + }, + { + "epoch": 0.9125001597178743, + "grad_norm": 0.7932960987091064, + "learning_rate": 5.687794062135322e-05, + "loss": 0.7008, + "step": 142830 + }, + { + "epoch": 0.912564046867613, + "grad_norm": 1.3297314643859863, + "learning_rate": 5.687297060183821e-05, + "loss": 0.9021, + "step": 142840 + }, + { + "epoch": 0.9126279340173518, + "grad_norm": 0.9981955289840698, + "learning_rate": 5.6868000513106435e-05, + "loss": 0.9884, + "step": 142850 + }, + { + "epoch": 0.9126918211670905, + "grad_norm": 1.4859464168548584, + "learning_rate": 5.6863030355207945e-05, + "loss": 1.0916, + "step": 142860 + }, + { + "epoch": 0.9127557083168292, + "grad_norm": 1.1081634759902954, + "learning_rate": 5.685806012819281e-05, + "loss": 0.7292, + "step": 142870 + }, + { + "epoch": 0.9128195954665679, + "grad_norm": 0.9023155570030212, + "learning_rate": 5.6853089832111076e-05, + "loss": 0.8272, + "step": 142880 + }, + { + "epoch": 0.9128834826163066, + "grad_norm": 0.712682843208313, + "learning_rate": 5.6848119467012795e-05, + "loss": 1.0256, + "step": 142890 + }, + { + "epoch": 0.9129473697660453, + "grad_norm": 1.0017153024673462, + "learning_rate": 5.684314903294803e-05, + "loss": 0.9368, + "step": 142900 + }, + { + "epoch": 0.913011256915784, + "grad_norm": 1.385031819343567, + "learning_rate": 5.6838178529966825e-05, + "loss": 0.8932, + "step": 142910 + }, + { + "epoch": 0.9130751440655227, + "grad_norm": 0.9716565608978271, + "learning_rate": 5.683320795811925e-05, + "loss": 1.127, + "step": 142920 + }, + { + "epoch": 0.9131390312152614, + "grad_norm": 2.391300916671753, + "learning_rate": 5.6828237317455365e-05, + "loss": 1.1424, + "step": 142930 + }, + { + "epoch": 0.9132029183650001, + "grad_norm": 2.1368186473846436, + "learning_rate": 5.682326660802523e-05, + "loss": 1.1169, + "step": 142940 + }, + { + "epoch": 0.9132668055147387, + "grad_norm": 1.1230798959732056, + "learning_rate": 5.6818295829878874e-05, + "loss": 0.791, + "step": 142950 + }, + { + "epoch": 
0.9133306926644774, + "grad_norm": 0.8756389617919922, + "learning_rate": 5.6813324983066404e-05, + "loss": 0.7168, + "step": 142960 + }, + { + "epoch": 0.9133945798142161, + "grad_norm": 4.043021202087402, + "learning_rate": 5.680835406763785e-05, + "loss": 0.8593, + "step": 142970 + }, + { + "epoch": 0.9134584669639548, + "grad_norm": 0.9189384579658508, + "learning_rate": 5.680338308364328e-05, + "loss": 0.7222, + "step": 142980 + }, + { + "epoch": 0.9135223541136935, + "grad_norm": 0.9267463088035583, + "learning_rate": 5.679841203113275e-05, + "loss": 0.9252, + "step": 142990 + }, + { + "epoch": 0.9135862412634322, + "grad_norm": 0.9476075768470764, + "learning_rate": 5.6793440910156336e-05, + "loss": 0.8154, + "step": 143000 + }, + { + "epoch": 0.913650128413171, + "grad_norm": 0.7349272966384888, + "learning_rate": 5.67884697207641e-05, + "loss": 1.0062, + "step": 143010 + }, + { + "epoch": 0.9137140155629097, + "grad_norm": 0.6401690244674683, + "learning_rate": 5.67834984630061e-05, + "loss": 0.9801, + "step": 143020 + }, + { + "epoch": 0.9137779027126484, + "grad_norm": 0.9111157655715942, + "learning_rate": 5.677852713693239e-05, + "loss": 0.8259, + "step": 143030 + }, + { + "epoch": 0.9138417898623871, + "grad_norm": 0.5553478598594666, + "learning_rate": 5.6773555742593065e-05, + "loss": 0.8509, + "step": 143040 + }, + { + "epoch": 0.9139056770121258, + "grad_norm": 0.6528088450431824, + "learning_rate": 5.676858428003815e-05, + "loss": 0.6361, + "step": 143050 + }, + { + "epoch": 0.9139695641618645, + "grad_norm": 0.7725486755371094, + "learning_rate": 5.676361274931775e-05, + "loss": 0.9128, + "step": 143060 + }, + { + "epoch": 0.9140334513116032, + "grad_norm": 0.8535653948783875, + "learning_rate": 5.67586411504819e-05, + "loss": 0.9194, + "step": 143070 + }, + { + "epoch": 0.9140973384613419, + "grad_norm": 0.4683364927768707, + "learning_rate": 5.675366948358072e-05, + "loss": 0.8434, + "step": 143080 + }, + { + "epoch": 0.9141612256110806, + "grad_norm": 1.8194103240966797, + "learning_rate": 5.6748697748664225e-05, + "loss": 1.1056, + "step": 143090 + }, + { + "epoch": 0.9142251127608193, + "grad_norm": 0.8043122887611389, + "learning_rate": 5.674372594578251e-05, + "loss": 0.7171, + "step": 143100 + }, + { + "epoch": 0.914288999910558, + "grad_norm": 0.7784401774406433, + "learning_rate": 5.673875407498563e-05, + "loss": 0.8041, + "step": 143110 + }, + { + "epoch": 0.9143528870602967, + "grad_norm": 1.7115349769592285, + "learning_rate": 5.673378213632368e-05, + "loss": 0.863, + "step": 143120 + }, + { + "epoch": 0.9144167742100354, + "grad_norm": 0.8383505940437317, + "learning_rate": 5.672881012984672e-05, + "loss": 0.8503, + "step": 143130 + }, + { + "epoch": 0.9144806613597741, + "grad_norm": 0.8597167134284973, + "learning_rate": 5.672383805560482e-05, + "loss": 0.9283, + "step": 143140 + }, + { + "epoch": 0.9145445485095128, + "grad_norm": 1.1670582294464111, + "learning_rate": 5.6718865913648044e-05, + "loss": 0.9148, + "step": 143150 + }, + { + "epoch": 0.9146084356592515, + "grad_norm": 0.7308951616287231, + "learning_rate": 5.671389370402648e-05, + "loss": 0.8466, + "step": 143160 + }, + { + "epoch": 0.9146723228089902, + "grad_norm": 0.7195169925689697, + "learning_rate": 5.6708921426790194e-05, + "loss": 0.9054, + "step": 143170 + }, + { + "epoch": 0.914736209958729, + "grad_norm": 0.6677785515785217, + "learning_rate": 5.670394908198927e-05, + "loss": 0.7878, + "step": 143180 + }, + { + "epoch": 0.9148000971084677, + "grad_norm": 1.0069258213043213, + 
"learning_rate": 5.669897666967378e-05, + "loss": 0.8408, + "step": 143190 + }, + { + "epoch": 0.9148639842582063, + "grad_norm": 0.9950356483459473, + "learning_rate": 5.66940041898938e-05, + "loss": 0.9091, + "step": 143200 + }, + { + "epoch": 0.914927871407945, + "grad_norm": 0.8431347012519836, + "learning_rate": 5.6689031642699405e-05, + "loss": 0.7076, + "step": 143210 + }, + { + "epoch": 0.9149917585576837, + "grad_norm": 0.8251869678497314, + "learning_rate": 5.668405902814067e-05, + "loss": 0.9349, + "step": 143220 + }, + { + "epoch": 0.9150556457074224, + "grad_norm": 1.2934318780899048, + "learning_rate": 5.6679086346267685e-05, + "loss": 0.913, + "step": 143230 + }, + { + "epoch": 0.9151195328571611, + "grad_norm": 0.7144001722335815, + "learning_rate": 5.6674113597130515e-05, + "loss": 0.9816, + "step": 143240 + }, + { + "epoch": 0.9151834200068998, + "grad_norm": 0.8660211563110352, + "learning_rate": 5.666914078077926e-05, + "loss": 0.9166, + "step": 143250 + }, + { + "epoch": 0.9152473071566385, + "grad_norm": 0.8381525874137878, + "learning_rate": 5.6664167897263975e-05, + "loss": 0.8673, + "step": 143260 + }, + { + "epoch": 0.9153111943063772, + "grad_norm": 1.0956501960754395, + "learning_rate": 5.6659194946634764e-05, + "loss": 0.8773, + "step": 143270 + }, + { + "epoch": 0.9153750814561159, + "grad_norm": 0.6203900575637817, + "learning_rate": 5.6654221928941685e-05, + "loss": 0.9247, + "step": 143280 + }, + { + "epoch": 0.9154389686058546, + "grad_norm": 0.8943018317222595, + "learning_rate": 5.664924884423485e-05, + "loss": 0.8628, + "step": 143290 + }, + { + "epoch": 0.9155028557555933, + "grad_norm": 1.1351211071014404, + "learning_rate": 5.664427569256432e-05, + "loss": 0.7142, + "step": 143300 + }, + { + "epoch": 0.915566742905332, + "grad_norm": 0.7743847370147705, + "learning_rate": 5.663930247398018e-05, + "loss": 0.9053, + "step": 143310 + }, + { + "epoch": 0.9156306300550707, + "grad_norm": 0.7925605177879333, + "learning_rate": 5.663432918853253e-05, + "loss": 0.7516, + "step": 143320 + }, + { + "epoch": 0.9156945172048094, + "grad_norm": 0.9708541631698608, + "learning_rate": 5.6629355836271435e-05, + "loss": 1.1418, + "step": 143330 + }, + { + "epoch": 0.9157584043545481, + "grad_norm": 1.0878440141677856, + "learning_rate": 5.6624382417247004e-05, + "loss": 0.8489, + "step": 143340 + }, + { + "epoch": 0.9158222915042868, + "grad_norm": 0.7541216015815735, + "learning_rate": 5.66194089315093e-05, + "loss": 1.2666, + "step": 143350 + }, + { + "epoch": 0.9158861786540256, + "grad_norm": 0.780955970287323, + "learning_rate": 5.6614435379108434e-05, + "loss": 0.6196, + "step": 143360 + }, + { + "epoch": 0.9159500658037643, + "grad_norm": 0.8869837522506714, + "learning_rate": 5.6609461760094476e-05, + "loss": 0.7971, + "step": 143370 + }, + { + "epoch": 0.916013952953503, + "grad_norm": 0.9061605930328369, + "learning_rate": 5.660448807451752e-05, + "loss": 1.1207, + "step": 143380 + }, + { + "epoch": 0.9160778401032417, + "grad_norm": 0.8924505710601807, + "learning_rate": 5.659951432242765e-05, + "loss": 0.8292, + "step": 143390 + }, + { + "epoch": 0.9161417272529804, + "grad_norm": 1.1079713106155396, + "learning_rate": 5.659454050387496e-05, + "loss": 0.7443, + "step": 143400 + }, + { + "epoch": 0.9162056144027191, + "grad_norm": 1.7120331525802612, + "learning_rate": 5.658956661890955e-05, + "loss": 0.9726, + "step": 143410 + }, + { + "epoch": 0.9162695015524578, + "grad_norm": 0.6569525003433228, + "learning_rate": 5.6584592667581494e-05, + "loss": 
1.1294, + "step": 143420 + }, + { + "epoch": 0.9163333887021965, + "grad_norm": 1.015156626701355, + "learning_rate": 5.65796186499409e-05, + "loss": 0.9498, + "step": 143430 + }, + { + "epoch": 0.9163972758519351, + "grad_norm": 0.7955893874168396, + "learning_rate": 5.657464456603785e-05, + "loss": 0.7562, + "step": 143440 + }, + { + "epoch": 0.9164611630016738, + "grad_norm": 0.87772136926651, + "learning_rate": 5.6569670415922436e-05, + "loss": 0.8287, + "step": 143450 + }, + { + "epoch": 0.9165250501514125, + "grad_norm": 0.7367571592330933, + "learning_rate": 5.656469619964477e-05, + "loss": 0.9409, + "step": 143460 + }, + { + "epoch": 0.9165889373011512, + "grad_norm": 0.940991997718811, + "learning_rate": 5.6559721917254924e-05, + "loss": 1.0162, + "step": 143470 + }, + { + "epoch": 0.9166528244508899, + "grad_norm": 0.8817263841629028, + "learning_rate": 5.655474756880301e-05, + "loss": 0.8085, + "step": 143480 + }, + { + "epoch": 0.9167167116006286, + "grad_norm": 0.982813835144043, + "learning_rate": 5.654977315433914e-05, + "loss": 0.9593, + "step": 143490 + }, + { + "epoch": 0.9167805987503673, + "grad_norm": 1.1913138628005981, + "learning_rate": 5.6544798673913354e-05, + "loss": 0.8467, + "step": 143500 + }, + { + "epoch": 0.916844485900106, + "grad_norm": 0.7284004092216492, + "learning_rate": 5.653982412757579e-05, + "loss": 0.8006, + "step": 143510 + }, + { + "epoch": 0.9169083730498447, + "grad_norm": 1.0794998407363892, + "learning_rate": 5.653484951537655e-05, + "loss": 0.794, + "step": 143520 + }, + { + "epoch": 0.9169722601995834, + "grad_norm": 1.2948535680770874, + "learning_rate": 5.652987483736572e-05, + "loss": 0.7273, + "step": 143530 + }, + { + "epoch": 0.9170361473493222, + "grad_norm": 0.7950011491775513, + "learning_rate": 5.652490009359339e-05, + "loss": 0.9643, + "step": 143540 + }, + { + "epoch": 0.9171000344990609, + "grad_norm": 0.5536199808120728, + "learning_rate": 5.651992528410967e-05, + "loss": 0.8933, + "step": 143550 + }, + { + "epoch": 0.9171639216487996, + "grad_norm": 0.880750298500061, + "learning_rate": 5.6514950408964685e-05, + "loss": 1.0077, + "step": 143560 + }, + { + "epoch": 0.9172278087985383, + "grad_norm": 1.2838913202285767, + "learning_rate": 5.6509975468208484e-05, + "loss": 0.7484, + "step": 143570 + }, + { + "epoch": 0.917291695948277, + "grad_norm": 0.8207671046257019, + "learning_rate": 5.650500046189122e-05, + "loss": 0.8739, + "step": 143580 + }, + { + "epoch": 0.9173555830980157, + "grad_norm": 0.9268253445625305, + "learning_rate": 5.650002539006296e-05, + "loss": 1.0368, + "step": 143590 + }, + { + "epoch": 0.9174194702477544, + "grad_norm": 0.7340649962425232, + "learning_rate": 5.649505025277382e-05, + "loss": 0.9109, + "step": 143600 + }, + { + "epoch": 0.9174833573974931, + "grad_norm": 1.0350323915481567, + "learning_rate": 5.649007505007391e-05, + "loss": 0.9756, + "step": 143610 + }, + { + "epoch": 0.9175472445472318, + "grad_norm": 0.784234344959259, + "learning_rate": 5.6485099782013326e-05, + "loss": 0.7373, + "step": 143620 + }, + { + "epoch": 0.9176111316969705, + "grad_norm": 0.8511738181114197, + "learning_rate": 5.648012444864219e-05, + "loss": 0.7554, + "step": 143630 + }, + { + "epoch": 0.9176750188467092, + "grad_norm": 0.9637295007705688, + "learning_rate": 5.647514905001059e-05, + "loss": 0.6863, + "step": 143640 + }, + { + "epoch": 0.9177389059964479, + "grad_norm": 0.6820176243782043, + "learning_rate": 5.6470173586168625e-05, + "loss": 0.974, + "step": 143650 + }, + { + "epoch": 
0.9178027931461866, + "grad_norm": 0.7366352081298828, + "learning_rate": 5.646519805716643e-05, + "loss": 0.6999, + "step": 143660 + }, + { + "epoch": 0.9178666802959253, + "grad_norm": 0.7290107607841492, + "learning_rate": 5.646022246305409e-05, + "loss": 1.0206, + "step": 143670 + }, + { + "epoch": 0.9179305674456639, + "grad_norm": 1.0230958461761475, + "learning_rate": 5.645524680388172e-05, + "loss": 0.8786, + "step": 143680 + }, + { + "epoch": 0.9179944545954026, + "grad_norm": 1.1091904640197754, + "learning_rate": 5.645027107969942e-05, + "loss": 0.8807, + "step": 143690 + }, + { + "epoch": 0.9180583417451413, + "grad_norm": 1.9306013584136963, + "learning_rate": 5.644529529055733e-05, + "loss": 1.2533, + "step": 143700 + }, + { + "epoch": 0.91812222889488, + "grad_norm": 1.0572997331619263, + "learning_rate": 5.644031943650553e-05, + "loss": 0.7035, + "step": 143710 + }, + { + "epoch": 0.9181861160446187, + "grad_norm": 0.9027218818664551, + "learning_rate": 5.643534351759414e-05, + "loss": 0.8836, + "step": 143720 + }, + { + "epoch": 0.9182500031943575, + "grad_norm": 0.7371615767478943, + "learning_rate": 5.643036753387328e-05, + "loss": 0.6614, + "step": 143730 + }, + { + "epoch": 0.9183138903440962, + "grad_norm": 1.505537986755371, + "learning_rate": 5.642539148539306e-05, + "loss": 0.8683, + "step": 143740 + }, + { + "epoch": 0.9183777774938349, + "grad_norm": 0.8361949920654297, + "learning_rate": 5.64204153722036e-05, + "loss": 0.9926, + "step": 143750 + }, + { + "epoch": 0.9184416646435736, + "grad_norm": 1.110982894897461, + "learning_rate": 5.641543919435496e-05, + "loss": 0.8015, + "step": 143760 + }, + { + "epoch": 0.9185055517933123, + "grad_norm": 1.32961106300354, + "learning_rate": 5.641046295189733e-05, + "loss": 0.8, + "step": 143770 + }, + { + "epoch": 0.918569438943051, + "grad_norm": 0.8517293334007263, + "learning_rate": 5.640548664488078e-05, + "loss": 0.6818, + "step": 143780 + }, + { + "epoch": 0.9186333260927897, + "grad_norm": 0.8187686204910278, + "learning_rate": 5.6400510273355446e-05, + "loss": 0.793, + "step": 143790 + }, + { + "epoch": 0.9186972132425284, + "grad_norm": 2.4176223278045654, + "learning_rate": 5.639553383737143e-05, + "loss": 0.9948, + "step": 143800 + }, + { + "epoch": 0.9187611003922671, + "grad_norm": 0.7014775276184082, + "learning_rate": 5.6390557336978855e-05, + "loss": 1.1245, + "step": 143810 + }, + { + "epoch": 0.9188249875420058, + "grad_norm": 1.063392996788025, + "learning_rate": 5.638558077222784e-05, + "loss": 1.0894, + "step": 143820 + }, + { + "epoch": 0.9188888746917445, + "grad_norm": 0.49587613344192505, + "learning_rate": 5.63806041431685e-05, + "loss": 0.6732, + "step": 143830 + }, + { + "epoch": 0.9189527618414832, + "grad_norm": 0.848645031452179, + "learning_rate": 5.637562744985097e-05, + "loss": 0.9066, + "step": 143840 + }, + { + "epoch": 0.9190166489912219, + "grad_norm": 0.8816063404083252, + "learning_rate": 5.637065069232534e-05, + "loss": 0.9941, + "step": 143850 + }, + { + "epoch": 0.9190805361409606, + "grad_norm": 1.5030635595321655, + "learning_rate": 5.6365673870641755e-05, + "loss": 0.7683, + "step": 143860 + }, + { + "epoch": 0.9191444232906993, + "grad_norm": 1.574723482131958, + "learning_rate": 5.6360696984850324e-05, + "loss": 1.0703, + "step": 143870 + }, + { + "epoch": 0.919208310440438, + "grad_norm": 1.3411686420440674, + "learning_rate": 5.635572003500117e-05, + "loss": 0.9338, + "step": 143880 + }, + { + "epoch": 0.9192721975901768, + "grad_norm": 1.3527319431304932, + 
"learning_rate": 5.6350743021144416e-05, + "loss": 1.2366, + "step": 143890 + }, + { + "epoch": 0.9193360847399155, + "grad_norm": 0.8805691599845886, + "learning_rate": 5.634576594333019e-05, + "loss": 0.8299, + "step": 143900 + }, + { + "epoch": 0.9193999718896542, + "grad_norm": 0.6124188899993896, + "learning_rate": 5.634078880160861e-05, + "loss": 0.8055, + "step": 143910 + }, + { + "epoch": 0.9194638590393928, + "grad_norm": 1.0732533931732178, + "learning_rate": 5.63358115960298e-05, + "loss": 1.1149, + "step": 143920 + }, + { + "epoch": 0.9195277461891315, + "grad_norm": 0.5931747555732727, + "learning_rate": 5.633083432664389e-05, + "loss": 1.1673, + "step": 143930 + }, + { + "epoch": 0.9195916333388702, + "grad_norm": 1.0256335735321045, + "learning_rate": 5.632585699350099e-05, + "loss": 0.8732, + "step": 143940 + }, + { + "epoch": 0.9196555204886089, + "grad_norm": 0.9125732779502869, + "learning_rate": 5.632087959665124e-05, + "loss": 0.756, + "step": 143950 + }, + { + "epoch": 0.9197194076383476, + "grad_norm": 1.4520151615142822, + "learning_rate": 5.6315902136144784e-05, + "loss": 0.906, + "step": 143960 + }, + { + "epoch": 0.9197832947880863, + "grad_norm": 0.8130024671554565, + "learning_rate": 5.63109246120317e-05, + "loss": 0.7474, + "step": 143970 + }, + { + "epoch": 0.919847181937825, + "grad_norm": 0.40209460258483887, + "learning_rate": 5.630594702436217e-05, + "loss": 0.9289, + "step": 143980 + }, + { + "epoch": 0.9199110690875637, + "grad_norm": 0.8355494737625122, + "learning_rate": 5.630096937318629e-05, + "loss": 0.8839, + "step": 143990 + }, + { + "epoch": 0.9199749562373024, + "grad_norm": 1.9247257709503174, + "learning_rate": 5.629599165855419e-05, + "loss": 0.9603, + "step": 144000 + }, + { + "epoch": 0.9200388433870411, + "grad_norm": 0.7350730299949646, + "learning_rate": 5.629101388051602e-05, + "loss": 1.0115, + "step": 144010 + }, + { + "epoch": 0.9201027305367798, + "grad_norm": 0.9462405443191528, + "learning_rate": 5.628603603912188e-05, + "loss": 1.123, + "step": 144020 + }, + { + "epoch": 0.9201666176865185, + "grad_norm": 0.7234540581703186, + "learning_rate": 5.628105813442194e-05, + "loss": 0.8485, + "step": 144030 + }, + { + "epoch": 0.9202305048362572, + "grad_norm": 1.3080463409423828, + "learning_rate": 5.6276080166466294e-05, + "loss": 1.0137, + "step": 144040 + }, + { + "epoch": 0.9202943919859959, + "grad_norm": 1.0539608001708984, + "learning_rate": 5.62711021353051e-05, + "loss": 0.6729, + "step": 144050 + }, + { + "epoch": 0.9203582791357346, + "grad_norm": 0.9095763564109802, + "learning_rate": 5.626612404098848e-05, + "loss": 0.6638, + "step": 144060 + }, + { + "epoch": 0.9204221662854734, + "grad_norm": 1.001478672027588, + "learning_rate": 5.626114588356657e-05, + "loss": 0.9633, + "step": 144070 + }, + { + "epoch": 0.9204860534352121, + "grad_norm": 0.8986124396324158, + "learning_rate": 5.62561676630895e-05, + "loss": 0.8985, + "step": 144080 + }, + { + "epoch": 0.9205499405849508, + "grad_norm": 0.4983496367931366, + "learning_rate": 5.6251189379607415e-05, + "loss": 0.8668, + "step": 144090 + }, + { + "epoch": 0.9206138277346895, + "grad_norm": 1.6429502964019775, + "learning_rate": 5.6246211033170434e-05, + "loss": 0.9987, + "step": 144100 + }, + { + "epoch": 0.9206777148844282, + "grad_norm": 1.0039671659469604, + "learning_rate": 5.624123262382872e-05, + "loss": 0.8202, + "step": 144110 + }, + { + "epoch": 0.9207416020341669, + "grad_norm": 1.0661791563034058, + "learning_rate": 5.6236254151632385e-05, + "loss": 0.7685, + 
"step": 144120 + }, + { + "epoch": 0.9208054891839056, + "grad_norm": 0.973341166973114, + "learning_rate": 5.623177347295643e-05, + "loss": 0.8032, + "step": 144130 + }, + { + "epoch": 0.9208693763336443, + "grad_norm": 1.365503191947937, + "learning_rate": 5.6226794881474464e-05, + "loss": 0.7741, + "step": 144140 + }, + { + "epoch": 0.920933263483383, + "grad_norm": 0.7111403942108154, + "learning_rate": 5.622181622728329e-05, + "loss": 0.7476, + "step": 144150 + }, + { + "epoch": 0.9209971506331217, + "grad_norm": 0.641816258430481, + "learning_rate": 5.621683751043304e-05, + "loss": 0.9546, + "step": 144160 + }, + { + "epoch": 0.9210610377828603, + "grad_norm": 0.6329175233840942, + "learning_rate": 5.6211858730973856e-05, + "loss": 0.9538, + "step": 144170 + }, + { + "epoch": 0.921124924932599, + "grad_norm": 1.5208319425582886, + "learning_rate": 5.620687988895589e-05, + "loss": 0.8583, + "step": 144180 + }, + { + "epoch": 0.9211888120823377, + "grad_norm": 1.1061300039291382, + "learning_rate": 5.6201900984429255e-05, + "loss": 0.8138, + "step": 144190 + }, + { + "epoch": 0.9212526992320764, + "grad_norm": 0.8774361610412598, + "learning_rate": 5.619692201744413e-05, + "loss": 0.7494, + "step": 144200 + }, + { + "epoch": 0.9213165863818151, + "grad_norm": 1.0457643270492554, + "learning_rate": 5.6191942988050626e-05, + "loss": 0.8806, + "step": 144210 + }, + { + "epoch": 0.9213804735315538, + "grad_norm": 0.7023540139198303, + "learning_rate": 5.618696389629892e-05, + "loss": 0.739, + "step": 144220 + }, + { + "epoch": 0.9214443606812925, + "grad_norm": 0.671416699886322, + "learning_rate": 5.6181984742239117e-05, + "loss": 0.6146, + "step": 144230 + }, + { + "epoch": 0.9215082478310312, + "grad_norm": 0.7087250351905823, + "learning_rate": 5.6177005525921376e-05, + "loss": 1.0222, + "step": 144240 + }, + { + "epoch": 0.92157213498077, + "grad_norm": 1.0113321542739868, + "learning_rate": 5.617202624739585e-05, + "loss": 0.7315, + "step": 144250 + }, + { + "epoch": 0.9216360221305087, + "grad_norm": 1.1815264225006104, + "learning_rate": 5.616704690671267e-05, + "loss": 0.8089, + "step": 144260 + }, + { + "epoch": 0.9216999092802474, + "grad_norm": 0.7599946856498718, + "learning_rate": 5.616206750392201e-05, + "loss": 0.7554, + "step": 144270 + }, + { + "epoch": 0.9217637964299861, + "grad_norm": 1.0004082918167114, + "learning_rate": 5.6157088039074e-05, + "loss": 0.8352, + "step": 144280 + }, + { + "epoch": 0.9218276835797248, + "grad_norm": 1.208046317100525, + "learning_rate": 5.615210851221878e-05, + "loss": 0.9545, + "step": 144290 + }, + { + "epoch": 0.9218915707294635, + "grad_norm": 1.0685365200042725, + "learning_rate": 5.61471289234065e-05, + "loss": 1.0576, + "step": 144300 + }, + { + "epoch": 0.9219554578792022, + "grad_norm": 1.0764927864074707, + "learning_rate": 5.614214927268733e-05, + "loss": 0.946, + "step": 144310 + }, + { + "epoch": 0.9220193450289409, + "grad_norm": 1.0587629079818726, + "learning_rate": 5.613716956011139e-05, + "loss": 1.0278, + "step": 144320 + }, + { + "epoch": 0.9220832321786796, + "grad_norm": 1.0095889568328857, + "learning_rate": 5.613218978572884e-05, + "loss": 1.0609, + "step": 144330 + }, + { + "epoch": 0.9221471193284183, + "grad_norm": 0.8525044322013855, + "learning_rate": 5.6127209949589845e-05, + "loss": 1.2406, + "step": 144340 + }, + { + "epoch": 0.922211006478157, + "grad_norm": 0.7696043252944946, + "learning_rate": 5.612223005174454e-05, + "loss": 0.8016, + "step": 144350 + }, + { + "epoch": 0.9222748936278957, + 
"grad_norm": 0.8853589296340942, + "learning_rate": 5.6117250092243076e-05, + "loss": 0.929, + "step": 144360 + }, + { + "epoch": 0.9223387807776344, + "grad_norm": 1.2267171144485474, + "learning_rate": 5.611227007113563e-05, + "loss": 0.7504, + "step": 144370 + }, + { + "epoch": 0.9224026679273731, + "grad_norm": 0.9859304428100586, + "learning_rate": 5.6107289988472325e-05, + "loss": 0.9157, + "step": 144380 + }, + { + "epoch": 0.9224665550771118, + "grad_norm": 0.824668288230896, + "learning_rate": 5.6102309844303324e-05, + "loss": 1.1008, + "step": 144390 + }, + { + "epoch": 0.9225304422268505, + "grad_norm": 0.9470674991607666, + "learning_rate": 5.609732963867879e-05, + "loss": 0.852, + "step": 144400 + }, + { + "epoch": 0.9225943293765891, + "grad_norm": 0.7566165924072266, + "learning_rate": 5.609234937164886e-05, + "loss": 0.8084, + "step": 144410 + }, + { + "epoch": 0.9226582165263278, + "grad_norm": 0.963897168636322, + "learning_rate": 5.60873690432637e-05, + "loss": 1.0114, + "step": 144420 + }, + { + "epoch": 0.9227221036760666, + "grad_norm": 0.7856702208518982, + "learning_rate": 5.608238865357348e-05, + "loss": 0.9301, + "step": 144430 + }, + { + "epoch": 0.9227859908258053, + "grad_norm": 1.138763666152954, + "learning_rate": 5.6077408202628334e-05, + "loss": 0.7774, + "step": 144440 + }, + { + "epoch": 0.922849877975544, + "grad_norm": 1.0540574789047241, + "learning_rate": 5.607242769047843e-05, + "loss": 1.1135, + "step": 144450 + }, + { + "epoch": 0.9229137651252827, + "grad_norm": 0.8265545964241028, + "learning_rate": 5.606744711717393e-05, + "loss": 0.8917, + "step": 144460 + }, + { + "epoch": 0.9229776522750214, + "grad_norm": 0.5889626741409302, + "learning_rate": 5.6062466482765e-05, + "loss": 0.8798, + "step": 144470 + }, + { + "epoch": 0.9230415394247601, + "grad_norm": 0.7544617652893066, + "learning_rate": 5.6057485787301765e-05, + "loss": 0.9106, + "step": 144480 + }, + { + "epoch": 0.9231054265744988, + "grad_norm": 0.7426086664199829, + "learning_rate": 5.6052505030834425e-05, + "loss": 0.8295, + "step": 144490 + }, + { + "epoch": 0.9231693137242375, + "grad_norm": 0.8017638921737671, + "learning_rate": 5.6047524213413116e-05, + "loss": 0.7987, + "step": 144500 + }, + { + "epoch": 0.9232332008739762, + "grad_norm": 1.0374419689178467, + "learning_rate": 5.604254333508802e-05, + "loss": 0.885, + "step": 144510 + }, + { + "epoch": 0.9232970880237149, + "grad_norm": 1.0828471183776855, + "learning_rate": 5.603756239590926e-05, + "loss": 0.8636, + "step": 144520 + }, + { + "epoch": 0.9233609751734536, + "grad_norm": 0.7223926186561584, + "learning_rate": 5.603258139592704e-05, + "loss": 0.8139, + "step": 144530 + }, + { + "epoch": 0.9234248623231923, + "grad_norm": 0.8874870538711548, + "learning_rate": 5.60276003351915e-05, + "loss": 0.7938, + "step": 144540 + }, + { + "epoch": 0.923488749472931, + "grad_norm": 0.7778692841529846, + "learning_rate": 5.6022619213752816e-05, + "loss": 0.7878, + "step": 144550 + }, + { + "epoch": 0.9235526366226697, + "grad_norm": 0.779015302658081, + "learning_rate": 5.6017638031661144e-05, + "loss": 1.0048, + "step": 144560 + }, + { + "epoch": 0.9236165237724084, + "grad_norm": 0.6786310076713562, + "learning_rate": 5.6012656788966656e-05, + "loss": 0.7291, + "step": 144570 + }, + { + "epoch": 0.9236804109221471, + "grad_norm": 0.8063673377037048, + "learning_rate": 5.6007675485719504e-05, + "loss": 1.0574, + "step": 144580 + }, + { + "epoch": 0.9237442980718858, + "grad_norm": 1.407467007637024, + "learning_rate": 
5.600269412196986e-05, + "loss": 0.9079, + "step": 144590 + }, + { + "epoch": 0.9238081852216246, + "grad_norm": 0.8846006393432617, + "learning_rate": 5.59977126977679e-05, + "loss": 0.9646, + "step": 144600 + }, + { + "epoch": 0.9238720723713633, + "grad_norm": 0.824082612991333, + "learning_rate": 5.5992731213163785e-05, + "loss": 0.9403, + "step": 144610 + }, + { + "epoch": 0.923935959521102, + "grad_norm": 0.7869988083839417, + "learning_rate": 5.598774966820768e-05, + "loss": 0.9048, + "step": 144620 + }, + { + "epoch": 0.9239998466708407, + "grad_norm": 1.9946730136871338, + "learning_rate": 5.5982768062949755e-05, + "loss": 1.2423, + "step": 144630 + }, + { + "epoch": 0.9240637338205794, + "grad_norm": 0.8373878598213196, + "learning_rate": 5.597778639744018e-05, + "loss": 1.0149, + "step": 144640 + }, + { + "epoch": 0.924127620970318, + "grad_norm": 0.6496232151985168, + "learning_rate": 5.5972804671729116e-05, + "loss": 0.8685, + "step": 144650 + }, + { + "epoch": 0.9241915081200567, + "grad_norm": 0.8842912912368774, + "learning_rate": 5.596782288586676e-05, + "loss": 0.9352, + "step": 144660 + }, + { + "epoch": 0.9242553952697954, + "grad_norm": 0.9695751667022705, + "learning_rate": 5.596284103990326e-05, + "loss": 1.1568, + "step": 144670 + }, + { + "epoch": 0.9243192824195341, + "grad_norm": 1.1602424383163452, + "learning_rate": 5.595785913388878e-05, + "loss": 0.8338, + "step": 144680 + }, + { + "epoch": 0.9243831695692728, + "grad_norm": 0.8936864733695984, + "learning_rate": 5.595287716787351e-05, + "loss": 0.915, + "step": 144690 + }, + { + "epoch": 0.9244470567190115, + "grad_norm": 1.0186363458633423, + "learning_rate": 5.5947895141907624e-05, + "loss": 0.9493, + "step": 144700 + }, + { + "epoch": 0.9245109438687502, + "grad_norm": 0.8145350217819214, + "learning_rate": 5.594291305604128e-05, + "loss": 0.8706, + "step": 144710 + }, + { + "epoch": 0.9245748310184889, + "grad_norm": 1.0934886932373047, + "learning_rate": 5.5937930910324666e-05, + "loss": 0.8845, + "step": 144720 + }, + { + "epoch": 0.9246387181682276, + "grad_norm": 1.0088940858840942, + "learning_rate": 5.593294870480794e-05, + "loss": 1.0438, + "step": 144730 + }, + { + "epoch": 0.9247026053179663, + "grad_norm": 0.632199764251709, + "learning_rate": 5.5927966439541304e-05, + "loss": 0.7483, + "step": 144740 + }, + { + "epoch": 0.924766492467705, + "grad_norm": 1.0264745950698853, + "learning_rate": 5.5922984114574904e-05, + "loss": 1.069, + "step": 144750 + }, + { + "epoch": 0.9248303796174437, + "grad_norm": 0.7027975916862488, + "learning_rate": 5.591800172995894e-05, + "loss": 1.0355, + "step": 144760 + }, + { + "epoch": 0.9248942667671824, + "grad_norm": 1.1865230798721313, + "learning_rate": 5.591301928574355e-05, + "loss": 0.7578, + "step": 144770 + }, + { + "epoch": 0.9249581539169212, + "grad_norm": 0.8094105124473572, + "learning_rate": 5.5908036781978966e-05, + "loss": 0.9229, + "step": 144780 + }, + { + "epoch": 0.9250220410666599, + "grad_norm": 1.438416600227356, + "learning_rate": 5.590305421871534e-05, + "loss": 1.0329, + "step": 144790 + }, + { + "epoch": 0.9250859282163986, + "grad_norm": 1.4989200830459595, + "learning_rate": 5.5898071596002855e-05, + "loss": 1.2769, + "step": 144800 + }, + { + "epoch": 0.9251498153661373, + "grad_norm": 1.0811078548431396, + "learning_rate": 5.589308891389168e-05, + "loss": 0.896, + "step": 144810 + }, + { + "epoch": 0.925213702515876, + "grad_norm": 0.8900967836380005, + "learning_rate": 5.5888106172431995e-05, + "loss": 0.8607, + "step": 144820 + 
}, + { + "epoch": 0.9252775896656147, + "grad_norm": 0.7876869440078735, + "learning_rate": 5.5883123371673995e-05, + "loss": 0.9373, + "step": 144830 + }, + { + "epoch": 0.9253414768153534, + "grad_norm": 1.5476784706115723, + "learning_rate": 5.5878140511667855e-05, + "loss": 0.9396, + "step": 144840 + }, + { + "epoch": 0.9254053639650921, + "grad_norm": 0.9162848591804504, + "learning_rate": 5.587315759246376e-05, + "loss": 0.9302, + "step": 144850 + }, + { + "epoch": 0.9254692511148308, + "grad_norm": 1.4244049787521362, + "learning_rate": 5.586817461411188e-05, + "loss": 0.8326, + "step": 144860 + }, + { + "epoch": 0.9255331382645695, + "grad_norm": 1.2231682538986206, + "learning_rate": 5.586319157666241e-05, + "loss": 0.6845, + "step": 144870 + }, + { + "epoch": 0.9255970254143082, + "grad_norm": 0.9394405484199524, + "learning_rate": 5.585820848016552e-05, + "loss": 0.6608, + "step": 144880 + }, + { + "epoch": 0.9256609125640469, + "grad_norm": 3.920628786087036, + "learning_rate": 5.585322532467141e-05, + "loss": 0.9313, + "step": 144890 + }, + { + "epoch": 0.9257247997137855, + "grad_norm": 0.7618585824966431, + "learning_rate": 5.5848242110230245e-05, + "loss": 0.8513, + "step": 144900 + }, + { + "epoch": 0.9257886868635242, + "grad_norm": 1.0813332796096802, + "learning_rate": 5.5843258836892234e-05, + "loss": 0.7353, + "step": 144910 + }, + { + "epoch": 0.9258525740132629, + "grad_norm": 1.2577931880950928, + "learning_rate": 5.583827550470755e-05, + "loss": 0.6659, + "step": 144920 + }, + { + "epoch": 0.9259164611630016, + "grad_norm": 1.2664730548858643, + "learning_rate": 5.583329211372637e-05, + "loss": 0.7771, + "step": 144930 + }, + { + "epoch": 0.9259803483127403, + "grad_norm": 0.9695154428482056, + "learning_rate": 5.582830866399888e-05, + "loss": 1.2447, + "step": 144940 + }, + { + "epoch": 0.926044235462479, + "grad_norm": 1.5105254650115967, + "learning_rate": 5.5823325155575314e-05, + "loss": 0.7378, + "step": 144950 + }, + { + "epoch": 0.9261081226122178, + "grad_norm": 0.6907379627227783, + "learning_rate": 5.5818341588505806e-05, + "loss": 1.249, + "step": 144960 + }, + { + "epoch": 0.9261720097619565, + "grad_norm": 0.8063596487045288, + "learning_rate": 5.581335796284057e-05, + "loss": 0.8475, + "step": 144970 + }, + { + "epoch": 0.9262358969116952, + "grad_norm": 1.5274707078933716, + "learning_rate": 5.5808374278629795e-05, + "loss": 1.2363, + "step": 144980 + }, + { + "epoch": 0.9262997840614339, + "grad_norm": 0.9278043508529663, + "learning_rate": 5.580339053592366e-05, + "loss": 0.9631, + "step": 144990 + }, + { + "epoch": 0.9263636712111726, + "grad_norm": 0.9493452310562134, + "learning_rate": 5.579840673477236e-05, + "loss": 0.8491, + "step": 145000 + }, + { + "epoch": 0.9264275583609113, + "grad_norm": 0.5410763621330261, + "learning_rate": 5.579342287522609e-05, + "loss": 0.9043, + "step": 145010 + }, + { + "epoch": 0.92649144551065, + "grad_norm": 0.8283954858779907, + "learning_rate": 5.578843895733504e-05, + "loss": 0.8748, + "step": 145020 + }, + { + "epoch": 0.9265553326603887, + "grad_norm": 0.9432761073112488, + "learning_rate": 5.57834549811494e-05, + "loss": 1.3108, + "step": 145030 + }, + { + "epoch": 0.9266192198101274, + "grad_norm": 0.6497068405151367, + "learning_rate": 5.5778470946719366e-05, + "loss": 0.8953, + "step": 145040 + }, + { + "epoch": 0.9266831069598661, + "grad_norm": 0.8885396122932434, + "learning_rate": 5.5773486854095134e-05, + "loss": 0.7763, + "step": 145050 + }, + { + "epoch": 0.9267469941096048, + "grad_norm": 
0.7947267293930054, + "learning_rate": 5.576850270332689e-05, + "loss": 0.6814, + "step": 145060 + }, + { + "epoch": 0.9268108812593435, + "grad_norm": 0.9257674813270569, + "learning_rate": 5.576351849446484e-05, + "loss": 0.8433, + "step": 145070 + }, + { + "epoch": 0.9268747684090822, + "grad_norm": 0.6727604866027832, + "learning_rate": 5.575853422755917e-05, + "loss": 0.8165, + "step": 145080 + }, + { + "epoch": 0.9269386555588209, + "grad_norm": 0.8554814457893372, + "learning_rate": 5.5753549902660076e-05, + "loss": 0.9877, + "step": 145090 + }, + { + "epoch": 0.9270025427085596, + "grad_norm": 1.5152359008789062, + "learning_rate": 5.574856551981775e-05, + "loss": 0.9839, + "step": 145100 + }, + { + "epoch": 0.9270664298582983, + "grad_norm": 1.0759176015853882, + "learning_rate": 5.5743581079082405e-05, + "loss": 0.6801, + "step": 145110 + }, + { + "epoch": 0.927130317008037, + "grad_norm": 1.2060096263885498, + "learning_rate": 5.573859658050423e-05, + "loss": 0.7048, + "step": 145120 + }, + { + "epoch": 0.9271942041577758, + "grad_norm": 2.348177671432495, + "learning_rate": 5.5733612024133416e-05, + "loss": 1.1837, + "step": 145130 + }, + { + "epoch": 0.9272580913075144, + "grad_norm": 0.8047994375228882, + "learning_rate": 5.572862741002017e-05, + "loss": 0.7672, + "step": 145140 + }, + { + "epoch": 0.9273219784572531, + "grad_norm": 1.255176305770874, + "learning_rate": 5.57236427382147e-05, + "loss": 0.7974, + "step": 145150 + }, + { + "epoch": 0.9273858656069918, + "grad_norm": 0.7443729639053345, + "learning_rate": 5.571865800876719e-05, + "loss": 1.0677, + "step": 145160 + }, + { + "epoch": 0.9274497527567305, + "grad_norm": 0.9563167691230774, + "learning_rate": 5.571367322172785e-05, + "loss": 0.7602, + "step": 145170 + }, + { + "epoch": 0.9275136399064692, + "grad_norm": 1.492719292640686, + "learning_rate": 5.5708688377146866e-05, + "loss": 0.8196, + "step": 145180 + }, + { + "epoch": 0.9275775270562079, + "grad_norm": 1.4552744626998901, + "learning_rate": 5.570370347507446e-05, + "loss": 0.7228, + "step": 145190 + }, + { + "epoch": 0.9276414142059466, + "grad_norm": 1.105858564376831, + "learning_rate": 5.569871851556082e-05, + "loss": 0.9919, + "step": 145200 + }, + { + "epoch": 0.9277053013556853, + "grad_norm": 0.6973342299461365, + "learning_rate": 5.5693733498656165e-05, + "loss": 0.6737, + "step": 145210 + }, + { + "epoch": 0.927769188505424, + "grad_norm": 0.8317189812660217, + "learning_rate": 5.5688748424410675e-05, + "loss": 0.7846, + "step": 145220 + }, + { + "epoch": 0.9278330756551627, + "grad_norm": 1.7505478858947754, + "learning_rate": 5.568376329287458e-05, + "loss": 0.8123, + "step": 145230 + }, + { + "epoch": 0.9278969628049014, + "grad_norm": 1.1510494947433472, + "learning_rate": 5.567877810409806e-05, + "loss": 0.9568, + "step": 145240 + }, + { + "epoch": 0.9279608499546401, + "grad_norm": 0.599578857421875, + "learning_rate": 5.567379285813135e-05, + "loss": 1.1179, + "step": 145250 + }, + { + "epoch": 0.9280247371043788, + "grad_norm": 0.6046319007873535, + "learning_rate": 5.566880755502462e-05, + "loss": 0.9872, + "step": 145260 + }, + { + "epoch": 0.9280886242541175, + "grad_norm": 0.9385390281677246, + "learning_rate": 5.5663822194828095e-05, + "loss": 0.6484, + "step": 145270 + }, + { + "epoch": 0.9281525114038562, + "grad_norm": 1.1661081314086914, + "learning_rate": 5.565883677759198e-05, + "loss": 0.7681, + "step": 145280 + }, + { + "epoch": 0.928216398553595, + "grad_norm": 0.9337068200111389, + "learning_rate": 
5.565385130336649e-05, + "loss": 1.2074, + "step": 145290 + }, + { + "epoch": 0.9282802857033337, + "grad_norm": 0.7268047332763672, + "learning_rate": 5.564886577220181e-05, + "loss": 1.2321, + "step": 145300 + }, + { + "epoch": 0.9283441728530724, + "grad_norm": 0.8471580743789673, + "learning_rate": 5.564388018414818e-05, + "loss": 0.7306, + "step": 145310 + }, + { + "epoch": 0.9284080600028111, + "grad_norm": 1.361351728439331, + "learning_rate": 5.563889453925579e-05, + "loss": 0.8582, + "step": 145320 + }, + { + "epoch": 0.9284719471525498, + "grad_norm": 0.8351898789405823, + "learning_rate": 5.563390883757485e-05, + "loss": 0.9428, + "step": 145330 + }, + { + "epoch": 0.9285358343022885, + "grad_norm": 0.5664870142936707, + "learning_rate": 5.562892307915559e-05, + "loss": 0.8726, + "step": 145340 + }, + { + "epoch": 0.9285997214520272, + "grad_norm": 0.9608287215232849, + "learning_rate": 5.56239372640482e-05, + "loss": 0.8297, + "step": 145350 + }, + { + "epoch": 0.9286636086017659, + "grad_norm": 0.5610537528991699, + "learning_rate": 5.5618951392302886e-05, + "loss": 1.07, + "step": 145360 + }, + { + "epoch": 0.9287274957515046, + "grad_norm": 1.0210973024368286, + "learning_rate": 5.561396546396988e-05, + "loss": 0.8373, + "step": 145370 + }, + { + "epoch": 0.9287913829012432, + "grad_norm": 0.8455613255500793, + "learning_rate": 5.560897947909938e-05, + "loss": 0.7514, + "step": 145380 + }, + { + "epoch": 0.9288552700509819, + "grad_norm": 0.7670028805732727, + "learning_rate": 5.56039934377416e-05, + "loss": 0.7576, + "step": 145390 + }, + { + "epoch": 0.9289191572007206, + "grad_norm": 0.9094594717025757, + "learning_rate": 5.559900733994676e-05, + "loss": 0.9407, + "step": 145400 + }, + { + "epoch": 0.9289830443504593, + "grad_norm": 0.8494235873222351, + "learning_rate": 5.559402118576508e-05, + "loss": 0.7876, + "step": 145410 + }, + { + "epoch": 0.929046931500198, + "grad_norm": 1.263748049736023, + "learning_rate": 5.558903497524676e-05, + "loss": 0.9246, + "step": 145420 + }, + { + "epoch": 0.9291108186499367, + "grad_norm": 0.7942246794700623, + "learning_rate": 5.558404870844201e-05, + "loss": 0.8941, + "step": 145430 + }, + { + "epoch": 0.9291747057996754, + "grad_norm": 1.4969481229782104, + "learning_rate": 5.557906238540108e-05, + "loss": 1.0584, + "step": 145440 + }, + { + "epoch": 0.9292385929494141, + "grad_norm": 1.607496738433838, + "learning_rate": 5.557407600617416e-05, + "loss": 1.0476, + "step": 145450 + }, + { + "epoch": 0.9293024800991528, + "grad_norm": 0.9495954513549805, + "learning_rate": 5.5569089570811464e-05, + "loss": 1.1759, + "step": 145460 + }, + { + "epoch": 0.9293663672488915, + "grad_norm": 1.1366685628890991, + "learning_rate": 5.556410307936322e-05, + "loss": 1.1819, + "step": 145470 + }, + { + "epoch": 0.9294302543986303, + "grad_norm": 1.110145926475525, + "learning_rate": 5.555911653187964e-05, + "loss": 0.8672, + "step": 145480 + }, + { + "epoch": 0.929494141548369, + "grad_norm": 0.8252993822097778, + "learning_rate": 5.5554129928410957e-05, + "loss": 0.7987, + "step": 145490 + }, + { + "epoch": 0.9295580286981077, + "grad_norm": 1.1678286790847778, + "learning_rate": 5.554914326900739e-05, + "loss": 0.8637, + "step": 145500 + }, + { + "epoch": 0.9296219158478464, + "grad_norm": 2.4226267337799072, + "learning_rate": 5.554415655371913e-05, + "loss": 1.0453, + "step": 145510 + }, + { + "epoch": 0.9296858029975851, + "grad_norm": 1.283092975616455, + "learning_rate": 5.553916978259642e-05, + "loss": 0.7881, + "step": 145520 + }, + { 
+ "epoch": 0.9297496901473238, + "grad_norm": 1.1610386371612549, + "learning_rate": 5.55341829556895e-05, + "loss": 0.7663, + "step": 145530 + }, + { + "epoch": 0.9298135772970625, + "grad_norm": 1.2389222383499146, + "learning_rate": 5.552919607304854e-05, + "loss": 0.88, + "step": 145540 + }, + { + "epoch": 0.9298774644468012, + "grad_norm": 0.8321581482887268, + "learning_rate": 5.552420913472381e-05, + "loss": 0.7218, + "step": 145550 + }, + { + "epoch": 0.9299413515965399, + "grad_norm": 1.4073259830474854, + "learning_rate": 5.5519222140765514e-05, + "loss": 0.9816, + "step": 145560 + }, + { + "epoch": 0.9300052387462786, + "grad_norm": 0.8335140347480774, + "learning_rate": 5.5514235091223877e-05, + "loss": 1.096, + "step": 145570 + }, + { + "epoch": 0.9300691258960173, + "grad_norm": 1.2483246326446533, + "learning_rate": 5.5509247986149126e-05, + "loss": 0.8831, + "step": 145580 + }, + { + "epoch": 0.930133013045756, + "grad_norm": 0.7681006193161011, + "learning_rate": 5.550426082559147e-05, + "loss": 0.7553, + "step": 145590 + }, + { + "epoch": 0.9301969001954947, + "grad_norm": 0.8035659193992615, + "learning_rate": 5.5499273609601154e-05, + "loss": 0.8508, + "step": 145600 + }, + { + "epoch": 0.9302607873452334, + "grad_norm": 1.2194147109985352, + "learning_rate": 5.5494286338228384e-05, + "loss": 0.9484, + "step": 145610 + }, + { + "epoch": 0.930324674494972, + "grad_norm": 1.6273916959762573, + "learning_rate": 5.54892990115234e-05, + "loss": 0.8937, + "step": 145620 + }, + { + "epoch": 0.9303885616447107, + "grad_norm": 0.48322996497154236, + "learning_rate": 5.5484311629536425e-05, + "loss": 0.6711, + "step": 145630 + }, + { + "epoch": 0.9304524487944494, + "grad_norm": 0.7854679822921753, + "learning_rate": 5.5479324192317694e-05, + "loss": 0.8923, + "step": 145640 + }, + { + "epoch": 0.9305163359441881, + "grad_norm": 0.9290236830711365, + "learning_rate": 5.547433669991743e-05, + "loss": 0.8368, + "step": 145650 + }, + { + "epoch": 0.9305802230939269, + "grad_norm": 1.1597179174423218, + "learning_rate": 5.546934915238585e-05, + "loss": 0.7749, + "step": 145660 + }, + { + "epoch": 0.9306441102436656, + "grad_norm": 1.3749240636825562, + "learning_rate": 5.54643615497732e-05, + "loss": 0.9082, + "step": 145670 + }, + { + "epoch": 0.9307079973934043, + "grad_norm": 0.7642197608947754, + "learning_rate": 5.54593738921297e-05, + "loss": 1.0149, + "step": 145680 + }, + { + "epoch": 0.930771884543143, + "grad_norm": 0.974104642868042, + "learning_rate": 5.545438617950558e-05, + "loss": 0.635, + "step": 145690 + }, + { + "epoch": 0.9308357716928817, + "grad_norm": 0.8045002818107605, + "learning_rate": 5.544939841195108e-05, + "loss": 0.7151, + "step": 145700 + }, + { + "epoch": 0.9308996588426204, + "grad_norm": 1.036468267440796, + "learning_rate": 5.544441058951641e-05, + "loss": 0.9897, + "step": 145710 + }, + { + "epoch": 0.9309635459923591, + "grad_norm": 0.5775062441825867, + "learning_rate": 5.5439422712251835e-05, + "loss": 0.7542, + "step": 145720 + }, + { + "epoch": 0.9310274331420978, + "grad_norm": 0.88588947057724, + "learning_rate": 5.543443478020754e-05, + "loss": 0.7627, + "step": 145730 + }, + { + "epoch": 0.9310913202918365, + "grad_norm": 1.2807823419570923, + "learning_rate": 5.5429446793433814e-05, + "loss": 1.1104, + "step": 145740 + }, + { + "epoch": 0.9311552074415752, + "grad_norm": 1.0863866806030273, + "learning_rate": 5.5424458751980844e-05, + "loss": 0.6739, + "step": 145750 + }, + { + "epoch": 0.9312190945913139, + "grad_norm": 
1.238873839378357, + "learning_rate": 5.5419470655898883e-05, + "loss": 0.9149, + "step": 145760 + }, + { + "epoch": 0.9312829817410526, + "grad_norm": 0.8662456274032593, + "learning_rate": 5.541448250523817e-05, + "loss": 0.8491, + "step": 145770 + }, + { + "epoch": 0.9313468688907913, + "grad_norm": 1.59992253780365, + "learning_rate": 5.5409494300048935e-05, + "loss": 1.2597, + "step": 145780 + }, + { + "epoch": 0.93141075604053, + "grad_norm": 0.7041088342666626, + "learning_rate": 5.540450604038141e-05, + "loss": 0.8117, + "step": 145790 + }, + { + "epoch": 0.9314746431902687, + "grad_norm": 0.9120072722434998, + "learning_rate": 5.539951772628583e-05, + "loss": 0.8138, + "step": 145800 + }, + { + "epoch": 0.9315385303400074, + "grad_norm": 0.865118682384491, + "learning_rate": 5.539452935781244e-05, + "loss": 0.7262, + "step": 145810 + }, + { + "epoch": 0.9316024174897461, + "grad_norm": 0.5491125583648682, + "learning_rate": 5.5389540935011466e-05, + "loss": 0.7411, + "step": 145820 + }, + { + "epoch": 0.9316663046394849, + "grad_norm": 0.8732421398162842, + "learning_rate": 5.538455245793316e-05, + "loss": 0.8556, + "step": 145830 + }, + { + "epoch": 0.9317301917892236, + "grad_norm": 0.7927635312080383, + "learning_rate": 5.5379563926627745e-05, + "loss": 1.0176, + "step": 145840 + }, + { + "epoch": 0.9317940789389623, + "grad_norm": 1.8838095664978027, + "learning_rate": 5.5374575341145476e-05, + "loss": 0.8793, + "step": 145850 + }, + { + "epoch": 0.931857966088701, + "grad_norm": 0.998735785484314, + "learning_rate": 5.536958670153658e-05, + "loss": 1.2281, + "step": 145860 + }, + { + "epoch": 0.9319218532384396, + "grad_norm": 0.7903896570205688, + "learning_rate": 5.53645980078513e-05, + "loss": 1.2174, + "step": 145870 + }, + { + "epoch": 0.9319857403881783, + "grad_norm": 0.8727949857711792, + "learning_rate": 5.535960926013987e-05, + "loss": 1.0978, + "step": 145880 + }, + { + "epoch": 0.932049627537917, + "grad_norm": 1.2408875226974487, + "learning_rate": 5.5354620458452546e-05, + "loss": 0.937, + "step": 145890 + }, + { + "epoch": 0.9321135146876557, + "grad_norm": 0.8686769604682922, + "learning_rate": 5.5349631602839557e-05, + "loss": 0.7722, + "step": 145900 + }, + { + "epoch": 0.9321774018373944, + "grad_norm": 0.914225697517395, + "learning_rate": 5.534464269335116e-05, + "loss": 0.7863, + "step": 145910 + }, + { + "epoch": 0.9322412889871331, + "grad_norm": 1.2484056949615479, + "learning_rate": 5.533965373003758e-05, + "loss": 0.8891, + "step": 145920 + }, + { + "epoch": 0.9323051761368718, + "grad_norm": 0.7330338358879089, + "learning_rate": 5.533466471294906e-05, + "loss": 0.9297, + "step": 145930 + }, + { + "epoch": 0.9323690632866105, + "grad_norm": 1.5863157510757446, + "learning_rate": 5.532967564213586e-05, + "loss": 0.926, + "step": 145940 + }, + { + "epoch": 0.9324329504363492, + "grad_norm": 1.0102980136871338, + "learning_rate": 5.532468651764822e-05, + "loss": 0.9029, + "step": 145950 + }, + { + "epoch": 0.9324968375860879, + "grad_norm": 1.0613675117492676, + "learning_rate": 5.531969733953637e-05, + "loss": 0.8485, + "step": 145960 + }, + { + "epoch": 0.9325607247358266, + "grad_norm": 0.8373472690582275, + "learning_rate": 5.531470810785057e-05, + "loss": 0.8047, + "step": 145970 + }, + { + "epoch": 0.9326246118855653, + "grad_norm": 3.7152907848358154, + "learning_rate": 5.5309718822641054e-05, + "loss": 0.8467, + "step": 145980 + }, + { + "epoch": 0.932688499035304, + "grad_norm": 1.0984721183776855, + "learning_rate": 5.5304729483958073e-05, + 
"loss": 0.6046, + "step": 145990 + }, + { + "epoch": 0.9327523861850427, + "grad_norm": 0.7963919043540955, + "learning_rate": 5.529974009185189e-05, + "loss": 0.9892, + "step": 146000 + }, + { + "epoch": 0.9328162733347815, + "grad_norm": 1.0803087949752808, + "learning_rate": 5.529475064637274e-05, + "loss": 0.965, + "step": 146010 + }, + { + "epoch": 0.9328801604845202, + "grad_norm": 0.7040061950683594, + "learning_rate": 5.528976114757086e-05, + "loss": 0.9054, + "step": 146020 + }, + { + "epoch": 0.9329440476342589, + "grad_norm": 1.2263482809066772, + "learning_rate": 5.528477159549652e-05, + "loss": 0.6155, + "step": 146030 + }, + { + "epoch": 0.9330079347839976, + "grad_norm": 0.532351016998291, + "learning_rate": 5.5279781990199954e-05, + "loss": 0.8038, + "step": 146040 + }, + { + "epoch": 0.9330718219337363, + "grad_norm": 0.8017789125442505, + "learning_rate": 5.527479233173142e-05, + "loss": 0.8907, + "step": 146050 + }, + { + "epoch": 0.933135709083475, + "grad_norm": 3.3494420051574707, + "learning_rate": 5.5269802620141155e-05, + "loss": 1.1866, + "step": 146060 + }, + { + "epoch": 0.9331995962332137, + "grad_norm": 0.9478211402893066, + "learning_rate": 5.526481285547943e-05, + "loss": 0.8094, + "step": 146070 + }, + { + "epoch": 0.9332634833829524, + "grad_norm": 0.8863970637321472, + "learning_rate": 5.525982303779648e-05, + "loss": 0.7409, + "step": 146080 + }, + { + "epoch": 0.9333273705326911, + "grad_norm": 1.6237622499465942, + "learning_rate": 5.525483316714256e-05, + "loss": 0.9142, + "step": 146090 + }, + { + "epoch": 0.9333912576824298, + "grad_norm": 0.8216589689254761, + "learning_rate": 5.524984324356792e-05, + "loss": 0.7625, + "step": 146100 + }, + { + "epoch": 0.9334551448321684, + "grad_norm": 2.1370699405670166, + "learning_rate": 5.524485326712282e-05, + "loss": 0.9642, + "step": 146110 + }, + { + "epoch": 0.9335190319819071, + "grad_norm": 1.2563636302947998, + "learning_rate": 5.5239863237857516e-05, + "loss": 1.0334, + "step": 146120 + }, + { + "epoch": 0.9335829191316458, + "grad_norm": 0.9968759417533875, + "learning_rate": 5.5234873155822256e-05, + "loss": 1.0448, + "step": 146130 + }, + { + "epoch": 0.9336468062813845, + "grad_norm": 1.1890705823898315, + "learning_rate": 5.5229883021067286e-05, + "loss": 0.9123, + "step": 146140 + }, + { + "epoch": 0.9337106934311232, + "grad_norm": 0.9829585552215576, + "learning_rate": 5.522489283364286e-05, + "loss": 0.8136, + "step": 146150 + }, + { + "epoch": 0.9337745805808619, + "grad_norm": 1.1096529960632324, + "learning_rate": 5.521990259359925e-05, + "loss": 1.0181, + "step": 146160 + }, + { + "epoch": 0.9338384677306006, + "grad_norm": 0.9074599146842957, + "learning_rate": 5.521491230098671e-05, + "loss": 0.8634, + "step": 146170 + }, + { + "epoch": 0.9339023548803393, + "grad_norm": 0.8308917284011841, + "learning_rate": 5.520992195585549e-05, + "loss": 0.7422, + "step": 146180 + }, + { + "epoch": 0.933966242030078, + "grad_norm": 0.6945448517799377, + "learning_rate": 5.5204931558255857e-05, + "loss": 0.7908, + "step": 146190 + }, + { + "epoch": 0.9340301291798168, + "grad_norm": 1.085972547531128, + "learning_rate": 5.519994110823805e-05, + "loss": 1.0072, + "step": 146200 + }, + { + "epoch": 0.9340940163295555, + "grad_norm": 1.0287305116653442, + "learning_rate": 5.519495060585235e-05, + "loss": 1.133, + "step": 146210 + }, + { + "epoch": 0.9341579034792942, + "grad_norm": 0.679003894329071, + "learning_rate": 5.5189960051148995e-05, + "loss": 0.726, + "step": 146220 + }, + { + "epoch": 
0.9342217906290329, + "grad_norm": 1.126184105873108, + "learning_rate": 5.5184969444178246e-05, + "loss": 1.1845, + "step": 146230 + }, + { + "epoch": 0.9342856777787716, + "grad_norm": 0.5712942481040955, + "learning_rate": 5.517997878499037e-05, + "loss": 0.7351, + "step": 146240 + }, + { + "epoch": 0.9343495649285103, + "grad_norm": 0.9215618371963501, + "learning_rate": 5.517498807363564e-05, + "loss": 0.9922, + "step": 146250 + }, + { + "epoch": 0.934413452078249, + "grad_norm": 1.0021543502807617, + "learning_rate": 5.516999731016429e-05, + "loss": 0.9386, + "step": 146260 + }, + { + "epoch": 0.9344773392279877, + "grad_norm": 0.5854945778846741, + "learning_rate": 5.516500649462659e-05, + "loss": 0.6703, + "step": 146270 + }, + { + "epoch": 0.9345412263777264, + "grad_norm": 0.7984985709190369, + "learning_rate": 5.5160015627072824e-05, + "loss": 0.8576, + "step": 146280 + }, + { + "epoch": 0.9346051135274651, + "grad_norm": 2.2100253105163574, + "learning_rate": 5.5155024707553226e-05, + "loss": 1.2271, + "step": 146290 + }, + { + "epoch": 0.9346690006772038, + "grad_norm": 0.6737677454948425, + "learning_rate": 5.5150033736118065e-05, + "loss": 0.617, + "step": 146300 + }, + { + "epoch": 0.9347328878269425, + "grad_norm": 0.48655664920806885, + "learning_rate": 5.514504271281762e-05, + "loss": 0.6696, + "step": 146310 + }, + { + "epoch": 0.9347967749766812, + "grad_norm": 2.0812628269195557, + "learning_rate": 5.514005163770214e-05, + "loss": 1.1076, + "step": 146320 + }, + { + "epoch": 0.9348606621264199, + "grad_norm": 0.6728872060775757, + "learning_rate": 5.513506051082189e-05, + "loss": 0.9832, + "step": 146330 + }, + { + "epoch": 0.9349245492761586, + "grad_norm": 1.2701008319854736, + "learning_rate": 5.513006933222714e-05, + "loss": 0.9593, + "step": 146340 + }, + { + "epoch": 0.9349884364258972, + "grad_norm": 1.0993520021438599, + "learning_rate": 5.5125078101968155e-05, + "loss": 0.846, + "step": 146350 + }, + { + "epoch": 0.935052323575636, + "grad_norm": 0.947177529335022, + "learning_rate": 5.5120086820095195e-05, + "loss": 0.9143, + "step": 146360 + }, + { + "epoch": 0.9351162107253747, + "grad_norm": 0.9848998785018921, + "learning_rate": 5.511509548665854e-05, + "loss": 0.8256, + "step": 146370 + }, + { + "epoch": 0.9351800978751134, + "grad_norm": 1.219247579574585, + "learning_rate": 5.511010410170844e-05, + "loss": 0.7707, + "step": 146380 + }, + { + "epoch": 0.9352439850248521, + "grad_norm": 0.7544047832489014, + "learning_rate": 5.510511266529518e-05, + "loss": 0.9864, + "step": 146390 + }, + { + "epoch": 0.9353078721745908, + "grad_norm": 0.7642074227333069, + "learning_rate": 5.510012117746901e-05, + "loss": 0.7728, + "step": 146400 + }, + { + "epoch": 0.9353717593243295, + "grad_norm": 0.9915320873260498, + "learning_rate": 5.509512963828021e-05, + "loss": 1.2914, + "step": 146410 + }, + { + "epoch": 0.9354356464740682, + "grad_norm": 0.9371116757392883, + "learning_rate": 5.509013804777904e-05, + "loss": 0.8414, + "step": 146420 + }, + { + "epoch": 0.9354995336238069, + "grad_norm": 0.4904581904411316, + "learning_rate": 5.508514640601579e-05, + "loss": 0.9459, + "step": 146430 + }, + { + "epoch": 0.9355634207735456, + "grad_norm": 0.9960023760795593, + "learning_rate": 5.508015471304071e-05, + "loss": 0.8509, + "step": 146440 + }, + { + "epoch": 0.9356273079232843, + "grad_norm": 0.8758112788200378, + "learning_rate": 5.507516296890407e-05, + "loss": 0.8133, + "step": 146450 + }, + { + "epoch": 0.935691195073023, + "grad_norm": 0.8281605243682861, + 
"learning_rate": 5.507017117365616e-05, + "loss": 0.9745, + "step": 146460 + }, + { + "epoch": 0.9357550822227617, + "grad_norm": 0.9094333052635193, + "learning_rate": 5.5065179327347224e-05, + "loss": 0.9971, + "step": 146470 + }, + { + "epoch": 0.9358189693725004, + "grad_norm": 1.2272045612335205, + "learning_rate": 5.5060187430027565e-05, + "loss": 0.8394, + "step": 146480 + }, + { + "epoch": 0.9358828565222391, + "grad_norm": 1.1439415216445923, + "learning_rate": 5.505519548174745e-05, + "loss": 0.9516, + "step": 146490 + }, + { + "epoch": 0.9359467436719778, + "grad_norm": 0.7094035744667053, + "learning_rate": 5.5050203482557115e-05, + "loss": 1.0136, + "step": 146500 + }, + { + "epoch": 0.9360106308217165, + "grad_norm": 0.6839352250099182, + "learning_rate": 5.5045211432506884e-05, + "loss": 0.7337, + "step": 146510 + }, + { + "epoch": 0.9360745179714552, + "grad_norm": 2.7014598846435547, + "learning_rate": 5.504021933164699e-05, + "loss": 1.0982, + "step": 146520 + }, + { + "epoch": 0.936138405121194, + "grad_norm": 1.0742672681808472, + "learning_rate": 5.503522718002774e-05, + "loss": 0.9552, + "step": 146530 + }, + { + "epoch": 0.9362022922709327, + "grad_norm": 0.7901598811149597, + "learning_rate": 5.5030234977699394e-05, + "loss": 0.8024, + "step": 146540 + }, + { + "epoch": 0.9362661794206714, + "grad_norm": 0.9857130646705627, + "learning_rate": 5.502524272471223e-05, + "loss": 1.0354, + "step": 146550 + }, + { + "epoch": 0.9363300665704101, + "grad_norm": 0.9056035280227661, + "learning_rate": 5.502025042111654e-05, + "loss": 0.9316, + "step": 146560 + }, + { + "epoch": 0.9363939537201488, + "grad_norm": 0.8060906529426575, + "learning_rate": 5.501525806696257e-05, + "loss": 0.9312, + "step": 146570 + }, + { + "epoch": 0.9364578408698875, + "grad_norm": 1.0485197305679321, + "learning_rate": 5.5010265662300606e-05, + "loss": 0.8028, + "step": 146580 + }, + { + "epoch": 0.9365217280196262, + "grad_norm": 0.9534648656845093, + "learning_rate": 5.500527320718094e-05, + "loss": 0.6943, + "step": 146590 + }, + { + "epoch": 0.9365856151693648, + "grad_norm": 0.6805192828178406, + "learning_rate": 5.500028070165385e-05, + "loss": 0.8013, + "step": 146600 + }, + { + "epoch": 0.9366495023191035, + "grad_norm": 0.996015191078186, + "learning_rate": 5.49952881457696e-05, + "loss": 0.7835, + "step": 146610 + }, + { + "epoch": 0.9367133894688422, + "grad_norm": 1.0089763402938843, + "learning_rate": 5.4990295539578474e-05, + "loss": 0.7471, + "step": 146620 + }, + { + "epoch": 0.9367772766185809, + "grad_norm": 1.1047331094741821, + "learning_rate": 5.498530288313075e-05, + "loss": 0.8188, + "step": 146630 + }, + { + "epoch": 0.9368411637683196, + "grad_norm": 1.0767524242401123, + "learning_rate": 5.4980310176476726e-05, + "loss": 0.7166, + "step": 146640 + }, + { + "epoch": 0.9369050509180583, + "grad_norm": 1.266998291015625, + "learning_rate": 5.497531741966666e-05, + "loss": 0.8517, + "step": 146650 + }, + { + "epoch": 0.936968938067797, + "grad_norm": 0.7340264320373535, + "learning_rate": 5.497032461275085e-05, + "loss": 0.8263, + "step": 146660 + }, + { + "epoch": 0.9370328252175357, + "grad_norm": 0.8520843386650085, + "learning_rate": 5.496533175577957e-05, + "loss": 1.0392, + "step": 146670 + }, + { + "epoch": 0.9370967123672744, + "grad_norm": 1.1523327827453613, + "learning_rate": 5.4960338848803084e-05, + "loss": 0.8068, + "step": 146680 + }, + { + "epoch": 0.9371605995170131, + "grad_norm": 1.1300572156906128, + "learning_rate": 5.4955345891871716e-05, + "loss": 
0.7149, + "step": 146690 + }, + { + "epoch": 0.9372244866667518, + "grad_norm": 1.2153987884521484, + "learning_rate": 5.495035288503573e-05, + "loss": 0.8757, + "step": 146700 + }, + { + "epoch": 0.9372883738164905, + "grad_norm": 0.7240068912506104, + "learning_rate": 5.49453598283454e-05, + "loss": 0.7754, + "step": 146710 + }, + { + "epoch": 0.9373522609662293, + "grad_norm": 0.8426817059516907, + "learning_rate": 5.494036672185102e-05, + "loss": 0.8532, + "step": 146720 + }, + { + "epoch": 0.937416148115968, + "grad_norm": 0.8055009245872498, + "learning_rate": 5.4935373565602864e-05, + "loss": 0.8759, + "step": 146730 + }, + { + "epoch": 0.9374800352657067, + "grad_norm": 1.1292035579681396, + "learning_rate": 5.4930380359651244e-05, + "loss": 0.9746, + "step": 146740 + }, + { + "epoch": 0.9375439224154454, + "grad_norm": 0.588070809841156, + "learning_rate": 5.492538710404642e-05, + "loss": 0.8524, + "step": 146750 + }, + { + "epoch": 0.9376078095651841, + "grad_norm": 0.8825819492340088, + "learning_rate": 5.492039379883869e-05, + "loss": 0.8543, + "step": 146760 + }, + { + "epoch": 0.9376716967149228, + "grad_norm": 0.8504408001899719, + "learning_rate": 5.491540044407833e-05, + "loss": 0.8576, + "step": 146770 + }, + { + "epoch": 0.9377355838646615, + "grad_norm": 0.845874011516571, + "learning_rate": 5.491040703981564e-05, + "loss": 0.6989, + "step": 146780 + }, + { + "epoch": 0.9377994710144002, + "grad_norm": 0.8666792511940002, + "learning_rate": 5.4905413586100904e-05, + "loss": 0.8939, + "step": 146790 + }, + { + "epoch": 0.9378633581641389, + "grad_norm": 1.1524525880813599, + "learning_rate": 5.4900420082984416e-05, + "loss": 1.1092, + "step": 146800 + }, + { + "epoch": 0.9379272453138776, + "grad_norm": 0.5927907824516296, + "learning_rate": 5.489542653051646e-05, + "loss": 0.6445, + "step": 146810 + }, + { + "epoch": 0.9379911324636163, + "grad_norm": 0.8610501885414124, + "learning_rate": 5.4890432928747306e-05, + "loss": 0.5961, + "step": 146820 + }, + { + "epoch": 0.938055019613355, + "grad_norm": 0.6616340279579163, + "learning_rate": 5.488543927772727e-05, + "loss": 0.952, + "step": 146830 + }, + { + "epoch": 0.9381189067630936, + "grad_norm": 0.5525166988372803, + "learning_rate": 5.488044557750662e-05, + "loss": 1.0917, + "step": 146840 + }, + { + "epoch": 0.9381827939128323, + "grad_norm": 1.3952975273132324, + "learning_rate": 5.487545182813568e-05, + "loss": 0.979, + "step": 146850 + }, + { + "epoch": 0.938246681062571, + "grad_norm": 0.9401289224624634, + "learning_rate": 5.4870458029664714e-05, + "loss": 0.8715, + "step": 146860 + }, + { + "epoch": 0.9383105682123097, + "grad_norm": 1.1923482418060303, + "learning_rate": 5.486546418214402e-05, + "loss": 1.0171, + "step": 146870 + }, + { + "epoch": 0.9383744553620484, + "grad_norm": 1.33669912815094, + "learning_rate": 5.486047028562391e-05, + "loss": 0.7589, + "step": 146880 + }, + { + "epoch": 0.9384383425117871, + "grad_norm": 0.8979326486587524, + "learning_rate": 5.4855476340154647e-05, + "loss": 0.6521, + "step": 146890 + }, + { + "epoch": 0.9385022296615259, + "grad_norm": 0.5857362747192383, + "learning_rate": 5.4850482345786534e-05, + "loss": 0.6615, + "step": 146900 + }, + { + "epoch": 0.9385661168112646, + "grad_norm": 1.1938517093658447, + "learning_rate": 5.484548830256987e-05, + "loss": 0.7288, + "step": 146910 + }, + { + "epoch": 0.9386300039610033, + "grad_norm": 0.7541248202323914, + "learning_rate": 5.484049421055495e-05, + "loss": 0.613, + "step": 146920 + }, + { + "epoch": 
0.938693891110742, + "grad_norm": 1.009811282157898, + "learning_rate": 5.483550006979206e-05, + "loss": 0.7539, + "step": 146930 + }, + { + "epoch": 0.9387577782604807, + "grad_norm": 1.518933653831482, + "learning_rate": 5.4830505880331496e-05, + "loss": 0.7572, + "step": 146940 + }, + { + "epoch": 0.9388216654102194, + "grad_norm": 1.1620988845825195, + "learning_rate": 5.482551164222357e-05, + "loss": 0.8322, + "step": 146950 + }, + { + "epoch": 0.9388855525599581, + "grad_norm": 0.6771840453147888, + "learning_rate": 5.482051735551856e-05, + "loss": 0.8801, + "step": 146960 + }, + { + "epoch": 0.9389494397096968, + "grad_norm": 1.18392813205719, + "learning_rate": 5.481552302026678e-05, + "loss": 0.6552, + "step": 146970 + }, + { + "epoch": 0.9390133268594355, + "grad_norm": 0.7388846278190613, + "learning_rate": 5.481052863651851e-05, + "loss": 0.8432, + "step": 146980 + }, + { + "epoch": 0.9390772140091742, + "grad_norm": 0.739513099193573, + "learning_rate": 5.480553420432405e-05, + "loss": 0.732, + "step": 146990 + }, + { + "epoch": 0.9391411011589129, + "grad_norm": 1.6718555688858032, + "learning_rate": 5.4800539723733714e-05, + "loss": 0.7453, + "step": 147000 + }, + { + "epoch": 0.9392049883086516, + "grad_norm": 1.3321524858474731, + "learning_rate": 5.479554519479778e-05, + "loss": 0.9719, + "step": 147010 + }, + { + "epoch": 0.9392688754583903, + "grad_norm": 0.7319386601448059, + "learning_rate": 5.479055061756656e-05, + "loss": 0.7486, + "step": 147020 + }, + { + "epoch": 0.939332762608129, + "grad_norm": 1.211982250213623, + "learning_rate": 5.478555599209035e-05, + "loss": 0.8766, + "step": 147030 + }, + { + "epoch": 0.9393966497578677, + "grad_norm": 1.0847975015640259, + "learning_rate": 5.478056131841947e-05, + "loss": 0.9276, + "step": 147040 + }, + { + "epoch": 0.9394605369076064, + "grad_norm": 1.0367181301116943, + "learning_rate": 5.477556659660418e-05, + "loss": 0.7712, + "step": 147050 + }, + { + "epoch": 0.9395244240573452, + "grad_norm": 1.7131720781326294, + "learning_rate": 5.4770571826694806e-05, + "loss": 0.9039, + "step": 147060 + }, + { + "epoch": 0.9395883112070839, + "grad_norm": 0.8213444948196411, + "learning_rate": 5.4765577008741644e-05, + "loss": 0.7636, + "step": 147070 + }, + { + "epoch": 0.9396521983568225, + "grad_norm": 0.7397197484970093, + "learning_rate": 5.4760582142795006e-05, + "loss": 0.8527, + "step": 147080 + }, + { + "epoch": 0.9397160855065612, + "grad_norm": 0.7857769131660461, + "learning_rate": 5.475558722890518e-05, + "loss": 0.8647, + "step": 147090 + }, + { + "epoch": 0.9397799726562999, + "grad_norm": 0.7695388197898865, + "learning_rate": 5.4750592267122494e-05, + "loss": 0.8865, + "step": 147100 + }, + { + "epoch": 0.9398438598060386, + "grad_norm": 0.926115870475769, + "learning_rate": 5.4745597257497215e-05, + "loss": 1.1938, + "step": 147110 + }, + { + "epoch": 0.9399077469557773, + "grad_norm": 0.6379870772361755, + "learning_rate": 5.474060220007967e-05, + "loss": 0.7275, + "step": 147120 + }, + { + "epoch": 0.939971634105516, + "grad_norm": 0.6762766242027283, + "learning_rate": 5.473560709492016e-05, + "loss": 1.1361, + "step": 147130 + }, + { + "epoch": 0.9400355212552547, + "grad_norm": 0.9891712069511414, + "learning_rate": 5.4730611942069e-05, + "loss": 0.7849, + "step": 147140 + }, + { + "epoch": 0.9400994084049934, + "grad_norm": 1.2105982303619385, + "learning_rate": 5.472561674157647e-05, + "loss": 0.9796, + "step": 147150 + }, + { + "epoch": 0.9401632955547321, + "grad_norm": 0.9586398601531982, + 
"learning_rate": 5.47206214934929e-05, + "loss": 0.9349, + "step": 147160 + }, + { + "epoch": 0.9402271827044708, + "grad_norm": 1.1456085443496704, + "learning_rate": 5.471562619786858e-05, + "loss": 0.9046, + "step": 147170 + }, + { + "epoch": 0.9402910698542095, + "grad_norm": 0.9945286512374878, + "learning_rate": 5.471063085475383e-05, + "loss": 0.8429, + "step": 147180 + }, + { + "epoch": 0.9403549570039482, + "grad_norm": 0.9011921882629395, + "learning_rate": 5.4705635464198954e-05, + "loss": 0.8381, + "step": 147190 + }, + { + "epoch": 0.9404188441536869, + "grad_norm": 0.6775206327438354, + "learning_rate": 5.4700640026254246e-05, + "loss": 0.9295, + "step": 147200 + }, + { + "epoch": 0.9404827313034256, + "grad_norm": 0.9664705991744995, + "learning_rate": 5.469564454097004e-05, + "loss": 1.06, + "step": 147210 + }, + { + "epoch": 0.9405466184531643, + "grad_norm": 2.8239731788635254, + "learning_rate": 5.469064900839662e-05, + "loss": 0.8501, + "step": 147220 + }, + { + "epoch": 0.940610505602903, + "grad_norm": 1.0002690553665161, + "learning_rate": 5.4685653428584314e-05, + "loss": 0.9444, + "step": 147230 + }, + { + "epoch": 0.9406743927526418, + "grad_norm": 0.9000080227851868, + "learning_rate": 5.468065780158343e-05, + "loss": 0.912, + "step": 147240 + }, + { + "epoch": 0.9407382799023805, + "grad_norm": 1.004228115081787, + "learning_rate": 5.467566212744427e-05, + "loss": 0.8445, + "step": 147250 + }, + { + "epoch": 0.9408021670521192, + "grad_norm": 0.896317720413208, + "learning_rate": 5.467066640621714e-05, + "loss": 1.1256, + "step": 147260 + }, + { + "epoch": 0.9408660542018579, + "grad_norm": 1.0601049661636353, + "learning_rate": 5.466567063795237e-05, + "loss": 0.944, + "step": 147270 + }, + { + "epoch": 0.9409299413515966, + "grad_norm": 1.2464497089385986, + "learning_rate": 5.4660674822700264e-05, + "loss": 0.7599, + "step": 147280 + }, + { + "epoch": 0.9409938285013353, + "grad_norm": 0.9628870487213135, + "learning_rate": 5.4655678960511116e-05, + "loss": 0.9559, + "step": 147290 + }, + { + "epoch": 0.941057715651074, + "grad_norm": 0.6807940602302551, + "learning_rate": 5.465068305143526e-05, + "loss": 0.9153, + "step": 147300 + }, + { + "epoch": 0.9411216028008127, + "grad_norm": 0.7045243382453918, + "learning_rate": 5.4645687095523004e-05, + "loss": 0.6033, + "step": 147310 + }, + { + "epoch": 0.9411854899505514, + "grad_norm": 1.0960919857025146, + "learning_rate": 5.464069109282465e-05, + "loss": 0.9625, + "step": 147320 + }, + { + "epoch": 0.94124937710029, + "grad_norm": 0.8039271235466003, + "learning_rate": 5.4635695043390526e-05, + "loss": 1.0624, + "step": 147330 + }, + { + "epoch": 0.9413132642500287, + "grad_norm": 0.8753572106361389, + "learning_rate": 5.463069894727094e-05, + "loss": 0.7895, + "step": 147340 + }, + { + "epoch": 0.9413771513997674, + "grad_norm": 0.9138633608818054, + "learning_rate": 5.462570280451622e-05, + "loss": 1.1042, + "step": 147350 + }, + { + "epoch": 0.9414410385495061, + "grad_norm": 0.7882958054542542, + "learning_rate": 5.4620706615176645e-05, + "loss": 0.759, + "step": 147360 + }, + { + "epoch": 0.9415049256992448, + "grad_norm": 0.7644445896148682, + "learning_rate": 5.4615710379302574e-05, + "loss": 0.6677, + "step": 147370 + }, + { + "epoch": 0.9415688128489835, + "grad_norm": 0.8131184577941895, + "learning_rate": 5.461071409694432e-05, + "loss": 0.8138, + "step": 147380 + }, + { + "epoch": 0.9416326999987222, + "grad_norm": 1.2795737981796265, + "learning_rate": 5.460571776815216e-05, + "loss": 0.9862, + 
"step": 147390 + }, + { + "epoch": 0.9416965871484609, + "grad_norm": 0.9743437767028809, + "learning_rate": 5.460072139297646e-05, + "loss": 0.9635, + "step": 147400 + }, + { + "epoch": 0.9417604742981996, + "grad_norm": 0.8721929788589478, + "learning_rate": 5.459572497146751e-05, + "loss": 0.9526, + "step": 147410 + }, + { + "epoch": 0.9418243614479384, + "grad_norm": 0.8396299481391907, + "learning_rate": 5.459072850367563e-05, + "loss": 0.9158, + "step": 147420 + }, + { + "epoch": 0.9418882485976771, + "grad_norm": 0.7873830795288086, + "learning_rate": 5.4585731989651144e-05, + "loss": 0.8138, + "step": 147430 + }, + { + "epoch": 0.9419521357474158, + "grad_norm": 1.116898775100708, + "learning_rate": 5.458073542944436e-05, + "loss": 0.9122, + "step": 147440 + }, + { + "epoch": 0.9420160228971545, + "grad_norm": 0.5282606482505798, + "learning_rate": 5.4575738823105626e-05, + "loss": 0.7972, + "step": 147450 + }, + { + "epoch": 0.9420799100468932, + "grad_norm": 0.8728241920471191, + "learning_rate": 5.457074217068523e-05, + "loss": 0.6552, + "step": 147460 + }, + { + "epoch": 0.9421437971966319, + "grad_norm": 0.9961313009262085, + "learning_rate": 5.456574547223351e-05, + "loss": 0.896, + "step": 147470 + }, + { + "epoch": 0.9422076843463706, + "grad_norm": 0.6567425727844238, + "learning_rate": 5.456074872780078e-05, + "loss": 0.6517, + "step": 147480 + }, + { + "epoch": 0.9422715714961093, + "grad_norm": 1.023400068283081, + "learning_rate": 5.455575193743737e-05, + "loss": 1.0409, + "step": 147490 + }, + { + "epoch": 0.942335458645848, + "grad_norm": 1.032383918762207, + "learning_rate": 5.455075510119359e-05, + "loss": 1.1724, + "step": 147500 + }, + { + "epoch": 0.9423993457955867, + "grad_norm": 1.1703791618347168, + "learning_rate": 5.454575821911978e-05, + "loss": 0.9594, + "step": 147510 + }, + { + "epoch": 0.9424632329453254, + "grad_norm": 0.9019293785095215, + "learning_rate": 5.454076129126624e-05, + "loss": 0.8071, + "step": 147520 + }, + { + "epoch": 0.9425271200950641, + "grad_norm": 0.9911213517189026, + "learning_rate": 5.4535764317683314e-05, + "loss": 0.856, + "step": 147530 + }, + { + "epoch": 0.9425910072448028, + "grad_norm": 1.0215911865234375, + "learning_rate": 5.4530767298421315e-05, + "loss": 0.9103, + "step": 147540 + }, + { + "epoch": 0.9426548943945415, + "grad_norm": 1.2681025266647339, + "learning_rate": 5.452577023353057e-05, + "loss": 0.9712, + "step": 147550 + }, + { + "epoch": 0.9427187815442802, + "grad_norm": 1.3751388788223267, + "learning_rate": 5.4520773123061406e-05, + "loss": 1.0203, + "step": 147560 + }, + { + "epoch": 0.9427826686940188, + "grad_norm": 0.6695000529289246, + "learning_rate": 5.4515775967064145e-05, + "loss": 0.9345, + "step": 147570 + }, + { + "epoch": 0.9428465558437575, + "grad_norm": 0.8419767618179321, + "learning_rate": 5.4510778765589096e-05, + "loss": 0.8482, + "step": 147580 + }, + { + "epoch": 0.9429104429934962, + "grad_norm": 0.9437578320503235, + "learning_rate": 5.4505781518686626e-05, + "loss": 0.9328, + "step": 147590 + }, + { + "epoch": 0.942974330143235, + "grad_norm": 0.9840037226676941, + "learning_rate": 5.450078422640703e-05, + "loss": 1.0883, + "step": 147600 + }, + { + "epoch": 0.9430382172929737, + "grad_norm": 0.8432072401046753, + "learning_rate": 5.449578688880064e-05, + "loss": 0.7759, + "step": 147610 + }, + { + "epoch": 0.9431021044427124, + "grad_norm": 0.9873720407485962, + "learning_rate": 5.44907895059178e-05, + "loss": 0.7869, + "step": 147620 + }, + { + "epoch": 0.9431659915924511, + 
"grad_norm": 1.06051504611969, + "learning_rate": 5.44857920778088e-05, + "loss": 0.6161, + "step": 147630 + }, + { + "epoch": 0.9432298787421898, + "grad_norm": 0.6839306950569153, + "learning_rate": 5.448079460452401e-05, + "loss": 0.8623, + "step": 147640 + }, + { + "epoch": 0.9432937658919285, + "grad_norm": 0.9109451174736023, + "learning_rate": 5.4475797086113736e-05, + "loss": 0.7093, + "step": 147650 + }, + { + "epoch": 0.9433576530416672, + "grad_norm": 0.6764558553695679, + "learning_rate": 5.447079952262831e-05, + "loss": 0.715, + "step": 147660 + }, + { + "epoch": 0.9434215401914059, + "grad_norm": 1.947048306465149, + "learning_rate": 5.446580191411808e-05, + "loss": 1.3239, + "step": 147670 + }, + { + "epoch": 0.9434854273411446, + "grad_norm": 0.9166133403778076, + "learning_rate": 5.446080426063335e-05, + "loss": 1.124, + "step": 147680 + }, + { + "epoch": 0.9435493144908833, + "grad_norm": 0.7818967700004578, + "learning_rate": 5.4455806562224466e-05, + "loss": 0.9505, + "step": 147690 + }, + { + "epoch": 0.943613201640622, + "grad_norm": 1.0316641330718994, + "learning_rate": 5.445080881894174e-05, + "loss": 0.9351, + "step": 147700 + }, + { + "epoch": 0.9436770887903607, + "grad_norm": 0.9473081231117249, + "learning_rate": 5.444581103083553e-05, + "loss": 0.6593, + "step": 147710 + }, + { + "epoch": 0.9437409759400994, + "grad_norm": 2.7547168731689453, + "learning_rate": 5.4440813197956165e-05, + "loss": 0.8583, + "step": 147720 + }, + { + "epoch": 0.9438048630898381, + "grad_norm": 1.0668847560882568, + "learning_rate": 5.443581532035396e-05, + "loss": 0.8896, + "step": 147730 + }, + { + "epoch": 0.9438687502395768, + "grad_norm": 0.618463933467865, + "learning_rate": 5.443081739807926e-05, + "loss": 0.7973, + "step": 147740 + }, + { + "epoch": 0.9439326373893155, + "grad_norm": 0.8581937551498413, + "learning_rate": 5.442581943118239e-05, + "loss": 0.6358, + "step": 147750 + }, + { + "epoch": 0.9439965245390542, + "grad_norm": 0.7288326621055603, + "learning_rate": 5.44208214197137e-05, + "loss": 0.8207, + "step": 147760 + }, + { + "epoch": 0.944060411688793, + "grad_norm": 1.2387948036193848, + "learning_rate": 5.4415823363723515e-05, + "loss": 1.1969, + "step": 147770 + }, + { + "epoch": 0.9441242988385317, + "grad_norm": 0.8606761693954468, + "learning_rate": 5.441082526326217e-05, + "loss": 0.7147, + "step": 147780 + }, + { + "epoch": 0.9441881859882704, + "grad_norm": 0.8213832378387451, + "learning_rate": 5.4405827118379984e-05, + "loss": 0.8981, + "step": 147790 + }, + { + "epoch": 0.9442520731380091, + "grad_norm": 1.2024768590927124, + "learning_rate": 5.440082892912731e-05, + "loss": 0.7657, + "step": 147800 + }, + { + "epoch": 0.9443159602877477, + "grad_norm": 1.0111289024353027, + "learning_rate": 5.439583069555448e-05, + "loss": 0.9585, + "step": 147810 + }, + { + "epoch": 0.9443798474374864, + "grad_norm": 0.8769100904464722, + "learning_rate": 5.439083241771185e-05, + "loss": 0.8346, + "step": 147820 + }, + { + "epoch": 0.9444437345872251, + "grad_norm": 2.532590389251709, + "learning_rate": 5.438583409564972e-05, + "loss": 0.7687, + "step": 147830 + }, + { + "epoch": 0.9445076217369638, + "grad_norm": 0.7972337007522583, + "learning_rate": 5.4380835729418454e-05, + "loss": 0.9489, + "step": 147840 + }, + { + "epoch": 0.9445715088867025, + "grad_norm": 1.492598295211792, + "learning_rate": 5.437583731906838e-05, + "loss": 0.9452, + "step": 147850 + }, + { + "epoch": 0.9446353960364412, + "grad_norm": 0.8178605437278748, + "learning_rate": 
5.4370838864649845e-05, + "loss": 0.8965, + "step": 147860 + }, + { + "epoch": 0.9446992831861799, + "grad_norm": 0.7244489789009094, + "learning_rate": 5.436584036621317e-05, + "loss": 0.7104, + "step": 147870 + }, + { + "epoch": 0.9447631703359186, + "grad_norm": 1.0198410749435425, + "learning_rate": 5.4360841823808715e-05, + "loss": 1.0496, + "step": 147880 + }, + { + "epoch": 0.9448270574856573, + "grad_norm": 0.7587177157402039, + "learning_rate": 5.435584323748679e-05, + "loss": 0.9377, + "step": 147890 + }, + { + "epoch": 0.944890944635396, + "grad_norm": 0.8727468252182007, + "learning_rate": 5.4350844607297776e-05, + "loss": 0.712, + "step": 147900 + }, + { + "epoch": 0.9449548317851347, + "grad_norm": 0.5229664444923401, + "learning_rate": 5.4345845933291984e-05, + "loss": 0.6849, + "step": 147910 + }, + { + "epoch": 0.9450187189348734, + "grad_norm": 0.65986567735672, + "learning_rate": 5.4340847215519776e-05, + "loss": 0.9049, + "step": 147920 + }, + { + "epoch": 0.9450826060846121, + "grad_norm": 1.3852014541625977, + "learning_rate": 5.4335848454031466e-05, + "loss": 0.7575, + "step": 147930 + }, + { + "epoch": 0.9451464932343508, + "grad_norm": 1.0181684494018555, + "learning_rate": 5.433084964887742e-05, + "loss": 0.879, + "step": 147940 + }, + { + "epoch": 0.9452103803840896, + "grad_norm": 2.153229236602783, + "learning_rate": 5.432585080010797e-05, + "loss": 0.8431, + "step": 147950 + }, + { + "epoch": 0.9452742675338283, + "grad_norm": 0.9575214385986328, + "learning_rate": 5.432085190777346e-05, + "loss": 0.659, + "step": 147960 + }, + { + "epoch": 0.945338154683567, + "grad_norm": 0.5660369396209717, + "learning_rate": 5.431585297192423e-05, + "loss": 0.967, + "step": 147970 + }, + { + "epoch": 0.9454020418333057, + "grad_norm": 1.3022630214691162, + "learning_rate": 5.431085399261063e-05, + "loss": 0.8355, + "step": 147980 + }, + { + "epoch": 0.9454659289830444, + "grad_norm": 1.3143341541290283, + "learning_rate": 5.4305854969883006e-05, + "loss": 0.8243, + "step": 147990 + }, + { + "epoch": 0.9455298161327831, + "grad_norm": 1.7055933475494385, + "learning_rate": 5.4300855903791694e-05, + "loss": 1.1814, + "step": 148000 + }, + { + "epoch": 0.9455937032825218, + "grad_norm": 0.5489552021026611, + "learning_rate": 5.429585679438705e-05, + "loss": 0.761, + "step": 148010 + }, + { + "epoch": 0.9456575904322605, + "grad_norm": 1.1143733263015747, + "learning_rate": 5.429085764171939e-05, + "loss": 1.1932, + "step": 148020 + }, + { + "epoch": 0.9457214775819992, + "grad_norm": 1.6179147958755493, + "learning_rate": 5.42858584458391e-05, + "loss": 1.4239, + "step": 148030 + }, + { + "epoch": 0.9457853647317379, + "grad_norm": 0.9910181164741516, + "learning_rate": 5.4280859206796506e-05, + "loss": 0.8706, + "step": 148040 + }, + { + "epoch": 0.9458492518814765, + "grad_norm": 1.4709466695785522, + "learning_rate": 5.4275859924641936e-05, + "loss": 0.6839, + "step": 148050 + }, + { + "epoch": 0.9459131390312152, + "grad_norm": 0.8617550730705261, + "learning_rate": 5.4270860599425775e-05, + "loss": 1.1046, + "step": 148060 + }, + { + "epoch": 0.9459770261809539, + "grad_norm": 1.1911267042160034, + "learning_rate": 5.426586123119835e-05, + "loss": 0.8552, + "step": 148070 + }, + { + "epoch": 0.9460409133306926, + "grad_norm": 0.9309574365615845, + "learning_rate": 5.426086182001001e-05, + "loss": 0.8093, + "step": 148080 + }, + { + "epoch": 0.9461048004804313, + "grad_norm": 0.9242094159126282, + "learning_rate": 5.425586236591112e-05, + "loss": 0.8593, + "step": 148090 
+ }, + { + "epoch": 0.94616868763017, + "grad_norm": 0.7709175944328308, + "learning_rate": 5.4250862868951994e-05, + "loss": 0.9407, + "step": 148100 + }, + { + "epoch": 0.9462325747799087, + "grad_norm": 2.212761878967285, + "learning_rate": 5.424586332918301e-05, + "loss": 0.8686, + "step": 148110 + }, + { + "epoch": 0.9462964619296474, + "grad_norm": 0.9792400598526001, + "learning_rate": 5.424086374665451e-05, + "loss": 0.8417, + "step": 148120 + }, + { + "epoch": 0.9463603490793862, + "grad_norm": 0.8189619183540344, + "learning_rate": 5.423586412141685e-05, + "loss": 0.7651, + "step": 148130 + }, + { + "epoch": 0.9464242362291249, + "grad_norm": 0.6825985312461853, + "learning_rate": 5.423086445352036e-05, + "loss": 0.6932, + "step": 148140 + }, + { + "epoch": 0.9464881233788636, + "grad_norm": 1.7145543098449707, + "learning_rate": 5.422586474301541e-05, + "loss": 0.789, + "step": 148150 + }, + { + "epoch": 0.9465520105286023, + "grad_norm": 0.9677301645278931, + "learning_rate": 5.4220864989952345e-05, + "loss": 0.9292, + "step": 148160 + }, + { + "epoch": 0.946615897678341, + "grad_norm": 0.8834055662155151, + "learning_rate": 5.421586519438152e-05, + "loss": 0.7546, + "step": 148170 + }, + { + "epoch": 0.9466797848280797, + "grad_norm": 1.360954999923706, + "learning_rate": 5.421086535635328e-05, + "loss": 0.9588, + "step": 148180 + }, + { + "epoch": 0.9467436719778184, + "grad_norm": 0.9436541795730591, + "learning_rate": 5.4205865475918e-05, + "loss": 0.7112, + "step": 148190 + }, + { + "epoch": 0.9468075591275571, + "grad_norm": 1.135512351989746, + "learning_rate": 5.420086555312599e-05, + "loss": 1.0807, + "step": 148200 + }, + { + "epoch": 0.9468714462772958, + "grad_norm": NaN, + "learning_rate": 5.419636558643983e-05, + "loss": 0.9952, + "step": 148210 + }, + { + "epoch": 0.9469353334270345, + "grad_norm": 0.8383281230926514, + "learning_rate": 5.4191365583308814e-05, + "loss": 0.9675, + "step": 148220 + }, + { + "epoch": 0.9469992205767732, + "grad_norm": 0.6859557032585144, + "learning_rate": 5.418636553796713e-05, + "loss": 0.9006, + "step": 148230 + }, + { + "epoch": 0.9470631077265119, + "grad_norm": 0.9620136022567749, + "learning_rate": 5.4181365450465125e-05, + "loss": 0.8653, + "step": 148240 + }, + { + "epoch": 0.9471269948762506, + "grad_norm": 0.5614147186279297, + "learning_rate": 5.417636532085315e-05, + "loss": 0.7507, + "step": 148250 + }, + { + "epoch": 0.9471908820259893, + "grad_norm": 1.3918198347091675, + "learning_rate": 5.417136514918156e-05, + "loss": 0.9124, + "step": 148260 + }, + { + "epoch": 0.947254769175728, + "grad_norm": 0.8057130575180054, + "learning_rate": 5.416636493550071e-05, + "loss": 0.8625, + "step": 148270 + }, + { + "epoch": 0.9473186563254667, + "grad_norm": 1.2097066640853882, + "learning_rate": 5.4161364679860974e-05, + "loss": 0.9413, + "step": 148280 + }, + { + "epoch": 0.9473825434752055, + "grad_norm": 0.8656195402145386, + "learning_rate": 5.415636438231269e-05, + "loss": 0.851, + "step": 148290 + }, + { + "epoch": 0.947446430624944, + "grad_norm": 0.9024816751480103, + "learning_rate": 5.4151364042906216e-05, + "loss": 0.6835, + "step": 148300 + }, + { + "epoch": 0.9475103177746828, + "grad_norm": 0.9172950983047485, + "learning_rate": 5.414636366169191e-05, + "loss": 0.9436, + "step": 148310 + }, + { + "epoch": 0.9475742049244215, + "grad_norm": 1.2229052782058716, + "learning_rate": 5.4141363238720144e-05, + "loss": 0.7715, + "step": 148320 + }, + { + "epoch": 0.9476380920741602, + "grad_norm": 0.7805074453353882, + 
"learning_rate": 5.4136362774041274e-05, + "loss": 0.7959, + "step": 148330 + }, + { + "epoch": 0.9477019792238989, + "grad_norm": 1.2638076543807983, + "learning_rate": 5.4131362267705635e-05, + "loss": 0.8026, + "step": 148340 + }, + { + "epoch": 0.9477658663736376, + "grad_norm": 0.8353046774864197, + "learning_rate": 5.412636171976362e-05, + "loss": 0.8111, + "step": 148350 + }, + { + "epoch": 0.9478297535233763, + "grad_norm": 0.7682856917381287, + "learning_rate": 5.4121361130265556e-05, + "loss": 1.0184, + "step": 148360 + }, + { + "epoch": 0.947893640673115, + "grad_norm": 1.0108745098114014, + "learning_rate": 5.411636049926183e-05, + "loss": 0.9613, + "step": 148370 + }, + { + "epoch": 0.9479575278228537, + "grad_norm": 1.050826907157898, + "learning_rate": 5.4111359826802785e-05, + "loss": 0.742, + "step": 148380 + }, + { + "epoch": 0.9480214149725924, + "grad_norm": 0.868506133556366, + "learning_rate": 5.41063591129388e-05, + "loss": 1.0165, + "step": 148390 + }, + { + "epoch": 0.9480853021223311, + "grad_norm": 0.8240692019462585, + "learning_rate": 5.410135835772023e-05, + "loss": 0.9583, + "step": 148400 + }, + { + "epoch": 0.9481491892720698, + "grad_norm": 0.7590421438217163, + "learning_rate": 5.409635756119742e-05, + "loss": 0.9101, + "step": 148410 + }, + { + "epoch": 0.9482130764218085, + "grad_norm": 0.9311391711235046, + "learning_rate": 5.409135672342076e-05, + "loss": 0.9968, + "step": 148420 + }, + { + "epoch": 0.9482769635715472, + "grad_norm": 0.7667366862297058, + "learning_rate": 5.408635584444058e-05, + "loss": 0.7345, + "step": 148430 + }, + { + "epoch": 0.9483408507212859, + "grad_norm": 2.864366292953491, + "learning_rate": 5.408135492430728e-05, + "loss": 0.9094, + "step": 148440 + }, + { + "epoch": 0.9484047378710246, + "grad_norm": 1.2095937728881836, + "learning_rate": 5.407635396307119e-05, + "loss": 0.9626, + "step": 148450 + }, + { + "epoch": 0.9484686250207633, + "grad_norm": 1.3204469680786133, + "learning_rate": 5.4071352960782697e-05, + "loss": 0.9588, + "step": 148460 + }, + { + "epoch": 0.948532512170502, + "grad_norm": 0.5598198771476746, + "learning_rate": 5.406635191749215e-05, + "loss": 0.8083, + "step": 148470 + }, + { + "epoch": 0.9485963993202408, + "grad_norm": 1.0732495784759521, + "learning_rate": 5.406135083324993e-05, + "loss": 0.8825, + "step": 148480 + }, + { + "epoch": 0.9486602864699795, + "grad_norm": 0.9504900574684143, + "learning_rate": 5.405634970810639e-05, + "loss": 1.174, + "step": 148490 + }, + { + "epoch": 0.9487241736197182, + "grad_norm": 1.0843398571014404, + "learning_rate": 5.40513485421119e-05, + "loss": 1.0179, + "step": 148500 + }, + { + "epoch": 0.9487880607694569, + "grad_norm": 1.0680909156799316, + "learning_rate": 5.404634733531683e-05, + "loss": 1.0718, + "step": 148510 + }, + { + "epoch": 0.9488519479191956, + "grad_norm": 0.8766992092132568, + "learning_rate": 5.404134608777154e-05, + "loss": 0.8991, + "step": 148520 + }, + { + "epoch": 0.9489158350689343, + "grad_norm": 0.7806427478790283, + "learning_rate": 5.4036344799526396e-05, + "loss": 1.0212, + "step": 148530 + }, + { + "epoch": 0.9489797222186729, + "grad_norm": 0.9401262998580933, + "learning_rate": 5.4031343470631756e-05, + "loss": 0.8057, + "step": 148540 + }, + { + "epoch": 0.9490436093684116, + "grad_norm": 0.99031001329422, + "learning_rate": 5.402634210113801e-05, + "loss": 0.8273, + "step": 148550 + }, + { + "epoch": 0.9491074965181503, + "grad_norm": 0.5672215223312378, + "learning_rate": 5.402134069109551e-05, + "loss": 1.0363, + 
"step": 148560 + }, + { + "epoch": 0.949171383667889, + "grad_norm": 0.7566940784454346, + "learning_rate": 5.401633924055464e-05, + "loss": 0.8976, + "step": 148570 + }, + { + "epoch": 0.9492352708176277, + "grad_norm": 1.022157907485962, + "learning_rate": 5.401133774956576e-05, + "loss": 0.9522, + "step": 148580 + }, + { + "epoch": 0.9492991579673664, + "grad_norm": 0.8552069664001465, + "learning_rate": 5.400633621817923e-05, + "loss": 0.7844, + "step": 148590 + }, + { + "epoch": 0.9493630451171051, + "grad_norm": 1.0440733432769775, + "learning_rate": 5.4001334646445436e-05, + "loss": 0.6886, + "step": 148600 + }, + { + "epoch": 0.9494269322668438, + "grad_norm": 1.094836711883545, + "learning_rate": 5.399633303441474e-05, + "loss": 1.0184, + "step": 148610 + }, + { + "epoch": 0.9494908194165825, + "grad_norm": 0.8054597973823547, + "learning_rate": 5.399133138213751e-05, + "loss": 0.9208, + "step": 148620 + }, + { + "epoch": 0.9495547065663212, + "grad_norm": 0.6301440000534058, + "learning_rate": 5.398632968966412e-05, + "loss": 0.8177, + "step": 148630 + }, + { + "epoch": 0.9496185937160599, + "grad_norm": 1.095146894454956, + "learning_rate": 5.398182813211199e-05, + "loss": 1.1332, + "step": 148640 + }, + { + "epoch": 0.9496824808657987, + "grad_norm": 1.182285189628601, + "learning_rate": 5.3976826363404665e-05, + "loss": 0.8807, + "step": 148650 + }, + { + "epoch": 0.9497463680155374, + "grad_norm": 1.0031092166900635, + "learning_rate": 5.397182455464725e-05, + "loss": 1.1409, + "step": 148660 + }, + { + "epoch": 0.9498102551652761, + "grad_norm": 0.9576486945152283, + "learning_rate": 5.396682270589015e-05, + "loss": 0.9945, + "step": 148670 + }, + { + "epoch": 0.9498741423150148, + "grad_norm": 1.1300849914550781, + "learning_rate": 5.396182081718369e-05, + "loss": 0.9599, + "step": 148680 + }, + { + "epoch": 0.9499380294647535, + "grad_norm": 1.3322839736938477, + "learning_rate": 5.395681888857829e-05, + "loss": 0.8257, + "step": 148690 + }, + { + "epoch": 0.9500019166144922, + "grad_norm": 1.616481065750122, + "learning_rate": 5.3951816920124285e-05, + "loss": 0.9125, + "step": 148700 + }, + { + "epoch": 0.9500658037642309, + "grad_norm": 0.7100057601928711, + "learning_rate": 5.394681491187207e-05, + "loss": 1.3379, + "step": 148710 + }, + { + "epoch": 0.9501296909139696, + "grad_norm": 0.8348981738090515, + "learning_rate": 5.394181286387202e-05, + "loss": 0.9037, + "step": 148720 + }, + { + "epoch": 0.9501935780637083, + "grad_norm": 1.0880135297775269, + "learning_rate": 5.3936810776174497e-05, + "loss": 0.8901, + "step": 148730 + }, + { + "epoch": 0.950257465213447, + "grad_norm": 1.474798560142517, + "learning_rate": 5.3931808648829887e-05, + "loss": 1.0315, + "step": 148740 + }, + { + "epoch": 0.9503213523631857, + "grad_norm": 1.2487114667892456, + "learning_rate": 5.392680648188856e-05, + "loss": 0.9557, + "step": 148750 + }, + { + "epoch": 0.9503852395129244, + "grad_norm": 0.5806934833526611, + "learning_rate": 5.392180427540089e-05, + "loss": 0.8193, + "step": 148760 + }, + { + "epoch": 0.9504491266626631, + "grad_norm": 0.8657655715942383, + "learning_rate": 5.391680202941727e-05, + "loss": 0.9947, + "step": 148770 + }, + { + "epoch": 0.9505130138124017, + "grad_norm": 1.0239652395248413, + "learning_rate": 5.3911799743988054e-05, + "loss": 0.9196, + "step": 148780 + }, + { + "epoch": 0.9505769009621404, + "grad_norm": 0.862191379070282, + "learning_rate": 5.390679741916365e-05, + "loss": 1.0819, + "step": 148790 + }, + { + "epoch": 0.9506407881118791, + 
"grad_norm": 0.7813977599143982, + "learning_rate": 5.39017950549944e-05, + "loss": 0.8585, + "step": 148800 + }, + { + "epoch": 0.9507046752616178, + "grad_norm": 0.829288125038147, + "learning_rate": 5.389679265153069e-05, + "loss": 0.9753, + "step": 148810 + }, + { + "epoch": 0.9507685624113565, + "grad_norm": 1.4242595434188843, + "learning_rate": 5.389179020882291e-05, + "loss": 0.5942, + "step": 148820 + }, + { + "epoch": 0.9508324495610952, + "grad_norm": 0.8533827066421509, + "learning_rate": 5.388678772692144e-05, + "loss": 1.042, + "step": 148830 + }, + { + "epoch": 0.950896336710834, + "grad_norm": 0.6416082978248596, + "learning_rate": 5.388178520587666e-05, + "loss": 1.0584, + "step": 148840 + }, + { + "epoch": 0.9509602238605727, + "grad_norm": 1.0949831008911133, + "learning_rate": 5.387678264573893e-05, + "loss": 0.8983, + "step": 148850 + }, + { + "epoch": 0.9510241110103114, + "grad_norm": 1.0142227411270142, + "learning_rate": 5.3871780046558664e-05, + "loss": 0.7403, + "step": 148860 + }, + { + "epoch": 0.9510879981600501, + "grad_norm": 0.9484434127807617, + "learning_rate": 5.3866777408386217e-05, + "loss": 0.9695, + "step": 148870 + }, + { + "epoch": 0.9511518853097888, + "grad_norm": 0.765370786190033, + "learning_rate": 5.386177473127197e-05, + "loss": 0.7995, + "step": 148880 + }, + { + "epoch": 0.9512157724595275, + "grad_norm": 0.7197050452232361, + "learning_rate": 5.385677201526631e-05, + "loss": 0.8397, + "step": 148890 + }, + { + "epoch": 0.9512796596092662, + "grad_norm": 0.9084450602531433, + "learning_rate": 5.385176926041963e-05, + "loss": 1.113, + "step": 148900 + }, + { + "epoch": 0.9513435467590049, + "grad_norm": 1.0009987354278564, + "learning_rate": 5.3846766466782294e-05, + "loss": 0.9134, + "step": 148910 + }, + { + "epoch": 0.9514074339087436, + "grad_norm": 0.9077139496803284, + "learning_rate": 5.3841763634404695e-05, + "loss": 1.0847, + "step": 148920 + }, + { + "epoch": 0.9514713210584823, + "grad_norm": 0.6592074632644653, + "learning_rate": 5.383676076333721e-05, + "loss": 0.7027, + "step": 148930 + }, + { + "epoch": 0.951535208208221, + "grad_norm": 0.9967330694198608, + "learning_rate": 5.383175785363023e-05, + "loss": 0.8229, + "step": 148940 + }, + { + "epoch": 0.9515990953579597, + "grad_norm": 0.9206305146217346, + "learning_rate": 5.382675490533413e-05, + "loss": 0.8789, + "step": 148950 + }, + { + "epoch": 0.9516629825076984, + "grad_norm": 2.410933017730713, + "learning_rate": 5.38217519184993e-05, + "loss": 0.8032, + "step": 148960 + }, + { + "epoch": 0.9517268696574371, + "grad_norm": 1.0102331638336182, + "learning_rate": 5.381674889317612e-05, + "loss": 1.1499, + "step": 148970 + }, + { + "epoch": 0.9517907568071758, + "grad_norm": 1.321588397026062, + "learning_rate": 5.3811745829414975e-05, + "loss": 0.9381, + "step": 148980 + }, + { + "epoch": 0.9518546439569145, + "grad_norm": 0.7851718664169312, + "learning_rate": 5.3806742727266245e-05, + "loss": 0.807, + "step": 148990 + }, + { + "epoch": 0.9519185311066533, + "grad_norm": 0.8672306537628174, + "learning_rate": 5.380173958678033e-05, + "loss": 0.8848, + "step": 149000 + }, + { + "epoch": 0.951982418256392, + "grad_norm": 1.5692130327224731, + "learning_rate": 5.379673640800761e-05, + "loss": 0.6902, + "step": 149010 + }, + { + "epoch": 0.9520463054061307, + "grad_norm": 0.9752570986747742, + "learning_rate": 5.379173319099845e-05, + "loss": 0.9114, + "step": 149020 + }, + { + "epoch": 0.9521101925558693, + "grad_norm": 0.89276123046875, + "learning_rate": 
5.378672993580329e-05, + "loss": 0.7358, + "step": 149030 + }, + { + "epoch": 0.952174079705608, + "grad_norm": 0.9652552604675293, + "learning_rate": 5.378172664247246e-05, + "loss": 0.8794, + "step": 149040 + }, + { + "epoch": 0.9522379668553467, + "grad_norm": 0.7110236287117004, + "learning_rate": 5.377672331105639e-05, + "loss": 0.9261, + "step": 149050 + }, + { + "epoch": 0.9523018540050854, + "grad_norm": 0.5412469506263733, + "learning_rate": 5.3771719941605434e-05, + "loss": 0.8893, + "step": 149060 + }, + { + "epoch": 0.9523657411548241, + "grad_norm": 0.5765590071678162, + "learning_rate": 5.3766716534170004e-05, + "loss": 0.7548, + "step": 149070 + }, + { + "epoch": 0.9524296283045628, + "grad_norm": 0.9300665855407715, + "learning_rate": 5.376171308880047e-05, + "loss": 0.982, + "step": 149080 + }, + { + "epoch": 0.9524935154543015, + "grad_norm": 0.7088594436645508, + "learning_rate": 5.375670960554724e-05, + "loss": 0.7038, + "step": 149090 + }, + { + "epoch": 0.9525574026040402, + "grad_norm": 0.7242599129676819, + "learning_rate": 5.375170608446068e-05, + "loss": 0.8099, + "step": 149100 + }, + { + "epoch": 0.9526212897537789, + "grad_norm": 0.8361330628395081, + "learning_rate": 5.3746702525591205e-05, + "loss": 1.1563, + "step": 149110 + }, + { + "epoch": 0.9526851769035176, + "grad_norm": 1.0023174285888672, + "learning_rate": 5.3741698928989194e-05, + "loss": 0.8939, + "step": 149120 + }, + { + "epoch": 0.9527490640532563, + "grad_norm": 0.6952134966850281, + "learning_rate": 5.373669529470504e-05, + "loss": 0.9099, + "step": 149130 + }, + { + "epoch": 0.952812951202995, + "grad_norm": 0.7857193946838379, + "learning_rate": 5.373169162278913e-05, + "loss": 0.8369, + "step": 149140 + }, + { + "epoch": 0.9528768383527337, + "grad_norm": 0.8915547728538513, + "learning_rate": 5.372668791329185e-05, + "loss": 1.0329, + "step": 149150 + }, + { + "epoch": 0.9529407255024724, + "grad_norm": 0.9852863550186157, + "learning_rate": 5.372168416626361e-05, + "loss": 1.06, + "step": 149160 + }, + { + "epoch": 0.9530046126522111, + "grad_norm": 0.7365124225616455, + "learning_rate": 5.371668038175478e-05, + "loss": 0.7519, + "step": 149170 + }, + { + "epoch": 0.9530684998019499, + "grad_norm": 0.8659708499908447, + "learning_rate": 5.371167655981576e-05, + "loss": 0.8632, + "step": 149180 + }, + { + "epoch": 0.9531323869516886, + "grad_norm": 0.8980143070220947, + "learning_rate": 5.3706672700496954e-05, + "loss": 0.7586, + "step": 149190 + }, + { + "epoch": 0.9531962741014273, + "grad_norm": 0.9168791174888611, + "learning_rate": 5.370166880384875e-05, + "loss": 0.9606, + "step": 149200 + }, + { + "epoch": 0.953260161251166, + "grad_norm": 3.515974760055542, + "learning_rate": 5.369666486992153e-05, + "loss": 0.9327, + "step": 149210 + }, + { + "epoch": 0.9533240484009047, + "grad_norm": 0.5968050360679626, + "learning_rate": 5.3691660898765705e-05, + "loss": 0.7153, + "step": 149220 + }, + { + "epoch": 0.9533879355506434, + "grad_norm": 0.4954209625720978, + "learning_rate": 5.3686656890431665e-05, + "loss": 0.7143, + "step": 149230 + }, + { + "epoch": 0.9534518227003821, + "grad_norm": 0.938412070274353, + "learning_rate": 5.3681652844969785e-05, + "loss": 0.7915, + "step": 149240 + }, + { + "epoch": 0.9535157098501208, + "grad_norm": 0.7088356614112854, + "learning_rate": 5.3676648762430495e-05, + "loss": 0.8184, + "step": 149250 + }, + { + "epoch": 0.9535795969998595, + "grad_norm": 1.936963677406311, + "learning_rate": 5.367164464286416e-05, + "loss": 0.7562, + "step": 149260 
+ }, + { + "epoch": 0.9536434841495981, + "grad_norm": 0.9665189385414124, + "learning_rate": 5.366664048632118e-05, + "loss": 0.9453, + "step": 149270 + }, + { + "epoch": 0.9537073712993368, + "grad_norm": 1.063223123550415, + "learning_rate": 5.366163629285198e-05, + "loss": 0.8095, + "step": 149280 + }, + { + "epoch": 0.9537712584490755, + "grad_norm": 1.0628734827041626, + "learning_rate": 5.365663206250693e-05, + "loss": 1.1281, + "step": 149290 + }, + { + "epoch": 0.9538351455988142, + "grad_norm": 0.9469806551933289, + "learning_rate": 5.365162779533641e-05, + "loss": 1.0392, + "step": 149300 + }, + { + "epoch": 0.9538990327485529, + "grad_norm": 0.8327413201332092, + "learning_rate": 5.3646623491390855e-05, + "loss": 0.5963, + "step": 149310 + }, + { + "epoch": 0.9539629198982916, + "grad_norm": 0.7886581420898438, + "learning_rate": 5.3641619150720646e-05, + "loss": 0.8496, + "step": 149320 + }, + { + "epoch": 0.9540268070480303, + "grad_norm": 0.7735728621482849, + "learning_rate": 5.363661477337618e-05, + "loss": 0.7483, + "step": 149330 + }, + { + "epoch": 0.954090694197769, + "grad_norm": 0.7688968777656555, + "learning_rate": 5.363161035940785e-05, + "loss": 0.9939, + "step": 149340 + }, + { + "epoch": 0.9541545813475077, + "grad_norm": 1.1434403657913208, + "learning_rate": 5.362660590886607e-05, + "loss": 1.1379, + "step": 149350 + }, + { + "epoch": 0.9542184684972465, + "grad_norm": 1.1655575037002563, + "learning_rate": 5.362160142180123e-05, + "loss": 0.9408, + "step": 149360 + }, + { + "epoch": 0.9542823556469852, + "grad_norm": 0.7211439609527588, + "learning_rate": 5.361659689826373e-05, + "loss": 0.6937, + "step": 149370 + }, + { + "epoch": 0.9543462427967239, + "grad_norm": 0.9510606527328491, + "learning_rate": 5.361159233830396e-05, + "loss": 1.044, + "step": 149380 + }, + { + "epoch": 0.9544101299464626, + "grad_norm": 0.7282498478889465, + "learning_rate": 5.360658774197235e-05, + "loss": 0.7549, + "step": 149390 + }, + { + "epoch": 0.9544740170962013, + "grad_norm": 1.0271804332733154, + "learning_rate": 5.3601583109319264e-05, + "loss": 1.0737, + "step": 149400 + }, + { + "epoch": 0.95453790424594, + "grad_norm": 0.7570605278015137, + "learning_rate": 5.359657844039514e-05, + "loss": 0.6157, + "step": 149410 + }, + { + "epoch": 0.9546017913956787, + "grad_norm": 0.8889819979667664, + "learning_rate": 5.3591573735250344e-05, + "loss": 1.0411, + "step": 149420 + }, + { + "epoch": 0.9546656785454174, + "grad_norm": 1.0955427885055542, + "learning_rate": 5.358656899393529e-05, + "loss": 1.0119, + "step": 149430 + }, + { + "epoch": 0.9547295656951561, + "grad_norm": 0.8745180368423462, + "learning_rate": 5.35815642165004e-05, + "loss": 0.9069, + "step": 149440 + }, + { + "epoch": 0.9547934528448948, + "grad_norm": 0.9385198354721069, + "learning_rate": 5.357655940299605e-05, + "loss": 0.8872, + "step": 149450 + }, + { + "epoch": 0.9548573399946335, + "grad_norm": 0.8840739130973816, + "learning_rate": 5.357155455347265e-05, + "loss": 0.6832, + "step": 149460 + }, + { + "epoch": 0.9549212271443722, + "grad_norm": 0.6228633522987366, + "learning_rate": 5.3566549667980614e-05, + "loss": 0.927, + "step": 149470 + }, + { + "epoch": 0.9549851142941109, + "grad_norm": 1.233453631401062, + "learning_rate": 5.356154474657033e-05, + "loss": 1.0322, + "step": 149480 + }, + { + "epoch": 0.9550490014438496, + "grad_norm": 1.1719436645507812, + "learning_rate": 5.355653978929222e-05, + "loss": 0.9269, + "step": 149490 + }, + { + "epoch": 0.9551128885935883, + "grad_norm": 
0.7223755717277527, + "learning_rate": 5.3551534796196656e-05, + "loss": 0.8008, + "step": 149500 + }, + { + "epoch": 0.9551767757433269, + "grad_norm": 0.946700930595398, + "learning_rate": 5.3546529767334085e-05, + "loss": 0.7767, + "step": 149510 + }, + { + "epoch": 0.9552406628930656, + "grad_norm": 1.2065627574920654, + "learning_rate": 5.3541524702754886e-05, + "loss": 1.0349, + "step": 149520 + }, + { + "epoch": 0.9553045500428043, + "grad_norm": 1.362046718597412, + "learning_rate": 5.353651960250946e-05, + "loss": 0.6447, + "step": 149530 + }, + { + "epoch": 0.955368437192543, + "grad_norm": 1.2870509624481201, + "learning_rate": 5.353151446664824e-05, + "loss": 0.7011, + "step": 149540 + }, + { + "epoch": 0.9554323243422818, + "grad_norm": 1.1624693870544434, + "learning_rate": 5.352650929522159e-05, + "loss": 1.0363, + "step": 149550 + }, + { + "epoch": 0.9554962114920205, + "grad_norm": 1.077800989151001, + "learning_rate": 5.352150408827996e-05, + "loss": 0.9021, + "step": 149560 + }, + { + "epoch": 0.9555600986417592, + "grad_norm": 1.4235363006591797, + "learning_rate": 5.351649884587373e-05, + "loss": 0.9034, + "step": 149570 + }, + { + "epoch": 0.9556239857914979, + "grad_norm": 0.7878531217575073, + "learning_rate": 5.351149356805332e-05, + "loss": 1.0202, + "step": 149580 + }, + { + "epoch": 0.9556878729412366, + "grad_norm": 0.9151926636695862, + "learning_rate": 5.3506488254869124e-05, + "loss": 0.7615, + "step": 149590 + }, + { + "epoch": 0.9557517600909753, + "grad_norm": 0.8599022626876831, + "learning_rate": 5.3501482906371556e-05, + "loss": 0.9448, + "step": 149600 + }, + { + "epoch": 0.955815647240714, + "grad_norm": 0.8533304929733276, + "learning_rate": 5.349647752261103e-05, + "loss": 1.0606, + "step": 149610 + }, + { + "epoch": 0.9558795343904527, + "grad_norm": 1.0226484537124634, + "learning_rate": 5.3491472103637955e-05, + "loss": 0.9859, + "step": 149620 + }, + { + "epoch": 0.9559434215401914, + "grad_norm": 0.7206386923789978, + "learning_rate": 5.3486466649502733e-05, + "loss": 0.9154, + "step": 149630 + }, + { + "epoch": 0.9560073086899301, + "grad_norm": 1.4392954111099243, + "learning_rate": 5.3481461160255773e-05, + "loss": 0.74, + "step": 149640 + }, + { + "epoch": 0.9560711958396688, + "grad_norm": 1.0331584215164185, + "learning_rate": 5.3476455635947484e-05, + "loss": 0.8146, + "step": 149650 + }, + { + "epoch": 0.9561350829894075, + "grad_norm": 0.8725119829177856, + "learning_rate": 5.3471450076628294e-05, + "loss": 0.8874, + "step": 149660 + }, + { + "epoch": 0.9561989701391462, + "grad_norm": 0.8222038149833679, + "learning_rate": 5.346644448234859e-05, + "loss": 0.8343, + "step": 149670 + }, + { + "epoch": 0.9562628572888849, + "grad_norm": 1.1900924444198608, + "learning_rate": 5.3461438853158784e-05, + "loss": 0.8374, + "step": 149680 + }, + { + "epoch": 0.9563267444386236, + "grad_norm": 0.8432425260543823, + "learning_rate": 5.34564331891093e-05, + "loss": 1.0343, + "step": 149690 + }, + { + "epoch": 0.9563906315883623, + "grad_norm": 1.0364454984664917, + "learning_rate": 5.3451427490250535e-05, + "loss": 0.9026, + "step": 149700 + }, + { + "epoch": 0.9564545187381011, + "grad_norm": 0.8370749950408936, + "learning_rate": 5.344642175663292e-05, + "loss": 0.8584, + "step": 149710 + }, + { + "epoch": 0.9565184058878398, + "grad_norm": 0.720355212688446, + "learning_rate": 5.3441415988306856e-05, + "loss": 0.7958, + "step": 149720 + }, + { + "epoch": 0.9565822930375785, + "grad_norm": 0.9760454297065735, + "learning_rate": 
5.343641018532275e-05, + "loss": 0.653, + "step": 149730 + }, + { + "epoch": 0.9566461801873172, + "grad_norm": 0.9241121411323547, + "learning_rate": 5.3431404347731015e-05, + "loss": 0.9755, + "step": 149740 + }, + { + "epoch": 0.9567100673370558, + "grad_norm": 1.0608195066452026, + "learning_rate": 5.3426398475582086e-05, + "loss": 0.9359, + "step": 149750 + }, + { + "epoch": 0.9567739544867945, + "grad_norm": 0.5845559239387512, + "learning_rate": 5.3421392568926363e-05, + "loss": 0.5841, + "step": 149760 + }, + { + "epoch": 0.9568378416365332, + "grad_norm": 1.0789985656738281, + "learning_rate": 5.341638662781424e-05, + "loss": 0.8384, + "step": 149770 + }, + { + "epoch": 0.9569017287862719, + "grad_norm": 1.286792278289795, + "learning_rate": 5.341138065229616e-05, + "loss": 0.7882, + "step": 149780 + }, + { + "epoch": 0.9569656159360106, + "grad_norm": 0.814147412776947, + "learning_rate": 5.3406374642422516e-05, + "loss": 0.8117, + "step": 149790 + }, + { + "epoch": 0.9570295030857493, + "grad_norm": 0.7748686671257019, + "learning_rate": 5.340136859824374e-05, + "loss": 0.7822, + "step": 149800 + }, + { + "epoch": 0.957093390235488, + "grad_norm": 0.9128410220146179, + "learning_rate": 5.339636251981024e-05, + "loss": 0.8351, + "step": 149810 + }, + { + "epoch": 0.9571572773852267, + "grad_norm": 1.1304187774658203, + "learning_rate": 5.339135640717242e-05, + "loss": 0.66, + "step": 149820 + }, + { + "epoch": 0.9572211645349654, + "grad_norm": 3.093017816543579, + "learning_rate": 5.3386350260380724e-05, + "loss": 0.9673, + "step": 149830 + }, + { + "epoch": 0.9572850516847041, + "grad_norm": 0.8403214812278748, + "learning_rate": 5.338134407948554e-05, + "loss": 0.8767, + "step": 149840 + }, + { + "epoch": 0.9573489388344428, + "grad_norm": 0.7375771403312683, + "learning_rate": 5.33763378645373e-05, + "loss": 1.0156, + "step": 149850 + }, + { + "epoch": 0.9574128259841815, + "grad_norm": 1.0833170413970947, + "learning_rate": 5.3371331615586405e-05, + "loss": 0.7993, + "step": 149860 + }, + { + "epoch": 0.9574767131339202, + "grad_norm": 0.7025789618492126, + "learning_rate": 5.336632533268329e-05, + "loss": 0.8754, + "step": 149870 + }, + { + "epoch": 0.957540600283659, + "grad_norm": 1.240759253501892, + "learning_rate": 5.336131901587836e-05, + "loss": 1.0206, + "step": 149880 + }, + { + "epoch": 0.9576044874333977, + "grad_norm": 0.9655282497406006, + "learning_rate": 5.335631266522205e-05, + "loss": 0.8602, + "step": 149890 + }, + { + "epoch": 0.9576683745831364, + "grad_norm": 0.8315982818603516, + "learning_rate": 5.335130628076478e-05, + "loss": 0.6983, + "step": 149900 + }, + { + "epoch": 0.9577322617328751, + "grad_norm": 0.5027674436569214, + "learning_rate": 5.334629986255694e-05, + "loss": 0.8194, + "step": 149910 + }, + { + "epoch": 0.9577961488826138, + "grad_norm": 1.5360819101333618, + "learning_rate": 5.3341293410648964e-05, + "loss": 0.8028, + "step": 149920 + }, + { + "epoch": 0.9578600360323525, + "grad_norm": 0.6608107686042786, + "learning_rate": 5.333628692509128e-05, + "loss": 0.8155, + "step": 149930 + }, + { + "epoch": 0.9579239231820912, + "grad_norm": 0.8952988982200623, + "learning_rate": 5.33312804059343e-05, + "loss": 0.7765, + "step": 149940 + }, + { + "epoch": 0.9579878103318299, + "grad_norm": 0.556867241859436, + "learning_rate": 5.3326273853228435e-05, + "loss": 0.9709, + "step": 149950 + }, + { + "epoch": 0.9580516974815686, + "grad_norm": 1.2768900394439697, + "learning_rate": 5.332126726702413e-05, + "loss": 0.8193, + "step": 149960 + }, 
+ { + "epoch": 0.9581155846313073, + "grad_norm": 1.526533603668213, + "learning_rate": 5.3316260647371785e-05, + "loss": 0.84, + "step": 149970 + }, + { + "epoch": 0.958179471781046, + "grad_norm": 0.7335947155952454, + "learning_rate": 5.3311253994321816e-05, + "loss": 0.7598, + "step": 149980 + }, + { + "epoch": 0.9582433589307847, + "grad_norm": 0.5931270718574524, + "learning_rate": 5.3306247307924676e-05, + "loss": 0.9405, + "step": 149990 + }, + { + "epoch": 0.9583072460805233, + "grad_norm": 0.8060052394866943, + "learning_rate": 5.330124058823074e-05, + "loss": 0.8914, + "step": 150000 + }, + { + "epoch": 0.958371133230262, + "grad_norm": 1.0445197820663452, + "learning_rate": 5.3296233835290485e-05, + "loss": 1.0033, + "step": 150010 + }, + { + "epoch": 0.9584350203800007, + "grad_norm": 1.0918605327606201, + "learning_rate": 5.329122704915428e-05, + "loss": 0.914, + "step": 150020 + }, + { + "epoch": 0.9584989075297394, + "grad_norm": 1.1090441942214966, + "learning_rate": 5.328622022987257e-05, + "loss": 1.0159, + "step": 150030 + }, + { + "epoch": 0.9585627946794781, + "grad_norm": 0.9032556414604187, + "learning_rate": 5.328121337749579e-05, + "loss": 0.7714, + "step": 150040 + }, + { + "epoch": 0.9586266818292168, + "grad_norm": 0.678882896900177, + "learning_rate": 5.3276206492074344e-05, + "loss": 0.8174, + "step": 150050 + }, + { + "epoch": 0.9586905689789555, + "grad_norm": 1.054736852645874, + "learning_rate": 5.327119957365867e-05, + "loss": 0.7512, + "step": 150060 + }, + { + "epoch": 0.9587544561286943, + "grad_norm": 0.6916370987892151, + "learning_rate": 5.326619262229918e-05, + "loss": 1.21, + "step": 150070 + }, + { + "epoch": 0.958818343278433, + "grad_norm": 1.1509549617767334, + "learning_rate": 5.326118563804632e-05, + "loss": 0.9652, + "step": 150080 + }, + { + "epoch": 0.9588822304281717, + "grad_norm": 0.5059418082237244, + "learning_rate": 5.325617862095049e-05, + "loss": 0.8353, + "step": 150090 + }, + { + "epoch": 0.9589461175779104, + "grad_norm": 1.0998530387878418, + "learning_rate": 5.325117157106212e-05, + "loss": 1.0926, + "step": 150100 + }, + { + "epoch": 0.9590100047276491, + "grad_norm": 0.799642026424408, + "learning_rate": 5.324616448843165e-05, + "loss": 0.8287, + "step": 150110 + }, + { + "epoch": 0.9590738918773878, + "grad_norm": 0.6795291304588318, + "learning_rate": 5.3241157373109485e-05, + "loss": 0.8267, + "step": 150120 + }, + { + "epoch": 0.9591377790271265, + "grad_norm": 0.9579530954360962, + "learning_rate": 5.323615022514607e-05, + "loss": 0.8384, + "step": 150130 + }, + { + "epoch": 0.9592016661768652, + "grad_norm": 1.0019688606262207, + "learning_rate": 5.3231143044591816e-05, + "loss": 0.7948, + "step": 150140 + }, + { + "epoch": 0.9592655533266039, + "grad_norm": 1.0456329584121704, + "learning_rate": 5.322613583149715e-05, + "loss": 1.0149, + "step": 150150 + }, + { + "epoch": 0.9593294404763426, + "grad_norm": 1.1081980466842651, + "learning_rate": 5.322112858591252e-05, + "loss": 1.0692, + "step": 150160 + }, + { + "epoch": 0.9593933276260813, + "grad_norm": 1.2669342756271362, + "learning_rate": 5.3216121307888336e-05, + "loss": 0.8101, + "step": 150170 + }, + { + "epoch": 0.95945721477582, + "grad_norm": 1.001755714416504, + "learning_rate": 5.3211113997475016e-05, + "loss": 0.6696, + "step": 150180 + }, + { + "epoch": 0.9595211019255587, + "grad_norm": 0.6659078001976013, + "learning_rate": 5.320610665472301e-05, + "loss": 0.713, + "step": 150190 + }, + { + "epoch": 0.9595849890752974, + "grad_norm": 
0.7698034048080444, + "learning_rate": 5.320109927968273e-05, + "loss": 0.8432, + "step": 150200 + }, + { + "epoch": 0.9596488762250361, + "grad_norm": 0.7013608813285828, + "learning_rate": 5.319609187240462e-05, + "loss": 1.1162, + "step": 150210 + }, + { + "epoch": 0.9597127633747748, + "grad_norm": 0.7664918899536133, + "learning_rate": 5.319108443293909e-05, + "loss": 0.9773, + "step": 150220 + }, + { + "epoch": 0.9597766505245136, + "grad_norm": 1.7179349660873413, + "learning_rate": 5.3186076961336584e-05, + "loss": 0.8648, + "step": 150230 + }, + { + "epoch": 0.9598405376742521, + "grad_norm": 0.5642333030700684, + "learning_rate": 5.318106945764752e-05, + "loss": 0.8806, + "step": 150240 + }, + { + "epoch": 0.9599044248239909, + "grad_norm": 1.004900336265564, + "learning_rate": 5.317606192192235e-05, + "loss": 0.8323, + "step": 150250 + }, + { + "epoch": 0.9599683119737296, + "grad_norm": 0.6977973580360413, + "learning_rate": 5.317105435421148e-05, + "loss": 1.0794, + "step": 150260 + }, + { + "epoch": 0.9600321991234683, + "grad_norm": 0.8252426385879517, + "learning_rate": 5.316604675456535e-05, + "loss": 0.7651, + "step": 150270 + }, + { + "epoch": 0.960096086273207, + "grad_norm": 1.0018861293792725, + "learning_rate": 5.316103912303438e-05, + "loss": 0.9188, + "step": 150280 + }, + { + "epoch": 0.9601599734229457, + "grad_norm": 0.829059362411499, + "learning_rate": 5.3156031459669035e-05, + "loss": 0.756, + "step": 150290 + }, + { + "epoch": 0.9602238605726844, + "grad_norm": 1.1105847358703613, + "learning_rate": 5.31510237645197e-05, + "loss": 0.7177, + "step": 150300 + }, + { + "epoch": 0.9602877477224231, + "grad_norm": 0.7792565822601318, + "learning_rate": 5.314601603763684e-05, + "loss": 1.1403, + "step": 150310 + }, + { + "epoch": 0.9603516348721618, + "grad_norm": 0.5869541764259338, + "learning_rate": 5.314100827907087e-05, + "loss": 0.8808, + "step": 150320 + }, + { + "epoch": 0.9604155220219005, + "grad_norm": 0.8240609169006348, + "learning_rate": 5.313600048887224e-05, + "loss": 0.8968, + "step": 150330 + }, + { + "epoch": 0.9604794091716392, + "grad_norm": 0.8057375550270081, + "learning_rate": 5.313099266709136e-05, + "loss": 0.856, + "step": 150340 + }, + { + "epoch": 0.9605432963213779, + "grad_norm": 0.7334375977516174, + "learning_rate": 5.312598481377869e-05, + "loss": 1.004, + "step": 150350 + }, + { + "epoch": 0.9606071834711166, + "grad_norm": 0.95655757188797, + "learning_rate": 5.3120976928984635e-05, + "loss": 0.9142, + "step": 150360 + }, + { + "epoch": 0.9606710706208553, + "grad_norm": 0.6239562630653381, + "learning_rate": 5.311596901275965e-05, + "loss": 0.9041, + "step": 150370 + }, + { + "epoch": 0.960734957770594, + "grad_norm": 2.0877952575683594, + "learning_rate": 5.3110961065154154e-05, + "loss": 1.0505, + "step": 150380 + }, + { + "epoch": 0.9607988449203327, + "grad_norm": 1.8619037866592407, + "learning_rate": 5.31059530862186e-05, + "loss": 0.8509, + "step": 150390 + }, + { + "epoch": 0.9608627320700714, + "grad_norm": 1.0404013395309448, + "learning_rate": 5.310094507600338e-05, + "loss": 0.9482, + "step": 150400 + }, + { + "epoch": 0.9609266192198102, + "grad_norm": 0.5970574617385864, + "learning_rate": 5.3095937034558994e-05, + "loss": 1.0259, + "step": 150410 + }, + { + "epoch": 0.9609905063695489, + "grad_norm": 0.9825915098190308, + "learning_rate": 5.309092896193584e-05, + "loss": 1.0407, + "step": 150420 + }, + { + "epoch": 0.9610543935192876, + "grad_norm": 0.7270873188972473, + "learning_rate": 5.308592085818435e-05, + 
"loss": 0.7591, + "step": 150430 + }, + { + "epoch": 0.9611182806690263, + "grad_norm": 1.053653597831726, + "learning_rate": 5.308091272335497e-05, + "loss": 0.8987, + "step": 150440 + }, + { + "epoch": 0.961182167818765, + "grad_norm": 0.8554880023002625, + "learning_rate": 5.307590455749812e-05, + "loss": 0.7091, + "step": 150450 + }, + { + "epoch": 0.9612460549685037, + "grad_norm": 0.5443610548973083, + "learning_rate": 5.307089636066427e-05, + "loss": 0.9389, + "step": 150460 + }, + { + "epoch": 0.9613099421182424, + "grad_norm": 0.6648747324943542, + "learning_rate": 5.306588813290383e-05, + "loss": 1.0063, + "step": 150470 + }, + { + "epoch": 0.961373829267981, + "grad_norm": 1.133631944656372, + "learning_rate": 5.306087987426725e-05, + "loss": 0.8192, + "step": 150480 + }, + { + "epoch": 0.9614377164177197, + "grad_norm": 1.77051842212677, + "learning_rate": 5.305587158480496e-05, + "loss": 0.7231, + "step": 150490 + }, + { + "epoch": 0.9615016035674584, + "grad_norm": 1.1798468828201294, + "learning_rate": 5.3050863264567396e-05, + "loss": 0.6667, + "step": 150500 + }, + { + "epoch": 0.9615654907171971, + "grad_norm": 1.529371976852417, + "learning_rate": 5.3045854913605e-05, + "loss": 0.6694, + "step": 150510 + }, + { + "epoch": 0.9616293778669358, + "grad_norm": 0.8843643665313721, + "learning_rate": 5.30408465319682e-05, + "loss": 0.9087, + "step": 150520 + }, + { + "epoch": 0.9616932650166745, + "grad_norm": 0.660963773727417, + "learning_rate": 5.303583811970746e-05, + "loss": 0.7277, + "step": 150530 + }, + { + "epoch": 0.9617571521664132, + "grad_norm": 1.6010621786117554, + "learning_rate": 5.3030829676873196e-05, + "loss": 0.89, + "step": 150540 + }, + { + "epoch": 0.9618210393161519, + "grad_norm": 1.0877952575683594, + "learning_rate": 5.3025821203515855e-05, + "loss": 0.7486, + "step": 150550 + }, + { + "epoch": 0.9618849264658906, + "grad_norm": 1.9828286170959473, + "learning_rate": 5.302081269968587e-05, + "loss": 0.8163, + "step": 150560 + }, + { + "epoch": 0.9619488136156293, + "grad_norm": 0.7645601630210876, + "learning_rate": 5.301580416543369e-05, + "loss": 0.7069, + "step": 150570 + }, + { + "epoch": 0.962012700765368, + "grad_norm": 0.8405600190162659, + "learning_rate": 5.301079560080976e-05, + "loss": 0.859, + "step": 150580 + }, + { + "epoch": 0.9620765879151068, + "grad_norm": 1.2271150350570679, + "learning_rate": 5.3005787005864515e-05, + "loss": 0.7004, + "step": 150590 + }, + { + "epoch": 0.9621404750648455, + "grad_norm": 1.4471722841262817, + "learning_rate": 5.3000778380648396e-05, + "loss": 0.8751, + "step": 150600 + }, + { + "epoch": 0.9622043622145842, + "grad_norm": 0.9273978471755981, + "learning_rate": 5.299576972521183e-05, + "loss": 0.9883, + "step": 150610 + }, + { + "epoch": 0.9622682493643229, + "grad_norm": 0.882642924785614, + "learning_rate": 5.299076103960528e-05, + "loss": 0.8676, + "step": 150620 + }, + { + "epoch": 0.9623321365140616, + "grad_norm": 3.830977201461792, + "learning_rate": 5.298575232387918e-05, + "loss": 0.8193, + "step": 150630 + }, + { + "epoch": 0.9623960236638003, + "grad_norm": 1.230293869972229, + "learning_rate": 5.298074357808397e-05, + "loss": 0.8701, + "step": 150640 + }, + { + "epoch": 0.962459910813539, + "grad_norm": 1.0408953428268433, + "learning_rate": 5.29757348022701e-05, + "loss": 1.0132, + "step": 150650 + }, + { + "epoch": 0.9625237979632777, + "grad_norm": 1.4850291013717651, + "learning_rate": 5.297072599648799e-05, + "loss": 0.747, + "step": 150660 + }, + { + "epoch": 0.9625876851130164, + 
"grad_norm": 1.36034095287323, + "learning_rate": 5.296571716078811e-05, + "loss": 0.8511, + "step": 150670 + }, + { + "epoch": 0.9626515722627551, + "grad_norm": 0.6303446292877197, + "learning_rate": 5.29607082952209e-05, + "loss": 0.8755, + "step": 150680 + }, + { + "epoch": 0.9627154594124938, + "grad_norm": 1.7142499685287476, + "learning_rate": 5.2955699399836776e-05, + "loss": 0.8461, + "step": 150690 + }, + { + "epoch": 0.9627793465622325, + "grad_norm": 0.9409696459770203, + "learning_rate": 5.2950690474686215e-05, + "loss": 0.8414, + "step": 150700 + }, + { + "epoch": 0.9628432337119712, + "grad_norm": 1.2119961977005005, + "learning_rate": 5.2945681519819646e-05, + "loss": 0.7946, + "step": 150710 + }, + { + "epoch": 0.9629071208617099, + "grad_norm": 0.9473667740821838, + "learning_rate": 5.2940672535287516e-05, + "loss": 0.8334, + "step": 150720 + }, + { + "epoch": 0.9629710080114485, + "grad_norm": 0.7992385625839233, + "learning_rate": 5.2935663521140274e-05, + "loss": 0.8921, + "step": 150730 + }, + { + "epoch": 0.9630348951611872, + "grad_norm": 1.3898727893829346, + "learning_rate": 5.293065447742835e-05, + "loss": 0.6968, + "step": 150740 + }, + { + "epoch": 0.9630987823109259, + "grad_norm": 0.7555790543556213, + "learning_rate": 5.292564540420221e-05, + "loss": 0.9248, + "step": 150750 + }, + { + "epoch": 0.9631626694606646, + "grad_norm": 1.3342205286026, + "learning_rate": 5.292063630151228e-05, + "loss": 0.6645, + "step": 150760 + }, + { + "epoch": 0.9632265566104034, + "grad_norm": 1.6979509592056274, + "learning_rate": 5.2915627169409035e-05, + "loss": 1.0116, + "step": 150770 + }, + { + "epoch": 0.9632904437601421, + "grad_norm": 0.8660208582878113, + "learning_rate": 5.291061800794288e-05, + "loss": 0.8999, + "step": 150780 + }, + { + "epoch": 0.9633543309098808, + "grad_norm": 1.2357333898544312, + "learning_rate": 5.29056088171643e-05, + "loss": 0.6997, + "step": 150790 + }, + { + "epoch": 0.9634182180596195, + "grad_norm": 0.9448620676994324, + "learning_rate": 5.290059959712371e-05, + "loss": 0.7693, + "step": 150800 + }, + { + "epoch": 0.9634821052093582, + "grad_norm": 0.7834548354148865, + "learning_rate": 5.289559034787158e-05, + "loss": 0.7257, + "step": 150810 + }, + { + "epoch": 0.9635459923590969, + "grad_norm": 0.8070199489593506, + "learning_rate": 5.2890581069458355e-05, + "loss": 0.7485, + "step": 150820 + }, + { + "epoch": 0.9636098795088356, + "grad_norm": 1.6272724866867065, + "learning_rate": 5.288557176193447e-05, + "loss": 1.0041, + "step": 150830 + }, + { + "epoch": 0.9636737666585743, + "grad_norm": 0.8953685164451599, + "learning_rate": 5.288056242535039e-05, + "loss": 0.9897, + "step": 150840 + }, + { + "epoch": 0.963737653808313, + "grad_norm": 0.6054562330245972, + "learning_rate": 5.2875553059756545e-05, + "loss": 0.8495, + "step": 150850 + }, + { + "epoch": 0.9638015409580517, + "grad_norm": 1.346152663230896, + "learning_rate": 5.28705436652034e-05, + "loss": 0.9919, + "step": 150860 + }, + { + "epoch": 0.9638654281077904, + "grad_norm": 1.1231837272644043, + "learning_rate": 5.286553424174139e-05, + "loss": 0.8157, + "step": 150870 + }, + { + "epoch": 0.9639293152575291, + "grad_norm": 3.220287561416626, + "learning_rate": 5.286052478942097e-05, + "loss": 0.8534, + "step": 150880 + }, + { + "epoch": 0.9639932024072678, + "grad_norm": 0.7934685945510864, + "learning_rate": 5.28555153082926e-05, + "loss": 0.8603, + "step": 150890 + }, + { + "epoch": 0.9640570895570065, + "grad_norm": 0.9011366963386536, + "learning_rate": 
5.2850505798406716e-05, + "loss": 0.846, + "step": 150900 + }, + { + "epoch": 0.9641209767067452, + "grad_norm": 0.7348203659057617, + "learning_rate": 5.2845496259813773e-05, + "loss": 1.0849, + "step": 150910 + }, + { + "epoch": 0.9641848638564839, + "grad_norm": 1.1324224472045898, + "learning_rate": 5.284098765057728e-05, + "loss": 1.0193, + "step": 150920 + }, + { + "epoch": 0.9642487510062226, + "grad_norm": 1.1051008701324463, + "learning_rate": 5.283597805757992e-05, + "loss": 1.0603, + "step": 150930 + }, + { + "epoch": 0.9643126381559614, + "grad_norm": 1.0409125089645386, + "learning_rate": 5.28309684360218e-05, + "loss": 1.0198, + "step": 150940 + }, + { + "epoch": 0.9643765253057001, + "grad_norm": 1.0346943140029907, + "learning_rate": 5.282595878595338e-05, + "loss": 0.9523, + "step": 150950 + }, + { + "epoch": 0.9644404124554388, + "grad_norm": 0.9301409125328064, + "learning_rate": 5.282094910742511e-05, + "loss": 0.8966, + "step": 150960 + }, + { + "epoch": 0.9645042996051774, + "grad_norm": 1.0205413103103638, + "learning_rate": 5.281593940048745e-05, + "loss": 0.8136, + "step": 150970 + }, + { + "epoch": 0.9645681867549161, + "grad_norm": 0.4575611650943756, + "learning_rate": 5.2810929665190836e-05, + "loss": 0.9879, + "step": 150980 + }, + { + "epoch": 0.9646320739046548, + "grad_norm": 1.4380875825881958, + "learning_rate": 5.280591990158572e-05, + "loss": 0.8518, + "step": 150990 + }, + { + "epoch": 0.9646959610543935, + "grad_norm": 1.1791918277740479, + "learning_rate": 5.280091010972258e-05, + "loss": 1.0687, + "step": 151000 + }, + { + "epoch": 0.9647598482041322, + "grad_norm": 1.3037655353546143, + "learning_rate": 5.279590028965185e-05, + "loss": 1.2233, + "step": 151010 + }, + { + "epoch": 0.9648237353538709, + "grad_norm": 1.227778673171997, + "learning_rate": 5.2790890441423965e-05, + "loss": 0.9386, + "step": 151020 + }, + { + "epoch": 0.9648876225036096, + "grad_norm": 0.6218796968460083, + "learning_rate": 5.2785880565089416e-05, + "loss": 1.1117, + "step": 151030 + }, + { + "epoch": 0.9649515096533483, + "grad_norm": 1.0044218301773071, + "learning_rate": 5.2780870660698634e-05, + "loss": 0.8787, + "step": 151040 + }, + { + "epoch": 0.965015396803087, + "grad_norm": 0.7901304960250854, + "learning_rate": 5.2775860728302084e-05, + "loss": 0.8695, + "step": 151050 + }, + { + "epoch": 0.9650792839528257, + "grad_norm": 1.453648328781128, + "learning_rate": 5.277085076795021e-05, + "loss": 0.8621, + "step": 151060 + }, + { + "epoch": 0.9651431711025644, + "grad_norm": 0.5584386587142944, + "learning_rate": 5.2765840779693474e-05, + "loss": 0.8665, + "step": 151070 + }, + { + "epoch": 0.9652070582523031, + "grad_norm": 0.5596110224723816, + "learning_rate": 5.2760830763582326e-05, + "loss": 0.8572, + "step": 151080 + }, + { + "epoch": 0.9652709454020418, + "grad_norm": 1.8040897846221924, + "learning_rate": 5.275582071966723e-05, + "loss": 0.885, + "step": 151090 + }, + { + "epoch": 0.9653348325517805, + "grad_norm": 0.8948487043380737, + "learning_rate": 5.275081064799864e-05, + "loss": 1.2298, + "step": 151100 + }, + { + "epoch": 0.9653987197015192, + "grad_norm": 0.6678011417388916, + "learning_rate": 5.2745800548626986e-05, + "loss": 0.9738, + "step": 151110 + }, + { + "epoch": 0.965462606851258, + "grad_norm": 0.5756567716598511, + "learning_rate": 5.274079042160278e-05, + "loss": 0.7736, + "step": 151120 + }, + { + "epoch": 0.9655264940009967, + "grad_norm": 2.063096046447754, + "learning_rate": 5.273578026697642e-05, + "loss": 0.7518, + "step": 
151130 + }, + { + "epoch": 0.9655903811507354, + "grad_norm": 0.6849361658096313, + "learning_rate": 5.2730770084798384e-05, + "loss": 0.7166, + "step": 151140 + }, + { + "epoch": 0.9656542683004741, + "grad_norm": 1.2403055429458618, + "learning_rate": 5.272575987511914e-05, + "loss": 0.9516, + "step": 151150 + }, + { + "epoch": 0.9657181554502128, + "grad_norm": 1.0753352642059326, + "learning_rate": 5.272074963798913e-05, + "loss": 0.6363, + "step": 151160 + }, + { + "epoch": 0.9657820425999515, + "grad_norm": 0.6305554509162903, + "learning_rate": 5.271573937345882e-05, + "loss": 0.74, + "step": 151170 + }, + { + "epoch": 0.9658459297496902, + "grad_norm": 0.7894411087036133, + "learning_rate": 5.271072908157866e-05, + "loss": 0.7682, + "step": 151180 + }, + { + "epoch": 0.9659098168994289, + "grad_norm": 0.7625752687454224, + "learning_rate": 5.270571876239911e-05, + "loss": 0.7678, + "step": 151190 + }, + { + "epoch": 0.9659737040491676, + "grad_norm": 0.777441143989563, + "learning_rate": 5.270070841597062e-05, + "loss": 0.939, + "step": 151200 + }, + { + "epoch": 0.9660375911989062, + "grad_norm": 0.7669252157211304, + "learning_rate": 5.269569804234369e-05, + "loss": 1.0347, + "step": 151210 + }, + { + "epoch": 0.9661014783486449, + "grad_norm": 1.6018074750900269, + "learning_rate": 5.2691188682866444e-05, + "loss": 0.9971, + "step": 151220 + }, + { + "epoch": 0.9661653654983836, + "grad_norm": 1.066701889038086, + "learning_rate": 5.268617825770142e-05, + "loss": 0.8429, + "step": 151230 + }, + { + "epoch": 0.9662292526481223, + "grad_norm": 1.0202710628509521, + "learning_rate": 5.268116780548426e-05, + "loss": 0.8482, + "step": 151240 + }, + { + "epoch": 0.966293139797861, + "grad_norm": 0.9673142433166504, + "learning_rate": 5.267615732626542e-05, + "loss": 0.9325, + "step": 151250 + }, + { + "epoch": 0.9663570269475997, + "grad_norm": 0.8143606781959534, + "learning_rate": 5.2671146820095365e-05, + "loss": 0.8128, + "step": 151260 + }, + { + "epoch": 0.9664209140973384, + "grad_norm": 1.2554757595062256, + "learning_rate": 5.266613628702456e-05, + "loss": 0.8891, + "step": 151270 + }, + { + "epoch": 0.9664848012470771, + "grad_norm": 1.1865863800048828, + "learning_rate": 5.2661125727103434e-05, + "loss": 0.9087, + "step": 151280 + }, + { + "epoch": 0.9665486883968158, + "grad_norm": 0.8015139102935791, + "learning_rate": 5.265611514038248e-05, + "loss": 0.9303, + "step": 151290 + }, + { + "epoch": 0.9666125755465546, + "grad_norm": 0.9343031048774719, + "learning_rate": 5.2651104526912145e-05, + "loss": 1.0272, + "step": 151300 + }, + { + "epoch": 0.9666764626962933, + "grad_norm": 0.8166914582252502, + "learning_rate": 5.26460938867429e-05, + "loss": 1.2548, + "step": 151310 + }, + { + "epoch": 0.966740349846032, + "grad_norm": 0.9215566515922546, + "learning_rate": 5.264108321992518e-05, + "loss": 0.8869, + "step": 151320 + }, + { + "epoch": 0.9668042369957707, + "grad_norm": 1.8220895528793335, + "learning_rate": 5.2636072526509486e-05, + "loss": 1.0963, + "step": 151330 + }, + { + "epoch": 0.9668681241455094, + "grad_norm": 1.234108805656433, + "learning_rate": 5.2631061806546255e-05, + "loss": 0.9705, + "step": 151340 + }, + { + "epoch": 0.9669320112952481, + "grad_norm": 1.0234565734863281, + "learning_rate": 5.2626051060085956e-05, + "loss": 0.7106, + "step": 151350 + }, + { + "epoch": 0.9669958984449868, + "grad_norm": 0.9547996520996094, + "learning_rate": 5.262104028717906e-05, + "loss": 0.9427, + "step": 151360 + }, + { + "epoch": 0.9670597855947255, + 
"grad_norm": 0.819320559501648, + "learning_rate": 5.261602948787601e-05, + "loss": 0.8402, + "step": 151370 + }, + { + "epoch": 0.9671236727444642, + "grad_norm": 1.10805082321167, + "learning_rate": 5.261101866222728e-05, + "loss": 0.9087, + "step": 151380 + }, + { + "epoch": 0.9671875598942029, + "grad_norm": 0.9255377650260925, + "learning_rate": 5.260600781028334e-05, + "loss": 0.898, + "step": 151390 + }, + { + "epoch": 0.9672514470439416, + "grad_norm": 0.827276885509491, + "learning_rate": 5.2600996932094634e-05, + "loss": 0.8182, + "step": 151400 + }, + { + "epoch": 0.9673153341936803, + "grad_norm": 1.3312925100326538, + "learning_rate": 5.259598602771165e-05, + "loss": 0.8368, + "step": 151410 + }, + { + "epoch": 0.967379221343419, + "grad_norm": 2.995258092880249, + "learning_rate": 5.2590975097184844e-05, + "loss": 0.7184, + "step": 151420 + }, + { + "epoch": 0.9674431084931577, + "grad_norm": 0.8039790391921997, + "learning_rate": 5.258596414056467e-05, + "loss": 0.6866, + "step": 151430 + }, + { + "epoch": 0.9675069956428964, + "grad_norm": 1.1383510828018188, + "learning_rate": 5.25809531579016e-05, + "loss": 0.6508, + "step": 151440 + }, + { + "epoch": 0.967570882792635, + "grad_norm": 0.8737443089485168, + "learning_rate": 5.25759421492461e-05, + "loss": 0.8217, + "step": 151450 + }, + { + "epoch": 0.9676347699423737, + "grad_norm": 0.9597667455673218, + "learning_rate": 5.257093111464865e-05, + "loss": 0.8019, + "step": 151460 + }, + { + "epoch": 0.9676986570921124, + "grad_norm": 0.9724913239479065, + "learning_rate": 5.256592005415968e-05, + "loss": 1.0306, + "step": 151470 + }, + { + "epoch": 0.9677625442418512, + "grad_norm": 0.9813938736915588, + "learning_rate": 5.256090896782968e-05, + "loss": 1.2299, + "step": 151480 + }, + { + "epoch": 0.9678264313915899, + "grad_norm": 0.5077504515647888, + "learning_rate": 5.2555897855709114e-05, + "loss": 0.8514, + "step": 151490 + }, + { + "epoch": 0.9678903185413286, + "grad_norm": 0.8173817992210388, + "learning_rate": 5.2550886717848436e-05, + "loss": 1.0456, + "step": 151500 + }, + { + "epoch": 0.9679542056910673, + "grad_norm": 0.8337730765342712, + "learning_rate": 5.254587555429813e-05, + "loss": 0.9812, + "step": 151510 + }, + { + "epoch": 0.968018092840806, + "grad_norm": 0.9300865530967712, + "learning_rate": 5.254086436510866e-05, + "loss": 0.696, + "step": 151520 + }, + { + "epoch": 0.9680819799905447, + "grad_norm": 0.712742805480957, + "learning_rate": 5.253585315033047e-05, + "loss": 0.8063, + "step": 151530 + }, + { + "epoch": 0.9681458671402834, + "grad_norm": 0.685977578163147, + "learning_rate": 5.253084191001406e-05, + "loss": 0.9119, + "step": 151540 + }, + { + "epoch": 0.9682097542900221, + "grad_norm": 1.3182839155197144, + "learning_rate": 5.2525830644209885e-05, + "loss": 0.8322, + "step": 151550 + }, + { + "epoch": 0.9682736414397608, + "grad_norm": 1.8923838138580322, + "learning_rate": 5.25208193529684e-05, + "loss": 0.8107, + "step": 151560 + }, + { + "epoch": 0.9683375285894995, + "grad_norm": 0.7532184720039368, + "learning_rate": 5.251580803634008e-05, + "loss": 0.6918, + "step": 151570 + }, + { + "epoch": 0.9684014157392382, + "grad_norm": 0.9623368382453918, + "learning_rate": 5.2510796694375406e-05, + "loss": 0.8558, + "step": 151580 + }, + { + "epoch": 0.9684653028889769, + "grad_norm": 0.7997425198554993, + "learning_rate": 5.2505785327124836e-05, + "loss": 0.9148, + "step": 151590 + }, + { + "epoch": 0.9685291900387156, + "grad_norm": 0.9608021378517151, + "learning_rate": 
5.250077393463884e-05, + "loss": 1.0758, + "step": 151600 + }, + { + "epoch": 0.9685930771884543, + "grad_norm": 1.5573434829711914, + "learning_rate": 5.2495762516967886e-05, + "loss": 0.8347, + "step": 151610 + }, + { + "epoch": 0.968656964338193, + "grad_norm": 2.805736541748047, + "learning_rate": 5.2490751074162446e-05, + "loss": 0.8639, + "step": 151620 + }, + { + "epoch": 0.9687208514879317, + "grad_norm": 0.5936098098754883, + "learning_rate": 5.2485739606272985e-05, + "loss": 0.8002, + "step": 151630 + }, + { + "epoch": 0.9687847386376705, + "grad_norm": 0.6356973648071289, + "learning_rate": 5.248072811334997e-05, + "loss": 0.8728, + "step": 151640 + }, + { + "epoch": 0.9688486257874092, + "grad_norm": 1.3857841491699219, + "learning_rate": 5.2475716595443894e-05, + "loss": 1.0488, + "step": 151650 + }, + { + "epoch": 0.9689125129371479, + "grad_norm": 1.0003827810287476, + "learning_rate": 5.24707050526052e-05, + "loss": 0.8302, + "step": 151660 + }, + { + "epoch": 0.9689764000868866, + "grad_norm": 0.884917140007019, + "learning_rate": 5.246569348488436e-05, + "loss": 0.734, + "step": 151670 + }, + { + "epoch": 0.9690402872366253, + "grad_norm": 0.648077130317688, + "learning_rate": 5.246068189233186e-05, + "loss": 1.0681, + "step": 151680 + }, + { + "epoch": 0.969104174386364, + "grad_norm": 0.6451058387756348, + "learning_rate": 5.245567027499816e-05, + "loss": 0.9954, + "step": 151690 + }, + { + "epoch": 0.9691680615361026, + "grad_norm": 1.31290602684021, + "learning_rate": 5.2450658632933736e-05, + "loss": 0.9458, + "step": 151700 + }, + { + "epoch": 0.9692319486858413, + "grad_norm": 0.7243294715881348, + "learning_rate": 5.244564696618907e-05, + "loss": 0.8819, + "step": 151710 + }, + { + "epoch": 0.96929583583558, + "grad_norm": 1.8024976253509521, + "learning_rate": 5.244063527481462e-05, + "loss": 0.7875, + "step": 151720 + }, + { + "epoch": 0.9693597229853187, + "grad_norm": 1.2114737033843994, + "learning_rate": 5.243562355886086e-05, + "loss": 0.7601, + "step": 151730 + }, + { + "epoch": 0.9694236101350574, + "grad_norm": 1.0889559984207153, + "learning_rate": 5.243061181837826e-05, + "loss": 0.9074, + "step": 151740 + }, + { + "epoch": 0.9694874972847961, + "grad_norm": 0.7654042840003967, + "learning_rate": 5.24256000534173e-05, + "loss": 0.7559, + "step": 151750 + }, + { + "epoch": 0.9695513844345348, + "grad_norm": 0.9789912700653076, + "learning_rate": 5.242058826402846e-05, + "loss": 0.8855, + "step": 151760 + }, + { + "epoch": 0.9696152715842735, + "grad_norm": 2.210740089416504, + "learning_rate": 5.241557645026219e-05, + "loss": 0.9904, + "step": 151770 + }, + { + "epoch": 0.9696791587340122, + "grad_norm": 0.6800025105476379, + "learning_rate": 5.241056461216898e-05, + "loss": 0.9026, + "step": 151780 + }, + { + "epoch": 0.9697430458837509, + "grad_norm": 1.204058289527893, + "learning_rate": 5.240555274979929e-05, + "loss": 1.0395, + "step": 151790 + }, + { + "epoch": 0.9698069330334896, + "grad_norm": 0.8177311420440674, + "learning_rate": 5.240054086320361e-05, + "loss": 1.1084, + "step": 151800 + }, + { + "epoch": 0.9698708201832283, + "grad_norm": 0.714911699295044, + "learning_rate": 5.239552895243241e-05, + "loss": 0.8985, + "step": 151810 + }, + { + "epoch": 0.969934707332967, + "grad_norm": 2.143303155899048, + "learning_rate": 5.239051701753614e-05, + "loss": 1.1244, + "step": 151820 + }, + { + "epoch": 0.9699985944827058, + "grad_norm": 0.977552592754364, + "learning_rate": 5.2385505058565324e-05, + "loss": 1.0981, + "step": 151830 + }, + { + 
"epoch": 0.9700624816324445, + "grad_norm": 2.233025312423706, + "learning_rate": 5.2380493075570394e-05, + "loss": 1.1075, + "step": 151840 + }, + { + "epoch": 0.9701263687821832, + "grad_norm": 1.9697365760803223, + "learning_rate": 5.237548106860183e-05, + "loss": 0.8237, + "step": 151850 + }, + { + "epoch": 0.9701902559319219, + "grad_norm": 3.052886486053467, + "learning_rate": 5.237046903771012e-05, + "loss": 0.762, + "step": 151860 + }, + { + "epoch": 0.9702541430816606, + "grad_norm": 2.318892002105713, + "learning_rate": 5.236545698294575e-05, + "loss": 0.7751, + "step": 151870 + }, + { + "epoch": 0.9703180302313993, + "grad_norm": 0.842376708984375, + "learning_rate": 5.2360444904359176e-05, + "loss": 0.8483, + "step": 151880 + }, + { + "epoch": 0.970381917381138, + "grad_norm": 1.4306411743164062, + "learning_rate": 5.235543280200088e-05, + "loss": 0.8895, + "step": 151890 + }, + { + "epoch": 0.9704458045308767, + "grad_norm": 4.7012939453125, + "learning_rate": 5.235042067592133e-05, + "loss": 1.0352, + "step": 151900 + }, + { + "epoch": 0.9705096916806154, + "grad_norm": 0.7660112977027893, + "learning_rate": 5.234540852617102e-05, + "loss": 0.6482, + "step": 151910 + }, + { + "epoch": 0.9705735788303541, + "grad_norm": 1.0199682712554932, + "learning_rate": 5.234039635280041e-05, + "loss": 0.9154, + "step": 151920 + }, + { + "epoch": 0.9706374659800928, + "grad_norm": 0.9796050190925598, + "learning_rate": 5.233538415585999e-05, + "loss": 0.9538, + "step": 151930 + }, + { + "epoch": 0.9707013531298314, + "grad_norm": 0.7912867069244385, + "learning_rate": 5.233037193540023e-05, + "loss": 0.7974, + "step": 151940 + }, + { + "epoch": 0.9707652402795701, + "grad_norm": 0.8988333344459534, + "learning_rate": 5.2325359691471606e-05, + "loss": 0.8347, + "step": 151950 + }, + { + "epoch": 0.9708291274293088, + "grad_norm": 0.6798145174980164, + "learning_rate": 5.2320347424124606e-05, + "loss": 0.8257, + "step": 151960 + }, + { + "epoch": 0.9708930145790475, + "grad_norm": 3.4491665363311768, + "learning_rate": 5.2315335133409694e-05, + "loss": 0.7976, + "step": 151970 + }, + { + "epoch": 0.9709569017287862, + "grad_norm": 0.5991364121437073, + "learning_rate": 5.2310322819377355e-05, + "loss": 0.9797, + "step": 151980 + }, + { + "epoch": 0.9710207888785249, + "grad_norm": 0.987819492816925, + "learning_rate": 5.2305310482078064e-05, + "loss": 0.9553, + "step": 151990 + }, + { + "epoch": 0.9710846760282636, + "grad_norm": 0.7812177538871765, + "learning_rate": 5.230029812156232e-05, + "loss": 0.7681, + "step": 152000 + }, + { + "epoch": 0.9711485631780024, + "grad_norm": 0.9155138731002808, + "learning_rate": 5.229528573788055e-05, + "loss": 0.7462, + "step": 152010 + }, + { + "epoch": 0.9712124503277411, + "grad_norm": 1.2678287029266357, + "learning_rate": 5.229027333108328e-05, + "loss": 1.0377, + "step": 152020 + }, + { + "epoch": 0.9712763374774798, + "grad_norm": 0.624622106552124, + "learning_rate": 5.228526090122099e-05, + "loss": 0.9255, + "step": 152030 + }, + { + "epoch": 0.9713402246272185, + "grad_norm": 1.6265813112258911, + "learning_rate": 5.228024844834414e-05, + "loss": 0.7908, + "step": 152040 + }, + { + "epoch": 0.9714041117769572, + "grad_norm": 0.7967225313186646, + "learning_rate": 5.227523597250321e-05, + "loss": 0.9871, + "step": 152050 + }, + { + "epoch": 0.9714679989266959, + "grad_norm": 1.2453161478042603, + "learning_rate": 5.227022347374868e-05, + "loss": 0.876, + "step": 152060 + }, + { + "epoch": 0.9715318860764346, + "grad_norm": 1.3441709280014038, 
+ "learning_rate": 5.226521095213105e-05, + "loss": 0.7978, + "step": 152070 + }, + { + "epoch": 0.9715957732261733, + "grad_norm": 1.4316071271896362, + "learning_rate": 5.2260198407700775e-05, + "loss": 0.7867, + "step": 152080 + }, + { + "epoch": 0.971659660375912, + "grad_norm": 2.101490020751953, + "learning_rate": 5.225518584050835e-05, + "loss": 0.9115, + "step": 152090 + }, + { + "epoch": 0.9717235475256507, + "grad_norm": 0.7814741730690002, + "learning_rate": 5.225017325060425e-05, + "loss": 0.8807, + "step": 152100 + }, + { + "epoch": 0.9717874346753894, + "grad_norm": 0.9148108959197998, + "learning_rate": 5.224516063803897e-05, + "loss": 0.876, + "step": 152110 + }, + { + "epoch": 0.9718513218251281, + "grad_norm": 0.9834555983543396, + "learning_rate": 5.2240148002862964e-05, + "loss": 0.8846, + "step": 152120 + }, + { + "epoch": 0.9719152089748668, + "grad_norm": 0.8105610013008118, + "learning_rate": 5.223513534512674e-05, + "loss": 0.8168, + "step": 152130 + }, + { + "epoch": 0.9719790961246055, + "grad_norm": 1.4332830905914307, + "learning_rate": 5.223012266488076e-05, + "loss": 0.8217, + "step": 152140 + }, + { + "epoch": 0.9720429832743442, + "grad_norm": 0.8803772330284119, + "learning_rate": 5.222510996217554e-05, + "loss": 0.8183, + "step": 152150 + }, + { + "epoch": 0.972106870424083, + "grad_norm": 1.188647985458374, + "learning_rate": 5.222009723706151e-05, + "loss": 0.9345, + "step": 152160 + }, + { + "epoch": 0.9721707575738217, + "grad_norm": 0.9444614052772522, + "learning_rate": 5.2215084489589194e-05, + "loss": 0.9522, + "step": 152170 + }, + { + "epoch": 0.9722346447235602, + "grad_norm": 1.4221155643463135, + "learning_rate": 5.2210071719809064e-05, + "loss": 0.977, + "step": 152180 + }, + { + "epoch": 0.972298531873299, + "grad_norm": 1.2089207172393799, + "learning_rate": 5.220505892777159e-05, + "loss": 0.8424, + "step": 152190 + }, + { + "epoch": 0.9723624190230377, + "grad_norm": 0.8503243327140808, + "learning_rate": 5.220004611352727e-05, + "loss": 0.96, + "step": 152200 + }, + { + "epoch": 0.9724263061727764, + "grad_norm": 0.7801720499992371, + "learning_rate": 5.219503327712656e-05, + "loss": 0.7537, + "step": 152210 + }, + { + "epoch": 0.9724901933225151, + "grad_norm": 0.8834246397018433, + "learning_rate": 5.219002041861999e-05, + "loss": 0.8585, + "step": 152220 + }, + { + "epoch": 0.9725540804722538, + "grad_norm": 0.9630089998245239, + "learning_rate": 5.218500753805802e-05, + "loss": 0.7415, + "step": 152230 + }, + { + "epoch": 0.9726179676219925, + "grad_norm": 0.6178570985794067, + "learning_rate": 5.217999463549113e-05, + "loss": 0.8743, + "step": 152240 + }, + { + "epoch": 0.9726818547717312, + "grad_norm": 1.1636070013046265, + "learning_rate": 5.217498171096982e-05, + "loss": 0.8063, + "step": 152250 + }, + { + "epoch": 0.9727457419214699, + "grad_norm": 0.6922101378440857, + "learning_rate": 5.216996876454454e-05, + "loss": 0.806, + "step": 152260 + }, + { + "epoch": 0.9728096290712086, + "grad_norm": 0.6797016859054565, + "learning_rate": 5.2164955796265814e-05, + "loss": 0.8855, + "step": 152270 + }, + { + "epoch": 0.9728735162209473, + "grad_norm": 0.6983970403671265, + "learning_rate": 5.21599428061841e-05, + "loss": 0.8559, + "step": 152280 + }, + { + "epoch": 0.972937403370686, + "grad_norm": 1.0314277410507202, + "learning_rate": 5.2154929794349894e-05, + "loss": 0.6021, + "step": 152290 + }, + { + "epoch": 0.9730012905204247, + "grad_norm": 0.8680412173271179, + "learning_rate": 5.214991676081369e-05, + "loss": 0.7651, + 
"step": 152300 + }, + { + "epoch": 0.9730651776701634, + "grad_norm": 0.8231766819953918, + "learning_rate": 5.214490370562596e-05, + "loss": 0.7858, + "step": 152310 + }, + { + "epoch": 0.9731290648199021, + "grad_norm": 0.8090435266494751, + "learning_rate": 5.2139890628837183e-05, + "loss": 1.0899, + "step": 152320 + }, + { + "epoch": 0.9731929519696408, + "grad_norm": 1.0858497619628906, + "learning_rate": 5.213487753049787e-05, + "loss": 0.8816, + "step": 152330 + }, + { + "epoch": 0.9732568391193795, + "grad_norm": 0.9355524778366089, + "learning_rate": 5.212986441065849e-05, + "loss": 1.0293, + "step": 152340 + }, + { + "epoch": 0.9733207262691183, + "grad_norm": 1.1359403133392334, + "learning_rate": 5.2124851269369534e-05, + "loss": 0.919, + "step": 152350 + }, + { + "epoch": 0.973384613418857, + "grad_norm": 1.9253411293029785, + "learning_rate": 5.211983810668148e-05, + "loss": 1.0969, + "step": 152360 + }, + { + "epoch": 0.9734485005685957, + "grad_norm": 0.8547667264938354, + "learning_rate": 5.2114824922644824e-05, + "loss": 0.6838, + "step": 152370 + }, + { + "epoch": 0.9735123877183344, + "grad_norm": 1.0332297086715698, + "learning_rate": 5.210981171731005e-05, + "loss": 0.8215, + "step": 152380 + }, + { + "epoch": 0.9735762748680731, + "grad_norm": 1.0458303689956665, + "learning_rate": 5.210479849072765e-05, + "loss": 0.7492, + "step": 152390 + }, + { + "epoch": 0.9736401620178118, + "grad_norm": 0.9344658851623535, + "learning_rate": 5.209978524294811e-05, + "loss": 0.741, + "step": 152400 + }, + { + "epoch": 0.9737040491675505, + "grad_norm": 1.4250115156173706, + "learning_rate": 5.209477197402192e-05, + "loss": 0.6877, + "step": 152410 + }, + { + "epoch": 0.9737679363172892, + "grad_norm": 0.9061084985733032, + "learning_rate": 5.208975868399956e-05, + "loss": 1.0584, + "step": 152420 + }, + { + "epoch": 0.9738318234670278, + "grad_norm": 0.806816041469574, + "learning_rate": 5.208474537293152e-05, + "loss": 0.6887, + "step": 152430 + }, + { + "epoch": 0.9738957106167665, + "grad_norm": 0.933032751083374, + "learning_rate": 5.207973204086829e-05, + "loss": 0.8562, + "step": 152440 + }, + { + "epoch": 0.9739595977665052, + "grad_norm": 1.0426222085952759, + "learning_rate": 5.207471868786036e-05, + "loss": 0.871, + "step": 152450 + }, + { + "epoch": 0.9740234849162439, + "grad_norm": 1.1309046745300293, + "learning_rate": 5.206970531395822e-05, + "loss": 0.7954, + "step": 152460 + }, + { + "epoch": 0.9740873720659826, + "grad_norm": 0.8131570816040039, + "learning_rate": 5.2064691919212364e-05, + "loss": 0.91, + "step": 152470 + }, + { + "epoch": 0.9741512592157213, + "grad_norm": 0.9664103388786316, + "learning_rate": 5.205967850367326e-05, + "loss": 1.0082, + "step": 152480 + }, + { + "epoch": 0.97421514636546, + "grad_norm": 1.055430293083191, + "learning_rate": 5.205466506739143e-05, + "loss": 0.9536, + "step": 152490 + }, + { + "epoch": 0.9742790335151987, + "grad_norm": 0.8127159476280212, + "learning_rate": 5.2049651610417326e-05, + "loss": 0.7859, + "step": 152500 + }, + { + "epoch": 0.9743429206649374, + "grad_norm": 0.7035512924194336, + "learning_rate": 5.204463813280147e-05, + "loss": 0.9262, + "step": 152510 + }, + { + "epoch": 0.9744068078146761, + "grad_norm": 0.881603479385376, + "learning_rate": 5.203962463459433e-05, + "loss": 0.7748, + "step": 152520 + }, + { + "epoch": 0.9744706949644149, + "grad_norm": 0.8797648549079895, + "learning_rate": 5.203461111584641e-05, + "loss": 1.044, + "step": 152530 + }, + { + "epoch": 0.9745345821141536, + 
"grad_norm": 0.8272404670715332, + "learning_rate": 5.202959757660819e-05, + "loss": 0.6735, + "step": 152540 + }, + { + "epoch": 0.9745984692638923, + "grad_norm": 0.9870911240577698, + "learning_rate": 5.202458401693017e-05, + "loss": 0.811, + "step": 152550 + }, + { + "epoch": 0.974662356413631, + "grad_norm": 0.838467001914978, + "learning_rate": 5.2019570436862844e-05, + "loss": 1.024, + "step": 152560 + }, + { + "epoch": 0.9747262435633697, + "grad_norm": 0.5068958401679993, + "learning_rate": 5.2014556836456685e-05, + "loss": 1.1225, + "step": 152570 + }, + { + "epoch": 0.9747901307131084, + "grad_norm": 0.6512907147407532, + "learning_rate": 5.2009543215762204e-05, + "loss": 0.8789, + "step": 152580 + }, + { + "epoch": 0.9748540178628471, + "grad_norm": 0.9875491857528687, + "learning_rate": 5.200452957482988e-05, + "loss": 0.7917, + "step": 152590 + }, + { + "epoch": 0.9749179050125858, + "grad_norm": 0.5896627902984619, + "learning_rate": 5.199951591371022e-05, + "loss": 1.141, + "step": 152600 + }, + { + "epoch": 0.9749817921623245, + "grad_norm": 0.9969107508659363, + "learning_rate": 5.199450223245369e-05, + "loss": 0.8511, + "step": 152610 + }, + { + "epoch": 0.9750456793120632, + "grad_norm": 0.7358691096305847, + "learning_rate": 5.1989488531110794e-05, + "loss": 0.7029, + "step": 152620 + }, + { + "epoch": 0.9751095664618019, + "grad_norm": 0.5284615159034729, + "learning_rate": 5.198447480973204e-05, + "loss": 0.7927, + "step": 152630 + }, + { + "epoch": 0.9751734536115406, + "grad_norm": 1.0184814929962158, + "learning_rate": 5.1979461068367904e-05, + "loss": 0.9482, + "step": 152640 + }, + { + "epoch": 0.9752373407612793, + "grad_norm": 1.5319088697433472, + "learning_rate": 5.197444730706889e-05, + "loss": 0.871, + "step": 152650 + }, + { + "epoch": 0.975301227911018, + "grad_norm": 1.3232473134994507, + "learning_rate": 5.196943352588548e-05, + "loss": 0.8189, + "step": 152660 + }, + { + "epoch": 0.9753651150607566, + "grad_norm": 0.698314368724823, + "learning_rate": 5.196441972486816e-05, + "loss": 0.8088, + "step": 152670 + }, + { + "epoch": 0.9754290022104953, + "grad_norm": 0.8359566926956177, + "learning_rate": 5.1959405904067446e-05, + "loss": 0.8434, + "step": 152680 + }, + { + "epoch": 0.975492889360234, + "grad_norm": 0.5400557518005371, + "learning_rate": 5.195439206353381e-05, + "loss": 0.8659, + "step": 152690 + }, + { + "epoch": 0.9755567765099727, + "grad_norm": 1.6195552349090576, + "learning_rate": 5.1949378203317764e-05, + "loss": 0.6596, + "step": 152700 + }, + { + "epoch": 0.9756206636597115, + "grad_norm": 1.1681946516036987, + "learning_rate": 5.1944364323469785e-05, + "loss": 0.8251, + "step": 152710 + }, + { + "epoch": 0.9756845508094502, + "grad_norm": 1.1966689825057983, + "learning_rate": 5.1939350424040376e-05, + "loss": 0.9019, + "step": 152720 + }, + { + "epoch": 0.9757484379591889, + "grad_norm": 0.7698038220405579, + "learning_rate": 5.193433650508004e-05, + "loss": 0.6619, + "step": 152730 + }, + { + "epoch": 0.9758123251089276, + "grad_norm": 1.1396318674087524, + "learning_rate": 5.192932256663925e-05, + "loss": 0.9543, + "step": 152740 + }, + { + "epoch": 0.9758762122586663, + "grad_norm": 0.9183258414268494, + "learning_rate": 5.1924308608768524e-05, + "loss": 1.0159, + "step": 152750 + }, + { + "epoch": 0.975940099408405, + "grad_norm": 0.7935616374015808, + "learning_rate": 5.1919294631518336e-05, + "loss": 0.9497, + "step": 152760 + }, + { + "epoch": 0.9760039865581437, + "grad_norm": 0.8395205140113831, + "learning_rate": 
5.1914280634939195e-05, + "loss": 0.9478, + "step": 152770 + }, + { + "epoch": 0.9760678737078824, + "grad_norm": 1.5380463600158691, + "learning_rate": 5.190926661908159e-05, + "loss": 0.9063, + "step": 152780 + }, + { + "epoch": 0.9761317608576211, + "grad_norm": 0.8509006500244141, + "learning_rate": 5.190425258399601e-05, + "loss": 1.1259, + "step": 152790 + }, + { + "epoch": 0.9761956480073598, + "grad_norm": 1.152294397354126, + "learning_rate": 5.189923852973297e-05, + "loss": 0.8142, + "step": 152800 + }, + { + "epoch": 0.9762595351570985, + "grad_norm": 0.7129946947097778, + "learning_rate": 5.1894224456342965e-05, + "loss": 0.9805, + "step": 152810 + }, + { + "epoch": 0.9763234223068372, + "grad_norm": 0.9357526302337646, + "learning_rate": 5.188921036387646e-05, + "loss": 0.9792, + "step": 152820 + }, + { + "epoch": 0.9763873094565759, + "grad_norm": 0.7643981575965881, + "learning_rate": 5.1884196252383986e-05, + "loss": 0.7641, + "step": 152830 + }, + { + "epoch": 0.9764511966063146, + "grad_norm": 0.860305666923523, + "learning_rate": 5.187918212191603e-05, + "loss": 0.8608, + "step": 152840 + }, + { + "epoch": 0.9765150837560533, + "grad_norm": 1.20055091381073, + "learning_rate": 5.187416797252307e-05, + "loss": 0.7898, + "step": 152850 + }, + { + "epoch": 0.976578970905792, + "grad_norm": 1.0174932479858398, + "learning_rate": 5.186915380425562e-05, + "loss": 0.7676, + "step": 152860 + }, + { + "epoch": 0.9766428580555307, + "grad_norm": 0.8433327674865723, + "learning_rate": 5.1864139617164174e-05, + "loss": 1.0166, + "step": 152870 + }, + { + "epoch": 0.9767067452052695, + "grad_norm": 0.8074188828468323, + "learning_rate": 5.185912541129924e-05, + "loss": 1.202, + "step": 152880 + }, + { + "epoch": 0.9767706323550082, + "grad_norm": 0.7383306622505188, + "learning_rate": 5.1854111186711295e-05, + "loss": 0.8643, + "step": 152890 + }, + { + "epoch": 0.9768345195047469, + "grad_norm": 0.704338014125824, + "learning_rate": 5.184909694345084e-05, + "loss": 0.7977, + "step": 152900 + }, + { + "epoch": 0.9768984066544855, + "grad_norm": 0.6669245362281799, + "learning_rate": 5.1844082681568386e-05, + "loss": 0.8619, + "step": 152910 + }, + { + "epoch": 0.9769622938042242, + "grad_norm": 0.9143712520599365, + "learning_rate": 5.183906840111442e-05, + "loss": 0.9948, + "step": 152920 + }, + { + "epoch": 0.9770261809539629, + "grad_norm": 1.7431244850158691, + "learning_rate": 5.1834054102139454e-05, + "loss": 0.6948, + "step": 152930 + }, + { + "epoch": 0.9770900681037016, + "grad_norm": 0.8700932264328003, + "learning_rate": 5.182903978469398e-05, + "loss": 0.8935, + "step": 152940 + }, + { + "epoch": 0.9771539552534403, + "grad_norm": 0.7274859547615051, + "learning_rate": 5.182402544882847e-05, + "loss": 0.7769, + "step": 152950 + }, + { + "epoch": 0.977217842403179, + "grad_norm": 0.48052549362182617, + "learning_rate": 5.181901109459347e-05, + "loss": 0.7875, + "step": 152960 + }, + { + "epoch": 0.9772817295529177, + "grad_norm": 1.1585367918014526, + "learning_rate": 5.181399672203946e-05, + "loss": 0.7967, + "step": 152970 + }, + { + "epoch": 0.9773456167026564, + "grad_norm": 1.8060321807861328, + "learning_rate": 5.1808982331216915e-05, + "loss": 0.8072, + "step": 152980 + }, + { + "epoch": 0.9774095038523951, + "grad_norm": 0.683174729347229, + "learning_rate": 5.1803967922176354e-05, + "loss": 0.8027, + "step": 152990 + }, + { + "epoch": 0.9774733910021338, + "grad_norm": 0.8326396346092224, + "learning_rate": 5.1798953494968285e-05, + "loss": 0.6853, + "step": 
153000 + }, + { + "epoch": 0.9775372781518725, + "grad_norm": 0.6510669589042664, + "learning_rate": 5.179393904964319e-05, + "loss": 0.9173, + "step": 153010 + }, + { + "epoch": 0.9776011653016112, + "grad_norm": 0.894503653049469, + "learning_rate": 5.1788924586251575e-05, + "loss": 0.9928, + "step": 153020 + }, + { + "epoch": 0.9776650524513499, + "grad_norm": 0.8102232217788696, + "learning_rate": 5.178391010484395e-05, + "loss": 1.1457, + "step": 153030 + }, + { + "epoch": 0.9777289396010886, + "grad_norm": 0.8255794644355774, + "learning_rate": 5.17788956054708e-05, + "loss": 0.7739, + "step": 153040 + }, + { + "epoch": 0.9777928267508273, + "grad_norm": 0.968258261680603, + "learning_rate": 5.177388108818263e-05, + "loss": 0.8573, + "step": 153050 + }, + { + "epoch": 0.977856713900566, + "grad_norm": 0.6829890608787537, + "learning_rate": 5.176886655302994e-05, + "loss": 0.8869, + "step": 153060 + }, + { + "epoch": 0.9779206010503048, + "grad_norm": 3.4975321292877197, + "learning_rate": 5.176385200006324e-05, + "loss": 1.072, + "step": 153070 + }, + { + "epoch": 0.9779844882000435, + "grad_norm": 1.045479416847229, + "learning_rate": 5.1758837429333026e-05, + "loss": 0.6734, + "step": 153080 + }, + { + "epoch": 0.9780483753497822, + "grad_norm": 1.0568046569824219, + "learning_rate": 5.1753822840889796e-05, + "loss": 0.7752, + "step": 153090 + }, + { + "epoch": 0.9781122624995209, + "grad_norm": 0.843722403049469, + "learning_rate": 5.174880823478405e-05, + "loss": 0.9224, + "step": 153100 + }, + { + "epoch": 0.9781761496492596, + "grad_norm": 1.006016731262207, + "learning_rate": 5.174379361106629e-05, + "loss": 0.8705, + "step": 153110 + }, + { + "epoch": 0.9782400367989983, + "grad_norm": 2.0581963062286377, + "learning_rate": 5.173877896978703e-05, + "loss": 0.8711, + "step": 153120 + }, + { + "epoch": 0.978303923948737, + "grad_norm": 0.788337230682373, + "learning_rate": 5.173376431099676e-05, + "loss": 0.7674, + "step": 153130 + }, + { + "epoch": 0.9783678110984757, + "grad_norm": 1.791744351387024, + "learning_rate": 5.172874963474598e-05, + "loss": 0.9112, + "step": 153140 + }, + { + "epoch": 0.9784316982482143, + "grad_norm": 0.7716138362884521, + "learning_rate": 5.17237349410852e-05, + "loss": 0.7965, + "step": 153150 + }, + { + "epoch": 0.978495585397953, + "grad_norm": 0.616436779499054, + "learning_rate": 5.171872023006491e-05, + "loss": 1.0199, + "step": 153160 + }, + { + "epoch": 0.9785594725476917, + "grad_norm": 0.8398680686950684, + "learning_rate": 5.171370550173562e-05, + "loss": 1.0967, + "step": 153170 + }, + { + "epoch": 0.9786233596974304, + "grad_norm": 0.9748437404632568, + "learning_rate": 5.170869075614784e-05, + "loss": 0.9849, + "step": 153180 + }, + { + "epoch": 0.9786872468471691, + "grad_norm": 0.8368244171142578, + "learning_rate": 5.1703675993352064e-05, + "loss": 0.8339, + "step": 153190 + }, + { + "epoch": 0.9787511339969078, + "grad_norm": 0.9211305975914001, + "learning_rate": 5.169866121339879e-05, + "loss": 0.7099, + "step": 153200 + }, + { + "epoch": 0.9788150211466465, + "grad_norm": 1.1104360818862915, + "learning_rate": 5.169364641633855e-05, + "loss": 0.8684, + "step": 153210 + }, + { + "epoch": 0.9788789082963852, + "grad_norm": 0.6493495106697083, + "learning_rate": 5.1688631602221794e-05, + "loss": 1.0052, + "step": 153220 + }, + { + "epoch": 0.978942795446124, + "grad_norm": 1.0354483127593994, + "learning_rate": 5.168361677109908e-05, + "loss": 0.9606, + "step": 153230 + }, + { + "epoch": 0.9790066825958627, + "grad_norm": 
0.6334801912307739, + "learning_rate": 5.1678601923020876e-05, + "loss": 1.0056, + "step": 153240 + }, + { + "epoch": 0.9790705697456014, + "grad_norm": 1.2471381425857544, + "learning_rate": 5.16735870580377e-05, + "loss": 0.9594, + "step": 153250 + }, + { + "epoch": 0.9791344568953401, + "grad_norm": 1.8686987161636353, + "learning_rate": 5.166857217620006e-05, + "loss": 0.8422, + "step": 153260 + }, + { + "epoch": 0.9791983440450788, + "grad_norm": 0.7281016707420349, + "learning_rate": 5.1663557277558447e-05, + "loss": 1.0227, + "step": 153270 + }, + { + "epoch": 0.9792622311948175, + "grad_norm": 1.3672218322753906, + "learning_rate": 5.1658542362163385e-05, + "loss": 1.2125, + "step": 153280 + }, + { + "epoch": 0.9793261183445562, + "grad_norm": 0.6560665369033813, + "learning_rate": 5.165352743006536e-05, + "loss": 0.7462, + "step": 153290 + }, + { + "epoch": 0.9793900054942949, + "grad_norm": 0.8147808313369751, + "learning_rate": 5.164851248131488e-05, + "loss": 0.8436, + "step": 153300 + }, + { + "epoch": 0.9794538926440336, + "grad_norm": 1.1131194829940796, + "learning_rate": 5.1643497515962455e-05, + "loss": 0.8414, + "step": 153310 + }, + { + "epoch": 0.9795177797937723, + "grad_norm": 0.8697034120559692, + "learning_rate": 5.16384825340586e-05, + "loss": 0.8524, + "step": 153320 + }, + { + "epoch": 0.979581666943511, + "grad_norm": 0.9882239699363708, + "learning_rate": 5.163346753565379e-05, + "loss": 0.7327, + "step": 153330 + }, + { + "epoch": 0.9796455540932497, + "grad_norm": 0.7860538959503174, + "learning_rate": 5.162845252079855e-05, + "loss": 0.7041, + "step": 153340 + }, + { + "epoch": 0.9797094412429884, + "grad_norm": 1.2516241073608398, + "learning_rate": 5.16234374895434e-05, + "loss": 1.284, + "step": 153350 + }, + { + "epoch": 0.9797733283927271, + "grad_norm": 1.390798568725586, + "learning_rate": 5.161842244193882e-05, + "loss": 1.0494, + "step": 153360 + }, + { + "epoch": 0.9798372155424658, + "grad_norm": 0.6051182746887207, + "learning_rate": 5.1613407378035326e-05, + "loss": 0.8672, + "step": 153370 + }, + { + "epoch": 0.9799011026922045, + "grad_norm": 1.087631344795227, + "learning_rate": 5.1608392297883426e-05, + "loss": 1.1912, + "step": 153380 + }, + { + "epoch": 0.9799649898419432, + "grad_norm": 0.9267514944076538, + "learning_rate": 5.160337720153362e-05, + "loss": 0.8654, + "step": 153390 + }, + { + "epoch": 0.9800288769916818, + "grad_norm": 1.898051142692566, + "learning_rate": 5.1598362089036424e-05, + "loss": 0.8454, + "step": 153400 + }, + { + "epoch": 0.9800927641414205, + "grad_norm": 0.8334454894065857, + "learning_rate": 5.1593346960442336e-05, + "loss": 0.9431, + "step": 153410 + }, + { + "epoch": 0.9801566512911593, + "grad_norm": 1.1599924564361572, + "learning_rate": 5.158833181580186e-05, + "loss": 0.7981, + "step": 153420 + }, + { + "epoch": 0.980220538440898, + "grad_norm": 0.7810460925102234, + "learning_rate": 5.1583316655165506e-05, + "loss": 1.0807, + "step": 153430 + }, + { + "epoch": 0.9802844255906367, + "grad_norm": 1.1010318994522095, + "learning_rate": 5.157830147858379e-05, + "loss": 0.7419, + "step": 153440 + }, + { + "epoch": 0.9803483127403754, + "grad_norm": 0.6888182163238525, + "learning_rate": 5.1573286286107216e-05, + "loss": 0.785, + "step": 153450 + }, + { + "epoch": 0.9804121998901141, + "grad_norm": 0.6871199011802673, + "learning_rate": 5.15682710777863e-05, + "loss": 0.8662, + "step": 153460 + }, + { + "epoch": 0.9804760870398528, + "grad_norm": 0.7507506012916565, + "learning_rate": 
5.156325585367152e-05, + "loss": 0.8132, + "step": 153470 + }, + { + "epoch": 0.9805399741895915, + "grad_norm": 0.8823074698448181, + "learning_rate": 5.1558240613813416e-05, + "loss": 0.8198, + "step": 153480 + }, + { + "epoch": 0.9806038613393302, + "grad_norm": 0.6161057949066162, + "learning_rate": 5.155322535826246e-05, + "loss": 0.8405, + "step": 153490 + }, + { + "epoch": 0.9806677484890689, + "grad_norm": 0.7086304426193237, + "learning_rate": 5.1548210087069196e-05, + "loss": 0.7153, + "step": 153500 + }, + { + "epoch": 0.9807316356388076, + "grad_norm": 1.631806492805481, + "learning_rate": 5.154319480028411e-05, + "loss": 0.7625, + "step": 153510 + }, + { + "epoch": 0.9807955227885463, + "grad_norm": 0.7055345177650452, + "learning_rate": 5.153817949795772e-05, + "loss": 0.8304, + "step": 153520 + }, + { + "epoch": 0.980859409938285, + "grad_norm": 1.2235612869262695, + "learning_rate": 5.153316418014053e-05, + "loss": 0.9013, + "step": 153530 + }, + { + "epoch": 0.9809232970880237, + "grad_norm": 0.678455114364624, + "learning_rate": 5.152814884688305e-05, + "loss": 0.7831, + "step": 153540 + }, + { + "epoch": 0.9809871842377624, + "grad_norm": 1.0608277320861816, + "learning_rate": 5.152313349823579e-05, + "loss": 0.7629, + "step": 153550 + }, + { + "epoch": 0.9810510713875011, + "grad_norm": 0.8170053958892822, + "learning_rate": 5.151811813424926e-05, + "loss": 0.6033, + "step": 153560 + }, + { + "epoch": 0.9811149585372398, + "grad_norm": 0.7763380408287048, + "learning_rate": 5.151310275497396e-05, + "loss": 0.8555, + "step": 153570 + }, + { + "epoch": 0.9811788456869786, + "grad_norm": 1.2432817220687866, + "learning_rate": 5.150808736046042e-05, + "loss": 0.7298, + "step": 153580 + }, + { + "epoch": 0.9812427328367173, + "grad_norm": 0.9610840082168579, + "learning_rate": 5.150307195075912e-05, + "loss": 0.9684, + "step": 153590 + }, + { + "epoch": 0.981306619986456, + "grad_norm": 0.9366688132286072, + "learning_rate": 5.149805652592059e-05, + "loss": 0.7935, + "step": 153600 + }, + { + "epoch": 0.9813705071361947, + "grad_norm": 0.8396714329719543, + "learning_rate": 5.1493041085995334e-05, + "loss": 1.1084, + "step": 153610 + }, + { + "epoch": 0.9814343942859334, + "grad_norm": 0.9239795207977295, + "learning_rate": 5.148802563103387e-05, + "loss": 0.6051, + "step": 153620 + }, + { + "epoch": 0.9814982814356721, + "grad_norm": 0.866423487663269, + "learning_rate": 5.1483010161086695e-05, + "loss": 1.1068, + "step": 153630 + }, + { + "epoch": 0.9815621685854107, + "grad_norm": 1.2361878156661987, + "learning_rate": 5.147799467620432e-05, + "loss": 1.2309, + "step": 153640 + }, + { + "epoch": 0.9816260557351494, + "grad_norm": 0.9184995889663696, + "learning_rate": 5.147297917643728e-05, + "loss": 0.9739, + "step": 153650 + }, + { + "epoch": 0.9816899428848881, + "grad_norm": 0.8369823098182678, + "learning_rate": 5.146796366183604e-05, + "loss": 0.8687, + "step": 153660 + }, + { + "epoch": 0.9817538300346268, + "grad_norm": 0.8505387902259827, + "learning_rate": 5.146294813245115e-05, + "loss": 0.8085, + "step": 153670 + }, + { + "epoch": 0.9818177171843655, + "grad_norm": 1.1725592613220215, + "learning_rate": 5.14579325883331e-05, + "loss": 1.0974, + "step": 153680 + }, + { + "epoch": 0.9818816043341042, + "grad_norm": 1.1303421258926392, + "learning_rate": 5.145291702953241e-05, + "loss": 0.8779, + "step": 153690 + }, + { + "epoch": 0.9819454914838429, + "grad_norm": 1.3215134143829346, + "learning_rate": 5.144790145609961e-05, + "loss": 0.7935, + "step": 153700 + 
}, + { + "epoch": 0.9820093786335816, + "grad_norm": 0.9500746130943298, + "learning_rate": 5.1442885868085166e-05, + "loss": 1.2262, + "step": 153710 + }, + { + "epoch": 0.9820732657833203, + "grad_norm": 0.8446482419967651, + "learning_rate": 5.143787026553962e-05, + "loss": 0.7815, + "step": 153720 + }, + { + "epoch": 0.982137152933059, + "grad_norm": 1.9297406673431396, + "learning_rate": 5.143285464851347e-05, + "loss": 0.8834, + "step": 153730 + }, + { + "epoch": 0.9822010400827977, + "grad_norm": 1.5801446437835693, + "learning_rate": 5.1427839017057234e-05, + "loss": 0.8194, + "step": 153740 + }, + { + "epoch": 0.9822649272325364, + "grad_norm": 0.723784327507019, + "learning_rate": 5.142282337122142e-05, + "loss": 0.9339, + "step": 153750 + }, + { + "epoch": 0.9823288143822752, + "grad_norm": 0.9597413539886475, + "learning_rate": 5.141780771105655e-05, + "loss": 1.4497, + "step": 153760 + }, + { + "epoch": 0.9823927015320139, + "grad_norm": 1.1143347024917603, + "learning_rate": 5.1412792036613136e-05, + "loss": 1.099, + "step": 153770 + }, + { + "epoch": 0.9824565886817526, + "grad_norm": 0.9566546082496643, + "learning_rate": 5.1407776347941674e-05, + "loss": 1.1144, + "step": 153780 + }, + { + "epoch": 0.9825204758314913, + "grad_norm": 1.0242767333984375, + "learning_rate": 5.1402760645092696e-05, + "loss": 0.8559, + "step": 153790 + }, + { + "epoch": 0.98258436298123, + "grad_norm": 1.4230691194534302, + "learning_rate": 5.13977449281167e-05, + "loss": 1.1065, + "step": 153800 + }, + { + "epoch": 0.9826482501309687, + "grad_norm": 0.743841826915741, + "learning_rate": 5.139272919706421e-05, + "loss": 0.8184, + "step": 153810 + }, + { + "epoch": 0.9827121372807074, + "grad_norm": 1.586731195449829, + "learning_rate": 5.138771345198572e-05, + "loss": 0.8454, + "step": 153820 + }, + { + "epoch": 0.9827760244304461, + "grad_norm": 0.9233693480491638, + "learning_rate": 5.138269769293176e-05, + "loss": 1.0317, + "step": 153830 + }, + { + "epoch": 0.9828399115801848, + "grad_norm": 0.9364616274833679, + "learning_rate": 5.137768191995284e-05, + "loss": 0.9424, + "step": 153840 + }, + { + "epoch": 0.9829037987299235, + "grad_norm": 0.9385852217674255, + "learning_rate": 5.137266613309947e-05, + "loss": 1.1349, + "step": 153850 + }, + { + "epoch": 0.9829676858796622, + "grad_norm": 0.8022464513778687, + "learning_rate": 5.1367650332422155e-05, + "loss": 0.8832, + "step": 153860 + }, + { + "epoch": 0.9830315730294009, + "grad_norm": 1.424519658088684, + "learning_rate": 5.136263451797143e-05, + "loss": 0.8496, + "step": 153870 + }, + { + "epoch": 0.9830954601791395, + "grad_norm": 0.835070013999939, + "learning_rate": 5.1357618689797795e-05, + "loss": 0.7952, + "step": 153880 + }, + { + "epoch": 0.9831593473288782, + "grad_norm": 1.1491596698760986, + "learning_rate": 5.135260284795176e-05, + "loss": 1.0079, + "step": 153890 + }, + { + "epoch": 0.9832232344786169, + "grad_norm": 1.1634465456008911, + "learning_rate": 5.134758699248386e-05, + "loss": 0.8044, + "step": 153900 + }, + { + "epoch": 0.9832871216283556, + "grad_norm": 0.9196897745132446, + "learning_rate": 5.134257112344457e-05, + "loss": 0.8414, + "step": 153910 + }, + { + "epoch": 0.9833510087780943, + "grad_norm": 0.801220715045929, + "learning_rate": 5.133755524088444e-05, + "loss": 0.725, + "step": 153920 + }, + { + "epoch": 0.983414895927833, + "grad_norm": 0.6582996845245361, + "learning_rate": 5.133253934485397e-05, + "loss": 0.6951, + "step": 153930 + }, + { + "epoch": 0.9834787830775717, + "grad_norm": 
1.0240771770477295, + "learning_rate": 5.132752343540368e-05, + "loss": 0.9298, + "step": 153940 + }, + { + "epoch": 0.9835426702273105, + "grad_norm": 0.8002316951751709, + "learning_rate": 5.132250751258407e-05, + "loss": 1.0542, + "step": 153950 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 1.1618034839630127, + "learning_rate": 5.131749157644568e-05, + "loss": 0.7801, + "step": 153960 + }, + { + "epoch": 0.9836704445267879, + "grad_norm": 0.9297146201133728, + "learning_rate": 5.1312475627039e-05, + "loss": 0.8619, + "step": 153970 + }, + { + "epoch": 0.9837343316765266, + "grad_norm": 0.7752575278282166, + "learning_rate": 5.130745966441456e-05, + "loss": 0.7556, + "step": 153980 + }, + { + "epoch": 0.9837982188262653, + "grad_norm": 1.0066181421279907, + "learning_rate": 5.130244368862286e-05, + "loss": 0.9884, + "step": 153990 + }, + { + "epoch": 0.983862105976004, + "grad_norm": 0.8494886159896851, + "learning_rate": 5.129742769971443e-05, + "loss": 0.8843, + "step": 154000 + }, + { + "epoch": 0.9839259931257427, + "grad_norm": 0.7732999920845032, + "learning_rate": 5.1292411697739786e-05, + "loss": 0.6683, + "step": 154010 + }, + { + "epoch": 0.9839898802754814, + "grad_norm": 1.17229425907135, + "learning_rate": 5.128739568274944e-05, + "loss": 0.7683, + "step": 154020 + }, + { + "epoch": 0.9840537674252201, + "grad_norm": 1.5680131912231445, + "learning_rate": 5.12823796547939e-05, + "loss": 1.2148, + "step": 154030 + }, + { + "epoch": 0.9841176545749588, + "grad_norm": 0.7837061882019043, + "learning_rate": 5.1277363613923676e-05, + "loss": 0.6788, + "step": 154040 + }, + { + "epoch": 0.9841815417246975, + "grad_norm": 1.022614598274231, + "learning_rate": 5.1272347560189314e-05, + "loss": 0.7454, + "step": 154050 + }, + { + "epoch": 0.9842454288744362, + "grad_norm": 1.2080639600753784, + "learning_rate": 5.12673314936413e-05, + "loss": 0.8185, + "step": 154060 + }, + { + "epoch": 0.9843093160241749, + "grad_norm": 1.0237927436828613, + "learning_rate": 5.126231541433018e-05, + "loss": 0.8095, + "step": 154070 + }, + { + "epoch": 0.9843732031739136, + "grad_norm": 0.5881984233856201, + "learning_rate": 5.125729932230643e-05, + "loss": 0.8783, + "step": 154080 + }, + { + "epoch": 0.9844370903236523, + "grad_norm": 0.7203654646873474, + "learning_rate": 5.12522832176206e-05, + "loss": 0.7445, + "step": 154090 + }, + { + "epoch": 0.984500977473391, + "grad_norm": 1.8319315910339355, + "learning_rate": 5.1247267100323195e-05, + "loss": 1.0696, + "step": 154100 + }, + { + "epoch": 0.9845648646231298, + "grad_norm": 0.8602228164672852, + "learning_rate": 5.124225097046472e-05, + "loss": 0.8352, + "step": 154110 + }, + { + "epoch": 0.9846287517728685, + "grad_norm": 0.7165752053260803, + "learning_rate": 5.12372348280957e-05, + "loss": 0.981, + "step": 154120 + }, + { + "epoch": 0.984692638922607, + "grad_norm": 0.9421229362487793, + "learning_rate": 5.123221867326666e-05, + "loss": 0.9251, + "step": 154130 + }, + { + "epoch": 0.9847565260723458, + "grad_norm": 1.153640866279602, + "learning_rate": 5.1227202506028117e-05, + "loss": 1.0239, + "step": 154140 + }, + { + "epoch": 0.9848204132220845, + "grad_norm": 0.8328729271888733, + "learning_rate": 5.122218632643059e-05, + "loss": 0.7723, + "step": 154150 + }, + { + "epoch": 0.9848843003718232, + "grad_norm": 3.4909727573394775, + "learning_rate": 5.1217170134524586e-05, + "loss": 0.9164, + "step": 154160 + }, + { + "epoch": 0.9849481875215619, + "grad_norm": 1.5162936449050903, + "learning_rate": 5.1212153930360615e-05, + 
"loss": 0.7539, + "step": 154170 + }, + { + "epoch": 0.9850120746713006, + "grad_norm": 0.8826167583465576, + "learning_rate": 5.1207137713989205e-05, + "loss": 0.7714, + "step": 154180 + }, + { + "epoch": 0.9850759618210393, + "grad_norm": 0.7749609351158142, + "learning_rate": 5.1202121485460894e-05, + "loss": 0.9825, + "step": 154190 + }, + { + "epoch": 0.985139848970778, + "grad_norm": 0.9990338683128357, + "learning_rate": 5.119710524482617e-05, + "loss": 0.7492, + "step": 154200 + }, + { + "epoch": 0.9852037361205167, + "grad_norm": Infinity, + "learning_rate": 5.119259061794569e-05, + "loss": 0.7843, + "step": 154210 + }, + { + "epoch": 0.9852676232702554, + "grad_norm": 0.6239207983016968, + "learning_rate": 5.118757435444798e-05, + "loss": 0.8757, + "step": 154220 + }, + { + "epoch": 0.9853315104199941, + "grad_norm": 0.8069121241569519, + "learning_rate": 5.118255807899036e-05, + "loss": 1.0016, + "step": 154230 + }, + { + "epoch": 0.9853953975697328, + "grad_norm": 0.8366886973381042, + "learning_rate": 5.117754179162335e-05, + "loss": 0.6895, + "step": 154240 + }, + { + "epoch": 0.9854592847194715, + "grad_norm": 0.8663131594657898, + "learning_rate": 5.1172525492397484e-05, + "loss": 0.683, + "step": 154250 + }, + { + "epoch": 0.9855231718692102, + "grad_norm": 1.0200055837631226, + "learning_rate": 5.116750918136327e-05, + "loss": 0.8902, + "step": 154260 + }, + { + "epoch": 0.9855870590189489, + "grad_norm": 1.0423588752746582, + "learning_rate": 5.116249285857123e-05, + "loss": 0.9948, + "step": 154270 + }, + { + "epoch": 0.9856509461686876, + "grad_norm": 1.235628366470337, + "learning_rate": 5.115747652407189e-05, + "loss": 0.9463, + "step": 154280 + }, + { + "epoch": 0.9857148333184264, + "grad_norm": 0.787745475769043, + "learning_rate": 5.115246017791575e-05, + "loss": 0.9099, + "step": 154290 + }, + { + "epoch": 0.9857787204681651, + "grad_norm": 0.8026257157325745, + "learning_rate": 5.114744382015334e-05, + "loss": 0.8882, + "step": 154300 + }, + { + "epoch": 0.9858426076179038, + "grad_norm": 0.9308233857154846, + "learning_rate": 5.114242745083517e-05, + "loss": 0.7079, + "step": 154310 + }, + { + "epoch": 0.9859064947676425, + "grad_norm": 1.0703706741333008, + "learning_rate": 5.1137411070011786e-05, + "loss": 0.8582, + "step": 154320 + }, + { + "epoch": 0.9859703819173812, + "grad_norm": 0.8068180680274963, + "learning_rate": 5.113239467773369e-05, + "loss": 0.666, + "step": 154330 + }, + { + "epoch": 0.9860342690671199, + "grad_norm": 1.0489506721496582, + "learning_rate": 5.1127378274051385e-05, + "loss": 0.9058, + "step": 154340 + }, + { + "epoch": 0.9860981562168586, + "grad_norm": 1.0829126834869385, + "learning_rate": 5.112236185901541e-05, + "loss": 0.9407, + "step": 154350 + }, + { + "epoch": 0.9861620433665973, + "grad_norm": 1.2354950904846191, + "learning_rate": 5.111734543267628e-05, + "loss": 1.0327, + "step": 154360 + }, + { + "epoch": 0.9862259305163359, + "grad_norm": 0.9200438857078552, + "learning_rate": 5.111232899508451e-05, + "loss": 0.884, + "step": 154370 + }, + { + "epoch": 0.9862898176660746, + "grad_norm": 1.6493422985076904, + "learning_rate": 5.110731254629063e-05, + "loss": 1.0967, + "step": 154380 + }, + { + "epoch": 0.9863537048158133, + "grad_norm": 0.49024438858032227, + "learning_rate": 5.110229608634516e-05, + "loss": 0.891, + "step": 154390 + }, + { + "epoch": 0.986417591965552, + "grad_norm": 0.9173659682273865, + "learning_rate": 5.1097279615298596e-05, + "loss": 0.8338, + "step": 154400 + }, + { + "epoch": 
0.9864814791152907, + "grad_norm": 0.718056857585907, + "learning_rate": 5.109226313320149e-05, + "loss": 0.9905, + "step": 154410 + }, + { + "epoch": 0.9865453662650294, + "grad_norm": 0.8132973313331604, + "learning_rate": 5.108724664010435e-05, + "loss": 0.9829, + "step": 154420 + }, + { + "epoch": 0.9866092534147681, + "grad_norm": 1.2084906101226807, + "learning_rate": 5.1082230136057695e-05, + "loss": 0.8259, + "step": 154430 + }, + { + "epoch": 0.9866731405645068, + "grad_norm": 1.0011422634124756, + "learning_rate": 5.1077213621112043e-05, + "loss": 0.6755, + "step": 154440 + }, + { + "epoch": 0.9867370277142455, + "grad_norm": 0.930726945400238, + "learning_rate": 5.107219709531792e-05, + "loss": 0.8441, + "step": 154450 + }, + { + "epoch": 0.9868009148639842, + "grad_norm": 0.7071484923362732, + "learning_rate": 5.1067180558725846e-05, + "loss": 0.8279, + "step": 154460 + }, + { + "epoch": 0.986864802013723, + "grad_norm": 1.1952509880065918, + "learning_rate": 5.106216401138635e-05, + "loss": 0.7515, + "step": 154470 + }, + { + "epoch": 0.9869286891634617, + "grad_norm": 0.8858817219734192, + "learning_rate": 5.105714745334993e-05, + "loss": 0.8042, + "step": 154480 + }, + { + "epoch": 0.9869925763132004, + "grad_norm": 1.0529940128326416, + "learning_rate": 5.105213088466712e-05, + "loss": 0.8393, + "step": 154490 + }, + { + "epoch": 0.9870564634629391, + "grad_norm": 0.8459919095039368, + "learning_rate": 5.1047114305388445e-05, + "loss": 0.7869, + "step": 154500 + }, + { + "epoch": 0.9871203506126778, + "grad_norm": 0.8265353441238403, + "learning_rate": 5.104209771556443e-05, + "loss": 0.9366, + "step": 154510 + }, + { + "epoch": 0.9871842377624165, + "grad_norm": 1.7919999361038208, + "learning_rate": 5.1037081115245576e-05, + "loss": 0.8777, + "step": 154520 + }, + { + "epoch": 0.9872481249121552, + "grad_norm": 0.8435221910476685, + "learning_rate": 5.103206450448243e-05, + "loss": 0.9533, + "step": 154530 + }, + { + "epoch": 0.9873120120618939, + "grad_norm": 1.6864407062530518, + "learning_rate": 5.10270478833255e-05, + "loss": 0.7714, + "step": 154540 + }, + { + "epoch": 0.9873758992116326, + "grad_norm": 0.9249199032783508, + "learning_rate": 5.1022031251825306e-05, + "loss": 0.9767, + "step": 154550 + }, + { + "epoch": 0.9874397863613713, + "grad_norm": 0.6928601264953613, + "learning_rate": 5.101701461003238e-05, + "loss": 0.965, + "step": 154560 + }, + { + "epoch": 0.98750367351111, + "grad_norm": 1.4231996536254883, + "learning_rate": 5.101199795799723e-05, + "loss": 1.1854, + "step": 154570 + }, + { + "epoch": 0.9875675606608487, + "grad_norm": 1.1377532482147217, + "learning_rate": 5.1006981295770376e-05, + "loss": 0.8789, + "step": 154580 + }, + { + "epoch": 0.9876314478105874, + "grad_norm": 0.6748983860015869, + "learning_rate": 5.100196462340236e-05, + "loss": 0.8104, + "step": 154590 + }, + { + "epoch": 0.9876953349603261, + "grad_norm": 1.366390585899353, + "learning_rate": 5.0996947940943695e-05, + "loss": 1.0198, + "step": 154600 + }, + { + "epoch": 0.9877592221100647, + "grad_norm": 1.1231287717819214, + "learning_rate": 5.09919312484449e-05, + "loss": 0.9681, + "step": 154610 + }, + { + "epoch": 0.9878231092598034, + "grad_norm": 1.0698585510253906, + "learning_rate": 5.09869145459565e-05, + "loss": 0.7381, + "step": 154620 + }, + { + "epoch": 0.9878869964095421, + "grad_norm": 0.8807600736618042, + "learning_rate": 5.098189783352901e-05, + "loss": 0.8386, + "step": 154630 + }, + { + "epoch": 0.9879508835592808, + "grad_norm": 0.9870644211769104, + 
"learning_rate": 5.097688111121296e-05, + "loss": 0.7578, + "step": 154640 + }, + { + "epoch": 0.9880147707090196, + "grad_norm": 0.8701372742652893, + "learning_rate": 5.097186437905887e-05, + "loss": 0.9337, + "step": 154650 + }, + { + "epoch": 0.9880786578587583, + "grad_norm": 0.9452102780342102, + "learning_rate": 5.0966847637117275e-05, + "loss": 0.94, + "step": 154660 + }, + { + "epoch": 0.988142545008497, + "grad_norm": 1.2239512205123901, + "learning_rate": 5.096183088543869e-05, + "loss": 0.886, + "step": 154670 + }, + { + "epoch": 0.9882064321582357, + "grad_norm": 0.7913658022880554, + "learning_rate": 5.095681412407363e-05, + "loss": 0.9082, + "step": 154680 + }, + { + "epoch": 0.9882703193079744, + "grad_norm": 1.3978636264801025, + "learning_rate": 5.095179735307263e-05, + "loss": 0.8123, + "step": 154690 + }, + { + "epoch": 0.9883342064577131, + "grad_norm": 1.0464966297149658, + "learning_rate": 5.0946780572486194e-05, + "loss": 0.8111, + "step": 154700 + }, + { + "epoch": 0.9883980936074518, + "grad_norm": 0.7706538438796997, + "learning_rate": 5.094176378236487e-05, + "loss": 0.8334, + "step": 154710 + }, + { + "epoch": 0.9884619807571905, + "grad_norm": 0.8449127674102783, + "learning_rate": 5.0936746982759164e-05, + "loss": 0.8343, + "step": 154720 + }, + { + "epoch": 0.9885258679069292, + "grad_norm": 2.550860643386841, + "learning_rate": 5.093173017371961e-05, + "loss": 1.1222, + "step": 154730 + }, + { + "epoch": 0.9885897550566679, + "grad_norm": 0.8664345741271973, + "learning_rate": 5.0926713355296715e-05, + "loss": 0.9487, + "step": 154740 + }, + { + "epoch": 0.9886536422064066, + "grad_norm": 1.1151171922683716, + "learning_rate": 5.092169652754103e-05, + "loss": 0.8512, + "step": 154750 + }, + { + "epoch": 0.9887175293561453, + "grad_norm": 1.0279850959777832, + "learning_rate": 5.091667969050304e-05, + "loss": 0.9394, + "step": 154760 + }, + { + "epoch": 0.988781416505884, + "grad_norm": 0.8667672276496887, + "learning_rate": 5.091166284423332e-05, + "loss": 0.9599, + "step": 154770 + }, + { + "epoch": 0.9888453036556227, + "grad_norm": 0.6669974327087402, + "learning_rate": 5.0906645988782354e-05, + "loss": 0.7711, + "step": 154780 + }, + { + "epoch": 0.9889091908053614, + "grad_norm": 0.7633401155471802, + "learning_rate": 5.090162912420068e-05, + "loss": 0.8466, + "step": 154790 + }, + { + "epoch": 0.9889730779551001, + "grad_norm": 1.8261069059371948, + "learning_rate": 5.089661225053882e-05, + "loss": 0.7971, + "step": 154800 + }, + { + "epoch": 0.9890369651048388, + "grad_norm": 0.9991775155067444, + "learning_rate": 5.08915953678473e-05, + "loss": 1.0186, + "step": 154810 + }, + { + "epoch": 0.9891008522545776, + "grad_norm": 0.7954165935516357, + "learning_rate": 5.088657847617666e-05, + "loss": 0.9212, + "step": 154820 + }, + { + "epoch": 0.9891647394043163, + "grad_norm": 1.120200753211975, + "learning_rate": 5.0881561575577384e-05, + "loss": 0.7709, + "step": 154830 + }, + { + "epoch": 0.989228626554055, + "grad_norm": 0.8321331143379211, + "learning_rate": 5.0876544666100035e-05, + "loss": 1.0257, + "step": 154840 + }, + { + "epoch": 0.9892925137037936, + "grad_norm": 0.7614843249320984, + "learning_rate": 5.087152774779511e-05, + "loss": 1.0545, + "step": 154850 + }, + { + "epoch": 0.9893564008535323, + "grad_norm": 1.8227176666259766, + "learning_rate": 5.086651082071315e-05, + "loss": 0.8129, + "step": 154860 + }, + { + "epoch": 0.989420288003271, + "grad_norm": 0.577085018157959, + "learning_rate": 5.0861493884904686e-05, + "loss": 0.8638, + 
"step": 154870 + }, + { + "epoch": 0.9894841751530097, + "grad_norm": 0.9807033538818359, + "learning_rate": 5.0856476940420225e-05, + "loss": 0.9116, + "step": 154880 + }, + { + "epoch": 0.9895480623027484, + "grad_norm": 3.173016309738159, + "learning_rate": 5.0851459987310304e-05, + "loss": 0.675, + "step": 154890 + }, + { + "epoch": 0.9896119494524871, + "grad_norm": 4.230893611907959, + "learning_rate": 5.084644302562544e-05, + "loss": 0.7969, + "step": 154900 + }, + { + "epoch": 0.9896758366022258, + "grad_norm": 0.7749008536338806, + "learning_rate": 5.0841426055416164e-05, + "loss": 0.7723, + "step": 154910 + }, + { + "epoch": 0.9897397237519645, + "grad_norm": 1.0113115310668945, + "learning_rate": 5.083640907673299e-05, + "loss": 1.0151, + "step": 154920 + }, + { + "epoch": 0.9898036109017032, + "grad_norm": 3.3041319847106934, + "learning_rate": 5.083139208962646e-05, + "loss": 0.8368, + "step": 154930 + }, + { + "epoch": 0.9898674980514419, + "grad_norm": 0.8300958871841431, + "learning_rate": 5.082637509414709e-05, + "loss": 0.6277, + "step": 154940 + }, + { + "epoch": 0.9899313852011806, + "grad_norm": 0.720481276512146, + "learning_rate": 5.0821358090345414e-05, + "loss": 0.8199, + "step": 154950 + }, + { + "epoch": 0.9899952723509193, + "grad_norm": 0.7711173295974731, + "learning_rate": 5.081634107827196e-05, + "loss": 0.9664, + "step": 154960 + }, + { + "epoch": 0.990059159500658, + "grad_norm": 1.2588512897491455, + "learning_rate": 5.081132405797724e-05, + "loss": 0.8738, + "step": 154970 + }, + { + "epoch": 0.9901230466503967, + "grad_norm": 0.7315512895584106, + "learning_rate": 5.080630702951178e-05, + "loss": 0.9661, + "step": 154980 + }, + { + "epoch": 0.9901869338001354, + "grad_norm": 0.7708696722984314, + "learning_rate": 5.0801289992926106e-05, + "loss": 0.7121, + "step": 154990 + }, + { + "epoch": 0.9902508209498742, + "grad_norm": 1.7888171672821045, + "learning_rate": 5.079627294827075e-05, + "loss": 0.9311, + "step": 155000 + }, + { + "epoch": 0.9903147080996129, + "grad_norm": 0.5545375943183899, + "learning_rate": 5.0791255895596246e-05, + "loss": 1.0415, + "step": 155010 + }, + { + "epoch": 0.9903785952493516, + "grad_norm": 0.8352196216583252, + "learning_rate": 5.07862388349531e-05, + "loss": 0.9962, + "step": 155020 + }, + { + "epoch": 0.9904424823990903, + "grad_norm": 0.9001242518424988, + "learning_rate": 5.0781221766391865e-05, + "loss": 0.8372, + "step": 155030 + }, + { + "epoch": 0.990506369548829, + "grad_norm": 0.76966392993927, + "learning_rate": 5.0776204689963035e-05, + "loss": 0.8428, + "step": 155040 + }, + { + "epoch": 0.9905702566985677, + "grad_norm": 1.0977442264556885, + "learning_rate": 5.0771187605717154e-05, + "loss": 0.7625, + "step": 155050 + }, + { + "epoch": 0.9906341438483064, + "grad_norm": 0.6730074882507324, + "learning_rate": 5.076617051370476e-05, + "loss": 0.8101, + "step": 155060 + }, + { + "epoch": 0.9906980309980451, + "grad_norm": 0.7701064348220825, + "learning_rate": 5.076115341397636e-05, + "loss": 1.0062, + "step": 155070 + }, + { + "epoch": 0.9907619181477838, + "grad_norm": 0.697149932384491, + "learning_rate": 5.075613630658247e-05, + "loss": 0.9009, + "step": 155080 + }, + { + "epoch": 0.9908258052975225, + "grad_norm": 1.0090751647949219, + "learning_rate": 5.075111919157364e-05, + "loss": 0.6531, + "step": 155090 + }, + { + "epoch": 0.9908896924472611, + "grad_norm": 1.064965844154358, + "learning_rate": 5.07461020690004e-05, + "loss": 0.855, + "step": 155100 + }, + { + "epoch": 0.9909535795969998, + 
"grad_norm": 0.9386013150215149, + "learning_rate": 5.0741084938913265e-05, + "loss": 1.009, + "step": 155110 + }, + { + "epoch": 0.9910174667467385, + "grad_norm": 0.9192590117454529, + "learning_rate": 5.0736067801362754e-05, + "loss": 0.9065, + "step": 155120 + }, + { + "epoch": 0.9910813538964772, + "grad_norm": 0.8243198394775391, + "learning_rate": 5.073105065639942e-05, + "loss": 0.8052, + "step": 155130 + }, + { + "epoch": 0.9911452410462159, + "grad_norm": 0.8356258273124695, + "learning_rate": 5.072603350407376e-05, + "loss": 0.8429, + "step": 155140 + }, + { + "epoch": 0.9912091281959546, + "grad_norm": 0.962837815284729, + "learning_rate": 5.0721016344436314e-05, + "loss": 0.9432, + "step": 155150 + }, + { + "epoch": 0.9912730153456933, + "grad_norm": 0.7561596035957336, + "learning_rate": 5.071599917753761e-05, + "loss": 1.0909, + "step": 155160 + }, + { + "epoch": 0.991336902495432, + "grad_norm": 0.8569062352180481, + "learning_rate": 5.0710982003428187e-05, + "loss": 0.7786, + "step": 155170 + }, + { + "epoch": 0.9914007896451708, + "grad_norm": 1.1719166040420532, + "learning_rate": 5.0705964822158544e-05, + "loss": 0.7228, + "step": 155180 + }, + { + "epoch": 0.9914646767949095, + "grad_norm": 0.7665051221847534, + "learning_rate": 5.070094763377924e-05, + "loss": 0.8832, + "step": 155190 + }, + { + "epoch": 0.9915285639446482, + "grad_norm": 0.8756385445594788, + "learning_rate": 5.0695930438340776e-05, + "loss": 0.9353, + "step": 155200 + }, + { + "epoch": 0.9915924510943869, + "grad_norm": 1.0932066440582275, + "learning_rate": 5.069091323589369e-05, + "loss": 0.9989, + "step": 155210 + }, + { + "epoch": 0.9916563382441256, + "grad_norm": 1.00384521484375, + "learning_rate": 5.0685896026488514e-05, + "loss": 0.7546, + "step": 155220 + }, + { + "epoch": 0.9917202253938643, + "grad_norm": 1.111561894416809, + "learning_rate": 5.068087881017577e-05, + "loss": 1.0, + "step": 155230 + }, + { + "epoch": 0.991784112543603, + "grad_norm": 1.0187937021255493, + "learning_rate": 5.067586158700599e-05, + "loss": 0.8705, + "step": 155240 + }, + { + "epoch": 0.9918479996933417, + "grad_norm": 0.9533101320266724, + "learning_rate": 5.06708443570297e-05, + "loss": 1.0491, + "step": 155250 + }, + { + "epoch": 0.9919118868430804, + "grad_norm": 0.7527496218681335, + "learning_rate": 5.066582712029743e-05, + "loss": 0.852, + "step": 155260 + }, + { + "epoch": 0.9919757739928191, + "grad_norm": 0.5917440056800842, + "learning_rate": 5.0660809876859694e-05, + "loss": 0.9962, + "step": 155270 + }, + { + "epoch": 0.9920396611425578, + "grad_norm": 1.4870597124099731, + "learning_rate": 5.065579262676704e-05, + "loss": 0.8788, + "step": 155280 + }, + { + "epoch": 0.9921035482922965, + "grad_norm": 0.7296018004417419, + "learning_rate": 5.0650775370069966e-05, + "loss": 0.8849, + "step": 155290 + }, + { + "epoch": 0.9921674354420352, + "grad_norm": 1.0578160285949707, + "learning_rate": 5.0645758106819055e-05, + "loss": 0.7862, + "step": 155300 + }, + { + "epoch": 0.9922313225917739, + "grad_norm": 0.9313512444496155, + "learning_rate": 5.064074083706478e-05, + "loss": 0.7519, + "step": 155310 + }, + { + "epoch": 0.9922952097415126, + "grad_norm": 0.8470985293388367, + "learning_rate": 5.063572356085769e-05, + "loss": 0.9872, + "step": 155320 + }, + { + "epoch": 0.9923590968912513, + "grad_norm": 0.9447453618049622, + "learning_rate": 5.063070627824833e-05, + "loss": 1.0726, + "step": 155330 + }, + { + "epoch": 0.9924229840409899, + "grad_norm": 0.7884403467178345, + "learning_rate": 
5.0625688989287204e-05, + "loss": 1.1167, + "step": 155340 + }, + { + "epoch": 0.9924868711907286, + "grad_norm": 1.2746003866195679, + "learning_rate": 5.0620671694024836e-05, + "loss": 1.0695, + "step": 155350 + }, + { + "epoch": 0.9925507583404674, + "grad_norm": 1.0743809938430786, + "learning_rate": 5.061565439251178e-05, + "loss": 0.9215, + "step": 155360 + }, + { + "epoch": 0.9926146454902061, + "grad_norm": 0.7811270952224731, + "learning_rate": 5.061063708479855e-05, + "loss": 0.8182, + "step": 155370 + }, + { + "epoch": 0.9926785326399448, + "grad_norm": 1.0729076862335205, + "learning_rate": 5.060561977093568e-05, + "loss": 0.7123, + "step": 155380 + }, + { + "epoch": 0.9927424197896835, + "grad_norm": 0.9297366142272949, + "learning_rate": 5.060060245097368e-05, + "loss": 0.9888, + "step": 155390 + }, + { + "epoch": 0.9928063069394222, + "grad_norm": 2.0715694427490234, + "learning_rate": 5.059558512496311e-05, + "loss": 0.9501, + "step": 155400 + }, + { + "epoch": 0.9928701940891609, + "grad_norm": 0.7811942100524902, + "learning_rate": 5.059056779295447e-05, + "loss": 0.8314, + "step": 155410 + }, + { + "epoch": 0.9929340812388996, + "grad_norm": 0.4893457591533661, + "learning_rate": 5.058555045499831e-05, + "loss": 1.1216, + "step": 155420 + }, + { + "epoch": 0.9929979683886383, + "grad_norm": 0.7355089783668518, + "learning_rate": 5.058053311114515e-05, + "loss": 0.9711, + "step": 155430 + }, + { + "epoch": 0.993061855538377, + "grad_norm": 0.8681902289390564, + "learning_rate": 5.057551576144551e-05, + "loss": 0.9379, + "step": 155440 + }, + { + "epoch": 0.9931257426881157, + "grad_norm": 1.1609843969345093, + "learning_rate": 5.0570498405949926e-05, + "loss": 0.8046, + "step": 155450 + }, + { + "epoch": 0.9931896298378544, + "grad_norm": 0.9073358774185181, + "learning_rate": 5.056548104470894e-05, + "loss": 0.8837, + "step": 155460 + }, + { + "epoch": 0.9932535169875931, + "grad_norm": 0.7544703483581543, + "learning_rate": 5.056046367777306e-05, + "loss": 1.0009, + "step": 155470 + }, + { + "epoch": 0.9933174041373318, + "grad_norm": 2.039977788925171, + "learning_rate": 5.055544630519284e-05, + "loss": 0.736, + "step": 155480 + }, + { + "epoch": 0.9933812912870705, + "grad_norm": 0.5482000112533569, + "learning_rate": 5.055042892701879e-05, + "loss": 0.7449, + "step": 155490 + }, + { + "epoch": 0.9934451784368092, + "grad_norm": 1.062432050704956, + "learning_rate": 5.054541154330145e-05, + "loss": 0.9661, + "step": 155500 + }, + { + "epoch": 0.993509065586548, + "grad_norm": 0.7228785753250122, + "learning_rate": 5.054039415409133e-05, + "loss": 1.0069, + "step": 155510 + }, + { + "epoch": 0.9935729527362867, + "grad_norm": 0.7066873908042908, + "learning_rate": 5.053537675943899e-05, + "loss": 0.911, + "step": 155520 + }, + { + "epoch": 0.9936368398860254, + "grad_norm": 1.2193684577941895, + "learning_rate": 5.053035935939493e-05, + "loss": 0.8247, + "step": 155530 + }, + { + "epoch": 0.9937007270357641, + "grad_norm": 0.8418088555335999, + "learning_rate": 5.05253419540097e-05, + "loss": 1.0304, + "step": 155540 + }, + { + "epoch": 0.9937646141855028, + "grad_norm": 0.9440509080886841, + "learning_rate": 5.052032454333383e-05, + "loss": 0.7883, + "step": 155550 + }, + { + "epoch": 0.9938285013352415, + "grad_norm": 2.196873426437378, + "learning_rate": 5.051530712741783e-05, + "loss": 0.7575, + "step": 155560 + }, + { + "epoch": 0.9938923884849802, + "grad_norm": 0.6164715886116028, + "learning_rate": 5.051028970631224e-05, + "loss": 0.9493, + "step": 155570 + }, 
+ { + "epoch": 0.9939562756347188, + "grad_norm": 0.6802273392677307, + "learning_rate": 5.05052722800676e-05, + "loss": 0.7849, + "step": 155580 + }, + { + "epoch": 0.9940201627844575, + "grad_norm": 0.7959185242652893, + "learning_rate": 5.0500254848734415e-05, + "loss": 0.9129, + "step": 155590 + }, + { + "epoch": 0.9940840499341962, + "grad_norm": 0.7099493741989136, + "learning_rate": 5.049523741236325e-05, + "loss": 0.9131, + "step": 155600 + }, + { + "epoch": 0.9941479370839349, + "grad_norm": 1.0658419132232666, + "learning_rate": 5.049021997100459e-05, + "loss": 0.7746, + "step": 155610 + }, + { + "epoch": 0.9942118242336736, + "grad_norm": 1.2376176118850708, + "learning_rate": 5.048520252470901e-05, + "loss": 1.0721, + "step": 155620 + }, + { + "epoch": 0.9942757113834123, + "grad_norm": 3.2693676948547363, + "learning_rate": 5.048018507352702e-05, + "loss": 0.9045, + "step": 155630 + }, + { + "epoch": 0.994339598533151, + "grad_norm": 1.1164909601211548, + "learning_rate": 5.047516761750915e-05, + "loss": 0.8043, + "step": 155640 + }, + { + "epoch": 0.9944034856828897, + "grad_norm": 2.0805160999298096, + "learning_rate": 5.0470150156705933e-05, + "loss": 0.8283, + "step": 155650 + }, + { + "epoch": 0.9944673728326284, + "grad_norm": 1.3686854839324951, + "learning_rate": 5.0465132691167894e-05, + "loss": 0.7677, + "step": 155660 + }, + { + "epoch": 0.9945312599823671, + "grad_norm": 0.9186270833015442, + "learning_rate": 5.046011522094556e-05, + "loss": 0.8535, + "step": 155670 + }, + { + "epoch": 0.9945951471321058, + "grad_norm": 1.0402723550796509, + "learning_rate": 5.045509774608947e-05, + "loss": 0.7879, + "step": 155680 + }, + { + "epoch": 0.9946590342818445, + "grad_norm": 0.8195865154266357, + "learning_rate": 5.0450080266650165e-05, + "loss": 0.7122, + "step": 155690 + }, + { + "epoch": 0.9947229214315833, + "grad_norm": 1.0943642854690552, + "learning_rate": 5.0445062782678154e-05, + "loss": 0.9818, + "step": 155700 + }, + { + "epoch": 0.994786808581322, + "grad_norm": 0.9934229850769043, + "learning_rate": 5.044004529422397e-05, + "loss": 0.8661, + "step": 155710 + }, + { + "epoch": 0.9948506957310607, + "grad_norm": 1.000313639640808, + "learning_rate": 5.0435027801338164e-05, + "loss": 0.9347, + "step": 155720 + }, + { + "epoch": 0.9949145828807994, + "grad_norm": 0.7824245691299438, + "learning_rate": 5.043001030407124e-05, + "loss": 0.6195, + "step": 155730 + }, + { + "epoch": 0.9949784700305381, + "grad_norm": 0.848616898059845, + "learning_rate": 5.042499280247373e-05, + "loss": 0.6459, + "step": 155740 + }, + { + "epoch": 0.9950423571802768, + "grad_norm": 1.090847373008728, + "learning_rate": 5.04199752965962e-05, + "loss": 0.7625, + "step": 155750 + }, + { + "epoch": 0.9951062443300155, + "grad_norm": 1.3222191333770752, + "learning_rate": 5.041495778648914e-05, + "loss": 1.0372, + "step": 155760 + }, + { + "epoch": 0.9951701314797542, + "grad_norm": 1.4843127727508545, + "learning_rate": 5.0409940272203093e-05, + "loss": 0.7487, + "step": 155770 + }, + { + "epoch": 0.9952340186294929, + "grad_norm": 1.1687159538269043, + "learning_rate": 5.040492275378861e-05, + "loss": 0.823, + "step": 155780 + }, + { + "epoch": 0.9952979057792316, + "grad_norm": 0.8214207887649536, + "learning_rate": 5.039990523129618e-05, + "loss": 0.7828, + "step": 155790 + }, + { + "epoch": 0.9953617929289703, + "grad_norm": 1.5629782676696777, + "learning_rate": 5.0394887704776385e-05, + "loss": 0.7958, + "step": 155800 + }, + { + "epoch": 0.995425680078709, + "grad_norm": 
1.2977160215377808, + "learning_rate": 5.038987017427971e-05, + "loss": 0.9585, + "step": 155810 + }, + { + "epoch": 0.9954895672284477, + "grad_norm": 0.7463441491127014, + "learning_rate": 5.0384852639856706e-05, + "loss": 0.7156, + "step": 155820 + }, + { + "epoch": 0.9955534543781863, + "grad_norm": 0.878885805606842, + "learning_rate": 5.037983510155791e-05, + "loss": 0.7061, + "step": 155830 + }, + { + "epoch": 0.995617341527925, + "grad_norm": 0.9723998308181763, + "learning_rate": 5.037481755943385e-05, + "loss": 1.1002, + "step": 155840 + }, + { + "epoch": 0.9956812286776637, + "grad_norm": 0.8004569411277771, + "learning_rate": 5.036980001353504e-05, + "loss": 0.7974, + "step": 155850 + }, + { + "epoch": 0.9957451158274024, + "grad_norm": 0.9547368288040161, + "learning_rate": 5.036478246391203e-05, + "loss": 1.1055, + "step": 155860 + }, + { + "epoch": 0.9958090029771411, + "grad_norm": 1.7963746786117554, + "learning_rate": 5.035976491061535e-05, + "loss": 0.9589, + "step": 155870 + }, + { + "epoch": 0.9958728901268799, + "grad_norm": 0.7605364918708801, + "learning_rate": 5.035474735369552e-05, + "loss": 1.0215, + "step": 155880 + }, + { + "epoch": 0.9959367772766186, + "grad_norm": 0.868885338306427, + "learning_rate": 5.0349729793203085e-05, + "loss": 1.1309, + "step": 155890 + }, + { + "epoch": 0.9960006644263573, + "grad_norm": 0.8125676512718201, + "learning_rate": 5.034471222918856e-05, + "loss": 0.7623, + "step": 155900 + }, + { + "epoch": 0.996064551576096, + "grad_norm": 0.9288298487663269, + "learning_rate": 5.033969466170248e-05, + "loss": 0.9507, + "step": 155910 + }, + { + "epoch": 0.9961284387258347, + "grad_norm": 1.1834713220596313, + "learning_rate": 5.033467709079539e-05, + "loss": 0.8471, + "step": 155920 + }, + { + "epoch": 0.9961923258755734, + "grad_norm": 0.5627526640892029, + "learning_rate": 5.032965951651781e-05, + "loss": 0.8455, + "step": 155930 + }, + { + "epoch": 0.9962562130253121, + "grad_norm": 1.100093126296997, + "learning_rate": 5.032464193892028e-05, + "loss": 0.7466, + "step": 155940 + }, + { + "epoch": 0.9963201001750508, + "grad_norm": 1.0520758628845215, + "learning_rate": 5.031962435805332e-05, + "loss": 0.8332, + "step": 155950 + }, + { + "epoch": 0.9963839873247895, + "grad_norm": 1.0780564546585083, + "learning_rate": 5.0314606773967456e-05, + "loss": 1.1039, + "step": 155960 + }, + { + "epoch": 0.9964478744745282, + "grad_norm": 2.222808361053467, + "learning_rate": 5.0309589186713235e-05, + "loss": 0.8462, + "step": 155970 + }, + { + "epoch": 0.9965117616242669, + "grad_norm": 0.8251408338546753, + "learning_rate": 5.030457159634118e-05, + "loss": 0.7902, + "step": 155980 + }, + { + "epoch": 0.9965756487740056, + "grad_norm": 0.9923737645149231, + "learning_rate": 5.029955400290183e-05, + "loss": 0.9637, + "step": 155990 + }, + { + "epoch": 0.9966395359237443, + "grad_norm": 0.7754630446434021, + "learning_rate": 5.029453640644571e-05, + "loss": 1.025, + "step": 156000 + }, + { + "epoch": 0.996703423073483, + "grad_norm": 0.49166470766067505, + "learning_rate": 5.028951880702336e-05, + "loss": 0.709, + "step": 156010 + }, + { + "epoch": 0.9967673102232217, + "grad_norm": 0.7618236541748047, + "learning_rate": 5.028450120468531e-05, + "loss": 0.9728, + "step": 156020 + }, + { + "epoch": 0.9968311973729604, + "grad_norm": 2.817028760910034, + "learning_rate": 5.027948359948209e-05, + "loss": 0.9221, + "step": 156030 + }, + { + "epoch": 0.9968950845226991, + "grad_norm": 0.8261348009109497, + "learning_rate": 5.027446599146421e-05, + 
"loss": 0.9886, + "step": 156040 + }, + { + "epoch": 0.9969589716724379, + "grad_norm": 0.749161958694458, + "learning_rate": 5.026944838068223e-05, + "loss": 0.9215, + "step": 156050 + }, + { + "epoch": 0.9970228588221766, + "grad_norm": 1.2337532043457031, + "learning_rate": 5.026443076718666e-05, + "loss": 0.9563, + "step": 156060 + }, + { + "epoch": 0.9970867459719152, + "grad_norm": 0.6758466362953186, + "learning_rate": 5.0259413151028066e-05, + "loss": 0.6758, + "step": 156070 + }, + { + "epoch": 0.9971506331216539, + "grad_norm": 1.36204993724823, + "learning_rate": 5.0254395532256935e-05, + "loss": 0.948, + "step": 156080 + }, + { + "epoch": 0.9972145202713926, + "grad_norm": 0.7348865866661072, + "learning_rate": 5.0249377910923834e-05, + "loss": 0.9833, + "step": 156090 + }, + { + "epoch": 0.9972784074211313, + "grad_norm": 0.6744032502174377, + "learning_rate": 5.0244360287079287e-05, + "loss": 0.9639, + "step": 156100 + }, + { + "epoch": 0.99734229457087, + "grad_norm": 0.9378622174263, + "learning_rate": 5.0239342660773804e-05, + "loss": 0.8584, + "step": 156110 + }, + { + "epoch": 0.9974061817206087, + "grad_norm": 1.2324236631393433, + "learning_rate": 5.023432503205794e-05, + "loss": 0.9609, + "step": 156120 + }, + { + "epoch": 0.9974700688703474, + "grad_norm": 0.9098687171936035, + "learning_rate": 5.0229307400982215e-05, + "loss": 1.008, + "step": 156130 + }, + { + "epoch": 0.9975339560200861, + "grad_norm": 0.9950636029243469, + "learning_rate": 5.0224289767597164e-05, + "loss": 0.892, + "step": 156140 + }, + { + "epoch": 0.9975978431698248, + "grad_norm": 0.6629953980445862, + "learning_rate": 5.021927213195333e-05, + "loss": 0.7111, + "step": 156150 + }, + { + "epoch": 0.9976617303195635, + "grad_norm": 0.7785073518753052, + "learning_rate": 5.021425449410123e-05, + "loss": 0.7638, + "step": 156160 + }, + { + "epoch": 0.9977256174693022, + "grad_norm": 0.6204543113708496, + "learning_rate": 5.0209236854091414e-05, + "loss": 0.8849, + "step": 156170 + }, + { + "epoch": 0.9977895046190409, + "grad_norm": 0.7661494016647339, + "learning_rate": 5.020421921197439e-05, + "loss": 0.7041, + "step": 156180 + }, + { + "epoch": 0.9978533917687796, + "grad_norm": 1.0413057804107666, + "learning_rate": 5.0199201567800704e-05, + "loss": 0.8334, + "step": 156190 + }, + { + "epoch": 0.9979172789185183, + "grad_norm": 1.1320688724517822, + "learning_rate": 5.0194183921620895e-05, + "loss": 0.8678, + "step": 156200 + }, + { + "epoch": 0.997981166068257, + "grad_norm": 1.0221413373947144, + "learning_rate": 5.0189166273485476e-05, + "loss": 0.7552, + "step": 156210 + }, + { + "epoch": 0.9980450532179957, + "grad_norm": 0.6623107194900513, + "learning_rate": 5.018414862344499e-05, + "loss": 0.8624, + "step": 156220 + }, + { + "epoch": 0.9981089403677345, + "grad_norm": 1.305783748626709, + "learning_rate": 5.017913097154997e-05, + "loss": 0.8993, + "step": 156230 + }, + { + "epoch": 0.9981728275174732, + "grad_norm": 1.0739842653274536, + "learning_rate": 5.017411331785094e-05, + "loss": 0.8799, + "step": 156240 + }, + { + "epoch": 0.9982367146672119, + "grad_norm": 0.6890377998352051, + "learning_rate": 5.016909566239846e-05, + "loss": 0.777, + "step": 156250 + }, + { + "epoch": 0.9983006018169506, + "grad_norm": 1.3199329376220703, + "learning_rate": 5.016407800524302e-05, + "loss": 1.0892, + "step": 156260 + }, + { + "epoch": 0.9983644889666893, + "grad_norm": 1.2792729139328003, + "learning_rate": 5.015906034643517e-05, + "loss": 0.7636, + "step": 156270 + }, + { + "epoch": 
0.998428376116428, + "grad_norm": 1.0154145956039429, + "learning_rate": 5.015404268602547e-05, + "loss": 1.0007, + "step": 156280 + }, + { + "epoch": 0.9984922632661667, + "grad_norm": 1.130146861076355, + "learning_rate": 5.014902502406441e-05, + "loss": 0.8497, + "step": 156290 + }, + { + "epoch": 0.9985561504159054, + "grad_norm": 0.9147141575813293, + "learning_rate": 5.014400736060252e-05, + "loss": 1.0122, + "step": 156300 + }, + { + "epoch": 0.998620037565644, + "grad_norm": 2.783782720565796, + "learning_rate": 5.013898969569038e-05, + "loss": 1.0387, + "step": 156310 + }, + { + "epoch": 0.9986839247153827, + "grad_norm": 1.1431195735931396, + "learning_rate": 5.013397202937847e-05, + "loss": 1.1868, + "step": 156320 + }, + { + "epoch": 0.9987478118651214, + "grad_norm": 0.596916139125824, + "learning_rate": 5.0128954361717365e-05, + "loss": 0.841, + "step": 156330 + }, + { + "epoch": 0.9988116990148601, + "grad_norm": 0.7760692238807678, + "learning_rate": 5.0123936692757566e-05, + "loss": 0.821, + "step": 156340 + }, + { + "epoch": 0.9988755861645988, + "grad_norm": 0.8499806523323059, + "learning_rate": 5.011891902254963e-05, + "loss": 1.0573, + "step": 156350 + }, + { + "epoch": 0.9989394733143375, + "grad_norm": 0.9884029626846313, + "learning_rate": 5.0113901351144065e-05, + "loss": 1.0637, + "step": 156360 + }, + { + "epoch": 0.9990033604640762, + "grad_norm": 0.7256179451942444, + "learning_rate": 5.0108883678591424e-05, + "loss": 1.0313, + "step": 156370 + }, + { + "epoch": 0.9990672476138149, + "grad_norm": 1.0645331144332886, + "learning_rate": 5.010386600494222e-05, + "loss": 1.0018, + "step": 156380 + }, + { + "epoch": 0.9991311347635536, + "grad_norm": 1.1328057050704956, + "learning_rate": 5.0098848330247006e-05, + "loss": 0.9414, + "step": 156390 + }, + { + "epoch": 0.9991950219132923, + "grad_norm": 0.8693520426750183, + "learning_rate": 5.00938306545563e-05, + "loss": 0.7726, + "step": 156400 + }, + { + "epoch": 0.999258909063031, + "grad_norm": 1.004093050956726, + "learning_rate": 5.008881297792063e-05, + "loss": 0.7736, + "step": 156410 + }, + { + "epoch": 0.9993227962127698, + "grad_norm": 0.7669504284858704, + "learning_rate": 5.008379530039055e-05, + "loss": 1.1268, + "step": 156420 + }, + { + "epoch": 0.9993866833625085, + "grad_norm": 0.9872044324874878, + "learning_rate": 5.007877762201657e-05, + "loss": 0.7277, + "step": 156430 + }, + { + "epoch": 0.9994505705122472, + "grad_norm": 0.8186001181602478, + "learning_rate": 5.007375994284923e-05, + "loss": 0.9602, + "step": 156440 + }, + { + "epoch": 0.9995144576619859, + "grad_norm": 0.840390145778656, + "learning_rate": 5.006874226293907e-05, + "loss": 0.8884, + "step": 156450 + }, + { + "epoch": 0.9995783448117246, + "grad_norm": 1.0728955268859863, + "learning_rate": 5.0063724582336614e-05, + "loss": 1.0428, + "step": 156460 + }, + { + "epoch": 0.9996422319614633, + "grad_norm": 0.8296906352043152, + "learning_rate": 5.005870690109239e-05, + "loss": 0.7635, + "step": 156470 + }, + { + "epoch": 0.999706119111202, + "grad_norm": 0.9049399495124817, + "learning_rate": 5.0053689219256946e-05, + "loss": 0.7707, + "step": 156480 + }, + { + "epoch": 0.9997700062609407, + "grad_norm": 0.6958884000778198, + "learning_rate": 5.0048671536880797e-05, + "loss": 0.6962, + "step": 156490 + }, + { + "epoch": 0.9998338934106794, + "grad_norm": 1.1141598224639893, + "learning_rate": 5.0043653854014486e-05, + "loss": 0.6017, + "step": 156500 + }, + { + "epoch": 0.9998977805604181, + "grad_norm": 1.1344066858291626, + 
"learning_rate": 5.0038636170708544e-05, + "loss": 0.9523, + "step": 156510 + }, + { + "epoch": 0.9999616677101568, + "grad_norm": 0.8506420850753784, + "learning_rate": 5.003361848701351e-05, + "loss": 0.7334, + "step": 156520 + } + ], + "logging_steps": 10, + "max_steps": 313052, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.791753270791291e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}