diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,38357 +1,80721 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.9998174182946868, + "epoch": 2.0, "eval_steps": 500, - "global_step": 5476, + "global_step": 11528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0003651634106262553, - "grad_norm": 22.664087295532227, - "learning_rate": 3.6496350364963505e-08, - "loss": 2.4666, + "epoch": 0.00017349063150589867, + "grad_norm": 6.243800640106201, + "learning_rate": 1.7346053772766696e-08, + "loss": 1.7671, "step": 1 }, { - "epoch": 0.0007303268212525105, - "grad_norm": 50.189918518066406, - "learning_rate": 7.299270072992701e-08, - "loss": 2.8521, + "epoch": 0.00034698126301179735, + "grad_norm": 8.843234062194824, + "learning_rate": 3.469210754553339e-08, + "loss": 1.6699, "step": 2 }, { - "epoch": 0.0010954902318787657, - "grad_norm": 37.17690658569336, - "learning_rate": 1.0948905109489053e-07, - "loss": 2.6934, + "epoch": 0.000520471894517696, + "grad_norm": 7.949714660644531, + "learning_rate": 5.203816131830009e-08, + "loss": 1.8369, "step": 3 }, { - "epoch": 0.001460653642505021, - "grad_norm": 31.44289207458496, - "learning_rate": 1.4598540145985402e-07, - "loss": 2.4824, + "epoch": 0.0006939625260235947, + "grad_norm": 6.156651973724365, + "learning_rate": 6.938421509106678e-08, + "loss": 1.646, "step": 4 }, { - "epoch": 0.0018258170531312763, - "grad_norm": 37.50605392456055, - "learning_rate": 1.8248175182481753e-07, - "loss": 2.9624, + "epoch": 0.0008674531575294934, + "grad_norm": 11.775126457214355, + "learning_rate": 8.673026886383348e-08, + "loss": 1.5864, "step": 5 }, { - "epoch": 0.0021909804637575314, - "grad_norm": 32.78224563598633, - "learning_rate": 2.1897810218978106e-07, - "loss": 2.7788, + "epoch": 0.001040943789035392, + "grad_norm": 7.592729568481445, + "learning_rate": 1.0407632263660019e-07, + "loss": 1.647, "step": 6 }, { - "epoch": 0.0025561438743837866, - "grad_norm": 47.43293380737305, - "learning_rate": 2.5547445255474454e-07, - "loss": 3.3416, + "epoch": 0.0012144344205412907, + "grad_norm": 10.206687927246094, + "learning_rate": 1.214223764093669e-07, + "loss": 1.6113, "step": 7 }, { - "epoch": 0.002921307285010042, - "grad_norm": 29.625720977783203, - "learning_rate": 2.9197080291970804e-07, - "loss": 2.7476, + "epoch": 0.0013879250520471894, + "grad_norm": 11.967615127563477, + "learning_rate": 1.3876843018213356e-07, + "loss": 1.5537, "step": 8 }, { - "epoch": 0.0032864706956362974, - "grad_norm": 44.875179290771484, - "learning_rate": 3.284671532846716e-07, - "loss": 2.8389, + "epoch": 0.001561415683553088, + "grad_norm": 8.581392288208008, + "learning_rate": 1.5611448395490029e-07, + "loss": 1.7715, "step": 9 }, { - "epoch": 0.0036516341062625525, - "grad_norm": 40.204383850097656, - "learning_rate": 3.6496350364963505e-07, - "loss": 3.0732, + "epoch": 0.0017349063150589867, + "grad_norm": 9.627989768981934, + "learning_rate": 1.7346053772766696e-07, + "loss": 1.5542, "step": 10 }, { - "epoch": 0.004016797516888808, - "grad_norm": 45.2133674621582, - "learning_rate": 4.0145985401459856e-07, - "loss": 3.1572, + "epoch": 0.0019083969465648854, + "grad_norm": 15.130707740783691, + "learning_rate": 1.9080659150043368e-07, + "loss": 1.6182, "step": 11 }, { - "epoch": 0.004381960927515063, - "grad_norm": 45.784915924072266, - "learning_rate": 4.379562043795621e-07, - "loss": 3.2739, + "epoch": 0.002081887578070784, + "grad_norm": 7.39605188369751, + "learning_rate": 2.0815264527320037e-07, + "loss": 1.6445, "step": 12 }, { - "epoch": 0.004747124338141318, - "grad_norm": 36.59327697753906, - "learning_rate": 4.7445255474452557e-07, - "loss": 2.8096, + "epoch": 0.002255378209576683, + "grad_norm": 8.058740615844727, + "learning_rate": 2.2549869904596704e-07, + "loss": 1.6509, "step": 13 }, { - "epoch": 0.005112287748767573, - "grad_norm": 28.56315803527832, - "learning_rate": 5.109489051094891e-07, - "loss": 2.5127, + "epoch": 0.0024288688410825814, + "grad_norm": 8.374455451965332, + "learning_rate": 2.428447528187338e-07, + "loss": 1.6436, "step": 14 }, { - "epoch": 0.005477451159393828, - "grad_norm": 38.90397644042969, - "learning_rate": 5.474452554744526e-07, - "loss": 2.814, + "epoch": 0.0026023594725884803, + "grad_norm": 9.052467346191406, + "learning_rate": 2.6019080659150043e-07, + "loss": 1.5425, "step": 15 }, { - "epoch": 0.005842614570020084, - "grad_norm": 40.58452606201172, - "learning_rate": 5.839416058394161e-07, - "loss": 2.8396, + "epoch": 0.002775850104094379, + "grad_norm": 8.19106388092041, + "learning_rate": 2.7753686036426713e-07, + "loss": 1.6802, "step": 16 }, { - "epoch": 0.0062077779806463396, - "grad_norm": 33.672916412353516, - "learning_rate": 6.204379562043796e-07, - "loss": 2.6477, + "epoch": 0.0029493407356002777, + "grad_norm": 8.290786743164062, + "learning_rate": 2.948829141370339e-07, + "loss": 1.7285, "step": 17 }, { - "epoch": 0.006572941391272595, - "grad_norm": 45.91011428833008, - "learning_rate": 6.569343065693432e-07, - "loss": 2.8613, + "epoch": 0.003122831367106176, + "grad_norm": 9.949291229248047, + "learning_rate": 3.1222896790980057e-07, + "loss": 1.5986, "step": 18 }, { - "epoch": 0.00693810480189885, - "grad_norm": 40.550048828125, - "learning_rate": 6.934306569343066e-07, - "loss": 2.8459, + "epoch": 0.003296321998612075, + "grad_norm": 6.864412307739258, + "learning_rate": 3.295750216825672e-07, + "loss": 1.8242, "step": 19 }, { - "epoch": 0.007303268212525105, - "grad_norm": 57.86399841308594, - "learning_rate": 7.299270072992701e-07, - "loss": 2.8582, + "epoch": 0.0034698126301179735, + "grad_norm": 8.50459098815918, + "learning_rate": 3.469210754553339e-07, + "loss": 1.6025, "step": 20 }, { - "epoch": 0.00766843162315136, - "grad_norm": 30.545454025268555, - "learning_rate": 7.664233576642337e-07, - "loss": 2.6929, + "epoch": 0.0036433032616238724, + "grad_norm": 6.432565212249756, + "learning_rate": 3.642671292281006e-07, + "loss": 1.7651, "step": 21 }, { - "epoch": 0.008033595033777615, - "grad_norm": 41.42045974731445, - "learning_rate": 8.029197080291971e-07, - "loss": 3.0132, + "epoch": 0.003816793893129771, + "grad_norm": 8.177473068237305, + "learning_rate": 3.8161318300086735e-07, + "loss": 1.6484, "step": 22 }, { - "epoch": 0.00839875844440387, - "grad_norm": 38.678375244140625, - "learning_rate": 8.394160583941606e-07, - "loss": 2.9917, + "epoch": 0.003990284524635669, + "grad_norm": 9.605158805847168, + "learning_rate": 3.9895923677363405e-07, + "loss": 1.6401, "step": 23 }, { - "epoch": 0.008763921855030126, - "grad_norm": 23.337677001953125, - "learning_rate": 8.759124087591242e-07, - "loss": 2.4231, + "epoch": 0.004163775156141568, + "grad_norm": 8.90903377532959, + "learning_rate": 4.1630529054640075e-07, + "loss": 1.7539, "step": 24 }, { - "epoch": 0.009129085265656381, - "grad_norm": 37.222496032714844, - "learning_rate": 9.124087591240876e-07, - "loss": 2.8438, + "epoch": 0.004337265787647467, + "grad_norm": 8.919925689697266, + "learning_rate": 4.3365134431916744e-07, + "loss": 1.5815, "step": 25 }, { - "epoch": 0.009494248676282636, - "grad_norm": 41.70036315917969, - "learning_rate": 9.489051094890511e-07, - "loss": 2.7737, + "epoch": 0.004510756419153366, + "grad_norm": 7.012167930603027, + "learning_rate": 4.509973980919341e-07, + "loss": 1.6895, "step": 26 }, { - "epoch": 0.009859412086908891, - "grad_norm": 40.49250030517578, - "learning_rate": 9.854014598540146e-07, - "loss": 2.8972, + "epoch": 0.004684247050659264, + "grad_norm": 9.661568641662598, + "learning_rate": 4.683434518647008e-07, + "loss": 1.9082, "step": 27 }, { - "epoch": 0.010224575497535146, - "grad_norm": 44.20450973510742, - "learning_rate": 1.0218978102189781e-06, - "loss": 2.8682, + "epoch": 0.004857737682165163, + "grad_norm": 5.005364418029785, + "learning_rate": 4.856895056374676e-07, + "loss": 1.6494, "step": 28 }, { - "epoch": 0.010589738908161402, - "grad_norm": 42.87916564941406, - "learning_rate": 1.0583941605839416e-06, - "loss": 2.8926, + "epoch": 0.005031228313671062, + "grad_norm": 7.4370551109313965, + "learning_rate": 5.030355594102343e-07, + "loss": 1.6445, "step": 29 }, { - "epoch": 0.010954902318787657, - "grad_norm": 44.111724853515625, - "learning_rate": 1.0948905109489052e-06, - "loss": 2.9465, + "epoch": 0.005204718945176961, + "grad_norm": 5.102890491485596, + "learning_rate": 5.203816131830009e-07, + "loss": 1.6558, "step": 30 }, { - "epoch": 0.011320065729413912, - "grad_norm": 30.04745101928711, - "learning_rate": 1.1313868613138687e-06, - "loss": 2.8755, + "epoch": 0.005378209576682859, + "grad_norm": 8.47376537322998, + "learning_rate": 5.377276669557676e-07, + "loss": 1.6699, "step": 31 }, { - "epoch": 0.011685229140040169, - "grad_norm": 35.79295349121094, - "learning_rate": 1.1678832116788322e-06, - "loss": 2.6541, + "epoch": 0.005551700208188758, + "grad_norm": 8.402555465698242, + "learning_rate": 5.550737207285343e-07, + "loss": 1.5796, "step": 32 }, { - "epoch": 0.012050392550666424, - "grad_norm": 36.13063430786133, - "learning_rate": 1.2043795620437959e-06, - "loss": 2.9041, + "epoch": 0.0057251908396946565, + "grad_norm": 6.561102867126465, + "learning_rate": 5.72419774501301e-07, + "loss": 1.6782, "step": 33 }, { - "epoch": 0.012415555961292679, - "grad_norm": 47.278839111328125, - "learning_rate": 1.2408759124087592e-06, - "loss": 2.7544, + "epoch": 0.005898681471200555, + "grad_norm": 7.251779556274414, + "learning_rate": 5.897658282740678e-07, + "loss": 1.5747, "step": 34 }, { - "epoch": 0.012780719371918934, - "grad_norm": 46.186214447021484, - "learning_rate": 1.2773722627737229e-06, - "loss": 2.7949, + "epoch": 0.006072172102706454, + "grad_norm": 9.628448486328125, + "learning_rate": 6.071118820468344e-07, + "loss": 1.584, "step": 35 }, { - "epoch": 0.01314588278254519, - "grad_norm": 38.55563735961914, - "learning_rate": 1.3138686131386864e-06, - "loss": 2.78, + "epoch": 0.006245662734212352, + "grad_norm": 8.048123359680176, + "learning_rate": 6.244579358196011e-07, + "loss": 1.5669, "step": 36 }, { - "epoch": 0.013511046193171445, - "grad_norm": 35.2288932800293, - "learning_rate": 1.3503649635036497e-06, - "loss": 2.5869, + "epoch": 0.006419153365718251, + "grad_norm": 6.632225036621094, + "learning_rate": 6.418039895923677e-07, + "loss": 1.7031, "step": 37 }, { - "epoch": 0.0138762096037977, - "grad_norm": 47.110042572021484, - "learning_rate": 1.3868613138686132e-06, - "loss": 2.7153, + "epoch": 0.00659264399722415, + "grad_norm": 9.393420219421387, + "learning_rate": 6.591500433651344e-07, + "loss": 1.582, "step": 38 }, { - "epoch": 0.014241373014423955, - "grad_norm": 17.50743865966797, - "learning_rate": 1.4233576642335767e-06, - "loss": 2.4653, + "epoch": 0.006766134628730049, + "grad_norm": 9.881068229675293, + "learning_rate": 6.764960971379011e-07, + "loss": 1.5913, "step": 39 }, { - "epoch": 0.01460653642505021, - "grad_norm": 39.75058364868164, - "learning_rate": 1.4598540145985402e-06, - "loss": 2.7083, + "epoch": 0.006939625260235947, + "grad_norm": 8.628946304321289, + "learning_rate": 6.938421509106678e-07, + "loss": 1.6616, "step": 40 }, { - "epoch": 0.014971699835676465, - "grad_norm": 39.014617919921875, - "learning_rate": 1.496350364963504e-06, - "loss": 2.7454, + "epoch": 0.007113115891741846, + "grad_norm": 7.260252475738525, + "learning_rate": 7.111882046834345e-07, + "loss": 1.5986, "step": 41 }, { - "epoch": 0.01533686324630272, - "grad_norm": 49.731136322021484, - "learning_rate": 1.5328467153284674e-06, - "loss": 2.7866, + "epoch": 0.007286606523247745, + "grad_norm": 8.120828628540039, + "learning_rate": 7.285342584562012e-07, + "loss": 1.6079, "step": 42 }, { - "epoch": 0.015702026656928977, - "grad_norm": 31.08563232421875, - "learning_rate": 1.5693430656934307e-06, - "loss": 2.3948, + "epoch": 0.007460097154753644, + "grad_norm": 7.6855998039245605, + "learning_rate": 7.45880312228968e-07, + "loss": 1.5996, "step": 43 }, { - "epoch": 0.01606719006755523, - "grad_norm": 30.714595794677734, - "learning_rate": 1.6058394160583942e-06, - "loss": 2.4863, + "epoch": 0.007633587786259542, + "grad_norm": 8.822908401489258, + "learning_rate": 7.632263660017347e-07, + "loss": 1.5684, "step": 44 }, { - "epoch": 0.016432353478181488, - "grad_norm": 14.033282279968262, - "learning_rate": 1.6423357664233577e-06, - "loss": 2.2781, + "epoch": 0.007807078417765441, + "grad_norm": 7.261823654174805, + "learning_rate": 7.805724197745014e-07, + "loss": 1.6343, "step": 45 }, { - "epoch": 0.01679751688880774, - "grad_norm": 24.680465698242188, - "learning_rate": 1.6788321167883212e-06, - "loss": 2.3298, + "epoch": 0.007980569049271339, + "grad_norm": 9.056549072265625, + "learning_rate": 7.979184735472681e-07, + "loss": 1.6362, "step": 46 }, { - "epoch": 0.017162680299433998, - "grad_norm": 26.36433982849121, - "learning_rate": 1.715328467153285e-06, - "loss": 2.416, + "epoch": 0.008154059680777238, + "grad_norm": 7.600607872009277, + "learning_rate": 8.152645273200348e-07, + "loss": 1.5352, "step": 47 }, { - "epoch": 0.01752784371006025, - "grad_norm": 29.39566421508789, - "learning_rate": 1.7518248175182485e-06, - "loss": 2.4373, + "epoch": 0.008327550312283136, + "grad_norm": 7.822445392608643, + "learning_rate": 8.326105810928015e-07, + "loss": 1.5522, "step": 48 }, { - "epoch": 0.01789300712068651, - "grad_norm": 29.02555274963379, - "learning_rate": 1.788321167883212e-06, - "loss": 2.3955, + "epoch": 0.008501040943789036, + "grad_norm": 5.54330587387085, + "learning_rate": 8.499566348655682e-07, + "loss": 1.5908, "step": 49 }, { - "epoch": 0.018258170531312762, - "grad_norm": 22.144216537475586, - "learning_rate": 1.8248175182481753e-06, - "loss": 2.3511, + "epoch": 0.008674531575294934, + "grad_norm": 10.899407386779785, + "learning_rate": 8.673026886383349e-07, + "loss": 1.5698, "step": 50 }, { - "epoch": 0.01862333394193902, - "grad_norm": 27.588821411132812, - "learning_rate": 1.8613138686131388e-06, - "loss": 2.3667, + "epoch": 0.008848022206800832, + "grad_norm": 5.691441535949707, + "learning_rate": 8.846487424111015e-07, + "loss": 1.6909, "step": 51 }, { - "epoch": 0.018988497352565272, - "grad_norm": 8.287466049194336, - "learning_rate": 1.8978102189781023e-06, - "loss": 2.2056, + "epoch": 0.009021512838306732, + "grad_norm": 8.123140335083008, + "learning_rate": 9.019947961838682e-07, + "loss": 1.5615, "step": 52 }, { - "epoch": 0.01935366076319153, - "grad_norm": 24.864910125732422, - "learning_rate": 1.934306569343066e-06, - "loss": 2.3606, + "epoch": 0.00919500346981263, + "grad_norm": 8.133471488952637, + "learning_rate": 9.193408499566349e-07, + "loss": 1.6162, "step": 53 }, { - "epoch": 0.019718824173817782, - "grad_norm": 7.857970714569092, - "learning_rate": 1.9708029197080293e-06, - "loss": 2.2024, + "epoch": 0.009368494101318528, + "grad_norm": 6.657607078552246, + "learning_rate": 9.366869037294016e-07, + "loss": 1.5752, "step": 54 }, { - "epoch": 0.02008398758444404, - "grad_norm": 20.672231674194336, - "learning_rate": 2.007299270072993e-06, - "loss": 2.3142, + "epoch": 0.009541984732824428, + "grad_norm": 8.637742042541504, + "learning_rate": 9.540329575021685e-07, + "loss": 1.6602, "step": 55 }, { - "epoch": 0.020449150995070293, - "grad_norm": 11.765204429626465, - "learning_rate": 2.0437956204379563e-06, - "loss": 2.2617, + "epoch": 0.009715475364330326, + "grad_norm": 8.159915924072266, + "learning_rate": 9.713790112749352e-07, + "loss": 1.6016, "step": 56 }, { - "epoch": 0.02081431440569655, - "grad_norm": 21.216463088989258, - "learning_rate": 2.08029197080292e-06, - "loss": 2.2339, + "epoch": 0.009888965995836226, + "grad_norm": 9.051621437072754, + "learning_rate": 9.887250650477019e-07, + "loss": 1.603, "step": 57 }, { - "epoch": 0.021179477816322803, - "grad_norm": 17.462955474853516, - "learning_rate": 2.1167883211678833e-06, - "loss": 2.3621, + "epoch": 0.010062456627342124, + "grad_norm": 8.7591552734375, + "learning_rate": 1.0060711188204686e-06, + "loss": 1.5386, "step": 58 }, { - "epoch": 0.02154464122694906, - "grad_norm": 23.96648597717285, - "learning_rate": 2.1532846715328466e-06, - "loss": 2.2866, + "epoch": 0.010235947258848022, + "grad_norm": 8.918630599975586, + "learning_rate": 1.0234171725932352e-06, + "loss": 1.6587, "step": 59 }, { - "epoch": 0.021909804637575313, - "grad_norm": 12.876380920410156, - "learning_rate": 2.1897810218978103e-06, - "loss": 2.2808, + "epoch": 0.010409437890353921, + "grad_norm": 9.020722389221191, + "learning_rate": 1.0407632263660017e-06, + "loss": 1.5215, "step": 60 }, { - "epoch": 0.02227496804820157, - "grad_norm": 18.42340850830078, - "learning_rate": 2.226277372262774e-06, - "loss": 2.2866, + "epoch": 0.01058292852185982, + "grad_norm": 7.6218109130859375, + "learning_rate": 1.0581092801387684e-06, + "loss": 1.5889, "step": 61 }, { - "epoch": 0.022640131458827824, - "grad_norm": 10.664287567138672, - "learning_rate": 2.2627737226277373e-06, - "loss": 2.1484, + "epoch": 0.010756419153365717, + "grad_norm": 5.183119773864746, + "learning_rate": 1.0754553339115351e-06, + "loss": 1.7412, "step": 62 }, { - "epoch": 0.02300529486945408, - "grad_norm": 5.376240253448486, - "learning_rate": 2.299270072992701e-06, - "loss": 2.0798, + "epoch": 0.010929909784871617, + "grad_norm": 7.608677864074707, + "learning_rate": 1.0928013876843018e-06, + "loss": 1.647, "step": 63 }, { - "epoch": 0.023370458280080338, - "grad_norm": 4.66656494140625, - "learning_rate": 2.3357664233576643e-06, - "loss": 2.0386, + "epoch": 0.011103400416377515, + "grad_norm": 7.287333011627197, + "learning_rate": 1.1101474414570685e-06, + "loss": 1.5063, "step": 64 }, { - "epoch": 0.02373562169070659, - "grad_norm": 4.532477378845215, - "learning_rate": 2.3722627737226276e-06, - "loss": 2.0461, + "epoch": 0.011276891047883415, + "grad_norm": 5.4141974449157715, + "learning_rate": 1.1274934952298352e-06, + "loss": 1.7095, "step": 65 }, { - "epoch": 0.024100785101332848, - "grad_norm": 4.635742664337158, - "learning_rate": 2.4087591240875918e-06, - "loss": 2.1047, + "epoch": 0.011450381679389313, + "grad_norm": 5.165591239929199, + "learning_rate": 1.144839549002602e-06, + "loss": 1.6538, "step": 66 }, { - "epoch": 0.0244659485119591, - "grad_norm": 4.122089385986328, - "learning_rate": 2.445255474452555e-06, - "loss": 2.0034, + "epoch": 0.011623872310895211, + "grad_norm": 7.284861087799072, + "learning_rate": 1.1621856027753688e-06, + "loss": 1.4585, "step": 67 }, { - "epoch": 0.024831111922585358, - "grad_norm": 4.436962127685547, - "learning_rate": 2.4817518248175183e-06, - "loss": 2.0808, + "epoch": 0.01179736294240111, + "grad_norm": 6.699769020080566, + "learning_rate": 1.1795316565481355e-06, + "loss": 1.5967, "step": 68 }, { - "epoch": 0.02519627533321161, - "grad_norm": 4.340993404388428, - "learning_rate": 2.518248175182482e-06, - "loss": 2.0745, + "epoch": 0.011970853573907009, + "grad_norm": 8.180192947387695, + "learning_rate": 1.1968777103209022e-06, + "loss": 1.5762, "step": 69 }, { - "epoch": 0.02556143874383787, - "grad_norm": 3.8762872219085693, - "learning_rate": 2.5547445255474458e-06, - "loss": 1.9814, + "epoch": 0.012144344205412909, + "grad_norm": 7.624974727630615, + "learning_rate": 1.214223764093669e-06, + "loss": 1.4902, "step": 70 }, { - "epoch": 0.025926602154464122, - "grad_norm": 4.280662536621094, - "learning_rate": 2.591240875912409e-06, - "loss": 2.0801, + "epoch": 0.012317834836918807, + "grad_norm": 6.223555564880371, + "learning_rate": 1.2315698178664356e-06, + "loss": 1.5142, "step": 71 }, { - "epoch": 0.02629176556509038, - "grad_norm": 4.360474586486816, - "learning_rate": 2.627737226277373e-06, - "loss": 1.9839, + "epoch": 0.012491325468424705, + "grad_norm": 5.306108474731445, + "learning_rate": 1.2489158716392023e-06, + "loss": 1.667, "step": 72 }, { - "epoch": 0.026656928975716632, - "grad_norm": 4.1049909591674805, - "learning_rate": 2.664233576642336e-06, - "loss": 1.9998, + "epoch": 0.012664816099930604, + "grad_norm": 6.075465679168701, + "learning_rate": 1.266261925411969e-06, + "loss": 1.5278, "step": 73 }, { - "epoch": 0.02702209238634289, - "grad_norm": 4.057889461517334, - "learning_rate": 2.7007299270072994e-06, - "loss": 2.0537, + "epoch": 0.012838306731436502, + "grad_norm": 8.141561508178711, + "learning_rate": 1.2836079791847355e-06, + "loss": 1.4546, "step": 74 }, { - "epoch": 0.027387255796969143, - "grad_norm": 4.610106468200684, - "learning_rate": 2.737226277372263e-06, - "loss": 2.0952, + "epoch": 0.0130117973629424, + "grad_norm": 4.308069229125977, + "learning_rate": 1.3009540329575024e-06, + "loss": 1.5483, "step": 75 }, { - "epoch": 0.0277524192075954, - "grad_norm": 3.848771572113037, - "learning_rate": 2.7737226277372264e-06, - "loss": 2.0066, + "epoch": 0.0131852879944483, + "grad_norm": 7.349766731262207, + "learning_rate": 1.3183000867302689e-06, + "loss": 1.4941, "step": 76 }, { - "epoch": 0.028117582618221653, - "grad_norm": 3.6226391792297363, - "learning_rate": 2.81021897810219e-06, - "loss": 2.0056, + "epoch": 0.013358778625954198, + "grad_norm": 5.778078556060791, + "learning_rate": 1.3356461405030358e-06, + "loss": 1.5425, "step": 77 }, { - "epoch": 0.02848274602884791, - "grad_norm": 3.898202896118164, - "learning_rate": 2.8467153284671534e-06, - "loss": 2.0696, + "epoch": 0.013532269257460098, + "grad_norm": 7.925509452819824, + "learning_rate": 1.3529921942758023e-06, + "loss": 1.4644, "step": 78 }, { - "epoch": 0.028847909439474163, - "grad_norm": 4.203927040100098, - "learning_rate": 2.8832116788321167e-06, - "loss": 1.9817, + "epoch": 0.013705759888965996, + "grad_norm": 5.942389011383057, + "learning_rate": 1.3703382480485692e-06, + "loss": 1.5977, "step": 79 }, { - "epoch": 0.02921307285010042, - "grad_norm": 4.142097473144531, - "learning_rate": 2.9197080291970804e-06, - "loss": 2.001, + "epoch": 0.013879250520471894, + "grad_norm": 6.470898628234863, + "learning_rate": 1.3876843018213356e-06, + "loss": 1.4902, "step": 80 }, { - "epoch": 0.029578236260726674, - "grad_norm": 3.5766549110412598, - "learning_rate": 2.956204379562044e-06, - "loss": 1.9392, + "epoch": 0.014052741151977794, + "grad_norm": 7.827963829040527, + "learning_rate": 1.4050303555941025e-06, + "loss": 1.4155, "step": 81 }, { - "epoch": 0.02994339967135293, - "grad_norm": 4.189686298370361, - "learning_rate": 2.992700729927008e-06, - "loss": 1.9783, + "epoch": 0.014226231783483692, + "grad_norm": 4.13720703125, + "learning_rate": 1.422376409366869e-06, + "loss": 1.4927, "step": 82 }, { - "epoch": 0.030308563081979184, - "grad_norm": 3.420266628265381, - "learning_rate": 3.029197080291971e-06, - "loss": 1.9788, + "epoch": 0.01439972241498959, + "grad_norm": 5.565674304962158, + "learning_rate": 1.439722463139636e-06, + "loss": 1.4414, "step": 83 }, { - "epoch": 0.03067372649260544, - "grad_norm": 3.294321298599243, - "learning_rate": 3.065693430656935e-06, - "loss": 1.9316, + "epoch": 0.01457321304649549, + "grad_norm": 7.0522541999816895, + "learning_rate": 1.4570685169124024e-06, + "loss": 1.5332, "step": 84 }, { - "epoch": 0.031038889903231698, - "grad_norm": 3.5636844635009766, - "learning_rate": 3.102189781021898e-06, - "loss": 1.9479, + "epoch": 0.014746703678001388, + "grad_norm": 7.5967535972595215, + "learning_rate": 1.4744145706851693e-06, + "loss": 1.4312, "step": 85 }, { - "epoch": 0.031404053313857955, - "grad_norm": 3.3803062438964844, - "learning_rate": 3.1386861313868614e-06, - "loss": 1.9905, + "epoch": 0.014920194309507287, + "grad_norm": 6.613969802856445, + "learning_rate": 1.491760624457936e-06, + "loss": 1.4678, "step": 86 }, { - "epoch": 0.031769216724484205, - "grad_norm": 3.2065281867980957, - "learning_rate": 3.175182481751825e-06, - "loss": 1.9639, + "epoch": 0.015093684941013185, + "grad_norm": 7.203578948974609, + "learning_rate": 1.5091066782307025e-06, + "loss": 1.418, "step": 87 }, { - "epoch": 0.03213438013511046, - "grad_norm": 3.341919183731079, - "learning_rate": 3.2116788321167884e-06, - "loss": 1.9897, + "epoch": 0.015267175572519083, + "grad_norm": 6.019166946411133, + "learning_rate": 1.5264527320034694e-06, + "loss": 1.4761, "step": 88 }, { - "epoch": 0.03249954354573672, - "grad_norm": 3.127070426940918, - "learning_rate": 3.248175182481752e-06, - "loss": 1.9363, + "epoch": 0.015440666204024983, + "grad_norm": 7.248750686645508, + "learning_rate": 1.543798785776236e-06, + "loss": 1.5898, "step": 89 }, { - "epoch": 0.032864706956362975, - "grad_norm": 3.10131573677063, - "learning_rate": 3.2846715328467155e-06, - "loss": 1.9265, + "epoch": 0.015614156835530881, + "grad_norm": 5.607358455657959, + "learning_rate": 1.5611448395490028e-06, + "loss": 1.4033, "step": 90 }, { - "epoch": 0.033229870366989225, - "grad_norm": 3.0348713397979736, - "learning_rate": 3.3211678832116788e-06, - "loss": 1.9036, + "epoch": 0.01578764746703678, + "grad_norm": 6.065204620361328, + "learning_rate": 1.5784908933217693e-06, + "loss": 1.313, "step": 91 }, { - "epoch": 0.03359503377761548, - "grad_norm": 3.5303900241851807, - "learning_rate": 3.3576642335766425e-06, - "loss": 1.9297, + "epoch": 0.015961138098542677, + "grad_norm": 5.404809951782227, + "learning_rate": 1.5958369470945362e-06, + "loss": 1.3682, "step": 92 }, { - "epoch": 0.03396019718824174, - "grad_norm": 2.5174522399902344, - "learning_rate": 3.3941605839416058e-06, - "loss": 1.8955, + "epoch": 0.01613462873004858, + "grad_norm": 4.453547954559326, + "learning_rate": 1.6131830008673027e-06, + "loss": 1.4365, "step": 93 }, { - "epoch": 0.034325360598867996, - "grad_norm": 2.674445867538452, - "learning_rate": 3.43065693430657e-06, - "loss": 1.8845, + "epoch": 0.016308119361554477, + "grad_norm": 3.278090238571167, + "learning_rate": 1.6305290546400696e-06, + "loss": 1.3657, "step": 94 }, { - "epoch": 0.034690524009494246, - "grad_norm": 2.7873001098632812, - "learning_rate": 3.467153284671533e-06, - "loss": 1.9026, + "epoch": 0.016481609993060375, + "grad_norm": 4.881364345550537, + "learning_rate": 1.647875108412836e-06, + "loss": 1.3027, "step": 95 }, { - "epoch": 0.0350556874201205, - "grad_norm": 2.4853591918945312, - "learning_rate": 3.503649635036497e-06, - "loss": 1.8462, + "epoch": 0.016655100624566273, + "grad_norm": 5.623530387878418, + "learning_rate": 1.665221162185603e-06, + "loss": 1.3516, "step": 96 }, { - "epoch": 0.03542085083074676, - "grad_norm": 2.3662006855010986, - "learning_rate": 3.54014598540146e-06, - "loss": 1.8833, + "epoch": 0.01682859125607217, + "grad_norm": 5.050769805908203, + "learning_rate": 1.6825672159583695e-06, + "loss": 1.4014, "step": 97 }, { - "epoch": 0.03578601424137302, - "grad_norm": 2.3720977306365967, - "learning_rate": 3.576642335766424e-06, - "loss": 1.8621, + "epoch": 0.017002081887578072, + "grad_norm": 4.636990070343018, + "learning_rate": 1.6999132697311364e-06, + "loss": 1.3936, "step": 98 }, { - "epoch": 0.03615117765199927, - "grad_norm": 2.456186532974243, - "learning_rate": 3.6131386861313872e-06, - "loss": 1.8274, + "epoch": 0.01717557251908397, + "grad_norm": 4.232283592224121, + "learning_rate": 1.717259323503903e-06, + "loss": 1.2778, "step": 99 }, { - "epoch": 0.036516341062625524, - "grad_norm": 2.3545117378234863, - "learning_rate": 3.6496350364963505e-06, - "loss": 1.8809, + "epoch": 0.01734906315058987, + "grad_norm": 5.0943522453308105, + "learning_rate": 1.7346053772766698e-06, + "loss": 1.3384, "step": 100 }, { - "epoch": 0.03688150447325178, - "grad_norm": 2.29801082611084, - "learning_rate": 3.6861313868613142e-06, - "loss": 1.8999, + "epoch": 0.017522553782095766, + "grad_norm": 4.834756851196289, + "learning_rate": 1.7519514310494365e-06, + "loss": 1.5537, "step": 101 }, { - "epoch": 0.03724666788387804, - "grad_norm": 1.9178146123886108, - "learning_rate": 3.7226277372262775e-06, - "loss": 1.7708, + "epoch": 0.017696044413601664, + "grad_norm": 6.545447826385498, + "learning_rate": 1.769297484822203e-06, + "loss": 1.3013, "step": 102 }, { - "epoch": 0.03761183129450429, - "grad_norm": 1.9274564981460571, - "learning_rate": 3.7591240875912412e-06, - "loss": 1.7783, + "epoch": 0.017869535045107566, + "grad_norm": 4.03840970993042, + "learning_rate": 1.7866435385949699e-06, + "loss": 1.3354, "step": 103 }, { - "epoch": 0.037976994705130544, - "grad_norm": 1.8388400077819824, - "learning_rate": 3.7956204379562045e-06, - "loss": 1.7793, + "epoch": 0.018043025676613464, + "grad_norm": 5.726967811584473, + "learning_rate": 1.8039895923677363e-06, + "loss": 1.2607, "step": 104 }, { - "epoch": 0.0383421581157568, - "grad_norm": 1.7276384830474854, - "learning_rate": 3.832116788321168e-06, - "loss": 1.7642, + "epoch": 0.018216516308119362, + "grad_norm": 3.582960605621338, + "learning_rate": 1.8213356461405032e-06, + "loss": 1.3374, "step": 105 }, { - "epoch": 0.03870732152638306, - "grad_norm": 1.737682819366455, - "learning_rate": 3.868613138686132e-06, - "loss": 1.7666, + "epoch": 0.01839000693962526, + "grad_norm": 3.3676187992095947, + "learning_rate": 1.8386816999132697e-06, + "loss": 1.416, "step": 106 }, { - "epoch": 0.039072484937009315, - "grad_norm": 2.1073811054229736, - "learning_rate": 3.905109489051096e-06, - "loss": 1.8352, + "epoch": 0.018563497571131158, + "grad_norm": 4.337418556213379, + "learning_rate": 1.8560277536860366e-06, + "loss": 1.3369, "step": 107 }, { - "epoch": 0.039437648347635565, - "grad_norm": 1.6888995170593262, - "learning_rate": 3.9416058394160585e-06, - "loss": 1.7949, + "epoch": 0.018736988202637056, + "grad_norm": 4.397348403930664, + "learning_rate": 1.8733738074588031e-06, + "loss": 1.2847, "step": 108 }, { - "epoch": 0.03980281175826182, - "grad_norm": 2.00239896774292, - "learning_rate": 3.978102189781022e-06, - "loss": 1.8623, + "epoch": 0.018910478834142957, + "grad_norm": 4.037115097045898, + "learning_rate": 1.89071986123157e-06, + "loss": 1.4497, "step": 109 }, { - "epoch": 0.04016797516888808, - "grad_norm": 1.8566627502441406, - "learning_rate": 4.014598540145986e-06, - "loss": 1.8457, + "epoch": 0.019083969465648856, + "grad_norm": 5.323732376098633, + "learning_rate": 1.908065915004337e-06, + "loss": 1.5273, "step": 110 }, { - "epoch": 0.040533138579514336, - "grad_norm": 1.8539583683013916, - "learning_rate": 4.05109489051095e-06, - "loss": 1.7996, + "epoch": 0.019257460097154754, + "grad_norm": 3.614849805831909, + "learning_rate": 1.925411968777103e-06, + "loss": 1.3804, "step": 111 }, { - "epoch": 0.040898301990140586, - "grad_norm": 1.692850112915039, - "learning_rate": 4.0875912408759126e-06, - "loss": 1.803, + "epoch": 0.01943095072866065, + "grad_norm": 4.579774379730225, + "learning_rate": 1.9427580225498703e-06, + "loss": 1.6113, "step": 112 }, { - "epoch": 0.04126346540076684, - "grad_norm": 1.717925786972046, - "learning_rate": 4.124087591240876e-06, - "loss": 1.8386, + "epoch": 0.01960444136016655, + "grad_norm": 3.760392189025879, + "learning_rate": 1.9601040763226366e-06, + "loss": 1.373, "step": 113 }, { - "epoch": 0.0416286288113931, - "grad_norm": 1.7031697034835815, - "learning_rate": 4.16058394160584e-06, - "loss": 1.8782, + "epoch": 0.01977793199167245, + "grad_norm": 3.529853105545044, + "learning_rate": 1.9774501300954037e-06, + "loss": 1.4487, "step": 114 }, { - "epoch": 0.041993792222019356, - "grad_norm": 1.870545506477356, - "learning_rate": 4.197080291970803e-06, - "loss": 1.8462, + "epoch": 0.01995142262317835, + "grad_norm": 3.8870127201080322, + "learning_rate": 1.99479618386817e-06, + "loss": 1.3779, "step": 115 }, { - "epoch": 0.042358955632645606, - "grad_norm": 1.9429596662521362, - "learning_rate": 4.233576642335767e-06, - "loss": 1.7712, + "epoch": 0.020124913254684247, + "grad_norm": 3.8887150287628174, + "learning_rate": 2.012142237640937e-06, + "loss": 1.3677, "step": 116 }, { - "epoch": 0.04272411904327186, - "grad_norm": 2.0376460552215576, - "learning_rate": 4.27007299270073e-06, - "loss": 1.7822, + "epoch": 0.020298403886190145, + "grad_norm": 4.131175994873047, + "learning_rate": 2.0294882914137034e-06, + "loss": 1.561, "step": 117 }, { - "epoch": 0.04308928245389812, - "grad_norm": 1.429470181465149, - "learning_rate": 4.306569343065693e-06, - "loss": 1.7421, + "epoch": 0.020471894517696043, + "grad_norm": 3.480945110321045, + "learning_rate": 2.0468343451864705e-06, + "loss": 1.3994, "step": 118 }, { - "epoch": 0.04345444586452438, - "grad_norm": 2.6520509719848633, - "learning_rate": 4.343065693430658e-06, - "loss": 1.8259, + "epoch": 0.020645385149201945, + "grad_norm": 3.1352598667144775, + "learning_rate": 2.0641803989592368e-06, + "loss": 1.3628, "step": 119 }, { - "epoch": 0.04381960927515063, - "grad_norm": 1.2009416818618774, - "learning_rate": 4.379562043795621e-06, - "loss": 1.7095, + "epoch": 0.020818875780707843, + "grad_norm": 4.520205974578857, + "learning_rate": 2.0815264527320035e-06, + "loss": 1.3491, "step": 120 }, { - "epoch": 0.044184772685776884, - "grad_norm": 1.7137818336486816, - "learning_rate": 4.416058394160584e-06, - "loss": 1.8005, + "epoch": 0.02099236641221374, + "grad_norm": 3.4987733364105225, + "learning_rate": 2.09887250650477e-06, + "loss": 1.2766, "step": 121 }, { - "epoch": 0.04454993609640314, - "grad_norm": 1.3915783166885376, - "learning_rate": 4.452554744525548e-06, - "loss": 1.7708, + "epoch": 0.02116585704371964, + "grad_norm": 3.274085760116577, + "learning_rate": 2.116218560277537e-06, + "loss": 1.2803, "step": 122 }, { - "epoch": 0.0449150995070294, - "grad_norm": 1.2666165828704834, - "learning_rate": 4.489051094890512e-06, - "loss": 1.7529, + "epoch": 0.021339347675225537, + "grad_norm": 5.043463706970215, + "learning_rate": 2.133564614050304e-06, + "loss": 1.4651, "step": 123 }, { - "epoch": 0.04528026291765565, - "grad_norm": 1.799212098121643, - "learning_rate": 4.525547445255475e-06, - "loss": 1.807, + "epoch": 0.021512838306731435, + "grad_norm": 4.889887809753418, + "learning_rate": 2.1509106678230702e-06, + "loss": 1.408, "step": 124 }, { - "epoch": 0.045645426328281904, - "grad_norm": 1.5205367803573608, - "learning_rate": 4.562043795620438e-06, - "loss": 1.7627, + "epoch": 0.021686328938237336, + "grad_norm": 3.0530457496643066, + "learning_rate": 2.1682567215958374e-06, + "loss": 1.2212, "step": 125 }, { - "epoch": 0.04601058973890816, - "grad_norm": 1.1480926275253296, - "learning_rate": 4.598540145985402e-06, - "loss": 1.7358, + "epoch": 0.021859819569743234, + "grad_norm": 2.1149747371673584, + "learning_rate": 2.1856027753686036e-06, + "loss": 1.1379, "step": 126 }, { - "epoch": 0.04637575314953442, - "grad_norm": 1.567167043685913, - "learning_rate": 4.635036496350365e-06, - "loss": 1.7571, + "epoch": 0.022033310201249132, + "grad_norm": 2.2608611583709717, + "learning_rate": 2.2029488291413708e-06, + "loss": 1.2104, "step": 127 }, { - "epoch": 0.046740916560160675, - "grad_norm": 1.3704636096954346, - "learning_rate": 4.671532846715329e-06, - "loss": 1.6951, + "epoch": 0.02220680083275503, + "grad_norm": 2.2134149074554443, + "learning_rate": 2.220294882914137e-06, + "loss": 1.1111, "step": 128 }, { - "epoch": 0.047106079970786925, - "grad_norm": 1.7180166244506836, - "learning_rate": 4.708029197080292e-06, - "loss": 1.751, + "epoch": 0.02238029146426093, + "grad_norm": 1.905030369758606, + "learning_rate": 2.237640936686904e-06, + "loss": 1.1289, "step": 129 }, { - "epoch": 0.04747124338141318, - "grad_norm": 1.6571511030197144, - "learning_rate": 4.744525547445255e-06, - "loss": 1.7498, + "epoch": 0.02255378209576683, + "grad_norm": 1.5522582530975342, + "learning_rate": 2.2549869904596704e-06, + "loss": 1.3647, "step": 130 }, { - "epoch": 0.04783640679203944, - "grad_norm": 1.1921205520629883, - "learning_rate": 4.78102189781022e-06, - "loss": 1.7214, + "epoch": 0.022727272727272728, + "grad_norm": 1.419764757156372, + "learning_rate": 2.2723330442324375e-06, + "loss": 1.2332, "step": 131 }, { - "epoch": 0.048201570202665696, - "grad_norm": 1.6451284885406494, - "learning_rate": 4.8175182481751835e-06, - "loss": 1.7898, + "epoch": 0.022900763358778626, + "grad_norm": 1.5006457567214966, + "learning_rate": 2.289679098005204e-06, + "loss": 1.2112, "step": 132 }, { - "epoch": 0.048566733613291946, - "grad_norm": 1.5454832315444946, - "learning_rate": 4.854014598540146e-06, - "loss": 1.7595, + "epoch": 0.023074253990284524, + "grad_norm": 1.4473450183868408, + "learning_rate": 2.3070251517779705e-06, + "loss": 1.0581, "step": 133 }, { - "epoch": 0.0489318970239182, - "grad_norm": 1.1816350221633911, - "learning_rate": 4.89051094890511e-06, - "loss": 1.6821, + "epoch": 0.023247744621790422, + "grad_norm": 1.392502784729004, + "learning_rate": 2.3243712055507376e-06, + "loss": 1.2295, "step": 134 }, { - "epoch": 0.04929706043454446, - "grad_norm": 1.2444313764572144, - "learning_rate": 4.927007299270074e-06, - "loss": 1.6516, + "epoch": 0.023421235253296323, + "grad_norm": 1.3306293487548828, + "learning_rate": 2.341717259323504e-06, + "loss": 1.2476, "step": 135 }, { - "epoch": 0.049662223845170717, - "grad_norm": 1.6849743127822876, - "learning_rate": 4.963503649635037e-06, - "loss": 1.7661, + "epoch": 0.02359472588480222, + "grad_norm": 1.2792822122573853, + "learning_rate": 2.359063313096271e-06, + "loss": 1.2253, "step": 136 }, { - "epoch": 0.050027387255796966, - "grad_norm": 1.5886048078536987, - "learning_rate": 5e-06, - "loss": 1.7241, + "epoch": 0.02376821651630812, + "grad_norm": 1.4925892353057861, + "learning_rate": 2.3764093668690373e-06, + "loss": 1.1433, "step": 137 }, { - "epoch": 0.05039255066642322, - "grad_norm": 2.7639904022216797, - "learning_rate": 5.036496350364964e-06, - "loss": 1.7209, + "epoch": 0.023941707147814018, + "grad_norm": 3.9102580547332764, + "learning_rate": 2.3937554206418044e-06, + "loss": 1.2437, "step": 138 }, { - "epoch": 0.05075771407704948, - "grad_norm": 1.509170651435852, - "learning_rate": 5.072992700729927e-06, - "loss": 1.7312, + "epoch": 0.024115197779319916, + "grad_norm": 1.4895426034927368, + "learning_rate": 2.4111014744145707e-06, + "loss": 1.2886, "step": 139 }, { - "epoch": 0.05112287748767574, - "grad_norm": 1.4473603963851929, - "learning_rate": 5.1094890510948916e-06, - "loss": 1.7122, + "epoch": 0.024288688410825817, + "grad_norm": 1.3456063270568848, + "learning_rate": 2.428447528187338e-06, + "loss": 1.2349, "step": 140 }, { - "epoch": 0.05148804089830199, - "grad_norm": 1.566351294517517, - "learning_rate": 5.1459854014598544e-06, - "loss": 1.6542, + "epoch": 0.024462179042331715, + "grad_norm": 1.1742236614227295, + "learning_rate": 2.445793581960104e-06, + "loss": 1.1858, "step": 141 }, { - "epoch": 0.051853204308928244, - "grad_norm": 1.073083758354187, - "learning_rate": 5.182481751824818e-06, - "loss": 1.6865, + "epoch": 0.024635669673837613, + "grad_norm": 1.3496001958847046, + "learning_rate": 2.463139635732871e-06, + "loss": 1.209, "step": 142 }, { - "epoch": 0.0522183677195545, - "grad_norm": 1.037796974182129, - "learning_rate": 5.218978102189781e-06, - "loss": 1.7324, + "epoch": 0.02480916030534351, + "grad_norm": 1.3690110445022583, + "learning_rate": 2.4804856895056375e-06, + "loss": 1.3113, "step": 143 }, { - "epoch": 0.05258353113018076, - "grad_norm": 1.011286973953247, - "learning_rate": 5.255474452554746e-06, - "loss": 1.665, + "epoch": 0.02498265093684941, + "grad_norm": 1.2551873922348022, + "learning_rate": 2.4978317432784046e-06, + "loss": 1.1169, "step": 144 }, { - "epoch": 0.05294869454080701, - "grad_norm": 1.460443139076233, - "learning_rate": 5.2919708029197084e-06, - "loss": 1.7573, + "epoch": 0.025156141568355307, + "grad_norm": 1.9081687927246094, + "learning_rate": 2.515177797051171e-06, + "loss": 1.0327, "step": 145 }, { - "epoch": 0.053313857951433265, - "grad_norm": 1.4593185186386108, - "learning_rate": 5.328467153284672e-06, - "loss": 1.7397, + "epoch": 0.02532963219986121, + "grad_norm": 1.3448957204818726, + "learning_rate": 2.532523850823938e-06, + "loss": 1.2668, "step": 146 }, { - "epoch": 0.05367902136205952, - "grad_norm": 1.0988737344741821, - "learning_rate": 5.364963503649635e-06, - "loss": 1.6663, + "epoch": 0.025503122831367107, + "grad_norm": 1.6748831272125244, + "learning_rate": 2.5498699045967047e-06, + "loss": 1.0354, "step": 147 }, { - "epoch": 0.05404418477268578, - "grad_norm": 1.0025031566619873, - "learning_rate": 5.401459854014599e-06, - "loss": 1.6848, + "epoch": 0.025676613462873005, + "grad_norm": 1.3172000646591187, + "learning_rate": 2.567215958369471e-06, + "loss": 1.0835, "step": 148 }, { - "epoch": 0.054409348183312035, - "grad_norm": 1.340061068534851, - "learning_rate": 5.437956204379562e-06, - "loss": 1.7, + "epoch": 0.025850104094378903, + "grad_norm": 1.501997947692871, + "learning_rate": 2.5845620121422376e-06, + "loss": 1.0356, "step": 149 }, { - "epoch": 0.054774511593938285, - "grad_norm": 1.350016474723816, - "learning_rate": 5.474452554744526e-06, - "loss": 1.6277, + "epoch": 0.0260235947258848, + "grad_norm": 1.328136682510376, + "learning_rate": 2.6019080659150048e-06, + "loss": 1.2046, "step": 150 }, { - "epoch": 0.05513967500456454, - "grad_norm": 1.3723485469818115, - "learning_rate": 5.51094890510949e-06, - "loss": 1.6353, + "epoch": 0.026197085357390702, + "grad_norm": 1.2275285720825195, + "learning_rate": 2.6192541196877714e-06, + "loss": 1.3462, "step": 151 }, { - "epoch": 0.0555048384151908, - "grad_norm": 1.1649401187896729, - "learning_rate": 5.547445255474453e-06, - "loss": 1.6956, + "epoch": 0.0263705759888966, + "grad_norm": 1.4004344940185547, + "learning_rate": 2.6366001734605377e-06, + "loss": 1.0352, "step": 152 }, { - "epoch": 0.055870001825817056, - "grad_norm": 1.1821953058242798, - "learning_rate": 5.5839416058394165e-06, - "loss": 1.6636, + "epoch": 0.0265440666204025, + "grad_norm": 1.1716245412826538, + "learning_rate": 2.6539462272333044e-06, + "loss": 1.1704, "step": 153 }, { - "epoch": 0.056235165236443306, - "grad_norm": 0.9762489199638367, - "learning_rate": 5.62043795620438e-06, - "loss": 1.6396, + "epoch": 0.026717557251908396, + "grad_norm": 1.3052043914794922, + "learning_rate": 2.6712922810060715e-06, + "loss": 1.1484, "step": 154 }, { - "epoch": 0.05660032864706956, - "grad_norm": 1.3453902006149292, - "learning_rate": 5.656934306569344e-06, - "loss": 1.6263, + "epoch": 0.026891047883414294, + "grad_norm": 1.442383885383606, + "learning_rate": 2.6886383347788382e-06, + "loss": 1.0791, "step": 155 }, { - "epoch": 0.05696549205769582, - "grad_norm": 1.0859270095825195, - "learning_rate": 5.693430656934307e-06, - "loss": 1.6206, + "epoch": 0.027064538514920196, + "grad_norm": 1.2842674255371094, + "learning_rate": 2.7059843885516045e-06, + "loss": 1.2466, "step": 156 }, { - "epoch": 0.05733065546832208, - "grad_norm": 1.057005524635315, - "learning_rate": 5.7299270072992705e-06, - "loss": 1.6428, + "epoch": 0.027238029146426094, + "grad_norm": 1.1651052236557007, + "learning_rate": 2.723330442324371e-06, + "loss": 1.2964, "step": 157 }, { - "epoch": 0.05769581887894833, - "grad_norm": 0.9143998622894287, - "learning_rate": 5.766423357664233e-06, - "loss": 1.6194, + "epoch": 0.027411519777931992, + "grad_norm": 1.4257391691207886, + "learning_rate": 2.7406764960971383e-06, + "loss": 1.1282, "step": 158 }, { - "epoch": 0.058060982289574584, - "grad_norm": 0.9662761092185974, - "learning_rate": 5.802919708029198e-06, - "loss": 1.5774, + "epoch": 0.02758501040943789, + "grad_norm": 1.307058334350586, + "learning_rate": 2.758022549869905e-06, + "loss": 1.0627, "step": 159 }, { - "epoch": 0.05842614570020084, - "grad_norm": 1.3798744678497314, - "learning_rate": 5.839416058394161e-06, - "loss": 1.6409, + "epoch": 0.027758501040943788, + "grad_norm": 1.4851658344268799, + "learning_rate": 2.7753686036426713e-06, + "loss": 1.3345, "step": 160 }, { - "epoch": 0.0587913091108271, - "grad_norm": 1.1135046482086182, - "learning_rate": 5.8759124087591245e-06, - "loss": 1.6304, + "epoch": 0.02793199167244969, + "grad_norm": 1.4127840995788574, + "learning_rate": 2.7927146574154384e-06, + "loss": 1.1372, "step": 161 }, { - "epoch": 0.05915647252145335, - "grad_norm": 1.4025578498840332, - "learning_rate": 5.912408759124088e-06, - "loss": 1.6982, + "epoch": 0.028105482303955587, + "grad_norm": 1.512790322303772, + "learning_rate": 2.810060711188205e-06, + "loss": 1.1411, "step": 162 }, { - "epoch": 0.059521635932079604, - "grad_norm": 1.649825930595398, - "learning_rate": 5.948905109489051e-06, - "loss": 1.627, + "epoch": 0.028278972935461485, + "grad_norm": 1.152248501777649, + "learning_rate": 2.8274067649609714e-06, + "loss": 1.2903, "step": 163 }, { - "epoch": 0.05988679934270586, - "grad_norm": 1.0209217071533203, - "learning_rate": 5.985401459854016e-06, - "loss": 1.6287, + "epoch": 0.028452463566967384, + "grad_norm": 1.222707986831665, + "learning_rate": 2.844752818733738e-06, + "loss": 1.1484, "step": 164 }, { - "epoch": 0.06025196275333212, - "grad_norm": 0.8191313743591309, - "learning_rate": 6.0218978102189786e-06, - "loss": 1.5793, + "epoch": 0.02862595419847328, + "grad_norm": 1.6512199640274048, + "learning_rate": 2.862098872506505e-06, + "loss": 1.1841, "step": 165 }, { - "epoch": 0.06061712616395837, - "grad_norm": 0.9669530391693115, - "learning_rate": 6.058394160583942e-06, - "loss": 1.5723, + "epoch": 0.02879944482997918, + "grad_norm": 1.0858792066574097, + "learning_rate": 2.879444926279272e-06, + "loss": 1.2874, "step": 166 }, { - "epoch": 0.060982289574584625, - "grad_norm": 1.2297686338424683, - "learning_rate": 6.094890510948905e-06, - "loss": 1.5752, + "epoch": 0.02897293546148508, + "grad_norm": 1.33135986328125, + "learning_rate": 2.896790980052038e-06, + "loss": 1.1323, "step": 167 }, { - "epoch": 0.06134745298521088, - "grad_norm": 1.0703727006912231, - "learning_rate": 6.13138686131387e-06, - "loss": 1.5986, + "epoch": 0.02914642609299098, + "grad_norm": 1.2415027618408203, + "learning_rate": 2.914137033824805e-06, + "loss": 1.3706, "step": 168 }, { - "epoch": 0.06171261639583714, - "grad_norm": 1.5144217014312744, - "learning_rate": 6.1678832116788326e-06, - "loss": 1.6641, + "epoch": 0.029319916724496877, + "grad_norm": 1.181244969367981, + "learning_rate": 2.931483087597572e-06, + "loss": 1.1602, "step": 169 }, { - "epoch": 0.062077779806463396, - "grad_norm": 1.023233413696289, - "learning_rate": 6.204379562043796e-06, - "loss": 1.6086, + "epoch": 0.029493407356002775, + "grad_norm": 1.0887519121170044, + "learning_rate": 2.9488291413703387e-06, + "loss": 1.1545, "step": 170 }, { - "epoch": 0.062442943217089646, - "grad_norm": 1.3149725198745728, - "learning_rate": 6.240875912408759e-06, - "loss": 1.6191, + "epoch": 0.029666897987508673, + "grad_norm": 1.3399016857147217, + "learning_rate": 2.966175195143105e-06, + "loss": 1.2202, "step": 171 }, { - "epoch": 0.06280810662771591, - "grad_norm": 1.0931243896484375, - "learning_rate": 6.277372262773723e-06, - "loss": 1.6196, + "epoch": 0.029840388619014575, + "grad_norm": 1.163425087928772, + "learning_rate": 2.983521248915872e-06, + "loss": 1.2466, "step": 172 }, { - "epoch": 0.06317327003834215, - "grad_norm": 1.2900152206420898, - "learning_rate": 6.313868613138686e-06, - "loss": 1.6445, + "epoch": 0.030013879250520473, + "grad_norm": 1.655005693435669, + "learning_rate": 3.0008673026886387e-06, + "loss": 1.1111, "step": 173 }, { - "epoch": 0.06353843344896841, - "grad_norm": 1.1451497077941895, - "learning_rate": 6.35036496350365e-06, - "loss": 1.6116, + "epoch": 0.03018736988202637, + "grad_norm": 1.1424258947372437, + "learning_rate": 3.018213356461405e-06, + "loss": 1.0762, "step": 174 }, { - "epoch": 0.06390359685959467, - "grad_norm": 1.1634633541107178, - "learning_rate": 6.386861313868614e-06, - "loss": 1.6411, + "epoch": 0.03036086051353227, + "grad_norm": 1.3480885028839111, + "learning_rate": 3.0355594102341717e-06, + "loss": 1.1096, "step": 175 }, { - "epoch": 0.06426876027022092, - "grad_norm": 1.0340324640274048, - "learning_rate": 6.423357664233577e-06, - "loss": 1.5979, + "epoch": 0.030534351145038167, + "grad_norm": 1.1233023405075073, + "learning_rate": 3.052905464006939e-06, + "loss": 1.157, "step": 176 }, { - "epoch": 0.06463392368084718, - "grad_norm": 0.9610005617141724, - "learning_rate": 6.4598540145985415e-06, - "loss": 1.6074, + "epoch": 0.030707841776544068, + "grad_norm": 1.3087818622589111, + "learning_rate": 3.0702515177797055e-06, + "loss": 1.1191, "step": 177 }, { - "epoch": 0.06499908709147344, - "grad_norm": 1.7364978790283203, - "learning_rate": 6.496350364963504e-06, - "loss": 1.5657, + "epoch": 0.030881332408049966, + "grad_norm": 1.5944652557373047, + "learning_rate": 3.087597571552472e-06, + "loss": 0.9917, "step": 178 }, { - "epoch": 0.0653642505020997, - "grad_norm": 1.0809636116027832, - "learning_rate": 6.532846715328468e-06, - "loss": 1.6606, + "epoch": 0.031054823039555864, + "grad_norm": 1.3071682453155518, + "learning_rate": 3.1049436253252385e-06, + "loss": 1.0127, "step": 179 }, { - "epoch": 0.06572941391272595, - "grad_norm": 0.9346624612808228, - "learning_rate": 6.569343065693431e-06, - "loss": 1.5901, + "epoch": 0.031228313671061762, + "grad_norm": 1.9531651735305786, + "learning_rate": 3.1222896790980056e-06, + "loss": 1.1179, "step": 180 }, { - "epoch": 0.0660945773233522, - "grad_norm": 1.563215732574463, - "learning_rate": 6.605839416058395e-06, - "loss": 1.6367, + "epoch": 0.031401804302567664, + "grad_norm": 1.0817384719848633, + "learning_rate": 3.1396357328707723e-06, + "loss": 1.1365, "step": 181 }, { - "epoch": 0.06645974073397845, - "grad_norm": 1.2820005416870117, - "learning_rate": 6.6423357664233575e-06, - "loss": 1.5994, + "epoch": 0.03157529493407356, + "grad_norm": 1.1562706232070923, + "learning_rate": 3.1569817866435386e-06, + "loss": 1.1261, "step": 182 }, { - "epoch": 0.06682490414460471, - "grad_norm": 3.616082191467285, - "learning_rate": 6.678832116788322e-06, - "loss": 1.623, + "epoch": 0.03174878556557946, + "grad_norm": 1.4375635385513306, + "learning_rate": 3.1743278404163057e-06, + "loss": 1.1431, "step": 183 }, { - "epoch": 0.06719006755523096, - "grad_norm": 1.2187740802764893, - "learning_rate": 6.715328467153285e-06, - "loss": 1.6003, + "epoch": 0.031922276197085354, + "grad_norm": 1.3159886598587036, + "learning_rate": 3.1916738941890724e-06, + "loss": 1.1147, "step": 184 }, { - "epoch": 0.06755523096585722, - "grad_norm": 2.1385509967803955, - "learning_rate": 6.751824817518249e-06, - "loss": 1.5588, + "epoch": 0.032095766828591256, + "grad_norm": 1.411533236503601, + "learning_rate": 3.209019947961839e-06, + "loss": 1.125, "step": 185 }, { - "epoch": 0.06792039437648348, - "grad_norm": 1.0435575246810913, - "learning_rate": 6.7883211678832115e-06, - "loss": 1.5991, + "epoch": 0.03226925746009716, + "grad_norm": 1.0745567083358765, + "learning_rate": 3.2263660017346054e-06, + "loss": 1.1184, "step": 186 }, { - "epoch": 0.06828555778710974, - "grad_norm": 0.9835289120674133, - "learning_rate": 6.824817518248176e-06, - "loss": 1.5527, + "epoch": 0.03244274809160305, + "grad_norm": 0.9977289438247681, + "learning_rate": 3.2437120555073725e-06, + "loss": 1.1228, "step": 187 }, { - "epoch": 0.06865072119773599, - "grad_norm": 1.178768277168274, - "learning_rate": 6.86131386861314e-06, - "loss": 1.5479, + "epoch": 0.03261623872310895, + "grad_norm": 1.025145173072815, + "learning_rate": 3.261058109280139e-06, + "loss": 1.2329, "step": 188 }, { - "epoch": 0.06901588460836225, - "grad_norm": 1.5454463958740234, - "learning_rate": 6.897810218978103e-06, - "loss": 1.6067, + "epoch": 0.03278972935461485, + "grad_norm": 1.042004942893982, + "learning_rate": 3.2784041630529055e-06, + "loss": 1.2124, "step": 189 }, { - "epoch": 0.06938104801898849, - "grad_norm": 1.3058518171310425, - "learning_rate": 6.934306569343066e-06, - "loss": 1.592, + "epoch": 0.03296321998612075, + "grad_norm": 1.0348999500274658, + "learning_rate": 3.295750216825672e-06, + "loss": 1.0337, "step": 190 }, { - "epoch": 0.06974621142961475, - "grad_norm": 0.8997343182563782, - "learning_rate": 6.970802919708029e-06, - "loss": 1.5947, + "epoch": 0.03313671061762665, + "grad_norm": 1.119643211364746, + "learning_rate": 3.3130962705984393e-06, + "loss": 1.1895, "step": 191 }, { - "epoch": 0.070111374840241, - "grad_norm": 0.8530521988868713, - "learning_rate": 7.007299270072994e-06, - "loss": 1.5233, + "epoch": 0.033310201249132546, + "grad_norm": 1.0312352180480957, + "learning_rate": 3.330442324371206e-06, + "loss": 1.1353, "step": 192 }, { - "epoch": 0.07047653825086726, - "grad_norm": 1.0962448120117188, - "learning_rate": 7.043795620437957e-06, - "loss": 1.5471, + "epoch": 0.03348369188063845, + "grad_norm": 1.2216565608978271, + "learning_rate": 3.3477883781439722e-06, + "loss": 0.9275, "step": 193 }, { - "epoch": 0.07084170166149352, - "grad_norm": 1.1726222038269043, - "learning_rate": 7.08029197080292e-06, - "loss": 1.5653, + "epoch": 0.03365718251214434, + "grad_norm": 1.2314172983169556, + "learning_rate": 3.365134431916739e-06, + "loss": 0.9741, "step": 194 }, { - "epoch": 0.07120686507211978, - "grad_norm": 1.3787479400634766, - "learning_rate": 7.116788321167883e-06, - "loss": 1.593, + "epoch": 0.03383067314365024, + "grad_norm": 1.611443042755127, + "learning_rate": 3.382480485689506e-06, + "loss": 1.0806, "step": 195 }, { - "epoch": 0.07157202848274603, - "grad_norm": 1.1022465229034424, - "learning_rate": 7.153284671532848e-06, - "loss": 1.5583, + "epoch": 0.034004163775156145, + "grad_norm": 1.0926381349563599, + "learning_rate": 3.3998265394622727e-06, + "loss": 1.1018, "step": 196 }, { - "epoch": 0.07193719189337229, - "grad_norm": 0.9105795621871948, - "learning_rate": 7.189781021897811e-06, - "loss": 1.5496, + "epoch": 0.03417765440666204, + "grad_norm": 1.3949100971221924, + "learning_rate": 3.417172593235039e-06, + "loss": 1.1719, "step": 197 }, { - "epoch": 0.07230235530399853, - "grad_norm": 1.1344600915908813, - "learning_rate": 7.2262773722627744e-06, - "loss": 1.5405, + "epoch": 0.03435114503816794, + "grad_norm": 0.9690477252006531, + "learning_rate": 3.434518647007806e-06, + "loss": 1.0356, "step": 198 }, { - "epoch": 0.07266751871462479, - "grad_norm": 1.0416783094406128, - "learning_rate": 7.262773722627737e-06, - "loss": 1.5698, + "epoch": 0.034524635669673835, + "grad_norm": 1.1134339570999146, + "learning_rate": 3.451864700780573e-06, + "loss": 1.1904, "step": 199 }, { - "epoch": 0.07303268212525105, - "grad_norm": 1.081969141960144, - "learning_rate": 7.299270072992701e-06, - "loss": 1.5211, + "epoch": 0.03469812630117974, + "grad_norm": 0.9384077787399292, + "learning_rate": 3.4692107545533395e-06, + "loss": 1.2329, "step": 200 }, { - "epoch": 0.0733978455358773, - "grad_norm": 1.3582813739776611, - "learning_rate": 7.335766423357666e-06, - "loss": 1.5835, + "epoch": 0.03487161693268564, + "grad_norm": 0.9059350490570068, + "learning_rate": 3.486556808326106e-06, + "loss": 1.1289, "step": 201 }, { - "epoch": 0.07376300894650356, - "grad_norm": 1.0823426246643066, - "learning_rate": 7.3722627737226285e-06, - "loss": 1.5637, + "epoch": 0.03504510756419153, + "grad_norm": 1.0115994215011597, + "learning_rate": 3.503902862098873e-06, + "loss": 1.1716, "step": 202 }, { - "epoch": 0.07412817235712982, - "grad_norm": 1.3806763887405396, - "learning_rate": 7.408759124087592e-06, - "loss": 1.5726, + "epoch": 0.035218598195697434, + "grad_norm": 1.4595848321914673, + "learning_rate": 3.5212489158716396e-06, + "loss": 0.9832, "step": 203 }, { - "epoch": 0.07449333576775607, - "grad_norm": 1.5348141193389893, - "learning_rate": 7.445255474452555e-06, - "loss": 1.6145, + "epoch": 0.03539208882720333, + "grad_norm": 1.1582152843475342, + "learning_rate": 3.538594969644406e-06, + "loss": 1.0273, "step": 204 }, { - "epoch": 0.07485849917838233, - "grad_norm": 0.9889458417892456, - "learning_rate": 7.481751824817519e-06, - "loss": 1.5312, + "epoch": 0.03556557945870923, + "grad_norm": 1.233208179473877, + "learning_rate": 3.5559410234171726e-06, + "loss": 1.1577, "step": 205 }, { - "epoch": 0.07522366258900857, - "grad_norm": 1.204418420791626, - "learning_rate": 7.5182481751824825e-06, - "loss": 1.5237, + "epoch": 0.03573907009021513, + "grad_norm": 1.0033769607543945, + "learning_rate": 3.5732870771899397e-06, + "loss": 1.1602, "step": 206 }, { - "epoch": 0.07558882599963483, - "grad_norm": 1.5592353343963623, - "learning_rate": 7.554744525547446e-06, - "loss": 1.5266, + "epoch": 0.035912560721721026, + "grad_norm": 3.774562120437622, + "learning_rate": 3.5906331309627064e-06, + "loss": 0.9226, "step": 207 }, { - "epoch": 0.07595398941026109, - "grad_norm": 1.2468366622924805, - "learning_rate": 7.591240875912409e-06, - "loss": 1.519, + "epoch": 0.03608605135322693, + "grad_norm": 1.484586477279663, + "learning_rate": 3.6079791847354727e-06, + "loss": 1.0662, "step": 208 }, { - "epoch": 0.07631915282088735, - "grad_norm": 1.068005084991455, - "learning_rate": 7.627737226277373e-06, - "loss": 1.5521, + "epoch": 0.03625954198473282, + "grad_norm": 1.4108099937438965, + "learning_rate": 3.62532523850824e-06, + "loss": 0.9568, "step": 209 }, { - "epoch": 0.0766843162315136, - "grad_norm": 1.109886646270752, - "learning_rate": 7.664233576642336e-06, - "loss": 1.5333, + "epoch": 0.036433032616238724, + "grad_norm": 1.1363781690597534, + "learning_rate": 3.6426712922810065e-06, + "loss": 1.05, "step": 210 }, { - "epoch": 0.07704947964213986, - "grad_norm": 1.1148935556411743, - "learning_rate": 7.7007299270073e-06, - "loss": 1.5051, + "epoch": 0.03660652324774462, + "grad_norm": 1.0070441961288452, + "learning_rate": 3.660017346053773e-06, + "loss": 1.0142, "step": 211 }, { - "epoch": 0.07741464305276612, - "grad_norm": 1.1436775922775269, - "learning_rate": 7.737226277372264e-06, - "loss": 1.4978, + "epoch": 0.03678001387925052, + "grad_norm": 1.262086033821106, + "learning_rate": 3.6773633998265395e-06, + "loss": 1.2256, "step": 212 }, { - "epoch": 0.07777980646339237, - "grad_norm": 1.1545244455337524, - "learning_rate": 7.773722627737227e-06, - "loss": 1.575, + "epoch": 0.03695350451075642, + "grad_norm": 1.1093039512634277, + "learning_rate": 3.6947094535993066e-06, + "loss": 1.0596, "step": 213 }, { - "epoch": 0.07814496987401863, - "grad_norm": 1.255852460861206, - "learning_rate": 7.810218978102191e-06, - "loss": 1.5288, + "epoch": 0.037126995142262316, + "grad_norm": 0.9051575660705566, + "learning_rate": 3.7120555073720733e-06, + "loss": 1.0369, "step": 214 }, { - "epoch": 0.07851013328464487, - "grad_norm": 1.1932826042175293, - "learning_rate": 7.846715328467154e-06, - "loss": 1.4838, + "epoch": 0.03730048577376822, + "grad_norm": 0.8574106693267822, + "learning_rate": 3.72940156114484e-06, + "loss": 1.0437, "step": 215 }, { - "epoch": 0.07887529669527113, - "grad_norm": 1.2818491458892822, - "learning_rate": 7.883211678832117e-06, - "loss": 1.521, + "epoch": 0.03747397640527411, + "grad_norm": 1.6395150423049927, + "learning_rate": 3.7467476149176062e-06, + "loss": 0.9216, "step": 216 }, { - "epoch": 0.07924046010589739, - "grad_norm": 1.0713249444961548, - "learning_rate": 7.91970802919708e-06, - "loss": 1.5413, + "epoch": 0.037647467036780013, + "grad_norm": 0.8975229859352112, + "learning_rate": 3.7640936686903734e-06, + "loss": 1.0642, "step": 217 }, { - "epoch": 0.07960562351652364, - "grad_norm": 1.0241321325302124, - "learning_rate": 7.956204379562045e-06, - "loss": 1.4833, + "epoch": 0.037820957668285915, + "grad_norm": 1.3739519119262695, + "learning_rate": 3.78143972246314e-06, + "loss": 0.9465, "step": 218 }, { - "epoch": 0.0799707869271499, - "grad_norm": 1.2504019737243652, - "learning_rate": 7.992700729927007e-06, - "loss": 1.5774, + "epoch": 0.03799444829979181, + "grad_norm": 1.076572299003601, + "learning_rate": 3.7987857762359063e-06, + "loss": 0.9504, "step": 219 }, { - "epoch": 0.08033595033777616, - "grad_norm": 1.0445505380630493, - "learning_rate": 8.029197080291972e-06, - "loss": 1.4883, + "epoch": 0.03816793893129771, + "grad_norm": 1.2218414545059204, + "learning_rate": 3.816131830008674e-06, + "loss": 1.0667, "step": 220 }, { - "epoch": 0.08070111374840241, - "grad_norm": 0.9249089360237122, - "learning_rate": 8.065693430656935e-06, - "loss": 1.4839, + "epoch": 0.038341429562803606, + "grad_norm": 1.4846584796905518, + "learning_rate": 3.8334778837814406e-06, + "loss": 1.054, "step": 221 }, { - "epoch": 0.08106627715902867, - "grad_norm": 1.3200856447219849, - "learning_rate": 8.1021897810219e-06, - "loss": 1.5774, + "epoch": 0.03851492019430951, + "grad_norm": 0.9900181293487549, + "learning_rate": 3.850823937554206e-06, + "loss": 1.0847, "step": 222 }, { - "epoch": 0.08143144056965491, - "grad_norm": 1.9654418230056763, - "learning_rate": 8.138686131386862e-06, - "loss": 1.5142, + "epoch": 0.03868841082581541, + "grad_norm": 0.9485524296760559, + "learning_rate": 3.868169991326973e-06, + "loss": 1.0608, "step": 223 }, { - "epoch": 0.08179660398028117, - "grad_norm": 1.265892505645752, - "learning_rate": 8.175182481751825e-06, - "loss": 1.5247, + "epoch": 0.0388619014573213, + "grad_norm": 1.1557109355926514, + "learning_rate": 3.885516045099741e-06, + "loss": 1.0254, "step": 224 }, { - "epoch": 0.08216176739090743, - "grad_norm": 1.628423810005188, - "learning_rate": 8.21167883211679e-06, - "loss": 1.5635, + "epoch": 0.039035392088827205, + "grad_norm": 1.863441824913025, + "learning_rate": 3.9028620988725065e-06, + "loss": 1.0911, "step": 225 }, { - "epoch": 0.08252693080153368, - "grad_norm": 1.27012300491333, - "learning_rate": 8.248175182481753e-06, - "loss": 1.5339, + "epoch": 0.0392088827203331, + "grad_norm": 0.9710054993629456, + "learning_rate": 3.920208152645273e-06, + "loss": 1.1125, "step": 226 }, { - "epoch": 0.08289209421215994, - "grad_norm": 1.158062219619751, - "learning_rate": 8.284671532846717e-06, - "loss": 1.4949, + "epoch": 0.039382373351839, + "grad_norm": 1.1565674543380737, + "learning_rate": 3.93755420641804e-06, + "loss": 1.0417, "step": 227 }, { - "epoch": 0.0832572576227862, - "grad_norm": 1.1388415098190308, - "learning_rate": 8.32116788321168e-06, - "loss": 1.4631, + "epoch": 0.0395558639833449, + "grad_norm": 1.1032792329788208, + "learning_rate": 3.9549002601908074e-06, + "loss": 1.0798, "step": 228 }, { - "epoch": 0.08362242103341246, - "grad_norm": 1.2437057495117188, - "learning_rate": 8.357664233576643e-06, - "loss": 1.4592, + "epoch": 0.0397293546148508, + "grad_norm": 2.845886707305908, + "learning_rate": 3.972246313963573e-06, + "loss": 1.0854, "step": 229 }, { - "epoch": 0.08398758444403871, - "grad_norm": 1.0962960720062256, - "learning_rate": 8.394160583941606e-06, - "loss": 1.541, + "epoch": 0.0399028452463567, + "grad_norm": 1.0946824550628662, + "learning_rate": 3.98959236773634e-06, + "loss": 0.9224, "step": 230 }, { - "epoch": 0.08435274785466497, - "grad_norm": 1.6231704950332642, - "learning_rate": 8.43065693430657e-06, - "loss": 1.4589, + "epoch": 0.04007633587786259, + "grad_norm": 0.9502869844436646, + "learning_rate": 4.0069384215091075e-06, + "loss": 1.0388, "step": 231 }, { - "epoch": 0.08471791126529121, - "grad_norm": 1.0528669357299805, - "learning_rate": 8.467153284671533e-06, - "loss": 1.4736, + "epoch": 0.040249826509368494, + "grad_norm": 1.3078712224960327, + "learning_rate": 4.024284475281874e-06, + "loss": 1.0615, "step": 232 }, { - "epoch": 0.08508307467591747, - "grad_norm": 0.9265260100364685, - "learning_rate": 8.503649635036498e-06, - "loss": 1.4915, + "epoch": 0.040423317140874396, + "grad_norm": 1.077901840209961, + "learning_rate": 4.04163052905464e-06, + "loss": 0.9543, "step": 233 }, { - "epoch": 0.08544823808654373, - "grad_norm": 1.1033251285552979, - "learning_rate": 8.54014598540146e-06, - "loss": 1.4591, + "epoch": 0.04059680777238029, + "grad_norm": 0.9809135794639587, + "learning_rate": 4.058976582827407e-06, + "loss": 1.116, "step": 234 }, { - "epoch": 0.08581340149716998, - "grad_norm": 1.191789150238037, - "learning_rate": 8.576642335766423e-06, - "loss": 1.4827, + "epoch": 0.04077029840388619, + "grad_norm": 1.0471081733703613, + "learning_rate": 4.076322636600174e-06, + "loss": 1.0964, "step": 235 }, { - "epoch": 0.08617856490779624, - "grad_norm": 1.2473351955413818, - "learning_rate": 8.613138686131386e-06, - "loss": 1.4709, + "epoch": 0.040943789035392086, + "grad_norm": 0.8584343194961548, + "learning_rate": 4.093668690372941e-06, + "loss": 1.1584, "step": 236 }, { - "epoch": 0.0865437283184225, - "grad_norm": 1.3822811841964722, - "learning_rate": 8.649635036496351e-06, - "loss": 1.4707, + "epoch": 0.04111727966689799, + "grad_norm": 0.9176291227340698, + "learning_rate": 4.111014744145707e-06, + "loss": 0.9346, "step": 237 }, { - "epoch": 0.08690889172904875, - "grad_norm": 0.9968782663345337, - "learning_rate": 8.686131386861315e-06, - "loss": 1.4271, + "epoch": 0.04129077029840389, + "grad_norm": 1.2530393600463867, + "learning_rate": 4.1283607979184735e-06, + "loss": 1.2812, "step": 238 }, { - "epoch": 0.08727405513967501, - "grad_norm": 1.6921288967132568, - "learning_rate": 8.722627737226278e-06, - "loss": 1.5481, + "epoch": 0.041464260929909784, + "grad_norm": 1.0103131532669067, + "learning_rate": 4.145706851691241e-06, + "loss": 0.9878, "step": 239 }, { - "epoch": 0.08763921855030125, - "grad_norm": 1.328550100326538, - "learning_rate": 8.759124087591241e-06, - "loss": 1.4683, + "epoch": 0.041637751561415685, + "grad_norm": 0.9029264450073242, + "learning_rate": 4.163052905464007e-06, + "loss": 1.0542, "step": 240 }, { - "epoch": 0.08800438196092751, - "grad_norm": 1.573909878730774, - "learning_rate": 8.795620437956204e-06, - "loss": 1.512, + "epoch": 0.04181124219292158, + "grad_norm": 0.8104596138000488, + "learning_rate": 4.180398959236774e-06, + "loss": 1.155, "step": 241 }, { - "epoch": 0.08836954537155377, - "grad_norm": 1.008296012878418, - "learning_rate": 8.832116788321169e-06, - "loss": 1.509, + "epoch": 0.04198473282442748, + "grad_norm": 1.1914087533950806, + "learning_rate": 4.19774501300954e-06, + "loss": 1.022, "step": 242 }, { - "epoch": 0.08873470878218002, - "grad_norm": 1.1074453592300415, - "learning_rate": 8.868613138686132e-06, - "loss": 1.4368, + "epoch": 0.04215822345593338, + "grad_norm": 2.3381969928741455, + "learning_rate": 4.215091066782308e-06, + "loss": 1.1335, "step": 243 }, { - "epoch": 0.08909987219280628, - "grad_norm": 0.9687774777412415, - "learning_rate": 8.905109489051096e-06, - "loss": 1.4656, + "epoch": 0.04233171408743928, + "grad_norm": 0.9077311754226685, + "learning_rate": 4.232437120555074e-06, + "loss": 1.186, "step": 244 }, { - "epoch": 0.08946503560343254, - "grad_norm": 1.461031436920166, - "learning_rate": 8.941605839416059e-06, - "loss": 1.4963, + "epoch": 0.04250520471894518, + "grad_norm": 1.3375426530838013, + "learning_rate": 4.24978317432784e-06, + "loss": 0.9519, "step": 245 }, { - "epoch": 0.0898301990140588, - "grad_norm": 1.0685254335403442, - "learning_rate": 8.978102189781024e-06, - "loss": 1.5225, + "epoch": 0.042678695350451074, + "grad_norm": 1.4387212991714478, + "learning_rate": 4.267129228100608e-06, + "loss": 0.854, "step": 246 }, { - "epoch": 0.09019536242468505, - "grad_norm": 11.069062232971191, - "learning_rate": 9.014598540145986e-06, - "loss": 1.5071, + "epoch": 0.042852185981956975, + "grad_norm": 1.3430448770523071, + "learning_rate": 4.284475281873375e-06, + "loss": 0.8931, "step": 247 }, { - "epoch": 0.0905605258353113, - "grad_norm": 1.171651840209961, - "learning_rate": 9.05109489051095e-06, - "loss": 1.5271, + "epoch": 0.04302567661346287, + "grad_norm": 1.1013959646224976, + "learning_rate": 4.3018213356461405e-06, + "loss": 1.1394, "step": 248 }, { - "epoch": 0.09092568924593755, - "grad_norm": 1.3868751525878906, - "learning_rate": 9.087591240875912e-06, - "loss": 1.4169, + "epoch": 0.04319916724496877, + "grad_norm": 0.9243041276931763, + "learning_rate": 4.319167389418907e-06, + "loss": 1.2869, "step": 249 }, { - "epoch": 0.09129085265656381, - "grad_norm": 1.0575600862503052, - "learning_rate": 9.124087591240877e-06, - "loss": 1.4456, + "epoch": 0.04337265787647467, + "grad_norm": 0.8229906558990479, + "learning_rate": 4.336513443191675e-06, + "loss": 0.9695, "step": 250 }, { - "epoch": 0.09165601606719007, - "grad_norm": 1.070965051651001, - "learning_rate": 9.160583941605841e-06, - "loss": 1.4668, + "epoch": 0.04354614850798057, + "grad_norm": 0.9344890713691711, + "learning_rate": 4.353859496964441e-06, + "loss": 1.1172, "step": 251 }, { - "epoch": 0.09202117947781632, - "grad_norm": 1.2832828760147095, - "learning_rate": 9.197080291970804e-06, - "loss": 1.4885, + "epoch": 0.04371963913948647, + "grad_norm": 0.9427030086517334, + "learning_rate": 4.371205550737207e-06, + "loss": 1.0452, "step": 252 }, { - "epoch": 0.09238634288844258, - "grad_norm": 0.8003165125846863, - "learning_rate": 9.233576642335767e-06, - "loss": 1.4412, + "epoch": 0.04389312977099236, + "grad_norm": 0.876641571521759, + "learning_rate": 4.388551604509974e-06, + "loss": 0.967, "step": 253 }, { - "epoch": 0.09275150629906884, - "grad_norm": 1.2488244771957397, - "learning_rate": 9.27007299270073e-06, - "loss": 1.4734, + "epoch": 0.044066620402498265, + "grad_norm": 2.097550392150879, + "learning_rate": 4.4058976582827415e-06, + "loss": 1.1821, "step": 254 }, { - "epoch": 0.0931166697096951, - "grad_norm": 1.5184513330459595, - "learning_rate": 9.306569343065694e-06, - "loss": 1.4695, + "epoch": 0.044240111034004166, + "grad_norm": 1.0620050430297852, + "learning_rate": 4.423243712055507e-06, + "loss": 1.0862, "step": 255 }, { - "epoch": 0.09348183312032135, - "grad_norm": 1.6158387660980225, - "learning_rate": 9.343065693430657e-06, - "loss": 1.478, + "epoch": 0.04441360166551006, + "grad_norm": 1.036466121673584, + "learning_rate": 4.440589765828274e-06, + "loss": 1.0115, "step": 256 }, { - "epoch": 0.0938469965309476, - "grad_norm": 3.6522462368011475, - "learning_rate": 9.379562043795622e-06, - "loss": 1.4536, + "epoch": 0.04458709229701596, + "grad_norm": 2.2063586711883545, + "learning_rate": 4.457935819601042e-06, + "loss": 1.2476, "step": 257 }, { - "epoch": 0.09421215994157385, - "grad_norm": 1.5849647521972656, - "learning_rate": 9.416058394160585e-06, - "loss": 1.447, + "epoch": 0.04476058292852186, + "grad_norm": 1.0659186840057373, + "learning_rate": 4.475281873373808e-06, + "loss": 1.0266, "step": 258 }, { - "epoch": 0.09457732335220011, - "grad_norm": 1.0375897884368896, - "learning_rate": 9.452554744525548e-06, - "loss": 1.4294, + "epoch": 0.04493407356002776, + "grad_norm": 1.0351845026016235, + "learning_rate": 4.492627927146574e-06, + "loss": 0.9602, "step": 259 }, { - "epoch": 0.09494248676282636, - "grad_norm": 1.292153239250183, - "learning_rate": 9.48905109489051e-06, - "loss": 1.4736, + "epoch": 0.04510756419153366, + "grad_norm": 1.0263773202896118, + "learning_rate": 4.509973980919341e-06, + "loss": 0.9702, "step": 260 }, { - "epoch": 0.09530765017345262, - "grad_norm": 1.6518572568893433, - "learning_rate": 9.525547445255475e-06, - "loss": 1.4219, + "epoch": 0.045281054823039554, + "grad_norm": 0.9431845545768738, + "learning_rate": 4.527320034692108e-06, + "loss": 1.1907, "step": 261 }, { - "epoch": 0.09567281358407888, - "grad_norm": 1.1988033056259155, - "learning_rate": 9.56204379562044e-06, - "loss": 1.4717, + "epoch": 0.045454545454545456, + "grad_norm": 1.4688133001327515, + "learning_rate": 4.544666088464875e-06, + "loss": 0.8894, "step": 262 }, { - "epoch": 0.09603797699470513, - "grad_norm": 1.4726011753082275, - "learning_rate": 9.598540145985402e-06, - "loss": 1.4824, + "epoch": 0.04562803608605135, + "grad_norm": 1.2890156507492065, + "learning_rate": 4.562012142237641e-06, + "loss": 0.9424, "step": 263 }, { - "epoch": 0.09640314040533139, - "grad_norm": 0.9365547895431519, - "learning_rate": 9.635036496350367e-06, - "loss": 1.4016, + "epoch": 0.04580152671755725, + "grad_norm": 0.9922130703926086, + "learning_rate": 4.579358196010408e-06, + "loss": 1.0635, "step": 264 }, { - "epoch": 0.09676830381595763, - "grad_norm": 0.9560263156890869, - "learning_rate": 9.67153284671533e-06, - "loss": 1.4375, + "epoch": 0.04597501734906315, + "grad_norm": 0.9697554111480713, + "learning_rate": 4.596704249783175e-06, + "loss": 1.0652, "step": 265 }, { - "epoch": 0.09713346722658389, - "grad_norm": 0.9617602825164795, - "learning_rate": 9.708029197080293e-06, - "loss": 1.4294, + "epoch": 0.04614850798056905, + "grad_norm": 1.1336649656295776, + "learning_rate": 4.614050303555941e-06, + "loss": 1.0559, "step": 266 }, { - "epoch": 0.09749863063721015, - "grad_norm": 1.1020822525024414, - "learning_rate": 9.744525547445256e-06, - "loss": 1.4177, + "epoch": 0.04632199861207495, + "grad_norm": 1.198110818862915, + "learning_rate": 4.631396357328708e-06, + "loss": 1.0222, "step": 267 }, { - "epoch": 0.0978637940478364, - "grad_norm": 2.970747709274292, - "learning_rate": 9.78102189781022e-06, - "loss": 1.4709, + "epoch": 0.046495489243580844, + "grad_norm": 0.9485757350921631, + "learning_rate": 4.648742411101475e-06, + "loss": 0.9741, "step": 268 }, { - "epoch": 0.09822895745846266, - "grad_norm": 1.4076952934265137, - "learning_rate": 9.817518248175183e-06, - "loss": 1.4727, + "epoch": 0.046668979875086745, + "grad_norm": 0.801455020904541, + "learning_rate": 4.666088464874242e-06, + "loss": 1.0818, "step": 269 }, { - "epoch": 0.09859412086908892, - "grad_norm": 1.3098509311676025, - "learning_rate": 9.854014598540148e-06, - "loss": 1.406, + "epoch": 0.04684247050659265, + "grad_norm": 1.033544659614563, + "learning_rate": 4.683434518647008e-06, + "loss": 1.0229, "step": 270 }, { - "epoch": 0.09895928427971518, - "grad_norm": 1.0975197553634644, - "learning_rate": 9.89051094890511e-06, - "loss": 1.4282, + "epoch": 0.04701596113809854, + "grad_norm": 1.6481877565383911, + "learning_rate": 4.7007805724197745e-06, + "loss": 1.1575, "step": 271 }, { - "epoch": 0.09932444769034143, - "grad_norm": 1.6375492811203003, - "learning_rate": 9.927007299270073e-06, - "loss": 1.4722, + "epoch": 0.04718945176960444, + "grad_norm": 1.164493441581726, + "learning_rate": 4.718126626192542e-06, + "loss": 1.1023, "step": 272 }, { - "epoch": 0.09968961110096769, - "grad_norm": 1.078250765800476, - "learning_rate": 9.963503649635036e-06, - "loss": 1.4448, + "epoch": 0.04736294240111034, + "grad_norm": 0.9398133158683777, + "learning_rate": 4.735472679965309e-06, + "loss": 1.019, "step": 273 }, { - "epoch": 0.10005477451159393, - "grad_norm": 0.7428270578384399, - "learning_rate": 1e-05, - "loss": 1.3992, + "epoch": 0.04753643303261624, + "grad_norm": 0.9447312951087952, + "learning_rate": 4.7528187337380746e-06, + "loss": 1.0713, "step": 274 }, { - "epoch": 0.10041993792222019, - "grad_norm": 1.3953334093093872, - "learning_rate": 1.0036496350364964e-05, - "loss": 1.468, + "epoch": 0.04770992366412214, + "grad_norm": 1.3094123601913452, + "learning_rate": 4.770164787510841e-06, + "loss": 0.9412, "step": 275 }, { - "epoch": 0.10078510133284645, - "grad_norm": 1.3903917074203491, - "learning_rate": 1.0072992700729928e-05, - "loss": 1.4031, + "epoch": 0.047883414295628035, + "grad_norm": 1.0912115573883057, + "learning_rate": 4.787510841283609e-06, + "loss": 1.0254, "step": 276 }, { - "epoch": 0.1011502647434727, - "grad_norm": 1.3393917083740234, - "learning_rate": 1.0109489051094891e-05, - "loss": 1.4836, + "epoch": 0.04805690492713394, + "grad_norm": 1.2621551752090454, + "learning_rate": 4.8048568950563755e-06, + "loss": 0.9802, "step": 277 }, { - "epoch": 0.10151542815409896, - "grad_norm": 1.080422043800354, - "learning_rate": 1.0145985401459854e-05, - "loss": 1.4253, + "epoch": 0.04823039555863983, + "grad_norm": 1.1452885866165161, + "learning_rate": 4.822202948829141e-06, + "loss": 1.0303, "step": 278 }, { - "epoch": 0.10188059156472522, - "grad_norm": 0.8771781325340271, - "learning_rate": 1.0182481751824817e-05, - "loss": 1.3938, + "epoch": 0.04840388619014573, + "grad_norm": 0.9421970844268799, + "learning_rate": 4.839549002601908e-06, + "loss": 1.0488, "step": 279 }, { - "epoch": 0.10224575497535147, - "grad_norm": 0.8846216201782227, - "learning_rate": 1.0218978102189783e-05, - "loss": 1.4365, + "epoch": 0.048577376821651634, + "grad_norm": 0.9050304889678955, + "learning_rate": 4.856895056374676e-06, + "loss": 0.9807, "step": 280 }, { - "epoch": 0.10261091838597773, - "grad_norm": 1.511834979057312, - "learning_rate": 1.0255474452554746e-05, - "loss": 1.48, + "epoch": 0.04875086745315753, + "grad_norm": 0.865789532661438, + "learning_rate": 4.8742411101474414e-06, + "loss": 1.144, "step": 281 }, { - "epoch": 0.10297608179660397, - "grad_norm": 1.3361351490020752, - "learning_rate": 1.0291970802919709e-05, - "loss": 1.4015, + "epoch": 0.04892435808466343, + "grad_norm": 1.0998272895812988, + "learning_rate": 4.891587163920208e-06, + "loss": 0.9641, "step": 282 }, { - "epoch": 0.10334124520723023, - "grad_norm": 1.2253375053405762, - "learning_rate": 1.0328467153284672e-05, - "loss": 1.4153, + "epoch": 0.049097848716169325, + "grad_norm": 0.9415982365608215, + "learning_rate": 4.908933217692976e-06, + "loss": 0.946, "step": 283 }, { - "epoch": 0.10370640861785649, - "grad_norm": 1.1841024160385132, - "learning_rate": 1.0364963503649636e-05, - "loss": 1.387, + "epoch": 0.049271339347675226, + "grad_norm": 0.7739574313163757, + "learning_rate": 4.926279271465742e-06, + "loss": 1.1621, "step": 284 }, { - "epoch": 0.10407157202848275, - "grad_norm": 1.010166883468628, - "learning_rate": 1.04014598540146e-05, - "loss": 1.4248, + "epoch": 0.04944482997918112, + "grad_norm": 0.9333693385124207, + "learning_rate": 4.943625325238508e-06, + "loss": 1.2188, "step": 285 }, { - "epoch": 0.104436735439109, - "grad_norm": 2.7126755714416504, - "learning_rate": 1.0437956204379562e-05, - "loss": 1.4475, + "epoch": 0.04961832061068702, + "grad_norm": 0.9140480160713196, + "learning_rate": 4.960971379011275e-06, + "loss": 1.0117, "step": 286 }, { - "epoch": 0.10480189884973526, - "grad_norm": 1.2636414766311646, - "learning_rate": 1.0474452554744528e-05, - "loss": 1.4631, + "epoch": 0.049791811242192924, + "grad_norm": 0.9500376582145691, + "learning_rate": 4.9783174327840425e-06, + "loss": 1.1589, "step": 287 }, { - "epoch": 0.10516706226036152, - "grad_norm": 2.057204008102417, - "learning_rate": 1.0510948905109491e-05, - "loss": 1.4797, + "epoch": 0.04996530187369882, + "grad_norm": 0.9249399900436401, + "learning_rate": 4.995663486556809e-06, + "loss": 1.0979, "step": 288 }, { - "epoch": 0.10553222567098777, - "grad_norm": 1.1315290927886963, - "learning_rate": 1.0547445255474454e-05, - "loss": 1.4207, + "epoch": 0.05013879250520472, + "grad_norm": 1.3139915466308594, + "learning_rate": 5.013009540329575e-06, + "loss": 1.0632, "step": 289 }, { - "epoch": 0.10589738908161402, - "grad_norm": 1.5360143184661865, - "learning_rate": 1.0583941605839417e-05, - "loss": 1.4104, + "epoch": 0.050312283136710614, + "grad_norm": 0.8485634922981262, + "learning_rate": 5.030355594102342e-06, + "loss": 1.1702, "step": 290 }, { - "epoch": 0.10626255249224027, - "grad_norm": 1.2010577917099, - "learning_rate": 1.0620437956204381e-05, - "loss": 1.4333, + "epoch": 0.050485773768216516, + "grad_norm": 1.3334038257598877, + "learning_rate": 5.047701647875108e-06, + "loss": 0.8318, "step": 291 }, { - "epoch": 0.10662771590286653, - "grad_norm": 1.7269439697265625, - "learning_rate": 1.0656934306569344e-05, - "loss": 1.4673, + "epoch": 0.05065926439972242, + "grad_norm": 1.2785637378692627, + "learning_rate": 5.065047701647876e-06, + "loss": 1.0488, "step": 292 }, { - "epoch": 0.10699287931349279, - "grad_norm": 1.2858600616455078, - "learning_rate": 1.0693430656934307e-05, - "loss": 1.4011, + "epoch": 0.05083275503122831, + "grad_norm": 0.8768314123153687, + "learning_rate": 5.082393755420643e-06, + "loss": 1.1714, "step": 293 }, { - "epoch": 0.10735804272411904, - "grad_norm": 1.0287435054779053, - "learning_rate": 1.072992700729927e-05, - "loss": 1.3873, + "epoch": 0.05100624566273421, + "grad_norm": 1.0624489784240723, + "learning_rate": 5.099739809193409e-06, + "loss": 1.0505, "step": 294 }, { - "epoch": 0.1077232061347453, - "grad_norm": 1.8532861471176147, - "learning_rate": 1.0766423357664235e-05, - "loss": 1.4155, + "epoch": 0.05117973629424011, + "grad_norm": 0.8365274667739868, + "learning_rate": 5.117085862966176e-06, + "loss": 1.1267, "step": 295 }, { - "epoch": 0.10808836954537156, - "grad_norm": 1.247887134552002, - "learning_rate": 1.0802919708029198e-05, - "loss": 1.4279, + "epoch": 0.05135322692574601, + "grad_norm": 0.969789445400238, + "learning_rate": 5.134431916738942e-06, + "loss": 0.927, "step": 296 }, { - "epoch": 0.10845353295599781, - "grad_norm": 1.0020791292190552, - "learning_rate": 1.083941605839416e-05, - "loss": 1.3536, + "epoch": 0.05152671755725191, + "grad_norm": 1.474252700805664, + "learning_rate": 5.1517779705117086e-06, + "loss": 1.1506, "step": 297 }, { - "epoch": 0.10881869636662407, - "grad_norm": 1.9740164279937744, - "learning_rate": 1.0875912408759123e-05, - "loss": 1.3455, + "epoch": 0.051700208188757805, + "grad_norm": 1.0048154592514038, + "learning_rate": 5.169124024284475e-06, + "loss": 1.1636, "step": 298 }, { - "epoch": 0.10918385977725031, - "grad_norm": 1.0626384019851685, - "learning_rate": 1.091240875912409e-05, - "loss": 1.3831, + "epoch": 0.05187369882026371, + "grad_norm": 1.1335208415985107, + "learning_rate": 5.186470078057242e-06, + "loss": 1.207, "step": 299 }, { - "epoch": 0.10954902318787657, - "grad_norm": 1.0477420091629028, - "learning_rate": 1.0948905109489052e-05, - "loss": 1.4253, + "epoch": 0.0520471894517696, + "grad_norm": 0.9158570766448975, + "learning_rate": 5.2038161318300095e-06, + "loss": 1.0344, "step": 300 }, { - "epoch": 0.10991418659850283, - "grad_norm": 0.8309475779533386, - "learning_rate": 1.0985401459854015e-05, - "loss": 1.3984, + "epoch": 0.0522206800832755, + "grad_norm": 0.8626315593719482, + "learning_rate": 5.221162185602776e-06, + "loss": 1.0588, "step": 301 }, { - "epoch": 0.11027935000912908, - "grad_norm": 1.2881908416748047, - "learning_rate": 1.102189781021898e-05, - "loss": 1.3499, + "epoch": 0.052394170714781405, + "grad_norm": 0.9168702960014343, + "learning_rate": 5.238508239375543e-06, + "loss": 1.0498, "step": 302 }, { - "epoch": 0.11064451341975534, - "grad_norm": 1.0825612545013428, - "learning_rate": 1.1058394160583943e-05, - "loss": 1.4175, + "epoch": 0.0525676613462873, + "grad_norm": 0.9662163853645325, + "learning_rate": 5.25585429314831e-06, + "loss": 1.0261, "step": 303 }, { - "epoch": 0.1110096768303816, - "grad_norm": 1.372568964958191, - "learning_rate": 1.1094890510948906e-05, - "loss": 1.4556, + "epoch": 0.0527411519777932, + "grad_norm": 0.7976101636886597, + "learning_rate": 5.2732003469210754e-06, + "loss": 1.0371, "step": 304 }, { - "epoch": 0.11137484024100786, - "grad_norm": 0.9283525347709656, - "learning_rate": 1.1131386861313868e-05, - "loss": 1.408, + "epoch": 0.052914642609299095, + "grad_norm": 1.177430510520935, + "learning_rate": 5.290546400693842e-06, + "loss": 0.8699, "step": 305 }, { - "epoch": 0.11174000365163411, - "grad_norm": 0.8976697325706482, - "learning_rate": 1.1167883211678833e-05, - "loss": 1.3428, + "epoch": 0.053088133240805, + "grad_norm": 0.830085039138794, + "learning_rate": 5.307892454466609e-06, + "loss": 1.0393, "step": 306 }, { - "epoch": 0.11210516706226036, - "grad_norm": 1.0083105564117432, - "learning_rate": 1.1204379562043798e-05, - "loss": 1.4009, + "epoch": 0.0532616238723109, + "grad_norm": 2.249694347381592, + "learning_rate": 5.325238508239376e-06, + "loss": 0.8853, "step": 307 }, { - "epoch": 0.11247033047288661, - "grad_norm": 1.1374458074569702, - "learning_rate": 1.124087591240876e-05, - "loss": 1.4077, + "epoch": 0.05343511450381679, + "grad_norm": 1.0712023973464966, + "learning_rate": 5.342584562012143e-06, + "loss": 0.9473, "step": 308 }, { - "epoch": 0.11283549388351287, - "grad_norm": 5.0454888343811035, - "learning_rate": 1.1277372262773723e-05, - "loss": 1.3955, + "epoch": 0.053608605135322694, + "grad_norm": 1.2278658151626587, + "learning_rate": 5.35993061578491e-06, + "loss": 1.0046, "step": 309 }, { - "epoch": 0.11320065729413913, - "grad_norm": 1.4130263328552246, - "learning_rate": 1.1313868613138688e-05, - "loss": 1.3809, + "epoch": 0.05378209576682859, + "grad_norm": 1.0765156745910645, + "learning_rate": 5.3772766695576765e-06, + "loss": 0.98, "step": 310 }, { - "epoch": 0.11356582070476538, - "grad_norm": 0.854888916015625, - "learning_rate": 1.135036496350365e-05, - "loss": 1.3562, + "epoch": 0.05395558639833449, + "grad_norm": 1.5827305316925049, + "learning_rate": 5.394622723330442e-06, + "loss": 0.9829, "step": 311 }, { - "epoch": 0.11393098411539164, - "grad_norm": 1.877086877822876, - "learning_rate": 1.1386861313868614e-05, - "loss": 1.3828, + "epoch": 0.05412907702984039, + "grad_norm": 0.9392313957214355, + "learning_rate": 5.411968777103209e-06, + "loss": 0.9268, "step": 312 }, { - "epoch": 0.1142961475260179, - "grad_norm": 1.6719337701797485, - "learning_rate": 1.1423357664233578e-05, - "loss": 1.3594, + "epoch": 0.054302567661346286, + "grad_norm": 1.1720887422561646, + "learning_rate": 5.429314830875976e-06, + "loss": 0.897, "step": 313 }, { - "epoch": 0.11466131093664415, - "grad_norm": 1.3448402881622314, - "learning_rate": 1.1459854014598541e-05, - "loss": 1.4045, + "epoch": 0.05447605829285219, + "grad_norm": 1.0451200008392334, + "learning_rate": 5.446660884648742e-06, + "loss": 0.825, "step": 314 }, { - "epoch": 0.11502647434727041, - "grad_norm": 1.0730476379394531, - "learning_rate": 1.1496350364963504e-05, - "loss": 1.405, + "epoch": 0.05464954892435808, + "grad_norm": 0.8499744534492493, + "learning_rate": 5.46400693842151e-06, + "loss": 1.2744, "step": 315 }, { - "epoch": 0.11539163775789665, - "grad_norm": 1.1203923225402832, - "learning_rate": 1.1532846715328467e-05, - "loss": 1.3856, + "epoch": 0.054823039555863984, + "grad_norm": 1.0547802448272705, + "learning_rate": 5.481352992194277e-06, + "loss": 0.988, "step": 316 }, { - "epoch": 0.11575680116852291, - "grad_norm": 0.9116138815879822, - "learning_rate": 1.1569343065693433e-05, - "loss": 1.3833, + "epoch": 0.054996530187369885, + "grad_norm": 0.96930330991745, + "learning_rate": 5.498699045967043e-06, + "loss": 1.2014, "step": 317 }, { - "epoch": 0.11612196457914917, - "grad_norm": 1.1952472925186157, - "learning_rate": 1.1605839416058396e-05, - "loss": 1.3745, + "epoch": 0.05517002081887578, + "grad_norm": 1.0862407684326172, + "learning_rate": 5.51604509973981e-06, + "loss": 0.8972, "step": 318 }, { - "epoch": 0.11648712798977542, - "grad_norm": 0.9983155131340027, - "learning_rate": 1.1642335766423359e-05, - "loss": 1.3745, + "epoch": 0.05534351145038168, + "grad_norm": 2.020535469055176, + "learning_rate": 5.533391153512576e-06, + "loss": 0.9824, "step": 319 }, { - "epoch": 0.11685229140040168, - "grad_norm": 2.0398621559143066, - "learning_rate": 1.1678832116788322e-05, - "loss": 1.4141, + "epoch": 0.055517002081887576, + "grad_norm": 0.888179361820221, + "learning_rate": 5.5507372072853426e-06, + "loss": 1.1094, "step": 320 }, { - "epoch": 0.11721745481102794, - "grad_norm": 1.1879554986953735, - "learning_rate": 1.1715328467153286e-05, - "loss": 1.3564, + "epoch": 0.05569049271339348, + "grad_norm": 0.7227441668510437, + "learning_rate": 5.568083261058109e-06, + "loss": 1.0764, "step": 321 }, { - "epoch": 0.1175826182216542, - "grad_norm": 1.1611847877502441, - "learning_rate": 1.1751824817518249e-05, - "loss": 1.3944, + "epoch": 0.05586398334489938, + "grad_norm": 1.4483895301818848, + "learning_rate": 5.585429314830877e-06, + "loss": 0.856, "step": 322 }, { - "epoch": 0.11794778163228045, - "grad_norm": 0.9878073334693909, - "learning_rate": 1.1788321167883212e-05, - "loss": 1.3972, + "epoch": 0.05603747397640527, + "grad_norm": 0.8146894574165344, + "learning_rate": 5.6027753686036435e-06, + "loss": 1.1274, "step": 323 }, { - "epoch": 0.1183129450429067, - "grad_norm": 0.9224428534507751, - "learning_rate": 1.1824817518248176e-05, - "loss": 1.3655, + "epoch": 0.056210964607911175, + "grad_norm": 1.4646304845809937, + "learning_rate": 5.62012142237641e-06, + "loss": 1.0442, "step": 324 }, { - "epoch": 0.11867810845353295, - "grad_norm": 1.7453678846359253, - "learning_rate": 1.186131386861314e-05, - "loss": 1.3979, + "epoch": 0.05638445523941707, + "grad_norm": 0.7141495943069458, + "learning_rate": 5.637467476149177e-06, + "loss": 1.0559, "step": 325 }, { - "epoch": 0.11904327186415921, - "grad_norm": 0.9843180775642395, - "learning_rate": 1.1897810218978102e-05, - "loss": 1.3628, + "epoch": 0.05655794587092297, + "grad_norm": 0.8768091797828674, + "learning_rate": 5.654813529921943e-06, + "loss": 1.0696, "step": 326 }, { - "epoch": 0.11940843527478547, - "grad_norm": 1.3384828567504883, - "learning_rate": 1.1934306569343067e-05, - "loss": 1.4076, + "epoch": 0.056731436502428866, + "grad_norm": 0.9401630759239197, + "learning_rate": 5.6721595836947094e-06, + "loss": 1.1926, "step": 327 }, { - "epoch": 0.11977359868541172, - "grad_norm": 1.3804056644439697, - "learning_rate": 1.1970802919708031e-05, - "loss": 1.4124, + "epoch": 0.05690492713393477, + "grad_norm": 0.9278298020362854, + "learning_rate": 5.689505637467476e-06, + "loss": 1.0056, "step": 328 }, { - "epoch": 0.12013876209603798, - "grad_norm": 0.9530078768730164, - "learning_rate": 1.2007299270072994e-05, - "loss": 1.344, + "epoch": 0.05707841776544067, + "grad_norm": 1.0270179510116577, + "learning_rate": 5.706851691240244e-06, + "loss": 1.1211, "step": 329 }, { - "epoch": 0.12050392550666424, - "grad_norm": 0.9892421960830688, - "learning_rate": 1.2043795620437957e-05, - "loss": 1.3125, + "epoch": 0.05725190839694656, + "grad_norm": 0.7549020648002625, + "learning_rate": 5.72419774501301e-06, + "loss": 1.0276, "step": 330 }, { - "epoch": 0.1208690889172905, - "grad_norm": 0.8752195239067078, - "learning_rate": 1.208029197080292e-05, - "loss": 1.3788, + "epoch": 0.057425399028452465, + "grad_norm": 1.131272554397583, + "learning_rate": 5.741543798785777e-06, + "loss": 0.9932, "step": 331 }, { - "epoch": 0.12123425232791674, - "grad_norm": 1.4306470155715942, - "learning_rate": 1.2116788321167885e-05, - "loss": 1.4043, + "epoch": 0.05759888965995836, + "grad_norm": 1.0542311668395996, + "learning_rate": 5.758889852558544e-06, + "loss": 1.0437, "step": 332 }, { - "epoch": 0.12159941573854299, - "grad_norm": 1.5841552019119263, - "learning_rate": 1.2153284671532847e-05, - "loss": 1.4287, + "epoch": 0.05777238029146426, + "grad_norm": 1.3760309219360352, + "learning_rate": 5.7762359063313105e-06, + "loss": 0.9846, "step": 333 }, { - "epoch": 0.12196457914916925, - "grad_norm": 1.056060791015625, - "learning_rate": 1.218978102189781e-05, - "loss": 1.3882, + "epoch": 0.05794587092297016, + "grad_norm": 1.085569143295288, + "learning_rate": 5.793581960104076e-06, + "loss": 0.8716, "step": 334 }, { - "epoch": 0.1223297425597955, - "grad_norm": 1.4017077684402466, - "learning_rate": 1.2226277372262773e-05, - "loss": 1.3682, + "epoch": 0.05811936155447606, + "grad_norm": 0.700622022151947, + "learning_rate": 5.810928013876843e-06, + "loss": 1.2139, "step": 335 }, { - "epoch": 0.12269490597042176, - "grad_norm": 1.42486572265625, - "learning_rate": 1.226277372262774e-05, - "loss": 1.376, + "epoch": 0.05829285218598196, + "grad_norm": 1.353592872619629, + "learning_rate": 5.82827406764961e-06, + "loss": 0.9514, "step": 336 }, { - "epoch": 0.12306006938104802, - "grad_norm": 1.143389344215393, - "learning_rate": 1.2299270072992702e-05, - "loss": 1.3669, + "epoch": 0.05846634281748785, + "grad_norm": 1.0243644714355469, + "learning_rate": 5.845620121422377e-06, + "loss": 0.886, "step": 337 }, { - "epoch": 0.12342523279167428, - "grad_norm": 2.052196979522705, - "learning_rate": 1.2335766423357665e-05, - "loss": 1.3505, + "epoch": 0.058639833448993754, + "grad_norm": 1.1798290014266968, + "learning_rate": 5.862966175195144e-06, + "loss": 1.0149, "step": 338 }, { - "epoch": 0.12379039620230053, - "grad_norm": 1.0062004327774048, - "learning_rate": 1.237226277372263e-05, - "loss": 1.3784, + "epoch": 0.058813324080499656, + "grad_norm": 0.7625278830528259, + "learning_rate": 5.880312228967911e-06, + "loss": 1.0701, "step": 339 }, { - "epoch": 0.12415555961292679, - "grad_norm": 0.902478039264679, - "learning_rate": 1.2408759124087593e-05, - "loss": 1.3196, + "epoch": 0.05898681471200555, + "grad_norm": 0.8068732619285583, + "learning_rate": 5.897658282740677e-06, + "loss": 1.0552, "step": 340 }, { - "epoch": 0.12452072302355303, - "grad_norm": 1.2943652868270874, - "learning_rate": 1.2445255474452555e-05, - "loss": 1.3899, + "epoch": 0.05916030534351145, + "grad_norm": 0.9275553822517395, + "learning_rate": 5.915004336513443e-06, + "loss": 0.9265, "step": 341 }, { - "epoch": 0.12488588643417929, - "grad_norm": 0.8503755927085876, - "learning_rate": 1.2481751824817518e-05, - "loss": 1.3438, + "epoch": 0.059333795975017346, + "grad_norm": 0.7661913633346558, + "learning_rate": 5.93235039028621e-06, + "loss": 1.1333, "step": 342 }, { - "epoch": 0.12525104984480556, - "grad_norm": 1.0718951225280762, - "learning_rate": 1.2518248175182483e-05, - "loss": 1.364, + "epoch": 0.05950728660652325, + "grad_norm": 0.8352433443069458, + "learning_rate": 5.9496964440589766e-06, + "loss": 0.925, "step": 343 }, { - "epoch": 0.12561621325543182, - "grad_norm": 0.9090144634246826, - "learning_rate": 1.2554744525547446e-05, - "loss": 1.3259, + "epoch": 0.05968077723802915, + "grad_norm": 1.3820022344589233, + "learning_rate": 5.967042497831744e-06, + "loss": 1.0159, "step": 344 }, { - "epoch": 0.12598137666605805, - "grad_norm": 1.2664545774459839, - "learning_rate": 1.2591240875912409e-05, - "loss": 1.3475, + "epoch": 0.059854267869535044, + "grad_norm": 0.8154564499855042, + "learning_rate": 5.984388551604511e-06, + "loss": 0.939, "step": 345 }, { - "epoch": 0.1263465400766843, - "grad_norm": 1.1439722776412964, - "learning_rate": 1.2627737226277371e-05, - "loss": 1.3397, + "epoch": 0.060027758501040945, + "grad_norm": 1.0361301898956299, + "learning_rate": 6.0017346053772775e-06, + "loss": 0.8599, "step": 346 }, { - "epoch": 0.12671170348731056, - "grad_norm": 1.0529422760009766, - "learning_rate": 1.2664233576642338e-05, - "loss": 1.3879, + "epoch": 0.06020124913254684, + "grad_norm": 0.8701859712600708, + "learning_rate": 6.019080659150044e-06, + "loss": 1.0613, "step": 347 }, { - "epoch": 0.12707686689793682, - "grad_norm": 1.2582058906555176, - "learning_rate": 1.27007299270073e-05, - "loss": 1.3665, + "epoch": 0.06037473976405274, + "grad_norm": 0.7980366349220276, + "learning_rate": 6.03642671292281e-06, + "loss": 1.0305, "step": 348 }, { - "epoch": 0.12744203030856308, - "grad_norm": 0.9276443123817444, - "learning_rate": 1.2737226277372263e-05, - "loss": 1.3199, + "epoch": 0.06054823039555864, + "grad_norm": 0.830289363861084, + "learning_rate": 6.053772766695577e-06, + "loss": 1.113, "step": 349 }, { - "epoch": 0.12780719371918933, - "grad_norm": 1.0268436670303345, - "learning_rate": 1.2773722627737228e-05, - "loss": 1.3323, + "epoch": 0.06072172102706454, + "grad_norm": 1.01641047000885, + "learning_rate": 6.0711188204683434e-06, + "loss": 1.1057, "step": 350 }, { - "epoch": 0.1281723571298156, - "grad_norm": 0.8266549706459045, - "learning_rate": 1.2810218978102191e-05, - "loss": 1.3252, + "epoch": 0.06089521165857044, + "grad_norm": 0.9694294929504395, + "learning_rate": 6.08846487424111e-06, + "loss": 0.968, "step": 351 }, { - "epoch": 0.12853752054044185, - "grad_norm": 0.8446390628814697, - "learning_rate": 1.2846715328467154e-05, - "loss": 1.353, + "epoch": 0.061068702290076333, + "grad_norm": 0.8516224026679993, + "learning_rate": 6.105810928013878e-06, + "loss": 0.9204, "step": 352 }, { - "epoch": 0.1289026839510681, - "grad_norm": 1.1027939319610596, - "learning_rate": 1.2883211678832117e-05, - "loss": 1.2875, + "epoch": 0.061242192921582235, + "grad_norm": 0.6658259630203247, + "learning_rate": 6.123156981786644e-06, + "loss": 1.0891, "step": 353 }, { - "epoch": 0.12926784736169436, - "grad_norm": 1.1213006973266602, - "learning_rate": 1.2919708029197083e-05, - "loss": 1.3618, + "epoch": 0.061415683553088136, + "grad_norm": 0.7363485097885132, + "learning_rate": 6.140503035559411e-06, + "loss": 1.0566, "step": 354 }, { - "epoch": 0.12963301077232062, - "grad_norm": 0.9873935580253601, - "learning_rate": 1.2956204379562046e-05, - "loss": 1.3354, + "epoch": 0.06158917418459403, + "grad_norm": 0.8270686864852905, + "learning_rate": 6.157849089332178e-06, + "loss": 1.0291, "step": 355 }, { - "epoch": 0.12999817418294687, - "grad_norm": 1.114080786705017, - "learning_rate": 1.2992700729927009e-05, - "loss": 1.3615, + "epoch": 0.06176266481609993, + "grad_norm": 1.1620134115219116, + "learning_rate": 6.175195143104944e-06, + "loss": 0.8875, "step": 356 }, { - "epoch": 0.13036333759357313, - "grad_norm": 0.9962366223335266, - "learning_rate": 1.3029197080291972e-05, - "loss": 1.3325, + "epoch": 0.06193615544760583, + "grad_norm": 0.806554913520813, + "learning_rate": 6.19254119687771e-06, + "loss": 0.9463, "step": 357 }, { - "epoch": 0.1307285010041994, - "grad_norm": 1.1926721334457397, - "learning_rate": 1.3065693430656936e-05, - "loss": 1.3953, + "epoch": 0.06210964607911173, + "grad_norm": 0.8697148561477661, + "learning_rate": 6.209887250650477e-06, + "loss": 1.1636, "step": 358 }, { - "epoch": 0.13109366441482564, - "grad_norm": 1.322531819343567, - "learning_rate": 1.3102189781021899e-05, - "loss": 1.3406, + "epoch": 0.06228313671061763, + "grad_norm": 1.1133986711502075, + "learning_rate": 6.2272333044232445e-06, + "loss": 0.9351, "step": 359 }, { - "epoch": 0.1314588278254519, - "grad_norm": 1.3915084600448608, - "learning_rate": 1.3138686131386862e-05, - "loss": 1.3523, + "epoch": 0.062456627342123525, + "grad_norm": 0.8614605069160461, + "learning_rate": 6.244579358196011e-06, + "loss": 0.9487, "step": 360 }, { - "epoch": 0.13182399123607816, - "grad_norm": 0.8676251173019409, - "learning_rate": 1.3175182481751825e-05, - "loss": 1.3303, + "epoch": 0.06263011797362943, + "grad_norm": 1.1043434143066406, + "learning_rate": 6.261925411968778e-06, + "loss": 1.198, "step": 361 }, { - "epoch": 0.1321891546467044, - "grad_norm": 1.0042997598648071, - "learning_rate": 1.321167883211679e-05, - "loss": 1.3713, + "epoch": 0.06280360860513533, + "grad_norm": 0.8437508344650269, + "learning_rate": 6.279271465741545e-06, + "loss": 1.168, "step": 362 }, { - "epoch": 0.13255431805733064, - "grad_norm": 0.999978244304657, - "learning_rate": 1.3248175182481752e-05, - "loss": 1.3472, + "epoch": 0.06297709923664122, + "grad_norm": 0.8266021609306335, + "learning_rate": 6.2966175195143105e-06, + "loss": 1.0066, "step": 363 }, { - "epoch": 0.1329194814679569, - "grad_norm": 1.1161954402923584, - "learning_rate": 1.3284671532846715e-05, - "loss": 1.3397, + "epoch": 0.06315058986814712, + "grad_norm": 0.9373189806938171, + "learning_rate": 6.313963573287077e-06, + "loss": 1.073, "step": 364 }, { - "epoch": 0.13328464487858316, - "grad_norm": 1.7889089584350586, - "learning_rate": 1.3321167883211681e-05, - "loss": 1.3076, + "epoch": 0.06332408049965302, + "grad_norm": 1.0874111652374268, + "learning_rate": 6.331309627059844e-06, + "loss": 0.8601, "step": 365 }, { - "epoch": 0.13364980828920942, - "grad_norm": 1.2017756700515747, - "learning_rate": 1.3357664233576644e-05, - "loss": 1.3564, + "epoch": 0.06349757113115892, + "grad_norm": 1.603624701499939, + "learning_rate": 6.348655680832611e-06, + "loss": 1.1187, "step": 366 }, { - "epoch": 0.13401497169983567, - "grad_norm": 1.4361116886138916, - "learning_rate": 1.3394160583941607e-05, - "loss": 1.3535, + "epoch": 0.06367106176266482, + "grad_norm": 1.022397518157959, + "learning_rate": 6.366001734605378e-06, + "loss": 0.8945, "step": 367 }, { - "epoch": 0.13438013511046193, - "grad_norm": 0.9656901359558105, - "learning_rate": 1.343065693430657e-05, - "loss": 1.2845, + "epoch": 0.06384455239417071, + "grad_norm": 0.8357471823692322, + "learning_rate": 6.383347788378145e-06, + "loss": 1.0898, "step": 368 }, { - "epoch": 0.13474529852108819, - "grad_norm": 1.0214009284973145, - "learning_rate": 1.3467153284671534e-05, - "loss": 1.3408, + "epoch": 0.06401804302567661, + "grad_norm": 1.1832951307296753, + "learning_rate": 6.4006938421509115e-06, + "loss": 0.9639, "step": 369 }, { - "epoch": 0.13511046193171444, - "grad_norm": 2.5494346618652344, - "learning_rate": 1.3503649635036497e-05, - "loss": 1.3109, + "epoch": 0.06419153365718251, + "grad_norm": 0.6972271203994751, + "learning_rate": 6.418039895923678e-06, + "loss": 1.0801, "step": 370 }, { - "epoch": 0.1354756253423407, - "grad_norm": 0.7934643030166626, - "learning_rate": 1.354014598540146e-05, - "loss": 1.3115, + "epoch": 0.06436502428868841, + "grad_norm": 0.8201931118965149, + "learning_rate": 6.435385949696444e-06, + "loss": 0.8159, "step": 371 }, { - "epoch": 0.13584078875296696, - "grad_norm": 1.1864356994628906, - "learning_rate": 1.3576642335766423e-05, - "loss": 1.3733, + "epoch": 0.06453851492019431, + "grad_norm": 0.8767890930175781, + "learning_rate": 6.452732003469211e-06, + "loss": 0.948, "step": 372 }, { - "epoch": 0.1362059521635932, - "grad_norm": 0.97276371717453, - "learning_rate": 1.361313868613139e-05, - "loss": 1.3229, + "epoch": 0.0647120055517002, + "grad_norm": 0.9244304895401001, + "learning_rate": 6.4700780572419774e-06, + "loss": 0.9983, "step": 373 }, { - "epoch": 0.13657111557421947, - "grad_norm": 3.46140193939209, - "learning_rate": 1.3649635036496352e-05, - "loss": 1.3491, + "epoch": 0.0648854961832061, + "grad_norm": 0.9409970641136169, + "learning_rate": 6.487424111014745e-06, + "loss": 0.9885, "step": 374 }, { - "epoch": 0.13693627898484573, - "grad_norm": 1.4121092557907104, - "learning_rate": 1.3686131386861315e-05, - "loss": 1.3071, + "epoch": 0.065058986814712, + "grad_norm": 1.0992331504821777, + "learning_rate": 6.504770164787512e-06, + "loss": 0.9219, "step": 375 }, { - "epoch": 0.13730144239547198, - "grad_norm": 1.1235361099243164, - "learning_rate": 1.372262773722628e-05, - "loss": 1.314, + "epoch": 0.0652324774462179, + "grad_norm": 0.8378692865371704, + "learning_rate": 6.522116218560278e-06, + "loss": 0.9954, "step": 376 }, { - "epoch": 0.13766660580609824, - "grad_norm": 1.2276830673217773, - "learning_rate": 1.3759124087591242e-05, - "loss": 1.3582, + "epoch": 0.06540596807772381, + "grad_norm": 0.8875526785850525, + "learning_rate": 6.539462272333045e-06, + "loss": 1.0564, "step": 377 }, { - "epoch": 0.1380317692167245, - "grad_norm": 1.2236053943634033, - "learning_rate": 1.3795620437956205e-05, - "loss": 1.3596, + "epoch": 0.0655794587092297, + "grad_norm": 0.935085654258728, + "learning_rate": 6.556808326105811e-06, + "loss": 0.887, "step": 378 }, { - "epoch": 0.13839693262735073, - "grad_norm": 1.123871922492981, - "learning_rate": 1.3832116788321168e-05, - "loss": 1.3069, + "epoch": 0.0657529493407356, + "grad_norm": 0.824652910232544, + "learning_rate": 6.574154379878578e-06, + "loss": 1.0535, "step": 379 }, { - "epoch": 0.13876209603797698, - "grad_norm": 1.0352568626403809, - "learning_rate": 1.3868613138686133e-05, - "loss": 1.2643, + "epoch": 0.0659264399722415, + "grad_norm": 0.9888954758644104, + "learning_rate": 6.591500433651344e-06, + "loss": 1.0061, "step": 380 }, { - "epoch": 0.13912725944860324, - "grad_norm": 1.9417728185653687, - "learning_rate": 1.3905109489051096e-05, - "loss": 1.3315, + "epoch": 0.0660999306037474, + "grad_norm": 0.9201369881629944, + "learning_rate": 6.608846487424112e-06, + "loss": 0.9622, "step": 381 }, { - "epoch": 0.1394924228592295, - "grad_norm": 1.372770071029663, - "learning_rate": 1.3941605839416059e-05, - "loss": 1.358, + "epoch": 0.0662734212352533, + "grad_norm": 0.8864395618438721, + "learning_rate": 6.6261925411968785e-06, + "loss": 0.9927, "step": 382 }, { - "epoch": 0.13985758626985575, - "grad_norm": 0.9731897115707397, - "learning_rate": 1.3978102189781021e-05, - "loss": 1.2852, + "epoch": 0.06644691186675919, + "grad_norm": 0.7740955948829651, + "learning_rate": 6.643538594969645e-06, + "loss": 0.9888, "step": 383 }, { - "epoch": 0.140222749680482, - "grad_norm": 1.041582465171814, - "learning_rate": 1.4014598540145988e-05, - "loss": 1.2872, + "epoch": 0.06662040249826509, + "grad_norm": 0.9387248754501343, + "learning_rate": 6.660884648742412e-06, + "loss": 0.8901, "step": 384 }, { - "epoch": 0.14058791309110827, - "grad_norm": 1.063693642616272, - "learning_rate": 1.405109489051095e-05, - "loss": 1.3137, + "epoch": 0.06679389312977099, + "grad_norm": 1.0369046926498413, + "learning_rate": 6.678230702515179e-06, + "loss": 0.8892, "step": 385 }, { - "epoch": 0.14095307650173453, - "grad_norm": 0.8546766042709351, - "learning_rate": 1.4087591240875913e-05, - "loss": 1.295, + "epoch": 0.0669673837612769, + "grad_norm": 1.0651519298553467, + "learning_rate": 6.6955767562879445e-06, + "loss": 0.8359, "step": 386 }, { - "epoch": 0.14131823991236078, - "grad_norm": 1.3430160284042358, - "learning_rate": 1.4124087591240878e-05, - "loss": 1.3477, + "epoch": 0.0671408743927828, + "grad_norm": 0.9861769080162048, + "learning_rate": 6.712922810060711e-06, + "loss": 0.9448, "step": 387 }, { - "epoch": 0.14168340332298704, - "grad_norm": 1.1072465181350708, - "learning_rate": 1.416058394160584e-05, - "loss": 1.3088, + "epoch": 0.06731436502428868, + "grad_norm": 0.8586755990982056, + "learning_rate": 6.730268863833478e-06, + "loss": 1.0139, "step": 388 }, { - "epoch": 0.1420485667336133, - "grad_norm": 0.9957759976387024, - "learning_rate": 1.4197080291970804e-05, - "loss": 1.323, + "epoch": 0.06748785565579458, + "grad_norm": 0.8464505672454834, + "learning_rate": 6.747614917606245e-06, + "loss": 1.0071, "step": 389 }, { - "epoch": 0.14241373014423955, - "grad_norm": 1.5631967782974243, - "learning_rate": 1.4233576642335767e-05, - "loss": 1.3806, + "epoch": 0.06766134628730049, + "grad_norm": 1.168440580368042, + "learning_rate": 6.764960971379012e-06, + "loss": 1.0232, "step": 390 }, { - "epoch": 0.1427788935548658, - "grad_norm": 1.3982360363006592, - "learning_rate": 1.4270072992700733e-05, - "loss": 1.3379, + "epoch": 0.06783483691880639, + "grad_norm": 0.9936010837554932, + "learning_rate": 6.782307025151779e-06, + "loss": 0.9697, "step": 391 }, { - "epoch": 0.14314405696549207, - "grad_norm": 1.0132577419281006, - "learning_rate": 1.4306569343065696e-05, - "loss": 1.2806, + "epoch": 0.06800832755031229, + "grad_norm": 0.9261284470558167, + "learning_rate": 6.7996530789245455e-06, + "loss": 0.8933, "step": 392 }, { - "epoch": 0.14350922037611832, - "grad_norm": 0.9312093257904053, - "learning_rate": 1.4343065693430659e-05, - "loss": 1.2704, + "epoch": 0.06818181818181818, + "grad_norm": 0.7910310626029968, + "learning_rate": 6.816999132697311e-06, + "loss": 1.1108, "step": 393 }, { - "epoch": 0.14387438378674458, - "grad_norm": 1.1648707389831543, - "learning_rate": 1.4379562043795621e-05, - "loss": 1.335, + "epoch": 0.06835530881332408, + "grad_norm": 0.8086631894111633, + "learning_rate": 6.834345186470078e-06, + "loss": 1.0867, "step": 394 }, { - "epoch": 0.1442395471973708, - "grad_norm": 1.7935665845870972, - "learning_rate": 1.4416058394160586e-05, - "loss": 1.3513, + "epoch": 0.06852879944482998, + "grad_norm": 0.9535518288612366, + "learning_rate": 6.851691240242845e-06, + "loss": 0.9116, "step": 395 }, { - "epoch": 0.14460471060799707, - "grad_norm": 1.0111833810806274, - "learning_rate": 1.4452554744525549e-05, - "loss": 1.3032, + "epoch": 0.06870229007633588, + "grad_norm": 0.8716365098953247, + "learning_rate": 6.869037294015612e-06, + "loss": 1.0459, "step": 396 }, { - "epoch": 0.14496987401862332, - "grad_norm": 1.0907845497131348, - "learning_rate": 1.4489051094890512e-05, - "loss": 1.3341, + "epoch": 0.06887578070784178, + "grad_norm": 0.8731881976127625, + "learning_rate": 6.886383347788379e-06, + "loss": 0.9224, "step": 397 }, { - "epoch": 0.14533503742924958, - "grad_norm": 0.9275170564651489, - "learning_rate": 1.4525547445255475e-05, - "loss": 1.3044, + "epoch": 0.06904927133934767, + "grad_norm": 0.6795954704284668, + "learning_rate": 6.903729401561146e-06, + "loss": 1.0874, "step": 398 }, { - "epoch": 0.14570020083987584, - "grad_norm": 1.3322445154190063, - "learning_rate": 1.456204379562044e-05, - "loss": 1.2926, + "epoch": 0.06922276197085357, + "grad_norm": 0.8938729763031006, + "learning_rate": 6.921075455333912e-06, + "loss": 0.9702, "step": 399 }, { - "epoch": 0.1460653642505021, - "grad_norm": 1.3825352191925049, - "learning_rate": 1.4598540145985402e-05, - "loss": 1.2971, + "epoch": 0.06939625260235947, + "grad_norm": 0.8065364956855774, + "learning_rate": 6.938421509106679e-06, + "loss": 0.9343, "step": 400 }, { - "epoch": 0.14643052766112835, - "grad_norm": 1.0189565420150757, - "learning_rate": 1.4635036496350365e-05, - "loss": 1.248, + "epoch": 0.06956974323386537, + "grad_norm": 0.719506561756134, + "learning_rate": 6.955767562879445e-06, + "loss": 0.9551, "step": 401 }, { - "epoch": 0.1467956910717546, - "grad_norm": 0.8858089447021484, - "learning_rate": 1.4671532846715331e-05, - "loss": 1.2826, + "epoch": 0.06974323386537128, + "grad_norm": 1.2143350839614868, + "learning_rate": 6.973113616652212e-06, + "loss": 0.9924, "step": 402 }, { - "epoch": 0.14716085448238087, - "grad_norm": 1.1177042722702026, - "learning_rate": 1.4708029197080294e-05, - "loss": 1.3042, + "epoch": 0.06991672449687716, + "grad_norm": 1.146195650100708, + "learning_rate": 6.990459670424979e-06, + "loss": 1.0747, "step": 403 }, { - "epoch": 0.14752601789300712, - "grad_norm": 1.450744867324829, - "learning_rate": 1.4744525547445257e-05, - "loss": 1.3396, + "epoch": 0.07009021512838307, + "grad_norm": 0.9017952680587769, + "learning_rate": 7.007805724197746e-06, + "loss": 0.9282, "step": 404 }, { - "epoch": 0.14789118130363338, - "grad_norm": 1.3961888551712036, - "learning_rate": 1.478102189781022e-05, - "loss": 1.2908, + "epoch": 0.07026370575988897, + "grad_norm": 0.9841042160987854, + "learning_rate": 7.0251517779705125e-06, + "loss": 0.9351, "step": 405 }, { - "epoch": 0.14825634471425964, - "grad_norm": 0.9936912059783936, - "learning_rate": 1.4817518248175184e-05, - "loss": 1.2976, + "epoch": 0.07043719639139487, + "grad_norm": 1.2008854150772095, + "learning_rate": 7.042497831743279e-06, + "loss": 1.0254, "step": 406 }, { - "epoch": 0.1486215081248859, - "grad_norm": 1.076718807220459, - "learning_rate": 1.4854014598540147e-05, - "loss": 1.2764, + "epoch": 0.07061068702290077, + "grad_norm": 1.0576246976852417, + "learning_rate": 7.059843885516046e-06, + "loss": 0.8909, "step": 407 }, { - "epoch": 0.14898667153551215, - "grad_norm": 1.1668205261230469, - "learning_rate": 1.489051094890511e-05, - "loss": 1.3203, + "epoch": 0.07078417765440666, + "grad_norm": 1.06325364112854, + "learning_rate": 7.077189939288812e-06, + "loss": 0.8491, "step": 408 }, { - "epoch": 0.1493518349461384, - "grad_norm": 0.9831510186195374, - "learning_rate": 1.4927007299270073e-05, - "loss": 1.3135, + "epoch": 0.07095766828591256, + "grad_norm": 0.9034683704376221, + "learning_rate": 7.0945359930615785e-06, + "loss": 0.9795, "step": 409 }, { - "epoch": 0.14971699835676466, - "grad_norm": 0.9131173491477966, - "learning_rate": 1.4963503649635038e-05, - "loss": 1.3008, + "epoch": 0.07113115891741846, + "grad_norm": 0.8972019553184509, + "learning_rate": 7.111882046834345e-06, + "loss": 0.9629, "step": 410 }, { - "epoch": 0.15008216176739092, - "grad_norm": 1.4321012496948242, - "learning_rate": 1.5000000000000002e-05, - "loss": 1.3621, + "epoch": 0.07130464954892436, + "grad_norm": 1.418184757232666, + "learning_rate": 7.129228100607113e-06, + "loss": 0.8655, "step": 411 }, { - "epoch": 0.15044732517801715, - "grad_norm": 1.1949595212936401, - "learning_rate": 1.5036496350364965e-05, - "loss": 1.3198, + "epoch": 0.07147814018043026, + "grad_norm": 1.0950132608413696, + "learning_rate": 7.146574154379879e-06, + "loss": 0.7983, "step": 412 }, { - "epoch": 0.1508124885886434, - "grad_norm": 0.9398919343948364, - "learning_rate": 1.507299270072993e-05, - "loss": 1.302, + "epoch": 0.07165163081193615, + "grad_norm": 0.9816370010375977, + "learning_rate": 7.163920208152646e-06, + "loss": 0.9172, "step": 413 }, { - "epoch": 0.15117765199926966, - "grad_norm": 1.2115134000778198, - "learning_rate": 1.5109489051094892e-05, - "loss": 1.2762, + "epoch": 0.07182512144344205, + "grad_norm": 0.8744332194328308, + "learning_rate": 7.181266261925413e-06, + "loss": 0.9651, "step": 414 }, { - "epoch": 0.15154281540989592, - "grad_norm": 1.2991162538528442, - "learning_rate": 1.5145985401459855e-05, - "loss": 1.3257, + "epoch": 0.07199861207494795, + "grad_norm": 0.8754039406776428, + "learning_rate": 7.1986123156981795e-06, + "loss": 0.9363, "step": 415 }, { - "epoch": 0.15190797882052218, - "grad_norm": 1.1902439594268799, - "learning_rate": 1.5182481751824818e-05, - "loss": 1.2791, + "epoch": 0.07217210270645386, + "grad_norm": 0.8603073358535767, + "learning_rate": 7.215958369470945e-06, + "loss": 1.1108, "step": 416 }, { - "epoch": 0.15227314223114843, - "grad_norm": 0.9622432589530945, - "learning_rate": 1.5218978102189783e-05, - "loss": 1.2937, + "epoch": 0.07234559333795976, + "grad_norm": 1.1682006120681763, + "learning_rate": 7.233304423243712e-06, + "loss": 0.8799, "step": 417 }, { - "epoch": 0.1526383056417747, - "grad_norm": 0.8861438035964966, - "learning_rate": 1.5255474452554746e-05, - "loss": 1.2805, + "epoch": 0.07251908396946564, + "grad_norm": 1.6574336290359497, + "learning_rate": 7.25065047701648e-06, + "loss": 1.0129, "step": 418 }, { - "epoch": 0.15300346905240095, - "grad_norm": 1.1244996786117554, - "learning_rate": 1.529197080291971e-05, - "loss": 1.2723, + "epoch": 0.07269257460097155, + "grad_norm": 0.7819543480873108, + "learning_rate": 7.267996530789246e-06, + "loss": 1.1169, "step": 419 }, { - "epoch": 0.1533686324630272, - "grad_norm": 1.2154823541641235, - "learning_rate": 1.5328467153284673e-05, - "loss": 1.3391, + "epoch": 0.07286606523247745, + "grad_norm": 1.5049974918365479, + "learning_rate": 7.285342584562013e-06, + "loss": 0.9102, "step": 420 }, { - "epoch": 0.15373379587365346, - "grad_norm": 0.8650785088539124, - "learning_rate": 1.5364963503649638e-05, - "loss": 1.2839, + "epoch": 0.07303955586398335, + "grad_norm": 1.1448173522949219, + "learning_rate": 7.30268863833478e-06, + "loss": 1.0312, "step": 421 }, { - "epoch": 0.15409895928427972, - "grad_norm": 1.3965401649475098, - "learning_rate": 1.54014598540146e-05, - "loss": 1.3145, + "epoch": 0.07321304649548924, + "grad_norm": 0.9552216529846191, + "learning_rate": 7.320034692107546e-06, + "loss": 0.9998, "step": 422 }, { - "epoch": 0.15446412269490598, - "grad_norm": 1.1235862970352173, - "learning_rate": 1.5437956204379563e-05, - "loss": 1.2697, + "epoch": 0.07338653712699514, + "grad_norm": 1.0234169960021973, + "learning_rate": 7.337380745880312e-06, + "loss": 0.9021, "step": 423 }, { - "epoch": 0.15482928610553223, - "grad_norm": 1.1510714292526245, - "learning_rate": 1.5474452554744528e-05, - "loss": 1.306, + "epoch": 0.07356002775850104, + "grad_norm": 1.1055302619934082, + "learning_rate": 7.354726799653079e-06, + "loss": 0.8965, "step": 424 }, { - "epoch": 0.1551944495161585, - "grad_norm": 1.091079592704773, - "learning_rate": 1.5510948905109492e-05, - "loss": 1.3198, + "epoch": 0.07373351839000694, + "grad_norm": 0.9109980463981628, + "learning_rate": 7.3720728534258464e-06, + "loss": 1.0012, "step": 425 }, { - "epoch": 0.15555961292678475, - "grad_norm": 1.3336533308029175, - "learning_rate": 1.5547445255474454e-05, - "loss": 1.2896, + "epoch": 0.07390700902151284, + "grad_norm": 1.096415400505066, + "learning_rate": 7.389418907198613e-06, + "loss": 1.0142, "step": 426 }, { - "epoch": 0.155924776337411, - "grad_norm": 1.0023530721664429, - "learning_rate": 1.5583941605839418e-05, - "loss": 1.2798, + "epoch": 0.07408049965301873, + "grad_norm": 0.8121293187141418, + "learning_rate": 7.40676496097138e-06, + "loss": 1.0461, "step": 427 }, { - "epoch": 0.15628993974803726, - "grad_norm": 1.2282360792160034, - "learning_rate": 1.5620437956204383e-05, - "loss": 1.287, + "epoch": 0.07425399028452463, + "grad_norm": 0.9318151473999023, + "learning_rate": 7.4241110147441465e-06, + "loss": 1.0835, "step": 428 }, { - "epoch": 0.1566551031586635, - "grad_norm": 0.9336280822753906, - "learning_rate": 1.5656934306569344e-05, - "loss": 1.2911, + "epoch": 0.07442748091603053, + "grad_norm": 0.8428558111190796, + "learning_rate": 7.441457068516913e-06, + "loss": 0.9734, "step": 429 }, { - "epoch": 0.15702026656928975, - "grad_norm": 1.3188025951385498, - "learning_rate": 1.569343065693431e-05, - "loss": 1.2834, + "epoch": 0.07460097154753643, + "grad_norm": 1.2282167673110962, + "learning_rate": 7.45880312228968e-06, + "loss": 0.8616, "step": 430 }, { - "epoch": 0.157385429979916, - "grad_norm": 0.9228805899620056, - "learning_rate": 1.572992700729927e-05, - "loss": 1.2881, + "epoch": 0.07477446217904234, + "grad_norm": 0.7140018343925476, + "learning_rate": 7.476149176062446e-06, + "loss": 1.105, "step": 431 }, { - "epoch": 0.15775059339054226, - "grad_norm": 0.8465338945388794, - "learning_rate": 1.5766423357664234e-05, - "loss": 1.2472, + "epoch": 0.07494795281054822, + "grad_norm": 0.8262775540351868, + "learning_rate": 7.4934952298352125e-06, + "loss": 0.907, "step": 432 }, { - "epoch": 0.15811575680116852, - "grad_norm": 1.1760077476501465, - "learning_rate": 1.58029197080292e-05, - "loss": 1.2778, + "epoch": 0.07512144344205413, + "grad_norm": 0.9862105250358582, + "learning_rate": 7.51084128360798e-06, + "loss": 0.9573, "step": 433 }, { - "epoch": 0.15848092021179477, - "grad_norm": 1.1061915159225464, - "learning_rate": 1.583941605839416e-05, - "loss": 1.2518, + "epoch": 0.07529493407356003, + "grad_norm": 0.9502255320549011, + "learning_rate": 7.528187337380747e-06, + "loss": 0.8286, "step": 434 }, { - "epoch": 0.15884608362242103, - "grad_norm": 1.2939090728759766, - "learning_rate": 1.5875912408759125e-05, - "loss": 1.2983, + "epoch": 0.07546842470506593, + "grad_norm": 0.9837796092033386, + "learning_rate": 7.545533391153513e-06, + "loss": 0.9375, "step": 435 }, { - "epoch": 0.1592112470330473, - "grad_norm": 1.2089791297912598, - "learning_rate": 1.591240875912409e-05, - "loss": 1.2788, + "epoch": 0.07564191533657183, + "grad_norm": 0.873621940612793, + "learning_rate": 7.56287944492628e-06, + "loss": 1.0251, "step": 436 }, { - "epoch": 0.15957641044367354, - "grad_norm": 1.1044162511825562, - "learning_rate": 1.5948905109489054e-05, - "loss": 1.2882, + "epoch": 0.07581540596807772, + "grad_norm": 0.8327478170394897, + "learning_rate": 7.580225498699047e-06, + "loss": 0.9172, "step": 437 }, { - "epoch": 0.1599415738542998, - "grad_norm": 1.2863142490386963, - "learning_rate": 1.5985401459854015e-05, - "loss": 1.2513, + "epoch": 0.07598889659958362, + "grad_norm": 0.9932119846343994, + "learning_rate": 7.597571552471813e-06, + "loss": 1.0315, "step": 438 }, { - "epoch": 0.16030673726492606, - "grad_norm": 1.0391117334365845, - "learning_rate": 1.602189781021898e-05, - "loss": 1.2649, + "epoch": 0.07616238723108952, + "grad_norm": 0.8143360018730164, + "learning_rate": 7.614917606244579e-06, + "loss": 0.9883, "step": 439 }, { - "epoch": 0.16067190067555231, - "grad_norm": 0.9854104518890381, - "learning_rate": 1.6058394160583944e-05, - "loss": 1.3081, + "epoch": 0.07633587786259542, + "grad_norm": 1.1451959609985352, + "learning_rate": 7.632263660017348e-06, + "loss": 1.1072, "step": 440 }, { - "epoch": 0.16103706408617857, - "grad_norm": 0.8545101881027222, - "learning_rate": 1.6094890510948905e-05, - "loss": 1.2242, + "epoch": 0.07650936849410132, + "grad_norm": 1.3686916828155518, + "learning_rate": 7.649609713790114e-06, + "loss": 0.9294, "step": 441 }, { - "epoch": 0.16140222749680483, - "grad_norm": 1.349980354309082, - "learning_rate": 1.613138686131387e-05, - "loss": 1.2902, + "epoch": 0.07668285912560721, + "grad_norm": 0.9336972236633301, + "learning_rate": 7.666955767562881e-06, + "loss": 1.0518, "step": 442 }, { - "epoch": 0.16176739090743109, - "grad_norm": 1.2008837461471558, - "learning_rate": 1.6167883211678834e-05, - "loss": 1.2705, + "epoch": 0.07685634975711311, + "grad_norm": 0.8345749974250793, + "learning_rate": 7.684301821335646e-06, + "loss": 0.9009, "step": 443 }, { - "epoch": 0.16213255431805734, - "grad_norm": 2.7046103477478027, - "learning_rate": 1.62043795620438e-05, - "loss": 1.3013, + "epoch": 0.07702984038861901, + "grad_norm": 0.7914079427719116, + "learning_rate": 7.701647875108413e-06, + "loss": 1.0356, "step": 444 }, { - "epoch": 0.1624977177286836, - "grad_norm": 1.4053059816360474, - "learning_rate": 1.624087591240876e-05, - "loss": 1.2822, + "epoch": 0.07720333102012492, + "grad_norm": 0.9881901144981384, + "learning_rate": 7.71899392888118e-06, + "loss": 0.9868, "step": 445 }, { - "epoch": 0.16286288113930983, - "grad_norm": 1.0000989437103271, - "learning_rate": 1.6277372262773725e-05, - "loss": 1.2653, + "epoch": 0.07737682165163082, + "grad_norm": 1.3121129274368286, + "learning_rate": 7.736339982653946e-06, + "loss": 0.9893, "step": 446 }, { - "epoch": 0.16322804454993609, - "grad_norm": 1.184553861618042, - "learning_rate": 1.631386861313869e-05, - "loss": 1.2798, + "epoch": 0.0775503122831367, + "grad_norm": 1.1553049087524414, + "learning_rate": 7.753686036426713e-06, + "loss": 0.9658, "step": 447 }, { - "epoch": 0.16359320796056234, - "grad_norm": 0.7760008573532104, - "learning_rate": 1.635036496350365e-05, - "loss": 1.2598, + "epoch": 0.0777238029146426, + "grad_norm": 1.0087815523147583, + "learning_rate": 7.771032090199481e-06, + "loss": 0.9192, "step": 448 }, { - "epoch": 0.1639583713711886, - "grad_norm": 0.9644598364830017, - "learning_rate": 1.6386861313868615e-05, - "loss": 1.2524, + "epoch": 0.07789729354614851, + "grad_norm": 0.9182251691818237, + "learning_rate": 7.788378143972248e-06, + "loss": 0.8599, "step": 449 }, { - "epoch": 0.16432353478181486, - "grad_norm": 0.8611111044883728, - "learning_rate": 1.642335766423358e-05, - "loss": 1.1823, + "epoch": 0.07807078417765441, + "grad_norm": 0.9385561347007751, + "learning_rate": 7.805724197745013e-06, + "loss": 0.8816, "step": 450 }, { - "epoch": 0.1646886981924411, - "grad_norm": 1.3381280899047852, - "learning_rate": 1.645985401459854e-05, - "loss": 1.297, + "epoch": 0.07824427480916031, + "grad_norm": 0.8525676131248474, + "learning_rate": 7.82307025151778e-06, + "loss": 0.96, "step": 451 }, { - "epoch": 0.16505386160306737, - "grad_norm": 1.5478073358535767, - "learning_rate": 1.6496350364963505e-05, - "loss": 1.3323, + "epoch": 0.0784177654406662, + "grad_norm": 2.0243334770202637, + "learning_rate": 7.840416305290546e-06, + "loss": 0.7554, "step": 452 }, { - "epoch": 0.16541902501369363, - "grad_norm": 1.1325156688690186, - "learning_rate": 1.6532846715328466e-05, - "loss": 1.2649, + "epoch": 0.0785912560721721, + "grad_norm": 0.9865366220474243, + "learning_rate": 7.857762359063313e-06, + "loss": 0.8137, "step": 453 }, { - "epoch": 0.16578418842431988, - "grad_norm": 1.0778687000274658, - "learning_rate": 1.6569343065693434e-05, - "loss": 1.2307, + "epoch": 0.078764746703678, + "grad_norm": 0.9368730187416077, + "learning_rate": 7.87510841283608e-06, + "loss": 0.9814, "step": 454 }, { - "epoch": 0.16614935183494614, - "grad_norm": 1.0737855434417725, - "learning_rate": 1.6605839416058395e-05, - "loss": 1.2079, + "epoch": 0.0789382373351839, + "grad_norm": 0.9842661023139954, + "learning_rate": 7.892454466608848e-06, + "loss": 1.0222, "step": 455 }, { - "epoch": 0.1665145152455724, - "grad_norm": 1.1118228435516357, - "learning_rate": 1.664233576642336e-05, - "loss": 1.3027, + "epoch": 0.0791117279666898, + "grad_norm": 0.9765263199806213, + "learning_rate": 7.909800520381615e-06, + "loss": 0.9011, "step": 456 }, { - "epoch": 0.16687967865619865, - "grad_norm": 1.0090575218200684, - "learning_rate": 1.667883211678832e-05, - "loss": 1.2542, + "epoch": 0.07928521859819569, + "grad_norm": 0.8481934666633606, + "learning_rate": 7.927146574154382e-06, + "loss": 0.9414, "step": 457 }, { - "epoch": 0.1672448420668249, - "grad_norm": 1.1420024633407593, - "learning_rate": 1.6715328467153286e-05, - "loss": 1.2959, + "epoch": 0.0794587092297016, + "grad_norm": 1.4840737581253052, + "learning_rate": 7.944492627927147e-06, + "loss": 1.0957, "step": 458 }, { - "epoch": 0.16761000547745117, - "grad_norm": 1.10788893699646, - "learning_rate": 1.675182481751825e-05, - "loss": 1.2377, + "epoch": 0.0796321998612075, + "grad_norm": 1.9366291761398315, + "learning_rate": 7.961838681699913e-06, + "loss": 1.1167, "step": 459 }, { - "epoch": 0.16797516888807743, - "grad_norm": 1.159864902496338, - "learning_rate": 1.678832116788321e-05, - "loss": 1.3014, + "epoch": 0.0798056904927134, + "grad_norm": 1.217577576637268, + "learning_rate": 7.97918473547268e-06, + "loss": 0.8154, "step": 460 }, { - "epoch": 0.16834033229870368, - "grad_norm": 1.370806336402893, - "learning_rate": 1.6824817518248176e-05, - "loss": 1.2355, + "epoch": 0.0799791811242193, + "grad_norm": 0.9839723110198975, + "learning_rate": 7.996530789245447e-06, + "loss": 1.0144, "step": 461 }, { - "epoch": 0.16870549570932994, - "grad_norm": 1.1762539148330688, - "learning_rate": 1.686131386861314e-05, - "loss": 1.2888, + "epoch": 0.08015267175572519, + "grad_norm": 0.8602697253227234, + "learning_rate": 8.013876843018215e-06, + "loss": 1.0005, "step": 462 }, { - "epoch": 0.16907065911995617, - "grad_norm": 1.1329491138458252, - "learning_rate": 1.6897810218978102e-05, - "loss": 1.3088, + "epoch": 0.08032616238723109, + "grad_norm": 0.8449880480766296, + "learning_rate": 8.031222896790982e-06, + "loss": 1.0774, "step": 463 }, { - "epoch": 0.16943582253058243, - "grad_norm": 1.4468282461166382, - "learning_rate": 1.6934306569343066e-05, - "loss": 1.3037, + "epoch": 0.08049965301873699, + "grad_norm": 0.8146034479141235, + "learning_rate": 8.048568950563748e-06, + "loss": 0.9185, "step": 464 }, { - "epoch": 0.16980098594120868, - "grad_norm": 0.9057644009590149, - "learning_rate": 1.697080291970803e-05, - "loss": 1.2642, + "epoch": 0.08067314365024289, + "grad_norm": 0.7100718021392822, + "learning_rate": 8.065915004336513e-06, + "loss": 1.0754, "step": 465 }, { - "epoch": 0.17016614935183494, - "grad_norm": 0.8372164368629456, - "learning_rate": 1.7007299270072995e-05, - "loss": 1.2629, + "epoch": 0.08084663428174879, + "grad_norm": 0.9153879880905151, + "learning_rate": 8.08326105810928e-06, + "loss": 0.908, "step": 466 }, { - "epoch": 0.1705313127624612, - "grad_norm": 0.8911778926849365, - "learning_rate": 1.7043795620437957e-05, - "loss": 1.2334, + "epoch": 0.08102012491325468, + "grad_norm": 1.086484670639038, + "learning_rate": 8.100607111882047e-06, + "loss": 0.803, "step": 467 }, { - "epoch": 0.17089647617308745, - "grad_norm": 0.9696996808052063, - "learning_rate": 1.708029197080292e-05, - "loss": 1.2277, + "epoch": 0.08119361554476058, + "grad_norm": 0.8893733620643616, + "learning_rate": 8.117953165654814e-06, + "loss": 0.9412, "step": 468 }, { - "epoch": 0.1712616395837137, - "grad_norm": 1.0355796813964844, - "learning_rate": 1.7116788321167886e-05, - "loss": 1.2629, + "epoch": 0.08136710617626648, + "grad_norm": 0.909820556640625, + "learning_rate": 8.13529921942758e-06, + "loss": 0.8567, "step": 469 }, { - "epoch": 0.17162680299433997, - "grad_norm": 1.4189716577529907, - "learning_rate": 1.7153284671532847e-05, - "loss": 1.2719, + "epoch": 0.08154059680777238, + "grad_norm": 1.0070123672485352, + "learning_rate": 8.152645273200349e-06, + "loss": 1.061, "step": 470 }, { - "epoch": 0.17199196640496622, - "grad_norm": 1.298628807067871, - "learning_rate": 1.718978102189781e-05, - "loss": 1.2959, + "epoch": 0.08171408743927829, + "grad_norm": 1.052509069442749, + "learning_rate": 8.169991326973115e-06, + "loss": 1.0762, "step": 471 }, { - "epoch": 0.17235712981559248, - "grad_norm": 1.1393733024597168, - "learning_rate": 1.7226277372262773e-05, - "loss": 1.2417, + "epoch": 0.08188757807078417, + "grad_norm": 1.276943325996399, + "learning_rate": 8.187337380745882e-06, + "loss": 0.9553, "step": 472 }, { - "epoch": 0.17272229322621874, - "grad_norm": 1.3793359994888306, - "learning_rate": 1.726277372262774e-05, - "loss": 1.2671, + "epoch": 0.08206106870229007, + "grad_norm": 0.7898209095001221, + "learning_rate": 8.204683434518647e-06, + "loss": 1.103, "step": 473 }, { - "epoch": 0.173087456636845, - "grad_norm": 0.8971158862113953, - "learning_rate": 1.7299270072992702e-05, - "loss": 1.2899, + "epoch": 0.08223455933379598, + "grad_norm": 1.1432394981384277, + "learning_rate": 8.222029488291414e-06, + "loss": 0.9438, "step": 474 }, { - "epoch": 0.17345262004747125, - "grad_norm": 1.2431188821792603, - "learning_rate": 1.7335766423357666e-05, - "loss": 1.2563, + "epoch": 0.08240804996530188, + "grad_norm": 0.9038352370262146, + "learning_rate": 8.23937554206418e-06, + "loss": 1.0669, "step": 475 }, { - "epoch": 0.1738177834580975, - "grad_norm": 1.0033910274505615, - "learning_rate": 1.737226277372263e-05, - "loss": 1.2434, + "epoch": 0.08258154059680778, + "grad_norm": 1.40608549118042, + "learning_rate": 8.256721595836947e-06, + "loss": 0.8728, "step": 476 }, { - "epoch": 0.17418294686872376, - "grad_norm": 1.1718307733535767, - "learning_rate": 1.7408759124087592e-05, - "loss": 1.2559, + "epoch": 0.08275503122831367, + "grad_norm": 1.3181334733963013, + "learning_rate": 8.274067649609715e-06, + "loss": 0.7393, "step": 477 }, { - "epoch": 0.17454811027935002, - "grad_norm": 0.8973157405853271, - "learning_rate": 1.7445255474452557e-05, - "loss": 1.2465, + "epoch": 0.08292852185981957, + "grad_norm": 1.2733434438705444, + "learning_rate": 8.291413703382482e-06, + "loss": 0.8989, "step": 478 }, { - "epoch": 0.17491327368997625, - "grad_norm": 1.1519674062728882, - "learning_rate": 1.7481751824817518e-05, - "loss": 1.2238, + "epoch": 0.08310201249132547, + "grad_norm": 0.9549227952957153, + "learning_rate": 8.308759757155249e-06, + "loss": 0.9387, "step": 479 }, { - "epoch": 0.1752784371006025, - "grad_norm": 1.1591613292694092, - "learning_rate": 1.7518248175182482e-05, - "loss": 1.2668, + "epoch": 0.08327550312283137, + "grad_norm": 0.8664029836654663, + "learning_rate": 8.326105810928014e-06, + "loss": 0.9773, "step": 480 }, { - "epoch": 0.17564360051122876, - "grad_norm": 2.300348997116089, - "learning_rate": 1.7554744525547447e-05, - "loss": 1.2413, + "epoch": 0.08344899375433727, + "grad_norm": 1.0745776891708374, + "learning_rate": 8.34345186470078e-06, + "loss": 1.0526, "step": 481 }, { - "epoch": 0.17600876392185502, - "grad_norm": 1.3322337865829468, - "learning_rate": 1.7591240875912408e-05, - "loss": 1.2954, + "epoch": 0.08362248438584316, + "grad_norm": 0.9441255927085876, + "learning_rate": 8.360797918473547e-06, + "loss": 1.0002, "step": 482 }, { - "epoch": 0.17637392733248128, - "grad_norm": 1.1343178749084473, - "learning_rate": 1.7627737226277373e-05, - "loss": 1.2465, + "epoch": 0.08379597501734906, + "grad_norm": 1.0308853387832642, + "learning_rate": 8.378143972246314e-06, + "loss": 0.7878, "step": 483 }, { - "epoch": 0.17673909074310754, - "grad_norm": 1.4003844261169434, - "learning_rate": 1.7664233576642337e-05, - "loss": 1.2695, + "epoch": 0.08396946564885496, + "grad_norm": 1.0898972749710083, + "learning_rate": 8.39549002601908e-06, + "loss": 0.7798, "step": 484 }, { - "epoch": 0.1771042541537338, - "grad_norm": 1.4345433712005615, - "learning_rate": 1.7700729927007302e-05, - "loss": 1.2412, + "epoch": 0.08414295628036086, + "grad_norm": 1.2257461547851562, + "learning_rate": 8.412836079791849e-06, + "loss": 0.926, "step": 485 }, { - "epoch": 0.17746941756436005, - "grad_norm": 1.1456555128097534, - "learning_rate": 1.7737226277372263e-05, - "loss": 1.2913, + "epoch": 0.08431644691186677, + "grad_norm": 1.070261836051941, + "learning_rate": 8.430182133564616e-06, + "loss": 0.7732, "step": 486 }, { - "epoch": 0.1778345809749863, - "grad_norm": 0.9798994660377502, - "learning_rate": 1.7773722627737228e-05, - "loss": 1.2756, + "epoch": 0.08448993754337265, + "grad_norm": 1.2098016738891602, + "learning_rate": 8.44752818733738e-06, + "loss": 0.8855, "step": 487 }, { - "epoch": 0.17819974438561256, - "grad_norm": 0.9393369555473328, - "learning_rate": 1.7810218978102192e-05, - "loss": 1.2324, + "epoch": 0.08466342817487855, + "grad_norm": 0.9859785437583923, + "learning_rate": 8.464874241110147e-06, + "loss": 1.0486, "step": 488 }, { - "epoch": 0.17856490779623882, - "grad_norm": 1.060135841369629, - "learning_rate": 1.7846715328467153e-05, - "loss": 1.225, + "epoch": 0.08483691880638446, + "grad_norm": 0.7091138958930969, + "learning_rate": 8.482220294882914e-06, + "loss": 1.1008, "step": 489 }, { - "epoch": 0.17893007120686508, - "grad_norm": 1.2235097885131836, - "learning_rate": 1.7883211678832118e-05, - "loss": 1.2422, + "epoch": 0.08501040943789036, + "grad_norm": 0.990513265132904, + "learning_rate": 8.49956634865568e-06, + "loss": 0.8391, "step": 490 }, { - "epoch": 0.17929523461749133, - "grad_norm": 0.8924283385276794, - "learning_rate": 1.7919708029197082e-05, - "loss": 1.2185, + "epoch": 0.08518390006939626, + "grad_norm": 0.9549128413200378, + "learning_rate": 8.516912402428448e-06, + "loss": 1.0303, "step": 491 }, { - "epoch": 0.1796603980281176, - "grad_norm": 1.3768352270126343, - "learning_rate": 1.7956204379562047e-05, - "loss": 1.2495, + "epoch": 0.08535739070090215, + "grad_norm": 0.8811132907867432, + "learning_rate": 8.534258456201216e-06, + "loss": 0.7654, "step": 492 }, { - "epoch": 0.18002556143874385, - "grad_norm": 0.9643235802650452, - "learning_rate": 1.7992700729927008e-05, - "loss": 1.2588, + "epoch": 0.08553088133240805, + "grad_norm": 1.0291334390640259, + "learning_rate": 8.551604509973983e-06, + "loss": 0.8542, "step": 493 }, { - "epoch": 0.1803907248493701, - "grad_norm": 1.4977421760559082, - "learning_rate": 1.8029197080291973e-05, - "loss": 1.2769, + "epoch": 0.08570437196391395, + "grad_norm": 0.9601544737815857, + "learning_rate": 8.56895056374675e-06, + "loss": 0.8687, "step": 494 }, { - "epoch": 0.18075588825999636, - "grad_norm": 0.8822349905967712, - "learning_rate": 1.8065693430656937e-05, - "loss": 1.24, + "epoch": 0.08587786259541985, + "grad_norm": 0.9118724465370178, + "learning_rate": 8.586296617519514e-06, + "loss": 1.0247, "step": 495 }, { - "epoch": 0.1811210516706226, - "grad_norm": 1.4520167112350464, - "learning_rate": 1.81021897810219e-05, - "loss": 1.2267, + "epoch": 0.08605135322692574, + "grad_norm": 0.7105740308761597, + "learning_rate": 8.603642671292281e-06, + "loss": 0.9062, "step": 496 }, { - "epoch": 0.18148621508124885, - "grad_norm": 1.117475986480713, - "learning_rate": 1.8138686131386863e-05, - "loss": 1.2291, + "epoch": 0.08622484385843164, + "grad_norm": 0.8579117059707642, + "learning_rate": 8.620988725065048e-06, + "loss": 0.9243, "step": 497 }, { - "epoch": 0.1818513784918751, - "grad_norm": 0.8799911737442017, - "learning_rate": 1.8175182481751824e-05, - "loss": 1.2689, + "epoch": 0.08639833448993754, + "grad_norm": 1.0373072624206543, + "learning_rate": 8.638334778837814e-06, + "loss": 0.8171, "step": 498 }, { - "epoch": 0.18221654190250136, - "grad_norm": 1.069397211074829, - "learning_rate": 1.821167883211679e-05, - "loss": 1.2391, + "epoch": 0.08657182512144344, + "grad_norm": 1.0610160827636719, + "learning_rate": 8.655680832610583e-06, + "loss": 0.9124, "step": 499 }, { - "epoch": 0.18258170531312762, - "grad_norm": 1.5612057447433472, - "learning_rate": 1.8248175182481753e-05, - "loss": 1.2771, + "epoch": 0.08674531575294935, + "grad_norm": 1.0703028440475464, + "learning_rate": 8.67302688638335e-06, + "loss": 0.8423, "step": 500 }, { - "epoch": 0.18294686872375387, - "grad_norm": 1.6341369152069092, - "learning_rate": 1.8284671532846715e-05, - "loss": 1.2883, + "epoch": 0.08691880638445523, + "grad_norm": 2.04844331741333, + "learning_rate": 8.690372940156116e-06, + "loss": 0.9077, "step": 501 }, { - "epoch": 0.18331203213438013, - "grad_norm": 1.7307225465774536, - "learning_rate": 1.8321167883211683e-05, - "loss": 1.2114, + "epoch": 0.08709229701596113, + "grad_norm": 0.8391432762145996, + "learning_rate": 8.707718993928881e-06, + "loss": 0.8926, "step": 502 }, { - "epoch": 0.1836771955450064, - "grad_norm": 0.9655431509017944, - "learning_rate": 1.8357664233576644e-05, - "loss": 1.2588, + "epoch": 0.08726578764746704, + "grad_norm": 1.0390406847000122, + "learning_rate": 8.725065047701648e-06, + "loss": 0.9502, "step": 503 }, { - "epoch": 0.18404235895563265, - "grad_norm": 0.8625669479370117, - "learning_rate": 1.8394160583941608e-05, - "loss": 1.2338, + "epoch": 0.08743927827897294, + "grad_norm": 1.6325030326843262, + "learning_rate": 8.742411101474415e-06, + "loss": 0.9189, "step": 504 }, { - "epoch": 0.1844075223662589, - "grad_norm": 1.0227766036987305, - "learning_rate": 1.843065693430657e-05, - "loss": 1.2366, + "epoch": 0.08761276891047884, + "grad_norm": 0.8052063584327698, + "learning_rate": 8.759757155247181e-06, + "loss": 1.0071, "step": 505 }, { - "epoch": 0.18477268577688516, - "grad_norm": 1.053835153579712, - "learning_rate": 1.8467153284671534e-05, - "loss": 1.2146, + "epoch": 0.08778625954198473, + "grad_norm": 0.9894332885742188, + "learning_rate": 8.777103209019948e-06, + "loss": 0.8833, "step": 506 }, { - "epoch": 0.18513784918751142, - "grad_norm": 1.0351970195770264, - "learning_rate": 1.85036496350365e-05, - "loss": 1.2317, + "epoch": 0.08795975017349063, + "grad_norm": 0.9255545735359192, + "learning_rate": 8.794449262792716e-06, + "loss": 0.8716, "step": 507 }, { - "epoch": 0.18550301259813767, - "grad_norm": 1.9667693376541138, - "learning_rate": 1.854014598540146e-05, - "loss": 1.2487, + "epoch": 0.08813324080499653, + "grad_norm": 1.977734923362732, + "learning_rate": 8.811795316565483e-06, + "loss": 0.7971, "step": 508 }, { - "epoch": 0.18586817600876393, - "grad_norm": 1.099134087562561, - "learning_rate": 1.8576642335766424e-05, - "loss": 1.2047, + "epoch": 0.08830673143650243, + "grad_norm": 0.849090576171875, + "learning_rate": 8.82914137033825e-06, + "loss": 0.8787, "step": 509 }, { - "epoch": 0.1862333394193902, - "grad_norm": 1.3945677280426025, - "learning_rate": 1.861313868613139e-05, - "loss": 1.2301, + "epoch": 0.08848022206800833, + "grad_norm": 1.3747910261154175, + "learning_rate": 8.846487424111015e-06, + "loss": 0.9514, "step": 510 }, { - "epoch": 0.18659850283001644, - "grad_norm": 0.9897311925888062, - "learning_rate": 1.8649635036496353e-05, - "loss": 1.2212, + "epoch": 0.08865371269951422, + "grad_norm": 0.9728672504425049, + "learning_rate": 8.863833477883781e-06, + "loss": 0.9653, "step": 511 }, { - "epoch": 0.1869636662406427, - "grad_norm": 1.161859154701233, - "learning_rate": 1.8686131386861315e-05, - "loss": 1.2452, + "epoch": 0.08882720333102012, + "grad_norm": 2.1216602325439453, + "learning_rate": 8.881179531656548e-06, + "loss": 0.9255, "step": 512 }, { - "epoch": 0.18732882965126893, - "grad_norm": 1.0715153217315674, - "learning_rate": 1.872262773722628e-05, - "loss": 1.2487, + "epoch": 0.08900069396252602, + "grad_norm": 1.3875950574874878, + "learning_rate": 8.898525585429315e-06, + "loss": 0.8523, "step": 513 }, { - "epoch": 0.1876939930618952, - "grad_norm": 0.9755675792694092, - "learning_rate": 1.8759124087591244e-05, - "loss": 1.2283, + "epoch": 0.08917418459403192, + "grad_norm": 0.7814494371414185, + "learning_rate": 8.915871639202083e-06, + "loss": 0.981, "step": 514 }, { - "epoch": 0.18805915647252144, - "grad_norm": 1.064605951309204, - "learning_rate": 1.8795620437956205e-05, - "loss": 1.2313, + "epoch": 0.08934767522553783, + "grad_norm": 1.0771421194076538, + "learning_rate": 8.93321769297485e-06, + "loss": 0.9631, "step": 515 }, { - "epoch": 0.1884243198831477, - "grad_norm": 1.280477523803711, - "learning_rate": 1.883211678832117e-05, - "loss": 1.2544, + "epoch": 0.08952116585704371, + "grad_norm": 0.8821408748626709, + "learning_rate": 8.950563746747617e-06, + "loss": 0.9954, "step": 516 }, { - "epoch": 0.18878948329377396, - "grad_norm": 1.2260833978652954, - "learning_rate": 1.8868613138686134e-05, - "loss": 1.2621, + "epoch": 0.08969465648854962, + "grad_norm": 0.8891401886940002, + "learning_rate": 8.967909800520382e-06, + "loss": 0.9812, "step": 517 }, { - "epoch": 0.18915464670440021, - "grad_norm": 0.9312050342559814, - "learning_rate": 1.8905109489051095e-05, - "loss": 1.2365, + "epoch": 0.08986814712005552, + "grad_norm": 1.0452988147735596, + "learning_rate": 8.985255854293148e-06, + "loss": 0.9016, "step": 518 }, { - "epoch": 0.18951981011502647, - "grad_norm": 0.8291168212890625, - "learning_rate": 1.894160583941606e-05, - "loss": 1.2095, + "epoch": 0.09004163775156142, + "grad_norm": 0.8508404493331909, + "learning_rate": 9.002601908065915e-06, + "loss": 0.9763, "step": 519 }, { - "epoch": 0.18988497352565273, - "grad_norm": 0.9339287877082825, - "learning_rate": 1.897810218978102e-05, - "loss": 1.194, + "epoch": 0.09021512838306732, + "grad_norm": 0.9749872088432312, + "learning_rate": 9.019947961838682e-06, + "loss": 0.9412, "step": 520 }, { - "epoch": 0.19025013693627899, - "grad_norm": 1.2912375926971436, - "learning_rate": 1.901459854014599e-05, - "loss": 1.2504, + "epoch": 0.09038861901457321, + "grad_norm": 1.017313003540039, + "learning_rate": 9.037294015611448e-06, + "loss": 0.9807, "step": 521 }, { - "epoch": 0.19061530034690524, - "grad_norm": 1.1497576236724854, - "learning_rate": 1.905109489051095e-05, - "loss": 1.2739, + "epoch": 0.09056210964607911, + "grad_norm": 0.9386571049690247, + "learning_rate": 9.054640069384217e-06, + "loss": 0.7878, "step": 522 }, { - "epoch": 0.1909804637575315, - "grad_norm": 1.0795012712478638, - "learning_rate": 1.9087591240875915e-05, - "loss": 1.2529, + "epoch": 0.09073560027758501, + "grad_norm": 1.0415630340576172, + "learning_rate": 9.071986123156983e-06, + "loss": 0.741, "step": 523 }, { - "epoch": 0.19134562716815776, - "grad_norm": 1.5087990760803223, - "learning_rate": 1.912408759124088e-05, - "loss": 1.2737, + "epoch": 0.09090909090909091, + "grad_norm": 0.8848255276679993, + "learning_rate": 9.08933217692975e-06, + "loss": 0.8997, "step": 524 }, { - "epoch": 0.191710790578784, - "grad_norm": 1.309861183166504, - "learning_rate": 1.916058394160584e-05, - "loss": 1.2498, + "epoch": 0.09108258154059681, + "grad_norm": 0.9307740926742554, + "learning_rate": 9.106678230702515e-06, + "loss": 0.8875, "step": 525 }, { - "epoch": 0.19207595398941027, - "grad_norm": 1.6578222513198853, - "learning_rate": 1.9197080291970805e-05, - "loss": 1.2388, + "epoch": 0.0912560721721027, + "grad_norm": 1.1625994443893433, + "learning_rate": 9.124024284475282e-06, + "loss": 1.0605, "step": 526 }, { - "epoch": 0.19244111740003653, - "grad_norm": 0.9590536952018738, - "learning_rate": 1.9233576642335766e-05, - "loss": 1.2299, + "epoch": 0.0914295628036086, + "grad_norm": 1.363017201423645, + "learning_rate": 9.141370338248049e-06, + "loss": 0.866, "step": 527 }, { - "epoch": 0.19280628081066278, - "grad_norm": 1.2576996088027954, - "learning_rate": 1.9270072992700734e-05, - "loss": 1.204, + "epoch": 0.0916030534351145, + "grad_norm": 1.049926519393921, + "learning_rate": 9.158716392020815e-06, + "loss": 0.9199, "step": 528 }, { - "epoch": 0.19317144422128904, - "grad_norm": 1.1013354063034058, - "learning_rate": 1.9306569343065695e-05, - "loss": 1.1874, + "epoch": 0.0917765440666204, + "grad_norm": 1.2771332263946533, + "learning_rate": 9.176062445793584e-06, + "loss": 0.8145, "step": 529 }, { - "epoch": 0.19353660763191527, - "grad_norm": 1.2674694061279297, - "learning_rate": 1.934306569343066e-05, - "loss": 1.1598, + "epoch": 0.0919500346981263, + "grad_norm": 1.0625146627426147, + "learning_rate": 9.19340849956635e-06, + "loss": 0.8496, "step": 530 }, { - "epoch": 0.19390177104254153, - "grad_norm": 1.1024235486984253, - "learning_rate": 1.937956204379562e-05, - "loss": 1.1887, + "epoch": 0.0921235253296322, + "grad_norm": 0.9698385000228882, + "learning_rate": 9.210754553339117e-06, + "loss": 0.772, "step": 531 }, { - "epoch": 0.19426693445316778, - "grad_norm": 1.0512255430221558, - "learning_rate": 1.9416058394160586e-05, - "loss": 1.1929, + "epoch": 0.0922970159611381, + "grad_norm": 0.8189213871955872, + "learning_rate": 9.228100607111882e-06, + "loss": 1.1282, "step": 532 }, { - "epoch": 0.19463209786379404, - "grad_norm": 1.0719166994094849, - "learning_rate": 1.945255474452555e-05, - "loss": 1.2544, + "epoch": 0.092470506592644, + "grad_norm": 1.5812128782272339, + "learning_rate": 9.245446660884649e-06, + "loss": 0.9304, "step": 533 }, { - "epoch": 0.1949972612744203, - "grad_norm": 1.0424017906188965, - "learning_rate": 1.948905109489051e-05, - "loss": 1.1949, + "epoch": 0.0926439972241499, + "grad_norm": 0.9835970401763916, + "learning_rate": 9.262792714657415e-06, + "loss": 1.001, "step": 534 }, { - "epoch": 0.19536242468504655, - "grad_norm": 1.3381285667419434, - "learning_rate": 1.9525547445255476e-05, - "loss": 1.2472, + "epoch": 0.0928174878556558, + "grad_norm": 0.9074248671531677, + "learning_rate": 9.280138768430182e-06, + "loss": 0.7507, "step": 535 }, { - "epoch": 0.1957275880956728, - "grad_norm": 1.2971011400222778, - "learning_rate": 1.956204379562044e-05, - "loss": 1.2329, + "epoch": 0.09299097848716169, + "grad_norm": 0.8931490182876587, + "learning_rate": 9.29748482220295e-06, + "loss": 0.9592, "step": 536 }, { - "epoch": 0.19609275150629907, - "grad_norm": 1.296181559562683, - "learning_rate": 1.95985401459854e-05, - "loss": 1.2792, + "epoch": 0.09316446911866759, + "grad_norm": 0.8351404666900635, + "learning_rate": 9.314830875975717e-06, + "loss": 1.0884, "step": 537 }, { - "epoch": 0.19645791491692532, - "grad_norm": 0.9261305928230286, - "learning_rate": 1.9635036496350366e-05, - "loss": 1.2241, + "epoch": 0.09333795975017349, + "grad_norm": 0.8529146313667297, + "learning_rate": 9.332176929748484e-06, + "loss": 0.9102, "step": 538 }, { - "epoch": 0.19682307832755158, - "grad_norm": 1.2688018083572388, - "learning_rate": 1.967153284671533e-05, - "loss": 1.2402, + "epoch": 0.09351145038167939, + "grad_norm": 0.8797134757041931, + "learning_rate": 9.34952298352125e-06, + "loss": 0.928, "step": 539 }, { - "epoch": 0.19718824173817784, - "grad_norm": 1.020119071006775, - "learning_rate": 1.9708029197080295e-05, - "loss": 1.2235, + "epoch": 0.0936849410131853, + "grad_norm": 0.7116356492042542, + "learning_rate": 9.366869037294016e-06, + "loss": 1.0845, "step": 540 }, { - "epoch": 0.1975534051488041, - "grad_norm": 0.9605289101600647, - "learning_rate": 1.9744525547445256e-05, - "loss": 1.1929, + "epoch": 0.09385843164469118, + "grad_norm": 1.0931313037872314, + "learning_rate": 9.384215091066782e-06, + "loss": 0.8694, "step": 541 }, { - "epoch": 0.19791856855943035, - "grad_norm": 1.2627853155136108, - "learning_rate": 1.978102189781022e-05, - "loss": 1.2058, + "epoch": 0.09403192227619708, + "grad_norm": 1.023492455482483, + "learning_rate": 9.401561144839549e-06, + "loss": 0.9175, "step": 542 }, { - "epoch": 0.1982837319700566, - "grad_norm": 1.678021788597107, - "learning_rate": 1.9817518248175186e-05, - "loss": 1.2241, + "epoch": 0.09420541290770298, + "grad_norm": 0.7224456071853638, + "learning_rate": 9.418907198612316e-06, + "loss": 1.1025, "step": 543 }, { - "epoch": 0.19864889538068287, - "grad_norm": 1.1672545671463013, - "learning_rate": 1.9854014598540147e-05, - "loss": 1.2375, + "epoch": 0.09437890353920889, + "grad_norm": 1.3516511917114258, + "learning_rate": 9.436253252385084e-06, + "loss": 1.0051, "step": 544 }, { - "epoch": 0.19901405879130912, - "grad_norm": 1.7733126878738403, - "learning_rate": 1.989051094890511e-05, - "loss": 1.191, + "epoch": 0.09455239417071479, + "grad_norm": 0.7616457939147949, + "learning_rate": 9.45359930615785e-06, + "loss": 1.0398, "step": 545 }, { - "epoch": 0.19937922220193538, - "grad_norm": 1.0653575658798218, - "learning_rate": 1.9927007299270073e-05, - "loss": 1.2656, + "epoch": 0.09472588480222068, + "grad_norm": 1.0315539836883545, + "learning_rate": 9.470945359930617e-06, + "loss": 0.8335, "step": 546 }, { - "epoch": 0.1997443856125616, - "grad_norm": 0.9928293824195862, - "learning_rate": 1.9963503649635037e-05, - "loss": 1.2617, + "epoch": 0.09489937543372658, + "grad_norm": 0.7184267044067383, + "learning_rate": 9.488291413703382e-06, + "loss": 1.0198, "step": 547 }, { - "epoch": 0.20010954902318787, - "grad_norm": 1.883912205696106, - "learning_rate": 2e-05, - "loss": 1.21, + "epoch": 0.09507286606523248, + "grad_norm": 0.9292049407958984, + "learning_rate": 9.505637467476149e-06, + "loss": 0.9614, "step": 548 }, { - "epoch": 0.20047471243381412, - "grad_norm": 1.221509575843811, - "learning_rate": 2.0036496350364966e-05, - "loss": 1.2896, + "epoch": 0.09524635669673838, + "grad_norm": 1.2576292753219604, + "learning_rate": 9.522983521248916e-06, + "loss": 0.9778, "step": 549 }, { - "epoch": 0.20083987584444038, - "grad_norm": 0.8971399068832397, - "learning_rate": 2.0072992700729927e-05, - "loss": 1.1907, + "epoch": 0.09541984732824428, + "grad_norm": 0.8963791131973267, + "learning_rate": 9.540329575021683e-06, + "loss": 0.9646, "step": 550 }, { - "epoch": 0.20120503925506664, - "grad_norm": 0.947982668876648, - "learning_rate": 2.0109489051094892e-05, - "loss": 1.2151, + "epoch": 0.09559333795975017, + "grad_norm": 0.8005450963973999, + "learning_rate": 9.557675628794451e-06, + "loss": 0.8906, "step": 551 }, { - "epoch": 0.2015702026656929, - "grad_norm": 1.2019115686416626, - "learning_rate": 2.0145985401459857e-05, - "loss": 1.229, + "epoch": 0.09576682859125607, + "grad_norm": 0.8869155645370483, + "learning_rate": 9.575021682567218e-06, + "loss": 0.946, "step": 552 }, { - "epoch": 0.20193536607631915, - "grad_norm": 1.039163589477539, - "learning_rate": 2.0182481751824818e-05, - "loss": 1.1912, + "epoch": 0.09594031922276197, + "grad_norm": 0.8830819129943848, + "learning_rate": 9.592367736339984e-06, + "loss": 0.9028, "step": 553 }, { - "epoch": 0.2023005294869454, - "grad_norm": 0.9176907539367676, - "learning_rate": 2.0218978102189782e-05, - "loss": 1.1887, + "epoch": 0.09611380985426787, + "grad_norm": 1.2271912097930908, + "learning_rate": 9.609713790112751e-06, + "loss": 0.8948, "step": 554 }, { - "epoch": 0.20266569289757166, - "grad_norm": 1.3604471683502197, - "learning_rate": 2.0255474452554743e-05, - "loss": 1.2385, + "epoch": 0.09628730048577377, + "grad_norm": 0.8978062868118286, + "learning_rate": 9.627059843885516e-06, + "loss": 0.9973, "step": 555 }, { - "epoch": 0.20303085630819792, - "grad_norm": 1.1554077863693237, - "learning_rate": 2.0291970802919708e-05, - "loss": 1.2463, + "epoch": 0.09646079111727966, + "grad_norm": 2.438136100769043, + "learning_rate": 9.644405897658283e-06, + "loss": 0.9219, "step": 556 }, { - "epoch": 0.20339601971882418, - "grad_norm": 1.2582931518554688, - "learning_rate": 2.0328467153284676e-05, - "loss": 1.2072, + "epoch": 0.09663428174878556, + "grad_norm": 0.9106120467185974, + "learning_rate": 9.66175195143105e-06, + "loss": 0.9553, "step": 557 }, { - "epoch": 0.20376118312945043, - "grad_norm": 1.0883839130401611, - "learning_rate": 2.0364963503649634e-05, - "loss": 1.2023, + "epoch": 0.09680777238029147, + "grad_norm": 0.8702414631843567, + "learning_rate": 9.679098005203816e-06, + "loss": 0.9094, "step": 558 }, { - "epoch": 0.2041263465400767, - "grad_norm": 0.8406802415847778, - "learning_rate": 2.04014598540146e-05, - "loss": 1.2225, + "epoch": 0.09698126301179737, + "grad_norm": 1.084152102470398, + "learning_rate": 9.696444058976584e-06, + "loss": 1.0303, "step": 559 }, { - "epoch": 0.20449150995070295, - "grad_norm": 1.074591875076294, - "learning_rate": 2.0437956204379566e-05, - "loss": 1.1746, + "epoch": 0.09715475364330327, + "grad_norm": 1.0476773977279663, + "learning_rate": 9.713790112749351e-06, + "loss": 0.9365, "step": 560 }, { - "epoch": 0.2048566733613292, - "grad_norm": 1.4184726476669312, - "learning_rate": 2.0474452554744527e-05, - "loss": 1.2222, + "epoch": 0.09732824427480916, + "grad_norm": 1.0904978513717651, + "learning_rate": 9.731136166522118e-06, + "loss": 0.8645, "step": 561 }, { - "epoch": 0.20522183677195546, - "grad_norm": 1.0703859329223633, - "learning_rate": 2.0510948905109492e-05, - "loss": 1.192, + "epoch": 0.09750173490631506, + "grad_norm": 1.1301112174987793, + "learning_rate": 9.748482220294883e-06, + "loss": 0.8101, "step": 562 }, { - "epoch": 0.2055870001825817, - "grad_norm": 1.0501224994659424, - "learning_rate": 2.0547445255474457e-05, - "loss": 1.1559, + "epoch": 0.09767522553782096, + "grad_norm": 0.807544469833374, + "learning_rate": 9.76582827406765e-06, + "loss": 0.9026, "step": 563 }, { - "epoch": 0.20595216359320795, - "grad_norm": 1.3215252161026, - "learning_rate": 2.0583941605839418e-05, - "loss": 1.2744, + "epoch": 0.09784871616932686, + "grad_norm": 0.8898547291755676, + "learning_rate": 9.783174327840416e-06, + "loss": 1.0229, "step": 564 }, { - "epoch": 0.2063173270038342, - "grad_norm": 1.1511387825012207, - "learning_rate": 2.0620437956204382e-05, - "loss": 1.2003, + "epoch": 0.09802220680083276, + "grad_norm": 1.2677826881408691, + "learning_rate": 9.800520381613183e-06, + "loss": 0.8938, "step": 565 }, { - "epoch": 0.20668249041446046, - "grad_norm": 1.8524385690689087, - "learning_rate": 2.0656934306569343e-05, - "loss": 1.2001, + "epoch": 0.09819569743233865, + "grad_norm": 0.9513853788375854, + "learning_rate": 9.817866435385951e-06, + "loss": 0.9072, "step": 566 }, { - "epoch": 0.20704765382508672, - "grad_norm": 0.7722145318984985, - "learning_rate": 2.0693430656934308e-05, - "loss": 1.1873, + "epoch": 0.09836918806384455, + "grad_norm": 0.852165937423706, + "learning_rate": 9.835212489158718e-06, + "loss": 1.0166, "step": 567 }, { - "epoch": 0.20741281723571298, - "grad_norm": 2.4096412658691406, - "learning_rate": 2.0729927007299273e-05, - "loss": 1.1965, + "epoch": 0.09854267869535045, + "grad_norm": 1.0011224746704102, + "learning_rate": 9.852558542931485e-06, + "loss": 1.0703, "step": 568 }, { - "epoch": 0.20777798064633923, - "grad_norm": 1.3234232664108276, - "learning_rate": 2.0766423357664234e-05, - "loss": 1.2531, + "epoch": 0.09871616932685635, + "grad_norm": 0.7936244010925293, + "learning_rate": 9.869904596704251e-06, + "loss": 0.9631, "step": 569 }, { - "epoch": 0.2081431440569655, - "grad_norm": 1.0573618412017822, - "learning_rate": 2.08029197080292e-05, - "loss": 1.2137, + "epoch": 0.09888965995836224, + "grad_norm": 0.8106185793876648, + "learning_rate": 9.887250650477016e-06, + "loss": 1.0808, "step": 570 }, { - "epoch": 0.20850830746759175, - "grad_norm": 1.0957746505737305, - "learning_rate": 2.0839416058394163e-05, - "loss": 1.2119, + "epoch": 0.09906315058986814, + "grad_norm": 0.7678360342979431, + "learning_rate": 9.904596704249783e-06, + "loss": 0.915, "step": 571 }, { - "epoch": 0.208873470878218, - "grad_norm": 1.1826972961425781, - "learning_rate": 2.0875912408759124e-05, - "loss": 1.231, + "epoch": 0.09923664122137404, + "grad_norm": 0.8073553442955017, + "learning_rate": 9.92194275802255e-06, + "loss": 0.906, "step": 572 }, { - "epoch": 0.20923863428884426, - "grad_norm": 1.0164505243301392, - "learning_rate": 2.091240875912409e-05, - "loss": 1.1732, + "epoch": 0.09941013185287995, + "grad_norm": 0.6842048764228821, + "learning_rate": 9.939288811795318e-06, + "loss": 1.0308, "step": 573 }, { - "epoch": 0.20960379769947052, - "grad_norm": 1.3358932733535767, - "learning_rate": 2.0948905109489057e-05, - "loss": 1.2126, + "epoch": 0.09958362248438585, + "grad_norm": 0.9859399795532227, + "learning_rate": 9.956634865568085e-06, + "loss": 0.8518, "step": 574 }, { - "epoch": 0.20996896111009677, - "grad_norm": 1.1095725297927856, - "learning_rate": 2.0985401459854014e-05, - "loss": 1.222, + "epoch": 0.09975711311589174, + "grad_norm": 0.9538061022758484, + "learning_rate": 9.973980919340852e-06, + "loss": 0.9209, "step": 575 }, { - "epoch": 0.21033412452072303, - "grad_norm": 1.0522537231445312, - "learning_rate": 2.1021897810218982e-05, - "loss": 1.2078, + "epoch": 0.09993060374739764, + "grad_norm": 1.1470329761505127, + "learning_rate": 9.991326973113618e-06, + "loss": 0.8442, "step": 576 }, { - "epoch": 0.2106992879313493, - "grad_norm": 1.5535293817520142, - "learning_rate": 2.105839416058394e-05, - "loss": 1.2047, + "epoch": 0.10010409437890354, + "grad_norm": 1.2836531400680542, + "learning_rate": 1.0008673026886383e-05, + "loss": 0.7847, "step": 577 }, { - "epoch": 0.21106445134197555, - "grad_norm": 0.913555920124054, - "learning_rate": 2.1094890510948908e-05, - "loss": 1.2319, + "epoch": 0.10027758501040944, + "grad_norm": 1.1748347282409668, + "learning_rate": 1.002601908065915e-05, + "loss": 0.8418, "step": 578 }, { - "epoch": 0.2114296147526018, - "grad_norm": 0.9559549689292908, - "learning_rate": 2.1131386861313873e-05, - "loss": 1.1887, + "epoch": 0.10045107564191534, + "grad_norm": 1.278590202331543, + "learning_rate": 1.0043365134431917e-05, + "loss": 0.9058, "step": 579 }, { - "epoch": 0.21179477816322803, - "grad_norm": 1.212504267692566, - "learning_rate": 2.1167883211678834e-05, - "loss": 1.1759, + "epoch": 0.10062456627342123, + "grad_norm": 1.050093412399292, + "learning_rate": 1.0060711188204683e-05, + "loss": 0.946, "step": 580 }, { - "epoch": 0.2121599415738543, - "grad_norm": 0.8631566166877747, - "learning_rate": 2.12043795620438e-05, - "loss": 1.2218, + "epoch": 0.10079805690492713, + "grad_norm": 1.171562910079956, + "learning_rate": 1.007805724197745e-05, + "loss": 1.0266, "step": 581 }, { - "epoch": 0.21252510498448055, - "grad_norm": 1.4603711366653442, - "learning_rate": 2.1240875912408763e-05, - "loss": 1.2095, + "epoch": 0.10097154753643303, + "grad_norm": 1.114644169807434, + "learning_rate": 1.0095403295750217e-05, + "loss": 0.9983, "step": 582 }, { - "epoch": 0.2128902683951068, - "grad_norm": 1.0459176301956177, - "learning_rate": 2.1277372262773724e-05, - "loss": 1.2205, + "epoch": 0.10114503816793893, + "grad_norm": 0.8765393495559692, + "learning_rate": 1.0112749349522983e-05, + "loss": 1.0706, "step": 583 }, { - "epoch": 0.21325543180573306, - "grad_norm": 0.9267477989196777, - "learning_rate": 2.131386861313869e-05, - "loss": 1.1547, + "epoch": 0.10131852879944483, + "grad_norm": 0.9083306193351746, + "learning_rate": 1.0130095403295752e-05, + "loss": 0.905, "step": 584 }, { - "epoch": 0.21362059521635932, - "grad_norm": 1.1062085628509521, - "learning_rate": 2.135036496350365e-05, - "loss": 1.1981, + "epoch": 0.10149201943095072, + "grad_norm": 0.9430668950080872, + "learning_rate": 1.0147441457068519e-05, + "loss": 0.8811, "step": 585 }, { - "epoch": 0.21398575862698557, - "grad_norm": 1.3526908159255981, - "learning_rate": 2.1386861313868614e-05, - "loss": 1.2573, + "epoch": 0.10166551006245662, + "grad_norm": 0.8351008892059326, + "learning_rate": 1.0164787510841285e-05, + "loss": 0.9412, "step": 586 }, { - "epoch": 0.21435092203761183, - "grad_norm": 1.480765700340271, - "learning_rate": 2.142335766423358e-05, - "loss": 1.236, + "epoch": 0.10183900069396253, + "grad_norm": 1.2506262063980103, + "learning_rate": 1.0182133564614052e-05, + "loss": 0.7832, "step": 587 }, { - "epoch": 0.2147160854482381, - "grad_norm": 0.8805032968521118, - "learning_rate": 2.145985401459854e-05, - "loss": 1.1661, + "epoch": 0.10201249132546843, + "grad_norm": 1.4867618083953857, + "learning_rate": 1.0199479618386819e-05, + "loss": 1.1064, "step": 588 }, { - "epoch": 0.21508124885886434, - "grad_norm": 1.1115320920944214, - "learning_rate": 2.1496350364963505e-05, - "loss": 1.1835, + "epoch": 0.10218598195697433, + "grad_norm": 1.5262563228607178, + "learning_rate": 1.0216825672159585e-05, + "loss": 0.7983, "step": 589 }, { - "epoch": 0.2154464122694906, - "grad_norm": 1.7420566082000732, - "learning_rate": 2.153284671532847e-05, - "loss": 1.2251, + "epoch": 0.10235947258848022, + "grad_norm": 0.6911176443099976, + "learning_rate": 1.0234171725932352e-05, + "loss": 1.0034, "step": 590 }, { - "epoch": 0.21581157568011686, - "grad_norm": 0.8414182662963867, - "learning_rate": 2.156934306569343e-05, - "loss": 1.2013, + "epoch": 0.10253296321998612, + "grad_norm": 0.8043000102043152, + "learning_rate": 1.0251517779705119e-05, + "loss": 0.9636, "step": 591 }, { - "epoch": 0.21617673909074311, - "grad_norm": 1.5628974437713623, - "learning_rate": 2.1605839416058395e-05, - "loss": 1.2305, + "epoch": 0.10270645385149202, + "grad_norm": 0.7817282676696777, + "learning_rate": 1.0268863833477884e-05, + "loss": 1.1316, "step": 592 }, { - "epoch": 0.21654190250136937, - "grad_norm": 1.1491961479187012, - "learning_rate": 2.1642335766423363e-05, - "loss": 1.2278, + "epoch": 0.10287994448299792, + "grad_norm": 0.9914267659187317, + "learning_rate": 1.028620988725065e-05, + "loss": 0.8792, "step": 593 }, { - "epoch": 0.21690706591199563, - "grad_norm": 0.9887685775756836, - "learning_rate": 2.167883211678832e-05, - "loss": 1.1901, + "epoch": 0.10305343511450382, + "grad_norm": 0.699419379234314, + "learning_rate": 1.0303555941023417e-05, + "loss": 0.9111, "step": 594 }, { - "epoch": 0.21727222932262188, - "grad_norm": 1.5623655319213867, - "learning_rate": 2.171532846715329e-05, - "loss": 1.1754, + "epoch": 0.10322692574600971, + "grad_norm": 0.8660849332809448, + "learning_rate": 1.0320901994796184e-05, + "loss": 0.8936, "step": 595 }, { - "epoch": 0.21763739273324814, - "grad_norm": 1.0047733783721924, - "learning_rate": 2.1751824817518246e-05, - "loss": 1.1968, + "epoch": 0.10340041637751561, + "grad_norm": 0.7065616250038147, + "learning_rate": 1.033824804856895e-05, + "loss": 0.9075, "step": 596 }, { - "epoch": 0.21800255614387437, - "grad_norm": 0.9528868794441223, - "learning_rate": 2.1788321167883214e-05, - "loss": 1.1931, + "epoch": 0.10357390700902151, + "grad_norm": 0.8598759770393372, + "learning_rate": 1.0355594102341717e-05, + "loss": 1.1033, "step": 597 }, { - "epoch": 0.21836771955450063, - "grad_norm": 0.9562740325927734, - "learning_rate": 2.182481751824818e-05, - "loss": 1.167, + "epoch": 0.10374739764052741, + "grad_norm": 0.8698667883872986, + "learning_rate": 1.0372940156114484e-05, + "loss": 1.0286, "step": 598 }, { - "epoch": 0.21873288296512688, - "grad_norm": 1.244248390197754, - "learning_rate": 2.186131386861314e-05, - "loss": 1.2058, + "epoch": 0.10392088827203332, + "grad_norm": 1.9381582736968994, + "learning_rate": 1.0390286209887252e-05, + "loss": 0.8501, "step": 599 }, { - "epoch": 0.21909804637575314, - "grad_norm": 1.130480170249939, - "learning_rate": 2.1897810218978105e-05, - "loss": 1.2249, + "epoch": 0.1040943789035392, + "grad_norm": 1.129848599433899, + "learning_rate": 1.0407632263660019e-05, + "loss": 0.9768, "step": 600 }, { - "epoch": 0.2194632097863794, - "grad_norm": 1.2542846202850342, - "learning_rate": 2.193430656934307e-05, - "loss": 1.2075, + "epoch": 0.1042678695350451, + "grad_norm": 0.8434154987335205, + "learning_rate": 1.0424978317432786e-05, + "loss": 0.9758, "step": 601 }, { - "epoch": 0.21982837319700566, - "grad_norm": 1.0532104969024658, - "learning_rate": 2.197080291970803e-05, - "loss": 1.1937, + "epoch": 0.104441360166551, + "grad_norm": 0.8697825074195862, + "learning_rate": 1.0442324371205552e-05, + "loss": 0.8289, "step": 602 }, { - "epoch": 0.2201935366076319, - "grad_norm": 1.305064082145691, - "learning_rate": 2.2007299270072995e-05, - "loss": 1.2384, + "epoch": 0.10461485079805691, + "grad_norm": 0.794294536113739, + "learning_rate": 1.0459670424978319e-05, + "loss": 1.1819, "step": 603 }, { - "epoch": 0.22055870001825817, - "grad_norm": 1.4485101699829102, - "learning_rate": 2.204379562043796e-05, - "loss": 1.239, + "epoch": 0.10478834142956281, + "grad_norm": 1.683416485786438, + "learning_rate": 1.0477016478751086e-05, + "loss": 0.8982, "step": 604 }, { - "epoch": 0.22092386342888443, - "grad_norm": 1.6443532705307007, - "learning_rate": 2.208029197080292e-05, - "loss": 1.1666, + "epoch": 0.1049618320610687, + "grad_norm": 0.9708885550498962, + "learning_rate": 1.0494362532523852e-05, + "loss": 0.8325, "step": 605 }, { - "epoch": 0.22128902683951068, - "grad_norm": 1.3319772481918335, - "learning_rate": 2.2116788321167885e-05, - "loss": 1.2544, + "epoch": 0.1051353226925746, + "grad_norm": 1.986560583114624, + "learning_rate": 1.051170858629662e-05, + "loss": 0.8967, "step": 606 }, { - "epoch": 0.22165419025013694, - "grad_norm": 1.2234549522399902, - "learning_rate": 2.2153284671532847e-05, - "loss": 1.1875, + "epoch": 0.1053088133240805, + "grad_norm": 1.2848860025405884, + "learning_rate": 1.0529054640069384e-05, + "loss": 0.7513, "step": 607 }, { - "epoch": 0.2220193536607632, - "grad_norm": 1.2111555337905884, - "learning_rate": 2.218978102189781e-05, - "loss": 1.1992, + "epoch": 0.1054823039555864, + "grad_norm": 0.7748952507972717, + "learning_rate": 1.0546400693842151e-05, + "loss": 0.9937, "step": 608 }, { - "epoch": 0.22238451707138945, - "grad_norm": 1.338094711303711, - "learning_rate": 2.2226277372262776e-05, - "loss": 1.218, + "epoch": 0.1056557945870923, + "grad_norm": 0.8420317769050598, + "learning_rate": 1.0563746747614918e-05, + "loss": 1.0325, "step": 609 }, { - "epoch": 0.2227496804820157, - "grad_norm": 0.8569027781486511, - "learning_rate": 2.2262773722627737e-05, - "loss": 1.1781, + "epoch": 0.10582928521859819, + "grad_norm": 0.8988560438156128, + "learning_rate": 1.0581092801387684e-05, + "loss": 0.7683, "step": 610 }, { - "epoch": 0.22311484389264197, - "grad_norm": 2.0115368366241455, - "learning_rate": 2.22992700729927e-05, - "loss": 1.1859, + "epoch": 0.10600277585010409, + "grad_norm": 1.0991547107696533, + "learning_rate": 1.0598438855160451e-05, + "loss": 0.8376, "step": 611 }, { - "epoch": 0.22348000730326822, - "grad_norm": 1.3075586557388306, - "learning_rate": 2.2335766423357666e-05, - "loss": 1.1823, + "epoch": 0.10617626648161, + "grad_norm": 0.8434244394302368, + "learning_rate": 1.0615784908933218e-05, + "loss": 0.9397, "step": 612 }, { - "epoch": 0.22384517071389448, - "grad_norm": 1.336425542831421, - "learning_rate": 2.2372262773722627e-05, - "loss": 1.2048, + "epoch": 0.1063497571131159, + "grad_norm": 0.6983385682106018, + "learning_rate": 1.0633130962705984e-05, + "loss": 1.0603, "step": 613 }, { - "epoch": 0.2242103341245207, - "grad_norm": 0.9553148150444031, - "learning_rate": 2.2408759124087595e-05, - "loss": 1.1644, + "epoch": 0.1065232477446218, + "grad_norm": 0.7839468121528625, + "learning_rate": 1.0650477016478753e-05, + "loss": 1.0315, "step": 614 }, { - "epoch": 0.22457549753514697, - "grad_norm": 1.5929687023162842, - "learning_rate": 2.244525547445256e-05, - "loss": 1.2126, + "epoch": 0.10669673837612768, + "grad_norm": 0.8148625493049622, + "learning_rate": 1.066782307025152e-05, + "loss": 0.8716, "step": 615 }, { - "epoch": 0.22494066094577322, - "grad_norm": 1.2962710857391357, - "learning_rate": 2.248175182481752e-05, - "loss": 1.2343, + "epoch": 0.10687022900763359, + "grad_norm": 0.7720238566398621, + "learning_rate": 1.0685169124024286e-05, + "loss": 1.0879, "step": 616 }, { - "epoch": 0.22530582435639948, - "grad_norm": 1.0312621593475342, - "learning_rate": 2.2518248175182485e-05, - "loss": 1.2371, + "epoch": 0.10704371963913949, + "grad_norm": 0.743954062461853, + "learning_rate": 1.0702515177797053e-05, + "loss": 1.062, "step": 617 }, { - "epoch": 0.22567098776702574, - "grad_norm": 1.2244962453842163, - "learning_rate": 2.2554744525547447e-05, - "loss": 1.1754, + "epoch": 0.10721721027064539, + "grad_norm": 0.8227225542068481, + "learning_rate": 1.071986123156982e-05, + "loss": 0.9329, "step": 618 }, { - "epoch": 0.226036151177652, - "grad_norm": 1.9034221172332764, - "learning_rate": 2.259124087591241e-05, - "loss": 1.2183, + "epoch": 0.10739070090215129, + "grad_norm": 1.5451098680496216, + "learning_rate": 1.0737207285342586e-05, + "loss": 1.0999, "step": 619 }, { - "epoch": 0.22640131458827825, - "grad_norm": 0.9052218794822693, - "learning_rate": 2.2627737226277376e-05, - "loss": 1.1797, + "epoch": 0.10756419153365718, + "grad_norm": 1.0258731842041016, + "learning_rate": 1.0754553339115353e-05, + "loss": 0.8938, "step": 620 }, { - "epoch": 0.2267664779989045, - "grad_norm": 0.802769660949707, - "learning_rate": 2.2664233576642337e-05, - "loss": 1.1737, + "epoch": 0.10773768216516308, + "grad_norm": 1.6985119581222534, + "learning_rate": 1.077189939288812e-05, + "loss": 0.8752, "step": 621 }, { - "epoch": 0.22713164140953077, - "grad_norm": 1.4254635572433472, - "learning_rate": 2.27007299270073e-05, - "loss": 1.1979, + "epoch": 0.10791117279666898, + "grad_norm": 1.1578688621520996, + "learning_rate": 1.0789245446660885e-05, + "loss": 0.9036, "step": 622 }, { - "epoch": 0.22749680482015702, - "grad_norm": 1.3750559091567993, - "learning_rate": 2.2737226277372266e-05, - "loss": 1.2168, + "epoch": 0.10808466342817488, + "grad_norm": 1.1156891584396362, + "learning_rate": 1.0806591500433651e-05, + "loss": 0.8953, "step": 623 }, { - "epoch": 0.22786196823078328, - "grad_norm": 0.8507483005523682, - "learning_rate": 2.2773722627737227e-05, - "loss": 1.1895, + "epoch": 0.10825815405968078, + "grad_norm": 2.143883228302002, + "learning_rate": 1.0823937554206418e-05, + "loss": 0.76, "step": 624 }, { - "epoch": 0.22822713164140954, - "grad_norm": 1.1509250402450562, - "learning_rate": 2.2810218978102192e-05, - "loss": 1.2107, + "epoch": 0.10843164469118667, + "grad_norm": 0.9136075377464294, + "learning_rate": 1.0841283607979185e-05, + "loss": 1.0327, "step": 625 }, { - "epoch": 0.2285922950520358, - "grad_norm": 1.2443866729736328, - "learning_rate": 2.2846715328467156e-05, - "loss": 1.1913, + "epoch": 0.10860513532269257, + "grad_norm": 0.8192294836044312, + "learning_rate": 1.0858629661751951e-05, + "loss": 0.9177, "step": 626 }, { - "epoch": 0.22895745846266205, - "grad_norm": 1.2669581174850464, - "learning_rate": 2.2883211678832117e-05, - "loss": 1.1859, + "epoch": 0.10877862595419847, + "grad_norm": 1.0420821905136108, + "learning_rate": 1.0875975715524718e-05, + "loss": 0.9089, "step": 627 }, { - "epoch": 0.2293226218732883, - "grad_norm": 0.9887440204620361, - "learning_rate": 2.2919708029197082e-05, - "loss": 1.1823, + "epoch": 0.10895211658570438, + "grad_norm": 1.1073530912399292, + "learning_rate": 1.0893321769297485e-05, + "loss": 0.8135, "step": 628 }, { - "epoch": 0.22968778528391456, - "grad_norm": 0.8271394371986389, - "learning_rate": 2.2956204379562043e-05, - "loss": 1.1655, + "epoch": 0.10912560721721028, + "grad_norm": 1.2883092164993286, + "learning_rate": 1.0910667823070253e-05, + "loss": 1.0034, "step": 629 }, { - "epoch": 0.23005294869454082, - "grad_norm": 1.3163014650344849, - "learning_rate": 2.2992700729927008e-05, - "loss": 1.1771, + "epoch": 0.10929909784871616, + "grad_norm": 1.843510389328003, + "learning_rate": 1.092801387684302e-05, + "loss": 0.7979, "step": 630 }, { - "epoch": 0.23041811210516705, - "grad_norm": 1.000643014907837, - "learning_rate": 2.3029197080291972e-05, - "loss": 1.171, + "epoch": 0.10947258848022207, + "grad_norm": 1.2382749319076538, + "learning_rate": 1.0945359930615787e-05, + "loss": 1.0181, "step": 631 }, { - "epoch": 0.2307832755157933, - "grad_norm": 1.1546783447265625, - "learning_rate": 2.3065693430656934e-05, - "loss": 1.2141, + "epoch": 0.10964607911172797, + "grad_norm": 0.7657341957092285, + "learning_rate": 1.0962705984388553e-05, + "loss": 1.0205, "step": 632 }, { - "epoch": 0.23114843892641956, - "grad_norm": 1.1916320323944092, - "learning_rate": 2.31021897810219e-05, - "loss": 1.2205, + "epoch": 0.10981956974323387, + "grad_norm": 0.9405484795570374, + "learning_rate": 1.098005203816132e-05, + "loss": 0.7903, "step": 633 }, { - "epoch": 0.23151360233704582, - "grad_norm": 1.2281887531280518, - "learning_rate": 2.3138686131386866e-05, - "loss": 1.1846, + "epoch": 0.10999306037473977, + "grad_norm": 2.004823684692383, + "learning_rate": 1.0997398091934087e-05, + "loss": 0.9053, "step": 634 }, { - "epoch": 0.23187876574767208, - "grad_norm": 1.0115739107131958, - "learning_rate": 2.3175182481751827e-05, - "loss": 1.2214, + "epoch": 0.11016655100624566, + "grad_norm": 1.0525791645050049, + "learning_rate": 1.1014744145706853e-05, + "loss": 0.9897, "step": 635 }, { - "epoch": 0.23224392915829833, - "grad_norm": 1.1583013534545898, - "learning_rate": 2.3211678832116792e-05, - "loss": 1.2109, + "epoch": 0.11034004163775156, + "grad_norm": 0.696193277835846, + "learning_rate": 1.103209019947962e-05, + "loss": 0.9238, "step": 636 }, { - "epoch": 0.2326090925689246, - "grad_norm": 1.056147813796997, - "learning_rate": 2.3248175182481756e-05, - "loss": 1.1777, + "epoch": 0.11051353226925746, + "grad_norm": 0.8357210755348206, + "learning_rate": 1.1049436253252385e-05, + "loss": 1.0208, "step": 637 }, { - "epoch": 0.23297425597955085, - "grad_norm": 0.7799858450889587, - "learning_rate": 2.3284671532846718e-05, - "loss": 1.15, + "epoch": 0.11068702290076336, + "grad_norm": 1.235217809677124, + "learning_rate": 1.1066782307025152e-05, + "loss": 0.8708, "step": 638 }, { - "epoch": 0.2333394193901771, - "grad_norm": 1.2036656141281128, - "learning_rate": 2.3321167883211682e-05, - "loss": 1.1575, + "epoch": 0.11086051353226926, + "grad_norm": 1.107261300086975, + "learning_rate": 1.1084128360797918e-05, + "loss": 0.939, "step": 639 }, { - "epoch": 0.23370458280080336, - "grad_norm": 1.272029995918274, - "learning_rate": 2.3357664233576643e-05, - "loss": 1.2028, + "epoch": 0.11103400416377515, + "grad_norm": 1.1334728002548218, + "learning_rate": 1.1101474414570685e-05, + "loss": 0.9326, "step": 640 }, { - "epoch": 0.23406974621142962, - "grad_norm": 0.9792819023132324, - "learning_rate": 2.3394160583941608e-05, - "loss": 1.1532, + "epoch": 0.11120749479528105, + "grad_norm": 0.7712641358375549, + "learning_rate": 1.1118820468343452e-05, + "loss": 0.9651, "step": 641 }, { - "epoch": 0.23443490962205588, - "grad_norm": 1.2136130332946777, - "learning_rate": 2.3430656934306572e-05, - "loss": 1.229, + "epoch": 0.11138098542678695, + "grad_norm": 0.9346804618835449, + "learning_rate": 1.1136166522116219e-05, + "loss": 0.8052, "step": 642 }, { - "epoch": 0.23480007303268213, - "grad_norm": 1.082485318183899, - "learning_rate": 2.3467153284671534e-05, - "loss": 1.1615, + "epoch": 0.11155447605829286, + "grad_norm": 1.1329578161239624, + "learning_rate": 1.1153512575888985e-05, + "loss": 0.9458, "step": 643 }, { - "epoch": 0.2351652364433084, - "grad_norm": 0.8880698084831238, - "learning_rate": 2.3503649635036498e-05, - "loss": 1.1801, + "epoch": 0.11172796668979876, + "grad_norm": 1.0851794481277466, + "learning_rate": 1.1170858629661754e-05, + "loss": 0.8376, "step": 644 }, { - "epoch": 0.23553039985393465, - "grad_norm": 1.2473796606063843, - "learning_rate": 2.3540145985401463e-05, - "loss": 1.1826, + "epoch": 0.11190145732130465, + "grad_norm": 1.0983810424804688, + "learning_rate": 1.118820468343452e-05, + "loss": 0.8037, "step": 645 }, { - "epoch": 0.2358955632645609, - "grad_norm": 1.7428573369979858, - "learning_rate": 2.3576642335766424e-05, - "loss": 1.1763, + "epoch": 0.11207494795281055, + "grad_norm": 1.0667318105697632, + "learning_rate": 1.1205550737207287e-05, + "loss": 0.802, "step": 646 }, { - "epoch": 0.23626072667518713, - "grad_norm": 1.143588900566101, - "learning_rate": 2.361313868613139e-05, - "loss": 1.196, + "epoch": 0.11224843858431645, + "grad_norm": 0.9273934960365295, + "learning_rate": 1.1222896790980054e-05, + "loss": 0.8059, "step": 647 }, { - "epoch": 0.2366258900858134, - "grad_norm": 1.376758098602295, - "learning_rate": 2.3649635036496353e-05, - "loss": 1.2039, + "epoch": 0.11242192921582235, + "grad_norm": 1.1451305150985718, + "learning_rate": 1.124024284475282e-05, + "loss": 0.7954, "step": 648 }, { - "epoch": 0.23699105349643965, - "grad_norm": 1.3171701431274414, - "learning_rate": 2.3686131386861314e-05, - "loss": 1.1921, + "epoch": 0.11259541984732824, + "grad_norm": 0.831831157207489, + "learning_rate": 1.1257588898525587e-05, + "loss": 0.9861, "step": 649 }, { - "epoch": 0.2373562169070659, - "grad_norm": 1.2571752071380615, - "learning_rate": 2.372262773722628e-05, - "loss": 1.1781, + "epoch": 0.11276891047883414, + "grad_norm": 0.8697718381881714, + "learning_rate": 1.1274934952298354e-05, + "loss": 0.8496, "step": 650 }, { - "epoch": 0.23772138031769216, - "grad_norm": 1.2064151763916016, - "learning_rate": 2.375912408759124e-05, - "loss": 1.1454, + "epoch": 0.11294240111034004, + "grad_norm": 0.9105772376060486, + "learning_rate": 1.129228100607112e-05, + "loss": 0.8997, "step": 651 }, { - "epoch": 0.23808654372831842, - "grad_norm": 1.130691647529602, - "learning_rate": 2.3795620437956204e-05, - "loss": 1.1897, + "epoch": 0.11311589174184594, + "grad_norm": 0.9654907584190369, + "learning_rate": 1.1309627059843885e-05, + "loss": 0.9705, "step": 652 }, { - "epoch": 0.23845170713894467, - "grad_norm": 1.1216059923171997, - "learning_rate": 2.3832116788321172e-05, - "loss": 1.1737, + "epoch": 0.11328938237335184, + "grad_norm": 1.9748125076293945, + "learning_rate": 1.1326973113616652e-05, + "loss": 1.1084, "step": 653 }, { - "epoch": 0.23881687054957093, - "grad_norm": 1.1413416862487793, - "learning_rate": 2.3868613138686134e-05, - "loss": 1.1522, + "epoch": 0.11346287300485773, + "grad_norm": 0.7869737148284912, + "learning_rate": 1.1344319167389419e-05, + "loss": 0.8445, "step": 654 }, { - "epoch": 0.2391820339601972, - "grad_norm": 1.071456789970398, - "learning_rate": 2.3905109489051098e-05, - "loss": 1.1588, + "epoch": 0.11363636363636363, + "grad_norm": 0.7861626148223877, + "learning_rate": 1.1361665221162186e-05, + "loss": 0.9758, "step": 655 }, { - "epoch": 0.23954719737082344, - "grad_norm": 1.4380015134811401, - "learning_rate": 2.3941605839416063e-05, - "loss": 1.1752, + "epoch": 0.11380985426786953, + "grad_norm": 0.9714059829711914, + "learning_rate": 1.1379011274934952e-05, + "loss": 1.001, "step": 656 }, { - "epoch": 0.2399123607814497, - "grad_norm": 1.0659916400909424, - "learning_rate": 2.3978102189781024e-05, - "loss": 1.1895, + "epoch": 0.11398334489937544, + "grad_norm": 1.4099642038345337, + "learning_rate": 1.1396357328707719e-05, + "loss": 0.9097, "step": 657 }, { - "epoch": 0.24027752419207596, - "grad_norm": 1.3234971761703491, - "learning_rate": 2.401459854014599e-05, - "loss": 1.1681, + "epoch": 0.11415683553088134, + "grad_norm": 0.8929559588432312, + "learning_rate": 1.1413703382480487e-05, + "loss": 0.8914, "step": 658 }, { - "epoch": 0.24064268760270222, - "grad_norm": 0.8867655396461487, - "learning_rate": 2.405109489051095e-05, - "loss": 1.1638, + "epoch": 0.11433032616238722, + "grad_norm": 1.0276211500167847, + "learning_rate": 1.1431049436253254e-05, + "loss": 0.833, "step": 659 }, { - "epoch": 0.24100785101332847, - "grad_norm": 0.8754520416259766, - "learning_rate": 2.4087591240875914e-05, - "loss": 1.1578, + "epoch": 0.11450381679389313, + "grad_norm": 0.8742998838424683, + "learning_rate": 1.144839549002602e-05, + "loss": 1.0149, "step": 660 }, { - "epoch": 0.24137301442395473, - "grad_norm": 0.8900312781333923, - "learning_rate": 2.412408759124088e-05, - "loss": 1.1718, + "epoch": 0.11467730742539903, + "grad_norm": 1.2670873403549194, + "learning_rate": 1.1465741543798787e-05, + "loss": 0.9536, "step": 661 }, { - "epoch": 0.241738177834581, - "grad_norm": 0.835760235786438, - "learning_rate": 2.416058394160584e-05, - "loss": 1.1921, + "epoch": 0.11485079805690493, + "grad_norm": 0.8692175149917603, + "learning_rate": 1.1483087597571554e-05, + "loss": 0.9866, "step": 662 }, { - "epoch": 0.24210334124520724, - "grad_norm": 1.265151023864746, - "learning_rate": 2.4197080291970805e-05, - "loss": 1.1692, + "epoch": 0.11502428868841083, + "grad_norm": 0.9323992133140564, + "learning_rate": 1.150043365134432e-05, + "loss": 0.8123, "step": 663 }, { - "epoch": 0.24246850465583347, - "grad_norm": 1.001950740814209, - "learning_rate": 2.423357664233577e-05, - "loss": 1.1525, + "epoch": 0.11519777931991672, + "grad_norm": 0.7953401803970337, + "learning_rate": 1.1517779705117088e-05, + "loss": 0.959, "step": 664 }, { - "epoch": 0.24283366806645973, - "grad_norm": 1.2160698175430298, - "learning_rate": 2.427007299270073e-05, - "loss": 1.1831, + "epoch": 0.11537126995142262, + "grad_norm": 1.9961274862289429, + "learning_rate": 1.1535125758889854e-05, + "loss": 1.0233, "step": 665 }, { - "epoch": 0.24319883147708599, - "grad_norm": 1.2094972133636475, - "learning_rate": 2.4306569343065695e-05, - "loss": 1.1299, + "epoch": 0.11554476058292852, + "grad_norm": 1.1108171939849854, + "learning_rate": 1.1552471812662621e-05, + "loss": 0.9309, "step": 666 }, { - "epoch": 0.24356399488771224, - "grad_norm": 0.8463646173477173, - "learning_rate": 2.434306569343066e-05, - "loss": 1.1542, + "epoch": 0.11571825121443442, + "grad_norm": 0.8608222007751465, + "learning_rate": 1.1569817866435386e-05, + "loss": 0.9788, "step": 667 }, { - "epoch": 0.2439291582983385, - "grad_norm": 1.518998146057129, - "learning_rate": 2.437956204379562e-05, - "loss": 1.2119, + "epoch": 0.11589174184594032, + "grad_norm": 1.2235745191574097, + "learning_rate": 1.1587163920208153e-05, + "loss": 1.1235, "step": 668 }, { - "epoch": 0.24429432170896476, - "grad_norm": 1.5065865516662598, - "learning_rate": 2.4416058394160585e-05, - "loss": 1.2074, + "epoch": 0.11606523247744621, + "grad_norm": 0.6438035368919373, + "learning_rate": 1.160450997398092e-05, + "loss": 1.0137, "step": 669 }, { - "epoch": 0.244659485119591, - "grad_norm": 1.3690471649169922, - "learning_rate": 2.4452554744525546e-05, - "loss": 1.1064, + "epoch": 0.11623872310895211, + "grad_norm": 1.1957402229309082, + "learning_rate": 1.1621856027753686e-05, + "loss": 0.7561, "step": 670 }, { - "epoch": 0.24502464853021727, - "grad_norm": 1.439218282699585, - "learning_rate": 2.448905109489051e-05, - "loss": 1.1558, + "epoch": 0.11641221374045801, + "grad_norm": 0.9235798120498657, + "learning_rate": 1.1639202081526453e-05, + "loss": 0.8608, "step": 671 }, { - "epoch": 0.24538981194084353, - "grad_norm": 1.6192086935043335, - "learning_rate": 2.452554744525548e-05, - "loss": 1.1641, + "epoch": 0.11658570437196392, + "grad_norm": 1.497923493385315, + "learning_rate": 1.165654813529922e-05, + "loss": 0.8533, "step": 672 }, { - "epoch": 0.24575497535146978, - "grad_norm": 1.6800084114074707, - "learning_rate": 2.4562043795620437e-05, - "loss": 1.1865, + "epoch": 0.11675919500346982, + "grad_norm": 0.9704906344413757, + "learning_rate": 1.1673894189071988e-05, + "loss": 0.7617, "step": 673 }, { - "epoch": 0.24612013876209604, - "grad_norm": 1.640601396560669, - "learning_rate": 2.4598540145985405e-05, - "loss": 1.1328, + "epoch": 0.1169326856349757, + "grad_norm": 2.252171754837036, + "learning_rate": 1.1691240242844754e-05, + "loss": 0.8904, "step": 674 }, { - "epoch": 0.2464853021727223, - "grad_norm": 1.0114965438842773, - "learning_rate": 2.463503649635037e-05, - "loss": 1.1942, + "epoch": 0.11710617626648161, + "grad_norm": 0.7956164479255676, + "learning_rate": 1.1708586296617521e-05, + "loss": 0.9678, "step": 675 }, { - "epoch": 0.24685046558334856, - "grad_norm": 1.5129473209381104, - "learning_rate": 2.467153284671533e-05, - "loss": 1.191, + "epoch": 0.11727966689798751, + "grad_norm": 0.8376802206039429, + "learning_rate": 1.1725932350390288e-05, + "loss": 0.8887, "step": 676 }, { - "epoch": 0.2472156289939748, - "grad_norm": 1.3807483911514282, - "learning_rate": 2.4708029197080295e-05, - "loss": 1.183, + "epoch": 0.11745315752949341, + "grad_norm": 1.017789602279663, + "learning_rate": 1.1743278404163055e-05, + "loss": 1.0369, "step": 677 }, { - "epoch": 0.24758079240460107, - "grad_norm": 1.0233219861984253, - "learning_rate": 2.474452554744526e-05, - "loss": 1.1675, + "epoch": 0.11762664816099931, + "grad_norm": 1.012820839881897, + "learning_rate": 1.1760624457935821e-05, + "loss": 0.8064, "step": 678 }, { - "epoch": 0.24794595581522733, - "grad_norm": 0.914831280708313, - "learning_rate": 2.478102189781022e-05, - "loss": 1.1516, + "epoch": 0.1178001387925052, + "grad_norm": 1.3267112970352173, + "learning_rate": 1.1777970511708588e-05, + "loss": 0.8201, "step": 679 }, { - "epoch": 0.24831111922585358, - "grad_norm": 1.2852866649627686, - "learning_rate": 2.4817518248175185e-05, - "loss": 1.1334, + "epoch": 0.1179736294240111, + "grad_norm": 0.9907165765762329, + "learning_rate": 1.1795316565481355e-05, + "loss": 0.9116, "step": 680 }, { - "epoch": 0.2486762826364798, - "grad_norm": 1.2587239742279053, - "learning_rate": 2.4854014598540146e-05, - "loss": 1.1931, + "epoch": 0.118147120055517, + "grad_norm": 0.7892305850982666, + "learning_rate": 1.181266261925412e-05, + "loss": 0.9604, "step": 681 }, { - "epoch": 0.24904144604710607, - "grad_norm": 2.095913887023926, - "learning_rate": 2.489051094890511e-05, - "loss": 1.1816, + "epoch": 0.1183206106870229, + "grad_norm": 0.8325390815734863, + "learning_rate": 1.1830008673026886e-05, + "loss": 0.9155, "step": 682 }, { - "epoch": 0.24940660945773233, - "grad_norm": 1.2520564794540405, - "learning_rate": 2.4927007299270075e-05, - "loss": 1.1525, + "epoch": 0.1184941013185288, + "grad_norm": 1.1410521268844604, + "learning_rate": 1.1847354726799653e-05, + "loss": 0.7419, "step": 683 }, { - "epoch": 0.24977177286835858, - "grad_norm": 1.2239229679107666, - "learning_rate": 2.4963503649635037e-05, - "loss": 1.1876, + "epoch": 0.11866759195003469, + "grad_norm": 2.5031025409698486, + "learning_rate": 1.186470078057242e-05, + "loss": 1.1099, "step": 684 }, { - "epoch": 0.25013693627898487, - "grad_norm": 0.9445881843566895, - "learning_rate": 2.5e-05, - "loss": 1.1393, + "epoch": 0.1188410825815406, + "grad_norm": 0.7818343639373779, + "learning_rate": 1.1882046834345186e-05, + "loss": 1.0137, "step": 685 }, { - "epoch": 0.2505020996896111, - "grad_norm": 1.4442678689956665, - "learning_rate": 2.5036496350364966e-05, - "loss": 1.1958, + "epoch": 0.1190145732130465, + "grad_norm": 0.910425066947937, + "learning_rate": 1.1899392888117953e-05, + "loss": 0.9065, "step": 686 }, { - "epoch": 0.2508672631002374, - "grad_norm": 1.1458219289779663, - "learning_rate": 2.5072992700729927e-05, - "loss": 1.2189, + "epoch": 0.1191880638445524, + "grad_norm": 1.0307916402816772, + "learning_rate": 1.191673894189072e-05, + "loss": 0.7656, "step": 687 }, { - "epoch": 0.25123242651086364, - "grad_norm": 0.9616707563400269, - "learning_rate": 2.510948905109489e-05, - "loss": 1.1533, + "epoch": 0.1193615544760583, + "grad_norm": 0.7868828177452087, + "learning_rate": 1.1934084995663488e-05, + "loss": 0.9573, "step": 688 }, { - "epoch": 0.25159758992148984, - "grad_norm": 1.5767484903335571, - "learning_rate": 2.514598540145986e-05, - "loss": 1.173, + "epoch": 0.11953504510756419, + "grad_norm": 0.6996117234230042, + "learning_rate": 1.1951431049436255e-05, + "loss": 1.0051, "step": 689 }, { - "epoch": 0.2519627533321161, - "grad_norm": 1.0976102352142334, - "learning_rate": 2.5182481751824817e-05, - "loss": 1.1456, + "epoch": 0.11970853573907009, + "grad_norm": 1.4863032102584839, + "learning_rate": 1.1968777103209022e-05, + "loss": 0.8772, "step": 690 }, { - "epoch": 0.25232791674274235, - "grad_norm": 1.2026135921478271, - "learning_rate": 2.5218978102189785e-05, - "loss": 1.2053, + "epoch": 0.11988202637057599, + "grad_norm": 1.569183349609375, + "learning_rate": 1.1986123156981788e-05, + "loss": 0.7285, "step": 691 }, { - "epoch": 0.2526930801533686, - "grad_norm": 1.7931241989135742, - "learning_rate": 2.5255474452554743e-05, - "loss": 1.1812, + "epoch": 0.12005551700208189, + "grad_norm": 0.8275611400604248, + "learning_rate": 1.2003469210754555e-05, + "loss": 0.9553, "step": 692 }, { - "epoch": 0.25305824356399487, - "grad_norm": 1.162247896194458, - "learning_rate": 2.529197080291971e-05, - "loss": 1.1621, + "epoch": 0.12022900763358779, + "grad_norm": 0.7835754752159119, + "learning_rate": 1.2020815264527322e-05, + "loss": 0.9678, "step": 693 }, { - "epoch": 0.2534234069746211, - "grad_norm": 1.2464817762374878, - "learning_rate": 2.5328467153284675e-05, - "loss": 1.1776, + "epoch": 0.12040249826509368, + "grad_norm": 0.7934280633926392, + "learning_rate": 1.2038161318300088e-05, + "loss": 0.9517, "step": 694 }, { - "epoch": 0.2537885703852474, - "grad_norm": 1.2272287607192993, - "learning_rate": 2.5364963503649637e-05, - "loss": 1.1661, + "epoch": 0.12057598889659958, + "grad_norm": 2.9330203533172607, + "learning_rate": 1.2055507372072855e-05, + "loss": 1.0142, "step": 695 }, { - "epoch": 0.25415373379587364, - "grad_norm": 0.8898343443870544, - "learning_rate": 2.54014598540146e-05, - "loss": 1.1371, + "epoch": 0.12074947952810548, + "grad_norm": 0.779927670955658, + "learning_rate": 1.207285342584562e-05, + "loss": 0.8748, "step": 696 }, { - "epoch": 0.2545188972064999, - "grad_norm": 1.3448374271392822, - "learning_rate": 2.5437956204379566e-05, - "loss": 1.1685, + "epoch": 0.12092297015961138, + "grad_norm": 1.1059417724609375, + "learning_rate": 1.2090199479618387e-05, + "loss": 1.0896, "step": 697 }, { - "epoch": 0.25488406061712615, - "grad_norm": 0.9594965577125549, - "learning_rate": 2.5474452554744527e-05, - "loss": 1.1344, + "epoch": 0.12109646079111729, + "grad_norm": 1.1194554567337036, + "learning_rate": 1.2107545533391153e-05, + "loss": 0.8303, "step": 698 }, { - "epoch": 0.2552492240277524, - "grad_norm": 1.4403787851333618, - "learning_rate": 2.551094890510949e-05, - "loss": 1.1991, + "epoch": 0.12126995142262317, + "grad_norm": 0.8807851672172546, + "learning_rate": 1.212489158716392e-05, + "loss": 0.978, "step": 699 }, { - "epoch": 0.25561438743837867, - "grad_norm": 1.1203335523605347, - "learning_rate": 2.5547445255474456e-05, - "loss": 1.1459, + "epoch": 0.12144344205412907, + "grad_norm": 1.101300597190857, + "learning_rate": 1.2142237640936687e-05, + "loss": 0.7712, "step": 700 }, { - "epoch": 0.2559795508490049, - "grad_norm": 1.0530915260314941, - "learning_rate": 2.5583941605839417e-05, - "loss": 1.1043, + "epoch": 0.12161693268563498, + "grad_norm": 1.0599615573883057, + "learning_rate": 1.2159583694709454e-05, + "loss": 0.7832, "step": 701 }, { - "epoch": 0.2563447142596312, - "grad_norm": 1.1028295755386353, - "learning_rate": 2.5620437956204382e-05, - "loss": 1.1375, + "epoch": 0.12179042331714088, + "grad_norm": 1.146477460861206, + "learning_rate": 1.217692974848222e-05, + "loss": 0.9373, "step": 702 }, { - "epoch": 0.25670987767025744, - "grad_norm": 1.2067461013793945, - "learning_rate": 2.5656934306569343e-05, - "loss": 1.1898, + "epoch": 0.12196391394864678, + "grad_norm": 1.0841584205627441, + "learning_rate": 1.2194275802254989e-05, + "loss": 0.8142, "step": 703 }, { - "epoch": 0.2570750410808837, - "grad_norm": 1.2985962629318237, - "learning_rate": 2.5693430656934308e-05, - "loss": 1.1731, + "epoch": 0.12213740458015267, + "grad_norm": 1.3879425525665283, + "learning_rate": 1.2211621856027755e-05, + "loss": 0.8994, "step": 704 }, { - "epoch": 0.25744020449150995, - "grad_norm": 1.5154839754104614, - "learning_rate": 2.5729927007299272e-05, - "loss": 1.2267, + "epoch": 0.12231089521165857, + "grad_norm": 0.9901794195175171, + "learning_rate": 1.2228967909800522e-05, + "loss": 1.0029, "step": 705 }, { - "epoch": 0.2578053679021362, - "grad_norm": 1.1521481275558472, - "learning_rate": 2.5766423357664233e-05, - "loss": 1.1671, + "epoch": 0.12248438584316447, + "grad_norm": 0.877913773059845, + "learning_rate": 1.2246313963573289e-05, + "loss": 0.853, "step": 706 }, { - "epoch": 0.25817053131276246, - "grad_norm": 1.4629254341125488, - "learning_rate": 2.5802919708029198e-05, - "loss": 1.1927, + "epoch": 0.12265787647467037, + "grad_norm": 0.7394211888313293, + "learning_rate": 1.2263660017346055e-05, + "loss": 0.9082, "step": 707 }, { - "epoch": 0.2585356947233887, - "grad_norm": 0.931195080280304, - "learning_rate": 2.5839416058394166e-05, - "loss": 1.1625, + "epoch": 0.12283136710617627, + "grad_norm": 0.9942540526390076, + "learning_rate": 1.2281006071118822e-05, + "loss": 1.043, "step": 708 }, { - "epoch": 0.258900858134015, - "grad_norm": 1.2156867980957031, - "learning_rate": 2.5875912408759124e-05, - "loss": 1.1589, + "epoch": 0.12300485773768216, + "grad_norm": 0.954436719417572, + "learning_rate": 1.2298352124891589e-05, + "loss": 0.9197, "step": 709 }, { - "epoch": 0.25926602154464123, - "grad_norm": 1.3166860342025757, - "learning_rate": 2.591240875912409e-05, - "loss": 1.1884, + "epoch": 0.12317834836918806, + "grad_norm": 1.1329634189605713, + "learning_rate": 1.2315698178664356e-05, + "loss": 0.9187, "step": 710 }, { - "epoch": 0.2596311849552675, - "grad_norm": 0.9052502512931824, - "learning_rate": 2.5948905109489056e-05, - "loss": 1.1687, + "epoch": 0.12335183900069396, + "grad_norm": 0.8735076785087585, + "learning_rate": 1.233304423243712e-05, + "loss": 0.9709, "step": 711 }, { - "epoch": 0.25999634836589375, - "grad_norm": 0.9608241319656372, - "learning_rate": 2.5985401459854017e-05, - "loss": 1.1814, + "epoch": 0.12352532963219987, + "grad_norm": 0.8588553667068481, + "learning_rate": 1.2350390286209887e-05, + "loss": 0.9014, "step": 712 }, { - "epoch": 0.26036151177652, - "grad_norm": 1.9573471546173096, - "learning_rate": 2.6021897810218982e-05, - "loss": 1.1639, + "epoch": 0.12369882026370577, + "grad_norm": 0.7242774963378906, + "learning_rate": 1.2367736339982654e-05, + "loss": 0.9048, "step": 713 }, { - "epoch": 0.26072667518714626, - "grad_norm": 1.6978472471237183, - "learning_rate": 2.6058394160583943e-05, - "loss": 1.1899, + "epoch": 0.12387231089521165, + "grad_norm": 1.2762919664382935, + "learning_rate": 1.238508239375542e-05, + "loss": 0.8682, "step": 714 }, { - "epoch": 0.2610918385977725, - "grad_norm": 2.0063018798828125, - "learning_rate": 2.6094890510948908e-05, - "loss": 1.1635, + "epoch": 0.12404580152671756, + "grad_norm": 1.003089427947998, + "learning_rate": 1.2402428447528187e-05, + "loss": 0.9314, "step": 715 }, { - "epoch": 0.2614570020083988, - "grad_norm": 1.1480631828308105, - "learning_rate": 2.6131386861313872e-05, - "loss": 1.1831, + "epoch": 0.12421929215822346, + "grad_norm": 1.4701381921768188, + "learning_rate": 1.2419774501300954e-05, + "loss": 0.8701, "step": 716 }, { - "epoch": 0.26182216541902503, - "grad_norm": 1.2340322732925415, - "learning_rate": 2.6167883211678833e-05, - "loss": 1.1565, + "epoch": 0.12439278278972936, + "grad_norm": 0.6911842226982117, + "learning_rate": 1.2437120555073722e-05, + "loss": 1.1802, "step": 717 }, { - "epoch": 0.2621873288296513, - "grad_norm": 1.3828108310699463, - "learning_rate": 2.6204379562043798e-05, - "loss": 1.1692, + "epoch": 0.12456627342123526, + "grad_norm": 0.8295224905014038, + "learning_rate": 1.2454466608846489e-05, + "loss": 0.99, "step": 718 }, { - "epoch": 0.26255249224027755, - "grad_norm": 1.1706445217132568, - "learning_rate": 2.6240875912408762e-05, - "loss": 1.1289, + "epoch": 0.12473976405274115, + "grad_norm": 1.0603885650634766, + "learning_rate": 1.2471812662619256e-05, + "loss": 0.9536, "step": 719 }, { - "epoch": 0.2629176556509038, - "grad_norm": 1.148840069770813, - "learning_rate": 2.6277372262773724e-05, - "loss": 1.106, + "epoch": 0.12491325468424705, + "grad_norm": 0.8924951553344727, + "learning_rate": 1.2489158716392022e-05, + "loss": 0.9612, "step": 720 }, { - "epoch": 0.26328281906153006, - "grad_norm": 1.266731858253479, - "learning_rate": 2.6313868613138688e-05, - "loss": 1.1868, + "epoch": 0.12508674531575295, + "grad_norm": 1.5201667547225952, + "learning_rate": 1.250650477016479e-05, + "loss": 0.8079, "step": 721 }, { - "epoch": 0.2636479824721563, - "grad_norm": 2.011859893798828, - "learning_rate": 2.635036496350365e-05, - "loss": 1.1844, + "epoch": 0.12526023594725885, + "grad_norm": 1.0303034782409668, + "learning_rate": 1.2523850823937556e-05, + "loss": 0.969, "step": 722 }, { - "epoch": 0.2640131458827825, - "grad_norm": 1.6553030014038086, - "learning_rate": 2.6386861313868614e-05, - "loss": 1.1422, + "epoch": 0.12543372657876475, + "grad_norm": 0.9095684885978699, + "learning_rate": 1.2541196877710323e-05, + "loss": 0.8779, "step": 723 }, { - "epoch": 0.2643783092934088, - "grad_norm": 1.1908308267593384, - "learning_rate": 2.642335766423358e-05, - "loss": 1.1417, + "epoch": 0.12560721721027066, + "grad_norm": 1.1793122291564941, + "learning_rate": 1.255854293148309e-05, + "loss": 1.0356, "step": 724 }, { - "epoch": 0.26474347270403503, - "grad_norm": 1.1574262380599976, - "learning_rate": 2.645985401459854e-05, - "loss": 1.1379, + "epoch": 0.12578070784177656, + "grad_norm": 0.8952274322509766, + "learning_rate": 1.2575888985255856e-05, + "loss": 0.7913, "step": 725 }, { - "epoch": 0.2651086361146613, - "grad_norm": 1.2108185291290283, - "learning_rate": 2.6496350364963504e-05, - "loss": 1.155, + "epoch": 0.12595419847328243, + "grad_norm": 0.9902394413948059, + "learning_rate": 1.2593235039028621e-05, + "loss": 0.7905, "step": 726 }, { - "epoch": 0.26547379952528755, - "grad_norm": 2.1041672229766846, - "learning_rate": 2.6532846715328472e-05, - "loss": 1.189, + "epoch": 0.12612768910478833, + "grad_norm": 1.1604326963424683, + "learning_rate": 1.2610581092801388e-05, + "loss": 0.8625, "step": 727 }, { - "epoch": 0.2658389629359138, - "grad_norm": 1.0267277956008911, - "learning_rate": 2.656934306569343e-05, - "loss": 1.1423, + "epoch": 0.12630117973629423, + "grad_norm": 1.337523341178894, + "learning_rate": 1.2627927146574154e-05, + "loss": 0.9692, "step": 728 }, { - "epoch": 0.26620412634654006, - "grad_norm": 1.2484291791915894, - "learning_rate": 2.6605839416058398e-05, - "loss": 1.16, + "epoch": 0.12647467036780013, + "grad_norm": 1.3001458644866943, + "learning_rate": 1.2645273200346921e-05, + "loss": 0.7957, "step": 729 }, { - "epoch": 0.2665692897571663, - "grad_norm": 0.9702587723731995, - "learning_rate": 2.6642335766423363e-05, - "loss": 1.1658, + "epoch": 0.12664816099930604, + "grad_norm": 1.1424648761749268, + "learning_rate": 1.2662619254119688e-05, + "loss": 0.8062, "step": 730 }, { - "epoch": 0.2669344531677926, - "grad_norm": 1.1128861904144287, - "learning_rate": 2.6678832116788324e-05, - "loss": 1.1073, + "epoch": 0.12682165163081194, + "grad_norm": 1.0109091997146606, + "learning_rate": 1.2679965307892454e-05, + "loss": 0.9375, "step": 731 }, { - "epoch": 0.26729961657841883, - "grad_norm": 1.3145692348480225, - "learning_rate": 2.6715328467153288e-05, - "loss": 1.1722, + "epoch": 0.12699514226231784, + "grad_norm": 0.8463825583457947, + "learning_rate": 1.2697311361665223e-05, + "loss": 0.8967, "step": 732 }, { - "epoch": 0.2676647799890451, - "grad_norm": 0.9529619812965393, - "learning_rate": 2.675182481751825e-05, - "loss": 1.1672, + "epoch": 0.12716863289382374, + "grad_norm": 0.8907234072685242, + "learning_rate": 1.271465741543799e-05, + "loss": 0.9373, "step": 733 }, { - "epoch": 0.26802994339967134, - "grad_norm": 1.1806892156600952, - "learning_rate": 2.6788321167883214e-05, - "loss": 1.1992, + "epoch": 0.12734212352532964, + "grad_norm": 1.7347222566604614, + "learning_rate": 1.2732003469210756e-05, + "loss": 1.0386, "step": 734 }, { - "epoch": 0.2683951068102976, - "grad_norm": 1.2123585939407349, - "learning_rate": 2.682481751824818e-05, - "loss": 1.1573, + "epoch": 0.12751561415683554, + "grad_norm": 0.8882890343666077, + "learning_rate": 1.2749349522983523e-05, + "loss": 0.8523, "step": 735 }, { - "epoch": 0.26876027022092386, - "grad_norm": 0.9810069799423218, - "learning_rate": 2.686131386861314e-05, - "loss": 1.1362, + "epoch": 0.12768910478834142, + "grad_norm": 1.0062092542648315, + "learning_rate": 1.276669557675629e-05, + "loss": 1.0461, "step": 736 }, { - "epoch": 0.2691254336315501, - "grad_norm": 1.4084688425064087, - "learning_rate": 2.6897810218978104e-05, - "loss": 1.1541, + "epoch": 0.12786259541984732, + "grad_norm": 0.7552171945571899, + "learning_rate": 1.2784041630529056e-05, + "loss": 0.9714, "step": 737 }, { - "epoch": 0.26949059704217637, - "grad_norm": 1.196547269821167, - "learning_rate": 2.693430656934307e-05, - "loss": 1.1473, + "epoch": 0.12803608605135322, + "grad_norm": 1.1329518556594849, + "learning_rate": 1.2801387684301823e-05, + "loss": 0.8079, "step": 738 }, { - "epoch": 0.26985576045280263, - "grad_norm": 0.8092412352561951, - "learning_rate": 2.697080291970803e-05, - "loss": 1.1215, + "epoch": 0.12820957668285912, + "grad_norm": 1.2414352893829346, + "learning_rate": 1.281873373807459e-05, + "loss": 0.7512, "step": 739 }, { - "epoch": 0.2702209238634289, - "grad_norm": 1.1390682458877563, - "learning_rate": 2.7007299270072995e-05, - "loss": 1.1042, + "epoch": 0.12838306731436502, + "grad_norm": 1.0510791540145874, + "learning_rate": 1.2836079791847356e-05, + "loss": 0.856, "step": 740 }, { - "epoch": 0.27058608727405514, - "grad_norm": 0.9913802742958069, - "learning_rate": 2.704379562043796e-05, - "loss": 1.1577, + "epoch": 0.12855655794587093, + "grad_norm": 0.8328848481178284, + "learning_rate": 1.2853425845620121e-05, + "loss": 0.8208, "step": 741 }, { - "epoch": 0.2709512506846814, - "grad_norm": 1.699746012687683, - "learning_rate": 2.708029197080292e-05, - "loss": 1.1609, + "epoch": 0.12873004857737683, + "grad_norm": 1.0422801971435547, + "learning_rate": 1.2870771899392888e-05, + "loss": 0.7739, "step": 742 }, { - "epoch": 0.27131641409530766, - "grad_norm": 1.9054877758026123, - "learning_rate": 2.7116788321167885e-05, - "loss": 1.1311, + "epoch": 0.12890353920888273, + "grad_norm": 1.0033189058303833, + "learning_rate": 1.2888117953165655e-05, + "loss": 0.7847, "step": 743 }, { - "epoch": 0.2716815775059339, - "grad_norm": 1.237195372581482, - "learning_rate": 2.7153284671532846e-05, - "loss": 1.1505, + "epoch": 0.12907702984038863, + "grad_norm": 1.0816172361373901, + "learning_rate": 1.2905464006938421e-05, + "loss": 0.7542, "step": 744 }, { - "epoch": 0.27204674091656017, - "grad_norm": 1.1040980815887451, - "learning_rate": 2.718978102189781e-05, - "loss": 1.1823, + "epoch": 0.12925052047189453, + "grad_norm": 0.8975710868835449, + "learning_rate": 1.2922810060711188e-05, + "loss": 1.0369, "step": 745 }, { - "epoch": 0.2724119043271864, - "grad_norm": 1.5431379079818726, - "learning_rate": 2.722627737226278e-05, - "loss": 1.1606, + "epoch": 0.1294240111034004, + "grad_norm": 1.1595011949539185, + "learning_rate": 1.2940156114483955e-05, + "loss": 1.0042, "step": 746 }, { - "epoch": 0.2727770677378127, - "grad_norm": 0.8744033575057983, - "learning_rate": 2.7262773722627736e-05, - "loss": 1.1334, + "epoch": 0.1295975017349063, + "grad_norm": 1.518700361251831, + "learning_rate": 1.2957502168256723e-05, + "loss": 0.9299, "step": 747 }, { - "epoch": 0.27314223114843894, - "grad_norm": 1.3441598415374756, - "learning_rate": 2.7299270072992704e-05, - "loss": 1.1444, + "epoch": 0.1297709923664122, + "grad_norm": 0.8237302303314209, + "learning_rate": 1.297484822202949e-05, + "loss": 0.7852, "step": 748 }, { - "epoch": 0.2735073945590652, - "grad_norm": 1.357157826423645, - "learning_rate": 2.733576642335767e-05, - "loss": 1.1619, + "epoch": 0.1299444829979181, + "grad_norm": 0.7467516660690308, + "learning_rate": 1.2992194275802257e-05, + "loss": 0.9521, "step": 749 }, { - "epoch": 0.27387255796969145, - "grad_norm": 1.157043695449829, - "learning_rate": 2.737226277372263e-05, - "loss": 1.1272, + "epoch": 0.130117973629424, + "grad_norm": 0.9222540855407715, + "learning_rate": 1.3009540329575023e-05, + "loss": 1.043, "step": 750 }, { - "epoch": 0.2742377213803177, - "grad_norm": 0.9237921833992004, - "learning_rate": 2.7408759124087595e-05, - "loss": 1.1104, + "epoch": 0.1302914642609299, + "grad_norm": 0.9915211796760559, + "learning_rate": 1.302688638334779e-05, + "loss": 0.834, "step": 751 }, { - "epoch": 0.27460288479094397, - "grad_norm": 0.7836745977401733, - "learning_rate": 2.744525547445256e-05, - "loss": 1.136, + "epoch": 0.1304649548924358, + "grad_norm": 1.0473942756652832, + "learning_rate": 1.3044232437120557e-05, + "loss": 0.9138, "step": 752 }, { - "epoch": 0.2749680482015702, - "grad_norm": 1.0475623607635498, - "learning_rate": 2.748175182481752e-05, - "loss": 1.1481, + "epoch": 0.13063844552394172, + "grad_norm": 0.9518341422080994, + "learning_rate": 1.3061578490893323e-05, + "loss": 0.8762, "step": 753 }, { - "epoch": 0.2753332116121965, - "grad_norm": 1.5412874221801758, - "learning_rate": 2.7518248175182485e-05, - "loss": 1.1265, + "epoch": 0.13081193615544762, + "grad_norm": 0.7944692373275757, + "learning_rate": 1.307892454466609e-05, + "loss": 0.9668, "step": 754 }, { - "epoch": 0.27569837502282274, - "grad_norm": 1.1352887153625488, - "learning_rate": 2.7554744525547446e-05, - "loss": 1.1753, + "epoch": 0.13098542678695352, + "grad_norm": 0.9148827791213989, + "learning_rate": 1.3096270598438857e-05, + "loss": 0.95, "step": 755 }, { - "epoch": 0.276063538433449, - "grad_norm": 1.1704764366149902, - "learning_rate": 2.759124087591241e-05, - "loss": 1.1659, + "epoch": 0.1311589174184594, + "grad_norm": 6.546123504638672, + "learning_rate": 1.3113616652211622e-05, + "loss": 0.9912, "step": 756 }, { - "epoch": 0.2764287018440752, - "grad_norm": 1.1367764472961426, - "learning_rate": 2.7627737226277375e-05, - "loss": 1.1583, + "epoch": 0.1313324080499653, + "grad_norm": 0.9369930028915405, + "learning_rate": 1.3130962705984389e-05, + "loss": 0.8494, "step": 757 }, { - "epoch": 0.27679386525470145, - "grad_norm": 0.9353354573249817, - "learning_rate": 2.7664233576642336e-05, - "loss": 1.1149, + "epoch": 0.1315058986814712, + "grad_norm": 0.7022165656089783, + "learning_rate": 1.3148308759757155e-05, + "loss": 0.9124, "step": 758 }, { - "epoch": 0.2771590286653277, - "grad_norm": 1.4971503019332886, - "learning_rate": 2.77007299270073e-05, - "loss": 1.1779, + "epoch": 0.1316793893129771, + "grad_norm": 0.977077305316925, + "learning_rate": 1.3165654813529922e-05, + "loss": 0.7991, "step": 759 }, { - "epoch": 0.27752419207595397, - "grad_norm": 1.3741331100463867, - "learning_rate": 2.7737226277372266e-05, - "loss": 1.1052, + "epoch": 0.131852879944483, + "grad_norm": 0.8781751990318298, + "learning_rate": 1.3183000867302689e-05, + "loss": 0.9255, "step": 760 }, { - "epoch": 0.2778893554865802, - "grad_norm": 1.4283833503723145, - "learning_rate": 2.7773722627737227e-05, - "loss": 1.1229, + "epoch": 0.1320263705759889, + "grad_norm": 1.6098239421844482, + "learning_rate": 1.3200346921075455e-05, + "loss": 0.9143, "step": 761 }, { - "epoch": 0.2782545188972065, - "grad_norm": 1.2721019983291626, - "learning_rate": 2.781021897810219e-05, - "loss": 1.1611, + "epoch": 0.1321998612074948, + "grad_norm": 0.9713040590286255, + "learning_rate": 1.3217692974848224e-05, + "loss": 0.9979, "step": 762 }, { - "epoch": 0.27861968230783274, - "grad_norm": 1.4124202728271484, - "learning_rate": 2.784671532846716e-05, - "loss": 1.1091, + "epoch": 0.1323733518390007, + "grad_norm": 0.8151401877403259, + "learning_rate": 1.323503902862099e-05, + "loss": 0.8967, "step": 763 }, { - "epoch": 0.278984845718459, - "grad_norm": 0.750219464302063, - "learning_rate": 2.7883211678832117e-05, - "loss": 1.1411, + "epoch": 0.1325468424705066, + "grad_norm": 1.207521677017212, + "learning_rate": 1.3252385082393757e-05, + "loss": 0.9526, "step": 764 }, { - "epoch": 0.27935000912908525, - "grad_norm": 1.2052608728408813, - "learning_rate": 2.7919708029197085e-05, - "loss": 1.1874, + "epoch": 0.13272033310201248, + "grad_norm": 0.8407717347145081, + "learning_rate": 1.3269731136166524e-05, + "loss": 0.8354, "step": 765 }, { - "epoch": 0.2797151725397115, - "grad_norm": 1.065862774848938, - "learning_rate": 2.7956204379562043e-05, - "loss": 1.1283, + "epoch": 0.13289382373351838, + "grad_norm": 0.981062650680542, + "learning_rate": 1.328707718993929e-05, + "loss": 0.8469, "step": 766 }, { - "epoch": 0.28008033595033777, - "grad_norm": 1.2131696939468384, - "learning_rate": 2.799270072992701e-05, - "loss": 1.1542, + "epoch": 0.13306731436502428, + "grad_norm": 0.9651755690574646, + "learning_rate": 1.3304423243712057e-05, + "loss": 0.6641, "step": 767 }, { - "epoch": 0.280445499360964, - "grad_norm": 1.6430608034133911, - "learning_rate": 2.8029197080291975e-05, - "loss": 1.1354, + "epoch": 0.13324080499653018, + "grad_norm": 0.7300646305084229, + "learning_rate": 1.3321769297484824e-05, + "loss": 0.9302, "step": 768 }, { - "epoch": 0.2808106627715903, - "grad_norm": 0.9005884528160095, - "learning_rate": 2.8065693430656936e-05, - "loss": 1.0773, + "epoch": 0.13341429562803608, + "grad_norm": 1.488232970237732, + "learning_rate": 1.333911535125759e-05, + "loss": 0.9185, "step": 769 }, { - "epoch": 0.28117582618221654, - "grad_norm": 1.074357032775879, - "learning_rate": 2.81021897810219e-05, - "loss": 1.1073, + "epoch": 0.13358778625954199, + "grad_norm": 1.1995389461517334, + "learning_rate": 1.3356461405030357e-05, + "loss": 0.8867, "step": 770 }, { - "epoch": 0.2815409895928428, - "grad_norm": 1.0276203155517578, - "learning_rate": 2.8138686131386866e-05, - "loss": 1.1206, + "epoch": 0.1337612768910479, + "grad_norm": 1.3114588260650635, + "learning_rate": 1.3373807458803122e-05, + "loss": 0.8696, "step": 771 }, { - "epoch": 0.28190615300346905, - "grad_norm": 1.3735047578811646, - "learning_rate": 2.8175182481751827e-05, - "loss": 1.1786, + "epoch": 0.1339347675225538, + "grad_norm": 1.0361328125, + "learning_rate": 1.3391153512575889e-05, + "loss": 0.9382, "step": 772 }, { - "epoch": 0.2822713164140953, - "grad_norm": 1.0709574222564697, - "learning_rate": 2.821167883211679e-05, - "loss": 1.0919, + "epoch": 0.1341082581540597, + "grad_norm": 0.9166584610939026, + "learning_rate": 1.3408499566348656e-05, + "loss": 0.7827, "step": 773 }, { - "epoch": 0.28263647982472156, - "grad_norm": 1.2485579252243042, - "learning_rate": 2.8248175182481756e-05, - "loss": 1.1339, + "epoch": 0.1342817487855656, + "grad_norm": 0.7488536834716797, + "learning_rate": 1.3425845620121422e-05, + "loss": 0.7671, "step": 774 }, { - "epoch": 0.2830016432353478, - "grad_norm": 1.4905271530151367, - "learning_rate": 2.8284671532846717e-05, - "loss": 1.1686, + "epoch": 0.13445523941707146, + "grad_norm": 1.604833960533142, + "learning_rate": 1.3443191673894189e-05, + "loss": 0.8611, "step": 775 }, { - "epoch": 0.2833668066459741, - "grad_norm": 1.3758679628372192, - "learning_rate": 2.832116788321168e-05, - "loss": 1.1637, + "epoch": 0.13462873004857737, + "grad_norm": 1.0341730117797852, + "learning_rate": 1.3460537727666956e-05, + "loss": 0.8708, "step": 776 }, { - "epoch": 0.28373197005660034, - "grad_norm": 1.2363642454147339, - "learning_rate": 2.8357664233576643e-05, - "loss": 1.1178, + "epoch": 0.13480222068008327, + "grad_norm": 1.1286307573318481, + "learning_rate": 1.3477883781439724e-05, + "loss": 0.7786, "step": 777 }, { - "epoch": 0.2840971334672266, - "grad_norm": 1.0885051488876343, - "learning_rate": 2.8394160583941607e-05, - "loss": 1.1274, + "epoch": 0.13497571131158917, + "grad_norm": 0.8988445997238159, + "learning_rate": 1.349522983521249e-05, + "loss": 0.78, "step": 778 }, { - "epoch": 0.28446229687785285, - "grad_norm": 1.374056339263916, - "learning_rate": 2.8430656934306572e-05, - "loss": 1.1189, + "epoch": 0.13514920194309507, + "grad_norm": 0.8827788829803467, + "learning_rate": 1.3512575888985258e-05, + "loss": 0.9431, "step": 779 }, { - "epoch": 0.2848274602884791, - "grad_norm": 1.396788239479065, - "learning_rate": 2.8467153284671533e-05, - "loss": 1.132, + "epoch": 0.13532269257460097, + "grad_norm": 0.8574429750442505, + "learning_rate": 1.3529921942758024e-05, + "loss": 0.9038, "step": 780 }, { - "epoch": 0.28519262369910536, - "grad_norm": 1.2649401426315308, - "learning_rate": 2.8503649635036498e-05, - "loss": 1.1161, + "epoch": 0.13549618320610687, + "grad_norm": 1.0163263082504272, + "learning_rate": 1.3547267996530791e-05, + "loss": 0.9482, "step": 781 }, { - "epoch": 0.2855577871097316, - "grad_norm": 1.1919310092926025, - "learning_rate": 2.8540145985401466e-05, - "loss": 1.1353, + "epoch": 0.13566967383761278, + "grad_norm": 0.8243135213851929, + "learning_rate": 1.3564614050303558e-05, + "loss": 0.9814, "step": 782 }, { - "epoch": 0.2859229505203579, - "grad_norm": 0.9390687346458435, - "learning_rate": 2.8576642335766423e-05, - "loss": 1.1158, + "epoch": 0.13584316446911868, + "grad_norm": 1.180700659751892, + "learning_rate": 1.3581960104076324e-05, + "loss": 1.1194, "step": 783 }, { - "epoch": 0.28628811393098413, - "grad_norm": 1.1426920890808105, - "learning_rate": 2.861313868613139e-05, - "loss": 1.113, + "epoch": 0.13601665510062458, + "grad_norm": 1.1771677732467651, + "learning_rate": 1.3599306157849091e-05, + "loss": 1.0193, "step": 784 }, { - "epoch": 0.2866532773416104, - "grad_norm": 1.1132735013961792, - "learning_rate": 2.8649635036496356e-05, - "loss": 1.0983, + "epoch": 0.13619014573213045, + "grad_norm": 1.6416809558868408, + "learning_rate": 1.3616652211621858e-05, + "loss": 0.9377, "step": 785 }, { - "epoch": 0.28701844075223665, - "grad_norm": 0.9970917701721191, - "learning_rate": 2.8686131386861317e-05, - "loss": 1.0867, + "epoch": 0.13636363636363635, + "grad_norm": 1.0690160989761353, + "learning_rate": 1.3633998265394623e-05, + "loss": 0.9854, "step": 786 }, { - "epoch": 0.2873836041628629, - "grad_norm": 0.9505035281181335, - "learning_rate": 2.872262773722628e-05, - "loss": 1.0778, + "epoch": 0.13653712699514226, + "grad_norm": 0.9088786840438843, + "learning_rate": 1.365134431916739e-05, + "loss": 0.7827, "step": 787 }, { - "epoch": 0.28774876757348916, - "grad_norm": 1.000669240951538, - "learning_rate": 2.8759124087591243e-05, - "loss": 1.141, + "epoch": 0.13671061762664816, + "grad_norm": 0.8830459117889404, + "learning_rate": 1.3668690372940156e-05, + "loss": 0.9175, "step": 788 }, { - "epoch": 0.2881139309841154, - "grad_norm": 1.5330545902252197, - "learning_rate": 2.8795620437956207e-05, - "loss": 1.162, + "epoch": 0.13688410825815406, + "grad_norm": 1.2134349346160889, + "learning_rate": 1.3686036426712923e-05, + "loss": 0.9036, "step": 789 }, { - "epoch": 0.2884790943947416, - "grad_norm": 0.9774994850158691, - "learning_rate": 2.8832116788321172e-05, - "loss": 1.1333, + "epoch": 0.13705759888965996, + "grad_norm": 1.1176358461380005, + "learning_rate": 1.370338248048569e-05, + "loss": 0.7722, "step": 790 }, { - "epoch": 0.2888442578053679, - "grad_norm": 1.0080770254135132, - "learning_rate": 2.8868613138686133e-05, - "loss": 1.1458, + "epoch": 0.13723108952116586, + "grad_norm": 0.870728611946106, + "learning_rate": 1.3720728534258458e-05, + "loss": 0.8367, "step": 791 }, { - "epoch": 0.28920942121599413, - "grad_norm": 1.2957684993743896, - "learning_rate": 2.8905109489051098e-05, - "loss": 1.0917, + "epoch": 0.13740458015267176, + "grad_norm": 0.8255333304405212, + "learning_rate": 1.3738074588031225e-05, + "loss": 0.8931, "step": 792 }, { - "epoch": 0.2895745846266204, - "grad_norm": 1.2285979986190796, - "learning_rate": 2.8941605839416062e-05, - "loss": 1.132, + "epoch": 0.13757807078417766, + "grad_norm": 0.8214908242225647, + "learning_rate": 1.3755420641803991e-05, + "loss": 0.7703, "step": 793 }, { - "epoch": 0.28993974803724665, - "grad_norm": 1.232651948928833, - "learning_rate": 2.8978102189781023e-05, - "loss": 1.1211, + "epoch": 0.13775156141568357, + "grad_norm": 0.7673734426498413, + "learning_rate": 1.3772766695576758e-05, + "loss": 1.0271, "step": 794 }, { - "epoch": 0.2903049114478729, - "grad_norm": 1.905069351196289, - "learning_rate": 2.9014598540145988e-05, - "loss": 1.1029, + "epoch": 0.13792505204718944, + "grad_norm": 0.815705418586731, + "learning_rate": 1.3790112749349525e-05, + "loss": 0.9036, "step": 795 }, { - "epoch": 0.29067007485849916, - "grad_norm": 1.2766122817993164, - "learning_rate": 2.905109489051095e-05, - "loss": 1.1802, + "epoch": 0.13809854267869534, + "grad_norm": 0.9190661311149597, + "learning_rate": 1.3807458803122291e-05, + "loss": 0.9485, "step": 796 }, { - "epoch": 0.2910352382691254, - "grad_norm": 1.2074211835861206, - "learning_rate": 2.9087591240875914e-05, - "loss": 1.1014, + "epoch": 0.13827203331020124, + "grad_norm": 1.1341471672058105, + "learning_rate": 1.3824804856895058e-05, + "loss": 0.948, "step": 797 }, { - "epoch": 0.2914004016797517, - "grad_norm": 1.5616487264633179, - "learning_rate": 2.912408759124088e-05, - "loss": 1.1093, + "epoch": 0.13844552394170714, + "grad_norm": 0.9912359714508057, + "learning_rate": 1.3842150910667825e-05, + "loss": 0.926, "step": 798 }, { - "epoch": 0.29176556509037793, - "grad_norm": 0.9823098182678223, - "learning_rate": 2.916058394160584e-05, - "loss": 1.1388, + "epoch": 0.13861901457321305, + "grad_norm": 1.0248475074768066, + "learning_rate": 1.3859496964440591e-05, + "loss": 0.9172, "step": 799 }, { - "epoch": 0.2921307285010042, - "grad_norm": 2.063249111175537, - "learning_rate": 2.9197080291970804e-05, - "loss": 1.1741, + "epoch": 0.13879250520471895, + "grad_norm": 1.172512412071228, + "learning_rate": 1.3876843018213358e-05, + "loss": 0.8223, "step": 800 }, { - "epoch": 0.29249589191163045, - "grad_norm": 0.9434864521026611, - "learning_rate": 2.923357664233577e-05, - "loss": 1.1053, + "epoch": 0.13896599583622485, + "grad_norm": 2.0029091835021973, + "learning_rate": 1.3894189071986123e-05, + "loss": 0.7954, "step": 801 }, { - "epoch": 0.2928610553222567, - "grad_norm": 0.9204654693603516, - "learning_rate": 2.927007299270073e-05, - "loss": 1.1147, + "epoch": 0.13913948646773075, + "grad_norm": 0.7321649789810181, + "learning_rate": 1.391153512575889e-05, + "loss": 1.1011, "step": 802 }, { - "epoch": 0.29322621873288296, - "grad_norm": 1.4833625555038452, - "learning_rate": 2.9306569343065698e-05, - "loss": 1.0975, + "epoch": 0.13931297709923665, + "grad_norm": 1.4552580118179321, + "learning_rate": 1.3928881179531657e-05, + "loss": 0.9546, "step": 803 }, { - "epoch": 0.2935913821435092, - "grad_norm": 1.3447284698486328, - "learning_rate": 2.9343065693430662e-05, - "loss": 1.1105, + "epoch": 0.13948646773074255, + "grad_norm": 0.9477526545524597, + "learning_rate": 1.3946227233304423e-05, + "loss": 0.8103, "step": 804 }, { - "epoch": 0.2939565455541355, - "grad_norm": 1.184554934501648, - "learning_rate": 2.9379562043795624e-05, - "loss": 1.0889, + "epoch": 0.13965995836224843, + "grad_norm": 1.010766863822937, + "learning_rate": 1.396357328707719e-05, + "loss": 0.8479, "step": 805 }, { - "epoch": 0.29432170896476173, - "grad_norm": 1.4573850631713867, - "learning_rate": 2.9416058394160588e-05, - "loss": 1.1439, + "epoch": 0.13983344899375433, + "grad_norm": 1.2861683368682861, + "learning_rate": 1.3980919340849958e-05, + "loss": 0.7598, "step": 806 }, { - "epoch": 0.294686872375388, - "grad_norm": 1.2136337757110596, - "learning_rate": 2.945255474452555e-05, - "loss": 1.0996, + "epoch": 0.14000693962526023, + "grad_norm": 0.8894898295402527, + "learning_rate": 1.3998265394622725e-05, + "loss": 1.009, "step": 807 }, { - "epoch": 0.29505203578601424, - "grad_norm": 0.9530742764472961, - "learning_rate": 2.9489051094890514e-05, - "loss": 1.1395, + "epoch": 0.14018043025676613, + "grad_norm": 1.181809663772583, + "learning_rate": 1.4015611448395492e-05, + "loss": 0.7915, "step": 808 }, { - "epoch": 0.2954171991966405, - "grad_norm": 1.279515266418457, - "learning_rate": 2.952554744525548e-05, - "loss": 1.1196, + "epoch": 0.14035392088827203, + "grad_norm": 0.831052303314209, + "learning_rate": 1.4032957502168258e-05, + "loss": 0.9189, "step": 809 }, { - "epoch": 0.29578236260726676, - "grad_norm": 1.4163950681686401, - "learning_rate": 2.956204379562044e-05, - "loss": 1.1235, + "epoch": 0.14052741151977793, + "grad_norm": 1.061376690864563, + "learning_rate": 1.4050303555941025e-05, + "loss": 0.9761, "step": 810 }, { - "epoch": 0.296147526017893, - "grad_norm": 1.2296873331069946, - "learning_rate": 2.9598540145985404e-05, - "loss": 1.1102, + "epoch": 0.14070090215128384, + "grad_norm": 0.9081786870956421, + "learning_rate": 1.4067649609713792e-05, + "loss": 1.1392, "step": 811 }, { - "epoch": 0.29651268942851927, - "grad_norm": 1.210404396057129, - "learning_rate": 2.963503649635037e-05, - "loss": 1.1288, + "epoch": 0.14087439278278974, + "grad_norm": 1.1470727920532227, + "learning_rate": 1.4084995663486558e-05, + "loss": 1.0474, "step": 812 }, { - "epoch": 0.29687785283914553, - "grad_norm": 1.2939397096633911, - "learning_rate": 2.967153284671533e-05, - "loss": 1.1978, + "epoch": 0.14104788341429564, + "grad_norm": 0.8203163743019104, + "learning_rate": 1.4102341717259325e-05, + "loss": 1.0376, "step": 813 }, { - "epoch": 0.2972430162497718, - "grad_norm": 1.846366047859192, - "learning_rate": 2.9708029197080294e-05, - "loss": 1.0946, + "epoch": 0.14122137404580154, + "grad_norm": 0.9975243806838989, + "learning_rate": 1.4119687771032092e-05, + "loss": 0.7976, "step": 814 }, { - "epoch": 0.29760817966039804, - "grad_norm": 1.1188578605651855, - "learning_rate": 2.974452554744526e-05, - "loss": 1.0956, + "epoch": 0.1413948646773074, + "grad_norm": 1.5964168310165405, + "learning_rate": 1.4137033824804859e-05, + "loss": 0.7324, "step": 815 }, { - "epoch": 0.2979733430710243, - "grad_norm": 1.5509464740753174, - "learning_rate": 2.978102189781022e-05, - "loss": 1.1299, + "epoch": 0.14156835530881332, + "grad_norm": 1.195165753364563, + "learning_rate": 1.4154379878577624e-05, + "loss": 0.8552, "step": 816 }, { - "epoch": 0.29833850648165056, - "grad_norm": 1.306592583656311, - "learning_rate": 2.9817518248175185e-05, - "loss": 1.1248, + "epoch": 0.14174184594031922, + "grad_norm": 0.7820180654525757, + "learning_rate": 1.417172593235039e-05, + "loss": 0.9202, "step": 817 }, { - "epoch": 0.2987036698922768, - "grad_norm": 1.319661259651184, - "learning_rate": 2.9854014598540146e-05, - "loss": 1.1243, + "epoch": 0.14191533657182512, + "grad_norm": 2.222125291824341, + "learning_rate": 1.4189071986123157e-05, + "loss": 0.9978, "step": 818 }, { - "epoch": 0.29906883330290307, - "grad_norm": 1.0591555833816528, - "learning_rate": 2.989051094890511e-05, - "loss": 1.1272, + "epoch": 0.14208882720333102, + "grad_norm": 0.7767307162284851, + "learning_rate": 1.4206418039895924e-05, + "loss": 0.9614, "step": 819 }, { - "epoch": 0.2994339967135293, - "grad_norm": 1.3213382959365845, - "learning_rate": 2.9927007299270075e-05, - "loss": 1.0887, + "epoch": 0.14226231783483692, + "grad_norm": 1.696215033531189, + "learning_rate": 1.422376409366869e-05, + "loss": 0.7234, "step": 820 }, { - "epoch": 0.2997991601241556, - "grad_norm": 1.5500199794769287, - "learning_rate": 2.9963503649635036e-05, - "loss": 1.126, + "epoch": 0.14243580846634282, + "grad_norm": 0.9754441976547241, + "learning_rate": 1.4241110147441459e-05, + "loss": 1.0693, "step": 821 }, { - "epoch": 0.30016432353478184, - "grad_norm": 1.3474305868148804, - "learning_rate": 3.0000000000000004e-05, - "loss": 1.0955, + "epoch": 0.14260929909784872, + "grad_norm": 1.7983765602111816, + "learning_rate": 1.4258456201214225e-05, + "loss": 0.8308, "step": 822 }, { - "epoch": 0.3005294869454081, - "grad_norm": 1.4620311260223389, - "learning_rate": 3.003649635036497e-05, - "loss": 1.1029, + "epoch": 0.14278278972935463, + "grad_norm": 1.0824410915374756, + "learning_rate": 1.4275802254986992e-05, + "loss": 0.8044, "step": 823 }, { - "epoch": 0.3008946503560343, - "grad_norm": 1.3238533735275269, - "learning_rate": 3.007299270072993e-05, - "loss": 1.0898, + "epoch": 0.14295628036086053, + "grad_norm": 1.1211001873016357, + "learning_rate": 1.4293148308759759e-05, + "loss": 0.7139, "step": 824 }, { - "epoch": 0.30125981376666056, - "grad_norm": 1.547220230102539, - "learning_rate": 3.0109489051094894e-05, - "loss": 1.1475, + "epoch": 0.1431297709923664, + "grad_norm": 0.8356219530105591, + "learning_rate": 1.4310494362532526e-05, + "loss": 0.9565, "step": 825 }, { - "epoch": 0.3016249771772868, - "grad_norm": 1.5689129829406738, - "learning_rate": 3.014598540145986e-05, - "loss": 1.1068, + "epoch": 0.1433032616238723, + "grad_norm": 0.8935127854347229, + "learning_rate": 1.4327840416305292e-05, + "loss": 0.9326, "step": 826 }, { - "epoch": 0.30199014058791307, - "grad_norm": 1.673783302307129, - "learning_rate": 3.018248175182482e-05, - "loss": 1.136, + "epoch": 0.1434767522553782, + "grad_norm": 1.659989595413208, + "learning_rate": 1.4345186470078059e-05, + "loss": 0.8735, "step": 827 }, { - "epoch": 0.3023553039985393, - "grad_norm": 1.1299020051956177, - "learning_rate": 3.0218978102189785e-05, - "loss": 1.1132, + "epoch": 0.1436502428868841, + "grad_norm": 0.9948083758354187, + "learning_rate": 1.4362532523850826e-05, + "loss": 0.8845, "step": 828 }, { - "epoch": 0.3027204674091656, - "grad_norm": 1.2408223152160645, - "learning_rate": 3.0255474452554746e-05, - "loss": 1.1456, + "epoch": 0.14382373351839, + "grad_norm": 1.061937928199768, + "learning_rate": 1.4379878577623592e-05, + "loss": 0.7766, "step": 829 }, { - "epoch": 0.30308563081979184, - "grad_norm": 1.0921306610107422, - "learning_rate": 3.029197080291971e-05, - "loss": 1.1433, + "epoch": 0.1439972241498959, + "grad_norm": 0.8871070146560669, + "learning_rate": 1.4397224631396359e-05, + "loss": 0.8967, "step": 830 }, { - "epoch": 0.3034507942304181, - "grad_norm": 1.0858033895492554, - "learning_rate": 3.0328467153284675e-05, - "loss": 1.1226, + "epoch": 0.1441707147814018, + "grad_norm": 0.9169383645057678, + "learning_rate": 1.4414570685169124e-05, + "loss": 0.8176, "step": 831 }, { - "epoch": 0.30381595764104435, - "grad_norm": 2.2049400806427, - "learning_rate": 3.0364963503649636e-05, - "loss": 1.1067, + "epoch": 0.1443442054129077, + "grad_norm": 1.3743515014648438, + "learning_rate": 1.443191673894189e-05, + "loss": 1.0312, "step": 832 }, { - "epoch": 0.3041811210516706, - "grad_norm": 1.147986888885498, - "learning_rate": 3.04014598540146e-05, - "loss": 1.0903, + "epoch": 0.1445176960444136, + "grad_norm": 1.1366435289382935, + "learning_rate": 1.4449262792714657e-05, + "loss": 0.8506, "step": 833 }, { - "epoch": 0.30454628446229687, - "grad_norm": 1.035316824913025, - "learning_rate": 3.0437956204379565e-05, - "loss": 1.0642, + "epoch": 0.14469118667591951, + "grad_norm": 1.0575315952301025, + "learning_rate": 1.4466608846487424e-05, + "loss": 0.835, "step": 834 }, { - "epoch": 0.3049114478729231, - "grad_norm": 1.1113356351852417, - "learning_rate": 3.0474452554744527e-05, - "loss": 1.109, + "epoch": 0.1448646773074254, + "grad_norm": 0.9829549789428711, + "learning_rate": 1.448395490026019e-05, + "loss": 0.8694, "step": 835 }, { - "epoch": 0.3052766112835494, - "grad_norm": 1.3039833307266235, - "learning_rate": 3.051094890510949e-05, - "loss": 1.1193, + "epoch": 0.1450381679389313, + "grad_norm": 1.0108870267868042, + "learning_rate": 1.450130095403296e-05, + "loss": 0.7776, "step": 836 }, { - "epoch": 0.30564177469417564, - "grad_norm": 0.9444441795349121, - "learning_rate": 3.0547445255474456e-05, - "loss": 1.0703, + "epoch": 0.1452116585704372, + "grad_norm": 0.896040141582489, + "learning_rate": 1.4518647007805726e-05, + "loss": 0.9412, "step": 837 }, { - "epoch": 0.3060069381048019, - "grad_norm": 1.2063283920288086, - "learning_rate": 3.058394160583942e-05, - "loss": 1.1213, + "epoch": 0.1453851492019431, + "grad_norm": 0.8911782503128052, + "learning_rate": 1.4535993061578493e-05, + "loss": 0.8306, "step": 838 }, { - "epoch": 0.30637210151542815, - "grad_norm": 0.972802996635437, - "learning_rate": 3.0620437956204385e-05, - "loss": 1.1191, + "epoch": 0.145558639833449, + "grad_norm": 0.9529337882995605, + "learning_rate": 1.455333911535126e-05, + "loss": 0.8931, "step": 839 }, { - "epoch": 0.3067372649260544, - "grad_norm": 1.2145520448684692, - "learning_rate": 3.0656934306569346e-05, - "loss": 1.0955, + "epoch": 0.1457321304649549, + "grad_norm": 1.145410418510437, + "learning_rate": 1.4570685169124026e-05, + "loss": 0.8455, "step": 840 }, { - "epoch": 0.30710242833668067, - "grad_norm": 1.1050636768341064, - "learning_rate": 3.069343065693431e-05, - "loss": 1.1157, + "epoch": 0.1459056210964608, + "grad_norm": 2.1482558250427246, + "learning_rate": 1.4588031222896793e-05, + "loss": 0.7241, "step": 841 }, { - "epoch": 0.3074675917473069, - "grad_norm": 1.8446202278137207, - "learning_rate": 3.0729927007299275e-05, - "loss": 1.094, + "epoch": 0.1460791117279667, + "grad_norm": 0.936776876449585, + "learning_rate": 1.460537727666956e-05, + "loss": 0.8628, "step": 842 }, { - "epoch": 0.3078327551579332, - "grad_norm": 1.1013379096984863, - "learning_rate": 3.0766423357664236e-05, - "loss": 1.126, + "epoch": 0.1462526023594726, + "grad_norm": 1.1759963035583496, + "learning_rate": 1.4622723330442326e-05, + "loss": 0.9209, "step": 843 }, { - "epoch": 0.30819791856855944, - "grad_norm": 0.9729995131492615, - "learning_rate": 3.08029197080292e-05, - "loss": 1.0533, + "epoch": 0.14642609299097847, + "grad_norm": 1.3949528932571411, + "learning_rate": 1.4640069384215093e-05, + "loss": 0.8586, "step": 844 }, { - "epoch": 0.3085630819791857, - "grad_norm": 1.2000426054000854, - "learning_rate": 3.0839416058394165e-05, - "loss": 1.1017, + "epoch": 0.14659958362248438, + "grad_norm": 2.006772041320801, + "learning_rate": 1.465741543798786e-05, + "loss": 0.7668, "step": 845 }, { - "epoch": 0.30892824538981195, - "grad_norm": 1.341558814048767, - "learning_rate": 3.0875912408759127e-05, - "loss": 1.147, + "epoch": 0.14677307425399028, + "grad_norm": 1.3961585760116577, + "learning_rate": 1.4674761491760624e-05, + "loss": 0.8574, "step": 846 }, { - "epoch": 0.3092934088004382, - "grad_norm": 1.4722051620483398, - "learning_rate": 3.091240875912409e-05, - "loss": 1.1606, + "epoch": 0.14694656488549618, + "grad_norm": 0.958802342414856, + "learning_rate": 1.4692107545533391e-05, + "loss": 0.9771, "step": 847 }, { - "epoch": 0.30965857221106446, - "grad_norm": 0.8768219947814941, - "learning_rate": 3.0948905109489056e-05, - "loss": 1.0645, + "epoch": 0.14712005551700208, + "grad_norm": 0.9644052982330322, + "learning_rate": 1.4709453599306158e-05, + "loss": 0.8735, "step": 848 }, { - "epoch": 0.3100237356216907, - "grad_norm": 1.330670952796936, - "learning_rate": 3.098540145985402e-05, - "loss": 1.0929, + "epoch": 0.14729354614850798, + "grad_norm": 1.0760672092437744, + "learning_rate": 1.4726799653078925e-05, + "loss": 0.9561, "step": 849 }, { - "epoch": 0.310388899032317, - "grad_norm": 1.5302362442016602, - "learning_rate": 3.1021897810218985e-05, - "loss": 1.13, + "epoch": 0.14746703678001388, + "grad_norm": 0.9004976749420166, + "learning_rate": 1.4744145706851693e-05, + "loss": 0.8958, "step": 850 }, { - "epoch": 0.31075406244294324, - "grad_norm": 0.813676655292511, - "learning_rate": 3.105839416058394e-05, - "loss": 1.0671, + "epoch": 0.14764052741151978, + "grad_norm": 0.9137843251228333, + "learning_rate": 1.476149176062446e-05, + "loss": 0.8103, "step": 851 }, { - "epoch": 0.3111192258535695, - "grad_norm": 1.5621293783187866, - "learning_rate": 3.109489051094891e-05, - "loss": 1.1062, + "epoch": 0.14781401804302569, + "grad_norm": 1.0726293325424194, + "learning_rate": 1.4778837814397226e-05, + "loss": 0.8428, "step": 852 }, { - "epoch": 0.31148438926419575, - "grad_norm": 1.2811596393585205, - "learning_rate": 3.1131386861313875e-05, - "loss": 1.131, + "epoch": 0.1479875086745316, + "grad_norm": 1.110036849975586, + "learning_rate": 1.4796183868169993e-05, + "loss": 0.8198, "step": 853 }, { - "epoch": 0.311849552674822, - "grad_norm": 1.3733336925506592, - "learning_rate": 3.1167883211678836e-05, - "loss": 1.1124, + "epoch": 0.14816099930603746, + "grad_norm": 1.1295381784439087, + "learning_rate": 1.481352992194276e-05, + "loss": 0.8132, "step": 854 }, { - "epoch": 0.31221471608544826, - "grad_norm": 1.2660307884216309, - "learning_rate": 3.12043795620438e-05, - "loss": 1.1732, + "epoch": 0.14833448993754336, + "grad_norm": 0.7471928596496582, + "learning_rate": 1.4830875975715526e-05, + "loss": 0.9219, "step": 855 }, { - "epoch": 0.3125798794960745, - "grad_norm": 1.028300404548645, - "learning_rate": 3.1240875912408765e-05, - "loss": 1.1011, + "epoch": 0.14850798056904926, + "grad_norm": 1.0388845205307007, + "learning_rate": 1.4848222029488293e-05, + "loss": 0.7722, "step": 856 }, { - "epoch": 0.3129450429067007, - "grad_norm": 1.0808995962142944, - "learning_rate": 3.127737226277373e-05, - "loss": 1.0963, + "epoch": 0.14868147120055517, + "grad_norm": 1.1333787441253662, + "learning_rate": 1.486556808326106e-05, + "loss": 0.8179, "step": 857 }, { - "epoch": 0.313310206317327, - "grad_norm": 0.9943464398384094, - "learning_rate": 3.131386861313869e-05, - "loss": 1.0612, + "epoch": 0.14885496183206107, + "grad_norm": 0.8636186122894287, + "learning_rate": 1.4882914137033826e-05, + "loss": 1.127, "step": 858 }, { - "epoch": 0.31367536972795323, - "grad_norm": 1.51658034324646, - "learning_rate": 3.135036496350365e-05, - "loss": 1.0874, + "epoch": 0.14902845246356697, + "grad_norm": 1.1516021490097046, + "learning_rate": 1.4900260190806593e-05, + "loss": 0.7402, "step": 859 }, { - "epoch": 0.3140405331385795, - "grad_norm": 1.1708227396011353, - "learning_rate": 3.138686131386862e-05, - "loss": 1.0966, + "epoch": 0.14920194309507287, + "grad_norm": 1.6836670637130737, + "learning_rate": 1.491760624457936e-05, + "loss": 0.7533, "step": 860 }, { - "epoch": 0.31440569654920575, - "grad_norm": 1.43307363986969, - "learning_rate": 3.142335766423358e-05, - "loss": 1.1326, + "epoch": 0.14937543372657877, + "grad_norm": 0.840652346611023, + "learning_rate": 1.4934952298352125e-05, + "loss": 0.8584, "step": 861 }, { - "epoch": 0.314770859959832, - "grad_norm": 1.1620780229568481, - "learning_rate": 3.145985401459854e-05, - "loss": 1.0442, + "epoch": 0.14954892435808467, + "grad_norm": 1.053740382194519, + "learning_rate": 1.4952298352124892e-05, + "loss": 0.9241, "step": 862 }, { - "epoch": 0.31513602337045826, - "grad_norm": 1.4015519618988037, - "learning_rate": 3.149635036496351e-05, - "loss": 1.0442, + "epoch": 0.14972241498959057, + "grad_norm": 1.3386101722717285, + "learning_rate": 1.4969644405897658e-05, + "loss": 0.7328, "step": 863 }, { - "epoch": 0.3155011867810845, - "grad_norm": 0.9535883069038391, - "learning_rate": 3.153284671532847e-05, - "loss": 1.1104, + "epoch": 0.14989590562109645, + "grad_norm": 0.7270936965942383, + "learning_rate": 1.4986990459670425e-05, + "loss": 0.9548, "step": 864 }, { - "epoch": 0.3158663501917108, - "grad_norm": 1.3170323371887207, - "learning_rate": 3.156934306569343e-05, - "loss": 1.0875, + "epoch": 0.15006939625260235, + "grad_norm": 1.3399126529693604, + "learning_rate": 1.5004336513443193e-05, + "loss": 0.8093, "step": 865 }, { - "epoch": 0.31623151360233703, - "grad_norm": 1.1585885286331177, - "learning_rate": 3.16058394160584e-05, - "loss": 1.0899, + "epoch": 0.15024288688410825, + "grad_norm": 0.8920833468437195, + "learning_rate": 1.502168256721596e-05, + "loss": 1.0615, "step": 866 }, { - "epoch": 0.3165966770129633, - "grad_norm": 1.2413902282714844, - "learning_rate": 3.1642335766423365e-05, - "loss": 1.1042, + "epoch": 0.15041637751561415, + "grad_norm": 0.797852635383606, + "learning_rate": 1.5039028620988727e-05, + "loss": 0.9504, "step": 867 }, { - "epoch": 0.31696184042358955, - "grad_norm": 1.089520812034607, - "learning_rate": 3.167883211678832e-05, - "loss": 1.0466, + "epoch": 0.15058986814712005, + "grad_norm": 0.9916612505912781, + "learning_rate": 1.5056374674761493e-05, + "loss": 0.9038, "step": 868 }, { - "epoch": 0.3173270038342158, - "grad_norm": 1.207255244255066, - "learning_rate": 3.171532846715329e-05, - "loss": 1.1228, + "epoch": 0.15076335877862596, + "grad_norm": 0.7986864447593689, + "learning_rate": 1.507372072853426e-05, + "loss": 0.8484, "step": 869 }, { - "epoch": 0.31769216724484206, - "grad_norm": 1.3315054178237915, - "learning_rate": 3.175182481751825e-05, - "loss": 1.0804, + "epoch": 0.15093684941013186, + "grad_norm": 1.8888822793960571, + "learning_rate": 1.5091066782307027e-05, + "loss": 0.8567, "step": 870 }, { - "epoch": 0.3180573306554683, - "grad_norm": 1.1967310905456543, - "learning_rate": 3.178832116788322e-05, - "loss": 1.0903, + "epoch": 0.15111034004163776, + "grad_norm": 1.01832115650177, + "learning_rate": 1.5108412836079793e-05, + "loss": 0.9412, "step": 871 }, { - "epoch": 0.3184224940660946, - "grad_norm": 1.4883594512939453, - "learning_rate": 3.182481751824818e-05, - "loss": 1.1499, + "epoch": 0.15128383067314366, + "grad_norm": 1.2546641826629639, + "learning_rate": 1.512575888985256e-05, + "loss": 0.9275, "step": 872 }, { - "epoch": 0.31878765747672083, - "grad_norm": 1.441544532775879, - "learning_rate": 3.186131386861314e-05, - "loss": 1.1167, + "epoch": 0.15145732130464956, + "grad_norm": 0.9539880752563477, + "learning_rate": 1.5143104943625327e-05, + "loss": 0.8328, "step": 873 }, { - "epoch": 0.3191528208873471, - "grad_norm": 1.4127689599990845, - "learning_rate": 3.189781021897811e-05, - "loss": 1.1104, + "epoch": 0.15163081193615544, + "grad_norm": 2.7705061435699463, + "learning_rate": 1.5160450997398094e-05, + "loss": 0.9568, "step": 874 }, { - "epoch": 0.31951798429797335, - "grad_norm": 0.8492609262466431, - "learning_rate": 3.193430656934307e-05, - "loss": 1.0566, + "epoch": 0.15180430256766134, + "grad_norm": 0.8761847019195557, + "learning_rate": 1.517779705117086e-05, + "loss": 0.9026, "step": 875 }, { - "epoch": 0.3198831477085996, - "grad_norm": 0.8268579840660095, - "learning_rate": 3.197080291970803e-05, - "loss": 1.105, + "epoch": 0.15197779319916724, + "grad_norm": 0.9445111751556396, + "learning_rate": 1.5195143104943625e-05, + "loss": 0.79, "step": 876 }, { - "epoch": 0.32024831111922586, - "grad_norm": 1.188431978225708, - "learning_rate": 3.2007299270073e-05, - "loss": 1.0787, + "epoch": 0.15215128383067314, + "grad_norm": 1.230183720588684, + "learning_rate": 1.5212489158716392e-05, + "loss": 0.9478, "step": 877 }, { - "epoch": 0.3206134745298521, - "grad_norm": 1.6380081176757812, - "learning_rate": 3.204379562043796e-05, - "loss": 1.0731, + "epoch": 0.15232477446217904, + "grad_norm": 1.3573399782180786, + "learning_rate": 1.5229835212489159e-05, + "loss": 0.9268, "step": 878 }, { - "epoch": 0.3209786379404784, - "grad_norm": 1.2689604759216309, - "learning_rate": 3.208029197080292e-05, - "loss": 1.1107, + "epoch": 0.15249826509368494, + "grad_norm": 1.20628023147583, + "learning_rate": 1.5247181266261925e-05, + "loss": 0.8877, "step": 879 }, { - "epoch": 0.32134380135110463, - "grad_norm": 1.0818451642990112, - "learning_rate": 3.211678832116789e-05, - "loss": 1.0634, + "epoch": 0.15267175572519084, + "grad_norm": 0.9635702967643738, + "learning_rate": 1.5264527320034695e-05, + "loss": 1.0198, "step": 880 }, { - "epoch": 0.3217089647617309, - "grad_norm": 1.9989469051361084, - "learning_rate": 3.215328467153285e-05, - "loss": 1.113, + "epoch": 0.15284524635669675, + "grad_norm": 0.8995382189750671, + "learning_rate": 1.528187337380746e-05, + "loss": 0.9407, "step": 881 }, { - "epoch": 0.32207412817235714, - "grad_norm": 1.181347131729126, - "learning_rate": 3.218978102189781e-05, - "loss": 1.1083, + "epoch": 0.15301873698820265, + "grad_norm": 1.101050853729248, + "learning_rate": 1.529921942758023e-05, + "loss": 0.9673, "step": 882 }, { - "epoch": 0.3224392915829834, - "grad_norm": 1.0107276439666748, - "learning_rate": 3.222627737226278e-05, - "loss": 1.1029, + "epoch": 0.15319222761970855, + "grad_norm": 0.963238000869751, + "learning_rate": 1.5316565481352994e-05, + "loss": 0.8704, "step": 883 }, { - "epoch": 0.32280445499360966, - "grad_norm": 1.0267194509506226, - "learning_rate": 3.226277372262774e-05, - "loss": 1.1045, + "epoch": 0.15336571825121442, + "grad_norm": 0.8461984395980835, + "learning_rate": 1.5333911535125762e-05, + "loss": 0.9062, "step": 884 }, { - "epoch": 0.3231696184042359, - "grad_norm": 1.5342499017715454, - "learning_rate": 3.22992700729927e-05, - "loss": 1.0825, + "epoch": 0.15353920888272032, + "grad_norm": 0.8093241453170776, + "learning_rate": 1.5351257588898527e-05, + "loss": 0.877, "step": 885 }, { - "epoch": 0.32353478181486217, - "grad_norm": 1.149175763130188, - "learning_rate": 3.233576642335767e-05, - "loss": 1.1195, + "epoch": 0.15371269951422623, + "grad_norm": 1.5282747745513916, + "learning_rate": 1.5368603642671292e-05, + "loss": 0.9758, "step": 886 }, { - "epoch": 0.32389994522548843, - "grad_norm": 1.0132466554641724, - "learning_rate": 3.237226277372263e-05, - "loss": 1.085, + "epoch": 0.15388619014573213, + "grad_norm": 1.0526878833770752, + "learning_rate": 1.538594969644406e-05, + "loss": 0.8005, "step": 887 }, { - "epoch": 0.3242651086361147, - "grad_norm": 1.3041964769363403, - "learning_rate": 3.24087591240876e-05, - "loss": 1.0812, + "epoch": 0.15405968077723803, + "grad_norm": 0.9032689929008484, + "learning_rate": 1.5403295750216826e-05, + "loss": 0.9126, "step": 888 }, { - "epoch": 0.32463027204674094, - "grad_norm": 1.5755618810653687, - "learning_rate": 3.244525547445256e-05, - "loss": 1.0652, + "epoch": 0.15423317140874393, + "grad_norm": 1.6991785764694214, + "learning_rate": 1.5420641803989594e-05, + "loss": 0.7698, "step": 889 }, { - "epoch": 0.3249954354573672, - "grad_norm": 0.9804404973983765, - "learning_rate": 3.248175182481752e-05, - "loss": 1.0858, + "epoch": 0.15440666204024983, + "grad_norm": 1.03450608253479, + "learning_rate": 1.543798785776236e-05, + "loss": 0.9072, "step": 890 }, { - "epoch": 0.3253605988679934, - "grad_norm": 1.1439754962921143, - "learning_rate": 3.251824817518249e-05, - "loss": 1.0721, + "epoch": 0.15458015267175573, + "grad_norm": 0.9682037830352783, + "learning_rate": 1.5455333911535127e-05, + "loss": 1.0103, "step": 891 }, { - "epoch": 0.32572576227861966, - "grad_norm": 1.1914987564086914, - "learning_rate": 3.255474452554745e-05, - "loss": 1.1039, + "epoch": 0.15475364330326163, + "grad_norm": 1.123001217842102, + "learning_rate": 1.5472679965307892e-05, + "loss": 0.7522, "step": 892 }, { - "epoch": 0.3260909256892459, - "grad_norm": 1.3361140489578247, - "learning_rate": 3.259124087591241e-05, - "loss": 1.1296, + "epoch": 0.15492713393476754, + "grad_norm": 0.7928397059440613, + "learning_rate": 1.549002601908066e-05, + "loss": 0.991, "step": 893 }, { - "epoch": 0.32645608909987217, - "grad_norm": 1.1943520307540894, - "learning_rate": 3.262773722627738e-05, - "loss": 1.0447, + "epoch": 0.1551006245662734, + "grad_norm": 1.1405808925628662, + "learning_rate": 1.5507372072853426e-05, + "loss": 0.8503, "step": 894 }, { - "epoch": 0.32682125251049843, - "grad_norm": 0.8193743228912354, - "learning_rate": 3.266423357664234e-05, - "loss": 1.1083, + "epoch": 0.1552741151977793, + "grad_norm": 1.0719610452651978, + "learning_rate": 1.5524718126626194e-05, + "loss": 0.9194, "step": 895 }, { - "epoch": 0.3271864159211247, - "grad_norm": 1.2428557872772217, - "learning_rate": 3.27007299270073e-05, - "loss": 1.0643, + "epoch": 0.1554476058292852, + "grad_norm": 0.7587058544158936, + "learning_rate": 1.5542064180398963e-05, + "loss": 0.9963, "step": 896 }, { - "epoch": 0.32755157933175094, - "grad_norm": 0.9138479232788086, - "learning_rate": 3.273722627737227e-05, - "loss": 1.1008, + "epoch": 0.15562109646079111, + "grad_norm": 0.9101662635803223, + "learning_rate": 1.5559410234171728e-05, + "loss": 0.8982, "step": 897 }, { - "epoch": 0.3279167427423772, - "grad_norm": 1.0029480457305908, - "learning_rate": 3.277372262773723e-05, - "loss": 1.0842, + "epoch": 0.15579458709229702, + "grad_norm": 0.9171735048294067, + "learning_rate": 1.5576756287944496e-05, + "loss": 0.9529, "step": 898 }, { - "epoch": 0.32828190615300346, - "grad_norm": 1.1993231773376465, - "learning_rate": 3.281021897810219e-05, - "loss": 1.1019, + "epoch": 0.15596807772380292, + "grad_norm": 1.4089183807373047, + "learning_rate": 1.559410234171726e-05, + "loss": 0.8884, "step": 899 }, { - "epoch": 0.3286470695636297, - "grad_norm": 1.215217113494873, - "learning_rate": 3.284671532846716e-05, - "loss": 1.0884, + "epoch": 0.15614156835530882, + "grad_norm": 1.0912784337997437, + "learning_rate": 1.5611448395490026e-05, + "loss": 0.9727, "step": 900 }, { - "epoch": 0.32901223297425597, - "grad_norm": 1.1636677980422974, - "learning_rate": 3.288321167883212e-05, - "loss": 1.0895, + "epoch": 0.15631505898681472, + "grad_norm": 0.9150531888008118, + "learning_rate": 1.5628794449262794e-05, + "loss": 0.8252, "step": 901 }, { - "epoch": 0.3293773963848822, - "grad_norm": 1.1540857553482056, - "learning_rate": 3.291970802919708e-05, - "loss": 1.1056, + "epoch": 0.15648854961832062, + "grad_norm": 0.9000369906425476, + "learning_rate": 1.564614050303556e-05, + "loss": 0.8833, "step": 902 }, { - "epoch": 0.3297425597955085, - "grad_norm": 1.8437178134918213, - "learning_rate": 3.295620437956204e-05, - "loss": 1.1619, + "epoch": 0.15666204024982652, + "grad_norm": 0.7976706027984619, + "learning_rate": 1.5663486556808328e-05, + "loss": 0.8367, "step": 903 }, { - "epoch": 0.33010772320613474, - "grad_norm": 1.1463125944137573, - "learning_rate": 3.299270072992701e-05, - "loss": 1.0883, + "epoch": 0.1568355308813324, + "grad_norm": 1.0485780239105225, + "learning_rate": 1.5680832610581093e-05, + "loss": 0.8403, "step": 904 }, { - "epoch": 0.330472886616761, - "grad_norm": 1.0244412422180176, - "learning_rate": 3.302919708029197e-05, - "loss": 1.106, + "epoch": 0.1570090215128383, + "grad_norm": 0.7505069971084595, + "learning_rate": 1.569817866435386e-05, + "loss": 0.9907, "step": 905 }, { - "epoch": 0.33083805002738725, - "grad_norm": 0.9281713962554932, - "learning_rate": 3.306569343065693e-05, - "loss": 1.0918, + "epoch": 0.1571825121443442, + "grad_norm": 0.8456043004989624, + "learning_rate": 1.5715524718126626e-05, + "loss": 0.8877, "step": 906 }, { - "epoch": 0.3312032134380135, - "grad_norm": 1.3270792961120605, - "learning_rate": 3.31021897810219e-05, - "loss": 1.1006, + "epoch": 0.1573560027758501, + "grad_norm": 0.7804064154624939, + "learning_rate": 1.5732870771899395e-05, + "loss": 1.0791, "step": 907 }, { - "epoch": 0.33156837684863977, - "grad_norm": 1.0984102487564087, - "learning_rate": 3.313868613138687e-05, - "loss": 1.0973, + "epoch": 0.157529493407356, + "grad_norm": 0.9269134998321533, + "learning_rate": 1.575021682567216e-05, + "loss": 0.9014, "step": 908 }, { - "epoch": 0.331933540259266, - "grad_norm": 0.9106290340423584, - "learning_rate": 3.317518248175183e-05, - "loss": 1.0856, + "epoch": 0.1577029840388619, + "grad_norm": 0.8350370526313782, + "learning_rate": 1.5767562879444928e-05, + "loss": 0.8423, "step": 909 }, { - "epoch": 0.3322987036698923, - "grad_norm": 1.0817501544952393, - "learning_rate": 3.321167883211679e-05, - "loss": 1.0662, + "epoch": 0.1578764746703678, + "grad_norm": 1.9963208436965942, + "learning_rate": 1.5784908933217696e-05, + "loss": 0.8882, "step": 910 }, { - "epoch": 0.33266386708051854, - "grad_norm": 0.8276938199996948, - "learning_rate": 3.324817518248176e-05, - "loss": 1.0593, + "epoch": 0.1580499653018737, + "grad_norm": 1.6762498617172241, + "learning_rate": 1.580225498699046e-05, + "loss": 0.9204, "step": 911 }, { - "epoch": 0.3330290304911448, - "grad_norm": 1.1110670566558838, - "learning_rate": 3.328467153284672e-05, - "loss": 1.0695, + "epoch": 0.1582234559333796, + "grad_norm": 1.0437973737716675, + "learning_rate": 1.581960104076323e-05, + "loss": 0.915, "step": 912 }, { - "epoch": 0.33339419390177105, - "grad_norm": 1.286852478981018, - "learning_rate": 3.332116788321168e-05, - "loss": 1.0709, + "epoch": 0.15839694656488548, + "grad_norm": 1.0473750829696655, + "learning_rate": 1.5836947094535995e-05, + "loss": 0.8982, "step": 913 }, { - "epoch": 0.3337593573123973, - "grad_norm": 1.287414312362671, - "learning_rate": 3.335766423357664e-05, - "loss": 1.0996, + "epoch": 0.15857043719639138, + "grad_norm": 0.8714802265167236, + "learning_rate": 1.5854293148308763e-05, + "loss": 0.8691, "step": 914 }, { - "epoch": 0.33412452072302357, - "grad_norm": 1.3532828092575073, - "learning_rate": 3.339416058394161e-05, - "loss": 1.1035, + "epoch": 0.15874392782789729, + "grad_norm": 1.8291577100753784, + "learning_rate": 1.5871639202081528e-05, + "loss": 0.8369, "step": 915 }, { - "epoch": 0.3344896841336498, - "grad_norm": 1.1716351509094238, - "learning_rate": 3.343065693430657e-05, - "loss": 1.0973, + "epoch": 0.1589174184594032, + "grad_norm": 1.3107647895812988, + "learning_rate": 1.5888985255854293e-05, + "loss": 0.8647, "step": 916 }, { - "epoch": 0.3348548475442761, - "grad_norm": 0.9757788777351379, - "learning_rate": 3.346715328467153e-05, - "loss": 1.0677, + "epoch": 0.1590909090909091, + "grad_norm": 0.7822266221046448, + "learning_rate": 1.590633130962706e-05, + "loss": 0.998, "step": 917 }, { - "epoch": 0.33522001095490234, - "grad_norm": 1.1536986827850342, - "learning_rate": 3.35036496350365e-05, - "loss": 1.1155, + "epoch": 0.159264399722415, + "grad_norm": 0.8174435496330261, + "learning_rate": 1.5923677363399826e-05, + "loss": 0.8809, "step": 918 }, { - "epoch": 0.3355851743655286, - "grad_norm": 1.2348177433013916, - "learning_rate": 3.354014598540146e-05, - "loss": 1.1246, + "epoch": 0.1594378903539209, + "grad_norm": 1.0724822282791138, + "learning_rate": 1.5941023417172595e-05, + "loss": 0.8516, "step": 919 }, { - "epoch": 0.33595033777615485, - "grad_norm": 1.1202832460403442, - "learning_rate": 3.357664233576642e-05, - "loss": 1.0681, + "epoch": 0.1596113809854268, + "grad_norm": 1.0761146545410156, + "learning_rate": 1.595836947094536e-05, + "loss": 0.7898, "step": 920 }, { - "epoch": 0.3363155011867811, - "grad_norm": 1.480118751525879, - "learning_rate": 3.361313868613139e-05, - "loss": 1.1365, + "epoch": 0.1597848716169327, + "grad_norm": 0.8688669204711914, + "learning_rate": 1.5975715524718128e-05, + "loss": 1.0139, "step": 921 }, { - "epoch": 0.33668066459740736, - "grad_norm": 1.3212007284164429, - "learning_rate": 3.364963503649635e-05, - "loss": 1.1544, + "epoch": 0.1599583622484386, + "grad_norm": 0.9635099172592163, + "learning_rate": 1.5993061578490893e-05, + "loss": 0.8469, "step": 922 }, { - "epoch": 0.3370458280080336, - "grad_norm": 1.1504400968551636, - "learning_rate": 3.368613138686131e-05, - "loss": 1.0743, + "epoch": 0.16013185287994447, + "grad_norm": 1.3248603343963623, + "learning_rate": 1.601040763226366e-05, + "loss": 0.8203, "step": 923 }, { - "epoch": 0.3374109914186599, - "grad_norm": 0.9850884675979614, - "learning_rate": 3.372262773722628e-05, - "loss": 1.1161, + "epoch": 0.16030534351145037, + "grad_norm": 1.0994491577148438, + "learning_rate": 1.602775368603643e-05, + "loss": 0.8186, "step": 924 }, { - "epoch": 0.3377761548292861, - "grad_norm": 1.1027743816375732, - "learning_rate": 3.375912408759124e-05, - "loss": 1.0555, + "epoch": 0.16047883414295627, + "grad_norm": 1.368530511856079, + "learning_rate": 1.6045099739809195e-05, + "loss": 0.7615, "step": 925 }, { - "epoch": 0.33814131823991234, - "grad_norm": 1.1353791952133179, - "learning_rate": 3.3795620437956204e-05, - "loss": 1.0693, + "epoch": 0.16065232477446217, + "grad_norm": 0.911961019039154, + "learning_rate": 1.6062445793581963e-05, + "loss": 1.0217, "step": 926 }, { - "epoch": 0.3385064816505386, - "grad_norm": 0.9665250182151794, - "learning_rate": 3.383211678832117e-05, - "loss": 1.0917, + "epoch": 0.16082581540596808, + "grad_norm": 0.8017191290855408, + "learning_rate": 1.607979184735473e-05, + "loss": 0.9045, "step": 927 }, { - "epoch": 0.33887164506116485, - "grad_norm": 1.475665807723999, - "learning_rate": 3.386861313868613e-05, - "loss": 1.1353, + "epoch": 0.16099930603747398, + "grad_norm": 1.115915060043335, + "learning_rate": 1.6097137901127497e-05, + "loss": 0.8059, "step": 928 }, { - "epoch": 0.3392368084717911, - "grad_norm": 1.1586151123046875, - "learning_rate": 3.39051094890511e-05, - "loss": 1.0643, + "epoch": 0.16117279666897988, + "grad_norm": 0.8216562867164612, + "learning_rate": 1.6114483954900262e-05, + "loss": 0.9978, "step": 929 }, { - "epoch": 0.33960197188241736, - "grad_norm": 1.1307848691940308, - "learning_rate": 3.394160583941606e-05, - "loss": 1.1453, + "epoch": 0.16134628730048578, + "grad_norm": 1.0194272994995117, + "learning_rate": 1.6131830008673027e-05, + "loss": 0.897, "step": 930 }, { - "epoch": 0.3399671352930436, - "grad_norm": 1.4533435106277466, - "learning_rate": 3.397810218978102e-05, - "loss": 1.1121, + "epoch": 0.16151977793199168, + "grad_norm": 1.076393961906433, + "learning_rate": 1.6149176062445795e-05, + "loss": 0.7856, "step": 931 }, { - "epoch": 0.3403322987036699, - "grad_norm": 1.5788315534591675, - "learning_rate": 3.401459854014599e-05, - "loss": 1.1105, + "epoch": 0.16169326856349758, + "grad_norm": 0.8271511197090149, + "learning_rate": 1.616652211621856e-05, + "loss": 0.8022, "step": 932 }, { - "epoch": 0.34069746211429613, - "grad_norm": 1.494590401649475, - "learning_rate": 3.405109489051095e-05, - "loss": 1.1205, + "epoch": 0.16186675919500346, + "grad_norm": 0.8089983463287354, + "learning_rate": 1.618386816999133e-05, + "loss": 1.002, "step": 933 }, { - "epoch": 0.3410626255249224, - "grad_norm": 1.1085124015808105, - "learning_rate": 3.408759124087591e-05, - "loss": 1.0798, + "epoch": 0.16204024982650936, + "grad_norm": 1.1716415882110596, + "learning_rate": 1.6201214223764094e-05, + "loss": 0.8474, "step": 934 }, { - "epoch": 0.34142778893554865, - "grad_norm": 1.3422898054122925, - "learning_rate": 3.412408759124088e-05, - "loss": 1.0797, + "epoch": 0.16221374045801526, + "grad_norm": 0.8982862830162048, + "learning_rate": 1.6218560277536862e-05, + "loss": 0.9207, "step": 935 }, { - "epoch": 0.3417929523461749, - "grad_norm": 1.1060492992401123, - "learning_rate": 3.416058394160584e-05, - "loss": 1.0841, + "epoch": 0.16238723108952116, + "grad_norm": 0.9712823033332825, + "learning_rate": 1.6235906331309627e-05, + "loss": 0.8582, "step": 936 }, { - "epoch": 0.34215811575680116, - "grad_norm": 1.3374489545822144, - "learning_rate": 3.4197080291970804e-05, - "loss": 1.0876, + "epoch": 0.16256072172102706, + "grad_norm": 1.5961570739746094, + "learning_rate": 1.6253252385082395e-05, + "loss": 0.9385, "step": 937 }, { - "epoch": 0.3425232791674274, - "grad_norm": 0.9092893600463867, - "learning_rate": 3.423357664233577e-05, - "loss": 1.1013, + "epoch": 0.16273421235253296, + "grad_norm": 0.9695717692375183, + "learning_rate": 1.627059843885516e-05, + "loss": 0.9387, "step": 938 }, { - "epoch": 0.3428884425780537, - "grad_norm": 1.258057951927185, - "learning_rate": 3.427007299270073e-05, - "loss": 1.0892, + "epoch": 0.16290770298403887, + "grad_norm": 0.8938982486724854, + "learning_rate": 1.628794449262793e-05, + "loss": 0.8997, "step": 939 }, { - "epoch": 0.34325360598867993, - "grad_norm": 1.227363109588623, - "learning_rate": 3.4306569343065694e-05, - "loss": 1.1201, + "epoch": 0.16308119361554477, + "grad_norm": 0.8795838952064514, + "learning_rate": 1.6305290546400697e-05, + "loss": 0.8284, "step": 940 }, { - "epoch": 0.3436187693993062, - "grad_norm": 1.099668264389038, - "learning_rate": 3.434306569343066e-05, - "loss": 1.0538, + "epoch": 0.16325468424705067, + "grad_norm": 1.056602954864502, + "learning_rate": 1.6322636600173462e-05, + "loss": 0.9253, "step": 941 }, { - "epoch": 0.34398393280993245, - "grad_norm": 1.3873218297958374, - "learning_rate": 3.437956204379562e-05, - "loss": 1.1287, + "epoch": 0.16342817487855657, + "grad_norm": 1.1011998653411865, + "learning_rate": 1.633998265394623e-05, + "loss": 0.8574, "step": 942 }, { - "epoch": 0.3443490962205587, - "grad_norm": 1.2251898050308228, - "learning_rate": 3.4416058394160584e-05, - "loss": 1.1508, + "epoch": 0.16360166551006244, + "grad_norm": 0.7836947441101074, + "learning_rate": 1.6357328707718996e-05, + "loss": 0.8787, "step": 943 }, { - "epoch": 0.34471425963118496, - "grad_norm": 1.3095734119415283, - "learning_rate": 3.4452554744525545e-05, - "loss": 1.0917, + "epoch": 0.16377515614156835, + "grad_norm": 0.7722834944725037, + "learning_rate": 1.6374674761491764e-05, + "loss": 0.9319, "step": 944 }, { - "epoch": 0.3450794230418112, - "grad_norm": 1.0240797996520996, - "learning_rate": 3.448905109489051e-05, - "loss": 1.1096, + "epoch": 0.16394864677307425, + "grad_norm": 1.1815451383590698, + "learning_rate": 1.639202081526453e-05, + "loss": 0.8049, "step": 945 }, { - "epoch": 0.3454445864524375, - "grad_norm": 1.0779938697814941, - "learning_rate": 3.452554744525548e-05, - "loss": 1.0968, + "epoch": 0.16412213740458015, + "grad_norm": 0.850128710269928, + "learning_rate": 1.6409366869037294e-05, + "loss": 1.0037, "step": 946 }, { - "epoch": 0.34580974986306373, - "grad_norm": 1.477289080619812, - "learning_rate": 3.4562043795620436e-05, - "loss": 1.0942, + "epoch": 0.16429562803608605, + "grad_norm": 0.9460355043411255, + "learning_rate": 1.6426712922810062e-05, + "loss": 0.8276, "step": 947 }, { - "epoch": 0.34617491327369, - "grad_norm": 1.3629343509674072, - "learning_rate": 3.4598540145985404e-05, - "loss": 1.0913, + "epoch": 0.16446911866759195, + "grad_norm": 1.0745185613632202, + "learning_rate": 1.6444058976582827e-05, + "loss": 0.8113, "step": 948 }, { - "epoch": 0.34654007668431625, - "grad_norm": 1.5416985750198364, - "learning_rate": 3.463503649635037e-05, - "loss": 1.1168, + "epoch": 0.16464260929909785, + "grad_norm": 0.7476915717124939, + "learning_rate": 1.6461405030355596e-05, + "loss": 0.8931, "step": 949 }, { - "epoch": 0.3469052400949425, - "grad_norm": 1.061420202255249, - "learning_rate": 3.467153284671533e-05, - "loss": 1.0837, + "epoch": 0.16481609993060375, + "grad_norm": 0.7818108201026917, + "learning_rate": 1.647875108412836e-05, + "loss": 0.8784, "step": 950 }, { - "epoch": 0.34727040350556876, - "grad_norm": 0.8137184977531433, - "learning_rate": 3.4708029197080294e-05, - "loss": 1.1031, + "epoch": 0.16498959056210966, + "grad_norm": 1.043671727180481, + "learning_rate": 1.649609713790113e-05, + "loss": 0.8691, "step": 951 }, { - "epoch": 0.347635566916195, - "grad_norm": 1.7453768253326416, - "learning_rate": 3.474452554744526e-05, - "loss": 1.0684, + "epoch": 0.16516308119361556, + "grad_norm": 0.8186126351356506, + "learning_rate": 1.6513443191673894e-05, + "loss": 0.8706, "step": 952 }, { - "epoch": 0.3480007303268213, - "grad_norm": 1.1327828168869019, - "learning_rate": 3.478102189781022e-05, - "loss": 1.1066, + "epoch": 0.16533657182512143, + "grad_norm": 0.9577771425247192, + "learning_rate": 1.6530789245446663e-05, + "loss": 1.0537, "step": 953 }, { - "epoch": 0.34836589373744753, - "grad_norm": 0.9644864201545715, - "learning_rate": 3.4817518248175184e-05, - "loss": 1.0541, + "epoch": 0.16551006245662733, + "grad_norm": 0.867947518825531, + "learning_rate": 1.654813529921943e-05, + "loss": 0.8789, "step": 954 }, { - "epoch": 0.3487310571480738, - "grad_norm": 1.0864717960357666, - "learning_rate": 3.4854014598540145e-05, - "loss": 1.0758, + "epoch": 0.16568355308813323, + "grad_norm": 1.7598025798797607, + "learning_rate": 1.6565481352992196e-05, + "loss": 1.0059, "step": 955 }, { - "epoch": 0.34909622055870004, - "grad_norm": 1.387127161026001, - "learning_rate": 3.4890510948905113e-05, - "loss": 1.0474, + "epoch": 0.16585704371963914, + "grad_norm": 1.011852502822876, + "learning_rate": 1.6582827406764964e-05, + "loss": 0.8694, "step": 956 }, { - "epoch": 0.3494613839693263, - "grad_norm": 1.213873267173767, - "learning_rate": 3.4927007299270075e-05, - "loss": 1.074, + "epoch": 0.16603053435114504, + "grad_norm": 0.9569384455680847, + "learning_rate": 1.660017346053773e-05, + "loss": 0.8833, "step": 957 }, { - "epoch": 0.3498265473799525, - "grad_norm": 1.5771349668502808, - "learning_rate": 3.4963503649635036e-05, - "loss": 1.1284, + "epoch": 0.16620402498265094, + "grad_norm": 1.3738152980804443, + "learning_rate": 1.6617519514310498e-05, + "loss": 0.8223, "step": 958 }, { - "epoch": 0.35019171079057876, - "grad_norm": 1.3702975511550903, - "learning_rate": 3.5000000000000004e-05, - "loss": 1.1487, + "epoch": 0.16637751561415684, + "grad_norm": 0.8025304675102234, + "learning_rate": 1.6634865568083263e-05, + "loss": 0.8984, "step": 959 }, { - "epoch": 0.350556874201205, - "grad_norm": 1.1497656106948853, - "learning_rate": 3.5036496350364965e-05, - "loss": 1.0901, + "epoch": 0.16655100624566274, + "grad_norm": 0.8641000986099243, + "learning_rate": 1.6652211621856028e-05, + "loss": 0.8616, "step": 960 }, { - "epoch": 0.3509220376118313, - "grad_norm": 1.7558009624481201, - "learning_rate": 3.5072992700729926e-05, - "loss": 1.0985, + "epoch": 0.16672449687716864, + "grad_norm": 2.0065486431121826, + "learning_rate": 1.6669557675628796e-05, + "loss": 0.8064, "step": 961 }, { - "epoch": 0.35128720102245753, - "grad_norm": 0.874011218547821, - "learning_rate": 3.5109489051094894e-05, - "loss": 1.0569, + "epoch": 0.16689798750867454, + "grad_norm": 0.8103736042976379, + "learning_rate": 1.668690372940156e-05, + "loss": 1.0, "step": 962 }, { - "epoch": 0.3516523644330838, - "grad_norm": 1.2936913967132568, - "learning_rate": 3.514598540145986e-05, - "loss": 1.1302, + "epoch": 0.16707147814018042, + "grad_norm": 0.7029256820678711, + "learning_rate": 1.670424978317433e-05, + "loss": 0.8872, "step": 963 }, { - "epoch": 0.35201752784371004, - "grad_norm": 1.194183588027954, - "learning_rate": 3.5182481751824816e-05, - "loss": 1.1122, + "epoch": 0.16724496877168632, + "grad_norm": 1.0545709133148193, + "learning_rate": 1.6721595836947094e-05, + "loss": 0.8047, "step": 964 }, { - "epoch": 0.3523826912543363, - "grad_norm": 1.1380178928375244, - "learning_rate": 3.5218978102189784e-05, - "loss": 1.0686, + "epoch": 0.16741845940319222, + "grad_norm": 0.8875688314437866, + "learning_rate": 1.6738941890719863e-05, + "loss": 0.8669, "step": 965 }, { - "epoch": 0.35274785466496256, - "grad_norm": 1.2229827642440796, - "learning_rate": 3.5255474452554745e-05, - "loss": 1.1349, + "epoch": 0.16759195003469812, + "grad_norm": 1.2358845472335815, + "learning_rate": 1.6756287944492628e-05, + "loss": 0.9082, "step": 966 }, { - "epoch": 0.3531130180755888, - "grad_norm": 1.2150096893310547, - "learning_rate": 3.5291970802919713e-05, - "loss": 1.088, + "epoch": 0.16776544066620402, + "grad_norm": 0.8347636461257935, + "learning_rate": 1.6773633998265396e-05, + "loss": 0.8933, "step": 967 }, { - "epoch": 0.35347818148621507, - "grad_norm": 1.0461947917938232, - "learning_rate": 3.5328467153284675e-05, - "loss": 1.0895, + "epoch": 0.16793893129770993, + "grad_norm": 0.9026004076004028, + "learning_rate": 1.679098005203816e-05, + "loss": 0.9114, "step": 968 }, { - "epoch": 0.3538433448968413, - "grad_norm": 1.8620778322219849, - "learning_rate": 3.5364963503649636e-05, - "loss": 1.1083, + "epoch": 0.16811242192921583, + "grad_norm": 1.115957260131836, + "learning_rate": 1.680832610581093e-05, + "loss": 0.7441, "step": 969 }, { - "epoch": 0.3542085083074676, - "grad_norm": 1.107633352279663, - "learning_rate": 3.5401459854014604e-05, - "loss": 1.0719, + "epoch": 0.16828591256072173, + "grad_norm": 1.157446265220642, + "learning_rate": 1.6825672159583698e-05, + "loss": 0.8521, "step": 970 }, { - "epoch": 0.35457367171809384, - "grad_norm": 1.385096788406372, - "learning_rate": 3.5437956204379565e-05, - "loss": 1.1028, + "epoch": 0.16845940319222763, + "grad_norm": 0.8267524242401123, + "learning_rate": 1.6843018213356463e-05, + "loss": 0.824, "step": 971 }, { - "epoch": 0.3549388351287201, - "grad_norm": 1.1560183763504028, - "learning_rate": 3.5474452554744526e-05, - "loss": 1.0917, + "epoch": 0.16863289382373353, + "grad_norm": 0.7227283716201782, + "learning_rate": 1.686036426712923e-05, + "loss": 1.0637, "step": 972 }, { - "epoch": 0.35530399853934636, - "grad_norm": 1.2626185417175293, - "learning_rate": 3.5510948905109494e-05, - "loss": 1.1429, + "epoch": 0.1688063844552394, + "grad_norm": 0.7015002369880676, + "learning_rate": 1.6877710320901996e-05, + "loss": 1.0205, "step": 973 }, { - "epoch": 0.3556691619499726, - "grad_norm": 1.617483377456665, - "learning_rate": 3.5547445255474455e-05, - "loss": 1.0962, + "epoch": 0.1689798750867453, + "grad_norm": 0.9543939232826233, + "learning_rate": 1.689505637467476e-05, + "loss": 0.9719, "step": 974 }, { - "epoch": 0.35603432536059887, - "grad_norm": 1.2361632585525513, - "learning_rate": 3.5583941605839416e-05, - "loss": 1.0508, + "epoch": 0.1691533657182512, + "grad_norm": 0.9609969854354858, + "learning_rate": 1.691240242844753e-05, + "loss": 0.8318, "step": 975 }, { - "epoch": 0.3563994887712251, - "grad_norm": 1.2084460258483887, - "learning_rate": 3.5620437956204384e-05, - "loss": 1.0829, + "epoch": 0.1693268563497571, + "grad_norm": 0.8504872918128967, + "learning_rate": 1.6929748482220295e-05, + "loss": 0.916, "step": 976 }, { - "epoch": 0.3567646521818514, - "grad_norm": 1.4205139875411987, - "learning_rate": 3.5656934306569346e-05, - "loss": 1.1161, + "epoch": 0.169500346981263, + "grad_norm": 0.9507573843002319, + "learning_rate": 1.6947094535993063e-05, + "loss": 0.9792, "step": 977 }, { - "epoch": 0.35712981559247764, - "grad_norm": 1.1710001230239868, - "learning_rate": 3.569343065693431e-05, - "loss": 1.0786, + "epoch": 0.1696738376127689, + "grad_norm": 0.7466042041778564, + "learning_rate": 1.6964440589765828e-05, + "loss": 0.9221, "step": 978 }, { - "epoch": 0.3574949790031039, - "grad_norm": 1.0095936059951782, - "learning_rate": 3.5729927007299275e-05, - "loss": 1.0563, + "epoch": 0.16984732824427481, + "grad_norm": 0.7589272856712341, + "learning_rate": 1.6981786643538597e-05, + "loss": 0.9758, "step": 979 }, { - "epoch": 0.35786014241373015, - "grad_norm": 1.1532397270202637, - "learning_rate": 3.5766423357664236e-05, - "loss": 1.0695, + "epoch": 0.17002081887578072, + "grad_norm": 0.9818898439407349, + "learning_rate": 1.699913269731136e-05, + "loss": 0.906, "step": 980 }, { - "epoch": 0.3582253058243564, - "grad_norm": 1.296919584274292, - "learning_rate": 3.58029197080292e-05, - "loss": 1.106, + "epoch": 0.17019430950728662, + "grad_norm": 0.9262455701828003, + "learning_rate": 1.701647875108413e-05, + "loss": 0.8008, "step": 981 }, { - "epoch": 0.35859046923498267, - "grad_norm": 1.4357057809829712, - "learning_rate": 3.5839416058394165e-05, - "loss": 1.0968, + "epoch": 0.17036780013879252, + "grad_norm": 1.1445931196212769, + "learning_rate": 1.7033824804856895e-05, + "loss": 0.813, "step": 982 }, { - "epoch": 0.3589556326456089, - "grad_norm": 1.6949366331100464, - "learning_rate": 3.5875912408759126e-05, - "loss": 1.0529, + "epoch": 0.1705412907702984, + "grad_norm": 0.9090064764022827, + "learning_rate": 1.7051170858629663e-05, + "loss": 0.771, "step": 983 }, { - "epoch": 0.3593207960562352, - "grad_norm": 1.4346821308135986, - "learning_rate": 3.5912408759124094e-05, - "loss": 1.0621, + "epoch": 0.1707147814018043, + "grad_norm": 1.0458335876464844, + "learning_rate": 1.7068516912402432e-05, + "loss": 0.9895, "step": 984 }, { - "epoch": 0.35968595946686144, - "grad_norm": 1.179490566253662, - "learning_rate": 3.5948905109489055e-05, - "loss": 1.1102, + "epoch": 0.1708882720333102, + "grad_norm": 0.8798192143440247, + "learning_rate": 1.7085862966175197e-05, + "loss": 0.9351, "step": 985 }, { - "epoch": 0.3600511228774877, - "grad_norm": 1.1279298067092896, - "learning_rate": 3.5985401459854016e-05, - "loss": 1.0586, + "epoch": 0.1710617626648161, + "grad_norm": 0.8899279236793518, + "learning_rate": 1.7103209019947965e-05, + "loss": 0.8608, "step": 986 }, { - "epoch": 0.36041628628811395, - "grad_norm": 1.3119022846221924, - "learning_rate": 3.6021897810218984e-05, - "loss": 1.0612, + "epoch": 0.171235253296322, + "grad_norm": 0.7836707234382629, + "learning_rate": 1.712055507372073e-05, + "loss": 0.9297, "step": 987 }, { - "epoch": 0.3607814496987402, - "grad_norm": 1.0083409547805786, - "learning_rate": 3.6058394160583946e-05, - "loss": 1.0459, + "epoch": 0.1714087439278279, + "grad_norm": 1.2844257354736328, + "learning_rate": 1.71379011274935e-05, + "loss": 0.729, "step": 988 }, { - "epoch": 0.36114661310936647, - "grad_norm": 1.0758939981460571, - "learning_rate": 3.609489051094891e-05, - "loss": 1.0599, + "epoch": 0.1715822345593338, + "grad_norm": 0.892307698726654, + "learning_rate": 1.7155247181266264e-05, + "loss": 0.8726, "step": 989 }, { - "epoch": 0.3615117765199927, - "grad_norm": 1.1632319688796997, - "learning_rate": 3.6131386861313875e-05, - "loss": 1.0959, + "epoch": 0.1717557251908397, + "grad_norm": 0.9911701083183289, + "learning_rate": 1.717259323503903e-05, + "loss": 0.8164, "step": 990 }, { - "epoch": 0.361876939930619, - "grad_norm": 0.9749258160591125, - "learning_rate": 3.6167883211678836e-05, - "loss": 1.0714, + "epoch": 0.1719292158223456, + "grad_norm": 1.358222246170044, + "learning_rate": 1.7189939288811797e-05, + "loss": 0.7993, "step": 991 }, { - "epoch": 0.3622421033412452, - "grad_norm": 1.2663055658340454, - "learning_rate": 3.62043795620438e-05, - "loss": 1.08, + "epoch": 0.17210270645385148, + "grad_norm": 0.7723311185836792, + "learning_rate": 1.7207285342584562e-05, + "loss": 0.9939, "step": 992 }, { - "epoch": 0.36260726675187144, - "grad_norm": 1.220440149307251, - "learning_rate": 3.6240875912408765e-05, - "loss": 1.0679, + "epoch": 0.17227619708535738, + "grad_norm": 0.9668728709220886, + "learning_rate": 1.722463139635733e-05, + "loss": 0.7764, "step": 993 }, { - "epoch": 0.3629724301624977, - "grad_norm": 0.9589149355888367, - "learning_rate": 3.6277372262773726e-05, - "loss": 1.0675, + "epoch": 0.17244968771686328, + "grad_norm": 2.05971360206604, + "learning_rate": 1.7241977450130095e-05, + "loss": 0.9543, "step": 994 }, { - "epoch": 0.36333759357312395, - "grad_norm": 1.0809324979782104, - "learning_rate": 3.631386861313869e-05, - "loss": 1.0984, + "epoch": 0.17262317834836918, + "grad_norm": 0.8839221596717834, + "learning_rate": 1.7259323503902864e-05, + "loss": 0.9121, "step": 995 }, { - "epoch": 0.3637027569837502, - "grad_norm": 1.261988639831543, - "learning_rate": 3.635036496350365e-05, - "loss": 1.016, + "epoch": 0.17279666897987508, + "grad_norm": 0.8395109176635742, + "learning_rate": 1.727666955767563e-05, + "loss": 0.8574, "step": 996 }, { - "epoch": 0.36406792039437647, - "grad_norm": 1.6527141332626343, - "learning_rate": 3.6386861313868616e-05, - "loss": 1.0831, + "epoch": 0.17297015961138099, + "grad_norm": 0.8401376605033875, + "learning_rate": 1.7294015611448397e-05, + "loss": 0.9041, "step": 997 }, { - "epoch": 0.3644330838050027, - "grad_norm": 1.651012659072876, - "learning_rate": 3.642335766423358e-05, - "loss": 1.0461, + "epoch": 0.1731436502428869, + "grad_norm": 1.5835697650909424, + "learning_rate": 1.7311361665221166e-05, + "loss": 0.8091, "step": 998 }, { - "epoch": 0.364798247215629, - "grad_norm": 1.263540506362915, - "learning_rate": 3.645985401459854e-05, - "loss": 1.0715, + "epoch": 0.1733171408743928, + "grad_norm": 0.8792264461517334, + "learning_rate": 1.732870771899393e-05, + "loss": 0.7888, "step": 999 }, { - "epoch": 0.36516341062625524, - "grad_norm": 1.215338110923767, - "learning_rate": 3.649635036496351e-05, - "loss": 1.0903, + "epoch": 0.1734906315058987, + "grad_norm": 1.2893178462982178, + "learning_rate": 1.73460537727667e-05, + "loss": 0.8882, "step": 1000 }, { - "epoch": 0.3655285740368815, - "grad_norm": 1.15705144405365, - "learning_rate": 3.6532846715328475e-05, - "loss": 1.03, + "epoch": 0.1736641221374046, + "grad_norm": 1.0048354864120483, + "learning_rate": 1.7363399826539464e-05, + "loss": 0.834, "step": 1001 }, { - "epoch": 0.36589373744750775, - "grad_norm": 1.178078532218933, - "learning_rate": 3.656934306569343e-05, - "loss": 1.0547, + "epoch": 0.17383761276891047, + "grad_norm": 0.8436573147773743, + "learning_rate": 1.7380745880312232e-05, + "loss": 0.7773, "step": 1002 }, { - "epoch": 0.366258900858134, - "grad_norm": 1.110164761543274, - "learning_rate": 3.66058394160584e-05, - "loss": 1.0707, + "epoch": 0.17401110340041637, + "grad_norm": 1.5717145204544067, + "learning_rate": 1.7398091934084997e-05, + "loss": 0.9121, "step": 1003 }, { - "epoch": 0.36662406426876026, - "grad_norm": 1.3152765035629272, - "learning_rate": 3.6642335766423365e-05, - "loss": 1.0834, + "epoch": 0.17418459403192227, + "grad_norm": 0.7580949664115906, + "learning_rate": 1.7415437987857762e-05, + "loss": 0.8267, "step": 1004 }, { - "epoch": 0.3669892276793865, - "grad_norm": 1.3928614854812622, - "learning_rate": 3.6678832116788326e-05, - "loss": 1.101, + "epoch": 0.17435808466342817, + "grad_norm": 1.1851462125778198, + "learning_rate": 1.743278404163053e-05, + "loss": 0.9167, "step": 1005 }, { - "epoch": 0.3673543910900128, - "grad_norm": 0.9466423988342285, - "learning_rate": 3.671532846715329e-05, - "loss": 1.0895, + "epoch": 0.17453157529493407, + "grad_norm": 0.6987919211387634, + "learning_rate": 1.7450130095403296e-05, + "loss": 1.0171, "step": 1006 }, { - "epoch": 0.36771955450063903, - "grad_norm": 1.3716709613800049, - "learning_rate": 3.675182481751825e-05, - "loss": 1.0501, + "epoch": 0.17470506592643997, + "grad_norm": 0.9006944298744202, + "learning_rate": 1.7467476149176064e-05, + "loss": 0.9072, "step": 1007 }, { - "epoch": 0.3680847179112653, - "grad_norm": 0.9824967980384827, - "learning_rate": 3.6788321167883217e-05, - "loss": 1.0555, + "epoch": 0.17487855655794587, + "grad_norm": 1.4969807863235474, + "learning_rate": 1.748482220294883e-05, + "loss": 0.7207, "step": 1008 }, { - "epoch": 0.36844988132189155, - "grad_norm": 0.9950441718101501, - "learning_rate": 3.682481751824818e-05, - "loss": 1.0693, + "epoch": 0.17505204718945178, + "grad_norm": 1.0250056982040405, + "learning_rate": 1.7502168256721597e-05, + "loss": 0.9178, "step": 1009 }, { - "epoch": 0.3688150447325178, - "grad_norm": 1.1767274141311646, - "learning_rate": 3.686131386861314e-05, - "loss": 1.0684, + "epoch": 0.17522553782095768, + "grad_norm": 0.9165359735488892, + "learning_rate": 1.7519514310494362e-05, + "loss": 0.8118, "step": 1010 }, { - "epoch": 0.36918020814314406, - "grad_norm": 1.0600765943527222, - "learning_rate": 3.689781021897811e-05, - "loss": 1.0033, + "epoch": 0.17539902845246358, + "grad_norm": 0.8604927659034729, + "learning_rate": 1.753686036426713e-05, + "loss": 0.9434, "step": 1011 }, { - "epoch": 0.3695453715537703, - "grad_norm": 1.4236944913864136, - "learning_rate": 3.693430656934307e-05, - "loss": 1.0975, + "epoch": 0.17557251908396945, + "grad_norm": 1.1479854583740234, + "learning_rate": 1.7554206418039896e-05, + "loss": 0.8176, "step": 1012 }, { - "epoch": 0.3699105349643966, - "grad_norm": 1.759039044380188, - "learning_rate": 3.697080291970803e-05, - "loss": 1.1155, + "epoch": 0.17574600971547535, + "grad_norm": 1.1277000904083252, + "learning_rate": 1.7571552471812664e-05, + "loss": 0.9751, "step": 1013 }, { - "epoch": 0.37027569837502283, - "grad_norm": 1.324727177619934, - "learning_rate": 3.7007299270073e-05, - "loss": 1.0513, + "epoch": 0.17591950034698126, + "grad_norm": 0.8372589945793152, + "learning_rate": 1.7588898525585433e-05, + "loss": 0.9255, "step": 1014 }, { - "epoch": 0.3706408617856491, - "grad_norm": 1.1971713304519653, - "learning_rate": 3.704379562043796e-05, - "loss": 1.0734, + "epoch": 0.17609299097848716, + "grad_norm": 0.8416146636009216, + "learning_rate": 1.7606244579358198e-05, + "loss": 0.9514, "step": 1015 }, { - "epoch": 0.37100602519627535, - "grad_norm": 1.2363351583480835, - "learning_rate": 3.708029197080292e-05, - "loss": 1.1021, + "epoch": 0.17626648160999306, + "grad_norm": 0.8313689827919006, + "learning_rate": 1.7623590633130966e-05, + "loss": 0.9839, "step": 1016 }, { - "epoch": 0.3713711886069016, - "grad_norm": 1.694140076637268, - "learning_rate": 3.711678832116789e-05, - "loss": 1.052, + "epoch": 0.17643997224149896, + "grad_norm": 0.8255547285079956, + "learning_rate": 1.764093668690373e-05, + "loss": 0.8994, "step": 1017 }, { - "epoch": 0.37173635201752786, - "grad_norm": 1.148966670036316, - "learning_rate": 3.715328467153285e-05, - "loss": 1.0486, + "epoch": 0.17661346287300486, + "grad_norm": 1.2307218313217163, + "learning_rate": 1.76582827406765e-05, + "loss": 0.863, "step": 1018 }, { - "epoch": 0.3721015154281541, - "grad_norm": 1.2454609870910645, - "learning_rate": 3.718978102189781e-05, - "loss": 1.0978, + "epoch": 0.17678695350451076, + "grad_norm": 0.9237003326416016, + "learning_rate": 1.7675628794449264e-05, + "loss": 0.8599, "step": 1019 }, { - "epoch": 0.3724666788387804, - "grad_norm": 1.385877251625061, - "learning_rate": 3.722627737226278e-05, - "loss": 1.0557, + "epoch": 0.17696044413601666, + "grad_norm": 1.4484760761260986, + "learning_rate": 1.769297484822203e-05, + "loss": 0.7319, "step": 1020 }, { - "epoch": 0.37283184224940663, - "grad_norm": 1.2863870859146118, - "learning_rate": 3.726277372262774e-05, - "loss": 1.0729, + "epoch": 0.17713393476752257, + "grad_norm": 1.2265433073043823, + "learning_rate": 1.7710320901994798e-05, + "loss": 0.9297, "step": 1021 }, { - "epoch": 0.3731970056600329, - "grad_norm": 1.4944157600402832, - "learning_rate": 3.729927007299271e-05, - "loss": 1.066, + "epoch": 0.17730742539902844, + "grad_norm": 0.8774186968803406, + "learning_rate": 1.7727666955767563e-05, + "loss": 0.9565, "step": 1022 }, { - "epoch": 0.37356216907065914, - "grad_norm": 1.1679842472076416, - "learning_rate": 3.733576642335767e-05, - "loss": 1.0679, + "epoch": 0.17748091603053434, + "grad_norm": 0.7973244190216064, + "learning_rate": 1.774501300954033e-05, + "loss": 0.8977, "step": 1023 }, { - "epoch": 0.3739273324812854, - "grad_norm": 1.530861496925354, - "learning_rate": 3.737226277372263e-05, - "loss": 1.0756, + "epoch": 0.17765440666204024, + "grad_norm": 1.0264477729797363, + "learning_rate": 1.7762359063313096e-05, + "loss": 0.9954, "step": 1024 }, { - "epoch": 0.3742924958919116, - "grad_norm": 1.280126929283142, - "learning_rate": 3.74087591240876e-05, - "loss": 1.0323, + "epoch": 0.17782789729354614, + "grad_norm": 1.0230461359024048, + "learning_rate": 1.7779705117085865e-05, + "loss": 0.8379, "step": 1025 }, { - "epoch": 0.37465765930253786, - "grad_norm": 1.1576896905899048, - "learning_rate": 3.744525547445256e-05, - "loss": 0.9989, + "epoch": 0.17800138792505205, + "grad_norm": 0.8978738784790039, + "learning_rate": 1.779705117085863e-05, + "loss": 0.8997, "step": 1026 }, { - "epoch": 0.3750228227131641, - "grad_norm": 1.2105504274368286, - "learning_rate": 3.748175182481752e-05, - "loss": 1.0007, + "epoch": 0.17817487855655795, + "grad_norm": 0.7228987216949463, + "learning_rate": 1.7814397224631398e-05, + "loss": 0.9065, "step": 1027 }, { - "epoch": 0.3753879861237904, - "grad_norm": 1.4735031127929688, - "learning_rate": 3.751824817518249e-05, - "loss": 1.0084, + "epoch": 0.17834836918806385, + "grad_norm": 0.7938546538352966, + "learning_rate": 1.7831743278404166e-05, + "loss": 0.9304, "step": 1028 }, { - "epoch": 0.37575314953441663, - "grad_norm": 1.579299807548523, - "learning_rate": 3.755474452554745e-05, - "loss": 1.0745, + "epoch": 0.17852185981956975, + "grad_norm": 1.8081859350204468, + "learning_rate": 1.784908933217693e-05, + "loss": 0.8662, "step": 1029 }, { - "epoch": 0.3761183129450429, - "grad_norm": 1.2502379417419434, - "learning_rate": 3.759124087591241e-05, - "loss": 1.0558, + "epoch": 0.17869535045107565, + "grad_norm": 0.7295611500740051, + "learning_rate": 1.78664353859497e-05, + "loss": 0.8345, "step": 1030 }, { - "epoch": 0.37648347635566914, - "grad_norm": 1.2470508813858032, - "learning_rate": 3.762773722627738e-05, - "loss": 1.0587, + "epoch": 0.17886884108258155, + "grad_norm": 0.8012073040008545, + "learning_rate": 1.7883781439722465e-05, + "loss": 0.8499, "step": 1031 }, { - "epoch": 0.3768486397662954, - "grad_norm": 1.3190096616744995, - "learning_rate": 3.766423357664234e-05, - "loss": 1.0748, + "epoch": 0.17904233171408743, + "grad_norm": 1.0692211389541626, + "learning_rate": 1.7901127493495233e-05, + "loss": 0.8257, "step": 1032 }, { - "epoch": 0.37721380317692166, - "grad_norm": 1.3318780660629272, - "learning_rate": 3.77007299270073e-05, - "loss": 1.0341, + "epoch": 0.17921582234559333, + "grad_norm": 1.4763771295547485, + "learning_rate": 1.7918473547267998e-05, + "loss": 0.9941, "step": 1033 }, { - "epoch": 0.3775789665875479, - "grad_norm": 1.627843976020813, - "learning_rate": 3.773722627737227e-05, - "loss": 1.1019, + "epoch": 0.17938931297709923, + "grad_norm": 0.9654231071472168, + "learning_rate": 1.7935819601040763e-05, + "loss": 0.8884, "step": 1034 }, { - "epoch": 0.37794412999817417, - "grad_norm": 1.3563354015350342, - "learning_rate": 3.777372262773723e-05, - "loss": 1.0511, + "epoch": 0.17956280360860513, + "grad_norm": 0.9759186506271362, + "learning_rate": 1.795316565481353e-05, + "loss": 0.8574, "step": 1035 }, { - "epoch": 0.37830929340880043, - "grad_norm": 1.848395824432373, - "learning_rate": 3.781021897810219e-05, - "loss": 1.1057, + "epoch": 0.17973629424011103, + "grad_norm": 0.9331534504890442, + "learning_rate": 1.7970511708586297e-05, + "loss": 0.7788, "step": 1036 }, { - "epoch": 0.3786744568194267, - "grad_norm": 0.8704839944839478, - "learning_rate": 3.784671532846716e-05, - "loss": 1.0566, + "epoch": 0.17990978487161693, + "grad_norm": 0.907806396484375, + "learning_rate": 1.7987857762359065e-05, + "loss": 0.8008, "step": 1037 }, { - "epoch": 0.37903962023005294, - "grad_norm": 3.322103261947632, - "learning_rate": 3.788321167883212e-05, - "loss": 1.0992, + "epoch": 0.18008327550312284, + "grad_norm": 0.9000624418258667, + "learning_rate": 1.800520381613183e-05, + "loss": 0.814, "step": 1038 }, { - "epoch": 0.3794047836406792, - "grad_norm": 1.5346883535385132, - "learning_rate": 3.791970802919709e-05, - "loss": 1.0386, + "epoch": 0.18025676613462874, + "grad_norm": 0.8217934966087341, + "learning_rate": 1.80225498699046e-05, + "loss": 0.9456, "step": 1039 }, { - "epoch": 0.37976994705130546, - "grad_norm": 1.1669354438781738, - "learning_rate": 3.795620437956204e-05, - "loss": 1.0953, + "epoch": 0.18043025676613464, + "grad_norm": 0.8157238960266113, + "learning_rate": 1.8039895923677363e-05, + "loss": 0.8447, "step": 1040 }, { - "epoch": 0.3801351104619317, - "grad_norm": 1.3449947834014893, - "learning_rate": 3.799270072992701e-05, - "loss": 1.0844, + "epoch": 0.18060374739764054, + "grad_norm": 0.8732859492301941, + "learning_rate": 1.8057241977450132e-05, + "loss": 0.9094, "step": 1041 }, { - "epoch": 0.38050027387255797, - "grad_norm": 1.6431771516799927, - "learning_rate": 3.802919708029198e-05, - "loss": 1.022, + "epoch": 0.18077723802914641, + "grad_norm": 1.1334575414657593, + "learning_rate": 1.8074588031222897e-05, + "loss": 0.7715, "step": 1042 }, { - "epoch": 0.3808654372831842, - "grad_norm": 1.7015012502670288, - "learning_rate": 3.806569343065694e-05, - "loss": 1.0513, + "epoch": 0.18095072866065232, + "grad_norm": 0.962907612323761, + "learning_rate": 1.8091934084995665e-05, + "loss": 0.7219, "step": 1043 }, { - "epoch": 0.3812306006938105, - "grad_norm": 1.242341160774231, - "learning_rate": 3.81021897810219e-05, - "loss": 1.0287, + "epoch": 0.18112421929215822, + "grad_norm": 0.8657878041267395, + "learning_rate": 1.8109280138768434e-05, + "loss": 0.8357, "step": 1044 }, { - "epoch": 0.38159576410443674, - "grad_norm": 1.3954999446868896, - "learning_rate": 3.813868613138687e-05, - "loss": 1.0691, + "epoch": 0.18129770992366412, + "grad_norm": 1.1050974130630493, + "learning_rate": 1.81266261925412e-05, + "loss": 0.7615, "step": 1045 }, { - "epoch": 0.381960927515063, - "grad_norm": 1.1503889560699463, - "learning_rate": 3.817518248175183e-05, - "loss": 1.0699, + "epoch": 0.18147120055517002, + "grad_norm": 0.744647741317749, + "learning_rate": 1.8143972246313967e-05, + "loss": 0.9182, "step": 1046 }, { - "epoch": 0.38232609092568925, - "grad_norm": 1.1910291910171509, - "learning_rate": 3.821167883211679e-05, - "loss": 1.0919, + "epoch": 0.18164469118667592, + "grad_norm": 0.9615671634674072, + "learning_rate": 1.8161318300086732e-05, + "loss": 0.728, "step": 1047 }, { - "epoch": 0.3826912543363155, - "grad_norm": 1.6541327238082886, - "learning_rate": 3.824817518248176e-05, - "loss": 1.067, + "epoch": 0.18181818181818182, + "grad_norm": 1.0303088426589966, + "learning_rate": 1.81786643538595e-05, + "loss": 0.8728, "step": 1048 }, { - "epoch": 0.38305641774694177, - "grad_norm": 1.7566941976547241, - "learning_rate": 3.828467153284672e-05, - "loss": 1.0446, + "epoch": 0.18199167244968772, + "grad_norm": 1.5389691591262817, + "learning_rate": 1.8196010407632265e-05, + "loss": 0.8694, "step": 1049 }, { - "epoch": 0.383421581157568, - "grad_norm": 1.545143723487854, - "learning_rate": 3.832116788321168e-05, - "loss": 1.0638, + "epoch": 0.18216516308119363, + "grad_norm": 1.5999655723571777, + "learning_rate": 1.821335646140503e-05, + "loss": 0.9104, "step": 1050 }, { - "epoch": 0.3837867445681943, - "grad_norm": 1.2948969602584839, - "learning_rate": 3.835766423357664e-05, - "loss": 1.0732, + "epoch": 0.18233865371269953, + "grad_norm": 0.9172372817993164, + "learning_rate": 1.82307025151778e-05, + "loss": 0.9558, "step": 1051 }, { - "epoch": 0.38415190797882054, - "grad_norm": 1.781628966331482, - "learning_rate": 3.839416058394161e-05, - "loss": 1.0826, + "epoch": 0.1825121443442054, + "grad_norm": 0.7845026254653931, + "learning_rate": 1.8248048568950564e-05, + "loss": 0.8423, "step": 1052 }, { - "epoch": 0.3845170713894468, - "grad_norm": 1.4051412343978882, - "learning_rate": 3.843065693430657e-05, - "loss": 1.0612, + "epoch": 0.1826856349757113, + "grad_norm": 0.8035831451416016, + "learning_rate": 1.8265394622723332e-05, + "loss": 0.8101, "step": 1053 }, { - "epoch": 0.38488223480007305, - "grad_norm": 1.2231218814849854, - "learning_rate": 3.846715328467153e-05, - "loss": 1.0858, + "epoch": 0.1828591256072172, + "grad_norm": 0.8926576972007751, + "learning_rate": 1.8282740676496097e-05, + "loss": 0.6758, "step": 1054 }, { - "epoch": 0.3852473982106993, - "grad_norm": 1.2689377069473267, - "learning_rate": 3.85036496350365e-05, - "loss": 1.042, + "epoch": 0.1830326162387231, + "grad_norm": 0.8564839959144592, + "learning_rate": 1.8300086730268865e-05, + "loss": 0.8584, "step": 1055 }, { - "epoch": 0.38561256162132557, - "grad_norm": 1.2875949144363403, - "learning_rate": 3.854014598540147e-05, - "loss": 1.0972, + "epoch": 0.183206106870229, + "grad_norm": 1.6128764152526855, + "learning_rate": 1.831743278404163e-05, + "loss": 0.7327, "step": 1056 }, { - "epoch": 0.3859777250319518, - "grad_norm": 0.9562207460403442, - "learning_rate": 3.857664233576642e-05, - "loss": 1.0643, + "epoch": 0.1833795975017349, + "grad_norm": 1.2095292806625366, + "learning_rate": 1.83347788378144e-05, + "loss": 0.8188, "step": 1057 }, { - "epoch": 0.3863428884425781, - "grad_norm": 0.9304961562156677, - "learning_rate": 3.861313868613139e-05, - "loss": 1.0648, + "epoch": 0.1835530881332408, + "grad_norm": 1.1021777391433716, + "learning_rate": 1.8352124891587167e-05, + "loss": 0.8916, "step": 1058 }, { - "epoch": 0.3867080518532043, - "grad_norm": 1.4941751956939697, - "learning_rate": 3.864963503649636e-05, - "loss": 1.0494, + "epoch": 0.1837265787647467, + "grad_norm": 0.7823024392127991, + "learning_rate": 1.8369470945359932e-05, + "loss": 0.9326, "step": 1059 }, { - "epoch": 0.38707321526383054, - "grad_norm": 1.058205246925354, - "learning_rate": 3.868613138686132e-05, - "loss": 1.0634, + "epoch": 0.1839000693962526, + "grad_norm": 1.1157408952713013, + "learning_rate": 1.83868169991327e-05, + "loss": 0.8638, "step": 1060 }, { - "epoch": 0.3874383786744568, - "grad_norm": 1.2509559392929077, - "learning_rate": 3.872262773722628e-05, - "loss": 1.0597, + "epoch": 0.18407356002775851, + "grad_norm": 0.8544512391090393, + "learning_rate": 1.8404163052905466e-05, + "loss": 0.9519, "step": 1061 }, { - "epoch": 0.38780354208508305, - "grad_norm": 1.4179550409317017, - "learning_rate": 3.875912408759124e-05, - "loss": 1.0399, + "epoch": 0.1842470506592644, + "grad_norm": 0.6742433309555054, + "learning_rate": 1.8421509106678234e-05, + "loss": 0.9148, "step": 1062 }, { - "epoch": 0.3881687054957093, - "grad_norm": 1.5804535150527954, - "learning_rate": 3.879562043795621e-05, - "loss": 1.0613, + "epoch": 0.1844205412907703, + "grad_norm": 0.870310366153717, + "learning_rate": 1.8438855160451e-05, + "loss": 0.833, "step": 1063 }, { - "epoch": 0.38853386890633557, - "grad_norm": 1.6716043949127197, - "learning_rate": 3.883211678832117e-05, - "loss": 1.0535, + "epoch": 0.1845940319222762, + "grad_norm": 0.8651427626609802, + "learning_rate": 1.8456201214223764e-05, + "loss": 0.9714, "step": 1064 }, { - "epoch": 0.3888990323169618, - "grad_norm": 1.8054860830307007, - "learning_rate": 3.886861313868613e-05, - "loss": 1.1276, + "epoch": 0.1847675225537821, + "grad_norm": 0.9442116618156433, + "learning_rate": 1.8473547267996532e-05, + "loss": 0.9507, "step": 1065 }, { - "epoch": 0.3892641957275881, - "grad_norm": 1.2135201692581177, - "learning_rate": 3.89051094890511e-05, - "loss": 1.1047, + "epoch": 0.184941013185288, + "grad_norm": 0.8498819470405579, + "learning_rate": 1.8490893321769297e-05, + "loss": 0.9534, "step": 1066 }, { - "epoch": 0.38962935913821434, - "grad_norm": 1.2736563682556152, - "learning_rate": 3.894160583941606e-05, - "loss": 1.0359, + "epoch": 0.1851145038167939, + "grad_norm": 1.022501826286316, + "learning_rate": 1.8508239375542066e-05, + "loss": 0.741, "step": 1067 }, { - "epoch": 0.3899945225488406, - "grad_norm": 1.5540190935134888, - "learning_rate": 3.897810218978102e-05, - "loss": 1.0237, + "epoch": 0.1852879944482998, + "grad_norm": 1.0645262002944946, + "learning_rate": 1.852558542931483e-05, + "loss": 0.7517, "step": 1068 }, { - "epoch": 0.39035968595946685, - "grad_norm": 1.193400263786316, - "learning_rate": 3.901459854014599e-05, - "loss": 1.0383, + "epoch": 0.1854614850798057, + "grad_norm": 0.7361165881156921, + "learning_rate": 1.85429314830876e-05, + "loss": 0.885, "step": 1069 }, { - "epoch": 0.3907248493700931, - "grad_norm": 1.1911488771438599, - "learning_rate": 3.905109489051095e-05, - "loss": 1.0103, + "epoch": 0.1856349757113116, + "grad_norm": 1.5586457252502441, + "learning_rate": 1.8560277536860364e-05, + "loss": 0.8822, "step": 1070 }, { - "epoch": 0.39109001278071936, - "grad_norm": 1.6968015432357788, - "learning_rate": 3.908759124087591e-05, - "loss": 1.0693, + "epoch": 0.18580846634281747, + "grad_norm": 0.858544111251831, + "learning_rate": 1.8577623590633133e-05, + "loss": 0.8398, "step": 1071 }, { - "epoch": 0.3914551761913456, - "grad_norm": 1.0422827005386353, - "learning_rate": 3.912408759124088e-05, - "loss": 1.0415, + "epoch": 0.18598195697432338, + "grad_norm": 0.9131850004196167, + "learning_rate": 1.85949696444059e-05, + "loss": 0.9097, "step": 1072 }, { - "epoch": 0.3918203396019719, - "grad_norm": 1.2056610584259033, - "learning_rate": 3.916058394160584e-05, - "loss": 1.0817, + "epoch": 0.18615544760582928, + "grad_norm": 0.9032670855522156, + "learning_rate": 1.8612315698178666e-05, + "loss": 0.8103, "step": 1073 }, { - "epoch": 0.39218550301259814, - "grad_norm": 1.1182008981704712, - "learning_rate": 3.91970802919708e-05, - "loss": 1.0365, + "epoch": 0.18632893823733518, + "grad_norm": 0.9513208270072937, + "learning_rate": 1.8629661751951434e-05, + "loss": 0.9038, "step": 1074 }, { - "epoch": 0.3925506664232244, - "grad_norm": 1.2231231927871704, - "learning_rate": 3.923357664233577e-05, - "loss": 1.0448, + "epoch": 0.18650242886884108, + "grad_norm": 0.9131335020065308, + "learning_rate": 1.86470078057242e-05, + "loss": 0.8083, "step": 1075 }, { - "epoch": 0.39291582983385065, - "grad_norm": 1.366044282913208, - "learning_rate": 3.927007299270073e-05, - "loss": 1.0629, + "epoch": 0.18667591950034698, + "grad_norm": 0.9009382128715515, + "learning_rate": 1.8664353859496968e-05, + "loss": 0.8696, "step": 1076 }, { - "epoch": 0.3932809932444769, - "grad_norm": 1.6295936107635498, - "learning_rate": 3.93065693430657e-05, - "loss": 1.111, + "epoch": 0.18684941013185288, + "grad_norm": 1.3538861274719238, + "learning_rate": 1.8681699913269733e-05, + "loss": 0.7839, "step": 1077 }, { - "epoch": 0.39364615665510316, - "grad_norm": 1.882778525352478, - "learning_rate": 3.934306569343066e-05, - "loss": 1.0719, + "epoch": 0.18702290076335878, + "grad_norm": 0.7064875364303589, + "learning_rate": 1.86990459670425e-05, + "loss": 0.9641, "step": 1078 }, { - "epoch": 0.3940113200657294, - "grad_norm": 1.4148269891738892, - "learning_rate": 3.937956204379562e-05, - "loss": 1.0776, + "epoch": 0.1871963913948647, + "grad_norm": 1.2610442638397217, + "learning_rate": 1.8716392020815266e-05, + "loss": 0.823, "step": 1079 }, { - "epoch": 0.3943764834763557, - "grad_norm": 1.386021375656128, - "learning_rate": 3.941605839416059e-05, - "loss": 1.0902, + "epoch": 0.1873698820263706, + "grad_norm": 0.8477995991706848, + "learning_rate": 1.873373807458803e-05, + "loss": 0.8687, "step": 1080 }, { - "epoch": 0.39474164688698193, - "grad_norm": 1.1143816709518433, - "learning_rate": 3.945255474452555e-05, - "loss": 1.063, + "epoch": 0.18754337265787646, + "grad_norm": 0.8461495637893677, + "learning_rate": 1.87510841283608e-05, + "loss": 0.8872, "step": 1081 }, { - "epoch": 0.3951068102976082, - "grad_norm": 1.4111757278442383, - "learning_rate": 3.948905109489051e-05, - "loss": 1.0833, + "epoch": 0.18771686328938236, + "grad_norm": 0.758055567741394, + "learning_rate": 1.8768430182133565e-05, + "loss": 0.8914, "step": 1082 }, { - "epoch": 0.39547197370823445, - "grad_norm": 1.940871238708496, - "learning_rate": 3.952554744525548e-05, - "loss": 1.0671, + "epoch": 0.18789035392088826, + "grad_norm": 0.778022050857544, + "learning_rate": 1.8785776235906333e-05, + "loss": 0.8389, "step": 1083 }, { - "epoch": 0.3958371371188607, - "grad_norm": 1.5787734985351562, - "learning_rate": 3.956204379562044e-05, - "loss": 1.0654, + "epoch": 0.18806384455239417, + "grad_norm": 0.7191702127456665, + "learning_rate": 1.8803122289679098e-05, + "loss": 0.9297, "step": 1084 }, { - "epoch": 0.39620230052948696, - "grad_norm": 1.0383380651474, - "learning_rate": 3.95985401459854e-05, - "loss": 1.0392, + "epoch": 0.18823733518390007, + "grad_norm": 0.7985841631889343, + "learning_rate": 1.8820468343451866e-05, + "loss": 0.9543, "step": 1085 }, { - "epoch": 0.3965674639401132, - "grad_norm": 1.0594536066055298, - "learning_rate": 3.963503649635037e-05, - "loss": 1.0964, + "epoch": 0.18841082581540597, + "grad_norm": 0.8521026968955994, + "learning_rate": 1.883781439722463e-05, + "loss": 0.9563, "step": 1086 }, { - "epoch": 0.3969326273507395, - "grad_norm": 1.1077114343643188, - "learning_rate": 3.967153284671533e-05, - "loss": 1.025, + "epoch": 0.18858431644691187, + "grad_norm": 1.3044359683990479, + "learning_rate": 1.88551604509974e-05, + "loss": 0.9055, "step": 1087 }, { - "epoch": 0.39729779076136573, - "grad_norm": 1.0958856344223022, - "learning_rate": 3.9708029197080294e-05, - "loss": 1.0856, + "epoch": 0.18875780707841777, + "grad_norm": 0.8546336889266968, + "learning_rate": 1.8872506504770168e-05, + "loss": 0.8511, "step": 1088 }, { - "epoch": 0.397662954171992, - "grad_norm": 1.0740013122558594, - "learning_rate": 3.974452554744526e-05, - "loss": 1.0311, + "epoch": 0.18893129770992367, + "grad_norm": 0.8610173463821411, + "learning_rate": 1.8889852558542933e-05, + "loss": 0.8042, "step": 1089 }, { - "epoch": 0.39802811758261825, - "grad_norm": 1.2233268022537231, - "learning_rate": 3.978102189781022e-05, - "loss": 1.0743, + "epoch": 0.18910478834142957, + "grad_norm": 1.2829304933547974, + "learning_rate": 1.89071986123157e-05, + "loss": 0.7993, "step": 1090 }, { - "epoch": 0.3983932809932445, - "grad_norm": 1.3717782497406006, - "learning_rate": 3.9817518248175184e-05, - "loss": 1.0548, + "epoch": 0.18927827897293545, + "grad_norm": 1.5449819564819336, + "learning_rate": 1.8924544666088467e-05, + "loss": 0.8948, "step": 1091 }, { - "epoch": 0.39875844440387076, - "grad_norm": 1.3720403909683228, - "learning_rate": 3.9854014598540145e-05, - "loss": 1.0762, + "epoch": 0.18945176960444135, + "grad_norm": 1.049821376800537, + "learning_rate": 1.8941890719861235e-05, + "loss": 0.7842, "step": 1092 }, { - "epoch": 0.39912360781449696, - "grad_norm": 1.3393198251724243, - "learning_rate": 3.989051094890511e-05, - "loss": 1.1035, + "epoch": 0.18962526023594725, + "grad_norm": 0.8324046730995178, + "learning_rate": 1.8959236773634e-05, + "loss": 0.8274, "step": 1093 }, { - "epoch": 0.3994887712251232, - "grad_norm": 1.3722527027130127, - "learning_rate": 3.9927007299270074e-05, - "loss": 1.064, + "epoch": 0.18979875086745315, + "grad_norm": 0.9129648208618164, + "learning_rate": 1.8976582827406765e-05, + "loss": 0.9216, "step": 1094 }, { - "epoch": 0.3998539346357495, - "grad_norm": 1.4581358432769775, - "learning_rate": 3.9963503649635035e-05, - "loss": 1.0192, + "epoch": 0.18997224149895905, + "grad_norm": 0.7167275547981262, + "learning_rate": 1.8993928881179533e-05, + "loss": 0.8948, "step": 1095 }, { - "epoch": 0.40021909804637573, - "grad_norm": 1.4697636365890503, - "learning_rate": 4e-05, - "loss": 1.0397, + "epoch": 0.19014573213046496, + "grad_norm": 0.7464287281036377, + "learning_rate": 1.9011274934952298e-05, + "loss": 0.8328, "step": 1096 }, { - "epoch": 0.400584261457002, - "grad_norm": 1.0955256223678589, - "learning_rate": 3.999999485540128e-05, - "loss": 1.061, + "epoch": 0.19031922276197086, + "grad_norm": 0.7323386073112488, + "learning_rate": 1.9028620988725067e-05, + "loss": 0.9685, "step": 1097 }, { - "epoch": 0.40094942486762825, - "grad_norm": 1.1996790170669556, - "learning_rate": 3.999997942160775e-05, - "loss": 1.0587, + "epoch": 0.19049271339347676, + "grad_norm": 0.9647856950759888, + "learning_rate": 1.904596704249783e-05, + "loss": 0.7505, "step": 1098 }, { - "epoch": 0.4013145882782545, - "grad_norm": 0.9571264982223511, - "learning_rate": 3.9999953698627355e-05, - "loss": 1.0422, + "epoch": 0.19066620402498266, + "grad_norm": 0.8408564925193787, + "learning_rate": 1.90633130962706e-05, + "loss": 0.8242, "step": 1099 }, { - "epoch": 0.40167975168888076, - "grad_norm": 0.8335952162742615, - "learning_rate": 3.9999917686473335e-05, - "loss": 1.0099, + "epoch": 0.19083969465648856, + "grad_norm": 3.48551082611084, + "learning_rate": 1.9080659150043365e-05, + "loss": 0.7373, "step": 1100 }, { - "epoch": 0.402044915099507, - "grad_norm": 1.1695386171340942, - "learning_rate": 3.9999871385164215e-05, - "loss": 1.0182, + "epoch": 0.19101318528799444, + "grad_norm": 1.3884447813034058, + "learning_rate": 1.9098005203816133e-05, + "loss": 1.0496, "step": 1101 }, { - "epoch": 0.4024100785101333, - "grad_norm": 1.2104897499084473, - "learning_rate": 3.9999814794723805e-05, - "loss": 1.0715, + "epoch": 0.19118667591950034, + "grad_norm": 0.9413248896598816, + "learning_rate": 1.9115351257588902e-05, + "loss": 0.8633, "step": 1102 }, { - "epoch": 0.40277524192075953, - "grad_norm": 0.9011780619621277, - "learning_rate": 3.999974791518123e-05, - "loss": 1.0779, + "epoch": 0.19136016655100624, + "grad_norm": 1.1441295146942139, + "learning_rate": 1.9132697311361667e-05, + "loss": 0.7773, "step": 1103 }, { - "epoch": 0.4031404053313858, - "grad_norm": 1.3028141260147095, - "learning_rate": 3.999967074657089e-05, - "loss": 1.0642, + "epoch": 0.19153365718251214, + "grad_norm": 1.0632127523422241, + "learning_rate": 1.9150043365134435e-05, + "loss": 0.7786, "step": 1104 }, { - "epoch": 0.40350556874201204, - "grad_norm": 1.2466671466827393, - "learning_rate": 3.9999583288932495e-05, - "loss": 1.0255, + "epoch": 0.19170714781401804, + "grad_norm": 0.9430599212646484, + "learning_rate": 1.91673894189072e-05, + "loss": 0.8274, "step": 1105 }, { - "epoch": 0.4038707321526383, - "grad_norm": 1.1066969633102417, - "learning_rate": 3.999948554231102e-05, - "loss": 1.0422, + "epoch": 0.19188063844552394, + "grad_norm": 0.8103283047676086, + "learning_rate": 1.918473547267997e-05, + "loss": 0.9541, "step": 1106 }, { - "epoch": 0.40423589556326456, - "grad_norm": 0.9009634852409363, - "learning_rate": 3.9999377506756765e-05, - "loss": 1.0182, + "epoch": 0.19205412907702984, + "grad_norm": 0.7432756423950195, + "learning_rate": 1.9202081526452734e-05, + "loss": 0.8809, "step": 1107 }, { - "epoch": 0.4046010589738908, - "grad_norm": 1.3242288827896118, - "learning_rate": 3.9999259182325315e-05, - "loss": 1.038, + "epoch": 0.19222761970853575, + "grad_norm": 1.067231297492981, + "learning_rate": 1.9219427580225502e-05, + "loss": 0.8181, "step": 1108 }, { - "epoch": 0.40496622238451707, - "grad_norm": 2.6725430488586426, - "learning_rate": 3.999913056907753e-05, - "loss": 1.0546, + "epoch": 0.19240111034004165, + "grad_norm": 1.0702846050262451, + "learning_rate": 1.9236773633998267e-05, + "loss": 0.762, "step": 1109 }, { - "epoch": 0.40533138579514333, - "grad_norm": 1.1711183786392212, - "learning_rate": 3.999899166707959e-05, - "loss": 1.032, + "epoch": 0.19257460097154755, + "grad_norm": 1.0375348329544067, + "learning_rate": 1.9254119687771032e-05, + "loss": 0.8477, "step": 1110 }, { - "epoch": 0.4056965492057696, - "grad_norm": 1.3052092790603638, - "learning_rate": 3.999884247640293e-05, - "loss": 1.0282, + "epoch": 0.19274809160305342, + "grad_norm": 0.8479153513908386, + "learning_rate": 1.92714657415438e-05, + "loss": 0.8835, "step": 1111 }, { - "epoch": 0.40606171261639584, - "grad_norm": 1.599260687828064, - "learning_rate": 3.999868299712434e-05, - "loss": 1.0702, + "epoch": 0.19292158223455932, + "grad_norm": 1.1228195428848267, + "learning_rate": 1.9288811795316565e-05, + "loss": 0.8733, "step": 1112 }, { - "epoch": 0.4064268760270221, - "grad_norm": 1.2356266975402832, - "learning_rate": 3.999851322932583e-05, - "loss": 1.046, + "epoch": 0.19309507286606523, + "grad_norm": 2.4480655193328857, + "learning_rate": 1.9306157849089334e-05, + "loss": 0.8706, "step": 1113 }, { - "epoch": 0.40679203943764836, - "grad_norm": 1.1261721849441528, - "learning_rate": 3.9998333173094764e-05, - "loss": 1.0603, + "epoch": 0.19326856349757113, + "grad_norm": 1.0299562215805054, + "learning_rate": 1.93235039028621e-05, + "loss": 0.9995, "step": 1114 }, { - "epoch": 0.4071572028482746, - "grad_norm": 1.4510836601257324, - "learning_rate": 3.999814282852375e-05, - "loss": 1.002, + "epoch": 0.19344205412907703, + "grad_norm": 0.8545568585395813, + "learning_rate": 1.9340849956634867e-05, + "loss": 0.8723, "step": 1115 }, { - "epoch": 0.40752236625890087, - "grad_norm": 1.6417961120605469, - "learning_rate": 3.9997942195710744e-05, - "loss": 1.057, + "epoch": 0.19361554476058293, + "grad_norm": 1.039278268814087, + "learning_rate": 1.9358196010407632e-05, + "loss": 0.7161, "step": 1116 }, { - "epoch": 0.4078875296695271, - "grad_norm": 2.250013828277588, - "learning_rate": 3.999773127475894e-05, - "loss": 1.0439, + "epoch": 0.19378903539208883, + "grad_norm": 0.8973332643508911, + "learning_rate": 1.93755420641804e-05, + "loss": 0.8291, "step": 1117 }, { - "epoch": 0.4082526930801534, - "grad_norm": 1.5681899785995483, - "learning_rate": 3.9997510065776843e-05, - "loss": 1.046, + "epoch": 0.19396252602359473, + "grad_norm": 0.8743516206741333, + "learning_rate": 1.939288811795317e-05, + "loss": 0.8875, "step": 1118 }, { - "epoch": 0.40861785649077964, - "grad_norm": 1.3140218257904053, - "learning_rate": 3.9997278568878275e-05, - "loss": 1.0171, + "epoch": 0.19413601665510063, + "grad_norm": 0.8596274852752686, + "learning_rate": 1.9410234171725934e-05, + "loss": 0.8357, "step": 1119 }, { - "epoch": 0.4089830199014059, - "grad_norm": 0.9010523557662964, - "learning_rate": 3.9997036784182325e-05, - "loss": 1.0237, + "epoch": 0.19430950728660654, + "grad_norm": 0.8768712878227234, + "learning_rate": 1.9427580225498702e-05, + "loss": 0.8284, "step": 1120 }, { - "epoch": 0.40934818331203215, - "grad_norm": 1.0226876735687256, - "learning_rate": 3.999678471181338e-05, - "loss": 1.0201, + "epoch": 0.1944829979181124, + "grad_norm": 0.7377764582633972, + "learning_rate": 1.9444926279271467e-05, + "loss": 0.8452, "step": 1121 }, { - "epoch": 0.4097133467226584, - "grad_norm": 1.2696917057037354, - "learning_rate": 3.999652235190112e-05, - "loss": 1.0571, + "epoch": 0.1946564885496183, + "grad_norm": 0.8955880403518677, + "learning_rate": 1.9462272333044236e-05, + "loss": 0.9104, "step": 1122 }, { - "epoch": 0.41007851013328467, - "grad_norm": 1.5432963371276855, - "learning_rate": 3.999624970458053e-05, - "loss": 1.0516, + "epoch": 0.1948299791811242, + "grad_norm": 0.782400369644165, + "learning_rate": 1.9479618386817e-05, + "loss": 0.9016, "step": 1123 }, { - "epoch": 0.4104436735439109, - "grad_norm": 1.1600122451782227, - "learning_rate": 3.999596676999185e-05, - "loss": 1.0454, + "epoch": 0.19500346981263011, + "grad_norm": 0.9617409110069275, + "learning_rate": 1.9496964440589766e-05, + "loss": 0.7664, "step": 1124 }, { - "epoch": 0.4108088369545372, - "grad_norm": 1.2443245649337769, - "learning_rate": 3.999567354828067e-05, - "loss": 1.121, + "epoch": 0.19517696044413602, + "grad_norm": 0.9044870138168335, + "learning_rate": 1.9514310494362534e-05, + "loss": 0.9851, "step": 1125 }, { - "epoch": 0.4111740003651634, - "grad_norm": 1.2136924266815186, - "learning_rate": 3.9995370039597826e-05, - "loss": 1.041, + "epoch": 0.19535045107564192, + "grad_norm": 0.981440007686615, + "learning_rate": 1.95316565481353e-05, + "loss": 0.9246, "step": 1126 }, { - "epoch": 0.41153916377578964, - "grad_norm": 1.0153218507766724, - "learning_rate": 3.9995056244099444e-05, - "loss": 1.0305, + "epoch": 0.19552394170714782, + "grad_norm": 1.3709393739700317, + "learning_rate": 1.9549002601908068e-05, + "loss": 0.9604, "step": 1127 }, { - "epoch": 0.4119043271864159, - "grad_norm": 1.299992322921753, - "learning_rate": 3.9994732161946986e-05, - "loss": 1.0365, + "epoch": 0.19569743233865372, + "grad_norm": 0.7237723469734192, + "learning_rate": 1.9566348655680833e-05, + "loss": 0.9189, "step": 1128 }, { - "epoch": 0.41226949059704215, - "grad_norm": 0.8568185567855835, - "learning_rate": 3.9994397793307175e-05, - "loss": 1.0294, + "epoch": 0.19587092297015962, + "grad_norm": 1.2013028860092163, + "learning_rate": 1.95836947094536e-05, + "loss": 0.7786, "step": 1129 }, { - "epoch": 0.4126346540076684, - "grad_norm": 1.0070760250091553, - "learning_rate": 3.999405313835202e-05, - "loss": 1.0198, + "epoch": 0.19604441360166552, + "grad_norm": 0.7311549782752991, + "learning_rate": 1.9601040763226366e-05, + "loss": 0.9199, "step": 1130 }, { - "epoch": 0.41299981741829467, - "grad_norm": 0.8885939717292786, - "learning_rate": 3.999369819725884e-05, - "loss": 1.0408, + "epoch": 0.1962179042331714, + "grad_norm": 1.1062647104263306, + "learning_rate": 1.9618386816999134e-05, + "loss": 0.7839, "step": 1131 }, { - "epoch": 0.4133649808289209, - "grad_norm": 0.9783234000205994, - "learning_rate": 3.999333297021023e-05, - "loss": 1.0068, + "epoch": 0.1963913948646773, + "grad_norm": 1.6833813190460205, + "learning_rate": 1.9635732870771903e-05, + "loss": 0.7053, "step": 1132 }, { - "epoch": 0.4137301442395472, - "grad_norm": 1.0961719751358032, - "learning_rate": 3.999295745739409e-05, - "loss": 1.0562, + "epoch": 0.1965648854961832, + "grad_norm": 1.0765060186386108, + "learning_rate": 1.9653078924544668e-05, + "loss": 0.8103, "step": 1133 }, { - "epoch": 0.41409530765017344, - "grad_norm": 1.0994256734848022, - "learning_rate": 3.999257165900361e-05, - "loss": 1.0583, + "epoch": 0.1967383761276891, + "grad_norm": 1.032185435295105, + "learning_rate": 1.9670424978317436e-05, + "loss": 0.78, "step": 1134 }, { - "epoch": 0.4144604710607997, - "grad_norm": 1.39691162109375, - "learning_rate": 3.999217557523725e-05, - "loss": 1.0581, + "epoch": 0.196911866759195, + "grad_norm": 1.1329410076141357, + "learning_rate": 1.96877710320902e-05, + "loss": 0.7864, "step": 1135 }, { - "epoch": 0.41482563447142595, - "grad_norm": 0.933196485042572, - "learning_rate": 3.9991769206298805e-05, - "loss": 1.0334, + "epoch": 0.1970853573907009, + "grad_norm": 1.0962532758712769, + "learning_rate": 1.970511708586297e-05, + "loss": 0.8469, "step": 1136 }, { - "epoch": 0.4151907978820522, - "grad_norm": 1.5245027542114258, - "learning_rate": 3.999135255239732e-05, - "loss": 1.085, + "epoch": 0.1972588480222068, + "grad_norm": 1.0061286687850952, + "learning_rate": 1.9722463139635735e-05, + "loss": 0.8394, "step": 1137 }, { - "epoch": 0.41555596129267847, - "grad_norm": 1.0224543809890747, - "learning_rate": 3.999092561374715e-05, - "loss": 1.022, + "epoch": 0.1974323386537127, + "grad_norm": 1.1442840099334717, + "learning_rate": 1.9739809193408503e-05, + "loss": 0.7417, "step": 1138 }, { - "epoch": 0.4159211247033047, - "grad_norm": 1.44600248336792, - "learning_rate": 3.999048839056793e-05, - "loss": 1.0886, + "epoch": 0.1976058292852186, + "grad_norm": 1.0214344263076782, + "learning_rate": 1.9757155247181268e-05, + "loss": 0.7678, "step": 1139 }, { - "epoch": 0.416286288113931, - "grad_norm": 1.0260883569717407, - "learning_rate": 3.999004088308462e-05, - "loss": 1.0608, + "epoch": 0.19777931991672448, + "grad_norm": 0.9408628344535828, + "learning_rate": 1.9774501300954033e-05, + "loss": 0.8005, "step": 1140 }, { - "epoch": 0.41665145152455724, - "grad_norm": 1.2564923763275146, - "learning_rate": 3.998958309152741e-05, - "loss": 1.0332, + "epoch": 0.19795281054823038, + "grad_norm": 0.9302935600280762, + "learning_rate": 1.97918473547268e-05, + "loss": 0.8726, "step": 1141 }, { - "epoch": 0.4170166149351835, - "grad_norm": 1.4178814888000488, - "learning_rate": 3.998911501613184e-05, - "loss": 1.0651, + "epoch": 0.19812630117973629, + "grad_norm": 1.2831724882125854, + "learning_rate": 1.9809193408499566e-05, + "loss": 0.72, "step": 1142 }, { - "epoch": 0.41738177834580975, - "grad_norm": 1.2618945837020874, - "learning_rate": 3.99886366571387e-05, - "loss": 1.0494, + "epoch": 0.1982997918112422, + "grad_norm": 1.0520926713943481, + "learning_rate": 1.9826539462272335e-05, + "loss": 0.887, "step": 1143 }, { - "epoch": 0.417746941756436, - "grad_norm": 1.1892222166061401, - "learning_rate": 3.99881480147941e-05, - "loss": 1.026, + "epoch": 0.1984732824427481, + "grad_norm": 0.8452304005622864, + "learning_rate": 1.98438855160451e-05, + "loss": 0.844, "step": 1144 }, { - "epoch": 0.41811210516706226, - "grad_norm": 1.4228870868682861, - "learning_rate": 3.998764908934942e-05, - "loss": 1.0763, + "epoch": 0.198646773074254, + "grad_norm": 1.0608121156692505, + "learning_rate": 1.9861231569817868e-05, + "loss": 0.9395, "step": 1145 }, { - "epoch": 0.4184772685776885, - "grad_norm": 1.5030168294906616, - "learning_rate": 3.998713988106134e-05, - "loss": 1.0687, + "epoch": 0.1988202637057599, + "grad_norm": 0.7357398867607117, + "learning_rate": 1.9878577623590636e-05, + "loss": 0.9695, "step": 1146 }, { - "epoch": 0.4188424319883148, - "grad_norm": 1.1838278770446777, - "learning_rate": 3.9986620390191815e-05, - "loss": 1.0369, + "epoch": 0.1989937543372658, + "grad_norm": 1.6007386445999146, + "learning_rate": 1.98959236773634e-05, + "loss": 0.7546, "step": 1147 }, { - "epoch": 0.41920759539894104, - "grad_norm": 2.355543375015259, - "learning_rate": 3.998609061700812e-05, - "loss": 1.001, + "epoch": 0.1991672449687717, + "grad_norm": 1.2272436618804932, + "learning_rate": 1.991326973113617e-05, + "loss": 0.8665, "step": 1148 }, { - "epoch": 0.4195727588095673, - "grad_norm": 1.2859777212142944, - "learning_rate": 3.998555056178279e-05, - "loss": 1.0745, + "epoch": 0.1993407356002776, + "grad_norm": 0.8333637714385986, + "learning_rate": 1.9930615784908935e-05, + "loss": 0.9495, "step": 1149 }, { - "epoch": 0.41993792222019355, - "grad_norm": 1.125076413154602, - "learning_rate": 3.998500022479367e-05, - "loss": 1.0648, + "epoch": 0.19951422623178347, + "grad_norm": 0.8695045709609985, + "learning_rate": 1.9947961838681703e-05, + "loss": 0.8123, "step": 1150 }, { - "epoch": 0.4203030856308198, - "grad_norm": 1.0421031713485718, - "learning_rate": 3.998443960632388e-05, - "loss": 1.028, + "epoch": 0.19968771686328937, + "grad_norm": 1.0767509937286377, + "learning_rate": 1.9965307892454468e-05, + "loss": 0.96, "step": 1151 }, { - "epoch": 0.42066824904144606, - "grad_norm": 0.9748599529266357, - "learning_rate": 3.998386870666185e-05, - "loss": 1.0492, + "epoch": 0.19986120749479527, + "grad_norm": 1.1710973978042603, + "learning_rate": 1.9982653946227237e-05, + "loss": 0.8105, "step": 1152 }, { - "epoch": 0.4210334124520723, - "grad_norm": 2.4433956146240234, - "learning_rate": 3.9983287526101256e-05, - "loss": 1.0576, + "epoch": 0.20003469812630117, + "grad_norm": 0.9286783933639526, + "learning_rate": 2e-05, + "loss": 0.8337, "step": 1153 }, { - "epoch": 0.4213985758626986, - "grad_norm": 1.4316203594207764, - "learning_rate": 3.9982696064941116e-05, - "loss": 1.0374, + "epoch": 0.20020818875780708, + "grad_norm": 0.9539753794670105, + "learning_rate": 2.0017346053772767e-05, + "loss": 0.8218, "step": 1154 }, { - "epoch": 0.42176373927332483, - "grad_norm": 1.1265262365341187, - "learning_rate": 3.9982094323485706e-05, - "loss": 1.0289, + "epoch": 0.20038167938931298, + "grad_norm": 1.4093074798583984, + "learning_rate": 2.0034692107545535e-05, + "loss": 0.8657, "step": 1155 }, { - "epoch": 0.4221289026839511, - "grad_norm": 1.2048323154449463, - "learning_rate": 3.9981482302044604e-05, - "loss": 1.0101, + "epoch": 0.20055517002081888, + "grad_norm": 1.5611107349395752, + "learning_rate": 2.00520381613183e-05, + "loss": 0.7773, "step": 1156 }, { - "epoch": 0.42249406609457735, - "grad_norm": 1.0414894819259644, - "learning_rate": 3.998086000093266e-05, - "loss": 1.0409, + "epoch": 0.20072866065232478, + "grad_norm": 1.043455719947815, + "learning_rate": 2.006938421509107e-05, + "loss": 0.7196, "step": 1157 }, { - "epoch": 0.4228592295052036, - "grad_norm": 10.712392807006836, - "learning_rate": 3.998022742047002e-05, - "loss": 1.051, + "epoch": 0.20090215128383068, + "grad_norm": 0.7280066013336182, + "learning_rate": 2.0086730268863833e-05, + "loss": 0.9248, "step": 1158 }, { - "epoch": 0.42322439291582986, - "grad_norm": 1.2368547916412354, - "learning_rate": 3.9979584560982144e-05, - "loss": 1.026, + "epoch": 0.20107564191533658, + "grad_norm": 0.8233473300933838, + "learning_rate": 2.0104076322636602e-05, + "loss": 0.9641, "step": 1159 }, { - "epoch": 0.42358955632645606, - "grad_norm": 1.550370454788208, - "learning_rate": 3.997893142279973e-05, - "loss": 1.1033, + "epoch": 0.20124913254684246, + "grad_norm": 0.8716951012611389, + "learning_rate": 2.0121422376409367e-05, + "loss": 0.7805, "step": 1160 }, { - "epoch": 0.4239547197370823, - "grad_norm": 1.8472901582717896, - "learning_rate": 3.997826800625881e-05, - "loss": 1.0764, + "epoch": 0.20142262317834836, + "grad_norm": 0.8178510665893555, + "learning_rate": 2.0138768430182135e-05, + "loss": 0.9336, "step": 1161 }, { - "epoch": 0.4243198831477086, - "grad_norm": 0.8468568325042725, - "learning_rate": 3.9977594311700676e-05, - "loss": 1.0468, + "epoch": 0.20159611380985426, + "grad_norm": 1.017986536026001, + "learning_rate": 2.01561144839549e-05, + "loss": 0.7156, "step": 1162 }, { - "epoch": 0.42468504655833483, - "grad_norm": 1.43262779712677, - "learning_rate": 3.9976910339471914e-05, - "loss": 1.0236, + "epoch": 0.20176960444136016, + "grad_norm": 0.9040745496749878, + "learning_rate": 2.017346053772767e-05, + "loss": 0.9014, "step": 1163 }, { - "epoch": 0.4250502099689611, - "grad_norm": 1.1666349172592163, - "learning_rate": 3.9976216089924415e-05, - "loss": 0.9823, + "epoch": 0.20194309507286606, + "grad_norm": 1.2191495895385742, + "learning_rate": 2.0190806591500434e-05, + "loss": 0.9014, "step": 1164 }, { - "epoch": 0.42541537337958735, - "grad_norm": 1.6177401542663574, - "learning_rate": 3.9975511563415336e-05, - "loss": 1.0906, + "epoch": 0.20211658570437196, + "grad_norm": 0.8765791058540344, + "learning_rate": 2.02081526452732e-05, + "loss": 0.8435, "step": 1165 }, { - "epoch": 0.4257805367902136, - "grad_norm": 0.9350042343139648, - "learning_rate": 3.997479676030711e-05, - "loss": 1.0298, + "epoch": 0.20229007633587787, + "grad_norm": 0.8944528102874756, + "learning_rate": 2.0225498699045967e-05, + "loss": 0.782, "step": 1166 }, { - "epoch": 0.42614570020083986, - "grad_norm": 1.4860293865203857, - "learning_rate": 3.9974071680967504e-05, - "loss": 1.0717, + "epoch": 0.20246356696738377, + "grad_norm": 0.9801760911941528, + "learning_rate": 2.0242844752818732e-05, + "loss": 0.8594, "step": 1167 }, { - "epoch": 0.4265108636114661, - "grad_norm": 1.4998146295547485, - "learning_rate": 3.9973336325769526e-05, - "loss": 1.0236, + "epoch": 0.20263705759888967, + "grad_norm": 2.095978021621704, + "learning_rate": 2.0260190806591504e-05, + "loss": 0.7432, "step": 1168 }, { - "epoch": 0.4268760270220924, - "grad_norm": 1.3722031116485596, - "learning_rate": 3.9972590695091476e-05, - "loss": 1.0569, + "epoch": 0.20281054823039557, + "grad_norm": 1.1703161001205444, + "learning_rate": 2.0277536860364272e-05, + "loss": 0.8005, "step": 1169 }, { - "epoch": 0.42724119043271863, - "grad_norm": 1.2058231830596924, - "learning_rate": 3.997183478931698e-05, - "loss": 1.0543, + "epoch": 0.20298403886190144, + "grad_norm": 1.069144368171692, + "learning_rate": 2.0294882914137037e-05, + "loss": 0.8208, "step": 1170 }, { - "epoch": 0.4276063538433449, - "grad_norm": 1.0915815830230713, - "learning_rate": 3.9971068608834895e-05, - "loss": 1.0392, + "epoch": 0.20315752949340735, + "grad_norm": 0.8345847725868225, + "learning_rate": 2.0312228967909806e-05, + "loss": 0.8989, "step": 1171 }, { - "epoch": 0.42797151725397115, - "grad_norm": 1.691857099533081, - "learning_rate": 3.9970292154039396e-05, - "loss": 1.0433, + "epoch": 0.20333102012491325, + "grad_norm": 1.581639051437378, + "learning_rate": 2.032957502168257e-05, + "loss": 0.832, "step": 1172 }, { - "epoch": 0.4283366806645974, - "grad_norm": 1.2902330160140991, - "learning_rate": 3.9969505425329955e-05, - "loss": 1.0356, + "epoch": 0.20350451075641915, + "grad_norm": 1.3069044351577759, + "learning_rate": 2.0346921075455336e-05, + "loss": 0.825, "step": 1173 }, { - "epoch": 0.42870184407522366, - "grad_norm": 1.1884143352508545, - "learning_rate": 3.996870842311129e-05, - "loss": 1.0776, + "epoch": 0.20367800138792505, + "grad_norm": 0.9406945705413818, + "learning_rate": 2.0364267129228104e-05, + "loss": 0.8961, "step": 1174 }, { - "epoch": 0.4290670074858499, - "grad_norm": 1.7202750444412231, - "learning_rate": 3.9967901147793436e-05, - "loss": 1.051, + "epoch": 0.20385149201943095, + "grad_norm": 0.9498026371002197, + "learning_rate": 2.038161318300087e-05, + "loss": 0.7417, "step": 1175 }, { - "epoch": 0.4294321708964762, - "grad_norm": 1.0213874578475952, - "learning_rate": 3.99670835997917e-05, - "loss": 1.0439, + "epoch": 0.20402498265093685, + "grad_norm": 0.861413300037384, + "learning_rate": 2.0398959236773637e-05, + "loss": 0.854, "step": 1176 }, { - "epoch": 0.42979733430710243, - "grad_norm": 1.0967782735824585, - "learning_rate": 3.996625577952669e-05, - "loss": 1.0183, + "epoch": 0.20419847328244276, + "grad_norm": 1.052204966545105, + "learning_rate": 2.0416305290546402e-05, + "loss": 0.8772, "step": 1177 }, { - "epoch": 0.4301624977177287, - "grad_norm": 1.2511194944381714, - "learning_rate": 3.9965417687424274e-05, - "loss": 1.0596, + "epoch": 0.20437196391394866, + "grad_norm": 1.1044455766677856, + "learning_rate": 2.043365134431917e-05, + "loss": 0.7898, "step": 1178 }, { - "epoch": 0.43052766112835494, - "grad_norm": 1.1474601030349731, - "learning_rate": 3.996456932391562e-05, - "loss": 1.0651, + "epoch": 0.20454545454545456, + "grad_norm": 1.0541276931762695, + "learning_rate": 2.0450997398091936e-05, + "loss": 0.8726, "step": 1179 }, { - "epoch": 0.4308928245389812, - "grad_norm": 0.9371693730354309, - "learning_rate": 3.9963710689437174e-05, - "loss": 1.0322, + "epoch": 0.20471894517696043, + "grad_norm": 0.8354906439781189, + "learning_rate": 2.0468343451864704e-05, + "loss": 0.8865, "step": 1180 }, { - "epoch": 0.43125798794960746, - "grad_norm": 1.312596321105957, - "learning_rate": 3.996284178443068e-05, - "loss": 1.0494, + "epoch": 0.20489243580846633, + "grad_norm": 1.0439603328704834, + "learning_rate": 2.048568950563747e-05, + "loss": 0.8076, "step": 1181 }, { - "epoch": 0.4316231513602337, - "grad_norm": 1.3545986413955688, - "learning_rate": 3.996196260934314e-05, - "loss": 1.0502, + "epoch": 0.20506592643997223, + "grad_norm": 1.7082873582839966, + "learning_rate": 2.0503035559410238e-05, + "loss": 0.8962, "step": 1182 }, { - "epoch": 0.43198831477085997, - "grad_norm": 1.554949402809143, - "learning_rate": 3.996107316462686e-05, - "loss": 1.0466, + "epoch": 0.20523941707147814, + "grad_norm": 0.8092603087425232, + "learning_rate": 2.0520381613183003e-05, + "loss": 0.8772, "step": 1183 }, { - "epoch": 0.43235347818148623, - "grad_norm": 1.021992564201355, - "learning_rate": 3.9960173450739425e-05, - "loss": 1.0273, + "epoch": 0.20541290770298404, + "grad_norm": 0.8688147068023682, + "learning_rate": 2.0537727666955768e-05, + "loss": 0.8855, "step": 1184 }, { - "epoch": 0.4327186415921125, - "grad_norm": 1.9283751249313354, - "learning_rate": 3.9959263468143706e-05, - "loss": 1.0638, + "epoch": 0.20558639833448994, + "grad_norm": 0.7964572310447693, + "learning_rate": 2.0555073720728536e-05, + "loss": 1.0303, "step": 1185 }, { - "epoch": 0.43308380500273874, - "grad_norm": 1.2399693727493286, - "learning_rate": 3.995834321730785e-05, - "loss": 1.0591, + "epoch": 0.20575988896599584, + "grad_norm": 0.8607844710350037, + "learning_rate": 2.05724197745013e-05, + "loss": 0.8682, "step": 1186 }, { - "epoch": 0.433448968413365, - "grad_norm": 1.3178949356079102, - "learning_rate": 3.995741269870528e-05, - "loss": 1.0649, + "epoch": 0.20593337959750174, + "grad_norm": 0.7864561676979065, + "learning_rate": 2.058976582827407e-05, + "loss": 1.0291, "step": 1187 }, { - "epoch": 0.43381413182399126, - "grad_norm": 1.3378502130508423, - "learning_rate": 3.9956471912814715e-05, - "loss": 1.0244, + "epoch": 0.20610687022900764, + "grad_norm": 0.7906190752983093, + "learning_rate": 2.0607111882046834e-05, + "loss": 0.8892, "step": 1188 }, { - "epoch": 0.4341792952346175, - "grad_norm": 1.131489872932434, - "learning_rate": 3.9955520860120164e-05, - "loss": 1.0459, + "epoch": 0.20628036086051355, + "grad_norm": 1.1501116752624512, + "learning_rate": 2.0624457935819603e-05, + "loss": 0.8076, "step": 1189 }, { - "epoch": 0.43454445864524377, - "grad_norm": 1.424012303352356, - "learning_rate": 3.995455954111089e-05, - "loss": 1.0426, + "epoch": 0.20645385149201942, + "grad_norm": 0.8381333947181702, + "learning_rate": 2.0641803989592368e-05, + "loss": 0.9062, "step": 1190 }, { - "epoch": 0.43490962205587, - "grad_norm": 1.0501662492752075, - "learning_rate": 3.995358795628146e-05, - "loss": 1.038, + "epoch": 0.20662734212352532, + "grad_norm": 0.7598695158958435, + "learning_rate": 2.0659150043365136e-05, + "loss": 0.8657, "step": 1191 }, { - "epoch": 0.4352747854664963, - "grad_norm": 1.0466276407241821, - "learning_rate": 3.995260610613172e-05, - "loss": 1.0448, + "epoch": 0.20680083275503122, + "grad_norm": 0.9513621926307678, + "learning_rate": 2.06764960971379e-05, + "loss": 0.8899, "step": 1192 }, { - "epoch": 0.4356399488771225, - "grad_norm": 1.3244295120239258, - "learning_rate": 3.995161399116678e-05, - "loss": 1.0056, + "epoch": 0.20697432338653712, + "grad_norm": 1.0183488130569458, + "learning_rate": 2.069384215091067e-05, + "loss": 0.7217, "step": 1193 }, { - "epoch": 0.43600511228774874, - "grad_norm": 1.2504527568817139, - "learning_rate": 3.9950611611897055e-05, - "loss": 1.0334, + "epoch": 0.20714781401804302, + "grad_norm": 0.9675999283790588, + "learning_rate": 2.0711188204683434e-05, + "loss": 0.8865, "step": 1194 }, { - "epoch": 0.436370275698375, - "grad_norm": 1.0547796487808228, - "learning_rate": 3.994959896883821e-05, - "loss": 0.9799, + "epoch": 0.20732130464954893, + "grad_norm": 1.018736481666565, + "learning_rate": 2.07285342584562e-05, + "loss": 0.761, "step": 1195 }, { - "epoch": 0.43673543910900126, - "grad_norm": 1.1913384199142456, - "learning_rate": 3.994857606251124e-05, - "loss": 1.0225, + "epoch": 0.20749479528105483, + "grad_norm": 1.036081075668335, + "learning_rate": 2.0745880312228968e-05, + "loss": 0.8066, "step": 1196 }, { - "epoch": 0.4371006025196275, - "grad_norm": 1.1323100328445435, - "learning_rate": 3.994754289344236e-05, - "loss": 1.0511, + "epoch": 0.20766828591256073, + "grad_norm": 0.9184001088142395, + "learning_rate": 2.0763226366001733e-05, + "loss": 0.8662, "step": 1197 }, { - "epoch": 0.43746576593025377, - "grad_norm": 1.3135157823562622, - "learning_rate": 3.9946499462163116e-05, - "loss": 1.0291, + "epoch": 0.20784177654406663, + "grad_norm": 1.0168554782867432, + "learning_rate": 2.0780572419774505e-05, + "loss": 0.8489, "step": 1198 }, { - "epoch": 0.43783092934088, - "grad_norm": 1.3036956787109375, - "learning_rate": 3.99454457692103e-05, - "loss": 1.0591, + "epoch": 0.20801526717557253, + "grad_norm": 0.9712396860122681, + "learning_rate": 2.0797918473547273e-05, + "loss": 0.8237, "step": 1199 }, { - "epoch": 0.4381960927515063, - "grad_norm": 1.4519540071487427, - "learning_rate": 3.9944381815125987e-05, - "loss": 1.0579, + "epoch": 0.2081887578070784, + "grad_norm": 0.8076720237731934, + "learning_rate": 2.0815264527320038e-05, + "loss": 1.0417, "step": 1200 }, { - "epoch": 0.43856125616213254, - "grad_norm": 0.8528894782066345, - "learning_rate": 3.9943307600457563e-05, - "loss": 1.0293, + "epoch": 0.2083622484385843, + "grad_norm": 0.9912758469581604, + "learning_rate": 2.0832610581092806e-05, + "loss": 0.8767, "step": 1201 }, { - "epoch": 0.4389264195727588, - "grad_norm": 1.3160288333892822, - "learning_rate": 3.994222312575764e-05, - "loss": 1.0645, + "epoch": 0.2085357390700902, + "grad_norm": 2.2674293518066406, + "learning_rate": 2.084995663486557e-05, + "loss": 0.8264, "step": 1202 }, { - "epoch": 0.43929158298338505, - "grad_norm": 1.004645586013794, - "learning_rate": 3.994112839158416e-05, - "loss": 0.9988, + "epoch": 0.2087092297015961, + "grad_norm": 0.9456791877746582, + "learning_rate": 2.0867302688638336e-05, + "loss": 0.8113, "step": 1203 }, { - "epoch": 0.4396567463940113, - "grad_norm": 0.884960949420929, - "learning_rate": 3.99400233985003e-05, - "loss": 1.0366, + "epoch": 0.208882720333102, + "grad_norm": 0.8563554883003235, + "learning_rate": 2.0884648742411105e-05, + "loss": 0.793, "step": 1204 }, { - "epoch": 0.44002190980463757, - "grad_norm": 1.7585129737854004, - "learning_rate": 3.993890814707455e-05, - "loss": 1.0319, + "epoch": 0.2090562109646079, + "grad_norm": 1.0406992435455322, + "learning_rate": 2.090199479618387e-05, + "loss": 0.949, "step": 1205 }, { - "epoch": 0.4403870732152638, - "grad_norm": 1.753636360168457, - "learning_rate": 3.9937782637880665e-05, - "loss": 1.0847, + "epoch": 0.20922970159611382, + "grad_norm": 0.9023845195770264, + "learning_rate": 2.0919340849956638e-05, + "loss": 0.9165, "step": 1206 }, { - "epoch": 0.4407522366258901, - "grad_norm": 1.0258582830429077, - "learning_rate": 3.9936646871497656e-05, - "loss": 1.0333, + "epoch": 0.20940319222761972, + "grad_norm": 0.7458107471466064, + "learning_rate": 2.0936686903729403e-05, + "loss": 1.0183, "step": 1207 }, { - "epoch": 0.44111740003651634, - "grad_norm": 1.1033796072006226, - "learning_rate": 3.9935500848509845e-05, - "loss": 1.0287, + "epoch": 0.20957668285912562, + "grad_norm": 0.748633861541748, + "learning_rate": 2.095403295750217e-05, + "loss": 0.9912, "step": 1208 }, { - "epoch": 0.4414825634471426, - "grad_norm": 1.5457887649536133, - "learning_rate": 3.993434456950681e-05, - "loss": 1.0045, + "epoch": 0.20975017349063152, + "grad_norm": 0.998204231262207, + "learning_rate": 2.0971379011274937e-05, + "loss": 0.75, "step": 1209 }, { - "epoch": 0.44184772685776885, - "grad_norm": 1.292353868484497, - "learning_rate": 3.9933178035083406e-05, - "loss": 0.9968, + "epoch": 0.2099236641221374, + "grad_norm": 0.9955114126205444, + "learning_rate": 2.0988725065047705e-05, + "loss": 0.8713, "step": 1210 }, { - "epoch": 0.4422128902683951, - "grad_norm": 2.105164051055908, - "learning_rate": 3.993200124583977e-05, - "loss": 1.0381, + "epoch": 0.2100971547536433, + "grad_norm": 1.1266932487487793, + "learning_rate": 2.100607111882047e-05, + "loss": 0.8047, "step": 1211 }, { - "epoch": 0.44257805367902137, - "grad_norm": 1.2336411476135254, - "learning_rate": 3.993081420238132e-05, - "loss": 1.0177, + "epoch": 0.2102706453851492, + "grad_norm": 0.8944442868232727, + "learning_rate": 2.102341717259324e-05, + "loss": 0.9204, "step": 1212 }, { - "epoch": 0.4429432170896476, - "grad_norm": 1.6655219793319702, - "learning_rate": 3.992961690531873e-05, - "loss": 1.0054, + "epoch": 0.2104441360166551, + "grad_norm": 1.1496959924697876, + "learning_rate": 2.1040763226366003e-05, + "loss": 0.8135, "step": 1213 }, { - "epoch": 0.4433083805002739, - "grad_norm": 1.0908045768737793, - "learning_rate": 3.992840935526797e-05, - "loss": 0.9996, + "epoch": 0.210617626648161, + "grad_norm": 1.043591022491455, + "learning_rate": 2.105810928013877e-05, + "loss": 0.7642, "step": 1214 }, { - "epoch": 0.44367354391090014, - "grad_norm": 1.1778606176376343, - "learning_rate": 3.992719155285028e-05, - "loss": 1.0613, + "epoch": 0.2107911172796669, + "grad_norm": 0.8149956464767456, + "learning_rate": 2.1075455333911537e-05, + "loss": 0.793, "step": 1215 }, { - "epoch": 0.4440387073215264, - "grad_norm": 1.574092149734497, - "learning_rate": 3.992596349869216e-05, - "loss": 1.0309, + "epoch": 0.2109646079111728, + "grad_norm": 0.8631976842880249, + "learning_rate": 2.1092801387684302e-05, + "loss": 0.7852, "step": 1216 }, { - "epoch": 0.44440387073215265, - "grad_norm": 1.3183116912841797, - "learning_rate": 3.99247251934254e-05, - "loss": 1.0536, + "epoch": 0.2111380985426787, + "grad_norm": 1.0441302061080933, + "learning_rate": 2.111014744145707e-05, + "loss": 0.8069, "step": 1217 }, { - "epoch": 0.4447690341427789, - "grad_norm": 1.3037097454071045, - "learning_rate": 3.992347663768705e-05, - "loss": 1.0197, + "epoch": 0.2113115891741846, + "grad_norm": 0.925491988658905, + "learning_rate": 2.1127493495229835e-05, + "loss": 0.74, "step": 1218 }, { - "epoch": 0.44513419755340516, - "grad_norm": 1.1869858503341675, - "learning_rate": 3.9922217832119464e-05, - "loss": 1.0105, + "epoch": 0.21148507980569048, + "grad_norm": 1.075188398361206, + "learning_rate": 2.1144839549002604e-05, + "loss": 0.8223, "step": 1219 }, { - "epoch": 0.4454993609640314, - "grad_norm": 1.3817838430404663, - "learning_rate": 3.992094877737022e-05, - "loss": 1.0615, + "epoch": 0.21165857043719638, + "grad_norm": 5.305887222290039, + "learning_rate": 2.116218560277537e-05, + "loss": 0.7817, "step": 1220 }, { - "epoch": 0.4458645243746577, - "grad_norm": 1.5071978569030762, - "learning_rate": 3.991966947409221e-05, - "loss": 1.0138, + "epoch": 0.21183206106870228, + "grad_norm": 0.8401800394058228, + "learning_rate": 2.1179531656548137e-05, + "loss": 0.9077, "step": 1221 }, { - "epoch": 0.44622968778528393, - "grad_norm": 1.5340261459350586, - "learning_rate": 3.991837992294358e-05, - "loss": 1.0504, + "epoch": 0.21200555170020818, + "grad_norm": 0.7709339261054993, + "learning_rate": 2.1196877710320902e-05, + "loss": 0.9634, "step": 1222 }, { - "epoch": 0.4465948511959102, - "grad_norm": 1.2594711780548096, - "learning_rate": 3.991708012458777e-05, - "loss": 1.049, + "epoch": 0.21217904233171409, + "grad_norm": 1.174468755722046, + "learning_rate": 2.121422376409367e-05, + "loss": 0.8809, "step": 1223 }, { - "epoch": 0.44696001460653645, - "grad_norm": 1.434049367904663, - "learning_rate": 3.991577007969344e-05, - "loss": 1.071, + "epoch": 0.21235253296322, + "grad_norm": 0.8118718862533569, + "learning_rate": 2.1231569817866435e-05, + "loss": 0.929, "step": 1224 }, { - "epoch": 0.4473251780171627, - "grad_norm": 1.471665859222412, - "learning_rate": 3.9914449788934584e-05, - "loss": 1.0531, + "epoch": 0.2125260235947259, + "grad_norm": 1.1632200479507446, + "learning_rate": 2.12489158716392e-05, + "loss": 0.9028, "step": 1225 }, { - "epoch": 0.44769034142778896, - "grad_norm": 1.0306696891784668, - "learning_rate": 3.991311925299042e-05, - "loss": 1.0048, + "epoch": 0.2126995142262318, + "grad_norm": 1.4389945268630981, + "learning_rate": 2.126626192541197e-05, + "loss": 0.7686, "step": 1226 }, { - "epoch": 0.44805550483841516, - "grad_norm": 1.2370059490203857, - "learning_rate": 3.991177847254547e-05, - "loss": 1.0107, + "epoch": 0.2128730048577377, + "grad_norm": 1.083821177482605, + "learning_rate": 2.128360797918474e-05, + "loss": 0.865, "step": 1227 }, { - "epoch": 0.4484206682490414, - "grad_norm": 1.7563802003860474, - "learning_rate": 3.991042744828951e-05, - "loss": 1.0421, + "epoch": 0.2130464954892436, + "grad_norm": 1.0478723049163818, + "learning_rate": 2.1300954032957505e-05, + "loss": 0.7769, "step": 1228 }, { - "epoch": 0.4487858316596677, - "grad_norm": 1.081297755241394, - "learning_rate": 3.990906618091758e-05, - "loss": 0.9938, + "epoch": 0.21321998612074947, + "grad_norm": 0.9397873878479004, + "learning_rate": 2.1318300086730274e-05, + "loss": 0.7615, "step": 1229 }, { - "epoch": 0.44915099507029393, - "grad_norm": 1.2118027210235596, - "learning_rate": 3.9907694671129996e-05, - "loss": 0.9846, + "epoch": 0.21339347675225537, + "grad_norm": 2.6524014472961426, + "learning_rate": 2.133564614050304e-05, + "loss": 0.8167, "step": 1230 }, { - "epoch": 0.4495161584809202, - "grad_norm": 1.3195831775665283, - "learning_rate": 3.990631291963236e-05, - "loss": 0.9989, + "epoch": 0.21356696738376127, + "grad_norm": 0.9945835471153259, + "learning_rate": 2.1352992194275807e-05, + "loss": 0.8872, "step": 1231 }, { - "epoch": 0.44988132189154645, - "grad_norm": 1.251294732093811, - "learning_rate": 3.9904920927135504e-05, - "loss": 1.0825, + "epoch": 0.21374045801526717, + "grad_norm": 1.1906661987304688, + "learning_rate": 2.1370338248048572e-05, + "loss": 0.7793, "step": 1232 }, { - "epoch": 0.4502464853021727, - "grad_norm": 1.0974398851394653, - "learning_rate": 3.9903518694355575e-05, - "loss": 1.0321, + "epoch": 0.21391394864677307, + "grad_norm": 1.1342781782150269, + "learning_rate": 2.1387684301821337e-05, + "loss": 0.687, "step": 1233 }, { - "epoch": 0.45061164871279896, - "grad_norm": 1.2594048976898193, - "learning_rate": 3.990210622201396e-05, - "loss": 1.0114, + "epoch": 0.21408743927827897, + "grad_norm": 0.9246183633804321, + "learning_rate": 2.1405030355594106e-05, + "loss": 0.7571, "step": 1234 }, { - "epoch": 0.4509768121234252, - "grad_norm": 1.2848155498504639, - "learning_rate": 3.9900683510837306e-05, - "loss": 0.9979, + "epoch": 0.21426092990978488, + "grad_norm": 1.8368370532989502, + "learning_rate": 2.142237640936687e-05, + "loss": 0.8198, "step": 1235 }, { - "epoch": 0.4513419755340515, - "grad_norm": 0.9289810657501221, - "learning_rate": 3.989925056155756e-05, - "loss": 1.007, + "epoch": 0.21443442054129078, + "grad_norm": 1.0605031251907349, + "learning_rate": 2.143972246313964e-05, + "loss": 0.7935, "step": 1236 }, { - "epoch": 0.45170713894467773, - "grad_norm": 1.5265246629714966, - "learning_rate": 3.9897807374911895e-05, - "loss": 1.0032, + "epoch": 0.21460791117279668, + "grad_norm": 0.998390257358551, + "learning_rate": 2.1457068516912404e-05, + "loss": 0.7739, "step": 1237 }, { - "epoch": 0.452072302355304, - "grad_norm": 0.9852722883224487, - "learning_rate": 3.9896353951642795e-05, - "loss": 0.983, + "epoch": 0.21478140180430258, + "grad_norm": 0.946682870388031, + "learning_rate": 2.1474414570685172e-05, + "loss": 0.9824, "step": 1238 }, { - "epoch": 0.45243746576593025, - "grad_norm": 1.351599931716919, - "learning_rate": 3.989489029249797e-05, - "loss": 1.033, + "epoch": 0.21495489243580845, + "grad_norm": 2.7235705852508545, + "learning_rate": 2.1491760624457937e-05, + "loss": 0.9932, "step": 1239 }, { - "epoch": 0.4528026291765565, - "grad_norm": 0.943283200263977, - "learning_rate": 3.989341639823042e-05, - "loss": 1.0042, + "epoch": 0.21512838306731435, + "grad_norm": 1.0329878330230713, + "learning_rate": 2.1509106678230706e-05, + "loss": 0.8184, "step": 1240 }, { - "epoch": 0.45316779258718276, - "grad_norm": 1.0475043058395386, - "learning_rate": 3.9891932269598414e-05, - "loss": 0.9996, + "epoch": 0.21530187369882026, + "grad_norm": 0.8765996694564819, + "learning_rate": 2.152645273200347e-05, + "loss": 0.9539, "step": 1241 }, { - "epoch": 0.453532955997809, - "grad_norm": 1.1262457370758057, - "learning_rate": 3.989043790736547e-05, - "loss": 1.0276, + "epoch": 0.21547536433032616, + "grad_norm": 0.8750390410423279, + "learning_rate": 2.154379878577624e-05, + "loss": 0.9414, "step": 1242 }, { - "epoch": 0.4538981194084353, - "grad_norm": 1.4499996900558472, - "learning_rate": 3.988893331230038e-05, - "loss": 1.0776, + "epoch": 0.21564885496183206, + "grad_norm": 1.1806519031524658, + "learning_rate": 2.1561144839549004e-05, + "loss": 0.8137, "step": 1243 }, { - "epoch": 0.45426328281906153, - "grad_norm": 1.1179189682006836, - "learning_rate": 3.9887418485177175e-05, - "loss": 1.0482, + "epoch": 0.21582234559333796, + "grad_norm": 1.2203469276428223, + "learning_rate": 2.157849089332177e-05, + "loss": 0.8262, "step": 1244 }, { - "epoch": 0.4546284462296878, - "grad_norm": 1.635564923286438, - "learning_rate": 3.9885893426775204e-05, - "loss": 1.0382, + "epoch": 0.21599583622484386, + "grad_norm": 1.1158983707427979, + "learning_rate": 2.1595836947094538e-05, + "loss": 0.7612, "step": 1245 }, { - "epoch": 0.45499360964031405, - "grad_norm": 1.20588219165802, - "learning_rate": 3.988435813787904e-05, - "loss": 1.0381, + "epoch": 0.21616932685634976, + "grad_norm": 0.8536527752876282, + "learning_rate": 2.1613183000867303e-05, + "loss": 0.9202, "step": 1246 }, { - "epoch": 0.4553587730509403, - "grad_norm": 2.0726113319396973, - "learning_rate": 3.988281261927852e-05, - "loss": 1.001, + "epoch": 0.21634281748785567, + "grad_norm": 0.9013782739639282, + "learning_rate": 2.163052905464007e-05, + "loss": 0.8564, "step": 1247 }, { - "epoch": 0.45572393646156656, - "grad_norm": 1.650940179824829, - "learning_rate": 3.9881256871768756e-05, - "loss": 1.0586, + "epoch": 0.21651630811936157, + "grad_norm": 1.9464770555496216, + "learning_rate": 2.1647875108412836e-05, + "loss": 0.7404, "step": 1248 }, { - "epoch": 0.4560890998721928, - "grad_norm": 1.2380098104476929, - "learning_rate": 3.9879690896150114e-05, - "loss": 1.0189, + "epoch": 0.21668979875086744, + "grad_norm": 4.886880397796631, + "learning_rate": 2.1665221162185604e-05, + "loss": 0.959, "step": 1249 }, { - "epoch": 0.4564542632828191, - "grad_norm": 1.3324612379074097, - "learning_rate": 3.9878114693228236e-05, - "loss": 1.0095, + "epoch": 0.21686328938237334, + "grad_norm": 1.3555476665496826, + "learning_rate": 2.168256721595837e-05, + "loss": 0.9607, "step": 1250 }, { - "epoch": 0.45681942669344533, - "grad_norm": 1.3955351114273071, - "learning_rate": 3.9876528263813995e-05, - "loss": 1.0499, + "epoch": 0.21703678001387924, + "grad_norm": 1.7396936416625977, + "learning_rate": 2.1699913269731138e-05, + "loss": 0.9402, "step": 1251 }, { - "epoch": 0.4571845901040716, - "grad_norm": 1.2534099817276, - "learning_rate": 3.9874931608723566e-05, - "loss": 1.0502, + "epoch": 0.21721027064538515, + "grad_norm": 1.482854962348938, + "learning_rate": 2.1717259323503903e-05, + "loss": 0.7891, "step": 1252 }, { - "epoch": 0.45754975351469784, - "grad_norm": 1.0142911672592163, - "learning_rate": 3.9873324728778354e-05, - "loss": 0.9976, + "epoch": 0.21738376127689105, + "grad_norm": 0.9328083992004395, + "learning_rate": 2.173460537727667e-05, + "loss": 0.8308, "step": 1253 }, { - "epoch": 0.4579149169253241, - "grad_norm": 1.3869272470474243, - "learning_rate": 3.9871707624805037e-05, - "loss": 1.0258, + "epoch": 0.21755725190839695, + "grad_norm": 1.3100626468658447, + "learning_rate": 2.1751951431049436e-05, + "loss": 0.7667, "step": 1254 }, { - "epoch": 0.45828008033595036, - "grad_norm": 1.2351263761520386, - "learning_rate": 3.987008029763555e-05, - "loss": 1.0399, + "epoch": 0.21773074253990285, + "grad_norm": 1.6620234251022339, + "learning_rate": 2.17692974848222e-05, + "loss": 0.7672, "step": 1255 }, { - "epoch": 0.4586452437465766, - "grad_norm": 1.6822501420974731, - "learning_rate": 3.9868442748107076e-05, - "loss": 1.02, + "epoch": 0.21790423317140875, + "grad_norm": 3.293863534927368, + "learning_rate": 2.178664353859497e-05, + "loss": 0.8353, "step": 1256 }, { - "epoch": 0.45901040715720287, - "grad_norm": 2.145875930786133, - "learning_rate": 3.9866794977062086e-05, - "loss": 1.0402, + "epoch": 0.21807772380291465, + "grad_norm": 1.054571270942688, + "learning_rate": 2.180398959236774e-05, + "loss": 0.7363, "step": 1257 }, { - "epoch": 0.45937557056782913, - "grad_norm": 1.9201343059539795, - "learning_rate": 3.986513698534829e-05, - "loss": 0.9998, + "epoch": 0.21825121443442055, + "grad_norm": 1.420456886291504, + "learning_rate": 2.1821335646140506e-05, + "loss": 0.9019, "step": 1258 }, { - "epoch": 0.4597407339784554, - "grad_norm": 1.4489502906799316, - "learning_rate": 3.9863468773818646e-05, - "loss": 1.0493, + "epoch": 0.21842470506592643, + "grad_norm": 2.0792715549468994, + "learning_rate": 2.1838681699913275e-05, + "loss": 0.938, "step": 1259 }, { - "epoch": 0.46010589738908164, - "grad_norm": 1.319622278213501, - "learning_rate": 3.986179034333139e-05, - "loss": 0.9801, + "epoch": 0.21859819569743233, + "grad_norm": 1.3484419584274292, + "learning_rate": 2.185602775368604e-05, + "loss": 0.9351, "step": 1260 }, { - "epoch": 0.46047106079970784, - "grad_norm": 1.120734453201294, - "learning_rate": 3.986010169475002e-05, - "loss": 0.9667, + "epoch": 0.21877168632893823, + "grad_norm": 1.4846261739730835, + "learning_rate": 2.1873373807458808e-05, + "loss": 0.8425, "step": 1261 }, { - "epoch": 0.4608362242103341, - "grad_norm": 1.3661092519760132, - "learning_rate": 3.985840282894325e-05, - "loss": 0.979, + "epoch": 0.21894517696044413, + "grad_norm": 2.935652732849121, + "learning_rate": 2.1890719861231573e-05, + "loss": 0.863, "step": 1262 }, { - "epoch": 0.46120138762096036, - "grad_norm": 1.142958641052246, - "learning_rate": 3.9856693746785095e-05, - "loss": 0.9921, + "epoch": 0.21911866759195003, + "grad_norm": 0.9013956785202026, + "learning_rate": 2.1908065915004338e-05, + "loss": 0.8469, "step": 1263 }, { - "epoch": 0.4615665510315866, - "grad_norm": 1.2640470266342163, - "learning_rate": 3.9854974449154805e-05, - "loss": 0.9951, + "epoch": 0.21929215822345594, + "grad_norm": 1.4698193073272705, + "learning_rate": 2.1925411968777107e-05, + "loss": 0.9685, "step": 1264 }, { - "epoch": 0.46193171444221287, - "grad_norm": 1.6640262603759766, - "learning_rate": 3.985324493693689e-05, - "loss": 1.0399, + "epoch": 0.21946564885496184, + "grad_norm": 0.986090362071991, + "learning_rate": 2.194275802254987e-05, + "loss": 0.9636, "step": 1265 }, { - "epoch": 0.4622968778528391, - "grad_norm": 1.045358657836914, - "learning_rate": 3.985150521102113e-05, - "loss": 1.0364, + "epoch": 0.21963913948646774, + "grad_norm": 1.097683072090149, + "learning_rate": 2.196010407632264e-05, + "loss": 0.811, "step": 1266 }, { - "epoch": 0.4626620412634654, - "grad_norm": 1.4429734945297241, - "learning_rate": 3.9849755272302515e-05, - "loss": 1.0359, + "epoch": 0.21981263011797364, + "grad_norm": 1.3839457035064697, + "learning_rate": 2.1977450130095405e-05, + "loss": 0.7795, "step": 1267 }, { - "epoch": 0.46302720467409164, - "grad_norm": 1.2711185216903687, - "learning_rate": 3.984799512168134e-05, - "loss": 1.0161, + "epoch": 0.21998612074947954, + "grad_norm": 0.9724128842353821, + "learning_rate": 2.1994796183868173e-05, + "loss": 0.8928, "step": 1268 }, { - "epoch": 0.4633923680847179, - "grad_norm": 1.3700382709503174, - "learning_rate": 3.9846224760063125e-05, - "loss": 0.9888, + "epoch": 0.22015961138098541, + "grad_norm": 1.8776582479476929, + "learning_rate": 2.201214223764094e-05, + "loss": 1.0012, "step": 1269 }, { - "epoch": 0.46375753149534416, - "grad_norm": 1.278171420097351, - "learning_rate": 3.984444418835865e-05, - "loss": 1.0242, + "epoch": 0.22033310201249132, + "grad_norm": 1.7933049201965332, + "learning_rate": 2.2029488291413707e-05, + "loss": 0.897, "step": 1270 }, { - "epoch": 0.4641226949059704, - "grad_norm": 1.4254947900772095, - "learning_rate": 3.984265340748395e-05, - "loss": 1.0508, + "epoch": 0.22050659264399722, + "grad_norm": 1.5407241582870483, + "learning_rate": 2.2046834345186472e-05, + "loss": 0.8806, "step": 1271 }, { - "epoch": 0.46448785831659667, - "grad_norm": 1.36069917678833, - "learning_rate": 3.98408524183603e-05, - "loss": 1.0197, + "epoch": 0.22068008327550312, + "grad_norm": 1.2563272714614868, + "learning_rate": 2.206418039895924e-05, + "loss": 0.8354, "step": 1272 }, { - "epoch": 0.4648530217272229, - "grad_norm": 1.976660132408142, - "learning_rate": 3.983904122191425e-05, - "loss": 1.0334, + "epoch": 0.22085357390700902, + "grad_norm": 1.5789728164672852, + "learning_rate": 2.2081526452732005e-05, + "loss": 0.865, "step": 1273 }, { - "epoch": 0.4652181851378492, - "grad_norm": 1.1975443363189697, - "learning_rate": 3.9837219819077584e-05, - "loss": 1.026, + "epoch": 0.22102706453851492, + "grad_norm": 1.270313024520874, + "learning_rate": 2.209887250650477e-05, + "loss": 0.7329, "step": 1274 }, { - "epoch": 0.46558334854847544, - "grad_norm": 1.6423403024673462, - "learning_rate": 3.983538821078734e-05, - "loss": 1.0098, + "epoch": 0.22120055517002082, + "grad_norm": 3.244511604309082, + "learning_rate": 2.211621856027754e-05, + "loss": 0.8625, "step": 1275 }, { - "epoch": 0.4659485119591017, - "grad_norm": 1.6539251804351807, - "learning_rate": 3.98335463979858e-05, - "loss": 1.0072, + "epoch": 0.22137404580152673, + "grad_norm": 1.2354989051818848, + "learning_rate": 2.2133564614050303e-05, + "loss": 0.8162, "step": 1276 }, { - "epoch": 0.46631367536972795, - "grad_norm": 2.0238630771636963, - "learning_rate": 3.9831694381620513e-05, - "loss": 1.0327, + "epoch": 0.22154753643303263, + "grad_norm": 0.9207253456115723, + "learning_rate": 2.2150910667823072e-05, + "loss": 0.9258, "step": 1277 }, { - "epoch": 0.4666788387803542, - "grad_norm": 1.216406226158142, - "learning_rate": 3.982983216264427e-05, - "loss": 0.9807, + "epoch": 0.22172102706453853, + "grad_norm": 0.7436503171920776, + "learning_rate": 2.2168256721595837e-05, + "loss": 0.9846, "step": 1278 }, { - "epoch": 0.46704400219098047, - "grad_norm": 1.065962791442871, - "learning_rate": 3.982795974201509e-05, - "loss": 1.045, + "epoch": 0.2218945176960444, + "grad_norm": 1.502890944480896, + "learning_rate": 2.2185602775368605e-05, + "loss": 0.8555, "step": 1279 }, { - "epoch": 0.4674091656016067, - "grad_norm": 1.716241717338562, - "learning_rate": 3.982607712069627e-05, - "loss": 0.9836, + "epoch": 0.2220680083275503, + "grad_norm": 0.9581775665283203, + "learning_rate": 2.220294882914137e-05, + "loss": 0.7642, "step": 1280 }, { - "epoch": 0.467774329012233, - "grad_norm": 1.7508296966552734, - "learning_rate": 3.982418429965635e-05, - "loss": 1.0267, + "epoch": 0.2222414989590562, + "grad_norm": 1.2616033554077148, + "learning_rate": 2.222029488291414e-05, + "loss": 0.8621, "step": 1281 }, { - "epoch": 0.46813949242285924, - "grad_norm": 1.2000401020050049, - "learning_rate": 3.982228127986909e-05, - "loss": 0.9669, + "epoch": 0.2224149895905621, + "grad_norm": 1.6869112253189087, + "learning_rate": 2.2237640936686904e-05, + "loss": 0.8728, "step": 1282 }, { - "epoch": 0.4685046558334855, - "grad_norm": 1.5883054733276367, - "learning_rate": 3.9820368062313546e-05, - "loss": 1.0332, + "epoch": 0.222588480222068, + "grad_norm": 1.3160701990127563, + "learning_rate": 2.2254986990459672e-05, + "loss": 0.6907, "step": 1283 }, { - "epoch": 0.46886981924411175, - "grad_norm": 1.3146636486053467, - "learning_rate": 3.981844464797397e-05, - "loss": 1.0253, + "epoch": 0.2227619708535739, + "grad_norm": 1.0710477828979492, + "learning_rate": 2.2272333044232437e-05, + "loss": 0.9072, "step": 1284 }, { - "epoch": 0.469234982654738, - "grad_norm": 2.104714870452881, - "learning_rate": 3.981651103783988e-05, - "loss": 1.0232, + "epoch": 0.2229354614850798, + "grad_norm": 1.0992356538772583, + "learning_rate": 2.2289679098005202e-05, + "loss": 0.7618, "step": 1285 }, { - "epoch": 0.46960014606536427, - "grad_norm": 2.3598854541778564, - "learning_rate": 3.9814567232906054e-05, - "loss": 0.9919, + "epoch": 0.2231089521165857, + "grad_norm": 2.3481509685516357, + "learning_rate": 2.230702515177797e-05, + "loss": 0.7991, "step": 1286 }, { - "epoch": 0.4699653094759905, - "grad_norm": 0.9598666429519653, - "learning_rate": 3.98126132341725e-05, - "loss": 0.976, + "epoch": 0.22328244274809161, + "grad_norm": 3.143268346786499, + "learning_rate": 2.2324371205550742e-05, + "loss": 0.8206, "step": 1287 }, { - "epoch": 0.4703304728866168, - "grad_norm": 1.803100824356079, - "learning_rate": 3.981064904264446e-05, - "loss": 1.0037, + "epoch": 0.22345593337959752, + "grad_norm": 1.1431220769882202, + "learning_rate": 2.2341717259323507e-05, + "loss": 0.6946, "step": 1288 }, { - "epoch": 0.47069563629724304, - "grad_norm": 3.304150342941284, - "learning_rate": 3.9808674659332445e-05, - "loss": 1.0522, + "epoch": 0.2236294240111034, + "grad_norm": 1.4438862800598145, + "learning_rate": 2.2359063313096276e-05, + "loss": 0.877, "step": 1289 }, { - "epoch": 0.4710607997078693, - "grad_norm": 1.86021888256073, - "learning_rate": 3.9806690085252184e-05, - "loss": 1.0383, + "epoch": 0.2238029146426093, + "grad_norm": 0.8360527753829956, + "learning_rate": 2.237640936686904e-05, + "loss": 0.8779, "step": 1290 }, { - "epoch": 0.47142596311849555, - "grad_norm": 1.286505103111267, - "learning_rate": 3.980469532142467e-05, - "loss": 0.9794, + "epoch": 0.2239764052741152, + "grad_norm": 1.27219557762146, + "learning_rate": 2.2393755420641806e-05, + "loss": 0.8418, "step": 1291 }, { - "epoch": 0.4717911265291218, - "grad_norm": 2.087522029876709, - "learning_rate": 3.980269036887613e-05, - "loss": 1.0045, + "epoch": 0.2241498959056211, + "grad_norm": 0.9910455942153931, + "learning_rate": 2.2411101474414574e-05, + "loss": 0.8086, "step": 1292 }, { - "epoch": 0.47215628993974806, - "grad_norm": 1.861139178276062, - "learning_rate": 3.980067522863802e-05, - "loss": 1.0271, + "epoch": 0.224323386537127, + "grad_norm": 0.8662088513374329, + "learning_rate": 2.242844752818734e-05, + "loss": 0.9282, "step": 1293 }, { - "epoch": 0.47252145335037427, - "grad_norm": 1.5117992162704468, - "learning_rate": 3.9798649901747064e-05, - "loss": 1.0419, + "epoch": 0.2244968771686329, + "grad_norm": 0.7254410982131958, + "learning_rate": 2.2445793581960107e-05, + "loss": 0.9834, "step": 1294 }, { - "epoch": 0.4728866167610005, - "grad_norm": 3.4195640087127686, - "learning_rate": 3.9796614389245205e-05, - "loss": 1.0601, + "epoch": 0.2246703678001388, + "grad_norm": 1.1039694547653198, + "learning_rate": 2.2463139635732872e-05, + "loss": 0.7863, "step": 1295 }, { - "epoch": 0.4732517801716268, - "grad_norm": 2.0240163803100586, - "learning_rate": 3.979456869217962e-05, - "loss": 1.0232, + "epoch": 0.2248438584316447, + "grad_norm": 0.9022557735443115, + "learning_rate": 2.248048568950564e-05, + "loss": 0.8479, "step": 1296 }, { - "epoch": 0.47361694358225304, - "grad_norm": 1.2953453063964844, - "learning_rate": 3.979251281160277e-05, - "loss": 1.0372, + "epoch": 0.2250173490631506, + "grad_norm": 0.9574583172798157, + "learning_rate": 2.2497831743278406e-05, + "loss": 0.811, "step": 1297 }, { - "epoch": 0.4739821069928793, - "grad_norm": 1.6462711095809937, - "learning_rate": 3.979044674857228e-05, - "loss": 1.0243, + "epoch": 0.22519083969465647, + "grad_norm": 0.9563328623771667, + "learning_rate": 2.2515177797051174e-05, + "loss": 0.8413, "step": 1298 }, { - "epoch": 0.47434727040350555, - "grad_norm": 3.6176419258117676, - "learning_rate": 3.978837050415109e-05, - "loss": 1.0291, + "epoch": 0.22536433032616238, + "grad_norm": 3.308180093765259, + "learning_rate": 2.253252385082394e-05, + "loss": 0.8152, "step": 1299 }, { - "epoch": 0.4747124338141318, - "grad_norm": 1.4731038808822632, - "learning_rate": 3.9786284079407325e-05, - "loss": 0.9845, + "epoch": 0.22553782095766828, + "grad_norm": 1.843286156654358, + "learning_rate": 2.2549869904596708e-05, + "loss": 0.8286, "step": 1300 }, { - "epoch": 0.47507759722475806, - "grad_norm": 1.9942798614501953, - "learning_rate": 3.978418747541438e-05, - "loss": 1.0099, + "epoch": 0.22571131158917418, + "grad_norm": 0.843877375125885, + "learning_rate": 2.2567215958369473e-05, + "loss": 0.8396, "step": 1301 }, { - "epoch": 0.4754427606353843, - "grad_norm": 1.667412281036377, - "learning_rate": 3.9782080693250875e-05, - "loss": 0.9912, + "epoch": 0.22588480222068008, + "grad_norm": 1.0161038637161255, + "learning_rate": 2.258456201214224e-05, + "loss": 0.8374, "step": 1302 }, { - "epoch": 0.4758079240460106, - "grad_norm": 1.290334701538086, - "learning_rate": 3.977996373400066e-05, - "loss": 1.0461, + "epoch": 0.22605829285218598, + "grad_norm": 1.4490329027175903, + "learning_rate": 2.2601908065915006e-05, + "loss": 0.8459, "step": 1303 }, { - "epoch": 0.47617308745663683, - "grad_norm": 1.5926707983016968, - "learning_rate": 3.9777836598752814e-05, - "loss": 1.0197, + "epoch": 0.22623178348369188, + "grad_norm": 1.2153831720352173, + "learning_rate": 2.261925411968777e-05, + "loss": 1.0286, "step": 1304 }, { - "epoch": 0.4765382508672631, - "grad_norm": 1.3383557796478271, - "learning_rate": 3.977569928860168e-05, - "loss": 1.0111, + "epoch": 0.22640527411519779, + "grad_norm": 1.2392733097076416, + "learning_rate": 2.263660017346054e-05, + "loss": 0.8638, "step": 1305 }, { - "epoch": 0.47690341427788935, - "grad_norm": 1.2263858318328857, - "learning_rate": 3.977355180464681e-05, - "loss": 1.0026, + "epoch": 0.2265787647467037, + "grad_norm": 3.0440316200256348, + "learning_rate": 2.2653946227233304e-05, + "loss": 0.8967, "step": 1306 }, { - "epoch": 0.4772685776885156, - "grad_norm": 1.5220201015472412, - "learning_rate": 3.9771394147993e-05, - "loss": 1.0087, + "epoch": 0.2267522553782096, + "grad_norm": 1.2699401378631592, + "learning_rate": 2.2671292281006073e-05, + "loss": 0.8132, "step": 1307 }, { - "epoch": 0.47763374109914186, - "grad_norm": 1.3456478118896484, - "learning_rate": 3.976922631975028e-05, - "loss": 1.0485, + "epoch": 0.22692574600971546, + "grad_norm": 1.0204271078109741, + "learning_rate": 2.2688638334778838e-05, + "loss": 0.8711, "step": 1308 }, { - "epoch": 0.4779989045097681, - "grad_norm": 1.1277402639389038, - "learning_rate": 3.97670483210339e-05, - "loss": 0.9915, + "epoch": 0.22709923664122136, + "grad_norm": 0.9696646928787231, + "learning_rate": 2.2705984388551606e-05, + "loss": 0.7366, "step": 1309 }, { - "epoch": 0.4783640679203944, - "grad_norm": 1.4186856746673584, - "learning_rate": 3.9764860152964365e-05, - "loss": 1.046, + "epoch": 0.22727272727272727, + "grad_norm": 1.3806955814361572, + "learning_rate": 2.272333044232437e-05, + "loss": 0.9385, "step": 1310 }, { - "epoch": 0.47872923133102063, - "grad_norm": 1.265997290611267, - "learning_rate": 3.9762661816667404e-05, - "loss": 1.0322, + "epoch": 0.22744621790423317, + "grad_norm": 0.8159229159355164, + "learning_rate": 2.274067649609714e-05, + "loss": 0.9524, "step": 1311 }, { - "epoch": 0.4790943947416469, - "grad_norm": 1.2460129261016846, - "learning_rate": 3.9760453313273954e-05, - "loss": 1.0034, + "epoch": 0.22761970853573907, + "grad_norm": 1.1370463371276855, + "learning_rate": 2.2758022549869905e-05, + "loss": 0.9695, "step": 1312 }, { - "epoch": 0.47945955815227315, - "grad_norm": 1.6398743391036987, - "learning_rate": 3.9758234643920214e-05, - "loss": 0.9611, + "epoch": 0.22779319916724497, + "grad_norm": 0.9748329520225525, + "learning_rate": 2.2775368603642673e-05, + "loss": 0.8635, "step": 1313 }, { - "epoch": 0.4798247215628994, - "grad_norm": 1.0905705690383911, - "learning_rate": 3.9756005809747604e-05, - "loss": 1.0405, + "epoch": 0.22796668979875087, + "grad_norm": 1.0955657958984375, + "learning_rate": 2.2792714657415438e-05, + "loss": 0.8835, "step": 1314 }, { - "epoch": 0.48018988497352566, - "grad_norm": 1.249118447303772, - "learning_rate": 3.9753766811902756e-05, - "loss": 0.975, + "epoch": 0.22814018043025677, + "grad_norm": 0.951465368270874, + "learning_rate": 2.2810060711188203e-05, + "loss": 0.7693, "step": 1315 }, { - "epoch": 0.4805550483841519, - "grad_norm": 1.3340857028961182, - "learning_rate": 3.975151765153756e-05, - "loss": 1.0165, + "epoch": 0.22831367106176267, + "grad_norm": 0.7842404246330261, + "learning_rate": 2.2827406764960975e-05, + "loss": 0.7966, "step": 1316 }, { - "epoch": 0.4809202117947782, - "grad_norm": 1.1250171661376953, - "learning_rate": 3.9749258329809104e-05, - "loss": 1.0139, + "epoch": 0.22848716169326858, + "grad_norm": 0.9946502447128296, + "learning_rate": 2.2844752818733743e-05, + "loss": 0.8064, "step": 1317 }, { - "epoch": 0.48128537520540443, - "grad_norm": 1.5478105545043945, - "learning_rate": 3.974698884787973e-05, - "loss": 1.0087, + "epoch": 0.22866065232477445, + "grad_norm": 1.8226921558380127, + "learning_rate": 2.2862098872506508e-05, + "loss": 0.8303, "step": 1318 }, { - "epoch": 0.4816505386160307, - "grad_norm": 1.148398518562317, - "learning_rate": 3.974470920691699e-05, - "loss": 1.0024, + "epoch": 0.22883414295628035, + "grad_norm": 0.8996050357818604, + "learning_rate": 2.2879444926279276e-05, + "loss": 0.8723, "step": 1319 }, { - "epoch": 0.48201570202665694, - "grad_norm": 1.4410706758499146, - "learning_rate": 3.974241940809367e-05, - "loss": 1.0226, + "epoch": 0.22900763358778625, + "grad_norm": 1.172995686531067, + "learning_rate": 2.289679098005204e-05, + "loss": 0.855, "step": 1320 }, { - "epoch": 0.4823808654372832, - "grad_norm": 1.0901566743850708, - "learning_rate": 3.9740119452587784e-05, - "loss": 0.9792, + "epoch": 0.22918112421929215, + "grad_norm": 0.9299336671829224, + "learning_rate": 2.2914137033824806e-05, + "loss": 0.7991, "step": 1321 }, { - "epoch": 0.48274602884790946, - "grad_norm": 0.9938544034957886, - "learning_rate": 3.9737809341582545e-05, - "loss": 0.986, + "epoch": 0.22935461485079806, + "grad_norm": 1.085505723953247, + "learning_rate": 2.2931483087597575e-05, + "loss": 0.8513, "step": 1322 }, { - "epoch": 0.4831111922585357, - "grad_norm": 1.7326453924179077, - "learning_rate": 3.973548907626644e-05, - "loss": 1.0007, + "epoch": 0.22952810548230396, + "grad_norm": 1.1205852031707764, + "learning_rate": 2.294882914137034e-05, + "loss": 0.7717, "step": 1323 }, { - "epoch": 0.483476355669162, - "grad_norm": 1.0866512060165405, - "learning_rate": 3.973315865783314e-05, - "loss": 1.0046, + "epoch": 0.22970159611380986, + "grad_norm": 1.7333638668060303, + "learning_rate": 2.2966175195143108e-05, + "loss": 0.8086, "step": 1324 }, { - "epoch": 0.48384151907978823, - "grad_norm": 1.3494595289230347, - "learning_rate": 3.9730818087481554e-05, - "loss": 1.033, + "epoch": 0.22987508674531576, + "grad_norm": 1.0828955173492432, + "learning_rate": 2.2983521248915873e-05, + "loss": 0.877, "step": 1325 }, { - "epoch": 0.4842066824904145, - "grad_norm": 1.138445496559143, - "learning_rate": 3.9728467366415815e-05, - "loss": 1.0153, + "epoch": 0.23004857737682166, + "grad_norm": 1.0463714599609375, + "learning_rate": 2.300086730268864e-05, + "loss": 0.877, "step": 1326 }, { - "epoch": 0.48457184590104074, - "grad_norm": 1.1589608192443848, - "learning_rate": 3.972610649584526e-05, - "loss": 0.9874, + "epoch": 0.23022206800832756, + "grad_norm": 1.275964617729187, + "learning_rate": 2.3018213356461407e-05, + "loss": 0.8052, "step": 1327 }, { - "epoch": 0.48493700931166694, - "grad_norm": 1.4395620822906494, - "learning_rate": 3.972373547698448e-05, - "loss": 0.9927, + "epoch": 0.23039555863983344, + "grad_norm": 1.1356350183486938, + "learning_rate": 2.3035559410234175e-05, + "loss": 0.7275, "step": 1328 }, { - "epoch": 0.4853021727222932, - "grad_norm": 1.3195831775665283, - "learning_rate": 3.9721354311053256e-05, - "loss": 1.0042, + "epoch": 0.23056904927133934, + "grad_norm": 0.9226983785629272, + "learning_rate": 2.305290546400694e-05, + "loss": 0.864, "step": 1329 }, { - "epoch": 0.48566733613291946, - "grad_norm": 1.2380083799362183, - "learning_rate": 3.971896299927661e-05, - "loss": 1.0455, + "epoch": 0.23074253990284524, + "grad_norm": 0.9160507321357727, + "learning_rate": 2.307025151777971e-05, + "loss": 0.9629, "step": 1330 }, { - "epoch": 0.4860324995435457, - "grad_norm": 1.563293695449829, - "learning_rate": 3.971656154288477e-05, - "loss": 0.9868, + "epoch": 0.23091603053435114, + "grad_norm": 1.0793954133987427, + "learning_rate": 2.3087597571552473e-05, + "loss": 0.9014, "step": 1331 }, { - "epoch": 0.48639766295417197, - "grad_norm": 1.0185102224349976, - "learning_rate": 3.97141499431132e-05, - "loss": 1.0244, + "epoch": 0.23108952116585704, + "grad_norm": 1.1329586505889893, + "learning_rate": 2.3104943625325242e-05, + "loss": 0.9795, "step": 1332 }, { - "epoch": 0.48676282636479823, - "grad_norm": 2.4113428592681885, - "learning_rate": 3.971172820120256e-05, - "loss": 0.9769, + "epoch": 0.23126301179736294, + "grad_norm": 0.783107340335846, + "learning_rate": 2.3122289679098007e-05, + "loss": 0.9343, "step": 1333 }, { - "epoch": 0.4871279897754245, - "grad_norm": 1.0562794208526611, - "learning_rate": 3.970929631839874e-05, - "loss": 1.009, + "epoch": 0.23143650242886885, + "grad_norm": 0.8025109767913818, + "learning_rate": 2.3139635732870772e-05, + "loss": 0.9409, "step": 1334 }, { - "epoch": 0.48749315318605074, - "grad_norm": 1.4912586212158203, - "learning_rate": 3.9706854295952856e-05, - "loss": 0.9768, + "epoch": 0.23160999306037475, + "grad_norm": 1.2028032541275024, + "learning_rate": 2.315698178664354e-05, + "loss": 0.8364, "step": 1335 }, { - "epoch": 0.487858316596677, - "grad_norm": 1.529910683631897, - "learning_rate": 3.9704402135121214e-05, - "loss": 1.0206, + "epoch": 0.23178348369188065, + "grad_norm": 0.9573748707771301, + "learning_rate": 2.3174327840416305e-05, + "loss": 0.8208, "step": 1336 }, { - "epoch": 0.48822348000730326, - "grad_norm": 1.6169029474258423, - "learning_rate": 3.970193983716537e-05, - "loss": 1.0686, + "epoch": 0.23195697432338655, + "grad_norm": 1.1717379093170166, + "learning_rate": 2.3191673894189074e-05, + "loss": 0.7094, "step": 1337 }, { - "epoch": 0.4885886434179295, - "grad_norm": 1.1456373929977417, - "learning_rate": 3.9699467403352066e-05, - "loss": 1.0398, + "epoch": 0.23213046495489242, + "grad_norm": 1.904675841331482, + "learning_rate": 2.320901994796184e-05, + "loss": 0.9062, "step": 1338 }, { - "epoch": 0.48895380682855577, - "grad_norm": 0.9989175796508789, - "learning_rate": 3.9696984834953274e-05, - "loss": 0.9904, + "epoch": 0.23230395558639833, + "grad_norm": 0.9160438776016235, + "learning_rate": 2.3226366001734607e-05, + "loss": 0.8992, "step": 1339 }, { - "epoch": 0.489318970239182, - "grad_norm": 1.2508445978164673, - "learning_rate": 3.969449213324617e-05, - "loss": 1.001, + "epoch": 0.23247744621790423, + "grad_norm": 0.9467340111732483, + "learning_rate": 2.3243712055507372e-05, + "loss": 0.8809, "step": 1340 }, { - "epoch": 0.4896841336498083, - "grad_norm": 0.9336028099060059, - "learning_rate": 3.969198929951316e-05, - "loss": 0.993, + "epoch": 0.23265093684941013, + "grad_norm": 0.9190947413444519, + "learning_rate": 2.326105810928014e-05, + "loss": 0.926, "step": 1341 }, { - "epoch": 0.49004929706043454, - "grad_norm": 1.069340467453003, - "learning_rate": 3.9689476335041844e-05, - "loss": 1.0267, + "epoch": 0.23282442748091603, + "grad_norm": 0.881736695766449, + "learning_rate": 2.3278404163052905e-05, + "loss": 0.7422, "step": 1342 }, { - "epoch": 0.4904144604710608, - "grad_norm": 1.2860420942306519, - "learning_rate": 3.9686953241125045e-05, - "loss": 1.0101, + "epoch": 0.23299791811242193, + "grad_norm": 0.9676398038864136, + "learning_rate": 2.3295750216825674e-05, + "loss": 0.6909, "step": 1343 }, { - "epoch": 0.49077962388168705, - "grad_norm": 1.2218884229660034, - "learning_rate": 3.96844200190608e-05, - "loss": 0.9763, + "epoch": 0.23317140874392783, + "grad_norm": 1.7058171033859253, + "learning_rate": 2.331309627059844e-05, + "loss": 0.8984, "step": 1344 }, { - "epoch": 0.4911447872923133, - "grad_norm": 1.134279727935791, - "learning_rate": 3.968187667015233e-05, - "loss": 0.9855, + "epoch": 0.23334489937543373, + "grad_norm": 1.3067808151245117, + "learning_rate": 2.3330442324371204e-05, + "loss": 0.8191, "step": 1345 }, { - "epoch": 0.49150995070293957, - "grad_norm": 1.0788251161575317, - "learning_rate": 3.9679323195708095e-05, - "loss": 0.9741, + "epoch": 0.23351839000693964, + "grad_norm": 1.0901249647140503, + "learning_rate": 2.3347788378143976e-05, + "loss": 0.7742, "step": 1346 }, { - "epoch": 0.4918751141135658, - "grad_norm": 1.2616616487503052, - "learning_rate": 3.9676759597041765e-05, - "loss": 0.9969, + "epoch": 0.23369188063844554, + "grad_norm": 1.327304720878601, + "learning_rate": 2.3365134431916744e-05, + "loss": 0.9485, "step": 1347 }, { - "epoch": 0.4922402775241921, - "grad_norm": 1.703948974609375, - "learning_rate": 3.96741858754722e-05, - "loss": 1.0375, + "epoch": 0.2338653712699514, + "grad_norm": 1.272048830986023, + "learning_rate": 2.338248048568951e-05, + "loss": 0.8047, "step": 1348 }, { - "epoch": 0.49260544093481834, - "grad_norm": 1.095320701599121, - "learning_rate": 3.9671602032323475e-05, - "loss": 1.0018, + "epoch": 0.2340388619014573, + "grad_norm": 0.8990715146064758, + "learning_rate": 2.3399826539462277e-05, + "loss": 0.825, "step": 1349 }, { - "epoch": 0.4929706043454446, - "grad_norm": 0.985589325428009, - "learning_rate": 3.9669008068924885e-05, - "loss": 0.9711, + "epoch": 0.23421235253296321, + "grad_norm": 0.9674850702285767, + "learning_rate": 2.3417172593235042e-05, + "loss": 0.9128, "step": 1350 }, { - "epoch": 0.49333576775607085, - "grad_norm": 1.1995898485183716, - "learning_rate": 3.9666403986610904e-05, - "loss": 1.0397, + "epoch": 0.23438584316446912, + "grad_norm": 1.1514836549758911, + "learning_rate": 2.3434518647007807e-05, + "loss": 0.76, "step": 1351 }, { - "epoch": 0.4937009311666971, - "grad_norm": 1.3763004541397095, - "learning_rate": 3.9663789786721235e-05, - "loss": 1.0125, + "epoch": 0.23455933379597502, + "grad_norm": 0.9687449932098389, + "learning_rate": 2.3451864700780576e-05, + "loss": 0.7717, "step": 1352 }, { - "epoch": 0.49406609457732337, - "grad_norm": 2.183267593383789, - "learning_rate": 3.966116547060078e-05, - "loss": 0.9939, + "epoch": 0.23473282442748092, + "grad_norm": 1.189566731452942, + "learning_rate": 2.346921075455334e-05, + "loss": 0.7625, "step": 1353 }, { - "epoch": 0.4944312579879496, - "grad_norm": 1.32038152217865, - "learning_rate": 3.965853103959965e-05, - "loss": 1.0038, + "epoch": 0.23490631505898682, + "grad_norm": 0.7078501582145691, + "learning_rate": 2.348655680832611e-05, + "loss": 0.9124, "step": 1354 }, { - "epoch": 0.4947964213985759, - "grad_norm": 1.2144324779510498, - "learning_rate": 3.965588649507314e-05, - "loss": 0.9783, + "epoch": 0.23507980569049272, + "grad_norm": 0.8724525570869446, + "learning_rate": 2.3503902862098874e-05, + "loss": 0.8162, "step": 1355 }, { - "epoch": 0.49516158480920214, - "grad_norm": 2.0010688304901123, - "learning_rate": 3.965323183838177e-05, - "loss": 1.0637, + "epoch": 0.23525329632199862, + "grad_norm": 0.8340880870819092, + "learning_rate": 2.3521248915871643e-05, + "loss": 0.8376, "step": 1356 }, { - "epoch": 0.4955267482198284, - "grad_norm": 1.3796601295471191, - "learning_rate": 3.9650567070891256e-05, - "loss": 0.9628, + "epoch": 0.23542678695350452, + "grad_norm": 1.2912070751190186, + "learning_rate": 2.3538594969644408e-05, + "loss": 0.823, "step": 1357 }, { - "epoch": 0.49589191163045465, - "grad_norm": 1.3124300241470337, - "learning_rate": 3.964789219397252e-05, - "loss": 1.0189, + "epoch": 0.2356002775850104, + "grad_norm": 1.2448360919952393, + "learning_rate": 2.3555941023417176e-05, + "loss": 0.8672, "step": 1358 }, { - "epoch": 0.4962570750410809, - "grad_norm": 1.9950721263885498, - "learning_rate": 3.964520720900167e-05, - "loss": 1.0272, + "epoch": 0.2357737682165163, + "grad_norm": 0.9103480577468872, + "learning_rate": 2.357328707718994e-05, + "loss": 0.7834, "step": 1359 }, { - "epoch": 0.49662223845170717, - "grad_norm": 1.1976826190948486, - "learning_rate": 3.964251211736002e-05, - "loss": 1.0017, + "epoch": 0.2359472588480222, + "grad_norm": 1.6489218473434448, + "learning_rate": 2.359063313096271e-05, + "loss": 0.6786, "step": 1360 }, { - "epoch": 0.49698740186233337, - "grad_norm": 1.1701394319534302, - "learning_rate": 3.963980692043408e-05, - "loss": 1.0013, + "epoch": 0.2361207494795281, + "grad_norm": 1.0229804515838623, + "learning_rate": 2.3607979184735474e-05, + "loss": 0.7627, "step": 1361 }, { - "epoch": 0.4973525652729596, - "grad_norm": 1.0524539947509766, - "learning_rate": 3.963709161961559e-05, - "loss": 0.9984, + "epoch": 0.236294240111034, + "grad_norm": 0.8608489632606506, + "learning_rate": 2.362532523850824e-05, + "loss": 0.7842, "step": 1362 }, { - "epoch": 0.4977177286835859, - "grad_norm": 1.5126538276672363, - "learning_rate": 3.9634366216301445e-05, - "loss": 1.0213, + "epoch": 0.2364677307425399, + "grad_norm": 0.8253280520439148, + "learning_rate": 2.3642671292281008e-05, + "loss": 0.9617, "step": 1363 }, { - "epoch": 0.49808289209421214, - "grad_norm": 0.9199450016021729, - "learning_rate": 3.963163071189376e-05, - "loss": 0.9794, + "epoch": 0.2366412213740458, + "grad_norm": 0.78313148021698, + "learning_rate": 2.3660017346053773e-05, + "loss": 0.8938, "step": 1364 }, { - "epoch": 0.4984480555048384, - "grad_norm": 1.4226179122924805, - "learning_rate": 3.962888510779984e-05, - "loss": 1.033, + "epoch": 0.2368147120055517, + "grad_norm": 1.0798664093017578, + "learning_rate": 2.367736339982654e-05, + "loss": 0.8052, "step": 1365 }, { - "epoch": 0.49881321891546465, - "grad_norm": 1.623841404914856, - "learning_rate": 3.962612940543219e-05, - "loss": 1.0532, + "epoch": 0.2369882026370576, + "grad_norm": 0.7591267228126526, + "learning_rate": 2.3694709453599306e-05, + "loss": 1.0044, "step": 1366 }, { - "epoch": 0.4991783823260909, - "grad_norm": 1.1378660202026367, - "learning_rate": 3.962336360620851e-05, - "loss": 0.9766, + "epoch": 0.23716169326856348, + "grad_norm": 0.7405969500541687, + "learning_rate": 2.3712055507372074e-05, + "loss": 0.9658, "step": 1367 }, { - "epoch": 0.49954354573671716, - "grad_norm": 1.134692668914795, - "learning_rate": 3.962058771155169e-05, - "loss": 0.9658, + "epoch": 0.23733518390006939, + "grad_norm": 0.9639427661895752, + "learning_rate": 2.372940156114484e-05, + "loss": 0.832, "step": 1368 }, { - "epoch": 0.4999087091473434, - "grad_norm": 1.0753190517425537, - "learning_rate": 3.9617801722889815e-05, - "loss": 0.9769, + "epoch": 0.2375086745315753, + "grad_norm": 0.771919846534729, + "learning_rate": 2.3746747614917608e-05, + "loss": 0.8787, "step": 1369 }, { - "epoch": 0.5002738725579697, - "grad_norm": 1.695361614227295, - "learning_rate": 3.9615005641656175e-05, - "loss": 1.0011, + "epoch": 0.2376821651630812, + "grad_norm": 0.9164493680000305, + "learning_rate": 2.3764093668690373e-05, + "loss": 0.864, "step": 1370 }, { - "epoch": 0.5006390359685959, - "grad_norm": 1.4313452243804932, - "learning_rate": 3.961219946928923e-05, - "loss": 1.0487, + "epoch": 0.2378556557945871, + "grad_norm": 1.456276535987854, + "learning_rate": 2.378143972246314e-05, + "loss": 0.8921, "step": 1371 }, { - "epoch": 0.5010041993792222, - "grad_norm": 1.2225441932678223, - "learning_rate": 3.960938320723265e-05, - "loss": 0.9933, + "epoch": 0.238029146426093, + "grad_norm": 1.1949793100357056, + "learning_rate": 2.3798785776235906e-05, + "loss": 0.9072, "step": 1372 }, { - "epoch": 0.5013693627898484, - "grad_norm": 1.492661714553833, - "learning_rate": 3.960655685693528e-05, - "loss": 1.0365, + "epoch": 0.2382026370575989, + "grad_norm": 1.170979380607605, + "learning_rate": 2.3816131830008675e-05, + "loss": 0.7771, "step": 1373 }, { - "epoch": 0.5017345262004748, - "grad_norm": 1.2588542699813843, - "learning_rate": 3.960372041985117e-05, - "loss": 0.9813, + "epoch": 0.2383761276891048, + "grad_norm": 1.8588552474975586, + "learning_rate": 2.383347788378144e-05, + "loss": 0.7976, "step": 1374 }, { - "epoch": 0.502099689611101, - "grad_norm": 1.4852863550186157, - "learning_rate": 3.960087389743955e-05, - "loss": 1.0198, + "epoch": 0.2385496183206107, + "grad_norm": 1.0293928384780884, + "learning_rate": 2.385082393755421e-05, + "loss": 0.8186, "step": 1375 }, { - "epoch": 0.5024648530217273, - "grad_norm": 1.150386095046997, - "learning_rate": 3.959801729116485e-05, - "loss": 1.0016, + "epoch": 0.2387231089521166, + "grad_norm": 0.8507135510444641, + "learning_rate": 2.3868169991326976e-05, + "loss": 0.8933, "step": 1376 }, { - "epoch": 0.5028300164323535, - "grad_norm": 2.0727345943450928, - "learning_rate": 3.959515060249666e-05, - "loss": 0.9579, + "epoch": 0.23889659958362247, + "grad_norm": 1.135013461112976, + "learning_rate": 2.3885516045099745e-05, + "loss": 0.8616, "step": 1377 }, { - "epoch": 0.5031951798429797, - "grad_norm": 1.2821109294891357, - "learning_rate": 3.959227383290981e-05, - "loss": 0.9886, + "epoch": 0.23907009021512837, + "grad_norm": 0.8193609714508057, + "learning_rate": 2.390286209887251e-05, + "loss": 0.855, "step": 1378 }, { - "epoch": 0.503560343253606, - "grad_norm": 4.803966045379639, - "learning_rate": 3.9589386983884245e-05, - "loss": 1.0007, + "epoch": 0.23924358084663427, + "grad_norm": 0.9920783638954163, + "learning_rate": 2.3920208152645278e-05, + "loss": 0.7046, "step": 1379 }, { - "epoch": 0.5039255066642322, - "grad_norm": 1.1743614673614502, - "learning_rate": 3.9586490056905155e-05, - "loss": 0.9923, + "epoch": 0.23941707147814018, + "grad_norm": 0.9920967221260071, + "learning_rate": 2.3937554206418043e-05, + "loss": 0.835, "step": 1380 }, { - "epoch": 0.5042906700748585, - "grad_norm": 0.9767692685127258, - "learning_rate": 3.958358305346289e-05, - "loss": 1.0062, + "epoch": 0.23959056210964608, + "grad_norm": 0.7946383357048035, + "learning_rate": 2.3954900260190808e-05, + "loss": 0.9019, "step": 1381 }, { - "epoch": 0.5046558334854847, - "grad_norm": 1.813043236732483, - "learning_rate": 3.958066597505299e-05, - "loss": 1.0188, + "epoch": 0.23976405274115198, + "grad_norm": 0.8107618093490601, + "learning_rate": 2.3972246313963577e-05, + "loss": 0.8372, "step": 1382 }, { - "epoch": 0.505020996896111, - "grad_norm": 1.3042802810668945, - "learning_rate": 3.957773882317615e-05, - "loss": 0.9504, + "epoch": 0.23993754337265788, + "grad_norm": 0.7465943098068237, + "learning_rate": 2.398959236773634e-05, + "loss": 0.9194, "step": 1383 }, { - "epoch": 0.5053861603067372, - "grad_norm": 1.1490743160247803, - "learning_rate": 3.957480159933831e-05, - "loss": 1.0043, + "epoch": 0.24011103400416378, + "grad_norm": 1.7401539087295532, + "learning_rate": 2.400693842150911e-05, + "loss": 1.0459, "step": 1384 }, { - "epoch": 0.5057513237173635, - "grad_norm": 1.2278079986572266, - "learning_rate": 3.957185430505052e-05, - "loss": 1.0054, + "epoch": 0.24028452463566968, + "grad_norm": 0.954939603805542, + "learning_rate": 2.4024284475281875e-05, + "loss": 0.7825, "step": 1385 }, { - "epoch": 0.5061164871279897, - "grad_norm": 1.0399527549743652, - "learning_rate": 3.9568896941829076e-05, - "loss": 0.922, + "epoch": 0.24045801526717558, + "grad_norm": 1.0076252222061157, + "learning_rate": 2.4041630529054643e-05, + "loss": 0.762, "step": 1386 }, { - "epoch": 0.506481650538616, - "grad_norm": 1.0321624279022217, - "learning_rate": 3.9565929511195395e-05, - "loss": 0.9803, + "epoch": 0.24063150589868146, + "grad_norm": 0.8649041652679443, + "learning_rate": 2.405897658282741e-05, + "loss": 0.8196, "step": 1387 }, { - "epoch": 0.5068468139492422, - "grad_norm": 1.2660083770751953, - "learning_rate": 3.9562952014676116e-05, - "loss": 1.0146, + "epoch": 0.24080499653018736, + "grad_norm": 1.2306489944458008, + "learning_rate": 2.4076322636600177e-05, + "loss": 0.7761, "step": 1388 }, { - "epoch": 0.5072119773598686, - "grad_norm": 1.2195584774017334, - "learning_rate": 3.955996445380303e-05, - "loss": 1.0433, + "epoch": 0.24097848716169326, + "grad_norm": 1.803061842918396, + "learning_rate": 2.4093668690372942e-05, + "loss": 0.8347, "step": 1389 }, { - "epoch": 0.5075771407704948, - "grad_norm": 1.2831250429153442, - "learning_rate": 3.955696683011314e-05, - "loss": 0.9762, + "epoch": 0.24115197779319916, + "grad_norm": 0.9742139577865601, + "learning_rate": 2.411101474414571e-05, + "loss": 0.8391, "step": 1390 }, { - "epoch": 0.5079423041811211, - "grad_norm": 0.79669588804245, - "learning_rate": 3.9553959145148585e-05, - "loss": 0.9715, + "epoch": 0.24132546842470506, + "grad_norm": 1.140151023864746, + "learning_rate": 2.4128360797918475e-05, + "loss": 0.7454, "step": 1391 }, { - "epoch": 0.5083074675917473, - "grad_norm": 1.4404723644256592, - "learning_rate": 3.955094140045669e-05, - "loss": 0.9923, + "epoch": 0.24149895905621097, + "grad_norm": 0.8394145965576172, + "learning_rate": 2.414570685169124e-05, + "loss": 0.9104, "step": 1392 }, { - "epoch": 0.5086726310023736, - "grad_norm": 1.4139586687088013, - "learning_rate": 3.954791359758998e-05, - "loss": 1.0573, + "epoch": 0.24167244968771687, + "grad_norm": 0.9834004044532776, + "learning_rate": 2.416305290546401e-05, + "loss": 0.8445, "step": 1393 }, { - "epoch": 0.5090377944129998, - "grad_norm": 1.0949894189834595, - "learning_rate": 3.9544875738106136e-05, - "loss": 0.9819, + "epoch": 0.24184594031922277, + "grad_norm": 0.9821028113365173, + "learning_rate": 2.4180398959236774e-05, + "loss": 0.9119, "step": 1394 }, { - "epoch": 0.5094029578236261, - "grad_norm": 1.3531064987182617, - "learning_rate": 3.9541827823568016e-05, - "loss": 1.0094, + "epoch": 0.24201943095072867, + "grad_norm": 0.9467939734458923, + "learning_rate": 2.4197745013009542e-05, + "loss": 0.761, "step": 1395 }, { - "epoch": 0.5097681212342523, - "grad_norm": 1.866912841796875, - "learning_rate": 3.953876985554364e-05, - "loss": 1.0439, + "epoch": 0.24219292158223457, + "grad_norm": 1.0909706354141235, + "learning_rate": 2.4215091066782307e-05, + "loss": 0.8743, "step": 1396 }, { - "epoch": 0.5101332846448786, - "grad_norm": 1.6573362350463867, - "learning_rate": 3.953570183560621e-05, - "loss": 1.0237, + "epoch": 0.24236641221374045, + "grad_norm": 0.8646726012229919, + "learning_rate": 2.4232437120555075e-05, + "loss": 0.7656, "step": 1397 }, { - "epoch": 0.5104984480555048, - "grad_norm": 1.233207106590271, - "learning_rate": 3.953262376533412e-05, - "loss": 0.9905, + "epoch": 0.24253990284524635, + "grad_norm": 0.908886194229126, + "learning_rate": 2.424978317432784e-05, + "loss": 0.991, "step": 1398 }, { - "epoch": 0.5108636114661311, - "grad_norm": 1.2306886911392212, - "learning_rate": 3.9529535646310876e-05, - "loss": 1.0021, + "epoch": 0.24271339347675225, + "grad_norm": 0.9267173409461975, + "learning_rate": 2.426712922810061e-05, + "loss": 0.7677, "step": 1399 }, { - "epoch": 0.5112287748767573, - "grad_norm": 1.3753598928451538, - "learning_rate": 3.9526437480125227e-05, - "loss": 1.0542, + "epoch": 0.24288688410825815, + "grad_norm": 0.7766507267951965, + "learning_rate": 2.4284475281873374e-05, + "loss": 0.8328, "step": 1400 }, { - "epoch": 0.5115939382873836, - "grad_norm": 0.9914408922195435, - "learning_rate": 3.952332926837105e-05, - "loss": 0.999, + "epoch": 0.24306037473976405, + "grad_norm": 0.7757072448730469, + "learning_rate": 2.4301821335646142e-05, + "loss": 0.9866, "step": 1401 }, { - "epoch": 0.5119591016980098, - "grad_norm": 1.2988622188568115, - "learning_rate": 3.9520211012647366e-05, - "loss": 0.9921, + "epoch": 0.24323386537126995, + "grad_norm": 1.1244064569473267, + "learning_rate": 2.4319167389418907e-05, + "loss": 0.8994, "step": 1402 }, { - "epoch": 0.5123242651086362, - "grad_norm": 1.3345063924789429, - "learning_rate": 3.951708271455843e-05, - "loss": 1.0068, + "epoch": 0.24340735600277585, + "grad_norm": 0.9728024005889893, + "learning_rate": 2.4336513443191676e-05, + "loss": 0.8112, "step": 1403 }, { - "epoch": 0.5126894285192624, - "grad_norm": 1.7901772260665894, - "learning_rate": 3.95139443757136e-05, - "loss": 0.9984, + "epoch": 0.24358084663428176, + "grad_norm": 1.008533239364624, + "learning_rate": 2.435385949696444e-05, + "loss": 0.885, "step": 1404 }, { - "epoch": 0.5130545919298887, - "grad_norm": 2.972961187362671, - "learning_rate": 3.951079599772744e-05, - "loss": 1.0413, + "epoch": 0.24375433726578766, + "grad_norm": 0.8124392032623291, + "learning_rate": 2.4371205550737212e-05, + "loss": 0.8567, "step": 1405 }, { - "epoch": 0.5134197553405149, - "grad_norm": 1.5796719789505005, - "learning_rate": 3.950763758221966e-05, - "loss": 1.0128, + "epoch": 0.24392782789729356, + "grad_norm": 1.1923298835754395, + "learning_rate": 2.4388551604509977e-05, + "loss": 0.8499, "step": 1406 }, { - "epoch": 0.5137849187511412, - "grad_norm": 1.1167912483215332, - "learning_rate": 3.950446913081513e-05, - "loss": 1.0165, + "epoch": 0.24410131852879943, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.4405897658282746e-05, + "loss": 0.7695, "step": 1407 }, { - "epoch": 0.5141500821617674, - "grad_norm": 1.1835947036743164, - "learning_rate": 3.9501290645143905e-05, - "loss": 1.0471, + "epoch": 0.24427480916030533, + "grad_norm": 1.81326162815094, + "learning_rate": 2.442324371205551e-05, + "loss": 0.9187, "step": 1408 }, { - "epoch": 0.5145152455723937, - "grad_norm": 1.1647603511810303, - "learning_rate": 3.949810212684117e-05, - "loss": 0.9993, + "epoch": 0.24444829979181124, + "grad_norm": 1.6771832704544067, + "learning_rate": 2.444058976582828e-05, + "loss": 0.8242, "step": 1409 }, { - "epoch": 0.5148804089830199, - "grad_norm": 0.9810660481452942, - "learning_rate": 3.949490357754731e-05, - "loss": 0.9684, + "epoch": 0.24462179042331714, + "grad_norm": 0.8878164291381836, + "learning_rate": 2.4457935819601044e-05, + "loss": 0.8511, "step": 1410 }, { - "epoch": 0.5152455723936462, - "grad_norm": 1.442687749862671, - "learning_rate": 3.9491694998907835e-05, - "loss": 0.974, + "epoch": 0.24479528105482304, + "grad_norm": 0.9496182203292847, + "learning_rate": 2.447528187337381e-05, + "loss": 0.9341, "step": 1411 }, { - "epoch": 0.5156107358042724, - "grad_norm": 1.1547551155090332, - "learning_rate": 3.948847639257344e-05, - "loss": 1.0177, + "epoch": 0.24496877168632894, + "grad_norm": 1.014953851699829, + "learning_rate": 2.4492627927146577e-05, + "loss": 0.6633, "step": 1412 }, { - "epoch": 0.5159758992148986, - "grad_norm": 1.3633129596710205, - "learning_rate": 3.948524776019997e-05, - "loss": 1.0271, + "epoch": 0.24514226231783484, + "grad_norm": 0.8489567637443542, + "learning_rate": 2.4509973980919342e-05, + "loss": 0.7572, "step": 1413 }, { - "epoch": 0.5163410626255249, - "grad_norm": 1.350351095199585, - "learning_rate": 3.9482009103448415e-05, - "loss": 0.9753, + "epoch": 0.24531575294934074, + "grad_norm": 0.9119280576705933, + "learning_rate": 2.452732003469211e-05, + "loss": 0.885, "step": 1414 }, { - "epoch": 0.5167062260361511, - "grad_norm": 0.9496940970420837, - "learning_rate": 3.947876042398494e-05, - "loss": 0.995, + "epoch": 0.24548924358084664, + "grad_norm": 0.7408756613731384, + "learning_rate": 2.4544666088464876e-05, + "loss": 1.0251, "step": 1415 }, { - "epoch": 0.5170713894467774, - "grad_norm": 1.3523801565170288, - "learning_rate": 3.947550172348087e-05, - "loss": 1.0342, + "epoch": 0.24566273421235255, + "grad_norm": 0.882038950920105, + "learning_rate": 2.4562012142237644e-05, + "loss": 0.8604, "step": 1416 }, { - "epoch": 0.5174365528574036, - "grad_norm": 1.0836437940597534, - "learning_rate": 3.947223300361265e-05, - "loss": 0.9702, + "epoch": 0.24583622484385842, + "grad_norm": 0.9417827129364014, + "learning_rate": 2.457935819601041e-05, + "loss": 0.8242, "step": 1417 }, { - "epoch": 0.51780171626803, - "grad_norm": 1.2671315670013428, - "learning_rate": 3.946895426606194e-05, - "loss": 1.0079, + "epoch": 0.24600971547536432, + "grad_norm": 0.8975327014923096, + "learning_rate": 2.4596704249783178e-05, + "loss": 0.9609, "step": 1418 }, { - "epoch": 0.5181668796786562, - "grad_norm": 1.1432974338531494, - "learning_rate": 3.946566551251549e-05, - "loss": 0.9823, + "epoch": 0.24618320610687022, + "grad_norm": 0.9386401772499084, + "learning_rate": 2.4614050303555943e-05, + "loss": 0.8035, "step": 1419 }, { - "epoch": 0.5185320430892825, - "grad_norm": 3.2193729877471924, - "learning_rate": 3.946236674466524e-05, - "loss": 1.0337, + "epoch": 0.24635669673837612, + "grad_norm": 1.181177020072937, + "learning_rate": 2.463139635732871e-05, + "loss": 0.756, "step": 1420 }, { - "epoch": 0.5188972064999087, - "grad_norm": 0.9479828476905823, - "learning_rate": 3.945905796420828e-05, - "loss": 0.9775, + "epoch": 0.24653018736988203, + "grad_norm": 1.186638593673706, + "learning_rate": 2.4648742411101476e-05, + "loss": 0.7742, "step": 1421 }, { - "epoch": 0.519262369910535, - "grad_norm": 1.0697745084762573, - "learning_rate": 3.945573917284685e-05, - "loss": 0.9763, + "epoch": 0.24670367800138793, + "grad_norm": 0.7176553010940552, + "learning_rate": 2.466608846487424e-05, + "loss": 0.9177, "step": 1422 }, { - "epoch": 0.5196275333211612, - "grad_norm": 0.9012410640716553, - "learning_rate": 3.945241037228831e-05, - "loss": 0.9857, + "epoch": 0.24687716863289383, + "grad_norm": 0.8021829128265381, + "learning_rate": 2.468343451864701e-05, + "loss": 0.8247, "step": 1423 }, { - "epoch": 0.5199926967317875, - "grad_norm": 1.1790229082107544, - "learning_rate": 3.944907156424522e-05, - "loss": 0.9875, + "epoch": 0.24705065926439973, + "grad_norm": 0.8400259613990784, + "learning_rate": 2.4700780572419774e-05, + "loss": 0.8538, "step": 1424 }, { - "epoch": 0.5203578601424137, - "grad_norm": 1.0255002975463867, - "learning_rate": 3.9445722750435244e-05, - "loss": 1.0017, + "epoch": 0.24722414989590563, + "grad_norm": 0.8240237832069397, + "learning_rate": 2.4718126626192543e-05, + "loss": 0.9075, "step": 1425 }, { - "epoch": 0.52072302355304, - "grad_norm": 0.9206615686416626, - "learning_rate": 3.944236393258123e-05, - "loss": 0.9841, + "epoch": 0.24739764052741153, + "grad_norm": 0.8309081196784973, + "learning_rate": 2.4735472679965308e-05, + "loss": 0.7847, "step": 1426 }, { - "epoch": 0.5210881869636662, - "grad_norm": 1.266700029373169, - "learning_rate": 3.9438995112411144e-05, - "loss": 1.0413, + "epoch": 0.2475711311589174, + "grad_norm": 0.8432890176773071, + "learning_rate": 2.4752818733738076e-05, + "loss": 0.8621, "step": 1427 }, { - "epoch": 0.5214533503742925, - "grad_norm": 1.2661389112472534, - "learning_rate": 3.943561629165811e-05, - "loss": 0.9636, + "epoch": 0.2477446217904233, + "grad_norm": 1.605841875076294, + "learning_rate": 2.477016478751084e-05, + "loss": 0.7415, "step": 1428 }, { - "epoch": 0.5218185137849187, - "grad_norm": 1.0864936113357544, - "learning_rate": 3.94322274720604e-05, - "loss": 1.0127, + "epoch": 0.2479181124219292, + "grad_norm": 0.9571753740310669, + "learning_rate": 2.478751084128361e-05, + "loss": 0.6492, "step": 1429 }, { - "epoch": 0.522183677195545, - "grad_norm": 1.1362204551696777, - "learning_rate": 3.942882865536142e-05, - "loss": 0.9998, + "epoch": 0.2480916030534351, + "grad_norm": 0.8037225008010864, + "learning_rate": 2.4804856895056375e-05, + "loss": 0.9026, "step": 1430 }, { - "epoch": 0.5225488406061712, - "grad_norm": 1.544856071472168, - "learning_rate": 3.942541984330972e-05, - "loss": 0.9696, + "epoch": 0.248265093684941, + "grad_norm": 0.6853242516517639, + "learning_rate": 2.4822202948829143e-05, + "loss": 0.9861, "step": 1431 }, { - "epoch": 0.5229140040167976, - "grad_norm": 1.0038840770721436, - "learning_rate": 3.942200103765901e-05, - "loss": 0.9906, + "epoch": 0.24843858431644691, + "grad_norm": 0.7944977879524231, + "learning_rate": 2.4839549002601908e-05, + "loss": 0.9409, "step": 1432 }, { - "epoch": 0.5232791674274238, - "grad_norm": 0.9548418521881104, - "learning_rate": 3.941857224016812e-05, - "loss": 0.9646, + "epoch": 0.24861207494795282, + "grad_norm": 0.8900196552276611, + "learning_rate": 2.4856895056374676e-05, + "loss": 0.9141, "step": 1433 }, { - "epoch": 0.5236443308380501, - "grad_norm": 1.160529613494873, - "learning_rate": 3.941513345260104e-05, - "loss": 0.9844, + "epoch": 0.24878556557945872, + "grad_norm": 0.993699848651886, + "learning_rate": 2.4874241110147445e-05, + "loss": 0.8108, "step": 1434 }, { - "epoch": 0.5240094942486763, - "grad_norm": 1.1083402633666992, - "learning_rate": 3.941168467672687e-05, - "loss": 1.0458, + "epoch": 0.24895905621096462, + "grad_norm": 0.8940425515174866, + "learning_rate": 2.4891587163920213e-05, + "loss": 0.7539, "step": 1435 }, { - "epoch": 0.5243746576593026, - "grad_norm": 1.428874135017395, - "learning_rate": 3.940822591431988e-05, - "loss": 1.002, + "epoch": 0.24913254684247052, + "grad_norm": 1.9174562692642212, + "learning_rate": 2.4908933217692978e-05, + "loss": 0.7688, "step": 1436 }, { - "epoch": 0.5247398210699288, - "grad_norm": 2.062041759490967, - "learning_rate": 3.940475716715946e-05, - "loss": 1.0227, + "epoch": 0.2493060374739764, + "grad_norm": 1.1360152959823608, + "learning_rate": 2.4926279271465747e-05, + "loss": 0.7976, "step": 1437 }, { - "epoch": 0.5251049844805551, - "grad_norm": 1.1903979778289795, - "learning_rate": 3.9401278437030144e-05, - "loss": 1.0018, + "epoch": 0.2494795281054823, + "grad_norm": 1.1454304456710815, + "learning_rate": 2.494362532523851e-05, + "loss": 0.6716, "step": 1438 }, { - "epoch": 0.5254701478911813, - "grad_norm": 1.3673650026321411, - "learning_rate": 3.9397789725721594e-05, - "loss": 1.015, + "epoch": 0.2496530187369882, + "grad_norm": 0.9847767949104309, + "learning_rate": 2.496097137901128e-05, + "loss": 0.7949, "step": 1439 }, { - "epoch": 0.5258353113018076, - "grad_norm": 1.2105910778045654, - "learning_rate": 3.939429103502862e-05, - "loss": 1.0068, + "epoch": 0.2498265093684941, + "grad_norm": 0.7555945515632629, + "learning_rate": 2.4978317432784045e-05, + "loss": 0.8923, "step": 1440 }, { - "epoch": 0.5262004747124338, - "grad_norm": 0.8718734383583069, - "learning_rate": 3.939078236675115e-05, - "loss": 0.9789, + "epoch": 0.25, + "grad_norm": 1.0712836980819702, + "learning_rate": 2.499566348655681e-05, + "loss": 0.7236, "step": 1441 }, { - "epoch": 0.5265656381230601, - "grad_norm": 1.127640724182129, - "learning_rate": 3.938726372269425e-05, - "loss": 0.9595, + "epoch": 0.2501734906315059, + "grad_norm": 1.4039801359176636, + "learning_rate": 2.501300954032958e-05, + "loss": 0.8879, "step": 1442 }, { - "epoch": 0.5269308015336863, - "grad_norm": 0.9305435419082642, - "learning_rate": 3.9383735104668135e-05, - "loss": 0.9766, + "epoch": 0.2503469812630118, + "grad_norm": 0.9388163089752197, + "learning_rate": 2.5030355594102343e-05, + "loss": 0.7878, "step": 1443 }, { - "epoch": 0.5272959649443126, - "grad_norm": 1.4327954053878784, - "learning_rate": 3.9380196514488126e-05, - "loss": 1.0055, + "epoch": 0.2505204718945177, + "grad_norm": 0.8452283143997192, + "learning_rate": 2.5047701647875112e-05, + "loss": 0.8064, "step": 1444 }, { - "epoch": 0.5276611283549388, - "grad_norm": 0.8867698311805725, - "learning_rate": 3.937664795397469e-05, - "loss": 1.0013, + "epoch": 0.2506939625260236, + "grad_norm": 1.030892252922058, + "learning_rate": 2.5065047701647877e-05, + "loss": 0.9084, "step": 1445 }, { - "epoch": 0.528026291765565, - "grad_norm": 1.1874901056289673, - "learning_rate": 3.937308942495342e-05, - "loss": 1.0028, + "epoch": 0.2508674531575295, + "grad_norm": 1.4707410335540771, + "learning_rate": 2.5082393755420645e-05, + "loss": 0.8232, "step": 1446 }, { - "epoch": 0.5283914551761913, - "grad_norm": 1.7306830883026123, - "learning_rate": 3.936952092925503e-05, - "loss": 0.9984, + "epoch": 0.2510409437890354, + "grad_norm": 1.2931935787200928, + "learning_rate": 2.509973980919341e-05, + "loss": 0.7671, "step": 1447 }, { - "epoch": 0.5287566185868176, - "grad_norm": 1.5312912464141846, - "learning_rate": 3.9365942468715375e-05, - "loss": 0.9995, + "epoch": 0.2512144344205413, + "grad_norm": 0.8157335519790649, + "learning_rate": 2.511708586296618e-05, + "loss": 0.8047, "step": 1448 }, { - "epoch": 0.5291217819974439, - "grad_norm": 1.2866657972335815, - "learning_rate": 3.936235404517543e-05, - "loss": 0.9803, + "epoch": 0.2513879250520472, + "grad_norm": 0.8536398410797119, + "learning_rate": 2.5134431916738944e-05, + "loss": 0.8193, "step": 1449 }, { - "epoch": 0.5294869454080701, - "grad_norm": 0.9989070296287537, - "learning_rate": 3.935875566048129e-05, - "loss": 0.9822, + "epoch": 0.2515614156835531, + "grad_norm": 0.9467887878417969, + "learning_rate": 2.5151777970511712e-05, + "loss": 0.854, "step": 1450 }, { - "epoch": 0.5298521088186964, - "grad_norm": 1.2456800937652588, - "learning_rate": 3.935514731648418e-05, - "loss": 1.0199, + "epoch": 0.25173490631505896, + "grad_norm": 0.9875763058662415, + "learning_rate": 2.5169124024284477e-05, + "loss": 0.858, "step": 1451 }, { - "epoch": 0.5302172722293226, - "grad_norm": 1.6688508987426758, - "learning_rate": 3.935152901504045e-05, - "loss": 1.0244, + "epoch": 0.25190839694656486, + "grad_norm": 1.4338651895523071, + "learning_rate": 2.5186470078057242e-05, + "loss": 1.0125, "step": 1452 }, { - "epoch": 0.5305824356399489, - "grad_norm": 1.2286927700042725, - "learning_rate": 3.934790075801156e-05, - "loss": 0.9881, + "epoch": 0.25208188757807076, + "grad_norm": 0.8509945869445801, + "learning_rate": 2.520381613183001e-05, + "loss": 1.0525, "step": 1453 }, { - "epoch": 0.5309475990505751, - "grad_norm": 0.9670413732528687, - "learning_rate": 3.934426254726413e-05, - "loss": 0.9706, + "epoch": 0.25225537820957666, + "grad_norm": 1.3411931991577148, + "learning_rate": 2.5221162185602775e-05, + "loss": 0.9219, "step": 1454 }, { - "epoch": 0.5313127624612014, - "grad_norm": 0.9822050333023071, - "learning_rate": 3.934061438466985e-05, - "loss": 1.0107, + "epoch": 0.25242886884108257, + "grad_norm": 0.8101678490638733, + "learning_rate": 2.5238508239375544e-05, + "loss": 0.7559, "step": 1455 }, { - "epoch": 0.5316779258718276, - "grad_norm": 0.8563000559806824, - "learning_rate": 3.933695627210555e-05, - "loss": 1.0189, + "epoch": 0.25260235947258847, + "grad_norm": 0.9387151598930359, + "learning_rate": 2.525585429314831e-05, + "loss": 0.7217, "step": 1456 }, { - "epoch": 0.5320430892824539, - "grad_norm": 0.8624609708786011, - "learning_rate": 3.93332882114532e-05, - "loss": 0.9587, + "epoch": 0.25277585010409437, + "grad_norm": 1.6971986293792725, + "learning_rate": 2.5273200346921077e-05, + "loss": 1.0728, "step": 1457 }, { - "epoch": 0.5324082526930801, - "grad_norm": 1.1684212684631348, - "learning_rate": 3.9329610204599864e-05, - "loss": 0.9829, + "epoch": 0.25294934073560027, + "grad_norm": 1.0177127122879028, + "learning_rate": 2.5290546400693842e-05, + "loss": 0.9268, "step": 1458 }, { - "epoch": 0.5327734161037064, - "grad_norm": 0.8735179901123047, - "learning_rate": 3.932592225343772e-05, - "loss": 0.9938, + "epoch": 0.25312283136710617, + "grad_norm": 1.0046263933181763, + "learning_rate": 2.530789245446661e-05, + "loss": 0.7275, "step": 1459 }, { - "epoch": 0.5331385795143326, - "grad_norm": 1.2071043252944946, - "learning_rate": 3.932222435986408e-05, - "loss": 0.9944, + "epoch": 0.2532963219986121, + "grad_norm": 0.7330162525177002, + "learning_rate": 2.5325238508239375e-05, + "loss": 0.9497, "step": 1460 }, { - "epoch": 0.533503742924959, - "grad_norm": 1.3718838691711426, - "learning_rate": 3.931851652578137e-05, - "loss": 1.0231, + "epoch": 0.253469812630118, + "grad_norm": 1.0836029052734375, + "learning_rate": 2.5342584562012144e-05, + "loss": 0.8706, "step": 1461 }, { - "epoch": 0.5338689063355851, - "grad_norm": 1.0394768714904785, - "learning_rate": 3.93147987530971e-05, - "loss": 0.9883, + "epoch": 0.2536433032616239, + "grad_norm": 0.9146780371665955, + "learning_rate": 2.535993061578491e-05, + "loss": 0.8718, "step": 1462 }, { - "epoch": 0.5342340697462115, - "grad_norm": 1.050279974937439, - "learning_rate": 3.9311071043723927e-05, - "loss": 0.9756, + "epoch": 0.2538167938931298, + "grad_norm": 0.9753580689430237, + "learning_rate": 2.5377276669557674e-05, + "loss": 0.7305, "step": 1463 }, { - "epoch": 0.5345992331568377, - "grad_norm": 1.0993777513504028, - "learning_rate": 3.930733339957961e-05, - "loss": 1.0096, + "epoch": 0.2539902845246357, + "grad_norm": 1.564149022102356, + "learning_rate": 2.5394622723330446e-05, + "loss": 0.8066, "step": 1464 }, { - "epoch": 0.534964396567464, - "grad_norm": 1.1152960062026978, - "learning_rate": 3.9303585822587014e-05, - "loss": 0.9741, + "epoch": 0.2541637751561416, + "grad_norm": 0.8751254677772522, + "learning_rate": 2.5411968777103214e-05, + "loss": 0.7466, "step": 1465 }, { - "epoch": 0.5353295599780902, - "grad_norm": 2.4758949279785156, - "learning_rate": 3.929982831467412e-05, - "loss": 0.999, + "epoch": 0.2543372657876475, + "grad_norm": 0.9803287386894226, + "learning_rate": 2.542931483087598e-05, + "loss": 0.8191, "step": 1466 }, { - "epoch": 0.5356947233887165, - "grad_norm": 1.1261727809906006, - "learning_rate": 3.9296060877774004e-05, - "loss": 0.9739, + "epoch": 0.2545107564191534, + "grad_norm": 0.9452791810035706, + "learning_rate": 2.5446660884648747e-05, + "loss": 0.8743, "step": 1467 }, { - "epoch": 0.5360598867993427, - "grad_norm": 1.4378201961517334, - "learning_rate": 3.9292283513824873e-05, - "loss": 0.9746, + "epoch": 0.2546842470506593, + "grad_norm": 1.0528842210769653, + "learning_rate": 2.5464006938421512e-05, + "loss": 0.751, "step": 1468 }, { - "epoch": 0.536425050209969, - "grad_norm": 1.1009440422058105, - "learning_rate": 3.928849622477002e-05, - "loss": 0.9542, + "epoch": 0.2548577376821652, + "grad_norm": 1.065266728401184, + "learning_rate": 2.548135299219428e-05, + "loss": 0.8733, "step": 1469 }, { - "epoch": 0.5367902136205952, - "grad_norm": 1.187432885169983, - "learning_rate": 3.928469901255787e-05, - "loss": 0.9707, + "epoch": 0.2550312283136711, + "grad_norm": 0.9358468651771545, + "learning_rate": 2.5498699045967046e-05, + "loss": 0.6685, "step": 1470 }, { - "epoch": 0.5371553770312215, - "grad_norm": 0.7985403537750244, - "learning_rate": 3.928089187914192e-05, - "loss": 0.9807, + "epoch": 0.25520471894517693, + "grad_norm": 0.7982171177864075, + "learning_rate": 2.551604509973981e-05, + "loss": 0.8076, "step": 1471 }, { - "epoch": 0.5375205404418477, - "grad_norm": 0.818397045135498, - "learning_rate": 3.927707482648079e-05, - "loss": 0.9692, + "epoch": 0.25537820957668284, + "grad_norm": 1.2300615310668945, + "learning_rate": 2.553339115351258e-05, + "loss": 0.897, "step": 1472 }, { - "epoch": 0.537885703852474, - "grad_norm": 1.169758677482605, - "learning_rate": 3.92732478565382e-05, - "loss": 1.04, + "epoch": 0.25555170020818874, + "grad_norm": 0.9254710078239441, + "learning_rate": 2.5550737207285344e-05, + "loss": 0.8616, "step": 1473 }, { - "epoch": 0.5382508672631002, - "grad_norm": 1.511186957359314, - "learning_rate": 3.926941097128298e-05, - "loss": 0.9856, + "epoch": 0.25572519083969464, + "grad_norm": 1.1417391300201416, + "learning_rate": 2.5568083261058113e-05, + "loss": 0.7517, "step": 1474 }, { - "epoch": 0.5386160306737265, - "grad_norm": 1.4727163314819336, - "learning_rate": 3.9265564172689046e-05, - "loss": 0.9974, + "epoch": 0.25589868147120054, + "grad_norm": 0.850305438041687, + "learning_rate": 2.5585429314830878e-05, + "loss": 0.7207, "step": 1475 }, { - "epoch": 0.5389811940843527, - "grad_norm": 1.0213814973831177, - "learning_rate": 3.926170746273543e-05, - "loss": 0.9803, + "epoch": 0.25607217210270644, + "grad_norm": 0.8620747327804565, + "learning_rate": 2.5602775368603646e-05, + "loss": 0.7883, "step": 1476 }, { - "epoch": 0.5393463574949791, - "grad_norm": 1.0162007808685303, - "learning_rate": 3.925784084340624e-05, - "loss": 0.9701, + "epoch": 0.25624566273421234, + "grad_norm": 0.8000349998474121, + "learning_rate": 2.562012142237641e-05, + "loss": 0.8406, "step": 1477 }, { - "epoch": 0.5397115209056053, - "grad_norm": 1.256233811378479, - "learning_rate": 3.9253964316690707e-05, - "loss": 1.0073, + "epoch": 0.25641915336571824, + "grad_norm": 1.5429391860961914, + "learning_rate": 2.563746747614918e-05, + "loss": 0.7654, "step": 1478 }, { - "epoch": 0.5400766843162315, - "grad_norm": 1.2060352563858032, - "learning_rate": 3.925007788458315e-05, - "loss": 0.9749, + "epoch": 0.25659264399722415, + "grad_norm": 0.9601867198944092, + "learning_rate": 2.5654813529921944e-05, + "loss": 0.7654, "step": 1479 }, { - "epoch": 0.5404418477268578, - "grad_norm": 1.1554151773452759, - "learning_rate": 3.924618154908298e-05, - "loss": 0.9862, + "epoch": 0.25676613462873005, + "grad_norm": 0.8377664685249329, + "learning_rate": 2.5672159583694713e-05, + "loss": 0.7722, "step": 1480 }, { - "epoch": 0.540807011137484, - "grad_norm": 0.9307277202606201, - "learning_rate": 3.9242275312194694e-05, - "loss": 1.0068, + "epoch": 0.25693962526023595, + "grad_norm": 0.9915479421615601, + "learning_rate": 2.5689505637467478e-05, + "loss": 0.8428, "step": 1481 }, { - "epoch": 0.5411721745481103, - "grad_norm": 1.0373626947402954, - "learning_rate": 3.923835917592792e-05, - "loss": 0.9879, + "epoch": 0.25711311589174185, + "grad_norm": 1.5171838998794556, + "learning_rate": 2.5706851691240243e-05, + "loss": 0.9353, "step": 1482 }, { - "epoch": 0.5415373379587365, - "grad_norm": 1.0350438356399536, - "learning_rate": 3.923443314229732e-05, - "loss": 0.972, + "epoch": 0.25728660652324775, + "grad_norm": 1.0475784540176392, + "learning_rate": 2.572419774501301e-05, + "loss": 0.8269, "step": 1483 }, { - "epoch": 0.5419025013693628, - "grad_norm": 1.1358007192611694, - "learning_rate": 3.9230497213322715e-05, - "loss": 0.988, + "epoch": 0.25746009715475365, + "grad_norm": 1.0411832332611084, + "learning_rate": 2.5741543798785776e-05, + "loss": 0.8055, "step": 1484 }, { - "epoch": 0.542267664779989, - "grad_norm": 1.1690858602523804, - "learning_rate": 3.922655139102895e-05, - "loss": 0.9941, + "epoch": 0.25763358778625955, + "grad_norm": 1.0417648553848267, + "learning_rate": 2.5758889852558545e-05, + "loss": 0.7183, "step": 1485 }, { - "epoch": 0.5426328281906153, - "grad_norm": 1.0593832731246948, - "learning_rate": 3.922259567744602e-05, - "loss": 0.9966, + "epoch": 0.25780707841776546, + "grad_norm": 0.9317733645439148, + "learning_rate": 2.577623590633131e-05, + "loss": 0.7729, "step": 1486 }, { - "epoch": 0.5429979916012415, - "grad_norm": 1.3002471923828125, - "learning_rate": 3.9218630074608966e-05, - "loss": 0.9757, + "epoch": 0.25798056904927136, + "grad_norm": 1.374594807624817, + "learning_rate": 2.5793581960104078e-05, + "loss": 0.906, "step": 1487 }, { - "epoch": 0.5433631550118678, - "grad_norm": 1.191044569015503, - "learning_rate": 3.921465458455793e-05, - "loss": 0.9641, + "epoch": 0.25815405968077726, + "grad_norm": 1.2243852615356445, + "learning_rate": 2.5810928013876843e-05, + "loss": 0.8496, "step": 1488 }, { - "epoch": 0.543728318422494, - "grad_norm": 1.060248613357544, - "learning_rate": 3.9210669209338144e-05, - "loss": 0.9779, + "epoch": 0.25832755031228316, + "grad_norm": 0.8611832857131958, + "learning_rate": 2.582827406764961e-05, + "loss": 0.8984, "step": 1489 }, { - "epoch": 0.5440934818331203, - "grad_norm": 1.314370036125183, - "learning_rate": 3.920667395099993e-05, - "loss": 0.9847, + "epoch": 0.25850104094378906, + "grad_norm": 1.4302983283996582, + "learning_rate": 2.5845620121422376e-05, + "loss": 0.7815, "step": 1490 }, { - "epoch": 0.5444586452437465, - "grad_norm": 1.3142666816711426, - "learning_rate": 3.920266881159869e-05, - "loss": 1.019, + "epoch": 0.2586745315752949, + "grad_norm": 1.0146147012710571, + "learning_rate": 2.5862966175195145e-05, + "loss": 0.8142, "step": 1491 }, { - "epoch": 0.5448238086543729, - "grad_norm": 0.9495648741722107, - "learning_rate": 3.9198653793194896e-05, - "loss": 0.9561, + "epoch": 0.2588480222068008, + "grad_norm": 0.9432351589202881, + "learning_rate": 2.588031222896791e-05, + "loss": 0.9099, "step": 1492 }, { - "epoch": 0.5451889720649991, - "grad_norm": 1.1412338018417358, - "learning_rate": 3.919462889785412e-05, - "loss": 0.9907, + "epoch": 0.2590215128383067, + "grad_norm": 1.0478154420852661, + "learning_rate": 2.589765828274068e-05, + "loss": 0.7292, "step": 1493 }, { - "epoch": 0.5455541354756254, - "grad_norm": 1.242210030555725, - "learning_rate": 3.9190594127647005e-05, - "loss": 0.9977, + "epoch": 0.2591950034698126, + "grad_norm": 1.098466157913208, + "learning_rate": 2.5915004336513447e-05, + "loss": 0.7048, "step": 1494 }, { - "epoch": 0.5459192988862516, - "grad_norm": 0.9227830171585083, - "learning_rate": 3.918654948464928e-05, - "loss": 0.9637, + "epoch": 0.2593684941013185, + "grad_norm": 1.4520113468170166, + "learning_rate": 2.5932350390286215e-05, + "loss": 0.8057, "step": 1495 }, { - "epoch": 0.5462844622968779, - "grad_norm": 0.9293296933174133, - "learning_rate": 3.918249497094176e-05, - "loss": 0.9662, + "epoch": 0.2595419847328244, + "grad_norm": 0.8676396012306213, + "learning_rate": 2.594969644405898e-05, + "loss": 0.7046, "step": 1496 }, { - "epoch": 0.5466496257075041, - "grad_norm": 1.064624309539795, - "learning_rate": 3.917843058861032e-05, - "loss": 0.9738, + "epoch": 0.2597154753643303, + "grad_norm": 0.7934767007827759, + "learning_rate": 2.5967042497831748e-05, + "loss": 0.8621, "step": 1497 }, { - "epoch": 0.5470147891181304, - "grad_norm": 1.0935320854187012, - "learning_rate": 3.9174356339745933e-05, - "loss": 0.9856, + "epoch": 0.2598889659958362, + "grad_norm": 0.6284911632537842, + "learning_rate": 2.5984388551604513e-05, + "loss": 0.9192, "step": 1498 }, { - "epoch": 0.5473799525287566, - "grad_norm": 1.3575880527496338, - "learning_rate": 3.917027222644462e-05, - "loss": 0.983, + "epoch": 0.2600624566273421, + "grad_norm": 1.022574782371521, + "learning_rate": 2.600173460537728e-05, + "loss": 0.6967, "step": 1499 }, { - "epoch": 0.5477451159393829, - "grad_norm": 1.1410921812057495, - "learning_rate": 3.9166178250807504e-05, - "loss": 0.9805, + "epoch": 0.260235947258848, + "grad_norm": 0.9580214023590088, + "learning_rate": 2.6019080659150047e-05, + "loss": 0.7339, "step": 1500 }, { - "epoch": 0.5481102793500091, - "grad_norm": 0.9118988513946533, - "learning_rate": 3.9162074414940764e-05, - "loss": 0.9018, + "epoch": 0.2604094378903539, + "grad_norm": 1.0245991945266724, + "learning_rate": 2.603642671292281e-05, + "loss": 0.8042, "step": 1501 }, { - "epoch": 0.5484754427606354, - "grad_norm": 1.218697190284729, - "learning_rate": 3.915796072095567e-05, - "loss": 0.9902, + "epoch": 0.2605829285218598, + "grad_norm": 0.9912607073783875, + "learning_rate": 2.605377276669558e-05, + "loss": 0.7573, "step": 1502 }, { - "epoch": 0.5488406061712616, - "grad_norm": 1.2307707071304321, - "learning_rate": 3.9153837170968544e-05, - "loss": 1.0166, + "epoch": 0.2607564191533657, + "grad_norm": 1.9116547107696533, + "learning_rate": 2.6071118820468345e-05, + "loss": 0.957, "step": 1503 }, { - "epoch": 0.5492057695818879, - "grad_norm": 1.316801905632019, - "learning_rate": 3.914970376710079e-05, - "loss": 0.9697, + "epoch": 0.2609299097848716, + "grad_norm": 0.9331893920898438, + "learning_rate": 2.6088464874241113e-05, + "loss": 0.8098, "step": 1504 }, { - "epoch": 0.5495709329925141, - "grad_norm": 1.2871805429458618, - "learning_rate": 3.914556051147887e-05, - "loss": 0.9934, + "epoch": 0.26110340041637753, + "grad_norm": 1.24367356300354, + "learning_rate": 2.610581092801388e-05, + "loss": 0.8372, "step": 1505 }, { - "epoch": 0.5499360964031405, - "grad_norm": 0.9597147107124329, - "learning_rate": 3.914140740623434e-05, - "loss": 0.9775, + "epoch": 0.26127689104788343, + "grad_norm": 0.9407436847686768, + "learning_rate": 2.6123156981786647e-05, + "loss": 0.8038, "step": 1506 }, { - "epoch": 0.5503012598137667, - "grad_norm": 0.8540073037147522, - "learning_rate": 3.9137244453503794e-05, - "loss": 0.9612, + "epoch": 0.26145038167938933, + "grad_norm": 0.9653863906860352, + "learning_rate": 2.6140503035559412e-05, + "loss": 0.8792, "step": 1507 }, { - "epoch": 0.550666423224393, - "grad_norm": 1.340399146080017, - "learning_rate": 3.9133071655428904e-05, - "loss": 0.9784, + "epoch": 0.26162387231089523, + "grad_norm": 0.8112083077430725, + "learning_rate": 2.615784908933218e-05, + "loss": 0.8022, "step": 1508 }, { - "epoch": 0.5510315866350192, - "grad_norm": 1.1207612752914429, - "learning_rate": 3.9128889014156415e-05, - "loss": 0.9895, + "epoch": 0.26179736294240114, + "grad_norm": 0.7876482605934143, + "learning_rate": 2.6175195143104945e-05, + "loss": 0.9067, "step": 1509 }, { - "epoch": 0.5513967500456455, - "grad_norm": 1.0734164714813232, - "learning_rate": 3.9124696531838114e-05, - "loss": 0.9965, + "epoch": 0.26197085357390704, + "grad_norm": 1.0191528797149658, + "learning_rate": 2.6192541196877714e-05, + "loss": 0.7932, "step": 1510 }, { - "epoch": 0.5517619134562717, - "grad_norm": 1.2319486141204834, - "learning_rate": 3.9120494210630886e-05, - "loss": 0.9868, + "epoch": 0.2621443442054129, + "grad_norm": 0.6915796399116516, + "learning_rate": 2.620988725065048e-05, + "loss": 0.929, "step": 1511 }, { - "epoch": 0.552127076866898, - "grad_norm": 1.1655147075653076, - "learning_rate": 3.911628205269663e-05, - "loss": 0.944, + "epoch": 0.2623178348369188, + "grad_norm": 0.8228687644004822, + "learning_rate": 2.6227233304423244e-05, + "loss": 0.9319, "step": 1512 }, { - "epoch": 0.5524922402775242, - "grad_norm": 1.0723462104797363, - "learning_rate": 3.911206006020235e-05, - "loss": 0.9878, + "epoch": 0.2624913254684247, + "grad_norm": 0.9723125696182251, + "learning_rate": 2.6244579358196012e-05, + "loss": 0.7916, "step": 1513 }, { - "epoch": 0.5528574036881504, - "grad_norm": 1.4839738607406616, - "learning_rate": 3.910782823532009e-05, - "loss": 1.0059, + "epoch": 0.2626648160999306, + "grad_norm": 1.0258363485336304, + "learning_rate": 2.6261925411968777e-05, + "loss": 0.7795, "step": 1514 }, { - "epoch": 0.5532225670987767, - "grad_norm": 1.4428439140319824, - "learning_rate": 3.910358658022696e-05, - "loss": 0.959, + "epoch": 0.2628383067314365, + "grad_norm": 1.0159300565719604, + "learning_rate": 2.6279271465741545e-05, + "loss": 0.8779, "step": 1515 }, { - "epoch": 0.5535877305094029, - "grad_norm": 1.2980058193206787, - "learning_rate": 3.909933509710511e-05, - "loss": 0.9867, + "epoch": 0.2630117973629424, + "grad_norm": 1.066523551940918, + "learning_rate": 2.629661751951431e-05, + "loss": 0.7109, "step": 1516 }, { - "epoch": 0.5539528939200292, - "grad_norm": 1.0891823768615723, - "learning_rate": 3.909507378814175e-05, - "loss": 1.0182, + "epoch": 0.2631852879944483, + "grad_norm": 0.9525664448738098, + "learning_rate": 2.631396357328708e-05, + "loss": 0.6938, "step": 1517 }, { - "epoch": 0.5543180573306554, - "grad_norm": 1.0796889066696167, - "learning_rate": 3.909080265552918e-05, - "loss": 0.9823, + "epoch": 0.2633587786259542, + "grad_norm": 0.8220499753952026, + "learning_rate": 2.6331309627059844e-05, + "loss": 0.8062, "step": 1518 }, { - "epoch": 0.5546832207412817, - "grad_norm": 0.8387145400047302, - "learning_rate": 3.90865217014647e-05, - "loss": 0.9492, + "epoch": 0.2635322692574601, + "grad_norm": 1.0748132467269897, + "learning_rate": 2.6348655680832612e-05, + "loss": 0.7229, "step": 1519 }, { - "epoch": 0.5550483841519079, - "grad_norm": 1.2405532598495483, - "learning_rate": 3.90822309281507e-05, - "loss": 0.946, + "epoch": 0.263705759888966, + "grad_norm": 0.8485494256019592, + "learning_rate": 2.6366001734605377e-05, + "loss": 0.7605, "step": 1520 }, { - "epoch": 0.5554135475625342, - "grad_norm": 1.1036583185195923, - "learning_rate": 3.9077930337794614e-05, - "loss": 0.9791, + "epoch": 0.2638792505204719, + "grad_norm": 1.868552565574646, + "learning_rate": 2.6383347788378146e-05, + "loss": 0.7742, "step": 1521 }, { - "epoch": 0.5557787109731605, - "grad_norm": 0.9167155027389526, - "learning_rate": 3.907361993260891e-05, - "loss": 0.9531, + "epoch": 0.2640527411519778, + "grad_norm": 0.841484010219574, + "learning_rate": 2.640069384215091e-05, + "loss": 0.8938, "step": 1522 }, { - "epoch": 0.5561438743837868, - "grad_norm": 1.264602541923523, - "learning_rate": 3.906929971481114e-05, - "loss": 0.9471, + "epoch": 0.2642262317834837, + "grad_norm": 1.3391854763031006, + "learning_rate": 2.6418039895923682e-05, + "loss": 0.7493, "step": 1523 }, { - "epoch": 0.556509037794413, - "grad_norm": 1.568609595298767, - "learning_rate": 3.906496968662386e-05, - "loss": 0.9778, + "epoch": 0.2643997224149896, + "grad_norm": 2.429743528366089, + "learning_rate": 2.6435385949696447e-05, + "loss": 0.7205, "step": 1524 }, { - "epoch": 0.5568742012050393, - "grad_norm": 0.8640694618225098, - "learning_rate": 3.906062985027471e-05, - "loss": 0.9872, + "epoch": 0.2645732130464955, + "grad_norm": 0.9355248808860779, + "learning_rate": 2.6452732003469216e-05, + "loss": 0.8093, "step": 1525 }, { - "epoch": 0.5572393646156655, - "grad_norm": 1.6712572574615479, - "learning_rate": 3.905628020799636e-05, - "loss": 0.9648, + "epoch": 0.2647467036780014, + "grad_norm": 0.8317021727561951, + "learning_rate": 2.647007805724198e-05, + "loss": 0.812, "step": 1526 }, { - "epoch": 0.5576045280262918, - "grad_norm": 1.164849042892456, - "learning_rate": 3.905192076202652e-05, - "loss": 0.9523, + "epoch": 0.2649201943095073, + "grad_norm": 1.0157843828201294, + "learning_rate": 2.648742411101475e-05, + "loss": 0.7394, "step": 1527 }, { - "epoch": 0.557969691436918, - "grad_norm": 1.0195542573928833, - "learning_rate": 3.904755151460795e-05, - "loss": 0.952, + "epoch": 0.2650936849410132, + "grad_norm": 0.7860603928565979, + "learning_rate": 2.6504770164787514e-05, + "loss": 0.8242, "step": 1528 }, { - "epoch": 0.5583348548475443, - "grad_norm": 1.1641738414764404, - "learning_rate": 3.9043172467988464e-05, - "loss": 0.9518, + "epoch": 0.2652671755725191, + "grad_norm": 0.7116766571998596, + "learning_rate": 2.6522116218560283e-05, + "loss": 0.9429, "step": 1529 }, { - "epoch": 0.5587000182581705, - "grad_norm": 1.135114073753357, - "learning_rate": 3.9038783624420894e-05, - "loss": 1.0007, + "epoch": 0.26544066620402496, + "grad_norm": 0.8520632386207581, + "learning_rate": 2.6539462272333048e-05, + "loss": 0.7825, "step": 1530 }, { - "epoch": 0.5590651816687968, - "grad_norm": 1.6706990003585815, - "learning_rate": 3.9034384986163126e-05, - "loss": 0.9889, + "epoch": 0.26561415683553086, + "grad_norm": 0.8119803667068481, + "learning_rate": 2.6556808326105813e-05, + "loss": 0.9934, "step": 1531 }, { - "epoch": 0.559430345079423, - "grad_norm": 1.2139647006988525, - "learning_rate": 3.902997655547809e-05, - "loss": 0.9611, + "epoch": 0.26578764746703676, + "grad_norm": 0.9791209697723389, + "learning_rate": 2.657415437987858e-05, + "loss": 0.7739, "step": 1532 }, { - "epoch": 0.5597955084900493, - "grad_norm": 0.9248563647270203, - "learning_rate": 3.9025558334633735e-05, - "loss": 0.9656, + "epoch": 0.26596113809854266, + "grad_norm": 0.8177501559257507, + "learning_rate": 2.6591500433651346e-05, + "loss": 0.7855, "step": 1533 }, { - "epoch": 0.5601606719006755, - "grad_norm": 1.2305361032485962, - "learning_rate": 3.9021130325903076e-05, - "loss": 0.9188, + "epoch": 0.26613462873004856, + "grad_norm": 0.8181318640708923, + "learning_rate": 2.6608846487424114e-05, + "loss": 0.7876, "step": 1534 }, { - "epoch": 0.5605258353113018, - "grad_norm": 1.0008493661880493, - "learning_rate": 3.9016692531564125e-05, - "loss": 0.9272, + "epoch": 0.26630811936155446, + "grad_norm": 0.9333963990211487, + "learning_rate": 2.662619254119688e-05, + "loss": 0.8799, "step": 1535 }, { - "epoch": 0.560890998721928, - "grad_norm": 0.9667797684669495, - "learning_rate": 3.901224495389996e-05, - "loss": 0.9989, + "epoch": 0.26648160999306036, + "grad_norm": 1.2133582830429077, + "learning_rate": 2.6643538594969648e-05, + "loss": 0.7935, "step": 1536 }, { - "epoch": 0.5612561621325544, - "grad_norm": 1.1529128551483154, - "learning_rate": 3.900778759519868e-05, - "loss": 1.0067, + "epoch": 0.26665510062456627, + "grad_norm": 0.8452572822570801, + "learning_rate": 2.6660884648742413e-05, + "loss": 0.8655, "step": 1537 }, { - "epoch": 0.5616213255431806, - "grad_norm": 1.0638667345046997, - "learning_rate": 3.9003320457753425e-05, - "loss": 0.9796, + "epoch": 0.26682859125607217, + "grad_norm": 1.1495928764343262, + "learning_rate": 2.667823070251518e-05, + "loss": 0.7073, "step": 1538 }, { - "epoch": 0.5619864889538069, - "grad_norm": 1.6627936363220215, - "learning_rate": 3.8998843543862347e-05, - "loss": 0.9941, + "epoch": 0.26700208188757807, + "grad_norm": 1.128671407699585, + "learning_rate": 2.6695576756287946e-05, + "loss": 0.7898, "step": 1539 }, { - "epoch": 0.5623516523644331, - "grad_norm": 1.2160228490829468, - "learning_rate": 3.899435685582864e-05, - "loss": 0.9547, + "epoch": 0.26717557251908397, + "grad_norm": 1.4267427921295166, + "learning_rate": 2.6712922810060714e-05, + "loss": 0.7192, "step": 1540 }, { - "epoch": 0.5627168157750594, - "grad_norm": 1.3134812116622925, - "learning_rate": 3.898986039596052e-05, - "loss": 0.9409, + "epoch": 0.26734906315058987, + "grad_norm": 1.4336055517196655, + "learning_rate": 2.673026886383348e-05, + "loss": 0.6941, "step": 1541 }, { - "epoch": 0.5630819791856856, - "grad_norm": 1.3066587448120117, - "learning_rate": 3.898535416657125e-05, - "loss": 0.9959, + "epoch": 0.2675225537820958, + "grad_norm": 1.0389174222946167, + "learning_rate": 2.6747614917606245e-05, + "loss": 0.696, "step": 1542 }, { - "epoch": 0.5634471425963119, - "grad_norm": 1.3279896974563599, - "learning_rate": 3.89808381699791e-05, - "loss": 0.9851, + "epoch": 0.2676960444136017, + "grad_norm": 0.8892258405685425, + "learning_rate": 2.6764960971379013e-05, + "loss": 0.7603, "step": 1543 }, { - "epoch": 0.5638123060069381, - "grad_norm": 1.1506808996200562, - "learning_rate": 3.8976312408507356e-05, - "loss": 1.0297, + "epoch": 0.2678695350451076, + "grad_norm": 0.8321637511253357, + "learning_rate": 2.6782307025151778e-05, + "loss": 0.9607, "step": 1544 }, { - "epoch": 0.5641774694175644, - "grad_norm": 0.9332963824272156, - "learning_rate": 3.897177688448435e-05, - "loss": 0.9922, + "epoch": 0.2680430256766135, + "grad_norm": 1.0381375551223755, + "learning_rate": 2.6799653078924546e-05, + "loss": 0.8246, "step": 1545 }, { - "epoch": 0.5645426328281906, - "grad_norm": 0.898692786693573, - "learning_rate": 3.8967231600243434e-05, - "loss": 0.9769, + "epoch": 0.2682165163081194, + "grad_norm": 1.1981574296951294, + "learning_rate": 2.681699913269731e-05, + "loss": 0.8022, "step": 1546 }, { - "epoch": 0.5649077962388168, - "grad_norm": 0.872531533241272, - "learning_rate": 3.8962676558122965e-05, - "loss": 0.9564, + "epoch": 0.2683900069396253, + "grad_norm": 0.9407922625541687, + "learning_rate": 2.683434518647008e-05, + "loss": 0.8347, "step": 1547 }, { - "epoch": 0.5652729596494431, - "grad_norm": 1.1303616762161255, - "learning_rate": 3.895811176046633e-05, - "loss": 0.9756, + "epoch": 0.2685634975711312, + "grad_norm": 1.2049758434295654, + "learning_rate": 2.6851691240242845e-05, + "loss": 0.8118, "step": 1548 }, { - "epoch": 0.5656381230600693, - "grad_norm": 1.200279951095581, - "learning_rate": 3.895353720962193e-05, - "loss": 0.9648, + "epoch": 0.2687369882026371, + "grad_norm": 0.7561994194984436, + "learning_rate": 2.6869037294015613e-05, + "loss": 0.9504, "step": 1549 }, { - "epoch": 0.5660032864706956, - "grad_norm": 1.0431454181671143, - "learning_rate": 3.8948952907943206e-05, - "loss": 1.0065, + "epoch": 0.26891047883414293, + "grad_norm": 0.7517123222351074, + "learning_rate": 2.6886383347788378e-05, + "loss": 0.9175, "step": 1550 }, { - "epoch": 0.5663684498813218, - "grad_norm": 1.3610957860946655, - "learning_rate": 3.8944358857788576e-05, - "loss": 1.0459, + "epoch": 0.26908396946564883, + "grad_norm": 0.9555516242980957, + "learning_rate": 2.6903729401561146e-05, + "loss": 0.8359, "step": 1551 }, { - "epoch": 0.5667336132919482, - "grad_norm": 1.2236528396606445, - "learning_rate": 3.893975506152151e-05, - "loss": 0.9758, + "epoch": 0.26925746009715473, + "grad_norm": 1.0077297687530518, + "learning_rate": 2.692107545533391e-05, + "loss": 0.9504, "step": 1552 }, { - "epoch": 0.5670987767025744, - "grad_norm": 1.0757087469100952, - "learning_rate": 3.8935141521510466e-05, - "loss": 1.0195, + "epoch": 0.26943095072866063, + "grad_norm": 0.9798529148101807, + "learning_rate": 2.6938421509106683e-05, + "loss": 0.8918, "step": 1553 }, { - "epoch": 0.5674639401132007, - "grad_norm": 1.0813853740692139, - "learning_rate": 3.8930518240128926e-05, - "loss": 0.9843, + "epoch": 0.26960444136016654, + "grad_norm": 0.9303553700447083, + "learning_rate": 2.6955767562879448e-05, + "loss": 0.7524, "step": 1554 }, { - "epoch": 0.5678291035238269, - "grad_norm": 1.0328736305236816, - "learning_rate": 3.892588521975539e-05, - "loss": 1.0107, + "epoch": 0.26977793199167244, + "grad_norm": 0.9828739166259766, + "learning_rate": 2.6973113616652217e-05, + "loss": 0.8828, "step": 1555 }, { - "epoch": 0.5681942669344532, - "grad_norm": 1.3049678802490234, - "learning_rate": 3.892124246277336e-05, - "loss": 1.0027, + "epoch": 0.26995142262317834, + "grad_norm": 1.5303444862365723, + "learning_rate": 2.699045967042498e-05, + "loss": 0.7424, "step": 1556 }, { - "epoch": 0.5685594303450794, - "grad_norm": 0.8871817588806152, - "learning_rate": 3.891658997157134e-05, - "loss": 0.9727, + "epoch": 0.27012491325468424, + "grad_norm": 0.7324809432029724, + "learning_rate": 2.700780572419775e-05, + "loss": 0.7998, "step": 1557 }, { - "epoch": 0.5689245937557057, - "grad_norm": 1.3569291830062866, - "learning_rate": 3.891192774854285e-05, - "loss": 0.9775, + "epoch": 0.27029840388619014, + "grad_norm": 0.9847830533981323, + "learning_rate": 2.7025151777970515e-05, + "loss": 0.8394, "step": 1558 }, { - "epoch": 0.5692897571663319, - "grad_norm": 1.2009457349777222, - "learning_rate": 3.890725579608643e-05, - "loss": 0.9985, + "epoch": 0.27047189451769604, + "grad_norm": 0.8360482454299927, + "learning_rate": 2.7042497831743283e-05, + "loss": 0.7705, "step": 1559 }, { - "epoch": 0.5696549205769582, - "grad_norm": 1.0664366483688354, - "learning_rate": 3.89025741166056e-05, - "loss": 1.0061, + "epoch": 0.27064538514920194, + "grad_norm": 1.8534644842147827, + "learning_rate": 2.705984388551605e-05, + "loss": 0.8567, "step": 1560 }, { - "epoch": 0.5700200839875844, - "grad_norm": 1.1238949298858643, - "learning_rate": 3.8897882712508906e-05, - "loss": 0.9829, + "epoch": 0.27081887578070785, + "grad_norm": 0.7412980794906616, + "learning_rate": 2.7077189939288813e-05, + "loss": 0.8574, "step": 1561 }, { - "epoch": 0.5703852473982107, - "grad_norm": 1.0603467226028442, - "learning_rate": 3.8893181586209883e-05, - "loss": 1.0222, + "epoch": 0.27099236641221375, + "grad_norm": 1.028213381767273, + "learning_rate": 2.7094535993061582e-05, + "loss": 0.7705, "step": 1562 }, { - "epoch": 0.5707504108088369, - "grad_norm": 0.9015482664108276, - "learning_rate": 3.888847074012706e-05, - "loss": 0.9756, + "epoch": 0.27116585704371965, + "grad_norm": 0.870495617389679, + "learning_rate": 2.7111882046834347e-05, + "loss": 0.9136, "step": 1563 }, { - "epoch": 0.5711155742194632, - "grad_norm": 1.2519042491912842, - "learning_rate": 3.8883750176684e-05, - "loss": 1.0216, + "epoch": 0.27133934767522555, + "grad_norm": 1.2011674642562866, + "learning_rate": 2.7129228100607115e-05, + "loss": 0.8115, "step": 1564 }, { - "epoch": 0.5714807376300894, - "grad_norm": 1.0089011192321777, - "learning_rate": 3.8879019898309215e-05, - "loss": 1.0011, + "epoch": 0.27151283830673145, + "grad_norm": 1.7729501724243164, + "learning_rate": 2.714657415437988e-05, + "loss": 0.7664, "step": 1565 }, { - "epoch": 0.5718459010407158, - "grad_norm": 1.319982886314392, - "learning_rate": 3.8874279907436274e-05, - "loss": 0.9404, + "epoch": 0.27168632893823735, + "grad_norm": 1.0037693977355957, + "learning_rate": 2.716392020815265e-05, + "loss": 0.7212, "step": 1566 }, { - "epoch": 0.572211064451342, - "grad_norm": 0.9722253680229187, - "learning_rate": 3.886953020650369e-05, - "loss": 0.9725, + "epoch": 0.27185981956974326, + "grad_norm": 0.9191541075706482, + "learning_rate": 2.7181266261925414e-05, + "loss": 0.7122, "step": 1567 }, { - "epoch": 0.5725762278619683, - "grad_norm": 0.796387791633606, - "learning_rate": 3.8864770797955e-05, - "loss": 1.0048, + "epoch": 0.27203331020124916, + "grad_norm": 1.126654863357544, + "learning_rate": 2.7198612315698182e-05, + "loss": 0.7102, "step": 1568 }, { - "epoch": 0.5729413912725945, - "grad_norm": 0.9689775705337524, - "learning_rate": 3.8860001684238744e-05, - "loss": 0.9747, + "epoch": 0.27220680083275506, + "grad_norm": 1.062692403793335, + "learning_rate": 2.7215958369470947e-05, + "loss": 0.8358, "step": 1569 }, { - "epoch": 0.5733065546832208, - "grad_norm": 1.118059515953064, - "learning_rate": 3.885522286780842e-05, - "loss": 0.9835, + "epoch": 0.2723802914642609, + "grad_norm": 0.7963952422142029, + "learning_rate": 2.7233304423243715e-05, + "loss": 0.8621, "step": 1570 }, { - "epoch": 0.573671718093847, - "grad_norm": 1.2315990924835205, - "learning_rate": 3.8850434351122536e-05, - "loss": 0.9858, + "epoch": 0.2725537820957668, + "grad_norm": 1.1302334070205688, + "learning_rate": 2.725065047701648e-05, + "loss": 0.7107, "step": 1571 }, { - "epoch": 0.5740368815044733, - "grad_norm": 1.1389317512512207, - "learning_rate": 3.884563613664461e-05, - "loss": 0.9919, + "epoch": 0.2727272727272727, + "grad_norm": 0.887178897857666, + "learning_rate": 2.7267996530789245e-05, + "loss": 0.8384, "step": 1572 }, { - "epoch": 0.5744020449150995, - "grad_norm": 1.2420762777328491, - "learning_rate": 3.8840828226843113e-05, - "loss": 0.9703, + "epoch": 0.2729007633587786, + "grad_norm": 0.9384209513664246, + "learning_rate": 2.7285342584562014e-05, + "loss": 0.8318, "step": 1573 }, { - "epoch": 0.5747672083257258, - "grad_norm": 1.5438235998153687, - "learning_rate": 3.8836010624191535e-05, - "loss": 0.9703, + "epoch": 0.2730742539902845, + "grad_norm": 1.0068254470825195, + "learning_rate": 2.730268863833478e-05, + "loss": 0.8142, "step": 1574 }, { - "epoch": 0.575132371736352, - "grad_norm": 1.4246882200241089, - "learning_rate": 3.883118333116833e-05, - "loss": 0.973, + "epoch": 0.2732477446217904, + "grad_norm": 0.8393951654434204, + "learning_rate": 2.7320034692107547e-05, + "loss": 0.9061, "step": 1575 }, { - "epoch": 0.5754975351469783, - "grad_norm": 1.3072428703308105, - "learning_rate": 3.8826346350256943e-05, - "loss": 0.9716, + "epoch": 0.2734212352532963, + "grad_norm": 0.919269323348999, + "learning_rate": 2.7337380745880312e-05, + "loss": 0.8176, "step": 1576 }, { - "epoch": 0.5758626985576045, - "grad_norm": 1.2013180255889893, - "learning_rate": 3.882149968394582e-05, - "loss": 0.9808, + "epoch": 0.2735947258848022, + "grad_norm": 0.8218263983726501, + "learning_rate": 2.735472679965308e-05, + "loss": 0.7977, "step": 1577 }, { - "epoch": 0.5762278619682308, - "grad_norm": 1.7751115560531616, - "learning_rate": 3.881664333472837e-05, - "loss": 0.9637, + "epoch": 0.2737682165163081, + "grad_norm": 0.9225082993507385, + "learning_rate": 2.7372072853425846e-05, + "loss": 0.8015, "step": 1578 }, { - "epoch": 0.576593025378857, - "grad_norm": 1.3324915170669556, - "learning_rate": 3.8811777305102986e-05, - "loss": 0.9454, + "epoch": 0.273941707147814, + "grad_norm": 0.856626570224762, + "learning_rate": 2.7389418907198614e-05, + "loss": 0.9468, "step": 1579 }, { - "epoch": 0.5769581887894832, - "grad_norm": 1.0434328317642212, - "learning_rate": 3.880690159757305e-05, - "loss": 0.9633, + "epoch": 0.2741151977793199, + "grad_norm": 0.9709966778755188, + "learning_rate": 2.740676496097138e-05, + "loss": 0.7925, "step": 1580 }, { - "epoch": 0.5773233522001096, - "grad_norm": 1.0102862119674683, - "learning_rate": 3.880201621464691e-05, - "loss": 0.9562, + "epoch": 0.2742886884108258, + "grad_norm": 0.8097913861274719, + "learning_rate": 2.7424111014744147e-05, + "loss": 0.9597, "step": 1581 }, { - "epoch": 0.5776885156107358, - "grad_norm": 1.8087244033813477, - "learning_rate": 3.8797121158837914e-05, - "loss": 0.954, + "epoch": 0.2744621790423317, + "grad_norm": 0.7738749384880066, + "learning_rate": 2.7441457068516916e-05, + "loss": 0.8501, "step": 1582 }, { - "epoch": 0.5780536790213621, - "grad_norm": 1.4402133226394653, - "learning_rate": 3.8792216432664356e-05, - "loss": 1.0146, + "epoch": 0.2746356696738376, + "grad_norm": 1.1862645149230957, + "learning_rate": 2.7458803122289684e-05, + "loss": 0.7943, "step": 1583 }, { - "epoch": 0.5784188424319883, - "grad_norm": 1.055045485496521, - "learning_rate": 3.878730203864954e-05, - "loss": 0.9477, + "epoch": 0.2748091603053435, + "grad_norm": 0.926042377948761, + "learning_rate": 2.747614917606245e-05, + "loss": 0.8118, "step": 1584 }, { - "epoch": 0.5787840058426146, - "grad_norm": 1.1960052251815796, - "learning_rate": 3.87823779793217e-05, - "loss": 0.9816, + "epoch": 0.2749826509368494, + "grad_norm": 1.0054707527160645, + "learning_rate": 2.7493495229835217e-05, + "loss": 0.811, "step": 1585 }, { - "epoch": 0.5791491692532408, - "grad_norm": 1.46686589717865, - "learning_rate": 3.877744425721408e-05, - "loss": 0.979, + "epoch": 0.27515614156835533, + "grad_norm": 1.0153892040252686, + "learning_rate": 2.7510841283607982e-05, + "loss": 0.8516, "step": 1586 }, { - "epoch": 0.5795143326638671, - "grad_norm": 1.3198533058166504, - "learning_rate": 3.8772500874864886e-05, - "loss": 0.9944, + "epoch": 0.27532963219986123, + "grad_norm": 0.9643231630325317, + "learning_rate": 2.752818733738075e-05, + "loss": 0.7537, "step": 1587 }, { - "epoch": 0.5798794960744933, - "grad_norm": 0.9557154774665833, - "learning_rate": 3.876754783481729e-05, - "loss": 0.9393, + "epoch": 0.27550312283136713, + "grad_norm": 1.036177635192871, + "learning_rate": 2.7545533391153516e-05, + "loss": 0.708, "step": 1588 }, { - "epoch": 0.5802446594851196, - "grad_norm": 1.361045002937317, - "learning_rate": 3.8762585139619415e-05, - "loss": 0.9852, + "epoch": 0.27567661346287303, + "grad_norm": 1.2434580326080322, + "learning_rate": 2.7562879444926284e-05, + "loss": 0.7344, "step": 1589 }, { - "epoch": 0.5806098228957458, - "grad_norm": 1.1899428367614746, - "learning_rate": 3.875761279182439e-05, - "loss": 0.9294, + "epoch": 0.2758501040943789, + "grad_norm": 0.8967628479003906, + "learning_rate": 2.758022549869905e-05, + "loss": 0.8882, "step": 1590 }, { - "epoch": 0.5809749863063721, - "grad_norm": 1.0059396028518677, - "learning_rate": 3.875263079399028e-05, - "loss": 0.9989, + "epoch": 0.2760235947258848, + "grad_norm": 0.9281281232833862, + "learning_rate": 2.7597571552471814e-05, + "loss": 0.877, "step": 1591 }, { - "epoch": 0.5813401497169983, - "grad_norm": 1.002609133720398, - "learning_rate": 3.874763914868013e-05, - "loss": 0.9516, + "epoch": 0.2761970853573907, + "grad_norm": 0.9629256129264832, + "learning_rate": 2.7614917606244583e-05, + "loss": 0.6561, "step": 1592 }, { - "epoch": 0.5817053131276246, - "grad_norm": 1.52043616771698, - "learning_rate": 3.874263785846192e-05, - "loss": 0.9785, + "epoch": 0.2763705759888966, + "grad_norm": 1.0486847162246704, + "learning_rate": 2.7632263660017348e-05, + "loss": 0.7327, "step": 1593 }, { - "epoch": 0.5820704765382508, - "grad_norm": 1.233569860458374, - "learning_rate": 3.873762692590863e-05, - "loss": 1.0221, + "epoch": 0.2765440666204025, + "grad_norm": 0.9916507601737976, + "learning_rate": 2.7649609713790116e-05, + "loss": 0.7791, "step": 1594 }, { - "epoch": 0.5824356399488771, - "grad_norm": 1.1855578422546387, - "learning_rate": 3.8732606353598185e-05, - "loss": 0.9862, + "epoch": 0.2767175572519084, + "grad_norm": 0.8916260004043579, + "learning_rate": 2.766695576756288e-05, + "loss": 0.842, "step": 1595 }, { - "epoch": 0.5828008033595033, - "grad_norm": 1.0793896913528442, - "learning_rate": 3.872757614411346e-05, - "loss": 0.9642, + "epoch": 0.2768910478834143, + "grad_norm": 0.7631053924560547, + "learning_rate": 2.768430182133565e-05, + "loss": 0.873, "step": 1596 }, { - "epoch": 0.5831659667701297, - "grad_norm": 1.624808669090271, - "learning_rate": 3.8722536300042305e-05, - "loss": 1.0514, + "epoch": 0.2770645385149202, + "grad_norm": 0.8734738230705261, + "learning_rate": 2.7701647875108414e-05, + "loss": 0.7971, "step": 1597 }, { - "epoch": 0.5835311301807559, - "grad_norm": 1.3094017505645752, - "learning_rate": 3.871748682397751e-05, - "loss": 0.9417, + "epoch": 0.2772380291464261, + "grad_norm": 0.9085443615913391, + "learning_rate": 2.7718993928881183e-05, + "loss": 0.9685, "step": 1598 }, { - "epoch": 0.5838962935913822, - "grad_norm": 1.4937210083007812, - "learning_rate": 3.871242771851683e-05, - "loss": 0.9684, + "epoch": 0.277411519777932, + "grad_norm": 0.9521569013595581, + "learning_rate": 2.7736339982653948e-05, + "loss": 0.8157, "step": 1599 }, { - "epoch": 0.5842614570020084, - "grad_norm": 1.1046578884124756, - "learning_rate": 3.870735898626297e-05, - "loss": 0.9734, + "epoch": 0.2775850104094379, + "grad_norm": 0.8425480723381042, + "learning_rate": 2.7753686036426716e-05, + "loss": 0.866, "step": 1600 }, { - "epoch": 0.5846266204126347, - "grad_norm": 1.2050797939300537, - "learning_rate": 3.8702280629823595e-05, - "loss": 1.0089, + "epoch": 0.2777585010409438, + "grad_norm": 1.3279259204864502, + "learning_rate": 2.777103209019948e-05, + "loss": 0.8118, "step": 1601 }, { - "epoch": 0.5849917838232609, - "grad_norm": 0.9763110876083374, - "learning_rate": 3.8697192651811305e-05, - "loss": 0.944, + "epoch": 0.2779319916724497, + "grad_norm": 0.784000039100647, + "learning_rate": 2.7788378143972246e-05, + "loss": 0.7966, "step": 1602 }, { - "epoch": 0.5853569472338872, - "grad_norm": 1.7331297397613525, - "learning_rate": 3.869209505484367e-05, - "loss": 0.9933, + "epoch": 0.2781054823039556, + "grad_norm": 0.7886543273925781, + "learning_rate": 2.7805724197745015e-05, + "loss": 0.8767, "step": 1603 }, { - "epoch": 0.5857221106445134, - "grad_norm": 1.0651476383209229, - "learning_rate": 3.86869878415432e-05, - "loss": 0.9746, + "epoch": 0.2782789729354615, + "grad_norm": 0.8421694040298462, + "learning_rate": 2.782307025151778e-05, + "loss": 0.7451, "step": 1604 }, { - "epoch": 0.5860872740551397, - "grad_norm": 1.935243844985962, - "learning_rate": 3.868187101453734e-05, - "loss": 0.9418, + "epoch": 0.2784524635669674, + "grad_norm": 0.8511319160461426, + "learning_rate": 2.7840416305290548e-05, + "loss": 0.7683, "step": 1605 }, { - "epoch": 0.5864524374657659, - "grad_norm": 1.2386976480484009, - "learning_rate": 3.867674457645851e-05, - "loss": 1.0247, + "epoch": 0.2786259541984733, + "grad_norm": 1.0992226600646973, + "learning_rate": 2.7857762359063313e-05, + "loss": 0.7764, "step": 1606 }, { - "epoch": 0.5868176008763922, - "grad_norm": 0.9830459356307983, - "learning_rate": 3.8671608529944035e-05, - "loss": 0.9465, + "epoch": 0.2787994448299792, + "grad_norm": 0.8646966218948364, + "learning_rate": 2.787510841283608e-05, + "loss": 0.8005, "step": 1607 }, { - "epoch": 0.5871827642870184, - "grad_norm": 1.2278460264205933, - "learning_rate": 3.866646287763622e-05, - "loss": 1.0006, + "epoch": 0.2789729354614851, + "grad_norm": 0.9306199550628662, + "learning_rate": 2.7892454466608846e-05, + "loss": 0.8489, "step": 1608 }, { - "epoch": 0.5875479276976447, - "grad_norm": 0.8639232516288757, - "learning_rate": 3.86613076221823e-05, - "loss": 0.9249, + "epoch": 0.27914642609299095, + "grad_norm": 0.9545014500617981, + "learning_rate": 2.7909800520381615e-05, + "loss": 0.908, "step": 1609 }, { - "epoch": 0.587913091108271, - "grad_norm": 1.022477626800537, - "learning_rate": 3.865614276623443e-05, - "loss": 0.991, + "epoch": 0.27931991672449685, + "grad_norm": 1.0205230712890625, + "learning_rate": 2.792714657415438e-05, + "loss": 0.7156, "step": 1610 }, { - "epoch": 0.5882782545188973, - "grad_norm": 1.3306070566177368, - "learning_rate": 3.8650968312449745e-05, - "loss": 0.9609, + "epoch": 0.27949340735600275, + "grad_norm": 1.6990107297897339, + "learning_rate": 2.7944492627927148e-05, + "loss": 0.8075, "step": 1611 }, { - "epoch": 0.5886434179295235, - "grad_norm": 0.9014416337013245, - "learning_rate": 3.864578426349027e-05, - "loss": 0.9696, + "epoch": 0.27966689798750866, + "grad_norm": 1.5446090698242188, + "learning_rate": 2.7961838681699917e-05, + "loss": 0.6886, "step": 1612 }, { - "epoch": 0.5890085813401497, - "grad_norm": 1.432660460472107, - "learning_rate": 3.8640590622023005e-05, - "loss": 1.006, + "epoch": 0.27984038861901456, + "grad_norm": 1.4868167638778687, + "learning_rate": 2.7979184735472685e-05, + "loss": 0.7312, "step": 1613 }, { - "epoch": 0.589373744750776, - "grad_norm": 1.2367098331451416, - "learning_rate": 3.863538739071986e-05, - "loss": 0.9386, + "epoch": 0.28001387925052046, + "grad_norm": 0.9284515380859375, + "learning_rate": 2.799653078924545e-05, + "loss": 0.7502, "step": 1614 }, { - "epoch": 0.5897389081614022, - "grad_norm": 1.5805164575576782, - "learning_rate": 3.86301745722577e-05, - "loss": 0.9429, + "epoch": 0.28018736988202636, + "grad_norm": 1.0670562982559204, + "learning_rate": 2.801387684301822e-05, + "loss": 0.8055, "step": 1615 }, { - "epoch": 0.5901040715720285, - "grad_norm": 1.2031488418579102, - "learning_rate": 3.86249521693183e-05, - "loss": 0.9771, + "epoch": 0.28036086051353226, + "grad_norm": 0.8268882036209106, + "learning_rate": 2.8031222896790983e-05, + "loss": 0.8438, "step": 1616 }, { - "epoch": 0.5904692349826547, - "grad_norm": 0.9730921387672424, - "learning_rate": 3.861972018458838e-05, - "loss": 0.9554, + "epoch": 0.28053435114503816, + "grad_norm": 0.8545418381690979, + "learning_rate": 2.8048568950563752e-05, + "loss": 0.7725, "step": 1617 }, { - "epoch": 0.590834398393281, - "grad_norm": 1.0020546913146973, - "learning_rate": 3.861447862075959e-05, - "loss": 0.908, + "epoch": 0.28070784177654406, + "grad_norm": 0.8699156045913696, + "learning_rate": 2.8065915004336517e-05, + "loss": 0.7098, "step": 1618 }, { - "epoch": 0.5911995618039072, - "grad_norm": 1.9879236221313477, - "learning_rate": 3.86092274805285e-05, - "loss": 0.9733, + "epoch": 0.28088133240804997, + "grad_norm": 0.9826617240905762, + "learning_rate": 2.8083261058109285e-05, + "loss": 0.7998, "step": 1619 }, { - "epoch": 0.5915647252145335, - "grad_norm": 1.207742691040039, - "learning_rate": 3.8603966766596624e-05, - "loss": 0.9352, + "epoch": 0.28105482303955587, + "grad_norm": 0.8360748291015625, + "learning_rate": 2.810060711188205e-05, + "loss": 0.8931, "step": 1620 }, { - "epoch": 0.5919298886251597, - "grad_norm": 1.1802479028701782, - "learning_rate": 3.8598696481670364e-05, - "loss": 0.9912, + "epoch": 0.28122831367106177, + "grad_norm": 0.9160963296890259, + "learning_rate": 2.8117953165654815e-05, + "loss": 0.9153, "step": 1621 }, { - "epoch": 0.592295052035786, - "grad_norm": 1.2929272651672363, - "learning_rate": 3.859341662846109e-05, - "loss": 0.935, + "epoch": 0.28140180430256767, + "grad_norm": 1.853236436843872, + "learning_rate": 2.8135299219427584e-05, + "loss": 0.7693, "step": 1622 }, { - "epoch": 0.5926602154464122, - "grad_norm": 0.9918064475059509, - "learning_rate": 3.858812720968507e-05, - "loss": 0.9495, + "epoch": 0.28157529493407357, + "grad_norm": 0.9883246421813965, + "learning_rate": 2.815264527320035e-05, + "loss": 0.7668, "step": 1623 }, { - "epoch": 0.5930253788570385, - "grad_norm": 1.14697265625, - "learning_rate": 3.858282822806349e-05, - "loss": 0.9703, + "epoch": 0.2817487855655795, + "grad_norm": 0.9887538552284241, + "learning_rate": 2.8169991326973117e-05, + "loss": 0.6775, "step": 1624 }, { - "epoch": 0.5933905422676647, - "grad_norm": 1.3090518712997437, - "learning_rate": 3.857751968632247e-05, - "loss": 0.9725, + "epoch": 0.2819222761970854, + "grad_norm": 0.7843948602676392, + "learning_rate": 2.8187337380745882e-05, + "loss": 0.9031, "step": 1625 }, { - "epoch": 0.5937557056782911, - "grad_norm": 1.3748619556427002, - "learning_rate": 3.857220158719305e-05, - "loss": 0.9962, + "epoch": 0.2820957668285913, + "grad_norm": 0.7027727961540222, + "learning_rate": 2.820468343451865e-05, + "loss": 0.8618, "step": 1626 }, { - "epoch": 0.5941208690889173, - "grad_norm": 2.7073593139648438, - "learning_rate": 3.8566873933411156e-05, - "loss": 0.9621, + "epoch": 0.2822692574600972, + "grad_norm": 0.9403607845306396, + "learning_rate": 2.8222029488291415e-05, + "loss": 0.7216, "step": 1627 }, { - "epoch": 0.5944860324995436, - "grad_norm": 1.152963638305664, - "learning_rate": 3.856153672771767e-05, - "loss": 0.996, + "epoch": 0.2824427480916031, + "grad_norm": 1.1979365348815918, + "learning_rate": 2.8239375542064184e-05, + "loss": 0.6919, "step": 1628 }, { - "epoch": 0.5948511959101698, - "grad_norm": 0.976712167263031, - "learning_rate": 3.855618997285837e-05, - "loss": 0.9943, + "epoch": 0.2826162387231089, + "grad_norm": 0.9449877142906189, + "learning_rate": 2.825672159583695e-05, + "loss": 0.8684, "step": 1629 }, { - "epoch": 0.5952163593207961, - "grad_norm": 1.3514211177825928, - "learning_rate": 3.855083367158394e-05, - "loss": 0.9662, + "epoch": 0.2827897293546148, + "grad_norm": 1.2824125289916992, + "learning_rate": 2.8274067649609717e-05, + "loss": 0.6604, "step": 1630 }, { - "epoch": 0.5955815227314223, - "grad_norm": 1.2582956552505493, - "learning_rate": 3.854546782664998e-05, - "loss": 0.9681, + "epoch": 0.28296321998612073, + "grad_norm": 0.8959324955940247, + "learning_rate": 2.8291413703382482e-05, + "loss": 0.8174, "step": 1631 }, { - "epoch": 0.5959466861420486, - "grad_norm": 1.2225971221923828, - "learning_rate": 3.854009244081701e-05, - "loss": 0.9355, + "epoch": 0.28313671061762663, + "grad_norm": 1.2085224390029907, + "learning_rate": 2.8308759757155247e-05, + "loss": 0.7031, "step": 1632 }, { - "epoch": 0.5963118495526748, - "grad_norm": 0.7337695956230164, - "learning_rate": 3.8534707516850446e-05, - "loss": 0.9694, + "epoch": 0.28331020124913253, + "grad_norm": 1.2435226440429688, + "learning_rate": 2.8326105810928015e-05, + "loss": 0.7727, "step": 1633 }, { - "epoch": 0.5966770129633011, - "grad_norm": 1.3417986631393433, - "learning_rate": 3.852931305752062e-05, - "loss": 0.9777, + "epoch": 0.28348369188063843, + "grad_norm": 0.8147986531257629, + "learning_rate": 2.834345186470078e-05, + "loss": 0.8062, "step": 1634 }, { - "epoch": 0.5970421763739273, - "grad_norm": 1.122308611869812, - "learning_rate": 3.852390906560276e-05, - "loss": 0.9443, + "epoch": 0.28365718251214433, + "grad_norm": 1.0525031089782715, + "learning_rate": 2.836079791847355e-05, + "loss": 0.7371, "step": 1635 }, { - "epoch": 0.5974073397845536, - "grad_norm": 1.1461122035980225, - "learning_rate": 3.8518495543877e-05, - "loss": 0.9891, + "epoch": 0.28383067314365024, + "grad_norm": 0.9755192399024963, + "learning_rate": 2.8378143972246314e-05, + "loss": 0.7456, "step": 1636 }, { - "epoch": 0.5977725031951798, - "grad_norm": 1.2063510417938232, - "learning_rate": 3.8513072495128385e-05, - "loss": 0.9641, + "epoch": 0.28400416377515614, + "grad_norm": 1.0865083932876587, + "learning_rate": 2.8395490026019082e-05, + "loss": 0.7573, "step": 1637 }, { - "epoch": 0.5981376666058061, - "grad_norm": 1.1760812997817993, - "learning_rate": 3.850763992214686e-05, - "loss": 1.0101, + "epoch": 0.28417765440666204, + "grad_norm": 1.1308993101119995, + "learning_rate": 2.8412836079791847e-05, + "loss": 0.7952, "step": 1638 }, { - "epoch": 0.5985028300164323, - "grad_norm": 1.279311180114746, - "learning_rate": 3.8502197827727254e-05, - "loss": 0.9828, + "epoch": 0.28435114503816794, + "grad_norm": 1.446656584739685, + "learning_rate": 2.8430182133564616e-05, + "loss": 0.8633, "step": 1639 }, { - "epoch": 0.5988679934270587, - "grad_norm": 1.1655765771865845, - "learning_rate": 3.849674621466931e-05, - "loss": 0.9263, + "epoch": 0.28452463566967384, + "grad_norm": 3.241093158721924, + "learning_rate": 2.844752818733738e-05, + "loss": 0.7356, "step": 1640 }, { - "epoch": 0.5992331568376849, - "grad_norm": 1.3590879440307617, - "learning_rate": 3.849128508577767e-05, - "loss": 0.9954, + "epoch": 0.28469812630117974, + "grad_norm": 0.8879625797271729, + "learning_rate": 2.8464874241110152e-05, + "loss": 0.855, "step": 1641 }, { - "epoch": 0.5995983202483112, - "grad_norm": 1.293040156364441, - "learning_rate": 3.848581444386187e-05, - "loss": 0.9703, + "epoch": 0.28487161693268565, + "grad_norm": 0.7868815660476685, + "learning_rate": 2.8482220294882917e-05, + "loss": 0.8979, "step": 1642 }, { - "epoch": 0.5999634836589374, - "grad_norm": 1.182997226715088, - "learning_rate": 3.848033429173632e-05, - "loss": 1.0052, + "epoch": 0.28504510756419155, + "grad_norm": 1.1814491748809814, + "learning_rate": 2.8499566348655686e-05, + "loss": 0.9617, "step": 1643 }, { - "epoch": 0.6003286470695637, - "grad_norm": 1.0324985980987549, - "learning_rate": 3.847484463222035e-05, - "loss": 0.9063, + "epoch": 0.28521859819569745, + "grad_norm": 0.7644641995429993, + "learning_rate": 2.851691240242845e-05, + "loss": 0.8857, "step": 1644 }, { - "epoch": 0.6006938104801899, - "grad_norm": 0.8648563623428345, - "learning_rate": 3.846934546813816e-05, - "loss": 0.9744, + "epoch": 0.28539208882720335, + "grad_norm": 0.8879674077033997, + "learning_rate": 2.853425845620122e-05, + "loss": 0.7023, "step": 1645 }, { - "epoch": 0.6010589738908162, - "grad_norm": 1.0205049514770508, - "learning_rate": 3.8463836802318865e-05, - "loss": 0.9407, + "epoch": 0.28556557945870925, + "grad_norm": 1.050529956817627, + "learning_rate": 2.8551604509973984e-05, + "loss": 0.7996, "step": 1646 }, { - "epoch": 0.6014241373014424, - "grad_norm": 1.2607346773147583, - "learning_rate": 3.8458318637596434e-05, - "loss": 0.9979, + "epoch": 0.28573907009021515, + "grad_norm": 1.0993930101394653, + "learning_rate": 2.8568950563746753e-05, + "loss": 0.8428, "step": 1647 }, { - "epoch": 0.6017893007120686, - "grad_norm": 1.15291166305542, - "learning_rate": 3.845279097680975e-05, - "loss": 0.9487, + "epoch": 0.28591256072172105, + "grad_norm": 0.8269695043563843, + "learning_rate": 2.8586296617519518e-05, + "loss": 0.8623, "step": 1648 }, { - "epoch": 0.6021544641226949, - "grad_norm": 1.1354620456695557, - "learning_rate": 3.844725382280258e-05, - "loss": 0.9874, + "epoch": 0.2860860513532269, + "grad_norm": 0.8869283199310303, + "learning_rate": 2.8603642671292286e-05, + "loss": 1.011, "step": 1649 }, { - "epoch": 0.6025196275333211, - "grad_norm": 1.1499407291412354, - "learning_rate": 3.8441707178423554e-05, - "loss": 0.945, + "epoch": 0.2862595419847328, + "grad_norm": 1.0457019805908203, + "learning_rate": 2.862098872506505e-05, + "loss": 0.8403, "step": 1650 }, { - "epoch": 0.6028847909439474, - "grad_norm": 0.9872992634773254, - "learning_rate": 3.843615104652621e-05, - "loss": 0.939, + "epoch": 0.2864330326162387, + "grad_norm": 1.1874597072601318, + "learning_rate": 2.8638334778837816e-05, + "loss": 0.8582, "step": 1651 }, { - "epoch": 0.6032499543545736, - "grad_norm": 1.1841249465942383, - "learning_rate": 3.843058542996895e-05, - "loss": 1.0023, + "epoch": 0.2866065232477446, + "grad_norm": 0.821327805519104, + "learning_rate": 2.8655680832610584e-05, + "loss": 0.8772, "step": 1652 }, { - "epoch": 0.6036151177651999, - "grad_norm": 0.8201302289962769, - "learning_rate": 3.842501033161505e-05, - "loss": 0.9591, + "epoch": 0.2867800138792505, + "grad_norm": 0.7324346899986267, + "learning_rate": 2.867302688638335e-05, + "loss": 0.9062, "step": 1653 }, { - "epoch": 0.6039802811758261, - "grad_norm": 1.2150993347167969, - "learning_rate": 3.8419425754332694e-05, - "loss": 0.9468, + "epoch": 0.2869535045107564, + "grad_norm": 1.2484420537948608, + "learning_rate": 2.8690372940156118e-05, + "loss": 0.6326, "step": 1654 }, { - "epoch": 0.6043454445864525, - "grad_norm": 1.4054694175720215, - "learning_rate": 3.8413831700994905e-05, - "loss": 1.0139, + "epoch": 0.2871269951422623, + "grad_norm": 1.0408523082733154, + "learning_rate": 2.8707718993928883e-05, + "loss": 0.7061, "step": 1655 }, { - "epoch": 0.6047106079970787, - "grad_norm": 1.6987162828445435, - "learning_rate": 3.840822817447961e-05, - "loss": 1.0356, + "epoch": 0.2873004857737682, + "grad_norm": 0.8671010136604309, + "learning_rate": 2.872506504770165e-05, + "loss": 0.9277, "step": 1656 }, { - "epoch": 0.605075771407705, - "grad_norm": 1.2456711530685425, - "learning_rate": 3.8402615177669604e-05, - "loss": 0.9835, + "epoch": 0.2874739764052741, + "grad_norm": 0.8296065330505371, + "learning_rate": 2.8742411101474416e-05, + "loss": 1.011, "step": 1657 }, { - "epoch": 0.6054409348183312, - "grad_norm": 1.6318790912628174, - "learning_rate": 3.839699271345253e-05, - "loss": 0.9808, + "epoch": 0.28764746703678, + "grad_norm": 0.7865563631057739, + "learning_rate": 2.8759757155247185e-05, + "loss": 0.7434, "step": 1658 }, { - "epoch": 0.6058060982289575, - "grad_norm": 1.1166234016418457, - "learning_rate": 3.839136078472093e-05, - "loss": 0.981, + "epoch": 0.2878209576682859, + "grad_norm": 0.900084376335144, + "learning_rate": 2.877710320901995e-05, + "loss": 0.6738, "step": 1659 }, { - "epoch": 0.6061712616395837, - "grad_norm": 1.2439149618148804, - "learning_rate": 3.838571939437221e-05, - "loss": 0.9714, + "epoch": 0.2879944482997918, + "grad_norm": 0.8910037875175476, + "learning_rate": 2.8794449262792718e-05, + "loss": 0.7634, "step": 1660 }, { - "epoch": 0.60653642505021, - "grad_norm": 1.184201955795288, - "learning_rate": 3.838006854530863e-05, - "loss": 0.9375, + "epoch": 0.2881679389312977, + "grad_norm": 1.1437269449234009, + "learning_rate": 2.8811795316565483e-05, + "loss": 0.6714, "step": 1661 }, { - "epoch": 0.6069015884608362, - "grad_norm": 1.2011412382125854, - "learning_rate": 3.837440824043734e-05, - "loss": 0.9706, + "epoch": 0.2883414295628036, + "grad_norm": 1.217750906944275, + "learning_rate": 2.8829141370338248e-05, + "loss": 0.896, "step": 1662 }, { - "epoch": 0.6072667518714625, - "grad_norm": 1.0476619005203247, - "learning_rate": 3.8368738482670315e-05, - "loss": 0.9709, + "epoch": 0.2885149201943095, + "grad_norm": 0.8291050791740417, + "learning_rate": 2.8846487424111016e-05, + "loss": 0.8162, "step": 1663 }, { - "epoch": 0.6076319152820887, - "grad_norm": 1.2018295526504517, - "learning_rate": 3.8363059274924445e-05, - "loss": 0.9569, + "epoch": 0.2886884108258154, + "grad_norm": 2.4025022983551025, + "learning_rate": 2.886383347788378e-05, + "loss": 0.7378, "step": 1664 }, { - "epoch": 0.607997078692715, - "grad_norm": 1.1869772672653198, - "learning_rate": 3.835737062012143e-05, - "loss": 1.0033, + "epoch": 0.2888619014573213, + "grad_norm": 1.0357331037521362, + "learning_rate": 2.888117953165655e-05, + "loss": 0.7389, "step": 1665 }, { - "epoch": 0.6083622421033412, - "grad_norm": 1.933923363685608, - "learning_rate": 3.8351672521187874e-05, - "loss": 0.958, + "epoch": 0.2890353920888272, + "grad_norm": 1.4848742485046387, + "learning_rate": 2.8898525585429315e-05, + "loss": 0.7974, "step": 1666 }, { - "epoch": 0.6087274055139675, - "grad_norm": 1.3390458822250366, - "learning_rate": 3.834596498105521e-05, - "loss": 0.9626, + "epoch": 0.2892088827203331, + "grad_norm": 2.062593460083008, + "learning_rate": 2.8915871639202083e-05, + "loss": 0.6948, "step": 1667 }, { - "epoch": 0.6090925689245937, - "grad_norm": 0.7330198884010315, - "learning_rate": 3.8340248002659745e-05, - "loss": 0.9697, + "epoch": 0.28938237335183903, + "grad_norm": 0.9019619822502136, + "learning_rate": 2.8933217692974848e-05, + "loss": 0.7366, "step": 1668 }, { - "epoch": 0.60945773233522, - "grad_norm": 0.9614207744598389, - "learning_rate": 3.8334521588942626e-05, - "loss": 0.9644, + "epoch": 0.2895558639833449, + "grad_norm": 1.0095984935760498, + "learning_rate": 2.8950563746747617e-05, + "loss": 0.7244, "step": 1669 }, { - "epoch": 0.6098228957458462, - "grad_norm": 1.1485025882720947, - "learning_rate": 3.832878574284988e-05, - "loss": 0.9719, + "epoch": 0.2897293546148508, + "grad_norm": 0.9408945441246033, + "learning_rate": 2.896790980052038e-05, + "loss": 0.6956, "step": 1670 }, { - "epoch": 0.6101880591564726, - "grad_norm": 1.0057835578918457, - "learning_rate": 3.8323040467332344e-05, - "loss": 0.9479, + "epoch": 0.2899028452463567, + "grad_norm": 0.9276244044303894, + "learning_rate": 2.8985255854293153e-05, + "loss": 0.8726, "step": 1671 }, { - "epoch": 0.6105532225670988, - "grad_norm": 1.0965198278427124, - "learning_rate": 3.8317285765345746e-05, - "loss": 0.9723, + "epoch": 0.2900763358778626, + "grad_norm": 0.9453510642051697, + "learning_rate": 2.900260190806592e-05, + "loss": 0.8081, "step": 1672 }, { - "epoch": 0.6109183859777251, - "grad_norm": 1.2669695615768433, - "learning_rate": 3.831152163985065e-05, - "loss": 0.9626, + "epoch": 0.2902498265093685, + "grad_norm": 1.4317481517791748, + "learning_rate": 2.9019947961838687e-05, + "loss": 0.7974, "step": 1673 }, { - "epoch": 0.6112835493883513, - "grad_norm": 1.1820439100265503, - "learning_rate": 3.830574809381247e-05, - "loss": 0.9392, + "epoch": 0.2904233171408744, + "grad_norm": 0.9829313158988953, + "learning_rate": 2.9037294015611452e-05, + "loss": 0.9028, "step": 1674 }, { - "epoch": 0.6116487127989776, - "grad_norm": 1.0535825490951538, - "learning_rate": 3.829996513020146e-05, - "loss": 0.9615, + "epoch": 0.2905968077723803, + "grad_norm": 0.7977801561355591, + "learning_rate": 2.905464006938422e-05, + "loss": 0.8201, "step": 1675 }, { - "epoch": 0.6120138762096038, - "grad_norm": 1.4263200759887695, - "learning_rate": 3.829417275199272e-05, - "loss": 0.9777, + "epoch": 0.2907702984038862, + "grad_norm": 1.097178339958191, + "learning_rate": 2.9071986123156985e-05, + "loss": 0.7903, "step": 1676 }, { - "epoch": 0.6123790396202301, - "grad_norm": 1.0502464771270752, - "learning_rate": 3.8288370962166194e-05, - "loss": 0.9865, + "epoch": 0.2909437890353921, + "grad_norm": 2.5548722743988037, + "learning_rate": 2.9089332176929753e-05, + "loss": 0.7937, "step": 1677 }, { - "epoch": 0.6127442030308563, - "grad_norm": 1.0033894777297974, - "learning_rate": 3.828255976370668e-05, - "loss": 0.9813, + "epoch": 0.291117279666898, + "grad_norm": 0.7736148238182068, + "learning_rate": 2.910667823070252e-05, + "loss": 0.8608, "step": 1678 }, { - "epoch": 0.6131093664414826, - "grad_norm": 1.0654264688491821, - "learning_rate": 3.8276739159603795e-05, - "loss": 0.9611, + "epoch": 0.2912907702984039, + "grad_norm": 0.8809636235237122, + "learning_rate": 2.9124024284475287e-05, + "loss": 0.9314, "step": 1679 }, { - "epoch": 0.6134745298521088, - "grad_norm": 1.6478551626205444, - "learning_rate": 3.827090915285202e-05, - "loss": 0.9956, + "epoch": 0.2914642609299098, + "grad_norm": 2.513631582260132, + "learning_rate": 2.9141370338248052e-05, + "loss": 0.9355, "step": 1680 }, { - "epoch": 0.613839693262735, - "grad_norm": 1.2268860340118408, - "learning_rate": 3.826506974645065e-05, - "loss": 0.9674, + "epoch": 0.2916377515614157, + "grad_norm": 0.8027017712593079, + "learning_rate": 2.9158716392020817e-05, + "loss": 0.8613, "step": 1681 }, { - "epoch": 0.6142048566733613, - "grad_norm": 0.8647541403770447, - "learning_rate": 3.8259220943403825e-05, - "loss": 0.9456, + "epoch": 0.2918112421929216, + "grad_norm": 0.7957848906517029, + "learning_rate": 2.9176062445793585e-05, + "loss": 1.0049, "step": 1682 }, { - "epoch": 0.6145700200839875, - "grad_norm": 1.1083359718322754, - "learning_rate": 3.825336274672053e-05, - "loss": 0.9374, + "epoch": 0.2919847328244275, + "grad_norm": 0.8930432796478271, + "learning_rate": 2.919340849956635e-05, + "loss": 0.865, "step": 1683 }, { - "epoch": 0.6149351834946138, - "grad_norm": 1.4323110580444336, - "learning_rate": 3.824749515941455e-05, - "loss": 0.9675, + "epoch": 0.2921582234559334, + "grad_norm": 0.7868476510047913, + "learning_rate": 2.921075455333912e-05, + "loss": 0.9255, "step": 1684 }, { - "epoch": 0.61530034690524, - "grad_norm": 1.1527587175369263, - "learning_rate": 3.824161818450454e-05, - "loss": 1.007, + "epoch": 0.2923317140874393, + "grad_norm": 0.7728238105773926, + "learning_rate": 2.9228100607111884e-05, + "loss": 0.7251, "step": 1685 }, { - "epoch": 0.6156655103158664, - "grad_norm": 0.9963729977607727, - "learning_rate": 3.823573182501397e-05, - "loss": 0.9403, + "epoch": 0.2925052047189452, + "grad_norm": 0.7930857539176941, + "learning_rate": 2.9245446660884652e-05, + "loss": 0.8052, "step": 1686 }, { - "epoch": 0.6160306737264926, - "grad_norm": 1.4043936729431152, - "learning_rate": 3.822983608397113e-05, - "loss": 0.948, + "epoch": 0.2926786953504511, + "grad_norm": 0.964941680431366, + "learning_rate": 2.9262792714657417e-05, + "loss": 0.8066, "step": 1687 }, { - "epoch": 0.6163958371371189, - "grad_norm": 1.1846081018447876, - "learning_rate": 3.8223930964409136e-05, - "loss": 0.9747, + "epoch": 0.29285218598195695, + "grad_norm": 1.3852626085281372, + "learning_rate": 2.9280138768430185e-05, + "loss": 0.7104, "step": 1688 }, { - "epoch": 0.6167610005477451, - "grad_norm": 1.230036735534668, - "learning_rate": 3.821801646936595e-05, - "loss": 0.9764, + "epoch": 0.29302567661346285, + "grad_norm": 0.8429183959960938, + "learning_rate": 2.929748482220295e-05, + "loss": 0.8889, "step": 1689 }, { - "epoch": 0.6171261639583714, - "grad_norm": 0.921221911907196, - "learning_rate": 3.821209260188433e-05, - "loss": 0.9733, + "epoch": 0.29319916724496875, + "grad_norm": 0.9193442463874817, + "learning_rate": 2.931483087597572e-05, + "loss": 0.8777, "step": 1690 }, { - "epoch": 0.6174913273689976, - "grad_norm": 1.1442986726760864, - "learning_rate": 3.8206159365011875e-05, - "loss": 0.9536, + "epoch": 0.29337265787647465, + "grad_norm": 0.8422867059707642, + "learning_rate": 2.9332176929748484e-05, + "loss": 0.7937, "step": 1691 }, { - "epoch": 0.6178564907796239, - "grad_norm": 0.9673666954040527, - "learning_rate": 3.8200216761800986e-05, - "loss": 0.9734, + "epoch": 0.29354614850798055, + "grad_norm": 1.0377291440963745, + "learning_rate": 2.934952298352125e-05, + "loss": 0.7634, "step": 1692 }, { - "epoch": 0.6182216541902501, - "grad_norm": 1.7896100282669067, - "learning_rate": 3.819426479530891e-05, - "loss": 0.9492, + "epoch": 0.29371963913948645, + "grad_norm": 0.8674811720848083, + "learning_rate": 2.9366869037294017e-05, + "loss": 1.0037, "step": 1693 }, { - "epoch": 0.6185868176008764, - "grad_norm": 0.9377707242965698, - "learning_rate": 3.8188303468597684e-05, - "loss": 0.9447, + "epoch": 0.29389312977099236, + "grad_norm": 0.8458404541015625, + "learning_rate": 2.9384215091066782e-05, + "loss": 0.7756, "step": 1694 }, { - "epoch": 0.6189519810115026, - "grad_norm": 1.6506208181381226, - "learning_rate": 3.818233278473417e-05, - "loss": 0.9476, + "epoch": 0.29406662040249826, + "grad_norm": 0.8742087483406067, + "learning_rate": 2.940156114483955e-05, + "loss": 0.7847, "step": 1695 }, { - "epoch": 0.6193171444221289, - "grad_norm": 0.9017136693000793, - "learning_rate": 3.817635274679006e-05, - "loss": 0.9427, + "epoch": 0.29424011103400416, + "grad_norm": 1.1683319807052612, + "learning_rate": 2.9418907198612316e-05, + "loss": 0.7869, "step": 1696 }, { - "epoch": 0.6196823078327551, - "grad_norm": 1.076751947402954, - "learning_rate": 3.817036335784183e-05, - "loss": 0.9341, + "epoch": 0.29441360166551006, + "grad_norm": 1.1201775074005127, + "learning_rate": 2.9436253252385084e-05, + "loss": 0.6924, "step": 1697 }, { - "epoch": 0.6200474712433814, - "grad_norm": 1.1123216152191162, - "learning_rate": 3.816436462097079e-05, - "loss": 0.9628, + "epoch": 0.29458709229701596, + "grad_norm": 1.3529385328292847, + "learning_rate": 2.945359930615785e-05, + "loss": 0.6798, "step": 1698 }, { - "epoch": 0.6204126346540076, - "grad_norm": 1.0265611410140991, - "learning_rate": 3.815835653926303e-05, - "loss": 0.9465, + "epoch": 0.29476058292852186, + "grad_norm": 1.0174338817596436, + "learning_rate": 2.9470945359930617e-05, + "loss": 0.6848, "step": 1699 }, { - "epoch": 0.620777798064634, - "grad_norm": 1.1636338233947754, - "learning_rate": 3.8152339115809486e-05, - "loss": 0.9036, + "epoch": 0.29493407356002777, + "grad_norm": 1.214248538017273, + "learning_rate": 2.9488291413703386e-05, + "loss": 0.728, "step": 1700 }, { - "epoch": 0.6211429614752602, - "grad_norm": 1.0679070949554443, - "learning_rate": 3.814631235370587e-05, - "loss": 0.9669, + "epoch": 0.29510756419153367, + "grad_norm": 1.0517425537109375, + "learning_rate": 2.9505637467476154e-05, + "loss": 0.8912, "step": 1701 }, { - "epoch": 0.6215081248858865, - "grad_norm": 0.9552412033081055, - "learning_rate": 3.814027625605272e-05, - "loss": 0.9164, + "epoch": 0.29528105482303957, + "grad_norm": 0.9108189940452576, + "learning_rate": 2.952298352124892e-05, + "loss": 0.6902, "step": 1702 }, { - "epoch": 0.6218732882965127, - "grad_norm": 1.4121439456939697, - "learning_rate": 3.8134230825955366e-05, - "loss": 0.9421, + "epoch": 0.29545454545454547, + "grad_norm": 1.255733609199524, + "learning_rate": 2.9540329575021688e-05, + "loss": 0.7793, "step": 1703 }, { - "epoch": 0.622238451707139, - "grad_norm": 1.356169581413269, - "learning_rate": 3.812817606652392e-05, - "loss": 0.9905, + "epoch": 0.29562803608605137, + "grad_norm": 1.1171400547027588, + "learning_rate": 2.9557675628794453e-05, + "loss": 0.8528, "step": 1704 }, { - "epoch": 0.6226036151177652, - "grad_norm": 1.1744301319122314, - "learning_rate": 3.812211198087333e-05, - "loss": 0.9474, + "epoch": 0.2958015267175573, + "grad_norm": 1.054380178451538, + "learning_rate": 2.957502168256722e-05, + "loss": 0.9478, "step": 1705 }, { - "epoch": 0.6229687785283915, - "grad_norm": 0.9278100728988647, - "learning_rate": 3.8116038572123325e-05, - "loss": 0.9579, + "epoch": 0.2959750173490632, + "grad_norm": 0.9808557629585266, + "learning_rate": 2.9592367736339986e-05, + "loss": 0.7639, "step": 1706 }, { - "epoch": 0.6233339419390177, - "grad_norm": 1.2588489055633545, - "learning_rate": 3.810995584339843e-05, - "loss": 0.9969, + "epoch": 0.2961485079805691, + "grad_norm": 0.9000292420387268, + "learning_rate": 2.9609713790112754e-05, + "loss": 0.8293, "step": 1707 }, { - "epoch": 0.623699105349644, - "grad_norm": 0.8428167104721069, - "learning_rate": 3.8103863797827955e-05, - "loss": 0.9727, + "epoch": 0.2963219986120749, + "grad_norm": 1.2940888404846191, + "learning_rate": 2.962705984388552e-05, + "loss": 0.855, "step": 1708 }, { - "epoch": 0.6240642687602702, - "grad_norm": 1.2606332302093506, - "learning_rate": 3.809776243854602e-05, - "loss": 0.9822, + "epoch": 0.2964954892435808, + "grad_norm": 0.8848286867141724, + "learning_rate": 2.9644405897658284e-05, + "loss": 0.7927, "step": 1709 }, { - "epoch": 0.6244294321708965, - "grad_norm": 1.6279391050338745, - "learning_rate": 3.8091651768691526e-05, - "loss": 0.9359, + "epoch": 0.2966689798750867, + "grad_norm": 0.7000380754470825, + "learning_rate": 2.9661751951431053e-05, + "loss": 0.8483, "step": 1710 }, { - "epoch": 0.6247945955815227, - "grad_norm": 1.4186426401138306, - "learning_rate": 3.808553179140817e-05, - "loss": 0.9396, + "epoch": 0.2968424705065926, + "grad_norm": 0.859656572341919, + "learning_rate": 2.9679098005203818e-05, + "loss": 0.8184, "step": 1711 }, { - "epoch": 0.625159758992149, - "grad_norm": 1.0185045003890991, - "learning_rate": 3.807940250984444e-05, - "loss": 0.9219, + "epoch": 0.2970159611380985, + "grad_norm": 0.8366445302963257, + "learning_rate": 2.9696444058976586e-05, + "loss": 0.7427, "step": 1712 }, { - "epoch": 0.6255249224027752, - "grad_norm": 1.4092063903808594, - "learning_rate": 3.807326392715359e-05, - "loss": 0.9904, + "epoch": 0.29718945176960443, + "grad_norm": 0.8027821779251099, + "learning_rate": 2.971379011274935e-05, + "loss": 0.958, "step": 1713 }, { - "epoch": 0.6258900858134014, - "grad_norm": 1.1440441608428955, - "learning_rate": 3.806711604649369e-05, - "loss": 0.9563, + "epoch": 0.29736294240111033, + "grad_norm": 1.0442862510681152, + "learning_rate": 2.973113616652212e-05, + "loss": 0.7983, "step": 1714 }, { - "epoch": 0.6262552492240278, - "grad_norm": 1.007660984992981, - "learning_rate": 3.806095887102757e-05, - "loss": 0.9227, + "epoch": 0.29753643303261623, + "grad_norm": 0.8695030808448792, + "learning_rate": 2.9748482220294885e-05, + "loss": 0.9868, "step": 1715 }, { - "epoch": 0.626620412634654, - "grad_norm": 1.4953409433364868, - "learning_rate": 3.805479240392286e-05, - "loss": 0.9877, + "epoch": 0.29770992366412213, + "grad_norm": 1.1406121253967285, + "learning_rate": 2.9765828274067653e-05, + "loss": 0.7402, "step": 1716 }, { - "epoch": 0.6269855760452803, - "grad_norm": 1.3530348539352417, - "learning_rate": 3.804861664835195e-05, - "loss": 0.9935, + "epoch": 0.29788341429562804, + "grad_norm": 1.0053128004074097, + "learning_rate": 2.9783174327840418e-05, + "loss": 0.7363, "step": 1717 }, { - "epoch": 0.6273507394559065, - "grad_norm": 0.9533717632293701, - "learning_rate": 3.8042431607492015e-05, - "loss": 0.9371, + "epoch": 0.29805690492713394, + "grad_norm": 1.6684331893920898, + "learning_rate": 2.9800520381613186e-05, + "loss": 0.9126, "step": 1718 }, { - "epoch": 0.6277159028665328, - "grad_norm": 1.0188987255096436, - "learning_rate": 3.8036237284525016e-05, - "loss": 0.9597, + "epoch": 0.29823039555863984, + "grad_norm": 1.1673744916915894, + "learning_rate": 2.981786643538595e-05, + "loss": 0.8088, "step": 1719 }, { - "epoch": 0.628081066277159, - "grad_norm": 1.3123247623443604, - "learning_rate": 3.8030033682637686e-05, - "loss": 0.965, + "epoch": 0.29840388619014574, + "grad_norm": 0.7744690179824829, + "learning_rate": 2.983521248915872e-05, + "loss": 0.7908, "step": 1720 }, { - "epoch": 0.6284462296877853, - "grad_norm": 1.5189393758773804, - "learning_rate": 3.8023820805021524e-05, - "loss": 0.9701, + "epoch": 0.29857737682165164, + "grad_norm": 0.8944323062896729, + "learning_rate": 2.9852558542931485e-05, + "loss": 0.8091, "step": 1721 }, { - "epoch": 0.6288113930984115, - "grad_norm": 1.3095381259918213, - "learning_rate": 3.801759865487281e-05, - "loss": 0.9751, + "epoch": 0.29875086745315754, + "grad_norm": 1.1848338842391968, + "learning_rate": 2.986990459670425e-05, + "loss": 0.6812, "step": 1722 }, { - "epoch": 0.6291765565090378, - "grad_norm": 1.3902641534805298, - "learning_rate": 3.801136723539259e-05, - "loss": 1.0167, + "epoch": 0.29892435808466344, + "grad_norm": 1.125245213508606, + "learning_rate": 2.9887250650477018e-05, + "loss": 0.6517, "step": 1723 }, { - "epoch": 0.629541719919664, - "grad_norm": 1.0632237195968628, - "learning_rate": 3.8005126549786674e-05, - "loss": 0.9652, + "epoch": 0.29909784871616935, + "grad_norm": 1.1567728519439697, + "learning_rate": 2.9904596704249783e-05, + "loss": 0.7627, "step": 1724 }, { - "epoch": 0.6299068833302903, - "grad_norm": 0.923629105091095, - "learning_rate": 3.7998876601265654e-05, - "loss": 0.9587, + "epoch": 0.29927133934767525, + "grad_norm": 1.8745869398117065, + "learning_rate": 2.992194275802255e-05, + "loss": 0.8339, "step": 1725 }, { - "epoch": 0.6302720467409165, - "grad_norm": 1.4732447862625122, - "learning_rate": 3.799261739304487e-05, - "loss": 0.9081, + "epoch": 0.29944482997918115, + "grad_norm": 0.9886743426322937, + "learning_rate": 2.9939288811795316e-05, + "loss": 0.6875, "step": 1726 }, { - "epoch": 0.6306372101515428, - "grad_norm": 1.176621437072754, - "learning_rate": 3.798634892834444e-05, - "loss": 0.9575, + "epoch": 0.29961832061068705, + "grad_norm": 1.2610546350479126, + "learning_rate": 2.9956634865568085e-05, + "loss": 0.8069, "step": 1727 }, { - "epoch": 0.631002373562169, - "grad_norm": 1.0964040756225586, - "learning_rate": 3.798007121038923e-05, - "loss": 0.9562, + "epoch": 0.2997918112421929, + "grad_norm": 0.8037664294242859, + "learning_rate": 2.997398091934085e-05, + "loss": 0.9443, "step": 1728 }, { - "epoch": 0.6313675369727954, - "grad_norm": 1.4079523086547852, - "learning_rate": 3.797378424240888e-05, - "loss": 0.9896, + "epoch": 0.2999653018736988, + "grad_norm": 0.8852689862251282, + "learning_rate": 2.9991326973113618e-05, + "loss": 0.7546, "step": 1729 }, { - "epoch": 0.6317327003834216, - "grad_norm": 1.4832806587219238, - "learning_rate": 3.7967488027637776e-05, - "loss": 0.9512, + "epoch": 0.3001387925052047, + "grad_norm": 0.6861998438835144, + "learning_rate": 3.0008673026886387e-05, + "loss": 0.8975, "step": 1730 }, { - "epoch": 0.6320978637940479, - "grad_norm": 1.2028474807739258, - "learning_rate": 3.796118256931507e-05, - "loss": 0.9476, + "epoch": 0.3003122831367106, + "grad_norm": 0.7353999614715576, + "learning_rate": 3.0026019080659155e-05, + "loss": 0.8691, "step": 1731 }, { - "epoch": 0.6324630272046741, - "grad_norm": 1.3926475048065186, - "learning_rate": 3.7954867870684677e-05, - "loss": 0.9866, + "epoch": 0.3004857737682165, + "grad_norm": 0.9657226800918579, + "learning_rate": 3.004336513443192e-05, + "loss": 0.772, "step": 1732 }, { - "epoch": 0.6328281906153004, - "grad_norm": 1.3717031478881836, - "learning_rate": 3.794854393499525e-05, - "loss": 0.9315, + "epoch": 0.3006592643997224, + "grad_norm": 0.9317792654037476, + "learning_rate": 3.006071118820469e-05, + "loss": 0.822, "step": 1733 }, { - "epoch": 0.6331933540259266, - "grad_norm": 1.3110661506652832, - "learning_rate": 3.7942210765500197e-05, - "loss": 1.011, + "epoch": 0.3008327550312283, + "grad_norm": 0.8374948501586914, + "learning_rate": 3.0078057241977453e-05, + "loss": 0.7097, "step": 1734 }, { - "epoch": 0.6335585174365529, - "grad_norm": 1.0015228986740112, - "learning_rate": 3.7935868365457674e-05, - "loss": 0.9409, + "epoch": 0.3010062456627342, + "grad_norm": 1.2599648237228394, + "learning_rate": 3.0095403295750222e-05, + "loss": 0.8015, "step": 1735 }, { - "epoch": 0.6339236808471791, - "grad_norm": 1.2499682903289795, - "learning_rate": 3.7929516738130606e-05, - "loss": 0.9202, + "epoch": 0.3011797362942401, + "grad_norm": 1.122849464416504, + "learning_rate": 3.0112749349522987e-05, + "loss": 0.6926, "step": 1736 }, { - "epoch": 0.6342888442578054, - "grad_norm": 0.9458034038543701, - "learning_rate": 3.7923155886786636e-05, - "loss": 0.9443, + "epoch": 0.301353226925746, + "grad_norm": 1.5063469409942627, + "learning_rate": 3.0130095403295755e-05, + "loss": 0.7753, "step": 1737 }, { - "epoch": 0.6346540076684316, - "grad_norm": 1.1153990030288696, - "learning_rate": 3.791678581469818e-05, - "loss": 1.0131, + "epoch": 0.3015267175572519, + "grad_norm": 0.7204892039299011, + "learning_rate": 3.014744145706852e-05, + "loss": 0.8975, "step": 1738 }, { - "epoch": 0.6350191710790579, - "grad_norm": 1.0678365230560303, - "learning_rate": 3.7910406525142374e-05, - "loss": 0.9646, + "epoch": 0.3017002081887578, + "grad_norm": 0.6785767078399658, + "learning_rate": 3.0164787510841285e-05, + "loss": 0.9341, "step": 1739 }, { - "epoch": 0.6353843344896841, - "grad_norm": 0.9294977188110352, - "learning_rate": 3.790401802140111e-05, - "loss": 0.9331, + "epoch": 0.3018736988202637, + "grad_norm": 0.831244707107544, + "learning_rate": 3.0182133564614054e-05, + "loss": 0.864, "step": 1740 }, { - "epoch": 0.6357494979003104, - "grad_norm": 1.0095150470733643, - "learning_rate": 3.789762030676103e-05, - "loss": 0.9368, + "epoch": 0.3020471894517696, + "grad_norm": 0.9846959710121155, + "learning_rate": 3.019947961838682e-05, + "loss": 0.7002, "step": 1741 }, { - "epoch": 0.6361146613109366, - "grad_norm": 1.0587514638900757, - "learning_rate": 3.7891213384513476e-05, - "loss": 0.9802, + "epoch": 0.3022206800832755, + "grad_norm": 0.958961546421051, + "learning_rate": 3.0216825672159587e-05, + "loss": 0.8282, "step": 1742 }, { - "epoch": 0.636479824721563, - "grad_norm": 1.0915364027023315, - "learning_rate": 3.7884797257954565e-05, - "loss": 0.9192, + "epoch": 0.3023941707147814, + "grad_norm": 0.893563985824585, + "learning_rate": 3.0234171725932352e-05, + "loss": 0.8713, "step": 1743 }, { - "epoch": 0.6368449881321891, - "grad_norm": 1.0475094318389893, - "learning_rate": 3.7878371930385144e-05, - "loss": 0.9584, + "epoch": 0.3025676613462873, + "grad_norm": 1.6310943365097046, + "learning_rate": 3.025151777970512e-05, + "loss": 0.7661, "step": 1744 }, { - "epoch": 0.6372101515428155, - "grad_norm": 1.1574326753616333, - "learning_rate": 3.787193740511077e-05, - "loss": 0.97, + "epoch": 0.3027411519777932, + "grad_norm": 1.0230166912078857, + "learning_rate": 3.0268863833477885e-05, + "loss": 0.8567, "step": 1745 }, { - "epoch": 0.6375753149534417, - "grad_norm": 1.375541090965271, - "learning_rate": 3.786549368544177e-05, - "loss": 0.9669, + "epoch": 0.3029146426092991, + "grad_norm": 0.8506000638008118, + "learning_rate": 3.0286209887250654e-05, + "loss": 0.8689, "step": 1746 }, { - "epoch": 0.637940478364068, - "grad_norm": 0.9644486904144287, - "learning_rate": 3.7859040774693156e-05, - "loss": 0.9427, + "epoch": 0.30308813324080497, + "grad_norm": 0.9473209977149963, + "learning_rate": 3.030355594102342e-05, + "loss": 0.7742, "step": 1747 }, { - "epoch": 0.6383056417746942, - "grad_norm": 1.1332459449768066, - "learning_rate": 3.7852578676184705e-05, - "loss": 0.959, + "epoch": 0.30326162387231087, + "grad_norm": 1.5788992643356323, + "learning_rate": 3.0320901994796187e-05, + "loss": 1.0017, "step": 1748 }, { - "epoch": 0.6386708051853204, - "grad_norm": 0.9746343493461609, - "learning_rate": 3.784610739324091e-05, - "loss": 0.9735, + "epoch": 0.30343511450381677, + "grad_norm": 0.8944735527038574, + "learning_rate": 3.0338248048568952e-05, + "loss": 0.9309, "step": 1749 }, { - "epoch": 0.6390359685959467, - "grad_norm": 1.233921766281128, - "learning_rate": 3.7839626929190976e-05, - "loss": 0.9575, + "epoch": 0.3036086051353227, + "grad_norm": 1.0035227537155151, + "learning_rate": 3.035559410234172e-05, + "loss": 0.7927, "step": 1750 }, { - "epoch": 0.6394011320065729, - "grad_norm": 1.209659218788147, - "learning_rate": 3.783313728736884e-05, - "loss": 0.9507, + "epoch": 0.3037820957668286, + "grad_norm": 0.9242807030677795, + "learning_rate": 3.0372940156114486e-05, + "loss": 0.8616, "step": 1751 }, { - "epoch": 0.6397662954171992, - "grad_norm": 1.0326811075210571, - "learning_rate": 3.782663847111318e-05, - "loss": 0.9467, + "epoch": 0.3039555863983345, + "grad_norm": 1.1387596130371094, + "learning_rate": 3.039028620988725e-05, + "loss": 0.775, "step": 1752 }, { - "epoch": 0.6401314588278254, - "grad_norm": 1.1225247383117676, - "learning_rate": 3.782013048376736e-05, - "loss": 0.9535, + "epoch": 0.3041290770298404, + "grad_norm": 1.1345518827438354, + "learning_rate": 3.040763226366002e-05, + "loss": 0.8506, "step": 1753 }, { - "epoch": 0.6404966222384517, - "grad_norm": 1.1711870431900024, - "learning_rate": 3.781361332867948e-05, - "loss": 0.9823, + "epoch": 0.3043025676613463, + "grad_norm": 0.8300006985664368, + "learning_rate": 3.0424978317432784e-05, + "loss": 0.7429, "step": 1754 }, { - "epoch": 0.6408617856490779, - "grad_norm": 1.961878776550293, - "learning_rate": 3.7807087009202366e-05, - "loss": 0.9906, + "epoch": 0.3044760582928522, + "grad_norm": 0.9101284742355347, + "learning_rate": 3.0442324371205552e-05, + "loss": 0.7131, "step": 1755 }, { - "epoch": 0.6412269490597042, - "grad_norm": 0.8660821318626404, - "learning_rate": 3.780055152869354e-05, - "loss": 0.9329, + "epoch": 0.3046495489243581, + "grad_norm": 0.9008650779724121, + "learning_rate": 3.0459670424978317e-05, + "loss": 0.8147, "step": 1756 }, { - "epoch": 0.6415921124703304, - "grad_norm": 1.4752761125564575, - "learning_rate": 3.7794006890515235e-05, - "loss": 0.9551, + "epoch": 0.304823039555864, + "grad_norm": 0.9326288104057312, + "learning_rate": 3.0477016478751086e-05, + "loss": 0.8544, "step": 1757 }, { - "epoch": 0.6419572758809567, - "grad_norm": 1.0036879777908325, - "learning_rate": 3.778745309803442e-05, - "loss": 0.9528, + "epoch": 0.3049965301873699, + "grad_norm": 1.1514352560043335, + "learning_rate": 3.049436253252385e-05, + "loss": 0.8203, "step": 1758 }, { - "epoch": 0.642322439291583, - "grad_norm": 1.0074748992919922, - "learning_rate": 3.778089015462275e-05, - "loss": 0.9512, + "epoch": 0.3051700208188758, + "grad_norm": 0.8488438725471497, + "learning_rate": 3.051170858629662e-05, + "loss": 0.9519, "step": 1759 }, { - "epoch": 0.6426876027022093, - "grad_norm": 0.8019533753395081, - "learning_rate": 3.77743180636566e-05, - "loss": 0.9447, + "epoch": 0.3053435114503817, + "grad_norm": 0.9555184245109558, + "learning_rate": 3.052905464006939e-05, + "loss": 0.7147, "step": 1760 }, { - "epoch": 0.6430527661128355, - "grad_norm": 1.051918625831604, - "learning_rate": 3.776773682851705e-05, - "loss": 0.9401, + "epoch": 0.3055170020818876, + "grad_norm": 0.8575806617736816, + "learning_rate": 3.0546400693842156e-05, + "loss": 0.8833, "step": 1761 }, { - "epoch": 0.6434179295234618, - "grad_norm": 1.3353664875030518, - "learning_rate": 3.776114645258987e-05, - "loss": 0.9866, + "epoch": 0.3056904927133935, + "grad_norm": 0.7477969527244568, + "learning_rate": 3.056374674761492e-05, + "loss": 0.8845, "step": 1762 }, { - "epoch": 0.643783092934088, - "grad_norm": 0.9800504446029663, - "learning_rate": 3.775454693926554e-05, - "loss": 0.9572, + "epoch": 0.3058639833448994, + "grad_norm": 0.6945277452468872, + "learning_rate": 3.0581092801387686e-05, + "loss": 0.8677, "step": 1763 }, { - "epoch": 0.6441482563447143, - "grad_norm": 1.1214323043823242, - "learning_rate": 3.774793829193927e-05, - "loss": 0.9247, + "epoch": 0.3060374739764053, + "grad_norm": 0.7588815689086914, + "learning_rate": 3.059843885516046e-05, + "loss": 0.8853, "step": 1764 }, { - "epoch": 0.6445134197553405, - "grad_norm": 1.2562134265899658, - "learning_rate": 3.774132051401093e-05, - "loss": 0.948, + "epoch": 0.3062109646079112, + "grad_norm": 0.9022750854492188, + "learning_rate": 3.061578490893322e-05, + "loss": 0.653, "step": 1765 }, { - "epoch": 0.6448785831659668, - "grad_norm": 1.3930929899215698, - "learning_rate": 3.77346936088851e-05, - "loss": 0.9624, + "epoch": 0.3063844552394171, + "grad_norm": 0.9152644276618958, + "learning_rate": 3.063313096270599e-05, + "loss": 0.8773, "step": 1766 }, { - "epoch": 0.645243746576593, - "grad_norm": 0.9946709871292114, - "learning_rate": 3.772805757997105e-05, - "loss": 0.8903, + "epoch": 0.30655794587092294, + "grad_norm": 0.8464672565460205, + "learning_rate": 3.065047701647875e-05, + "loss": 0.9307, "step": 1767 }, { - "epoch": 0.6456089099872193, - "grad_norm": 1.0349606275558472, - "learning_rate": 3.7721412430682766e-05, - "loss": 0.9774, + "epoch": 0.30673143650242884, + "grad_norm": 1.552269697189331, + "learning_rate": 3.0667823070251524e-05, + "loss": 0.8865, "step": 1768 }, { - "epoch": 0.6459740733978455, - "grad_norm": 1.1462488174438477, - "learning_rate": 3.7714758164438896e-05, - "loss": 0.996, + "epoch": 0.30690492713393475, + "grad_norm": 0.9044069051742554, + "learning_rate": 3.068516912402429e-05, + "loss": 0.9233, "step": 1769 }, { - "epoch": 0.6463392368084718, - "grad_norm": 1.4554134607315063, - "learning_rate": 3.7708094784662804e-05, - "loss": 0.9573, + "epoch": 0.30707841776544065, + "grad_norm": 1.121402621269226, + "learning_rate": 3.0702515177797054e-05, + "loss": 0.7561, "step": 1770 }, { - "epoch": 0.646704400219098, - "grad_norm": 1.2475395202636719, - "learning_rate": 3.7701422294782514e-05, - "loss": 0.9714, + "epoch": 0.30725190839694655, + "grad_norm": 0.9554601311683655, + "learning_rate": 3.071986123156982e-05, + "loss": 0.7241, "step": 1771 }, { - "epoch": 0.6470695636297243, - "grad_norm": 1.0382839441299438, - "learning_rate": 3.769474069823078e-05, - "loss": 0.9672, + "epoch": 0.30742539902845245, + "grad_norm": 0.8401093482971191, + "learning_rate": 3.0737207285342584e-05, + "loss": 0.8184, "step": 1772 }, { - "epoch": 0.6474347270403505, - "grad_norm": 1.3262828588485718, - "learning_rate": 3.7688049998445e-05, - "loss": 0.9917, + "epoch": 0.30759888965995835, + "grad_norm": 0.9703673124313354, + "learning_rate": 3.0754553339115356e-05, + "loss": 0.728, "step": 1773 }, { - "epoch": 0.6477998904509769, - "grad_norm": 1.116832971572876, - "learning_rate": 3.7681350198867274e-05, - "loss": 0.9537, + "epoch": 0.30777238029146425, + "grad_norm": 1.4850648641586304, + "learning_rate": 3.077189939288812e-05, + "loss": 0.7639, "step": 1774 }, { - "epoch": 0.6481650538616031, - "grad_norm": 0.9366448521614075, - "learning_rate": 3.767464130294438e-05, - "loss": 0.9766, + "epoch": 0.30794587092297016, + "grad_norm": 0.8237951993942261, + "learning_rate": 3.0789245446660886e-05, + "loss": 0.8201, "step": 1775 }, { - "epoch": 0.6485302172722294, - "grad_norm": 1.1169066429138184, - "learning_rate": 3.7667923314127774e-05, - "loss": 0.9559, + "epoch": 0.30811936155447606, + "grad_norm": 0.8153607249259949, + "learning_rate": 3.080659150043365e-05, + "loss": 0.9104, "step": 1776 }, { - "epoch": 0.6488953806828556, - "grad_norm": 1.1752116680145264, - "learning_rate": 3.766119623587359e-05, - "loss": 0.9414, + "epoch": 0.30829285218598196, + "grad_norm": 1.0392217636108398, + "learning_rate": 3.082393755420642e-05, + "loss": 0.928, "step": 1777 }, { - "epoch": 0.6492605440934819, - "grad_norm": 1.2178727388381958, - "learning_rate": 3.765446007164264e-05, - "loss": 0.8997, + "epoch": 0.30846634281748786, + "grad_norm": 1.0524784326553345, + "learning_rate": 3.084128360797919e-05, + "loss": 0.9121, "step": 1778 }, { - "epoch": 0.6496257075041081, - "grad_norm": 1.0676909685134888, - "learning_rate": 3.764771482490042e-05, - "loss": 0.9414, + "epoch": 0.30863983344899376, + "grad_norm": 1.1679737567901611, + "learning_rate": 3.085862966175195e-05, + "loss": 0.8689, "step": 1779 }, { - "epoch": 0.6499908709147344, - "grad_norm": 1.2527064085006714, - "learning_rate": 3.7640960499117076e-05, - "loss": 0.9509, + "epoch": 0.30881332408049966, + "grad_norm": 0.8028808832168579, + "learning_rate": 3.087597571552472e-05, + "loss": 0.856, "step": 1780 }, { - "epoch": 0.6503560343253606, - "grad_norm": 0.8683475255966187, - "learning_rate": 3.763419709776744e-05, - "loss": 0.952, + "epoch": 0.30898681471200556, + "grad_norm": 0.9776463508605957, + "learning_rate": 3.089332176929748e-05, + "loss": 0.8224, "step": 1781 }, { - "epoch": 0.6507211977359868, - "grad_norm": 0.7974107265472412, - "learning_rate": 3.762742462433102e-05, - "loss": 0.9348, + "epoch": 0.30916030534351147, + "grad_norm": 0.8996575474739075, + "learning_rate": 3.0910667823070255e-05, + "loss": 0.9333, "step": 1782 }, { - "epoch": 0.6510863611466131, - "grad_norm": 0.9000236988067627, - "learning_rate": 3.7620643082291976e-05, - "loss": 0.9028, + "epoch": 0.30933379597501737, + "grad_norm": 1.5661325454711914, + "learning_rate": 3.092801387684302e-05, + "loss": 0.7378, "step": 1783 }, { - "epoch": 0.6514515245572393, - "grad_norm": 1.050227403640747, - "learning_rate": 3.761385247513913e-05, - "loss": 1.0192, + "epoch": 0.30950728660652327, + "grad_norm": 0.9223610758781433, + "learning_rate": 3.0945359930615785e-05, + "loss": 0.801, "step": 1784 }, { - "epoch": 0.6518166879678656, - "grad_norm": 1.1808457374572754, - "learning_rate": 3.760705280636599e-05, - "loss": 0.9374, + "epoch": 0.30968077723802917, + "grad_norm": 0.9057555794715881, + "learning_rate": 3.096270598438855e-05, + "loss": 0.8049, "step": 1785 }, { - "epoch": 0.6521818513784918, - "grad_norm": 1.1207365989685059, - "learning_rate": 3.760024407947072e-05, - "loss": 0.9308, + "epoch": 0.30985426786953507, + "grad_norm": 0.993172824382782, + "learning_rate": 3.098005203816132e-05, + "loss": 0.7166, "step": 1786 }, { - "epoch": 0.6525470147891181, - "grad_norm": 1.4128456115722656, - "learning_rate": 3.759342629795611e-05, - "loss": 1.0103, + "epoch": 0.3100277585010409, + "grad_norm": 1.1409454345703125, + "learning_rate": 3.0997398091934087e-05, + "loss": 0.825, "step": 1787 }, { - "epoch": 0.6529121781997443, - "grad_norm": 1.101938009262085, - "learning_rate": 3.758659946532965e-05, - "loss": 0.951, + "epoch": 0.3102012491325468, + "grad_norm": 0.7769619822502136, + "learning_rate": 3.101474414570685e-05, + "loss": 0.8313, "step": 1788 }, { - "epoch": 0.6532773416103707, - "grad_norm": 0.9535780549049377, - "learning_rate": 3.757976358510348e-05, - "loss": 0.9744, + "epoch": 0.3103747397640527, + "grad_norm": 1.0350453853607178, + "learning_rate": 3.103209019947962e-05, + "loss": 0.8271, "step": 1789 }, { - "epoch": 0.6536425050209969, - "grad_norm": 1.1083014011383057, - "learning_rate": 3.757291866079437e-05, - "loss": 0.8992, + "epoch": 0.3105482303955586, + "grad_norm": 0.7878040075302124, + "learning_rate": 3.104943625325239e-05, + "loss": 0.8865, "step": 1790 }, { - "epoch": 0.6540076684316232, - "grad_norm": 1.1415414810180664, - "learning_rate": 3.756606469592377e-05, - "loss": 0.9363, + "epoch": 0.3107217210270645, + "grad_norm": 0.9085053205490112, + "learning_rate": 3.106678230702515e-05, + "loss": 0.9614, "step": 1791 }, { - "epoch": 0.6543728318422494, - "grad_norm": 0.9151849746704102, - "learning_rate": 3.755920169401777e-05, - "loss": 0.9547, + "epoch": 0.3108952116585704, + "grad_norm": 0.863218367099762, + "learning_rate": 3.1084128360797925e-05, + "loss": 0.9053, "step": 1792 }, { - "epoch": 0.6547379952528757, - "grad_norm": 1.3087586164474487, - "learning_rate": 3.7552329658607096e-05, - "loss": 0.9971, + "epoch": 0.3110687022900763, + "grad_norm": 0.9422206878662109, + "learning_rate": 3.110147441457069e-05, + "loss": 0.7607, "step": 1793 }, { - "epoch": 0.6551031586635019, - "grad_norm": 1.2354145050048828, - "learning_rate": 3.754544859322715e-05, - "loss": 0.9731, + "epoch": 0.31124219292158223, + "grad_norm": 1.0422240495681763, + "learning_rate": 3.1118820468343455e-05, + "loss": 0.7517, "step": 1794 }, { - "epoch": 0.6554683220741282, - "grad_norm": 1.0923519134521484, - "learning_rate": 3.753855850141795e-05, - "loss": 0.9542, + "epoch": 0.31141568355308813, + "grad_norm": 0.9393618702888489, + "learning_rate": 3.113616652211622e-05, + "loss": 0.834, "step": 1795 }, { - "epoch": 0.6558334854847544, - "grad_norm": 0.9212494492530823, - "learning_rate": 3.7531659386724195e-05, - "loss": 0.9874, + "epoch": 0.31158917418459403, + "grad_norm": 0.871377170085907, + "learning_rate": 3.115351257588899e-05, + "loss": 0.8401, "step": 1796 }, { - "epoch": 0.6561986488953807, - "grad_norm": 1.0260984897613525, - "learning_rate": 3.752475125269517e-05, - "loss": 0.9795, + "epoch": 0.31176266481609993, + "grad_norm": 0.9729689359664917, + "learning_rate": 3.117085862966176e-05, + "loss": 0.9141, "step": 1797 }, { - "epoch": 0.6565638123060069, - "grad_norm": 1.4144386053085327, - "learning_rate": 3.7517834102884865e-05, - "loss": 1.0066, + "epoch": 0.31193615544760583, + "grad_norm": 1.0335659980773926, + "learning_rate": 3.118820468343452e-05, + "loss": 0.7097, "step": 1798 }, { - "epoch": 0.6569289757166332, - "grad_norm": 1.2987159490585327, - "learning_rate": 3.751090794085185e-05, - "loss": 0.9044, + "epoch": 0.31210964607911174, + "grad_norm": 1.03711998462677, + "learning_rate": 3.120555073720729e-05, + "loss": 0.7471, "step": 1799 }, { - "epoch": 0.6572941391272594, - "grad_norm": 1.4466197490692139, - "learning_rate": 3.750397277015937e-05, - "loss": 0.9165, + "epoch": 0.31228313671061764, + "grad_norm": 1.435189127922058, + "learning_rate": 3.122289679098005e-05, + "loss": 0.8141, "step": 1800 }, { - "epoch": 0.6576593025378857, - "grad_norm": 1.2733914852142334, - "learning_rate": 3.74970285943753e-05, - "loss": 0.9486, + "epoch": 0.31245662734212354, + "grad_norm": 0.8106119632720947, + "learning_rate": 3.1240242844752824e-05, + "loss": 0.8328, "step": 1801 }, { - "epoch": 0.6580244659485119, - "grad_norm": 1.10292649269104, - "learning_rate": 3.749007541707212e-05, - "loss": 0.9542, + "epoch": 0.31263011797362944, + "grad_norm": 0.9504688382148743, + "learning_rate": 3.125758889852559e-05, + "loss": 0.7419, "step": 1802 }, { - "epoch": 0.6583896293591383, - "grad_norm": 1.247214436531067, - "learning_rate": 3.7483113241826974e-05, - "loss": 0.9758, + "epoch": 0.31280360860513534, + "grad_norm": 0.9424661993980408, + "learning_rate": 3.1274934952298354e-05, + "loss": 0.6982, "step": 1803 }, { - "epoch": 0.6587547927697645, - "grad_norm": 1.0165550708770752, - "learning_rate": 3.747614207222162e-05, - "loss": 0.9277, + "epoch": 0.31297709923664124, + "grad_norm": 1.191731333732605, + "learning_rate": 3.129228100607112e-05, + "loss": 0.875, "step": 1804 }, { - "epoch": 0.6591199561803908, - "grad_norm": 1.0388840436935425, - "learning_rate": 3.7469161911842444e-05, - "loss": 0.9443, + "epoch": 0.31315058986814714, + "grad_norm": 1.3449954986572266, + "learning_rate": 3.130962705984389e-05, + "loss": 0.7571, "step": 1805 }, { - "epoch": 0.659485119591017, - "grad_norm": 1.348332166671753, - "learning_rate": 3.7462172764280456e-05, - "loss": 0.9316, + "epoch": 0.31332408049965305, + "grad_norm": 0.7874204516410828, + "learning_rate": 3.1326973113616656e-05, + "loss": 0.7637, "step": 1806 }, { - "epoch": 0.6598502830016433, - "grad_norm": 1.5531115531921387, - "learning_rate": 3.745517463313129e-05, - "loss": 0.9692, + "epoch": 0.3134975711311589, + "grad_norm": 0.9826467633247375, + "learning_rate": 3.134431916738942e-05, + "loss": 0.7395, "step": 1807 }, { - "epoch": 0.6602154464122695, - "grad_norm": 1.3495476245880127, - "learning_rate": 3.7448167521995216e-05, - "loss": 0.988, + "epoch": 0.3136710617626648, + "grad_norm": 1.1938284635543823, + "learning_rate": 3.1361665221162186e-05, + "loss": 0.7932, "step": 1808 }, { - "epoch": 0.6605806098228958, - "grad_norm": 1.017197608947754, - "learning_rate": 3.7441151434477096e-05, - "loss": 0.9344, + "epoch": 0.3138445523941707, + "grad_norm": 1.1258822679519653, + "learning_rate": 3.137901127493496e-05, + "loss": 0.7395, "step": 1809 }, { - "epoch": 0.660945773233522, - "grad_norm": 0.8345208764076233, - "learning_rate": 3.743412637418644e-05, - "loss": 0.9536, + "epoch": 0.3140180430256766, + "grad_norm": 0.9145428538322449, + "learning_rate": 3.139635732870772e-05, + "loss": 0.7703, "step": 1810 }, { - "epoch": 0.6613109366441483, - "grad_norm": 1.0259665250778198, - "learning_rate": 3.742709234473735e-05, - "loss": 0.948, + "epoch": 0.3141915336571825, + "grad_norm": 0.7178246378898621, + "learning_rate": 3.141370338248049e-05, + "loss": 0.8707, "step": 1811 }, { - "epoch": 0.6616761000547745, - "grad_norm": 1.1891881227493286, - "learning_rate": 3.7420049349748555e-05, - "loss": 0.9075, + "epoch": 0.3143650242886884, + "grad_norm": 0.715694010257721, + "learning_rate": 3.143104943625325e-05, + "loss": 0.9128, "step": 1812 }, { - "epoch": 0.6620412634654008, - "grad_norm": 0.9035482406616211, - "learning_rate": 3.7412997392843385e-05, - "loss": 0.932, + "epoch": 0.3145385149201943, + "grad_norm": 1.1028105020523071, + "learning_rate": 3.144839549002602e-05, + "loss": 0.7231, "step": 1813 }, { - "epoch": 0.662406426876027, - "grad_norm": 1.0295895338058472, - "learning_rate": 3.7405936477649806e-05, - "loss": 0.9735, + "epoch": 0.3147120055517002, + "grad_norm": 1.0331463813781738, + "learning_rate": 3.146574154379879e-05, + "loss": 0.7246, "step": 1814 }, { - "epoch": 0.6627715902866532, - "grad_norm": 0.9567296504974365, - "learning_rate": 3.739886660780037e-05, - "loss": 0.9583, + "epoch": 0.3148854961832061, + "grad_norm": 0.9650563597679138, + "learning_rate": 3.1483087597571554e-05, + "loss": 0.7122, "step": 1815 }, { - "epoch": 0.6631367536972795, - "grad_norm": 1.1081693172454834, - "learning_rate": 3.739178778693222e-05, - "loss": 0.9769, + "epoch": 0.315058986814712, + "grad_norm": 0.7394370436668396, + "learning_rate": 3.150043365134432e-05, + "loss": 0.8506, "step": 1816 }, { - "epoch": 0.6635019171079057, - "grad_norm": 1.0146452188491821, - "learning_rate": 3.7384700018687154e-05, - "loss": 0.932, + "epoch": 0.3152324774462179, + "grad_norm": 1.8304815292358398, + "learning_rate": 3.1517779705117084e-05, + "loss": 0.8204, "step": 1817 }, { - "epoch": 0.663867080518532, - "grad_norm": 1.050704836845398, - "learning_rate": 3.737760330671153e-05, - "loss": 0.9492, + "epoch": 0.3154059680777238, + "grad_norm": 1.2419334650039673, + "learning_rate": 3.1535125758889856e-05, + "loss": 0.9172, "step": 1818 }, { - "epoch": 0.6642322439291583, - "grad_norm": 1.1224082708358765, - "learning_rate": 3.737049765465633e-05, - "loss": 0.9375, + "epoch": 0.3155794587092297, + "grad_norm": 1.0156910419464111, + "learning_rate": 3.155247181266262e-05, + "loss": 0.7424, "step": 1819 }, { - "epoch": 0.6645974073397846, - "grad_norm": 1.2410085201263428, - "learning_rate": 3.736338306617712e-05, - "loss": 0.9523, + "epoch": 0.3157529493407356, + "grad_norm": 0.8228117227554321, + "learning_rate": 3.156981786643539e-05, + "loss": 0.8418, "step": 1820 }, { - "epoch": 0.6649625707504108, - "grad_norm": 1.097634196281433, - "learning_rate": 3.735625954493406e-05, - "loss": 0.9771, + "epoch": 0.3159264399722415, + "grad_norm": 1.0255918502807617, + "learning_rate": 3.158716392020816e-05, + "loss": 0.7559, "step": 1821 }, { - "epoch": 0.6653277341610371, - "grad_norm": 1.1790845394134521, - "learning_rate": 3.734912709459194e-05, - "loss": 0.9465, + "epoch": 0.3160999306037474, + "grad_norm": 0.8285799622535706, + "learning_rate": 3.160450997398092e-05, + "loss": 0.7693, "step": 1822 }, { - "epoch": 0.6656928975716633, - "grad_norm": 1.4344674348831177, - "learning_rate": 3.7341985718820106e-05, - "loss": 0.9404, + "epoch": 0.3162734212352533, + "grad_norm": 1.3323465585708618, + "learning_rate": 3.162185602775369e-05, + "loss": 0.8022, "step": 1823 }, { - "epoch": 0.6660580609822896, - "grad_norm": 0.9069438576698303, - "learning_rate": 3.733483542129251e-05, - "loss": 0.9429, + "epoch": 0.3164469118667592, + "grad_norm": 1.800735354423523, + "learning_rate": 3.163920208152646e-05, + "loss": 0.9412, "step": 1824 }, { - "epoch": 0.6664232243929158, - "grad_norm": 1.4149705171585083, - "learning_rate": 3.732767620568769e-05, - "loss": 0.9008, + "epoch": 0.3166204024982651, + "grad_norm": 0.9644576907157898, + "learning_rate": 3.1656548135299224e-05, + "loss": 0.8857, "step": 1825 }, { - "epoch": 0.6667883878035421, - "grad_norm": 0.8944642543792725, - "learning_rate": 3.732050807568878e-05, - "loss": 0.9069, + "epoch": 0.31679389312977096, + "grad_norm": 0.891331672668457, + "learning_rate": 3.167389418907199e-05, + "loss": 0.8635, "step": 1826 }, { - "epoch": 0.6671535512141683, - "grad_norm": 1.440977931022644, - "learning_rate": 3.731333103498349e-05, - "loss": 1.0046, + "epoch": 0.31696738376127687, + "grad_norm": 1.6046355962753296, + "learning_rate": 3.1691240242844754e-05, + "loss": 0.8984, "step": 1827 }, { - "epoch": 0.6675187146247946, - "grad_norm": 1.2334364652633667, - "learning_rate": 3.730614508726413e-05, - "loss": 0.929, + "epoch": 0.31714087439278277, + "grad_norm": 0.9138913154602051, + "learning_rate": 3.1708586296617526e-05, + "loss": 1.0146, "step": 1828 }, { - "epoch": 0.6678838780354208, - "grad_norm": 1.0228042602539062, - "learning_rate": 3.729895023622756e-05, - "loss": 0.9266, + "epoch": 0.31731436502428867, + "grad_norm": 0.7231571674346924, + "learning_rate": 3.172593235039029e-05, + "loss": 0.9812, "step": 1829 }, { - "epoch": 0.6682490414460471, - "grad_norm": 0.8590624332427979, - "learning_rate": 3.729174648557528e-05, - "loss": 0.8921, + "epoch": 0.31748785565579457, + "grad_norm": 0.8388176560401917, + "learning_rate": 3.1743278404163056e-05, + "loss": 0.9011, "step": 1830 }, { - "epoch": 0.6686142048566733, - "grad_norm": 1.1811816692352295, - "learning_rate": 3.728453383901329e-05, - "loss": 0.9664, + "epoch": 0.3176613462873005, + "grad_norm": 0.9080206155776978, + "learning_rate": 3.176062445793582e-05, + "loss": 0.8555, "step": 1831 }, { - "epoch": 0.6689793682672996, - "grad_norm": 1.0860999822616577, - "learning_rate": 3.727731230025224e-05, - "loss": 0.9559, + "epoch": 0.3178348369188064, + "grad_norm": 0.8524741530418396, + "learning_rate": 3.1777970511708586e-05, + "loss": 0.7612, "step": 1832 }, { - "epoch": 0.6693445316779258, - "grad_norm": 1.2224920988082886, - "learning_rate": 3.727008187300729e-05, - "loss": 0.9304, + "epoch": 0.3180083275503123, + "grad_norm": 0.7899658679962158, + "learning_rate": 3.179531656548136e-05, + "loss": 0.8623, "step": 1833 }, { - "epoch": 0.6697096950885522, - "grad_norm": 1.1019855737686157, - "learning_rate": 3.726284256099823e-05, - "loss": 0.9372, + "epoch": 0.3181818181818182, + "grad_norm": 0.9792136549949646, + "learning_rate": 3.181266261925412e-05, + "loss": 0.7856, "step": 1834 }, { - "epoch": 0.6700748584991784, - "grad_norm": 0.8839512467384338, - "learning_rate": 3.725559436794939e-05, - "loss": 0.9304, + "epoch": 0.3183553088133241, + "grad_norm": 1.1496461629867554, + "learning_rate": 3.183000867302689e-05, + "loss": 0.7939, "step": 1835 }, { - "epoch": 0.6704400219098047, - "grad_norm": 0.8367845416069031, - "learning_rate": 3.7248337297589666e-05, - "loss": 0.9608, + "epoch": 0.31852879944483, + "grad_norm": 1.4393705129623413, + "learning_rate": 3.184735472679965e-05, + "loss": 0.7395, "step": 1836 }, { - "epoch": 0.6708051853204309, - "grad_norm": 0.9474363923072815, - "learning_rate": 3.724107135365254e-05, - "loss": 0.9673, + "epoch": 0.3187022900763359, + "grad_norm": 0.8804884552955627, + "learning_rate": 3.1864700780572425e-05, + "loss": 0.6907, "step": 1837 }, { - "epoch": 0.6711703487310572, - "grad_norm": 1.3062212467193604, - "learning_rate": 3.723379653987604e-05, - "loss": 0.9698, + "epoch": 0.3188757807078418, + "grad_norm": 1.7861661911010742, + "learning_rate": 3.188204683434519e-05, + "loss": 0.8831, "step": 1838 }, { - "epoch": 0.6715355121416834, - "grad_norm": 1.1866589784622192, - "learning_rate": 3.722651286000277e-05, - "loss": 0.9109, + "epoch": 0.3190492713393477, + "grad_norm": 1.5346360206604004, + "learning_rate": 3.1899392888117955e-05, + "loss": 0.718, "step": 1839 }, { - "epoch": 0.6719006755523097, - "grad_norm": 1.1608940362930298, - "learning_rate": 3.7219220317779886e-05, - "loss": 0.9521, + "epoch": 0.3192227619708536, + "grad_norm": 1.5538510084152222, + "learning_rate": 3.191673894189072e-05, + "loss": 0.6907, "step": 1840 }, { - "epoch": 0.6722658389629359, - "grad_norm": 0.9668271541595459, - "learning_rate": 3.721191891695912e-05, - "loss": 0.8928, + "epoch": 0.3193962526023595, + "grad_norm": 0.8598672747612, + "learning_rate": 3.1934084995663485e-05, + "loss": 0.79, "step": 1841 }, { - "epoch": 0.6726310023735622, - "grad_norm": 1.2307372093200684, - "learning_rate": 3.720460866129674e-05, - "loss": 0.9246, + "epoch": 0.3195697432338654, + "grad_norm": 1.4663066864013672, + "learning_rate": 3.1951431049436257e-05, + "loss": 0.8599, "step": 1842 }, { - "epoch": 0.6729961657841884, - "grad_norm": 0.8643574118614197, - "learning_rate": 3.719728955455359e-05, - "loss": 0.954, + "epoch": 0.3197432338653713, + "grad_norm": 1.1334919929504395, + "learning_rate": 3.196877710320902e-05, + "loss": 0.8909, "step": 1843 }, { - "epoch": 0.6733613291948147, - "grad_norm": 1.0964161157608032, - "learning_rate": 3.718996160049504e-05, - "loss": 0.9794, + "epoch": 0.3199167244968772, + "grad_norm": 0.8314929008483887, + "learning_rate": 3.1986123156981787e-05, + "loss": 0.8789, "step": 1844 }, { - "epoch": 0.6737264926054409, - "grad_norm": 1.1825811862945557, - "learning_rate": 3.718262480289103e-05, - "loss": 0.9496, + "epoch": 0.3200902151283831, + "grad_norm": 0.7797781825065613, + "learning_rate": 3.200346921075455e-05, + "loss": 0.9038, "step": 1845 }, { - "epoch": 0.6740916560160672, - "grad_norm": 0.9016256928443909, - "learning_rate": 3.7175279165516064e-05, - "loss": 0.9275, + "epoch": 0.32026370575988894, + "grad_norm": 0.7539864182472229, + "learning_rate": 3.202081526452732e-05, + "loss": 0.8047, "step": 1846 }, { - "epoch": 0.6744568194266934, - "grad_norm": 1.524819254875183, - "learning_rate": 3.7167924692149164e-05, - "loss": 1.0007, + "epoch": 0.32043719639139484, + "grad_norm": 0.7307820916175842, + "learning_rate": 3.203816131830009e-05, + "loss": 0.9524, "step": 1847 }, { - "epoch": 0.6748219828373198, - "grad_norm": 1.2162039279937744, - "learning_rate": 3.7160561386573916e-05, - "loss": 0.9619, + "epoch": 0.32061068702290074, + "grad_norm": 1.3119773864746094, + "learning_rate": 3.205550737207286e-05, + "loss": 0.7992, "step": 1848 }, { - "epoch": 0.675187146247946, - "grad_norm": 1.1253219842910767, - "learning_rate": 3.7153189252578454e-05, - "loss": 0.9547, + "epoch": 0.32078417765440664, + "grad_norm": 0.7714871168136597, + "learning_rate": 3.2072853425845625e-05, + "loss": 0.937, "step": 1849 }, { - "epoch": 0.6755523096585722, - "grad_norm": 0.9265063405036926, - "learning_rate": 3.7145808293955427e-05, - "loss": 0.9376, + "epoch": 0.32095766828591255, + "grad_norm": 1.2853626012802124, + "learning_rate": 3.209019947961839e-05, + "loss": 0.73, "step": 1850 }, { - "epoch": 0.6759174730691985, - "grad_norm": 1.1999282836914062, - "learning_rate": 3.7138418514502055e-05, - "loss": 0.9597, + "epoch": 0.32113115891741845, + "grad_norm": 0.8714017271995544, + "learning_rate": 3.2107545533391155e-05, + "loss": 0.8271, "step": 1851 }, { - "epoch": 0.6762826364798247, - "grad_norm": 1.3449455499649048, - "learning_rate": 3.7131019918020074e-05, - "loss": 0.9547, + "epoch": 0.32130464954892435, + "grad_norm": 0.9162073731422424, + "learning_rate": 3.212489158716393e-05, + "loss": 0.811, "step": 1852 }, { - "epoch": 0.676647799890451, - "grad_norm": 1.2630411386489868, - "learning_rate": 3.712361250831578e-05, - "loss": 0.9246, + "epoch": 0.32147814018043025, + "grad_norm": 0.9974989891052246, + "learning_rate": 3.214223764093669e-05, + "loss": 0.7939, "step": 1853 }, { - "epoch": 0.6770129633010772, - "grad_norm": 1.2750487327575684, - "learning_rate": 3.711619628919997e-05, - "loss": 0.9091, + "epoch": 0.32165163081193615, + "grad_norm": 1.185793161392212, + "learning_rate": 3.215958369470946e-05, + "loss": 0.7463, "step": 1854 }, { - "epoch": 0.6773781267117035, - "grad_norm": 1.3865666389465332, - "learning_rate": 3.7108771264488e-05, - "loss": 0.9438, + "epoch": 0.32182512144344205, + "grad_norm": 0.8699380159378052, + "learning_rate": 3.217692974848222e-05, + "loss": 0.9548, "step": 1855 }, { - "epoch": 0.6777432901223297, - "grad_norm": 0.9382888078689575, - "learning_rate": 3.7101337437999746e-05, - "loss": 0.9083, + "epoch": 0.32199861207494795, + "grad_norm": 1.248387098312378, + "learning_rate": 3.2194275802254994e-05, + "loss": 0.8662, "step": 1856 }, { - "epoch": 0.678108453532956, - "grad_norm": 1.0028668642044067, - "learning_rate": 3.709389481355962e-05, - "loss": 0.9651, + "epoch": 0.32217210270645386, + "grad_norm": 1.5067499876022339, + "learning_rate": 3.221162185602776e-05, + "loss": 0.7505, "step": 1857 }, { - "epoch": 0.6784736169435822, - "grad_norm": 1.0609986782073975, - "learning_rate": 3.708644339499654e-05, - "loss": 0.9457, + "epoch": 0.32234559333795976, + "grad_norm": 1.5727344751358032, + "learning_rate": 3.2228967909800524e-05, + "loss": 0.8141, "step": 1858 }, { - "epoch": 0.6788387803542085, - "grad_norm": 1.0726428031921387, - "learning_rate": 3.7078983186143976e-05, - "loss": 0.9309, + "epoch": 0.32251908396946566, + "grad_norm": 1.4997962713241577, + "learning_rate": 3.224631396357329e-05, + "loss": 0.8857, "step": 1859 }, { - "epoch": 0.6792039437648347, - "grad_norm": 1.1071470975875854, - "learning_rate": 3.7071514190839895e-05, - "loss": 0.9639, + "epoch": 0.32269257460097156, + "grad_norm": 1.0850422382354736, + "learning_rate": 3.2263660017346054e-05, + "loss": 0.7742, "step": 1860 }, { - "epoch": 0.679569107175461, - "grad_norm": 1.0558290481567383, - "learning_rate": 3.706403641292681e-05, - "loss": 0.9294, + "epoch": 0.32286606523247746, + "grad_norm": 1.8524852991104126, + "learning_rate": 3.2281006071118825e-05, + "loss": 0.8955, "step": 1861 }, { - "epoch": 0.6799342705860872, - "grad_norm": 1.0755356550216675, - "learning_rate": 3.705654985625171e-05, - "loss": 0.9379, + "epoch": 0.32303955586398336, + "grad_norm": 1.193983793258667, + "learning_rate": 3.229835212489159e-05, + "loss": 0.845, "step": 1862 }, { - "epoch": 0.6802994339967136, - "grad_norm": 0.8738650679588318, - "learning_rate": 3.704905452466616e-05, - "loss": 0.9399, + "epoch": 0.32321304649548926, + "grad_norm": 0.9535468816757202, + "learning_rate": 3.2315698178664355e-05, + "loss": 0.811, "step": 1863 }, { - "epoch": 0.6806645974073398, - "grad_norm": 1.1751739978790283, - "learning_rate": 3.704155042202619e-05, - "loss": 0.9624, + "epoch": 0.32338653712699517, + "grad_norm": 1.0521818399429321, + "learning_rate": 3.233304423243712e-05, + "loss": 0.8362, "step": 1864 }, { - "epoch": 0.6810297608179661, - "grad_norm": 1.0844022035598755, - "learning_rate": 3.703403755219236e-05, - "loss": 0.9386, + "epoch": 0.32356002775850107, + "grad_norm": 1.8297070264816284, + "learning_rate": 3.235039028620989e-05, + "loss": 0.7722, "step": 1865 }, { - "epoch": 0.6813949242285923, - "grad_norm": 1.1327035427093506, - "learning_rate": 3.702651591902974e-05, - "loss": 0.949, + "epoch": 0.3237335183900069, + "grad_norm": 1.059195637702942, + "learning_rate": 3.236773633998266e-05, + "loss": 0.7007, "step": 1866 }, { - "epoch": 0.6817600876392186, - "grad_norm": 1.3151122331619263, - "learning_rate": 3.701898552640792e-05, - "loss": 0.9667, + "epoch": 0.3239070090215128, + "grad_norm": 0.9987848401069641, + "learning_rate": 3.238508239375542e-05, + "loss": 0.8989, "step": 1867 }, { - "epoch": 0.6821252510498448, - "grad_norm": 1.075447916984558, - "learning_rate": 3.7011446378200965e-05, - "loss": 0.9289, + "epoch": 0.3240804996530187, + "grad_norm": 1.1433897018432617, + "learning_rate": 3.240242844752819e-05, + "loss": 0.8813, "step": 1868 }, { - "epoch": 0.6824904144604711, - "grad_norm": 1.3037465810775757, - "learning_rate": 3.700389847828749e-05, - "loss": 0.9195, + "epoch": 0.3242539902845246, + "grad_norm": 0.9972354173660278, + "learning_rate": 3.241977450130096e-05, + "loss": 0.7175, "step": 1869 }, { - "epoch": 0.6828555778710973, - "grad_norm": 1.061158299446106, - "learning_rate": 3.699634183055056e-05, - "loss": 0.9305, + "epoch": 0.3244274809160305, + "grad_norm": 0.8698007464408875, + "learning_rate": 3.2437120555073724e-05, + "loss": 0.9304, "step": 1870 }, { - "epoch": 0.6832207412817236, - "grad_norm": 1.2008591890335083, - "learning_rate": 3.6988776438877784e-05, - "loss": 0.962, + "epoch": 0.3246009715475364, + "grad_norm": 2.2581543922424316, + "learning_rate": 3.245446660884649e-05, + "loss": 0.6727, "step": 1871 }, { - "epoch": 0.6835859046923498, - "grad_norm": 0.8645357489585876, - "learning_rate": 3.698120230716124e-05, - "loss": 0.9275, + "epoch": 0.3247744621790423, + "grad_norm": 1.1504334211349487, + "learning_rate": 3.2471812662619254e-05, + "loss": 0.8425, "step": 1872 }, { - "epoch": 0.6839510681029761, - "grad_norm": 1.6274298429489136, - "learning_rate": 3.697361943929753e-05, - "loss": 0.9762, + "epoch": 0.3249479528105482, + "grad_norm": 0.8594744205474854, + "learning_rate": 3.248915871639202e-05, + "loss": 0.9133, "step": 1873 }, { - "epoch": 0.6843162315136023, - "grad_norm": 1.3760597705841064, - "learning_rate": 3.696602783918773e-05, - "loss": 0.9181, + "epoch": 0.3251214434420541, + "grad_norm": 1.2735743522644043, + "learning_rate": 3.250650477016479e-05, + "loss": 0.6792, "step": 1874 }, { - "epoch": 0.6846813949242286, - "grad_norm": 1.7165113687515259, - "learning_rate": 3.69584275107374e-05, - "loss": 0.968, + "epoch": 0.32529493407356, + "grad_norm": 1.3539754152297974, + "learning_rate": 3.2523850823937556e-05, + "loss": 0.8733, "step": 1875 }, { - "epoch": 0.6850465583348548, - "grad_norm": 1.1045314073562622, - "learning_rate": 3.695081845785663e-05, - "loss": 0.9008, + "epoch": 0.32546842470506593, + "grad_norm": 0.7005854845046997, + "learning_rate": 3.254119687771032e-05, + "loss": 0.8381, "step": 1876 }, { - "epoch": 0.6854117217454812, - "grad_norm": 1.0448204278945923, - "learning_rate": 3.6943200684459944e-05, - "loss": 0.8976, + "epoch": 0.32564191533657183, + "grad_norm": 1.6914273500442505, + "learning_rate": 3.2558542931483086e-05, + "loss": 0.8193, "step": 1877 }, { - "epoch": 0.6857768851561074, - "grad_norm": 1.1573553085327148, - "learning_rate": 3.69355741944664e-05, - "loss": 0.912, + "epoch": 0.32581540596807773, + "grad_norm": 0.8933230042457581, + "learning_rate": 3.257588898525586e-05, + "loss": 0.8586, "step": 1878 }, { - "epoch": 0.6861420485667337, - "grad_norm": 0.9787659645080566, - "learning_rate": 3.692793899179951e-05, - "loss": 0.9762, + "epoch": 0.32598889659958363, + "grad_norm": 1.4548304080963135, + "learning_rate": 3.259323503902862e-05, + "loss": 0.6843, "step": 1879 }, { - "epoch": 0.6865072119773599, - "grad_norm": 1.0320667028427124, - "learning_rate": 3.6920295080387295e-05, - "loss": 0.9817, + "epoch": 0.32616238723108953, + "grad_norm": 0.8697642683982849, + "learning_rate": 3.2610581092801394e-05, + "loss": 0.7734, "step": 1880 }, { - "epoch": 0.6868723753879862, - "grad_norm": 1.0232908725738525, - "learning_rate": 3.691264246416222e-05, - "loss": 0.9236, + "epoch": 0.32633587786259544, + "grad_norm": 0.7502459287643433, + "learning_rate": 3.262792714657416e-05, + "loss": 0.8716, "step": 1881 }, { - "epoch": 0.6872375387986124, - "grad_norm": 0.9261885285377502, - "learning_rate": 3.6904981147061265e-05, - "loss": 0.9572, + "epoch": 0.32650936849410134, + "grad_norm": 1.2018510103225708, + "learning_rate": 3.2645273200346924e-05, + "loss": 0.7878, "step": 1882 }, { - "epoch": 0.6876027022092386, - "grad_norm": 0.959498941898346, - "learning_rate": 3.689731113302587e-05, - "loss": 0.9407, + "epoch": 0.32668285912560724, + "grad_norm": 0.9761494398117065, + "learning_rate": 3.266261925411969e-05, + "loss": 0.8337, "step": 1883 }, { - "epoch": 0.6879678656198649, - "grad_norm": 1.229457974433899, - "learning_rate": 3.688963242600193e-05, - "loss": 0.9518, + "epoch": 0.32685634975711314, + "grad_norm": 0.8519040942192078, + "learning_rate": 3.267996530789246e-05, + "loss": 0.9153, "step": 1884 }, { - "epoch": 0.6883330290304911, - "grad_norm": 1.507131576538086, - "learning_rate": 3.688194502993985e-05, - "loss": 0.9475, + "epoch": 0.32702984038861904, + "grad_norm": 0.9097318649291992, + "learning_rate": 3.2697311361665226e-05, + "loss": 0.7986, "step": 1885 }, { - "epoch": 0.6886981924411174, - "grad_norm": 1.143388271331787, - "learning_rate": 3.6874248948794494e-05, - "loss": 0.9299, + "epoch": 0.3272033310201249, + "grad_norm": 1.5067397356033325, + "learning_rate": 3.271465741543799e-05, + "loss": 0.7683, "step": 1886 }, { - "epoch": 0.6890633558517436, - "grad_norm": 0.8943531513214111, - "learning_rate": 3.6866544186525156e-05, - "loss": 0.9241, + "epoch": 0.3273768216516308, + "grad_norm": 1.0035514831542969, + "learning_rate": 3.2732003469210756e-05, + "loss": 0.7532, "step": 1887 }, { - "epoch": 0.6894285192623699, - "grad_norm": 1.531944751739502, - "learning_rate": 3.685883074709566e-05, - "loss": 0.9486, + "epoch": 0.3275503122831367, + "grad_norm": 1.813836693763733, + "learning_rate": 3.274934952298353e-05, + "loss": 0.7625, "step": 1888 }, { - "epoch": 0.6897936826729961, - "grad_norm": 1.336308479309082, - "learning_rate": 3.685110863447424e-05, - "loss": 0.9907, + "epoch": 0.3277238029146426, + "grad_norm": 0.869946300983429, + "learning_rate": 3.276669557675629e-05, + "loss": 0.8357, "step": 1889 }, { - "epoch": 0.6901588460836224, - "grad_norm": 1.1234655380249023, - "learning_rate": 3.684337785263363e-05, - "loss": 0.9692, + "epoch": 0.3278972935461485, + "grad_norm": 1.7698111534118652, + "learning_rate": 3.278404163052906e-05, + "loss": 0.7166, "step": 1890 }, { - "epoch": 0.6905240094942486, - "grad_norm": 1.2529271841049194, - "learning_rate": 3.6835638405550994e-05, - "loss": 0.9365, + "epoch": 0.3280707841776544, + "grad_norm": 1.0368653535842896, + "learning_rate": 3.280138768430182e-05, + "loss": 0.8108, "step": 1891 }, { - "epoch": 0.690889172904875, - "grad_norm": 1.178864598274231, - "learning_rate": 3.6827890297207964e-05, - "loss": 0.9167, + "epoch": 0.3282442748091603, + "grad_norm": 0.8692549467086792, + "learning_rate": 3.281873373807459e-05, + "loss": 0.9546, "step": 1892 }, { - "epoch": 0.6912543363155011, - "grad_norm": 1.1787397861480713, - "learning_rate": 3.682013353159065e-05, - "loss": 0.9402, + "epoch": 0.3284177654406662, + "grad_norm": 1.178022027015686, + "learning_rate": 3.283607979184736e-05, + "loss": 0.7477, "step": 1893 }, { - "epoch": 0.6916194997261275, - "grad_norm": 1.266817569732666, - "learning_rate": 3.681236811268957e-05, - "loss": 0.9635, + "epoch": 0.3285912560721721, + "grad_norm": 1.4519685506820679, + "learning_rate": 3.2853425845620125e-05, + "loss": 0.8882, "step": 1894 }, { - "epoch": 0.6919846631367537, - "grad_norm": 1.2918291091918945, - "learning_rate": 3.680459404449974e-05, - "loss": 0.9332, + "epoch": 0.328764746703678, + "grad_norm": 1.1099780797958374, + "learning_rate": 3.287077189939289e-05, + "loss": 0.7195, "step": 1895 }, { - "epoch": 0.69234982654738, - "grad_norm": 0.9812869429588318, - "learning_rate": 3.67968113310206e-05, - "loss": 0.9543, + "epoch": 0.3289382373351839, + "grad_norm": 0.9599144458770752, + "learning_rate": 3.2888117953165655e-05, + "loss": 0.8535, "step": 1896 }, { - "epoch": 0.6927149899580062, - "grad_norm": 1.04782235622406, - "learning_rate": 3.6789019976256045e-05, - "loss": 0.9502, + "epoch": 0.3291117279666898, + "grad_norm": 1.2066985368728638, + "learning_rate": 3.2905464006938426e-05, + "loss": 0.7568, "step": 1897 }, { - "epoch": 0.6930801533686325, - "grad_norm": 1.0867310762405396, - "learning_rate": 3.678121998421441e-05, - "loss": 0.969, + "epoch": 0.3292852185981957, + "grad_norm": 1.1503981351852417, + "learning_rate": 3.292281006071119e-05, + "loss": 0.7424, "step": 1898 }, { - "epoch": 0.6934453167792587, - "grad_norm": 1.0260337591171265, - "learning_rate": 3.6773411358908486e-05, - "loss": 0.921, + "epoch": 0.3294587092297016, + "grad_norm": 1.2057057619094849, + "learning_rate": 3.2940156114483956e-05, + "loss": 0.6772, "step": 1899 }, { - "epoch": 0.693810480189885, - "grad_norm": 1.5085606575012207, - "learning_rate": 3.676559410435549e-05, - "loss": 0.9645, + "epoch": 0.3296321998612075, + "grad_norm": 1.3639425039291382, + "learning_rate": 3.295750216825672e-05, + "loss": 0.6785, "step": 1900 }, { - "epoch": 0.6941756436005112, - "grad_norm": 1.2610340118408203, - "learning_rate": 3.6757768224577086e-05, - "loss": 0.938, + "epoch": 0.3298056904927134, + "grad_norm": 0.9733070135116577, + "learning_rate": 3.2974848222029487e-05, + "loss": 0.8257, "step": 1901 }, { - "epoch": 0.6945408070111375, - "grad_norm": 1.0793862342834473, - "learning_rate": 3.6749933723599385e-05, - "loss": 0.9336, + "epoch": 0.3299791811242193, + "grad_norm": 1.3842231035232544, + "learning_rate": 3.299219427580226e-05, + "loss": 0.708, "step": 1902 }, { - "epoch": 0.6949059704217637, - "grad_norm": 1.179988980293274, - "learning_rate": 3.674209060545291e-05, - "loss": 0.9882, + "epoch": 0.3301526717557252, + "grad_norm": 0.8262988328933716, + "learning_rate": 3.300954032957502e-05, + "loss": 0.8525, "step": 1903 }, { - "epoch": 0.69527113383239, - "grad_norm": 0.9569182991981506, - "learning_rate": 3.6734238874172644e-05, - "loss": 0.942, + "epoch": 0.3303261623872311, + "grad_norm": 0.940148115158081, + "learning_rate": 3.302688638334779e-05, + "loss": 0.8066, "step": 1904 }, { - "epoch": 0.6956362972430162, - "grad_norm": 0.8363674283027649, - "learning_rate": 3.6726378533797976e-05, - "loss": 0.8956, + "epoch": 0.33049965301873696, + "grad_norm": 1.1103507280349731, + "learning_rate": 3.304423243712055e-05, + "loss": 0.6981, "step": 1905 }, { - "epoch": 0.6960014606536425, - "grad_norm": 1.038861870765686, - "learning_rate": 3.6718509588372737e-05, - "loss": 0.8944, + "epoch": 0.33067314365024286, + "grad_norm": 1.4021570682525635, + "learning_rate": 3.3061578490893325e-05, + "loss": 0.8916, "step": 1906 }, { - "epoch": 0.6963666240642687, - "grad_norm": 1.007489800453186, - "learning_rate": 3.6710632041945195e-05, - "loss": 0.9097, + "epoch": 0.33084663428174876, + "grad_norm": 0.8823056817054749, + "learning_rate": 3.307892454466609e-05, + "loss": 1.0483, "step": 1907 }, { - "epoch": 0.6967317874748951, - "grad_norm": 0.836804211139679, - "learning_rate": 3.670274589856802e-05, - "loss": 0.9535, + "epoch": 0.33102012491325467, + "grad_norm": 1.2667510509490967, + "learning_rate": 3.309627059843886e-05, + "loss": 0.8457, "step": 1908 }, { - "epoch": 0.6970969508855213, - "grad_norm": 0.8125410676002502, - "learning_rate": 3.6694851162298315e-05, - "loss": 0.9529, + "epoch": 0.33119361554476057, + "grad_norm": 1.1111856698989868, + "learning_rate": 3.311361665221163e-05, + "loss": 0.762, "step": 1909 }, { - "epoch": 0.6974621142961476, - "grad_norm": 1.410980463027954, - "learning_rate": 3.668694783719762e-05, - "loss": 0.926, + "epoch": 0.33136710617626647, + "grad_norm": 1.0099698305130005, + "learning_rate": 3.313096270598439e-05, + "loss": 0.6569, "step": 1910 }, { - "epoch": 0.6978272777067738, - "grad_norm": 1.1134490966796875, - "learning_rate": 3.6679035927331855e-05, - "loss": 0.8898, + "epoch": 0.33154059680777237, + "grad_norm": 1.1645022630691528, + "learning_rate": 3.314830875975716e-05, + "loss": 0.7104, "step": 1911 }, { - "epoch": 0.6981924411174001, - "grad_norm": 0.925121009349823, - "learning_rate": 3.6671115436771404e-05, - "loss": 0.9399, + "epoch": 0.33171408743927827, + "grad_norm": 1.1959211826324463, + "learning_rate": 3.316565481352993e-05, + "loss": 0.8, "step": 1912 }, { - "epoch": 0.6985576045280263, - "grad_norm": 1.3035352230072021, - "learning_rate": 3.666318636959102e-05, - "loss": 0.9207, + "epoch": 0.3318875780707842, + "grad_norm": 0.9819892048835754, + "learning_rate": 3.3183000867302694e-05, + "loss": 0.8093, "step": 1913 }, { - "epoch": 0.6989227679386526, - "grad_norm": 1.2563941478729248, - "learning_rate": 3.665524872986991e-05, - "loss": 0.9395, + "epoch": 0.3320610687022901, + "grad_norm": 0.9523700475692749, + "learning_rate": 3.320034692107546e-05, + "loss": 0.8525, "step": 1914 }, { - "epoch": 0.6992879313492788, - "grad_norm": 1.216444492340088, - "learning_rate": 3.664730252169166e-05, - "loss": 0.9069, + "epoch": 0.332234559333796, + "grad_norm": 1.2322837114334106, + "learning_rate": 3.3217692974848224e-05, + "loss": 0.8137, "step": 1915 }, { - "epoch": 0.699653094759905, - "grad_norm": 1.2547234296798706, - "learning_rate": 3.663934774914428e-05, - "loss": 0.8896, + "epoch": 0.3324080499653019, + "grad_norm": 0.943067193031311, + "learning_rate": 3.3235039028620995e-05, + "loss": 0.8511, "step": 1916 }, { - "epoch": 0.7000182581705313, - "grad_norm": 1.2641855478286743, - "learning_rate": 3.6631384416320176e-05, - "loss": 0.9027, + "epoch": 0.3325815405968078, + "grad_norm": 0.9316331148147583, + "learning_rate": 3.325238508239376e-05, + "loss": 0.8206, "step": 1917 }, { - "epoch": 0.7003834215811575, - "grad_norm": 1.0289359092712402, - "learning_rate": 3.662341252731616e-05, - "loss": 0.9886, + "epoch": 0.3327550312283137, + "grad_norm": 1.077662706375122, + "learning_rate": 3.3269731136166525e-05, + "loss": 0.9993, "step": 1918 }, { - "epoch": 0.7007485849917838, - "grad_norm": 1.009670376777649, - "learning_rate": 3.6615432086233466e-05, - "loss": 0.936, + "epoch": 0.3329285218598196, + "grad_norm": 0.7238175272941589, + "learning_rate": 3.328707718993929e-05, + "loss": 0.9375, "step": 1919 }, { - "epoch": 0.70111374840241, - "grad_norm": 0.9077454805374146, - "learning_rate": 3.660744309717769e-05, - "loss": 0.949, + "epoch": 0.3331020124913255, + "grad_norm": 0.9927915334701538, + "learning_rate": 3.3304423243712055e-05, + "loss": 0.8481, "step": 1920 }, { - "epoch": 0.7014789118130363, - "grad_norm": 1.0789963006973267, - "learning_rate": 3.6599445564258855e-05, - "loss": 0.9357, + "epoch": 0.3332755031228314, + "grad_norm": 0.8813720345497131, + "learning_rate": 3.332176929748483e-05, + "loss": 0.8425, "step": 1921 }, { - "epoch": 0.7018440752236625, - "grad_norm": 1.15958571434021, - "learning_rate": 3.659143949159138e-05, - "loss": 0.9945, + "epoch": 0.3334489937543373, + "grad_norm": 1.9053723812103271, + "learning_rate": 3.333911535125759e-05, + "loss": 0.7654, "step": 1922 }, { - "epoch": 0.7022092386342889, - "grad_norm": 0.8594781160354614, - "learning_rate": 3.6583424883294053e-05, - "loss": 0.9435, + "epoch": 0.3336224843858432, + "grad_norm": 1.025921106338501, + "learning_rate": 3.335646140503036e-05, + "loss": 0.8567, "step": 1923 }, { - "epoch": 0.7025744020449151, - "grad_norm": 1.645010232925415, - "learning_rate": 3.657540174349007e-05, - "loss": 0.9338, + "epoch": 0.3337959750173491, + "grad_norm": 1.0865601301193237, + "learning_rate": 3.337380745880312e-05, + "loss": 0.7029, "step": 1924 }, { - "epoch": 0.7029395654555414, - "grad_norm": 0.8434624671936035, - "learning_rate": 3.656737007630703e-05, - "loss": 0.9651, + "epoch": 0.33396946564885494, + "grad_norm": 1.077301263809204, + "learning_rate": 3.3391153512575894e-05, + "loss": 0.7678, "step": 1925 }, { - "epoch": 0.7033047288661676, - "grad_norm": 1.149718165397644, - "learning_rate": 3.6559329885876896e-05, - "loss": 0.9353, + "epoch": 0.33414295628036084, + "grad_norm": 1.0540457963943481, + "learning_rate": 3.340849956634866e-05, + "loss": 0.926, "step": 1926 }, { - "epoch": 0.7036698922767939, - "grad_norm": 1.1305135488510132, - "learning_rate": 3.6551281176336015e-05, - "loss": 0.9526, + "epoch": 0.33431644691186674, + "grad_norm": 1.158047080039978, + "learning_rate": 3.3425845620121424e-05, + "loss": 0.7673, "step": 1927 }, { - "epoch": 0.7040350556874201, - "grad_norm": 0.9795079827308655, - "learning_rate": 3.654322395182512e-05, - "loss": 0.9188, + "epoch": 0.33448993754337264, + "grad_norm": 0.9718716740608215, + "learning_rate": 3.344319167389419e-05, + "loss": 0.7024, "step": 1928 }, { - "epoch": 0.7044002190980464, - "grad_norm": 1.0851081609725952, - "learning_rate": 3.653515821648936e-05, - "loss": 0.9232, + "epoch": 0.33466342817487854, + "grad_norm": 0.8918688893318176, + "learning_rate": 3.346053772766696e-05, + "loss": 0.8804, "step": 1929 }, { - "epoch": 0.7047653825086726, - "grad_norm": 2.0348126888275146, - "learning_rate": 3.6527083974478193e-05, - "loss": 0.9336, + "epoch": 0.33483691880638444, + "grad_norm": 0.9005969166755676, + "learning_rate": 3.3477883781439726e-05, + "loss": 0.8989, "step": 1930 }, { - "epoch": 0.7051305459192989, - "grad_norm": 1.1215265989303589, - "learning_rate": 3.651900122994552e-05, - "loss": 0.9377, + "epoch": 0.33501040943789034, + "grad_norm": 0.9463533163070679, + "learning_rate": 3.349522983521249e-05, + "loss": 0.8895, "step": 1931 }, { - "epoch": 0.7054957093299251, - "grad_norm": 1.219777226448059, - "learning_rate": 3.651090998704958e-05, - "loss": 0.9413, + "epoch": 0.33518390006939625, + "grad_norm": 1.1812310218811035, + "learning_rate": 3.3512575888985256e-05, + "loss": 0.849, "step": 1932 }, { - "epoch": 0.7058608727405514, - "grad_norm": 1.146498203277588, - "learning_rate": 3.650281024995299e-05, - "loss": 0.9707, + "epoch": 0.33535739070090215, + "grad_norm": 1.0115389823913574, + "learning_rate": 3.352992194275802e-05, + "loss": 0.751, "step": 1933 }, { - "epoch": 0.7062260361511776, - "grad_norm": 1.2766329050064087, - "learning_rate": 3.649470202282275e-05, - "loss": 0.9047, + "epoch": 0.33553088133240805, + "grad_norm": 0.7756454944610596, + "learning_rate": 3.354726799653079e-05, + "loss": 0.9092, "step": 1934 }, { - "epoch": 0.7065911995618039, - "grad_norm": 0.8604010939598083, - "learning_rate": 3.648658530983021e-05, - "loss": 0.9419, + "epoch": 0.33570437196391395, + "grad_norm": 0.9885166883468628, + "learning_rate": 3.356461405030356e-05, + "loss": 0.7866, "step": 1935 }, { - "epoch": 0.7069563629724301, - "grad_norm": 0.9688825011253357, - "learning_rate": 3.6478460115151084e-05, - "loss": 0.9077, + "epoch": 0.33587786259541985, + "grad_norm": 0.9943401217460632, + "learning_rate": 3.358196010407632e-05, + "loss": 0.9546, "step": 1936 }, { - "epoch": 0.7073215263830565, - "grad_norm": 1.0672504901885986, - "learning_rate": 3.6470326442965475e-05, - "loss": 0.9607, + "epoch": 0.33605135322692575, + "grad_norm": 1.183051347732544, + "learning_rate": 3.3599306157849094e-05, + "loss": 0.6936, "step": 1937 }, { - "epoch": 0.7076866897936827, - "grad_norm": 0.9548637270927429, - "learning_rate": 3.6462184297457826e-05, - "loss": 0.9131, + "epoch": 0.33622484385843165, + "grad_norm": 1.1374094486236572, + "learning_rate": 3.361665221162186e-05, + "loss": 0.9563, "step": 1938 }, { - "epoch": 0.708051853204309, - "grad_norm": 1.5463451147079468, - "learning_rate": 3.6454033682816946e-05, - "loss": 0.9471, + "epoch": 0.33639833448993756, + "grad_norm": 0.8758159279823303, + "learning_rate": 3.3633998265394624e-05, + "loss": 0.9436, "step": 1939 }, { - "epoch": 0.7084170166149352, - "grad_norm": 3.7445669174194336, - "learning_rate": 3.6445874603235986e-05, - "loss": 0.9583, + "epoch": 0.33657182512144346, + "grad_norm": 1.0233036279678345, + "learning_rate": 3.3651344319167396e-05, + "loss": 0.9089, "step": 1940 }, { - "epoch": 0.7087821800255615, - "grad_norm": 1.1429415941238403, - "learning_rate": 3.643770706291248e-05, - "loss": 0.9155, + "epoch": 0.33674531575294936, + "grad_norm": 1.0810378789901733, + "learning_rate": 3.366869037294016e-05, + "loss": 0.7905, "step": 1941 }, { - "epoch": 0.7091473434361877, - "grad_norm": 0.9995725154876709, - "learning_rate": 3.642953106604829e-05, - "loss": 0.9056, + "epoch": 0.33691880638445526, + "grad_norm": 1.0955419540405273, + "learning_rate": 3.3686036426712926e-05, + "loss": 0.696, "step": 1942 }, { - "epoch": 0.709512506846814, - "grad_norm": 1.395482063293457, - "learning_rate": 3.6421346616849645e-05, - "loss": 0.9554, + "epoch": 0.33709229701596116, + "grad_norm": 1.1725412607192993, + "learning_rate": 3.370338248048569e-05, + "loss": 0.8118, "step": 1943 }, { - "epoch": 0.7098776702574402, - "grad_norm": 1.0018835067749023, - "learning_rate": 3.641315371952711e-05, - "loss": 0.9567, + "epoch": 0.33726578764746706, + "grad_norm": 0.9102433323860168, + "learning_rate": 3.372072853425846e-05, + "loss": 0.875, "step": 1944 }, { - "epoch": 0.7102428336680665, - "grad_norm": 1.5387073755264282, - "learning_rate": 3.640495237829561e-05, - "loss": 0.9747, + "epoch": 0.3374392782789729, + "grad_norm": 1.4029414653778076, + "learning_rate": 3.373807458803123e-05, + "loss": 0.7993, "step": 1945 }, { - "epoch": 0.7106079970786927, - "grad_norm": 1.1105773448944092, - "learning_rate": 3.63967425973744e-05, - "loss": 0.9293, + "epoch": 0.3376127689104788, + "grad_norm": 0.8229091167449951, + "learning_rate": 3.375542064180399e-05, + "loss": 0.7717, "step": 1946 }, { - "epoch": 0.710973160489319, - "grad_norm": 0.9825156331062317, - "learning_rate": 3.638852438098708e-05, - "loss": 0.889, + "epoch": 0.3377862595419847, + "grad_norm": 0.8888067007064819, + "learning_rate": 3.377276669557676e-05, + "loss": 1.0107, "step": 1947 }, { - "epoch": 0.7113383238999452, - "grad_norm": 1.1243983507156372, - "learning_rate": 3.63802977333616e-05, - "loss": 0.9288, + "epoch": 0.3379597501734906, + "grad_norm": 0.8750840425491333, + "learning_rate": 3.379011274934952e-05, + "loss": 0.876, "step": 1948 }, { - "epoch": 0.7117034873105714, - "grad_norm": 0.8195019364356995, - "learning_rate": 3.637206265873024e-05, - "loss": 0.9614, + "epoch": 0.3381332408049965, + "grad_norm": 0.8015028834342957, + "learning_rate": 3.3807458803122295e-05, + "loss": 0.8425, "step": 1949 }, { - "epoch": 0.7120686507211977, - "grad_norm": 1.1837973594665527, - "learning_rate": 3.6363819161329606e-05, - "loss": 0.9677, + "epoch": 0.3383067314365024, + "grad_norm": 0.9893739223480225, + "learning_rate": 3.382480485689506e-05, + "loss": 0.9106, "step": 1950 }, { - "epoch": 0.7124338141318239, - "grad_norm": 0.9282038807868958, - "learning_rate": 3.6355567245400655e-05, - "loss": 0.912, + "epoch": 0.3384802220680083, + "grad_norm": 0.9822006821632385, + "learning_rate": 3.3842150910667825e-05, + "loss": 0.8193, "step": 1951 }, { - "epoch": 0.7127989775424503, - "grad_norm": 1.0558252334594727, - "learning_rate": 3.634730691518866e-05, - "loss": 0.9525, + "epoch": 0.3386537126995142, + "grad_norm": 0.7584022283554077, + "learning_rate": 3.385949696444059e-05, + "loss": 0.9055, "step": 1952 }, { - "epoch": 0.7131641409530765, - "grad_norm": 1.4195339679718018, - "learning_rate": 3.633903817494324e-05, - "loss": 0.9524, + "epoch": 0.3388272033310201, + "grad_norm": 1.1308552026748657, + "learning_rate": 3.387684301821336e-05, + "loss": 0.8613, "step": 1953 }, { - "epoch": 0.7135293043637028, - "grad_norm": 0.9032779932022095, - "learning_rate": 3.633076102891832e-05, - "loss": 0.9746, + "epoch": 0.339000693962526, + "grad_norm": 1.3956739902496338, + "learning_rate": 3.3894189071986126e-05, + "loss": 0.8506, "step": 1954 }, { - "epoch": 0.713894467774329, - "grad_norm": 0.9755726456642151, - "learning_rate": 3.632247548137217e-05, - "loss": 0.9253, + "epoch": 0.3391741845940319, + "grad_norm": 0.8418461680412292, + "learning_rate": 3.391153512575889e-05, + "loss": 1.0361, "step": 1955 }, { - "epoch": 0.7142596311849553, - "grad_norm": 0.8415127992630005, - "learning_rate": 3.631418153656736e-05, - "loss": 0.9137, + "epoch": 0.3393476752255378, + "grad_norm": 1.1066426038742065, + "learning_rate": 3.3928881179531656e-05, + "loss": 0.8389, "step": 1956 }, { - "epoch": 0.7146247945955815, - "grad_norm": 1.248856544494629, - "learning_rate": 3.630587919877079e-05, - "loss": 0.9761, + "epoch": 0.3395211658570437, + "grad_norm": 0.9013575911521912, + "learning_rate": 3.394622723330443e-05, + "loss": 0.8657, "step": 1957 }, { - "epoch": 0.7149899580062078, - "grad_norm": 1.0752594470977783, - "learning_rate": 3.6297568472253694e-05, - "loss": 0.947, + "epoch": 0.33969465648854963, + "grad_norm": 1.0212585926055908, + "learning_rate": 3.396357328707719e-05, + "loss": 0.8691, "step": 1958 }, { - "epoch": 0.715355121416834, - "grad_norm": 0.9795998930931091, - "learning_rate": 3.628924936129161e-05, - "loss": 0.9301, + "epoch": 0.33986814712005553, + "grad_norm": 1.7378517389297485, + "learning_rate": 3.398091934084996e-05, + "loss": 0.8423, "step": 1959 }, { - "epoch": 0.7157202848274603, - "grad_norm": 0.9856581091880798, - "learning_rate": 3.628092187016436e-05, - "loss": 0.9791, + "epoch": 0.34004163775156143, + "grad_norm": 1.103600025177002, + "learning_rate": 3.399826539462272e-05, + "loss": 0.7974, "step": 1960 }, { - "epoch": 0.7160854482380865, - "grad_norm": 1.8521298170089722, - "learning_rate": 3.627258600315612e-05, - "loss": 0.9698, + "epoch": 0.34021512838306733, + "grad_norm": 1.118898868560791, + "learning_rate": 3.401561144839549e-05, + "loss": 0.9175, "step": 1961 }, { - "epoch": 0.7164506116487128, - "grad_norm": 1.0116791725158691, - "learning_rate": 3.626424176455537e-05, - "loss": 0.9263, + "epoch": 0.34038861901457323, + "grad_norm": 1.0663821697235107, + "learning_rate": 3.403295750216826e-05, + "loss": 0.939, "step": 1962 }, { - "epoch": 0.716815775059339, - "grad_norm": 1.004432201385498, - "learning_rate": 3.625588915865487e-05, - "loss": 0.9237, + "epoch": 0.34056210964607914, + "grad_norm": 0.936802327632904, + "learning_rate": 3.4050303555941025e-05, + "loss": 0.7253, "step": 1963 }, { - "epoch": 0.7171809384699653, - "grad_norm": 1.4358326196670532, - "learning_rate": 3.624752818975171e-05, - "loss": 0.9324, + "epoch": 0.34073560027758504, + "grad_norm": 1.1755987405776978, + "learning_rate": 3.406764960971379e-05, + "loss": 0.7428, "step": 1964 }, { - "epoch": 0.7175461018805915, - "grad_norm": 1.263378620147705, - "learning_rate": 3.623915886214726e-05, - "loss": 0.9209, + "epoch": 0.3409090909090909, + "grad_norm": 0.9720269441604614, + "learning_rate": 3.4084995663486555e-05, + "loss": 0.6709, "step": 1965 }, { - "epoch": 0.7179112652912178, - "grad_norm": 1.05778968334198, - "learning_rate": 3.6230781180147225e-05, - "loss": 0.9432, + "epoch": 0.3410825815405968, + "grad_norm": 1.3342781066894531, + "learning_rate": 3.410234171725933e-05, + "loss": 0.7622, "step": 1966 }, { - "epoch": 0.718276428701844, - "grad_norm": 1.2332619428634644, - "learning_rate": 3.622239514806157e-05, - "loss": 0.9784, + "epoch": 0.3412560721721027, + "grad_norm": 0.8785350322723389, + "learning_rate": 3.411968777103209e-05, + "loss": 1.0066, "step": 1967 }, { - "epoch": 0.7186415921124704, - "grad_norm": 1.0369032621383667, - "learning_rate": 3.621400077020457e-05, - "loss": 0.9117, + "epoch": 0.3414295628036086, + "grad_norm": 1.1968451738357544, + "learning_rate": 3.4137033824804864e-05, + "loss": 0.7346, "step": 1968 }, { - "epoch": 0.7190067555230966, - "grad_norm": 1.2543957233428955, - "learning_rate": 3.62055980508948e-05, - "loss": 0.9325, + "epoch": 0.3416030534351145, + "grad_norm": 0.86801677942276, + "learning_rate": 3.415437987857763e-05, + "loss": 0.9033, "step": 1969 }, { - "epoch": 0.7193719189337229, - "grad_norm": 0.9119171500205994, - "learning_rate": 3.619718699445513e-05, - "loss": 0.8977, + "epoch": 0.3417765440666204, + "grad_norm": 1.0314303636550903, + "learning_rate": 3.4171725932350394e-05, + "loss": 0.8881, "step": 1970 }, { - "epoch": 0.7197370823443491, - "grad_norm": 1.069026231765747, - "learning_rate": 3.61887676052127e-05, - "loss": 0.9006, + "epoch": 0.3419500346981263, + "grad_norm": 1.1210917234420776, + "learning_rate": 3.418907198612316e-05, + "loss": 0.7715, "step": 1971 }, { - "epoch": 0.7201022457549754, - "grad_norm": 0.7960483431816101, - "learning_rate": 3.6180339887498953e-05, - "loss": 0.9205, + "epoch": 0.3421235253296322, + "grad_norm": 0.742502748966217, + "learning_rate": 3.420641803989593e-05, + "loss": 0.9341, "step": 1972 }, { - "epoch": 0.7204674091656016, - "grad_norm": 1.0640980005264282, - "learning_rate": 3.617190384564961e-05, - "loss": 0.9325, + "epoch": 0.3422970159611381, + "grad_norm": 0.9064629673957825, + "learning_rate": 3.4223764093668695e-05, + "loss": 0.7869, "step": 1973 }, { - "epoch": 0.7208325725762279, - "grad_norm": 1.064061164855957, - "learning_rate": 3.616345948400468e-05, - "loss": 0.947, + "epoch": 0.342470506592644, + "grad_norm": 0.8634181022644043, + "learning_rate": 3.424111014744146e-05, + "loss": 0.8555, "step": 1974 }, { - "epoch": 0.7211977359868541, - "grad_norm": 1.13766610622406, - "learning_rate": 3.615500680690843e-05, - "loss": 0.9387, + "epoch": 0.3426439972241499, + "grad_norm": 0.9857492446899414, + "learning_rate": 3.4258456201214225e-05, + "loss": 0.7351, "step": 1975 }, { - "epoch": 0.7215628993974804, - "grad_norm": 1.0495824813842773, - "learning_rate": 3.614654581870945e-05, - "loss": 0.9211, + "epoch": 0.3428174878556558, + "grad_norm": 1.1154719591140747, + "learning_rate": 3.4275802254987e-05, + "loss": 0.7798, "step": 1976 }, { - "epoch": 0.7219280628081066, - "grad_norm": 0.8311571478843689, - "learning_rate": 3.613807652376057e-05, - "loss": 0.9071, + "epoch": 0.3429909784871617, + "grad_norm": 1.0221623182296753, + "learning_rate": 3.429314830875976e-05, + "loss": 0.803, "step": 1977 }, { - "epoch": 0.7222932262187329, - "grad_norm": 0.999047040939331, - "learning_rate": 3.6129598926418896e-05, - "loss": 0.8928, + "epoch": 0.3431644691186676, + "grad_norm": 1.6454132795333862, + "learning_rate": 3.431049436253253e-05, + "loss": 0.7537, "step": 1978 }, { - "epoch": 0.7226583896293591, - "grad_norm": 0.8920377492904663, - "learning_rate": 3.6121113031045815e-05, - "loss": 0.9216, + "epoch": 0.3433379597501735, + "grad_norm": 1.1189544200897217, + "learning_rate": 3.432784041630529e-05, + "loss": 0.6555, "step": 1979 }, { - "epoch": 0.7230235530399854, - "grad_norm": 1.2837882041931152, - "learning_rate": 3.611261884200698e-05, - "loss": 0.9325, + "epoch": 0.3435114503816794, + "grad_norm": 0.7680050730705261, + "learning_rate": 3.434518647007806e-05, + "loss": 0.8787, "step": 1980 }, { - "epoch": 0.7233887164506116, - "grad_norm": 1.0123838186264038, - "learning_rate": 3.6104116363672304e-05, - "loss": 0.9392, + "epoch": 0.3436849410131853, + "grad_norm": 0.9554882049560547, + "learning_rate": 3.436253252385083e-05, + "loss": 0.8423, "step": 1981 }, { - "epoch": 0.723753879861238, - "grad_norm": 1.7658016681671143, - "learning_rate": 3.6095605600415985e-05, - "loss": 0.9102, + "epoch": 0.3438584316446912, + "grad_norm": 1.0437198877334595, + "learning_rate": 3.4379878577623594e-05, + "loss": 0.7786, "step": 1982 }, { - "epoch": 0.7241190432718642, - "grad_norm": 1.2692636251449585, - "learning_rate": 3.6087086556616457e-05, - "loss": 0.9359, + "epoch": 0.3440319222761971, + "grad_norm": 0.837895393371582, + "learning_rate": 3.439722463139636e-05, + "loss": 0.9417, "step": 1983 }, { - "epoch": 0.7244842066824904, - "grad_norm": 1.1934521198272705, - "learning_rate": 3.607855923665643e-05, - "loss": 0.9049, + "epoch": 0.34420541290770296, + "grad_norm": 1.1112855672836304, + "learning_rate": 3.4414570685169124e-05, + "loss": 0.7939, "step": 1984 }, { - "epoch": 0.7248493700931167, - "grad_norm": 1.2956231832504272, - "learning_rate": 3.607002364492287e-05, - "loss": 0.8772, + "epoch": 0.34437890353920886, + "grad_norm": 0.7975980639457703, + "learning_rate": 3.4431916738941896e-05, + "loss": 0.8147, "step": 1985 }, { - "epoch": 0.7252145335037429, - "grad_norm": 1.3445693254470825, - "learning_rate": 3.6061479785806996e-05, - "loss": 0.9381, + "epoch": 0.34455239417071476, + "grad_norm": 0.757238507270813, + "learning_rate": 3.444926279271466e-05, + "loss": 0.8098, "step": 1986 }, { - "epoch": 0.7255796969143692, - "grad_norm": 1.367138385772705, - "learning_rate": 3.6052927663704276e-05, - "loss": 0.9575, + "epoch": 0.34472588480222066, + "grad_norm": 0.8225416541099548, + "learning_rate": 3.4466608846487426e-05, + "loss": 0.7776, "step": 1987 }, { - "epoch": 0.7259448603249954, - "grad_norm": 1.2053515911102295, - "learning_rate": 3.604436728301443e-05, - "loss": 0.9792, + "epoch": 0.34489937543372656, + "grad_norm": 0.8365417718887329, + "learning_rate": 3.448395490026019e-05, + "loss": 0.8291, "step": 1988 }, { - "epoch": 0.7263100237356217, - "grad_norm": 0.9317446351051331, - "learning_rate": 3.603579864814145e-05, - "loss": 0.9921, + "epoch": 0.34507286606523246, + "grad_norm": 1.095427393913269, + "learning_rate": 3.4501300954032956e-05, + "loss": 0.8188, "step": 1989 }, { - "epoch": 0.7266751871462479, - "grad_norm": 0.9904404878616333, - "learning_rate": 3.6027221763493534e-05, - "loss": 0.9521, + "epoch": 0.34524635669673837, + "grad_norm": 0.8960191607475281, + "learning_rate": 3.451864700780573e-05, + "loss": 0.8806, "step": 1990 }, { - "epoch": 0.7270403505568742, - "grad_norm": 1.3663259744644165, - "learning_rate": 3.6018636633483154e-05, - "loss": 0.9346, + "epoch": 0.34541984732824427, + "grad_norm": 2.0384774208068848, + "learning_rate": 3.453599306157849e-05, + "loss": 0.8958, "step": 1991 }, { - "epoch": 0.7274055139675004, - "grad_norm": 1.3204405307769775, - "learning_rate": 3.601004326252702e-05, - "loss": 0.9343, + "epoch": 0.34559333795975017, + "grad_norm": 0.9212133884429932, + "learning_rate": 3.455333911535126e-05, + "loss": 0.8643, "step": 1992 }, { - "epoch": 0.7277706773781267, - "grad_norm": 1.3128724098205566, - "learning_rate": 3.600144165504607e-05, - "loss": 0.928, + "epoch": 0.34576682859125607, + "grad_norm": 1.4166983366012573, + "learning_rate": 3.457068516912402e-05, + "loss": 0.8694, "step": 1993 }, { - "epoch": 0.7281358407887529, - "grad_norm": 1.166178584098816, - "learning_rate": 3.5992831815465476e-05, - "loss": 0.9329, + "epoch": 0.34594031922276197, + "grad_norm": 0.834001362323761, + "learning_rate": 3.4588031222896794e-05, + "loss": 0.8774, "step": 1994 }, { - "epoch": 0.7285010041993792, - "grad_norm": 1.0174297094345093, - "learning_rate": 3.598421374821468e-05, - "loss": 0.9331, + "epoch": 0.3461138098542679, + "grad_norm": 0.9860048890113831, + "learning_rate": 3.460537727666956e-05, + "loss": 0.8489, "step": 1995 }, { - "epoch": 0.7288661676100054, - "grad_norm": 0.9264554381370544, - "learning_rate": 3.59755874577273e-05, - "loss": 0.9106, + "epoch": 0.3462873004857738, + "grad_norm": 0.8251680731773376, + "learning_rate": 3.462272333044233e-05, + "loss": 0.7881, "step": 1996 }, { - "epoch": 0.7292313310206318, - "grad_norm": 2.011765956878662, - "learning_rate": 3.596695294844124e-05, - "loss": 0.9156, + "epoch": 0.3464607911172797, + "grad_norm": 1.0361677408218384, + "learning_rate": 3.4640069384215096e-05, + "loss": 0.759, "step": 1997 }, { - "epoch": 0.729596494431258, - "grad_norm": 1.3578325510025024, - "learning_rate": 3.5958310224798605e-05, - "loss": 0.9456, + "epoch": 0.3466342817487856, + "grad_norm": 0.8769716620445251, + "learning_rate": 3.465741543798786e-05, + "loss": 0.7356, "step": 1998 }, { - "epoch": 0.7299616578418843, - "grad_norm": 1.095751166343689, - "learning_rate": 3.5949659291245727e-05, - "loss": 0.9287, + "epoch": 0.3468077723802915, + "grad_norm": 0.8745731115341187, + "learning_rate": 3.4674761491760626e-05, + "loss": 0.8367, "step": 1999 }, { - "epoch": 0.7303268212525105, - "grad_norm": 1.1574761867523193, - "learning_rate": 3.5941000152233166e-05, - "loss": 0.8928, + "epoch": 0.3469812630117974, + "grad_norm": 1.044663906097412, + "learning_rate": 3.46921075455334e-05, + "loss": 0.6624, "step": 2000 }, { - "epoch": 0.7306919846631368, - "grad_norm": 1.0384608507156372, - "learning_rate": 3.5932332812215694e-05, - "loss": 0.923, + "epoch": 0.3471547536433033, + "grad_norm": 1.18858802318573, + "learning_rate": 3.470945359930616e-05, + "loss": 0.7061, "step": 2001 }, { - "epoch": 0.731057148073763, - "grad_norm": 1.0505645275115967, - "learning_rate": 3.5923657275652316e-05, - "loss": 0.9097, + "epoch": 0.3473282442748092, + "grad_norm": 0.9339502453804016, + "learning_rate": 3.472679965307893e-05, + "loss": 0.8042, "step": 2002 }, { - "epoch": 0.7314223114843893, - "grad_norm": 1.2344129085540771, - "learning_rate": 3.5914973547006244e-05, - "loss": 0.9514, + "epoch": 0.3475017349063151, + "grad_norm": 1.0052956342697144, + "learning_rate": 3.474414570685169e-05, + "loss": 0.8198, "step": 2003 }, { - "epoch": 0.7317874748950155, - "grad_norm": 1.2565823793411255, - "learning_rate": 3.5906281630744914e-05, - "loss": 0.9563, + "epoch": 0.34767522553782093, + "grad_norm": 2.3821754455566406, + "learning_rate": 3.4761491760624465e-05, + "loss": 0.9255, "step": 2004 }, { - "epoch": 0.7321526383056418, - "grad_norm": 1.4656277894973755, - "learning_rate": 3.589758153133996e-05, - "loss": 0.9543, + "epoch": 0.34784871616932683, + "grad_norm": 1.1659489870071411, + "learning_rate": 3.477883781439723e-05, + "loss": 0.7817, "step": 2005 }, { - "epoch": 0.732517801716268, - "grad_norm": 1.2206615209579468, - "learning_rate": 3.588887325326725e-05, - "loss": 0.9501, + "epoch": 0.34802220680083273, + "grad_norm": 0.81935054063797, + "learning_rate": 3.4796183868169995e-05, + "loss": 0.8855, "step": 2006 }, { - "epoch": 0.7328829651268943, - "grad_norm": 1.137550950050354, - "learning_rate": 3.5880156801006826e-05, - "loss": 0.9088, + "epoch": 0.34819569743233864, + "grad_norm": 1.137302279472351, + "learning_rate": 3.481352992194276e-05, + "loss": 0.7854, "step": 2007 }, { - "epoch": 0.7332481285375205, - "grad_norm": 1.0446393489837646, - "learning_rate": 3.587143217904295e-05, - "loss": 0.8965, + "epoch": 0.34836918806384454, + "grad_norm": 1.24305260181427, + "learning_rate": 3.4830875975715525e-05, + "loss": 0.7805, "step": 2008 }, { - "epoch": 0.7336132919481468, - "grad_norm": 1.1560595035552979, - "learning_rate": 3.58626993918641e-05, - "loss": 0.9227, + "epoch": 0.34854267869535044, + "grad_norm": 0.9296610951423645, + "learning_rate": 3.4848222029488296e-05, + "loss": 0.9502, "step": 2009 }, { - "epoch": 0.733978455358773, - "grad_norm": 1.0209001302719116, - "learning_rate": 3.585395844396295e-05, - "loss": 0.8823, + "epoch": 0.34871616932685634, + "grad_norm": 0.9316244125366211, + "learning_rate": 3.486556808326106e-05, + "loss": 0.8608, "step": 2010 }, { - "epoch": 0.7343436187693994, - "grad_norm": 0.818286657333374, - "learning_rate": 3.584520933983636e-05, - "loss": 0.9417, + "epoch": 0.34888965995836224, + "grad_norm": 1.0122873783111572, + "learning_rate": 3.4882914137033826e-05, + "loss": 0.7588, "step": 2011 }, { - "epoch": 0.7347087821800256, - "grad_norm": 1.2019844055175781, - "learning_rate": 3.5836452083985394e-05, - "loss": 0.9547, + "epoch": 0.34906315058986814, + "grad_norm": 0.9167733788490295, + "learning_rate": 3.490026019080659e-05, + "loss": 0.8774, "step": 2012 }, { - "epoch": 0.7350739455906519, - "grad_norm": 1.0730644464492798, - "learning_rate": 3.58276866809153e-05, - "loss": 0.8715, + "epoch": 0.34923664122137404, + "grad_norm": 0.8625271916389465, + "learning_rate": 3.491760624457936e-05, + "loss": 0.7161, "step": 2013 }, { - "epoch": 0.7354391090012781, - "grad_norm": 0.9025311470031738, - "learning_rate": 3.581891313513555e-05, - "loss": 0.9471, + "epoch": 0.34941013185287995, + "grad_norm": 1.0057225227355957, + "learning_rate": 3.493495229835213e-05, + "loss": 0.7952, "step": 2014 }, { - "epoch": 0.7358042724119044, - "grad_norm": 1.4166213274002075, - "learning_rate": 3.581013145115975e-05, - "loss": 0.9308, + "epoch": 0.34958362248438585, + "grad_norm": 1.0647094249725342, + "learning_rate": 3.495229835212489e-05, + "loss": 0.7368, "step": 2015 }, { - "epoch": 0.7361694358225306, - "grad_norm": 0.9844183921813965, - "learning_rate": 3.580134163350575e-05, - "loss": 0.915, + "epoch": 0.34975711311589175, + "grad_norm": 0.8171172142028809, + "learning_rate": 3.496964440589766e-05, + "loss": 0.7891, "step": 2016 }, { - "epoch": 0.7365345992331568, - "grad_norm": 1.100968599319458, - "learning_rate": 3.5792543686695544e-05, - "loss": 0.9126, + "epoch": 0.34993060374739765, + "grad_norm": 0.8449965119361877, + "learning_rate": 3.498699045967043e-05, + "loss": 0.9312, "step": 2017 }, { - "epoch": 0.7368997626437831, - "grad_norm": 0.9758762121200562, - "learning_rate": 3.5783737615255326e-05, - "loss": 0.9344, + "epoch": 0.35010409437890355, + "grad_norm": 0.8817417621612549, + "learning_rate": 3.5004336513443195e-05, + "loss": 1.0083, "step": 2018 }, { - "epoch": 0.7372649260544093, - "grad_norm": 1.0051456689834595, - "learning_rate": 3.5774923423715464e-05, - "loss": 0.931, + "epoch": 0.35027758501040945, + "grad_norm": 0.8974473476409912, + "learning_rate": 3.502168256721596e-05, + "loss": 0.8679, "step": 2019 }, { - "epoch": 0.7376300894650356, - "grad_norm": 1.428348183631897, - "learning_rate": 3.576610111661051e-05, - "loss": 0.8705, + "epoch": 0.35045107564191535, + "grad_norm": 1.2601897716522217, + "learning_rate": 3.5039028620988725e-05, + "loss": 0.7808, "step": 2020 }, { - "epoch": 0.7379952528756618, - "grad_norm": 0.7692030072212219, - "learning_rate": 3.5757270698479186e-05, - "loss": 0.9285, + "epoch": 0.35062456627342126, + "grad_norm": 1.1432396173477173, + "learning_rate": 3.505637467476149e-05, + "loss": 0.7083, "step": 2021 }, { - "epoch": 0.7383604162862881, - "grad_norm": 1.0516414642333984, - "learning_rate": 3.5748432173864394e-05, - "loss": 0.947, + "epoch": 0.35079805690492716, + "grad_norm": 1.602184534072876, + "learning_rate": 3.507372072853426e-05, + "loss": 0.729, "step": 2022 }, { - "epoch": 0.7387255796969143, - "grad_norm": 1.0566405057907104, - "learning_rate": 3.573958554731319e-05, - "loss": 0.9291, + "epoch": 0.35097154753643306, + "grad_norm": 1.0737391710281372, + "learning_rate": 3.509106678230703e-05, + "loss": 0.7606, "step": 2023 }, { - "epoch": 0.7390907431075406, - "grad_norm": 1.1255320310592651, - "learning_rate": 3.573073082337681e-05, - "loss": 0.9504, + "epoch": 0.3511450381679389, + "grad_norm": 0.8534942269325256, + "learning_rate": 3.510841283607979e-05, + "loss": 0.8467, "step": 2024 }, { - "epoch": 0.7394559065181668, - "grad_norm": 0.7945982813835144, - "learning_rate": 3.572186800661065e-05, - "loss": 0.9442, + "epoch": 0.3513185287994448, + "grad_norm": 0.9060901999473572, + "learning_rate": 3.512575888985256e-05, + "loss": 0.8833, "step": 2025 }, { - "epoch": 0.7398210699287932, - "grad_norm": 1.0442267656326294, - "learning_rate": 3.571299710157429e-05, - "loss": 0.9294, + "epoch": 0.3514920194309507, + "grad_norm": 0.6534338593482971, + "learning_rate": 3.514310494362533e-05, + "loss": 0.905, "step": 2026 }, { - "epoch": 0.7401862333394194, - "grad_norm": 1.2144863605499268, - "learning_rate": 3.570411811283144e-05, - "loss": 0.955, + "epoch": 0.3516655100624566, + "grad_norm": 0.7141449451446533, + "learning_rate": 3.5160450997398094e-05, + "loss": 0.9619, "step": 2027 }, { - "epoch": 0.7405513967500457, - "grad_norm": 1.1833593845367432, - "learning_rate": 3.569523104494999e-05, - "loss": 0.9056, + "epoch": 0.3518390006939625, + "grad_norm": 1.1378304958343506, + "learning_rate": 3.5177797051170865e-05, + "loss": 0.7664, "step": 2028 }, { - "epoch": 0.7409165601606719, - "grad_norm": 1.2964634895324707, - "learning_rate": 3.568633590250198e-05, - "loss": 0.9363, + "epoch": 0.3520124913254684, + "grad_norm": 1.020209550857544, + "learning_rate": 3.519514310494363e-05, + "loss": 0.7507, "step": 2029 }, { - "epoch": 0.7412817235712982, - "grad_norm": 0.8896535038948059, - "learning_rate": 3.56774326900636e-05, - "loss": 0.913, + "epoch": 0.3521859819569743, + "grad_norm": 1.4680802822113037, + "learning_rate": 3.5212489158716395e-05, + "loss": 0.8699, "step": 2030 }, { - "epoch": 0.7416468869819244, - "grad_norm": 1.058685302734375, - "learning_rate": 3.5668521412215194e-05, - "loss": 0.9673, + "epoch": 0.3523594725884802, + "grad_norm": 1.0365432500839233, + "learning_rate": 3.522983521248916e-05, + "loss": 0.7227, "step": 2031 }, { - "epoch": 0.7420120503925507, - "grad_norm": 1.0688618421554565, - "learning_rate": 3.5659602073541256e-05, - "loss": 0.9661, + "epoch": 0.3525329632199861, + "grad_norm": 0.875889241695404, + "learning_rate": 3.524718126626193e-05, + "loss": 0.7778, "step": 2032 }, { - "epoch": 0.7423772138031769, - "grad_norm": 1.0344313383102417, - "learning_rate": 3.565067467863044e-05, - "loss": 0.947, + "epoch": 0.352706453851492, + "grad_norm": 0.7997333407402039, + "learning_rate": 3.52645273200347e-05, + "loss": 0.896, "step": 2033 }, { - "epoch": 0.7427423772138032, - "grad_norm": 1.7594391107559204, - "learning_rate": 3.564173923207553e-05, - "loss": 0.9242, + "epoch": 0.3528799444829979, + "grad_norm": 1.2736715078353882, + "learning_rate": 3.528187337380746e-05, + "loss": 0.7092, "step": 2034 }, { - "epoch": 0.7431075406244294, - "grad_norm": 1.6160258054733276, - "learning_rate": 3.563279573847344e-05, - "loss": 0.9523, + "epoch": 0.3530534351145038, + "grad_norm": 0.9862885475158691, + "learning_rate": 3.529921942758023e-05, + "loss": 0.7927, "step": 2035 }, { - "epoch": 0.7434727040350557, - "grad_norm": 1.1309272050857544, - "learning_rate": 3.5623844202425245e-05, - "loss": 0.9414, + "epoch": 0.3532269257460097, + "grad_norm": 0.9566293954849243, + "learning_rate": 3.5316565481353e-05, + "loss": 0.8462, "step": 2036 }, { - "epoch": 0.7438378674456819, - "grad_norm": 1.0305252075195312, - "learning_rate": 3.5614884628536156e-05, - "loss": 0.9418, + "epoch": 0.3534004163775156, + "grad_norm": 1.4425415992736816, + "learning_rate": 3.5333911535125764e-05, + "loss": 0.8606, "step": 2037 }, { - "epoch": 0.7442030308563082, - "grad_norm": 1.0900720357894897, - "learning_rate": 3.560591702141552e-05, - "loss": 0.9301, + "epoch": 0.3535739070090215, + "grad_norm": 1.2802646160125732, + "learning_rate": 3.535125758889853e-05, + "loss": 0.8096, "step": 2038 }, { - "epoch": 0.7445681942669344, - "grad_norm": 0.9725579023361206, - "learning_rate": 3.559694138567679e-05, - "loss": 0.9202, + "epoch": 0.35374739764052743, + "grad_norm": 0.7803860902786255, + "learning_rate": 3.5368603642671294e-05, + "loss": 0.843, "step": 2039 }, { - "epoch": 0.7449333576775607, - "grad_norm": 1.2923104763031006, - "learning_rate": 3.558795772593759e-05, - "loss": 0.9454, + "epoch": 0.35392088827203333, + "grad_norm": 1.3120194673538208, + "learning_rate": 3.538594969644406e-05, + "loss": 0.8054, "step": 2040 }, { - "epoch": 0.745298521088187, - "grad_norm": 1.0758559703826904, - "learning_rate": 3.5578966046819644e-05, - "loss": 0.9221, + "epoch": 0.35409437890353923, + "grad_norm": 0.6876341104507446, + "learning_rate": 3.540329575021683e-05, + "loss": 0.756, "step": 2041 }, { - "epoch": 0.7456636844988133, - "grad_norm": 1.3801772594451904, - "learning_rate": 3.556996635294881e-05, - "loss": 0.9083, + "epoch": 0.35426786953504513, + "grad_norm": 0.8121494650840759, + "learning_rate": 3.5420641803989596e-05, + "loss": 0.8105, "step": 2042 }, { - "epoch": 0.7460288479094395, - "grad_norm": 1.0228190422058105, - "learning_rate": 3.556095864895508e-05, - "loss": 0.8953, + "epoch": 0.35444136016655103, + "grad_norm": 0.9092084169387817, + "learning_rate": 3.543798785776236e-05, + "loss": 0.8445, "step": 2043 }, { - "epoch": 0.7463940113200658, - "grad_norm": 1.2529135942459106, - "learning_rate": 3.555194293947254e-05, - "loss": 0.926, + "epoch": 0.3546148507980569, + "grad_norm": 1.3432433605194092, + "learning_rate": 3.5455333911535126e-05, + "loss": 0.8701, "step": 2044 }, { - "epoch": 0.746759174730692, - "grad_norm": 1.0172481536865234, - "learning_rate": 3.554291922913942e-05, - "loss": 0.9467, + "epoch": 0.3547883414295628, + "grad_norm": 0.9198431372642517, + "learning_rate": 3.54726799653079e-05, + "loss": 0.8394, "step": 2045 }, { - "epoch": 0.7471243381413183, - "grad_norm": 1.6456912755966187, - "learning_rate": 3.553388752259806e-05, - "loss": 0.9426, + "epoch": 0.3549618320610687, + "grad_norm": 0.9898739457130432, + "learning_rate": 3.549002601908066e-05, + "loss": 0.8081, "step": 2046 }, { - "epoch": 0.7474895015519445, - "grad_norm": 1.2197327613830566, - "learning_rate": 3.5524847824494896e-05, - "loss": 0.941, + "epoch": 0.3551353226925746, + "grad_norm": 0.9238489270210266, + "learning_rate": 3.550737207285343e-05, + "loss": 0.772, "step": 2047 }, { - "epoch": 0.7478546649625708, - "grad_norm": 0.9726659059524536, - "learning_rate": 3.5515800139480505e-05, - "loss": 0.9437, + "epoch": 0.3553088133240805, + "grad_norm": 1.0620425939559937, + "learning_rate": 3.552471812662619e-05, + "loss": 0.8806, "step": 2048 }, { - "epoch": 0.748219828373197, - "grad_norm": 0.723832368850708, - "learning_rate": 3.5506744472209556e-05, - "loss": 0.9128, + "epoch": 0.3554823039555864, + "grad_norm": 1.0493426322937012, + "learning_rate": 3.554206418039896e-05, + "loss": 0.7682, "step": 2049 }, { - "epoch": 0.7485849917838232, - "grad_norm": 1.3554929494857788, - "learning_rate": 3.5497680827340816e-05, - "loss": 0.9608, + "epoch": 0.3556557945870923, + "grad_norm": 0.8164659738540649, + "learning_rate": 3.555941023417173e-05, + "loss": 0.8796, "step": 2050 }, { - "epoch": 0.7489501551944495, - "grad_norm": 0.8726962208747864, - "learning_rate": 3.5488609209537176e-05, - "loss": 0.9138, + "epoch": 0.3558292852185982, + "grad_norm": 0.9630699753761292, + "learning_rate": 3.5576756287944494e-05, + "loss": 0.895, "step": 2051 }, { - "epoch": 0.7493153186050757, - "grad_norm": 1.4007759094238281, - "learning_rate": 3.547952962346562e-05, - "loss": 0.9423, + "epoch": 0.3560027758501041, + "grad_norm": 0.8913407325744629, + "learning_rate": 3.559410234171726e-05, + "loss": 0.7363, "step": 2052 }, { - "epoch": 0.749680482015702, - "grad_norm": 1.183057188987732, - "learning_rate": 3.5470442073797224e-05, - "loss": 0.9108, + "epoch": 0.35617626648161, + "grad_norm": 0.9014322757720947, + "learning_rate": 3.5611448395490024e-05, + "loss": 0.731, "step": 2053 }, { - "epoch": 0.7500456454263282, - "grad_norm": 1.048283576965332, - "learning_rate": 3.5461346565207174e-05, - "loss": 0.9285, + "epoch": 0.3563497571131159, + "grad_norm": 0.9465219378471375, + "learning_rate": 3.5628794449262796e-05, + "loss": 0.752, "step": 2054 }, { - "epoch": 0.7504108088369545, - "grad_norm": 0.9665740132331848, - "learning_rate": 3.5452243102374737e-05, - "loss": 0.8929, + "epoch": 0.3565232477446218, + "grad_norm": 0.9615862369537354, + "learning_rate": 3.564614050303557e-05, + "loss": 0.7275, "step": 2055 }, { - "epoch": 0.7507759722475807, - "grad_norm": 1.2260096073150635, - "learning_rate": 3.5443131689983285e-05, - "loss": 0.896, + "epoch": 0.3566967383761277, + "grad_norm": 0.9010798335075378, + "learning_rate": 3.566348655680833e-05, + "loss": 0.7185, "step": 2056 }, { - "epoch": 0.7511411356582071, - "grad_norm": 1.088097333908081, - "learning_rate": 3.543401233272028e-05, - "loss": 0.9385, + "epoch": 0.3568702290076336, + "grad_norm": 1.3801579475402832, + "learning_rate": 3.56808326105811e-05, + "loss": 0.7314, "step": 2057 }, { - "epoch": 0.7515062990688333, - "grad_norm": 1.4327890872955322, - "learning_rate": 3.5424885035277255e-05, - "loss": 0.8987, + "epoch": 0.3570437196391395, + "grad_norm": 0.8271446824073792, + "learning_rate": 3.569817866435386e-05, + "loss": 0.9242, "step": 2058 }, { - "epoch": 0.7518714624794596, - "grad_norm": 0.8798975348472595, - "learning_rate": 3.541574980234983e-05, - "loss": 0.8976, + "epoch": 0.3572172102706454, + "grad_norm": 1.0096848011016846, + "learning_rate": 3.571552471812663e-05, + "loss": 0.69, "step": 2059 }, { - "epoch": 0.7522366258900858, - "grad_norm": 1.1647648811340332, - "learning_rate": 3.540660663863774e-05, - "loss": 0.9503, + "epoch": 0.3573907009021513, + "grad_norm": 0.9437965750694275, + "learning_rate": 3.57328707718994e-05, + "loss": 0.7495, "step": 2060 }, { - "epoch": 0.7526017893007121, - "grad_norm": 1.0108572244644165, - "learning_rate": 3.539745554884476e-05, - "loss": 0.8849, + "epoch": 0.3575641915336572, + "grad_norm": 0.7619524598121643, + "learning_rate": 3.5750216825672165e-05, + "loss": 0.9263, "step": 2061 }, { - "epoch": 0.7529669527113383, - "grad_norm": 0.9932869076728821, - "learning_rate": 3.5388296537678765e-05, - "loss": 0.9542, + "epoch": 0.3577376821651631, + "grad_norm": 1.1454344987869263, + "learning_rate": 3.576756287944493e-05, + "loss": 0.7795, "step": 2062 }, { - "epoch": 0.7533321161219646, - "grad_norm": 0.8637552261352539, - "learning_rate": 3.537912960985169e-05, - "loss": 0.8845, + "epoch": 0.35791117279666895, + "grad_norm": 1.2527416944503784, + "learning_rate": 3.5784908933217695e-05, + "loss": 0.6604, "step": 2063 }, { - "epoch": 0.7536972795325908, - "grad_norm": 1.2607074975967407, - "learning_rate": 3.536995477007955e-05, - "loss": 0.9256, + "epoch": 0.35808466342817485, + "grad_norm": 1.025180697441101, + "learning_rate": 3.5802254986990466e-05, + "loss": 0.7598, "step": 2064 }, { - "epoch": 0.7540624429432171, - "grad_norm": 1.5357863903045654, - "learning_rate": 3.5360772023082446e-05, - "loss": 0.9554, + "epoch": 0.35825815405968076, + "grad_norm": 0.6621628999710083, + "learning_rate": 3.581960104076323e-05, + "loss": 0.8345, "step": 2065 }, { - "epoch": 0.7544276063538433, - "grad_norm": 1.3150715827941895, - "learning_rate": 3.535158137358453e-05, - "loss": 0.9459, + "epoch": 0.35843164469118666, + "grad_norm": 0.9974340796470642, + "learning_rate": 3.5836947094535996e-05, + "loss": 0.8447, "step": 2066 }, { - "epoch": 0.7547927697644696, - "grad_norm": 0.8139840364456177, - "learning_rate": 3.534238282631401e-05, - "loss": 0.9395, + "epoch": 0.35860513532269256, + "grad_norm": 0.9111983180046082, + "learning_rate": 3.585429314830876e-05, + "loss": 0.8284, "step": 2067 }, { - "epoch": 0.7551579331750958, - "grad_norm": 1.0026363134384155, - "learning_rate": 3.533317638600319e-05, - "loss": 0.9269, + "epoch": 0.35877862595419846, + "grad_norm": 0.8982375860214233, + "learning_rate": 3.5871639202081526e-05, + "loss": 0.7554, "step": 2068 }, { - "epoch": 0.7555230965857221, - "grad_norm": 1.0478194952011108, - "learning_rate": 3.532396205738839e-05, - "loss": 0.9096, + "epoch": 0.35895211658570436, + "grad_norm": 1.0266188383102417, + "learning_rate": 3.58889852558543e-05, + "loss": 0.7013, "step": 2069 }, { - "epoch": 0.7558882599963483, - "grad_norm": 0.994141161441803, - "learning_rate": 3.5314739845210027e-05, - "loss": 0.8871, + "epoch": 0.35912560721721026, + "grad_norm": 0.7805616855621338, + "learning_rate": 3.590633130962706e-05, + "loss": 0.7671, "step": 2070 }, { - "epoch": 0.7562534234069747, - "grad_norm": 1.2004483938217163, - "learning_rate": 3.530550975421255e-05, - "loss": 0.934, + "epoch": 0.35929909784871616, + "grad_norm": 2.07834792137146, + "learning_rate": 3.592367736339983e-05, + "loss": 0.8271, "step": 2071 }, { - "epoch": 0.7566185868176009, - "grad_norm": 1.0200645923614502, - "learning_rate": 3.529627178914448e-05, - "loss": 0.9569, + "epoch": 0.35947258848022207, + "grad_norm": 0.8038728833198547, + "learning_rate": 3.594102341717259e-05, + "loss": 0.8252, "step": 2072 }, { - "epoch": 0.7569837502282272, - "grad_norm": 1.0953030586242676, - "learning_rate": 3.5287025954758385e-05, - "loss": 0.9257, + "epoch": 0.35964607911172797, + "grad_norm": 0.9268532991409302, + "learning_rate": 3.5958369470945365e-05, + "loss": 0.7079, "step": 2073 }, { - "epoch": 0.7573489136388534, - "grad_norm": 1.2551528215408325, - "learning_rate": 3.5277772255810855e-05, - "loss": 0.8583, + "epoch": 0.35981956974323387, + "grad_norm": 1.049972414970398, + "learning_rate": 3.597571552471813e-05, + "loss": 0.7722, "step": 2074 }, { - "epoch": 0.7577140770494797, - "grad_norm": 0.9791481494903564, - "learning_rate": 3.526851069706256e-05, - "loss": 0.9346, + "epoch": 0.35999306037473977, + "grad_norm": 1.170380711555481, + "learning_rate": 3.5993061578490895e-05, + "loss": 0.6953, "step": 2075 }, { - "epoch": 0.7580792404601059, - "grad_norm": 0.9141026139259338, - "learning_rate": 3.5259241283278204e-05, - "loss": 0.9368, + "epoch": 0.36016655100624567, + "grad_norm": 0.9299865961074829, + "learning_rate": 3.601040763226366e-05, + "loss": 0.8159, "step": 2076 }, { - "epoch": 0.7584444038707322, - "grad_norm": 1.3128681182861328, - "learning_rate": 3.5249964019226514e-05, - "loss": 0.9519, + "epoch": 0.3603400416377516, + "grad_norm": 0.849678099155426, + "learning_rate": 3.602775368603643e-05, + "loss": 0.8606, "step": 2077 }, { - "epoch": 0.7588095672813584, - "grad_norm": 1.2170532941818237, - "learning_rate": 3.524067890968029e-05, - "loss": 0.9319, + "epoch": 0.3605135322692575, + "grad_norm": 0.8873193264007568, + "learning_rate": 3.60450997398092e-05, + "loss": 0.8154, "step": 2078 }, { - "epoch": 0.7591747306919847, - "grad_norm": 0.9500519037246704, - "learning_rate": 3.523138595941633e-05, - "loss": 0.9226, + "epoch": 0.3606870229007634, + "grad_norm": 0.9975008964538574, + "learning_rate": 3.606244579358196e-05, + "loss": 0.7954, "step": 2079 }, { - "epoch": 0.7595398941026109, - "grad_norm": 0.9866312742233276, - "learning_rate": 3.5222085173215495e-05, - "loss": 0.8983, + "epoch": 0.3608605135322693, + "grad_norm": 2.18876576423645, + "learning_rate": 3.607979184735473e-05, + "loss": 0.7239, "step": 2080 }, { - "epoch": 0.7599050575132372, - "grad_norm": 1.1628645658493042, - "learning_rate": 3.521277655586266e-05, - "loss": 0.9308, + "epoch": 0.3610340041637752, + "grad_norm": 1.281145453453064, + "learning_rate": 3.609713790112749e-05, + "loss": 0.7832, "step": 2081 }, { - "epoch": 0.7602702209238634, - "grad_norm": 1.4750910997390747, - "learning_rate": 3.520346011214674e-05, - "loss": 0.9161, + "epoch": 0.3612074947952811, + "grad_norm": 0.879976212978363, + "learning_rate": 3.6114483954900263e-05, + "loss": 0.8533, "step": 2082 }, { - "epoch": 0.7606353843344897, - "grad_norm": 1.0252578258514404, - "learning_rate": 3.519413584686067e-05, - "loss": 0.9109, + "epoch": 0.3613809854267869, + "grad_norm": 1.3822752237319946, + "learning_rate": 3.613183000867303e-05, + "loss": 0.665, "step": 2083 }, { - "epoch": 0.7610005477451159, - "grad_norm": 1.3238831758499146, - "learning_rate": 3.518480376480141e-05, - "loss": 0.9359, + "epoch": 0.36155447605829283, + "grad_norm": 1.199411392211914, + "learning_rate": 3.6149176062445793e-05, + "loss": 0.6807, "step": 2084 }, { - "epoch": 0.7613657111557421, - "grad_norm": 1.280437707901001, - "learning_rate": 3.5175463870769935e-05, - "loss": 0.9252, + "epoch": 0.36172796668979873, + "grad_norm": 1.0246819257736206, + "learning_rate": 3.6166522116218565e-05, + "loss": 0.7883, "step": 2085 }, { - "epoch": 0.7617308745663685, - "grad_norm": 0.9670564532279968, - "learning_rate": 3.516611616957125e-05, - "loss": 0.9381, + "epoch": 0.36190145732130463, + "grad_norm": 1.016158103942871, + "learning_rate": 3.618386816999133e-05, + "loss": 0.7705, "step": 2086 }, { - "epoch": 0.7620960379769947, - "grad_norm": 1.0356251001358032, - "learning_rate": 3.515676066601438e-05, - "loss": 0.9135, + "epoch": 0.36207494795281053, + "grad_norm": 1.2033147811889648, + "learning_rate": 3.6201214223764095e-05, + "loss": 0.7427, "step": 2087 }, { - "epoch": 0.762461201387621, - "grad_norm": 1.0064624547958374, - "learning_rate": 3.514739736491235e-05, - "loss": 0.8953, + "epoch": 0.36224843858431643, + "grad_norm": 1.095438003540039, + "learning_rate": 3.621856027753687e-05, + "loss": 0.7786, "step": 2088 }, { - "epoch": 0.7628263647982472, - "grad_norm": 0.8240634202957153, - "learning_rate": 3.51380262710822e-05, - "loss": 0.9088, + "epoch": 0.36242192921582234, + "grad_norm": 0.967070460319519, + "learning_rate": 3.623590633130963e-05, + "loss": 0.7585, "step": 2089 }, { - "epoch": 0.7631915282088735, - "grad_norm": 0.9105607271194458, - "learning_rate": 3.512864738934499e-05, - "loss": 0.9335, + "epoch": 0.36259541984732824, + "grad_norm": 0.8528753519058228, + "learning_rate": 3.62532523850824e-05, + "loss": 0.738, "step": 2090 }, { - "epoch": 0.7635566916194997, - "grad_norm": 1.1227301359176636, - "learning_rate": 3.511926072452578e-05, - "loss": 0.9573, + "epoch": 0.36276891047883414, + "grad_norm": 0.9499258995056152, + "learning_rate": 3.627059843885516e-05, + "loss": 0.7117, "step": 2091 }, { - "epoch": 0.763921855030126, - "grad_norm": 1.121342420578003, - "learning_rate": 3.5109866281453606e-05, - "loss": 0.8885, + "epoch": 0.36294240111034004, + "grad_norm": 0.843414306640625, + "learning_rate": 3.6287944492627934e-05, + "loss": 0.8225, "step": 2092 }, { - "epoch": 0.7642870184407522, - "grad_norm": 1.0823091268539429, - "learning_rate": 3.510046406496157e-05, - "loss": 0.8885, + "epoch": 0.36311589174184594, + "grad_norm": 2.2019407749176025, + "learning_rate": 3.63052905464007e-05, + "loss": 0.9292, "step": 2093 }, { - "epoch": 0.7646521818513785, - "grad_norm": 1.3159319162368774, - "learning_rate": 3.50910540798867e-05, - "loss": 0.8966, + "epoch": 0.36328938237335184, + "grad_norm": 0.8190827369689941, + "learning_rate": 3.6322636600173464e-05, + "loss": 0.9241, "step": 2094 }, { - "epoch": 0.7650173452620047, - "grad_norm": 1.4409334659576416, - "learning_rate": 3.508163633107008e-05, - "loss": 0.9423, + "epoch": 0.36346287300485774, + "grad_norm": 0.8127410411834717, + "learning_rate": 3.633998265394623e-05, + "loss": 0.9468, "step": 2095 }, { - "epoch": 0.765382508672631, - "grad_norm": 1.0065773725509644, - "learning_rate": 3.507221082335676e-05, - "loss": 0.9154, + "epoch": 0.36363636363636365, + "grad_norm": 0.975398600101471, + "learning_rate": 3.6357328707719e-05, + "loss": 0.8093, "step": 2096 }, { - "epoch": 0.7657476720832572, - "grad_norm": 1.155322551727295, - "learning_rate": 3.5062777561595776e-05, - "loss": 0.9486, + "epoch": 0.36380985426786955, + "grad_norm": 0.6739497184753418, + "learning_rate": 3.6374674761491766e-05, + "loss": 0.929, "step": 2097 }, { - "epoch": 0.7661128354938835, - "grad_norm": 1.4005717039108276, - "learning_rate": 3.505333655064017e-05, - "loss": 0.9274, + "epoch": 0.36398334489937545, + "grad_norm": 0.964533269405365, + "learning_rate": 3.639202081526453e-05, + "loss": 0.7024, "step": 2098 }, { - "epoch": 0.7664779989045097, - "grad_norm": 1.379913091659546, - "learning_rate": 3.5043887795346966e-05, - "loss": 0.9377, + "epoch": 0.36415683553088135, + "grad_norm": 1.01326322555542, + "learning_rate": 3.6409366869037296e-05, + "loss": 0.7822, "step": 2099 }, { - "epoch": 0.766843162315136, - "grad_norm": 1.1601260900497437, - "learning_rate": 3.503443130057715e-05, - "loss": 0.9296, + "epoch": 0.36433032616238725, + "grad_norm": 1.1553412675857544, + "learning_rate": 3.642671292281006e-05, + "loss": 0.7577, "step": 2100 }, { - "epoch": 0.7672083257257623, - "grad_norm": 1.5547581911087036, - "learning_rate": 3.5024967071195736e-05, - "loss": 0.9716, + "epoch": 0.36450381679389315, + "grad_norm": 2.323359727859497, + "learning_rate": 3.644405897658283e-05, + "loss": 0.7681, "step": 2101 }, { - "epoch": 0.7675734891363886, - "grad_norm": 1.062050700187683, - "learning_rate": 3.501549511207168e-05, - "loss": 0.9386, + "epoch": 0.36467730742539906, + "grad_norm": 0.7318731546401978, + "learning_rate": 3.64614050303556e-05, + "loss": 0.9431, "step": 2102 }, { - "epoch": 0.7679386525470148, - "grad_norm": 1.1471830606460571, - "learning_rate": 3.500601542807792e-05, - "loss": 0.9017, + "epoch": 0.3648507980569049, + "grad_norm": 0.7037378549575806, + "learning_rate": 3.647875108412836e-05, + "loss": 0.9695, "step": 2103 }, { - "epoch": 0.7683038159576411, - "grad_norm": 1.8942958116531372, - "learning_rate": 3.499652802409137e-05, - "loss": 0.898, + "epoch": 0.3650242886884108, + "grad_norm": 0.8044214844703674, + "learning_rate": 3.649609713790113e-05, + "loss": 0.9678, "step": 2104 }, { - "epoch": 0.7686689793682673, - "grad_norm": 0.8174071907997131, - "learning_rate": 3.4987032904992935e-05, - "loss": 0.9199, + "epoch": 0.3651977793199167, + "grad_norm": 1.0131793022155762, + "learning_rate": 3.65134431916739e-05, + "loss": 0.7468, "step": 2105 }, { - "epoch": 0.7690341427788936, - "grad_norm": 3.3359742164611816, - "learning_rate": 3.497753007566746e-05, - "loss": 0.9064, + "epoch": 0.3653712699514226, + "grad_norm": 1.057051658630371, + "learning_rate": 3.6530789245446664e-05, + "loss": 0.8552, "step": 2106 }, { - "epoch": 0.7693993061895198, - "grad_norm": 1.1310157775878906, - "learning_rate": 3.4968019541003765e-05, - "loss": 0.9146, + "epoch": 0.3655447605829285, + "grad_norm": 1.101645827293396, + "learning_rate": 3.654813529921943e-05, + "loss": 0.8892, "step": 2107 }, { - "epoch": 0.7697644696001461, - "grad_norm": 1.3152884244918823, - "learning_rate": 3.495850130589465e-05, - "loss": 0.9215, + "epoch": 0.3657182512144344, + "grad_norm": 0.7840586304664612, + "learning_rate": 3.6565481352992194e-05, + "loss": 0.8787, "step": 2108 }, { - "epoch": 0.7701296330107723, - "grad_norm": 0.8497095704078674, - "learning_rate": 3.494897537523686e-05, - "loss": 0.9266, + "epoch": 0.3658917418459403, + "grad_norm": 0.9244402647018433, + "learning_rate": 3.658282740676496e-05, + "loss": 0.844, "step": 2109 }, { - "epoch": 0.7704947964213986, - "grad_norm": 1.2291125059127808, - "learning_rate": 3.493944175393111e-05, - "loss": 0.8995, + "epoch": 0.3660652324774462, + "grad_norm": 0.826235294342041, + "learning_rate": 3.660017346053773e-05, + "loss": 0.6877, "step": 2110 }, { - "epoch": 0.7708599598320248, - "grad_norm": 1.1493576765060425, - "learning_rate": 3.492990044688205e-05, - "loss": 0.9135, + "epoch": 0.3662387231089521, + "grad_norm": 0.9598122835159302, + "learning_rate": 3.6617519514310496e-05, + "loss": 0.802, "step": 2111 }, { - "epoch": 0.7712251232426511, - "grad_norm": 1.111647605895996, - "learning_rate": 3.4920351458998316e-05, - "loss": 0.929, + "epoch": 0.366412213740458, + "grad_norm": 0.8815783262252808, + "learning_rate": 3.663486556808326e-05, + "loss": 0.7424, "step": 2112 }, { - "epoch": 0.7715902866532773, - "grad_norm": 1.1744599342346191, - "learning_rate": 3.491079479519248e-05, - "loss": 0.9369, + "epoch": 0.3665857043719639, + "grad_norm": 1.2261182069778442, + "learning_rate": 3.6652211621856026e-05, + "loss": 0.8403, "step": 2113 }, { - "epoch": 0.7719554500639036, - "grad_norm": 1.0402218103408813, - "learning_rate": 3.490123046038104e-05, - "loss": 0.9039, + "epoch": 0.3667591950034698, + "grad_norm": 1.377503752708435, + "learning_rate": 3.66695576756288e-05, + "loss": 0.6941, "step": 2114 }, { - "epoch": 0.7723206134745298, - "grad_norm": 1.1436413526535034, - "learning_rate": 3.489165845948448e-05, - "loss": 0.9175, + "epoch": 0.3669326856349757, + "grad_norm": 1.368821144104004, + "learning_rate": 3.668690372940157e-05, + "loss": 0.686, "step": 2115 }, { - "epoch": 0.7726857768851562, - "grad_norm": 1.3146084547042847, - "learning_rate": 3.488207879742722e-05, - "loss": 0.9242, + "epoch": 0.3671061762664816, + "grad_norm": 0.9995987415313721, + "learning_rate": 3.6704249783174335e-05, + "loss": 0.7014, "step": 2116 }, { - "epoch": 0.7730509402957824, - "grad_norm": 1.0402237176895142, - "learning_rate": 3.487249147913759e-05, - "loss": 0.8921, + "epoch": 0.3672796668979875, + "grad_norm": 0.9219637513160706, + "learning_rate": 3.67215958369471e-05, + "loss": 0.7019, "step": 2117 }, { - "epoch": 0.7734161037064086, - "grad_norm": 1.3639869689941406, - "learning_rate": 3.4862896509547886e-05, - "loss": 0.9208, + "epoch": 0.3674531575294934, + "grad_norm": 0.8058903813362122, + "learning_rate": 3.6738941890719865e-05, + "loss": 0.7661, "step": 2118 }, { - "epoch": 0.7737812671170349, - "grad_norm": 0.9357107877731323, - "learning_rate": 3.485329389359434e-05, - "loss": 0.9692, + "epoch": 0.3676266481609993, + "grad_norm": 1.2080923318862915, + "learning_rate": 3.675628794449263e-05, + "loss": 0.8533, "step": 2119 }, { - "epoch": 0.7741464305276611, - "grad_norm": 1.0580675601959229, - "learning_rate": 3.484368363621712e-05, - "loss": 0.927, + "epoch": 0.3678001387925052, + "grad_norm": 0.6671381592750549, + "learning_rate": 3.67736339982654e-05, + "loss": 0.8201, "step": 2120 }, { - "epoch": 0.7745115939382874, - "grad_norm": 2.1447317600250244, - "learning_rate": 3.48340657423603e-05, - "loss": 0.9611, + "epoch": 0.36797362942401113, + "grad_norm": 0.9427962303161621, + "learning_rate": 3.6790980052038166e-05, + "loss": 0.6362, "step": 2121 }, { - "epoch": 0.7748767573489136, - "grad_norm": 1.1341729164123535, - "learning_rate": 3.482444021697192e-05, - "loss": 0.9174, + "epoch": 0.36814712005551703, + "grad_norm": 0.8577224612236023, + "learning_rate": 3.680832610581093e-05, + "loss": 0.718, "step": 2122 }, { - "epoch": 0.7752419207595399, - "grad_norm": 1.1582592725753784, - "learning_rate": 3.481480706500391e-05, - "loss": 0.9183, + "epoch": 0.3683206106870229, + "grad_norm": 1.1259560585021973, + "learning_rate": 3.6825672159583696e-05, + "loss": 0.7793, "step": 2123 }, { - "epoch": 0.7756070841701661, - "grad_norm": 1.090794563293457, - "learning_rate": 3.480516629141214e-05, - "loss": 0.9493, + "epoch": 0.3684941013185288, + "grad_norm": 0.8137093186378479, + "learning_rate": 3.684301821335647e-05, + "loss": 0.7384, "step": 2124 }, { - "epoch": 0.7759722475807924, - "grad_norm": 1.2919611930847168, - "learning_rate": 3.479551790115642e-05, - "loss": 0.9089, + "epoch": 0.3686675919500347, + "grad_norm": 1.1045384407043457, + "learning_rate": 3.686036426712923e-05, + "loss": 0.7957, "step": 2125 }, { - "epoch": 0.7763374109914186, - "grad_norm": 1.2869877815246582, - "learning_rate": 3.4785861899200434e-05, - "loss": 0.923, + "epoch": 0.3688410825815406, + "grad_norm": 0.8655889630317688, + "learning_rate": 3.6877710320902e-05, + "loss": 0.9219, "step": 2126 }, { - "epoch": 0.7767025744020449, - "grad_norm": 1.1584030389785767, - "learning_rate": 3.477619829051183e-05, - "loss": 0.9248, + "epoch": 0.3690145732130465, + "grad_norm": 0.9727876782417297, + "learning_rate": 3.689505637467476e-05, + "loss": 0.937, "step": 2127 }, { - "epoch": 0.7770677378126711, - "grad_norm": 1.1653350591659546, - "learning_rate": 3.476652708006214e-05, - "loss": 0.9452, + "epoch": 0.3691880638445524, + "grad_norm": 0.793245255947113, + "learning_rate": 3.691240242844753e-05, + "loss": 0.8281, "step": 2128 }, { - "epoch": 0.7774329012232974, - "grad_norm": 0.9587676525115967, - "learning_rate": 3.4756848272826795e-05, - "loss": 0.913, + "epoch": 0.3693615544760583, + "grad_norm": 1.3345848321914673, + "learning_rate": 3.69297484822203e-05, + "loss": 0.8611, "step": 2129 }, { - "epoch": 0.7777980646339236, - "grad_norm": 1.6270071268081665, - "learning_rate": 3.474716187378518e-05, - "loss": 0.9064, + "epoch": 0.3695350451075642, + "grad_norm": 0.7692463994026184, + "learning_rate": 3.6947094535993065e-05, + "loss": 0.8801, "step": 2130 }, { - "epoch": 0.77816322804455, - "grad_norm": 1.276771903038025, - "learning_rate": 3.4737467887920556e-05, - "loss": 0.918, + "epoch": 0.3697085357390701, + "grad_norm": 1.0163977146148682, + "learning_rate": 3.696444058976583e-05, + "loss": 0.7156, "step": 2131 }, { - "epoch": 0.7785283914551762, - "grad_norm": 1.377997636795044, - "learning_rate": 3.4727766320220064e-05, - "loss": 0.8965, + "epoch": 0.369882026370576, + "grad_norm": 0.8741844296455383, + "learning_rate": 3.6981786643538595e-05, + "loss": 0.8071, "step": 2132 }, { - "epoch": 0.7788935548658025, - "grad_norm": 0.9750809073448181, - "learning_rate": 3.47180571756748e-05, - "loss": 0.9032, + "epoch": 0.3700555170020819, + "grad_norm": 0.9002806544303894, + "learning_rate": 3.699913269731137e-05, + "loss": 0.7661, "step": 2133 }, { - "epoch": 0.7792587182764287, - "grad_norm": 1.1636263132095337, - "learning_rate": 3.470834045927971e-05, - "loss": 0.9452, + "epoch": 0.3702290076335878, + "grad_norm": 1.122450590133667, + "learning_rate": 3.701647875108413e-05, + "loss": 0.7981, "step": 2134 }, { - "epoch": 0.779623881687055, - "grad_norm": 0.8865572810173035, - "learning_rate": 3.469861617603367e-05, - "loss": 0.931, + "epoch": 0.3704024982650937, + "grad_norm": 0.8047002553939819, + "learning_rate": 3.70338248048569e-05, + "loss": 0.8777, "step": 2135 }, { - "epoch": 0.7799890450976812, - "grad_norm": 1.3759772777557373, - "learning_rate": 3.468888433093943e-05, - "loss": 0.8922, + "epoch": 0.3705759888965996, + "grad_norm": 0.888335645198822, + "learning_rate": 3.705117085862966e-05, + "loss": 0.7588, "step": 2136 }, { - "epoch": 0.7803542085083075, - "grad_norm": 1.227399468421936, - "learning_rate": 3.4679144929003624e-05, - "loss": 0.9285, + "epoch": 0.3707494795281055, + "grad_norm": 2.0107083320617676, + "learning_rate": 3.7068516912402433e-05, + "loss": 0.821, "step": 2137 }, { - "epoch": 0.7807193719189337, - "grad_norm": 1.3621705770492554, - "learning_rate": 3.466939797523679e-05, - "loss": 0.9114, + "epoch": 0.3709229701596114, + "grad_norm": 0.7450129389762878, + "learning_rate": 3.70858629661752e-05, + "loss": 0.843, "step": 2138 }, { - "epoch": 0.78108453532956, - "grad_norm": 1.40164315700531, - "learning_rate": 3.465964347465334e-05, - "loss": 0.9104, + "epoch": 0.3710964607911173, + "grad_norm": 0.8212995529174805, + "learning_rate": 3.7103209019947963e-05, + "loss": 0.8313, "step": 2139 }, { - "epoch": 0.7814496987401862, - "grad_norm": 1.253547191619873, - "learning_rate": 3.464988143227158e-05, - "loss": 0.9523, + "epoch": 0.3712699514226232, + "grad_norm": 1.0979955196380615, + "learning_rate": 3.712055507372073e-05, + "loss": 0.6914, "step": 2140 }, { - "epoch": 0.7818148621508125, - "grad_norm": 1.1945288181304932, - "learning_rate": 3.4640111853113686e-05, - "loss": 0.8425, + "epoch": 0.3714434420541291, + "grad_norm": 1.2642909288406372, + "learning_rate": 3.7137901127493493e-05, + "loss": 0.8242, "step": 2141 }, { - "epoch": 0.7821800255614387, - "grad_norm": 1.8071683645248413, - "learning_rate": 3.463033474220572e-05, - "loss": 0.9384, + "epoch": 0.37161693268563495, + "grad_norm": 1.1901463270187378, + "learning_rate": 3.7155247181266265e-05, + "loss": 0.7805, "step": 2142 }, { - "epoch": 0.782545188972065, - "grad_norm": 1.025434970855713, - "learning_rate": 3.46205501045776e-05, - "loss": 0.8779, + "epoch": 0.37179042331714085, + "grad_norm": 1.1451181173324585, + "learning_rate": 3.717259323503903e-05, + "loss": 0.8679, "step": 2143 }, { - "epoch": 0.7829103523826912, - "grad_norm": 2.0920820236206055, - "learning_rate": 3.461075794526314e-05, - "loss": 0.9166, + "epoch": 0.37196391394864675, + "grad_norm": 0.980608344078064, + "learning_rate": 3.71899392888118e-05, + "loss": 0.8035, "step": 2144 }, { - "epoch": 0.7832755157933176, - "grad_norm": 0.9783653616905212, - "learning_rate": 3.460095826930001e-05, - "loss": 0.9363, + "epoch": 0.37213740458015265, + "grad_norm": 0.871525228023529, + "learning_rate": 3.720728534258457e-05, + "loss": 0.8499, "step": 2145 }, { - "epoch": 0.7836406792039438, - "grad_norm": 1.3614214658737183, - "learning_rate": 3.4591151081729756e-05, - "loss": 0.9214, + "epoch": 0.37231089521165855, + "grad_norm": 1.020147681236267, + "learning_rate": 3.722463139635733e-05, + "loss": 0.7012, "step": 2146 }, { - "epoch": 0.7840058426145701, - "grad_norm": 1.2778105735778809, - "learning_rate": 3.458133638759777e-05, - "loss": 0.937, + "epoch": 0.37248438584316446, + "grad_norm": 0.8300002813339233, + "learning_rate": 3.72419774501301e-05, + "loss": 0.7622, "step": 2147 }, { - "epoch": 0.7843710060251963, - "grad_norm": 1.093834400177002, - "learning_rate": 3.457151419195332e-05, - "loss": 0.9517, + "epoch": 0.37265787647467036, + "grad_norm": 1.280008316040039, + "learning_rate": 3.725932350390287e-05, + "loss": 0.8269, "step": 2148 }, { - "epoch": 0.7847361694358226, - "grad_norm": 1.426297664642334, - "learning_rate": 3.456168449984955e-05, - "loss": 0.8948, + "epoch": 0.37283136710617626, + "grad_norm": 1.1751400232315063, + "learning_rate": 3.7276669557675634e-05, + "loss": 0.7158, "step": 2149 }, { - "epoch": 0.7851013328464488, - "grad_norm": 0.9592084288597107, - "learning_rate": 3.4551847316343426e-05, - "loss": 0.9442, + "epoch": 0.37300485773768216, + "grad_norm": 1.1361275911331177, + "learning_rate": 3.72940156114484e-05, + "loss": 0.7363, "step": 2150 }, { - "epoch": 0.785466496257075, - "grad_norm": 1.395951747894287, - "learning_rate": 3.4542002646495784e-05, - "loss": 0.9622, + "epoch": 0.37317834836918806, + "grad_norm": 0.9622294902801514, + "learning_rate": 3.7311361665221164e-05, + "loss": 0.7037, "step": 2151 }, { - "epoch": 0.7858316596677013, - "grad_norm": 1.144709587097168, - "learning_rate": 3.453215049537131e-05, - "loss": 0.8937, + "epoch": 0.37335183900069396, + "grad_norm": 0.9768035411834717, + "learning_rate": 3.7328707718993936e-05, + "loss": 0.7341, "step": 2152 }, { - "epoch": 0.7861968230783275, - "grad_norm": 0.9091548919677734, - "learning_rate": 3.452229086803856e-05, - "loss": 0.8777, + "epoch": 0.37352532963219987, + "grad_norm": 0.8860333561897278, + "learning_rate": 3.73460537727667e-05, + "loss": 0.9163, "step": 2153 }, { - "epoch": 0.7865619864889538, - "grad_norm": 1.1414861679077148, - "learning_rate": 3.451242376956988e-05, - "loss": 0.9863, + "epoch": 0.37369882026370577, + "grad_norm": 0.8218538165092468, + "learning_rate": 3.7363399826539466e-05, + "loss": 0.877, "step": 2154 }, { - "epoch": 0.78692714989958, - "grad_norm": 1.1166890859603882, - "learning_rate": 3.4502549205041534e-05, - "loss": 0.9252, + "epoch": 0.37387231089521167, + "grad_norm": 1.3049238920211792, + "learning_rate": 3.738074588031223e-05, + "loss": 0.73, "step": 2155 }, { - "epoch": 0.7872923133102063, - "grad_norm": 0.8954434394836426, - "learning_rate": 3.449266717953357e-05, - "loss": 0.9163, + "epoch": 0.37404580152671757, + "grad_norm": 1.1257781982421875, + "learning_rate": 3.7398091934085e-05, + "loss": 0.7335, "step": 2156 }, { - "epoch": 0.7876574767208325, - "grad_norm": 1.025185465812683, - "learning_rate": 3.44827776981299e-05, - "loss": 0.9478, + "epoch": 0.37421929215822347, + "grad_norm": 0.9626754522323608, + "learning_rate": 3.741543798785777e-05, + "loss": 0.8699, "step": 2157 }, { - "epoch": 0.7880226401314588, - "grad_norm": 1.2928316593170166, - "learning_rate": 3.447288076591825e-05, - "loss": 0.9536, + "epoch": 0.3743927827897294, + "grad_norm": 0.7499566078186035, + "learning_rate": 3.743278404163053e-05, + "loss": 0.906, "step": 2158 }, { - "epoch": 0.788387803542085, - "grad_norm": 1.3412227630615234, - "learning_rate": 3.446297638799022e-05, - "loss": 0.9388, + "epoch": 0.3745662734212353, + "grad_norm": 0.9744541049003601, + "learning_rate": 3.74501300954033e-05, + "loss": 0.8706, "step": 2159 }, { - "epoch": 0.7887529669527114, - "grad_norm": 1.1929829120635986, - "learning_rate": 3.445306456944119e-05, - "loss": 0.9207, + "epoch": 0.3747397640527412, + "grad_norm": 1.0549876689910889, + "learning_rate": 3.746747614917606e-05, + "loss": 0.6836, "step": 2160 }, { - "epoch": 0.7891181303633376, - "grad_norm": 0.8803319931030273, - "learning_rate": 3.444314531537041e-05, - "loss": 0.9246, + "epoch": 0.3749132546842471, + "grad_norm": 0.9324131011962891, + "learning_rate": 3.7484822202948834e-05, + "loss": 0.8215, "step": 2161 }, { - "epoch": 0.7894832937739639, - "grad_norm": 1.2273976802825928, - "learning_rate": 3.443321863088093e-05, - "loss": 0.9221, + "epoch": 0.3750867453157529, + "grad_norm": 1.025583028793335, + "learning_rate": 3.75021682567216e-05, + "loss": 0.7327, "step": 2162 }, { - "epoch": 0.7898484571845901, - "grad_norm": 0.9959760308265686, - "learning_rate": 3.4423284521079635e-05, - "loss": 0.9304, + "epoch": 0.3752602359472588, + "grad_norm": 0.973997175693512, + "learning_rate": 3.7519514310494364e-05, + "loss": 0.7351, "step": 2163 }, { - "epoch": 0.7902136205952164, - "grad_norm": 1.1141496896743774, - "learning_rate": 3.441334299107722e-05, - "loss": 0.9194, + "epoch": 0.3754337265787647, + "grad_norm": 0.9087184071540833, + "learning_rate": 3.753686036426713e-05, + "loss": 0.8307, "step": 2164 }, { - "epoch": 0.7905787840058426, - "grad_norm": 1.0833743810653687, - "learning_rate": 3.440339404598822e-05, - "loss": 0.8958, + "epoch": 0.3756072172102706, + "grad_norm": 0.793548583984375, + "learning_rate": 3.75542064180399e-05, + "loss": 0.7189, "step": 2165 }, { - "epoch": 0.7909439474164689, - "grad_norm": 1.1847440004348755, - "learning_rate": 3.4393437690930944e-05, - "loss": 0.9558, + "epoch": 0.37578070784177653, + "grad_norm": 0.9981021881103516, + "learning_rate": 3.7571552471812666e-05, + "loss": 1.019, "step": 2166 }, { - "epoch": 0.7913091108270951, - "grad_norm": 1.0982320308685303, - "learning_rate": 3.438347393102755e-05, - "loss": 0.8785, + "epoch": 0.37595419847328243, + "grad_norm": 0.86371248960495, + "learning_rate": 3.758889852558543e-05, + "loss": 0.8596, "step": 2167 }, { - "epoch": 0.7916742742377214, - "grad_norm": 0.9179049134254456, - "learning_rate": 3.4373502771403995e-05, - "loss": 0.9125, + "epoch": 0.37612768910478833, + "grad_norm": 1.2592934370040894, + "learning_rate": 3.7606244579358196e-05, + "loss": 0.647, "step": 2168 }, { - "epoch": 0.7920394376483476, - "grad_norm": 1.1268888711929321, - "learning_rate": 3.436352421719004e-05, - "loss": 0.9384, + "epoch": 0.37630117973629423, + "grad_norm": 1.079402208328247, + "learning_rate": 3.762359063313096e-05, + "loss": 0.7546, "step": 2169 }, { - "epoch": 0.7924046010589739, - "grad_norm": 0.8618208765983582, - "learning_rate": 3.4353538273519244e-05, - "loss": 0.9258, + "epoch": 0.37647467036780013, + "grad_norm": 0.7089620232582092, + "learning_rate": 3.764093668690373e-05, + "loss": 0.9497, "step": 2170 }, { - "epoch": 0.7927697644696001, - "grad_norm": 1.3194196224212646, - "learning_rate": 3.4343544945528975e-05, - "loss": 0.924, + "epoch": 0.37664816099930604, + "grad_norm": 1.072975516319275, + "learning_rate": 3.76582827406765e-05, + "loss": 0.7153, "step": 2171 }, { - "epoch": 0.7931349278802264, - "grad_norm": 0.9566547274589539, - "learning_rate": 3.4333544238360404e-05, - "loss": 0.8871, + "epoch": 0.37682165163081194, + "grad_norm": 0.9028416275978088, + "learning_rate": 3.767562879444926e-05, + "loss": 0.7758, "step": 2172 }, { - "epoch": 0.7935000912908526, - "grad_norm": 0.9387026429176331, - "learning_rate": 3.432353615715849e-05, - "loss": 0.927, + "epoch": 0.37699514226231784, + "grad_norm": 0.806329607963562, + "learning_rate": 3.7692974848222034e-05, + "loss": 0.8262, "step": 2173 }, { - "epoch": 0.793865254701479, - "grad_norm": 0.9449287056922913, - "learning_rate": 3.431352070707199e-05, - "loss": 0.928, + "epoch": 0.37716863289382374, + "grad_norm": 0.9252581000328064, + "learning_rate": 3.77103209019948e-05, + "loss": 0.8293, "step": 2174 }, { - "epoch": 0.7942304181121052, - "grad_norm": 1.2963557243347168, - "learning_rate": 3.430349789325346e-05, - "loss": 0.925, + "epoch": 0.37734212352532964, + "grad_norm": 1.1630088090896606, + "learning_rate": 3.772766695576757e-05, + "loss": 0.8167, "step": 2175 }, { - "epoch": 0.7945955815227315, - "grad_norm": 0.9801070094108582, - "learning_rate": 3.429346772085923e-05, - "loss": 0.9036, + "epoch": 0.37751561415683554, + "grad_norm": 1.166528344154358, + "learning_rate": 3.7745013009540336e-05, + "loss": 0.8416, "step": 2176 }, { - "epoch": 0.7949607449333577, - "grad_norm": 1.091010332107544, - "learning_rate": 3.42834301950494e-05, - "loss": 0.9359, + "epoch": 0.37768910478834145, + "grad_norm": 0.8716579079627991, + "learning_rate": 3.77623590633131e-05, + "loss": 0.7281, "step": 2177 }, { - "epoch": 0.795325908343984, - "grad_norm": 1.012907862663269, - "learning_rate": 3.427338532098791e-05, - "loss": 0.8832, + "epoch": 0.37786259541984735, + "grad_norm": 0.8718217015266418, + "learning_rate": 3.7779705117085866e-05, + "loss": 0.8518, "step": 2178 }, { - "epoch": 0.7956910717546102, - "grad_norm": 1.4174619913101196, - "learning_rate": 3.4263333103842415e-05, - "loss": 0.9298, + "epoch": 0.37803608605135325, + "grad_norm": 0.9101107716560364, + "learning_rate": 3.779705117085863e-05, + "loss": 0.8098, "step": 2179 }, { - "epoch": 0.7960562351652365, - "grad_norm": 1.4473166465759277, - "learning_rate": 3.42532735487844e-05, - "loss": 0.9043, + "epoch": 0.37820957668285915, + "grad_norm": 0.94647616147995, + "learning_rate": 3.78143972246314e-05, + "loss": 0.8306, "step": 2180 }, { - "epoch": 0.7964213985758627, - "grad_norm": 0.990759015083313, - "learning_rate": 3.424320666098909e-05, - "loss": 0.9288, + "epoch": 0.37838306731436505, + "grad_norm": 0.9389774203300476, + "learning_rate": 3.783174327840417e-05, + "loss": 0.8877, "step": 2181 }, { - "epoch": 0.796786561986489, - "grad_norm": 0.9119299650192261, - "learning_rate": 3.4233132445635496e-05, - "loss": 0.8685, + "epoch": 0.3785565579458709, + "grad_norm": 1.284490704536438, + "learning_rate": 3.784908933217693e-05, + "loss": 0.8162, "step": 2182 }, { - "epoch": 0.7971517253971152, - "grad_norm": 1.014418363571167, - "learning_rate": 3.42230509079064e-05, - "loss": 0.9406, + "epoch": 0.3787300485773768, + "grad_norm": 1.91201651096344, + "learning_rate": 3.78664353859497e-05, + "loss": 0.7207, "step": 2183 }, { - "epoch": 0.7975168888077415, - "grad_norm": 1.2333072423934937, - "learning_rate": 3.421296205298835e-05, - "loss": 0.9077, + "epoch": 0.3789035392088827, + "grad_norm": 0.8811169862747192, + "learning_rate": 3.788378143972247e-05, + "loss": 0.9072, "step": 2184 }, { - "epoch": 0.7978820522183677, - "grad_norm": 1.3653199672698975, - "learning_rate": 3.420286588607165e-05, - "loss": 0.9099, + "epoch": 0.3790770298403886, + "grad_norm": 0.9257833957672119, + "learning_rate": 3.7901127493495235e-05, + "loss": 0.8422, "step": 2185 }, { - "epoch": 0.7982472156289939, - "grad_norm": 1.1463234424591064, - "learning_rate": 3.4192762412350375e-05, - "loss": 0.9149, + "epoch": 0.3792505204718945, + "grad_norm": 0.9536060094833374, + "learning_rate": 3.7918473547268e-05, + "loss": 0.8518, "step": 2186 }, { - "epoch": 0.7986123790396202, - "grad_norm": 1.263271689414978, - "learning_rate": 3.418265163702236e-05, - "loss": 0.9196, + "epoch": 0.3794240111034004, + "grad_norm": 0.9797681570053101, + "learning_rate": 3.7935819601040765e-05, + "loss": 0.9075, "step": 2187 }, { - "epoch": 0.7989775424502464, - "grad_norm": 1.3002774715423584, - "learning_rate": 3.41725335652892e-05, - "loss": 0.9316, + "epoch": 0.3795975017349063, + "grad_norm": 0.9083612561225891, + "learning_rate": 3.795316565481353e-05, + "loss": 0.7727, "step": 2188 }, { - "epoch": 0.7993427058608727, - "grad_norm": 2.905634641647339, - "learning_rate": 3.4162408202356224e-05, - "loss": 0.9307, + "epoch": 0.3797709923664122, + "grad_norm": 1.1681925058364868, + "learning_rate": 3.79705117085863e-05, + "loss": 0.7727, "step": 2189 }, { - "epoch": 0.799707869271499, - "grad_norm": 1.0616352558135986, - "learning_rate": 3.4152275553432524e-05, - "loss": 0.8842, + "epoch": 0.3799444829979181, + "grad_norm": 0.9059127569198608, + "learning_rate": 3.7987857762359067e-05, + "loss": 0.8289, "step": 2190 }, { - "epoch": 0.8000730326821253, - "grad_norm": 1.1164469718933105, - "learning_rate": 3.4142135623730954e-05, - "loss": 0.9083, + "epoch": 0.380117973629424, + "grad_norm": 1.0577943325042725, + "learning_rate": 3.800520381613183e-05, + "loss": 0.6776, "step": 2191 }, { - "epoch": 0.8004381960927515, - "grad_norm": 0.880877673625946, - "learning_rate": 3.413198841846809e-05, - "loss": 0.9116, + "epoch": 0.3802914642609299, + "grad_norm": 1.1945301294326782, + "learning_rate": 3.8022549869904597e-05, + "loss": 0.6816, "step": 2192 }, { - "epoch": 0.8008033595033778, - "grad_norm": 0.8510430455207825, - "learning_rate": 3.412183394286427e-05, - "loss": 0.8776, + "epoch": 0.3804649548924358, + "grad_norm": 0.821591854095459, + "learning_rate": 3.803989592367737e-05, + "loss": 0.8569, "step": 2193 }, { - "epoch": 0.801168522914004, - "grad_norm": 1.2370086908340454, - "learning_rate": 3.411167220214356e-05, - "loss": 0.954, + "epoch": 0.3806384455239417, + "grad_norm": 0.6789693832397461, + "learning_rate": 3.805724197745013e-05, + "loss": 0.9163, "step": 2194 }, { - "epoch": 0.8015336863246303, - "grad_norm": 0.8150460720062256, - "learning_rate": 3.410150320153377e-05, - "loss": 0.8679, + "epoch": 0.3808119361554476, + "grad_norm": 0.9603880643844604, + "learning_rate": 3.80745880312229e-05, + "loss": 0.8325, "step": 2195 }, { - "epoch": 0.8018988497352565, - "grad_norm": 1.0506614446640015, - "learning_rate": 3.409132694626643e-05, - "loss": 0.9196, + "epoch": 0.3809854267869535, + "grad_norm": 0.7775828838348389, + "learning_rate": 3.809193408499566e-05, + "loss": 0.7273, "step": 2196 }, { - "epoch": 0.8022640131458828, - "grad_norm": 0.8772296905517578, - "learning_rate": 3.408114344157684e-05, - "loss": 0.9334, + "epoch": 0.3811589174184594, + "grad_norm": 0.9606022238731384, + "learning_rate": 3.8109280138768435e-05, + "loss": 0.7976, "step": 2197 }, { - "epoch": 0.802629176556509, - "grad_norm": 1.3481740951538086, - "learning_rate": 3.407095269270398e-05, - "loss": 0.9017, + "epoch": 0.3813324080499653, + "grad_norm": 2.0619938373565674, + "learning_rate": 3.81266261925412e-05, + "loss": 0.8887, "step": 2198 }, { - "epoch": 0.8029943399671353, - "grad_norm": 1.1616783142089844, - "learning_rate": 3.40607547048906e-05, - "loss": 0.9292, + "epoch": 0.3815058986814712, + "grad_norm": 0.979112982749939, + "learning_rate": 3.8143972246313965e-05, + "loss": 0.791, "step": 2199 }, { - "epoch": 0.8033595033777615, - "grad_norm": 0.9066925048828125, - "learning_rate": 3.405054948338314e-05, - "loss": 0.8961, + "epoch": 0.3816793893129771, + "grad_norm": 0.8349247574806213, + "learning_rate": 3.816131830008673e-05, + "loss": 0.9336, "step": 2200 }, { - "epoch": 0.8037246667883878, - "grad_norm": 1.0977064371109009, - "learning_rate": 3.404033703343179e-05, - "loss": 0.9324, + "epoch": 0.38185287994448297, + "grad_norm": 0.7552117109298706, + "learning_rate": 3.8178664353859495e-05, + "loss": 0.7842, "step": 2201 }, { - "epoch": 0.804089830199014, - "grad_norm": 1.0586031675338745, - "learning_rate": 3.4030117360290436e-05, - "loss": 0.9397, + "epoch": 0.38202637057598887, + "grad_norm": 1.014573574066162, + "learning_rate": 3.819601040763227e-05, + "loss": 0.7529, "step": 2202 }, { - "epoch": 0.8044549936096403, - "grad_norm": 1.364642858505249, - "learning_rate": 3.40198904692167e-05, - "loss": 0.9286, + "epoch": 0.3821998612074948, + "grad_norm": 1.5275585651397705, + "learning_rate": 3.821335646140504e-05, + "loss": 0.7424, "step": 2203 }, { - "epoch": 0.8048201570202665, - "grad_norm": 1.27865731716156, - "learning_rate": 3.4009656365471895e-05, - "loss": 0.9414, + "epoch": 0.3823733518390007, + "grad_norm": 0.7810157537460327, + "learning_rate": 3.8230702515177804e-05, + "loss": 0.8677, "step": 2204 }, { - "epoch": 0.8051853204308929, - "grad_norm": 1.1917954683303833, - "learning_rate": 3.399941505432106e-05, - "loss": 0.9397, + "epoch": 0.3825468424705066, + "grad_norm": 1.125082015991211, + "learning_rate": 3.824804856895057e-05, + "loss": 0.6708, "step": 2205 }, { - "epoch": 0.8055504838415191, - "grad_norm": 1.1749337911605835, - "learning_rate": 3.398916654103294e-05, - "loss": 0.9041, + "epoch": 0.3827203331020125, + "grad_norm": 0.9721659421920776, + "learning_rate": 3.8265394622723334e-05, + "loss": 0.7134, "step": 2206 }, { - "epoch": 0.8059156472521454, - "grad_norm": 0.9851516485214233, - "learning_rate": 3.397891083088e-05, - "loss": 0.9222, + "epoch": 0.3828938237335184, + "grad_norm": 0.946829080581665, + "learning_rate": 3.82827406764961e-05, + "loss": 0.7585, "step": 2207 }, { - "epoch": 0.8062808106627716, - "grad_norm": 0.9759044647216797, - "learning_rate": 3.396864792913836e-05, - "loss": 0.8959, + "epoch": 0.3830673143650243, + "grad_norm": 1.0976260900497437, + "learning_rate": 3.830008673026887e-05, + "loss": 0.847, "step": 2208 }, { - "epoch": 0.8066459740733979, - "grad_norm": 1.389238715171814, - "learning_rate": 3.3958377841087894e-05, - "loss": 0.9147, + "epoch": 0.3832408049965302, + "grad_norm": 1.6471534967422485, + "learning_rate": 3.8317432784041636e-05, + "loss": 0.7922, "step": 2209 }, { - "epoch": 0.8070111374840241, - "grad_norm": 0.9051432609558105, - "learning_rate": 3.3948100572012145e-05, - "loss": 0.8951, + "epoch": 0.3834142956280361, + "grad_norm": 1.0063518285751343, + "learning_rate": 3.83347788378144e-05, + "loss": 0.6904, "step": 2210 }, { - "epoch": 0.8073763008946504, - "grad_norm": 1.1388543844223022, - "learning_rate": 3.393781612719835e-05, - "loss": 0.8746, + "epoch": 0.383587786259542, + "grad_norm": 1.549196720123291, + "learning_rate": 3.8352124891587166e-05, + "loss": 0.7261, "step": 2211 }, { - "epoch": 0.8077414643052766, - "grad_norm": 1.528680682182312, - "learning_rate": 3.3927524511937446e-05, - "loss": 0.8669, + "epoch": 0.3837612768910479, + "grad_norm": 0.9521839618682861, + "learning_rate": 3.836947094535994e-05, + "loss": 0.7026, "step": 2212 }, { - "epoch": 0.8081066277159029, - "grad_norm": 0.9611297249794006, - "learning_rate": 3.391722573152406e-05, - "loss": 0.953, + "epoch": 0.3839347675225538, + "grad_norm": 1.012204885482788, + "learning_rate": 3.83868169991327e-05, + "loss": 0.7881, "step": 2213 }, { - "epoch": 0.8084717911265291, - "grad_norm": 0.8487144112586975, - "learning_rate": 3.39069197912565e-05, - "loss": 0.9067, + "epoch": 0.3841082581540597, + "grad_norm": 0.8614507913589478, + "learning_rate": 3.840416305290547e-05, + "loss": 0.7269, "step": 2214 }, { - "epoch": 0.8088369545371554, - "grad_norm": 1.3437159061431885, - "learning_rate": 3.389660669643676e-05, - "loss": 0.9486, + "epoch": 0.3842817487855656, + "grad_norm": 1.3161779642105103, + "learning_rate": 3.842150910667823e-05, + "loss": 0.7373, "step": 2215 }, { - "epoch": 0.8092021179477816, - "grad_norm": 1.4769995212554932, - "learning_rate": 3.3886286452370505e-05, - "loss": 0.9318, + "epoch": 0.3844552394170715, + "grad_norm": 1.0459922552108765, + "learning_rate": 3.8438855160451004e-05, + "loss": 0.8916, "step": 2216 }, { - "epoch": 0.8095672813584079, - "grad_norm": 1.075202226638794, - "learning_rate": 3.387595906436709e-05, - "loss": 0.9098, + "epoch": 0.3846287300485774, + "grad_norm": 0.8897533416748047, + "learning_rate": 3.845620121422377e-05, + "loss": 0.7947, "step": 2217 }, { - "epoch": 0.8099324447690341, - "grad_norm": 1.2898781299591064, - "learning_rate": 3.386562453773955e-05, - "loss": 0.9307, + "epoch": 0.3848022206800833, + "grad_norm": 1.4636198282241821, + "learning_rate": 3.8473547267996534e-05, + "loss": 0.7495, "step": 2218 }, { - "epoch": 0.8102976081796603, - "grad_norm": 1.2500375509262085, - "learning_rate": 3.3855282877804575e-05, - "loss": 0.9578, + "epoch": 0.3849757113115892, + "grad_norm": 0.9355475902557373, + "learning_rate": 3.84908933217693e-05, + "loss": 0.7661, "step": 2219 }, { - "epoch": 0.8106627715902867, - "grad_norm": 1.2944809198379517, - "learning_rate": 3.384493408988254e-05, - "loss": 0.9474, + "epoch": 0.3851492019430951, + "grad_norm": 0.7645488381385803, + "learning_rate": 3.8508239375542064e-05, + "loss": 0.8862, "step": 2220 }, { - "epoch": 0.8110279350009129, - "grad_norm": 1.431190013885498, - "learning_rate": 3.3834578179297484e-05, - "loss": 0.9401, + "epoch": 0.38532269257460094, + "grad_norm": 1.1526408195495605, + "learning_rate": 3.8525585429314836e-05, + "loss": 0.813, "step": 2221 }, { - "epoch": 0.8113930984115392, - "grad_norm": 0.8970507383346558, - "learning_rate": 3.3824215151377095e-05, - "loss": 0.8749, + "epoch": 0.38549618320610685, + "grad_norm": 0.8962175846099854, + "learning_rate": 3.85429314830876e-05, + "loss": 0.8108, "step": 2222 }, { - "epoch": 0.8117582618221654, - "grad_norm": 1.0138885974884033, - "learning_rate": 3.381384501145274e-05, - "loss": 0.9158, + "epoch": 0.38566967383761275, + "grad_norm": 0.8569124341011047, + "learning_rate": 3.8560277536860366e-05, + "loss": 0.9233, "step": 2223 }, { - "epoch": 0.8121234252327917, - "grad_norm": 0.8881771564483643, - "learning_rate": 3.380346776485944e-05, - "loss": 0.8951, + "epoch": 0.38584316446911865, + "grad_norm": 0.839561402797699, + "learning_rate": 3.857762359063313e-05, + "loss": 0.8281, "step": 2224 }, { - "epoch": 0.8124885886434179, - "grad_norm": 1.248698353767395, - "learning_rate": 3.379308341693588e-05, - "loss": 0.892, + "epoch": 0.38601665510062455, + "grad_norm": 0.79317307472229, + "learning_rate": 3.85949696444059e-05, + "loss": 0.7285, "step": 2225 }, { - "epoch": 0.8128537520540442, - "grad_norm": 1.2730597257614136, - "learning_rate": 3.378269197302438e-05, - "loss": 0.9305, + "epoch": 0.38619014573213045, + "grad_norm": 1.031294345855713, + "learning_rate": 3.861231569817867e-05, + "loss": 0.6796, "step": 2226 }, { - "epoch": 0.8132189154646704, - "grad_norm": 1.2280094623565674, - "learning_rate": 3.3772293438470924e-05, - "loss": 0.8892, + "epoch": 0.38636363636363635, + "grad_norm": 0.900271475315094, + "learning_rate": 3.862966175195143e-05, + "loss": 0.7866, "step": 2227 }, { - "epoch": 0.8135840788752967, - "grad_norm": 0.9839041829109192, - "learning_rate": 3.376188781862515e-05, - "loss": 0.9062, + "epoch": 0.38653712699514226, + "grad_norm": 0.78937166929245, + "learning_rate": 3.86470078057242e-05, + "loss": 0.8403, "step": 2228 }, { - "epoch": 0.8139492422859229, - "grad_norm": 1.1843029260635376, - "learning_rate": 3.375147511884032e-05, - "loss": 0.9562, + "epoch": 0.38671061762664816, + "grad_norm": 1.252397060394287, + "learning_rate": 3.866435385949696e-05, + "loss": 0.7996, "step": 2229 }, { - "epoch": 0.8143144056965492, - "grad_norm": 0.8962262272834778, - "learning_rate": 3.374105534447334e-05, - "loss": 0.8774, + "epoch": 0.38688410825815406, + "grad_norm": 0.7428364157676697, + "learning_rate": 3.8681699913269734e-05, + "loss": 0.8987, "step": 2230 }, { - "epoch": 0.8146795691071754, - "grad_norm": 1.342620611190796, - "learning_rate": 3.37306285008848e-05, - "loss": 0.9097, + "epoch": 0.38705759888965996, + "grad_norm": 2.4350712299346924, + "learning_rate": 3.86990459670425e-05, + "loss": 0.804, "step": 2231 }, { - "epoch": 0.8150447325178017, - "grad_norm": 1.4475151300430298, - "learning_rate": 3.372019459343886e-05, - "loss": 0.8914, + "epoch": 0.38723108952116586, + "grad_norm": 1.0472556352615356, + "learning_rate": 3.8716392020815264e-05, + "loss": 0.7468, "step": 2232 }, { - "epoch": 0.8154098959284279, - "grad_norm": 1.1764025688171387, - "learning_rate": 3.370975362750335e-05, - "loss": 0.9326, + "epoch": 0.38740458015267176, + "grad_norm": 1.4252434968948364, + "learning_rate": 3.8733738074588036e-05, + "loss": 0.8259, "step": 2233 }, { - "epoch": 0.8157750593390543, - "grad_norm": 1.2965624332427979, - "learning_rate": 3.369930560844975e-05, - "loss": 0.9066, + "epoch": 0.38757807078417766, + "grad_norm": 0.8465484380722046, + "learning_rate": 3.87510841283608e-05, + "loss": 0.8002, "step": 2234 }, { - "epoch": 0.8161402227496805, - "grad_norm": 0.8750057220458984, - "learning_rate": 3.368885054165314e-05, - "loss": 0.9084, + "epoch": 0.38775156141568357, + "grad_norm": 1.0581845045089722, + "learning_rate": 3.8768430182133566e-05, + "loss": 0.8656, "step": 2235 }, { - "epoch": 0.8165053861603068, - "grad_norm": 1.1192911863327026, - "learning_rate": 3.367838843249222e-05, - "loss": 0.9066, + "epoch": 0.38792505204718947, + "grad_norm": 0.8487623333930969, + "learning_rate": 3.878577623590634e-05, + "loss": 0.9346, "step": 2236 }, { - "epoch": 0.816870549570933, - "grad_norm": 0.8392890095710754, - "learning_rate": 3.366791928634932e-05, - "loss": 0.8956, + "epoch": 0.38809854267869537, + "grad_norm": 0.9689618349075317, + "learning_rate": 3.88031222896791e-05, + "loss": 0.7324, "step": 2237 }, { - "epoch": 0.8172357129815593, - "grad_norm": 1.099877953529358, - "learning_rate": 3.365744310861041e-05, - "loss": 0.944, + "epoch": 0.38827203331020127, + "grad_norm": 0.8215360045433044, + "learning_rate": 3.882046834345187e-05, + "loss": 0.8203, "step": 2238 }, { - "epoch": 0.8176008763921855, - "grad_norm": 1.1367907524108887, - "learning_rate": 3.364695990466507e-05, - "loss": 0.9156, + "epoch": 0.38844552394170717, + "grad_norm": 0.996002733707428, + "learning_rate": 3.883781439722463e-05, + "loss": 0.8345, "step": 2239 }, { - "epoch": 0.8179660398028118, - "grad_norm": 1.2760621309280396, - "learning_rate": 3.363646967990647e-05, - "loss": 0.8954, + "epoch": 0.3886190145732131, + "grad_norm": 1.2828646898269653, + "learning_rate": 3.8855160450997405e-05, + "loss": 0.7747, "step": 2240 }, { - "epoch": 0.818331203213438, - "grad_norm": 1.3708332777023315, - "learning_rate": 3.3625972439731425e-05, - "loss": 0.95, + "epoch": 0.3887925052047189, + "grad_norm": 1.0353268384933472, + "learning_rate": 3.887250650477017e-05, + "loss": 0.7942, "step": 2241 }, { - "epoch": 0.8186963666240643, - "grad_norm": 1.6421089172363281, - "learning_rate": 3.361546818954033e-05, - "loss": 0.8596, + "epoch": 0.3889659958362248, + "grad_norm": 1.1124967336654663, + "learning_rate": 3.8889852558542935e-05, + "loss": 0.7166, "step": 2242 }, { - "epoch": 0.8190615300346905, - "grad_norm": 1.204816222190857, - "learning_rate": 3.3604956934737206e-05, - "loss": 0.8951, + "epoch": 0.3891394864677307, + "grad_norm": 0.8714799880981445, + "learning_rate": 3.89071986123157e-05, + "loss": 0.9058, "step": 2243 }, { - "epoch": 0.8194266934453168, - "grad_norm": 1.361356496810913, - "learning_rate": 3.359443868072967e-05, - "loss": 0.8779, + "epoch": 0.3893129770992366, + "grad_norm": 0.9119824767112732, + "learning_rate": 3.892454466608847e-05, + "loss": 0.7227, "step": 2244 }, { - "epoch": 0.819791856855943, - "grad_norm": 1.2309714555740356, - "learning_rate": 3.3583913432928945e-05, - "loss": 0.9005, + "epoch": 0.3894864677307425, + "grad_norm": 2.0248124599456787, + "learning_rate": 3.8941890719861237e-05, + "loss": 0.8035, "step": 2245 }, { - "epoch": 0.8201570202665693, - "grad_norm": 1.4007517099380493, - "learning_rate": 3.357338119674985e-05, - "loss": 0.906, + "epoch": 0.3896599583622484, + "grad_norm": 0.8775182366371155, + "learning_rate": 3.8959236773634e-05, + "loss": 0.7024, "step": 2246 }, { - "epoch": 0.8205221836771955, - "grad_norm": 0.8810081481933594, - "learning_rate": 3.3562841977610796e-05, - "loss": 0.8905, + "epoch": 0.38983344899375433, + "grad_norm": 0.9370778203010559, + "learning_rate": 3.8976582827406767e-05, + "loss": 0.7188, "step": 2247 }, { - "epoch": 0.8208873470878219, - "grad_norm": 0.8359472751617432, - "learning_rate": 3.355229578093378e-05, - "loss": 0.9095, + "epoch": 0.39000693962526023, + "grad_norm": 0.8898071646690369, + "learning_rate": 3.899392888117953e-05, + "loss": 0.791, "step": 2248 }, { - "epoch": 0.821252510498448, - "grad_norm": 1.2127701044082642, - "learning_rate": 3.354174261214441e-05, - "loss": 0.8986, + "epoch": 0.39018043025676613, + "grad_norm": 1.156437635421753, + "learning_rate": 3.90112749349523e-05, + "loss": 0.8635, "step": 2249 }, { - "epoch": 0.8216176739090744, - "grad_norm": 1.2031699419021606, - "learning_rate": 3.353118247667186e-05, - "loss": 0.9117, + "epoch": 0.39035392088827203, + "grad_norm": 1.0303646326065063, + "learning_rate": 3.902862098872507e-05, + "loss": 0.8018, "step": 2250 }, { - "epoch": 0.8219828373197006, - "grad_norm": 1.0527660846710205, - "learning_rate": 3.35206153799489e-05, - "loss": 0.897, + "epoch": 0.39052741151977793, + "grad_norm": 1.2971370220184326, + "learning_rate": 3.904596704249783e-05, + "loss": 0.7517, "step": 2251 }, { - "epoch": 0.8223480007303268, - "grad_norm": 1.2384157180786133, - "learning_rate": 3.351004132741188e-05, - "loss": 0.9187, + "epoch": 0.39070090215128384, + "grad_norm": 0.8360400199890137, + "learning_rate": 3.90633130962706e-05, + "loss": 0.7808, "step": 2252 }, { - "epoch": 0.8227131641409531, - "grad_norm": 1.5382440090179443, - "learning_rate": 3.349946032450071e-05, - "loss": 0.9653, + "epoch": 0.39087439278278974, + "grad_norm": 1.333060622215271, + "learning_rate": 3.908065915004337e-05, + "loss": 0.8269, "step": 2253 }, { - "epoch": 0.8230783275515793, - "grad_norm": 0.8294605612754822, - "learning_rate": 3.348887237665891e-05, - "loss": 0.8875, + "epoch": 0.39104788341429564, + "grad_norm": 0.9425747394561768, + "learning_rate": 3.9098005203816135e-05, + "loss": 0.7317, "step": 2254 }, { - "epoch": 0.8234434909622056, - "grad_norm": 1.1140406131744385, - "learning_rate": 3.3478277489333554e-05, - "loss": 0.9232, + "epoch": 0.39122137404580154, + "grad_norm": 0.9587958455085754, + "learning_rate": 3.91153512575889e-05, + "loss": 0.7522, "step": 2255 }, { - "epoch": 0.8238086543728318, - "grad_norm": 1.4023823738098145, - "learning_rate": 3.346767566797527e-05, - "loss": 0.9091, + "epoch": 0.39139486467730744, + "grad_norm": 1.1406816244125366, + "learning_rate": 3.9132697311361665e-05, + "loss": 0.7656, "step": 2256 }, { - "epoch": 0.8241738177834581, - "grad_norm": 1.1313494443893433, - "learning_rate": 3.345706691803828e-05, - "loss": 0.9507, + "epoch": 0.39156835530881334, + "grad_norm": 0.8012204170227051, + "learning_rate": 3.915004336513444e-05, + "loss": 0.7983, "step": 2257 }, { - "epoch": 0.8245389811940843, - "grad_norm": 1.3286069631576538, - "learning_rate": 3.344645124498036e-05, - "loss": 0.9437, + "epoch": 0.39174184594031924, + "grad_norm": 0.8068287968635559, + "learning_rate": 3.91673894189072e-05, + "loss": 0.7485, "step": 2258 }, { - "epoch": 0.8249041446047106, - "grad_norm": 1.3923512697219849, - "learning_rate": 3.3435828654262844e-05, - "loss": 0.9572, + "epoch": 0.39191533657182515, + "grad_norm": 0.7549537420272827, + "learning_rate": 3.918473547267997e-05, + "loss": 0.8745, "step": 2259 }, { - "epoch": 0.8252693080153368, - "grad_norm": 1.1253995895385742, - "learning_rate": 3.3425199151350636e-05, - "loss": 0.895, + "epoch": 0.39208882720333105, + "grad_norm": 0.9742448329925537, + "learning_rate": 3.920208152645273e-05, + "loss": 0.8496, "step": 2260 }, { - "epoch": 0.8256344714259631, - "grad_norm": 0.7993583679199219, - "learning_rate": 3.341456274171218e-05, - "loss": 0.8894, + "epoch": 0.3922623178348369, + "grad_norm": 0.7768865823745728, + "learning_rate": 3.92194275802255e-05, + "loss": 0.8838, "step": 2261 }, { - "epoch": 0.8259996348365893, - "grad_norm": 1.0631810426712036, - "learning_rate": 3.340391943081949e-05, - "loss": 0.8969, + "epoch": 0.3924358084663428, + "grad_norm": 1.069046139717102, + "learning_rate": 3.923677363399827e-05, + "loss": 0.9312, "step": 2262 }, { - "epoch": 0.8263647982472156, - "grad_norm": 1.076901912689209, - "learning_rate": 3.339326922414812e-05, - "loss": 0.9392, + "epoch": 0.3926092990978487, + "grad_norm": 0.9792579412460327, + "learning_rate": 3.925411968777104e-05, + "loss": 0.6847, "step": 2263 }, { - "epoch": 0.8267299616578418, - "grad_norm": 0.8915268182754517, - "learning_rate": 3.3382612127177166e-05, - "loss": 0.8881, + "epoch": 0.3927827897293546, + "grad_norm": 0.8852686882019043, + "learning_rate": 3.9271465741543805e-05, + "loss": 0.7964, "step": 2264 }, { - "epoch": 0.8270951250684682, - "grad_norm": 0.8814072608947754, - "learning_rate": 3.337194814538929e-05, - "loss": 0.857, + "epoch": 0.3929562803608605, + "grad_norm": 1.1182090044021606, + "learning_rate": 3.928881179531657e-05, + "loss": 0.8123, "step": 2265 }, { - "epoch": 0.8274602884790944, - "grad_norm": 1.0786418914794922, - "learning_rate": 3.336127728427067e-05, - "loss": 0.9028, + "epoch": 0.3931297709923664, + "grad_norm": 1.0699650049209595, + "learning_rate": 3.9306157849089335e-05, + "loss": 0.7949, "step": 2266 }, { - "epoch": 0.8278254518897207, - "grad_norm": 1.1282809972763062, - "learning_rate": 3.335059954931105e-05, - "loss": 0.8898, + "epoch": 0.3933032616238723, + "grad_norm": 1.0542981624603271, + "learning_rate": 3.93235039028621e-05, + "loss": 0.7427, "step": 2267 }, { - "epoch": 0.8281906153003469, - "grad_norm": 0.9671773910522461, - "learning_rate": 3.333991494600368e-05, - "loss": 0.8866, + "epoch": 0.3934767522553782, + "grad_norm": 1.7038938999176025, + "learning_rate": 3.934084995663487e-05, + "loss": 0.8057, "step": 2268 }, { - "epoch": 0.8285557787109732, - "grad_norm": 1.1951189041137695, - "learning_rate": 3.332922347984537e-05, - "loss": 0.9481, + "epoch": 0.3936502428868841, + "grad_norm": 1.088059663772583, + "learning_rate": 3.935819601040764e-05, + "loss": 0.7493, "step": 2269 }, { - "epoch": 0.8289209421215994, - "grad_norm": 1.149143099784851, - "learning_rate": 3.331852515633645e-05, - "loss": 0.8914, + "epoch": 0.39382373351839, + "grad_norm": 1.2380781173706055, + "learning_rate": 3.93755420641804e-05, + "loss": 0.8372, "step": 2270 }, { - "epoch": 0.8292861055322257, - "grad_norm": 1.058846116065979, - "learning_rate": 3.330781998098078e-05, - "loss": 0.8961, + "epoch": 0.3939972241498959, + "grad_norm": 0.8143036961555481, + "learning_rate": 3.939288811795317e-05, + "loss": 0.8181, "step": 2271 }, { - "epoch": 0.8296512689428519, - "grad_norm": 1.1387629508972168, - "learning_rate": 3.3297107959285734e-05, - "loss": 0.8895, + "epoch": 0.3941707147814018, + "grad_norm": 1.830065369606018, + "learning_rate": 3.941023417172594e-05, + "loss": 0.8145, "step": 2272 }, { - "epoch": 0.8300164323534782, - "grad_norm": 1.113734483718872, - "learning_rate": 3.328638909676222e-05, - "loss": 0.8928, + "epoch": 0.3943442054129077, + "grad_norm": 0.8494362235069275, + "learning_rate": 3.9427580225498704e-05, + "loss": 0.6648, "step": 2273 }, { - "epoch": 0.8303815957641044, - "grad_norm": 1.1224305629730225, - "learning_rate": 3.327566339892467e-05, - "loss": 0.8818, + "epoch": 0.3945176960444136, + "grad_norm": 1.0385959148406982, + "learning_rate": 3.944492627927147e-05, + "loss": 0.7419, "step": 2274 }, { - "epoch": 0.8307467591747307, - "grad_norm": 1.2523547410964966, - "learning_rate": 3.326493087129102e-05, - "loss": 0.9235, + "epoch": 0.3946911866759195, + "grad_norm": 0.9224268198013306, + "learning_rate": 3.9462272333044234e-05, + "loss": 0.7659, "step": 2275 }, { - "epoch": 0.8311119225853569, - "grad_norm": 0.8614885210990906, - "learning_rate": 3.325419151938273e-05, - "loss": 0.9196, + "epoch": 0.3948646773074254, + "grad_norm": 1.2050153017044067, + "learning_rate": 3.9479618386817006e-05, + "loss": 0.9114, "step": 2276 }, { - "epoch": 0.8314770859959832, - "grad_norm": 1.137614130973816, - "learning_rate": 3.3243445348724756e-05, - "loss": 0.8969, + "epoch": 0.3950381679389313, + "grad_norm": 0.9125112891197205, + "learning_rate": 3.949696444058977e-05, + "loss": 0.9753, "step": 2277 }, { - "epoch": 0.8318422494066094, - "grad_norm": 1.450865626335144, - "learning_rate": 3.323269236484557e-05, - "loss": 0.9393, + "epoch": 0.3952116585704372, + "grad_norm": 1.127112627029419, + "learning_rate": 3.9514310494362536e-05, + "loss": 0.7964, "step": 2278 }, { - "epoch": 0.8322074128172358, - "grad_norm": 1.0543047189712524, - "learning_rate": 3.322193257327716e-05, - "loss": 0.9355, + "epoch": 0.3953851492019431, + "grad_norm": 1.0509998798370361, + "learning_rate": 3.95316565481353e-05, + "loss": 0.7795, "step": 2279 }, { - "epoch": 0.832572576227862, - "grad_norm": 1.024518370628357, - "learning_rate": 3.321116597955501e-05, - "loss": 0.8973, + "epoch": 0.39555863983344897, + "grad_norm": 1.491776943206787, + "learning_rate": 3.9549002601908066e-05, + "loss": 0.8525, "step": 2280 }, { - "epoch": 0.8329377396384883, - "grad_norm": 1.1692466735839844, - "learning_rate": 3.320039258921809e-05, - "loss": 0.8785, + "epoch": 0.39573213046495487, + "grad_norm": 1.3932883739471436, + "learning_rate": 3.956634865568084e-05, + "loss": 0.885, "step": 2281 }, { - "epoch": 0.8333029030491145, - "grad_norm": 0.8858698606491089, - "learning_rate": 3.318961240780889e-05, - "loss": 0.9211, + "epoch": 0.39590562109646077, + "grad_norm": 0.7782127857208252, + "learning_rate": 3.95836947094536e-05, + "loss": 0.9553, "step": 2282 }, { - "epoch": 0.8336680664597408, - "grad_norm": 1.9830307960510254, - "learning_rate": 3.317882544087336e-05, - "loss": 0.9093, + "epoch": 0.39607911172796667, + "grad_norm": 1.0614386796951294, + "learning_rate": 3.960104076322637e-05, + "loss": 0.7437, "step": 2283 }, { - "epoch": 0.834033229870367, - "grad_norm": 1.1879889965057373, - "learning_rate": 3.316803169396098e-05, - "loss": 0.8602, + "epoch": 0.39625260235947257, + "grad_norm": 1.0118857622146606, + "learning_rate": 3.961838681699913e-05, + "loss": 0.6533, "step": 2284 }, { - "epoch": 0.8343983932809932, - "grad_norm": 1.2925167083740234, - "learning_rate": 3.31572311726247e-05, - "loss": 0.9034, + "epoch": 0.3964260929909785, + "grad_norm": 0.9429373145103455, + "learning_rate": 3.9635732870771904e-05, + "loss": 0.8125, "step": 2285 }, { - "epoch": 0.8347635566916195, - "grad_norm": 1.148078441619873, - "learning_rate": 3.3146423882420935e-05, - "loss": 0.9613, + "epoch": 0.3965995836224844, + "grad_norm": 0.9157687425613403, + "learning_rate": 3.965307892454467e-05, + "loss": 0.6855, "step": 2286 }, { - "epoch": 0.8351287201022457, - "grad_norm": 1.1038404703140259, - "learning_rate": 3.313560982890963e-05, - "loss": 0.8843, + "epoch": 0.3967730742539903, + "grad_norm": 0.9982434511184692, + "learning_rate": 3.9670424978317434e-05, + "loss": 0.7544, "step": 2287 }, { - "epoch": 0.835493883512872, - "grad_norm": 1.2176798582077026, - "learning_rate": 3.3124789017654154e-05, - "loss": 0.8771, + "epoch": 0.3969465648854962, + "grad_norm": 1.0053291320800781, + "learning_rate": 3.96877710320902e-05, + "loss": 0.7852, "step": 2288 }, { - "epoch": 0.8358590469234982, - "grad_norm": 0.7965716123580933, - "learning_rate": 3.31139614542214e-05, - "loss": 0.9048, + "epoch": 0.3971200555170021, + "grad_norm": 1.1249643564224243, + "learning_rate": 3.9705117085862964e-05, + "loss": 0.6692, "step": 2289 }, { - "epoch": 0.8362242103341245, - "grad_norm": 1.2514487504959106, - "learning_rate": 3.310312714418171e-05, - "loss": 0.9463, + "epoch": 0.397293546148508, + "grad_norm": 0.9941114187240601, + "learning_rate": 3.9722463139635736e-05, + "loss": 0.7634, "step": 2290 }, { - "epoch": 0.8365893737447507, - "grad_norm": 1.1472480297088623, - "learning_rate": 3.3092286093108894e-05, - "loss": 0.8845, + "epoch": 0.3974670367800139, + "grad_norm": 1.263395071029663, + "learning_rate": 3.97398091934085e-05, + "loss": 0.8528, "step": 2291 }, { - "epoch": 0.836954537155377, - "grad_norm": 1.043430209159851, - "learning_rate": 3.308143830658025e-05, - "loss": 0.8945, + "epoch": 0.3976405274115198, + "grad_norm": 0.7247194051742554, + "learning_rate": 3.975715524718127e-05, + "loss": 1.0029, "step": 2292 }, { - "epoch": 0.8373197005660032, - "grad_norm": 0.9235261082649231, - "learning_rate": 3.307058379017652e-05, - "loss": 0.851, + "epoch": 0.3978140180430257, + "grad_norm": 0.8766226172447205, + "learning_rate": 3.977450130095404e-05, + "loss": 0.7841, "step": 2293 }, { - "epoch": 0.8376848639766296, - "grad_norm": 1.06767737865448, - "learning_rate": 3.305972254948191e-05, - "loss": 0.9049, + "epoch": 0.3979875086745316, + "grad_norm": 0.9673104286193848, + "learning_rate": 3.97918473547268e-05, + "loss": 0.9531, "step": 2294 }, { - "epoch": 0.8380500273872558, - "grad_norm": 1.252581000328064, - "learning_rate": 3.304885459008412e-05, - "loss": 0.915, + "epoch": 0.3981609993060375, + "grad_norm": 1.2450391054153442, + "learning_rate": 3.980919340849957e-05, + "loss": 0.7488, "step": 2295 }, { - "epoch": 0.8384151907978821, - "grad_norm": 1.3699047565460205, - "learning_rate": 3.303797991757425e-05, - "loss": 0.9662, + "epoch": 0.3983344899375434, + "grad_norm": 0.7495173811912537, + "learning_rate": 3.982653946227234e-05, + "loss": 0.8005, "step": 2296 }, { - "epoch": 0.8387803542085083, - "grad_norm": 1.1066439151763916, - "learning_rate": 3.3027098537546904e-05, - "loss": 0.9073, + "epoch": 0.3985079805690493, + "grad_norm": 0.9778833389282227, + "learning_rate": 3.9843885516045105e-05, + "loss": 0.7174, "step": 2297 }, { - "epoch": 0.8391455176191346, - "grad_norm": 1.031402587890625, - "learning_rate": 3.3016210455600094e-05, - "loss": 0.9292, + "epoch": 0.3986814712005552, + "grad_norm": 1.0114597082138062, + "learning_rate": 3.986123156981787e-05, + "loss": 0.7451, "step": 2298 }, { - "epoch": 0.8395106810297608, - "grad_norm": 0.8174817562103271, - "learning_rate": 3.300531567733532e-05, - "loss": 0.892, + "epoch": 0.3988549618320611, + "grad_norm": 0.9955242276191711, + "learning_rate": 3.9878577623590635e-05, + "loss": 0.6573, "step": 2299 }, { - "epoch": 0.8398758444403871, - "grad_norm": 1.0135891437530518, - "learning_rate": 3.2994414208357496e-05, - "loss": 0.9124, + "epoch": 0.39902845246356694, + "grad_norm": 1.055274486541748, + "learning_rate": 3.9895923677363406e-05, + "loss": 0.7219, "step": 2300 }, { - "epoch": 0.8402410078510133, - "grad_norm": 1.108098030090332, - "learning_rate": 3.2983506054274995e-05, - "loss": 0.8667, + "epoch": 0.39920194309507284, + "grad_norm": 1.2745747566223145, + "learning_rate": 3.991326973113617e-05, + "loss": 0.7576, "step": 2301 }, { - "epoch": 0.8406061712616396, - "grad_norm": 1.3940798044204712, - "learning_rate": 3.297259122069963e-05, - "loss": 0.9651, + "epoch": 0.39937543372657874, + "grad_norm": 1.686747431755066, + "learning_rate": 3.9930615784908936e-05, + "loss": 0.9846, "step": 2302 }, { - "epoch": 0.8409713346722658, - "grad_norm": 1.0065932273864746, - "learning_rate": 3.296166971324664e-05, - "loss": 0.9478, + "epoch": 0.39954892435808465, + "grad_norm": 1.0305421352386475, + "learning_rate": 3.99479618386817e-05, + "loss": 0.7729, "step": 2303 }, { - "epoch": 0.8413364980828921, - "grad_norm": 1.1080431938171387, - "learning_rate": 3.29507415375347e-05, - "loss": 0.9154, + "epoch": 0.39972241498959055, + "grad_norm": 2.3437044620513916, + "learning_rate": 3.996530789245447e-05, + "loss": 0.7844, "step": 2304 }, { - "epoch": 0.8417016614935183, - "grad_norm": 1.2220046520233154, - "learning_rate": 3.293980669918592e-05, - "loss": 0.9274, + "epoch": 0.39989590562109645, + "grad_norm": 1.045218586921692, + "learning_rate": 3.998265394622724e-05, + "loss": 0.7419, "step": 2305 }, { - "epoch": 0.8420668249041446, - "grad_norm": 1.138776183128357, - "learning_rate": 3.292886520382583e-05, - "loss": 0.9131, + "epoch": 0.40006939625260235, + "grad_norm": 0.932973325252533, + "learning_rate": 4e-05, + "loss": 0.8296, "step": 2306 }, { - "epoch": 0.8424319883147708, - "grad_norm": 1.0227338075637817, - "learning_rate": 3.29179170570834e-05, - "loss": 0.8876, + "epoch": 0.40024288688410825, + "grad_norm": 0.8467320203781128, + "learning_rate": 3.9999998839488355e-05, + "loss": 0.8386, "step": 2307 }, { - "epoch": 0.8427971517253972, - "grad_norm": 1.1860309839248657, - "learning_rate": 3.2906962264591014e-05, - "loss": 0.8904, + "epoch": 0.40041637751561415, + "grad_norm": 0.8150543570518494, + "learning_rate": 3.999999535795353e-05, + "loss": 0.8784, "step": 2308 }, { - "epoch": 0.8431623151360234, - "grad_norm": 0.971283495426178, - "learning_rate": 3.2896000831984456e-05, - "loss": 0.9133, + "epoch": 0.40058986814712005, + "grad_norm": 1.0478501319885254, + "learning_rate": 3.999998955539594e-05, + "loss": 0.8318, "step": 2309 }, { - "epoch": 0.8435274785466497, - "grad_norm": 1.038649082183838, - "learning_rate": 3.288503276490296e-05, - "loss": 0.9506, + "epoch": 0.40076335877862596, + "grad_norm": 0.9109731316566467, + "learning_rate": 3.9999981431816256e-05, + "loss": 0.8906, "step": 2310 }, { - "epoch": 0.8438926419572759, - "grad_norm": 1.078331470489502, - "learning_rate": 3.287405806898915e-05, - "loss": 0.8876, + "epoch": 0.40093684941013186, + "grad_norm": 1.8413771390914917, + "learning_rate": 3.999997098721543e-05, + "loss": 0.7656, "step": 2311 }, { - "epoch": 0.8442578053679022, - "grad_norm": 1.3238461017608643, - "learning_rate": 3.2863076749889064e-05, - "loss": 0.8866, + "epoch": 0.40111034004163776, + "grad_norm": 1.7368420362472534, + "learning_rate": 3.999995822159466e-05, + "loss": 0.9382, "step": 2312 }, { - "epoch": 0.8446229687785284, - "grad_norm": 1.1150463819503784, - "learning_rate": 3.285208881325216e-05, - "loss": 0.9006, + "epoch": 0.40128383067314366, + "grad_norm": 0.822659432888031, + "learning_rate": 3.9999943134955436e-05, + "loss": 0.804, "step": 2313 }, { - "epoch": 0.8449881321891547, - "grad_norm": 1.050068736076355, - "learning_rate": 3.2841094264731274e-05, - "loss": 0.9279, + "epoch": 0.40145732130464956, + "grad_norm": 0.8807315826416016, + "learning_rate": 3.9999925727299505e-05, + "loss": 0.8608, "step": 2314 }, { - "epoch": 0.8453532955997809, - "grad_norm": 1.1285725831985474, - "learning_rate": 3.283009310998268e-05, - "loss": 0.8571, + "epoch": 0.40163081193615546, + "grad_norm": 1.5850759744644165, + "learning_rate": 3.999990599862889e-05, + "loss": 0.7231, "step": 2315 }, { - "epoch": 0.8457184590104072, - "grad_norm": 1.3799028396606445, - "learning_rate": 3.2819085354666015e-05, - "loss": 0.9254, + "epoch": 0.40180430256766136, + "grad_norm": 1.041966438293457, + "learning_rate": 3.999988394894588e-05, + "loss": 0.8436, "step": 2316 }, { - "epoch": 0.8460836224210334, - "grad_norm": 1.036365032196045, - "learning_rate": 3.280807100444433e-05, - "loss": 0.8848, + "epoch": 0.40197779319916727, + "grad_norm": 0.9052964448928833, + "learning_rate": 3.999985957825303e-05, + "loss": 0.7944, "step": 2317 }, { - "epoch": 0.8464487858316597, - "grad_norm": 0.8339391946792603, - "learning_rate": 3.279705006498408e-05, - "loss": 0.9266, + "epoch": 0.40215128383067317, + "grad_norm": 0.8429526090621948, + "learning_rate": 3.999983288655318e-05, + "loss": 0.8457, "step": 2318 }, { - "epoch": 0.8468139492422859, - "grad_norm": 1.762960433959961, - "learning_rate": 3.278602254195507e-05, - "loss": 0.9148, + "epoch": 0.40232477446217907, + "grad_norm": 1.2592369318008423, + "learning_rate": 3.999980387384941e-05, + "loss": 0.8218, "step": 2319 }, { - "epoch": 0.8471791126529121, - "grad_norm": 0.9137365221977234, - "learning_rate": 3.277498844103055e-05, - "loss": 0.861, + "epoch": 0.4024982650936849, + "grad_norm": 0.7658020853996277, + "learning_rate": 3.9999772540145104e-05, + "loss": 0.8542, "step": 2320 }, { - "epoch": 0.8475442760635384, - "grad_norm": 1.3130300045013428, - "learning_rate": 3.276394776788709e-05, - "loss": 0.8813, + "epoch": 0.4026717557251908, + "grad_norm": 1.5753639936447144, + "learning_rate": 3.9999738885443885e-05, + "loss": 0.8528, "step": 2321 }, { - "epoch": 0.8479094394741646, - "grad_norm": 1.1927039623260498, - "learning_rate": 3.27529005282047e-05, - "loss": 0.927, + "epoch": 0.4028452463566967, + "grad_norm": 0.806293249130249, + "learning_rate": 3.999970290974967e-05, + "loss": 0.8345, "step": 2322 }, { - "epoch": 0.848274602884791, - "grad_norm": 1.1714924573898315, - "learning_rate": 3.274184672766673e-05, - "loss": 0.8937, + "epoch": 0.4030187369882026, + "grad_norm": 3.332958936691284, + "learning_rate": 3.9999664613066615e-05, + "loss": 0.833, "step": 2323 }, { - "epoch": 0.8486397662954172, - "grad_norm": 1.418370246887207, - "learning_rate": 3.2730786371959906e-05, - "loss": 0.9053, + "epoch": 0.4031922276197085, + "grad_norm": 0.9595586657524109, + "learning_rate": 3.999962399539919e-05, + "loss": 0.8135, "step": 2324 }, { - "epoch": 0.8490049297060435, - "grad_norm": 0.9238560795783997, - "learning_rate": 3.271971946677436e-05, - "loss": 0.9181, + "epoch": 0.4033657182512144, + "grad_norm": 2.7309529781341553, + "learning_rate": 3.9999581056752085e-05, + "loss": 0.7632, "step": 2325 }, { - "epoch": 0.8493700931166697, - "grad_norm": 0.9603902101516724, - "learning_rate": 3.270864601780355e-05, - "loss": 0.9207, + "epoch": 0.4035392088827203, + "grad_norm": 1.1456687450408936, + "learning_rate": 3.9999535797130304e-05, + "loss": 0.8362, "step": 2326 }, { - "epoch": 0.849735256527296, - "grad_norm": 1.0544580221176147, - "learning_rate": 3.269756603074433e-05, - "loss": 0.9163, + "epoch": 0.4037126995142262, + "grad_norm": 0.8260728120803833, + "learning_rate": 3.999948821653908e-05, + "loss": 0.8462, "step": 2327 }, { - "epoch": 0.8501004199379222, - "grad_norm": 1.3136160373687744, - "learning_rate": 3.268647951129692e-05, - "loss": 0.8945, + "epoch": 0.4038861901457321, + "grad_norm": 0.9700015187263489, + "learning_rate": 3.999943831498395e-05, + "loss": 0.7698, "step": 2328 }, { - "epoch": 0.8504655833485485, - "grad_norm": 0.9908052086830139, - "learning_rate": 3.267538646516487e-05, - "loss": 0.8663, + "epoch": 0.40405968077723803, + "grad_norm": 1.6528294086456299, + "learning_rate": 3.99993860924707e-05, + "loss": 0.8198, "step": 2329 }, { - "epoch": 0.8508307467591747, - "grad_norm": 0.788976788520813, - "learning_rate": 3.266428689805512e-05, - "loss": 0.9028, + "epoch": 0.40423317140874393, + "grad_norm": 0.9726113677024841, + "learning_rate": 3.9999331549005394e-05, + "loss": 0.7999, "step": 2330 }, { - "epoch": 0.851195910169801, - "grad_norm": 1.0005877017974854, - "learning_rate": 3.265318081567794e-05, - "loss": 0.9149, + "epoch": 0.40440666204024983, + "grad_norm": 0.9606515169143677, + "learning_rate": 3.999927468459435e-05, + "loss": 0.7434, "step": 2331 }, { - "epoch": 0.8515610735804272, - "grad_norm": 0.9816792607307434, - "learning_rate": 3.2642068223746975e-05, - "loss": 0.845, + "epoch": 0.40458015267175573, + "grad_norm": 1.0718632936477661, + "learning_rate": 3.999921549924418e-05, + "loss": 0.7773, "step": 2332 }, { - "epoch": 0.8519262369910535, - "grad_norm": 0.9434790015220642, - "learning_rate": 3.2630949127979204e-05, - "loss": 0.9021, + "epoch": 0.40475364330326163, + "grad_norm": 1.2083808183670044, + "learning_rate": 3.999915399296175e-05, + "loss": 0.863, "step": 2333 }, { - "epoch": 0.8522914004016797, - "grad_norm": 0.9779006242752075, - "learning_rate": 3.2619823534094956e-05, - "loss": 0.8931, + "epoch": 0.40492713393476754, + "grad_norm": 1.0222811698913574, + "learning_rate": 3.999909016575419e-05, + "loss": 0.7854, "step": 2334 }, { - "epoch": 0.852656563812306, - "grad_norm": 1.2115291357040405, - "learning_rate": 3.26086914478179e-05, - "loss": 0.9054, + "epoch": 0.40510062456627344, + "grad_norm": 0.9264998435974121, + "learning_rate": 3.999902401762892e-05, + "loss": 0.8984, "step": 2335 }, { - "epoch": 0.8530217272229322, - "grad_norm": 1.2338892221450806, - "learning_rate": 3.259755287487505e-05, - "loss": 0.8947, + "epoch": 0.40527411519777934, + "grad_norm": 0.9389799237251282, + "learning_rate": 3.99989555485936e-05, + "loss": 0.8962, "step": 2336 }, { - "epoch": 0.8533868906335585, - "grad_norm": 0.8474389314651489, - "learning_rate": 3.258640782099675e-05, - "loss": 0.8676, + "epoch": 0.40544760582928524, + "grad_norm": 1.3145450353622437, + "learning_rate": 3.999888475865619e-05, + "loss": 0.8645, "step": 2337 }, { - "epoch": 0.8537520540441847, - "grad_norm": 1.138380527496338, - "learning_rate": 3.257525629191669e-05, - "loss": 0.8718, + "epoch": 0.40562109646079114, + "grad_norm": 1.0105791091918945, + "learning_rate": 3.99988116478249e-05, + "loss": 0.8357, "step": 2338 }, { - "epoch": 0.8541172174548111, - "grad_norm": 1.1867327690124512, - "learning_rate": 3.2564098293371884e-05, - "loss": 0.8867, + "epoch": 0.40579458709229704, + "grad_norm": 1.0831013917922974, + "learning_rate": 3.999873621610822e-05, + "loss": 0.7136, "step": 2339 }, { - "epoch": 0.8544823808654373, - "grad_norm": 1.2721573114395142, - "learning_rate": 3.255293383110267e-05, - "loss": 0.8669, + "epoch": 0.4059680777238029, + "grad_norm": 2.765923500061035, + "learning_rate": 3.999865846351489e-05, + "loss": 0.865, "step": 2340 }, { - "epoch": 0.8548475442760636, - "grad_norm": 1.1000010967254639, - "learning_rate": 3.2541762910852716e-05, - "loss": 0.9189, + "epoch": 0.4061415683553088, + "grad_norm": 0.9362756609916687, + "learning_rate": 3.999857839005395e-05, + "loss": 0.7036, "step": 2341 }, { - "epoch": 0.8552127076866898, - "grad_norm": 0.9832971096038818, - "learning_rate": 3.253058553836902e-05, - "loss": 0.9082, + "epoch": 0.4063150589868147, + "grad_norm": 0.8713677525520325, + "learning_rate": 3.9998495995734677e-05, + "loss": 0.8699, "step": 2342 }, { - "epoch": 0.8555778710973161, - "grad_norm": 1.265341877937317, - "learning_rate": 3.251940171940188e-05, - "loss": 0.9025, + "epoch": 0.4064885496183206, + "grad_norm": 0.944359540939331, + "learning_rate": 3.999841128056664e-05, + "loss": 0.8438, "step": 2343 }, { - "epoch": 0.8559430345079423, - "grad_norm": 0.9770424962043762, - "learning_rate": 3.250821145970493e-05, - "loss": 0.9078, + "epoch": 0.4066620402498265, + "grad_norm": 1.1011619567871094, + "learning_rate": 3.999832424455968e-05, + "loss": 0.8391, "step": 2344 }, { - "epoch": 0.8563081979185686, - "grad_norm": 1.082483172416687, - "learning_rate": 3.2497014765035105e-05, - "loss": 0.9083, + "epoch": 0.4068355308813324, + "grad_norm": 0.8997476696968079, + "learning_rate": 3.999823488772388e-05, + "loss": 0.748, "step": 2345 }, { - "epoch": 0.8566733613291948, - "grad_norm": 1.1719576120376587, - "learning_rate": 3.2485811641152655e-05, - "loss": 0.8676, + "epoch": 0.4070090215128383, + "grad_norm": 1.443051815032959, + "learning_rate": 3.999814321006963e-05, + "loss": 0.7568, "step": 2346 }, { - "epoch": 0.8570385247398211, - "grad_norm": 0.9048495888710022, - "learning_rate": 3.2474602093821145e-05, - "loss": 0.9197, + "epoch": 0.4071825121443442, + "grad_norm": 1.452035903930664, + "learning_rate": 3.9998049211607546e-05, + "loss": 0.9526, "step": 2347 }, { - "epoch": 0.8574036881504473, - "grad_norm": 1.1852236986160278, - "learning_rate": 3.246338612880743e-05, - "loss": 0.8949, + "epoch": 0.4073560027758501, + "grad_norm": 1.0006368160247803, + "learning_rate": 3.999795289234856e-05, + "loss": 0.7861, "step": 2348 }, { - "epoch": 0.8577688515610736, - "grad_norm": 1.2471216917037964, - "learning_rate": 3.245216375188168e-05, - "loss": 0.9099, + "epoch": 0.407529493407356, + "grad_norm": 3.313105344772339, + "learning_rate": 3.9997854252303826e-05, + "loss": 0.667, "step": 2349 }, { - "epoch": 0.8581340149716998, - "grad_norm": 1.5178124904632568, - "learning_rate": 3.2440934968817355e-05, - "loss": 0.902, + "epoch": 0.4077029840388619, + "grad_norm": 0.9012083411216736, + "learning_rate": 3.9997753291484816e-05, + "loss": 0.8521, "step": 2350 }, { - "epoch": 0.8584991783823261, - "grad_norm": 1.366114616394043, - "learning_rate": 3.2429699785391205e-05, - "loss": 0.8881, + "epoch": 0.4078764746703678, + "grad_norm": 1.5778453350067139, + "learning_rate": 3.9997650009903226e-05, + "loss": 0.8992, "step": 2351 }, { - "epoch": 0.8588643417929523, - "grad_norm": 0.8483562469482422, - "learning_rate": 3.241845820738329e-05, - "loss": 0.8661, + "epoch": 0.4080499653018737, + "grad_norm": 1.552453875541687, + "learning_rate": 3.999754440757105e-05, + "loss": 0.8632, "step": 2352 }, { - "epoch": 0.8592295052035785, - "grad_norm": 1.1250901222229004, - "learning_rate": 3.240721024057695e-05, - "loss": 0.9092, + "epoch": 0.4082234559333796, + "grad_norm": 9.123459815979004, + "learning_rate": 3.999743648450055e-05, + "loss": 0.7295, "step": 2353 }, { - "epoch": 0.8595946686142049, - "grad_norm": 1.0794893503189087, - "learning_rate": 3.239595589075881e-05, - "loss": 0.8871, + "epoch": 0.4083969465648855, + "grad_norm": 0.9598536491394043, + "learning_rate": 3.999732624070424e-05, + "loss": 0.681, "step": 2354 }, { - "epoch": 0.8599598320248311, - "grad_norm": 1.2193162441253662, - "learning_rate": 3.238469516371879e-05, - "loss": 0.9271, + "epoch": 0.4085704371963914, + "grad_norm": 3.1576545238494873, + "learning_rate": 3.999721367619492e-05, + "loss": 0.8374, "step": 2355 }, { - "epoch": 0.8603249954354574, - "grad_norm": 2.1797313690185547, - "learning_rate": 3.237342806525007e-05, - "loss": 0.9036, + "epoch": 0.4087439278278973, + "grad_norm": 0.9056648015975952, + "learning_rate": 3.999709879098565e-05, + "loss": 0.9143, "step": 2356 }, { - "epoch": 0.8606901588460836, - "grad_norm": 1.0270274877548218, - "learning_rate": 3.236215460114913e-05, - "loss": 0.9171, + "epoch": 0.4089174184594032, + "grad_norm": 1.2248834371566772, + "learning_rate": 3.999698158508977e-05, + "loss": 0.6912, "step": 2357 }, { - "epoch": 0.8610553222567099, - "grad_norm": 1.077868938446045, - "learning_rate": 3.23508747772157e-05, - "loss": 0.9469, + "epoch": 0.4090909090909091, + "grad_norm": 1.1833930015563965, + "learning_rate": 3.999686205852087e-05, + "loss": 0.6604, "step": 2358 }, { - "epoch": 0.8614204856673361, - "grad_norm": 0.9075025320053101, - "learning_rate": 3.233958859925282e-05, - "loss": 0.8951, + "epoch": 0.40926439972241496, + "grad_norm": 1.4246364831924438, + "learning_rate": 3.999674021129283e-05, + "loss": 0.762, "step": 2359 }, { - "epoch": 0.8617856490779624, - "grad_norm": 1.0181728601455688, - "learning_rate": 3.232829607306675e-05, - "loss": 0.8866, + "epoch": 0.40943789035392086, + "grad_norm": 1.0935314893722534, + "learning_rate": 3.999661604341978e-05, + "loss": 0.6444, "step": 2360 }, { - "epoch": 0.8621508124885886, - "grad_norm": 1.1690034866333008, - "learning_rate": 3.231699720446706e-05, - "loss": 0.9042, + "epoch": 0.40961138098542677, + "grad_norm": 2.356513738632202, + "learning_rate": 3.9996489554916145e-05, + "loss": 0.7983, "step": 2361 }, { - "epoch": 0.8625159758992149, - "grad_norm": 1.0616707801818848, - "learning_rate": 3.230569199926656e-05, - "loss": 0.9342, + "epoch": 0.40978487161693267, + "grad_norm": 0.9326343536376953, + "learning_rate": 3.99963607457966e-05, + "loss": 0.8643, "step": 2362 }, { - "epoch": 0.8628811393098411, - "grad_norm": 1.1491398811340332, - "learning_rate": 3.2294380463281315e-05, - "loss": 0.8754, + "epoch": 0.40995836224843857, + "grad_norm": 0.9769896268844604, + "learning_rate": 3.9996229616076086e-05, + "loss": 0.907, "step": 2363 }, { - "epoch": 0.8632463027204674, - "grad_norm": 1.1388192176818848, - "learning_rate": 3.228306260233067e-05, - "loss": 0.8768, + "epoch": 0.41013185287994447, + "grad_norm": 1.3590095043182373, + "learning_rate": 3.999609616576982e-05, + "loss": 0.8914, "step": 2364 }, { - "epoch": 0.8636114661310936, - "grad_norm": 2.0608267784118652, - "learning_rate": 3.227173842223721e-05, - "loss": 0.9554, + "epoch": 0.41030534351145037, + "grad_norm": 1.0681648254394531, + "learning_rate": 3.99959603948933e-05, + "loss": 0.864, "step": 2365 }, { - "epoch": 0.8639766295417199, - "grad_norm": 1.3108954429626465, - "learning_rate": 3.226040792882676e-05, - "loss": 0.8724, + "epoch": 0.4104788341429563, + "grad_norm": 0.9749590158462524, + "learning_rate": 3.9995822303462273e-05, + "loss": 0.8188, "step": 2366 }, { - "epoch": 0.8643417929523461, - "grad_norm": 1.2228803634643555, - "learning_rate": 3.224907112792841e-05, - "loss": 0.9098, + "epoch": 0.4106523247744622, + "grad_norm": 0.747654914855957, + "learning_rate": 3.9995681891492774e-05, + "loss": 0.8887, "step": 2367 }, { - "epoch": 0.8647069563629725, - "grad_norm": 0.8829416632652283, - "learning_rate": 3.223772802537449e-05, - "loss": 0.9348, + "epoch": 0.4108258154059681, + "grad_norm": 2.4150869846343994, + "learning_rate": 3.9995539159001074e-05, + "loss": 0.8346, "step": 2368 }, { - "epoch": 0.8650721197735987, - "grad_norm": 0.9618242383003235, - "learning_rate": 3.2226378627000574e-05, - "loss": 0.8809, + "epoch": 0.410999306037474, + "grad_norm": 1.0865124464035034, + "learning_rate": 3.999539410600378e-05, + "loss": 0.8101, "step": 2369 }, { - "epoch": 0.865437283184225, - "grad_norm": 1.4193679094314575, - "learning_rate": 3.2215022938645465e-05, - "loss": 0.9008, + "epoch": 0.4111727966689799, + "grad_norm": 0.9155034422874451, + "learning_rate": 3.999524673251768e-05, + "loss": 0.8369, "step": 2370 }, { - "epoch": 0.8658024465948512, - "grad_norm": 0.947960615158081, - "learning_rate": 3.2203660966151206e-05, - "loss": 0.8926, + "epoch": 0.4113462873004858, + "grad_norm": 0.8397771716117859, + "learning_rate": 3.999509703855991e-05, + "loss": 0.8796, "step": 2371 }, { - "epoch": 0.8661676100054775, - "grad_norm": 1.1539700031280518, - "learning_rate": 3.219229271536309e-05, - "loss": 0.8981, + "epoch": 0.4115197779319917, + "grad_norm": 1.026986837387085, + "learning_rate": 3.999494502414783e-05, + "loss": 0.8401, "step": 2372 }, { - "epoch": 0.8665327734161037, - "grad_norm": 1.1868197917938232, - "learning_rate": 3.218091819212962e-05, - "loss": 0.9491, + "epoch": 0.4116932685634976, + "grad_norm": 0.8268187046051025, + "learning_rate": 3.999479068929907e-05, + "loss": 0.8601, "step": 2373 }, { - "epoch": 0.86689793682673, - "grad_norm": 1.0864989757537842, - "learning_rate": 3.2169537402302525e-05, - "loss": 0.9227, + "epoch": 0.4118667591950035, + "grad_norm": 1.6661032438278198, + "learning_rate": 3.999463403403156e-05, + "loss": 0.8416, "step": 2374 }, { - "epoch": 0.8672631002373562, - "grad_norm": 1.1930218935012817, - "learning_rate": 3.215815035173678e-05, - "loss": 0.9047, + "epoch": 0.4120402498265094, + "grad_norm": 0.8753964304924011, + "learning_rate": 3.999447505836347e-05, + "loss": 0.821, "step": 2375 }, { - "epoch": 0.8676282636479825, - "grad_norm": 1.3359171152114868, - "learning_rate": 3.214675704629054e-05, - "loss": 0.9016, + "epoch": 0.4122137404580153, + "grad_norm": 0.8967012763023376, + "learning_rate": 3.999431376231326e-05, + "loss": 0.8701, "step": 2376 }, { - "epoch": 0.8679934270586087, - "grad_norm": 1.2036510705947876, - "learning_rate": 3.213535749182523e-05, - "loss": 0.915, + "epoch": 0.4123872310895212, + "grad_norm": 1.417569875717163, + "learning_rate": 3.999415014589963e-05, + "loss": 0.6687, "step": 2377 }, { - "epoch": 0.868358590469235, - "grad_norm": 1.0027682781219482, - "learning_rate": 3.212395169420544e-05, - "loss": 0.9264, + "epoch": 0.4125607217210271, + "grad_norm": 1.0323526859283447, + "learning_rate": 3.9993984209141576e-05, + "loss": 0.6173, "step": 2378 }, { - "epoch": 0.8687237538798612, - "grad_norm": 0.9930129051208496, - "learning_rate": 3.211253965929902e-05, - "loss": 0.91, + "epoch": 0.41273421235253294, + "grad_norm": 0.8562255501747131, + "learning_rate": 3.999381595205836e-05, + "loss": 0.8105, "step": 2379 }, { - "epoch": 0.8690889172904875, - "grad_norm": 0.9424401521682739, - "learning_rate": 3.2101121392976986e-05, - "loss": 0.8958, + "epoch": 0.41290770298403884, + "grad_norm": 0.9660404324531555, + "learning_rate": 3.999364537466951e-05, + "loss": 0.7344, "step": 2380 }, { - "epoch": 0.8694540807011137, - "grad_norm": 1.0139580965042114, - "learning_rate": 3.2089696901113576e-05, - "loss": 0.9272, + "epoch": 0.41308119361554474, + "grad_norm": 0.9344460964202881, + "learning_rate": 3.999347247699481e-05, + "loss": 0.7827, "step": 2381 }, { - "epoch": 0.86981924411174, - "grad_norm": 1.4429867267608643, - "learning_rate": 3.2078266189586256e-05, - "loss": 0.9532, + "epoch": 0.41325468424705064, + "grad_norm": 1.2031251192092896, + "learning_rate": 3.999329725905434e-05, + "loss": 0.7136, "step": 2382 }, { - "epoch": 0.8701844075223663, - "grad_norm": 1.3660881519317627, - "learning_rate": 3.2066829264275644e-05, - "loss": 0.9009, + "epoch": 0.41342817487855654, + "grad_norm": 1.0225433111190796, + "learning_rate": 3.999311972086842e-05, + "loss": 0.7415, "step": 2383 }, { - "epoch": 0.8705495709329926, - "grad_norm": 1.1566431522369385, - "learning_rate": 3.205538613106558e-05, - "loss": 0.861, + "epoch": 0.41360166551006244, + "grad_norm": 0.849705696105957, + "learning_rate": 3.999293986245766e-05, + "loss": 0.7896, "step": 2384 }, { - "epoch": 0.8709147343436188, - "grad_norm": 1.2306350469589233, - "learning_rate": 3.204393679584311e-05, - "loss": 0.9296, + "epoch": 0.41377515614156835, + "grad_norm": 1.0171597003936768, + "learning_rate": 3.999275768384294e-05, + "loss": 0.7239, "step": 2385 }, { - "epoch": 0.871279897754245, - "grad_norm": 0.9939724206924438, - "learning_rate": 3.203248126449845e-05, - "loss": 0.8657, + "epoch": 0.41394864677307425, + "grad_norm": 1.0857945680618286, + "learning_rate": 3.9992573185045386e-05, + "loss": 0.7518, "step": 2386 }, { - "epoch": 0.8716450611648713, - "grad_norm": 0.9505504369735718, - "learning_rate": 3.2021019542925015e-05, - "loss": 0.8824, + "epoch": 0.41412213740458015, + "grad_norm": 0.913644552230835, + "learning_rate": 3.9992386366086415e-05, + "loss": 0.8713, "step": 2387 }, { - "epoch": 0.8720102245754975, - "grad_norm": 0.9705086350440979, - "learning_rate": 3.20095516370194e-05, - "loss": 0.9081, + "epoch": 0.41429562803608605, + "grad_norm": 1.9091912508010864, + "learning_rate": 3.9992197226987725e-05, + "loss": 0.7173, "step": 2388 }, { - "epoch": 0.8723753879861238, - "grad_norm": 1.240056037902832, - "learning_rate": 3.1998077552681387e-05, - "loss": 0.9739, + "epoch": 0.41446911866759195, + "grad_norm": 1.0591715574264526, + "learning_rate": 3.9992005767771236e-05, + "loss": 0.8101, "step": 2389 }, { - "epoch": 0.87274055139675, - "grad_norm": 1.0980809926986694, - "learning_rate": 3.198659729581391e-05, - "loss": 0.8942, + "epoch": 0.41464260929909785, + "grad_norm": 2.086602210998535, + "learning_rate": 3.999181198845919e-05, + "loss": 0.7271, "step": 2390 }, { - "epoch": 0.8731057148073763, - "grad_norm": 1.2435935735702515, - "learning_rate": 3.197511087232313e-05, - "loss": 0.9249, + "epoch": 0.41481609993060375, + "grad_norm": 0.7881263494491577, + "learning_rate": 3.9991615889074065e-05, + "loss": 0.8804, "step": 2391 }, { - "epoch": 0.8734708782180025, - "grad_norm": 1.254884123802185, - "learning_rate": 3.1963618288118334e-05, - "loss": 0.9387, + "epoch": 0.41498959056210966, + "grad_norm": 1.3324766159057617, + "learning_rate": 3.999141746963862e-05, + "loss": 0.7869, "step": 2392 }, { - "epoch": 0.8738360416286288, - "grad_norm": 1.156901240348816, - "learning_rate": 3.195211954911199e-05, - "loss": 0.947, + "epoch": 0.41516308119361556, + "grad_norm": 0.9704148173332214, + "learning_rate": 3.999121673017589e-05, + "loss": 0.7556, "step": 2393 }, { - "epoch": 0.874201205039255, - "grad_norm": 1.1092950105667114, - "learning_rate": 3.194061466121976e-05, - "loss": 0.9252, + "epoch": 0.41533657182512146, + "grad_norm": 4.316880702972412, + "learning_rate": 3.999101367070916e-05, + "loss": 0.7537, "step": 2394 }, { - "epoch": 0.8745663684498813, - "grad_norm": 1.1882591247558594, - "learning_rate": 3.192910363036043e-05, - "loss": 0.9012, + "epoch": 0.41551006245662736, + "grad_norm": 1.0933399200439453, + "learning_rate": 3.9990808291262e-05, + "loss": 0.7393, "step": 2395 }, { - "epoch": 0.8749315318605075, - "grad_norm": 1.1754741668701172, - "learning_rate": 3.191758646245596e-05, - "loss": 0.8591, + "epoch": 0.41568355308813326, + "grad_norm": 0.991733193397522, + "learning_rate": 3.9990600591858244e-05, + "loss": 0.865, "step": 2396 }, { - "epoch": 0.8752966952711339, - "grad_norm": 1.1027655601501465, - "learning_rate": 3.1906063163431485e-05, - "loss": 0.8597, + "epoch": 0.41585704371963916, + "grad_norm": 1.8536913394927979, + "learning_rate": 3.9990390572522e-05, + "loss": 0.7727, "step": 2397 }, { - "epoch": 0.87566185868176, - "grad_norm": 1.1564725637435913, - "learning_rate": 3.189453373921527e-05, - "loss": 0.8741, + "epoch": 0.41603053435114506, + "grad_norm": 1.0766284465789795, + "learning_rate": 3.999017823327762e-05, + "loss": 0.8301, "step": 2398 }, { - "epoch": 0.8760270220923864, - "grad_norm": 1.069939136505127, - "learning_rate": 3.1882998195738744e-05, - "loss": 0.877, + "epoch": 0.4162040249826509, + "grad_norm": 0.7867087721824646, + "learning_rate": 3.998996357414978e-05, + "loss": 0.9646, "step": 2399 }, { - "epoch": 0.8763921855030126, - "grad_norm": 1.0649820566177368, - "learning_rate": 3.187145653893648e-05, - "loss": 0.9042, + "epoch": 0.4163775156141568, + "grad_norm": 1.0421615839004517, + "learning_rate": 3.9989746595163364e-05, + "loss": 0.7654, "step": 2400 }, { - "epoch": 0.8767573489136389, - "grad_norm": 1.3197886943817139, - "learning_rate": 3.1859908774746205e-05, - "loss": 0.9244, + "epoch": 0.4165510062456627, + "grad_norm": 0.9341615438461304, + "learning_rate": 3.998952729634357e-05, + "loss": 0.8352, "step": 2401 }, { - "epoch": 0.8771225123242651, - "grad_norm": 1.082298994064331, - "learning_rate": 3.184835490910877e-05, - "loss": 0.952, + "epoch": 0.4167244968771686, + "grad_norm": 1.1899088621139526, + "learning_rate": 3.998930567771583e-05, + "loss": 0.8469, "step": 2402 }, { - "epoch": 0.8774876757348914, - "grad_norm": 0.9365941286087036, - "learning_rate": 3.1836794947968175e-05, - "loss": 0.8997, + "epoch": 0.4168979875086745, + "grad_norm": 1.6622980833053589, + "learning_rate": 3.998908173930589e-05, + "loss": 0.7283, "step": 2403 }, { - "epoch": 0.8778528391455176, - "grad_norm": 1.2733501195907593, - "learning_rate": 3.182522889727157e-05, - "loss": 0.9031, + "epoch": 0.4170714781401804, + "grad_norm": 0.9373778700828552, + "learning_rate": 3.998885548113971e-05, + "loss": 0.7673, "step": 2404 }, { - "epoch": 0.8782180025561439, - "grad_norm": 0.6785301566123962, - "learning_rate": 3.1813656762969206e-05, - "loss": 0.8923, + "epoch": 0.4172449687716863, + "grad_norm": 0.7620701789855957, + "learning_rate": 3.998862690324357e-05, + "loss": 0.9146, "step": 2405 }, { - "epoch": 0.8785831659667701, - "grad_norm": 0.9988470673561096, - "learning_rate": 3.180207855101449e-05, - "loss": 0.9388, + "epoch": 0.4174184594031922, + "grad_norm": 1.0200213193893433, + "learning_rate": 3.998839600564398e-05, + "loss": 0.7222, "step": 2406 }, { - "epoch": 0.8789483293773964, - "grad_norm": 0.9840304255485535, - "learning_rate": 3.1790494267363954e-05, - "loss": 0.9042, + "epoch": 0.4175919500346981, + "grad_norm": 0.9609151482582092, + "learning_rate": 3.9988162788367744e-05, + "loss": 0.8508, "step": 2407 }, { - "epoch": 0.8793134927880226, - "grad_norm": 1.1129623651504517, - "learning_rate": 3.177890391797724e-05, - "loss": 0.9197, + "epoch": 0.417765440666204, + "grad_norm": 1.1236156225204468, + "learning_rate": 3.998792725144192e-05, + "loss": 0.7087, "step": 2408 }, { - "epoch": 0.8796786561986489, - "grad_norm": 0.8818925619125366, - "learning_rate": 3.176730750881711e-05, - "loss": 0.9213, + "epoch": 0.4179389312977099, + "grad_norm": 0.8955191969871521, + "learning_rate": 3.9987689394893855e-05, + "loss": 0.8403, "step": 2409 }, { - "epoch": 0.8800438196092751, - "grad_norm": 1.1916090250015259, - "learning_rate": 3.1755705045849465e-05, - "loss": 0.904, + "epoch": 0.4181124219292158, + "grad_norm": 1.5847419500350952, + "learning_rate": 3.9987449218751134e-05, + "loss": 0.8298, "step": 2410 }, { - "epoch": 0.8804089830199014, - "grad_norm": 0.7871175408363342, - "learning_rate": 3.17440965350433e-05, - "loss": 0.9048, + "epoch": 0.41828591256072173, + "grad_norm": 1.2498855590820312, + "learning_rate": 3.9987206723041654e-05, + "loss": 0.8616, "step": 2411 }, { - "epoch": 0.8807741464305276, - "grad_norm": 1.0579344034194946, - "learning_rate": 3.173248198237073e-05, - "loss": 0.9012, + "epoch": 0.41845940319222763, + "grad_norm": 0.9945202469825745, + "learning_rate": 3.998696190779354e-05, + "loss": 0.72, "step": 2412 }, { - "epoch": 0.881139309841154, - "grad_norm": 1.1252928972244263, - "learning_rate": 3.172086139380698e-05, - "loss": 0.9108, + "epoch": 0.41863289382373353, + "grad_norm": 0.8612411618232727, + "learning_rate": 3.9986714773035207e-05, + "loss": 0.8063, "step": 2413 }, { - "epoch": 0.8815044732517802, - "grad_norm": 1.1915268898010254, - "learning_rate": 3.170923477533036e-05, - "loss": 0.9103, + "epoch": 0.41880638445523943, + "grad_norm": 1.1474491357803345, + "learning_rate": 3.9986465318795336e-05, + "loss": 0.7964, "step": 2414 }, { - "epoch": 0.8818696366624065, - "grad_norm": 1.2513328790664673, - "learning_rate": 3.169760213292232e-05, - "loss": 0.8531, + "epoch": 0.41897987508674533, + "grad_norm": 1.0371332168579102, + "learning_rate": 3.998621354510288e-05, + "loss": 0.7188, "step": 2415 }, { - "epoch": 0.8822348000730327, - "grad_norm": 0.949276328086853, - "learning_rate": 3.168596347256737e-05, - "loss": 0.9213, + "epoch": 0.41915336571825124, + "grad_norm": 0.7528420686721802, + "learning_rate": 3.998595945198705e-05, + "loss": 0.8916, "step": 2416 }, { - "epoch": 0.882599963483659, - "grad_norm": 0.7598105072975159, - "learning_rate": 3.1674318800253146e-05, - "loss": 0.9196, + "epoch": 0.41932685634975714, + "grad_norm": 0.936761200428009, + "learning_rate": 3.998570303947733e-05, + "loss": 0.8528, "step": 2417 }, { - "epoch": 0.8829651268942852, - "grad_norm": 0.990066409111023, - "learning_rate": 3.166266812197036e-05, - "loss": 0.8748, + "epoch": 0.41950034698126304, + "grad_norm": 1.3958734273910522, + "learning_rate": 3.9985444307603497e-05, + "loss": 0.704, "step": 2418 }, { - "epoch": 0.8833302903049115, - "grad_norm": 1.1372958421707153, - "learning_rate": 3.1651011443712825e-05, - "loss": 0.9028, + "epoch": 0.4196738376127689, + "grad_norm": 1.1508572101593018, + "learning_rate": 3.998518325639556e-05, + "loss": 0.8019, "step": 2419 }, { - "epoch": 0.8836954537155377, - "grad_norm": 1.0053482055664062, - "learning_rate": 3.1639348771477424e-05, - "loss": 0.9424, + "epoch": 0.4198473282442748, + "grad_norm": 1.2608684301376343, + "learning_rate": 3.998491988588381e-05, + "loss": 0.7805, "step": 2420 }, { - "epoch": 0.8840606171261639, - "grad_norm": 1.0567158460617065, - "learning_rate": 3.1627680111264134e-05, - "loss": 0.8953, + "epoch": 0.4200208188757807, + "grad_norm": 2.120544195175171, + "learning_rate": 3.9984654196098825e-05, + "loss": 0.6973, "step": 2421 }, { - "epoch": 0.8844257805367902, - "grad_norm": 1.2739530801773071, - "learning_rate": 3.161600546907602e-05, - "loss": 0.8794, + "epoch": 0.4201943095072866, + "grad_norm": 1.0377626419067383, + "learning_rate": 3.998438618707144e-05, + "loss": 0.8105, "step": 2422 }, { - "epoch": 0.8847909439474164, - "grad_norm": 1.4563584327697754, - "learning_rate": 3.160432485091922e-05, - "loss": 0.8821, + "epoch": 0.4203678001387925, + "grad_norm": 1.0013350248336792, + "learning_rate": 3.998411585883274e-05, + "loss": 0.7344, "step": 2423 }, { - "epoch": 0.8851561073580427, - "grad_norm": 0.8537071943283081, - "learning_rate": 3.1592638262802926e-05, - "loss": 0.8928, + "epoch": 0.4205412907702984, + "grad_norm": 0.7909770607948303, + "learning_rate": 3.9983843211414124e-05, + "loss": 0.8455, "step": 2424 }, { - "epoch": 0.8855212707686689, - "grad_norm": 1.3625949621200562, - "learning_rate": 3.1580945710739435e-05, - "loss": 0.8738, + "epoch": 0.4207147814018043, + "grad_norm": 0.9233974814414978, + "learning_rate": 3.998356824484721e-05, + "loss": 0.782, "step": 2425 }, { - "epoch": 0.8858864341792952, - "grad_norm": 1.0497000217437744, - "learning_rate": 3.156924720074408e-05, - "loss": 0.8717, + "epoch": 0.4208882720333102, + "grad_norm": 0.7796130180358887, + "learning_rate": 3.9983290959163914e-05, + "loss": 0.8638, "step": 2426 }, { - "epoch": 0.8862515975899214, - "grad_norm": 1.0398533344268799, - "learning_rate": 3.1557542738835295e-05, - "loss": 0.9021, + "epoch": 0.4210617626648161, + "grad_norm": 0.8853896856307983, + "learning_rate": 3.998301135439642e-05, + "loss": 0.9031, "step": 2427 }, { - "epoch": 0.8866167610005478, - "grad_norm": 0.9112217426300049, - "learning_rate": 3.154583233103455e-05, - "loss": 0.9229, + "epoch": 0.421235253296322, + "grad_norm": 0.9214218258857727, + "learning_rate": 3.998272943057717e-05, + "loss": 0.9233, "step": 2428 }, { - "epoch": 0.886981924411174, - "grad_norm": 0.9952887892723083, - "learning_rate": 3.153411598336637e-05, - "loss": 0.9081, + "epoch": 0.4214087439278279, + "grad_norm": 0.8790314793586731, + "learning_rate": 3.9982445187738885e-05, + "loss": 0.802, "step": 2429 }, { - "epoch": 0.8873470878218003, - "grad_norm": 1.1253986358642578, - "learning_rate": 3.1522393701858353e-05, - "loss": 0.9014, + "epoch": 0.4215822345593338, + "grad_norm": 0.8974714875221252, + "learning_rate": 3.998215862591455e-05, + "loss": 0.7712, "step": 2430 }, { - "epoch": 0.8877122512324265, - "grad_norm": 1.2453022003173828, - "learning_rate": 3.151066549254115e-05, - "loss": 0.9274, + "epoch": 0.4217557251908397, + "grad_norm": 1.2589672803878784, + "learning_rate": 3.998186974513743e-05, + "loss": 0.8352, "step": 2431 }, { - "epoch": 0.8880774146430528, - "grad_norm": 1.2295217514038086, - "learning_rate": 3.149893136144843e-05, - "loss": 0.895, + "epoch": 0.4219292158223456, + "grad_norm": 0.8181871175765991, + "learning_rate": 3.998157854544104e-05, + "loss": 0.9231, "step": 2432 }, { - "epoch": 0.888442578053679, - "grad_norm": 1.3340058326721191, - "learning_rate": 3.148719131461695e-05, - "loss": 0.8659, + "epoch": 0.4221027064538515, + "grad_norm": 0.7463948726654053, + "learning_rate": 3.998128502685917e-05, + "loss": 0.7954, "step": 2433 }, { - "epoch": 0.8888077414643053, - "grad_norm": 1.4019211530685425, - "learning_rate": 3.14754453580865e-05, - "loss": 0.9498, + "epoch": 0.4222761970853574, + "grad_norm": 1.3237310647964478, + "learning_rate": 3.99809891894259e-05, + "loss": 0.7087, "step": 2434 }, { - "epoch": 0.8891729048749315, - "grad_norm": 0.8233980536460876, - "learning_rate": 3.1463693497899894e-05, - "loss": 0.8867, + "epoch": 0.4224496877168633, + "grad_norm": 0.8923051953315735, + "learning_rate": 3.998069103317555e-05, + "loss": 0.7537, "step": 2435 }, { - "epoch": 0.8895380682855578, - "grad_norm": 1.0522586107254028, - "learning_rate": 3.145193574010298e-05, - "loss": 0.8607, + "epoch": 0.4226231783483692, + "grad_norm": 0.8334577083587646, + "learning_rate": 3.998039055814272e-05, + "loss": 0.8945, "step": 2436 }, { - "epoch": 0.889903231696184, - "grad_norm": 1.2605302333831787, - "learning_rate": 3.1440172090744674e-05, - "loss": 0.9136, + "epoch": 0.4227966689798751, + "grad_norm": 0.8836405277252197, + "learning_rate": 3.998008776436228e-05, + "loss": 0.8606, "step": 2437 }, { - "epoch": 0.8902683951068103, - "grad_norm": 0.9410290122032166, - "learning_rate": 3.1428402555876896e-05, - "loss": 0.8855, + "epoch": 0.42297015961138096, + "grad_norm": 0.91793292760849, + "learning_rate": 3.9979782651869384e-05, + "loss": 0.7942, "step": 2438 }, { - "epoch": 0.8906335585174365, - "grad_norm": 1.0935149192810059, - "learning_rate": 3.1416627141554595e-05, - "loss": 0.8833, + "epoch": 0.42314365024288686, + "grad_norm": 1.1676362752914429, + "learning_rate": 3.997947522069942e-05, + "loss": 0.7136, "step": 2439 }, { - "epoch": 0.8909987219280628, - "grad_norm": 1.2515472173690796, - "learning_rate": 3.1404845853835744e-05, - "loss": 0.9435, + "epoch": 0.42331714087439276, + "grad_norm": 1.5658996105194092, + "learning_rate": 3.997916547088808e-05, + "loss": 0.6447, "step": 2440 }, { - "epoch": 0.891363885338689, - "grad_norm": 1.035130500793457, - "learning_rate": 3.139305869878135e-05, - "loss": 0.933, + "epoch": 0.42349063150589866, + "grad_norm": 0.897371232509613, + "learning_rate": 3.9978853402471306e-05, + "loss": 0.8843, "step": 2441 }, { - "epoch": 0.8917290487493154, - "grad_norm": 1.057878017425537, - "learning_rate": 3.1381265682455436e-05, - "loss": 0.9081, + "epoch": 0.42366412213740456, + "grad_norm": 3.3999593257904053, + "learning_rate": 3.997853901548532e-05, + "loss": 0.6875, "step": 2442 }, { - "epoch": 0.8920942121599416, - "grad_norm": 1.2074347734451294, - "learning_rate": 3.136946681092503e-05, - "loss": 0.9078, + "epoch": 0.42383761276891047, + "grad_norm": 1.0058984756469727, + "learning_rate": 3.9978222309966594e-05, + "loss": 0.8127, "step": 2443 }, { - "epoch": 0.8924593755705679, - "grad_norm": 1.03203284740448, - "learning_rate": 3.135766209026017e-05, - "loss": 0.942, + "epoch": 0.42401110340041637, + "grad_norm": 1.2285758256912231, + "learning_rate": 3.9977903285951896e-05, + "loss": 0.74, "step": 2444 }, { - "epoch": 0.8928245389811941, - "grad_norm": 1.5605872869491577, - "learning_rate": 3.134585152653393e-05, - "loss": 0.9142, + "epoch": 0.42418459403192227, + "grad_norm": 2.268646001815796, + "learning_rate": 3.9977581943478236e-05, + "loss": 0.74, "step": 2445 }, { - "epoch": 0.8931897023918204, - "grad_norm": 1.094165563583374, - "learning_rate": 3.133403512582236e-05, - "loss": 0.9423, + "epoch": 0.42435808466342817, + "grad_norm": 0.9421398043632507, + "learning_rate": 3.9977258282582916e-05, + "loss": 0.7244, "step": 2446 }, { - "epoch": 0.8935548658024466, - "grad_norm": 1.1768535375595093, - "learning_rate": 3.132221289420451e-05, - "loss": 0.9127, + "epoch": 0.42453157529493407, + "grad_norm": 0.7844792604446411, + "learning_rate": 3.99769323033035e-05, + "loss": 0.8655, "step": 2447 }, { - "epoch": 0.8939200292130729, - "grad_norm": 1.05989670753479, - "learning_rate": 3.131038483776247e-05, - "loss": 0.9056, + "epoch": 0.42470506592644, + "grad_norm": 0.7219191789627075, + "learning_rate": 3.99766040056778e-05, + "loss": 0.927, "step": 2448 }, { - "epoch": 0.8942851926236991, - "grad_norm": 1.322707176208496, - "learning_rate": 3.129855096258129e-05, - "loss": 0.9183, + "epoch": 0.4248785565579459, + "grad_norm": 1.4582059383392334, + "learning_rate": 3.997627338974394e-05, + "loss": 0.7727, "step": 2449 }, { - "epoch": 0.8946503560343254, - "grad_norm": 1.1722118854522705, - "learning_rate": 3.128671127474902e-05, - "loss": 0.944, + "epoch": 0.4250520471894518, + "grad_norm": 2.624171257019043, + "learning_rate": 3.997594045554027e-05, + "loss": 0.7612, "step": 2450 }, { - "epoch": 0.8950155194449516, - "grad_norm": 0.9634767174720764, - "learning_rate": 3.127486578035671e-05, - "loss": 0.9336, + "epoch": 0.4252255378209577, + "grad_norm": 0.9157883524894714, + "learning_rate": 3.9975605203105434e-05, + "loss": 0.8464, "step": 2451 }, { - "epoch": 0.8953806828555779, - "grad_norm": 1.219672441482544, - "learning_rate": 3.1263014485498374e-05, - "loss": 0.8967, + "epoch": 0.4253990284524636, + "grad_norm": 0.889087975025177, + "learning_rate": 3.9975267632478336e-05, + "loss": 0.8584, "step": 2452 }, { - "epoch": 0.8957458462662041, - "grad_norm": 1.2199424505233765, - "learning_rate": 3.1251157396271055e-05, - "loss": 0.8582, + "epoch": 0.4255725190839695, + "grad_norm": 0.7356145977973938, + "learning_rate": 3.997492774369816e-05, + "loss": 0.839, "step": 2453 }, { - "epoch": 0.8961110096768303, - "grad_norm": 1.1338456869125366, - "learning_rate": 3.123929451877473e-05, - "loss": 0.9034, + "epoch": 0.4257460097154754, + "grad_norm": 0.9127195477485657, + "learning_rate": 3.997458553680434e-05, + "loss": 0.8157, "step": 2454 }, { - "epoch": 0.8964761730874566, - "grad_norm": 1.3800227642059326, - "learning_rate": 3.122742585911238e-05, - "loss": 0.9102, + "epoch": 0.4259195003469813, + "grad_norm": 1.296396017074585, + "learning_rate": 3.9974241011836594e-05, + "loss": 0.7496, "step": 2455 }, { - "epoch": 0.8968413364980828, - "grad_norm": 1.3210792541503906, - "learning_rate": 3.121555142338996e-05, - "loss": 0.933, + "epoch": 0.4260929909784872, + "grad_norm": 1.1448200941085815, + "learning_rate": 3.99738941688349e-05, + "loss": 0.9243, "step": 2456 }, { - "epoch": 0.8972064999087092, - "grad_norm": 1.1100627183914185, - "learning_rate": 3.120367121771638e-05, - "loss": 0.9442, + "epoch": 0.4262664816099931, + "grad_norm": 0.7947261929512024, + "learning_rate": 3.997354500783952e-05, + "loss": 0.7388, "step": 2457 }, { - "epoch": 0.8975716633193354, - "grad_norm": 1.1476476192474365, - "learning_rate": 3.119178524820354e-05, - "loss": 0.8824, + "epoch": 0.42643997224149893, + "grad_norm": 0.8150436282157898, + "learning_rate": 3.997319352889096e-05, + "loss": 0.7976, "step": 2458 }, { - "epoch": 0.8979368267299617, - "grad_norm": 1.0068159103393555, - "learning_rate": 3.1179893520966276e-05, - "loss": 0.8959, + "epoch": 0.42661346287300483, + "grad_norm": 0.901246190071106, + "learning_rate": 3.997283973203003e-05, + "loss": 0.813, "step": 2459 }, { - "epoch": 0.8983019901405879, - "grad_norm": 1.1935549974441528, - "learning_rate": 3.1167996042122426e-05, - "loss": 0.8996, + "epoch": 0.42678695350451074, + "grad_norm": 1.9731829166412354, + "learning_rate": 3.997248361729777e-05, + "loss": 0.7584, "step": 2460 }, { - "epoch": 0.8986671535512142, - "grad_norm": 1.057350516319275, - "learning_rate": 3.1156092817792756e-05, - "loss": 0.8668, + "epoch": 0.42696044413601664, + "grad_norm": 0.7499437928199768, + "learning_rate": 3.9972125184735505e-05, + "loss": 0.8022, "step": 2461 }, { - "epoch": 0.8990323169618404, - "grad_norm": 1.2303168773651123, - "learning_rate": 3.1144183854100996e-05, - "loss": 0.8708, + "epoch": 0.42713393476752254, + "grad_norm": 1.2553960084915161, + "learning_rate": 3.997176443438485e-05, + "loss": 0.8831, "step": 2462 }, { - "epoch": 0.8993974803724667, - "grad_norm": 1.0893819332122803, - "learning_rate": 3.113226915717383e-05, - "loss": 0.8518, + "epoch": 0.42730742539902844, + "grad_norm": 1.2123353481292725, + "learning_rate": 3.9971401366287666e-05, + "loss": 0.6643, "step": 2463 }, { - "epoch": 0.8997626437830929, - "grad_norm": 0.8773859739303589, - "learning_rate": 3.1120348733140896e-05, - "loss": 0.9218, + "epoch": 0.42748091603053434, + "grad_norm": 1.2330788373947144, + "learning_rate": 3.997103598048607e-05, + "loss": 0.7168, "step": 2464 }, { - "epoch": 0.9001278071937192, - "grad_norm": 1.0458978414535522, - "learning_rate": 3.110842258813477e-05, - "loss": 0.9565, + "epoch": 0.42765440666204024, + "grad_norm": 1.2354925870895386, + "learning_rate": 3.997066827702248e-05, + "loss": 0.6843, "step": 2465 }, { - "epoch": 0.9004929706043454, - "grad_norm": 1.1016786098480225, - "learning_rate": 3.109649072829097e-05, - "loss": 0.901, + "epoch": 0.42782789729354614, + "grad_norm": 1.3304318189620972, + "learning_rate": 3.9970298255939564e-05, + "loss": 0.6987, "step": 2466 }, { - "epoch": 0.9008581340149717, - "grad_norm": 1.246741771697998, - "learning_rate": 3.108455315974796e-05, - "loss": 0.9369, + "epoch": 0.42800138792505205, + "grad_norm": 1.0831223726272583, + "learning_rate": 3.9969925917280276e-05, + "loss": 0.8564, "step": 2467 }, { - "epoch": 0.9012232974255979, - "grad_norm": 0.9858847260475159, - "learning_rate": 3.107260988864716e-05, - "loss": 0.9017, + "epoch": 0.42817487855655795, + "grad_norm": 1.1596765518188477, + "learning_rate": 3.9969551261087806e-05, + "loss": 0.7209, "step": 2468 }, { - "epoch": 0.9015884608362242, - "grad_norm": 1.4518101215362549, - "learning_rate": 3.106066092113288e-05, - "loss": 0.8799, + "epoch": 0.42834836918806385, + "grad_norm": 1.1352468729019165, + "learning_rate": 3.996917428740565e-05, + "loss": 0.7323, "step": 2469 }, { - "epoch": 0.9019536242468504, - "grad_norm": 1.1444424390792847, - "learning_rate": 3.10487062633524e-05, - "loss": 0.8945, + "epoch": 0.42852185981956975, + "grad_norm": 1.0548738241195679, + "learning_rate": 3.996879499627754e-05, + "loss": 0.7776, "step": 2470 }, { - "epoch": 0.9023187876574768, - "grad_norm": 1.3481340408325195, - "learning_rate": 3.1036745921455895e-05, - "loss": 0.9358, + "epoch": 0.42869535045107565, + "grad_norm": 0.934288740158081, + "learning_rate": 3.996841338774751e-05, + "loss": 0.7811, "step": 2471 }, { - "epoch": 0.902683951068103, - "grad_norm": 1.4463859796524048, - "learning_rate": 3.1024779901596496e-05, - "loss": 0.9126, + "epoch": 0.42886884108258155, + "grad_norm": 0.8638343214988708, + "learning_rate": 3.996802946185984e-05, + "loss": 0.9812, "step": 2472 }, { - "epoch": 0.9030491144787293, - "grad_norm": 1.1613800525665283, - "learning_rate": 3.101280820993023e-05, - "loss": 0.8969, + "epoch": 0.42904233171408745, + "grad_norm": 1.0519049167633057, + "learning_rate": 3.996764321865907e-05, + "loss": 0.7936, "step": 2473 }, { - "epoch": 0.9034142778893555, - "grad_norm": 0.9108444452285767, - "learning_rate": 3.100083085261606e-05, - "loss": 0.8934, + "epoch": 0.42921582234559336, + "grad_norm": 1.377454161643982, + "learning_rate": 3.9967254658190055e-05, + "loss": 0.6863, "step": 2474 }, { - "epoch": 0.9037794412999818, - "grad_norm": 1.006577730178833, - "learning_rate": 3.098884783581586e-05, - "loss": 0.9041, + "epoch": 0.42938931297709926, + "grad_norm": 1.722349762916565, + "learning_rate": 3.996686378049786e-05, + "loss": 0.7551, "step": 2475 }, { - "epoch": 0.904144604710608, - "grad_norm": 1.196187138557434, - "learning_rate": 3.097685916569439e-05, - "loss": 0.8834, + "epoch": 0.42956280360860516, + "grad_norm": 0.9257386922836304, + "learning_rate": 3.996647058562786e-05, + "loss": 0.8828, "step": 2476 }, { - "epoch": 0.9045097681212343, - "grad_norm": 0.9780898690223694, - "learning_rate": 3.096486484841935e-05, - "loss": 0.8987, + "epoch": 0.42973629424011106, + "grad_norm": 0.9074738025665283, + "learning_rate": 3.9966075073625684e-05, + "loss": 0.7769, "step": 2477 }, { - "epoch": 0.9048749315318605, - "grad_norm": 1.275571584701538, - "learning_rate": 3.095286489016135e-05, - "loss": 0.8424, + "epoch": 0.4299097848716169, + "grad_norm": 1.8305493593215942, + "learning_rate": 3.9965677244537226e-05, + "loss": 0.8987, "step": 2478 }, { - "epoch": 0.9052400949424868, - "grad_norm": 1.0573374032974243, - "learning_rate": 3.0940859297093874e-05, - "loss": 0.9219, + "epoch": 0.4300832755031228, + "grad_norm": 1.188607096672058, + "learning_rate": 3.9965277098408666e-05, + "loss": 0.8121, "step": 2479 }, { - "epoch": 0.905605258353113, - "grad_norm": 1.3158808946609497, - "learning_rate": 3.092884807539331e-05, - "loss": 0.916, + "epoch": 0.4302567661346287, + "grad_norm": 1.2999130487442017, + "learning_rate": 3.9964874635286436e-05, + "loss": 0.8221, "step": 2480 }, { - "epoch": 0.9059704217637393, - "grad_norm": 1.2299778461456299, - "learning_rate": 3.091683123123897e-05, - "loss": 0.9084, + "epoch": 0.4304302567661346, + "grad_norm": 0.7679153680801392, + "learning_rate": 3.996446985521723e-05, + "loss": 0.7408, "step": 2481 }, { - "epoch": 0.9063355851743655, - "grad_norm": 0.8647922277450562, - "learning_rate": 3.0904808770813024e-05, - "loss": 0.8801, + "epoch": 0.4306037473976405, + "grad_norm": 0.9113188982009888, + "learning_rate": 3.996406275824804e-05, + "loss": 0.7781, "step": 2482 }, { - "epoch": 0.9067007485849918, - "grad_norm": 1.2949285507202148, - "learning_rate": 3.0892780700300544e-05, - "loss": 0.8892, + "epoch": 0.4307772380291464, + "grad_norm": 0.919550895690918, + "learning_rate": 3.996365334442611e-05, + "loss": 0.901, "step": 2483 }, { - "epoch": 0.907065911995618, - "grad_norm": 0.9885082840919495, - "learning_rate": 3.08807470258895e-05, - "loss": 0.8912, + "epoch": 0.4309507286606523, + "grad_norm": 1.177147388458252, + "learning_rate": 3.996324161379894e-05, + "loss": 0.6475, "step": 2484 }, { - "epoch": 0.9074310754062443, - "grad_norm": 1.205407738685608, - "learning_rate": 3.086870775377072e-05, - "loss": 0.9456, + "epoch": 0.4311242192921582, + "grad_norm": 0.9344490170478821, + "learning_rate": 3.996282756641432e-05, + "loss": 0.8606, "step": 2485 }, { - "epoch": 0.9077962388168705, - "grad_norm": 1.1545631885528564, - "learning_rate": 3.085666289013794e-05, - "loss": 0.9054, + "epoch": 0.4312977099236641, + "grad_norm": 1.0607526302337646, + "learning_rate": 3.9962411202320296e-05, + "loss": 0.6978, "step": 2486 }, { - "epoch": 0.9081614022274968, - "grad_norm": 0.9817137122154236, - "learning_rate": 3.0844612441187755e-05, - "loss": 0.8748, + "epoch": 0.43147120055517, + "grad_norm": 0.8835808634757996, + "learning_rate": 3.99619925215652e-05, + "loss": 0.9182, "step": 2487 }, { - "epoch": 0.9085265656381231, - "grad_norm": 0.8002792000770569, - "learning_rate": 3.083255641311963e-05, - "loss": 0.9247, + "epoch": 0.4316446911866759, + "grad_norm": 1.014094352722168, + "learning_rate": 3.99615715241976e-05, + "loss": 0.8271, "step": 2488 }, { - "epoch": 0.9088917290487493, - "grad_norm": 1.1831262111663818, - "learning_rate": 3.082049481213592e-05, - "loss": 0.9023, + "epoch": 0.4318181818181818, + "grad_norm": 0.912253201007843, + "learning_rate": 3.996114821026638e-05, + "loss": 0.8826, "step": 2489 }, { - "epoch": 0.9092568924593756, - "grad_norm": 0.999395489692688, - "learning_rate": 3.0808427644441825e-05, - "loss": 0.8915, + "epoch": 0.4319916724496877, + "grad_norm": 0.7626254558563232, + "learning_rate": 3.996072257982064e-05, + "loss": 0.8364, "step": 2490 }, { - "epoch": 0.9096220558700018, - "grad_norm": 1.0484224557876587, - "learning_rate": 3.079635491624542e-05, - "loss": 0.8408, + "epoch": 0.4321651630811936, + "grad_norm": 1.214220404624939, + "learning_rate": 3.996029463290978e-05, + "loss": 0.8704, "step": 2491 }, { - "epoch": 0.9099872192806281, - "grad_norm": 1.040368914604187, - "learning_rate": 3.078427663375765e-05, - "loss": 0.9232, + "epoch": 0.4323386537126995, + "grad_norm": 1.601532220840454, + "learning_rate": 3.9959864369583485e-05, + "loss": 0.7109, "step": 2492 }, { - "epoch": 0.9103523826912543, - "grad_norm": 1.017867922782898, - "learning_rate": 3.077219280319229e-05, - "loss": 0.895, + "epoch": 0.43251214434420543, + "grad_norm": 0.7571848034858704, + "learning_rate": 3.9959431789891665e-05, + "loss": 0.881, "step": 2493 }, { - "epoch": 0.9107175461018806, - "grad_norm": 1.4710633754730225, - "learning_rate": 3.0760103430766e-05, - "loss": 0.9119, + "epoch": 0.43268563497571133, + "grad_norm": 0.8073483109474182, + "learning_rate": 3.9958996893884525e-05, + "loss": 0.7815, "step": 2494 }, { - "epoch": 0.9110827095125068, - "grad_norm": 1.1537384986877441, - "learning_rate": 3.0748008522698265e-05, - "loss": 0.8826, + "epoch": 0.43285912560721723, + "grad_norm": 0.8151943683624268, + "learning_rate": 3.9958559681612544e-05, + "loss": 0.7479, "step": 2495 }, { - "epoch": 0.9114478729231331, - "grad_norm": 1.1142319440841675, - "learning_rate": 3.073590808521144e-05, - "loss": 0.9265, + "epoch": 0.43303261623872313, + "grad_norm": 0.9222496151924133, + "learning_rate": 3.9958120153126454e-05, + "loss": 0.9131, "step": 2496 }, { - "epoch": 0.9118130363337593, - "grad_norm": 1.1177244186401367, - "learning_rate": 3.072380212453071e-05, - "loss": 0.9121, + "epoch": 0.43320610687022904, + "grad_norm": 0.7602196335792542, + "learning_rate": 3.995767830847726e-05, + "loss": 0.7876, "step": 2497 }, { - "epoch": 0.9121781997443856, - "grad_norm": 1.0034847259521484, - "learning_rate": 3.07116906468841e-05, - "loss": 0.8859, + "epoch": 0.4333795975017349, + "grad_norm": 0.8795971274375916, + "learning_rate": 3.995723414771625e-05, + "loss": 0.7313, "step": 2498 }, { - "epoch": 0.9125433631550118, - "grad_norm": 1.0170037746429443, - "learning_rate": 3.069957365850249e-05, - "loss": 0.9146, + "epoch": 0.4335530881332408, + "grad_norm": 1.1231931447982788, + "learning_rate": 3.9956787670894954e-05, + "loss": 0.7456, "step": 2499 }, { - "epoch": 0.9129085265656381, - "grad_norm": 1.2142237424850464, - "learning_rate": 3.0687451165619586e-05, - "loss": 0.9153, + "epoch": 0.4337265787647467, + "grad_norm": 1.0837032794952393, + "learning_rate": 3.9956338878065205e-05, + "loss": 0.8733, "step": 2500 }, { - "epoch": 0.9132736899762643, - "grad_norm": 0.960737407207489, - "learning_rate": 3.0675323174471905e-05, - "loss": 0.8809, + "epoch": 0.4339000693962526, + "grad_norm": 1.3121693134307861, + "learning_rate": 3.995588776927907e-05, + "loss": 0.6868, "step": 2501 }, { - "epoch": 0.9136388533868907, - "grad_norm": 0.9063526391983032, - "learning_rate": 3.0663189691298836e-05, - "loss": 0.9127, + "epoch": 0.4340735600277585, + "grad_norm": 0.9681256413459778, + "learning_rate": 3.99554343445889e-05, + "loss": 0.708, "step": 2502 }, { - "epoch": 0.9140040167975169, - "grad_norm": 1.010685920715332, - "learning_rate": 3.0651050722342554e-05, - "loss": 0.855, + "epoch": 0.4342470506592644, + "grad_norm": 1.1847668886184692, + "learning_rate": 3.995497860404733e-05, + "loss": 0.7288, "step": 2503 }, { - "epoch": 0.9143691802081432, - "grad_norm": 1.0342044830322266, - "learning_rate": 3.0638906273848075e-05, - "loss": 0.8939, + "epoch": 0.4344205412907703, + "grad_norm": 1.03617525100708, + "learning_rate": 3.995452054770724e-05, + "loss": 0.705, "step": 2504 }, { - "epoch": 0.9147343436187694, - "grad_norm": 0.8174975514411926, - "learning_rate": 3.062675635206323e-05, - "loss": 0.8658, + "epoch": 0.4345940319222762, + "grad_norm": 0.8961045145988464, + "learning_rate": 3.995406017562179e-05, + "loss": 0.8657, "step": 2505 }, { - "epoch": 0.9150995070293957, - "grad_norm": 1.0362099409103394, - "learning_rate": 3.061460096323867e-05, - "loss": 0.8983, + "epoch": 0.4347675225537821, + "grad_norm": 1.1994738578796387, + "learning_rate": 3.99535974878444e-05, + "loss": 0.6914, "step": 2506 }, { - "epoch": 0.9154646704400219, - "grad_norm": 1.1548041105270386, - "learning_rate": 3.060244011362785e-05, - "loss": 0.9036, + "epoch": 0.434941013185288, + "grad_norm": 1.0716725587844849, + "learning_rate": 3.995313248442878e-05, + "loss": 0.7561, "step": 2507 }, { - "epoch": 0.9158298338506482, - "grad_norm": 1.1866950988769531, - "learning_rate": 3.0590273809487037e-05, - "loss": 0.8852, + "epoch": 0.4351145038167939, + "grad_norm": 1.0222324132919312, + "learning_rate": 3.995266516542887e-05, + "loss": 0.875, "step": 2508 }, { - "epoch": 0.9161949972612744, - "grad_norm": 1.1936439275741577, - "learning_rate": 3.057810205707532e-05, - "loss": 0.9159, + "epoch": 0.4352879944482998, + "grad_norm": 1.0977429151535034, + "learning_rate": 3.9952195530898926e-05, + "loss": 0.6667, "step": 2509 }, { - "epoch": 0.9165601606719007, - "grad_norm": 1.014291524887085, - "learning_rate": 3.0565924862654556e-05, - "loss": 0.9131, + "epoch": 0.4354614850798057, + "grad_norm": 1.7281131744384766, + "learning_rate": 3.995172358089344e-05, + "loss": 0.6744, "step": 2510 }, { - "epoch": 0.9169253240825269, - "grad_norm": 0.9708791375160217, - "learning_rate": 3.055374223248944e-05, - "loss": 0.8904, + "epoch": 0.4356349757113116, + "grad_norm": 1.47886061668396, + "learning_rate": 3.9951249315467194e-05, + "loss": 0.7622, "step": 2511 }, { - "epoch": 0.9172904874931532, - "grad_norm": 1.112380027770996, - "learning_rate": 3.054155417284745e-05, - "loss": 0.8805, + "epoch": 0.4358084663428175, + "grad_norm": 1.344706654548645, + "learning_rate": 3.995077273467521e-05, + "loss": 0.7292, "step": 2512 }, { - "epoch": 0.9176556509037794, - "grad_norm": 0.9831346869468689, - "learning_rate": 3.0529360689998836e-05, - "loss": 0.9148, + "epoch": 0.4359819569743234, + "grad_norm": 2.0525286197662354, + "learning_rate": 3.99502938385728e-05, + "loss": 0.865, "step": 2513 }, { - "epoch": 0.9180208143144057, - "grad_norm": 1.2571674585342407, - "learning_rate": 3.051716179021666e-05, - "loss": 0.9236, + "epoch": 0.4361554476058293, + "grad_norm": 1.1086620092391968, + "learning_rate": 3.994981262721555e-05, + "loss": 0.6671, "step": 2514 }, { - "epoch": 0.9183859777250319, - "grad_norm": 0.8745986223220825, - "learning_rate": 3.050495747977677e-05, - "loss": 0.8719, + "epoch": 0.4363289382373352, + "grad_norm": 1.3676657676696777, + "learning_rate": 3.994932910065929e-05, + "loss": 0.7126, "step": 2515 }, { - "epoch": 0.9187511411356583, - "grad_norm": 1.220208764076233, - "learning_rate": 3.0492747764957798e-05, - "loss": 0.8577, + "epoch": 0.4365024288688411, + "grad_norm": 0.9539294838905334, + "learning_rate": 3.9948843258960154e-05, + "loss": 0.7554, "step": 2516 }, { - "epoch": 0.9191163045462845, - "grad_norm": 1.1007022857666016, - "learning_rate": 3.0480532652041153e-05, - "loss": 0.8905, + "epoch": 0.43667591950034695, + "grad_norm": 1.0832470655441284, + "learning_rate": 3.9948355102174503e-05, + "loss": 0.8555, "step": 2517 }, { - "epoch": 0.9194814679569108, - "grad_norm": 1.341386079788208, - "learning_rate": 3.0468312147311007e-05, - "loss": 0.9037, + "epoch": 0.43684941013185286, + "grad_norm": 1.4796980619430542, + "learning_rate": 3.9947864630359005e-05, + "loss": 0.7273, "step": 2518 }, { - "epoch": 0.919846631367537, - "grad_norm": 1.07036292552948, - "learning_rate": 3.0456086257054336e-05, - "loss": 0.8666, + "epoch": 0.43702290076335876, + "grad_norm": 1.182033658027649, + "learning_rate": 3.9947371843570565e-05, + "loss": 0.8352, "step": 2519 }, { - "epoch": 0.9202117947781633, - "grad_norm": 1.1970009803771973, - "learning_rate": 3.0443854987560856e-05, - "loss": 0.8751, + "epoch": 0.43719639139486466, + "grad_norm": 1.1116958856582642, + "learning_rate": 3.994687674186638e-05, + "loss": 0.7167, "step": 2520 }, { - "epoch": 0.9205769581887895, - "grad_norm": 1.4103699922561646, - "learning_rate": 3.043161834512308e-05, - "loss": 0.9005, + "epoch": 0.43736988202637056, + "grad_norm": 0.9026437401771545, + "learning_rate": 3.994637932530391e-05, + "loss": 0.8513, "step": 2521 }, { - "epoch": 0.9209421215994157, - "grad_norm": 1.3167098760604858, - "learning_rate": 3.0419376336036252e-05, - "loss": 0.9005, + "epoch": 0.43754337265787646, + "grad_norm": 1.1112323999404907, + "learning_rate": 3.9945879593940874e-05, + "loss": 0.9163, "step": 2522 }, { - "epoch": 0.921307285010042, - "grad_norm": 1.2057210206985474, - "learning_rate": 3.04071289665984e-05, - "loss": 0.849, + "epoch": 0.43771686328938236, + "grad_norm": 1.5064748525619507, + "learning_rate": 3.994537754783527e-05, + "loss": 0.8606, "step": 2523 }, { - "epoch": 0.9216724484206682, - "grad_norm": 1.0020947456359863, - "learning_rate": 3.0394876243110318e-05, - "loss": 0.8827, + "epoch": 0.43789035392088826, + "grad_norm": 0.8527150750160217, + "learning_rate": 3.994487318704536e-05, + "loss": 0.7869, "step": 2524 }, { - "epoch": 0.9220376118312945, - "grad_norm": 0.9749206304550171, - "learning_rate": 3.038261817187552e-05, - "loss": 0.8643, + "epoch": 0.43806384455239417, + "grad_norm": 1.508033037185669, + "learning_rate": 3.994436651162969e-05, + "loss": 0.7146, "step": 2525 }, { - "epoch": 0.9224027752419207, - "grad_norm": 1.0270098447799683, - "learning_rate": 3.0370354759200307e-05, - "loss": 0.8748, + "epoch": 0.43823733518390007, + "grad_norm": 1.0513464212417603, + "learning_rate": 3.994385752164703e-05, + "loss": 0.7112, "step": 2526 }, { - "epoch": 0.922767938652547, - "grad_norm": 1.2331634759902954, - "learning_rate": 3.03580860113937e-05, - "loss": 0.9049, + "epoch": 0.43841082581540597, + "grad_norm": 0.952623188495636, + "learning_rate": 3.994334621715647e-05, + "loss": 0.7456, "step": 2527 }, { - "epoch": 0.9231331020631732, - "grad_norm": 1.2471660375595093, - "learning_rate": 3.034581193476749e-05, - "loss": 0.891, + "epoch": 0.43858431644691187, + "grad_norm": 1.3120490312576294, + "learning_rate": 3.9942832598217345e-05, + "loss": 0.8708, "step": 2528 }, { - "epoch": 0.9234982654737995, - "grad_norm": 1.2325183153152466, - "learning_rate": 3.0333532535636193e-05, - "loss": 0.876, + "epoch": 0.43875780707841777, + "grad_norm": 0.9523609280586243, + "learning_rate": 3.9942316664889255e-05, + "loss": 0.6963, "step": 2529 }, { - "epoch": 0.9238634288844257, - "grad_norm": 1.1209557056427002, - "learning_rate": 3.032124782031706e-05, - "loss": 0.8306, + "epoch": 0.4389312977099237, + "grad_norm": 1.2361972332000732, + "learning_rate": 3.9941798417232084e-05, + "loss": 0.696, "step": 2530 }, { - "epoch": 0.924228592295052, - "grad_norm": 1.0054899454116821, - "learning_rate": 3.0308957795130092e-05, - "loss": 0.8909, + "epoch": 0.4391047883414296, + "grad_norm": 0.9125909805297852, + "learning_rate": 3.994127785530596e-05, + "loss": 0.7266, "step": 2531 }, { - "epoch": 0.9245937557056783, - "grad_norm": 0.9828513860702515, - "learning_rate": 3.0296662466398005e-05, - "loss": 0.8374, + "epoch": 0.4392782789729355, + "grad_norm": 1.039093017578125, + "learning_rate": 3.9940754979171317e-05, + "loss": 0.8635, "step": 2532 }, { - "epoch": 0.9249589191163046, - "grad_norm": 1.4755783081054688, - "learning_rate": 3.028436184044626e-05, - "loss": 0.8761, + "epoch": 0.4394517696044414, + "grad_norm": 0.9297025203704834, + "learning_rate": 3.994022978888882e-05, + "loss": 0.7578, "step": 2533 }, { - "epoch": 0.9253240825269308, - "grad_norm": 1.5002682209014893, - "learning_rate": 3.0272055923603046e-05, - "loss": 0.8881, + "epoch": 0.4396252602359473, + "grad_norm": 0.9675312638282776, + "learning_rate": 3.9939702284519416e-05, + "loss": 0.7526, "step": 2534 }, { - "epoch": 0.9256892459375571, - "grad_norm": 1.0132485628128052, - "learning_rate": 3.025974472219924e-05, - "loss": 0.9537, + "epoch": 0.4397987508674532, + "grad_norm": 1.262641429901123, + "learning_rate": 3.993917246612433e-05, + "loss": 0.6814, "step": 2535 }, { - "epoch": 0.9260544093481833, - "grad_norm": 1.1246033906936646, - "learning_rate": 3.024742824256848e-05, - "loss": 0.9451, + "epoch": 0.4399722414989591, + "grad_norm": 1.08133864402771, + "learning_rate": 3.9938640333765046e-05, + "loss": 0.7144, "step": 2536 }, { - "epoch": 0.9264195727588096, - "grad_norm": 1.0005650520324707, - "learning_rate": 3.0235106491047078e-05, - "loss": 0.8573, + "epoch": 0.44014573213046493, + "grad_norm": 1.427842378616333, + "learning_rate": 3.993810588750332e-05, + "loss": 0.8376, "step": 2537 }, { - "epoch": 0.9267847361694358, - "grad_norm": 1.4259898662567139, - "learning_rate": 3.022277947397411e-05, - "loss": 0.8986, + "epoch": 0.44031922276197083, + "grad_norm": 1.6817984580993652, + "learning_rate": 3.993756912740117e-05, + "loss": 0.9954, "step": 2538 }, { - "epoch": 0.9271498995800621, - "grad_norm": 1.4420839548110962, - "learning_rate": 3.0210447197691317e-05, - "loss": 0.8839, + "epoch": 0.44049271339347673, + "grad_norm": 1.3721686601638794, + "learning_rate": 3.993703005352089e-05, + "loss": 0.8604, "step": 2539 }, { - "epoch": 0.9275150629906883, - "grad_norm": 1.0873898267745972, - "learning_rate": 3.019810966854315e-05, - "loss": 0.9048, + "epoch": 0.44066620402498263, + "grad_norm": 0.9119483828544617, + "learning_rate": 3.9936488665925045e-05, + "loss": 0.9419, "step": 2540 }, { - "epoch": 0.9278802264013146, - "grad_norm": 0.9887116551399231, - "learning_rate": 3.018576689287679e-05, - "loss": 0.8843, + "epoch": 0.44083969465648853, + "grad_norm": 0.9420273900032043, + "learning_rate": 3.993594496467646e-05, + "loss": 0.7224, "step": 2541 }, { - "epoch": 0.9282453898119408, - "grad_norm": 1.1139992475509644, - "learning_rate": 3.0173418877042092e-05, - "loss": 0.8772, + "epoch": 0.44101318528799444, + "grad_norm": 0.9687826037406921, + "learning_rate": 3.993539894983823e-05, + "loss": 0.7202, "step": 2542 }, { - "epoch": 0.9286105532225671, - "grad_norm": 1.0057564973831177, - "learning_rate": 3.0161065627391618e-05, - "loss": 0.9176, + "epoch": 0.44118667591950034, + "grad_norm": 2.01166033744812, + "learning_rate": 3.993485062147372e-05, + "loss": 0.658, "step": 2543 }, { - "epoch": 0.9289757166331933, - "grad_norm": 0.9588862061500549, - "learning_rate": 3.0148707150280613e-05, - "loss": 0.8655, + "epoch": 0.44136016655100624, + "grad_norm": 1.7483936548233032, + "learning_rate": 3.993429997964657e-05, + "loss": 0.7981, "step": 2544 }, { - "epoch": 0.9293408800438197, - "grad_norm": 1.0876340866088867, - "learning_rate": 3.0136343452067023e-05, - "loss": 0.8901, + "epoch": 0.44153365718251214, + "grad_norm": 0.8401866555213928, + "learning_rate": 3.993374702442068e-05, + "loss": 0.7869, "step": 2545 }, { - "epoch": 0.9297060434544459, - "grad_norm": 1.03085196018219, - "learning_rate": 3.012397453911147e-05, - "loss": 0.8579, + "epoch": 0.44170714781401804, + "grad_norm": 0.8949095606803894, + "learning_rate": 3.993319175586021e-05, + "loss": 0.8469, "step": 2546 }, { - "epoch": 0.9300712068650722, - "grad_norm": 0.8998397588729858, - "learning_rate": 3.011160041777727e-05, - "loss": 0.9271, + "epoch": 0.44188063844552394, + "grad_norm": 1.0634665489196777, + "learning_rate": 3.993263417402962e-05, + "loss": 0.7737, "step": 2547 }, { - "epoch": 0.9304363702756984, - "grad_norm": 1.1103460788726807, - "learning_rate": 3.009922109443041e-05, - "loss": 0.9243, + "epoch": 0.44205412907702984, + "grad_norm": 0.9373010396957397, + "learning_rate": 3.9932074278993604e-05, + "loss": 0.7534, "step": 2548 }, { - "epoch": 0.9308015336863247, - "grad_norm": 1.1912057399749756, - "learning_rate": 3.0086836575439554e-05, - "loss": 0.9083, + "epoch": 0.44222761970853575, + "grad_norm": 0.794353723526001, + "learning_rate": 3.993151207081715e-05, + "loss": 0.8135, "step": 2549 }, { - "epoch": 0.9311666970969509, - "grad_norm": 1.0755178928375244, - "learning_rate": 3.0074446867176035e-05, - "loss": 0.8544, + "epoch": 0.44240111034004165, + "grad_norm": 0.7902843356132507, + "learning_rate": 3.993094754956549e-05, + "loss": 0.908, "step": 2550 }, { - "epoch": 0.9315318605075772, - "grad_norm": 1.1778404712677002, - "learning_rate": 3.006205197601387e-05, - "loss": 0.8986, + "epoch": 0.44257460097154755, + "grad_norm": 0.8102786540985107, + "learning_rate": 3.9930380715304143e-05, + "loss": 0.9626, "step": 2551 }, { - "epoch": 0.9318970239182034, - "grad_norm": 1.0563524961471558, - "learning_rate": 3.0049651908329724e-05, + "epoch": 0.44274809160305345, + "grad_norm": 0.864153265953064, + "learning_rate": 3.992981156809889e-05, "loss": 0.9104, "step": 2552 }, { - "epoch": 0.9322621873288297, - "grad_norm": 1.5895740985870361, - "learning_rate": 3.0037246670502943e-05, - "loss": 0.8933, + "epoch": 0.44292158223455935, + "grad_norm": 0.9110079407691956, + "learning_rate": 3.992924010801578e-05, + "loss": 0.8813, "step": 2553 }, { - "epoch": 0.9326273507394559, - "grad_norm": 1.0430275201797485, - "learning_rate": 3.002483626891551e-05, - "loss": 0.9247, + "epoch": 0.44309507286606525, + "grad_norm": 0.9933997392654419, + "learning_rate": 3.9928666335121135e-05, + "loss": 0.7881, "step": 2554 }, { - "epoch": 0.9329925141500821, - "grad_norm": 1.195866346359253, - "learning_rate": 3.00124207099521e-05, - "loss": 0.9135, + "epoch": 0.44326856349757116, + "grad_norm": 0.8075977563858032, + "learning_rate": 3.992809024948154e-05, + "loss": 0.9058, "step": 2555 }, { - "epoch": 0.9333576775607084, - "grad_norm": 1.0520483255386353, - "learning_rate": 3.0000000000000004e-05, - "loss": 0.8586, + "epoch": 0.44344205412907706, + "grad_norm": 1.0643093585968018, + "learning_rate": 3.992751185116385e-05, + "loss": 0.7593, "step": 2556 }, { - "epoch": 0.9337228409713346, - "grad_norm": 1.6619088649749756, - "learning_rate": 2.998757414544918e-05, - "loss": 0.9265, + "epoch": 0.4436155447605829, + "grad_norm": 0.9359849691390991, + "learning_rate": 3.992693114023519e-05, + "loss": 0.8157, "step": 2557 }, { - "epoch": 0.9340880043819609, - "grad_norm": 1.222413420677185, - "learning_rate": 2.9975143152692242e-05, - "loss": 0.9236, + "epoch": 0.4437890353920888, + "grad_norm": 0.9884755611419678, + "learning_rate": 3.992634811676296e-05, + "loss": 0.6913, "step": 2558 }, { - "epoch": 0.9344531677925871, - "grad_norm": 1.5049315690994263, - "learning_rate": 2.9962707028124417e-05, - "loss": 0.8956, + "epoch": 0.4439625260235947, + "grad_norm": 1.0372505187988281, + "learning_rate": 3.9925762780814804e-05, + "loss": 0.7266, "step": 2559 }, { - "epoch": 0.9348183312032134, - "grad_norm": 1.0230950117111206, - "learning_rate": 2.9950265778143616e-05, - "loss": 0.8926, + "epoch": 0.4441360166551006, + "grad_norm": 1.1888887882232666, + "learning_rate": 3.992517513245865e-05, + "loss": 0.6733, "step": 2560 }, { - "epoch": 0.9351834946138396, - "grad_norm": 0.8047017455101013, - "learning_rate": 2.9937819409150343e-05, - "loss": 0.8865, + "epoch": 0.4443095072866065, + "grad_norm": 1.3243591785430908, + "learning_rate": 3.992458517176272e-05, + "loss": 0.7002, "step": 2561 }, { - "epoch": 0.935548658024466, - "grad_norm": 1.0304129123687744, - "learning_rate": 2.9925367927547772e-05, - "loss": 0.9065, + "epoch": 0.4444829979181124, + "grad_norm": 1.684969425201416, + "learning_rate": 3.992399289879546e-05, + "loss": 0.681, "step": 2562 }, { - "epoch": 0.9359138214350922, - "grad_norm": 1.2216764688491821, - "learning_rate": 2.991291133974168e-05, - "loss": 0.9255, + "epoch": 0.4446564885496183, + "grad_norm": 0.8238731026649475, + "learning_rate": 3.992339831362561e-05, + "loss": 0.8638, "step": 2563 }, { - "epoch": 0.9362789848457185, - "grad_norm": 1.150650978088379, - "learning_rate": 2.990044965214048e-05, - "loss": 0.9158, + "epoch": 0.4448299791811242, + "grad_norm": 1.0097507238388062, + "learning_rate": 3.992280141632216e-05, + "loss": 0.8376, "step": 2564 }, { - "epoch": 0.9366441482563447, - "grad_norm": 0.9835485816001892, - "learning_rate": 2.988798287115522e-05, - "loss": 0.9016, + "epoch": 0.4450034698126301, + "grad_norm": 0.8137010335922241, + "learning_rate": 3.9922202206954395e-05, + "loss": 0.6936, "step": 2565 }, { - "epoch": 0.937009311666971, - "grad_norm": 1.3219966888427734, - "learning_rate": 2.9875511003199547e-05, - "loss": 0.8754, + "epoch": 0.445176960444136, + "grad_norm": 1.183872103691101, + "learning_rate": 3.9921600685591856e-05, + "loss": 0.6516, "step": 2566 }, { - "epoch": 0.9373744750775972, - "grad_norm": 1.0398063659667969, - "learning_rate": 2.9863034054689744e-05, - "loss": 0.9208, + "epoch": 0.4453504510756419, + "grad_norm": 1.0476155281066895, + "learning_rate": 3.992099685230434e-05, + "loss": 0.667, "step": 2567 }, { - "epoch": 0.9377396384882235, - "grad_norm": 1.0791022777557373, - "learning_rate": 2.98505520320447e-05, - "loss": 0.9008, + "epoch": 0.4455239417071478, + "grad_norm": 0.9813032746315002, + "learning_rate": 3.9920390707161927e-05, + "loss": 0.8577, "step": 2568 }, { - "epoch": 0.9381048018988497, - "grad_norm": 1.0053070783615112, - "learning_rate": 2.9838064941685914e-05, - "loss": 0.9445, + "epoch": 0.4456974323386537, + "grad_norm": 0.7087277173995972, + "learning_rate": 3.991978225023497e-05, + "loss": 0.8628, "step": 2569 }, { - "epoch": 0.938469965309476, - "grad_norm": 1.1131993532180786, - "learning_rate": 2.9825572790037497e-05, - "loss": 0.8905, + "epoch": 0.4458709229701596, + "grad_norm": 1.0909929275512695, + "learning_rate": 3.9919171481594056e-05, + "loss": 0.7485, "step": 2570 }, { - "epoch": 0.9388351287201022, - "grad_norm": 1.2495945692062378, - "learning_rate": 2.9813075583526146e-05, - "loss": 0.8733, + "epoch": 0.4460444136016655, + "grad_norm": 0.8282626867294312, + "learning_rate": 3.991855840131009e-05, + "loss": 0.9209, "step": 2571 }, { - "epoch": 0.9392002921307285, - "grad_norm": 0.8986935615539551, - "learning_rate": 2.9800573328581187e-05, - "loss": 0.8979, + "epoch": 0.4462179042331714, + "grad_norm": 0.8845242261886597, + "learning_rate": 3.9917943009454206e-05, + "loss": 0.7483, "step": 2572 }, { - "epoch": 0.9395654555413547, - "grad_norm": 0.9922747611999512, - "learning_rate": 2.9788066031634523e-05, - "loss": 0.8466, + "epoch": 0.4463913948646773, + "grad_norm": 1.1177986860275269, + "learning_rate": 3.991732530609783e-05, + "loss": 0.7209, "step": 2573 }, { - "epoch": 0.939930618951981, - "grad_norm": 1.5492703914642334, - "learning_rate": 2.9775553699120654e-05, - "loss": 0.917, + "epoch": 0.44656488549618323, + "grad_norm": 1.0662497282028198, + "learning_rate": 3.9916705291312646e-05, + "loss": 0.6938, "step": 2574 }, { - "epoch": 0.9402957823626072, - "grad_norm": 1.280344843864441, - "learning_rate": 2.976303633747668e-05, - "loss": 0.9087, + "epoch": 0.44673837612768913, + "grad_norm": 0.8103126883506775, + "learning_rate": 3.9916082965170604e-05, + "loss": 0.9065, "step": 2575 }, { - "epoch": 0.9406609457732336, - "grad_norm": 1.0054874420166016, - "learning_rate": 2.975051395314227e-05, - "loss": 0.8844, + "epoch": 0.44691186675919503, + "grad_norm": 0.861494243144989, + "learning_rate": 3.991545832774393e-05, + "loss": 0.7349, "step": 2576 }, { - "epoch": 0.9410261091838598, - "grad_norm": 1.135419487953186, - "learning_rate": 2.97379865525597e-05, - "loss": 0.9028, + "epoch": 0.4470853573907009, + "grad_norm": 0.9285809397697449, + "learning_rate": 3.9914831379105104e-05, + "loss": 0.8281, "step": 2577 }, { - "epoch": 0.9413912725944861, - "grad_norm": 1.424904465675354, - "learning_rate": 2.9725454142173805e-05, - "loss": 0.9172, + "epoch": 0.4472588480222068, + "grad_norm": 0.9210159182548523, + "learning_rate": 3.9914202119326895e-05, + "loss": 0.8225, "step": 2578 }, { - "epoch": 0.9417564360051123, - "grad_norm": 0.9768377542495728, - "learning_rate": 2.9712916728432016e-05, - "loss": 0.884, + "epoch": 0.4474323386537127, + "grad_norm": 1.0557841062545776, + "learning_rate": 3.991357054848233e-05, + "loss": 0.7246, "step": 2579 }, { - "epoch": 0.9421215994157386, - "grad_norm": 1.0887343883514404, - "learning_rate": 2.9700374317784326e-05, - "loss": 0.8846, + "epoch": 0.4476058292852186, + "grad_norm": 0.9017823934555054, + "learning_rate": 3.991293666664469e-05, + "loss": 0.7917, "step": 2580 }, { - "epoch": 0.9424867628263648, - "grad_norm": 0.8538926243782043, - "learning_rate": 2.9687826916683293e-05, - "loss": 0.8679, + "epoch": 0.4477793199167245, + "grad_norm": 1.0003315210342407, + "learning_rate": 3.991230047388755e-05, + "loss": 0.7012, "step": 2581 }, { - "epoch": 0.9428519262369911, - "grad_norm": 1.368065595626831, - "learning_rate": 2.967527453158407e-05, - "loss": 0.9374, + "epoch": 0.4479528105482304, + "grad_norm": 0.9392502903938293, + "learning_rate": 3.991166197028474e-05, + "loss": 0.6161, "step": 2582 }, { - "epoch": 0.9432170896476173, - "grad_norm": 1.339691162109375, - "learning_rate": 2.9662717168944343e-05, - "loss": 0.8953, + "epoch": 0.4481263011797363, + "grad_norm": 0.8567666411399841, + "learning_rate": 3.9911021155910355e-05, + "loss": 0.7667, "step": 2583 }, { - "epoch": 0.9435822530582436, - "grad_norm": 1.203379511833191, - "learning_rate": 2.9650154835224373e-05, - "loss": 0.8591, + "epoch": 0.4482997918112422, + "grad_norm": 0.8300613164901733, + "learning_rate": 3.9910378030838765e-05, + "loss": 0.8076, "step": 2584 }, { - "epoch": 0.9439474164688698, - "grad_norm": 0.9852968454360962, - "learning_rate": 2.963758753688697e-05, - "loss": 0.891, + "epoch": 0.4484732824427481, + "grad_norm": 1.1942838430404663, + "learning_rate": 3.99097325951446e-05, + "loss": 0.9919, "step": 2585 }, { - "epoch": 0.9443125798794961, - "grad_norm": 1.0267891883850098, - "learning_rate": 2.962501528039752e-05, - "loss": 0.8602, + "epoch": 0.448646773074254, + "grad_norm": 0.9984294176101685, + "learning_rate": 3.990908484890277e-05, + "loss": 0.7781, "step": 2586 }, { - "epoch": 0.9446777432901223, - "grad_norm": 0.9160406589508057, - "learning_rate": 2.9612438072223926e-05, - "loss": 0.9058, + "epoch": 0.4488202637057599, + "grad_norm": 3.65987491607666, + "learning_rate": 3.9908434792188443e-05, + "loss": 0.7017, "step": 2587 }, { - "epoch": 0.9450429067007485, - "grad_norm": 1.2831951379776, - "learning_rate": 2.9599855918836677e-05, - "loss": 0.8921, + "epoch": 0.4489937543372658, + "grad_norm": 0.8636199235916138, + "learning_rate": 3.990778242507707e-05, + "loss": 0.8438, "step": 2588 }, { - "epoch": 0.9454080701113748, - "grad_norm": 0.8434587717056274, - "learning_rate": 2.9587268826708774e-05, - "loss": 0.8582, + "epoch": 0.4491672449687717, + "grad_norm": 0.8812211751937866, + "learning_rate": 3.990712774764434e-05, + "loss": 0.8347, "step": 2589 }, { - "epoch": 0.945773233522001, - "grad_norm": 0.8247495889663696, - "learning_rate": 2.9574676802315775e-05, - "loss": 0.9181, + "epoch": 0.4493407356002776, + "grad_norm": 0.8458404541015625, + "learning_rate": 3.990647075996624e-05, + "loss": 0.884, "step": 2590 }, { - "epoch": 0.9461383969326274, - "grad_norm": 0.9670163989067078, - "learning_rate": 2.9562079852135767e-05, - "loss": 0.8804, + "epoch": 0.4495142262317835, + "grad_norm": 0.9072719812393188, + "learning_rate": 3.9905811462119014e-05, + "loss": 0.8074, "step": 2591 }, { - "epoch": 0.9465035603432536, - "grad_norm": 1.2330880165100098, - "learning_rate": 2.9549477982649372e-05, - "loss": 0.9354, + "epoch": 0.4496877168632894, + "grad_norm": 1.1364291906356812, + "learning_rate": 3.9905149854179174e-05, + "loss": 0.8831, "step": 2592 }, { - "epoch": 0.9468687237538799, - "grad_norm": 0.972687840461731, - "learning_rate": 2.9536871200339754e-05, - "loss": 0.7759, + "epoch": 0.4498612074947953, + "grad_norm": 0.9060369729995728, + "learning_rate": 3.990448593622349e-05, + "loss": 0.7261, "step": 2593 }, { - "epoch": 0.9472338871645061, - "grad_norm": 0.8301343321800232, - "learning_rate": 2.95242595116926e-05, - "loss": 0.8567, + "epoch": 0.4500346981263012, + "grad_norm": 1.0222080945968628, + "learning_rate": 3.990381970832903e-05, + "loss": 0.7429, "step": 2594 }, { - "epoch": 0.9475990505751324, - "grad_norm": 1.315301775932312, - "learning_rate": 2.951164292319611e-05, - "loss": 0.8679, + "epoch": 0.4502081887578071, + "grad_norm": 1.1342495679855347, + "learning_rate": 3.99031511705731e-05, + "loss": 0.9241, "step": 2595 }, { - "epoch": 0.9479642139857586, - "grad_norm": 1.0382542610168457, - "learning_rate": 2.9499021441341012e-05, - "loss": 0.8866, + "epoch": 0.45038167938931295, + "grad_norm": 1.0633578300476074, + "learning_rate": 3.9902480323033285e-05, + "loss": 0.9248, "step": 2596 }, { - "epoch": 0.9483293773963849, - "grad_norm": 1.329770565032959, - "learning_rate": 2.9486395072620552e-05, - "loss": 0.8896, + "epoch": 0.45055517002081885, + "grad_norm": 0.9833748936653137, + "learning_rate": 3.990180716578744e-05, + "loss": 0.7109, "step": 2597 }, { - "epoch": 0.9486945408070111, - "grad_norm": 1.0653737783432007, - "learning_rate": 2.947376382353049e-05, - "loss": 0.9115, + "epoch": 0.45072866065232475, + "grad_norm": 1.074702262878418, + "learning_rate": 3.990113169891367e-05, + "loss": 0.7524, "step": 2598 }, { - "epoch": 0.9490597042176374, - "grad_norm": 1.0917714834213257, - "learning_rate": 2.946112770056911e-05, - "loss": 0.9052, + "epoch": 0.45090215128383065, + "grad_norm": 1.1612343788146973, + "learning_rate": 3.990045392249039e-05, + "loss": 0.9114, "step": 2599 }, { - "epoch": 0.9494248676282636, - "grad_norm": 1.1067699193954468, - "learning_rate": 2.9448486710237173e-05, - "loss": 0.8773, + "epoch": 0.45107564191533656, + "grad_norm": 0.8593735694885254, + "learning_rate": 3.989977383659624e-05, + "loss": 0.7532, "step": 2600 }, { - "epoch": 0.9497900310388899, - "grad_norm": 1.1767078638076782, - "learning_rate": 2.943584085903797e-05, - "loss": 0.9468, + "epoch": 0.45124913254684246, + "grad_norm": 0.9816779494285583, + "learning_rate": 3.989909144131015e-05, + "loss": 0.8608, "step": 2601 }, { - "epoch": 0.9501551944495161, - "grad_norm": 1.1313189268112183, - "learning_rate": 2.942319015347728e-05, - "loss": 0.8685, + "epoch": 0.45142262317834836, + "grad_norm": 0.8141211867332458, + "learning_rate": 3.989840673671131e-05, + "loss": 0.8386, "step": 2602 }, { - "epoch": 0.9505203578601424, - "grad_norm": 1.0258231163024902, - "learning_rate": 2.9410534600063387e-05, - "loss": 0.8986, + "epoch": 0.45159611380985426, + "grad_norm": 0.9326730370521545, + "learning_rate": 3.9897719722879176e-05, + "loss": 0.8225, "step": 2603 }, { - "epoch": 0.9508855212707686, - "grad_norm": 1.2864134311676025, - "learning_rate": 2.939787420530706e-05, - "loss": 0.8948, + "epoch": 0.45176960444136016, + "grad_norm": 0.9008981585502625, + "learning_rate": 3.989703039989349e-05, + "loss": 0.874, "step": 2604 }, { - "epoch": 0.951250684681395, - "grad_norm": 1.177599549293518, - "learning_rate": 2.9385208975721568e-05, - "loss": 0.8768, + "epoch": 0.45194309507286606, + "grad_norm": 1.2505275011062622, + "learning_rate": 3.989633876783423e-05, + "loss": 0.8811, "step": 2605 }, { - "epoch": 0.9516158480920212, - "grad_norm": 0.9371570944786072, - "learning_rate": 2.9372538917822666e-05, - "loss": 0.915, + "epoch": 0.45211658570437196, + "grad_norm": 1.3901854753494263, + "learning_rate": 3.989564482678168e-05, + "loss": 0.7153, "step": 2606 }, { - "epoch": 0.9519810115026475, - "grad_norm": 1.1968104839324951, - "learning_rate": 2.9359864038128586e-05, - "loss": 0.8707, + "epoch": 0.45229007633587787, + "grad_norm": 1.5285706520080566, + "learning_rate": 3.9894948576816374e-05, + "loss": 0.6602, "step": 2607 }, { - "epoch": 0.9523461749132737, - "grad_norm": 1.1708532571792603, - "learning_rate": 2.934718434316005e-05, - "loss": 0.8682, + "epoch": 0.45246356696738377, + "grad_norm": 1.2756774425506592, + "learning_rate": 3.989425001801909e-05, + "loss": 0.6237, "step": 2608 }, { - "epoch": 0.9527113383239, - "grad_norm": 1.0788037776947021, - "learning_rate": 2.933449983944024e-05, - "loss": 0.8999, + "epoch": 0.45263705759888967, + "grad_norm": 1.2169909477233887, + "learning_rate": 3.9893549150470925e-05, + "loss": 0.7815, "step": 2609 }, { - "epoch": 0.9530765017345262, - "grad_norm": 0.8838430643081665, - "learning_rate": 2.932181053349484e-05, - "loss": 0.875, + "epoch": 0.45281054823039557, + "grad_norm": 1.498103380203247, + "learning_rate": 3.9892845974253184e-05, + "loss": 0.8733, "step": 2610 }, { - "epoch": 0.9534416651451525, - "grad_norm": 1.1632007360458374, - "learning_rate": 2.930911643185198e-05, - "loss": 0.8906, + "epoch": 0.4529840388619015, + "grad_norm": 0.8378575444221497, + "learning_rate": 3.989214048944749e-05, + "loss": 0.825, "step": 2611 }, { - "epoch": 0.9538068285557787, - "grad_norm": 1.1042110919952393, - "learning_rate": 2.9296417541042267e-05, - "loss": 0.8938, + "epoch": 0.4531575294934074, + "grad_norm": 1.0778298377990723, + "learning_rate": 3.989143269613572e-05, + "loss": 0.8184, "step": 2612 }, { - "epoch": 0.954171991966405, - "grad_norm": 4.797541618347168, - "learning_rate": 2.928371386759877e-05, - "loss": 0.902, + "epoch": 0.4533310201249133, + "grad_norm": 0.9333769679069519, + "learning_rate": 3.9890722594400005e-05, + "loss": 0.9226, "step": 2613 }, { - "epoch": 0.9545371553770312, - "grad_norm": 1.1516057252883911, - "learning_rate": 2.9271005418057023e-05, - "loss": 0.8757, + "epoch": 0.4535045107564192, + "grad_norm": 1.0471779108047485, + "learning_rate": 3.989001018432276e-05, + "loss": 0.7349, "step": 2614 }, { - "epoch": 0.9549023187876575, - "grad_norm": 0.8867661356925964, - "learning_rate": 2.9258292198955004e-05, - "loss": 0.8456, + "epoch": 0.4536780013879251, + "grad_norm": 1.0787594318389893, + "learning_rate": 3.988929546598665e-05, + "loss": 0.9045, "step": 2615 }, { - "epoch": 0.9552674821982837, - "grad_norm": 1.2563271522521973, - "learning_rate": 2.924557421683317e-05, - "loss": 0.8649, + "epoch": 0.4538514920194309, + "grad_norm": 1.1247055530548096, + "learning_rate": 3.988857843947463e-05, + "loss": 0.6692, "step": 2616 }, { - "epoch": 0.95563264560891, - "grad_norm": 1.2422395944595337, - "learning_rate": 2.9232851478234397e-05, - "loss": 0.8438, + "epoch": 0.4540249826509368, + "grad_norm": 0.8627386093139648, + "learning_rate": 3.9887859104869905e-05, + "loss": 0.6399, "step": 2617 }, { - "epoch": 0.9559978090195362, - "grad_norm": 1.1696786880493164, - "learning_rate": 2.9220123989704034e-05, - "loss": 0.9037, + "epoch": 0.4541984732824427, + "grad_norm": 1.1878464221954346, + "learning_rate": 3.988713746225596e-05, + "loss": 0.8218, "step": 2618 }, { - "epoch": 0.9563629724301626, - "grad_norm": 1.2376277446746826, - "learning_rate": 2.9207391757789857e-05, - "loss": 0.8622, + "epoch": 0.45437196391394863, + "grad_norm": 0.8698839545249939, + "learning_rate": 3.988641351171653e-05, + "loss": 0.9141, "step": 2619 }, { - "epoch": 0.9567281358407888, - "grad_norm": 1.0720192193984985, - "learning_rate": 2.9194654789042088e-05, - "loss": 0.8488, + "epoch": 0.45454545454545453, + "grad_norm": 0.832637369632721, + "learning_rate": 3.988568725333565e-05, + "loss": 0.9197, "step": 2620 }, { - "epoch": 0.957093299251415, - "grad_norm": 1.3359063863754272, - "learning_rate": 2.9181913090013386e-05, - "loss": 0.8873, + "epoch": 0.45471894517696043, + "grad_norm": 0.9984008073806763, + "learning_rate": 3.988495868719759e-05, + "loss": 0.7366, "step": 2621 }, { - "epoch": 0.9574584626620413, - "grad_norm": 1.438002586364746, - "learning_rate": 2.9169166667258856e-05, - "loss": 0.8971, + "epoch": 0.45489243580846633, + "grad_norm": 0.848672091960907, + "learning_rate": 3.98842278133869e-05, + "loss": 0.8232, "step": 2622 }, { - "epoch": 0.9578236260726675, - "grad_norm": 1.042914628982544, - "learning_rate": 2.9156415527336003e-05, - "loss": 0.8476, + "epoch": 0.45506592643997223, + "grad_norm": 1.0701887607574463, + "learning_rate": 3.988349463198841e-05, + "loss": 0.7686, "step": 2623 }, { - "epoch": 0.9581887894832938, - "grad_norm": 1.23924720287323, - "learning_rate": 2.9143659676804788e-05, - "loss": 0.8823, + "epoch": 0.45523941707147814, + "grad_norm": 0.836147129535675, + "learning_rate": 3.9882759143087194e-05, + "loss": 0.7947, "step": 2624 }, { - "epoch": 0.95855395289392, - "grad_norm": 1.186662197113037, - "learning_rate": 2.9130899122227583e-05, - "loss": 0.9214, + "epoch": 0.45541290770298404, + "grad_norm": 1.2238225936889648, + "learning_rate": 3.9882021346768613e-05, + "loss": 0.8101, "step": 2625 }, { - "epoch": 0.9589191163045463, - "grad_norm": 1.0803686380386353, - "learning_rate": 2.911813387016918e-05, - "loss": 0.8855, + "epoch": 0.45558639833448994, + "grad_norm": 0.8200496435165405, + "learning_rate": 3.9881281243118285e-05, + "loss": 0.8088, "step": 2626 }, { - "epoch": 0.9592842797151725, - "grad_norm": 1.6229324340820312, - "learning_rate": 2.910536392719679e-05, - "loss": 0.9313, + "epoch": 0.45575988896599584, + "grad_norm": 0.9634228348731995, + "learning_rate": 3.9880538832222105e-05, + "loss": 0.9187, "step": 2627 }, { - "epoch": 0.9596494431257988, - "grad_norm": 1.0317401885986328, - "learning_rate": 2.9092589299880028e-05, - "loss": 0.8463, + "epoch": 0.45593337959750174, + "grad_norm": 1.1180576086044312, + "learning_rate": 3.987979411416623e-05, + "loss": 0.6926, "step": 2628 }, { - "epoch": 0.960014606536425, - "grad_norm": 1.250490427017212, - "learning_rate": 2.9079809994790937e-05, - "loss": 0.8845, + "epoch": 0.45610687022900764, + "grad_norm": 1.861584186553955, + "learning_rate": 3.987904708903708e-05, + "loss": 0.8896, "step": 2629 }, { - "epoch": 0.9603797699470513, - "grad_norm": 1.1076995134353638, - "learning_rate": 2.9067026018503956e-05, - "loss": 0.8801, + "epoch": 0.45628036086051355, + "grad_norm": 4.076051235198975, + "learning_rate": 3.987829775692135e-05, + "loss": 0.7197, "step": 2630 }, { - "epoch": 0.9607449333576775, - "grad_norm": 1.2672418355941772, - "learning_rate": 2.9054237377595926e-05, - "loss": 0.8997, + "epoch": 0.45645385149201945, + "grad_norm": 0.8663637638092041, + "learning_rate": 3.987754611790601e-05, + "loss": 0.8093, "step": 2631 }, { - "epoch": 0.9611100967683038, - "grad_norm": 0.9533954858779907, - "learning_rate": 2.904144407864609e-05, - "loss": 0.8726, + "epoch": 0.45662734212352535, + "grad_norm": 1.2111930847167969, + "learning_rate": 3.987679217207827e-05, + "loss": 0.6992, "step": 2632 }, { - "epoch": 0.96147526017893, - "grad_norm": 1.1799991130828857, - "learning_rate": 2.9028646128236083e-05, - "loss": 0.9055, + "epoch": 0.45680083275503125, + "grad_norm": 0.8179115056991577, + "learning_rate": 3.9876035919525644e-05, + "loss": 0.8108, "step": 2633 }, { - "epoch": 0.9618404235895563, - "grad_norm": 1.2244930267333984, - "learning_rate": 2.901584353294994e-05, - "loss": 0.9055, + "epoch": 0.45697432338653715, + "grad_norm": 1.0579832792282104, + "learning_rate": 3.987527736033589e-05, + "loss": 0.6746, "step": 2634 }, { - "epoch": 0.9622055870001825, - "grad_norm": 1.393638253211975, - "learning_rate": 2.9003036299374083e-05, - "loss": 0.9365, + "epoch": 0.45714781401804305, + "grad_norm": 0.8458178043365479, + "learning_rate": 3.9874516494597035e-05, + "loss": 0.769, "step": 2635 }, { - "epoch": 0.9625707504108089, - "grad_norm": 1.0351239442825317, - "learning_rate": 2.899022443409732e-05, - "loss": 0.881, + "epoch": 0.4573213046495489, + "grad_norm": 0.8525532484054565, + "learning_rate": 3.987375332239739e-05, + "loss": 0.7795, "step": 2636 }, { - "epoch": 0.9629359138214351, - "grad_norm": 0.8210397362709045, - "learning_rate": 2.8977407943710846e-05, - "loss": 0.8839, + "epoch": 0.4574947952810548, + "grad_norm": 2.4828004837036133, + "learning_rate": 3.9872987843825505e-05, + "loss": 0.6592, "step": 2637 }, { - "epoch": 0.9633010772320614, - "grad_norm": 1.0315479040145874, - "learning_rate": 2.8964586834808214e-05, - "loss": 0.9174, + "epoch": 0.4576682859125607, + "grad_norm": 0.9502774477005005, + "learning_rate": 3.9872220058970226e-05, + "loss": 0.8464, "step": 2638 }, { - "epoch": 0.9636662406426876, - "grad_norm": 1.1998043060302734, - "learning_rate": 2.8951761113985393e-05, - "loss": 0.9465, + "epoch": 0.4578417765440666, + "grad_norm": 0.7002145051956177, + "learning_rate": 3.9871449967920656e-05, + "loss": 0.7268, "step": 2639 }, { - "epoch": 0.9640314040533139, - "grad_norm": 1.2388027906417847, - "learning_rate": 2.8938930787840683e-05, - "loss": 0.873, + "epoch": 0.4580152671755725, + "grad_norm": 0.920948326587677, + "learning_rate": 3.9870677570766167e-05, + "loss": 0.7715, "step": 2640 }, { - "epoch": 0.9643965674639401, - "grad_norm": 1.1343432664871216, - "learning_rate": 2.8926095862974782e-05, - "loss": 0.9102, + "epoch": 0.4581887578070784, + "grad_norm": 0.7829631567001343, + "learning_rate": 3.986990286759639e-05, + "loss": 0.8843, "step": 2641 }, { - "epoch": 0.9647617308745664, - "grad_norm": 0.9101622104644775, - "learning_rate": 2.8913256345990746e-05, - "loss": 0.9011, + "epoch": 0.4583622484385843, + "grad_norm": 1.1254407167434692, + "learning_rate": 3.986912585850123e-05, + "loss": 0.7743, "step": 2642 }, { - "epoch": 0.9651268942851926, - "grad_norm": 1.0069020986557007, - "learning_rate": 2.890041224349398e-05, - "loss": 0.8903, + "epoch": 0.4585357390700902, + "grad_norm": 1.2735583782196045, + "learning_rate": 3.986834654357086e-05, + "loss": 0.6952, "step": 2643 }, { - "epoch": 0.9654920576958189, - "grad_norm": 1.137132167816162, - "learning_rate": 2.888756356209227e-05, - "loss": 0.8945, + "epoch": 0.4587092297015961, + "grad_norm": 1.2347609996795654, + "learning_rate": 3.9867564922895724e-05, + "loss": 0.6537, "step": 2644 }, { - "epoch": 0.9658572211064451, - "grad_norm": 1.0776578187942505, - "learning_rate": 2.8874710308395745e-05, - "loss": 0.8548, + "epoch": 0.458882720333102, + "grad_norm": 1.01522696018219, + "learning_rate": 3.9866780996566525e-05, + "loss": 0.8855, "step": 2645 }, { - "epoch": 0.9662223845170714, - "grad_norm": 0.8349401950836182, - "learning_rate": 2.8861852489016882e-05, - "loss": 0.9088, + "epoch": 0.4590562109646079, + "grad_norm": 1.362505316734314, + "learning_rate": 3.986599476467425e-05, + "loss": 0.9211, "step": 2646 }, { - "epoch": 0.9665875479276976, - "grad_norm": 0.9649350047111511, - "learning_rate": 2.884899011057052e-05, - "loss": 0.8599, + "epoch": 0.4592297015961138, + "grad_norm": 0.9765545725822449, + "learning_rate": 3.986520622731012e-05, + "loss": 0.748, "step": 2647 }, { - "epoch": 0.966952711338324, - "grad_norm": 1.1331411600112915, - "learning_rate": 2.8836123179673828e-05, - "loss": 0.8829, + "epoch": 0.4594031922276197, + "grad_norm": 0.8034124374389648, + "learning_rate": 3.9864415384565675e-05, + "loss": 0.8318, "step": 2648 }, { - "epoch": 0.9673178747489501, - "grad_norm": 1.1683356761932373, - "learning_rate": 2.882325170294634e-05, - "loss": 0.9039, + "epoch": 0.4595766828591256, + "grad_norm": 0.9476749897003174, + "learning_rate": 3.986362223653267e-05, + "loss": 0.8813, "step": 2649 }, { - "epoch": 0.9676830381595765, - "grad_norm": 1.1856131553649902, - "learning_rate": 2.88103756870099e-05, - "loss": 0.8998, + "epoch": 0.4597501734906315, + "grad_norm": 0.910181999206543, + "learning_rate": 3.986282678330316e-05, + "loss": 0.998, "step": 2650 }, { - "epoch": 0.9680482015702027, - "grad_norm": 1.2359143495559692, - "learning_rate": 2.879749513848871e-05, - "loss": 0.9293, + "epoch": 0.4599236641221374, + "grad_norm": 1.4199895858764648, + "learning_rate": 3.9862029024969466e-05, + "loss": 0.7061, "step": 2651 }, { - "epoch": 0.968413364980829, - "grad_norm": 1.0429662466049194, - "learning_rate": 2.8784610064009297e-05, - "loss": 0.8976, + "epoch": 0.4600971547536433, + "grad_norm": 0.7885187268257141, + "learning_rate": 3.9861228961624146e-05, + "loss": 0.7332, "step": 2652 }, { - "epoch": 0.9687785283914552, - "grad_norm": 1.383043646812439, - "learning_rate": 2.87717204702005e-05, - "loss": 0.8563, + "epoch": 0.4602706453851492, + "grad_norm": 1.060520052909851, + "learning_rate": 3.986042659336007e-05, + "loss": 0.7383, "step": 2653 }, { - "epoch": 0.9691436918020815, - "grad_norm": 1.1264389753341675, - "learning_rate": 2.8758826363693516e-05, - "loss": 0.9354, + "epoch": 0.4604441360166551, + "grad_norm": 0.7739086151123047, + "learning_rate": 3.985962192027034e-05, + "loss": 0.7163, "step": 2654 }, { - "epoch": 0.9695088552127077, - "grad_norm": 1.9514962434768677, - "learning_rate": 2.8745927751121834e-05, - "loss": 0.9, + "epoch": 0.46061762664816097, + "grad_norm": 0.7051017880439758, + "learning_rate": 3.985881494244835e-05, + "loss": 0.7515, "step": 2655 }, { - "epoch": 0.9698740186233339, - "grad_norm": 1.429307222366333, - "learning_rate": 2.8733024639121283e-05, - "loss": 0.8707, + "epoch": 0.4607911172796669, + "grad_norm": 0.7917957901954651, + "learning_rate": 3.985800565998775e-05, + "loss": 0.8267, "step": 2656 }, { - "epoch": 0.9702391820339602, - "grad_norm": 1.425366997718811, - "learning_rate": 2.8720117034329984e-05, - "loss": 0.9064, + "epoch": 0.4609646079111728, + "grad_norm": 1.1403555870056152, + "learning_rate": 3.9857194072982445e-05, + "loss": 0.663, "step": 2657 }, { - "epoch": 0.9706043454445864, - "grad_norm": 1.050869345664978, - "learning_rate": 2.8707204943388386e-05, - "loss": 0.8531, + "epoch": 0.4611380985426787, + "grad_norm": 0.8834183812141418, + "learning_rate": 3.9856380181526634e-05, + "loss": 0.7725, "step": 2658 }, { - "epoch": 0.9709695088552127, - "grad_norm": 1.1377168893814087, - "learning_rate": 2.8694288372939243e-05, - "loss": 0.9076, + "epoch": 0.4613115891741846, + "grad_norm": 0.9769955277442932, + "learning_rate": 3.985556398571476e-05, + "loss": 0.6481, "step": 2659 }, { - "epoch": 0.9713346722658389, - "grad_norm": 1.217653751373291, - "learning_rate": 2.8681367329627617e-05, - "loss": 0.9126, + "epoch": 0.4614850798056905, + "grad_norm": 0.8712765574455261, + "learning_rate": 3.9854745485641556e-05, + "loss": 0.7585, "step": 2660 }, { - "epoch": 0.9716998356764652, - "grad_norm": 0.9770109057426453, - "learning_rate": 2.8668441820100857e-05, - "loss": 0.906, + "epoch": 0.4616585704371964, + "grad_norm": 1.0707367658615112, + "learning_rate": 3.9853924681402e-05, + "loss": 0.7087, "step": 2661 }, { - "epoch": 0.9720649990870914, - "grad_norm": 1.1983699798583984, - "learning_rate": 2.865551185100863e-05, - "loss": 0.8807, + "epoch": 0.4618320610687023, + "grad_norm": 0.9707021713256836, + "learning_rate": 3.985310157309135e-05, + "loss": 0.7192, "step": 2662 }, { - "epoch": 0.9724301624977177, - "grad_norm": 1.0460134744644165, - "learning_rate": 2.864257742900287e-05, - "loss": 0.8708, + "epoch": 0.4620055517002082, + "grad_norm": 1.2150181531906128, + "learning_rate": 3.985227616080513e-05, + "loss": 0.7166, "step": 2663 }, { - "epoch": 0.9727953259083439, - "grad_norm": 1.4826875925064087, - "learning_rate": 2.8629638560737832e-05, - "loss": 0.8618, + "epoch": 0.4621790423317141, + "grad_norm": 0.8197634816169739, + "learning_rate": 3.985144844463913e-05, + "loss": 0.8711, "step": 2664 }, { - "epoch": 0.9731604893189703, - "grad_norm": 0.9148849248886108, - "learning_rate": 2.8616695252870044e-05, - "loss": 0.903, + "epoch": 0.46235253296322, + "grad_norm": 0.9485721588134766, + "learning_rate": 3.985061842468941e-05, + "loss": 0.7213, "step": 2665 }, { - "epoch": 0.9735256527295965, - "grad_norm": 1.0338202714920044, - "learning_rate": 2.8603747512058312e-05, - "loss": 0.9015, + "epoch": 0.4625260235947259, + "grad_norm": 1.0045021772384644, + "learning_rate": 3.9849786101052285e-05, + "loss": 0.743, "step": 2666 }, { - "epoch": 0.9738908161402228, - "grad_norm": 1.1524542570114136, - "learning_rate": 2.859079534496373e-05, - "loss": 0.8573, + "epoch": 0.4626995142262318, + "grad_norm": 1.8938586711883545, + "learning_rate": 3.984895147382435e-05, + "loss": 0.7593, "step": 2667 }, { - "epoch": 0.974255979550849, - "grad_norm": 1.0691535472869873, - "learning_rate": 2.8577838758249674e-05, - "loss": 0.8676, + "epoch": 0.4628730048577377, + "grad_norm": 1.5645906925201416, + "learning_rate": 3.984811454310248e-05, + "loss": 0.7837, "step": 2668 }, { - "epoch": 0.9746211429614753, - "grad_norm": 1.4684078693389893, - "learning_rate": 2.8564877758581784e-05, - "loss": 0.9064, + "epoch": 0.4630464954892436, + "grad_norm": 0.9002296924591064, + "learning_rate": 3.984727530898378e-05, + "loss": 0.7461, "step": 2669 }, { - "epoch": 0.9749863063721015, - "grad_norm": 1.3998236656188965, - "learning_rate": 2.855191235262797e-05, - "loss": 0.9093, + "epoch": 0.4632199861207495, + "grad_norm": 0.9177200198173523, + "learning_rate": 3.9846433771565655e-05, + "loss": 0.8318, "step": 2670 }, { - "epoch": 0.9753514697827278, - "grad_norm": 1.2197835445404053, - "learning_rate": 2.8538942547058425e-05, - "loss": 0.892, + "epoch": 0.4633934767522554, + "grad_norm": 1.5229835510253906, + "learning_rate": 3.9845589930945764e-05, + "loss": 0.7976, "step": 2671 }, { - "epoch": 0.975716633193354, - "grad_norm": 0.980048418045044, - "learning_rate": 2.8525968348545574e-05, - "loss": 0.8704, + "epoch": 0.4635669673837613, + "grad_norm": 0.8550512790679932, + "learning_rate": 3.9844743787222046e-05, + "loss": 0.8572, "step": 2672 }, { - "epoch": 0.9760817966039803, - "grad_norm": 1.0013681650161743, - "learning_rate": 2.8512989763764146e-05, - "loss": 0.8854, + "epoch": 0.4637404580152672, + "grad_norm": 0.8087087869644165, + "learning_rate": 3.984389534049268e-05, + "loss": 0.7815, "step": 2673 }, { - "epoch": 0.9764469600146065, - "grad_norm": 1.1448907852172852, - "learning_rate": 2.850000679939108e-05, - "loss": 0.9064, + "epoch": 0.4639139486467731, + "grad_norm": 2.0529415607452393, + "learning_rate": 3.984304459085614e-05, + "loss": 0.7454, "step": 2674 }, { - "epoch": 0.9768121234252328, - "grad_norm": 0.898410439491272, - "learning_rate": 2.8487019462105606e-05, - "loss": 0.8834, + "epoch": 0.46408743927827895, + "grad_norm": 1.3661733865737915, + "learning_rate": 3.9842191538411145e-05, + "loss": 0.824, "step": 2675 }, { - "epoch": 0.977177286835859, - "grad_norm": 1.2168394327163696, - "learning_rate": 2.847402775858918e-05, - "loss": 0.9015, + "epoch": 0.46426092990978485, + "grad_norm": 0.9027232527732849, + "learning_rate": 3.984133618325671e-05, + "loss": 0.8318, "step": 2676 }, { - "epoch": 0.9775424502464853, - "grad_norm": 1.206009864807129, - "learning_rate": 2.846103169552551e-05, - "loss": 0.8692, + "epoch": 0.46443442054129075, + "grad_norm": 1.1154446601867676, + "learning_rate": 3.984047852549209e-05, + "loss": 0.679, "step": 2677 }, { - "epoch": 0.9779076136571115, - "grad_norm": 1.085206151008606, - "learning_rate": 2.844803127960056e-05, - "loss": 0.9082, + "epoch": 0.46460791117279665, + "grad_norm": 0.8295098543167114, + "learning_rate": 3.983961856521682e-05, + "loss": 0.8467, "step": 2678 }, { - "epoch": 0.9782727770677379, - "grad_norm": 0.9082009196281433, - "learning_rate": 2.843502651750251e-05, - "loss": 0.8716, + "epoch": 0.46478140180430255, + "grad_norm": 1.0528907775878906, + "learning_rate": 3.983875630253069e-05, + "loss": 0.8569, "step": 2679 }, { - "epoch": 0.978637940478364, - "grad_norm": 0.9145800471305847, - "learning_rate": 2.8422017415921793e-05, - "loss": 0.9061, + "epoch": 0.46495489243580845, + "grad_norm": 0.7411402463912964, + "learning_rate": 3.983789173753378e-05, + "loss": 0.8696, "step": 2680 }, { - "epoch": 0.9790031038889904, - "grad_norm": 1.0284546613693237, - "learning_rate": 2.840900398155107e-05, - "loss": 0.8794, + "epoch": 0.46512838306731435, + "grad_norm": 1.1203161478042603, + "learning_rate": 3.9837024870326425e-05, + "loss": 0.7935, "step": 2681 }, { - "epoch": 0.9793682672996166, - "grad_norm": 0.9090563058853149, - "learning_rate": 2.839598622108523e-05, - "loss": 0.9373, + "epoch": 0.46530187369882026, + "grad_norm": 1.0621238946914673, + "learning_rate": 3.983615570100921e-05, + "loss": 0.6951, "step": 2682 }, { - "epoch": 0.9797334307102429, - "grad_norm": 1.0278183221817017, - "learning_rate": 2.8382964141221396e-05, - "loss": 0.8918, + "epoch": 0.46547536433032616, + "grad_norm": 0.8231312036514282, + "learning_rate": 3.983528422968301e-05, + "loss": 0.8584, "step": 2683 }, { - "epoch": 0.9800985941208691, - "grad_norm": 1.3192719221115112, - "learning_rate": 2.8369937748658892e-05, - "loss": 0.8644, + "epoch": 0.46564885496183206, + "grad_norm": 1.6299453973770142, + "learning_rate": 3.9834410456448966e-05, + "loss": 0.7529, "step": 2684 }, { - "epoch": 0.9804637575314954, - "grad_norm": 1.1484616994857788, - "learning_rate": 2.8356907050099284e-05, - "loss": 0.9446, + "epoch": 0.46582234559333796, + "grad_norm": 2.104614019393921, + "learning_rate": 3.983353438140848e-05, + "loss": 0.8833, "step": 2685 }, { - "epoch": 0.9808289209421216, - "grad_norm": 1.1969355344772339, - "learning_rate": 2.834387205224634e-05, - "loss": 0.933, + "epoch": 0.46599583622484386, + "grad_norm": 0.7799380421638489, + "learning_rate": 3.983265600466321e-05, + "loss": 0.8425, "step": 2686 }, { - "epoch": 0.9811940843527479, - "grad_norm": 2.055298089981079, - "learning_rate": 2.833083276180604e-05, - "loss": 0.9356, + "epoch": 0.46616932685634976, + "grad_norm": 0.8442371487617493, + "learning_rate": 3.983177532631511e-05, + "loss": 0.8452, "step": 2687 }, { - "epoch": 0.9815592477633741, - "grad_norm": 1.0452311038970947, - "learning_rate": 2.8317789185486587e-05, - "loss": 0.9103, + "epoch": 0.46634281748785567, + "grad_norm": 1.0199551582336426, + "learning_rate": 3.983089234646637e-05, + "loss": 0.7932, "step": 2688 }, { - "epoch": 0.9819244111740003, - "grad_norm": 1.0692678689956665, - "learning_rate": 2.8304741329998364e-05, - "loss": 0.9175, + "epoch": 0.46651630811936157, + "grad_norm": 1.1247429847717285, + "learning_rate": 3.9830007065219466e-05, + "loss": 0.8179, "step": 2689 }, { - "epoch": 0.9822895745846266, - "grad_norm": 1.2027676105499268, - "learning_rate": 2.829168920205398e-05, - "loss": 0.8898, + "epoch": 0.46668979875086747, + "grad_norm": 1.2893389463424683, + "learning_rate": 3.9829119482677144e-05, + "loss": 0.8339, "step": 2690 }, { - "epoch": 0.9826547379952528, - "grad_norm": 1.1173428297042847, - "learning_rate": 2.8278632808368222e-05, - "loss": 0.8964, + "epoch": 0.46686328938237337, + "grad_norm": 1.1685383319854736, + "learning_rate": 3.982822959894239e-05, + "loss": 0.8409, "step": 2691 }, { - "epoch": 0.9830199014058791, - "grad_norm": 1.172051191329956, - "learning_rate": 2.826557215565809e-05, - "loss": 0.84, + "epoch": 0.46703678001387927, + "grad_norm": 0.9392901062965393, + "learning_rate": 3.9827337414118486e-05, + "loss": 0.7717, "step": 2692 }, { - "epoch": 0.9833850648165053, - "grad_norm": 1.8154791593551636, - "learning_rate": 2.825250725064275e-05, - "loss": 0.8846, + "epoch": 0.4672102706453852, + "grad_norm": 0.9655500650405884, + "learning_rate": 3.9826442928308974e-05, + "loss": 0.701, "step": 2693 }, { - "epoch": 0.9837502282271317, - "grad_norm": 1.2368260622024536, - "learning_rate": 2.8239438100043597e-05, - "loss": 0.8865, + "epoch": 0.4673837612768911, + "grad_norm": 1.0029101371765137, + "learning_rate": 3.982554614161766e-05, + "loss": 0.8445, "step": 2694 }, { - "epoch": 0.9841153916377579, - "grad_norm": 1.0890463590621948, - "learning_rate": 2.822636471058416e-05, - "loss": 0.8735, + "epoch": 0.4675572519083969, + "grad_norm": 0.8814912438392639, + "learning_rate": 3.9824647054148614e-05, + "loss": 0.7126, "step": 2695 }, { - "epoch": 0.9844805550483842, - "grad_norm": 1.8456718921661377, - "learning_rate": 2.8213287088990184e-05, - "loss": 0.8859, + "epoch": 0.4677307425399028, + "grad_norm": 1.2653889656066895, + "learning_rate": 3.9823745666006176e-05, + "loss": 0.6781, "step": 2696 }, { - "epoch": 0.9848457184590104, - "grad_norm": 1.01983642578125, - "learning_rate": 2.8200205241989583e-05, - "loss": 0.8776, + "epoch": 0.4679042331714087, + "grad_norm": 1.129898190498352, + "learning_rate": 3.982284197729496e-05, + "loss": 0.7041, "step": 2697 }, { - "epoch": 0.9852108818696367, - "grad_norm": 1.2972252368927002, - "learning_rate": 2.818711917631243e-05, - "loss": 0.8623, + "epoch": 0.4680777238029146, + "grad_norm": 1.197079062461853, + "learning_rate": 3.982193598811983e-05, + "loss": 0.7078, "step": 2698 }, { - "epoch": 0.9855760452802629, - "grad_norm": 1.146558165550232, - "learning_rate": 2.8174028898690998e-05, - "loss": 0.8981, + "epoch": 0.4682512144344205, + "grad_norm": 1.2100567817687988, + "learning_rate": 3.982102769858593e-05, + "loss": 0.8132, "step": 2699 }, { - "epoch": 0.9859412086908892, - "grad_norm": 0.9103665351867676, - "learning_rate": 2.81609344158597e-05, - "loss": 0.8615, + "epoch": 0.46842470506592643, + "grad_norm": 1.2843912839889526, + "learning_rate": 3.9820117108798666e-05, + "loss": 0.626, "step": 2700 }, { - "epoch": 0.9863063721015154, - "grad_norm": 1.154638648033142, - "learning_rate": 2.8147835734555114e-05, - "loss": 0.8776, + "epoch": 0.46859819569743233, + "grad_norm": 0.9979259371757507, + "learning_rate": 3.981920421886372e-05, + "loss": 0.6401, "step": 2701 }, { - "epoch": 0.9866715355121417, - "grad_norm": 1.4782180786132812, - "learning_rate": 2.813473286151601e-05, - "loss": 0.908, + "epoch": 0.46877168632893823, + "grad_norm": 1.043851613998413, + "learning_rate": 3.981828902888704e-05, + "loss": 0.7441, "step": 2702 }, { - "epoch": 0.9870366989227679, - "grad_norm": 1.1068899631500244, - "learning_rate": 2.8121625803483264e-05, - "loss": 0.876, + "epoch": 0.46894517696044413, + "grad_norm": 1.5754739046096802, + "learning_rate": 3.981737153897481e-05, + "loss": 0.9185, "step": 2703 }, { - "epoch": 0.9874018623333942, - "grad_norm": 1.3221932649612427, - "learning_rate": 2.810851456719995e-05, - "loss": 0.8977, + "epoch": 0.46911866759195003, + "grad_norm": 0.9061071276664734, + "learning_rate": 3.981645174923353e-05, + "loss": 0.7251, "step": 2704 }, { - "epoch": 0.9877670257440204, - "grad_norm": 1.494868278503418, - "learning_rate": 2.8095399159411258e-05, - "loss": 0.8717, + "epoch": 0.46929215822345594, + "grad_norm": 1.3001538515090942, + "learning_rate": 3.981552965976993e-05, + "loss": 0.645, "step": 2705 }, { - "epoch": 0.9881321891546467, - "grad_norm": 1.067365050315857, - "learning_rate": 2.8082279586864548e-05, - "loss": 0.8735, + "epoch": 0.46946564885496184, + "grad_norm": 1.3982876539230347, + "learning_rate": 3.9814605270691025e-05, + "loss": 0.7014, "step": 2706 }, { - "epoch": 0.9884973525652729, - "grad_norm": 0.7983874082565308, - "learning_rate": 2.8069155856309316e-05, - "loss": 0.8569, + "epoch": 0.46963913948646774, + "grad_norm": 1.0255459547042847, + "learning_rate": 3.9813678582104095e-05, + "loss": 0.7334, "step": 2707 }, { - "epoch": 0.9888625159758992, - "grad_norm": 1.1638718843460083, - "learning_rate": 2.805602797449719e-05, - "loss": 0.9066, + "epoch": 0.46981263011797364, + "grad_norm": 1.1063698530197144, + "learning_rate": 3.981274959411667e-05, + "loss": 0.7607, "step": 2708 }, { - "epoch": 0.9892276793865254, - "grad_norm": 1.1207901239395142, - "learning_rate": 2.8042895948181944e-05, - "loss": 0.9291, + "epoch": 0.46998612074947954, + "grad_norm": 1.3352556228637695, + "learning_rate": 3.981181830683657e-05, + "loss": 0.8004, "step": 2709 }, { - "epoch": 0.9895928427971518, - "grad_norm": 1.0904790163040161, - "learning_rate": 2.8029759784119465e-05, - "loss": 0.8696, + "epoch": 0.47015961138098544, + "grad_norm": 0.9325754642486572, + "learning_rate": 3.9810884720371874e-05, + "loss": 0.9329, "step": 2710 }, { - "epoch": 0.989958006207778, - "grad_norm": 0.9803836345672607, - "learning_rate": 2.80166194890678e-05, - "loss": 0.936, + "epoch": 0.47033310201249134, + "grad_norm": 0.8515332341194153, + "learning_rate": 3.9809948834830914e-05, + "loss": 0.8162, "step": 2711 }, { - "epoch": 0.9903231696184043, - "grad_norm": 0.8458499312400818, - "learning_rate": 2.8003475069787084e-05, - "loss": 0.9071, + "epoch": 0.47050659264399725, + "grad_norm": 1.1165862083435059, + "learning_rate": 3.980901065032232e-05, + "loss": 0.7695, "step": 2712 }, { - "epoch": 0.9906883330290305, - "grad_norm": 1.0386416912078857, - "learning_rate": 2.799032653303961e-05, - "loss": 0.8955, + "epoch": 0.47068008327550315, + "grad_norm": 0.95283442735672, + "learning_rate": 3.9808070166954945e-05, + "loss": 0.7998, "step": 2713 }, { - "epoch": 0.9910534964396568, - "grad_norm": 1.120476245880127, - "learning_rate": 2.7977173885589768e-05, - "loss": 0.8973, + "epoch": 0.47085357390700905, + "grad_norm": 2.6334545612335205, + "learning_rate": 3.9807127384837955e-05, + "loss": 0.7607, "step": 2714 }, { - "epoch": 0.991418659850283, - "grad_norm": 0.8963499069213867, - "learning_rate": 2.796401713420406e-05, - "loss": 0.8636, + "epoch": 0.4710270645385149, + "grad_norm": 1.2495638132095337, + "learning_rate": 3.980618230408075e-05, + "loss": 0.7366, "step": 2715 }, { - "epoch": 0.9917838232609093, - "grad_norm": 0.9619870185852051, - "learning_rate": 2.7950856285651124e-05, - "loss": 0.9205, + "epoch": 0.4712005551700208, + "grad_norm": 1.2504578828811646, + "learning_rate": 3.980523492479301e-05, + "loss": 0.7328, "step": 2716 }, { - "epoch": 0.9921489866715355, - "grad_norm": 0.9517703652381897, - "learning_rate": 2.793769134670167e-05, - "loss": 0.8948, + "epoch": 0.4713740458015267, + "grad_norm": 1.5618733167648315, + "learning_rate": 3.9804285247084674e-05, + "loss": 0.8416, "step": 2717 }, { - "epoch": 0.9925141500821618, - "grad_norm": 1.2442158460617065, - "learning_rate": 2.7924522324128535e-05, - "loss": 0.8699, + "epoch": 0.4715475364330326, + "grad_norm": 1.0170127153396606, + "learning_rate": 3.980333327106596e-05, + "loss": 0.8264, "step": 2718 }, { - "epoch": 0.992879313492788, - "grad_norm": 0.8317010402679443, - "learning_rate": 2.791134922470666e-05, - "loss": 0.8783, + "epoch": 0.4717210270645385, + "grad_norm": 2.091218948364258, + "learning_rate": 3.980237899684735e-05, + "loss": 0.9753, "step": 2719 }, { - "epoch": 0.9932444769034143, - "grad_norm": 1.1644020080566406, - "learning_rate": 2.7898172055213067e-05, - "loss": 0.9449, + "epoch": 0.4718945176960444, + "grad_norm": 0.9628425240516663, + "learning_rate": 3.980142242453958e-05, + "loss": 0.8784, "step": 2720 }, { - "epoch": 0.9936096403140405, - "grad_norm": 1.039124846458435, - "learning_rate": 2.788499082242689e-05, - "loss": 0.8243, + "epoch": 0.4720680083275503, + "grad_norm": 0.8876476883888245, + "learning_rate": 3.980046355425366e-05, + "loss": 0.8352, "step": 2721 }, { - "epoch": 0.9939748037246667, - "grad_norm": 1.0049586296081543, - "learning_rate": 2.7871805533129337e-05, - "loss": 0.8745, + "epoch": 0.4722414989590562, + "grad_norm": 1.324562907218933, + "learning_rate": 3.979950238610088e-05, + "loss": 0.6965, "step": 2722 }, { - "epoch": 0.994339967135293, - "grad_norm": 1.2463852167129517, - "learning_rate": 2.7858616194103714e-05, - "loss": 0.8772, + "epoch": 0.4724149895905621, + "grad_norm": 1.195199966430664, + "learning_rate": 3.979853892019278e-05, + "loss": 0.7913, "step": 2723 }, { - "epoch": 0.9947051305459192, - "grad_norm": 0.9621063470840454, - "learning_rate": 2.7845422812135406e-05, - "loss": 0.8479, + "epoch": 0.472588480222068, + "grad_norm": 1.2216423749923706, + "learning_rate": 3.9797573156641165e-05, + "loss": 0.7349, "step": 2724 }, { - "epoch": 0.9950702939565456, - "grad_norm": 1.2086292505264282, - "learning_rate": 2.783222539401188e-05, - "loss": 0.9016, + "epoch": 0.4727619708535739, + "grad_norm": 0.984237551689148, + "learning_rate": 3.9796605095558115e-05, + "loss": 0.7161, "step": 2725 }, { - "epoch": 0.9954354573671718, - "grad_norm": 1.0144617557525635, - "learning_rate": 2.7819023946522678e-05, - "loss": 0.8849, + "epoch": 0.4729354614850798, + "grad_norm": 1.3275266885757446, + "learning_rate": 3.9795634737055974e-05, + "loss": 0.7874, "step": 2726 }, { - "epoch": 0.9958006207777981, - "grad_norm": 1.3671644926071167, - "learning_rate": 2.7805818476459404e-05, - "loss": 0.9491, + "epoch": 0.4731089521165857, + "grad_norm": 1.1041113138198853, + "learning_rate": 3.979466208124736e-05, + "loss": 0.6384, "step": 2727 }, { - "epoch": 0.9961657841884243, - "grad_norm": 1.2919927835464478, - "learning_rate": 2.7792608990615763e-05, - "loss": 0.9144, + "epoch": 0.4732824427480916, + "grad_norm": 1.0636224746704102, + "learning_rate": 3.979368712824514e-05, + "loss": 0.6636, "step": 2728 }, { - "epoch": 0.9965309475990506, - "grad_norm": 1.164105772972107, - "learning_rate": 2.777939549578749e-05, - "loss": 0.8722, + "epoch": 0.4734559333795975, + "grad_norm": 1.1491512060165405, + "learning_rate": 3.9792709878162466e-05, + "loss": 0.8613, "step": 2729 }, { - "epoch": 0.9968961110096768, - "grad_norm": 1.398402452468872, - "learning_rate": 2.77661779987724e-05, - "loss": 0.9043, + "epoch": 0.4736294240111034, + "grad_norm": 1.0807361602783203, + "learning_rate": 3.979173033111275e-05, + "loss": 0.6909, "step": 2730 }, { - "epoch": 0.9972612744203031, - "grad_norm": 1.4663875102996826, - "learning_rate": 2.7752956506370366e-05, - "loss": 0.8994, + "epoch": 0.4738029146426093, + "grad_norm": 0.8976280093193054, + "learning_rate": 3.979074848720967e-05, + "loss": 0.8547, "step": 2731 }, { - "epoch": 0.9976264378309293, - "grad_norm": 1.147277593612671, - "learning_rate": 2.773973102538332e-05, - "loss": 0.899, + "epoch": 0.4739764052741152, + "grad_norm": 1.1234210729599, + "learning_rate": 3.9789764346567154e-05, + "loss": 0.8696, "step": 2732 }, { - "epoch": 0.9979916012415556, - "grad_norm": 1.0844606161117554, - "learning_rate": 2.7726501562615237e-05, - "loss": 0.8926, + "epoch": 0.4741498959056211, + "grad_norm": 1.0510085821151733, + "learning_rate": 3.978877790929944e-05, + "loss": 0.8096, "step": 2733 }, { - "epoch": 0.9983567646521818, - "grad_norm": 1.1824665069580078, - "learning_rate": 2.7713268124872145e-05, - "loss": 0.9034, + "epoch": 0.47432338653712697, + "grad_norm": 1.8456037044525146, + "learning_rate": 3.978778917552099e-05, + "loss": 0.7678, "step": 2734 }, { - "epoch": 0.9987219280628081, - "grad_norm": 1.3015096187591553, - "learning_rate": 2.770003071896212e-05, - "loss": 0.8855, + "epoch": 0.47449687716863287, + "grad_norm": 1.0931169986724854, + "learning_rate": 3.978679814534654e-05, + "loss": 0.9136, "step": 2735 }, { - "epoch": 0.9990870914734343, - "grad_norm": 1.2266253232955933, - "learning_rate": 2.768678935169527e-05, - "loss": 0.8956, + "epoch": 0.47467036780013877, + "grad_norm": 0.959671139717102, + "learning_rate": 3.9785804818891117e-05, + "loss": 0.6738, "step": 2736 }, { - "epoch": 0.9994522548840606, - "grad_norm": 0.8484414219856262, - "learning_rate": 2.7673544029883745e-05, - "loss": 0.8717, + "epoch": 0.47484385843164467, + "grad_norm": 1.0955255031585693, + "learning_rate": 3.9784809196269986e-05, + "loss": 0.7091, "step": 2737 }, { - "epoch": 0.9998174182946868, - "grad_norm": 1.4588756561279297, - "learning_rate": 2.7660294760341744e-05, - "loss": 0.9065, + "epoch": 0.4750173490631506, + "grad_norm": 0.9166812300682068, + "learning_rate": 3.978381127759869e-05, + "loss": 0.8657, "step": 2738 }, { - "epoch": 1.0001825817053132, - "grad_norm": 1.1145268678665161, - "learning_rate": 2.7647041549885472e-05, - "loss": 0.9161, + "epoch": 0.4751908396946565, + "grad_norm": 3.8510091304779053, + "learning_rate": 3.978281106299305e-05, + "loss": 0.7217, "step": 2739 }, { - "epoch": 1.0005477451159395, - "grad_norm": 1.075659990310669, - "learning_rate": 2.7633784405333183e-05, - "loss": 0.8718, + "epoch": 0.4753643303261624, + "grad_norm": 1.0606828927993774, + "learning_rate": 3.9781808552569134e-05, + "loss": 0.8008, "step": 2740 }, { - "epoch": 1.0009129085265656, - "grad_norm": 1.0387517213821411, - "learning_rate": 2.7620523333505142e-05, - "loss": 0.8789, + "epoch": 0.4755378209576683, + "grad_norm": 0.8503128290176392, + "learning_rate": 3.9780803746443284e-05, + "loss": 0.8379, "step": 2741 }, { - "epoch": 1.0012780719371919, - "grad_norm": 0.9671303033828735, - "learning_rate": 2.7607258341223636e-05, - "loss": 0.8506, + "epoch": 0.4757113115891742, + "grad_norm": 1.0500198602676392, + "learning_rate": 3.97797966447321e-05, + "loss": 0.6597, "step": 2742 }, { - "epoch": 1.0016432353478182, - "grad_norm": 1.2011957168579102, - "learning_rate": 2.7593989435312976e-05, - "loss": 0.8552, + "epoch": 0.4758848022206801, + "grad_norm": 1.1878410577774048, + "learning_rate": 3.9778787247552484e-05, + "loss": 0.7041, "step": 2743 }, { - "epoch": 1.0020083987584445, - "grad_norm": 0.9761734008789062, - "learning_rate": 2.7580716622599478e-05, - "loss": 0.8541, + "epoch": 0.476058292852186, + "grad_norm": 0.9951611757278442, + "learning_rate": 3.977777555502155e-05, + "loss": 0.7297, "step": 2744 }, { - "epoch": 1.0023735621690706, - "grad_norm": 1.496596097946167, - "learning_rate": 2.756743990991148e-05, - "loss": 0.8729, + "epoch": 0.4762317834836919, + "grad_norm": 0.9877262115478516, + "learning_rate": 3.9776761567256726e-05, + "loss": 0.79, "step": 2745 }, { - "epoch": 1.002738725579697, - "grad_norm": 1.2966418266296387, - "learning_rate": 2.7554159304079298e-05, - "loss": 0.8539, + "epoch": 0.4764052741151978, + "grad_norm": 0.8991420269012451, + "learning_rate": 3.977574528437567e-05, + "loss": 0.8154, "step": 2746 }, { - "epoch": 1.0031038889903232, - "grad_norm": 0.9659630656242371, - "learning_rate": 2.7540874811935295e-05, - "loss": 0.8139, + "epoch": 0.4765787647467037, + "grad_norm": 0.8735139966011047, + "learning_rate": 3.977472670649634e-05, + "loss": 0.8491, "step": 2747 }, { - "epoch": 1.0034690524009495, - "grad_norm": 0.8895799517631531, - "learning_rate": 2.7527586440313786e-05, - "loss": 0.8412, + "epoch": 0.4767522553782096, + "grad_norm": 3.7087111473083496, + "learning_rate": 3.977370583373692e-05, + "loss": 0.7939, "step": 2748 }, { - "epoch": 1.0038342158115756, - "grad_norm": 1.364639401435852, - "learning_rate": 2.7514294196051124e-05, - "loss": 0.9073, + "epoch": 0.4769257460097155, + "grad_norm": 1.416062593460083, + "learning_rate": 3.9772682666215906e-05, + "loss": 0.7427, "step": 2749 }, { - "epoch": 1.004199379222202, - "grad_norm": 1.1539818048477173, - "learning_rate": 2.750099808598563e-05, - "loss": 0.8501, + "epoch": 0.4770992366412214, + "grad_norm": 1.2632842063903809, + "learning_rate": 3.9771657204052026e-05, + "loss": 0.6902, "step": 2750 }, { - "epoch": 1.0045645426328282, - "grad_norm": 1.0537173748016357, - "learning_rate": 2.7487698116957617e-05, - "loss": 0.8647, + "epoch": 0.4772727272727273, + "grad_norm": 0.9330185055732727, + "learning_rate": 3.977062944736429e-05, + "loss": 0.7725, "step": 2751 }, { - "epoch": 1.0049297060434546, - "grad_norm": 0.8203901648521423, - "learning_rate": 2.747439429580938e-05, - "loss": 0.8591, + "epoch": 0.4774462179042332, + "grad_norm": 0.9623755812644958, + "learning_rate": 3.976959939627196e-05, + "loss": 0.8389, "step": 2752 }, { - "epoch": 1.0052948694540806, - "grad_norm": 1.0629971027374268, - "learning_rate": 2.746108662938521e-05, - "loss": 0.8416, + "epoch": 0.4776197085357391, + "grad_norm": 1.0597249269485474, + "learning_rate": 3.97685670508946e-05, + "loss": 0.7532, "step": 2753 }, { - "epoch": 1.005660032864707, - "grad_norm": 1.3172725439071655, - "learning_rate": 2.7447775124531367e-05, - "loss": 0.8841, + "epoch": 0.47779319916724494, + "grad_norm": 1.113258957862854, + "learning_rate": 3.9767532411351985e-05, + "loss": 0.6838, "step": 2754 }, { - "epoch": 1.0060251962753333, - "grad_norm": 1.400835633277893, - "learning_rate": 2.7434459788096077e-05, - "loss": 0.8864, + "epoch": 0.47796668979875084, + "grad_norm": 1.2035083770751953, + "learning_rate": 3.976649547776421e-05, + "loss": 0.8047, "step": 2755 }, { - "epoch": 1.0063903596859594, - "grad_norm": 1.0118491649627686, - "learning_rate": 2.7421140626929556e-05, - "loss": 0.8705, + "epoch": 0.47814018043025674, + "grad_norm": 0.940751314163208, + "learning_rate": 3.97654562502516e-05, + "loss": 0.7632, "step": 2756 }, { - "epoch": 1.0067555230965857, - "grad_norm": 1.151246190071106, - "learning_rate": 2.7407817647883973e-05, - "loss": 0.874, + "epoch": 0.47831367106176265, + "grad_norm": 0.8253898024559021, + "learning_rate": 3.9764414728934755e-05, + "loss": 0.7754, "step": 2757 }, { - "epoch": 1.007120686507212, - "grad_norm": 1.1992406845092773, - "learning_rate": 2.7394490857813467e-05, - "loss": 0.8503, + "epoch": 0.47848716169326855, + "grad_norm": 1.1687121391296387, + "learning_rate": 3.9763370913934554e-05, + "loss": 0.6895, "step": 2758 }, { - "epoch": 1.0074858499178383, - "grad_norm": 1.098996877670288, - "learning_rate": 2.738116026357414e-05, - "loss": 0.8644, + "epoch": 0.47866065232477445, + "grad_norm": 1.130563735961914, + "learning_rate": 3.976232480537213e-05, + "loss": 0.6635, "step": 2759 }, { - "epoch": 1.0078510133284644, - "grad_norm": 0.9148181080818176, - "learning_rate": 2.7367825872024042e-05, - "loss": 0.9148, + "epoch": 0.47883414295628035, + "grad_norm": 1.0760321617126465, + "learning_rate": 3.976127640336889e-05, + "loss": 0.8027, "step": 2760 }, { - "epoch": 1.0082161767390907, - "grad_norm": 1.0138267278671265, - "learning_rate": 2.7354487690023178e-05, - "loss": 0.8578, + "epoch": 0.47900763358778625, + "grad_norm": 1.0723400115966797, + "learning_rate": 3.976022570804649e-05, + "loss": 0.8135, "step": 2761 }, { - "epoch": 1.008581340149717, - "grad_norm": 1.1369482278823853, - "learning_rate": 2.7341145724433532e-05, - "loss": 0.8843, + "epoch": 0.47918112421929215, + "grad_norm": 1.385394811630249, + "learning_rate": 3.9759172719526876e-05, + "loss": 0.7017, "step": 2762 }, { - "epoch": 1.0089465035603433, - "grad_norm": 1.1365288496017456, - "learning_rate": 2.732779998211898e-05, - "loss": 0.8584, + "epoch": 0.47935461485079806, + "grad_norm": 1.106462836265564, + "learning_rate": 3.975811743793224e-05, + "loss": 0.7162, "step": 2763 }, { - "epoch": 1.0093116669709694, - "grad_norm": 1.013693928718567, - "learning_rate": 2.731445046994539e-05, - "loss": 0.8757, + "epoch": 0.47952810548230396, + "grad_norm": 1.0033589601516724, + "learning_rate": 3.975705986338505e-05, + "loss": 0.8523, "step": 2764 }, { - "epoch": 1.0096768303815957, - "grad_norm": 1.0474008321762085, - "learning_rate": 2.7301097194780536e-05, - "loss": 0.8739, + "epoch": 0.47970159611380986, + "grad_norm": 1.0237081050872803, + "learning_rate": 3.975599999600804e-05, + "loss": 0.8334, "step": 2765 }, { - "epoch": 1.010041993792222, - "grad_norm": 1.0152982473373413, - "learning_rate": 2.7287740163494153e-05, - "loss": 0.8628, + "epoch": 0.47987508674531576, + "grad_norm": 1.5185967683792114, + "learning_rate": 3.9754937835924214e-05, + "loss": 0.7881, "step": 2766 }, { - "epoch": 1.0104071572028483, - "grad_norm": 0.8896036148071289, - "learning_rate": 2.7274379382957897e-05, - "loss": 0.8221, + "epoch": 0.48004857737682166, + "grad_norm": 1.682679533958435, + "learning_rate": 3.975387338325684e-05, + "loss": 0.7324, "step": 2767 }, { - "epoch": 1.0107723206134744, - "grad_norm": 1.1937261819839478, - "learning_rate": 2.726101486004535e-05, - "loss": 0.8536, + "epoch": 0.48022206800832756, + "grad_norm": 1.1459957361221313, + "learning_rate": 3.9752806638129435e-05, + "loss": 0.7375, "step": 2768 }, { - "epoch": 1.0111374840241008, - "grad_norm": 0.9447047114372253, - "learning_rate": 2.724764660163203e-05, - "loss": 0.7795, + "epoch": 0.48039555863983346, + "grad_norm": 1.2045303583145142, + "learning_rate": 3.975173760066581e-05, + "loss": 0.9011, "step": 2769 }, { - "epoch": 1.011502647434727, - "grad_norm": 1.2285399436950684, - "learning_rate": 2.7234274614595353e-05, - "loss": 0.8338, + "epoch": 0.48056904927133937, + "grad_norm": 0.8476513624191284, + "learning_rate": 3.975066627099e-05, + "loss": 0.7983, "step": 2770 }, { - "epoch": 1.0118678108453534, - "grad_norm": 1.1542187929153442, - "learning_rate": 2.722089890581469e-05, - "loss": 0.8502, + "epoch": 0.48074253990284527, + "grad_norm": 2.0825326442718506, + "learning_rate": 3.974959264922638e-05, + "loss": 0.8252, "step": 2771 }, { - "epoch": 1.0122329742559795, - "grad_norm": 1.2695060968399048, - "learning_rate": 2.7207519482171285e-05, - "loss": 0.8402, + "epoch": 0.48091603053435117, + "grad_norm": 1.0608158111572266, + "learning_rate": 3.974851673549951e-05, + "loss": 0.7566, "step": 2772 }, { - "epoch": 1.0125981376666058, - "grad_norm": 1.341461420059204, - "learning_rate": 2.7194136350548332e-05, - "loss": 0.875, + "epoch": 0.48108952116585707, + "grad_norm": 1.0013381242752075, + "learning_rate": 3.974743852993426e-05, + "loss": 0.7684, "step": 2773 }, { - "epoch": 1.012963301077232, - "grad_norm": 1.2313005924224854, - "learning_rate": 2.7180749517830912e-05, - "loss": 0.8467, + "epoch": 0.4812630117973629, + "grad_norm": 0.8724400997161865, + "learning_rate": 3.9746358032655764e-05, + "loss": 0.8857, "step": 2774 }, { - "epoch": 1.0133284644878584, - "grad_norm": 1.1208305358886719, - "learning_rate": 2.7167358990906e-05, - "loss": 0.8666, + "epoch": 0.4814365024288688, + "grad_norm": 1.0278187990188599, + "learning_rate": 3.9745275243789396e-05, + "loss": 0.7233, "step": 2775 }, { - "epoch": 1.0136936278984845, - "grad_norm": 1.0961536169052124, - "learning_rate": 2.7153964776662517e-05, - "loss": 0.8352, + "epoch": 0.4816099930603747, + "grad_norm": 1.0100672245025635, + "learning_rate": 3.974419016346084e-05, + "loss": 0.6819, "step": 2776 }, { - "epoch": 1.0140587913091108, - "grad_norm": 1.110100269317627, - "learning_rate": 2.7140566881991213e-05, - "loss": 0.86, + "epoch": 0.4817834836918806, + "grad_norm": 0.9835875630378723, + "learning_rate": 3.9743102791796005e-05, + "loss": 0.7332, "step": 2777 }, { - "epoch": 1.0144239547197371, - "grad_norm": 1.3814820051193237, - "learning_rate": 2.712716531378478e-05, - "loss": 0.8401, + "epoch": 0.4819569743233865, + "grad_norm": 0.9953315854072571, + "learning_rate": 3.974201312892109e-05, + "loss": 0.8242, "step": 2778 }, { - "epoch": 1.0147891181303634, - "grad_norm": 0.9428169131278992, - "learning_rate": 2.7113760078937787e-05, - "loss": 0.8859, + "epoch": 0.4821304649548924, + "grad_norm": 2.2718396186828613, + "learning_rate": 3.9740921174962545e-05, + "loss": 0.8159, "step": 2779 }, { - "epoch": 1.0151542815409895, - "grad_norm": 1.2245419025421143, - "learning_rate": 2.7100351184346694e-05, - "loss": 0.8434, + "epoch": 0.4823039555863983, + "grad_norm": 1.0260593891143799, + "learning_rate": 3.9739826930047095e-05, + "loss": 0.7954, "step": 2780 }, { - "epoch": 1.0155194449516158, - "grad_norm": 1.2533327341079712, - "learning_rate": 2.708693863690984e-05, - "loss": 0.8645, + "epoch": 0.4824774462179042, + "grad_norm": 1.3404943943023682, + "learning_rate": 3.9738730394301726e-05, + "loss": 0.7976, "step": 2781 }, { - "epoch": 1.0158846083622421, - "grad_norm": 1.4478111267089844, - "learning_rate": 2.707352244352744e-05, - "loss": 0.8592, + "epoch": 0.48265093684941013, + "grad_norm": 1.1638083457946777, + "learning_rate": 3.9737631567853695e-05, + "loss": 0.8323, "step": 2782 }, { - "epoch": 1.0162497717728685, - "grad_norm": 1.909035325050354, - "learning_rate": 2.7060102611101577e-05, - "loss": 0.8544, + "epoch": 0.48282442748091603, + "grad_norm": 0.9738689064979553, + "learning_rate": 3.9736530450830525e-05, + "loss": 0.8389, "step": 2783 }, { - "epoch": 1.0166149351834946, - "grad_norm": 1.1811480522155762, - "learning_rate": 2.704667914653623e-05, - "loss": 0.8806, + "epoch": 0.48299791811242193, + "grad_norm": 0.7957325577735901, + "learning_rate": 3.9735427043359996e-05, + "loss": 0.8149, "step": 2784 }, { - "epoch": 1.0169800985941209, - "grad_norm": 1.0174238681793213, - "learning_rate": 2.7033252056737224e-05, - "loss": 0.903, + "epoch": 0.48317140874392783, + "grad_norm": 1.235429048538208, + "learning_rate": 3.9734321345570165e-05, + "loss": 0.6722, "step": 2785 }, { - "epoch": 1.0173452620047472, - "grad_norm": 0.9724805951118469, - "learning_rate": 2.7019821348612265e-05, - "loss": 0.8591, + "epoch": 0.48334489937543373, + "grad_norm": 0.8625333309173584, + "learning_rate": 3.973321335758934e-05, + "loss": 0.8088, "step": 2786 }, { - "epoch": 1.0177104254153735, - "grad_norm": 1.215047001838684, - "learning_rate": 2.7006387029070907e-05, - "loss": 0.838, + "epoch": 0.48351839000693964, + "grad_norm": 1.143398404121399, + "learning_rate": 3.9732103079546125e-05, + "loss": 0.7087, "step": 2787 }, { - "epoch": 1.0180755888259996, - "grad_norm": 1.017359733581543, - "learning_rate": 2.699294910502457e-05, - "loss": 0.805, + "epoch": 0.48369188063844554, + "grad_norm": 0.8664839267730713, + "learning_rate": 3.973099051156934e-05, + "loss": 0.8622, "step": 2788 }, { - "epoch": 1.018440752236626, - "grad_norm": 0.7614380717277527, - "learning_rate": 2.6979507583386537e-05, - "loss": 0.8699, + "epoch": 0.48386537126995144, + "grad_norm": 0.9524120092391968, + "learning_rate": 3.9729875653788125e-05, + "loss": 0.7964, "step": 2789 }, { - "epoch": 1.0188059156472522, - "grad_norm": 0.9692870378494263, - "learning_rate": 2.6966062471071914e-05, - "loss": 0.8534, + "epoch": 0.48403886190145734, + "grad_norm": 0.8989015221595764, + "learning_rate": 3.972875850633184e-05, + "loss": 0.7351, "step": 2790 }, { - "epoch": 1.0191710790578783, - "grad_norm": 1.056990146636963, - "learning_rate": 2.6952613774997683e-05, - "loss": 0.8776, + "epoch": 0.48421235253296324, + "grad_norm": 1.084872841835022, + "learning_rate": 3.972763906933015e-05, + "loss": 0.9072, "step": 2791 }, { - "epoch": 1.0195362424685046, - "grad_norm": 1.3532108068466187, - "learning_rate": 2.6939161502082653e-05, - "loss": 0.887, + "epoch": 0.48438584316446914, + "grad_norm": 2.132781505584717, + "learning_rate": 3.9726517342912954e-05, + "loss": 0.7401, "step": 2792 }, { - "epoch": 1.019901405879131, - "grad_norm": 0.9886606335639954, - "learning_rate": 2.692570565924749e-05, - "loss": 0.8535, + "epoch": 0.48455933379597504, + "grad_norm": 0.9603497385978699, + "learning_rate": 3.972539332721044e-05, + "loss": 0.7247, "step": 2793 }, { - "epoch": 1.0202665692897572, - "grad_norm": 0.8822911381721497, - "learning_rate": 2.691224625341467e-05, - "loss": 0.837, + "epoch": 0.4847328244274809, + "grad_norm": 0.9750885963439941, + "learning_rate": 3.972426702235304e-05, + "loss": 0.8721, "step": 2794 }, { - "epoch": 1.0206317327003833, - "grad_norm": 1.0117628574371338, - "learning_rate": 2.6898783291508524e-05, - "loss": 0.8712, + "epoch": 0.4849063150589868, + "grad_norm": 0.9580737948417664, + "learning_rate": 3.972313842847148e-05, + "loss": 0.6467, "step": 2795 }, { - "epoch": 1.0209968961110096, - "grad_norm": 1.5235037803649902, - "learning_rate": 2.6885316780455208e-05, - "loss": 0.8912, + "epoch": 0.4850798056904927, + "grad_norm": 1.1689152717590332, + "learning_rate": 3.972200754569671e-05, + "loss": 0.6772, "step": 2796 }, { - "epoch": 1.021362059521636, - "grad_norm": 1.167954683303833, - "learning_rate": 2.6871846727182696e-05, - "loss": 0.8956, + "epoch": 0.4852532963219986, + "grad_norm": 0.784303605556488, + "learning_rate": 3.972087437415999e-05, + "loss": 0.8694, "step": 2797 }, { - "epoch": 1.0217272229322623, - "grad_norm": 1.1382135152816772, - "learning_rate": 2.6858373138620794e-05, - "loss": 0.8997, + "epoch": 0.4854267869535045, + "grad_norm": 2.1501083374023438, + "learning_rate": 3.9719738913992815e-05, + "loss": 0.6912, "step": 2798 }, { - "epoch": 1.0220923863428883, - "grad_norm": 1.1929303407669067, - "learning_rate": 2.684489602170112e-05, - "loss": 0.8571, + "epoch": 0.4856002775850104, + "grad_norm": 1.7092463970184326, + "learning_rate": 3.971860116532696e-05, + "loss": 0.6871, "step": 2799 }, { - "epoch": 1.0224575497535147, - "grad_norm": 0.893012285232544, - "learning_rate": 2.6831415383357113e-05, - "loss": 0.8408, + "epoch": 0.4857737682165163, + "grad_norm": 1.0703061819076538, + "learning_rate": 3.971746112829447e-05, + "loss": 0.8302, "step": 2800 }, { - "epoch": 1.022822713164141, - "grad_norm": 1.0211272239685059, - "learning_rate": 2.6817931230524016e-05, - "loss": 0.828, + "epoch": 0.4859472588480222, + "grad_norm": 0.7961493134498596, + "learning_rate": 3.971631880302764e-05, + "loss": 0.8225, "step": 2801 }, { - "epoch": 1.0231878765747673, - "grad_norm": 1.4490092992782593, - "learning_rate": 2.6804443570138895e-05, - "loss": 0.8512, + "epoch": 0.4861207494795281, + "grad_norm": 0.9069642424583435, + "learning_rate": 3.9715174189659036e-05, + "loss": 0.8445, "step": 2802 }, { - "epoch": 1.0235530399853934, - "grad_norm": 0.8967427015304565, - "learning_rate": 2.6790952409140597e-05, - "loss": 0.8467, + "epoch": 0.486294240111034, + "grad_norm": 1.2195520401000977, + "learning_rate": 3.97140272883215e-05, + "loss": 0.749, "step": 2803 }, { - "epoch": 1.0239182033960197, - "grad_norm": 1.1083368062973022, - "learning_rate": 2.6777457754469788e-05, - "loss": 0.8633, + "epoch": 0.4864677307425399, + "grad_norm": 0.9408312439918518, + "learning_rate": 3.971287809914811e-05, + "loss": 0.8757, "step": 2804 }, { - "epoch": 1.024283366806646, - "grad_norm": 1.2721424102783203, - "learning_rate": 2.6763959613068933e-05, - "loss": 0.8181, + "epoch": 0.4866412213740458, + "grad_norm": 1.136525273323059, + "learning_rate": 3.9711726622272266e-05, + "loss": 0.7969, "step": 2805 }, { - "epoch": 1.0246485302172723, - "grad_norm": 1.0791912078857422, - "learning_rate": 2.675045799188227e-05, - "loss": 0.8473, + "epoch": 0.4868147120055517, + "grad_norm": 0.8225727081298828, + "learning_rate": 3.971057285782757e-05, + "loss": 0.687, "step": 2806 }, { - "epoch": 1.0250136936278984, - "grad_norm": 0.992801308631897, - "learning_rate": 2.6736952897855856e-05, - "loss": 0.8259, + "epoch": 0.4869882026370576, + "grad_norm": 1.8151087760925293, + "learning_rate": 3.970941680594792e-05, + "loss": 0.7742, "step": 2807 }, { - "epoch": 1.0253788570385247, - "grad_norm": 0.977733314037323, - "learning_rate": 2.6723444337937502e-05, - "loss": 0.8591, + "epoch": 0.4871616932685635, + "grad_norm": 1.0052050352096558, + "learning_rate": 3.970825846676749e-05, + "loss": 0.7935, "step": 2808 }, { - "epoch": 1.025744020449151, - "grad_norm": 0.9268465638160706, - "learning_rate": 2.670993231907684e-05, - "loss": 0.832, + "epoch": 0.4873351839000694, + "grad_norm": 0.8003208041191101, + "learning_rate": 3.9707097840420706e-05, + "loss": 0.8911, "step": 2809 }, { - "epoch": 1.0261091838597773, - "grad_norm": 0.8373004794120789, - "learning_rate": 2.6696416848225256e-05, - "loss": 0.8494, + "epoch": 0.4875086745315753, + "grad_norm": 1.6214138269424438, + "learning_rate": 3.970593492704225e-05, + "loss": 0.689, "step": 2810 }, { - "epoch": 1.0264743472704034, - "grad_norm": 1.3669061660766602, - "learning_rate": 2.6682897932335907e-05, - "loss": 0.8865, + "epoch": 0.4876821651630812, + "grad_norm": 1.040968418121338, + "learning_rate": 3.970476972676708e-05, + "loss": 0.7408, "step": 2811 }, { - "epoch": 1.0268395106810297, - "grad_norm": 1.061291217803955, - "learning_rate": 2.666937557836374e-05, - "loss": 0.8268, + "epoch": 0.4878556557945871, + "grad_norm": 0.8554683923721313, + "learning_rate": 3.9703602239730425e-05, + "loss": 0.8135, "step": 2812 }, { - "epoch": 1.027204674091656, - "grad_norm": 0.9198890328407288, - "learning_rate": 2.665584979326546e-05, - "loss": 0.8322, + "epoch": 0.48802914642609296, + "grad_norm": 0.9011906981468201, + "learning_rate": 3.970243246606777e-05, + "loss": 0.9656, "step": 2813 }, { - "epoch": 1.0275698375022824, - "grad_norm": 1.101178526878357, - "learning_rate": 2.6642320583999556e-05, - "loss": 0.8837, + "epoch": 0.48820263705759886, + "grad_norm": 1.083061933517456, + "learning_rate": 3.9701260405914874e-05, + "loss": 0.8396, "step": 2814 }, { - "epoch": 1.0279350009129085, - "grad_norm": 1.094210147857666, - "learning_rate": 2.662878795752624e-05, - "loss": 0.8545, + "epoch": 0.48837612768910477, + "grad_norm": 1.009628415107727, + "learning_rate": 3.9700086059407745e-05, + "loss": 0.8723, "step": 2815 }, { - "epoch": 1.0283001643235348, - "grad_norm": 1.0437318086624146, - "learning_rate": 2.6615251920807527e-05, - "loss": 0.8776, + "epoch": 0.48854961832061067, + "grad_norm": 0.8301673531532288, + "learning_rate": 3.9698909426682674e-05, + "loss": 0.6799, "step": 2816 }, { - "epoch": 1.028665327734161, - "grad_norm": 1.088657021522522, - "learning_rate": 2.6601712480807145e-05, - "loss": 0.8713, + "epoch": 0.48872310895211657, + "grad_norm": 0.9138254523277283, + "learning_rate": 3.969773050787622e-05, + "loss": 0.7871, "step": 2817 }, { - "epoch": 1.0290304911447874, - "grad_norm": 1.5004582405090332, - "learning_rate": 2.6588169644490608e-05, - "loss": 0.8518, + "epoch": 0.48889659958362247, + "grad_norm": 3.1282012462615967, + "learning_rate": 3.9696549303125176e-05, + "loss": 0.6689, "step": 2818 }, { - "epoch": 1.0293956545554135, - "grad_norm": 1.0034005641937256, - "learning_rate": 2.6574623418825152e-05, - "loss": 0.875, + "epoch": 0.4890700902151284, + "grad_norm": 1.025175929069519, + "learning_rate": 3.9695365812566645e-05, + "loss": 0.6805, "step": 2819 }, { - "epoch": 1.0297608179660398, - "grad_norm": 1.07731294631958, - "learning_rate": 2.656107381077977e-05, - "loss": 0.7963, + "epoch": 0.4892435808466343, + "grad_norm": 1.2804850339889526, + "learning_rate": 3.969418003633795e-05, + "loss": 0.7296, "step": 2820 }, { - "epoch": 1.0301259813766661, - "grad_norm": 1.170408010482788, - "learning_rate": 2.6547520827325192e-05, - "loss": 0.817, + "epoch": 0.4894170714781402, + "grad_norm": 1.0639983415603638, + "learning_rate": 3.9692991974576725e-05, + "loss": 0.6897, "step": 2821 }, { - "epoch": 1.0304911447872924, - "grad_norm": 1.1633068323135376, - "learning_rate": 2.6533964475433886e-05, - "loss": 0.8368, + "epoch": 0.4895905621096461, + "grad_norm": 1.1107088327407837, + "learning_rate": 3.969180162742082e-05, + "loss": 0.7671, "step": 2822 }, { - "epoch": 1.0308563081979185, - "grad_norm": 1.1122967004776, - "learning_rate": 2.6520404762080048e-05, - "loss": 0.869, + "epoch": 0.489764052741152, + "grad_norm": 1.0773844718933105, + "learning_rate": 3.96906089950084e-05, + "loss": 0.748, "step": 2823 }, { - "epoch": 1.0312214716085448, - "grad_norm": 0.9573561549186707, - "learning_rate": 2.650684169423961e-05, - "loss": 0.8961, + "epoch": 0.4899375433726579, + "grad_norm": 0.8784615993499756, + "learning_rate": 3.9689414077477865e-05, + "loss": 0.8215, "step": 2824 }, { - "epoch": 1.0315866350191711, - "grad_norm": 1.1507915258407593, - "learning_rate": 2.649327527889022e-05, - "loss": 0.8092, + "epoch": 0.4901110340041638, + "grad_norm": 0.8128589391708374, + "learning_rate": 3.968821687496788e-05, + "loss": 0.8889, "step": 2825 }, { - "epoch": 1.0319517984297972, - "grad_norm": 1.1480003595352173, - "learning_rate": 2.6479705523011254e-05, - "loss": 0.8586, + "epoch": 0.4902845246356697, + "grad_norm": 0.945442259311676, + "learning_rate": 3.968701738761739e-05, + "loss": 0.8689, "step": 2826 }, { - "epoch": 1.0323169618404235, - "grad_norm": 1.3515429496765137, - "learning_rate": 2.646613243358382e-05, - "loss": 0.8288, + "epoch": 0.4904580152671756, + "grad_norm": 0.984286367893219, + "learning_rate": 3.968581561556558e-05, + "loss": 0.8647, "step": 2827 }, { - "epoch": 1.0326821252510499, - "grad_norm": 0.9083110094070435, - "learning_rate": 2.645255601759071e-05, - "loss": 0.8275, + "epoch": 0.4906315058986815, + "grad_norm": 0.8786442875862122, + "learning_rate": 3.968461155895194e-05, + "loss": 0.7725, "step": 2828 }, { - "epoch": 1.0330472886616762, - "grad_norm": 1.201520562171936, - "learning_rate": 2.6438976282016465e-05, - "loss": 0.8849, + "epoch": 0.4908049965301874, + "grad_norm": 0.9667304158210754, + "learning_rate": 3.968340521791619e-05, + "loss": 0.8699, "step": 2829 }, { - "epoch": 1.0334124520723023, - "grad_norm": 0.9572790861129761, - "learning_rate": 2.642539323384729e-05, - "loss": 0.8661, + "epoch": 0.4909784871616933, + "grad_norm": 1.087353229522705, + "learning_rate": 3.9682196592598324e-05, + "loss": 0.7244, "step": 2830 }, { - "epoch": 1.0337776154829286, - "grad_norm": 1.1355044841766357, - "learning_rate": 2.641180688007114e-05, - "loss": 0.8698, + "epoch": 0.4911519777931992, + "grad_norm": 0.9560117125511169, + "learning_rate": 3.968098568313862e-05, + "loss": 0.8403, "step": 2831 }, { - "epoch": 1.0341427788935549, - "grad_norm": 1.1903276443481445, - "learning_rate": 2.6398217227677636e-05, - "loss": 0.871, + "epoch": 0.4913254684247051, + "grad_norm": 0.8641455769538879, + "learning_rate": 3.967977248967758e-05, + "loss": 0.7643, "step": 2832 }, { - "epoch": 1.0345079423041812, - "grad_norm": 1.275565266609192, - "learning_rate": 2.638462428365811e-05, - "loss": 0.8229, + "epoch": 0.49149895905621094, + "grad_norm": 1.4067440032958984, + "learning_rate": 3.9678557012356014e-05, + "loss": 0.6711, "step": 2833 }, { - "epoch": 1.0348731057148073, - "grad_norm": 1.1798431873321533, - "learning_rate": 2.63710280550056e-05, - "loss": 0.8448, + "epoch": 0.49167244968771684, + "grad_norm": 1.2160656452178955, + "learning_rate": 3.967733925131498e-05, + "loss": 0.7135, "step": 2834 }, { - "epoch": 1.0352382691254336, - "grad_norm": 1.0240646600723267, - "learning_rate": 2.63574285487148e-05, - "loss": 0.87, + "epoch": 0.49184594031922274, + "grad_norm": 1.0596822500228882, + "learning_rate": 3.96761192066958e-05, + "loss": 0.8293, "step": 2835 }, { - "epoch": 1.03560343253606, - "grad_norm": 0.976059079170227, - "learning_rate": 2.6343825771782125e-05, - "loss": 0.8605, + "epoch": 0.49201943095072864, + "grad_norm": 0.9582585096359253, + "learning_rate": 3.9674896878640054e-05, + "loss": 0.7664, "step": 2836 }, { - "epoch": 1.0359685959466862, - "grad_norm": 1.2800264358520508, - "learning_rate": 2.633021973120565e-05, - "loss": 0.8031, + "epoch": 0.49219292158223454, + "grad_norm": 1.05730402469635, + "learning_rate": 3.9673672267289604e-05, + "loss": 0.7554, "step": 2837 }, { - "epoch": 1.0363337593573123, - "grad_norm": 1.202407717704773, - "learning_rate": 2.6316610433985136e-05, - "loss": 0.8794, + "epoch": 0.49236641221374045, + "grad_norm": 0.9122896790504456, + "learning_rate": 3.9672445372786565e-05, + "loss": 0.7759, "step": 2838 }, { - "epoch": 1.0366989227679386, - "grad_norm": 1.174521803855896, - "learning_rate": 2.6302997887122024e-05, - "loss": 0.8373, + "epoch": 0.49253990284524635, + "grad_norm": 1.1461971998214722, + "learning_rate": 3.967121619527331e-05, + "loss": 0.7036, "step": 2839 }, { - "epoch": 1.037064086178565, - "grad_norm": 1.0167180299758911, - "learning_rate": 2.6289382097619426e-05, - "loss": 0.8773, + "epoch": 0.49271339347675225, + "grad_norm": 0.90616774559021, + "learning_rate": 3.96699847348925e-05, + "loss": 0.7925, "step": 2840 }, { - "epoch": 1.0374292495891912, - "grad_norm": 1.0424269437789917, - "learning_rate": 2.6275763072482116e-05, - "loss": 0.8341, + "epoch": 0.49288688410825815, + "grad_norm": 0.8871726393699646, + "learning_rate": 3.9668750991787034e-05, + "loss": 0.9021, "step": 2841 }, { - "epoch": 1.0377944129998173, - "grad_norm": 1.067244291305542, - "learning_rate": 2.6262140818716537e-05, - "loss": 0.8176, + "epoch": 0.49306037473976405, + "grad_norm": 0.7454671859741211, + "learning_rate": 3.966751496610011e-05, + "loss": 0.8076, "step": 2842 }, { - "epoch": 1.0381595764104437, - "grad_norm": 1.2530555725097656, - "learning_rate": 2.6248515343330784e-05, - "loss": 0.8621, + "epoch": 0.49323386537126995, + "grad_norm": 0.7161187529563904, + "learning_rate": 3.9666276657975144e-05, + "loss": 0.9089, "step": 2843 }, { - "epoch": 1.03852473982107, - "grad_norm": 1.2995247840881348, - "learning_rate": 2.6234886653334632e-05, - "loss": 0.8528, + "epoch": 0.49340735600277585, + "grad_norm": 0.7835230827331543, + "learning_rate": 3.966503606755586e-05, + "loss": 0.7568, "step": 2844 }, { - "epoch": 1.0388899032316963, - "grad_norm": 1.4149093627929688, - "learning_rate": 2.622125475573948e-05, - "loss": 0.8542, + "epoch": 0.49358084663428176, + "grad_norm": 1.0219131708145142, + "learning_rate": 3.966379319498623e-05, + "loss": 0.782, "step": 2845 }, { - "epoch": 1.0392550666423224, - "grad_norm": 1.1874417066574097, - "learning_rate": 2.6207619657558404e-05, - "loss": 0.8163, + "epoch": 0.49375433726578766, + "grad_norm": 1.0994873046875, + "learning_rate": 3.9662548040410485e-05, + "loss": 0.8619, "step": 2846 }, { - "epoch": 1.0396202300529487, - "grad_norm": 0.9907617568969727, - "learning_rate": 2.6193981365806108e-05, - "loss": 0.8342, + "epoch": 0.49392782789729356, + "grad_norm": 0.8486074805259705, + "learning_rate": 3.966130060397312e-05, + "loss": 0.8306, "step": 2847 }, { - "epoch": 1.039985393463575, - "grad_norm": 0.9719352126121521, - "learning_rate": 2.618033988749895e-05, - "loss": 0.8845, + "epoch": 0.49410131852879946, + "grad_norm": 0.8550471067428589, + "learning_rate": 3.9660050885818925e-05, + "loss": 0.835, "step": 2848 }, { - "epoch": 1.0403505568742013, - "grad_norm": 1.0714778900146484, - "learning_rate": 2.6166695229654923e-05, - "loss": 0.8089, + "epoch": 0.49427480916030536, + "grad_norm": 1.0783939361572266, + "learning_rate": 3.9658798886092904e-05, + "loss": 0.7163, "step": 2849 }, { - "epoch": 1.0407157202848274, - "grad_norm": 1.0454349517822266, - "learning_rate": 2.6153047399293653e-05, - "loss": 0.858, + "epoch": 0.49444829979181126, + "grad_norm": 1.0653290748596191, + "learning_rate": 3.965754460494037e-05, + "loss": 0.8557, "step": 2850 }, { - "epoch": 1.0410808836954537, - "grad_norm": 0.94047611951828, - "learning_rate": 2.6139396403436404e-05, - "loss": 0.8452, + "epoch": 0.49462179042331716, + "grad_norm": 0.9266890287399292, + "learning_rate": 3.965628804250688e-05, + "loss": 0.8289, "step": 2851 }, { - "epoch": 1.04144604710608, - "grad_norm": 0.9428697824478149, - "learning_rate": 2.612574224910606e-05, - "loss": 0.8645, + "epoch": 0.49479528105482307, + "grad_norm": 1.16752290725708, + "learning_rate": 3.9655029198938256e-05, + "loss": 0.7502, "step": 2852 }, { - "epoch": 1.0418112105167063, - "grad_norm": 1.582295298576355, - "learning_rate": 2.6112084943327146e-05, - "loss": 0.8582, + "epoch": 0.4949687716863289, + "grad_norm": 0.867067813873291, + "learning_rate": 3.965376807438059e-05, + "loss": 0.8127, "step": 2853 }, { - "epoch": 1.0421763739273324, - "grad_norm": 1.396539568901062, - "learning_rate": 2.609842449312578e-05, - "loss": 0.8622, + "epoch": 0.4951422623178348, + "grad_norm": 0.8516867160797119, + "learning_rate": 3.965250466898024e-05, + "loss": 0.7444, "step": 2854 }, { - "epoch": 1.0425415373379587, - "grad_norm": 1.0399787425994873, - "learning_rate": 2.608476090552974e-05, - "loss": 0.8787, + "epoch": 0.4953157529493407, + "grad_norm": 0.8625990748405457, + "learning_rate": 3.9651238982883826e-05, + "loss": 0.74, "step": 2855 }, { - "epoch": 1.042906700748585, - "grad_norm": 1.0698236227035522, - "learning_rate": 2.6071094187568374e-05, - "loss": 0.8322, + "epoch": 0.4954892435808466, + "grad_norm": 0.7811792492866516, + "learning_rate": 3.964997101623823e-05, + "loss": 0.7893, "step": 2856 }, { - "epoch": 1.0432718641592111, - "grad_norm": 1.2872263193130493, - "learning_rate": 2.6057424346272667e-05, - "loss": 0.8604, + "epoch": 0.4956627342123525, + "grad_norm": 1.3468042612075806, + "learning_rate": 3.964870076919059e-05, + "loss": 0.6938, "step": 2857 }, { - "epoch": 1.0436370275698374, - "grad_norm": 1.0364550352096558, - "learning_rate": 2.6043751388675205e-05, - "loss": 0.8413, + "epoch": 0.4958362248438584, + "grad_norm": 1.1263383626937866, + "learning_rate": 3.964742824188834e-05, + "loss": 0.8268, "step": 2858 }, { - "epoch": 1.0440021909804638, - "grad_norm": 1.7925204038619995, - "learning_rate": 2.6030075321810166e-05, - "loss": 0.8767, + "epoch": 0.4960097154753643, + "grad_norm": 1.1437833309173584, + "learning_rate": 3.964615343447915e-05, + "loss": 0.8696, "step": 2859 }, { - "epoch": 1.04436735439109, - "grad_norm": 1.2542215585708618, - "learning_rate": 2.6016396152713355e-05, - "loss": 0.846, + "epoch": 0.4961832061068702, + "grad_norm": 0.860876202583313, + "learning_rate": 3.9644876347110956e-05, + "loss": 0.9382, "step": 2860 }, { - "epoch": 1.0447325178017162, - "grad_norm": 1.0002667903900146, - "learning_rate": 2.600271388842214e-05, - "loss": 0.8478, + "epoch": 0.4963566967383761, + "grad_norm": 0.7627927660942078, + "learning_rate": 3.964359697993198e-05, + "loss": 0.8823, "step": 2861 }, { - "epoch": 1.0450976812123425, - "grad_norm": 1.306111216545105, - "learning_rate": 2.5989028535975508e-05, - "loss": 0.8752, + "epoch": 0.496530187369882, + "grad_norm": 1.1085230112075806, + "learning_rate": 3.964231533309067e-05, + "loss": 0.6914, "step": 2862 }, { - "epoch": 1.0454628446229688, - "grad_norm": 1.0535792112350464, - "learning_rate": 2.5975340102414023e-05, - "loss": 0.8585, + "epoch": 0.4967036780013879, + "grad_norm": 0.9693295359611511, + "learning_rate": 3.964103140673579e-05, + "loss": 0.7, "step": 2863 }, { - "epoch": 1.045828008033595, - "grad_norm": 1.1057648658752441, - "learning_rate": 2.5961648594779824e-05, - "loss": 0.8348, + "epoch": 0.49687716863289383, + "grad_norm": 0.9646068215370178, + "learning_rate": 3.963974520101632e-05, + "loss": 0.7952, "step": 2864 }, { - "epoch": 1.0461931714442212, - "grad_norm": 0.7510945796966553, - "learning_rate": 2.594795402011665e-05, - "loss": 0.8566, + "epoch": 0.49705065926439973, + "grad_norm": 0.8884201645851135, + "learning_rate": 3.963845671608154e-05, + "loss": 0.8521, "step": 2865 }, { - "epoch": 1.0465583348548475, - "grad_norm": 1.1363285779953003, - "learning_rate": 2.5934256385469807e-05, - "loss": 0.8361, + "epoch": 0.49722414989590563, + "grad_norm": 0.9544508457183838, + "learning_rate": 3.963716595208098e-05, + "loss": 0.6593, "step": 2866 }, { - "epoch": 1.0469234982654738, - "grad_norm": 0.9640259146690369, - "learning_rate": 2.5920555697886184e-05, - "loss": 0.8262, + "epoch": 0.49739764052741153, + "grad_norm": 0.9609083533287048, + "learning_rate": 3.963587290916442e-05, + "loss": 0.7963, "step": 2867 }, { - "epoch": 1.0472886616761001, - "grad_norm": 0.8524945974349976, - "learning_rate": 2.590685196441423e-05, - "loss": 0.8785, + "epoch": 0.49757113115891743, + "grad_norm": 0.9066245555877686, + "learning_rate": 3.963457758748193e-05, + "loss": 0.8218, "step": 2868 }, { - "epoch": 1.0476538250867262, - "grad_norm": 1.319930911064148, - "learning_rate": 2.589314519210397e-05, - "loss": 0.8787, + "epoch": 0.49774462179042334, + "grad_norm": 1.0375564098358154, + "learning_rate": 3.963327998718385e-05, + "loss": 0.6846, "step": 2869 }, { - "epoch": 1.0480189884973525, - "grad_norm": 1.123026967048645, - "learning_rate": 2.5879435388006986e-05, - "loss": 0.8381, + "epoch": 0.49791811242192924, + "grad_norm": 1.184002161026001, + "learning_rate": 3.963198010842073e-05, + "loss": 0.7075, "step": 2870 }, { - "epoch": 1.0483841519079788, - "grad_norm": 1.1558854579925537, - "learning_rate": 2.586572255917642e-05, - "loss": 0.845, + "epoch": 0.49809160305343514, + "grad_norm": 1.2760109901428223, + "learning_rate": 3.963067795134344e-05, + "loss": 0.9653, "step": 2871 }, { - "epoch": 1.0487493153186052, - "grad_norm": 1.2218902111053467, - "learning_rate": 2.5852006712666975e-05, - "loss": 0.7909, + "epoch": 0.49826509368494104, + "grad_norm": 1.0634478330612183, + "learning_rate": 3.9629373516103114e-05, + "loss": 0.7948, "step": 2872 }, { - "epoch": 1.0491144787292312, - "grad_norm": 1.002800464630127, - "learning_rate": 2.5838287855534895e-05, - "loss": 0.7812, + "epoch": 0.4984385843164469, + "grad_norm": 1.229050636291504, + "learning_rate": 3.962806680285111e-05, + "loss": 0.6504, "step": 2873 }, { - "epoch": 1.0494796421398576, - "grad_norm": 1.0977967977523804, - "learning_rate": 2.5824565994838e-05, - "loss": 0.837, + "epoch": 0.4986120749479528, + "grad_norm": 0.931806743144989, + "learning_rate": 3.9626757811739084e-05, + "loss": 0.6575, "step": 2874 }, { - "epoch": 1.0498448055504839, - "grad_norm": 0.9287051558494568, - "learning_rate": 2.581084113763562e-05, - "loss": 0.8763, + "epoch": 0.4987855655794587, + "grad_norm": 2.8295507431030273, + "learning_rate": 3.962544654291894e-05, + "loss": 0.7258, "step": 2875 }, { - "epoch": 1.0502099689611102, - "grad_norm": 0.9775145649909973, - "learning_rate": 2.5797113290988655e-05, - "loss": 0.8511, + "epoch": 0.4989590562109646, + "grad_norm": 1.5089484453201294, + "learning_rate": 3.962413299654286e-05, + "loss": 0.8586, "step": 2876 }, { - "epoch": 1.0505751323717363, - "grad_norm": 1.2182035446166992, - "learning_rate": 2.5783382461959523e-05, - "loss": 0.7878, + "epoch": 0.4991325468424705, + "grad_norm": 2.003451347351074, + "learning_rate": 3.962281717276328e-05, + "loss": 0.6997, "step": 2877 }, { - "epoch": 1.0509402957823626, - "grad_norm": 1.1740126609802246, - "learning_rate": 2.576964865761218e-05, - "loss": 0.8335, + "epoch": 0.4993060374739764, + "grad_norm": 1.1774883270263672, + "learning_rate": 3.962149907173291e-05, + "loss": 0.6467, "step": 2878 }, { - "epoch": 1.051305459192989, - "grad_norm": 0.9073178172111511, - "learning_rate": 2.575591188501213e-05, - "loss": 0.8074, + "epoch": 0.4994795281054823, + "grad_norm": 1.1778582334518433, + "learning_rate": 3.9620178693604696e-05, + "loss": 0.6602, "step": 2879 }, { - "epoch": 1.0516706226036152, - "grad_norm": 0.9603595733642578, - "learning_rate": 2.574217215122639e-05, - "loss": 0.8505, + "epoch": 0.4996530187369882, + "grad_norm": 1.2562226057052612, + "learning_rate": 3.961885603853189e-05, + "loss": 0.6818, "step": 2880 }, { - "epoch": 1.0520357860142413, - "grad_norm": 0.9663758873939514, - "learning_rate": 2.5728429463323487e-05, - "loss": 0.7868, + "epoch": 0.4998265093684941, + "grad_norm": 0.8714596629142761, + "learning_rate": 3.961753110666798e-05, + "loss": 0.7673, "step": 2881 }, { - "epoch": 1.0524009494248676, - "grad_norm": 0.889695942401886, - "learning_rate": 2.57146838283735e-05, - "loss": 0.8297, + "epoch": 0.5, + "grad_norm": 0.8953967094421387, + "learning_rate": 3.9616203898166724e-05, + "loss": 0.7876, "step": 2882 }, { - "epoch": 1.052766112835494, - "grad_norm": 1.0866589546203613, - "learning_rate": 2.5700935253447998e-05, - "loss": 0.8423, + "epoch": 0.5001734906315058, + "grad_norm": 0.8786922097206116, + "learning_rate": 3.9614874413182144e-05, + "loss": 0.7441, "step": 2883 }, { - "epoch": 1.0531312762461202, - "grad_norm": 1.1245691776275635, - "learning_rate": 2.5687183745620078e-05, - "loss": 0.8683, + "epoch": 0.5003469812630118, + "grad_norm": 1.6581569910049438, + "learning_rate": 3.961354265186854e-05, + "loss": 0.8096, "step": 2884 }, { - "epoch": 1.0534964396567463, - "grad_norm": 0.9057197570800781, - "learning_rate": 2.567342931196432e-05, - "loss": 0.8416, + "epoch": 0.5005204718945176, + "grad_norm": 0.9149590134620667, + "learning_rate": 3.961220861438045e-05, + "loss": 0.8049, "step": 2885 }, { - "epoch": 1.0538616030673726, - "grad_norm": 1.088468313217163, - "learning_rate": 2.5659671959556848e-05, - "loss": 0.9038, + "epoch": 0.5006939625260236, + "grad_norm": 0.9125156998634338, + "learning_rate": 3.9610872300872704e-05, + "loss": 0.8943, "step": 2886 }, { - "epoch": 1.054226766477999, - "grad_norm": 1.1295790672302246, - "learning_rate": 2.5645911695475264e-05, - "loss": 0.8712, + "epoch": 0.5008674531575295, + "grad_norm": 1.016135573387146, + "learning_rate": 3.960953371150037e-05, + "loss": 0.7485, "step": 2887 }, { - "epoch": 1.054591929888625, - "grad_norm": 0.7760172486305237, - "learning_rate": 2.563214852679867e-05, - "loss": 0.8638, + "epoch": 0.5010409437890354, + "grad_norm": 0.9009554386138916, + "learning_rate": 3.96081928464188e-05, + "loss": 0.6694, "step": 2888 }, { - "epoch": 1.0549570932992514, - "grad_norm": 1.0388389825820923, - "learning_rate": 2.5618382460607666e-05, - "loss": 0.8297, + "epoch": 0.5012144344205413, + "grad_norm": 1.1002380847930908, + "learning_rate": 3.9606849705783606e-05, + "loss": 0.6467, "step": 2889 }, { - "epoch": 1.0553222567098777, - "grad_norm": 1.0910496711730957, - "learning_rate": 2.5604613503984327e-05, - "loss": 0.8529, + "epoch": 0.5013879250520472, + "grad_norm": 0.9687401652336121, + "learning_rate": 3.960550428975066e-05, + "loss": 0.8108, "step": 2890 }, { - "epoch": 1.055687420120504, - "grad_norm": 1.1121442317962646, - "learning_rate": 2.559084166401224e-05, - "loss": 0.8922, + "epoch": 0.5015614156835531, + "grad_norm": 1.070892333984375, + "learning_rate": 3.960415659847609e-05, + "loss": 0.6432, "step": 2891 }, { - "epoch": 1.05605258353113, - "grad_norm": 1.4638217687606812, - "learning_rate": 2.557706694777647e-05, - "loss": 0.8931, + "epoch": 0.501734906315059, + "grad_norm": 1.106458306312561, + "learning_rate": 3.9602806632116304e-05, + "loss": 0.6467, "step": 2892 }, { - "epoch": 1.0564177469417564, - "grad_norm": 1.193512201309204, - "learning_rate": 2.5563289362363547e-05, - "loss": 0.8502, + "epoch": 0.5019083969465649, + "grad_norm": 1.0255151987075806, + "learning_rate": 3.960145439082797e-05, + "loss": 0.6772, "step": 2893 }, { - "epoch": 1.0567829103523827, - "grad_norm": 1.013169288635254, - "learning_rate": 2.554950891486149e-05, - "loss": 0.819, + "epoch": 0.5020818875780708, + "grad_norm": 0.7847845554351807, + "learning_rate": 3.960009987476801e-05, + "loss": 0.694, "step": 2894 }, { - "epoch": 1.057148073763009, - "grad_norm": 1.1525129079818726, - "learning_rate": 2.5535725612359778e-05, - "loss": 0.8726, + "epoch": 0.5022553782095767, + "grad_norm": 1.115521788597107, + "learning_rate": 3.959874308409362e-05, + "loss": 0.7622, "step": 2895 }, { - "epoch": 1.057513237173635, - "grad_norm": 1.382262110710144, - "learning_rate": 2.5521939461949384e-05, - "loss": 0.8806, + "epoch": 0.5024288688410826, + "grad_norm": 0.854830265045166, + "learning_rate": 3.959738401896227e-05, + "loss": 0.8862, "step": 2896 }, { - "epoch": 1.0578784005842614, - "grad_norm": 0.9343236088752747, - "learning_rate": 2.5508150470722708e-05, - "loss": 0.8826, + "epoch": 0.5026023594725885, + "grad_norm": 0.8922322392463684, + "learning_rate": 3.959602267953165e-05, + "loss": 0.6893, "step": 2897 }, { - "epoch": 1.0582435639948877, - "grad_norm": 1.6366080045700073, - "learning_rate": 2.5494358645773646e-05, - "loss": 0.8588, + "epoch": 0.5027758501040944, + "grad_norm": 0.9057824611663818, + "learning_rate": 3.9594659065959774e-05, + "loss": 0.6638, "step": 2898 }, { - "epoch": 1.058608727405514, - "grad_norm": 1.0705798864364624, - "learning_rate": 2.548056399419754e-05, - "loss": 0.8583, + "epoch": 0.5029493407356003, + "grad_norm": 0.8589410781860352, + "learning_rate": 3.9593293178404885e-05, + "loss": 0.6149, "step": 2899 }, { - "epoch": 1.0589738908161401, - "grad_norm": 1.0156774520874023, - "learning_rate": 2.5466766523091172e-05, - "loss": 0.839, + "epoch": 0.5031228313671062, + "grad_norm": 0.839524507522583, + "learning_rate": 3.959192501702548e-05, + "loss": 0.8132, "step": 2900 }, { - "epoch": 1.0593390542267664, - "grad_norm": 1.2533537149429321, - "learning_rate": 2.5452966239552802e-05, - "loss": 0.8594, + "epoch": 0.5032963219986121, + "grad_norm": 0.9620363712310791, + "learning_rate": 3.959055458198036e-05, + "loss": 0.7432, "step": 2901 }, { - "epoch": 1.0597042176373928, - "grad_norm": 1.088197112083435, - "learning_rate": 2.543916315068211e-05, - "loss": 0.7866, + "epoch": 0.5034698126301179, + "grad_norm": 0.9434872269630432, + "learning_rate": 3.958918187342855e-05, + "loss": 0.804, "step": 2902 }, { - "epoch": 1.060069381048019, - "grad_norm": 1.1335351467132568, - "learning_rate": 2.5425357263580246e-05, - "loss": 0.8735, + "epoch": 0.5036433032616239, + "grad_norm": 0.7687748670578003, + "learning_rate": 3.9587806891529354e-05, + "loss": 0.8567, "step": 2903 }, { - "epoch": 1.0604345444586452, - "grad_norm": 0.9706670045852661, - "learning_rate": 2.5411548585349772e-05, - "loss": 0.8318, + "epoch": 0.5038167938931297, + "grad_norm": 0.7927574515342712, + "learning_rate": 3.9586429636442346e-05, + "loss": 0.7888, "step": 2904 }, { - "epoch": 1.0607997078692715, - "grad_norm": 0.9958271980285645, - "learning_rate": 2.5397737123094697e-05, - "loss": 0.8707, + "epoch": 0.5039902845246357, + "grad_norm": 0.8664775490760803, + "learning_rate": 3.958505010832735e-05, + "loss": 1.0164, "step": 2905 }, { - "epoch": 1.0611648712798978, - "grad_norm": 1.2428768873214722, - "learning_rate": 2.5383922883920476e-05, - "loss": 0.8528, + "epoch": 0.5041637751561415, + "grad_norm": 0.8578490614891052, + "learning_rate": 3.958366830734448e-05, + "loss": 0.9199, "step": 2906 }, { - "epoch": 1.061530034690524, - "grad_norm": 0.9771381616592407, - "learning_rate": 2.5370105874933972e-05, - "loss": 0.8364, + "epoch": 0.5043372657876475, + "grad_norm": 1.0172429084777832, + "learning_rate": 3.958228423365408e-05, + "loss": 0.6816, "step": 2907 }, { - "epoch": 1.0618951981011502, - "grad_norm": 1.1079576015472412, - "learning_rate": 2.5356286103243485e-05, - "loss": 0.8157, + "epoch": 0.5045107564191533, + "grad_norm": 0.960933268070221, + "learning_rate": 3.958089788741677e-05, + "loss": 0.7607, "step": 2908 }, { - "epoch": 1.0622603615117765, - "grad_norm": 1.063029408454895, - "learning_rate": 2.5342463575958737e-05, - "loss": 0.7913, + "epoch": 0.5046842470506593, + "grad_norm": 0.748773455619812, + "learning_rate": 3.957950926879345e-05, + "loss": 0.791, "step": 2909 }, { - "epoch": 1.0626255249224028, - "grad_norm": 1.2698330879211426, - "learning_rate": 2.5328638300190856e-05, - "loss": 0.9265, + "epoch": 0.5048577376821651, + "grad_norm": 0.9004460573196411, + "learning_rate": 3.957811837794526e-05, + "loss": 0.6436, "step": 2910 }, { - "epoch": 1.0629906883330291, - "grad_norm": 0.8568744659423828, - "learning_rate": 2.531481028305239e-05, - "loss": 0.8845, + "epoch": 0.5050312283136711, + "grad_norm": 1.01688551902771, + "learning_rate": 3.9576725215033624e-05, + "loss": 0.7666, "step": 2911 }, { - "epoch": 1.0633558517436552, - "grad_norm": 1.125173568725586, - "learning_rate": 2.5300979531657305e-05, - "loss": 0.8159, + "epoch": 0.5052047189451769, + "grad_norm": 0.8028964996337891, + "learning_rate": 3.9575329780220215e-05, + "loss": 0.7712, "step": 2912 }, { - "epoch": 1.0637210151542815, - "grad_norm": 1.10563325881958, - "learning_rate": 2.528714605312097e-05, - "loss": 0.8417, + "epoch": 0.5053782095766829, + "grad_norm": 1.0828757286071777, + "learning_rate": 3.957393207366697e-05, + "loss": 0.7429, "step": 2913 }, { - "epoch": 1.0640861785649078, - "grad_norm": 1.0114808082580566, - "learning_rate": 2.5273309854560148e-05, - "loss": 0.8125, + "epoch": 0.5055517002081887, + "grad_norm": 1.0363036394119263, + "learning_rate": 3.957253209553611e-05, + "loss": 0.8286, "step": 2914 }, { - "epoch": 1.0644513419755341, - "grad_norm": 1.0786919593811035, - "learning_rate": 2.525947094309301e-05, - "loss": 0.8218, + "epoch": 0.5057251908396947, + "grad_norm": 0.9265990853309631, + "learning_rate": 3.9571129845990084e-05, + "loss": 0.8181, "step": 2915 }, { - "epoch": 1.0648165053861602, - "grad_norm": 1.1904783248901367, - "learning_rate": 2.5245629325839125e-05, - "loss": 0.8326, + "epoch": 0.5058986814712005, + "grad_norm": 0.9159021377563477, + "learning_rate": 3.956972532519164e-05, + "loss": 0.6707, "step": 2916 }, { - "epoch": 1.0651816687967866, - "grad_norm": 1.550451636314392, - "learning_rate": 2.5231785009919437e-05, - "loss": 0.8849, + "epoch": 0.5060721721027065, + "grad_norm": 1.0046429634094238, + "learning_rate": 3.956831853330376e-05, + "loss": 0.8743, "step": 2917 }, { - "epoch": 1.0655468322074129, - "grad_norm": 1.4624685049057007, - "learning_rate": 2.521793800245631e-05, - "loss": 0.8367, + "epoch": 0.5062456627342123, + "grad_norm": 1.0171116590499878, + "learning_rate": 3.956690947048972e-05, + "loss": 0.8164, "step": 2918 }, { - "epoch": 1.0659119956180392, - "grad_norm": 1.2864861488342285, - "learning_rate": 2.5204088310573455e-05, - "loss": 0.8402, + "epoch": 0.5064191533657183, + "grad_norm": 1.0186817646026611, + "learning_rate": 3.956549813691304e-05, + "loss": 0.6924, "step": 2919 }, { - "epoch": 1.0662771590286653, - "grad_norm": 1.0355206727981567, - "learning_rate": 2.5190235941395996e-05, - "loss": 0.7924, + "epoch": 0.5065926439972241, + "grad_norm": 0.8515982627868652, + "learning_rate": 3.9564084532737495e-05, + "loss": 0.8579, "step": 2920 }, { - "epoch": 1.0666423224392916, - "grad_norm": 1.0127636194229126, - "learning_rate": 2.5176380902050418e-05, - "loss": 0.8655, + "epoch": 0.5067661346287301, + "grad_norm": 1.6872906684875488, + "learning_rate": 3.956266865812714e-05, + "loss": 0.7427, "step": 2921 }, { - "epoch": 1.067007485849918, - "grad_norm": 1.0824427604675293, - "learning_rate": 2.5162523199664583e-05, - "loss": 0.88, + "epoch": 0.506939625260236, + "grad_norm": 0.9383764266967773, + "learning_rate": 3.9561250513246306e-05, + "loss": 0.8022, "step": 2922 }, { - "epoch": 1.0673726492605442, - "grad_norm": 1.2596038579940796, - "learning_rate": 2.5148662841367718e-05, - "loss": 0.8278, + "epoch": 0.5071131158917418, + "grad_norm": 1.3262748718261719, + "learning_rate": 3.9559830098259544e-05, + "loss": 0.7769, "step": 2923 }, { - "epoch": 1.0677378126711703, - "grad_norm": 1.028663992881775, - "learning_rate": 2.5134799834290417e-05, - "loss": 0.8737, + "epoch": 0.5072866065232478, + "grad_norm": 1.2814624309539795, + "learning_rate": 3.955840741333171e-05, + "loss": 0.6249, "step": 2924 }, { - "epoch": 1.0681029760817966, - "grad_norm": 0.9494677782058716, - "learning_rate": 2.512093418556466e-05, - "loss": 0.809, + "epoch": 0.5074600971547536, + "grad_norm": 1.011806607246399, + "learning_rate": 3.9556982458627905e-05, + "loss": 0.7603, "step": 2925 }, { - "epoch": 1.068468139492423, - "grad_norm": 1.1487404108047485, - "learning_rate": 2.510706590232374e-05, - "loss": 0.8329, + "epoch": 0.5076335877862596, + "grad_norm": 0.7577089667320251, + "learning_rate": 3.9555555234313506e-05, + "loss": 0.7593, "step": 2926 }, { - "epoch": 1.068833302903049, - "grad_norm": 1.1503838300704956, - "learning_rate": 2.5093194991702362e-05, - "loss": 0.8154, + "epoch": 0.5078070784177654, + "grad_norm": 0.9670544862747192, + "learning_rate": 3.955412574055413e-05, + "loss": 0.6934, "step": 2927 }, { - "epoch": 1.0691984663136753, - "grad_norm": 0.8905759453773499, - "learning_rate": 2.5079321460836528e-05, - "loss": 0.8442, + "epoch": 0.5079805690492714, + "grad_norm": 1.693077802658081, + "learning_rate": 3.9552693977515675e-05, + "loss": 0.6819, "step": 2928 }, { - "epoch": 1.0695636297243016, - "grad_norm": 1.208771824836731, - "learning_rate": 2.5065445316863627e-05, - "loss": 0.8481, + "epoch": 0.5081540596807772, + "grad_norm": 1.0255695581436157, + "learning_rate": 3.95512599453643e-05, + "loss": 0.7134, "step": 2929 }, { - "epoch": 1.069928793134928, - "grad_norm": 1.0280394554138184, - "learning_rate": 2.5051566566922377e-05, - "loss": 0.8673, + "epoch": 0.5083275503122832, + "grad_norm": 0.7281595468521118, + "learning_rate": 3.9549823644266434e-05, + "loss": 0.8066, "step": 2930 }, { - "epoch": 1.070293956545554, - "grad_norm": 1.0656131505966187, - "learning_rate": 2.503768521815283e-05, - "loss": 0.8361, + "epoch": 0.508501040943789, + "grad_norm": 0.9152772426605225, + "learning_rate": 3.9548385074388745e-05, + "loss": 0.7998, "step": 2931 }, { - "epoch": 1.0706591199561803, - "grad_norm": 1.103561282157898, - "learning_rate": 2.5023801277696393e-05, - "loss": 0.8865, + "epoch": 0.508674531575295, + "grad_norm": 0.9469582438468933, + "learning_rate": 3.9546944235898194e-05, + "loss": 0.8909, "step": 2932 }, { - "epoch": 1.0710242833668067, - "grad_norm": 1.0599820613861084, - "learning_rate": 2.5009914752695785e-05, - "loss": 0.8868, + "epoch": 0.5088480222068008, + "grad_norm": 0.929584264755249, + "learning_rate": 3.9545501128961985e-05, + "loss": 0.8188, "step": 2933 }, { - "epoch": 1.071389446777433, - "grad_norm": 0.8091131448745728, - "learning_rate": 2.4996025650295072e-05, - "loss": 0.8621, + "epoch": 0.5090215128383068, + "grad_norm": 0.7629474401473999, + "learning_rate": 3.954405575374759e-05, + "loss": 0.9312, "step": 2934 }, { - "epoch": 1.071754610188059, - "grad_norm": 1.1902296543121338, - "learning_rate": 2.4982133977639644e-05, - "loss": 0.8544, + "epoch": 0.5091950034698126, + "grad_norm": 0.8364317417144775, + "learning_rate": 3.9542608110422764e-05, + "loss": 0.7522, "step": 2935 }, { - "epoch": 1.0721197735986854, - "grad_norm": 1.046217679977417, - "learning_rate": 2.4968239741876205e-05, - "loss": 0.8301, + "epoch": 0.5093684941013186, + "grad_norm": 0.9466094374656677, + "learning_rate": 3.954115819915549e-05, + "loss": 0.74, "step": 2936 }, { - "epoch": 1.0724849370093117, - "grad_norm": 1.0528533458709717, - "learning_rate": 2.4954342950152786e-05, - "loss": 0.8528, + "epoch": 0.5095419847328244, + "grad_norm": 1.1584111452102661, + "learning_rate": 3.953970602011404e-05, + "loss": 0.7881, "step": 2937 }, { - "epoch": 1.072850100419938, - "grad_norm": 1.0934892892837524, - "learning_rate": 2.4940443609618713e-05, - "loss": 0.8855, + "epoch": 0.5097154753643304, + "grad_norm": 0.9450492858886719, + "learning_rate": 3.9538251573466926e-05, + "loss": 0.7671, "step": 2938 }, { - "epoch": 1.073215263830564, - "grad_norm": 1.3011666536331177, - "learning_rate": 2.4926541727424663e-05, - "loss": 0.8003, + "epoch": 0.5098889659958362, + "grad_norm": 1.058920979499817, + "learning_rate": 3.9536794859382966e-05, + "loss": 0.7405, "step": 2939 }, { - "epoch": 1.0735804272411904, - "grad_norm": 0.9563901424407959, - "learning_rate": 2.4912637310722575e-05, - "loss": 0.8631, + "epoch": 0.5100624566273422, + "grad_norm": 0.8548610210418701, + "learning_rate": 3.9535335878031185e-05, + "loss": 0.8716, "step": 2940 }, { - "epoch": 1.0739455906518167, - "grad_norm": 1.175918698310852, - "learning_rate": 2.4898730366665724e-05, - "loss": 0.8538, + "epoch": 0.510235947258848, + "grad_norm": 0.7722176909446716, + "learning_rate": 3.953387462958092e-05, + "loss": 0.6863, "step": 2941 }, { - "epoch": 1.074310754062443, - "grad_norm": 0.9328047037124634, - "learning_rate": 2.488482090240868e-05, - "loss": 0.8726, + "epoch": 0.5104094378903539, + "grad_norm": 0.9942623376846313, + "learning_rate": 3.953241111420174e-05, + "loss": 0.7737, "step": 2942 }, { - "epoch": 1.0746759174730691, - "grad_norm": 1.162660837173462, - "learning_rate": 2.487090892510729e-05, - "loss": 0.813, + "epoch": 0.5105829285218598, + "grad_norm": 0.8595191240310669, + "learning_rate": 3.9530945332063486e-05, + "loss": 0.7402, "step": 2943 }, { - "epoch": 1.0750410808836954, - "grad_norm": 1.0445667505264282, - "learning_rate": 2.4856994441918718e-05, - "loss": 0.8496, + "epoch": 0.5107564191533657, + "grad_norm": 0.983489453792572, + "learning_rate": 3.9529477283336274e-05, + "loss": 0.7932, "step": 2944 }, { - "epoch": 1.0754062442943217, - "grad_norm": 1.1934785842895508, - "learning_rate": 2.4843077460001405e-05, - "loss": 0.8719, + "epoch": 0.5109299097848716, + "grad_norm": 0.7555999159812927, + "learning_rate": 3.952800696819046e-05, + "loss": 0.8591, "step": 2945 }, { - "epoch": 1.075771407704948, - "grad_norm": 1.4441841840744019, - "learning_rate": 2.482915798651507e-05, - "loss": 0.8549, + "epoch": 0.5111034004163775, + "grad_norm": 1.0308246612548828, + "learning_rate": 3.9526534386796696e-05, + "loss": 0.7559, "step": 2946 }, { - "epoch": 1.0761365711155741, - "grad_norm": 0.9177552461624146, - "learning_rate": 2.481523602862075e-05, - "loss": 0.8368, + "epoch": 0.5112768910478834, + "grad_norm": 0.8571133613586426, + "learning_rate": 3.9525059539325854e-05, + "loss": 0.7336, "step": 2947 }, { - "epoch": 1.0765017345262005, - "grad_norm": 1.0575183629989624, - "learning_rate": 2.48013115934807e-05, - "loss": 0.8356, + "epoch": 0.5114503816793893, + "grad_norm": 0.9205681085586548, + "learning_rate": 3.95235824259491e-05, + "loss": 0.6934, "step": 2948 }, { - "epoch": 1.0768668979368268, - "grad_norm": 1.0422428846359253, - "learning_rate": 2.4787384688258514e-05, - "loss": 0.8734, + "epoch": 0.5116238723108952, + "grad_norm": 1.1804134845733643, + "learning_rate": 3.952210304683786e-05, + "loss": 0.8584, "step": 2949 }, { - "epoch": 1.077232061347453, - "grad_norm": 1.1633937358856201, - "learning_rate": 2.4773455320119005e-05, - "loss": 0.8259, + "epoch": 0.5117973629424011, + "grad_norm": 0.8137403726577759, + "learning_rate": 3.952062140216381e-05, + "loss": 0.7238, "step": 2950 }, { - "epoch": 1.0775972247580792, - "grad_norm": 1.133314609527588, - "learning_rate": 2.475952349622828e-05, - "loss": 0.8561, + "epoch": 0.511970853573907, + "grad_norm": 1.1116070747375488, + "learning_rate": 3.951913749209891e-05, + "loss": 0.8074, "step": 2951 }, { - "epoch": 1.0779623881687055, - "grad_norm": 1.3093271255493164, - "learning_rate": 2.474558922375371e-05, - "loss": 0.8882, + "epoch": 0.5121443442054129, + "grad_norm": 1.0000746250152588, + "learning_rate": 3.951765131681535e-05, + "loss": 0.8945, "step": 2952 }, { - "epoch": 1.0783275515793318, - "grad_norm": 1.4035468101501465, - "learning_rate": 2.4731652509863904e-05, - "loss": 0.8151, + "epoch": 0.5123178348369188, + "grad_norm": 0.8885847926139832, + "learning_rate": 3.951616287648561e-05, + "loss": 0.7947, "step": 2953 }, { - "epoch": 1.0786927149899581, - "grad_norm": 0.9490573406219482, - "learning_rate": 2.471771336172876e-05, - "loss": 0.7961, + "epoch": 0.5124913254684247, + "grad_norm": 1.013608455657959, + "learning_rate": 3.9514672171282435e-05, + "loss": 0.6637, "step": 2954 }, { - "epoch": 1.0790578784005842, - "grad_norm": 1.1828008890151978, - "learning_rate": 2.4703771786519392e-05, - "loss": 0.826, + "epoch": 0.5126648160999306, + "grad_norm": 0.8114069700241089, + "learning_rate": 3.951317920137881e-05, + "loss": 0.8062, "step": 2955 }, { - "epoch": 1.0794230418112105, - "grad_norm": 1.390709400177002, - "learning_rate": 2.4689827791408198e-05, - "loss": 0.8273, + "epoch": 0.5128383067314365, + "grad_norm": 1.4771299362182617, + "learning_rate": 3.951168396694801e-05, + "loss": 0.7473, "step": 2956 }, { - "epoch": 1.0797882052218368, - "grad_norm": 0.8040955066680908, - "learning_rate": 2.4675881383568797e-05, - "loss": 0.874, + "epoch": 0.5130117973629424, + "grad_norm": 1.705639123916626, + "learning_rate": 3.951018646816354e-05, + "loss": 0.9487, "step": 2957 }, { - "epoch": 1.080153368632463, - "grad_norm": 1.0622693300247192, - "learning_rate": 2.4661932570176047e-05, - "loss": 0.9028, + "epoch": 0.5131852879944483, + "grad_norm": 1.2709617614746094, + "learning_rate": 3.9508686705199196e-05, + "loss": 0.6777, "step": 2958 }, { - "epoch": 1.0805185320430892, - "grad_norm": 0.9782107472419739, - "learning_rate": 2.464798135840607e-05, - "loss": 0.8636, + "epoch": 0.5133587786259542, + "grad_norm": 1.1813833713531494, + "learning_rate": 3.9507184678229035e-05, + "loss": 0.6471, "step": 2959 }, { - "epoch": 1.0808836954537155, - "grad_norm": 1.3454830646514893, - "learning_rate": 2.4634027755436192e-05, - "loss": 0.8926, + "epoch": 0.5135322692574601, + "grad_norm": 0.7091451287269592, + "learning_rate": 3.950568038742736e-05, + "loss": 0.8972, "step": 2960 }, { - "epoch": 1.0812488588643419, - "grad_norm": 1.221238613128662, - "learning_rate": 2.4620071768444985e-05, - "loss": 0.8292, + "epoch": 0.5137057598889659, + "grad_norm": 1.0388126373291016, + "learning_rate": 3.9504173832968744e-05, + "loss": 0.8684, "step": 2961 }, { - "epoch": 1.081614022274968, - "grad_norm": 0.9671429991722107, - "learning_rate": 2.4606113404612244e-05, - "loss": 0.8209, + "epoch": 0.5138792505204719, + "grad_norm": 1.0451043844223022, + "learning_rate": 3.950266501502803e-05, + "loss": 0.689, "step": 2962 }, { - "epoch": 1.0819791856855943, - "grad_norm": 2.2429728507995605, - "learning_rate": 2.4592152671118993e-05, - "loss": 0.8871, + "epoch": 0.5140527411519777, + "grad_norm": 1.0161809921264648, + "learning_rate": 3.9501153933780314e-05, + "loss": 0.7461, "step": 2963 }, { - "epoch": 1.0823443490962206, - "grad_norm": 1.1923274993896484, - "learning_rate": 2.4578189575147465e-05, - "loss": 0.8412, + "epoch": 0.5142262317834837, + "grad_norm": 0.8203110098838806, + "learning_rate": 3.9499640589400964e-05, + "loss": 0.8521, "step": 2964 }, { - "epoch": 1.0827095125068469, - "grad_norm": 1.0253819227218628, - "learning_rate": 2.4564224123881103e-05, - "loss": 0.8445, + "epoch": 0.5143997224149895, + "grad_norm": 1.6004801988601685, + "learning_rate": 3.94981249820656e-05, + "loss": 0.8083, "step": 2965 }, { - "epoch": 1.083074675917473, - "grad_norm": 1.2940071821212769, - "learning_rate": 2.4550256324504594e-05, - "loss": 0.8281, + "epoch": 0.5145732130464955, + "grad_norm": 1.0159929990768433, + "learning_rate": 3.949660711195011e-05, + "loss": 0.9341, "step": 2966 }, { - "epoch": 1.0834398393280993, - "grad_norm": 1.426045536994934, - "learning_rate": 2.4536286184203783e-05, - "loss": 0.8355, + "epoch": 0.5147467036780013, + "grad_norm": 3.65317964553833, + "learning_rate": 3.9495086979230656e-05, + "loss": 0.7202, "step": 2967 }, { - "epoch": 1.0838050027387256, - "grad_norm": 0.9397575259208679, - "learning_rate": 2.4522313710165765e-05, - "loss": 0.7874, + "epoch": 0.5149201943095073, + "grad_norm": 0.9104076623916626, + "learning_rate": 3.949356458408363e-05, + "loss": 0.7891, "step": 2968 }, { - "epoch": 1.084170166149352, - "grad_norm": 1.793645977973938, - "learning_rate": 2.4508338909578817e-05, - "loss": 0.8423, + "epoch": 0.5150936849410132, + "grad_norm": 1.1983460187911987, + "learning_rate": 3.9492039926685724e-05, + "loss": 0.6676, "step": 2969 }, { - "epoch": 1.084535329559978, - "grad_norm": 1.2501137256622314, - "learning_rate": 2.4494361789632405e-05, - "loss": 0.8459, + "epoch": 0.5152671755725191, + "grad_norm": 1.0507065057754517, + "learning_rate": 3.9490513007213874e-05, + "loss": 0.7273, "step": 2970 }, { - "epoch": 1.0849004929706043, - "grad_norm": 0.9520185589790344, - "learning_rate": 2.4480382357517195e-05, - "loss": 0.8136, + "epoch": 0.515440666204025, + "grad_norm": 0.8703423738479614, + "learning_rate": 3.948898382584528e-05, + "loss": 0.6948, "step": 2971 }, { - "epoch": 1.0852656563812306, - "grad_norm": 0.9832046031951904, - "learning_rate": 2.4466400620425054e-05, - "loss": 0.8934, + "epoch": 0.5156141568355309, + "grad_norm": 0.8557333946228027, + "learning_rate": 3.94874523827574e-05, + "loss": 0.8257, "step": 2972 }, { - "epoch": 1.085630819791857, - "grad_norm": 1.3599169254302979, - "learning_rate": 2.4452416585549018e-05, - "loss": 0.8593, + "epoch": 0.5157876474670368, + "grad_norm": 1.1015901565551758, + "learning_rate": 3.9485918678127954e-05, + "loss": 0.9783, "step": 2973 }, { - "epoch": 1.085995983202483, - "grad_norm": 0.9382385611534119, - "learning_rate": 2.443843026008331e-05, - "loss": 0.887, + "epoch": 0.5159611380985427, + "grad_norm": 1.1745623350143433, + "learning_rate": 3.9484382712134956e-05, + "loss": 0.8706, "step": 2974 }, { - "epoch": 1.0863611466131093, - "grad_norm": 1.0336482524871826, - "learning_rate": 2.4424441651223343e-05, - "loss": 0.8329, + "epoch": 0.5161346287300486, + "grad_norm": 0.8550458550453186, + "learning_rate": 3.948284448495663e-05, + "loss": 0.7917, "step": 2975 }, { - "epoch": 1.0867263100237357, - "grad_norm": 0.9639894366264343, - "learning_rate": 2.4410450766165688e-05, - "loss": 0.8591, + "epoch": 0.5163081193615545, + "grad_norm": 0.8362998962402344, + "learning_rate": 3.9481303996771505e-05, + "loss": 0.886, "step": 2976 }, { - "epoch": 1.087091473434362, - "grad_norm": 1.1854790449142456, - "learning_rate": 2.439645761210809e-05, - "loss": 0.8492, + "epoch": 0.5164816099930604, + "grad_norm": 1.2496426105499268, + "learning_rate": 3.947976124775835e-05, + "loss": 0.8395, "step": 2977 }, { - "epoch": 1.087456636844988, - "grad_norm": 1.2354331016540527, - "learning_rate": 2.438246219624947e-05, - "loss": 0.8314, + "epoch": 0.5166551006245663, + "grad_norm": 0.971497654914856, + "learning_rate": 3.9478216238096206e-05, + "loss": 0.6655, "step": 2978 }, { - "epoch": 1.0878218002556144, - "grad_norm": 1.2114131450653076, - "learning_rate": 2.4368464525789905e-05, - "loss": 0.804, + "epoch": 0.5168285912560722, + "grad_norm": 1.1311416625976562, + "learning_rate": 3.947666896796436e-05, + "loss": 0.8601, "step": 2979 }, { - "epoch": 1.0881869636662407, - "grad_norm": 1.0063916444778442, - "learning_rate": 2.435446460793064e-05, - "loss": 0.8438, + "epoch": 0.5170020818875781, + "grad_norm": 0.8275728821754456, + "learning_rate": 3.94751194375424e-05, + "loss": 0.874, "step": 2980 }, { - "epoch": 1.088552127076867, - "grad_norm": 1.0877543687820435, - "learning_rate": 2.4340462449874063e-05, - "loss": 0.868, + "epoch": 0.517175572519084, + "grad_norm": 1.7668403387069702, + "learning_rate": 3.947356764701013e-05, + "loss": 0.6873, "step": 2981 }, { - "epoch": 1.088917290487493, - "grad_norm": 0.9337718486785889, - "learning_rate": 2.4326458058823735e-05, - "loss": 0.8789, + "epoch": 0.5173490631505898, + "grad_norm": 0.9337339997291565, + "learning_rate": 3.9472013596547646e-05, + "loss": 0.7438, "step": 2982 }, { - "epoch": 1.0892824538981194, - "grad_norm": 1.2234938144683838, - "learning_rate": 2.4312451441984344e-05, - "loss": 0.8286, + "epoch": 0.5175225537820958, + "grad_norm": 1.4055837392807007, + "learning_rate": 3.947045728633529e-05, + "loss": 0.7146, "step": 2983 }, { - "epoch": 1.0896476173087457, - "grad_norm": 1.1594382524490356, - "learning_rate": 2.429844260656173e-05, - "loss": 0.855, + "epoch": 0.5176960444136016, + "grad_norm": 0.8422994613647461, + "learning_rate": 3.946889871655368e-05, + "loss": 0.6719, "step": 2984 }, { - "epoch": 1.090012780719372, - "grad_norm": 1.1013435125350952, - "learning_rate": 2.4284431559762888e-05, - "loss": 0.8876, + "epoch": 0.5178695350451076, + "grad_norm": 0.8104444146156311, + "learning_rate": 3.9467337887383695e-05, + "loss": 0.6991, "step": 2985 }, { - "epoch": 1.0903779441299981, - "grad_norm": 1.0936610698699951, - "learning_rate": 2.4270418308795923e-05, - "loss": 0.8748, + "epoch": 0.5180430256766134, + "grad_norm": 1.076478362083435, + "learning_rate": 3.946577479900645e-05, + "loss": 0.8701, "step": 2986 }, { - "epoch": 1.0907431075406244, - "grad_norm": 0.8980793356895447, - "learning_rate": 2.4256402860870107e-05, - "loss": 0.8457, + "epoch": 0.5182165163081194, + "grad_norm": 1.0439811944961548, + "learning_rate": 3.9464209451603367e-05, + "loss": 0.7563, "step": 2987 }, { - "epoch": 1.0911082709512507, - "grad_norm": 1.0930765867233276, - "learning_rate": 2.424238522319581e-05, - "loss": 0.8256, + "epoch": 0.5183900069396252, + "grad_norm": 1.176451325416565, + "learning_rate": 3.9462641845356096e-05, + "loss": 0.7771, "step": 2988 }, { - "epoch": 1.0914734343618768, - "grad_norm": 1.1165603399276733, - "learning_rate": 2.4228365402984562e-05, - "loss": 0.8613, + "epoch": 0.5185634975711312, + "grad_norm": 0.9350600838661194, + "learning_rate": 3.946107198044656e-05, + "loss": 0.7463, "step": 2989 }, { - "epoch": 1.0918385977725031, - "grad_norm": 0.8945240378379822, - "learning_rate": 2.4214343407448984e-05, - "loss": 0.8593, + "epoch": 0.518736988202637, + "grad_norm": 0.7793442010879517, + "learning_rate": 3.945949985705694e-05, + "loss": 0.7812, "step": 2990 }, { - "epoch": 1.0922037611831295, - "grad_norm": 1.0547864437103271, - "learning_rate": 2.4200319243802826e-05, - "loss": 0.8196, + "epoch": 0.518910478834143, + "grad_norm": 0.8322274684906006, + "learning_rate": 3.945792547536969e-05, + "loss": 0.8384, "step": 2991 }, { - "epoch": 1.0925689245937558, - "grad_norm": 1.047508716583252, - "learning_rate": 2.4186292919260975e-05, - "loss": 0.8542, + "epoch": 0.5190839694656488, + "grad_norm": 1.171189785003662, + "learning_rate": 3.945634883556752e-05, + "loss": 0.7588, "step": 2992 }, { - "epoch": 1.092934088004382, - "grad_norm": 1.1461979150772095, - "learning_rate": 2.41722644410394e-05, - "loss": 0.8627, + "epoch": 0.5192574600971548, + "grad_norm": 1.136230230331421, + "learning_rate": 3.945476993783339e-05, + "loss": 0.7249, "step": 2993 }, { - "epoch": 1.0932992514150082, - "grad_norm": 1.0230441093444824, - "learning_rate": 2.4158233816355185e-05, - "loss": 0.8474, + "epoch": 0.5194309507286606, + "grad_norm": 0.8590058088302612, + "learning_rate": 3.945318878235054e-05, + "loss": 0.6796, "step": 2994 }, { - "epoch": 1.0936644148256345, - "grad_norm": 1.2001020908355713, - "learning_rate": 2.4144201052426543e-05, - "loss": 0.8503, + "epoch": 0.5196044413601666, + "grad_norm": 0.8924686908721924, + "learning_rate": 3.945160536930247e-05, + "loss": 0.8401, "step": 2995 }, { - "epoch": 1.0940295782362608, - "grad_norm": 1.1239519119262695, - "learning_rate": 2.413016615647275e-05, - "loss": 0.8285, + "epoch": 0.5197779319916724, + "grad_norm": 0.9075596332550049, + "learning_rate": 3.945001969887293e-05, + "loss": 0.7557, "step": 2996 }, { - "epoch": 1.0943947416468869, - "grad_norm": 1.1647320985794067, - "learning_rate": 2.4116129135714208e-05, - "loss": 0.8563, + "epoch": 0.5199514226231784, + "grad_norm": 0.7987745404243469, + "learning_rate": 3.944843177124593e-05, + "loss": 0.8422, "step": 2997 }, { - "epoch": 1.0947599050575132, - "grad_norm": 1.0578867197036743, - "learning_rate": 2.410208999737239e-05, - "loss": 0.8271, + "epoch": 0.5201249132546842, + "grad_norm": 0.9596798419952393, + "learning_rate": 3.944684158660577e-05, + "loss": 0.7194, "step": 2998 }, { - "epoch": 1.0951250684681395, - "grad_norm": 0.8538228869438171, - "learning_rate": 2.408804874866987e-05, - "loss": 0.8363, + "epoch": 0.5202984038861902, + "grad_norm": 1.2856864929199219, + "learning_rate": 3.944524914513698e-05, + "loss": 0.718, "step": 2999 }, { - "epoch": 1.0954902318787658, - "grad_norm": 1.2909101247787476, - "learning_rate": 2.4074005396830322e-05, - "loss": 0.8207, + "epoch": 0.520471894517696, + "grad_norm": 1.2975484132766724, + "learning_rate": 3.944365444702437e-05, + "loss": 0.7751, "step": 3000 }, { - "epoch": 1.095855395289392, - "grad_norm": 0.8900764584541321, - "learning_rate": 2.4059959949078467e-05, - "loss": 0.8314, + "epoch": 0.5206453851492019, + "grad_norm": 0.908949613571167, + "learning_rate": 3.944205749245301e-05, + "loss": 0.8523, "step": 3001 }, { - "epoch": 1.0962205587000182, - "grad_norm": 1.3303413391113281, - "learning_rate": 2.4045912412640146e-05, - "loss": 0.8833, + "epoch": 0.5208188757807078, + "grad_norm": 1.0155155658721924, + "learning_rate": 3.944045828160822e-05, + "loss": 0.7365, "step": 3002 }, { - "epoch": 1.0965857221106445, - "grad_norm": 0.9703379273414612, - "learning_rate": 2.4031862794742238e-05, - "loss": 0.8358, + "epoch": 0.5209923664122137, + "grad_norm": 0.7426880598068237, + "learning_rate": 3.943885681467559e-05, + "loss": 0.7957, "step": 3003 }, { - "epoch": 1.0969508855212708, - "grad_norm": 1.808646559715271, - "learning_rate": 2.401781110261271e-05, - "loss": 0.8505, + "epoch": 0.5211658570437196, + "grad_norm": 0.7610426545143127, + "learning_rate": 3.943725309184098e-05, + "loss": 0.8273, "step": 3004 }, { - "epoch": 1.097316048931897, - "grad_norm": 1.283957839012146, - "learning_rate": 2.400375734348059e-05, - "loss": 0.8186, + "epoch": 0.5213393476752255, + "grad_norm": 1.0974400043487549, + "learning_rate": 3.94356471132905e-05, + "loss": 0.7839, "step": 3005 }, { - "epoch": 1.0976812123425232, - "grad_norm": 1.112578272819519, - "learning_rate": 2.3989701524575976e-05, - "loss": 0.8533, + "epoch": 0.5215128383067315, + "grad_norm": 0.8054469227790833, + "learning_rate": 3.943403887921052e-05, + "loss": 0.8235, "step": 3006 }, { - "epoch": 1.0980463757531496, - "grad_norm": 1.5075180530548096, - "learning_rate": 2.3975643653130032e-05, - "loss": 0.7976, + "epoch": 0.5216863289382373, + "grad_norm": 1.4393067359924316, + "learning_rate": 3.943242838978769e-05, + "loss": 0.7185, "step": 3007 }, { - "epoch": 1.0984115391637759, - "grad_norm": 1.0767825841903687, - "learning_rate": 2.3961583736374955e-05, - "loss": 0.8194, + "epoch": 0.5218598195697433, + "grad_norm": 0.8773645162582397, + "learning_rate": 3.94308156452089e-05, + "loss": 0.824, "step": 3008 }, { - "epoch": 1.098776702574402, - "grad_norm": 1.3347430229187012, - "learning_rate": 2.394752178154402e-05, - "loss": 0.8344, + "epoch": 0.5220333102012491, + "grad_norm": 0.8960959911346436, + "learning_rate": 3.942920064566131e-05, + "loss": 0.7905, "step": 3009 }, { - "epoch": 1.0991418659850283, - "grad_norm": 1.363793969154358, - "learning_rate": 2.393345779587153e-05, - "loss": 0.8438, + "epoch": 0.5222068008327551, + "grad_norm": 0.8004446029663086, + "learning_rate": 3.9427583391332354e-05, + "loss": 0.8525, "step": 3010 }, { - "epoch": 1.0995070293956546, - "grad_norm": 1.318525791168213, - "learning_rate": 2.3919391786592842e-05, - "loss": 0.8298, + "epoch": 0.5223802914642609, + "grad_norm": 0.8982938528060913, + "learning_rate": 3.94259638824097e-05, + "loss": 0.8101, "step": 3011 }, { - "epoch": 1.099872192806281, - "grad_norm": 1.36709463596344, - "learning_rate": 2.3905323760944356e-05, - "loss": 0.9082, + "epoch": 0.5225537820957669, + "grad_norm": 5.315499782562256, + "learning_rate": 3.94243421190813e-05, + "loss": 0.8422, "step": 3012 }, { - "epoch": 1.100237356216907, - "grad_norm": 1.1094307899475098, - "learning_rate": 2.3891253726163505e-05, - "loss": 0.7972, + "epoch": 0.5227272727272727, + "grad_norm": 0.8894351124763489, + "learning_rate": 3.942271810153537e-05, + "loss": 0.689, "step": 3013 }, { - "epoch": 1.1006025196275333, - "grad_norm": 1.328971028327942, - "learning_rate": 2.387718168948876e-05, - "loss": 0.8394, + "epoch": 0.5229007633587787, + "grad_norm": 0.8814420104026794, + "learning_rate": 3.9421091829960364e-05, + "loss": 0.7241, "step": 3014 }, { - "epoch": 1.1009676830381596, - "grad_norm": 1.186680555343628, - "learning_rate": 2.3863107658159614e-05, - "loss": 0.8123, + "epoch": 0.5230742539902845, + "grad_norm": 0.9443041086196899, + "learning_rate": 3.941946330454503e-05, + "loss": 0.7646, "step": 3015 }, { - "epoch": 1.101332846448786, - "grad_norm": 1.053097128868103, - "learning_rate": 2.38490316394166e-05, - "loss": 0.8862, + "epoch": 0.5232477446217905, + "grad_norm": 0.8961316347122192, + "learning_rate": 3.9417832525478344e-05, + "loss": 0.7991, "step": 3016 }, { - "epoch": 1.101698009859412, - "grad_norm": 0.9675694108009338, - "learning_rate": 2.3834953640501262e-05, - "loss": 0.8969, + "epoch": 0.5234212352532963, + "grad_norm": 0.7391827702522278, + "learning_rate": 3.941619949294957e-05, + "loss": 0.7545, "step": 3017 }, { - "epoch": 1.1020631732700383, - "grad_norm": 1.4018760919570923, - "learning_rate": 2.3820873668656154e-05, - "loss": 0.8263, + "epoch": 0.5235947258848023, + "grad_norm": 1.3026995658874512, + "learning_rate": 3.941456420714822e-05, + "loss": 0.7502, "step": 3018 }, { - "epoch": 1.1024283366806646, - "grad_norm": 1.1145563125610352, - "learning_rate": 2.380679173112487e-05, - "loss": 0.8705, + "epoch": 0.5237682165163081, + "grad_norm": 0.8779506087303162, + "learning_rate": 3.941292666826408e-05, + "loss": 0.6271, "step": 3019 }, { - "epoch": 1.1027935000912907, - "grad_norm": 1.1317867040634155, - "learning_rate": 2.3792707835151995e-05, - "loss": 0.8108, + "epoch": 0.5239417071478141, + "grad_norm": 0.9425469040870667, + "learning_rate": 3.941128687648717e-05, + "loss": 0.8286, "step": 3020 }, { - "epoch": 1.103158663501917, - "grad_norm": 1.1498045921325684, - "learning_rate": 2.3778621987983133e-05, - "loss": 0.7963, + "epoch": 0.5241151977793199, + "grad_norm": 1.1279855966567993, + "learning_rate": 3.94096448320078e-05, + "loss": 0.8108, "step": 3021 }, { - "epoch": 1.1035238269125434, - "grad_norm": 1.0250571966171265, - "learning_rate": 2.3764534196864886e-05, - "loss": 0.816, + "epoch": 0.5242886884108258, + "grad_norm": 0.8621366024017334, + "learning_rate": 3.940800053501653e-05, + "loss": 0.8909, "step": 3022 }, { - "epoch": 1.1038889903231697, - "grad_norm": 1.019167184829712, - "learning_rate": 2.3750444469044856e-05, - "loss": 0.8989, + "epoch": 0.5244621790423317, + "grad_norm": 1.0737360715866089, + "learning_rate": 3.940635398570418e-05, + "loss": 0.7622, "step": 3023 }, { - "epoch": 1.104254153733796, - "grad_norm": 1.1096253395080566, - "learning_rate": 2.3736352811771647e-05, - "loss": 0.9025, + "epoch": 0.5246356696738376, + "grad_norm": 1.2045055627822876, + "learning_rate": 3.9404705184261846e-05, + "loss": 0.7146, "step": 3024 }, { - "epoch": 1.104619317144422, - "grad_norm": 1.2118819952011108, - "learning_rate": 2.3722259232294835e-05, - "loss": 0.8116, + "epoch": 0.5248091603053435, + "grad_norm": 0.9849977493286133, + "learning_rate": 3.9403054130880864e-05, + "loss": 0.7454, "step": 3025 }, { - "epoch": 1.1049844805550484, - "grad_norm": 1.4517922401428223, - "learning_rate": 2.370816373786502e-05, - "loss": 0.8412, + "epoch": 0.5249826509368494, + "grad_norm": 0.7090593576431274, + "learning_rate": 3.9401400825752835e-05, + "loss": 0.9209, "step": 3026 }, { - "epoch": 1.1053496439656747, - "grad_norm": 0.9246478080749512, - "learning_rate": 2.3694066335733758e-05, - "loss": 0.8391, + "epoch": 0.5251561415683553, + "grad_norm": 2.141369581222534, + "learning_rate": 3.9399745269069636e-05, + "loss": 0.908, "step": 3027 }, { - "epoch": 1.1057148073763008, - "grad_norm": 1.0096077919006348, - "learning_rate": 2.3679967033153605e-05, - "loss": 0.8152, + "epoch": 0.5253296321998612, + "grad_norm": 0.7489013671875, + "learning_rate": 3.939808746102339e-05, + "loss": 0.751, "step": 3028 }, { - "epoch": 1.106079970786927, - "grad_norm": 1.157086968421936, - "learning_rate": 2.366586583737808e-05, - "loss": 0.8276, + "epoch": 0.5255031228313671, + "grad_norm": 3.3418264389038086, + "learning_rate": 3.93964274018065e-05, + "loss": 0.6926, "step": 3029 }, { - "epoch": 1.1064451341975534, - "grad_norm": 1.2601640224456787, - "learning_rate": 2.3651762755661683e-05, - "loss": 0.8142, + "epoch": 0.525676613462873, + "grad_norm": 0.8912116885185242, + "learning_rate": 3.9394765091611596e-05, + "loss": 0.7454, "step": 3030 }, { - "epoch": 1.1068102976081797, - "grad_norm": 0.9273794293403625, - "learning_rate": 2.3637657795259883e-05, - "loss": 0.8649, + "epoch": 0.5258501040943789, + "grad_norm": 1.640193223953247, + "learning_rate": 3.939310053063161e-05, + "loss": 0.8207, "step": 3031 }, { - "epoch": 1.1071754610188058, - "grad_norm": 1.0837106704711914, - "learning_rate": 2.3623550963429117e-05, - "loss": 0.8361, + "epoch": 0.5260235947258848, + "grad_norm": 0.8046913743019104, + "learning_rate": 3.939143371905971e-05, + "loss": 1.0085, "step": 3032 }, { - "epoch": 1.1075406244294321, - "grad_norm": 0.9654865860939026, - "learning_rate": 2.3609442267426787e-05, - "loss": 0.8599, + "epoch": 0.5261970853573907, + "grad_norm": 0.8675574660301208, + "learning_rate": 3.9389764657089334e-05, + "loss": 0.8481, "step": 3033 }, { - "epoch": 1.1079057878400584, - "grad_norm": 1.000752329826355, - "learning_rate": 2.3595331714511243e-05, - "loss": 0.8607, + "epoch": 0.5263705759888966, + "grad_norm": 0.9807295203208923, + "learning_rate": 3.938809334491417e-05, + "loss": 0.886, "step": 3034 }, { - "epoch": 1.1082709512506848, - "grad_norm": 1.240233302116394, - "learning_rate": 2.3581219311941813e-05, - "loss": 0.8838, + "epoch": 0.5265440666204025, + "grad_norm": 2.2702739238739014, + "learning_rate": 3.938641978272819e-05, + "loss": 0.8027, "step": 3035 }, { - "epoch": 1.1086361146613108, - "grad_norm": 0.9443249106407166, - "learning_rate": 2.3567105066978744e-05, - "loss": 0.8638, + "epoch": 0.5267175572519084, + "grad_norm": 1.0625413656234741, + "learning_rate": 3.9384743970725596e-05, + "loss": 0.751, "step": 3036 }, { - "epoch": 1.1090012780719372, - "grad_norm": 0.9443702697753906, - "learning_rate": 2.355298898688326e-05, - "loss": 0.8536, + "epoch": 0.5268910478834143, + "grad_norm": 1.0873180627822876, + "learning_rate": 3.938306590910088e-05, + "loss": 0.8125, "step": 3037 }, { - "epoch": 1.1093664414825635, - "grad_norm": 1.2399287223815918, - "learning_rate": 2.3538871078917514e-05, - "loss": 0.87, + "epoch": 0.5270645385149202, + "grad_norm": 1.0596976280212402, + "learning_rate": 3.938138559804878e-05, + "loss": 0.6921, "step": 3038 }, { - "epoch": 1.1097316048931898, - "grad_norm": 1.1163215637207031, - "learning_rate": 2.3524751350344602e-05, - "loss": 0.8477, + "epoch": 0.5272380291464261, + "grad_norm": 0.9662942290306091, + "learning_rate": 3.937970303776429e-05, + "loss": 0.7512, "step": 3039 }, { - "epoch": 1.1100967683038159, - "grad_norm": 1.0480562448501587, - "learning_rate": 2.3510629808428568e-05, - "loss": 0.8486, + "epoch": 0.527411519777932, + "grad_norm": 0.8078197240829468, + "learning_rate": 3.9378018228442696e-05, + "loss": 0.811, "step": 3040 }, { - "epoch": 1.1104619317144422, - "grad_norm": 1.1756761074066162, - "learning_rate": 2.349650646043436e-05, - "loss": 0.8353, + "epoch": 0.5275850104094378, + "grad_norm": 0.7719531059265137, + "learning_rate": 3.93763311702795e-05, + "loss": 0.8875, "step": 3041 }, { - "epoch": 1.1108270951250685, - "grad_norm": 1.2451987266540527, - "learning_rate": 2.3482381313627886e-05, - "loss": 0.875, + "epoch": 0.5277585010409438, + "grad_norm": 1.0576577186584473, + "learning_rate": 3.937464186347049e-05, + "loss": 0.7842, "step": 3042 }, { - "epoch": 1.1111922585356948, - "grad_norm": 1.1054638624191284, - "learning_rate": 2.3468254375275973e-05, - "loss": 0.8269, + "epoch": 0.5279319916724496, + "grad_norm": 0.9805222749710083, + "learning_rate": 3.9372950308211715e-05, + "loss": 0.6659, "step": 3043 }, { - "epoch": 1.111557421946321, - "grad_norm": 1.1607131958007812, - "learning_rate": 2.3454125652646348e-05, - "loss": 0.837, + "epoch": 0.5281054823039556, + "grad_norm": 1.0311145782470703, + "learning_rate": 3.9371256504699486e-05, + "loss": 0.7607, "step": 3044 }, { - "epoch": 1.1119225853569472, - "grad_norm": 1.3158084154129028, - "learning_rate": 2.343999515300769e-05, - "loss": 0.8818, + "epoch": 0.5282789729354614, + "grad_norm": 1.873684287071228, + "learning_rate": 3.9369560453130366e-05, + "loss": 0.8684, "step": 3045 }, { - "epoch": 1.1122877487675735, - "grad_norm": 1.1013789176940918, - "learning_rate": 2.3425862883629553e-05, - "loss": 0.8372, + "epoch": 0.5284524635669674, + "grad_norm": 0.7881127595901489, + "learning_rate": 3.936786215370119e-05, + "loss": 0.7344, "step": 3046 }, { - "epoch": 1.1126529121781998, - "grad_norm": 1.1742994785308838, - "learning_rate": 2.3411728851782442e-05, - "loss": 0.8301, + "epoch": 0.5286259541984732, + "grad_norm": 0.8100263476371765, + "learning_rate": 3.9366161606609045e-05, + "loss": 0.7859, "step": 3047 }, { - "epoch": 1.113018075588826, - "grad_norm": 1.118415117263794, - "learning_rate": 2.3397593064737737e-05, - "loss": 0.8677, + "epoch": 0.5287994448299792, + "grad_norm": 0.9468390941619873, + "learning_rate": 3.936445881205127e-05, + "loss": 0.8184, "step": 3048 }, { - "epoch": 1.1133832389994522, - "grad_norm": 1.429076075553894, - "learning_rate": 2.338345552976774e-05, - "loss": 0.8065, + "epoch": 0.528972935461485, + "grad_norm": 0.8014081120491028, + "learning_rate": 3.936275377022549e-05, + "loss": 0.8557, "step": 3049 }, { - "epoch": 1.1137484024100786, - "grad_norm": 1.069743275642395, - "learning_rate": 2.3369316254145636e-05, - "loss": 0.8975, + "epoch": 0.529146426092991, + "grad_norm": 0.9327256679534912, + "learning_rate": 3.936104648132957e-05, + "loss": 0.7236, "step": 3050 }, { - "epoch": 1.1141135658207049, - "grad_norm": 1.148484468460083, - "learning_rate": 2.3355175245145526e-05, - "loss": 0.8521, + "epoch": 0.5293199167244969, + "grad_norm": 1.0287213325500488, + "learning_rate": 3.9359336945561656e-05, + "loss": 0.8079, "step": 3051 }, { - "epoch": 1.114478729231331, - "grad_norm": 0.8761084079742432, - "learning_rate": 2.3341032510042387e-05, - "loss": 0.8442, + "epoch": 0.5294934073560028, + "grad_norm": 0.7664023637771606, + "learning_rate": 3.935762516312012e-05, + "loss": 0.7791, "step": 3052 }, { - "epoch": 1.1148438926419573, - "grad_norm": 1.0181853771209717, - "learning_rate": 2.3326888056112086e-05, - "loss": 0.8491, + "epoch": 0.5296668979875087, + "grad_norm": 0.863982617855072, + "learning_rate": 3.9355911134203626e-05, + "loss": 0.6143, "step": 3053 }, { - "epoch": 1.1152090560525836, - "grad_norm": 1.3679285049438477, - "learning_rate": 2.3312741890631383e-05, - "loss": 0.8617, + "epoch": 0.5298403886190146, + "grad_norm": 1.3836195468902588, + "learning_rate": 3.9354194859011105e-05, + "loss": 0.7102, "step": 3054 }, { - "epoch": 1.11557421946321, - "grad_norm": 1.1162315607070923, - "learning_rate": 2.3298594020877913e-05, - "loss": 0.8196, + "epoch": 0.5300138792505205, + "grad_norm": 0.946747362613678, + "learning_rate": 3.935247633774171e-05, + "loss": 0.7603, "step": 3055 }, { - "epoch": 1.115939382873836, - "grad_norm": 1.049153447151184, - "learning_rate": 2.328444445413018e-05, - "loss": 0.8557, + "epoch": 0.5301873698820264, + "grad_norm": 1.5280121564865112, + "learning_rate": 3.935075557059488e-05, + "loss": 0.8894, "step": 3056 }, { - "epoch": 1.1163045462844623, - "grad_norm": 1.1019624471664429, - "learning_rate": 2.3270293197667573e-05, - "loss": 0.8682, + "epoch": 0.5303608605135323, + "grad_norm": 0.7216495871543884, + "learning_rate": 3.934903255777033e-05, + "loss": 0.7842, "step": 3057 }, { - "epoch": 1.1166697096950886, - "grad_norm": 1.1146800518035889, - "learning_rate": 2.325614025877034e-05, - "loss": 0.829, + "epoch": 0.5305343511450382, + "grad_norm": 0.994425892829895, + "learning_rate": 3.934730729946799e-05, + "loss": 0.7605, "step": 3058 }, { - "epoch": 1.1170348731057147, - "grad_norm": 1.0454012155532837, - "learning_rate": 2.3241985644719603e-05, - "loss": 0.869, + "epoch": 0.5307078417765441, + "grad_norm": 0.7839878797531128, + "learning_rate": 3.934557979588811e-05, + "loss": 0.7858, "step": 3059 }, { - "epoch": 1.117400036516341, - "grad_norm": 1.2846031188964844, - "learning_rate": 2.3227829362797355e-05, - "loss": 0.8706, + "epoch": 0.5308813324080499, + "grad_norm": 1.4215407371520996, + "learning_rate": 3.9343850047231144e-05, + "loss": 0.6591, "step": 3060 }, { - "epoch": 1.1177651999269673, - "grad_norm": 1.067111611366272, - "learning_rate": 2.3213671420286413e-05, - "loss": 0.8346, + "epoch": 0.5310548230395559, + "grad_norm": 0.8328208923339844, + "learning_rate": 3.9342118053697837e-05, + "loss": 0.7018, "step": 3061 }, { - "epoch": 1.1181303633375936, - "grad_norm": 0.9509674906730652, - "learning_rate": 2.3199511824470487e-05, - "loss": 0.8867, + "epoch": 0.5312283136710617, + "grad_norm": 0.9908860921859741, + "learning_rate": 3.9340383815489204e-05, + "loss": 0.7454, "step": 3062 }, { - "epoch": 1.1184955267482197, - "grad_norm": 0.9968467950820923, - "learning_rate": 2.318535058263412e-05, - "loss": 0.8737, + "epoch": 0.5314018043025677, + "grad_norm": 1.0089342594146729, + "learning_rate": 3.933864733280648e-05, + "loss": 0.7678, "step": 3063 }, { - "epoch": 1.118860690158846, - "grad_norm": 1.114318609237671, - "learning_rate": 2.3171187702062693e-05, - "loss": 0.8496, + "epoch": 0.5315752949340735, + "grad_norm": 0.794348418712616, + "learning_rate": 3.933690860585121e-05, + "loss": 0.7058, "step": 3064 }, { - "epoch": 1.1192258535694724, - "grad_norm": 1.1825518608093262, - "learning_rate": 2.3157023190042448e-05, - "loss": 0.8832, + "epoch": 0.5317487855655795, + "grad_norm": 0.9596083760261536, + "learning_rate": 3.933516763482516e-05, + "loss": 0.8127, "step": 3065 }, { - "epoch": 1.1195910169800987, - "grad_norm": 1.2312291860580444, - "learning_rate": 2.3142857053860454e-05, - "loss": 0.88, + "epoch": 0.5319222761970853, + "grad_norm": 0.880204439163208, + "learning_rate": 3.933342441993037e-05, + "loss": 0.8627, "step": 3066 }, { - "epoch": 1.1199561803907248, - "grad_norm": 1.0980688333511353, - "learning_rate": 2.312868930080462e-05, - "loss": 0.8512, + "epoch": 0.5320957668285913, + "grad_norm": 0.7481338381767273, + "learning_rate": 3.9331678961369156e-05, + "loss": 0.8425, "step": 3067 }, { - "epoch": 1.120321343801351, - "grad_norm": 1.1012957096099854, - "learning_rate": 2.3114519938163683e-05, - "loss": 0.9009, + "epoch": 0.5322692574600971, + "grad_norm": 1.1428142786026, + "learning_rate": 3.932993125934407e-05, + "loss": 0.7412, "step": 3068 }, { - "epoch": 1.1206865072119774, - "grad_norm": 0.9966577887535095, - "learning_rate": 2.3100348973227224e-05, - "loss": 0.8544, + "epoch": 0.5324427480916031, + "grad_norm": 0.7860745191574097, + "learning_rate": 3.932818131405794e-05, + "loss": 0.8462, "step": 3069 }, { - "epoch": 1.1210516706226037, - "grad_norm": 1.0554652214050293, - "learning_rate": 2.3086176413285627e-05, - "loss": 0.8165, + "epoch": 0.5326162387231089, + "grad_norm": 0.7405009269714355, + "learning_rate": 3.932642912571385e-05, + "loss": 0.7423, "step": 3070 }, { - "epoch": 1.1214168340332298, - "grad_norm": 0.8021441698074341, - "learning_rate": 2.3072002265630102e-05, - "loss": 0.8627, + "epoch": 0.5327897293546149, + "grad_norm": 0.7272678017616272, + "learning_rate": 3.9324674694515126e-05, + "loss": 0.8258, "step": 3071 }, { - "epoch": 1.121781997443856, - "grad_norm": 1.059640645980835, - "learning_rate": 2.3057826537552684e-05, - "loss": 0.8077, + "epoch": 0.5329632199861207, + "grad_norm": 1.3885780572891235, + "learning_rate": 3.932291802066539e-05, + "loss": 0.6978, "step": 3072 }, { - "epoch": 1.1221471608544824, - "grad_norm": 1.254610300064087, - "learning_rate": 2.3043649236346216e-05, - "loss": 0.8245, + "epoch": 0.5331367106176267, + "grad_norm": 0.8554738163948059, + "learning_rate": 3.932115910436851e-05, + "loss": 0.7063, "step": 3073 }, { - "epoch": 1.1225123242651087, - "grad_norm": 1.2180901765823364, - "learning_rate": 2.302947036930435e-05, - "loss": 0.8705, + "epoch": 0.5333102012491325, + "grad_norm": 0.8081940412521362, + "learning_rate": 3.93193979458286e-05, + "loss": 0.6665, "step": 3074 }, { - "epoch": 1.1228774876757348, - "grad_norm": 1.0760871171951294, - "learning_rate": 2.301528994372154e-05, - "loss": 0.8419, + "epoch": 0.5334836918806385, + "grad_norm": 0.8720418810844421, + "learning_rate": 3.931763454525005e-05, + "loss": 0.8206, "step": 3075 }, { - "epoch": 1.1232426510863611, - "grad_norm": 1.087729811668396, - "learning_rate": 2.3001107966893054e-05, - "loss": 0.8506, + "epoch": 0.5336571825121443, + "grad_norm": 0.9934757351875305, + "learning_rate": 3.93158689028375e-05, + "loss": 0.6455, "step": 3076 }, { - "epoch": 1.1236078144969874, - "grad_norm": 1.3595077991485596, - "learning_rate": 2.2986924446114947e-05, - "loss": 0.8284, + "epoch": 0.5338306731436503, + "grad_norm": 1.828513741493225, + "learning_rate": 3.931410101879585e-05, + "loss": 0.7424, "step": 3077 }, { - "epoch": 1.1239729779076137, - "grad_norm": 1.440434217453003, - "learning_rate": 2.2972739388684068e-05, - "loss": 0.8112, + "epoch": 0.5340041637751561, + "grad_norm": 0.7511942386627197, + "learning_rate": 3.931233089333027e-05, + "loss": 0.6996, "step": 3078 }, { - "epoch": 1.1243381413182398, - "grad_norm": 0.8335994482040405, - "learning_rate": 2.2958552801898068e-05, - "loss": 0.8505, + "epoch": 0.5341776544066621, + "grad_norm": 0.6905186772346497, + "learning_rate": 3.931055852664619e-05, + "loss": 0.8386, "step": 3079 }, { - "epoch": 1.1247033047288661, - "grad_norm": 1.116257667541504, - "learning_rate": 2.294436469305536e-05, - "loss": 0.8454, + "epoch": 0.5343511450381679, + "grad_norm": 1.2208346128463745, + "learning_rate": 3.9308783918949296e-05, + "loss": 0.7196, "step": 3080 }, { - "epoch": 1.1250684681394925, - "grad_norm": 1.0973936319351196, - "learning_rate": 2.2930175069455175e-05, - "loss": 0.86, + "epoch": 0.5345246356696738, + "grad_norm": 0.8233733773231506, + "learning_rate": 3.930700707044552e-05, + "loss": 0.8105, "step": 3081 }, { - "epoch": 1.1254336315501188, - "grad_norm": 1.2892956733703613, - "learning_rate": 2.2915983938397494e-05, - "loss": 0.8474, + "epoch": 0.5346981263011797, + "grad_norm": 0.9079757332801819, + "learning_rate": 3.9305227981341085e-05, + "loss": 0.8093, "step": 3082 }, { - "epoch": 1.1257987949607449, - "grad_norm": 1.316504716873169, - "learning_rate": 2.290179130718309e-05, - "loss": 0.8865, + "epoch": 0.5348716169326856, + "grad_norm": 1.066965103149414, + "learning_rate": 3.9303446651842444e-05, + "loss": 0.8625, "step": 3083 }, { - "epoch": 1.1261639583713712, - "grad_norm": 0.9817110896110535, - "learning_rate": 2.2887597183113503e-05, - "loss": 0.8824, + "epoch": 0.5350451075641915, + "grad_norm": 0.9460259079933167, + "learning_rate": 3.930166308215633e-05, + "loss": 0.8499, "step": 3084 }, { - "epoch": 1.1265291217819975, - "grad_norm": 0.961394190788269, - "learning_rate": 2.2873401573491033e-05, - "loss": 0.8662, + "epoch": 0.5352185981956974, + "grad_norm": 0.8538378477096558, + "learning_rate": 3.929987727248972e-05, + "loss": 0.7974, "step": 3085 }, { - "epoch": 1.1268942851926238, - "grad_norm": 0.9967735409736633, - "learning_rate": 2.2859204485618758e-05, - "loss": 0.8555, + "epoch": 0.5353920888272033, + "grad_norm": 0.8713475465774536, + "learning_rate": 3.929808922304987e-05, + "loss": 0.7029, "step": 3086 }, { - "epoch": 1.12725944860325, - "grad_norm": 1.3058576583862305, - "learning_rate": 2.2845005926800502e-05, - "loss": 0.8071, + "epoch": 0.5355655794587092, + "grad_norm": 0.6908689737319946, + "learning_rate": 3.929629893404428e-05, + "loss": 0.7379, "step": 3087 }, { - "epoch": 1.1276246120138762, - "grad_norm": 1.1800068616867065, - "learning_rate": 2.2830805904340867e-05, - "loss": 0.8878, + "epoch": 0.5357390700902152, + "grad_norm": 0.7806771397590637, + "learning_rate": 3.92945064056807e-05, + "loss": 0.6836, "step": 3088 }, { - "epoch": 1.1279897754245025, - "grad_norm": 0.9269964098930359, - "learning_rate": 2.281660442554518e-05, - "loss": 0.8514, + "epoch": 0.535912560721721, + "grad_norm": 0.8447021245956421, + "learning_rate": 3.929271163816718e-05, + "loss": 0.8564, "step": 3089 }, { - "epoch": 1.1283549388351286, - "grad_norm": 0.9969589710235596, - "learning_rate": 2.2802401497719545e-05, - "loss": 0.8368, + "epoch": 0.536086051353227, + "grad_norm": 1.0376663208007812, + "learning_rate": 3.929091463171199e-05, + "loss": 0.7271, "step": 3090 }, { - "epoch": 1.128720102245755, - "grad_norm": 1.1620794534683228, - "learning_rate": 2.2788197128170798e-05, - "loss": 0.8395, + "epoch": 0.5362595419847328, + "grad_norm": 0.7298492193222046, + "learning_rate": 3.9289115386523676e-05, + "loss": 0.8301, "step": 3091 }, { - "epoch": 1.1290852656563812, - "grad_norm": 1.0380107164382935, - "learning_rate": 2.27739913242065e-05, - "loss": 0.8351, + "epoch": 0.5364330326162388, + "grad_norm": 4.9445624351501465, + "learning_rate": 3.928731390281105e-05, + "loss": 0.7666, "step": 3092 }, { - "epoch": 1.1294504290670075, - "grad_norm": 0.9299238920211792, - "learning_rate": 2.2759784093134987e-05, - "loss": 0.8334, + "epoch": 0.5366065232477446, + "grad_norm": 0.837226152420044, + "learning_rate": 3.928551018078317e-05, + "loss": 0.8118, "step": 3093 }, { - "epoch": 1.1298155924776339, - "grad_norm": 1.5031709671020508, - "learning_rate": 2.2745575442265297e-05, - "loss": 0.8665, + "epoch": 0.5367800138792506, + "grad_norm": 1.0294944047927856, + "learning_rate": 3.928370422064936e-05, + "loss": 0.8091, "step": 3094 }, { - "epoch": 1.13018075588826, - "grad_norm": 1.1107112169265747, - "learning_rate": 2.273136537890722e-05, - "loss": 0.8513, + "epoch": 0.5369535045107564, + "grad_norm": 0.9657663702964783, + "learning_rate": 3.928189602261921e-05, + "loss": 0.9197, "step": 3095 }, { - "epoch": 1.1305459192988863, - "grad_norm": 1.1472328901290894, - "learning_rate": 2.271715391037126e-05, - "loss": 0.8406, + "epoch": 0.5371269951422624, + "grad_norm": 0.6981934905052185, + "learning_rate": 3.928008558690255e-05, + "loss": 0.8184, "step": 3096 }, { - "epoch": 1.1309110827095126, - "grad_norm": 1.1645950078964233, - "learning_rate": 2.2702941043968635e-05, - "loss": 0.8164, + "epoch": 0.5373004857737682, + "grad_norm": 1.1711726188659668, + "learning_rate": 3.927827291370951e-05, + "loss": 0.7671, "step": 3097 }, { - "epoch": 1.1312762461201387, - "grad_norm": 1.2465038299560547, - "learning_rate": 2.2688726787011315e-05, - "loss": 0.8366, + "epoch": 0.5374739764052742, + "grad_norm": 0.8442531228065491, + "learning_rate": 3.927645800325041e-05, + "loss": 0.7025, "step": 3098 }, { - "epoch": 1.131641409530765, - "grad_norm": 0.9235344529151917, - "learning_rate": 2.267451114681195e-05, - "loss": 0.8359, + "epoch": 0.53764746703678, + "grad_norm": 1.1082375049591064, + "learning_rate": 3.9274640855735914e-05, + "loss": 0.9019, "step": 3099 }, { - "epoch": 1.1320065729413913, - "grad_norm": 1.4504356384277344, - "learning_rate": 2.2660294130683923e-05, - "loss": 0.8239, + "epoch": 0.5378209576682859, + "grad_norm": 0.921712338924408, + "learning_rate": 3.927282147137688e-05, + "loss": 0.9023, "step": 3100 }, { - "epoch": 1.1323717363520176, - "grad_norm": 1.2051177024841309, - "learning_rate": 2.2646075745941315e-05, - "loss": 0.8169, + "epoch": 0.5379944482997918, + "grad_norm": 0.877833902835846, + "learning_rate": 3.927099985038446e-05, + "loss": 0.6403, "step": 3101 }, { - "epoch": 1.1327368997626437, - "grad_norm": 1.1447759866714478, - "learning_rate": 2.2631855999898914e-05, - "loss": 0.845, + "epoch": 0.5381679389312977, + "grad_norm": 1.180526852607727, + "learning_rate": 3.9269175992970055e-05, + "loss": 0.7074, "step": 3102 }, { - "epoch": 1.13310206317327, - "grad_norm": 1.5182143449783325, - "learning_rate": 2.261763489987222e-05, - "loss": 0.8118, + "epoch": 0.5383414295628036, + "grad_norm": 0.8574577569961548, + "learning_rate": 3.926734989934532e-05, + "loss": 0.7231, "step": 3103 }, { - "epoch": 1.1334672265838963, - "grad_norm": 1.1197258234024048, - "learning_rate": 2.26034124531774e-05, - "loss": 0.8566, + "epoch": 0.5385149201943095, + "grad_norm": 0.9093102812767029, + "learning_rate": 3.9265521569722176e-05, + "loss": 0.8015, "step": 3104 }, { - "epoch": 1.1338323899945226, - "grad_norm": 1.1590198278427124, - "learning_rate": 2.2589188667131346e-05, - "loss": 0.8206, + "epoch": 0.5386884108258154, + "grad_norm": 0.7713680267333984, + "learning_rate": 3.9263691004312804e-05, + "loss": 0.9043, "step": 3105 }, { - "epoch": 1.1341975534051487, - "grad_norm": 1.1853392124176025, - "learning_rate": 2.257496354905162e-05, - "loss": 0.783, + "epoch": 0.5388619014573213, + "grad_norm": 0.7736133933067322, + "learning_rate": 3.926185820332965e-05, + "loss": 0.6897, "step": 3106 }, { - "epoch": 1.134562716815775, - "grad_norm": 0.8430858850479126, - "learning_rate": 2.2560737106256472e-05, - "loss": 0.8641, + "epoch": 0.5390353920888272, + "grad_norm": 0.6753774285316467, + "learning_rate": 3.9260023166985407e-05, + "loss": 0.7485, "step": 3107 }, { - "epoch": 1.1349278802264013, - "grad_norm": 1.11115562915802, - "learning_rate": 2.254650934606484e-05, - "loss": 0.8589, + "epoch": 0.5392088827203331, + "grad_norm": 0.8383522033691406, + "learning_rate": 3.9258185895493026e-05, + "loss": 0.8083, "step": 3108 }, { - "epoch": 1.1352930436370277, - "grad_norm": 1.0517470836639404, - "learning_rate": 2.2532280275796333e-05, - "loss": 0.8384, + "epoch": 0.539382373351839, + "grad_norm": 0.7497417330741882, + "learning_rate": 3.925634638906574e-05, + "loss": 0.7527, "step": 3109 }, { - "epoch": 1.1356582070476537, - "grad_norm": 0.9531464576721191, - "learning_rate": 2.251804990277125e-05, - "loss": 0.8885, + "epoch": 0.5395558639833449, + "grad_norm": 2.6327080726623535, + "learning_rate": 3.925450464791701e-05, + "loss": 0.7197, "step": 3110 }, { - "epoch": 1.13602337045828, - "grad_norm": 1.3814489841461182, - "learning_rate": 2.250381823431052e-05, - "loss": 0.8288, + "epoch": 0.5397293546148508, + "grad_norm": 1.1912345886230469, + "learning_rate": 3.925266067226058e-05, + "loss": 0.6768, "step": 3111 }, { - "epoch": 1.1363885338689064, - "grad_norm": 1.116489291191101, - "learning_rate": 2.248958527773579e-05, - "loss": 0.8246, + "epoch": 0.5399028452463567, + "grad_norm": 0.8497080206871033, + "learning_rate": 3.925081446231045e-05, + "loss": 0.7153, "step": 3112 }, { - "epoch": 1.1367536972795327, - "grad_norm": 1.0089389085769653, - "learning_rate": 2.2475351040369327e-05, - "loss": 0.7951, + "epoch": 0.5400763358778626, + "grad_norm": 0.9387142062187195, + "learning_rate": 3.924896601828087e-05, + "loss": 0.7014, "step": 3113 }, { - "epoch": 1.1371188606901588, - "grad_norm": 1.1224013566970825, - "learning_rate": 2.2461115529534084e-05, - "loss": 0.8316, + "epoch": 0.5402498265093685, + "grad_norm": 0.840358316898346, + "learning_rate": 3.924711534038635e-05, + "loss": 0.6799, "step": 3114 }, { - "epoch": 1.137484024100785, - "grad_norm": 0.9848836064338684, - "learning_rate": 2.244687875255367e-05, - "loss": 0.8337, + "epoch": 0.5404233171408744, + "grad_norm": 1.0234789848327637, + "learning_rate": 3.924526242884167e-05, + "loss": 0.9209, "step": 3115 }, { - "epoch": 1.1378491875114114, - "grad_norm": 1.5092921257019043, - "learning_rate": 2.2432640716752316e-05, - "loss": 0.8131, + "epoch": 0.5405968077723803, + "grad_norm": 1.0805331468582153, + "learning_rate": 3.9243407283861866e-05, + "loss": 0.7581, "step": 3116 }, { - "epoch": 1.1382143509220377, - "grad_norm": 1.2929739952087402, - "learning_rate": 2.241840142945494e-05, - "loss": 0.86, + "epoch": 0.5407702984038862, + "grad_norm": 0.9171424508094788, + "learning_rate": 3.924154990566222e-05, + "loss": 0.7971, "step": 3117 }, { - "epoch": 1.1385795143326638, - "grad_norm": 1.3208165168762207, - "learning_rate": 2.2404160897987056e-05, - "loss": 0.8544, + "epoch": 0.5409437890353921, + "grad_norm": 0.7319105267524719, + "learning_rate": 3.923969029445828e-05, + "loss": 0.9146, "step": 3118 }, { - "epoch": 1.1389446777432901, - "grad_norm": 1.1140897274017334, - "learning_rate": 2.2389919129674872e-05, - "loss": 0.8853, + "epoch": 0.5411172796668979, + "grad_norm": 0.8976501822471619, + "learning_rate": 3.9237828450465866e-05, + "loss": 0.8425, "step": 3119 }, { - "epoch": 1.1393098411539164, - "grad_norm": 1.4441865682601929, - "learning_rate": 2.2375676131845196e-05, - "loss": 0.884, + "epoch": 0.5412907702984039, + "grad_norm": 1.3943482637405396, + "learning_rate": 3.923596437390105e-05, + "loss": 0.7008, "step": 3120 }, { - "epoch": 1.1396750045645425, - "grad_norm": 1.4260921478271484, - "learning_rate": 2.236143191182548e-05, - "loss": 0.8905, + "epoch": 0.5414642609299097, + "grad_norm": 0.8941110968589783, + "learning_rate": 3.9234098064980145e-05, + "loss": 0.8816, "step": 3121 }, { - "epoch": 1.1400401679751688, - "grad_norm": 1.2085423469543457, - "learning_rate": 2.2347186476943805e-05, - "loss": 0.8385, + "epoch": 0.5416377515614157, + "grad_norm": 0.9413111805915833, + "learning_rate": 3.923222952391975e-05, + "loss": 0.8164, "step": 3122 }, { - "epoch": 1.1404053313857951, - "grad_norm": 0.7249525785446167, - "learning_rate": 2.2332939834528875e-05, - "loss": 0.8805, + "epoch": 0.5418112421929215, + "grad_norm": 1.0656286478042603, + "learning_rate": 3.923035875093671e-05, + "loss": 0.8708, "step": 3123 }, { - "epoch": 1.1407704947964215, - "grad_norm": 1.3782665729522705, - "learning_rate": 2.2318691991910014e-05, - "loss": 0.8427, + "epoch": 0.5419847328244275, + "grad_norm": 1.0583869218826294, + "learning_rate": 3.9228485746248134e-05, + "loss": 0.7146, "step": 3124 }, { - "epoch": 1.1411356582070478, - "grad_norm": 1.1320949792861938, - "learning_rate": 2.2304442956417164e-05, - "loss": 0.7999, + "epoch": 0.5421582234559333, + "grad_norm": 1.754012107849121, + "learning_rate": 3.922661051007137e-05, + "loss": 0.7588, "step": 3125 }, { - "epoch": 1.1415008216176739, - "grad_norm": 1.4169334173202515, - "learning_rate": 2.229019273538089e-05, - "loss": 0.8727, + "epoch": 0.5423317140874393, + "grad_norm": 0.9628296494483948, + "learning_rate": 3.922473304262406e-05, + "loss": 0.7236, "step": 3126 }, { - "epoch": 1.1418659850283002, - "grad_norm": 1.0095173120498657, - "learning_rate": 2.227594133613235e-05, - "loss": 0.8486, + "epoch": 0.5425052047189451, + "grad_norm": 0.7915809154510498, + "learning_rate": 3.922285334412408e-05, + "loss": 0.7241, "step": 3127 }, { - "epoch": 1.1422311484389265, - "grad_norm": 1.0201703310012817, - "learning_rate": 2.2261688766003317e-05, - "loss": 0.8834, + "epoch": 0.5426786953504511, + "grad_norm": 0.8458516597747803, + "learning_rate": 3.922097141478957e-05, + "loss": 0.8752, "step": 3128 }, { - "epoch": 1.1425963118495526, - "grad_norm": 1.0847793817520142, - "learning_rate": 2.2247435032326178e-05, - "loss": 0.8766, + "epoch": 0.542852185981957, + "grad_norm": 0.991320788860321, + "learning_rate": 3.9219087254838925e-05, + "loss": 0.7847, "step": 3129 }, { - "epoch": 1.1429614752601789, - "grad_norm": 1.1372663974761963, - "learning_rate": 2.2233180142433894e-05, - "loss": 0.8383, + "epoch": 0.5430256766134629, + "grad_norm": 0.8510996699333191, + "learning_rate": 3.921720086449082e-05, + "loss": 0.7979, "step": 3130 }, { - "epoch": 1.1433266386708052, - "grad_norm": 1.3192739486694336, - "learning_rate": 2.2218924103660035e-05, - "loss": 0.8479, + "epoch": 0.5431991672449688, + "grad_norm": 1.2209041118621826, + "learning_rate": 3.921531224396415e-05, + "loss": 0.7542, "step": 3131 }, { - "epoch": 1.1436918020814315, - "grad_norm": 1.5340696573257446, - "learning_rate": 2.2204666923338772e-05, - "loss": 0.8584, + "epoch": 0.5433726578764747, + "grad_norm": 0.9169350862503052, + "learning_rate": 3.921342139347811e-05, + "loss": 0.6978, "step": 3132 }, { - "epoch": 1.1440569654920578, - "grad_norm": 2.285275936126709, - "learning_rate": 2.219040860880484e-05, - "loss": 0.8541, + "epoch": 0.5435461485079806, + "grad_norm": 0.895456850528717, + "learning_rate": 3.921152831325213e-05, + "loss": 0.7085, "step": 3133 }, { - "epoch": 1.144422128902684, - "grad_norm": 1.139524221420288, - "learning_rate": 2.217614916739358e-05, - "loss": 0.8563, + "epoch": 0.5437196391394865, + "grad_norm": 0.89678955078125, + "learning_rate": 3.92096330035059e-05, + "loss": 0.782, "step": 3134 }, { - "epoch": 1.1447872923133102, - "grad_norm": 1.4410278797149658, - "learning_rate": 2.2161888606440885e-05, - "loss": 0.8326, + "epoch": 0.5438931297709924, + "grad_norm": 0.9710404872894287, + "learning_rate": 3.920773546445938e-05, + "loss": 0.8528, "step": 3135 }, { - "epoch": 1.1451524557239365, - "grad_norm": 0.8954200744628906, - "learning_rate": 2.2147626933283265e-05, - "loss": 0.8545, + "epoch": 0.5440666204024983, + "grad_norm": 0.8696531057357788, + "learning_rate": 3.9205835696332775e-05, + "loss": 0.7439, "step": 3136 }, { - "epoch": 1.1455176191345626, - "grad_norm": 1.1759560108184814, - "learning_rate": 2.213336415525776e-05, - "loss": 0.8441, + "epoch": 0.5442401110340042, + "grad_norm": 0.9369046688079834, + "learning_rate": 3.9203933699346555e-05, + "loss": 0.7051, "step": 3137 }, { - "epoch": 1.145882782545189, - "grad_norm": 1.2794287204742432, - "learning_rate": 2.2119100279702005e-05, - "loss": 0.8171, + "epoch": 0.5444136016655101, + "grad_norm": 0.7649116516113281, + "learning_rate": 3.920202947372146e-05, + "loss": 0.7578, "step": 3138 }, { - "epoch": 1.1462479459558153, - "grad_norm": 1.6668262481689453, - "learning_rate": 2.2104835313954193e-05, - "loss": 0.8005, + "epoch": 0.544587092297016, + "grad_norm": 1.2905229330062866, + "learning_rate": 3.9200123019678467e-05, + "loss": 0.6863, "step": 3139 }, { - "epoch": 1.1466131093664416, - "grad_norm": 2.0467426776885986, - "learning_rate": 2.209056926535307e-05, - "loss": 0.8263, + "epoch": 0.5447605829285218, + "grad_norm": 2.2870028018951416, + "learning_rate": 3.919821433743882e-05, + "loss": 0.9287, "step": 3140 }, { - "epoch": 1.1469782727770677, - "grad_norm": 1.1897947788238525, - "learning_rate": 2.2076302141237953e-05, - "loss": 0.8591, + "epoch": 0.5449340735600278, + "grad_norm": 1.3974392414093018, + "learning_rate": 3.9196303427224036e-05, + "loss": 0.8606, "step": 3141 }, { - "epoch": 1.147343436187694, - "grad_norm": 1.424034595489502, - "learning_rate": 2.2062033948948697e-05, - "loss": 0.8042, + "epoch": 0.5451075641915336, + "grad_norm": 0.811080276966095, + "learning_rate": 3.919439028925587e-05, + "loss": 0.7878, "step": 3142 }, { - "epoch": 1.1477085995983203, - "grad_norm": 1.0790678262710571, - "learning_rate": 2.2047764695825725e-05, - "loss": 0.8397, + "epoch": 0.5452810548230396, + "grad_norm": 1.072441816329956, + "learning_rate": 3.919247492375634e-05, + "loss": 0.7793, "step": 3143 }, { - "epoch": 1.1480737630089466, - "grad_norm": 1.1731537580490112, - "learning_rate": 2.2033494389209988e-05, - "loss": 0.8638, + "epoch": 0.5454545454545454, + "grad_norm": 0.8549439907073975, + "learning_rate": 3.919055733094774e-05, + "loss": 0.7649, "step": 3144 }, { - "epoch": 1.1484389264195727, - "grad_norm": 0.7549064755439758, - "learning_rate": 2.201922303644298e-05, - "loss": 0.8639, + "epoch": 0.5456280360860514, + "grad_norm": 1.4729071855545044, + "learning_rate": 3.918863751105259e-05, + "loss": 0.7036, "step": 3145 }, { - "epoch": 1.148804089830199, - "grad_norm": 0.9344198107719421, - "learning_rate": 2.200495064486675e-05, - "loss": 0.8397, + "epoch": 0.5458015267175572, + "grad_norm": 0.8632094264030457, + "learning_rate": 3.91867154642937e-05, + "loss": 0.7896, "step": 3146 }, { - "epoch": 1.1491692532408253, - "grad_norm": 1.5928245782852173, - "learning_rate": 2.1990677221823865e-05, - "loss": 0.8088, + "epoch": 0.5459750173490632, + "grad_norm": 1.1599199771881104, + "learning_rate": 3.918479119089413e-05, + "loss": 0.7046, "step": 3147 }, { - "epoch": 1.1495344166514516, - "grad_norm": 1.3351231813430786, - "learning_rate": 2.1976402774657432e-05, - "loss": 0.8198, + "epoch": 0.546148507980569, + "grad_norm": 0.960551917552948, + "learning_rate": 3.918286469107718e-05, + "loss": 0.7758, "step": 3148 }, { - "epoch": 1.1498995800620777, - "grad_norm": 1.6619000434875488, - "learning_rate": 2.196212731071108e-05, - "loss": 0.8274, + "epoch": 0.546321998612075, + "grad_norm": 0.9028762578964233, + "learning_rate": 3.918093596506643e-05, + "loss": 0.7524, "step": 3149 }, { - "epoch": 1.150264743472704, - "grad_norm": 1.406879186630249, - "learning_rate": 2.194785083732896e-05, - "loss": 0.8355, + "epoch": 0.5464954892435808, + "grad_norm": 0.8706607818603516, + "learning_rate": 3.917900501308572e-05, + "loss": 0.7676, "step": 3150 }, { - "epoch": 1.1506299068833303, - "grad_norm": 1.5239226818084717, - "learning_rate": 2.193357336185575e-05, - "loss": 0.8442, + "epoch": 0.5466689798750868, + "grad_norm": 0.7877565026283264, + "learning_rate": 3.917707183535913e-05, + "loss": 0.8567, "step": 3151 }, { - "epoch": 1.1509950702939564, - "grad_norm": 1.1352418661117554, - "learning_rate": 2.191929489163663e-05, - "loss": 0.8003, + "epoch": 0.5468424705065926, + "grad_norm": 0.7896848320960999, + "learning_rate": 3.9175136432111e-05, + "loss": 0.8313, "step": 3152 }, { - "epoch": 1.1513602337045827, - "grad_norm": 0.8341194987297058, - "learning_rate": 2.1905015434017313e-05, - "loss": 0.8201, + "epoch": 0.5470159611380986, + "grad_norm": 0.8352651596069336, + "learning_rate": 3.917319880356594e-05, + "loss": 0.8298, "step": 3153 }, { - "epoch": 1.151725397115209, - "grad_norm": 1.325024127960205, - "learning_rate": 2.1890734996343985e-05, - "loss": 0.7971, + "epoch": 0.5471894517696044, + "grad_norm": 1.1247204542160034, + "learning_rate": 3.9171258949948827e-05, + "loss": 0.719, "step": 3154 }, { - "epoch": 1.1520905605258354, - "grad_norm": 1.0179567337036133, - "learning_rate": 2.1876453585963384e-05, - "loss": 0.8312, + "epoch": 0.5473629424011104, + "grad_norm": 1.0792063474655151, + "learning_rate": 3.916931687148477e-05, + "loss": 0.7246, "step": 3155 }, { - "epoch": 1.1524557239364617, - "grad_norm": 0.9895704984664917, - "learning_rate": 2.1862171210222708e-05, - "loss": 0.8102, + "epoch": 0.5475364330326162, + "grad_norm": 0.8742115497589111, + "learning_rate": 3.916737256839916e-05, + "loss": 0.7749, "step": 3156 }, { - "epoch": 1.1528208873470878, - "grad_norm": 1.1374938488006592, - "learning_rate": 2.1847887876469666e-05, - "loss": 0.8765, + "epoch": 0.5477099236641222, + "grad_norm": 0.9016923308372498, + "learning_rate": 3.916542604091762e-05, + "loss": 0.7235, "step": 3157 }, { - "epoch": 1.153186050757714, - "grad_norm": 1.5420539379119873, - "learning_rate": 2.1833603592052464e-05, - "loss": 0.813, + "epoch": 0.547883414295628, + "grad_norm": 1.2905267477035522, + "learning_rate": 3.916347728926606e-05, + "loss": 0.9033, "step": 3158 }, { - "epoch": 1.1535512141683404, - "grad_norm": 0.9463297724723816, - "learning_rate": 2.181931836431979e-05, - "loss": 0.8307, + "epoch": 0.5480569049271339, + "grad_norm": 1.989302635192871, + "learning_rate": 3.916152631367063e-05, + "loss": 0.686, "step": 3159 }, { - "epoch": 1.1539163775789665, - "grad_norm": 1.251704454421997, - "learning_rate": 2.1805032200620824e-05, - "loss": 0.8859, + "epoch": 0.5482303955586398, + "grad_norm": 1.1719547510147095, + "learning_rate": 3.915957311435774e-05, + "loss": 0.7344, "step": 3160 }, { - "epoch": 1.1542815409895928, - "grad_norm": 2.0637423992156982, - "learning_rate": 2.1790745108305222e-05, - "loss": 0.8585, + "epoch": 0.5484038861901457, + "grad_norm": 1.1103343963623047, + "learning_rate": 3.915761769155407e-05, + "loss": 0.6703, "step": 3161 }, { - "epoch": 1.154646704400219, - "grad_norm": 0.9310083985328674, - "learning_rate": 2.1776457094723115e-05, - "loss": 0.8474, + "epoch": 0.5485773768216516, + "grad_norm": 1.3326486349105835, + "learning_rate": 3.915566004548654e-05, + "loss": 0.6711, "step": 3162 }, { - "epoch": 1.1550118678108454, - "grad_norm": 1.2046207189559937, - "learning_rate": 2.176216816722513e-05, - "loss": 0.849, + "epoch": 0.5487508674531575, + "grad_norm": 1.0211853981018066, + "learning_rate": 3.9153700176382344e-05, + "loss": 0.8542, "step": 3163 }, { - "epoch": 1.1553770312214717, - "grad_norm": 1.171525001525879, - "learning_rate": 2.1747878333162326e-05, - "loss": 0.8392, + "epoch": 0.5489243580846634, + "grad_norm": 0.8541906476020813, + "learning_rate": 3.915173808446892e-05, + "loss": 0.8137, "step": 3164 }, { - "epoch": 1.1557421946320978, - "grad_norm": 1.1761959791183472, - "learning_rate": 2.173358759988626e-05, - "loss": 0.8693, + "epoch": 0.5490978487161693, + "grad_norm": 0.9299607872962952, + "learning_rate": 3.9149773769973985e-05, + "loss": 0.6367, "step": 3165 }, { - "epoch": 1.1561073580427241, - "grad_norm": 1.0785889625549316, - "learning_rate": 2.1719295974748934e-05, - "loss": 0.8169, + "epoch": 0.5492713393476752, + "grad_norm": 0.8646824955940247, + "learning_rate": 3.914780723312548e-05, + "loss": 0.6877, "step": 3166 }, { - "epoch": 1.1564725214533504, - "grad_norm": 0.9280647039413452, - "learning_rate": 2.1705003465102818e-05, - "loss": 0.8121, + "epoch": 0.5494448299791811, + "grad_norm": 0.9782006740570068, + "learning_rate": 3.9145838474151633e-05, + "loss": 0.6882, "step": 3167 }, { - "epoch": 1.1568376848639765, - "grad_norm": 1.0930193662643433, - "learning_rate": 2.1690710078300847e-05, - "loss": 0.8499, + "epoch": 0.549618320610687, + "grad_norm": 0.857393741607666, + "learning_rate": 3.914386749328093e-05, + "loss": 0.8022, "step": 3168 }, { - "epoch": 1.1572028482746028, - "grad_norm": 1.0562762022018433, - "learning_rate": 2.167641582169637e-05, - "loss": 0.8074, + "epoch": 0.5497918112421929, + "grad_norm": 0.7993167638778687, + "learning_rate": 3.914189429074209e-05, + "loss": 0.8342, "step": 3169 }, { - "epoch": 1.1575680116852292, - "grad_norm": 1.3404974937438965, - "learning_rate": 2.166212070264324e-05, - "loss": 0.8103, + "epoch": 0.5499653018736989, + "grad_norm": 0.8862253427505493, + "learning_rate": 3.913991886676412e-05, + "loss": 0.8679, "step": 3170 }, { - "epoch": 1.1579331750958555, - "grad_norm": 1.1025499105453491, - "learning_rate": 2.1647824728495696e-05, - "loss": 0.8566, + "epoch": 0.5501387925052047, + "grad_norm": 0.7517207860946655, + "learning_rate": 3.913794122157626e-05, + "loss": 0.7673, "step": 3171 }, { - "epoch": 1.1582983385064816, - "grad_norm": 0.8815589547157288, - "learning_rate": 2.1633527906608457e-05, - "loss": 0.8497, + "epoch": 0.5503122831367107, + "grad_norm": 0.8645022511482239, + "learning_rate": 3.9135961355408024e-05, + "loss": 0.7332, "step": 3172 }, { - "epoch": 1.1586635019171079, - "grad_norm": 0.9772539138793945, - "learning_rate": 2.1619230244336652e-05, - "loss": 0.8338, + "epoch": 0.5504857737682165, + "grad_norm": 1.2676156759262085, + "learning_rate": 3.913397926848917e-05, + "loss": 0.698, "step": 3173 }, { - "epoch": 1.1590286653277342, - "grad_norm": 1.2946699857711792, - "learning_rate": 2.1604931749035865e-05, - "loss": 0.8301, + "epoch": 0.5506592643997225, + "grad_norm": 0.8478946089744568, + "learning_rate": 3.913199496104972e-05, + "loss": 0.7598, "step": 3174 }, { - "epoch": 1.1593938287383605, - "grad_norm": 1.3454798460006714, - "learning_rate": 2.1590632428062097e-05, - "loss": 0.869, + "epoch": 0.5508327550312283, + "grad_norm": 0.7236790060997009, + "learning_rate": 3.9130008433319974e-05, + "loss": 0.9236, "step": 3175 }, { - "epoch": 1.1597589921489866, - "grad_norm": 1.1978145837783813, - "learning_rate": 2.1576332288771776e-05, - "loss": 0.8103, + "epoch": 0.5510062456627343, + "grad_norm": 0.7967181205749512, + "learning_rate": 3.912801968553045e-05, + "loss": 0.8218, "step": 3176 }, { - "epoch": 1.160124155559613, - "grad_norm": 1.2905806303024292, - "learning_rate": 2.1562031338521745e-05, - "loss": 0.8306, + "epoch": 0.5511797362942401, + "grad_norm": 0.8625802397727966, + "learning_rate": 3.912602871791196e-05, + "loss": 0.7944, "step": 3177 }, { - "epoch": 1.1604893189702392, - "grad_norm": 1.2784709930419922, - "learning_rate": 2.1547729584669262e-05, - "loss": 0.8615, + "epoch": 0.5513532269257461, + "grad_norm": 0.9473859071731567, + "learning_rate": 3.9124035530695546e-05, + "loss": 0.7922, "step": 3178 }, { - "epoch": 1.1608544823808655, - "grad_norm": 1.3251971006393433, - "learning_rate": 2.1533427034572022e-05, - "loss": 0.8571, + "epoch": 0.5515267175572519, + "grad_norm": 1.866816520690918, + "learning_rate": 3.912204012411253e-05, + "loss": 0.7207, "step": 3179 }, { - "epoch": 1.1612196457914916, - "grad_norm": 1.3579729795455933, - "learning_rate": 2.1519123695588106e-05, - "loss": 0.8511, + "epoch": 0.5517002081887578, + "grad_norm": 0.8984963297843933, + "learning_rate": 3.912004249839447e-05, + "loss": 0.7549, "step": 3180 }, { - "epoch": 1.161584809202118, - "grad_norm": 1.1262983083724976, - "learning_rate": 2.1504819575076e-05, - "loss": 0.8179, + "epoch": 0.5518736988202637, + "grad_norm": 1.2476195096969604, + "learning_rate": 3.9118042653773194e-05, + "loss": 0.8555, "step": 3181 }, { - "epoch": 1.1619499726127442, - "grad_norm": 1.2025413513183594, - "learning_rate": 2.1490514680394616e-05, - "loss": 0.8754, + "epoch": 0.5520471894517696, + "grad_norm": 1.1945667266845703, + "learning_rate": 3.91160405904808e-05, + "loss": 0.7761, "step": 3182 }, { - "epoch": 1.1623151360233706, - "grad_norm": 1.0984830856323242, - "learning_rate": 2.147620901890324e-05, - "loss": 0.8375, + "epoch": 0.5522206800832755, + "grad_norm": 0.8139678239822388, + "learning_rate": 3.9114036308749625e-05, + "loss": 0.7563, "step": 3183 }, { - "epoch": 1.1626802994339966, - "grad_norm": 1.3294727802276611, - "learning_rate": 2.146190259796155e-05, - "loss": 0.8485, + "epoch": 0.5523941707147814, + "grad_norm": 1.421354055404663, + "learning_rate": 3.911202980881226e-05, + "loss": 0.7498, "step": 3184 }, { - "epoch": 1.163045462844623, - "grad_norm": 1.4669780731201172, - "learning_rate": 2.1447595424929647e-05, - "loss": 0.8685, + "epoch": 0.5525676613462873, + "grad_norm": 1.0158376693725586, + "learning_rate": 3.911002109090156e-05, + "loss": 0.8347, "step": 3185 }, { - "epoch": 1.1634106262552493, - "grad_norm": 1.1618907451629639, - "learning_rate": 2.143328750716798e-05, - "loss": 0.8871, + "epoch": 0.5527411519777932, + "grad_norm": 0.753299355506897, + "learning_rate": 3.910801015525064e-05, + "loss": 0.7883, "step": 3186 }, { - "epoch": 1.1637757896658756, - "grad_norm": 1.0696464776992798, - "learning_rate": 2.141897885203741e-05, - "loss": 0.8019, + "epoch": 0.5529146426092991, + "grad_norm": 0.9519997835159302, + "learning_rate": 3.9105997002092896e-05, + "loss": 0.8267, "step": 3187 }, { - "epoch": 1.1641409530765017, - "grad_norm": 1.2928698062896729, - "learning_rate": 2.140466946689915e-05, - "loss": 0.8713, + "epoch": 0.553088133240805, + "grad_norm": 0.8156053423881531, + "learning_rate": 3.910398163166192e-05, + "loss": 0.7888, "step": 3188 }, { - "epoch": 1.164506116487128, - "grad_norm": 0.8214814066886902, - "learning_rate": 2.1390359359114826e-05, - "loss": 0.8241, + "epoch": 0.5532616238723109, + "grad_norm": 1.200778603553772, + "learning_rate": 3.910196404419163e-05, + "loss": 0.718, "step": 3189 }, { - "epoch": 1.1648712798977543, - "grad_norm": 1.1614676713943481, - "learning_rate": 2.13760485360464e-05, - "loss": 0.8658, + "epoch": 0.5534351145038168, + "grad_norm": 1.6525177955627441, + "learning_rate": 3.909994423991614e-05, + "loss": 0.7012, "step": 3190 }, { - "epoch": 1.1652364433083804, - "grad_norm": 1.3665642738342285, - "learning_rate": 2.136173700505622e-05, - "loss": 0.8621, + "epoch": 0.5536086051353227, + "grad_norm": 0.9430911540985107, + "learning_rate": 3.909792221906987e-05, + "loss": 0.7166, "step": 3191 }, { - "epoch": 1.1656016067190067, - "grad_norm": 1.0744699239730835, - "learning_rate": 2.134742477350699e-05, - "loss": 0.8494, + "epoch": 0.5537820957668286, + "grad_norm": 0.7951689958572388, + "learning_rate": 3.909589798188747e-05, + "loss": 0.7712, "step": 3192 }, { - "epoch": 1.165966770129633, - "grad_norm": 0.9296935796737671, - "learning_rate": 2.133311184876179e-05, - "loss": 0.8458, + "epoch": 0.5539555863983345, + "grad_norm": 1.16310453414917, + "learning_rate": 3.909387152860386e-05, + "loss": 0.6731, "step": 3193 }, { - "epoch": 1.1663319335402593, - "grad_norm": 1.6463240385055542, - "learning_rate": 2.1318798238184036e-05, - "loss": 0.8386, + "epoch": 0.5541290770298404, + "grad_norm": 1.0941250324249268, + "learning_rate": 3.909184285945421e-05, + "loss": 0.7505, "step": 3194 }, { - "epoch": 1.1666970969508856, - "grad_norm": 0.8908168077468872, - "learning_rate": 2.1304483949137503e-05, - "loss": 0.8882, + "epoch": 0.5543025676613463, + "grad_norm": 1.19204580783844, + "learning_rate": 3.908981197467396e-05, + "loss": 0.7733, "step": 3195 }, { - "epoch": 1.1670622603615117, - "grad_norm": 0.9840510487556458, - "learning_rate": 2.1290168988986332e-05, - "loss": 0.8259, + "epoch": 0.5544760582928522, + "grad_norm": 0.7835580110549927, + "learning_rate": 3.908777887449877e-05, + "loss": 0.9395, "step": 3196 }, { - "epoch": 1.167427423772138, - "grad_norm": 1.2271904945373535, - "learning_rate": 2.127585336509498e-05, - "loss": 0.8105, + "epoch": 0.5546495489243581, + "grad_norm": 1.0850136280059814, + "learning_rate": 3.908574355916461e-05, + "loss": 0.8774, "step": 3197 }, { - "epoch": 1.1677925871827644, - "grad_norm": 1.8669118881225586, - "learning_rate": 2.1261537084828274e-05, - "loss": 0.8094, + "epoch": 0.554823039555864, + "grad_norm": 0.9652018547058105, + "learning_rate": 3.9083706028907665e-05, + "loss": 0.7885, "step": 3198 }, { - "epoch": 1.1681577505933904, - "grad_norm": 0.9081668257713318, - "learning_rate": 2.1247220155551357e-05, - "loss": 0.8618, + "epoch": 0.5549965301873698, + "grad_norm": 0.9598507881164551, + "learning_rate": 3.90816662839644e-05, + "loss": 0.8313, "step": 3199 }, { - "epoch": 1.1685229140040168, - "grad_norm": 1.6584268808364868, - "learning_rate": 2.1232902584629716e-05, - "loss": 0.8655, + "epoch": 0.5551700208188758, + "grad_norm": 0.9954084157943726, + "learning_rate": 3.9079624324571536e-05, + "loss": 0.749, "step": 3200 }, { - "epoch": 1.168888077414643, - "grad_norm": 1.3628352880477905, - "learning_rate": 2.121858437942917e-05, - "loss": 0.8411, + "epoch": 0.5553435114503816, + "grad_norm": 1.0781548023223877, + "learning_rate": 3.907758015096603e-05, + "loss": 0.835, "step": 3201 }, { - "epoch": 1.1692532408252694, - "grad_norm": 0.9034229516983032, - "learning_rate": 2.1204265547315862e-05, - "loss": 0.8287, + "epoch": 0.5555170020818876, + "grad_norm": 0.7844817042350769, + "learning_rate": 3.9075533763385116e-05, + "loss": 0.948, "step": 3202 }, { - "epoch": 1.1696184042358955, - "grad_norm": 1.2040073871612549, - "learning_rate": 2.1189946095656255e-05, - "loss": 0.808, + "epoch": 0.5556904927133934, + "grad_norm": 0.883001983165741, + "learning_rate": 3.907348516206629e-05, + "loss": 0.7605, "step": 3203 }, { - "epoch": 1.1699835676465218, - "grad_norm": 1.0805410146713257, - "learning_rate": 2.117562603181713e-05, - "loss": 0.8607, + "epoch": 0.5558639833448994, + "grad_norm": 2.294121742248535, + "learning_rate": 3.9071434347247275e-05, + "loss": 0.7903, "step": 3204 }, { - "epoch": 1.170348731057148, - "grad_norm": 0.8349794149398804, - "learning_rate": 2.116130536316558e-05, - "loss": 0.866, + "epoch": 0.5560374739764052, + "grad_norm": 0.8835831880569458, + "learning_rate": 3.906938131916609e-05, + "loss": 0.7717, "step": 3205 }, { - "epoch": 1.1707138944677744, - "grad_norm": 1.4036293029785156, - "learning_rate": 2.114698409706903e-05, - "loss": 0.7976, + "epoch": 0.5562109646079112, + "grad_norm": 1.0615869760513306, + "learning_rate": 3.906732607806098e-05, + "loss": 0.7263, "step": 3206 }, { - "epoch": 1.1710790578784005, - "grad_norm": 0.9691435694694519, - "learning_rate": 2.1132662240895182e-05, - "loss": 0.8308, + "epoch": 0.556384455239417, + "grad_norm": 0.7271368503570557, + "learning_rate": 3.906526862417046e-05, + "loss": 0.7981, "step": 3207 }, { - "epoch": 1.1714442212890268, - "grad_norm": 0.9851435422897339, - "learning_rate": 2.111833980201207e-05, - "loss": 0.8101, + "epoch": 0.556557945870923, + "grad_norm": 0.9867111444473267, + "learning_rate": 3.906320895773329e-05, + "loss": 0.8354, "step": 3208 }, { - "epoch": 1.1718093846996531, - "grad_norm": 0.940098762512207, - "learning_rate": 2.1104016787787994e-05, - "loss": 0.8235, + "epoch": 0.5567314365024288, + "grad_norm": 1.376067876815796, + "learning_rate": 3.9061147078988526e-05, + "loss": 0.7949, "step": 3209 }, { - "epoch": 1.1721745481102794, - "grad_norm": 1.0418394804000854, - "learning_rate": 2.108969320559159e-05, - "loss": 0.8657, + "epoch": 0.5569049271339348, + "grad_norm": 1.1133368015289307, + "learning_rate": 3.905908298817543e-05, + "loss": 0.8435, "step": 3210 }, { - "epoch": 1.1725397115209055, - "grad_norm": 1.1476168632507324, - "learning_rate": 2.107536906279176e-05, - "loss": 0.8583, + "epoch": 0.5570784177654406, + "grad_norm": 0.934894859790802, + "learning_rate": 3.905701668553353e-05, + "loss": 0.8455, "step": 3211 }, { - "epoch": 1.1729048749315318, - "grad_norm": 1.219645380973816, - "learning_rate": 2.106104436675769e-05, - "loss": 0.8127, + "epoch": 0.5572519083969466, + "grad_norm": 0.994011402130127, + "learning_rate": 3.905494817130265e-05, + "loss": 0.8213, "step": 3212 }, { - "epoch": 1.1732700383421582, - "grad_norm": 0.8512141108512878, - "learning_rate": 2.1046719124858882e-05, - "loss": 0.8, + "epoch": 0.5574253990284525, + "grad_norm": 1.2984954118728638, + "learning_rate": 3.905287744572283e-05, + "loss": 0.6871, "step": 3213 }, { - "epoch": 1.1736352017527845, - "grad_norm": 0.8096923232078552, - "learning_rate": 2.1032393344465077e-05, - "loss": 0.8306, + "epoch": 0.5575988896599584, + "grad_norm": 1.1686304807662964, + "learning_rate": 3.9050804509034383e-05, + "loss": 0.6592, "step": 3214 }, { - "epoch": 1.1740003651634106, - "grad_norm": 1.2277377843856812, - "learning_rate": 2.1018067032946327e-05, - "loss": 0.8657, + "epoch": 0.5577723802914643, + "grad_norm": 0.9199947118759155, + "learning_rate": 3.904872936147787e-05, + "loss": 0.7122, "step": 3215 }, { - "epoch": 1.1743655285740369, - "grad_norm": 1.3578410148620605, - "learning_rate": 2.1003740197672946e-05, - "loss": 0.8127, + "epoch": 0.5579458709229702, + "grad_norm": 1.0472463369369507, + "learning_rate": 3.904665200329411e-05, + "loss": 0.7388, "step": 3216 }, { - "epoch": 1.1747306919846632, - "grad_norm": 1.165096640586853, - "learning_rate": 2.0989412846015504e-05, - "loss": 0.8353, + "epoch": 0.5581193615544761, + "grad_norm": 0.8351315259933472, + "learning_rate": 3.904457243472421e-05, + "loss": 0.7856, "step": 3217 }, { - "epoch": 1.1750958553952895, - "grad_norm": 1.313920259475708, - "learning_rate": 2.0975084985344857e-05, - "loss": 0.8928, + "epoch": 0.5582928521859819, + "grad_norm": 0.8953219056129456, + "learning_rate": 3.904249065600948e-05, + "loss": 0.804, "step": 3218 }, { - "epoch": 1.1754610188059156, - "grad_norm": 0.840957522392273, - "learning_rate": 2.0960756623032114e-05, - "loss": 0.8861, + "epoch": 0.5584663428174879, + "grad_norm": 1.0878286361694336, + "learning_rate": 3.904040666739151e-05, + "loss": 0.844, "step": 3219 }, { - "epoch": 1.175826182216542, - "grad_norm": 1.101118803024292, - "learning_rate": 2.0946427766448642e-05, - "loss": 0.86, + "epoch": 0.5586398334489937, + "grad_norm": 1.3281797170639038, + "learning_rate": 3.903832046911218e-05, + "loss": 0.8413, "step": 3220 }, { - "epoch": 1.1761913456271682, - "grad_norm": 1.2549452781677246, - "learning_rate": 2.093209842296606e-05, - "loss": 0.8762, + "epoch": 0.5588133240804997, + "grad_norm": 0.9216591715812683, + "learning_rate": 3.903623206141356e-05, + "loss": 0.7031, "step": 3221 }, { - "epoch": 1.1765565090377943, - "grad_norm": 1.250970482826233, - "learning_rate": 2.0917768599956236e-05, - "loss": 0.8394, + "epoch": 0.5589868147120055, + "grad_norm": 0.9696487188339233, + "learning_rate": 3.9034141444538034e-05, + "loss": 0.7766, "step": 3222 }, { - "epoch": 1.1769216724484206, - "grad_norm": 1.125745415687561, - "learning_rate": 2.090343830479131e-05, - "loss": 0.8361, + "epoch": 0.5591603053435115, + "grad_norm": 1.2620514631271362, + "learning_rate": 3.903204861872821e-05, + "loss": 0.8892, "step": 3223 }, { - "epoch": 1.177286835859047, - "grad_norm": 1.5244765281677246, - "learning_rate": 2.0889107544843615e-05, - "loss": 0.8429, + "epoch": 0.5593337959750173, + "grad_norm": 1.1057207584381104, + "learning_rate": 3.902995358422697e-05, + "loss": 0.7588, "step": 3224 }, { - "epoch": 1.1776519992696732, - "grad_norm": 1.1048673391342163, - "learning_rate": 2.0874776327485777e-05, - "loss": 0.8462, + "epoch": 0.5595072866065233, + "grad_norm": 2.529022216796875, + "learning_rate": 3.902785634127744e-05, + "loss": 0.8855, "step": 3225 }, { - "epoch": 1.1780171626802995, - "grad_norm": 1.2062811851501465, - "learning_rate": 2.0860444660090612e-05, - "loss": 0.8522, + "epoch": 0.5596807772380291, + "grad_norm": 0.8438230752944946, + "learning_rate": 3.902575689012301e-05, + "loss": 0.8748, "step": 3226 }, { - "epoch": 1.1783823260909256, - "grad_norm": 1.050065279006958, - "learning_rate": 2.0846112550031198e-05, - "loss": 0.8611, + "epoch": 0.5598542678695351, + "grad_norm": 0.7619295716285706, + "learning_rate": 3.9023655231007325e-05, + "loss": 0.7424, "step": 3227 }, { - "epoch": 1.178747489501552, - "grad_norm": 1.2127424478530884, - "learning_rate": 2.0831780004680834e-05, - "loss": 0.827, + "epoch": 0.5600277585010409, + "grad_norm": 1.152724266052246, + "learning_rate": 3.9021551364174286e-05, + "loss": 0.7209, "step": 3228 }, { - "epoch": 1.1791126529121783, - "grad_norm": 0.8911066651344299, - "learning_rate": 2.081744703141303e-05, - "loss": 0.83, + "epoch": 0.5602012491325469, + "grad_norm": 1.3193612098693848, + "learning_rate": 3.901944528986804e-05, + "loss": 0.7671, "step": 3229 }, { - "epoch": 1.1794778163228044, - "grad_norm": 1.2688570022583008, - "learning_rate": 2.0803113637601543e-05, - "loss": 0.8269, + "epoch": 0.5603747397640527, + "grad_norm": 1.0640499591827393, + "learning_rate": 3.901733700833301e-05, + "loss": 0.8875, "step": 3230 }, { - "epoch": 1.1798429797334307, - "grad_norm": 0.8879021406173706, - "learning_rate": 2.07887798306203e-05, - "loss": 0.8447, + "epoch": 0.5605482303955587, + "grad_norm": 0.8309997916221619, + "learning_rate": 3.9015226519813864e-05, + "loss": 0.8796, "step": 3231 }, { - "epoch": 1.180208143144057, - "grad_norm": 1.0300418138504028, - "learning_rate": 2.0774445617843493e-05, - "loss": 0.8264, + "epoch": 0.5607217210270645, + "grad_norm": 1.1093846559524536, + "learning_rate": 3.9013113824555515e-05, + "loss": 0.967, "step": 3232 }, { - "epoch": 1.1805733065546833, - "grad_norm": 1.470511555671692, - "learning_rate": 2.076011100664549e-05, - "loss": 0.8272, + "epoch": 0.5608952116585705, + "grad_norm": 1.1995917558670044, + "learning_rate": 3.901099892280316e-05, + "loss": 0.8147, "step": 3233 }, { - "epoch": 1.1809384699653096, - "grad_norm": 0.9907848834991455, - "learning_rate": 2.0745776004400876e-05, - "loss": 0.8446, + "epoch": 0.5610687022900763, + "grad_norm": 3.0650577545166016, + "learning_rate": 3.9008881814802225e-05, + "loss": 0.772, "step": 3234 }, { - "epoch": 1.1813036333759357, - "grad_norm": 1.087559461593628, - "learning_rate": 2.0731440618484436e-05, - "loss": 0.8967, + "epoch": 0.5612421929215823, + "grad_norm": 0.9237606525421143, + "learning_rate": 3.900676250079841e-05, + "loss": 0.7739, "step": 3235 }, { - "epoch": 1.181668796786562, - "grad_norm": 1.138514757156372, - "learning_rate": 2.0717104856271152e-05, - "loss": 0.8492, + "epoch": 0.5614156835530881, + "grad_norm": 0.9703139066696167, + "learning_rate": 3.900464098103765e-05, + "loss": 0.825, "step": 3236 }, { - "epoch": 1.1820339601971883, - "grad_norm": 1.2271310091018677, - "learning_rate": 2.0702768725136192e-05, - "loss": 0.7915, + "epoch": 0.5615891741845941, + "grad_norm": 0.7603265047073364, + "learning_rate": 3.9002517255766163e-05, + "loss": 0.6936, "step": 3237 }, { - "epoch": 1.1823991236078144, - "grad_norm": 1.6847772598266602, - "learning_rate": 2.068843223245492e-05, - "loss": 0.8269, + "epoch": 0.5617626648160999, + "grad_norm": 1.3383305072784424, + "learning_rate": 3.9000391325230405e-05, + "loss": 0.679, "step": 3238 }, { - "epoch": 1.1827642870184407, - "grad_norm": 1.1751292943954468, - "learning_rate": 2.0674095385602885e-05, - "loss": 0.8481, + "epoch": 0.5619361554476058, + "grad_norm": 0.9948176145553589, + "learning_rate": 3.89982631896771e-05, + "loss": 0.8271, "step": 3239 }, { - "epoch": 1.183129450429067, - "grad_norm": 1.0609362125396729, - "learning_rate": 2.0659758191955833e-05, - "loss": 0.8573, + "epoch": 0.5621096460791117, + "grad_norm": 1.204573392868042, + "learning_rate": 3.899613284935321e-05, + "loss": 0.6516, "step": 3240 }, { - "epoch": 1.1834946138396933, - "grad_norm": 1.0310548543930054, - "learning_rate": 2.0645420658889662e-05, - "loss": 0.8505, + "epoch": 0.5622831367106176, + "grad_norm": 0.905728280544281, + "learning_rate": 3.899400030450597e-05, + "loss": 0.7129, "step": 3241 }, { - "epoch": 1.1838597772503194, - "grad_norm": 1.006202220916748, - "learning_rate": 2.0631082793780464e-05, - "loss": 0.8448, + "epoch": 0.5624566273421235, + "grad_norm": 2.4333813190460205, + "learning_rate": 3.899186555538286e-05, + "loss": 0.8782, "step": 3242 }, { - "epoch": 1.1842249406609457, - "grad_norm": 1.3653069734573364, - "learning_rate": 2.0616744604004496e-05, - "loss": 0.795, + "epoch": 0.5626301179736294, + "grad_norm": 0.9529344439506531, + "learning_rate": 3.8989728602231623e-05, + "loss": 0.8279, "step": 3243 }, { - "epoch": 1.184590104071572, - "grad_norm": 1.2771992683410645, - "learning_rate": 2.0602406096938168e-05, - "loss": 0.839, + "epoch": 0.5628036086051353, + "grad_norm": 0.9341503381729126, + "learning_rate": 3.898758944530025e-05, + "loss": 0.7837, "step": 3244 }, { - "epoch": 1.1849552674821984, - "grad_norm": 1.49496591091156, - "learning_rate": 2.058806727995808e-05, - "loss": 0.8726, + "epoch": 0.5629770992366412, + "grad_norm": 1.059771180152893, + "learning_rate": 3.8985448084837e-05, + "loss": 0.6906, "step": 3245 }, { - "epoch": 1.1853204308928245, - "grad_norm": 0.9123517870903015, - "learning_rate": 2.0573728160440972e-05, - "loss": 0.8247, + "epoch": 0.5631505898681471, + "grad_norm": 0.9048548340797424, + "learning_rate": 3.898330452109038e-05, + "loss": 0.7078, "step": 3246 }, { - "epoch": 1.1856855943034508, - "grad_norm": 1.3055938482284546, - "learning_rate": 2.0559388745763754e-05, - "loss": 0.8468, + "epoch": 0.563324080499653, + "grad_norm": 1.0250953435897827, + "learning_rate": 3.8981158754309156e-05, + "loss": 0.7185, "step": 3247 }, { - "epoch": 1.186050757714077, - "grad_norm": 1.3630379438400269, - "learning_rate": 2.0545049043303463e-05, - "loss": 0.8535, + "epoch": 0.563497571131159, + "grad_norm": 1.130002737045288, + "learning_rate": 3.897901078474233e-05, + "loss": 0.7772, "step": 3248 }, { - "epoch": 1.1864159211247034, - "grad_norm": 1.21796715259552, - "learning_rate": 2.0530709060437323e-05, - "loss": 0.821, + "epoch": 0.5636710617626648, + "grad_norm": 0.8695076107978821, + "learning_rate": 3.897686061263919e-05, + "loss": 0.7319, "step": 3249 }, { - "epoch": 1.1867810845353295, - "grad_norm": 1.4203135967254639, - "learning_rate": 2.0516368804542662e-05, - "loss": 0.8694, + "epoch": 0.5638445523941708, + "grad_norm": 0.9831162095069885, + "learning_rate": 3.897470823824927e-05, + "loss": 0.7156, "step": 3250 }, { - "epoch": 1.1871462479459558, - "grad_norm": 1.4311378002166748, - "learning_rate": 2.050202828299697e-05, - "loss": 0.8282, + "epoch": 0.5640180430256766, + "grad_norm": 0.7917822003364563, + "learning_rate": 3.8972553661822334e-05, + "loss": 0.8611, "step": 3251 }, { - "epoch": 1.1875114113565821, - "grad_norm": 1.133628487586975, - "learning_rate": 2.0487687503177874e-05, - "loss": 0.796, + "epoch": 0.5641915336571826, + "grad_norm": 0.7945999503135681, + "learning_rate": 3.897039688360845e-05, + "loss": 0.7947, "step": 3252 }, { - "epoch": 1.1878765747672082, - "grad_norm": 0.6560249924659729, - "learning_rate": 2.0473346472463125e-05, - "loss": 0.853, + "epoch": 0.5643650242886884, + "grad_norm": 0.9222840070724487, + "learning_rate": 3.8968237903857906e-05, + "loss": 0.7644, "step": 3253 }, { - "epoch": 1.1882417381778345, - "grad_norm": 1.101206660270691, - "learning_rate": 2.045900519823061e-05, - "loss": 0.8566, + "epoch": 0.5645385149201944, + "grad_norm": 0.9562234282493591, + "learning_rate": 3.8966076722821245e-05, + "loss": 0.7905, "step": 3254 }, { - "epoch": 1.1886069015884608, - "grad_norm": 0.8909426927566528, - "learning_rate": 2.044466368785834e-05, - "loss": 0.8208, + "epoch": 0.5647120055517002, + "grad_norm": 0.9005074501037598, + "learning_rate": 3.896391334074928e-05, + "loss": 0.7573, "step": 3255 }, { - "epoch": 1.1889720649990871, - "grad_norm": 1.0966593027114868, - "learning_rate": 2.0430321948724447e-05, - "loss": 0.8185, + "epoch": 0.5648854961832062, + "grad_norm": 0.8899845480918884, + "learning_rate": 3.8961747757893075e-05, + "loss": 0.707, "step": 3256 }, { - "epoch": 1.1893372284097135, - "grad_norm": 1.753303050994873, - "learning_rate": 2.041597998820718e-05, - "loss": 0.8513, + "epoch": 0.565058986814712, + "grad_norm": 1.002325177192688, + "learning_rate": 3.8959579974503947e-05, + "loss": 0.6654, "step": 3257 }, { - "epoch": 1.1897023918203395, - "grad_norm": 1.3225566148757935, - "learning_rate": 2.0401637813684897e-05, - "loss": 0.8662, + "epoch": 0.5652324774462179, + "grad_norm": 0.8155195116996765, + "learning_rate": 3.895740999083347e-05, + "loss": 0.6946, "step": 3258 }, { - "epoch": 1.1900675552309659, - "grad_norm": 1.331086277961731, - "learning_rate": 2.038729543253608e-05, - "loss": 0.7994, + "epoch": 0.5654059680777238, + "grad_norm": 0.8891662955284119, + "learning_rate": 3.8955237807133485e-05, + "loss": 0.7471, "step": 3259 }, { - "epoch": 1.1904327186415922, - "grad_norm": 1.151976466178894, - "learning_rate": 2.0372952852139297e-05, - "loss": 0.8427, + "epoch": 0.5655794587092297, + "grad_norm": 0.9632456302642822, + "learning_rate": 3.8953063423656055e-05, + "loss": 0.7776, "step": 3260 }, { - "epoch": 1.1907978820522183, - "grad_norm": 1.5228445529937744, - "learning_rate": 2.0358610079873248e-05, - "loss": 0.8537, + "epoch": 0.5657529493407356, + "grad_norm": 1.337928056716919, + "learning_rate": 3.8950886840653524e-05, + "loss": 0.6899, "step": 3261 }, { - "epoch": 1.1911630454628446, - "grad_norm": 0.9811953902244568, - "learning_rate": 2.0344267123116697e-05, - "loss": 0.8406, + "epoch": 0.5659264399722415, + "grad_norm": 1.0427669286727905, + "learning_rate": 3.8948708058378504e-05, + "loss": 0.8832, "step": 3262 }, { - "epoch": 1.1915282088734709, - "grad_norm": 1.299730896949768, - "learning_rate": 2.0329923989248525e-05, - "loss": 0.8106, + "epoch": 0.5660999306037474, + "grad_norm": 0.8038963675498962, + "learning_rate": 3.894652707708383e-05, + "loss": 0.8787, "step": 3263 }, { - "epoch": 1.1918933722840972, - "grad_norm": 1.0496644973754883, - "learning_rate": 2.0315580685647703e-05, - "loss": 0.8356, + "epoch": 0.5662734212352533, + "grad_norm": 1.0677791833877563, + "learning_rate": 3.894434389702261e-05, + "loss": 0.7566, "step": 3264 }, { - "epoch": 1.1922585356947235, - "grad_norm": 1.2529593706130981, - "learning_rate": 2.0301237219693278e-05, - "loss": 0.8334, + "epoch": 0.5664469118667592, + "grad_norm": 1.1517128944396973, + "learning_rate": 3.894215851844821e-05, + "loss": 0.7393, "step": 3265 }, { - "epoch": 1.1926236991053496, - "grad_norm": 1.804771065711975, - "learning_rate": 2.0286893598764393e-05, - "loss": 0.8352, + "epoch": 0.5666204024982651, + "grad_norm": 0.8408830761909485, + "learning_rate": 3.8939970941614247e-05, + "loss": 0.7166, "step": 3266 }, { - "epoch": 1.192988862515976, - "grad_norm": 1.5234531164169312, - "learning_rate": 2.0272549830240265e-05, - "loss": 0.8214, + "epoch": 0.566793893129771, + "grad_norm": 0.983093798160553, + "learning_rate": 3.893778116677457e-05, + "loss": 0.755, "step": 3267 }, { - "epoch": 1.1933540259266022, - "grad_norm": 1.5725809335708618, - "learning_rate": 2.0258205921500183e-05, - "loss": 0.8679, + "epoch": 0.5669673837612769, + "grad_norm": 0.8686254024505615, + "learning_rate": 3.893558919418334e-05, + "loss": 0.7625, "step": 3268 }, { - "epoch": 1.1937191893372283, - "grad_norm": 1.0099960565567017, - "learning_rate": 2.024386187992352e-05, - "loss": 0.8715, + "epoch": 0.5671408743927828, + "grad_norm": 0.88538658618927, + "learning_rate": 3.8933395024094904e-05, + "loss": 0.8539, "step": 3269 }, { - "epoch": 1.1940843527478546, - "grad_norm": 1.3067671060562134, - "learning_rate": 2.02295177128897e-05, - "loss": 0.8182, + "epoch": 0.5673143650242887, + "grad_norm": 0.9174114465713501, + "learning_rate": 3.893119865676393e-05, + "loss": 0.7195, "step": 3270 }, { - "epoch": 1.194449516158481, - "grad_norm": 1.915399432182312, - "learning_rate": 2.0215173427778234e-05, - "loss": 0.8308, + "epoch": 0.5674878556557946, + "grad_norm": 1.260210394859314, + "learning_rate": 3.892900009244528e-05, + "loss": 0.7749, "step": 3271 }, { - "epoch": 1.1948146795691073, - "grad_norm": 1.3404712677001953, - "learning_rate": 2.0200829031968667e-05, - "loss": 0.8428, + "epoch": 0.5676613462873005, + "grad_norm": 0.9439926147460938, + "learning_rate": 3.892679933139412e-05, + "loss": 0.6865, "step": 3272 }, { - "epoch": 1.1951798429797333, - "grad_norm": 1.1268606185913086, - "learning_rate": 2.018648453284062e-05, - "loss": 0.8113, + "epoch": 0.5678348369188064, + "grad_norm": 1.074705958366394, + "learning_rate": 3.8924596373865834e-05, + "loss": 0.8311, "step": 3273 }, { - "epoch": 1.1955450063903597, - "grad_norm": 1.129716396331787, - "learning_rate": 2.017213993777377e-05, - "loss": 0.8168, + "epoch": 0.5680083275503123, + "grad_norm": 1.0000978708267212, + "learning_rate": 3.8922391220116094e-05, + "loss": 0.782, "step": 3274 }, { - "epoch": 1.195910169800986, - "grad_norm": 1.306874394416809, - "learning_rate": 2.0157795254147826e-05, - "loss": 0.8441, + "epoch": 0.5681818181818182, + "grad_norm": 0.8173935413360596, + "learning_rate": 3.89201838704008e-05, + "loss": 0.8618, "step": 3275 }, { - "epoch": 1.1962753332116123, - "grad_norm": 1.0080983638763428, - "learning_rate": 2.0143450489342563e-05, - "loss": 0.8501, + "epoch": 0.5683553088133241, + "grad_norm": 1.0270001888275146, + "learning_rate": 3.891797432497613e-05, + "loss": 0.6328, "step": 3276 }, { - "epoch": 1.1966404966222384, - "grad_norm": 1.2047613859176636, - "learning_rate": 2.012910565073777e-05, - "loss": 0.8519, + "epoch": 0.5685287994448299, + "grad_norm": 0.9559722542762756, + "learning_rate": 3.8915762584098484e-05, + "loss": 0.6401, "step": 3277 }, { - "epoch": 1.1970056600328647, - "grad_norm": 1.0406370162963867, - "learning_rate": 2.0114760745713305e-05, - "loss": 0.8549, + "epoch": 0.5687022900763359, + "grad_norm": 1.1604856252670288, + "learning_rate": 3.891354864802455e-05, + "loss": 0.7705, "step": 3278 }, { - "epoch": 1.197370823443491, - "grad_norm": 1.7499263286590576, - "learning_rate": 2.010041578164904e-05, - "loss": 0.8505, + "epoch": 0.5688757807078417, + "grad_norm": 0.9836113452911377, + "learning_rate": 3.891133251701127e-05, + "loss": 0.8292, "step": 3279 }, { - "epoch": 1.1977359868541173, - "grad_norm": 1.171832799911499, - "learning_rate": 2.0086070765924886e-05, - "loss": 0.8399, + "epoch": 0.5690492713393477, + "grad_norm": 0.8840076923370361, + "learning_rate": 3.89091141913158e-05, + "loss": 0.7574, "step": 3280 }, { - "epoch": 1.1981011502647434, - "grad_norm": 1.3671813011169434, - "learning_rate": 2.0071725705920776e-05, - "loss": 0.8408, + "epoch": 0.5692227619708535, + "grad_norm": 1.0821856260299683, + "learning_rate": 3.890689367119561e-05, + "loss": 0.656, "step": 3281 }, { - "epoch": 1.1984663136753697, - "grad_norm": 1.1691529750823975, - "learning_rate": 2.0057380609016666e-05, - "loss": 0.8125, + "epoch": 0.5693962526023595, + "grad_norm": 1.0997991561889648, + "learning_rate": 3.890467095690837e-05, + "loss": 0.8037, "step": 3282 }, { - "epoch": 1.198831477085996, - "grad_norm": 1.4721035957336426, - "learning_rate": 2.0043035482592543e-05, - "loss": 0.8496, + "epoch": 0.5695697432338653, + "grad_norm": 0.9432158470153809, + "learning_rate": 3.890244604871203e-05, + "loss": 0.845, "step": 3283 }, { - "epoch": 1.1991966404966223, - "grad_norm": 0.8874739408493042, - "learning_rate": 2.0028690334028384e-05, - "loss": 0.8389, + "epoch": 0.5697432338653713, + "grad_norm": 1.090069055557251, + "learning_rate": 3.890021894686481e-05, + "loss": 0.8311, "step": 3284 }, { - "epoch": 1.1995618039072484, - "grad_norm": 1.035582184791565, - "learning_rate": 2.0014345170704202e-05, - "loss": 0.8485, + "epoch": 0.5699167244968771, + "grad_norm": 0.9775926470756531, + "learning_rate": 3.889798965162516e-05, + "loss": 0.9165, "step": 3285 }, { - "epoch": 1.1999269673178747, - "grad_norm": 1.2435203790664673, - "learning_rate": 2e-05, - "loss": 0.8134, + "epoch": 0.5700902151283831, + "grad_norm": 1.5019277334213257, + "learning_rate": 3.8895758163251783e-05, + "loss": 0.8215, "step": 3286 }, { - "epoch": 1.200292130728501, - "grad_norm": 0.931449830532074, - "learning_rate": 1.99856548292958e-05, - "loss": 0.8076, + "epoch": 0.5702637057598889, + "grad_norm": 1.141744613647461, + "learning_rate": 3.889352448200366e-05, + "loss": 0.7598, "step": 3287 }, { - "epoch": 1.2006572941391274, - "grad_norm": 1.1458673477172852, - "learning_rate": 1.9971309665971623e-05, - "loss": 0.8416, + "epoch": 0.5704371963913949, + "grad_norm": 1.1989127397537231, + "learning_rate": 3.889128860814e-05, + "loss": 0.6943, "step": 3288 }, { - "epoch": 1.2010224575497535, - "grad_norm": 1.1441457271575928, - "learning_rate": 1.995696451740746e-05, - "loss": 0.8593, + "epoch": 0.5706106870229007, + "grad_norm": 1.115509271621704, + "learning_rate": 3.8889050541920285e-05, + "loss": 0.7097, "step": 3289 }, { - "epoch": 1.2013876209603798, - "grad_norm": 1.2669241428375244, - "learning_rate": 1.9942619390983334e-05, - "loss": 0.8292, + "epoch": 0.5707841776544067, + "grad_norm": 0.9037405252456665, + "learning_rate": 3.8886810283604245e-05, + "loss": 0.729, "step": 3290 }, { - "epoch": 1.201752784371006, - "grad_norm": 1.309957504272461, - "learning_rate": 1.9928274294079227e-05, - "loss": 0.8049, + "epoch": 0.5709576682859125, + "grad_norm": 1.0752978324890137, + "learning_rate": 3.888456783345187e-05, + "loss": 0.7205, "step": 3291 }, { - "epoch": 1.2021179477816322, - "grad_norm": 1.1433966159820557, - "learning_rate": 1.9913929234075117e-05, - "loss": 0.8293, + "epoch": 0.5711311589174185, + "grad_norm": 1.1317309141159058, + "learning_rate": 3.888232319172338e-05, + "loss": 0.7925, "step": 3292 }, { - "epoch": 1.2024831111922585, - "grad_norm": 1.1158641576766968, - "learning_rate": 1.989958421835097e-05, - "loss": 0.8636, + "epoch": 0.5713046495489243, + "grad_norm": 0.8673656582832336, + "learning_rate": 3.8880076358679295e-05, + "loss": 0.7805, "step": 3293 }, { - "epoch": 1.2028482746028848, - "grad_norm": 1.4010262489318848, - "learning_rate": 1.9885239254286705e-05, - "loss": 0.8124, + "epoch": 0.5714781401804303, + "grad_norm": 1.3401610851287842, + "learning_rate": 3.887782733458034e-05, + "loss": 0.6604, "step": 3294 }, { - "epoch": 1.203213438013511, - "grad_norm": 1.1075462102890015, - "learning_rate": 1.987089434926224e-05, - "loss": 0.8828, + "epoch": 0.5716516308119362, + "grad_norm": 1.143747329711914, + "learning_rate": 3.887557611968753e-05, + "loss": 0.7198, "step": 3295 }, { - "epoch": 1.2035786014241374, - "grad_norm": 1.159801721572876, - "learning_rate": 1.9856549510657447e-05, - "loss": 0.8383, + "epoch": 0.5718251214434421, + "grad_norm": 0.9949926137924194, + "learning_rate": 3.887332271426211e-05, + "loss": 0.8398, "step": 3296 }, { - "epoch": 1.2039437648347635, - "grad_norm": 1.1048412322998047, - "learning_rate": 1.984220474585218e-05, - "loss": 0.8224, + "epoch": 0.571998612074948, + "grad_norm": 0.950626015663147, + "learning_rate": 3.887106711856561e-05, + "loss": 0.7732, "step": 3297 }, { - "epoch": 1.2043089282453898, - "grad_norm": 1.0828522443771362, - "learning_rate": 1.9827860062226236e-05, - "loss": 0.8307, + "epoch": 0.5721721027064538, + "grad_norm": 0.897932231426239, + "learning_rate": 3.886880933285977e-05, + "loss": 0.8257, "step": 3298 }, { - "epoch": 1.2046740916560161, - "grad_norm": 1.0090134143829346, - "learning_rate": 1.9813515467159382e-05, - "loss": 0.8071, + "epoch": 0.5723455933379598, + "grad_norm": 2.511120557785034, + "learning_rate": 3.886654935740663e-05, + "loss": 0.8074, "step": 3299 }, { - "epoch": 1.2050392550666422, - "grad_norm": 0.9179986715316772, - "learning_rate": 1.979917096803134e-05, - "loss": 0.8015, + "epoch": 0.5725190839694656, + "grad_norm": 1.066027283668518, + "learning_rate": 3.886428719246845e-05, + "loss": 0.8538, "step": 3300 }, { - "epoch": 1.2054044184772685, - "grad_norm": 0.9157357811927795, - "learning_rate": 1.9784826572221773e-05, - "loss": 0.8563, + "epoch": 0.5726925746009716, + "grad_norm": 0.8356720209121704, + "learning_rate": 3.886202283830776e-05, + "loss": 0.8513, "step": 3301 }, { - "epoch": 1.2057695818878948, - "grad_norm": 2.046492576599121, - "learning_rate": 1.9770482287110305e-05, - "loss": 0.8774, + "epoch": 0.5728660652324774, + "grad_norm": 0.9208877682685852, + "learning_rate": 3.885975629518734e-05, + "loss": 0.6959, "step": 3302 }, { - "epoch": 1.2061347452985212, - "grad_norm": 1.0792189836502075, - "learning_rate": 1.9756138120076484e-05, - "loss": 0.8196, + "epoch": 0.5730395558639834, + "grad_norm": 1.0051295757293701, + "learning_rate": 3.885748756337022e-05, + "loss": 0.7068, "step": 3303 }, { - "epoch": 1.2064999087091473, - "grad_norm": 1.1643941402435303, - "learning_rate": 1.974179407849982e-05, - "loss": 0.863, + "epoch": 0.5732130464954892, + "grad_norm": 0.8778530359268188, + "learning_rate": 3.8855216643119697e-05, + "loss": 0.8281, "step": 3304 }, { - "epoch": 1.2068650721197736, - "grad_norm": 1.2341009378433228, - "learning_rate": 1.9727450169759738e-05, - "loss": 0.8607, + "epoch": 0.5733865371269952, + "grad_norm": 1.0762512683868408, + "learning_rate": 3.885294353469932e-05, + "loss": 0.6523, "step": 3305 }, { - "epoch": 1.2072302355303999, - "grad_norm": 1.56360924243927, - "learning_rate": 1.971310640123561e-05, - "loss": 0.8174, + "epoch": 0.573560027758501, + "grad_norm": 1.2974227666854858, + "learning_rate": 3.885066823837287e-05, + "loss": 0.856, "step": 3306 }, { - "epoch": 1.2075953989410262, - "grad_norm": 0.9564784169197083, - "learning_rate": 1.9698762780306732e-05, - "loss": 0.8016, + "epoch": 0.573733518390007, + "grad_norm": 0.8534640073776245, + "learning_rate": 3.884839075440441e-05, + "loss": 0.7007, "step": 3307 }, { - "epoch": 1.2079605623516523, - "grad_norm": 1.1537206172943115, - "learning_rate": 1.9684419314352307e-05, - "loss": 0.8588, + "epoch": 0.5739070090215128, + "grad_norm": 6.018133640289307, + "learning_rate": 3.884611108305824e-05, + "loss": 0.8494, "step": 3308 }, { - "epoch": 1.2083257257622786, - "grad_norm": 0.941397488117218, - "learning_rate": 1.9670076010751478e-05, - "loss": 0.8365, + "epoch": 0.5740804996530188, + "grad_norm": 1.186869502067566, + "learning_rate": 3.884382922459891e-05, + "loss": 0.7742, "step": 3309 }, { - "epoch": 1.208690889172905, - "grad_norm": 1.3067145347595215, - "learning_rate": 1.965573287688331e-05, - "loss": 0.8671, + "epoch": 0.5742539902845246, + "grad_norm": 1.7233161926269531, + "learning_rate": 3.8841545179291254e-05, + "loss": 0.7871, "step": 3310 }, { - "epoch": 1.2090560525835312, - "grad_norm": 0.9691867232322693, - "learning_rate": 1.964138992012676e-05, - "loss": 0.8359, + "epoch": 0.5744274809160306, + "grad_norm": 0.8519104719161987, + "learning_rate": 3.8839258947400325e-05, + "loss": 0.8652, "step": 3311 }, { - "epoch": 1.2094212159941573, - "grad_norm": 1.2289597988128662, - "learning_rate": 1.9627047147860706e-05, - "loss": 0.8433, + "epoch": 0.5746009715475364, + "grad_norm": 1.1321083307266235, + "learning_rate": 3.883697052919143e-05, + "loss": 0.7273, "step": 3312 }, { - "epoch": 1.2097863794047836, - "grad_norm": 2.477412700653076, - "learning_rate": 1.9612704567463926e-05, - "loss": 0.8435, + "epoch": 0.5747744621790424, + "grad_norm": 1.221879243850708, + "learning_rate": 3.883467992493017e-05, + "loss": 0.7976, "step": 3313 }, { - "epoch": 1.21015154281541, - "grad_norm": 1.1635750532150269, - "learning_rate": 1.959836218631511e-05, - "loss": 0.8755, + "epoch": 0.5749479528105482, + "grad_norm": 0.9935945868492126, + "learning_rate": 3.883238713488235e-05, + "loss": 0.7292, "step": 3314 }, { - "epoch": 1.2105167062260362, - "grad_norm": 0.9641543030738831, - "learning_rate": 1.9584020011792825e-05, - "loss": 0.8414, + "epoch": 0.5751214434420542, + "grad_norm": 0.9340473413467407, + "learning_rate": 3.883009215931406e-05, + "loss": 0.8088, "step": 3315 }, { - "epoch": 1.2108818696366623, - "grad_norm": 1.742769718170166, - "learning_rate": 1.956967805127556e-05, - "loss": 0.8696, + "epoch": 0.57529493407356, + "grad_norm": 0.8700243830680847, + "learning_rate": 3.882779499849163e-05, + "loss": 0.8147, "step": 3316 }, { - "epoch": 1.2112470330472886, - "grad_norm": 1.1160908937454224, - "learning_rate": 1.955533631214166e-05, - "loss": 0.8326, + "epoch": 0.5754684247050659, + "grad_norm": 0.7040034532546997, + "learning_rate": 3.8825495652681653e-05, + "loss": 0.8335, "step": 3317 }, { - "epoch": 1.211612196457915, - "grad_norm": 1.0219699144363403, - "learning_rate": 1.9540994801769392e-05, - "loss": 0.8818, + "epoch": 0.5756419153365718, + "grad_norm": 1.4113163948059082, + "learning_rate": 3.8823194122150975e-05, + "loss": 0.6691, "step": 3318 }, { - "epoch": 1.2119773598685413, - "grad_norm": 1.1280195713043213, - "learning_rate": 1.952665352753688e-05, - "loss": 0.836, + "epoch": 0.5758154059680777, + "grad_norm": 0.9696629047393799, + "learning_rate": 3.8820890407166683e-05, + "loss": 0.9363, "step": 3319 }, { - "epoch": 1.2123425232791674, - "grad_norm": 0.9732888340950012, - "learning_rate": 1.9512312496822136e-05, - "loss": 0.8322, + "epoch": 0.5759888965995836, + "grad_norm": 1.9210888147354126, + "learning_rate": 3.881858450799612e-05, + "loss": 0.8188, "step": 3320 }, { - "epoch": 1.2127076866897937, - "grad_norm": 1.1802831888198853, - "learning_rate": 1.949797171700304e-05, - "loss": 0.7772, + "epoch": 0.5761623872310895, + "grad_norm": 2.6112306118011475, + "learning_rate": 3.881627642490691e-05, + "loss": 0.7871, "step": 3321 }, { - "epoch": 1.21307285010042, - "grad_norm": 0.915736973285675, - "learning_rate": 1.9483631195457348e-05, - "loss": 0.8442, + "epoch": 0.5763358778625954, + "grad_norm": 1.3238176107406616, + "learning_rate": 3.8813966158166894e-05, + "loss": 0.8323, "step": 3322 }, { - "epoch": 1.213438013511046, - "grad_norm": 1.0336682796478271, - "learning_rate": 1.9469290939562684e-05, - "loss": 0.7877, + "epoch": 0.5765093684941013, + "grad_norm": 1.537766456604004, + "learning_rate": 3.8811653708044176e-05, + "loss": 0.7417, "step": 3323 }, { - "epoch": 1.2138031769216724, - "grad_norm": 1.2619839906692505, - "learning_rate": 1.945495095669654e-05, - "loss": 0.832, + "epoch": 0.5766828591256072, + "grad_norm": 4.600706100463867, + "learning_rate": 3.8809339074807125e-05, + "loss": 0.5975, "step": 3324 }, { - "epoch": 1.2141683403322987, - "grad_norm": 0.887502908706665, - "learning_rate": 1.9440611254236253e-05, - "loss": 0.7965, + "epoch": 0.5768563497571131, + "grad_norm": 0.9510989785194397, + "learning_rate": 3.880702225872437e-05, + "loss": 0.7012, "step": 3325 }, { - "epoch": 1.214533503742925, - "grad_norm": 0.8556562066078186, - "learning_rate": 1.942627183955903e-05, - "loss": 0.8664, + "epoch": 0.577029840388619, + "grad_norm": 1.6305487155914307, + "learning_rate": 3.8804703260064756e-05, + "loss": 0.6831, "step": 3326 }, { - "epoch": 1.2148986671535513, - "grad_norm": 0.9830933809280396, - "learning_rate": 1.9411932720041926e-05, - "loss": 0.8562, + "epoch": 0.5772033310201249, + "grad_norm": 1.3558731079101562, + "learning_rate": 3.880238207909742e-05, + "loss": 0.6458, "step": 3327 }, { - "epoch": 1.2152638305641774, - "grad_norm": 1.19175124168396, - "learning_rate": 1.939759390306184e-05, - "loss": 0.8669, + "epoch": 0.5773768216516308, + "grad_norm": 1.5405890941619873, + "learning_rate": 3.880005871609173e-05, + "loss": 0.7245, "step": 3328 }, { - "epoch": 1.2156289939748037, - "grad_norm": 1.3945707082748413, - "learning_rate": 1.9383255395995514e-05, - "loss": 0.8012, + "epoch": 0.5775503122831367, + "grad_norm": 1.0386523008346558, + "learning_rate": 3.879773317131732e-05, + "loss": 0.6283, "step": 3329 }, { - "epoch": 1.21599415738543, - "grad_norm": 1.3378413915634155, - "learning_rate": 1.9368917206219536e-05, - "loss": 0.8838, + "epoch": 0.5777238029146426, + "grad_norm": 1.3538662195205688, + "learning_rate": 3.879540544504408e-05, + "loss": 0.7251, "step": 3330 }, { - "epoch": 1.2163593207960561, - "grad_norm": 1.195921540260315, - "learning_rate": 1.935457934111034e-05, - "loss": 0.8079, + "epoch": 0.5778972935461485, + "grad_norm": 1.5458240509033203, + "learning_rate": 3.879307553754213e-05, + "loss": 0.9058, "step": 3331 }, { - "epoch": 1.2167244842066824, - "grad_norm": 0.9895886182785034, - "learning_rate": 1.9340241808044167e-05, - "loss": 0.8668, + "epoch": 0.5780707841776545, + "grad_norm": 0.9138756990432739, + "learning_rate": 3.879074344908187e-05, + "loss": 0.9038, "step": 3332 }, { - "epoch": 1.2170896476173088, - "grad_norm": 0.9527052044868469, - "learning_rate": 1.932590461439712e-05, - "loss": 0.8362, + "epoch": 0.5782442748091603, + "grad_norm": 1.088436484336853, + "learning_rate": 3.878840917993393e-05, + "loss": 0.8477, "step": 3333 }, { - "epoch": 1.217454811027935, - "grad_norm": 1.582803726196289, - "learning_rate": 1.931156776754509e-05, - "loss": 0.8644, + "epoch": 0.5784177654406663, + "grad_norm": 2.881063222885132, + "learning_rate": 3.878607273036922e-05, + "loss": 0.8689, "step": 3334 }, { - "epoch": 1.2178199744385612, - "grad_norm": 1.0854495763778687, - "learning_rate": 1.9297231274863818e-05, - "loss": 0.8524, + "epoch": 0.5785912560721721, + "grad_norm": 0.9589551687240601, + "learning_rate": 3.8783734100658874e-05, + "loss": 0.7493, "step": 3335 }, { - "epoch": 1.2181851378491875, - "grad_norm": 1.2603267431259155, - "learning_rate": 1.9282895143728858e-05, - "loss": 0.8523, + "epoch": 0.5787647467036781, + "grad_norm": 1.0763031244277954, + "learning_rate": 3.8781393291074296e-05, + "loss": 0.7717, "step": 3336 }, { - "epoch": 1.2185503012598138, - "grad_norm": 0.9232439398765564, - "learning_rate": 1.9268559381515567e-05, - "loss": 0.8271, + "epoch": 0.5789382373351839, + "grad_norm": 1.104504942893982, + "learning_rate": 3.877905030188715e-05, + "loss": 0.6675, "step": 3337 }, { - "epoch": 1.21891546467044, - "grad_norm": 0.8192688822746277, - "learning_rate": 1.925422399559913e-05, - "loss": 0.8595, + "epoch": 0.5791117279666897, + "grad_norm": 5.572535514831543, + "learning_rate": 3.8776705133369333e-05, + "loss": 0.7423, "step": 3338 }, { - "epoch": 1.2192806280810662, - "grad_norm": 1.1674103736877441, - "learning_rate": 1.9239888993354513e-05, - "loss": 0.8519, + "epoch": 0.5792852185981957, + "grad_norm": 1.4182738065719604, + "learning_rate": 3.8774357785793e-05, + "loss": 0.8385, "step": 3339 }, { - "epoch": 1.2196457914916925, - "grad_norm": 1.4427227973937988, - "learning_rate": 1.9225554382156514e-05, - "loss": 0.8215, + "epoch": 0.5794587092297016, + "grad_norm": 1.098347783088684, + "learning_rate": 3.8772008259430575e-05, + "loss": 0.8245, "step": 3340 }, { - "epoch": 1.2200109549023188, - "grad_norm": 1.0442452430725098, - "learning_rate": 1.9211220169379706e-05, - "loss": 0.8217, + "epoch": 0.5796321998612075, + "grad_norm": 1.3457553386688232, + "learning_rate": 3.8769656554554716e-05, + "loss": 0.7727, "step": 3341 }, { - "epoch": 1.2203761183129451, - "grad_norm": 1.0108400583267212, - "learning_rate": 1.9196886362398467e-05, - "loss": 0.8722, + "epoch": 0.5798056904927134, + "grad_norm": 1.3080657720565796, + "learning_rate": 3.876730267143834e-05, + "loss": 0.8015, "step": 3342 }, { - "epoch": 1.2207412817235712, - "grad_norm": 1.1154096126556396, - "learning_rate": 1.9182552968586973e-05, - "loss": 0.8698, + "epoch": 0.5799791811242193, + "grad_norm": 2.249117851257324, + "learning_rate": 3.8764946610354626e-05, + "loss": 0.6565, "step": 3343 }, { - "epoch": 1.2211064451341975, - "grad_norm": 1.0552412271499634, - "learning_rate": 1.9168219995319166e-05, - "loss": 0.8169, + "epoch": 0.5801526717557252, + "grad_norm": 1.3174009323120117, + "learning_rate": 3.876258837157699e-05, + "loss": 0.6562, "step": 3344 }, { - "epoch": 1.2214716085448238, - "grad_norm": 1.1640373468399048, - "learning_rate": 1.9153887449968802e-05, - "loss": 0.8178, + "epoch": 0.5803261623872311, + "grad_norm": 0.9355014562606812, + "learning_rate": 3.876022795537911e-05, + "loss": 0.6492, "step": 3345 }, { - "epoch": 1.2218367719554502, - "grad_norm": 1.4044495820999146, - "learning_rate": 1.9139555339909388e-05, - "loss": 0.8316, + "epoch": 0.580499653018737, + "grad_norm": 0.9691296219825745, + "learning_rate": 3.8757865362034914e-05, + "loss": 0.6727, "step": 3346 }, { - "epoch": 1.2222019353660762, - "grad_norm": 1.0380274057388306, - "learning_rate": 1.9125223672514233e-05, - "loss": 0.8781, + "epoch": 0.5806731436502429, + "grad_norm": 2.9596078395843506, + "learning_rate": 3.875550059181859e-05, + "loss": 0.8247, "step": 3347 }, { - "epoch": 1.2225670987767026, - "grad_norm": 1.1844145059585571, - "learning_rate": 1.9110892455156395e-05, - "loss": 0.8633, + "epoch": 0.5808466342817488, + "grad_norm": 1.0954926013946533, + "learning_rate": 3.875313364500456e-05, + "loss": 0.824, "step": 3348 }, { - "epoch": 1.2229322621873289, - "grad_norm": 1.0224616527557373, - "learning_rate": 1.90965616952087e-05, - "loss": 0.8363, + "epoch": 0.5810201249132547, + "grad_norm": 1.1232010126113892, + "learning_rate": 3.8750764521867526e-05, + "loss": 0.6606, "step": 3349 }, { - "epoch": 1.2232974255979552, - "grad_norm": 1.3052558898925781, - "learning_rate": 1.908223140004377e-05, - "loss": 0.835, + "epoch": 0.5811936155447606, + "grad_norm": 1.053740382194519, + "learning_rate": 3.8748393222682425e-05, + "loss": 0.6559, "step": 3350 }, { - "epoch": 1.2236625890085813, - "grad_norm": 1.1268895864486694, - "learning_rate": 1.906790157703395e-05, - "loss": 0.8291, + "epoch": 0.5813671061762665, + "grad_norm": 1.42743980884552, + "learning_rate": 3.8746019747724436e-05, + "loss": 0.7588, "step": 3351 }, { - "epoch": 1.2240277524192076, - "grad_norm": 1.0528055429458618, - "learning_rate": 1.9053572233551365e-05, - "loss": 0.8149, + "epoch": 0.5815405968077724, + "grad_norm": 1.4473203420639038, + "learning_rate": 3.874364409726901e-05, + "loss": 0.8248, "step": 3352 }, { - "epoch": 1.224392915829834, - "grad_norm": 1.012868046760559, - "learning_rate": 1.9039243376967893e-05, - "loss": 0.8286, + "epoch": 0.5817140874392783, + "grad_norm": 1.9037114381790161, + "learning_rate": 3.8741266271591846e-05, + "loss": 0.8459, "step": 3353 }, { - "epoch": 1.22475807924046, - "grad_norm": 1.5613254308700562, - "learning_rate": 1.9024915014655146e-05, - "loss": 0.8523, + "epoch": 0.5818875780707842, + "grad_norm": 1.0604567527770996, + "learning_rate": 3.87388862709689e-05, + "loss": 0.655, "step": 3354 }, { - "epoch": 1.2251232426510863, - "grad_norm": 0.6615365743637085, - "learning_rate": 1.9010587153984503e-05, - "loss": 0.8618, + "epoch": 0.5820610687022901, + "grad_norm": 1.0767502784729004, + "learning_rate": 3.8736504095676364e-05, + "loss": 0.6543, "step": 3355 }, { - "epoch": 1.2254884060617126, - "grad_norm": 0.8976117968559265, - "learning_rate": 1.899625980232706e-05, - "loss": 0.809, + "epoch": 0.582234559333796, + "grad_norm": 1.0078351497650146, + "learning_rate": 3.8734119745990696e-05, + "loss": 0.7825, "step": 3356 }, { - "epoch": 1.225853569472339, - "grad_norm": 1.0098590850830078, - "learning_rate": 1.8981932967053677e-05, - "loss": 0.8625, + "epoch": 0.5824080499653018, + "grad_norm": 1.05112886428833, + "learning_rate": 3.8731733222188605e-05, + "loss": 0.8049, "step": 3357 }, { - "epoch": 1.2262187328829652, - "grad_norm": 2.223496198654175, - "learning_rate": 1.8967606655534926e-05, - "loss": 0.8212, + "epoch": 0.5825815405968078, + "grad_norm": 0.9623383283615112, + "learning_rate": 3.872934452454704e-05, + "loss": 0.7378, "step": 3358 }, { - "epoch": 1.2265838962935913, - "grad_norm": 1.1447545289993286, - "learning_rate": 1.8953280875141125e-05, - "loss": 0.7942, + "epoch": 0.5827550312283136, + "grad_norm": 0.7098467350006104, + "learning_rate": 3.8726953653343226e-05, + "loss": 0.8093, "step": 3359 }, { - "epoch": 1.2269490597042176, - "grad_norm": 1.1946122646331787, - "learning_rate": 1.893895563324232e-05, - "loss": 0.8447, + "epoch": 0.5829285218598196, + "grad_norm": 0.8150134682655334, + "learning_rate": 3.872456060885461e-05, + "loss": 0.7584, "step": 3360 }, { - "epoch": 1.227314223114844, - "grad_norm": 1.2247114181518555, - "learning_rate": 1.892463093720825e-05, - "loss": 0.8423, + "epoch": 0.5831020124913254, + "grad_norm": 1.2539478540420532, + "learning_rate": 3.8722165391358926e-05, + "loss": 0.6401, "step": 3361 }, { - "epoch": 1.22767938652547, - "grad_norm": 1.212943196296692, - "learning_rate": 1.891030679440842e-05, - "loss": 0.8607, + "epoch": 0.5832755031228314, + "grad_norm": 1.1310327053070068, + "learning_rate": 3.8719768001134124e-05, + "loss": 0.7732, "step": 3362 }, { - "epoch": 1.2280445499360964, - "grad_norm": 1.2249228954315186, - "learning_rate": 1.889598321221201e-05, - "loss": 0.8328, + "epoch": 0.5834489937543372, + "grad_norm": 2.0693206787109375, + "learning_rate": 3.8717368438458435e-05, + "loss": 0.6714, "step": 3363 }, { - "epoch": 1.2284097133467227, - "grad_norm": 0.9054585099220276, - "learning_rate": 1.8881660197987937e-05, - "loss": 0.8187, + "epoch": 0.5836224843858432, + "grad_norm": 0.8484922647476196, + "learning_rate": 3.871496670361033e-05, + "loss": 0.8062, "step": 3364 }, { - "epoch": 1.228774876757349, - "grad_norm": 1.0008068084716797, - "learning_rate": 1.886733775910482e-05, - "loss": 0.8534, + "epoch": 0.583795975017349, + "grad_norm": 1.114558219909668, + "learning_rate": 3.871256279686854e-05, + "loss": 0.7881, "step": 3365 }, { - "epoch": 1.2291400401679753, - "grad_norm": 1.1131821870803833, - "learning_rate": 1.8853015902930974e-05, - "loss": 0.8428, + "epoch": 0.583969465648855, + "grad_norm": 1.2425038814544678, + "learning_rate": 3.871015671851202e-05, + "loss": 0.7087, "step": 3366 }, { - "epoch": 1.2295052035786014, - "grad_norm": 1.0614910125732422, - "learning_rate": 1.8838694636834423e-05, - "loss": 0.8016, + "epoch": 0.5841429562803608, + "grad_norm": 1.4629952907562256, + "learning_rate": 3.8707748468820024e-05, + "loss": 0.7688, "step": 3367 }, { - "epoch": 1.2298703669892277, - "grad_norm": 0.8531900644302368, - "learning_rate": 1.8824373968182875e-05, - "loss": 0.8114, + "epoch": 0.5843164469118668, + "grad_norm": 0.8878246545791626, + "learning_rate": 3.870533804807201e-05, + "loss": 0.7603, "step": 3368 }, { - "epoch": 1.230235530399854, - "grad_norm": 1.0466670989990234, - "learning_rate": 1.881005390434375e-05, - "loss": 0.8278, + "epoch": 0.5844899375433726, + "grad_norm": 0.9428712725639343, + "learning_rate": 3.870292545654772e-05, + "loss": 0.7489, "step": 3369 }, { - "epoch": 1.23060069381048, - "grad_norm": 1.3784377574920654, - "learning_rate": 1.879573445268414e-05, - "loss": 0.8744, + "epoch": 0.5846634281748786, + "grad_norm": 1.213358759880066, + "learning_rate": 3.870051069452714e-05, + "loss": 0.6848, "step": 3370 }, { - "epoch": 1.2309658572211064, - "grad_norm": 1.0605783462524414, - "learning_rate": 1.8781415620570832e-05, - "loss": 0.8331, + "epoch": 0.5848369188063844, + "grad_norm": 1.0936118364334106, + "learning_rate": 3.86980937622905e-05, + "loss": 0.7712, "step": 3371 }, { - "epoch": 1.2313310206317327, - "grad_norm": 1.191085696220398, - "learning_rate": 1.8767097415370287e-05, - "loss": 0.8084, + "epoch": 0.5850104094378904, + "grad_norm": 1.2070332765579224, + "learning_rate": 3.8695674660118294e-05, + "loss": 0.9226, "step": 3372 }, { - "epoch": 1.231696184042359, - "grad_norm": 1.1043466329574585, - "learning_rate": 1.8752779844448653e-05, - "loss": 0.8094, + "epoch": 0.5851839000693962, + "grad_norm": 0.9931301474571228, + "learning_rate": 3.8693253388291256e-05, + "loss": 0.9146, "step": 3373 }, { - "epoch": 1.2320613474529851, - "grad_norm": 1.3228189945220947, - "learning_rate": 1.8738462915171736e-05, - "loss": 0.8132, + "epoch": 0.5853573907009022, + "grad_norm": 0.9781160354614258, + "learning_rate": 3.8690829947090386e-05, + "loss": 0.7434, "step": 3374 }, { - "epoch": 1.2324265108636114, - "grad_norm": 1.437719702720642, - "learning_rate": 1.8724146634905026e-05, - "loss": 0.8373, + "epoch": 0.585530881332408, + "grad_norm": 0.9379550218582153, + "learning_rate": 3.868840433679692e-05, + "loss": 0.885, "step": 3375 }, { - "epoch": 1.2327916742742377, - "grad_norm": 0.8636904358863831, - "learning_rate": 1.8709831011013678e-05, - "loss": 0.8149, + "epoch": 0.5857043719639139, + "grad_norm": 0.7997726202011108, + "learning_rate": 3.868597655769235e-05, + "loss": 0.8359, "step": 3376 }, { - "epoch": 1.233156837684864, - "grad_norm": 1.2160757780075073, - "learning_rate": 1.8695516050862504e-05, - "loss": 0.8311, + "epoch": 0.5858778625954199, + "grad_norm": 1.0879937410354614, + "learning_rate": 3.8683546610058434e-05, + "loss": 0.7094, "step": 3377 }, { - "epoch": 1.2335220010954902, - "grad_norm": 1.4576058387756348, - "learning_rate": 1.8681201761815974e-05, - "loss": 0.8711, + "epoch": 0.5860513532269257, + "grad_norm": 0.7437997460365295, + "learning_rate": 3.868111449417716e-05, + "loss": 0.7686, "step": 3378 }, { - "epoch": 1.2338871645061165, - "grad_norm": 1.0419987440109253, - "learning_rate": 1.8666888151238217e-05, - "loss": 0.8314, + "epoch": 0.5862248438584317, + "grad_norm": 0.862391471862793, + "learning_rate": 3.867868021033078e-05, + "loss": 0.8008, "step": 3379 }, { - "epoch": 1.2342523279167428, - "grad_norm": 1.218010663986206, - "learning_rate": 1.8652575226493012e-05, - "loss": 0.8534, + "epoch": 0.5863983344899375, + "grad_norm": 0.778441309928894, + "learning_rate": 3.867624375880179e-05, + "loss": 0.7932, "step": 3380 }, { - "epoch": 1.234617491327369, - "grad_norm": 0.891401469707489, - "learning_rate": 1.863826299494379e-05, - "loss": 0.886, + "epoch": 0.5865718251214435, + "grad_norm": 0.8389182090759277, + "learning_rate": 3.8673805139872966e-05, + "loss": 0.6383, "step": 3381 }, { - "epoch": 1.2349826547379952, - "grad_norm": 2.645328998565674, - "learning_rate": 1.8623951463953605e-05, - "loss": 0.8302, + "epoch": 0.5867453157529493, + "grad_norm": 0.7327997088432312, + "learning_rate": 3.8671364353827284e-05, + "loss": 0.7891, "step": 3382 }, { - "epoch": 1.2353478181486215, - "grad_norm": 1.0609201192855835, - "learning_rate": 1.8609640640885177e-05, - "loss": 0.789, + "epoch": 0.5869188063844553, + "grad_norm": 0.8832686543464661, + "learning_rate": 3.8668921400948015e-05, + "loss": 0.6917, "step": 3383 }, { - "epoch": 1.2357129815592478, - "grad_norm": 1.327784538269043, - "learning_rate": 1.859533053310085e-05, - "loss": 0.8457, + "epoch": 0.5870922970159611, + "grad_norm": 1.2886186838150024, + "learning_rate": 3.8666476281518665e-05, + "loss": 0.6672, "step": 3384 }, { - "epoch": 1.2360781449698741, - "grad_norm": 3.900519609451294, - "learning_rate": 1.8581021147962593e-05, - "loss": 0.8527, + "epoch": 0.5872657876474671, + "grad_norm": 0.7292872667312622, + "learning_rate": 3.866402899582299e-05, + "loss": 0.9248, "step": 3385 }, { - "epoch": 1.2364433083805002, - "grad_norm": 1.2702217102050781, - "learning_rate": 1.856671249283202e-05, - "loss": 0.8333, + "epoch": 0.5874392782789729, + "grad_norm": 0.7534664869308472, + "learning_rate": 3.8661579544145e-05, + "loss": 0.7778, "step": 3386 }, { - "epoch": 1.2368084717911265, - "grad_norm": 1.5276168584823608, - "learning_rate": 1.8552404575070363e-05, - "loss": 0.8693, + "epoch": 0.5876127689104789, + "grad_norm": 1.2874999046325684, + "learning_rate": 3.865912792676897e-05, + "loss": 0.7563, "step": 3387 }, { - "epoch": 1.2371736352017528, - "grad_norm": 1.2198715209960938, - "learning_rate": 1.8538097402038452e-05, - "loss": 0.8404, + "epoch": 0.5877862595419847, + "grad_norm": 0.9338006377220154, + "learning_rate": 3.8656674143979386e-05, + "loss": 0.7329, "step": 3388 }, { - "epoch": 1.2375387986123791, - "grad_norm": 1.2659879922866821, - "learning_rate": 1.852379098109677e-05, - "loss": 0.8365, + "epoch": 0.5879597501734907, + "grad_norm": 0.937761664390564, + "learning_rate": 3.865421819606104e-05, + "loss": 0.7751, "step": 3389 }, { - "epoch": 1.2379039620230052, - "grad_norm": 1.1647974252700806, - "learning_rate": 1.850948531960539e-05, - "loss": 0.8167, + "epoch": 0.5881332408049965, + "grad_norm": 0.8478882908821106, + "learning_rate": 3.8651760083298926e-05, + "loss": 0.6923, "step": 3390 }, { - "epoch": 1.2382691254336315, - "grad_norm": 1.3742437362670898, - "learning_rate": 1.8495180424924003e-05, - "loss": 0.772, + "epoch": 0.5883067314365025, + "grad_norm": 0.7252093553543091, + "learning_rate": 3.8649299805978324e-05, + "loss": 0.8159, "step": 3391 }, { - "epoch": 1.2386342888442579, - "grad_norm": 1.1507313251495361, - "learning_rate": 1.84808763044119e-05, - "loss": 0.8137, + "epoch": 0.5884802220680083, + "grad_norm": 0.9549953937530518, + "learning_rate": 3.864683736438475e-05, + "loss": 0.8585, "step": 3392 }, { - "epoch": 1.238999452254884, - "grad_norm": 1.156319260597229, - "learning_rate": 1.8466572965427984e-05, - "loss": 0.8135, + "epoch": 0.5886537126995143, + "grad_norm": 0.8501011729240417, + "learning_rate": 3.864437275880398e-05, + "loss": 0.75, "step": 3393 }, { - "epoch": 1.2393646156655103, - "grad_norm": 1.1585853099822998, - "learning_rate": 1.845227041533074e-05, - "loss": 0.8055, + "epoch": 0.5888272033310201, + "grad_norm": 0.99753737449646, + "learning_rate": 3.8641905989522016e-05, + "loss": 0.7603, "step": 3394 }, { - "epoch": 1.2397297790761366, - "grad_norm": 1.1398268938064575, - "learning_rate": 1.8437968661478262e-05, - "loss": 0.7998, + "epoch": 0.5890006939625261, + "grad_norm": 0.9806280732154846, + "learning_rate": 3.8639437056825146e-05, + "loss": 0.6565, "step": 3395 }, { - "epoch": 1.2400949424867629, - "grad_norm": 1.1739405393600464, - "learning_rate": 1.842366771122823e-05, - "loss": 0.854, + "epoch": 0.5891741845940319, + "grad_norm": 0.8904507160186768, + "learning_rate": 3.863696596099988e-05, + "loss": 0.7174, "step": 3396 }, { - "epoch": 1.2404601058973892, - "grad_norm": 1.2232005596160889, - "learning_rate": 1.8409367571937903e-05, - "loss": 0.8383, + "epoch": 0.5893476752255378, + "grad_norm": 0.8172715902328491, + "learning_rate": 3.8634492702333e-05, + "loss": 0.8464, "step": 3397 }, { - "epoch": 1.2408252693080153, - "grad_norm": 1.128341555595398, - "learning_rate": 1.8395068250964138e-05, - "loss": 0.8522, + "epoch": 0.5895211658570437, + "grad_norm": 1.009013056755066, + "learning_rate": 3.863201728111153e-05, + "loss": 0.8033, "step": 3398 }, { - "epoch": 1.2411904327186416, - "grad_norm": 1.1243896484375, - "learning_rate": 1.8380769755663348e-05, - "loss": 0.7963, + "epoch": 0.5896946564885496, + "grad_norm": 1.1151797771453857, + "learning_rate": 3.8629539697622746e-05, + "loss": 0.9177, "step": 3399 }, { - "epoch": 1.241555596129268, - "grad_norm": 1.2763251066207886, - "learning_rate": 1.8366472093391553e-05, - "loss": 0.8506, + "epoch": 0.5898681471200555, + "grad_norm": 1.0590484142303467, + "learning_rate": 3.862705995215417e-05, + "loss": 0.6509, "step": 3400 }, { - "epoch": 1.241920759539894, - "grad_norm": 0.9339412450790405, - "learning_rate": 1.8352175271504314e-05, - "loss": 0.8297, + "epoch": 0.5900416377515614, + "grad_norm": 1.0769354104995728, + "learning_rate": 3.862457804499358e-05, + "loss": 0.7305, "step": 3401 }, { - "epoch": 1.2422859229505203, - "grad_norm": 1.0000334978103638, - "learning_rate": 1.833787929735677e-05, - "loss": 0.8152, + "epoch": 0.5902151283830673, + "grad_norm": 1.0432320833206177, + "learning_rate": 3.862209397642901e-05, + "loss": 0.8206, "step": 3402 }, { - "epoch": 1.2426510863611466, - "grad_norm": 0.8394801020622253, - "learning_rate": 1.8323584178303632e-05, - "loss": 0.8232, + "epoch": 0.5903886190145732, + "grad_norm": 1.3740973472595215, + "learning_rate": 3.861960774674874e-05, + "loss": 0.6465, "step": 3403 }, { - "epoch": 1.243016249771773, - "grad_norm": 1.0811841487884521, - "learning_rate": 1.8309289921699163e-05, - "loss": 0.8363, + "epoch": 0.5905621096460791, + "grad_norm": 1.1858608722686768, + "learning_rate": 3.861711935624129e-05, + "loss": 0.687, "step": 3404 }, { - "epoch": 1.243381413182399, - "grad_norm": 1.2057284116744995, - "learning_rate": 1.8294996534897185e-05, - "loss": 0.827, + "epoch": 0.590735600277585, + "grad_norm": 0.8744890093803406, + "learning_rate": 3.861462880519543e-05, + "loss": 0.7896, "step": 3405 }, { - "epoch": 1.2437465765930253, - "grad_norm": 0.9744519591331482, - "learning_rate": 1.8280704025251076e-05, - "loss": 0.7968, + "epoch": 0.5909090909090909, + "grad_norm": 0.8413711786270142, + "learning_rate": 3.8612136093900224e-05, + "loss": 0.7397, "step": 3406 }, { - "epoch": 1.2441117400036517, - "grad_norm": 1.4442938566207886, - "learning_rate": 1.8266412400113747e-05, - "loss": 0.8459, + "epoch": 0.5910825815405968, + "grad_norm": 0.896005392074585, + "learning_rate": 3.860964122264493e-05, + "loss": 0.8672, "step": 3407 }, { - "epoch": 1.244476903414278, - "grad_norm": 1.1395527124404907, - "learning_rate": 1.825212166683768e-05, - "loss": 0.8447, + "epoch": 0.5912560721721027, + "grad_norm": 0.8524481058120728, + "learning_rate": 3.860714419171909e-05, + "loss": 0.7559, "step": 3408 }, { - "epoch": 1.244842066824904, - "grad_norm": 1.0495845079421997, - "learning_rate": 1.8237831832774877e-05, - "loss": 0.84, + "epoch": 0.5914295628036086, + "grad_norm": 1.201607584953308, + "learning_rate": 3.860464500141249e-05, + "loss": 0.7412, "step": 3409 }, { - "epoch": 1.2452072302355304, - "grad_norm": 1.0997246503829956, - "learning_rate": 1.8223542905276885e-05, - "loss": 0.8336, + "epoch": 0.5916030534351145, + "grad_norm": 0.7329137921333313, + "learning_rate": 3.860214365201515e-05, + "loss": 0.8662, "step": 3410 }, { - "epoch": 1.2455723936461567, - "grad_norm": 0.9937338829040527, - "learning_rate": 1.820925489169478e-05, - "loss": 0.8433, + "epoch": 0.5917765440666204, + "grad_norm": 0.7128985524177551, + "learning_rate": 3.859964014381737e-05, + "loss": 0.8533, "step": 3411 }, { - "epoch": 1.245937557056783, - "grad_norm": 1.1684598922729492, - "learning_rate": 1.819496779937918e-05, - "loss": 0.8623, + "epoch": 0.5919500346981263, + "grad_norm": 0.8044170141220093, + "learning_rate": 3.8597134477109674e-05, + "loss": 0.929, "step": 3412 }, { - "epoch": 1.246302720467409, - "grad_norm": 1.1507258415222168, - "learning_rate": 1.818068163568022e-05, - "loss": 0.8459, + "epoch": 0.5921235253296322, + "grad_norm": 0.9758091568946838, + "learning_rate": 3.859462665218286e-05, + "loss": 0.8423, "step": 3413 }, { - "epoch": 1.2466678838780354, - "grad_norm": 1.025323510169983, - "learning_rate": 1.8166396407947546e-05, - "loss": 0.7988, + "epoch": 0.5922970159611382, + "grad_norm": 1.145073652267456, + "learning_rate": 3.8592116669327945e-05, + "loss": 0.7195, "step": 3414 }, { - "epoch": 1.2470330472886617, - "grad_norm": 1.3004682064056396, - "learning_rate": 1.8152112123530345e-05, - "loss": 0.8479, + "epoch": 0.592470506592644, + "grad_norm": 0.9008615016937256, + "learning_rate": 3.858960452883623e-05, + "loss": 0.8, "step": 3415 }, { - "epoch": 1.247398210699288, - "grad_norm": 1.205893635749817, - "learning_rate": 1.8137828789777302e-05, - "loss": 0.8224, + "epoch": 0.5926439972241498, + "grad_norm": 0.8740783929824829, + "learning_rate": 3.858709023099925e-05, + "loss": 0.745, "step": 3416 }, { - "epoch": 1.2477633741099141, - "grad_norm": 1.1480350494384766, - "learning_rate": 1.8123546414036623e-05, - "loss": 0.8569, + "epoch": 0.5928174878556558, + "grad_norm": 2.2715659141540527, + "learning_rate": 3.8584573776108794e-05, + "loss": 0.7295, "step": 3417 }, { - "epoch": 1.2481285375205404, - "grad_norm": 1.0440844297409058, - "learning_rate": 1.810926500365602e-05, - "loss": 0.809, + "epoch": 0.5929909784871616, + "grad_norm": 0.9212040901184082, + "learning_rate": 3.858205516445689e-05, + "loss": 0.6697, "step": 3418 }, { - "epoch": 1.2484937009311667, - "grad_norm": 1.421542763710022, - "learning_rate": 1.8094984565982697e-05, - "loss": 0.8131, + "epoch": 0.5931644691186676, + "grad_norm": 0.9718971252441406, + "learning_rate": 3.8579534396335835e-05, + "loss": 0.7651, "step": 3419 }, { - "epoch": 1.248858864341793, - "grad_norm": 1.1528058052062988, - "learning_rate": 1.8080705108363376e-05, - "loss": 0.8103, + "epoch": 0.5933379597501734, + "grad_norm": 0.8354684114456177, + "learning_rate": 3.857701147203816e-05, + "loss": 0.7266, "step": 3420 }, { - "epoch": 1.2492240277524191, - "grad_norm": 1.0017021894454956, - "learning_rate": 1.8066426638144253e-05, - "loss": 0.8198, + "epoch": 0.5935114503816794, + "grad_norm": 0.957469642162323, + "learning_rate": 3.8574486391856655e-05, + "loss": 0.6328, "step": 3421 }, { - "epoch": 1.2495891911630455, - "grad_norm": 1.109309196472168, - "learning_rate": 1.8052149162671045e-05, - "loss": 0.834, + "epoch": 0.5936849410131853, + "grad_norm": 1.366143822669983, + "learning_rate": 3.857195915608437e-05, + "loss": 0.7605, "step": 3422 }, { - "epoch": 1.2499543545736718, - "grad_norm": 1.270740270614624, - "learning_rate": 1.8037872689288923e-05, - "loss": 0.832, + "epoch": 0.5938584316446912, + "grad_norm": 0.9694093465805054, + "learning_rate": 3.856942976501458e-05, + "loss": 0.7917, "step": 3423 }, { - "epoch": 1.2503195179842979, - "grad_norm": 1.1250813007354736, - "learning_rate": 1.802359722534257e-05, - "loss": 0.8619, + "epoch": 0.594031922276197, + "grad_norm": 1.4652742147445679, + "learning_rate": 3.8566898218940825e-05, + "loss": 0.7419, "step": 3424 }, { - "epoch": 1.2506846813949242, - "grad_norm": 1.0589739084243774, - "learning_rate": 1.800932277817614e-05, - "loss": 0.8301, + "epoch": 0.594205412907703, + "grad_norm": 0.843357264995575, + "learning_rate": 3.85643645181569e-05, + "loss": 0.6637, "step": 3425 }, { - "epoch": 1.2510498448055505, - "grad_norm": 1.0760276317596436, - "learning_rate": 1.7995049355133254e-05, - "loss": 0.8168, + "epoch": 0.5943789035392089, + "grad_norm": 0.8733358979225159, + "learning_rate": 3.856182866295684e-05, + "loss": 0.7004, "step": 3426 }, { - "epoch": 1.2514150082161768, - "grad_norm": 1.2931861877441406, - "learning_rate": 1.798077696355703e-05, - "loss": 0.8182, + "epoch": 0.5945523941707148, + "grad_norm": 0.741859495639801, + "learning_rate": 3.855929065363493e-05, + "loss": 0.7825, "step": 3427 }, { - "epoch": 1.251780171626803, - "grad_norm": 0.9973229765892029, - "learning_rate": 1.7966505610790022e-05, - "loss": 0.8121, + "epoch": 0.5947258848022207, + "grad_norm": 1.0911678075790405, + "learning_rate": 3.8556750490485724e-05, + "loss": 0.8184, "step": 3428 }, { - "epoch": 1.2521453350374292, - "grad_norm": 0.9700384140014648, - "learning_rate": 1.795223530417428e-05, - "loss": 0.7979, + "epoch": 0.5948993754337266, + "grad_norm": 1.2065812349319458, + "learning_rate": 3.8554208173804e-05, + "loss": 0.7109, "step": 3429 }, { - "epoch": 1.2525104984480555, - "grad_norm": 1.2129998207092285, - "learning_rate": 1.7937966051051306e-05, - "loss": 0.8497, + "epoch": 0.5950728660652325, + "grad_norm": 0.6435400247573853, + "learning_rate": 3.855166370388479e-05, + "loss": 0.8312, "step": 3430 }, { - "epoch": 1.2528756618586818, - "grad_norm": 1.2133640050888062, - "learning_rate": 1.7923697858762054e-05, - "loss": 0.8301, + "epoch": 0.5952463566967384, + "grad_norm": 0.8599523305892944, + "learning_rate": 3.854911708102339e-05, + "loss": 0.8041, "step": 3431 }, { - "epoch": 1.253240825269308, - "grad_norm": 1.3849108219146729, - "learning_rate": 1.7909430734646936e-05, - "loss": 0.8303, + "epoch": 0.5954198473282443, + "grad_norm": 0.940289318561554, + "learning_rate": 3.8546568305515345e-05, + "loss": 0.7224, "step": 3432 }, { - "epoch": 1.2536059886799342, - "grad_norm": 1.076184630393982, - "learning_rate": 1.789516468604581e-05, - "loss": 0.8295, + "epoch": 0.5955933379597502, + "grad_norm": 0.9222247004508972, + "learning_rate": 3.854401737765644e-05, + "loss": 0.7467, "step": 3433 }, { - "epoch": 1.2539711520905605, - "grad_norm": 1.0818583965301514, - "learning_rate": 1.7880899720297998e-05, - "loss": 0.8364, + "epoch": 0.5957668285912561, + "grad_norm": 2.1609842777252197, + "learning_rate": 3.85414642977427e-05, + "loss": 0.8716, "step": 3434 }, { - "epoch": 1.2543363155011868, - "grad_norm": 1.053590178489685, - "learning_rate": 1.7866635844742243e-05, - "loss": 0.8096, + "epoch": 0.5959403192227619, + "grad_norm": 0.9145712852478027, + "learning_rate": 3.853890906607043e-05, + "loss": 0.7134, "step": 3435 }, { - "epoch": 1.2547014789118132, - "grad_norm": 1.1410716772079468, - "learning_rate": 1.785237306671674e-05, - "loss": 0.8288, + "epoch": 0.5961138098542679, + "grad_norm": 0.7529888153076172, + "learning_rate": 3.8536351682936155e-05, + "loss": 0.7988, "step": 3436 }, { - "epoch": 1.2550666423224393, - "grad_norm": 1.231340765953064, - "learning_rate": 1.7838111393559115e-05, - "loss": 0.8672, + "epoch": 0.5962873004857737, + "grad_norm": 0.8802627325057983, + "learning_rate": 3.853379214863667e-05, + "loss": 0.7441, "step": 3437 }, { - "epoch": 1.2554318057330656, - "grad_norm": 1.0887519121170044, - "learning_rate": 1.7823850832606425e-05, - "loss": 0.7864, + "epoch": 0.5964607911172797, + "grad_norm": 0.7511977553367615, + "learning_rate": 3.8531230463469015e-05, + "loss": 0.834, "step": 3438 }, { - "epoch": 1.2557969691436919, - "grad_norm": 0.9378100633621216, - "learning_rate": 1.7809591391195162e-05, - "loss": 0.8389, + "epoch": 0.5966342817487855, + "grad_norm": 0.9242227673530579, + "learning_rate": 3.852866662773047e-05, + "loss": 0.9009, "step": 3439 }, { - "epoch": 1.256162132554318, - "grad_norm": 1.1967710256576538, - "learning_rate": 1.7795333076661238e-05, - "loss": 0.8563, + "epoch": 0.5968077723802915, + "grad_norm": 0.8057268857955933, + "learning_rate": 3.852610064171857e-05, + "loss": 0.9285, "step": 3440 }, { - "epoch": 1.2565272959649443, - "grad_norm": 1.116106390953064, - "learning_rate": 1.7781075896339968e-05, - "loss": 0.7943, + "epoch": 0.5969812630117973, + "grad_norm": 1.0160197019577026, + "learning_rate": 3.85235325057311e-05, + "loss": 0.8276, "step": 3441 }, { - "epoch": 1.2568924593755706, - "grad_norm": 1.0389379262924194, - "learning_rate": 1.7766819857566116e-05, - "loss": 0.7977, + "epoch": 0.5971547536433033, + "grad_norm": 1.3350718021392822, + "learning_rate": 3.85209622200661e-05, + "loss": 0.874, "step": 3442 }, { - "epoch": 1.257257622786197, - "grad_norm": 1.4053122997283936, - "learning_rate": 1.7752564967673832e-05, - "loss": 0.8661, + "epoch": 0.5973282442748091, + "grad_norm": 0.8590342402458191, + "learning_rate": 3.851838978502186e-05, + "loss": 0.797, "step": 3443 }, { - "epoch": 1.2576227861968232, - "grad_norm": 1.2795815467834473, - "learning_rate": 1.7738311233996686e-05, - "loss": 0.8542, + "epoch": 0.5975017349063151, + "grad_norm": 0.8840834498405457, + "learning_rate": 3.8515815200896905e-05, + "loss": 0.7351, "step": 3444 }, { - "epoch": 1.2579879496074493, - "grad_norm": 0.9910547733306885, - "learning_rate": 1.7724058663867656e-05, - "loss": 0.8473, + "epoch": 0.5976752255378209, + "grad_norm": 0.7865497469902039, + "learning_rate": 3.851323846799002e-05, + "loss": 0.7664, "step": 3445 }, { - "epoch": 1.2583531130180756, - "grad_norm": 1.1510053873062134, - "learning_rate": 1.770980726461912e-05, - "loss": 0.8746, + "epoch": 0.5978487161693269, + "grad_norm": 0.8701044917106628, + "learning_rate": 3.851065958660023e-05, + "loss": 0.8005, "step": 3446 }, { - "epoch": 1.2587182764287017, - "grad_norm": 1.249042272567749, - "learning_rate": 1.769555704358284e-05, - "loss": 0.7946, + "epoch": 0.5980222068008327, + "grad_norm": 1.0119539499282837, + "learning_rate": 3.8508078557026835e-05, + "loss": 0.9258, "step": 3447 }, { - "epoch": 1.259083439839328, - "grad_norm": 1.0324974060058594, - "learning_rate": 1.7681308008089993e-05, - "loss": 0.8578, + "epoch": 0.5981956974323387, + "grad_norm": 0.731112003326416, + "learning_rate": 3.8505495379569354e-05, + "loss": 0.7654, "step": 3448 }, { - "epoch": 1.2594486032499543, - "grad_norm": 1.0417890548706055, - "learning_rate": 1.766706016547113e-05, - "loss": 0.8049, + "epoch": 0.5983691880638445, + "grad_norm": 0.9019063711166382, + "learning_rate": 3.850291005452757e-05, + "loss": 0.6763, "step": 3449 }, { - "epoch": 1.2598137666605806, - "grad_norm": 1.0525404214859009, - "learning_rate": 1.7652813523056195e-05, - "loss": 0.8508, + "epoch": 0.5985426786953505, + "grad_norm": 1.2520347833633423, + "learning_rate": 3.850032258220152e-05, + "loss": 0.7092, "step": 3450 }, { - "epoch": 1.260178930071207, - "grad_norm": 1.000821590423584, - "learning_rate": 1.763856808817452e-05, - "loss": 0.83, + "epoch": 0.5987161693268563, + "grad_norm": 0.9555583000183105, + "learning_rate": 3.849773296289147e-05, + "loss": 0.8826, "step": 3451 }, { - "epoch": 1.260544093481833, - "grad_norm": 1.1112663745880127, - "learning_rate": 1.7624323868154804e-05, - "loss": 0.8284, + "epoch": 0.5988896599583623, + "grad_norm": 0.8887721300125122, + "learning_rate": 3.849514119689796e-05, + "loss": 0.7622, "step": 3452 }, { - "epoch": 1.2609092568924594, - "grad_norm": 0.9179114699363708, - "learning_rate": 1.7610080870325135e-05, - "loss": 0.8079, + "epoch": 0.5990631505898681, + "grad_norm": 0.922235906124115, + "learning_rate": 3.849254728452176e-05, + "loss": 0.7468, "step": 3453 }, { - "epoch": 1.2612744203030857, - "grad_norm": 0.9821780920028687, - "learning_rate": 1.7595839102012954e-05, - "loss": 0.821, + "epoch": 0.5992366412213741, + "grad_norm": 0.9837933778762817, + "learning_rate": 3.84899512260639e-05, + "loss": 0.7771, "step": 3454 }, { - "epoch": 1.2616395837137118, - "grad_norm": 1.0964279174804688, - "learning_rate": 1.7581598570545075e-05, - "loss": 0.798, + "epoch": 0.59941013185288, + "grad_norm": 0.8274145722389221, + "learning_rate": 3.848735302182566e-05, + "loss": 0.7429, "step": 3455 }, { - "epoch": 1.262004747124338, - "grad_norm": 1.1051195859909058, - "learning_rate": 1.756735928324769e-05, - "loss": 0.8402, + "epoch": 0.5995836224843858, + "grad_norm": 0.8602612614631653, + "learning_rate": 3.848475267210856e-05, + "loss": 0.7629, "step": 3456 }, { - "epoch": 1.2623699105349644, - "grad_norm": 0.8635133504867554, - "learning_rate": 1.7553121247446337e-05, - "loss": 0.8507, + "epoch": 0.5997571131158917, + "grad_norm": 0.9239373207092285, + "learning_rate": 3.848215017721437e-05, + "loss": 0.6617, "step": 3457 }, { - "epoch": 1.2627350739455907, - "grad_norm": 1.1635404825210571, - "learning_rate": 1.753888447046592e-05, - "loss": 0.8344, + "epoch": 0.5999306037473976, + "grad_norm": 0.9328330159187317, + "learning_rate": 3.8479545537445115e-05, + "loss": 0.7947, "step": 3458 }, { - "epoch": 1.263100237356217, - "grad_norm": 1.1399527788162231, - "learning_rate": 1.7524648959630676e-05, - "loss": 0.8134, + "epoch": 0.6001040943789036, + "grad_norm": 1.0622949600219727, + "learning_rate": 3.8476938753103066e-05, + "loss": 0.6072, "step": 3459 }, { - "epoch": 1.263465400766843, - "grad_norm": 1.1310791969299316, - "learning_rate": 1.7510414722264217e-05, - "loss": 0.84, + "epoch": 0.6002775850104094, + "grad_norm": 0.8597349524497986, + "learning_rate": 3.847432982449075e-05, + "loss": 0.8198, "step": 3460 }, { - "epoch": 1.2638305641774694, - "grad_norm": 0.7761222124099731, - "learning_rate": 1.7496181765689485e-05, - "loss": 0.8124, + "epoch": 0.6004510756419154, + "grad_norm": 0.7216087579727173, + "learning_rate": 3.8471718751910926e-05, + "loss": 0.8298, "step": 3461 }, { - "epoch": 1.2641957275880957, - "grad_norm": 0.6614735126495361, - "learning_rate": 1.7481950097228757e-05, - "loss": 0.8575, + "epoch": 0.6006245662734212, + "grad_norm": 1.0891770124435425, + "learning_rate": 3.846910553566662e-05, + "loss": 0.7661, "step": 3462 }, { - "epoch": 1.2645608909987218, - "grad_norm": 0.8485050201416016, - "learning_rate": 1.7467719724203667e-05, - "loss": 0.8253, + "epoch": 0.6007980569049272, + "grad_norm": 0.6546376943588257, + "learning_rate": 3.846649017606109e-05, + "loss": 0.8813, "step": 3463 }, { - "epoch": 1.2649260544093481, - "grad_norm": 1.4292575120925903, - "learning_rate": 1.7453490653935162e-05, - "loss": 0.8343, + "epoch": 0.600971547536433, + "grad_norm": 0.9823792576789856, + "learning_rate": 3.846387267339787e-05, + "loss": 0.7773, "step": 3464 }, { - "epoch": 1.2652912178199744, - "grad_norm": 1.129494547843933, - "learning_rate": 1.743926289374353e-05, - "loss": 0.861, + "epoch": 0.601145038167939, + "grad_norm": 0.7921059727668762, + "learning_rate": 3.84612530279807e-05, + "loss": 0.8335, "step": 3465 }, { - "epoch": 1.2656563812306008, - "grad_norm": 1.1691826581954956, - "learning_rate": 1.7425036450948383e-05, - "loss": 0.8468, + "epoch": 0.6013185287994448, + "grad_norm": 0.962945282459259, + "learning_rate": 3.845863124011361e-05, + "loss": 0.8425, "step": 3466 }, { - "epoch": 1.266021544641227, - "grad_norm": 1.1429643630981445, - "learning_rate": 1.7410811332868664e-05, - "loss": 0.8218, + "epoch": 0.6014920194309508, + "grad_norm": 0.8620479106903076, + "learning_rate": 3.845600731010085e-05, + "loss": 0.7155, "step": 3467 }, { - "epoch": 1.2663867080518532, - "grad_norm": 0.9845857620239258, - "learning_rate": 1.739658754682261e-05, - "loss": 0.827, + "epoch": 0.6016655100624566, + "grad_norm": 0.7894574999809265, + "learning_rate": 3.845338123824694e-05, + "loss": 0.8013, "step": 3468 }, { - "epoch": 1.2667518714624795, - "grad_norm": 1.2960211038589478, - "learning_rate": 1.738236510012779e-05, - "loss": 0.8115, + "epoch": 0.6018390006939626, + "grad_norm": 1.7136043310165405, + "learning_rate": 3.845075302485664e-05, + "loss": 0.7594, "step": 3469 }, { - "epoch": 1.2671170348731058, - "grad_norm": 1.0385823249816895, - "learning_rate": 1.7368144000101093e-05, - "loss": 0.8597, + "epoch": 0.6020124913254684, + "grad_norm": 0.7016565203666687, + "learning_rate": 3.844812267023495e-05, + "loss": 0.8135, "step": 3470 }, { - "epoch": 1.2674821982837319, - "grad_norm": 1.117127537727356, - "learning_rate": 1.7353924254058695e-05, - "loss": 0.8362, + "epoch": 0.6021859819569744, + "grad_norm": 1.1872366666793823, + "learning_rate": 3.844549017468712e-05, + "loss": 0.7244, "step": 3471 }, { - "epoch": 1.2678473616943582, - "grad_norm": 1.1923749446868896, - "learning_rate": 1.7339705869316083e-05, - "loss": 0.8304, + "epoch": 0.6023594725884802, + "grad_norm": 1.7377769947052002, + "learning_rate": 3.8442855538518667e-05, + "loss": 0.6846, "step": 3472 }, { - "epoch": 1.2682125251049845, - "grad_norm": 0.978095293045044, - "learning_rate": 1.732548885318806e-05, - "loss": 0.8185, + "epoch": 0.6025329632199862, + "grad_norm": 1.0813875198364258, + "learning_rate": 3.844021876203534e-05, + "loss": 0.6711, "step": 3473 }, { - "epoch": 1.2685776885156108, - "grad_norm": 1.043229341506958, - "learning_rate": 1.7311273212988692e-05, - "loss": 0.8749, + "epoch": 0.602706453851492, + "grad_norm": 0.9143376350402832, + "learning_rate": 3.8437579845543133e-05, + "loss": 0.832, "step": 3474 }, { - "epoch": 1.2689428519262371, - "grad_norm": 1.012888789176941, - "learning_rate": 1.729705895603137e-05, - "loss": 0.8265, + "epoch": 0.6028799444829979, + "grad_norm": 0.9712104797363281, + "learning_rate": 3.843493878934831e-05, + "loss": 0.7656, "step": 3475 }, { - "epoch": 1.2693080153368632, - "grad_norm": 1.2508578300476074, - "learning_rate": 1.728284608962875e-05, - "loss": 0.8652, + "epoch": 0.6030534351145038, + "grad_norm": 1.2051842212677002, + "learning_rate": 3.843229559375735e-05, + "loss": 0.7749, "step": 3476 }, { - "epoch": 1.2696731787474895, - "grad_norm": 1.2667968273162842, - "learning_rate": 1.7268634621092786e-05, - "loss": 0.8315, + "epoch": 0.6032269257460097, + "grad_norm": 0.9672859311103821, + "learning_rate": 3.842965025907701e-05, + "loss": 0.7283, "step": 3477 }, { - "epoch": 1.2700383421581156, - "grad_norm": 1.1106866598129272, - "learning_rate": 1.7254424557734703e-05, - "loss": 0.8661, + "epoch": 0.6034004163775156, + "grad_norm": 1.122472882270813, + "learning_rate": 3.842700278561429e-05, + "loss": 0.637, "step": 3478 }, { - "epoch": 1.270403505568742, - "grad_norm": 1.0266146659851074, - "learning_rate": 1.7240215906865016e-05, - "loss": 0.8408, + "epoch": 0.6035739070090215, + "grad_norm": 0.8705981969833374, + "learning_rate": 3.842435317367642e-05, + "loss": 0.764, "step": 3479 }, { - "epoch": 1.2707686689793682, - "grad_norm": 1.1031829118728638, - "learning_rate": 1.722600867579351e-05, - "loss": 0.8327, + "epoch": 0.6037473976405274, + "grad_norm": 1.1004600524902344, + "learning_rate": 3.8421701423570895e-05, + "loss": 0.7405, "step": 3480 }, { - "epoch": 1.2711338323899946, - "grad_norm": 1.0254428386688232, - "learning_rate": 1.7211802871829216e-05, - "loss": 0.8251, + "epoch": 0.6039208882720333, + "grad_norm": 0.7455385327339172, + "learning_rate": 3.8419047535605456e-05, + "loss": 0.9089, "step": 3481 }, { - "epoch": 1.2714989958006209, - "grad_norm": 1.3752076625823975, - "learning_rate": 1.719759850228046e-05, - "loss": 0.8049, + "epoch": 0.6040943789035392, + "grad_norm": 2.2865424156188965, + "learning_rate": 3.841639151008809e-05, + "loss": 0.6543, "step": 3482 }, { - "epoch": 1.271864159211247, - "grad_norm": 0.5929539799690247, - "learning_rate": 1.7183395574454823e-05, - "loss": 0.8247, + "epoch": 0.6042678695350451, + "grad_norm": 1.0027199983596802, + "learning_rate": 3.8413733347327024e-05, + "loss": 0.6638, "step": 3483 }, { - "epoch": 1.2722293226218733, - "grad_norm": 1.380918264389038, - "learning_rate": 1.716919409565914e-05, - "loss": 0.8197, + "epoch": 0.604441360166551, + "grad_norm": 0.8492140769958496, + "learning_rate": 3.8411073047630745e-05, + "loss": 0.6984, "step": 3484 }, { - "epoch": 1.2725944860324996, - "grad_norm": 1.1477532386779785, - "learning_rate": 1.71549940731995e-05, - "loss": 0.8075, + "epoch": 0.6046148507980569, + "grad_norm": 1.3302181959152222, + "learning_rate": 3.8408410611308e-05, + "loss": 0.6667, "step": 3485 }, { - "epoch": 1.2729596494431257, - "grad_norm": 1.1100423336029053, - "learning_rate": 1.714079551438125e-05, - "loss": 0.8337, + "epoch": 0.6047883414295628, + "grad_norm": 0.8644422292709351, + "learning_rate": 3.840574603866774e-05, + "loss": 0.6794, "step": 3486 }, { - "epoch": 1.273324812853752, - "grad_norm": 1.1188337802886963, - "learning_rate": 1.7126598426508974e-05, - "loss": 0.8302, + "epoch": 0.6049618320610687, + "grad_norm": 1.1322799921035767, + "learning_rate": 3.840307933001921e-05, + "loss": 0.7493, "step": 3487 }, { - "epoch": 1.2736899762643783, - "grad_norm": 1.0144188404083252, - "learning_rate": 1.7112402816886504e-05, - "loss": 0.8339, + "epoch": 0.6051353226925746, + "grad_norm": 1.421201229095459, + "learning_rate": 3.840041048567188e-05, + "loss": 0.7102, "step": 3488 }, { - "epoch": 1.2740551396750046, - "grad_norm": 1.55870521068573, - "learning_rate": 1.7098208692816915e-05, - "loss": 0.7952, + "epoch": 0.6053088133240805, + "grad_norm": 1.4640833139419556, + "learning_rate": 3.839773950593547e-05, + "loss": 0.7112, "step": 3489 }, { - "epoch": 1.274420303085631, - "grad_norm": 0.9733501672744751, - "learning_rate": 1.708401606160251e-05, - "loss": 0.8517, + "epoch": 0.6054823039555864, + "grad_norm": 1.3643790483474731, + "learning_rate": 3.839506639111996e-05, + "loss": 0.8389, "step": 3490 }, { - "epoch": 1.274785466496257, - "grad_norm": 1.0788167715072632, - "learning_rate": 1.706982493054483e-05, - "loss": 0.8773, + "epoch": 0.6056557945870923, + "grad_norm": 0.9055225253105164, + "learning_rate": 3.839239114153555e-05, + "loss": 0.6666, "step": 3491 }, { - "epoch": 1.2751506299068833, - "grad_norm": 1.6937346458435059, - "learning_rate": 1.705563530694464e-05, - "loss": 0.8744, + "epoch": 0.6058292852185982, + "grad_norm": 0.9785338640213013, + "learning_rate": 3.838971375749272e-05, + "loss": 0.6387, "step": 3492 }, { - "epoch": 1.2755157933175096, - "grad_norm": 1.0220861434936523, - "learning_rate": 1.7041447198101946e-05, - "loss": 0.8322, + "epoch": 0.6060027758501041, + "grad_norm": 0.8709136247634888, + "learning_rate": 3.838703423930218e-05, + "loss": 0.7764, "step": 3493 }, { - "epoch": 1.2758809567281357, - "grad_norm": 1.0699372291564941, - "learning_rate": 1.7027260611315936e-05, - "loss": 0.8599, + "epoch": 0.6061762664816099, + "grad_norm": 0.6854754686355591, + "learning_rate": 3.83843525872749e-05, + "loss": 0.8799, "step": 3494 }, { - "epoch": 1.276246120138762, - "grad_norm": 0.9636276960372925, - "learning_rate": 1.7013075553885063e-05, - "loss": 0.8135, + "epoch": 0.6063497571131159, + "grad_norm": 0.8455032110214233, + "learning_rate": 3.838166880172207e-05, + "loss": 0.7281, "step": 3495 }, { - "epoch": 1.2766112835493884, - "grad_norm": 1.1396762132644653, - "learning_rate": 1.699889203310695e-05, - "loss": 0.7737, + "epoch": 0.6065232477446217, + "grad_norm": 1.5191826820373535, + "learning_rate": 3.837898288295516e-05, + "loss": 0.7275, "step": 3496 }, { - "epoch": 1.2769764469600147, - "grad_norm": 1.4217638969421387, - "learning_rate": 1.6984710056278462e-05, - "loss": 0.8075, + "epoch": 0.6066967383761277, + "grad_norm": 0.8960263729095459, + "learning_rate": 3.837629483128587e-05, + "loss": 0.8335, "step": 3497 }, { - "epoch": 1.277341610370641, - "grad_norm": 0.9965364933013916, - "learning_rate": 1.6970529630695656e-05, - "loss": 0.847, + "epoch": 0.6068702290076335, + "grad_norm": 0.9295046925544739, + "learning_rate": 3.837360464702616e-05, + "loss": 0.7493, "step": 3498 }, { - "epoch": 1.277706773781267, - "grad_norm": 0.7475082278251648, - "learning_rate": 1.695635076365379e-05, - "loss": 0.8408, + "epoch": 0.6070437196391395, + "grad_norm": 0.982965886592865, + "learning_rate": 3.837091233048821e-05, + "loss": 0.7246, "step": 3499 }, { - "epoch": 1.2780719371918934, - "grad_norm": 1.2116901874542236, - "learning_rate": 1.694217346244732e-05, - "loss": 0.8582, + "epoch": 0.6072172102706453, + "grad_norm": 1.3403279781341553, + "learning_rate": 3.8368217881984484e-05, + "loss": 0.6755, "step": 3500 }, { - "epoch": 1.2784371006025197, - "grad_norm": 1.2332903146743774, - "learning_rate": 1.6927997734369904e-05, - "loss": 0.89, + "epoch": 0.6073907009021513, + "grad_norm": 0.8452673554420471, + "learning_rate": 3.836552130182766e-05, + "loss": 0.7472, "step": 3501 }, { - "epoch": 1.2788022640131458, - "grad_norm": 1.0116569995880127, - "learning_rate": 1.691382358671438e-05, - "loss": 0.8585, + "epoch": 0.6075641915336571, + "grad_norm": 0.845382034778595, + "learning_rate": 3.83628225903307e-05, + "loss": 0.641, "step": 3502 }, { - "epoch": 1.279167427423772, - "grad_norm": 1.1604492664337158, - "learning_rate": 1.6899651026772776e-05, - "loss": 0.8094, + "epoch": 0.6077376821651631, + "grad_norm": 0.8018450140953064, + "learning_rate": 3.836012174780678e-05, + "loss": 0.761, "step": 3503 }, { - "epoch": 1.2795325908343984, - "grad_norm": 0.7727996706962585, - "learning_rate": 1.6885480061836314e-05, - "loss": 0.851, + "epoch": 0.607911172796669, + "grad_norm": 0.8521077036857605, + "learning_rate": 3.8357418774569335e-05, + "loss": 0.8572, "step": 3504 }, { - "epoch": 1.2798977542450247, - "grad_norm": 1.0348765850067139, - "learning_rate": 1.687131069919538e-05, - "loss": 0.7766, + "epoch": 0.6080846634281749, + "grad_norm": 1.4106907844543457, + "learning_rate": 3.835471367093205e-05, + "loss": 0.6873, "step": 3505 }, { - "epoch": 1.280262917655651, - "grad_norm": 1.0160865783691406, - "learning_rate": 1.685714294613955e-05, - "loss": 0.8158, + "epoch": 0.6082581540596808, + "grad_norm": 1.094164252281189, + "learning_rate": 3.835200643720886e-05, + "loss": 0.6849, "step": 3506 }, { - "epoch": 1.2806280810662771, - "grad_norm": 1.2991604804992676, - "learning_rate": 1.6842976809957562e-05, - "loss": 0.8069, + "epoch": 0.6084316446911867, + "grad_norm": 1.3368526697158813, + "learning_rate": 3.834929707371394e-05, + "loss": 0.7646, "step": 3507 }, { - "epoch": 1.2809932444769034, - "grad_norm": 0.921115517616272, - "learning_rate": 1.6828812297937314e-05, - "loss": 0.8594, + "epoch": 0.6086051353226926, + "grad_norm": 0.8373224139213562, + "learning_rate": 3.8346585580761705e-05, + "loss": 0.8137, "step": 3508 }, { - "epoch": 1.2813584078875297, - "grad_norm": 1.0949211120605469, - "learning_rate": 1.681464941736589e-05, - "loss": 0.8292, + "epoch": 0.6087786259541985, + "grad_norm": 0.7067712545394897, + "learning_rate": 3.834387195866684e-05, + "loss": 0.8398, "step": 3509 }, { - "epoch": 1.2817235712981558, - "grad_norm": 1.2378089427947998, - "learning_rate": 1.6800488175529516e-05, - "loss": 0.8207, + "epoch": 0.6089521165857044, + "grad_norm": 0.8465063571929932, + "learning_rate": 3.8341156207744254e-05, + "loss": 0.7507, "step": 3510 }, { - "epoch": 1.2820887347087822, - "grad_norm": 0.8532893061637878, - "learning_rate": 1.6786328579713593e-05, - "loss": 0.8265, + "epoch": 0.6091256072172103, + "grad_norm": 1.350939393043518, + "learning_rate": 3.8338438328309126e-05, + "loss": 0.7097, "step": 3511 }, { - "epoch": 1.2824538981194085, - "grad_norm": 0.9264019131660461, - "learning_rate": 1.6772170637202655e-05, - "loss": 0.7928, + "epoch": 0.6092990978487162, + "grad_norm": 1.03564453125, + "learning_rate": 3.833571832067685e-05, + "loss": 0.7356, "step": 3512 }, { - "epoch": 1.2828190615300348, - "grad_norm": 1.0482763051986694, - "learning_rate": 1.67580143552804e-05, - "loss": 0.8337, + "epoch": 0.6094725884802221, + "grad_norm": 0.787018895149231, + "learning_rate": 3.833299618516311e-05, + "loss": 0.7441, "step": 3513 }, { - "epoch": 1.2831842249406609, - "grad_norm": 1.0018996000289917, - "learning_rate": 1.6743859741229667e-05, - "loss": 0.8398, + "epoch": 0.609646079111728, + "grad_norm": 1.3566306829452515, + "learning_rate": 3.8330271922083795e-05, + "loss": 0.7584, "step": 3514 }, { - "epoch": 1.2835493883512872, - "grad_norm": 0.9044629335403442, - "learning_rate": 1.6729706802332433e-05, - "loss": 0.8459, + "epoch": 0.6098195697432338, + "grad_norm": 0.8293810486793518, + "learning_rate": 3.832754553175507e-05, + "loss": 0.6802, "step": 3515 }, { - "epoch": 1.2839145517619135, - "grad_norm": 1.382750153541565, - "learning_rate": 1.6715555545869827e-05, - "loss": 0.8611, + "epoch": 0.6099930603747398, + "grad_norm": 0.8345538377761841, + "learning_rate": 3.8324817014493326e-05, + "loss": 0.7258, "step": 3516 }, { - "epoch": 1.2842797151725396, - "grad_norm": 1.15854811668396, - "learning_rate": 1.670140597912209e-05, - "loss": 0.8156, + "epoch": 0.6101665510062456, + "grad_norm": 1.5288026332855225, + "learning_rate": 3.832208637061522e-05, + "loss": 0.6785, "step": 3517 }, { - "epoch": 1.284644878583166, - "grad_norm": 1.1667481660842896, - "learning_rate": 1.6687258109368617e-05, - "loss": 0.8213, + "epoch": 0.6103400416377516, + "grad_norm": 1.256028175354004, + "learning_rate": 3.831935360043763e-05, + "loss": 0.7876, "step": 3518 }, { - "epoch": 1.2850100419937922, - "grad_norm": 0.7909616827964783, - "learning_rate": 1.667311194388791e-05, - "loss": 0.8885, + "epoch": 0.6105135322692574, + "grad_norm": 0.7324166893959045, + "learning_rate": 3.8316618704277715e-05, + "loss": 0.8318, "step": 3519 }, { - "epoch": 1.2853752054044185, - "grad_norm": 0.9848148226737976, - "learning_rate": 1.665896748995762e-05, - "loss": 0.8478, + "epoch": 0.6106870229007634, + "grad_norm": 0.7806276679039001, + "learning_rate": 3.8313881682452854e-05, + "loss": 0.748, "step": 3520 }, { - "epoch": 1.2857403688150448, - "grad_norm": 1.1149239540100098, - "learning_rate": 1.6644824754854484e-05, - "loss": 0.7889, + "epoch": 0.6108605135322692, + "grad_norm": 0.9679577350616455, + "learning_rate": 3.8311142535280684e-05, + "loss": 0.7126, "step": 3521 }, { - "epoch": 1.286105532225671, - "grad_norm": 1.2905640602111816, - "learning_rate": 1.663068374585437e-05, - "loss": 0.8325, + "epoch": 0.6110340041637752, + "grad_norm": 0.7812911868095398, + "learning_rate": 3.830840126307909e-05, + "loss": 0.7476, "step": 3522 }, { - "epoch": 1.2864706956362972, - "grad_norm": 0.7198203802108765, - "learning_rate": 1.661654447023227e-05, - "loss": 0.8539, + "epoch": 0.611207494795281, + "grad_norm": 0.8129391670227051, + "learning_rate": 3.830565786616619e-05, + "loss": 0.7617, "step": 3523 }, { - "epoch": 1.2868358590469235, - "grad_norm": 1.6150325536727905, - "learning_rate": 1.6602406935262273e-05, - "loss": 0.8616, + "epoch": 0.611380985426787, + "grad_norm": 1.4109854698181152, + "learning_rate": 3.830291234486037e-05, + "loss": 0.7306, "step": 3524 }, { - "epoch": 1.2872010224575496, - "grad_norm": 0.9030619263648987, - "learning_rate": 1.658827114821756e-05, - "loss": 0.8149, + "epoch": 0.6115544760582928, + "grad_norm": 1.4091542959213257, + "learning_rate": 3.8300164699480246e-05, + "loss": 0.6934, "step": 3525 }, { - "epoch": 1.287566185868176, - "grad_norm": 1.1158199310302734, - "learning_rate": 1.657413711637045e-05, - "loss": 0.8123, + "epoch": 0.6117279666897988, + "grad_norm": 0.8591094613075256, + "learning_rate": 3.8297414930344684e-05, + "loss": 0.8127, "step": 3526 }, { - "epoch": 1.2879313492788023, - "grad_norm": 1.4698060750961304, - "learning_rate": 1.656000484699232e-05, - "loss": 0.8264, + "epoch": 0.6119014573213046, + "grad_norm": 0.9200711846351624, + "learning_rate": 3.8294663037772794e-05, + "loss": 0.8508, "step": 3527 }, { - "epoch": 1.2882965126894286, - "grad_norm": 1.155719518661499, - "learning_rate": 1.6545874347353655e-05, - "loss": 0.8038, + "epoch": 0.6120749479528106, + "grad_norm": 1.189212679862976, + "learning_rate": 3.829190902208394e-05, + "loss": 0.6924, "step": 3528 }, { - "epoch": 1.2886616761000549, - "grad_norm": 1.1920326948165894, - "learning_rate": 1.653174562472403e-05, - "loss": 0.8421, + "epoch": 0.6122484385843164, + "grad_norm": 0.8852182030677795, + "learning_rate": 3.828915288359774e-05, + "loss": 0.6466, "step": 3529 }, { - "epoch": 1.289026839510681, - "grad_norm": 1.070228099822998, - "learning_rate": 1.6517618686372114e-05, - "loss": 0.834, + "epoch": 0.6124219292158224, + "grad_norm": 0.9260478019714355, + "learning_rate": 3.828639462263403e-05, + "loss": 0.6719, "step": 3530 }, { - "epoch": 1.2893920029213073, - "grad_norm": 0.8854255080223083, - "learning_rate": 1.6503493539565642e-05, - "loss": 0.8159, + "epoch": 0.6125954198473282, + "grad_norm": 1.0712709426879883, + "learning_rate": 3.828363423951291e-05, + "loss": 0.8523, "step": 3531 }, { - "epoch": 1.2897571663319336, - "grad_norm": 1.3860585689544678, - "learning_rate": 1.648937019157144e-05, - "loss": 0.8035, + "epoch": 0.6127689104788342, + "grad_norm": 0.9192002415657043, + "learning_rate": 3.8280871734554746e-05, + "loss": 0.7224, "step": 3532 }, { - "epoch": 1.2901223297425597, - "grad_norm": 1.404840350151062, - "learning_rate": 1.6475248649655398e-05, - "loss": 0.791, + "epoch": 0.61294240111034, + "grad_norm": 0.8380149602890015, + "learning_rate": 3.8278107108080104e-05, + "loss": 0.7871, "step": 3533 }, { - "epoch": 1.290487493153186, - "grad_norm": 0.8702025413513184, - "learning_rate": 1.6461128921082496e-05, - "loss": 0.7812, + "epoch": 0.6131158917418459, + "grad_norm": 0.8971619009971619, + "learning_rate": 3.827534036040984e-05, + "loss": 0.928, "step": 3534 }, { - "epoch": 1.2908526565638123, - "grad_norm": 0.9247628450393677, - "learning_rate": 1.6447011013116753e-05, + "epoch": 0.6132893823733518, + "grad_norm": 1.0236653089523315, + "learning_rate": 3.827257149186502e-05, "loss": 0.8301, "step": 3535 }, { - "epoch": 1.2912178199744386, - "grad_norm": 1.1045771837234497, - "learning_rate": 1.6432894933021266e-05, - "loss": 0.8619, + "epoch": 0.6134628730048577, + "grad_norm": 1.0391610860824585, + "learning_rate": 3.8269800502767e-05, + "loss": 0.6521, "step": 3536 }, { - "epoch": 1.291582983385065, - "grad_norm": 1.1704728603363037, - "learning_rate": 1.6418780688058197e-05, - "loss": 0.8768, + "epoch": 0.6136363636363636, + "grad_norm": 0.8443355560302734, + "learning_rate": 3.826702739343734e-05, + "loss": 0.7415, "step": 3537 }, { - "epoch": 1.291948146795691, - "grad_norm": 0.9538334608078003, - "learning_rate": 1.6404668285488763e-05, - "loss": 0.8196, + "epoch": 0.6138098542678695, + "grad_norm": 0.893353283405304, + "learning_rate": 3.8264252164197866e-05, + "loss": 0.7625, "step": 3538 }, { - "epoch": 1.2923133102063173, - "grad_norm": 0.8366324305534363, - "learning_rate": 1.6390557732573217e-05, - "loss": 0.7961, + "epoch": 0.6139833448993754, + "grad_norm": 1.1700191497802734, + "learning_rate": 3.826147481537065e-05, + "loss": 0.6945, "step": 3539 }, { - "epoch": 1.2926784736169437, - "grad_norm": 1.1036263704299927, - "learning_rate": 1.637644903657089e-05, - "loss": 0.8182, + "epoch": 0.6141568355308813, + "grad_norm": 0.8513479828834534, + "learning_rate": 3.825869534727799e-05, + "loss": 0.6831, "step": 3540 }, { - "epoch": 1.2930436370275697, - "grad_norm": 1.1032747030258179, - "learning_rate": 1.6362342204740124e-05, - "loss": 0.8399, + "epoch": 0.6143303261623873, + "grad_norm": 1.142037272453308, + "learning_rate": 3.825591376024247e-05, + "loss": 0.6938, "step": 3541 }, { - "epoch": 1.293408800438196, - "grad_norm": 1.3894418478012085, - "learning_rate": 1.6348237244338324e-05, - "loss": 0.8286, + "epoch": 0.6145038167938931, + "grad_norm": 0.7934979796409607, + "learning_rate": 3.8253130054586886e-05, + "loss": 0.726, "step": 3542 }, { - "epoch": 1.2937739638488224, - "grad_norm": 0.8577991724014282, - "learning_rate": 1.6334134162621923e-05, - "loss": 0.8186, + "epoch": 0.614677307425399, + "grad_norm": 0.9316867589950562, + "learning_rate": 3.825034423063429e-05, + "loss": 0.7222, "step": 3543 }, { - "epoch": 1.2941391272594487, - "grad_norm": 1.1209667921066284, - "learning_rate": 1.63200329668464e-05, - "loss": 0.8326, + "epoch": 0.6148507980569049, + "grad_norm": 0.9361812472343445, + "learning_rate": 3.824755628870797e-05, + "loss": 0.7952, "step": 3544 }, { - "epoch": 1.2945042906700748, - "grad_norm": 1.4829539060592651, - "learning_rate": 1.6305933664266242e-05, - "loss": 0.8397, + "epoch": 0.6150242886884109, + "grad_norm": 1.1102279424667358, + "learning_rate": 3.824476622913149e-05, + "loss": 0.7546, "step": 3545 }, { - "epoch": 1.294869454080701, - "grad_norm": 0.9488922953605652, - "learning_rate": 1.629183626213498e-05, - "loss": 0.8518, + "epoch": 0.6151977793199167, + "grad_norm": 0.7748406529426575, + "learning_rate": 3.824197405222863e-05, + "loss": 0.9302, "step": 3546 }, { - "epoch": 1.2952346174913274, - "grad_norm": 1.1429461240768433, - "learning_rate": 1.627774076770517e-05, - "loss": 0.8389, + "epoch": 0.6153712699514227, + "grad_norm": 0.8773168325424194, + "learning_rate": 3.8239179758323424e-05, + "loss": 0.8413, "step": 3547 }, { - "epoch": 1.2955997809019535, - "grad_norm": 1.2023231983184814, - "learning_rate": 1.6263647188228366e-05, - "loss": 0.8405, + "epoch": 0.6155447605829285, + "grad_norm": 1.074357271194458, + "learning_rate": 3.8236383347740146e-05, + "loss": 0.8667, "step": 3548 }, { - "epoch": 1.2959649443125798, - "grad_norm": 1.0864830017089844, - "learning_rate": 1.624955553095515e-05, - "loss": 0.8271, + "epoch": 0.6157182512144345, + "grad_norm": 0.9391390681266785, + "learning_rate": 3.823358482080334e-05, + "loss": 0.8582, "step": 3549 }, { - "epoch": 1.2963301077232061, - "grad_norm": 0.9832045435905457, - "learning_rate": 1.623546580313512e-05, - "loss": 0.8284, + "epoch": 0.6158917418459403, + "grad_norm": 1.2318036556243896, + "learning_rate": 3.823078417783777e-05, + "loss": 0.8032, "step": 3550 }, { - "epoch": 1.2966952711338324, - "grad_norm": 1.0863577127456665, - "learning_rate": 1.622137801201687e-05, - "loss": 0.8589, + "epoch": 0.6160652324774463, + "grad_norm": 0.9933357238769531, + "learning_rate": 3.8227981419168445e-05, + "loss": 0.7001, "step": 3551 }, { - "epoch": 1.2970604345444587, - "grad_norm": 1.2331651449203491, - "learning_rate": 1.620729216484801e-05, - "loss": 0.8412, + "epoch": 0.6162387231089521, + "grad_norm": 1.054290771484375, + "learning_rate": 3.8225176545120646e-05, + "loss": 0.7217, "step": 3552 }, { - "epoch": 1.2974255979550848, - "grad_norm": 1.7309322357177734, - "learning_rate": 1.6193208268875133e-05, - "loss": 0.828, + "epoch": 0.6164122137404581, + "grad_norm": 1.1670640707015991, + "learning_rate": 3.822236955601987e-05, + "loss": 0.8293, "step": 3553 }, { - "epoch": 1.2977907613657111, - "grad_norm": 0.8256141543388367, - "learning_rate": 1.617912633134385e-05, - "loss": 0.8457, + "epoch": 0.6165857043719639, + "grad_norm": 1.4348877668380737, + "learning_rate": 3.821956045219186e-05, + "loss": 0.7661, "step": 3554 }, { - "epoch": 1.2981559247763375, - "grad_norm": 1.1117355823516846, - "learning_rate": 1.6165046359498748e-05, - "loss": 0.848, + "epoch": 0.6167591950034698, + "grad_norm": 0.7861756682395935, + "learning_rate": 3.821674923396265e-05, + "loss": 0.7249, "step": 3555 }, { - "epoch": 1.2985210881869635, - "grad_norm": 0.8906260132789612, - "learning_rate": 1.6150968360583404e-05, - "loss": 0.8903, + "epoch": 0.6169326856349757, + "grad_norm": 0.9936113953590393, + "learning_rate": 3.821393590165845e-05, + "loss": 0.7268, "step": 3556 }, { - "epoch": 1.2988862515975899, - "grad_norm": 1.1782234907150269, - "learning_rate": 1.6136892341840386e-05, - "loss": 0.7902, + "epoch": 0.6171061762664816, + "grad_norm": 0.7439637184143066, + "learning_rate": 3.8211120455605774e-05, + "loss": 0.8308, "step": 3557 }, { - "epoch": 1.2992514150082162, - "grad_norm": 0.9057909846305847, - "learning_rate": 1.612281831051124e-05, - "loss": 0.8097, + "epoch": 0.6172796668979875, + "grad_norm": 0.6409836411476135, + "learning_rate": 3.8208302896131344e-05, + "loss": 0.8652, "step": 3558 }, { - "epoch": 1.2996165784188425, - "grad_norm": 1.183213472366333, - "learning_rate": 1.6108746273836495e-05, - "loss": 0.7935, + "epoch": 0.6174531575294934, + "grad_norm": 0.7968433499336243, + "learning_rate": 3.820548322356215e-05, + "loss": 0.7546, "step": 3559 }, { - "epoch": 1.2999817418294688, - "grad_norm": 1.256788969039917, - "learning_rate": 1.6094676239055654e-05, - "loss": 0.8171, + "epoch": 0.6176266481609993, + "grad_norm": 0.9983441829681396, + "learning_rate": 3.820266143822541e-05, + "loss": 0.6396, "step": 3560 }, { - "epoch": 1.3003469052400949, - "grad_norm": 0.9434470534324646, - "learning_rate": 1.6080608213407164e-05, - "loss": 0.8387, + "epoch": 0.6178001387925052, + "grad_norm": 0.8518350124359131, + "learning_rate": 3.81998375404486e-05, + "loss": 0.7717, "step": 3561 }, { - "epoch": 1.3007120686507212, - "grad_norm": 1.0963407754898071, - "learning_rate": 1.606654220412848e-05, - "loss": 0.8353, + "epoch": 0.6179736294240111, + "grad_norm": 0.8588294386863708, + "learning_rate": 3.819701153055944e-05, + "loss": 0.729, "step": 3562 }, { - "epoch": 1.3010772320613475, - "grad_norm": 0.8769721388816833, - "learning_rate": 1.6052478218455986e-05, - "loss": 0.8293, + "epoch": 0.618147120055517, + "grad_norm": 0.9098778367042542, + "learning_rate": 3.8194183408885885e-05, + "loss": 0.8395, "step": 3563 }, { - "epoch": 1.3014423954719736, - "grad_norm": 1.1497924327850342, - "learning_rate": 1.603841626362505e-05, - "loss": 0.8074, + "epoch": 0.6183206106870229, + "grad_norm": 0.7801840305328369, + "learning_rate": 3.8191353175756145e-05, + "loss": 0.8298, "step": 3564 }, { - "epoch": 1.3018075588826, - "grad_norm": 1.146071434020996, - "learning_rate": 1.6024356346869975e-05, - "loss": 0.8579, + "epoch": 0.6184941013185288, + "grad_norm": 0.8042619824409485, + "learning_rate": 3.818852083149867e-05, + "loss": 0.8013, "step": 3565 }, { - "epoch": 1.3021727222932262, - "grad_norm": 1.1271910667419434, - "learning_rate": 1.6010298475424028e-05, - "loss": 0.8435, + "epoch": 0.6186675919500347, + "grad_norm": 0.7687529921531677, + "learning_rate": 3.818568637644217e-05, + "loss": 0.8208, "step": 3566 }, { - "epoch": 1.3025378857038525, - "grad_norm": 0.77430659532547, - "learning_rate": 1.5996242656519418e-05, - "loss": 0.8588, + "epoch": 0.6188410825815406, + "grad_norm": 1.2628023624420166, + "learning_rate": 3.818284981091556e-05, + "loss": 0.7549, "step": 3567 }, { - "epoch": 1.3029030491144789, - "grad_norm": 1.3940712213516235, - "learning_rate": 1.5982188897387296e-05, - "loss": 0.8353, + "epoch": 0.6190145732130465, + "grad_norm": 0.7444674968719482, + "learning_rate": 3.8180011135248055e-05, + "loss": 0.8455, "step": 3568 }, { - "epoch": 1.303268212525105, - "grad_norm": 1.0806456804275513, - "learning_rate": 1.596813720525777e-05, - "loss": 0.8439, + "epoch": 0.6191880638445524, + "grad_norm": 1.077534794807434, + "learning_rate": 3.8177170349769064e-05, + "loss": 0.7179, "step": 3569 }, { - "epoch": 1.3036333759357313, - "grad_norm": 1.0975135564804077, - "learning_rate": 1.5954087587359857e-05, - "loss": 0.8724, + "epoch": 0.6193615544760583, + "grad_norm": 0.8209437727928162, + "learning_rate": 3.8174327454808275e-05, + "loss": 0.7666, "step": 3570 }, { - "epoch": 1.3039985393463576, - "grad_norm": 1.7172619104385376, - "learning_rate": 1.594004005092153e-05, - "loss": 0.847, + "epoch": 0.6195350451075642, + "grad_norm": 1.0938361883163452, + "learning_rate": 3.8171482450695616e-05, + "loss": 0.6781, "step": 3571 }, { - "epoch": 1.3043637027569837, - "grad_norm": 1.2518310546875, - "learning_rate": 1.5925994603169678e-05, - "loss": 0.8146, + "epoch": 0.6197085357390701, + "grad_norm": 0.9858092069625854, + "learning_rate": 3.816863533776124e-05, + "loss": 0.8228, "step": 3572 }, { - "epoch": 1.30472886616761, - "grad_norm": 1.3586459159851074, - "learning_rate": 1.5911951251330127e-05, - "loss": 0.8524, + "epoch": 0.619882026370576, + "grad_norm": 0.6374485492706299, + "learning_rate": 3.816578611633556e-05, + "loss": 0.8845, "step": 3573 }, { - "epoch": 1.3050940295782363, - "grad_norm": 1.086983323097229, - "learning_rate": 1.5897910002627625e-05, - "loss": 0.7934, + "epoch": 0.6200555170020818, + "grad_norm": 1.0592073202133179, + "learning_rate": 3.816293478674923e-05, + "loss": 0.7371, "step": 3574 }, { - "epoch": 1.3054591929888626, - "grad_norm": 1.0906528234481812, - "learning_rate": 1.5883870864285806e-05, - "loss": 0.8052, + "epoch": 0.6202290076335878, + "grad_norm": 0.7683749198913574, + "learning_rate": 3.816008134933317e-05, + "loss": 0.6313, "step": 3575 }, { - "epoch": 1.305824356399489, - "grad_norm": 1.2705082893371582, - "learning_rate": 1.586983384352726e-05, - "loss": 0.814, + "epoch": 0.6204024982650936, + "grad_norm": 0.9330496788024902, + "learning_rate": 3.815722580441849e-05, + "loss": 0.7524, "step": 3576 }, { - "epoch": 1.306189519810115, - "grad_norm": 1.1473627090454102, - "learning_rate": 1.5855798947573464e-05, - "loss": 0.7871, + "epoch": 0.6205759888965996, + "grad_norm": 0.7893474698066711, + "learning_rate": 3.8154368152336606e-05, + "loss": 0.8245, "step": 3577 }, { - "epoch": 1.3065546832207413, - "grad_norm": 1.1623848676681519, - "learning_rate": 1.584176618364482e-05, - "loss": 0.8436, + "epoch": 0.6207494795281054, + "grad_norm": 0.9138994812965393, + "learning_rate": 3.815150839341915e-05, + "loss": 0.835, "step": 3578 }, { - "epoch": 1.3069198466313674, - "grad_norm": 1.1879956722259521, - "learning_rate": 1.582773555896061e-05, - "loss": 0.8319, + "epoch": 0.6209229701596114, + "grad_norm": 0.909781813621521, + "learning_rate": 3.814864652799798e-05, + "loss": 0.8813, "step": 3579 }, { - "epoch": 1.3072850100419937, - "grad_norm": 0.9860876798629761, - "learning_rate": 1.5813707080739028e-05, - "loss": 0.8101, + "epoch": 0.6210964607911172, + "grad_norm": 1.3303890228271484, + "learning_rate": 3.8145782556405244e-05, + "loss": 0.9023, "step": 3580 }, { - "epoch": 1.30765017345262, - "grad_norm": 0.8568575382232666, - "learning_rate": 1.5799680756197177e-05, - "loss": 0.8419, + "epoch": 0.6212699514226232, + "grad_norm": 3.3276925086975098, + "learning_rate": 3.81429164789733e-05, + "loss": 0.7666, "step": 3581 }, { - "epoch": 1.3080153368632463, - "grad_norm": 1.2421221733093262, - "learning_rate": 1.5785656592551022e-05, - "loss": 0.8453, + "epoch": 0.621443442054129, + "grad_norm": 0.9911302328109741, + "learning_rate": 3.814004829603475e-05, + "loss": 0.7012, "step": 3582 }, { - "epoch": 1.3083805002738726, - "grad_norm": 1.1135344505310059, - "learning_rate": 1.5771634597015445e-05, - "loss": 0.8652, + "epoch": 0.621616932685635, + "grad_norm": 0.8486824035644531, + "learning_rate": 3.813717800792246e-05, + "loss": 0.8459, "step": 3583 }, { - "epoch": 1.3087456636844987, - "grad_norm": 1.1107925176620483, - "learning_rate": 1.575761477680419e-05, - "loss": 0.8669, + "epoch": 0.6217904233171409, + "grad_norm": 0.9072778820991516, + "learning_rate": 3.813430561496953e-05, + "loss": 0.7969, "step": 3584 }, { - "epoch": 1.309110827095125, - "grad_norm": 1.5740898847579956, - "learning_rate": 1.57435971391299e-05, - "loss": 0.8098, + "epoch": 0.6219639139486468, + "grad_norm": 0.6988579034805298, + "learning_rate": 3.81314311175093e-05, + "loss": 0.7903, "step": 3585 }, { - "epoch": 1.3094759905057514, - "grad_norm": 1.3956072330474854, - "learning_rate": 1.572958169120408e-05, - "loss": 0.8171, + "epoch": 0.6221374045801527, + "grad_norm": 0.8814089894294739, + "learning_rate": 3.812855451587537e-05, + "loss": 0.6876, "step": 3586 }, { - "epoch": 1.3098411539163775, - "grad_norm": 0.9701876044273376, - "learning_rate": 1.5715568440237122e-05, - "loss": 0.8547, + "epoch": 0.6223108952116586, + "grad_norm": 0.6496028304100037, + "learning_rate": 3.812567581040155e-05, + "loss": 0.8052, "step": 3587 }, { - "epoch": 1.3102063173270038, - "grad_norm": 1.2201708555221558, - "learning_rate": 1.5701557393438277e-05, - "loss": 0.8484, + "epoch": 0.6224843858431645, + "grad_norm": 1.459839940071106, + "learning_rate": 3.812279500142194e-05, + "loss": 0.8289, "step": 3588 }, { - "epoch": 1.31057148073763, - "grad_norm": 1.0848890542984009, - "learning_rate": 1.5687548558015663e-05, - "loss": 0.8127, + "epoch": 0.6226578764746704, + "grad_norm": 3.343271255493164, + "learning_rate": 3.811991208927085e-05, + "loss": 0.7488, "step": 3589 }, { - "epoch": 1.3109366441482564, - "grad_norm": 1.0762882232666016, - "learning_rate": 1.567354194117627e-05, - "loss": 0.8461, + "epoch": 0.6228313671061763, + "grad_norm": 0.8139446973800659, + "learning_rate": 3.811702707428285e-05, + "loss": 0.8696, "step": 3590 }, { - "epoch": 1.3113018075588827, - "grad_norm": 1.1355403661727905, - "learning_rate": 1.565953755012594e-05, - "loss": 0.8451, + "epoch": 0.6230048577376822, + "grad_norm": 0.9058511853218079, + "learning_rate": 3.811413995679275e-05, + "loss": 0.6865, "step": 3591 }, { - "epoch": 1.3116669709695088, - "grad_norm": 0.92193204164505, - "learning_rate": 1.5645535392069366e-05, - "loss": 0.819, + "epoch": 0.6231783483691881, + "grad_norm": 2.024523973464966, + "learning_rate": 3.81112507371356e-05, + "loss": 0.7461, "step": 3592 }, { - "epoch": 1.312032134380135, - "grad_norm": 0.7957248091697693, - "learning_rate": 1.56315354742101e-05, - "loss": 0.8076, + "epoch": 0.6233518390006939, + "grad_norm": 0.8802201747894287, + "learning_rate": 3.8108359415646694e-05, + "loss": 0.7935, "step": 3593 }, { - "epoch": 1.3123972977907614, - "grad_norm": 1.1485823392868042, - "learning_rate": 1.5617537803750538e-05, - "loss": 0.8591, + "epoch": 0.6235253296321999, + "grad_norm": 0.6889973878860474, + "learning_rate": 3.810546599266158e-05, + "loss": 0.7026, "step": 3594 }, { - "epoch": 1.3127624612013875, - "grad_norm": 1.2435699701309204, - "learning_rate": 1.560354238789192e-05, - "loss": 0.7968, + "epoch": 0.6236988202637057, + "grad_norm": 0.9034347534179688, + "learning_rate": 3.810257046851604e-05, + "loss": 0.6554, "step": 3595 }, { - "epoch": 1.3131276246120138, - "grad_norm": 1.0878108739852905, - "learning_rate": 1.558954923383432e-05, - "loss": 0.8643, + "epoch": 0.6238723108952117, + "grad_norm": 1.012420415878296, + "learning_rate": 3.8099672843546106e-05, + "loss": 0.6864, "step": 3596 }, { - "epoch": 1.3134927880226401, - "grad_norm": 1.0522301197052002, - "learning_rate": 1.5575558348776664e-05, - "loss": 0.8217, + "epoch": 0.6240458015267175, + "grad_norm": 0.9575663208961487, + "learning_rate": 3.8096773118088045e-05, + "loss": 0.6599, "step": 3597 }, { - "epoch": 1.3138579514332664, - "grad_norm": 0.9500163197517395, - "learning_rate": 1.556156973991669e-05, - "loss": 0.8434, + "epoch": 0.6242192921582235, + "grad_norm": 0.9464178085327148, + "learning_rate": 3.809387129247838e-05, + "loss": 0.7378, "step": 3598 }, { - "epoch": 1.3142231148438928, - "grad_norm": 1.0970503091812134, - "learning_rate": 1.5547583414450985e-05, - "loss": 0.899, + "epoch": 0.6243927827897293, + "grad_norm": 0.9099921584129333, + "learning_rate": 3.809096736705387e-05, + "loss": 0.8337, "step": 3599 }, { - "epoch": 1.3145882782545188, - "grad_norm": 1.1323518753051758, - "learning_rate": 1.5533599379574956e-05, - "loss": 0.8354, + "epoch": 0.6245662734212353, + "grad_norm": 1.577717900276184, + "learning_rate": 3.808806134215151e-05, + "loss": 0.8767, "step": 3600 }, { - "epoch": 1.3149534416651452, - "grad_norm": 1.1361324787139893, - "learning_rate": 1.551961764248281e-05, - "loss": 0.8627, + "epoch": 0.6247397640527411, + "grad_norm": 0.8841474652290344, + "learning_rate": 3.8085153218108555e-05, + "loss": 0.8235, "step": 3601 }, { - "epoch": 1.3153186050757715, - "grad_norm": 1.058864712715149, - "learning_rate": 1.5505638210367605e-05, - "loss": 0.8584, + "epoch": 0.6249132546842471, + "grad_norm": 1.0177043676376343, + "learning_rate": 3.80822429952625e-05, + "loss": 0.7253, "step": 3602 }, { - "epoch": 1.3156837684863976, - "grad_norm": 1.3910236358642578, - "learning_rate": 1.5491661090421193e-05, - "loss": 0.8387, + "epoch": 0.6250867453157529, + "grad_norm": 1.330697774887085, + "learning_rate": 3.807933067395108e-05, + "loss": 0.7664, "step": 3603 }, { - "epoch": 1.3160489318970239, - "grad_norm": 1.0433735847473145, - "learning_rate": 1.5477686289834238e-05, - "loss": 0.8184, + "epoch": 0.6252602359472589, + "grad_norm": 0.9070950746536255, + "learning_rate": 3.8076416254512256e-05, + "loss": 0.8893, "step": 3604 }, { - "epoch": 1.3164140953076502, - "grad_norm": 1.1415810585021973, - "learning_rate": 1.5463713815796223e-05, - "loss": 0.8243, + "epoch": 0.6254337265787647, + "grad_norm": 0.7157330513000488, + "learning_rate": 3.807349973728427e-05, + "loss": 0.7064, "step": 3605 }, { - "epoch": 1.3167792587182765, - "grad_norm": 1.7279770374298096, - "learning_rate": 1.5449743675495416e-05, - "loss": 0.7843, + "epoch": 0.6256072172102707, + "grad_norm": 0.804823100566864, + "learning_rate": 3.807058112260558e-05, + "loss": 0.8625, "step": 3606 }, { - "epoch": 1.3171444221289028, - "grad_norm": 1.1184653043746948, - "learning_rate": 1.54357758761189e-05, - "loss": 0.8696, + "epoch": 0.6257807078417765, + "grad_norm": 0.8883898854255676, + "learning_rate": 3.8067660410814895e-05, + "loss": 0.822, "step": 3607 }, { - "epoch": 1.317509585539529, - "grad_norm": 1.0065276622772217, - "learning_rate": 1.5421810424852542e-05, - "loss": 0.8358, + "epoch": 0.6259541984732825, + "grad_norm": 1.0663114786148071, + "learning_rate": 3.8064737602251155e-05, + "loss": 0.8098, "step": 3608 }, { - "epoch": 1.3178747489501552, - "grad_norm": 0.9345651865005493, - "learning_rate": 1.5407847328881013e-05, - "loss": 0.8831, + "epoch": 0.6261276891047883, + "grad_norm": 0.9496017694473267, + "learning_rate": 3.8061812697253576e-05, + "loss": 0.7949, "step": 3609 }, { - "epoch": 1.3182399123607815, - "grad_norm": 1.0912920236587524, - "learning_rate": 1.5393886595387756e-05, - "loss": 0.8014, + "epoch": 0.6263011797362943, + "grad_norm": 0.8654940128326416, + "learning_rate": 3.8058885696161595e-05, + "loss": 0.7214, "step": 3610 }, { - "epoch": 1.3186050757714076, - "grad_norm": 0.7381747961044312, - "learning_rate": 1.5379928231555014e-05, - "loss": 0.8468, + "epoch": 0.6264746703678001, + "grad_norm": 0.9565306901931763, + "learning_rate": 3.805595659931487e-05, + "loss": 0.7791, "step": 3611 }, { - "epoch": 1.318970239182034, - "grad_norm": 1.0284016132354736, - "learning_rate": 1.536597224456381e-05, - "loss": 0.8019, + "epoch": 0.6266481609993061, + "grad_norm": 0.8744076490402222, + "learning_rate": 3.805302540705335e-05, + "loss": 0.8003, "step": 3612 }, { - "epoch": 1.3193354025926602, - "grad_norm": 0.8215433955192566, - "learning_rate": 1.5352018641593933e-05, - "loss": 0.827, + "epoch": 0.6268216516308119, + "grad_norm": 1.0787898302078247, + "learning_rate": 3.80500921197172e-05, + "loss": 0.7251, "step": 3613 }, { - "epoch": 1.3197005660032866, - "grad_norm": 1.1033109426498413, - "learning_rate": 1.5338067429823956e-05, - "loss": 0.8599, + "epoch": 0.6269951422623178, + "grad_norm": 0.8317021727561951, + "learning_rate": 3.8047156737646825e-05, + "loss": 0.7834, "step": 3614 }, { - "epoch": 1.3200657294139126, - "grad_norm": 1.2291046380996704, - "learning_rate": 1.5324118616431216e-05, - "loss": 0.8105, + "epoch": 0.6271686328938237, + "grad_norm": 0.9381603598594666, + "learning_rate": 3.8044219261182876e-05, + "loss": 0.825, "step": 3615 }, { - "epoch": 1.320430892824539, - "grad_norm": 1.010799765586853, - "learning_rate": 1.531017220859181e-05, - "loss": 0.7961, + "epoch": 0.6273421235253296, + "grad_norm": 0.8574730753898621, + "learning_rate": 3.8041279690666254e-05, + "loss": 0.7325, "step": 3616 }, { - "epoch": 1.3207960562351653, - "grad_norm": 0.9741177558898926, - "learning_rate": 1.5296228213480615e-05, - "loss": 0.801, + "epoch": 0.6275156141568355, + "grad_norm": 0.9893508553504944, + "learning_rate": 3.8038338026438116e-05, + "loss": 0.7246, "step": 3617 }, { - "epoch": 1.3211612196457914, - "grad_norm": 1.2881453037261963, - "learning_rate": 1.5282286638271248e-05, - "loss": 0.8635, + "epoch": 0.6276891047883414, + "grad_norm": 0.8447514772415161, + "learning_rate": 3.803539426883982e-05, + "loss": 0.8118, "step": 3618 }, { - "epoch": 1.3215263830564177, - "grad_norm": 1.7311906814575195, - "learning_rate": 1.5268347490136102e-05, - "loss": 0.8647, + "epoch": 0.6278625954198473, + "grad_norm": 0.9278315901756287, + "learning_rate": 3.803244841821301e-05, + "loss": 0.7234, "step": 3619 }, { - "epoch": 1.321891546467044, - "grad_norm": 1.655568242073059, - "learning_rate": 1.5254410776246299e-05, - "loss": 0.8507, + "epoch": 0.6280360860513532, + "grad_norm": 0.9707409739494324, + "learning_rate": 3.8029500474899544e-05, + "loss": 0.6738, "step": 3620 }, { - "epoch": 1.3222567098776703, - "grad_norm": 1.0737171173095703, - "learning_rate": 1.5240476503771726e-05, - "loss": 0.8237, + "epoch": 0.6282095766828591, + "grad_norm": 0.9030182957649231, + "learning_rate": 3.8026550439241535e-05, + "loss": 0.7725, "step": 3621 }, { - "epoch": 1.3226218732882966, - "grad_norm": 1.0756748914718628, - "learning_rate": 1.5226544679881e-05, - "loss": 0.8214, + "epoch": 0.628383067314365, + "grad_norm": 1.1056774854660034, + "learning_rate": 3.802359831158135e-05, + "loss": 0.6508, "step": 3622 }, { - "epoch": 1.3229870366989227, - "grad_norm": 0.9513784050941467, - "learning_rate": 1.5212615311741488e-05, - "loss": 0.824, + "epoch": 0.628556557945871, + "grad_norm": 0.8719127774238586, + "learning_rate": 3.802064409226158e-05, + "loss": 0.6738, "step": 3623 }, { - "epoch": 1.323352200109549, - "grad_norm": 1.5072650909423828, - "learning_rate": 1.5198688406519297e-05, - "loss": 0.8535, + "epoch": 0.6287300485773768, + "grad_norm": 1.486180067062378, + "learning_rate": 3.801768778162506e-05, + "loss": 0.7065, "step": 3624 }, { - "epoch": 1.3237173635201753, - "grad_norm": 1.3833688497543335, - "learning_rate": 1.5184763971379255e-05, - "loss": 0.8237, + "epoch": 0.6289035392088828, + "grad_norm": 0.845059335231781, + "learning_rate": 3.801472938001488e-05, + "loss": 0.7015, "step": 3625 }, { - "epoch": 1.3240825269308014, - "grad_norm": 1.458528995513916, - "learning_rate": 1.5170842013484928e-05, - "loss": 0.8359, + "epoch": 0.6290770298403886, + "grad_norm": 0.9731988906860352, + "learning_rate": 3.8011768887774365e-05, + "loss": 0.9163, "step": 3626 }, { - "epoch": 1.3244476903414277, - "grad_norm": 1.1463803052902222, - "learning_rate": 1.5156922539998609e-05, - "loss": 0.8481, + "epoch": 0.6292505204718946, + "grad_norm": 0.9627339243888855, + "learning_rate": 3.8008806305247083e-05, + "loss": 0.7229, "step": 3627 }, { - "epoch": 1.324812853752054, - "grad_norm": 1.037043809890747, - "learning_rate": 1.5143005558081292e-05, - "loss": 0.8013, + "epoch": 0.6294240111034004, + "grad_norm": 1.2256109714508057, + "learning_rate": 3.800584163277684e-05, + "loss": 0.8254, "step": 3628 }, { - "epoch": 1.3251780171626804, - "grad_norm": 0.9797232151031494, - "learning_rate": 1.5129091074892721e-05, - "loss": 0.8004, + "epoch": 0.6295975017349064, + "grad_norm": 1.8356471061706543, + "learning_rate": 3.80028748707077e-05, + "loss": 0.6733, "step": 3629 }, { - "epoch": 1.3255431805733067, - "grad_norm": 1.9655098915100098, - "learning_rate": 1.5115179097591331e-05, - "loss": 0.8585, + "epoch": 0.6297709923664122, + "grad_norm": 1.2657802104949951, + "learning_rate": 3.7999906019383954e-05, + "loss": 0.6982, "step": 3630 }, { - "epoch": 1.3259083439839328, - "grad_norm": 0.878628671169281, - "learning_rate": 1.5101269633334284e-05, - "loss": 0.8116, + "epoch": 0.6299444829979182, + "grad_norm": 0.9069787859916687, + "learning_rate": 3.799693507915014e-05, + "loss": 0.8694, "step": 3631 }, { - "epoch": 1.326273507394559, - "grad_norm": 1.0065144300460815, - "learning_rate": 1.5087362689277431e-05, - "loss": 0.8721, + "epoch": 0.630117973629424, + "grad_norm": 1.4572343826293945, + "learning_rate": 3.799396205035104e-05, + "loss": 0.6365, "step": 3632 }, { - "epoch": 1.3266386708051854, - "grad_norm": 1.1627243757247925, - "learning_rate": 1.5073458272575345e-05, - "loss": 0.8336, + "epoch": 0.6302914642609299, + "grad_norm": 1.6305638551712036, + "learning_rate": 3.799098693333167e-05, + "loss": 0.6809, "step": 3633 }, { - "epoch": 1.3270038342158115, - "grad_norm": 1.060829520225525, - "learning_rate": 1.5059556390381289e-05, - "loss": 0.8723, + "epoch": 0.6304649548924358, + "grad_norm": 0.9818524122238159, + "learning_rate": 3.7988009728437304e-05, + "loss": 0.6367, "step": 3634 }, { - "epoch": 1.3273689976264378, - "grad_norm": 0.9508363008499146, - "learning_rate": 1.5045657049847223e-05, - "loss": 0.8368, + "epoch": 0.6306384455239417, + "grad_norm": 0.7550144195556641, + "learning_rate": 3.7985030436013454e-05, + "loss": 0.7471, "step": 3635 }, { - "epoch": 1.327734161037064, - "grad_norm": 1.2730499505996704, - "learning_rate": 1.50317602581238e-05, - "loss": 0.8693, + "epoch": 0.6308119361554476, + "grad_norm": 1.7759835720062256, + "learning_rate": 3.7982049056405866e-05, + "loss": 0.7147, "step": 3636 }, { - "epoch": 1.3280993244476904, - "grad_norm": 1.086773157119751, - "learning_rate": 1.5017866022360356e-05, - "loss": 0.8452, + "epoch": 0.6309854267869535, + "grad_norm": 0.7388194799423218, + "learning_rate": 3.797906558996053e-05, + "loss": 0.7411, "step": 3637 }, { - "epoch": 1.3284644878583167, - "grad_norm": 1.2027463912963867, - "learning_rate": 1.5003974349704931e-05, - "loss": 0.8101, + "epoch": 0.6311589174184594, + "grad_norm": 0.9029819369316101, + "learning_rate": 3.797608003702368e-05, + "loss": 0.7267, "step": 3638 }, { - "epoch": 1.3288296512689428, - "grad_norm": 0.9403995275497437, - "learning_rate": 1.4990085247304218e-05, - "loss": 0.8362, + "epoch": 0.6313324080499653, + "grad_norm": 0.8891667127609253, + "learning_rate": 3.79730923979418e-05, + "loss": 0.6741, "step": 3639 }, { - "epoch": 1.3291948146795691, - "grad_norm": 1.2783534526824951, - "learning_rate": 1.4976198722303619e-05, - "loss": 0.8466, + "epoch": 0.6315058986814712, + "grad_norm": 0.9093294143676758, + "learning_rate": 3.79701026730616e-05, + "loss": 0.7451, "step": 3640 }, { - "epoch": 1.3295599780901954, - "grad_norm": 1.0429176092147827, - "learning_rate": 1.496231478184718e-05, - "loss": 0.7796, + "epoch": 0.6316793893129771, + "grad_norm": 0.9961654543876648, + "learning_rate": 3.7967110862730045e-05, + "loss": 0.6864, "step": 3641 }, { - "epoch": 1.3299251415008215, - "grad_norm": 1.5396568775177002, - "learning_rate": 1.4948433433077632e-05, - "loss": 0.8516, + "epoch": 0.631852879944483, + "grad_norm": 0.9704126715660095, + "learning_rate": 3.796411696729434e-05, + "loss": 0.7966, "step": 3642 }, { - "epoch": 1.3302903049114478, - "grad_norm": 0.8653243780136108, - "learning_rate": 1.4934554683136382e-05, - "loss": 0.8179, + "epoch": 0.6320263705759889, + "grad_norm": 1.0483578443527222, + "learning_rate": 3.7961120987101933e-05, + "loss": 0.9009, "step": 3643 }, { - "epoch": 1.3306554683220742, - "grad_norm": 0.9938915371894836, - "learning_rate": 1.4920678539163479e-05, - "loss": 0.8507, + "epoch": 0.6321998612074948, + "grad_norm": 1.4261513948440552, + "learning_rate": 3.79581229225005e-05, + "loss": 0.6864, "step": 3644 }, { - "epoch": 1.3310206317327005, - "grad_norm": 1.527647852897644, - "learning_rate": 1.4906805008297645e-05, - "loss": 0.7766, + "epoch": 0.6323733518390007, + "grad_norm": 0.9542103409767151, + "learning_rate": 3.795512277383798e-05, + "loss": 0.8149, "step": 3645 }, { - "epoch": 1.3313857951433266, - "grad_norm": 1.4680825471878052, - "learning_rate": 1.4892934097676262e-05, - "loss": 0.8069, + "epoch": 0.6325468424705066, + "grad_norm": 0.8987018465995789, + "learning_rate": 3.795212054146254e-05, + "loss": 0.8403, "step": 3646 }, { - "epoch": 1.3317509585539529, - "grad_norm": 1.380802869796753, - "learning_rate": 1.4879065814435349e-05, - "loss": 0.8299, + "epoch": 0.6327203331020125, + "grad_norm": 1.1396336555480957, + "learning_rate": 3.794911622572259e-05, + "loss": 0.876, "step": 3647 }, { - "epoch": 1.3321161219645792, - "grad_norm": 1.0578718185424805, - "learning_rate": 1.4865200165709588e-05, - "loss": 0.8329, + "epoch": 0.6328938237335184, + "grad_norm": 0.8584318161010742, + "learning_rate": 3.794610982696679e-05, + "loss": 0.6144, "step": 3648 }, { - "epoch": 1.3324812853752053, - "grad_norm": 1.116199016571045, - "learning_rate": 1.485133715863229e-05, - "loss": 0.822, + "epoch": 0.6330673143650243, + "grad_norm": 0.6770728826522827, + "learning_rate": 3.794310134554403e-05, + "loss": 0.8098, "step": 3649 }, { - "epoch": 1.3328464487858316, - "grad_norm": 1.0101293325424194, - "learning_rate": 1.4837476800335427e-05, - "loss": 0.8486, + "epoch": 0.6332408049965302, + "grad_norm": 0.9955157041549683, + "learning_rate": 3.7940090781803454e-05, + "loss": 0.6666, "step": 3650 }, { - "epoch": 1.333211612196458, - "grad_norm": 1.5879167318344116, - "learning_rate": 1.4823619097949584e-05, - "loss": 0.7713, + "epoch": 0.6334142956280361, + "grad_norm": 1.0177963972091675, + "learning_rate": 3.793707813609444e-05, + "loss": 0.7152, "step": 3651 }, { - "epoch": 1.3335767756070842, - "grad_norm": 0.8238763213157654, - "learning_rate": 1.4809764058604006e-05, - "loss": 0.8055, + "epoch": 0.6335877862595419, + "grad_norm": 1.7858588695526123, + "learning_rate": 3.7934063408766606e-05, + "loss": 0.8682, "step": 3652 }, { - "epoch": 1.3339419390177105, - "grad_norm": 1.0831371545791626, - "learning_rate": 1.4795911689426543e-05, - "loss": 0.8198, + "epoch": 0.6337612768910479, + "grad_norm": 1.1762322187423706, + "learning_rate": 3.7931046600169815e-05, + "loss": 0.782, "step": 3653 }, { - "epoch": 1.3343071024283366, - "grad_norm": 1.1857067346572876, - "learning_rate": 1.4782061997543699e-05, - "loss": 0.8437, + "epoch": 0.6339347675225537, + "grad_norm": 0.9590029120445251, + "learning_rate": 3.792802771065417e-05, + "loss": 0.652, "step": 3654 }, { - "epoch": 1.334672265838963, - "grad_norm": 1.1707007884979248, - "learning_rate": 1.476821499008057e-05, - "loss": 0.8358, + "epoch": 0.6341082581540597, + "grad_norm": 0.8667209148406982, + "learning_rate": 3.792500674057002e-05, + "loss": 0.7854, "step": 3655 }, { - "epoch": 1.3350374292495892, - "grad_norm": 1.4417773485183716, - "learning_rate": 1.4754370674160885e-05, - "loss": 0.7845, + "epoch": 0.6342817487855655, + "grad_norm": 1.2064733505249023, + "learning_rate": 3.792198369026796e-05, + "loss": 0.7, "step": 3656 }, { - "epoch": 1.3354025926602153, - "grad_norm": 1.1512993574142456, - "learning_rate": 1.4740529056906994e-05, - "loss": 0.9062, + "epoch": 0.6344552394170715, + "grad_norm": 0.8654311299324036, + "learning_rate": 3.79189585600988e-05, + "loss": 0.825, "step": 3657 }, { - "epoch": 1.3357677560708416, - "grad_norm": 1.2779810428619385, - "learning_rate": 1.4726690145439858e-05, - "loss": 0.8052, + "epoch": 0.6346287300485773, + "grad_norm": 0.881046712398529, + "learning_rate": 3.791593135041362e-05, + "loss": 0.9258, "step": 3658 }, { - "epoch": 1.336132919481468, - "grad_norm": 1.4382147789001465, - "learning_rate": 1.4712853946879035e-05, - "loss": 0.8363, + "epoch": 0.6348022206800833, + "grad_norm": 1.3855266571044922, + "learning_rate": 3.791290206156373e-05, + "loss": 0.8845, "step": 3659 }, { - "epoch": 1.3364980828920943, - "grad_norm": 2.144674777984619, - "learning_rate": 1.46990204683427e-05, - "loss": 0.8063, + "epoch": 0.6349757113115891, + "grad_norm": 1.0757869482040405, + "learning_rate": 3.790987069390069e-05, + "loss": 0.7932, "step": 3660 }, { - "epoch": 1.3368632463027206, - "grad_norm": 1.0582269430160522, - "learning_rate": 1.4685189716947614e-05, - "loss": 0.8601, + "epoch": 0.6351492019430951, + "grad_norm": 1.2183403968811035, + "learning_rate": 3.790683724777628e-05, + "loss": 0.8369, "step": 3661 }, { - "epoch": 1.3372284097133467, - "grad_norm": 1.2110309600830078, - "learning_rate": 1.4671361699809153e-05, - "loss": 0.8436, + "epoch": 0.635322692574601, + "grad_norm": 0.8872250318527222, + "learning_rate": 3.790380172354255e-05, + "loss": 0.74, "step": 3662 }, { - "epoch": 1.337593573123973, - "grad_norm": 1.046242594718933, - "learning_rate": 1.4657536424041268e-05, - "loss": 0.7971, + "epoch": 0.6354961832061069, + "grad_norm": 0.9924710392951965, + "learning_rate": 3.790076412155176e-05, + "loss": 0.7793, "step": 3663 }, { - "epoch": 1.3379587365345993, - "grad_norm": 1.383315920829773, - "learning_rate": 1.4643713896756518e-05, - "loss": 0.799, + "epoch": 0.6356696738376127, + "grad_norm": 0.8749369978904724, + "learning_rate": 3.789772444215644e-05, + "loss": 0.834, "step": 3664 }, { - "epoch": 1.3383238999452254, - "grad_norm": 0.998514711856842, - "learning_rate": 1.4629894125066028e-05, - "loss": 0.8268, + "epoch": 0.6358431644691187, + "grad_norm": 1.8593519926071167, + "learning_rate": 3.7894682685709335e-05, + "loss": 0.9587, "step": 3665 }, { - "epoch": 1.3386890633558517, - "grad_norm": 1.1768016815185547, - "learning_rate": 1.4616077116079524e-05, - "loss": 0.8832, + "epoch": 0.6360166551006246, + "grad_norm": 0.8556535243988037, + "learning_rate": 3.7891638852563455e-05, + "loss": 0.6276, "step": 3666 }, { - "epoch": 1.339054226766478, - "grad_norm": 1.0866308212280273, - "learning_rate": 1.4602262876905306e-05, - "loss": 0.8403, + "epoch": 0.6361901457321305, + "grad_norm": 2.0134222507476807, + "learning_rate": 3.788859294307204e-05, + "loss": 0.8113, "step": 3667 }, { - "epoch": 1.3394193901771043, - "grad_norm": 0.9695183634757996, - "learning_rate": 1.458845141465024e-05, - "loss": 0.8257, + "epoch": 0.6363636363636364, + "grad_norm": 1.2132906913757324, + "learning_rate": 3.788554495758858e-05, + "loss": 0.7197, "step": 3668 }, { - "epoch": 1.3397845535877306, - "grad_norm": 1.046994686126709, - "learning_rate": 1.4574642736419763e-05, - "loss": 0.8114, + "epoch": 0.6365371269951423, + "grad_norm": 0.8966903686523438, + "learning_rate": 3.788249489646677e-05, + "loss": 0.6812, "step": 3669 }, { - "epoch": 1.3401497169983567, - "grad_norm": 0.914162278175354, - "learning_rate": 1.4560836849317895e-05, - "loss": 0.7739, + "epoch": 0.6367106176266482, + "grad_norm": 1.1028051376342773, + "learning_rate": 3.7879442760060604e-05, + "loss": 0.6807, "step": 3670 }, { - "epoch": 1.340514880408983, - "grad_norm": 1.1910851001739502, - "learning_rate": 1.4547033760447202e-05, - "loss": 0.807, + "epoch": 0.6368841082581541, + "grad_norm": 0.9868062138557434, + "learning_rate": 3.7876388548724256e-05, + "loss": 0.7773, "step": 3671 }, { - "epoch": 1.3408800438196093, - "grad_norm": 0.8392941355705261, - "learning_rate": 1.4533233476908835e-05, - "loss": 0.8604, + "epoch": 0.63705759888966, + "grad_norm": 0.8853698968887329, + "learning_rate": 3.78733322628122e-05, + "loss": 0.6877, "step": 3672 }, { - "epoch": 1.3412452072302354, - "grad_norm": 0.8457450866699219, - "learning_rate": 1.451943600580247e-05, - "loss": 0.811, + "epoch": 0.6372310895211658, + "grad_norm": 0.9421465992927551, + "learning_rate": 3.787027390267911e-05, + "loss": 0.7766, "step": 3673 }, { - "epoch": 1.3416103706408617, - "grad_norm": 1.5399824380874634, - "learning_rate": 1.450564135422636e-05, - "loss": 0.8666, + "epoch": 0.6374045801526718, + "grad_norm": 0.7131218314170837, + "learning_rate": 3.786721346867991e-05, + "loss": 0.7627, "step": 3674 }, { - "epoch": 1.341975534051488, - "grad_norm": 1.0158084630966187, - "learning_rate": 1.4491849529277295e-05, - "loss": 0.8414, + "epoch": 0.6375780707841776, + "grad_norm": 0.9366453886032104, + "learning_rate": 3.786415096116976e-05, + "loss": 0.8052, "step": 3675 }, { - "epoch": 1.3423406974621144, - "grad_norm": 1.0190531015396118, - "learning_rate": 1.4478060538050622e-05, - "loss": 0.8287, + "epoch": 0.6377515614156836, + "grad_norm": 1.075919508934021, + "learning_rate": 3.786108638050408e-05, + "loss": 0.7832, "step": 3676 }, { - "epoch": 1.3427058608727407, - "grad_norm": 1.31387460231781, - "learning_rate": 1.4464274387640224e-05, - "loss": 0.818, + "epoch": 0.6379250520471894, + "grad_norm": 1.7976361513137817, + "learning_rate": 3.785801972703851e-05, + "loss": 0.8201, "step": 3677 }, { - "epoch": 1.3430710242833668, - "grad_norm": 1.0871951580047607, - "learning_rate": 1.4450491085138514e-05, - "loss": 0.8159, + "epoch": 0.6380985426786954, + "grad_norm": 0.7233350872993469, + "learning_rate": 3.785495100112894e-05, + "loss": 0.8752, "step": 3678 }, { - "epoch": 1.343436187693993, - "grad_norm": 0.9599098563194275, - "learning_rate": 1.4436710637636456e-05, - "loss": 0.8304, + "epoch": 0.6382720333102012, + "grad_norm": 0.6948670744895935, + "learning_rate": 3.7851880203131506e-05, + "loss": 0.9055, "step": 3679 }, { - "epoch": 1.3438013511046192, - "grad_norm": 1.1340875625610352, - "learning_rate": 1.442293305222354e-05, - "loss": 0.8032, + "epoch": 0.6384455239417072, + "grad_norm": 0.9208732843399048, + "learning_rate": 3.784880733340257e-05, + "loss": 0.9204, "step": 3680 }, { - "epoch": 1.3441665145152455, - "grad_norm": 1.3302578926086426, - "learning_rate": 1.4409158335987763e-05, - "loss": 0.7991, + "epoch": 0.638619014573213, + "grad_norm": 0.9864568710327148, + "learning_rate": 3.7845732392298746e-05, + "loss": 0.8286, "step": 3681 }, { - "epoch": 1.3445316779258718, - "grad_norm": 1.3686498403549194, - "learning_rate": 1.4395386496015685e-05, - "loss": 0.84, + "epoch": 0.638792505204719, + "grad_norm": 1.1356685161590576, + "learning_rate": 3.784265538017689e-05, + "loss": 0.7585, "step": 3682 }, { - "epoch": 1.3448968413364981, - "grad_norm": 0.9095948934555054, - "learning_rate": 1.4381617539392347e-05, - "loss": 0.825, + "epoch": 0.6389659958362248, + "grad_norm": 1.667280673980713, + "learning_rate": 3.783957629739408e-05, + "loss": 0.8037, "step": 3683 }, { - "epoch": 1.3452620047471244, - "grad_norm": 0.7651985287666321, - "learning_rate": 1.436785147320134e-05, - "loss": 0.8317, + "epoch": 0.6391394864677308, + "grad_norm": 0.8861865401268005, + "learning_rate": 3.7836495144307644e-05, + "loss": 0.6687, "step": 3684 }, { - "epoch": 1.3456271681577505, - "grad_norm": 0.9624016284942627, - "learning_rate": 1.4354088304524739e-05, - "loss": 0.8617, + "epoch": 0.6393129770992366, + "grad_norm": 1.2541218996047974, + "learning_rate": 3.783341192127518e-05, + "loss": 0.7404, "step": 3685 }, { - "epoch": 1.3459923315683768, - "grad_norm": 1.2830535173416138, - "learning_rate": 1.4340328040443154e-05, - "loss": 0.8096, + "epoch": 0.6394864677307426, + "grad_norm": 0.695833146572113, + "learning_rate": 3.783032662865447e-05, + "loss": 0.7268, "step": 3686 }, { - "epoch": 1.3463574949790031, - "grad_norm": 0.9762023091316223, - "learning_rate": 1.4326570688035682e-05, - "loss": 0.8234, + "epoch": 0.6396599583622484, + "grad_norm": 1.2837748527526855, + "learning_rate": 3.7827239266803584e-05, + "loss": 0.6919, "step": 3687 }, { - "epoch": 1.3467226583896292, - "grad_norm": 0.8931922316551208, - "learning_rate": 1.4312816254379928e-05, - "loss": 0.8376, + "epoch": 0.6398334489937544, + "grad_norm": 0.83601313829422, + "learning_rate": 3.782414983608081e-05, + "loss": 0.7343, "step": 3688 }, { - "epoch": 1.3470878218002555, - "grad_norm": 0.9348151087760925, - "learning_rate": 1.4299064746552005e-05, - "loss": 0.8157, + "epoch": 0.6400069396252602, + "grad_norm": 1.009537935256958, + "learning_rate": 3.7821058336844676e-05, + "loss": 0.7406, "step": 3689 }, { - "epoch": 1.3474529852108819, - "grad_norm": 0.9170137643814087, - "learning_rate": 1.42853161716265e-05, - "loss": 0.7844, + "epoch": 0.6401804302567662, + "grad_norm": 1.0054012537002563, + "learning_rate": 3.7817964769453956e-05, + "loss": 0.8206, "step": 3690 }, { - "epoch": 1.3478181486215082, - "grad_norm": 1.1884022951126099, - "learning_rate": 1.4271570536676513e-05, - "loss": 0.7939, + "epoch": 0.640353920888272, + "grad_norm": 1.0783183574676514, + "learning_rate": 3.781486913426766e-05, + "loss": 0.821, "step": 3691 }, { - "epoch": 1.3481833120321345, - "grad_norm": 1.2120693922042847, - "learning_rate": 1.4257827848773613e-05, - "loss": 0.8325, + "epoch": 0.6405274115197779, + "grad_norm": 0.986361563205719, + "learning_rate": 3.781177143164505e-05, + "loss": 0.7456, "step": 3692 }, { - "epoch": 1.3485484754427606, - "grad_norm": 1.2090035676956177, - "learning_rate": 1.424408811498787e-05, - "loss": 0.7796, + "epoch": 0.6407009021512838, + "grad_norm": 0.8562521934509277, + "learning_rate": 3.7808671661945606e-05, + "loss": 0.7607, "step": 3693 }, { - "epoch": 1.3489136388533869, - "grad_norm": 1.2506871223449707, - "learning_rate": 1.4230351342387827e-05, - "loss": 0.8398, + "epoch": 0.6408743927827897, + "grad_norm": 0.7921318411827087, + "learning_rate": 3.7805569825529055e-05, + "loss": 0.8342, "step": 3694 }, { - "epoch": 1.3492788022640132, - "grad_norm": 1.6333377361297607, - "learning_rate": 1.4216617538040488e-05, - "loss": 0.8306, + "epoch": 0.6410478834142956, + "grad_norm": 0.7843275666236877, + "learning_rate": 3.780246592275539e-05, + "loss": 0.9036, "step": 3695 }, { - "epoch": 1.3496439656746393, - "grad_norm": 1.1921755075454712, - "learning_rate": 1.4202886709011357e-05, - "loss": 0.8349, + "epoch": 0.6412213740458015, + "grad_norm": 1.076306939125061, + "learning_rate": 3.779935995398481e-05, + "loss": 0.6779, "step": 3696 }, { - "epoch": 1.3500091290852656, - "grad_norm": 0.9489463567733765, - "learning_rate": 1.4189158862364386e-05, - "loss": 0.8005, + "epoch": 0.6413948646773074, + "grad_norm": 1.1491665840148926, + "learning_rate": 3.7796251919577764e-05, + "loss": 0.8564, "step": 3697 }, { - "epoch": 1.350374292495892, - "grad_norm": 1.2093394994735718, - "learning_rate": 1.417543400516201e-05, - "loss": 0.8207, + "epoch": 0.6415683553088133, + "grad_norm": 0.7769327759742737, + "learning_rate": 3.7793141819894955e-05, + "loss": 0.8291, "step": 3698 }, { - "epoch": 1.3507394559065182, - "grad_norm": 1.212045431137085, - "learning_rate": 1.4161712144465108e-05, - "loss": 0.8427, + "epoch": 0.6417418459403192, + "grad_norm": 1.3981653451919556, + "learning_rate": 3.779002965529729e-05, + "loss": 0.7021, "step": 3699 }, { - "epoch": 1.3511046193171445, - "grad_norm": 1.012719988822937, - "learning_rate": 1.4147993287333032e-05, - "loss": 0.7926, + "epoch": 0.6419153365718251, + "grad_norm": 1.6185808181762695, + "learning_rate": 3.778691542614596e-05, + "loss": 0.7881, "step": 3700 }, { - "epoch": 1.3514697827277706, - "grad_norm": 1.2295879125595093, - "learning_rate": 1.413427744082359e-05, - "loss": 0.8627, + "epoch": 0.642088827203331, + "grad_norm": 1.2341827154159546, + "learning_rate": 3.7783799132802365e-05, + "loss": 0.7705, "step": 3701 }, { - "epoch": 1.351834946138397, - "grad_norm": 0.9974138140678406, - "learning_rate": 1.412056461199302e-05, - "loss": 0.8461, + "epoch": 0.6422623178348369, + "grad_norm": 0.9371026158332825, + "learning_rate": 3.778068077562817e-05, + "loss": 0.8378, "step": 3702 }, { - "epoch": 1.3522001095490233, - "grad_norm": 0.8432328701019287, - "learning_rate": 1.4106854807896035e-05, - "loss": 0.8222, + "epoch": 0.6424358084663429, + "grad_norm": 0.9320362210273743, + "learning_rate": 3.7777560354985246e-05, + "loss": 0.7854, "step": 3703 }, { - "epoch": 1.3525652729596493, - "grad_norm": 0.9057111144065857, - "learning_rate": 1.4093148035585774e-05, - "loss": 0.8597, + "epoch": 0.6426092990978487, + "grad_norm": 0.8724380731582642, + "learning_rate": 3.7774437871235724e-05, + "loss": 0.7448, "step": 3704 }, { - "epoch": 1.3529304363702757, - "grad_norm": 1.0232951641082764, - "learning_rate": 1.4079444302113821e-05, - "loss": 0.8168, + "epoch": 0.6427827897293547, + "grad_norm": 1.335938811302185, + "learning_rate": 3.7771313324741974e-05, + "loss": 0.749, "step": 3705 }, { - "epoch": 1.353295599780902, - "grad_norm": 0.9140546917915344, - "learning_rate": 1.4065743614530193e-05, - "loss": 0.7999, + "epoch": 0.6429562803608605, + "grad_norm": 0.9211609363555908, + "learning_rate": 3.776818671586662e-05, + "loss": 0.7314, "step": 3706 }, { - "epoch": 1.3536607631915283, - "grad_norm": 3.035421371459961, - "learning_rate": 1.405204597988336e-05, - "loss": 0.8083, + "epoch": 0.6431297709923665, + "grad_norm": 0.9657136797904968, + "learning_rate": 3.776505804497248e-05, + "loss": 0.7146, "step": 3707 }, { - "epoch": 1.3540259266021546, - "grad_norm": 1.4334715604782104, - "learning_rate": 1.403835140522019e-05, - "loss": 0.8072, + "epoch": 0.6433032616238723, + "grad_norm": 1.3756898641586304, + "learning_rate": 3.776192731242265e-05, + "loss": 0.782, "step": 3708 }, { - "epoch": 1.3543910900127807, - "grad_norm": 0.9211825728416443, - "learning_rate": 1.4024659897585989e-05, - "loss": 0.8317, + "epoch": 0.6434767522553783, + "grad_norm": 0.8694839477539062, + "learning_rate": 3.775879451858047e-05, + "loss": 0.7561, "step": 3709 }, { - "epoch": 1.354756253423407, - "grad_norm": 0.8945208787918091, - "learning_rate": 1.4010971464024494e-05, - "loss": 0.8327, + "epoch": 0.6436502428868841, + "grad_norm": 0.9332098960876465, + "learning_rate": 3.775565966380949e-05, + "loss": 0.7043, "step": 3710 }, { - "epoch": 1.3551214168340333, - "grad_norm": 1.0248135328292847, - "learning_rate": 1.3997286111577864e-05, - "loss": 0.8119, + "epoch": 0.6438237335183901, + "grad_norm": 0.9874651432037354, + "learning_rate": 3.7752522748473517e-05, + "loss": 0.7816, "step": 3711 }, { - "epoch": 1.3554865802446594, - "grad_norm": 1.1548928022384644, - "learning_rate": 1.3983603847286648e-05, - "loss": 0.854, + "epoch": 0.6439972241498959, + "grad_norm": 0.9495199918746948, + "learning_rate": 3.774938377293659e-05, + "loss": 0.8125, "step": 3712 }, { - "epoch": 1.3558517436552857, - "grad_norm": 1.1951075792312622, - "learning_rate": 1.3969924678189837e-05, - "loss": 0.8051, + "epoch": 0.6441707147814018, + "grad_norm": 0.853571891784668, + "learning_rate": 3.774624273756299e-05, + "loss": 0.7446, "step": 3713 }, { - "epoch": 1.356216907065912, - "grad_norm": 0.8732988834381104, - "learning_rate": 1.3956248611324803e-05, - "loss": 0.7979, + "epoch": 0.6443442054129077, + "grad_norm": 1.7014731168746948, + "learning_rate": 3.774309964271725e-05, + "loss": 0.8103, "step": 3714 }, { - "epoch": 1.3565820704765383, - "grad_norm": 1.1131194829940796, - "learning_rate": 1.3942575653727341e-05, - "loss": 0.8043, + "epoch": 0.6445176960444136, + "grad_norm": 0.882931113243103, + "learning_rate": 3.773995448876412e-05, + "loss": 0.7693, "step": 3715 }, { - "epoch": 1.3569472338871644, - "grad_norm": 1.181660532951355, - "learning_rate": 1.392890581243163e-05, - "loss": 0.835, + "epoch": 0.6446911866759195, + "grad_norm": 0.919424831867218, + "learning_rate": 3.7736807276068604e-05, + "loss": 0.8802, "step": 3716 }, { - "epoch": 1.3573123972977907, - "grad_norm": 0.9524421095848083, - "learning_rate": 1.3915239094470268e-05, - "loss": 0.7924, + "epoch": 0.6448646773074254, + "grad_norm": 0.8978388905525208, + "learning_rate": 3.773365800499592e-05, + "loss": 0.7893, "step": 3717 }, { - "epoch": 1.357677560708417, - "grad_norm": 1.6798298358917236, - "learning_rate": 1.3901575506874218e-05, - "loss": 0.8583, + "epoch": 0.6450381679389313, + "grad_norm": 0.7352043390274048, + "learning_rate": 3.773050667591158e-05, + "loss": 0.7085, "step": 3718 }, { - "epoch": 1.3580427241190431, - "grad_norm": 1.2879528999328613, - "learning_rate": 1.3887915056672863e-05, - "loss": 0.8616, + "epoch": 0.6452116585704372, + "grad_norm": 1.5832805633544922, + "learning_rate": 3.772735328918127e-05, + "loss": 0.655, "step": 3719 }, { - "epoch": 1.3584078875296695, - "grad_norm": 0.8703517317771912, - "learning_rate": 1.387425775089395e-05, - "loss": 0.8005, + "epoch": 0.6453851492019431, + "grad_norm": 0.995969831943512, + "learning_rate": 3.772419784517095e-05, + "loss": 0.8126, "step": 3720 }, { - "epoch": 1.3587730509402958, - "grad_norm": 1.0851984024047852, - "learning_rate": 1.3860603596563606e-05, - "loss": 0.7604, + "epoch": 0.645558639833449, + "grad_norm": 1.1823722124099731, + "learning_rate": 3.7721040344246824e-05, + "loss": 0.7106, "step": 3721 }, { - "epoch": 1.359138214350922, - "grad_norm": 0.8560203313827515, - "learning_rate": 1.3846952600706354e-05, - "loss": 0.8513, + "epoch": 0.6457321304649549, + "grad_norm": 1.2935973405838013, + "learning_rate": 3.771788078677532e-05, + "loss": 0.7346, "step": 3722 }, { - "epoch": 1.3595033777615484, - "grad_norm": 1.046786904335022, - "learning_rate": 1.3833304770345084e-05, - "loss": 0.8123, + "epoch": 0.6459056210964608, + "grad_norm": 0.9434872269630432, + "learning_rate": 3.7714719173123104e-05, + "loss": 0.6156, "step": 3723 }, { - "epoch": 1.3598685411721745, - "grad_norm": 1.246211290359497, - "learning_rate": 1.3819660112501054e-05, - "loss": 0.795, + "epoch": 0.6460791117279667, + "grad_norm": 1.9453701972961426, + "learning_rate": 3.771155550365708e-05, + "loss": 0.76, "step": 3724 }, { - "epoch": 1.3602337045828008, - "grad_norm": 1.4204074144363403, - "learning_rate": 1.3806018634193899e-05, - "loss": 0.8305, + "epoch": 0.6462526023594726, + "grad_norm": 0.7095528841018677, + "learning_rate": 3.770838977874441e-05, + "loss": 0.7869, "step": 3725 }, { - "epoch": 1.360598867993427, - "grad_norm": 1.328125238418579, - "learning_rate": 1.3792380342441601e-05, - "loss": 0.8478, + "epoch": 0.6464260929909785, + "grad_norm": 0.9229303002357483, + "learning_rate": 3.770522199875247e-05, + "loss": 0.6648, "step": 3726 }, { - "epoch": 1.3609640314040532, - "grad_norm": 1.2889227867126465, - "learning_rate": 1.3778745244260528e-05, - "loss": 0.8497, + "epoch": 0.6465995836224844, + "grad_norm": 1.0972743034362793, + "learning_rate": 3.770205216404888e-05, + "loss": 0.801, "step": 3727 }, { - "epoch": 1.3613291948146795, - "grad_norm": 1.0543410778045654, - "learning_rate": 1.3765113346665375e-05, - "loss": 0.8551, + "epoch": 0.6467730742539903, + "grad_norm": 0.7733034491539001, + "learning_rate": 3.7698880275001516e-05, + "loss": 0.6444, "step": 3728 }, { - "epoch": 1.3616943582253058, - "grad_norm": 1.1541430950164795, - "learning_rate": 1.3751484656669223e-05, - "loss": 0.8029, + "epoch": 0.6469465648854962, + "grad_norm": 0.903525710105896, + "learning_rate": 3.7695706331978465e-05, + "loss": 0.7631, "step": 3729 }, { - "epoch": 1.3620595216359321, - "grad_norm": 0.8703985810279846, - "learning_rate": 1.3737859181283471e-05, - "loss": 0.8281, + "epoch": 0.6471200555170021, + "grad_norm": 1.044716477394104, + "learning_rate": 3.769253033534808e-05, + "loss": 0.7991, "step": 3730 }, { - "epoch": 1.3624246850465584, - "grad_norm": 1.0752136707305908, - "learning_rate": 1.3724236927517887e-05, - "loss": 0.8151, + "epoch": 0.647293546148508, + "grad_norm": 2.6977760791778564, + "learning_rate": 3.768935228547894e-05, + "loss": 0.7949, "step": 3731 }, { - "epoch": 1.3627898484571845, - "grad_norm": 0.9767337441444397, - "learning_rate": 1.3710617902380579e-05, - "loss": 0.804, + "epoch": 0.6474670367800138, + "grad_norm": 0.9257286190986633, + "learning_rate": 3.7686172182739845e-05, + "loss": 0.6843, "step": 3732 }, { - "epoch": 1.3631550118678109, - "grad_norm": 1.1774232387542725, - "learning_rate": 1.3697002112877975e-05, - "loss": 0.7714, + "epoch": 0.6476405274115198, + "grad_norm": 0.7519979476928711, + "learning_rate": 3.7682990027499864e-05, + "loss": 0.7715, "step": 3733 }, { - "epoch": 1.3635201752784372, - "grad_norm": 0.775357186794281, - "learning_rate": 1.3683389566014871e-05, - "loss": 0.8285, + "epoch": 0.6478140180430256, + "grad_norm": 0.9366804957389832, + "learning_rate": 3.767980582012828e-05, + "loss": 0.8376, "step": 3734 }, { - "epoch": 1.3638853386890633, - "grad_norm": 1.0300897359848022, - "learning_rate": 1.3669780268794362e-05, - "loss": 0.8511, + "epoch": 0.6479875086745316, + "grad_norm": 1.479036808013916, + "learning_rate": 3.767661956099464e-05, + "loss": 0.795, "step": 3735 }, { - "epoch": 1.3642505020996896, - "grad_norm": 1.1514583826065063, - "learning_rate": 1.3656174228217883e-05, - "loss": 0.8633, + "epoch": 0.6481609993060374, + "grad_norm": 0.9265851974487305, + "learning_rate": 3.7673431250468695e-05, + "loss": 0.7715, "step": 3736 }, { - "epoch": 1.3646156655103159, - "grad_norm": 0.9546794891357422, - "learning_rate": 1.3642571451285207e-05, - "loss": 0.8008, + "epoch": 0.6483344899375434, + "grad_norm": 0.8994808197021484, + "learning_rate": 3.767024088892046e-05, + "loss": 0.6934, "step": 3737 }, { - "epoch": 1.3649808289209422, - "grad_norm": 1.0406373739242554, - "learning_rate": 1.3628971944994407e-05, - "loss": 0.829, + "epoch": 0.6485079805690492, + "grad_norm": 1.191148042678833, + "learning_rate": 3.766704847672018e-05, + "loss": 0.8088, "step": 3738 }, { - "epoch": 1.3653459923315685, - "grad_norm": 1.0436930656433105, - "learning_rate": 1.3615375716341893e-05, - "loss": 0.8474, + "epoch": 0.6486814712005552, + "grad_norm": 0.8841029405593872, + "learning_rate": 3.7663854014238344e-05, + "loss": 0.6914, "step": 3739 }, { - "epoch": 1.3657111557421946, - "grad_norm": 1.1990852355957031, - "learning_rate": 1.3601782772322368e-05, - "loss": 0.8047, + "epoch": 0.648854961832061, + "grad_norm": 1.067733645439148, + "learning_rate": 3.766065750184566e-05, + "loss": 0.6829, "step": 3740 }, { - "epoch": 1.366076319152821, - "grad_norm": 1.0162107944488525, - "learning_rate": 1.3588193119928868e-05, - "loss": 0.8726, + "epoch": 0.649028452463567, + "grad_norm": 0.8427601456642151, + "learning_rate": 3.765745893991309e-05, + "loss": 0.5979, "step": 3741 }, { - "epoch": 1.3664414825634472, - "grad_norm": 1.6443133354187012, - "learning_rate": 1.3574606766152712e-05, - "loss": 0.7739, + "epoch": 0.6492019430950728, + "grad_norm": 1.0116441249847412, + "learning_rate": 3.7654258328811856e-05, + "loss": 0.7019, "step": 3742 }, { - "epoch": 1.3668066459740733, - "grad_norm": 1.1415188312530518, - "learning_rate": 1.3561023717983541e-05, - "loss": 0.8154, + "epoch": 0.6493754337265788, + "grad_norm": 1.0613133907318115, + "learning_rate": 3.765105566891335e-05, + "loss": 0.658, "step": 3743 }, { - "epoch": 1.3671718093846996, - "grad_norm": 1.2959623336791992, - "learning_rate": 1.3547443982409291e-05, - "loss": 0.8142, + "epoch": 0.6495489243580846, + "grad_norm": 0.7282751798629761, + "learning_rate": 3.764785096058927e-05, + "loss": 0.7343, "step": 3744 }, { - "epoch": 1.367536972795326, - "grad_norm": 1.7988396883010864, - "learning_rate": 1.3533867566416184e-05, - "loss": 0.8052, + "epoch": 0.6497224149895906, + "grad_norm": 1.6192538738250732, + "learning_rate": 3.764464420421153e-05, + "loss": 0.7822, "step": 3745 }, { - "epoch": 1.3679021362059522, - "grad_norm": 1.1665593385696411, - "learning_rate": 1.3520294476988747e-05, - "loss": 0.8698, + "epoch": 0.6498959056210964, + "grad_norm": 0.8762547969818115, + "learning_rate": 3.764143540015227e-05, + "loss": 0.8123, "step": 3746 }, { - "epoch": 1.3682672996165783, - "grad_norm": 1.2495356798171997, - "learning_rate": 1.3506724721109792e-05, - "loss": 0.8538, + "epoch": 0.6500693962526024, + "grad_norm": 1.0434865951538086, + "learning_rate": 3.763822454878387e-05, + "loss": 0.8167, "step": 3747 }, { - "epoch": 1.3686324630272046, - "grad_norm": 0.9520424008369446, - "learning_rate": 1.3493158305760401e-05, - "loss": 0.8369, + "epoch": 0.6502428868841083, + "grad_norm": 0.8376615047454834, + "learning_rate": 3.763501165047896e-05, + "loss": 0.8008, "step": 3748 }, { - "epoch": 1.368997626437831, - "grad_norm": 1.2787840366363525, - "learning_rate": 1.3479595237919963e-05, - "loss": 0.8365, + "epoch": 0.6504163775156142, + "grad_norm": 1.7960706949234009, + "learning_rate": 3.76317967056104e-05, + "loss": 0.6322, "step": 3749 }, { - "epoch": 1.369362789848457, - "grad_norm": 0.8628684878349304, - "learning_rate": 1.346603552456612e-05, - "loss": 0.7989, + "epoch": 0.65058986814712, + "grad_norm": 0.7618784308433533, + "learning_rate": 3.7628579714551285e-05, + "loss": 0.8257, "step": 3750 }, { - "epoch": 1.3697279532590834, - "grad_norm": 1.1261874437332153, - "learning_rate": 1.3452479172674817e-05, - "loss": 0.809, + "epoch": 0.6507633587786259, + "grad_norm": 1.1690599918365479, + "learning_rate": 3.762536067767495e-05, + "loss": 0.9697, "step": 3751 }, { - "epoch": 1.3700931166697097, - "grad_norm": 0.9201136231422424, - "learning_rate": 1.3438926189220239e-05, - "loss": 0.8285, + "epoch": 0.6509368494101319, + "grad_norm": 0.7351138591766357, + "learning_rate": 3.7622139595354976e-05, + "loss": 0.8718, "step": 3752 }, { - "epoch": 1.370458280080336, - "grad_norm": 0.7692937850952148, - "learning_rate": 1.3425376581174855e-05, - "loss": 0.8373, + "epoch": 0.6511103400416377, + "grad_norm": 1.0308220386505127, + "learning_rate": 3.761891646796517e-05, + "loss": 0.6714, "step": 3753 }, { - "epoch": 1.3708234434909623, - "grad_norm": 1.1584373712539673, - "learning_rate": 1.34118303555094e-05, - "loss": 0.8286, + "epoch": 0.6512838306731437, + "grad_norm": 1.0136557817459106, + "learning_rate": 3.7615691295879574e-05, + "loss": 0.6675, "step": 3754 }, { - "epoch": 1.3711886069015884, - "grad_norm": 1.286441445350647, - "learning_rate": 1.3398287519192858e-05, - "loss": 0.8193, + "epoch": 0.6514573213046495, + "grad_norm": 0.925405740737915, + "learning_rate": 3.7612464079472474e-05, + "loss": 0.8674, "step": 3755 }, { - "epoch": 1.3715537703122147, - "grad_norm": 0.9206037521362305, - "learning_rate": 1.3384748079192482e-05, - "loss": 0.8377, + "epoch": 0.6516308119361555, + "grad_norm": 1.1010034084320068, + "learning_rate": 3.76092348191184e-05, + "loss": 0.7651, "step": 3756 }, { - "epoch": 1.371918933722841, - "grad_norm": 1.353319525718689, - "learning_rate": 1.337121204247376e-05, - "loss": 0.8502, + "epoch": 0.6518043025676613, + "grad_norm": 0.8915197253227234, + "learning_rate": 3.7606003515192103e-05, + "loss": 0.8638, "step": 3757 }, { - "epoch": 1.372284097133467, - "grad_norm": 1.2221179008483887, - "learning_rate": 1.335767941600045e-05, - "loss": 0.8064, + "epoch": 0.6519777931991673, + "grad_norm": 0.9036543965339661, + "learning_rate": 3.7602770168068586e-05, + "loss": 0.7791, "step": 3758 }, { - "epoch": 1.3726492605440934, - "grad_norm": 0.9800985455513, - "learning_rate": 1.3344150206734537e-05, - "loss": 0.8522, + "epoch": 0.6521512838306731, + "grad_norm": 0.9122968316078186, + "learning_rate": 3.7599534778123074e-05, + "loss": 0.7733, "step": 3759 }, { - "epoch": 1.3730144239547197, - "grad_norm": 0.8817344307899475, - "learning_rate": 1.3330624421636265e-05, - "loss": 0.791, + "epoch": 0.6523247744621791, + "grad_norm": 0.8854377865791321, + "learning_rate": 3.759629734573105e-05, + "loss": 0.7839, "step": 3760 }, { - "epoch": 1.373379587365346, - "grad_norm": 1.1487219333648682, - "learning_rate": 1.3317102067664104e-05, - "loss": 0.798, + "epoch": 0.6524982650936849, + "grad_norm": 0.8214502334594727, + "learning_rate": 3.759305787126821e-05, + "loss": 0.8472, "step": 3761 }, { - "epoch": 1.3737447507759724, - "grad_norm": 1.2145106792449951, - "learning_rate": 1.3303583151774758e-05, - "loss": 0.8506, + "epoch": 0.6526717557251909, + "grad_norm": 0.8351401090621948, + "learning_rate": 3.758981635511051e-05, + "loss": 0.8635, "step": 3762 }, { - "epoch": 1.3741099141865984, - "grad_norm": 1.560409665107727, - "learning_rate": 1.3290067680923169e-05, - "loss": 0.8506, + "epoch": 0.6528452463566967, + "grad_norm": 0.8662011623382568, + "learning_rate": 3.758657279763412e-05, + "loss": 0.9226, "step": 3763 }, { - "epoch": 1.3744750775972248, - "grad_norm": 0.9840587973594666, - "learning_rate": 1.3276555662062503e-05, - "loss": 0.8496, + "epoch": 0.6530187369882027, + "grad_norm": 0.9462679624557495, + "learning_rate": 3.758332719921547e-05, + "loss": 0.7952, "step": 3764 }, { - "epoch": 1.374840241007851, - "grad_norm": 1.1929097175598145, - "learning_rate": 1.3263047102144154e-05, - "loss": 0.8196, + "epoch": 0.6531922276197085, + "grad_norm": 0.8452800512313843, + "learning_rate": 3.758007956023121e-05, + "loss": 0.7428, "step": 3765 }, { - "epoch": 1.3752054044184772, - "grad_norm": 1.101738452911377, - "learning_rate": 1.3249542008117737e-05, - "loss": 0.8218, + "epoch": 0.6533657182512145, + "grad_norm": 1.190227746963501, + "learning_rate": 3.757682988105823e-05, + "loss": 0.8264, "step": 3766 }, { - "epoch": 1.3755705678291035, - "grad_norm": 1.3025932312011719, - "learning_rate": 1.3236040386931075e-05, - "loss": 0.7976, + "epoch": 0.6535392088827203, + "grad_norm": 1.0532466173171997, + "learning_rate": 3.757357816207366e-05, + "loss": 0.7207, "step": 3767 }, { - "epoch": 1.3759357312397298, - "grad_norm": 0.8629334568977356, - "learning_rate": 1.3222542245530217e-05, - "loss": 0.8183, + "epoch": 0.6537126995142263, + "grad_norm": 0.7011163234710693, + "learning_rate": 3.7570324403654866e-05, + "loss": 0.7344, "step": 3768 }, { - "epoch": 1.376300894650356, - "grad_norm": 1.3761518001556396, - "learning_rate": 1.3209047590859407e-05, - "loss": 0.8265, + "epoch": 0.6538861901457321, + "grad_norm": 1.0230262279510498, + "learning_rate": 3.756706860617945e-05, + "loss": 0.8252, "step": 3769 }, { - "epoch": 1.3766660580609824, - "grad_norm": 0.7708101272583008, - "learning_rate": 1.3195556429861112e-05, - "loss": 0.8575, + "epoch": 0.6540596807772381, + "grad_norm": 0.8177589178085327, + "learning_rate": 3.756381077002526e-05, + "loss": 0.8135, "step": 3770 }, { - "epoch": 1.3770312214716085, - "grad_norm": 1.1597199440002441, - "learning_rate": 1.3182068769475984e-05, - "loss": 0.8723, + "epoch": 0.6542331714087439, + "grad_norm": 1.3306204080581665, + "learning_rate": 3.756055089557036e-05, + "loss": 0.7562, "step": 3771 }, { - "epoch": 1.3773963848822348, - "grad_norm": 1.3086869716644287, - "learning_rate": 1.316858461664289e-05, - "loss": 0.8157, + "epoch": 0.6544066620402498, + "grad_norm": 0.8750917911529541, + "learning_rate": 3.755728898319306e-05, + "loss": 0.8508, "step": 3772 }, { - "epoch": 1.3777615482928611, - "grad_norm": 0.8759774565696716, - "learning_rate": 1.3155103978298882e-05, - "loss": 0.7443, + "epoch": 0.6545801526717557, + "grad_norm": 1.1924347877502441, + "learning_rate": 3.7554025033271923e-05, + "loss": 0.7006, "step": 3773 }, { - "epoch": 1.3781267117034872, - "grad_norm": 1.0069693326950073, - "learning_rate": 1.3141626861379215e-05, - "loss": 0.8337, + "epoch": 0.6547536433032616, + "grad_norm": 1.244933009147644, + "learning_rate": 3.7550759046185726e-05, + "loss": 0.6973, "step": 3774 }, { - "epoch": 1.3784918751141135, - "grad_norm": 0.8816486597061157, - "learning_rate": 1.3128153272817312e-05, - "loss": 0.8087, + "epoch": 0.6549271339347675, + "grad_norm": 0.9220853447914124, + "learning_rate": 3.754749102231349e-05, + "loss": 0.6799, "step": 3775 }, { - "epoch": 1.3788570385247398, - "grad_norm": 0.9522383809089661, - "learning_rate": 1.31146832195448e-05, - "loss": 0.8191, + "epoch": 0.6551006245662734, + "grad_norm": 0.8354787826538086, + "learning_rate": 3.7544220962034475e-05, + "loss": 0.7007, "step": 3776 }, { - "epoch": 1.3792222019353662, - "grad_norm": 1.176023244857788, - "learning_rate": 1.3101216708491482e-05, - "loss": 0.8354, + "epoch": 0.6552741151977793, + "grad_norm": 0.9305773973464966, + "learning_rate": 3.7540948865728174e-05, + "loss": 0.7957, "step": 3777 }, { - "epoch": 1.3795873653459925, - "grad_norm": 1.200738787651062, - "learning_rate": 1.308775374658534e-05, - "loss": 0.8429, + "epoch": 0.6554476058292852, + "grad_norm": 0.9184396862983704, + "learning_rate": 3.7537674733774315e-05, + "loss": 0.8643, "step": 3778 }, { - "epoch": 1.3799525287566186, - "grad_norm": 1.076651930809021, - "learning_rate": 1.3074294340752518e-05, - "loss": 0.8433, + "epoch": 0.6556210964607911, + "grad_norm": 0.7796075940132141, + "learning_rate": 3.7534398566552866e-05, + "loss": 0.8691, "step": 3779 }, { - "epoch": 1.3803176921672449, - "grad_norm": 1.0389529466629028, - "learning_rate": 1.306083849791735e-05, - "loss": 0.8043, + "epoch": 0.655794587092297, + "grad_norm": 0.766631543636322, + "learning_rate": 3.753112036444404e-05, + "loss": 0.7367, "step": 3780 }, { - "epoch": 1.380682855577871, - "grad_norm": 0.8898241519927979, - "learning_rate": 1.304738622500232e-05, - "loss": 0.8157, + "epoch": 0.655968077723803, + "grad_norm": 1.1207430362701416, + "learning_rate": 3.7527840127828256e-05, + "loss": 0.7074, "step": 3781 }, { - "epoch": 1.3810480189884973, - "grad_norm": 1.82668936252594, - "learning_rate": 1.3033937528928093e-05, - "loss": 0.7917, + "epoch": 0.6561415683553088, + "grad_norm": 0.7750427722930908, + "learning_rate": 3.752455785708622e-05, + "loss": 0.9193, "step": 3782 }, { - "epoch": 1.3814131823991236, - "grad_norm": 1.1979440450668335, - "learning_rate": 1.3020492416613468e-05, - "loss": 0.8192, + "epoch": 0.6563150589868147, + "grad_norm": 0.9121679663658142, + "learning_rate": 3.752127355259881e-05, + "loss": 0.6392, "step": 3783 }, { - "epoch": 1.38177834580975, - "grad_norm": 1.039689540863037, - "learning_rate": 1.3007050894975433e-05, - "loss": 0.8325, + "epoch": 0.6564885496183206, + "grad_norm": 0.9043782353401184, + "learning_rate": 3.7517987214747186e-05, + "loss": 0.6206, "step": 3784 }, { - "epoch": 1.3821435092203762, - "grad_norm": 1.2584363222122192, - "learning_rate": 1.2993612970929097e-05, - "loss": 0.8514, + "epoch": 0.6566620402498266, + "grad_norm": 1.176137089729309, + "learning_rate": 3.751469884391274e-05, + "loss": 0.7107, "step": 3785 }, { - "epoch": 1.3825086726310023, - "grad_norm": 1.1663841009140015, - "learning_rate": 1.2980178651387738e-05, - "loss": 0.8535, + "epoch": 0.6568355308813324, + "grad_norm": 0.7804182767868042, + "learning_rate": 3.751140844047708e-05, + "loss": 0.6965, "step": 3786 }, { - "epoch": 1.3828738360416286, - "grad_norm": 1.164734959602356, - "learning_rate": 1.2966747943262786e-05, - "loss": 0.8133, + "epoch": 0.6570090215128384, + "grad_norm": 0.9824159145355225, + "learning_rate": 3.750811600482207e-05, + "loss": 0.7922, "step": 3787 }, { - "epoch": 1.383238999452255, - "grad_norm": 1.0888620615005493, - "learning_rate": 1.2953320853463782e-05, - "loss": 0.7897, + "epoch": 0.6571825121443442, + "grad_norm": 1.7952382564544678, + "learning_rate": 3.7504821537329795e-05, + "loss": 0.8193, "step": 3788 }, { - "epoch": 1.383604162862881, - "grad_norm": 1.2994195222854614, - "learning_rate": 1.293989738889843e-05, - "loss": 0.7914, + "epoch": 0.6573560027758502, + "grad_norm": 0.674872875213623, + "learning_rate": 3.750152503838258e-05, + "loss": 0.865, "step": 3789 }, { - "epoch": 1.3839693262735073, - "grad_norm": 1.1686034202575684, - "learning_rate": 1.2926477556472573e-05, - "loss": 0.8242, + "epoch": 0.657529493407356, + "grad_norm": 0.9421955943107605, + "learning_rate": 3.7498226508362996e-05, + "loss": 0.7522, "step": 3790 }, { - "epoch": 1.3843344896841336, - "grad_norm": 0.8679730892181396, - "learning_rate": 1.2913061363090166e-05, - "loss": 0.8444, + "epoch": 0.6577029840388618, + "grad_norm": 0.9721437096595764, + "learning_rate": 3.7494925947653835e-05, + "loss": 0.6996, "step": 3791 }, { - "epoch": 1.38469965309476, - "grad_norm": 1.0104047060012817, - "learning_rate": 1.289964881565331e-05, - "loss": 0.8455, + "epoch": 0.6578764746703678, + "grad_norm": 0.8404197096824646, + "learning_rate": 3.749162335663813e-05, + "loss": 0.905, "step": 3792 }, { - "epoch": 1.3850648165053863, - "grad_norm": 1.1732920408248901, - "learning_rate": 1.2886239921062216e-05, - "loss": 0.8154, + "epoch": 0.6580499653018737, + "grad_norm": 0.8210102915763855, + "learning_rate": 3.7488318735699154e-05, + "loss": 0.8787, "step": 3793 }, { - "epoch": 1.3854299799160124, - "grad_norm": 1.2865604162216187, - "learning_rate": 1.2872834686215227e-05, - "loss": 0.8308, + "epoch": 0.6582234559333796, + "grad_norm": 0.8707680106163025, + "learning_rate": 3.7485012085220416e-05, + "loss": 0.855, "step": 3794 }, { - "epoch": 1.3857951433266387, - "grad_norm": 1.0825175046920776, - "learning_rate": 1.2859433118008796e-05, - "loss": 0.8542, + "epoch": 0.6583969465648855, + "grad_norm": 1.060255765914917, + "learning_rate": 3.7481703405585646e-05, + "loss": 0.6567, "step": 3795 }, { - "epoch": 1.386160306737265, - "grad_norm": 1.6838303804397583, - "learning_rate": 1.284603522333749e-05, - "loss": 0.8218, + "epoch": 0.6585704371963914, + "grad_norm": 0.7925611138343811, + "learning_rate": 3.747839269717882e-05, + "loss": 0.7585, "step": 3796 }, { - "epoch": 1.386525470147891, - "grad_norm": 1.7537041902542114, - "learning_rate": 1.2832641009093995e-05, - "loss": 0.7961, + "epoch": 0.6587439278278973, + "grad_norm": 0.7592456936836243, + "learning_rate": 3.747507996038416e-05, + "loss": 0.7076, "step": 3797 }, { - "epoch": 1.3868906335585174, - "grad_norm": 1.1526859998703003, - "learning_rate": 1.281925048216909e-05, - "loss": 0.8328, + "epoch": 0.6589174184594032, + "grad_norm": 0.7548115253448486, + "learning_rate": 3.7471765195586115e-05, + "loss": 0.6542, "step": 3798 }, { - "epoch": 1.3872557969691437, - "grad_norm": 1.8621759414672852, - "learning_rate": 1.2805863649451671e-05, - "loss": 0.7786, + "epoch": 0.6590909090909091, + "grad_norm": 0.8299365043640137, + "learning_rate": 3.746844840316935e-05, + "loss": 0.7263, "step": 3799 }, { - "epoch": 1.38762096037977, - "grad_norm": 0.8780944347381592, - "learning_rate": 1.2792480517828714e-05, - "loss": 0.8269, + "epoch": 0.659264399722415, + "grad_norm": 1.0825649499893188, + "learning_rate": 3.74651295835188e-05, + "loss": 0.6587, "step": 3800 }, { - "epoch": 1.3879861237903963, - "grad_norm": 1.2219297885894775, - "learning_rate": 1.2779101094185322e-05, - "loss": 0.7804, + "epoch": 0.6594378903539209, + "grad_norm": 1.1731160879135132, + "learning_rate": 3.7461808737019606e-05, + "loss": 0.6519, "step": 3801 }, { - "epoch": 1.3883512872010224, - "grad_norm": 0.7974051237106323, - "learning_rate": 1.2765725385404655e-05, - "loss": 0.8309, + "epoch": 0.6596113809854268, + "grad_norm": 0.8041303753852844, + "learning_rate": 3.745848586405717e-05, + "loss": 0.8374, "step": 3802 }, { - "epoch": 1.3887164506116487, - "grad_norm": 1.0959047079086304, - "learning_rate": 1.2752353398367982e-05, - "loss": 0.7711, + "epoch": 0.6597848716169327, + "grad_norm": 0.851334273815155, + "learning_rate": 3.745516096501709e-05, + "loss": 0.6613, "step": 3803 }, { - "epoch": 1.389081614022275, - "grad_norm": 1.1830520629882812, - "learning_rate": 1.2738985139954658e-05, - "loss": 0.8333, + "epoch": 0.6599583622484386, + "grad_norm": 1.6367579698562622, + "learning_rate": 3.745183404028525e-05, + "loss": 0.6007, "step": 3804 }, { - "epoch": 1.3894467774329011, - "grad_norm": 0.9805305600166321, - "learning_rate": 1.272562061704211e-05, - "loss": 0.8199, + "epoch": 0.6601318528799445, + "grad_norm": 1.2259317636489868, + "learning_rate": 3.744850509024774e-05, + "loss": 0.717, "step": 3805 }, { - "epoch": 1.3898119408435274, - "grad_norm": 1.0992192029953003, - "learning_rate": 1.2712259836505854e-05, - "loss": 0.8134, + "epoch": 0.6603053435114504, + "grad_norm": 1.5810184478759766, + "learning_rate": 3.7445174115290875e-05, + "loss": 0.7078, "step": 3806 }, { - "epoch": 1.3901771042541538, - "grad_norm": 1.1118355989456177, - "learning_rate": 1.269890280521947e-05, - "loss": 0.7963, + "epoch": 0.6604788341429563, + "grad_norm": 2.2267251014709473, + "learning_rate": 3.744184111580123e-05, + "loss": 0.6948, "step": 3807 }, { - "epoch": 1.39054226766478, - "grad_norm": 1.1043366193771362, - "learning_rate": 1.2685549530054617e-05, - "loss": 0.8492, + "epoch": 0.6606523247744622, + "grad_norm": 0.8869484066963196, + "learning_rate": 3.74385060921656e-05, + "loss": 0.717, "step": 3808 }, { - "epoch": 1.3909074310754064, - "grad_norm": 1.165765404701233, - "learning_rate": 1.2672200017881027e-05, - "loss": 0.8029, + "epoch": 0.6608258154059681, + "grad_norm": 1.1639631986618042, + "learning_rate": 3.7435169044771016e-05, + "loss": 0.7231, "step": 3809 }, { - "epoch": 1.3912725944860325, - "grad_norm": 1.068179965019226, - "learning_rate": 1.2658854275566475e-05, - "loss": 0.834, + "epoch": 0.6609993060374739, + "grad_norm": 0.8588725328445435, + "learning_rate": 3.743182997400475e-05, + "loss": 0.7615, "step": 3810 }, { - "epoch": 1.3916377578966588, - "grad_norm": 1.3448392152786255, - "learning_rate": 1.264551230997682e-05, - "loss": 0.7969, + "epoch": 0.6611727966689799, + "grad_norm": 0.9964105486869812, + "learning_rate": 3.74284888802543e-05, + "loss": 0.619, "step": 3811 }, { - "epoch": 1.392002921307285, - "grad_norm": 1.6202325820922852, - "learning_rate": 1.2632174127975963e-05, - "loss": 0.8257, + "epoch": 0.6613462873004857, + "grad_norm": 1.0834521055221558, + "learning_rate": 3.742514576390741e-05, + "loss": 0.7012, "step": 3812 }, { - "epoch": 1.3923680847179112, - "grad_norm": 1.2316148281097412, - "learning_rate": 1.2618839736425867e-05, - "loss": 0.8003, + "epoch": 0.6615197779319917, + "grad_norm": 0.7681102156639099, + "learning_rate": 3.742180062535205e-05, + "loss": 0.8281, "step": 3813 }, { - "epoch": 1.3927332481285375, - "grad_norm": 1.2874771356582642, - "learning_rate": 1.2605509142186543e-05, - "loss": 0.845, + "epoch": 0.6616932685634975, + "grad_norm": 0.8053924441337585, + "learning_rate": 3.741845346497643e-05, + "loss": 0.9214, "step": 3814 }, { - "epoch": 1.3930984115391638, - "grad_norm": 1.063185691833496, - "learning_rate": 1.2592182352116037e-05, - "loss": 0.8102, + "epoch": 0.6618667591950035, + "grad_norm": 0.8515164256095886, + "learning_rate": 3.741510428316898e-05, + "loss": 0.6659, "step": 3815 }, { - "epoch": 1.3934635749497901, - "grad_norm": 0.9673600792884827, - "learning_rate": 1.2578859373070453e-05, - "loss": 0.8214, + "epoch": 0.6620402498265093, + "grad_norm": 1.133805751800537, + "learning_rate": 3.741175308031839e-05, + "loss": 0.7008, "step": 3816 }, { - "epoch": 1.3938287383604162, - "grad_norm": 0.7296835780143738, - "learning_rate": 1.2565540211903931e-05, - "loss": 0.8009, + "epoch": 0.6622137404580153, + "grad_norm": 0.7388227581977844, + "learning_rate": 3.7408399856813565e-05, + "loss": 0.7917, "step": 3817 }, { - "epoch": 1.3941939017710425, - "grad_norm": 0.8433011174201965, - "learning_rate": 1.2552224875468642e-05, - "loss": 0.8206, + "epoch": 0.6623872310895211, + "grad_norm": 0.7120879292488098, + "learning_rate": 3.740504461304366e-05, + "loss": 0.9138, "step": 3818 }, { - "epoch": 1.3945590651816688, - "grad_norm": 1.112445592880249, - "learning_rate": 1.2538913370614795e-05, - "loss": 0.839, + "epoch": 0.6625607217210271, + "grad_norm": 0.9715093970298767, + "learning_rate": 3.7401687349398044e-05, + "loss": 0.6599, "step": 3819 }, { - "epoch": 1.394924228592295, - "grad_norm": 1.1870442628860474, - "learning_rate": 1.2525605704190622e-05, - "loss": 0.8629, + "epoch": 0.6627342123525329, + "grad_norm": 1.0262633562088013, + "learning_rate": 3.739832806626632e-05, + "loss": 0.6147, "step": 3820 }, { - "epoch": 1.3952893920029212, - "grad_norm": 1.0724048614501953, - "learning_rate": 1.251230188304239e-05, - "loss": 0.8243, + "epoch": 0.6629077029840389, + "grad_norm": 0.6030563116073608, + "learning_rate": 3.7394966764038366e-05, + "loss": 0.8906, "step": 3821 }, { - "epoch": 1.3956545554135475, - "grad_norm": 1.1925603151321411, - "learning_rate": 1.2499001914014373e-05, - "loss": 0.8627, + "epoch": 0.6630811936155447, + "grad_norm": 0.8427760004997253, + "learning_rate": 3.7391603443104244e-05, + "loss": 0.793, "step": 3822 }, { - "epoch": 1.3960197188241739, - "grad_norm": 1.1308726072311401, - "learning_rate": 1.2485705803948877e-05, - "loss": 0.8453, + "epoch": 0.6632546842470507, + "grad_norm": 0.7783617973327637, + "learning_rate": 3.738823810385428e-05, + "loss": 0.7859, "step": 3823 }, { - "epoch": 1.3963848822348002, - "grad_norm": 1.021034598350525, - "learning_rate": 1.2472413559686212e-05, - "loss": 0.8048, + "epoch": 0.6634281748785565, + "grad_norm": 0.7392114996910095, + "learning_rate": 3.738487074667902e-05, + "loss": 0.8523, "step": 3824 }, { - "epoch": 1.3967500456454263, - "grad_norm": 0.9815759658813477, - "learning_rate": 1.2459125188064713e-05, - "loss": 0.7723, + "epoch": 0.6636016655100625, + "grad_norm": 0.7338570952415466, + "learning_rate": 3.738150137196925e-05, + "loss": 0.8982, "step": 3825 }, { - "epoch": 1.3971152090560526, - "grad_norm": 1.2795416116714478, - "learning_rate": 1.24458406959207e-05, - "loss": 0.8246, + "epoch": 0.6637751561415683, + "grad_norm": 0.8131448030471802, + "learning_rate": 3.7378129980116e-05, + "loss": 0.8416, "step": 3826 }, { - "epoch": 1.3974803724666789, - "grad_norm": 1.3675626516342163, - "learning_rate": 1.2432560090088533e-05, - "loss": 0.8348, + "epoch": 0.6639486467730743, + "grad_norm": 0.841201663017273, + "learning_rate": 3.737475657151051e-05, + "loss": 0.8513, "step": 3827 }, { - "epoch": 1.397845535877305, - "grad_norm": 1.1099711656570435, - "learning_rate": 1.241928337740053e-05, - "loss": 0.7707, + "epoch": 0.6641221374045801, + "grad_norm": 0.855930507183075, + "learning_rate": 3.7371381146544276e-05, + "loss": 0.8198, "step": 3828 }, { - "epoch": 1.3982106992879313, - "grad_norm": 1.0689696073532104, - "learning_rate": 1.240601056468703e-05, - "loss": 0.8312, + "epoch": 0.6642956280360861, + "grad_norm": 0.9803434610366821, + "learning_rate": 3.736800370560902e-05, + "loss": 0.7905, "step": 3829 }, { - "epoch": 1.3985758626985576, - "grad_norm": 1.0764185190200806, - "learning_rate": 1.2392741658776368e-05, - "loss": 0.8252, + "epoch": 0.664469118667592, + "grad_norm": 1.885951280593872, + "learning_rate": 3.736462424909669e-05, + "loss": 0.6997, "step": 3830 }, { - "epoch": 1.398941026109184, - "grad_norm": 1.4795631170272827, - "learning_rate": 1.2379476666494866e-05, - "loss": 0.847, + "epoch": 0.6646426092990978, + "grad_norm": 0.9760525226593018, + "learning_rate": 3.736124277739949e-05, + "loss": 0.7891, "step": 3831 }, { - "epoch": 1.3993061895198102, - "grad_norm": 1.2912445068359375, - "learning_rate": 1.2366215594666822e-05, - "loss": 0.8577, + "epoch": 0.6648160999306038, + "grad_norm": 1.0914124250411987, + "learning_rate": 3.735785929090983e-05, + "loss": 0.6689, "step": 3832 }, { - "epoch": 1.3996713529304363, - "grad_norm": 0.9995322823524475, - "learning_rate": 1.2352958450114532e-05, - "loss": 0.834, + "epoch": 0.6649895905621096, + "grad_norm": 0.7972826957702637, + "learning_rate": 3.7354473790020375e-05, + "loss": 0.8406, "step": 3833 }, { - "epoch": 1.4000365163410626, - "grad_norm": 1.1202073097229004, - "learning_rate": 1.2339705239658262e-05, - "loss": 0.8222, + "epoch": 0.6651630811936156, + "grad_norm": 0.8276130557060242, + "learning_rate": 3.7351086275124023e-05, + "loss": 0.8264, "step": 3834 }, { - "epoch": 1.400401679751689, - "grad_norm": 1.0936659574508667, - "learning_rate": 1.232645597011626e-05, - "loss": 0.8395, + "epoch": 0.6653365718251214, + "grad_norm": 0.6808685064315796, + "learning_rate": 3.734769674661388e-05, + "loss": 0.9221, "step": 3835 }, { - "epoch": 1.400766843162315, - "grad_norm": 1.4010112285614014, - "learning_rate": 1.2313210648304739e-05, - "loss": 0.866, + "epoch": 0.6655100624566274, + "grad_norm": 0.962364673614502, + "learning_rate": 3.7344305204883326e-05, + "loss": 0.7336, "step": 3836 }, { - "epoch": 1.4011320065729413, - "grad_norm": 1.0923198461532593, - "learning_rate": 1.229996928103789e-05, - "loss": 0.8247, + "epoch": 0.6656835530881332, + "grad_norm": 0.6433223485946655, + "learning_rate": 3.7340911650325934e-05, + "loss": 0.8499, "step": 3837 }, { - "epoch": 1.4014971699835677, - "grad_norm": 1.1804614067077637, - "learning_rate": 1.2286731875127858e-05, - "loss": 0.8074, + "epoch": 0.6658570437196392, + "grad_norm": 0.7941041588783264, + "learning_rate": 3.7337516083335536e-05, + "loss": 0.7908, "step": 3838 }, { - "epoch": 1.401862333394194, - "grad_norm": 1.0767707824707031, - "learning_rate": 1.2273498437384763e-05, - "loss": 0.8232, + "epoch": 0.666030534351145, + "grad_norm": 0.6979531645774841, + "learning_rate": 3.73341185043062e-05, + "loss": 0.7617, "step": 3839 }, { - "epoch": 1.4022274968048203, - "grad_norm": 1.137485146522522, - "learning_rate": 1.2260268974616683e-05, - "loss": 0.8627, + "epoch": 0.666204024982651, + "grad_norm": 1.3611345291137695, + "learning_rate": 3.7330718913632215e-05, + "loss": 0.8579, "step": 3840 }, { - "epoch": 1.4025926602154464, - "grad_norm": 1.0327794551849365, - "learning_rate": 1.224704349362964e-05, - "loss": 0.8381, + "epoch": 0.6663775156141568, + "grad_norm": 0.8756178021430969, + "learning_rate": 3.7327317311708095e-05, + "loss": 0.7883, "step": 3841 }, { - "epoch": 1.4029578236260727, - "grad_norm": 0.9799919724464417, - "learning_rate": 1.2233822001227606e-05, - "loss": 0.8301, + "epoch": 0.6665510062456628, + "grad_norm": 1.2878376245498657, + "learning_rate": 3.732391369892862e-05, + "loss": 0.7476, "step": 3842 }, { - "epoch": 1.403322987036699, - "grad_norm": 1.4156063795089722, - "learning_rate": 1.2220604504212519e-05, - "loss": 0.8223, + "epoch": 0.6667244968771686, + "grad_norm": 0.8387046456336975, + "learning_rate": 3.732050807568878e-05, + "loss": 0.7648, "step": 3843 }, { - "epoch": 1.403688150447325, - "grad_norm": 1.173728108406067, - "learning_rate": 1.2207391009384244e-05, - "loss": 0.8044, + "epoch": 0.6668979875086746, + "grad_norm": 0.6943221688270569, + "learning_rate": 3.731710044238378e-05, + "loss": 0.6526, "step": 3844 }, { - "epoch": 1.4040533138579514, - "grad_norm": 1.2133715152740479, - "learning_rate": 1.2194181523540601e-05, - "loss": 0.8428, + "epoch": 0.6670714781401804, + "grad_norm": 1.002894401550293, + "learning_rate": 3.731369079940911e-05, + "loss": 0.6348, "step": 3845 }, { - "epoch": 1.4044184772685777, - "grad_norm": 1.3543936014175415, - "learning_rate": 1.2180976053477332e-05, - "loss": 0.8439, + "epoch": 0.6672449687716864, + "grad_norm": 0.8670671582221985, + "learning_rate": 3.731027914716044e-05, + "loss": 0.6848, "step": 3846 }, { - "epoch": 1.404783640679204, - "grad_norm": 1.162394404411316, - "learning_rate": 1.2167774605988126e-05, - "loss": 0.8466, + "epoch": 0.6674184594031922, + "grad_norm": 1.08168363571167, + "learning_rate": 3.73068654860337e-05, + "loss": 0.6893, "step": 3847 }, { - "epoch": 1.4051488040898301, - "grad_norm": 1.513676404953003, - "learning_rate": 1.2154577187864595e-05, - "loss": 0.7777, + "epoch": 0.6675919500346982, + "grad_norm": 1.1559993028640747, + "learning_rate": 3.7303449816425066e-05, + "loss": 0.7119, "step": 3848 }, { - "epoch": 1.4055139675004564, - "grad_norm": 1.1374335289001465, - "learning_rate": 1.214138380589629e-05, - "loss": 0.8596, + "epoch": 0.667765440666204, + "grad_norm": 1.3283518552780151, + "learning_rate": 3.730003213873091e-05, + "loss": 0.8468, "step": 3849 }, { - "epoch": 1.4058791309110827, - "grad_norm": 0.8928597569465637, - "learning_rate": 1.2128194466870666e-05, - "loss": 0.8567, + "epoch": 0.6679389312977099, + "grad_norm": 0.9709506630897522, + "learning_rate": 3.729661245334787e-05, + "loss": 0.6838, "step": 3850 }, { - "epoch": 1.4062442943217088, - "grad_norm": 0.9707872271537781, - "learning_rate": 1.2115009177573112e-05, - "loss": 0.8073, + "epoch": 0.6681124219292158, + "grad_norm": 0.9223878979682922, + "learning_rate": 3.72931907606728e-05, + "loss": 0.7853, "step": 3851 }, { - "epoch": 1.4066094577323351, - "grad_norm": 1.1005405187606812, - "learning_rate": 1.2101827944786936e-05, - "loss": 0.7885, + "epoch": 0.6682859125607217, + "grad_norm": 0.9174668788909912, + "learning_rate": 3.728976706110278e-05, + "loss": 0.7831, "step": 3852 }, { - "epoch": 1.4069746211429615, - "grad_norm": 1.4502025842666626, - "learning_rate": 1.2088650775293344e-05, - "loss": 0.8624, + "epoch": 0.6684594031922276, + "grad_norm": 0.8756910562515259, + "learning_rate": 3.7286341355035155e-05, + "loss": 0.7834, "step": 3853 }, { - "epoch": 1.4073397845535878, - "grad_norm": 1.390778660774231, - "learning_rate": 1.2075477675871472e-05, - "loss": 0.8064, + "epoch": 0.6686328938237335, + "grad_norm": 0.9817021489143372, + "learning_rate": 3.7282913642867484e-05, + "loss": 0.7141, "step": 3854 }, { - "epoch": 1.407704947964214, - "grad_norm": 1.244486927986145, - "learning_rate": 1.2062308653298343e-05, - "loss": 0.7983, + "epoch": 0.6688063844552394, + "grad_norm": 1.0360482931137085, + "learning_rate": 3.7279483924997534e-05, + "loss": 0.8885, "step": 3855 }, { - "epoch": 1.4080701113748402, - "grad_norm": 1.1478341817855835, - "learning_rate": 1.2049143714348886e-05, - "loss": 0.8085, + "epoch": 0.6689798750867453, + "grad_norm": 0.8886391520500183, + "learning_rate": 3.727605220182334e-05, + "loss": 0.7074, "step": 3856 }, { - "epoch": 1.4084352747854665, - "grad_norm": 1.1413037776947021, - "learning_rate": 1.2035982865795944e-05, - "loss": 0.8221, + "epoch": 0.6691533657182512, + "grad_norm": 0.8684951663017273, + "learning_rate": 3.727261847374316e-05, + "loss": 0.6915, "step": 3857 }, { - "epoch": 1.4088004381960928, - "grad_norm": 1.2539416551589966, - "learning_rate": 1.202282611441024e-05, - "loss": 0.8167, + "epoch": 0.6693268563497571, + "grad_norm": 0.7517601847648621, + "learning_rate": 3.726918274115548e-05, + "loss": 0.6985, "step": 3858 }, { - "epoch": 1.4091656016067189, - "grad_norm": 2.119509220123291, - "learning_rate": 1.20096734669604e-05, - "loss": 0.8508, + "epoch": 0.669500346981263, + "grad_norm": 0.654456615447998, + "learning_rate": 3.726574500445902e-05, + "loss": 0.7681, "step": 3859 }, { - "epoch": 1.4095307650173452, - "grad_norm": 0.8477818965911865, - "learning_rate": 1.1996524930212921e-05, - "loss": 0.7839, + "epoch": 0.6696738376127689, + "grad_norm": 0.8097657561302185, + "learning_rate": 3.726230526405273e-05, + "loss": 0.8584, "step": 3860 }, { - "epoch": 1.4098959284279715, - "grad_norm": 1.144994854927063, - "learning_rate": 1.198338051093221e-05, - "loss": 0.8378, + "epoch": 0.6698473282442748, + "grad_norm": 2.5083086490631104, + "learning_rate": 3.7258863520335804e-05, + "loss": 0.6752, "step": 3861 }, { - "epoch": 1.4102610918385978, - "grad_norm": 1.1481770277023315, - "learning_rate": 1.197024021588054e-05, - "loss": 0.7955, + "epoch": 0.6700208188757807, + "grad_norm": 0.8640668392181396, + "learning_rate": 3.725541977370765e-05, + "loss": 0.7527, "step": 3862 }, { - "epoch": 1.4106262552492241, - "grad_norm": 1.590401291847229, - "learning_rate": 1.1957104051818063e-05, - "loss": 0.8413, + "epoch": 0.6701943095072866, + "grad_norm": 1.1906371116638184, + "learning_rate": 3.725197402456793e-05, + "loss": 0.6401, "step": 3863 }, { - "epoch": 1.4109914186598502, - "grad_norm": 0.7769972681999207, - "learning_rate": 1.1943972025502815e-05, - "loss": 0.8173, + "epoch": 0.6703678001387925, + "grad_norm": 0.9314893484115601, + "learning_rate": 3.7248526273316524e-05, + "loss": 0.6782, "step": 3864 }, { - "epoch": 1.4113565820704765, - "grad_norm": 1.26297926902771, - "learning_rate": 1.1930844143690686e-05, - "loss": 0.8741, + "epoch": 0.6705412907702984, + "grad_norm": 0.7665075659751892, + "learning_rate": 3.724507652035354e-05, + "loss": 0.7256, "step": 3865 }, { - "epoch": 1.4117217454811029, - "grad_norm": 1.0686346292495728, - "learning_rate": 1.1917720413135454e-05, - "loss": 0.838, + "epoch": 0.6707147814018043, + "grad_norm": 1.3357821702957153, + "learning_rate": 3.724162476607933e-05, + "loss": 0.7422, "step": 3866 }, { - "epoch": 1.412086908891729, - "grad_norm": 1.098856806755066, - "learning_rate": 1.1904600840588752e-05, - "loss": 0.7977, + "epoch": 0.6708882720333103, + "grad_norm": 0.8044497966766357, + "learning_rate": 3.723817101089448e-05, + "loss": 0.8831, "step": 3867 }, { - "epoch": 1.4124520723023553, - "grad_norm": 0.9571990966796875, - "learning_rate": 1.189148543280006e-05, - "loss": 0.8398, + "epoch": 0.6710617626648161, + "grad_norm": 0.8715711832046509, + "learning_rate": 3.72347152551998e-05, + "loss": 0.764, "step": 3868 }, { - "epoch": 1.4128172357129816, - "grad_norm": 1.0079846382141113, - "learning_rate": 1.1878374196516745e-05, - "loss": 0.8445, + "epoch": 0.6712352532963219, + "grad_norm": 0.9304384589195251, + "learning_rate": 3.723125749939633e-05, + "loss": 0.7784, "step": 3869 }, { - "epoch": 1.4131823991236079, - "grad_norm": 0.9652655124664307, - "learning_rate": 1.1865267138484e-05, - "loss": 0.8392, + "epoch": 0.6714087439278279, + "grad_norm": 2.2010717391967773, + "learning_rate": 3.722779774388535e-05, + "loss": 0.8213, "step": 3870 }, { - "epoch": 1.4135475625342342, - "grad_norm": 1.3001506328582764, - "learning_rate": 1.185216426544489e-05, - "loss": 0.7789, + "epoch": 0.6715822345593337, + "grad_norm": 0.7881472706794739, + "learning_rate": 3.722433598906836e-05, + "loss": 0.7673, "step": 3871 }, { - "epoch": 1.4139127259448603, - "grad_norm": 1.3654508590698242, - "learning_rate": 1.1839065584140308e-05, - "loss": 0.8293, + "epoch": 0.6717557251908397, + "grad_norm": 0.9250732660293579, + "learning_rate": 3.722087223534711e-05, + "loss": 0.6057, "step": 3872 }, { - "epoch": 1.4142778893554866, - "grad_norm": 0.9310105443000793, - "learning_rate": 1.1825971101309007e-05, - "loss": 0.8198, + "epoch": 0.6719292158223455, + "grad_norm": 1.2116553783416748, + "learning_rate": 3.7217406483123575e-05, + "loss": 0.7881, "step": 3873 }, { - "epoch": 1.414643052766113, - "grad_norm": 1.0619949102401733, - "learning_rate": 1.1812880823687574e-05, - "loss": 0.7897, + "epoch": 0.6721027064538515, + "grad_norm": 0.8310737609863281, + "learning_rate": 3.721393873279996e-05, + "loss": 0.715, "step": 3874 }, { - "epoch": 1.415008216176739, - "grad_norm": 1.6291648149490356, - "learning_rate": 1.1799794758010425e-05, - "loss": 0.8239, + "epoch": 0.6722761970853574, + "grad_norm": 1.0040377378463745, + "learning_rate": 3.721046898477869e-05, + "loss": 0.7737, "step": 3875 }, { - "epoch": 1.4153733795873653, - "grad_norm": 1.0706971883773804, - "learning_rate": 1.1786712911009821e-05, - "loss": 0.8291, + "epoch": 0.6724496877168633, + "grad_norm": 0.8445886969566345, + "learning_rate": 3.720699723946244e-05, + "loss": 0.7719, "step": 3876 }, { - "epoch": 1.4157385429979916, - "grad_norm": 1.0782705545425415, - "learning_rate": 1.1773635289415846e-05, - "loss": 0.821, + "epoch": 0.6726231783483692, + "grad_norm": 0.7812818288803101, + "learning_rate": 3.720352349725411e-05, + "loss": 0.8801, "step": 3877 }, { - "epoch": 1.416103706408618, - "grad_norm": 1.4459401369094849, - "learning_rate": 1.1760561899956412e-05, - "loss": 0.7954, + "epoch": 0.6727966689798751, + "grad_norm": 0.7532784938812256, + "learning_rate": 3.720004775855684e-05, + "loss": 0.8232, "step": 3878 }, { - "epoch": 1.4164688698192442, - "grad_norm": 1.4389663934707642, - "learning_rate": 1.1747492749357248e-05, - "loss": 0.8678, + "epoch": 0.672970159611381, + "grad_norm": 1.031844139099121, + "learning_rate": 3.719657002377397e-05, + "loss": 0.7629, "step": 3879 }, { - "epoch": 1.4168340332298703, - "grad_norm": 1.1069241762161255, - "learning_rate": 1.1734427844341916e-05, - "loss": 0.7953, + "epoch": 0.6731436502428869, + "grad_norm": 4.258119583129883, + "learning_rate": 3.719309029330912e-05, + "loss": 0.6318, "step": 3880 }, { - "epoch": 1.4171991966404967, - "grad_norm": 1.3345195055007935, - "learning_rate": 1.1721367191631788e-05, - "loss": 0.8569, + "epoch": 0.6733171408743928, + "grad_norm": 0.7771827578544617, + "learning_rate": 3.718960856756611e-05, + "loss": 0.7292, "step": 3881 }, { - "epoch": 1.4175643600511227, - "grad_norm": 0.9455767273902893, - "learning_rate": 1.1708310797946028e-05, - "loss": 0.799, + "epoch": 0.6734906315058987, + "grad_norm": 0.8113532662391663, + "learning_rate": 3.7186124846948995e-05, + "loss": 0.7991, "step": 3882 }, { - "epoch": 1.417929523461749, - "grad_norm": 1.339704155921936, - "learning_rate": 1.169525867000164e-05, - "loss": 0.8138, + "epoch": 0.6736641221374046, + "grad_norm": 0.7523580193519592, + "learning_rate": 3.718263913186206e-05, + "loss": 0.8242, "step": 3883 }, { - "epoch": 1.4182946868723754, - "grad_norm": 0.7608948349952698, - "learning_rate": 1.1682210814513422e-05, - "loss": 0.8041, + "epoch": 0.6738376127689105, + "grad_norm": 0.7665042877197266, + "learning_rate": 3.7179151422709845e-05, + "loss": 0.7976, "step": 3884 }, { - "epoch": 1.4186598502830017, - "grad_norm": 0.9308651685714722, - "learning_rate": 1.1669167238193965e-05, - "loss": 0.8507, + "epoch": 0.6740111034004164, + "grad_norm": 0.8652667999267578, + "learning_rate": 3.717566171989708e-05, + "loss": 0.9419, "step": 3885 }, { - "epoch": 1.419025013693628, - "grad_norm": 1.0889698266983032, - "learning_rate": 1.1656127947753668e-05, - "loss": 0.8033, + "epoch": 0.6741845940319223, + "grad_norm": 0.9599747061729431, + "learning_rate": 3.717217002382875e-05, + "loss": 0.7563, "step": 3886 }, { - "epoch": 1.419390177104254, - "grad_norm": 1.084707260131836, - "learning_rate": 1.1643092949900721e-05, - "loss": 0.8394, + "epoch": 0.6743580846634282, + "grad_norm": 0.8445802927017212, + "learning_rate": 3.716867633491009e-05, + "loss": 0.7378, "step": 3887 }, { - "epoch": 1.4197553405148804, - "grad_norm": 1.5792300701141357, - "learning_rate": 1.1630062251341108e-05, - "loss": 0.8236, + "epoch": 0.6745315752949341, + "grad_norm": 1.1664235591888428, + "learning_rate": 3.716518065354654e-05, + "loss": 0.7766, "step": 3888 }, { - "epoch": 1.4201205039255067, - "grad_norm": 1.0083192586898804, - "learning_rate": 1.161703585877861e-05, - "loss": 0.807, + "epoch": 0.67470506592644, + "grad_norm": 1.1557211875915527, + "learning_rate": 3.7161682980143766e-05, + "loss": 0.6887, "step": 3889 }, { - "epoch": 1.4204856673361328, - "grad_norm": 4.953549385070801, - "learning_rate": 1.1604013778914771e-05, - "loss": 0.8205, + "epoch": 0.6748785565579458, + "grad_norm": 0.9795308709144592, + "learning_rate": 3.715818331510769e-05, + "loss": 0.8389, "step": 3890 }, { - "epoch": 1.420850830746759, - "grad_norm": 1.5128848552703857, - "learning_rate": 1.159099601844893e-05, - "loss": 0.8026, + "epoch": 0.6750520471894518, + "grad_norm": 1.4089268445968628, + "learning_rate": 3.715468165884444e-05, + "loss": 0.6575, "step": 3891 }, { - "epoch": 1.4212159941573854, - "grad_norm": 1.1733510494232178, - "learning_rate": 1.1577982584078207e-05, - "loss": 0.8068, + "epoch": 0.6752255378209576, + "grad_norm": 1.0239907503128052, + "learning_rate": 3.71511780117604e-05, + "loss": 0.8691, "step": 3892 }, { - "epoch": 1.4215811575680117, - "grad_norm": 0.8824619054794312, - "learning_rate": 1.156497348249749e-05, - "loss": 0.8224, + "epoch": 0.6753990284524636, + "grad_norm": 1.2295536994934082, + "learning_rate": 3.7147672374262165e-05, + "loss": 0.7217, "step": 3893 }, { - "epoch": 1.421946320978638, - "grad_norm": 1.136305570602417, - "learning_rate": 1.1551968720399444e-05, - "loss": 0.8025, + "epoch": 0.6755725190839694, + "grad_norm": 1.0653493404388428, + "learning_rate": 3.714416474675657e-05, + "loss": 0.6381, "step": 3894 }, { - "epoch": 1.4223114843892641, - "grad_norm": 1.0673370361328125, - "learning_rate": 1.1538968304474499e-05, - "loss": 0.8214, + "epoch": 0.6757460097154754, + "grad_norm": 1.0900731086730957, + "learning_rate": 3.7140655129650676e-05, + "loss": 0.6046, "step": 3895 }, { - "epoch": 1.4226766477998904, - "grad_norm": 1.098137617111206, - "learning_rate": 1.1525972241410827e-05, - "loss": 0.8477, + "epoch": 0.6759195003469812, + "grad_norm": 0.8163803219795227, + "learning_rate": 3.7137143523351787e-05, + "loss": 0.8, "step": 3896 }, { - "epoch": 1.4230418112105168, - "grad_norm": 1.4050343036651611, - "learning_rate": 1.15129805378944e-05, - "loss": 0.8362, + "epoch": 0.6760929909784872, + "grad_norm": 1.12442946434021, + "learning_rate": 3.713362992826742e-05, + "loss": 0.8643, "step": 3897 }, { - "epoch": 1.4234069746211429, - "grad_norm": 1.4199162721633911, - "learning_rate": 1.1499993200608921e-05, - "loss": 0.7997, + "epoch": 0.676266481609993, + "grad_norm": 0.9294216632843018, + "learning_rate": 3.713011434480534e-05, + "loss": 0.832, "step": 3898 }, { - "epoch": 1.4237721380317692, - "grad_norm": 0.9484448432922363, - "learning_rate": 1.1487010236235865e-05, - "loss": 0.814, + "epoch": 0.676439972241499, + "grad_norm": 0.9325127005577087, + "learning_rate": 3.712659677337352e-05, + "loss": 0.7917, "step": 3899 }, { - "epoch": 1.4241373014423955, - "grad_norm": 1.3538793325424194, - "learning_rate": 1.147403165145443e-05, - "loss": 0.8436, + "epoch": 0.6766134628730048, + "grad_norm": 1.7728430032730103, + "learning_rate": 3.71230772143802e-05, + "loss": 0.6995, "step": 3900 }, { - "epoch": 1.4245024648530218, - "grad_norm": 1.3449430465698242, - "learning_rate": 1.1461057452941584e-05, - "loss": 0.772, + "epoch": 0.6767869535045108, + "grad_norm": 0.9635071754455566, + "learning_rate": 3.711955566823381e-05, + "loss": 0.6499, "step": 3901 }, { - "epoch": 1.424867628263648, - "grad_norm": 1.01315438747406, - "learning_rate": 1.1448087647372032e-05, - "loss": 0.8165, + "epoch": 0.6769604441360166, + "grad_norm": 1.0445501804351807, + "learning_rate": 3.711603213534303e-05, + "loss": 0.7188, "step": 3902 }, { - "epoch": 1.4252327916742742, - "grad_norm": 0.9835547208786011, - "learning_rate": 1.1435122241418224e-05, - "loss": 0.8098, + "epoch": 0.6771339347675226, + "grad_norm": 0.7741987109184265, + "learning_rate": 3.7112506616116794e-05, + "loss": 0.7971, "step": 3903 }, { - "epoch": 1.4255979550849005, - "grad_norm": 0.8614714741706848, - "learning_rate": 1.142216124175033e-05, - "loss": 0.8596, + "epoch": 0.6773074253990284, + "grad_norm": 0.8521127104759216, + "learning_rate": 3.710897911096421e-05, + "loss": 0.7185, "step": 3904 }, { - "epoch": 1.4259631184955268, - "grad_norm": 0.91315096616745, - "learning_rate": 1.1409204655036272e-05, - "loss": 0.7888, + "epoch": 0.6774809160305344, + "grad_norm": 0.9648956060409546, + "learning_rate": 3.710544962029467e-05, + "loss": 0.6414, "step": 3905 }, { - "epoch": 1.426328281906153, - "grad_norm": 1.0541163682937622, - "learning_rate": 1.139625248794169e-05, - "loss": 0.8447, + "epoch": 0.6776544066620402, + "grad_norm": 0.938955545425415, + "learning_rate": 3.710191814451777e-05, + "loss": 0.7549, "step": 3906 }, { - "epoch": 1.4266934453167792, - "grad_norm": 1.0815497636795044, - "learning_rate": 1.1383304747129964e-05, - "loss": 0.7625, + "epoch": 0.6778278972935462, + "grad_norm": 0.963428258895874, + "learning_rate": 3.709838468404334e-05, + "loss": 0.7903, "step": 3907 }, { - "epoch": 1.4270586087274055, - "grad_norm": 1.0432144403457642, - "learning_rate": 1.137036143926217e-05, - "loss": 0.7847, + "epoch": 0.678001387925052, + "grad_norm": 1.8454617261886597, + "learning_rate": 3.7094849239281444e-05, + "loss": 0.6639, "step": 3908 }, { - "epoch": 1.4274237721380318, - "grad_norm": 0.9004088044166565, - "learning_rate": 1.1357422570997138e-05, - "loss": 0.842, + "epoch": 0.6781748785565579, + "grad_norm": 1.4228715896606445, + "learning_rate": 3.709131181064238e-05, + "loss": 0.601, "step": 3909 }, { - "epoch": 1.4277889355486582, - "grad_norm": 1.0826886892318726, - "learning_rate": 1.134448814899138e-05, - "loss": 0.797, + "epoch": 0.6783483691880638, + "grad_norm": 1.090362548828125, + "learning_rate": 3.7087772398536656e-05, + "loss": 0.6794, "step": 3910 }, { - "epoch": 1.4281540989592842, - "grad_norm": 1.2553116083145142, - "learning_rate": 1.1331558179899148e-05, - "loss": 0.8307, + "epoch": 0.6785218598195697, + "grad_norm": 1.0310989618301392, + "learning_rate": 3.708423100337504e-05, + "loss": 0.6165, "step": 3911 }, { - "epoch": 1.4285192623699106, - "grad_norm": 1.227670669555664, - "learning_rate": 1.1318632670372388e-05, - "loss": 0.8275, + "epoch": 0.6786953504510757, + "grad_norm": 1.3437929153442383, + "learning_rate": 3.70806876255685e-05, + "loss": 0.6641, "step": 3912 }, { - "epoch": 1.4288844257805369, - "grad_norm": 1.3115992546081543, - "learning_rate": 1.1305711627060765e-05, - "loss": 0.8392, + "epoch": 0.6788688410825815, + "grad_norm": 0.9126314520835876, + "learning_rate": 3.707714226552827e-05, + "loss": 0.7903, "step": 3913 }, { - "epoch": 1.429249589191163, - "grad_norm": 1.4489649534225464, - "learning_rate": 1.1292795056611621e-05, - "loss": 0.8074, + "epoch": 0.6790423317140875, + "grad_norm": 0.8334142565727234, + "learning_rate": 3.7073594923665774e-05, + "loss": 0.8555, "step": 3914 }, { - "epoch": 1.4296147526017893, - "grad_norm": 2.2217607498168945, - "learning_rate": 1.1279882965670024e-05, - "loss": 0.8051, + "epoch": 0.6792158223455933, + "grad_norm": 0.7649350762367249, + "learning_rate": 3.70700456003927e-05, + "loss": 0.7445, "step": 3915 }, { - "epoch": 1.4299799160124156, - "grad_norm": 1.1254748106002808, - "learning_rate": 1.1266975360878723e-05, - "loss": 0.8409, + "epoch": 0.6793893129770993, + "grad_norm": 0.9720479846000671, + "learning_rate": 3.7066494296120935e-05, + "loss": 0.6544, "step": 3916 }, { - "epoch": 1.430345079423042, - "grad_norm": 1.0165865421295166, - "learning_rate": 1.1254072248878164e-05, - "loss": 0.8176, + "epoch": 0.6795628036086051, + "grad_norm": 0.7873635292053223, + "learning_rate": 3.7062941011262624e-05, + "loss": 0.79, "step": 3917 }, { - "epoch": 1.430710242833668, - "grad_norm": 1.1313799619674683, - "learning_rate": 1.1241173636306488e-05, - "loss": 0.8088, + "epoch": 0.6797362942401111, + "grad_norm": 0.9784716367721558, + "learning_rate": 3.705938574623012e-05, + "loss": 0.677, "step": 3918 }, { - "epoch": 1.4310754062442943, - "grad_norm": 1.0095664262771606, - "learning_rate": 1.1228279529799501e-05, - "loss": 0.8551, + "epoch": 0.6799097848716169, + "grad_norm": 1.0137048959732056, + "learning_rate": 3.705582850143603e-05, + "loss": 0.7534, "step": 3919 }, { - "epoch": 1.4314405696549206, - "grad_norm": 1.340989351272583, - "learning_rate": 1.1215389935990708e-05, - "loss": 0.8416, + "epoch": 0.6800832755031229, + "grad_norm": 0.8410398364067078, + "learning_rate": 3.705226927729317e-05, + "loss": 0.676, "step": 3920 }, { - "epoch": 1.4318057330655467, - "grad_norm": 1.282934546470642, - "learning_rate": 1.1202504861511296e-05, - "loss": 0.8201, + "epoch": 0.6802567661346287, + "grad_norm": 0.9461576342582703, + "learning_rate": 3.7048708074214586e-05, + "loss": 0.8254, "step": 3921 }, { - "epoch": 1.432170896476173, - "grad_norm": 1.4079012870788574, - "learning_rate": 1.1189624312990103e-05, - "loss": 0.796, + "epoch": 0.6804302567661347, + "grad_norm": 0.7689977288246155, + "learning_rate": 3.704514489261357e-05, + "loss": 0.8014, "step": 3922 }, { - "epoch": 1.4325360598867993, - "grad_norm": 2.352402687072754, - "learning_rate": 1.1176748297053672e-05, - "loss": 0.8501, + "epoch": 0.6806037473976405, + "grad_norm": 1.046152114868164, + "learning_rate": 3.7041579732903617e-05, + "loss": 0.689, "step": 3923 }, { - "epoch": 1.4329012232974256, - "grad_norm": 1.108473539352417, - "learning_rate": 1.1163876820326179e-05, - "loss": 0.818, + "epoch": 0.6807772380291465, + "grad_norm": 0.9133134484291077, + "learning_rate": 3.703801259549848e-05, + "loss": 0.824, "step": 3924 }, { - "epoch": 1.433266386708052, - "grad_norm": 1.280699610710144, - "learning_rate": 1.1151009889429489e-05, - "loss": 0.8272, + "epoch": 0.6809507286606523, + "grad_norm": 0.771054208278656, + "learning_rate": 3.7034443480812144e-05, + "loss": 0.7207, "step": 3925 }, { - "epoch": 1.433631550118678, - "grad_norm": 2.11877179145813, - "learning_rate": 1.1138147510983121e-05, - "loss": 0.824, + "epoch": 0.6811242192921583, + "grad_norm": 0.7080139517784119, + "learning_rate": 3.7030872389258777e-05, + "loss": 0.7367, "step": 3926 }, { - "epoch": 1.4339967135293044, - "grad_norm": 1.068321943283081, - "learning_rate": 1.112528969160426e-05, - "loss": 0.8053, + "epoch": 0.6812977099236641, + "grad_norm": 0.7426927089691162, + "learning_rate": 3.7027299321252825e-05, + "loss": 0.834, "step": 3927 }, { - "epoch": 1.4343618769399307, - "grad_norm": 1.2185230255126953, - "learning_rate": 1.1112436437907737e-05, - "loss": 0.8274, + "epoch": 0.6814712005551701, + "grad_norm": 1.0418637990951538, + "learning_rate": 3.702372427720895e-05, + "loss": 0.7053, "step": 3928 }, { - "epoch": 1.4347270403505568, - "grad_norm": 0.9663365483283997, - "learning_rate": 1.1099587756506022e-05, - "loss": 0.8138, + "epoch": 0.6816446911866759, + "grad_norm": 0.8950718641281128, + "learning_rate": 3.702014725754204e-05, + "loss": 0.6995, "step": 3929 }, { - "epoch": 1.435092203761183, - "grad_norm": 0.9515323638916016, - "learning_rate": 1.1086743654009257e-05, - "loss": 0.8198, + "epoch": 0.6818181818181818, + "grad_norm": 0.9180693626403809, + "learning_rate": 3.701656826266721e-05, + "loss": 0.7039, "step": 3930 }, { - "epoch": 1.4354573671718094, - "grad_norm": 1.1959147453308105, - "learning_rate": 1.1073904137025218e-05, - "loss": 0.8589, + "epoch": 0.6819916724496877, + "grad_norm": 0.738006591796875, + "learning_rate": 3.701298729299979e-05, + "loss": 0.7709, "step": 3931 }, { - "epoch": 1.4358225305824357, - "grad_norm": 1.028428077697754, - "learning_rate": 1.106106921215932e-05, - "loss": 0.818, + "epoch": 0.6821651630811936, + "grad_norm": 1.1717820167541504, + "learning_rate": 3.7009404348955385e-05, + "loss": 0.8362, "step": 3932 }, { - "epoch": 1.436187693993062, - "grad_norm": 1.1367653608322144, - "learning_rate": 1.1048238886014616e-05, - "loss": 0.8049, + "epoch": 0.6823386537126995, + "grad_norm": 0.8515201807022095, + "learning_rate": 3.700581943094978e-05, + "loss": 0.7122, "step": 3933 }, { - "epoch": 1.436552857403688, - "grad_norm": 1.2805367708206177, - "learning_rate": 1.1035413165191792e-05, - "loss": 0.8624, + "epoch": 0.6825121443442054, + "grad_norm": 1.0482242107391357, + "learning_rate": 3.7002232539399014e-05, + "loss": 0.9346, "step": 3934 }, { - "epoch": 1.4369180208143144, - "grad_norm": 1.5403871536254883, - "learning_rate": 1.1022592056289168e-05, - "loss": 0.8264, + "epoch": 0.6826856349757113, + "grad_norm": 0.8403418064117432, + "learning_rate": 3.699864367471935e-05, + "loss": 0.8191, "step": 3935 }, { - "epoch": 1.4372831842249407, - "grad_norm": 0.8571110963821411, - "learning_rate": 1.1009775565902686e-05, - "loss": 0.7693, + "epoch": 0.6828591256072172, + "grad_norm": 1.446290373802185, + "learning_rate": 3.6995052837327274e-05, + "loss": 0.7043, "step": 3936 }, { - "epoch": 1.4376483476355668, - "grad_norm": 1.0812859535217285, - "learning_rate": 1.099696370062592e-05, - "loss": 0.8009, + "epoch": 0.6830326162387231, + "grad_norm": 1.2951269149780273, + "learning_rate": 3.699146002763953e-05, + "loss": 0.9612, "step": 3937 }, { - "epoch": 1.4380135110461931, - "grad_norm": 1.2995152473449707, - "learning_rate": 1.098415646705007e-05, - "loss": 0.8057, + "epoch": 0.683206106870229, + "grad_norm": 1.0710084438323975, + "learning_rate": 3.6987865246073035e-05, + "loss": 0.709, "step": 3938 }, { - "epoch": 1.4383786744568194, - "grad_norm": 1.4856594800949097, - "learning_rate": 1.0971353871763925e-05, - "loss": 0.8184, + "epoch": 0.6833795975017349, + "grad_norm": 1.319824457168579, + "learning_rate": 3.6984268493044994e-05, + "loss": 0.7249, "step": 3939 }, { - "epoch": 1.4387438378674458, - "grad_norm": 1.2145944833755493, - "learning_rate": 1.0958555921353918e-05, - "loss": 0.8361, + "epoch": 0.6835530881332408, + "grad_norm": 1.9729501008987427, + "learning_rate": 3.6980669768972795e-05, + "loss": 0.802, "step": 3940 }, { - "epoch": 1.439109001278072, - "grad_norm": 0.9954750537872314, - "learning_rate": 1.0945762622404078e-05, - "loss": 0.809, + "epoch": 0.6837265787647467, + "grad_norm": 0.9966709017753601, + "learning_rate": 3.697706907427409e-05, + "loss": 0.9158, "step": 3941 }, { - "epoch": 1.4394741646886982, - "grad_norm": 1.0378574132919312, - "learning_rate": 1.0932973981496051e-05, - "loss": 0.8743, + "epoch": 0.6839000693962526, + "grad_norm": 0.6740148067474365, + "learning_rate": 3.6973466409366735e-05, + "loss": 0.9065, "step": 3942 }, { - "epoch": 1.4398393280993245, - "grad_norm": 0.9344686269760132, - "learning_rate": 1.0920190005209066e-05, - "loss": 0.8359, + "epoch": 0.6840735600277585, + "grad_norm": 0.860862135887146, + "learning_rate": 3.696986177466882e-05, + "loss": 0.7427, "step": 3943 }, { - "epoch": 1.4402044915099508, - "grad_norm": 1.2032856941223145, - "learning_rate": 1.0907410700119976e-05, - "loss": 0.8418, + "epoch": 0.6842470506592644, + "grad_norm": 1.2444928884506226, + "learning_rate": 3.696625517059868e-05, + "loss": 0.714, "step": 3944 }, { - "epoch": 1.4405696549205769, - "grad_norm": 1.3901318311691284, - "learning_rate": 1.0894636072803214e-05, - "loss": 0.7815, + "epoch": 0.6844205412907703, + "grad_norm": 1.092228889465332, + "learning_rate": 3.696264659757485e-05, + "loss": 0.7852, "step": 3945 }, { - "epoch": 1.4409348183312032, - "grad_norm": 0.9449167847633362, - "learning_rate": 1.0881866129830829e-05, - "loss": 0.8011, + "epoch": 0.6845940319222762, + "grad_norm": 0.9167332053184509, + "learning_rate": 3.695903605601612e-05, + "loss": 0.8132, "step": 3946 }, { - "epoch": 1.4412999817418295, - "grad_norm": 1.2971882820129395, - "learning_rate": 1.086910087777242e-05, - "loss": 0.8264, + "epoch": 0.6847675225537821, + "grad_norm": 0.9258657693862915, + "learning_rate": 3.6955423546341494e-05, + "loss": 0.7458, "step": 3947 }, { - "epoch": 1.4416651451524558, - "grad_norm": 1.1493326425552368, - "learning_rate": 1.085634032319522e-05, - "loss": 0.8694, + "epoch": 0.684941013185288, + "grad_norm": 0.9212138652801514, + "learning_rate": 3.695180906897021e-05, + "loss": 0.7585, "step": 3948 }, { - "epoch": 1.442030308563082, - "grad_norm": 1.1940674781799316, - "learning_rate": 1.0843584472664004e-05, - "loss": 0.787, + "epoch": 0.6851145038167938, + "grad_norm": 1.0807209014892578, + "learning_rate": 3.694819262432173e-05, + "loss": 0.63, "step": 3949 }, { - "epoch": 1.4423954719737082, - "grad_norm": 0.9843290448188782, - "learning_rate": 1.0830833332741154e-05, - "loss": 0.8386, + "epoch": 0.6852879944482998, + "grad_norm": 0.7466068863868713, + "learning_rate": 3.694457421281575e-05, + "loss": 0.7965, "step": 3950 }, { - "epoch": 1.4427606353843345, - "grad_norm": 1.0374629497528076, - "learning_rate": 1.0818086909986613e-05, - "loss": 0.7975, + "epoch": 0.6854614850798056, + "grad_norm": 0.9240715503692627, + "learning_rate": 3.694095383487219e-05, + "loss": 0.811, "step": 3951 }, { - "epoch": 1.4431257987949606, - "grad_norm": 1.022169828414917, - "learning_rate": 1.080534521095792e-05, - "loss": 0.8564, + "epoch": 0.6856349757113116, + "grad_norm": 0.822340726852417, + "learning_rate": 3.693733149091119e-05, + "loss": 0.8494, "step": 3952 }, { - "epoch": 1.443490962205587, - "grad_norm": 1.3832981586456299, - "learning_rate": 1.0792608242210151e-05, - "loss": 0.8024, + "epoch": 0.6858084663428174, + "grad_norm": 0.6844519972801208, + "learning_rate": 3.693370718135314e-05, + "loss": 0.8406, "step": 3953 }, { - "epoch": 1.4438561256162132, - "grad_norm": 1.3716254234313965, - "learning_rate": 1.0779876010295971e-05, - "loss": 0.8268, + "epoch": 0.6859819569743234, + "grad_norm": 0.8672341108322144, + "learning_rate": 3.693008090661864e-05, + "loss": 0.7083, "step": 3954 }, { - "epoch": 1.4442212890268395, - "grad_norm": 1.049843668937683, - "learning_rate": 1.0767148521765604e-05, - "loss": 0.8066, + "epoch": 0.6861554476058292, + "grad_norm": 1.2700855731964111, + "learning_rate": 3.692645266712852e-05, + "loss": 0.9041, "step": 3955 }, { - "epoch": 1.4445864524374659, - "grad_norm": 1.3271234035491943, - "learning_rate": 1.0754425783166837e-05, - "loss": 0.8081, + "epoch": 0.6863289382373352, + "grad_norm": 0.8135233521461487, + "learning_rate": 3.6922822463303846e-05, + "loss": 0.6409, "step": 3956 }, { - "epoch": 1.444951615848092, - "grad_norm": 1.1672413349151611, - "learning_rate": 1.0741707801044998e-05, - "loss": 0.848, + "epoch": 0.686502428868841, + "grad_norm": 0.9609891772270203, + "learning_rate": 3.691919029556591e-05, + "loss": 0.7485, "step": 3957 }, { - "epoch": 1.4453167792587183, - "grad_norm": 0.8953167796134949, - "learning_rate": 1.0728994581942982e-05, - "loss": 0.8083, + "epoch": 0.686675919500347, + "grad_norm": 0.9678765535354614, + "learning_rate": 3.691555616433622e-05, + "loss": 0.8074, "step": 3958 }, { - "epoch": 1.4456819426693446, - "grad_norm": 1.3601937294006348, - "learning_rate": 1.0716286132401232e-05, - "loss": 0.8401, + "epoch": 0.6868494101318529, + "grad_norm": 0.8136806488037109, + "learning_rate": 3.691192007003652e-05, + "loss": 0.8538, "step": 3959 }, { - "epoch": 1.4460471060799707, - "grad_norm": 1.0242805480957031, - "learning_rate": 1.0703582458957733e-05, - "loss": 0.8289, + "epoch": 0.6870229007633588, + "grad_norm": 1.0647093057632446, + "learning_rate": 3.69082820130888e-05, + "loss": 0.851, "step": 3960 }, { - "epoch": 1.446412269490597, - "grad_norm": 1.05952787399292, - "learning_rate": 1.0690883568148025e-05, - "loss": 0.8284, + "epoch": 0.6871963913948647, + "grad_norm": 1.0678809881210327, + "learning_rate": 3.690464199391525e-05, + "loss": 0.688, "step": 3961 }, { - "epoch": 1.4467774329012233, - "grad_norm": 1.0008491277694702, - "learning_rate": 1.0678189466505172e-05, - "loss": 0.8229, + "epoch": 0.6873698820263706, + "grad_norm": 0.6669706702232361, + "learning_rate": 3.69010000129383e-05, + "loss": 0.741, "step": 3962 }, { - "epoch": 1.4471425963118496, - "grad_norm": 1.2056705951690674, - "learning_rate": 1.0665500160559765e-05, - "loss": 0.85, + "epoch": 0.6875433726578765, + "grad_norm": 0.6757645010948181, + "learning_rate": 3.6897356070580596e-05, + "loss": 0.8108, "step": 3963 }, { - "epoch": 1.447507759722476, - "grad_norm": 1.1323577165603638, - "learning_rate": 1.065281565683996e-05, - "loss": 0.8605, + "epoch": 0.6877168632893824, + "grad_norm": 0.795318603515625, + "learning_rate": 3.689371016726504e-05, + "loss": 0.7328, "step": 3964 }, { - "epoch": 1.447872923133102, - "grad_norm": 1.3046954870224, - "learning_rate": 1.0640135961871417e-05, - "loss": 0.7753, + "epoch": 0.6878903539208883, + "grad_norm": 0.8117619752883911, + "learning_rate": 3.6890062303414734e-05, + "loss": 0.7141, "step": 3965 }, { - "epoch": 1.4482380865437283, - "grad_norm": 1.1514002084732056, - "learning_rate": 1.0627461082177342e-05, - "loss": 0.8552, + "epoch": 0.6880638445523942, + "grad_norm": 1.1661378145217896, + "learning_rate": 3.6886412479453004e-05, + "loss": 0.8064, "step": 3966 }, { - "epoch": 1.4486032499543546, - "grad_norm": 0.8155121803283691, - "learning_rate": 1.0614791024278437e-05, - "loss": 0.8365, + "epoch": 0.6882373351839001, + "grad_norm": 0.8318164944648743, + "learning_rate": 3.6882760695803444e-05, + "loss": 0.6902, "step": 3967 }, { - "epoch": 1.4489684133649807, - "grad_norm": 1.1945875883102417, - "learning_rate": 1.0602125794692943e-05, - "loss": 0.8203, + "epoch": 0.6884108258154059, + "grad_norm": 0.9009799361228943, + "learning_rate": 3.6879106952889826e-05, + "loss": 0.6625, "step": 3968 }, { - "epoch": 1.449333576775607, - "grad_norm": 0.9721695184707642, - "learning_rate": 1.0589465399936616e-05, - "loss": 0.8331, + "epoch": 0.6885843164469119, + "grad_norm": 1.7003979682922363, + "learning_rate": 3.687545125113618e-05, + "loss": 0.7722, "step": 3969 }, { - "epoch": 1.4496987401862333, - "grad_norm": 1.0919986963272095, - "learning_rate": 1.0576809846522721e-05, - "loss": 0.797, + "epoch": 0.6887578070784177, + "grad_norm": 1.671402096748352, + "learning_rate": 3.687179359096675e-05, + "loss": 0.7227, "step": 3970 }, { - "epoch": 1.4500639035968597, - "grad_norm": 1.071091890335083, - "learning_rate": 1.0564159140962036e-05, - "loss": 0.7957, + "epoch": 0.6889312977099237, + "grad_norm": 1.0365982055664062, + "learning_rate": 3.686813397280602e-05, + "loss": 0.7078, "step": 3971 }, { - "epoch": 1.450429067007486, - "grad_norm": 1.1214354038238525, - "learning_rate": 1.0551513289762832e-05, - "loss": 0.8363, + "epoch": 0.6891047883414295, + "grad_norm": 0.8849665522575378, + "learning_rate": 3.686447239707868e-05, + "loss": 0.8108, "step": 3972 }, { - "epoch": 1.450794230418112, - "grad_norm": 1.1599748134613037, - "learning_rate": 1.0538872299430892e-05, - "loss": 0.8184, + "epoch": 0.6892782789729355, + "grad_norm": 0.8704875707626343, + "learning_rate": 3.686080886420968e-05, + "loss": 0.6576, "step": 3973 }, { - "epoch": 1.4511593938287384, - "grad_norm": 1.1406340599060059, - "learning_rate": 1.052623617646951e-05, - "loss": 0.774, + "epoch": 0.6894517696044413, + "grad_norm": 0.8746877312660217, + "learning_rate": 3.685714337462415e-05, + "loss": 0.7238, "step": 3974 }, { - "epoch": 1.4515245572393647, - "grad_norm": 1.0090221166610718, - "learning_rate": 1.0513604927379455e-05, - "loss": 0.7609, + "epoch": 0.6896252602359473, + "grad_norm": 0.7485343813896179, + "learning_rate": 3.685347592874749e-05, + "loss": 0.8396, "step": 3975 }, { - "epoch": 1.4518897206499908, - "grad_norm": 1.1632927656173706, - "learning_rate": 1.0500978558659001e-05, - "loss": 0.8053, + "epoch": 0.6897987508674531, + "grad_norm": 0.7725034356117249, + "learning_rate": 3.6849806527005316e-05, + "loss": 0.884, "step": 3976 }, { - "epoch": 1.452254884060617, - "grad_norm": 0.8751922845840454, - "learning_rate": 1.0488357076803903e-05, - "loss": 0.8477, + "epoch": 0.6899722414989591, + "grad_norm": 1.199515700340271, + "learning_rate": 3.684613516982346e-05, + "loss": 0.7673, "step": 3977 }, { - "epoch": 1.4526200474712434, - "grad_norm": 1.4618579149246216, - "learning_rate": 1.047574048830741e-05, - "loss": 0.8247, + "epoch": 0.6901457321304649, + "grad_norm": 0.8316595554351807, + "learning_rate": 3.6842461857627986e-05, + "loss": 0.682, "step": 3978 }, { - "epoch": 1.4529852108818697, - "grad_norm": 1.1022400856018066, - "learning_rate": 1.046312879966025e-05, - "loss": 0.8281, + "epoch": 0.6903192227619709, + "grad_norm": 1.0535657405853271, + "learning_rate": 3.683878659084519e-05, + "loss": 0.7472, "step": 3979 }, { - "epoch": 1.453350374292496, - "grad_norm": 1.3208749294281006, - "learning_rate": 1.045052201735063e-05, - "loss": 0.8171, + "epoch": 0.6904927133934767, + "grad_norm": 0.9928874373435974, + "learning_rate": 3.6835109369901586e-05, + "loss": 0.7587, "step": 3980 }, { - "epoch": 1.4537155377031221, - "grad_norm": 1.1326371431350708, - "learning_rate": 1.0437920147864245e-05, - "loss": 0.825, + "epoch": 0.6906662040249827, + "grad_norm": 0.7219775915145874, + "learning_rate": 3.6831430195223927e-05, + "loss": 0.8489, "step": 3981 }, { - "epoch": 1.4540807011137484, - "grad_norm": 1.0590094327926636, - "learning_rate": 1.0425323197684233e-05, - "loss": 0.8458, + "epoch": 0.6908396946564885, + "grad_norm": 0.8584325909614563, + "learning_rate": 3.682774906723918e-05, + "loss": 0.6483, "step": 3982 }, { - "epoch": 1.4544458645243745, - "grad_norm": 1.1023225784301758, - "learning_rate": 1.0412731173291229e-05, - "loss": 0.8153, + "epoch": 0.6910131852879945, + "grad_norm": 0.7732763290405273, + "learning_rate": 3.6824065986374546e-05, + "loss": 0.7798, "step": 3983 }, { - "epoch": 1.4548110279350008, - "grad_norm": 0.9351067543029785, - "learning_rate": 1.0400144081163321e-05, - "loss": 0.8428, + "epoch": 0.6911866759195003, + "grad_norm": 1.0467569828033447, + "learning_rate": 3.6820380953057446e-05, + "loss": 0.6901, "step": 3984 }, { - "epoch": 1.4551761913456271, - "grad_norm": 0.9216538071632385, - "learning_rate": 1.0387561927776075e-05, - "loss": 0.8141, + "epoch": 0.6913601665510063, + "grad_norm": 0.8217043280601501, + "learning_rate": 3.681669396771554e-05, + "loss": 0.7314, "step": 3985 }, { - "epoch": 1.4555413547562535, - "grad_norm": 1.1216219663619995, - "learning_rate": 1.0374984719602486e-05, - "loss": 0.8331, + "epoch": 0.6915336571825121, + "grad_norm": 1.3557056188583374, + "learning_rate": 3.681300503077671e-05, + "loss": 0.6702, "step": 3986 }, { - "epoch": 1.4559065181668798, - "grad_norm": 0.8778404593467712, - "learning_rate": 1.036241246311303e-05, - "loss": 0.8082, + "epoch": 0.6917071478140181, + "grad_norm": 0.7430259585380554, + "learning_rate": 3.6809314142669044e-05, + "loss": 0.8071, "step": 3987 }, { - "epoch": 1.4562716815775059, - "grad_norm": 1.2441129684448242, - "learning_rate": 1.0349845164775639e-05, - "loss": 0.8477, + "epoch": 0.6918806384455239, + "grad_norm": 0.7189929485321045, + "learning_rate": 3.680562130382089e-05, + "loss": 0.8794, "step": 3988 }, { - "epoch": 1.4566368449881322, - "grad_norm": 1.114303708076477, - "learning_rate": 1.0337282831055664e-05, - "loss": 0.8295, + "epoch": 0.6920541290770298, + "grad_norm": 0.6898748874664307, + "learning_rate": 3.68019265146608e-05, + "loss": 0.7974, "step": 3989 }, { - "epoch": 1.4570020083987585, - "grad_norm": 1.0987379550933838, - "learning_rate": 1.0324725468415942e-05, - "loss": 0.825, + "epoch": 0.6922276197085357, + "grad_norm": 1.5028873682022095, + "learning_rate": 3.679822977561756e-05, + "loss": 0.8201, "step": 3990 }, { - "epoch": 1.4573671718093846, - "grad_norm": 1.45084810256958, - "learning_rate": 1.0312173083316712e-05, - "loss": 0.8453, + "epoch": 0.6924011103400416, + "grad_norm": 0.5858355164527893, + "learning_rate": 3.679453108712018e-05, + "loss": 0.7952, "step": 3991 }, { - "epoch": 1.4577323352200109, - "grad_norm": 1.077230453491211, - "learning_rate": 1.0299625682215684e-05, - "loss": 0.8378, + "epoch": 0.6925746009715475, + "grad_norm": 1.2589186429977417, + "learning_rate": 3.67908304495979e-05, + "loss": 0.6971, "step": 3992 }, { - "epoch": 1.4580974986306372, - "grad_norm": 1.1329363584518433, - "learning_rate": 1.028708327156799e-05, - "loss": 0.8585, + "epoch": 0.6927480916030534, + "grad_norm": 0.8313594460487366, + "learning_rate": 3.678712786348018e-05, + "loss": 0.7144, "step": 3993 }, { - "epoch": 1.4584626620412635, - "grad_norm": 1.0085538625717163, - "learning_rate": 1.0274545857826195e-05, - "loss": 0.8058, + "epoch": 0.6929215822345594, + "grad_norm": 0.7927899956703186, + "learning_rate": 3.678342332919671e-05, + "loss": 0.8606, "step": 3994 }, { - "epoch": 1.4588278254518898, - "grad_norm": 0.9443773031234741, - "learning_rate": 1.0262013447440311e-05, - "loss": 0.8026, + "epoch": 0.6930950728660652, + "grad_norm": 0.6135585308074951, + "learning_rate": 3.677971684717741e-05, + "loss": 0.8987, "step": 3995 }, { - "epoch": 1.459192988862516, - "grad_norm": 0.8522350788116455, - "learning_rate": 1.0249486046857735e-05, - "loss": 0.8403, + "epoch": 0.6932685634975712, + "grad_norm": 1.0129666328430176, + "learning_rate": 3.6776008417852415e-05, + "loss": 0.8169, "step": 3996 }, { - "epoch": 1.4595581522731422, - "grad_norm": 1.0960335731506348, - "learning_rate": 1.0236963662523328e-05, - "loss": 0.8738, + "epoch": 0.693442054129077, + "grad_norm": 5.538034439086914, + "learning_rate": 3.6772298041652095e-05, + "loss": 0.8013, "step": 3997 }, { - "epoch": 1.4599233156837685, - "grad_norm": 0.7571086287498474, - "learning_rate": 1.0224446300879344e-05, - "loss": 0.8236, + "epoch": 0.693615544760583, + "grad_norm": 0.7858914136886597, + "learning_rate": 3.676858571900704e-05, + "loss": 0.8296, "step": 3998 }, { - "epoch": 1.4602884790943946, - "grad_norm": 1.0532584190368652, - "learning_rate": 1.0211933968365484e-05, - "loss": 0.8032, + "epoch": 0.6937890353920888, + "grad_norm": 0.7888327240943909, + "learning_rate": 3.676487145034808e-05, + "loss": 0.7893, "step": 3999 }, { - "epoch": 1.460653642505021, - "grad_norm": 0.8001096844673157, - "learning_rate": 1.0199426671418818e-05, - "loss": 0.8679, + "epoch": 0.6939625260235948, + "grad_norm": 0.8881151676177979, + "learning_rate": 3.6761155236106246e-05, + "loss": 0.7781, "step": 4000 }, { - "epoch": 1.4610188059156473, - "grad_norm": 1.1434293985366821, - "learning_rate": 1.0186924416473862e-05, - "loss": 0.8106, + "epoch": 0.6941360166551006, + "grad_norm": 0.7775428295135498, + "learning_rate": 3.675743707671282e-05, + "loss": 0.8398, "step": 4001 }, { - "epoch": 1.4613839693262736, - "grad_norm": 0.9003200531005859, - "learning_rate": 1.0174427209962513e-05, - "loss": 0.8539, + "epoch": 0.6943095072866066, + "grad_norm": 0.742073655128479, + "learning_rate": 3.67537169725993e-05, + "loss": 0.7671, "step": 4002 }, { - "epoch": 1.4617491327368999, - "grad_norm": 1.4394025802612305, - "learning_rate": 1.0161935058314087e-05, - "loss": 0.837, + "epoch": 0.6944829979181124, + "grad_norm": 0.9802519679069519, + "learning_rate": 3.6749994924197394e-05, + "loss": 0.6749, "step": 4003 }, { - "epoch": 1.462114296147526, - "grad_norm": 1.3173221349716187, - "learning_rate": 1.01494479679553e-05, - "loss": 0.8284, + "epoch": 0.6946564885496184, + "grad_norm": 0.9778761863708496, + "learning_rate": 3.6746270931939064e-05, + "loss": 0.7722, "step": 4004 }, { - "epoch": 1.4624794595581523, - "grad_norm": 1.132232904434204, - "learning_rate": 1.0136965945310262e-05, - "loss": 0.8372, + "epoch": 0.6948299791811242, + "grad_norm": 1.1063568592071533, + "learning_rate": 3.674254499625648e-05, + "loss": 0.7153, "step": 4005 }, { - "epoch": 1.4628446229687786, - "grad_norm": 1.1471248865127563, - "learning_rate": 1.0124488996800456e-05, - "loss": 0.7772, + "epoch": 0.6950034698126302, + "grad_norm": 0.8329475522041321, + "learning_rate": 3.6738817117582045e-05, + "loss": 0.6005, "step": 4006 }, { - "epoch": 1.4632097863794047, - "grad_norm": 1.0344172716140747, - "learning_rate": 1.0112017128844784e-05, - "loss": 0.8299, + "epoch": 0.695176960444136, + "grad_norm": 4.4256110191345215, + "learning_rate": 3.6735087296348366e-05, + "loss": 0.7565, "step": 4007 }, { - "epoch": 1.463574949790031, - "grad_norm": 0.9453524947166443, - "learning_rate": 1.0099550347859522e-05, - "loss": 0.7837, + "epoch": 0.6953504510756419, + "grad_norm": 0.941733717918396, + "learning_rate": 3.6731355532988315e-05, + "loss": 0.6941, "step": 4008 }, { - "epoch": 1.4639401132006573, - "grad_norm": 1.1959450244903564, - "learning_rate": 1.008708866025833e-05, - "loss": 0.8392, + "epoch": 0.6955239417071478, + "grad_norm": 0.6739019155502319, + "learning_rate": 3.672762182793496e-05, + "loss": 0.8323, "step": 4009 }, { - "epoch": 1.4643052766112836, - "grad_norm": 1.2504914999008179, - "learning_rate": 1.0074632072452233e-05, - "loss": 0.8186, + "epoch": 0.6956974323386537, + "grad_norm": 0.7617316246032715, + "learning_rate": 3.6723886181621595e-05, + "loss": 0.7819, "step": 4010 }, { - "epoch": 1.46467044002191, - "grad_norm": 1.2203630208969116, - "learning_rate": 1.0062180590849655e-05, - "loss": 0.8265, + "epoch": 0.6958709229701596, + "grad_norm": 0.8465570211410522, + "learning_rate": 3.672014859448175e-05, + "loss": 0.8706, "step": 4011 }, { - "epoch": 1.465035603432536, - "grad_norm": 1.0203382968902588, - "learning_rate": 1.0049734221856387e-05, - "loss": 0.8114, + "epoch": 0.6960444136016655, + "grad_norm": 0.8746000528335571, + "learning_rate": 3.6716409066949184e-05, + "loss": 0.6561, "step": 4012 }, { - "epoch": 1.4654007668431623, - "grad_norm": 1.0780603885650635, - "learning_rate": 1.003729297187558e-05, - "loss": 0.8549, + "epoch": 0.6962179042331714, + "grad_norm": 1.1261851787567139, + "learning_rate": 3.671266759945786e-05, + "loss": 0.8284, "step": 4013 }, { - "epoch": 1.4657659302537887, - "grad_norm": 0.9810544848442078, - "learning_rate": 1.0024856847307766e-05, - "loss": 0.8198, + "epoch": 0.6963913948646773, + "grad_norm": 1.0032203197479248, + "learning_rate": 3.670892419244199e-05, + "loss": 0.6882, "step": 4014 }, { - "epoch": 1.4661310936644147, - "grad_norm": 1.0900015830993652, - "learning_rate": 1.001242585455083e-05, - "loss": 0.8491, + "epoch": 0.6965648854961832, + "grad_norm": 1.1753140687942505, + "learning_rate": 3.6705178846336004e-05, + "loss": 0.7834, "step": 4015 }, { - "epoch": 1.466496257075041, - "grad_norm": 1.1001746654510498, - "learning_rate": 1.0000000000000006e-05, - "loss": 0.7942, + "epoch": 0.6967383761276891, + "grad_norm": 1.0542224645614624, + "learning_rate": 3.670143156157454e-05, + "loss": 0.77, "step": 4016 }, { - "epoch": 1.4668614204856674, - "grad_norm": 1.3537399768829346, - "learning_rate": 9.987579290047906e-06, - "loss": 0.813, + "epoch": 0.696911866759195, + "grad_norm": 1.1103049516677856, + "learning_rate": 3.669768233859249e-05, + "loss": 0.6909, "step": 4017 }, { - "epoch": 1.4672265838962937, - "grad_norm": 1.7660858631134033, - "learning_rate": 9.97516373108449e-06, - "loss": 0.8446, + "epoch": 0.6970853573907009, + "grad_norm": 0.7358279824256897, + "learning_rate": 3.6693931177824934e-05, + "loss": 0.6801, "step": 4018 }, { - "epoch": 1.4675917473069198, - "grad_norm": 1.0782952308654785, - "learning_rate": 9.962753329497069e-06, - "loss": 0.8678, + "epoch": 0.6972588480222068, + "grad_norm": 0.9822719693183899, + "learning_rate": 3.6690178079707226e-05, + "loss": 0.6947, "step": 4019 }, { - "epoch": 1.467956910717546, - "grad_norm": 1.2959240674972534, - "learning_rate": 9.950348091670281e-06, - "loss": 0.8201, + "epoch": 0.6974323386537127, + "grad_norm": 0.7368535995483398, + "learning_rate": 3.66864230446749e-05, + "loss": 0.7297, "step": 4020 }, { - "epoch": 1.4683220741281724, - "grad_norm": 1.2790840864181519, - "learning_rate": 9.937948023986135e-06, - "loss": 0.7898, + "epoch": 0.6976058292852186, + "grad_norm": 0.7747143507003784, + "learning_rate": 3.668266607316373e-05, + "loss": 0.8311, "step": 4021 }, { - "epoch": 1.4686872375387985, - "grad_norm": 0.7995054721832275, - "learning_rate": 9.925553132823967e-06, - "loss": 0.8286, + "epoch": 0.6977793199167245, + "grad_norm": 0.9637696146965027, + "learning_rate": 3.667890716560973e-05, + "loss": 0.8689, "step": 4022 }, { - "epoch": 1.4690524009494248, - "grad_norm": 1.5725055932998657, - "learning_rate": 9.913163424560446e-06, - "loss": 0.809, + "epoch": 0.6979528105482304, + "grad_norm": 0.6698401570320129, + "learning_rate": 3.667514632244912e-05, + "loss": 0.8074, "step": 4023 }, { - "epoch": 1.469417564360051, - "grad_norm": 0.9727697968482971, - "learning_rate": 9.900778905569592e-06, - "loss": 0.8246, + "epoch": 0.6981263011797363, + "grad_norm": 0.7229856252670288, + "learning_rate": 3.667138354411834e-05, + "loss": 0.7888, "step": 4024 }, { - "epoch": 1.4697827277706774, - "grad_norm": 1.120904803276062, - "learning_rate": 9.88839958222273e-06, - "loss": 0.8253, + "epoch": 0.6982997918112422, + "grad_norm": 1.414007544517517, + "learning_rate": 3.666761883105408e-05, + "loss": 0.9172, "step": 4025 }, { - "epoch": 1.4701478911813037, - "grad_norm": 1.5191235542297363, - "learning_rate": 9.876025460888528e-06, - "loss": 0.8255, + "epoch": 0.6984732824427481, + "grad_norm": 1.0068095922470093, + "learning_rate": 3.666385218369324e-05, + "loss": 0.7251, "step": 4026 }, { - "epoch": 1.4705130545919298, - "grad_norm": 1.2207565307617188, - "learning_rate": 9.863656547932976e-06, - "loss": 0.8406, + "epoch": 0.6986467730742539, + "grad_norm": 0.8478007912635803, + "learning_rate": 3.6660083602472924e-05, + "loss": 0.8115, "step": 4027 }, { - "epoch": 1.4708782180025561, - "grad_norm": 0.914360523223877, - "learning_rate": 9.851292849719392e-06, - "loss": 0.8088, + "epoch": 0.6988202637057599, + "grad_norm": 0.9956014156341553, + "learning_rate": 3.6656313087830505e-05, + "loss": 0.7754, "step": 4028 }, { - "epoch": 1.4712433814131824, - "grad_norm": 0.7425459027290344, - "learning_rate": 9.838934372608394e-06, - "loss": 0.825, + "epoch": 0.6989937543372657, + "grad_norm": 0.9531354308128357, + "learning_rate": 3.665254064020353e-05, + "loss": 0.5999, "step": 4029 }, { - "epoch": 1.4716085448238085, - "grad_norm": 1.0881599187850952, - "learning_rate": 9.826581122957915e-06, - "loss": 0.8022, + "epoch": 0.6991672449687717, + "grad_norm": 1.0477161407470703, + "learning_rate": 3.664876626002982e-05, + "loss": 0.9114, "step": 4030 }, { - "epoch": 1.4719737082344349, - "grad_norm": 1.071592926979065, - "learning_rate": 9.814233107123215e-06, - "loss": 0.8505, + "epoch": 0.6993407356002775, + "grad_norm": 1.066310167312622, + "learning_rate": 3.664498994774738e-05, + "loss": 0.7769, "step": 4031 }, { - "epoch": 1.4723388716450612, - "grad_norm": 1.3767377138137817, - "learning_rate": 9.801890331456851e-06, - "loss": 0.8419, + "epoch": 0.6995142262317835, + "grad_norm": 1.1402382850646973, + "learning_rate": 3.6641211703794466e-05, + "loss": 0.8201, "step": 4032 }, { - "epoch": 1.4727040350556875, - "grad_norm": 1.1472898721694946, - "learning_rate": 9.789552802308697e-06, - "loss": 0.8457, + "epoch": 0.6996877168632893, + "grad_norm": 0.7412152290344238, + "learning_rate": 3.663743152860954e-05, + "loss": 0.9187, "step": 4033 }, { - "epoch": 1.4730691984663138, - "grad_norm": 1.1836241483688354, - "learning_rate": 9.777220526025897e-06, - "loss": 0.7977, + "epoch": 0.6998612074947953, + "grad_norm": 0.8276572823524475, + "learning_rate": 3.66336494226313e-05, + "loss": 0.833, "step": 4034 }, { - "epoch": 1.4734343618769399, - "grad_norm": 1.0936325788497925, - "learning_rate": 9.764893508952924e-06, - "loss": 0.8043, + "epoch": 0.7000346981263011, + "grad_norm": 1.046972632408142, + "learning_rate": 3.662986538629866e-05, + "loss": 0.7526, "step": 4035 }, { - "epoch": 1.4737995252875662, - "grad_norm": 1.2448891401290894, - "learning_rate": 9.752571757431528e-06, - "loss": 0.7946, + "epoch": 0.7002081887578071, + "grad_norm": 1.2260123491287231, + "learning_rate": 3.662607942005077e-05, + "loss": 0.8284, "step": 4036 }, { - "epoch": 1.4741646886981925, - "grad_norm": 1.2634477615356445, - "learning_rate": 9.740255277800761e-06, - "loss": 0.8077, + "epoch": 0.700381679389313, + "grad_norm": 0.7654524445533752, + "learning_rate": 3.6622291524326986e-05, + "loss": 0.7592, "step": 4037 }, { - "epoch": 1.4745298521088186, - "grad_norm": 1.0453424453735352, - "learning_rate": 9.727944076396962e-06, - "loss": 0.8546, + "epoch": 0.7005551700208189, + "grad_norm": 1.4399133920669556, + "learning_rate": 3.66185016995669e-05, + "loss": 0.7043, "step": 4038 }, { - "epoch": 1.474895015519445, - "grad_norm": 1.3987904787063599, - "learning_rate": 9.715638159553737e-06, - "loss": 0.8291, + "epoch": 0.7007286606523248, + "grad_norm": 1.0339127779006958, + "learning_rate": 3.661470994621033e-05, + "loss": 0.7083, "step": 4039 }, { - "epoch": 1.4752601789300712, - "grad_norm": 0.9465808272361755, - "learning_rate": 9.703337533601995e-06, - "loss": 0.8199, + "epoch": 0.7009021512838307, + "grad_norm": 0.9347897171974182, + "learning_rate": 3.661091626469731e-05, + "loss": 0.743, "step": 4040 }, { - "epoch": 1.4756253423406975, - "grad_norm": 1.0264639854431152, - "learning_rate": 9.691042204869918e-06, - "loss": 0.8264, + "epoch": 0.7010756419153366, + "grad_norm": 1.145824670791626, + "learning_rate": 3.66071206554681e-05, + "loss": 0.9619, "step": 4041 }, { - "epoch": 1.4759905057513238, - "grad_norm": 1.6162896156311035, - "learning_rate": 9.678752179682947e-06, - "loss": 0.7718, + "epoch": 0.7012491325468425, + "grad_norm": 0.6713416576385498, + "learning_rate": 3.6603323118963194e-05, + "loss": 0.7739, "step": 4042 }, { - "epoch": 1.47635566916195, - "grad_norm": 1.6115630865097046, - "learning_rate": 9.666467464363822e-06, - "loss": 0.8666, + "epoch": 0.7014226231783484, + "grad_norm": 1.1311761140823364, + "learning_rate": 3.6599523655623285e-05, + "loss": 0.8319, "step": 4043 }, { - "epoch": 1.4767208325725762, - "grad_norm": 0.8226298093795776, - "learning_rate": 9.65418806523252e-06, - "loss": 0.8228, + "epoch": 0.7015961138098543, + "grad_norm": 0.872946560382843, + "learning_rate": 3.659572226588932e-05, + "loss": 0.6572, "step": 4044 }, { - "epoch": 1.4770859959832026, - "grad_norm": 1.3465148210525513, - "learning_rate": 9.641913988606304e-06, - "loss": 0.769, + "epoch": 0.7017696044413602, + "grad_norm": 1.0239970684051514, + "learning_rate": 3.659191895020244e-05, + "loss": 0.6552, "step": 4045 }, { - "epoch": 1.4774511593938287, - "grad_norm": 0.8644867539405823, - "learning_rate": 9.629645240799698e-06, - "loss": 0.7953, + "epoch": 0.7019430950728661, + "grad_norm": 0.7353981137275696, + "learning_rate": 3.658811370900404e-05, + "loss": 0.8054, "step": 4046 }, { - "epoch": 1.477816322804455, - "grad_norm": 1.204419732093811, - "learning_rate": 9.617381828124482e-06, - "loss": 0.8429, + "epoch": 0.702116585704372, + "grad_norm": 0.7874762415885925, + "learning_rate": 3.6584306542735715e-05, + "loss": 0.8662, "step": 4047 }, { - "epoch": 1.4781814862150813, - "grad_norm": 0.8916066288948059, - "learning_rate": 9.605123756889692e-06, - "loss": 0.8232, + "epoch": 0.7022900763358778, + "grad_norm": 1.0413366556167603, + "learning_rate": 3.658049745183928e-05, + "loss": 0.7295, "step": 4048 }, { - "epoch": 1.4785466496257076, - "grad_norm": 0.9282631874084473, - "learning_rate": 9.5928710334016e-06, - "loss": 0.8417, + "epoch": 0.7024635669673838, + "grad_norm": 0.8664577007293701, + "learning_rate": 3.657668643675681e-05, + "loss": 0.73, "step": 4049 }, { - "epoch": 1.4789118130363337, - "grad_norm": 1.4894983768463135, - "learning_rate": 9.580623663963753e-06, - "loss": 0.8255, + "epoch": 0.7026370575988896, + "grad_norm": 0.7039134502410889, + "learning_rate": 3.657287349793056e-05, + "loss": 0.8672, "step": 4050 }, { - "epoch": 1.47927697644696, - "grad_norm": 2.033125400543213, - "learning_rate": 9.568381654876924e-06, - "loss": 0.7918, + "epoch": 0.7028105482303956, + "grad_norm": 0.8764761686325073, + "learning_rate": 3.656905863580302e-05, + "loss": 0.6821, "step": 4051 }, { - "epoch": 1.4796421398575863, - "grad_norm": 1.3833266496658325, - "learning_rate": 9.55614501243915e-06, - "loss": 0.8476, + "epoch": 0.7029840388619014, + "grad_norm": 1.1630231142044067, + "learning_rate": 3.656524185081693e-05, + "loss": 0.7307, "step": 4052 }, { - "epoch": 1.4800073032682124, - "grad_norm": 1.107370138168335, - "learning_rate": 9.54391374294567e-06, - "loss": 0.8103, + "epoch": 0.7031575294934074, + "grad_norm": 0.7717318534851074, + "learning_rate": 3.6561423143415216e-05, + "loss": 0.7971, "step": 4053 }, { - "epoch": 1.4803724666788387, - "grad_norm": 1.289602279663086, - "learning_rate": 9.531687852689003e-06, - "loss": 0.7989, + "epoch": 0.7033310201249132, + "grad_norm": 0.9757189154624939, + "learning_rate": 3.655760251404105e-05, + "loss": 0.7659, "step": 4054 }, { - "epoch": 1.480737630089465, - "grad_norm": 1.028420090675354, - "learning_rate": 9.519467347958857e-06, - "loss": 0.8549, + "epoch": 0.7035045107564192, + "grad_norm": 1.08539879322052, + "learning_rate": 3.655377996313782e-05, + "loss": 0.6866, "step": 4055 }, { - "epoch": 1.4811027935000913, - "grad_norm": 1.3203877210617065, - "learning_rate": 9.507252235042205e-06, - "loss": 0.8326, + "epoch": 0.703678001387925, + "grad_norm": 0.7478412985801697, + "learning_rate": 3.654995549114913e-05, + "loss": 0.7158, "step": 4056 }, { - "epoch": 1.4814679569107176, - "grad_norm": 1.0531619787216187, - "learning_rate": 9.495042520223233e-06, - "loss": 0.837, + "epoch": 0.703851492019431, + "grad_norm": 2.470996379852295, + "learning_rate": 3.654612909851882e-05, + "loss": 0.641, "step": 4057 }, { - "epoch": 1.4818331203213437, - "grad_norm": 1.4407105445861816, - "learning_rate": 9.482838209783351e-06, - "loss": 0.813, + "epoch": 0.7040249826509368, + "grad_norm": 1.2903586626052856, + "learning_rate": 3.6542300785690954e-05, + "loss": 0.7466, "step": 4058 }, { - "epoch": 1.48219828373197, - "grad_norm": 1.5786292552947998, - "learning_rate": 9.470639310001176e-06, - "loss": 0.8191, + "epoch": 0.7041984732824428, + "grad_norm": 0.7569193840026855, + "learning_rate": 3.653847055310981e-05, + "loss": 0.8164, "step": 4059 }, { - "epoch": 1.4825634471425964, - "grad_norm": 0.825387179851532, - "learning_rate": 9.458445827152558e-06, - "loss": 0.8416, + "epoch": 0.7043719639139486, + "grad_norm": 1.30368173122406, + "learning_rate": 3.6534638401219874e-05, + "loss": 0.6754, "step": 4060 }, { - "epoch": 1.4829286105532224, - "grad_norm": 0.8722606301307678, - "learning_rate": 9.446257767510559e-06, - "loss": 0.8105, + "epoch": 0.7045454545454546, + "grad_norm": 1.0516095161437988, + "learning_rate": 3.653080433046589e-05, + "loss": 0.6104, "step": 4061 }, { - "epoch": 1.4832937739638488, - "grad_norm": 1.2076358795166016, - "learning_rate": 9.434075137345447e-06, - "loss": 0.7999, + "epoch": 0.7047189451769604, + "grad_norm": 0.8746193647384644, + "learning_rate": 3.652696834129281e-05, + "loss": 0.7292, "step": 4062 }, { - "epoch": 1.483658937374475, - "grad_norm": 0.9067274332046509, - "learning_rate": 9.421897942924687e-06, - "loss": 0.8173, + "epoch": 0.7048924358084664, + "grad_norm": 0.800766110420227, + "learning_rate": 3.652313043414579e-05, + "loss": 0.6885, "step": 4063 }, { - "epoch": 1.4840241007851014, - "grad_norm": 1.0865105390548706, - "learning_rate": 9.409726190512962e-06, - "loss": 0.8336, + "epoch": 0.7050659264399722, + "grad_norm": 0.9395405650138855, + "learning_rate": 3.6519290609470225e-05, + "loss": 0.8838, "step": 4064 }, { - "epoch": 1.4843892641957277, - "grad_norm": 1.2221941947937012, - "learning_rate": 9.397559886372152e-06, - "loss": 0.8295, + "epoch": 0.7052394170714782, + "grad_norm": 0.8437332510948181, + "learning_rate": 3.651544886771174e-05, + "loss": 0.7036, "step": 4065 }, { - "epoch": 1.4847544276063538, - "grad_norm": 1.2529449462890625, - "learning_rate": 9.385399036761329e-06, - "loss": 0.8309, + "epoch": 0.705412907702984, + "grad_norm": 0.8933786153793335, + "learning_rate": 3.651160520931617e-05, + "loss": 0.7498, "step": 4066 }, { - "epoch": 1.48511959101698, - "grad_norm": 1.1461626291275024, - "learning_rate": 9.373243647936773e-06, - "loss": 0.8027, + "epoch": 0.7055863983344899, + "grad_norm": 0.754020631313324, + "learning_rate": 3.650775963472958e-05, + "loss": 0.7563, "step": 4067 }, { - "epoch": 1.4854847544276064, - "grad_norm": 1.293305516242981, - "learning_rate": 9.361093726151935e-06, - "loss": 0.8179, + "epoch": 0.7057598889659958, + "grad_norm": 1.2931221723556519, + "learning_rate": 3.650391214439825e-05, + "loss": 0.7303, "step": 4068 }, { - "epoch": 1.4858499178382325, - "grad_norm": 0.8800475001335144, - "learning_rate": 9.348949277657455e-06, - "loss": 0.8234, + "epoch": 0.7059333795975017, + "grad_norm": 0.9485383033752441, + "learning_rate": 3.6500062738768675e-05, + "loss": 0.7615, "step": 4069 }, { - "epoch": 1.4862150812488588, - "grad_norm": 1.3795913457870483, - "learning_rate": 9.33681030870117e-06, - "loss": 0.8254, + "epoch": 0.7061068702290076, + "grad_norm": 0.8281378746032715, + "learning_rate": 3.64962114182876e-05, + "loss": 0.7153, "step": 4070 }, { - "epoch": 1.4865802446594851, - "grad_norm": 1.229042887687683, - "learning_rate": 9.324676825528095e-06, - "loss": 0.8025, + "epoch": 0.7062803608605135, + "grad_norm": 0.8707072138786316, + "learning_rate": 3.649235818340197e-05, + "loss": 0.6921, "step": 4071 }, { - "epoch": 1.4869454080701114, - "grad_norm": 0.7794329524040222, - "learning_rate": 9.312548834380429e-06, - "loss": 0.8405, + "epoch": 0.7064538514920194, + "grad_norm": 1.2330180406570435, + "learning_rate": 3.648850303455895e-05, + "loss": 0.6562, "step": 4072 }, { - "epoch": 1.4873105714807378, - "grad_norm": 0.9986517429351807, - "learning_rate": 9.300426341497515e-06, - "loss": 0.7951, + "epoch": 0.7066273421235253, + "grad_norm": 1.0307742357254028, + "learning_rate": 3.648464597220594e-05, + "loss": 0.7703, "step": 4073 }, { - "epoch": 1.4876757348913638, - "grad_norm": 1.118321418762207, - "learning_rate": 9.288309353115903e-06, - "loss": 0.824, + "epoch": 0.7068008327550312, + "grad_norm": 0.9141172170639038, + "learning_rate": 3.6480786996790554e-05, + "loss": 0.624, "step": 4074 }, { - "epoch": 1.4880408983019902, - "grad_norm": 1.2831103801727295, - "learning_rate": 9.276197875469298e-06, - "loss": 0.8057, + "epoch": 0.7069743233865371, + "grad_norm": 1.5019493103027344, + "learning_rate": 3.647692610876064e-05, + "loss": 0.7516, "step": 4075 }, { - "epoch": 1.4884060617126165, - "grad_norm": 1.030640721321106, - "learning_rate": 9.264091914788572e-06, - "loss": 0.8698, + "epoch": 0.707147814018043, + "grad_norm": 0.9932965040206909, + "learning_rate": 3.647306330856425e-05, + "loss": 0.6895, "step": 4076 }, { - "epoch": 1.4887712251232426, - "grad_norm": 1.6356472969055176, - "learning_rate": 9.251991477301742e-06, - "loss": 0.8142, + "epoch": 0.7073213046495489, + "grad_norm": 0.9260485172271729, + "learning_rate": 3.6469198596649663e-05, + "loss": 0.825, "step": 4077 }, { - "epoch": 1.4891363885338689, - "grad_norm": 0.9869392514228821, - "learning_rate": 9.239896569234008e-06, - "loss": 0.8568, + "epoch": 0.7074947952810549, + "grad_norm": 1.9332634210586548, + "learning_rate": 3.646533197346539e-05, + "loss": 0.802, "step": 4078 }, { - "epoch": 1.4895015519444952, - "grad_norm": 1.1379454135894775, - "learning_rate": 9.227807196807711e-06, - "loss": 0.8394, + "epoch": 0.7076682859125607, + "grad_norm": 0.7419928312301636, + "learning_rate": 3.6461463439460156e-05, + "loss": 0.739, "step": 4079 }, { - "epoch": 1.4898667153551215, - "grad_norm": 1.1055103540420532, - "learning_rate": 9.215723366242352e-06, - "loss": 0.8336, + "epoch": 0.7078417765440667, + "grad_norm": 1.623947262763977, + "learning_rate": 3.6457592995082915e-05, + "loss": 0.7694, "step": 4080 }, { - "epoch": 1.4902318787657478, - "grad_norm": 0.861528217792511, - "learning_rate": 9.203645083754581e-06, - "loss": 0.8121, + "epoch": 0.7080152671755725, + "grad_norm": 0.8942829370498657, + "learning_rate": 3.645372064078282e-05, + "loss": 0.6863, "step": 4081 }, { - "epoch": 1.490597042176374, - "grad_norm": 1.795096755027771, - "learning_rate": 9.191572355558187e-06, - "loss": 0.8645, + "epoch": 0.7081887578070785, + "grad_norm": 0.813556969165802, + "learning_rate": 3.644984637700928e-05, + "loss": 0.8132, "step": 4082 }, { - "epoch": 1.4909622055870002, - "grad_norm": 1.313956379890442, - "learning_rate": 9.17950518786409e-06, - "loss": 0.8203, + "epoch": 0.7083622484385843, + "grad_norm": 0.7513930201530457, + "learning_rate": 3.644597020421189e-05, + "loss": 0.7205, "step": 4083 }, { - "epoch": 1.4913273689976263, - "grad_norm": 1.140581727027893, - "learning_rate": 9.167443586880376e-06, - "loss": 0.8113, + "epoch": 0.7085357390700903, + "grad_norm": 0.9424650073051453, + "learning_rate": 3.6442092122840505e-05, + "loss": 0.7166, "step": 4084 }, { - "epoch": 1.4916925324082526, - "grad_norm": 1.2640315294265747, - "learning_rate": 9.155387558812252e-06, - "loss": 0.7922, + "epoch": 0.7087092297015961, + "grad_norm": 1.0423951148986816, + "learning_rate": 3.6438212133345164e-05, + "loss": 0.661, "step": 4085 }, { - "epoch": 1.492057695818879, - "grad_norm": 1.0180869102478027, - "learning_rate": 9.14333710986207e-06, - "loss": 0.8217, + "epoch": 0.7088827203331021, + "grad_norm": 1.0095725059509277, + "learning_rate": 3.643433023617616e-05, + "loss": 0.8633, "step": 4086 }, { - "epoch": 1.4924228592295052, - "grad_norm": 1.0719835758209229, - "learning_rate": 9.131292246229286e-06, - "loss": 0.8177, + "epoch": 0.7090562109646079, + "grad_norm": 0.8471446633338928, + "learning_rate": 3.643044643178397e-05, + "loss": 0.7676, "step": 4087 }, { - "epoch": 1.4927880226401316, - "grad_norm": 1.0945136547088623, - "learning_rate": 9.119252974110508e-06, - "loss": 0.8011, + "epoch": 0.7092297015961138, + "grad_norm": 0.9072524309158325, + "learning_rate": 3.642656072061933e-05, + "loss": 0.8571, "step": 4088 }, { - "epoch": 1.4931531860507576, - "grad_norm": 1.3694754838943481, - "learning_rate": 9.107219299699459e-06, - "loss": 0.8455, + "epoch": 0.7094031922276197, + "grad_norm": 0.9445000886917114, + "learning_rate": 3.6422673103133186e-05, + "loss": 0.6877, "step": 4089 }, { - "epoch": 1.493518349461384, - "grad_norm": 1.1530365943908691, - "learning_rate": 9.095191229186977e-06, - "loss": 0.8011, + "epoch": 0.7095766828591256, + "grad_norm": 1.0938985347747803, + "learning_rate": 3.641878357977668e-05, + "loss": 0.6902, "step": 4090 }, { - "epoch": 1.4938835128720103, - "grad_norm": 0.9750694632530212, - "learning_rate": 9.083168768761035e-06, - "loss": 0.7914, + "epoch": 0.7097501734906315, + "grad_norm": 0.85311359167099, + "learning_rate": 3.641489215100122e-05, + "loss": 0.6469, "step": 4091 }, { - "epoch": 1.4942486762826364, - "grad_norm": 1.4250874519348145, - "learning_rate": 9.071151924606688e-06, - "loss": 0.8243, + "epoch": 0.7099236641221374, + "grad_norm": 1.3281558752059937, + "learning_rate": 3.641099881725839e-05, + "loss": 0.772, "step": 4092 }, { - "epoch": 1.4946138396932627, - "grad_norm": 1.2021548748016357, - "learning_rate": 9.059140702906128e-06, - "loss": 0.8297, + "epoch": 0.7100971547536433, + "grad_norm": 2.6322741508483887, + "learning_rate": 3.6407103579000024e-05, + "loss": 0.6571, "step": 4093 }, { - "epoch": 1.494979003103889, - "grad_norm": 1.1803535223007202, - "learning_rate": 9.047135109838654e-06, - "loss": 0.8348, + "epoch": 0.7102706453851492, + "grad_norm": 0.9399757385253906, + "learning_rate": 3.6403206436678173e-05, + "loss": 0.625, "step": 4094 }, { - "epoch": 1.4953441665145153, - "grad_norm": 0.8983317613601685, - "learning_rate": 9.035135151580649e-06, - "loss": 0.8126, + "epoch": 0.7104441360166551, + "grad_norm": 1.1080659627914429, + "learning_rate": 3.63993073907451e-05, + "loss": 0.6094, "step": 4095 }, { - "epoch": 1.4957093299251416, - "grad_norm": 1.1152926683425903, - "learning_rate": 9.023140834305621e-06, - "loss": 0.79, + "epoch": 0.710617626648161, + "grad_norm": 1.3501157760620117, + "learning_rate": 3.63954064416533e-05, + "loss": 0.7434, "step": 4096 }, { - "epoch": 1.4960744933357677, - "grad_norm": 1.1868177652359009, - "learning_rate": 9.011152164184157e-06, - "loss": 0.792, + "epoch": 0.7107911172796669, + "grad_norm": 1.0539360046386719, + "learning_rate": 3.639150358985547e-05, + "loss": 0.6987, "step": 4097 }, { - "epoch": 1.496439656746394, - "grad_norm": 1.4260473251342773, - "learning_rate": 8.999169147383943e-06, - "loss": 0.8942, + "epoch": 0.7109646079111728, + "grad_norm": 0.9256290793418884, + "learning_rate": 3.6387598835804555e-05, + "loss": 0.7643, "step": 4098 }, { - "epoch": 1.4968048201570203, - "grad_norm": 1.1968029737472534, - "learning_rate": 8.987191790069771e-06, - "loss": 0.8181, + "epoch": 0.7111380985426787, + "grad_norm": 1.0216971635818481, + "learning_rate": 3.63836921799537e-05, + "loss": 0.7464, "step": 4099 }, { - "epoch": 1.4971699835676464, - "grad_norm": 0.8912290334701538, - "learning_rate": 8.975220098403507e-06, - "loss": 0.8015, + "epoch": 0.7113115891741846, + "grad_norm": 2.5454986095428467, + "learning_rate": 3.6379783622756275e-05, + "loss": 0.7714, "step": 4100 }, { - "epoch": 1.4975351469782727, - "grad_norm": 1.0320634841918945, - "learning_rate": 8.963254078544112e-06, - "loss": 0.8052, + "epoch": 0.7114850798056905, + "grad_norm": 0.7312251329421997, + "learning_rate": 3.637587316466587e-05, + "loss": 0.7361, "step": 4101 }, { - "epoch": 1.497900310388899, - "grad_norm": 0.9280909895896912, - "learning_rate": 8.951293736647608e-06, - "loss": 0.8666, + "epoch": 0.7116585704371964, + "grad_norm": 0.7561297416687012, + "learning_rate": 3.6371960806136313e-05, + "loss": 0.6967, "step": 4102 }, { - "epoch": 1.4982654737995253, - "grad_norm": 0.9688608050346375, - "learning_rate": 8.93933907886712e-06, - "loss": 0.8272, + "epoch": 0.7118320610687023, + "grad_norm": 1.9516030550003052, + "learning_rate": 3.636804654762162e-05, + "loss": 0.7418, "step": 4103 }, { - "epoch": 1.4986306372101517, - "grad_norm": 1.632715106010437, - "learning_rate": 8.92739011135284e-06, - "loss": 0.8674, + "epoch": 0.7120055517002082, + "grad_norm": 0.881834864616394, + "learning_rate": 3.636413038957605e-05, + "loss": 0.6399, "step": 4104 }, { - "epoch": 1.4989958006207778, - "grad_norm": 1.2433996200561523, - "learning_rate": 8.91544684025204e-06, - "loss": 0.8485, + "epoch": 0.7121790423317141, + "grad_norm": 0.897731602191925, + "learning_rate": 3.6360212332454087e-05, + "loss": 0.699, "step": 4105 }, { - "epoch": 1.499360964031404, - "grad_norm": 1.106766939163208, - "learning_rate": 8.903509271709035e-06, - "loss": 0.8268, + "epoch": 0.71235253296322, + "grad_norm": 0.9043697118759155, + "learning_rate": 3.635629237671041e-05, + "loss": 0.7415, "step": 4106 }, { - "epoch": 1.4997261274420304, - "grad_norm": 2.415583848953247, - "learning_rate": 8.891577411865237e-06, - "loss": 0.8132, + "epoch": 0.7125260235947258, + "grad_norm": 1.1444686651229858, + "learning_rate": 3.6352370522799956e-05, + "loss": 0.7164, "step": 4107 }, { - "epoch": 1.5000912908526565, - "grad_norm": 1.0082298517227173, - "learning_rate": 8.879651266859116e-06, - "loss": 0.8075, + "epoch": 0.7126995142262318, + "grad_norm": 0.9036661982536316, + "learning_rate": 3.634844677117784e-05, + "loss": 0.6938, "step": 4108 }, { - "epoch": 1.5004564542632828, - "grad_norm": 1.2329856157302856, - "learning_rate": 8.867730842826177e-06, - "loss": 0.8314, + "epoch": 0.7128730048577376, + "grad_norm": 1.1364202499389648, + "learning_rate": 3.634452112229942e-05, + "loss": 0.7173, "step": 4109 }, { - "epoch": 1.500821617673909, - "grad_norm": 1.1341005563735962, - "learning_rate": 8.855816145899016e-06, - "loss": 0.8429, + "epoch": 0.7130464954892436, + "grad_norm": 0.8395532965660095, + "learning_rate": 3.63405935766203e-05, + "loss": 0.6956, "step": 4110 }, { - "epoch": 1.5011867810845354, - "grad_norm": 1.0439532995224, - "learning_rate": 8.843907182207254e-06, - "loss": 0.8523, + "epoch": 0.7132199861207494, + "grad_norm": 0.9547373056411743, + "learning_rate": 3.633666413459624e-05, + "loss": 0.7563, "step": 4111 }, { - "epoch": 1.5015519444951617, - "grad_norm": 1.0233772993087769, - "learning_rate": 8.832003957877579e-06, - "loss": 0.8553, + "epoch": 0.7133934767522554, + "grad_norm": 0.8992285132408142, + "learning_rate": 3.633273279668327e-05, + "loss": 0.7097, "step": 4112 }, { - "epoch": 1.5019171079057878, - "grad_norm": 1.0232164859771729, - "learning_rate": 8.820106479033725e-06, - "loss": 0.8353, + "epoch": 0.7135669673837612, + "grad_norm": 0.742043673992157, + "learning_rate": 3.632879956333763e-05, + "loss": 0.7075, "step": 4113 }, { - "epoch": 1.5022822713164141, - "grad_norm": 1.4347275495529175, - "learning_rate": 8.808214751796467e-06, - "loss": 0.8043, + "epoch": 0.7137404580152672, + "grad_norm": 0.7700583338737488, + "learning_rate": 3.632486443501578e-05, + "loss": 0.7257, "step": 4114 }, { - "epoch": 1.5026474347270402, - "grad_norm": 1.2399473190307617, - "learning_rate": 8.796328782283627e-06, - "loss": 0.8401, + "epoch": 0.713913948646773, + "grad_norm": 0.6504289507865906, + "learning_rate": 3.632092741217438e-05, + "loss": 0.855, "step": 4115 }, { - "epoch": 1.5030125981376665, - "grad_norm": 1.0432449579238892, - "learning_rate": 8.784448576610045e-06, - "loss": 0.8058, + "epoch": 0.714087439278279, + "grad_norm": 1.1263492107391357, + "learning_rate": 3.631698849527034e-05, + "loss": 0.8584, "step": 4116 }, { - "epoch": 1.5033777615482928, - "grad_norm": 1.035149335861206, - "learning_rate": 8.77257414088762e-06, - "loss": 0.7797, + "epoch": 0.7142609299097848, + "grad_norm": 0.7401215434074402, + "learning_rate": 3.631304768476078e-05, + "loss": 0.7321, "step": 4117 }, { - "epoch": 1.5037429249589191, - "grad_norm": 1.0088367462158203, - "learning_rate": 8.760705481225271e-06, - "loss": 0.8196, + "epoch": 0.7144344205412908, + "grad_norm": 0.7783308029174805, + "learning_rate": 3.630910498110302e-05, + "loss": 0.7493, "step": 4118 }, { - "epoch": 1.5041080883695455, - "grad_norm": 1.0742065906524658, - "learning_rate": 8.748842603728954e-06, - "loss": 0.8024, + "epoch": 0.7146079111727967, + "grad_norm": 0.8238902688026428, + "learning_rate": 3.630516038475462e-05, + "loss": 0.8772, "step": 4119 }, { - "epoch": 1.5044732517801718, - "grad_norm": 0.9499788880348206, - "learning_rate": 8.736985514501627e-06, - "loss": 0.8417, + "epoch": 0.7147814018043026, + "grad_norm": 1.0470646619796753, + "learning_rate": 3.630121389617336e-05, + "loss": 0.7776, "step": 4120 }, { - "epoch": 1.5048384151907979, - "grad_norm": 1.019378662109375, - "learning_rate": 8.725134219643307e-06, - "loss": 0.8175, + "epoch": 0.7149548924358085, + "grad_norm": 0.7510324716567993, + "learning_rate": 3.6297265515817234e-05, + "loss": 0.8259, "step": 4121 }, { - "epoch": 1.5052035786014242, - "grad_norm": 1.3460936546325684, - "learning_rate": 8.71328872525099e-06, - "loss": 0.7999, + "epoch": 0.7151283830673144, + "grad_norm": 0.746124267578125, + "learning_rate": 3.629331524414446e-05, + "loss": 0.6832, "step": 4122 }, { - "epoch": 1.5055687420120503, - "grad_norm": 1.0814775228500366, - "learning_rate": 8.701449037418717e-06, - "loss": 0.8051, + "epoch": 0.7153018736988203, + "grad_norm": 0.8589576482772827, + "learning_rate": 3.628936308161346e-05, + "loss": 0.8046, "step": 4123 }, { - "epoch": 1.5059339054226766, - "grad_norm": 0.8658519387245178, - "learning_rate": 8.68961516223753e-06, - "loss": 0.8422, + "epoch": 0.7154753643303262, + "grad_norm": 1.4506326913833618, + "learning_rate": 3.6285409028682895e-05, + "loss": 0.7633, "step": 4124 }, { - "epoch": 1.506299068833303, - "grad_norm": 1.2129791975021362, - "learning_rate": 8.677787105795494e-06, - "loss": 0.8286, + "epoch": 0.7156488549618321, + "grad_norm": 0.8556270599365234, + "learning_rate": 3.6281453085811634e-05, + "loss": 0.7235, "step": 4125 }, { - "epoch": 1.5066642322439292, - "grad_norm": 1.040724277496338, - "learning_rate": 8.66596487417765e-06, - "loss": 0.8375, + "epoch": 0.7158223455933379, + "grad_norm": 1.3583617210388184, + "learning_rate": 3.627749525345878e-05, + "loss": 0.7344, "step": 4126 }, { - "epoch": 1.5070293956545555, - "grad_norm": 0.8417491912841797, - "learning_rate": 8.654148473466075e-06, - "loss": 0.7906, + "epoch": 0.7159958362248439, + "grad_norm": 0.9798122644424438, + "learning_rate": 3.627353553208362e-05, + "loss": 0.6769, "step": 4127 }, { - "epoch": 1.5073945590651818, - "grad_norm": 0.9858561754226685, - "learning_rate": 8.642337909739826e-06, - "loss": 0.8238, + "epoch": 0.7161693268563497, + "grad_norm": 0.9259627461433411, + "learning_rate": 3.626957392214571e-05, + "loss": 0.6483, "step": 4128 }, { - "epoch": 1.507759722475808, - "grad_norm": 0.9423806667327881, - "learning_rate": 8.630533189074979e-06, - "loss": 0.8002, + "epoch": 0.7163428174878557, + "grad_norm": 0.7991718649864197, + "learning_rate": 3.626561042410479e-05, + "loss": 0.7485, "step": 4129 }, { - "epoch": 1.5081248858864342, - "grad_norm": 1.2686686515808105, - "learning_rate": 8.618734317544569e-06, - "loss": 0.8361, + "epoch": 0.7165163081193615, + "grad_norm": 0.9961663484573364, + "learning_rate": 3.626164503842082e-05, + "loss": 0.6223, "step": 4130 }, { - "epoch": 1.5084900492970603, - "grad_norm": 1.3225703239440918, - "learning_rate": 8.60694130121865e-06, - "loss": 0.8295, + "epoch": 0.7166897987508675, + "grad_norm": 0.8273112773895264, + "learning_rate": 3.6257677765553996e-05, + "loss": 0.7449, "step": 4131 }, { - "epoch": 1.5088552127076866, - "grad_norm": 1.1594396829605103, - "learning_rate": 8.595154146164257e-06, - "loss": 0.863, + "epoch": 0.7168632893823733, + "grad_norm": 0.953797459602356, + "learning_rate": 3.6253708605964724e-05, + "loss": 0.7786, "step": 4132 }, { - "epoch": 1.509220376118313, - "grad_norm": 1.516986608505249, - "learning_rate": 8.583372858445408e-06, - "loss": 0.8204, + "epoch": 0.7170367800138793, + "grad_norm": 0.845944344997406, + "learning_rate": 3.624973756011363e-05, + "loss": 0.7612, "step": 4133 }, { - "epoch": 1.5095855395289393, - "grad_norm": 1.0578380823135376, - "learning_rate": 8.57159744412311e-06, - "loss": 0.7545, + "epoch": 0.7172102706453851, + "grad_norm": 0.77565598487854, + "learning_rate": 3.6245764628461556e-05, + "loss": 0.8181, "step": 4134 }, { - "epoch": 1.5099507029395656, - "grad_norm": 0.9757749438285828, - "learning_rate": 8.559827909255333e-06, - "loss": 0.83, + "epoch": 0.7173837612768911, + "grad_norm": 0.8768342137336731, + "learning_rate": 3.624178981146956e-05, + "loss": 0.7456, "step": 4135 }, { - "epoch": 1.5103158663501917, - "grad_norm": 0.953019917011261, - "learning_rate": 8.548064259897024e-06, - "loss": 0.8418, + "epoch": 0.7175572519083969, + "grad_norm": 0.7703996300697327, + "learning_rate": 3.6237813109598944e-05, + "loss": 0.8848, "step": 4136 }, { - "epoch": 1.510681029760818, - "grad_norm": 1.07356858253479, - "learning_rate": 8.536306502100118e-06, - "loss": 0.782, + "epoch": 0.7177307425399029, + "grad_norm": 0.8010103106498718, + "learning_rate": 3.623383452331119e-05, + "loss": 0.7427, "step": 4137 }, { - "epoch": 1.511046193171444, - "grad_norm": 0.8663842678070068, - "learning_rate": 8.524554641913504e-06, - "loss": 0.823, + "epoch": 0.7179042331714087, + "grad_norm": 0.8242023587226868, + "learning_rate": 3.622985405306803e-05, + "loss": 0.6526, "step": 4138 }, { - "epoch": 1.5114113565820704, - "grad_norm": 1.2117549180984497, - "learning_rate": 8.512808685383056e-06, - "loss": 0.7944, + "epoch": 0.7180777238029147, + "grad_norm": 0.9645797610282898, + "learning_rate": 3.622587169933138e-05, + "loss": 0.6875, "step": 4139 }, { - "epoch": 1.5117765199926967, - "grad_norm": 1.2439460754394531, - "learning_rate": 8.501068638551577e-06, - "loss": 0.8271, + "epoch": 0.7182512144344205, + "grad_norm": 1.4879436492919922, + "learning_rate": 3.622188746256343e-05, + "loss": 0.7701, "step": 4140 }, { - "epoch": 1.512141683403323, - "grad_norm": 1.2013447284698486, - "learning_rate": 8.489334507458862e-06, - "loss": 0.8295, + "epoch": 0.7184247050659265, + "grad_norm": 0.667752742767334, + "learning_rate": 3.6217901343226526e-05, + "loss": 0.8079, "step": 4141 }, { - "epoch": 1.5125068468139493, - "grad_norm": 1.1373149156570435, - "learning_rate": 8.47760629814165e-06, - "loss": 0.8171, + "epoch": 0.7185981956974323, + "grad_norm": 0.9773068428039551, + "learning_rate": 3.621391334178328e-05, + "loss": 0.79, "step": 4142 }, { - "epoch": 1.5128720102245756, - "grad_norm": 1.3387954235076904, - "learning_rate": 8.465884016633629e-06, - "loss": 0.823, + "epoch": 0.7187716863289383, + "grad_norm": 0.9373596906661987, + "learning_rate": 3.620992345869649e-05, + "loss": 0.8691, "step": 4143 }, { - "epoch": 1.5132371736352017, - "grad_norm": 0.7233942151069641, - "learning_rate": 8.454167668965457e-06, - "loss": 0.8181, + "epoch": 0.7189451769604441, + "grad_norm": 1.2208003997802734, + "learning_rate": 3.62059316944292e-05, + "loss": 0.7715, "step": 4144 }, { - "epoch": 1.513602337045828, - "grad_norm": 1.2604109048843384, - "learning_rate": 8.442457261164705e-06, - "loss": 0.8241, + "epoch": 0.7191186675919501, + "grad_norm": 1.0328786373138428, + "learning_rate": 3.6201938049444654e-05, + "loss": 0.6541, "step": 4145 }, { - "epoch": 1.5139675004564541, - "grad_norm": 1.5177028179168701, - "learning_rate": 8.430752799255918e-06, - "loss": 0.8044, + "epoch": 0.7192921582234559, + "grad_norm": 1.5308524370193481, + "learning_rate": 3.619794252420632e-05, + "loss": 0.6975, "step": 4146 }, { - "epoch": 1.5143326638670804, - "grad_norm": 0.8127067685127258, - "learning_rate": 8.419054289260569e-06, - "loss": 0.7774, + "epoch": 0.7194656488549618, + "grad_norm": 0.9636774063110352, + "learning_rate": 3.619394511917788e-05, + "loss": 0.8293, "step": 4147 }, { - "epoch": 1.5146978272777067, - "grad_norm": 0.829156219959259, - "learning_rate": 8.407361737197079e-06, - "loss": 0.8429, + "epoch": 0.7196391394864677, + "grad_norm": 1.051942229270935, + "learning_rate": 3.618994583482323e-05, + "loss": 0.7015, "step": 4148 }, { - "epoch": 1.515062990688333, - "grad_norm": 0.9569194316864014, - "learning_rate": 8.395675149080795e-06, - "loss": 0.786, + "epoch": 0.7198126301179736, + "grad_norm": 0.6598153114318848, + "learning_rate": 3.618594467160651e-05, + "loss": 0.7874, "step": 4149 }, { - "epoch": 1.5154281540989594, - "grad_norm": 0.8182485699653625, - "learning_rate": 8.383994530923987e-06, - "loss": 0.8068, + "epoch": 0.7199861207494795, + "grad_norm": 1.6635960340499878, + "learning_rate": 3.618194162999205e-05, + "loss": 0.9399, "step": 4150 }, { - "epoch": 1.5157933175095857, - "grad_norm": 1.1036359071731567, - "learning_rate": 8.372319888735872e-06, - "loss": 0.7916, + "epoch": 0.7201596113809854, + "grad_norm": 1.260628342628479, + "learning_rate": 3.617793671044441e-05, + "loss": 0.6782, "step": 4151 }, { - "epoch": 1.5161584809202118, - "grad_norm": 0.9602381587028503, - "learning_rate": 8.360651228522583e-06, - "loss": 0.8066, + "epoch": 0.7203331020124913, + "grad_norm": 0.7404012084007263, + "learning_rate": 3.617392991342836e-05, + "loss": 0.6604, "step": 4152 }, { - "epoch": 1.516523644330838, - "grad_norm": 1.0537254810333252, - "learning_rate": 8.348988556287185e-06, - "loss": 0.8231, + "epoch": 0.7205065926439972, + "grad_norm": 0.9810252785682678, + "learning_rate": 3.6169921239408894e-05, + "loss": 0.7184, "step": 4153 }, { - "epoch": 1.5168888077414642, - "grad_norm": 0.8438145518302917, - "learning_rate": 8.337331878029644e-06, - "loss": 0.8149, + "epoch": 0.7206800832755031, + "grad_norm": 0.9210187792778015, + "learning_rate": 3.616591068885123e-05, + "loss": 0.7572, "step": 4154 }, { - "epoch": 1.5172539711520905, - "grad_norm": 0.9253329634666443, - "learning_rate": 8.325681199746856e-06, - "loss": 0.8282, + "epoch": 0.720853573907009, + "grad_norm": 0.8296277523040771, + "learning_rate": 3.61618982622208e-05, + "loss": 0.6925, "step": 4155 }, { - "epoch": 1.5176191345627168, - "grad_norm": 1.8270090818405151, - "learning_rate": 8.314036527432631e-06, - "loss": 0.8616, + "epoch": 0.721027064538515, + "grad_norm": 0.9490920305252075, + "learning_rate": 3.6157883959983234e-05, + "loss": 0.8879, "step": 4156 }, { - "epoch": 1.517984297973343, - "grad_norm": 0.9606066942214966, - "learning_rate": 8.302397867077683e-06, - "loss": 0.8079, + "epoch": 0.7212005551700208, + "grad_norm": 0.9323275685310364, + "learning_rate": 3.615386778260441e-05, + "loss": 0.7605, "step": 4157 }, { - "epoch": 1.5183494613839694, - "grad_norm": 0.9083921909332275, - "learning_rate": 8.290765224669646e-06, - "loss": 0.8034, + "epoch": 0.7213740458015268, + "grad_norm": 0.7767539620399475, + "learning_rate": 3.614984973055041e-05, + "loss": 0.7234, "step": 4158 }, { - "epoch": 1.5187146247945957, - "grad_norm": 0.8462758660316467, - "learning_rate": 8.27913860619303e-06, - "loss": 0.8131, + "epoch": 0.7215475364330326, + "grad_norm": 1.029406189918518, + "learning_rate": 3.6145829804287526e-05, + "loss": 0.7092, "step": 4159 }, { - "epoch": 1.5190797882052218, - "grad_norm": 1.138016939163208, - "learning_rate": 8.267518017629272e-06, - "loss": 0.809, + "epoch": 0.7217210270645386, + "grad_norm": 0.8024153113365173, + "learning_rate": 3.614180800428228e-05, + "loss": 0.8569, "step": 4160 }, { - "epoch": 1.5194449516158481, - "grad_norm": 1.1181797981262207, - "learning_rate": 8.255903464956707e-06, - "loss": 0.8544, + "epoch": 0.7218945176960444, + "grad_norm": 0.7872860431671143, + "learning_rate": 3.613778433100141e-05, + "loss": 0.8242, "step": 4161 }, { - "epoch": 1.5198101150264742, - "grad_norm": 1.0089361667633057, - "learning_rate": 8.24429495415054e-06, - "loss": 0.8298, + "epoch": 0.7220680083275504, + "grad_norm": 0.6617968082427979, + "learning_rate": 3.6133758784911864e-05, + "loss": 0.6827, "step": 4162 }, { - "epoch": 1.5201752784371005, - "grad_norm": 0.9383481740951538, - "learning_rate": 8.232692491182898e-06, - "loss": 0.7991, + "epoch": 0.7222414989590562, + "grad_norm": 0.7136391401290894, + "learning_rate": 3.612973136648081e-05, + "loss": 0.8337, "step": 4163 }, { - "epoch": 1.5205404418477269, - "grad_norm": 1.0730175971984863, - "learning_rate": 8.221096082022773e-06, - "loss": 0.8113, + "epoch": 0.7224149895905622, + "grad_norm": 0.7647514939308167, + "learning_rate": 3.6125702076175636e-05, + "loss": 0.7583, "step": 4164 }, { - "epoch": 1.5209056052583532, - "grad_norm": 1.106719732284546, - "learning_rate": 8.209505732636052e-06, - "loss": 0.8045, + "epoch": 0.722588480222068, + "grad_norm": 0.8157259225845337, + "learning_rate": 3.612167091446394e-05, + "loss": 0.8479, "step": 4165 }, { - "epoch": 1.5212707686689795, - "grad_norm": 1.2439172267913818, - "learning_rate": 8.197921448985512e-06, - "loss": 0.7885, + "epoch": 0.7227619708535739, + "grad_norm": 1.0402411222457886, + "learning_rate": 3.611763788181356e-05, + "loss": 0.7036, "step": 4166 }, { - "epoch": 1.5216359320796056, - "grad_norm": 0.7484830021858215, - "learning_rate": 8.186343237030795e-06, - "loss": 0.8173, + "epoch": 0.7229354614850798, + "grad_norm": 0.8964327573776245, + "learning_rate": 3.6113602978692514e-05, + "loss": 0.6975, "step": 4167 }, { - "epoch": 1.5220010954902319, - "grad_norm": 1.0567265748977661, - "learning_rate": 8.174771102728438e-06, - "loss": 0.8363, + "epoch": 0.7231089521165857, + "grad_norm": 0.887392520904541, + "learning_rate": 3.610956620556907e-05, + "loss": 0.6327, "step": 4168 }, { - "epoch": 1.522366258900858, - "grad_norm": 1.9291954040527344, - "learning_rate": 8.163205052031827e-06, - "loss": 0.8217, + "epoch": 0.7232824427480916, + "grad_norm": 0.7320978045463562, + "learning_rate": 3.61055275629117e-05, + "loss": 0.7561, "step": 4169 }, { - "epoch": 1.5227314223114843, - "grad_norm": 0.9876084327697754, - "learning_rate": 8.151645090891234e-06, - "loss": 0.8137, + "epoch": 0.7234559333795975, + "grad_norm": 0.8071873784065247, + "learning_rate": 3.610148705118908e-05, + "loss": 0.8442, "step": 4170 }, { - "epoch": 1.5230965857221106, - "grad_norm": 1.2247403860092163, - "learning_rate": 8.140091225253797e-06, - "loss": 0.7955, + "epoch": 0.7236294240111034, + "grad_norm": 0.9665724635124207, + "learning_rate": 3.6097444670870136e-05, + "loss": 0.8152, "step": 4171 }, { - "epoch": 1.523461749132737, - "grad_norm": 1.4781579971313477, - "learning_rate": 8.128543461063523e-06, - "loss": 0.8055, + "epoch": 0.7238029146426093, + "grad_norm": 0.8418596982955933, + "learning_rate": 3.609340042242397e-05, + "loss": 0.7122, "step": 4172 }, { - "epoch": 1.5238269125433632, - "grad_norm": 1.0876411199569702, - "learning_rate": 8.11700180426126e-06, - "loss": 0.8153, + "epoch": 0.7239764052741152, + "grad_norm": 0.8701696395874023, + "learning_rate": 3.608935430631994e-05, + "loss": 0.6809, "step": 4173 }, { - "epoch": 1.5241920759539895, - "grad_norm": 1.2441809177398682, - "learning_rate": 8.105466260784733e-06, - "loss": 0.8495, + "epoch": 0.7241498959056211, + "grad_norm": 0.7750836610794067, + "learning_rate": 3.6085306323027596e-05, + "loss": 0.7595, "step": 4174 }, { - "epoch": 1.5245572393646156, - "grad_norm": 1.0092616081237793, - "learning_rate": 8.093936836568523e-06, - "loss": 0.8182, + "epoch": 0.724323386537127, + "grad_norm": 0.8153827786445618, + "learning_rate": 3.608125647301671e-05, + "loss": 0.675, "step": 4175 }, { - "epoch": 1.524922402775242, - "grad_norm": 1.133512020111084, - "learning_rate": 8.082413537544045e-06, - "loss": 0.7924, + "epoch": 0.7244968771686329, + "grad_norm": 0.613461971282959, + "learning_rate": 3.607720475675727e-05, + "loss": 0.7178, "step": 4176 }, { - "epoch": 1.525287566185868, - "grad_norm": 1.3034881353378296, - "learning_rate": 8.070896369639578e-06, - "loss": 0.8212, + "epoch": 0.7246703678001388, + "grad_norm": 0.6600214838981628, + "learning_rate": 3.607315117471948e-05, + "loss": 0.8093, "step": 4177 }, { - "epoch": 1.5256527295964943, - "grad_norm": 0.9621842503547668, - "learning_rate": 8.05938533878025e-06, - "loss": 0.8175, + "epoch": 0.7248438584316447, + "grad_norm": 0.8077488541603088, + "learning_rate": 3.606909572737378e-05, + "loss": 0.7507, "step": 4178 }, { - "epoch": 1.5260178930071207, - "grad_norm": 1.1325085163116455, - "learning_rate": 8.047880450888013e-06, - "loss": 0.8231, + "epoch": 0.7250173490631506, + "grad_norm": 1.0633569955825806, + "learning_rate": 3.6065038415190775e-05, + "loss": 0.8247, "step": 4179 }, { - "epoch": 1.526383056417747, - "grad_norm": 1.7151836156845093, - "learning_rate": 8.036381711881674e-06, - "loss": 0.7975, + "epoch": 0.7251908396946565, + "grad_norm": 0.8471539616584778, + "learning_rate": 3.6060979238641363e-05, + "loss": 0.769, "step": 4180 }, { - "epoch": 1.5267482198283733, - "grad_norm": 1.0249691009521484, - "learning_rate": 8.024889127676874e-06, - "loss": 0.8038, + "epoch": 0.7253643303261624, + "grad_norm": 0.6736499071121216, + "learning_rate": 3.605691819819659e-05, + "loss": 0.7233, "step": 4181 }, { - "epoch": 1.5271133832389996, - "grad_norm": 1.0218939781188965, - "learning_rate": 8.013402704186095e-06, - "loss": 0.8069, + "epoch": 0.7255378209576683, + "grad_norm": 0.8920660614967346, + "learning_rate": 3.6052855294327746e-05, + "loss": 0.7903, "step": 4182 }, { - "epoch": 1.5274785466496257, - "grad_norm": 0.9302506446838379, - "learning_rate": 8.001922447318624e-06, - "loss": 0.8405, + "epoch": 0.7257113115891742, + "grad_norm": 0.9682316780090332, + "learning_rate": 3.604879052750634e-05, + "loss": 0.6621, "step": 4183 }, { - "epoch": 1.527843710060252, - "grad_norm": 1.0445654392242432, - "learning_rate": 7.990448362980601e-06, - "loss": 0.8318, + "epoch": 0.7258848022206801, + "grad_norm": 0.9438630938529968, + "learning_rate": 3.604472389820409e-05, + "loss": 0.6963, "step": 4184 }, { - "epoch": 1.528208873470878, - "grad_norm": 1.024112582206726, - "learning_rate": 7.978980457074983e-06, - "loss": 0.8348, + "epoch": 0.7260582928521859, + "grad_norm": 0.7714619040489197, + "learning_rate": 3.604065540689295e-05, + "loss": 0.7598, "step": 4185 }, { - "epoch": 1.5285740368815044, - "grad_norm": 1.2349852323532104, - "learning_rate": 7.967518735501545e-06, - "loss": 0.7957, + "epoch": 0.7262317834836919, + "grad_norm": 1.4401975870132446, + "learning_rate": 3.6036585054045044e-05, + "loss": 0.709, "step": 4186 }, { - "epoch": 1.5289392002921307, - "grad_norm": 1.103870153427124, - "learning_rate": 7.956063204156892e-06, - "loss": 0.8146, + "epoch": 0.7264052741151977, + "grad_norm": 1.3234390020370483, + "learning_rate": 3.603251284013276e-05, + "loss": 0.7383, "step": 4187 }, { - "epoch": 1.529304363702757, - "grad_norm": 1.1702057123184204, - "learning_rate": 7.944613868934428e-06, - "loss": 0.7836, + "epoch": 0.7265787647467037, + "grad_norm": 0.7896291613578796, + "learning_rate": 3.602843876562868e-05, + "loss": 0.6409, "step": 4188 }, { - "epoch": 1.5296695271133833, - "grad_norm": 2.8380343914031982, - "learning_rate": 7.93317073572437e-06, - "loss": 0.8136, + "epoch": 0.7267522553782095, + "grad_norm": 0.8358627557754517, + "learning_rate": 3.602436283100561e-05, + "loss": 0.7703, "step": 4189 }, { - "epoch": 1.5300346905240096, - "grad_norm": 1.138486623764038, - "learning_rate": 7.921733810413754e-06, - "loss": 0.8247, + "epoch": 0.7269257460097155, + "grad_norm": 0.8157979846000671, + "learning_rate": 3.6020285036736554e-05, + "loss": 0.8594, "step": 4190 }, { - "epoch": 1.5303998539346357, - "grad_norm": 1.1046239137649536, - "learning_rate": 7.910303098886422e-06, - "loss": 0.8282, + "epoch": 0.7270992366412213, + "grad_norm": 1.1627576351165771, + "learning_rate": 3.601620538329476e-05, + "loss": 0.7366, "step": 4191 }, { - "epoch": 1.530765017345262, - "grad_norm": 0.8704138994216919, - "learning_rate": 7.898878607023024e-06, - "loss": 0.8041, + "epoch": 0.7272727272727273, + "grad_norm": 0.7465658187866211, + "learning_rate": 3.601212387115366e-05, + "loss": 0.7463, "step": 4192 }, { - "epoch": 1.5311301807558881, - "grad_norm": 1.4630558490753174, - "learning_rate": 7.887460340700988e-06, - "loss": 0.8656, + "epoch": 0.7274462179042331, + "grad_norm": 0.8549017310142517, + "learning_rate": 3.6008040500786926e-05, + "loss": 0.7351, "step": 4193 }, { - "epoch": 1.5314953441665144, - "grad_norm": 1.0675346851348877, - "learning_rate": 7.87604830579456e-06, - "loss": 0.7867, + "epoch": 0.7276197085357391, + "grad_norm": 0.8412423729896545, + "learning_rate": 3.6003955272668444e-05, + "loss": 0.8123, "step": 4194 }, { - "epoch": 1.5318605075771408, - "grad_norm": 0.898078978061676, - "learning_rate": 7.864642508174778e-06, - "loss": 0.7951, + "epoch": 0.7277931991672449, + "grad_norm": 0.9544307589530945, + "learning_rate": 3.599986818727231e-05, + "loss": 0.7402, "step": 4195 }, { - "epoch": 1.532225670987767, - "grad_norm": 1.244705319404602, - "learning_rate": 7.853242953709467e-06, - "loss": 0.8342, + "epoch": 0.7279666897987509, + "grad_norm": 0.928392767906189, + "learning_rate": 3.5995779245072816e-05, + "loss": 0.7454, "step": 4196 }, { - "epoch": 1.5325908343983934, - "grad_norm": 1.2306435108184814, - "learning_rate": 7.841849648263233e-06, - "loss": 0.8105, + "epoch": 0.7281401804302567, + "grad_norm": 0.7447654604911804, + "learning_rate": 3.599168844654451e-05, + "loss": 0.8955, "step": 4197 }, { - "epoch": 1.5329559978090197, - "grad_norm": 1.0501149892807007, - "learning_rate": 7.830462597697476e-06, - "loss": 0.8204, + "epoch": 0.7283136710617627, + "grad_norm": 0.7823044061660767, + "learning_rate": 3.5987595792162126e-05, + "loss": 0.6573, "step": 4198 }, { - "epoch": 1.5333211612196458, - "grad_norm": 1.0130101442337036, - "learning_rate": 7.819081807870383e-06, - "loss": 0.808, + "epoch": 0.7284871616932685, + "grad_norm": 1.0316659212112427, + "learning_rate": 3.5983501282400617e-05, + "loss": 0.844, "step": 4199 }, { - "epoch": 1.533686324630272, - "grad_norm": 0.811671257019043, - "learning_rate": 7.807707284636906e-06, - "loss": 0.8281, + "epoch": 0.7286606523247745, + "grad_norm": 0.96084064245224, + "learning_rate": 3.597940491773516e-05, + "loss": 0.7308, "step": 4200 }, { - "epoch": 1.5340514880408982, - "grad_norm": 0.9360955953598022, - "learning_rate": 7.796339033848797e-06, - "loss": 0.8491, + "epoch": 0.7288341429562804, + "grad_norm": 1.0136480331420898, + "learning_rate": 3.597530669864115e-05, + "loss": 0.8204, "step": 4201 }, { - "epoch": 1.5344166514515245, - "grad_norm": 0.7820254564285278, - "learning_rate": 7.784977061354548e-06, - "loss": 0.8529, + "epoch": 0.7290076335877863, + "grad_norm": 1.0834189653396606, + "learning_rate": 3.5971206625594176e-05, + "loss": 0.6765, "step": 4202 }, { - "epoch": 1.5347818148621508, - "grad_norm": 0.784467339515686, - "learning_rate": 7.773621372999437e-06, - "loss": 0.8224, + "epoch": 0.7291811242192922, + "grad_norm": 0.6816880702972412, + "learning_rate": 3.596710469907006e-05, + "loss": 0.8357, "step": 4203 }, { - "epoch": 1.5351469782727771, - "grad_norm": 0.8656306862831116, - "learning_rate": 7.762271974625516e-06, - "loss": 0.8336, + "epoch": 0.7293546148507981, + "grad_norm": 0.7355456948280334, + "learning_rate": 3.5963000919544844e-05, + "loss": 0.7604, "step": 4204 }, { - "epoch": 1.5355121416834034, - "grad_norm": 0.8525277376174927, - "learning_rate": 7.750928872071594e-06, - "loss": 0.8278, + "epoch": 0.729528105482304, + "grad_norm": 1.0635669231414795, + "learning_rate": 3.595889528749477e-05, + "loss": 0.8423, "step": 4205 }, { - "epoch": 1.5358773050940295, - "grad_norm": 0.9223628044128418, - "learning_rate": 7.73959207117325e-06, - "loss": 0.815, + "epoch": 0.7297015961138098, + "grad_norm": 0.8261038661003113, + "learning_rate": 3.59547878033963e-05, + "loss": 0.6849, "step": 4206 }, { - "epoch": 1.5362424685046558, - "grad_norm": 1.5185773372650146, - "learning_rate": 7.728261577762798e-06, - "loss": 0.7751, + "epoch": 0.7298750867453158, + "grad_norm": 0.8566262125968933, + "learning_rate": 3.595067846772612e-05, + "loss": 0.6831, "step": 4207 }, { - "epoch": 1.536607631915282, - "grad_norm": 1.3920658826828003, - "learning_rate": 7.716937397669333e-06, - "loss": 0.7917, + "epoch": 0.7300485773768216, + "grad_norm": 1.9557998180389404, + "learning_rate": 3.594656728096111e-05, + "loss": 0.7302, "step": 4208 }, { - "epoch": 1.5369727953259082, - "grad_norm": 1.04731285572052, - "learning_rate": 7.705619536718685e-06, - "loss": 0.8285, + "epoch": 0.7302220680083276, + "grad_norm": 0.7875620126724243, + "learning_rate": 3.594245424357839e-05, + "loss": 0.751, "step": 4209 }, { - "epoch": 1.5373379587365346, - "grad_norm": 0.8284503817558289, - "learning_rate": 7.694308000733443e-06, - "loss": 0.7666, + "epoch": 0.7303955586398334, + "grad_norm": 0.7147670984268188, + "learning_rate": 3.5938339356055274e-05, + "loss": 0.8392, "step": 4210 }, { - "epoch": 1.5377031221471609, - "grad_norm": 1.1600371599197388, - "learning_rate": 7.683002795532947e-06, - "loss": 0.8188, + "epoch": 0.7305690492713394, + "grad_norm": 0.8009697794914246, + "learning_rate": 3.593422261886931e-05, + "loss": 0.7396, "step": 4211 }, { - "epoch": 1.5380682855577872, - "grad_norm": 1.1038812398910522, - "learning_rate": 7.671703926933253e-06, - "loss": 0.8452, + "epoch": 0.7307425399028452, + "grad_norm": 1.092092514038086, + "learning_rate": 3.593010403249824e-05, + "loss": 0.7756, "step": 4212 }, { - "epoch": 1.5384334489684135, - "grad_norm": 1.0530604124069214, - "learning_rate": 7.660411400747188e-06, - "loss": 0.8237, + "epoch": 0.7309160305343512, + "grad_norm": 1.2354230880737305, + "learning_rate": 3.592598359742004e-05, + "loss": 0.7134, "step": 4213 }, { - "epoch": 1.5387986123790396, - "grad_norm": 1.262991189956665, - "learning_rate": 7.649125222784298e-06, - "loss": 0.8175, + "epoch": 0.731089521165857, + "grad_norm": 1.0563613176345825, + "learning_rate": 3.592186131411288e-05, + "loss": 0.709, "step": 4214 }, { - "epoch": 1.539163775789666, - "grad_norm": 1.1239620447158813, - "learning_rate": 7.637845398850879e-06, - "loss": 0.816, + "epoch": 0.731263011797363, + "grad_norm": 0.8943386673927307, + "learning_rate": 3.591773718305517e-05, + "loss": 0.9124, "step": 4215 }, { - "epoch": 1.539528939200292, - "grad_norm": 1.0000991821289062, - "learning_rate": 7.6265719347499376e-06, - "loss": 0.8333, + "epoch": 0.7314365024288688, + "grad_norm": 0.7086063623428345, + "learning_rate": 3.5913611204725496e-05, + "loss": 0.8181, "step": 4216 }, { - "epoch": 1.5398941026109183, - "grad_norm": 1.218320608139038, - "learning_rate": 7.6153048362812166e-06, - "loss": 0.8442, + "epoch": 0.7316099930603748, + "grad_norm": 0.7875753045082092, + "learning_rate": 3.590948337960271e-05, + "loss": 0.6554, "step": 4217 }, { - "epoch": 1.5402592660215446, - "grad_norm": 1.0781303644180298, - "learning_rate": 7.604044109241191e-06, - "loss": 0.8119, + "epoch": 0.7317834836918806, + "grad_norm": 0.7309883236885071, + "learning_rate": 3.590535370816584e-05, + "loss": 0.6354, "step": 4218 }, { - "epoch": 1.540624429432171, - "grad_norm": 0.8994264602661133, - "learning_rate": 7.592789759423049e-06, - "loss": 0.7854, + "epoch": 0.7319569743233866, + "grad_norm": 0.7606029510498047, + "learning_rate": 3.5901222190894136e-05, + "loss": 0.7878, "step": 4219 }, { - "epoch": 1.5409895928427972, - "grad_norm": 1.0980600118637085, - "learning_rate": 7.581541792616709e-06, - "loss": 0.8235, + "epoch": 0.7321304649548924, + "grad_norm": 1.0773366689682007, + "learning_rate": 3.589708882826707e-05, + "loss": 0.7949, "step": 4220 }, { - "epoch": 1.5413547562534236, - "grad_norm": 1.0180491209030151, - "learning_rate": 7.570300214608801e-06, - "loss": 0.8195, + "epoch": 0.7323039555863984, + "grad_norm": 0.887047529220581, + "learning_rate": 3.589295362076432e-05, + "loss": 0.7046, "step": 4221 }, { - "epoch": 1.5417199196640496, - "grad_norm": 1.170632243156433, - "learning_rate": 7.559065031182653e-06, - "loss": 0.7957, + "epoch": 0.7324774462179042, + "grad_norm": 0.8906805515289307, + "learning_rate": 3.588881656886578e-05, + "loss": 0.7214, "step": 4222 }, { - "epoch": 1.542085083074676, - "grad_norm": 1.3009498119354248, - "learning_rate": 7.547836248118321e-06, - "loss": 0.8073, + "epoch": 0.7326509368494102, + "grad_norm": 0.8993430733680725, + "learning_rate": 3.588467767305157e-05, + "loss": 0.7505, "step": 4223 }, { - "epoch": 1.542450246485302, - "grad_norm": 1.2317428588867188, - "learning_rate": 7.536613871192566e-06, - "loss": 0.8176, + "epoch": 0.732824427480916, + "grad_norm": 0.7440751791000366, + "learning_rate": 3.5880536933802e-05, + "loss": 0.79, "step": 4224 }, { - "epoch": 1.5428154098959284, - "grad_norm": 1.0331577062606812, - "learning_rate": 7.525397906178858e-06, - "loss": 0.7935, + "epoch": 0.7329979181124219, + "grad_norm": 0.6793804168701172, + "learning_rate": 3.587639435159762e-05, + "loss": 0.8369, "step": 4225 }, { - "epoch": 1.5431805733065547, - "grad_norm": 1.226067304611206, - "learning_rate": 7.514188358847345e-06, - "loss": 0.806, + "epoch": 0.7331714087439278, + "grad_norm": 0.9028554558753967, + "learning_rate": 3.587224992691917e-05, + "loss": 0.748, "step": 4226 }, { - "epoch": 1.543545736717181, - "grad_norm": 0.9489091634750366, - "learning_rate": 7.502985234964897e-06, - "loss": 0.7698, + "epoch": 0.7333448993754337, + "grad_norm": 0.7417241334915161, + "learning_rate": 3.586810366024763e-05, + "loss": 0.7272, "step": 4227 }, { - "epoch": 1.5439109001278073, - "grad_norm": 1.1840530633926392, - "learning_rate": 7.491788540295077e-06, - "loss": 0.8063, + "epoch": 0.7335183900069396, + "grad_norm": 1.7033075094223022, + "learning_rate": 3.586395555206417e-05, + "loss": 0.7075, "step": 4228 }, { - "epoch": 1.5442760635384336, - "grad_norm": 1.0798759460449219, - "learning_rate": 7.480598280598126e-06, - "loss": 0.7783, + "epoch": 0.7336918806384455, + "grad_norm": 0.9641558527946472, + "learning_rate": 3.585980560285017e-05, + "loss": 0.7539, "step": 4229 }, { - "epoch": 1.5446412269490597, - "grad_norm": 1.1688851118087769, - "learning_rate": 7.4694144616309835e-06, - "loss": 0.8267, + "epoch": 0.7338653712699514, + "grad_norm": 0.8385981321334839, + "learning_rate": 3.585565381308726e-05, + "loss": 0.7651, "step": 4230 }, { - "epoch": 1.545006390359686, - "grad_norm": 0.884789764881134, - "learning_rate": 7.458237089147289e-06, - "loss": 0.8067, + "epoch": 0.7340388619014573, + "grad_norm": 0.94377201795578, + "learning_rate": 3.5851500183257246e-05, + "loss": 0.7369, "step": 4231 }, { - "epoch": 1.545371553770312, - "grad_norm": 1.1926696300506592, - "learning_rate": 7.447066168897334e-06, - "loss": 0.8435, + "epoch": 0.7342123525329632, + "grad_norm": 0.8626048564910889, + "learning_rate": 3.584734471384217e-05, + "loss": 0.6588, "step": 4232 }, { - "epoch": 1.5457367171809384, - "grad_norm": 1.1990981101989746, - "learning_rate": 7.435901706628119e-06, - "loss": 0.8076, + "epoch": 0.7343858431644691, + "grad_norm": 0.8232465386390686, + "learning_rate": 3.5843187405324266e-05, + "loss": 0.7556, "step": 4233 }, { - "epoch": 1.5461018805915647, - "grad_norm": 1.1236292123794556, - "learning_rate": 7.424743708083308e-06, - "loss": 0.8499, + "epoch": 0.734559333795975, + "grad_norm": 0.9516615271568298, + "learning_rate": 3.5839028258186014e-05, + "loss": 0.6643, "step": 4234 }, { - "epoch": 1.546467044002191, - "grad_norm": 1.2768346071243286, - "learning_rate": 7.413592179003255e-06, - "loss": 0.7904, + "epoch": 0.7347328244274809, + "grad_norm": 0.8121882081031799, + "learning_rate": 3.583486727291007e-05, + "loss": 0.8223, "step": 4235 }, { - "epoch": 1.5468322074128174, - "grad_norm": 0.9599570631980896, - "learning_rate": 7.402447125124956e-06, - "loss": 0.8258, + "epoch": 0.7349063150589868, + "grad_norm": 1.1183699369430542, + "learning_rate": 3.583070444997932e-05, + "loss": 0.6302, "step": 4236 }, { - "epoch": 1.5471973708234434, - "grad_norm": 1.3588385581970215, - "learning_rate": 7.391308552182104e-06, - "loss": 0.8309, + "epoch": 0.7350798056904927, + "grad_norm": 0.9208543300628662, + "learning_rate": 3.5826539789876885e-05, + "loss": 0.8269, "step": 4237 }, { - "epoch": 1.5475625342340698, - "grad_norm": 1.0768319368362427, - "learning_rate": 7.380176465905047e-06, - "loss": 0.8328, + "epoch": 0.7352532963219987, + "grad_norm": 0.8757738471031189, + "learning_rate": 3.5822373293086055e-05, + "loss": 0.8135, "step": 4238 }, { - "epoch": 1.5479276976446958, - "grad_norm": 1.3383028507232666, - "learning_rate": 7.369050872020802e-06, - "loss": 0.8201, + "epoch": 0.7354267869535045, + "grad_norm": 1.028744101524353, + "learning_rate": 3.581820496009038e-05, + "loss": 0.7288, "step": 4239 }, { - "epoch": 1.5482928610553222, - "grad_norm": 0.9518313407897949, - "learning_rate": 7.357931776253027e-06, - "loss": 0.8217, + "epoch": 0.7356002775850105, + "grad_norm": 2.2424509525299072, + "learning_rate": 3.581403479137358e-05, + "loss": 0.6971, "step": 4240 }, { - "epoch": 1.5486580244659485, - "grad_norm": 0.7836207747459412, - "learning_rate": 7.346819184322067e-06, - "loss": 0.817, + "epoch": 0.7357737682165163, + "grad_norm": 0.8396310210227966, + "learning_rate": 3.580986278741961e-05, + "loss": 0.8875, "step": 4241 }, { - "epoch": 1.5490231878765748, - "grad_norm": 1.0258196592330933, - "learning_rate": 7.3357131019448906e-06, - "loss": 0.8001, + "epoch": 0.7359472588480223, + "grad_norm": 0.7625975012779236, + "learning_rate": 3.580568894871265e-05, + "loss": 0.8999, "step": 4242 }, { - "epoch": 1.549388351287201, - "grad_norm": 0.9492394924163818, - "learning_rate": 7.324613534835134e-06, - "loss": 0.7925, + "epoch": 0.7361207494795281, + "grad_norm": 0.8817811012268066, + "learning_rate": 3.580151327573707e-05, + "loss": 0.7637, "step": 4243 }, { - "epoch": 1.5497535146978274, - "grad_norm": 1.6985188722610474, - "learning_rate": 7.313520488703083e-06, - "loss": 0.8206, + "epoch": 0.7362942401110341, + "grad_norm": 0.9465728402137756, + "learning_rate": 3.579733576897746e-05, + "loss": 0.688, "step": 4244 }, { - "epoch": 1.5501186781084535, - "grad_norm": 1.0924746990203857, - "learning_rate": 7.3024339692556714e-06, - "loss": 0.8425, + "epoch": 0.7364677307425399, + "grad_norm": 0.8975119590759277, + "learning_rate": 3.579315642891862e-05, + "loss": 0.865, "step": 4245 }, { - "epoch": 1.5504838415190798, - "grad_norm": 0.8802509903907776, - "learning_rate": 7.291353982196454e-06, - "loss": 0.7688, + "epoch": 0.7366412213740458, + "grad_norm": 0.6972750425338745, + "learning_rate": 3.578897525604558e-05, + "loss": 0.9104, "step": 4246 }, { - "epoch": 1.550849004929706, - "grad_norm": 0.9517748951911926, - "learning_rate": 7.280280533225648e-06, - "loss": 0.8127, + "epoch": 0.7368147120055517, + "grad_norm": 1.3165876865386963, + "learning_rate": 3.5784792250843564e-05, + "loss": 0.7314, "step": 4247 }, { - "epoch": 1.5512141683403322, - "grad_norm": 1.0136562585830688, - "learning_rate": 7.269213628040095e-06, - "loss": 0.8245, + "epoch": 0.7369882026370576, + "grad_norm": 0.9945239424705505, + "learning_rate": 3.578060741379801e-05, + "loss": 0.7795, "step": 4248 }, { - "epoch": 1.5515793317509585, - "grad_norm": 0.879878044128418, - "learning_rate": 7.258153272333281e-06, - "loss": 0.8287, + "epoch": 0.7371616932685635, + "grad_norm": 0.8358196020126343, + "learning_rate": 3.5776420745394584e-05, + "loss": 0.7434, "step": 4249 }, { - "epoch": 1.5519444951615848, - "grad_norm": 1.0922677516937256, - "learning_rate": 7.247099471795307e-06, - "loss": 0.7835, + "epoch": 0.7373351839000694, + "grad_norm": 0.7596993446350098, + "learning_rate": 3.577223224611915e-05, + "loss": 0.8643, "step": 4250 }, { - "epoch": 1.5523096585722111, - "grad_norm": 0.9979987144470215, - "learning_rate": 7.236052232112912e-06, - "loss": 0.8057, + "epoch": 0.7375086745315753, + "grad_norm": 0.7174896597862244, + "learning_rate": 3.576804191645778e-05, + "loss": 0.7974, "step": 4251 }, { - "epoch": 1.5526748219828375, - "grad_norm": 1.256954550743103, - "learning_rate": 7.225011558969457e-06, - "loss": 0.8058, + "epoch": 0.7376821651630812, + "grad_norm": 0.9958018660545349, + "learning_rate": 3.576384975689677e-05, + "loss": 0.8152, "step": 4252 }, { - "epoch": 1.5530399853934636, - "grad_norm": 1.163932204246521, - "learning_rate": 7.213977458044925e-06, - "loss": 0.8217, + "epoch": 0.7378556557945871, + "grad_norm": 0.9593260884284973, + "learning_rate": 3.5759655767922624e-05, + "loss": 0.6765, "step": 4253 }, { - "epoch": 1.5534051488040899, - "grad_norm": 1.1109575033187866, - "learning_rate": 7.202949935015928e-06, - "loss": 0.8145, + "epoch": 0.738029146426093, + "grad_norm": 0.9977091550827026, + "learning_rate": 3.575545995002207e-05, + "loss": 0.6396, "step": 4254 }, { - "epoch": 1.553770312214716, - "grad_norm": 1.2055176496505737, - "learning_rate": 7.191928995555677e-06, - "loss": 0.8144, + "epoch": 0.7382026370575989, + "grad_norm": 1.6859333515167236, + "learning_rate": 3.5751262303682034e-05, + "loss": 0.6649, "step": 4255 }, { - "epoch": 1.5541354756253423, - "grad_norm": 1.0766215324401855, - "learning_rate": 7.180914645333994e-06, - "loss": 0.7845, + "epoch": 0.7383761276891048, + "grad_norm": 0.9630656838417053, + "learning_rate": 3.574706282938964e-05, + "loss": 0.8242, "step": 4256 }, { - "epoch": 1.5545006390359686, - "grad_norm": 1.372202754020691, - "learning_rate": 7.1699068900173286e-06, - "loss": 0.8118, + "epoch": 0.7385496183206107, + "grad_norm": 0.8310582637786865, + "learning_rate": 3.574286152763226e-05, + "loss": 0.8044, "step": 4257 }, { - "epoch": 1.554865802446595, - "grad_norm": 1.2036309242248535, - "learning_rate": 7.158905735268728e-06, - "loss": 0.7965, + "epoch": 0.7387231089521166, + "grad_norm": 0.9618940353393555, + "learning_rate": 3.573865839889746e-05, + "loss": 0.6993, "step": 4258 }, { - "epoch": 1.5552309658572212, - "grad_norm": 1.834342122077942, - "learning_rate": 7.147911186747853e-06, - "loss": 0.808, + "epoch": 0.7388965995836225, + "grad_norm": 0.8789200782775879, + "learning_rate": 3.573445344367302e-05, + "loss": 0.7803, "step": 4259 }, { - "epoch": 1.5555961292678475, - "grad_norm": 1.6173207759857178, - "learning_rate": 7.136923250110943e-06, - "loss": 0.7981, + "epoch": 0.7390700902151284, + "grad_norm": 0.6992388367652893, + "learning_rate": 3.5730246662446916e-05, + "loss": 0.8137, "step": 4260 }, { - "epoch": 1.5559612926784736, - "grad_norm": 1.0179721117019653, - "learning_rate": 7.125941931010858e-06, - "loss": 0.8069, + "epoch": 0.7392435808466343, + "grad_norm": 0.8071144819259644, + "learning_rate": 3.572603805570736e-05, + "loss": 0.8777, "step": 4261 }, { - "epoch": 1.5563264560891, - "grad_norm": 1.117740273475647, - "learning_rate": 7.114967235097046e-06, - "loss": 0.8374, + "epoch": 0.7394170714781402, + "grad_norm": 0.7780818343162537, + "learning_rate": 3.572182762394276e-05, + "loss": 0.7561, "step": 4262 }, { - "epoch": 1.556691619499726, - "grad_norm": 0.9989866018295288, - "learning_rate": 7.103999168015548e-06, - "loss": 0.8142, + "epoch": 0.7395905621096461, + "grad_norm": 0.825623095035553, + "learning_rate": 3.571761536764174e-05, + "loss": 0.8794, "step": 4263 }, { - "epoch": 1.5570567829103523, - "grad_norm": 1.336606502532959, - "learning_rate": 7.093037735408998e-06, - "loss": 0.854, + "epoch": 0.739764052741152, + "grad_norm": 1.3565974235534668, + "learning_rate": 3.571340128729315e-05, + "loss": 0.7559, "step": 4264 }, { - "epoch": 1.5574219463209786, - "grad_norm": 1.1876146793365479, - "learning_rate": 7.082082942916604e-06, - "loss": 0.8427, + "epoch": 0.7399375433726578, + "grad_norm": 0.7715621590614319, + "learning_rate": 3.5709185383386024e-05, + "loss": 0.7961, "step": 4265 }, { - "epoch": 1.557787109731605, - "grad_norm": 1.3910378217697144, - "learning_rate": 7.071134796174171e-06, - "loss": 0.7794, + "epoch": 0.7401110340041638, + "grad_norm": 0.7676424384117126, + "learning_rate": 3.570496765640964e-05, + "loss": 0.6543, "step": 4266 }, { - "epoch": 1.5581522731422313, - "grad_norm": 1.0858198404312134, - "learning_rate": 7.060193300814085e-06, - "loss": 0.8158, + "epoch": 0.7402845246356696, + "grad_norm": 0.9876802563667297, + "learning_rate": 3.570074810685345e-05, + "loss": 0.6317, "step": 4267 }, { - "epoch": 1.5585174365528573, - "grad_norm": 1.057889699935913, - "learning_rate": 7.049258462465307e-06, - "loss": 0.8169, + "epoch": 0.7404580152671756, + "grad_norm": 0.9290856122970581, + "learning_rate": 3.569652673520715e-05, + "loss": 0.7283, "step": 4268 }, { - "epoch": 1.5588825999634837, - "grad_norm": 0.816980242729187, - "learning_rate": 7.03833028675337e-06, - "loss": 0.8059, + "epoch": 0.7406315058986814, + "grad_norm": 0.97150057554245, + "learning_rate": 3.569230354196063e-05, + "loss": 0.6761, "step": 4269 }, { - "epoch": 1.5592477633741098, - "grad_norm": 0.9098889231681824, - "learning_rate": 7.027408779300375e-06, - "loss": 0.8081, + "epoch": 0.7408049965301874, + "grad_norm": 0.9783480763435364, + "learning_rate": 3.5688078527604e-05, + "loss": 0.8369, "step": 4270 }, { - "epoch": 1.559612926784736, - "grad_norm": 1.0527859926223755, - "learning_rate": 7.016493945725007e-06, - "loss": 0.8452, + "epoch": 0.7409784871616932, + "grad_norm": 0.8522402048110962, + "learning_rate": 3.568385169262758e-05, + "loss": 0.8179, "step": 4271 }, { - "epoch": 1.5599780901953624, - "grad_norm": 0.9489878416061401, - "learning_rate": 7.005585791642506e-06, - "loss": 0.8344, + "epoch": 0.7411519777931992, + "grad_norm": 1.241417407989502, + "learning_rate": 3.56796230375219e-05, + "loss": 0.6554, "step": 4272 }, { - "epoch": 1.5603432536059887, - "grad_norm": 1.2542532682418823, - "learning_rate": 6.994684322664682e-06, - "loss": 0.8588, + "epoch": 0.741325468424705, + "grad_norm": 1.0408235788345337, + "learning_rate": 3.567539256277769e-05, + "loss": 0.746, "step": 4273 }, { - "epoch": 1.560708417016615, - "grad_norm": 0.8651612997055054, - "learning_rate": 6.983789544399911e-06, - "loss": 0.8104, + "epoch": 0.741498959056211, + "grad_norm": 0.8021348714828491, + "learning_rate": 3.567116026888591e-05, + "loss": 0.8376, "step": 4274 }, { - "epoch": 1.5610735804272413, - "grad_norm": 1.0711215734481812, - "learning_rate": 6.9729014624531035e-06, - "loss": 0.8317, + "epoch": 0.7416724496877168, + "grad_norm": 1.0203137397766113, + "learning_rate": 3.566692615633771e-05, + "loss": 0.8064, "step": 4275 }, { - "epoch": 1.5614387438378674, - "grad_norm": 1.2938932180404663, - "learning_rate": 6.962020082425749e-06, - "loss": 0.845, + "epoch": 0.7418459403192228, + "grad_norm": 1.0188658237457275, + "learning_rate": 3.5662690225624484e-05, + "loss": 0.6925, "step": 4276 }, { - "epoch": 1.5618039072484937, - "grad_norm": 1.3753381967544556, - "learning_rate": 6.951145409915881e-06, - "loss": 0.8464, + "epoch": 0.7420194309507286, + "grad_norm": 0.9997870922088623, + "learning_rate": 3.56584524772378e-05, + "loss": 0.5836, "step": 4277 }, { - "epoch": 1.5621690706591198, - "grad_norm": 1.1286115646362305, - "learning_rate": 6.940277450518089e-06, - "loss": 0.8087, + "epoch": 0.7421929215822346, + "grad_norm": 0.9728227257728577, + "learning_rate": 3.565421291166946e-05, + "loss": 0.6259, "step": 4278 }, { - "epoch": 1.5625342340697461, - "grad_norm": 0.8481032848358154, - "learning_rate": 6.929416209823485e-06, - "loss": 0.7994, + "epoch": 0.7423664122137404, + "grad_norm": 0.9021795392036438, + "learning_rate": 3.564997152941148e-05, + "loss": 0.7794, "step": 4279 }, { - "epoch": 1.5628993974803724, - "grad_norm": 1.0390089750289917, - "learning_rate": 6.918561693419754e-06, - "loss": 0.7952, + "epoch": 0.7425399028452464, + "grad_norm": 0.6946586966514587, + "learning_rate": 3.5645728330956074e-05, + "loss": 0.8994, "step": 4280 }, { - "epoch": 1.5632645608909987, - "grad_norm": 0.967129647731781, - "learning_rate": 6.907713906891114e-06, - "loss": 0.7926, + "epoch": 0.7427133934767522, + "grad_norm": 0.8245841860771179, + "learning_rate": 3.564148331679565e-05, + "loss": 0.7659, "step": 4281 }, { - "epoch": 1.563629724301625, - "grad_norm": 1.0026472806930542, - "learning_rate": 6.896872855818298e-06, - "loss": 0.8328, + "epoch": 0.7428868841082582, + "grad_norm": 1.7269904613494873, + "learning_rate": 3.563723648742286e-05, + "loss": 0.8069, "step": 4282 }, { - "epoch": 1.5639948877122514, - "grad_norm": 1.0032398700714111, - "learning_rate": 6.886038545778611e-06, - "loss": 0.8236, + "epoch": 0.743060374739764, + "grad_norm": 1.0742179155349731, + "learning_rate": 3.563298784333056e-05, + "loss": 0.6768, "step": 4283 }, { - "epoch": 1.5643600511228775, - "grad_norm": 1.090318202972412, - "learning_rate": 6.875210982345855e-06, - "loss": 0.8408, + "epoch": 0.7432338653712699, + "grad_norm": 0.9660969376564026, + "learning_rate": 3.5628737385011814e-05, + "loss": 0.6982, "step": 4284 }, { - "epoch": 1.5647252145335038, - "grad_norm": 1.3907748460769653, - "learning_rate": 6.8643901710903825e-06, - "loss": 0.7963, + "epoch": 0.7434073560027759, + "grad_norm": 1.0108922719955444, + "learning_rate": 3.562448511295987e-05, + "loss": 0.8064, "step": 4285 }, { - "epoch": 1.5650903779441299, - "grad_norm": 0.9576075077056885, - "learning_rate": 6.8535761175790686e-06, - "loss": 0.8153, + "epoch": 0.7435808466342817, + "grad_norm": 0.873816192150116, + "learning_rate": 3.562023102766822e-05, + "loss": 0.8503, "step": 4286 }, { - "epoch": 1.5654555413547562, - "grad_norm": 1.0637818574905396, - "learning_rate": 6.842768827375308e-06, - "loss": 0.7743, + "epoch": 0.7437543372657877, + "grad_norm": 0.9142091870307922, + "learning_rate": 3.561597512963057e-05, + "loss": 0.692, "step": 4287 }, { - "epoch": 1.5658207047653825, - "grad_norm": 0.8485956192016602, - "learning_rate": 6.831968306039025e-06, - "loss": 0.7968, + "epoch": 0.7439278278972935, + "grad_norm": 0.9670136570930481, + "learning_rate": 3.561171741934081e-05, + "loss": 0.6471, "step": 4288 }, { - "epoch": 1.5661858681760088, - "grad_norm": 1.1013484001159668, - "learning_rate": 6.821174559126644e-06, - "loss": 0.7947, + "epoch": 0.7441013185287995, + "grad_norm": 0.8491453528404236, + "learning_rate": 3.560745789729304e-05, + "loss": 0.7512, "step": 4289 }, { - "epoch": 1.5665510315866351, - "grad_norm": 1.266189455986023, - "learning_rate": 6.8103875921911185e-06, - "loss": 0.7845, + "epoch": 0.7442748091603053, + "grad_norm": 0.83407062292099, + "learning_rate": 3.56031965639816e-05, + "loss": 0.7041, "step": 4290 }, { - "epoch": 1.5669161949972614, - "grad_norm": 1.3435524702072144, - "learning_rate": 6.79960741078191e-06, - "loss": 0.7758, + "epoch": 0.7444482997918113, + "grad_norm": 1.3474425077438354, + "learning_rate": 3.559893341990102e-05, + "loss": 0.8904, "step": 4291 }, { - "epoch": 1.5672813584078875, - "grad_norm": 1.01138174533844, - "learning_rate": 6.788834020444997e-06, - "loss": 0.8256, + "epoch": 0.7446217904233171, + "grad_norm": 3.457521915435791, + "learning_rate": 3.559466846554604e-05, + "loss": 0.8419, "step": 4292 }, { - "epoch": 1.5676465218185138, - "grad_norm": 1.2200738191604614, - "learning_rate": 6.778067426722841e-06, - "loss": 0.8334, + "epoch": 0.7447952810548231, + "grad_norm": 2.68611478805542, + "learning_rate": 3.559040170141161e-05, + "loss": 0.7751, "step": 4293 }, { - "epoch": 1.56801168522914, - "grad_norm": 1.1858364343643188, - "learning_rate": 6.767307635154432e-06, - "loss": 0.8628, + "epoch": 0.7449687716863289, + "grad_norm": 1.4606422185897827, + "learning_rate": 3.5586133127992904e-05, + "loss": 0.8064, "step": 4294 }, { - "epoch": 1.5683768486397662, - "grad_norm": 1.0893235206604004, - "learning_rate": 6.7565546512752575e-06, - "loss": 0.843, + "epoch": 0.7451422623178349, + "grad_norm": 0.9593561887741089, + "learning_rate": 3.558186274578527e-05, + "loss": 0.6968, "step": 4295 }, { - "epoch": 1.5687420120503925, - "grad_norm": 1.013452410697937, - "learning_rate": 6.74580848061728e-06, - "loss": 0.8496, + "epoch": 0.7453157529493407, + "grad_norm": 0.764473021030426, + "learning_rate": 3.557759055528433e-05, + "loss": 0.7117, "step": 4296 }, { - "epoch": 1.5691071754610189, - "grad_norm": 0.9623308777809143, - "learning_rate": 6.735069128708984e-06, - "loss": 0.8235, + "epoch": 0.7454892435808467, + "grad_norm": 1.2757244110107422, + "learning_rate": 3.5573316556985845e-05, + "loss": 0.7734, "step": 4297 }, { - "epoch": 1.5694723388716452, - "grad_norm": 1.437452793121338, - "learning_rate": 6.724336601075339e-06, - "loss": 0.814, + "epoch": 0.7456627342123525, + "grad_norm": 0.9681846499443054, + "learning_rate": 3.5569040751385825e-05, + "loss": 0.7979, "step": 4298 }, { - "epoch": 1.5698375022822715, - "grad_norm": 1.3125574588775635, - "learning_rate": 6.713610903237784e-06, - "loss": 0.8149, + "epoch": 0.7458362248438585, + "grad_norm": 0.8133819103240967, + "learning_rate": 3.556476313898048e-05, + "loss": 0.7427, "step": 4299 }, { - "epoch": 1.5702026656928976, - "grad_norm": 1.3052008152008057, - "learning_rate": 6.702892040714273e-06, - "loss": 0.8239, + "epoch": 0.7460097154753643, + "grad_norm": 0.9322758316993713, + "learning_rate": 3.556048372026625e-05, + "loss": 0.8511, "step": 4300 }, { - "epoch": 1.5705678291035237, - "grad_norm": 1.01767098903656, - "learning_rate": 6.692180019019226e-06, - "loss": 0.7742, + "epoch": 0.7461832061068703, + "grad_norm": 0.8947701454162598, + "learning_rate": 3.5556202495739736e-05, + "loss": 0.6569, "step": 4301 }, { - "epoch": 1.57093299251415, - "grad_norm": 1.0551096200942993, - "learning_rate": 6.681474843663556e-06, - "loss": 0.8582, + "epoch": 0.7463566967383761, + "grad_norm": 0.6529932022094727, + "learning_rate": 3.555191946589781e-05, + "loss": 0.7238, "step": 4302 }, { - "epoch": 1.5712981559247763, - "grad_norm": 0.9280059933662415, - "learning_rate": 6.670776520154634e-06, - "loss": 0.7936, + "epoch": 0.7465301873698821, + "grad_norm": 0.6616665720939636, + "learning_rate": 3.55476346312375e-05, + "loss": 0.719, "step": 4303 }, { - "epoch": 1.5716633193354026, - "grad_norm": 1.1273958683013916, - "learning_rate": 6.6600850539963215e-06, - "loss": 0.7908, + "epoch": 0.7467036780013879, + "grad_norm": 0.9642495512962341, + "learning_rate": 3.554334799225608e-05, + "loss": 0.6244, "step": 4304 }, { - "epoch": 1.572028482746029, - "grad_norm": 1.1085063219070435, - "learning_rate": 6.6494004506889545e-06, - "loss": 0.8008, + "epoch": 0.7468771686328938, + "grad_norm": 1.5717378854751587, + "learning_rate": 3.5539059549451e-05, + "loss": 0.7507, "step": 4305 }, { - "epoch": 1.5723936461566552, - "grad_norm": 0.9482899308204651, - "learning_rate": 6.638722715729327e-06, - "loss": 0.8287, + "epoch": 0.7470506592643997, + "grad_norm": 0.8565177917480469, + "learning_rate": 3.553476930331996e-05, + "loss": 0.8289, "step": 4306 }, { - "epoch": 1.5727588095672813, - "grad_norm": 1.2293729782104492, - "learning_rate": 6.628051854610715e-06, - "loss": 0.815, + "epoch": 0.7472241498959056, + "grad_norm": 0.7450271844863892, + "learning_rate": 3.553047725436085e-05, + "loss": 0.705, "step": 4307 }, { - "epoch": 1.5731239729779076, - "grad_norm": 1.1533304452896118, - "learning_rate": 6.617387872822842e-06, - "loss": 0.839, + "epoch": 0.7473976405274115, + "grad_norm": 0.8469099402427673, + "learning_rate": 3.5526183403071754e-05, + "loss": 0.6976, "step": 4308 }, { - "epoch": 1.5734891363885337, - "grad_norm": 0.9252516627311707, - "learning_rate": 6.606730775851891e-06, - "loss": 0.8407, + "epoch": 0.7475711311589174, + "grad_norm": 1.8467116355895996, + "learning_rate": 3.552188774995098e-05, + "loss": 0.8066, "step": 4309 }, { - "epoch": 1.57385429979916, - "grad_norm": 1.0522563457489014, - "learning_rate": 6.596080569180517e-06, - "loss": 0.7812, + "epoch": 0.7477446217904233, + "grad_norm": 0.8901004791259766, + "learning_rate": 3.551759029549705e-05, + "loss": 0.7681, "step": 4310 }, { - "epoch": 1.5742194632097863, - "grad_norm": 1.1410102844238281, - "learning_rate": 6.585437258287823e-06, - "loss": 0.812, + "epoch": 0.7479181124219292, + "grad_norm": 0.9781139492988586, + "learning_rate": 3.5513291040208674e-05, + "loss": 0.866, "step": 4311 }, { - "epoch": 1.5745846266204127, - "grad_norm": 1.1532783508300781, - "learning_rate": 6.574800848649374e-06, - "loss": 0.7679, + "epoch": 0.7480916030534351, + "grad_norm": 1.0195224285125732, + "learning_rate": 3.550898998458481e-05, + "loss": 0.7767, "step": 4312 }, { - "epoch": 1.574949790031039, - "grad_norm": 1.0394028425216675, - "learning_rate": 6.564171345737163e-06, - "loss": 0.8082, + "epoch": 0.748265093684941, + "grad_norm": 0.9443199634552002, + "learning_rate": 3.550468712912458e-05, + "loss": 0.6533, "step": 4313 }, { - "epoch": 1.5753149534416653, - "grad_norm": 1.4831945896148682, - "learning_rate": 6.553548755019648e-06, - "loss": 0.808, + "epoch": 0.7484385843164469, + "grad_norm": 0.7689177393913269, + "learning_rate": 3.550038247432734e-05, + "loss": 0.8286, "step": 4314 }, { - "epoch": 1.5756801168522914, - "grad_norm": 1.474453330039978, - "learning_rate": 6.542933081961724e-06, - "loss": 0.8351, + "epoch": 0.7486120749479528, + "grad_norm": 1.4414094686508179, + "learning_rate": 3.549607602069265e-05, + "loss": 0.6921, "step": 4315 }, { - "epoch": 1.5760452802629177, - "grad_norm": 0.9869124293327332, - "learning_rate": 6.532324332024733e-06, - "loss": 0.8181, + "epoch": 0.7487855655794587, + "grad_norm": 1.3660107851028442, + "learning_rate": 3.549176776872029e-05, + "loss": 0.7618, "step": 4316 }, { - "epoch": 1.5764104436735438, - "grad_norm": 1.7460864782333374, - "learning_rate": 6.521722510666457e-06, - "loss": 0.7946, + "epoch": 0.7489590562109646, + "grad_norm": 0.9471346735954285, + "learning_rate": 3.5487457718910226e-05, + "loss": 0.7112, "step": 4317 }, { - "epoch": 1.57677560708417, - "grad_norm": 1.778612732887268, - "learning_rate": 6.511127623341091e-06, - "loss": 0.813, + "epoch": 0.7491325468424705, + "grad_norm": 1.2779288291931152, + "learning_rate": 3.5483145871762646e-05, + "loss": 0.6897, "step": 4318 }, { - "epoch": 1.5771407704947964, - "grad_norm": 1.0033833980560303, - "learning_rate": 6.5005396754992885e-06, - "loss": 0.807, + "epoch": 0.7493060374739764, + "grad_norm": 0.7252038717269897, + "learning_rate": 3.5478832227777945e-05, + "loss": 0.7095, "step": 4319 }, { - "epoch": 1.5775059339054227, - "grad_norm": 1.1204307079315186, - "learning_rate": 6.4899586725881235e-06, - "loss": 0.8142, + "epoch": 0.7494795281054824, + "grad_norm": 0.8649352788925171, + "learning_rate": 3.547451678745673e-05, + "loss": 0.7299, "step": 4320 }, { - "epoch": 1.577871097316049, - "grad_norm": 1.1938536167144775, - "learning_rate": 6.479384620051103e-06, - "loss": 0.8029, + "epoch": 0.7496530187369882, + "grad_norm": 0.8754439353942871, + "learning_rate": 3.547019955129981e-05, + "loss": 0.6538, "step": 4321 }, { - "epoch": 1.5782362607266753, - "grad_norm": 1.2890982627868652, - "learning_rate": 6.468817523328148e-06, - "loss": 0.8138, + "epoch": 0.7498265093684942, + "grad_norm": 0.9896043539047241, + "learning_rate": 3.54658805198082e-05, + "loss": 0.7322, "step": 4322 }, { - "epoch": 1.5786014241373014, - "grad_norm": 1.1504931449890137, - "learning_rate": 6.4582573878555975e-06, - "loss": 0.834, + "epoch": 0.75, + "grad_norm": 0.7359915375709534, + "learning_rate": 3.546155969348315e-05, + "loss": 0.6931, "step": 4323 }, { - "epoch": 1.5789665875479277, - "grad_norm": 0.7661682963371277, - "learning_rate": 6.447704219066224e-06, - "loss": 0.7353, + "epoch": 0.7501734906315058, + "grad_norm": 0.7597223520278931, + "learning_rate": 3.545723707282606e-05, + "loss": 0.8723, "step": 4324 }, { - "epoch": 1.5793317509585538, - "grad_norm": 1.1475045680999756, - "learning_rate": 6.437158022389212e-06, - "loss": 0.8139, + "epoch": 0.7503469812630118, + "grad_norm": 1.3808608055114746, + "learning_rate": 3.5452912658338605e-05, + "loss": 0.6447, "step": 4325 }, { - "epoch": 1.5796969143691801, - "grad_norm": 1.0185563564300537, - "learning_rate": 6.4266188032501595e-06, - "loss": 0.8251, + "epoch": 0.7505204718945176, + "grad_norm": 0.8837897777557373, + "learning_rate": 3.5448586450522635e-05, + "loss": 0.7806, "step": 4326 }, { - "epoch": 1.5800620777798065, - "grad_norm": 1.1343326568603516, - "learning_rate": 6.4160865670710605e-06, - "loss": 0.8113, + "epoch": 0.7506939625260236, + "grad_norm": 0.976927638053894, + "learning_rate": 3.5444258449880205e-05, + "loss": 0.6123, "step": 4327 }, { - "epoch": 1.5804272411904328, - "grad_norm": 1.0993815660476685, - "learning_rate": 6.405561319270335e-06, - "loss": 0.7986, + "epoch": 0.7508674531575295, + "grad_norm": 1.1657615900039673, + "learning_rate": 3.5439928656913586e-05, + "loss": 0.6343, "step": 4328 }, { - "epoch": 1.580792404601059, - "grad_norm": 2.0873429775238037, - "learning_rate": 6.395043065262798e-06, - "loss": 0.7787, + "epoch": 0.7510409437890354, + "grad_norm": 1.442787766456604, + "learning_rate": 3.543559707212525e-05, + "loss": 0.7142, "step": 4329 }, { - "epoch": 1.5811575680116854, - "grad_norm": 1.6233267784118652, - "learning_rate": 6.384531810459673e-06, - "loss": 0.8485, + "epoch": 0.7512144344205413, + "grad_norm": 0.8507841229438782, + "learning_rate": 3.543126369601789e-05, + "loss": 0.8007, "step": 4330 }, { - "epoch": 1.5815227314223115, - "grad_norm": 1.029131531715393, - "learning_rate": 6.3740275602685835e-06, - "loss": 0.8073, + "epoch": 0.7513879250520472, + "grad_norm": 1.005346417427063, + "learning_rate": 3.54269285290944e-05, + "loss": 0.8245, "step": 4331 }, { - "epoch": 1.5818878948329378, - "grad_norm": 0.8874304890632629, - "learning_rate": 6.363530320093529e-06, - "loss": 0.7852, + "epoch": 0.7515614156835531, + "grad_norm": 7.214276313781738, + "learning_rate": 3.542259157185787e-05, + "loss": 0.8008, "step": 4332 }, { - "epoch": 1.5822530582435639, - "grad_norm": 1.1994154453277588, - "learning_rate": 6.353040095334931e-06, - "loss": 0.8007, + "epoch": 0.751734906315059, + "grad_norm": 0.9966731071472168, + "learning_rate": 3.541825282481162e-05, + "loss": 0.7415, "step": 4333 }, { - "epoch": 1.5826182216541902, - "grad_norm": 1.114932656288147, - "learning_rate": 6.342556891389582e-06, - "loss": 0.8177, + "epoch": 0.7519083969465649, + "grad_norm": 0.9736891388893127, + "learning_rate": 3.5413912288459174e-05, + "loss": 0.6327, "step": 4334 }, { - "epoch": 1.5829833850648165, - "grad_norm": 1.0690252780914307, - "learning_rate": 6.332080713650684e-06, - "loss": 0.7825, + "epoch": 0.7520818875780708, + "grad_norm": 0.8952970504760742, + "learning_rate": 3.540956996330424e-05, + "loss": 0.6934, "step": 4335 }, { - "epoch": 1.5833485484754428, - "grad_norm": 1.0347954034805298, - "learning_rate": 6.321611567507795e-06, - "loss": 0.8376, + "epoch": 0.7522553782095767, + "grad_norm": 0.969652533531189, + "learning_rate": 3.5405225849850754e-05, + "loss": 0.7418, "step": 4336 }, { - "epoch": 1.5837137118860691, - "grad_norm": 0.9802241921424866, - "learning_rate": 6.31114945834687e-06, - "loss": 0.7838, + "epoch": 0.7524288688410826, + "grad_norm": 0.9006686806678772, + "learning_rate": 3.5400879948602854e-05, + "loss": 0.7544, "step": 4337 }, { - "epoch": 1.5840788752966952, - "grad_norm": 0.9400674700737, - "learning_rate": 6.3006943915502506e-06, - "loss": 0.8225, + "epoch": 0.7526023594725885, + "grad_norm": 4.152871131896973, + "learning_rate": 3.53965322600649e-05, + "loss": 0.7444, "step": 4338 }, { - "epoch": 1.5844440387073215, - "grad_norm": 1.4022554159164429, - "learning_rate": 6.290246372496646e-06, - "loss": 0.8352, + "epoch": 0.7527758501040944, + "grad_norm": 0.9015775322914124, + "learning_rate": 3.539218278474143e-05, + "loss": 0.8159, "step": 4339 }, { - "epoch": 1.5848092021179476, - "grad_norm": 1.2020740509033203, - "learning_rate": 6.279805406561146e-06, - "loss": 0.7835, + "epoch": 0.7529493407356003, + "grad_norm": 0.8637308478355408, + "learning_rate": 3.5387831523137216e-05, + "loss": 0.7751, "step": 4340 }, { - "epoch": 1.585174365528574, - "grad_norm": 0.8537296652793884, - "learning_rate": 6.269371499115213e-06, - "loss": 0.8257, + "epoch": 0.7531228313671062, + "grad_norm": 0.7733387351036072, + "learning_rate": 3.538347847575722e-05, + "loss": 0.738, "step": 4341 }, { - "epoch": 1.5855395289392002, - "grad_norm": 1.5112252235412598, - "learning_rate": 6.258944655526662e-06, - "loss": 0.793, + "epoch": 0.7532963219986121, + "grad_norm": 0.8239704966545105, + "learning_rate": 3.5379123643106625e-05, + "loss": 0.6938, "step": 4342 }, { - "epoch": 1.5859046923498266, - "grad_norm": 1.2092258930206299, - "learning_rate": 6.24852488115969e-06, - "loss": 0.8019, + "epoch": 0.7534698126301179, + "grad_norm": 0.8544909954071045, + "learning_rate": 3.537476702569081e-05, + "loss": 0.6787, "step": 4343 }, { - "epoch": 1.5862698557604529, - "grad_norm": 1.0907233953475952, - "learning_rate": 6.238112181374856e-06, - "loss": 0.8285, + "epoch": 0.7536433032616239, + "grad_norm": 1.1855380535125732, + "learning_rate": 3.5370408624015364e-05, + "loss": 0.7646, "step": 4344 }, { - "epoch": 1.5866350191710792, - "grad_norm": 1.2837003469467163, - "learning_rate": 6.227706561529079e-06, - "loss": 0.8298, + "epoch": 0.7538167938931297, + "grad_norm": 0.963719367980957, + "learning_rate": 3.536604843858609e-05, + "loss": 0.7651, "step": 4345 }, { - "epoch": 1.5870001825817053, - "grad_norm": 1.0324846506118774, - "learning_rate": 6.217308026975623e-06, - "loss": 0.8015, + "epoch": 0.7539902845246357, + "grad_norm": 0.8535280823707581, + "learning_rate": 3.536168646990899e-05, + "loss": 0.8604, "step": 4346 }, { - "epoch": 1.5873653459923316, - "grad_norm": 0.9266912937164307, - "learning_rate": 6.206916583064124e-06, - "loss": 0.8165, + "epoch": 0.7541637751561415, + "grad_norm": 1.1439218521118164, + "learning_rate": 3.535732271849028e-05, + "loss": 0.7427, "step": 4347 }, { - "epoch": 1.5877305094029577, - "grad_norm": 1.0453143119812012, - "learning_rate": 6.196532235140564e-06, - "loss": 0.8351, + "epoch": 0.7543372657876475, + "grad_norm": 1.4610395431518555, + "learning_rate": 3.535295718483636e-05, + "loss": 0.6321, "step": 4348 }, { - "epoch": 1.588095672813584, - "grad_norm": 1.5646964311599731, - "learning_rate": 6.186154988547266e-06, - "loss": 0.8175, + "epoch": 0.7545107564191533, + "grad_norm": 0.9337558150291443, + "learning_rate": 3.5348589869453874e-05, + "loss": 0.6852, "step": 4349 }, { - "epoch": 1.5884608362242103, - "grad_norm": 1.072772741317749, - "learning_rate": 6.175784848622913e-06, - "loss": 0.8448, + "epoch": 0.7546842470506593, + "grad_norm": 0.8542115092277527, + "learning_rate": 3.5344220772849654e-05, + "loss": 0.8584, "step": 4350 }, { - "epoch": 1.5888259996348366, - "grad_norm": 1.0466597080230713, - "learning_rate": 6.1654218207025285e-06, - "loss": 0.8129, + "epoch": 0.7548577376821651, + "grad_norm": 0.7556518912315369, + "learning_rate": 3.533984989553073e-05, + "loss": 0.8132, "step": 4351 }, { - "epoch": 1.589191163045463, - "grad_norm": 0.8577800989151001, - "learning_rate": 6.155065910117464e-06, - "loss": 0.8253, + "epoch": 0.7550312283136711, + "grad_norm": 0.9514586329460144, + "learning_rate": 3.533547723800435e-05, + "loss": 0.8353, "step": 4352 }, { - "epoch": 1.5895563264560892, - "grad_norm": 1.349931240081787, - "learning_rate": 6.144717122195425e-06, - "loss": 0.8053, + "epoch": 0.7552047189451769, + "grad_norm": 0.8876003623008728, + "learning_rate": 3.533110280077797e-05, + "loss": 0.606, "step": 4353 }, { - "epoch": 1.5899214898667153, - "grad_norm": 1.1004916429519653, - "learning_rate": 6.134375462260449e-06, - "loss": 0.8212, + "epoch": 0.7553782095766829, + "grad_norm": 1.052347183227539, + "learning_rate": 3.532672658435925e-05, + "loss": 0.6743, "step": 4354 }, { - "epoch": 1.5902866532773416, - "grad_norm": 1.2642276287078857, - "learning_rate": 6.124040935632913e-06, - "loss": 0.8048, + "epoch": 0.7555517002081887, + "grad_norm": 1.1470117568969727, + "learning_rate": 3.5322348589256044e-05, + "loss": 0.7084, "step": 4355 }, { - "epoch": 1.5906518166879677, - "grad_norm": 1.3392140865325928, - "learning_rate": 6.113713547629501e-06, - "loss": 0.8522, + "epoch": 0.7557251908396947, + "grad_norm": 1.049479365348816, + "learning_rate": 3.531796881597643e-05, + "loss": 0.6616, "step": 4356 }, { - "epoch": 1.591016980098594, - "grad_norm": 1.1274559497833252, - "learning_rate": 6.103393303563245e-06, - "loss": 0.809, + "epoch": 0.7558986814712005, + "grad_norm": 1.4789034128189087, + "learning_rate": 3.5313587265028686e-05, + "loss": 0.6183, "step": 4357 }, { - "epoch": 1.5913821435092204, - "grad_norm": 1.1746021509170532, - "learning_rate": 6.0930802087435005e-06, - "loss": 0.7592, + "epoch": 0.7560721721027065, + "grad_norm": 0.8428752422332764, + "learning_rate": 3.53092039369213e-05, + "loss": 0.6838, "step": 4358 }, { - "epoch": 1.5917473069198467, - "grad_norm": 1.1739888191223145, - "learning_rate": 6.0827742684759375e-06, - "loss": 0.8063, + "epoch": 0.7562456627342123, + "grad_norm": 0.7610135674476624, + "learning_rate": 3.5304818832162956e-05, + "loss": 0.6819, "step": 4359 }, { - "epoch": 1.592112470330473, - "grad_norm": 0.9006239175796509, - "learning_rate": 6.072475488062557e-06, - "loss": 0.8413, + "epoch": 0.7564191533657183, + "grad_norm": 0.9645675420761108, + "learning_rate": 3.530043195126255e-05, + "loss": 0.7124, "step": 4360 }, { - "epoch": 1.5924776337410993, - "grad_norm": 1.0514041185379028, - "learning_rate": 6.062183872801662e-06, - "loss": 0.7807, + "epoch": 0.7565926439972241, + "grad_norm": 1.1461029052734375, + "learning_rate": 3.529604329472919e-05, + "loss": 0.7605, "step": 4361 }, { - "epoch": 1.5928427971517254, - "grad_norm": 1.6152596473693848, - "learning_rate": 6.051899427987866e-06, - "loss": 0.7784, + "epoch": 0.7567661346287301, + "grad_norm": 1.2277296781539917, + "learning_rate": 3.529165286307219e-05, + "loss": 0.8054, "step": 4362 }, { - "epoch": 1.5932079605623517, - "grad_norm": 1.0673727989196777, - "learning_rate": 6.041622158912113e-06, - "loss": 0.8099, + "epoch": 0.756939625260236, + "grad_norm": 0.8763591647148132, + "learning_rate": 3.5287260656801044e-05, + "loss": 0.8425, "step": 4363 }, { - "epoch": 1.5935731239729778, - "grad_norm": 1.110504388809204, - "learning_rate": 6.031352070861645e-06, - "loss": 0.7589, + "epoch": 0.7571131158917418, + "grad_norm": 0.8977773785591125, + "learning_rate": 3.528286667642549e-05, + "loss": 0.7279, "step": 4364 }, { - "epoch": 1.593938287383604, - "grad_norm": 1.2639414072036743, - "learning_rate": 6.021089169120013e-06, - "loss": 0.8078, + "epoch": 0.7572866065232478, + "grad_norm": 1.0455257892608643, + "learning_rate": 3.5278470922455453e-05, + "loss": 0.7078, "step": 4365 }, { - "epoch": 1.5943034507942304, - "grad_norm": 0.8172780871391296, - "learning_rate": 6.010833458967063e-06, - "loss": 0.8109, + "epoch": 0.7574600971547536, + "grad_norm": 2.1409990787506104, + "learning_rate": 3.527407339540106e-05, + "loss": 0.9224, "step": 4366 }, { - "epoch": 1.5946686142048567, - "grad_norm": 1.0855575799942017, - "learning_rate": 6.000584945678944e-06, - "loss": 0.8179, + "epoch": 0.7576335877862596, + "grad_norm": 0.7846871614456177, + "learning_rate": 3.5269674095772654e-05, + "loss": 0.7397, "step": 4367 }, { - "epoch": 1.595033777615483, - "grad_norm": 0.8283522725105286, - "learning_rate": 5.99034363452811e-06, - "loss": 0.8358, + "epoch": 0.7578070784177654, + "grad_norm": 1.1696999073028564, + "learning_rate": 3.5265273024080776e-05, + "loss": 0.6848, "step": 4368 }, { - "epoch": 1.5953989410261091, - "grad_norm": 0.8005900979042053, - "learning_rate": 5.980109530783311e-06, - "loss": 0.8, + "epoch": 0.7579805690492714, + "grad_norm": 0.8066532015800476, + "learning_rate": 3.526087018083617e-05, + "loss": 0.7869, "step": 4369 }, { - "epoch": 1.5957641044367354, - "grad_norm": 1.194327712059021, - "learning_rate": 5.9698826397095676e-06, - "loss": 0.8239, + "epoch": 0.7581540596807772, + "grad_norm": 0.7241030931472778, + "learning_rate": 3.52564655665498e-05, + "loss": 0.9033, "step": 4370 }, { - "epoch": 1.5961292678473615, - "grad_norm": 1.3567266464233398, - "learning_rate": 5.959662966568214e-06, - "loss": 0.8441, + "epoch": 0.7583275503122832, + "grad_norm": 1.2239329814910889, + "learning_rate": 3.525205918173283e-05, + "loss": 0.7551, "step": 4371 }, { - "epoch": 1.5964944312579878, - "grad_norm": 1.7000679969787598, - "learning_rate": 5.949450516616859e-06, - "loss": 0.7877, + "epoch": 0.758501040943789, + "grad_norm": 0.8060351014137268, + "learning_rate": 3.524765102689662e-05, + "loss": 0.6637, "step": 4372 }, { - "epoch": 1.5968595946686142, - "grad_norm": 1.2285451889038086, - "learning_rate": 5.939245295109401e-06, - "loss": 0.8014, + "epoch": 0.758674531575295, + "grad_norm": 1.711591362953186, + "learning_rate": 3.524324110255273e-05, + "loss": 0.6553, "step": 4373 }, { - "epoch": 1.5972247580792405, - "grad_norm": 1.066339135169983, - "learning_rate": 5.929047307296023e-06, - "loss": 0.8112, + "epoch": 0.7588480222068008, + "grad_norm": 1.1077489852905273, + "learning_rate": 3.523882940921296e-05, + "loss": 0.6343, "step": 4374 }, { - "epoch": 1.5975899214898668, - "grad_norm": 0.7674596309661865, - "learning_rate": 5.918856558423171e-06, - "loss": 0.8123, + "epoch": 0.7590215128383068, + "grad_norm": 0.8500475883483887, + "learning_rate": 3.523441594738927e-05, + "loss": 0.6526, "step": 4375 }, { - "epoch": 1.597955084900493, - "grad_norm": 1.0824549198150635, - "learning_rate": 5.908673053733573e-06, - "loss": 0.8129, + "epoch": 0.7591950034698126, + "grad_norm": 1.011919617652893, + "learning_rate": 3.523000071759387e-05, + "loss": 0.7478, "step": 4376 }, { - "epoch": 1.5983202483111192, - "grad_norm": 2.315363883972168, - "learning_rate": 5.89849679846624e-06, - "loss": 0.8312, + "epoch": 0.7593684941013186, + "grad_norm": 1.5901223421096802, + "learning_rate": 3.522558372033912e-05, + "loss": 0.8799, "step": 4377 }, { - "epoch": 1.5986854117217455, - "grad_norm": 1.4630100727081299, - "learning_rate": 5.8883277978564434e-06, - "loss": 0.8639, + "epoch": 0.7595419847328244, + "grad_norm": 1.1270251274108887, + "learning_rate": 3.522116495613766e-05, + "loss": 0.748, "step": 4378 }, { - "epoch": 1.5990505751323716, - "grad_norm": 1.868547797203064, - "learning_rate": 5.878166057135737e-06, - "loss": 0.8594, + "epoch": 0.7597154753643304, + "grad_norm": 1.6275814771652222, + "learning_rate": 3.521674442550226e-05, + "loss": 0.7452, "step": 4379 }, { - "epoch": 1.599415738542998, - "grad_norm": 0.9213641285896301, - "learning_rate": 5.868011581531914e-06, - "loss": 0.7943, + "epoch": 0.7598889659958362, + "grad_norm": 0.9776718020439148, + "learning_rate": 3.521232212894594e-05, + "loss": 0.7671, "step": 4380 }, { - "epoch": 1.5997809019536242, - "grad_norm": 1.603021264076233, - "learning_rate": 5.857864376269051e-06, - "loss": 0.7871, + "epoch": 0.7600624566273422, + "grad_norm": 0.8217899203300476, + "learning_rate": 3.520789806698191e-05, + "loss": 0.8953, "step": 4381 }, { - "epoch": 1.6001460653642505, - "grad_norm": 1.1616698503494263, - "learning_rate": 5.847724446567477e-06, - "loss": 0.7943, + "epoch": 0.760235947258848, + "grad_norm": 0.7588617205619812, + "learning_rate": 3.5203472240123594e-05, + "loss": 0.8923, "step": 4382 }, { - "epoch": 1.6005112287748768, - "grad_norm": 1.1344735622406006, - "learning_rate": 5.83759179764378e-06, - "loss": 0.7968, + "epoch": 0.7604094378903539, + "grad_norm": 1.2253409624099731, + "learning_rate": 3.5199044648884605e-05, + "loss": 0.7719, "step": 4383 }, { - "epoch": 1.6008763921855031, - "grad_norm": 0.9122528433799744, - "learning_rate": 5.8274664347108086e-06, - "loss": 0.8403, + "epoch": 0.7605829285218598, + "grad_norm": 0.7918983101844788, + "learning_rate": 3.519461529377877e-05, + "loss": 0.7051, "step": 4384 }, { - "epoch": 1.6012415555961292, - "grad_norm": 0.9776244163513184, - "learning_rate": 5.817348362977642e-06, - "loss": 0.8348, + "epoch": 0.7607564191533657, + "grad_norm": 0.8577303886413574, + "learning_rate": 3.519018417532013e-05, + "loss": 0.7764, "step": 4385 }, { - "epoch": 1.6016067190067556, - "grad_norm": 1.4099971055984497, - "learning_rate": 5.807237587649625e-06, - "loss": 0.8344, + "epoch": 0.7609299097848716, + "grad_norm": 1.308056116104126, + "learning_rate": 3.5185751294022914e-05, + "loss": 0.7676, "step": 4386 }, { - "epoch": 1.6019718824173816, - "grad_norm": 1.0371805429458618, - "learning_rate": 5.7971341139283535e-06, - "loss": 0.8239, + "epoch": 0.7611034004163775, + "grad_norm": 1.0214859247207642, + "learning_rate": 3.518131665040157e-05, + "loss": 0.6897, "step": 4387 }, { - "epoch": 1.602337045828008, - "grad_norm": 1.2408367395401, - "learning_rate": 5.787037947011658e-06, - "loss": 0.7962, + "epoch": 0.7612768910478834, + "grad_norm": 0.79840487241745, + "learning_rate": 3.5176880244970735e-05, + "loss": 0.6653, "step": 4388 }, { - "epoch": 1.6027022092386343, - "grad_norm": 1.0422794818878174, - "learning_rate": 5.7769490920936085e-06, - "loss": 0.7854, + "epoch": 0.7614503816793893, + "grad_norm": 2.44914174079895, + "learning_rate": 3.517244207824526e-05, + "loss": 0.8577, "step": 4389 }, { - "epoch": 1.6030673726492606, - "grad_norm": 1.372788906097412, - "learning_rate": 5.76686755436451e-06, - "loss": 0.7792, + "epoch": 0.7616238723108952, + "grad_norm": 0.9639493823051453, + "learning_rate": 3.516800215074021e-05, + "loss": 0.6564, "step": 4390 }, { - "epoch": 1.603432536059887, - "grad_norm": 1.2325608730316162, - "learning_rate": 5.756793339010915e-06, - "loss": 0.8254, + "epoch": 0.7617973629424011, + "grad_norm": 0.9847102165222168, + "learning_rate": 3.516356046297083e-05, + "loss": 0.8755, "step": 4391 }, { - "epoch": 1.6037976994705132, - "grad_norm": 1.051535725593567, - "learning_rate": 5.746726451215603e-06, - "loss": 0.8027, + "epoch": 0.761970853573907, + "grad_norm": 1.6155734062194824, + "learning_rate": 3.515911701545259e-05, + "loss": 0.7814, "step": 4392 }, { - "epoch": 1.6041628628811393, - "grad_norm": 0.9803805351257324, - "learning_rate": 5.7366668961575835e-06, - "loss": 0.7997, + "epoch": 0.7621443442054129, + "grad_norm": 1.5140767097473145, + "learning_rate": 3.515467180870116e-05, + "loss": 0.7568, "step": 4393 }, { - "epoch": 1.6045280262917656, - "grad_norm": 1.279620885848999, - "learning_rate": 5.726614679012099e-06, - "loss": 0.8376, + "epoch": 0.7623178348369188, + "grad_norm": 1.0687850713729858, + "learning_rate": 3.5150224843232405e-05, + "loss": 0.6642, "step": 4394 }, { - "epoch": 1.6048931897023917, - "grad_norm": 0.9331808686256409, - "learning_rate": 5.716569804950603e-06, - "loss": 0.8007, + "epoch": 0.7624913254684247, + "grad_norm": 4.490041732788086, + "learning_rate": 3.51457761195624e-05, + "loss": 0.6967, "step": 4395 }, { - "epoch": 1.605258353113018, - "grad_norm": 1.1361150741577148, - "learning_rate": 5.706532279140782e-06, - "loss": 0.7985, + "epoch": 0.7626648160999306, + "grad_norm": 0.9783996343612671, + "learning_rate": 3.514132563820744e-05, + "loss": 0.7461, "step": 4396 }, { - "epoch": 1.6056235165236443, - "grad_norm": 0.9480416774749756, - "learning_rate": 5.696502106746542e-06, - "loss": 0.822, + "epoch": 0.7628383067314365, + "grad_norm": 0.8389992713928223, + "learning_rate": 3.513687339968399e-05, + "loss": 0.7784, "step": 4397 }, { - "epoch": 1.6059886799342706, - "grad_norm": 1.2463479042053223, - "learning_rate": 5.686479292928011e-06, - "loss": 0.8131, + "epoch": 0.7630117973629424, + "grad_norm": 1.0125219821929932, + "learning_rate": 3.513241940450874e-05, + "loss": 0.7554, "step": 4398 }, { - "epoch": 1.606353843344897, - "grad_norm": 1.3375215530395508, - "learning_rate": 5.676463842841513e-06, - "loss": 0.8221, + "epoch": 0.7631852879944483, + "grad_norm": 0.9197133183479309, + "learning_rate": 3.5127963653198583e-05, + "loss": 0.8254, "step": 4399 }, { - "epoch": 1.6067190067555233, - "grad_norm": 1.2921079397201538, - "learning_rate": 5.666455761639598e-06, - "loss": 0.7954, + "epoch": 0.7633587786259542, + "grad_norm": 0.8506860733032227, + "learning_rate": 3.512350614627062e-05, + "loss": 0.7177, "step": 4400 }, { - "epoch": 1.6070841701661494, - "grad_norm": 1.2101329565048218, - "learning_rate": 5.656455054471026e-06, - "loss": 0.8005, + "epoch": 0.7635322692574601, + "grad_norm": 0.9811919331550598, + "learning_rate": 3.511904688424215e-05, + "loss": 0.8511, "step": 4401 }, { - "epoch": 1.6074493335767754, - "grad_norm": 1.0295536518096924, - "learning_rate": 5.646461726480763e-06, - "loss": 0.7928, + "epoch": 0.7637057598889659, + "grad_norm": 2.218024730682373, + "learning_rate": 3.511458586763067e-05, + "loss": 0.6458, "step": 4402 }, { - "epoch": 1.6078144969874018, - "grad_norm": 1.5723223686218262, - "learning_rate": 5.636475782809972e-06, - "loss": 0.8556, + "epoch": 0.7638792505204719, + "grad_norm": 1.0824449062347412, + "learning_rate": 3.511012309695389e-05, + "loss": 0.7312, "step": 4403 }, { - "epoch": 1.608179660398028, - "grad_norm": 1.5556474924087524, - "learning_rate": 5.6264972285960104e-06, - "loss": 0.7922, + "epoch": 0.7640527411519777, + "grad_norm": 0.9960730075836182, + "learning_rate": 3.510565857272972e-05, + "loss": 0.6726, "step": 4404 }, { - "epoch": 1.6085448238086544, - "grad_norm": 1.1261545419692993, - "learning_rate": 5.616526068972452e-06, - "loss": 0.85, + "epoch": 0.7642262317834837, + "grad_norm": 1.1020361185073853, + "learning_rate": 3.510119229547626e-05, + "loss": 0.7534, "step": 4405 }, { - "epoch": 1.6089099872192807, - "grad_norm": 0.9497619867324829, - "learning_rate": 5.606562309069059e-06, - "loss": 0.7836, + "epoch": 0.7643997224149895, + "grad_norm": 0.8751320242881775, + "learning_rate": 3.509672426571185e-05, + "loss": 0.7686, "step": 4406 }, { - "epoch": 1.609275150629907, - "grad_norm": 1.3275059461593628, - "learning_rate": 5.596605954011785e-06, - "loss": 0.8011, + "epoch": 0.7645732130464955, + "grad_norm": 0.7954155206680298, + "learning_rate": 3.509225448395499e-05, + "loss": 0.6855, "step": 4407 }, { - "epoch": 1.609640314040533, - "grad_norm": 0.9530651569366455, - "learning_rate": 5.586657008922782e-06, - "loss": 0.8379, + "epoch": 0.7647467036780013, + "grad_norm": 1.117141604423523, + "learning_rate": 3.508778295072441e-05, + "loss": 0.6719, "step": 4408 }, { - "epoch": 1.6100054774511594, - "grad_norm": 0.9755319356918335, - "learning_rate": 5.576715478920369e-06, - "loss": 0.8636, + "epoch": 0.7649201943095073, + "grad_norm": 0.8545970916748047, + "learning_rate": 3.5083309666539043e-05, + "loss": 0.7395, "step": 4409 }, { - "epoch": 1.6103706408617855, - "grad_norm": 1.2138700485229492, - "learning_rate": 5.566781369119072e-06, - "loss": 0.7761, + "epoch": 0.7650936849410132, + "grad_norm": 1.1490668058395386, + "learning_rate": 3.5078834631918014e-05, + "loss": 0.8463, "step": 4410 }, { - "epoch": 1.6107358042724118, - "grad_norm": 1.1236599683761597, - "learning_rate": 5.556854684629593e-06, - "loss": 0.7845, + "epoch": 0.7652671755725191, + "grad_norm": 1.2524504661560059, + "learning_rate": 3.507435784738065e-05, + "loss": 0.7642, "step": 4411 }, { - "epoch": 1.6111009676830381, - "grad_norm": 1.2611252069473267, - "learning_rate": 5.5469354305588175e-06, - "loss": 0.8099, + "epoch": 0.765440666204025, + "grad_norm": 1.1120007038116455, + "learning_rate": 3.506987931344649e-05, + "loss": 0.8088, "step": 4412 }, { - "epoch": 1.6114661310936644, - "grad_norm": 0.912324845790863, - "learning_rate": 5.537023612009791e-06, - "loss": 0.7594, + "epoch": 0.7656141568355309, + "grad_norm": 1.272675633430481, + "learning_rate": 3.5065399030635286e-05, + "loss": 0.7349, "step": 4413 }, { - "epoch": 1.6118312945042907, - "grad_norm": 1.1070760488510132, - "learning_rate": 5.527119234081752e-06, - "loss": 0.8226, + "epoch": 0.7657876474670368, + "grad_norm": 4.261226654052734, + "learning_rate": 3.506091699946697e-05, + "loss": 0.7664, "step": 4414 }, { - "epoch": 1.612196457914917, - "grad_norm": 1.1197280883789062, - "learning_rate": 5.5172223018701135e-06, - "loss": 0.7847, + "epoch": 0.7659611380985427, + "grad_norm": 0.9747645854949951, + "learning_rate": 3.505643322046168e-05, + "loss": 0.7367, "step": 4415 }, { - "epoch": 1.6125616213255431, - "grad_norm": 1.058461308479309, - "learning_rate": 5.5073328204664354e-06, - "loss": 0.7898, + "epoch": 0.7661346287300486, + "grad_norm": 0.951988935470581, + "learning_rate": 3.505194769413977e-05, + "loss": 0.6464, "step": 4416 }, { - "epoch": 1.6129267847361695, - "grad_norm": 8.55376148223877, - "learning_rate": 5.49745079495847e-06, - "loss": 0.7988, + "epoch": 0.7663081193615545, + "grad_norm": 0.8861560821533203, + "learning_rate": 3.5047460421021796e-05, + "loss": 0.7268, "step": 4417 }, { - "epoch": 1.6132919481467956, - "grad_norm": 1.0564197301864624, - "learning_rate": 5.487576230430123e-06, - "loss": 0.8334, + "epoch": 0.7664816099930604, + "grad_norm": 0.9109834432601929, + "learning_rate": 3.504297140162851e-05, + "loss": 0.7542, "step": 4418 }, { - "epoch": 1.6136571115574219, - "grad_norm": 1.678716778755188, - "learning_rate": 5.477709131961453e-06, - "loss": 0.7849, + "epoch": 0.7666551006245663, + "grad_norm": 1.0811225175857544, + "learning_rate": 3.503848063648086e-05, + "loss": 0.8252, "step": 4419 }, { - "epoch": 1.6140222749680482, - "grad_norm": 0.8755790591239929, - "learning_rate": 5.46784950462869e-06, - "loss": 0.8511, + "epoch": 0.7668285912560722, + "grad_norm": 0.973717212677002, + "learning_rate": 3.50339881261e-05, + "loss": 0.696, "step": 4420 }, { - "epoch": 1.6143874383786745, - "grad_norm": 0.9364663362503052, - "learning_rate": 5.457997353504221e-06, - "loss": 0.8146, + "epoch": 0.7670020818875781, + "grad_norm": 0.7981738448143005, + "learning_rate": 3.502949387100731e-05, + "loss": 0.8206, "step": 4421 }, { - "epoch": 1.6147526017893008, - "grad_norm": 1.024704098701477, - "learning_rate": 5.448152683656582e-06, - "loss": 0.8207, + "epoch": 0.767175572519084, + "grad_norm": 0.9120929837226868, + "learning_rate": 3.502499787172434e-05, + "loss": 0.7603, "step": 4422 }, { - "epoch": 1.6151177651999271, - "grad_norm": 1.091179609298706, - "learning_rate": 5.4383155001504525e-06, - "loss": 0.7902, + "epoch": 0.7673490631505898, + "grad_norm": 0.9891156554222107, + "learning_rate": 3.502050012877286e-05, + "loss": 0.6536, "step": 4423 }, { - "epoch": 1.6154829286105532, - "grad_norm": 1.0671055316925049, - "learning_rate": 5.428485808046677e-06, - "loss": 0.8278, + "epoch": 0.7675225537820958, + "grad_norm": 0.7954277396202087, + "learning_rate": 3.5016000642674836e-05, + "loss": 0.782, "step": 4424 }, { - "epoch": 1.6158480920211795, - "grad_norm": 1.7032924890518188, - "learning_rate": 5.418663612402233e-06, - "loss": 0.7789, + "epoch": 0.7676960444136016, + "grad_norm": 0.8590980172157288, + "learning_rate": 3.501149941395245e-05, + "loss": 0.7935, "step": 4425 }, { - "epoch": 1.6162132554318056, - "grad_norm": 1.1641969680786133, - "learning_rate": 5.408848918270246e-06, - "loss": 0.8071, + "epoch": 0.7678695350451076, + "grad_norm": 0.9861512780189514, + "learning_rate": 3.500699644312805e-05, + "loss": 0.8721, "step": 4426 }, { - "epoch": 1.616578418842432, - "grad_norm": 1.497086763381958, - "learning_rate": 5.399041730699992e-06, - "loss": 0.8284, + "epoch": 0.7680430256766134, + "grad_norm": 0.8314587473869324, + "learning_rate": 3.5002491730724235e-05, + "loss": 0.7673, "step": 4427 }, { - "epoch": 1.6169435822530582, - "grad_norm": 1.1225141286849976, - "learning_rate": 5.389242054736867e-06, - "loss": 0.8519, + "epoch": 0.7682165163081194, + "grad_norm": 0.939253568649292, + "learning_rate": 3.4997985277263765e-05, + "loss": 0.7358, "step": 4428 }, { - "epoch": 1.6173087456636845, - "grad_norm": 1.1222800016403198, - "learning_rate": 5.3794498954224085e-06, - "loss": 0.8312, + "epoch": 0.7683900069396252, + "grad_norm": 0.8166649341583252, + "learning_rate": 3.499347708326964e-05, + "loss": 0.73, "step": 4429 }, { - "epoch": 1.6176739090743109, - "grad_norm": 1.274857521057129, - "learning_rate": 5.36966525779429e-06, - "loss": 0.8174, + "epoch": 0.7685634975711312, + "grad_norm": 0.6916984915733337, + "learning_rate": 3.498896714926502e-05, + "loss": 0.7542, "step": 4430 }, { - "epoch": 1.6180390724849372, - "grad_norm": 1.3062522411346436, - "learning_rate": 5.359888146886316e-06, - "loss": 0.8037, + "epoch": 0.768736988202637, + "grad_norm": 0.7392248511314392, + "learning_rate": 3.4984455475773304e-05, + "loss": 0.8303, "step": 4431 }, { - "epoch": 1.6184042358955633, - "grad_norm": 1.1047271490097046, - "learning_rate": 5.350118567728429e-06, - "loss": 0.792, + "epoch": 0.768910478834143, + "grad_norm": 1.1106517314910889, + "learning_rate": 3.4979942063318066e-05, + "loss": 0.679, "step": 4432 }, { - "epoch": 1.6187693993061896, - "grad_norm": 0.9961979985237122, - "learning_rate": 5.340356525346666e-06, - "loss": 0.8232, + "epoch": 0.7690839694656488, + "grad_norm": 0.8788468837738037, + "learning_rate": 3.497542691242309e-05, + "loss": 0.7783, "step": 4433 }, { - "epoch": 1.6191345627168157, - "grad_norm": 1.2175962924957275, - "learning_rate": 5.330602024763218e-06, - "loss": 0.7806, + "epoch": 0.7692574600971548, + "grad_norm": 0.7951076030731201, + "learning_rate": 3.497091002361238e-05, + "loss": 0.8291, "step": 4434 }, { - "epoch": 1.619499726127442, - "grad_norm": 1.209874153137207, - "learning_rate": 5.320855070996383e-06, - "loss": 0.785, + "epoch": 0.7694309507286606, + "grad_norm": 0.9497836828231812, + "learning_rate": 3.496639139741011e-05, + "loss": 0.6738, "step": 4435 }, { - "epoch": 1.6198648895380683, - "grad_norm": 0.9965007305145264, - "learning_rate": 5.311115669060576e-06, - "loss": 0.8129, + "epoch": 0.7696044413601666, + "grad_norm": 0.8483584523200989, + "learning_rate": 3.496187103434069e-05, + "loss": 0.6238, "step": 4436 }, { - "epoch": 1.6202300529486946, - "grad_norm": 0.903425395488739, - "learning_rate": 5.3013838239663325e-06, - "loss": 0.8394, + "epoch": 0.7697779319916724, + "grad_norm": 0.9556950330734253, + "learning_rate": 3.495734893492869e-05, + "loss": 0.6816, "step": 4437 }, { - "epoch": 1.620595216359321, - "grad_norm": 1.2627040147781372, - "learning_rate": 5.291659540720289e-06, - "loss": 0.8162, + "epoch": 0.7699514226231784, + "grad_norm": 0.8394126892089844, + "learning_rate": 3.4952825099698926e-05, + "loss": 0.8645, "step": 4438 }, { - "epoch": 1.620960379769947, - "grad_norm": 1.221693992614746, - "learning_rate": 5.281942824325204e-06, - "loss": 0.8109, + "epoch": 0.7701249132546842, + "grad_norm": 0.9741814732551575, + "learning_rate": 3.494829952917638e-05, + "loss": 0.9172, "step": 4439 }, { - "epoch": 1.6213255431805733, - "grad_norm": 1.3941375017166138, - "learning_rate": 5.272233679779934e-06, - "loss": 0.7888, + "epoch": 0.7702984038861902, + "grad_norm": 1.2359071969985962, + "learning_rate": 3.4943772223886264e-05, + "loss": 0.7507, "step": 4440 }, { - "epoch": 1.6216907065911994, - "grad_norm": 1.3213870525360107, - "learning_rate": 5.262532112079455e-06, - "loss": 0.8059, + "epoch": 0.770471894517696, + "grad_norm": 0.7628108263015747, + "learning_rate": 3.493924318435395e-05, + "loss": 0.7599, "step": 4441 }, { - "epoch": 1.6220558700018257, - "grad_norm": 1.0521044731140137, - "learning_rate": 5.252838126214827e-06, - "loss": 0.811, + "epoch": 0.7706453851492019, + "grad_norm": 1.3468672037124634, + "learning_rate": 3.493471241110507e-05, + "loss": 0.7493, "step": 4442 }, { - "epoch": 1.622421033412452, - "grad_norm": 1.1098337173461914, - "learning_rate": 5.2431517271732106e-06, - "loss": 0.8126, + "epoch": 0.7708188757807078, + "grad_norm": 0.962803840637207, + "learning_rate": 3.493017990466542e-05, + "loss": 0.8218, "step": 4443 }, { - "epoch": 1.6227861968230783, - "grad_norm": 1.2647901773452759, - "learning_rate": 5.233472919937872e-06, - "loss": 0.8047, + "epoch": 0.7709923664122137, + "grad_norm": 0.8495938181877136, + "learning_rate": 3.492564566556098e-05, + "loss": 0.7389, "step": 4444 }, { - "epoch": 1.6231513602337047, - "grad_norm": 1.0067957639694214, - "learning_rate": 5.2238017094881765e-06, - "loss": 0.8351, + "epoch": 0.7711658570437196, + "grad_norm": 0.8642054200172424, + "learning_rate": 3.4921109694317974e-05, + "loss": 0.7417, "step": 4445 }, { - "epoch": 1.623516523644331, - "grad_norm": 1.1683073043823242, - "learning_rate": 5.214138100799573e-06, - "loss": 0.8103, + "epoch": 0.7713393476752255, + "grad_norm": 1.5357446670532227, + "learning_rate": 3.491657199146281e-05, + "loss": 0.8087, "step": 4446 }, { - "epoch": 1.623881687054957, - "grad_norm": 1.3753080368041992, - "learning_rate": 5.2044820988435906e-06, - "loss": 0.794, + "epoch": 0.7715128383067315, + "grad_norm": 0.8969196081161499, + "learning_rate": 3.4912032557522075e-05, + "loss": 0.8445, "step": 4447 }, { - "epoch": 1.6242468504655834, - "grad_norm": 0.9003234505653381, - "learning_rate": 5.194833708587863e-06, - "loss": 0.812, + "epoch": 0.7716863289382373, + "grad_norm": 1.0372133255004883, + "learning_rate": 3.490749139302258e-05, + "loss": 0.676, "step": 4448 }, { - "epoch": 1.6246120138762095, - "grad_norm": 0.9112487435340881, - "learning_rate": 5.185192934996097e-06, - "loss": 0.8791, + "epoch": 0.7718598195697433, + "grad_norm": 0.8731522560119629, + "learning_rate": 3.4902948498491357e-05, + "loss": 0.7509, "step": 4449 }, { - "epoch": 1.6249771772868358, - "grad_norm": 1.1415514945983887, - "learning_rate": 5.175559783028084e-06, - "loss": 0.8506, + "epoch": 0.7720333102012491, + "grad_norm": 1.0199079513549805, + "learning_rate": 3.4898403874455584e-05, + "loss": 0.8441, "step": 4450 }, { - "epoch": 1.625342340697462, - "grad_norm": 1.1334431171417236, - "learning_rate": 5.165934257639702e-06, - "loss": 0.8241, + "epoch": 0.7722068008327551, + "grad_norm": 1.2908271551132202, + "learning_rate": 3.489385752144268e-05, + "loss": 0.7954, "step": 4451 }, { - "epoch": 1.6257075041080884, - "grad_norm": 1.0748672485351562, - "learning_rate": 5.156316363782885e-06, - "loss": 0.8015, + "epoch": 0.7723802914642609, + "grad_norm": 1.0643309354782104, + "learning_rate": 3.4889309439980256e-05, + "loss": 0.8192, "step": 4452 }, { - "epoch": 1.6260726675187147, - "grad_norm": 0.9802602529525757, - "learning_rate": 5.146706106405657e-06, - "loss": 0.8324, + "epoch": 0.7725537820957669, + "grad_norm": 0.7285794019699097, + "learning_rate": 3.4884759630596124e-05, + "loss": 0.8462, "step": 4453 }, { - "epoch": 1.626437830929341, - "grad_norm": 1.120448112487793, - "learning_rate": 5.137103490452113e-06, - "loss": 0.8065, + "epoch": 0.7727272727272727, + "grad_norm": 0.9079310894012451, + "learning_rate": 3.488020809381829e-05, + "loss": 0.7166, "step": 4454 }, { - "epoch": 1.6268029943399671, - "grad_norm": 1.0719910860061646, - "learning_rate": 5.1275085208624185e-06, - "loss": 0.8203, + "epoch": 0.7729007633587787, + "grad_norm": 0.7547510266304016, + "learning_rate": 3.4875654830174975e-05, + "loss": 0.731, "step": 4455 }, { - "epoch": 1.6271681577505934, - "grad_norm": 1.697625756263733, - "learning_rate": 5.1179212025727935e-06, - "loss": 0.7958, + "epoch": 0.7730742539902845, + "grad_norm": 0.7736081480979919, + "learning_rate": 3.4871099840194575e-05, + "loss": 0.767, "step": 4456 }, { - "epoch": 1.6275333211612195, - "grad_norm": 1.2465591430664062, - "learning_rate": 5.108341540515522e-06, - "loss": 0.8292, + "epoch": 0.7732477446217905, + "grad_norm": 0.9601176977157593, + "learning_rate": 3.4866543124405714e-05, + "loss": 0.7776, "step": 4457 }, { - "epoch": 1.6278984845718458, - "grad_norm": 1.0539612770080566, - "learning_rate": 5.098769539618964e-06, - "loss": 0.8354, + "epoch": 0.7734212352532963, + "grad_norm": 0.8856390118598938, + "learning_rate": 3.4861984683337205e-05, + "loss": 0.7518, "step": 4458 }, { - "epoch": 1.6282636479824721, - "grad_norm": 1.4959781169891357, - "learning_rate": 5.08920520480753e-06, - "loss": 0.83, + "epoch": 0.7735947258848023, + "grad_norm": 1.149114727973938, + "learning_rate": 3.485742451751805e-05, + "loss": 0.6902, "step": 4459 }, { - "epoch": 1.6286288113930985, - "grad_norm": 0.8524137735366821, - "learning_rate": 5.0796485410016825e-06, - "loss": 0.7942, + "epoch": 0.7737682165163081, + "grad_norm": 0.6759079098701477, + "learning_rate": 3.485286262747747e-05, + "loss": 0.8171, "step": 4460 }, { - "epoch": 1.6289939748037248, - "grad_norm": 1.2815080881118774, - "learning_rate": 5.070099553117953e-06, - "loss": 0.7591, + "epoch": 0.7739417071478141, + "grad_norm": 0.8336185216903687, + "learning_rate": 3.484829901374487e-05, + "loss": 0.7629, "step": 4461 }, { - "epoch": 1.629359138214351, - "grad_norm": 0.9742132425308228, - "learning_rate": 5.060558246068897e-06, - "loss": 0.8373, + "epoch": 0.7741151977793199, + "grad_norm": 0.831173300743103, + "learning_rate": 3.4843733676849876e-05, + "loss": 0.8413, "step": 4462 }, { - "epoch": 1.6297243016249772, - "grad_norm": 1.1350549459457397, - "learning_rate": 5.0510246247631385e-06, - "loss": 0.8524, + "epoch": 0.7742886884108258, + "grad_norm": 0.9335201978683472, + "learning_rate": 3.4839166617322285e-05, + "loss": 0.7903, "step": 4463 }, { - "epoch": 1.6300894650356035, - "grad_norm": 1.0183902978897095, - "learning_rate": 5.041498694105349e-06, - "loss": 0.7906, + "epoch": 0.7744621790423317, + "grad_norm": 1.0892484188079834, + "learning_rate": 3.4834597835692117e-05, + "loss": 0.7295, "step": 4464 }, { - "epoch": 1.6304546284462296, - "grad_norm": 0.9935386776924133, - "learning_rate": 5.03198045899624e-06, - "loss": 0.7571, + "epoch": 0.7746356696738376, + "grad_norm": 0.7319350242614746, + "learning_rate": 3.483002733248959e-05, + "loss": 0.887, "step": 4465 }, { - "epoch": 1.6308197918568559, - "grad_norm": 1.0178877115249634, - "learning_rate": 5.022469924332547e-06, - "loss": 0.783, + "epoch": 0.7748091603053435, + "grad_norm": 0.7419962882995605, + "learning_rate": 3.482545510824511e-05, + "loss": 0.7288, "step": 4466 }, { - "epoch": 1.6311849552674822, - "grad_norm": 0.9873657822608948, - "learning_rate": 5.012967095007068e-06, - "loss": 0.8339, + "epoch": 0.7749826509368494, + "grad_norm": 0.7979143261909485, + "learning_rate": 3.4820881163489284e-05, + "loss": 0.7483, "step": 4467 }, { - "epoch": 1.6315501186781085, - "grad_norm": 0.9515143036842346, - "learning_rate": 5.0034719759086335e-06, - "loss": 0.8512, + "epoch": 0.7751561415683553, + "grad_norm": 0.9539660215377808, + "learning_rate": 3.481630549875293e-05, + "loss": 0.7876, "step": 4468 }, { - "epoch": 1.6319152820887348, - "grad_norm": 0.9072102904319763, - "learning_rate": 4.993984571922086e-06, - "loss": 0.8016, + "epoch": 0.7753296321998612, + "grad_norm": 0.9407137632369995, + "learning_rate": 3.481172811456707e-05, + "loss": 0.6998, "step": 4469 }, { - "epoch": 1.632280445499361, - "grad_norm": 1.0515618324279785, - "learning_rate": 4.984504887928325e-06, - "loss": 0.783, + "epoch": 0.7755031228313671, + "grad_norm": 0.8826245069503784, + "learning_rate": 3.480714901146289e-05, + "loss": 0.6754, "step": 4470 }, { - "epoch": 1.6326456089099872, - "grad_norm": 1.0709545612335205, - "learning_rate": 4.975032928804269e-06, - "loss": 0.8708, + "epoch": 0.775676613462873, + "grad_norm": 1.0516557693481445, + "learning_rate": 3.4802568189971814e-05, + "loss": 0.8264, "step": 4471 }, { - "epoch": 1.6330107723206133, - "grad_norm": 1.1815272569656372, - "learning_rate": 4.965568699422851e-06, - "loss": 0.7916, + "epoch": 0.7758501040943789, + "grad_norm": 0.8027101755142212, + "learning_rate": 3.479798565062546e-05, + "loss": 0.7493, "step": 4472 }, { - "epoch": 1.6333759357312396, - "grad_norm": 1.1043927669525146, - "learning_rate": 4.956112204653043e-06, - "loss": 0.8203, + "epoch": 0.7760235947258848, + "grad_norm": 0.6311869025230408, + "learning_rate": 3.479340139395562e-05, + "loss": 0.8792, "step": 4473 }, { - "epoch": 1.633741099141866, - "grad_norm": 0.9106027483940125, - "learning_rate": 4.946663449359834e-06, - "loss": 0.8353, + "epoch": 0.7761970853573907, + "grad_norm": 0.6964933276176453, + "learning_rate": 3.478881542049432e-05, + "loss": 0.6525, "step": 4474 }, { - "epoch": 1.6341062625524923, - "grad_norm": 1.0802770853042603, - "learning_rate": 4.937222438404232e-06, - "loss": 0.842, + "epoch": 0.7763705759888966, + "grad_norm": 0.7259735465049744, + "learning_rate": 3.478422773077375e-05, + "loss": 0.8088, "step": 4475 }, { - "epoch": 1.6344714259631186, - "grad_norm": 1.32493257522583, - "learning_rate": 4.927789176643247e-06, - "loss": 0.83, + "epoch": 0.7765440666204025, + "grad_norm": 0.9515973925590515, + "learning_rate": 3.4779638325326326e-05, + "loss": 0.7214, "step": 4476 }, { - "epoch": 1.6348365893737449, - "grad_norm": 1.3165159225463867, - "learning_rate": 4.918363668929922e-06, - "loss": 0.8121, + "epoch": 0.7767175572519084, + "grad_norm": 0.7791823744773865, + "learning_rate": 3.477504720468465e-05, + "loss": 0.7454, "step": 4477 }, { - "epoch": 1.635201752784371, - "grad_norm": 1.1204863786697388, - "learning_rate": 4.908945920113299e-06, - "loss": 0.8241, + "epoch": 0.7768910478834143, + "grad_norm": 2.5212619304656982, + "learning_rate": 3.477045436938154e-05, + "loss": 0.6877, "step": 4478 }, { - "epoch": 1.6355669161949973, - "grad_norm": 0.9250267744064331, - "learning_rate": 4.899535935038436e-06, - "loss": 0.7941, + "epoch": 0.7770645385149202, + "grad_norm": 1.199476718902588, + "learning_rate": 3.4765859819949977e-05, + "loss": 0.7092, "step": 4479 }, { - "epoch": 1.6359320796056234, - "grad_norm": 1.0335289239883423, - "learning_rate": 4.890133718546395e-06, - "loss": 0.8246, + "epoch": 0.7772380291464261, + "grad_norm": 0.9219550490379333, + "learning_rate": 3.476126355692318e-05, + "loss": 0.9226, "step": 4480 }, { - "epoch": 1.6362972430162497, - "grad_norm": 1.1344773769378662, - "learning_rate": 4.880739275474229e-06, - "loss": 0.7786, + "epoch": 0.777411519777932, + "grad_norm": 0.8859871029853821, + "learning_rate": 3.475666558083455e-05, + "loss": 0.7991, "step": 4481 }, { - "epoch": 1.636662406426876, - "grad_norm": 1.5544350147247314, - "learning_rate": 4.8713526106550134e-06, - "loss": 0.8002, + "epoch": 0.7775850104094378, + "grad_norm": 1.141983985900879, + "learning_rate": 3.475206589221768e-05, + "loss": 0.6093, "step": 4482 }, { - "epoch": 1.6370275698375023, - "grad_norm": 1.0704041719436646, - "learning_rate": 4.861973728917799e-06, - "loss": 0.7873, + "epoch": 0.7777585010409438, + "grad_norm": 0.8446226119995117, + "learning_rate": 3.4747464491606376e-05, + "loss": 0.7072, "step": 4483 }, { - "epoch": 1.6373927332481286, - "grad_norm": 1.0938000679016113, - "learning_rate": 4.852602635087651e-06, - "loss": 0.8331, + "epoch": 0.7779319916724496, + "grad_norm": 0.7664715647697449, + "learning_rate": 3.4742861379534636e-05, + "loss": 0.8203, "step": 4484 }, { - "epoch": 1.637757896658755, - "grad_norm": 1.5662450790405273, - "learning_rate": 4.843239333985625e-06, - "loss": 0.8146, + "epoch": 0.7781054823039556, + "grad_norm": 0.80955970287323, + "learning_rate": 3.4738256556536654e-05, + "loss": 0.8271, "step": 4485 }, { - "epoch": 1.638123060069381, - "grad_norm": 1.2406224012374878, - "learning_rate": 4.833883830428754e-06, - "loss": 0.8359, + "epoch": 0.7782789729354614, + "grad_norm": 0.8710863590240479, + "learning_rate": 3.473365002314682e-05, + "loss": 0.7711, "step": 4486 }, { - "epoch": 1.6384882234800073, - "grad_norm": 0.8192363977432251, - "learning_rate": 4.8245361292300705e-06, - "loss": 0.7833, + "epoch": 0.7784524635669674, + "grad_norm": 0.8899707198143005, + "learning_rate": 3.4729041779899736e-05, + "loss": 0.6191, "step": 4487 }, { - "epoch": 1.6388533868906334, - "grad_norm": 0.9832175374031067, - "learning_rate": 4.815196235198598e-06, - "loss": 0.8051, + "epoch": 0.7786259541984732, + "grad_norm": 1.9692116975784302, + "learning_rate": 3.4724431827330196e-05, + "loss": 0.6005, "step": 4488 }, { - "epoch": 1.6392185503012597, - "grad_norm": 1.1779032945632935, - "learning_rate": 4.805864153139339e-06, - "loss": 0.828, + "epoch": 0.7787994448299792, + "grad_norm": 0.9263486266136169, + "learning_rate": 3.471982016597317e-05, + "loss": 0.6804, "step": 4489 }, { - "epoch": 1.639583713711886, - "grad_norm": 1.2273808717727661, - "learning_rate": 4.796539887853266e-06, - "loss": 0.8253, + "epoch": 0.778972935461485, + "grad_norm": 0.8461465835571289, + "learning_rate": 3.4715206796363876e-05, + "loss": 0.865, "step": 4490 }, { - "epoch": 1.6399488771225124, - "grad_norm": 0.9103972911834717, - "learning_rate": 4.7872234441373434e-06, - "loss": 0.807, + "epoch": 0.779146426092991, + "grad_norm": 0.7799747586250305, + "learning_rate": 3.4710591719037685e-05, + "loss": 0.8281, "step": 4491 }, { - "epoch": 1.6403140405331387, - "grad_norm": 1.0704798698425293, - "learning_rate": 4.7779148267845065e-06, - "loss": 0.8199, + "epoch": 0.7793199167244969, + "grad_norm": 0.7524052262306213, + "learning_rate": 3.470597493453018e-05, + "loss": 0.7812, "step": 4492 }, { - "epoch": 1.640679203943765, - "grad_norm": 1.2084496021270752, - "learning_rate": 4.768614040583668e-06, - "loss": 0.8442, + "epoch": 0.7794934073560028, + "grad_norm": 2.66054630279541, + "learning_rate": 3.470135644337715e-05, + "loss": 0.8242, "step": 4493 }, { - "epoch": 1.641044367354391, - "grad_norm": 1.3812192678451538, - "learning_rate": 4.7593210903197155e-06, - "loss": 0.7875, + "epoch": 0.7796668979875087, + "grad_norm": 1.181273341178894, + "learning_rate": 3.469673624611457e-05, + "loss": 0.729, "step": 4494 }, { - "epoch": 1.6414095307650174, - "grad_norm": 1.4102391004562378, - "learning_rate": 4.750035980773488e-06, - "loss": 0.8301, + "epoch": 0.7798403886190146, + "grad_norm": 1.2465263605117798, + "learning_rate": 3.4692114343278626e-05, + "loss": 0.6934, "step": 4495 }, { - "epoch": 1.6417746941756435, - "grad_norm": 0.8265354037284851, - "learning_rate": 4.740758716721803e-06, - "loss": 0.8074, + "epoch": 0.7800138792505205, + "grad_norm": 1.0363940000534058, + "learning_rate": 3.4687490735405696e-05, + "loss": 0.6542, "step": 4496 }, { - "epoch": 1.6421398575862698, - "grad_norm": 0.9566201567649841, - "learning_rate": 4.731489302937442e-06, - "loss": 0.8015, + "epoch": 0.7801873698820264, + "grad_norm": 0.9432730674743652, + "learning_rate": 3.468286542303235e-05, + "loss": 0.6023, "step": 4497 }, { - "epoch": 1.642505020996896, - "grad_norm": 0.9264112114906311, - "learning_rate": 4.722227744189148e-06, - "loss": 0.7744, + "epoch": 0.7803608605135323, + "grad_norm": 0.8999814391136169, + "learning_rate": 3.467823840669536e-05, + "loss": 0.5657, "step": 4498 }, { - "epoch": 1.6428701844075224, - "grad_norm": 0.896152138710022, - "learning_rate": 4.712974045241625e-06, - "loss": 0.8309, + "epoch": 0.7805343511450382, + "grad_norm": 0.8244766592979431, + "learning_rate": 3.4673609686931697e-05, + "loss": 0.8311, "step": 4499 }, { - "epoch": 1.6432353478181487, - "grad_norm": 1.2148700952529907, - "learning_rate": 4.70372821085552e-06, - "loss": 0.8363, + "epoch": 0.7807078417765441, + "grad_norm": 0.9977414608001709, + "learning_rate": 3.466897926427854e-05, + "loss": 0.7117, "step": 4500 }, { - "epoch": 1.643600511228775, - "grad_norm": 0.7038260698318481, - "learning_rate": 4.694490245787451e-06, - "loss": 0.8217, + "epoch": 0.7808813324080499, + "grad_norm": 0.9764508605003357, + "learning_rate": 3.4664347139273245e-05, + "loss": 0.7761, "step": 4501 }, { - "epoch": 1.6439656746394011, - "grad_norm": 0.993716835975647, - "learning_rate": 4.685260154789979e-06, - "loss": 0.8378, + "epoch": 0.7810548230395559, + "grad_norm": 0.9173213839530945, + "learning_rate": 3.465971331245337e-05, + "loss": 0.6875, "step": 4502 }, { - "epoch": 1.6443308380500272, - "grad_norm": 1.1224515438079834, - "learning_rate": 4.676037942611613e-06, - "loss": 0.7796, + "epoch": 0.7812283136710617, + "grad_norm": 1.0777039527893066, + "learning_rate": 3.465507778435669e-05, + "loss": 0.6255, "step": 4503 }, { - "epoch": 1.6446960014606535, - "grad_norm": 0.9353373646736145, - "learning_rate": 4.6668236139968205e-06, - "loss": 0.8384, + "epoch": 0.7814018043025677, + "grad_norm": 0.9607329368591309, + "learning_rate": 3.465044055552116e-05, + "loss": 0.8486, "step": 4504 }, { - "epoch": 1.6450611648712798, - "grad_norm": 0.971235990524292, - "learning_rate": 4.657617173685989e-06, - "loss": 0.8279, + "epoch": 0.7815752949340735, + "grad_norm": 0.7243859171867371, + "learning_rate": 3.464580162648492e-05, + "loss": 0.8323, "step": 4505 }, { - "epoch": 1.6454263282819062, - "grad_norm": 1.0624948740005493, - "learning_rate": 4.648418626415472e-06, - "loss": 0.8636, + "epoch": 0.7817487855655795, + "grad_norm": 0.9147722721099854, + "learning_rate": 3.464116099778634e-05, + "loss": 0.78, "step": 4506 }, { - "epoch": 1.6457914916925325, - "grad_norm": 1.0562167167663574, - "learning_rate": 4.63922797691755e-06, - "loss": 0.7944, + "epoch": 0.7819222761970853, + "grad_norm": 0.8615881204605103, + "learning_rate": 3.463651866996397e-05, + "loss": 0.708, "step": 4507 }, { - "epoch": 1.6461566551031588, - "grad_norm": 1.3680952787399292, - "learning_rate": 4.63004522992045e-06, - "loss": 0.8142, + "epoch": 0.7820957668285913, + "grad_norm": 0.7966358661651611, + "learning_rate": 3.463187464355655e-05, + "loss": 0.8555, "step": 4508 }, { - "epoch": 1.6465218185137849, - "grad_norm": 1.3380866050720215, - "learning_rate": 4.620870390148322e-06, - "loss": 0.7913, + "epoch": 0.7822692574600971, + "grad_norm": 1.0634485483169556, + "learning_rate": 3.462722891910303e-05, + "loss": 0.8071, "step": 4509 }, { - "epoch": 1.6468869819244112, - "grad_norm": 0.928678035736084, - "learning_rate": 4.611703462321246e-06, - "loss": 0.8463, + "epoch": 0.7824427480916031, + "grad_norm": 0.8468574285507202, + "learning_rate": 3.462258149714255e-05, + "loss": 0.7964, "step": 4510 }, { - "epoch": 1.6472521453350373, - "grad_norm": 0.9685364961624146, - "learning_rate": 4.602544451155247e-06, - "loss": 0.8208, + "epoch": 0.7826162387231089, + "grad_norm": 2.184201717376709, + "learning_rate": 3.461793237821445e-05, + "loss": 0.8079, "step": 4511 }, { - "epoch": 1.6476173087456636, - "grad_norm": 1.22025465965271, - "learning_rate": 4.593393361362264e-06, - "loss": 0.8195, + "epoch": 0.7827897293546149, + "grad_norm": 0.7968469858169556, + "learning_rate": 3.461328156285826e-05, + "loss": 0.9136, "step": 4512 }, { - "epoch": 1.64798247215629, - "grad_norm": 1.1866446733474731, - "learning_rate": 4.584250197650169e-06, - "loss": 0.8717, + "epoch": 0.7829632199861207, + "grad_norm": 0.8219817280769348, + "learning_rate": 3.460862905161372e-05, + "loss": 0.8367, "step": 4513 }, { - "epoch": 1.6483476355669162, - "grad_norm": 0.818331778049469, - "learning_rate": 4.575114964722758e-06, - "loss": 0.8007, + "epoch": 0.7831367106176267, + "grad_norm": 2.755932092666626, + "learning_rate": 3.4603974845020754e-05, + "loss": 0.6617, "step": 4514 }, { - "epoch": 1.6487127989775425, - "grad_norm": 1.0766884088516235, - "learning_rate": 4.565987667279728e-06, - "loss": 0.8134, + "epoch": 0.7833102012491325, + "grad_norm": 0.8261186480522156, + "learning_rate": 3.459931894361949e-05, + "loss": 0.6707, "step": 4515 }, { - "epoch": 1.6490779623881688, - "grad_norm": 2.012032985687256, - "learning_rate": 4.556868310016715e-06, - "loss": 0.8118, + "epoch": 0.7834836918806385, + "grad_norm": 1.0474417209625244, + "learning_rate": 3.4594661347950255e-05, + "loss": 0.7776, "step": 4516 }, { - "epoch": 1.649443125798795, - "grad_norm": 1.3108714818954468, - "learning_rate": 4.547756897625264e-06, - "loss": 0.8287, + "epoch": 0.7836571825121443, + "grad_norm": 1.0841500759124756, + "learning_rate": 3.459000205855356e-05, + "loss": 0.7312, "step": 4517 }, { - "epoch": 1.6498082892094212, - "grad_norm": 1.3259278535842896, - "learning_rate": 4.538653434792833e-06, - "loss": 0.7898, + "epoch": 0.7838306731436503, + "grad_norm": 1.032971739768982, + "learning_rate": 3.458534107597013e-05, + "loss": 0.7991, "step": 4518 }, { - "epoch": 1.6501734526200473, - "grad_norm": 1.1810096502304077, - "learning_rate": 4.529557926202781e-06, - "loss": 0.8669, + "epoch": 0.7840041637751561, + "grad_norm": 0.9403311610221863, + "learning_rate": 3.458067840074087e-05, + "loss": 0.7366, "step": 4519 }, { - "epoch": 1.6505386160306736, - "grad_norm": 1.0852611064910889, - "learning_rate": 4.520470376534385e-06, - "loss": 0.8362, + "epoch": 0.7841776544066621, + "grad_norm": 1.055198073387146, + "learning_rate": 3.457601403340689e-05, + "loss": 0.6802, "step": 4520 }, { - "epoch": 1.6509037794413, - "grad_norm": 1.3698539733886719, - "learning_rate": 4.511390790462824e-06, - "loss": 0.7997, + "epoch": 0.7843511450381679, + "grad_norm": 1.2381149530410767, + "learning_rate": 3.45713479745095e-05, + "loss": 0.7135, "step": 4521 }, { - "epoch": 1.6512689428519263, - "grad_norm": 0.8695532083511353, - "learning_rate": 4.502319172659189e-06, - "loss": 0.8365, + "epoch": 0.7845246356696738, + "grad_norm": 1.5668121576309204, + "learning_rate": 3.45666802245902e-05, + "loss": 0.7262, "step": 4522 }, { - "epoch": 1.6516341062625526, - "grad_norm": 0.9645466208457947, - "learning_rate": 4.49325552779045e-06, - "loss": 0.8247, + "epoch": 0.7846981263011797, + "grad_norm": 0.8783586025238037, + "learning_rate": 3.456201078419068e-05, + "loss": 0.6292, "step": 4523 }, { - "epoch": 1.651999269673179, - "grad_norm": 1.1406689882278442, - "learning_rate": 4.484199860519502e-06, - "loss": 0.7744, + "epoch": 0.7848716169326856, + "grad_norm": 1.021485686302185, + "learning_rate": 3.455733965385284e-05, + "loss": 0.7043, "step": 4524 }, { - "epoch": 1.652364433083805, - "grad_norm": 1.005447506904602, - "learning_rate": 4.475152175505108e-06, - "loss": 0.858, + "epoch": 0.7850451075641915, + "grad_norm": 1.7147735357284546, + "learning_rate": 3.455266683411878e-05, + "loss": 0.6909, "step": 4525 }, { - "epoch": 1.6527295964944313, - "grad_norm": 1.2018187046051025, - "learning_rate": 4.466112477401949e-06, - "loss": 0.8105, + "epoch": 0.7852185981956974, + "grad_norm": 0.8205025792121887, + "learning_rate": 3.454799232553077e-05, + "loss": 0.6956, "step": 4526 }, { - "epoch": 1.6530947599050574, - "grad_norm": 1.2923096418380737, - "learning_rate": 4.4570807708605825e-06, - "loss": 0.8088, + "epoch": 0.7853920888272033, + "grad_norm": 1.3591874837875366, + "learning_rate": 3.45433161286313e-05, + "loss": 0.5922, "step": 4527 }, { - "epoch": 1.6534599233156837, - "grad_norm": 1.1786904335021973, - "learning_rate": 4.448057060527466e-06, - "loss": 0.7991, + "epoch": 0.7855655794587092, + "grad_norm": 1.5777838230133057, + "learning_rate": 3.4538638243963045e-05, + "loss": 0.7607, "step": 4528 }, { - "epoch": 1.65382508672631, - "grad_norm": 1.0121408700942993, - "learning_rate": 4.439041351044926e-06, - "loss": 0.801, + "epoch": 0.7857390700902152, + "grad_norm": 0.9818868637084961, + "learning_rate": 3.453395867206888e-05, + "loss": 0.6782, "step": 4529 }, { - "epoch": 1.6541902501369363, - "grad_norm": 1.0171006917953491, - "learning_rate": 4.430033647051191e-06, - "loss": 0.8466, + "epoch": 0.785912560721721, + "grad_norm": 0.9363613724708557, + "learning_rate": 3.4529277413491885e-05, + "loss": 0.8208, "step": 4530 }, { - "epoch": 1.6545554135475626, - "grad_norm": 0.8858730792999268, - "learning_rate": 4.421033953180358e-06, - "loss": 0.8109, + "epoch": 0.786086051353227, + "grad_norm": 1.594916820526123, + "learning_rate": 3.452459446877531e-05, + "loss": 0.7546, "step": 4531 }, { - "epoch": 1.654920576958189, - "grad_norm": 1.2972527742385864, - "learning_rate": 4.412042274062415e-06, - "loss": 0.7912, + "epoch": 0.7862595419847328, + "grad_norm": 3.1859042644500732, + "learning_rate": 3.451990983846262e-05, + "loss": 0.7361, "step": 4532 }, { - "epoch": 1.655285740368815, - "grad_norm": 0.9590739011764526, - "learning_rate": 4.4030586143232145e-06, - "loss": 0.78, + "epoch": 0.7864330326162388, + "grad_norm": 1.0521354675292969, + "learning_rate": 3.4515223523097476e-05, + "loss": 0.7061, "step": 4533 }, { - "epoch": 1.6556509037794414, - "grad_norm": 1.2281558513641357, - "learning_rate": 4.394082978584488e-06, - "loss": 0.8232, + "epoch": 0.7866065232477446, + "grad_norm": 0.8013879656791687, + "learning_rate": 3.451053552322373e-05, + "loss": 0.6692, "step": 4534 }, { - "epoch": 1.6560160671900674, - "grad_norm": 0.7967516183853149, - "learning_rate": 4.38511537146385e-06, - "loss": 0.7729, + "epoch": 0.7867800138792506, + "grad_norm": 1.0064632892608643, + "learning_rate": 3.4505845839385426e-05, + "loss": 0.6882, "step": 4535 }, { - "epoch": 1.6563812306006938, - "grad_norm": 1.0337488651275635, - "learning_rate": 4.376155797574761e-06, - "loss": 0.8079, + "epoch": 0.7869535045107564, + "grad_norm": 1.22476327419281, + "learning_rate": 3.4501154472126815e-05, + "loss": 0.6875, "step": 4536 }, { - "epoch": 1.65674639401132, - "grad_norm": 1.1772987842559814, - "learning_rate": 4.367204261526568e-06, - "loss": 0.8185, + "epoch": 0.7871269951422624, + "grad_norm": 0.7659679055213928, + "learning_rate": 3.449646142199233e-05, + "loss": 0.7415, "step": 4537 }, { - "epoch": 1.6571115574219464, - "grad_norm": 1.9737513065338135, - "learning_rate": 4.358260767924482e-06, - "loss": 0.8174, + "epoch": 0.7873004857737682, + "grad_norm": 0.9621604681015015, + "learning_rate": 3.4491766689526596e-05, + "loss": 0.8809, "step": 4538 }, { - "epoch": 1.6574767208325727, - "grad_norm": 0.8760918378829956, - "learning_rate": 4.349325321369564e-06, - "loss": 0.7985, + "epoch": 0.7874739764052742, + "grad_norm": 0.7625668048858643, + "learning_rate": 3.4487070275274454e-05, + "loss": 0.7285, "step": 4539 }, { - "epoch": 1.6578418842431988, - "grad_norm": 1.3092888593673706, - "learning_rate": 4.340397926458744e-06, - "loss": 0.8469, + "epoch": 0.78764746703678, + "grad_norm": 0.9477669596672058, + "learning_rate": 3.448237217978093e-05, + "loss": 0.7705, "step": 4540 }, { - "epoch": 1.658207047653825, - "grad_norm": 1.2694766521453857, - "learning_rate": 4.331478587784809e-06, - "loss": 0.8395, + "epoch": 0.7878209576682859, + "grad_norm": 0.9085525870323181, + "learning_rate": 3.447767240359124e-05, + "loss": 0.9036, "step": 4541 }, { - "epoch": 1.6585722110644512, - "grad_norm": 1.2553669214248657, - "learning_rate": 4.32256730993641e-06, - "loss": 0.8255, + "epoch": 0.7879944482997918, + "grad_norm": 1.1131858825683594, + "learning_rate": 3.4472970947250794e-05, + "loss": 0.7458, "step": 4542 }, { - "epoch": 1.6589373744750775, - "grad_norm": 0.9214420914649963, - "learning_rate": 4.313664097498027e-06, - "loss": 0.7893, + "epoch": 0.7881679389312977, + "grad_norm": 0.7832034826278687, + "learning_rate": 3.4468267811305206e-05, + "loss": 0.7288, "step": 4543 }, { - "epoch": 1.6593025378857038, - "grad_norm": 0.9097098708152771, - "learning_rate": 4.304768955050014e-06, - "loss": 0.8013, + "epoch": 0.7883414295628036, + "grad_norm": 0.9926349520683289, + "learning_rate": 3.446356299630028e-05, + "loss": 0.6552, "step": 4544 }, { - "epoch": 1.6596677012963301, - "grad_norm": 1.1653425693511963, - "learning_rate": 4.29588188716856e-06, - "loss": 0.8386, + "epoch": 0.7885149201943095, + "grad_norm": 0.8618327379226685, + "learning_rate": 3.4458856502782016e-05, + "loss": 0.7028, "step": 4545 }, { - "epoch": 1.6600328647069564, - "grad_norm": 1.02627432346344, - "learning_rate": 4.287002898425709e-06, - "loss": 0.7763, + "epoch": 0.7886884108258154, + "grad_norm": 0.9549086689949036, + "learning_rate": 3.4454148331296606e-05, + "loss": 0.7085, "step": 4546 }, { - "epoch": 1.6603980281175827, - "grad_norm": 0.9814475774765015, - "learning_rate": 4.2781319933893515e-06, - "loss": 0.8412, + "epoch": 0.7888619014573213, + "grad_norm": 1.3327431678771973, + "learning_rate": 3.444943848239044e-05, + "loss": 0.7196, "step": 4547 }, { - "epoch": 1.6607631915282088, - "grad_norm": 1.2622781991958618, - "learning_rate": 4.269269176623203e-06, - "loss": 0.8417, + "epoch": 0.7890353920888272, + "grad_norm": 1.4848185777664185, + "learning_rate": 3.44447269566101e-05, + "loss": 0.6672, "step": 4548 }, { - "epoch": 1.6611283549388351, - "grad_norm": 0.8316565155982971, - "learning_rate": 4.260414452686821e-06, - "loss": 0.8003, + "epoch": 0.7892088827203331, + "grad_norm": 0.9225616455078125, + "learning_rate": 3.444001375450237e-05, + "loss": 0.6584, "step": 4549 }, { - "epoch": 1.6614935183494612, - "grad_norm": 1.2000550031661987, - "learning_rate": 4.251567826135614e-06, - "loss": 0.7729, + "epoch": 0.789382373351839, + "grad_norm": 3.1193253993988037, + "learning_rate": 3.443529887661421e-05, + "loss": 0.7526, "step": 4550 }, { - "epoch": 1.6618586817600876, - "grad_norm": 0.9989985823631287, - "learning_rate": 4.242729301520816e-06, - "loss": 0.8051, + "epoch": 0.7895558639833449, + "grad_norm": 0.9151145219802856, + "learning_rate": 3.4430582323492805e-05, + "loss": 0.6797, "step": 4551 }, { - "epoch": 1.6622238451707139, - "grad_norm": 1.315536379814148, - "learning_rate": 4.233898883389496e-06, - "loss": 0.8267, + "epoch": 0.7897293546148508, + "grad_norm": 0.8249428868293762, + "learning_rate": 3.44258640956855e-05, + "loss": 0.7253, "step": 4552 }, { - "epoch": 1.6625890085813402, - "grad_norm": 1.1684764623641968, - "learning_rate": 4.225076576284541e-06, - "loss": 0.8047, + "epoch": 0.7899028452463567, + "grad_norm": 0.9682416915893555, + "learning_rate": 3.4421144193739856e-05, + "loss": 0.7886, "step": 4553 }, { - "epoch": 1.6629541719919665, - "grad_norm": 1.0321834087371826, - "learning_rate": 4.2162623847446806e-06, - "loss": 0.7912, + "epoch": 0.7900763358778626, + "grad_norm": 0.8650185465812683, + "learning_rate": 3.441642261820363e-05, + "loss": 0.7269, "step": 4554 }, { - "epoch": 1.6633193354025928, - "grad_norm": 1.1076546907424927, - "learning_rate": 4.207456313304461e-06, - "loss": 0.7775, + "epoch": 0.7902498265093685, + "grad_norm": 0.9776616096496582, + "learning_rate": 3.441169936962475e-05, + "loss": 0.7483, "step": 4555 }, { - "epoch": 1.663684498813219, - "grad_norm": 2.088075637817383, - "learning_rate": 4.198658366494252e-06, - "loss": 0.7816, + "epoch": 0.7904233171408744, + "grad_norm": 5.041821479797363, + "learning_rate": 3.440697444855137e-05, + "loss": 0.6702, "step": 4556 }, { - "epoch": 1.6640496622238452, - "grad_norm": 0.8155393004417419, - "learning_rate": 4.189868548840253e-06, - "loss": 0.8124, + "epoch": 0.7905968077723803, + "grad_norm": 0.7313758134841919, + "learning_rate": 3.440224785553183e-05, + "loss": 0.7725, "step": 4557 }, { - "epoch": 1.6644148256344713, - "grad_norm": 1.212587833404541, - "learning_rate": 4.181086864864457e-06, - "loss": 0.8132, + "epoch": 0.7907702984038862, + "grad_norm": 0.8756664991378784, + "learning_rate": 3.439751959111463e-05, + "loss": 0.864, "step": 4558 }, { - "epoch": 1.6647799890450976, - "grad_norm": 1.2642267942428589, - "learning_rate": 4.172313319084695e-06, - "loss": 0.7614, + "epoch": 0.7909437890353921, + "grad_norm": 0.867315948009491, + "learning_rate": 3.439278965584851e-05, + "loss": 0.7295, "step": 4559 }, { - "epoch": 1.665145152455724, - "grad_norm": 0.8478181958198547, - "learning_rate": 4.163547916014605e-06, - "loss": 0.7607, + "epoch": 0.7911172796668979, + "grad_norm": 0.9051001667976379, + "learning_rate": 3.438805805028238e-05, + "loss": 0.6572, "step": 4560 }, { - "epoch": 1.6655103158663502, - "grad_norm": 1.1103899478912354, - "learning_rate": 4.154790660163641e-06, - "loss": 0.8545, + "epoch": 0.7912907702984039, + "grad_norm": 0.9859288930892944, + "learning_rate": 3.438332477496534e-05, + "loss": 0.7917, "step": 4561 }, { - "epoch": 1.6658754792769765, - "grad_norm": 1.1273603439331055, - "learning_rate": 4.1460415560370545e-06, - "loss": 0.8341, + "epoch": 0.7914642609299097, + "grad_norm": 0.7328219413757324, + "learning_rate": 3.4378589830446714e-05, + "loss": 0.845, "step": 4562 }, { - "epoch": 1.6662406426876029, - "grad_norm": 1.2004300355911255, - "learning_rate": 4.137300608135901e-06, - "loss": 0.8091, + "epoch": 0.7916377515614157, + "grad_norm": 0.8915688395500183, + "learning_rate": 3.437385321727597e-05, + "loss": 0.7925, "step": 4563 }, { - "epoch": 1.666605806098229, - "grad_norm": 0.9749136567115784, - "learning_rate": 4.128567820957054e-06, - "loss": 0.8231, + "epoch": 0.7918112421929215, + "grad_norm": 0.8831157088279724, + "learning_rate": 3.436911493600282e-05, + "loss": 0.8818, "step": 4564 }, { - "epoch": 1.6669709695088553, - "grad_norm": 1.2499133348464966, - "learning_rate": 4.119843198993185e-06, - "loss": 0.7898, + "epoch": 0.7919847328244275, + "grad_norm": 1.2664246559143066, + "learning_rate": 3.436437498717713e-05, + "loss": 0.8933, "step": 4565 }, { - "epoch": 1.6673361329194814, - "grad_norm": 1.1007726192474365, - "learning_rate": 4.111126746732756e-06, - "loss": 0.8461, + "epoch": 0.7921582234559333, + "grad_norm": 0.9133292436599731, + "learning_rate": 3.4359633371349e-05, + "loss": 0.6823, "step": 4566 }, { - "epoch": 1.6677012963301077, - "grad_norm": 1.0110352039337158, - "learning_rate": 4.102418468660041e-06, - "loss": 0.8269, + "epoch": 0.7923317140874393, + "grad_norm": 0.8010714650154114, + "learning_rate": 3.435489008906867e-05, + "loss": 0.8701, "step": 4567 }, { - "epoch": 1.668066459740734, - "grad_norm": 1.254642128944397, - "learning_rate": 4.0937183692550885e-06, - "loss": 0.7782, + "epoch": 0.7925052047189451, + "grad_norm": 0.6782846450805664, + "learning_rate": 3.435014514088662e-05, + "loss": 0.8915, "step": 4568 }, { - "epoch": 1.6684316231513603, - "grad_norm": 0.7789933085441589, - "learning_rate": 4.0850264529937565e-06, - "loss": 0.8392, + "epoch": 0.7926786953504511, + "grad_norm": 1.013588786125183, + "learning_rate": 3.434539852735352e-05, + "loss": 0.6639, "step": 4569 }, { - "epoch": 1.6687967865619866, - "grad_norm": 1.1118007898330688, - "learning_rate": 4.076342724347686e-06, - "loss": 0.8188, + "epoch": 0.792852185981957, + "grad_norm": 0.9564565420150757, + "learning_rate": 3.434065024902019e-05, + "loss": 0.7063, "step": 4570 }, { - "epoch": 1.6691619499726127, - "grad_norm": 0.8889054656028748, - "learning_rate": 4.067667187784312e-06, - "loss": 0.8307, + "epoch": 0.7930256766134629, + "grad_norm": 1.17200767993927, + "learning_rate": 3.4335900306437694e-05, + "loss": 0.7876, "step": 4571 }, { - "epoch": 1.669527113383239, - "grad_norm": 0.9068534970283508, - "learning_rate": 4.0589998477668405e-06, - "loss": 0.803, + "epoch": 0.7931991672449688, + "grad_norm": 0.859592080116272, + "learning_rate": 3.4331148700157263e-05, + "loss": 0.6649, "step": 4572 }, { - "epoch": 1.669892276793865, - "grad_norm": 1.120553731918335, - "learning_rate": 4.050340708754274e-06, - "loss": 0.7959, + "epoch": 0.7933726578764747, + "grad_norm": 0.764409065246582, + "learning_rate": 3.4326395430730325e-05, + "loss": 0.8984, "step": 4573 }, { - "epoch": 1.6702574402044914, - "grad_norm": 1.3889740705490112, - "learning_rate": 4.041689775201394e-06, - "loss": 0.7977, + "epoch": 0.7935461485079806, + "grad_norm": 1.2467230558395386, + "learning_rate": 3.43216404987085e-05, + "loss": 0.7341, "step": 4574 }, { - "epoch": 1.6706226036151177, - "grad_norm": 1.093053936958313, - "learning_rate": 4.03304705155876e-06, - "loss": 0.7847, + "epoch": 0.7937196391394865, + "grad_norm": 3.483198404312134, + "learning_rate": 3.431688390464361e-05, + "loss": 0.7373, "step": 4575 }, { - "epoch": 1.670987767025744, - "grad_norm": 0.98985755443573, - "learning_rate": 4.024412542272706e-06, - "loss": 0.8014, + "epoch": 0.7938931297709924, + "grad_norm": 1.085963249206543, + "learning_rate": 3.4312125649087664e-05, + "loss": 0.6964, "step": 4576 }, { - "epoch": 1.6713529304363703, - "grad_norm": 1.3342225551605225, - "learning_rate": 4.015786251785334e-06, - "loss": 0.7933, + "epoch": 0.7940666204024983, + "grad_norm": 1.00095534324646, + "learning_rate": 3.4307365732592854e-05, + "loss": 0.7649, "step": 4577 }, { - "epoch": 1.6717180938469967, - "grad_norm": 1.0215411186218262, - "learning_rate": 4.007168184534529e-06, - "loss": 0.8043, + "epoch": 0.7942401110340042, + "grad_norm": 1.1112349033355713, + "learning_rate": 3.430260415571158e-05, + "loss": 0.6473, "step": 4578 }, { - "epoch": 1.6720832572576227, - "grad_norm": 1.0114665031433105, - "learning_rate": 3.99855834495394e-06, - "loss": 0.8676, + "epoch": 0.7944136016655101, + "grad_norm": 0.8980705142021179, + "learning_rate": 3.429784091899644e-05, + "loss": 0.6548, "step": 4579 }, { - "epoch": 1.672448420668249, - "grad_norm": 3.4189155101776123, - "learning_rate": 3.989956737472984e-06, - "loss": 0.8075, + "epoch": 0.794587092297016, + "grad_norm": 0.8265002369880676, + "learning_rate": 3.429307602300019e-05, + "loss": 0.7987, "step": 4580 }, { - "epoch": 1.6728135840788751, - "grad_norm": 1.0076063871383667, - "learning_rate": 3.98136336651685e-06, - "loss": 0.8355, + "epoch": 0.7947605829285218, + "grad_norm": 0.758318305015564, + "learning_rate": 3.428830946827581e-05, + "loss": 0.8342, "step": 4581 }, { - "epoch": 1.6731787474895015, - "grad_norm": 0.8224057555198669, - "learning_rate": 3.9727782365064695e-06, - "loss": 0.8184, + "epoch": 0.7949340735600278, + "grad_norm": 1.2179605960845947, + "learning_rate": 3.4283541255376466e-05, + "loss": 0.8108, "step": 4582 }, { - "epoch": 1.6735439109001278, - "grad_norm": 1.2690553665161133, - "learning_rate": 3.9642013518585564e-06, - "loss": 0.8148, + "epoch": 0.7951075641915336, + "grad_norm": 0.7644733190536499, + "learning_rate": 3.427877138485552e-05, + "loss": 0.7852, "step": 4583 }, { - "epoch": 1.673909074310754, - "grad_norm": 0.995733380317688, - "learning_rate": 3.955632716985569e-06, - "loss": 0.8396, + "epoch": 0.7952810548230396, + "grad_norm": 1.7125033140182495, + "learning_rate": 3.427399985726652e-05, + "loss": 0.5758, "step": 4584 }, { - "epoch": 1.6742742377213804, - "grad_norm": 1.1864736080169678, - "learning_rate": 3.947072336295734e-06, - "loss": 0.7963, + "epoch": 0.7954545454545454, + "grad_norm": 1.0241776704788208, + "learning_rate": 3.4269226673163204e-05, + "loss": 0.677, "step": 4585 }, { - "epoch": 1.6746394011320067, - "grad_norm": 1.2453827857971191, - "learning_rate": 3.938520214193014e-06, - "loss": 0.8013, + "epoch": 0.7956280360860514, + "grad_norm": 0.8177672624588013, + "learning_rate": 3.42644518330995e-05, + "loss": 0.7501, "step": 4586 }, { - "epoch": 1.6750045645426328, - "grad_norm": 1.2128833532333374, - "learning_rate": 3.929976355077134e-06, - "loss": 0.7532, + "epoch": 0.7958015267175572, + "grad_norm": 0.894249439239502, + "learning_rate": 3.425967533762954e-05, + "loss": 0.7925, "step": 4587 }, { - "epoch": 1.6753697279532591, - "grad_norm": 0.9610769152641296, - "learning_rate": 3.921440763343578e-06, - "loss": 0.7742, + "epoch": 0.7959750173490632, + "grad_norm": 1.4861716032028198, + "learning_rate": 3.425489718730765e-05, + "loss": 0.7656, "step": 4588 }, { - "epoch": 1.6757348913638852, - "grad_norm": 1.152714490890503, - "learning_rate": 3.91291344338355e-06, - "loss": 0.8148, + "epoch": 0.796148507980569, + "grad_norm": 0.7730518579483032, + "learning_rate": 3.425011738268832e-05, + "loss": 0.6986, "step": 4589 }, { - "epoch": 1.6761000547745115, - "grad_norm": 1.6050902605056763, - "learning_rate": 3.90439439958402e-06, - "loss": 0.8391, + "epoch": 0.796321998612075, + "grad_norm": 0.9385941624641418, + "learning_rate": 3.4245335924326274e-05, + "loss": 0.6711, "step": 4590 }, { - "epoch": 1.6764652181851378, - "grad_norm": 1.1968127489089966, - "learning_rate": 3.8958836363277016e-06, - "loss": 0.8062, + "epoch": 0.7964954892435808, + "grad_norm": 0.9974027872085571, + "learning_rate": 3.424055281277638e-05, + "loss": 0.6881, "step": 4591 }, { - "epoch": 1.6768303815957641, - "grad_norm": 1.3690496683120728, - "learning_rate": 3.887381157993029e-06, - "loss": 0.8113, + "epoch": 0.7966689798750868, + "grad_norm": 0.9831873774528503, + "learning_rate": 3.423576804859375e-05, + "loss": 0.6497, "step": 4592 }, { - "epoch": 1.6771955450063905, - "grad_norm": 1.206547737121582, - "learning_rate": 3.87888696895419e-06, - "loss": 0.8464, + "epoch": 0.7968424705065926, + "grad_norm": 0.9662128686904907, + "learning_rate": 3.423098163233365e-05, + "loss": 0.6743, "step": 4593 }, { - "epoch": 1.6775607084170168, - "grad_norm": 1.21383535861969, - "learning_rate": 3.870401073581107e-06, - "loss": 0.8627, + "epoch": 0.7970159611380986, + "grad_norm": 1.4226628541946411, + "learning_rate": 3.422619356455154e-05, + "loss": 0.8821, "step": 4594 }, { - "epoch": 1.6779258718276429, - "grad_norm": 1.2389349937438965, - "learning_rate": 3.861923476239435e-06, - "loss": 0.8232, + "epoch": 0.7971894517696044, + "grad_norm": 1.0037726163864136, + "learning_rate": 3.42214038458031e-05, + "loss": 0.6648, "step": 4595 }, { - "epoch": 1.6782910352382692, - "grad_norm": 1.2185567617416382, - "learning_rate": 3.85345418129055e-06, - "loss": 0.7893, + "epoch": 0.7973629424011104, + "grad_norm": 0.7889747619628906, + "learning_rate": 3.421661247664417e-05, + "loss": 0.6672, "step": 4596 }, { - "epoch": 1.6786561986488953, - "grad_norm": 1.2512537240982056, - "learning_rate": 3.844993193091568e-06, - "loss": 0.8041, + "epoch": 0.7975364330326162, + "grad_norm": 1.037923812866211, + "learning_rate": 3.4211819457630795e-05, + "loss": 0.7306, "step": 4597 }, { - "epoch": 1.6790213620595216, - "grad_norm": 1.0427098274230957, - "learning_rate": 3.8365405159953265e-06, - "loss": 0.8074, + "epoch": 0.7977099236641222, + "grad_norm": 0.9054591655731201, + "learning_rate": 3.420702478931921e-05, + "loss": 0.7195, "step": 4598 }, { - "epoch": 1.6793865254701479, - "grad_norm": 1.2413181066513062, - "learning_rate": 3.828096154350391e-06, - "loss": 0.8271, + "epoch": 0.797883414295628, + "grad_norm": 0.6746551394462585, + "learning_rate": 3.420222847226585e-05, + "loss": 0.8792, "step": 4599 }, { - "epoch": 1.6797516888807742, - "grad_norm": 1.3001432418823242, - "learning_rate": 3.819660112501053e-06, - "loss": 0.8219, + "epoch": 0.7980569049271339, + "grad_norm": 0.887339174747467, + "learning_rate": 3.419743050702732e-05, + "loss": 0.7229, "step": 4600 }, { - "epoch": 1.6801168522914005, - "grad_norm": 1.535656452178955, - "learning_rate": 3.811232394787303e-06, - "loss": 0.7982, + "epoch": 0.7982303955586398, + "grad_norm": 0.7682983875274658, + "learning_rate": 3.4192630894160435e-05, + "loss": 0.6387, "step": 4601 }, { - "epoch": 1.6804820157020268, - "grad_norm": 1.2796317338943481, - "learning_rate": 3.802813005544879e-06, - "loss": 0.7712, + "epoch": 0.7984038861901457, + "grad_norm": 0.8016814589500427, + "learning_rate": 3.4187829634222205e-05, + "loss": 0.7307, "step": 4602 }, { - "epoch": 1.680847179112653, - "grad_norm": 1.004874348640442, - "learning_rate": 3.7944019491052043e-06, - "loss": 0.8127, + "epoch": 0.7985773768216516, + "grad_norm": 1.030385971069336, + "learning_rate": 3.4183026727769806e-05, + "loss": 0.6422, "step": 4603 }, { - "epoch": 1.681212342523279, - "grad_norm": 1.1810832023620605, - "learning_rate": 3.7859992297954363e-06, - "loss": 0.8226, + "epoch": 0.7987508674531575, + "grad_norm": 1.161787986755371, + "learning_rate": 3.417822217536064e-05, + "loss": 0.6562, "step": 4604 }, { - "epoch": 1.6815775059339053, - "grad_norm": 1.0403006076812744, - "learning_rate": 3.7776048519384413e-06, - "loss": 0.8623, + "epoch": 0.7989243580846634, + "grad_norm": 0.8058196902275085, + "learning_rate": 3.417341597755226e-05, + "loss": 0.8638, "step": 4605 }, { - "epoch": 1.6819426693445316, - "grad_norm": 0.9892855286598206, - "learning_rate": 3.7692188198527822e-06, - "loss": 0.8218, + "epoch": 0.7990978487161693, + "grad_norm": 0.672274649143219, + "learning_rate": 3.4168608134902443e-05, + "loss": 0.8376, "step": 4606 }, { - "epoch": 1.682307832755158, - "grad_norm": 1.1075366735458374, - "learning_rate": 3.76084113785274e-06, - "loss": 0.8011, + "epoch": 0.7992713393476752, + "grad_norm": 0.7473246455192566, + "learning_rate": 3.416379864796914e-05, + "loss": 0.7827, "step": 4607 }, { - "epoch": 1.6826729961657843, - "grad_norm": 1.1454349756240845, - "learning_rate": 3.7524718102482947e-06, - "loss": 0.7922, + "epoch": 0.7994448299791811, + "grad_norm": 1.0134990215301514, + "learning_rate": 3.4158987517310506e-05, + "loss": 0.8745, "step": 4608 }, { - "epoch": 1.6830381595764106, - "grad_norm": 0.882426381111145, - "learning_rate": 3.74411084134513e-06, - "loss": 0.8203, + "epoch": 0.799618320610687, + "grad_norm": 0.9348793029785156, + "learning_rate": 3.4154174743484865e-05, + "loss": 0.7334, "step": 4609 }, { - "epoch": 1.6834033229870367, - "grad_norm": 1.3049910068511963, - "learning_rate": 3.7357582354446352e-06, - "loss": 0.8241, + "epoch": 0.7997918112421929, + "grad_norm": 0.8017755150794983, + "learning_rate": 3.414936032705075e-05, + "loss": 0.8569, "step": 4610 }, { - "epoch": 1.683768486397663, - "grad_norm": 1.3997046947479248, - "learning_rate": 3.7274139968438782e-06, - "loss": 0.826, + "epoch": 0.7999653018736989, + "grad_norm": 0.843885600566864, + "learning_rate": 3.414454426856689e-05, + "loss": 0.7449, "step": 4611 }, { - "epoch": 1.684133649808289, - "grad_norm": 0.7800149917602539, - "learning_rate": 3.7190781298356428e-06, - "loss": 0.8125, + "epoch": 0.8001387925052047, + "grad_norm": 0.7985478043556213, + "learning_rate": 3.413972656859218e-05, + "loss": 0.6992, "step": 4612 }, { - "epoch": 1.6844988132189154, - "grad_norm": 0.7373388409614563, - "learning_rate": 3.710750638708398e-06, - "loss": 0.8127, + "epoch": 0.8003122831367107, + "grad_norm": 1.096488356590271, + "learning_rate": 3.413490722768573e-05, + "loss": 0.762, "step": 4613 }, { - "epoch": 1.6848639766295417, - "grad_norm": 1.01949143409729, - "learning_rate": 3.7024315277463064e-06, - "loss": 0.816, + "epoch": 0.8004857737682165, + "grad_norm": 1.0406458377838135, + "learning_rate": 3.413008624640683e-05, + "loss": 0.7427, "step": 4614 }, { - "epoch": 1.685229140040168, - "grad_norm": 0.8814488649368286, - "learning_rate": 3.694120801229213e-06, - "loss": 0.7636, + "epoch": 0.8006592643997225, + "grad_norm": 0.902504563331604, + "learning_rate": 3.412526362531495e-05, + "loss": 0.6997, "step": 4615 }, { - "epoch": 1.6855943034507943, - "grad_norm": 1.0889003276824951, - "learning_rate": 3.685818463432649e-06, - "loss": 0.8116, + "epoch": 0.8008327550312283, + "grad_norm": 0.9546231031417847, + "learning_rate": 3.4120439364969766e-05, + "loss": 0.6514, "step": 4616 }, { - "epoch": 1.6859594668614206, - "grad_norm": 1.0459073781967163, - "learning_rate": 3.6775245186278375e-06, - "loss": 0.8342, + "epoch": 0.8010062456627343, + "grad_norm": 0.9010053873062134, + "learning_rate": 3.4115613465931145e-05, + "loss": 0.8118, "step": 4617 }, { - "epoch": 1.6863246302720467, - "grad_norm": 1.045575737953186, - "learning_rate": 3.669238971081681e-06, - "loss": 0.7888, + "epoch": 0.8011797362942401, + "grad_norm": 1.0061980485916138, + "learning_rate": 3.411078592875912e-05, + "loss": 0.6909, "step": 4618 }, { - "epoch": 1.686689793682673, - "grad_norm": 1.3779103755950928, - "learning_rate": 3.6609618250567657e-06, - "loss": 0.8088, + "epoch": 0.8013532269257461, + "grad_norm": 1.0768229961395264, + "learning_rate": 3.4105956754013966e-05, + "loss": 0.796, "step": 4619 }, { - "epoch": 1.6870549570932991, - "grad_norm": 1.0927506685256958, - "learning_rate": 3.652693084811343e-06, - "loss": 0.8318, + "epoch": 0.8015267175572519, + "grad_norm": 0.8910127282142639, + "learning_rate": 3.410112594225607e-05, + "loss": 0.7351, "step": 4620 }, { - "epoch": 1.6874201205039254, - "grad_norm": 1.0611379146575928, - "learning_rate": 3.6444327545993497e-06, - "loss": 0.8553, + "epoch": 0.8017002081887578, + "grad_norm": 0.8863735795021057, + "learning_rate": 3.4096293494046103e-05, + "loss": 0.8145, "step": 4621 }, { - "epoch": 1.6877852839145517, - "grad_norm": 1.1649178266525269, - "learning_rate": 3.636180838670398e-06, - "loss": 0.777, + "epoch": 0.8018736988202637, + "grad_norm": 0.7098711133003235, + "learning_rate": 3.4091459409944836e-05, + "loss": 0.791, "step": 4622 }, { - "epoch": 1.688150447325178, - "grad_norm": 1.0912953615188599, - "learning_rate": 3.627937341269765e-06, - "loss": 0.7728, + "epoch": 0.8020471894517696, + "grad_norm": 0.9198716282844543, + "learning_rate": 3.408662369051329e-05, + "loss": 0.6877, "step": 4623 }, { - "epoch": 1.6885156107358044, - "grad_norm": 0.9403161406517029, - "learning_rate": 3.619702266638405e-06, - "loss": 0.8426, + "epoch": 0.8022206800832755, + "grad_norm": 0.9134693145751953, + "learning_rate": 3.408178633631265e-05, + "loss": 0.7205, "step": 4624 }, { - "epoch": 1.6888807741464307, - "grad_norm": 1.354013442993164, - "learning_rate": 3.611475619012923e-06, - "loss": 0.8017, + "epoch": 0.8023941707147814, + "grad_norm": 0.940019965171814, + "learning_rate": 3.407694734790429e-05, + "loss": 0.7192, "step": 4625 }, { - "epoch": 1.6892459375570568, - "grad_norm": 0.9787935018539429, - "learning_rate": 3.603257402625604e-06, - "loss": 0.8344, + "epoch": 0.8025676613462873, + "grad_norm": 0.9711433053016663, + "learning_rate": 3.407210672584979e-05, + "loss": 0.7158, "step": 4626 }, { - "epoch": 1.689611100967683, - "grad_norm": 1.0935029983520508, - "learning_rate": 3.5950476217043928e-06, - "loss": 0.803, + "epoch": 0.8027411519777932, + "grad_norm": 1.2038646936416626, + "learning_rate": 3.406726447071091e-05, + "loss": 0.6691, "step": 4627 }, { - "epoch": 1.6899762643783092, - "grad_norm": 1.0885210037231445, - "learning_rate": 3.5868462804728933e-06, - "loss": 0.7714, + "epoch": 0.8029146426092991, + "grad_norm": 1.047505259513855, + "learning_rate": 3.40624205830496e-05, + "loss": 0.7488, "step": 4628 }, { - "epoch": 1.6903414277889355, - "grad_norm": 1.130252718925476, - "learning_rate": 3.5786533831503654e-06, - "loss": 0.8315, + "epoch": 0.803088133240805, + "grad_norm": 0.7403028011322021, + "learning_rate": 3.405757506342799e-05, + "loss": 0.7975, "step": 4629 }, { - "epoch": 1.6907065911995618, - "grad_norm": 1.0543928146362305, - "learning_rate": 3.5704689339517184e-06, - "loss": 0.8442, + "epoch": 0.8032616238723109, + "grad_norm": 1.8478658199310303, + "learning_rate": 3.4052727912408414e-05, + "loss": 0.875, "step": 4630 }, { - "epoch": 1.691071754610188, - "grad_norm": 0.8478699326515198, - "learning_rate": 3.56229293708753e-06, - "loss": 0.8475, + "epoch": 0.8034351145038168, + "grad_norm": 1.019914150238037, + "learning_rate": 3.404787913055339e-05, + "loss": 0.7668, "step": 4631 }, { - "epoch": 1.6914369180208144, - "grad_norm": 1.006618618965149, - "learning_rate": 3.554125396764021e-06, - "loss": 0.8269, + "epoch": 0.8036086051353227, + "grad_norm": 0.7999995946884155, + "learning_rate": 3.404302871842563e-05, + "loss": 0.7893, "step": 4632 }, { - "epoch": 1.6918020814314407, - "grad_norm": 1.0763574838638306, - "learning_rate": 3.5459663171830626e-06, - "loss": 0.848, + "epoch": 0.8037820957668286, + "grad_norm": 0.8527405858039856, + "learning_rate": 3.4038176676588014e-05, + "loss": 0.7288, "step": 4633 }, { - "epoch": 1.6921672448420668, - "grad_norm": 0.775421679019928, - "learning_rate": 3.53781570254218e-06, - "loss": 0.8457, + "epoch": 0.8039555863983345, + "grad_norm": 0.7944797873497009, + "learning_rate": 3.403332300560364e-05, + "loss": 0.8689, "step": 4634 }, { - "epoch": 1.6925324082526931, - "grad_norm": 1.0747804641723633, - "learning_rate": 3.5296735570345276e-06, - "loss": 0.8505, + "epoch": 0.8041290770298404, + "grad_norm": 1.210817575454712, + "learning_rate": 3.402846770603578e-05, + "loss": 0.7788, "step": 4635 }, { - "epoch": 1.6928975716633192, - "grad_norm": 1.0088167190551758, - "learning_rate": 3.5215398848489167e-06, - "loss": 0.8392, + "epoch": 0.8043025676613463, + "grad_norm": 0.699792742729187, + "learning_rate": 3.4023610778447895e-05, + "loss": 0.7539, "step": 4636 }, { - "epoch": 1.6932627350739455, - "grad_norm": 0.8691213130950928, - "learning_rate": 3.513414690169794e-06, - "loss": 0.8446, + "epoch": 0.8044760582928522, + "grad_norm": 0.9307442307472229, + "learning_rate": 3.4018752223403634e-05, + "loss": 0.7351, "step": 4637 }, { - "epoch": 1.6936278984845718, - "grad_norm": 0.9323434233665466, - "learning_rate": 3.5052979771772555e-06, - "loss": 0.8447, + "epoch": 0.8046495489243581, + "grad_norm": 1.0085394382476807, + "learning_rate": 3.401389204146685e-05, + "loss": 0.7854, "step": 4638 }, { - "epoch": 1.6939930618951982, - "grad_norm": 1.0773636102676392, - "learning_rate": 3.49718975004701e-06, - "loss": 0.7894, + "epoch": 0.804823039555864, + "grad_norm": 0.9513494372367859, + "learning_rate": 3.400903023320156e-05, + "loss": 0.7258, "step": 4639 }, { - "epoch": 1.6943582253058245, - "grad_norm": 1.230257511138916, - "learning_rate": 3.489090012950422e-06, - "loss": 0.7935, + "epoch": 0.8049965301873698, + "grad_norm": 0.7112436294555664, + "learning_rate": 3.4004166799172004e-05, + "loss": 0.7581, "step": 4640 }, { - "epoch": 1.6947233887164506, - "grad_norm": 0.9413135051727295, - "learning_rate": 3.48099877005448e-06, - "loss": 0.7899, + "epoch": 0.8051700208188758, + "grad_norm": 1.7374703884124756, + "learning_rate": 3.399930173994255e-05, + "loss": 0.9436, "step": 4641 }, { - "epoch": 1.6950885521270769, - "grad_norm": 2.025879144668579, - "learning_rate": 3.4729160255218107e-06, - "loss": 0.8077, + "epoch": 0.8053435114503816, + "grad_norm": 1.2099913358688354, + "learning_rate": 3.3994435056077827e-05, + "loss": 0.6753, "step": 4642 }, { - "epoch": 1.695453715537703, - "grad_norm": 0.9407145977020264, - "learning_rate": 3.4648417835106507e-06, - "loss": 0.7819, + "epoch": 0.8055170020818876, + "grad_norm": 0.7770482301712036, + "learning_rate": 3.398956674814261e-05, + "loss": 0.8691, "step": 4643 }, { - "epoch": 1.6958188789483293, - "grad_norm": 1.1211671829223633, - "learning_rate": 3.4567760481748835e-06, - "loss": 0.7851, + "epoch": 0.8056904927133934, + "grad_norm": 0.9426470398902893, + "learning_rate": 3.3984696816701865e-05, + "loss": 0.7246, "step": 4644 }, { - "epoch": 1.6961840423589556, - "grad_norm": 0.9375742077827454, - "learning_rate": 3.4487188236639966e-06, - "loss": 0.8481, + "epoch": 0.8058639833448994, + "grad_norm": 0.8202270865440369, + "learning_rate": 3.397982526232077e-05, + "loss": 0.7051, "step": 4645 }, { - "epoch": 1.696549205769582, - "grad_norm": 1.1037667989730835, - "learning_rate": 3.4406701141231126e-06, - "loss": 0.8043, + "epoch": 0.8060374739764052, + "grad_norm": 0.9414758682250977, + "learning_rate": 3.397495208556465e-05, + "loss": 0.718, "step": 4646 }, { - "epoch": 1.6969143691802082, - "grad_norm": 0.9661404490470886, - "learning_rate": 3.4326299236929693e-06, - "loss": 0.8167, + "epoch": 0.8062109646079112, + "grad_norm": 0.9688811302185059, + "learning_rate": 3.397007728699907e-05, + "loss": 0.7981, "step": 4647 }, { - "epoch": 1.6972795325908345, - "grad_norm": 1.356053113937378, - "learning_rate": 3.42459825650993e-06, - "loss": 0.8358, + "epoch": 0.806384455239417, + "grad_norm": 1.0462385416030884, + "learning_rate": 3.3965200867189734e-05, + "loss": 0.6393, "step": 4648 }, { - "epoch": 1.6976446960014606, - "grad_norm": 1.1572924852371216, - "learning_rate": 3.416575116705951e-06, - "loss": 0.7892, + "epoch": 0.806557945870923, + "grad_norm": 0.962817370891571, + "learning_rate": 3.3960322826702565e-05, + "loss": 0.7722, "step": 4649 }, { - "epoch": 1.698009859412087, - "grad_norm": 1.2224515676498413, - "learning_rate": 3.408560508408625e-06, - "loss": 0.8169, + "epoch": 0.8067314365024288, + "grad_norm": 0.7542101144790649, + "learning_rate": 3.395544316610367e-05, + "loss": 0.7244, "step": 4650 }, { - "epoch": 1.698375022822713, - "grad_norm": 1.1562143564224243, - "learning_rate": 3.4005544357411433e-06, - "loss": 0.8259, + "epoch": 0.8069049271339348, + "grad_norm": 1.7485052347183228, + "learning_rate": 3.395056188595933e-05, + "loss": 0.7202, "step": 4651 }, { - "epoch": 1.6987401862333393, - "grad_norm": 0.9855158925056458, - "learning_rate": 3.392556902822313e-06, - "loss": 0.809, + "epoch": 0.8070784177654406, + "grad_norm": 0.8214434385299683, + "learning_rate": 3.394567898683602e-05, + "loss": 0.7004, "step": 4652 }, { - "epoch": 1.6991053496439656, - "grad_norm": 1.2144527435302734, - "learning_rate": 3.3845679137665434e-06, - "loss": 0.8229, + "epoch": 0.8072519083969466, + "grad_norm": 0.9503586888313293, + "learning_rate": 3.394079446930043e-05, + "loss": 0.7332, "step": 4653 }, { - "epoch": 1.699470513054592, - "grad_norm": 1.1592378616333008, - "learning_rate": 3.376587472683841e-06, - "loss": 0.814, + "epoch": 0.8074253990284525, + "grad_norm": 0.9069979190826416, + "learning_rate": 3.3935908333919385e-05, + "loss": 0.875, "step": 4654 }, { - "epoch": 1.6998356764652183, - "grad_norm": 1.1051242351531982, - "learning_rate": 3.368615583679833e-06, - "loss": 0.8008, + "epoch": 0.8075988896599584, + "grad_norm": 0.869105339050293, + "learning_rate": 3.393102058125995e-05, + "loss": 0.6931, "step": 4655 }, { - "epoch": 1.7002008398758446, - "grad_norm": 1.4122095108032227, - "learning_rate": 3.360652250855727e-06, - "loss": 0.8445, + "epoch": 0.8077723802914643, + "grad_norm": 2.693324327468872, + "learning_rate": 3.3926131211889336e-05, + "loss": 0.717, "step": 4656 }, { - "epoch": 1.7005660032864707, - "grad_norm": 1.0100466012954712, - "learning_rate": 3.352697478308342e-06, - "loss": 0.8275, + "epoch": 0.8079458709229702, + "grad_norm": 0.7997711300849915, + "learning_rate": 3.392124022637497e-05, + "loss": 0.8499, "step": 4657 }, { - "epoch": 1.700931166697097, - "grad_norm": 1.226925015449524, - "learning_rate": 3.3447512701300956e-06, - "loss": 0.8073, + "epoch": 0.8081193615544761, + "grad_norm": 0.8881179094314575, + "learning_rate": 3.391634762528445e-05, + "loss": 0.7676, "step": 4658 }, { - "epoch": 1.701296330107723, - "grad_norm": 1.5681209564208984, - "learning_rate": 3.3368136304089815e-06, - "loss": 0.7831, + "epoch": 0.8082928521859819, + "grad_norm": 1.1024980545043945, + "learning_rate": 3.391145340918557e-05, + "loss": 0.6555, "step": 4659 }, { - "epoch": 1.7016614935183494, - "grad_norm": 0.8373473882675171, - "learning_rate": 3.328884563228605e-06, - "loss": 0.8425, + "epoch": 0.8084663428174879, + "grad_norm": 0.9129153490066528, + "learning_rate": 3.3906557578646317e-05, + "loss": 0.7578, "step": 4660 }, { - "epoch": 1.7020266569289757, - "grad_norm": 1.1384503841400146, - "learning_rate": 3.320964072668147e-06, - "loss": 0.8182, + "epoch": 0.8086398334489937, + "grad_norm": 1.1250828504562378, + "learning_rate": 3.390166013423485e-05, + "loss": 0.6213, "step": 4661 }, { - "epoch": 1.702391820339602, - "grad_norm": 1.1391408443450928, - "learning_rate": 3.3130521628023926e-06, - "loss": 0.8654, + "epoch": 0.8088133240804997, + "grad_norm": 0.8600782752037048, + "learning_rate": 3.389676107651953e-05, + "loss": 0.8025, "step": 4662 }, { - "epoch": 1.7027569837502283, - "grad_norm": 1.2088404893875122, - "learning_rate": 3.3051488377016884e-06, - "loss": 0.7912, + "epoch": 0.8089868147120055, + "grad_norm": 0.8186443448066711, + "learning_rate": 3.38918604060689e-05, + "loss": 0.7942, "step": 4663 }, { - "epoch": 1.7031221471608546, - "grad_norm": 0.8696324229240417, - "learning_rate": 3.297254101431986e-06, - "loss": 0.8053, + "epoch": 0.8091603053435115, + "grad_norm": 0.8229193091392517, + "learning_rate": 3.388695812345168e-05, + "loss": 0.6981, "step": 4664 }, { - "epoch": 1.7034873105714807, - "grad_norm": 0.7911489009857178, - "learning_rate": 3.2893679580548075e-06, - "loss": 0.8403, + "epoch": 0.8093337959750173, + "grad_norm": 0.9411155581474304, + "learning_rate": 3.388205422923678e-05, + "loss": 0.822, "step": 4665 }, { - "epoch": 1.703852473982107, - "grad_norm": 1.057161569595337, - "learning_rate": 3.2814904116272595e-06, - "loss": 0.8344, + "epoch": 0.8095072866065233, + "grad_norm": 0.7442103028297424, + "learning_rate": 3.3877148723993306e-05, + "loss": 0.7002, "step": 4666 }, { - "epoch": 1.7042176373927331, - "grad_norm": 1.181178331375122, - "learning_rate": 3.2736214662020284e-06, - "loss": 0.7872, + "epoch": 0.8096807772380291, + "grad_norm": 0.8391214609146118, + "learning_rate": 3.387224160829057e-05, + "loss": 0.7415, "step": 4667 }, { - "epoch": 1.7045828008033594, - "grad_norm": 1.0946240425109863, - "learning_rate": 3.2657611258273602e-06, - "loss": 0.8333, + "epoch": 0.8098542678695351, + "grad_norm": 1.0253221988677979, + "learning_rate": 3.3867332882698016e-05, + "loss": 0.864, "step": 4668 }, { - "epoch": 1.7049479642139858, - "grad_norm": 0.8447458148002625, - "learning_rate": 3.257909394547092e-06, - "loss": 0.8202, + "epoch": 0.8100277585010409, + "grad_norm": 0.959554135799408, + "learning_rate": 3.386242254778533e-05, + "loss": 0.7695, "step": 4669 }, { - "epoch": 1.705313127624612, - "grad_norm": 1.0043836832046509, - "learning_rate": 3.250066276400621e-06, - "loss": 0.8207, + "epoch": 0.8102012491325469, + "grad_norm": 0.8422642350196838, + "learning_rate": 3.385751060412235e-05, + "loss": 0.7991, "step": 4670 }, { - "epoch": 1.7056782910352384, - "grad_norm": 1.1556264162063599, - "learning_rate": 3.242231775422915e-06, - "loss": 0.8235, + "epoch": 0.8103747397640527, + "grad_norm": 0.8814015984535217, + "learning_rate": 3.3852597052279124e-05, + "loss": 0.6599, "step": 4671 }, { - "epoch": 1.7060434544458645, - "grad_norm": 1.015289545059204, - "learning_rate": 3.234405895644519e-06, - "loss": 0.8192, + "epoch": 0.8105482303955587, + "grad_norm": 1.1542834043502808, + "learning_rate": 3.3847681892825865e-05, + "loss": 0.697, "step": 4672 }, { - "epoch": 1.7064086178564908, - "grad_norm": 1.1183212995529175, - "learning_rate": 3.2265886410915214e-06, - "loss": 0.8135, + "epoch": 0.8107217210270645, + "grad_norm": 0.9115173816680908, + "learning_rate": 3.3842765126332984e-05, + "loss": 0.8071, "step": 4673 }, { - "epoch": 1.7067737812671169, - "grad_norm": 1.0853688716888428, - "learning_rate": 3.2187800157855964e-06, - "loss": 0.8036, + "epoch": 0.8108952116585705, + "grad_norm": 0.987003743648529, + "learning_rate": 3.383784675337108e-05, + "loss": 0.6412, "step": 4674 }, { - "epoch": 1.7071389446777432, - "grad_norm": 0.896252453327179, - "learning_rate": 3.2109800237439616e-06, - "loss": 0.8182, + "epoch": 0.8110687022900763, + "grad_norm": 0.8712530732154846, + "learning_rate": 3.383292677451094e-05, + "loss": 0.733, "step": 4675 }, { - "epoch": 1.7075041080883695, - "grad_norm": 1.0309017896652222, - "learning_rate": 3.2031886689794044e-06, - "loss": 0.8245, + "epoch": 0.8112421929215823, + "grad_norm": 0.8238782286643982, + "learning_rate": 3.382800519032353e-05, + "loss": 0.6716, "step": 4676 }, { - "epoch": 1.7078692714989958, - "grad_norm": 1.6589974164962769, - "learning_rate": 3.1954059555002683e-06, - "loss": 0.8345, + "epoch": 0.8114156835530881, + "grad_norm": 0.6596318483352661, + "learning_rate": 3.382308200138e-05, + "loss": 0.8145, "step": 4677 }, { - "epoch": 1.7082344349096221, - "grad_norm": 1.4586600065231323, - "learning_rate": 3.1876318873104383e-06, - "loss": 0.7916, + "epoch": 0.8115891741845941, + "grad_norm": 0.7977187633514404, + "learning_rate": 3.38181572082517e-05, + "loss": 0.7847, "step": 4678 }, { - "epoch": 1.7085995983202484, - "grad_norm": 0.8651988506317139, - "learning_rate": 3.1798664684093606e-06, - "loss": 0.8421, + "epoch": 0.8117626648160999, + "grad_norm": 0.9770283102989197, + "learning_rate": 3.381323081151015e-05, + "loss": 0.7175, "step": 4679 }, { - "epoch": 1.7089647617308745, - "grad_norm": 1.1348179578781128, - "learning_rate": 3.1721097027920367e-06, - "loss": 0.7802, + "epoch": 0.8119361554476058, + "grad_norm": 0.9186651110649109, + "learning_rate": 3.3808302811727074e-05, + "loss": 0.6589, "step": 4680 }, { - "epoch": 1.7093299251415008, - "grad_norm": 1.1178635358810425, - "learning_rate": 3.1643615944490147e-06, - "loss": 0.7627, + "epoch": 0.8121096460791117, + "grad_norm": 1.1531708240509033, + "learning_rate": 3.380337320947437e-05, + "loss": 0.6353, "step": 4681 }, { - "epoch": 1.709695088552127, - "grad_norm": 0.9017081260681152, - "learning_rate": 3.1566221473663794e-06, - "loss": 0.8245, + "epoch": 0.8122831367106176, + "grad_norm": 0.765068531036377, + "learning_rate": 3.379844200532411e-05, + "loss": 0.8472, "step": 4682 }, { - "epoch": 1.7100602519627532, - "grad_norm": 0.9653119444847107, - "learning_rate": 3.1488913655257635e-06, - "loss": 0.8254, + "epoch": 0.8124566273421235, + "grad_norm": 1.0348665714263916, + "learning_rate": 3.379350919984858e-05, + "loss": 0.7, "step": 4683 }, { - "epoch": 1.7104254153733796, - "grad_norm": 0.9762559533119202, - "learning_rate": 3.1411692529043457e-06, - "loss": 0.8291, + "epoch": 0.8126301179736294, + "grad_norm": 0.8701842427253723, + "learning_rate": 3.378857479362024e-05, + "loss": 0.6901, "step": 4684 }, { - "epoch": 1.7107905787840059, - "grad_norm": 1.2863898277282715, - "learning_rate": 3.133455813474844e-06, - "loss": 0.8337, + "epoch": 0.8128036086051353, + "grad_norm": 1.000014066696167, + "learning_rate": 3.378363878721173e-05, + "loss": 0.6649, "step": 4685 }, { - "epoch": 1.7111557421946322, - "grad_norm": 1.2191870212554932, - "learning_rate": 3.1257510512055145e-06, - "loss": 0.8362, + "epoch": 0.8129770992366412, + "grad_norm": 0.8522836565971375, + "learning_rate": 3.377870118119587e-05, + "loss": 0.8325, "step": 4686 }, { - "epoch": 1.7115209056052585, - "grad_norm": 1.1288683414459229, - "learning_rate": 3.1180549700601535e-06, - "loss": 0.7606, + "epoch": 0.8131505898681471, + "grad_norm": 1.6286545991897583, + "learning_rate": 3.377376197614568e-05, + "loss": 0.8066, "step": 4687 }, { - "epoch": 1.7118860690158846, - "grad_norm": 1.3574138879776, - "learning_rate": 3.1103675739980745e-06, - "loss": 0.8111, + "epoch": 0.813324080499653, + "grad_norm": 0.9966020584106445, + "learning_rate": 3.376882117263437e-05, + "loss": 0.8049, "step": 4688 }, { - "epoch": 1.712251232426511, - "grad_norm": 0.9972654581069946, - "learning_rate": 3.1026888669741396e-06, - "loss": 0.8002, + "epoch": 0.813497571131159, + "grad_norm": 1.8508763313293457, + "learning_rate": 3.376387877123533e-05, + "loss": 0.6904, "step": 4689 }, { - "epoch": 1.712616395837137, - "grad_norm": 1.700941801071167, - "learning_rate": 3.095018852938736e-06, - "loss": 0.7924, + "epoch": 0.8136710617626648, + "grad_norm": 0.9233130812644958, + "learning_rate": 3.37589347725221e-05, + "loss": 0.7734, "step": 4690 }, { - "epoch": 1.7129815592477633, - "grad_norm": 1.406071424484253, - "learning_rate": 3.0873575358377826e-06, - "loss": 0.8461, + "epoch": 0.8138445523941708, + "grad_norm": 0.7623953819274902, + "learning_rate": 3.375398917706847e-05, + "loss": 0.8889, "step": 4691 }, { - "epoch": 1.7133467226583896, - "grad_norm": 1.1977708339691162, - "learning_rate": 3.0797049196127115e-06, - "loss": 0.7822, + "epoch": 0.8140180430256766, + "grad_norm": 1.1773940324783325, + "learning_rate": 3.374904198544836e-05, + "loss": 0.7681, "step": 4692 }, { - "epoch": 1.713711886069016, - "grad_norm": 1.1031547784805298, - "learning_rate": 3.0720610082004887e-06, - "loss": 0.832, + "epoch": 0.8141915336571826, + "grad_norm": 2.755359172821045, + "learning_rate": 3.374409319823592e-05, + "loss": 0.8284, "step": 4693 }, { - "epoch": 1.7140770494796422, - "grad_norm": 1.792107105255127, - "learning_rate": 3.0644258055336017e-06, - "loss": 0.8165, + "epoch": 0.8143650242886884, + "grad_norm": 0.8596736788749695, + "learning_rate": 3.373914281600544e-05, + "loss": 0.7316, "step": 4694 }, { - "epoch": 1.7144422128902685, - "grad_norm": 1.2078214883804321, - "learning_rate": 3.05679931554006e-06, - "loss": 0.7912, + "epoch": 0.8145385149201944, + "grad_norm": 1.0579583644866943, + "learning_rate": 3.3734190839331425e-05, + "loss": 0.6598, "step": 4695 }, { - "epoch": 1.7148073763008946, - "grad_norm": 1.3812817335128784, - "learning_rate": 3.0491815421433825e-06, - "loss": 0.8143, + "epoch": 0.8147120055517002, + "grad_norm": 1.0207267999649048, + "learning_rate": 3.372923726878856e-05, + "loss": 0.8992, "step": 4696 }, { - "epoch": 1.715172539711521, - "grad_norm": 0.9092957973480225, - "learning_rate": 3.041572489262603e-06, - "loss": 0.8085, + "epoch": 0.8148854961832062, + "grad_norm": 0.7435789704322815, + "learning_rate": 3.372428210495172e-05, + "loss": 0.8606, "step": 4697 }, { - "epoch": 1.715537703122147, - "grad_norm": 1.9804608821868896, - "learning_rate": 3.0339721608122774e-06, - "loss": 0.8163, + "epoch": 0.815058986814712, + "grad_norm": 0.8075913786888123, + "learning_rate": 3.371932534839594e-05, + "loss": 0.8464, "step": 4698 }, { - "epoch": 1.7159028665327734, - "grad_norm": 1.244433045387268, - "learning_rate": 3.026380560702471e-06, - "loss": 0.8184, + "epoch": 0.8152324774462179, + "grad_norm": 1.151850700378418, + "learning_rate": 3.371436699969648e-05, + "loss": 0.6091, "step": 4699 }, { - "epoch": 1.7162680299433997, - "grad_norm": 0.992311418056488, - "learning_rate": 3.0187976928387573e-06, - "loss": 0.8514, + "epoch": 0.8154059680777238, + "grad_norm": 0.880418062210083, + "learning_rate": 3.370940705942874e-05, + "loss": 0.6633, "step": 4700 }, { - "epoch": 1.716633193354026, - "grad_norm": 1.2603802680969238, - "learning_rate": 3.011223561122223e-06, - "loss": 0.8202, + "epoch": 0.8155794587092297, + "grad_norm": 0.9823868274688721, + "learning_rate": 3.3704445528168335e-05, + "loss": 0.7769, "step": 4701 }, { - "epoch": 1.7169983567646523, - "grad_norm": 0.9738311767578125, - "learning_rate": 3.0036581694494436e-06, - "loss": 0.7856, + "epoch": 0.8157529493407356, + "grad_norm": 1.3908098936080933, + "learning_rate": 3.369948240649106e-05, + "loss": 0.6536, "step": 4702 }, { - "epoch": 1.7173635201752786, - "grad_norm": 0.9094842076301575, - "learning_rate": 2.9961015217125155e-06, - "loss": 0.781, + "epoch": 0.8159264399722415, + "grad_norm": 0.8435240387916565, + "learning_rate": 3.369451769497289e-05, + "loss": 0.7102, "step": 4703 }, { - "epoch": 1.7177286835859047, - "grad_norm": 1.1991461515426636, - "learning_rate": 2.988553621799033e-06, - "loss": 0.802, + "epoch": 0.8160999306037474, + "grad_norm": 1.0886040925979614, + "learning_rate": 3.368955139418998e-05, + "loss": 0.7061, "step": 4704 }, { - "epoch": 1.7180938469965308, - "grad_norm": 1.7127655744552612, - "learning_rate": 2.9810144735920877e-06, - "loss": 0.8394, + "epoch": 0.8162734212352533, + "grad_norm": 0.9108531475067139, + "learning_rate": 3.368458350471868e-05, + "loss": 0.8323, "step": 4705 }, { - "epoch": 1.718459010407157, - "grad_norm": 1.362473964691162, - "learning_rate": 2.9734840809702613e-06, - "loss": 0.8184, + "epoch": 0.8164469118667592, + "grad_norm": 0.8659676313400269, + "learning_rate": 3.367961402713553e-05, + "loss": 0.7874, "step": 4706 }, { - "epoch": 1.7188241738177834, - "grad_norm": 1.0843855142593384, - "learning_rate": 2.965962447807644e-06, - "loss": 0.8016, + "epoch": 0.8166204024982651, + "grad_norm": 0.9554901123046875, + "learning_rate": 3.3674642962017215e-05, + "loss": 0.7637, "step": 4707 }, { - "epoch": 1.7191893372284097, - "grad_norm": 1.504137635231018, - "learning_rate": 2.9584495779738144e-06, - "loss": 0.8129, + "epoch": 0.816793893129771, + "grad_norm": 1.0279065370559692, + "learning_rate": 3.3669670309940663e-05, + "loss": 0.6146, "step": 4708 }, { - "epoch": 1.719554500639036, - "grad_norm": 1.7557711601257324, - "learning_rate": 2.950945475333846e-06, - "loss": 0.8052, + "epoch": 0.8169673837612769, + "grad_norm": 0.7166879177093506, + "learning_rate": 3.366469607148293e-05, + "loss": 0.7736, "step": 4709 }, { - "epoch": 1.7199196640496623, - "grad_norm": 0.9953482747077942, - "learning_rate": 2.94345014374829e-06, - "loss": 0.8322, + "epoch": 0.8171408743927828, + "grad_norm": 1.196611762046814, + "learning_rate": 3.365972024722131e-05, + "loss": 0.8458, "step": 4710 }, { - "epoch": 1.7202848274602884, - "grad_norm": 1.2738533020019531, - "learning_rate": 2.9359635870732028e-06, - "loss": 0.7877, + "epoch": 0.8173143650242887, + "grad_norm": 0.9180603623390198, + "learning_rate": 3.365474283773323e-05, + "loss": 0.6957, "step": 4711 }, { - "epoch": 1.7206499908709147, - "grad_norm": 0.9172689318656921, - "learning_rate": 2.928485809160109e-06, - "loss": 0.8024, + "epoch": 0.8174878556557946, + "grad_norm": 1.1366982460021973, + "learning_rate": 3.3649763843596334e-05, + "loss": 0.7113, "step": 4712 }, { - "epoch": 1.7210151542815408, - "grad_norm": 1.023495078086853, - "learning_rate": 2.921016813856028e-06, - "loss": 0.8344, + "epoch": 0.8176613462873005, + "grad_norm": 0.9072386622428894, + "learning_rate": 3.364478326538844e-05, + "loss": 0.6818, "step": 4713 }, { - "epoch": 1.7213803176921672, - "grad_norm": 0.9917659759521484, - "learning_rate": 2.91355660500346e-06, - "loss": 0.7993, + "epoch": 0.8178348369188064, + "grad_norm": 0.7893767356872559, + "learning_rate": 3.363980110368755e-05, + "loss": 0.6953, "step": 4714 }, { - "epoch": 1.7217454811027935, - "grad_norm": 0.9108633399009705, - "learning_rate": 2.906105186440389e-06, - "loss": 0.7784, + "epoch": 0.8180083275503123, + "grad_norm": 0.8182500004768372, + "learning_rate": 3.363481735907185e-05, + "loss": 0.8154, "step": 4715 }, { - "epoch": 1.7221106445134198, - "grad_norm": 1.0507220029830933, - "learning_rate": 2.8986625620002586e-06, - "loss": 0.7892, + "epoch": 0.8181818181818182, + "grad_norm": 1.0506726503372192, + "learning_rate": 3.36298320321197e-05, + "loss": 0.726, "step": 4716 }, { - "epoch": 1.722475807924046, - "grad_norm": 1.204466462135315, - "learning_rate": 2.891228735512004e-06, - "loss": 0.8274, + "epoch": 0.8183553088133241, + "grad_norm": 1.034210443496704, + "learning_rate": 3.3624845123409665e-05, + "loss": 0.6218, "step": 4717 }, { - "epoch": 1.7228409713346724, - "grad_norm": 1.201046347618103, - "learning_rate": 2.883803710800035e-06, - "loss": 0.8274, + "epoch": 0.8185287994448299, + "grad_norm": 0.9111889600753784, + "learning_rate": 3.361985663352048e-05, + "loss": 0.6836, "step": 4718 }, { - "epoch": 1.7232061347452985, - "grad_norm": 0.9916613698005676, - "learning_rate": 2.876387491684225e-06, - "loss": 0.8097, + "epoch": 0.8187022900763359, + "grad_norm": 0.9074407815933228, + "learning_rate": 3.361486656303106e-05, + "loss": 0.6993, "step": 4719 }, { - "epoch": 1.7235712981559248, - "grad_norm": 1.5393697023391724, - "learning_rate": 2.8689800819799286e-06, - "loss": 0.8267, + "epoch": 0.8188757807078417, + "grad_norm": 0.834688663482666, + "learning_rate": 3.360987491252051e-05, + "loss": 0.8378, "step": 4720 }, { - "epoch": 1.723936461566551, - "grad_norm": 0.9582063555717468, - "learning_rate": 2.8615814854979507e-06, - "loss": 0.7683, + "epoch": 0.8190492713393477, + "grad_norm": 0.9987834095954895, + "learning_rate": 3.3604881682568126e-05, + "loss": 0.6116, "step": 4721 }, { - "epoch": 1.7243016249771772, - "grad_norm": 2.5330939292907715, - "learning_rate": 2.85419170604458e-06, - "loss": 0.7794, + "epoch": 0.8192227619708535, + "grad_norm": 1.1932008266448975, + "learning_rate": 3.359988687375336e-05, + "loss": 0.6211, "step": 4722 }, { - "epoch": 1.7246667883878035, - "grad_norm": 0.9230392575263977, - "learning_rate": 2.846810747421553e-06, - "loss": 0.827, + "epoch": 0.8193962526023595, + "grad_norm": 1.0863169431686401, + "learning_rate": 3.359489048665587e-05, + "loss": 0.7275, "step": 4723 }, { - "epoch": 1.7250319517984298, - "grad_norm": 1.8249539136886597, - "learning_rate": 2.8394386134260843e-06, - "loss": 0.8463, + "epoch": 0.8195697432338653, + "grad_norm": 1.3110911846160889, + "learning_rate": 3.3589892521855515e-05, + "loss": 0.688, "step": 4724 }, { - "epoch": 1.7253971152090561, - "grad_norm": 0.9649474620819092, - "learning_rate": 2.832075307850841e-06, - "loss": 0.8197, + "epoch": 0.8197432338653713, + "grad_norm": 0.957463800907135, + "learning_rate": 3.3584892979932284e-05, + "loss": 0.6687, "step": 4725 }, { - "epoch": 1.7257622786196825, - "grad_norm": 1.0250506401062012, - "learning_rate": 2.8247208344839428e-06, - "loss": 0.8419, + "epoch": 0.8199167244968771, + "grad_norm": 1.6898056268692017, + "learning_rate": 3.35798918614664e-05, + "loss": 0.7095, "step": 4726 }, { - "epoch": 1.7261274420303085, - "grad_norm": 0.9853530526161194, - "learning_rate": 2.8173751971089734e-06, - "loss": 0.8104, + "epoch": 0.8200902151283831, + "grad_norm": 0.9258905053138733, + "learning_rate": 3.357488916703824e-05, + "loss": 0.6123, "step": 4727 }, { - "epoch": 1.7264926054409349, - "grad_norm": 0.9839889407157898, - "learning_rate": 2.8100383995049687e-06, - "loss": 0.814, + "epoch": 0.8202637057598889, + "grad_norm": 0.8883718252182007, + "learning_rate": 3.356988489722837e-05, + "loss": 0.7103, "step": 4728 }, { - "epoch": 1.726857768851561, - "grad_norm": 0.9257577061653137, - "learning_rate": 2.8027104454464172e-06, - "loss": 0.8174, + "epoch": 0.8204371963913949, + "grad_norm": 0.833127498626709, + "learning_rate": 3.3564879052617555e-05, + "loss": 0.7881, "step": 4729 }, { - "epoch": 1.7272229322621873, - "grad_norm": 1.2466052770614624, - "learning_rate": 2.795391338703264e-06, - "loss": 0.7974, + "epoch": 0.8206106870229007, + "grad_norm": 0.8044516444206238, + "learning_rate": 3.355987163378671e-05, + "loss": 0.7654, "step": 4730 }, { - "epoch": 1.7275880956728136, - "grad_norm": 1.1432262659072876, - "learning_rate": 2.7880810830408834e-06, - "loss": 0.8488, + "epoch": 0.8207841776544067, + "grad_norm": 0.8004693984985352, + "learning_rate": 3.3554862641316965e-05, + "loss": 0.7238, "step": 4731 }, { - "epoch": 1.7279532590834399, - "grad_norm": 1.0671768188476562, - "learning_rate": 2.7807796822201137e-06, - "loss": 0.8253, + "epoch": 0.8209576682859125, + "grad_norm": 0.8925685882568359, + "learning_rate": 3.354985207578961e-05, + "loss": 0.7737, "step": 4732 }, { - "epoch": 1.7283184224940662, - "grad_norm": 1.398992896080017, - "learning_rate": 2.773487139997233e-06, - "loss": 0.7853, + "epoch": 0.8211311589174185, + "grad_norm": 0.7443047761917114, + "learning_rate": 3.354483993778614e-05, + "loss": 0.7947, "step": 4733 }, { - "epoch": 1.7286835859046925, - "grad_norm": 1.5769710540771484, - "learning_rate": 2.7662034601239664e-06, - "loss": 0.8309, + "epoch": 0.8213046495489243, + "grad_norm": 0.7192960977554321, + "learning_rate": 3.3539826227888216e-05, + "loss": 0.9417, "step": 4734 }, { - "epoch": 1.7290487493153186, - "grad_norm": 0.9666868448257446, - "learning_rate": 2.7589286463474698e-06, - "loss": 0.7969, + "epoch": 0.8214781401804303, + "grad_norm": 1.5496827363967896, + "learning_rate": 3.3534810946677676e-05, + "loss": 0.8809, "step": 4735 }, { - "epoch": 1.729413912725945, - "grad_norm": 1.3063603639602661, - "learning_rate": 2.7516627024103403e-06, - "loss": 0.7493, + "epoch": 0.8216516308119362, + "grad_norm": 0.9868696928024292, + "learning_rate": 3.352979409473656e-05, + "loss": 0.7959, "step": 4736 }, { - "epoch": 1.729779076136571, - "grad_norm": 1.1282740831375122, - "learning_rate": 2.7444056320506175e-06, - "loss": 0.8031, + "epoch": 0.8218251214434421, + "grad_norm": 1.0700030326843262, + "learning_rate": 3.3524775672647064e-05, + "loss": 0.6992, "step": 4737 }, { - "epoch": 1.7301442395471973, - "grad_norm": 1.0634880065917969, - "learning_rate": 2.7371574390017742e-06, - "loss": 0.8253, + "epoch": 0.821998612074948, + "grad_norm": 1.3022713661193848, + "learning_rate": 3.351975568099159e-05, + "loss": 0.7344, "step": 4738 }, { - "epoch": 1.7305094029578236, - "grad_norm": 1.1114223003387451, - "learning_rate": 2.7299181269927165e-06, - "loss": 0.8408, + "epoch": 0.8221721027064538, + "grad_norm": 1.3976041078567505, + "learning_rate": 3.3514734120352735e-05, + "loss": 0.7083, "step": 4739 }, { - "epoch": 1.73087456636845, - "grad_norm": 1.0223162174224854, - "learning_rate": 2.7226876997477723e-06, - "loss": 0.8185, + "epoch": 0.8223455933379598, + "grad_norm": 0.9280670881271362, + "learning_rate": 3.350971099131322e-05, + "loss": 0.6228, "step": 4740 }, { - "epoch": 1.7312397297790763, - "grad_norm": 1.6440062522888184, - "learning_rate": 2.7154661609867126e-06, - "loss": 0.8561, + "epoch": 0.8225190839694656, + "grad_norm": 1.3128758668899536, + "learning_rate": 3.350468629445601e-05, + "loss": 0.7502, "step": 4741 }, { - "epoch": 1.7316048931897023, - "grad_norm": 0.8999413847923279, - "learning_rate": 2.708253514424728e-06, - "loss": 0.8283, + "epoch": 0.8226925746009716, + "grad_norm": 0.8534931540489197, + "learning_rate": 3.349966003036421e-05, + "loss": 0.8042, "step": 4742 }, { - "epoch": 1.7319700566003287, - "grad_norm": 1.4075323343276978, - "learning_rate": 2.701049763772434e-06, - "loss": 0.8073, + "epoch": 0.8228660652324774, + "grad_norm": 0.9725764393806458, + "learning_rate": 3.3494632199621146e-05, + "loss": 0.6934, "step": 4743 }, { - "epoch": 1.7323352200109547, - "grad_norm": 0.8998292088508606, - "learning_rate": 2.6938549127358803e-06, - "loss": 0.8729, + "epoch": 0.8230395558639834, + "grad_norm": 1.0885205268859863, + "learning_rate": 3.3489602802810276e-05, + "loss": 0.6331, "step": 4744 }, { - "epoch": 1.732700383421581, - "grad_norm": 1.1163274049758911, - "learning_rate": 2.6866689650165146e-06, - "loss": 0.8461, + "epoch": 0.8232130464954892, + "grad_norm": 1.2970077991485596, + "learning_rate": 3.3484571840515295e-05, + "loss": 0.5997, "step": 4745 }, { - "epoch": 1.7330655468322074, - "grad_norm": 0.9799559116363525, - "learning_rate": 2.679491924311226e-06, - "loss": 0.8138, + "epoch": 0.8233865371269952, + "grad_norm": 0.8239089846611023, + "learning_rate": 3.347953931332004e-05, + "loss": 0.7123, "step": 4746 }, { - "epoch": 1.7334307102428337, - "grad_norm": 1.6628844738006592, - "learning_rate": 2.672323794312315e-06, - "loss": 0.7955, + "epoch": 0.823560027758501, + "grad_norm": 1.1290653944015503, + "learning_rate": 3.347450522180854e-05, + "loss": 0.7, "step": 4747 }, { - "epoch": 1.73379587365346, - "grad_norm": 0.8688253164291382, - "learning_rate": 2.6651645787075e-06, - "loss": 0.8502, + "epoch": 0.823733518390007, + "grad_norm": 3.735804319381714, + "learning_rate": 3.3469469566565e-05, + "loss": 0.7292, "step": 4748 }, { - "epoch": 1.7341610370640863, - "grad_norm": 1.044459342956543, - "learning_rate": 2.6580142811799037e-06, - "loss": 0.8148, + "epoch": 0.8239070090215128, + "grad_norm": 1.126335620880127, + "learning_rate": 3.3464432348173827e-05, + "loss": 0.7346, "step": 4749 }, { - "epoch": 1.7345262004747124, - "grad_norm": 1.098548173904419, - "learning_rate": 2.6508729054080664e-06, - "loss": 0.8861, + "epoch": 0.8240804996530188, + "grad_norm": 1.001314401626587, + "learning_rate": 3.345939356721959e-05, + "loss": 0.6582, "step": 4750 }, { - "epoch": 1.7348913638853387, - "grad_norm": 1.4963109493255615, - "learning_rate": 2.6437404550659416e-06, - "loss": 0.8032, + "epoch": 0.8242539902845246, + "grad_norm": 0.7831737399101257, + "learning_rate": 3.345435322428705e-05, + "loss": 0.6451, "step": 4751 }, { - "epoch": 1.7352565272959648, - "grad_norm": 1.0711097717285156, - "learning_rate": 2.6366169338228885e-06, - "loss": 0.8333, + "epoch": 0.8244274809160306, + "grad_norm": 1.2740843296051025, + "learning_rate": 3.3449311319961134e-05, + "loss": 0.8503, "step": 4752 }, { - "epoch": 1.7356216907065911, - "grad_norm": 1.1238175630569458, - "learning_rate": 2.629502345343675e-06, - "loss": 0.8007, + "epoch": 0.8246009715475364, + "grad_norm": 1.3307411670684814, + "learning_rate": 3.344426785482697e-05, + "loss": 0.8014, "step": 4753 }, { - "epoch": 1.7359868541172174, - "grad_norm": 0.8647558093070984, - "learning_rate": 2.622396693288474e-06, - "loss": 0.803, + "epoch": 0.8247744621790424, + "grad_norm": 1.1435797214508057, + "learning_rate": 3.343922282946985e-05, + "loss": 0.739, "step": 4754 }, { - "epoch": 1.7363520175278437, - "grad_norm": 1.2941838502883911, - "learning_rate": 2.6152999813128487e-06, - "loss": 0.8021, + "epoch": 0.8249479528105482, + "grad_norm": 0.8582712411880493, + "learning_rate": 3.343417624447527e-05, + "loss": 0.6746, "step": 4755 }, { - "epoch": 1.73671718093847, - "grad_norm": 0.9902263283729553, - "learning_rate": 2.60821221306778e-06, - "loss": 0.8065, + "epoch": 0.8251214434420542, + "grad_norm": 0.7478053569793701, + "learning_rate": 3.342912810042888e-05, + "loss": 0.9077, "step": 4756 }, { - "epoch": 1.7370823443490964, - "grad_norm": 0.7625173926353455, - "learning_rate": 2.6011333921996397e-06, - "loss": 0.8394, + "epoch": 0.82529493407356, + "grad_norm": 0.839023232460022, + "learning_rate": 3.342407839791653e-05, + "loss": 0.814, "step": 4757 }, { - "epoch": 1.7374475077597225, - "grad_norm": 1.0463464260101318, - "learning_rate": 2.5940635223501985e-06, - "loss": 0.7847, + "epoch": 0.8254684247050659, + "grad_norm": 1.0041440725326538, + "learning_rate": 3.3419027137524236e-05, + "loss": 0.804, "step": 4758 }, { - "epoch": 1.7378126711703488, - "grad_norm": 1.4295697212219238, - "learning_rate": 2.5870026071566145e-06, - "loss": 0.8256, + "epoch": 0.8256419153365718, + "grad_norm": 0.9418413639068604, + "learning_rate": 3.34139743198382e-05, + "loss": 0.647, "step": 4759 }, { - "epoch": 1.7381778345809749, - "grad_norm": 1.0796008110046387, - "learning_rate": 2.5799506502514504e-06, - "loss": 0.7977, + "epoch": 0.8258154059680777, + "grad_norm": 0.8709951043128967, + "learning_rate": 3.340891994544483e-05, + "loss": 0.6511, "step": 4760 }, { - "epoch": 1.7385429979916012, - "grad_norm": 1.3021360635757446, - "learning_rate": 2.572907655262653e-06, - "loss": 0.7882, + "epoch": 0.8259888965995836, + "grad_norm": 0.9272386431694031, + "learning_rate": 3.3403864014930665e-05, + "loss": 0.8708, "step": 4761 }, { - "epoch": 1.7389081614022275, - "grad_norm": 1.500536561012268, - "learning_rate": 2.565873625813564e-06, - "loss": 0.8157, + "epoch": 0.8261623872310895, + "grad_norm": 0.7006701827049255, + "learning_rate": 3.339880652888246e-05, + "loss": 0.9312, "step": 4762 }, { - "epoch": 1.7392733248128538, - "grad_norm": 1.2518084049224854, - "learning_rate": 2.5588485655229046e-06, - "loss": 0.8382, + "epoch": 0.8263358778625954, + "grad_norm": 0.9944937229156494, + "learning_rate": 3.339374748788715e-05, + "loss": 0.7351, "step": 4763 }, { - "epoch": 1.73963848822348, - "grad_norm": 1.2362033128738403, - "learning_rate": 2.5518324780047922e-06, - "loss": 0.8098, + "epoch": 0.8265093684941013, + "grad_norm": 1.9571611881256104, + "learning_rate": 3.338868689253183e-05, + "loss": 0.5908, "step": 4764 }, { - "epoch": 1.7400036516341064, - "grad_norm": 0.9677146673202515, - "learning_rate": 2.544825366868713e-06, - "loss": 0.7854, + "epoch": 0.8266828591256072, + "grad_norm": 0.9580154418945312, + "learning_rate": 3.338362474340381e-05, + "loss": 0.7275, "step": 4765 }, { - "epoch": 1.7403688150447325, - "grad_norm": 1.569866418838501, - "learning_rate": 2.53782723571955e-06, - "loss": 0.853, + "epoch": 0.8268563497571131, + "grad_norm": 1.3119512796401978, + "learning_rate": 3.337856104109053e-05, + "loss": 0.8323, "step": 4766 }, { - "epoch": 1.7407339784553588, - "grad_norm": 1.1469392776489258, - "learning_rate": 2.5308380881575613e-06, - "loss": 0.8138, + "epoch": 0.827029840388619, + "grad_norm": 0.8618447780609131, + "learning_rate": 3.337349578617965e-05, + "loss": 0.808, "step": 4767 }, { - "epoch": 1.741099141865985, - "grad_norm": 1.1222312450408936, - "learning_rate": 2.523857927778388e-06, - "loss": 0.835, + "epoch": 0.8272033310201249, + "grad_norm": 1.42372727394104, + "learning_rate": 3.3368428979259006e-05, + "loss": 0.8972, "step": 4768 }, { - "epoch": 1.7414643052766112, - "grad_norm": 1.027269959449768, - "learning_rate": 2.5168867581730315e-06, - "loss": 0.8298, + "epoch": 0.8273768216516308, + "grad_norm": 0.9988352656364441, + "learning_rate": 3.336336062091661e-05, + "loss": 0.8137, "step": 4769 }, { - "epoch": 1.7418294686872375, - "grad_norm": 0.9417722821235657, - "learning_rate": 2.509924582927883e-06, - "loss": 0.8344, + "epoch": 0.8275503122831367, + "grad_norm": 0.8918443918228149, + "learning_rate": 3.335829071174063e-05, + "loss": 0.755, "step": 4770 }, { - "epoch": 1.7421946320978638, - "grad_norm": 0.9256226420402527, - "learning_rate": 2.502971405624706e-06, - "loss": 0.8326, + "epoch": 0.8277238029146426, + "grad_norm": 0.9804509282112122, + "learning_rate": 3.335321925231946e-05, + "loss": 0.7583, "step": 4771 }, { - "epoch": 1.7425597955084902, - "grad_norm": 0.951561450958252, - "learning_rate": 2.4960272298406276e-06, - "loss": 0.8578, + "epoch": 0.8278972935461485, + "grad_norm": 1.2636436223983765, + "learning_rate": 3.334814624324163e-05, + "loss": 0.6959, "step": 4772 }, { - "epoch": 1.7429249589191163, - "grad_norm": 0.9855883717536926, - "learning_rate": 2.4890920591481525e-06, - "loss": 0.8262, + "epoch": 0.8280707841776545, + "grad_norm": 2.309222936630249, + "learning_rate": 3.334307168509587e-05, + "loss": 0.7104, "step": 4773 }, { - "epoch": 1.7432901223297426, - "grad_norm": 1.1054891347885132, - "learning_rate": 2.4821658971151406e-06, - "loss": 0.7874, + "epoch": 0.8282442748091603, + "grad_norm": 0.9871782064437866, + "learning_rate": 3.333799557847109e-05, + "loss": 0.8345, "step": 4774 }, { - "epoch": 1.7436552857403687, - "grad_norm": 1.2670353651046753, - "learning_rate": 2.4752487473048327e-06, - "loss": 0.8571, + "epoch": 0.8284177654406663, + "grad_norm": 0.8893906474113464, + "learning_rate": 3.3332917923956394e-05, + "loss": 0.7368, "step": 4775 }, { - "epoch": 1.744020449150995, - "grad_norm": 1.1199125051498413, - "learning_rate": 2.4683406132758147e-06, - "loss": 0.8235, + "epoch": 0.8285912560721721, + "grad_norm": 0.984903872013092, + "learning_rate": 3.332783872214103e-05, + "loss": 0.6898, "step": 4776 }, { - "epoch": 1.7443856125616213, - "grad_norm": 1.4023686647415161, - "learning_rate": 2.461441498582049e-06, - "loss": 0.8022, + "epoch": 0.8287647467036781, + "grad_norm": 0.9056780934333801, + "learning_rate": 3.332275797361446e-05, + "loss": 0.6633, "step": 4777 }, { - "epoch": 1.7447507759722476, - "grad_norm": 1.4417157173156738, - "learning_rate": 2.454551406772858e-06, - "loss": 0.8246, + "epoch": 0.8289382373351839, + "grad_norm": 1.1342127323150635, + "learning_rate": 3.331767567896629e-05, + "loss": 0.7913, "step": 4778 }, { - "epoch": 1.745115939382874, - "grad_norm": 1.00605046749115, - "learning_rate": 2.447670341392909e-06, - "loss": 0.8098, + "epoch": 0.8291117279666897, + "grad_norm": 0.8897329568862915, + "learning_rate": 3.331259183878635e-05, + "loss": 0.6797, "step": 4779 }, { - "epoch": 1.7454811027935002, - "grad_norm": 1.440796136856079, - "learning_rate": 2.4407983059822394e-06, - "loss": 0.7959, + "epoch": 0.8292852185981957, + "grad_norm": 1.69610595703125, + "learning_rate": 3.330750645366461e-05, + "loss": 0.9359, "step": 4780 }, { - "epoch": 1.7458462662041263, - "grad_norm": 1.1380994319915771, - "learning_rate": 2.4339353040762337e-06, - "loss": 0.7896, + "epoch": 0.8294587092297016, + "grad_norm": 1.6854100227355957, + "learning_rate": 3.330241952419123e-05, + "loss": 0.7156, "step": 4781 }, { - "epoch": 1.7462114296147526, - "grad_norm": 0.9975089430809021, - "learning_rate": 2.427081339205635e-06, - "loss": 0.776, + "epoch": 0.8296321998612075, + "grad_norm": 0.8037377595901489, + "learning_rate": 3.3297331050956576e-05, + "loss": 0.839, "step": 4782 }, { - "epoch": 1.7465765930253787, - "grad_norm": 1.1963858604431152, - "learning_rate": 2.4202364148965262e-06, - "loss": 0.7946, + "epoch": 0.8298056904927134, + "grad_norm": 1.0026473999023438, + "learning_rate": 3.329224103455116e-05, + "loss": 0.765, "step": 4783 }, { - "epoch": 1.746941756436005, - "grad_norm": 1.085980772972107, - "learning_rate": 2.4134005346703517e-06, - "loss": 0.753, + "epoch": 0.8299791811242193, + "grad_norm": 1.1098582744598389, + "learning_rate": 3.328714947556568e-05, + "loss": 0.75, "step": 4784 }, { - "epoch": 1.7473069198466313, - "grad_norm": 1.0902576446533203, - "learning_rate": 2.406573702043893e-06, - "loss": 0.8157, + "epoch": 0.8301526717557252, + "grad_norm": 0.7283819317817688, + "learning_rate": 3.328205637459102e-05, + "loss": 0.8396, "step": 4785 }, { - "epoch": 1.7476720832572576, - "grad_norm": 1.1020607948303223, - "learning_rate": 2.3997559205292877e-06, - "loss": 0.7979, + "epoch": 0.8303261623872311, + "grad_norm": 0.9151557087898254, + "learning_rate": 3.327696173221824e-05, + "loss": 0.8423, "step": 4786 }, { - "epoch": 1.748037246667884, - "grad_norm": 1.1687233448028564, - "learning_rate": 2.3929471936340075e-06, - "loss": 0.7953, + "epoch": 0.830499653018737, + "grad_norm": 1.5194469690322876, + "learning_rate": 3.327186554903859e-05, + "loss": 0.6941, "step": 4787 }, { - "epoch": 1.7484024100785103, - "grad_norm": 1.2209196090698242, - "learning_rate": 2.386147524860869e-06, - "loss": 0.7775, + "epoch": 0.8306731436502429, + "grad_norm": 1.4918030500411987, + "learning_rate": 3.326676782564347e-05, + "loss": 0.8315, "step": 4788 }, { - "epoch": 1.7487675734891364, - "grad_norm": 2.2289905548095703, - "learning_rate": 2.379356917708031e-06, - "loss": 0.8212, + "epoch": 0.8308466342817488, + "grad_norm": 0.8973992466926575, + "learning_rate": 3.3261668562624484e-05, + "loss": 0.7776, "step": 4789 }, { - "epoch": 1.7491327368997627, - "grad_norm": 1.2285690307617188, - "learning_rate": 2.3725753756689816e-06, - "loss": 0.8206, + "epoch": 0.8310201249132547, + "grad_norm": 1.6165553331375122, + "learning_rate": 3.325656776057341e-05, + "loss": 0.7396, "step": 4790 }, { - "epoch": 1.7494979003103888, - "grad_norm": 1.2078580856323242, - "learning_rate": 2.365802902232559e-06, - "loss": 0.7868, + "epoch": 0.8311936155447606, + "grad_norm": 0.9081304669380188, + "learning_rate": 3.32514654200822e-05, + "loss": 0.7137, "step": 4791 }, { - "epoch": 1.749863063721015, - "grad_norm": 1.7274525165557861, - "learning_rate": 2.3590395008829314e-06, - "loss": 0.8054, + "epoch": 0.8313671061762665, + "grad_norm": 0.7230401635169983, + "learning_rate": 3.324636154174299e-05, + "loss": 0.8201, "step": 4792 }, { - "epoch": 1.7502282271316414, - "grad_norm": 1.3786813020706177, - "learning_rate": 2.352285175099587e-06, - "loss": 0.8635, + "epoch": 0.8315405968077724, + "grad_norm": 0.6617628931999207, + "learning_rate": 3.3241256126148084e-05, + "loss": 0.8096, "step": 4793 }, { - "epoch": 1.7505933905422677, - "grad_norm": 0.9278981685638428, - "learning_rate": 2.345539928357361e-06, - "loss": 0.817, + "epoch": 0.8317140874392783, + "grad_norm": 1.1607024669647217, + "learning_rate": 3.3236149173889975e-05, + "loss": 0.7595, "step": 4794 }, { - "epoch": 1.750958553952894, - "grad_norm": 1.134604811668396, - "learning_rate": 2.338803764126414e-06, - "loss": 0.8362, + "epoch": 0.8318875780707842, + "grad_norm": 0.9779430627822876, + "learning_rate": 3.323104068556133e-05, + "loss": 0.863, "step": 4795 }, { - "epoch": 1.7513237173635203, - "grad_norm": 1.1692993640899658, - "learning_rate": 2.332076685872231e-06, - "loss": 0.7897, + "epoch": 0.8320610687022901, + "grad_norm": 0.8085692524909973, + "learning_rate": 3.3225930661755005e-05, + "loss": 0.7554, "step": 4796 }, { - "epoch": 1.7516888807741464, - "grad_norm": 1.1167852878570557, - "learning_rate": 2.325358697055626e-06, - "loss": 0.7979, + "epoch": 0.832234559333796, + "grad_norm": 1.1984941959381104, + "learning_rate": 3.322081910306401e-05, + "loss": 0.7549, "step": 4797 }, { - "epoch": 1.7520540441847727, - "grad_norm": 1.1321020126342773, - "learning_rate": 2.3186498011327286e-06, - "loss": 0.8032, + "epoch": 0.8324080499653018, + "grad_norm": 1.1695032119750977, + "learning_rate": 3.321570601008155e-05, + "loss": 0.6853, "step": 4798 }, { - "epoch": 1.7524192075953988, - "grad_norm": 0.6649745106697083, - "learning_rate": 2.3119500015550012e-06, - "loss": 0.8367, + "epoch": 0.8325815405968078, + "grad_norm": 0.9397539496421814, + "learning_rate": 3.321059138340101e-05, + "loss": 0.7402, "step": 4799 }, { - "epoch": 1.7527843710060251, - "grad_norm": 1.7170383930206299, - "learning_rate": 2.3052593017692184e-06, - "loss": 0.7885, + "epoch": 0.8327550312283136, + "grad_norm": 0.9138932824134827, + "learning_rate": 3.320547522361595e-05, + "loss": 0.6337, "step": 4800 }, { - "epoch": 1.7531495344166514, - "grad_norm": 1.0064960718154907, - "learning_rate": 2.298577705217486e-06, - "loss": 0.8123, + "epoch": 0.8329285218598196, + "grad_norm": 1.1076256036758423, + "learning_rate": 3.32003575313201e-05, + "loss": 0.6882, "step": 4801 }, { - "epoch": 1.7535146978272778, - "grad_norm": 1.2823454141616821, - "learning_rate": 2.291905215337209e-06, - "loss": 0.8142, + "epoch": 0.8331020124913254, + "grad_norm": 0.8360020518302917, + "learning_rate": 3.3195238307107375e-05, + "loss": 0.7, "step": 4802 }, { - "epoch": 1.753879861237904, - "grad_norm": 1.1806176900863647, - "learning_rate": 2.285241835561112e-06, - "loss": 0.7888, + "epoch": 0.8332755031228314, + "grad_norm": 1.1174418926239014, + "learning_rate": 3.3190117551571876e-05, + "loss": 0.7784, "step": 4803 }, { - "epoch": 1.7542450246485304, - "grad_norm": 0.8787094950675964, - "learning_rate": 2.2785875693172433e-06, - "loss": 0.7946, + "epoch": 0.8334489937543372, + "grad_norm": 1.9967058897018433, + "learning_rate": 3.318499526530786e-05, + "loss": 0.7021, "step": 4804 }, { - "epoch": 1.7546101880591565, - "grad_norm": 1.2207602262496948, - "learning_rate": 2.271942420028954e-06, - "loss": 0.7827, + "epoch": 0.8336224843858432, + "grad_norm": 1.7008179426193237, + "learning_rate": 3.317987144890978e-05, + "loss": 0.7893, "step": 4805 }, { - "epoch": 1.7549753514697826, - "grad_norm": 0.898269534111023, - "learning_rate": 2.2653063911149052e-06, - "loss": 0.8228, + "epoch": 0.833795975017349, + "grad_norm": 1.0818512439727783, + "learning_rate": 3.317474610297226e-05, + "loss": 0.6591, "step": 4806 }, { - "epoch": 1.7553405148804089, - "grad_norm": 1.028560996055603, - "learning_rate": 2.258679485989075e-06, - "loss": 0.8, + "epoch": 0.833969465648855, + "grad_norm": 0.7501543760299683, + "learning_rate": 3.31696192280901e-05, + "loss": 0.7908, "step": 4807 }, { - "epoch": 1.7557056782910352, - "grad_norm": 1.1462211608886719, - "learning_rate": 2.252061708060731e-06, - "loss": 0.8293, + "epoch": 0.8341429562803608, + "grad_norm": 0.8780678510665894, + "learning_rate": 3.316449082485829e-05, + "loss": 0.6779, "step": 4808 }, { - "epoch": 1.7560708417016615, - "grad_norm": 1.0502408742904663, - "learning_rate": 2.245453060734457e-06, - "loss": 0.8337, + "epoch": 0.8343164469118668, + "grad_norm": 0.9260657429695129, + "learning_rate": 3.315936089387198e-05, + "loss": 0.7115, "step": 4809 }, { - "epoch": 1.7564360051122878, - "grad_norm": 0.9547353386878967, - "learning_rate": 2.238853547410136e-06, - "loss": 0.8291, + "epoch": 0.8344899375433726, + "grad_norm": 1.049065351486206, + "learning_rate": 3.31542294357265e-05, + "loss": 0.7239, "step": 4810 }, { - "epoch": 1.7568011685229141, - "grad_norm": 1.2789034843444824, - "learning_rate": 2.23226317148296e-06, - "loss": 0.7954, + "epoch": 0.8346634281748786, + "grad_norm": 0.9399887919425964, + "learning_rate": 3.314909645101737e-05, + "loss": 0.7729, "step": 4811 }, { - "epoch": 1.7571663319335402, - "grad_norm": 1.0870743989944458, - "learning_rate": 2.2256819363434048e-06, - "loss": 0.848, + "epoch": 0.8348369188063844, + "grad_norm": 0.7559486031532288, + "learning_rate": 3.3143961940340274e-05, + "loss": 0.6124, "step": 4812 }, { - "epoch": 1.7575314953441665, - "grad_norm": 1.3006113767623901, - "learning_rate": 2.219109845377252e-06, - "loss": 0.8198, + "epoch": 0.8350104094378904, + "grad_norm": 0.8814842104911804, + "learning_rate": 3.313882590429108e-05, + "loss": 0.7295, "step": 4813 }, { - "epoch": 1.7578966587547926, - "grad_norm": 1.2652848958969116, - "learning_rate": 2.212546901965582e-06, - "loss": 0.8207, + "epoch": 0.8351839000693962, + "grad_norm": 0.9307559728622437, + "learning_rate": 3.313368834346583e-05, + "loss": 0.6655, "step": 4814 }, { - "epoch": 1.758261822165419, - "grad_norm": 1.3693933486938477, - "learning_rate": 2.2059931094847676e-06, - "loss": 0.8067, + "epoch": 0.8353573907009022, + "grad_norm": 1.5790090560913086, + "learning_rate": 3.3128549258460734e-05, + "loss": 0.8289, "step": 4815 }, { - "epoch": 1.7586269855760452, - "grad_norm": 1.1051583290100098, - "learning_rate": 2.199448471306467e-06, - "loss": 0.8229, + "epoch": 0.835530881332408, + "grad_norm": 1.2477068901062012, + "learning_rate": 3.312340864987221e-05, + "loss": 0.801, "step": 4816 }, { - "epoch": 1.7589921489866716, - "grad_norm": 0.9956026077270508, - "learning_rate": 2.19291299079764e-06, - "loss": 0.8422, + "epoch": 0.8357043719639139, + "grad_norm": 0.7024942636489868, + "learning_rate": 3.311826651829682e-05, + "loss": 0.7727, "step": 4817 }, { - "epoch": 1.7593573123972979, - "grad_norm": 0.9622489809989929, - "learning_rate": 2.186386671320522e-06, - "loss": 0.8302, + "epoch": 0.8358778625954199, + "grad_norm": 0.9320955872535706, + "learning_rate": 3.311312286433131e-05, + "loss": 0.7979, "step": 4818 }, { - "epoch": 1.7597224758079242, - "grad_norm": 0.8418428897857666, - "learning_rate": 2.1798695162326444e-06, - "loss": 0.8099, + "epoch": 0.8360513532269257, + "grad_norm": 0.9380242228507996, + "learning_rate": 3.310797768857262e-05, + "loss": 0.6807, "step": 4819 }, { - "epoch": 1.7600876392185503, - "grad_norm": 1.030246376991272, - "learning_rate": 2.1733615288868236e-06, - "loss": 0.7947, + "epoch": 0.8362248438584317, + "grad_norm": 0.955429196357727, + "learning_rate": 3.310283099161783e-05, + "loss": 0.7866, "step": 4820 }, { - "epoch": 1.7604528026291766, - "grad_norm": 1.2405564785003662, - "learning_rate": 2.1668627126311613e-06, - "loss": 0.7827, + "epoch": 0.8363983344899375, + "grad_norm": 0.9379730820655823, + "learning_rate": 3.3097682774064255e-05, + "loss": 0.6705, "step": 4821 }, { - "epoch": 1.7608179660398027, - "grad_norm": 1.2058964967727661, - "learning_rate": 2.160373070809032e-06, - "loss": 0.8367, + "epoch": 0.8365718251214435, + "grad_norm": 1.5986607074737549, + "learning_rate": 3.309253303650932e-05, + "loss": 0.677, "step": 4822 }, { - "epoch": 1.761183129450429, - "grad_norm": 1.164934515953064, - "learning_rate": 2.153892606759096e-06, - "loss": 0.7698, + "epoch": 0.8367453157529493, + "grad_norm": 0.9068677425384521, + "learning_rate": 3.308738177955067e-05, + "loss": 0.7853, "step": 4823 }, { - "epoch": 1.7615482928610553, - "grad_norm": 0.9754539132118225, - "learning_rate": 2.1474213238152954e-06, - "loss": 0.8292, + "epoch": 0.8369188063844553, + "grad_norm": 0.923128068447113, + "learning_rate": 3.3082229003786114e-05, + "loss": 0.7509, "step": 4824 }, { - "epoch": 1.7619134562716816, - "grad_norm": 1.0043588876724243, - "learning_rate": 2.1409592253068467e-06, - "loss": 0.7928, + "epoch": 0.8370922970159611, + "grad_norm": 0.8961623311042786, + "learning_rate": 3.307707470981364e-05, + "loss": 0.8855, "step": 4825 }, { - "epoch": 1.762278619682308, - "grad_norm": 0.9869155287742615, - "learning_rate": 2.1345063145582357e-06, - "loss": 0.8396, + "epoch": 0.8372657876474671, + "grad_norm": 0.9725954532623291, + "learning_rate": 3.3071918898231413e-05, + "loss": 0.6965, "step": 4826 }, { - "epoch": 1.7626437830929342, - "grad_norm": 1.1354718208312988, - "learning_rate": 2.128062594889229e-06, - "loss": 0.8245, + "epoch": 0.8374392782789729, + "grad_norm": 0.8058086633682251, + "learning_rate": 3.306676156963776e-05, + "loss": 0.7751, "step": 4827 }, { - "epoch": 1.7630089465035603, - "grad_norm": 0.9219787120819092, - "learning_rate": 2.1216280696148585e-06, - "loss": 0.8103, + "epoch": 0.8376127689104789, + "grad_norm": 1.086820363998413, + "learning_rate": 3.3061602724631205e-05, + "loss": 0.6705, "step": 4828 }, { - "epoch": 1.7633741099141866, - "grad_norm": 1.0897216796875, - "learning_rate": 2.115202742045437e-06, - "loss": 0.799, + "epoch": 0.8377862595419847, + "grad_norm": 1.529167652130127, + "learning_rate": 3.3056442363810435e-05, + "loss": 0.7422, "step": 4829 }, { - "epoch": 1.7637392733248127, - "grad_norm": 1.0758386850357056, - "learning_rate": 2.108786615486529e-06, - "loss": 0.7743, + "epoch": 0.8379597501734907, + "grad_norm": 0.9842532873153687, + "learning_rate": 3.3051280487774316e-05, + "loss": 0.6152, "step": 4830 }, { - "epoch": 1.764104436735439, - "grad_norm": 0.6810785531997681, - "learning_rate": 2.1023796932389805e-06, - "loss": 0.8411, + "epoch": 0.8381332408049965, + "grad_norm": 1.169345498085022, + "learning_rate": 3.3046117097121884e-05, + "loss": 0.6785, "step": 4831 }, { - "epoch": 1.7644696001460654, - "grad_norm": 0.9016165137290955, - "learning_rate": 2.0959819785988912e-06, - "loss": 0.8348, + "epoch": 0.8383067314365025, + "grad_norm": 1.193697214126587, + "learning_rate": 3.304095219245236e-05, + "loss": 0.6539, "step": 4832 }, { - "epoch": 1.7648347635566917, - "grad_norm": 1.3151658773422241, - "learning_rate": 2.0895934748576273e-06, - "loss": 0.8253, + "epoch": 0.8384802220680083, + "grad_norm": 1.0610277652740479, + "learning_rate": 3.3035785774365136e-05, + "loss": 0.7214, "step": 4833 }, { - "epoch": 1.765199926967318, - "grad_norm": 1.1708877086639404, - "learning_rate": 2.0832141853018227e-06, - "loss": 0.8261, + "epoch": 0.8386537126995143, + "grad_norm": 0.9396154880523682, + "learning_rate": 3.303061784345979e-05, + "loss": 0.7129, "step": 4834 }, { - "epoch": 1.7655650903779443, - "grad_norm": 1.3313127756118774, - "learning_rate": 2.0768441132133676e-06, - "loss": 0.8016, + "epoch": 0.8388272033310201, + "grad_norm": 0.8436903357505798, + "learning_rate": 3.3025448400336064e-05, + "loss": 0.7324, "step": 4835 }, { - "epoch": 1.7659302537885704, - "grad_norm": 1.1862196922302246, - "learning_rate": 2.0704832618694006e-06, - "loss": 0.8331, + "epoch": 0.8390006939625261, + "grad_norm": 0.6488824486732483, + "learning_rate": 3.302027744559387e-05, + "loss": 0.8148, "step": 4836 }, { - "epoch": 1.7662954171991967, - "grad_norm": 0.8666213154792786, - "learning_rate": 2.0641316345423303e-06, - "loss": 0.8384, + "epoch": 0.8391741845940319, + "grad_norm": 1.0152256488800049, + "learning_rate": 3.30151049798333e-05, + "loss": 0.6094, "step": 4837 }, { - "epoch": 1.7666605806098228, - "grad_norm": 0.9848610162734985, - "learning_rate": 2.0577892344998097e-06, - "loss": 0.8262, + "epoch": 0.8393476752255378, + "grad_norm": 1.0491200685501099, + "learning_rate": 3.300993100365463e-05, + "loss": 0.6346, "step": 4838 }, { - "epoch": 1.767025744020449, - "grad_norm": 1.0880247354507446, - "learning_rate": 2.051456065004753e-06, - "loss": 0.818, + "epoch": 0.8395211658570437, + "grad_norm": 1.2424668073654175, + "learning_rate": 3.3004755517658314e-05, + "loss": 0.7861, "step": 4839 }, { - "epoch": 1.7673909074310754, - "grad_norm": 1.0240784883499146, - "learning_rate": 2.045132129315326e-06, - "loss": 0.8177, + "epoch": 0.8396946564885496, + "grad_norm": 0.8872154355049133, + "learning_rate": 3.299957852244496e-05, + "loss": 0.792, "step": 4840 }, { - "epoch": 1.7677560708417017, - "grad_norm": 1.4336079359054565, - "learning_rate": 2.0388174306849297e-06, - "loss": 0.8513, + "epoch": 0.8398681471200555, + "grad_norm": 0.9287905097007751, + "learning_rate": 3.299440001861538e-05, + "loss": 0.7384, "step": 4841 }, { - "epoch": 1.768121234252328, - "grad_norm": 0.9912111759185791, - "learning_rate": 2.03251197236223e-06, - "loss": 0.7869, + "epoch": 0.8400416377515614, + "grad_norm": 0.8010587692260742, + "learning_rate": 3.298922000677053e-05, + "loss": 0.718, "step": 4842 }, { - "epoch": 1.7684863976629541, - "grad_norm": 0.9164789915084839, - "learning_rate": 2.026215757591128e-06, - "loss": 0.7774, + "epoch": 0.8402151283830673, + "grad_norm": 0.8250033855438232, + "learning_rate": 3.298403848751157e-05, + "loss": 0.6782, "step": 4843 }, { - "epoch": 1.7688515610735804, - "grad_norm": 1.2653248310089111, - "learning_rate": 2.0199287896107743e-06, - "loss": 0.7959, + "epoch": 0.8403886190145732, + "grad_norm": 0.9710628986358643, + "learning_rate": 3.2978855461439806e-05, + "loss": 0.656, "step": 4844 }, { - "epoch": 1.7692167244842065, - "grad_norm": 1.435330867767334, - "learning_rate": 2.013651071655569e-06, - "loss": 0.8187, + "epoch": 0.8405621096460791, + "grad_norm": 1.2840906381607056, + "learning_rate": 3.297367092915675e-05, + "loss": 0.7532, "step": 4845 }, { - "epoch": 1.7695818878948328, - "grad_norm": 1.03030526638031, - "learning_rate": 2.007382606955135e-06, - "loss": 0.7745, + "epoch": 0.840735600277585, + "grad_norm": 0.8356399536132812, + "learning_rate": 3.296848489126406e-05, + "loss": 0.6541, "step": 4846 }, { - "epoch": 1.7699470513054592, - "grad_norm": 1.1324703693389893, - "learning_rate": 2.00112339873435e-06, - "loss": 0.8368, + "epoch": 0.8409090909090909, + "grad_norm": 0.8740513324737549, + "learning_rate": 3.296329734836359e-05, + "loss": 0.761, "step": 4847 }, { - "epoch": 1.7703122147160855, - "grad_norm": 1.3516701459884644, - "learning_rate": 1.9948734502133284e-06, - "loss": 0.7872, + "epoch": 0.8410825815405968, + "grad_norm": 0.8611583113670349, + "learning_rate": 3.295810830105736e-05, + "loss": 0.5941, "step": 4848 }, { - "epoch": 1.7706773781267118, - "grad_norm": 1.079081654548645, - "learning_rate": 1.9886327646074143e-06, - "loss": 0.8706, + "epoch": 0.8412560721721027, + "grad_norm": 1.0718257427215576, + "learning_rate": 3.2952917749947556e-05, + "loss": 0.8052, "step": 4849 }, { - "epoch": 1.771042541537338, - "grad_norm": 1.044124960899353, - "learning_rate": 1.9824013451271964e-06, - "loss": 0.8038, + "epoch": 0.8414295628036086, + "grad_norm": 1.0798165798187256, + "learning_rate": 3.294772569563656e-05, + "loss": 0.6877, "step": 4850 }, { - "epoch": 1.7714077049479642, - "grad_norm": 1.0424288511276245, - "learning_rate": 1.9761791949784827e-06, - "loss": 0.8704, + "epoch": 0.8416030534351145, + "grad_norm": 1.0169899463653564, + "learning_rate": 3.2942532138726906e-05, + "loss": 0.5862, "step": 4851 }, { - "epoch": 1.7717728683585905, - "grad_norm": 1.2283903360366821, - "learning_rate": 1.9699663173623195e-06, - "loss": 0.8495, + "epoch": 0.8417765440666204, + "grad_norm": 1.3886168003082275, + "learning_rate": 3.293733707982132e-05, + "loss": 0.8674, "step": 4852 }, { - "epoch": 1.7721380317692166, - "grad_norm": 1.7518435716629028, - "learning_rate": 1.9637627154749882e-06, - "loss": 0.7864, + "epoch": 0.8419500346981263, + "grad_norm": 0.8438948392868042, + "learning_rate": 3.2932140519522676e-05, + "loss": 0.9231, "step": 4853 }, { - "epoch": 1.772503195179843, - "grad_norm": 1.1629589796066284, - "learning_rate": 1.9575683925079913e-06, - "loss": 0.8196, + "epoch": 0.8421235253296322, + "grad_norm": 1.0665990114212036, + "learning_rate": 3.292694245843407e-05, + "loss": 0.7104, "step": 4854 }, { - "epoch": 1.7728683585904692, - "grad_norm": 1.3142404556274414, - "learning_rate": 1.951383351648057e-06, - "loss": 0.8423, + "epoch": 0.8422970159611382, + "grad_norm": 0.9788002967834473, + "learning_rate": 3.2921742897158726e-05, + "loss": 0.7793, "step": 4855 }, { - "epoch": 1.7732335220010955, - "grad_norm": 0.8626876473426819, - "learning_rate": 1.945207596077148e-06, - "loss": 0.797, + "epoch": 0.842470506592644, + "grad_norm": 1.1839380264282227, + "learning_rate": 3.2916541836300065e-05, + "loss": 0.7659, "step": 4856 }, { - "epoch": 1.7735986854117218, - "grad_norm": 1.3441905975341797, - "learning_rate": 1.93904112897243e-06, - "loss": 0.8268, + "epoch": 0.8426439972241498, + "grad_norm": 0.6911187767982483, + "learning_rate": 3.2911339276461665e-05, + "loss": 0.6957, "step": 4857 }, { - "epoch": 1.7739638488223481, - "grad_norm": 1.252750039100647, - "learning_rate": 1.9328839535063125e-06, - "loss": 0.8218, + "epoch": 0.8428174878556558, + "grad_norm": 0.9902245998382568, + "learning_rate": 3.290613521824731e-05, + "loss": 0.7495, "step": 4858 }, { - "epoch": 1.7743290122329742, - "grad_norm": 0.9853514432907104, - "learning_rate": 1.9267360728464113e-06, - "loss": 0.7914, + "epoch": 0.8429909784871616, + "grad_norm": 1.2898427248001099, + "learning_rate": 3.290092966226092e-05, + "loss": 0.7612, "step": 4859 }, { - "epoch": 1.7746941756436005, - "grad_norm": 1.164645791053772, - "learning_rate": 1.920597490155569e-06, - "loss": 0.7664, + "epoch": 0.8431644691186676, + "grad_norm": 1.46266770362854, + "learning_rate": 3.28957226091066e-05, + "loss": 0.8313, "step": 4860 }, { - "epoch": 1.7750593390542266, - "grad_norm": 1.2468080520629883, - "learning_rate": 1.9144682085918354e-06, - "loss": 0.7443, + "epoch": 0.8433379597501734, + "grad_norm": 2.440092086791992, + "learning_rate": 3.289051405938865e-05, + "loss": 0.9817, "step": 4861 }, { - "epoch": 1.775424502464853, - "grad_norm": 1.0138967037200928, - "learning_rate": 1.908348231308479e-06, - "loss": 0.8064, + "epoch": 0.8435114503816794, + "grad_norm": 0.8309836983680725, + "learning_rate": 3.2885304013711525e-05, + "loss": 0.7742, "step": 4862 }, { - "epoch": 1.7757896658754793, - "grad_norm": 1.247631549835205, - "learning_rate": 1.9022375614539857e-06, - "loss": 0.853, + "epoch": 0.8436849410131853, + "grad_norm": 0.9249649047851562, + "learning_rate": 3.2880092472679854e-05, + "loss": 0.7754, "step": 4863 }, { - "epoch": 1.7761548292861056, - "grad_norm": 1.3188008069992065, - "learning_rate": 1.896136202172052e-06, - "loss": 0.8314, + "epoch": 0.8438584316446912, + "grad_norm": 0.9647582173347473, + "learning_rate": 3.2874879436898444e-05, + "loss": 0.8469, "step": 4864 }, { - "epoch": 1.7765199926967319, - "grad_norm": 1.063135027885437, - "learning_rate": 1.890044156601576e-06, - "loss": 0.8665, + "epoch": 0.844031922276197, + "grad_norm": 1.0043631792068481, + "learning_rate": 3.286966490697227e-05, + "loss": 0.7156, "step": 4865 }, { - "epoch": 1.7768851561073582, - "grad_norm": 1.4286372661590576, - "learning_rate": 1.883961427876675e-06, - "loss": 0.7789, + "epoch": 0.844205412907703, + "grad_norm": 0.8158316016197205, + "learning_rate": 3.286444888350649e-05, + "loss": 0.9072, "step": 4866 }, { - "epoch": 1.7772503195179843, - "grad_norm": 1.0102839469909668, - "learning_rate": 1.877888019126668e-06, - "loss": 0.8185, + "epoch": 0.8443789035392089, + "grad_norm": 0.7924819588661194, + "learning_rate": 3.285923136710643e-05, + "loss": 0.7285, "step": 4867 }, { - "epoch": 1.7776154829286106, - "grad_norm": 1.0423943996429443, - "learning_rate": 1.8718239334760824e-06, - "loss": 0.8369, + "epoch": 0.8445523941707148, + "grad_norm": 1.051452398300171, + "learning_rate": 3.285401235837758e-05, + "loss": 0.6931, "step": 4868 }, { - "epoch": 1.7779806463392367, - "grad_norm": 0.9563375115394592, - "learning_rate": 1.865769174044647e-06, - "loss": 0.8076, + "epoch": 0.8447258848022207, + "grad_norm": 1.0950305461883545, + "learning_rate": 3.284879185792562e-05, + "loss": 0.6868, "step": 4869 }, { - "epoch": 1.778345809749863, - "grad_norm": 1.2182204723358154, - "learning_rate": 1.8597237439472837e-06, - "loss": 0.8165, + "epoch": 0.8448993754337266, + "grad_norm": 0.9245551228523254, + "learning_rate": 3.28435698663564e-05, + "loss": 0.7031, "step": 4870 }, { - "epoch": 1.7787109731604893, - "grad_norm": 1.081437587738037, - "learning_rate": 1.8536876462941311e-06, - "loss": 0.8072, + "epoch": 0.8450728660652325, + "grad_norm": 0.8948962688446045, + "learning_rate": 3.2838346384275924e-05, + "loss": 0.8162, "step": 4871 }, { - "epoch": 1.7790761365711156, - "grad_norm": 0.9819970726966858, - "learning_rate": 1.8476608841905186e-06, - "loss": 0.8227, + "epoch": 0.8452463566967384, + "grad_norm": 0.7970435619354248, + "learning_rate": 3.283312141229039e-05, + "loss": 0.8289, "step": 4872 }, { - "epoch": 1.779441299981742, - "grad_norm": 1.1524152755737305, - "learning_rate": 1.841643460736975e-06, - "loss": 0.8113, + "epoch": 0.8454198473282443, + "grad_norm": 1.1342363357543945, + "learning_rate": 3.282789495100616e-05, + "loss": 0.5746, "step": 4873 }, { - "epoch": 1.779806463392368, - "grad_norm": 0.9682931303977966, - "learning_rate": 1.8356353790292237e-06, - "loss": 0.8055, + "epoch": 0.8455933379597502, + "grad_norm": 1.4479836225509644, + "learning_rate": 3.282266700102978e-05, + "loss": 0.7922, "step": 4874 }, { - "epoch": 1.7801716268029943, - "grad_norm": 0.9844881892204285, - "learning_rate": 1.8296366421581747e-06, - "loss": 0.84, + "epoch": 0.8457668285912561, + "grad_norm": 1.1032180786132812, + "learning_rate": 3.281743756296795e-05, + "loss": 0.6512, "step": 4875 }, { - "epoch": 1.7805367902136204, - "grad_norm": 0.9452268481254578, - "learning_rate": 1.8236472532099413e-06, - "loss": 0.7841, + "epoch": 0.8459403192227619, + "grad_norm": 1.274580478668213, + "learning_rate": 3.281220663742756e-05, + "loss": 0.6082, "step": 4876 }, { - "epoch": 1.7809019536242467, - "grad_norm": 1.1414281129837036, - "learning_rate": 1.817667215265826e-06, - "loss": 0.8129, + "epoch": 0.8461138098542679, + "grad_norm": 0.928234875202179, + "learning_rate": 3.280697422501565e-05, + "loss": 0.5997, "step": 4877 }, { - "epoch": 1.781267117034873, - "grad_norm": 1.2571309804916382, - "learning_rate": 1.8116965314023205e-06, - "loss": 0.8363, + "epoch": 0.8462873004857737, + "grad_norm": 0.9475992321968079, + "learning_rate": 3.280174032633947e-05, + "loss": 0.7607, "step": 4878 }, { - "epoch": 1.7816322804454994, - "grad_norm": 1.0156824588775635, - "learning_rate": 1.8057352046910948e-06, - "loss": 0.8226, + "epoch": 0.8464607911172797, + "grad_norm": 0.9008442759513855, + "learning_rate": 3.27965049420064e-05, + "loss": 0.7307, "step": 4879 }, { - "epoch": 1.7819974438561257, - "grad_norm": 1.1025018692016602, - "learning_rate": 1.7997832381990156e-06, - "loss": 0.7924, + "epoch": 0.8466342817487855, + "grad_norm": 0.8726187944412231, + "learning_rate": 3.279126807262403e-05, + "loss": 0.7444, "step": 4880 }, { - "epoch": 1.782362607266752, - "grad_norm": 1.09976065158844, - "learning_rate": 1.793840634988131e-06, - "loss": 0.8265, + "epoch": 0.8468077723802915, + "grad_norm": 1.0681473016738892, + "learning_rate": 3.278602971880009e-05, + "loss": 0.5945, "step": 4881 }, { - "epoch": 1.782727770677378, - "grad_norm": 1.043810248374939, - "learning_rate": 1.787907398115676e-06, - "loss": 0.8322, + "epoch": 0.8469812630117973, + "grad_norm": 2.3759987354278564, + "learning_rate": 3.27807898811425e-05, + "loss": 0.6409, "step": 4882 }, { - "epoch": 1.7830929340880044, - "grad_norm": 1.3693504333496094, - "learning_rate": 1.781983530634055e-06, - "loss": 0.8158, + "epoch": 0.8471547536433033, + "grad_norm": 0.7334998846054077, + "learning_rate": 3.2775548560259355e-05, + "loss": 0.738, "step": 4883 }, { - "epoch": 1.7834580974986305, - "grad_norm": 1.271196722984314, - "learning_rate": 1.7760690355908682e-06, - "loss": 0.8473, + "epoch": 0.8473282442748091, + "grad_norm": 0.9804109930992126, + "learning_rate": 3.277030575675891e-05, + "loss": 0.7229, "step": 4884 }, { - "epoch": 1.7838232609092568, - "grad_norm": 1.2101701498031616, - "learning_rate": 1.7701639160288775e-06, - "loss": 0.8168, + "epoch": 0.8475017349063151, + "grad_norm": 1.2675697803497314, + "learning_rate": 3.27650614712496e-05, + "loss": 0.7812, "step": 4885 }, { - "epoch": 1.7841884243198831, - "grad_norm": 1.1699579954147339, - "learning_rate": 1.7642681749860346e-06, - "loss": 0.8214, + "epoch": 0.8476752255378209, + "grad_norm": 0.8617616295814514, + "learning_rate": 3.2759815704340034e-05, + "loss": 0.8413, "step": 4886 }, { - "epoch": 1.7845535877305094, - "grad_norm": 1.1995930671691895, - "learning_rate": 1.7583818154954602e-06, - "loss": 0.8109, + "epoch": 0.8478487161693269, + "grad_norm": 0.811677098274231, + "learning_rate": 3.275456845663899e-05, + "loss": 0.6567, "step": 4887 }, { - "epoch": 1.7849187511411357, - "grad_norm": 1.0155256986618042, - "learning_rate": 1.7525048405854562e-06, - "loss": 0.7724, + "epoch": 0.8480222068008327, + "grad_norm": 0.9465263485908508, + "learning_rate": 3.2749319728755415e-05, + "loss": 0.6786, "step": 4888 }, { - "epoch": 1.785283914551762, - "grad_norm": 1.1529591083526611, - "learning_rate": 1.7466372532794818e-06, - "loss": 0.8209, + "epoch": 0.8481956974323387, + "grad_norm": 0.9435146450996399, + "learning_rate": 3.2744069521298424e-05, + "loss": 0.7087, "step": 4889 }, { - "epoch": 1.7856490779623881, - "grad_norm": 1.2713501453399658, - "learning_rate": 1.740779056596178e-06, - "loss": 0.8163, + "epoch": 0.8483691880638445, + "grad_norm": 1.086719274520874, + "learning_rate": 3.273881783487732e-05, + "loss": 0.78, "step": 4890 }, { - "epoch": 1.7860142413730145, - "grad_norm": 1.0870299339294434, - "learning_rate": 1.7349302535493539e-06, - "loss": 0.8145, + "epoch": 0.8485426786953505, + "grad_norm": 0.8270217776298523, + "learning_rate": 3.273356467010156e-05, + "loss": 0.692, "step": 4891 }, { - "epoch": 1.7863794047836405, - "grad_norm": 1.0858241319656372, - "learning_rate": 1.7290908471479805e-06, - "loss": 0.8129, + "epoch": 0.8487161693268563, + "grad_norm": 0.8391377925872803, + "learning_rate": 3.2728310027580786e-05, + "loss": 0.728, "step": 4892 }, { - "epoch": 1.7867445681942669, - "grad_norm": 0.9564353227615356, - "learning_rate": 1.723260840396206e-06, - "loss": 0.8127, + "epoch": 0.8488896599583623, + "grad_norm": 0.8886091709136963, + "learning_rate": 3.27230539079248e-05, + "loss": 0.7102, "step": 4893 }, { - "epoch": 1.7871097316048932, - "grad_norm": 1.125215768814087, - "learning_rate": 1.717440236293324e-06, - "loss": 0.8137, + "epoch": 0.8490631505898681, + "grad_norm": 1.0419142246246338, + "learning_rate": 3.271779631174358e-05, + "loss": 0.6147, "step": 4894 }, { - "epoch": 1.7874748950155195, - "grad_norm": 0.8403804302215576, - "learning_rate": 1.7116290378338085e-06, - "loss": 0.7936, + "epoch": 0.8492366412213741, + "grad_norm": 1.0301762819290161, + "learning_rate": 3.271253723964728e-05, + "loss": 0.6814, "step": 4895 }, { - "epoch": 1.7878400584261458, - "grad_norm": 2.118398904800415, - "learning_rate": 1.7058272480072879e-06, - "loss": 0.7878, + "epoch": 0.84941013185288, + "grad_norm": 1.6135895252227783, + "learning_rate": 3.270727669224622e-05, + "loss": 0.8003, "step": 4896 }, { - "epoch": 1.788205221836772, - "grad_norm": 1.020594835281372, - "learning_rate": 1.7000348697985481e-06, - "loss": 0.8732, + "epoch": 0.8495836224843858, + "grad_norm": 1.7061048746109009, + "learning_rate": 3.2702014670150904e-05, + "loss": 0.7085, "step": 4897 }, { - "epoch": 1.7885703852473982, - "grad_norm": 1.1473095417022705, - "learning_rate": 1.6942519061875361e-06, - "loss": 0.8254, + "epoch": 0.8497571131158917, + "grad_norm": 0.947030782699585, + "learning_rate": 3.269675117397196e-05, + "loss": 0.5907, "step": 4898 }, { - "epoch": 1.7889355486580245, - "grad_norm": 0.9456803798675537, - "learning_rate": 1.6884783601493525e-06, - "loss": 0.804, + "epoch": 0.8499306037473976, + "grad_norm": 0.9155808091163635, + "learning_rate": 3.269148620432027e-05, + "loss": 0.6936, "step": 4899 }, { - "epoch": 1.7893007120686506, - "grad_norm": 1.1878252029418945, - "learning_rate": 1.682714234654259e-06, - "loss": 0.8191, + "epoch": 0.8501040943789036, + "grad_norm": 0.897598922252655, + "learning_rate": 3.268621976180681e-05, + "loss": 0.7324, "step": 4900 }, { - "epoch": 1.789665875479277, - "grad_norm": 0.9535050988197327, - "learning_rate": 1.6769595326676614e-06, - "loss": 0.8013, + "epoch": 0.8502775850104094, + "grad_norm": 0.7420191764831543, + "learning_rate": 3.2680951847042766e-05, + "loss": 0.8225, "step": 4901 }, { - "epoch": 1.7900310388899032, - "grad_norm": 1.1710432767868042, - "learning_rate": 1.6712142571501289e-06, - "loss": 0.8618, + "epoch": 0.8504510756419154, + "grad_norm": 0.9240177869796753, + "learning_rate": 3.267568246063948e-05, + "loss": 0.6619, "step": 4902 }, { - "epoch": 1.7903962023005295, - "grad_norm": 1.1958469152450562, - "learning_rate": 1.6654784110573752e-06, - "loss": 0.8267, + "epoch": 0.8506245662734212, + "grad_norm": 1.591069221496582, + "learning_rate": 3.2670411603208484e-05, + "loss": 0.6476, "step": 4903 }, { - "epoch": 1.7907613657111559, - "grad_norm": 0.9352374076843262, - "learning_rate": 1.6597519973402576e-06, - "loss": 0.7668, + "epoch": 0.8507980569049272, + "grad_norm": 0.8833100199699402, + "learning_rate": 3.2665139275361446e-05, + "loss": 0.8027, "step": 4904 }, { - "epoch": 1.7911265291217822, - "grad_norm": 1.0698031187057495, - "learning_rate": 1.6540350189447885e-06, - "loss": 0.7946, + "epoch": 0.850971547536433, + "grad_norm": 0.8060975074768066, + "learning_rate": 3.2659865477710244e-05, + "loss": 0.8257, "step": 4905 }, { - "epoch": 1.7914916925324083, - "grad_norm": 1.023133635520935, - "learning_rate": 1.6483274788121239e-06, - "loss": 0.8334, + "epoch": 0.851145038167939, + "grad_norm": 1.0106072425842285, + "learning_rate": 3.26545902108669e-05, + "loss": 0.6351, "step": 4906 }, { - "epoch": 1.7918568559430343, - "grad_norm": 1.1895335912704468, - "learning_rate": 1.6426293798785687e-06, - "loss": 0.8551, + "epoch": 0.8513185287994448, + "grad_norm": 1.045235276222229, + "learning_rate": 3.264931347544361e-05, + "loss": 0.6691, "step": 4907 }, { - "epoch": 1.7922220193536607, - "grad_norm": 1.4366601705551147, - "learning_rate": 1.6369407250755598e-06, - "loss": 0.7948, + "epoch": 0.8514920194309508, + "grad_norm": 0.7937108278274536, + "learning_rate": 3.2644035272052756e-05, + "loss": 0.8281, "step": 4908 }, { - "epoch": 1.792587182764287, - "grad_norm": 1.4068704843521118, - "learning_rate": 1.6312615173296853e-06, - "loss": 0.8013, + "epoch": 0.8516655100624566, + "grad_norm": 0.9887246489524841, + "learning_rate": 3.263875560130689e-05, + "loss": 0.7783, "step": 4909 }, { - "epoch": 1.7929523461749133, - "grad_norm": 1.0818917751312256, - "learning_rate": 1.6255917595626681e-06, - "loss": 0.7838, + "epoch": 0.8518390006939626, + "grad_norm": 0.8486551642417908, + "learning_rate": 3.263347446381869e-05, + "loss": 0.7168, "step": 4910 }, { - "epoch": 1.7933175095855396, - "grad_norm": 1.1915884017944336, - "learning_rate": 1.619931454691368e-06, - "loss": 0.8191, + "epoch": 0.8520124913254684, + "grad_norm": 0.8031161427497864, + "learning_rate": 3.262819186020106e-05, + "loss": 0.6602, "step": 4911 }, { - "epoch": 1.793682672996166, - "grad_norm": 2.234466552734375, - "learning_rate": 1.6142806056277937e-06, - "loss": 0.8134, + "epoch": 0.8521859819569744, + "grad_norm": 0.8690439462661743, + "learning_rate": 3.2622907791067056e-05, + "loss": 0.7544, "step": 4912 }, { - "epoch": 1.794047836406792, - "grad_norm": 1.1683375835418701, - "learning_rate": 1.6086392152790709e-06, - "loss": 0.8372, + "epoch": 0.8523594725884802, + "grad_norm": 0.8877313733100891, + "learning_rate": 3.261762225702989e-05, + "loss": 0.7803, "step": 4913 }, { - "epoch": 1.7944129998174183, - "grad_norm": 0.9523593783378601, - "learning_rate": 1.6030072865474733e-06, - "loss": 0.7944, + "epoch": 0.8525329632199862, + "grad_norm": 0.9963412880897522, + "learning_rate": 3.261233525870296e-05, + "loss": 0.6237, "step": 4914 }, { - "epoch": 1.7947781632280444, - "grad_norm": 1.3497326374053955, - "learning_rate": 1.5973848223304012e-06, - "loss": 0.8413, + "epoch": 0.852706453851492, + "grad_norm": 0.725067138671875, + "learning_rate": 3.2607046796699824e-05, + "loss": 0.8271, "step": 4915 }, { - "epoch": 1.7951433266386707, - "grad_norm": 0.9586232304573059, - "learning_rate": 1.5917718255203873e-06, - "loss": 0.8112, + "epoch": 0.8528799444829979, + "grad_norm": 0.9406625628471375, + "learning_rate": 3.260175687163423e-05, + "loss": 0.7043, "step": 4916 }, { - "epoch": 1.795508490049297, - "grad_norm": 0.9739806652069092, - "learning_rate": 1.5861682990050954e-06, - "loss": 0.7865, + "epoch": 0.8530534351145038, + "grad_norm": 1.9307119846343994, + "learning_rate": 3.259646548412005e-05, + "loss": 0.7164, "step": 4917 }, { - "epoch": 1.7958736534599233, - "grad_norm": 1.122029185295105, - "learning_rate": 1.5805742456673101e-06, - "loss": 0.8295, + "epoch": 0.8532269257460097, + "grad_norm": 0.9125030040740967, + "learning_rate": 3.259117263477138e-05, + "loss": 0.6548, "step": 4918 }, { - "epoch": 1.7962388168705496, - "grad_norm": 1.1903799772262573, - "learning_rate": 1.5749896683849474e-06, - "loss": 0.7902, + "epoch": 0.8534004163775156, + "grad_norm": 0.9924011826515198, + "learning_rate": 3.258587832420246e-05, + "loss": 0.9062, "step": 4919 }, { - "epoch": 1.796603980281176, - "grad_norm": 1.0841221809387207, - "learning_rate": 1.5694145700310536e-06, - "loss": 0.7928, + "epoch": 0.8535739070090215, + "grad_norm": 0.938946008682251, + "learning_rate": 3.2580582553027684e-05, + "loss": 0.9026, "step": 4920 }, { - "epoch": 1.796969143691802, - "grad_norm": 1.7442679405212402, - "learning_rate": 1.563848953473792e-06, - "loss": 0.7974, + "epoch": 0.8537473976405274, + "grad_norm": 0.9578953385353088, + "learning_rate": 3.2575285321861656e-05, + "loss": 0.7639, "step": 4921 }, { - "epoch": 1.7973343071024284, - "grad_norm": 1.029191493988037, - "learning_rate": 1.5582928215764481e-06, - "loss": 0.7987, + "epoch": 0.8539208882720333, + "grad_norm": 0.7756941914558411, + "learning_rate": 3.2569986631319104e-05, + "loss": 0.8093, "step": 4922 }, { - "epoch": 1.7976994705130545, - "grad_norm": 0.8104425668716431, - "learning_rate": 1.552746177197424e-06, - "loss": 0.8138, + "epoch": 0.8540943789035392, + "grad_norm": 0.8257486820220947, + "learning_rate": 3.256468648201496e-05, + "loss": 0.6808, "step": 4923 }, { - "epoch": 1.7980646339236808, - "grad_norm": 1.2640546560287476, - "learning_rate": 1.5472090231902504e-06, - "loss": 0.7961, + "epoch": 0.8542678695350451, + "grad_norm": 0.8294185996055603, + "learning_rate": 3.2559384874564305e-05, + "loss": 0.7119, "step": 4924 }, { - "epoch": 1.798429797334307, - "grad_norm": 1.0844556093215942, - "learning_rate": 1.5416813624035688e-06, - "loss": 0.811, + "epoch": 0.854441360166551, + "grad_norm": 1.0248199701309204, + "learning_rate": 3.25540818095824e-05, + "loss": 0.6294, "step": 4925 }, { - "epoch": 1.7987949607449334, - "grad_norm": 1.206799030303955, - "learning_rate": 1.5361631976811397e-06, - "loss": 0.8601, + "epoch": 0.8546148507980569, + "grad_norm": 1.093147873878479, + "learning_rate": 3.254877728768468e-05, + "loss": 0.6522, "step": 4926 }, { - "epoch": 1.7991601241555597, - "grad_norm": 1.1709402799606323, - "learning_rate": 1.5306545318618437e-06, - "loss": 0.7599, + "epoch": 0.8547883414295628, + "grad_norm": 0.6836899518966675, + "learning_rate": 3.254347130948673e-05, + "loss": 0.7617, "step": 4927 }, { - "epoch": 1.799525287566186, - "grad_norm": 0.7518690824508667, - "learning_rate": 1.525155367779656e-06, - "loss": 0.8323, + "epoch": 0.8549618320610687, + "grad_norm": 0.9026235938072205, + "learning_rate": 3.2538163875604316e-05, + "loss": 0.6864, "step": 4928 }, { - "epoch": 1.799890450976812, - "grad_norm": 1.2502812147140503, - "learning_rate": 1.5196657082636845e-06, - "loss": 0.8506, + "epoch": 0.8551353226925746, + "grad_norm": 0.9055376052856445, + "learning_rate": 3.2532854986653375e-05, + "loss": 0.6567, "step": 4929 }, { - "epoch": 1.8002556143874384, - "grad_norm": 1.3050079345703125, - "learning_rate": 1.5141855561381347e-06, - "loss": 0.8334, + "epoch": 0.8553088133240805, + "grad_norm": 0.9900588989257812, + "learning_rate": 3.252754464325001e-05, + "loss": 0.6285, "step": 4930 }, { - "epoch": 1.8006207777980645, - "grad_norm": 1.0599333047866821, - "learning_rate": 1.5087149142223313e-06, - "loss": 0.8416, + "epoch": 0.8554823039555864, + "grad_norm": 0.945695161819458, + "learning_rate": 3.2522232846010496e-05, + "loss": 0.6481, "step": 4931 }, { - "epoch": 1.8009859412086908, - "grad_norm": 1.1715906858444214, - "learning_rate": 1.5032537853306917e-06, - "loss": 0.8215, + "epoch": 0.8556557945870923, + "grad_norm": 0.8635254502296448, + "learning_rate": 3.251691959555126e-05, + "loss": 0.6195, "step": 4932 }, { - "epoch": 1.8013511046193171, - "grad_norm": 1.0878701210021973, - "learning_rate": 1.4978021722727509e-06, - "loss": 0.855, + "epoch": 0.8558292852185982, + "grad_norm": 0.9437160491943359, + "learning_rate": 3.251160489248893e-05, + "loss": 0.7456, "step": 4933 }, { - "epoch": 1.8017162680299434, - "grad_norm": 1.0205191373825073, - "learning_rate": 1.4923600778531456e-06, - "loss": 0.7989, + "epoch": 0.8560027758501041, + "grad_norm": 0.8045133352279663, + "learning_rate": 3.2506288737440265e-05, + "loss": 0.8022, "step": 4934 }, { - "epoch": 1.8020814314405698, - "grad_norm": 0.8168391585350037, - "learning_rate": 1.486927504871616e-06, - "loss": 0.7931, + "epoch": 0.8561762664816099, + "grad_norm": 0.8184975981712341, + "learning_rate": 3.250097113102222e-05, + "loss": 0.7629, "step": 4935 }, { - "epoch": 1.802446594851196, - "grad_norm": 0.7500855326652527, - "learning_rate": 1.481504456123004e-06, - "loss": 0.7979, + "epoch": 0.8563497571131159, + "grad_norm": 0.8181275725364685, + "learning_rate": 3.249565207385192e-05, + "loss": 0.6881, "step": 4936 }, { - "epoch": 1.8028117582618222, - "grad_norm": 0.9906246066093445, - "learning_rate": 1.4760909343972473e-06, - "loss": 0.845, + "epoch": 0.8565232477446217, + "grad_norm": 1.6034315824508667, + "learning_rate": 3.249033156654663e-05, + "loss": 0.7086, "step": 4937 }, { - "epoch": 1.8031769216724485, - "grad_norm": 1.0908602476119995, - "learning_rate": 1.4706869424793847e-06, - "loss": 0.7977, + "epoch": 0.8566967383761277, + "grad_norm": 0.8114014863967896, + "learning_rate": 3.24850096097238e-05, + "loss": 0.8906, "step": 4938 }, { - "epoch": 1.8035420850830746, - "grad_norm": 1.6367558240890503, - "learning_rate": 1.4652924831495563e-06, - "loss": 0.8131, + "epoch": 0.8568702290076335, + "grad_norm": 1.2547131776809692, + "learning_rate": 3.247968620400106e-05, + "loss": 0.6757, "step": 4939 }, { - "epoch": 1.8039072484937009, - "grad_norm": 1.1155606508255005, - "learning_rate": 1.4599075591829915e-06, - "loss": 0.7849, + "epoch": 0.8570437196391395, + "grad_norm": 1.8634538650512695, + "learning_rate": 3.2474361349996205e-05, + "loss": 0.7118, "step": 4940 }, { - "epoch": 1.8042724119043272, - "grad_norm": 1.2644360065460205, - "learning_rate": 1.454532173350025e-06, - "loss": 0.8486, + "epoch": 0.8572172102706453, + "grad_norm": 1.488273024559021, + "learning_rate": 3.2469035048327166e-05, + "loss": 0.7649, "step": 4941 }, { - "epoch": 1.8046375753149535, - "grad_norm": 0.9574212431907654, - "learning_rate": 1.4491663284160694e-06, - "loss": 0.8256, + "epoch": 0.8573907009021513, + "grad_norm": 0.890212893486023, + "learning_rate": 3.2463707299612086e-05, + "loss": 0.6646, "step": 4942 }, { - "epoch": 1.8050027387255798, - "grad_norm": 1.0828630924224854, - "learning_rate": 1.4438100271416367e-06, - "loss": 0.8197, + "epoch": 0.8575641915336571, + "grad_norm": 0.9469761252403259, + "learning_rate": 3.245837810446925e-05, + "loss": 0.7225, "step": 4943 }, { - "epoch": 1.805367902136206, - "grad_norm": 0.9204723238945007, - "learning_rate": 1.4384632722823333e-06, - "loss": 0.8239, + "epoch": 0.8577376821651631, + "grad_norm": 0.9159486889839172, + "learning_rate": 3.245304746351712e-05, + "loss": 0.697, "step": 4944 }, { - "epoch": 1.8057330655468322, - "grad_norm": 1.0041311979293823, - "learning_rate": 1.433126066588848e-06, - "loss": 0.8258, + "epoch": 0.857911172796669, + "grad_norm": 1.2341123819351196, + "learning_rate": 3.244771537737432e-05, + "loss": 0.7354, "step": 4945 }, { - "epoch": 1.8060982289574583, - "grad_norm": 1.1605628728866577, - "learning_rate": 1.4277984128069622e-06, - "loss": 0.7974, + "epoch": 0.8580846634281749, + "grad_norm": 0.8514829277992249, + "learning_rate": 3.2442381846659644e-05, + "loss": 0.7827, "step": 4946 }, { - "epoch": 1.8064633923680846, - "grad_norm": 1.1221680641174316, - "learning_rate": 1.4224803136775323e-06, - "loss": 0.8063, + "epoch": 0.8582581540596808, + "grad_norm": 1.00812566280365, + "learning_rate": 3.243704687199206e-05, + "loss": 0.7859, "step": 4947 }, { - "epoch": 1.806828555778711, - "grad_norm": 1.0430214405059814, - "learning_rate": 1.417171771936514e-06, - "loss": 0.7421, + "epoch": 0.8584316446911867, + "grad_norm": 1.0788377523422241, + "learning_rate": 3.243171045399069e-05, + "loss": 0.769, "step": 4948 }, { - "epoch": 1.8071937191893372, - "grad_norm": 0.7924364805221558, - "learning_rate": 1.4118727903149387e-06, - "loss": 0.7824, + "epoch": 0.8586051353226926, + "grad_norm": 0.8725404739379883, + "learning_rate": 3.2426372593274834e-05, + "loss": 0.7876, "step": 4949 }, { - "epoch": 1.8075588825999636, - "grad_norm": 1.2920478582382202, - "learning_rate": 1.4065833715389143e-06, - "loss": 0.8088, + "epoch": 0.8587786259541985, + "grad_norm": 0.8824693560600281, + "learning_rate": 3.2421033290463966e-05, + "loss": 0.6021, "step": 4950 }, { - "epoch": 1.8079240460105899, - "grad_norm": 0.7410767674446106, - "learning_rate": 1.401303518329642e-06, - "loss": 0.8337, + "epoch": 0.8589521165857044, + "grad_norm": 1.0551276206970215, + "learning_rate": 3.2415692546177714e-05, + "loss": 0.6969, "step": 4951 }, { - "epoch": 1.808289209421216, - "grad_norm": 0.8920024633407593, - "learning_rate": 1.3960332334033844e-06, - "loss": 0.8074, + "epoch": 0.8591256072172103, + "grad_norm": 0.9867327809333801, + "learning_rate": 3.241035036103587e-05, + "loss": 0.7781, "step": 4952 }, { - "epoch": 1.8086543728318423, - "grad_norm": 0.939798891544342, - "learning_rate": 1.3907725194714994e-06, - "loss": 0.8168, + "epoch": 0.8592990978487162, + "grad_norm": 0.8247190117835999, + "learning_rate": 3.24050067356584e-05, + "loss": 0.8586, "step": 4953 }, { - "epoch": 1.8090195362424684, - "grad_norm": 1.1574920415878296, - "learning_rate": 1.3855213792404132e-06, - "loss": 0.8219, + "epoch": 0.8594725884802221, + "grad_norm": 1.0025110244750977, + "learning_rate": 3.239966167066545e-05, + "loss": 0.6736, "step": 4954 }, { - "epoch": 1.8093846996530947, - "grad_norm": 1.326380729675293, - "learning_rate": 1.3802798154116249e-06, - "loss": 0.8767, + "epoch": 0.859646079111728, + "grad_norm": 0.8294478058815002, + "learning_rate": 3.239431516667732e-05, + "loss": 0.6224, "step": 4955 }, { - "epoch": 1.809749863063721, - "grad_norm": 1.5962122678756714, - "learning_rate": 1.3750478306817082e-06, - "loss": 0.7897, + "epoch": 0.8598195697432338, + "grad_norm": 1.1593735218048096, + "learning_rate": 3.2388967224314464e-05, + "loss": 0.7649, "step": 4956 }, { - "epoch": 1.8101150264743473, - "grad_norm": 1.0430737733840942, - "learning_rate": 1.3698254277423083e-06, - "loss": 0.7769, + "epoch": 0.8599930603747398, + "grad_norm": 1.5607370138168335, + "learning_rate": 3.238361784419753e-05, + "loss": 0.7935, "step": 4957 }, { - "epoch": 1.8104801898849736, - "grad_norm": 1.0271413326263428, - "learning_rate": 1.3646126092801425e-06, - "loss": 0.7553, + "epoch": 0.8601665510062456, + "grad_norm": 1.0494519472122192, + "learning_rate": 3.2378267026947314e-05, + "loss": 0.7313, "step": 4958 }, { - "epoch": 1.8108453532956, - "grad_norm": 1.072827696800232, - "learning_rate": 1.359409377976999e-06, - "loss": 0.8316, + "epoch": 0.8603400416377516, + "grad_norm": 1.0768216848373413, + "learning_rate": 3.237291477318478e-05, + "loss": 0.7556, "step": 4959 }, { - "epoch": 1.811210516706226, - "grad_norm": 1.4858274459838867, - "learning_rate": 1.354215736509734e-06, - "loss": 0.801, + "epoch": 0.8605135322692574, + "grad_norm": 1.4303416013717651, + "learning_rate": 3.2367561083531074e-05, + "loss": 0.8481, "step": 4960 }, { - "epoch": 1.8115756801168523, - "grad_norm": 0.715782642364502, - "learning_rate": 1.3490316875502597e-06, - "loss": 0.8197, + "epoch": 0.8606870229007634, + "grad_norm": 0.8928084373474121, + "learning_rate": 3.2362205958607495e-05, + "loss": 0.8169, "step": 4961 }, { - "epoch": 1.8119408435274784, - "grad_norm": 1.2959026098251343, - "learning_rate": 1.3438572337655686e-06, - "loss": 0.7931, + "epoch": 0.8608605135322692, + "grad_norm": 1.0629396438598633, + "learning_rate": 3.235684939903551e-05, + "loss": 0.686, "step": 4962 }, { - "epoch": 1.8123060069381047, - "grad_norm": 0.838121235370636, - "learning_rate": 1.338692377817703e-06, - "loss": 0.7961, + "epoch": 0.8610340041637752, + "grad_norm": 1.1496520042419434, + "learning_rate": 3.235149140543675e-05, + "loss": 0.6565, "step": 4963 }, { - "epoch": 1.812671170348731, - "grad_norm": 1.1482964754104614, - "learning_rate": 1.3335371223637772e-06, - "loss": 0.8, + "epoch": 0.861207494795281, + "grad_norm": 1.4653269052505493, + "learning_rate": 3.234613197843302e-05, + "loss": 0.8027, "step": 4964 }, { - "epoch": 1.8130363337593574, - "grad_norm": 1.3699849843978882, - "learning_rate": 1.3283914700559675e-06, - "loss": 0.7793, + "epoch": 0.861380985426787, + "grad_norm": 0.8582742214202881, + "learning_rate": 3.234077111864629e-05, + "loss": 0.7461, "step": 4965 }, { - "epoch": 1.8134014971699837, - "grad_norm": 0.8411401510238647, - "learning_rate": 1.3232554235414985e-06, - "loss": 0.7751, + "epoch": 0.8615544760582928, + "grad_norm": 0.9596072435379028, + "learning_rate": 3.233540882669869e-05, + "loss": 0.6207, "step": 4966 }, { - "epoch": 1.81376666058061, - "grad_norm": 1.120723009109497, - "learning_rate": 1.3181289854626633e-06, - "loss": 0.8004, + "epoch": 0.8617279666897988, + "grad_norm": 0.9244804978370667, + "learning_rate": 3.233004510321253e-05, + "loss": 0.8032, "step": 4967 }, { - "epoch": 1.814131823991236, - "grad_norm": 1.0042999982833862, - "learning_rate": 1.3130121584568055e-06, - "loss": 0.7881, + "epoch": 0.8619014573213046, + "grad_norm": 0.9032056331634521, + "learning_rate": 3.232467994881026e-05, + "loss": 0.6609, "step": 4968 }, { - "epoch": 1.8144969874018624, - "grad_norm": 0.7997451424598694, - "learning_rate": 1.3079049451563331e-06, - "loss": 0.8368, + "epoch": 0.8620749479528106, + "grad_norm": 0.6906391978263855, + "learning_rate": 3.2319313364114524e-05, + "loss": 0.8367, "step": 4969 }, { - "epoch": 1.8148621508124885, - "grad_norm": 0.9074176549911499, - "learning_rate": 1.3028073481887016e-06, - "loss": 0.7928, + "epoch": 0.8622484385843164, + "grad_norm": 0.8855649828910828, + "learning_rate": 3.2313945349748116e-05, + "loss": 0.7529, "step": 4970 }, { - "epoch": 1.8152273142231148, - "grad_norm": 1.138295292854309, - "learning_rate": 1.2977193701764135e-06, - "loss": 0.8159, + "epoch": 0.8624219292158224, + "grad_norm": 0.8236820101737976, + "learning_rate": 3.2308575906334004e-05, + "loss": 0.8789, "step": 4971 }, { - "epoch": 1.815592477633741, - "grad_norm": 1.0078554153442383, - "learning_rate": 1.2926410137370348e-06, - "loss": 0.8512, + "epoch": 0.8625954198473282, + "grad_norm": 0.691373348236084, + "learning_rate": 3.230320503449531e-05, + "loss": 0.8074, "step": 4972 }, { - "epoch": 1.8159576410443674, - "grad_norm": 1.489197850227356, - "learning_rate": 1.2875722814831737e-06, - "loss": 0.8215, + "epoch": 0.8627689104788342, + "grad_norm": 0.7874361276626587, + "learning_rate": 3.229783273485534e-05, + "loss": 0.7393, "step": 4973 }, { - "epoch": 1.8163228044549937, - "grad_norm": 1.238554835319519, - "learning_rate": 1.2825131760224952e-06, - "loss": 0.8176, + "epoch": 0.86294240111034, + "grad_norm": 0.8611727356910706, + "learning_rate": 3.2292459008037554e-05, + "loss": 0.7268, "step": 4974 }, { - "epoch": 1.8166879678656198, - "grad_norm": 1.3073149919509888, - "learning_rate": 1.2774636999576995e-06, - "loss": 0.8096, + "epoch": 0.8631158917418459, + "grad_norm": 0.8052560687065125, + "learning_rate": 3.2287083854665566e-05, + "loss": 0.6593, "step": 4975 }, { - "epoch": 1.8170531312762461, - "grad_norm": 1.7516006231307983, - "learning_rate": 1.272423855886542e-06, - "loss": 0.853, + "epoch": 0.8632893823733518, + "grad_norm": 0.7947577834129333, + "learning_rate": 3.228170727536319e-05, + "loss": 0.791, "step": 4976 }, { - "epoch": 1.8174182946868722, - "grad_norm": 1.2132388353347778, - "learning_rate": 1.26739364640182e-06, - "loss": 0.7747, + "epoch": 0.8634628730048577, + "grad_norm": 0.7168683409690857, + "learning_rate": 3.227632927075437e-05, + "loss": 0.7281, "step": 4977 }, { - "epoch": 1.8177834580974985, - "grad_norm": 1.2462884187698364, - "learning_rate": 1.262373074091372e-06, - "loss": 0.8101, + "epoch": 0.8636363636363636, + "grad_norm": 1.201189637184143, + "learning_rate": 3.227094984146323e-05, + "loss": 0.6556, "step": 4978 }, { - "epoch": 1.8181486215081248, - "grad_norm": 0.8514479994773865, - "learning_rate": 1.2573621415380832e-06, - "loss": 0.8501, + "epoch": 0.8638098542678695, + "grad_norm": 0.7431387305259705, + "learning_rate": 3.2265568988114075e-05, + "loss": 0.7599, "step": 4979 }, { - "epoch": 1.8185137849187512, - "grad_norm": 1.482871651649475, - "learning_rate": 1.2523608513198803e-06, - "loss": 0.7894, + "epoch": 0.8639833448993754, + "grad_norm": 0.7491896152496338, + "learning_rate": 3.226018671133134e-05, + "loss": 0.781, "step": 4980 }, { - "epoch": 1.8188789483293775, - "grad_norm": 1.5335067510604858, - "learning_rate": 1.247369206009721e-06, - "loss": 0.7999, + "epoch": 0.8641568355308813, + "grad_norm": 0.9482730031013489, + "learning_rate": 3.225480301173964e-05, + "loss": 0.6962, "step": 4981 }, { - "epoch": 1.8192441117400038, - "grad_norm": 1.1428700685501099, - "learning_rate": 1.2423872081756106e-06, - "loss": 0.7686, + "epoch": 0.8643303261623873, + "grad_norm": 1.131107211112976, + "learning_rate": 3.224941788996378e-05, + "loss": 0.6163, "step": 4982 }, { - "epoch": 1.8196092751506299, - "grad_norm": 1.024242639541626, - "learning_rate": 1.2374148603805835e-06, - "loss": 0.7883, + "epoch": 0.8645038167938931, + "grad_norm": 0.889661967754364, + "learning_rate": 3.22440313466287e-05, + "loss": 0.6072, "step": 4983 }, { - "epoch": 1.8199744385612562, - "grad_norm": 0.9019961953163147, - "learning_rate": 1.2324521651827182e-06, - "loss": 0.8087, + "epoch": 0.864677307425399, + "grad_norm": 0.8412299752235413, + "learning_rate": 3.223864338235951e-05, + "loss": 0.6238, "step": 4984 }, { - "epoch": 1.8203396019718823, - "grad_norm": 1.2244259119033813, - "learning_rate": 1.2274991251351166e-06, - "loss": 0.8107, + "epoch": 0.8648507980569049, + "grad_norm": 0.8964213728904724, + "learning_rate": 3.22332539977815e-05, + "loss": 0.6687, "step": 4985 }, { - "epoch": 1.8207047653825086, - "grad_norm": 1.0073013305664062, - "learning_rate": 1.2225557427859203e-06, - "loss": 0.8341, + "epoch": 0.8650242886884109, + "grad_norm": 1.1179722547531128, + "learning_rate": 3.2227863193520115e-05, + "loss": 0.7524, "step": 4986 }, { - "epoch": 1.821069928793135, - "grad_norm": 1.3434255123138428, - "learning_rate": 1.217622020678304e-06, - "loss": 0.8278, + "epoch": 0.8651977793199167, + "grad_norm": 1.1653023958206177, + "learning_rate": 3.222247097020095e-05, + "loss": 0.6572, "step": 4987 }, { - "epoch": 1.8214350922037612, - "grad_norm": 1.134324073791504, - "learning_rate": 1.2126979613504664e-06, - "loss": 0.8456, + "epoch": 0.8653712699514227, + "grad_norm": 0.9420323967933655, + "learning_rate": 3.221707732844979e-05, + "loss": 0.7208, "step": 4988 }, { - "epoch": 1.8218002556143875, - "grad_norm": 1.0145128965377808, - "learning_rate": 1.2077835673356454e-06, - "loss": 0.824, + "epoch": 0.8655447605829285, + "grad_norm": 0.8667080998420715, + "learning_rate": 3.221168226889257e-05, + "loss": 0.8828, "step": 4989 }, { - "epoch": 1.8221654190250138, - "grad_norm": 1.3407748937606812, - "learning_rate": 1.202878841162094e-06, - "loss": 0.8256, + "epoch": 0.8657182512144345, + "grad_norm": 0.6585778594017029, + "learning_rate": 3.220628579215539e-05, + "loss": 0.7305, "step": 4990 }, { - "epoch": 1.82253058243564, - "grad_norm": 1.2486720085144043, - "learning_rate": 1.197983785353094e-06, - "loss": 0.8224, + "epoch": 0.8658917418459403, + "grad_norm": 1.178911805152893, + "learning_rate": 3.220088789886452e-05, + "loss": 0.7495, "step": 4991 }, { - "epoch": 1.8228957458462662, - "grad_norm": 0.8362981081008911, - "learning_rate": 1.1930984024269575e-06, - "loss": 0.8322, + "epoch": 0.8660652324774463, + "grad_norm": 1.1632426977157593, + "learning_rate": 3.21954885896464e-05, + "loss": 0.7273, "step": 4992 }, { - "epoch": 1.8232609092568923, - "grad_norm": 0.8762801885604858, - "learning_rate": 1.1882226948970188e-06, - "loss": 0.8102, + "epoch": 0.8662387231089521, + "grad_norm": 0.8341097831726074, + "learning_rate": 3.219008786512762e-05, + "loss": 0.7283, "step": 4993 }, { - "epoch": 1.8236260726675186, - "grad_norm": 1.2065532207489014, - "learning_rate": 1.1833566652716378e-06, - "loss": 0.8, + "epoch": 0.8664122137404581, + "grad_norm": 1.4920570850372314, + "learning_rate": 3.2184685725934926e-05, + "loss": 0.6819, "step": 4994 }, { - "epoch": 1.823991236078145, - "grad_norm": 1.2027690410614014, - "learning_rate": 1.1785003160541852e-06, - "loss": 0.7914, + "epoch": 0.8665857043719639, + "grad_norm": 0.9339298605918884, + "learning_rate": 3.217928217269527e-05, + "loss": 0.6805, "step": 4995 }, { - "epoch": 1.8243563994887713, - "grad_norm": 1.1477783918380737, - "learning_rate": 1.1736536497430584e-06, - "loss": 0.7568, + "epoch": 0.8667591950034698, + "grad_norm": 0.9702214002609253, + "learning_rate": 3.2173877206035714e-05, + "loss": 0.7505, "step": 4996 }, { - "epoch": 1.8247215628993976, - "grad_norm": 1.3435066938400269, - "learning_rate": 1.168816668831676e-06, - "loss": 0.7961, + "epoch": 0.8669326856349757, + "grad_norm": 0.8499478697776794, + "learning_rate": 3.2168470826583525e-05, + "loss": 0.7695, "step": 4997 }, { - "epoch": 1.8250867263100239, - "grad_norm": 0.9692015647888184, - "learning_rate": 1.1639893758084719e-06, - "loss": 0.8245, + "epoch": 0.8671061762664816, + "grad_norm": 1.3000338077545166, + "learning_rate": 3.2163063034966126e-05, + "loss": 0.7693, "step": 4998 }, { - "epoch": 1.82545188972065, - "grad_norm": 1.433664321899414, - "learning_rate": 1.1591717731568909e-06, - "loss": 0.8165, + "epoch": 0.8672796668979875, + "grad_norm": 0.9412140250205994, + "learning_rate": 3.215765383181108e-05, + "loss": 0.719, "step": 4999 }, { - "epoch": 1.8258170531312763, - "grad_norm": 1.1576590538024902, - "learning_rate": 1.1543638633553945e-06, - "loss": 0.7589, + "epoch": 0.8674531575294934, + "grad_norm": 1.0281404256820679, + "learning_rate": 3.215224321774614e-05, + "loss": 0.7222, "step": 5000 }, { - "epoch": 1.8261822165419024, - "grad_norm": 1.3327196836471558, - "learning_rate": 1.149565648877462e-06, - "loss": 0.8455, + "epoch": 0.8676266481609993, + "grad_norm": 0.9447991251945496, + "learning_rate": 3.2146831193399225e-05, + "loss": 0.7358, "step": 5001 }, { - "epoch": 1.8265473799525287, - "grad_norm": 1.0819675922393799, - "learning_rate": 1.144777132191588e-06, - "loss": 0.8126, + "epoch": 0.8678001387925052, + "grad_norm": 0.8452675938606262, + "learning_rate": 3.214141775939839e-05, + "loss": 0.6597, "step": 5002 }, { - "epoch": 1.826912543363155, - "grad_norm": 0.8900010585784912, - "learning_rate": 1.1399983157612616e-06, - "loss": 0.7733, + "epoch": 0.8679736294240111, + "grad_norm": 0.7685485482215881, + "learning_rate": 3.213600291637187e-05, + "loss": 0.6975, "step": 5003 }, { - "epoch": 1.8272777067737813, - "grad_norm": 1.1520458459854126, - "learning_rate": 1.1352292020449984e-06, - "loss": 0.7869, + "epoch": 0.868147120055517, + "grad_norm": 0.9125070571899414, + "learning_rate": 3.213058666494807e-05, + "loss": 0.7354, "step": 5004 }, { - "epoch": 1.8276428701844076, - "grad_norm": 1.0667668581008911, - "learning_rate": 1.130469793496314e-06, - "loss": 0.8251, + "epoch": 0.8683206106870229, + "grad_norm": 0.9179530143737793, + "learning_rate": 3.2125169005755566e-05, + "loss": 0.6931, "step": 5005 }, { - "epoch": 1.828008033595034, - "grad_norm": 1.1746535301208496, - "learning_rate": 1.1257200925637336e-06, - "loss": 0.8322, + "epoch": 0.8684941013185288, + "grad_norm": 0.9830337166786194, + "learning_rate": 3.2119749939423063e-05, + "loss": 0.6854, "step": 5006 }, { - "epoch": 1.82837319700566, - "grad_norm": 1.8004931211471558, - "learning_rate": 1.1209801016907872e-06, - "loss": 0.8296, + "epoch": 0.8686675919500347, + "grad_norm": 0.766217827796936, + "learning_rate": 3.211432946657946e-05, + "loss": 0.8848, "step": 5007 }, { - "epoch": 1.8287383604162861, - "grad_norm": 1.0191929340362549, - "learning_rate": 1.1162498233160136e-06, - "loss": 0.8049, + "epoch": 0.8688410825815406, + "grad_norm": 0.8001906871795654, + "learning_rate": 3.2108907587853794e-05, + "loss": 0.8046, "step": 5008 }, { - "epoch": 1.8291035238269124, - "grad_norm": 1.2385547161102295, - "learning_rate": 1.1115292598729454e-06, - "loss": 0.8331, + "epoch": 0.8690145732130465, + "grad_norm": 0.6718818545341492, + "learning_rate": 3.210348430387531e-05, + "loss": 0.854, "step": 5009 }, { - "epoch": 1.8294686872375387, - "grad_norm": 1.0617761611938477, - "learning_rate": 1.106818413790125e-06, - "loss": 0.8444, + "epoch": 0.8691880638445524, + "grad_norm": 0.9552324414253235, + "learning_rate": 3.2098059615273354e-05, + "loss": 0.8677, "step": 5010 }, { - "epoch": 1.829833850648165, - "grad_norm": 1.0398308038711548, - "learning_rate": 1.1021172874910957e-06, - "loss": 0.7464, + "epoch": 0.8693615544760583, + "grad_norm": 0.7607008218765259, + "learning_rate": 3.209263352267749e-05, + "loss": 0.7786, "step": 5011 }, { - "epoch": 1.8301990140587914, - "grad_norm": 1.3106063604354858, - "learning_rate": 1.0974258833943985e-06, - "loss": 0.8088, + "epoch": 0.8695350451075642, + "grad_norm": 0.7460717558860779, + "learning_rate": 3.2087206026717415e-05, + "loss": 0.8674, "step": 5012 }, { - "epoch": 1.8305641774694177, - "grad_norm": 0.9229283332824707, - "learning_rate": 1.0927442039135717e-06, - "loss": 0.8271, + "epoch": 0.8697085357390701, + "grad_norm": 0.9010863900184631, + "learning_rate": 3.2081777128023005e-05, + "loss": 0.7217, "step": 5013 }, { - "epoch": 1.8309293408800438, - "grad_norm": 1.2371296882629395, - "learning_rate": 1.0880722514571484e-06, - "loss": 0.814, + "epoch": 0.869882026370576, + "grad_norm": 0.808225691318512, + "learning_rate": 3.207634682722427e-05, + "loss": 0.7058, "step": 5014 }, { - "epoch": 1.83129450429067, - "grad_norm": 1.2277987003326416, - "learning_rate": 1.0834100284286641e-06, - "loss": 0.8074, + "epoch": 0.8700555170020818, + "grad_norm": 1.042389154434204, + "learning_rate": 3.2070915124951406e-05, + "loss": 0.6624, "step": 5015 }, { - "epoch": 1.8316596677012962, - "grad_norm": 1.1591938734054565, - "learning_rate": 1.0787575372266467e-06, - "loss": 0.847, + "epoch": 0.8702290076335878, + "grad_norm": 1.8158230781555176, + "learning_rate": 3.206548202183479e-05, + "loss": 0.6433, "step": 5016 }, { - "epoch": 1.8320248311119225, - "grad_norm": 1.253165364265442, - "learning_rate": 1.0741147802446128e-06, - "loss": 0.8107, + "epoch": 0.8704024982650936, + "grad_norm": 0.8919952511787415, + "learning_rate": 3.206004751850493e-05, + "loss": 0.6427, "step": 5017 }, { - "epoch": 1.8323899945225488, - "grad_norm": 1.0269856452941895, - "learning_rate": 1.0694817598710782e-06, - "loss": 0.7763, + "epoch": 0.8705759888965996, + "grad_norm": 1.0868960618972778, + "learning_rate": 3.20546116155925e-05, + "loss": 0.7698, "step": 5018 }, { - "epoch": 1.8327551579331751, - "grad_norm": 0.8906711339950562, - "learning_rate": 1.0648584784895411e-06, - "loss": 0.7999, + "epoch": 0.8707494795281054, + "grad_norm": 0.8514827489852905, + "learning_rate": 3.204917431372833e-05, + "loss": 0.6451, "step": 5019 }, { - "epoch": 1.8331203213438014, - "grad_norm": 1.1055452823638916, - "learning_rate": 1.0602449384784963e-06, - "loss": 0.7937, + "epoch": 0.8709229701596114, + "grad_norm": 1.2476423978805542, + "learning_rate": 3.204373561354345e-05, + "loss": 0.7588, "step": 5020 }, { - "epoch": 1.8334854847544277, - "grad_norm": 1.4059200286865234, - "learning_rate": 1.0556411422114254e-06, - "loss": 0.8232, + "epoch": 0.8710964607911172, + "grad_norm": 0.8844627737998962, + "learning_rate": 3.2038295515669024e-05, + "loss": 0.8157, "step": 5021 }, { - "epoch": 1.8338506481650538, - "grad_norm": 1.1882480382919312, - "learning_rate": 1.0510470920567983e-06, - "loss": 0.8215, + "epoch": 0.8712699514226232, + "grad_norm": 1.0802786350250244, + "learning_rate": 3.203285402073637e-05, + "loss": 0.6688, "step": 5022 }, { - "epoch": 1.8342158115756801, - "grad_norm": 1.4281370639801025, - "learning_rate": 1.0464627903780689e-06, - "loss": 0.8091, + "epoch": 0.871443442054129, + "grad_norm": 1.2018306255340576, + "learning_rate": 3.202741112937699e-05, + "loss": 0.6482, "step": 5023 }, { - "epoch": 1.8345809749863062, - "grad_norm": 0.8893885612487793, - "learning_rate": 1.041888239533675e-06, - "loss": 0.8401, + "epoch": 0.871616932685635, + "grad_norm": 0.9051889181137085, + "learning_rate": 3.202196684222253e-05, + "loss": 0.6793, "step": 5024 }, { - "epoch": 1.8349461383969325, - "grad_norm": 1.0957190990447998, - "learning_rate": 1.0373234418770385e-06, - "loss": 0.8016, + "epoch": 0.8717904233171409, + "grad_norm": 0.8569917678833008, + "learning_rate": 3.201652115990481e-05, + "loss": 0.8357, "step": 5025 }, { - "epoch": 1.8353113018075589, - "grad_norm": 2.234016180038452, - "learning_rate": 1.0327683997565674e-06, - "loss": 0.7859, + "epoch": 0.8719639139486468, + "grad_norm": 1.8812474012374878, + "learning_rate": 3.2011074083055814e-05, + "loss": 0.7163, "step": 5026 }, { - "epoch": 1.8356764652181852, - "grad_norm": 0.8406291604042053, - "learning_rate": 1.0282231155156498e-06, - "loss": 0.7822, + "epoch": 0.8721374045801527, + "grad_norm": 1.5051785707473755, + "learning_rate": 3.200562561230767e-05, + "loss": 0.7168, "step": 5027 }, { - "epoch": 1.8360416286288115, - "grad_norm": 1.1558398008346558, - "learning_rate": 1.0236875914926458e-06, - "loss": 0.7836, + "epoch": 0.8723108952116586, + "grad_norm": 2.98382830619812, + "learning_rate": 3.200017574829268e-05, + "loss": 0.7844, "step": 5028 }, { - "epoch": 1.8364067920394378, - "grad_norm": 1.6059396266937256, - "learning_rate": 1.0191618300209094e-06, - "loss": 0.8099, + "epoch": 0.8724843858431645, + "grad_norm": 0.8066723346710205, + "learning_rate": 3.199472449164332e-05, + "loss": 0.6998, "step": 5029 }, { - "epoch": 1.8367719554500639, - "grad_norm": 0.8841313719749451, - "learning_rate": 1.0146458334287513e-06, - "loss": 0.8088, + "epoch": 0.8726578764746704, + "grad_norm": 0.7075682878494263, + "learning_rate": 3.19892718429922e-05, + "loss": 0.7988, "step": 5030 }, { - "epoch": 1.8371371188606902, - "grad_norm": 1.26387619972229, - "learning_rate": 1.0101396040394795e-06, - "loss": 0.79, + "epoch": 0.8728313671061763, + "grad_norm": 0.8010169267654419, + "learning_rate": 3.198381780297211e-05, + "loss": 0.8318, "step": 5031 }, { - "epoch": 1.8375022822713163, - "grad_norm": 1.0011826753616333, - "learning_rate": 1.0056431441713643e-06, - "loss": 0.8154, + "epoch": 0.8730048577376822, + "grad_norm": 0.8783714771270752, + "learning_rate": 3.1978362372216006e-05, + "loss": 0.8596, "step": 5032 }, { - "epoch": 1.8378674456819426, - "grad_norm": 1.0266637802124023, - "learning_rate": 1.0011564561376596e-06, - "loss": 0.8237, + "epoch": 0.8731783483691881, + "grad_norm": 0.7289406657218933, + "learning_rate": 3.1972905551356995e-05, + "loss": 0.8552, "step": 5033 }, { - "epoch": 1.838232609092569, - "grad_norm": 1.0087268352508545, - "learning_rate": 9.966795422465792e-07, - "loss": 0.8109, + "epoch": 0.8733518390006939, + "grad_norm": 0.7430582046508789, + "learning_rate": 3.196744734102833e-05, + "loss": 0.782, "step": 5034 }, { - "epoch": 1.8385977725031952, - "grad_norm": 0.8733835816383362, - "learning_rate": 9.922124048013183e-07, - "loss": 0.8057, + "epoch": 0.8735253296321999, + "grad_norm": 1.0886791944503784, + "learning_rate": 3.196198774186347e-05, + "loss": 0.76, "step": 5035 }, { - "epoch": 1.8389629359138215, - "grad_norm": 1.2109812498092651, - "learning_rate": 9.877550461000385e-07, - "loss": 0.8475, + "epoch": 0.8736988202637057, + "grad_norm": 1.0438178777694702, + "learning_rate": 3.195652675449599e-05, + "loss": 0.5934, "step": 5036 }, { - "epoch": 1.8393280993244479, - "grad_norm": 0.7467372417449951, - "learning_rate": 9.833074684358768e-07, - "loss": 0.8219, + "epoch": 0.8738723108952117, + "grad_norm": 0.8648255467414856, + "learning_rate": 3.195106437955964e-05, + "loss": 0.8289, "step": 5037 }, { - "epoch": 1.839693262735074, - "grad_norm": 1.0083929300308228, - "learning_rate": 9.788696740969295e-07, - "loss": 0.8198, + "epoch": 0.8740458015267175, + "grad_norm": 0.9054954051971436, + "learning_rate": 3.194560061768835e-05, + "loss": 0.723, "step": 5038 }, { - "epoch": 1.8400584261457003, - "grad_norm": 0.8368090391159058, - "learning_rate": 9.744416653662636e-07, - "loss": 0.7935, + "epoch": 0.8742192921582235, + "grad_norm": 0.6930512189865112, + "learning_rate": 3.194013546951619e-05, + "loss": 0.7369, "step": 5039 }, { - "epoch": 1.8404235895563263, - "grad_norm": 1.242389440536499, - "learning_rate": 9.700234445219126e-07, - "loss": 0.8275, + "epoch": 0.8743927827897293, + "grad_norm": 0.66817307472229, + "learning_rate": 3.193466893567739e-05, + "loss": 0.8118, "step": 5040 }, { - "epoch": 1.8407887529669527, - "grad_norm": 1.4806021451950073, - "learning_rate": 9.656150138368758e-07, - "loss": 0.7623, + "epoch": 0.8745662734212353, + "grad_norm": 0.8216201066970825, + "learning_rate": 3.1929201016806355e-05, + "loss": 0.7521, "step": 5041 }, { - "epoch": 1.841153916377579, - "grad_norm": 1.8265529870986938, - "learning_rate": 9.612163755791105e-07, - "loss": 0.8308, + "epoch": 0.8747397640527411, + "grad_norm": 0.8151934742927551, + "learning_rate": 3.192373171353765e-05, + "loss": 0.8328, "step": 5042 }, { - "epoch": 1.8415190797882053, - "grad_norm": 1.1082772016525269, - "learning_rate": 9.568275320115438e-07, - "loss": 0.7955, + "epoch": 0.8749132546842471, + "grad_norm": 0.9188916683197021, + "learning_rate": 3.1918261026505974e-05, + "loss": 0.8083, "step": 5043 }, { - "epoch": 1.8418842431988316, - "grad_norm": 1.0090476274490356, - "learning_rate": 9.524484853920524e-07, - "loss": 0.8187, + "epoch": 0.8750867453157529, + "grad_norm": 1.5555962324142456, + "learning_rate": 3.1912788956346226e-05, + "loss": 0.6938, "step": 5044 }, { - "epoch": 1.8422494066094577, - "grad_norm": 1.113504409790039, - "learning_rate": 9.480792379734871e-07, - "loss": 0.7939, + "epoch": 0.8752602359472589, + "grad_norm": 0.8149041533470154, + "learning_rate": 3.1907315503693434e-05, + "loss": 0.7369, "step": 5045 }, { - "epoch": 1.842614570020084, - "grad_norm": 1.0851188898086548, - "learning_rate": 9.437197920036456e-07, - "loss": 0.8149, + "epoch": 0.8754337265787647, + "grad_norm": 1.5108963251113892, + "learning_rate": 3.190184066918281e-05, + "loss": 0.9473, "step": 5046 }, { - "epoch": 1.84297973343071, - "grad_norm": 1.564560055732727, - "learning_rate": 9.393701497252939e-07, - "loss": 0.7972, + "epoch": 0.8756072172102707, + "grad_norm": 1.0369080305099487, + "learning_rate": 3.18963644534497e-05, + "loss": 0.7158, "step": 5047 }, { - "epoch": 1.8433448968413364, - "grad_norm": 0.9749797582626343, - "learning_rate": 9.35030313376144e-07, - "loss": 0.8555, + "epoch": 0.8757807078417765, + "grad_norm": 0.8049048781394958, + "learning_rate": 3.189088685712964e-05, + "loss": 0.7886, "step": 5048 }, { - "epoch": 1.8437100602519627, - "grad_norm": 1.0360527038574219, - "learning_rate": 9.307002851888658e-07, - "loss": 0.8362, + "epoch": 0.8759541984732825, + "grad_norm": 1.158144474029541, + "learning_rate": 3.18854078808583e-05, + "loss": 0.6885, "step": 5049 }, { - "epoch": 1.844075223662589, - "grad_norm": 1.249164342880249, - "learning_rate": 9.263800673910883e-07, - "loss": 0.8551, + "epoch": 0.8761276891047883, + "grad_norm": 0.9332644939422607, + "learning_rate": 3.187992752527153e-05, + "loss": 0.6376, "step": 5050 }, { - "epoch": 1.8444403870732153, - "grad_norm": 1.0993528366088867, - "learning_rate": 9.220696622053915e-07, - "loss": 0.8419, + "epoch": 0.8763011797362943, + "grad_norm": 0.860964834690094, + "learning_rate": 3.187444579100533e-05, + "loss": 0.6747, "step": 5051 }, { - "epoch": 1.8448055504838416, - "grad_norm": 1.2344956398010254, - "learning_rate": 9.177690718493016e-07, - "loss": 0.8253, + "epoch": 0.8764746703678001, + "grad_norm": 1.0333610773086548, + "learning_rate": 3.186896267869585e-05, + "loss": 0.6241, "step": 5052 }, { - "epoch": 1.8451707138944677, - "grad_norm": 1.1123336553573608, - "learning_rate": 9.134782985353019e-07, - "loss": 0.8197, + "epoch": 0.8766481609993061, + "grad_norm": 0.7846736907958984, + "learning_rate": 3.1863478188979424e-05, + "loss": 0.6427, "step": 5053 }, { - "epoch": 1.845535877305094, - "grad_norm": 1.1678645610809326, - "learning_rate": 9.091973444708247e-07, - "loss": 0.8279, + "epoch": 0.8768216516308119, + "grad_norm": 0.7380347847938538, + "learning_rate": 3.1857992322492525e-05, + "loss": 0.7196, "step": 5054 }, { - "epoch": 1.8459010407157201, - "grad_norm": 0.7918805480003357, - "learning_rate": 9.049262118582458e-07, - "loss": 0.8092, + "epoch": 0.8769951422623178, + "grad_norm": 0.7659241557121277, + "learning_rate": 3.185250507987181e-05, + "loss": 0.674, "step": 5055 }, { - "epoch": 1.8462662041263465, - "grad_norm": 1.0166640281677246, - "learning_rate": 9.006649028948966e-07, - "loss": 0.7941, + "epoch": 0.8771686328938237, + "grad_norm": 0.9966756105422974, + "learning_rate": 3.184701646175407e-05, + "loss": 0.7321, "step": 5056 }, { - "epoch": 1.8466313675369728, - "grad_norm": 1.3033148050308228, - "learning_rate": 8.964134197730457e-07, - "loss": 0.8127, + "epoch": 0.8773421235253296, + "grad_norm": 0.8442345261573792, + "learning_rate": 3.184152646877626e-05, + "loss": 0.7888, "step": 5057 }, { - "epoch": 1.846996530947599, - "grad_norm": 1.118239164352417, - "learning_rate": 8.921717646799077e-07, - "loss": 0.8037, + "epoch": 0.8775156141568355, + "grad_norm": 0.9669578671455383, + "learning_rate": 3.183603510157551e-05, + "loss": 0.8511, "step": 5058 }, { - "epoch": 1.8473616943582254, - "grad_norm": 0.9590117335319519, - "learning_rate": 8.879399397976484e-07, - "loss": 0.8562, + "epoch": 0.8776891047883414, + "grad_norm": 0.8399578928947449, + "learning_rate": 3.183054236078909e-05, + "loss": 0.6257, "step": 5059 }, { - "epoch": 1.8477268577688517, - "grad_norm": 1.508384346961975, - "learning_rate": 8.83717947303373e-07, - "loss": 0.8513, + "epoch": 0.8778625954198473, + "grad_norm": 1.26483952999115, + "learning_rate": 3.1825048247054444e-05, + "loss": 0.6122, "step": 5060 }, { - "epoch": 1.8480920211794778, - "grad_norm": 1.153883457183838, - "learning_rate": 8.795057893691239e-07, - "loss": 0.777, + "epoch": 0.8780360860513532, + "grad_norm": 1.1161212921142578, + "learning_rate": 3.181955276100917e-05, + "loss": 0.6343, "step": 5061 }, { - "epoch": 1.848457184590104, - "grad_norm": 0.9769600033760071, - "learning_rate": 8.753034681618877e-07, - "loss": 0.8022, + "epoch": 0.8782095766828591, + "grad_norm": 0.8771495819091797, + "learning_rate": 3.181405590329102e-05, + "loss": 0.7131, "step": 5062 }, { - "epoch": 1.8488223480007302, - "grad_norm": 0.7666865587234497, - "learning_rate": 8.711109858435907e-07, - "loss": 0.8208, + "epoch": 0.878383067314365, + "grad_norm": 1.1585261821746826, + "learning_rate": 3.180855767453793e-05, + "loss": 0.8149, "step": 5063 }, { - "epoch": 1.8491875114113565, - "grad_norm": 1.2945324182510376, - "learning_rate": 8.669283445710985e-07, - "loss": 0.8542, + "epoch": 0.878556557945871, + "grad_norm": 0.8821806311607361, + "learning_rate": 3.180305807538796e-05, + "loss": 0.7015, "step": 5064 }, { - "epoch": 1.8495526748219828, - "grad_norm": 1.3389134407043457, - "learning_rate": 8.627555464962078e-07, - "loss": 0.8329, + "epoch": 0.8787300485773768, + "grad_norm": 0.7033977508544922, + "learning_rate": 3.179755710647934e-05, + "loss": 0.7401, "step": 5065 }, { - "epoch": 1.8499178382326091, - "grad_norm": 1.0302551984786987, - "learning_rate": 8.585925937656636e-07, - "loss": 0.8115, + "epoch": 0.8789035392088828, + "grad_norm": 0.7614237666130066, + "learning_rate": 3.1792054768450466e-05, + "loss": 0.7296, "step": 5066 }, { - "epoch": 1.8502830016432354, - "grad_norm": 1.0978283882141113, - "learning_rate": 8.544394885211305e-07, - "loss": 0.8301, + "epoch": 0.8790770298403886, + "grad_norm": 1.571118712425232, + "learning_rate": 3.1786551061939905e-05, + "loss": 0.6992, "step": 5067 }, { - "epoch": 1.8506481650538618, - "grad_norm": 1.051652431488037, - "learning_rate": 8.502962328992149e-07, - "loss": 0.7955, + "epoch": 0.8792505204718946, + "grad_norm": 0.766777515411377, + "learning_rate": 3.178104598758636e-05, + "loss": 0.9448, "step": 5068 }, { - "epoch": 1.8510133284644879, - "grad_norm": 1.2965326309204102, - "learning_rate": 8.461628290314605e-07, - "loss": 0.7723, + "epoch": 0.8794240111034004, + "grad_norm": 0.7451822757720947, + "learning_rate": 3.17755395460287e-05, + "loss": 0.7161, "step": 5069 }, { - "epoch": 1.8513784918751142, - "grad_norm": 1.5410418510437012, - "learning_rate": 8.420392790443332e-07, - "loss": 0.7853, + "epoch": 0.8795975017349064, + "grad_norm": 0.8063064217567444, + "learning_rate": 3.1770031737905946e-05, + "loss": 0.783, "step": 5070 }, { - "epoch": 1.8517436552857403, - "grad_norm": 1.2286213636398315, - "learning_rate": 8.379255850592404e-07, - "loss": 0.7975, + "epoch": 0.8797709923664122, + "grad_norm": 0.8605402112007141, + "learning_rate": 3.17645225638573e-05, + "loss": 0.6534, "step": 5071 }, { - "epoch": 1.8521088186963666, - "grad_norm": 1.5505956411361694, - "learning_rate": 8.338217491925027e-07, - "loss": 0.8507, + "epoch": 0.8799444829979182, + "grad_norm": 0.7218802571296692, + "learning_rate": 3.1759012024522103e-05, + "loss": 0.6071, "step": 5072 }, { - "epoch": 1.8524739821069929, - "grad_norm": 0.8931164741516113, - "learning_rate": 8.297277735553844e-07, - "loss": 0.7975, + "epoch": 0.880117973629424, + "grad_norm": 0.8733262419700623, + "learning_rate": 3.1753500120539856e-05, + "loss": 0.7344, "step": 5073 }, { - "epoch": 1.8528391455176192, - "grad_norm": 1.4756321907043457, - "learning_rate": 8.256436602540718e-07, - "loss": 0.7788, + "epoch": 0.8802914642609299, + "grad_norm": 4.826393127441406, + "learning_rate": 3.1747986852550225e-05, + "loss": 0.6123, "step": 5074 }, { - "epoch": 1.8532043089282455, - "grad_norm": 1.0872808694839478, - "learning_rate": 8.215694113896777e-07, - "loss": 0.7681, + "epoch": 0.8804649548924358, + "grad_norm": 0.8293867707252502, + "learning_rate": 3.174247222119303e-05, + "loss": 0.6047, "step": 5075 }, { - "epoch": 1.8535694723388716, - "grad_norm": 1.2140264511108398, - "learning_rate": 8.17505029058241e-07, - "loss": 0.8423, + "epoch": 0.8806384455239417, + "grad_norm": 1.0090335607528687, + "learning_rate": 3.173695622710826e-05, + "loss": 0.691, "step": 5076 }, { - "epoch": 1.853934635749498, - "grad_norm": 0.8135843873023987, - "learning_rate": 8.134505153507177e-07, - "loss": 0.827, + "epoch": 0.8808119361554476, + "grad_norm": 0.9846869707107544, + "learning_rate": 3.173143887093603e-05, + "loss": 0.7966, "step": 5077 }, { - "epoch": 1.854299799160124, - "grad_norm": 1.0905756950378418, - "learning_rate": 8.094058723529974e-07, - "loss": 0.7528, + "epoch": 0.8809854267869535, + "grad_norm": 0.8283978700637817, + "learning_rate": 3.172592015331666e-05, + "loss": 0.7131, "step": 5078 }, { - "epoch": 1.8546649625707503, - "grad_norm": 1.104324221611023, - "learning_rate": 8.053711021458843e-07, - "loss": 0.8135, + "epoch": 0.8811589174184594, + "grad_norm": 0.7315669059753418, + "learning_rate": 3.172040007489058e-05, + "loss": 0.8092, "step": 5079 }, { - "epoch": 1.8550301259813766, - "grad_norm": 1.2699737548828125, - "learning_rate": 8.013462068051092e-07, - "loss": 0.7969, + "epoch": 0.8813324080499653, + "grad_norm": 0.9311553239822388, + "learning_rate": 3.171487863629843e-05, + "loss": 0.6564, "step": 5080 }, { - "epoch": 1.855395289392003, - "grad_norm": 0.9700952768325806, - "learning_rate": 7.973311884013158e-07, - "loss": 0.8126, + "epoch": 0.8815058986814712, + "grad_norm": 0.7981926202774048, + "learning_rate": 3.1709355838180953e-05, + "loss": 0.8489, "step": 5081 }, { - "epoch": 1.8557604528026292, - "grad_norm": 1.6214972734451294, - "learning_rate": 7.933260490000694e-07, - "loss": 0.8218, + "epoch": 0.8816793893129771, + "grad_norm": 0.6928908228874207, + "learning_rate": 3.1703831681179096e-05, + "loss": 0.9048, "step": 5082 }, { - "epoch": 1.8561256162132556, - "grad_norm": 1.1548322439193726, - "learning_rate": 7.893307906618575e-07, - "loss": 0.8029, + "epoch": 0.881852879944483, + "grad_norm": 5.535857200622559, + "learning_rate": 3.169830616593392e-05, + "loss": 0.693, "step": 5083 }, { - "epoch": 1.8564907796238816, - "grad_norm": 1.2474943399429321, - "learning_rate": 7.853454154420758e-07, - "loss": 0.8141, + "epoch": 0.8820263705759889, + "grad_norm": 1.2096701860427856, + "learning_rate": 3.169277929308669e-05, + "loss": 0.7411, "step": 5084 }, { - "epoch": 1.856855943034508, - "grad_norm": 1.1113373041152954, - "learning_rate": 7.813699253910423e-07, - "loss": 0.8251, + "epoch": 0.8821998612074948, + "grad_norm": 1.8867725133895874, + "learning_rate": 3.1687251063278806e-05, + "loss": 0.7676, "step": 5085 }, { - "epoch": 1.857221106445134, - "grad_norm": 1.226819634437561, - "learning_rate": 7.774043225539874e-07, - "loss": 0.8047, + "epoch": 0.8823733518390007, + "grad_norm": 0.83200603723526, + "learning_rate": 3.168172147715181e-05, + "loss": 0.7717, "step": 5086 }, { - "epoch": 1.8575862698557604, - "grad_norm": 1.2865041494369507, - "learning_rate": 7.734486089710502e-07, - "loss": 0.8119, + "epoch": 0.8825468424705066, + "grad_norm": 1.0139598846435547, + "learning_rate": 3.167619053534742e-05, + "loss": 0.7095, "step": 5087 }, { - "epoch": 1.8579514332663867, - "grad_norm": 1.0961582660675049, - "learning_rate": 7.695027866772919e-07, - "loss": 0.7949, + "epoch": 0.8827203331020125, + "grad_norm": 0.7081445455551147, + "learning_rate": 3.1670658238507524e-05, + "loss": 0.6997, "step": 5088 }, { - "epoch": 1.858316596677013, - "grad_norm": 1.1340854167938232, - "learning_rate": 7.655668577026798e-07, - "loss": 0.8298, + "epoch": 0.8828938237335184, + "grad_norm": 1.199028730392456, + "learning_rate": 3.1665124587274134e-05, + "loss": 0.7513, "step": 5089 }, { - "epoch": 1.8586817600876393, - "grad_norm": 1.3414506912231445, - "learning_rate": 7.616408240720896e-07, - "loss": 0.8196, + "epoch": 0.8830673143650243, + "grad_norm": 1.0357216596603394, + "learning_rate": 3.1659589582289446e-05, + "loss": 0.7283, "step": 5090 }, { - "epoch": 1.8590469234982656, - "grad_norm": 1.1695104837417603, - "learning_rate": 7.577246878053057e-07, - "loss": 0.7744, + "epoch": 0.8832408049965302, + "grad_norm": 2.1662375926971436, + "learning_rate": 3.16540532241958e-05, + "loss": 0.8323, "step": 5091 }, { - "epoch": 1.8594120869088917, - "grad_norm": 1.0442728996276855, - "learning_rate": 7.538184509170276e-07, - "loss": 0.7988, + "epoch": 0.8834142956280361, + "grad_norm": 0.8247448801994324, + "learning_rate": 3.16485155136357e-05, + "loss": 0.7205, "step": 5092 }, { - "epoch": 1.859777250319518, - "grad_norm": 1.5783740282058716, - "learning_rate": 7.499221154168545e-07, - "loss": 0.847, + "epoch": 0.8835877862595419, + "grad_norm": 1.0994131565093994, + "learning_rate": 3.164297645125179e-05, + "loss": 0.739, "step": 5093 }, { - "epoch": 1.860142413730144, - "grad_norm": 1.1943351030349731, - "learning_rate": 7.460356833092963e-07, - "loss": 0.8181, + "epoch": 0.8837612768910479, + "grad_norm": 0.9191309213638306, + "learning_rate": 3.16374360376869e-05, + "loss": 0.712, "step": 5094 }, { - "epoch": 1.8605075771407704, - "grad_norm": 1.1251246929168701, - "learning_rate": 7.421591565937647e-07, - "loss": 0.8435, + "epoch": 0.8839347675225537, + "grad_norm": 2.6935746669769287, + "learning_rate": 3.1631894273584007e-05, + "loss": 0.6641, "step": 5095 }, { - "epoch": 1.8608727405513967, - "grad_norm": 1.1375538110733032, - "learning_rate": 7.3829253726458e-07, - "loss": 0.8074, + "epoch": 0.8841082581540597, + "grad_norm": 0.7479792833328247, + "learning_rate": 3.1626351159586224e-05, + "loss": 0.6915, "step": 5096 }, { - "epoch": 1.861237903962023, - "grad_norm": 0.9549663066864014, - "learning_rate": 7.344358273109575e-07, - "loss": 0.8403, + "epoch": 0.8842817487855655, + "grad_norm": 0.7383730411529541, + "learning_rate": 3.1620806696336845e-05, + "loss": 0.6537, "step": 5097 }, { - "epoch": 1.8616030673726494, - "grad_norm": 1.0508960485458374, - "learning_rate": 7.305890287170236e-07, - "loss": 0.7547, + "epoch": 0.8844552394170715, + "grad_norm": 0.8478103280067444, + "learning_rate": 3.1615260884479304e-05, + "loss": 0.7214, "step": 5098 }, { - "epoch": 1.8619682307832757, - "grad_norm": 0.9284607172012329, - "learning_rate": 7.267521434618018e-07, - "loss": 0.7739, + "epoch": 0.8846287300485773, + "grad_norm": 0.791926383972168, + "learning_rate": 3.16097137246572e-05, + "loss": 0.7729, "step": 5099 }, { - "epoch": 1.8623333941939018, - "grad_norm": 0.9098523259162903, - "learning_rate": 7.229251735192178e-07, - "loss": 0.8259, + "epoch": 0.8848022206800833, + "grad_norm": 1.1538969278335571, + "learning_rate": 3.1604165217514296e-05, + "loss": 0.6815, "step": 5100 }, { - "epoch": 1.862698557604528, - "grad_norm": 1.4012022018432617, - "learning_rate": 7.191081208580874e-07, - "loss": 0.8481, + "epoch": 0.8849757113115891, + "grad_norm": 0.7147714495658875, + "learning_rate": 3.15986153636945e-05, + "loss": 0.8384, "step": 5101 }, { - "epoch": 1.8630637210151542, - "grad_norm": 0.8641846776008606, - "learning_rate": 7.153009874421357e-07, - "loss": 0.8018, + "epoch": 0.8851492019430951, + "grad_norm": 0.7775940299034119, + "learning_rate": 3.159306416384187e-05, + "loss": 0.7979, "step": 5102 }, { - "epoch": 1.8634288844257805, - "grad_norm": 1.0086969137191772, - "learning_rate": 7.11503775229978e-07, - "loss": 0.8357, + "epoch": 0.885322692574601, + "grad_norm": 1.0154776573181152, + "learning_rate": 3.158751161860063e-05, + "loss": 0.6829, "step": 5103 }, { - "epoch": 1.8637940478364068, - "grad_norm": 1.019025206565857, - "learning_rate": 7.077164861751318e-07, - "loss": 0.8381, + "epoch": 0.8854961832061069, + "grad_norm": 0.9802752733230591, + "learning_rate": 3.158195772861517e-05, + "loss": 0.7129, "step": 5104 }, { - "epoch": 1.864159211247033, - "grad_norm": 0.8930433392524719, - "learning_rate": 7.039391222260005e-07, - "loss": 0.8418, + "epoch": 0.8856696738376127, + "grad_norm": 1.1216245889663696, + "learning_rate": 3.1576402494530025e-05, + "loss": 0.6874, "step": 5105 }, { - "epoch": 1.8645243746576594, - "grad_norm": 0.9140726327896118, - "learning_rate": 7.001716853258877e-07, - "loss": 0.7955, + "epoch": 0.8858431644691187, + "grad_norm": 0.803287148475647, + "learning_rate": 3.1570845916989875e-05, + "loss": 0.6492, "step": 5106 }, { - "epoch": 1.8648895380682857, - "grad_norm": 0.9555326104164124, - "learning_rate": 6.964141774129873e-07, - "loss": 0.8207, + "epoch": 0.8860166551006246, + "grad_norm": 0.8395341038703918, + "learning_rate": 3.156528799663957e-05, + "loss": 0.7637, "step": 5107 }, { - "epoch": 1.8652547014789118, - "grad_norm": 0.9261258840560913, - "learning_rate": 6.926666004203908e-07, - "loss": 0.806, + "epoch": 0.8861901457321305, + "grad_norm": 0.9133461713790894, + "learning_rate": 3.1559728734124125e-05, + "loss": 0.7328, "step": 5108 }, { - "epoch": 1.865619864889538, - "grad_norm": 1.190231442451477, - "learning_rate": 6.889289562760738e-07, - "loss": 0.7943, + "epoch": 0.8863636363636364, + "grad_norm": 1.0140420198440552, + "learning_rate": 3.155416813008869e-05, + "loss": 0.7026, "step": 5109 }, { - "epoch": 1.8659850283001642, - "grad_norm": 0.9860614538192749, - "learning_rate": 6.852012469029046e-07, - "loss": 0.8113, + "epoch": 0.8865371269951423, + "grad_norm": 0.6723513007164001, + "learning_rate": 3.154860618517858e-05, + "loss": 0.7493, "step": 5110 }, { - "epoch": 1.8663501917107905, - "grad_norm": 1.1597843170166016, - "learning_rate": 6.814834742186361e-07, - "loss": 0.7731, + "epoch": 0.8867106176266482, + "grad_norm": 1.2454440593719482, + "learning_rate": 3.154304290003926e-05, + "loss": 0.626, "step": 5111 }, { - "epoch": 1.8667153551214168, - "grad_norm": 1.1094763278961182, - "learning_rate": 6.777756401359159e-07, - "loss": 0.8127, + "epoch": 0.8868841082581541, + "grad_norm": 0.7305765748023987, + "learning_rate": 3.1537478275316364e-05, + "loss": 0.6683, "step": 5112 }, { - "epoch": 1.8670805185320432, - "grad_norm": 0.8625651597976685, - "learning_rate": 6.740777465622784e-07, - "loss": 0.812, + "epoch": 0.88705759888966, + "grad_norm": 1.1363459825515747, + "learning_rate": 3.153191231165568e-05, + "loss": 0.6858, "step": 5113 }, { - "epoch": 1.8674456819426695, - "grad_norm": 1.2039518356323242, - "learning_rate": 6.703897954001392e-07, - "loss": 0.8525, + "epoch": 0.8872310895211658, + "grad_norm": 1.2262595891952515, + "learning_rate": 3.152634500970312e-05, + "loss": 0.7214, "step": 5114 }, { - "epoch": 1.8678108453532956, - "grad_norm": 1.124470591545105, - "learning_rate": 6.667117885468011e-07, - "loss": 0.8408, + "epoch": 0.8874045801526718, + "grad_norm": 1.088165521621704, + "learning_rate": 3.15207763701048e-05, + "loss": 0.6647, "step": 5115 }, { - "epoch": 1.8681760087639219, - "grad_norm": 1.108760952949524, - "learning_rate": 6.630437278944501e-07, - "loss": 0.8096, + "epoch": 0.8875780707841776, + "grad_norm": 0.8008639216423035, + "learning_rate": 3.151520639350695e-05, + "loss": 0.6569, "step": 5116 }, { - "epoch": 1.868541172174548, - "grad_norm": 0.9019249081611633, - "learning_rate": 6.59385615330157e-07, - "loss": 0.8502, + "epoch": 0.8877515614156836, + "grad_norm": 1.0471192598342896, + "learning_rate": 3.150963508055599e-05, + "loss": 0.6475, "step": 5117 }, { - "epoch": 1.8689063355851743, - "grad_norm": 0.9353397488594055, - "learning_rate": 6.557374527358762e-07, - "loss": 0.801, + "epoch": 0.8879250520471894, + "grad_norm": 0.767434298992157, + "learning_rate": 3.150406243189846e-05, + "loss": 0.7573, "step": 5118 }, { - "epoch": 1.8692714989958006, - "grad_norm": 1.0525779724121094, - "learning_rate": 6.520992419884398e-07, - "loss": 0.8155, + "epoch": 0.8880985426786954, + "grad_norm": 0.8699899911880493, + "learning_rate": 3.1498488448181074e-05, + "loss": 0.8276, "step": 5119 }, { - "epoch": 1.869636662406427, - "grad_norm": 1.1534416675567627, - "learning_rate": 6.484709849595572e-07, - "loss": 0.8314, + "epoch": 0.8882720333102012, + "grad_norm": 0.7862910628318787, + "learning_rate": 3.1492913130050715e-05, + "loss": 0.8362, "step": 5120 }, { - "epoch": 1.8700018258170532, - "grad_norm": 1.1228749752044678, - "learning_rate": 6.448526835158264e-07, - "loss": 0.815, + "epoch": 0.8884455239417072, + "grad_norm": 0.7552646398544312, + "learning_rate": 3.14873364781544e-05, + "loss": 0.8245, "step": 5121 }, { - "epoch": 1.8703669892276795, - "grad_norm": 1.017716884613037, - "learning_rate": 6.41244339518714e-07, - "loss": 0.8676, + "epoch": 0.888619014573213, + "grad_norm": 0.7258608341217041, + "learning_rate": 3.1481758493139295e-05, + "loss": 0.7671, "step": 5122 }, { - "epoch": 1.8707321526383056, - "grad_norm": 1.4886960983276367, - "learning_rate": 6.37645954824575e-07, - "loss": 0.8163, + "epoch": 0.888792505204719, + "grad_norm": 0.6883817911148071, + "learning_rate": 3.147617917565273e-05, + "loss": 0.7844, "step": 5123 }, { - "epoch": 1.871097316048932, - "grad_norm": 1.4005881547927856, - "learning_rate": 6.340575312846287e-07, - "loss": 0.7966, + "epoch": 0.8889659958362248, + "grad_norm": 0.820789098739624, + "learning_rate": 3.147059852634221e-05, + "loss": 0.7761, "step": 5124 }, { - "epoch": 1.871462479459558, - "grad_norm": 1.0804754495620728, - "learning_rate": 6.304790707449738e-07, - "loss": 0.7721, + "epoch": 0.8891394864677308, + "grad_norm": 0.8716245889663696, + "learning_rate": 3.146501654585537e-05, + "loss": 0.6459, "step": 5125 }, { - "epoch": 1.8718276428701843, - "grad_norm": 1.1676632165908813, - "learning_rate": 6.269105750465843e-07, - "loss": 0.8322, + "epoch": 0.8893129770992366, + "grad_norm": 0.7219957113265991, + "learning_rate": 3.145943323483999e-05, + "loss": 0.6951, "step": 5126 }, { - "epoch": 1.8721928062808106, - "grad_norm": 1.155674695968628, - "learning_rate": 6.233520460253117e-07, - "loss": 0.8248, + "epoch": 0.8894864677307426, + "grad_norm": 1.119250774383545, + "learning_rate": 3.145384859394403e-05, + "loss": 0.6592, "step": 5127 }, { - "epoch": 1.872557969691437, - "grad_norm": 1.156808853149414, - "learning_rate": 6.198034855118784e-07, - "loss": 0.7833, + "epoch": 0.8896599583622484, + "grad_norm": 1.2952008247375488, + "learning_rate": 3.144826262381559e-05, + "loss": 0.6191, "step": 5128 }, { - "epoch": 1.8729231331020633, - "grad_norm": 1.5410116910934448, - "learning_rate": 6.162648953318684e-07, - "loss": 0.7758, + "epoch": 0.8898334489937544, + "grad_norm": 0.8841040134429932, + "learning_rate": 3.144267532510295e-05, + "loss": 0.6812, "step": 5129 }, { - "epoch": 1.8732882965126896, - "grad_norm": 1.1243414878845215, - "learning_rate": 6.1273627730575e-07, - "loss": 0.7775, + "epoch": 0.8900069396252602, + "grad_norm": 0.7857872843742371, + "learning_rate": 3.143708669845449e-05, + "loss": 0.6965, "step": 5130 }, { - "epoch": 1.8736534599233157, - "grad_norm": 1.3026496171951294, - "learning_rate": 6.092176332488553e-07, - "loss": 0.8101, + "epoch": 0.8901804302567662, + "grad_norm": 1.0300575494766235, + "learning_rate": 3.1431496744518794e-05, + "loss": 0.8135, "step": 5131 }, { - "epoch": 1.874018623333942, - "grad_norm": 0.9167211651802063, - "learning_rate": 6.057089649713832e-07, - "loss": 0.8145, + "epoch": 0.890353920888272, + "grad_norm": 1.378900408744812, + "learning_rate": 3.142590546394459e-05, + "loss": 0.6819, "step": 5132 }, { - "epoch": 1.874383786744568, - "grad_norm": 1.1009725332260132, - "learning_rate": 6.022102742784075e-07, - "loss": 0.8161, + "epoch": 0.8905274115197779, + "grad_norm": 0.8480303883552551, + "learning_rate": 3.142031285738073e-05, + "loss": 0.7474, "step": 5133 }, { - "epoch": 1.8747489501551944, - "grad_norm": 1.2465821504592896, - "learning_rate": 5.987215629698595e-07, - "loss": 0.7832, + "epoch": 0.8907009021512838, + "grad_norm": 1.0328832864761353, + "learning_rate": 3.141471892547627e-05, + "loss": 0.7358, "step": 5134 }, { - "epoch": 1.8751141135658207, - "grad_norm": 0.9862917065620422, - "learning_rate": 5.952428328405413e-07, - "loss": 0.8013, + "epoch": 0.8908743927827897, + "grad_norm": 0.7811802625656128, + "learning_rate": 3.140912366888037e-05, + "loss": 0.6675, "step": 5135 }, { - "epoch": 1.875479276976447, - "grad_norm": 1.110976219177246, - "learning_rate": 5.917740856801235e-07, - "loss": 0.8184, + "epoch": 0.8910478834142956, + "grad_norm": 0.8336309790611267, + "learning_rate": 3.140352708824237e-05, + "loss": 0.6184, "step": 5136 }, { - "epoch": 1.8758444403870733, - "grad_norm": 1.0587036609649658, - "learning_rate": 5.88315323273132e-07, - "loss": 0.8092, + "epoch": 0.8912213740458015, + "grad_norm": 0.9653008580207825, + "learning_rate": 3.139792918421177e-05, + "loss": 0.636, "step": 5137 }, { - "epoch": 1.8762096037976996, - "grad_norm": 1.1086170673370361, - "learning_rate": 5.848665473989679e-07, - "loss": 0.8206, + "epoch": 0.8913948646773074, + "grad_norm": 0.9215421080589294, + "learning_rate": 3.1392329957438203e-05, + "loss": 0.6373, "step": 5138 }, { - "epoch": 1.8765747672083257, - "grad_norm": 1.0342856645584106, - "learning_rate": 5.814277598318808e-07, - "loss": 0.7773, + "epoch": 0.8915683553088133, + "grad_norm": 0.9746629595756531, + "learning_rate": 3.1386729408571467e-05, + "loss": 0.7715, "step": 5139 }, { - "epoch": 1.876939930618952, - "grad_norm": 1.3455678224563599, - "learning_rate": 5.779989623409932e-07, - "loss": 0.7678, + "epoch": 0.8917418459403192, + "grad_norm": 0.7721441388130188, + "learning_rate": 3.1381127538261524e-05, + "loss": 0.8091, "step": 5140 }, { - "epoch": 1.8773050940295781, - "grad_norm": 0.8918232917785645, - "learning_rate": 5.745801566902831e-07, - "loss": 0.8312, + "epoch": 0.8919153365718251, + "grad_norm": 1.2780160903930664, + "learning_rate": 3.137552434715846e-05, + "loss": 0.6785, "step": 5141 }, { - "epoch": 1.8776702574402044, - "grad_norm": 1.263620138168335, - "learning_rate": 5.71171344638588e-07, - "loss": 0.8141, + "epoch": 0.892088827203331, + "grad_norm": 1.5337705612182617, + "learning_rate": 3.136991983591255e-05, + "loss": 0.6678, "step": 5142 }, { - "epoch": 1.8780354208508308, - "grad_norm": 1.2042826414108276, - "learning_rate": 5.677725279396096e-07, - "loss": 0.7804, + "epoch": 0.8922623178348369, + "grad_norm": 0.7886557579040527, + "learning_rate": 3.1364314005174184e-05, + "loss": 0.8337, "step": 5143 }, { - "epoch": 1.878400584261457, - "grad_norm": 1.140391230583191, - "learning_rate": 5.643837083418957e-07, - "loss": 0.8226, + "epoch": 0.8924358084663429, + "grad_norm": 0.7030831575393677, + "learning_rate": 3.1358706855593935e-05, + "loss": 0.6714, "step": 5144 }, { - "epoch": 1.8787657476720834, - "grad_norm": 1.3010114431381226, - "learning_rate": 5.610048875888607e-07, - "loss": 0.8431, + "epoch": 0.8926092990978487, + "grad_norm": 0.7563207745552063, + "learning_rate": 3.135309838782253e-05, + "loss": 0.801, "step": 5145 }, { - "epoch": 1.8791309110827095, - "grad_norm": 1.2075308561325073, - "learning_rate": 5.57636067418772e-07, - "loss": 0.7869, + "epoch": 0.8927827897293547, + "grad_norm": 0.9161301851272583, + "learning_rate": 3.1347488602510824e-05, + "loss": 0.7954, "step": 5146 }, { - "epoch": 1.8794960744933358, - "grad_norm": 1.1196225881576538, - "learning_rate": 5.542772495647563e-07, - "loss": 0.7822, + "epoch": 0.8929562803608605, + "grad_norm": 0.9876148104667664, + "learning_rate": 3.134187750030984e-05, + "loss": 0.7773, "step": 5147 }, { - "epoch": 1.8798612379039619, - "grad_norm": 1.2089276313781738, - "learning_rate": 5.509284357547873e-07, - "loss": 0.7818, + "epoch": 0.8931297709923665, + "grad_norm": 0.8576440215110779, + "learning_rate": 3.133626508187076e-05, + "loss": 0.6675, "step": 5148 }, { - "epoch": 1.8802264013145882, - "grad_norm": 1.9129070043563843, - "learning_rate": 5.475896277116954e-07, - "loss": 0.8225, + "epoch": 0.8933032616238723, + "grad_norm": 1.3703583478927612, + "learning_rate": 3.133065134784491e-05, + "loss": 0.6199, "step": 5149 }, { - "epoch": 1.8805915647252145, - "grad_norm": 1.0082679986953735, - "learning_rate": 5.442608271531602e-07, - "loss": 0.8174, + "epoch": 0.8934767522553783, + "grad_norm": 1.4603122472763062, + "learning_rate": 3.132503629888376e-05, + "loss": 0.8995, "step": 5150 }, { - "epoch": 1.8809567281358408, - "grad_norm": 1.0171728134155273, - "learning_rate": 5.409420357917205e-07, - "loss": 0.8115, + "epoch": 0.8936502428868841, + "grad_norm": 0.822787880897522, + "learning_rate": 3.131941993563896e-05, + "loss": 0.8011, "step": 5151 }, { - "epoch": 1.8813218915464671, - "grad_norm": 1.1920099258422852, - "learning_rate": 5.376332553347618e-07, - "loss": 0.8124, + "epoch": 0.8938237335183901, + "grad_norm": 1.3452613353729248, + "learning_rate": 3.131380225876228e-05, + "loss": 0.6204, "step": 5152 }, { - "epoch": 1.8816870549570934, - "grad_norm": 1.0130373239517212, - "learning_rate": 5.34334487484518e-07, - "loss": 0.8121, + "epoch": 0.8939972241498959, + "grad_norm": 0.8480443358421326, + "learning_rate": 3.130818326890567e-05, + "loss": 0.6729, "step": 5153 }, { - "epoch": 1.8820522183677195, - "grad_norm": 1.3202323913574219, - "learning_rate": 5.310457339380693e-07, - "loss": 0.815, + "epoch": 0.8941707147814018, + "grad_norm": 0.9972120523452759, + "learning_rate": 3.130256296672121e-05, + "loss": 0.8699, "step": 5154 }, { - "epoch": 1.8824173817783458, - "grad_norm": 1.0110034942626953, - "learning_rate": 5.277669963873489e-07, - "loss": 0.8189, + "epoch": 0.8943442054129077, + "grad_norm": 0.8395876884460449, + "learning_rate": 3.1296941352861146e-05, + "loss": 0.6404, "step": 5155 }, { - "epoch": 1.882782545188972, - "grad_norm": 1.1695104837417603, - "learning_rate": 5.244982765191387e-07, - "loss": 0.7786, + "epoch": 0.8945176960444136, + "grad_norm": 1.5504693984985352, + "learning_rate": 3.1291318427977876e-05, + "loss": 0.77, "step": 5156 }, { - "epoch": 1.8831477085995982, - "grad_norm": 1.00706148147583, - "learning_rate": 5.212395760150623e-07, - "loss": 0.8333, + "epoch": 0.8946911866759195, + "grad_norm": 0.785139799118042, + "learning_rate": 3.128569419272395e-05, + "loss": 0.7412, "step": 5157 }, { - "epoch": 1.8835128720102245, - "grad_norm": 1.5423526763916016, - "learning_rate": 5.1799089655159e-07, - "loss": 0.8429, + "epoch": 0.8948646773074254, + "grad_norm": 0.8977523446083069, + "learning_rate": 3.1280068647752066e-05, + "loss": 0.6548, "step": 5158 }, { - "epoch": 1.8838780354208509, - "grad_norm": 1.2426701784133911, - "learning_rate": 5.14752239800036e-07, - "loss": 0.7607, + "epoch": 0.8950381679389313, + "grad_norm": 0.893190324306488, + "learning_rate": 3.127444179371506e-05, + "loss": 0.6333, "step": 5159 }, { - "epoch": 1.8842431988314772, - "grad_norm": 0.9250880479812622, - "learning_rate": 5.115236074265606e-07, - "loss": 0.7959, + "epoch": 0.8952116585704372, + "grad_norm": 0.7139684557914734, + "learning_rate": 3.126881363126595e-05, + "loss": 0.8777, "step": 5160 }, { - "epoch": 1.8846083622421035, - "grad_norm": 1.3435306549072266, - "learning_rate": 5.083050010921642e-07, - "loss": 0.8333, + "epoch": 0.8953851492019431, + "grad_norm": 0.9105933308601379, + "learning_rate": 3.126318416105789e-05, + "loss": 0.7683, "step": 5161 }, { - "epoch": 1.8849735256527296, - "grad_norm": 1.1707812547683716, - "learning_rate": 5.050964224526956e-07, - "loss": 0.7909, + "epoch": 0.895558639833449, + "grad_norm": 0.6696258783340454, + "learning_rate": 3.1257553383744186e-05, + "loss": 0.7336, "step": 5162 }, { - "epoch": 1.8853386890633559, - "grad_norm": 1.9985002279281616, - "learning_rate": 5.018978731588342e-07, - "loss": 0.7512, + "epoch": 0.8957321304649549, + "grad_norm": 0.9180962443351746, + "learning_rate": 3.125192129997829e-05, + "loss": 0.6672, "step": 5163 }, { - "epoch": 1.885703852473982, - "grad_norm": 0.9456954002380371, - "learning_rate": 4.987093548561062e-07, - "loss": 0.7953, + "epoch": 0.8959056210964608, + "grad_norm": 0.7981016039848328, + "learning_rate": 3.1246287910413824e-05, + "loss": 0.7734, "step": 5164 }, { - "epoch": 1.8860690158846083, - "grad_norm": 1.0280941724777222, - "learning_rate": 4.95530869184877e-07, - "loss": 0.8087, + "epoch": 0.8960791117279667, + "grad_norm": 0.9008550047874451, + "learning_rate": 3.124065321570453e-05, + "loss": 0.7323, "step": 5165 }, { - "epoch": 1.8864341792952346, - "grad_norm": 1.0113369226455688, - "learning_rate": 4.923624177803498e-07, - "loss": 0.8325, + "epoch": 0.8962526023594726, + "grad_norm": 0.7848231792449951, + "learning_rate": 3.123501721650434e-05, + "loss": 0.681, "step": 5166 }, { - "epoch": 1.886799342705861, - "grad_norm": 0.9669683575630188, - "learning_rate": 4.892040022725675e-07, - "loss": 0.8186, + "epoch": 0.8964260929909785, + "grad_norm": 0.8776686191558838, + "learning_rate": 3.1229379913467305e-05, + "loss": 0.6619, "step": 5167 }, { - "epoch": 1.8871645061164872, - "grad_norm": 1.2961219549179077, - "learning_rate": 4.860556242864034e-07, - "loss": 0.7874, + "epoch": 0.8965995836224844, + "grad_norm": 0.9436081051826477, + "learning_rate": 3.122374130724765e-05, + "loss": 0.72, "step": 5168 }, { - "epoch": 1.8875296695271135, - "grad_norm": 0.8871257305145264, - "learning_rate": 4.829172854415775e-07, - "loss": 0.8096, + "epoch": 0.8967730742539903, + "grad_norm": 0.8813380599021912, + "learning_rate": 3.1218101398499736e-05, + "loss": 0.671, "step": 5169 }, { - "epoch": 1.8878948329377396, - "grad_norm": 2.020317792892456, - "learning_rate": 4.79788987352634e-07, - "loss": 0.7142, + "epoch": 0.8969465648854962, + "grad_norm": 1.034523606300354, + "learning_rate": 3.1212460187878085e-05, + "loss": 0.6672, "step": 5170 }, { - "epoch": 1.888259996348366, - "grad_norm": 1.389561653137207, - "learning_rate": 4.7667073162896315e-07, - "loss": 0.835, + "epoch": 0.8971200555170021, + "grad_norm": 0.9077121615409851, + "learning_rate": 3.1206817676037365e-05, + "loss": 0.7426, "step": 5171 }, { - "epoch": 1.888625159758992, - "grad_norm": 0.9510175585746765, - "learning_rate": 4.7356251987477507e-07, - "loss": 0.7982, + "epoch": 0.897293546148508, + "grad_norm": 0.8170250058174133, + "learning_rate": 3.1201173863632396e-05, + "loss": 0.8386, "step": 5172 }, { - "epoch": 1.8889903231696183, - "grad_norm": 1.452484369277954, - "learning_rate": 4.7046435368912404e-07, - "loss": 0.8344, + "epoch": 0.8974670367800138, + "grad_norm": 0.7554052472114563, + "learning_rate": 3.119552875131814e-05, + "loss": 0.7131, "step": 5173 }, { - "epoch": 1.8893554865802447, - "grad_norm": 1.2465940713882446, - "learning_rate": 4.6737623466589055e-07, - "loss": 0.7934, + "epoch": 0.8976405274115198, + "grad_norm": 0.8118773698806763, + "learning_rate": 3.1189882339749735e-05, + "loss": 0.6802, "step": 5174 }, { - "epoch": 1.889720649990871, - "grad_norm": 1.1198451519012451, - "learning_rate": 4.642981643937905e-07, - "loss": 0.7667, + "epoch": 0.8978140180430256, + "grad_norm": 0.8132635951042175, + "learning_rate": 3.1184234629582444e-05, + "loss": 0.7208, "step": 5175 }, { - "epoch": 1.8900858134014973, - "grad_norm": 1.1426713466644287, - "learning_rate": 4.6123014445636605e-07, - "loss": 0.8182, + "epoch": 0.8979875086745316, + "grad_norm": 0.7176066040992737, + "learning_rate": 3.117858562147169e-05, + "loss": 0.6908, "step": 5176 }, { - "epoch": 1.8904509768121234, - "grad_norm": 1.3373569250106812, - "learning_rate": 4.581721764319924e-07, - "loss": 0.7656, + "epoch": 0.8981609993060374, + "grad_norm": 0.7513592839241028, + "learning_rate": 3.117293531607305e-05, + "loss": 0.75, "step": 5177 }, { - "epoch": 1.8908161402227497, - "grad_norm": 0.9231318831443787, - "learning_rate": 4.5512426189386674e-07, - "loss": 0.8206, + "epoch": 0.8983344899375434, + "grad_norm": 0.9760264158248901, + "learning_rate": 3.116728371404225e-05, + "loss": 0.6597, "step": 5178 }, { - "epoch": 1.8911813036333758, - "grad_norm": 1.0900070667266846, - "learning_rate": 4.520864024100191e-07, - "loss": 0.8234, + "epoch": 0.8985079805690492, + "grad_norm": 1.0758423805236816, + "learning_rate": 3.116163081603516e-05, + "loss": 0.642, "step": 5179 }, { - "epoch": 1.891546467044002, - "grad_norm": 0.8621076941490173, - "learning_rate": 4.4905859954331057e-07, - "loss": 0.8115, + "epoch": 0.8986814712005552, + "grad_norm": 0.8656315803527832, + "learning_rate": 3.115597662270781e-05, + "loss": 0.6864, "step": 5180 }, { - "epoch": 1.8919116304546284, - "grad_norm": 0.8874961137771606, - "learning_rate": 4.460408548514239e-07, - "loss": 0.8162, + "epoch": 0.898854961832061, + "grad_norm": 0.8485645651817322, + "learning_rate": 3.115032113471637e-05, + "loss": 0.8179, "step": 5181 }, { - "epoch": 1.8922767938652547, - "grad_norm": 1.0494745969772339, - "learning_rate": 4.4303316988686396e-07, - "loss": 0.7758, + "epoch": 0.899028452463567, + "grad_norm": 0.6770945191383362, + "learning_rate": 3.114466435271717e-05, + "loss": 0.8351, "step": 5182 }, { - "epoch": 1.892641957275881, - "grad_norm": 0.9513950347900391, - "learning_rate": 4.400355461969663e-07, - "loss": 0.8365, + "epoch": 0.8992019430950728, + "grad_norm": 0.7802444696426392, + "learning_rate": 3.113900627736669e-05, + "loss": 0.8811, "step": 5183 }, { - "epoch": 1.8930071206865073, - "grad_norm": 1.093780517578125, - "learning_rate": 4.3704798532388624e-07, - "loss": 0.83, + "epoch": 0.8993754337265788, + "grad_norm": 0.8848913908004761, + "learning_rate": 3.113334690932155e-05, + "loss": 0.6699, "step": 5184 }, { - "epoch": 1.8933722840971334, - "grad_norm": 1.5487844944000244, - "learning_rate": 4.3407048880460765e-07, - "loss": 0.8417, + "epoch": 0.8995489243580846, + "grad_norm": 0.8570581078529358, + "learning_rate": 3.112768624923853e-05, + "loss": 0.724, "step": 5185 }, { - "epoch": 1.8937374475077597, - "grad_norm": 0.9460033774375916, - "learning_rate": 4.311030581709297e-07, - "loss": 0.7855, + "epoch": 0.8997224149895906, + "grad_norm": 1.272689938545227, + "learning_rate": 3.1122024297774545e-05, + "loss": 0.5854, "step": 5186 }, { - "epoch": 1.8941026109183858, - "grad_norm": 1.0634361505508423, - "learning_rate": 4.281456949494778e-07, - "loss": 0.8248, + "epoch": 0.8998959056210964, + "grad_norm": 0.8560371994972229, + "learning_rate": 3.111636105558669e-05, + "loss": 0.8436, "step": 5187 }, { - "epoch": 1.8944677743290121, - "grad_norm": 1.08961021900177, - "learning_rate": 4.2519840066169493e-07, - "loss": 0.8144, + "epoch": 0.9000693962526024, + "grad_norm": 0.8085232973098755, + "learning_rate": 3.111069652333219e-05, + "loss": 0.7566, "step": 5188 }, { - "epoch": 1.8948329377396385, - "grad_norm": 1.3022533655166626, - "learning_rate": 4.222611768238505e-07, - "loss": 0.8257, + "epoch": 0.9002428868841083, + "grad_norm": 0.790971577167511, + "learning_rate": 3.1105030701668395e-05, + "loss": 0.7408, "step": 5189 }, { - "epoch": 1.8951981011502648, - "grad_norm": 1.3670023679733276, - "learning_rate": 4.1933402494702235e-07, - "loss": 0.8057, + "epoch": 0.9004163775156142, + "grad_norm": 1.1233073472976685, + "learning_rate": 3.1099363591252844e-05, + "loss": 0.6228, "step": 5190 }, { - "epoch": 1.895563264560891, - "grad_norm": 1.1096742153167725, - "learning_rate": 4.164169465371148e-07, - "loss": 0.8064, + "epoch": 0.90058986814712, + "grad_norm": 0.9813838601112366, + "learning_rate": 3.109369519274323e-05, + "loss": 0.6488, "step": 5191 }, { - "epoch": 1.8959284279715174, - "grad_norm": 1.217635154724121, - "learning_rate": 4.135099430948475e-07, - "loss": 0.7863, + "epoch": 0.9007633587786259, + "grad_norm": 0.7003257870674133, + "learning_rate": 3.1088025506797356e-05, + "loss": 0.7832, "step": 5192 }, { - "epoch": 1.8962935913821435, - "grad_norm": 1.7254976034164429, - "learning_rate": 4.106130161157595e-07, - "loss": 0.8612, + "epoch": 0.9009368494101319, + "grad_norm": 0.7485539317131042, + "learning_rate": 3.1082354534073206e-05, + "loss": 0.772, "step": 5193 }, { - "epoch": 1.8966587547927698, - "grad_norm": 1.229048490524292, - "learning_rate": 4.077261670901989e-07, - "loss": 0.7947, + "epoch": 0.9011103400416377, + "grad_norm": 0.9677616953849792, + "learning_rate": 3.107668227522889e-05, + "loss": 0.7461, "step": 5194 }, { - "epoch": 1.8970239182033959, - "grad_norm": 1.235695481300354, - "learning_rate": 4.0484939750333743e-07, - "loss": 0.816, + "epoch": 0.9012838306731437, + "grad_norm": 0.8266445994377136, + "learning_rate": 3.107100873092269e-05, + "loss": 0.6686, "step": 5195 }, { - "epoch": 1.8973890816140222, - "grad_norm": 0.8763514161109924, - "learning_rate": 4.01982708835158e-07, - "loss": 0.7875, + "epoch": 0.9014573213046495, + "grad_norm": 0.6901938915252686, + "learning_rate": 3.106533390181304e-05, + "loss": 0.8335, "step": 5196 }, { - "epoch": 1.8977542450246485, - "grad_norm": 1.1427050828933716, - "learning_rate": 3.991261025604543e-07, - "loss": 0.8352, + "epoch": 0.9016308119361555, + "grad_norm": 0.7612466812133789, + "learning_rate": 3.105965778855848e-05, + "loss": 0.7397, "step": 5197 }, { - "epoch": 1.8981194084352748, - "grad_norm": 0.9073944687843323, - "learning_rate": 3.9627958014883725e-07, - "loss": 0.8578, + "epoch": 0.9018043025676613, + "grad_norm": 0.8418919444084167, + "learning_rate": 3.105398039181775e-05, + "loss": 0.8022, "step": 5198 }, { - "epoch": 1.8984845718459011, - "grad_norm": 1.0562399625778198, - "learning_rate": 3.9344314306472674e-07, - "loss": 0.8226, + "epoch": 0.9019777931991673, + "grad_norm": 0.947342574596405, + "learning_rate": 3.1048301712249726e-05, + "loss": 0.8926, "step": 5199 }, { - "epoch": 1.8988497352565274, - "grad_norm": 1.2340843677520752, - "learning_rate": 3.9061679276735986e-07, - "loss": 0.8467, + "epoch": 0.9021512838306731, + "grad_norm": 1.062010645866394, + "learning_rate": 3.1042621750513405e-05, + "loss": 0.6322, "step": 5200 }, { - "epoch": 1.8992148986671535, - "grad_norm": 1.1323697566986084, - "learning_rate": 3.878005307107735e-07, - "loss": 0.8289, + "epoch": 0.9023247744621791, + "grad_norm": 0.8903411030769348, + "learning_rate": 3.103694050726797e-05, + "loss": 0.6133, "step": 5201 }, { - "epoch": 1.8995800620777799, - "grad_norm": 1.1706836223602295, - "learning_rate": 3.849943583438287e-07, - "loss": 0.8383, + "epoch": 0.9024982650936849, + "grad_norm": 0.884560763835907, + "learning_rate": 3.103125798317272e-05, + "loss": 0.5946, "step": 5202 }, { - "epoch": 1.899945225488406, - "grad_norm": 1.318920373916626, - "learning_rate": 3.8219827711018397e-07, - "loss": 0.7759, + "epoch": 0.9026717557251909, + "grad_norm": 0.8027233481407166, + "learning_rate": 3.102557417888713e-05, + "loss": 0.7532, "step": 5203 }, { - "epoch": 1.9003103888990323, - "grad_norm": 1.4141075611114502, - "learning_rate": 3.794122884483131e-07, - "loss": 0.8699, + "epoch": 0.9028452463566967, + "grad_norm": 0.9059818983078003, + "learning_rate": 3.101988909507081e-05, + "loss": 0.6934, "step": 5204 }, { - "epoch": 1.9006755523096586, - "grad_norm": 1.2898719310760498, - "learning_rate": 3.7663639379149406e-07, - "loss": 0.8712, + "epoch": 0.9030187369882027, + "grad_norm": 0.8239308595657349, + "learning_rate": 3.1014202732383525e-05, + "loss": 0.8032, "step": 5205 }, { - "epoch": 1.9010407157202849, - "grad_norm": 1.6594758033752441, - "learning_rate": 3.738705945678134e-07, - "loss": 0.811, + "epoch": 0.9031922276197085, + "grad_norm": 0.783078670501709, + "learning_rate": 3.100851509148517e-05, + "loss": 0.6851, "step": 5206 }, { - "epoch": 1.9014058791309112, - "grad_norm": 1.0505629777908325, - "learning_rate": 3.7111489220016617e-07, - "loss": 0.798, + "epoch": 0.9033657182512145, + "grad_norm": 1.1378719806671143, + "learning_rate": 3.100282617303581e-05, + "loss": 0.7869, "step": 5207 }, { - "epoch": 1.9017710425415375, - "grad_norm": 1.277854084968567, - "learning_rate": 3.6836928810624506e-07, - "loss": 0.7906, + "epoch": 0.9035392088827203, + "grad_norm": 1.1681543588638306, + "learning_rate": 3.099713597769566e-05, + "loss": 0.6627, "step": 5208 }, { - "epoch": 1.9021362059521636, - "grad_norm": 1.0627226829528809, - "learning_rate": 3.656337836985602e-07, - "loss": 0.817, + "epoch": 0.9037126995142263, + "grad_norm": 0.7621902823448181, + "learning_rate": 3.0991444506125066e-05, + "loss": 0.7417, "step": 5209 }, { - "epoch": 1.9025013693627897, - "grad_norm": 1.0204670429229736, - "learning_rate": 3.629083803844147e-07, - "loss": 0.7936, + "epoch": 0.9038861901457321, + "grad_norm": 0.7894797921180725, + "learning_rate": 3.098575175898452e-05, + "loss": 0.7218, "step": 5210 }, { - "epoch": 1.902866532773416, - "grad_norm": 1.1570723056793213, - "learning_rate": 3.6019307956592034e-07, - "loss": 0.7887, + "epoch": 0.9040596807772381, + "grad_norm": 1.042537808418274, + "learning_rate": 3.098005773693469e-05, + "loss": 0.6532, "step": 5211 }, { - "epoch": 1.9032316961840423, - "grad_norm": 1.5875599384307861, - "learning_rate": 3.5748788263998855e-07, - "loss": 0.8513, + "epoch": 0.9042331714087439, + "grad_norm": 0.658474862575531, + "learning_rate": 3.097436244063636e-05, + "loss": 0.7781, "step": 5212 }, { - "epoch": 1.9035968595946686, - "grad_norm": 1.2208552360534668, - "learning_rate": 3.547927909983373e-07, - "loss": 0.8226, + "epoch": 0.9044066620402498, + "grad_norm": 3.973468780517578, + "learning_rate": 3.0968665870750484e-05, + "loss": 0.6221, "step": 5213 }, { - "epoch": 1.903962023005295, - "grad_norm": 1.1741605997085571, - "learning_rate": 3.521078060274841e-07, - "loss": 0.7963, + "epoch": 0.9045801526717557, + "grad_norm": 0.765002965927124, + "learning_rate": 3.0962968027938156e-05, + "loss": 0.7612, "step": 5214 }, { - "epoch": 1.9043271864159212, - "grad_norm": 1.038886547088623, - "learning_rate": 3.4943292910874173e-07, - "loss": 0.8124, + "epoch": 0.9047536433032616, + "grad_norm": 0.8569828867912292, + "learning_rate": 3.09572689128606e-05, + "loss": 0.6792, "step": 5215 }, { - "epoch": 1.9046923498265473, - "grad_norm": 1.1076269149780273, - "learning_rate": 3.4676816161822947e-07, - "loss": 0.8204, + "epoch": 0.9049271339347675, + "grad_norm": 1.1240427494049072, + "learning_rate": 3.0951568526179235e-05, + "loss": 0.683, "step": 5216 }, { - "epoch": 1.9050575132371736, - "grad_norm": 1.0346667766571045, - "learning_rate": 3.4411350492686404e-07, - "loss": 0.8129, + "epoch": 0.9051006245662734, + "grad_norm": 0.848465621471405, + "learning_rate": 3.094586686855558e-05, + "loss": 0.8184, "step": 5217 }, { - "epoch": 1.9054226766477997, - "grad_norm": 1.234651803970337, - "learning_rate": 3.4146896040035514e-07, - "loss": 0.8239, + "epoch": 0.9052741151977793, + "grad_norm": 0.7352258563041687, + "learning_rate": 3.094016394065131e-05, + "loss": 0.7053, "step": 5218 }, { - "epoch": 1.905787840058426, - "grad_norm": 0.8321973085403442, - "learning_rate": 3.3883452939922123e-07, - "loss": 0.804, + "epoch": 0.9054476058292852, + "grad_norm": 1.1834973096847534, + "learning_rate": 3.093445974312828e-05, + "loss": 0.6941, "step": 5219 }, { - "epoch": 1.9061530034690524, - "grad_norm": 1.400312900543213, - "learning_rate": 3.3621021327876923e-07, - "loss": 0.8218, + "epoch": 0.9056210964607911, + "grad_norm": 1.0254300832748413, + "learning_rate": 3.0928754276648443e-05, + "loss": 0.7349, "step": 5220 }, { - "epoch": 1.9065181668796787, - "grad_norm": 1.0724843740463257, - "learning_rate": 3.3359601338910143e-07, - "loss": 0.8585, + "epoch": 0.905794587092297, + "grad_norm": 0.9113889932632446, + "learning_rate": 3.092304754187394e-05, + "loss": 0.8256, "step": 5221 }, { - "epoch": 1.906883330290305, - "grad_norm": 1.2191624641418457, - "learning_rate": 3.3099193107512197e-07, - "loss": 0.8024, + "epoch": 0.905968077723803, + "grad_norm": 1.038996934890747, + "learning_rate": 3.091733953946705e-05, + "loss": 0.6033, "step": 5222 }, { - "epoch": 1.9072484937009313, - "grad_norm": 1.1179629564285278, - "learning_rate": 3.283979676765259e-07, - "loss": 0.801, + "epoch": 0.9061415683553088, + "grad_norm": 1.4803439378738403, + "learning_rate": 3.0911630270090177e-05, + "loss": 0.6625, "step": 5223 }, { - "epoch": 1.9076136571115574, - "grad_norm": 1.5922861099243164, - "learning_rate": 3.258141245278057e-07, - "loss": 0.7731, + "epoch": 0.9063150589868147, + "grad_norm": 0.6601648330688477, + "learning_rate": 3.09059197344059e-05, + "loss": 0.6992, "step": 5224 }, { - "epoch": 1.9079788205221837, - "grad_norm": 0.9356517195701599, - "learning_rate": 3.2324040295824033e-07, - "loss": 0.8151, + "epoch": 0.9064885496183206, + "grad_norm": 0.9602124094963074, + "learning_rate": 3.090020793307693e-05, + "loss": 0.8457, "step": 5225 }, { - "epoch": 1.9083439839328098, - "grad_norm": 1.303373098373413, - "learning_rate": 3.2067680429190617e-07, - "loss": 0.7939, + "epoch": 0.9066620402498266, + "grad_norm": 1.1334660053253174, + "learning_rate": 3.089449486676613e-05, + "loss": 0.7485, "step": 5226 }, { - "epoch": 1.908709147343436, - "grad_norm": 1.032668948173523, - "learning_rate": 3.181233298476771e-07, - "loss": 0.8323, + "epoch": 0.9068355308813324, + "grad_norm": 0.9934324026107788, + "learning_rate": 3.08887805361365e-05, + "loss": 0.7893, "step": 5227 }, { - "epoch": 1.9090743107540624, - "grad_norm": 1.1654844284057617, - "learning_rate": 3.1557998093920904e-07, - "loss": 0.8173, + "epoch": 0.9070090215128384, + "grad_norm": 0.9669413566589355, + "learning_rate": 3.08830649418512e-05, + "loss": 0.7385, "step": 5228 }, { - "epoch": 1.9094394741646887, - "grad_norm": 0.873712956905365, - "learning_rate": 3.130467588749553e-07, - "loss": 0.7829, + "epoch": 0.9071825121443442, + "grad_norm": 0.7920812368392944, + "learning_rate": 3.087734808457354e-05, + "loss": 0.6711, "step": 5229 }, { - "epoch": 1.909804637575315, - "grad_norm": 1.1964845657348633, - "learning_rate": 3.105236649581556e-07, - "loss": 0.7949, + "epoch": 0.9073560027758502, + "grad_norm": 0.7538244724273682, + "learning_rate": 3.087162996496696e-05, + "loss": 0.8413, "step": 5230 }, { - "epoch": 1.9101698009859414, - "grad_norm": 0.8477409482002258, - "learning_rate": 3.0801070048684046e-07, - "loss": 0.8434, + "epoch": 0.907529493407356, + "grad_norm": 1.2160277366638184, + "learning_rate": 3.086591058369505e-05, + "loss": 0.6799, "step": 5231 }, { - "epoch": 1.9105349643965674, - "grad_norm": 1.1020374298095703, - "learning_rate": 3.055078667538292e-07, - "loss": 0.8151, + "epoch": 0.9077029840388618, + "grad_norm": 0.8374723792076111, + "learning_rate": 3.086018994142156e-05, + "loss": 0.6885, "step": 5232 }, { - "epoch": 1.9109001278071938, - "grad_norm": 1.0286706686019897, - "learning_rate": 3.0301516504672944e-07, - "loss": 0.8439, + "epoch": 0.9078764746703678, + "grad_norm": 1.0471830368041992, + "learning_rate": 3.0854468038810365e-05, + "loss": 0.5935, "step": 5233 }, { - "epoch": 1.9112652912178199, - "grad_norm": 1.3145369291305542, - "learning_rate": 3.0053259664793997e-07, - "loss": 0.7896, + "epoch": 0.9080499653018737, + "grad_norm": 0.7503795623779297, + "learning_rate": 3.084874487652551e-05, + "loss": 0.7388, "step": 5234 }, { - "epoch": 1.9116304546284462, - "grad_norm": 0.7846037745475769, - "learning_rate": 2.980601628346347e-07, - "loss": 0.8149, + "epoch": 0.9082234559333796, + "grad_norm": 0.8881401419639587, + "learning_rate": 3.0843020455231173e-05, + "loss": 0.7554, "step": 5235 }, { - "epoch": 1.9119956180390725, - "grad_norm": 1.0644419193267822, - "learning_rate": 2.9559786487878716e-07, - "loss": 0.7799, + "epoch": 0.9083969465648855, + "grad_norm": 0.7779415845870972, + "learning_rate": 3.0837294775591675e-05, + "loss": 0.709, "step": 5236 }, { - "epoch": 1.9123607814496988, - "grad_norm": 1.398024559020996, - "learning_rate": 2.931457040471508e-07, - "loss": 0.784, + "epoch": 0.9085704371963914, + "grad_norm": 0.8909716606140137, + "learning_rate": 3.08315678382715e-05, + "loss": 0.6453, "step": 5237 }, { - "epoch": 1.912725944860325, - "grad_norm": 1.327805995941162, - "learning_rate": 2.907036816012609e-07, - "loss": 0.7943, + "epoch": 0.9087439278278973, + "grad_norm": 0.8728950023651123, + "learning_rate": 3.082583964393524e-05, + "loss": 0.7375, "step": 5238 }, { - "epoch": 1.9130911082709514, - "grad_norm": 1.1091991662979126, - "learning_rate": 2.882717987974437e-07, - "loss": 0.8027, + "epoch": 0.9089174184594032, + "grad_norm": 0.8251045346260071, + "learning_rate": 3.082011019324768e-05, + "loss": 0.7935, "step": 5239 }, { - "epoch": 1.9134562716815775, - "grad_norm": 1.1848961114883423, - "learning_rate": 2.85850056886805e-07, - "loss": 0.8246, + "epoch": 0.9090909090909091, + "grad_norm": 0.7541353106498718, + "learning_rate": 3.081437948687373e-05, + "loss": 0.7534, "step": 5240 }, { - "epoch": 1.9138214350922038, - "grad_norm": 1.085229754447937, - "learning_rate": 2.834384571152282e-07, - "loss": 0.7624, + "epoch": 0.909264399722415, + "grad_norm": 1.128610610961914, + "learning_rate": 3.0808647525478434e-05, + "loss": 0.6545, "step": 5241 }, { - "epoch": 1.91418659850283, - "grad_norm": 0.9212002158164978, - "learning_rate": 2.8103700072339203e-07, - "loss": 0.8077, + "epoch": 0.9094378903539209, + "grad_norm": 0.967620313167572, + "learning_rate": 3.0802914309727004e-05, + "loss": 0.8455, "step": 5242 }, { - "epoch": 1.9145517619134562, - "grad_norm": 0.9157971739768982, - "learning_rate": 2.7864568894674593e-07, - "loss": 0.8088, + "epoch": 0.9096113809854268, + "grad_norm": 0.8294617533683777, + "learning_rate": 3.079717984028478e-05, + "loss": 0.715, "step": 5243 }, { - "epoch": 1.9149169253240825, - "grad_norm": 0.9892614483833313, - "learning_rate": 2.7626452301552586e-07, - "loss": 0.8082, + "epoch": 0.9097848716169327, + "grad_norm": 0.756712794303894, + "learning_rate": 3.0791444117817247e-05, + "loss": 0.8237, "step": 5244 }, { - "epoch": 1.9152820887347088, - "grad_norm": 1.1238117218017578, - "learning_rate": 2.7389350415474305e-07, - "loss": 0.7938, + "epoch": 0.9099583622484386, + "grad_norm": 0.7892747521400452, + "learning_rate": 3.078570714299005e-05, + "loss": 0.7568, "step": 5245 }, { - "epoch": 1.9156472521453352, - "grad_norm": 1.1064144372940063, - "learning_rate": 2.715326335841906e-07, - "loss": 0.7683, + "epoch": 0.9101318528799445, + "grad_norm": 0.7037228941917419, + "learning_rate": 3.0779968916468974e-05, + "loss": 0.8809, "step": 5246 }, { - "epoch": 1.9160124155559612, - "grad_norm": 0.9432430863380432, - "learning_rate": 2.691819125184458e-07, - "loss": 0.8336, + "epoch": 0.9103053435114504, + "grad_norm": 0.818271815776825, + "learning_rate": 3.0774229438919944e-05, + "loss": 0.7365, "step": 5247 }, { - "epoch": 1.9163775789665876, - "grad_norm": 1.1905878782272339, - "learning_rate": 2.668413421668592e-07, - "loss": 0.853, + "epoch": 0.9104788341429563, + "grad_norm": 1.3470276594161987, + "learning_rate": 3.076848871100904e-05, + "loss": 0.6765, "step": 5248 }, { - "epoch": 1.9167427423772136, - "grad_norm": 1.3120321035385132, - "learning_rate": 2.645109237335608e-07, - "loss": 0.8074, + "epoch": 0.9106523247744622, + "grad_norm": 0.8403735756874084, + "learning_rate": 3.0762746733402456e-05, + "loss": 0.7188, "step": 5249 }, { - "epoch": 1.91710790578784, - "grad_norm": 0.8973969221115112, - "learning_rate": 2.6219065841745383e-07, - "loss": 0.8253, + "epoch": 0.9108258154059681, + "grad_norm": 0.7437754273414612, + "learning_rate": 3.075700350676659e-05, + "loss": 0.7316, "step": 5250 }, { - "epoch": 1.9174730691984663, - "grad_norm": 1.0573065280914307, - "learning_rate": 2.5988054741222345e-07, - "loss": 0.7995, + "epoch": 0.9109993060374739, + "grad_norm": 0.7628846168518066, + "learning_rate": 3.075125903176792e-05, + "loss": 0.7939, "step": 5251 }, { - "epoch": 1.9178382326090926, - "grad_norm": 1.3160756826400757, - "learning_rate": 2.5758059190633233e-07, - "loss": 0.833, + "epoch": 0.9111727966689799, + "grad_norm": 1.0209200382232666, + "learning_rate": 3.074551330907312e-05, + "loss": 0.708, "step": 5252 }, { - "epoch": 1.918203396019719, - "grad_norm": 2.7520790100097656, - "learning_rate": 2.5529079308301174e-07, - "loss": 0.813, + "epoch": 0.9113462873004857, + "grad_norm": 0.8924606442451477, + "learning_rate": 3.073976633934898e-05, + "loss": 0.8208, "step": 5253 }, { - "epoch": 1.9185685594303452, - "grad_norm": 0.9606177806854248, - "learning_rate": 2.530111521202727e-07, - "loss": 0.825, + "epoch": 0.9115197779319917, + "grad_norm": 1.0019614696502686, + "learning_rate": 3.073401812326244e-05, + "loss": 0.7842, "step": 5254 }, { - "epoch": 1.9189337228409713, - "grad_norm": 1.2361303567886353, - "learning_rate": 2.5074167019089714e-07, - "loss": 0.8376, + "epoch": 0.9116932685634975, + "grad_norm": 1.207947015762329, + "learning_rate": 3.072826866148058e-05, + "loss": 0.7378, "step": 5255 }, { - "epoch": 1.9192988862515976, - "grad_norm": 1.2694206237792969, - "learning_rate": 2.484823484624466e-07, - "loss": 0.8148, + "epoch": 0.9118667591950035, + "grad_norm": 1.0377312898635864, + "learning_rate": 3.072251795467065e-05, + "loss": 0.6597, "step": 5256 }, { - "epoch": 1.9196640496622237, - "grad_norm": 1.1198644638061523, - "learning_rate": 2.462331880972468e-07, - "loss": 0.8124, + "epoch": 0.9120402498265093, + "grad_norm": 0.9633491635322571, + "learning_rate": 3.071676600350002e-05, + "loss": 0.6663, "step": 5257 }, { - "epoch": 1.92002921307285, - "grad_norm": 0.9613643884658813, - "learning_rate": 2.4399419025240344e-07, - "loss": 0.8542, + "epoch": 0.9122137404580153, + "grad_norm": 1.1710857152938843, + "learning_rate": 3.071101280863621e-05, + "loss": 0.7228, "step": 5258 }, { - "epoch": 1.9203943764834763, - "grad_norm": 1.106259822845459, - "learning_rate": 2.4176535607978835e-07, - "loss": 0.818, + "epoch": 0.9123872310895211, + "grad_norm": 0.9057410955429077, + "learning_rate": 3.0705258370746874e-05, + "loss": 0.7386, "step": 5259 }, { - "epoch": 1.9207595398941026, - "grad_norm": 0.7930060029029846, - "learning_rate": 2.3954668672604874e-07, - "loss": 0.8238, + "epoch": 0.9125607217210271, + "grad_norm": 1.166103482246399, + "learning_rate": 3.069950269049983e-05, + "loss": 0.7461, "step": 5260 }, { - "epoch": 1.921124703304729, - "grad_norm": 1.140190601348877, - "learning_rate": 2.373381833326027e-07, - "loss": 0.8517, + "epoch": 0.9127342123525329, + "grad_norm": 0.8628996014595032, + "learning_rate": 3.069374576856304e-05, + "loss": 0.6257, "step": 5261 }, { - "epoch": 1.9214898667153553, - "grad_norm": 1.0191292762756348, - "learning_rate": 2.3513984703563476e-07, - "loss": 0.8292, + "epoch": 0.9129077029840389, + "grad_norm": 0.7821905612945557, + "learning_rate": 3.068798760560458e-05, + "loss": 0.6978, "step": 5262 }, { - "epoch": 1.9218550301259814, - "grad_norm": 1.0746217966079712, - "learning_rate": 2.3295167896610016e-07, - "loss": 0.8232, + "epoch": 0.9130811936155447, + "grad_norm": 0.7690654397010803, + "learning_rate": 3.068222820229272e-05, + "loss": 0.7437, "step": 5263 }, { - "epoch": 1.9222201935366077, - "grad_norm": 1.2783238887786865, - "learning_rate": 2.3077368024972514e-07, - "loss": 0.768, + "epoch": 0.9132546842470507, + "grad_norm": 0.8153859972953796, + "learning_rate": 3.067646755929582e-05, + "loss": 0.6348, "step": 5264 }, { - "epoch": 1.9225853569472338, - "grad_norm": 1.2157031297683716, - "learning_rate": 2.2860585200700226e-07, - "loss": 0.8322, + "epoch": 0.9134281748785565, + "grad_norm": 1.0400488376617432, + "learning_rate": 3.067070567728242e-05, + "loss": 0.7378, "step": 5265 }, { - "epoch": 1.92295052035786, - "grad_norm": 1.0915255546569824, - "learning_rate": 2.2644819535319051e-07, - "loss": 0.8056, + "epoch": 0.9136016655100625, + "grad_norm": 0.8829591870307922, + "learning_rate": 3.066494255692119e-05, + "loss": 0.7661, "step": 5266 }, { - "epoch": 1.9233156837684864, - "grad_norm": 0.8251357674598694, - "learning_rate": 2.2430071139832198e-07, - "loss": 0.7789, + "epoch": 0.9137751561415683, + "grad_norm": 2.09403395652771, + "learning_rate": 3.065917819888095e-05, + "loss": 0.803, "step": 5267 }, { - "epoch": 1.9236808471791127, - "grad_norm": 1.0294837951660156, - "learning_rate": 2.2216340124718626e-07, - "loss": 0.8154, + "epoch": 0.9139486467730743, + "grad_norm": 0.9900574088096619, + "learning_rate": 3.0653412603830665e-05, + "loss": 0.7683, "step": 5268 }, { - "epoch": 1.924046010589739, - "grad_norm": 0.9072071313858032, - "learning_rate": 2.2003626599934602e-07, - "loss": 0.8348, + "epoch": 0.9141221374045801, + "grad_norm": 0.8823701739311218, + "learning_rate": 3.064764577243943e-05, + "loss": 0.7548, "step": 5269 }, { - "epoch": 1.9244111740003653, - "grad_norm": 0.9668275713920593, - "learning_rate": 2.1791930674912587e-07, - "loss": 0.8046, + "epoch": 0.9142956280360861, + "grad_norm": 1.0035617351531982, + "learning_rate": 3.06418777053765e-05, + "loss": 0.6588, "step": 5270 }, { - "epoch": 1.9247763374109914, - "grad_norm": 1.2277557849884033, - "learning_rate": 2.1581252458561684e-07, - "loss": 0.813, + "epoch": 0.914469118667592, + "grad_norm": 0.7772724628448486, + "learning_rate": 3.063610840331125e-05, + "loss": 0.7444, "step": 5271 }, { - "epoch": 1.9251415008216177, - "grad_norm": 1.218826413154602, - "learning_rate": 2.137159205926742e-07, - "loss": 0.7903, + "epoch": 0.9146426092990978, + "grad_norm": 0.6985281705856323, + "learning_rate": 3.0630337866913236e-05, + "loss": 0.8429, "step": 5272 }, { - "epoch": 1.9255066642322438, - "grad_norm": 1.1150739192962646, - "learning_rate": 2.1162949584891512e-07, - "loss": 0.8104, + "epoch": 0.9148160999306038, + "grad_norm": 1.675358772277832, + "learning_rate": 3.0624566096852124e-05, + "loss": 0.6044, "step": 5273 }, { - "epoch": 1.9258718276428701, - "grad_norm": 1.134478211402893, - "learning_rate": 2.095532514277232e-07, - "loss": 0.8267, + "epoch": 0.9149895905621096, + "grad_norm": 0.8433858752250671, + "learning_rate": 3.061879309379774e-05, + "loss": 0.6926, "step": 5274 }, { - "epoch": 1.9262369910534964, - "grad_norm": 1.2030701637268066, - "learning_rate": 2.0748718839724403e-07, - "loss": 0.8118, + "epoch": 0.9151630811936156, + "grad_norm": 0.8576712012290955, + "learning_rate": 3.061301885842004e-05, + "loss": 0.7278, "step": 5275 }, { - "epoch": 1.9266021544641228, - "grad_norm": 1.0348098278045654, - "learning_rate": 2.0543130782037845e-07, - "loss": 0.7489, + "epoch": 0.9153365718251214, + "grad_norm": 0.848386824131012, + "learning_rate": 3.060724339138913e-05, + "loss": 0.7416, "step": 5276 }, { - "epoch": 1.926967317874749, - "grad_norm": 1.202282428741455, - "learning_rate": 2.0338561075480269e-07, - "loss": 0.8134, + "epoch": 0.9155100624566274, + "grad_norm": 0.9103161096572876, + "learning_rate": 3.060146669337528e-05, + "loss": 0.7485, "step": 5277 }, { - "epoch": 1.9273324812853752, - "grad_norm": 1.2809053659439087, - "learning_rate": 2.0135009825293928e-07, - "loss": 0.794, + "epoch": 0.9156835530881332, + "grad_norm": 0.8948942422866821, + "learning_rate": 3.0595688765048855e-05, + "loss": 0.7039, "step": 5278 }, { - "epoch": 1.9276976446960015, - "grad_norm": 3.28293776512146, - "learning_rate": 1.9932477136197949e-07, - "loss": 0.8168, + "epoch": 0.9158570437196392, + "grad_norm": 1.0620768070220947, + "learning_rate": 3.05899096070804e-05, + "loss": 0.8313, "step": 5279 }, { - "epoch": 1.9280628081066276, - "grad_norm": 1.206003189086914, - "learning_rate": 1.9730963112387425e-07, - "loss": 0.78, + "epoch": 0.916030534351145, + "grad_norm": 0.991454005241394, + "learning_rate": 3.058412922014061e-05, + "loss": 0.8132, "step": 5280 }, { - "epoch": 1.9284279715172539, - "grad_norm": 1.0610723495483398, - "learning_rate": 1.9530467857532986e-07, - "loss": 0.7643, + "epoch": 0.916204024982651, + "grad_norm": 0.8241454362869263, + "learning_rate": 3.057834760490027e-05, + "loss": 0.7582, "step": 5281 }, { - "epoch": 1.9287931349278802, - "grad_norm": 1.0115915536880493, - "learning_rate": 1.93309914747819e-07, - "loss": 0.8297, + "epoch": 0.9163775156141568, + "grad_norm": 0.8508726954460144, + "learning_rate": 3.057256476203038e-05, + "loss": 0.7368, "step": 5282 }, { - "epoch": 1.9291582983385065, - "grad_norm": 1.0169142484664917, - "learning_rate": 1.9132534066756304e-07, - "loss": 0.8127, + "epoch": 0.9165510062456628, + "grad_norm": 0.8525331020355225, + "learning_rate": 3.056678069220203e-05, + "loss": 0.876, "step": 5283 }, { - "epoch": 1.9295234617491328, - "grad_norm": 1.1483492851257324, - "learning_rate": 1.8935095735554522e-07, - "loss": 0.8286, + "epoch": 0.9167244968771686, + "grad_norm": 0.9183812141418457, + "learning_rate": 3.056099539608646e-05, + "loss": 0.7861, "step": 5284 }, { - "epoch": 1.9298886251597591, - "grad_norm": 1.0427485704421997, - "learning_rate": 1.8738676582750638e-07, - "loss": 0.8342, + "epoch": 0.9168979875086746, + "grad_norm": 0.825893223285675, + "learning_rate": 3.055520887435507e-05, + "loss": 0.6982, "step": 5285 }, { - "epoch": 1.9302537885703852, - "grad_norm": 1.0612778663635254, - "learning_rate": 1.854327670939471e-07, - "loss": 0.813, + "epoch": 0.9170714781401804, + "grad_norm": 0.8790959715843201, + "learning_rate": 3.0549421127679395e-05, + "loss": 0.7046, "step": 5286 }, { - "epoch": 1.9306189519810115, - "grad_norm": 1.4222874641418457, - "learning_rate": 1.8348896216012102e-07, - "loss": 0.8132, + "epoch": 0.9172449687716864, + "grad_norm": 1.598661184310913, + "learning_rate": 3.0543632156731105e-05, + "loss": 0.6458, "step": 5287 }, { - "epoch": 1.9309841153916376, - "grad_norm": 1.0912880897521973, - "learning_rate": 1.8155535202603712e-07, - "loss": 0.7778, + "epoch": 0.9174184594031922, + "grad_norm": 1.046098232269287, + "learning_rate": 3.053784196218201e-05, + "loss": 0.6628, "step": 5288 }, { - "epoch": 1.931349278802264, - "grad_norm": 0.9643873572349548, - "learning_rate": 1.796319376864597e-07, - "loss": 0.7906, + "epoch": 0.9175919500346982, + "grad_norm": 0.6967904567718506, + "learning_rate": 3.053205054470408e-05, + "loss": 0.8545, "step": 5289 }, { - "epoch": 1.9317144422128902, - "grad_norm": 1.0497418642044067, - "learning_rate": 1.7771872013090608e-07, - "loss": 0.7878, + "epoch": 0.917765440666204, + "grad_norm": 0.9449397921562195, + "learning_rate": 3.052625790496942e-05, + "loss": 0.7188, "step": 5290 }, { - "epoch": 1.9320796056235165, - "grad_norm": 0.9912148118019104, - "learning_rate": 1.7581570034365557e-07, - "loss": 0.8251, + "epoch": 0.9179389312977099, + "grad_norm": 0.953161358833313, + "learning_rate": 3.052046404365025e-05, + "loss": 0.6887, "step": 5291 }, { - "epoch": 1.9324447690341429, - "grad_norm": 0.9904718399047852, - "learning_rate": 1.7392287930373175e-07, - "loss": 0.8159, + "epoch": 0.9181124219292158, + "grad_norm": 1.0231623649597168, + "learning_rate": 3.0514668961418984e-05, + "loss": 0.793, "step": 5292 }, { - "epoch": 1.9328099324447692, - "grad_norm": 0.9564522504806519, - "learning_rate": 1.7204025798491342e-07, - "loss": 0.814, + "epoch": 0.9182859125607217, + "grad_norm": 1.0023390054702759, + "learning_rate": 3.0508872658948125e-05, + "loss": 0.7059, "step": 5293 }, { - "epoch": 1.9331750958553953, - "grad_norm": 0.9929212927818298, - "learning_rate": 1.7016783735573693e-07, - "loss": 0.7987, + "epoch": 0.9184594031922276, + "grad_norm": 1.0593211650848389, + "learning_rate": 3.050307513691035e-05, + "loss": 0.707, "step": 5294 }, { - "epoch": 1.9335402592660216, - "grad_norm": 1.1384081840515137, - "learning_rate": 1.6830561837948735e-07, - "loss": 0.7946, + "epoch": 0.9186328938237335, + "grad_norm": 0.7810466289520264, + "learning_rate": 3.0497276395978468e-05, + "loss": 0.8113, "step": 5295 }, { - "epoch": 1.9339054226766477, - "grad_norm": 1.2471195459365845, - "learning_rate": 1.6645360201420046e-07, - "loss": 0.8293, + "epoch": 0.9188063844552394, + "grad_norm": 0.7791539430618286, + "learning_rate": 3.0491476436825427e-05, + "loss": 0.7205, "step": 5296 }, { - "epoch": 1.934270586087274, - "grad_norm": 0.8780003190040588, - "learning_rate": 1.646117892126653e-07, - "loss": 0.806, + "epoch": 0.9189798750867453, + "grad_norm": 0.7177509069442749, + "learning_rate": 3.048567526012432e-05, + "loss": 0.8245, "step": 5297 }, { - "epoch": 1.9346357494979003, - "grad_norm": 0.9264772534370422, - "learning_rate": 1.6278018092241943e-07, - "loss": 0.7889, + "epoch": 0.9191533657182512, + "grad_norm": 0.7066273093223572, + "learning_rate": 3.047987286654838e-05, + "loss": 0.791, "step": 5298 }, { - "epoch": 1.9350009129085266, - "grad_norm": 1.2101718187332153, - "learning_rate": 1.6095877808575133e-07, - "loss": 0.8235, + "epoch": 0.9193268563497571, + "grad_norm": 0.6412243843078613, + "learning_rate": 3.0474069256770983e-05, + "loss": 0.8167, "step": 5299 }, { - "epoch": 1.935366076319153, - "grad_norm": 1.1560693979263306, - "learning_rate": 1.5914758163970033e-07, - "loss": 0.8061, + "epoch": 0.919500346981263, + "grad_norm": 0.6365392804145813, + "learning_rate": 3.0468264431465643e-05, + "loss": 0.908, "step": 5300 }, { - "epoch": 1.9357312397297792, - "grad_norm": 0.8128798604011536, - "learning_rate": 1.5734659251605666e-07, - "loss": 0.8307, + "epoch": 0.9196738376127689, + "grad_norm": 0.9257527589797974, + "learning_rate": 3.0462458391306023e-05, + "loss": 0.7307, "step": 5301 }, { - "epoch": 1.9360964031404053, - "grad_norm": 1.121811866760254, - "learning_rate": 1.5555581164135468e-07, - "loss": 0.7853, + "epoch": 0.9198473282442748, + "grad_norm": 1.3213740587234497, + "learning_rate": 3.045665113696591e-05, + "loss": 0.6315, "step": 5302 }, { - "epoch": 1.9364615665510316, - "grad_norm": 1.5476486682891846, - "learning_rate": 1.537752399368797e-07, - "loss": 0.8235, + "epoch": 0.9200208188757807, + "grad_norm": 0.7863581776618958, + "learning_rate": 3.0450842669119255e-05, + "loss": 0.7063, "step": 5303 }, { - "epoch": 1.9368267299616577, - "grad_norm": 0.9755883812904358, - "learning_rate": 1.520048783186634e-07, - "loss": 0.8071, + "epoch": 0.9201943095072866, + "grad_norm": 1.485454797744751, + "learning_rate": 3.0445032988440126e-05, + "loss": 0.6417, "step": 5304 }, { - "epoch": 1.937191893372284, - "grad_norm": 1.1878598928451538, - "learning_rate": 1.502447276974861e-07, - "loss": 0.817, + "epoch": 0.9203678001387925, + "grad_norm": 0.8752457499504089, + "learning_rate": 3.0439222095602744e-05, + "loss": 0.6554, "step": 5305 }, { - "epoch": 1.9375570567829103, - "grad_norm": 0.8553702235221863, - "learning_rate": 1.484947889788768e-07, - "loss": 0.8267, + "epoch": 0.9205412907702984, + "grad_norm": 1.4305877685546875, + "learning_rate": 3.0433409991281483e-05, + "loss": 0.5637, "step": 5306 }, { - "epoch": 1.9379222201935367, - "grad_norm": 0.8980113863945007, - "learning_rate": 1.4675506306310873e-07, - "loss": 0.8107, + "epoch": 0.9207147814018043, + "grad_norm": 0.8847557306289673, + "learning_rate": 3.042759667615083e-05, + "loss": 0.6833, "step": 5307 }, { - "epoch": 1.938287383604163, - "grad_norm": 1.0202759504318237, - "learning_rate": 1.4502555084519698e-07, - "loss": 0.8083, + "epoch": 0.9208882720333103, + "grad_norm": 1.3408520221710205, + "learning_rate": 3.042178215088543e-05, + "loss": 0.8264, "step": 5308 }, { - "epoch": 1.938652547014789, - "grad_norm": 1.0261701345443726, - "learning_rate": 1.4330625321490988e-07, - "loss": 0.8079, + "epoch": 0.9210617626648161, + "grad_norm": 1.5968462228775024, + "learning_rate": 3.041596641616007e-05, + "loss": 0.6316, "step": 5309 }, { - "epoch": 1.9390177104254154, - "grad_norm": 1.5377358198165894, - "learning_rate": 1.4159717105675542e-07, - "loss": 0.7975, + "epoch": 0.9212352532963219, + "grad_norm": 0.8478186130523682, + "learning_rate": 3.041014947264967e-05, + "loss": 0.7806, "step": 5310 }, { - "epoch": 1.9393828738360415, - "grad_norm": 0.8103115558624268, - "learning_rate": 1.3989830524999025e-07, - "loss": 0.7919, + "epoch": 0.9214087439278279, + "grad_norm": 0.8762840628623962, + "learning_rate": 3.0404331321029293e-05, + "loss": 0.8562, "step": 5311 }, { - "epoch": 1.9397480372466678, - "grad_norm": 0.7506356835365295, - "learning_rate": 1.3820965666860865e-07, - "loss": 0.8292, + "epoch": 0.9215822345593337, + "grad_norm": 0.8370490670204163, + "learning_rate": 3.0398511961974143e-05, + "loss": 0.7473, "step": 5312 }, { - "epoch": 1.940113200657294, - "grad_norm": 0.93381667137146, - "learning_rate": 1.3653122618135562e-07, - "loss": 0.8359, + "epoch": 0.9217557251908397, + "grad_norm": 0.8492000102996826, + "learning_rate": 3.0392691396159562e-05, + "loss": 0.7727, "step": 5313 }, { - "epoch": 1.9404783640679204, - "grad_norm": 1.0345453023910522, - "learning_rate": 1.348630146517138e-07, - "loss": 0.7559, + "epoch": 0.9219292158223455, + "grad_norm": 0.9833300113677979, + "learning_rate": 3.0386869624261036e-05, + "loss": 0.6388, "step": 5314 }, { - "epoch": 1.9408435274785467, - "grad_norm": 2.838883638381958, - "learning_rate": 1.3320502293791448e-07, - "loss": 0.8013, + "epoch": 0.9221027064538515, + "grad_norm": 0.8711003661155701, + "learning_rate": 3.0381046646954185e-05, + "loss": 0.6736, "step": 5315 }, { - "epoch": 1.941208690889173, - "grad_norm": 0.962410569190979, - "learning_rate": 1.3155725189292646e-07, - "loss": 0.8348, + "epoch": 0.9222761970853574, + "grad_norm": 0.7810627818107605, + "learning_rate": 3.0375222464914782e-05, + "loss": 0.5748, "step": 5316 }, { - "epoch": 1.9415738542997991, - "grad_norm": 1.1765908002853394, - "learning_rate": 1.2991970236445828e-07, - "loss": 0.847, + "epoch": 0.9224496877168633, + "grad_norm": 2.428886651992798, + "learning_rate": 3.036939707881871e-05, + "loss": 0.7285, "step": 5317 }, { - "epoch": 1.9419390177104254, - "grad_norm": 0.9318401217460632, - "learning_rate": 1.282923751949694e-07, - "loss": 0.8226, + "epoch": 0.9226231783483692, + "grad_norm": 0.9417861700057983, + "learning_rate": 3.0363570489342033e-05, + "loss": 0.7043, "step": 5318 }, { - "epoch": 1.9423041811210515, - "grad_norm": 0.8872619271278381, - "learning_rate": 1.2667527122165014e-07, - "loss": 0.8101, + "epoch": 0.9227966689798751, + "grad_norm": 0.8593915104866028, + "learning_rate": 3.0357742697160924e-05, + "loss": 0.8301, "step": 5319 }, { - "epoch": 1.9426693445316778, - "grad_norm": 1.2465029954910278, - "learning_rate": 1.2506839127643943e-07, - "loss": 0.822, + "epoch": 0.922970159611381, + "grad_norm": 0.9161787033081055, + "learning_rate": 3.03519137029517e-05, + "loss": 0.6957, "step": 5320 }, { - "epoch": 1.9430345079423041, - "grad_norm": 1.3642973899841309, - "learning_rate": 1.2347173618600717e-07, - "loss": 0.833, + "epoch": 0.9231436502428869, + "grad_norm": 0.843027651309967, + "learning_rate": 3.034608350739084e-05, + "loss": 0.7463, "step": 5321 }, { - "epoch": 1.9433996713529305, - "grad_norm": 1.044567584991455, - "learning_rate": 1.218853067717718e-07, - "loss": 0.8215, + "epoch": 0.9233171408743928, + "grad_norm": 0.6177093386650085, + "learning_rate": 3.034025211115492e-05, + "loss": 0.8262, "step": 5322 }, { - "epoch": 1.9437648347635568, - "grad_norm": 0.9015693068504333, - "learning_rate": 1.2030910384988716e-07, - "loss": 0.8132, + "epoch": 0.9234906315058987, + "grad_norm": 0.9865739345550537, + "learning_rate": 3.03344195149207e-05, + "loss": 0.7208, "step": 5323 }, { - "epoch": 1.944129998174183, - "grad_norm": 1.2895466089248657, - "learning_rate": 1.1874312823124678e-07, - "loss": 0.8209, + "epoch": 0.9236641221374046, + "grad_norm": 0.7415364980697632, + "learning_rate": 3.0328585719365057e-05, + "loss": 0.7167, "step": 5324 }, { - "epoch": 1.9444951615848092, - "grad_norm": 0.9532686471939087, - "learning_rate": 1.1718738072148184e-07, - "loss": 0.8055, + "epoch": 0.9238376127689105, + "grad_norm": 0.8021391034126282, + "learning_rate": 3.0322750725165e-05, + "loss": 0.7637, "step": 5325 }, { - "epoch": 1.9448603249954355, - "grad_norm": 1.1029144525527954, - "learning_rate": 1.1564186212096317e-07, - "loss": 0.8272, + "epoch": 0.9240111034004164, + "grad_norm": 1.2115792036056519, + "learning_rate": 3.0316914532997694e-05, + "loss": 0.618, "step": 5326 }, { - "epoch": 1.9452254884060616, - "grad_norm": 1.3343206644058228, - "learning_rate": 1.1410657322479479e-07, - "loss": 0.8694, + "epoch": 0.9241845940319223, + "grad_norm": 0.8165175914764404, + "learning_rate": 3.031107714354044e-05, + "loss": 0.7334, "step": 5327 }, { - "epoch": 1.9455906518166879, - "grad_norm": 0.9152016639709473, - "learning_rate": 1.1258151482282265e-07, - "loss": 0.8288, + "epoch": 0.9243580846634282, + "grad_norm": 0.6886077523231506, + "learning_rate": 3.030523855747066e-05, + "loss": 0.7677, "step": 5328 }, { - "epoch": 1.9459558152273142, - "grad_norm": 1.1817156076431274, - "learning_rate": 1.1106668769963025e-07, - "loss": 0.7877, + "epoch": 0.9245315752949341, + "grad_norm": 0.8972025513648987, + "learning_rate": 3.0299398775465945e-05, + "loss": 0.757, "step": 5329 }, { - "epoch": 1.9463209786379405, - "grad_norm": 0.9032189846038818, - "learning_rate": 1.0956209263453421e-07, - "loss": 0.8544, + "epoch": 0.92470506592644, + "grad_norm": 0.8991239666938782, + "learning_rate": 3.0293557798203998e-05, + "loss": 0.6578, "step": 5330 }, { - "epoch": 1.9466861420485668, - "grad_norm": 1.1358200311660767, - "learning_rate": 1.0806773040158647e-07, - "loss": 0.818, + "epoch": 0.9248785565579458, + "grad_norm": 0.8940245509147644, + "learning_rate": 3.0287715626362676e-05, + "loss": 0.7229, "step": 5331 }, { - "epoch": 1.9470513054591931, - "grad_norm": 1.0684336423873901, - "learning_rate": 1.0658360176957871e-07, - "loss": 0.8086, + "epoch": 0.9250520471894518, + "grad_norm": 0.8216572999954224, + "learning_rate": 3.0281872260619965e-05, + "loss": 0.66, "step": 5332 }, { - "epoch": 1.9474164688698192, - "grad_norm": 1.5901676416397095, - "learning_rate": 1.0510970750203353e-07, - "loss": 0.7775, + "epoch": 0.9252255378209576, + "grad_norm": 1.146180272102356, + "learning_rate": 3.0276027701654e-05, + "loss": 0.6075, "step": 5333 }, { - "epoch": 1.9477816322804455, - "grad_norm": 1.112302303314209, - "learning_rate": 1.0364604835721325e-07, - "loss": 0.8259, + "epoch": 0.9253990284524636, + "grad_norm": 0.7307398319244385, + "learning_rate": 3.0270181950143045e-05, + "loss": 0.8848, "step": 5334 }, { - "epoch": 1.9481467956910716, - "grad_norm": 1.0544458627700806, - "learning_rate": 1.021926250881089e-07, - "loss": 0.8291, + "epoch": 0.9255725190839694, + "grad_norm": 0.7599238157272339, + "learning_rate": 3.0264335006765506e-05, + "loss": 0.7122, "step": 5335 }, { - "epoch": 1.948511959101698, - "grad_norm": 1.2954243421554565, - "learning_rate": 1.0074943844245122e-07, - "loss": 0.8466, + "epoch": 0.9257460097154754, + "grad_norm": 1.541717529296875, + "learning_rate": 3.025848687219993e-05, + "loss": 0.8601, "step": 5336 }, { - "epoch": 1.9488771225123243, - "grad_norm": 0.8077088594436646, - "learning_rate": 9.931648916269965e-08, - "loss": 0.7522, + "epoch": 0.9259195003469812, + "grad_norm": 0.9474151134490967, + "learning_rate": 3.0252637547125e-05, + "loss": 0.6779, "step": 5337 }, { - "epoch": 1.9492422859229506, - "grad_norm": 1.1170779466629028, - "learning_rate": 9.789377798604894e-08, - "loss": 0.7576, + "epoch": 0.9260929909784872, + "grad_norm": 1.1366173028945923, + "learning_rate": 3.0246787032219535e-05, + "loss": 0.6395, "step": 5338 }, { - "epoch": 1.9496074493335769, - "grad_norm": 1.185459852218628, - "learning_rate": 9.648130564442915e-08, - "loss": 0.7927, + "epoch": 0.926266481609993, + "grad_norm": 1.2210899591445923, + "learning_rate": 3.0240935328162498e-05, + "loss": 0.7145, "step": 5339 }, { - "epoch": 1.9499726127442032, - "grad_norm": 1.3010276556015015, - "learning_rate": 9.507907286449903e-08, - "loss": 0.7957, + "epoch": 0.926439972241499, + "grad_norm": 0.7405300140380859, + "learning_rate": 3.0235082435632984e-05, + "loss": 0.8162, "step": 5340 }, { - "epoch": 1.9503377761548293, - "grad_norm": 0.9810428619384766, - "learning_rate": 9.368708036764818e-08, - "loss": 0.8212, + "epoch": 0.9266134628730048, + "grad_norm": 1.1649495363235474, + "learning_rate": 3.0229228355310218e-05, + "loss": 0.699, "step": 5341 }, { - "epoch": 1.9507029395654556, - "grad_norm": 0.943995475769043, - "learning_rate": 9.230532887000598e-08, - "loss": 0.8217, + "epoch": 0.9267869535045108, + "grad_norm": 0.7334667444229126, + "learning_rate": 3.022337308787359e-05, + "loss": 0.7368, "step": 5342 }, { - "epoch": 1.9510681029760817, - "grad_norm": 1.082377552986145, - "learning_rate": 9.093381908242605e-08, - "loss": 0.7683, + "epoch": 0.9269604441360166, + "grad_norm": 0.9375115633010864, + "learning_rate": 3.0217516634002596e-05, + "loss": 0.6759, "step": 5343 }, { - "epoch": 1.951433266386708, - "grad_norm": 5.398486137390137, - "learning_rate": 8.957255171049506e-08, - "loss": 0.7957, + "epoch": 0.9271339347675226, + "grad_norm": 0.7184907793998718, + "learning_rate": 3.02116589943769e-05, + "loss": 0.8159, "step": 5344 }, { - "epoch": 1.9517984297973343, - "grad_norm": 1.0937148332595825, - "learning_rate": 8.822152745453061e-08, - "loss": 0.8445, + "epoch": 0.9273074253990284, + "grad_norm": 0.9646907448768616, + "learning_rate": 3.020580016967627e-05, + "loss": 0.8401, "step": 5345 }, { - "epoch": 1.9521635932079606, - "grad_norm": 0.99629807472229, - "learning_rate": 8.688074700958115e-08, - "loss": 0.7997, + "epoch": 0.9274809160305344, + "grad_norm": 0.7808434963226318, + "learning_rate": 3.019994016058064e-05, + "loss": 0.7021, "step": 5346 }, { - "epoch": 1.952528756618587, - "grad_norm": 0.6229676604270935, - "learning_rate": 8.55502110654216e-08, - "loss": 0.8451, + "epoch": 0.9276544066620402, + "grad_norm": 1.3060905933380127, + "learning_rate": 3.019407896777007e-05, + "loss": 0.6768, "step": 5347 }, { - "epoch": 1.952893920029213, - "grad_norm": 1.2303833961486816, - "learning_rate": 8.422992030656218e-08, - "loss": 0.824, + "epoch": 0.9278278972935462, + "grad_norm": 0.7936145663261414, + "learning_rate": 3.018821659192476e-05, + "loss": 0.668, "step": 5348 }, { - "epoch": 1.9532590834398393, - "grad_norm": 1.424055814743042, - "learning_rate": 8.291987541223955e-08, - "loss": 0.7911, + "epoch": 0.928001387925052, + "grad_norm": 0.9437910914421082, + "learning_rate": 3.018235303372504e-05, + "loss": 0.759, "step": 5349 }, { - "epoch": 1.9536242468504654, - "grad_norm": 1.312880039215088, - "learning_rate": 8.162007705641905e-08, - "loss": 0.8157, + "epoch": 0.9281748785565579, + "grad_norm": 1.091312050819397, + "learning_rate": 3.0176488293851388e-05, + "loss": 0.6672, "step": 5350 }, { - "epoch": 1.9539894102610917, - "grad_norm": 0.967231810092926, - "learning_rate": 8.033052590779245e-08, - "loss": 0.837, + "epoch": 0.9283483691880638, + "grad_norm": 0.7233453392982483, + "learning_rate": 3.017062237298441e-05, + "loss": 0.8281, "step": 5351 }, { - "epoch": 1.954354573671718, - "grad_norm": 0.9697086811065674, - "learning_rate": 7.90512226297846e-08, - "loss": 0.8069, + "epoch": 0.9285218598195697, + "grad_norm": 1.5331515073776245, + "learning_rate": 3.0164755271804856e-05, + "loss": 0.759, "step": 5352 }, { - "epoch": 1.9547197370823444, - "grad_norm": 1.198870062828064, - "learning_rate": 7.77821678805446e-08, - "loss": 0.7729, + "epoch": 0.9286953504510757, + "grad_norm": 1.0835716724395752, + "learning_rate": 3.0158886990993612e-05, + "loss": 0.7561, "step": 5353 }, { - "epoch": 1.9550849004929707, - "grad_norm": 1.1817984580993652, - "learning_rate": 7.652336231295021e-08, - "loss": 0.813, + "epoch": 0.9288688410825815, + "grad_norm": 0.6356520652770996, + "learning_rate": 3.015301753123169e-05, + "loss": 0.7979, "step": 5354 }, { - "epoch": 1.955450063903597, - "grad_norm": 1.7721412181854248, - "learning_rate": 7.527480657460562e-08, - "loss": 0.7766, + "epoch": 0.9290423317140875, + "grad_norm": 0.7558521628379822, + "learning_rate": 3.0147146893200248e-05, + "loss": 0.7979, "step": 5355 }, { - "epoch": 1.955815227314223, - "grad_norm": 1.0006965398788452, - "learning_rate": 7.403650130784368e-08, - "loss": 0.7732, + "epoch": 0.9292158223455933, + "grad_norm": 1.031262755393982, + "learning_rate": 3.0141275077580592e-05, + "loss": 0.6997, "step": 5356 }, { - "epoch": 1.9561803907248494, - "grad_norm": 1.1956435441970825, - "learning_rate": 7.280844714972368e-08, - "loss": 0.8063, + "epoch": 0.9293893129770993, + "grad_norm": 0.6869120001792908, + "learning_rate": 3.0135402085054148e-05, + "loss": 0.7507, "step": 5357 }, { - "epoch": 1.9565455541354755, - "grad_norm": 0.9270648956298828, - "learning_rate": 7.159064473202914e-08, - "loss": 0.8606, + "epoch": 0.9295628036086051, + "grad_norm": 0.6340370774269104, + "learning_rate": 3.0129527916302482e-05, + "loss": 0.906, "step": 5358 }, { - "epoch": 1.9569107175461018, - "grad_norm": 0.9440163969993591, - "learning_rate": 7.038309468127225e-08, - "loss": 0.7846, + "epoch": 0.9297362942401111, + "grad_norm": 0.9854668974876404, + "learning_rate": 3.0123652572007295e-05, + "loss": 0.8818, "step": 5359 }, { - "epoch": 1.957275880956728, - "grad_norm": 0.9993908405303955, - "learning_rate": 6.918579761868493e-08, - "loss": 0.8085, + "epoch": 0.9299097848716169, + "grad_norm": 0.793328046798706, + "learning_rate": 3.0117776052850427e-05, + "loss": 0.7688, "step": 5360 }, { - "epoch": 1.9576410443673544, - "grad_norm": 1.2688987255096436, - "learning_rate": 6.799875416023005e-08, - "loss": 0.7959, + "epoch": 0.9300832755031229, + "grad_norm": 0.6880084872245789, + "learning_rate": 3.0111898359513865e-05, + "loss": 0.7998, "step": 5361 }, { - "epoch": 1.9580062077779807, - "grad_norm": 1.2205994129180908, - "learning_rate": 6.682196491659687e-08, - "loss": 0.7869, + "epoch": 0.9302567661346287, + "grad_norm": 0.8677441477775574, + "learning_rate": 3.0106019492679714e-05, + "loss": 0.8127, "step": 5362 }, { - "epoch": 1.958371371188607, - "grad_norm": 1.3547202348709106, - "learning_rate": 6.565543049319445e-08, - "loss": 0.7903, + "epoch": 0.9304302567661347, + "grad_norm": 1.1005345582962036, + "learning_rate": 3.0100139453030222e-05, + "loss": 0.6467, "step": 5363 }, { - "epoch": 1.9587365345992331, - "grad_norm": 2.473787784576416, - "learning_rate": 6.449915149015828e-08, - "loss": 0.8346, + "epoch": 0.9306037473976405, + "grad_norm": 0.9107725620269775, + "learning_rate": 3.009425824124778e-05, + "loss": 0.7754, "step": 5364 }, { - "epoch": 1.9591016980098594, - "grad_norm": 1.8857228755950928, - "learning_rate": 6.335312850234365e-08, - "loss": 0.8084, + "epoch": 0.9307772380291465, + "grad_norm": 0.8253245949745178, + "learning_rate": 3.0088375858014905e-05, + "loss": 0.8232, "step": 5365 }, { - "epoch": 1.9594668614204855, - "grad_norm": 1.0424976348876953, - "learning_rate": 6.221736211933893e-08, - "loss": 0.8136, + "epoch": 0.9309507286606523, + "grad_norm": 0.6939373016357422, + "learning_rate": 3.008249230401426e-05, + "loss": 0.8503, "step": 5366 }, { - "epoch": 1.9598320248311119, - "grad_norm": 0.9217798709869385, - "learning_rate": 6.109185292544784e-08, - "loss": 0.8348, + "epoch": 0.9311242192921583, + "grad_norm": 0.7065132856369019, + "learning_rate": 3.007660757992863e-05, + "loss": 0.8523, "step": 5367 }, { - "epoch": 1.9601971882417382, - "grad_norm": 1.0138442516326904, - "learning_rate": 5.99766014996983e-08, - "loss": 0.8116, + "epoch": 0.9312977099236641, + "grad_norm": 0.9618935585021973, + "learning_rate": 3.0070721686440953e-05, + "loss": 0.7491, "step": 5368 }, { - "epoch": 1.9605623516523645, - "grad_norm": 1.7415562868118286, - "learning_rate": 5.887160841584472e-08, - "loss": 0.788, + "epoch": 0.9314712005551701, + "grad_norm": 1.3304764032363892, + "learning_rate": 3.0064834624234283e-05, + "loss": 0.8162, "step": 5369 }, { - "epoch": 1.9609275150629908, - "grad_norm": 1.1707872152328491, - "learning_rate": 5.777687424236123e-08, - "loss": 0.8272, + "epoch": 0.9316446911866759, + "grad_norm": 1.6112165451049805, + "learning_rate": 3.0058946393991833e-05, + "loss": 0.6047, "step": 5370 }, { - "epoch": 1.961292678473617, - "grad_norm": 1.1762518882751465, - "learning_rate": 5.669239954244399e-08, - "loss": 0.8125, + "epoch": 0.9318181818181818, + "grad_norm": 0.6773014664649963, + "learning_rate": 3.005305699639693e-05, + "loss": 0.8506, "step": 5371 }, { - "epoch": 1.9616578418842432, - "grad_norm": 1.2740592956542969, - "learning_rate": 5.561818487401338e-08, - "loss": 0.8251, + "epoch": 0.9319916724496877, + "grad_norm": 0.8730319738388062, + "learning_rate": 3.004716643213305e-05, + "loss": 0.864, "step": 5372 }, { - "epoch": 1.9620230052948695, - "grad_norm": 1.1995941400527954, - "learning_rate": 5.455423078970734e-08, - "loss": 0.8276, + "epoch": 0.9321651630811936, + "grad_norm": 1.1658484935760498, + "learning_rate": 3.0041274701883794e-05, + "loss": 0.6133, "step": 5373 }, { - "epoch": 1.9623881687054956, - "grad_norm": 0.9427727460861206, - "learning_rate": 5.350053783689024e-08, - "loss": 0.8073, + "epoch": 0.9323386537126995, + "grad_norm": 0.8673306107521057, + "learning_rate": 3.003538180633292e-05, + "loss": 0.7415, "step": 5374 }, { - "epoch": 1.962753332116122, - "grad_norm": 1.080256700515747, - "learning_rate": 5.2457106557641803e-08, - "loss": 0.7621, + "epoch": 0.9325121443442054, + "grad_norm": 0.7544202208518982, + "learning_rate": 3.002948774616429e-05, + "loss": 0.8674, "step": 5375 }, { - "epoch": 1.9631184955267482, - "grad_norm": 1.2521687746047974, - "learning_rate": 5.142393748876595e-08, - "loss": 0.7667, + "epoch": 0.9326856349757113, + "grad_norm": 1.0501501560211182, + "learning_rate": 3.0023592522061916e-05, + "loss": 0.7843, "step": 5376 }, { - "epoch": 1.9634836589373745, - "grad_norm": 1.1659667491912842, - "learning_rate": 5.040103116178863e-08, - "loss": 0.8104, + "epoch": 0.9328591256072172, + "grad_norm": 1.0250227451324463, + "learning_rate": 3.0017696134709946e-05, + "loss": 0.8433, "step": 5377 }, { - "epoch": 1.9638488223480008, - "grad_norm": 1.2434018850326538, - "learning_rate": 4.938838810295554e-08, - "loss": 0.7962, + "epoch": 0.9330326162387231, + "grad_norm": 0.725367546081543, + "learning_rate": 3.0011798584792672e-05, + "loss": 0.7207, "step": 5378 }, { - "epoch": 1.964213985758627, - "grad_norm": 1.2677363157272339, - "learning_rate": 4.8386008833225526e-08, - "loss": 0.79, + "epoch": 0.933206106870229, + "grad_norm": 1.4293593168258667, + "learning_rate": 3.000589987299451e-05, + "loss": 0.759, "step": 5379 }, { - "epoch": 1.9645791491692532, - "grad_norm": 0.9631461501121521, - "learning_rate": 4.739389386828608e-08, - "loss": 0.8126, + "epoch": 0.9333795975017349, + "grad_norm": 0.7632710337638855, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.8258, "step": 5380 }, { - "epoch": 1.9649443125798793, - "grad_norm": 1.3667045831680298, - "learning_rate": 4.6412043718540024e-08, - "loss": 0.7938, + "epoch": 0.9335530881332408, + "grad_norm": 0.6621934175491333, + "learning_rate": 2.9994098966493842e-05, + "loss": 0.756, "step": 5381 }, { - "epoch": 1.9653094759905056, - "grad_norm": 1.00787353515625, - "learning_rate": 4.544045888910997e-08, - "loss": 0.7784, + "epoch": 0.9337265787647467, + "grad_norm": 0.7780185341835022, + "learning_rate": 2.9988196773160857e-05, + "loss": 0.7397, "step": 5382 }, { - "epoch": 1.965674639401132, - "grad_norm": 1.3089302778244019, - "learning_rate": 4.447913987983832e-08, - "loss": 0.7751, + "epoch": 0.9339000693962526, + "grad_norm": 0.6961732506752014, + "learning_rate": 2.9982293420685998e-05, + "loss": 0.8713, "step": 5383 }, { - "epoch": 1.9660398028117583, - "grad_norm": 1.1874531507492065, - "learning_rate": 4.352808718528279e-08, - "loss": 0.8404, + "epoch": 0.9340735600277585, + "grad_norm": 0.6256163120269775, + "learning_rate": 2.9976388909754348e-05, + "loss": 0.7454, "step": 5384 }, { - "epoch": 1.9664049662223846, - "grad_norm": 1.5676307678222656, - "learning_rate": 4.2587301294723105e-08, - "loss": 0.814, + "epoch": 0.9342470506592644, + "grad_norm": 1.3283485174179077, + "learning_rate": 2.997048324105115e-05, + "loss": 0.7493, "step": 5385 }, { - "epoch": 1.966770129633011, - "grad_norm": 1.255475401878357, - "learning_rate": 4.165678269215656e-08, - "loss": 0.7834, + "epoch": 0.9344205412907703, + "grad_norm": 0.8063017725944519, + "learning_rate": 2.996457641526174e-05, + "loss": 0.6312, "step": 5386 }, { - "epoch": 1.967135293043637, - "grad_norm": 1.4698609113693237, - "learning_rate": 4.073653185629578e-08, - "loss": 0.823, + "epoch": 0.9345940319222762, + "grad_norm": 0.7630572319030762, + "learning_rate": 2.995866843307164e-05, + "loss": 0.782, "step": 5387 }, { - "epoch": 1.9675004564542633, - "grad_norm": 1.0739995241165161, - "learning_rate": 3.982654926057539e-08, - "loss": 0.8006, + "epoch": 0.9347675225537821, + "grad_norm": 0.7898972034454346, + "learning_rate": 2.995275929516646e-05, + "loss": 0.6592, "step": 5388 }, { - "epoch": 1.9678656198648894, - "grad_norm": 1.2462353706359863, - "learning_rate": 3.8926835373143125e-08, - "loss": 0.8208, + "epoch": 0.934941013185288, + "grad_norm": 1.2129666805267334, + "learning_rate": 2.9946849002231962e-05, + "loss": 0.642, "step": 5389 }, { - "epoch": 1.9682307832755157, - "grad_norm": 0.9381763339042664, - "learning_rate": 3.803739065686651e-08, - "loss": 0.8103, + "epoch": 0.9351145038167938, + "grad_norm": 1.0103672742843628, + "learning_rate": 2.9940937554954053e-05, + "loss": 0.6436, "step": 5390 }, { - "epoch": 1.968595946686142, - "grad_norm": 1.131927728652954, - "learning_rate": 3.7158215569326194e-08, - "loss": 0.804, + "epoch": 0.9352879944482998, + "grad_norm": 0.8073412775993347, + "learning_rate": 2.993502495401875e-05, + "loss": 0.8406, "step": 5391 }, { - "epoch": 1.9689611100967683, - "grad_norm": 1.1394319534301758, - "learning_rate": 3.628931056282703e-08, - "loss": 0.8145, + "epoch": 0.9354614850798056, + "grad_norm": 0.6961068511009216, + "learning_rate": 2.9929111200112233e-05, + "loss": 0.7312, "step": 5392 }, { - "epoch": 1.9693262735073946, - "grad_norm": 1.2094918489456177, - "learning_rate": 3.5430676084384775e-08, - "loss": 0.7898, + "epoch": 0.9356349757113116, + "grad_norm": 0.783565878868103, + "learning_rate": 2.9923196293920786e-05, + "loss": 0.8203, "step": 5393 }, { - "epoch": 1.969691436918021, - "grad_norm": 1.1086103916168213, - "learning_rate": 3.4582312575728306e-08, - "loss": 0.8202, + "epoch": 0.9358084663428174, + "grad_norm": 0.641018807888031, + "learning_rate": 2.991728023613085e-05, + "loss": 0.8503, "step": 5394 }, { - "epoch": 1.970056600328647, - "grad_norm": 1.0427817106246948, - "learning_rate": 3.3744220473312937e-08, - "loss": 0.796, + "epoch": 0.9359819569743234, + "grad_norm": 0.7806763052940369, + "learning_rate": 2.9911363027428982e-05, + "loss": 0.8379, "step": 5395 }, { - "epoch": 1.9704217637392734, - "grad_norm": 1.0124870538711548, - "learning_rate": 3.291640020829823e-08, - "loss": 0.8578, + "epoch": 0.9361554476058292, + "grad_norm": 0.7442367076873779, + "learning_rate": 2.990544466850189e-05, + "loss": 0.8545, "step": 5396 }, { - "epoch": 1.9707869271498994, - "grad_norm": 1.1541005373001099, - "learning_rate": 3.2098852206567944e-08, - "loss": 0.7877, + "epoch": 0.9363289382373352, + "grad_norm": 0.7240622639656067, + "learning_rate": 2.9899525160036398e-05, + "loss": 0.8636, "step": 5397 }, { - "epoch": 1.9711520905605258, - "grad_norm": 1.1417664289474487, - "learning_rate": 3.1291576888714536e-08, - "loss": 0.8092, + "epoch": 0.936502428868841, + "grad_norm": 0.7714655995368958, + "learning_rate": 2.9893604502719474e-05, + "loss": 0.8152, "step": 5398 }, { - "epoch": 1.971517253971152, - "grad_norm": 1.0063320398330688, - "learning_rate": 3.0494574670050236e-08, - "loss": 0.8307, + "epoch": 0.936675919500347, + "grad_norm": 1.0628761053085327, + "learning_rate": 2.9887682697238226e-05, + "loss": 0.7258, "step": 5399 }, { - "epoch": 1.9718824173817784, - "grad_norm": 1.0324442386627197, - "learning_rate": 2.970784596060261e-08, - "loss": 0.7623, + "epoch": 0.9368494101318529, + "grad_norm": 1.0560474395751953, + "learning_rate": 2.9881759744279875e-05, + "loss": 0.8538, "step": 5400 }, { - "epoch": 1.9724301624977176, - "grad_norm": 1.0018106698989868, - "learning_rate": 2.8931391165107902e-08, - "loss": 0.7583, + "epoch": 0.9370229007633588, + "grad_norm": 1.2630460262298584, + "learning_rate": 2.9875835644531793e-05, + "loss": 0.6707, "step": 5401 }, { - "epoch": 1.972795325908344, - "grad_norm": 1.081957221031189, - "learning_rate": 2.816521068302658e-08, - "loss": 0.7455, + "epoch": 0.9371963913948647, + "grad_norm": 0.7020349502563477, + "learning_rate": 2.986991039868148e-05, + "loss": 0.7439, "step": 5402 }, { - "epoch": 1.9731604893189703, - "grad_norm": 0.8807290196418762, - "learning_rate": 2.740930490852334e-08, - "loss": 0.819, + "epoch": 0.9373698820263706, + "grad_norm": 0.7619720697402954, + "learning_rate": 2.986398400741656e-05, + "loss": 0.7222, "step": 5403 }, { - "epoch": 1.9735256527295966, - "grad_norm": 1.0957330465316772, - "learning_rate": 2.6663674230482663e-08, - "loss": 0.8055, + "epoch": 0.9375433726578765, + "grad_norm": 1.0399168729782104, + "learning_rate": 2.9858056471424804e-05, + "loss": 0.824, "step": 5404 }, { - "epoch": 1.9738908161402229, - "grad_norm": 1.0040874481201172, - "learning_rate": 2.5928319032499928e-08, - "loss": 0.7319, + "epoch": 0.9377168632893824, + "grad_norm": 0.7383559346199036, + "learning_rate": 2.9852127791394106e-05, + "loss": 0.7485, "step": 5405 }, { - "epoch": 1.974255979550849, - "grad_norm": 1.1895182132720947, - "learning_rate": 2.520323969288807e-08, - "loss": 0.765, + "epoch": 0.9378903539208883, + "grad_norm": 0.8381448984146118, + "learning_rate": 2.98461979680125e-05, + "loss": 0.6677, "step": 5406 }, { - "epoch": 1.9746211429614753, - "grad_norm": 1.082749605178833, - "learning_rate": 2.4488436584670928e-08, - "loss": 0.7963, + "epoch": 0.9380638445523942, + "grad_norm": 0.9233793616294861, + "learning_rate": 2.984026700196814e-05, + "loss": 0.6299, "step": 5407 }, { - "epoch": 1.9749863063721014, - "grad_norm": 1.110884189605713, - "learning_rate": 2.378391007558767e-08, - "loss": 0.7864, + "epoch": 0.9382373351839001, + "grad_norm": 0.7783847451210022, + "learning_rate": 2.983433489394934e-05, + "loss": 0.6276, "step": 5408 }, { - "epoch": 1.9753514697827277, - "grad_norm": 1.2691941261291504, - "learning_rate": 2.3089660528083923e-08, - "loss": 0.757, + "epoch": 0.9384108258154059, + "grad_norm": 1.1404356956481934, + "learning_rate": 2.982840164464451e-05, + "loss": 0.7351, "step": 5409 }, { - "epoch": 1.975716633193354, - "grad_norm": 1.2070232629776, - "learning_rate": 2.240568829932732e-08, - "loss": 0.7756, + "epoch": 0.9385843164469119, + "grad_norm": 0.8917362093925476, + "learning_rate": 2.9822467254742212e-05, + "loss": 0.6423, "step": 5410 }, { - "epoch": 1.9760817966039803, - "grad_norm": 1.0093021392822266, - "learning_rate": 2.173199374119417e-08, - "loss": 0.7859, + "epoch": 0.9387578070784177, + "grad_norm": 0.7541847825050354, + "learning_rate": 2.9816531724931152e-05, + "loss": 0.5818, "step": 5411 }, { - "epoch": 1.9764469600146066, - "grad_norm": 1.0858479738235474, - "learning_rate": 2.106857720027167e-08, - "loss": 0.8048, + "epoch": 0.9389312977099237, + "grad_norm": 0.9122717380523682, + "learning_rate": 2.9810595055900148e-05, + "loss": 0.6613, "step": 5412 }, { - "epoch": 1.976812123425233, - "grad_norm": 0.922063946723938, - "learning_rate": 2.041543901786236e-08, - "loss": 0.7916, + "epoch": 0.9391047883414295, + "grad_norm": 0.750385582447052, + "learning_rate": 2.9804657248338146e-05, + "loss": 0.6344, "step": 5413 }, { - "epoch": 1.977177286835859, - "grad_norm": 1.0178011655807495, - "learning_rate": 1.9772579529977463e-08, - "loss": 0.7911, + "epoch": 0.9392782789729355, + "grad_norm": 0.8465158343315125, + "learning_rate": 2.9798718302934255e-05, + "loss": 0.7395, "step": 5414 }, { - "epoch": 1.9775424502464853, - "grad_norm": 1.1042615175247192, - "learning_rate": 1.913999906734354e-08, - "loss": 0.7421, + "epoch": 0.9394517696044413, + "grad_norm": 0.90374356508255, + "learning_rate": 2.9792778220377675e-05, + "loss": 0.7485, "step": 5415 }, { - "epoch": 1.9779076136571114, - "grad_norm": 1.7755894660949707, - "learning_rate": 1.851769795540026e-08, - "loss": 0.803, + "epoch": 0.9396252602359473, + "grad_norm": 1.0222971439361572, + "learning_rate": 2.9786837001357782e-05, + "loss": 0.7832, "step": 5416 }, { - "epoch": 1.9782727770677377, - "grad_norm": 0.8576592206954956, - "learning_rate": 1.7905676514293757e-08, - "loss": 0.7893, + "epoch": 0.9397987508674531, + "grad_norm": 0.7116925716400146, + "learning_rate": 2.978089464656405e-05, + "loss": 0.8787, "step": 5417 }, { - "epoch": 1.978637940478364, - "grad_norm": 0.9696841239929199, - "learning_rate": 1.7303935058885502e-08, - "loss": 0.8187, + "epoch": 0.9399722414989591, + "grad_norm": 0.8535963892936707, + "learning_rate": 2.9774951156686094e-05, + "loss": 0.7546, "step": 5418 }, { - "epoch": 1.9790031038889904, - "grad_norm": 1.0287052392959595, - "learning_rate": 1.6712473898745642e-08, - "loss": 0.7735, + "epoch": 0.9401457321304649, + "grad_norm": 1.1796780824661255, + "learning_rate": 2.9769006532413667e-05, + "loss": 0.6895, "step": 5419 }, { - "epoch": 1.9793682672996167, - "grad_norm": 0.882977306842804, - "learning_rate": 1.6131293338157438e-08, - "loss": 0.8422, + "epoch": 0.9403192227619709, + "grad_norm": 1.0555028915405273, + "learning_rate": 2.976306077443665e-05, + "loss": 0.7061, "step": 5420 }, { - "epoch": 1.979733430710243, - "grad_norm": 1.0711408853530884, - "learning_rate": 1.55603936761195e-08, - "loss": 0.7963, + "epoch": 0.9404927133934767, + "grad_norm": 0.6422418355941772, + "learning_rate": 2.9757113883445056e-05, + "loss": 0.5978, "step": 5421 }, { - "epoch": 1.980098594120869, - "grad_norm": 1.1984899044036865, - "learning_rate": 1.4999775206330224e-08, - "loss": 0.7392, + "epoch": 0.9406662040249827, + "grad_norm": 0.9957783222198486, + "learning_rate": 2.9751165860129024e-05, + "loss": 0.6438, "step": 5422 }, { - "epoch": 1.9804637575314954, - "grad_norm": 1.069635272026062, - "learning_rate": 1.444943821721001e-08, - "loss": 0.823, + "epoch": 0.9408396946564885, + "grad_norm": 0.9451017379760742, + "learning_rate": 2.974521670517883e-05, + "loss": 0.8934, "step": 5423 }, { - "epoch": 1.9808289209421215, - "grad_norm": 1.1592940092086792, - "learning_rate": 1.390938299188349e-08, - "loss": 0.8244, + "epoch": 0.9410131852879945, + "grad_norm": 0.9176308512687683, + "learning_rate": 2.973926641928489e-05, + "loss": 0.6915, "step": 5424 }, { - "epoch": 1.9811940843527478, - "grad_norm": 1.0786094665527344, - "learning_rate": 1.337960980818842e-08, - "loss": 0.8061, + "epoch": 0.9411866759195003, + "grad_norm": 0.9451529383659363, + "learning_rate": 2.9733315003137725e-05, + "loss": 0.6152, "step": 5425 }, { - "epoch": 1.981559247763374, - "grad_norm": 0.867221474647522, - "learning_rate": 1.2860118938669008e-08, - "loss": 0.8178, + "epoch": 0.9413601665510063, + "grad_norm": 1.0227429866790771, + "learning_rate": 2.9727362457428012e-05, + "loss": 0.6115, "step": 5426 }, { - "epoch": 1.9819244111740004, - "grad_norm": 0.9132350087165833, - "learning_rate": 1.2350910650587022e-08, - "loss": 0.8189, + "epoch": 0.9415336571825121, + "grad_norm": 0.9630193114280701, + "learning_rate": 2.9721408782846554e-05, + "loss": 0.7047, "step": 5427 }, { - "epoch": 1.9822895745846267, - "grad_norm": 1.4670971632003784, - "learning_rate": 1.1851985205904026e-08, - "loss": 0.7829, + "epoch": 0.9417071478140181, + "grad_norm": 0.684417724609375, + "learning_rate": 2.971545398008428e-05, + "loss": 0.7778, "step": 5428 }, { - "epoch": 1.9826547379952528, - "grad_norm": 0.9495171308517456, - "learning_rate": 1.1363342861301363e-08, - "loss": 0.8037, + "epoch": 0.9418806384455239, + "grad_norm": 0.9652040004730225, + "learning_rate": 2.970949804983225e-05, + "loss": 0.8513, "step": 5429 }, { - "epoch": 1.9830199014058791, - "grad_norm": 0.9802129864692688, - "learning_rate": 1.0884983868166832e-08, - "loss": 0.7336, + "epoch": 0.9420541290770298, + "grad_norm": 0.9616713523864746, + "learning_rate": 2.970354099278166e-05, + "loss": 0.7307, "step": 5430 }, { - "epoch": 1.9833850648165052, - "grad_norm": 1.1494317054748535, - "learning_rate": 1.0416908472592468e-08, - "loss": 0.7509, + "epoch": 0.9422276197085357, + "grad_norm": 1.0854194164276123, + "learning_rate": 2.9697582809623828e-05, + "loss": 0.5876, "step": 5431 }, { - "epoch": 1.9837502282271315, - "grad_norm": 1.032422661781311, - "learning_rate": 9.959116915387868e-09, - "loss": 0.7803, + "epoch": 0.9424011103400416, + "grad_norm": 0.8601900339126587, + "learning_rate": 2.9691623501050212e-05, + "loss": 0.7802, "step": 5432 }, { - "epoch": 1.9841153916377579, - "grad_norm": 0.9846093654632568, - "learning_rate": 9.51160943206686e-09, - "loss": 0.7794, + "epoch": 0.9425746009715475, + "grad_norm": 2.6932709217071533, + "learning_rate": 2.968566306775239e-05, + "loss": 0.8289, "step": 5433 }, { - "epoch": 1.9844805550483842, - "grad_norm": 1.334524154663086, - "learning_rate": 9.074386252854172e-09, - "loss": 0.7525, + "epoch": 0.9427480916030534, + "grad_norm": 1.0580146312713623, + "learning_rate": 2.967970151042209e-05, + "loss": 0.7123, "step": 5434 }, { - "epoch": 1.9848457184590105, - "grad_norm": 1.1554641723632812, - "learning_rate": 8.647447602683212e-09, - "loss": 0.7854, + "epoch": 0.9429215822345594, + "grad_norm": 0.887374758720398, + "learning_rate": 2.9673738829751148e-05, + "loss": 0.8069, "step": 5435 }, { - "epoch": 1.9852108818696368, - "grad_norm": 1.2283408641815186, - "learning_rate": 8.23079370119828e-09, - "loss": 0.7467, + "epoch": 0.9430950728660652, + "grad_norm": 0.9350341558456421, + "learning_rate": 2.9667775026431544e-05, + "loss": 0.5878, "step": 5436 }, { - "epoch": 1.9855760452802629, - "grad_norm": 1.2019792795181274, - "learning_rate": 7.824424762750137e-09, - "loss": 0.7789, + "epoch": 0.9432685634975712, + "grad_norm": 1.2136422395706177, + "learning_rate": 2.9661810101155387e-05, + "loss": 0.6854, "step": 5437 }, { - "epoch": 1.9859412086908892, - "grad_norm": 1.6912720203399658, - "learning_rate": 7.428340996400441e-09, - "loss": 0.7689, + "epoch": 0.943442054129077, + "grad_norm": 0.8023399114608765, + "learning_rate": 2.96558440546149e-05, + "loss": 0.7576, "step": 5438 }, { - "epoch": 1.9863063721015153, - "grad_norm": 0.8840200304985046, - "learning_rate": 7.042542605915081e-09, - "loss": 0.7864, + "epoch": 0.943615544760583, + "grad_norm": 1.0437283515930176, + "learning_rate": 2.9649876887502467e-05, + "loss": 0.7051, "step": 5439 }, { - "epoch": 1.9866715355121416, - "grad_norm": 1.1264209747314453, - "learning_rate": 6.667029789775292e-09, - "loss": 0.7827, + "epoch": 0.9437890353920888, + "grad_norm": 1.3257557153701782, + "learning_rate": 2.9643908600510572e-05, + "loss": 0.7173, "step": 5440 }, { - "epoch": 1.987036698922768, - "grad_norm": 1.1234359741210938, - "learning_rate": 6.301802741166541e-09, - "loss": 0.7755, + "epoch": 0.9439625260235948, + "grad_norm": 2.181450128555298, + "learning_rate": 2.9637939194331848e-05, + "loss": 0.666, "step": 5441 }, { - "epoch": 1.9874018623333942, - "grad_norm": 0.9075736403465271, - "learning_rate": 5.946861647982971e-09, - "loss": 0.7928, + "epoch": 0.9441360166551006, + "grad_norm": 0.9346305727958679, + "learning_rate": 2.9631968669659047e-05, + "loss": 0.7284, "step": 5442 }, { - "epoch": 1.9877670257440205, - "grad_norm": 1.0858089923858643, - "learning_rate": 5.602206692827405e-09, - "loss": 0.7586, + "epoch": 0.9443095072866066, + "grad_norm": 0.9683916568756104, + "learning_rate": 2.9625997027185064e-05, + "loss": 0.7153, "step": 5443 }, { - "epoch": 1.9881321891546468, - "grad_norm": 1.286030650138855, - "learning_rate": 5.267838053011343e-09, - "loss": 0.7763, + "epoch": 0.9444829979181124, + "grad_norm": 0.6567923426628113, + "learning_rate": 2.9620024267602906e-05, + "loss": 0.7893, "step": 5444 }, { - "epoch": 1.988497352565273, - "grad_norm": 0.7586479187011719, - "learning_rate": 4.943755900554958e-09, - "loss": 0.7808, + "epoch": 0.9446564885496184, + "grad_norm": 0.7897910475730896, + "learning_rate": 2.9614050391605725e-05, + "loss": 0.7739, "step": 5445 }, { - "epoch": 1.9888625159758992, - "grad_norm": 0.8890017867088318, - "learning_rate": 4.629960402182665e-09, - "loss": 0.8255, + "epoch": 0.9448299791811242, + "grad_norm": 0.8999233841896057, + "learning_rate": 2.9608075399886792e-05, + "loss": 0.7399, "step": 5446 }, { - "epoch": 1.9892276793865253, - "grad_norm": 1.0157991647720337, - "learning_rate": 4.326451719334213e-09, - "loss": 0.8167, + "epoch": 0.9450034698126302, + "grad_norm": 1.6164944171905518, + "learning_rate": 2.9602099293139512e-05, + "loss": 0.7881, "step": 5447 }, { - "epoch": 1.9895928427971517, - "grad_norm": 1.0608521699905396, - "learning_rate": 4.033230008146927e-09, - "loss": 0.7635, + "epoch": 0.945176960444136, + "grad_norm": 0.9821656942367554, + "learning_rate": 2.9596122072057424e-05, + "loss": 0.6753, "step": 5448 }, { - "epoch": 1.989958006207778, - "grad_norm": 0.9976930022239685, - "learning_rate": 3.750295419475692e-09, - "loss": 0.8546, + "epoch": 0.9453504510756419, + "grad_norm": 0.9516210556030273, + "learning_rate": 2.9590143737334182e-05, + "loss": 0.6873, "step": 5449 }, { - "epoch": 1.9903231696184043, - "grad_norm": 0.8211843967437744, - "learning_rate": 3.477648098879627e-09, - "loss": 0.8257, + "epoch": 0.9455239417071478, + "grad_norm": 0.8133457899093628, + "learning_rate": 2.958416428966359e-05, + "loss": 0.782, "step": 5450 }, { - "epoch": 1.9906883330290306, - "grad_norm": 0.9779754281044006, - "learning_rate": 3.2152881866198695e-09, - "loss": 0.7916, + "epoch": 0.9456974323386537, + "grad_norm": 0.8450877666473389, + "learning_rate": 2.9578183729739566e-05, + "loss": 0.7395, "step": 5451 }, { - "epoch": 1.991053496439657, - "grad_norm": 1.104073166847229, - "learning_rate": 2.9632158176751134e-09, - "loss": 0.7897, + "epoch": 0.9458709229701596, + "grad_norm": 0.8753635883331299, + "learning_rate": 2.957220205825615e-05, + "loss": 0.7025, "step": 5452 }, { - "epoch": 1.991418659850283, - "grad_norm": 1.038170337677002, - "learning_rate": 2.72143112172607e-09, - "loss": 0.7815, + "epoch": 0.9460444136016655, + "grad_norm": 1.5894778966903687, + "learning_rate": 2.956621927590754e-05, + "loss": 0.6554, "step": 5453 }, { - "epoch": 1.9917838232609093, - "grad_norm": 1.1254079341888428, - "learning_rate": 2.489934223157686e-09, - "loss": 0.8263, + "epoch": 0.9462179042331714, + "grad_norm": 0.7335399985313416, + "learning_rate": 2.956023538338803e-05, + "loss": 0.6873, "step": 5454 }, { - "epoch": 1.9921489866715354, - "grad_norm": 0.8627570271492004, - "learning_rate": 2.268725241068026e-09, - "loss": 0.8042, + "epoch": 0.9463913948646773, + "grad_norm": 0.8804233074188232, + "learning_rate": 2.9554250381392067e-05, + "loss": 0.6588, "step": 5455 }, { - "epoch": 1.9925141500821617, - "grad_norm": 1.064124584197998, - "learning_rate": 2.0578042892616114e-09, - "loss": 0.768, + "epoch": 0.9465648854961832, + "grad_norm": 0.8268002867698669, + "learning_rate": 2.9548264270614217e-05, + "loss": 0.726, "step": 5456 }, { - "epoch": 1.992879313492788, - "grad_norm": 0.8981339335441589, - "learning_rate": 1.8571714762471993e-09, - "loss": 0.8042, + "epoch": 0.9467383761276891, + "grad_norm": 0.9540483951568604, + "learning_rate": 2.954227705174917e-05, + "loss": 0.7063, "step": 5457 }, { - "epoch": 1.9932444769034143, - "grad_norm": 1.0292837619781494, - "learning_rate": 1.6668269052422248e-09, - "loss": 0.8441, + "epoch": 0.946911866759195, + "grad_norm": 0.8591688871383667, + "learning_rate": 2.953628872549175e-05, + "loss": 0.7188, "step": 5458 }, { - "epoch": 1.9936096403140406, - "grad_norm": 1.0158228874206543, - "learning_rate": 1.4867706741727994e-09, - "loss": 0.731, + "epoch": 0.9470853573907009, + "grad_norm": 1.2600600719451904, + "learning_rate": 2.9530299292536918e-05, + "loss": 0.7786, "step": 5459 }, { - "epoch": 1.9939748037246667, - "grad_norm": 1.0490115880966187, - "learning_rate": 1.3170028756670506e-09, - "loss": 0.7838, + "epoch": 0.9472588480222068, + "grad_norm": 3.727069616317749, + "learning_rate": 2.952430875357974e-05, + "loss": 0.7874, "step": 5460 }, { - "epoch": 1.994339967135293, - "grad_norm": 1.0367121696472168, - "learning_rate": 1.157523597068444e-09, - "loss": 0.7578, + "epoch": 0.9474323386537127, + "grad_norm": 0.974780261516571, + "learning_rate": 2.951831710931544e-05, + "loss": 0.6697, "step": 5461 }, { - "epoch": 1.9947051305459191, - "grad_norm": 1.0636622905731201, - "learning_rate": 1.0083329204180204e-09, - "loss": 0.7612, + "epoch": 0.9476058292852186, + "grad_norm": 1.0150014162063599, + "learning_rate": 2.9512324360439347e-05, + "loss": 0.626, "step": 5462 }, { - "epoch": 1.9950702939565454, - "grad_norm": 1.1342140436172485, - "learning_rate": 8.694309224721586e-10, - "loss": 0.7935, + "epoch": 0.9477793199167245, + "grad_norm": 0.7214654684066772, + "learning_rate": 2.950633050764692e-05, + "loss": 0.8015, "step": 5463 }, { - "epoch": 1.9954354573671718, - "grad_norm": 0.8528012633323669, - "learning_rate": 7.408176746892537e-10, - "loss": 0.8126, + "epoch": 0.9479528105482304, + "grad_norm": 1.030455231666565, + "learning_rate": 2.9500335551633773e-05, + "loss": 0.6958, "step": 5464 }, { - "epoch": 1.995800620777798, - "grad_norm": 1.2457681894302368, - "learning_rate": 6.224932432363773e-10, - "loss": 0.8134, + "epoch": 0.9481263011797363, + "grad_norm": 1.0040634870529175, + "learning_rate": 2.9494339493095605e-05, + "loss": 0.5719, "step": 5465 }, { - "epoch": 1.9961657841884244, - "grad_norm": 1.0373698472976685, - "learning_rate": 5.144576889826169e-10, - "loss": 0.798, + "epoch": 0.9482997918112422, + "grad_norm": 2.2815096378326416, + "learning_rate": 2.9488342332728276e-05, + "loss": 0.6417, "step": 5466 }, { - "epoch": 1.9965309475990507, - "grad_norm": 1.2331247329711914, - "learning_rate": 4.1671106751239866e-10, - "loss": 0.7533, + "epoch": 0.9484732824427481, + "grad_norm": 0.9178109765052795, + "learning_rate": 2.9482344071227767e-05, + "loss": 0.6027, "step": 5467 }, { - "epoch": 1.9968961110096768, - "grad_norm": 1.1192184686660767, - "learning_rate": 3.2925342911216405e-10, - "loss": 0.8074, + "epoch": 0.9486467730742539, + "grad_norm": 0.7549407482147217, + "learning_rate": 2.9476344709290175e-05, + "loss": 0.8582, "step": 5468 }, { - "epoch": 1.997261274420303, - "grad_norm": 1.3498742580413818, - "learning_rate": 2.5208481877259103e-10, - "loss": 0.8009, + "epoch": 0.9488202637057599, + "grad_norm": 0.8371760249137878, + "learning_rate": 2.9470344247611744e-05, + "loss": 0.7062, "step": 5469 }, { - "epoch": 1.9976264378309292, - "grad_norm": 1.1625001430511475, - "learning_rate": 1.8520527619747543e-10, - "loss": 0.7954, + "epoch": 0.9489937543372657, + "grad_norm": 1.089241623878479, + "learning_rate": 2.9464342686888826e-05, + "loss": 0.8484, "step": 5470 }, { - "epoch": 1.9979916012415555, - "grad_norm": 1.1209075450897217, - "learning_rate": 1.2861483579040824e-10, - "loss": 0.7837, + "epoch": 0.9491672449687717, + "grad_norm": 1.0411460399627686, + "learning_rate": 2.9458340027817912e-05, + "loss": 0.8301, "step": 5471 }, { - "epoch": 1.9983567646521818, - "grad_norm": 1.1680302619934082, - "learning_rate": 8.231352666587811e-11, - "loss": 0.7945, + "epoch": 0.9493407356002775, + "grad_norm": 0.9168943166732788, + "learning_rate": 2.9452336271095613e-05, + "loss": 0.7959, "step": 5472 }, { - "epoch": 1.9987219280628081, - "grad_norm": 1.1322518587112427, - "learning_rate": 4.630137264483026e-11, - "loss": 0.7754, + "epoch": 0.9495142262317835, + "grad_norm": 1.034737467765808, + "learning_rate": 2.9446331417418678e-05, + "loss": 0.7612, "step": 5473 }, { - "epoch": 1.9990870914734344, - "grad_norm": 1.179162859916687, - "learning_rate": 2.0578392252446066e-11, - "loss": 0.7766, + "epoch": 0.9496877168632893, + "grad_norm": 1.2714648246765137, + "learning_rate": 2.9440325467483974e-05, + "loss": 0.6792, "step": 5474 }, { - "epoch": 1.9994522548840608, - "grad_norm": 0.8329395055770874, - "learning_rate": 5.1445987248044394e-12, - "loss": 0.79, + "epoch": 0.9498612074947953, + "grad_norm": 0.7233021855354309, + "learning_rate": 2.94343184219885e-05, + "loss": 0.7976, "step": 5475 }, { - "epoch": 1.9998174182946868, - "grad_norm": 1.3422150611877441, - "learning_rate": 0.0, - "loss": 0.7775, + "epoch": 0.9500346981263011, + "grad_norm": 0.600640594959259, + "learning_rate": 2.942831028162938e-05, + "loss": 0.8643, "step": 5476 }, { - "epoch": 1.9998174182946868, - "step": 5476, + "epoch": 0.9502081887578071, + "grad_norm": 0.9557732939720154, + "learning_rate": 2.942230104710387e-05, + "loss": 0.6547, + "step": 5477 + }, + { + "epoch": 0.950381679389313, + "grad_norm": 0.6332347393035889, + "learning_rate": 2.9416290719109333e-05, + "loss": 0.8384, + "step": 5478 + }, + { + "epoch": 0.9505551700208189, + "grad_norm": 1.0564005374908447, + "learning_rate": 2.9410279298343294e-05, + "loss": 0.8215, + "step": 5479 + }, + { + "epoch": 0.9507286606523248, + "grad_norm": 2.3100028038024902, + "learning_rate": 2.9404266785503376e-05, + "loss": 0.6372, + "step": 5480 + }, + { + "epoch": 0.9509021512838307, + "grad_norm": 0.6764401197433472, + "learning_rate": 2.939825318128734e-05, + "loss": 0.8259, + "step": 5481 + }, + { + "epoch": 0.9510756419153366, + "grad_norm": 0.7380715608596802, + "learning_rate": 2.9392238486393068e-05, + "loss": 0.9136, + "step": 5482 + }, + { + "epoch": 0.9512491325468425, + "grad_norm": 0.9035728573799133, + "learning_rate": 2.9386222701518582e-05, + "loss": 0.7347, + "step": 5483 + }, + { + "epoch": 0.9514226231783484, + "grad_norm": 1.3185487985610962, + "learning_rate": 2.9380205827362007e-05, + "loss": 0.6826, + "step": 5484 + }, + { + "epoch": 0.9515961138098543, + "grad_norm": 0.9550650119781494, + "learning_rate": 2.937418786462162e-05, + "loss": 0.7991, + "step": 5485 + }, + { + "epoch": 0.9517696044413602, + "grad_norm": 0.8029499650001526, + "learning_rate": 2.9368168813995806e-05, + "loss": 0.728, + "step": 5486 + }, + { + "epoch": 0.9519430950728661, + "grad_norm": 0.8706030249595642, + "learning_rate": 2.9362148676183087e-05, + "loss": 0.6512, + "step": 5487 + }, + { + "epoch": 0.952116585704372, + "grad_norm": 1.1258177757263184, + "learning_rate": 2.9356127451882105e-05, + "loss": 0.7847, + "step": 5488 + }, + { + "epoch": 0.9522900763358778, + "grad_norm": 1.3309884071350098, + "learning_rate": 2.9350105141791627e-05, + "loss": 0.7065, + "step": 5489 + }, + { + "epoch": 0.9524635669673838, + "grad_norm": 0.7769266366958618, + "learning_rate": 2.934408174661055e-05, + "loss": 0.8347, + "step": 5490 + }, + { + "epoch": 0.9526370575988896, + "grad_norm": 0.8437520265579224, + "learning_rate": 2.9338057267037906e-05, + "loss": 0.752, + "step": 5491 + }, + { + "epoch": 0.9528105482303956, + "grad_norm": 1.7555534839630127, + "learning_rate": 2.933203170377283e-05, + "loss": 0.7325, + "step": 5492 + }, + { + "epoch": 0.9529840388619014, + "grad_norm": 0.9326121211051941, + "learning_rate": 2.9326005057514605e-05, + "loss": 0.7328, + "step": 5493 + }, + { + "epoch": 0.9531575294934074, + "grad_norm": 0.8728808760643005, + "learning_rate": 2.931997732896262e-05, + "loss": 0.6758, + "step": 5494 + }, + { + "epoch": 0.9533310201249132, + "grad_norm": 1.0149542093276978, + "learning_rate": 2.9313948518816417e-05, + "loss": 0.7458, + "step": 5495 + }, + { + "epoch": 0.9535045107564192, + "grad_norm": 1.0819560289382935, + "learning_rate": 2.9307918627775627e-05, + "loss": 0.8049, + "step": 5496 + }, + { + "epoch": 0.953678001387925, + "grad_norm": 1.0369726419448853, + "learning_rate": 2.9301887656540034e-05, + "loss": 0.6489, + "step": 5497 + }, + { + "epoch": 0.953851492019431, + "grad_norm": 1.0735323429107666, + "learning_rate": 2.9295855605809543e-05, + "loss": 0.6666, + "step": 5498 + }, + { + "epoch": 0.9540249826509368, + "grad_norm": 0.8008173108100891, + "learning_rate": 2.9289822476284172e-05, + "loss": 0.782, + "step": 5499 + }, + { + "epoch": 0.9541984732824428, + "grad_norm": 1.309561848640442, + "learning_rate": 2.9283788268664085e-05, + "loss": 0.8496, + "step": 5500 + }, + { + "epoch": 0.9543719639139486, + "grad_norm": 0.9931257963180542, + "learning_rate": 2.9277752983649548e-05, + "loss": 0.6868, + "step": 5501 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.8853698372840881, + "learning_rate": 2.9271716621940965e-05, + "loss": 0.761, + "step": 5502 + }, + { + "epoch": 0.9547189451769604, + "grad_norm": 0.957352340221405, + "learning_rate": 2.9265679184238867e-05, + "loss": 0.8613, + "step": 5503 + }, + { + "epoch": 0.9548924358084664, + "grad_norm": 0.7933668494224548, + "learning_rate": 2.9259640671243903e-05, + "loss": 0.8474, + "step": 5504 + }, + { + "epoch": 0.9550659264399722, + "grad_norm": 0.8517404198646545, + "learning_rate": 2.9253601083656846e-05, + "loss": 0.7256, + "step": 5505 + }, + { + "epoch": 0.9552394170714782, + "grad_norm": 0.838321328163147, + "learning_rate": 2.9247560422178604e-05, + "loss": 0.8062, + "step": 5506 + }, + { + "epoch": 0.955412907702984, + "grad_norm": 0.7293458580970764, + "learning_rate": 2.92415186875102e-05, + "loss": 0.8293, + "step": 5507 + }, + { + "epoch": 0.9555863983344899, + "grad_norm": 0.754668653011322, + "learning_rate": 2.923547588035278e-05, + "loss": 0.8202, + "step": 5508 + }, + { + "epoch": 0.9557598889659958, + "grad_norm": 0.8310223817825317, + "learning_rate": 2.922943200140763e-05, + "loss": 0.7712, + "step": 5509 + }, + { + "epoch": 0.9559333795975017, + "grad_norm": 1.2016735076904297, + "learning_rate": 2.9223387051376133e-05, + "loss": 0.7629, + "step": 5510 + }, + { + "epoch": 0.9561068702290076, + "grad_norm": 1.0882593393325806, + "learning_rate": 2.9217341030959824e-05, + "loss": 0.7996, + "step": 5511 + }, + { + "epoch": 0.9562803608605135, + "grad_norm": 1.421604037284851, + "learning_rate": 2.921129394086035e-05, + "loss": 0.7085, + "step": 5512 + }, + { + "epoch": 0.9564538514920194, + "grad_norm": 1.5316734313964844, + "learning_rate": 2.920524578177948e-05, + "loss": 0.7385, + "step": 5513 + }, + { + "epoch": 0.9566273421235253, + "grad_norm": 0.766771674156189, + "learning_rate": 2.919919655441911e-05, + "loss": 0.7986, + "step": 5514 + }, + { + "epoch": 0.9568008327550312, + "grad_norm": 1.0445606708526611, + "learning_rate": 2.9193146259481265e-05, + "loss": 0.8147, + "step": 5515 + }, + { + "epoch": 0.9569743233865371, + "grad_norm": 0.7108529806137085, + "learning_rate": 2.9187094897668076e-05, + "loss": 0.9065, + "step": 5516 + }, + { + "epoch": 0.957147814018043, + "grad_norm": 0.7270875573158264, + "learning_rate": 2.9181042469681825e-05, + "loss": 0.9509, + "step": 5517 + }, + { + "epoch": 0.9573213046495489, + "grad_norm": 0.7581190466880798, + "learning_rate": 2.9174988976224897e-05, + "loss": 0.6903, + "step": 5518 + }, + { + "epoch": 0.9574947952810549, + "grad_norm": 0.9458627700805664, + "learning_rate": 2.9168934417999803e-05, + "loss": 0.8218, + "step": 5519 + }, + { + "epoch": 0.9576682859125607, + "grad_norm": 1.2696605920791626, + "learning_rate": 2.916287879570919e-05, + "loss": 0.6681, + "step": 5520 + }, + { + "epoch": 0.9578417765440667, + "grad_norm": 1.0369662046432495, + "learning_rate": 2.9156822110055816e-05, + "loss": 0.7023, + "step": 5521 + }, + { + "epoch": 0.9580152671755725, + "grad_norm": 0.9375792145729065, + "learning_rate": 2.9150764361742564e-05, + "loss": 0.6277, + "step": 5522 + }, + { + "epoch": 0.9581887578070785, + "grad_norm": 0.8462186455726624, + "learning_rate": 2.9144705551472445e-05, + "loss": 0.7598, + "step": 5523 + }, + { + "epoch": 0.9583622484385843, + "grad_norm": 0.7707379460334778, + "learning_rate": 2.9138645679948596e-05, + "loss": 0.6528, + "step": 5524 + }, + { + "epoch": 0.9585357390700903, + "grad_norm": 0.8648982644081116, + "learning_rate": 2.9132584747874265e-05, + "loss": 0.6777, + "step": 5525 + }, + { + "epoch": 0.9587092297015961, + "grad_norm": 1.0049614906311035, + "learning_rate": 2.912652275595283e-05, + "loss": 0.6483, + "step": 5526 + }, + { + "epoch": 0.9588827203331021, + "grad_norm": 1.6015599966049194, + "learning_rate": 2.91204597048878e-05, + "loss": 0.6504, + "step": 5527 + }, + { + "epoch": 0.9590562109646079, + "grad_norm": 0.8254128694534302, + "learning_rate": 2.9114395595382786e-05, + "loss": 0.6997, + "step": 5528 + }, + { + "epoch": 0.9592297015961138, + "grad_norm": 2.3671796321868896, + "learning_rate": 2.9108330428141544e-05, + "loss": 0.6814, + "step": 5529 + }, + { + "epoch": 0.9594031922276197, + "grad_norm": 1.077761173248291, + "learning_rate": 2.9102264203867948e-05, + "loss": 0.7759, + "step": 5530 + }, + { + "epoch": 0.9595766828591256, + "grad_norm": 0.9708044528961182, + "learning_rate": 2.9096196923265984e-05, + "loss": 0.691, + "step": 5531 + }, + { + "epoch": 0.9597501734906315, + "grad_norm": 3.4632887840270996, + "learning_rate": 2.9090128587039763e-05, + "loss": 0.7573, + "step": 5532 + }, + { + "epoch": 0.9599236641221374, + "grad_norm": 1.2126164436340332, + "learning_rate": 2.9084059195893536e-05, + "loss": 0.6727, + "step": 5533 + }, + { + "epoch": 0.9600971547536433, + "grad_norm": 1.296203851699829, + "learning_rate": 2.9077988750531645e-05, + "loss": 0.6214, + "step": 5534 + }, + { + "epoch": 0.9602706453851492, + "grad_norm": 0.9332613945007324, + "learning_rate": 2.9071917251658583e-05, + "loss": 0.8224, + "step": 5535 + }, + { + "epoch": 0.9604441360166551, + "grad_norm": 0.832705557346344, + "learning_rate": 2.9065844699978957e-05, + "loss": 0.7285, + "step": 5536 + }, + { + "epoch": 0.960617626648161, + "grad_norm": 2.302546977996826, + "learning_rate": 2.9059771096197488e-05, + "loss": 0.6498, + "step": 5537 + }, + { + "epoch": 0.9607911172796669, + "grad_norm": 0.9694088697433472, + "learning_rate": 2.9053696441019022e-05, + "loss": 0.6744, + "step": 5538 + }, + { + "epoch": 0.9609646079111728, + "grad_norm": 1.2494804859161377, + "learning_rate": 2.9047620735148542e-05, + "loss": 0.6759, + "step": 5539 + }, + { + "epoch": 0.9611380985426787, + "grad_norm": 0.7921517491340637, + "learning_rate": 2.9041543979291125e-05, + "loss": 0.7125, + "step": 5540 + }, + { + "epoch": 0.9613115891741846, + "grad_norm": 0.644878089427948, + "learning_rate": 2.9035466174152004e-05, + "loss": 0.8142, + "step": 5541 + }, + { + "epoch": 0.9614850798056905, + "grad_norm": 0.7948095202445984, + "learning_rate": 2.90293873204365e-05, + "loss": 0.7614, + "step": 5542 + }, + { + "epoch": 0.9616585704371964, + "grad_norm": 0.8653268814086914, + "learning_rate": 2.9023307418850074e-05, + "loss": 0.8213, + "step": 5543 + }, + { + "epoch": 0.9618320610687023, + "grad_norm": 1.272780418395996, + "learning_rate": 2.9017226470098307e-05, + "loss": 0.8521, + "step": 5544 + }, + { + "epoch": 0.9620055517002082, + "grad_norm": 0.9396153688430786, + "learning_rate": 2.9011144474886905e-05, + "loss": 0.7502, + "step": 5545 + }, + { + "epoch": 0.9621790423317141, + "grad_norm": 0.9030144214630127, + "learning_rate": 2.9005061433921685e-05, + "loss": 0.7528, + "step": 5546 + }, + { + "epoch": 0.96235253296322, + "grad_norm": 1.6547647714614868, + "learning_rate": 2.8998977347908593e-05, + "loss": 0.8524, + "step": 5547 + }, + { + "epoch": 0.9625260235947258, + "grad_norm": 1.0420563220977783, + "learning_rate": 2.8992892217553693e-05, + "loss": 0.7576, + "step": 5548 + }, + { + "epoch": 0.9626995142262318, + "grad_norm": 0.9057609438896179, + "learning_rate": 2.8986806043563174e-05, + "loss": 0.771, + "step": 5549 + }, + { + "epoch": 0.9628730048577376, + "grad_norm": 0.8019469380378723, + "learning_rate": 2.898071882664334e-05, + "loss": 0.719, + "step": 5550 + }, + { + "epoch": 0.9630464954892436, + "grad_norm": 0.8692736029624939, + "learning_rate": 2.8974630567500627e-05, + "loss": 0.7275, + "step": 5551 + }, + { + "epoch": 0.9632199861207494, + "grad_norm": 0.7932985424995422, + "learning_rate": 2.896854126684157e-05, + "loss": 0.7859, + "step": 5552 + }, + { + "epoch": 0.9633934767522554, + "grad_norm": 0.9315156936645508, + "learning_rate": 2.8962450925372855e-05, + "loss": 0.7014, + "step": 5553 + }, + { + "epoch": 0.9635669673837612, + "grad_norm": 0.8505563735961914, + "learning_rate": 2.895635954380127e-05, + "loss": 0.7959, + "step": 5554 + }, + { + "epoch": 0.9637404580152672, + "grad_norm": 0.8692546486854553, + "learning_rate": 2.8950267122833712e-05, + "loss": 0.7192, + "step": 5555 + }, + { + "epoch": 0.963913948646773, + "grad_norm": 0.9367578029632568, + "learning_rate": 2.894417366317724e-05, + "loss": 0.7074, + "step": 5556 + }, + { + "epoch": 0.964087439278279, + "grad_norm": 0.9274025559425354, + "learning_rate": 2.893807916553898e-05, + "loss": 0.7125, + "step": 5557 + }, + { + "epoch": 0.9642609299097848, + "grad_norm": 0.677331805229187, + "learning_rate": 2.8931983630626218e-05, + "loss": 0.8608, + "step": 5558 + }, + { + "epoch": 0.9644344205412908, + "grad_norm": 1.6443876028060913, + "learning_rate": 2.8925887059146357e-05, + "loss": 0.8313, + "step": 5559 + }, + { + "epoch": 0.9646079111727967, + "grad_norm": 0.8314826488494873, + "learning_rate": 2.8919789451806893e-05, + "loss": 0.7708, + "step": 5560 + }, + { + "epoch": 0.9647814018043026, + "grad_norm": 0.9173914790153503, + "learning_rate": 2.891369080931548e-05, + "loss": 0.8052, + "step": 5561 + }, + { + "epoch": 0.9649548924358085, + "grad_norm": 0.9360355734825134, + "learning_rate": 2.890759113237985e-05, + "loss": 0.7007, + "step": 5562 + }, + { + "epoch": 0.9651283830673144, + "grad_norm": 0.6958872675895691, + "learning_rate": 2.89014904217079e-05, + "loss": 0.6464, + "step": 5563 + }, + { + "epoch": 0.9653018736988203, + "grad_norm": 0.9266376495361328, + "learning_rate": 2.8895388678007602e-05, + "loss": 0.639, + "step": 5564 + }, + { + "epoch": 0.9654753643303262, + "grad_norm": 4.140871524810791, + "learning_rate": 2.8889285901987097e-05, + "loss": 0.6438, + "step": 5565 + }, + { + "epoch": 0.9656488549618321, + "grad_norm": 0.6531555652618408, + "learning_rate": 2.8883182094354594e-05, + "loss": 0.6743, + "step": 5566 + }, + { + "epoch": 0.9658223455933379, + "grad_norm": 1.0709491968154907, + "learning_rate": 2.8877077255818464e-05, + "loss": 0.7908, + "step": 5567 + }, + { + "epoch": 0.9659958362248439, + "grad_norm": 0.8646407723426819, + "learning_rate": 2.887097138708717e-05, + "loss": 0.6759, + "step": 5568 + }, + { + "epoch": 0.9661693268563497, + "grad_norm": 0.9538975954055786, + "learning_rate": 2.8864864488869314e-05, + "loss": 0.6377, + "step": 5569 + }, + { + "epoch": 0.9663428174878557, + "grad_norm": 0.916787326335907, + "learning_rate": 2.8858756561873605e-05, + "loss": 0.6885, + "step": 5570 + }, + { + "epoch": 0.9665163081193615, + "grad_norm": 0.8382698893547058, + "learning_rate": 2.885264760680887e-05, + "loss": 0.8098, + "step": 5571 + }, + { + "epoch": 0.9666897987508675, + "grad_norm": 0.7141633629798889, + "learning_rate": 2.884653762438407e-05, + "loss": 0.7245, + "step": 5572 + }, + { + "epoch": 0.9668632893823733, + "grad_norm": 1.1267939805984497, + "learning_rate": 2.8840426615308267e-05, + "loss": 0.8376, + "step": 5573 + }, + { + "epoch": 0.9670367800138793, + "grad_norm": 0.7455251812934875, + "learning_rate": 2.8834314580290655e-05, + "loss": 0.7291, + "step": 5574 + }, + { + "epoch": 0.9672102706453851, + "grad_norm": 0.9580469131469727, + "learning_rate": 2.8828201520040546e-05, + "loss": 0.7576, + "step": 5575 + }, + { + "epoch": 0.9673837612768911, + "grad_norm": 1.0662447214126587, + "learning_rate": 2.882208743526736e-05, + "loss": 0.7854, + "step": 5576 + }, + { + "epoch": 0.9675572519083969, + "grad_norm": 0.8430594801902771, + "learning_rate": 2.8815972326680648e-05, + "loss": 0.6189, + "step": 5577 + }, + { + "epoch": 0.9677307425399029, + "grad_norm": 0.7679371237754822, + "learning_rate": 2.8809856194990067e-05, + "loss": 0.8304, + "step": 5578 + }, + { + "epoch": 0.9679042331714087, + "grad_norm": 0.7795034050941467, + "learning_rate": 2.880373904090542e-05, + "loss": 0.6215, + "step": 5579 + }, + { + "epoch": 0.9680777238029147, + "grad_norm": 0.9354127645492554, + "learning_rate": 2.8797620865136594e-05, + "loss": 0.6418, + "step": 5580 + }, + { + "epoch": 0.9682512144344205, + "grad_norm": 0.8040018677711487, + "learning_rate": 2.8791501668393614e-05, + "loss": 0.7352, + "step": 5581 + }, + { + "epoch": 0.9684247050659265, + "grad_norm": 0.8099852204322815, + "learning_rate": 2.8785381451386628e-05, + "loss": 0.8245, + "step": 5582 + }, + { + "epoch": 0.9685981956974323, + "grad_norm": 0.6776793003082275, + "learning_rate": 2.8779260214825883e-05, + "loss": 0.755, + "step": 5583 + }, + { + "epoch": 0.9687716863289383, + "grad_norm": 0.9343713521957397, + "learning_rate": 2.877313795942176e-05, + "loss": 0.6531, + "step": 5584 + }, + { + "epoch": 0.9689451769604441, + "grad_norm": 1.070207118988037, + "learning_rate": 2.8767014685884755e-05, + "loss": 0.897, + "step": 5585 + }, + { + "epoch": 0.9691186675919501, + "grad_norm": 0.8318377137184143, + "learning_rate": 2.8760890394925477e-05, + "loss": 0.7107, + "step": 5586 + }, + { + "epoch": 0.9692921582234559, + "grad_norm": 1.1992378234863281, + "learning_rate": 2.8754765087254665e-05, + "loss": 0.6072, + "step": 5587 + }, + { + "epoch": 0.9694656488549618, + "grad_norm": 1.0993587970733643, + "learning_rate": 2.8748638763583158e-05, + "loss": 0.8303, + "step": 5588 + }, + { + "epoch": 0.9696391394864677, + "grad_norm": 0.916732907295227, + "learning_rate": 2.874251142462193e-05, + "loss": 0.7358, + "step": 5589 + }, + { + "epoch": 0.9698126301179736, + "grad_norm": 0.8407129049301147, + "learning_rate": 2.8736383071082065e-05, + "loss": 0.6375, + "step": 5590 + }, + { + "epoch": 0.9699861207494795, + "grad_norm": 1.1360074281692505, + "learning_rate": 2.8730253703674765e-05, + "loss": 0.7517, + "step": 5591 + }, + { + "epoch": 0.9701596113809854, + "grad_norm": 0.6836175322532654, + "learning_rate": 2.872412332311135e-05, + "loss": 0.8889, + "step": 5592 + }, + { + "epoch": 0.9703331020124913, + "grad_norm": 0.856776773929596, + "learning_rate": 2.8717991930103258e-05, + "loss": 0.7112, + "step": 5593 + }, + { + "epoch": 0.9705065926439972, + "grad_norm": 0.925445556640625, + "learning_rate": 2.8711859525362045e-05, + "loss": 0.6063, + "step": 5594 + }, + { + "epoch": 0.9706800832755031, + "grad_norm": 1.2320623397827148, + "learning_rate": 2.8705726109599382e-05, + "loss": 0.6211, + "step": 5595 + }, + { + "epoch": 0.970853573907009, + "grad_norm": 0.7088514566421509, + "learning_rate": 2.8699591683527058e-05, + "loss": 0.7589, + "step": 5596 + }, + { + "epoch": 0.971027064538515, + "grad_norm": 0.9134536385536194, + "learning_rate": 2.869345624785698e-05, + "loss": 0.6401, + "step": 5597 + }, + { + "epoch": 0.9712005551700208, + "grad_norm": 0.90672367811203, + "learning_rate": 2.8687319803301183e-05, + "loss": 0.856, + "step": 5598 + }, + { + "epoch": 0.9713740458015268, + "grad_norm": 0.9839548468589783, + "learning_rate": 2.8681182350571795e-05, + "loss": 0.7509, + "step": 5599 + }, + { + "epoch": 0.9715475364330326, + "grad_norm": 0.6866105198860168, + "learning_rate": 2.867504389038108e-05, + "loss": 0.7402, + "step": 5600 + }, + { + "epoch": 0.9717210270645386, + "grad_norm": 1.1568611860275269, + "learning_rate": 2.8668904423441413e-05, + "loss": 0.7363, + "step": 5601 + }, + { + "epoch": 0.9718945176960444, + "grad_norm": 0.9181439876556396, + "learning_rate": 2.8662763950465284e-05, + "loss": 0.6332, + "step": 5602 + }, + { + "epoch": 0.9720680083275504, + "grad_norm": 0.9394729733467102, + "learning_rate": 2.865662247216531e-05, + "loss": 0.6244, + "step": 5603 + }, + { + "epoch": 0.9722414989590562, + "grad_norm": 1.0880484580993652, + "learning_rate": 2.8650479989254206e-05, + "loss": 0.6648, + "step": 5604 + }, + { + "epoch": 0.9724149895905622, + "grad_norm": 0.9529463052749634, + "learning_rate": 2.864433650244482e-05, + "loss": 0.631, + "step": 5605 + }, + { + "epoch": 0.972588480222068, + "grad_norm": 1.0534076690673828, + "learning_rate": 2.8638192012450115e-05, + "loss": 0.7917, + "step": 5606 + }, + { + "epoch": 0.9727619708535739, + "grad_norm": 1.0361669063568115, + "learning_rate": 2.8632046519983157e-05, + "loss": 0.7554, + "step": 5607 + }, + { + "epoch": 0.9729354614850798, + "grad_norm": 0.6760294437408447, + "learning_rate": 2.862590002575714e-05, + "loss": 0.7651, + "step": 5608 + }, + { + "epoch": 0.9731089521165857, + "grad_norm": 0.9197137951850891, + "learning_rate": 2.861975253048538e-05, + "loss": 0.6019, + "step": 5609 + }, + { + "epoch": 0.9732824427480916, + "grad_norm": 0.9305890202522278, + "learning_rate": 2.861360403488129e-05, + "loss": 0.7928, + "step": 5610 + }, + { + "epoch": 0.9734559333795975, + "grad_norm": 1.5684350728988647, + "learning_rate": 2.8607454539658417e-05, + "loss": 0.5896, + "step": 5611 + }, + { + "epoch": 0.9736294240111034, + "grad_norm": 1.4757722616195679, + "learning_rate": 2.8601304045530414e-05, + "loss": 0.7346, + "step": 5612 + }, + { + "epoch": 0.9738029146426093, + "grad_norm": 1.1783311367034912, + "learning_rate": 2.859515255321105e-05, + "loss": 0.7281, + "step": 5613 + }, + { + "epoch": 0.9739764052741152, + "grad_norm": 0.9304174184799194, + "learning_rate": 2.858900006341422e-05, + "loss": 0.6044, + "step": 5614 + }, + { + "epoch": 0.9741498959056211, + "grad_norm": 0.8764861822128296, + "learning_rate": 2.858284657685393e-05, + "loss": 0.6476, + "step": 5615 + }, + { + "epoch": 0.974323386537127, + "grad_norm": 0.7725515961647034, + "learning_rate": 2.8576692094244286e-05, + "loss": 0.7617, + "step": 5616 + }, + { + "epoch": 0.9744968771686329, + "grad_norm": 0.9032357335090637, + "learning_rate": 2.8570536616299538e-05, + "loss": 0.6906, + "step": 5617 + }, + { + "epoch": 0.9746703678001388, + "grad_norm": 0.952859103679657, + "learning_rate": 2.856438014373402e-05, + "loss": 0.5934, + "step": 5618 + }, + { + "epoch": 0.9748438584316447, + "grad_norm": 0.8009156584739685, + "learning_rate": 2.855822267726222e-05, + "loss": 0.6873, + "step": 5619 + }, + { + "epoch": 0.9750173490631506, + "grad_norm": 0.8137307167053223, + "learning_rate": 2.8552064217598694e-05, + "loss": 0.6836, + "step": 5620 + }, + { + "epoch": 0.9751908396946565, + "grad_norm": 0.9093945026397705, + "learning_rate": 2.8545904765458153e-05, + "loss": 0.7302, + "step": 5621 + }, + { + "epoch": 0.9753643303261624, + "grad_norm": 1.0348138809204102, + "learning_rate": 2.853974432155541e-05, + "loss": 0.7092, + "step": 5622 + }, + { + "epoch": 0.9755378209576683, + "grad_norm": 0.9188106656074524, + "learning_rate": 2.8533582886605383e-05, + "loss": 0.8469, + "step": 5623 + }, + { + "epoch": 0.9757113115891742, + "grad_norm": 1.1312189102172852, + "learning_rate": 2.852742046132312e-05, + "loss": 0.6985, + "step": 5624 + }, + { + "epoch": 0.9758848022206801, + "grad_norm": 0.8131586909294128, + "learning_rate": 2.8521257046423782e-05, + "loss": 0.7943, + "step": 5625 + }, + { + "epoch": 0.9760582928521859, + "grad_norm": 0.8568660020828247, + "learning_rate": 2.8515092642622625e-05, + "loss": 0.77, + "step": 5626 + }, + { + "epoch": 0.9762317834836919, + "grad_norm": 0.8347184062004089, + "learning_rate": 2.850892725063505e-05, + "loss": 0.6863, + "step": 5627 + }, + { + "epoch": 0.9764052741151977, + "grad_norm": 0.7859092354774475, + "learning_rate": 2.8502760871176546e-05, + "loss": 0.8135, + "step": 5628 + }, + { + "epoch": 0.9765787647467037, + "grad_norm": 1.0778720378875732, + "learning_rate": 2.849659350496275e-05, + "loss": 0.6582, + "step": 5629 + }, + { + "epoch": 0.9767522553782095, + "grad_norm": 1.0106498003005981, + "learning_rate": 2.8490425152709367e-05, + "loss": 0.8772, + "step": 5630 + }, + { + "epoch": 0.9769257460097155, + "grad_norm": 0.851842999458313, + "learning_rate": 2.848425581513225e-05, + "loss": 0.7617, + "step": 5631 + }, + { + "epoch": 0.9770992366412213, + "grad_norm": 0.7654263973236084, + "learning_rate": 2.847808549294736e-05, + "loss": 0.7753, + "step": 5632 + }, + { + "epoch": 0.9772727272727273, + "grad_norm": 1.0892833471298218, + "learning_rate": 2.847191418687078e-05, + "loss": 0.8655, + "step": 5633 + }, + { + "epoch": 0.9774462179042331, + "grad_norm": 0.8580739498138428, + "learning_rate": 2.8465741897618673e-05, + "loss": 0.8472, + "step": 5634 + }, + { + "epoch": 0.9776197085357391, + "grad_norm": 0.6827507019042969, + "learning_rate": 2.845956862590736e-05, + "loss": 0.7543, + "step": 5635 + }, + { + "epoch": 0.9777931991672449, + "grad_norm": 1.131205439567566, + "learning_rate": 2.8453394372453253e-05, + "loss": 0.7518, + "step": 5636 + }, + { + "epoch": 0.9779666897987509, + "grad_norm": 1.5969923734664917, + "learning_rate": 2.844721913797287e-05, + "loss": 0.8455, + "step": 5637 + }, + { + "epoch": 0.9781401804302567, + "grad_norm": 0.9288991093635559, + "learning_rate": 2.8441042923182872e-05, + "loss": 0.665, + "step": 5638 + }, + { + "epoch": 0.9783136710617627, + "grad_norm": 2.3287148475646973, + "learning_rate": 2.84348657288e-05, + "loss": 0.7172, + "step": 5639 + }, + { + "epoch": 0.9784871616932685, + "grad_norm": 0.9257984757423401, + "learning_rate": 2.842868755554114e-05, + "loss": 0.6282, + "step": 5640 + }, + { + "epoch": 0.9786606523247745, + "grad_norm": 0.9199004173278809, + "learning_rate": 2.8422508404123264e-05, + "loss": 0.5878, + "step": 5641 + }, + { + "epoch": 0.9788341429562804, + "grad_norm": 1.7947003841400146, + "learning_rate": 2.8416328275263472e-05, + "loss": 0.6904, + "step": 5642 + }, + { + "epoch": 0.9790076335877863, + "grad_norm": 0.8478562831878662, + "learning_rate": 2.841014716967898e-05, + "loss": 0.8064, + "step": 5643 + }, + { + "epoch": 0.9791811242192922, + "grad_norm": 0.8355135917663574, + "learning_rate": 2.8403965088087105e-05, + "loss": 0.8337, + "step": 5644 + }, + { + "epoch": 0.9793546148507981, + "grad_norm": 0.8470054864883423, + "learning_rate": 2.8397782031205295e-05, + "loss": 0.6179, + "step": 5645 + }, + { + "epoch": 0.979528105482304, + "grad_norm": 1.1282260417938232, + "learning_rate": 2.839159799975109e-05, + "loss": 0.8064, + "step": 5646 + }, + { + "epoch": 0.9797015961138098, + "grad_norm": 0.7946552634239197, + "learning_rate": 2.838541299444216e-05, + "loss": 0.7488, + "step": 5647 + }, + { + "epoch": 0.9798750867453158, + "grad_norm": 0.9537225961685181, + "learning_rate": 2.8379227015996283e-05, + "loss": 0.6974, + "step": 5648 + }, + { + "epoch": 0.9800485773768216, + "grad_norm": 1.0301340818405151, + "learning_rate": 2.837304006513135e-05, + "loss": 0.8259, + "step": 5649 + }, + { + "epoch": 0.9802220680083276, + "grad_norm": 2.1033637523651123, + "learning_rate": 2.8366852142565352e-05, + "loss": 0.7373, + "step": 5650 + }, + { + "epoch": 0.9803955586398334, + "grad_norm": 1.0000661611557007, + "learning_rate": 2.8360663249016417e-05, + "loss": 0.6672, + "step": 5651 + }, + { + "epoch": 0.9805690492713394, + "grad_norm": 0.8921299576759338, + "learning_rate": 2.8354473385202772e-05, + "loss": 0.7446, + "step": 5652 + }, + { + "epoch": 0.9807425399028452, + "grad_norm": 0.8094157576560974, + "learning_rate": 2.8348282551842756e-05, + "loss": 0.8875, + "step": 5653 + }, + { + "epoch": 0.9809160305343512, + "grad_norm": 0.8721374869346619, + "learning_rate": 2.834209074965482e-05, + "loss": 0.6914, + "step": 5654 + }, + { + "epoch": 0.981089521165857, + "grad_norm": 0.8218604922294617, + "learning_rate": 2.833589797935753e-05, + "loss": 0.5726, + "step": 5655 + }, + { + "epoch": 0.981263011797363, + "grad_norm": 0.7885540723800659, + "learning_rate": 2.8329704241669574e-05, + "loss": 0.7791, + "step": 5656 + }, + { + "epoch": 0.9814365024288688, + "grad_norm": 0.8054550290107727, + "learning_rate": 2.8323509537309725e-05, + "loss": 0.7642, + "step": 5657 + }, + { + "epoch": 0.9816099930603748, + "grad_norm": 1.4975783824920654, + "learning_rate": 2.8317313866996897e-05, + "loss": 0.5959, + "step": 5658 + }, + { + "epoch": 0.9817834836918806, + "grad_norm": 0.8719802498817444, + "learning_rate": 2.831111723145011e-05, + "loss": 0.7151, + "step": 5659 + }, + { + "epoch": 0.9819569743233866, + "grad_norm": 0.8405832052230835, + "learning_rate": 2.830491963138848e-05, + "loss": 0.6438, + "step": 5660 + }, + { + "epoch": 0.9821304649548924, + "grad_norm": 0.9352977871894836, + "learning_rate": 2.8298721067531248e-05, + "loss": 0.6864, + "step": 5661 + }, + { + "epoch": 0.9823039555863984, + "grad_norm": 0.7990924119949341, + "learning_rate": 2.8292521540597767e-05, + "loss": 0.6855, + "step": 5662 + }, + { + "epoch": 0.9824774462179042, + "grad_norm": 0.8844391107559204, + "learning_rate": 2.8286321051307494e-05, + "loss": 0.6445, + "step": 5663 + }, + { + "epoch": 0.9826509368494102, + "grad_norm": 0.786125898361206, + "learning_rate": 2.828011960038002e-05, + "loss": 0.6843, + "step": 5664 + }, + { + "epoch": 0.982824427480916, + "grad_norm": 1.0770198106765747, + "learning_rate": 2.827391718853501e-05, + "loss": 0.6255, + "step": 5665 + }, + { + "epoch": 0.9829979181124219, + "grad_norm": 1.2975877523422241, + "learning_rate": 2.826771381649227e-05, + "loss": 0.7402, + "step": 5666 + }, + { + "epoch": 0.9831714087439278, + "grad_norm": 0.857201874256134, + "learning_rate": 2.826150948497171e-05, + "loss": 0.6562, + "step": 5667 + }, + { + "epoch": 0.9833448993754337, + "grad_norm": 0.8456276655197144, + "learning_rate": 2.8255304194693343e-05, + "loss": 0.7205, + "step": 5668 + }, + { + "epoch": 0.9835183900069396, + "grad_norm": 0.7739366888999939, + "learning_rate": 2.8249097946377307e-05, + "loss": 0.6708, + "step": 5669 + }, + { + "epoch": 0.9836918806384455, + "grad_norm": 0.9517167210578918, + "learning_rate": 2.8242890740743844e-05, + "loss": 0.6842, + "step": 5670 + }, + { + "epoch": 0.9838653712699514, + "grad_norm": 0.7968878746032715, + "learning_rate": 2.8236682578513302e-05, + "loss": 0.8167, + "step": 5671 + }, + { + "epoch": 0.9840388619014573, + "grad_norm": 0.9193726181983948, + "learning_rate": 2.8230473460406154e-05, + "loss": 0.7681, + "step": 5672 + }, + { + "epoch": 0.9842123525329632, + "grad_norm": 0.9202367067337036, + "learning_rate": 2.8224263387142963e-05, + "loss": 0.7361, + "step": 5673 + }, + { + "epoch": 0.9843858431644691, + "grad_norm": 0.871128499507904, + "learning_rate": 2.8218052359444434e-05, + "loss": 0.7158, + "step": 5674 + }, + { + "epoch": 0.984559333795975, + "grad_norm": 0.8008781671524048, + "learning_rate": 2.821184037803135e-05, + "loss": 0.6699, + "step": 5675 + }, + { + "epoch": 0.9847328244274809, + "grad_norm": 1.6074049472808838, + "learning_rate": 2.8205627443624616e-05, + "loss": 0.5959, + "step": 5676 + }, + { + "epoch": 0.9849063150589868, + "grad_norm": 1.0196146965026855, + "learning_rate": 2.8199413556945256e-05, + "loss": 0.7803, + "step": 5677 + }, + { + "epoch": 0.9850798056904927, + "grad_norm": 0.9796152710914612, + "learning_rate": 2.8193198718714402e-05, + "loss": 0.6656, + "step": 5678 + }, + { + "epoch": 0.9852532963219987, + "grad_norm": 0.7988877892494202, + "learning_rate": 2.8186982929653287e-05, + "loss": 0.675, + "step": 5679 + }, + { + "epoch": 0.9854267869535045, + "grad_norm": 0.8923824429512024, + "learning_rate": 2.8180766190483263e-05, + "loss": 0.6814, + "step": 5680 + }, + { + "epoch": 0.9856002775850105, + "grad_norm": 0.8727774024009705, + "learning_rate": 2.817454850192579e-05, + "loss": 0.7139, + "step": 5681 + }, + { + "epoch": 0.9857737682165163, + "grad_norm": 0.8929604291915894, + "learning_rate": 2.8168329864702443e-05, + "loss": 0.7113, + "step": 5682 + }, + { + "epoch": 0.9859472588480223, + "grad_norm": 0.8190316557884216, + "learning_rate": 2.8162110279534893e-05, + "loss": 0.7656, + "step": 5683 + }, + { + "epoch": 0.9861207494795281, + "grad_norm": 0.9942129850387573, + "learning_rate": 2.8155889747144933e-05, + "loss": 0.7844, + "step": 5684 + }, + { + "epoch": 0.9862942401110341, + "grad_norm": 0.8592444062232971, + "learning_rate": 2.8149668268254465e-05, + "loss": 0.7827, + "step": 5685 + }, + { + "epoch": 0.9864677307425399, + "grad_norm": 0.9739055037498474, + "learning_rate": 2.8143445843585498e-05, + "loss": 0.7769, + "step": 5686 + }, + { + "epoch": 0.9866412213740458, + "grad_norm": 2.5608577728271484, + "learning_rate": 2.8137222473860154e-05, + "loss": 0.8152, + "step": 5687 + }, + { + "epoch": 0.9868147120055517, + "grad_norm": 0.7669788002967834, + "learning_rate": 2.8130998159800663e-05, + "loss": 0.8804, + "step": 5688 + }, + { + "epoch": 0.9869882026370576, + "grad_norm": 0.8556749820709229, + "learning_rate": 2.8124772902129353e-05, + "loss": 0.7969, + "step": 5689 + }, + { + "epoch": 0.9871616932685635, + "grad_norm": 1.156733751296997, + "learning_rate": 2.8118546701568687e-05, + "loss": 0.7002, + "step": 5690 + }, + { + "epoch": 0.9873351839000694, + "grad_norm": 0.8353960514068604, + "learning_rate": 2.8112319558841216e-05, + "loss": 0.7676, + "step": 5691 + }, + { + "epoch": 0.9875086745315753, + "grad_norm": 1.136831283569336, + "learning_rate": 2.81060914746696e-05, + "loss": 0.6285, + "step": 5692 + }, + { + "epoch": 0.9876821651630812, + "grad_norm": 0.895106852054596, + "learning_rate": 2.8099862449776637e-05, + "loss": 0.6816, + "step": 5693 + }, + { + "epoch": 0.9878556557945871, + "grad_norm": 1.1519594192504883, + "learning_rate": 2.8093632484885182e-05, + "loss": 0.7583, + "step": 5694 + }, + { + "epoch": 0.988029146426093, + "grad_norm": 0.827491283416748, + "learning_rate": 2.8087401580718258e-05, + "loss": 0.5928, + "step": 5695 + }, + { + "epoch": 0.9882026370575989, + "grad_norm": 0.9537809491157532, + "learning_rate": 2.8081169737998956e-05, + "loss": 0.6616, + "step": 5696 + }, + { + "epoch": 0.9883761276891048, + "grad_norm": 0.926488995552063, + "learning_rate": 2.8074936957450485e-05, + "loss": 0.6956, + "step": 5697 + }, + { + "epoch": 0.9885496183206107, + "grad_norm": 0.9495089650154114, + "learning_rate": 2.8068703239796175e-05, + "loss": 0.6517, + "step": 5698 + }, + { + "epoch": 0.9887231089521166, + "grad_norm": 0.8300470113754272, + "learning_rate": 2.806246858575945e-05, + "loss": 0.771, + "step": 5699 + }, + { + "epoch": 0.9888965995836225, + "grad_norm": 0.9037389755249023, + "learning_rate": 2.805623299606385e-05, + "loss": 0.7231, + "step": 5700 + }, + { + "epoch": 0.9890700902151284, + "grad_norm": 0.9935937523841858, + "learning_rate": 2.8049996471433022e-05, + "loss": 0.6907, + "step": 5701 + }, + { + "epoch": 0.9892435808466343, + "grad_norm": 0.7973750829696655, + "learning_rate": 2.8043759012590723e-05, + "loss": 0.7777, + "step": 5702 + }, + { + "epoch": 0.9894170714781402, + "grad_norm": 0.8939965963363647, + "learning_rate": 2.8037520620260826e-05, + "loss": 0.8037, + "step": 5703 + }, + { + "epoch": 0.9895905621096461, + "grad_norm": 2.008754014968872, + "learning_rate": 2.803128129516729e-05, + "loss": 0.6375, + "step": 5704 + }, + { + "epoch": 0.989764052741152, + "grad_norm": 0.8862380385398865, + "learning_rate": 2.8025041038034197e-05, + "loss": 0.761, + "step": 5705 + }, + { + "epoch": 0.9899375433726578, + "grad_norm": 0.9103255867958069, + "learning_rate": 2.801879984958575e-05, + "loss": 0.6329, + "step": 5706 + }, + { + "epoch": 0.9901110340041638, + "grad_norm": 0.8047431111335754, + "learning_rate": 2.8012557730546224e-05, + "loss": 0.6975, + "step": 5707 + }, + { + "epoch": 0.9902845246356696, + "grad_norm": 0.8079007267951965, + "learning_rate": 2.800631468164005e-05, + "loss": 0.6685, + "step": 5708 + }, + { + "epoch": 0.9904580152671756, + "grad_norm": 0.9173434972763062, + "learning_rate": 2.800007070359172e-05, + "loss": 0.7319, + "step": 5709 + }, + { + "epoch": 0.9906315058986814, + "grad_norm": 1.1438117027282715, + "learning_rate": 2.7993825797125866e-05, + "loss": 0.7634, + "step": 5710 + }, + { + "epoch": 0.9908049965301874, + "grad_norm": 1.0076875686645508, + "learning_rate": 2.798757996296721e-05, + "loss": 0.6904, + "step": 5711 + }, + { + "epoch": 0.9909784871616932, + "grad_norm": 0.7601367831230164, + "learning_rate": 2.7981333201840595e-05, + "loss": 0.7063, + "step": 5712 + }, + { + "epoch": 0.9911519777931992, + "grad_norm": 0.8445897102355957, + "learning_rate": 2.7975085514470958e-05, + "loss": 0.7144, + "step": 5713 + }, + { + "epoch": 0.991325468424705, + "grad_norm": 1.3627687692642212, + "learning_rate": 2.7968836901583364e-05, + "loss": 0.7466, + "step": 5714 + }, + { + "epoch": 0.991498959056211, + "grad_norm": 0.6561018228530884, + "learning_rate": 2.7962587363902952e-05, + "loss": 0.8652, + "step": 5715 + }, + { + "epoch": 0.9916724496877168, + "grad_norm": 0.7793660163879395, + "learning_rate": 2.7956336902155003e-05, + "loss": 0.8652, + "step": 5716 + }, + { + "epoch": 0.9918459403192228, + "grad_norm": 1.0341880321502686, + "learning_rate": 2.7950085517064884e-05, + "loss": 0.6392, + "step": 5717 + }, + { + "epoch": 0.9920194309507286, + "grad_norm": 0.7888169884681702, + "learning_rate": 2.7943833209358076e-05, + "loss": 0.8186, + "step": 5718 + }, + { + "epoch": 0.9921929215822346, + "grad_norm": 0.7402393221855164, + "learning_rate": 2.793757997976017e-05, + "loss": 0.7426, + "step": 5719 + }, + { + "epoch": 0.9923664122137404, + "grad_norm": 1.2194817066192627, + "learning_rate": 2.793132582899686e-05, + "loss": 0.7195, + "step": 5720 + }, + { + "epoch": 0.9925399028452464, + "grad_norm": 0.8696430921554565, + "learning_rate": 2.7925070757793943e-05, + "loss": 0.7009, + "step": 5721 + }, + { + "epoch": 0.9927133934767522, + "grad_norm": 0.9652647376060486, + "learning_rate": 2.791881476687733e-05, + "loss": 0.6575, + "step": 5722 + }, + { + "epoch": 0.9928868841082582, + "grad_norm": 0.933112382888794, + "learning_rate": 2.7912557856973035e-05, + "loss": 0.761, + "step": 5723 + }, + { + "epoch": 0.993060374739764, + "grad_norm": 1.1211885213851929, + "learning_rate": 2.790630002880718e-05, + "loss": 0.8679, + "step": 5724 + }, + { + "epoch": 0.9932338653712699, + "grad_norm": 0.8920709490776062, + "learning_rate": 2.7900041283106e-05, + "loss": 0.7104, + "step": 5725 + }, + { + "epoch": 0.9934073560027759, + "grad_norm": 0.7077953815460205, + "learning_rate": 2.7893781620595818e-05, + "loss": 0.656, + "step": 5726 + }, + { + "epoch": 0.9935808466342817, + "grad_norm": 0.7241314053535461, + "learning_rate": 2.7887521042003084e-05, + "loss": 0.7552, + "step": 5727 + }, + { + "epoch": 0.9937543372657877, + "grad_norm": 0.8450825810432434, + "learning_rate": 2.788125954805434e-05, + "loss": 0.6943, + "step": 5728 + }, + { + "epoch": 0.9939278278972935, + "grad_norm": 0.8261436223983765, + "learning_rate": 2.787499713947624e-05, + "loss": 0.7299, + "step": 5729 + }, + { + "epoch": 0.9941013185287995, + "grad_norm": 0.8096888065338135, + "learning_rate": 2.7868733816995553e-05, + "loss": 0.7793, + "step": 5730 + }, + { + "epoch": 0.9942748091603053, + "grad_norm": 0.7579706311225891, + "learning_rate": 2.786246958133913e-05, + "loss": 0.7646, + "step": 5731 + }, + { + "epoch": 0.9944482997918113, + "grad_norm": 0.7397536039352417, + "learning_rate": 2.7856204433233954e-05, + "loss": 0.6451, + "step": 5732 + }, + { + "epoch": 0.9946217904233171, + "grad_norm": 1.011802315711975, + "learning_rate": 2.7849938373407095e-05, + "loss": 0.663, + "step": 5733 + }, + { + "epoch": 0.9947952810548231, + "grad_norm": 1.0138686895370483, + "learning_rate": 2.7843671402585747e-05, + "loss": 0.7383, + "step": 5734 + }, + { + "epoch": 0.9949687716863289, + "grad_norm": 0.7887297868728638, + "learning_rate": 2.783740352149719e-05, + "loss": 0.653, + "step": 5735 + }, + { + "epoch": 0.9951422623178349, + "grad_norm": 0.8260210752487183, + "learning_rate": 2.783113473086882e-05, + "loss": 0.644, + "step": 5736 + }, + { + "epoch": 0.9953157529493407, + "grad_norm": 1.2480870485305786, + "learning_rate": 2.7824865031428144e-05, + "loss": 0.7673, + "step": 5737 + }, + { + "epoch": 0.9954892435808467, + "grad_norm": 1.2316044569015503, + "learning_rate": 2.781859442390276e-05, + "loss": 0.5765, + "step": 5738 + }, + { + "epoch": 0.9956627342123525, + "grad_norm": 0.8259606957435608, + "learning_rate": 2.7812322909020385e-05, + "loss": 0.6628, + "step": 5739 + }, + { + "epoch": 0.9958362248438585, + "grad_norm": 0.8364043831825256, + "learning_rate": 2.780605048750883e-05, + "loss": 0.7014, + "step": 5740 + }, + { + "epoch": 0.9960097154753643, + "grad_norm": 0.9259915947914124, + "learning_rate": 2.7799777160096025e-05, + "loss": 0.7109, + "step": 5741 + }, + { + "epoch": 0.9961832061068703, + "grad_norm": 1.0249725580215454, + "learning_rate": 2.7793502927509988e-05, + "loss": 0.7979, + "step": 5742 + }, + { + "epoch": 0.9963566967383761, + "grad_norm": 1.5877562761306763, + "learning_rate": 2.7787227790478856e-05, + "loss": 0.7275, + "step": 5743 + }, + { + "epoch": 0.9965301873698821, + "grad_norm": 1.11116361618042, + "learning_rate": 2.7780951749730864e-05, + "loss": 0.7532, + "step": 5744 + }, + { + "epoch": 0.9967036780013879, + "grad_norm": 0.8628994226455688, + "learning_rate": 2.7774674805994356e-05, + "loss": 0.7252, + "step": 5745 + }, + { + "epoch": 0.9968771686328938, + "grad_norm": 0.7765992879867554, + "learning_rate": 2.7768396959997783e-05, + "loss": 0.7587, + "step": 5746 + }, + { + "epoch": 0.9970506592643997, + "grad_norm": 0.7644587755203247, + "learning_rate": 2.7762118212469686e-05, + "loss": 0.6826, + "step": 5747 + }, + { + "epoch": 0.9972241498959056, + "grad_norm": 1.6112723350524902, + "learning_rate": 2.7755838564138722e-05, + "loss": 0.7786, + "step": 5748 + }, + { + "epoch": 0.9973976405274115, + "grad_norm": 0.9348565936088562, + "learning_rate": 2.774955801573366e-05, + "loss": 0.684, + "step": 5749 + }, + { + "epoch": 0.9975711311589174, + "grad_norm": 0.9413449168205261, + "learning_rate": 2.7743276567983354e-05, + "loss": 0.6619, + "step": 5750 + }, + { + "epoch": 0.9977446217904233, + "grad_norm": 1.087456226348877, + "learning_rate": 2.7736994221616788e-05, + "loss": 0.6421, + "step": 5751 + }, + { + "epoch": 0.9979181124219292, + "grad_norm": 1.0703678131103516, + "learning_rate": 2.7730710977363023e-05, + "loss": 0.5863, + "step": 5752 + }, + { + "epoch": 0.9980916030534351, + "grad_norm": 1.200529932975769, + "learning_rate": 2.7724426835951242e-05, + "loss": 0.6539, + "step": 5753 + }, + { + "epoch": 0.998265093684941, + "grad_norm": 0.7245806455612183, + "learning_rate": 2.771814179811073e-05, + "loss": 0.697, + "step": 5754 + }, + { + "epoch": 0.9984385843164469, + "grad_norm": 0.9894890189170837, + "learning_rate": 2.7711855864570858e-05, + "loss": 0.7013, + "step": 5755 + }, + { + "epoch": 0.9986120749479528, + "grad_norm": 1.0838929414749146, + "learning_rate": 2.7705569036061137e-05, + "loss": 0.6934, + "step": 5756 + }, + { + "epoch": 0.9987855655794587, + "grad_norm": 1.3047393560409546, + "learning_rate": 2.7699281313311144e-05, + "loss": 0.5587, + "step": 5757 + }, + { + "epoch": 0.9989590562109646, + "grad_norm": 0.7709349989891052, + "learning_rate": 2.7692992697050587e-05, + "loss": 0.6545, + "step": 5758 + }, + { + "epoch": 0.9991325468424705, + "grad_norm": 0.7729399800300598, + "learning_rate": 2.768670318800926e-05, + "loss": 0.8481, + "step": 5759 + }, + { + "epoch": 0.9993060374739764, + "grad_norm": 1.2289849519729614, + "learning_rate": 2.7680412786917074e-05, + "loss": 0.6708, + "step": 5760 + }, + { + "epoch": 0.9994795281054824, + "grad_norm": 1.0843605995178223, + "learning_rate": 2.7674121494504032e-05, + "loss": 0.7205, + "step": 5761 + }, + { + "epoch": 0.9996530187369882, + "grad_norm": 0.7498700618743896, + "learning_rate": 2.7667829311500255e-05, + "loss": 0.7781, + "step": 5762 + }, + { + "epoch": 0.9998265093684942, + "grad_norm": 0.6707912087440491, + "learning_rate": 2.766153623863594e-05, + "loss": 0.7449, + "step": 5763 + }, + { + "epoch": 1.0, + "grad_norm": 0.7985667586326599, + "learning_rate": 2.765524227664143e-05, + "loss": 0.6177, + "step": 5764 + }, + { + "epoch": 1.0001734906315058, + "grad_norm": 0.8117948770523071, + "learning_rate": 2.7648947426247122e-05, + "loss": 0.8757, + "step": 5765 + }, + { + "epoch": 1.0003469812630117, + "grad_norm": 0.9299020171165466, + "learning_rate": 2.7642651688183558e-05, + "loss": 0.7269, + "step": 5766 + }, + { + "epoch": 1.0005204718945178, + "grad_norm": 1.0808806419372559, + "learning_rate": 2.763635506318137e-05, + "loss": 0.5979, + "step": 5767 + }, + { + "epoch": 1.0006939625260236, + "grad_norm": 1.0950453281402588, + "learning_rate": 2.763005755197126e-05, + "loss": 0.6464, + "step": 5768 + }, + { + "epoch": 1.0008674531575295, + "grad_norm": 0.8875908851623535, + "learning_rate": 2.7623759155284093e-05, + "loss": 0.6594, + "step": 5769 + }, + { + "epoch": 1.0010409437890353, + "grad_norm": 0.854020357131958, + "learning_rate": 2.7617459873850792e-05, + "loss": 0.8157, + "step": 5770 + }, + { + "epoch": 1.0012144344205414, + "grad_norm": 0.8862354159355164, + "learning_rate": 2.7611159708402387e-05, + "loss": 0.7622, + "step": 5771 + }, + { + "epoch": 1.0013879250520472, + "grad_norm": 0.7499619126319885, + "learning_rate": 2.760485865967004e-05, + "loss": 0.8137, + "step": 5772 + }, + { + "epoch": 1.001561415683553, + "grad_norm": 1.0493364334106445, + "learning_rate": 2.759855672838498e-05, + "loss": 0.5636, + "step": 5773 + }, + { + "epoch": 1.001734906315059, + "grad_norm": 1.9358316659927368, + "learning_rate": 2.7592253915278556e-05, + "loss": 0.7593, + "step": 5774 + }, + { + "epoch": 1.001908396946565, + "grad_norm": 1.2463414669036865, + "learning_rate": 2.7585950221082223e-05, + "loss": 0.7509, + "step": 5775 + }, + { + "epoch": 1.0020818875780708, + "grad_norm": 1.0677660703659058, + "learning_rate": 2.7579645646527522e-05, + "loss": 0.5964, + "step": 5776 + }, + { + "epoch": 1.0022553782095767, + "grad_norm": 1.27033269405365, + "learning_rate": 2.7573340192346117e-05, + "loss": 0.7427, + "step": 5777 + }, + { + "epoch": 1.0024288688410825, + "grad_norm": 0.9518104791641235, + "learning_rate": 2.7567033859269754e-05, + "loss": 0.5775, + "step": 5778 + }, + { + "epoch": 1.0026023594725886, + "grad_norm": 0.9113044142723083, + "learning_rate": 2.7560726648030294e-05, + "loss": 0.6328, + "step": 5779 + }, + { + "epoch": 1.0027758501040944, + "grad_norm": 0.8365322947502136, + "learning_rate": 2.75544185593597e-05, + "loss": 0.6783, + "step": 5780 + }, + { + "epoch": 1.0029493407356003, + "grad_norm": 1.0085475444793701, + "learning_rate": 2.7548109593990022e-05, + "loss": 0.7947, + "step": 5781 + }, + { + "epoch": 1.0031228313671061, + "grad_norm": 1.758383870124817, + "learning_rate": 2.754179975265344e-05, + "loss": 0.8389, + "step": 5782 + }, + { + "epoch": 1.0032963219986122, + "grad_norm": 1.084945559501648, + "learning_rate": 2.7535489036082198e-05, + "loss": 0.6174, + "step": 5783 + }, + { + "epoch": 1.003469812630118, + "grad_norm": 0.9316495656967163, + "learning_rate": 2.752917744500868e-05, + "loss": 0.754, + "step": 5784 + }, + { + "epoch": 1.0036433032616239, + "grad_norm": 0.6856520175933838, + "learning_rate": 2.7522864980165346e-05, + "loss": 0.8635, + "step": 5785 + }, + { + "epoch": 1.0038167938931297, + "grad_norm": 1.1234437227249146, + "learning_rate": 2.7516551642284765e-05, + "loss": 0.7273, + "step": 5786 + }, + { + "epoch": 1.0039902845246356, + "grad_norm": 0.8727921843528748, + "learning_rate": 2.7510237432099605e-05, + "loss": 0.7393, + "step": 5787 + }, + { + "epoch": 1.0041637751561416, + "grad_norm": 0.7289925813674927, + "learning_rate": 2.7503922350342645e-05, + "loss": 0.7604, + "step": 5788 + }, + { + "epoch": 1.0043372657876475, + "grad_norm": 0.7926239371299744, + "learning_rate": 2.7497606397746745e-05, + "loss": 0.7637, + "step": 5789 + }, + { + "epoch": 1.0045107564191533, + "grad_norm": 0.9810934066772461, + "learning_rate": 2.7491289575044893e-05, + "loss": 0.697, + "step": 5790 + }, + { + "epoch": 1.0046842470506592, + "grad_norm": 1.0046035051345825, + "learning_rate": 2.7484971882970156e-05, + "loss": 0.699, + "step": 5791 + }, + { + "epoch": 1.0048577376821652, + "grad_norm": 0.8993969559669495, + "learning_rate": 2.7478653322255707e-05, + "loss": 0.7544, + "step": 5792 + }, + { + "epoch": 1.005031228313671, + "grad_norm": 0.7708430886268616, + "learning_rate": 2.7472333893634824e-05, + "loss": 0.6527, + "step": 5793 + }, + { + "epoch": 1.005204718945177, + "grad_norm": 0.7352195978164673, + "learning_rate": 2.746601359784089e-05, + "loss": 0.8115, + "step": 5794 + }, + { + "epoch": 1.0053782095766828, + "grad_norm": 0.8851518034934998, + "learning_rate": 2.7459692435607376e-05, + "loss": 0.8428, + "step": 5795 + }, + { + "epoch": 1.0055517002081888, + "grad_norm": 0.8018458485603333, + "learning_rate": 2.745337040766787e-05, + "loss": 0.6215, + "step": 5796 + }, + { + "epoch": 1.0057251908396947, + "grad_norm": 1.0792179107666016, + "learning_rate": 2.7447047514756032e-05, + "loss": 0.7321, + "step": 5797 + }, + { + "epoch": 1.0058986814712005, + "grad_norm": 1.3612674474716187, + "learning_rate": 2.744072375760566e-05, + "loss": 0.5918, + "step": 5798 + }, + { + "epoch": 1.0060721721027064, + "grad_norm": 1.1529147624969482, + "learning_rate": 2.7434399136950625e-05, + "loss": 0.5983, + "step": 5799 + }, + { + "epoch": 1.0062456627342125, + "grad_norm": 0.8176974058151245, + "learning_rate": 2.74280736535249e-05, + "loss": 0.6331, + "step": 5800 + }, + { + "epoch": 1.0064191533657183, + "grad_norm": 0.9548895955085754, + "learning_rate": 2.742174730806258e-05, + "loss": 0.667, + "step": 5801 + }, + { + "epoch": 1.0065926439972241, + "grad_norm": 0.96796053647995, + "learning_rate": 2.7415420101297836e-05, + "loss": 0.6887, + "step": 5802 + }, + { + "epoch": 1.00676613462873, + "grad_norm": 0.8146540522575378, + "learning_rate": 2.7409092033964943e-05, + "loss": 0.8713, + "step": 5803 + }, + { + "epoch": 1.0069396252602358, + "grad_norm": 0.7730301022529602, + "learning_rate": 2.7402763106798295e-05, + "loss": 0.7856, + "step": 5804 + }, + { + "epoch": 1.007113115891742, + "grad_norm": 0.8887173533439636, + "learning_rate": 2.7396433320532356e-05, + "loss": 0.741, + "step": 5805 + }, + { + "epoch": 1.0072866065232478, + "grad_norm": 0.8578243255615234, + "learning_rate": 2.739010267590171e-05, + "loss": 0.7556, + "step": 5806 + }, + { + "epoch": 1.0074600971547536, + "grad_norm": 1.8549203872680664, + "learning_rate": 2.7383771173641037e-05, + "loss": 0.6292, + "step": 5807 + }, + { + "epoch": 1.0076335877862594, + "grad_norm": 0.996551513671875, + "learning_rate": 2.7377438814485117e-05, + "loss": 0.6233, + "step": 5808 + }, + { + "epoch": 1.0078070784177655, + "grad_norm": 1.3798279762268066, + "learning_rate": 2.7371105599168833e-05, + "loss": 0.7671, + "step": 5809 + }, + { + "epoch": 1.0079805690492714, + "grad_norm": 1.0427993535995483, + "learning_rate": 2.7364771528427145e-05, + "loss": 0.7427, + "step": 5810 + }, + { + "epoch": 1.0081540596807772, + "grad_norm": 0.892963707447052, + "learning_rate": 2.735843660299515e-05, + "loss": 0.7268, + "step": 5811 + }, + { + "epoch": 1.008327550312283, + "grad_norm": 0.8680844902992249, + "learning_rate": 2.7352100823608006e-05, + "loss": 0.6954, + "step": 5812 + }, + { + "epoch": 1.0085010409437891, + "grad_norm": 0.9097238779067993, + "learning_rate": 2.7345764191000993e-05, + "loss": 0.5996, + "step": 5813 + }, + { + "epoch": 1.008674531575295, + "grad_norm": 0.9284329414367676, + "learning_rate": 2.733942670590949e-05, + "loss": 0.853, + "step": 5814 + }, + { + "epoch": 1.0088480222068008, + "grad_norm": 0.9330607056617737, + "learning_rate": 2.7333088369068967e-05, + "loss": 0.7471, + "step": 5815 + }, + { + "epoch": 1.0090215128383067, + "grad_norm": 1.1408175230026245, + "learning_rate": 2.7326749181214992e-05, + "loss": 0.5735, + "step": 5816 + }, + { + "epoch": 1.0091950034698127, + "grad_norm": 0.8013665080070496, + "learning_rate": 2.732040914308324e-05, + "loss": 0.6741, + "step": 5817 + }, + { + "epoch": 1.0093684941013186, + "grad_norm": 0.9091314077377319, + "learning_rate": 2.7314068255409466e-05, + "loss": 0.7944, + "step": 5818 + }, + { + "epoch": 1.0095419847328244, + "grad_norm": 1.0760241746902466, + "learning_rate": 2.7307726518929562e-05, + "loss": 0.5984, + "step": 5819 + }, + { + "epoch": 1.0097154753643303, + "grad_norm": 0.9312578439712524, + "learning_rate": 2.7301383934379475e-05, + "loss": 0.6215, + "step": 5820 + }, + { + "epoch": 1.0098889659958363, + "grad_norm": 0.7693641781806946, + "learning_rate": 2.7295040502495274e-05, + "loss": 0.7621, + "step": 5821 + }, + { + "epoch": 1.0100624566273422, + "grad_norm": 0.9591032266616821, + "learning_rate": 2.7288696224013124e-05, + "loss": 0.6617, + "step": 5822 + }, + { + "epoch": 1.010235947258848, + "grad_norm": 0.8362924456596375, + "learning_rate": 2.728235109966928e-05, + "loss": 0.6763, + "step": 5823 + }, + { + "epoch": 1.0104094378903539, + "grad_norm": 1.5078800916671753, + "learning_rate": 2.727600513020011e-05, + "loss": 0.6055, + "step": 5824 + }, + { + "epoch": 1.0105829285218597, + "grad_norm": 0.9187929630279541, + "learning_rate": 2.7269658316342065e-05, + "loss": 0.6766, + "step": 5825 + }, + { + "epoch": 1.0107564191533658, + "grad_norm": 0.7115576863288879, + "learning_rate": 2.7263310658831697e-05, + "loss": 0.8389, + "step": 5826 + }, + { + "epoch": 1.0109299097848716, + "grad_norm": 0.7753402590751648, + "learning_rate": 2.725696215840567e-05, + "loss": 0.7186, + "step": 5827 + }, + { + "epoch": 1.0111034004163775, + "grad_norm": 0.9686505198478699, + "learning_rate": 2.725061281580073e-05, + "loss": 0.7338, + "step": 5828 + }, + { + "epoch": 1.0112768910478833, + "grad_norm": 0.8830453753471375, + "learning_rate": 2.724426263175372e-05, + "loss": 0.7173, + "step": 5829 + }, + { + "epoch": 1.0114503816793894, + "grad_norm": 1.0601557493209839, + "learning_rate": 2.7237911607001586e-05, + "loss": 0.6293, + "step": 5830 + }, + { + "epoch": 1.0116238723108952, + "grad_norm": 0.8798278570175171, + "learning_rate": 2.7231559742281382e-05, + "loss": 0.8413, + "step": 5831 + }, + { + "epoch": 1.011797362942401, + "grad_norm": 0.8522995114326477, + "learning_rate": 2.722520703833024e-05, + "loss": 0.8491, + "step": 5832 + }, + { + "epoch": 1.011970853573907, + "grad_norm": 0.8679379224777222, + "learning_rate": 2.7218853495885406e-05, + "loss": 0.7119, + "step": 5833 + }, + { + "epoch": 1.012144344205413, + "grad_norm": 1.3872684240341187, + "learning_rate": 2.7212499115684204e-05, + "loss": 0.8706, + "step": 5834 + }, + { + "epoch": 1.0123178348369188, + "grad_norm": 0.8215222358703613, + "learning_rate": 2.7206143898464084e-05, + "loss": 0.9214, + "step": 5835 + }, + { + "epoch": 1.0124913254684247, + "grad_norm": 0.9816405177116394, + "learning_rate": 2.719978784496257e-05, + "loss": 0.5673, + "step": 5836 + }, + { + "epoch": 1.0126648160999305, + "grad_norm": 0.874737024307251, + "learning_rate": 2.719343095591728e-05, + "loss": 0.7397, + "step": 5837 + }, + { + "epoch": 1.0128383067314366, + "grad_norm": 0.7579262852668762, + "learning_rate": 2.718707323206595e-05, + "loss": 0.6573, + "step": 5838 + }, + { + "epoch": 1.0130117973629424, + "grad_norm": 1.880773901939392, + "learning_rate": 2.7180714674146388e-05, + "loss": 0.6287, + "step": 5839 + }, + { + "epoch": 1.0131852879944483, + "grad_norm": 0.9030709862709045, + "learning_rate": 2.717435528289653e-05, + "loss": 0.7954, + "step": 5840 + }, + { + "epoch": 1.0133587786259541, + "grad_norm": 0.7966489195823669, + "learning_rate": 2.7167995059054386e-05, + "loss": 0.822, + "step": 5841 + }, + { + "epoch": 1.0135322692574602, + "grad_norm": 0.9916248917579651, + "learning_rate": 2.7161634003358056e-05, + "loss": 0.5605, + "step": 5842 + }, + { + "epoch": 1.013705759888966, + "grad_norm": 0.867904543876648, + "learning_rate": 2.715527211654575e-05, + "loss": 0.7358, + "step": 5843 + }, + { + "epoch": 1.013879250520472, + "grad_norm": 0.8174094557762146, + "learning_rate": 2.7148909399355785e-05, + "loss": 0.6344, + "step": 5844 + }, + { + "epoch": 1.0140527411519777, + "grad_norm": 0.6690670251846313, + "learning_rate": 2.7142545852526555e-05, + "loss": 0.8655, + "step": 5845 + }, + { + "epoch": 1.0142262317834836, + "grad_norm": 1.0890955924987793, + "learning_rate": 2.713618147679655e-05, + "loss": 0.5399, + "step": 5846 + }, + { + "epoch": 1.0143997224149897, + "grad_norm": 0.9246536493301392, + "learning_rate": 2.7129816272904372e-05, + "loss": 0.7057, + "step": 5847 + }, + { + "epoch": 1.0145732130464955, + "grad_norm": 0.8728846907615662, + "learning_rate": 2.712345024158871e-05, + "loss": 0.7598, + "step": 5848 + }, + { + "epoch": 1.0147467036780013, + "grad_norm": 1.1579136848449707, + "learning_rate": 2.711708338358835e-05, + "loss": 0.7416, + "step": 5849 + }, + { + "epoch": 1.0149201943095072, + "grad_norm": 1.3917264938354492, + "learning_rate": 2.711071569964216e-05, + "loss": 0.7168, + "step": 5850 + }, + { + "epoch": 1.0150936849410133, + "grad_norm": 0.8378201723098755, + "learning_rate": 2.7104347190489134e-05, + "loss": 0.7617, + "step": 5851 + }, + { + "epoch": 1.015267175572519, + "grad_norm": 0.897121250629425, + "learning_rate": 2.7097977856868336e-05, + "loss": 0.6387, + "step": 5852 + }, + { + "epoch": 1.015440666204025, + "grad_norm": 1.658364176750183, + "learning_rate": 2.7091607699518936e-05, + "loss": 0.559, + "step": 5853 + }, + { + "epoch": 1.0156141568355308, + "grad_norm": 7.1017680168151855, + "learning_rate": 2.70852367191802e-05, + "loss": 0.7214, + "step": 5854 + }, + { + "epoch": 1.0157876474670369, + "grad_norm": 0.6552796363830566, + "learning_rate": 2.707886491659149e-05, + "loss": 0.6821, + "step": 5855 + }, + { + "epoch": 1.0159611380985427, + "grad_norm": 0.9112135171890259, + "learning_rate": 2.707249229249225e-05, + "loss": 0.5874, + "step": 5856 + }, + { + "epoch": 1.0161346287300486, + "grad_norm": 0.8820544481277466, + "learning_rate": 2.7066118847622053e-05, + "loss": 0.746, + "step": 5857 + }, + { + "epoch": 1.0163081193615544, + "grad_norm": 0.723624587059021, + "learning_rate": 2.7059744582720515e-05, + "loss": 0.6396, + "step": 5858 + }, + { + "epoch": 1.0164816099930605, + "grad_norm": 0.8817532658576965, + "learning_rate": 2.7053369498527404e-05, + "loss": 0.6543, + "step": 5859 + }, + { + "epoch": 1.0166551006245663, + "grad_norm": 0.6829749941825867, + "learning_rate": 2.7046993595782532e-05, + "loss": 0.6725, + "step": 5860 + }, + { + "epoch": 1.0168285912560722, + "grad_norm": 0.9110373258590698, + "learning_rate": 2.704061687522585e-05, + "loss": 0.8801, + "step": 5861 + }, + { + "epoch": 1.017002081887578, + "grad_norm": 0.9026188254356384, + "learning_rate": 2.7034239337597378e-05, + "loss": 0.6169, + "step": 5862 + }, + { + "epoch": 1.017175572519084, + "grad_norm": 0.9920259118080139, + "learning_rate": 2.7027860983637223e-05, + "loss": 0.821, + "step": 5863 + }, + { + "epoch": 1.01734906315059, + "grad_norm": 0.8366192579269409, + "learning_rate": 2.7021481814085622e-05, + "loss": 0.6283, + "step": 5864 + }, + { + "epoch": 1.0175225537820958, + "grad_norm": 1.2905395030975342, + "learning_rate": 2.7015101829682867e-05, + "loss": 0.7053, + "step": 5865 + }, + { + "epoch": 1.0176960444136016, + "grad_norm": 0.8769221305847168, + "learning_rate": 2.7008721031169378e-05, + "loss": 0.683, + "step": 5866 + }, + { + "epoch": 1.0178695350451075, + "grad_norm": 0.9730294942855835, + "learning_rate": 2.7002339419285646e-05, + "loss": 0.6907, + "step": 5867 + }, + { + "epoch": 1.0180430256766135, + "grad_norm": 1.1432398557662964, + "learning_rate": 2.699595699477226e-05, + "loss": 0.812, + "step": 5868 + }, + { + "epoch": 1.0182165163081194, + "grad_norm": 1.0417373180389404, + "learning_rate": 2.6989573758369915e-05, + "loss": 0.5746, + "step": 5869 + }, + { + "epoch": 1.0183900069396252, + "grad_norm": 1.2020565271377563, + "learning_rate": 2.6983189710819396e-05, + "loss": 0.6484, + "step": 5870 + }, + { + "epoch": 1.018563497571131, + "grad_norm": 1.094376564025879, + "learning_rate": 2.6976804852861564e-05, + "loss": 0.576, + "step": 5871 + }, + { + "epoch": 1.0187369882026371, + "grad_norm": 0.9119104743003845, + "learning_rate": 2.697041918523741e-05, + "loss": 0.6068, + "step": 5872 + }, + { + "epoch": 1.018910478834143, + "grad_norm": 0.6706774234771729, + "learning_rate": 2.696403270868798e-05, + "loss": 0.8108, + "step": 5873 + }, + { + "epoch": 1.0190839694656488, + "grad_norm": 1.1253536939620972, + "learning_rate": 2.6957645423954438e-05, + "loss": 0.7832, + "step": 5874 + }, + { + "epoch": 1.0192574600971547, + "grad_norm": 0.9818238615989685, + "learning_rate": 2.6951257331778045e-05, + "loss": 0.8928, + "step": 5875 + }, + { + "epoch": 1.0194309507286607, + "grad_norm": 1.332409381866455, + "learning_rate": 2.694486843290013e-05, + "loss": 0.5698, + "step": 5876 + }, + { + "epoch": 1.0196044413601666, + "grad_norm": 0.7787851691246033, + "learning_rate": 2.6938478728062148e-05, + "loss": 0.6395, + "step": 5877 + }, + { + "epoch": 1.0197779319916724, + "grad_norm": 0.891055703163147, + "learning_rate": 2.6932088218005623e-05, + "loss": 0.6328, + "step": 5878 + }, + { + "epoch": 1.0199514226231783, + "grad_norm": 0.9191906452178955, + "learning_rate": 2.692569690347218e-05, + "loss": 0.8018, + "step": 5879 + }, + { + "epoch": 1.0201249132546844, + "grad_norm": 0.7539854049682617, + "learning_rate": 2.6919304785203543e-05, + "loss": 0.6813, + "step": 5880 + }, + { + "epoch": 1.0202984038861902, + "grad_norm": 0.8395010828971863, + "learning_rate": 2.6912911863941525e-05, + "loss": 0.63, + "step": 5881 + }, + { + "epoch": 1.020471894517696, + "grad_norm": 1.1735496520996094, + "learning_rate": 2.6906518140428027e-05, + "loss": 0.7363, + "step": 5882 + }, + { + "epoch": 1.020645385149202, + "grad_norm": 1.434009075164795, + "learning_rate": 2.6900123615405052e-05, + "loss": 0.5508, + "step": 5883 + }, + { + "epoch": 1.0208188757807077, + "grad_norm": 1.011090874671936, + "learning_rate": 2.6893728289614693e-05, + "loss": 0.5283, + "step": 5884 + }, + { + "epoch": 1.0209923664122138, + "grad_norm": 0.7098178863525391, + "learning_rate": 2.6887332163799133e-05, + "loss": 0.887, + "step": 5885 + }, + { + "epoch": 1.0211658570437196, + "grad_norm": 0.787161648273468, + "learning_rate": 2.688093523870065e-05, + "loss": 0.75, + "step": 5886 + }, + { + "epoch": 1.0213393476752255, + "grad_norm": 0.8358664512634277, + "learning_rate": 2.6874537515061612e-05, + "loss": 0.7505, + "step": 5887 + }, + { + "epoch": 1.0215128383067313, + "grad_norm": 0.9729760885238647, + "learning_rate": 2.6868138993624486e-05, + "loss": 0.7622, + "step": 5888 + }, + { + "epoch": 1.0216863289382374, + "grad_norm": 0.9353317022323608, + "learning_rate": 2.6861739675131823e-05, + "loss": 0.583, + "step": 5889 + }, + { + "epoch": 1.0218598195697433, + "grad_norm": 0.891065239906311, + "learning_rate": 2.6855339560326284e-05, + "loss": 0.78, + "step": 5890 + }, + { + "epoch": 1.022033310201249, + "grad_norm": 1.0871925354003906, + "learning_rate": 2.6848938649950597e-05, + "loss": 0.5951, + "step": 5891 + }, + { + "epoch": 1.022206800832755, + "grad_norm": 0.7330968976020813, + "learning_rate": 2.6842536944747597e-05, + "loss": 0.7903, + "step": 5892 + }, + { + "epoch": 1.022380291464261, + "grad_norm": 0.8664817810058594, + "learning_rate": 2.6836134445460212e-05, + "loss": 0.7135, + "step": 5893 + }, + { + "epoch": 1.0225537820957669, + "grad_norm": 0.9971588253974915, + "learning_rate": 2.682973115283146e-05, + "loss": 0.5474, + "step": 5894 + }, + { + "epoch": 1.0227272727272727, + "grad_norm": 0.989402174949646, + "learning_rate": 2.6823327067604452e-05, + "loss": 0.624, + "step": 5895 + }, + { + "epoch": 1.0229007633587786, + "grad_norm": 0.7410433888435364, + "learning_rate": 2.6816922190522386e-05, + "loss": 0.7742, + "step": 5896 + }, + { + "epoch": 1.0230742539902846, + "grad_norm": 0.8231345415115356, + "learning_rate": 2.6810516522328553e-05, + "loss": 0.5657, + "step": 5897 + }, + { + "epoch": 1.0232477446217905, + "grad_norm": 0.9923977851867676, + "learning_rate": 2.6804110063766345e-05, + "loss": 0.7347, + "step": 5898 + }, + { + "epoch": 1.0234212352532963, + "grad_norm": 0.764682948589325, + "learning_rate": 2.6797702815579234e-05, + "loss": 0.7338, + "step": 5899 + }, + { + "epoch": 1.0235947258848022, + "grad_norm": 0.871781587600708, + "learning_rate": 2.679129477851079e-05, + "loss": 0.7898, + "step": 5900 + }, + { + "epoch": 1.0237682165163082, + "grad_norm": 1.3401107788085938, + "learning_rate": 2.6784885953304676e-05, + "loss": 0.6638, + "step": 5901 + }, + { + "epoch": 1.023941707147814, + "grad_norm": 0.9512198567390442, + "learning_rate": 2.6778476340704636e-05, + "loss": 0.835, + "step": 5902 + }, + { + "epoch": 1.02411519777932, + "grad_norm": 0.997464120388031, + "learning_rate": 2.6772065941454527e-05, + "loss": 0.6956, + "step": 5903 + }, + { + "epoch": 1.0242886884108258, + "grad_norm": 0.8455097079277039, + "learning_rate": 2.6765654756298264e-05, + "loss": 0.6924, + "step": 5904 + }, + { + "epoch": 1.0244621790423316, + "grad_norm": 0.8826768398284912, + "learning_rate": 2.675924278597989e-05, + "loss": 0.7035, + "step": 5905 + }, + { + "epoch": 1.0246356696738377, + "grad_norm": 0.7397896647453308, + "learning_rate": 2.675283003124351e-05, + "loss": 0.7484, + "step": 5906 + }, + { + "epoch": 1.0248091603053435, + "grad_norm": 0.9405354261398315, + "learning_rate": 2.6746416492833343e-05, + "loss": 0.8005, + "step": 5907 + }, + { + "epoch": 1.0249826509368494, + "grad_norm": 0.9797018766403198, + "learning_rate": 2.6740002171493676e-05, + "loss": 0.676, + "step": 5908 + }, + { + "epoch": 1.0251561415683552, + "grad_norm": 0.9584313631057739, + "learning_rate": 2.67335870679689e-05, + "loss": 0.6804, + "step": 5909 + }, + { + "epoch": 1.0253296321998613, + "grad_norm": 0.8365959525108337, + "learning_rate": 2.6727171183003502e-05, + "loss": 0.6958, + "step": 5910 + }, + { + "epoch": 1.0255031228313671, + "grad_norm": 0.9888023734092712, + "learning_rate": 2.6720754517342053e-05, + "loss": 0.52, + "step": 5911 + }, + { + "epoch": 1.025676613462873, + "grad_norm": 1.013359785079956, + "learning_rate": 2.6714337071729207e-05, + "loss": 0.6233, + "step": 5912 + }, + { + "epoch": 1.0258501040943788, + "grad_norm": 1.0898033380508423, + "learning_rate": 2.6707918846909722e-05, + "loss": 0.6528, + "step": 5913 + }, + { + "epoch": 1.026023594725885, + "grad_norm": 0.8186234831809998, + "learning_rate": 2.6701499843628443e-05, + "loss": 0.8896, + "step": 5914 + }, + { + "epoch": 1.0261970853573907, + "grad_norm": 0.9258219003677368, + "learning_rate": 2.669508006263029e-05, + "loss": 0.6257, + "step": 5915 + }, + { + "epoch": 1.0263705759888966, + "grad_norm": 0.8620462417602539, + "learning_rate": 2.66886595046603e-05, + "loss": 0.8184, + "step": 5916 + }, + { + "epoch": 1.0265440666204024, + "grad_norm": 1.1799187660217285, + "learning_rate": 2.6682238170463575e-05, + "loss": 0.6874, + "step": 5917 + }, + { + "epoch": 1.0267175572519085, + "grad_norm": 0.9480648636817932, + "learning_rate": 2.6675816060785327e-05, + "loss": 0.7734, + "step": 5918 + }, + { + "epoch": 1.0268910478834143, + "grad_norm": 1.0558720827102661, + "learning_rate": 2.666939317637085e-05, + "loss": 0.6394, + "step": 5919 + }, + { + "epoch": 1.0270645385149202, + "grad_norm": 1.0143375396728516, + "learning_rate": 2.666296951796552e-05, + "loss": 0.8677, + "step": 5920 + }, + { + "epoch": 1.027238029146426, + "grad_norm": 0.7274793982505798, + "learning_rate": 2.665654508631481e-05, + "loss": 0.7207, + "step": 5921 + }, + { + "epoch": 1.027411519777932, + "grad_norm": 0.9707695245742798, + "learning_rate": 2.6650119882164292e-05, + "loss": 0.7534, + "step": 5922 + }, + { + "epoch": 1.027585010409438, + "grad_norm": 0.9701602458953857, + "learning_rate": 2.664369390625961e-05, + "loss": 0.7168, + "step": 5923 + }, + { + "epoch": 1.0277585010409438, + "grad_norm": 0.8117621541023254, + "learning_rate": 2.663726715934651e-05, + "loss": 0.7316, + "step": 5924 + }, + { + "epoch": 1.0279319916724496, + "grad_norm": 0.994392454624176, + "learning_rate": 2.663083964217082e-05, + "loss": 0.7166, + "step": 5925 + }, + { + "epoch": 1.0281054823039555, + "grad_norm": 0.772894561290741, + "learning_rate": 2.6624411355478463e-05, + "loss": 0.7131, + "step": 5926 + }, + { + "epoch": 1.0282789729354616, + "grad_norm": 0.8361301422119141, + "learning_rate": 2.6617982300015457e-05, + "loss": 0.7499, + "step": 5927 + }, + { + "epoch": 1.0284524635669674, + "grad_norm": 1.144643783569336, + "learning_rate": 2.661155247652788e-05, + "loss": 0.6852, + "step": 5928 + }, + { + "epoch": 1.0286259541984732, + "grad_norm": 0.9734717607498169, + "learning_rate": 2.6605121885761948e-05, + "loss": 0.6953, + "step": 5929 + }, + { + "epoch": 1.028799444829979, + "grad_norm": 1.9994429349899292, + "learning_rate": 2.6598690528463916e-05, + "loss": 0.6969, + "step": 5930 + }, + { + "epoch": 1.0289729354614852, + "grad_norm": 1.5050146579742432, + "learning_rate": 2.659225840538016e-05, + "loss": 0.7217, + "step": 5931 + }, + { + "epoch": 1.029146426092991, + "grad_norm": 1.0938124656677246, + "learning_rate": 2.6585825517257133e-05, + "loss": 0.7125, + "step": 5932 + }, + { + "epoch": 1.0293199167244969, + "grad_norm": 1.526424765586853, + "learning_rate": 2.657939186484139e-05, + "loss": 0.7192, + "step": 5933 + }, + { + "epoch": 1.0294934073560027, + "grad_norm": 1.1924688816070557, + "learning_rate": 2.6572957448879547e-05, + "loss": 0.6069, + "step": 5934 + }, + { + "epoch": 1.0296668979875088, + "grad_norm": 1.0248382091522217, + "learning_rate": 2.6566522270118333e-05, + "loss": 0.6149, + "step": 5935 + }, + { + "epoch": 1.0298403886190146, + "grad_norm": 1.7175177335739136, + "learning_rate": 2.656008632930456e-05, + "loss": 0.6333, + "step": 5936 + }, + { + "epoch": 1.0300138792505205, + "grad_norm": 0.7957581281661987, + "learning_rate": 2.6553649627185122e-05, + "loss": 0.8403, + "step": 5937 + }, + { + "epoch": 1.0301873698820263, + "grad_norm": 0.9218323826789856, + "learning_rate": 2.654721216450701e-05, + "loss": 0.8228, + "step": 5938 + }, + { + "epoch": 1.0303608605135324, + "grad_norm": 0.7854690551757812, + "learning_rate": 2.65407739420173e-05, + "loss": 0.7849, + "step": 5939 + }, + { + "epoch": 1.0305343511450382, + "grad_norm": 0.7573587894439697, + "learning_rate": 2.653433496046315e-05, + "loss": 0.719, + "step": 5940 + }, + { + "epoch": 1.030707841776544, + "grad_norm": 0.9685264825820923, + "learning_rate": 2.652789522059181e-05, + "loss": 0.5875, + "step": 5941 + }, + { + "epoch": 1.03088133240805, + "grad_norm": 0.8424142003059387, + "learning_rate": 2.652145472315063e-05, + "loss": 0.8467, + "step": 5942 + }, + { + "epoch": 1.0310548230395558, + "grad_norm": 0.9956643581390381, + "learning_rate": 2.6515013468887026e-05, + "loss": 0.692, + "step": 5943 + }, + { + "epoch": 1.0312283136710618, + "grad_norm": 0.8201285004615784, + "learning_rate": 2.650857145854852e-05, + "loss": 0.8044, + "step": 5944 + }, + { + "epoch": 1.0314018043025677, + "grad_norm": 0.8475446105003357, + "learning_rate": 2.650212869288271e-05, + "loss": 0.7513, + "step": 5945 + }, + { + "epoch": 1.0315752949340735, + "grad_norm": 1.2164384126663208, + "learning_rate": 2.6495685172637292e-05, + "loss": 0.606, + "step": 5946 + }, + { + "epoch": 1.0317487855655794, + "grad_norm": 1.2601618766784668, + "learning_rate": 2.6489240898560035e-05, + "loss": 0.517, + "step": 5947 + }, + { + "epoch": 1.0319222761970854, + "grad_norm": 1.0107890367507935, + "learning_rate": 2.6482795871398815e-05, + "loss": 0.6458, + "step": 5948 + }, + { + "epoch": 1.0320957668285913, + "grad_norm": 0.733284592628479, + "learning_rate": 2.6476350091901583e-05, + "loss": 0.698, + "step": 5949 + }, + { + "epoch": 1.0322692574600971, + "grad_norm": 1.2513178586959839, + "learning_rate": 2.646990356081637e-05, + "loss": 0.5837, + "step": 5950 + }, + { + "epoch": 1.032442748091603, + "grad_norm": 0.8141224980354309, + "learning_rate": 2.646345627889131e-05, + "loss": 0.7109, + "step": 5951 + }, + { + "epoch": 1.032616238723109, + "grad_norm": 1.3363779783248901, + "learning_rate": 2.645700824687462e-05, + "loss": 0.7072, + "step": 5952 + }, + { + "epoch": 1.0327897293546149, + "grad_norm": 0.9605594873428345, + "learning_rate": 2.6450559465514598e-05, + "loss": 0.8016, + "step": 5953 + }, + { + "epoch": 1.0329632199861207, + "grad_norm": 1.0082218647003174, + "learning_rate": 2.644410993555963e-05, + "loss": 0.7649, + "step": 5954 + }, + { + "epoch": 1.0331367106176266, + "grad_norm": 1.0920716524124146, + "learning_rate": 2.6437659657758198e-05, + "loss": 0.6316, + "step": 5955 + }, + { + "epoch": 1.0333102012491326, + "grad_norm": 0.8766392469406128, + "learning_rate": 2.643120863285886e-05, + "loss": 0.7417, + "step": 5956 + }, + { + "epoch": 1.0334836918806385, + "grad_norm": 1.3609262704849243, + "learning_rate": 2.6424756861610274e-05, + "loss": 0.6504, + "step": 5957 + }, + { + "epoch": 1.0336571825121443, + "grad_norm": 1.0491697788238525, + "learning_rate": 2.6418304344761165e-05, + "loss": 0.6189, + "step": 5958 + }, + { + "epoch": 1.0338306731436502, + "grad_norm": 0.9058055281639099, + "learning_rate": 2.6411851083060355e-05, + "loss": 0.6448, + "step": 5959 + }, + { + "epoch": 1.0340041637751562, + "grad_norm": 0.8935065865516663, + "learning_rate": 2.6405397077256752e-05, + "loss": 0.6008, + "step": 5960 + }, + { + "epoch": 1.034177654406662, + "grad_norm": 1.0522528886795044, + "learning_rate": 2.639894232809936e-05, + "loss": 0.7021, + "step": 5961 + }, + { + "epoch": 1.034351145038168, + "grad_norm": 4.148308753967285, + "learning_rate": 2.6392486836337256e-05, + "loss": 0.7571, + "step": 5962 + }, + { + "epoch": 1.0345246356696738, + "grad_norm": 1.703848958015442, + "learning_rate": 2.63860306027196e-05, + "loss": 0.5576, + "step": 5963 + }, + { + "epoch": 1.0346981263011796, + "grad_norm": 1.4969253540039062, + "learning_rate": 2.637957362799566e-05, + "loss": 0.6578, + "step": 5964 + }, + { + "epoch": 1.0348716169326857, + "grad_norm": 0.9624981880187988, + "learning_rate": 2.637311591291476e-05, + "loss": 0.5709, + "step": 5965 + }, + { + "epoch": 1.0350451075641915, + "grad_norm": 1.0903147459030151, + "learning_rate": 2.636665745822633e-05, + "loss": 0.6798, + "step": 5966 + }, + { + "epoch": 1.0352185981956974, + "grad_norm": 1.0320631265640259, + "learning_rate": 2.636019826467989e-05, + "loss": 0.7188, + "step": 5967 + }, + { + "epoch": 1.0353920888272032, + "grad_norm": 1.1003899574279785, + "learning_rate": 2.6353738333025022e-05, + "loss": 0.7463, + "step": 5968 + }, + { + "epoch": 1.0355655794587093, + "grad_norm": 0.8987311720848083, + "learning_rate": 2.6347277664011426e-05, + "loss": 0.6326, + "step": 5969 + }, + { + "epoch": 1.0357390700902152, + "grad_norm": 0.8579810261726379, + "learning_rate": 2.6340816258388858e-05, + "loss": 0.7859, + "step": 5970 + }, + { + "epoch": 1.035912560721721, + "grad_norm": 1.2032078504562378, + "learning_rate": 2.6334354116907173e-05, + "loss": 0.6858, + "step": 5971 + }, + { + "epoch": 1.0360860513532268, + "grad_norm": 1.1967957019805908, + "learning_rate": 2.6327891240316313e-05, + "loss": 0.7632, + "step": 5972 + }, + { + "epoch": 1.036259541984733, + "grad_norm": 1.7021011114120483, + "learning_rate": 2.6321427629366295e-05, + "loss": 0.7101, + "step": 5973 + }, + { + "epoch": 1.0364330326162388, + "grad_norm": 1.006156086921692, + "learning_rate": 2.6314963284807246e-05, + "loss": 0.7943, + "step": 5974 + }, + { + "epoch": 1.0366065232477446, + "grad_norm": 1.4427818059921265, + "learning_rate": 2.6308498207389344e-05, + "loss": 0.6677, + "step": 5975 + }, + { + "epoch": 1.0367800138792505, + "grad_norm": 1.3219972848892212, + "learning_rate": 2.630203239786287e-05, + "loss": 0.6848, + "step": 5976 + }, + { + "epoch": 1.0369535045107565, + "grad_norm": 1.3149486780166626, + "learning_rate": 2.6295565856978202e-05, + "loss": 0.7039, + "step": 5977 + }, + { + "epoch": 1.0371269951422624, + "grad_norm": 0.8783060908317566, + "learning_rate": 2.628909858548577e-05, + "loss": 0.6135, + "step": 5978 + }, + { + "epoch": 1.0373004857737682, + "grad_norm": 0.8092104196548462, + "learning_rate": 2.6282630584136123e-05, + "loss": 0.6561, + "step": 5979 + }, + { + "epoch": 1.037473976405274, + "grad_norm": 1.106422781944275, + "learning_rate": 2.6276161853679877e-05, + "loss": 0.7576, + "step": 5980 + }, + { + "epoch": 1.0376474670367801, + "grad_norm": 0.7469962239265442, + "learning_rate": 2.626969239486773e-05, + "loss": 0.7781, + "step": 5981 + }, + { + "epoch": 1.037820957668286, + "grad_norm": 1.0709221363067627, + "learning_rate": 2.626322220845048e-05, + "loss": 0.6891, + "step": 5982 + }, + { + "epoch": 1.0379944482997918, + "grad_norm": 1.106683611869812, + "learning_rate": 2.6256751295179e-05, + "loss": 0.5992, + "step": 5983 + }, + { + "epoch": 1.0381679389312977, + "grad_norm": 1.307600736618042, + "learning_rate": 2.6250279655804232e-05, + "loss": 0.7021, + "step": 5984 + }, + { + "epoch": 1.0383414295628035, + "grad_norm": 1.7808228731155396, + "learning_rate": 2.624380729107723e-05, + "loss": 0.6287, + "step": 5985 + }, + { + "epoch": 1.0385149201943096, + "grad_norm": 0.9543058276176453, + "learning_rate": 2.6237334201749126e-05, + "loss": 0.7201, + "step": 5986 + }, + { + "epoch": 1.0386884108258154, + "grad_norm": 0.8104865550994873, + "learning_rate": 2.6230860388571103e-05, + "loss": 0.6404, + "step": 5987 + }, + { + "epoch": 1.0388619014573213, + "grad_norm": 1.4280214309692383, + "learning_rate": 2.6224385852294484e-05, + "loss": 0.6829, + "step": 5988 + }, + { + "epoch": 1.0390353920888271, + "grad_norm": 0.8720922470092773, + "learning_rate": 2.6217910593670626e-05, + "loss": 0.7418, + "step": 5989 + }, + { + "epoch": 1.0392088827203332, + "grad_norm": 0.9216084480285645, + "learning_rate": 2.6211434613451006e-05, + "loss": 0.7125, + "step": 5990 + }, + { + "epoch": 1.039382373351839, + "grad_norm": 1.019770622253418, + "learning_rate": 2.6204957912387156e-05, + "loss": 0.7407, + "step": 5991 + }, + { + "epoch": 1.0395558639833449, + "grad_norm": 0.764509916305542, + "learning_rate": 2.6198480491230712e-05, + "loss": 0.6748, + "step": 5992 + }, + { + "epoch": 1.0397293546148507, + "grad_norm": 1.1720954179763794, + "learning_rate": 2.6192002350733387e-05, + "loss": 0.7212, + "step": 5993 + }, + { + "epoch": 1.0399028452463568, + "grad_norm": 0.8889636993408203, + "learning_rate": 2.618552349164697e-05, + "loss": 0.668, + "step": 5994 + }, + { + "epoch": 1.0400763358778626, + "grad_norm": 1.080152153968811, + "learning_rate": 2.617904391472334e-05, + "loss": 0.5853, + "step": 5995 + }, + { + "epoch": 1.0402498265093685, + "grad_norm": 1.0356941223144531, + "learning_rate": 2.6172563620714475e-05, + "loss": 0.7566, + "step": 5996 + }, + { + "epoch": 1.0404233171408743, + "grad_norm": 0.9450023174285889, + "learning_rate": 2.61660826103724e-05, + "loss": 0.719, + "step": 5997 + }, + { + "epoch": 1.0405968077723804, + "grad_norm": 0.7715034484863281, + "learning_rate": 2.6159600884449258e-05, + "loss": 0.8276, + "step": 5998 + }, + { + "epoch": 1.0407702984038862, + "grad_norm": 1.2489008903503418, + "learning_rate": 2.6153118443697255e-05, + "loss": 0.6787, + "step": 5999 + }, + { + "epoch": 1.040943789035392, + "grad_norm": 1.7975891828536987, + "learning_rate": 2.6146635288868685e-05, + "loss": 0.7321, + "step": 6000 + }, + { + "epoch": 1.041117279666898, + "grad_norm": 1.1592804193496704, + "learning_rate": 2.6140151420715932e-05, + "loss": 0.5724, + "step": 6001 + }, + { + "epoch": 1.0412907702984038, + "grad_norm": 1.1514254808425903, + "learning_rate": 2.6133666839991444e-05, + "loss": 0.6736, + "step": 6002 + }, + { + "epoch": 1.0414642609299098, + "grad_norm": 0.9709215760231018, + "learning_rate": 2.6127181547447773e-05, + "loss": 0.6318, + "step": 6003 + }, + { + "epoch": 1.0416377515614157, + "grad_norm": 0.7522684335708618, + "learning_rate": 2.612069554383755e-05, + "loss": 0.5846, + "step": 6004 + }, + { + "epoch": 1.0418112421929215, + "grad_norm": 1.85126531124115, + "learning_rate": 2.6114208829913473e-05, + "loss": 0.7402, + "step": 6005 + }, + { + "epoch": 1.0419847328244274, + "grad_norm": 0.8680480122566223, + "learning_rate": 2.6107721406428338e-05, + "loss": 0.7542, + "step": 6006 + }, + { + "epoch": 1.0421582234559335, + "grad_norm": 1.0733129978179932, + "learning_rate": 2.6101233274135017e-05, + "loss": 0.5442, + "step": 6007 + }, + { + "epoch": 1.0423317140874393, + "grad_norm": 0.8770571947097778, + "learning_rate": 2.6094744433786467e-05, + "loss": 0.7502, + "step": 6008 + }, + { + "epoch": 1.0425052047189451, + "grad_norm": 1.176040530204773, + "learning_rate": 2.608825488613572e-05, + "loss": 0.627, + "step": 6009 + }, + { + "epoch": 1.042678695350451, + "grad_norm": 0.9277677536010742, + "learning_rate": 2.6081764631935896e-05, + "loss": 0.6349, + "step": 6010 + }, + { + "epoch": 1.042852185981957, + "grad_norm": 0.8327305912971497, + "learning_rate": 2.6075273671940212e-05, + "loss": 0.7059, + "step": 6011 + }, + { + "epoch": 1.043025676613463, + "grad_norm": 0.792158842086792, + "learning_rate": 2.606878200690193e-05, + "loss": 0.6628, + "step": 6012 + }, + { + "epoch": 1.0431991672449688, + "grad_norm": 1.0641145706176758, + "learning_rate": 2.6062289637574428e-05, + "loss": 0.5348, + "step": 6013 + }, + { + "epoch": 1.0433726578764746, + "grad_norm": 0.8910120129585266, + "learning_rate": 2.605579656471115e-05, + "loss": 0.7339, + "step": 6014 + }, + { + "epoch": 1.0435461485079807, + "grad_norm": 0.910671055316925, + "learning_rate": 2.6049302789065624e-05, + "loss": 0.7323, + "step": 6015 + }, + { + "epoch": 1.0437196391394865, + "grad_norm": 0.8647057414054871, + "learning_rate": 2.6042808311391456e-05, + "loss": 0.7288, + "step": 6016 + }, + { + "epoch": 1.0438931297709924, + "grad_norm": 0.9558467268943787, + "learning_rate": 2.603631313244235e-05, + "loss": 0.5897, + "step": 6017 + }, + { + "epoch": 1.0440666204024982, + "grad_norm": 1.298647403717041, + "learning_rate": 2.6029817252972064e-05, + "loss": 0.7991, + "step": 6018 + }, + { + "epoch": 1.0442401110340043, + "grad_norm": 1.5438718795776367, + "learning_rate": 2.6023320673734462e-05, + "loss": 0.6194, + "step": 6019 + }, + { + "epoch": 1.0444136016655101, + "grad_norm": 0.8710010051727295, + "learning_rate": 2.6016823395483482e-05, + "loss": 0.6344, + "step": 6020 + }, + { + "epoch": 1.044587092297016, + "grad_norm": 1.1649746894836426, + "learning_rate": 2.6010325418973127e-05, + "loss": 0.7002, + "step": 6021 + }, + { + "epoch": 1.0447605829285218, + "grad_norm": 1.0860332250595093, + "learning_rate": 2.600382674495751e-05, + "loss": 0.6223, + "step": 6022 + }, + { + "epoch": 1.0449340735600277, + "grad_norm": 0.7389101386070251, + "learning_rate": 2.5997327374190797e-05, + "loss": 0.688, + "step": 6023 + }, + { + "epoch": 1.0451075641915337, + "grad_norm": 0.9267382025718689, + "learning_rate": 2.5990827307427263e-05, + "loss": 0.6918, + "step": 6024 + }, + { + "epoch": 1.0452810548230396, + "grad_norm": 0.66621994972229, + "learning_rate": 2.5984326545421238e-05, + "loss": 0.7695, + "step": 6025 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 0.936700165271759, + "learning_rate": 2.5977825088927135e-05, + "loss": 0.8359, + "step": 6026 + }, + { + "epoch": 1.0456280360860513, + "grad_norm": 1.0298784971237183, + "learning_rate": 2.597132293869947e-05, + "loss": 0.5989, + "step": 6027 + }, + { + "epoch": 1.0458015267175573, + "grad_norm": 1.2993345260620117, + "learning_rate": 2.5964820095492825e-05, + "loss": 0.5636, + "step": 6028 + }, + { + "epoch": 1.0459750173490632, + "grad_norm": 1.4780561923980713, + "learning_rate": 2.5958316560061853e-05, + "loss": 0.6932, + "step": 6029 + }, + { + "epoch": 1.046148507980569, + "grad_norm": 0.9956575632095337, + "learning_rate": 2.5951812333161298e-05, + "loss": 0.7446, + "step": 6030 + }, + { + "epoch": 1.0463219986120749, + "grad_norm": 2.4054532051086426, + "learning_rate": 2.594530741554599e-05, + "loss": 0.6055, + "step": 6031 + }, + { + "epoch": 1.046495489243581, + "grad_norm": 0.6693838238716125, + "learning_rate": 2.593880180797083e-05, + "loss": 0.7771, + "step": 6032 + }, + { + "epoch": 1.0466689798750868, + "grad_norm": 1.1368287801742554, + "learning_rate": 2.5932295511190803e-05, + "loss": 0.6387, + "step": 6033 + }, + { + "epoch": 1.0468424705065926, + "grad_norm": 0.8197100162506104, + "learning_rate": 2.5925788525960964e-05, + "loss": 0.7072, + "step": 6034 + }, + { + "epoch": 1.0470159611380985, + "grad_norm": 0.9514124989509583, + "learning_rate": 2.5919280853036462e-05, + "loss": 0.7069, + "step": 6035 + }, + { + "epoch": 1.0471894517696045, + "grad_norm": 1.1060067415237427, + "learning_rate": 2.5912772493172523e-05, + "loss": 0.7637, + "step": 6036 + }, + { + "epoch": 1.0473629424011104, + "grad_norm": 0.8402566909790039, + "learning_rate": 2.590626344712444e-05, + "loss": 0.6343, + "step": 6037 + }, + { + "epoch": 1.0475364330326162, + "grad_norm": 1.4169939756393433, + "learning_rate": 2.5899753715647614e-05, + "loss": 0.613, + "step": 6038 + }, + { + "epoch": 1.047709923664122, + "grad_norm": 0.9187904000282288, + "learning_rate": 2.5893243299497483e-05, + "loss": 0.5973, + "step": 6039 + }, + { + "epoch": 1.0478834142956281, + "grad_norm": 0.7955947518348694, + "learning_rate": 2.5886732199429606e-05, + "loss": 0.7183, + "step": 6040 + }, + { + "epoch": 1.048056904927134, + "grad_norm": 1.105449914932251, + "learning_rate": 2.5880220416199598e-05, + "loss": 0.613, + "step": 6041 + }, + { + "epoch": 1.0482303955586398, + "grad_norm": 1.114923119544983, + "learning_rate": 2.587370795056315e-05, + "loss": 0.6714, + "step": 6042 + }, + { + "epoch": 1.0484038861901457, + "grad_norm": 1.590577244758606, + "learning_rate": 2.5867194803276058e-05, + "loss": 0.6342, + "step": 6043 + }, + { + "epoch": 1.0485773768216515, + "grad_norm": 0.7708985805511475, + "learning_rate": 2.5860680975094178e-05, + "loss": 0.7299, + "step": 6044 + }, + { + "epoch": 1.0487508674531576, + "grad_norm": 1.2186282873153687, + "learning_rate": 2.585416646677343e-05, + "loss": 0.6478, + "step": 6045 + }, + { + "epoch": 1.0489243580846634, + "grad_norm": 1.1871280670166016, + "learning_rate": 2.5847651279069847e-05, + "loss": 0.7549, + "step": 6046 + }, + { + "epoch": 1.0490978487161693, + "grad_norm": 2.7765071392059326, + "learning_rate": 2.584113541273952e-05, + "loss": 0.5964, + "step": 6047 + }, + { + "epoch": 1.0492713393476751, + "grad_norm": 1.1401163339614868, + "learning_rate": 2.5834618868538623e-05, + "loss": 0.5892, + "step": 6048 + }, + { + "epoch": 1.0494448299791812, + "grad_norm": 0.868272066116333, + "learning_rate": 2.58281016472234e-05, + "loss": 0.7389, + "step": 6049 + }, + { + "epoch": 1.049618320610687, + "grad_norm": 0.9484847784042358, + "learning_rate": 2.58215837495502e-05, + "loss": 0.7218, + "step": 6050 + }, + { + "epoch": 1.049791811242193, + "grad_norm": 1.0394455194473267, + "learning_rate": 2.5815065176275417e-05, + "loss": 0.6626, + "step": 6051 + }, + { + "epoch": 1.0499653018736987, + "grad_norm": 0.8406088352203369, + "learning_rate": 2.5808545928155547e-05, + "loss": 0.6731, + "step": 6052 + }, + { + "epoch": 1.0501387925052048, + "grad_norm": 1.379712462425232, + "learning_rate": 2.580202600594715e-05, + "loss": 0.5524, + "step": 6053 + }, + { + "epoch": 1.0503122831367107, + "grad_norm": 0.7615464329719543, + "learning_rate": 2.5795505410406878e-05, + "loss": 0.8455, + "step": 6054 + }, + { + "epoch": 1.0504857737682165, + "grad_norm": 0.8118395805358887, + "learning_rate": 2.5788984142291448e-05, + "loss": 0.8542, + "step": 6055 + }, + { + "epoch": 1.0506592643997223, + "grad_norm": 1.0067602396011353, + "learning_rate": 2.5782462202357664e-05, + "loss": 0.6559, + "step": 6056 + }, + { + "epoch": 1.0508327550312284, + "grad_norm": 0.9628933668136597, + "learning_rate": 2.5775939591362403e-05, + "loss": 0.7424, + "step": 6057 + }, + { + "epoch": 1.0510062456627343, + "grad_norm": 0.7792642712593079, + "learning_rate": 2.5769416310062622e-05, + "loss": 0.5588, + "step": 6058 + }, + { + "epoch": 1.05117973629424, + "grad_norm": 0.689254879951477, + "learning_rate": 2.576289235921536e-05, + "loss": 0.8047, + "step": 6059 + }, + { + "epoch": 1.051353226925746, + "grad_norm": 1.3105641603469849, + "learning_rate": 2.5756367739577713e-05, + "loss": 0.6836, + "step": 6060 + }, + { + "epoch": 1.0515267175572518, + "grad_norm": 1.1062088012695312, + "learning_rate": 2.574984245190689e-05, + "loss": 0.6985, + "step": 6061 + }, + { + "epoch": 1.0517002081887579, + "grad_norm": 0.6656411290168762, + "learning_rate": 2.5743316496960154e-05, + "loss": 0.8579, + "step": 6062 + }, + { + "epoch": 1.0518736988202637, + "grad_norm": 1.0541574954986572, + "learning_rate": 2.5736789875494844e-05, + "loss": 0.5732, + "step": 6063 + }, + { + "epoch": 1.0520471894517696, + "grad_norm": 1.3774231672286987, + "learning_rate": 2.573026258826838e-05, + "loss": 0.6635, + "step": 6064 + }, + { + "epoch": 1.0522206800832754, + "grad_norm": 1.0305851697921753, + "learning_rate": 2.5723734636038272e-05, + "loss": 0.6525, + "step": 6065 + }, + { + "epoch": 1.0523941707147815, + "grad_norm": 1.1351374387741089, + "learning_rate": 2.571720601956208e-05, + "loss": 0.6541, + "step": 6066 + }, + { + "epoch": 1.0525676613462873, + "grad_norm": 1.4381340742111206, + "learning_rate": 2.571067673959748e-05, + "loss": 0.6927, + "step": 6067 + }, + { + "epoch": 1.0527411519777932, + "grad_norm": 1.0119799375534058, + "learning_rate": 2.570414679690218e-05, + "loss": 0.8621, + "step": 6068 + }, + { + "epoch": 1.052914642609299, + "grad_norm": 0.7759705185890198, + "learning_rate": 2.5697616192234005e-05, + "loss": 0.6755, + "step": 6069 + }, + { + "epoch": 1.053088133240805, + "grad_norm": 0.8403199315071106, + "learning_rate": 2.5691084926350825e-05, + "loss": 0.7675, + "step": 6070 + }, + { + "epoch": 1.053261623872311, + "grad_norm": 0.827292263507843, + "learning_rate": 2.568455300001061e-05, + "loss": 0.7477, + "step": 6071 + }, + { + "epoch": 1.0534351145038168, + "grad_norm": 0.7212702035903931, + "learning_rate": 2.56780204139714e-05, + "loss": 0.5526, + "step": 6072 + }, + { + "epoch": 1.0536086051353226, + "grad_norm": 1.3276541233062744, + "learning_rate": 2.56714871689913e-05, + "loss": 0.582, + "step": 6073 + }, + { + "epoch": 1.0537820957668287, + "grad_norm": 0.7033882141113281, + "learning_rate": 2.5664953265828504e-05, + "loss": 0.8296, + "step": 6074 + }, + { + "epoch": 1.0539555863983345, + "grad_norm": 0.955555260181427, + "learning_rate": 2.5658418705241283e-05, + "loss": 0.5874, + "step": 6075 + }, + { + "epoch": 1.0541290770298404, + "grad_norm": 0.9649580121040344, + "learning_rate": 2.565188348798798e-05, + "loss": 0.8765, + "step": 6076 + }, + { + "epoch": 1.0543025676613462, + "grad_norm": 1.00332510471344, + "learning_rate": 2.5645347614827008e-05, + "loss": 0.7092, + "step": 6077 + }, + { + "epoch": 1.0544760582928523, + "grad_norm": 1.6023979187011719, + "learning_rate": 2.5638811086516873e-05, + "loss": 0.5956, + "step": 6078 + }, + { + "epoch": 1.0546495489243581, + "grad_norm": 1.1717655658721924, + "learning_rate": 2.5632273903816133e-05, + "loss": 0.6951, + "step": 6079 + }, + { + "epoch": 1.054823039555864, + "grad_norm": 1.4340835809707642, + "learning_rate": 2.562573606748345e-05, + "loss": 0.6039, + "step": 6080 + }, + { + "epoch": 1.0549965301873698, + "grad_norm": 1.4536679983139038, + "learning_rate": 2.561919757827754e-05, + "loss": 0.7933, + "step": 6081 + }, + { + "epoch": 1.0551700208188757, + "grad_norm": 0.988136351108551, + "learning_rate": 2.5612658436957204e-05, + "loss": 0.7239, + "step": 6082 + }, + { + "epoch": 1.0553435114503817, + "grad_norm": 0.9415732026100159, + "learning_rate": 2.5606118644281318e-05, + "loss": 0.6321, + "step": 6083 + }, + { + "epoch": 1.0555170020818876, + "grad_norm": 1.3682222366333008, + "learning_rate": 2.5599578201008824e-05, + "loss": 0.6742, + "step": 6084 + }, + { + "epoch": 1.0556904927133934, + "grad_norm": 1.0026897192001343, + "learning_rate": 2.559303710789876e-05, + "loss": 0.5579, + "step": 6085 + }, + { + "epoch": 1.0558639833448993, + "grad_norm": 1.343124270439148, + "learning_rate": 2.5586495365710225e-05, + "loss": 0.7412, + "step": 6086 + }, + { + "epoch": 1.0560374739764053, + "grad_norm": 1.0506234169006348, + "learning_rate": 2.557995297520239e-05, + "loss": 0.7197, + "step": 6087 + }, + { + "epoch": 1.0562109646079112, + "grad_norm": 1.3366260528564453, + "learning_rate": 2.5573409937134508e-05, + "loss": 0.5928, + "step": 6088 + }, + { + "epoch": 1.056384455239417, + "grad_norm": 0.9347487688064575, + "learning_rate": 2.5566866252265908e-05, + "loss": 0.7407, + "step": 6089 + }, + { + "epoch": 1.0565579458709229, + "grad_norm": 1.2604597806930542, + "learning_rate": 2.5560321921355996e-05, + "loss": 0.585, + "step": 6090 + }, + { + "epoch": 1.056731436502429, + "grad_norm": 0.9852869510650635, + "learning_rate": 2.555377694516425e-05, + "loss": 0.5913, + "step": 6091 + }, + { + "epoch": 1.0569049271339348, + "grad_norm": 0.7745399475097656, + "learning_rate": 2.554723132445021e-05, + "loss": 0.821, + "step": 6092 + }, + { + "epoch": 1.0570784177654406, + "grad_norm": 1.1352410316467285, + "learning_rate": 2.5540685059973514e-05, + "loss": 0.7451, + "step": 6093 + }, + { + "epoch": 1.0572519083969465, + "grad_norm": 1.138653039932251, + "learning_rate": 2.5534138152493863e-05, + "loss": 0.7771, + "step": 6094 + }, + { + "epoch": 1.0574253990284526, + "grad_norm": 1.1257606744766235, + "learning_rate": 2.5527590602771026e-05, + "loss": 0.6322, + "step": 6095 + }, + { + "epoch": 1.0575988896599584, + "grad_norm": 1.5965912342071533, + "learning_rate": 2.5521042411564866e-05, + "loss": 0.6877, + "step": 6096 + }, + { + "epoch": 1.0577723802914643, + "grad_norm": 1.3666285276412964, + "learning_rate": 2.551449357963529e-05, + "loss": 0.578, + "step": 6097 + }, + { + "epoch": 1.05794587092297, + "grad_norm": 0.9524034857749939, + "learning_rate": 2.5507944107742314e-05, + "loss": 0.8293, + "step": 6098 + }, + { + "epoch": 1.0581193615544762, + "grad_norm": 0.9199702143669128, + "learning_rate": 2.550139399664601e-05, + "loss": 0.5321, + "step": 6099 + }, + { + "epoch": 1.058292852185982, + "grad_norm": 0.6754572987556458, + "learning_rate": 2.549484324710652e-05, + "loss": 0.7335, + "step": 6100 + }, + { + "epoch": 1.0584663428174879, + "grad_norm": 0.8482014536857605, + "learning_rate": 2.5488291859884067e-05, + "loss": 0.5797, + "step": 6101 + }, + { + "epoch": 1.0586398334489937, + "grad_norm": 1.1112004518508911, + "learning_rate": 2.548173983573895e-05, + "loss": 0.6566, + "step": 6102 + }, + { + "epoch": 1.0588133240804996, + "grad_norm": 1.1890714168548584, + "learning_rate": 2.5475187175431532e-05, + "loss": 0.6406, + "step": 6103 + }, + { + "epoch": 1.0589868147120056, + "grad_norm": 1.6508711576461792, + "learning_rate": 2.5468633879722272e-05, + "loss": 0.646, + "step": 6104 + }, + { + "epoch": 1.0591603053435115, + "grad_norm": 1.0344789028167725, + "learning_rate": 2.5462079949371665e-05, + "loss": 0.8259, + "step": 6105 + }, + { + "epoch": 1.0593337959750173, + "grad_norm": 0.8345190286636353, + "learning_rate": 2.545552538514033e-05, + "loss": 0.7686, + "step": 6106 + }, + { + "epoch": 1.0595072866065232, + "grad_norm": 1.073713779449463, + "learning_rate": 2.5448970187788913e-05, + "loss": 0.7012, + "step": 6107 + }, + { + "epoch": 1.0596807772380292, + "grad_norm": 0.8083980083465576, + "learning_rate": 2.5442414358078148e-05, + "loss": 0.7244, + "step": 6108 + }, + { + "epoch": 1.059854267869535, + "grad_norm": 1.0002110004425049, + "learning_rate": 2.5435857896768862e-05, + "loss": 0.5946, + "step": 6109 + }, + { + "epoch": 1.060027758501041, + "grad_norm": 1.1601295471191406, + "learning_rate": 2.5429300804621934e-05, + "loss": 0.7145, + "step": 6110 + }, + { + "epoch": 1.0602012491325468, + "grad_norm": 0.7113509774208069, + "learning_rate": 2.542274308239832e-05, + "loss": 0.7249, + "step": 6111 + }, + { + "epoch": 1.0603747397640528, + "grad_norm": 0.7518972754478455, + "learning_rate": 2.541618473085905e-05, + "loss": 0.7052, + "step": 6112 + }, + { + "epoch": 1.0605482303955587, + "grad_norm": 0.9015205502510071, + "learning_rate": 2.540962575076523e-05, + "loss": 0.7119, + "step": 6113 + }, + { + "epoch": 1.0607217210270645, + "grad_norm": 1.3080379962921143, + "learning_rate": 2.5403066142878047e-05, + "loss": 0.7313, + "step": 6114 + }, + { + "epoch": 1.0608952116585704, + "grad_norm": 0.9788916707038879, + "learning_rate": 2.5396505907958736e-05, + "loss": 0.8462, + "step": 6115 + }, + { + "epoch": 1.0610687022900764, + "grad_norm": 0.9193121790885925, + "learning_rate": 2.538994504676862e-05, + "loss": 0.7125, + "step": 6116 + }, + { + "epoch": 1.0612421929215823, + "grad_norm": 1.0796575546264648, + "learning_rate": 2.5383383560069113e-05, + "loss": 0.5829, + "step": 6117 + }, + { + "epoch": 1.0614156835530881, + "grad_norm": 1.3193600177764893, + "learning_rate": 2.537682144862166e-05, + "loss": 0.8176, + "step": 6118 + }, + { + "epoch": 1.061589174184594, + "grad_norm": 0.8457702994346619, + "learning_rate": 2.537025871318782e-05, + "loss": 0.7715, + "step": 6119 + }, + { + "epoch": 1.0617626648160998, + "grad_norm": 1.0997400283813477, + "learning_rate": 2.53636953545292e-05, + "loss": 0.5577, + "step": 6120 + }, + { + "epoch": 1.061936155447606, + "grad_norm": 1.0828720331192017, + "learning_rate": 2.5357131373407478e-05, + "loss": 0.6975, + "step": 6121 + }, + { + "epoch": 1.0621096460791117, + "grad_norm": 1.179482102394104, + "learning_rate": 2.5350566770584423e-05, + "loss": 0.6854, + "step": 6122 + }, + { + "epoch": 1.0622831367106176, + "grad_norm": 0.8115193843841553, + "learning_rate": 2.534400154682185e-05, + "loss": 0.7592, + "step": 6123 + }, + { + "epoch": 1.0624566273421234, + "grad_norm": 0.9126799702644348, + "learning_rate": 2.5337435702881683e-05, + "loss": 0.6362, + "step": 6124 + }, + { + "epoch": 1.0626301179736295, + "grad_norm": 0.7792446613311768, + "learning_rate": 2.5330869239525874e-05, + "loss": 0.8232, + "step": 6125 + }, + { + "epoch": 1.0628036086051353, + "grad_norm": 1.4506880044937134, + "learning_rate": 2.5324302157516486e-05, + "loss": 0.532, + "step": 6126 + }, + { + "epoch": 1.0629770992366412, + "grad_norm": 0.9600554704666138, + "learning_rate": 2.531773445761562e-05, + "loss": 0.5985, + "step": 6127 + }, + { + "epoch": 1.063150589868147, + "grad_norm": 0.7474873661994934, + "learning_rate": 2.531116614058548e-05, + "loss": 0.8577, + "step": 6128 + }, + { + "epoch": 1.063324080499653, + "grad_norm": 1.0871683359146118, + "learning_rate": 2.5304597207188318e-05, + "loss": 0.8098, + "step": 6129 + }, + { + "epoch": 1.063497571131159, + "grad_norm": 1.204296588897705, + "learning_rate": 2.5298027658186472e-05, + "loss": 0.6676, + "step": 6130 + }, + { + "epoch": 1.0636710617626648, + "grad_norm": 1.1559405326843262, + "learning_rate": 2.529145749434234e-05, + "loss": 0.5854, + "step": 6131 + }, + { + "epoch": 1.0638445523941706, + "grad_norm": 1.0512157678604126, + "learning_rate": 2.52848867164184e-05, + "loss": 0.7078, + "step": 6132 + }, + { + "epoch": 1.0640180430256767, + "grad_norm": 1.02376127243042, + "learning_rate": 2.52783153251772e-05, + "loss": 0.6746, + "step": 6133 + }, + { + "epoch": 1.0641915336571826, + "grad_norm": 0.8188244700431824, + "learning_rate": 2.5271743321381354e-05, + "loss": 0.6227, + "step": 6134 + }, + { + "epoch": 1.0643650242886884, + "grad_norm": 0.7822453379631042, + "learning_rate": 2.5265170705793555e-05, + "loss": 0.6243, + "step": 6135 + }, + { + "epoch": 1.0645385149201942, + "grad_norm": 0.9012399911880493, + "learning_rate": 2.525859747917656e-05, + "loss": 0.7263, + "step": 6136 + }, + { + "epoch": 1.0647120055517, + "grad_norm": 0.982111394405365, + "learning_rate": 2.52520236422932e-05, + "loss": 0.8401, + "step": 6137 + }, + { + "epoch": 1.0648854961832062, + "grad_norm": 0.9321725368499756, + "learning_rate": 2.524544919590638e-05, + "loss": 0.6244, + "step": 6138 + }, + { + "epoch": 1.065058986814712, + "grad_norm": 0.728926956653595, + "learning_rate": 2.5238874140779057e-05, + "loss": 0.8296, + "step": 6139 + }, + { + "epoch": 1.0652324774462179, + "grad_norm": 0.9953757524490356, + "learning_rate": 2.5232298477674297e-05, + "loss": 0.7728, + "step": 6140 + }, + { + "epoch": 1.065405968077724, + "grad_norm": 1.5170553922653198, + "learning_rate": 2.5225722207355202e-05, + "loss": 0.6371, + "step": 6141 + }, + { + "epoch": 1.0655794587092298, + "grad_norm": 0.9082801938056946, + "learning_rate": 2.5219145330584945e-05, + "loss": 0.6776, + "step": 6142 + }, + { + "epoch": 1.0657529493407356, + "grad_norm": 1.1077604293823242, + "learning_rate": 2.5212567848126802e-05, + "loss": 0.5677, + "step": 6143 + }, + { + "epoch": 1.0659264399722415, + "grad_norm": 0.6880508661270142, + "learning_rate": 2.5205989760744084e-05, + "loss": 0.7549, + "step": 6144 + }, + { + "epoch": 1.0660999306037473, + "grad_norm": 0.8310517072677612, + "learning_rate": 2.519941106920018e-05, + "loss": 0.7058, + "step": 6145 + }, + { + "epoch": 1.0662734212352534, + "grad_norm": 0.6548441052436829, + "learning_rate": 2.5192831774258575e-05, + "loss": 0.7542, + "step": 6146 + }, + { + "epoch": 1.0664469118667592, + "grad_norm": 0.8065654635429382, + "learning_rate": 2.5186251876682782e-05, + "loss": 0.7058, + "step": 6147 + }, + { + "epoch": 1.066620402498265, + "grad_norm": 0.9487408995628357, + "learning_rate": 2.5179671377236422e-05, + "loss": 0.7847, + "step": 6148 + }, + { + "epoch": 1.066793893129771, + "grad_norm": 1.006015419960022, + "learning_rate": 2.5173090276683157e-05, + "loss": 0.5857, + "step": 6149 + }, + { + "epoch": 1.066967383761277, + "grad_norm": 0.7247277498245239, + "learning_rate": 2.516650857578674e-05, + "loss": 0.7233, + "step": 6150 + }, + { + "epoch": 1.0671408743927828, + "grad_norm": 0.9885105490684509, + "learning_rate": 2.515992627531098e-05, + "loss": 0.5863, + "step": 6151 + }, + { + "epoch": 1.0673143650242887, + "grad_norm": 0.964225172996521, + "learning_rate": 2.515334337601977e-05, + "loss": 0.5857, + "step": 6152 + }, + { + "epoch": 1.0674878556557945, + "grad_norm": 0.8715090751647949, + "learning_rate": 2.514675987867705e-05, + "loss": 0.8022, + "step": 6153 + }, + { + "epoch": 1.0676613462873006, + "grad_norm": 0.6956591010093689, + "learning_rate": 2.5140175784046858e-05, + "loss": 0.8618, + "step": 6154 + }, + { + "epoch": 1.0678348369188064, + "grad_norm": 1.3712687492370605, + "learning_rate": 2.5133591092893265e-05, + "loss": 0.728, + "step": 6155 + }, + { + "epoch": 1.0680083275503123, + "grad_norm": 0.8442012667655945, + "learning_rate": 2.512700580598045e-05, + "loss": 0.728, + "step": 6156 + }, + { + "epoch": 1.0681818181818181, + "grad_norm": 1.4859000444412231, + "learning_rate": 2.512041992407264e-05, + "loss": 0.5919, + "step": 6157 + }, + { + "epoch": 1.0683553088133242, + "grad_norm": 1.1923282146453857, + "learning_rate": 2.5113833447934126e-05, + "loss": 0.5426, + "step": 6158 + }, + { + "epoch": 1.06852879944483, + "grad_norm": 1.0268504619598389, + "learning_rate": 2.5107246378329287e-05, + "loss": 0.7449, + "step": 6159 + }, + { + "epoch": 1.0687022900763359, + "grad_norm": 1.0911448001861572, + "learning_rate": 2.510065871602255e-05, + "loss": 0.6216, + "step": 6160 + }, + { + "epoch": 1.0688757807078417, + "grad_norm": 1.2605198621749878, + "learning_rate": 2.5094070461778424e-05, + "loss": 0.6011, + "step": 6161 + }, + { + "epoch": 1.0690492713393476, + "grad_norm": 0.9996030926704407, + "learning_rate": 2.5087481616361493e-05, + "loss": 0.6018, + "step": 6162 + }, + { + "epoch": 1.0692227619708536, + "grad_norm": 1.2714729309082031, + "learning_rate": 2.5080892180536386e-05, + "loss": 0.6624, + "step": 6163 + }, + { + "epoch": 1.0693962526023595, + "grad_norm": 0.9976373910903931, + "learning_rate": 2.5074302155067823e-05, + "loss": 0.7809, + "step": 6164 + }, + { + "epoch": 1.0695697432338653, + "grad_norm": 0.766667366027832, + "learning_rate": 2.506771154072058e-05, + "loss": 0.7754, + "step": 6165 + }, + { + "epoch": 1.0697432338653712, + "grad_norm": 0.7594448924064636, + "learning_rate": 2.5061120338259512e-05, + "loss": 0.7556, + "step": 6166 + }, + { + "epoch": 1.0699167244968772, + "grad_norm": 0.632052481174469, + "learning_rate": 2.505452854844953e-05, + "loss": 0.9033, + "step": 6167 + }, + { + "epoch": 1.070090215128383, + "grad_norm": 1.154930591583252, + "learning_rate": 2.5047936172055613e-05, + "loss": 0.5396, + "step": 6168 + }, + { + "epoch": 1.070263705759889, + "grad_norm": 0.8313484191894531, + "learning_rate": 2.504134320984283e-05, + "loss": 0.7059, + "step": 6169 + }, + { + "epoch": 1.0704371963913948, + "grad_norm": 1.0997209548950195, + "learning_rate": 2.5034749662576293e-05, + "loss": 0.5625, + "step": 6170 + }, + { + "epoch": 1.0706106870229009, + "grad_norm": 0.9104084372520447, + "learning_rate": 2.5028155531021186e-05, + "loss": 0.6036, + "step": 6171 + }, + { + "epoch": 1.0707841776544067, + "grad_norm": 1.2219531536102295, + "learning_rate": 2.5021560815942777e-05, + "loss": 0.5428, + "step": 6172 + }, + { + "epoch": 1.0709576682859125, + "grad_norm": 0.8190534710884094, + "learning_rate": 2.5014965518106372e-05, + "loss": 0.6954, + "step": 6173 + }, + { + "epoch": 1.0711311589174184, + "grad_norm": 1.2299097776412964, + "learning_rate": 2.5008369638277382e-05, + "loss": 0.6816, + "step": 6174 + }, + { + "epoch": 1.0713046495489245, + "grad_norm": 0.7392492890357971, + "learning_rate": 2.500177317722126e-05, + "loss": 0.8015, + "step": 6175 + }, + { + "epoch": 1.0714781401804303, + "grad_norm": 1.947956919670105, + "learning_rate": 2.4995176135703533e-05, + "loss": 0.7837, + "step": 6176 + }, + { + "epoch": 1.0716516308119362, + "grad_norm": 0.910539984703064, + "learning_rate": 2.4988578514489797e-05, + "loss": 0.5887, + "step": 6177 + }, + { + "epoch": 1.071825121443442, + "grad_norm": 0.8339071869850159, + "learning_rate": 2.498198031434571e-05, + "loss": 0.75, + "step": 6178 + }, + { + "epoch": 1.0719986120749478, + "grad_norm": 0.8082987666130066, + "learning_rate": 2.4975381536037e-05, + "loss": 0.8115, + "step": 6179 + }, + { + "epoch": 1.072172102706454, + "grad_norm": 1.445878028869629, + "learning_rate": 2.496878218032947e-05, + "loss": 0.6334, + "step": 6180 + }, + { + "epoch": 1.0723455933379598, + "grad_norm": 0.9616848826408386, + "learning_rate": 2.4962182247988974e-05, + "loss": 0.7264, + "step": 6181 + }, + { + "epoch": 1.0725190839694656, + "grad_norm": 0.7805618047714233, + "learning_rate": 2.495558173978145e-05, + "loss": 0.7415, + "step": 6182 + }, + { + "epoch": 1.0726925746009714, + "grad_norm": 1.666275143623352, + "learning_rate": 2.494898065647289e-05, + "loss": 0.7217, + "step": 6183 + }, + { + "epoch": 1.0728660652324775, + "grad_norm": 0.8906477093696594, + "learning_rate": 2.494237899882935e-05, + "loss": 0.6356, + "step": 6184 + }, + { + "epoch": 1.0730395558639834, + "grad_norm": 0.9435046911239624, + "learning_rate": 2.4935776767616978e-05, + "loss": 0.5645, + "step": 6185 + }, + { + "epoch": 1.0732130464954892, + "grad_norm": 0.8139978051185608, + "learning_rate": 2.4929173963601958e-05, + "loss": 0.6613, + "step": 6186 + }, + { + "epoch": 1.073386537126995, + "grad_norm": 0.749391496181488, + "learning_rate": 2.4922570587550552e-05, + "loss": 0.7585, + "step": 6187 + }, + { + "epoch": 1.0735600277585011, + "grad_norm": 1.2135744094848633, + "learning_rate": 2.4915966640229098e-05, + "loss": 0.7664, + "step": 6188 + }, + { + "epoch": 1.073733518390007, + "grad_norm": 0.7901684045791626, + "learning_rate": 2.4909362122403984e-05, + "loss": 0.8647, + "step": 6189 + }, + { + "epoch": 1.0739070090215128, + "grad_norm": 1.0394105911254883, + "learning_rate": 2.4902757034841674e-05, + "loss": 0.6527, + "step": 6190 + }, + { + "epoch": 1.0740804996530187, + "grad_norm": 1.0079429149627686, + "learning_rate": 2.4896151378308706e-05, + "loss": 0.7316, + "step": 6191 + }, + { + "epoch": 1.0742539902845247, + "grad_norm": 0.9585371613502502, + "learning_rate": 2.4889545153571657e-05, + "loss": 0.6154, + "step": 6192 + }, + { + "epoch": 1.0744274809160306, + "grad_norm": 1.367793083190918, + "learning_rate": 2.48829383613972e-05, + "loss": 0.8188, + "step": 6193 + }, + { + "epoch": 1.0746009715475364, + "grad_norm": 1.343592643737793, + "learning_rate": 2.4876331002552055e-05, + "loss": 0.7554, + "step": 6194 + }, + { + "epoch": 1.0747744621790423, + "grad_norm": 1.1830205917358398, + "learning_rate": 2.486972307780301e-05, + "loss": 0.6454, + "step": 6195 + }, + { + "epoch": 1.0749479528105481, + "grad_norm": 0.8163788914680481, + "learning_rate": 2.4863114587916933e-05, + "loss": 0.5776, + "step": 6196 + }, + { + "epoch": 1.0751214434420542, + "grad_norm": 1.035078525543213, + "learning_rate": 2.485650553366074e-05, + "loss": 0.8696, + "step": 6197 + }, + { + "epoch": 1.07529493407356, + "grad_norm": 1.113258957862854, + "learning_rate": 2.484989591580142e-05, + "loss": 0.7292, + "step": 6198 + }, + { + "epoch": 1.0754684247050659, + "grad_norm": 1.0528130531311035, + "learning_rate": 2.484328573510603e-05, + "loss": 0.6566, + "step": 6199 + }, + { + "epoch": 1.075641915336572, + "grad_norm": 0.8905289173126221, + "learning_rate": 2.4836674992341684e-05, + "loss": 0.7382, + "step": 6200 + }, + { + "epoch": 1.0758154059680778, + "grad_norm": 0.907850980758667, + "learning_rate": 2.483006368827557e-05, + "loss": 0.7023, + "step": 6201 + }, + { + "epoch": 1.0759888965995836, + "grad_norm": 0.9017595052719116, + "learning_rate": 2.4823451823674943e-05, + "loss": 0.6733, + "step": 6202 + }, + { + "epoch": 1.0761623872310895, + "grad_norm": 1.0516083240509033, + "learning_rate": 2.4816839399307102e-05, + "loss": 0.7605, + "step": 6203 + }, + { + "epoch": 1.0763358778625953, + "grad_norm": 1.0763555765151978, + "learning_rate": 2.481022641593944e-05, + "loss": 0.5306, + "step": 6204 + }, + { + "epoch": 1.0765093684941014, + "grad_norm": 0.9949150085449219, + "learning_rate": 2.48036128743394e-05, + "loss": 0.6094, + "step": 6205 + }, + { + "epoch": 1.0766828591256072, + "grad_norm": 0.8883573412895203, + "learning_rate": 2.4796998775274482e-05, + "loss": 0.6765, + "step": 6206 + }, + { + "epoch": 1.076856349757113, + "grad_norm": 1.0383806228637695, + "learning_rate": 2.4790384119512275e-05, + "loss": 0.6744, + "step": 6207 + }, + { + "epoch": 1.077029840388619, + "grad_norm": 1.0247704982757568, + "learning_rate": 2.4783768907820403e-05, + "loss": 0.7732, + "step": 6208 + }, + { + "epoch": 1.077203331020125, + "grad_norm": 1.59627366065979, + "learning_rate": 2.4777153140966583e-05, + "loss": 0.6337, + "step": 6209 + }, + { + "epoch": 1.0773768216516308, + "grad_norm": 1.1517270803451538, + "learning_rate": 2.4770536819718562e-05, + "loss": 0.595, + "step": 6210 + }, + { + "epoch": 1.0775503122831367, + "grad_norm": 0.7707021832466125, + "learning_rate": 2.476391994484419e-05, + "loss": 0.8162, + "step": 6211 + }, + { + "epoch": 1.0777238029146425, + "grad_norm": 1.3106776475906372, + "learning_rate": 2.475730251711136e-05, + "loss": 0.6158, + "step": 6212 + }, + { + "epoch": 1.0778972935461486, + "grad_norm": 0.9525348544120789, + "learning_rate": 2.4750684537288024e-05, + "loss": 0.7966, + "step": 6213 + }, + { + "epoch": 1.0780707841776545, + "grad_norm": 1.0171763896942139, + "learning_rate": 2.4744066006142218e-05, + "loss": 0.6074, + "step": 6214 + }, + { + "epoch": 1.0782442748091603, + "grad_norm": 0.8631925582885742, + "learning_rate": 2.4737446924442025e-05, + "loss": 0.6484, + "step": 6215 + }, + { + "epoch": 1.0784177654406661, + "grad_norm": 1.0509843826293945, + "learning_rate": 2.4730827292955592e-05, + "loss": 0.6698, + "step": 6216 + }, + { + "epoch": 1.0785912560721722, + "grad_norm": 1.012939214706421, + "learning_rate": 2.472420711245114e-05, + "loss": 0.6599, + "step": 6217 + }, + { + "epoch": 1.078764746703678, + "grad_norm": 0.8967079520225525, + "learning_rate": 2.4717586383696947e-05, + "loss": 0.9614, + "step": 6218 + }, + { + "epoch": 1.078938237335184, + "grad_norm": 1.3350712060928345, + "learning_rate": 2.4710965107461354e-05, + "loss": 0.719, + "step": 6219 + }, + { + "epoch": 1.0791117279666897, + "grad_norm": 1.352610468864441, + "learning_rate": 2.470434328451278e-05, + "loss": 0.7211, + "step": 6220 + }, + { + "epoch": 1.0792852185981956, + "grad_norm": 0.7530908584594727, + "learning_rate": 2.469772091561968e-05, + "loss": 0.7991, + "step": 6221 + }, + { + "epoch": 1.0794587092297017, + "grad_norm": 0.9921525120735168, + "learning_rate": 2.4691098001550588e-05, + "loss": 0.7292, + "step": 6222 + }, + { + "epoch": 1.0796321998612075, + "grad_norm": 0.7464292645454407, + "learning_rate": 2.4684474543074116e-05, + "loss": 0.7551, + "step": 6223 + }, + { + "epoch": 1.0798056904927134, + "grad_norm": 0.9595939517021179, + "learning_rate": 2.4677850540958906e-05, + "loss": 0.7417, + "step": 6224 + }, + { + "epoch": 1.0799791811242192, + "grad_norm": 0.7740707993507385, + "learning_rate": 2.46712259959737e-05, + "loss": 0.7964, + "step": 6225 + }, + { + "epoch": 1.0801526717557253, + "grad_norm": 1.164500117301941, + "learning_rate": 2.4664600908887272e-05, + "loss": 0.7954, + "step": 6226 + }, + { + "epoch": 1.0803261623872311, + "grad_norm": 0.8895793557167053, + "learning_rate": 2.465797528046847e-05, + "loss": 0.6868, + "step": 6227 + }, + { + "epoch": 1.080499653018737, + "grad_norm": 0.9298650622367859, + "learning_rate": 2.4651349111486212e-05, + "loss": 0.5759, + "step": 6228 + }, + { + "epoch": 1.0806731436502428, + "grad_norm": 2.9539175033569336, + "learning_rate": 2.4644722402709467e-05, + "loss": 0.8503, + "step": 6229 + }, + { + "epoch": 1.0808466342817489, + "grad_norm": 1.8359984159469604, + "learning_rate": 2.4638095154907276e-05, + "loss": 0.6572, + "step": 6230 + }, + { + "epoch": 1.0810201249132547, + "grad_norm": 1.3150044679641724, + "learning_rate": 2.463146736884874e-05, + "loss": 0.7535, + "step": 6231 + }, + { + "epoch": 1.0811936155447606, + "grad_norm": 0.8676493167877197, + "learning_rate": 2.4624839045303014e-05, + "loss": 0.7651, + "step": 6232 + }, + { + "epoch": 1.0813671061762664, + "grad_norm": 1.1096277236938477, + "learning_rate": 2.4618210185039333e-05, + "loss": 0.6772, + "step": 6233 + }, + { + "epoch": 1.0815405968077725, + "grad_norm": 1.1102814674377441, + "learning_rate": 2.4611580788826973e-05, + "loss": 0.6052, + "step": 6234 + }, + { + "epoch": 1.0817140874392783, + "grad_norm": 0.7336384654045105, + "learning_rate": 2.4604950857435297e-05, + "loss": 0.7961, + "step": 6235 + }, + { + "epoch": 1.0818875780707842, + "grad_norm": 2.2028696537017822, + "learning_rate": 2.4598320391633702e-05, + "loss": 0.8318, + "step": 6236 + }, + { + "epoch": 1.08206106870229, + "grad_norm": 1.0769991874694824, + "learning_rate": 2.4591689392191667e-05, + "loss": 0.6436, + "step": 6237 + }, + { + "epoch": 1.0822345593337959, + "grad_norm": 0.9212601184844971, + "learning_rate": 2.4585057859878732e-05, + "loss": 0.6919, + "step": 6238 + }, + { + "epoch": 1.082408049965302, + "grad_norm": 0.9518373012542725, + "learning_rate": 2.4578425795464487e-05, + "loss": 0.7466, + "step": 6239 + }, + { + "epoch": 1.0825815405968078, + "grad_norm": 0.8198143243789673, + "learning_rate": 2.4571793199718593e-05, + "loss": 0.7194, + "step": 6240 + }, + { + "epoch": 1.0827550312283136, + "grad_norm": 0.9098920226097107, + "learning_rate": 2.4565160073410774e-05, + "loss": 0.6743, + "step": 6241 + }, + { + "epoch": 1.0829285218598195, + "grad_norm": 1.113472580909729, + "learning_rate": 2.4558526417310805e-05, + "loss": 0.5927, + "step": 6242 + }, + { + "epoch": 1.0831020124913255, + "grad_norm": 0.8821173310279846, + "learning_rate": 2.4551892232188535e-05, + "loss": 0.6261, + "step": 6243 + }, + { + "epoch": 1.0832755031228314, + "grad_norm": 1.0970388650894165, + "learning_rate": 2.4545257518813866e-05, + "loss": 0.678, + "step": 6244 + }, + { + "epoch": 1.0834489937543372, + "grad_norm": 1.7997934818267822, + "learning_rate": 2.453862227795677e-05, + "loss": 0.692, + "step": 6245 + }, + { + "epoch": 1.083622484385843, + "grad_norm": 0.94905024766922, + "learning_rate": 2.4531986510387268e-05, + "loss": 0.703, + "step": 6246 + }, + { + "epoch": 1.0837959750173491, + "grad_norm": 1.0244591236114502, + "learning_rate": 2.452535021687545e-05, + "loss": 0.7666, + "step": 6247 + }, + { + "epoch": 1.083969465648855, + "grad_norm": 0.7989615201950073, + "learning_rate": 2.4518713398191464e-05, + "loss": 0.7358, + "step": 6248 + }, + { + "epoch": 1.0841429562803608, + "grad_norm": 0.8999284505844116, + "learning_rate": 2.4512076055105527e-05, + "loss": 0.7217, + "step": 6249 + }, + { + "epoch": 1.0843164469118667, + "grad_norm": 0.9312961101531982, + "learning_rate": 2.45054381883879e-05, + "loss": 0.5973, + "step": 6250 + }, + { + "epoch": 1.0844899375433728, + "grad_norm": 1.3799471855163574, + "learning_rate": 2.4498799798808926e-05, + "loss": 0.7859, + "step": 6251 + }, + { + "epoch": 1.0846634281748786, + "grad_norm": 0.851758599281311, + "learning_rate": 2.4492160887138998e-05, + "loss": 0.6969, + "step": 6252 + }, + { + "epoch": 1.0848369188063844, + "grad_norm": 1.4421849250793457, + "learning_rate": 2.4485521454148558e-05, + "loss": 0.6804, + "step": 6253 + }, + { + "epoch": 1.0850104094378903, + "grad_norm": 0.8441764116287231, + "learning_rate": 2.447888150060813e-05, + "loss": 0.7798, + "step": 6254 + }, + { + "epoch": 1.0851839000693964, + "grad_norm": 0.9149577617645264, + "learning_rate": 2.4472241027288276e-05, + "loss": 0.7161, + "step": 6255 + }, + { + "epoch": 1.0853573907009022, + "grad_norm": 1.2399441003799438, + "learning_rate": 2.4465600034959654e-05, + "loss": 0.6616, + "step": 6256 + }, + { + "epoch": 1.085530881332408, + "grad_norm": 1.6665741205215454, + "learning_rate": 2.4458958524392937e-05, + "loss": 0.6779, + "step": 6257 + }, + { + "epoch": 1.085704371963914, + "grad_norm": 0.8756848573684692, + "learning_rate": 2.4452316496358885e-05, + "loss": 0.5878, + "step": 6258 + }, + { + "epoch": 1.08587786259542, + "grad_norm": 0.6656708717346191, + "learning_rate": 2.444567395162832e-05, + "loss": 0.7246, + "step": 6259 + }, + { + "epoch": 1.0860513532269258, + "grad_norm": 1.0137656927108765, + "learning_rate": 2.443903089097211e-05, + "loss": 0.682, + "step": 6260 + }, + { + "epoch": 1.0862248438584317, + "grad_norm": 0.7258992195129395, + "learning_rate": 2.4432387315161194e-05, + "loss": 0.7451, + "step": 6261 + }, + { + "epoch": 1.0863983344899375, + "grad_norm": 2.1581428050994873, + "learning_rate": 2.4425743224966567e-05, + "loss": 0.7767, + "step": 6262 + }, + { + "epoch": 1.0865718251214433, + "grad_norm": 0.7568233609199524, + "learning_rate": 2.4419098621159275e-05, + "loss": 0.7256, + "step": 6263 + }, + { + "epoch": 1.0867453157529494, + "grad_norm": 0.7976101636886597, + "learning_rate": 2.4412453504510447e-05, + "loss": 0.5752, + "step": 6264 + }, + { + "epoch": 1.0869188063844553, + "grad_norm": 0.9345763921737671, + "learning_rate": 2.4405807875791246e-05, + "loss": 0.7371, + "step": 6265 + }, + { + "epoch": 1.087092297015961, + "grad_norm": 0.6488348245620728, + "learning_rate": 2.43991617357729e-05, + "loss": 0.6373, + "step": 6266 + }, + { + "epoch": 1.087265787647467, + "grad_norm": 1.01821768283844, + "learning_rate": 2.4392515085226722e-05, + "loss": 0.6184, + "step": 6267 + }, + { + "epoch": 1.087439278278973, + "grad_norm": 0.9135203957557678, + "learning_rate": 2.4385867924924037e-05, + "loss": 0.7393, + "step": 6268 + }, + { + "epoch": 1.0876127689104789, + "grad_norm": 0.814350962638855, + "learning_rate": 2.4379220255636278e-05, + "loss": 0.8016, + "step": 6269 + }, + { + "epoch": 1.0877862595419847, + "grad_norm": 0.7199669480323792, + "learning_rate": 2.43725720781349e-05, + "loss": 0.7939, + "step": 6270 + }, + { + "epoch": 1.0879597501734906, + "grad_norm": 0.9696216583251953, + "learning_rate": 2.4365923393191443e-05, + "loss": 0.6501, + "step": 6271 + }, + { + "epoch": 1.0881332408049966, + "grad_norm": 1.1581882238388062, + "learning_rate": 2.4359274201577478e-05, + "loss": 0.5951, + "step": 6272 + }, + { + "epoch": 1.0883067314365025, + "grad_norm": 0.7994092106819153, + "learning_rate": 2.4352624504064672e-05, + "loss": 0.7324, + "step": 6273 + }, + { + "epoch": 1.0884802220680083, + "grad_norm": 1.6323617696762085, + "learning_rate": 2.4345974301424717e-05, + "loss": 0.6949, + "step": 6274 + }, + { + "epoch": 1.0886537126995142, + "grad_norm": 0.868382453918457, + "learning_rate": 2.433932359442938e-05, + "loss": 0.7026, + "step": 6275 + }, + { + "epoch": 1.0888272033310202, + "grad_norm": 1.4811197519302368, + "learning_rate": 2.433267238385048e-05, + "loss": 0.6621, + "step": 6276 + }, + { + "epoch": 1.089000693962526, + "grad_norm": 0.9039541482925415, + "learning_rate": 2.4326020670459912e-05, + "loss": 0.5789, + "step": 6277 + }, + { + "epoch": 1.089174184594032, + "grad_norm": 0.7657870054244995, + "learning_rate": 2.4319368455029598e-05, + "loss": 0.6611, + "step": 6278 + }, + { + "epoch": 1.0893476752255378, + "grad_norm": 1.0388113260269165, + "learning_rate": 2.4312715738331542e-05, + "loss": 0.5879, + "step": 6279 + }, + { + "epoch": 1.0895211658570436, + "grad_norm": 1.0058248043060303, + "learning_rate": 2.43060625211378e-05, + "loss": 0.7019, + "step": 6280 + }, + { + "epoch": 1.0896946564885497, + "grad_norm": 0.7304778695106506, + "learning_rate": 2.4299408804220485e-05, + "loss": 0.7006, + "step": 6281 + }, + { + "epoch": 1.0898681471200555, + "grad_norm": 1.0047492980957031, + "learning_rate": 2.4292754588351768e-05, + "loss": 0.5736, + "step": 6282 + }, + { + "epoch": 1.0900416377515614, + "grad_norm": 0.9248976707458496, + "learning_rate": 2.4286099874303876e-05, + "loss": 0.6807, + "step": 6283 + }, + { + "epoch": 1.0902151283830672, + "grad_norm": 0.8637429475784302, + "learning_rate": 2.42794446628491e-05, + "loss": 0.6899, + "step": 6284 + }, + { + "epoch": 1.0903886190145733, + "grad_norm": 1.1877309083938599, + "learning_rate": 2.4272788954759793e-05, + "loss": 0.6677, + "step": 6285 + }, + { + "epoch": 1.0905621096460791, + "grad_norm": 0.8602573275566101, + "learning_rate": 2.426613275080834e-05, + "loss": 0.5765, + "step": 6286 + }, + { + "epoch": 1.090735600277585, + "grad_norm": 1.044047474861145, + "learning_rate": 2.4259476051767213e-05, + "loss": 0.5868, + "step": 6287 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.1750353574752808, + "learning_rate": 2.4252818858408923e-05, + "loss": 0.5511, + "step": 6288 + }, + { + "epoch": 1.091082581540597, + "grad_norm": 1.0458310842514038, + "learning_rate": 2.4246161171506054e-05, + "loss": 0.7305, + "step": 6289 + }, + { + "epoch": 1.0912560721721027, + "grad_norm": 0.904930055141449, + "learning_rate": 2.4239502991831233e-05, + "loss": 0.5645, + "step": 6290 + }, + { + "epoch": 1.0914295628036086, + "grad_norm": 0.8289532661437988, + "learning_rate": 2.4232844320157146e-05, + "loss": 0.7957, + "step": 6291 + }, + { + "epoch": 1.0916030534351144, + "grad_norm": 0.8243095874786377, + "learning_rate": 2.4226185157256546e-05, + "loss": 0.8762, + "step": 6292 + }, + { + "epoch": 1.0917765440666205, + "grad_norm": 1.2541393041610718, + "learning_rate": 2.4219525503902234e-05, + "loss": 0.5454, + "step": 6293 + }, + { + "epoch": 1.0919500346981263, + "grad_norm": 1.3988404273986816, + "learning_rate": 2.421286536086707e-05, + "loss": 0.6643, + "step": 6294 + }, + { + "epoch": 1.0921235253296322, + "grad_norm": 1.5109177827835083, + "learning_rate": 2.4206204728923974e-05, + "loss": 0.6816, + "step": 6295 + }, + { + "epoch": 1.092297015961138, + "grad_norm": 1.5124131441116333, + "learning_rate": 2.4199543608845916e-05, + "loss": 0.6887, + "step": 6296 + }, + { + "epoch": 1.0924705065926439, + "grad_norm": 0.7373611330986023, + "learning_rate": 2.419288200140593e-05, + "loss": 0.5457, + "step": 6297 + }, + { + "epoch": 1.09264399722415, + "grad_norm": 0.7709951996803284, + "learning_rate": 2.4186219907377097e-05, + "loss": 0.6721, + "step": 6298 + }, + { + "epoch": 1.0928174878556558, + "grad_norm": 0.7761841416358948, + "learning_rate": 2.4179557327532574e-05, + "loss": 0.6677, + "step": 6299 + }, + { + "epoch": 1.0929909784871616, + "grad_norm": 0.7670193910598755, + "learning_rate": 2.4172894262645544e-05, + "loss": 0.677, + "step": 6300 + }, + { + "epoch": 1.0931644691186675, + "grad_norm": 0.9096062183380127, + "learning_rate": 2.4166230713489277e-05, + "loss": 0.6189, + "step": 6301 + }, + { + "epoch": 1.0933379597501736, + "grad_norm": 0.9048565030097961, + "learning_rate": 2.4159566680837086e-05, + "loss": 0.7684, + "step": 6302 + }, + { + "epoch": 1.0935114503816794, + "grad_norm": 0.6420848369598389, + "learning_rate": 2.415290216546233e-05, + "loss": 0.8716, + "step": 6303 + }, + { + "epoch": 1.0936849410131853, + "grad_norm": 0.8392890095710754, + "learning_rate": 2.414623716813844e-05, + "loss": 0.5623, + "step": 6304 + }, + { + "epoch": 1.093858431644691, + "grad_norm": 0.7947709560394287, + "learning_rate": 2.4139571689638893e-05, + "loss": 0.7025, + "step": 6305 + }, + { + "epoch": 1.0940319222761972, + "grad_norm": 1.1001137495040894, + "learning_rate": 2.413290573073723e-05, + "loss": 0.5776, + "step": 6306 + }, + { + "epoch": 1.094205412907703, + "grad_norm": 0.8945869207382202, + "learning_rate": 2.412623929220704e-05, + "loss": 0.7405, + "step": 6307 + }, + { + "epoch": 1.0943789035392089, + "grad_norm": 0.9172083139419556, + "learning_rate": 2.4119572374821968e-05, + "loss": 0.5688, + "step": 6308 + }, + { + "epoch": 1.0945523941707147, + "grad_norm": 0.7892106175422668, + "learning_rate": 2.411290497935573e-05, + "loss": 0.7703, + "step": 6309 + }, + { + "epoch": 1.0947258848022208, + "grad_norm": 1.4300426244735718, + "learning_rate": 2.4106237106582072e-05, + "loss": 0.6602, + "step": 6310 + }, + { + "epoch": 1.0948993754337266, + "grad_norm": 1.1403499841690063, + "learning_rate": 2.4099568757274812e-05, + "loss": 0.5651, + "step": 6311 + }, + { + "epoch": 1.0950728660652325, + "grad_norm": 0.8188315033912659, + "learning_rate": 2.4092899932207824e-05, + "loss": 0.6235, + "step": 6312 + }, + { + "epoch": 1.0952463566967383, + "grad_norm": 0.7609954476356506, + "learning_rate": 2.408623063215503e-05, + "loss": 0.671, + "step": 6313 + }, + { + "epoch": 1.0954198473282444, + "grad_norm": 0.8535282015800476, + "learning_rate": 2.4079560857890405e-05, + "loss": 0.7159, + "step": 6314 + }, + { + "epoch": 1.0955933379597502, + "grad_norm": 1.2411531209945679, + "learning_rate": 2.4072890610187997e-05, + "loss": 0.5814, + "step": 6315 + }, + { + "epoch": 1.095766828591256, + "grad_norm": 1.2875280380249023, + "learning_rate": 2.406621988982188e-05, + "loss": 0.6624, + "step": 6316 + }, + { + "epoch": 1.095940319222762, + "grad_norm": 1.1764861345291138, + "learning_rate": 2.405954869756621e-05, + "loss": 0.7534, + "step": 6317 + }, + { + "epoch": 1.096113809854268, + "grad_norm": 0.7732667326927185, + "learning_rate": 2.405287703419518e-05, + "loss": 0.6646, + "step": 6318 + }, + { + "epoch": 1.0962873004857738, + "grad_norm": 1.1853611469268799, + "learning_rate": 2.4046204900483052e-05, + "loss": 0.7046, + "step": 6319 + }, + { + "epoch": 1.0964607911172797, + "grad_norm": 1.4048949480056763, + "learning_rate": 2.4039532297204125e-05, + "loss": 0.5682, + "step": 6320 + }, + { + "epoch": 1.0966342817487855, + "grad_norm": 1.1131248474121094, + "learning_rate": 2.403285922513277e-05, + "loss": 0.5807, + "step": 6321 + }, + { + "epoch": 1.0968077723802914, + "grad_norm": 0.8301013112068176, + "learning_rate": 2.4026185685043405e-05, + "loss": 0.7324, + "step": 6322 + }, + { + "epoch": 1.0969812630117974, + "grad_norm": 1.0868568420410156, + "learning_rate": 2.40195116777105e-05, + "loss": 0.8147, + "step": 6323 + }, + { + "epoch": 1.0971547536433033, + "grad_norm": 1.1144930124282837, + "learning_rate": 2.4012837203908582e-05, + "loss": 0.7502, + "step": 6324 + }, + { + "epoch": 1.0973282442748091, + "grad_norm": 0.9322969913482666, + "learning_rate": 2.4006162264412227e-05, + "loss": 0.7224, + "step": 6325 + }, + { + "epoch": 1.097501734906315, + "grad_norm": 1.1206276416778564, + "learning_rate": 2.3999486859996073e-05, + "loss": 0.5847, + "step": 6326 + }, + { + "epoch": 1.097675225537821, + "grad_norm": 1.9222625494003296, + "learning_rate": 2.3992810991434815e-05, + "loss": 0.7688, + "step": 6327 + }, + { + "epoch": 1.0978487161693269, + "grad_norm": 0.8205929398536682, + "learning_rate": 2.3986134659503187e-05, + "loss": 0.676, + "step": 6328 + }, + { + "epoch": 1.0980222068008327, + "grad_norm": 0.8067792654037476, + "learning_rate": 2.397945786497598e-05, + "loss": 0.6193, + "step": 6329 + }, + { + "epoch": 1.0981956974323386, + "grad_norm": 0.6618004441261292, + "learning_rate": 2.3972780608628057e-05, + "loss": 0.6722, + "step": 6330 + }, + { + "epoch": 1.0983691880638446, + "grad_norm": 0.8711894154548645, + "learning_rate": 2.3966102891234314e-05, + "loss": 0.6968, + "step": 6331 + }, + { + "epoch": 1.0985426786953505, + "grad_norm": 0.7417333126068115, + "learning_rate": 2.3959424713569708e-05, + "loss": 0.7677, + "step": 6332 + }, + { + "epoch": 1.0987161693268563, + "grad_norm": 0.8832112550735474, + "learning_rate": 2.395274607640925e-05, + "loss": 0.6978, + "step": 6333 + }, + { + "epoch": 1.0988896599583622, + "grad_norm": 1.0390900373458862, + "learning_rate": 2.3946066980528e-05, + "loss": 0.7725, + "step": 6334 + }, + { + "epoch": 1.0990631505898683, + "grad_norm": 0.8636231422424316, + "learning_rate": 2.393938742670109e-05, + "loss": 0.658, + "step": 6335 + }, + { + "epoch": 1.099236641221374, + "grad_norm": 0.876723051071167, + "learning_rate": 2.3932707415703673e-05, + "loss": 0.7274, + "step": 6336 + }, + { + "epoch": 1.09941013185288, + "grad_norm": 1.4160152673721313, + "learning_rate": 2.3926026948310975e-05, + "loss": 0.8669, + "step": 6337 + }, + { + "epoch": 1.0995836224843858, + "grad_norm": 1.2533090114593506, + "learning_rate": 2.391934602529828e-05, + "loss": 0.7443, + "step": 6338 + }, + { + "epoch": 1.0997571131158916, + "grad_norm": 1.231245994567871, + "learning_rate": 2.3912664647440903e-05, + "loss": 0.7708, + "step": 6339 + }, + { + "epoch": 1.0999306037473977, + "grad_norm": 0.9831942319869995, + "learning_rate": 2.3905982815514243e-05, + "loss": 0.7412, + "step": 6340 + }, + { + "epoch": 1.1001040943789036, + "grad_norm": 1.5691592693328857, + "learning_rate": 2.3899300530293728e-05, + "loss": 0.6266, + "step": 6341 + }, + { + "epoch": 1.1002775850104094, + "grad_norm": 0.8700266480445862, + "learning_rate": 2.3892617792554833e-05, + "loss": 0.6504, + "step": 6342 + }, + { + "epoch": 1.1004510756419152, + "grad_norm": 0.8784376382827759, + "learning_rate": 2.3885934603073117e-05, + "loss": 0.845, + "step": 6343 + }, + { + "epoch": 1.1006245662734213, + "grad_norm": 1.1687365770339966, + "learning_rate": 2.3879250962624152e-05, + "loss": 0.6704, + "step": 6344 + }, + { + "epoch": 1.1007980569049272, + "grad_norm": 1.0510494709014893, + "learning_rate": 2.38725668719836e-05, + "loss": 0.6719, + "step": 6345 + }, + { + "epoch": 1.100971547536433, + "grad_norm": 0.8505674600601196, + "learning_rate": 2.386588233192715e-05, + "loss": 0.6873, + "step": 6346 + }, + { + "epoch": 1.1011450381679388, + "grad_norm": 0.9000505208969116, + "learning_rate": 2.3859197343230546e-05, + "loss": 0.6858, + "step": 6347 + }, + { + "epoch": 1.101318528799445, + "grad_norm": 0.9991929531097412, + "learning_rate": 2.38525119066696e-05, + "loss": 0.7418, + "step": 6348 + }, + { + "epoch": 1.1014920194309508, + "grad_norm": 0.774596631526947, + "learning_rate": 2.3845826023020156e-05, + "loss": 0.7383, + "step": 6349 + }, + { + "epoch": 1.1016655100624566, + "grad_norm": 1.4849753379821777, + "learning_rate": 2.3839139693058116e-05, + "loss": 0.7498, + "step": 6350 + }, + { + "epoch": 1.1018390006939625, + "grad_norm": 1.0282248258590698, + "learning_rate": 2.3832452917559446e-05, + "loss": 0.7273, + "step": 6351 + }, + { + "epoch": 1.1020124913254685, + "grad_norm": 1.390684962272644, + "learning_rate": 2.382576569730015e-05, + "loss": 0.6107, + "step": 6352 + }, + { + "epoch": 1.1021859819569744, + "grad_norm": 1.0211693048477173, + "learning_rate": 2.3819078033056284e-05, + "loss": 0.7136, + "step": 6353 + }, + { + "epoch": 1.1023594725884802, + "grad_norm": 1.5367127656936646, + "learning_rate": 2.3812389925603963e-05, + "loss": 0.61, + "step": 6354 + }, + { + "epoch": 1.102532963219986, + "grad_norm": 0.8522369861602783, + "learning_rate": 2.380570137571935e-05, + "loss": 0.7107, + "step": 6355 + }, + { + "epoch": 1.102706453851492, + "grad_norm": 1.1042028665542603, + "learning_rate": 2.3799012384178654e-05, + "loss": 0.5826, + "step": 6356 + }, + { + "epoch": 1.102879944482998, + "grad_norm": 0.8942098021507263, + "learning_rate": 2.3792322951758152e-05, + "loss": 0.6537, + "step": 6357 + }, + { + "epoch": 1.1030534351145038, + "grad_norm": 1.8127554655075073, + "learning_rate": 2.3785633079234144e-05, + "loss": 0.7012, + "step": 6358 + }, + { + "epoch": 1.1032269257460097, + "grad_norm": 0.8503397107124329, + "learning_rate": 2.3778942767383012e-05, + "loss": 0.8137, + "step": 6359 + }, + { + "epoch": 1.1034004163775155, + "grad_norm": 1.0427448749542236, + "learning_rate": 2.377225201698117e-05, + "loss": 0.6366, + "step": 6360 + }, + { + "epoch": 1.1035739070090216, + "grad_norm": 1.4795600175857544, + "learning_rate": 2.3765560828805075e-05, + "loss": 0.7117, + "step": 6361 + }, + { + "epoch": 1.1037473976405274, + "grad_norm": 1.7720617055892944, + "learning_rate": 2.3758869203631266e-05, + "loss": 0.629, + "step": 6362 + }, + { + "epoch": 1.1039208882720333, + "grad_norm": 1.2639597654342651, + "learning_rate": 2.3752177142236303e-05, + "loss": 0.6641, + "step": 6363 + }, + { + "epoch": 1.1040943789035391, + "grad_norm": 0.8957327604293823, + "learning_rate": 2.3745484645396816e-05, + "loss": 0.5879, + "step": 6364 + }, + { + "epoch": 1.1042678695350452, + "grad_norm": 0.7454932332038879, + "learning_rate": 2.3738791713889467e-05, + "loss": 0.7701, + "step": 6365 + }, + { + "epoch": 1.104441360166551, + "grad_norm": 1.826181411743164, + "learning_rate": 2.373209834849098e-05, + "loss": 0.8274, + "step": 6366 + }, + { + "epoch": 1.1046148507980569, + "grad_norm": 0.6872713565826416, + "learning_rate": 2.3725404549978143e-05, + "loss": 0.802, + "step": 6367 + }, + { + "epoch": 1.1047883414295627, + "grad_norm": 0.9346470832824707, + "learning_rate": 2.3718710319127755e-05, + "loss": 0.6171, + "step": 6368 + }, + { + "epoch": 1.1049618320610688, + "grad_norm": 1.4122846126556396, + "learning_rate": 2.3712015656716703e-05, + "loss": 0.6755, + "step": 6369 + }, + { + "epoch": 1.1051353226925746, + "grad_norm": 1.4891350269317627, + "learning_rate": 2.370532056352191e-05, + "loss": 0.5697, + "step": 6370 + }, + { + "epoch": 1.1053088133240805, + "grad_norm": 0.9756210446357727, + "learning_rate": 2.3698625040320346e-05, + "loss": 0.5995, + "step": 6371 + }, + { + "epoch": 1.1054823039555863, + "grad_norm": 1.1148258447647095, + "learning_rate": 2.3691929087889042e-05, + "loss": 0.704, + "step": 6372 + }, + { + "epoch": 1.1056557945870924, + "grad_norm": 0.8400816321372986, + "learning_rate": 2.3685232707005064e-05, + "loss": 0.5862, + "step": 6373 + }, + { + "epoch": 1.1058292852185982, + "grad_norm": 1.3582990169525146, + "learning_rate": 2.3678535898445533e-05, + "loss": 0.6614, + "step": 6374 + }, + { + "epoch": 1.106002775850104, + "grad_norm": 0.9388094544410706, + "learning_rate": 2.367183866298763e-05, + "loss": 0.7798, + "step": 6375 + }, + { + "epoch": 1.10617626648161, + "grad_norm": 1.2814013957977295, + "learning_rate": 2.3665141001408562e-05, + "loss": 0.7407, + "step": 6376 + }, + { + "epoch": 1.106349757113116, + "grad_norm": 0.8755448460578918, + "learning_rate": 2.365844291448562e-05, + "loss": 0.7473, + "step": 6377 + }, + { + "epoch": 1.1065232477446219, + "grad_norm": 0.9785711169242859, + "learning_rate": 2.3651744402996114e-05, + "loss": 0.6685, + "step": 6378 + }, + { + "epoch": 1.1066967383761277, + "grad_norm": 0.7452518939971924, + "learning_rate": 2.3645045467717405e-05, + "loss": 0.7258, + "step": 6379 + }, + { + "epoch": 1.1068702290076335, + "grad_norm": 0.849071741104126, + "learning_rate": 2.3638346109426932e-05, + "loss": 0.6826, + "step": 6380 + }, + { + "epoch": 1.1070437196391394, + "grad_norm": 1.5840638875961304, + "learning_rate": 2.3631646328902153e-05, + "loss": 0.5785, + "step": 6381 + }, + { + "epoch": 1.1072172102706455, + "grad_norm": 1.825454831123352, + "learning_rate": 2.362494612692058e-05, + "loss": 0.5759, + "step": 6382 + }, + { + "epoch": 1.1073907009021513, + "grad_norm": 0.9227408170700073, + "learning_rate": 2.361824550425979e-05, + "loss": 0.5566, + "step": 6383 + }, + { + "epoch": 1.1075641915336571, + "grad_norm": 0.8869020938873291, + "learning_rate": 2.361154446169739e-05, + "loss": 0.5985, + "step": 6384 + }, + { + "epoch": 1.107737682165163, + "grad_norm": 1.1362404823303223, + "learning_rate": 2.360484300001105e-05, + "loss": 0.7014, + "step": 6385 + }, + { + "epoch": 1.107911172796669, + "grad_norm": 2.439845561981201, + "learning_rate": 2.3598141119978482e-05, + "loss": 0.6667, + "step": 6386 + }, + { + "epoch": 1.108084663428175, + "grad_norm": 1.3233968019485474, + "learning_rate": 2.3591438822377434e-05, + "loss": 0.6317, + "step": 6387 + }, + { + "epoch": 1.1082581540596808, + "grad_norm": 0.7024808526039124, + "learning_rate": 2.3584736107985737e-05, + "loss": 0.7729, + "step": 6388 + }, + { + "epoch": 1.1084316446911866, + "grad_norm": 0.937938392162323, + "learning_rate": 2.3578032977581234e-05, + "loss": 0.6361, + "step": 6389 + }, + { + "epoch": 1.1086051353226927, + "grad_norm": 0.9426661133766174, + "learning_rate": 2.357132943194183e-05, + "loss": 0.5669, + "step": 6390 + }, + { + "epoch": 1.1087786259541985, + "grad_norm": 0.8811507821083069, + "learning_rate": 2.356462547184549e-05, + "loss": 0.7512, + "step": 6391 + }, + { + "epoch": 1.1089521165857044, + "grad_norm": 1.1587499380111694, + "learning_rate": 2.35579210980702e-05, + "loss": 0.6792, + "step": 6392 + }, + { + "epoch": 1.1091256072172102, + "grad_norm": 1.3745578527450562, + "learning_rate": 2.355121631139403e-05, + "loss": 0.6342, + "step": 6393 + }, + { + "epoch": 1.1092990978487163, + "grad_norm": 1.1952487230300903, + "learning_rate": 2.3544511112595068e-05, + "loss": 0.5627, + "step": 6394 + }, + { + "epoch": 1.1094725884802221, + "grad_norm": 1.025934100151062, + "learning_rate": 2.353780550245146e-05, + "loss": 0.6779, + "step": 6395 + }, + { + "epoch": 1.109646079111728, + "grad_norm": 0.7619770765304565, + "learning_rate": 2.3531099481741403e-05, + "loss": 0.6742, + "step": 6396 + }, + { + "epoch": 1.1098195697432338, + "grad_norm": 0.7683534026145935, + "learning_rate": 2.352439305124313e-05, + "loss": 0.6584, + "step": 6397 + }, + { + "epoch": 1.1099930603747397, + "grad_norm": 1.091141700744629, + "learning_rate": 2.351768621173495e-05, + "loss": 0.615, + "step": 6398 + }, + { + "epoch": 1.1101665510062457, + "grad_norm": 0.7419967651367188, + "learning_rate": 2.3510978963995176e-05, + "loss": 0.8015, + "step": 6399 + }, + { + "epoch": 1.1103400416377516, + "grad_norm": 0.8811755776405334, + "learning_rate": 2.3504271308802204e-05, + "loss": 0.6335, + "step": 6400 + }, + { + "epoch": 1.1105135322692574, + "grad_norm": 1.7310471534729004, + "learning_rate": 2.3497563246934464e-05, + "loss": 0.8757, + "step": 6401 + }, + { + "epoch": 1.1106870229007633, + "grad_norm": 0.7779544591903687, + "learning_rate": 2.3490854779170436e-05, + "loss": 0.8147, + "step": 6402 + }, + { + "epoch": 1.1108605135322693, + "grad_norm": 0.8219868540763855, + "learning_rate": 2.348414590628864e-05, + "loss": 0.6962, + "step": 6403 + }, + { + "epoch": 1.1110340041637752, + "grad_norm": 1.5568569898605347, + "learning_rate": 2.347743662906765e-05, + "loss": 0.5522, + "step": 6404 + }, + { + "epoch": 1.111207494795281, + "grad_norm": 2.099905490875244, + "learning_rate": 2.347072694828609e-05, + "loss": 0.5543, + "step": 6405 + }, + { + "epoch": 1.1113809854267869, + "grad_norm": 0.8699040412902832, + "learning_rate": 2.3464016864722625e-05, + "loss": 0.6384, + "step": 6406 + }, + { + "epoch": 1.111554476058293, + "grad_norm": 1.1191864013671875, + "learning_rate": 2.3457306379155965e-05, + "loss": 0.7222, + "step": 6407 + }, + { + "epoch": 1.1117279666897988, + "grad_norm": 0.8614413738250732, + "learning_rate": 2.345059549236487e-05, + "loss": 0.5731, + "step": 6408 + }, + { + "epoch": 1.1119014573213046, + "grad_norm": 1.2872446775436401, + "learning_rate": 2.3443884205128148e-05, + "loss": 0.6108, + "step": 6409 + }, + { + "epoch": 1.1120749479528105, + "grad_norm": 0.901212215423584, + "learning_rate": 2.343717251822465e-05, + "loss": 0.6608, + "step": 6410 + }, + { + "epoch": 1.1122484385843165, + "grad_norm": 0.9153985977172852, + "learning_rate": 2.343046043243328e-05, + "loss": 0.712, + "step": 6411 + }, + { + "epoch": 1.1124219292158224, + "grad_norm": 0.9375565648078918, + "learning_rate": 2.3423747948532976e-05, + "loss": 0.6932, + "step": 6412 + }, + { + "epoch": 1.1125954198473282, + "grad_norm": 0.8394069671630859, + "learning_rate": 2.3417035067302733e-05, + "loss": 0.6584, + "step": 6413 + }, + { + "epoch": 1.112768910478834, + "grad_norm": 0.7831284403800964, + "learning_rate": 2.341032178952159e-05, + "loss": 0.6567, + "step": 6414 + }, + { + "epoch": 1.11294240111034, + "grad_norm": 1.0655075311660767, + "learning_rate": 2.340360811596863e-05, + "loss": 0.6975, + "step": 6415 + }, + { + "epoch": 1.113115891741846, + "grad_norm": 0.8313942551612854, + "learning_rate": 2.339689404742298e-05, + "loss": 0.6979, + "step": 6416 + }, + { + "epoch": 1.1132893823733518, + "grad_norm": 1.3177754878997803, + "learning_rate": 2.3390179584663815e-05, + "loss": 0.7369, + "step": 6417 + }, + { + "epoch": 1.1134628730048577, + "grad_norm": 1.0837479829788208, + "learning_rate": 2.338346472847037e-05, + "loss": 0.6818, + "step": 6418 + }, + { + "epoch": 1.1136363636363635, + "grad_norm": 0.8832026124000549, + "learning_rate": 2.3376749479621886e-05, + "loss": 0.6366, + "step": 6419 + }, + { + "epoch": 1.1138098542678696, + "grad_norm": 3.499490976333618, + "learning_rate": 2.3370033838897702e-05, + "loss": 0.5474, + "step": 6420 + }, + { + "epoch": 1.1139833448993754, + "grad_norm": 0.843454122543335, + "learning_rate": 2.3363317807077157e-05, + "loss": 0.6971, + "step": 6421 + }, + { + "epoch": 1.1141568355308813, + "grad_norm": 1.5543310642242432, + "learning_rate": 2.3356601384939665e-05, + "loss": 0.7032, + "step": 6422 + }, + { + "epoch": 1.1143303261623871, + "grad_norm": 0.9629480242729187, + "learning_rate": 2.3349884573264673e-05, + "loss": 0.7021, + "step": 6423 + }, + { + "epoch": 1.1145038167938932, + "grad_norm": 0.7708274126052856, + "learning_rate": 2.3343167372831665e-05, + "loss": 0.6084, + "step": 6424 + }, + { + "epoch": 1.114677307425399, + "grad_norm": 1.041024923324585, + "learning_rate": 2.3336449784420197e-05, + "loss": 0.7021, + "step": 6425 + }, + { + "epoch": 1.114850798056905, + "grad_norm": 0.9115402102470398, + "learning_rate": 2.3329731808809836e-05, + "loss": 0.6155, + "step": 6426 + }, + { + "epoch": 1.1150242886884107, + "grad_norm": 0.9475339651107788, + "learning_rate": 2.3323013446780226e-05, + "loss": 0.6115, + "step": 6427 + }, + { + "epoch": 1.1151977793199168, + "grad_norm": 0.9986066818237305, + "learning_rate": 2.331629469911103e-05, + "loss": 0.6653, + "step": 6428 + }, + { + "epoch": 1.1153712699514227, + "grad_norm": 0.8616572022438049, + "learning_rate": 2.3309575566581968e-05, + "loss": 0.6152, + "step": 6429 + }, + { + "epoch": 1.1155447605829285, + "grad_norm": 0.8340502977371216, + "learning_rate": 2.330285604997281e-05, + "loss": 0.7468, + "step": 6430 + }, + { + "epoch": 1.1157182512144344, + "grad_norm": 0.894149124622345, + "learning_rate": 2.329613615006336e-05, + "loss": 0.6685, + "step": 6431 + }, + { + "epoch": 1.1158917418459404, + "grad_norm": 0.9897655248641968, + "learning_rate": 2.328941586763346e-05, + "loss": 0.6337, + "step": 6432 + }, + { + "epoch": 1.1160652324774463, + "grad_norm": 0.8582626581192017, + "learning_rate": 2.3282695203463022e-05, + "loss": 0.774, + "step": 6433 + }, + { + "epoch": 1.1162387231089521, + "grad_norm": 1.1655590534210205, + "learning_rate": 2.3275974158331977e-05, + "loss": 0.7219, + "step": 6434 + }, + { + "epoch": 1.116412213740458, + "grad_norm": 0.805227518081665, + "learning_rate": 2.326925273302032e-05, + "loss": 0.6992, + "step": 6435 + }, + { + "epoch": 1.116585704371964, + "grad_norm": 4.722814083099365, + "learning_rate": 2.3262530928308068e-05, + "loss": 0.5955, + "step": 6436 + }, + { + "epoch": 1.1167591950034699, + "grad_norm": 0.9515796303749084, + "learning_rate": 2.32558087449753e-05, + "loss": 0.7262, + "step": 6437 + }, + { + "epoch": 1.1169326856349757, + "grad_norm": 1.139648675918579, + "learning_rate": 2.3249086183802137e-05, + "loss": 0.6428, + "step": 6438 + }, + { + "epoch": 1.1171061762664816, + "grad_norm": 0.8903799653053284, + "learning_rate": 2.324236324556873e-05, + "loss": 0.7832, + "step": 6439 + }, + { + "epoch": 1.1172796668979874, + "grad_norm": 0.7936215996742249, + "learning_rate": 2.32356399310553e-05, + "loss": 0.7285, + "step": 6440 + }, + { + "epoch": 1.1174531575294935, + "grad_norm": 1.0543421506881714, + "learning_rate": 2.3228916241042078e-05, + "loss": 0.6465, + "step": 6441 + }, + { + "epoch": 1.1176266481609993, + "grad_norm": 0.8641033172607422, + "learning_rate": 2.3222192176309367e-05, + "loss": 0.743, + "step": 6442 + }, + { + "epoch": 1.1178001387925052, + "grad_norm": 1.1386210918426514, + "learning_rate": 2.3215467737637498e-05, + "loss": 0.5649, + "step": 6443 + }, + { + "epoch": 1.117973629424011, + "grad_norm": 1.0614453554153442, + "learning_rate": 2.320874292580685e-05, + "loss": 0.8291, + "step": 6444 + }, + { + "epoch": 1.118147120055517, + "grad_norm": 0.8073827028274536, + "learning_rate": 2.320201774159785e-05, + "loss": 0.7886, + "step": 6445 + }, + { + "epoch": 1.118320610687023, + "grad_norm": 0.9970085024833679, + "learning_rate": 2.3195292185790957e-05, + "loss": 0.6982, + "step": 6446 + }, + { + "epoch": 1.1184941013185288, + "grad_norm": 1.2892099618911743, + "learning_rate": 2.318856625916668e-05, + "loss": 0.7068, + "step": 6447 + }, + { + "epoch": 1.1186675919500346, + "grad_norm": 1.4790650606155396, + "learning_rate": 2.318183996250558e-05, + "loss": 0.5231, + "step": 6448 + }, + { + "epoch": 1.1188410825815407, + "grad_norm": 0.7768178582191467, + "learning_rate": 2.3175113296588244e-05, + "loss": 0.8242, + "step": 6449 + }, + { + "epoch": 1.1190145732130465, + "grad_norm": 0.8919392228126526, + "learning_rate": 2.3168386262195307e-05, + "loss": 0.7126, + "step": 6450 + }, + { + "epoch": 1.1191880638445524, + "grad_norm": 0.8571761846542358, + "learning_rate": 2.3161658860107457e-05, + "loss": 0.7617, + "step": 6451 + }, + { + "epoch": 1.1193615544760582, + "grad_norm": 1.1411502361297607, + "learning_rate": 2.315493109110541e-05, + "loss": 0.668, + "step": 6452 + }, + { + "epoch": 1.1195350451075643, + "grad_norm": 4.71232271194458, + "learning_rate": 2.314820295596993e-05, + "loss": 0.6102, + "step": 6453 + }, + { + "epoch": 1.1197085357390701, + "grad_norm": 0.9350519776344299, + "learning_rate": 2.314147445548183e-05, + "loss": 0.7064, + "step": 6454 + }, + { + "epoch": 1.119882026370576, + "grad_norm": 0.9818018078804016, + "learning_rate": 2.313474559042196e-05, + "loss": 0.7614, + "step": 6455 + }, + { + "epoch": 1.1200555170020818, + "grad_norm": 0.7869362235069275, + "learning_rate": 2.3128016361571213e-05, + "loss": 0.7421, + "step": 6456 + }, + { + "epoch": 1.1202290076335877, + "grad_norm": 0.8836187720298767, + "learning_rate": 2.312128676971052e-05, + "loss": 0.6628, + "step": 6457 + }, + { + "epoch": 1.1204024982650937, + "grad_norm": 0.6679983139038086, + "learning_rate": 2.3114556815620863e-05, + "loss": 0.7668, + "step": 6458 + }, + { + "epoch": 1.1205759888965996, + "grad_norm": 2.854456901550293, + "learning_rate": 2.310782650008326e-05, + "loss": 0.8115, + "step": 6459 + }, + { + "epoch": 1.1207494795281054, + "grad_norm": 0.8670933842658997, + "learning_rate": 2.3101095823878764e-05, + "loss": 0.7361, + "step": 6460 + }, + { + "epoch": 1.1209229701596113, + "grad_norm": 0.7189038991928101, + "learning_rate": 2.3094364787788487e-05, + "loss": 0.8293, + "step": 6461 + }, + { + "epoch": 1.1210964607911174, + "grad_norm": 0.7957229614257812, + "learning_rate": 2.308763339259357e-05, + "loss": 0.8105, + "step": 6462 + }, + { + "epoch": 1.1212699514226232, + "grad_norm": 0.71563720703125, + "learning_rate": 2.30809016390752e-05, + "loss": 0.9177, + "step": 6463 + }, + { + "epoch": 1.121443442054129, + "grad_norm": 0.8233482241630554, + "learning_rate": 2.3074169528014605e-05, + "loss": 0.7736, + "step": 6464 + }, + { + "epoch": 1.121616932685635, + "grad_norm": 2.623896598815918, + "learning_rate": 2.3067437060193055e-05, + "loss": 0.6809, + "step": 6465 + }, + { + "epoch": 1.121790423317141, + "grad_norm": 1.0637873411178589, + "learning_rate": 2.306070423639186e-05, + "loss": 0.6682, + "step": 6466 + }, + { + "epoch": 1.1219639139486468, + "grad_norm": 0.8225600719451904, + "learning_rate": 2.3053971057392368e-05, + "loss": 0.7791, + "step": 6467 + }, + { + "epoch": 1.1221374045801527, + "grad_norm": 1.7840458154678345, + "learning_rate": 2.3047237523975984e-05, + "loss": 0.7045, + "step": 6468 + }, + { + "epoch": 1.1223108952116585, + "grad_norm": 0.7688014507293701, + "learning_rate": 2.3040503636924126e-05, + "loss": 0.7822, + "step": 6469 + }, + { + "epoch": 1.1224843858431646, + "grad_norm": 0.8458659052848816, + "learning_rate": 2.3033769397018286e-05, + "loss": 0.8494, + "step": 6470 + }, + { + "epoch": 1.1226578764746704, + "grad_norm": 0.8628953099250793, + "learning_rate": 2.3027034805039965e-05, + "loss": 0.5818, + "step": 6471 + }, + { + "epoch": 1.1228313671061763, + "grad_norm": 0.7793022990226746, + "learning_rate": 2.3020299861770732e-05, + "loss": 0.6899, + "step": 6472 + }, + { + "epoch": 1.123004857737682, + "grad_norm": 0.948785126209259, + "learning_rate": 2.3013564567992184e-05, + "loss": 0.637, + "step": 6473 + }, + { + "epoch": 1.123178348369188, + "grad_norm": 1.003341794013977, + "learning_rate": 2.300682892448595e-05, + "loss": 0.6792, + "step": 6474 + }, + { + "epoch": 1.123351839000694, + "grad_norm": 1.0351016521453857, + "learning_rate": 2.3000092932033718e-05, + "loss": 0.7438, + "step": 6475 + }, + { + "epoch": 1.1235253296321999, + "grad_norm": 1.0819847583770752, + "learning_rate": 2.2993356591417203e-05, + "loss": 0.7191, + "step": 6476 + }, + { + "epoch": 1.1236988202637057, + "grad_norm": 1.4824042320251465, + "learning_rate": 2.2986619903418172e-05, + "loss": 0.5648, + "step": 6477 + }, + { + "epoch": 1.1238723108952116, + "grad_norm": 1.0801019668579102, + "learning_rate": 2.2979882868818422e-05, + "loss": 0.7949, + "step": 6478 + }, + { + "epoch": 1.1240458015267176, + "grad_norm": 1.3666574954986572, + "learning_rate": 2.2973145488399792e-05, + "loss": 0.6086, + "step": 6479 + }, + { + "epoch": 1.1242192921582235, + "grad_norm": 0.8624630570411682, + "learning_rate": 2.296640776294416e-05, + "loss": 0.7319, + "step": 6480 + }, + { + "epoch": 1.1243927827897293, + "grad_norm": 1.235365629196167, + "learning_rate": 2.2959669693233453e-05, + "loss": 0.9231, + "step": 6481 + }, + { + "epoch": 1.1245662734212352, + "grad_norm": 0.7404849529266357, + "learning_rate": 2.2952931280049628e-05, + "loss": 0.7999, + "step": 6482 + }, + { + "epoch": 1.1247397640527412, + "grad_norm": 1.1630373001098633, + "learning_rate": 2.294619252417469e-05, + "loss": 0.6825, + "step": 6483 + }, + { + "epoch": 1.124913254684247, + "grad_norm": 0.8648441433906555, + "learning_rate": 2.293945342639067e-05, + "loss": 0.6429, + "step": 6484 + }, + { + "epoch": 1.125086745315753, + "grad_norm": 0.9052178859710693, + "learning_rate": 2.2932713987479664e-05, + "loss": 0.7708, + "step": 6485 + }, + { + "epoch": 1.1252602359472588, + "grad_norm": 1.3946322202682495, + "learning_rate": 2.2925974208223778e-05, + "loss": 0.7577, + "step": 6486 + }, + { + "epoch": 1.1254337265787648, + "grad_norm": 0.9525977373123169, + "learning_rate": 2.2919234089405173e-05, + "loss": 0.6991, + "step": 6487 + }, + { + "epoch": 1.1256072172102707, + "grad_norm": 0.8492418527603149, + "learning_rate": 2.2912493631806055e-05, + "loss": 0.6847, + "step": 6488 + }, + { + "epoch": 1.1257807078417765, + "grad_norm": 0.9812506437301636, + "learning_rate": 2.290575283620865e-05, + "loss": 0.5612, + "step": 6489 + }, + { + "epoch": 1.1259541984732824, + "grad_norm": 0.9823220372200012, + "learning_rate": 2.2899011703395254e-05, + "loss": 0.6948, + "step": 6490 + }, + { + "epoch": 1.1261276891047882, + "grad_norm": 0.8331122994422913, + "learning_rate": 2.289227023414816e-05, + "loss": 0.7098, + "step": 6491 + }, + { + "epoch": 1.1263011797362943, + "grad_norm": 1.3848315477371216, + "learning_rate": 2.288552842924974e-05, + "loss": 0.7456, + "step": 6492 + }, + { + "epoch": 1.1264746703678001, + "grad_norm": 4.823281764984131, + "learning_rate": 2.2878786289482384e-05, + "loss": 0.6421, + "step": 6493 + }, + { + "epoch": 1.126648160999306, + "grad_norm": 0.80062335729599, + "learning_rate": 2.2872043815628525e-05, + "loss": 0.7778, + "step": 6494 + }, + { + "epoch": 1.126821651630812, + "grad_norm": 1.4428578615188599, + "learning_rate": 2.2865301008470633e-05, + "loss": 0.5879, + "step": 6495 + }, + { + "epoch": 1.126995142262318, + "grad_norm": 1.0876051187515259, + "learning_rate": 2.2858557868791222e-05, + "loss": 0.6509, + "step": 6496 + }, + { + "epoch": 1.1271686328938237, + "grad_norm": 0.8760416507720947, + "learning_rate": 2.2851814397372838e-05, + "loss": 0.6971, + "step": 6497 + }, + { + "epoch": 1.1273421235253296, + "grad_norm": 1.0385993719100952, + "learning_rate": 2.284507059499807e-05, + "loss": 0.7407, + "step": 6498 + }, + { + "epoch": 1.1275156141568354, + "grad_norm": 0.9742932915687561, + "learning_rate": 2.283832646244955e-05, + "loss": 0.7471, + "step": 6499 + }, + { + "epoch": 1.1276891047883415, + "grad_norm": 0.8071939945220947, + "learning_rate": 2.283158200050993e-05, + "loss": 0.7056, + "step": 6500 + }, + { + "epoch": 1.1278625954198473, + "grad_norm": 1.0312625169754028, + "learning_rate": 2.2824837209961924e-05, + "loss": 0.6274, + "step": 6501 + }, + { + "epoch": 1.1280360860513532, + "grad_norm": 0.9815395474433899, + "learning_rate": 2.2818092091588266e-05, + "loss": 0.7065, + "step": 6502 + }, + { + "epoch": 1.128209576682859, + "grad_norm": 0.9070963859558105, + "learning_rate": 2.2811346646171734e-05, + "loss": 0.7467, + "step": 6503 + }, + { + "epoch": 1.128383067314365, + "grad_norm": 1.173703670501709, + "learning_rate": 2.280460087449515e-05, + "loss": 0.8096, + "step": 6504 + }, + { + "epoch": 1.128556557945871, + "grad_norm": 1.2648683786392212, + "learning_rate": 2.2797854777341368e-05, + "loss": 0.7197, + "step": 6505 + }, + { + "epoch": 1.1287300485773768, + "grad_norm": 0.912273108959198, + "learning_rate": 2.2791108355493278e-05, + "loss": 0.564, + "step": 6506 + }, + { + "epoch": 1.1289035392088826, + "grad_norm": 0.7432977557182312, + "learning_rate": 2.2784361609733812e-05, + "loss": 0.8098, + "step": 6507 + }, + { + "epoch": 1.1290770298403887, + "grad_norm": 1.2777351140975952, + "learning_rate": 2.2777614540845934e-05, + "loss": 0.6687, + "step": 6508 + }, + { + "epoch": 1.1292505204718946, + "grad_norm": 1.1175650358200073, + "learning_rate": 2.2770867149612658e-05, + "loss": 0.5897, + "step": 6509 + }, + { + "epoch": 1.1294240111034004, + "grad_norm": 1.0799572467803955, + "learning_rate": 2.2764119436817015e-05, + "loss": 0.5751, + "step": 6510 + }, + { + "epoch": 1.1295975017349063, + "grad_norm": 0.9715133309364319, + "learning_rate": 2.2757371403242094e-05, + "loss": 0.7823, + "step": 6511 + }, + { + "epoch": 1.1297709923664123, + "grad_norm": 0.8013166785240173, + "learning_rate": 2.2750623049671003e-05, + "loss": 0.6996, + "step": 6512 + }, + { + "epoch": 1.1299444829979182, + "grad_norm": 0.8605124950408936, + "learning_rate": 2.2743874376886903e-05, + "loss": 0.7239, + "step": 6513 + }, + { + "epoch": 1.130117973629424, + "grad_norm": 1.0074958801269531, + "learning_rate": 2.273712538567299e-05, + "loss": 0.5912, + "step": 6514 + }, + { + "epoch": 1.1302914642609299, + "grad_norm": 0.7116672992706299, + "learning_rate": 2.273037607681248e-05, + "loss": 0.8301, + "step": 6515 + }, + { + "epoch": 1.1304649548924357, + "grad_norm": 1.3745534420013428, + "learning_rate": 2.2723626451088644e-05, + "loss": 0.6484, + "step": 6516 + }, + { + "epoch": 1.1306384455239418, + "grad_norm": 0.8941896557807922, + "learning_rate": 2.2716876509284794e-05, + "loss": 0.7036, + "step": 6517 + }, + { + "epoch": 1.1308119361554476, + "grad_norm": 0.8822718858718872, + "learning_rate": 2.2710126252184255e-05, + "loss": 0.6261, + "step": 6518 + }, + { + "epoch": 1.1309854267869535, + "grad_norm": 0.9627441167831421, + "learning_rate": 2.2703375680570402e-05, + "loss": 0.7288, + "step": 6519 + }, + { + "epoch": 1.1311589174184593, + "grad_norm": 1.0084397792816162, + "learning_rate": 2.2696624795226662e-05, + "loss": 0.5952, + "step": 6520 + }, + { + "epoch": 1.1313324080499654, + "grad_norm": 0.8540730476379395, + "learning_rate": 2.2689873596936458e-05, + "loss": 0.5938, + "step": 6521 + }, + { + "epoch": 1.1315058986814712, + "grad_norm": 0.748188316822052, + "learning_rate": 2.2683122086483297e-05, + "loss": 0.6812, + "step": 6522 + }, + { + "epoch": 1.131679389312977, + "grad_norm": 1.0149214267730713, + "learning_rate": 2.2676370264650694e-05, + "loss": 0.6212, + "step": 6523 + }, + { + "epoch": 1.131852879944483, + "grad_norm": 0.7705399990081787, + "learning_rate": 2.26696181322222e-05, + "loss": 0.6847, + "step": 6524 + }, + { + "epoch": 1.132026370575989, + "grad_norm": 1.6486752033233643, + "learning_rate": 2.266286568998141e-05, + "loss": 0.6689, + "step": 6525 + }, + { + "epoch": 1.1321998612074948, + "grad_norm": 1.1810269355773926, + "learning_rate": 2.2656112938711952e-05, + "loss": 0.6427, + "step": 6526 + }, + { + "epoch": 1.1323733518390007, + "grad_norm": 1.2801228761672974, + "learning_rate": 2.2649359879197497e-05, + "loss": 0.7329, + "step": 6527 + }, + { + "epoch": 1.1325468424705065, + "grad_norm": 1.0683790445327759, + "learning_rate": 2.264260651222174e-05, + "loss": 0.6627, + "step": 6528 + }, + { + "epoch": 1.1327203331020126, + "grad_norm": 0.7992064952850342, + "learning_rate": 2.263585283856841e-05, + "loss": 0.762, + "step": 6529 + }, + { + "epoch": 1.1328938237335184, + "grad_norm": 0.8770368099212646, + "learning_rate": 2.26290988590213e-05, + "loss": 0.7014, + "step": 6530 + }, + { + "epoch": 1.1330673143650243, + "grad_norm": 0.8680418729782104, + "learning_rate": 2.2622344574364197e-05, + "loss": 0.577, + "step": 6531 + }, + { + "epoch": 1.1332408049965301, + "grad_norm": 0.9364667534828186, + "learning_rate": 2.261558998538095e-05, + "loss": 0.6312, + "step": 6532 + }, + { + "epoch": 1.133414295628036, + "grad_norm": 0.9488011002540588, + "learning_rate": 2.2608835092855443e-05, + "loss": 0.6956, + "step": 6533 + }, + { + "epoch": 1.133587786259542, + "grad_norm": 0.7835585474967957, + "learning_rate": 2.2602079897571576e-05, + "loss": 0.6996, + "step": 6534 + }, + { + "epoch": 1.1337612768910479, + "grad_norm": 0.8999343514442444, + "learning_rate": 2.2595324400313306e-05, + "loss": 0.5944, + "step": 6535 + }, + { + "epoch": 1.1339347675225537, + "grad_norm": 0.7106151580810547, + "learning_rate": 2.258856860186462e-05, + "loss": 0.8975, + "step": 6536 + }, + { + "epoch": 1.1341082581540598, + "grad_norm": 1.0132262706756592, + "learning_rate": 2.2581812503009527e-05, + "loss": 0.7439, + "step": 6537 + }, + { + "epoch": 1.1342817487855656, + "grad_norm": 0.9652183651924133, + "learning_rate": 2.257505610453209e-05, + "loss": 0.7361, + "step": 6538 + }, + { + "epoch": 1.1344552394170715, + "grad_norm": 0.9883400797843933, + "learning_rate": 2.2568299407216384e-05, + "loss": 0.7147, + "step": 6539 + }, + { + "epoch": 1.1346287300485773, + "grad_norm": 0.9289145469665527, + "learning_rate": 2.2561542411846537e-05, + "loss": 0.6484, + "step": 6540 + }, + { + "epoch": 1.1348022206800832, + "grad_norm": 1.0054991245269775, + "learning_rate": 2.255478511920672e-05, + "loss": 0.6914, + "step": 6541 + }, + { + "epoch": 1.1349757113115893, + "grad_norm": 0.9544798731803894, + "learning_rate": 2.25480275300811e-05, + "loss": 0.6409, + "step": 6542 + }, + { + "epoch": 1.135149201943095, + "grad_norm": 1.0211178064346313, + "learning_rate": 2.254126964525393e-05, + "loss": 0.567, + "step": 6543 + }, + { + "epoch": 1.135322692574601, + "grad_norm": 1.6077580451965332, + "learning_rate": 2.253451146550945e-05, + "loss": 0.7529, + "step": 6544 + }, + { + "epoch": 1.1354961832061068, + "grad_norm": 0.9097134470939636, + "learning_rate": 2.2527752991631958e-05, + "loss": 0.7664, + "step": 6545 + }, + { + "epoch": 1.1356696738376129, + "grad_norm": 0.8754276633262634, + "learning_rate": 2.252099422440579e-05, + "loss": 0.7661, + "step": 6546 + }, + { + "epoch": 1.1358431644691187, + "grad_norm": 1.2457427978515625, + "learning_rate": 2.2514235164615305e-05, + "loss": 0.8601, + "step": 6547 + }, + { + "epoch": 1.1360166551006246, + "grad_norm": 0.8978387713432312, + "learning_rate": 2.2507475813044896e-05, + "loss": 0.6038, + "step": 6548 + }, + { + "epoch": 1.1361901457321304, + "grad_norm": 0.8373621106147766, + "learning_rate": 2.2500716170479e-05, + "loss": 0.8015, + "step": 6549 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 1.049743890762329, + "learning_rate": 2.2493956237702075e-05, + "loss": 0.6898, + "step": 6550 + }, + { + "epoch": 1.1365371269951423, + "grad_norm": 0.9578847885131836, + "learning_rate": 2.248719601549863e-05, + "loss": 0.5824, + "step": 6551 + }, + { + "epoch": 1.1367106176266482, + "grad_norm": 0.8890000581741333, + "learning_rate": 2.2480435504653185e-05, + "loss": 0.6512, + "step": 6552 + }, + { + "epoch": 1.136884108258154, + "grad_norm": 1.1816989183425903, + "learning_rate": 2.2473674705950303e-05, + "loss": 0.5924, + "step": 6553 + }, + { + "epoch": 1.13705759888966, + "grad_norm": 1.5043936967849731, + "learning_rate": 2.24669136201746e-05, + "loss": 0.6969, + "step": 6554 + }, + { + "epoch": 1.137231089521166, + "grad_norm": 1.3431493043899536, + "learning_rate": 2.246015224811069e-05, + "loss": 0.8219, + "step": 6555 + }, + { + "epoch": 1.1374045801526718, + "grad_norm": 0.8006736040115356, + "learning_rate": 2.2453390590543246e-05, + "loss": 0.7954, + "step": 6556 + }, + { + "epoch": 1.1375780707841776, + "grad_norm": 0.8421595096588135, + "learning_rate": 2.2446628648256964e-05, + "loss": 0.887, + "step": 6557 + }, + { + "epoch": 1.1377515614156835, + "grad_norm": 1.064440131187439, + "learning_rate": 2.243986642203658e-05, + "loss": 0.6592, + "step": 6558 + }, + { + "epoch": 1.1379250520471895, + "grad_norm": 1.2082492113113403, + "learning_rate": 2.2433103912666852e-05, + "loss": 0.6278, + "step": 6559 + }, + { + "epoch": 1.1380985426786954, + "grad_norm": 0.7330751419067383, + "learning_rate": 2.2426341120932582e-05, + "loss": 0.7312, + "step": 6560 + }, + { + "epoch": 1.1382720333102012, + "grad_norm": 1.0169564485549927, + "learning_rate": 2.2419578047618598e-05, + "loss": 0.5879, + "step": 6561 + }, + { + "epoch": 1.138445523941707, + "grad_norm": 1.0434348583221436, + "learning_rate": 2.241281469350976e-05, + "loss": 0.6445, + "step": 6562 + }, + { + "epoch": 1.1386190145732131, + "grad_norm": 1.0131971836090088, + "learning_rate": 2.2406051059390963e-05, + "loss": 0.6156, + "step": 6563 + }, + { + "epoch": 1.138792505204719, + "grad_norm": 0.8809821009635925, + "learning_rate": 2.2399287146047137e-05, + "loss": 0.6134, + "step": 6564 + }, + { + "epoch": 1.1389659958362248, + "grad_norm": 0.8209224343299866, + "learning_rate": 2.239252295426325e-05, + "loss": 0.8206, + "step": 6565 + }, + { + "epoch": 1.1391394864677307, + "grad_norm": 0.8236876130104065, + "learning_rate": 2.2385758484824275e-05, + "loss": 0.7026, + "step": 6566 + }, + { + "epoch": 1.1393129770992367, + "grad_norm": 0.7136307954788208, + "learning_rate": 2.2378993738515255e-05, + "loss": 0.8582, + "step": 6567 + }, + { + "epoch": 1.1394864677307426, + "grad_norm": 0.6974970102310181, + "learning_rate": 2.2372228716121246e-05, + "loss": 0.5442, + "step": 6568 + }, + { + "epoch": 1.1396599583622484, + "grad_norm": 0.7300047278404236, + "learning_rate": 2.2365463418427317e-05, + "loss": 0.8032, + "step": 6569 + }, + { + "epoch": 1.1398334489937543, + "grad_norm": 0.9855304956436157, + "learning_rate": 2.235869784621861e-05, + "loss": 0.7102, + "step": 6570 + }, + { + "epoch": 1.1400069396252603, + "grad_norm": 0.8372164368629456, + "learning_rate": 2.2351932000280266e-05, + "loss": 0.6974, + "step": 6571 + }, + { + "epoch": 1.1401804302567662, + "grad_norm": 1.1013511419296265, + "learning_rate": 2.2345165881397475e-05, + "loss": 0.522, + "step": 6572 + }, + { + "epoch": 1.140353920888272, + "grad_norm": 0.979479968547821, + "learning_rate": 2.233839949035545e-05, + "loss": 0.6488, + "step": 6573 + }, + { + "epoch": 1.1405274115197779, + "grad_norm": 1.6654982566833496, + "learning_rate": 2.2331632827939438e-05, + "loss": 0.5996, + "step": 6574 + }, + { + "epoch": 1.1407009021512837, + "grad_norm": 0.9644398093223572, + "learning_rate": 2.232486589493472e-05, + "loss": 0.7729, + "step": 6575 + }, + { + "epoch": 1.1408743927827898, + "grad_norm": 1.0833280086517334, + "learning_rate": 2.231809869212661e-05, + "loss": 0.6547, + "step": 6576 + }, + { + "epoch": 1.1410478834142956, + "grad_norm": 1.2152340412139893, + "learning_rate": 2.231133122030044e-05, + "loss": 0.6053, + "step": 6577 + }, + { + "epoch": 1.1412213740458015, + "grad_norm": 0.7362704873085022, + "learning_rate": 2.230456348024159e-05, + "loss": 0.7871, + "step": 6578 + }, + { + "epoch": 1.1413948646773073, + "grad_norm": 0.8777305483818054, + "learning_rate": 2.2297795472735462e-05, + "loss": 0.7344, + "step": 6579 + }, + { + "epoch": 1.1415683553088134, + "grad_norm": 0.9169719815254211, + "learning_rate": 2.2291027198567502e-05, + "loss": 0.6927, + "step": 6580 + }, + { + "epoch": 1.1417418459403192, + "grad_norm": 1.0318807363510132, + "learning_rate": 2.228425865852316e-05, + "loss": 0.644, + "step": 6581 + }, + { + "epoch": 1.141915336571825, + "grad_norm": 0.8027568459510803, + "learning_rate": 2.2277489853387932e-05, + "loss": 0.7732, + "step": 6582 + }, + { + "epoch": 1.142088827203331, + "grad_norm": 0.9743322134017944, + "learning_rate": 2.2270720783947358e-05, + "loss": 0.7743, + "step": 6583 + }, + { + "epoch": 1.142262317834837, + "grad_norm": 0.8603004813194275, + "learning_rate": 2.2263951450986987e-05, + "loss": 0.673, + "step": 6584 + }, + { + "epoch": 1.1424358084663429, + "grad_norm": 0.9544937014579773, + "learning_rate": 2.225718185529242e-05, + "loss": 0.6353, + "step": 6585 + }, + { + "epoch": 1.1426092990978487, + "grad_norm": 0.9090085625648499, + "learning_rate": 2.2250411997649266e-05, + "loss": 0.6158, + "step": 6586 + }, + { + "epoch": 1.1427827897293545, + "grad_norm": 1.0532691478729248, + "learning_rate": 2.2243641878843172e-05, + "loss": 0.6345, + "step": 6587 + }, + { + "epoch": 1.1429562803608606, + "grad_norm": 1.0016942024230957, + "learning_rate": 2.2236871499659824e-05, + "loss": 0.5612, + "step": 6588 + }, + { + "epoch": 1.1431297709923665, + "grad_norm": 0.79554682970047, + "learning_rate": 2.2230100860884937e-05, + "loss": 0.8567, + "step": 6589 + }, + { + "epoch": 1.1433032616238723, + "grad_norm": 0.8780015110969543, + "learning_rate": 2.2223329963304242e-05, + "loss": 0.7041, + "step": 6590 + }, + { + "epoch": 1.1434767522553781, + "grad_norm": 0.8874915242195129, + "learning_rate": 2.2216558807703522e-05, + "loss": 0.5884, + "step": 6591 + }, + { + "epoch": 1.143650242886884, + "grad_norm": 2.180896520614624, + "learning_rate": 2.2209787394868562e-05, + "loss": 0.5526, + "step": 6592 + }, + { + "epoch": 1.14382373351839, + "grad_norm": 1.3695894479751587, + "learning_rate": 2.22030157255852e-05, + "loss": 0.676, + "step": 6593 + }, + { + "epoch": 1.143997224149896, + "grad_norm": 0.8087796568870544, + "learning_rate": 2.2196243800639303e-05, + "loss": 0.6035, + "step": 6594 + }, + { + "epoch": 1.1441707147814018, + "grad_norm": 1.438629150390625, + "learning_rate": 2.2189471620816745e-05, + "loss": 0.7009, + "step": 6595 + }, + { + "epoch": 1.1443442054129078, + "grad_norm": 0.9096373915672302, + "learning_rate": 2.2182699186903462e-05, + "loss": 0.594, + "step": 6596 + }, + { + "epoch": 1.1445176960444137, + "grad_norm": 0.758114218711853, + "learning_rate": 2.217592649968539e-05, + "loss": 0.7673, + "step": 6597 + }, + { + "epoch": 1.1446911866759195, + "grad_norm": 1.2858482599258423, + "learning_rate": 2.2169153559948513e-05, + "loss": 0.7866, + "step": 6598 + }, + { + "epoch": 1.1448646773074254, + "grad_norm": 1.7094236612319946, + "learning_rate": 2.2162380368478836e-05, + "loss": 0.7822, + "step": 6599 + }, + { + "epoch": 1.1450381679389312, + "grad_norm": 0.8077036738395691, + "learning_rate": 2.21556069260624e-05, + "loss": 0.6653, + "step": 6600 + }, + { + "epoch": 1.1452116585704373, + "grad_norm": 0.709796130657196, + "learning_rate": 2.2148833233485273e-05, + "loss": 0.6904, + "step": 6601 + }, + { + "epoch": 1.1453851492019431, + "grad_norm": 0.9894205927848816, + "learning_rate": 2.2142059291533542e-05, + "loss": 0.8035, + "step": 6602 + }, + { + "epoch": 1.145558639833449, + "grad_norm": 0.7714876532554626, + "learning_rate": 2.2135285100993328e-05, + "loss": 0.7356, + "step": 6603 + }, + { + "epoch": 1.1457321304649548, + "grad_norm": 2.458812713623047, + "learning_rate": 2.2128510662650796e-05, + "loss": 0.599, + "step": 6604 + }, + { + "epoch": 1.1459056210964609, + "grad_norm": 2.0926389694213867, + "learning_rate": 2.212173597729212e-05, + "loss": 0.5654, + "step": 6605 + }, + { + "epoch": 1.1460791117279667, + "grad_norm": 0.8042656779289246, + "learning_rate": 2.211496104570351e-05, + "loss": 0.6415, + "step": 6606 + }, + { + "epoch": 1.1462526023594726, + "grad_norm": 0.7531824707984924, + "learning_rate": 2.210818586867121e-05, + "loss": 0.7261, + "step": 6607 + }, + { + "epoch": 1.1464260929909784, + "grad_norm": 0.8199403285980225, + "learning_rate": 2.210141044698148e-05, + "loss": 0.7404, + "step": 6608 + }, + { + "epoch": 1.1465995836224843, + "grad_norm": 0.7963539958000183, + "learning_rate": 2.2094634781420626e-05, + "loss": 0.8152, + "step": 6609 + }, + { + "epoch": 1.1467730742539903, + "grad_norm": 0.8738901615142822, + "learning_rate": 2.2087858872774954e-05, + "loss": 0.5947, + "step": 6610 + }, + { + "epoch": 1.1469465648854962, + "grad_norm": 0.8320996165275574, + "learning_rate": 2.2081082721830834e-05, + "loss": 0.6686, + "step": 6611 + }, + { + "epoch": 1.147120055517002, + "grad_norm": 1.381185531616211, + "learning_rate": 2.2074306329374636e-05, + "loss": 0.5951, + "step": 6612 + }, + { + "epoch": 1.147293546148508, + "grad_norm": 0.7681248784065247, + "learning_rate": 2.2067529696192772e-05, + "loss": 0.6858, + "step": 6613 + }, + { + "epoch": 1.147467036780014, + "grad_norm": 1.103968858718872, + "learning_rate": 2.206075282307168e-05, + "loss": 0.6813, + "step": 6614 + }, + { + "epoch": 1.1476405274115198, + "grad_norm": 0.8008636236190796, + "learning_rate": 2.205397571079782e-05, + "loss": 0.7432, + "step": 6615 + }, + { + "epoch": 1.1478140180430256, + "grad_norm": 0.9304375648498535, + "learning_rate": 2.2047198360157683e-05, + "loss": 0.8259, + "step": 6616 + }, + { + "epoch": 1.1479875086745315, + "grad_norm": 0.8949493765830994, + "learning_rate": 2.2040420771937793e-05, + "loss": 0.8054, + "step": 6617 + }, + { + "epoch": 1.1481609993060375, + "grad_norm": 0.9316536784172058, + "learning_rate": 2.2033642946924698e-05, + "loss": 0.6233, + "step": 6618 + }, + { + "epoch": 1.1483344899375434, + "grad_norm": 0.7765052318572998, + "learning_rate": 2.2026864885904965e-05, + "loss": 0.7363, + "step": 6619 + }, + { + "epoch": 1.1485079805690492, + "grad_norm": 0.9371815323829651, + "learning_rate": 2.2020086589665203e-05, + "loss": 0.6565, + "step": 6620 + }, + { + "epoch": 1.148681471200555, + "grad_norm": 1.1352558135986328, + "learning_rate": 2.2013308058992037e-05, + "loss": 0.5654, + "step": 6621 + }, + { + "epoch": 1.1488549618320612, + "grad_norm": 1.140245795249939, + "learning_rate": 2.2006529294672126e-05, + "loss": 0.7064, + "step": 6622 + }, + { + "epoch": 1.149028452463567, + "grad_norm": 1.2445989847183228, + "learning_rate": 2.199975029749215e-05, + "loss": 0.6772, + "step": 6623 + }, + { + "epoch": 1.1492019430950728, + "grad_norm": 0.7243613600730896, + "learning_rate": 2.1992971068238826e-05, + "loss": 0.6947, + "step": 6624 + }, + { + "epoch": 1.1493754337265787, + "grad_norm": 1.2093373537063599, + "learning_rate": 2.198619160769888e-05, + "loss": 0.614, + "step": 6625 + }, + { + "epoch": 1.1495489243580848, + "grad_norm": 1.140641212463379, + "learning_rate": 2.197941191665909e-05, + "loss": 0.7284, + "step": 6626 + }, + { + "epoch": 1.1497224149895906, + "grad_norm": 1.009981393814087, + "learning_rate": 2.1972631995906237e-05, + "loss": 0.6803, + "step": 6627 + }, + { + "epoch": 1.1498959056210964, + "grad_norm": 0.779171884059906, + "learning_rate": 2.196585184622715e-05, + "loss": 0.688, + "step": 6628 + }, + { + "epoch": 1.1500693962526023, + "grad_norm": 0.8884458541870117, + "learning_rate": 2.1959071468408656e-05, + "loss": 0.7593, + "step": 6629 + }, + { + "epoch": 1.1502428868841084, + "grad_norm": 0.9647377729415894, + "learning_rate": 2.195229086323764e-05, + "loss": 0.6708, + "step": 6630 + }, + { + "epoch": 1.1504163775156142, + "grad_norm": 0.926071286201477, + "learning_rate": 2.1945510031500992e-05, + "loss": 0.6067, + "step": 6631 + }, + { + "epoch": 1.15058986814712, + "grad_norm": 0.822839081287384, + "learning_rate": 2.193872897398564e-05, + "loss": 0.6329, + "step": 6632 + }, + { + "epoch": 1.150763358778626, + "grad_norm": 1.094459056854248, + "learning_rate": 2.193194769147853e-05, + "loss": 0.5859, + "step": 6633 + }, + { + "epoch": 1.1509368494101317, + "grad_norm": 1.0647273063659668, + "learning_rate": 2.1925166184766636e-05, + "loss": 0.7876, + "step": 6634 + }, + { + "epoch": 1.1511103400416378, + "grad_norm": 0.7545997500419617, + "learning_rate": 2.191838445463697e-05, + "loss": 0.6445, + "step": 6635 + }, + { + "epoch": 1.1512838306731437, + "grad_norm": 0.7574499249458313, + "learning_rate": 2.1911602501876546e-05, + "loss": 0.6504, + "step": 6636 + }, + { + "epoch": 1.1514573213046495, + "grad_norm": 1.0339741706848145, + "learning_rate": 2.190482032727243e-05, + "loss": 0.7815, + "step": 6637 + }, + { + "epoch": 1.1516308119361554, + "grad_norm": 0.9146485924720764, + "learning_rate": 2.1898037931611688e-05, + "loss": 0.6848, + "step": 6638 + }, + { + "epoch": 1.1518043025676614, + "grad_norm": 0.7747961282730103, + "learning_rate": 2.1891255315681443e-05, + "loss": 0.7146, + "step": 6639 + }, + { + "epoch": 1.1519777931991673, + "grad_norm": 0.7893310785293579, + "learning_rate": 2.1884472480268806e-05, + "loss": 0.7349, + "step": 6640 + }, + { + "epoch": 1.1521512838306731, + "grad_norm": 0.9443196654319763, + "learning_rate": 2.1877689426160943e-05, + "loss": 0.6543, + "step": 6641 + }, + { + "epoch": 1.152324774462179, + "grad_norm": 0.788404643535614, + "learning_rate": 2.1870906154145035e-05, + "loss": 0.6721, + "step": 6642 + }, + { + "epoch": 1.152498265093685, + "grad_norm": 0.8380677103996277, + "learning_rate": 2.1864122665008294e-05, + "loss": 0.8132, + "step": 6643 + }, + { + "epoch": 1.1526717557251909, + "grad_norm": 1.575064778327942, + "learning_rate": 2.185733895953794e-05, + "loss": 0.6855, + "step": 6644 + }, + { + "epoch": 1.1528452463566967, + "grad_norm": 0.6822240352630615, + "learning_rate": 2.1850555038521236e-05, + "loss": 0.8303, + "step": 6645 + }, + { + "epoch": 1.1530187369882026, + "grad_norm": 1.0201916694641113, + "learning_rate": 2.1843770902745462e-05, + "loss": 0.6356, + "step": 6646 + }, + { + "epoch": 1.1531922276197086, + "grad_norm": 1.1040157079696655, + "learning_rate": 2.183698655299793e-05, + "loss": 0.5549, + "step": 6647 + }, + { + "epoch": 1.1533657182512145, + "grad_norm": 0.8578234314918518, + "learning_rate": 2.1830201990065966e-05, + "loss": 0.7388, + "step": 6648 + }, + { + "epoch": 1.1535392088827203, + "grad_norm": 0.8925837874412537, + "learning_rate": 2.182341721473693e-05, + "loss": 0.6102, + "step": 6649 + }, + { + "epoch": 1.1537126995142262, + "grad_norm": 1.3217049837112427, + "learning_rate": 2.1816632227798196e-05, + "loss": 0.5886, + "step": 6650 + }, + { + "epoch": 1.153886190145732, + "grad_norm": 0.9272180199623108, + "learning_rate": 2.1809847030037182e-05, + "loss": 0.6337, + "step": 6651 + }, + { + "epoch": 1.154059680777238, + "grad_norm": 1.1289716958999634, + "learning_rate": 2.180306162224131e-05, + "loss": 0.5912, + "step": 6652 + }, + { + "epoch": 1.154233171408744, + "grad_norm": 1.030557632446289, + "learning_rate": 2.179627600519803e-05, + "loss": 0.6239, + "step": 6653 + }, + { + "epoch": 1.1544066620402498, + "grad_norm": 0.8111268877983093, + "learning_rate": 2.1789490179694833e-05, + "loss": 0.7275, + "step": 6654 + }, + { + "epoch": 1.1545801526717558, + "grad_norm": 0.8481905460357666, + "learning_rate": 2.1782704146519212e-05, + "loss": 0.6963, + "step": 6655 + }, + { + "epoch": 1.1547536433032617, + "grad_norm": 0.7227159738540649, + "learning_rate": 2.1775917906458698e-05, + "loss": 0.7089, + "step": 6656 + }, + { + "epoch": 1.1549271339347675, + "grad_norm": 0.7921270132064819, + "learning_rate": 2.1769131460300844e-05, + "loss": 0.7393, + "step": 6657 + }, + { + "epoch": 1.1551006245662734, + "grad_norm": 1.063778042793274, + "learning_rate": 2.176234480883322e-05, + "loss": 0.7263, + "step": 6658 + }, + { + "epoch": 1.1552741151977792, + "grad_norm": 0.9413590431213379, + "learning_rate": 2.175555795284343e-05, + "loss": 0.646, + "step": 6659 + }, + { + "epoch": 1.1554476058292853, + "grad_norm": 0.6688209176063538, + "learning_rate": 2.174877089311909e-05, + "loss": 0.728, + "step": 6660 + }, + { + "epoch": 1.1556210964607911, + "grad_norm": 0.858394205570221, + "learning_rate": 2.1741983630447852e-05, + "loss": 0.6659, + "step": 6661 + }, + { + "epoch": 1.155794587092297, + "grad_norm": 0.8580524325370789, + "learning_rate": 2.1735196165617385e-05, + "loss": 0.6394, + "step": 6662 + }, + { + "epoch": 1.1559680777238028, + "grad_norm": 1.371187448501587, + "learning_rate": 2.172840849941538e-05, + "loss": 0.6655, + "step": 6663 + }, + { + "epoch": 1.156141568355309, + "grad_norm": 0.8718410134315491, + "learning_rate": 2.1721620632629552e-05, + "loss": 0.6412, + "step": 6664 + }, + { + "epoch": 1.1563150589868147, + "grad_norm": 0.8889141082763672, + "learning_rate": 2.171483256604765e-05, + "loss": 0.9094, + "step": 6665 + }, + { + "epoch": 1.1564885496183206, + "grad_norm": 0.7854129076004028, + "learning_rate": 2.1708044300457423e-05, + "loss": 0.6899, + "step": 6666 + }, + { + "epoch": 1.1566620402498264, + "grad_norm": 0.728609025478363, + "learning_rate": 2.1701255836646672e-05, + "loss": 0.8381, + "step": 6667 + }, + { + "epoch": 1.1568355308813323, + "grad_norm": 1.1046922206878662, + "learning_rate": 2.1694467175403197e-05, + "loss": 0.5811, + "step": 6668 + }, + { + "epoch": 1.1570090215128384, + "grad_norm": 0.7828776240348816, + "learning_rate": 2.168767831751483e-05, + "loss": 0.7468, + "step": 6669 + }, + { + "epoch": 1.1571825121443442, + "grad_norm": 1.0131479501724243, + "learning_rate": 2.1680889263769425e-05, + "loss": 0.5636, + "step": 6670 + }, + { + "epoch": 1.15735600277585, + "grad_norm": 0.805260956287384, + "learning_rate": 2.1674100014954864e-05, + "loss": 0.77, + "step": 6671 + }, + { + "epoch": 1.1575294934073561, + "grad_norm": 0.872976541519165, + "learning_rate": 2.166731057185905e-05, + "loss": 0.6182, + "step": 6672 + }, + { + "epoch": 1.157702984038862, + "grad_norm": 0.8341323733329773, + "learning_rate": 2.16605209352699e-05, + "loss": 0.7776, + "step": 6673 + }, + { + "epoch": 1.1578764746703678, + "grad_norm": 2.976569652557373, + "learning_rate": 2.1653731105975355e-05, + "loss": 0.6879, + "step": 6674 + }, + { + "epoch": 1.1580499653018737, + "grad_norm": 0.8165796399116516, + "learning_rate": 2.1646941084763397e-05, + "loss": 0.5872, + "step": 6675 + }, + { + "epoch": 1.1582234559333795, + "grad_norm": 0.7362960577011108, + "learning_rate": 2.1640150872421997e-05, + "loss": 0.7314, + "step": 6676 + }, + { + "epoch": 1.1583969465648856, + "grad_norm": 0.6628445982933044, + "learning_rate": 2.1633360469739183e-05, + "loss": 0.8179, + "step": 6677 + }, + { + "epoch": 1.1585704371963914, + "grad_norm": 0.7810073494911194, + "learning_rate": 2.1626569877502985e-05, + "loss": 0.6337, + "step": 6678 + }, + { + "epoch": 1.1587439278278973, + "grad_norm": 0.9302027821540833, + "learning_rate": 2.161977909650145e-05, + "loss": 0.6765, + "step": 6679 + }, + { + "epoch": 1.158917418459403, + "grad_norm": 0.9068854451179504, + "learning_rate": 2.161298812752267e-05, + "loss": 0.7559, + "step": 6680 + }, + { + "epoch": 1.1590909090909092, + "grad_norm": 0.885564386844635, + "learning_rate": 2.160619697135474e-05, + "loss": 0.7035, + "step": 6681 + }, + { + "epoch": 1.159264399722415, + "grad_norm": 0.728550374507904, + "learning_rate": 2.1599405628785773e-05, + "loss": 0.7361, + "step": 6682 + }, + { + "epoch": 1.1594378903539209, + "grad_norm": 0.8505485653877258, + "learning_rate": 2.1592614100603925e-05, + "loss": 0.6345, + "step": 6683 + }, + { + "epoch": 1.1596113809854267, + "grad_norm": 0.9094715118408203, + "learning_rate": 2.158582238759735e-05, + "loss": 0.6322, + "step": 6684 + }, + { + "epoch": 1.1597848716169328, + "grad_norm": 0.7953013181686401, + "learning_rate": 2.157903049055424e-05, + "loss": 0.5775, + "step": 6685 + }, + { + "epoch": 1.1599583622484386, + "grad_norm": 0.7688207626342773, + "learning_rate": 2.15722384102628e-05, + "loss": 0.8884, + "step": 6686 + }, + { + "epoch": 1.1601318528799445, + "grad_norm": 0.8681327104568481, + "learning_rate": 2.156544614751127e-05, + "loss": 0.5476, + "step": 6687 + }, + { + "epoch": 1.1603053435114503, + "grad_norm": 0.848590612411499, + "learning_rate": 2.1558653703087876e-05, + "loss": 0.6155, + "step": 6688 + }, + { + "epoch": 1.1604788341429564, + "grad_norm": 0.6741822957992554, + "learning_rate": 2.1551861077780914e-05, + "loss": 0.7522, + "step": 6689 + }, + { + "epoch": 1.1606523247744622, + "grad_norm": 0.9581078886985779, + "learning_rate": 2.1545068272378664e-05, + "loss": 0.5601, + "step": 6690 + }, + { + "epoch": 1.160825815405968, + "grad_norm": 0.8947743773460388, + "learning_rate": 2.153827528766944e-05, + "loss": 0.7146, + "step": 6691 + }, + { + "epoch": 1.160999306037474, + "grad_norm": 0.9666332602500916, + "learning_rate": 2.1531482124441574e-05, + "loss": 0.7168, + "step": 6692 + }, + { + "epoch": 1.1611727966689798, + "grad_norm": 0.6772956848144531, + "learning_rate": 2.1524688783483424e-05, + "loss": 0.7395, + "step": 6693 + }, + { + "epoch": 1.1613462873004858, + "grad_norm": 0.9423850774765015, + "learning_rate": 2.151789526558337e-05, + "loss": 0.6202, + "step": 6694 + }, + { + "epoch": 1.1615197779319917, + "grad_norm": 0.9874037504196167, + "learning_rate": 2.1511101571529793e-05, + "loss": 0.5746, + "step": 6695 + }, + { + "epoch": 1.1616932685634975, + "grad_norm": 1.3593478202819824, + "learning_rate": 2.1504307702111125e-05, + "loss": 0.7671, + "step": 6696 + }, + { + "epoch": 1.1618667591950034, + "grad_norm": 1.0141090154647827, + "learning_rate": 2.1497513658115792e-05, + "loss": 0.6495, + "step": 6697 + }, + { + "epoch": 1.1620402498265094, + "grad_norm": 1.000618577003479, + "learning_rate": 2.1490719440332252e-05, + "loss": 0.7161, + "step": 6698 + }, + { + "epoch": 1.1622137404580153, + "grad_norm": 0.6570723652839661, + "learning_rate": 2.148392504954899e-05, + "loss": 0.7776, + "step": 6699 + }, + { + "epoch": 1.1623872310895211, + "grad_norm": 2.245537519454956, + "learning_rate": 2.147713048655449e-05, + "loss": 0.6946, + "step": 6700 + }, + { + "epoch": 1.162560721721027, + "grad_norm": 0.7718012928962708, + "learning_rate": 2.147033575213728e-05, + "loss": 0.7732, + "step": 6701 + }, + { + "epoch": 1.162734212352533, + "grad_norm": 1.7756513357162476, + "learning_rate": 2.1463540847085892e-05, + "loss": 0.6824, + "step": 6702 + }, + { + "epoch": 1.162907702984039, + "grad_norm": 0.8938855528831482, + "learning_rate": 2.145674577218888e-05, + "loss": 0.5393, + "step": 6703 + }, + { + "epoch": 1.1630811936155447, + "grad_norm": 1.0327216386795044, + "learning_rate": 2.1449950528234828e-05, + "loss": 0.7311, + "step": 6704 + }, + { + "epoch": 1.1632546842470506, + "grad_norm": 0.7809851765632629, + "learning_rate": 2.1443155116012328e-05, + "loss": 0.6875, + "step": 6705 + }, + { + "epoch": 1.1634281748785567, + "grad_norm": 0.9977295398712158, + "learning_rate": 2.143635953630999e-05, + "loss": 0.704, + "step": 6706 + }, + { + "epoch": 1.1636016655100625, + "grad_norm": 1.0013166666030884, + "learning_rate": 2.142956378991646e-05, + "loss": 0.7888, + "step": 6707 + }, + { + "epoch": 1.1637751561415683, + "grad_norm": 0.8451995849609375, + "learning_rate": 2.1422767877620382e-05, + "loss": 0.7688, + "step": 6708 + }, + { + "epoch": 1.1639486467730742, + "grad_norm": 0.8709758520126343, + "learning_rate": 2.1415971800210437e-05, + "loss": 0.6223, + "step": 6709 + }, + { + "epoch": 1.16412213740458, + "grad_norm": 0.8390049338340759, + "learning_rate": 2.1409175558475307e-05, + "loss": 0.8083, + "step": 6710 + }, + { + "epoch": 1.164295628036086, + "grad_norm": 0.845690131187439, + "learning_rate": 2.1402379153203716e-05, + "loss": 0.6062, + "step": 6711 + }, + { + "epoch": 1.164469118667592, + "grad_norm": 0.8457289338111877, + "learning_rate": 2.1395582585184397e-05, + "loss": 0.6545, + "step": 6712 + }, + { + "epoch": 1.1646426092990978, + "grad_norm": 0.8456631302833557, + "learning_rate": 2.1388785855206083e-05, + "loss": 0.6791, + "step": 6713 + }, + { + "epoch": 1.1648160999306039, + "grad_norm": 0.9272366166114807, + "learning_rate": 2.138198896405756e-05, + "loss": 0.655, + "step": 6714 + }, + { + "epoch": 1.1649895905621097, + "grad_norm": 0.8622424006462097, + "learning_rate": 2.1375191912527605e-05, + "loss": 0.5803, + "step": 6715 + }, + { + "epoch": 1.1651630811936156, + "grad_norm": 0.9947532415390015, + "learning_rate": 2.1368394701405023e-05, + "loss": 0.645, + "step": 6716 + }, + { + "epoch": 1.1653365718251214, + "grad_norm": 1.0418200492858887, + "learning_rate": 2.1361597331478647e-05, + "loss": 0.6906, + "step": 6717 + }, + { + "epoch": 1.1655100624566272, + "grad_norm": 0.7536782622337341, + "learning_rate": 2.1354799803537312e-05, + "loss": 0.7015, + "step": 6718 + }, + { + "epoch": 1.1656835530881333, + "grad_norm": 0.7578790783882141, + "learning_rate": 2.1348002118369878e-05, + "loss": 0.712, + "step": 6719 + }, + { + "epoch": 1.1658570437196392, + "grad_norm": 0.6685656905174255, + "learning_rate": 2.134120427676523e-05, + "loss": 0.7268, + "step": 6720 + }, + { + "epoch": 1.166030534351145, + "grad_norm": 0.749864935874939, + "learning_rate": 2.133440627951226e-05, + "loss": 0.8989, + "step": 6721 + }, + { + "epoch": 1.1662040249826509, + "grad_norm": 0.8240868449211121, + "learning_rate": 2.1327608127399895e-05, + "loss": 0.7057, + "step": 6722 + }, + { + "epoch": 1.166377515614157, + "grad_norm": 0.9472382664680481, + "learning_rate": 2.1320809821217052e-05, + "loss": 0.6638, + "step": 6723 + }, + { + "epoch": 1.1665510062456628, + "grad_norm": 1.1621041297912598, + "learning_rate": 2.1314011361752687e-05, + "loss": 0.5912, + "step": 6724 + }, + { + "epoch": 1.1667244968771686, + "grad_norm": 0.7945003509521484, + "learning_rate": 2.1307212749795782e-05, + "loss": 0.7942, + "step": 6725 + }, + { + "epoch": 1.1668979875086745, + "grad_norm": 0.8160886764526367, + "learning_rate": 2.1300413986135313e-05, + "loss": 0.6543, + "step": 6726 + }, + { + "epoch": 1.1670714781401803, + "grad_norm": 0.7893632650375366, + "learning_rate": 2.1293615071560277e-05, + "loss": 0.7263, + "step": 6727 + }, + { + "epoch": 1.1672449687716864, + "grad_norm": 1.5990073680877686, + "learning_rate": 2.128681600685971e-05, + "loss": 0.7003, + "step": 6728 + }, + { + "epoch": 1.1674184594031922, + "grad_norm": 1.0113952159881592, + "learning_rate": 2.1280016792822645e-05, + "loss": 0.6263, + "step": 6729 + }, + { + "epoch": 1.167591950034698, + "grad_norm": 0.924402117729187, + "learning_rate": 2.1273217430238146e-05, + "loss": 0.7878, + "step": 6730 + }, + { + "epoch": 1.1677654406662041, + "grad_norm": 0.8414960503578186, + "learning_rate": 2.1266417919895274e-05, + "loss": 0.8623, + "step": 6731 + }, + { + "epoch": 1.16793893129771, + "grad_norm": 0.8273116946220398, + "learning_rate": 2.1259618262583122e-05, + "loss": 0.7566, + "step": 6732 + }, + { + "epoch": 1.1681124219292158, + "grad_norm": 0.909164547920227, + "learning_rate": 2.1252818459090814e-05, + "loss": 0.5405, + "step": 6733 + }, + { + "epoch": 1.1682859125607217, + "grad_norm": 0.7406533360481262, + "learning_rate": 2.1246018510207452e-05, + "loss": 0.7905, + "step": 6734 + }, + { + "epoch": 1.1684594031922275, + "grad_norm": 0.8015473484992981, + "learning_rate": 2.12392184167222e-05, + "loss": 0.725, + "step": 6735 + }, + { + "epoch": 1.1686328938237336, + "grad_norm": 0.9115320444107056, + "learning_rate": 2.1232418179424204e-05, + "loss": 0.6362, + "step": 6736 + }, + { + "epoch": 1.1688063844552394, + "grad_norm": 1.1168357133865356, + "learning_rate": 2.1225617799102638e-05, + "loss": 0.5491, + "step": 6737 + }, + { + "epoch": 1.1689798750867453, + "grad_norm": 3.241492748260498, + "learning_rate": 2.12188172765467e-05, + "loss": 0.7345, + "step": 6738 + }, + { + "epoch": 1.1691533657182511, + "grad_norm": 1.0342267751693726, + "learning_rate": 2.1212016612545604e-05, + "loss": 0.657, + "step": 6739 + }, + { + "epoch": 1.1693268563497572, + "grad_norm": 0.7908423542976379, + "learning_rate": 2.120521580788856e-05, + "loss": 0.7041, + "step": 6740 + }, + { + "epoch": 1.169500346981263, + "grad_norm": 1.370042085647583, + "learning_rate": 2.1198414863364822e-05, + "loss": 0.8054, + "step": 6741 + }, + { + "epoch": 1.1696738376127689, + "grad_norm": 1.2341727018356323, + "learning_rate": 2.1191613779763635e-05, + "loss": 0.7496, + "step": 6742 + }, + { + "epoch": 1.1698473282442747, + "grad_norm": 0.9660539627075195, + "learning_rate": 2.1184812557874287e-05, + "loss": 0.65, + "step": 6743 + }, + { + "epoch": 1.1700208188757808, + "grad_norm": 0.794297456741333, + "learning_rate": 2.1178011198486064e-05, + "loss": 0.7617, + "step": 6744 + }, + { + "epoch": 1.1701943095072866, + "grad_norm": 2.0239949226379395, + "learning_rate": 2.117120970238826e-05, + "loss": 0.8147, + "step": 6745 + }, + { + "epoch": 1.1703678001387925, + "grad_norm": 0.8750567436218262, + "learning_rate": 2.1164408070370212e-05, + "loss": 0.7568, + "step": 6746 + }, + { + "epoch": 1.1705412907702983, + "grad_norm": 0.9331697821617126, + "learning_rate": 2.1157606303221253e-05, + "loss": 0.6261, + "step": 6747 + }, + { + "epoch": 1.1707147814018044, + "grad_norm": 1.4687786102294922, + "learning_rate": 2.1150804401730724e-05, + "loss": 0.7427, + "step": 6748 + }, + { + "epoch": 1.1708882720333103, + "grad_norm": 1.220777153968811, + "learning_rate": 2.114400236668801e-05, + "loss": 0.875, + "step": 6749 + }, + { + "epoch": 1.171061762664816, + "grad_norm": 1.0383104085922241, + "learning_rate": 2.1137200198882484e-05, + "loss": 0.54, + "step": 6750 + }, + { + "epoch": 1.171235253296322, + "grad_norm": 0.9129163026809692, + "learning_rate": 2.113039789910355e-05, + "loss": 0.579, + "step": 6751 + }, + { + "epoch": 1.1714087439278278, + "grad_norm": 0.9913812875747681, + "learning_rate": 2.112359546814063e-05, + "loss": 0.6112, + "step": 6752 + }, + { + "epoch": 1.1715822345593339, + "grad_norm": 0.7706146240234375, + "learning_rate": 2.111679290678314e-05, + "loss": 0.7183, + "step": 6753 + }, + { + "epoch": 1.1717557251908397, + "grad_norm": 1.062812328338623, + "learning_rate": 2.110999021582053e-05, + "loss": 0.7291, + "step": 6754 + }, + { + "epoch": 1.1719292158223455, + "grad_norm": 0.896100640296936, + "learning_rate": 2.110318739604227e-05, + "loss": 0.6512, + "step": 6755 + }, + { + "epoch": 1.1721027064538514, + "grad_norm": 0.8069289326667786, + "learning_rate": 2.1096384448237824e-05, + "loss": 0.8323, + "step": 6756 + }, + { + "epoch": 1.1722761970853575, + "grad_norm": 0.886757493019104, + "learning_rate": 2.1089581373196686e-05, + "loss": 0.8094, + "step": 6757 + }, + { + "epoch": 1.1724496877168633, + "grad_norm": 0.9247627854347229, + "learning_rate": 2.1082778171708355e-05, + "loss": 0.7639, + "step": 6758 + }, + { + "epoch": 1.1726231783483692, + "grad_norm": 1.0542868375778198, + "learning_rate": 2.1075974844562354e-05, + "loss": 0.63, + "step": 6759 + }, + { + "epoch": 1.172796668979875, + "grad_norm": 0.9716145992279053, + "learning_rate": 2.1069171392548226e-05, + "loss": 0.6254, + "step": 6760 + }, + { + "epoch": 1.172970159611381, + "grad_norm": 0.9822885990142822, + "learning_rate": 2.106236781645551e-05, + "loss": 0.6084, + "step": 6761 + }, + { + "epoch": 1.173143650242887, + "grad_norm": 1.1359974145889282, + "learning_rate": 2.1055564117073767e-05, + "loss": 0.7595, + "step": 6762 + }, + { + "epoch": 1.1733171408743928, + "grad_norm": 1.6778615713119507, + "learning_rate": 2.104876029519258e-05, + "loss": 0.7, + "step": 6763 + }, + { + "epoch": 1.1734906315058986, + "grad_norm": 0.863174557685852, + "learning_rate": 2.1041956351601543e-05, + "loss": 0.7041, + "step": 6764 + }, + { + "epoch": 1.1736641221374047, + "grad_norm": 1.0102797746658325, + "learning_rate": 2.1035152287090254e-05, + "loss": 0.5857, + "step": 6765 + }, + { + "epoch": 1.1738376127689105, + "grad_norm": 1.074951410293579, + "learning_rate": 2.1028348102448338e-05, + "loss": 0.5698, + "step": 6766 + }, + { + "epoch": 1.1740111034004164, + "grad_norm": 0.9615200161933899, + "learning_rate": 2.1021543798465426e-05, + "loss": 0.6234, + "step": 6767 + }, + { + "epoch": 1.1741845940319222, + "grad_norm": 0.7072198987007141, + "learning_rate": 2.1014739375931166e-05, + "loss": 0.6825, + "step": 6768 + }, + { + "epoch": 1.174358084663428, + "grad_norm": 0.7945123910903931, + "learning_rate": 2.1007934835635213e-05, + "loss": 0.6892, + "step": 6769 + }, + { + "epoch": 1.1745315752949341, + "grad_norm": 1.1626288890838623, + "learning_rate": 2.1001130178367256e-05, + "loss": 0.6064, + "step": 6770 + }, + { + "epoch": 1.17470506592644, + "grad_norm": 0.9220759868621826, + "learning_rate": 2.0994325404916967e-05, + "loss": 0.7341, + "step": 6771 + }, + { + "epoch": 1.1748785565579458, + "grad_norm": 0.9079402685165405, + "learning_rate": 2.098752051607406e-05, + "loss": 0.6265, + "step": 6772 + }, + { + "epoch": 1.1750520471894519, + "grad_norm": 1.3341718912124634, + "learning_rate": 2.0980715512628255e-05, + "loss": 0.6857, + "step": 6773 + }, + { + "epoch": 1.1752255378209577, + "grad_norm": 0.8901751637458801, + "learning_rate": 2.097391039536926e-05, + "loss": 0.6644, + "step": 6774 + }, + { + "epoch": 1.1753990284524636, + "grad_norm": 0.7068885564804077, + "learning_rate": 2.0967105165086835e-05, + "loss": 0.6705, + "step": 6775 + }, + { + "epoch": 1.1755725190839694, + "grad_norm": 1.0732301473617554, + "learning_rate": 2.0960299822570728e-05, + "loss": 0.6595, + "step": 6776 + }, + { + "epoch": 1.1757460097154753, + "grad_norm": 0.8915671110153198, + "learning_rate": 2.0953494368610702e-05, + "loss": 0.6429, + "step": 6777 + }, + { + "epoch": 1.1759195003469813, + "grad_norm": 0.7607666850090027, + "learning_rate": 2.094668880399655e-05, + "loss": 0.7092, + "step": 6778 + }, + { + "epoch": 1.1760929909784872, + "grad_norm": 0.8482019305229187, + "learning_rate": 2.0939883129518056e-05, + "loss": 0.6531, + "step": 6779 + }, + { + "epoch": 1.176266481609993, + "grad_norm": 1.039025068283081, + "learning_rate": 2.0933077345965032e-05, + "loss": 0.5895, + "step": 6780 + }, + { + "epoch": 1.1764399722414989, + "grad_norm": 0.9551583528518677, + "learning_rate": 2.092627145412729e-05, + "loss": 0.6534, + "step": 6781 + }, + { + "epoch": 1.176613462873005, + "grad_norm": 0.9569187760353088, + "learning_rate": 2.0919465454794672e-05, + "loss": 0.823, + "step": 6782 + }, + { + "epoch": 1.1767869535045108, + "grad_norm": 1.312218427658081, + "learning_rate": 2.091265934875701e-05, + "loss": 0.7633, + "step": 6783 + }, + { + "epoch": 1.1769604441360166, + "grad_norm": 0.851509153842926, + "learning_rate": 2.0905853136804173e-05, + "loss": 0.6764, + "step": 6784 + }, + { + "epoch": 1.1771339347675225, + "grad_norm": 0.9329447746276855, + "learning_rate": 2.0899046819726025e-05, + "loss": 0.7319, + "step": 6785 + }, + { + "epoch": 1.1773074253990283, + "grad_norm": 0.9057019352912903, + "learning_rate": 2.089224039831244e-05, + "loss": 0.5824, + "step": 6786 + }, + { + "epoch": 1.1774809160305344, + "grad_norm": 1.3175793886184692, + "learning_rate": 2.088543387335332e-05, + "loss": 0.5948, + "step": 6787 + }, + { + "epoch": 1.1776544066620402, + "grad_norm": 1.0423130989074707, + "learning_rate": 2.087862724563857e-05, + "loss": 0.7467, + "step": 6788 + }, + { + "epoch": 1.177827897293546, + "grad_norm": 1.062504529953003, + "learning_rate": 2.0871820515958102e-05, + "loss": 0.608, + "step": 6789 + }, + { + "epoch": 1.1780013879250522, + "grad_norm": 0.7851483821868896, + "learning_rate": 2.0865013685101844e-05, + "loss": 0.6837, + "step": 6790 + }, + { + "epoch": 1.178174878556558, + "grad_norm": 1.0758823156356812, + "learning_rate": 2.085820675385975e-05, + "loss": 0.6954, + "step": 6791 + }, + { + "epoch": 1.1783483691880638, + "grad_norm": 0.7049002647399902, + "learning_rate": 2.085139972302175e-05, + "loss": 0.8309, + "step": 6792 + }, + { + "epoch": 1.1785218598195697, + "grad_norm": 1.03895902633667, + "learning_rate": 2.0844592593377827e-05, + "loss": 0.5806, + "step": 6793 + }, + { + "epoch": 1.1786953504510755, + "grad_norm": 1.6185563802719116, + "learning_rate": 2.083778536571795e-05, + "loss": 0.5985, + "step": 6794 + }, + { + "epoch": 1.1788688410825816, + "grad_norm": 1.4336625337600708, + "learning_rate": 2.0830978040832098e-05, + "loss": 0.717, + "step": 6795 + }, + { + "epoch": 1.1790423317140875, + "grad_norm": 0.7833899855613708, + "learning_rate": 2.0824170619510283e-05, + "loss": 0.8398, + "step": 6796 + }, + { + "epoch": 1.1792158223455933, + "grad_norm": 0.9580617547035217, + "learning_rate": 2.081736310254251e-05, + "loss": 0.5935, + "step": 6797 + }, + { + "epoch": 1.1793893129770991, + "grad_norm": 0.7682326436042786, + "learning_rate": 2.0810555490718787e-05, + "loss": 0.7493, + "step": 6798 + }, + { + "epoch": 1.1795628036086052, + "grad_norm": 0.9713231325149536, + "learning_rate": 2.0803747784829166e-05, + "loss": 0.5485, + "step": 6799 + }, + { + "epoch": 1.179736294240111, + "grad_norm": 0.9717401266098022, + "learning_rate": 2.0796939985663666e-05, + "loss": 0.5911, + "step": 6800 + }, + { + "epoch": 1.179909784871617, + "grad_norm": 0.9062899947166443, + "learning_rate": 2.079013209401236e-05, + "loss": 0.6273, + "step": 6801 + }, + { + "epoch": 1.1800832755031228, + "grad_norm": 0.7409073114395142, + "learning_rate": 2.0783324110665306e-05, + "loss": 0.6219, + "step": 6802 + }, + { + "epoch": 1.1802567661346288, + "grad_norm": 1.0369431972503662, + "learning_rate": 2.0776516036412565e-05, + "loss": 0.7733, + "step": 6803 + }, + { + "epoch": 1.1804302567661347, + "grad_norm": 0.9139466881752014, + "learning_rate": 2.0769707872044242e-05, + "loss": 0.6545, + "step": 6804 + }, + { + "epoch": 1.1806037473976405, + "grad_norm": 0.9401171207427979, + "learning_rate": 2.076289961835042e-05, + "loss": 0.6423, + "step": 6805 + }, + { + "epoch": 1.1807772380291464, + "grad_norm": 0.7296811938285828, + "learning_rate": 2.0756091276121212e-05, + "loss": 0.8384, + "step": 6806 + }, + { + "epoch": 1.1809507286606524, + "grad_norm": 0.917891263961792, + "learning_rate": 2.074928284614673e-05, + "loss": 0.7261, + "step": 6807 + }, + { + "epoch": 1.1811242192921583, + "grad_norm": 1.1051464080810547, + "learning_rate": 2.0742474329217094e-05, + "loss": 0.6926, + "step": 6808 + }, + { + "epoch": 1.1812977099236641, + "grad_norm": 0.9008103609085083, + "learning_rate": 2.0735665726122453e-05, + "loss": 0.683, + "step": 6809 + }, + { + "epoch": 1.18147120055517, + "grad_norm": 0.9984400868415833, + "learning_rate": 2.0728857037652945e-05, + "loss": 0.5687, + "step": 6810 + }, + { + "epoch": 1.1816446911866758, + "grad_norm": 2.5187008380889893, + "learning_rate": 2.0722048264598727e-05, + "loss": 0.7068, + "step": 6811 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.9860779643058777, + "learning_rate": 2.0715239407749973e-05, + "loss": 0.674, + "step": 6812 + }, + { + "epoch": 1.1819916724496877, + "grad_norm": 1.5390641689300537, + "learning_rate": 2.0708430467896848e-05, + "loss": 0.8132, + "step": 6813 + }, + { + "epoch": 1.1821651630811936, + "grad_norm": 0.948144793510437, + "learning_rate": 2.070162144582954e-05, + "loss": 0.6123, + "step": 6814 + }, + { + "epoch": 1.1823386537126996, + "grad_norm": 0.8004565238952637, + "learning_rate": 2.0694812342338252e-05, + "loss": 0.8372, + "step": 6815 + }, + { + "epoch": 1.1825121443442055, + "grad_norm": 0.919697105884552, + "learning_rate": 2.0688003158213172e-05, + "loss": 0.7002, + "step": 6816 + }, + { + "epoch": 1.1826856349757113, + "grad_norm": 0.9538434147834778, + "learning_rate": 2.0681193894244536e-05, + "loss": 0.5649, + "step": 6817 + }, + { + "epoch": 1.1828591256072172, + "grad_norm": 0.978543758392334, + "learning_rate": 2.067438455122255e-05, + "loss": 0.6752, + "step": 6818 + }, + { + "epoch": 1.183032616238723, + "grad_norm": 0.8659153580665588, + "learning_rate": 2.0667575129937446e-05, + "loss": 0.6292, + "step": 6819 + }, + { + "epoch": 1.183206106870229, + "grad_norm": 1.7112199068069458, + "learning_rate": 2.0660765631179474e-05, + "loss": 0.7482, + "step": 6820 + }, + { + "epoch": 1.183379597501735, + "grad_norm": 0.7273812294006348, + "learning_rate": 2.0653956055738876e-05, + "loss": 0.7203, + "step": 6821 + }, + { + "epoch": 1.1835530881332408, + "grad_norm": 1.3406543731689453, + "learning_rate": 2.0647146404405923e-05, + "loss": 0.7683, + "step": 6822 + }, + { + "epoch": 1.1837265787647466, + "grad_norm": 0.735585629940033, + "learning_rate": 2.0640336677970875e-05, + "loss": 0.6521, + "step": 6823 + }, + { + "epoch": 1.1839000693962527, + "grad_norm": 0.9135825634002686, + "learning_rate": 2.0633526877224006e-05, + "loss": 0.609, + "step": 6824 + }, + { + "epoch": 1.1840735600277585, + "grad_norm": 0.8458887934684753, + "learning_rate": 2.0626717002955606e-05, + "loss": 0.6505, + "step": 6825 + }, + { + "epoch": 1.1842470506592644, + "grad_norm": 0.9066083431243896, + "learning_rate": 2.061990705595597e-05, + "loss": 0.6064, + "step": 6826 + }, + { + "epoch": 1.1844205412907702, + "grad_norm": 0.9937846660614014, + "learning_rate": 2.061309703701539e-05, + "loss": 0.8252, + "step": 6827 + }, + { + "epoch": 1.184594031922276, + "grad_norm": 1.0215959548950195, + "learning_rate": 2.060628694692419e-05, + "loss": 0.8796, + "step": 6828 + }, + { + "epoch": 1.1847675225537821, + "grad_norm": 1.4363983869552612, + "learning_rate": 2.0599476786472686e-05, + "loss": 0.6283, + "step": 6829 + }, + { + "epoch": 1.184941013185288, + "grad_norm": 0.8138648271560669, + "learning_rate": 2.0592666556451197e-05, + "loss": 0.853, + "step": 6830 + }, + { + "epoch": 1.1851145038167938, + "grad_norm": 0.945885956287384, + "learning_rate": 2.0585856257650067e-05, + "loss": 0.6699, + "step": 6831 + }, + { + "epoch": 1.1852879944483, + "grad_norm": 0.8202991485595703, + "learning_rate": 2.0579045890859635e-05, + "loss": 0.7917, + "step": 6832 + }, + { + "epoch": 1.1854614850798058, + "grad_norm": 1.3546086549758911, + "learning_rate": 2.057223545687025e-05, + "loss": 0.6919, + "step": 6833 + }, + { + "epoch": 1.1856349757113116, + "grad_norm": 0.9824239611625671, + "learning_rate": 2.0565424956472278e-05, + "loss": 0.7207, + "step": 6834 + }, + { + "epoch": 1.1858084663428174, + "grad_norm": 0.9010014533996582, + "learning_rate": 2.0558614390456075e-05, + "loss": 0.579, + "step": 6835 + }, + { + "epoch": 1.1859819569743233, + "grad_norm": 1.0801827907562256, + "learning_rate": 2.055180375961203e-05, + "loss": 0.5836, + "step": 6836 + }, + { + "epoch": 1.1861554476058294, + "grad_norm": 1.2945942878723145, + "learning_rate": 2.0544993064730507e-05, + "loss": 0.615, + "step": 6837 + }, + { + "epoch": 1.1863289382373352, + "grad_norm": 2.1237757205963135, + "learning_rate": 2.053818230660191e-05, + "loss": 0.6951, + "step": 6838 + }, + { + "epoch": 1.186502428868841, + "grad_norm": 0.882905900478363, + "learning_rate": 2.053137148601662e-05, + "loss": 0.7434, + "step": 6839 + }, + { + "epoch": 1.186675919500347, + "grad_norm": 0.8946963548660278, + "learning_rate": 2.052456060376506e-05, + "loss": 0.7429, + "step": 6840 + }, + { + "epoch": 1.186849410131853, + "grad_norm": 1.0058070421218872, + "learning_rate": 2.051774966063763e-05, + "loss": 0.598, + "step": 6841 + }, + { + "epoch": 1.1870229007633588, + "grad_norm": 0.9654482007026672, + "learning_rate": 2.051093865742474e-05, + "loss": 0.8662, + "step": 6842 + }, + { + "epoch": 1.1871963913948647, + "grad_norm": 1.264804720878601, + "learning_rate": 2.0504127594916833e-05, + "loss": 0.8647, + "step": 6843 + }, + { + "epoch": 1.1873698820263705, + "grad_norm": 0.7864022850990295, + "learning_rate": 2.0497316473904324e-05, + "loss": 0.772, + "step": 6844 + }, + { + "epoch": 1.1875433726578764, + "grad_norm": 1.2368816137313843, + "learning_rate": 2.049050529517766e-05, + "loss": 0.677, + "step": 6845 + }, + { + "epoch": 1.1877168632893824, + "grad_norm": 1.0391591787338257, + "learning_rate": 2.048369405952729e-05, + "loss": 0.8901, + "step": 6846 + }, + { + "epoch": 1.1878903539208883, + "grad_norm": 0.7876938581466675, + "learning_rate": 2.047688276774366e-05, + "loss": 0.7439, + "step": 6847 + }, + { + "epoch": 1.188063844552394, + "grad_norm": 1.1337908506393433, + "learning_rate": 2.0470071420617222e-05, + "loss": 0.6796, + "step": 6848 + }, + { + "epoch": 1.1882373351839002, + "grad_norm": 1.0973769426345825, + "learning_rate": 2.046326001893846e-05, + "loss": 0.6669, + "step": 6849 + }, + { + "epoch": 1.188410825815406, + "grad_norm": 1.1711623668670654, + "learning_rate": 2.045644856349782e-05, + "loss": 0.8082, + "step": 6850 + }, + { + "epoch": 1.1885843164469119, + "grad_norm": 0.9292357563972473, + "learning_rate": 2.0449637055085798e-05, + "loss": 0.8174, + "step": 6851 + }, + { + "epoch": 1.1887578070784177, + "grad_norm": 0.8670117259025574, + "learning_rate": 2.0442825494492876e-05, + "loss": 0.6901, + "step": 6852 + }, + { + "epoch": 1.1889312977099236, + "grad_norm": 0.9586260318756104, + "learning_rate": 2.043601388250953e-05, + "loss": 0.6261, + "step": 6853 + }, + { + "epoch": 1.1891047883414296, + "grad_norm": 1.1122393608093262, + "learning_rate": 2.0429202219926273e-05, + "loss": 0.72, + "step": 6854 + }, + { + "epoch": 1.1892782789729355, + "grad_norm": 1.606480360031128, + "learning_rate": 2.0422390507533593e-05, + "loss": 0.7301, + "step": 6855 + }, + { + "epoch": 1.1894517696044413, + "grad_norm": 0.9758058786392212, + "learning_rate": 2.0415578746122007e-05, + "loss": 0.6931, + "step": 6856 + }, + { + "epoch": 1.1896252602359472, + "grad_norm": 0.838006854057312, + "learning_rate": 2.0408766936482016e-05, + "loss": 0.7471, + "step": 6857 + }, + { + "epoch": 1.1897987508674532, + "grad_norm": 1.1063505411148071, + "learning_rate": 2.0401955079404154e-05, + "loss": 0.5775, + "step": 6858 + }, + { + "epoch": 1.189972241498959, + "grad_norm": 2.048800468444824, + "learning_rate": 2.039514317567893e-05, + "loss": 0.6537, + "step": 6859 + }, + { + "epoch": 1.190145732130465, + "grad_norm": 1.4208464622497559, + "learning_rate": 2.0388331226096886e-05, + "loss": 0.6667, + "step": 6860 + }, + { + "epoch": 1.1903192227619708, + "grad_norm": 1.527850866317749, + "learning_rate": 2.0381519231448544e-05, + "loss": 0.6995, + "step": 6861 + }, + { + "epoch": 1.1904927133934768, + "grad_norm": 1.285622239112854, + "learning_rate": 2.0374707192524455e-05, + "loss": 0.7133, + "step": 6862 + }, + { + "epoch": 1.1906662040249827, + "grad_norm": 0.9839930534362793, + "learning_rate": 2.0367895110115166e-05, + "loss": 0.8086, + "step": 6863 + }, + { + "epoch": 1.1908396946564885, + "grad_norm": 0.8491948843002319, + "learning_rate": 2.036108298501121e-05, + "loss": 0.5811, + "step": 6864 + }, + { + "epoch": 1.1910131852879944, + "grad_norm": 0.8427409529685974, + "learning_rate": 2.035427081800316e-05, + "loss": 0.7346, + "step": 6865 + }, + { + "epoch": 1.1911866759195004, + "grad_norm": 0.8490955829620361, + "learning_rate": 2.034745860988156e-05, + "loss": 0.8246, + "step": 6866 + }, + { + "epoch": 1.1913601665510063, + "grad_norm": 0.7731961607933044, + "learning_rate": 2.0340646361436994e-05, + "loss": 0.8059, + "step": 6867 + }, + { + "epoch": 1.1915336571825121, + "grad_norm": 1.266151785850525, + "learning_rate": 2.0333834073460018e-05, + "loss": 0.7183, + "step": 6868 + }, + { + "epoch": 1.191707147814018, + "grad_norm": 1.0077153444290161, + "learning_rate": 2.03270217467412e-05, + "loss": 0.7397, + "step": 6869 + }, + { + "epoch": 1.1918806384455238, + "grad_norm": 0.9727525115013123, + "learning_rate": 2.032020938207114e-05, + "loss": 0.7031, + "step": 6870 + }, + { + "epoch": 1.19205412907703, + "grad_norm": 0.7900940775871277, + "learning_rate": 2.03133969802404e-05, + "loss": 0.687, + "step": 6871 + }, + { + "epoch": 1.1922276197085357, + "grad_norm": 2.3254146575927734, + "learning_rate": 2.030658454203958e-05, + "loss": 0.8159, + "step": 6872 + }, + { + "epoch": 1.1924011103400416, + "grad_norm": 0.9387832283973694, + "learning_rate": 2.0299772068259263e-05, + "loss": 0.6122, + "step": 6873 + }, + { + "epoch": 1.1925746009715477, + "grad_norm": 0.8891761898994446, + "learning_rate": 2.029295955969005e-05, + "loss": 0.7585, + "step": 6874 + }, + { + "epoch": 1.1927480916030535, + "grad_norm": 0.868195116519928, + "learning_rate": 2.0286147017122538e-05, + "loss": 0.7444, + "step": 6875 + }, + { + "epoch": 1.1929215822345594, + "grad_norm": 0.8770192265510559, + "learning_rate": 2.027933444134733e-05, + "loss": 0.6681, + "step": 6876 + }, + { + "epoch": 1.1930950728660652, + "grad_norm": 0.858079195022583, + "learning_rate": 2.0272521833155038e-05, + "loss": 0.6887, + "step": 6877 + }, + { + "epoch": 1.193268563497571, + "grad_norm": 1.0472079515457153, + "learning_rate": 2.0265709193336266e-05, + "loss": 0.5541, + "step": 6878 + }, + { + "epoch": 1.1934420541290771, + "grad_norm": 0.7814067006111145, + "learning_rate": 2.0258896522681635e-05, + "loss": 0.8457, + "step": 6879 + }, + { + "epoch": 1.193615544760583, + "grad_norm": 0.6990267634391785, + "learning_rate": 2.025208382198176e-05, + "loss": 0.8291, + "step": 6880 + }, + { + "epoch": 1.1937890353920888, + "grad_norm": 1.2606936693191528, + "learning_rate": 2.024527109202726e-05, + "loss": 0.6562, + "step": 6881 + }, + { + "epoch": 1.1939625260235947, + "grad_norm": 1.1128766536712646, + "learning_rate": 2.0238458333608766e-05, + "loss": 0.6338, + "step": 6882 + }, + { + "epoch": 1.1941360166551007, + "grad_norm": 1.3687227964401245, + "learning_rate": 2.023164554751691e-05, + "loss": 0.7026, + "step": 6883 + }, + { + "epoch": 1.1943095072866066, + "grad_norm": 0.9274455904960632, + "learning_rate": 2.0224832734542314e-05, + "loss": 0.6326, + "step": 6884 + }, + { + "epoch": 1.1944829979181124, + "grad_norm": 0.9109746217727661, + "learning_rate": 2.0218019895475612e-05, + "loss": 0.5941, + "step": 6885 + }, + { + "epoch": 1.1946564885496183, + "grad_norm": 0.971297562122345, + "learning_rate": 2.0211207031107457e-05, + "loss": 0.8904, + "step": 6886 + }, + { + "epoch": 1.194829979181124, + "grad_norm": 0.7386256456375122, + "learning_rate": 2.0204394142228473e-05, + "loss": 0.8248, + "step": 6887 + }, + { + "epoch": 1.1950034698126302, + "grad_norm": 1.0652024745941162, + "learning_rate": 2.0197581229629317e-05, + "loss": 0.734, + "step": 6888 + }, + { + "epoch": 1.195176960444136, + "grad_norm": 1.1150227785110474, + "learning_rate": 2.0190768294100626e-05, + "loss": 0.8513, + "step": 6889 + }, + { + "epoch": 1.1953504510756419, + "grad_norm": 0.8286466002464294, + "learning_rate": 2.018395533643305e-05, + "loss": 0.7297, + "step": 6890 + }, + { + "epoch": 1.195523941707148, + "grad_norm": 0.8191513419151306, + "learning_rate": 2.0177142357417243e-05, + "loss": 0.6202, + "step": 6891 + }, + { + "epoch": 1.1956974323386538, + "grad_norm": 2.123058795928955, + "learning_rate": 2.017032935784386e-05, + "loss": 0.6248, + "step": 6892 + }, + { + "epoch": 1.1958709229701596, + "grad_norm": 0.7636653780937195, + "learning_rate": 2.0163516338503556e-05, + "loss": 0.6687, + "step": 6893 + }, + { + "epoch": 1.1960444136016655, + "grad_norm": 0.7214606404304504, + "learning_rate": 2.0156703300186997e-05, + "loss": 0.8164, + "step": 6894 + }, + { + "epoch": 1.1962179042331713, + "grad_norm": 1.0091768503189087, + "learning_rate": 2.0149890243684827e-05, + "loss": 0.6094, + "step": 6895 + }, + { + "epoch": 1.1963913948646774, + "grad_norm": 0.846049964427948, + "learning_rate": 2.0143077169787725e-05, + "loss": 0.6997, + "step": 6896 + }, + { + "epoch": 1.1965648854961832, + "grad_norm": 1.6351693868637085, + "learning_rate": 2.0136264079286354e-05, + "loss": 0.6796, + "step": 6897 + }, + { + "epoch": 1.196738376127689, + "grad_norm": 0.852753758430481, + "learning_rate": 2.012945097297137e-05, + "loss": 0.7458, + "step": 6898 + }, + { + "epoch": 1.196911866759195, + "grad_norm": 1.1859902143478394, + "learning_rate": 2.0122637851633455e-05, + "loss": 0.6448, + "step": 6899 + }, + { + "epoch": 1.197085357390701, + "grad_norm": 1.248917818069458, + "learning_rate": 2.0115824716063273e-05, + "loss": 0.6815, + "step": 6900 + }, + { + "epoch": 1.1972588480222068, + "grad_norm": 0.7497267127037048, + "learning_rate": 2.01090115670515e-05, + "loss": 0.8513, + "step": 6901 + }, + { + "epoch": 1.1974323386537127, + "grad_norm": 1.1210031509399414, + "learning_rate": 2.0102198405388806e-05, + "loss": 0.7385, + "step": 6902 + }, + { + "epoch": 1.1976058292852185, + "grad_norm": 0.9574193954467773, + "learning_rate": 2.0095385231865864e-05, + "loss": 0.6962, + "step": 6903 + }, + { + "epoch": 1.1977793199167244, + "grad_norm": 0.9358164072036743, + "learning_rate": 2.008857204727336e-05, + "loss": 0.6461, + "step": 6904 + }, + { + "epoch": 1.1979528105482304, + "grad_norm": 0.8664985299110413, + "learning_rate": 2.0081758852401964e-05, + "loss": 0.7805, + "step": 6905 + }, + { + "epoch": 1.1981263011797363, + "grad_norm": 0.738760232925415, + "learning_rate": 2.0074945648042353e-05, + "loss": 0.6945, + "step": 6906 + }, + { + "epoch": 1.1982997918112421, + "grad_norm": 3.5684494972229004, + "learning_rate": 2.006813243498522e-05, + "loss": 0.6414, + "step": 6907 + }, + { + "epoch": 1.1984732824427482, + "grad_norm": 0.8157991170883179, + "learning_rate": 2.0061319214021237e-05, + "loss": 0.8254, + "step": 6908 + }, + { + "epoch": 1.198646773074254, + "grad_norm": 0.8006649613380432, + "learning_rate": 2.005450598594109e-05, + "loss": 0.7058, + "step": 6909 + }, + { + "epoch": 1.19882026370576, + "grad_norm": 1.0093694925308228, + "learning_rate": 2.0047692751535454e-05, + "loss": 0.6011, + "step": 6910 + }, + { + "epoch": 1.1989937543372657, + "grad_norm": 1.0476534366607666, + "learning_rate": 2.004087951159502e-05, + "loss": 0.6758, + "step": 6911 + }, + { + "epoch": 1.1991672449687716, + "grad_norm": 0.8825247287750244, + "learning_rate": 2.0034066266910475e-05, + "loss": 0.6671, + "step": 6912 + }, + { + "epoch": 1.1993407356002777, + "grad_norm": 0.8785678744316101, + "learning_rate": 2.0027253018272498e-05, + "loss": 0.7042, + "step": 6913 + }, + { + "epoch": 1.1995142262317835, + "grad_norm": 0.8468264937400818, + "learning_rate": 2.0020439766471775e-05, + "loss": 0.5555, + "step": 6914 + }, + { + "epoch": 1.1996877168632893, + "grad_norm": 0.8833370804786682, + "learning_rate": 2.0013626512298996e-05, + "loss": 0.576, + "step": 6915 + }, + { + "epoch": 1.1998612074947952, + "grad_norm": 1.153717279434204, + "learning_rate": 2.000681325654484e-05, + "loss": 0.6389, + "step": 6916 + }, + { + "epoch": 1.2000346981263013, + "grad_norm": 1.1763560771942139, + "learning_rate": 2e-05, + "loss": 0.6404, + "step": 6917 + }, + { + "epoch": 1.200208188757807, + "grad_norm": 0.8371836543083191, + "learning_rate": 1.999318674345516e-05, + "loss": 0.7266, + "step": 6918 + }, + { + "epoch": 1.200381679389313, + "grad_norm": 2.1703083515167236, + "learning_rate": 1.9986373487701014e-05, + "loss": 0.7214, + "step": 6919 + }, + { + "epoch": 1.2005551700208188, + "grad_norm": 1.20407235622406, + "learning_rate": 1.997956023352823e-05, + "loss": 0.5504, + "step": 6920 + }, + { + "epoch": 1.2007286606523249, + "grad_norm": 0.7500548958778381, + "learning_rate": 1.9972746981727505e-05, + "loss": 0.7007, + "step": 6921 + }, + { + "epoch": 1.2009021512838307, + "grad_norm": 1.0067046880722046, + "learning_rate": 1.9965933733089535e-05, + "loss": 0.5289, + "step": 6922 + }, + { + "epoch": 1.2010756419153366, + "grad_norm": 1.7737727165222168, + "learning_rate": 1.9959120488404986e-05, + "loss": 0.7393, + "step": 6923 + }, + { + "epoch": 1.2012491325468424, + "grad_norm": 0.8398721218109131, + "learning_rate": 1.995230724846455e-05, + "loss": 0.8002, + "step": 6924 + }, + { + "epoch": 1.2014226231783485, + "grad_norm": 0.9303058981895447, + "learning_rate": 1.9945494014058915e-05, + "loss": 0.6313, + "step": 6925 + }, + { + "epoch": 1.2015961138098543, + "grad_norm": 0.932012140750885, + "learning_rate": 1.993868078597877e-05, + "loss": 0.7247, + "step": 6926 + }, + { + "epoch": 1.2017696044413602, + "grad_norm": 1.0939282178878784, + "learning_rate": 1.9931867565014785e-05, + "loss": 0.5916, + "step": 6927 + }, + { + "epoch": 1.201943095072866, + "grad_norm": 1.0534816980361938, + "learning_rate": 1.9925054351957647e-05, + "loss": 0.8225, + "step": 6928 + }, + { + "epoch": 1.2021165857043719, + "grad_norm": 0.9078762531280518, + "learning_rate": 1.9918241147598043e-05, + "loss": 0.5736, + "step": 6929 + }, + { + "epoch": 1.202290076335878, + "grad_norm": 2.212172746658325, + "learning_rate": 1.9911427952726644e-05, + "loss": 0.6223, + "step": 6930 + }, + { + "epoch": 1.2024635669673838, + "grad_norm": 0.8466037511825562, + "learning_rate": 1.990461476813414e-05, + "loss": 0.7117, + "step": 6931 + }, + { + "epoch": 1.2026370575988896, + "grad_norm": 0.706632137298584, + "learning_rate": 1.9897801594611204e-05, + "loss": 0.678, + "step": 6932 + }, + { + "epoch": 1.2028105482303957, + "grad_norm": 0.8059514760971069, + "learning_rate": 1.9890988432948508e-05, + "loss": 0.7805, + "step": 6933 + }, + { + "epoch": 1.2029840388619015, + "grad_norm": 1.0724302530288696, + "learning_rate": 1.988417528393673e-05, + "loss": 0.666, + "step": 6934 + }, + { + "epoch": 1.2031575294934074, + "grad_norm": 0.8604851365089417, + "learning_rate": 1.9877362148366555e-05, + "loss": 0.5376, + "step": 6935 + }, + { + "epoch": 1.2033310201249132, + "grad_norm": 1.1547662019729614, + "learning_rate": 1.9870549027028635e-05, + "loss": 0.6678, + "step": 6936 + }, + { + "epoch": 1.203504510756419, + "grad_norm": 0.9691640138626099, + "learning_rate": 1.9863735920713653e-05, + "loss": 0.6948, + "step": 6937 + }, + { + "epoch": 1.2036780013879251, + "grad_norm": 1.2008283138275146, + "learning_rate": 1.9856922830212286e-05, + "loss": 0.5696, + "step": 6938 + }, + { + "epoch": 1.203851492019431, + "grad_norm": 0.8108875155448914, + "learning_rate": 1.9850109756315176e-05, + "loss": 0.9153, + "step": 6939 + }, + { + "epoch": 1.2040249826509368, + "grad_norm": 0.9569621086120605, + "learning_rate": 1.984329669981301e-05, + "loss": 0.8291, + "step": 6940 + }, + { + "epoch": 1.2041984732824427, + "grad_norm": 1.109199047088623, + "learning_rate": 1.983648366149644e-05, + "loss": 0.6774, + "step": 6941 + }, + { + "epoch": 1.2043719639139487, + "grad_norm": 0.9597120881080627, + "learning_rate": 1.9829670642156147e-05, + "loss": 0.688, + "step": 6942 + }, + { + "epoch": 1.2045454545454546, + "grad_norm": 1.3851652145385742, + "learning_rate": 1.982285764258276e-05, + "loss": 0.6793, + "step": 6943 + }, + { + "epoch": 1.2047189451769604, + "grad_norm": 0.7761045694351196, + "learning_rate": 1.981604466356695e-05, + "loss": 0.884, + "step": 6944 + }, + { + "epoch": 1.2048924358084663, + "grad_norm": 0.9400565028190613, + "learning_rate": 1.9809231705899384e-05, + "loss": 0.6223, + "step": 6945 + }, + { + "epoch": 1.2050659264399721, + "grad_norm": 1.0829803943634033, + "learning_rate": 1.980241877037069e-05, + "loss": 0.8699, + "step": 6946 + }, + { + "epoch": 1.2052394170714782, + "grad_norm": 0.9244141578674316, + "learning_rate": 1.9795605857771527e-05, + "loss": 0.5726, + "step": 6947 + }, + { + "epoch": 1.205412907702984, + "grad_norm": 1.4941930770874023, + "learning_rate": 1.9788792968892553e-05, + "loss": 0.6663, + "step": 6948 + }, + { + "epoch": 1.2055863983344899, + "grad_norm": 0.8803195953369141, + "learning_rate": 1.978198010452439e-05, + "loss": 0.6847, + "step": 6949 + }, + { + "epoch": 1.205759888965996, + "grad_norm": 0.9142873883247375, + "learning_rate": 1.977516726545769e-05, + "loss": 0.7355, + "step": 6950 + }, + { + "epoch": 1.2059333795975018, + "grad_norm": 1.2392163276672363, + "learning_rate": 1.97683544524831e-05, + "loss": 0.5992, + "step": 6951 + }, + { + "epoch": 1.2061068702290076, + "grad_norm": 0.8886696100234985, + "learning_rate": 1.976154166639124e-05, + "loss": 0.697, + "step": 6952 + }, + { + "epoch": 1.2062803608605135, + "grad_norm": 0.8994086384773254, + "learning_rate": 1.9754728907972745e-05, + "loss": 0.6042, + "step": 6953 + }, + { + "epoch": 1.2064538514920193, + "grad_norm": 0.9723712205886841, + "learning_rate": 1.9747916178018246e-05, + "loss": 0.7488, + "step": 6954 + }, + { + "epoch": 1.2066273421235254, + "grad_norm": 0.8302885890007019, + "learning_rate": 1.974110347731837e-05, + "loss": 0.7177, + "step": 6955 + }, + { + "epoch": 1.2068008327550312, + "grad_norm": 0.8572049140930176, + "learning_rate": 1.9734290806663738e-05, + "loss": 0.6449, + "step": 6956 + }, + { + "epoch": 1.206974323386537, + "grad_norm": 0.9683158993721008, + "learning_rate": 1.972747816684497e-05, + "loss": 0.6519, + "step": 6957 + }, + { + "epoch": 1.207147814018043, + "grad_norm": 1.0818190574645996, + "learning_rate": 1.9720665558652676e-05, + "loss": 0.7499, + "step": 6958 + }, + { + "epoch": 1.207321304649549, + "grad_norm": 1.0552599430084229, + "learning_rate": 1.971385298287747e-05, + "loss": 0.6584, + "step": 6959 + }, + { + "epoch": 1.2074947952810549, + "grad_norm": 1.6453750133514404, + "learning_rate": 1.970704044030995e-05, + "loss": 0.729, + "step": 6960 + }, + { + "epoch": 1.2076682859125607, + "grad_norm": 1.4474948644638062, + "learning_rate": 1.9700227931740747e-05, + "loss": 0.6246, + "step": 6961 + }, + { + "epoch": 1.2078417765440665, + "grad_norm": 1.040631890296936, + "learning_rate": 1.9693415457960426e-05, + "loss": 0.548, + "step": 6962 + }, + { + "epoch": 1.2080152671755726, + "grad_norm": 0.9565908312797546, + "learning_rate": 1.9686603019759602e-05, + "loss": 0.7771, + "step": 6963 + }, + { + "epoch": 1.2081887578070785, + "grad_norm": 0.9068557024002075, + "learning_rate": 1.9679790617928872e-05, + "loss": 0.6075, + "step": 6964 + }, + { + "epoch": 1.2083622484385843, + "grad_norm": 0.8985767960548401, + "learning_rate": 1.9672978253258802e-05, + "loss": 0.5778, + "step": 6965 + }, + { + "epoch": 1.2085357390700902, + "grad_norm": 0.753365695476532, + "learning_rate": 1.966616592653999e-05, + "loss": 0.8748, + "step": 6966 + }, + { + "epoch": 1.2087092297015962, + "grad_norm": 0.8086770176887512, + "learning_rate": 1.9659353638563016e-05, + "loss": 0.6886, + "step": 6967 + }, + { + "epoch": 1.208882720333102, + "grad_norm": 1.0661956071853638, + "learning_rate": 1.9652541390118443e-05, + "loss": 0.7257, + "step": 6968 + }, + { + "epoch": 1.209056210964608, + "grad_norm": 1.0398930311203003, + "learning_rate": 1.9645729181996846e-05, + "loss": 0.8418, + "step": 6969 + }, + { + "epoch": 1.2092297015961138, + "grad_norm": 0.8322306275367737, + "learning_rate": 1.963891701498879e-05, + "loss": 0.6079, + "step": 6970 + }, + { + "epoch": 1.2094031922276196, + "grad_norm": 1.0453869104385376, + "learning_rate": 1.9632104889884844e-05, + "loss": 0.5873, + "step": 6971 + }, + { + "epoch": 1.2095766828591257, + "grad_norm": 0.7730023860931396, + "learning_rate": 1.9625292807475548e-05, + "loss": 0.7886, + "step": 6972 + }, + { + "epoch": 1.2097501734906315, + "grad_norm": 0.74858158826828, + "learning_rate": 1.9618480768551456e-05, + "loss": 0.7959, + "step": 6973 + }, + { + "epoch": 1.2099236641221374, + "grad_norm": 1.1824636459350586, + "learning_rate": 1.9611668773903124e-05, + "loss": 0.6157, + "step": 6974 + }, + { + "epoch": 1.2100971547536432, + "grad_norm": 0.9360800981521606, + "learning_rate": 1.9604856824321076e-05, + "loss": 0.8508, + "step": 6975 + }, + { + "epoch": 1.2102706453851493, + "grad_norm": 1.014859676361084, + "learning_rate": 1.9598044920595853e-05, + "loss": 0.6887, + "step": 6976 + }, + { + "epoch": 1.2104441360166551, + "grad_norm": 0.6307652592658997, + "learning_rate": 1.9591233063517988e-05, + "loss": 0.7395, + "step": 6977 + }, + { + "epoch": 1.210617626648161, + "grad_norm": 1.2196919918060303, + "learning_rate": 1.9584421253878e-05, + "loss": 0.7834, + "step": 6978 + }, + { + "epoch": 1.2107911172796668, + "grad_norm": 0.9747678637504578, + "learning_rate": 1.957760949246641e-05, + "loss": 0.6865, + "step": 6979 + }, + { + "epoch": 1.2109646079111729, + "grad_norm": 1.1730493307113647, + "learning_rate": 1.9570797780073737e-05, + "loss": 0.5719, + "step": 6980 + }, + { + "epoch": 1.2111380985426787, + "grad_norm": 0.8629263639450073, + "learning_rate": 1.9563986117490476e-05, + "loss": 0.6036, + "step": 6981 + }, + { + "epoch": 1.2113115891741846, + "grad_norm": 1.7504934072494507, + "learning_rate": 1.955717450550713e-05, + "loss": 0.6989, + "step": 6982 + }, + { + "epoch": 1.2114850798056904, + "grad_norm": 1.451129674911499, + "learning_rate": 1.9550362944914202e-05, + "loss": 0.6642, + "step": 6983 + }, + { + "epoch": 1.2116585704371965, + "grad_norm": 0.824196457862854, + "learning_rate": 1.9543551436502186e-05, + "loss": 0.6178, + "step": 6984 + }, + { + "epoch": 1.2118320610687023, + "grad_norm": 0.7660611271858215, + "learning_rate": 1.953673998106155e-05, + "loss": 0.8594, + "step": 6985 + }, + { + "epoch": 1.2120055517002082, + "grad_norm": 1.162145733833313, + "learning_rate": 1.9529928579382778e-05, + "loss": 0.5975, + "step": 6986 + }, + { + "epoch": 1.212179042331714, + "grad_norm": 0.9012119770050049, + "learning_rate": 1.952311723225635e-05, + "loss": 0.7562, + "step": 6987 + }, + { + "epoch": 1.2123525329632199, + "grad_norm": 1.2088011503219604, + "learning_rate": 1.9516305940472714e-05, + "loss": 0.7241, + "step": 6988 + }, + { + "epoch": 1.212526023594726, + "grad_norm": 1.44375741481781, + "learning_rate": 1.950949470482234e-05, + "loss": 0.548, + "step": 6989 + }, + { + "epoch": 1.2126995142262318, + "grad_norm": 1.2474559545516968, + "learning_rate": 1.9502683526095683e-05, + "loss": 0.5884, + "step": 6990 + }, + { + "epoch": 1.2128730048577376, + "grad_norm": 1.2629752159118652, + "learning_rate": 1.9495872405083178e-05, + "loss": 0.6862, + "step": 6991 + }, + { + "epoch": 1.2130464954892437, + "grad_norm": 1.2401788234710693, + "learning_rate": 1.948906134257526e-05, + "loss": 0.681, + "step": 6992 + }, + { + "epoch": 1.2132199861207495, + "grad_norm": 0.6261561512947083, + "learning_rate": 1.9482250339362383e-05, + "loss": 0.824, + "step": 6993 + }, + { + "epoch": 1.2133934767522554, + "grad_norm": 0.7971076369285583, + "learning_rate": 1.947543939623495e-05, + "loss": 0.6771, + "step": 6994 + }, + { + "epoch": 1.2135669673837612, + "grad_norm": 0.8256326913833618, + "learning_rate": 1.9468628513983382e-05, + "loss": 0.7426, + "step": 6995 + }, + { + "epoch": 1.213740458015267, + "grad_norm": 3.3545384407043457, + "learning_rate": 1.9461817693398105e-05, + "loss": 0.5594, + "step": 6996 + }, + { + "epoch": 1.2139139486467732, + "grad_norm": 0.8700054287910461, + "learning_rate": 1.94550069352695e-05, + "loss": 0.7643, + "step": 6997 + }, + { + "epoch": 1.214087439278279, + "grad_norm": 0.8704787492752075, + "learning_rate": 1.944819624038798e-05, + "loss": 0.8159, + "step": 6998 + }, + { + "epoch": 1.2142609299097848, + "grad_norm": 0.9413266181945801, + "learning_rate": 1.9441385609543925e-05, + "loss": 0.7605, + "step": 6999 + }, + { + "epoch": 1.2144344205412907, + "grad_norm": 1.1238477230072021, + "learning_rate": 1.943457504352773e-05, + "loss": 0.7028, + "step": 7000 + }, + { + "epoch": 1.2146079111727968, + "grad_norm": 0.975283682346344, + "learning_rate": 1.9427764543129756e-05, + "loss": 0.6167, + "step": 7001 + }, + { + "epoch": 1.2147814018043026, + "grad_norm": 0.9563808441162109, + "learning_rate": 1.942095410914037e-05, + "loss": 0.7158, + "step": 7002 + }, + { + "epoch": 1.2149548924358085, + "grad_norm": 0.7943102717399597, + "learning_rate": 1.941414374234994e-05, + "loss": 0.6812, + "step": 7003 + }, + { + "epoch": 1.2151283830673143, + "grad_norm": 1.0173747539520264, + "learning_rate": 1.9407333443548806e-05, + "loss": 0.6675, + "step": 7004 + }, + { + "epoch": 1.2153018736988201, + "grad_norm": 1.0478026866912842, + "learning_rate": 1.9400523213527324e-05, + "loss": 0.7448, + "step": 7005 + }, + { + "epoch": 1.2154753643303262, + "grad_norm": 1.0102869272232056, + "learning_rate": 1.9393713053075816e-05, + "loss": 0.7029, + "step": 7006 + }, + { + "epoch": 1.215648854961832, + "grad_norm": 0.7127972841262817, + "learning_rate": 1.9386902962984613e-05, + "loss": 0.7681, + "step": 7007 + }, + { + "epoch": 1.215822345593338, + "grad_norm": 0.574860692024231, + "learning_rate": 1.9380092944044036e-05, + "loss": 0.7344, + "step": 7008 + }, + { + "epoch": 1.215995836224844, + "grad_norm": 0.9264565706253052, + "learning_rate": 1.9373282997044404e-05, + "loss": 0.7043, + "step": 7009 + }, + { + "epoch": 1.2161693268563498, + "grad_norm": 1.0120972394943237, + "learning_rate": 1.9366473122776e-05, + "loss": 0.8359, + "step": 7010 + }, + { + "epoch": 1.2163428174878557, + "grad_norm": 0.847721517086029, + "learning_rate": 1.935966332202913e-05, + "loss": 0.7098, + "step": 7011 + }, + { + "epoch": 1.2165163081193615, + "grad_norm": 0.7164156436920166, + "learning_rate": 1.9352853595594077e-05, + "loss": 0.8616, + "step": 7012 + }, + { + "epoch": 1.2166897987508674, + "grad_norm": 1.347800612449646, + "learning_rate": 1.9346043944261127e-05, + "loss": 0.5463, + "step": 7013 + }, + { + "epoch": 1.2168632893823734, + "grad_norm": 0.9018406867980957, + "learning_rate": 1.9339234368820533e-05, + "loss": 0.5359, + "step": 7014 + }, + { + "epoch": 1.2170367800138793, + "grad_norm": 0.9277125597000122, + "learning_rate": 1.9332424870062557e-05, + "loss": 0.5923, + "step": 7015 + }, + { + "epoch": 1.2172102706453851, + "grad_norm": 0.8449962139129639, + "learning_rate": 1.932561544877746e-05, + "loss": 0.7375, + "step": 7016 + }, + { + "epoch": 1.217383761276891, + "grad_norm": 1.6571367979049683, + "learning_rate": 1.9318806105755474e-05, + "loss": 0.5912, + "step": 7017 + }, + { + "epoch": 1.217557251908397, + "grad_norm": 0.9230605959892273, + "learning_rate": 1.9311996841786825e-05, + "loss": 0.752, + "step": 7018 + }, + { + "epoch": 1.2177307425399029, + "grad_norm": 1.1761434078216553, + "learning_rate": 1.9305187657661758e-05, + "loss": 0.5286, + "step": 7019 + }, + { + "epoch": 1.2179042331714087, + "grad_norm": 1.5194318294525146, + "learning_rate": 1.9298378554170463e-05, + "loss": 0.6287, + "step": 7020 + }, + { + "epoch": 1.2180777238029146, + "grad_norm": 0.8910053968429565, + "learning_rate": 1.9291569532103155e-05, + "loss": 0.5995, + "step": 7021 + }, + { + "epoch": 1.2182512144344206, + "grad_norm": 0.8396157622337341, + "learning_rate": 1.9284760592250037e-05, + "loss": 0.873, + "step": 7022 + }, + { + "epoch": 1.2184247050659265, + "grad_norm": 1.4700703620910645, + "learning_rate": 1.9277951735401276e-05, + "loss": 0.6107, + "step": 7023 + }, + { + "epoch": 1.2185981956974323, + "grad_norm": 1.0609556436538696, + "learning_rate": 1.9271142962347058e-05, + "loss": 0.6038, + "step": 7024 + }, + { + "epoch": 1.2187716863289382, + "grad_norm": 1.1797919273376465, + "learning_rate": 1.926433427387755e-05, + "loss": 0.6709, + "step": 7025 + }, + { + "epoch": 1.2189451769604442, + "grad_norm": 6.967121124267578, + "learning_rate": 1.925752567078291e-05, + "loss": 0.693, + "step": 7026 + }, + { + "epoch": 1.21911866759195, + "grad_norm": 0.9978941679000854, + "learning_rate": 1.925071715385328e-05, + "loss": 0.6678, + "step": 7027 + }, + { + "epoch": 1.219292158223456, + "grad_norm": 1.1189296245574951, + "learning_rate": 1.924390872387879e-05, + "loss": 0.718, + "step": 7028 + }, + { + "epoch": 1.2194656488549618, + "grad_norm": 0.9446016550064087, + "learning_rate": 1.9237100381649586e-05, + "loss": 0.7013, + "step": 7029 + }, + { + "epoch": 1.2196391394864676, + "grad_norm": 0.8862053751945496, + "learning_rate": 1.923029212795576e-05, + "loss": 0.5913, + "step": 7030 + }, + { + "epoch": 1.2198126301179737, + "grad_norm": 0.8173671364784241, + "learning_rate": 1.9223483963587435e-05, + "loss": 0.663, + "step": 7031 + }, + { + "epoch": 1.2199861207494795, + "grad_norm": 1.0808637142181396, + "learning_rate": 1.9216675889334704e-05, + "loss": 0.5883, + "step": 7032 + }, + { + "epoch": 1.2201596113809854, + "grad_norm": 0.8310257196426392, + "learning_rate": 1.9209867905987643e-05, + "loss": 0.6954, + "step": 7033 + }, + { + "epoch": 1.2203331020124912, + "grad_norm": 1.0848429203033447, + "learning_rate": 1.9203060014336334e-05, + "loss": 0.6991, + "step": 7034 + }, + { + "epoch": 1.2205065926439973, + "grad_norm": 0.8142674565315247, + "learning_rate": 1.9196252215170848e-05, + "loss": 0.7922, + "step": 7035 + }, + { + "epoch": 1.2206800832755031, + "grad_norm": 0.8385669589042664, + "learning_rate": 1.9189444509281216e-05, + "loss": 0.7429, + "step": 7036 + }, + { + "epoch": 1.220853573907009, + "grad_norm": 0.8826152682304382, + "learning_rate": 1.9182636897457496e-05, + "loss": 0.6216, + "step": 7037 + }, + { + "epoch": 1.2210270645385148, + "grad_norm": 0.8022192120552063, + "learning_rate": 1.9175829380489727e-05, + "loss": 0.724, + "step": 7038 + }, + { + "epoch": 1.221200555170021, + "grad_norm": 0.8471437692642212, + "learning_rate": 1.9169021959167905e-05, + "loss": 0.6765, + "step": 7039 + }, + { + "epoch": 1.2213740458015268, + "grad_norm": 1.2940444946289062, + "learning_rate": 1.9162214634282055e-05, + "loss": 0.6633, + "step": 7040 + }, + { + "epoch": 1.2215475364330326, + "grad_norm": 1.2769120931625366, + "learning_rate": 1.9155407406622177e-05, + "loss": 0.6294, + "step": 7041 + }, + { + "epoch": 1.2217210270645384, + "grad_norm": 1.1805446147918701, + "learning_rate": 1.9148600276978254e-05, + "loss": 0.5223, + "step": 7042 + }, + { + "epoch": 1.2218945176960445, + "grad_norm": 1.2815589904785156, + "learning_rate": 1.9141793246140257e-05, + "loss": 0.7217, + "step": 7043 + }, + { + "epoch": 1.2220680083275504, + "grad_norm": 0.7685003876686096, + "learning_rate": 1.9134986314898156e-05, + "loss": 0.7153, + "step": 7044 + }, + { + "epoch": 1.2222414989590562, + "grad_norm": 0.9014049172401428, + "learning_rate": 1.9128179484041908e-05, + "loss": 0.7366, + "step": 7045 + }, + { + "epoch": 1.222414989590562, + "grad_norm": 0.8343337178230286, + "learning_rate": 1.9121372754361437e-05, + "loss": 0.7092, + "step": 7046 + }, + { + "epoch": 1.222588480222068, + "grad_norm": 3.0732522010803223, + "learning_rate": 1.911456612664668e-05, + "loss": 0.7068, + "step": 7047 + }, + { + "epoch": 1.222761970853574, + "grad_norm": 0.8420279622077942, + "learning_rate": 1.9107759601687562e-05, + "loss": 0.5602, + "step": 7048 + }, + { + "epoch": 1.2229354614850798, + "grad_norm": 0.9449787735939026, + "learning_rate": 1.9100953180273985e-05, + "loss": 0.6907, + "step": 7049 + }, + { + "epoch": 1.2231089521165857, + "grad_norm": 0.9218570590019226, + "learning_rate": 1.909414686319583e-05, + "loss": 0.7361, + "step": 7050 + }, + { + "epoch": 1.2232824427480917, + "grad_norm": 0.8119929432868958, + "learning_rate": 1.9087340651242995e-05, + "loss": 0.7185, + "step": 7051 + }, + { + "epoch": 1.2234559333795976, + "grad_norm": 0.8828674554824829, + "learning_rate": 1.9080534545205334e-05, + "loss": 0.7458, + "step": 7052 + }, + { + "epoch": 1.2236294240111034, + "grad_norm": 0.8259418606758118, + "learning_rate": 1.9073728545872717e-05, + "loss": 0.5576, + "step": 7053 + }, + { + "epoch": 1.2238029146426093, + "grad_norm": 0.9669384360313416, + "learning_rate": 1.9066922654034975e-05, + "loss": 0.6259, + "step": 7054 + }, + { + "epoch": 1.223976405274115, + "grad_norm": 1.8559068441390991, + "learning_rate": 1.906011687048195e-05, + "loss": 0.6671, + "step": 7055 + }, + { + "epoch": 1.2241498959056212, + "grad_norm": 1.0027129650115967, + "learning_rate": 1.9053311196003457e-05, + "loss": 0.6028, + "step": 7056 + }, + { + "epoch": 1.224323386537127, + "grad_norm": 1.32857084274292, + "learning_rate": 1.9046505631389297e-05, + "loss": 0.7129, + "step": 7057 + }, + { + "epoch": 1.2244968771686329, + "grad_norm": 2.2998945713043213, + "learning_rate": 1.9039700177429282e-05, + "loss": 0.844, + "step": 7058 + }, + { + "epoch": 1.2246703678001387, + "grad_norm": 1.0979045629501343, + "learning_rate": 1.9032894834913172e-05, + "loss": 0.6838, + "step": 7059 + }, + { + "epoch": 1.2248438584316448, + "grad_norm": 1.2498865127563477, + "learning_rate": 1.9026089604630743e-05, + "loss": 0.6635, + "step": 7060 + }, + { + "epoch": 1.2250173490631506, + "grad_norm": 0.9816832542419434, + "learning_rate": 1.901928448737176e-05, + "loss": 0.6124, + "step": 7061 + }, + { + "epoch": 1.2251908396946565, + "grad_norm": 1.0011227130889893, + "learning_rate": 1.9012479483925942e-05, + "loss": 0.6458, + "step": 7062 + }, + { + "epoch": 1.2253643303261623, + "grad_norm": 0.9218721985816956, + "learning_rate": 1.9005674595083033e-05, + "loss": 0.5972, + "step": 7063 + }, + { + "epoch": 1.2255378209576682, + "grad_norm": 1.0962084531784058, + "learning_rate": 1.8998869821632757e-05, + "loss": 0.5403, + "step": 7064 + }, + { + "epoch": 1.2257113115891742, + "grad_norm": 0.7858194708824158, + "learning_rate": 1.8992065164364793e-05, + "loss": 0.6085, + "step": 7065 + }, + { + "epoch": 1.22588480222068, + "grad_norm": 0.7549760341644287, + "learning_rate": 1.898526062406884e-05, + "loss": 0.6604, + "step": 7066 + }, + { + "epoch": 1.226058292852186, + "grad_norm": 0.8628838658332825, + "learning_rate": 1.8978456201534587e-05, + "loss": 0.6997, + "step": 7067 + }, + { + "epoch": 1.226231783483692, + "grad_norm": 0.9157595038414001, + "learning_rate": 1.8971651897551672e-05, + "loss": 0.635, + "step": 7068 + }, + { + "epoch": 1.2264052741151978, + "grad_norm": 1.232090950012207, + "learning_rate": 1.896484771290975e-05, + "loss": 0.5852, + "step": 7069 + }, + { + "epoch": 1.2265787647467037, + "grad_norm": 0.9431121349334717, + "learning_rate": 1.8958043648398457e-05, + "loss": 0.7853, + "step": 7070 + }, + { + "epoch": 1.2267522553782095, + "grad_norm": 0.7870289087295532, + "learning_rate": 1.8951239704807424e-05, + "loss": 0.6594, + "step": 7071 + }, + { + "epoch": 1.2269257460097154, + "grad_norm": 1.1450037956237793, + "learning_rate": 1.8944435882926236e-05, + "loss": 0.6472, + "step": 7072 + }, + { + "epoch": 1.2270992366412214, + "grad_norm": 0.6952449679374695, + "learning_rate": 1.8937632183544495e-05, + "loss": 0.863, + "step": 7073 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 1.1675097942352295, + "learning_rate": 1.893082860745178e-05, + "loss": 0.5441, + "step": 7074 + }, + { + "epoch": 1.2274462179042331, + "grad_norm": 0.8283983469009399, + "learning_rate": 1.892402515543765e-05, + "loss": 0.6233, + "step": 7075 + }, + { + "epoch": 1.227619708535739, + "grad_norm": 1.488653540611267, + "learning_rate": 1.8917221828291652e-05, + "loss": 0.6146, + "step": 7076 + }, + { + "epoch": 1.227793199167245, + "grad_norm": 0.7899083495140076, + "learning_rate": 1.8910418626803328e-05, + "loss": 0.5928, + "step": 7077 + }, + { + "epoch": 1.227966689798751, + "grad_norm": 0.9175604581832886, + "learning_rate": 1.8903615551762182e-05, + "loss": 0.7684, + "step": 7078 + }, + { + "epoch": 1.2281401804302567, + "grad_norm": 0.9017005562782288, + "learning_rate": 1.8896812603957732e-05, + "loss": 0.6554, + "step": 7079 + }, + { + "epoch": 1.2283136710617626, + "grad_norm": 0.7513145208358765, + "learning_rate": 1.8890009784179476e-05, + "loss": 0.7612, + "step": 7080 + }, + { + "epoch": 1.2284871616932687, + "grad_norm": 0.8528692126274109, + "learning_rate": 1.8883207093216865e-05, + "loss": 0.7052, + "step": 7081 + }, + { + "epoch": 1.2286606523247745, + "grad_norm": 0.8786064386367798, + "learning_rate": 1.8876404531859376e-05, + "loss": 0.6501, + "step": 7082 + }, + { + "epoch": 1.2288341429562804, + "grad_norm": 0.7160065174102783, + "learning_rate": 1.8869602100896446e-05, + "loss": 0.78, + "step": 7083 + }, + { + "epoch": 1.2290076335877862, + "grad_norm": 1.0121495723724365, + "learning_rate": 1.8862799801117523e-05, + "loss": 0.7063, + "step": 7084 + }, + { + "epoch": 1.2291811242192923, + "grad_norm": 0.9707969427108765, + "learning_rate": 1.8855997633311995e-05, + "loss": 0.6068, + "step": 7085 + }, + { + "epoch": 1.229354614850798, + "grad_norm": 1.0295031070709229, + "learning_rate": 1.884919559826928e-05, + "loss": 0.6981, + "step": 7086 + }, + { + "epoch": 1.229528105482304, + "grad_norm": 1.8003791570663452, + "learning_rate": 1.884239369677876e-05, + "loss": 0.5994, + "step": 7087 + }, + { + "epoch": 1.2297015961138098, + "grad_norm": 0.7385640144348145, + "learning_rate": 1.8835591929629795e-05, + "loss": 0.8005, + "step": 7088 + }, + { + "epoch": 1.2298750867453156, + "grad_norm": 0.8499237895011902, + "learning_rate": 1.8828790297611742e-05, + "loss": 0.7916, + "step": 7089 + }, + { + "epoch": 1.2300485773768217, + "grad_norm": 1.1348598003387451, + "learning_rate": 1.882198880151395e-05, + "loss": 0.5793, + "step": 7090 + }, + { + "epoch": 1.2302220680083276, + "grad_norm": 4.368579864501953, + "learning_rate": 1.8815187442125716e-05, + "loss": 0.9089, + "step": 7091 + }, + { + "epoch": 1.2303955586398334, + "grad_norm": 1.2904912233352661, + "learning_rate": 1.8808386220236365e-05, + "loss": 0.5748, + "step": 7092 + }, + { + "epoch": 1.2305690492713393, + "grad_norm": 1.4427123069763184, + "learning_rate": 1.8801585136635188e-05, + "loss": 0.6801, + "step": 7093 + }, + { + "epoch": 1.2307425399028453, + "grad_norm": 0.974590003490448, + "learning_rate": 1.8794784192111448e-05, + "loss": 0.6173, + "step": 7094 + }, + { + "epoch": 1.2309160305343512, + "grad_norm": 1.6354907751083374, + "learning_rate": 1.8787983387454402e-05, + "loss": 0.7634, + "step": 7095 + }, + { + "epoch": 1.231089521165857, + "grad_norm": 1.135785698890686, + "learning_rate": 1.8781182723453303e-05, + "loss": 0.6096, + "step": 7096 + }, + { + "epoch": 1.2312630117973629, + "grad_norm": 1.177046537399292, + "learning_rate": 1.877438220089737e-05, + "loss": 0.5472, + "step": 7097 + }, + { + "epoch": 1.231436502428869, + "grad_norm": 1.013460636138916, + "learning_rate": 1.8767581820575803e-05, + "loss": 0.7561, + "step": 7098 + }, + { + "epoch": 1.2316099930603748, + "grad_norm": 0.8168665766716003, + "learning_rate": 1.8760781583277804e-05, + "loss": 0.8594, + "step": 7099 + }, + { + "epoch": 1.2317834836918806, + "grad_norm": 2.856889009475708, + "learning_rate": 1.875398148979255e-05, + "loss": 0.7791, + "step": 7100 + }, + { + "epoch": 1.2319569743233865, + "grad_norm": 1.2846661806106567, + "learning_rate": 1.8747181540909193e-05, + "loss": 0.6478, + "step": 7101 + }, + { + "epoch": 1.2321304649548925, + "grad_norm": 1.169470191001892, + "learning_rate": 1.874038173741688e-05, + "loss": 0.707, + "step": 7102 + }, + { + "epoch": 1.2323039555863984, + "grad_norm": 1.2500641345977783, + "learning_rate": 1.8733582080104736e-05, + "loss": 0.6018, + "step": 7103 + }, + { + "epoch": 1.2324774462179042, + "grad_norm": 1.0091335773468018, + "learning_rate": 1.8726782569761864e-05, + "loss": 0.7048, + "step": 7104 + }, + { + "epoch": 1.23265093684941, + "grad_norm": 0.8624663949012756, + "learning_rate": 1.8719983207177358e-05, + "loss": 0.834, + "step": 7105 + }, + { + "epoch": 1.232824427480916, + "grad_norm": 0.9266715049743652, + "learning_rate": 1.87131839931403e-05, + "loss": 0.7212, + "step": 7106 + }, + { + "epoch": 1.232997918112422, + "grad_norm": 0.8897729516029358, + "learning_rate": 1.8706384928439726e-05, + "loss": 0.7024, + "step": 7107 + }, + { + "epoch": 1.2331714087439278, + "grad_norm": 1.080377459526062, + "learning_rate": 1.8699586013864694e-05, + "loss": 0.5961, + "step": 7108 + }, + { + "epoch": 1.2333448993754337, + "grad_norm": 3.0149197578430176, + "learning_rate": 1.8692787250204228e-05, + "loss": 0.6918, + "step": 7109 + }, + { + "epoch": 1.2335183900069397, + "grad_norm": 0.8102272152900696, + "learning_rate": 1.8685988638247316e-05, + "loss": 0.7937, + "step": 7110 + }, + { + "epoch": 1.2336918806384456, + "grad_norm": 1.0226622819900513, + "learning_rate": 1.8679190178782954e-05, + "loss": 0.6182, + "step": 7111 + }, + { + "epoch": 1.2338653712699514, + "grad_norm": 0.9000120759010315, + "learning_rate": 1.8672391872600108e-05, + "loss": 0.7811, + "step": 7112 + }, + { + "epoch": 1.2340388619014573, + "grad_norm": 0.949752926826477, + "learning_rate": 1.8665593720487743e-05, + "loss": 0.803, + "step": 7113 + }, + { + "epoch": 1.2342123525329631, + "grad_norm": 1.118082046508789, + "learning_rate": 1.8658795723234774e-05, + "loss": 0.6217, + "step": 7114 + }, + { + "epoch": 1.2343858431644692, + "grad_norm": 1.2627627849578857, + "learning_rate": 1.8651997881630125e-05, + "loss": 0.6443, + "step": 7115 + }, + { + "epoch": 1.234559333795975, + "grad_norm": 0.8282992243766785, + "learning_rate": 1.8645200196462698e-05, + "loss": 0.7617, + "step": 7116 + }, + { + "epoch": 1.234732824427481, + "grad_norm": 0.7638682126998901, + "learning_rate": 1.8638402668521356e-05, + "loss": 0.7109, + "step": 7117 + }, + { + "epoch": 1.2349063150589867, + "grad_norm": 0.9909878969192505, + "learning_rate": 1.8631605298594977e-05, + "loss": 0.684, + "step": 7118 + }, + { + "epoch": 1.2350798056904928, + "grad_norm": 1.0381137132644653, + "learning_rate": 1.8624808087472405e-05, + "loss": 0.8188, + "step": 7119 + }, + { + "epoch": 1.2352532963219987, + "grad_norm": 0.8058453798294067, + "learning_rate": 1.8618011035942444e-05, + "loss": 0.5791, + "step": 7120 + }, + { + "epoch": 1.2354267869535045, + "grad_norm": 0.9340435862541199, + "learning_rate": 1.8611214144793917e-05, + "loss": 0.7515, + "step": 7121 + }, + { + "epoch": 1.2356002775850103, + "grad_norm": 0.7576418519020081, + "learning_rate": 1.860441741481561e-05, + "loss": 0.7396, + "step": 7122 + }, + { + "epoch": 1.2357737682165162, + "grad_norm": 1.1328879594802856, + "learning_rate": 1.8597620846796287e-05, + "loss": 0.5724, + "step": 7123 + }, + { + "epoch": 1.2359472588480223, + "grad_norm": 1.0334988832473755, + "learning_rate": 1.8590824441524696e-05, + "loss": 0.7961, + "step": 7124 + }, + { + "epoch": 1.236120749479528, + "grad_norm": 1.3308449983596802, + "learning_rate": 1.8584028199789573e-05, + "loss": 0.6786, + "step": 7125 + }, + { + "epoch": 1.236294240111034, + "grad_norm": 0.8343605995178223, + "learning_rate": 1.8577232122379625e-05, + "loss": 0.5676, + "step": 7126 + }, + { + "epoch": 1.23646773074254, + "grad_norm": 0.9642437100410461, + "learning_rate": 1.8570436210083547e-05, + "loss": 0.6699, + "step": 7127 + }, + { + "epoch": 1.2366412213740459, + "grad_norm": 0.8406843543052673, + "learning_rate": 1.8563640463690015e-05, + "loss": 0.6697, + "step": 7128 + }, + { + "epoch": 1.2368147120055517, + "grad_norm": 0.9044820070266724, + "learning_rate": 1.8556844883987682e-05, + "loss": 0.6871, + "step": 7129 + }, + { + "epoch": 1.2369882026370576, + "grad_norm": 0.8109055161476135, + "learning_rate": 1.8550049471765176e-05, + "loss": 0.8706, + "step": 7130 + }, + { + "epoch": 1.2371616932685634, + "grad_norm": 1.0079851150512695, + "learning_rate": 1.854325422781112e-05, + "loss": 0.783, + "step": 7131 + }, + { + "epoch": 1.2373351839000695, + "grad_norm": 0.9107648730278015, + "learning_rate": 1.853645915291412e-05, + "loss": 0.6691, + "step": 7132 + }, + { + "epoch": 1.2375086745315753, + "grad_norm": 1.6369155645370483, + "learning_rate": 1.8529664247862726e-05, + "loss": 0.7039, + "step": 7133 + }, + { + "epoch": 1.2376821651630812, + "grad_norm": 0.7198823094367981, + "learning_rate": 1.8522869513445515e-05, + "loss": 0.6646, + "step": 7134 + }, + { + "epoch": 1.237855655794587, + "grad_norm": 0.9255663156509399, + "learning_rate": 1.851607495045102e-05, + "loss": 0.5906, + "step": 7135 + }, + { + "epoch": 1.238029146426093, + "grad_norm": 1.2681900262832642, + "learning_rate": 1.850928055966775e-05, + "loss": 0.5491, + "step": 7136 + }, + { + "epoch": 1.238202637057599, + "grad_norm": 1.086580753326416, + "learning_rate": 1.8502486341884215e-05, + "loss": 0.7352, + "step": 7137 + }, + { + "epoch": 1.2383761276891048, + "grad_norm": 0.9631519913673401, + "learning_rate": 1.8495692297888885e-05, + "loss": 0.6064, + "step": 7138 + }, + { + "epoch": 1.2385496183206106, + "grad_norm": 1.2342954874038696, + "learning_rate": 1.8488898428470213e-05, + "loss": 0.6873, + "step": 7139 + }, + { + "epoch": 1.2387231089521167, + "grad_norm": 0.8956231474876404, + "learning_rate": 1.848210473441664e-05, + "loss": 0.6302, + "step": 7140 + }, + { + "epoch": 1.2388965995836225, + "grad_norm": 0.9687951803207397, + "learning_rate": 1.8475311216516576e-05, + "loss": 0.6914, + "step": 7141 + }, + { + "epoch": 1.2390700902151284, + "grad_norm": 0.6721723079681396, + "learning_rate": 1.8468517875558433e-05, + "loss": 0.7322, + "step": 7142 + }, + { + "epoch": 1.2392435808466342, + "grad_norm": 0.9201697111129761, + "learning_rate": 1.8461724712330567e-05, + "loss": 0.766, + "step": 7143 + }, + { + "epoch": 1.2394170714781403, + "grad_norm": 0.8394030332565308, + "learning_rate": 1.845493172762134e-05, + "loss": 0.7391, + "step": 7144 + }, + { + "epoch": 1.2395905621096461, + "grad_norm": 1.7839723825454712, + "learning_rate": 1.8448138922219093e-05, + "loss": 0.6973, + "step": 7145 + }, + { + "epoch": 1.239764052741152, + "grad_norm": 1.1122424602508545, + "learning_rate": 1.8441346296912128e-05, + "loss": 0.5751, + "step": 7146 + }, + { + "epoch": 1.2399375433726578, + "grad_norm": 1.5597155094146729, + "learning_rate": 1.843455385248874e-05, + "loss": 0.7255, + "step": 7147 + }, + { + "epoch": 1.2401110340041637, + "grad_norm": 1.208731770515442, + "learning_rate": 1.8427761589737203e-05, + "loss": 0.5665, + "step": 7148 + }, + { + "epoch": 1.2402845246356697, + "grad_norm": 0.8989875912666321, + "learning_rate": 1.8420969509445764e-05, + "loss": 0.7229, + "step": 7149 + }, + { + "epoch": 1.2404580152671756, + "grad_norm": 1.0319335460662842, + "learning_rate": 1.8414177612402657e-05, + "loss": 0.7124, + "step": 7150 + }, + { + "epoch": 1.2406315058986814, + "grad_norm": 0.9463761448860168, + "learning_rate": 1.8407385899396085e-05, + "loss": 0.7888, + "step": 7151 + }, + { + "epoch": 1.2408049965301873, + "grad_norm": 1.0970789194107056, + "learning_rate": 1.8400594371214234e-05, + "loss": 0.642, + "step": 7152 + }, + { + "epoch": 1.2409784871616933, + "grad_norm": 1.0302538871765137, + "learning_rate": 1.8393803028645267e-05, + "loss": 0.6393, + "step": 7153 + }, + { + "epoch": 1.2411519777931992, + "grad_norm": 1.2533684968948364, + "learning_rate": 1.8387011872477338e-05, + "loss": 0.7037, + "step": 7154 + }, + { + "epoch": 1.241325468424705, + "grad_norm": 1.0823200941085815, + "learning_rate": 1.8380220903498554e-05, + "loss": 0.6427, + "step": 7155 + }, + { + "epoch": 1.2414989590562109, + "grad_norm": 0.9605312943458557, + "learning_rate": 1.8373430122497022e-05, + "loss": 0.7593, + "step": 7156 + }, + { + "epoch": 1.241672449687717, + "grad_norm": 0.8815540671348572, + "learning_rate": 1.836663953026082e-05, + "loss": 0.6199, + "step": 7157 + }, + { + "epoch": 1.2418459403192228, + "grad_norm": 1.0092716217041016, + "learning_rate": 1.835984912757801e-05, + "loss": 0.8862, + "step": 7158 + }, + { + "epoch": 1.2420194309507286, + "grad_norm": 1.0213440656661987, + "learning_rate": 1.8353058915236613e-05, + "loss": 0.6565, + "step": 7159 + }, + { + "epoch": 1.2421929215822345, + "grad_norm": 1.8007724285125732, + "learning_rate": 1.8346268894024644e-05, + "loss": 0.7139, + "step": 7160 + }, + { + "epoch": 1.2423664122137406, + "grad_norm": 0.9364546537399292, + "learning_rate": 1.833947906473011e-05, + "loss": 0.6311, + "step": 7161 + }, + { + "epoch": 1.2425399028452464, + "grad_norm": 1.0564968585968018, + "learning_rate": 1.8332689428140956e-05, + "loss": 0.6711, + "step": 7162 + }, + { + "epoch": 1.2427133934767522, + "grad_norm": 0.7355627417564392, + "learning_rate": 1.8325899985045135e-05, + "loss": 0.7411, + "step": 7163 + }, + { + "epoch": 1.242886884108258, + "grad_norm": 0.8379045128822327, + "learning_rate": 1.831911073623058e-05, + "loss": 0.6859, + "step": 7164 + }, + { + "epoch": 1.243060374739764, + "grad_norm": 0.8332933187484741, + "learning_rate": 1.8312321682485178e-05, + "loss": 0.655, + "step": 7165 + }, + { + "epoch": 1.24323386537127, + "grad_norm": 0.8423359990119934, + "learning_rate": 1.830553282459681e-05, + "loss": 0.7546, + "step": 7166 + }, + { + "epoch": 1.2434073560027759, + "grad_norm": 1.032071590423584, + "learning_rate": 1.8298744163353338e-05, + "loss": 0.925, + "step": 7167 + }, + { + "epoch": 1.2435808466342817, + "grad_norm": 2.032209873199463, + "learning_rate": 1.8291955699542584e-05, + "loss": 0.8276, + "step": 7168 + }, + { + "epoch": 1.2437543372657878, + "grad_norm": 0.8779877424240112, + "learning_rate": 1.828516743395236e-05, + "loss": 0.667, + "step": 7169 + }, + { + "epoch": 1.2439278278972936, + "grad_norm": 1.421433925628662, + "learning_rate": 1.8278379367370448e-05, + "loss": 0.6348, + "step": 7170 + }, + { + "epoch": 1.2441013185287995, + "grad_norm": 0.9636150598526001, + "learning_rate": 1.8271591500584625e-05, + "loss": 0.7271, + "step": 7171 + }, + { + "epoch": 1.2442748091603053, + "grad_norm": 1.052131175994873, + "learning_rate": 1.8264803834382622e-05, + "loss": 0.7415, + "step": 7172 + }, + { + "epoch": 1.2444482997918112, + "grad_norm": 1.1324986219406128, + "learning_rate": 1.825801636955215e-05, + "loss": 0.5994, + "step": 7173 + }, + { + "epoch": 1.2446217904233172, + "grad_norm": 1.4779952764511108, + "learning_rate": 1.8251229106880916e-05, + "loss": 0.6089, + "step": 7174 + }, + { + "epoch": 1.244795281054823, + "grad_norm": 0.9863800406455994, + "learning_rate": 1.8244442047156577e-05, + "loss": 0.822, + "step": 7175 + }, + { + "epoch": 1.244968771686329, + "grad_norm": 0.9942257404327393, + "learning_rate": 1.8237655191166785e-05, + "loss": 0.6968, + "step": 7176 + }, + { + "epoch": 1.2451422623178348, + "grad_norm": 0.8994476199150085, + "learning_rate": 1.8230868539699166e-05, + "loss": 0.7085, + "step": 7177 + }, + { + "epoch": 1.2453157529493408, + "grad_norm": 0.8978316783905029, + "learning_rate": 1.8224082093541306e-05, + "loss": 0.6425, + "step": 7178 + }, + { + "epoch": 1.2454892435808467, + "grad_norm": 0.8790307641029358, + "learning_rate": 1.821729585348079e-05, + "loss": 0.5842, + "step": 7179 + }, + { + "epoch": 1.2456627342123525, + "grad_norm": 0.9833171963691711, + "learning_rate": 1.8210509820305174e-05, + "loss": 0.6846, + "step": 7180 + }, + { + "epoch": 1.2458362248438584, + "grad_norm": 1.047590732574463, + "learning_rate": 1.8203723994801974e-05, + "loss": 0.6992, + "step": 7181 + }, + { + "epoch": 1.2460097154753642, + "grad_norm": 0.9310769438743591, + "learning_rate": 1.8196938377758696e-05, + "loss": 0.7008, + "step": 7182 + }, + { + "epoch": 1.2461832061068703, + "grad_norm": 0.6472421288490295, + "learning_rate": 1.819015296996282e-05, + "loss": 0.7708, + "step": 7183 + }, + { + "epoch": 1.2463566967383761, + "grad_norm": 0.9580670595169067, + "learning_rate": 1.818336777220181e-05, + "loss": 0.6967, + "step": 7184 + }, + { + "epoch": 1.246530187369882, + "grad_norm": 0.9030908942222595, + "learning_rate": 1.817658278526308e-05, + "loss": 0.5447, + "step": 7185 + }, + { + "epoch": 1.246703678001388, + "grad_norm": 1.7859220504760742, + "learning_rate": 1.8169798009934038e-05, + "loss": 0.6322, + "step": 7186 + }, + { + "epoch": 1.2468771686328939, + "grad_norm": 1.0738788843154907, + "learning_rate": 1.816301344700208e-05, + "loss": 0.5736, + "step": 7187 + }, + { + "epoch": 1.2470506592643997, + "grad_norm": 1.4733651876449585, + "learning_rate": 1.815622909725454e-05, + "loss": 0.7286, + "step": 7188 + }, + { + "epoch": 1.2472241498959056, + "grad_norm": 1.5198785066604614, + "learning_rate": 1.8149444961478767e-05, + "loss": 0.6978, + "step": 7189 + }, + { + "epoch": 1.2473976405274114, + "grad_norm": 1.8485901355743408, + "learning_rate": 1.8142661040462068e-05, + "loss": 0.7013, + "step": 7190 + }, + { + "epoch": 1.2475711311589175, + "grad_norm": 1.2864807844161987, + "learning_rate": 1.8135877334991713e-05, + "loss": 0.667, + "step": 7191 + }, + { + "epoch": 1.2477446217904233, + "grad_norm": 1.0022995471954346, + "learning_rate": 1.8129093845854965e-05, + "loss": 0.6993, + "step": 7192 + }, + { + "epoch": 1.2479181124219292, + "grad_norm": 0.8947988152503967, + "learning_rate": 1.8122310573839063e-05, + "loss": 0.7705, + "step": 7193 + }, + { + "epoch": 1.248091603053435, + "grad_norm": 0.9792948365211487, + "learning_rate": 1.81155275197312e-05, + "loss": 0.7385, + "step": 7194 + }, + { + "epoch": 1.248265093684941, + "grad_norm": 1.1802785396575928, + "learning_rate": 1.8108744684318564e-05, + "loss": 0.6129, + "step": 7195 + }, + { + "epoch": 1.248438584316447, + "grad_norm": 1.0757787227630615, + "learning_rate": 1.8101962068388315e-05, + "loss": 0.73, + "step": 7196 + }, + { + "epoch": 1.2486120749479528, + "grad_norm": 0.9558101892471313, + "learning_rate": 1.8095179672727575e-05, + "loss": 0.6309, + "step": 7197 + }, + { + "epoch": 1.2487855655794586, + "grad_norm": 0.9179098010063171, + "learning_rate": 1.808839749812346e-05, + "loss": 0.5298, + "step": 7198 + }, + { + "epoch": 1.2489590562109647, + "grad_norm": 0.9532437324523926, + "learning_rate": 1.8081615545363035e-05, + "loss": 0.7085, + "step": 7199 + }, + { + "epoch": 1.2491325468424705, + "grad_norm": 1.3434770107269287, + "learning_rate": 1.807483381523337e-05, + "loss": 0.6425, + "step": 7200 + }, + { + "epoch": 1.2493060374739764, + "grad_norm": 0.7468249201774597, + "learning_rate": 1.8068052308521474e-05, + "loss": 0.7571, + "step": 7201 + }, + { + "epoch": 1.2494795281054822, + "grad_norm": 0.8346254229545593, + "learning_rate": 1.8061271026014364e-05, + "loss": 0.5392, + "step": 7202 + }, + { + "epoch": 1.2496530187369883, + "grad_norm": 1.0284264087677002, + "learning_rate": 1.8054489968499018e-05, + "loss": 0.5247, + "step": 7203 + }, + { + "epoch": 1.2498265093684942, + "grad_norm": 3.7614479064941406, + "learning_rate": 1.8047709136762368e-05, + "loss": 0.6564, + "step": 7204 + }, + { + "epoch": 1.25, + "grad_norm": 0.8937176465988159, + "learning_rate": 1.8040928531591347e-05, + "loss": 0.72, + "step": 7205 + }, + { + "epoch": 1.2501734906315058, + "grad_norm": 0.9378701448440552, + "learning_rate": 1.8034148153772864e-05, + "loss": 0.6039, + "step": 7206 + }, + { + "epoch": 1.2503469812630117, + "grad_norm": 1.2754393815994263, + "learning_rate": 1.8027368004093766e-05, + "loss": 0.6099, + "step": 7207 + }, + { + "epoch": 1.2505204718945178, + "grad_norm": 1.1329644918441772, + "learning_rate": 1.8020588083340912e-05, + "loss": 0.7261, + "step": 7208 + }, + { + "epoch": 1.2506939625260236, + "grad_norm": 0.8676601052284241, + "learning_rate": 1.801380839230113e-05, + "loss": 0.6831, + "step": 7209 + }, + { + "epoch": 1.2508674531575295, + "grad_norm": 0.9615556597709656, + "learning_rate": 1.8007028931761184e-05, + "loss": 0.6698, + "step": 7210 + }, + { + "epoch": 1.2510409437890355, + "grad_norm": 0.9375060796737671, + "learning_rate": 1.8000249702507854e-05, + "loss": 0.7091, + "step": 7211 + }, + { + "epoch": 1.2512144344205414, + "grad_norm": 0.9629554152488708, + "learning_rate": 1.7993470705327877e-05, + "loss": 0.8638, + "step": 7212 + }, + { + "epoch": 1.2513879250520472, + "grad_norm": 1.4969462156295776, + "learning_rate": 1.798669194100797e-05, + "loss": 0.5951, + "step": 7213 + }, + { + "epoch": 1.251561415683553, + "grad_norm": 1.1308945417404175, + "learning_rate": 1.79799134103348e-05, + "loss": 0.6295, + "step": 7214 + }, + { + "epoch": 1.251734906315059, + "grad_norm": 0.8289528489112854, + "learning_rate": 1.7973135114095038e-05, + "loss": 0.7283, + "step": 7215 + }, + { + "epoch": 1.2519083969465647, + "grad_norm": 0.8897553086280823, + "learning_rate": 1.7966357053075312e-05, + "loss": 0.7201, + "step": 7216 + }, + { + "epoch": 1.2520818875780708, + "grad_norm": 0.8658400177955627, + "learning_rate": 1.795957922806221e-05, + "loss": 0.6519, + "step": 7217 + }, + { + "epoch": 1.2522553782095767, + "grad_norm": 1.7833908796310425, + "learning_rate": 1.795280163984232e-05, + "loss": 0.7847, + "step": 7218 + }, + { + "epoch": 1.2524288688410825, + "grad_norm": 1.3249858617782593, + "learning_rate": 1.7946024289202188e-05, + "loss": 0.7322, + "step": 7219 + }, + { + "epoch": 1.2526023594725886, + "grad_norm": 0.9739170074462891, + "learning_rate": 1.7939247176928328e-05, + "loss": 0.6102, + "step": 7220 + }, + { + "epoch": 1.2527758501040944, + "grad_norm": 1.2435564994812012, + "learning_rate": 1.793247030380723e-05, + "loss": 0.7734, + "step": 7221 + }, + { + "epoch": 1.2529493407356003, + "grad_norm": 1.2119343280792236, + "learning_rate": 1.792569367062537e-05, + "loss": 0.7738, + "step": 7222 + }, + { + "epoch": 1.2531228313671061, + "grad_norm": 0.9308053851127625, + "learning_rate": 1.7918917278169173e-05, + "loss": 0.7159, + "step": 7223 + }, + { + "epoch": 1.253296321998612, + "grad_norm": 1.1386020183563232, + "learning_rate": 1.791214112722505e-05, + "loss": 0.6824, + "step": 7224 + }, + { + "epoch": 1.253469812630118, + "grad_norm": 0.8041576743125916, + "learning_rate": 1.7905365218579387e-05, + "loss": 0.6649, + "step": 7225 + }, + { + "epoch": 1.2536433032616239, + "grad_norm": 1.1738430261611938, + "learning_rate": 1.7898589553018523e-05, + "loss": 0.7079, + "step": 7226 + }, + { + "epoch": 1.2538167938931297, + "grad_norm": 0.7398790121078491, + "learning_rate": 1.7891814131328795e-05, + "loss": 0.8066, + "step": 7227 + }, + { + "epoch": 1.2539902845246358, + "grad_norm": 1.2997254133224487, + "learning_rate": 1.788503895429649e-05, + "loss": 0.645, + "step": 7228 + }, + { + "epoch": 1.2541637751561416, + "grad_norm": 1.771164894104004, + "learning_rate": 1.787826402270789e-05, + "loss": 0.7249, + "step": 7229 + }, + { + "epoch": 1.2543372657876475, + "grad_norm": 0.9473429322242737, + "learning_rate": 1.7871489337349208e-05, + "loss": 0.5745, + "step": 7230 + }, + { + "epoch": 1.2545107564191533, + "grad_norm": 1.3012268543243408, + "learning_rate": 1.7864714899006672e-05, + "loss": 0.7645, + "step": 7231 + }, + { + "epoch": 1.2546842470506592, + "grad_norm": 1.8051451444625854, + "learning_rate": 1.785794070846647e-05, + "loss": 0.5819, + "step": 7232 + }, + { + "epoch": 1.2548577376821652, + "grad_norm": 1.0018703937530518, + "learning_rate": 1.7851166766514737e-05, + "loss": 0.6423, + "step": 7233 + }, + { + "epoch": 1.255031228313671, + "grad_norm": 0.6814323663711548, + "learning_rate": 1.78443930739376e-05, + "loss": 0.7637, + "step": 7234 + }, + { + "epoch": 1.255204718945177, + "grad_norm": 1.116615653038025, + "learning_rate": 1.783761963152117e-05, + "loss": 0.6503, + "step": 7235 + }, + { + "epoch": 1.2553782095766828, + "grad_norm": 1.208640694618225, + "learning_rate": 1.7830846440051493e-05, + "loss": 0.7512, + "step": 7236 + }, + { + "epoch": 1.2555517002081888, + "grad_norm": 1.0058035850524902, + "learning_rate": 1.7824073500314614e-05, + "loss": 0.821, + "step": 7237 + }, + { + "epoch": 1.2557251908396947, + "grad_norm": 0.7908157110214233, + "learning_rate": 1.7817300813096548e-05, + "loss": 0.7239, + "step": 7238 + }, + { + "epoch": 1.2558986814712005, + "grad_norm": 1.5170058012008667, + "learning_rate": 1.7810528379183262e-05, + "loss": 0.7302, + "step": 7239 + }, + { + "epoch": 1.2560721721027064, + "grad_norm": 0.788485050201416, + "learning_rate": 1.7803756199360704e-05, + "loss": 0.6956, + "step": 7240 + }, + { + "epoch": 1.2562456627342122, + "grad_norm": 1.0328097343444824, + "learning_rate": 1.7796984274414797e-05, + "loss": 0.6594, + "step": 7241 + }, + { + "epoch": 1.2564191533657183, + "grad_norm": 0.9331695437431335, + "learning_rate": 1.7790212605131448e-05, + "loss": 0.6401, + "step": 7242 + }, + { + "epoch": 1.2565926439972241, + "grad_norm": 0.8585333228111267, + "learning_rate": 1.7783441192296488e-05, + "loss": 0.6882, + "step": 7243 + }, + { + "epoch": 1.25676613462873, + "grad_norm": 1.289650321006775, + "learning_rate": 1.7776670036695758e-05, + "loss": 0.6274, + "step": 7244 + }, + { + "epoch": 1.256939625260236, + "grad_norm": 1.54393470287323, + "learning_rate": 1.7769899139115066e-05, + "loss": 0.6975, + "step": 7245 + }, + { + "epoch": 1.257113115891742, + "grad_norm": 0.7662222385406494, + "learning_rate": 1.776312850034018e-05, + "loss": 0.7249, + "step": 7246 + }, + { + "epoch": 1.2572866065232478, + "grad_norm": 0.9696910977363586, + "learning_rate": 1.7756358121156835e-05, + "loss": 0.6472, + "step": 7247 + }, + { + "epoch": 1.2574600971547536, + "grad_norm": 0.9469233155250549, + "learning_rate": 1.7749588002350748e-05, + "loss": 0.6276, + "step": 7248 + }, + { + "epoch": 1.2576335877862594, + "grad_norm": 1.183083415031433, + "learning_rate": 1.7742818144707588e-05, + "loss": 0.6244, + "step": 7249 + }, + { + "epoch": 1.2578070784177655, + "grad_norm": 0.8691683411598206, + "learning_rate": 1.7736048549013013e-05, + "loss": 0.8862, + "step": 7250 + }, + { + "epoch": 1.2579805690492714, + "grad_norm": 0.8153529167175293, + "learning_rate": 1.7729279216052652e-05, + "loss": 0.7112, + "step": 7251 + }, + { + "epoch": 1.2581540596807772, + "grad_norm": 1.0654182434082031, + "learning_rate": 1.7722510146612075e-05, + "loss": 0.7191, + "step": 7252 + }, + { + "epoch": 1.2583275503122833, + "grad_norm": 1.2800743579864502, + "learning_rate": 1.771574134147685e-05, + "loss": 0.6331, + "step": 7253 + }, + { + "epoch": 1.2585010409437891, + "grad_norm": 0.8579086065292358, + "learning_rate": 1.770897280143251e-05, + "loss": 0.7341, + "step": 7254 + }, + { + "epoch": 1.258674531575295, + "grad_norm": 2.405527353286743, + "learning_rate": 1.770220452726454e-05, + "loss": 0.6566, + "step": 7255 + }, + { + "epoch": 1.2588480222068008, + "grad_norm": 0.7249985933303833, + "learning_rate": 1.7695436519758412e-05, + "loss": 0.7335, + "step": 7256 + }, + { + "epoch": 1.2590215128383067, + "grad_norm": 1.242544412612915, + "learning_rate": 1.7688668779699562e-05, + "loss": 0.6406, + "step": 7257 + }, + { + "epoch": 1.2591950034698125, + "grad_norm": 2.011195659637451, + "learning_rate": 1.76819013078734e-05, + "loss": 0.6376, + "step": 7258 + }, + { + "epoch": 1.2593684941013186, + "grad_norm": 1.1015033721923828, + "learning_rate": 1.767513410506528e-05, + "loss": 0.606, + "step": 7259 + }, + { + "epoch": 1.2595419847328244, + "grad_norm": 1.523072361946106, + "learning_rate": 1.7668367172060562e-05, + "loss": 0.7097, + "step": 7260 + }, + { + "epoch": 1.2597154753643303, + "grad_norm": 0.9354090690612793, + "learning_rate": 1.766160050964456e-05, + "loss": 0.7756, + "step": 7261 + }, + { + "epoch": 1.2598889659958363, + "grad_norm": 1.0542137622833252, + "learning_rate": 1.765483411860253e-05, + "loss": 0.9602, + "step": 7262 + }, + { + "epoch": 1.2600624566273422, + "grad_norm": 1.3997797966003418, + "learning_rate": 1.7648067999719734e-05, + "loss": 0.679, + "step": 7263 + }, + { + "epoch": 1.260235947258848, + "grad_norm": 0.7511456608772278, + "learning_rate": 1.7641302153781402e-05, + "loss": 0.8259, + "step": 7264 + }, + { + "epoch": 1.2604094378903539, + "grad_norm": 0.9171479940414429, + "learning_rate": 1.7634536581572687e-05, + "loss": 0.6702, + "step": 7265 + }, + { + "epoch": 1.2605829285218597, + "grad_norm": 0.9686594009399414, + "learning_rate": 1.7627771283878764e-05, + "loss": 0.7568, + "step": 7266 + }, + { + "epoch": 1.2607564191533658, + "grad_norm": 1.1589064598083496, + "learning_rate": 1.762100626148475e-05, + "loss": 0.7861, + "step": 7267 + }, + { + "epoch": 1.2609299097848716, + "grad_norm": 0.9227722883224487, + "learning_rate": 1.761424151517573e-05, + "loss": 0.616, + "step": 7268 + }, + { + "epoch": 1.2611034004163775, + "grad_norm": 1.0354520082473755, + "learning_rate": 1.7607477045736758e-05, + "loss": 0.6425, + "step": 7269 + }, + { + "epoch": 1.2612768910478835, + "grad_norm": 1.4837673902511597, + "learning_rate": 1.7600712853952863e-05, + "loss": 0.8289, + "step": 7270 + }, + { + "epoch": 1.2614503816793894, + "grad_norm": 0.8205804228782654, + "learning_rate": 1.7593948940609043e-05, + "loss": 0.7393, + "step": 7271 + }, + { + "epoch": 1.2616238723108952, + "grad_norm": 2.693303108215332, + "learning_rate": 1.7587185306490245e-05, + "loss": 0.7012, + "step": 7272 + }, + { + "epoch": 1.261797362942401, + "grad_norm": 1.5837275981903076, + "learning_rate": 1.758042195238141e-05, + "loss": 0.7109, + "step": 7273 + }, + { + "epoch": 1.261970853573907, + "grad_norm": 0.9741407632827759, + "learning_rate": 1.7573658879067424e-05, + "loss": 0.6079, + "step": 7274 + }, + { + "epoch": 1.2621443442054128, + "grad_norm": 0.9193976521492004, + "learning_rate": 1.756689608733315e-05, + "loss": 0.6632, + "step": 7275 + }, + { + "epoch": 1.2623178348369188, + "grad_norm": 0.779644250869751, + "learning_rate": 1.7560133577963423e-05, + "loss": 0.7009, + "step": 7276 + }, + { + "epoch": 1.2624913254684247, + "grad_norm": 1.0467300415039062, + "learning_rate": 1.7553371351743043e-05, + "loss": 0.7002, + "step": 7277 + }, + { + "epoch": 1.2626648160999305, + "grad_norm": 0.8226938247680664, + "learning_rate": 1.754660940945676e-05, + "loss": 0.7073, + "step": 7278 + }, + { + "epoch": 1.2628383067314366, + "grad_norm": 0.9757381677627563, + "learning_rate": 1.7539847751889314e-05, + "loss": 0.6196, + "step": 7279 + }, + { + "epoch": 1.2630117973629424, + "grad_norm": 0.8395245671272278, + "learning_rate": 1.753308637982541e-05, + "loss": 0.6798, + "step": 7280 + }, + { + "epoch": 1.2631852879944483, + "grad_norm": 0.9730170369148254, + "learning_rate": 1.75263252940497e-05, + "loss": 0.6311, + "step": 7281 + }, + { + "epoch": 1.2633587786259541, + "grad_norm": 0.8657923340797424, + "learning_rate": 1.751956449534682e-05, + "loss": 0.6803, + "step": 7282 + }, + { + "epoch": 1.26353226925746, + "grad_norm": 1.0469084978103638, + "learning_rate": 1.7512803984501385e-05, + "loss": 0.6858, + "step": 7283 + }, + { + "epoch": 1.263705759888966, + "grad_norm": 0.924187183380127, + "learning_rate": 1.7506043762297932e-05, + "loss": 0.6145, + "step": 7284 + }, + { + "epoch": 1.263879250520472, + "grad_norm": 0.9160469770431519, + "learning_rate": 1.7499283829521006e-05, + "loss": 0.5627, + "step": 7285 + }, + { + "epoch": 1.2640527411519777, + "grad_norm": 0.8873293995857239, + "learning_rate": 1.7492524186955108e-05, + "loss": 0.7671, + "step": 7286 + }, + { + "epoch": 1.2642262317834838, + "grad_norm": 0.7107306122779846, + "learning_rate": 1.7485764835384705e-05, + "loss": 0.7534, + "step": 7287 + }, + { + "epoch": 1.2643997224149897, + "grad_norm": 0.8610885739326477, + "learning_rate": 1.7479005775594216e-05, + "loss": 0.771, + "step": 7288 + }, + { + "epoch": 1.2645732130464955, + "grad_norm": 1.0761052370071411, + "learning_rate": 1.7472247008368046e-05, + "loss": 0.653, + "step": 7289 + }, + { + "epoch": 1.2647467036780013, + "grad_norm": 0.7292563915252686, + "learning_rate": 1.746548853449056e-05, + "loss": 0.7791, + "step": 7290 + }, + { + "epoch": 1.2649201943095072, + "grad_norm": 1.2264353036880493, + "learning_rate": 1.7458730354746077e-05, + "loss": 0.7456, + "step": 7291 + }, + { + "epoch": 1.2650936849410133, + "grad_norm": 1.4271624088287354, + "learning_rate": 1.74519724699189e-05, + "loss": 0.7111, + "step": 7292 + }, + { + "epoch": 1.265267175572519, + "grad_norm": 0.7449595332145691, + "learning_rate": 1.7445214880793287e-05, + "loss": 0.729, + "step": 7293 + }, + { + "epoch": 1.265440666204025, + "grad_norm": 0.8854461908340454, + "learning_rate": 1.7438457588153466e-05, + "loss": 0.6874, + "step": 7294 + }, + { + "epoch": 1.2656141568355308, + "grad_norm": 0.9280667901039124, + "learning_rate": 1.7431700592783622e-05, + "loss": 0.5317, + "step": 7295 + }, + { + "epoch": 1.2657876474670369, + "grad_norm": 0.8157984614372253, + "learning_rate": 1.742494389546792e-05, + "loss": 0.7625, + "step": 7296 + }, + { + "epoch": 1.2659611380985427, + "grad_norm": 0.7768906354904175, + "learning_rate": 1.7418187496990476e-05, + "loss": 0.7471, + "step": 7297 + }, + { + "epoch": 1.2661346287300486, + "grad_norm": 0.8647959232330322, + "learning_rate": 1.7411431398135384e-05, + "loss": 0.5413, + "step": 7298 + }, + { + "epoch": 1.2663081193615544, + "grad_norm": 0.755017876625061, + "learning_rate": 1.7404675599686697e-05, + "loss": 0.772, + "step": 7299 + }, + { + "epoch": 1.2664816099930603, + "grad_norm": 0.9681296348571777, + "learning_rate": 1.739792010242843e-05, + "loss": 0.7297, + "step": 7300 + }, + { + "epoch": 1.2666551006245663, + "grad_norm": 1.2521724700927734, + "learning_rate": 1.7391164907144563e-05, + "loss": 0.7104, + "step": 7301 + }, + { + "epoch": 1.2668285912560722, + "grad_norm": 0.7951853275299072, + "learning_rate": 1.738441001461905e-05, + "loss": 0.7279, + "step": 7302 + }, + { + "epoch": 1.267002081887578, + "grad_norm": 0.967841386795044, + "learning_rate": 1.7377655425635813e-05, + "loss": 0.6045, + "step": 7303 + }, + { + "epoch": 1.267175572519084, + "grad_norm": 1.083212971687317, + "learning_rate": 1.7370901140978706e-05, + "loss": 0.575, + "step": 7304 + }, + { + "epoch": 1.26734906315059, + "grad_norm": 1.0246564149856567, + "learning_rate": 1.7364147161431585e-05, + "loss": 0.6632, + "step": 7305 + }, + { + "epoch": 1.2675225537820958, + "grad_norm": 1.4003570079803467, + "learning_rate": 1.735739348777827e-05, + "loss": 0.8408, + "step": 7306 + }, + { + "epoch": 1.2676960444136016, + "grad_norm": 1.0730516910552979, + "learning_rate": 1.735064012080251e-05, + "loss": 0.676, + "step": 7307 + }, + { + "epoch": 1.2678695350451075, + "grad_norm": 0.9693717360496521, + "learning_rate": 1.734388706128805e-05, + "loss": 0.687, + "step": 7308 + }, + { + "epoch": 1.2680430256766135, + "grad_norm": 0.8211718201637268, + "learning_rate": 1.73371343100186e-05, + "loss": 0.7059, + "step": 7309 + }, + { + "epoch": 1.2682165163081194, + "grad_norm": 0.9057807922363281, + "learning_rate": 1.7330381867777808e-05, + "loss": 0.582, + "step": 7310 + }, + { + "epoch": 1.2683900069396252, + "grad_norm": 0.7685960531234741, + "learning_rate": 1.7323629735349313e-05, + "loss": 0.8098, + "step": 7311 + }, + { + "epoch": 1.2685634975711313, + "grad_norm": 1.034140944480896, + "learning_rate": 1.731687791351671e-05, + "loss": 0.6882, + "step": 7312 + }, + { + "epoch": 1.2687369882026371, + "grad_norm": 0.7149333953857422, + "learning_rate": 1.7310126403063545e-05, + "loss": 0.8066, + "step": 7313 + }, + { + "epoch": 1.268910478834143, + "grad_norm": 1.3766943216323853, + "learning_rate": 1.730337520477335e-05, + "loss": 0.668, + "step": 7314 + }, + { + "epoch": 1.2690839694656488, + "grad_norm": 1.0722219944000244, + "learning_rate": 1.7296624319429598e-05, + "loss": 0.7242, + "step": 7315 + }, + { + "epoch": 1.2692574600971547, + "grad_norm": 1.085383415222168, + "learning_rate": 1.7289873747815755e-05, + "loss": 0.6825, + "step": 7316 + }, + { + "epoch": 1.2694309507286605, + "grad_norm": 1.3181840181350708, + "learning_rate": 1.7283123490715213e-05, + "loss": 0.6199, + "step": 7317 + }, + { + "epoch": 1.2696044413601666, + "grad_norm": 0.8751301169395447, + "learning_rate": 1.7276373548911355e-05, + "loss": 0.6658, + "step": 7318 + }, + { + "epoch": 1.2697779319916724, + "grad_norm": 1.108463168144226, + "learning_rate": 1.7269623923187522e-05, + "loss": 0.7473, + "step": 7319 + }, + { + "epoch": 1.2699514226231783, + "grad_norm": 1.8801891803741455, + "learning_rate": 1.7262874614327016e-05, + "loss": 0.6843, + "step": 7320 + }, + { + "epoch": 1.2701249132546844, + "grad_norm": 0.7803223729133606, + "learning_rate": 1.72561256231131e-05, + "loss": 0.8293, + "step": 7321 + }, + { + "epoch": 1.2702984038861902, + "grad_norm": 1.0703188180923462, + "learning_rate": 1.7249376950329004e-05, + "loss": 0.8181, + "step": 7322 + }, + { + "epoch": 1.270471894517696, + "grad_norm": 2.3461406230926514, + "learning_rate": 1.7242628596757916e-05, + "loss": 0.9417, + "step": 7323 + }, + { + "epoch": 1.270645385149202, + "grad_norm": 1.0376940965652466, + "learning_rate": 1.7235880563182988e-05, + "loss": 0.7524, + "step": 7324 + }, + { + "epoch": 1.2708188757807077, + "grad_norm": 1.1363784074783325, + "learning_rate": 1.7229132850387352e-05, + "loss": 0.7654, + "step": 7325 + }, + { + "epoch": 1.2709923664122138, + "grad_norm": 0.9572004079818726, + "learning_rate": 1.7222385459154072e-05, + "loss": 0.7347, + "step": 7326 + }, + { + "epoch": 1.2711658570437196, + "grad_norm": 1.085919976234436, + "learning_rate": 1.721563839026619e-05, + "loss": 0.6499, + "step": 7327 + }, + { + "epoch": 1.2713393476752255, + "grad_norm": 0.8625211119651794, + "learning_rate": 1.720889164450672e-05, + "loss": 0.7709, + "step": 7328 + }, + { + "epoch": 1.2715128383067316, + "grad_norm": 0.8784975409507751, + "learning_rate": 1.720214522265864e-05, + "loss": 0.7065, + "step": 7329 + }, + { + "epoch": 1.2716863289382374, + "grad_norm": 0.7598005533218384, + "learning_rate": 1.7195399125504853e-05, + "loss": 0.687, + "step": 7330 + }, + { + "epoch": 1.2718598195697433, + "grad_norm": 0.7725282907485962, + "learning_rate": 1.7188653353828266e-05, + "loss": 0.7333, + "step": 7331 + }, + { + "epoch": 1.272033310201249, + "grad_norm": 1.3108631372451782, + "learning_rate": 1.7181907908411744e-05, + "loss": 0.7181, + "step": 7332 + }, + { + "epoch": 1.272206800832755, + "grad_norm": 1.1820404529571533, + "learning_rate": 1.7175162790038086e-05, + "loss": 0.7556, + "step": 7333 + }, + { + "epoch": 1.2723802914642608, + "grad_norm": 1.222381830215454, + "learning_rate": 1.716841799949007e-05, + "loss": 0.5964, + "step": 7334 + }, + { + "epoch": 1.2725537820957669, + "grad_norm": 1.1475998163223267, + "learning_rate": 1.716167353755046e-05, + "loss": 0.6302, + "step": 7335 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.4456623792648315, + "learning_rate": 1.7154929405001936e-05, + "loss": 0.5861, + "step": 7336 + }, + { + "epoch": 1.2729007633587786, + "grad_norm": 1.301460862159729, + "learning_rate": 1.7148185602627166e-05, + "loss": 0.6997, + "step": 7337 + }, + { + "epoch": 1.2730742539902846, + "grad_norm": 0.9453331828117371, + "learning_rate": 1.7141442131208788e-05, + "loss": 0.6117, + "step": 7338 + }, + { + "epoch": 1.2732477446217905, + "grad_norm": 1.1131048202514648, + "learning_rate": 1.7134698991529373e-05, + "loss": 0.5743, + "step": 7339 + }, + { + "epoch": 1.2734212352532963, + "grad_norm": 0.6404421925544739, + "learning_rate": 1.712795618437148e-05, + "loss": 0.832, + "step": 7340 + }, + { + "epoch": 1.2735947258848022, + "grad_norm": 1.2087706327438354, + "learning_rate": 1.7121213710517616e-05, + "loss": 0.5685, + "step": 7341 + }, + { + "epoch": 1.273768216516308, + "grad_norm": 2.8402440547943115, + "learning_rate": 1.7114471570750266e-05, + "loss": 0.5402, + "step": 7342 + }, + { + "epoch": 1.273941707147814, + "grad_norm": 0.9312517046928406, + "learning_rate": 1.7107729765851847e-05, + "loss": 0.8318, + "step": 7343 + }, + { + "epoch": 1.27411519777932, + "grad_norm": 1.6928627490997314, + "learning_rate": 1.7100988296604756e-05, + "loss": 0.7109, + "step": 7344 + }, + { + "epoch": 1.2742886884108258, + "grad_norm": 0.8369051814079285, + "learning_rate": 1.7094247163791352e-05, + "loss": 0.577, + "step": 7345 + }, + { + "epoch": 1.2744621790423318, + "grad_norm": 0.9837419986724854, + "learning_rate": 1.708750636819395e-05, + "loss": 0.7839, + "step": 7346 + }, + { + "epoch": 1.2746356696738377, + "grad_norm": 0.7982485890388489, + "learning_rate": 1.7080765910594833e-05, + "loss": 0.7147, + "step": 7347 + }, + { + "epoch": 1.2748091603053435, + "grad_norm": 1.47170090675354, + "learning_rate": 1.7074025791776232e-05, + "loss": 0.646, + "step": 7348 + }, + { + "epoch": 1.2749826509368494, + "grad_norm": 1.0544430017471313, + "learning_rate": 1.7067286012520343e-05, + "loss": 0.6956, + "step": 7349 + }, + { + "epoch": 1.2751561415683552, + "grad_norm": 0.7858167290687561, + "learning_rate": 1.706054657360933e-05, + "loss": 0.6843, + "step": 7350 + }, + { + "epoch": 1.2753296321998613, + "grad_norm": 0.8236578702926636, + "learning_rate": 1.705380747582532e-05, + "loss": 0.6858, + "step": 7351 + }, + { + "epoch": 1.2755031228313671, + "grad_norm": 1.2308380603790283, + "learning_rate": 1.704706871995038e-05, + "loss": 0.6824, + "step": 7352 + }, + { + "epoch": 1.275676613462873, + "grad_norm": 0.9892506003379822, + "learning_rate": 1.704033030676655e-05, + "loss": 0.786, + "step": 7353 + }, + { + "epoch": 1.2758501040943788, + "grad_norm": 1.6927263736724854, + "learning_rate": 1.703359223705585e-05, + "loss": 0.5916, + "step": 7354 + }, + { + "epoch": 1.276023594725885, + "grad_norm": 1.147552251815796, + "learning_rate": 1.7026854511600218e-05, + "loss": 0.6278, + "step": 7355 + }, + { + "epoch": 1.2761970853573907, + "grad_norm": 0.9135046601295471, + "learning_rate": 1.7020117131181585e-05, + "loss": 0.7722, + "step": 7356 + }, + { + "epoch": 1.2763705759888966, + "grad_norm": 1.0044752359390259, + "learning_rate": 1.7013380096581828e-05, + "loss": 0.6135, + "step": 7357 + }, + { + "epoch": 1.2765440666204024, + "grad_norm": 0.6548492908477783, + "learning_rate": 1.70066434085828e-05, + "loss": 0.8308, + "step": 7358 + }, + { + "epoch": 1.2767175572519083, + "grad_norm": 0.8124057054519653, + "learning_rate": 1.6999907067966285e-05, + "loss": 0.6702, + "step": 7359 + }, + { + "epoch": 1.2768910478834143, + "grad_norm": 0.7332319617271423, + "learning_rate": 1.6993171075514054e-05, + "loss": 0.6621, + "step": 7360 + }, + { + "epoch": 1.2770645385149202, + "grad_norm": 0.8131732940673828, + "learning_rate": 1.6986435432007826e-05, + "loss": 0.575, + "step": 7361 + }, + { + "epoch": 1.277238029146426, + "grad_norm": 0.7829189896583557, + "learning_rate": 1.697970013822927e-05, + "loss": 0.736, + "step": 7362 + }, + { + "epoch": 1.277411519777932, + "grad_norm": 0.8020564913749695, + "learning_rate": 1.6972965194960034e-05, + "loss": 0.7722, + "step": 7363 + }, + { + "epoch": 1.277585010409438, + "grad_norm": 0.6803737282752991, + "learning_rate": 1.6966230602981727e-05, + "loss": 0.7505, + "step": 7364 + }, + { + "epoch": 1.2777585010409438, + "grad_norm": 0.9168761372566223, + "learning_rate": 1.695949636307588e-05, + "loss": 0.6398, + "step": 7365 + }, + { + "epoch": 1.2779319916724496, + "grad_norm": 0.8363815546035767, + "learning_rate": 1.6952762476024023e-05, + "loss": 0.7979, + "step": 7366 + }, + { + "epoch": 1.2781054823039555, + "grad_norm": 1.2496986389160156, + "learning_rate": 1.694602894260764e-05, + "loss": 0.6515, + "step": 7367 + }, + { + "epoch": 1.2782789729354616, + "grad_norm": 1.2530725002288818, + "learning_rate": 1.6939295763608146e-05, + "loss": 0.6915, + "step": 7368 + }, + { + "epoch": 1.2784524635669674, + "grad_norm": 0.8217654228210449, + "learning_rate": 1.6932562939806952e-05, + "loss": 0.7749, + "step": 7369 + }, + { + "epoch": 1.2786259541984732, + "grad_norm": 0.9112854599952698, + "learning_rate": 1.6925830471985398e-05, + "loss": 0.7297, + "step": 7370 + }, + { + "epoch": 1.2787994448299793, + "grad_norm": 0.8864389061927795, + "learning_rate": 1.6919098360924804e-05, + "loss": 0.7581, + "step": 7371 + }, + { + "epoch": 1.2789729354614852, + "grad_norm": 1.1927157640457153, + "learning_rate": 1.6912366607406433e-05, + "loss": 0.7938, + "step": 7372 + }, + { + "epoch": 1.279146426092991, + "grad_norm": 1.3114755153656006, + "learning_rate": 1.6905635212211517e-05, + "loss": 0.5525, + "step": 7373 + }, + { + "epoch": 1.2793199167244969, + "grad_norm": 0.9929764866828918, + "learning_rate": 1.6898904176121246e-05, + "loss": 0.5784, + "step": 7374 + }, + { + "epoch": 1.2794934073560027, + "grad_norm": 0.8106144666671753, + "learning_rate": 1.6892173499916752e-05, + "loss": 0.7207, + "step": 7375 + }, + { + "epoch": 1.2796668979875085, + "grad_norm": 0.8984010219573975, + "learning_rate": 1.688544318437914e-05, + "loss": 0.6688, + "step": 7376 + }, + { + "epoch": 1.2798403886190146, + "grad_norm": 0.7469072937965393, + "learning_rate": 1.687871323028949e-05, + "loss": 0.822, + "step": 7377 + }, + { + "epoch": 1.2800138792505205, + "grad_norm": 1.4508287906646729, + "learning_rate": 1.6871983638428794e-05, + "loss": 0.5542, + "step": 7378 + }, + { + "epoch": 1.2801873698820263, + "grad_norm": 1.1114686727523804, + "learning_rate": 1.6865254409578042e-05, + "loss": 0.6611, + "step": 7379 + }, + { + "epoch": 1.2803608605135324, + "grad_norm": 1.9393035173416138, + "learning_rate": 1.685852554451818e-05, + "loss": 0.6302, + "step": 7380 + }, + { + "epoch": 1.2805343511450382, + "grad_norm": 0.8700711727142334, + "learning_rate": 1.6851797044030076e-05, + "loss": 0.7445, + "step": 7381 + }, + { + "epoch": 1.280707841776544, + "grad_norm": 1.7180064916610718, + "learning_rate": 1.6845068908894597e-05, + "loss": 0.6207, + "step": 7382 + }, + { + "epoch": 1.28088133240805, + "grad_norm": 1.2635867595672607, + "learning_rate": 1.6838341139892556e-05, + "loss": 0.7338, + "step": 7383 + }, + { + "epoch": 1.2810548230395558, + "grad_norm": 1.2473243474960327, + "learning_rate": 1.68316137378047e-05, + "loss": 0.6183, + "step": 7384 + }, + { + "epoch": 1.2812283136710618, + "grad_norm": 1.027662754058838, + "learning_rate": 1.682488670341176e-05, + "loss": 0.6311, + "step": 7385 + }, + { + "epoch": 1.2814018043025677, + "grad_norm": 1.2727835178375244, + "learning_rate": 1.681816003749442e-05, + "loss": 0.5302, + "step": 7386 + }, + { + "epoch": 1.2815752949340735, + "grad_norm": 0.8725461959838867, + "learning_rate": 1.681143374083332e-05, + "loss": 0.6659, + "step": 7387 + }, + { + "epoch": 1.2817487855655796, + "grad_norm": 0.8658729195594788, + "learning_rate": 1.6804707814209046e-05, + "loss": 0.7615, + "step": 7388 + }, + { + "epoch": 1.2819222761970854, + "grad_norm": 0.8485891819000244, + "learning_rate": 1.6797982258402154e-05, + "loss": 0.6978, + "step": 7389 + }, + { + "epoch": 1.2820957668285913, + "grad_norm": 1.120810866355896, + "learning_rate": 1.6791257074193156e-05, + "loss": 0.7019, + "step": 7390 + }, + { + "epoch": 1.2822692574600971, + "grad_norm": 0.7649238109588623, + "learning_rate": 1.678453226236251e-05, + "loss": 0.691, + "step": 7391 + }, + { + "epoch": 1.282442748091603, + "grad_norm": 0.7956505417823792, + "learning_rate": 1.677780782369064e-05, + "loss": 0.7734, + "step": 7392 + }, + { + "epoch": 1.2826162387231088, + "grad_norm": 0.6197633743286133, + "learning_rate": 1.677108375895793e-05, + "loss": 0.8303, + "step": 7393 + }, + { + "epoch": 1.2827897293546149, + "grad_norm": 0.7636104226112366, + "learning_rate": 1.6764360068944706e-05, + "loss": 0.6528, + "step": 7394 + }, + { + "epoch": 1.2829632199861207, + "grad_norm": 1.0658921003341675, + "learning_rate": 1.6757636754431272e-05, + "loss": 0.7748, + "step": 7395 + }, + { + "epoch": 1.2831367106176266, + "grad_norm": 0.7058350443840027, + "learning_rate": 1.6750913816197873e-05, + "loss": 0.6785, + "step": 7396 + }, + { + "epoch": 1.2833102012491326, + "grad_norm": 0.8169340491294861, + "learning_rate": 1.6744191255024707e-05, + "loss": 0.7235, + "step": 7397 + }, + { + "epoch": 1.2834836918806385, + "grad_norm": 0.9136412143707275, + "learning_rate": 1.6737469071691936e-05, + "loss": 0.6527, + "step": 7398 + }, + { + "epoch": 1.2836571825121443, + "grad_norm": 1.947454571723938, + "learning_rate": 1.6730747266979683e-05, + "loss": 0.7598, + "step": 7399 + }, + { + "epoch": 1.2838306731436502, + "grad_norm": 1.1262925863265991, + "learning_rate": 1.6724025841668026e-05, + "loss": 0.7246, + "step": 7400 + }, + { + "epoch": 1.284004163775156, + "grad_norm": 1.2926437854766846, + "learning_rate": 1.6717304796536984e-05, + "loss": 0.6841, + "step": 7401 + }, + { + "epoch": 1.284177654406662, + "grad_norm": 1.0177016258239746, + "learning_rate": 1.6710584132366542e-05, + "loss": 0.8662, + "step": 7402 + }, + { + "epoch": 1.284351145038168, + "grad_norm": 0.7405509352684021, + "learning_rate": 1.6703863849936654e-05, + "loss": 0.8677, + "step": 7403 + }, + { + "epoch": 1.2845246356696738, + "grad_norm": 0.9204124212265015, + "learning_rate": 1.6697143950027194e-05, + "loss": 0.7107, + "step": 7404 + }, + { + "epoch": 1.2846981263011799, + "grad_norm": 1.1831809282302856, + "learning_rate": 1.6690424433418032e-05, + "loss": 0.7805, + "step": 7405 + }, + { + "epoch": 1.2848716169326857, + "grad_norm": 1.015375018119812, + "learning_rate": 1.6683705300888977e-05, + "loss": 0.8328, + "step": 7406 + }, + { + "epoch": 1.2850451075641915, + "grad_norm": 1.1339685916900635, + "learning_rate": 1.6676986553219778e-05, + "loss": 0.588, + "step": 7407 + }, + { + "epoch": 1.2852185981956974, + "grad_norm": 0.7485135197639465, + "learning_rate": 1.667026819119016e-05, + "loss": 0.7944, + "step": 7408 + }, + { + "epoch": 1.2853920888272032, + "grad_norm": 0.8292363286018372, + "learning_rate": 1.666355021557981e-05, + "loss": 0.6077, + "step": 7409 + }, + { + "epoch": 1.2855655794587093, + "grad_norm": 0.6901074647903442, + "learning_rate": 1.6656832627168338e-05, + "loss": 0.7959, + "step": 7410 + }, + { + "epoch": 1.2857390700902152, + "grad_norm": 0.9084571003913879, + "learning_rate": 1.665011542673533e-05, + "loss": 0.6532, + "step": 7411 + }, + { + "epoch": 1.285912560721721, + "grad_norm": 0.8940662741661072, + "learning_rate": 1.6643398615060346e-05, + "loss": 0.6768, + "step": 7412 + }, + { + "epoch": 1.2860860513532268, + "grad_norm": 0.855122447013855, + "learning_rate": 1.6636682192922847e-05, + "loss": 0.7642, + "step": 7413 + }, + { + "epoch": 1.286259541984733, + "grad_norm": 0.7779899835586548, + "learning_rate": 1.6629966161102304e-05, + "loss": 0.6973, + "step": 7414 + }, + { + "epoch": 1.2864330326162388, + "grad_norm": 0.7770170569419861, + "learning_rate": 1.6623250520378114e-05, + "loss": 0.8118, + "step": 7415 + }, + { + "epoch": 1.2866065232477446, + "grad_norm": 1.5178810358047485, + "learning_rate": 1.661653527152964e-05, + "loss": 0.5792, + "step": 7416 + }, + { + "epoch": 1.2867800138792505, + "grad_norm": 0.8263197541236877, + "learning_rate": 1.6609820415336188e-05, + "loss": 0.6072, + "step": 7417 + }, + { + "epoch": 1.2869535045107563, + "grad_norm": 0.9673381447792053, + "learning_rate": 1.6603105952577024e-05, + "loss": 0.8618, + "step": 7418 + }, + { + "epoch": 1.2871269951422624, + "grad_norm": 1.4596730470657349, + "learning_rate": 1.6596391884031378e-05, + "loss": 0.5392, + "step": 7419 + }, + { + "epoch": 1.2873004857737682, + "grad_norm": 0.8773238658905029, + "learning_rate": 1.6589678210478415e-05, + "loss": 0.5858, + "step": 7420 + }, + { + "epoch": 1.287473976405274, + "grad_norm": 1.213123083114624, + "learning_rate": 1.658296493269727e-05, + "loss": 0.5602, + "step": 7421 + }, + { + "epoch": 1.2876474670367801, + "grad_norm": 0.9349101781845093, + "learning_rate": 1.657625205146703e-05, + "loss": 0.7224, + "step": 7422 + }, + { + "epoch": 1.287820957668286, + "grad_norm": 0.9519764184951782, + "learning_rate": 1.6569539567566726e-05, + "loss": 0.5757, + "step": 7423 + }, + { + "epoch": 1.2879944482997918, + "grad_norm": 0.8786226511001587, + "learning_rate": 1.6562827481775353e-05, + "loss": 0.6702, + "step": 7424 + }, + { + "epoch": 1.2881679389312977, + "grad_norm": 0.7192205190658569, + "learning_rate": 1.6556115794871862e-05, + "loss": 0.8752, + "step": 7425 + }, + { + "epoch": 1.2883414295628035, + "grad_norm": 1.1581581830978394, + "learning_rate": 1.6549404507635135e-05, + "loss": 0.5166, + "step": 7426 + }, + { + "epoch": 1.2885149201943096, + "grad_norm": 0.8398575782775879, + "learning_rate": 1.654269362084404e-05, + "loss": 0.845, + "step": 7427 + }, + { + "epoch": 1.2886884108258154, + "grad_norm": 0.7932721972465515, + "learning_rate": 1.6535983135277378e-05, + "loss": 0.7437, + "step": 7428 + }, + { + "epoch": 1.2888619014573213, + "grad_norm": 0.8498834371566772, + "learning_rate": 1.6529273051713917e-05, + "loss": 0.6787, + "step": 7429 + }, + { + "epoch": 1.2890353920888273, + "grad_norm": 1.2395590543746948, + "learning_rate": 1.6522563370932355e-05, + "loss": 0.6415, + "step": 7430 + }, + { + "epoch": 1.2892088827203332, + "grad_norm": 1.0109156370162964, + "learning_rate": 1.6515854093711364e-05, + "loss": 0.8582, + "step": 7431 + }, + { + "epoch": 1.289382373351839, + "grad_norm": 0.9154742956161499, + "learning_rate": 1.6509145220829574e-05, + "loss": 0.7401, + "step": 7432 + }, + { + "epoch": 1.2895558639833449, + "grad_norm": 1.1064646244049072, + "learning_rate": 1.650243675306554e-05, + "loss": 0.672, + "step": 7433 + }, + { + "epoch": 1.2897293546148507, + "grad_norm": 1.072745680809021, + "learning_rate": 1.64957286911978e-05, + "loss": 0.5951, + "step": 7434 + }, + { + "epoch": 1.2899028452463566, + "grad_norm": 0.9511796236038208, + "learning_rate": 1.6489021036004835e-05, + "loss": 0.6957, + "step": 7435 + }, + { + "epoch": 1.2900763358778626, + "grad_norm": 1.3743038177490234, + "learning_rate": 1.6482313788265058e-05, + "loss": 0.7585, + "step": 7436 + }, + { + "epoch": 1.2902498265093685, + "grad_norm": 0.6532631516456604, + "learning_rate": 1.647560694875687e-05, + "loss": 0.7106, + "step": 7437 + }, + { + "epoch": 1.2904233171408743, + "grad_norm": 0.9920677542686462, + "learning_rate": 1.64689005182586e-05, + "loss": 0.7145, + "step": 7438 + }, + { + "epoch": 1.2905968077723804, + "grad_norm": 0.7278286218643188, + "learning_rate": 1.6462194497548546e-05, + "loss": 0.7815, + "step": 7439 + }, + { + "epoch": 1.2907702984038862, + "grad_norm": 0.7567177414894104, + "learning_rate": 1.6455488887404935e-05, + "loss": 0.7483, + "step": 7440 + }, + { + "epoch": 1.290943789035392, + "grad_norm": 0.9415431022644043, + "learning_rate": 1.6448783688605976e-05, + "loss": 0.5364, + "step": 7441 + }, + { + "epoch": 1.291117279666898, + "grad_norm": 1.3220552206039429, + "learning_rate": 1.6442078901929803e-05, + "loss": 0.6145, + "step": 7442 + }, + { + "epoch": 1.2912907702984038, + "grad_norm": 1.1244868040084839, + "learning_rate": 1.6435374528154517e-05, + "loss": 0.5657, + "step": 7443 + }, + { + "epoch": 1.2914642609299098, + "grad_norm": 0.8175819516181946, + "learning_rate": 1.6428670568058176e-05, + "loss": 0.6763, + "step": 7444 + }, + { + "epoch": 1.2916377515614157, + "grad_norm": 1.463597297668457, + "learning_rate": 1.6421967022418776e-05, + "loss": 0.5767, + "step": 7445 + }, + { + "epoch": 1.2918112421929215, + "grad_norm": 1.2112313508987427, + "learning_rate": 1.641526389201427e-05, + "loss": 0.6348, + "step": 7446 + }, + { + "epoch": 1.2919847328244276, + "grad_norm": 0.9626292586326599, + "learning_rate": 1.6408561177622566e-05, + "loss": 0.6187, + "step": 7447 + }, + { + "epoch": 1.2921582234559335, + "grad_norm": 0.7495235800743103, + "learning_rate": 1.640185888002153e-05, + "loss": 0.8882, + "step": 7448 + }, + { + "epoch": 1.2923317140874393, + "grad_norm": 0.981387734413147, + "learning_rate": 1.6395156999988956e-05, + "loss": 0.533, + "step": 7449 + }, + { + "epoch": 1.2925052047189451, + "grad_norm": 1.4258828163146973, + "learning_rate": 1.6388455538302612e-05, + "loss": 0.6805, + "step": 7450 + }, + { + "epoch": 1.292678695350451, + "grad_norm": 0.9495477080345154, + "learning_rate": 1.638175449574022e-05, + "loss": 0.7629, + "step": 7451 + }, + { + "epoch": 1.2928521859819568, + "grad_norm": 0.928279459476471, + "learning_rate": 1.6375053873079424e-05, + "loss": 0.8193, + "step": 7452 + }, + { + "epoch": 1.293025676613463, + "grad_norm": 0.9136890769004822, + "learning_rate": 1.6368353671097854e-05, + "loss": 0.8584, + "step": 7453 + }, + { + "epoch": 1.2931991672449688, + "grad_norm": 0.9173855781555176, + "learning_rate": 1.6361653890573078e-05, + "loss": 0.5978, + "step": 7454 + }, + { + "epoch": 1.2933726578764746, + "grad_norm": 1.2741767168045044, + "learning_rate": 1.6354954532282598e-05, + "loss": 0.7498, + "step": 7455 + }, + { + "epoch": 1.2935461485079807, + "grad_norm": 2.4320292472839355, + "learning_rate": 1.6348255597003896e-05, + "loss": 0.6975, + "step": 7456 + }, + { + "epoch": 1.2937196391394865, + "grad_norm": 1.4899499416351318, + "learning_rate": 1.6341557085514385e-05, + "loss": 0.5292, + "step": 7457 + }, + { + "epoch": 1.2938931297709924, + "grad_norm": 0.8415932655334473, + "learning_rate": 1.633485899859144e-05, + "loss": 0.627, + "step": 7458 + }, + { + "epoch": 1.2940666204024982, + "grad_norm": 1.0413763523101807, + "learning_rate": 1.6328161337012377e-05, + "loss": 0.7284, + "step": 7459 + }, + { + "epoch": 1.294240111034004, + "grad_norm": 1.674776315689087, + "learning_rate": 1.632146410155447e-05, + "loss": 0.6801, + "step": 7460 + }, + { + "epoch": 1.2944136016655101, + "grad_norm": 1.5257529020309448, + "learning_rate": 1.6314767292994946e-05, + "loss": 0.6344, + "step": 7461 + }, + { + "epoch": 1.294587092297016, + "grad_norm": 0.8709819912910461, + "learning_rate": 1.6308070912110965e-05, + "loss": 0.7244, + "step": 7462 + }, + { + "epoch": 1.2947605829285218, + "grad_norm": 1.238824486732483, + "learning_rate": 1.6301374959679654e-05, + "loss": 0.6458, + "step": 7463 + }, + { + "epoch": 1.2949340735600279, + "grad_norm": 0.9053732752799988, + "learning_rate": 1.6294679436478095e-05, + "loss": 0.6821, + "step": 7464 + }, + { + "epoch": 1.2951075641915337, + "grad_norm": 1.249714970588684, + "learning_rate": 1.6287984343283304e-05, + "loss": 0.5786, + "step": 7465 + }, + { + "epoch": 1.2952810548230396, + "grad_norm": 0.8870633244514465, + "learning_rate": 1.6281289680872252e-05, + "loss": 0.8933, + "step": 7466 + }, + { + "epoch": 1.2954545454545454, + "grad_norm": 1.4831454753875732, + "learning_rate": 1.627459545002187e-05, + "loss": 0.7871, + "step": 7467 + }, + { + "epoch": 1.2956280360860513, + "grad_norm": 0.848332941532135, + "learning_rate": 1.6267901651509022e-05, + "loss": 0.6904, + "step": 7468 + }, + { + "epoch": 1.2958015267175573, + "grad_norm": 0.7702910304069519, + "learning_rate": 1.6261208286110536e-05, + "loss": 0.722, + "step": 7469 + }, + { + "epoch": 1.2959750173490632, + "grad_norm": 0.9446412920951843, + "learning_rate": 1.6254515354603194e-05, + "loss": 0.6888, + "step": 7470 + }, + { + "epoch": 1.296148507980569, + "grad_norm": 0.9585701823234558, + "learning_rate": 1.6247822857763703e-05, + "loss": 0.6833, + "step": 7471 + }, + { + "epoch": 1.2963219986120749, + "grad_norm": 0.8555445075035095, + "learning_rate": 1.6241130796368737e-05, + "loss": 0.5675, + "step": 7472 + }, + { + "epoch": 1.296495489243581, + "grad_norm": 0.6473482847213745, + "learning_rate": 1.6234439171194925e-05, + "loss": 0.6621, + "step": 7473 + }, + { + "epoch": 1.2966689798750868, + "grad_norm": 1.0498594045639038, + "learning_rate": 1.6227747983018845e-05, + "loss": 0.6228, + "step": 7474 + }, + { + "epoch": 1.2968424705065926, + "grad_norm": 0.7725870609283447, + "learning_rate": 1.6221057232616994e-05, + "loss": 0.8369, + "step": 7475 + }, + { + "epoch": 1.2970159611380985, + "grad_norm": 1.2104430198669434, + "learning_rate": 1.6214366920765856e-05, + "loss": 0.8716, + "step": 7476 + }, + { + "epoch": 1.2971894517696043, + "grad_norm": 0.8917331695556641, + "learning_rate": 1.6207677048241858e-05, + "loss": 0.8397, + "step": 7477 + }, + { + "epoch": 1.2973629424011104, + "grad_norm": 1.7597315311431885, + "learning_rate": 1.620098761582135e-05, + "loss": 0.8411, + "step": 7478 + }, + { + "epoch": 1.2975364330326162, + "grad_norm": 0.9333053231239319, + "learning_rate": 1.6194298624280653e-05, + "loss": 0.7209, + "step": 7479 + }, + { + "epoch": 1.297709923664122, + "grad_norm": 1.6973851919174194, + "learning_rate": 1.6187610074396044e-05, + "loss": 0.5586, + "step": 7480 + }, + { + "epoch": 1.2978834142956281, + "grad_norm": 1.2647325992584229, + "learning_rate": 1.6180921966943722e-05, + "loss": 0.6782, + "step": 7481 + }, + { + "epoch": 1.298056904927134, + "grad_norm": 0.874609649181366, + "learning_rate": 1.6174234302699856e-05, + "loss": 0.6644, + "step": 7482 + }, + { + "epoch": 1.2982303955586398, + "grad_norm": 0.9690744280815125, + "learning_rate": 1.616754708244056e-05, + "loss": 0.6627, + "step": 7483 + }, + { + "epoch": 1.2984038861901457, + "grad_norm": 0.730150580406189, + "learning_rate": 1.616086030694189e-05, + "loss": 0.738, + "step": 7484 + }, + { + "epoch": 1.2985773768216515, + "grad_norm": 1.3693609237670898, + "learning_rate": 1.615417397697985e-05, + "loss": 0.6447, + "step": 7485 + }, + { + "epoch": 1.2987508674531576, + "grad_norm": 1.3202661275863647, + "learning_rate": 1.6147488093330405e-05, + "loss": 0.7754, + "step": 7486 + }, + { + "epoch": 1.2989243580846634, + "grad_norm": 0.7806547284126282, + "learning_rate": 1.6140802656769457e-05, + "loss": 0.7588, + "step": 7487 + }, + { + "epoch": 1.2990978487161693, + "grad_norm": 0.8665243983268738, + "learning_rate": 1.6134117668072858e-05, + "loss": 0.6604, + "step": 7488 + }, + { + "epoch": 1.2992713393476754, + "grad_norm": 0.8657317757606506, + "learning_rate": 1.6127433128016403e-05, + "loss": 0.7773, + "step": 7489 + }, + { + "epoch": 1.2994448299791812, + "grad_norm": 1.435542345046997, + "learning_rate": 1.612074903737585e-05, + "loss": 0.7378, + "step": 7490 + }, + { + "epoch": 1.299618320610687, + "grad_norm": 1.2831004858016968, + "learning_rate": 1.611406539692689e-05, + "loss": 0.7747, + "step": 7491 + }, + { + "epoch": 1.299791811242193, + "grad_norm": 1.177079677581787, + "learning_rate": 1.610738220744517e-05, + "loss": 0.6771, + "step": 7492 + }, + { + "epoch": 1.2999653018736987, + "grad_norm": 1.1005759239196777, + "learning_rate": 1.6100699469706285e-05, + "loss": 0.6572, + "step": 7493 + }, + { + "epoch": 1.3001387925052046, + "grad_norm": 1.1334228515625, + "learning_rate": 1.6094017184485763e-05, + "loss": 0.7122, + "step": 7494 + }, + { + "epoch": 1.3003122831367107, + "grad_norm": 0.8523833155632019, + "learning_rate": 1.6087335352559097e-05, + "loss": 0.6809, + "step": 7495 + }, + { + "epoch": 1.3004857737682165, + "grad_norm": 1.0016939640045166, + "learning_rate": 1.6080653974701732e-05, + "loss": 0.5646, + "step": 7496 + }, + { + "epoch": 1.3006592643997223, + "grad_norm": 0.8622357845306396, + "learning_rate": 1.6073973051689032e-05, + "loss": 0.7466, + "step": 7497 + }, + { + "epoch": 1.3008327550312284, + "grad_norm": 0.758266270160675, + "learning_rate": 1.6067292584296333e-05, + "loss": 0.6501, + "step": 7498 + }, + { + "epoch": 1.3010062456627343, + "grad_norm": 0.6668974161148071, + "learning_rate": 1.6060612573298912e-05, + "loss": 0.8536, + "step": 7499 + }, + { + "epoch": 1.30117973629424, + "grad_norm": 1.1053359508514404, + "learning_rate": 1.6053933019472003e-05, + "loss": 0.5952, + "step": 7500 + }, + { + "epoch": 1.301353226925746, + "grad_norm": 0.9018994569778442, + "learning_rate": 1.6047253923590756e-05, + "loss": 0.7465, + "step": 7501 + }, + { + "epoch": 1.3015267175572518, + "grad_norm": 1.0523308515548706, + "learning_rate": 1.6040575286430295e-05, + "loss": 0.651, + "step": 7502 + }, + { + "epoch": 1.3017002081887579, + "grad_norm": 0.9410999417304993, + "learning_rate": 1.6033897108765696e-05, + "loss": 0.7805, + "step": 7503 + }, + { + "epoch": 1.3018736988202637, + "grad_norm": 0.9537340998649597, + "learning_rate": 1.602721939137195e-05, + "loss": 0.5824, + "step": 7504 + }, + { + "epoch": 1.3020471894517696, + "grad_norm": 0.7937243580818176, + "learning_rate": 1.6020542135024023e-05, + "loss": 0.7996, + "step": 7505 + }, + { + "epoch": 1.3022206800832756, + "grad_norm": 1.1207983493804932, + "learning_rate": 1.6013865340496826e-05, + "loss": 0.592, + "step": 7506 + }, + { + "epoch": 1.3023941707147815, + "grad_norm": 1.6212788820266724, + "learning_rate": 1.6007189008565195e-05, + "loss": 0.5536, + "step": 7507 + }, + { + "epoch": 1.3025676613462873, + "grad_norm": 1.0500280857086182, + "learning_rate": 1.6000513140003927e-05, + "loss": 0.6969, + "step": 7508 + }, + { + "epoch": 1.3027411519777932, + "grad_norm": 0.8775234818458557, + "learning_rate": 1.5993837735587783e-05, + "loss": 0.6803, + "step": 7509 + }, + { + "epoch": 1.302914642609299, + "grad_norm": 0.8487836122512817, + "learning_rate": 1.5987162796091428e-05, + "loss": 0.7292, + "step": 7510 + }, + { + "epoch": 1.3030881332408049, + "grad_norm": 0.7071741819381714, + "learning_rate": 1.5980488322289505e-05, + "loss": 0.7432, + "step": 7511 + }, + { + "epoch": 1.303261623872311, + "grad_norm": 0.805499792098999, + "learning_rate": 1.5973814314956602e-05, + "loss": 0.699, + "step": 7512 + }, + { + "epoch": 1.3034351145038168, + "grad_norm": 1.9736933708190918, + "learning_rate": 1.5967140774867235e-05, + "loss": 0.5481, + "step": 7513 + }, + { + "epoch": 1.3036086051353226, + "grad_norm": 0.8418450951576233, + "learning_rate": 1.596046770279588e-05, + "loss": 0.6571, + "step": 7514 + }, + { + "epoch": 1.3037820957668287, + "grad_norm": 0.9092546701431274, + "learning_rate": 1.5953795099516955e-05, + "loss": 0.812, + "step": 7515 + }, + { + "epoch": 1.3039555863983345, + "grad_norm": 1.0161569118499756, + "learning_rate": 1.5947122965804827e-05, + "loss": 0.6665, + "step": 7516 + }, + { + "epoch": 1.3041290770298404, + "grad_norm": 0.753367006778717, + "learning_rate": 1.59404513024338e-05, + "loss": 0.6686, + "step": 7517 + }, + { + "epoch": 1.3043025676613462, + "grad_norm": 0.9276489615440369, + "learning_rate": 1.5933780110178128e-05, + "loss": 0.7733, + "step": 7518 + }, + { + "epoch": 1.304476058292852, + "grad_norm": 0.8951383829116821, + "learning_rate": 1.5927109389812013e-05, + "loss": 0.7302, + "step": 7519 + }, + { + "epoch": 1.3046495489243581, + "grad_norm": 0.880757749080658, + "learning_rate": 1.59204391421096e-05, + "loss": 0.7864, + "step": 7520 + }, + { + "epoch": 1.304823039555864, + "grad_norm": 0.7709455490112305, + "learning_rate": 1.5913769367844974e-05, + "loss": 0.7186, + "step": 7521 + }, + { + "epoch": 1.3049965301873698, + "grad_norm": 1.421297550201416, + "learning_rate": 1.5907100067792186e-05, + "loss": 0.6821, + "step": 7522 + }, + { + "epoch": 1.305170020818876, + "grad_norm": 1.1831244230270386, + "learning_rate": 1.590043124272519e-05, + "loss": 0.7552, + "step": 7523 + }, + { + "epoch": 1.3053435114503817, + "grad_norm": 0.8124173283576965, + "learning_rate": 1.589376289341793e-05, + "loss": 0.9128, + "step": 7524 + }, + { + "epoch": 1.3055170020818876, + "grad_norm": 1.1667371988296509, + "learning_rate": 1.5887095020644282e-05, + "loss": 0.7123, + "step": 7525 + }, + { + "epoch": 1.3056904927133934, + "grad_norm": 0.9393389225006104, + "learning_rate": 1.5880427625178035e-05, + "loss": 0.6788, + "step": 7526 + }, + { + "epoch": 1.3058639833448993, + "grad_norm": 0.7772049307823181, + "learning_rate": 1.5873760707792966e-05, + "loss": 0.908, + "step": 7527 + }, + { + "epoch": 1.3060374739764053, + "grad_norm": 1.094639539718628, + "learning_rate": 1.586709426926277e-05, + "loss": 0.5769, + "step": 7528 + }, + { + "epoch": 1.3062109646079112, + "grad_norm": 1.0683293342590332, + "learning_rate": 1.5860428310361117e-05, + "loss": 0.7639, + "step": 7529 + }, + { + "epoch": 1.306384455239417, + "grad_norm": 0.7631242871284485, + "learning_rate": 1.5853762831861567e-05, + "loss": 0.6732, + "step": 7530 + }, + { + "epoch": 1.3065579458709229, + "grad_norm": 0.9106690287590027, + "learning_rate": 1.5847097834537674e-05, + "loss": 0.6292, + "step": 7531 + }, + { + "epoch": 1.306731436502429, + "grad_norm": 2.2238504886627197, + "learning_rate": 1.5840433319162925e-05, + "loss": 0.6902, + "step": 7532 + }, + { + "epoch": 1.3069049271339348, + "grad_norm": 1.3497262001037598, + "learning_rate": 1.5833769286510727e-05, + "loss": 0.5686, + "step": 7533 + }, + { + "epoch": 1.3070784177654406, + "grad_norm": 1.0890588760375977, + "learning_rate": 1.5827105737354456e-05, + "loss": 0.7225, + "step": 7534 + }, + { + "epoch": 1.3072519083969465, + "grad_norm": 2.0195226669311523, + "learning_rate": 1.5820442672467436e-05, + "loss": 0.7422, + "step": 7535 + }, + { + "epoch": 1.3074253990284523, + "grad_norm": 1.4667134284973145, + "learning_rate": 1.5813780092622907e-05, + "loss": 0.7271, + "step": 7536 + }, + { + "epoch": 1.3075988896599584, + "grad_norm": 1.1005094051361084, + "learning_rate": 1.5807117998594077e-05, + "loss": 0.7057, + "step": 7537 + }, + { + "epoch": 1.3077723802914643, + "grad_norm": 0.9319872260093689, + "learning_rate": 1.580045639115409e-05, + "loss": 0.6433, + "step": 7538 + }, + { + "epoch": 1.30794587092297, + "grad_norm": 0.8977460861206055, + "learning_rate": 1.5793795271076033e-05, + "loss": 0.6608, + "step": 7539 + }, + { + "epoch": 1.3081193615544762, + "grad_norm": 1.1549010276794434, + "learning_rate": 1.5787134639132935e-05, + "loss": 0.5474, + "step": 7540 + }, + { + "epoch": 1.308292852185982, + "grad_norm": 1.3894524574279785, + "learning_rate": 1.5780474496097773e-05, + "loss": 0.663, + "step": 7541 + }, + { + "epoch": 1.3084663428174879, + "grad_norm": 1.0959807634353638, + "learning_rate": 1.577381484274346e-05, + "loss": 0.6479, + "step": 7542 + }, + { + "epoch": 1.3086398334489937, + "grad_norm": 0.8566595911979675, + "learning_rate": 1.5767155679842857e-05, + "loss": 0.7566, + "step": 7543 + }, + { + "epoch": 1.3088133240804996, + "grad_norm": 0.8799921870231628, + "learning_rate": 1.576049700816877e-05, + "loss": 0.8064, + "step": 7544 + }, + { + "epoch": 1.3089868147120056, + "grad_norm": 1.065097451210022, + "learning_rate": 1.5753838828493953e-05, + "loss": 0.5881, + "step": 7545 + }, + { + "epoch": 1.3091603053435115, + "grad_norm": 0.9642067551612854, + "learning_rate": 1.574718114159108e-05, + "loss": 0.7102, + "step": 7546 + }, + { + "epoch": 1.3093337959750173, + "grad_norm": 0.8585482239723206, + "learning_rate": 1.574052394823279e-05, + "loss": 0.7166, + "step": 7547 + }, + { + "epoch": 1.3095072866065234, + "grad_norm": 0.6905062794685364, + "learning_rate": 1.5733867249191667e-05, + "loss": 0.6605, + "step": 7548 + }, + { + "epoch": 1.3096807772380292, + "grad_norm": 0.928502082824707, + "learning_rate": 1.5727211045240217e-05, + "loss": 0.6804, + "step": 7549 + }, + { + "epoch": 1.309854267869535, + "grad_norm": 1.656383991241455, + "learning_rate": 1.57205553371509e-05, + "loss": 0.6804, + "step": 7550 + }, + { + "epoch": 1.310027758501041, + "grad_norm": 0.8206719756126404, + "learning_rate": 1.571390012569613e-05, + "loss": 0.6897, + "step": 7551 + }, + { + "epoch": 1.3102012491325468, + "grad_norm": 0.8175944685935974, + "learning_rate": 1.570724541164824e-05, + "loss": 0.6824, + "step": 7552 + }, + { + "epoch": 1.3103747397640526, + "grad_norm": 0.9384368062019348, + "learning_rate": 1.570059119577952e-05, + "loss": 0.8142, + "step": 7553 + }, + { + "epoch": 1.3105482303955587, + "grad_norm": 0.8884804844856262, + "learning_rate": 1.569393747886221e-05, + "loss": 0.6298, + "step": 7554 + }, + { + "epoch": 1.3107217210270645, + "grad_norm": 1.0313459634780884, + "learning_rate": 1.5687284261668465e-05, + "loss": 0.7915, + "step": 7555 + }, + { + "epoch": 1.3108952116585704, + "grad_norm": 2.2591023445129395, + "learning_rate": 1.5680631544970405e-05, + "loss": 0.6339, + "step": 7556 + }, + { + "epoch": 1.3110687022900764, + "grad_norm": 1.415168285369873, + "learning_rate": 1.567397932954009e-05, + "loss": 0.6407, + "step": 7557 + }, + { + "epoch": 1.3112421929215823, + "grad_norm": 1.4144896268844604, + "learning_rate": 1.5667327616149522e-05, + "loss": 0.7383, + "step": 7558 + }, + { + "epoch": 1.3114156835530881, + "grad_norm": 1.829025149345398, + "learning_rate": 1.5660676405570625e-05, + "loss": 0.8296, + "step": 7559 + }, + { + "epoch": 1.311589174184594, + "grad_norm": 1.306922197341919, + "learning_rate": 1.5654025698575286e-05, + "loss": 0.7329, + "step": 7560 + }, + { + "epoch": 1.3117626648160998, + "grad_norm": 1.0386871099472046, + "learning_rate": 1.5647375495935334e-05, + "loss": 0.6855, + "step": 7561 + }, + { + "epoch": 1.311936155447606, + "grad_norm": 1.4324531555175781, + "learning_rate": 1.5640725798422525e-05, + "loss": 0.6072, + "step": 7562 + }, + { + "epoch": 1.3121096460791117, + "grad_norm": 0.9096825122833252, + "learning_rate": 1.5634076606808567e-05, + "loss": 0.694, + "step": 7563 + }, + { + "epoch": 1.3122831367106176, + "grad_norm": 3.68599796295166, + "learning_rate": 1.5627427921865106e-05, + "loss": 0.6572, + "step": 7564 + }, + { + "epoch": 1.3124566273421236, + "grad_norm": 0.9260284900665283, + "learning_rate": 1.562077974436373e-05, + "loss": 0.5872, + "step": 7565 + }, + { + "epoch": 1.3126301179736295, + "grad_norm": 0.8843275904655457, + "learning_rate": 1.5614132075075967e-05, + "loss": 0.6444, + "step": 7566 + }, + { + "epoch": 1.3128036086051353, + "grad_norm": 0.7820656895637512, + "learning_rate": 1.560748491477329e-05, + "loss": 0.7494, + "step": 7567 + }, + { + "epoch": 1.3129770992366412, + "grad_norm": 0.7000197172164917, + "learning_rate": 1.5600838264227102e-05, + "loss": 0.8713, + "step": 7568 + }, + { + "epoch": 1.313150589868147, + "grad_norm": 1.1530879735946655, + "learning_rate": 1.5594192124208758e-05, + "loss": 0.5422, + "step": 7569 + }, + { + "epoch": 1.313324080499653, + "grad_norm": 0.9452706575393677, + "learning_rate": 1.5587546495489563e-05, + "loss": 0.5923, + "step": 7570 + }, + { + "epoch": 1.313497571131159, + "grad_norm": 0.8510993123054504, + "learning_rate": 1.558090137884073e-05, + "loss": 0.7939, + "step": 7571 + }, + { + "epoch": 1.3136710617626648, + "grad_norm": 0.7352356314659119, + "learning_rate": 1.557425677503344e-05, + "loss": 0.6628, + "step": 7572 + }, + { + "epoch": 1.3138445523941706, + "grad_norm": 1.6412001848220825, + "learning_rate": 1.5567612684838805e-05, + "loss": 0.9102, + "step": 7573 + }, + { + "epoch": 1.3140180430256767, + "grad_norm": 0.9555302858352661, + "learning_rate": 1.5560969109027896e-05, + "loss": 0.6608, + "step": 7574 + }, + { + "epoch": 1.3141915336571826, + "grad_norm": 1.170717716217041, + "learning_rate": 1.5554326048371686e-05, + "loss": 0.595, + "step": 7575 + }, + { + "epoch": 1.3143650242886884, + "grad_norm": 0.7326885461807251, + "learning_rate": 1.5547683503641115e-05, + "loss": 0.7407, + "step": 7576 + }, + { + "epoch": 1.3145385149201942, + "grad_norm": 0.8056427836418152, + "learning_rate": 1.5541041475607073e-05, + "loss": 0.7241, + "step": 7577 + }, + { + "epoch": 1.3147120055517, + "grad_norm": 0.9078532457351685, + "learning_rate": 1.5534399965040353e-05, + "loss": 0.6917, + "step": 7578 + }, + { + "epoch": 1.3148854961832062, + "grad_norm": 1.1429286003112793, + "learning_rate": 1.552775897271172e-05, + "loss": 0.7101, + "step": 7579 + }, + { + "epoch": 1.315058986814712, + "grad_norm": 0.9033637046813965, + "learning_rate": 1.552111849939188e-05, + "loss": 0.6448, + "step": 7580 + }, + { + "epoch": 1.3152324774462179, + "grad_norm": 0.8969703912734985, + "learning_rate": 1.5514478545851452e-05, + "loss": 0.8356, + "step": 7581 + }, + { + "epoch": 1.315405968077724, + "grad_norm": 1.5112354755401611, + "learning_rate": 1.550783911286101e-05, + "loss": 0.6351, + "step": 7582 + }, + { + "epoch": 1.3155794587092298, + "grad_norm": 0.8369285464286804, + "learning_rate": 1.550120020119108e-05, + "loss": 0.7878, + "step": 7583 + }, + { + "epoch": 1.3157529493407356, + "grad_norm": 0.9816716313362122, + "learning_rate": 1.5494561811612102e-05, + "loss": 0.5978, + "step": 7584 + }, + { + "epoch": 1.3159264399722415, + "grad_norm": 2.203784942626953, + "learning_rate": 1.548792394489448e-05, + "loss": 0.4966, + "step": 7585 + }, + { + "epoch": 1.3160999306037473, + "grad_norm": 1.083376407623291, + "learning_rate": 1.548128660180854e-05, + "loss": 0.6714, + "step": 7586 + }, + { + "epoch": 1.3162734212352534, + "grad_norm": 0.8516615629196167, + "learning_rate": 1.5474649783124555e-05, + "loss": 0.6221, + "step": 7587 + }, + { + "epoch": 1.3164469118667592, + "grad_norm": 1.1593409776687622, + "learning_rate": 1.5468013489612742e-05, + "loss": 0.7546, + "step": 7588 + }, + { + "epoch": 1.316620402498265, + "grad_norm": 1.6557027101516724, + "learning_rate": 1.5461377722043235e-05, + "loss": 0.7449, + "step": 7589 + }, + { + "epoch": 1.316793893129771, + "grad_norm": 0.7382414937019348, + "learning_rate": 1.5454742481186137e-05, + "loss": 0.6587, + "step": 7590 + }, + { + "epoch": 1.316967383761277, + "grad_norm": 1.1538621187210083, + "learning_rate": 1.5448107767811468e-05, + "loss": 0.5465, + "step": 7591 + }, + { + "epoch": 1.3171408743927828, + "grad_norm": 0.8423037528991699, + "learning_rate": 1.5441473582689198e-05, + "loss": 0.7085, + "step": 7592 + }, + { + "epoch": 1.3173143650242887, + "grad_norm": 0.8207338452339172, + "learning_rate": 1.5434839926589236e-05, + "loss": 0.7078, + "step": 7593 + }, + { + "epoch": 1.3174878556557945, + "grad_norm": 0.8653791546821594, + "learning_rate": 1.5428206800281413e-05, + "loss": 0.6129, + "step": 7594 + }, + { + "epoch": 1.3176613462873004, + "grad_norm": 0.6433460116386414, + "learning_rate": 1.5421574204535516e-05, + "loss": 0.8633, + "step": 7595 + }, + { + "epoch": 1.3178348369188064, + "grad_norm": 1.1964366436004639, + "learning_rate": 1.5414942140121278e-05, + "loss": 0.5446, + "step": 7596 + }, + { + "epoch": 1.3180083275503123, + "grad_norm": 1.0941598415374756, + "learning_rate": 1.5408310607808336e-05, + "loss": 0.5701, + "step": 7597 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 0.7580935955047607, + "learning_rate": 1.54016796083663e-05, + "loss": 0.7625, + "step": 7598 + }, + { + "epoch": 1.3183553088133242, + "grad_norm": 0.9231324195861816, + "learning_rate": 1.5395049142564717e-05, + "loss": 0.71, + "step": 7599 + }, + { + "epoch": 1.31852879944483, + "grad_norm": 1.405079960823059, + "learning_rate": 1.538841921117303e-05, + "loss": 0.8064, + "step": 7600 + }, + { + "epoch": 1.3187022900763359, + "grad_norm": 0.945346474647522, + "learning_rate": 1.5381789814960674e-05, + "loss": 0.8011, + "step": 7601 + }, + { + "epoch": 1.3188757807078417, + "grad_norm": 1.6452194452285767, + "learning_rate": 1.5375160954696986e-05, + "loss": 0.8035, + "step": 7602 + }, + { + "epoch": 1.3190492713393476, + "grad_norm": 0.9065169095993042, + "learning_rate": 1.536853263115127e-05, + "loss": 0.6683, + "step": 7603 + }, + { + "epoch": 1.3192227619708536, + "grad_norm": 1.0479841232299805, + "learning_rate": 1.536190484509273e-05, + "loss": 0.5371, + "step": 7604 + }, + { + "epoch": 1.3193962526023595, + "grad_norm": 0.8723646402359009, + "learning_rate": 1.5355277597290537e-05, + "loss": 0.6322, + "step": 7605 + }, + { + "epoch": 1.3195697432338653, + "grad_norm": 0.8753295540809631, + "learning_rate": 1.5348650888513798e-05, + "loss": 0.7379, + "step": 7606 + }, + { + "epoch": 1.3197432338653714, + "grad_norm": 1.3839025497436523, + "learning_rate": 1.5342024719531536e-05, + "loss": 0.7939, + "step": 7607 + }, + { + "epoch": 1.3199167244968772, + "grad_norm": 0.8038983941078186, + "learning_rate": 1.533539909111273e-05, + "loss": 0.6384, + "step": 7608 + }, + { + "epoch": 1.320090215128383, + "grad_norm": 0.7504783272743225, + "learning_rate": 1.5328774004026304e-05, + "loss": 0.7697, + "step": 7609 + }, + { + "epoch": 1.320263705759889, + "grad_norm": 0.983370840549469, + "learning_rate": 1.5322149459041097e-05, + "loss": 0.7281, + "step": 7610 + }, + { + "epoch": 1.3204371963913948, + "grad_norm": 1.067522644996643, + "learning_rate": 1.531552545692589e-05, + "loss": 0.6406, + "step": 7611 + }, + { + "epoch": 1.3206106870229006, + "grad_norm": 1.124183177947998, + "learning_rate": 1.5308901998449415e-05, + "loss": 0.5925, + "step": 7612 + }, + { + "epoch": 1.3207841776544067, + "grad_norm": 1.0554368495941162, + "learning_rate": 1.5302279084380328e-05, + "loss": 0.5449, + "step": 7613 + }, + { + "epoch": 1.3209576682859125, + "grad_norm": 0.8714486956596375, + "learning_rate": 1.5295656715487226e-05, + "loss": 0.5931, + "step": 7614 + }, + { + "epoch": 1.3211311589174184, + "grad_norm": 1.0293232202529907, + "learning_rate": 1.528903489253865e-05, + "loss": 0.7614, + "step": 7615 + }, + { + "epoch": 1.3213046495489245, + "grad_norm": 1.3576380014419556, + "learning_rate": 1.5282413616303063e-05, + "loss": 0.7581, + "step": 7616 + }, + { + "epoch": 1.3214781401804303, + "grad_norm": 1.0762827396392822, + "learning_rate": 1.5275792887548866e-05, + "loss": 0.5951, + "step": 7617 + }, + { + "epoch": 1.3216516308119362, + "grad_norm": 1.7965368032455444, + "learning_rate": 1.526917270704441e-05, + "loss": 0.6862, + "step": 7618 + }, + { + "epoch": 1.321825121443442, + "grad_norm": 0.998123049736023, + "learning_rate": 1.5262553075557985e-05, + "loss": 0.731, + "step": 7619 + }, + { + "epoch": 1.3219986120749478, + "grad_norm": 0.7886993885040283, + "learning_rate": 1.5255933993857785e-05, + "loss": 0.7122, + "step": 7620 + }, + { + "epoch": 1.322172102706454, + "grad_norm": 1.1329472064971924, + "learning_rate": 1.5249315462711974e-05, + "loss": 0.6865, + "step": 7621 + }, + { + "epoch": 1.3223455933379598, + "grad_norm": 0.7323819994926453, + "learning_rate": 1.5242697482888649e-05, + "loss": 0.7548, + "step": 7622 + }, + { + "epoch": 1.3225190839694656, + "grad_norm": 1.2573583126068115, + "learning_rate": 1.5236080055155812e-05, + "loss": 0.7053, + "step": 7623 + }, + { + "epoch": 1.3226925746009717, + "grad_norm": 0.9759790897369385, + "learning_rate": 1.5229463180281441e-05, + "loss": 0.7594, + "step": 7624 + }, + { + "epoch": 1.3228660652324775, + "grad_norm": 1.7983524799346924, + "learning_rate": 1.522284685903343e-05, + "loss": 0.7031, + "step": 7625 + }, + { + "epoch": 1.3230395558639834, + "grad_norm": 0.7985862493515015, + "learning_rate": 1.5216231092179604e-05, + "loss": 0.656, + "step": 7626 + }, + { + "epoch": 1.3232130464954892, + "grad_norm": 0.7949106693267822, + "learning_rate": 1.5209615880487728e-05, + "loss": 0.7952, + "step": 7627 + }, + { + "epoch": 1.323386537126995, + "grad_norm": 0.9013630151748657, + "learning_rate": 1.5203001224725525e-05, + "loss": 0.707, + "step": 7628 + }, + { + "epoch": 1.3235600277585011, + "grad_norm": 0.9553844928741455, + "learning_rate": 1.5196387125660607e-05, + "loss": 0.6874, + "step": 7629 + }, + { + "epoch": 1.323733518390007, + "grad_norm": 0.9913061857223511, + "learning_rate": 1.5189773584060563e-05, + "loss": 0.7195, + "step": 7630 + }, + { + "epoch": 1.3239070090215128, + "grad_norm": 1.0381765365600586, + "learning_rate": 1.51831606006929e-05, + "loss": 0.6205, + "step": 7631 + }, + { + "epoch": 1.3240804996530187, + "grad_norm": 0.9583021998405457, + "learning_rate": 1.517654817632507e-05, + "loss": 0.6089, + "step": 7632 + }, + { + "epoch": 1.3242539902845247, + "grad_norm": 1.2168796062469482, + "learning_rate": 1.5169936311724434e-05, + "loss": 0.5527, + "step": 7633 + }, + { + "epoch": 1.3244274809160306, + "grad_norm": 1.2229652404785156, + "learning_rate": 1.5163325007658319e-05, + "loss": 0.7886, + "step": 7634 + }, + { + "epoch": 1.3246009715475364, + "grad_norm": 0.7948911190032959, + "learning_rate": 1.5156714264893974e-05, + "loss": 0.7925, + "step": 7635 + }, + { + "epoch": 1.3247744621790423, + "grad_norm": 1.3847335577011108, + "learning_rate": 1.5150104084198587e-05, + "loss": 0.5856, + "step": 7636 + }, + { + "epoch": 1.3249479528105481, + "grad_norm": 0.8714485168457031, + "learning_rate": 1.5143494466339266e-05, + "loss": 0.6589, + "step": 7637 + }, + { + "epoch": 1.3251214434420542, + "grad_norm": 1.1255385875701904, + "learning_rate": 1.5136885412083073e-05, + "loss": 0.6357, + "step": 7638 + }, + { + "epoch": 1.32529493407356, + "grad_norm": 0.7412751913070679, + "learning_rate": 1.5130276922196993e-05, + "loss": 0.771, + "step": 7639 + }, + { + "epoch": 1.3254684247050659, + "grad_norm": 0.8312340974807739, + "learning_rate": 1.5123668997447948e-05, + "loss": 0.696, + "step": 7640 + }, + { + "epoch": 1.325641915336572, + "grad_norm": 0.7387779951095581, + "learning_rate": 1.5117061638602811e-05, + "loss": 0.844, + "step": 7641 + }, + { + "epoch": 1.3258154059680778, + "grad_norm": 0.8821995258331299, + "learning_rate": 1.5110454846428348e-05, + "loss": 0.6084, + "step": 7642 + }, + { + "epoch": 1.3259888965995836, + "grad_norm": 0.9441134929656982, + "learning_rate": 1.51038486216913e-05, + "loss": 0.701, + "step": 7643 + }, + { + "epoch": 1.3261623872310895, + "grad_norm": 0.9492355585098267, + "learning_rate": 1.5097242965158322e-05, + "loss": 0.6924, + "step": 7644 + }, + { + "epoch": 1.3263358778625953, + "grad_norm": 1.0463985204696655, + "learning_rate": 1.5090637877596022e-05, + "loss": 0.6226, + "step": 7645 + }, + { + "epoch": 1.3265093684941014, + "grad_norm": 1.3754757642745972, + "learning_rate": 1.5084033359770907e-05, + "loss": 0.8088, + "step": 7646 + }, + { + "epoch": 1.3266828591256072, + "grad_norm": 1.1888870000839233, + "learning_rate": 1.507742941244945e-05, + "loss": 0.7183, + "step": 7647 + }, + { + "epoch": 1.326856349757113, + "grad_norm": 1.8579933643341064, + "learning_rate": 1.5070826036398052e-05, + "loss": 0.6788, + "step": 7648 + }, + { + "epoch": 1.3270298403886192, + "grad_norm": 1.3298841714859009, + "learning_rate": 1.5064223232383028e-05, + "loss": 0.7834, + "step": 7649 + }, + { + "epoch": 1.327203331020125, + "grad_norm": 0.9527466893196106, + "learning_rate": 1.505762100117065e-05, + "loss": 0.6294, + "step": 7650 + }, + { + "epoch": 1.3273768216516308, + "grad_norm": 1.507103443145752, + "learning_rate": 1.5051019343527123e-05, + "loss": 0.7476, + "step": 7651 + }, + { + "epoch": 1.3275503122831367, + "grad_norm": 0.8929161429405212, + "learning_rate": 1.5044418260218559e-05, + "loss": 0.7183, + "step": 7652 + }, + { + "epoch": 1.3277238029146425, + "grad_norm": 0.8975552320480347, + "learning_rate": 1.503781775201103e-05, + "loss": 0.7249, + "step": 7653 + }, + { + "epoch": 1.3278972935461484, + "grad_norm": 1.0545209646224976, + "learning_rate": 1.503121781967054e-05, + "loss": 0.6512, + "step": 7654 + }, + { + "epoch": 1.3280707841776545, + "grad_norm": 1.045506238937378, + "learning_rate": 1.5024618463963006e-05, + "loss": 0.6946, + "step": 7655 + }, + { + "epoch": 1.3282442748091603, + "grad_norm": 0.9262662529945374, + "learning_rate": 1.5018019685654295e-05, + "loss": 0.7869, + "step": 7656 + }, + { + "epoch": 1.3284177654406661, + "grad_norm": 0.7106608152389526, + "learning_rate": 1.5011421485510204e-05, + "loss": 0.6897, + "step": 7657 + }, + { + "epoch": 1.3285912560721722, + "grad_norm": 0.7571691274642944, + "learning_rate": 1.5004823864296472e-05, + "loss": 0.7078, + "step": 7658 + }, + { + "epoch": 1.328764746703678, + "grad_norm": 0.8296497464179993, + "learning_rate": 1.4998226822778743e-05, + "loss": 0.7905, + "step": 7659 + }, + { + "epoch": 1.328938237335184, + "grad_norm": 0.9777929186820984, + "learning_rate": 1.4991630361722619e-05, + "loss": 0.6853, + "step": 7660 + }, + { + "epoch": 1.3291117279666897, + "grad_norm": 0.859122633934021, + "learning_rate": 1.498503448189363e-05, + "loss": 0.6943, + "step": 7661 + }, + { + "epoch": 1.3292852185981956, + "grad_norm": 0.9104518890380859, + "learning_rate": 1.4978439184057233e-05, + "loss": 0.8223, + "step": 7662 + }, + { + "epoch": 1.3294587092297017, + "grad_norm": 0.7493782639503479, + "learning_rate": 1.497184446897882e-05, + "loss": 0.5845, + "step": 7663 + }, + { + "epoch": 1.3296321998612075, + "grad_norm": 0.9589608907699585, + "learning_rate": 1.4965250337423718e-05, + "loss": 0.6448, + "step": 7664 + }, + { + "epoch": 1.3298056904927134, + "grad_norm": 0.7621675729751587, + "learning_rate": 1.4958656790157176e-05, + "loss": 0.6882, + "step": 7665 + }, + { + "epoch": 1.3299791811242194, + "grad_norm": 1.0299450159072876, + "learning_rate": 1.4952063827944385e-05, + "loss": 0.5798, + "step": 7666 + }, + { + "epoch": 1.3301526717557253, + "grad_norm": 0.7855533957481384, + "learning_rate": 1.4945471451550481e-05, + "loss": 0.5784, + "step": 7667 + }, + { + "epoch": 1.3303261623872311, + "grad_norm": 1.0374501943588257, + "learning_rate": 1.4938879661740495e-05, + "loss": 0.7432, + "step": 7668 + }, + { + "epoch": 1.330499653018737, + "grad_norm": 1.7287695407867432, + "learning_rate": 1.4932288459279423e-05, + "loss": 0.5907, + "step": 7669 + }, + { + "epoch": 1.3306731436502428, + "grad_norm": 2.1563735008239746, + "learning_rate": 1.4925697844932185e-05, + "loss": 0.7153, + "step": 7670 + }, + { + "epoch": 1.3308466342817487, + "grad_norm": 0.9084241986274719, + "learning_rate": 1.491910781946362e-05, + "loss": 0.6962, + "step": 7671 + }, + { + "epoch": 1.3310201249132547, + "grad_norm": 1.0283417701721191, + "learning_rate": 1.4912518383638512e-05, + "loss": 0.5499, + "step": 7672 + }, + { + "epoch": 1.3311936155447606, + "grad_norm": 1.2048921585083008, + "learning_rate": 1.4905929538221574e-05, + "loss": 0.63, + "step": 7673 + }, + { + "epoch": 1.3313671061762664, + "grad_norm": 0.9731554388999939, + "learning_rate": 1.4899341283977457e-05, + "loss": 0.7971, + "step": 7674 + }, + { + "epoch": 1.3315405968077725, + "grad_norm": 0.718123733997345, + "learning_rate": 1.489275362167072e-05, + "loss": 0.8054, + "step": 7675 + }, + { + "epoch": 1.3317140874392783, + "grad_norm": 1.1172152757644653, + "learning_rate": 1.4886166552065873e-05, + "loss": 0.6737, + "step": 7676 + }, + { + "epoch": 1.3318875780707842, + "grad_norm": 0.8550201058387756, + "learning_rate": 1.4879580075927367e-05, + "loss": 0.7101, + "step": 7677 + }, + { + "epoch": 1.33206106870229, + "grad_norm": 1.3838368654251099, + "learning_rate": 1.4872994194019553e-05, + "loss": 0.8353, + "step": 7678 + }, + { + "epoch": 1.3322345593337959, + "grad_norm": 1.0857000350952148, + "learning_rate": 1.4866408907106734e-05, + "loss": 0.5376, + "step": 7679 + }, + { + "epoch": 1.332408049965302, + "grad_norm": 1.0645716190338135, + "learning_rate": 1.4859824215953154e-05, + "loss": 0.5854, + "step": 7680 + }, + { + "epoch": 1.3325815405968078, + "grad_norm": 0.7363383769989014, + "learning_rate": 1.4853240121322951e-05, + "loss": 0.8813, + "step": 7681 + }, + { + "epoch": 1.3327550312283136, + "grad_norm": 1.104215145111084, + "learning_rate": 1.4846656623980234e-05, + "loss": 0.5646, + "step": 7682 + }, + { + "epoch": 1.3329285218598197, + "grad_norm": 1.2378709316253662, + "learning_rate": 1.4840073724689021e-05, + "loss": 0.8904, + "step": 7683 + }, + { + "epoch": 1.3331020124913255, + "grad_norm": 0.8392555713653564, + "learning_rate": 1.4833491424213268e-05, + "loss": 0.7385, + "step": 7684 + }, + { + "epoch": 1.3332755031228314, + "grad_norm": 1.2061463594436646, + "learning_rate": 1.482690972331685e-05, + "loss": 0.7126, + "step": 7685 + }, + { + "epoch": 1.3334489937543372, + "grad_norm": 1.009365439414978, + "learning_rate": 1.4820328622763584e-05, + "loss": 0.7053, + "step": 7686 + }, + { + "epoch": 1.333622484385843, + "grad_norm": 1.1234580278396606, + "learning_rate": 1.4813748123317223e-05, + "loss": 0.5845, + "step": 7687 + }, + { + "epoch": 1.3337959750173491, + "grad_norm": 0.8283588290214539, + "learning_rate": 1.4807168225741433e-05, + "loss": 0.6582, + "step": 7688 + }, + { + "epoch": 1.333969465648855, + "grad_norm": 0.7632939219474792, + "learning_rate": 1.4800588930799822e-05, + "loss": 0.7108, + "step": 7689 + }, + { + "epoch": 1.3341429562803608, + "grad_norm": 1.089051365852356, + "learning_rate": 1.4794010239255925e-05, + "loss": 0.8149, + "step": 7690 + }, + { + "epoch": 1.3343164469118667, + "grad_norm": 0.759956419467926, + "learning_rate": 1.4787432151873202e-05, + "loss": 0.8103, + "step": 7691 + }, + { + "epoch": 1.3344899375433728, + "grad_norm": 0.8171418309211731, + "learning_rate": 1.4780854669415053e-05, + "loss": 0.666, + "step": 7692 + }, + { + "epoch": 1.3346634281748786, + "grad_norm": 0.9616034030914307, + "learning_rate": 1.4774277792644812e-05, + "loss": 0.7656, + "step": 7693 + }, + { + "epoch": 1.3348369188063844, + "grad_norm": 1.206411600112915, + "learning_rate": 1.4767701522325708e-05, + "loss": 0.595, + "step": 7694 + }, + { + "epoch": 1.3350104094378903, + "grad_norm": 0.6969934701919556, + "learning_rate": 1.4761125859220942e-05, + "loss": 0.8054, + "step": 7695 + }, + { + "epoch": 1.3351839000693961, + "grad_norm": 0.8580862879753113, + "learning_rate": 1.4754550804093633e-05, + "loss": 0.5852, + "step": 7696 + }, + { + "epoch": 1.3353573907009022, + "grad_norm": 1.1272857189178467, + "learning_rate": 1.4747976357706806e-05, + "loss": 0.7236, + "step": 7697 + }, + { + "epoch": 1.335530881332408, + "grad_norm": 1.1583610773086548, + "learning_rate": 1.4741402520823442e-05, + "loss": 0.7407, + "step": 7698 + }, + { + "epoch": 1.335704371963914, + "grad_norm": 0.8893204927444458, + "learning_rate": 1.4734829294206455e-05, + "loss": 0.5443, + "step": 7699 + }, + { + "epoch": 1.33587786259542, + "grad_norm": 2.9291861057281494, + "learning_rate": 1.4728256678618652e-05, + "loss": 0.5828, + "step": 7700 + }, + { + "epoch": 1.3360513532269258, + "grad_norm": 1.3816217184066772, + "learning_rate": 1.4721684674822805e-05, + "loss": 0.6135, + "step": 7701 + }, + { + "epoch": 1.3362248438584317, + "grad_norm": 0.8436519503593445, + "learning_rate": 1.47151132835816e-05, + "loss": 0.6536, + "step": 7702 + }, + { + "epoch": 1.3363983344899375, + "grad_norm": 1.120797872543335, + "learning_rate": 1.4708542505657668e-05, + "loss": 0.7771, + "step": 7703 + }, + { + "epoch": 1.3365718251214433, + "grad_norm": 0.9385079741477966, + "learning_rate": 1.4701972341813533e-05, + "loss": 0.7039, + "step": 7704 + }, + { + "epoch": 1.3367453157529494, + "grad_norm": 0.9422768950462341, + "learning_rate": 1.4695402792811684e-05, + "loss": 0.5857, + "step": 7705 + }, + { + "epoch": 1.3369188063844553, + "grad_norm": 0.7795658707618713, + "learning_rate": 1.4688833859414529e-05, + "loss": 0.6033, + "step": 7706 + }, + { + "epoch": 1.337092297015961, + "grad_norm": 1.3062316179275513, + "learning_rate": 1.4682265542384384e-05, + "loss": 0.6613, + "step": 7707 + }, + { + "epoch": 1.3372657876474672, + "grad_norm": 1.2028254270553589, + "learning_rate": 1.467569784248352e-05, + "loss": 0.6964, + "step": 7708 + }, + { + "epoch": 1.337439278278973, + "grad_norm": 1.1249308586120605, + "learning_rate": 1.466913076047413e-05, + "loss": 0.6842, + "step": 7709 + }, + { + "epoch": 1.3376127689104789, + "grad_norm": 1.0424563884735107, + "learning_rate": 1.4662564297118325e-05, + "loss": 0.7274, + "step": 7710 + }, + { + "epoch": 1.3377862595419847, + "grad_norm": 2.294053792953491, + "learning_rate": 1.465599845317815e-05, + "loss": 0.6725, + "step": 7711 + }, + { + "epoch": 1.3379597501734906, + "grad_norm": 0.8939918875694275, + "learning_rate": 1.4649433229415588e-05, + "loss": 0.7673, + "step": 7712 + }, + { + "epoch": 1.3381332408049964, + "grad_norm": 1.1304715871810913, + "learning_rate": 1.4642868626592529e-05, + "loss": 0.5851, + "step": 7713 + }, + { + "epoch": 1.3383067314365025, + "grad_norm": 0.9182126522064209, + "learning_rate": 1.4636304645470807e-05, + "loss": 0.7571, + "step": 7714 + }, + { + "epoch": 1.3384802220680083, + "grad_norm": 0.9545129537582397, + "learning_rate": 1.462974128681218e-05, + "loss": 0.8533, + "step": 7715 + }, + { + "epoch": 1.3386537126995142, + "grad_norm": 1.0332227945327759, + "learning_rate": 1.4623178551378346e-05, + "loss": 0.5508, + "step": 7716 + }, + { + "epoch": 1.3388272033310202, + "grad_norm": 1.563864827156067, + "learning_rate": 1.4616616439930895e-05, + "loss": 0.8516, + "step": 7717 + }, + { + "epoch": 1.339000693962526, + "grad_norm": 1.0063860416412354, + "learning_rate": 1.4610054953231379e-05, + "loss": 0.6318, + "step": 7718 + }, + { + "epoch": 1.339174184594032, + "grad_norm": 0.7957261204719543, + "learning_rate": 1.4603494092041275e-05, + "loss": 0.6821, + "step": 7719 + }, + { + "epoch": 1.3393476752255378, + "grad_norm": 1.4669586420059204, + "learning_rate": 1.4596933857121963e-05, + "loss": 0.6033, + "step": 7720 + }, + { + "epoch": 1.3395211658570436, + "grad_norm": 1.045806646347046, + "learning_rate": 1.4590374249234768e-05, + "loss": 0.7493, + "step": 7721 + }, + { + "epoch": 1.3396946564885497, + "grad_norm": 0.8063860535621643, + "learning_rate": 1.4583815269140957e-05, + "loss": 0.8086, + "step": 7722 + }, + { + "epoch": 1.3398681471200555, + "grad_norm": 0.797913134098053, + "learning_rate": 1.4577256917601688e-05, + "loss": 0.6217, + "step": 7723 + }, + { + "epoch": 1.3400416377515614, + "grad_norm": 0.841364860534668, + "learning_rate": 1.4570699195378071e-05, + "loss": 0.6111, + "step": 7724 + }, + { + "epoch": 1.3402151283830674, + "grad_norm": 0.9163260459899902, + "learning_rate": 1.4564142103231148e-05, + "loss": 0.5737, + "step": 7725 + }, + { + "epoch": 1.3403886190145733, + "grad_norm": 1.3713716268539429, + "learning_rate": 1.4557585641921859e-05, + "loss": 0.6382, + "step": 7726 + }, + { + "epoch": 1.3405621096460791, + "grad_norm": 0.777184009552002, + "learning_rate": 1.4551029812211095e-05, + "loss": 0.6917, + "step": 7727 + }, + { + "epoch": 1.340735600277585, + "grad_norm": 0.9109882116317749, + "learning_rate": 1.4544474614859683e-05, + "loss": 0.6095, + "step": 7728 + }, + { + "epoch": 1.3409090909090908, + "grad_norm": 1.3098782300949097, + "learning_rate": 1.4537920050628338e-05, + "loss": 0.6147, + "step": 7729 + }, + { + "epoch": 1.3410825815405967, + "grad_norm": 0.920302152633667, + "learning_rate": 1.4531366120277736e-05, + "loss": 0.7498, + "step": 7730 + }, + { + "epoch": 1.3412560721721027, + "grad_norm": 2.160980463027954, + "learning_rate": 1.4524812824568471e-05, + "loss": 0.5935, + "step": 7731 + }, + { + "epoch": 1.3414295628036086, + "grad_norm": 0.9924008250236511, + "learning_rate": 1.4518260164261058e-05, + "loss": 0.5582, + "step": 7732 + }, + { + "epoch": 1.3416030534351144, + "grad_norm": 1.3508355617523193, + "learning_rate": 1.4511708140115942e-05, + "loss": 0.6763, + "step": 7733 + }, + { + "epoch": 1.3417765440666205, + "grad_norm": 1.100056767463684, + "learning_rate": 1.4505156752893488e-05, + "loss": 0.5919, + "step": 7734 + }, + { + "epoch": 1.3419500346981263, + "grad_norm": 1.3886994123458862, + "learning_rate": 1.4498606003353998e-05, + "loss": 0.7065, + "step": 7735 + }, + { + "epoch": 1.3421235253296322, + "grad_norm": 1.2759461402893066, + "learning_rate": 1.4492055892257688e-05, + "loss": 0.7419, + "step": 7736 + }, + { + "epoch": 1.342297015961138, + "grad_norm": 0.7625912427902222, + "learning_rate": 1.4485506420364715e-05, + "loss": 0.7817, + "step": 7737 + }, + { + "epoch": 1.3424705065926439, + "grad_norm": 0.9284970164299011, + "learning_rate": 1.4478957588435148e-05, + "loss": 0.8098, + "step": 7738 + }, + { + "epoch": 1.34264399722415, + "grad_norm": 1.2536166906356812, + "learning_rate": 1.4472409397228979e-05, + "loss": 0.6147, + "step": 7739 + }, + { + "epoch": 1.3428174878556558, + "grad_norm": 1.066021203994751, + "learning_rate": 1.4465861847506142e-05, + "loss": 0.6069, + "step": 7740 + }, + { + "epoch": 1.3429909784871616, + "grad_norm": 1.2290135622024536, + "learning_rate": 1.4459314940026495e-05, + "loss": 0.7393, + "step": 7741 + }, + { + "epoch": 1.3431644691186677, + "grad_norm": 1.1170623302459717, + "learning_rate": 1.4452768675549798e-05, + "loss": 0.593, + "step": 7742 + }, + { + "epoch": 1.3433379597501736, + "grad_norm": 0.8870379328727722, + "learning_rate": 1.4446223054835758e-05, + "loss": 0.7816, + "step": 7743 + }, + { + "epoch": 1.3435114503816794, + "grad_norm": 1.1640665531158447, + "learning_rate": 1.4439678078644004e-05, + "loss": 0.6088, + "step": 7744 + }, + { + "epoch": 1.3436849410131853, + "grad_norm": 0.8158072829246521, + "learning_rate": 1.4433133747734097e-05, + "loss": 0.7145, + "step": 7745 + }, + { + "epoch": 1.343858431644691, + "grad_norm": 1.7424559593200684, + "learning_rate": 1.4426590062865497e-05, + "loss": 0.8038, + "step": 7746 + }, + { + "epoch": 1.3440319222761972, + "grad_norm": 0.7977254986763, + "learning_rate": 1.4420047024797614e-05, + "loss": 0.6494, + "step": 7747 + }, + { + "epoch": 1.344205412907703, + "grad_norm": 0.8209559321403503, + "learning_rate": 1.4413504634289785e-05, + "loss": 0.6948, + "step": 7748 + }, + { + "epoch": 1.3443789035392089, + "grad_norm": 1.39668607711792, + "learning_rate": 1.4406962892101243e-05, + "loss": 0.7922, + "step": 7749 + }, + { + "epoch": 1.3445523941707147, + "grad_norm": 0.9484068751335144, + "learning_rate": 1.4400421798991178e-05, + "loss": 0.6473, + "step": 7750 + }, + { + "epoch": 1.3447258848022208, + "grad_norm": 2.50835919380188, + "learning_rate": 1.4393881355718694e-05, + "loss": 0.535, + "step": 7751 + }, + { + "epoch": 1.3448993754337266, + "grad_norm": 0.9222944974899292, + "learning_rate": 1.4387341563042801e-05, + "loss": 0.7721, + "step": 7752 + }, + { + "epoch": 1.3450728660652325, + "grad_norm": 0.8452399969100952, + "learning_rate": 1.4380802421722461e-05, + "loss": 0.7665, + "step": 7753 + }, + { + "epoch": 1.3452463566967383, + "grad_norm": 1.13324773311615, + "learning_rate": 1.4374263932516557e-05, + "loss": 0.6448, + "step": 7754 + }, + { + "epoch": 1.3454198473282442, + "grad_norm": 1.0384275913238525, + "learning_rate": 1.436772609618387e-05, + "loss": 0.635, + "step": 7755 + }, + { + "epoch": 1.3455933379597502, + "grad_norm": 0.7754824161529541, + "learning_rate": 1.4361188913483132e-05, + "loss": 0.682, + "step": 7756 + }, + { + "epoch": 1.345766828591256, + "grad_norm": 1.3799774646759033, + "learning_rate": 1.4354652385172995e-05, + "loss": 0.6952, + "step": 7757 + }, + { + "epoch": 1.345940319222762, + "grad_norm": 0.9278022050857544, + "learning_rate": 1.4348116512012024e-05, + "loss": 0.7106, + "step": 7758 + }, + { + "epoch": 1.346113809854268, + "grad_norm": 1.2366571426391602, + "learning_rate": 1.4341581294758722e-05, + "loss": 0.7717, + "step": 7759 + }, + { + "epoch": 1.3462873004857738, + "grad_norm": 0.8208428025245667, + "learning_rate": 1.4335046734171499e-05, + "loss": 0.6748, + "step": 7760 + }, + { + "epoch": 1.3464607911172797, + "grad_norm": 0.994337797164917, + "learning_rate": 1.4328512831008708e-05, + "loss": 0.853, + "step": 7761 + }, + { + "epoch": 1.3466342817487855, + "grad_norm": 0.9598089456558228, + "learning_rate": 1.4321979586028607e-05, + "loss": 0.5693, + "step": 7762 + }, + { + "epoch": 1.3468077723802914, + "grad_norm": 1.241450309753418, + "learning_rate": 1.431544699998939e-05, + "loss": 0.8254, + "step": 7763 + }, + { + "epoch": 1.3469812630117974, + "grad_norm": 1.2358708381652832, + "learning_rate": 1.4308915073649182e-05, + "loss": 0.6936, + "step": 7764 + }, + { + "epoch": 1.3471547536433033, + "grad_norm": 0.9582039713859558, + "learning_rate": 1.4302383807766003e-05, + "loss": 0.7769, + "step": 7765 + }, + { + "epoch": 1.3473282442748091, + "grad_norm": 1.7637245655059814, + "learning_rate": 1.4295853203097823e-05, + "loss": 0.5981, + "step": 7766 + }, + { + "epoch": 1.3475017349063152, + "grad_norm": 0.9475969076156616, + "learning_rate": 1.4289323260402533e-05, + "loss": 0.6385, + "step": 7767 + }, + { + "epoch": 1.347675225537821, + "grad_norm": 1.0744237899780273, + "learning_rate": 1.4282793980437923e-05, + "loss": 0.7292, + "step": 7768 + }, + { + "epoch": 1.3478487161693269, + "grad_norm": 0.8003800511360168, + "learning_rate": 1.4276265363961735e-05, + "loss": 0.5676, + "step": 7769 + }, + { + "epoch": 1.3480222068008327, + "grad_norm": 0.9467061161994934, + "learning_rate": 1.4269737411731627e-05, + "loss": 0.7399, + "step": 7770 + }, + { + "epoch": 1.3481956974323386, + "grad_norm": 0.9442554116249084, + "learning_rate": 1.4263210124505164e-05, + "loss": 0.7898, + "step": 7771 + }, + { + "epoch": 1.3483691880638444, + "grad_norm": 1.302185297012329, + "learning_rate": 1.425668350303985e-05, + "loss": 0.7725, + "step": 7772 + }, + { + "epoch": 1.3485426786953505, + "grad_norm": 0.9019026160240173, + "learning_rate": 1.425015754809311e-05, + "loss": 0.5906, + "step": 7773 + }, + { + "epoch": 1.3487161693268563, + "grad_norm": 1.3226135969161987, + "learning_rate": 1.4243632260422292e-05, + "loss": 0.728, + "step": 7774 + }, + { + "epoch": 1.3488896599583622, + "grad_norm": 1.0895695686340332, + "learning_rate": 1.4237107640784648e-05, + "loss": 0.6842, + "step": 7775 + }, + { + "epoch": 1.3490631505898683, + "grad_norm": 0.9046276211738586, + "learning_rate": 1.4230583689937381e-05, + "loss": 0.5774, + "step": 7776 + }, + { + "epoch": 1.349236641221374, + "grad_norm": 0.7335824966430664, + "learning_rate": 1.4224060408637605e-05, + "loss": 0.7144, + "step": 7777 + }, + { + "epoch": 1.34941013185288, + "grad_norm": 1.4070457220077515, + "learning_rate": 1.4217537797642343e-05, + "loss": 0.6715, + "step": 7778 + }, + { + "epoch": 1.3495836224843858, + "grad_norm": 1.456397533416748, + "learning_rate": 1.4211015857708555e-05, + "loss": 0.7749, + "step": 7779 + }, + { + "epoch": 1.3497571131158916, + "grad_norm": 0.8538010716438293, + "learning_rate": 1.4204494589593127e-05, + "loss": 0.6763, + "step": 7780 + }, + { + "epoch": 1.3499306037473977, + "grad_norm": 1.006040334701538, + "learning_rate": 1.4197973994052855e-05, + "loss": 0.7108, + "step": 7781 + }, + { + "epoch": 1.3501040943789036, + "grad_norm": 1.742959976196289, + "learning_rate": 1.4191454071844457e-05, + "loss": 0.712, + "step": 7782 + }, + { + "epoch": 1.3502775850104094, + "grad_norm": 1.030233383178711, + "learning_rate": 1.4184934823724588e-05, + "loss": 0.6389, + "step": 7783 + }, + { + "epoch": 1.3504510756419155, + "grad_norm": 0.9362318515777588, + "learning_rate": 1.4178416250449804e-05, + "loss": 0.7795, + "step": 7784 + }, + { + "epoch": 1.3506245662734213, + "grad_norm": 1.4862852096557617, + "learning_rate": 1.41718983527766e-05, + "loss": 0.8337, + "step": 7785 + }, + { + "epoch": 1.3507980569049272, + "grad_norm": 1.4277019500732422, + "learning_rate": 1.4165381131461388e-05, + "loss": 0.7512, + "step": 7786 + }, + { + "epoch": 1.350971547536433, + "grad_norm": 1.2953425645828247, + "learning_rate": 1.4158864587260488e-05, + "loss": 0.6647, + "step": 7787 + }, + { + "epoch": 1.3511450381679388, + "grad_norm": 1.6384633779525757, + "learning_rate": 1.4152348720930156e-05, + "loss": 0.6052, + "step": 7788 + }, + { + "epoch": 1.3513185287994447, + "grad_norm": 0.9518514275550842, + "learning_rate": 1.414583353322657e-05, + "loss": 0.5798, + "step": 7789 + }, + { + "epoch": 1.3514920194309508, + "grad_norm": 0.8064939379692078, + "learning_rate": 1.4139319024905836e-05, + "loss": 0.8467, + "step": 7790 + }, + { + "epoch": 1.3516655100624566, + "grad_norm": 1.4454270601272583, + "learning_rate": 1.4132805196723944e-05, + "loss": 0.543, + "step": 7791 + }, + { + "epoch": 1.3518390006939625, + "grad_norm": 1.0906680822372437, + "learning_rate": 1.412629204943685e-05, + "loss": 0.6677, + "step": 7792 + }, + { + "epoch": 1.3520124913254685, + "grad_norm": 1.2927372455596924, + "learning_rate": 1.4119779583800414e-05, + "loss": 0.5303, + "step": 7793 + }, + { + "epoch": 1.3521859819569744, + "grad_norm": 0.7444872260093689, + "learning_rate": 1.4113267800570402e-05, + "loss": 0.7233, + "step": 7794 + }, + { + "epoch": 1.3523594725884802, + "grad_norm": 0.681178092956543, + "learning_rate": 1.4106756700502522e-05, + "loss": 0.7357, + "step": 7795 + }, + { + "epoch": 1.352532963219986, + "grad_norm": 0.7068869471549988, + "learning_rate": 1.41002462843524e-05, + "loss": 0.8062, + "step": 7796 + }, + { + "epoch": 1.352706453851492, + "grad_norm": 0.8216350078582764, + "learning_rate": 1.4093736552875563e-05, + "loss": 0.6996, + "step": 7797 + }, + { + "epoch": 1.352879944482998, + "grad_norm": 1.1742219924926758, + "learning_rate": 1.4087227506827482e-05, + "loss": 0.6858, + "step": 7798 + }, + { + "epoch": 1.3530534351145038, + "grad_norm": 1.0530279874801636, + "learning_rate": 1.4080719146963548e-05, + "loss": 0.6729, + "step": 7799 + }, + { + "epoch": 1.3532269257460097, + "grad_norm": 0.8392711281776428, + "learning_rate": 1.4074211474039046e-05, + "loss": 0.6187, + "step": 7800 + }, + { + "epoch": 1.3534004163775157, + "grad_norm": 0.985632061958313, + "learning_rate": 1.4067704488809204e-05, + "loss": 0.661, + "step": 7801 + }, + { + "epoch": 1.3535739070090216, + "grad_norm": 1.0594362020492554, + "learning_rate": 1.406119819202917e-05, + "loss": 0.5552, + "step": 7802 + }, + { + "epoch": 1.3537473976405274, + "grad_norm": 1.2232969999313354, + "learning_rate": 1.4054692584454017e-05, + "loss": 0.6326, + "step": 7803 + }, + { + "epoch": 1.3539208882720333, + "grad_norm": 1.0104693174362183, + "learning_rate": 1.4048187666838707e-05, + "loss": 0.6045, + "step": 7804 + }, + { + "epoch": 1.3540943789035391, + "grad_norm": 2.85333514213562, + "learning_rate": 1.4041683439938152e-05, + "loss": 0.6725, + "step": 7805 + }, + { + "epoch": 1.3542678695350452, + "grad_norm": 0.9291746616363525, + "learning_rate": 1.4035179904507184e-05, + "loss": 0.5924, + "step": 7806 + }, + { + "epoch": 1.354441360166551, + "grad_norm": 1.0255100727081299, + "learning_rate": 1.4028677061300535e-05, + "loss": 0.7397, + "step": 7807 + }, + { + "epoch": 1.3546148507980569, + "grad_norm": 0.7788848280906677, + "learning_rate": 1.4022174911072868e-05, + "loss": 0.7935, + "step": 7808 + }, + { + "epoch": 1.3547883414295627, + "grad_norm": 0.9688061475753784, + "learning_rate": 1.4015673454578776e-05, + "loss": 0.7932, + "step": 7809 + }, + { + "epoch": 1.3549618320610688, + "grad_norm": 0.8791283965110779, + "learning_rate": 1.4009172692572743e-05, + "loss": 0.604, + "step": 7810 + }, + { + "epoch": 1.3551353226925746, + "grad_norm": 0.9824856519699097, + "learning_rate": 1.4002672625809201e-05, + "loss": 0.733, + "step": 7811 + }, + { + "epoch": 1.3553088133240805, + "grad_norm": 0.7644726634025574, + "learning_rate": 1.39961732550425e-05, + "loss": 0.9072, + "step": 7812 + }, + { + "epoch": 1.3554823039555863, + "grad_norm": 0.9838712215423584, + "learning_rate": 1.3989674581026878e-05, + "loss": 0.7307, + "step": 7813 + }, + { + "epoch": 1.3556557945870922, + "grad_norm": 0.9105207324028015, + "learning_rate": 1.3983176604516526e-05, + "loss": 0.6852, + "step": 7814 + }, + { + "epoch": 1.3558292852185982, + "grad_norm": 1.156519889831543, + "learning_rate": 1.3976679326265542e-05, + "loss": 0.5548, + "step": 7815 + }, + { + "epoch": 1.356002775850104, + "grad_norm": 1.5149478912353516, + "learning_rate": 1.3970182747027944e-05, + "loss": 0.7043, + "step": 7816 + }, + { + "epoch": 1.35617626648161, + "grad_norm": 0.7506683468818665, + "learning_rate": 1.3963686867557658e-05, + "loss": 0.6415, + "step": 7817 + }, + { + "epoch": 1.356349757113116, + "grad_norm": 0.8504114747047424, + "learning_rate": 1.3957191688608544e-05, + "loss": 0.6936, + "step": 7818 + }, + { + "epoch": 1.3565232477446219, + "grad_norm": 0.8564591407775879, + "learning_rate": 1.3950697210934387e-05, + "loss": 0.6917, + "step": 7819 + }, + { + "epoch": 1.3566967383761277, + "grad_norm": 1.13394296169281, + "learning_rate": 1.3944203435288857e-05, + "loss": 0.5459, + "step": 7820 + }, + { + "epoch": 1.3568702290076335, + "grad_norm": 0.7487472891807556, + "learning_rate": 1.3937710362425574e-05, + "loss": 0.8828, + "step": 7821 + }, + { + "epoch": 1.3570437196391394, + "grad_norm": 0.7924402356147766, + "learning_rate": 1.3931217993098076e-05, + "loss": 0.8418, + "step": 7822 + }, + { + "epoch": 1.3572172102706455, + "grad_norm": 0.8471485376358032, + "learning_rate": 1.3924726328059794e-05, + "loss": 0.7148, + "step": 7823 + }, + { + "epoch": 1.3573907009021513, + "grad_norm": 0.7406964898109436, + "learning_rate": 1.3918235368064102e-05, + "loss": 0.6971, + "step": 7824 + }, + { + "epoch": 1.3575641915336571, + "grad_norm": 1.1263967752456665, + "learning_rate": 1.3911745113864288e-05, + "loss": 0.6602, + "step": 7825 + }, + { + "epoch": 1.3577376821651632, + "grad_norm": 1.2606744766235352, + "learning_rate": 1.3905255566213542e-05, + "loss": 0.5524, + "step": 7826 + }, + { + "epoch": 1.357911172796669, + "grad_norm": 0.9254157543182373, + "learning_rate": 1.3898766725864988e-05, + "loss": 0.6959, + "step": 7827 + }, + { + "epoch": 1.358084663428175, + "grad_norm": 1.0528981685638428, + "learning_rate": 1.3892278593571669e-05, + "loss": 0.7551, + "step": 7828 + }, + { + "epoch": 1.3582581540596808, + "grad_norm": 1.0024977922439575, + "learning_rate": 1.3885791170086535e-05, + "loss": 0.6128, + "step": 7829 + }, + { + "epoch": 1.3584316446911866, + "grad_norm": 1.1445761919021606, + "learning_rate": 1.3879304456162457e-05, + "loss": 0.6501, + "step": 7830 + }, + { + "epoch": 1.3586051353226924, + "grad_norm": 0.709992527961731, + "learning_rate": 1.3872818452552227e-05, + "loss": 0.8342, + "step": 7831 + }, + { + "epoch": 1.3587786259541985, + "grad_norm": 1.0938845872879028, + "learning_rate": 1.3866333160008562e-05, + "loss": 0.702, + "step": 7832 + }, + { + "epoch": 1.3589521165857044, + "grad_norm": 0.9098923206329346, + "learning_rate": 1.3859848579284076e-05, + "loss": 0.8423, + "step": 7833 + }, + { + "epoch": 1.3591256072172102, + "grad_norm": 0.9814373850822449, + "learning_rate": 1.3853364711131324e-05, + "loss": 0.6785, + "step": 7834 + }, + { + "epoch": 1.3592990978487163, + "grad_norm": 1.135046362876892, + "learning_rate": 1.3846881556302757e-05, + "loss": 0.6221, + "step": 7835 + }, + { + "epoch": 1.3594725884802221, + "grad_norm": 1.0822478532791138, + "learning_rate": 1.3840399115550748e-05, + "loss": 0.6063, + "step": 7836 + }, + { + "epoch": 1.359646079111728, + "grad_norm": 1.8415066003799438, + "learning_rate": 1.3833917389627603e-05, + "loss": 0.6853, + "step": 7837 + }, + { + "epoch": 1.3598195697432338, + "grad_norm": 0.8637315630912781, + "learning_rate": 1.3827436379285537e-05, + "loss": 0.7744, + "step": 7838 + }, + { + "epoch": 1.3599930603747397, + "grad_norm": 0.7767878770828247, + "learning_rate": 1.3820956085276661e-05, + "loss": 0.8455, + "step": 7839 + }, + { + "epoch": 1.3601665510062457, + "grad_norm": 0.7779116630554199, + "learning_rate": 1.3814476508353036e-05, + "loss": 0.7629, + "step": 7840 + }, + { + "epoch": 1.3603400416377516, + "grad_norm": 1.279744267463684, + "learning_rate": 1.3807997649266625e-05, + "loss": 0.605, + "step": 7841 + }, + { + "epoch": 1.3605135322692574, + "grad_norm": 0.7662531733512878, + "learning_rate": 1.3801519508769295e-05, + "loss": 0.7627, + "step": 7842 + }, + { + "epoch": 1.3606870229007635, + "grad_norm": 0.8628897666931152, + "learning_rate": 1.3795042087612847e-05, + "loss": 0.7449, + "step": 7843 + }, + { + "epoch": 1.3608605135322693, + "grad_norm": 0.8848919868469238, + "learning_rate": 1.3788565386548996e-05, + "loss": 0.7678, + "step": 7844 + }, + { + "epoch": 1.3610340041637752, + "grad_norm": 1.2208175659179688, + "learning_rate": 1.3782089406329377e-05, + "loss": 0.5776, + "step": 7845 + }, + { + "epoch": 1.361207494795281, + "grad_norm": 0.8830855488777161, + "learning_rate": 1.3775614147705521e-05, + "loss": 0.6768, + "step": 7846 + }, + { + "epoch": 1.3613809854267869, + "grad_norm": 0.7793346047401428, + "learning_rate": 1.3769139611428895e-05, + "loss": 0.8484, + "step": 7847 + }, + { + "epoch": 1.3615544760582927, + "grad_norm": 0.7315047979354858, + "learning_rate": 1.3762665798250887e-05, + "loss": 0.4984, + "step": 7848 + }, + { + "epoch": 1.3617279666897988, + "grad_norm": 1.106075644493103, + "learning_rate": 1.375619270892277e-05, + "loss": 0.6619, + "step": 7849 + }, + { + "epoch": 1.3619014573213046, + "grad_norm": 3.1170873641967773, + "learning_rate": 1.3749720344195768e-05, + "loss": 0.5978, + "step": 7850 + }, + { + "epoch": 1.3620749479528105, + "grad_norm": 1.3620574474334717, + "learning_rate": 1.3743248704821008e-05, + "loss": 0.7859, + "step": 7851 + }, + { + "epoch": 1.3622484385843165, + "grad_norm": 0.6606683731079102, + "learning_rate": 1.373677779154952e-05, + "loss": 0.8518, + "step": 7852 + }, + { + "epoch": 1.3624219292158224, + "grad_norm": 0.8188430666923523, + "learning_rate": 1.3730307605132268e-05, + "loss": 0.7909, + "step": 7853 + }, + { + "epoch": 1.3625954198473282, + "grad_norm": 0.6261219382286072, + "learning_rate": 1.3723838146320128e-05, + "loss": 0.6129, + "step": 7854 + }, + { + "epoch": 1.362768910478834, + "grad_norm": 0.7469984292984009, + "learning_rate": 1.3717369415863884e-05, + "loss": 0.7405, + "step": 7855 + }, + { + "epoch": 1.36294240111034, + "grad_norm": 0.8932355046272278, + "learning_rate": 1.3710901414514235e-05, + "loss": 0.6171, + "step": 7856 + }, + { + "epoch": 1.363115891741846, + "grad_norm": 1.0184097290039062, + "learning_rate": 1.370443414302181e-05, + "loss": 0.6492, + "step": 7857 + }, + { + "epoch": 1.3632893823733518, + "grad_norm": 5.672355651855469, + "learning_rate": 1.3697967602137135e-05, + "loss": 0.7445, + "step": 7858 + }, + { + "epoch": 1.3634628730048577, + "grad_norm": 0.8128076791763306, + "learning_rate": 1.3691501792610662e-05, + "loss": 0.9192, + "step": 7859 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.9685764312744141, + "learning_rate": 1.368503671519276e-05, + "loss": 0.6008, + "step": 7860 + }, + { + "epoch": 1.3638098542678696, + "grad_norm": 0.9164113402366638, + "learning_rate": 1.3678572370633708e-05, + "loss": 0.8076, + "step": 7861 + }, + { + "epoch": 1.3639833448993754, + "grad_norm": 1.4794604778289795, + "learning_rate": 1.3672108759683694e-05, + "loss": 0.5627, + "step": 7862 + }, + { + "epoch": 1.3641568355308813, + "grad_norm": 0.7308427691459656, + "learning_rate": 1.366564588309283e-05, + "loss": 0.7275, + "step": 7863 + }, + { + "epoch": 1.3643303261623871, + "grad_norm": 0.9077423810958862, + "learning_rate": 1.3659183741611154e-05, + "loss": 0.6979, + "step": 7864 + }, + { + "epoch": 1.3645038167938932, + "grad_norm": 0.7161286473274231, + "learning_rate": 1.3652722335988579e-05, + "loss": 0.8013, + "step": 7865 + }, + { + "epoch": 1.364677307425399, + "grad_norm": 0.7382241487503052, + "learning_rate": 1.3646261666974976e-05, + "loss": 0.8137, + "step": 7866 + }, + { + "epoch": 1.364850798056905, + "grad_norm": 1.183933138847351, + "learning_rate": 1.3639801735320122e-05, + "loss": 0.5803, + "step": 7867 + }, + { + "epoch": 1.3650242886884107, + "grad_norm": 1.024889588356018, + "learning_rate": 1.3633342541773673e-05, + "loss": 0.5697, + "step": 7868 + }, + { + "epoch": 1.3651977793199168, + "grad_norm": 0.8020725250244141, + "learning_rate": 1.3626884087085246e-05, + "loss": 0.726, + "step": 7869 + }, + { + "epoch": 1.3653712699514227, + "grad_norm": 1.0861945152282715, + "learning_rate": 1.3620426372004353e-05, + "loss": 0.7062, + "step": 7870 + }, + { + "epoch": 1.3655447605829285, + "grad_norm": 0.976035475730896, + "learning_rate": 1.3613969397280405e-05, + "loss": 0.7739, + "step": 7871 + }, + { + "epoch": 1.3657182512144344, + "grad_norm": 1.1171950101852417, + "learning_rate": 1.360751316366275e-05, + "loss": 0.9043, + "step": 7872 + }, + { + "epoch": 1.3658917418459402, + "grad_norm": 0.8524395227432251, + "learning_rate": 1.3601057671900639e-05, + "loss": 0.8286, + "step": 7873 + }, + { + "epoch": 1.3660652324774463, + "grad_norm": 1.1146184206008911, + "learning_rate": 1.3594602922743252e-05, + "loss": 0.6766, + "step": 7874 + }, + { + "epoch": 1.3662387231089521, + "grad_norm": 0.8761046528816223, + "learning_rate": 1.3588148916939651e-05, + "loss": 0.7145, + "step": 7875 + }, + { + "epoch": 1.366412213740458, + "grad_norm": 1.3646795749664307, + "learning_rate": 1.358169565523884e-05, + "loss": 0.7065, + "step": 7876 + }, + { + "epoch": 1.366585704371964, + "grad_norm": 2.328704357147217, + "learning_rate": 1.3575243138389733e-05, + "loss": 0.7219, + "step": 7877 + }, + { + "epoch": 1.3667591950034699, + "grad_norm": 0.6590009927749634, + "learning_rate": 1.356879136714114e-05, + "loss": 0.8016, + "step": 7878 + }, + { + "epoch": 1.3669326856349757, + "grad_norm": 0.7581875324249268, + "learning_rate": 1.3562340342241802e-05, + "loss": 0.793, + "step": 7879 + }, + { + "epoch": 1.3671061762664816, + "grad_norm": 1.365269422531128, + "learning_rate": 1.3555890064440374e-05, + "loss": 0.5828, + "step": 7880 + }, + { + "epoch": 1.3672796668979874, + "grad_norm": 0.9123929142951965, + "learning_rate": 1.3549440534485407e-05, + "loss": 0.6748, + "step": 7881 + }, + { + "epoch": 1.3674531575294935, + "grad_norm": 0.9603720903396606, + "learning_rate": 1.3542991753125387e-05, + "loss": 0.725, + "step": 7882 + }, + { + "epoch": 1.3676266481609993, + "grad_norm": 1.0935423374176025, + "learning_rate": 1.3536543721108698e-05, + "loss": 0.7283, + "step": 7883 + }, + { + "epoch": 1.3678001387925052, + "grad_norm": 1.6079456806182861, + "learning_rate": 1.3530096439183637e-05, + "loss": 0.6189, + "step": 7884 + }, + { + "epoch": 1.3679736294240112, + "grad_norm": 0.9644209742546082, + "learning_rate": 1.3523649908098423e-05, + "loss": 0.6658, + "step": 7885 + }, + { + "epoch": 1.368147120055517, + "grad_norm": 1.042120337486267, + "learning_rate": 1.3517204128601193e-05, + "loss": 0.6101, + "step": 7886 + }, + { + "epoch": 1.368320610687023, + "grad_norm": 1.252160668373108, + "learning_rate": 1.351075910143997e-05, + "loss": 0.6215, + "step": 7887 + }, + { + "epoch": 1.3684941013185288, + "grad_norm": 0.7686306834220886, + "learning_rate": 1.3504314827362715e-05, + "loss": 0.6582, + "step": 7888 + }, + { + "epoch": 1.3686675919500346, + "grad_norm": 1.2325763702392578, + "learning_rate": 1.3497871307117291e-05, + "loss": 0.5709, + "step": 7889 + }, + { + "epoch": 1.3688410825815405, + "grad_norm": 1.478040337562561, + "learning_rate": 1.3491428541451487e-05, + "loss": 0.7036, + "step": 7890 + }, + { + "epoch": 1.3690145732130465, + "grad_norm": 1.0204581022262573, + "learning_rate": 1.3484986531112977e-05, + "loss": 0.5999, + "step": 7891 + }, + { + "epoch": 1.3691880638445524, + "grad_norm": 1.09433114528656, + "learning_rate": 1.3478545276849373e-05, + "loss": 0.7571, + "step": 7892 + }, + { + "epoch": 1.3693615544760582, + "grad_norm": 1.2253233194351196, + "learning_rate": 1.3472104779408195e-05, + "loss": 0.5812, + "step": 7893 + }, + { + "epoch": 1.3695350451075643, + "grad_norm": 0.9430752396583557, + "learning_rate": 1.3465665039536857e-05, + "loss": 0.6539, + "step": 7894 + }, + { + "epoch": 1.3697085357390701, + "grad_norm": 1.0453155040740967, + "learning_rate": 1.3459226057982706e-05, + "loss": 0.6417, + "step": 7895 + }, + { + "epoch": 1.369882026370576, + "grad_norm": 1.5613213777542114, + "learning_rate": 1.3452787835492998e-05, + "loss": 0.8015, + "step": 7896 + }, + { + "epoch": 1.3700555170020818, + "grad_norm": 1.5401633977890015, + "learning_rate": 1.3446350372814884e-05, + "loss": 0.6415, + "step": 7897 + }, + { + "epoch": 1.3702290076335877, + "grad_norm": 0.756100058555603, + "learning_rate": 1.3439913670695445e-05, + "loss": 0.8329, + "step": 7898 + }, + { + "epoch": 1.3704024982650937, + "grad_norm": 0.9485135674476624, + "learning_rate": 1.3433477729881679e-05, + "loss": 0.6488, + "step": 7899 + }, + { + "epoch": 1.3705759888965996, + "grad_norm": 0.9283890128135681, + "learning_rate": 1.3427042551120461e-05, + "loss": 0.66, + "step": 7900 + }, + { + "epoch": 1.3707494795281054, + "grad_norm": 0.8877713084220886, + "learning_rate": 1.342060813515862e-05, + "loss": 0.752, + "step": 7901 + }, + { + "epoch": 1.3709229701596115, + "grad_norm": 0.7735199332237244, + "learning_rate": 1.3414174482742865e-05, + "loss": 0.689, + "step": 7902 + }, + { + "epoch": 1.3710964607911174, + "grad_norm": 0.9867647886276245, + "learning_rate": 1.3407741594619844e-05, + "loss": 0.6217, + "step": 7903 + }, + { + "epoch": 1.3712699514226232, + "grad_norm": 1.8981071710586548, + "learning_rate": 1.3401309471536092e-05, + "loss": 0.71, + "step": 7904 + }, + { + "epoch": 1.371443442054129, + "grad_norm": 0.8998876810073853, + "learning_rate": 1.3394878114238059e-05, + "loss": 0.5413, + "step": 7905 + }, + { + "epoch": 1.371616932685635, + "grad_norm": 1.0560396909713745, + "learning_rate": 1.3388447523472122e-05, + "loss": 0.6111, + "step": 7906 + }, + { + "epoch": 1.3717904233171407, + "grad_norm": 0.8488804697990417, + "learning_rate": 1.3382017699984551e-05, + "loss": 0.8105, + "step": 7907 + }, + { + "epoch": 1.3719639139486468, + "grad_norm": 0.8172667026519775, + "learning_rate": 1.337558864452154e-05, + "loss": 0.824, + "step": 7908 + }, + { + "epoch": 1.3721374045801527, + "grad_norm": 1.0840293169021606, + "learning_rate": 1.3369160357829185e-05, + "loss": 0.6772, + "step": 7909 + }, + { + "epoch": 1.3723108952116585, + "grad_norm": 1.1725060939788818, + "learning_rate": 1.3362732840653494e-05, + "loss": 0.6825, + "step": 7910 + }, + { + "epoch": 1.3724843858431646, + "grad_norm": 1.1753002405166626, + "learning_rate": 1.3356306093740392e-05, + "loss": 0.7347, + "step": 7911 + }, + { + "epoch": 1.3726578764746704, + "grad_norm": 0.9607205986976624, + "learning_rate": 1.3349880117835716e-05, + "loss": 0.7971, + "step": 7912 + }, + { + "epoch": 1.3728313671061763, + "grad_norm": 1.1791051626205444, + "learning_rate": 1.3343454913685195e-05, + "loss": 0.5327, + "step": 7913 + }, + { + "epoch": 1.373004857737682, + "grad_norm": 0.9866552948951721, + "learning_rate": 1.3337030482034485e-05, + "loss": 0.6241, + "step": 7914 + }, + { + "epoch": 1.373178348369188, + "grad_norm": 1.0429608821868896, + "learning_rate": 1.3330606823629161e-05, + "loss": 0.5528, + "step": 7915 + }, + { + "epoch": 1.373351839000694, + "grad_norm": 1.0296598672866821, + "learning_rate": 1.332418393921468e-05, + "loss": 0.6609, + "step": 7916 + }, + { + "epoch": 1.3735253296321999, + "grad_norm": 2.9180335998535156, + "learning_rate": 1.3317761829536428e-05, + "loss": 0.6158, + "step": 7917 + }, + { + "epoch": 1.3736988202637057, + "grad_norm": 1.4046275615692139, + "learning_rate": 1.3311340495339704e-05, + "loss": 0.6566, + "step": 7918 + }, + { + "epoch": 1.3738723108952118, + "grad_norm": 0.9443055391311646, + "learning_rate": 1.3304919937369718e-05, + "loss": 0.5479, + "step": 7919 + }, + { + "epoch": 1.3740458015267176, + "grad_norm": 1.0538780689239502, + "learning_rate": 1.3298500156371565e-05, + "loss": 0.6791, + "step": 7920 + }, + { + "epoch": 1.3742192921582235, + "grad_norm": 1.1865625381469727, + "learning_rate": 1.329208115309028e-05, + "loss": 0.8462, + "step": 7921 + }, + { + "epoch": 1.3743927827897293, + "grad_norm": 0.8766623735427856, + "learning_rate": 1.32856629282708e-05, + "loss": 0.754, + "step": 7922 + }, + { + "epoch": 1.3745662734212352, + "grad_norm": 1.1540385484695435, + "learning_rate": 1.3279245482657953e-05, + "loss": 0.6283, + "step": 7923 + }, + { + "epoch": 1.3747397640527412, + "grad_norm": 1.1594018936157227, + "learning_rate": 1.3272828816996498e-05, + "loss": 0.7173, + "step": 7924 + }, + { + "epoch": 1.374913254684247, + "grad_norm": 1.0621143579483032, + "learning_rate": 1.3266412932031108e-05, + "loss": 0.7161, + "step": 7925 + }, + { + "epoch": 1.375086745315753, + "grad_norm": 1.1770943403244019, + "learning_rate": 1.3259997828506333e-05, + "loss": 0.5952, + "step": 7926 + }, + { + "epoch": 1.3752602359472588, + "grad_norm": 0.9982792139053345, + "learning_rate": 1.3253583507166666e-05, + "loss": 0.7769, + "step": 7927 + }, + { + "epoch": 1.3754337265787648, + "grad_norm": 0.9407086968421936, + "learning_rate": 1.3247169968756494e-05, + "loss": 0.7861, + "step": 7928 + }, + { + "epoch": 1.3756072172102707, + "grad_norm": 0.9402908086776733, + "learning_rate": 1.3240757214020117e-05, + "loss": 0.6162, + "step": 7929 + }, + { + "epoch": 1.3757807078417765, + "grad_norm": 0.8987496495246887, + "learning_rate": 1.3234345243701743e-05, + "loss": 0.6603, + "step": 7930 + }, + { + "epoch": 1.3759541984732824, + "grad_norm": 1.2195148468017578, + "learning_rate": 1.3227934058545481e-05, + "loss": 0.748, + "step": 7931 + }, + { + "epoch": 1.3761276891047882, + "grad_norm": 0.9742690920829773, + "learning_rate": 1.322152365929537e-05, + "loss": 0.6111, + "step": 7932 + }, + { + "epoch": 1.3763011797362943, + "grad_norm": 0.7927941083908081, + "learning_rate": 1.321511404669533e-05, + "loss": 0.7228, + "step": 7933 + }, + { + "epoch": 1.3764746703678001, + "grad_norm": 1.0623246431350708, + "learning_rate": 1.320870522148921e-05, + "loss": 0.5543, + "step": 7934 + }, + { + "epoch": 1.376648160999306, + "grad_norm": 1.9994624853134155, + "learning_rate": 1.3202297184420774e-05, + "loss": 0.6904, + "step": 7935 + }, + { + "epoch": 1.376821651630812, + "grad_norm": 0.7930338382720947, + "learning_rate": 1.3195889936233662e-05, + "loss": 0.7791, + "step": 7936 + }, + { + "epoch": 1.376995142262318, + "grad_norm": 0.7339880466461182, + "learning_rate": 1.318948347767145e-05, + "loss": 0.8613, + "step": 7937 + }, + { + "epoch": 1.3771686328938237, + "grad_norm": 1.0511869192123413, + "learning_rate": 1.3183077809477625e-05, + "loss": 0.7295, + "step": 7938 + }, + { + "epoch": 1.3773421235253296, + "grad_norm": 0.7754176259040833, + "learning_rate": 1.3176672932395556e-05, + "loss": 0.7234, + "step": 7939 + }, + { + "epoch": 1.3775156141568354, + "grad_norm": 0.7992454767227173, + "learning_rate": 1.3170268847168541e-05, + "loss": 0.7286, + "step": 7940 + }, + { + "epoch": 1.3776891047883415, + "grad_norm": 0.6674901843070984, + "learning_rate": 1.3163865554539794e-05, + "loss": 0.7031, + "step": 7941 + }, + { + "epoch": 1.3778625954198473, + "grad_norm": 0.9475463032722473, + "learning_rate": 1.315746305525241e-05, + "loss": 0.7158, + "step": 7942 + }, + { + "epoch": 1.3780360860513532, + "grad_norm": 0.9277877807617188, + "learning_rate": 1.3151061350049408e-05, + "loss": 0.5759, + "step": 7943 + }, + { + "epoch": 1.3782095766828593, + "grad_norm": 0.9108982086181641, + "learning_rate": 1.3144660439673727e-05, + "loss": 0.667, + "step": 7944 + }, + { + "epoch": 1.378383067314365, + "grad_norm": 1.0104904174804688, + "learning_rate": 1.3138260324868179e-05, + "loss": 0.5363, + "step": 7945 + }, + { + "epoch": 1.378556557945871, + "grad_norm": 0.983525276184082, + "learning_rate": 1.3131861006375519e-05, + "loss": 0.9001, + "step": 7946 + }, + { + "epoch": 1.3787300485773768, + "grad_norm": 0.9385712742805481, + "learning_rate": 1.3125462484938391e-05, + "loss": 0.6527, + "step": 7947 + }, + { + "epoch": 1.3789035392088826, + "grad_norm": 0.8720304369926453, + "learning_rate": 1.311906476129936e-05, + "loss": 0.7584, + "step": 7948 + }, + { + "epoch": 1.3790770298403885, + "grad_norm": 0.7727636098861694, + "learning_rate": 1.3112667836200873e-05, + "loss": 0.8445, + "step": 7949 + }, + { + "epoch": 1.3792505204718946, + "grad_norm": 0.9119306802749634, + "learning_rate": 1.3106271710385312e-05, + "loss": 0.7151, + "step": 7950 + }, + { + "epoch": 1.3794240111034004, + "grad_norm": 1.1061729192733765, + "learning_rate": 1.3099876384594951e-05, + "loss": 0.7546, + "step": 7951 + }, + { + "epoch": 1.3795975017349063, + "grad_norm": 0.8095812201499939, + "learning_rate": 1.3093481859571981e-05, + "loss": 0.8032, + "step": 7952 + }, + { + "epoch": 1.3797709923664123, + "grad_norm": 0.8781191110610962, + "learning_rate": 1.3087088136058481e-05, + "loss": 0.6744, + "step": 7953 + }, + { + "epoch": 1.3799444829979182, + "grad_norm": 2.086254358291626, + "learning_rate": 1.3080695214796464e-05, + "loss": 0.7406, + "step": 7954 + }, + { + "epoch": 1.380117973629424, + "grad_norm": 0.6720625758171082, + "learning_rate": 1.3074303096527823e-05, + "loss": 0.7849, + "step": 7955 + }, + { + "epoch": 1.3802914642609299, + "grad_norm": 1.05963134765625, + "learning_rate": 1.3067911781994384e-05, + "loss": 0.6998, + "step": 7956 + }, + { + "epoch": 1.3804649548924357, + "grad_norm": 1.069916009902954, + "learning_rate": 1.306152127193786e-05, + "loss": 0.6908, + "step": 7957 + }, + { + "epoch": 1.3806384455239418, + "grad_norm": 0.9541560411453247, + "learning_rate": 1.3055131567099872e-05, + "loss": 0.7096, + "step": 7958 + }, + { + "epoch": 1.3808119361554476, + "grad_norm": 0.938120424747467, + "learning_rate": 1.3048742668221959e-05, + "loss": 0.7676, + "step": 7959 + }, + { + "epoch": 1.3809854267869535, + "grad_norm": 1.038383960723877, + "learning_rate": 1.3042354576045559e-05, + "loss": 0.5751, + "step": 7960 + }, + { + "epoch": 1.3811589174184595, + "grad_norm": 0.801677942276001, + "learning_rate": 1.3035967291312029e-05, + "loss": 0.7085, + "step": 7961 + }, + { + "epoch": 1.3813324080499654, + "grad_norm": 0.8584671020507812, + "learning_rate": 1.3029580814762596e-05, + "loss": 0.7201, + "step": 7962 + }, + { + "epoch": 1.3815058986814712, + "grad_norm": 0.944556713104248, + "learning_rate": 1.3023195147138434e-05, + "loss": 0.5358, + "step": 7963 + }, + { + "epoch": 1.381679389312977, + "grad_norm": 1.4829367399215698, + "learning_rate": 1.3016810289180615e-05, + "loss": 0.5133, + "step": 7964 + }, + { + "epoch": 1.381852879944483, + "grad_norm": 0.9718306064605713, + "learning_rate": 1.3010426241630088e-05, + "loss": 0.8464, + "step": 7965 + }, + { + "epoch": 1.3820263705759888, + "grad_norm": 1.11758553981781, + "learning_rate": 1.3004043005227742e-05, + "loss": 0.5953, + "step": 7966 + }, + { + "epoch": 1.3821998612074948, + "grad_norm": 0.8829220533370972, + "learning_rate": 1.2997660580714364e-05, + "loss": 0.7756, + "step": 7967 + }, + { + "epoch": 1.3823733518390007, + "grad_norm": 0.8267119526863098, + "learning_rate": 1.2991278968830625e-05, + "loss": 0.7974, + "step": 7968 + }, + { + "epoch": 1.3825468424705065, + "grad_norm": 0.9178968071937561, + "learning_rate": 1.298489817031713e-05, + "loss": 0.6758, + "step": 7969 + }, + { + "epoch": 1.3827203331020126, + "grad_norm": 0.9040138721466064, + "learning_rate": 1.2978518185914388e-05, + "loss": 0.6359, + "step": 7970 + }, + { + "epoch": 1.3828938237335184, + "grad_norm": 1.107829213142395, + "learning_rate": 1.2972139016362782e-05, + "loss": 0.6622, + "step": 7971 + }, + { + "epoch": 1.3830673143650243, + "grad_norm": 0.7691729068756104, + "learning_rate": 1.296576066240263e-05, + "loss": 0.7087, + "step": 7972 + }, + { + "epoch": 1.3832408049965301, + "grad_norm": 1.534574031829834, + "learning_rate": 1.2959383124774152e-05, + "loss": 0.6716, + "step": 7973 + }, + { + "epoch": 1.383414295628036, + "grad_norm": 0.8717774748802185, + "learning_rate": 1.2953006404217474e-05, + "loss": 0.7446, + "step": 7974 + }, + { + "epoch": 1.383587786259542, + "grad_norm": 0.7984367609024048, + "learning_rate": 1.2946630501472607e-05, + "loss": 0.7281, + "step": 7975 + }, + { + "epoch": 1.3837612768910479, + "grad_norm": 1.091104507446289, + "learning_rate": 1.2940255417279486e-05, + "loss": 0.7314, + "step": 7976 + }, + { + "epoch": 1.3839347675225537, + "grad_norm": 1.3805065155029297, + "learning_rate": 1.2933881152377956e-05, + "loss": 0.707, + "step": 7977 + }, + { + "epoch": 1.3841082581540598, + "grad_norm": 1.234600305557251, + "learning_rate": 1.2927507707507751e-05, + "loss": 0.6346, + "step": 7978 + }, + { + "epoch": 1.3842817487855656, + "grad_norm": 0.9650316834449768, + "learning_rate": 1.2921135083408513e-05, + "loss": 0.7075, + "step": 7979 + }, + { + "epoch": 1.3844552394170715, + "grad_norm": 1.1210131645202637, + "learning_rate": 1.2914763280819804e-05, + "loss": 0.7484, + "step": 7980 + }, + { + "epoch": 1.3846287300485773, + "grad_norm": 1.9405583143234253, + "learning_rate": 1.2908392300481067e-05, + "loss": 0.702, + "step": 7981 + }, + { + "epoch": 1.3848022206800832, + "grad_norm": 1.1998621225357056, + "learning_rate": 1.2902022143131668e-05, + "loss": 0.6309, + "step": 7982 + }, + { + "epoch": 1.3849757113115893, + "grad_norm": 0.9981486797332764, + "learning_rate": 1.2895652809510875e-05, + "loss": 0.6426, + "step": 7983 + }, + { + "epoch": 1.385149201943095, + "grad_norm": 0.7377332448959351, + "learning_rate": 1.2889284300357847e-05, + "loss": 0.8905, + "step": 7984 + }, + { + "epoch": 1.385322692574601, + "grad_norm": 0.8932281732559204, + "learning_rate": 1.2882916616411658e-05, + "loss": 0.7112, + "step": 7985 + }, + { + "epoch": 1.3854961832061068, + "grad_norm": 0.7345430850982666, + "learning_rate": 1.28765497584113e-05, + "loss": 0.6587, + "step": 7986 + }, + { + "epoch": 1.3856696738376129, + "grad_norm": 0.9459245204925537, + "learning_rate": 1.2870183727095633e-05, + "loss": 0.7139, + "step": 7987 + }, + { + "epoch": 1.3858431644691187, + "grad_norm": 1.2363439798355103, + "learning_rate": 1.2863818523203452e-05, + "loss": 0.5653, + "step": 7988 + }, + { + "epoch": 1.3860166551006246, + "grad_norm": 0.9558073282241821, + "learning_rate": 1.285745414747345e-05, + "loss": 0.6721, + "step": 7989 + }, + { + "epoch": 1.3861901457321304, + "grad_norm": 1.302771806716919, + "learning_rate": 1.2851090600644223e-05, + "loss": 0.7078, + "step": 7990 + }, + { + "epoch": 1.3863636363636362, + "grad_norm": 1.0245417356491089, + "learning_rate": 1.284472788345425e-05, + "loss": 0.8064, + "step": 7991 + }, + { + "epoch": 1.3865371269951423, + "grad_norm": 0.9302948117256165, + "learning_rate": 1.2838365996641949e-05, + "loss": 0.698, + "step": 7992 + }, + { + "epoch": 1.3867106176266482, + "grad_norm": 1.1496130228042603, + "learning_rate": 1.2832004940945626e-05, + "loss": 0.7595, + "step": 7993 + }, + { + "epoch": 1.386884108258154, + "grad_norm": 0.9967625141143799, + "learning_rate": 1.2825644717103472e-05, + "loss": 0.7671, + "step": 7994 + }, + { + "epoch": 1.38705759888966, + "grad_norm": 0.8126618266105652, + "learning_rate": 1.281928532585361e-05, + "loss": 0.866, + "step": 7995 + }, + { + "epoch": 1.387231089521166, + "grad_norm": 0.9364733099937439, + "learning_rate": 1.281292676793406e-05, + "loss": 0.7646, + "step": 7996 + }, + { + "epoch": 1.3874045801526718, + "grad_norm": 1.1658798456192017, + "learning_rate": 1.2806569044082729e-05, + "loss": 0.636, + "step": 7997 + }, + { + "epoch": 1.3875780707841776, + "grad_norm": 1.0728306770324707, + "learning_rate": 1.2800212155037437e-05, + "loss": 0.6411, + "step": 7998 + }, + { + "epoch": 1.3877515614156835, + "grad_norm": 0.8164652585983276, + "learning_rate": 1.2793856101535922e-05, + "loss": 0.7917, + "step": 7999 + }, + { + "epoch": 1.3879250520471895, + "grad_norm": 1.0305864810943604, + "learning_rate": 1.27875008843158e-05, + "loss": 0.7666, + "step": 8000 + }, + { + "epoch": 1.3880985426786954, + "grad_norm": 1.580378532409668, + "learning_rate": 1.2781146504114604e-05, + "loss": 0.6836, + "step": 8001 + }, + { + "epoch": 1.3882720333102012, + "grad_norm": 1.5256668329238892, + "learning_rate": 1.2774792961669764e-05, + "loss": 0.7771, + "step": 8002 + }, + { + "epoch": 1.3884455239417073, + "grad_norm": 1.1284058094024658, + "learning_rate": 1.2768440257718626e-05, + "loss": 0.7, + "step": 8003 + }, + { + "epoch": 1.3886190145732131, + "grad_norm": 0.9840512275695801, + "learning_rate": 1.2762088392998417e-05, + "loss": 0.6122, + "step": 8004 + }, + { + "epoch": 1.388792505204719, + "grad_norm": 1.4198616743087769, + "learning_rate": 1.275573736824629e-05, + "loss": 0.6832, + "step": 8005 + }, + { + "epoch": 1.3889659958362248, + "grad_norm": 0.8015937805175781, + "learning_rate": 1.2749387184199283e-05, + "loss": 0.8867, + "step": 8006 + }, + { + "epoch": 1.3891394864677307, + "grad_norm": 1.4538313150405884, + "learning_rate": 1.2743037841594334e-05, + "loss": 0.5706, + "step": 8007 + }, + { + "epoch": 1.3893129770992365, + "grad_norm": 0.8867776393890381, + "learning_rate": 1.2736689341168304e-05, + "loss": 0.5377, + "step": 8008 + }, + { + "epoch": 1.3894864677307426, + "grad_norm": 1.169008493423462, + "learning_rate": 1.2730341683657945e-05, + "loss": 0.5388, + "step": 8009 + }, + { + "epoch": 1.3896599583622484, + "grad_norm": 0.9816068410873413, + "learning_rate": 1.2723994869799898e-05, + "loss": 0.5667, + "step": 8010 + }, + { + "epoch": 1.3898334489937543, + "grad_norm": 0.7220948338508606, + "learning_rate": 1.2717648900330723e-05, + "loss": 0.6465, + "step": 8011 + }, + { + "epoch": 1.3900069396252603, + "grad_norm": 0.8136598467826843, + "learning_rate": 1.2711303775986888e-05, + "loss": 0.708, + "step": 8012 + }, + { + "epoch": 1.3901804302567662, + "grad_norm": 1.1590909957885742, + "learning_rate": 1.2704959497504734e-05, + "loss": 0.6049, + "step": 8013 + }, + { + "epoch": 1.390353920888272, + "grad_norm": 0.9597334265708923, + "learning_rate": 1.2698616065620528e-05, + "loss": 0.8467, + "step": 8014 + }, + { + "epoch": 1.3905274115197779, + "grad_norm": 1.0963040590286255, + "learning_rate": 1.269227348107045e-05, + "loss": 0.6101, + "step": 8015 + }, + { + "epoch": 1.3907009021512837, + "grad_norm": 1.2490590810775757, + "learning_rate": 1.2685931744590536e-05, + "loss": 0.7023, + "step": 8016 + }, + { + "epoch": 1.3908743927827898, + "grad_norm": 0.8851966857910156, + "learning_rate": 1.2679590856916769e-05, + "loss": 0.7147, + "step": 8017 + }, + { + "epoch": 1.3910478834142956, + "grad_norm": 1.6806771755218506, + "learning_rate": 1.267325081878501e-05, + "loss": 0.6716, + "step": 8018 + }, + { + "epoch": 1.3912213740458015, + "grad_norm": 1.0348660945892334, + "learning_rate": 1.266691163093104e-05, + "loss": 0.6848, + "step": 8019 + }, + { + "epoch": 1.3913948646773076, + "grad_norm": 0.9733901023864746, + "learning_rate": 1.2660573294090512e-05, + "loss": 0.5796, + "step": 8020 + }, + { + "epoch": 1.3915683553088134, + "grad_norm": 0.8911205530166626, + "learning_rate": 1.2654235808999007e-05, + "loss": 0.7581, + "step": 8021 + }, + { + "epoch": 1.3917418459403192, + "grad_norm": 0.8962191343307495, + "learning_rate": 1.2647899176392003e-05, + "loss": 0.5659, + "step": 8022 + }, + { + "epoch": 1.391915336571825, + "grad_norm": 0.6896473169326782, + "learning_rate": 1.2641563397004859e-05, + "loss": 0.8916, + "step": 8023 + }, + { + "epoch": 1.392088827203331, + "grad_norm": 0.9082165956497192, + "learning_rate": 1.2635228471572853e-05, + "loss": 0.7944, + "step": 8024 + }, + { + "epoch": 1.3922623178348368, + "grad_norm": 0.8065841794013977, + "learning_rate": 1.2628894400831175e-05, + "loss": 0.6438, + "step": 8025 + }, + { + "epoch": 1.3924358084663429, + "grad_norm": 0.9644336104393005, + "learning_rate": 1.2622561185514886e-05, + "loss": 0.6127, + "step": 8026 + }, + { + "epoch": 1.3926092990978487, + "grad_norm": 0.9195787310600281, + "learning_rate": 1.2616228826358965e-05, + "loss": 0.7173, + "step": 8027 + }, + { + "epoch": 1.3927827897293545, + "grad_norm": 1.0101850032806396, + "learning_rate": 1.2609897324098297e-05, + "loss": 0.5657, + "step": 8028 + }, + { + "epoch": 1.3929562803608606, + "grad_norm": 1.2096457481384277, + "learning_rate": 1.2603566679467654e-05, + "loss": 0.7312, + "step": 8029 + }, + { + "epoch": 1.3931297709923665, + "grad_norm": 0.7993245720863342, + "learning_rate": 1.2597236893201712e-05, + "loss": 0.7368, + "step": 8030 + }, + { + "epoch": 1.3933032616238723, + "grad_norm": 0.7649964094161987, + "learning_rate": 1.259090796603506e-05, + "loss": 0.7793, + "step": 8031 + }, + { + "epoch": 1.3934767522553781, + "grad_norm": 1.3832169771194458, + "learning_rate": 1.2584579898702175e-05, + "loss": 0.6206, + "step": 8032 + }, + { + "epoch": 1.393650242886884, + "grad_norm": 1.054778814315796, + "learning_rate": 1.2578252691937425e-05, + "loss": 0.6193, + "step": 8033 + }, + { + "epoch": 1.39382373351839, + "grad_norm": 0.8354358077049255, + "learning_rate": 1.25719263464751e-05, + "loss": 0.9126, + "step": 8034 + }, + { + "epoch": 1.393997224149896, + "grad_norm": 0.9617300033569336, + "learning_rate": 1.2565600863049387e-05, + "loss": 0.656, + "step": 8035 + }, + { + "epoch": 1.3941707147814018, + "grad_norm": 1.0957831144332886, + "learning_rate": 1.2559276242394347e-05, + "loss": 0.6306, + "step": 8036 + }, + { + "epoch": 1.3943442054129078, + "grad_norm": 1.0565696954727173, + "learning_rate": 1.255295248524397e-05, + "loss": 0.6072, + "step": 8037 + }, + { + "epoch": 1.3945176960444137, + "grad_norm": 0.7889808416366577, + "learning_rate": 1.254662959233214e-05, + "loss": 0.6089, + "step": 8038 + }, + { + "epoch": 1.3946911866759195, + "grad_norm": 1.2708064317703247, + "learning_rate": 1.2540307564392627e-05, + "loss": 0.6909, + "step": 8039 + }, + { + "epoch": 1.3948646773074254, + "grad_norm": 0.8058679103851318, + "learning_rate": 1.2533986402159113e-05, + "loss": 0.5596, + "step": 8040 + }, + { + "epoch": 1.3950381679389312, + "grad_norm": 1.3211768865585327, + "learning_rate": 1.2527666106365182e-05, + "loss": 0.528, + "step": 8041 + }, + { + "epoch": 1.3952116585704373, + "grad_norm": 1.2449687719345093, + "learning_rate": 1.25213466777443e-05, + "loss": 0.6235, + "step": 8042 + }, + { + "epoch": 1.3953851492019431, + "grad_norm": 1.0022900104522705, + "learning_rate": 1.2515028117029848e-05, + "loss": 0.5834, + "step": 8043 + }, + { + "epoch": 1.395558639833449, + "grad_norm": 1.0637526512145996, + "learning_rate": 1.2508710424955117e-05, + "loss": 0.7565, + "step": 8044 + }, + { + "epoch": 1.3957321304649548, + "grad_norm": 0.8811051845550537, + "learning_rate": 1.2502393602253258e-05, + "loss": 0.8179, + "step": 8045 + }, + { + "epoch": 1.3959056210964609, + "grad_norm": 0.7115247845649719, + "learning_rate": 1.249607764965736e-05, + "loss": 0.7677, + "step": 8046 + }, + { + "epoch": 1.3960791117279667, + "grad_norm": 1.0147924423217773, + "learning_rate": 1.2489762567900395e-05, + "loss": 0.6152, + "step": 8047 + }, + { + "epoch": 1.3962526023594726, + "grad_norm": 5.437389373779297, + "learning_rate": 1.2483448357715242e-05, + "loss": 0.7194, + "step": 8048 + }, + { + "epoch": 1.3964260929909784, + "grad_norm": 0.9743807911872864, + "learning_rate": 1.2477135019834659e-05, + "loss": 0.6483, + "step": 8049 + }, + { + "epoch": 1.3965995836224843, + "grad_norm": 2.424745798110962, + "learning_rate": 1.2470822554991321e-05, + "loss": 0.6206, + "step": 8050 + }, + { + "epoch": 1.3967730742539903, + "grad_norm": 0.8367231488227844, + "learning_rate": 1.2464510963917803e-05, + "loss": 0.8461, + "step": 8051 + }, + { + "epoch": 1.3969465648854962, + "grad_norm": 1.1343785524368286, + "learning_rate": 1.2458200247346569e-05, + "loss": 0.8123, + "step": 8052 + }, + { + "epoch": 1.397120055517002, + "grad_norm": 0.8361849784851074, + "learning_rate": 1.2451890406009981e-05, + "loss": 0.7634, + "step": 8053 + }, + { + "epoch": 1.397293546148508, + "grad_norm": 0.8060932159423828, + "learning_rate": 1.2445581440640312e-05, + "loss": 0.6576, + "step": 8054 + }, + { + "epoch": 1.397467036780014, + "grad_norm": 0.8014965653419495, + "learning_rate": 1.2439273351969712e-05, + "loss": 0.7302, + "step": 8055 + }, + { + "epoch": 1.3976405274115198, + "grad_norm": 1.1547352075576782, + "learning_rate": 1.243296614073025e-05, + "loss": 0.6804, + "step": 8056 + }, + { + "epoch": 1.3978140180430256, + "grad_norm": 1.518118143081665, + "learning_rate": 1.2426659807653896e-05, + "loss": 0.5747, + "step": 8057 + }, + { + "epoch": 1.3979875086745315, + "grad_norm": 1.1531081199645996, + "learning_rate": 1.2420354353472483e-05, + "loss": 0.5999, + "step": 8058 + }, + { + "epoch": 1.3981609993060375, + "grad_norm": 0.8253722786903381, + "learning_rate": 1.2414049778917782e-05, + "loss": 0.7219, + "step": 8059 + }, + { + "epoch": 1.3983344899375434, + "grad_norm": 1.1576659679412842, + "learning_rate": 1.2407746084721444e-05, + "loss": 0.7295, + "step": 8060 + }, + { + "epoch": 1.3985079805690492, + "grad_norm": 0.8974632620811462, + "learning_rate": 1.2401443271615028e-05, + "loss": 0.5861, + "step": 8061 + }, + { + "epoch": 1.3986814712005553, + "grad_norm": 1.0525966882705688, + "learning_rate": 1.2395141340329966e-05, + "loss": 0.7048, + "step": 8062 + }, + { + "epoch": 1.3988549618320612, + "grad_norm": 1.453865647315979, + "learning_rate": 1.2388840291597611e-05, + "loss": 0.7439, + "step": 8063 + }, + { + "epoch": 1.399028452463567, + "grad_norm": 1.18207848072052, + "learning_rate": 1.2382540126149218e-05, + "loss": 0.6838, + "step": 8064 + }, + { + "epoch": 1.3992019430950728, + "grad_norm": 0.834454357624054, + "learning_rate": 1.2376240844715912e-05, + "loss": 0.6819, + "step": 8065 + }, + { + "epoch": 1.3993754337265787, + "grad_norm": 1.11368727684021, + "learning_rate": 1.2369942448028738e-05, + "loss": 0.676, + "step": 8066 + }, + { + "epoch": 1.3995489243580845, + "grad_norm": 1.3010069131851196, + "learning_rate": 1.2363644936818645e-05, + "loss": 0.6108, + "step": 8067 + }, + { + "epoch": 1.3997224149895906, + "grad_norm": 0.687893807888031, + "learning_rate": 1.2357348311816444e-05, + "loss": 0.7148, + "step": 8068 + }, + { + "epoch": 1.3998959056210964, + "grad_norm": 1.1652369499206543, + "learning_rate": 1.2351052573752878e-05, + "loss": 0.6624, + "step": 8069 + }, + { + "epoch": 1.4000693962526023, + "grad_norm": 0.9500651955604553, + "learning_rate": 1.2344757723358583e-05, + "loss": 0.5098, + "step": 8070 + }, + { + "epoch": 1.4002428868841084, + "grad_norm": 0.9084645509719849, + "learning_rate": 1.2338463761364063e-05, + "loss": 0.7964, + "step": 8071 + }, + { + "epoch": 1.4004163775156142, + "grad_norm": 0.7188977003097534, + "learning_rate": 1.2332170688499753e-05, + "loss": 0.8284, + "step": 8072 + }, + { + "epoch": 1.40058986814712, + "grad_norm": 1.1773197650909424, + "learning_rate": 1.2325878505495971e-05, + "loss": 0.6112, + "step": 8073 + }, + { + "epoch": 1.400763358778626, + "grad_norm": 1.2211356163024902, + "learning_rate": 1.2319587213082931e-05, + "loss": 0.6671, + "step": 8074 + }, + { + "epoch": 1.4009368494101317, + "grad_norm": 0.9914692640304565, + "learning_rate": 1.2313296811990747e-05, + "loss": 0.6777, + "step": 8075 + }, + { + "epoch": 1.4011103400416378, + "grad_norm": 1.1794930696487427, + "learning_rate": 1.230700730294942e-05, + "loss": 0.5688, + "step": 8076 + }, + { + "epoch": 1.4012838306731437, + "grad_norm": 0.7747244238853455, + "learning_rate": 1.2300718686688863e-05, + "loss": 0.645, + "step": 8077 + }, + { + "epoch": 1.4014573213046495, + "grad_norm": 0.9731133580207825, + "learning_rate": 1.229443096393887e-05, + "loss": 0.5514, + "step": 8078 + }, + { + "epoch": 1.4016308119361556, + "grad_norm": 1.0880115032196045, + "learning_rate": 1.2288144135429146e-05, + "loss": 0.5267, + "step": 8079 + }, + { + "epoch": 1.4018043025676614, + "grad_norm": 0.7924405932426453, + "learning_rate": 1.2281858201889283e-05, + "loss": 0.8569, + "step": 8080 + }, + { + "epoch": 1.4019777931991673, + "grad_norm": 0.8059411644935608, + "learning_rate": 1.2275573164048765e-05, + "loss": 0.6093, + "step": 8081 + }, + { + "epoch": 1.4021512838306731, + "grad_norm": 2.010251998901367, + "learning_rate": 1.226928902263698e-05, + "loss": 0.6976, + "step": 8082 + }, + { + "epoch": 1.402324774462179, + "grad_norm": 1.794490933418274, + "learning_rate": 1.2263005778383222e-05, + "loss": 0.8225, + "step": 8083 + }, + { + "epoch": 1.4024982650936848, + "grad_norm": 1.518547534942627, + "learning_rate": 1.2256723432016648e-05, + "loss": 0.5906, + "step": 8084 + }, + { + "epoch": 1.4026717557251909, + "grad_norm": 0.803968608379364, + "learning_rate": 1.2250441984266343e-05, + "loss": 0.7178, + "step": 8085 + }, + { + "epoch": 1.4028452463566967, + "grad_norm": 1.036411166191101, + "learning_rate": 1.2244161435861286e-05, + "loss": 0.8254, + "step": 8086 + }, + { + "epoch": 1.4030187369882026, + "grad_norm": 0.7741610407829285, + "learning_rate": 1.2237881787530324e-05, + "loss": 0.7656, + "step": 8087 + }, + { + "epoch": 1.4031922276197086, + "grad_norm": 1.3745089769363403, + "learning_rate": 1.2231603040002225e-05, + "loss": 0.5973, + "step": 8088 + }, + { + "epoch": 1.4033657182512145, + "grad_norm": 1.1955164670944214, + "learning_rate": 1.2225325194005642e-05, + "loss": 0.6045, + "step": 8089 + }, + { + "epoch": 1.4035392088827203, + "grad_norm": 2.0960581302642822, + "learning_rate": 1.2219048250269141e-05, + "loss": 0.7678, + "step": 8090 + }, + { + "epoch": 1.4037126995142262, + "grad_norm": 1.470046043395996, + "learning_rate": 1.2212772209521146e-05, + "loss": 0.8412, + "step": 8091 + }, + { + "epoch": 1.403886190145732, + "grad_norm": 0.7863114476203918, + "learning_rate": 1.2206497072490014e-05, + "loss": 0.8438, + "step": 8092 + }, + { + "epoch": 1.404059680777238, + "grad_norm": 0.8866109251976013, + "learning_rate": 1.2200222839903983e-05, + "loss": 0.6523, + "step": 8093 + }, + { + "epoch": 1.404233171408744, + "grad_norm": 1.0516676902770996, + "learning_rate": 1.2193949512491172e-05, + "loss": 0.8347, + "step": 8094 + }, + { + "epoch": 1.4044066620402498, + "grad_norm": 0.6335523128509521, + "learning_rate": 1.218767709097962e-05, + "loss": 0.7731, + "step": 8095 + }, + { + "epoch": 1.4045801526717558, + "grad_norm": 0.8425604104995728, + "learning_rate": 1.2181405576097247e-05, + "loss": 0.7258, + "step": 8096 + }, + { + "epoch": 1.4047536433032617, + "grad_norm": 0.9876832962036133, + "learning_rate": 1.2175134968571863e-05, + "loss": 0.7688, + "step": 8097 + }, + { + "epoch": 1.4049271339347675, + "grad_norm": 1.6114414930343628, + "learning_rate": 1.2168865269131182e-05, + "loss": 0.593, + "step": 8098 + }, + { + "epoch": 1.4051006245662734, + "grad_norm": 0.9184619784355164, + "learning_rate": 1.2162596478502815e-05, + "loss": 0.7668, + "step": 8099 + }, + { + "epoch": 1.4052741151977792, + "grad_norm": 0.9215431213378906, + "learning_rate": 1.2156328597414258e-05, + "loss": 0.6779, + "step": 8100 + }, + { + "epoch": 1.4054476058292853, + "grad_norm": 1.251543402671814, + "learning_rate": 1.2150061626592908e-05, + "loss": 0.7966, + "step": 8101 + }, + { + "epoch": 1.4056210964607911, + "grad_norm": 1.1763193607330322, + "learning_rate": 1.2143795566766054e-05, + "loss": 0.7611, + "step": 8102 + }, + { + "epoch": 1.405794587092297, + "grad_norm": 1.150187373161316, + "learning_rate": 1.2137530418660877e-05, + "loss": 0.7146, + "step": 8103 + }, + { + "epoch": 1.4059680777238028, + "grad_norm": 1.0967097282409668, + "learning_rate": 1.2131266183004455e-05, + "loss": 0.6555, + "step": 8104 + }, + { + "epoch": 1.406141568355309, + "grad_norm": 0.9923232197761536, + "learning_rate": 1.212500286052376e-05, + "loss": 0.5516, + "step": 8105 + }, + { + "epoch": 1.4063150589868147, + "grad_norm": 0.705280065536499, + "learning_rate": 1.2118740451945668e-05, + "loss": 0.8491, + "step": 8106 + }, + { + "epoch": 1.4064885496183206, + "grad_norm": 1.2229992151260376, + "learning_rate": 1.2112478957996922e-05, + "loss": 0.5566, + "step": 8107 + }, + { + "epoch": 1.4066620402498264, + "grad_norm": 2.6804494857788086, + "learning_rate": 1.2106218379404187e-05, + "loss": 0.8604, + "step": 8108 + }, + { + "epoch": 1.4068355308813323, + "grad_norm": 3.7239835262298584, + "learning_rate": 1.2099958716894008e-05, + "loss": 0.7494, + "step": 8109 + }, + { + "epoch": 1.4070090215128384, + "grad_norm": 1.6651278734207153, + "learning_rate": 1.2093699971192825e-05, + "loss": 0.8518, + "step": 8110 + }, + { + "epoch": 1.4071825121443442, + "grad_norm": 0.954336404800415, + "learning_rate": 1.2087442143026968e-05, + "loss": 0.5941, + "step": 8111 + }, + { + "epoch": 1.40735600277585, + "grad_norm": 1.175838828086853, + "learning_rate": 1.208118523312268e-05, + "loss": 0.7231, + "step": 8112 + }, + { + "epoch": 1.4075294934073561, + "grad_norm": 1.2879379987716675, + "learning_rate": 1.2074929242206066e-05, + "loss": 0.5546, + "step": 8113 + }, + { + "epoch": 1.407702984038862, + "grad_norm": 1.1188079118728638, + "learning_rate": 1.2068674171003146e-05, + "loss": 0.6304, + "step": 8114 + }, + { + "epoch": 1.4078764746703678, + "grad_norm": 0.9107108116149902, + "learning_rate": 1.2062420020239838e-05, + "loss": 0.6482, + "step": 8115 + }, + { + "epoch": 1.4080499653018737, + "grad_norm": 1.1703542470932007, + "learning_rate": 1.205616679064193e-05, + "loss": 0.6595, + "step": 8116 + }, + { + "epoch": 1.4082234559333795, + "grad_norm": 1.1100467443466187, + "learning_rate": 1.204991448293512e-05, + "loss": 0.7249, + "step": 8117 + }, + { + "epoch": 1.4083969465648856, + "grad_norm": 1.4010428190231323, + "learning_rate": 1.2043663097844999e-05, + "loss": 0.6061, + "step": 8118 + }, + { + "epoch": 1.4085704371963914, + "grad_norm": 0.8237162232398987, + "learning_rate": 1.2037412636097056e-05, + "loss": 0.6266, + "step": 8119 + }, + { + "epoch": 1.4087439278278973, + "grad_norm": 0.8209179639816284, + "learning_rate": 1.2031163098416644e-05, + "loss": 0.748, + "step": 8120 + }, + { + "epoch": 1.4089174184594033, + "grad_norm": 0.9005157947540283, + "learning_rate": 1.2024914485529042e-05, + "loss": 0.5673, + "step": 8121 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 1.080001711845398, + "learning_rate": 1.2018666798159408e-05, + "loss": 0.6879, + "step": 8122 + }, + { + "epoch": 1.409264399722415, + "grad_norm": 0.8887181282043457, + "learning_rate": 1.2012420037032797e-05, + "loss": 0.6064, + "step": 8123 + }, + { + "epoch": 1.4094378903539209, + "grad_norm": 1.040218472480774, + "learning_rate": 1.2006174202874141e-05, + "loss": 0.5447, + "step": 8124 + }, + { + "epoch": 1.4096113809854267, + "grad_norm": 0.8045010566711426, + "learning_rate": 1.1999929296408288e-05, + "loss": 0.7476, + "step": 8125 + }, + { + "epoch": 1.4097848716169326, + "grad_norm": 2.4957849979400635, + "learning_rate": 1.1993685318359956e-05, + "loss": 0.5034, + "step": 8126 + }, + { + "epoch": 1.4099583622484386, + "grad_norm": 1.1189608573913574, + "learning_rate": 1.1987442269453779e-05, + "loss": 0.823, + "step": 8127 + }, + { + "epoch": 1.4101318528799445, + "grad_norm": 1.5976892709732056, + "learning_rate": 1.1981200150414262e-05, + "loss": 0.9236, + "step": 8128 + }, + { + "epoch": 1.4103053435114503, + "grad_norm": 1.4010212421417236, + "learning_rate": 1.1974958961965806e-05, + "loss": 0.6919, + "step": 8129 + }, + { + "epoch": 1.4104788341429564, + "grad_norm": 0.8226161599159241, + "learning_rate": 1.1968718704832716e-05, + "loss": 0.7725, + "step": 8130 + }, + { + "epoch": 1.4106523247744622, + "grad_norm": 1.0676817893981934, + "learning_rate": 1.1962479379739179e-05, + "loss": 0.7817, + "step": 8131 + }, + { + "epoch": 1.410825815405968, + "grad_norm": 1.624548077583313, + "learning_rate": 1.195624098740928e-05, + "loss": 0.7275, + "step": 8132 + }, + { + "epoch": 1.410999306037474, + "grad_norm": 1.1379715204238892, + "learning_rate": 1.1950003528566981e-05, + "loss": 0.6014, + "step": 8133 + }, + { + "epoch": 1.4111727966689798, + "grad_norm": 0.8901684284210205, + "learning_rate": 1.1943767003936152e-05, + "loss": 0.7224, + "step": 8134 + }, + { + "epoch": 1.4113462873004858, + "grad_norm": 0.9298567175865173, + "learning_rate": 1.193753141424056e-05, + "loss": 0.6929, + "step": 8135 + }, + { + "epoch": 1.4115197779319917, + "grad_norm": 0.7918391823768616, + "learning_rate": 1.1931296760203831e-05, + "loss": 0.8347, + "step": 8136 + }, + { + "epoch": 1.4116932685634975, + "grad_norm": 0.8446359038352966, + "learning_rate": 1.1925063042549519e-05, + "loss": 0.8201, + "step": 8137 + }, + { + "epoch": 1.4118667591950036, + "grad_norm": 0.8346973061561584, + "learning_rate": 1.1918830262001055e-05, + "loss": 0.8511, + "step": 8138 + }, + { + "epoch": 1.4120402498265094, + "grad_norm": 0.8432217836380005, + "learning_rate": 1.1912598419281747e-05, + "loss": 0.6964, + "step": 8139 + }, + { + "epoch": 1.4122137404580153, + "grad_norm": 1.0354795455932617, + "learning_rate": 1.1906367515114816e-05, + "loss": 0.5931, + "step": 8140 + }, + { + "epoch": 1.4123872310895211, + "grad_norm": 0.7862289547920227, + "learning_rate": 1.1900137550223376e-05, + "loss": 0.7473, + "step": 8141 + }, + { + "epoch": 1.412560721721027, + "grad_norm": 5.312673091888428, + "learning_rate": 1.1893908525330401e-05, + "loss": 0.5695, + "step": 8142 + }, + { + "epoch": 1.4127342123525328, + "grad_norm": 0.8413613438606262, + "learning_rate": 1.1887680441158791e-05, + "loss": 0.9131, + "step": 8143 + }, + { + "epoch": 1.412907702984039, + "grad_norm": 0.8659276366233826, + "learning_rate": 1.1881453298431323e-05, + "loss": 0.7168, + "step": 8144 + }, + { + "epoch": 1.4130811936155447, + "grad_norm": 1.6472563743591309, + "learning_rate": 1.1875227097870652e-05, + "loss": 0.7722, + "step": 8145 + }, + { + "epoch": 1.4132546842470506, + "grad_norm": 0.6160121560096741, + "learning_rate": 1.1869001840199345e-05, + "loss": 0.8008, + "step": 8146 + }, + { + "epoch": 1.4134281748785567, + "grad_norm": 1.47457754611969, + "learning_rate": 1.1862777526139848e-05, + "loss": 0.5579, + "step": 8147 + }, + { + "epoch": 1.4136016655100625, + "grad_norm": 0.9126172065734863, + "learning_rate": 1.1856554156414503e-05, + "loss": 0.8008, + "step": 8148 + }, + { + "epoch": 1.4137751561415683, + "grad_norm": 0.6801590919494629, + "learning_rate": 1.1850331731745541e-05, + "loss": 0.8362, + "step": 8149 + }, + { + "epoch": 1.4139486467730742, + "grad_norm": 0.9157345294952393, + "learning_rate": 1.1844110252855072e-05, + "loss": 0.7269, + "step": 8150 + }, + { + "epoch": 1.41412213740458, + "grad_norm": 1.0989285707473755, + "learning_rate": 1.1837889720465117e-05, + "loss": 0.7083, + "step": 8151 + }, + { + "epoch": 1.414295628036086, + "grad_norm": 1.5595669746398926, + "learning_rate": 1.1831670135297564e-05, + "loss": 0.7634, + "step": 8152 + }, + { + "epoch": 1.414469118667592, + "grad_norm": 1.2519500255584717, + "learning_rate": 1.1825451498074209e-05, + "loss": 0.6929, + "step": 8153 + }, + { + "epoch": 1.4146426092990978, + "grad_norm": 1.6829266548156738, + "learning_rate": 1.1819233809516746e-05, + "loss": 0.6689, + "step": 8154 + }, + { + "epoch": 1.4148160999306039, + "grad_norm": 0.8893418908119202, + "learning_rate": 1.181301707034672e-05, + "loss": 0.7179, + "step": 8155 + }, + { + "epoch": 1.4149895905621097, + "grad_norm": 1.6616911888122559, + "learning_rate": 1.1806801281285604e-05, + "loss": 0.8572, + "step": 8156 + }, + { + "epoch": 1.4151630811936156, + "grad_norm": 0.9237269759178162, + "learning_rate": 1.1800586443054754e-05, + "loss": 0.5909, + "step": 8157 + }, + { + "epoch": 1.4153365718251214, + "grad_norm": 0.8154220581054688, + "learning_rate": 1.1794372556375392e-05, + "loss": 0.7637, + "step": 8158 + }, + { + "epoch": 1.4155100624566272, + "grad_norm": 0.7871053814888, + "learning_rate": 1.178815962196866e-05, + "loss": 0.7439, + "step": 8159 + }, + { + "epoch": 1.4156835530881333, + "grad_norm": 2.3836259841918945, + "learning_rate": 1.178194764055557e-05, + "loss": 0.6047, + "step": 8160 + }, + { + "epoch": 1.4158570437196392, + "grad_norm": 1.6293028593063354, + "learning_rate": 1.1775736612857038e-05, + "loss": 0.7526, + "step": 8161 + }, + { + "epoch": 1.416030534351145, + "grad_norm": 1.0038878917694092, + "learning_rate": 1.176952653959385e-05, + "loss": 0.5936, + "step": 8162 + }, + { + "epoch": 1.4162040249826509, + "grad_norm": 0.9935939311981201, + "learning_rate": 1.17633174214867e-05, + "loss": 0.5897, + "step": 8163 + }, + { + "epoch": 1.416377515614157, + "grad_norm": 0.897951066493988, + "learning_rate": 1.1757109259256163e-05, + "loss": 0.6599, + "step": 8164 + }, + { + "epoch": 1.4165510062456628, + "grad_norm": 1.0244419574737549, + "learning_rate": 1.17509020536227e-05, + "loss": 0.5818, + "step": 8165 + }, + { + "epoch": 1.4167244968771686, + "grad_norm": 1.1540290117263794, + "learning_rate": 1.174469580530666e-05, + "loss": 0.7361, + "step": 8166 + }, + { + "epoch": 1.4168979875086745, + "grad_norm": 1.6335035562515259, + "learning_rate": 1.1738490515028303e-05, + "loss": 0.7096, + "step": 8167 + }, + { + "epoch": 1.4170714781401803, + "grad_norm": 1.1316359043121338, + "learning_rate": 1.1732286183507738e-05, + "loss": 0.677, + "step": 8168 + }, + { + "epoch": 1.4172449687716864, + "grad_norm": 0.7788179516792297, + "learning_rate": 1.1726082811464994e-05, + "loss": 0.9363, + "step": 8169 + }, + { + "epoch": 1.4174184594031922, + "grad_norm": 0.7448990941047668, + "learning_rate": 1.1719880399619987e-05, + "loss": 0.8279, + "step": 8170 + }, + { + "epoch": 1.417591950034698, + "grad_norm": 0.999065637588501, + "learning_rate": 1.171367894869251e-05, + "loss": 0.7229, + "step": 8171 + }, + { + "epoch": 1.4177654406662041, + "grad_norm": 0.8131214380264282, + "learning_rate": 1.1707478459402236e-05, + "loss": 0.744, + "step": 8172 + }, + { + "epoch": 1.41793893129771, + "grad_norm": 1.3180747032165527, + "learning_rate": 1.170127893246876e-05, + "loss": 0.5861, + "step": 8173 + }, + { + "epoch": 1.4181124219292158, + "grad_norm": 0.8535788655281067, + "learning_rate": 1.1695080368611526e-05, + "loss": 0.8479, + "step": 8174 + }, + { + "epoch": 1.4182859125607217, + "grad_norm": 0.8913144469261169, + "learning_rate": 1.1688882768549895e-05, + "loss": 0.6074, + "step": 8175 + }, + { + "epoch": 1.4184594031922275, + "grad_norm": 1.146071434020996, + "learning_rate": 1.1682686133003105e-05, + "loss": 0.6748, + "step": 8176 + }, + { + "epoch": 1.4186328938237336, + "grad_norm": 0.8888277411460876, + "learning_rate": 1.1676490462690282e-05, + "loss": 0.6851, + "step": 8177 + }, + { + "epoch": 1.4188063844552394, + "grad_norm": 0.8923730254173279, + "learning_rate": 1.167029575833044e-05, + "loss": 0.6614, + "step": 8178 + }, + { + "epoch": 1.4189798750867453, + "grad_norm": 0.9504383206367493, + "learning_rate": 1.1664102020642472e-05, + "loss": 0.5594, + "step": 8179 + }, + { + "epoch": 1.4191533657182513, + "grad_norm": 0.9361410737037659, + "learning_rate": 1.1657909250345194e-05, + "loss": 0.7361, + "step": 8180 + }, + { + "epoch": 1.4193268563497572, + "grad_norm": 0.9271007180213928, + "learning_rate": 1.1651717448157252e-05, + "loss": 0.6857, + "step": 8181 + }, + { + "epoch": 1.419500346981263, + "grad_norm": 0.8406780958175659, + "learning_rate": 1.1645526614797235e-05, + "loss": 0.6503, + "step": 8182 + }, + { + "epoch": 1.4196738376127689, + "grad_norm": 1.2220560312271118, + "learning_rate": 1.1639336750983593e-05, + "loss": 0.7415, + "step": 8183 + }, + { + "epoch": 1.4198473282442747, + "grad_norm": 0.8340184688568115, + "learning_rate": 1.1633147857434658e-05, + "loss": 0.8042, + "step": 8184 + }, + { + "epoch": 1.4200208188757806, + "grad_norm": 1.1775054931640625, + "learning_rate": 1.162695993486866e-05, + "loss": 0.5768, + "step": 8185 + }, + { + "epoch": 1.4201943095072866, + "grad_norm": 0.9609564542770386, + "learning_rate": 1.1620772984003724e-05, + "loss": 0.6898, + "step": 8186 + }, + { + "epoch": 1.4203678001387925, + "grad_norm": 0.9398984909057617, + "learning_rate": 1.1614587005557847e-05, + "loss": 0.5497, + "step": 8187 + }, + { + "epoch": 1.4205412907702983, + "grad_norm": 0.800710916519165, + "learning_rate": 1.1608402000248908e-05, + "loss": 0.7563, + "step": 8188 + }, + { + "epoch": 1.4207147814018044, + "grad_norm": 0.8001608848571777, + "learning_rate": 1.160221796879471e-05, + "loss": 0.7123, + "step": 8189 + }, + { + "epoch": 1.4208882720333103, + "grad_norm": 0.9361445307731628, + "learning_rate": 1.1596034911912896e-05, + "loss": 0.6627, + "step": 8190 + }, + { + "epoch": 1.421061762664816, + "grad_norm": 0.9159601330757141, + "learning_rate": 1.1589852830321024e-05, + "loss": 0.7391, + "step": 8191 + }, + { + "epoch": 1.421235253296322, + "grad_norm": 0.9172091484069824, + "learning_rate": 1.1583671724736526e-05, + "loss": 0.6377, + "step": 8192 + }, + { + "epoch": 1.4214087439278278, + "grad_norm": 0.9782140851020813, + "learning_rate": 1.1577491595876744e-05, + "loss": 0.7101, + "step": 8193 + }, + { + "epoch": 1.4215822345593339, + "grad_norm": 1.0422441959381104, + "learning_rate": 1.157131244445886e-05, + "loss": 0.5735, + "step": 8194 + }, + { + "epoch": 1.4217557251908397, + "grad_norm": 1.3555887937545776, + "learning_rate": 1.1565134271199999e-05, + "loss": 0.6338, + "step": 8195 + }, + { + "epoch": 1.4219292158223455, + "grad_norm": 1.7500970363616943, + "learning_rate": 1.1558957076817135e-05, + "loss": 0.6924, + "step": 8196 + }, + { + "epoch": 1.4221027064538516, + "grad_norm": 0.8329987525939941, + "learning_rate": 1.1552780862027136e-05, + "loss": 0.715, + "step": 8197 + }, + { + "epoch": 1.4222761970853575, + "grad_norm": 0.9852977395057678, + "learning_rate": 1.1546605627546752e-05, + "loss": 0.7469, + "step": 8198 + }, + { + "epoch": 1.4224496877168633, + "grad_norm": 0.8461137413978577, + "learning_rate": 1.154043137409265e-05, + "loss": 0.7891, + "step": 8199 + }, + { + "epoch": 1.4226231783483692, + "grad_norm": 0.987891435623169, + "learning_rate": 1.1534258102381332e-05, + "loss": 0.575, + "step": 8200 + }, + { + "epoch": 1.422796668979875, + "grad_norm": 0.7943682670593262, + "learning_rate": 1.1528085813129233e-05, + "loss": 0.6106, + "step": 8201 + }, + { + "epoch": 1.4229701596113808, + "grad_norm": 0.9360314011573792, + "learning_rate": 1.1521914507052646e-05, + "loss": 0.5704, + "step": 8202 + }, + { + "epoch": 1.423143650242887, + "grad_norm": 0.7538761496543884, + "learning_rate": 1.151574418486776e-05, + "loss": 0.7507, + "step": 8203 + }, + { + "epoch": 1.4233171408743928, + "grad_norm": 1.319765329360962, + "learning_rate": 1.1509574847290641e-05, + "loss": 0.7544, + "step": 8204 + }, + { + "epoch": 1.4234906315058986, + "grad_norm": 0.737889289855957, + "learning_rate": 1.150340649503726e-05, + "loss": 0.7473, + "step": 8205 + }, + { + "epoch": 1.4236641221374047, + "grad_norm": 1.7748624086380005, + "learning_rate": 1.1497239128823456e-05, + "loss": 0.7665, + "step": 8206 + }, + { + "epoch": 1.4238376127689105, + "grad_norm": 1.096632719039917, + "learning_rate": 1.149107274936496e-05, + "loss": 0.537, + "step": 8207 + }, + { + "epoch": 1.4240111034004164, + "grad_norm": 1.3405345678329468, + "learning_rate": 1.1484907357377378e-05, + "loss": 0.6031, + "step": 8208 + }, + { + "epoch": 1.4241845940319222, + "grad_norm": 1.031005859375, + "learning_rate": 1.1478742953576225e-05, + "loss": 0.7271, + "step": 8209 + }, + { + "epoch": 1.424358084663428, + "grad_norm": 1.0085846185684204, + "learning_rate": 1.1472579538676883e-05, + "loss": 0.7012, + "step": 8210 + }, + { + "epoch": 1.4245315752949341, + "grad_norm": 0.7088702321052551, + "learning_rate": 1.1466417113394624e-05, + "loss": 0.8396, + "step": 8211 + }, + { + "epoch": 1.42470506592644, + "grad_norm": 1.0155975818634033, + "learning_rate": 1.1460255678444598e-05, + "loss": 0.5969, + "step": 8212 + }, + { + "epoch": 1.4248785565579458, + "grad_norm": 0.8602683544158936, + "learning_rate": 1.1454095234541848e-05, + "loss": 0.6173, + "step": 8213 + }, + { + "epoch": 1.4250520471894519, + "grad_norm": 0.8065668940544128, + "learning_rate": 1.144793578240131e-05, + "loss": 0.7686, + "step": 8214 + }, + { + "epoch": 1.4252255378209577, + "grad_norm": 0.7363778352737427, + "learning_rate": 1.1441777322737791e-05, + "loss": 0.5648, + "step": 8215 + }, + { + "epoch": 1.4253990284524636, + "grad_norm": 0.9109671115875244, + "learning_rate": 1.1435619856265982e-05, + "loss": 0.6039, + "step": 8216 + }, + { + "epoch": 1.4255725190839694, + "grad_norm": 1.002806544303894, + "learning_rate": 1.1429463383700465e-05, + "loss": 0.5972, + "step": 8217 + }, + { + "epoch": 1.4257460097154753, + "grad_norm": 0.6833398342132568, + "learning_rate": 1.1423307905755714e-05, + "loss": 0.8525, + "step": 8218 + }, + { + "epoch": 1.4259195003469813, + "grad_norm": 0.7292102575302124, + "learning_rate": 1.1417153423146074e-05, + "loss": 0.8126, + "step": 8219 + }, + { + "epoch": 1.4260929909784872, + "grad_norm": 1.9132691621780396, + "learning_rate": 1.1410999936585782e-05, + "loss": 0.5421, + "step": 8220 + }, + { + "epoch": 1.426266481609993, + "grad_norm": 1.780884861946106, + "learning_rate": 1.1404847446788948e-05, + "loss": 0.5903, + "step": 8221 + }, + { + "epoch": 1.4264399722414989, + "grad_norm": 0.8338730931282043, + "learning_rate": 1.1398695954469598e-05, + "loss": 0.7604, + "step": 8222 + }, + { + "epoch": 1.426613462873005, + "grad_norm": 0.917503297328949, + "learning_rate": 1.1392545460341588e-05, + "loss": 0.6379, + "step": 8223 + }, + { + "epoch": 1.4267869535045108, + "grad_norm": 0.9174084067344666, + "learning_rate": 1.1386395965118715e-05, + "loss": 0.6239, + "step": 8224 + }, + { + "epoch": 1.4269604441360166, + "grad_norm": 0.9609774947166443, + "learning_rate": 1.1380247469514628e-05, + "loss": 0.6443, + "step": 8225 + }, + { + "epoch": 1.4271339347675225, + "grad_norm": 0.951890766620636, + "learning_rate": 1.1374099974242867e-05, + "loss": 0.6281, + "step": 8226 + }, + { + "epoch": 1.4273074253990283, + "grad_norm": 0.918853223323822, + "learning_rate": 1.1367953480016848e-05, + "loss": 0.7524, + "step": 8227 + }, + { + "epoch": 1.4274809160305344, + "grad_norm": 0.9370686411857605, + "learning_rate": 1.13618079875499e-05, + "loss": 0.719, + "step": 8228 + }, + { + "epoch": 1.4276544066620402, + "grad_norm": 0.9790075421333313, + "learning_rate": 1.1355663497555186e-05, + "loss": 0.6631, + "step": 8229 + }, + { + "epoch": 1.427827897293546, + "grad_norm": 0.9799947142601013, + "learning_rate": 1.1349520010745802e-05, + "loss": 0.6492, + "step": 8230 + }, + { + "epoch": 1.4280013879250522, + "grad_norm": 0.870806872844696, + "learning_rate": 1.1343377527834703e-05, + "loss": 0.8281, + "step": 8231 + }, + { + "epoch": 1.428174878556558, + "grad_norm": 0.8370445966720581, + "learning_rate": 1.1337236049534726e-05, + "loss": 0.6846, + "step": 8232 + }, + { + "epoch": 1.4283483691880638, + "grad_norm": 1.4423253536224365, + "learning_rate": 1.133109557655859e-05, + "loss": 0.6626, + "step": 8233 + }, + { + "epoch": 1.4285218598195697, + "grad_norm": 0.9715951681137085, + "learning_rate": 1.1324956109618927e-05, + "loss": 0.7429, + "step": 8234 + }, + { + "epoch": 1.4286953504510755, + "grad_norm": 0.9846787452697754, + "learning_rate": 1.1318817649428213e-05, + "loss": 0.5867, + "step": 8235 + }, + { + "epoch": 1.4288688410825816, + "grad_norm": 1.0472121238708496, + "learning_rate": 1.1312680196698817e-05, + "loss": 0.697, + "step": 8236 + }, + { + "epoch": 1.4290423317140875, + "grad_norm": 1.2708057165145874, + "learning_rate": 1.1306543752143018e-05, + "loss": 0.7422, + "step": 8237 + }, + { + "epoch": 1.4292158223455933, + "grad_norm": 0.8832883238792419, + "learning_rate": 1.1300408316472944e-05, + "loss": 0.7504, + "step": 8238 + }, + { + "epoch": 1.4293893129770994, + "grad_norm": 0.9006069302558899, + "learning_rate": 1.1294273890400625e-05, + "loss": 0.6479, + "step": 8239 + }, + { + "epoch": 1.4295628036086052, + "grad_norm": 0.8571210503578186, + "learning_rate": 1.1288140474637953e-05, + "loss": 0.7258, + "step": 8240 + }, + { + "epoch": 1.429736294240111, + "grad_norm": 0.8362399935722351, + "learning_rate": 1.1282008069896749e-05, + "loss": 0.7134, + "step": 8241 + }, + { + "epoch": 1.429909784871617, + "grad_norm": 1.0734233856201172, + "learning_rate": 1.127587667688865e-05, + "loss": 0.7, + "step": 8242 + }, + { + "epoch": 1.4300832755031228, + "grad_norm": 1.0220903158187866, + "learning_rate": 1.1269746296325236e-05, + "loss": 0.5939, + "step": 8243 + }, + { + "epoch": 1.4302567661346286, + "grad_norm": 0.9192237854003906, + "learning_rate": 1.126361692891794e-05, + "loss": 0.7527, + "step": 8244 + }, + { + "epoch": 1.4304302567661347, + "grad_norm": 0.9567228555679321, + "learning_rate": 1.1257488575378077e-05, + "loss": 0.7898, + "step": 8245 + }, + { + "epoch": 1.4306037473976405, + "grad_norm": 0.8961965441703796, + "learning_rate": 1.1251361236416845e-05, + "loss": 0.6458, + "step": 8246 + }, + { + "epoch": 1.4307772380291464, + "grad_norm": 0.9105427861213684, + "learning_rate": 1.124523491274534e-05, + "loss": 0.6617, + "step": 8247 + }, + { + "epoch": 1.4309507286606524, + "grad_norm": 0.8918143510818481, + "learning_rate": 1.1239109605074527e-05, + "loss": 0.6093, + "step": 8248 + }, + { + "epoch": 1.4311242192921583, + "grad_norm": 0.796769380569458, + "learning_rate": 1.1232985314115255e-05, + "loss": 0.7319, + "step": 8249 + }, + { + "epoch": 1.4312977099236641, + "grad_norm": 1.848008155822754, + "learning_rate": 1.1226862040578244e-05, + "loss": 0.6975, + "step": 8250 + }, + { + "epoch": 1.43147120055517, + "grad_norm": 1.3447892665863037, + "learning_rate": 1.1220739785174129e-05, + "loss": 0.7637, + "step": 8251 + }, + { + "epoch": 1.4316446911866758, + "grad_norm": 0.9727442860603333, + "learning_rate": 1.1214618548613379e-05, + "loss": 0.6024, + "step": 8252 + }, + { + "epoch": 1.4318181818181819, + "grad_norm": 2.1161611080169678, + "learning_rate": 1.120849833160639e-05, + "loss": 0.834, + "step": 8253 + }, + { + "epoch": 1.4319916724496877, + "grad_norm": 1.8288322687149048, + "learning_rate": 1.1202379134863412e-05, + "loss": 0.678, + "step": 8254 + }, + { + "epoch": 1.4321651630811936, + "grad_norm": 0.8958083987236023, + "learning_rate": 1.1196260959094588e-05, + "loss": 0.679, + "step": 8255 + }, + { + "epoch": 1.4323386537126996, + "grad_norm": 0.832627534866333, + "learning_rate": 1.1190143805009934e-05, + "loss": 0.7906, + "step": 8256 + }, + { + "epoch": 1.4325121443442055, + "grad_norm": 0.9608588814735413, + "learning_rate": 1.118402767331936e-05, + "loss": 0.6016, + "step": 8257 + }, + { + "epoch": 1.4326856349757113, + "grad_norm": 1.3488579988479614, + "learning_rate": 1.117791256473265e-05, + "loss": 0.678, + "step": 8258 + }, + { + "epoch": 1.4328591256072172, + "grad_norm": 0.7823948860168457, + "learning_rate": 1.1171798479959466e-05, + "loss": 0.7402, + "step": 8259 + }, + { + "epoch": 1.433032616238723, + "grad_norm": 1.046374797821045, + "learning_rate": 1.1165685419709353e-05, + "loss": 0.6285, + "step": 8260 + }, + { + "epoch": 1.433206106870229, + "grad_norm": 0.7331840991973877, + "learning_rate": 1.1159573384691736e-05, + "loss": 0.7189, + "step": 8261 + }, + { + "epoch": 1.433379597501735, + "grad_norm": 1.265857458114624, + "learning_rate": 1.1153462375615934e-05, + "loss": 0.7195, + "step": 8262 + }, + { + "epoch": 1.4335530881332408, + "grad_norm": 0.7981159090995789, + "learning_rate": 1.1147352393191135e-05, + "loss": 0.7316, + "step": 8263 + }, + { + "epoch": 1.4337265787647466, + "grad_norm": 0.992882490158081, + "learning_rate": 1.1141243438126403e-05, + "loss": 0.6749, + "step": 8264 + }, + { + "epoch": 1.4339000693962527, + "grad_norm": 0.6876752972602844, + "learning_rate": 1.1135135511130685e-05, + "loss": 0.6196, + "step": 8265 + }, + { + "epoch": 1.4340735600277585, + "grad_norm": 0.8812111616134644, + "learning_rate": 1.1129028612912832e-05, + "loss": 0.712, + "step": 8266 + }, + { + "epoch": 1.4342470506592644, + "grad_norm": 0.9719088077545166, + "learning_rate": 1.1122922744181541e-05, + "loss": 0.6401, + "step": 8267 + }, + { + "epoch": 1.4344205412907702, + "grad_norm": 0.8841831684112549, + "learning_rate": 1.1116817905645411e-05, + "loss": 0.6893, + "step": 8268 + }, + { + "epoch": 1.434594031922276, + "grad_norm": 0.7794767022132874, + "learning_rate": 1.1110714098012907e-05, + "loss": 0.6466, + "step": 8269 + }, + { + "epoch": 1.4347675225537821, + "grad_norm": 0.9668713212013245, + "learning_rate": 1.1104611321992404e-05, + "loss": 0.8547, + "step": 8270 + }, + { + "epoch": 1.434941013185288, + "grad_norm": 0.942042350769043, + "learning_rate": 1.1098509578292109e-05, + "loss": 0.7214, + "step": 8271 + }, + { + "epoch": 1.4351145038167938, + "grad_norm": 0.7979863882064819, + "learning_rate": 1.1092408867620155e-05, + "loss": 0.6897, + "step": 8272 + }, + { + "epoch": 1.4352879944483, + "grad_norm": 1.037564754486084, + "learning_rate": 1.1086309190684532e-05, + "loss": 0.5509, + "step": 8273 + }, + { + "epoch": 1.4354614850798058, + "grad_norm": 1.2324150800704956, + "learning_rate": 1.1080210548193113e-05, + "loss": 0.6825, + "step": 8274 + }, + { + "epoch": 1.4356349757113116, + "grad_norm": 1.0929065942764282, + "learning_rate": 1.107411294085365e-05, + "loss": 0.5413, + "step": 8275 + }, + { + "epoch": 1.4358084663428174, + "grad_norm": 0.8611690402030945, + "learning_rate": 1.1068016369373784e-05, + "loss": 0.5645, + "step": 8276 + }, + { + "epoch": 1.4359819569743233, + "grad_norm": 0.946142315864563, + "learning_rate": 1.1061920834461026e-05, + "loss": 0.7388, + "step": 8277 + }, + { + "epoch": 1.4361554476058294, + "grad_norm": 0.9910246133804321, + "learning_rate": 1.1055826336822775e-05, + "loss": 0.6093, + "step": 8278 + }, + { + "epoch": 1.4363289382373352, + "grad_norm": 1.005667805671692, + "learning_rate": 1.1049732877166286e-05, + "loss": 0.6277, + "step": 8279 + }, + { + "epoch": 1.436502428868841, + "grad_norm": 0.7766329050064087, + "learning_rate": 1.1043640456198745e-05, + "loss": 0.664, + "step": 8280 + }, + { + "epoch": 1.436675919500347, + "grad_norm": 1.1204898357391357, + "learning_rate": 1.103754907462715e-05, + "loss": 0.7163, + "step": 8281 + }, + { + "epoch": 1.436849410131853, + "grad_norm": 2.7592949867248535, + "learning_rate": 1.1031458733158434e-05, + "loss": 0.7358, + "step": 8282 + }, + { + "epoch": 1.4370229007633588, + "grad_norm": 0.7331947684288025, + "learning_rate": 1.1025369432499385e-05, + "loss": 0.7137, + "step": 8283 + }, + { + "epoch": 1.4371963913948647, + "grad_norm": 0.9395007491111755, + "learning_rate": 1.101928117335666e-05, + "loss": 0.6162, + "step": 8284 + }, + { + "epoch": 1.4373698820263705, + "grad_norm": 0.855191171169281, + "learning_rate": 1.101319395643683e-05, + "loss": 0.689, + "step": 8285 + }, + { + "epoch": 1.4375433726578764, + "grad_norm": 0.8747117519378662, + "learning_rate": 1.100710778244631e-05, + "loss": 0.6493, + "step": 8286 + }, + { + "epoch": 1.4377168632893824, + "grad_norm": 0.8851883411407471, + "learning_rate": 1.1001022652091413e-05, + "loss": 0.864, + "step": 8287 + }, + { + "epoch": 1.4378903539208883, + "grad_norm": 1.2516107559204102, + "learning_rate": 1.0994938566078315e-05, + "loss": 0.5909, + "step": 8288 + }, + { + "epoch": 1.438063844552394, + "grad_norm": 1.1174479722976685, + "learning_rate": 1.0988855525113096e-05, + "loss": 0.5719, + "step": 8289 + }, + { + "epoch": 1.4382373351839002, + "grad_norm": 1.5790598392486572, + "learning_rate": 1.0982773529901696e-05, + "loss": 0.7251, + "step": 8290 + }, + { + "epoch": 1.438410825815406, + "grad_norm": 1.2860158681869507, + "learning_rate": 1.0976692581149933e-05, + "loss": 0.5685, + "step": 8291 + }, + { + "epoch": 1.4385843164469119, + "grad_norm": 0.755673885345459, + "learning_rate": 1.0970612679563501e-05, + "loss": 0.6577, + "step": 8292 + }, + { + "epoch": 1.4387578070784177, + "grad_norm": 0.8757845759391785, + "learning_rate": 1.0964533825848008e-05, + "loss": 0.7263, + "step": 8293 + }, + { + "epoch": 1.4389312977099236, + "grad_norm": 0.999706506729126, + "learning_rate": 1.0958456020708875e-05, + "loss": 0.719, + "step": 8294 + }, + { + "epoch": 1.4391047883414296, + "grad_norm": 0.9387538433074951, + "learning_rate": 1.0952379264851464e-05, + "loss": 0.6783, + "step": 8295 + }, + { + "epoch": 1.4392782789729355, + "grad_norm": 0.7980362176895142, + "learning_rate": 1.0946303558980981e-05, + "loss": 0.7312, + "step": 8296 + }, + { + "epoch": 1.4394517696044413, + "grad_norm": 1.0636227130889893, + "learning_rate": 1.094022890380252e-05, + "loss": 0.5896, + "step": 8297 + }, + { + "epoch": 1.4396252602359474, + "grad_norm": 1.0003043413162231, + "learning_rate": 1.0934155300021048e-05, + "loss": 0.686, + "step": 8298 + }, + { + "epoch": 1.4397987508674532, + "grad_norm": 1.4416046142578125, + "learning_rate": 1.0928082748341429e-05, + "loss": 0.603, + "step": 8299 + }, + { + "epoch": 1.439972241498959, + "grad_norm": 1.1569799184799194, + "learning_rate": 1.0922011249468362e-05, + "loss": 0.7524, + "step": 8300 + }, + { + "epoch": 1.440145732130465, + "grad_norm": 0.7506779432296753, + "learning_rate": 1.0915940804106477e-05, + "loss": 0.7542, + "step": 8301 + }, + { + "epoch": 1.4403192227619708, + "grad_norm": 1.0286344289779663, + "learning_rate": 1.0909871412960245e-05, + "loss": 0.7937, + "step": 8302 + }, + { + "epoch": 1.4404927133934766, + "grad_norm": 1.3450305461883545, + "learning_rate": 1.090380307673403e-05, + "loss": 0.6205, + "step": 8303 + }, + { + "epoch": 1.4406662040249827, + "grad_norm": 0.8232612609863281, + "learning_rate": 1.0897735796132056e-05, + "loss": 0.7043, + "step": 8304 + }, + { + "epoch": 1.4408396946564885, + "grad_norm": 0.971222996711731, + "learning_rate": 1.0891669571858458e-05, + "loss": 0.7876, + "step": 8305 + }, + { + "epoch": 1.4410131852879944, + "grad_norm": 0.8925091624259949, + "learning_rate": 1.0885604404617221e-05, + "loss": 0.6165, + "step": 8306 + }, + { + "epoch": 1.4411866759195004, + "grad_norm": 1.3998011350631714, + "learning_rate": 1.0879540295112212e-05, + "loss": 0.6608, + "step": 8307 + }, + { + "epoch": 1.4413601665510063, + "grad_norm": 1.266869068145752, + "learning_rate": 1.087347724404717e-05, + "loss": 0.6605, + "step": 8308 + }, + { + "epoch": 1.4415336571825121, + "grad_norm": 0.8374751806259155, + "learning_rate": 1.086741525212574e-05, + "loss": 0.6769, + "step": 8309 + }, + { + "epoch": 1.441707147814018, + "grad_norm": 1.3684556484222412, + "learning_rate": 1.086135432005141e-05, + "loss": 0.7529, + "step": 8310 + }, + { + "epoch": 1.4418806384455238, + "grad_norm": 1.6483705043792725, + "learning_rate": 1.0855294448527548e-05, + "loss": 0.6566, + "step": 8311 + }, + { + "epoch": 1.44205412907703, + "grad_norm": 1.3396052122116089, + "learning_rate": 1.0849235638257442e-05, + "loss": 0.6956, + "step": 8312 + }, + { + "epoch": 1.4422276197085357, + "grad_norm": 0.8740501403808594, + "learning_rate": 1.0843177889944187e-05, + "loss": 0.7288, + "step": 8313 + }, + { + "epoch": 1.4424011103400416, + "grad_norm": 1.0966017246246338, + "learning_rate": 1.0837121204290812e-05, + "loss": 0.567, + "step": 8314 + }, + { + "epoch": 1.4425746009715477, + "grad_norm": 0.844046950340271, + "learning_rate": 1.08310655820002e-05, + "loss": 0.8169, + "step": 8315 + }, + { + "epoch": 1.4427480916030535, + "grad_norm": 0.8962596654891968, + "learning_rate": 1.0825011023775111e-05, + "loss": 0.7822, + "step": 8316 + }, + { + "epoch": 1.4429215822345594, + "grad_norm": 1.2069220542907715, + "learning_rate": 1.0818957530318177e-05, + "loss": 0.6866, + "step": 8317 + }, + { + "epoch": 1.4430950728660652, + "grad_norm": 1.0831001996994019, + "learning_rate": 1.0812905102331927e-05, + "loss": 0.5458, + "step": 8318 + }, + { + "epoch": 1.443268563497571, + "grad_norm": 0.9876552224159241, + "learning_rate": 1.0806853740518743e-05, + "loss": 0.6627, + "step": 8319 + }, + { + "epoch": 1.4434420541290771, + "grad_norm": 1.0128096342086792, + "learning_rate": 1.0800803445580896e-05, + "loss": 0.6285, + "step": 8320 + }, + { + "epoch": 1.443615544760583, + "grad_norm": 0.8289034366607666, + "learning_rate": 1.079475421822052e-05, + "loss": 0.7034, + "step": 8321 + }, + { + "epoch": 1.4437890353920888, + "grad_norm": 0.788754940032959, + "learning_rate": 1.078870605913966e-05, + "loss": 0.6189, + "step": 8322 + }, + { + "epoch": 1.4439625260235947, + "grad_norm": 0.8486101627349854, + "learning_rate": 1.0782658969040179e-05, + "loss": 0.8752, + "step": 8323 + }, + { + "epoch": 1.4441360166551007, + "grad_norm": 1.0304034948349, + "learning_rate": 1.0776612948623874e-05, + "loss": 0.5201, + "step": 8324 + }, + { + "epoch": 1.4443095072866066, + "grad_norm": 1.165314793586731, + "learning_rate": 1.0770567998592383e-05, + "loss": 0.5883, + "step": 8325 + }, + { + "epoch": 1.4444829979181124, + "grad_norm": 0.8483907580375671, + "learning_rate": 1.0764524119647228e-05, + "loss": 0.6814, + "step": 8326 + }, + { + "epoch": 1.4446564885496183, + "grad_norm": 1.4149788618087769, + "learning_rate": 1.0758481312489804e-05, + "loss": 0.5852, + "step": 8327 + }, + { + "epoch": 1.444829979181124, + "grad_norm": 1.2425750494003296, + "learning_rate": 1.0752439577821398e-05, + "loss": 0.7058, + "step": 8328 + }, + { + "epoch": 1.4450034698126302, + "grad_norm": 1.1051685810089111, + "learning_rate": 1.0746398916343158e-05, + "loss": 0.5985, + "step": 8329 + }, + { + "epoch": 1.445176960444136, + "grad_norm": 0.8199599385261536, + "learning_rate": 1.0740359328756105e-05, + "loss": 0.6453, + "step": 8330 + }, + { + "epoch": 1.4453504510756419, + "grad_norm": 0.777734637260437, + "learning_rate": 1.0734320815761143e-05, + "loss": 0.7965, + "step": 8331 + }, + { + "epoch": 1.445523941707148, + "grad_norm": 2.0338516235351562, + "learning_rate": 1.0728283378059036e-05, + "loss": 0.7461, + "step": 8332 + }, + { + "epoch": 1.4456974323386538, + "grad_norm": 0.9562954902648926, + "learning_rate": 1.0722247016350456e-05, + "loss": 0.7072, + "step": 8333 + }, + { + "epoch": 1.4458709229701596, + "grad_norm": 0.9932342767715454, + "learning_rate": 1.0716211731335922e-05, + "loss": 0.6935, + "step": 8334 + }, + { + "epoch": 1.4460444136016655, + "grad_norm": 0.8846539855003357, + "learning_rate": 1.0710177523715833e-05, + "loss": 0.6318, + "step": 8335 + }, + { + "epoch": 1.4462179042331713, + "grad_norm": 0.821926474571228, + "learning_rate": 1.0704144394190458e-05, + "loss": 0.6222, + "step": 8336 + }, + { + "epoch": 1.4463913948646774, + "grad_norm": 0.9306650161743164, + "learning_rate": 1.0698112343459967e-05, + "loss": 0.5551, + "step": 8337 + }, + { + "epoch": 1.4465648854961832, + "grad_norm": 0.9521759152412415, + "learning_rate": 1.0692081372224378e-05, + "loss": 0.7339, + "step": 8338 + }, + { + "epoch": 1.446738376127689, + "grad_norm": 1.2749427556991577, + "learning_rate": 1.0686051481183593e-05, + "loss": 0.5826, + "step": 8339 + }, + { + "epoch": 1.4469118667591951, + "grad_norm": 0.8372285962104797, + "learning_rate": 1.0680022671037376e-05, + "loss": 0.7668, + "step": 8340 + }, + { + "epoch": 1.447085357390701, + "grad_norm": 0.9472049474716187, + "learning_rate": 1.0673994942485404e-05, + "loss": 0.7219, + "step": 8341 + }, + { + "epoch": 1.4472588480222068, + "grad_norm": 0.6972798109054565, + "learning_rate": 1.0667968296227169e-05, + "loss": 0.8206, + "step": 8342 + }, + { + "epoch": 1.4474323386537127, + "grad_norm": 1.2551277875900269, + "learning_rate": 1.0661942732962096e-05, + "loss": 0.8188, + "step": 8343 + }, + { + "epoch": 1.4476058292852185, + "grad_norm": 0.9840268492698669, + "learning_rate": 1.0655918253389452e-05, + "loss": 0.6707, + "step": 8344 + }, + { + "epoch": 1.4477793199167244, + "grad_norm": 0.9034901261329651, + "learning_rate": 1.0649894858208381e-05, + "loss": 0.7236, + "step": 8345 + }, + { + "epoch": 1.4479528105482304, + "grad_norm": 1.5338900089263916, + "learning_rate": 1.06438725481179e-05, + "loss": 0.6324, + "step": 8346 + }, + { + "epoch": 1.4481263011797363, + "grad_norm": 1.2235087156295776, + "learning_rate": 1.0637851323816918e-05, + "loss": 0.6458, + "step": 8347 + }, + { + "epoch": 1.4482997918112421, + "grad_norm": 0.8968767523765564, + "learning_rate": 1.06318311860042e-05, + "loss": 0.6732, + "step": 8348 + }, + { + "epoch": 1.4484732824427482, + "grad_norm": 0.8165450096130371, + "learning_rate": 1.0625812135378388e-05, + "loss": 0.7271, + "step": 8349 + }, + { + "epoch": 1.448646773074254, + "grad_norm": 0.8977956771850586, + "learning_rate": 1.0619794172637995e-05, + "loss": 0.7629, + "step": 8350 + }, + { + "epoch": 1.44882026370576, + "grad_norm": 1.0379178524017334, + "learning_rate": 1.0613777298481431e-05, + "loss": 0.7136, + "step": 8351 + }, + { + "epoch": 1.4489937543372657, + "grad_norm": 1.024416208267212, + "learning_rate": 1.0607761513606935e-05, + "loss": 0.7015, + "step": 8352 + }, + { + "epoch": 1.4491672449687716, + "grad_norm": 0.7644994854927063, + "learning_rate": 1.0601746818712666e-05, + "loss": 0.8533, + "step": 8353 + }, + { + "epoch": 1.4493407356002777, + "grad_norm": 0.9009976387023926, + "learning_rate": 1.0595733214496633e-05, + "loss": 0.6512, + "step": 8354 + }, + { + "epoch": 1.4495142262317835, + "grad_norm": 0.6559053659439087, + "learning_rate": 1.0589720701656714e-05, + "loss": 0.7487, + "step": 8355 + }, + { + "epoch": 1.4496877168632893, + "grad_norm": 1.8236305713653564, + "learning_rate": 1.0583709280890668e-05, + "loss": 0.798, + "step": 8356 + }, + { + "epoch": 1.4498612074947954, + "grad_norm": 0.9773489832878113, + "learning_rate": 1.057769895289614e-05, + "loss": 0.5682, + "step": 8357 + }, + { + "epoch": 1.4500346981263013, + "grad_norm": 0.7722461819648743, + "learning_rate": 1.0571689718370629e-05, + "loss": 0.7854, + "step": 8358 + }, + { + "epoch": 1.450208188757807, + "grad_norm": 0.9951772093772888, + "learning_rate": 1.05656815780115e-05, + "loss": 0.8303, + "step": 8359 + }, + { + "epoch": 1.450381679389313, + "grad_norm": 0.849237322807312, + "learning_rate": 1.0559674532516033e-05, + "loss": 0.6777, + "step": 8360 + }, + { + "epoch": 1.4505551700208188, + "grad_norm": 1.115174412727356, + "learning_rate": 1.0553668582581324e-05, + "loss": 0.5675, + "step": 8361 + }, + { + "epoch": 1.4507286606523246, + "grad_norm": 0.997922956943512, + "learning_rate": 1.0547663728904392e-05, + "loss": 0.5786, + "step": 8362 + }, + { + "epoch": 1.4509021512838307, + "grad_norm": 0.8883750438690186, + "learning_rate": 1.0541659972182088e-05, + "loss": 0.7551, + "step": 8363 + }, + { + "epoch": 1.4510756419153366, + "grad_norm": 10.320053100585938, + "learning_rate": 1.0535657313111183e-05, + "loss": 0.8286, + "step": 8364 + }, + { + "epoch": 1.4512491325468424, + "grad_norm": 1.0095723867416382, + "learning_rate": 1.0529655752388254e-05, + "loss": 0.6696, + "step": 8365 + }, + { + "epoch": 1.4514226231783485, + "grad_norm": 1.0852502584457397, + "learning_rate": 1.0523655290709825e-05, + "loss": 0.7603, + "step": 8366 + }, + { + "epoch": 1.4515961138098543, + "grad_norm": 0.756560206413269, + "learning_rate": 1.051765592877224e-05, + "loss": 0.5844, + "step": 8367 + }, + { + "epoch": 1.4517696044413602, + "grad_norm": 0.7062219381332397, + "learning_rate": 1.0511657667271731e-05, + "loss": 0.7651, + "step": 8368 + }, + { + "epoch": 1.451943095072866, + "grad_norm": 0.6771591305732727, + "learning_rate": 1.0505660506904398e-05, + "loss": 0.8306, + "step": 8369 + }, + { + "epoch": 1.4521165857043719, + "grad_norm": 0.8277074694633484, + "learning_rate": 1.0499664448366245e-05, + "loss": 0.7671, + "step": 8370 + }, + { + "epoch": 1.452290076335878, + "grad_norm": 1.5071462392807007, + "learning_rate": 1.0493669492353082e-05, + "loss": 0.5588, + "step": 8371 + }, + { + "epoch": 1.4524635669673838, + "grad_norm": 0.6597122550010681, + "learning_rate": 1.0487675639560664e-05, + "loss": 0.8113, + "step": 8372 + }, + { + "epoch": 1.4526370575988896, + "grad_norm": 0.8499297499656677, + "learning_rate": 1.048168289068457e-05, + "loss": 0.5797, + "step": 8373 + }, + { + "epoch": 1.4528105482303957, + "grad_norm": 1.1765002012252808, + "learning_rate": 1.0475691246420267e-05, + "loss": 0.5735, + "step": 8374 + }, + { + "epoch": 1.4529840388619015, + "grad_norm": 0.8222528100013733, + "learning_rate": 1.0469700707463089e-05, + "loss": 0.6975, + "step": 8375 + }, + { + "epoch": 1.4531575294934074, + "grad_norm": 1.1415014266967773, + "learning_rate": 1.0463711274508253e-05, + "loss": 0.6495, + "step": 8376 + }, + { + "epoch": 1.4533310201249132, + "grad_norm": 1.196657657623291, + "learning_rate": 1.0457722948250837e-05, + "loss": 0.6588, + "step": 8377 + }, + { + "epoch": 1.453504510756419, + "grad_norm": 0.8647920489311218, + "learning_rate": 1.045173572938579e-05, + "loss": 0.7168, + "step": 8378 + }, + { + "epoch": 1.4536780013879251, + "grad_norm": 1.4990001916885376, + "learning_rate": 1.0445749618607932e-05, + "loss": 0.8187, + "step": 8379 + }, + { + "epoch": 1.453851492019431, + "grad_norm": 1.1049847602844238, + "learning_rate": 1.0439764616611972e-05, + "loss": 0.6702, + "step": 8380 + }, + { + "epoch": 1.4540249826509368, + "grad_norm": 1.4034414291381836, + "learning_rate": 1.0433780724092466e-05, + "loss": 0.6626, + "step": 8381 + }, + { + "epoch": 1.4541984732824427, + "grad_norm": 2.891657829284668, + "learning_rate": 1.0427797941743854e-05, + "loss": 0.5734, + "step": 8382 + }, + { + "epoch": 1.4543719639139487, + "grad_norm": 1.0076673030853271, + "learning_rate": 1.0421816270260447e-05, + "loss": 0.5383, + "step": 8383 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.104081392288208, + "learning_rate": 1.041583571033641e-05, + "loss": 0.5651, + "step": 8384 + }, + { + "epoch": 1.4547189451769604, + "grad_norm": 1.1450533866882324, + "learning_rate": 1.0409856262665818e-05, + "loss": 0.7655, + "step": 8385 + }, + { + "epoch": 1.4548924358084663, + "grad_norm": 1.505397915840149, + "learning_rate": 1.0403877927942582e-05, + "loss": 0.8257, + "step": 8386 + }, + { + "epoch": 1.4550659264399721, + "grad_norm": 0.8133670687675476, + "learning_rate": 1.0397900706860493e-05, + "loss": 0.7036, + "step": 8387 + }, + { + "epoch": 1.4552394170714782, + "grad_norm": 1.2931723594665527, + "learning_rate": 1.0391924600113211e-05, + "loss": 0.6733, + "step": 8388 + }, + { + "epoch": 1.455412907702984, + "grad_norm": 1.0534595251083374, + "learning_rate": 1.0385949608394283e-05, + "loss": 0.6019, + "step": 8389 + }, + { + "epoch": 1.4555863983344899, + "grad_norm": 0.8554640412330627, + "learning_rate": 1.0379975732397096e-05, + "loss": 0.6311, + "step": 8390 + }, + { + "epoch": 1.455759888965996, + "grad_norm": 0.9273337125778198, + "learning_rate": 1.0374002972814941e-05, + "loss": 0.5845, + "step": 8391 + }, + { + "epoch": 1.4559333795975018, + "grad_norm": 0.9922353029251099, + "learning_rate": 1.0368031330340948e-05, + "loss": 0.6956, + "step": 8392 + }, + { + "epoch": 1.4561068702290076, + "grad_norm": 0.7446044087409973, + "learning_rate": 1.0362060805668162e-05, + "loss": 0.7719, + "step": 8393 + }, + { + "epoch": 1.4562803608605135, + "grad_norm": 0.7604733109474182, + "learning_rate": 1.0356091399489431e-05, + "loss": 0.6294, + "step": 8394 + }, + { + "epoch": 1.4564538514920193, + "grad_norm": 0.9189860224723816, + "learning_rate": 1.0350123112497541e-05, + "loss": 0.6244, + "step": 8395 + }, + { + "epoch": 1.4566273421235254, + "grad_norm": 0.8261826038360596, + "learning_rate": 1.0344155945385106e-05, + "loss": 0.7145, + "step": 8396 + }, + { + "epoch": 1.4568008327550312, + "grad_norm": 0.8415585160255432, + "learning_rate": 1.0338189898844626e-05, + "loss": 0.6874, + "step": 8397 + }, + { + "epoch": 1.456974323386537, + "grad_norm": 0.9195595979690552, + "learning_rate": 1.0332224973568458e-05, + "loss": 0.6371, + "step": 8398 + }, + { + "epoch": 1.4571478140180432, + "grad_norm": 0.7520208358764648, + "learning_rate": 1.0326261170248863e-05, + "loss": 0.7581, + "step": 8399 + }, + { + "epoch": 1.457321304649549, + "grad_norm": 1.0782853364944458, + "learning_rate": 1.0320298489577913e-05, + "loss": 0.7085, + "step": 8400 + }, + { + "epoch": 1.4574947952810549, + "grad_norm": 0.7850427031517029, + "learning_rate": 1.0314336932247615e-05, + "loss": 0.7201, + "step": 8401 + }, + { + "epoch": 1.4576682859125607, + "grad_norm": 0.9978787899017334, + "learning_rate": 1.03083764989498e-05, + "loss": 0.6719, + "step": 8402 + }, + { + "epoch": 1.4578417765440665, + "grad_norm": 1.2042200565338135, + "learning_rate": 1.0302417190376184e-05, + "loss": 0.6355, + "step": 8403 + }, + { + "epoch": 1.4580152671755724, + "grad_norm": 0.8262830376625061, + "learning_rate": 1.0296459007218345e-05, + "loss": 0.6963, + "step": 8404 + }, + { + "epoch": 1.4581887578070785, + "grad_norm": 0.9790987372398376, + "learning_rate": 1.0290501950167755e-05, + "loss": 0.5503, + "step": 8405 + }, + { + "epoch": 1.4583622484385843, + "grad_norm": 1.2007272243499756, + "learning_rate": 1.0284546019915727e-05, + "loss": 0.5913, + "step": 8406 + }, + { + "epoch": 1.4585357390700902, + "grad_norm": 1.1359059810638428, + "learning_rate": 1.0278591217153444e-05, + "loss": 0.7488, + "step": 8407 + }, + { + "epoch": 1.4587092297015962, + "grad_norm": 0.8492912650108337, + "learning_rate": 1.0272637542571988e-05, + "loss": 0.7861, + "step": 8408 + }, + { + "epoch": 1.458882720333102, + "grad_norm": 1.0982236862182617, + "learning_rate": 1.026668499686228e-05, + "loss": 0.6365, + "step": 8409 + }, + { + "epoch": 1.459056210964608, + "grad_norm": 0.8532607555389404, + "learning_rate": 1.026073358071512e-05, + "loss": 0.818, + "step": 8410 + }, + { + "epoch": 1.4592297015961138, + "grad_norm": 1.0151493549346924, + "learning_rate": 1.0254783294821166e-05, + "loss": 0.5675, + "step": 8411 + }, + { + "epoch": 1.4594031922276196, + "grad_norm": 0.6919994354248047, + "learning_rate": 1.0248834139870985e-05, + "loss": 0.7117, + "step": 8412 + }, + { + "epoch": 1.4595766828591257, + "grad_norm": 0.7110797762870789, + "learning_rate": 1.0242886116554947e-05, + "loss": 0.8125, + "step": 8413 + }, + { + "epoch": 1.4597501734906315, + "grad_norm": 1.2104823589324951, + "learning_rate": 1.0236939225563351e-05, + "loss": 0.8086, + "step": 8414 + }, + { + "epoch": 1.4599236641221374, + "grad_norm": 0.7267889380455017, + "learning_rate": 1.0230993467586336e-05, + "loss": 0.8718, + "step": 8415 + }, + { + "epoch": 1.4600971547536434, + "grad_norm": 0.7332297563552856, + "learning_rate": 1.0225048843313914e-05, + "loss": 0.6331, + "step": 8416 + }, + { + "epoch": 1.4602706453851493, + "grad_norm": 1.2351951599121094, + "learning_rate": 1.0219105353435951e-05, + "loss": 0.549, + "step": 8417 + }, + { + "epoch": 1.4604441360166551, + "grad_norm": 1.301702618598938, + "learning_rate": 1.021316299864223e-05, + "loss": 0.5626, + "step": 8418 + }, + { + "epoch": 1.460617626648161, + "grad_norm": 1.0427969694137573, + "learning_rate": 1.0207221779622327e-05, + "loss": 0.6437, + "step": 8419 + }, + { + "epoch": 1.4607911172796668, + "grad_norm": 0.9285373687744141, + "learning_rate": 1.0201281697065757e-05, + "loss": 0.5879, + "step": 8420 + }, + { + "epoch": 1.4609646079111727, + "grad_norm": 0.9877512454986572, + "learning_rate": 1.0195342751661856e-05, + "loss": 0.6071, + "step": 8421 + }, + { + "epoch": 1.4611380985426787, + "grad_norm": 1.0631251335144043, + "learning_rate": 1.0189404944099867e-05, + "loss": 0.7854, + "step": 8422 + }, + { + "epoch": 1.4613115891741846, + "grad_norm": 0.8142562508583069, + "learning_rate": 1.0183468275068853e-05, + "loss": 0.6232, + "step": 8423 + }, + { + "epoch": 1.4614850798056904, + "grad_norm": 1.292889952659607, + "learning_rate": 1.017753274525779e-05, + "loss": 0.637, + "step": 8424 + }, + { + "epoch": 1.4616585704371965, + "grad_norm": 1.0010274648666382, + "learning_rate": 1.01715983553555e-05, + "loss": 0.5691, + "step": 8425 + }, + { + "epoch": 1.4618320610687023, + "grad_norm": 0.9427107572555542, + "learning_rate": 1.016566510605067e-05, + "loss": 0.6923, + "step": 8426 + }, + { + "epoch": 1.4620055517002082, + "grad_norm": 1.9668302536010742, + "learning_rate": 1.0159732998031857e-05, + "loss": 0.599, + "step": 8427 + }, + { + "epoch": 1.462179042331714, + "grad_norm": 0.7990998029708862, + "learning_rate": 1.0153802031987504e-05, + "loss": 0.7161, + "step": 8428 + }, + { + "epoch": 1.4623525329632199, + "grad_norm": 0.7187040448188782, + "learning_rate": 1.01478722086059e-05, + "loss": 0.7368, + "step": 8429 + }, + { + "epoch": 1.462526023594726, + "grad_norm": 1.1104599237442017, + "learning_rate": 1.0141943528575205e-05, + "loss": 0.5682, + "step": 8430 + }, + { + "epoch": 1.4626995142262318, + "grad_norm": 0.8144204616546631, + "learning_rate": 1.0136015992583449e-05, + "loss": 0.6982, + "step": 8431 + }, + { + "epoch": 1.4628730048577376, + "grad_norm": 0.7592847347259521, + "learning_rate": 1.0130089601318525e-05, + "loss": 0.7859, + "step": 8432 + }, + { + "epoch": 1.4630464954892437, + "grad_norm": 1.538515567779541, + "learning_rate": 1.0124164355468208e-05, + "loss": 0.5992, + "step": 8433 + }, + { + "epoch": 1.4632199861207495, + "grad_norm": 0.8194349408149719, + "learning_rate": 1.0118240255720128e-05, + "loss": 0.7151, + "step": 8434 + }, + { + "epoch": 1.4633934767522554, + "grad_norm": 1.5449851751327515, + "learning_rate": 1.011231730276178e-05, + "loss": 0.6261, + "step": 8435 + }, + { + "epoch": 1.4635669673837612, + "grad_norm": 0.8338571786880493, + "learning_rate": 1.0106395497280524e-05, + "loss": 0.6531, + "step": 8436 + }, + { + "epoch": 1.463740458015267, + "grad_norm": 0.7999670505523682, + "learning_rate": 1.0100474839963605e-05, + "loss": 0.6661, + "step": 8437 + }, + { + "epoch": 1.4639139486467732, + "grad_norm": 0.8283721804618835, + "learning_rate": 1.0094555331498118e-05, + "loss": 0.6367, + "step": 8438 + }, + { + "epoch": 1.464087439278279, + "grad_norm": 0.8673132658004761, + "learning_rate": 1.0088636972571027e-05, + "loss": 0.6919, + "step": 8439 + }, + { + "epoch": 1.4642609299097848, + "grad_norm": 1.0466936826705933, + "learning_rate": 1.0082719763869153e-05, + "loss": 0.595, + "step": 8440 + }, + { + "epoch": 1.4644344205412907, + "grad_norm": 0.6959648728370667, + "learning_rate": 1.0076803706079224e-05, + "loss": 0.8792, + "step": 8441 + }, + { + "epoch": 1.4646079111727968, + "grad_norm": 1.5184818506240845, + "learning_rate": 1.0070888799887772e-05, + "loss": 0.5308, + "step": 8442 + }, + { + "epoch": 1.4647814018043026, + "grad_norm": 0.8451223969459534, + "learning_rate": 1.0064975045981254e-05, + "loss": 0.832, + "step": 8443 + }, + { + "epoch": 1.4649548924358085, + "grad_norm": 0.8520974516868591, + "learning_rate": 1.0059062445045957e-05, + "loss": 0.792, + "step": 8444 + }, + { + "epoch": 1.4651283830673143, + "grad_norm": 0.8259444236755371, + "learning_rate": 1.0053150997768046e-05, + "loss": 0.8208, + "step": 8445 + }, + { + "epoch": 1.4653018736988201, + "grad_norm": 0.7902709245681763, + "learning_rate": 1.0047240704833544e-05, + "loss": 0.6465, + "step": 8446 + }, + { + "epoch": 1.4654753643303262, + "grad_norm": 1.001044511795044, + "learning_rate": 1.0041331566928365e-05, + "loss": 0.6394, + "step": 8447 + }, + { + "epoch": 1.465648854961832, + "grad_norm": 0.8686853051185608, + "learning_rate": 1.0035423584738262e-05, + "loss": 0.5811, + "step": 8448 + }, + { + "epoch": 1.465822345593338, + "grad_norm": 0.8119032382965088, + "learning_rate": 1.0029516758948863e-05, + "loss": 0.8169, + "step": 8449 + }, + { + "epoch": 1.465995836224844, + "grad_norm": 0.8414337635040283, + "learning_rate": 1.0023611090245653e-05, + "loss": 0.7322, + "step": 8450 + }, + { + "epoch": 1.4661693268563498, + "grad_norm": 1.043776035308838, + "learning_rate": 1.0017706579314016e-05, + "loss": 0.7155, + "step": 8451 + }, + { + "epoch": 1.4663428174878557, + "grad_norm": 0.9843214154243469, + "learning_rate": 1.0011803226839148e-05, + "loss": 0.8137, + "step": 8452 + }, + { + "epoch": 1.4665163081193615, + "grad_norm": 1.000673532485962, + "learning_rate": 1.000590103350616e-05, + "loss": 0.6703, + "step": 8453 + }, + { + "epoch": 1.4666897987508674, + "grad_norm": 0.7425524592399597, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.7896, + "step": 8454 + }, + { + "epoch": 1.4668632893823734, + "grad_norm": 0.7179698944091797, + "learning_rate": 9.994100127005492e-06, + "loss": 0.9089, + "step": 8455 + }, + { + "epoch": 1.4670367800138793, + "grad_norm": 0.9303914904594421, + "learning_rate": 9.988201415207327e-06, + "loss": 0.7178, + "step": 8456 + }, + { + "epoch": 1.4672102706453851, + "grad_norm": 0.7110491394996643, + "learning_rate": 9.982303865290055e-06, + "loss": 0.7695, + "step": 8457 + }, + { + "epoch": 1.4673837612768912, + "grad_norm": 0.8206285238265991, + "learning_rate": 9.976407477938092e-06, + "loss": 0.8376, + "step": 8458 + }, + { + "epoch": 1.467557251908397, + "grad_norm": 0.9266627430915833, + "learning_rate": 9.970512253835713e-06, + "loss": 0.5544, + "step": 8459 + }, + { + "epoch": 1.4677307425399029, + "grad_norm": 0.883059024810791, + "learning_rate": 9.96461819366709e-06, + "loss": 0.7235, + "step": 8460 + }, + { + "epoch": 1.4679042331714087, + "grad_norm": 0.8365076780319214, + "learning_rate": 9.958725298116204e-06, + "loss": 0.9041, + "step": 8461 + }, + { + "epoch": 1.4680777238029146, + "grad_norm": 0.777123749256134, + "learning_rate": 9.952833567866954e-06, + "loss": 0.6754, + "step": 8462 + }, + { + "epoch": 1.4682512144344204, + "grad_norm": 2.297539234161377, + "learning_rate": 9.946943003603067e-06, + "loss": 0.6052, + "step": 8463 + }, + { + "epoch": 1.4684247050659265, + "grad_norm": 0.9138921499252319, + "learning_rate": 9.941053606008176e-06, + "loss": 0.5726, + "step": 8464 + }, + { + "epoch": 1.4685981956974323, + "grad_norm": 0.9136441349983215, + "learning_rate": 9.935165375765718e-06, + "loss": 0.7704, + "step": 8465 + }, + { + "epoch": 1.4687716863289382, + "grad_norm": 0.8914064764976501, + "learning_rate": 9.929278313559054e-06, + "loss": 0.803, + "step": 8466 + }, + { + "epoch": 1.4689451769604442, + "grad_norm": 0.864658534526825, + "learning_rate": 9.923392420071376e-06, + "loss": 0.7686, + "step": 8467 + }, + { + "epoch": 1.46911866759195, + "grad_norm": 0.8166852593421936, + "learning_rate": 9.917507695985752e-06, + "loss": 0.7427, + "step": 8468 + }, + { + "epoch": 1.469292158223456, + "grad_norm": 0.9941490292549133, + "learning_rate": 9.911624141985096e-06, + "loss": 0.5898, + "step": 8469 + }, + { + "epoch": 1.4694656488549618, + "grad_norm": 0.6097172498703003, + "learning_rate": 9.905741758752234e-06, + "loss": 0.7524, + "step": 8470 + }, + { + "epoch": 1.4696391394864676, + "grad_norm": 0.6961132287979126, + "learning_rate": 9.899860546969785e-06, + "loss": 0.7358, + "step": 8471 + }, + { + "epoch": 1.4698126301179737, + "grad_norm": 1.097730040550232, + "learning_rate": 9.893980507320295e-06, + "loss": 0.6978, + "step": 8472 + }, + { + "epoch": 1.4699861207494795, + "grad_norm": 0.7502076625823975, + "learning_rate": 9.888101640486146e-06, + "loss": 0.6918, + "step": 8473 + }, + { + "epoch": 1.4701596113809854, + "grad_norm": 1.4986361265182495, + "learning_rate": 9.882223947149583e-06, + "loss": 0.6536, + "step": 8474 + }, + { + "epoch": 1.4703331020124915, + "grad_norm": 0.8134422302246094, + "learning_rate": 9.876347427992712e-06, + "loss": 0.7817, + "step": 8475 + }, + { + "epoch": 1.4705065926439973, + "grad_norm": 1.0398681163787842, + "learning_rate": 9.870472083697526e-06, + "loss": 0.5552, + "step": 8476 + }, + { + "epoch": 1.4706800832755031, + "grad_norm": 1.698244333267212, + "learning_rate": 9.864597914945859e-06, + "loss": 0.6151, + "step": 8477 + }, + { + "epoch": 1.470853573907009, + "grad_norm": 0.7696563005447388, + "learning_rate": 9.858724922419413e-06, + "loss": 0.7169, + "step": 8478 + }, + { + "epoch": 1.4710270645385148, + "grad_norm": 1.0133099555969238, + "learning_rate": 9.852853106799752e-06, + "loss": 0.6394, + "step": 8479 + }, + { + "epoch": 1.4712005551700207, + "grad_norm": 0.7010124921798706, + "learning_rate": 9.846982468768316e-06, + "loss": 0.7859, + "step": 8480 + }, + { + "epoch": 1.4713740458015268, + "grad_norm": 1.3696974515914917, + "learning_rate": 9.841113009006395e-06, + "loss": 0.677, + "step": 8481 + }, + { + "epoch": 1.4715475364330326, + "grad_norm": 0.9543020129203796, + "learning_rate": 9.83524472819515e-06, + "loss": 0.6869, + "step": 8482 + }, + { + "epoch": 1.4717210270645384, + "grad_norm": 3.94061017036438, + "learning_rate": 9.829377627015595e-06, + "loss": 0.6, + "step": 8483 + }, + { + "epoch": 1.4718945176960445, + "grad_norm": 1.514819622039795, + "learning_rate": 9.823511706148612e-06, + "loss": 0.5692, + "step": 8484 + }, + { + "epoch": 1.4720680083275504, + "grad_norm": 1.2112153768539429, + "learning_rate": 9.81764696627496e-06, + "loss": 0.7656, + "step": 8485 + }, + { + "epoch": 1.4722414989590562, + "grad_norm": 1.2294178009033203, + "learning_rate": 9.811783408075244e-06, + "loss": 0.6954, + "step": 8486 + }, + { + "epoch": 1.472414989590562, + "grad_norm": 1.510826826095581, + "learning_rate": 9.805921032229935e-06, + "loss": 0.6157, + "step": 8487 + }, + { + "epoch": 1.472588480222068, + "grad_norm": 0.8552189469337463, + "learning_rate": 9.800059839419358e-06, + "loss": 0.7239, + "step": 8488 + }, + { + "epoch": 1.472761970853574, + "grad_norm": 0.8934471607208252, + "learning_rate": 9.794199830323741e-06, + "loss": 0.694, + "step": 8489 + }, + { + "epoch": 1.4729354614850798, + "grad_norm": 2.224855661392212, + "learning_rate": 9.788341005623107e-06, + "loss": 0.7054, + "step": 8490 + }, + { + "epoch": 1.4731089521165857, + "grad_norm": 0.7964619398117065, + "learning_rate": 9.782483365997409e-06, + "loss": 0.7312, + "step": 8491 + }, + { + "epoch": 1.4732824427480917, + "grad_norm": 0.9794811606407166, + "learning_rate": 9.776626912126413e-06, + "loss": 0.6924, + "step": 8492 + }, + { + "epoch": 1.4734559333795976, + "grad_norm": 0.8908123970031738, + "learning_rate": 9.770771644689792e-06, + "loss": 0.8268, + "step": 8493 + }, + { + "epoch": 1.4736294240111034, + "grad_norm": 1.0078786611557007, + "learning_rate": 9.764917564367025e-06, + "loss": 0.6368, + "step": 8494 + }, + { + "epoch": 1.4738029146426093, + "grad_norm": 1.0195238590240479, + "learning_rate": 9.75906467183751e-06, + "loss": 0.7743, + "step": 8495 + }, + { + "epoch": 1.473976405274115, + "grad_norm": 0.8835687041282654, + "learning_rate": 9.753212967780472e-06, + "loss": 0.589, + "step": 8496 + }, + { + "epoch": 1.4741498959056212, + "grad_norm": 0.8950802683830261, + "learning_rate": 9.747362452875009e-06, + "loss": 0.6687, + "step": 8497 + }, + { + "epoch": 1.474323386537127, + "grad_norm": 0.8243603110313416, + "learning_rate": 9.741513127800072e-06, + "loss": 0.7013, + "step": 8498 + }, + { + "epoch": 1.4744968771686329, + "grad_norm": 0.9779012203216553, + "learning_rate": 9.735664993234499e-06, + "loss": 0.6479, + "step": 8499 + }, + { + "epoch": 1.4746703678001387, + "grad_norm": 0.8955667018890381, + "learning_rate": 9.729818049856963e-06, + "loss": 0.6388, + "step": 8500 + }, + { + "epoch": 1.4748438584316448, + "grad_norm": 1.878222942352295, + "learning_rate": 9.72397229834601e-06, + "loss": 0.7296, + "step": 8501 + }, + { + "epoch": 1.4750173490631506, + "grad_norm": 0.7965671420097351, + "learning_rate": 9.718127739380043e-06, + "loss": 0.811, + "step": 8502 + }, + { + "epoch": 1.4751908396946565, + "grad_norm": 1.030659794807434, + "learning_rate": 9.712284373637329e-06, + "loss": 0.5751, + "step": 8503 + }, + { + "epoch": 1.4753643303261623, + "grad_norm": 0.7140384912490845, + "learning_rate": 9.706442201796007e-06, + "loss": 0.7573, + "step": 8504 + }, + { + "epoch": 1.4755378209576682, + "grad_norm": 1.1221225261688232, + "learning_rate": 9.700601224534061e-06, + "loss": 0.653, + "step": 8505 + }, + { + "epoch": 1.4757113115891742, + "grad_norm": 0.7878429293632507, + "learning_rate": 9.694761442529345e-06, + "loss": 0.6233, + "step": 8506 + }, + { + "epoch": 1.47588480222068, + "grad_norm": 1.3988245725631714, + "learning_rate": 9.688922856459563e-06, + "loss": 0.532, + "step": 8507 + }, + { + "epoch": 1.476058292852186, + "grad_norm": 1.0901696681976318, + "learning_rate": 9.683085467002306e-06, + "loss": 0.7544, + "step": 8508 + }, + { + "epoch": 1.476231783483692, + "grad_norm": 0.7453171610832214, + "learning_rate": 9.677249274835003e-06, + "loss": 0.7533, + "step": 8509 + }, + { + "epoch": 1.4764052741151978, + "grad_norm": 0.9851856827735901, + "learning_rate": 9.67141428063495e-06, + "loss": 0.6793, + "step": 8510 + }, + { + "epoch": 1.4765787647467037, + "grad_norm": 0.6667001843452454, + "learning_rate": 9.665580485079297e-06, + "loss": 0.7396, + "step": 8511 + }, + { + "epoch": 1.4767522553782095, + "grad_norm": 0.7090960144996643, + "learning_rate": 9.659747888845087e-06, + "loss": 0.8381, + "step": 8512 + }, + { + "epoch": 1.4769257460097154, + "grad_norm": 1.0123482942581177, + "learning_rate": 9.653916492609168e-06, + "loss": 0.7233, + "step": 8513 + }, + { + "epoch": 1.4770992366412214, + "grad_norm": 1.336560845375061, + "learning_rate": 9.648086297048302e-06, + "loss": 0.7915, + "step": 8514 + }, + { + "epoch": 1.4772727272727273, + "grad_norm": 1.409533143043518, + "learning_rate": 9.642257302839085e-06, + "loss": 0.8564, + "step": 8515 + }, + { + "epoch": 1.4774462179042331, + "grad_norm": 0.8501150608062744, + "learning_rate": 9.636429510657974e-06, + "loss": 0.5365, + "step": 8516 + }, + { + "epoch": 1.4776197085357392, + "grad_norm": 1.0981407165527344, + "learning_rate": 9.63060292118129e-06, + "loss": 0.8115, + "step": 8517 + }, + { + "epoch": 1.477793199167245, + "grad_norm": 2.3347830772399902, + "learning_rate": 9.624777535085233e-06, + "loss": 0.5947, + "step": 8518 + }, + { + "epoch": 1.477966689798751, + "grad_norm": 1.0945852994918823, + "learning_rate": 9.61895335304582e-06, + "loss": 0.8511, + "step": 8519 + }, + { + "epoch": 1.4781401804302567, + "grad_norm": 1.2378287315368652, + "learning_rate": 9.61313037573897e-06, + "loss": 0.7324, + "step": 8520 + }, + { + "epoch": 1.4783136710617626, + "grad_norm": 1.2561061382293701, + "learning_rate": 9.607308603840437e-06, + "loss": 0.6124, + "step": 8521 + }, + { + "epoch": 1.4784871616932684, + "grad_norm": 1.4695407152175903, + "learning_rate": 9.601488038025869e-06, + "loss": 0.6587, + "step": 8522 + }, + { + "epoch": 1.4786606523247745, + "grad_norm": 1.0496524572372437, + "learning_rate": 9.59566867897071e-06, + "loss": 0.7106, + "step": 8523 + }, + { + "epoch": 1.4788341429562804, + "grad_norm": 0.8551136255264282, + "learning_rate": 9.589850527350337e-06, + "loss": 0.7637, + "step": 8524 + }, + { + "epoch": 1.4790076335877862, + "grad_norm": 0.8643609285354614, + "learning_rate": 9.584033583839938e-06, + "loss": 0.6082, + "step": 8525 + }, + { + "epoch": 1.4791811242192923, + "grad_norm": 0.871792197227478, + "learning_rate": 9.578217849114579e-06, + "loss": 0.665, + "step": 8526 + }, + { + "epoch": 1.479354614850798, + "grad_norm": 0.9538140296936035, + "learning_rate": 9.572403323849175e-06, + "loss": 0.6316, + "step": 8527 + }, + { + "epoch": 1.479528105482304, + "grad_norm": 0.844017505645752, + "learning_rate": 9.566590008718524e-06, + "loss": 0.6326, + "step": 8528 + }, + { + "epoch": 1.4797015961138098, + "grad_norm": 1.2310161590576172, + "learning_rate": 9.560777904397258e-06, + "loss": 0.8416, + "step": 8529 + }, + { + "epoch": 1.4798750867453156, + "grad_norm": 0.7887097597122192, + "learning_rate": 9.554967011559874e-06, + "loss": 0.7627, + "step": 8530 + }, + { + "epoch": 1.4800485773768217, + "grad_norm": 3.0276875495910645, + "learning_rate": 9.549157330880753e-06, + "loss": 0.6359, + "step": 8531 + }, + { + "epoch": 1.4802220680083276, + "grad_norm": 0.9238868951797485, + "learning_rate": 9.54334886303409e-06, + "loss": 0.6182, + "step": 8532 + }, + { + "epoch": 1.4803955586398334, + "grad_norm": 0.9500969052314758, + "learning_rate": 9.537541608693982e-06, + "loss": 0.6223, + "step": 8533 + }, + { + "epoch": 1.4805690492713395, + "grad_norm": 0.9156795740127563, + "learning_rate": 9.53173556853435e-06, + "loss": 0.6982, + "step": 8534 + }, + { + "epoch": 1.4807425399028453, + "grad_norm": 0.859477162361145, + "learning_rate": 9.52593074322902e-06, + "loss": 0.6392, + "step": 8535 + }, + { + "epoch": 1.4809160305343512, + "grad_norm": 0.7322713732719421, + "learning_rate": 9.520127133451619e-06, + "loss": 0.8156, + "step": 8536 + }, + { + "epoch": 1.481089521165857, + "grad_norm": 1.2173984050750732, + "learning_rate": 9.514324739875684e-06, + "loss": 0.61, + "step": 8537 + }, + { + "epoch": 1.4812630117973629, + "grad_norm": 0.7745656967163086, + "learning_rate": 9.508523563174578e-06, + "loss": 0.8118, + "step": 8538 + }, + { + "epoch": 1.4814365024288687, + "grad_norm": 0.9724113941192627, + "learning_rate": 9.50272360402154e-06, + "loss": 0.8169, + "step": 8539 + }, + { + "epoch": 1.4816099930603748, + "grad_norm": 1.0198888778686523, + "learning_rate": 9.496924863089652e-06, + "loss": 0.5841, + "step": 8540 + }, + { + "epoch": 1.4817834836918806, + "grad_norm": 0.9343327283859253, + "learning_rate": 9.491127341051884e-06, + "loss": 0.8081, + "step": 8541 + }, + { + "epoch": 1.4819569743233865, + "grad_norm": 1.2559216022491455, + "learning_rate": 9.485331038581021e-06, + "loss": 0.7286, + "step": 8542 + }, + { + "epoch": 1.4821304649548925, + "grad_norm": 0.676838755607605, + "learning_rate": 9.479535956349751e-06, + "loss": 0.8009, + "step": 8543 + }, + { + "epoch": 1.4823039555863984, + "grad_norm": 0.9703164100646973, + "learning_rate": 9.473742095030588e-06, + "loss": 0.6481, + "step": 8544 + }, + { + "epoch": 1.4824774462179042, + "grad_norm": 1.499454379081726, + "learning_rate": 9.467949455295925e-06, + "loss": 0.7161, + "step": 8545 + }, + { + "epoch": 1.48265093684941, + "grad_norm": 0.7268586158752441, + "learning_rate": 9.46215803781799e-06, + "loss": 0.8013, + "step": 8546 + }, + { + "epoch": 1.482824427480916, + "grad_norm": 0.9505850076675415, + "learning_rate": 9.456367843268903e-06, + "loss": 0.6541, + "step": 8547 + }, + { + "epoch": 1.482997918112422, + "grad_norm": 1.0516271591186523, + "learning_rate": 9.450578872320613e-06, + "loss": 0.8132, + "step": 8548 + }, + { + "epoch": 1.4831714087439278, + "grad_norm": 0.8915678262710571, + "learning_rate": 9.444791125644937e-06, + "loss": 0.6687, + "step": 8549 + }, + { + "epoch": 1.4833448993754337, + "grad_norm": 0.9466924071311951, + "learning_rate": 9.439004603913542e-06, + "loss": 0.8193, + "step": 8550 + }, + { + "epoch": 1.4835183900069397, + "grad_norm": 0.629422128200531, + "learning_rate": 9.433219307797977e-06, + "loss": 0.8396, + "step": 8551 + }, + { + "epoch": 1.4836918806384456, + "grad_norm": 1.5751148462295532, + "learning_rate": 9.427435237969624e-06, + "loss": 0.6892, + "step": 8552 + }, + { + "epoch": 1.4838653712699514, + "grad_norm": 0.8522116541862488, + "learning_rate": 9.421652395099732e-06, + "loss": 0.661, + "step": 8553 + }, + { + "epoch": 1.4840388619014573, + "grad_norm": 0.8857063055038452, + "learning_rate": 9.415870779859405e-06, + "loss": 0.6827, + "step": 8554 + }, + { + "epoch": 1.4842123525329631, + "grad_norm": 1.112443447113037, + "learning_rate": 9.410090392919598e-06, + "loss": 0.6345, + "step": 8555 + }, + { + "epoch": 1.4843858431644692, + "grad_norm": 0.6972340941429138, + "learning_rate": 9.404311234951148e-06, + "loss": 0.7772, + "step": 8556 + }, + { + "epoch": 1.484559333795975, + "grad_norm": 0.8230925798416138, + "learning_rate": 9.39853330662473e-06, + "loss": 0.6478, + "step": 8557 + }, + { + "epoch": 1.484732824427481, + "grad_norm": 1.4788563251495361, + "learning_rate": 9.392756608610871e-06, + "loss": 0.749, + "step": 8558 + }, + { + "epoch": 1.4849063150589867, + "grad_norm": 1.102135419845581, + "learning_rate": 9.386981141579961e-06, + "loss": 0.6974, + "step": 8559 + }, + { + "epoch": 1.4850798056904928, + "grad_norm": 0.7171000242233276, + "learning_rate": 9.381206906202268e-06, + "loss": 0.6995, + "step": 8560 + }, + { + "epoch": 1.4852532963219987, + "grad_norm": 0.8302825689315796, + "learning_rate": 9.375433903147877e-06, + "loss": 0.7373, + "step": 8561 + }, + { + "epoch": 1.4854267869535045, + "grad_norm": 1.0722918510437012, + "learning_rate": 9.369662133086768e-06, + "loss": 0.6077, + "step": 8562 + }, + { + "epoch": 1.4856002775850103, + "grad_norm": 1.1712939739227295, + "learning_rate": 9.363891596688745e-06, + "loss": 0.6101, + "step": 8563 + }, + { + "epoch": 1.4857737682165162, + "grad_norm": 1.0081614255905151, + "learning_rate": 9.358122294623514e-06, + "loss": 0.5785, + "step": 8564 + }, + { + "epoch": 1.4859472588480223, + "grad_norm": 1.0067394971847534, + "learning_rate": 9.352354227560575e-06, + "loss": 0.5616, + "step": 8565 + }, + { + "epoch": 1.486120749479528, + "grad_norm": 1.1829419136047363, + "learning_rate": 9.34658739616934e-06, + "loss": 0.6858, + "step": 8566 + }, + { + "epoch": 1.486294240111034, + "grad_norm": 0.8359526991844177, + "learning_rate": 9.340821801119053e-06, + "loss": 0.5955, + "step": 8567 + }, + { + "epoch": 1.48646773074254, + "grad_norm": 0.8153902292251587, + "learning_rate": 9.335057443078817e-06, + "loss": 0.6858, + "step": 8568 + }, + { + "epoch": 1.4866412213740459, + "grad_norm": 0.8586882948875427, + "learning_rate": 9.329294322717584e-06, + "loss": 0.7401, + "step": 8569 + }, + { + "epoch": 1.4868147120055517, + "grad_norm": 0.9296817183494568, + "learning_rate": 9.323532440704196e-06, + "loss": 0.6277, + "step": 8570 + }, + { + "epoch": 1.4869882026370576, + "grad_norm": 0.873951256275177, + "learning_rate": 9.31777179770729e-06, + "loss": 0.7017, + "step": 8571 + }, + { + "epoch": 1.4871616932685634, + "grad_norm": 1.883460521697998, + "learning_rate": 9.312012394395423e-06, + "loss": 0.7908, + "step": 8572 + }, + { + "epoch": 1.4873351839000695, + "grad_norm": 4.544217109680176, + "learning_rate": 9.30625423143697e-06, + "loss": 0.5188, + "step": 8573 + }, + { + "epoch": 1.4875086745315753, + "grad_norm": 0.7111706137657166, + "learning_rate": 9.300497309500176e-06, + "loss": 0.6987, + "step": 8574 + }, + { + "epoch": 1.4876821651630812, + "grad_norm": 0.8358369469642639, + "learning_rate": 9.29474162925313e-06, + "loss": 0.6967, + "step": 8575 + }, + { + "epoch": 1.4878556557945872, + "grad_norm": 1.0885305404663086, + "learning_rate": 9.288987191363799e-06, + "loss": 0.5295, + "step": 8576 + }, + { + "epoch": 1.488029146426093, + "grad_norm": 1.1077760457992554, + "learning_rate": 9.283233996499984e-06, + "loss": 0.72, + "step": 8577 + }, + { + "epoch": 1.488202637057599, + "grad_norm": 0.802131712436676, + "learning_rate": 9.277482045329344e-06, + "loss": 0.6807, + "step": 8578 + }, + { + "epoch": 1.4883761276891048, + "grad_norm": 0.8568203449249268, + "learning_rate": 9.271731338519415e-06, + "loss": 0.6296, + "step": 8579 + }, + { + "epoch": 1.4885496183206106, + "grad_norm": 1.0968971252441406, + "learning_rate": 9.265981876737566e-06, + "loss": 0.7964, + "step": 8580 + }, + { + "epoch": 1.4887231089521165, + "grad_norm": 0.815367579460144, + "learning_rate": 9.260233660651025e-06, + "loss": 0.6055, + "step": 8581 + }, + { + "epoch": 1.4888965995836225, + "grad_norm": 1.1572723388671875, + "learning_rate": 9.254486690926878e-06, + "loss": 0.6008, + "step": 8582 + }, + { + "epoch": 1.4890700902151284, + "grad_norm": 1.9300347566604614, + "learning_rate": 9.248740968232084e-06, + "loss": 0.6511, + "step": 8583 + }, + { + "epoch": 1.4892435808466342, + "grad_norm": 1.2847375869750977, + "learning_rate": 9.242996493233414e-06, + "loss": 0.5513, + "step": 8584 + }, + { + "epoch": 1.4894170714781403, + "grad_norm": 0.7096062898635864, + "learning_rate": 9.237253266597544e-06, + "loss": 0.8123, + "step": 8585 + }, + { + "epoch": 1.4895905621096461, + "grad_norm": 1.2176920175552368, + "learning_rate": 9.23151128899097e-06, + "loss": 0.6074, + "step": 8586 + }, + { + "epoch": 1.489764052741152, + "grad_norm": 1.2220444679260254, + "learning_rate": 9.225770561080062e-06, + "loss": 0.7195, + "step": 8587 + }, + { + "epoch": 1.4899375433726578, + "grad_norm": 1.0005877017974854, + "learning_rate": 9.220031083531026e-06, + "loss": 0.9402, + "step": 8588 + }, + { + "epoch": 1.4901110340041637, + "grad_norm": 0.7866361737251282, + "learning_rate": 9.214292857009961e-06, + "loss": 0.6169, + "step": 8589 + }, + { + "epoch": 1.4902845246356697, + "grad_norm": 1.494857668876648, + "learning_rate": 9.208555882182762e-06, + "loss": 0.7268, + "step": 8590 + }, + { + "epoch": 1.4904580152671756, + "grad_norm": 0.9308006167411804, + "learning_rate": 9.202820159715234e-06, + "loss": 0.6886, + "step": 8591 + }, + { + "epoch": 1.4906315058986814, + "grad_norm": 0.9522799253463745, + "learning_rate": 9.197085690273e-06, + "loss": 0.6335, + "step": 8592 + }, + { + "epoch": 1.4908049965301875, + "grad_norm": 0.8912412524223328, + "learning_rate": 9.191352474521572e-06, + "loss": 0.6423, + "step": 8593 + }, + { + "epoch": 1.4909784871616933, + "grad_norm": 1.2185969352722168, + "learning_rate": 9.185620513126275e-06, + "loss": 0.6766, + "step": 8594 + }, + { + "epoch": 1.4911519777931992, + "grad_norm": 1.0056630373001099, + "learning_rate": 9.179889806752322e-06, + "loss": 0.7679, + "step": 8595 + }, + { + "epoch": 1.491325468424705, + "grad_norm": 0.8760972023010254, + "learning_rate": 9.174160356064765e-06, + "loss": 0.729, + "step": 8596 + }, + { + "epoch": 1.4914989590562109, + "grad_norm": 1.203986406326294, + "learning_rate": 9.168432161728515e-06, + "loss": 0.6865, + "step": 8597 + }, + { + "epoch": 1.4916724496877167, + "grad_norm": 0.7040188312530518, + "learning_rate": 9.162705224408326e-06, + "loss": 0.7017, + "step": 8598 + }, + { + "epoch": 1.4918459403192228, + "grad_norm": 0.8385800719261169, + "learning_rate": 9.15697954476883e-06, + "loss": 0.7003, + "step": 8599 + }, + { + "epoch": 1.4920194309507286, + "grad_norm": 0.6725385785102844, + "learning_rate": 9.151255123474493e-06, + "loss": 0.8315, + "step": 8600 + }, + { + "epoch": 1.4921929215822345, + "grad_norm": 1.7756174802780151, + "learning_rate": 9.14553196118964e-06, + "loss": 0.7767, + "step": 8601 + }, + { + "epoch": 1.4923664122137406, + "grad_norm": 0.6734706163406372, + "learning_rate": 9.139810058578451e-06, + "loss": 0.8113, + "step": 8602 + }, + { + "epoch": 1.4925399028452464, + "grad_norm": 0.8494265079498291, + "learning_rate": 9.134089416304951e-06, + "loss": 0.7076, + "step": 8603 + }, + { + "epoch": 1.4927133934767522, + "grad_norm": 0.6556048393249512, + "learning_rate": 9.128370035033046e-06, + "loss": 0.8264, + "step": 8604 + }, + { + "epoch": 1.492886884108258, + "grad_norm": 0.9406347274780273, + "learning_rate": 9.122651915426464e-06, + "loss": 0.5786, + "step": 8605 + }, + { + "epoch": 1.493060374739764, + "grad_norm": 0.7269783616065979, + "learning_rate": 9.116935058148801e-06, + "loss": 0.702, + "step": 8606 + }, + { + "epoch": 1.49323386537127, + "grad_norm": 1.6071827411651611, + "learning_rate": 9.111219463863501e-06, + "loss": 0.7291, + "step": 8607 + }, + { + "epoch": 1.4934073560027759, + "grad_norm": 0.8791521191596985, + "learning_rate": 9.105505133233876e-06, + "loss": 0.7236, + "step": 8608 + }, + { + "epoch": 1.4935808466342817, + "grad_norm": 1.2191895246505737, + "learning_rate": 9.099792066923077e-06, + "loss": 0.7965, + "step": 8609 + }, + { + "epoch": 1.4937543372657878, + "grad_norm": 0.6723723411560059, + "learning_rate": 9.094080265594108e-06, + "loss": 0.6572, + "step": 8610 + }, + { + "epoch": 1.4939278278972936, + "grad_norm": 1.132746934890747, + "learning_rate": 9.088369729909823e-06, + "loss": 0.6162, + "step": 8611 + }, + { + "epoch": 1.4941013185287995, + "grad_norm": 0.7853979468345642, + "learning_rate": 9.082660460532961e-06, + "loss": 0.6993, + "step": 8612 + }, + { + "epoch": 1.4942748091603053, + "grad_norm": 0.8842340111732483, + "learning_rate": 9.07695245812606e-06, + "loss": 0.7034, + "step": 8613 + }, + { + "epoch": 1.4944482997918112, + "grad_norm": 0.7312718629837036, + "learning_rate": 9.071245723351563e-06, + "loss": 0.6716, + "step": 8614 + }, + { + "epoch": 1.4946217904233172, + "grad_norm": 1.2216020822525024, + "learning_rate": 9.065540256871733e-06, + "loss": 0.6565, + "step": 8615 + }, + { + "epoch": 1.494795281054823, + "grad_norm": 0.613294243812561, + "learning_rate": 9.059836059348696e-06, + "loss": 0.6721, + "step": 8616 + }, + { + "epoch": 1.494968771686329, + "grad_norm": 1.4183495044708252, + "learning_rate": 9.054133131444427e-06, + "loss": 0.619, + "step": 8617 + }, + { + "epoch": 1.4951422623178348, + "grad_norm": 0.7423856258392334, + "learning_rate": 9.048431473820776e-06, + "loss": 0.8733, + "step": 8618 + }, + { + "epoch": 1.4953157529493408, + "grad_norm": 0.8733114004135132, + "learning_rate": 9.042731087139398e-06, + "loss": 0.7271, + "step": 8619 + }, + { + "epoch": 1.4954892435808467, + "grad_norm": 1.2791216373443604, + "learning_rate": 9.037031972061854e-06, + "loss": 0.748, + "step": 8620 + }, + { + "epoch": 1.4956627342123525, + "grad_norm": 0.9203847646713257, + "learning_rate": 9.031334129249515e-06, + "loss": 0.6165, + "step": 8621 + }, + { + "epoch": 1.4958362248438584, + "grad_norm": 0.7348108291625977, + "learning_rate": 9.02563755936365e-06, + "loss": 0.6886, + "step": 8622 + }, + { + "epoch": 1.4960097154753642, + "grad_norm": 0.7587302923202515, + "learning_rate": 9.019942263065316e-06, + "loss": 0.7266, + "step": 8623 + }, + { + "epoch": 1.4961832061068703, + "grad_norm": 0.8804594874382019, + "learning_rate": 9.014248241015484e-06, + "loss": 0.6479, + "step": 8624 + }, + { + "epoch": 1.4963566967383761, + "grad_norm": 1.267380714416504, + "learning_rate": 9.008555493874944e-06, + "loss": 0.6741, + "step": 8625 + }, + { + "epoch": 1.496530187369882, + "grad_norm": 1.8381561040878296, + "learning_rate": 9.00286402230434e-06, + "loss": 0.6014, + "step": 8626 + }, + { + "epoch": 1.496703678001388, + "grad_norm": 0.8941490054130554, + "learning_rate": 8.99717382696419e-06, + "loss": 0.5736, + "step": 8627 + }, + { + "epoch": 1.4968771686328939, + "grad_norm": 0.9645127654075623, + "learning_rate": 8.991484908514835e-06, + "loss": 0.6857, + "step": 8628 + }, + { + "epoch": 1.4970506592643997, + "grad_norm": 1.036421537399292, + "learning_rate": 8.985797267616485e-06, + "loss": 0.7576, + "step": 8629 + }, + { + "epoch": 1.4972241498959056, + "grad_norm": 0.8281311392784119, + "learning_rate": 8.980110904929189e-06, + "loss": 0.635, + "step": 8630 + }, + { + "epoch": 1.4973976405274114, + "grad_norm": 0.921423077583313, + "learning_rate": 8.974425821112877e-06, + "loss": 0.5955, + "step": 8631 + }, + { + "epoch": 1.4975711311589175, + "grad_norm": 0.6703919172286987, + "learning_rate": 8.968742016827283e-06, + "loss": 0.5686, + "step": 8632 + }, + { + "epoch": 1.4977446217904233, + "grad_norm": 0.9083982110023499, + "learning_rate": 8.963059492732039e-06, + "loss": 0.7478, + "step": 8633 + }, + { + "epoch": 1.4979181124219292, + "grad_norm": 0.8873187899589539, + "learning_rate": 8.957378249486592e-06, + "loss": 0.7377, + "step": 8634 + }, + { + "epoch": 1.4980916030534353, + "grad_norm": 0.8857871890068054, + "learning_rate": 8.951698287750283e-06, + "loss": 0.7769, + "step": 8635 + }, + { + "epoch": 1.498265093684941, + "grad_norm": 0.8813860416412354, + "learning_rate": 8.946019608182245e-06, + "loss": 0.7731, + "step": 8636 + }, + { + "epoch": 1.498438584316447, + "grad_norm": 0.753755509853363, + "learning_rate": 8.940342211441522e-06, + "loss": 0.8357, + "step": 8637 + }, + { + "epoch": 1.4986120749479528, + "grad_norm": 0.9052684307098389, + "learning_rate": 8.93466609818697e-06, + "loss": 0.6302, + "step": 8638 + }, + { + "epoch": 1.4987855655794586, + "grad_norm": 0.9361521601676941, + "learning_rate": 8.928991269077311e-06, + "loss": 0.7134, + "step": 8639 + }, + { + "epoch": 1.4989590562109645, + "grad_norm": 0.9007132053375244, + "learning_rate": 8.92331772477111e-06, + "loss": 0.6472, + "step": 8640 + }, + { + "epoch": 1.4991325468424705, + "grad_norm": 1.0852935314178467, + "learning_rate": 8.917645465926807e-06, + "loss": 0.6396, + "step": 8641 + }, + { + "epoch": 1.4993060374739764, + "grad_norm": 0.8199490308761597, + "learning_rate": 8.91197449320265e-06, + "loss": 0.6277, + "step": 8642 + }, + { + "epoch": 1.4994795281054822, + "grad_norm": 1.7509039640426636, + "learning_rate": 8.906304807256775e-06, + "loss": 0.6461, + "step": 8643 + }, + { + "epoch": 1.4996530187369883, + "grad_norm": 0.9358723163604736, + "learning_rate": 8.900636408747156e-06, + "loss": 0.562, + "step": 8644 + }, + { + "epoch": 1.4998265093684942, + "grad_norm": 0.8062341213226318, + "learning_rate": 8.894969298331617e-06, + "loss": 0.6, + "step": 8645 + }, + { + "epoch": 1.5, + "grad_norm": 1.0771723985671997, + "learning_rate": 8.889303476667823e-06, + "loss": 0.5979, + "step": 8646 + }, + { + "epoch": 1.5001734906315058, + "grad_norm": 1.1561203002929688, + "learning_rate": 8.883638944413313e-06, + "loss": 0.8447, + "step": 8647 + }, + { + "epoch": 1.5003469812630117, + "grad_norm": 1.0808532238006592, + "learning_rate": 8.877975702225457e-06, + "loss": 0.6985, + "step": 8648 + }, + { + "epoch": 1.5005204718945175, + "grad_norm": 0.8741946816444397, + "learning_rate": 8.872313750761482e-06, + "loss": 0.6935, + "step": 8649 + }, + { + "epoch": 1.5006939625260236, + "grad_norm": 0.8525562882423401, + "learning_rate": 8.866653090678452e-06, + "loss": 0.6074, + "step": 8650 + }, + { + "epoch": 1.5008674531575295, + "grad_norm": 0.8524796962738037, + "learning_rate": 8.860993722633312e-06, + "loss": 0.5806, + "step": 8651 + }, + { + "epoch": 1.5010409437890355, + "grad_norm": 0.9304973483085632, + "learning_rate": 8.855335647282833e-06, + "loss": 0.7133, + "step": 8652 + }, + { + "epoch": 1.5012144344205414, + "grad_norm": 0.8788804411888123, + "learning_rate": 8.849678865283635e-06, + "loss": 0.7156, + "step": 8653 + }, + { + "epoch": 1.5013879250520472, + "grad_norm": 0.9486120939254761, + "learning_rate": 8.844023377292198e-06, + "loss": 0.7639, + "step": 8654 + }, + { + "epoch": 1.501561415683553, + "grad_norm": 0.7377996444702148, + "learning_rate": 8.838369183964841e-06, + "loss": 0.6619, + "step": 8655 + }, + { + "epoch": 1.501734906315059, + "grad_norm": 0.9218708872795105, + "learning_rate": 8.832716285957754e-06, + "loss": 0.6859, + "step": 8656 + }, + { + "epoch": 1.5019083969465647, + "grad_norm": 1.0594158172607422, + "learning_rate": 8.827064683926954e-06, + "loss": 0.6647, + "step": 8657 + }, + { + "epoch": 1.5020818875780708, + "grad_norm": 1.1117682456970215, + "learning_rate": 8.821414378528314e-06, + "loss": 0.5847, + "step": 8658 + }, + { + "epoch": 1.5022553782095767, + "grad_norm": 1.2291145324707031, + "learning_rate": 8.815765370417557e-06, + "loss": 0.5599, + "step": 8659 + }, + { + "epoch": 1.5024288688410827, + "grad_norm": 0.9111098051071167, + "learning_rate": 8.810117660250275e-06, + "loss": 0.6321, + "step": 8660 + }, + { + "epoch": 1.5026023594725886, + "grad_norm": 0.6874105930328369, + "learning_rate": 8.80447124868186e-06, + "loss": 0.7561, + "step": 8661 + }, + { + "epoch": 1.5027758501040944, + "grad_norm": 1.252671480178833, + "learning_rate": 8.798826136367613e-06, + "loss": 0.6669, + "step": 8662 + }, + { + "epoch": 1.5029493407356003, + "grad_norm": 0.8946930170059204, + "learning_rate": 8.793182323962635e-06, + "loss": 0.6227, + "step": 8663 + }, + { + "epoch": 1.5031228313671061, + "grad_norm": 0.9339969754219055, + "learning_rate": 8.787539812121924e-06, + "loss": 0.6486, + "step": 8664 + }, + { + "epoch": 1.503296321998612, + "grad_norm": 0.942572832107544, + "learning_rate": 8.781898601500265e-06, + "loss": 0.6255, + "step": 8665 + }, + { + "epoch": 1.5034698126301178, + "grad_norm": 1.2333080768585205, + "learning_rate": 8.776258692752355e-06, + "loss": 0.563, + "step": 8666 + }, + { + "epoch": 1.5036433032616239, + "grad_norm": 1.5359143018722534, + "learning_rate": 8.7706200865327e-06, + "loss": 0.917, + "step": 8667 + }, + { + "epoch": 1.5038167938931297, + "grad_norm": 0.943639874458313, + "learning_rate": 8.76498278349567e-06, + "loss": 0.6902, + "step": 8668 + }, + { + "epoch": 1.5039902845246358, + "grad_norm": 0.848282516002655, + "learning_rate": 8.75934678429547e-06, + "loss": 0.6725, + "step": 8669 + }, + { + "epoch": 1.5041637751561416, + "grad_norm": 2.104316473007202, + "learning_rate": 8.753712089586184e-06, + "loss": 0.6714, + "step": 8670 + }, + { + "epoch": 1.5043372657876475, + "grad_norm": 0.6891066431999207, + "learning_rate": 8.748078700021714e-06, + "loss": 0.6134, + "step": 8671 + }, + { + "epoch": 1.5045107564191533, + "grad_norm": 1.625869870185852, + "learning_rate": 8.74244661625582e-06, + "loss": 0.821, + "step": 8672 + }, + { + "epoch": 1.5046842470506592, + "grad_norm": 1.497833251953125, + "learning_rate": 8.736815838942114e-06, + "loss": 0.771, + "step": 8673 + }, + { + "epoch": 1.504857737682165, + "grad_norm": 0.9495749473571777, + "learning_rate": 8.731186368734049e-06, + "loss": 0.6782, + "step": 8674 + }, + { + "epoch": 1.505031228313671, + "grad_norm": 1.0892244577407837, + "learning_rate": 8.725558206284944e-06, + "loss": 0.7527, + "step": 8675 + }, + { + "epoch": 1.505204718945177, + "grad_norm": 0.729296088218689, + "learning_rate": 8.719931352247944e-06, + "loss": 0.6904, + "step": 8676 + }, + { + "epoch": 1.505378209576683, + "grad_norm": 1.172075629234314, + "learning_rate": 8.714305807276056e-06, + "loss": 0.637, + "step": 8677 + }, + { + "epoch": 1.5055517002081888, + "grad_norm": 1.68405020236969, + "learning_rate": 8.708681572022122e-06, + "loss": 0.8989, + "step": 8678 + }, + { + "epoch": 1.5057251908396947, + "grad_norm": 0.9458889365196228, + "learning_rate": 8.703058647138855e-06, + "loss": 0.6023, + "step": 8679 + }, + { + "epoch": 1.5058986814712005, + "grad_norm": 0.8928313851356506, + "learning_rate": 8.697437033278797e-06, + "loss": 0.6809, + "step": 8680 + }, + { + "epoch": 1.5060721721027064, + "grad_norm": 1.0344091653823853, + "learning_rate": 8.691816731094341e-06, + "loss": 0.6677, + "step": 8681 + }, + { + "epoch": 1.5062456627342122, + "grad_norm": 0.9545073509216309, + "learning_rate": 8.686197741237722e-06, + "loss": 0.5836, + "step": 8682 + }, + { + "epoch": 1.5064191533657183, + "grad_norm": 0.795782208442688, + "learning_rate": 8.680580064361053e-06, + "loss": 0.8396, + "step": 8683 + }, + { + "epoch": 1.5065926439972241, + "grad_norm": 0.8766036033630371, + "learning_rate": 8.674963701116243e-06, + "loss": 0.7373, + "step": 8684 + }, + { + "epoch": 1.5067661346287302, + "grad_norm": 1.2608840465545654, + "learning_rate": 8.669348652155101e-06, + "loss": 0.6211, + "step": 8685 + }, + { + "epoch": 1.506939625260236, + "grad_norm": 1.0810413360595703, + "learning_rate": 8.663734918129247e-06, + "loss": 0.7683, + "step": 8686 + }, + { + "epoch": 1.507113115891742, + "grad_norm": 0.823975145816803, + "learning_rate": 8.658122499690166e-06, + "loss": 0.7114, + "step": 8687 + }, + { + "epoch": 1.5072866065232478, + "grad_norm": 1.471629023551941, + "learning_rate": 8.652511397489181e-06, + "loss": 0.7234, + "step": 8688 + }, + { + "epoch": 1.5074600971547536, + "grad_norm": 0.8432521820068359, + "learning_rate": 8.646901612177484e-06, + "loss": 0.7108, + "step": 8689 + }, + { + "epoch": 1.5076335877862594, + "grad_norm": 0.7302473187446594, + "learning_rate": 8.641293144406067e-06, + "loss": 0.6718, + "step": 8690 + }, + { + "epoch": 1.5078070784177653, + "grad_norm": 1.4336203336715698, + "learning_rate": 8.635685994825824e-06, + "loss": 0.7268, + "step": 8691 + }, + { + "epoch": 1.5079805690492714, + "grad_norm": 0.982468843460083, + "learning_rate": 8.630080164087456e-06, + "loss": 0.6371, + "step": 8692 + }, + { + "epoch": 1.5081540596807772, + "grad_norm": 1.4009476900100708, + "learning_rate": 8.624475652841549e-06, + "loss": 0.5901, + "step": 8693 + }, + { + "epoch": 1.5083275503122833, + "grad_norm": 0.7309932708740234, + "learning_rate": 8.618872461738483e-06, + "loss": 0.6022, + "step": 8694 + }, + { + "epoch": 1.5085010409437891, + "grad_norm": 0.7125513553619385, + "learning_rate": 8.613270591428537e-06, + "loss": 0.8197, + "step": 8695 + }, + { + "epoch": 1.508674531575295, + "grad_norm": 0.7505030035972595, + "learning_rate": 8.607670042561807e-06, + "loss": 0.6865, + "step": 8696 + }, + { + "epoch": 1.5088480222068008, + "grad_norm": 0.9548516273498535, + "learning_rate": 8.602070815788241e-06, + "loss": 0.6908, + "step": 8697 + }, + { + "epoch": 1.5090215128383067, + "grad_norm": 0.8782280087471008, + "learning_rate": 8.596472911757633e-06, + "loss": 0.696, + "step": 8698 + }, + { + "epoch": 1.5091950034698125, + "grad_norm": 1.0532952547073364, + "learning_rate": 8.59087633111964e-06, + "loss": 0.6801, + "step": 8699 + }, + { + "epoch": 1.5093684941013186, + "grad_norm": 0.9790932536125183, + "learning_rate": 8.58528107452374e-06, + "loss": 0.6488, + "step": 8700 + }, + { + "epoch": 1.5095419847328244, + "grad_norm": 1.0710840225219727, + "learning_rate": 8.579687142619267e-06, + "loss": 0.6401, + "step": 8701 + }, + { + "epoch": 1.5097154753643305, + "grad_norm": 0.9019536972045898, + "learning_rate": 8.574094536055423e-06, + "loss": 0.5983, + "step": 8702 + }, + { + "epoch": 1.5098889659958363, + "grad_norm": 0.7647848129272461, + "learning_rate": 8.568503255481204e-06, + "loss": 0.7996, + "step": 8703 + }, + { + "epoch": 1.5100624566273422, + "grad_norm": 1.0439449548721313, + "learning_rate": 8.562913301545513e-06, + "loss": 0.7161, + "step": 8704 + }, + { + "epoch": 1.510235947258848, + "grad_norm": 0.8429141640663147, + "learning_rate": 8.557324674897061e-06, + "loss": 0.7542, + "step": 8705 + }, + { + "epoch": 1.5104094378903539, + "grad_norm": 0.9142834544181824, + "learning_rate": 8.55173737618441e-06, + "loss": 0.8003, + "step": 8706 + }, + { + "epoch": 1.5105829285218597, + "grad_norm": 0.7735834717750549, + "learning_rate": 8.54615140605597e-06, + "loss": 0.6312, + "step": 8707 + }, + { + "epoch": 1.5107564191533656, + "grad_norm": 0.7899014949798584, + "learning_rate": 8.540566765160016e-06, + "loss": 0.7288, + "step": 8708 + }, + { + "epoch": 1.5109299097848716, + "grad_norm": 3.752467393875122, + "learning_rate": 8.53498345414464e-06, + "loss": 0.6416, + "step": 8709 + }, + { + "epoch": 1.5111034004163775, + "grad_norm": 0.8472861051559448, + "learning_rate": 8.529401473657795e-06, + "loss": 0.6239, + "step": 8710 + }, + { + "epoch": 1.5112768910478835, + "grad_norm": 1.3371511697769165, + "learning_rate": 8.523820824347266e-06, + "loss": 0.7373, + "step": 8711 + }, + { + "epoch": 1.5114503816793894, + "grad_norm": 0.8310530185699463, + "learning_rate": 8.518241506860719e-06, + "loss": 0.6624, + "step": 8712 + }, + { + "epoch": 1.5116238723108952, + "grad_norm": 1.5058526992797852, + "learning_rate": 8.512663521845607e-06, + "loss": 0.6393, + "step": 8713 + }, + { + "epoch": 1.511797362942401, + "grad_norm": 0.7586250901222229, + "learning_rate": 8.507086869949287e-06, + "loss": 0.7329, + "step": 8714 + }, + { + "epoch": 1.511970853573907, + "grad_norm": 0.8346177339553833, + "learning_rate": 8.50151155181893e-06, + "loss": 0.6005, + "step": 8715 + }, + { + "epoch": 1.5121443442054128, + "grad_norm": 0.9492912292480469, + "learning_rate": 8.495937568101551e-06, + "loss": 0.6858, + "step": 8716 + }, + { + "epoch": 1.5123178348369188, + "grad_norm": 1.3562021255493164, + "learning_rate": 8.49036491944402e-06, + "loss": 0.5835, + "step": 8717 + }, + { + "epoch": 1.5124913254684247, + "grad_norm": 0.9317839741706848, + "learning_rate": 8.484793606493054e-06, + "loss": 0.693, + "step": 8718 + }, + { + "epoch": 1.5126648160999308, + "grad_norm": 0.9044678807258606, + "learning_rate": 8.47922362989521e-06, + "loss": 0.7373, + "step": 8719 + }, + { + "epoch": 1.5128383067314366, + "grad_norm": 0.8151566386222839, + "learning_rate": 8.473654990296887e-06, + "loss": 0.771, + "step": 8720 + }, + { + "epoch": 1.5130117973629424, + "grad_norm": 0.7804116606712341, + "learning_rate": 8.468087688344329e-06, + "loss": 0.7445, + "step": 8721 + }, + { + "epoch": 1.5131852879944483, + "grad_norm": 0.9387608766555786, + "learning_rate": 8.462521724683637e-06, + "loss": 0.5675, + "step": 8722 + }, + { + "epoch": 1.5133587786259541, + "grad_norm": 0.9193735718727112, + "learning_rate": 8.456957099960743e-06, + "loss": 0.8496, + "step": 8723 + }, + { + "epoch": 1.51353226925746, + "grad_norm": 1.6663531064987183, + "learning_rate": 8.451393814821427e-06, + "loss": 0.707, + "step": 8724 + }, + { + "epoch": 1.5137057598889658, + "grad_norm": 0.8037397265434265, + "learning_rate": 8.445831869911317e-06, + "loss": 0.723, + "step": 8725 + }, + { + "epoch": 1.513879250520472, + "grad_norm": 1.269026517868042, + "learning_rate": 8.440271265875875e-06, + "loss": 0.5652, + "step": 8726 + }, + { + "epoch": 1.5140527411519777, + "grad_norm": 0.8081490397453308, + "learning_rate": 8.434712003360427e-06, + "loss": 0.702, + "step": 8727 + }, + { + "epoch": 1.5142262317834838, + "grad_norm": 0.8634703159332275, + "learning_rate": 8.42915408301013e-06, + "loss": 0.7251, + "step": 8728 + }, + { + "epoch": 1.5143997224149897, + "grad_norm": 1.680543065071106, + "learning_rate": 8.423597505469983e-06, + "loss": 0.6414, + "step": 8729 + }, + { + "epoch": 1.5145732130464955, + "grad_norm": 0.8394985198974609, + "learning_rate": 8.418042271384828e-06, + "loss": 0.6097, + "step": 8730 + }, + { + "epoch": 1.5147467036780013, + "grad_norm": 0.7513120174407959, + "learning_rate": 8.412488381399378e-06, + "loss": 0.7439, + "step": 8731 + }, + { + "epoch": 1.5149201943095072, + "grad_norm": 1.2480336427688599, + "learning_rate": 8.406935836158138e-06, + "loss": 0.5702, + "step": 8732 + }, + { + "epoch": 1.515093684941013, + "grad_norm": 0.765986979007721, + "learning_rate": 8.401384636305509e-06, + "loss": 0.7483, + "step": 8733 + }, + { + "epoch": 1.515267175572519, + "grad_norm": 1.0906040668487549, + "learning_rate": 8.39583478248571e-06, + "loss": 0.556, + "step": 8734 + }, + { + "epoch": 1.515440666204025, + "grad_norm": 0.9435783624649048, + "learning_rate": 8.390286275342805e-06, + "loss": 0.6305, + "step": 8735 + }, + { + "epoch": 1.515614156835531, + "grad_norm": 0.9871505498886108, + "learning_rate": 8.3847391155207e-06, + "loss": 0.59, + "step": 8736 + }, + { + "epoch": 1.5157876474670369, + "grad_norm": 0.8121782541275024, + "learning_rate": 8.379193303663162e-06, + "loss": 0.6366, + "step": 8737 + }, + { + "epoch": 1.5159611380985427, + "grad_norm": 0.8033392429351807, + "learning_rate": 8.373648840413781e-06, + "loss": 0.7849, + "step": 8738 + }, + { + "epoch": 1.5161346287300486, + "grad_norm": 0.9808334708213806, + "learning_rate": 8.368105726416e-06, + "loss": 0.6383, + "step": 8739 + }, + { + "epoch": 1.5163081193615544, + "grad_norm": 0.881854772567749, + "learning_rate": 8.362563962313095e-06, + "loss": 0.6807, + "step": 8740 + }, + { + "epoch": 1.5164816099930603, + "grad_norm": 1.5732028484344482, + "learning_rate": 8.357023548748217e-06, + "loss": 0.692, + "step": 8741 + }, + { + "epoch": 1.5166551006245663, + "grad_norm": 0.8473667502403259, + "learning_rate": 8.35148448636431e-06, + "loss": 0.6868, + "step": 8742 + }, + { + "epoch": 1.5168285912560722, + "grad_norm": 1.6209126710891724, + "learning_rate": 8.345946775804209e-06, + "loss": 0.6674, + "step": 8743 + }, + { + "epoch": 1.5170020818875782, + "grad_norm": 2.5117719173431396, + "learning_rate": 8.340410417710562e-06, + "loss": 0.649, + "step": 8744 + }, + { + "epoch": 1.517175572519084, + "grad_norm": 0.9129233956336975, + "learning_rate": 8.334875412725874e-06, + "loss": 0.7012, + "step": 8745 + }, + { + "epoch": 1.51734906315059, + "grad_norm": 0.866087019443512, + "learning_rate": 8.32934176149248e-06, + "loss": 0.641, + "step": 8746 + }, + { + "epoch": 1.5175225537820958, + "grad_norm": 0.7993111610412598, + "learning_rate": 8.323809464652579e-06, + "loss": 0.6982, + "step": 8747 + }, + { + "epoch": 1.5176960444136016, + "grad_norm": 0.8884661197662354, + "learning_rate": 8.318278522848198e-06, + "loss": 0.6454, + "step": 8748 + }, + { + "epoch": 1.5178695350451075, + "grad_norm": 0.9014550447463989, + "learning_rate": 8.312748936721196e-06, + "loss": 0.6165, + "step": 8749 + }, + { + "epoch": 1.5180430256766133, + "grad_norm": 1.0556797981262207, + "learning_rate": 8.307220706913308e-06, + "loss": 0.6148, + "step": 8750 + }, + { + "epoch": 1.5182165163081194, + "grad_norm": 0.9356533885002136, + "learning_rate": 8.30169383406608e-06, + "loss": 0.663, + "step": 8751 + }, + { + "epoch": 1.5183900069396252, + "grad_norm": 1.0932947397232056, + "learning_rate": 8.296168318820914e-06, + "loss": 0.604, + "step": 8752 + }, + { + "epoch": 1.5185634975711313, + "grad_norm": 1.1307965517044067, + "learning_rate": 8.290644161819046e-06, + "loss": 0.6243, + "step": 8753 + }, + { + "epoch": 1.5187369882026371, + "grad_norm": 1.1548904180526733, + "learning_rate": 8.28512136370158e-06, + "loss": 0.8596, + "step": 8754 + }, + { + "epoch": 1.518910478834143, + "grad_norm": 0.6748799085617065, + "learning_rate": 8.279599925109415e-06, + "loss": 0.8538, + "step": 8755 + }, + { + "epoch": 1.5190839694656488, + "grad_norm": 0.8357910513877869, + "learning_rate": 8.274079846683346e-06, + "loss": 0.6599, + "step": 8756 + }, + { + "epoch": 1.5192574600971547, + "grad_norm": 0.8060771226882935, + "learning_rate": 8.268561129063975e-06, + "loss": 0.7922, + "step": 8757 + }, + { + "epoch": 1.5194309507286605, + "grad_norm": 1.0987573862075806, + "learning_rate": 8.263043772891752e-06, + "loss": 0.6151, + "step": 8758 + }, + { + "epoch": 1.5196044413601666, + "grad_norm": 0.7658439874649048, + "learning_rate": 8.257527778806968e-06, + "loss": 0.7052, + "step": 8759 + }, + { + "epoch": 1.5197779319916724, + "grad_norm": 0.8992171883583069, + "learning_rate": 8.252013147449785e-06, + "loss": 0.6136, + "step": 8760 + }, + { + "epoch": 1.5199514226231785, + "grad_norm": 0.7366397976875305, + "learning_rate": 8.246499879460149e-06, + "loss": 0.8931, + "step": 8761 + }, + { + "epoch": 1.5201249132546844, + "grad_norm": 1.3429559469223022, + "learning_rate": 8.240987975477903e-06, + "loss": 0.6315, + "step": 8762 + }, + { + "epoch": 1.5202984038861902, + "grad_norm": 1.5855289697647095, + "learning_rate": 8.2354774361427e-06, + "loss": 0.6539, + "step": 8763 + }, + { + "epoch": 1.520471894517696, + "grad_norm": 1.098713994026184, + "learning_rate": 8.229968262094064e-06, + "loss": 0.6555, + "step": 8764 + }, + { + "epoch": 1.520645385149202, + "grad_norm": 0.9491657018661499, + "learning_rate": 8.224460453971307e-06, + "loss": 0.7336, + "step": 8765 + }, + { + "epoch": 1.5208188757807077, + "grad_norm": 0.8541056513786316, + "learning_rate": 8.218954012413647e-06, + "loss": 0.6096, + "step": 8766 + }, + { + "epoch": 1.5209923664122136, + "grad_norm": 0.8711955547332764, + "learning_rate": 8.2134489380601e-06, + "loss": 0.6492, + "step": 8767 + }, + { + "epoch": 1.5211658570437196, + "grad_norm": 1.3064299821853638, + "learning_rate": 8.207945231549539e-06, + "loss": 0.67, + "step": 8768 + }, + { + "epoch": 1.5213393476752255, + "grad_norm": 1.0181825160980225, + "learning_rate": 8.202442893520666e-06, + "loss": 0.5856, + "step": 8769 + }, + { + "epoch": 1.5215128383067316, + "grad_norm": 1.7912498712539673, + "learning_rate": 8.19694192461205e-06, + "loss": 0.8896, + "step": 8770 + }, + { + "epoch": 1.5216863289382374, + "grad_norm": 0.8200807571411133, + "learning_rate": 8.191442325462075e-06, + "loss": 0.6766, + "step": 8771 + }, + { + "epoch": 1.5218598195697433, + "grad_norm": 2.0739011764526367, + "learning_rate": 8.185944096708982e-06, + "loss": 0.7739, + "step": 8772 + }, + { + "epoch": 1.522033310201249, + "grad_norm": 0.7645139694213867, + "learning_rate": 8.180447238990838e-06, + "loss": 0.6881, + "step": 8773 + }, + { + "epoch": 1.522206800832755, + "grad_norm": 0.8422749042510986, + "learning_rate": 8.17495175294556e-06, + "loss": 0.7594, + "step": 8774 + }, + { + "epoch": 1.5223802914642608, + "grad_norm": 0.8097181916236877, + "learning_rate": 8.169457639210916e-06, + "loss": 0.7485, + "step": 8775 + }, + { + "epoch": 1.5225537820957669, + "grad_norm": 2.0968470573425293, + "learning_rate": 8.1639648984245e-06, + "loss": 0.7556, + "step": 8776 + }, + { + "epoch": 1.5227272727272727, + "grad_norm": 1.0154684782028198, + "learning_rate": 8.158473531223748e-06, + "loss": 0.6219, + "step": 8777 + }, + { + "epoch": 1.5229007633587788, + "grad_norm": 1.1207032203674316, + "learning_rate": 8.152983538245933e-06, + "loss": 0.6553, + "step": 8778 + }, + { + "epoch": 1.5230742539902846, + "grad_norm": 1.042795181274414, + "learning_rate": 8.147494920128192e-06, + "loss": 0.5471, + "step": 8779 + }, + { + "epoch": 1.5232477446217905, + "grad_norm": 0.8392865657806396, + "learning_rate": 8.142007677507475e-06, + "loss": 0.6471, + "step": 8780 + }, + { + "epoch": 1.5234212352532963, + "grad_norm": 1.3669254779815674, + "learning_rate": 8.136521811020584e-06, + "loss": 0.5433, + "step": 8781 + }, + { + "epoch": 1.5235947258848022, + "grad_norm": 0.6280065178871155, + "learning_rate": 8.131037321304154e-06, + "loss": 0.7556, + "step": 8782 + }, + { + "epoch": 1.523768216516308, + "grad_norm": 0.7921758890151978, + "learning_rate": 8.125554208994688e-06, + "loss": 0.7094, + "step": 8783 + }, + { + "epoch": 1.523941707147814, + "grad_norm": 1.0207020044326782, + "learning_rate": 8.120072474728476e-06, + "loss": 0.6672, + "step": 8784 + }, + { + "epoch": 1.52411519777932, + "grad_norm": 1.01663076877594, + "learning_rate": 8.114592119141704e-06, + "loss": 0.5752, + "step": 8785 + }, + { + "epoch": 1.5242886884108258, + "grad_norm": 0.7511789798736572, + "learning_rate": 8.10911314287037e-06, + "loss": 0.7328, + "step": 8786 + }, + { + "epoch": 1.5244621790423318, + "grad_norm": 0.739768385887146, + "learning_rate": 8.103635546550305e-06, + "loss": 0.6649, + "step": 8787 + }, + { + "epoch": 1.5246356696738377, + "grad_norm": 0.790746808052063, + "learning_rate": 8.098159330817192e-06, + "loss": 0.6656, + "step": 8788 + }, + { + "epoch": 1.5248091603053435, + "grad_norm": 0.6809366345405579, + "learning_rate": 8.092684496306573e-06, + "loss": 0.7559, + "step": 8789 + }, + { + "epoch": 1.5249826509368494, + "grad_norm": 1.0132341384887695, + "learning_rate": 8.087211043653777e-06, + "loss": 0.5807, + "step": 8790 + }, + { + "epoch": 1.5251561415683552, + "grad_norm": 0.7272564768791199, + "learning_rate": 8.08173897349403e-06, + "loss": 0.8086, + "step": 8791 + }, + { + "epoch": 1.525329632199861, + "grad_norm": 0.7236567735671997, + "learning_rate": 8.076268286462352e-06, + "loss": 0.7803, + "step": 8792 + }, + { + "epoch": 1.5255031228313671, + "grad_norm": 1.0134706497192383, + "learning_rate": 8.070798983193651e-06, + "loss": 0.7314, + "step": 8793 + }, + { + "epoch": 1.525676613462873, + "grad_norm": 1.0489037036895752, + "learning_rate": 8.065331064322612e-06, + "loss": 0.8479, + "step": 8794 + }, + { + "epoch": 1.525850104094379, + "grad_norm": 0.6739673614501953, + "learning_rate": 8.059864530483816e-06, + "loss": 0.8137, + "step": 8795 + }, + { + "epoch": 1.526023594725885, + "grad_norm": 1.0908654928207397, + "learning_rate": 8.054399382311657e-06, + "loss": 0.6708, + "step": 8796 + }, + { + "epoch": 1.5261970853573907, + "grad_norm": 1.0803344249725342, + "learning_rate": 8.048935620440359e-06, + "loss": 0.6589, + "step": 8797 + }, + { + "epoch": 1.5263705759888966, + "grad_norm": 0.8907851576805115, + "learning_rate": 8.043473245504017e-06, + "loss": 0.6438, + "step": 8798 + }, + { + "epoch": 1.5265440666204024, + "grad_norm": 0.9591167569160461, + "learning_rate": 8.038012258136536e-06, + "loss": 0.7314, + "step": 8799 + }, + { + "epoch": 1.5267175572519083, + "grad_norm": 0.8286004662513733, + "learning_rate": 8.032552658971672e-06, + "loss": 0.6533, + "step": 8800 + }, + { + "epoch": 1.5268910478834143, + "grad_norm": 0.9312100410461426, + "learning_rate": 8.02709444864301e-06, + "loss": 0.6975, + "step": 8801 + }, + { + "epoch": 1.5270645385149202, + "grad_norm": 1.2509292364120483, + "learning_rate": 8.021637627784e-06, + "loss": 0.7927, + "step": 8802 + }, + { + "epoch": 1.5272380291464263, + "grad_norm": 0.8572384119033813, + "learning_rate": 8.01618219702789e-06, + "loss": 0.6805, + "step": 8803 + }, + { + "epoch": 1.527411519777932, + "grad_norm": 1.1578960418701172, + "learning_rate": 8.010728157007805e-06, + "loss": 0.7795, + "step": 8804 + }, + { + "epoch": 1.527585010409438, + "grad_norm": 0.7449067831039429, + "learning_rate": 8.005275508356689e-06, + "loss": 0.6337, + "step": 8805 + }, + { + "epoch": 1.5277585010409438, + "grad_norm": 0.8748636245727539, + "learning_rate": 7.999824251707324e-06, + "loss": 0.5643, + "step": 8806 + }, + { + "epoch": 1.5279319916724496, + "grad_norm": 0.9757163524627686, + "learning_rate": 7.994374387692335e-06, + "loss": 0.6528, + "step": 8807 + }, + { + "epoch": 1.5281054823039555, + "grad_norm": 0.8212743401527405, + "learning_rate": 7.98892591694419e-06, + "loss": 0.6946, + "step": 8808 + }, + { + "epoch": 1.5282789729354613, + "grad_norm": 0.676810085773468, + "learning_rate": 7.98347884009519e-06, + "loss": 0.7683, + "step": 8809 + }, + { + "epoch": 1.5284524635669674, + "grad_norm": 1.0326652526855469, + "learning_rate": 7.978033157777473e-06, + "loss": 0.5833, + "step": 8810 + }, + { + "epoch": 1.5286259541984732, + "grad_norm": 0.9548013806343079, + "learning_rate": 7.97258887062301e-06, + "loss": 0.6178, + "step": 8811 + }, + { + "epoch": 1.5287994448299793, + "grad_norm": 0.9476114511489868, + "learning_rate": 7.967145979263637e-06, + "loss": 0.6844, + "step": 8812 + }, + { + "epoch": 1.5289729354614852, + "grad_norm": 1.03043794631958, + "learning_rate": 7.961704484330979e-06, + "loss": 0.7412, + "step": 8813 + }, + { + "epoch": 1.529146426092991, + "grad_norm": 1.3412387371063232, + "learning_rate": 7.956264386456551e-06, + "loss": 0.6367, + "step": 8814 + }, + { + "epoch": 1.5293199167244969, + "grad_norm": 3.4852521419525146, + "learning_rate": 7.950825686271674e-06, + "loss": 0.6411, + "step": 8815 + }, + { + "epoch": 1.5294934073560027, + "grad_norm": 1.1809844970703125, + "learning_rate": 7.945388384407518e-06, + "loss": 0.7676, + "step": 8816 + }, + { + "epoch": 1.5296668979875085, + "grad_norm": 0.7197659611701965, + "learning_rate": 7.939952481495076e-06, + "loss": 0.8541, + "step": 8817 + }, + { + "epoch": 1.5298403886190146, + "grad_norm": 1.0437402725219727, + "learning_rate": 7.934517978165211e-06, + "loss": 0.678, + "step": 8818 + }, + { + "epoch": 1.5300138792505205, + "grad_norm": 1.5622807741165161, + "learning_rate": 7.929084875048594e-06, + "loss": 0.6978, + "step": 8819 + }, + { + "epoch": 1.5301873698820265, + "grad_norm": 1.4501808881759644, + "learning_rate": 7.92365317277574e-06, + "loss": 0.6168, + "step": 8820 + }, + { + "epoch": 1.5303608605135324, + "grad_norm": 0.8383119106292725, + "learning_rate": 7.918222871977004e-06, + "loss": 0.6605, + "step": 8821 + }, + { + "epoch": 1.5305343511450382, + "grad_norm": 1.1208136081695557, + "learning_rate": 7.912793973282584e-06, + "loss": 0.5607, + "step": 8822 + }, + { + "epoch": 1.530707841776544, + "grad_norm": 0.6978111267089844, + "learning_rate": 7.90736647732251e-06, + "loss": 0.7837, + "step": 8823 + }, + { + "epoch": 1.53088133240805, + "grad_norm": 1.1811100244522095, + "learning_rate": 7.90194038472665e-06, + "loss": 0.5847, + "step": 8824 + }, + { + "epoch": 1.5310548230395558, + "grad_norm": 1.2415465116500854, + "learning_rate": 7.896515696124703e-06, + "loss": 0.6652, + "step": 8825 + }, + { + "epoch": 1.5312283136710616, + "grad_norm": 1.098953127861023, + "learning_rate": 7.891092412146204e-06, + "loss": 0.5548, + "step": 8826 + }, + { + "epoch": 1.5314018043025677, + "grad_norm": 0.8580667972564697, + "learning_rate": 7.885670533420548e-06, + "loss": 0.7411, + "step": 8827 + }, + { + "epoch": 1.5315752949340735, + "grad_norm": 0.6890743970870972, + "learning_rate": 7.880250060576943e-06, + "loss": 0.7562, + "step": 8828 + }, + { + "epoch": 1.5317487855655796, + "grad_norm": 0.8937185406684875, + "learning_rate": 7.874830994244443e-06, + "loss": 0.7318, + "step": 8829 + }, + { + "epoch": 1.5319222761970854, + "grad_norm": 0.9878870248794556, + "learning_rate": 7.869413335051926e-06, + "loss": 0.6395, + "step": 8830 + }, + { + "epoch": 1.5320957668285913, + "grad_norm": 1.2929092645645142, + "learning_rate": 7.863997083628139e-06, + "loss": 0.6067, + "step": 8831 + }, + { + "epoch": 1.5322692574600971, + "grad_norm": 0.9931074976921082, + "learning_rate": 7.85858224060162e-06, + "loss": 0.7227, + "step": 8832 + }, + { + "epoch": 1.532442748091603, + "grad_norm": 1.0290453433990479, + "learning_rate": 7.853168806600784e-06, + "loss": 0.5377, + "step": 8833 + }, + { + "epoch": 1.5326162387231088, + "grad_norm": 0.875695526599884, + "learning_rate": 7.847756782253864e-06, + "loss": 0.8025, + "step": 8834 + }, + { + "epoch": 1.5327897293546149, + "grad_norm": 1.4935286045074463, + "learning_rate": 7.842346168188927e-06, + "loss": 0.6946, + "step": 8835 + }, + { + "epoch": 1.5329632199861207, + "grad_norm": 1.0511342287063599, + "learning_rate": 7.836936965033881e-06, + "loss": 0.5613, + "step": 8836 + }, + { + "epoch": 1.5331367106176268, + "grad_norm": 0.7893831729888916, + "learning_rate": 7.831529173416476e-06, + "loss": 0.6268, + "step": 8837 + }, + { + "epoch": 1.5333102012491326, + "grad_norm": 0.8685739636421204, + "learning_rate": 7.826122793964293e-06, + "loss": 0.662, + "step": 8838 + }, + { + "epoch": 1.5334836918806385, + "grad_norm": 0.9125450849533081, + "learning_rate": 7.820717827304743e-06, + "loss": 0.6069, + "step": 8839 + }, + { + "epoch": 1.5336571825121443, + "grad_norm": 0.7881751656532288, + "learning_rate": 7.815314274065074e-06, + "loss": 0.8315, + "step": 8840 + }, + { + "epoch": 1.5338306731436502, + "grad_norm": 1.2757236957550049, + "learning_rate": 7.809912134872395e-06, + "loss": 0.6788, + "step": 8841 + }, + { + "epoch": 1.534004163775156, + "grad_norm": 0.8940508365631104, + "learning_rate": 7.804511410353603e-06, + "loss": 0.5896, + "step": 8842 + }, + { + "epoch": 1.534177654406662, + "grad_norm": 0.7592052221298218, + "learning_rate": 7.79911210113548e-06, + "loss": 0.7578, + "step": 8843 + }, + { + "epoch": 1.534351145038168, + "grad_norm": 1.0211981534957886, + "learning_rate": 7.793714207844616e-06, + "loss": 0.6644, + "step": 8844 + }, + { + "epoch": 1.5345246356696738, + "grad_norm": 2.571890115737915, + "learning_rate": 7.788317731107431e-06, + "loss": 0.5352, + "step": 8845 + }, + { + "epoch": 1.5346981263011799, + "grad_norm": 0.7306288480758667, + "learning_rate": 7.782922671550213e-06, + "loss": 0.8354, + "step": 8846 + }, + { + "epoch": 1.5348716169326857, + "grad_norm": 0.8432573080062866, + "learning_rate": 7.777529029799053e-06, + "loss": 0.8503, + "step": 8847 + }, + { + "epoch": 1.5350451075641915, + "grad_norm": 0.7805939316749573, + "learning_rate": 7.772136806479891e-06, + "loss": 0.8141, + "step": 8848 + }, + { + "epoch": 1.5352185981956974, + "grad_norm": 1.159404993057251, + "learning_rate": 7.766746002218495e-06, + "loss": 0.8174, + "step": 8849 + }, + { + "epoch": 1.5353920888272032, + "grad_norm": 1.027047872543335, + "learning_rate": 7.761356617640485e-06, + "loss": 0.6783, + "step": 8850 + }, + { + "epoch": 1.535565579458709, + "grad_norm": 0.8267747759819031, + "learning_rate": 7.755968653371302e-06, + "loss": 0.6926, + "step": 8851 + }, + { + "epoch": 1.5357390700902152, + "grad_norm": 1.1061229705810547, + "learning_rate": 7.750582110036225e-06, + "loss": 0.6437, + "step": 8852 + }, + { + "epoch": 1.535912560721721, + "grad_norm": 0.782421350479126, + "learning_rate": 7.745196988260357e-06, + "loss": 0.6613, + "step": 8853 + }, + { + "epoch": 1.536086051353227, + "grad_norm": 1.2259498834609985, + "learning_rate": 7.739813288668677e-06, + "loss": 0.5541, + "step": 8854 + }, + { + "epoch": 1.536259541984733, + "grad_norm": 0.923143208026886, + "learning_rate": 7.734431011885932e-06, + "loss": 0.7622, + "step": 8855 + }, + { + "epoch": 1.5364330326162388, + "grad_norm": 0.7383148074150085, + "learning_rate": 7.72905015853677e-06, + "loss": 0.7964, + "step": 8856 + }, + { + "epoch": 1.5366065232477446, + "grad_norm": 1.0290781259536743, + "learning_rate": 7.723670729245636e-06, + "loss": 0.7681, + "step": 8857 + }, + { + "epoch": 1.5367800138792505, + "grad_norm": 4.370320796966553, + "learning_rate": 7.718292724636815e-06, + "loss": 0.8884, + "step": 8858 + }, + { + "epoch": 1.5369535045107563, + "grad_norm": 0.7299044728279114, + "learning_rate": 7.712916145334432e-06, + "loss": 0.9216, + "step": 8859 + }, + { + "epoch": 1.5371269951422624, + "grad_norm": 1.1130975484848022, + "learning_rate": 7.70754099196246e-06, + "loss": 0.8057, + "step": 8860 + }, + { + "epoch": 1.5373004857737682, + "grad_norm": 1.4193140268325806, + "learning_rate": 7.702167265144664e-06, + "loss": 0.5905, + "step": 8861 + }, + { + "epoch": 1.5374739764052743, + "grad_norm": 1.094899296760559, + "learning_rate": 7.696794965504695e-06, + "loss": 0.494, + "step": 8862 + }, + { + "epoch": 1.5376474670367801, + "grad_norm": 1.0338526964187622, + "learning_rate": 7.691424093666007e-06, + "loss": 0.7278, + "step": 8863 + }, + { + "epoch": 1.537820957668286, + "grad_norm": 0.8817349672317505, + "learning_rate": 7.686054650251893e-06, + "loss": 0.812, + "step": 8864 + }, + { + "epoch": 1.5379944482997918, + "grad_norm": 0.7624608278274536, + "learning_rate": 7.68068663588548e-06, + "loss": 0.762, + "step": 8865 + }, + { + "epoch": 1.5381679389312977, + "grad_norm": 0.7601727843284607, + "learning_rate": 7.675320051189746e-06, + "loss": 0.7993, + "step": 8866 + }, + { + "epoch": 1.5383414295628035, + "grad_norm": 1.802462100982666, + "learning_rate": 7.66995489678748e-06, + "loss": 0.6775, + "step": 8867 + }, + { + "epoch": 1.5385149201943094, + "grad_norm": 1.270880103111267, + "learning_rate": 7.664591173301315e-06, + "loss": 0.8069, + "step": 8868 + }, + { + "epoch": 1.5386884108258154, + "grad_norm": 0.6767659187316895, + "learning_rate": 7.65922888135371e-06, + "loss": 0.5553, + "step": 8869 + }, + { + "epoch": 1.5388619014573213, + "grad_norm": 0.6147394180297852, + "learning_rate": 7.65386802156698e-06, + "loss": 0.7832, + "step": 8870 + }, + { + "epoch": 1.5390353920888273, + "grad_norm": 0.7871049642562866, + "learning_rate": 7.648508594563256e-06, + "loss": 0.8624, + "step": 8871 + }, + { + "epoch": 1.5392088827203332, + "grad_norm": 0.9996743202209473, + "learning_rate": 7.64315060096449e-06, + "loss": 0.7444, + "step": 8872 + }, + { + "epoch": 1.539382373351839, + "grad_norm": 1.1132068634033203, + "learning_rate": 7.637794041392512e-06, + "loss": 0.6979, + "step": 8873 + }, + { + "epoch": 1.5395558639833449, + "grad_norm": 1.3156222105026245, + "learning_rate": 7.632438916468928e-06, + "loss": 0.8252, + "step": 8874 + }, + { + "epoch": 1.5397293546148507, + "grad_norm": 1.0022774934768677, + "learning_rate": 7.627085226815223e-06, + "loss": 0.6613, + "step": 8875 + }, + { + "epoch": 1.5399028452463566, + "grad_norm": 1.2532445192337036, + "learning_rate": 7.621732973052696e-06, + "loss": 0.6858, + "step": 8876 + }, + { + "epoch": 1.5400763358778626, + "grad_norm": 0.9732064008712769, + "learning_rate": 7.616382155802482e-06, + "loss": 0.5586, + "step": 8877 + }, + { + "epoch": 1.5402498265093685, + "grad_norm": 0.9320505261421204, + "learning_rate": 7.611032775685541e-06, + "loss": 0.6903, + "step": 8878 + }, + { + "epoch": 1.5404233171408745, + "grad_norm": 1.0466597080230713, + "learning_rate": 7.605684833322686e-06, + "loss": 0.6698, + "step": 8879 + }, + { + "epoch": 1.5405968077723804, + "grad_norm": 1.1103706359863281, + "learning_rate": 7.600338329334554e-06, + "loss": 0.709, + "step": 8880 + }, + { + "epoch": 1.5407702984038862, + "grad_norm": 0.77680903673172, + "learning_rate": 7.5949932643416036e-06, + "loss": 0.7924, + "step": 8881 + }, + { + "epoch": 1.540943789035392, + "grad_norm": 1.2111412286758423, + "learning_rate": 7.5896496389641336e-06, + "loss": 0.621, + "step": 8882 + }, + { + "epoch": 1.541117279666898, + "grad_norm": 0.8593001365661621, + "learning_rate": 7.584307453822297e-06, + "loss": 0.7542, + "step": 8883 + }, + { + "epoch": 1.5412907702984038, + "grad_norm": 0.7281240820884705, + "learning_rate": 7.5789667095360355e-06, + "loss": 0.6486, + "step": 8884 + }, + { + "epoch": 1.5414642609299096, + "grad_norm": 0.8426680564880371, + "learning_rate": 7.5736274067251655e-06, + "loss": 0.6427, + "step": 8885 + }, + { + "epoch": 1.5416377515614157, + "grad_norm": 1.1195613145828247, + "learning_rate": 7.568289546009316e-06, + "loss": 0.588, + "step": 8886 + }, + { + "epoch": 1.5418112421929215, + "grad_norm": 1.247865915298462, + "learning_rate": 7.562953128007951e-06, + "loss": 0.7603, + "step": 8887 + }, + { + "epoch": 1.5419847328244276, + "grad_norm": 1.263616919517517, + "learning_rate": 7.557618153340358e-06, + "loss": 0.4995, + "step": 8888 + }, + { + "epoch": 1.5421582234559335, + "grad_norm": 0.8810479640960693, + "learning_rate": 7.5522846226256855e-06, + "loss": 0.8319, + "step": 8889 + }, + { + "epoch": 1.5423317140874393, + "grad_norm": 1.1706221103668213, + "learning_rate": 7.546952536482888e-06, + "loss": 0.7649, + "step": 8890 + }, + { + "epoch": 1.5425052047189451, + "grad_norm": 1.1847307682037354, + "learning_rate": 7.541621895530757e-06, + "loss": 0.6907, + "step": 8891 + }, + { + "epoch": 1.542678695350451, + "grad_norm": 0.8313330411911011, + "learning_rate": 7.536292700387924e-06, + "loss": 0.5651, + "step": 8892 + }, + { + "epoch": 1.5428521859819568, + "grad_norm": 1.132016897201538, + "learning_rate": 7.530964951672836e-06, + "loss": 0.5862, + "step": 8893 + }, + { + "epoch": 1.543025676613463, + "grad_norm": 1.0026733875274658, + "learning_rate": 7.5256386500038055e-06, + "loss": 0.6355, + "step": 8894 + }, + { + "epoch": 1.5431991672449688, + "grad_norm": 0.8432292938232422, + "learning_rate": 7.520313795998943e-06, + "loss": 0.748, + "step": 8895 + }, + { + "epoch": 1.5433726578764748, + "grad_norm": 0.8612122535705566, + "learning_rate": 7.5149903902762066e-06, + "loss": 0.769, + "step": 8896 + }, + { + "epoch": 1.5435461485079807, + "grad_norm": 0.9496098756790161, + "learning_rate": 7.5096684334533765e-06, + "loss": 0.5841, + "step": 8897 + }, + { + "epoch": 1.5437196391394865, + "grad_norm": 0.7324972748756409, + "learning_rate": 7.504347926148086e-06, + "loss": 0.731, + "step": 8898 + }, + { + "epoch": 1.5438931297709924, + "grad_norm": 0.8037887811660767, + "learning_rate": 7.499028868977778e-06, + "loss": 0.6959, + "step": 8899 + }, + { + "epoch": 1.5440666204024982, + "grad_norm": 1.0163054466247559, + "learning_rate": 7.4937112625597375e-06, + "loss": 0.5653, + "step": 8900 + }, + { + "epoch": 1.544240111034004, + "grad_norm": 0.8873172998428345, + "learning_rate": 7.48839510751107e-06, + "loss": 0.8037, + "step": 8901 + }, + { + "epoch": 1.5444136016655101, + "grad_norm": 0.7139516472816467, + "learning_rate": 7.483080404448744e-06, + "loss": 0.7993, + "step": 8902 + }, + { + "epoch": 1.544587092297016, + "grad_norm": 0.8036996722221375, + "learning_rate": 7.477767153989508e-06, + "loss": 0.6732, + "step": 8903 + }, + { + "epoch": 1.5447605829285218, + "grad_norm": 0.90054851770401, + "learning_rate": 7.472455356749992e-06, + "loss": 0.616, + "step": 8904 + }, + { + "epoch": 1.5449340735600279, + "grad_norm": 1.945794701576233, + "learning_rate": 7.4671450133466285e-06, + "loss": 0.5958, + "step": 8905 + }, + { + "epoch": 1.5451075641915337, + "grad_norm": 1.0604478120803833, + "learning_rate": 7.461836124395692e-06, + "loss": 0.5977, + "step": 8906 + }, + { + "epoch": 1.5452810548230396, + "grad_norm": 0.6592944264411926, + "learning_rate": 7.456528690513274e-06, + "loss": 0.7161, + "step": 8907 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.7081552147865295, + "learning_rate": 7.451222712315325e-06, + "loss": 0.7394, + "step": 8908 + }, + { + "epoch": 1.5456280360860513, + "grad_norm": 0.8260698914527893, + "learning_rate": 7.445918190417603e-06, + "loss": 0.6823, + "step": 8909 + }, + { + "epoch": 1.545801526717557, + "grad_norm": 1.5092754364013672, + "learning_rate": 7.440615125435702e-06, + "loss": 0.7458, + "step": 8910 + }, + { + "epoch": 1.5459750173490632, + "grad_norm": 1.3839350938796997, + "learning_rate": 7.435313517985043e-06, + "loss": 0.741, + "step": 8911 + }, + { + "epoch": 1.546148507980569, + "grad_norm": 0.9526405334472656, + "learning_rate": 7.430013368680908e-06, + "loss": 0.5691, + "step": 8912 + }, + { + "epoch": 1.546321998612075, + "grad_norm": 0.8952462673187256, + "learning_rate": 7.424714678138351e-06, + "loss": 0.6559, + "step": 8913 + }, + { + "epoch": 1.546495489243581, + "grad_norm": 1.086256742477417, + "learning_rate": 7.419417446972319e-06, + "loss": 0.6089, + "step": 8914 + }, + { + "epoch": 1.5466689798750868, + "grad_norm": 1.0725758075714111, + "learning_rate": 7.41412167579755e-06, + "loss": 0.8333, + "step": 8915 + }, + { + "epoch": 1.5468424705065926, + "grad_norm": 0.8001101613044739, + "learning_rate": 7.408827365228625e-06, + "loss": 0.5299, + "step": 8916 + }, + { + "epoch": 1.5470159611380985, + "grad_norm": 1.617409110069275, + "learning_rate": 7.403534515879951e-06, + "loss": 0.5925, + "step": 8917 + }, + { + "epoch": 1.5471894517696043, + "grad_norm": 0.9956353306770325, + "learning_rate": 7.3982431283657805e-06, + "loss": 0.7517, + "step": 8918 + }, + { + "epoch": 1.5473629424011104, + "grad_norm": 0.8451547026634216, + "learning_rate": 7.392953203300177e-06, + "loss": 0.7803, + "step": 8919 + }, + { + "epoch": 1.5475364330326162, + "grad_norm": 0.7982496619224548, + "learning_rate": 7.38766474129704e-06, + "loss": 0.6249, + "step": 8920 + }, + { + "epoch": 1.5477099236641223, + "grad_norm": 0.986893892288208, + "learning_rate": 7.3823777429701125e-06, + "loss": 0.6953, + "step": 8921 + }, + { + "epoch": 1.5478834142956281, + "grad_norm": 0.8023416996002197, + "learning_rate": 7.37709220893295e-06, + "loss": 0.8174, + "step": 8922 + }, + { + "epoch": 1.548056904927134, + "grad_norm": 1.186764121055603, + "learning_rate": 7.371808139798944e-06, + "loss": 0.6449, + "step": 8923 + }, + { + "epoch": 1.5482303955586398, + "grad_norm": 0.8557913303375244, + "learning_rate": 7.3665255361813125e-06, + "loss": 0.6561, + "step": 8924 + }, + { + "epoch": 1.5484038861901457, + "grad_norm": 0.8289293050765991, + "learning_rate": 7.361244398693128e-06, + "loss": 0.7185, + "step": 8925 + }, + { + "epoch": 1.5485773768216515, + "grad_norm": 0.7416999936103821, + "learning_rate": 7.355964727947242e-06, + "loss": 0.6239, + "step": 8926 + }, + { + "epoch": 1.5487508674531574, + "grad_norm": 0.9505546689033508, + "learning_rate": 7.350686524556392e-06, + "loss": 0.5931, + "step": 8927 + }, + { + "epoch": 1.5489243580846634, + "grad_norm": 0.9984015822410583, + "learning_rate": 7.3454097891331085e-06, + "loss": 0.6534, + "step": 8928 + }, + { + "epoch": 1.5490978487161693, + "grad_norm": 1.050493597984314, + "learning_rate": 7.340134522289766e-06, + "loss": 0.7991, + "step": 8929 + }, + { + "epoch": 1.5492713393476754, + "grad_norm": 1.1233447790145874, + "learning_rate": 7.334860724638555e-06, + "loss": 0.6741, + "step": 8930 + }, + { + "epoch": 1.5494448299791812, + "grad_norm": 0.9462011456489563, + "learning_rate": 7.329588396791529e-06, + "loss": 0.6289, + "step": 8931 + }, + { + "epoch": 1.549618320610687, + "grad_norm": 0.9420047998428345, + "learning_rate": 7.3243175393605215e-06, + "loss": 0.8145, + "step": 8932 + }, + { + "epoch": 1.549791811242193, + "grad_norm": 1.451534390449524, + "learning_rate": 7.31904815295724e-06, + "loss": 0.8005, + "step": 8933 + }, + { + "epoch": 1.5499653018736987, + "grad_norm": 2.010936975479126, + "learning_rate": 7.313780238193195e-06, + "loss": 0.5424, + "step": 8934 + }, + { + "epoch": 1.5501387925052046, + "grad_norm": 1.735575556755066, + "learning_rate": 7.308513795679737e-06, + "loss": 0.7505, + "step": 8935 + }, + { + "epoch": 1.5503122831367107, + "grad_norm": 2.088411808013916, + "learning_rate": 7.303248826028036e-06, + "loss": 0.6088, + "step": 8936 + }, + { + "epoch": 1.5504857737682165, + "grad_norm": 0.7258575558662415, + "learning_rate": 7.297985329849106e-06, + "loss": 0.745, + "step": 8937 + }, + { + "epoch": 1.5506592643997226, + "grad_norm": 1.0114256143569946, + "learning_rate": 7.292723307753784e-06, + "loss": 0.7374, + "step": 8938 + }, + { + "epoch": 1.5508327550312284, + "grad_norm": 0.9837453961372375, + "learning_rate": 7.287462760352726e-06, + "loss": 0.579, + "step": 8939 + }, + { + "epoch": 1.5510062456627343, + "grad_norm": 0.7941184043884277, + "learning_rate": 7.282203688256422e-06, + "loss": 0.7795, + "step": 8940 + }, + { + "epoch": 1.55117973629424, + "grad_norm": 1.321887731552124, + "learning_rate": 7.276946092075205e-06, + "loss": 0.5647, + "step": 8941 + }, + { + "epoch": 1.551353226925746, + "grad_norm": 1.3103902339935303, + "learning_rate": 7.27168997241922e-06, + "loss": 0.5791, + "step": 8942 + }, + { + "epoch": 1.5515267175572518, + "grad_norm": 0.8666737675666809, + "learning_rate": 7.266435329898447e-06, + "loss": 0.5795, + "step": 8943 + }, + { + "epoch": 1.5517002081887576, + "grad_norm": 0.8662192821502686, + "learning_rate": 7.261182165122689e-06, + "loss": 0.7161, + "step": 8944 + }, + { + "epoch": 1.5518736988202637, + "grad_norm": 1.5507638454437256, + "learning_rate": 7.255930478701578e-06, + "loss": 0.6234, + "step": 8945 + }, + { + "epoch": 1.5520471894517696, + "grad_norm": 1.4852821826934814, + "learning_rate": 7.250680271244593e-06, + "loss": 0.5947, + "step": 8946 + }, + { + "epoch": 1.5522206800832756, + "grad_norm": 0.8373382687568665, + "learning_rate": 7.245431543361016e-06, + "loss": 0.8455, + "step": 8947 + }, + { + "epoch": 1.5523941707147815, + "grad_norm": 1.6078777313232422, + "learning_rate": 7.240184295659971e-06, + "loss": 0.8049, + "step": 8948 + }, + { + "epoch": 1.5525676613462873, + "grad_norm": 0.7689506411552429, + "learning_rate": 7.234938528750402e-06, + "loss": 0.895, + "step": 8949 + }, + { + "epoch": 1.5527411519777932, + "grad_norm": 0.7702403664588928, + "learning_rate": 7.229694243241097e-06, + "loss": 0.8733, + "step": 8950 + }, + { + "epoch": 1.552914642609299, + "grad_norm": 1.3707280158996582, + "learning_rate": 7.224451439740654e-06, + "loss": 0.5516, + "step": 8951 + }, + { + "epoch": 1.5530881332408049, + "grad_norm": 0.8067935109138489, + "learning_rate": 7.219210118857509e-06, + "loss": 0.7292, + "step": 8952 + }, + { + "epoch": 1.553261623872311, + "grad_norm": 1.3930362462997437, + "learning_rate": 7.213970281199913e-06, + "loss": 0.5172, + "step": 8953 + }, + { + "epoch": 1.5534351145038168, + "grad_norm": 0.8083785772323608, + "learning_rate": 7.208731927375982e-06, + "loss": 0.8018, + "step": 8954 + }, + { + "epoch": 1.5536086051353228, + "grad_norm": 0.8476241827011108, + "learning_rate": 7.203495057993599e-06, + "loss": 0.7255, + "step": 8955 + }, + { + "epoch": 1.5537820957668287, + "grad_norm": 2.1864075660705566, + "learning_rate": 7.198259673660535e-06, + "loss": 0.9241, + "step": 8956 + }, + { + "epoch": 1.5539555863983345, + "grad_norm": 1.280950665473938, + "learning_rate": 7.19302577498435e-06, + "loss": 0.5566, + "step": 8957 + }, + { + "epoch": 1.5541290770298404, + "grad_norm": 1.0391618013381958, + "learning_rate": 7.187793362572451e-06, + "loss": 0.7185, + "step": 8958 + }, + { + "epoch": 1.5543025676613462, + "grad_norm": 1.3404078483581543, + "learning_rate": 7.1825624370320505e-06, + "loss": 0.4917, + "step": 8959 + }, + { + "epoch": 1.554476058292852, + "grad_norm": 0.6567124724388123, + "learning_rate": 7.17733299897023e-06, + "loss": 0.8184, + "step": 8960 + }, + { + "epoch": 1.5546495489243581, + "grad_norm": 0.9664612412452698, + "learning_rate": 7.172105048993842e-06, + "loss": 0.6248, + "step": 8961 + }, + { + "epoch": 1.554823039555864, + "grad_norm": 0.8981587290763855, + "learning_rate": 7.166878587709618e-06, + "loss": 0.7146, + "step": 8962 + }, + { + "epoch": 1.5549965301873698, + "grad_norm": 1.2921290397644043, + "learning_rate": 7.1616536157240865e-06, + "loss": 0.6365, + "step": 8963 + }, + { + "epoch": 1.555170020818876, + "grad_norm": 0.7206228375434875, + "learning_rate": 7.156430133643613e-06, + "loss": 0.7585, + "step": 8964 + }, + { + "epoch": 1.5553435114503817, + "grad_norm": 0.8712290525436401, + "learning_rate": 7.151208142074382e-06, + "loss": 0.7617, + "step": 8965 + }, + { + "epoch": 1.5555170020818876, + "grad_norm": 0.8790315389633179, + "learning_rate": 7.145987641622423e-06, + "loss": 0.6519, + "step": 8966 + }, + { + "epoch": 1.5556904927133934, + "grad_norm": 0.9406765103340149, + "learning_rate": 7.1407686328935774e-06, + "loss": 0.7067, + "step": 8967 + }, + { + "epoch": 1.5558639833448993, + "grad_norm": 0.7513253092765808, + "learning_rate": 7.1355511164935085e-06, + "loss": 0.7748, + "step": 8968 + }, + { + "epoch": 1.5560374739764051, + "grad_norm": 1.0153144598007202, + "learning_rate": 7.130335093027731e-06, + "loss": 0.8123, + "step": 8969 + }, + { + "epoch": 1.5562109646079112, + "grad_norm": 1.0520907640457153, + "learning_rate": 7.125120563101562e-06, + "loss": 0.7463, + "step": 8970 + }, + { + "epoch": 1.556384455239417, + "grad_norm": 0.8306304812431335, + "learning_rate": 7.1199075273201515e-06, + "loss": 0.6813, + "step": 8971 + }, + { + "epoch": 1.556557945870923, + "grad_norm": 0.9629014134407043, + "learning_rate": 7.114695986288476e-06, + "loss": 0.6235, + "step": 8972 + }, + { + "epoch": 1.556731436502429, + "grad_norm": 0.7953219413757324, + "learning_rate": 7.109485940611358e-06, + "loss": 0.5957, + "step": 8973 + }, + { + "epoch": 1.5569049271339348, + "grad_norm": 1.475295901298523, + "learning_rate": 7.104277390893404e-06, + "loss": 0.6823, + "step": 8974 + }, + { + "epoch": 1.5570784177654406, + "grad_norm": 2.02095365524292, + "learning_rate": 7.099070337739094e-06, + "loss": 0.5695, + "step": 8975 + }, + { + "epoch": 1.5572519083969465, + "grad_norm": 0.7603011131286621, + "learning_rate": 7.0938647817527014e-06, + "loss": 0.7896, + "step": 8976 + }, + { + "epoch": 1.5574253990284523, + "grad_norm": 1.1442238092422485, + "learning_rate": 7.088660723538339e-06, + "loss": 0.5852, + "step": 8977 + }, + { + "epoch": 1.5575988896599584, + "grad_norm": 0.8309770226478577, + "learning_rate": 7.083458163699939e-06, + "loss": 0.6573, + "step": 8978 + }, + { + "epoch": 1.5577723802914643, + "grad_norm": 2.594754695892334, + "learning_rate": 7.0782571028412774e-06, + "loss": 0.6372, + "step": 8979 + }, + { + "epoch": 1.5579458709229703, + "grad_norm": 0.6463872790336609, + "learning_rate": 7.073057541565933e-06, + "loss": 0.7036, + "step": 8980 + }, + { + "epoch": 1.5581193615544762, + "grad_norm": 1.4513139724731445, + "learning_rate": 7.067859480477326e-06, + "loss": 0.7407, + "step": 8981 + }, + { + "epoch": 1.558292852185982, + "grad_norm": 0.9616003036499023, + "learning_rate": 7.062662920178689e-06, + "loss": 0.7083, + "step": 8982 + }, + { + "epoch": 1.5584663428174879, + "grad_norm": 0.9408608675003052, + "learning_rate": 7.057467861273106e-06, + "loss": 0.7941, + "step": 8983 + }, + { + "epoch": 1.5586398334489937, + "grad_norm": 0.8922111988067627, + "learning_rate": 7.052274304363449e-06, + "loss": 0.7284, + "step": 8984 + }, + { + "epoch": 1.5588133240804996, + "grad_norm": 1.1501398086547852, + "learning_rate": 7.0470822500524504e-06, + "loss": 0.5786, + "step": 8985 + }, + { + "epoch": 1.5589868147120054, + "grad_norm": 0.9940692782402039, + "learning_rate": 7.041891698942649e-06, + "loss": 0.5452, + "step": 8986 + }, + { + "epoch": 1.5591603053435115, + "grad_norm": 0.8202354907989502, + "learning_rate": 7.036702651636416e-06, + "loss": 0.6997, + "step": 8987 + }, + { + "epoch": 1.5593337959750173, + "grad_norm": 0.7108455300331116, + "learning_rate": 7.03151510873594e-06, + "loss": 0.7666, + "step": 8988 + }, + { + "epoch": 1.5595072866065234, + "grad_norm": 0.859872579574585, + "learning_rate": 7.0263290708432515e-06, + "loss": 0.6577, + "step": 8989 + }, + { + "epoch": 1.5596807772380292, + "grad_norm": 0.940835177898407, + "learning_rate": 7.021144538560194e-06, + "loss": 0.759, + "step": 8990 + }, + { + "epoch": 1.559854267869535, + "grad_norm": 1.2106335163116455, + "learning_rate": 7.015961512488434e-06, + "loss": 0.7466, + "step": 8991 + }, + { + "epoch": 1.560027758501041, + "grad_norm": 1.15566086769104, + "learning_rate": 7.010779993229471e-06, + "loss": 0.6023, + "step": 8992 + }, + { + "epoch": 1.5602012491325468, + "grad_norm": 0.9566821455955505, + "learning_rate": 7.005599981384618e-06, + "loss": 0.7896, + "step": 8993 + }, + { + "epoch": 1.5603747397640526, + "grad_norm": 0.9018764495849609, + "learning_rate": 7.000421477555038e-06, + "loss": 0.7643, + "step": 8994 + }, + { + "epoch": 1.5605482303955587, + "grad_norm": 1.6827456951141357, + "learning_rate": 6.9952444823416894e-06, + "loss": 0.5391, + "step": 8995 + }, + { + "epoch": 1.5607217210270645, + "grad_norm": 0.7773724794387817, + "learning_rate": 6.9900689963453734e-06, + "loss": 0.6917, + "step": 8996 + }, + { + "epoch": 1.5608952116585706, + "grad_norm": 0.8962863087654114, + "learning_rate": 6.9848950201667045e-06, + "loss": 0.6909, + "step": 8997 + }, + { + "epoch": 1.5610687022900764, + "grad_norm": 0.7415764331817627, + "learning_rate": 6.9797225544061385e-06, + "loss": 0.6575, + "step": 8998 + }, + { + "epoch": 1.5612421929215823, + "grad_norm": 0.9435992240905762, + "learning_rate": 6.974551599663944e-06, + "loss": 0.6608, + "step": 8999 + }, + { + "epoch": 1.5614156835530881, + "grad_norm": 0.9136607646942139, + "learning_rate": 6.969382156540212e-06, + "loss": 0.6946, + "step": 9000 + }, + { + "epoch": 1.561589174184594, + "grad_norm": 1.271981120109558, + "learning_rate": 6.96421422563486e-06, + "loss": 0.5963, + "step": 9001 + }, + { + "epoch": 1.5617626648160998, + "grad_norm": 1.347968578338623, + "learning_rate": 6.9590478075476475e-06, + "loss": 0.609, + "step": 9002 + }, + { + "epoch": 1.5619361554476057, + "grad_norm": 0.8680453300476074, + "learning_rate": 6.953882902878122e-06, + "loss": 0.6904, + "step": 9003 + }, + { + "epoch": 1.5621096460791117, + "grad_norm": 0.7330612540245056, + "learning_rate": 6.9487195122256925e-06, + "loss": 0.8225, + "step": 9004 + }, + { + "epoch": 1.5622831367106176, + "grad_norm": 0.7182332873344421, + "learning_rate": 6.943557636189571e-06, + "loss": 0.7983, + "step": 9005 + }, + { + "epoch": 1.5624566273421236, + "grad_norm": 0.8386569023132324, + "learning_rate": 6.9383972753688e-06, + "loss": 0.7073, + "step": 9006 + }, + { + "epoch": 1.5626301179736295, + "grad_norm": 1.6388517618179321, + "learning_rate": 6.933238430362239e-06, + "loss": 0.5768, + "step": 9007 + }, + { + "epoch": 1.5628036086051353, + "grad_norm": 1.9835724830627441, + "learning_rate": 6.928081101768589e-06, + "loss": 0.5692, + "step": 9008 + }, + { + "epoch": 1.5629770992366412, + "grad_norm": 0.8341009020805359, + "learning_rate": 6.922925290186362e-06, + "loss": 0.6757, + "step": 9009 + }, + { + "epoch": 1.563150589868147, + "grad_norm": 1.0076158046722412, + "learning_rate": 6.9177709962138905e-06, + "loss": 0.6613, + "step": 9010 + }, + { + "epoch": 1.5633240804996529, + "grad_norm": 0.7925286889076233, + "learning_rate": 6.912618220449332e-06, + "loss": 0.75, + "step": 9011 + }, + { + "epoch": 1.563497571131159, + "grad_norm": 1.835541009902954, + "learning_rate": 6.907466963490692e-06, + "loss": 0.6995, + "step": 9012 + }, + { + "epoch": 1.5636710617626648, + "grad_norm": 0.7670965790748596, + "learning_rate": 6.9023172259357555e-06, + "loss": 0.726, + "step": 9013 + }, + { + "epoch": 1.5638445523941709, + "grad_norm": 1.1195290088653564, + "learning_rate": 6.897169008382172e-06, + "loss": 0.5858, + "step": 9014 + }, + { + "epoch": 1.5640180430256767, + "grad_norm": 0.804973840713501, + "learning_rate": 6.892022311427393e-06, + "loss": 0.7156, + "step": 9015 + }, + { + "epoch": 1.5641915336571826, + "grad_norm": 1.437686562538147, + "learning_rate": 6.88687713566869e-06, + "loss": 0.6764, + "step": 9016 + }, + { + "epoch": 1.5643650242886884, + "grad_norm": 1.1120517253875732, + "learning_rate": 6.881733481703186e-06, + "loss": 0.6418, + "step": 9017 + }, + { + "epoch": 1.5645385149201942, + "grad_norm": 1.1453386545181274, + "learning_rate": 6.876591350127795e-06, + "loss": 0.5649, + "step": 9018 + }, + { + "epoch": 1.5647120055517, + "grad_norm": 1.4167479276657104, + "learning_rate": 6.87145074153927e-06, + "loss": 0.6584, + "step": 9019 + }, + { + "epoch": 1.5648854961832062, + "grad_norm": 0.9022642970085144, + "learning_rate": 6.866311656534177e-06, + "loss": 0.7711, + "step": 9020 + }, + { + "epoch": 1.565058986814712, + "grad_norm": 1.0740294456481934, + "learning_rate": 6.861174095708934e-06, + "loss": 0.7244, + "step": 9021 + }, + { + "epoch": 1.5652324774462179, + "grad_norm": 1.423272967338562, + "learning_rate": 6.856038059659731e-06, + "loss": 0.7241, + "step": 9022 + }, + { + "epoch": 1.565405968077724, + "grad_norm": 0.9200296401977539, + "learning_rate": 6.850903548982637e-06, + "loss": 0.7173, + "step": 9023 + }, + { + "epoch": 1.5655794587092298, + "grad_norm": 0.8293949365615845, + "learning_rate": 6.8457705642734994e-06, + "loss": 0.527, + "step": 9024 + }, + { + "epoch": 1.5657529493407356, + "grad_norm": 1.3087120056152344, + "learning_rate": 6.840639106128031e-06, + "loss": 0.5292, + "step": 9025 + }, + { + "epoch": 1.5659264399722415, + "grad_norm": 0.9679762125015259, + "learning_rate": 6.835509175141713e-06, + "loss": 0.645, + "step": 9026 + }, + { + "epoch": 1.5660999306037473, + "grad_norm": 0.8624405264854431, + "learning_rate": 6.830380771909901e-06, + "loss": 0.6339, + "step": 9027 + }, + { + "epoch": 1.5662734212352531, + "grad_norm": 0.8317779302597046, + "learning_rate": 6.825253897027746e-06, + "loss": 0.7982, + "step": 9028 + }, + { + "epoch": 1.5664469118667592, + "grad_norm": 0.9171637296676636, + "learning_rate": 6.82012855109023e-06, + "loss": 0.692, + "step": 9029 + }, + { + "epoch": 1.566620402498265, + "grad_norm": 0.7766134142875671, + "learning_rate": 6.815004734692146e-06, + "loss": 0.7861, + "step": 9030 + }, + { + "epoch": 1.5667938931297711, + "grad_norm": 0.7812772989273071, + "learning_rate": 6.8098824484281375e-06, + "loss": 0.4916, + "step": 9031 + }, + { + "epoch": 1.566967383761277, + "grad_norm": 1.7976561784744263, + "learning_rate": 6.804761692892627e-06, + "loss": 0.7915, + "step": 9032 + }, + { + "epoch": 1.5671408743927828, + "grad_norm": 1.1417678594589233, + "learning_rate": 6.799642468679908e-06, + "loss": 0.62, + "step": 9033 + }, + { + "epoch": 1.5673143650242887, + "grad_norm": 1.4384663105010986, + "learning_rate": 6.794524776384059e-06, + "loss": 0.603, + "step": 9034 + }, + { + "epoch": 1.5674878556557945, + "grad_norm": 0.870684802532196, + "learning_rate": 6.7894086165989985e-06, + "loss": 0.7181, + "step": 9035 + }, + { + "epoch": 1.5676613462873004, + "grad_norm": 0.9034882187843323, + "learning_rate": 6.784293989918454e-06, + "loss": 0.6393, + "step": 9036 + }, + { + "epoch": 1.5678348369188064, + "grad_norm": 0.9593498706817627, + "learning_rate": 6.779180896935997e-06, + "loss": 0.7438, + "step": 9037 + }, + { + "epoch": 1.5680083275503123, + "grad_norm": 1.0753164291381836, + "learning_rate": 6.774069338245002e-06, + "loss": 0.655, + "step": 9038 + }, + { + "epoch": 1.5681818181818183, + "grad_norm": 0.8681920766830444, + "learning_rate": 6.7689593144386745e-06, + "loss": 0.7371, + "step": 9039 + }, + { + "epoch": 1.5683553088133242, + "grad_norm": 1.0526624917984009, + "learning_rate": 6.763850826110025e-06, + "loss": 0.6118, + "step": 9040 + }, + { + "epoch": 1.56852879944483, + "grad_norm": 0.8068903088569641, + "learning_rate": 6.758743873851921e-06, + "loss": 0.7302, + "step": 9041 + }, + { + "epoch": 1.5687022900763359, + "grad_norm": 1.1422016620635986, + "learning_rate": 6.753638458257017e-06, + "loss": 0.7683, + "step": 9042 + }, + { + "epoch": 1.5688757807078417, + "grad_norm": 1.0817248821258545, + "learning_rate": 6.748534579917807e-06, + "loss": 0.6255, + "step": 9043 + }, + { + "epoch": 1.5690492713393476, + "grad_norm": 0.8376688361167908, + "learning_rate": 6.743432239426599e-06, + "loss": 0.551, + "step": 9044 + }, + { + "epoch": 1.5692227619708534, + "grad_norm": 0.7223806977272034, + "learning_rate": 6.7383314373755184e-06, + "loss": 0.7322, + "step": 9045 + }, + { + "epoch": 1.5693962526023595, + "grad_norm": 0.9871660470962524, + "learning_rate": 6.733232174356537e-06, + "loss": 0.6602, + "step": 9046 + }, + { + "epoch": 1.5695697432338653, + "grad_norm": 1.067580223083496, + "learning_rate": 6.728134450961419e-06, + "loss": 0.522, + "step": 9047 + }, + { + "epoch": 1.5697432338653714, + "grad_norm": 1.1022852659225464, + "learning_rate": 6.723038267781763e-06, + "loss": 0.7184, + "step": 9048 + }, + { + "epoch": 1.5699167244968772, + "grad_norm": 0.9337177276611328, + "learning_rate": 6.71794362540898e-06, + "loss": 0.8271, + "step": 9049 + }, + { + "epoch": 1.570090215128383, + "grad_norm": 0.9357997179031372, + "learning_rate": 6.712850524434329e-06, + "loss": 0.8191, + "step": 9050 + }, + { + "epoch": 1.570263705759889, + "grad_norm": 0.9036109447479248, + "learning_rate": 6.707758965448843e-06, + "loss": 0.8176, + "step": 9051 + }, + { + "epoch": 1.5704371963913948, + "grad_norm": 1.1329014301300049, + "learning_rate": 6.7026689490434275e-06, + "loss": 0.5903, + "step": 9052 + }, + { + "epoch": 1.5706106870229006, + "grad_norm": 0.9743953943252563, + "learning_rate": 6.6975804758087645e-06, + "loss": 0.7936, + "step": 9053 + }, + { + "epoch": 1.5707841776544067, + "grad_norm": 1.087054967880249, + "learning_rate": 6.692493546335404e-06, + "loss": 0.6456, + "step": 9054 + }, + { + "epoch": 1.5709576682859125, + "grad_norm": 0.8859637379646301, + "learning_rate": 6.687408161213657e-06, + "loss": 0.651, + "step": 9055 + }, + { + "epoch": 1.5711311589174186, + "grad_norm": 0.94200599193573, + "learning_rate": 6.682324321033715e-06, + "loss": 0.652, + "step": 9056 + }, + { + "epoch": 1.5713046495489245, + "grad_norm": 0.8629254698753357, + "learning_rate": 6.677242026385553e-06, + "loss": 0.7032, + "step": 9057 + }, + { + "epoch": 1.5714781401804303, + "grad_norm": 1.1218312978744507, + "learning_rate": 6.672161277858977e-06, + "loss": 0.6304, + "step": 9058 + }, + { + "epoch": 1.5716516308119362, + "grad_norm": 1.1719050407409668, + "learning_rate": 6.667082076043609e-06, + "loss": 0.6191, + "step": 9059 + }, + { + "epoch": 1.571825121443442, + "grad_norm": 1.4238147735595703, + "learning_rate": 6.662004421528909e-06, + "loss": 0.676, + "step": 9060 + }, + { + "epoch": 1.5719986120749478, + "grad_norm": 0.8628272414207458, + "learning_rate": 6.656928314904136e-06, + "loss": 0.8433, + "step": 9061 + }, + { + "epoch": 1.5721721027064537, + "grad_norm": 0.8878034949302673, + "learning_rate": 6.651853756758382e-06, + "loss": 0.8342, + "step": 9062 + }, + { + "epoch": 1.5723455933379598, + "grad_norm": 1.4456825256347656, + "learning_rate": 6.646780747680552e-06, + "loss": 0.6974, + "step": 9063 + }, + { + "epoch": 1.5725190839694656, + "grad_norm": 4.831257343292236, + "learning_rate": 6.641709288259368e-06, + "loss": 0.6521, + "step": 9064 + }, + { + "epoch": 1.5726925746009717, + "grad_norm": 0.8076745271682739, + "learning_rate": 6.636639379083396e-06, + "loss": 0.671, + "step": 9065 + }, + { + "epoch": 1.5728660652324775, + "grad_norm": 1.10724937915802, + "learning_rate": 6.6315710207409925e-06, + "loss": 0.5566, + "step": 9066 + }, + { + "epoch": 1.5730395558639834, + "grad_norm": 0.9080255627632141, + "learning_rate": 6.6265042138203505e-06, + "loss": 0.7465, + "step": 9067 + }, + { + "epoch": 1.5732130464954892, + "grad_norm": 1.36742103099823, + "learning_rate": 6.621438958909472e-06, + "loss": 0.6892, + "step": 9068 + }, + { + "epoch": 1.573386537126995, + "grad_norm": 0.7483637928962708, + "learning_rate": 6.616375256596197e-06, + "loss": 0.8237, + "step": 9069 + }, + { + "epoch": 1.573560027758501, + "grad_norm": 0.7270797491073608, + "learning_rate": 6.6113131074681694e-06, + "loss": 0.733, + "step": 9070 + }, + { + "epoch": 1.573733518390007, + "grad_norm": 0.6634971499443054, + "learning_rate": 6.606252512112856e-06, + "loss": 0.9028, + "step": 9071 + }, + { + "epoch": 1.5739070090215128, + "grad_norm": 1.0640778541564941, + "learning_rate": 6.6011934711175395e-06, + "loss": 0.5917, + "step": 9072 + }, + { + "epoch": 1.5740804996530189, + "grad_norm": 0.9467125535011292, + "learning_rate": 6.596135985069347e-06, + "loss": 0.6643, + "step": 9073 + }, + { + "epoch": 1.5742539902845247, + "grad_norm": 0.9458147883415222, + "learning_rate": 6.591080054555177e-06, + "loss": 0.7015, + "step": 9074 + }, + { + "epoch": 1.5744274809160306, + "grad_norm": 1.0506178140640259, + "learning_rate": 6.586025680161799e-06, + "loss": 0.6392, + "step": 9075 + }, + { + "epoch": 1.5746009715475364, + "grad_norm": 0.673294723033905, + "learning_rate": 6.580972862475769e-06, + "loss": 0.7045, + "step": 9076 + }, + { + "epoch": 1.5747744621790423, + "grad_norm": 0.8712504506111145, + "learning_rate": 6.575921602083477e-06, + "loss": 0.663, + "step": 9077 + }, + { + "epoch": 1.5749479528105481, + "grad_norm": 1.676498532295227, + "learning_rate": 6.570871899571119e-06, + "loss": 0.5808, + "step": 9078 + }, + { + "epoch": 1.5751214434420542, + "grad_norm": 0.8963791728019714, + "learning_rate": 6.565823755524732e-06, + "loss": 0.6808, + "step": 9079 + }, + { + "epoch": 1.57529493407356, + "grad_norm": 1.437051773071289, + "learning_rate": 6.56077717053015e-06, + "loss": 0.6603, + "step": 9080 + }, + { + "epoch": 1.5754684247050659, + "grad_norm": 1.0033570528030396, + "learning_rate": 6.555732145173037e-06, + "loss": 0.5767, + "step": 9081 + }, + { + "epoch": 1.575641915336572, + "grad_norm": 0.9751380681991577, + "learning_rate": 6.550688680038871e-06, + "loss": 0.6687, + "step": 9082 + }, + { + "epoch": 1.5758154059680778, + "grad_norm": 1.1368739604949951, + "learning_rate": 6.545646775712964e-06, + "loss": 0.7551, + "step": 9083 + }, + { + "epoch": 1.5759888965995836, + "grad_norm": 1.0037841796875, + "learning_rate": 6.5406064327804165e-06, + "loss": 0.5571, + "step": 9084 + }, + { + "epoch": 1.5761623872310895, + "grad_norm": 0.96094810962677, + "learning_rate": 6.53556765182618e-06, + "loss": 0.5311, + "step": 9085 + }, + { + "epoch": 1.5763358778625953, + "grad_norm": 0.9603806734085083, + "learning_rate": 6.5305304334350075e-06, + "loss": 0.6006, + "step": 9086 + }, + { + "epoch": 1.5765093684941012, + "grad_norm": 0.8324559330940247, + "learning_rate": 6.525494778191473e-06, + "loss": 0.739, + "step": 9087 + }, + { + "epoch": 1.5766828591256072, + "grad_norm": 1.1911317110061646, + "learning_rate": 6.520460686679964e-06, + "loss": 0.5695, + "step": 9088 + }, + { + "epoch": 1.576856349757113, + "grad_norm": 2.0591511726379395, + "learning_rate": 6.515428159484707e-06, + "loss": 0.6079, + "step": 9089 + }, + { + "epoch": 1.5770298403886192, + "grad_norm": 0.7616486549377441, + "learning_rate": 6.510397197189724e-06, + "loss": 0.688, + "step": 9090 + }, + { + "epoch": 1.577203331020125, + "grad_norm": 1.0189143419265747, + "learning_rate": 6.505367800378856e-06, + "loss": 0.6288, + "step": 9091 + }, + { + "epoch": 1.5773768216516308, + "grad_norm": 0.9918420910835266, + "learning_rate": 6.500339969635794e-06, + "loss": 0.77, + "step": 9092 + }, + { + "epoch": 1.5775503122831367, + "grad_norm": 0.8132083415985107, + "learning_rate": 6.495313705543997e-06, + "loss": 0.642, + "step": 9093 + }, + { + "epoch": 1.5777238029146425, + "grad_norm": 1.0540673732757568, + "learning_rate": 6.490289008686786e-06, + "loss": 0.6814, + "step": 9094 + }, + { + "epoch": 1.5778972935461484, + "grad_norm": 1.1245464086532593, + "learning_rate": 6.485265879647269e-06, + "loss": 0.7157, + "step": 9095 + }, + { + "epoch": 1.5780707841776545, + "grad_norm": 1.112305760383606, + "learning_rate": 6.480244319008411e-06, + "loss": 0.8271, + "step": 9096 + }, + { + "epoch": 1.5782442748091603, + "grad_norm": 1.1391340494155884, + "learning_rate": 6.475224327352938e-06, + "loss": 0.5725, + "step": 9097 + }, + { + "epoch": 1.5784177654406664, + "grad_norm": 0.9266571998596191, + "learning_rate": 6.470205905263449e-06, + "loss": 0.6195, + "step": 9098 + }, + { + "epoch": 1.5785912560721722, + "grad_norm": 1.0901470184326172, + "learning_rate": 6.4651890533223294e-06, + "loss": 0.8013, + "step": 9099 + }, + { + "epoch": 1.578764746703678, + "grad_norm": 0.8059579133987427, + "learning_rate": 6.460173772111791e-06, + "loss": 0.7162, + "step": 9100 + }, + { + "epoch": 1.578938237335184, + "grad_norm": 1.3515913486480713, + "learning_rate": 6.455160062213857e-06, + "loss": 0.6234, + "step": 9101 + }, + { + "epoch": 1.5791117279666897, + "grad_norm": 0.8095378279685974, + "learning_rate": 6.450147924210395e-06, + "loss": 0.8494, + "step": 9102 + }, + { + "epoch": 1.5792852185981956, + "grad_norm": 0.9865389466285706, + "learning_rate": 6.44513735868304e-06, + "loss": 0.7148, + "step": 9103 + }, + { + "epoch": 1.5794587092297014, + "grad_norm": 0.968824028968811, + "learning_rate": 6.440128366213297e-06, + "loss": 0.6104, + "step": 9104 + }, + { + "epoch": 1.5796321998612075, + "grad_norm": 1.1221611499786377, + "learning_rate": 6.435120947382456e-06, + "loss": 0.636, + "step": 9105 + }, + { + "epoch": 1.5798056904927134, + "grad_norm": 0.7428556680679321, + "learning_rate": 6.430115102771637e-06, + "loss": 0.77, + "step": 9106 + }, + { + "epoch": 1.5799791811242194, + "grad_norm": 0.8559720516204834, + "learning_rate": 6.425110832961765e-06, + "loss": 0.6144, + "step": 9107 + }, + { + "epoch": 1.5801526717557253, + "grad_norm": 1.2220149040222168, + "learning_rate": 6.420108138533607e-06, + "loss": 0.7494, + "step": 9108 + }, + { + "epoch": 1.5803261623872311, + "grad_norm": 0.8277440071105957, + "learning_rate": 6.415107020067721e-06, + "loss": 0.781, + "step": 9109 + }, + { + "epoch": 1.580499653018737, + "grad_norm": 1.9041646718978882, + "learning_rate": 6.410107478144496e-06, + "loss": 0.6395, + "step": 9110 + }, + { + "epoch": 1.5806731436502428, + "grad_norm": 2.3140852451324463, + "learning_rate": 6.405109513344126e-06, + "loss": 0.5359, + "step": 9111 + }, + { + "epoch": 1.5808466342817487, + "grad_norm": 1.2109776735305786, + "learning_rate": 6.400113126246645e-06, + "loss": 0.8625, + "step": 9112 + }, + { + "epoch": 1.5810201249132547, + "grad_norm": 0.74896639585495, + "learning_rate": 6.395118317431883e-06, + "loss": 0.7878, + "step": 9113 + }, + { + "epoch": 1.5811936155447606, + "grad_norm": 1.1811130046844482, + "learning_rate": 6.390125087479493e-06, + "loss": 0.5419, + "step": 9114 + }, + { + "epoch": 1.5813671061762666, + "grad_norm": 1.001627802848816, + "learning_rate": 6.385133436968946e-06, + "loss": 0.7251, + "step": 9115 + }, + { + "epoch": 1.5815405968077725, + "grad_norm": 1.2847737073898315, + "learning_rate": 6.380143366479521e-06, + "loss": 0.5568, + "step": 9116 + }, + { + "epoch": 1.5817140874392783, + "grad_norm": 1.8875972032546997, + "learning_rate": 6.375154876590335e-06, + "loss": 0.7406, + "step": 9117 + }, + { + "epoch": 1.5818875780707842, + "grad_norm": 1.0075972080230713, + "learning_rate": 6.370167967880303e-06, + "loss": 0.6351, + "step": 9118 + }, + { + "epoch": 1.58206106870229, + "grad_norm": 0.7694903612136841, + "learning_rate": 6.365182640928158e-06, + "loss": 0.7091, + "step": 9119 + }, + { + "epoch": 1.5822345593337959, + "grad_norm": 0.9580382704734802, + "learning_rate": 6.360198896312451e-06, + "loss": 0.6886, + "step": 9120 + }, + { + "epoch": 1.5824080499653017, + "grad_norm": 0.6838316321372986, + "learning_rate": 6.355216734611567e-06, + "loss": 0.7294, + "step": 9121 + }, + { + "epoch": 1.5825815405968078, + "grad_norm": 0.9779296517372131, + "learning_rate": 6.350236156403666e-06, + "loss": 0.6165, + "step": 9122 + }, + { + "epoch": 1.5827550312283136, + "grad_norm": 0.9810131192207336, + "learning_rate": 6.345257162266773e-06, + "loss": 0.6764, + "step": 9123 + }, + { + "epoch": 1.5829285218598197, + "grad_norm": 1.7076297998428345, + "learning_rate": 6.3402797527786904e-06, + "loss": 0.5392, + "step": 9124 + }, + { + "epoch": 1.5831020124913255, + "grad_norm": 0.8479841947555542, + "learning_rate": 6.335303928517071e-06, + "loss": 0.6853, + "step": 9125 + }, + { + "epoch": 1.5832755031228314, + "grad_norm": 0.967517614364624, + "learning_rate": 6.330329690059342e-06, + "loss": 0.7983, + "step": 9126 + }, + { + "epoch": 1.5834489937543372, + "grad_norm": 1.2772085666656494, + "learning_rate": 6.3253570379827864e-06, + "loss": 0.6921, + "step": 9127 + }, + { + "epoch": 1.583622484385843, + "grad_norm": 1.6205071210861206, + "learning_rate": 6.32038597286448e-06, + "loss": 0.6235, + "step": 9128 + }, + { + "epoch": 1.583795975017349, + "grad_norm": 0.9493603706359863, + "learning_rate": 6.315416495281323e-06, + "loss": 0.6318, + "step": 9129 + }, + { + "epoch": 1.583969465648855, + "grad_norm": 1.2238574028015137, + "learning_rate": 6.31044860581002e-06, + "loss": 0.8542, + "step": 9130 + }, + { + "epoch": 1.5841429562803608, + "grad_norm": 1.5491362810134888, + "learning_rate": 6.305482305027122e-06, + "loss": 0.7332, + "step": 9131 + }, + { + "epoch": 1.584316446911867, + "grad_norm": 0.6769113540649414, + "learning_rate": 6.300517593508944e-06, + "loss": 0.8027, + "step": 9132 + }, + { + "epoch": 1.5844899375433728, + "grad_norm": 0.7900825142860413, + "learning_rate": 6.295554471831671e-06, + "loss": 0.6412, + "step": 9133 + }, + { + "epoch": 1.5846634281748786, + "grad_norm": 0.7570249438285828, + "learning_rate": 6.290592940571269e-06, + "loss": 0.7689, + "step": 9134 + }, + { + "epoch": 1.5848369188063844, + "grad_norm": 2.5499629974365234, + "learning_rate": 6.2856330003035324e-06, + "loss": 0.771, + "step": 9135 + }, + { + "epoch": 1.5850104094378903, + "grad_norm": 1.4348570108413696, + "learning_rate": 6.280674651604059e-06, + "loss": 0.6677, + "step": 9136 + }, + { + "epoch": 1.5851839000693961, + "grad_norm": 0.9304018020629883, + "learning_rate": 6.275717895048285e-06, + "loss": 0.5985, + "step": 9137 + }, + { + "epoch": 1.5853573907009022, + "grad_norm": 2.1724681854248047, + "learning_rate": 6.270762731211442e-06, + "loss": 0.6687, + "step": 9138 + }, + { + "epoch": 1.585530881332408, + "grad_norm": 1.0812253952026367, + "learning_rate": 6.2658091606685745e-06, + "loss": 0.5635, + "step": 9139 + }, + { + "epoch": 1.585704371963914, + "grad_norm": 0.7685642838478088, + "learning_rate": 6.260857183994564e-06, + "loss": 0.8098, + "step": 9140 + }, + { + "epoch": 1.58587786259542, + "grad_norm": 0.9069064855575562, + "learning_rate": 6.2559068017640865e-06, + "loss": 0.7124, + "step": 9141 + }, + { + "epoch": 1.5860513532269258, + "grad_norm": 1.4261544942855835, + "learning_rate": 6.25095801455164e-06, + "loss": 0.5983, + "step": 9142 + }, + { + "epoch": 1.5862248438584317, + "grad_norm": 0.8733739852905273, + "learning_rate": 6.246010822931532e-06, + "loss": 0.5925, + "step": 9143 + }, + { + "epoch": 1.5863983344899375, + "grad_norm": 1.295709252357483, + "learning_rate": 6.241065227477905e-06, + "loss": 0.6216, + "step": 9144 + }, + { + "epoch": 1.5865718251214433, + "grad_norm": 1.0127530097961426, + "learning_rate": 6.2361212287646774e-06, + "loss": 0.7607, + "step": 9145 + }, + { + "epoch": 1.5867453157529492, + "grad_norm": 0.8819593787193298, + "learning_rate": 6.231178827365627e-06, + "loss": 0.7041, + "step": 9146 + }, + { + "epoch": 1.5869188063844553, + "grad_norm": 1.2364251613616943, + "learning_rate": 6.2262380238543185e-06, + "loss": 0.7412, + "step": 9147 + }, + { + "epoch": 1.587092297015961, + "grad_norm": 0.8162155151367188, + "learning_rate": 6.221298818804136e-06, + "loss": 0.6128, + "step": 9148 + }, + { + "epoch": 1.5872657876474672, + "grad_norm": 1.2361853122711182, + "learning_rate": 6.216361212788276e-06, + "loss": 0.6589, + "step": 9149 + }, + { + "epoch": 1.587439278278973, + "grad_norm": 1.3652019500732422, + "learning_rate": 6.211425206379769e-06, + "loss": 0.6361, + "step": 9150 + }, + { + "epoch": 1.5876127689104789, + "grad_norm": 1.1567814350128174, + "learning_rate": 6.206490800151421e-06, + "loss": 0.6743, + "step": 9151 + }, + { + "epoch": 1.5877862595419847, + "grad_norm": 0.9452083706855774, + "learning_rate": 6.201557994675895e-06, + "loss": 0.6581, + "step": 9152 + }, + { + "epoch": 1.5879597501734906, + "grad_norm": 1.2218339443206787, + "learning_rate": 6.196626790525635e-06, + "loss": 0.7024, + "step": 9153 + }, + { + "epoch": 1.5881332408049964, + "grad_norm": 0.9316795468330383, + "learning_rate": 6.191697188272933e-06, + "loss": 0.7052, + "step": 9154 + }, + { + "epoch": 1.5883067314365025, + "grad_norm": 1.1242773532867432, + "learning_rate": 6.186769188489852e-06, + "loss": 0.6865, + "step": 9155 + }, + { + "epoch": 1.5884802220680083, + "grad_norm": 1.0448147058486938, + "learning_rate": 6.181842791748307e-06, + "loss": 0.6393, + "step": 9156 + }, + { + "epoch": 1.5886537126995144, + "grad_norm": 0.6968982815742493, + "learning_rate": 6.1769179986200065e-06, + "loss": 0.657, + "step": 9157 + }, + { + "epoch": 1.5888272033310202, + "grad_norm": 2.283013105392456, + "learning_rate": 6.17199480967648e-06, + "loss": 0.6995, + "step": 9158 + }, + { + "epoch": 1.589000693962526, + "grad_norm": 1.2441235780715942, + "learning_rate": 6.167073225489062e-06, + "loss": 0.7601, + "step": 9159 + }, + { + "epoch": 1.589174184594032, + "grad_norm": 0.8872819542884827, + "learning_rate": 6.162153246628921e-06, + "loss": 0.7319, + "step": 9160 + }, + { + "epoch": 1.5893476752255378, + "grad_norm": 0.837116539478302, + "learning_rate": 6.157234873667021e-06, + "loss": 0.7534, + "step": 9161 + }, + { + "epoch": 1.5895211658570436, + "grad_norm": 0.7112773656845093, + "learning_rate": 6.152318107174144e-06, + "loss": 0.5662, + "step": 9162 + }, + { + "epoch": 1.5896946564885495, + "grad_norm": 0.9326079487800598, + "learning_rate": 6.1474029477208864e-06, + "loss": 0.6482, + "step": 9163 + }, + { + "epoch": 1.5898681471200555, + "grad_norm": 1.0424163341522217, + "learning_rate": 6.142489395877651e-06, + "loss": 0.6208, + "step": 9164 + }, + { + "epoch": 1.5900416377515614, + "grad_norm": 0.9549498558044434, + "learning_rate": 6.137577452214676e-06, + "loss": 0.5811, + "step": 9165 + }, + { + "epoch": 1.5902151283830674, + "grad_norm": 0.8643216490745544, + "learning_rate": 6.132667117301989e-06, + "loss": 0.7939, + "step": 9166 + }, + { + "epoch": 1.5903886190145733, + "grad_norm": 1.571100115776062, + "learning_rate": 6.127758391709442e-06, + "loss": 0.8123, + "step": 9167 + }, + { + "epoch": 1.5905621096460791, + "grad_norm": 1.3855425119400024, + "learning_rate": 6.122851276006692e-06, + "loss": 0.5758, + "step": 9168 + }, + { + "epoch": 1.590735600277585, + "grad_norm": 0.8331384062767029, + "learning_rate": 6.117945770763228e-06, + "loss": 0.6348, + "step": 9169 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.9192061424255371, + "learning_rate": 6.113041876548333e-06, + "loss": 0.6073, + "step": 9170 + }, + { + "epoch": 1.5910825815405967, + "grad_norm": 1.3337138891220093, + "learning_rate": 6.1081395939311086e-06, + "loss": 0.7568, + "step": 9171 + }, + { + "epoch": 1.5912560721721027, + "grad_norm": 0.8826763033866882, + "learning_rate": 6.103238923480468e-06, + "loss": 0.7054, + "step": 9172 + }, + { + "epoch": 1.5914295628036086, + "grad_norm": 0.9426026940345764, + "learning_rate": 6.098339865765153e-06, + "loss": 0.597, + "step": 9173 + }, + { + "epoch": 1.5916030534351147, + "grad_norm": 0.9719138145446777, + "learning_rate": 6.093442421353683e-06, + "loss": 0.6062, + "step": 9174 + }, + { + "epoch": 1.5917765440666205, + "grad_norm": 1.951622486114502, + "learning_rate": 6.088546590814432e-06, + "loss": 0.6492, + "step": 9175 + }, + { + "epoch": 1.5919500346981263, + "grad_norm": 0.7425032258033752, + "learning_rate": 6.083652374715561e-06, + "loss": 0.7355, + "step": 9176 + }, + { + "epoch": 1.5921235253296322, + "grad_norm": 0.8384028673171997, + "learning_rate": 6.078759773625045e-06, + "loss": 0.7352, + "step": 9177 + }, + { + "epoch": 1.592297015961138, + "grad_norm": 0.8721210360527039, + "learning_rate": 6.073868788110673e-06, + "loss": 0.7092, + "step": 9178 + }, + { + "epoch": 1.5924705065926439, + "grad_norm": 1.0410772562026978, + "learning_rate": 6.068979418740067e-06, + "loss": 0.6261, + "step": 9179 + }, + { + "epoch": 1.5926439972241497, + "grad_norm": 1.3195279836654663, + "learning_rate": 6.064091666080621e-06, + "loss": 0.7356, + "step": 9180 + }, + { + "epoch": 1.5928174878556558, + "grad_norm": 0.8185099959373474, + "learning_rate": 6.059205530699581e-06, + "loss": 0.661, + "step": 9181 + }, + { + "epoch": 1.5929909784871616, + "grad_norm": 0.9514410495758057, + "learning_rate": 6.054321013163978e-06, + "loss": 0.5879, + "step": 9182 + }, + { + "epoch": 1.5931644691186677, + "grad_norm": 0.902005136013031, + "learning_rate": 6.049438114040682e-06, + "loss": 0.5709, + "step": 9183 + }, + { + "epoch": 1.5933379597501736, + "grad_norm": 0.7728752493858337, + "learning_rate": 6.044556833896338e-06, + "loss": 0.8567, + "step": 9184 + }, + { + "epoch": 1.5935114503816794, + "grad_norm": 1.1088452339172363, + "learning_rate": 6.03967717329744e-06, + "loss": 0.6074, + "step": 9185 + }, + { + "epoch": 1.5936849410131853, + "grad_norm": 1.7371045351028442, + "learning_rate": 6.034799132810274e-06, + "loss": 0.6903, + "step": 9186 + }, + { + "epoch": 1.593858431644691, + "grad_norm": 0.8223839998245239, + "learning_rate": 6.029922713000935e-06, + "loss": 0.8024, + "step": 9187 + }, + { + "epoch": 1.594031922276197, + "grad_norm": 0.8528516888618469, + "learning_rate": 6.025047914435349e-06, + "loss": 0.8452, + "step": 9188 + }, + { + "epoch": 1.594205412907703, + "grad_norm": 1.4042026996612549, + "learning_rate": 6.020174737679236e-06, + "loss": 0.8042, + "step": 9189 + }, + { + "epoch": 1.5943789035392089, + "grad_norm": 0.8104608654975891, + "learning_rate": 6.015303183298135e-06, + "loss": 0.7595, + "step": 9190 + }, + { + "epoch": 1.594552394170715, + "grad_norm": 0.791183352470398, + "learning_rate": 6.01043325185739e-06, + "loss": 0.8242, + "step": 9191 + }, + { + "epoch": 1.5947258848022208, + "grad_norm": 0.8298701047897339, + "learning_rate": 6.005564943922179e-06, + "loss": 0.8569, + "step": 9192 + }, + { + "epoch": 1.5948993754337266, + "grad_norm": 1.117209553718567, + "learning_rate": 6.000698260057452e-06, + "loss": 0.7156, + "step": 9193 + }, + { + "epoch": 1.5950728660652325, + "grad_norm": 0.8822762370109558, + "learning_rate": 5.995833200828007e-06, + "loss": 0.5778, + "step": 9194 + }, + { + "epoch": 1.5952463566967383, + "grad_norm": 0.7663726806640625, + "learning_rate": 5.990969766798433e-06, + "loss": 0.6853, + "step": 9195 + }, + { + "epoch": 1.5954198473282442, + "grad_norm": 0.7968588471412659, + "learning_rate": 5.9861079585331535e-06, + "loss": 0.7208, + "step": 9196 + }, + { + "epoch": 1.5955933379597502, + "grad_norm": 1.0341931581497192, + "learning_rate": 5.981247776596364e-06, + "loss": 0.8159, + "step": 9197 + }, + { + "epoch": 1.595766828591256, + "grad_norm": 1.0992929935455322, + "learning_rate": 5.97638922155211e-06, + "loss": 0.783, + "step": 9198 + }, + { + "epoch": 1.595940319222762, + "grad_norm": 1.1084662675857544, + "learning_rate": 5.971532293964226e-06, + "loss": 0.5985, + "step": 9199 + }, + { + "epoch": 1.596113809854268, + "grad_norm": 1.2379547357559204, + "learning_rate": 5.9666769943963675e-06, + "loss": 0.6477, + "step": 9200 + }, + { + "epoch": 1.5962873004857738, + "grad_norm": 0.9869577884674072, + "learning_rate": 5.96182332341199e-06, + "loss": 0.7374, + "step": 9201 + }, + { + "epoch": 1.5964607911172797, + "grad_norm": 0.8310408592224121, + "learning_rate": 5.956971281574386e-06, + "loss": 0.7521, + "step": 9202 + }, + { + "epoch": 1.5966342817487855, + "grad_norm": 1.3346121311187744, + "learning_rate": 5.952120869446614e-06, + "loss": 0.7725, + "step": 9203 + }, + { + "epoch": 1.5968077723802914, + "grad_norm": 1.3183389902114868, + "learning_rate": 5.947272087591591e-06, + "loss": 0.5813, + "step": 9204 + }, + { + "epoch": 1.5969812630117972, + "grad_norm": 0.9264410138130188, + "learning_rate": 5.942424936572017e-06, + "loss": 0.5615, + "step": 9205 + }, + { + "epoch": 1.5971547536433033, + "grad_norm": 1.0262141227722168, + "learning_rate": 5.9375794169504095e-06, + "loss": 0.6173, + "step": 9206 + }, + { + "epoch": 1.5973282442748091, + "grad_norm": 0.8180903792381287, + "learning_rate": 5.932735529289091e-06, + "loss": 0.76, + "step": 9207 + }, + { + "epoch": 1.5975017349063152, + "grad_norm": 1.0252745151519775, + "learning_rate": 5.927893274150214e-06, + "loss": 0.6984, + "step": 9208 + }, + { + "epoch": 1.597675225537821, + "grad_norm": 0.9555381536483765, + "learning_rate": 5.9230526520957154e-06, + "loss": 0.6716, + "step": 9209 + }, + { + "epoch": 1.5978487161693269, + "grad_norm": 0.9984028935432434, + "learning_rate": 5.918213663687362e-06, + "loss": 0.5415, + "step": 9210 + }, + { + "epoch": 1.5980222068008327, + "grad_norm": 0.9813024997711182, + "learning_rate": 5.913376309486716e-06, + "loss": 0.6027, + "step": 9211 + }, + { + "epoch": 1.5981956974323386, + "grad_norm": 0.70670086145401, + "learning_rate": 5.908540590055168e-06, + "loss": 0.8079, + "step": 9212 + }, + { + "epoch": 1.5983691880638444, + "grad_norm": 1.996172547340393, + "learning_rate": 5.903706505953906e-06, + "loss": 0.6926, + "step": 9213 + }, + { + "epoch": 1.5985426786953505, + "grad_norm": 2.898315668106079, + "learning_rate": 5.898874057743926e-06, + "loss": 0.5868, + "step": 9214 + }, + { + "epoch": 1.5987161693268563, + "grad_norm": 1.2920238971710205, + "learning_rate": 5.894043245986045e-06, + "loss": 0.8997, + "step": 9215 + }, + { + "epoch": 1.5988896599583624, + "grad_norm": 0.7730161547660828, + "learning_rate": 5.889214071240876e-06, + "loss": 0.7194, + "step": 9216 + }, + { + "epoch": 1.5990631505898683, + "grad_norm": 0.8587391376495361, + "learning_rate": 5.884386534068864e-06, + "loss": 0.7126, + "step": 9217 + }, + { + "epoch": 1.599236641221374, + "grad_norm": 0.8057717084884644, + "learning_rate": 5.879560635030242e-06, + "loss": 0.6245, + "step": 9218 + }, + { + "epoch": 1.59941013185288, + "grad_norm": 1.665824294090271, + "learning_rate": 5.87473637468506e-06, + "loss": 0.8364, + "step": 9219 + }, + { + "epoch": 1.5995836224843858, + "grad_norm": 0.8758461475372314, + "learning_rate": 5.869913753593175e-06, + "loss": 0.7122, + "step": 9220 + }, + { + "epoch": 1.5997571131158916, + "grad_norm": 0.7453508973121643, + "learning_rate": 5.865092772314276e-06, + "loss": 0.71, + "step": 9221 + }, + { + "epoch": 1.5999306037473975, + "grad_norm": 1.142492413520813, + "learning_rate": 5.860273431407821e-06, + "loss": 0.7355, + "step": 9222 + }, + { + "epoch": 1.6001040943789036, + "grad_norm": 1.3166511058807373, + "learning_rate": 5.855455731433115e-06, + "loss": 0.8418, + "step": 9223 + }, + { + "epoch": 1.6002775850104094, + "grad_norm": 1.6561031341552734, + "learning_rate": 5.8506396729492455e-06, + "loss": 0.5859, + "step": 9224 + }, + { + "epoch": 1.6004510756419155, + "grad_norm": 0.9149600267410278, + "learning_rate": 5.845825256515145e-06, + "loss": 0.6965, + "step": 9225 + }, + { + "epoch": 1.6006245662734213, + "grad_norm": 1.121959924697876, + "learning_rate": 5.841012482689501e-06, + "loss": 0.5782, + "step": 9226 + }, + { + "epoch": 1.6007980569049272, + "grad_norm": 1.0745552778244019, + "learning_rate": 5.836201352030863e-06, + "loss": 0.6715, + "step": 9227 + }, + { + "epoch": 1.600971547536433, + "grad_norm": 1.0616880655288696, + "learning_rate": 5.831391865097564e-06, + "loss": 0.6727, + "step": 9228 + }, + { + "epoch": 1.6011450381679388, + "grad_norm": 2.349346160888672, + "learning_rate": 5.82658402244775e-06, + "loss": 0.6932, + "step": 9229 + }, + { + "epoch": 1.6013185287994447, + "grad_norm": 0.9211059212684631, + "learning_rate": 5.821777824639365e-06, + "loss": 0.8123, + "step": 9230 + }, + { + "epoch": 1.6014920194309508, + "grad_norm": 1.344643473625183, + "learning_rate": 5.816973272230196e-06, + "loss": 0.6296, + "step": 9231 + }, + { + "epoch": 1.6016655100624566, + "grad_norm": 0.8227471113204956, + "learning_rate": 5.812170365777801e-06, + "loss": 0.6056, + "step": 9232 + }, + { + "epoch": 1.6018390006939627, + "grad_norm": 0.8955137133598328, + "learning_rate": 5.807369105839569e-06, + "loss": 0.6904, + "step": 9233 + }, + { + "epoch": 1.6020124913254685, + "grad_norm": 1.164100170135498, + "learning_rate": 5.802569492972687e-06, + "loss": 0.6963, + "step": 9234 + }, + { + "epoch": 1.6021859819569744, + "grad_norm": 0.8580724596977234, + "learning_rate": 5.797771527734155e-06, + "loss": 0.7646, + "step": 9235 + }, + { + "epoch": 1.6023594725884802, + "grad_norm": 0.9330925941467285, + "learning_rate": 5.792975210680793e-06, + "loss": 0.6799, + "step": 9236 + }, + { + "epoch": 1.602532963219986, + "grad_norm": 0.852889358997345, + "learning_rate": 5.7881805423692105e-06, + "loss": 0.6227, + "step": 9237 + }, + { + "epoch": 1.602706453851492, + "grad_norm": 0.6625455617904663, + "learning_rate": 5.783387523355839e-06, + "loss": 0.8469, + "step": 9238 + }, + { + "epoch": 1.6028799444829978, + "grad_norm": 0.8633602261543274, + "learning_rate": 5.778596154196902e-06, + "loss": 0.6997, + "step": 9239 + }, + { + "epoch": 1.6030534351145038, + "grad_norm": 0.9465035200119019, + "learning_rate": 5.773806435448459e-06, + "loss": 0.5568, + "step": 9240 + }, + { + "epoch": 1.6032269257460097, + "grad_norm": 0.7581543326377869, + "learning_rate": 5.769018367666357e-06, + "loss": 0.7605, + "step": 9241 + }, + { + "epoch": 1.6034004163775157, + "grad_norm": 1.138923168182373, + "learning_rate": 5.764231951406256e-06, + "loss": 0.6646, + "step": 9242 + }, + { + "epoch": 1.6035739070090216, + "grad_norm": 0.822629451751709, + "learning_rate": 5.759447187223617e-06, + "loss": 0.7603, + "step": 9243 + }, + { + "epoch": 1.6037473976405274, + "grad_norm": 1.064039945602417, + "learning_rate": 5.75466407567374e-06, + "loss": 0.6495, + "step": 9244 + }, + { + "epoch": 1.6039208882720333, + "grad_norm": 0.9656265377998352, + "learning_rate": 5.749882617311682e-06, + "loss": 0.7125, + "step": 9245 + }, + { + "epoch": 1.6040943789035391, + "grad_norm": 1.1500558853149414, + "learning_rate": 5.745102812692358e-06, + "loss": 0.6677, + "step": 9246 + }, + { + "epoch": 1.604267869535045, + "grad_norm": 0.8273731470108032, + "learning_rate": 5.740324662370462e-06, + "loss": 0.6721, + "step": 9247 + }, + { + "epoch": 1.604441360166551, + "grad_norm": 0.9122879505157471, + "learning_rate": 5.735548166900506e-06, + "loss": 0.5471, + "step": 9248 + }, + { + "epoch": 1.6046148507980569, + "grad_norm": 0.8580262660980225, + "learning_rate": 5.7307733268368e-06, + "loss": 0.7913, + "step": 9249 + }, + { + "epoch": 1.604788341429563, + "grad_norm": 0.770071804523468, + "learning_rate": 5.72600014273349e-06, + "loss": 0.6001, + "step": 9250 + }, + { + "epoch": 1.6049618320610688, + "grad_norm": 0.7847028970718384, + "learning_rate": 5.721228615144481e-06, + "loss": 0.8096, + "step": 9251 + }, + { + "epoch": 1.6051353226925746, + "grad_norm": 1.0415167808532715, + "learning_rate": 5.716458744623536e-06, + "loss": 0.8088, + "step": 9252 + }, + { + "epoch": 1.6053088133240805, + "grad_norm": 1.1189672946929932, + "learning_rate": 5.711690531724192e-06, + "loss": 0.5717, + "step": 9253 + }, + { + "epoch": 1.6054823039555863, + "grad_norm": 0.8271700739860535, + "learning_rate": 5.706923976999825e-06, + "loss": 0.767, + "step": 9254 + }, + { + "epoch": 1.6056557945870922, + "grad_norm": 0.9686708450317383, + "learning_rate": 5.702159081003571e-06, + "loss": 0.6565, + "step": 9255 + }, + { + "epoch": 1.6058292852185982, + "grad_norm": 1.5437569618225098, + "learning_rate": 5.697395844288423e-06, + "loss": 0.572, + "step": 9256 + }, + { + "epoch": 1.606002775850104, + "grad_norm": 1.0085010528564453, + "learning_rate": 5.692634267407151e-06, + "loss": 0.705, + "step": 9257 + }, + { + "epoch": 1.60617626648161, + "grad_norm": 0.7898030877113342, + "learning_rate": 5.687874350912346e-06, + "loss": 0.5808, + "step": 9258 + }, + { + "epoch": 1.606349757113116, + "grad_norm": 0.8206831216812134, + "learning_rate": 5.683116095356391e-06, + "loss": 0.6578, + "step": 9259 + }, + { + "epoch": 1.6065232477446219, + "grad_norm": 0.7612369060516357, + "learning_rate": 5.678359501291504e-06, + "loss": 0.801, + "step": 9260 + }, + { + "epoch": 1.6066967383761277, + "grad_norm": 0.8573133945465088, + "learning_rate": 5.673604569269684e-06, + "loss": 0.7822, + "step": 9261 + }, + { + "epoch": 1.6068702290076335, + "grad_norm": 0.7325265407562256, + "learning_rate": 5.668851299842739e-06, + "loss": 0.8235, + "step": 9262 + }, + { + "epoch": 1.6070437196391394, + "grad_norm": 0.7421675324440002, + "learning_rate": 5.664099693562315e-06, + "loss": 0.8594, + "step": 9263 + }, + { + "epoch": 1.6072172102706452, + "grad_norm": 1.118357539176941, + "learning_rate": 5.659349750979814e-06, + "loss": 0.7925, + "step": 9264 + }, + { + "epoch": 1.6073907009021513, + "grad_norm": 1.4785107374191284, + "learning_rate": 5.6546014726464906e-06, + "loss": 0.6038, + "step": 9265 + }, + { + "epoch": 1.6075641915336571, + "grad_norm": 1.2138341665267944, + "learning_rate": 5.6498548591133725e-06, + "loss": 0.5339, + "step": 9266 + }, + { + "epoch": 1.6077376821651632, + "grad_norm": 0.8459744453430176, + "learning_rate": 5.645109910931335e-06, + "loss": 0.6758, + "step": 9267 + }, + { + "epoch": 1.607911172796669, + "grad_norm": 0.8932994604110718, + "learning_rate": 5.6403666286510065e-06, + "loss": 0.5741, + "step": 9268 + }, + { + "epoch": 1.608084663428175, + "grad_norm": 0.9636117815971375, + "learning_rate": 5.635625012822869e-06, + "loss": 0.8298, + "step": 9269 + }, + { + "epoch": 1.6082581540596808, + "grad_norm": 1.5823535919189453, + "learning_rate": 5.630885063997187e-06, + "loss": 0.7883, + "step": 9270 + }, + { + "epoch": 1.6084316446911866, + "grad_norm": 0.9738103151321411, + "learning_rate": 5.626146782724036e-06, + "loss": 0.6515, + "step": 9271 + }, + { + "epoch": 1.6086051353226924, + "grad_norm": 0.9354885220527649, + "learning_rate": 5.621410169553292e-06, + "loss": 0.5428, + "step": 9272 + }, + { + "epoch": 1.6087786259541985, + "grad_norm": 1.0138059854507446, + "learning_rate": 5.616675225034667e-06, + "loss": 0.8051, + "step": 9273 + }, + { + "epoch": 1.6089521165857044, + "grad_norm": 0.6884923577308655, + "learning_rate": 5.6119419497176275e-06, + "loss": 0.7573, + "step": 9274 + }, + { + "epoch": 1.6091256072172104, + "grad_norm": 0.6675198674201965, + "learning_rate": 5.607210344151497e-06, + "loss": 0.7031, + "step": 9275 + }, + { + "epoch": 1.6092990978487163, + "grad_norm": 0.8114702105522156, + "learning_rate": 5.6024804088853775e-06, + "loss": 0.6667, + "step": 9276 + }, + { + "epoch": 1.6094725884802221, + "grad_norm": 1.168317198753357, + "learning_rate": 5.597752144468185e-06, + "loss": 0.5431, + "step": 9277 + }, + { + "epoch": 1.609646079111728, + "grad_norm": 0.673999011516571, + "learning_rate": 5.59302555144863e-06, + "loss": 0.682, + "step": 9278 + }, + { + "epoch": 1.6098195697432338, + "grad_norm": 0.8778970241546631, + "learning_rate": 5.588300630375252e-06, + "loss": 0.8066, + "step": 9279 + }, + { + "epoch": 1.6099930603747397, + "grad_norm": 0.9512525796890259, + "learning_rate": 5.5835773817963814e-06, + "loss": 0.6765, + "step": 9280 + }, + { + "epoch": 1.6101665510062455, + "grad_norm": 1.0571208000183105, + "learning_rate": 5.57885580626015e-06, + "loss": 0.6432, + "step": 9281 + }, + { + "epoch": 1.6103400416377516, + "grad_norm": 0.762089729309082, + "learning_rate": 5.574135904314504e-06, + "loss": 0.8145, + "step": 9282 + }, + { + "epoch": 1.6105135322692574, + "grad_norm": 0.750906765460968, + "learning_rate": 5.5694176765072005e-06, + "loss": 0.723, + "step": 9283 + }, + { + "epoch": 1.6106870229007635, + "grad_norm": 0.8546105027198792, + "learning_rate": 5.5647011233857915e-06, + "loss": 0.6255, + "step": 9284 + }, + { + "epoch": 1.6108605135322693, + "grad_norm": 0.8520799279212952, + "learning_rate": 5.559986245497637e-06, + "loss": 0.8103, + "step": 9285 + }, + { + "epoch": 1.6110340041637752, + "grad_norm": 1.0121498107910156, + "learning_rate": 5.555273043389906e-06, + "loss": 0.759, + "step": 9286 + }, + { + "epoch": 1.611207494795281, + "grad_norm": 0.9401363134384155, + "learning_rate": 5.55056151760956e-06, + "loss": 0.6205, + "step": 9287 + }, + { + "epoch": 1.6113809854267869, + "grad_norm": 2.120675802230835, + "learning_rate": 5.545851668703397e-06, + "loss": 0.573, + "step": 9288 + }, + { + "epoch": 1.6115544760582927, + "grad_norm": 0.8539029359817505, + "learning_rate": 5.541143497217989e-06, + "loss": 0.7791, + "step": 9289 + }, + { + "epoch": 1.6117279666897988, + "grad_norm": 1.2598599195480347, + "learning_rate": 5.536437003699724e-06, + "loss": 0.6571, + "step": 9290 + }, + { + "epoch": 1.6119014573213046, + "grad_norm": 1.3844982385635376, + "learning_rate": 5.531732188694794e-06, + "loss": 0.5859, + "step": 9291 + }, + { + "epoch": 1.6120749479528107, + "grad_norm": 0.8106334805488586, + "learning_rate": 5.527029052749216e-06, + "loss": 0.7014, + "step": 9292 + }, + { + "epoch": 1.6122484385843165, + "grad_norm": 0.8846122026443481, + "learning_rate": 5.522327596408766e-06, + "loss": 0.6917, + "step": 9293 + }, + { + "epoch": 1.6124219292158224, + "grad_norm": 0.9131292104721069, + "learning_rate": 5.517627820219076e-06, + "loss": 0.6841, + "step": 9294 + }, + { + "epoch": 1.6125954198473282, + "grad_norm": 1.0344430208206177, + "learning_rate": 5.512929724725544e-06, + "loss": 0.6975, + "step": 9295 + }, + { + "epoch": 1.612768910478834, + "grad_norm": 0.9669039249420166, + "learning_rate": 5.508233310473412e-06, + "loss": 0.824, + "step": 9296 + }, + { + "epoch": 1.61294240111034, + "grad_norm": 1.0789260864257812, + "learning_rate": 5.503538578007679e-06, + "loss": 0.5365, + "step": 9297 + }, + { + "epoch": 1.6131158917418458, + "grad_norm": 0.7297152876853943, + "learning_rate": 5.498845527873193e-06, + "loss": 0.8179, + "step": 9298 + }, + { + "epoch": 1.6132893823733518, + "grad_norm": 1.0082979202270508, + "learning_rate": 5.494154160614578e-06, + "loss": 0.6017, + "step": 9299 + }, + { + "epoch": 1.6134628730048577, + "grad_norm": 0.9128202199935913, + "learning_rate": 5.489464476776276e-06, + "loss": 0.6521, + "step": 9300 + }, + { + "epoch": 1.6136363636363638, + "grad_norm": 5.550414085388184, + "learning_rate": 5.484776476902525e-06, + "loss": 0.5001, + "step": 9301 + }, + { + "epoch": 1.6138098542678696, + "grad_norm": 1.361626148223877, + "learning_rate": 5.480090161537388e-06, + "loss": 0.6149, + "step": 9302 + }, + { + "epoch": 1.6139833448993754, + "grad_norm": 0.9390026330947876, + "learning_rate": 5.475405531224696e-06, + "loss": 0.7236, + "step": 9303 + }, + { + "epoch": 1.6141568355308813, + "grad_norm": 0.9986167550086975, + "learning_rate": 5.470722586508122e-06, + "loss": 0.6964, + "step": 9304 + }, + { + "epoch": 1.6143303261623871, + "grad_norm": 0.9759873151779175, + "learning_rate": 5.466041327931122e-06, + "loss": 0.6726, + "step": 9305 + }, + { + "epoch": 1.614503816793893, + "grad_norm": 0.9950049519538879, + "learning_rate": 5.4613617560369625e-06, + "loss": 0.5889, + "step": 9306 + }, + { + "epoch": 1.614677307425399, + "grad_norm": 0.6720487475395203, + "learning_rate": 5.456683871368704e-06, + "loss": 0.8422, + "step": 9307 + }, + { + "epoch": 1.614850798056905, + "grad_norm": 0.7266060709953308, + "learning_rate": 5.452007674469235e-06, + "loss": 0.7961, + "step": 9308 + }, + { + "epoch": 1.615024288688411, + "grad_norm": 1.0562281608581543, + "learning_rate": 5.447333165881228e-06, + "loss": 0.6901, + "step": 9309 + }, + { + "epoch": 1.6151977793199168, + "grad_norm": 2.0633630752563477, + "learning_rate": 5.442660346147157e-06, + "loss": 0.5945, + "step": 9310 + }, + { + "epoch": 1.6153712699514227, + "grad_norm": 0.8110103011131287, + "learning_rate": 5.437989215809323e-06, + "loss": 0.6675, + "step": 9311 + }, + { + "epoch": 1.6155447605829285, + "grad_norm": 1.1493251323699951, + "learning_rate": 5.433319775409807e-06, + "loss": 0.55, + "step": 9312 + }, + { + "epoch": 1.6157182512144344, + "grad_norm": 0.9942023754119873, + "learning_rate": 5.428652025490506e-06, + "loss": 0.6522, + "step": 9313 + }, + { + "epoch": 1.6158917418459402, + "grad_norm": 1.0094043016433716, + "learning_rate": 5.4239859665931105e-06, + "loss": 0.5968, + "step": 9314 + }, + { + "epoch": 1.6160652324774463, + "grad_norm": 0.9131174087524414, + "learning_rate": 5.41932159925914e-06, + "loss": 0.629, + "step": 9315 + }, + { + "epoch": 1.6162387231089521, + "grad_norm": 0.933478832244873, + "learning_rate": 5.4146589240298745e-06, + "loss": 0.6125, + "step": 9316 + }, + { + "epoch": 1.6164122137404582, + "grad_norm": 0.8319815397262573, + "learning_rate": 5.409997941446443e-06, + "loss": 0.6184, + "step": 9317 + }, + { + "epoch": 1.616585704371964, + "grad_norm": 0.8241746425628662, + "learning_rate": 5.405338652049749e-06, + "loss": 0.7153, + "step": 9318 + }, + { + "epoch": 1.6167591950034699, + "grad_norm": 0.9305207133293152, + "learning_rate": 5.400681056380515e-06, + "loss": 0.8501, + "step": 9319 + }, + { + "epoch": 1.6169326856349757, + "grad_norm": 0.585139811038971, + "learning_rate": 5.396025154979247e-06, + "loss": 0.8525, + "step": 9320 + }, + { + "epoch": 1.6171061762664816, + "grad_norm": 2.2390921115875244, + "learning_rate": 5.39137094838629e-06, + "loss": 0.5878, + "step": 9321 + }, + { + "epoch": 1.6172796668979874, + "grad_norm": 1.0336289405822754, + "learning_rate": 5.386718437141743e-06, + "loss": 0.6132, + "step": 9322 + }, + { + "epoch": 1.6174531575294933, + "grad_norm": 1.2470624446868896, + "learning_rate": 5.382067621785556e-06, + "loss": 0.7532, + "step": 9323 + }, + { + "epoch": 1.6176266481609993, + "grad_norm": 1.6590094566345215, + "learning_rate": 5.37741850285745e-06, + "loss": 0.835, + "step": 9324 + }, + { + "epoch": 1.6178001387925052, + "grad_norm": 2.086590528488159, + "learning_rate": 5.372771080896977e-06, + "loss": 0.9272, + "step": 9325 + }, + { + "epoch": 1.6179736294240112, + "grad_norm": 1.0392628908157349, + "learning_rate": 5.368125356443452e-06, + "loss": 0.5691, + "step": 9326 + }, + { + "epoch": 1.618147120055517, + "grad_norm": 1.3442753553390503, + "learning_rate": 5.3634813300360355e-06, + "loss": 0.5892, + "step": 9327 + }, + { + "epoch": 1.618320610687023, + "grad_norm": 1.040371298789978, + "learning_rate": 5.358839002213665e-06, + "loss": 0.7109, + "step": 9328 + }, + { + "epoch": 1.6184941013185288, + "grad_norm": 0.7031519412994385, + "learning_rate": 5.354198373515087e-06, + "loss": 0.8167, + "step": 9329 + }, + { + "epoch": 1.6186675919500346, + "grad_norm": 0.9409035444259644, + "learning_rate": 5.349559444478849e-06, + "loss": 0.5952, + "step": 9330 + }, + { + "epoch": 1.6188410825815405, + "grad_norm": 0.9949761629104614, + "learning_rate": 5.344922215643316e-06, + "loss": 0.7029, + "step": 9331 + }, + { + "epoch": 1.6190145732130465, + "grad_norm": 1.0031683444976807, + "learning_rate": 5.3402866875466344e-06, + "loss": 0.5598, + "step": 9332 + }, + { + "epoch": 1.6191880638445524, + "grad_norm": 1.3263990879058838, + "learning_rate": 5.335652860726765e-06, + "loss": 0.6906, + "step": 9333 + }, + { + "epoch": 1.6193615544760585, + "grad_norm": 0.8390851020812988, + "learning_rate": 5.331020735721469e-06, + "loss": 0.8718, + "step": 9334 + }, + { + "epoch": 1.6195350451075643, + "grad_norm": 0.850297749042511, + "learning_rate": 5.326390313068303e-06, + "loss": 0.6882, + "step": 9335 + }, + { + "epoch": 1.6197085357390701, + "grad_norm": 0.8722672462463379, + "learning_rate": 5.321761593304646e-06, + "loss": 0.7432, + "step": 9336 + }, + { + "epoch": 1.619882026370576, + "grad_norm": 0.808373212814331, + "learning_rate": 5.317134576967658e-06, + "loss": 0.6223, + "step": 9337 + }, + { + "epoch": 1.6200555170020818, + "grad_norm": 0.7097983956336975, + "learning_rate": 5.312509264594312e-06, + "loss": 0.8323, + "step": 9338 + }, + { + "epoch": 1.6202290076335877, + "grad_norm": 0.8288452625274658, + "learning_rate": 5.307885656721374e-06, + "loss": 0.7974, + "step": 9339 + }, + { + "epoch": 1.6204024982650935, + "grad_norm": 0.9931098818778992, + "learning_rate": 5.303263753885433e-06, + "loss": 0.6868, + "step": 9340 + }, + { + "epoch": 1.6205759888965996, + "grad_norm": 0.8495844602584839, + "learning_rate": 5.298643556622858e-06, + "loss": 0.7148, + "step": 9341 + }, + { + "epoch": 1.6207494795281054, + "grad_norm": 1.9866089820861816, + "learning_rate": 5.294025065469827e-06, + "loss": 0.6071, + "step": 9342 + }, + { + "epoch": 1.6209229701596115, + "grad_norm": 1.2187680006027222, + "learning_rate": 5.2894082809623185e-06, + "loss": 0.7144, + "step": 9343 + }, + { + "epoch": 1.6210964607911174, + "grad_norm": 0.8841637372970581, + "learning_rate": 5.284793203636132e-06, + "loss": 0.7739, + "step": 9344 + }, + { + "epoch": 1.6212699514226232, + "grad_norm": 1.1625877618789673, + "learning_rate": 5.280179834026828e-06, + "loss": 0.8337, + "step": 9345 + }, + { + "epoch": 1.621443442054129, + "grad_norm": 0.6400302648544312, + "learning_rate": 5.2755681726698134e-06, + "loss": 0.7031, + "step": 9346 + }, + { + "epoch": 1.621616932685635, + "grad_norm": 1.0452789068222046, + "learning_rate": 5.270958220100269e-06, + "loss": 0.812, + "step": 9347 + }, + { + "epoch": 1.6217904233171407, + "grad_norm": 0.9176926016807556, + "learning_rate": 5.2663499768531865e-06, + "loss": 0.6349, + "step": 9348 + }, + { + "epoch": 1.6219639139486468, + "grad_norm": 0.6751433610916138, + "learning_rate": 5.26174344346335e-06, + "loss": 0.7709, + "step": 9349 + }, + { + "epoch": 1.6221374045801527, + "grad_norm": 0.9687822461128235, + "learning_rate": 5.257138620465374e-06, + "loss": 0.7178, + "step": 9350 + }, + { + "epoch": 1.6223108952116587, + "grad_norm": 1.2950260639190674, + "learning_rate": 5.252535508393628e-06, + "loss": 0.668, + "step": 9351 + }, + { + "epoch": 1.6224843858431646, + "grad_norm": 0.8031737804412842, + "learning_rate": 5.247934107782324e-06, + "loss": 0.7594, + "step": 9352 + }, + { + "epoch": 1.6226578764746704, + "grad_norm": 1.1909713745117188, + "learning_rate": 5.243334419165453e-06, + "loss": 0.7109, + "step": 9353 + }, + { + "epoch": 1.6228313671061763, + "grad_norm": 1.0480983257293701, + "learning_rate": 5.238736443076828e-06, + "loss": 0.7351, + "step": 9354 + }, + { + "epoch": 1.623004857737682, + "grad_norm": 1.0589905977249146, + "learning_rate": 5.234140180050029e-06, + "loss": 0.5442, + "step": 9355 + }, + { + "epoch": 1.623178348369188, + "grad_norm": 0.7929001450538635, + "learning_rate": 5.2295456306184715e-06, + "loss": 0.7058, + "step": 9356 + }, + { + "epoch": 1.6233518390006938, + "grad_norm": 0.7510325312614441, + "learning_rate": 5.2249527953153545e-06, + "loss": 0.8159, + "step": 9357 + }, + { + "epoch": 1.6235253296321999, + "grad_norm": 0.9143862128257751, + "learning_rate": 5.220361674673677e-06, + "loss": 0.5581, + "step": 9358 + }, + { + "epoch": 1.6236988202637057, + "grad_norm": 0.7704092860221863, + "learning_rate": 5.215772269226255e-06, + "loss": 0.6823, + "step": 9359 + }, + { + "epoch": 1.6238723108952118, + "grad_norm": 0.6990248560905457, + "learning_rate": 5.211184579505688e-06, + "loss": 0.7856, + "step": 9360 + }, + { + "epoch": 1.6240458015267176, + "grad_norm": 2.344926595687866, + "learning_rate": 5.206598606044384e-06, + "loss": 0.5398, + "step": 9361 + }, + { + "epoch": 1.6242192921582235, + "grad_norm": 1.0054848194122314, + "learning_rate": 5.2020143493745425e-06, + "loss": 0.5513, + "step": 9362 + }, + { + "epoch": 1.6243927827897293, + "grad_norm": 0.8317369222640991, + "learning_rate": 5.1974318100281905e-06, + "loss": 0.6141, + "step": 9363 + }, + { + "epoch": 1.6245662734212352, + "grad_norm": 1.0652899742126465, + "learning_rate": 5.1928509885371124e-06, + "loss": 0.6139, + "step": 9364 + }, + { + "epoch": 1.624739764052741, + "grad_norm": 0.6454002261161804, + "learning_rate": 5.188271885432938e-06, + "loss": 0.7656, + "step": 9365 + }, + { + "epoch": 1.624913254684247, + "grad_norm": 0.6984428763389587, + "learning_rate": 5.183694501247072e-06, + "loss": 0.7896, + "step": 9366 + }, + { + "epoch": 1.625086745315753, + "grad_norm": 0.6627167463302612, + "learning_rate": 5.179118836510721e-06, + "loss": 0.752, + "step": 9367 + }, + { + "epoch": 1.625260235947259, + "grad_norm": 1.319754719734192, + "learning_rate": 5.174544891754896e-06, + "loss": 0.8198, + "step": 9368 + }, + { + "epoch": 1.6254337265787648, + "grad_norm": 0.9097060561180115, + "learning_rate": 5.169972667510414e-06, + "loss": 0.6107, + "step": 9369 + }, + { + "epoch": 1.6256072172102707, + "grad_norm": 1.7754642963409424, + "learning_rate": 5.165402164307884e-06, + "loss": 0.679, + "step": 9370 + }, + { + "epoch": 1.6257807078417765, + "grad_norm": 1.5291123390197754, + "learning_rate": 5.160833382677721e-06, + "loss": 0.752, + "step": 9371 + }, + { + "epoch": 1.6259541984732824, + "grad_norm": 0.9237502217292786, + "learning_rate": 5.15626632315013e-06, + "loss": 0.6302, + "step": 9372 + }, + { + "epoch": 1.6261276891047882, + "grad_norm": 0.9657291173934937, + "learning_rate": 5.151700986255137e-06, + "loss": 0.6089, + "step": 9373 + }, + { + "epoch": 1.6263011797362943, + "grad_norm": 1.086250901222229, + "learning_rate": 5.147137372522537e-06, + "loss": 0.6372, + "step": 9374 + }, + { + "epoch": 1.6264746703678001, + "grad_norm": 0.9748876094818115, + "learning_rate": 5.142575482481957e-06, + "loss": 0.6594, + "step": 9375 + }, + { + "epoch": 1.6266481609993062, + "grad_norm": 1.02510404586792, + "learning_rate": 5.138015316662803e-06, + "loss": 0.6133, + "step": 9376 + }, + { + "epoch": 1.626821651630812, + "grad_norm": 1.3306300640106201, + "learning_rate": 5.1334568755942915e-06, + "loss": 0.5952, + "step": 9377 + }, + { + "epoch": 1.626995142262318, + "grad_norm": 0.8220638036727905, + "learning_rate": 5.128900159805425e-06, + "loss": 0.8262, + "step": 9378 + }, + { + "epoch": 1.6271686328938237, + "grad_norm": 1.0603889226913452, + "learning_rate": 5.124345169825031e-06, + "loss": 0.5902, + "step": 9379 + }, + { + "epoch": 1.6273421235253296, + "grad_norm": 0.9265195727348328, + "learning_rate": 5.119791906181713e-06, + "loss": 0.6499, + "step": 9380 + }, + { + "epoch": 1.6275156141568354, + "grad_norm": 0.8225482702255249, + "learning_rate": 5.115240369403882e-06, + "loss": 0.6842, + "step": 9381 + }, + { + "epoch": 1.6276891047883413, + "grad_norm": 0.9355594515800476, + "learning_rate": 5.110690560019744e-06, + "loss": 0.6641, + "step": 9382 + }, + { + "epoch": 1.6278625954198473, + "grad_norm": 0.9947420954704285, + "learning_rate": 5.106142478557323e-06, + "loss": 0.7224, + "step": 9383 + }, + { + "epoch": 1.6280360860513532, + "grad_norm": 0.775924026966095, + "learning_rate": 5.1015961255444235e-06, + "loss": 0.8152, + "step": 9384 + }, + { + "epoch": 1.6282095766828593, + "grad_norm": 0.7723604440689087, + "learning_rate": 5.097051501508652e-06, + "loss": 0.7355, + "step": 9385 + }, + { + "epoch": 1.628383067314365, + "grad_norm": 1.0363492965698242, + "learning_rate": 5.09250860697742e-06, + "loss": 0.8125, + "step": 9386 + }, + { + "epoch": 1.628556557945871, + "grad_norm": 1.3312816619873047, + "learning_rate": 5.087967442477928e-06, + "loss": 0.6643, + "step": 9387 + }, + { + "epoch": 1.6287300485773768, + "grad_norm": 0.6040915250778198, + "learning_rate": 5.083428008537197e-06, + "loss": 0.9412, + "step": 9388 + }, + { + "epoch": 1.6289035392088826, + "grad_norm": 0.8499444127082825, + "learning_rate": 5.078890305682027e-06, + "loss": 0.6536, + "step": 9389 + }, + { + "epoch": 1.6290770298403885, + "grad_norm": 0.8448061943054199, + "learning_rate": 5.074354334439022e-06, + "loss": 0.7201, + "step": 9390 + }, + { + "epoch": 1.6292505204718946, + "grad_norm": 0.9188697934150696, + "learning_rate": 5.069820095334583e-06, + "loss": 0.8396, + "step": 9391 + }, + { + "epoch": 1.6294240111034004, + "grad_norm": 1.2361544370651245, + "learning_rate": 5.065287588894933e-06, + "loss": 0.7703, + "step": 9392 + }, + { + "epoch": 1.6295975017349065, + "grad_norm": 0.6595848202705383, + "learning_rate": 5.060756815646046e-06, + "loss": 0.7507, + "step": 9393 + }, + { + "epoch": 1.6297709923664123, + "grad_norm": 1.0064538717269897, + "learning_rate": 5.056227776113747e-06, + "loss": 0.6707, + "step": 9394 + }, + { + "epoch": 1.6299444829979182, + "grad_norm": 0.6891892552375793, + "learning_rate": 5.05170047082362e-06, + "loss": 0.7815, + "step": 9395 + }, + { + "epoch": 1.630117973629424, + "grad_norm": 0.8169323801994324, + "learning_rate": 5.0471749003010835e-06, + "loss": 0.6184, + "step": 9396 + }, + { + "epoch": 1.6302914642609299, + "grad_norm": 1.0280102491378784, + "learning_rate": 5.0426510650713116e-06, + "loss": 0.5294, + "step": 9397 + }, + { + "epoch": 1.6304649548924357, + "grad_norm": 1.0078742504119873, + "learning_rate": 5.038128965659317e-06, + "loss": 0.7744, + "step": 9398 + }, + { + "epoch": 1.6306384455239415, + "grad_norm": 0.7181193232536316, + "learning_rate": 5.033608602589892e-06, + "loss": 0.7341, + "step": 9399 + }, + { + "epoch": 1.6308119361554476, + "grad_norm": 0.9698305130004883, + "learning_rate": 5.029089976387627e-06, + "loss": 0.7412, + "step": 9400 + }, + { + "epoch": 1.6309854267869535, + "grad_norm": 1.1855453252792358, + "learning_rate": 5.0245730875769095e-06, + "loss": 0.6323, + "step": 9401 + }, + { + "epoch": 1.6311589174184595, + "grad_norm": 1.0596458911895752, + "learning_rate": 5.020057936681939e-06, + "loss": 0.496, + "step": 9402 + }, + { + "epoch": 1.6313324080499654, + "grad_norm": 0.7069152593612671, + "learning_rate": 5.0155445242267006e-06, + "loss": 0.7473, + "step": 9403 + }, + { + "epoch": 1.6315058986814712, + "grad_norm": 1.200939655303955, + "learning_rate": 5.011032850734983e-06, + "loss": 0.7256, + "step": 9404 + }, + { + "epoch": 1.631679389312977, + "grad_norm": 0.9315928220748901, + "learning_rate": 5.006522916730368e-06, + "loss": 0.8682, + "step": 9405 + }, + { + "epoch": 1.631852879944483, + "grad_norm": 1.0779587030410767, + "learning_rate": 5.00201472273623e-06, + "loss": 0.7611, + "step": 9406 + }, + { + "epoch": 1.6320263705759888, + "grad_norm": 1.4412240982055664, + "learning_rate": 4.9975082692757705e-06, + "loss": 0.6843, + "step": 9407 + }, + { + "epoch": 1.6321998612074948, + "grad_norm": 1.177139163017273, + "learning_rate": 4.993003556871954e-06, + "loss": 0.6764, + "step": 9408 + }, + { + "epoch": 1.6323733518390007, + "grad_norm": 0.7385892868041992, + "learning_rate": 4.9885005860475626e-06, + "loss": 0.6788, + "step": 9409 + }, + { + "epoch": 1.6325468424705067, + "grad_norm": 0.9641989469528198, + "learning_rate": 4.983999357325164e-06, + "loss": 0.6836, + "step": 9410 + }, + { + "epoch": 1.6327203331020126, + "grad_norm": 0.8198578357696533, + "learning_rate": 4.9794998712271425e-06, + "loss": 0.8284, + "step": 9411 + }, + { + "epoch": 1.6328938237335184, + "grad_norm": 0.8635667562484741, + "learning_rate": 4.975002128275666e-06, + "loss": 0.5596, + "step": 9412 + }, + { + "epoch": 1.6330673143650243, + "grad_norm": 0.8504005074501038, + "learning_rate": 4.970506128992696e-06, + "loss": 0.6093, + "step": 9413 + }, + { + "epoch": 1.6332408049965301, + "grad_norm": 1.9095425605773926, + "learning_rate": 4.966011873900001e-06, + "loss": 0.5518, + "step": 9414 + }, + { + "epoch": 1.633414295628036, + "grad_norm": 1.2808948755264282, + "learning_rate": 4.961519363519154e-06, + "loss": 0.6757, + "step": 9415 + }, + { + "epoch": 1.6335877862595418, + "grad_norm": 0.9194421768188477, + "learning_rate": 4.957028598371498e-06, + "loss": 0.584, + "step": 9416 + }, + { + "epoch": 1.6337612768910479, + "grad_norm": 1.140230655670166, + "learning_rate": 4.9525395789782085e-06, + "loss": 0.5597, + "step": 9417 + }, + { + "epoch": 1.6339347675225537, + "grad_norm": 1.5511494874954224, + "learning_rate": 4.948052305860233e-06, + "loss": 0.6228, + "step": 9418 + }, + { + "epoch": 1.6341082581540598, + "grad_norm": 1.1746565103530884, + "learning_rate": 4.943566779538327e-06, + "loss": 0.6719, + "step": 9419 + }, + { + "epoch": 1.6342817487855656, + "grad_norm": 1.0987014770507812, + "learning_rate": 4.939083000533036e-06, + "loss": 0.8716, + "step": 9420 + }, + { + "epoch": 1.6344552394170715, + "grad_norm": 1.2873334884643555, + "learning_rate": 4.934600969364722e-06, + "loss": 0.6439, + "step": 9421 + }, + { + "epoch": 1.6346287300485773, + "grad_norm": 4.804620265960693, + "learning_rate": 4.93012068655351e-06, + "loss": 0.7188, + "step": 9422 + }, + { + "epoch": 1.6348022206800832, + "grad_norm": 0.764326810836792, + "learning_rate": 4.925642152619357e-06, + "loss": 0.6587, + "step": 9423 + }, + { + "epoch": 1.634975711311589, + "grad_norm": 0.8877264857292175, + "learning_rate": 4.92116536808199e-06, + "loss": 0.5992, + "step": 9424 + }, + { + "epoch": 1.635149201943095, + "grad_norm": 0.9197680354118347, + "learning_rate": 4.9166903334609675e-06, + "loss": 0.6672, + "step": 9425 + }, + { + "epoch": 1.635322692574601, + "grad_norm": 1.1835715770721436, + "learning_rate": 4.912217049275594e-06, + "loss": 0.5989, + "step": 9426 + }, + { + "epoch": 1.635496183206107, + "grad_norm": 1.8347687721252441, + "learning_rate": 4.907745516045017e-06, + "loss": 0.5378, + "step": 9427 + }, + { + "epoch": 1.6356696738376129, + "grad_norm": 0.8023619055747986, + "learning_rate": 4.903275734288162e-06, + "loss": 0.6934, + "step": 9428 + }, + { + "epoch": 1.6358431644691187, + "grad_norm": 0.764162003993988, + "learning_rate": 4.898807704523747e-06, + "loss": 0.7263, + "step": 9429 + }, + { + "epoch": 1.6360166551006246, + "grad_norm": 1.264714002609253, + "learning_rate": 4.8943414272702886e-06, + "loss": 0.6312, + "step": 9430 + }, + { + "epoch": 1.6361901457321304, + "grad_norm": 0.9907357692718506, + "learning_rate": 4.889876903046116e-06, + "loss": 0.7163, + "step": 9431 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.120741367340088, + "learning_rate": 4.885414132369335e-06, + "loss": 0.6915, + "step": 9432 + }, + { + "epoch": 1.6365371269951423, + "grad_norm": 0.8104992508888245, + "learning_rate": 4.8809531157578495e-06, + "loss": 0.6298, + "step": 9433 + }, + { + "epoch": 1.6367106176266482, + "grad_norm": 1.2727948427200317, + "learning_rate": 4.876493853729385e-06, + "loss": 0.6758, + "step": 9434 + }, + { + "epoch": 1.6368841082581542, + "grad_norm": 0.9034106135368347, + "learning_rate": 4.872036346801419e-06, + "loss": 0.6406, + "step": 9435 + }, + { + "epoch": 1.63705759888966, + "grad_norm": 0.7851207256317139, + "learning_rate": 4.867580595491268e-06, + "loss": 0.7035, + "step": 9436 + }, + { + "epoch": 1.637231089521166, + "grad_norm": 0.75726717710495, + "learning_rate": 4.863126600316021e-06, + "loss": 0.7432, + "step": 9437 + }, + { + "epoch": 1.6374045801526718, + "grad_norm": 1.7232260704040527, + "learning_rate": 4.858674361792571e-06, + "loss": 0.6581, + "step": 9438 + }, + { + "epoch": 1.6375780707841776, + "grad_norm": 0.7266525626182556, + "learning_rate": 4.854223880437599e-06, + "loss": 0.8169, + "step": 9439 + }, + { + "epoch": 1.6377515614156835, + "grad_norm": 1.4345743656158447, + "learning_rate": 4.849775156767598e-06, + "loss": 0.7527, + "step": 9440 + }, + { + "epoch": 1.6379250520471893, + "grad_norm": 0.799789309501648, + "learning_rate": 4.845328191298846e-06, + "loss": 0.6338, + "step": 9441 + }, + { + "epoch": 1.6380985426786954, + "grad_norm": 0.8746475577354431, + "learning_rate": 4.840882984547415e-06, + "loss": 0.6213, + "step": 9442 + }, + { + "epoch": 1.6382720333102012, + "grad_norm": 0.8918264508247375, + "learning_rate": 4.8364395370291715e-06, + "loss": 0.7064, + "step": 9443 + }, + { + "epoch": 1.6384455239417073, + "grad_norm": 1.0930944681167603, + "learning_rate": 4.8319978492598e-06, + "loss": 0.766, + "step": 9444 + }, + { + "epoch": 1.6386190145732131, + "grad_norm": 1.0904698371887207, + "learning_rate": 4.827557921754742e-06, + "loss": 0.6543, + "step": 9445 + }, + { + "epoch": 1.638792505204719, + "grad_norm": 0.778989851474762, + "learning_rate": 4.823119755029271e-06, + "loss": 0.6858, + "step": 9446 + }, + { + "epoch": 1.6389659958362248, + "grad_norm": 0.7581886053085327, + "learning_rate": 4.818683349598439e-06, + "loss": 0.7574, + "step": 9447 + }, + { + "epoch": 1.6391394864677307, + "grad_norm": 0.8794681429862976, + "learning_rate": 4.814248705977092e-06, + "loss": 0.6799, + "step": 9448 + }, + { + "epoch": 1.6393129770992365, + "grad_norm": 0.7979905605316162, + "learning_rate": 4.8098158246798734e-06, + "loss": 0.7778, + "step": 9449 + }, + { + "epoch": 1.6394864677307426, + "grad_norm": 0.7528773546218872, + "learning_rate": 4.805384706221232e-06, + "loss": 0.7991, + "step": 9450 + }, + { + "epoch": 1.6396599583622484, + "grad_norm": 1.7649091482162476, + "learning_rate": 4.800955351115402e-06, + "loss": 0.6687, + "step": 9451 + }, + { + "epoch": 1.6398334489937545, + "grad_norm": 1.0803954601287842, + "learning_rate": 4.796527759876415e-06, + "loss": 0.7651, + "step": 9452 + }, + { + "epoch": 1.6400069396252603, + "grad_norm": 1.5208661556243896, + "learning_rate": 4.79210193301809e-06, + "loss": 0.6697, + "step": 9453 + }, + { + "epoch": 1.6401804302567662, + "grad_norm": 0.707905113697052, + "learning_rate": 4.787677871054062e-06, + "loss": 0.7114, + "step": 9454 + }, + { + "epoch": 1.640353920888272, + "grad_norm": 0.8808759450912476, + "learning_rate": 4.783255574497742e-06, + "loss": 0.5546, + "step": 9455 + }, + { + "epoch": 1.6405274115197779, + "grad_norm": 0.6926727294921875, + "learning_rate": 4.7788350438623465e-06, + "loss": 0.7744, + "step": 9456 + }, + { + "epoch": 1.6407009021512837, + "grad_norm": 1.0511314868927002, + "learning_rate": 4.774416279660879e-06, + "loss": 0.6061, + "step": 9457 + }, + { + "epoch": 1.6408743927827896, + "grad_norm": 1.1505118608474731, + "learning_rate": 4.769999282406137e-06, + "loss": 0.7285, + "step": 9458 + }, + { + "epoch": 1.6410478834142956, + "grad_norm": 1.1086581945419312, + "learning_rate": 4.76558405261073e-06, + "loss": 0.6528, + "step": 9459 + }, + { + "epoch": 1.6412213740458015, + "grad_norm": 0.7956562042236328, + "learning_rate": 4.7611705907870474e-06, + "loss": 0.792, + "step": 9460 + }, + { + "epoch": 1.6413948646773076, + "grad_norm": 0.8844649791717529, + "learning_rate": 4.7567588974472734e-06, + "loss": 0.6443, + "step": 9461 + }, + { + "epoch": 1.6415683553088134, + "grad_norm": 0.9366843104362488, + "learning_rate": 4.7523489731033845e-06, + "loss": 0.7323, + "step": 9462 + }, + { + "epoch": 1.6417418459403192, + "grad_norm": 0.931502640247345, + "learning_rate": 4.747940818267178e-06, + "loss": 0.6382, + "step": 9463 + }, + { + "epoch": 1.641915336571825, + "grad_norm": 1.1287342309951782, + "learning_rate": 4.743534433450199e-06, + "loss": 0.5614, + "step": 9464 + }, + { + "epoch": 1.642088827203331, + "grad_norm": 0.8692043423652649, + "learning_rate": 4.739129819163832e-06, + "loss": 0.5919, + "step": 9465 + }, + { + "epoch": 1.6422623178348368, + "grad_norm": 0.8899786472320557, + "learning_rate": 4.734726975919233e-06, + "loss": 0.7063, + "step": 9466 + }, + { + "epoch": 1.6424358084663429, + "grad_norm": 3.2243151664733887, + "learning_rate": 4.730325904227355e-06, + "loss": 0.7681, + "step": 9467 + }, + { + "epoch": 1.6426092990978487, + "grad_norm": 0.6788902878761292, + "learning_rate": 4.725926604598942e-06, + "loss": 0.7256, + "step": 9468 + }, + { + "epoch": 1.6427827897293548, + "grad_norm": 1.6101783514022827, + "learning_rate": 4.721529077544551e-06, + "loss": 0.6517, + "step": 9469 + }, + { + "epoch": 1.6429562803608606, + "grad_norm": 1.1097967624664307, + "learning_rate": 4.7171333235745145e-06, + "loss": 0.7975, + "step": 9470 + }, + { + "epoch": 1.6431297709923665, + "grad_norm": 1.0666511058807373, + "learning_rate": 4.712739343198962e-06, + "loss": 0.6235, + "step": 9471 + }, + { + "epoch": 1.6433032616238723, + "grad_norm": 1.3334972858428955, + "learning_rate": 4.708347136927818e-06, + "loss": 0.6887, + "step": 9472 + }, + { + "epoch": 1.6434767522553781, + "grad_norm": 0.9498238563537598, + "learning_rate": 4.703956705270818e-06, + "loss": 0.6815, + "step": 9473 + }, + { + "epoch": 1.643650242886884, + "grad_norm": 1.7042254209518433, + "learning_rate": 4.699568048737453e-06, + "loss": 0.5963, + "step": 9474 + }, + { + "epoch": 1.64382373351839, + "grad_norm": 0.7952605485916138, + "learning_rate": 4.695181167837051e-06, + "loss": 0.7068, + "step": 9475 + }, + { + "epoch": 1.643997224149896, + "grad_norm": 0.9149199724197388, + "learning_rate": 4.690796063078709e-06, + "loss": 0.7103, + "step": 9476 + }, + { + "epoch": 1.6441707147814018, + "grad_norm": 0.8649287819862366, + "learning_rate": 4.686412734971322e-06, + "loss": 0.688, + "step": 9477 + }, + { + "epoch": 1.6443442054129078, + "grad_norm": 1.0638666152954102, + "learning_rate": 4.6820311840235745e-06, + "loss": 0.5966, + "step": 9478 + }, + { + "epoch": 1.6445176960444137, + "grad_norm": 0.9548177123069763, + "learning_rate": 4.677651410743964e-06, + "loss": 0.5496, + "step": 9479 + }, + { + "epoch": 1.6446911866759195, + "grad_norm": 0.9970127940177917, + "learning_rate": 4.67327341564076e-06, + "loss": 0.7563, + "step": 9480 + }, + { + "epoch": 1.6448646773074254, + "grad_norm": 0.8579133749008179, + "learning_rate": 4.668897199222031e-06, + "loss": 0.6549, + "step": 9481 + }, + { + "epoch": 1.6450381679389312, + "grad_norm": 1.202392578125, + "learning_rate": 4.6645227619956515e-06, + "loss": 0.8528, + "step": 9482 + }, + { + "epoch": 1.645211658570437, + "grad_norm": 1.3389400243759155, + "learning_rate": 4.660150104469274e-06, + "loss": 0.6667, + "step": 9483 + }, + { + "epoch": 1.6453851492019431, + "grad_norm": 0.9235726594924927, + "learning_rate": 4.655779227150352e-06, + "loss": 0.6602, + "step": 9484 + }, + { + "epoch": 1.645558639833449, + "grad_norm": 0.8955357074737549, + "learning_rate": 4.6514101305461255e-06, + "loss": 0.7515, + "step": 9485 + }, + { + "epoch": 1.645732130464955, + "grad_norm": 1.1202785968780518, + "learning_rate": 4.647042815163649e-06, + "loss": 0.7966, + "step": 9486 + }, + { + "epoch": 1.6459056210964609, + "grad_norm": 0.7847790122032166, + "learning_rate": 4.6426772815097306e-06, + "loss": 0.6655, + "step": 9487 + }, + { + "epoch": 1.6460791117279667, + "grad_norm": 1.8485238552093506, + "learning_rate": 4.638313530091016e-06, + "loss": 0.8203, + "step": 9488 + }, + { + "epoch": 1.6462526023594726, + "grad_norm": 0.9468644261360168, + "learning_rate": 4.633951561413916e-06, + "loss": 0.5619, + "step": 9489 + }, + { + "epoch": 1.6464260929909784, + "grad_norm": 0.9301597476005554, + "learning_rate": 4.629591375984641e-06, + "loss": 0.6742, + "step": 9490 + }, + { + "epoch": 1.6465995836224843, + "grad_norm": 0.8259731531143188, + "learning_rate": 4.625232974309193e-06, + "loss": 0.6017, + "step": 9491 + }, + { + "epoch": 1.6467730742539903, + "grad_norm": 0.9795178174972534, + "learning_rate": 4.620876356893385e-06, + "loss": 0.782, + "step": 9492 + }, + { + "epoch": 1.6469465648854962, + "grad_norm": 0.7310971021652222, + "learning_rate": 4.616521524242783e-06, + "loss": 0.7328, + "step": 9493 + }, + { + "epoch": 1.6471200555170022, + "grad_norm": 1.2041423320770264, + "learning_rate": 4.612168476862789e-06, + "loss": 0.7354, + "step": 9494 + }, + { + "epoch": 1.647293546148508, + "grad_norm": 0.9853072166442871, + "learning_rate": 4.607817215258574e-06, + "loss": 0.5101, + "step": 9495 + }, + { + "epoch": 1.647467036780014, + "grad_norm": 0.8522449731826782, + "learning_rate": 4.603467739935108e-06, + "loss": 0.6978, + "step": 9496 + }, + { + "epoch": 1.6476405274115198, + "grad_norm": 1.000693917274475, + "learning_rate": 4.599120051397144e-06, + "loss": 0.572, + "step": 9497 + }, + { + "epoch": 1.6478140180430256, + "grad_norm": 1.2344470024108887, + "learning_rate": 4.594774150149251e-06, + "loss": 0.5881, + "step": 9498 + }, + { + "epoch": 1.6479875086745315, + "grad_norm": 0.7889182567596436, + "learning_rate": 4.5904300366957675e-06, + "loss": 0.7327, + "step": 9499 + }, + { + "epoch": 1.6481609993060373, + "grad_norm": 1.056145191192627, + "learning_rate": 4.586087711540832e-06, + "loss": 0.5724, + "step": 9500 + }, + { + "epoch": 1.6483344899375434, + "grad_norm": 0.9913428425788879, + "learning_rate": 4.581747175188376e-06, + "loss": 0.8203, + "step": 9501 + }, + { + "epoch": 1.6485079805690492, + "grad_norm": 1.0335627794265747, + "learning_rate": 4.57740842814213e-06, + "loss": 0.6425, + "step": 9502 + }, + { + "epoch": 1.6486814712005553, + "grad_norm": 1.2480660676956177, + "learning_rate": 4.573071470905608e-06, + "loss": 0.5669, + "step": 9503 + }, + { + "epoch": 1.6488549618320612, + "grad_norm": 0.8680832386016846, + "learning_rate": 4.568736303982115e-06, + "loss": 0.6144, + "step": 9504 + }, + { + "epoch": 1.649028452463567, + "grad_norm": 1.255113959312439, + "learning_rate": 4.564402927874758e-06, + "loss": 0.866, + "step": 9505 + }, + { + "epoch": 1.6492019430950728, + "grad_norm": 1.0716314315795898, + "learning_rate": 4.560071343086421e-06, + "loss": 0.6868, + "step": 9506 + }, + { + "epoch": 1.6493754337265787, + "grad_norm": 0.8827287554740906, + "learning_rate": 4.555741550119801e-06, + "loss": 0.5677, + "step": 9507 + }, + { + "epoch": 1.6495489243580845, + "grad_norm": 1.0018595457077026, + "learning_rate": 4.55141354947737e-06, + "loss": 0.6937, + "step": 9508 + }, + { + "epoch": 1.6497224149895906, + "grad_norm": 0.8144444227218628, + "learning_rate": 4.547087341661398e-06, + "loss": 0.8416, + "step": 9509 + }, + { + "epoch": 1.6498959056210964, + "grad_norm": 0.8745377659797668, + "learning_rate": 4.542762927173941e-06, + "loss": 0.6305, + "step": 9510 + }, + { + "epoch": 1.6500693962526025, + "grad_norm": 1.7324726581573486, + "learning_rate": 4.53844030651686e-06, + "loss": 0.6649, + "step": 9511 + }, + { + "epoch": 1.6502428868841084, + "grad_norm": 1.2096058130264282, + "learning_rate": 4.534119480191801e-06, + "loss": 0.5712, + "step": 9512 + }, + { + "epoch": 1.6504163775156142, + "grad_norm": 1.437991738319397, + "learning_rate": 4.5298004487001966e-06, + "loss": 0.6987, + "step": 9513 + }, + { + "epoch": 1.65058986814712, + "grad_norm": 1.0967353582382202, + "learning_rate": 4.525483212543273e-06, + "loss": 0.5667, + "step": 9514 + }, + { + "epoch": 1.650763358778626, + "grad_norm": 1.1616286039352417, + "learning_rate": 4.521167772222064e-06, + "loss": 0.6807, + "step": 9515 + }, + { + "epoch": 1.6509368494101317, + "grad_norm": 1.008234977722168, + "learning_rate": 4.516854128237358e-06, + "loss": 0.5516, + "step": 9516 + }, + { + "epoch": 1.6511103400416376, + "grad_norm": 0.8805198073387146, + "learning_rate": 4.51254228108978e-06, + "loss": 0.6119, + "step": 9517 + }, + { + "epoch": 1.6512838306731437, + "grad_norm": 1.322674036026001, + "learning_rate": 4.5082322312797166e-06, + "loss": 0.7122, + "step": 9518 + }, + { + "epoch": 1.6514573213046495, + "grad_norm": 0.9941253662109375, + "learning_rate": 4.503923979307352e-06, + "loss": 0.7974, + "step": 9519 + }, + { + "epoch": 1.6516308119361556, + "grad_norm": 0.9113391041755676, + "learning_rate": 4.499617525672664e-06, + "loss": 0.6083, + "step": 9520 + }, + { + "epoch": 1.6518043025676614, + "grad_norm": 0.6696825623512268, + "learning_rate": 4.4953128708754326e-06, + "loss": 0.7052, + "step": 9521 + }, + { + "epoch": 1.6519777931991673, + "grad_norm": 1.6888654232025146, + "learning_rate": 4.491010015415198e-06, + "loss": 0.6499, + "step": 9522 + }, + { + "epoch": 1.6521512838306731, + "grad_norm": 0.9574541449546814, + "learning_rate": 4.486708959791328e-06, + "loss": 0.7771, + "step": 9523 + }, + { + "epoch": 1.652324774462179, + "grad_norm": 0.6409510970115662, + "learning_rate": 4.4824097045029615e-06, + "loss": 0.8381, + "step": 9524 + }, + { + "epoch": 1.6524982650936848, + "grad_norm": 0.8876250982284546, + "learning_rate": 4.478112250049029e-06, + "loss": 0.5289, + "step": 9525 + }, + { + "epoch": 1.6526717557251909, + "grad_norm": 1.099391222000122, + "learning_rate": 4.473816596928251e-06, + "loss": 0.6506, + "step": 9526 + }, + { + "epoch": 1.6528452463566967, + "grad_norm": 0.916703462600708, + "learning_rate": 4.469522745639154e-06, + "loss": 0.8403, + "step": 9527 + }, + { + "epoch": 1.6530187369882028, + "grad_norm": 0.8563109636306763, + "learning_rate": 4.465230696680038e-06, + "loss": 0.8024, + "step": 9528 + }, + { + "epoch": 1.6531922276197086, + "grad_norm": 0.6839407086372375, + "learning_rate": 4.460940450548998e-06, + "loss": 0.575, + "step": 9529 + }, + { + "epoch": 1.6533657182512145, + "grad_norm": 1.0033677816390991, + "learning_rate": 4.456652007743929e-06, + "loss": 0.6089, + "step": 9530 + }, + { + "epoch": 1.6535392088827203, + "grad_norm": 0.8285371661186218, + "learning_rate": 4.452365368762508e-06, + "loss": 0.6914, + "step": 9531 + }, + { + "epoch": 1.6537126995142262, + "grad_norm": 0.828673243522644, + "learning_rate": 4.448080534102202e-06, + "loss": 0.8093, + "step": 9532 + }, + { + "epoch": 1.653886190145732, + "grad_norm": 1.5755674839019775, + "learning_rate": 4.4437975042602635e-06, + "loss": 0.548, + "step": 9533 + }, + { + "epoch": 1.654059680777238, + "grad_norm": 2.369816303253174, + "learning_rate": 4.439516279733764e-06, + "loss": 0.7891, + "step": 9534 + }, + { + "epoch": 1.654233171408744, + "grad_norm": 0.7620404362678528, + "learning_rate": 4.435236861019521e-06, + "loss": 0.688, + "step": 9535 + }, + { + "epoch": 1.6544066620402498, + "grad_norm": 1.0021594762802124, + "learning_rate": 4.430959248614184e-06, + "loss": 0.6887, + "step": 9536 + }, + { + "epoch": 1.6545801526717558, + "grad_norm": 0.8196793794631958, + "learning_rate": 4.4266834430141654e-06, + "loss": 0.7257, + "step": 9537 + }, + { + "epoch": 1.6547536433032617, + "grad_norm": 0.8745413422584534, + "learning_rate": 4.42240944471568e-06, + "loss": 0.7566, + "step": 9538 + }, + { + "epoch": 1.6549271339347675, + "grad_norm": 1.0237301588058472, + "learning_rate": 4.418137254214725e-06, + "loss": 0.5536, + "step": 9539 + }, + { + "epoch": 1.6551006245662734, + "grad_norm": 0.9248822927474976, + "learning_rate": 4.413866872007104e-06, + "loss": 0.7384, + "step": 9540 + }, + { + "epoch": 1.6552741151977792, + "grad_norm": 0.9885796308517456, + "learning_rate": 4.409598298588394e-06, + "loss": 0.6445, + "step": 9541 + }, + { + "epoch": 1.655447605829285, + "grad_norm": 0.7725172638893127, + "learning_rate": 4.405331534453967e-06, + "loss": 0.7991, + "step": 9542 + }, + { + "epoch": 1.6556210964607911, + "grad_norm": 1.1244972944259644, + "learning_rate": 4.4010665800989804e-06, + "loss": 0.6859, + "step": 9543 + }, + { + "epoch": 1.655794587092297, + "grad_norm": 0.9435034394264221, + "learning_rate": 4.396803436018406e-06, + "loss": 0.5773, + "step": 9544 + }, + { + "epoch": 1.655968077723803, + "grad_norm": 0.8756732940673828, + "learning_rate": 4.3925421027069645e-06, + "loss": 0.7764, + "step": 9545 + }, + { + "epoch": 1.656141568355309, + "grad_norm": 1.3764512538909912, + "learning_rate": 4.3882825806592024e-06, + "loss": 0.757, + "step": 9546 + }, + { + "epoch": 1.6563150589868147, + "grad_norm": 0.8957831859588623, + "learning_rate": 4.3840248703694365e-06, + "loss": 0.7227, + "step": 9547 + }, + { + "epoch": 1.6564885496183206, + "grad_norm": 0.6774768829345703, + "learning_rate": 4.379768972331784e-06, + "loss": 0.8711, + "step": 9548 + }, + { + "epoch": 1.6566620402498264, + "grad_norm": 1.0495319366455078, + "learning_rate": 4.375514887040135e-06, + "loss": 0.6661, + "step": 9549 + }, + { + "epoch": 1.6568355308813323, + "grad_norm": 1.0048288106918335, + "learning_rate": 4.371262614988196e-06, + "loss": 0.6232, + "step": 9550 + }, + { + "epoch": 1.6570090215128384, + "grad_norm": 1.0539934635162354, + "learning_rate": 4.367012156669441e-06, + "loss": 0.5627, + "step": 9551 + }, + { + "epoch": 1.6571825121443442, + "grad_norm": 0.8260024785995483, + "learning_rate": 4.362763512577144e-06, + "loss": 0.7385, + "step": 9552 + }, + { + "epoch": 1.6573560027758503, + "grad_norm": 1.6775082349777222, + "learning_rate": 4.358516683204355e-06, + "loss": 0.6459, + "step": 9553 + }, + { + "epoch": 1.6575294934073561, + "grad_norm": 1.055074691772461, + "learning_rate": 4.354271669043934e-06, + "loss": 0.5222, + "step": 9554 + }, + { + "epoch": 1.657702984038862, + "grad_norm": 0.8306162357330322, + "learning_rate": 4.350028470588521e-06, + "loss": 0.7744, + "step": 9555 + }, + { + "epoch": 1.6578764746703678, + "grad_norm": 1.4615000486373901, + "learning_rate": 4.345787088330537e-06, + "loss": 0.8052, + "step": 9556 + }, + { + "epoch": 1.6580499653018737, + "grad_norm": 1.051827311515808, + "learning_rate": 4.341547522762202e-06, + "loss": 0.5841, + "step": 9557 + }, + { + "epoch": 1.6582234559333795, + "grad_norm": 0.6133503317832947, + "learning_rate": 4.33730977437552e-06, + "loss": 0.7893, + "step": 9558 + }, + { + "epoch": 1.6583969465648853, + "grad_norm": 0.9759916067123413, + "learning_rate": 4.333073843662292e-06, + "loss": 0.5553, + "step": 9559 + }, + { + "epoch": 1.6585704371963914, + "grad_norm": 1.054907202720642, + "learning_rate": 4.328839731114101e-06, + "loss": 0.6703, + "step": 9560 + }, + { + "epoch": 1.6587439278278973, + "grad_norm": 0.9655316472053528, + "learning_rate": 4.324607437222319e-06, + "loss": 0.6436, + "step": 9561 + }, + { + "epoch": 1.6589174184594033, + "grad_norm": 0.7934287190437317, + "learning_rate": 4.3203769624781055e-06, + "loss": 0.752, + "step": 9562 + }, + { + "epoch": 1.6590909090909092, + "grad_norm": 1.0748814344406128, + "learning_rate": 4.316148307372425e-06, + "loss": 0.5553, + "step": 9563 + }, + { + "epoch": 1.659264399722415, + "grad_norm": 0.9359804391860962, + "learning_rate": 4.311921472395999e-06, + "loss": 0.7373, + "step": 9564 + }, + { + "epoch": 1.6594378903539209, + "grad_norm": 0.7267853617668152, + "learning_rate": 4.307696458039372e-06, + "loss": 0.7529, + "step": 9565 + }, + { + "epoch": 1.6596113809854267, + "grad_norm": 1.273830533027649, + "learning_rate": 4.303473264792857e-06, + "loss": 0.5992, + "step": 9566 + }, + { + "epoch": 1.6597848716169326, + "grad_norm": 0.8594560623168945, + "learning_rate": 4.2992518931465566e-06, + "loss": 0.6937, + "step": 9567 + }, + { + "epoch": 1.6599583622484386, + "grad_norm": 0.934070885181427, + "learning_rate": 4.295032343590366e-06, + "loss": 0.6724, + "step": 9568 + }, + { + "epoch": 1.6601318528799445, + "grad_norm": 1.1754329204559326, + "learning_rate": 4.290814616613976e-06, + "loss": 0.6929, + "step": 9569 + }, + { + "epoch": 1.6603053435114505, + "grad_norm": 1.1788092851638794, + "learning_rate": 4.286598712706858e-06, + "loss": 0.6543, + "step": 9570 + }, + { + "epoch": 1.6604788341429564, + "grad_norm": 1.1722832918167114, + "learning_rate": 4.282384632358265e-06, + "loss": 0.5731, + "step": 9571 + }, + { + "epoch": 1.6606523247744622, + "grad_norm": 0.9637743830680847, + "learning_rate": 4.278172376057246e-06, + "loss": 0.6353, + "step": 9572 + }, + { + "epoch": 1.660825815405968, + "grad_norm": 1.0516408681869507, + "learning_rate": 4.2739619442926525e-06, + "loss": 0.5469, + "step": 9573 + }, + { + "epoch": 1.660999306037474, + "grad_norm": 1.462754726409912, + "learning_rate": 4.269753337553091e-06, + "loss": 0.7294, + "step": 9574 + }, + { + "epoch": 1.6611727966689798, + "grad_norm": 0.81769859790802, + "learning_rate": 4.265546556326989e-06, + "loss": 0.6873, + "step": 9575 + }, + { + "epoch": 1.6613462873004856, + "grad_norm": 1.1327284574508667, + "learning_rate": 4.2613416011025424e-06, + "loss": 0.6427, + "step": 9576 + }, + { + "epoch": 1.6615197779319917, + "grad_norm": 0.9745861887931824, + "learning_rate": 4.257138472367737e-06, + "loss": 0.6151, + "step": 9577 + }, + { + "epoch": 1.6616932685634975, + "grad_norm": 1.307094931602478, + "learning_rate": 4.25293717061036e-06, + "loss": 0.8315, + "step": 9578 + }, + { + "epoch": 1.6618667591950036, + "grad_norm": 0.8174538612365723, + "learning_rate": 4.248737696317975e-06, + "loss": 0.5729, + "step": 9579 + }, + { + "epoch": 1.6620402498265094, + "grad_norm": 0.9411543011665344, + "learning_rate": 4.244540049977934e-06, + "loss": 0.5571, + "step": 9580 + }, + { + "epoch": 1.6622137404580153, + "grad_norm": 0.8956382870674133, + "learning_rate": 4.240344232077373e-06, + "loss": 0.7014, + "step": 9581 + }, + { + "epoch": 1.6623872310895211, + "grad_norm": 0.8056433796882629, + "learning_rate": 4.236150243103234e-06, + "loss": 0.6411, + "step": 9582 + }, + { + "epoch": 1.662560721721027, + "grad_norm": 1.0418583154678345, + "learning_rate": 4.231958083542229e-06, + "loss": 0.8259, + "step": 9583 + }, + { + "epoch": 1.6627342123525328, + "grad_norm": 1.3381532430648804, + "learning_rate": 4.227767753880861e-06, + "loss": 0.5889, + "step": 9584 + }, + { + "epoch": 1.662907702984039, + "grad_norm": 0.6461206674575806, + "learning_rate": 4.223579254605414e-06, + "loss": 0.7968, + "step": 9585 + }, + { + "epoch": 1.6630811936155447, + "grad_norm": 0.9801682233810425, + "learning_rate": 4.2193925862019934e-06, + "loss": 0.6721, + "step": 9586 + }, + { + "epoch": 1.6632546842470508, + "grad_norm": 0.9827626943588257, + "learning_rate": 4.2152077491564385e-06, + "loss": 0.6852, + "step": 9587 + }, + { + "epoch": 1.6634281748785567, + "grad_norm": 1.6362828016281128, + "learning_rate": 4.211024743954424e-06, + "loss": 0.7632, + "step": 9588 + }, + { + "epoch": 1.6636016655100625, + "grad_norm": 1.285579800605774, + "learning_rate": 4.206843571081383e-06, + "loss": 0.6509, + "step": 9589 + }, + { + "epoch": 1.6637751561415683, + "grad_norm": 0.9165834784507751, + "learning_rate": 4.2026642310225505e-06, + "loss": 0.6322, + "step": 9590 + }, + { + "epoch": 1.6639486467730742, + "grad_norm": 1.1765949726104736, + "learning_rate": 4.198486724262935e-06, + "loss": 0.5754, + "step": 9591 + }, + { + "epoch": 1.66412213740458, + "grad_norm": 1.3145416975021362, + "learning_rate": 4.194311051287359e-06, + "loss": 0.6803, + "step": 9592 + }, + { + "epoch": 1.664295628036086, + "grad_norm": 0.8858749866485596, + "learning_rate": 4.190137212580392e-06, + "loss": 0.5632, + "step": 9593 + }, + { + "epoch": 1.664469118667592, + "grad_norm": 0.9987931251525879, + "learning_rate": 4.185965208626428e-06, + "loss": 0.6362, + "step": 9594 + }, + { + "epoch": 1.6646426092990978, + "grad_norm": 0.7726349830627441, + "learning_rate": 4.181795039909631e-06, + "loss": 0.8796, + "step": 9595 + }, + { + "epoch": 1.6648160999306039, + "grad_norm": 1.1507006883621216, + "learning_rate": 4.177626706913948e-06, + "loss": 0.5765, + "step": 9596 + }, + { + "epoch": 1.6649895905621097, + "grad_norm": 0.9931668043136597, + "learning_rate": 4.173460210123119e-06, + "loss": 0.6261, + "step": 9597 + }, + { + "epoch": 1.6651630811936156, + "grad_norm": 0.8501849174499512, + "learning_rate": 4.16929555002068e-06, + "loss": 0.7119, + "step": 9598 + }, + { + "epoch": 1.6653365718251214, + "grad_norm": 0.8432859182357788, + "learning_rate": 4.16513272708994e-06, + "loss": 0.6453, + "step": 9599 + }, + { + "epoch": 1.6655100624566272, + "grad_norm": 1.5360214710235596, + "learning_rate": 4.160971741813995e-06, + "loss": 0.5487, + "step": 9600 + }, + { + "epoch": 1.665683553088133, + "grad_norm": 1.0905430316925049, + "learning_rate": 4.156812594675732e-06, + "loss": 0.6917, + "step": 9601 + }, + { + "epoch": 1.6658570437196392, + "grad_norm": 0.9105680584907532, + "learning_rate": 4.152655286157834e-06, + "loss": 0.5996, + "step": 9602 + }, + { + "epoch": 1.666030534351145, + "grad_norm": 1.583189606666565, + "learning_rate": 4.148499816742755e-06, + "loss": 0.5939, + "step": 9603 + }, + { + "epoch": 1.666204024982651, + "grad_norm": 0.7403680086135864, + "learning_rate": 4.144346186912738e-06, + "loss": 0.8333, + "step": 9604 + }, + { + "epoch": 1.666377515614157, + "grad_norm": 0.8469246625900269, + "learning_rate": 4.140194397149833e-06, + "loss": 0.7054, + "step": 9605 + }, + { + "epoch": 1.6665510062456628, + "grad_norm": 1.1953257322311401, + "learning_rate": 4.136044447935837e-06, + "loss": 0.629, + "step": 9606 + }, + { + "epoch": 1.6667244968771686, + "grad_norm": 1.220877766609192, + "learning_rate": 4.1318963397523725e-06, + "loss": 0.6538, + "step": 9607 + }, + { + "epoch": 1.6668979875086745, + "grad_norm": 0.7938002347946167, + "learning_rate": 4.127750073080829e-06, + "loss": 0.8376, + "step": 9608 + }, + { + "epoch": 1.6670714781401803, + "grad_norm": 0.8775656819343567, + "learning_rate": 4.123605648402385e-06, + "loss": 0.5337, + "step": 9609 + }, + { + "epoch": 1.6672449687716864, + "grad_norm": 0.7513276934623718, + "learning_rate": 4.119463066197997e-06, + "loss": 0.7759, + "step": 9610 + }, + { + "epoch": 1.6674184594031922, + "grad_norm": 2.4658586978912354, + "learning_rate": 4.115322326948432e-06, + "loss": 0.6847, + "step": 9611 + }, + { + "epoch": 1.6675919500346983, + "grad_norm": 0.5565110445022583, + "learning_rate": 4.111183431134223e-06, + "loss": 0.8, + "step": 9612 + }, + { + "epoch": 1.6677654406662041, + "grad_norm": 0.8709799647331238, + "learning_rate": 4.1070463792356865e-06, + "loss": 0.7014, + "step": 9613 + }, + { + "epoch": 1.66793893129771, + "grad_norm": 0.8792738914489746, + "learning_rate": 4.102911171732933e-06, + "loss": 0.6232, + "step": 9614 + }, + { + "epoch": 1.6681124219292158, + "grad_norm": 1.0803195238113403, + "learning_rate": 4.098777809105871e-06, + "loss": 0.8948, + "step": 9615 + }, + { + "epoch": 1.6682859125607217, + "grad_norm": 1.4542795419692993, + "learning_rate": 4.094646291834166e-06, + "loss": 0.5713, + "step": 9616 + }, + { + "epoch": 1.6684594031922275, + "grad_norm": 0.8021177649497986, + "learning_rate": 4.090516620397294e-06, + "loss": 0.7981, + "step": 9617 + }, + { + "epoch": 1.6686328938237334, + "grad_norm": 0.8744016289710999, + "learning_rate": 4.086388795274508e-06, + "loss": 0.6996, + "step": 9618 + }, + { + "epoch": 1.6688063844552394, + "grad_norm": 0.9589870572090149, + "learning_rate": 4.082262816944845e-06, + "loss": 0.6692, + "step": 9619 + }, + { + "epoch": 1.6689798750867453, + "grad_norm": 0.8657647371292114, + "learning_rate": 4.078138685887125e-06, + "loss": 0.6997, + "step": 9620 + }, + { + "epoch": 1.6691533657182513, + "grad_norm": 1.812299370765686, + "learning_rate": 4.074016402579968e-06, + "loss": 0.7581, + "step": 9621 + }, + { + "epoch": 1.6693268563497572, + "grad_norm": 0.8271260261535645, + "learning_rate": 4.069895967501765e-06, + "loss": 0.6647, + "step": 9622 + }, + { + "epoch": 1.669500346981263, + "grad_norm": 0.9010117650032043, + "learning_rate": 4.065777381130698e-06, + "loss": 0.6317, + "step": 9623 + }, + { + "epoch": 1.6696738376127689, + "grad_norm": 0.878829836845398, + "learning_rate": 4.0616606439447315e-06, + "loss": 0.781, + "step": 9624 + }, + { + "epoch": 1.6698473282442747, + "grad_norm": 1.6823614835739136, + "learning_rate": 4.057545756421615e-06, + "loss": 0.6448, + "step": 9625 + }, + { + "epoch": 1.6700208188757806, + "grad_norm": 1.138664722442627, + "learning_rate": 4.053432719038895e-06, + "loss": 0.7673, + "step": 9626 + }, + { + "epoch": 1.6701943095072866, + "grad_norm": 0.7689375281333923, + "learning_rate": 4.049321532273889e-06, + "loss": 0.6381, + "step": 9627 + }, + { + "epoch": 1.6703678001387925, + "grad_norm": 1.0617420673370361, + "learning_rate": 4.045212196603705e-06, + "loss": 0.5526, + "step": 9628 + }, + { + "epoch": 1.6705412907702986, + "grad_norm": 1.1108149290084839, + "learning_rate": 4.041104712505233e-06, + "loss": 0.8079, + "step": 9629 + }, + { + "epoch": 1.6707147814018044, + "grad_norm": 0.8816924095153809, + "learning_rate": 4.03699908045516e-06, + "loss": 0.7157, + "step": 9630 + }, + { + "epoch": 1.6708882720333103, + "grad_norm": 0.7140135169029236, + "learning_rate": 4.0328953009299425e-06, + "loss": 0.579, + "step": 9631 + }, + { + "epoch": 1.671061762664816, + "grad_norm": 0.8854278326034546, + "learning_rate": 4.028793374405833e-06, + "loss": 0.6202, + "step": 9632 + }, + { + "epoch": 1.671235253296322, + "grad_norm": 0.7356824278831482, + "learning_rate": 4.024693301358855e-06, + "loss": 0.723, + "step": 9633 + }, + { + "epoch": 1.6714087439278278, + "grad_norm": 1.2458597421646118, + "learning_rate": 4.020595082264847e-06, + "loss": 0.854, + "step": 9634 + }, + { + "epoch": 1.6715822345593336, + "grad_norm": 1.072119116783142, + "learning_rate": 4.016498717599387e-06, + "loss": 0.6539, + "step": 9635 + }, + { + "epoch": 1.6717557251908397, + "grad_norm": 1.0929231643676758, + "learning_rate": 4.012404207837881e-06, + "loss": 0.7112, + "step": 9636 + }, + { + "epoch": 1.6719292158223455, + "grad_norm": 0.7963440418243408, + "learning_rate": 4.008311553455497e-06, + "loss": 0.7461, + "step": 9637 + }, + { + "epoch": 1.6721027064538516, + "grad_norm": 1.0182315111160278, + "learning_rate": 4.0042207549271905e-06, + "loss": 0.6599, + "step": 9638 + }, + { + "epoch": 1.6722761970853575, + "grad_norm": 0.9329388737678528, + "learning_rate": 4.0001318127276985e-06, + "loss": 0.7798, + "step": 9639 + }, + { + "epoch": 1.6724496877168633, + "grad_norm": 0.868603527545929, + "learning_rate": 3.996044727331558e-06, + "loss": 0.5586, + "step": 9640 + }, + { + "epoch": 1.6726231783483692, + "grad_norm": 0.8270264863967896, + "learning_rate": 3.991959499213076e-06, + "loss": 0.6306, + "step": 9641 + }, + { + "epoch": 1.672796668979875, + "grad_norm": 0.9222108721733093, + "learning_rate": 3.987876128846349e-06, + "loss": 0.6582, + "step": 9642 + }, + { + "epoch": 1.6729701596113808, + "grad_norm": 0.7447760105133057, + "learning_rate": 3.983794616705248e-06, + "loss": 0.8029, + "step": 9643 + }, + { + "epoch": 1.673143650242887, + "grad_norm": 0.8347333669662476, + "learning_rate": 3.979714963263455e-06, + "loss": 0.5641, + "step": 9644 + }, + { + "epoch": 1.6733171408743928, + "grad_norm": 1.2873328924179077, + "learning_rate": 3.975637168994397e-06, + "loss": 0.6516, + "step": 9645 + }, + { + "epoch": 1.6734906315058988, + "grad_norm": 1.6356264352798462, + "learning_rate": 3.971561234371324e-06, + "loss": 0.6974, + "step": 9646 + }, + { + "epoch": 1.6736641221374047, + "grad_norm": 0.9202525615692139, + "learning_rate": 3.967487159867245e-06, + "loss": 0.7458, + "step": 9647 + }, + { + "epoch": 1.6738376127689105, + "grad_norm": 1.1964435577392578, + "learning_rate": 3.963414945954962e-06, + "loss": 0.6849, + "step": 9648 + }, + { + "epoch": 1.6740111034004164, + "grad_norm": 0.8092890977859497, + "learning_rate": 3.959344593107057e-06, + "loss": 0.6011, + "step": 9649 + }, + { + "epoch": 1.6741845940319222, + "grad_norm": 0.8431543707847595, + "learning_rate": 3.955276101795908e-06, + "loss": 0.6885, + "step": 9650 + }, + { + "epoch": 1.674358084663428, + "grad_norm": 1.1318917274475098, + "learning_rate": 3.951209472493664e-06, + "loss": 0.8342, + "step": 9651 + }, + { + "epoch": 1.6745315752949341, + "grad_norm": 1.0593349933624268, + "learning_rate": 3.947144705672257e-06, + "loss": 0.741, + "step": 9652 + }, + { + "epoch": 1.67470506592644, + "grad_norm": 0.9511032700538635, + "learning_rate": 3.943081801803421e-06, + "loss": 0.733, + "step": 9653 + }, + { + "epoch": 1.6748785565579458, + "grad_norm": 1.5057730674743652, + "learning_rate": 3.939020761358641e-06, + "loss": 0.5525, + "step": 9654 + }, + { + "epoch": 1.6750520471894519, + "grad_norm": 0.8178658485412598, + "learning_rate": 3.934961584809222e-06, + "loss": 0.6417, + "step": 9655 + }, + { + "epoch": 1.6752255378209577, + "grad_norm": 1.381604552268982, + "learning_rate": 3.930904272626226e-06, + "loss": 0.6848, + "step": 9656 + }, + { + "epoch": 1.6753990284524636, + "grad_norm": 0.8809424042701721, + "learning_rate": 3.926848825280524e-06, + "loss": 0.7213, + "step": 9657 + }, + { + "epoch": 1.6755725190839694, + "grad_norm": 0.7162255644798279, + "learning_rate": 3.922795243242734e-06, + "loss": 0.8589, + "step": 9658 + }, + { + "epoch": 1.6757460097154753, + "grad_norm": 0.6355248689651489, + "learning_rate": 3.918743526983295e-06, + "loss": 0.7981, + "step": 9659 + }, + { + "epoch": 1.6759195003469811, + "grad_norm": 0.9748026728630066, + "learning_rate": 3.914693676972408e-06, + "loss": 0.6848, + "step": 9660 + }, + { + "epoch": 1.6760929909784872, + "grad_norm": 0.826667308807373, + "learning_rate": 3.9106456936800615e-06, + "loss": 0.6803, + "step": 9661 + }, + { + "epoch": 1.676266481609993, + "grad_norm": 1.0473655462265015, + "learning_rate": 3.906599577576027e-06, + "loss": 0.7437, + "step": 9662 + }, + { + "epoch": 1.676439972241499, + "grad_norm": 0.7893468737602234, + "learning_rate": 3.902555329129874e-06, + "loss": 0.7222, + "step": 9663 + }, + { + "epoch": 1.676613462873005, + "grad_norm": 1.6509974002838135, + "learning_rate": 3.898512948810922e-06, + "loss": 0.5588, + "step": 9664 + }, + { + "epoch": 1.6767869535045108, + "grad_norm": 0.9831893444061279, + "learning_rate": 3.894472437088308e-06, + "loss": 0.7874, + "step": 9665 + }, + { + "epoch": 1.6769604441360166, + "grad_norm": 1.0757166147232056, + "learning_rate": 3.890433794430934e-06, + "loss": 0.5258, + "step": 9666 + }, + { + "epoch": 1.6771339347675225, + "grad_norm": 1.0870760679244995, + "learning_rate": 3.886397021307493e-06, + "loss": 0.6915, + "step": 9667 + }, + { + "epoch": 1.6773074253990283, + "grad_norm": 1.0920192003250122, + "learning_rate": 3.882362118186445e-06, + "loss": 0.6975, + "step": 9668 + }, + { + "epoch": 1.6774809160305344, + "grad_norm": 1.4072377681732178, + "learning_rate": 3.878329085536061e-06, + "loss": 0.603, + "step": 9669 + }, + { + "epoch": 1.6776544066620402, + "grad_norm": 1.0822335481643677, + "learning_rate": 3.87429792382437e-06, + "loss": 0.731, + "step": 9670 + }, + { + "epoch": 1.6778278972935463, + "grad_norm": 0.9419888257980347, + "learning_rate": 3.870268633519198e-06, + "loss": 0.561, + "step": 9671 + }, + { + "epoch": 1.6780013879250522, + "grad_norm": 0.8266680836677551, + "learning_rate": 3.86624121508814e-06, + "loss": 0.6964, + "step": 9672 + }, + { + "epoch": 1.678174878556558, + "grad_norm": 0.9966776967048645, + "learning_rate": 3.862215668998592e-06, + "loss": 0.6475, + "step": 9673 + }, + { + "epoch": 1.6783483691880638, + "grad_norm": 1.0693165063858032, + "learning_rate": 3.858191995717722e-06, + "loss": 0.7732, + "step": 9674 + }, + { + "epoch": 1.6785218598195697, + "grad_norm": 0.8669641017913818, + "learning_rate": 3.854170195712479e-06, + "loss": 0.7086, + "step": 9675 + }, + { + "epoch": 1.6786953504510755, + "grad_norm": 1.1276823282241821, + "learning_rate": 3.850150269449597e-06, + "loss": 0.5826, + "step": 9676 + }, + { + "epoch": 1.6788688410825814, + "grad_norm": 2.5128157138824463, + "learning_rate": 3.846132217395593e-06, + "loss": 0.7831, + "step": 9677 + }, + { + "epoch": 1.6790423317140875, + "grad_norm": 0.9161383509635925, + "learning_rate": 3.84211604001677e-06, + "loss": 0.7935, + "step": 9678 + }, + { + "epoch": 1.6792158223455933, + "grad_norm": 0.779302716255188, + "learning_rate": 3.83810173777921e-06, + "loss": 0.7184, + "step": 9679 + }, + { + "epoch": 1.6793893129770994, + "grad_norm": 0.7737762928009033, + "learning_rate": 3.834089311148774e-06, + "loss": 0.7257, + "step": 9680 + }, + { + "epoch": 1.6795628036086052, + "grad_norm": 1.0349769592285156, + "learning_rate": 3.830078760591107e-06, + "loss": 0.7054, + "step": 9681 + }, + { + "epoch": 1.679736294240111, + "grad_norm": 1.2657341957092285, + "learning_rate": 3.826070086571651e-06, + "loss": 0.5583, + "step": 9682 + }, + { + "epoch": 1.679909784871617, + "grad_norm": 2.2871108055114746, + "learning_rate": 3.822063289555597e-06, + "loss": 0.7562, + "step": 9683 + }, + { + "epoch": 1.6800832755031228, + "grad_norm": 1.2227116823196411, + "learning_rate": 3.818058370007956e-06, + "loss": 0.573, + "step": 9684 + }, + { + "epoch": 1.6802567661346286, + "grad_norm": 0.7064873576164246, + "learning_rate": 3.814055328393491e-06, + "loss": 0.5673, + "step": 9685 + }, + { + "epoch": 1.6804302567661347, + "grad_norm": 0.8350714445114136, + "learning_rate": 3.810054165176775e-06, + "loss": 0.72, + "step": 9686 + }, + { + "epoch": 1.6806037473976405, + "grad_norm": 1.0272722244262695, + "learning_rate": 3.8060548808221277e-06, + "loss": 0.6401, + "step": 9687 + }, + { + "epoch": 1.6807772380291466, + "grad_norm": 1.218443512916565, + "learning_rate": 3.802057475793688e-06, + "loss": 0.8223, + "step": 9688 + }, + { + "epoch": 1.6809507286606524, + "grad_norm": 0.9541050791740417, + "learning_rate": 3.7980619505553516e-06, + "loss": 0.657, + "step": 9689 + }, + { + "epoch": 1.6811242192921583, + "grad_norm": 1.3084975481033325, + "learning_rate": 3.794068305570804e-06, + "loss": 0.6602, + "step": 9690 + }, + { + "epoch": 1.6812977099236641, + "grad_norm": 0.9637660980224609, + "learning_rate": 3.790076541303509e-06, + "loss": 0.7405, + "step": 9691 + }, + { + "epoch": 1.68147120055517, + "grad_norm": 2.2211196422576904, + "learning_rate": 3.78608665821673e-06, + "loss": 0.5869, + "step": 9692 + }, + { + "epoch": 1.6816446911866758, + "grad_norm": 2.655383586883545, + "learning_rate": 3.7820986567734787e-06, + "loss": 0.7806, + "step": 9693 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 1.2662744522094727, + "learning_rate": 3.778112537436578e-06, + "loss": 0.5652, + "step": 9694 + }, + { + "epoch": 1.6819916724496877, + "grad_norm": 0.9830438494682312, + "learning_rate": 3.7741283006686204e-06, + "loss": 0.7058, + "step": 9695 + }, + { + "epoch": 1.6821651630811936, + "grad_norm": 1.4416142702102661, + "learning_rate": 3.7701459469319824e-06, + "loss": 0.7198, + "step": 9696 + }, + { + "epoch": 1.6823386537126996, + "grad_norm": 1.5309628248214722, + "learning_rate": 3.7661654766888124e-06, + "loss": 0.5878, + "step": 9697 + }, + { + "epoch": 1.6825121443442055, + "grad_norm": 0.9777270555496216, + "learning_rate": 3.7621868904010585e-06, + "loss": 0.5935, + "step": 9698 + }, + { + "epoch": 1.6826856349757113, + "grad_norm": 0.8420565724372864, + "learning_rate": 3.758210188530438e-06, + "loss": 0.7378, + "step": 9699 + }, + { + "epoch": 1.6828591256072172, + "grad_norm": 1.107589840888977, + "learning_rate": 3.7542353715384462e-06, + "loss": 0.7859, + "step": 9700 + }, + { + "epoch": 1.683032616238723, + "grad_norm": 0.9287567138671875, + "learning_rate": 3.750262439886374e-06, + "loss": 0.6937, + "step": 9701 + }, + { + "epoch": 1.6832061068702289, + "grad_norm": 0.7320445775985718, + "learning_rate": 3.7462913940352797e-06, + "loss": 0.7131, + "step": 9702 + }, + { + "epoch": 1.683379597501735, + "grad_norm": 1.0381250381469727, + "learning_rate": 3.7423222344460096e-06, + "loss": 0.6531, + "step": 9703 + }, + { + "epoch": 1.6835530881332408, + "grad_norm": 0.6408345699310303, + "learning_rate": 3.7383549615791826e-06, + "loss": 0.7754, + "step": 9704 + }, + { + "epoch": 1.6837265787647469, + "grad_norm": 1.0776193141937256, + "learning_rate": 3.734389575895221e-06, + "loss": 0.5197, + "step": 9705 + }, + { + "epoch": 1.6839000693962527, + "grad_norm": 0.8209722638130188, + "learning_rate": 3.7304260778542924e-06, + "loss": 0.7361, + "step": 9706 + }, + { + "epoch": 1.6840735600277585, + "grad_norm": 1.008620262145996, + "learning_rate": 3.726464467916382e-06, + "loss": 0.7391, + "step": 9707 + }, + { + "epoch": 1.6842470506592644, + "grad_norm": 1.9826463460922241, + "learning_rate": 3.722504746541229e-06, + "loss": 0.7025, + "step": 9708 + }, + { + "epoch": 1.6844205412907702, + "grad_norm": 0.8668285012245178, + "learning_rate": 3.71854691418837e-06, + "loss": 0.8611, + "step": 9709 + }, + { + "epoch": 1.684594031922276, + "grad_norm": 1.01505708694458, + "learning_rate": 3.714590971317107e-06, + "loss": 0.6761, + "step": 9710 + }, + { + "epoch": 1.6847675225537821, + "grad_norm": 1.1364691257476807, + "learning_rate": 3.710636918386543e-06, + "loss": 0.6556, + "step": 9711 + }, + { + "epoch": 1.684941013185288, + "grad_norm": 0.8767856955528259, + "learning_rate": 3.706684755855545e-06, + "loss": 0.7432, + "step": 9712 + }, + { + "epoch": 1.6851145038167938, + "grad_norm": 1.0421017408370972, + "learning_rate": 3.7027344841827684e-06, + "loss": 0.5435, + "step": 9713 + }, + { + "epoch": 1.6852879944483, + "grad_norm": 1.6204692125320435, + "learning_rate": 3.698786103826639e-06, + "loss": 0.6139, + "step": 9714 + }, + { + "epoch": 1.6854614850798058, + "grad_norm": 1.1383973360061646, + "learning_rate": 3.694839615245387e-06, + "loss": 0.6251, + "step": 9715 + }, + { + "epoch": 1.6856349757113116, + "grad_norm": 1.1176308393478394, + "learning_rate": 3.690895018896987e-06, + "loss": 0.6003, + "step": 9716 + }, + { + "epoch": 1.6858084663428174, + "grad_norm": 1.0669652223587036, + "learning_rate": 3.6869523152392296e-06, + "loss": 0.691, + "step": 9717 + }, + { + "epoch": 1.6859819569743233, + "grad_norm": 3.3589630126953125, + "learning_rate": 3.6830115047296633e-06, + "loss": 0.7292, + "step": 9718 + }, + { + "epoch": 1.6861554476058291, + "grad_norm": 0.9055686593055725, + "learning_rate": 3.679072587825625e-06, + "loss": 0.6707, + "step": 9719 + }, + { + "epoch": 1.6863289382373352, + "grad_norm": 1.2850862741470337, + "learning_rate": 3.675135564984227e-06, + "loss": 0.705, + "step": 9720 + }, + { + "epoch": 1.686502428868841, + "grad_norm": 1.0283968448638916, + "learning_rate": 3.671200436662372e-06, + "loss": 0.7095, + "step": 9721 + }, + { + "epoch": 1.6866759195003471, + "grad_norm": 1.1323987245559692, + "learning_rate": 3.6672672033167333e-06, + "loss": 0.6566, + "step": 9722 + }, + { + "epoch": 1.686849410131853, + "grad_norm": 2.2638661861419678, + "learning_rate": 3.6633358654037676e-06, + "loss": 0.8484, + "step": 9723 + }, + { + "epoch": 1.6870229007633588, + "grad_norm": 1.1714380979537964, + "learning_rate": 3.6594064233797123e-06, + "loss": 0.7061, + "step": 9724 + }, + { + "epoch": 1.6871963913948647, + "grad_norm": 1.0960177183151245, + "learning_rate": 3.6554788777005758e-06, + "loss": 0.8516, + "step": 9725 + }, + { + "epoch": 1.6873698820263705, + "grad_norm": 1.5228326320648193, + "learning_rate": 3.6515532288221646e-06, + "loss": 0.7458, + "step": 9726 + }, + { + "epoch": 1.6875433726578764, + "grad_norm": 0.884206235408783, + "learning_rate": 3.647629477200052e-06, + "loss": 0.671, + "step": 9727 + }, + { + "epoch": 1.6877168632893824, + "grad_norm": 2.2448136806488037, + "learning_rate": 3.643707623289592e-06, + "loss": 0.6787, + "step": 9728 + }, + { + "epoch": 1.6878903539208883, + "grad_norm": 0.9800110459327698, + "learning_rate": 3.6397876675459175e-06, + "loss": 0.5828, + "step": 9729 + }, + { + "epoch": 1.6880638445523943, + "grad_norm": 1.0726561546325684, + "learning_rate": 3.635869610423952e-06, + "loss": 0.6205, + "step": 9730 + }, + { + "epoch": 1.6882373351839002, + "grad_norm": 1.5589643716812134, + "learning_rate": 3.6319534523783872e-06, + "loss": 0.7704, + "step": 9731 + }, + { + "epoch": 1.688410825815406, + "grad_norm": 1.0034973621368408, + "learning_rate": 3.628039193863695e-06, + "loss": 0.6425, + "step": 9732 + }, + { + "epoch": 1.6885843164469119, + "grad_norm": 1.031617283821106, + "learning_rate": 3.6241268353341276e-06, + "loss": 0.8696, + "step": 9733 + }, + { + "epoch": 1.6887578070784177, + "grad_norm": 0.8206128478050232, + "learning_rate": 3.6202163772437326e-06, + "loss": 0.8541, + "step": 9734 + }, + { + "epoch": 1.6889312977099236, + "grad_norm": 0.9887929558753967, + "learning_rate": 3.616307820046303e-06, + "loss": 0.8501, + "step": 9735 + }, + { + "epoch": 1.6891047883414294, + "grad_norm": 1.9326984882354736, + "learning_rate": 3.6124011641954473e-06, + "loss": 0.7805, + "step": 9736 + }, + { + "epoch": 1.6892782789729355, + "grad_norm": 0.6932715773582458, + "learning_rate": 3.6084964101445307e-06, + "loss": 0.7432, + "step": 9737 + }, + { + "epoch": 1.6894517696044413, + "grad_norm": 1.1159330606460571, + "learning_rate": 3.6045935583467053e-06, + "loss": 0.5472, + "step": 9738 + }, + { + "epoch": 1.6896252602359474, + "grad_norm": 0.9280418157577515, + "learning_rate": 3.6006926092548988e-06, + "loss": 0.7694, + "step": 9739 + }, + { + "epoch": 1.6897987508674532, + "grad_norm": 1.16310453414917, + "learning_rate": 3.5967935633218277e-06, + "loss": 0.6273, + "step": 9740 + }, + { + "epoch": 1.689972241498959, + "grad_norm": 0.9876702427864075, + "learning_rate": 3.5928964209999784e-06, + "loss": 0.5453, + "step": 9741 + }, + { + "epoch": 1.690145732130465, + "grad_norm": 1.0925616025924683, + "learning_rate": 3.589001182741616e-06, + "loss": 0.6199, + "step": 9742 + }, + { + "epoch": 1.6903192227619708, + "grad_norm": 0.75493323802948, + "learning_rate": 3.5851078489987834e-06, + "loss": 0.7908, + "step": 9743 + }, + { + "epoch": 1.6904927133934766, + "grad_norm": 0.8334967494010925, + "learning_rate": 3.5812164202233236e-06, + "loss": 0.7561, + "step": 9744 + }, + { + "epoch": 1.6906662040249827, + "grad_norm": 0.9011484384536743, + "learning_rate": 3.5773268968668194e-06, + "loss": 0.6167, + "step": 9745 + }, + { + "epoch": 1.6908396946564885, + "grad_norm": 0.8248322606086731, + "learning_rate": 3.5734392793806704e-06, + "loss": 0.6882, + "step": 9746 + }, + { + "epoch": 1.6910131852879946, + "grad_norm": 1.016876459121704, + "learning_rate": 3.569553568216033e-06, + "loss": 0.582, + "step": 9747 + }, + { + "epoch": 1.6911866759195004, + "grad_norm": 1.4863795042037964, + "learning_rate": 3.5656697638238447e-06, + "loss": 0.6146, + "step": 9748 + }, + { + "epoch": 1.6913601665510063, + "grad_norm": 1.1102920770645142, + "learning_rate": 3.5617878666548354e-06, + "loss": 0.7476, + "step": 9749 + }, + { + "epoch": 1.6915336571825121, + "grad_norm": 1.1641820669174194, + "learning_rate": 3.5579078771594988e-06, + "loss": 0.5756, + "step": 9750 + }, + { + "epoch": 1.691707147814018, + "grad_norm": 0.9014329314231873, + "learning_rate": 3.55402979578811e-06, + "loss": 0.5065, + "step": 9751 + }, + { + "epoch": 1.6918806384455238, + "grad_norm": 1.0844136476516724, + "learning_rate": 3.550153622990724e-06, + "loss": 0.5189, + "step": 9752 + }, + { + "epoch": 1.6920541290770297, + "grad_norm": 1.1042444705963135, + "learning_rate": 3.546279359217186e-06, + "loss": 0.7607, + "step": 9753 + }, + { + "epoch": 1.6922276197085357, + "grad_norm": 1.1180315017700195, + "learning_rate": 3.542407004917092e-06, + "loss": 0.6733, + "step": 9754 + }, + { + "epoch": 1.6924011103400416, + "grad_norm": 1.040847659111023, + "learning_rate": 3.538536560539847e-06, + "loss": 0.6835, + "step": 9755 + }, + { + "epoch": 1.6925746009715477, + "grad_norm": 0.9323839545249939, + "learning_rate": 3.5346680265346113e-06, + "loss": 0.5609, + "step": 9756 + }, + { + "epoch": 1.6927480916030535, + "grad_norm": 8.60783863067627, + "learning_rate": 3.530801403350346e-06, + "loss": 0.6559, + "step": 9757 + }, + { + "epoch": 1.6929215822345594, + "grad_norm": 1.7898246049880981, + "learning_rate": 3.5269366914357585e-06, + "loss": 0.5671, + "step": 9758 + }, + { + "epoch": 1.6930950728660652, + "grad_norm": 0.7830457091331482, + "learning_rate": 3.523073891239368e-06, + "loss": 0.7515, + "step": 9759 + }, + { + "epoch": 1.693268563497571, + "grad_norm": 0.7543913125991821, + "learning_rate": 3.5192130032094517e-06, + "loss": 0.7101, + "step": 9760 + }, + { + "epoch": 1.693442054129077, + "grad_norm": 1.8772211074829102, + "learning_rate": 3.515354027794069e-06, + "loss": 0.5464, + "step": 9761 + }, + { + "epoch": 1.693615544760583, + "grad_norm": 0.7456567287445068, + "learning_rate": 3.511496965441057e-06, + "loss": 0.8188, + "step": 9762 + }, + { + "epoch": 1.6937890353920888, + "grad_norm": 1.2332134246826172, + "learning_rate": 3.507641816598044e-06, + "loss": 0.7239, + "step": 9763 + }, + { + "epoch": 1.6939625260235949, + "grad_norm": 0.8494266867637634, + "learning_rate": 3.503788581712406e-06, + "loss": 0.6383, + "step": 9764 + }, + { + "epoch": 1.6941360166551007, + "grad_norm": 1.2763233184814453, + "learning_rate": 3.49993726123133e-06, + "loss": 0.6632, + "step": 9765 + }, + { + "epoch": 1.6943095072866066, + "grad_norm": 1.0013275146484375, + "learning_rate": 3.4960878556017597e-06, + "loss": 0.589, + "step": 9766 + }, + { + "epoch": 1.6944829979181124, + "grad_norm": 0.926929771900177, + "learning_rate": 3.492240365270425e-06, + "loss": 0.6835, + "step": 9767 + }, + { + "epoch": 1.6946564885496183, + "grad_norm": 0.8992684483528137, + "learning_rate": 3.488394790683829e-06, + "loss": 0.5341, + "step": 9768 + }, + { + "epoch": 1.694829979181124, + "grad_norm": 1.1735883951187134, + "learning_rate": 3.4845511322882587e-06, + "loss": 0.6957, + "step": 9769 + }, + { + "epoch": 1.6950034698126302, + "grad_norm": 2.465162992477417, + "learning_rate": 3.480709390529777e-06, + "loss": 0.7136, + "step": 9770 + }, + { + "epoch": 1.695176960444136, + "grad_norm": 0.635297417640686, + "learning_rate": 3.476869565854217e-06, + "loss": 0.8853, + "step": 9771 + }, + { + "epoch": 1.6953504510756419, + "grad_norm": 0.8366246223449707, + "learning_rate": 3.473031658707193e-06, + "loss": 0.6483, + "step": 9772 + }, + { + "epoch": 1.695523941707148, + "grad_norm": 1.0991452932357788, + "learning_rate": 3.469195669534109e-06, + "loss": 0.7166, + "step": 9773 + }, + { + "epoch": 1.6956974323386538, + "grad_norm": 1.406754493713379, + "learning_rate": 3.465361598780128e-06, + "loss": 0.5785, + "step": 9774 + }, + { + "epoch": 1.6958709229701596, + "grad_norm": 1.4300154447555542, + "learning_rate": 3.4615294468902017e-06, + "loss": 0.6711, + "step": 9775 + }, + { + "epoch": 1.6960444136016655, + "grad_norm": 1.0769494771957397, + "learning_rate": 3.4576992143090517e-06, + "loss": 0.7289, + "step": 9776 + }, + { + "epoch": 1.6962179042331713, + "grad_norm": 0.9132763147354126, + "learning_rate": 3.45387090148118e-06, + "loss": 0.6793, + "step": 9777 + }, + { + "epoch": 1.6963913948646772, + "grad_norm": 0.927716076374054, + "learning_rate": 3.450044508850876e-06, + "loss": 0.6887, + "step": 9778 + }, + { + "epoch": 1.6965648854961832, + "grad_norm": 0.7266793251037598, + "learning_rate": 3.446220036862191e-06, + "loss": 0.6426, + "step": 9779 + }, + { + "epoch": 1.696738376127689, + "grad_norm": 0.9168142080307007, + "learning_rate": 3.4423974859589594e-06, + "loss": 0.6193, + "step": 9780 + }, + { + "epoch": 1.6969118667591951, + "grad_norm": 0.9461618065834045, + "learning_rate": 3.4385768565847876e-06, + "loss": 0.5928, + "step": 9781 + }, + { + "epoch": 1.697085357390701, + "grad_norm": 1.2285414934158325, + "learning_rate": 3.4347581491830796e-06, + "loss": 0.72, + "step": 9782 + }, + { + "epoch": 1.6972588480222068, + "grad_norm": 0.9479393362998962, + "learning_rate": 3.4309413641969802e-06, + "loss": 0.7122, + "step": 9783 + }, + { + "epoch": 1.6974323386537127, + "grad_norm": 0.8083467483520508, + "learning_rate": 3.427126502069449e-06, + "loss": 0.6732, + "step": 9784 + }, + { + "epoch": 1.6976058292852185, + "grad_norm": 0.955507218837738, + "learning_rate": 3.4233135632431913e-06, + "loss": 0.7101, + "step": 9785 + }, + { + "epoch": 1.6977793199167244, + "grad_norm": 0.7145348191261292, + "learning_rate": 3.4195025481607224e-06, + "loss": 0.6528, + "step": 9786 + }, + { + "epoch": 1.6979528105482304, + "grad_norm": 0.9581376910209656, + "learning_rate": 3.415693457264291e-06, + "loss": 0.6465, + "step": 9787 + }, + { + "epoch": 1.6981263011797363, + "grad_norm": 0.9344385862350464, + "learning_rate": 3.411886290995965e-06, + "loss": 0.6167, + "step": 9788 + }, + { + "epoch": 1.6982997918112424, + "grad_norm": 0.8918091654777527, + "learning_rate": 3.4080810497975626e-06, + "loss": 0.5321, + "step": 9789 + }, + { + "epoch": 1.6984732824427482, + "grad_norm": 1.0492780208587646, + "learning_rate": 3.4042777341106903e-06, + "loss": 0.6898, + "step": 9790 + }, + { + "epoch": 1.698646773074254, + "grad_norm": 0.8839273452758789, + "learning_rate": 3.4004763443767175e-06, + "loss": 0.6599, + "step": 9791 + }, + { + "epoch": 1.69882026370576, + "grad_norm": 1.917194128036499, + "learning_rate": 3.3966768810368132e-06, + "loss": 0.64, + "step": 9792 + }, + { + "epoch": 1.6989937543372657, + "grad_norm": 0.8875200152397156, + "learning_rate": 3.392879344531903e-06, + "loss": 0.7239, + "step": 9793 + }, + { + "epoch": 1.6991672449687716, + "grad_norm": 1.1788406372070312, + "learning_rate": 3.3890837353026964e-06, + "loss": 0.7493, + "step": 9794 + }, + { + "epoch": 1.6993407356002774, + "grad_norm": 0.9344495534896851, + "learning_rate": 3.385290053789676e-06, + "loss": 0.7651, + "step": 9795 + }, + { + "epoch": 1.6995142262317835, + "grad_norm": 0.8415957689285278, + "learning_rate": 3.3814983004331014e-06, + "loss": 0.6079, + "step": 9796 + }, + { + "epoch": 1.6996877168632893, + "grad_norm": 0.7643976807594299, + "learning_rate": 3.3777084756730183e-06, + "loss": 0.7881, + "step": 9797 + }, + { + "epoch": 1.6998612074947954, + "grad_norm": 0.9766060709953308, + "learning_rate": 3.373920579949237e-06, + "loss": 0.7405, + "step": 9798 + }, + { + "epoch": 1.7000346981263013, + "grad_norm": 0.7779763340950012, + "learning_rate": 3.3701346137013435e-06, + "loss": 0.7102, + "step": 9799 + }, + { + "epoch": 1.700208188757807, + "grad_norm": 0.9627187252044678, + "learning_rate": 3.3663505773687023e-06, + "loss": 0.5752, + "step": 9800 + }, + { + "epoch": 1.700381679389313, + "grad_norm": 1.0879006385803223, + "learning_rate": 3.3625684713904617e-06, + "loss": 0.5741, + "step": 9801 + }, + { + "epoch": 1.7005551700208188, + "grad_norm": 0.8979458808898926, + "learning_rate": 3.3587882962055374e-06, + "loss": 0.577, + "step": 9802 + }, + { + "epoch": 1.7007286606523246, + "grad_norm": 0.7315860986709595, + "learning_rate": 3.355010052252623e-06, + "loss": 0.6904, + "step": 9803 + }, + { + "epoch": 1.7009021512838307, + "grad_norm": 0.9108756184577942, + "learning_rate": 3.3512337399701813e-06, + "loss": 0.6753, + "step": 9804 + }, + { + "epoch": 1.7010756419153366, + "grad_norm": 0.9173244833946228, + "learning_rate": 3.3474593597964746e-06, + "loss": 0.7227, + "step": 9805 + }, + { + "epoch": 1.7012491325468426, + "grad_norm": 1.1742217540740967, + "learning_rate": 3.3436869121695013e-06, + "loss": 0.6536, + "step": 9806 + }, + { + "epoch": 1.7014226231783485, + "grad_norm": 1.2052594423294067, + "learning_rate": 3.3399163975270786e-06, + "loss": 0.7844, + "step": 9807 + }, + { + "epoch": 1.7015961138098543, + "grad_norm": 0.8658700585365295, + "learning_rate": 3.3361478163067673e-06, + "loss": 0.6007, + "step": 9808 + }, + { + "epoch": 1.7017696044413602, + "grad_norm": 1.3064475059509277, + "learning_rate": 3.332381168945922e-06, + "loss": 0.6453, + "step": 9809 + }, + { + "epoch": 1.701943095072866, + "grad_norm": 0.8548304438591003, + "learning_rate": 3.328616455881657e-06, + "loss": 0.7861, + "step": 9810 + }, + { + "epoch": 1.7021165857043719, + "grad_norm": 0.747747540473938, + "learning_rate": 3.324853677550888e-06, + "loss": 0.7727, + "step": 9811 + }, + { + "epoch": 1.7022900763358777, + "grad_norm": 0.8915896415710449, + "learning_rate": 3.3210928343902716e-06, + "loss": 0.6188, + "step": 9812 + }, + { + "epoch": 1.7024635669673838, + "grad_norm": 1.0249011516571045, + "learning_rate": 3.31733392683627e-06, + "loss": 0.7096, + "step": 9813 + }, + { + "epoch": 1.7026370575988896, + "grad_norm": 0.760021984577179, + "learning_rate": 3.3135769553251017e-06, + "loss": 0.7373, + "step": 9814 + }, + { + "epoch": 1.7028105482303957, + "grad_norm": 1.0127907991409302, + "learning_rate": 3.3098219202927815e-06, + "loss": 0.7906, + "step": 9815 + }, + { + "epoch": 1.7029840388619015, + "grad_norm": 0.8498591184616089, + "learning_rate": 3.3060688221750637e-06, + "loss": 0.8147, + "step": 9816 + }, + { + "epoch": 1.7031575294934074, + "grad_norm": 0.8355805277824402, + "learning_rate": 3.302317661407519e-06, + "loss": 0.6736, + "step": 9817 + }, + { + "epoch": 1.7033310201249132, + "grad_norm": 0.8350297212600708, + "learning_rate": 3.2985684384254648e-06, + "loss": 0.7418, + "step": 9818 + }, + { + "epoch": 1.703504510756419, + "grad_norm": 0.8131012320518494, + "learning_rate": 3.294821153664003e-06, + "loss": 0.6992, + "step": 9819 + }, + { + "epoch": 1.703678001387925, + "grad_norm": 0.8979710340499878, + "learning_rate": 3.2910758075580085e-06, + "loss": 0.6886, + "step": 9820 + }, + { + "epoch": 1.703851492019431, + "grad_norm": 0.7247355580329895, + "learning_rate": 3.28733240054214e-06, + "loss": 0.5922, + "step": 9821 + }, + { + "epoch": 1.7040249826509368, + "grad_norm": 0.7780425548553467, + "learning_rate": 3.283590933050822e-06, + "loss": 0.7773, + "step": 9822 + }, + { + "epoch": 1.704198473282443, + "grad_norm": 0.8551125526428223, + "learning_rate": 3.2798514055182486e-06, + "loss": 0.7068, + "step": 9823 + }, + { + "epoch": 1.7043719639139487, + "grad_norm": 0.8161957859992981, + "learning_rate": 3.2761138183784126e-06, + "loss": 0.5896, + "step": 9824 + }, + { + "epoch": 1.7045454545454546, + "grad_norm": 0.9298524260520935, + "learning_rate": 3.2723781720650473e-06, + "loss": 0.7588, + "step": 9825 + }, + { + "epoch": 1.7047189451769604, + "grad_norm": 0.8668241500854492, + "learning_rate": 3.2686444670116878e-06, + "loss": 0.631, + "step": 9826 + }, + { + "epoch": 1.7048924358084663, + "grad_norm": 1.1436609029769897, + "learning_rate": 3.2649127036516325e-06, + "loss": 0.7527, + "step": 9827 + }, + { + "epoch": 1.7050659264399721, + "grad_norm": 0.7756710052490234, + "learning_rate": 3.261182882417966e-06, + "loss": 0.5934, + "step": 9828 + }, + { + "epoch": 1.7052394170714782, + "grad_norm": 1.0012167692184448, + "learning_rate": 3.2574550037435214e-06, + "loss": 0.6437, + "step": 9829 + }, + { + "epoch": 1.705412907702984, + "grad_norm": 0.7576411366462708, + "learning_rate": 3.253729068060938e-06, + "loss": 0.8004, + "step": 9830 + }, + { + "epoch": 1.7055863983344899, + "grad_norm": 0.8707934617996216, + "learning_rate": 3.250005075802607e-06, + "loss": 0.657, + "step": 9831 + }, + { + "epoch": 1.705759888965996, + "grad_norm": 0.9112079739570618, + "learning_rate": 3.2462830274007073e-06, + "loss": 0.624, + "step": 9832 + }, + { + "epoch": 1.7059333795975018, + "grad_norm": 0.8787170648574829, + "learning_rate": 3.242562923287178e-06, + "loss": 0.6589, + "step": 9833 + }, + { + "epoch": 1.7061068702290076, + "grad_norm": 0.9503645300865173, + "learning_rate": 3.2388447638937583e-06, + "loss": 0.6458, + "step": 9834 + }, + { + "epoch": 1.7062803608605135, + "grad_norm": 0.9167130589485168, + "learning_rate": 3.2351285496519247e-06, + "loss": 0.6499, + "step": 9835 + }, + { + "epoch": 1.7064538514920193, + "grad_norm": 0.753722071647644, + "learning_rate": 3.2314142809929617e-06, + "loss": 0.7842, + "step": 9836 + }, + { + "epoch": 1.7066273421235252, + "grad_norm": 0.7677097916603088, + "learning_rate": 3.227701958347911e-06, + "loss": 0.6775, + "step": 9837 + }, + { + "epoch": 1.7068008327550312, + "grad_norm": 1.1248959302902222, + "learning_rate": 3.223991582147592e-06, + "loss": 0.5928, + "step": 9838 + }, + { + "epoch": 1.706974323386537, + "grad_norm": 4.03591775894165, + "learning_rate": 3.220283152822592e-06, + "loss": 0.6251, + "step": 9839 + }, + { + "epoch": 1.7071478140180432, + "grad_norm": 0.6416550874710083, + "learning_rate": 3.216576670803291e-06, + "loss": 0.7565, + "step": 9840 + }, + { + "epoch": 1.707321304649549, + "grad_norm": 1.2726489305496216, + "learning_rate": 3.2128721365198223e-06, + "loss": 0.7374, + "step": 9841 + }, + { + "epoch": 1.7074947952810549, + "grad_norm": 0.7458406090736389, + "learning_rate": 3.2091695504021047e-06, + "loss": 0.6354, + "step": 9842 + }, + { + "epoch": 1.7076682859125607, + "grad_norm": 0.9594850540161133, + "learning_rate": 3.205468912879821e-06, + "loss": 0.6642, + "step": 9843 + }, + { + "epoch": 1.7078417765440665, + "grad_norm": 0.8956547379493713, + "learning_rate": 3.2017702243824434e-06, + "loss": 0.856, + "step": 9844 + }, + { + "epoch": 1.7080152671755724, + "grad_norm": 0.9510944485664368, + "learning_rate": 3.198073485339204e-06, + "loss": 0.8098, + "step": 9845 + }, + { + "epoch": 1.7081887578070785, + "grad_norm": 1.0747401714324951, + "learning_rate": 3.1943786961791166e-06, + "loss": 0.6262, + "step": 9846 + }, + { + "epoch": 1.7083622484385843, + "grad_norm": 1.0893385410308838, + "learning_rate": 3.190685857330964e-06, + "loss": 0.6469, + "step": 9847 + }, + { + "epoch": 1.7085357390700904, + "grad_norm": 1.7056220769882202, + "learning_rate": 3.1869949692232982e-06, + "loss": 0.6163, + "step": 9848 + }, + { + "epoch": 1.7087092297015962, + "grad_norm": 1.3976919651031494, + "learning_rate": 3.1833060322844633e-06, + "loss": 0.6758, + "step": 9849 + }, + { + "epoch": 1.708882720333102, + "grad_norm": 0.9355394244194031, + "learning_rate": 3.179619046942557e-06, + "loss": 0.6802, + "step": 9850 + }, + { + "epoch": 1.709056210964608, + "grad_norm": 1.0950841903686523, + "learning_rate": 3.1759340136254614e-06, + "loss": 0.712, + "step": 9851 + }, + { + "epoch": 1.7092297015961138, + "grad_norm": 0.8922670483589172, + "learning_rate": 3.172250932760823e-06, + "loss": 0.6306, + "step": 9852 + }, + { + "epoch": 1.7094031922276196, + "grad_norm": 1.4223984479904175, + "learning_rate": 3.1685698047760806e-06, + "loss": 0.6118, + "step": 9853 + }, + { + "epoch": 1.7095766828591255, + "grad_norm": 0.7050524950027466, + "learning_rate": 3.164890630098416e-06, + "loss": 0.7368, + "step": 9854 + }, + { + "epoch": 1.7097501734906315, + "grad_norm": 0.9262264966964722, + "learning_rate": 3.1612134091548153e-06, + "loss": 0.7458, + "step": 9855 + }, + { + "epoch": 1.7099236641221374, + "grad_norm": 1.2444229125976562, + "learning_rate": 3.1575381423720142e-06, + "loss": 0.6782, + "step": 9856 + }, + { + "epoch": 1.7100971547536434, + "grad_norm": 0.9594573378562927, + "learning_rate": 3.153864830176547e-06, + "loss": 0.5659, + "step": 9857 + }, + { + "epoch": 1.7102706453851493, + "grad_norm": 0.6772716641426086, + "learning_rate": 3.150193472994687e-06, + "loss": 0.8208, + "step": 9858 + }, + { + "epoch": 1.7104441360166551, + "grad_norm": 0.7754959464073181, + "learning_rate": 3.1465240712525124e-06, + "loss": 0.8806, + "step": 9859 + }, + { + "epoch": 1.710617626648161, + "grad_norm": 1.3917646408081055, + "learning_rate": 3.142856625375856e-06, + "loss": 0.7957, + "step": 9860 + }, + { + "epoch": 1.7107911172796668, + "grad_norm": 0.8166086077690125, + "learning_rate": 3.139191135790334e-06, + "loss": 0.7174, + "step": 9861 + }, + { + "epoch": 1.7109646079111727, + "grad_norm": 0.7734074592590332, + "learning_rate": 3.13552760292132e-06, + "loss": 0.7485, + "step": 9862 + }, + { + "epoch": 1.7111380985426787, + "grad_norm": 1.031589150428772, + "learning_rate": 3.131866027193988e-06, + "loss": 0.5901, + "step": 9863 + }, + { + "epoch": 1.7113115891741846, + "grad_norm": 0.7344492077827454, + "learning_rate": 3.1282064090332522e-06, + "loss": 0.6265, + "step": 9864 + }, + { + "epoch": 1.7114850798056906, + "grad_norm": 1.5801644325256348, + "learning_rate": 3.1245487488638247e-06, + "loss": 0.5824, + "step": 9865 + }, + { + "epoch": 1.7116585704371965, + "grad_norm": 0.9651954174041748, + "learning_rate": 3.1208930471101786e-06, + "loss": 0.771, + "step": 9866 + }, + { + "epoch": 1.7118320610687023, + "grad_norm": 1.0716207027435303, + "learning_rate": 3.1172393041965644e-06, + "loss": 0.7196, + "step": 9867 + }, + { + "epoch": 1.7120055517002082, + "grad_norm": 1.0991803407669067, + "learning_rate": 3.1135875205469946e-06, + "loss": 0.6263, + "step": 9868 + }, + { + "epoch": 1.712179042331714, + "grad_norm": 0.8005200624465942, + "learning_rate": 3.1099376965852744e-06, + "loss": 0.6276, + "step": 9869 + }, + { + "epoch": 1.7123525329632199, + "grad_norm": 0.9192963242530823, + "learning_rate": 3.1062898327349656e-06, + "loss": 0.6094, + "step": 9870 + }, + { + "epoch": 1.7125260235947257, + "grad_norm": 0.7023060321807861, + "learning_rate": 3.102643929419402e-06, + "loss": 0.8132, + "step": 9871 + }, + { + "epoch": 1.7126995142262318, + "grad_norm": 0.8230135440826416, + "learning_rate": 3.098999987061706e-06, + "loss": 0.6639, + "step": 9872 + }, + { + "epoch": 1.7128730048577376, + "grad_norm": 1.9305390119552612, + "learning_rate": 3.0953580060847545e-06, + "loss": 0.6653, + "step": 9873 + }, + { + "epoch": 1.7130464954892437, + "grad_norm": 0.9894404411315918, + "learning_rate": 3.0917179869112023e-06, + "loss": 0.6067, + "step": 9874 + }, + { + "epoch": 1.7132199861207495, + "grad_norm": 0.9195743799209595, + "learning_rate": 3.0880799299634767e-06, + "loss": 0.5771, + "step": 9875 + }, + { + "epoch": 1.7133934767522554, + "grad_norm": 0.9832761287689209, + "learning_rate": 3.084443835663791e-06, + "loss": 0.6185, + "step": 9876 + }, + { + "epoch": 1.7135669673837612, + "grad_norm": 0.9893336892127991, + "learning_rate": 3.080809704434098e-06, + "loss": 0.7009, + "step": 9877 + }, + { + "epoch": 1.713740458015267, + "grad_norm": 1.0709089040756226, + "learning_rate": 3.077177536696159e-06, + "loss": 0.7141, + "step": 9878 + }, + { + "epoch": 1.713913948646773, + "grad_norm": 0.8880771398544312, + "learning_rate": 3.0735473328714873e-06, + "loss": 0.6569, + "step": 9879 + }, + { + "epoch": 1.714087439278279, + "grad_norm": 1.0504014492034912, + "learning_rate": 3.0699190933813683e-06, + "loss": 0.7603, + "step": 9880 + }, + { + "epoch": 1.7142609299097848, + "grad_norm": 1.0975165367126465, + "learning_rate": 3.066292818646863e-06, + "loss": 0.7761, + "step": 9881 + }, + { + "epoch": 1.714434420541291, + "grad_norm": 0.904719889163971, + "learning_rate": 3.0626685090888177e-06, + "loss": 0.6558, + "step": 9882 + }, + { + "epoch": 1.7146079111727968, + "grad_norm": 0.9330397844314575, + "learning_rate": 3.0590461651278168e-06, + "loss": 0.7633, + "step": 9883 + }, + { + "epoch": 1.7147814018043026, + "grad_norm": 1.0141874551773071, + "learning_rate": 3.0554257871842543e-06, + "loss": 0.7977, + "step": 9884 + }, + { + "epoch": 1.7149548924358085, + "grad_norm": 0.753924548625946, + "learning_rate": 3.0518073756782683e-06, + "loss": 0.7393, + "step": 9885 + }, + { + "epoch": 1.7151283830673143, + "grad_norm": 0.9242173433303833, + "learning_rate": 3.0481909310297954e-06, + "loss": 0.5846, + "step": 9886 + }, + { + "epoch": 1.7153018736988201, + "grad_norm": 1.5568872690200806, + "learning_rate": 3.0445764536585076e-06, + "loss": 0.692, + "step": 9887 + }, + { + "epoch": 1.7154753643303262, + "grad_norm": 1.1766241788864136, + "learning_rate": 3.0409639439838833e-06, + "loss": 0.6107, + "step": 9888 + }, + { + "epoch": 1.715648854961832, + "grad_norm": 1.1410478353500366, + "learning_rate": 3.037353402425154e-06, + "loss": 0.6533, + "step": 9889 + }, + { + "epoch": 1.715822345593338, + "grad_norm": 0.835392415523529, + "learning_rate": 3.0337448294013307e-06, + "loss": 0.7208, + "step": 9890 + }, + { + "epoch": 1.715995836224844, + "grad_norm": 2.240907907485962, + "learning_rate": 3.0301382253311828e-06, + "loss": 0.7408, + "step": 9891 + }, + { + "epoch": 1.7161693268563498, + "grad_norm": 0.9020956158638, + "learning_rate": 3.0265335906332717e-06, + "loss": 0.7858, + "step": 9892 + }, + { + "epoch": 1.7163428174878557, + "grad_norm": 1.1710702180862427, + "learning_rate": 3.0229309257259154e-06, + "loss": 0.8137, + "step": 9893 + }, + { + "epoch": 1.7165163081193615, + "grad_norm": 0.9422942996025085, + "learning_rate": 3.019330231027209e-06, + "loss": 0.5464, + "step": 9894 + }, + { + "epoch": 1.7166897987508674, + "grad_norm": 1.0151525735855103, + "learning_rate": 3.015731506955015e-06, + "loss": 0.6804, + "step": 9895 + }, + { + "epoch": 1.7168632893823732, + "grad_norm": 1.3157821893692017, + "learning_rate": 3.012134753926965e-06, + "loss": 0.6075, + "step": 9896 + }, + { + "epoch": 1.7170367800138793, + "grad_norm": 0.9829920530319214, + "learning_rate": 3.008539972360478e-06, + "loss": 0.6864, + "step": 9897 + }, + { + "epoch": 1.7172102706453851, + "grad_norm": 1.2606054544448853, + "learning_rate": 3.0049471626727246e-06, + "loss": 0.6981, + "step": 9898 + }, + { + "epoch": 1.7173837612768912, + "grad_norm": 0.8008937835693359, + "learning_rate": 3.0013563252806576e-06, + "loss": 0.564, + "step": 9899 + }, + { + "epoch": 1.717557251908397, + "grad_norm": 3.3121867179870605, + "learning_rate": 2.997767460600991e-06, + "loss": 0.613, + "step": 9900 + }, + { + "epoch": 1.7177307425399029, + "grad_norm": 1.1032781600952148, + "learning_rate": 2.9941805690502246e-06, + "loss": 0.5566, + "step": 9901 + }, + { + "epoch": 1.7179042331714087, + "grad_norm": 1.2246575355529785, + "learning_rate": 2.990595651044621e-06, + "loss": 0.6122, + "step": 9902 + }, + { + "epoch": 1.7180777238029146, + "grad_norm": 0.8586134910583496, + "learning_rate": 2.9870127070002117e-06, + "loss": 0.6772, + "step": 9903 + }, + { + "epoch": 1.7182512144344204, + "grad_norm": 1.325268268585205, + "learning_rate": 2.9834317373327983e-06, + "loss": 0.6392, + "step": 9904 + }, + { + "epoch": 1.7184247050659265, + "grad_norm": 0.8427374958992004, + "learning_rate": 2.979852742457967e-06, + "loss": 0.6244, + "step": 9905 + }, + { + "epoch": 1.7185981956974323, + "grad_norm": 0.9688510298728943, + "learning_rate": 2.976275722791051e-06, + "loss": 0.5978, + "step": 9906 + }, + { + "epoch": 1.7187716863289384, + "grad_norm": 0.8559555411338806, + "learning_rate": 2.972700678747176e-06, + "loss": 0.731, + "step": 9907 + }, + { + "epoch": 1.7189451769604442, + "grad_norm": 1.1343542337417603, + "learning_rate": 2.9691276107412293e-06, + "loss": 0.772, + "step": 9908 + }, + { + "epoch": 1.71911866759195, + "grad_norm": 2.6265170574188232, + "learning_rate": 2.9655565191878668e-06, + "loss": 0.6404, + "step": 9909 + }, + { + "epoch": 1.719292158223456, + "grad_norm": 1.0210590362548828, + "learning_rate": 2.961987404501516e-06, + "loss": 0.7231, + "step": 9910 + }, + { + "epoch": 1.7194656488549618, + "grad_norm": 1.053719401359558, + "learning_rate": 2.9584202670963892e-06, + "loss": 0.5789, + "step": 9911 + }, + { + "epoch": 1.7196391394864676, + "grad_norm": 0.9533113241195679, + "learning_rate": 2.9548551073864386e-06, + "loss": 0.5768, + "step": 9912 + }, + { + "epoch": 1.7198126301179735, + "grad_norm": 1.5410656929016113, + "learning_rate": 2.95129192578542e-06, + "loss": 0.6432, + "step": 9913 + }, + { + "epoch": 1.7199861207494795, + "grad_norm": 1.1888794898986816, + "learning_rate": 2.947730722706832e-06, + "loss": 0.63, + "step": 9914 + }, + { + "epoch": 1.7201596113809854, + "grad_norm": 0.7506579160690308, + "learning_rate": 2.9441714985639747e-06, + "loss": 0.7456, + "step": 9915 + }, + { + "epoch": 1.7203331020124915, + "grad_norm": 0.898382842540741, + "learning_rate": 2.94061425376988e-06, + "loss": 0.6757, + "step": 9916 + }, + { + "epoch": 1.7205065926439973, + "grad_norm": 0.8660153150558472, + "learning_rate": 2.9370589887373825e-06, + "loss": 0.6116, + "step": 9917 + }, + { + "epoch": 1.7206800832755031, + "grad_norm": 0.7487264275550842, + "learning_rate": 2.9335057038790715e-06, + "loss": 0.7612, + "step": 9918 + }, + { + "epoch": 1.720853573907009, + "grad_norm": 1.3350178003311157, + "learning_rate": 2.9299543996073067e-06, + "loss": 0.5801, + "step": 9919 + }, + { + "epoch": 1.7210270645385148, + "grad_norm": 0.8501729369163513, + "learning_rate": 2.9264050763342267e-06, + "loss": 0.7451, + "step": 9920 + }, + { + "epoch": 1.7212005551700207, + "grad_norm": 0.7354087829589844, + "learning_rate": 2.9228577344717357e-06, + "loss": 0.8096, + "step": 9921 + }, + { + "epoch": 1.7213740458015268, + "grad_norm": 0.8210126757621765, + "learning_rate": 2.9193123744315e-06, + "loss": 0.8462, + "step": 9922 + }, + { + "epoch": 1.7215475364330326, + "grad_norm": 1.0893129110336304, + "learning_rate": 2.9157689966249636e-06, + "loss": 0.6575, + "step": 9923 + }, + { + "epoch": 1.7217210270645387, + "grad_norm": 0.9614791870117188, + "learning_rate": 2.912227601463351e-06, + "loss": 0.7687, + "step": 9924 + }, + { + "epoch": 1.7218945176960445, + "grad_norm": 1.890268325805664, + "learning_rate": 2.9086881893576267e-06, + "loss": 0.989, + "step": 9925 + }, + { + "epoch": 1.7220680083275504, + "grad_norm": 1.2026811838150024, + "learning_rate": 2.9051507607185603e-06, + "loss": 0.7629, + "step": 9926 + }, + { + "epoch": 1.7222414989590562, + "grad_norm": 0.9516860246658325, + "learning_rate": 2.9016153159566607e-06, + "loss": 0.7444, + "step": 9927 + }, + { + "epoch": 1.722414989590562, + "grad_norm": 0.9887000322341919, + "learning_rate": 2.8980818554822376e-06, + "loss": 0.5701, + "step": 9928 + }, + { + "epoch": 1.722588480222068, + "grad_norm": 1.266406536102295, + "learning_rate": 2.894550379705332e-06, + "loss": 0.542, + "step": 9929 + }, + { + "epoch": 1.7227619708535737, + "grad_norm": 2.0468716621398926, + "learning_rate": 2.8910208890357916e-06, + "loss": 0.6833, + "step": 9930 + }, + { + "epoch": 1.7229354614850798, + "grad_norm": 0.8736338019371033, + "learning_rate": 2.8874933838832154e-06, + "loss": 0.5924, + "step": 9931 + }, + { + "epoch": 1.7231089521165857, + "grad_norm": 0.8404735326766968, + "learning_rate": 2.883967864656969e-06, + "loss": 0.6182, + "step": 9932 + }, + { + "epoch": 1.7232824427480917, + "grad_norm": 1.005383014678955, + "learning_rate": 2.8804443317661925e-06, + "loss": 0.6171, + "step": 9933 + }, + { + "epoch": 1.7234559333795976, + "grad_norm": 0.847539484500885, + "learning_rate": 2.876922785619809e-06, + "loss": 0.6777, + "step": 9934 + }, + { + "epoch": 1.7236294240111034, + "grad_norm": 0.7576962113380432, + "learning_rate": 2.873403226626479e-06, + "loss": 0.7273, + "step": 9935 + }, + { + "epoch": 1.7238029146426093, + "grad_norm": 0.8390761017799377, + "learning_rate": 2.8698856551946664e-06, + "loss": 0.8096, + "step": 9936 + }, + { + "epoch": 1.723976405274115, + "grad_norm": 0.7022005319595337, + "learning_rate": 2.866370071732585e-06, + "loss": 0.77, + "step": 9937 + }, + { + "epoch": 1.724149895905621, + "grad_norm": 0.7310892939567566, + "learning_rate": 2.8628564766482193e-06, + "loss": 0.8013, + "step": 9938 + }, + { + "epoch": 1.724323386537127, + "grad_norm": 1.1707170009613037, + "learning_rate": 2.859344870349323e-06, + "loss": 0.7583, + "step": 9939 + }, + { + "epoch": 1.7244968771686329, + "grad_norm": 0.7527031898498535, + "learning_rate": 2.855835253243433e-06, + "loss": 0.7842, + "step": 9940 + }, + { + "epoch": 1.724670367800139, + "grad_norm": 0.8111911416053772, + "learning_rate": 2.8523276257378406e-06, + "loss": 0.6978, + "step": 9941 + }, + { + "epoch": 1.7248438584316448, + "grad_norm": 0.854202389717102, + "learning_rate": 2.848821988239605e-06, + "loss": 0.5902, + "step": 9942 + }, + { + "epoch": 1.7250173490631506, + "grad_norm": 1.204153060913086, + "learning_rate": 2.8453183411555606e-06, + "loss": 0.694, + "step": 9943 + }, + { + "epoch": 1.7251908396946565, + "grad_norm": 0.9228923320770264, + "learning_rate": 2.8418166848923158e-06, + "loss": 0.7046, + "step": 9944 + }, + { + "epoch": 1.7253643303261623, + "grad_norm": 0.8860577344894409, + "learning_rate": 2.838317019856238e-06, + "loss": 0.6743, + "step": 9945 + }, + { + "epoch": 1.7255378209576682, + "grad_norm": 0.9005757570266724, + "learning_rate": 2.834819346453468e-06, + "loss": 0.772, + "step": 9946 + }, + { + "epoch": 1.7257113115891742, + "grad_norm": 0.7927912473678589, + "learning_rate": 2.8313236650899135e-06, + "loss": 0.7092, + "step": 9947 + }, + { + "epoch": 1.72588480222068, + "grad_norm": 0.9475684762001038, + "learning_rate": 2.827829976171248e-06, + "loss": 0.6677, + "step": 9948 + }, + { + "epoch": 1.726058292852186, + "grad_norm": 0.9259396195411682, + "learning_rate": 2.8243382801029295e-06, + "loss": 0.7932, + "step": 9949 + }, + { + "epoch": 1.726231783483692, + "grad_norm": 1.0511393547058105, + "learning_rate": 2.820848577290165e-06, + "loss": 0.6202, + "step": 9950 + }, + { + "epoch": 1.7264052741151978, + "grad_norm": 1.026798129081726, + "learning_rate": 2.8173608681379417e-06, + "loss": 0.7113, + "step": 9951 + }, + { + "epoch": 1.7265787647467037, + "grad_norm": 0.7263729572296143, + "learning_rate": 2.8138751530510065e-06, + "loss": 0.6128, + "step": 9952 + }, + { + "epoch": 1.7267522553782095, + "grad_norm": 0.819282054901123, + "learning_rate": 2.8103914324338965e-06, + "loss": 0.618, + "step": 9953 + }, + { + "epoch": 1.7269257460097154, + "grad_norm": 1.1852976083755493, + "learning_rate": 2.806909706690881e-06, + "loss": 0.6577, + "step": 9954 + }, + { + "epoch": 1.7270992366412212, + "grad_norm": 0.8604832291603088, + "learning_rate": 2.8034299762260308e-06, + "loss": 0.7112, + "step": 9955 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 3.2764854431152344, + "learning_rate": 2.799952241443167e-06, + "loss": 0.6299, + "step": 9956 + }, + { + "epoch": 1.7274462179042331, + "grad_norm": 1.027565598487854, + "learning_rate": 2.796476502745895e-06, + "loss": 0.7402, + "step": 9957 + }, + { + "epoch": 1.7276197085357392, + "grad_norm": 2.3662450313568115, + "learning_rate": 2.7930027605375644e-06, + "loss": 0.563, + "step": 9958 + }, + { + "epoch": 1.727793199167245, + "grad_norm": 1.036678671836853, + "learning_rate": 2.7895310152213163e-06, + "loss": 0.7682, + "step": 9959 + }, + { + "epoch": 1.727966689798751, + "grad_norm": 0.8401703834533691, + "learning_rate": 2.7860612672000485e-06, + "loss": 0.806, + "step": 9960 + }, + { + "epoch": 1.7281401804302567, + "grad_norm": 0.8110975027084351, + "learning_rate": 2.7825935168764284e-06, + "loss": 0.6555, + "step": 9961 + }, + { + "epoch": 1.7283136710617626, + "grad_norm": 0.6722231507301331, + "learning_rate": 2.7791277646528893e-06, + "loss": 0.8232, + "step": 9962 + }, + { + "epoch": 1.7284871616932684, + "grad_norm": 0.9310319423675537, + "learning_rate": 2.7756640109316423e-06, + "loss": 0.6472, + "step": 9963 + }, + { + "epoch": 1.7286606523247745, + "grad_norm": 1.0021885633468628, + "learning_rate": 2.77220225611466e-06, + "loss": 0.5956, + "step": 9964 + }, + { + "epoch": 1.7288341429562804, + "grad_norm": 0.7026858329772949, + "learning_rate": 2.768742500603678e-06, + "loss": 0.8625, + "step": 9965 + }, + { + "epoch": 1.7290076335877864, + "grad_norm": 1.0795484781265259, + "learning_rate": 2.7652847448002074e-06, + "loss": 0.6439, + "step": 9966 + }, + { + "epoch": 1.7291811242192923, + "grad_norm": 1.720005989074707, + "learning_rate": 2.7618289891055217e-06, + "loss": 0.6406, + "step": 9967 + }, + { + "epoch": 1.729354614850798, + "grad_norm": 1.006536602973938, + "learning_rate": 2.7583752339206714e-06, + "loss": 0.6062, + "step": 9968 + }, + { + "epoch": 1.729528105482304, + "grad_norm": 3.938178539276123, + "learning_rate": 2.754923479646465e-06, + "loss": 0.6741, + "step": 9969 + }, + { + "epoch": 1.7297015961138098, + "grad_norm": 0.7338905930519104, + "learning_rate": 2.7514737266834845e-06, + "loss": 0.6865, + "step": 9970 + }, + { + "epoch": 1.7298750867453156, + "grad_norm": 1.0654323101043701, + "learning_rate": 2.7480259754320716e-06, + "loss": 0.7214, + "step": 9971 + }, + { + "epoch": 1.7300485773768215, + "grad_norm": 1.1092673540115356, + "learning_rate": 2.7445802262923505e-06, + "loss": 0.5902, + "step": 9972 + }, + { + "epoch": 1.7302220680083276, + "grad_norm": 0.8141344785690308, + "learning_rate": 2.7411364796642015e-06, + "loss": 0.7239, + "step": 9973 + }, + { + "epoch": 1.7303955586398334, + "grad_norm": 0.8767555356025696, + "learning_rate": 2.737694735947276e-06, + "loss": 0.6053, + "step": 9974 + }, + { + "epoch": 1.7305690492713395, + "grad_norm": 0.8391363024711609, + "learning_rate": 2.7342549955409836e-06, + "loss": 0.6331, + "step": 9975 + }, + { + "epoch": 1.7307425399028453, + "grad_norm": 2.123013734817505, + "learning_rate": 2.730817258844529e-06, + "loss": 0.7308, + "step": 9976 + }, + { + "epoch": 1.7309160305343512, + "grad_norm": 1.7324693202972412, + "learning_rate": 2.727381526256845e-06, + "loss": 0.6486, + "step": 9977 + }, + { + "epoch": 1.731089521165857, + "grad_norm": 0.9148504137992859, + "learning_rate": 2.723947798176665e-06, + "loss": 0.7122, + "step": 9978 + }, + { + "epoch": 1.7312630117973629, + "grad_norm": 1.2622168064117432, + "learning_rate": 2.720516075002473e-06, + "loss": 0.7157, + "step": 9979 + }, + { + "epoch": 1.7314365024288687, + "grad_norm": 0.7301722764968872, + "learning_rate": 2.7170863571325257e-06, + "loss": 0.7175, + "step": 9980 + }, + { + "epoch": 1.7316099930603748, + "grad_norm": 0.6781266331672668, + "learning_rate": 2.7136586449648407e-06, + "loss": 0.6482, + "step": 9981 + }, + { + "epoch": 1.7317834836918806, + "grad_norm": 0.6745726466178894, + "learning_rate": 2.7102329388972215e-06, + "loss": 0.7408, + "step": 9982 + }, + { + "epoch": 1.7319569743233867, + "grad_norm": 0.8330579400062561, + "learning_rate": 2.7068092393272082e-06, + "loss": 0.6815, + "step": 9983 + }, + { + "epoch": 1.7321304649548925, + "grad_norm": 0.9944073557853699, + "learning_rate": 2.7033875466521363e-06, + "loss": 0.6155, + "step": 9984 + }, + { + "epoch": 1.7323039555863984, + "grad_norm": 0.9268627166748047, + "learning_rate": 2.6999678612690907e-06, + "loss": 0.6818, + "step": 9985 + }, + { + "epoch": 1.7324774462179042, + "grad_norm": 0.984941303730011, + "learning_rate": 2.696550183574942e-06, + "loss": 0.5906, + "step": 9986 + }, + { + "epoch": 1.73265093684941, + "grad_norm": 1.2188655138015747, + "learning_rate": 2.6931345139663e-06, + "loss": 0.6399, + "step": 9987 + }, + { + "epoch": 1.732824427480916, + "grad_norm": 0.9666289687156677, + "learning_rate": 2.6897208528395656e-06, + "loss": 0.5793, + "step": 9988 + }, + { + "epoch": 1.7329979181124218, + "grad_norm": 0.6753105521202087, + "learning_rate": 2.6863092005908973e-06, + "loss": 0.6836, + "step": 9989 + }, + { + "epoch": 1.7331714087439278, + "grad_norm": 0.9076718091964722, + "learning_rate": 2.682899557616223e-06, + "loss": 0.7837, + "step": 9990 + }, + { + "epoch": 1.7333448993754337, + "grad_norm": 0.9917073249816895, + "learning_rate": 2.679491924311226e-06, + "loss": 0.7129, + "step": 9991 + }, + { + "epoch": 1.7335183900069397, + "grad_norm": 1.2670749425888062, + "learning_rate": 2.676086301071381e-06, + "loss": 0.6265, + "step": 9992 + }, + { + "epoch": 1.7336918806384456, + "grad_norm": 0.8740940093994141, + "learning_rate": 2.6726826882919055e-06, + "loss": 0.5981, + "step": 9993 + }, + { + "epoch": 1.7338653712699514, + "grad_norm": 1.1810362339019775, + "learning_rate": 2.66928108636779e-06, + "loss": 0.6406, + "step": 9994 + }, + { + "epoch": 1.7340388619014573, + "grad_norm": 1.203736424446106, + "learning_rate": 2.6658814956938073e-06, + "loss": 0.6515, + "step": 9995 + }, + { + "epoch": 1.7342123525329631, + "grad_norm": 0.8297080993652344, + "learning_rate": 2.662483916664467e-06, + "loss": 0.6716, + "step": 9996 + }, + { + "epoch": 1.734385843164469, + "grad_norm": 0.8515571355819702, + "learning_rate": 2.6590883496740727e-06, + "loss": 0.7761, + "step": 9997 + }, + { + "epoch": 1.734559333795975, + "grad_norm": 0.7537152767181396, + "learning_rate": 2.6556947951166836e-06, + "loss": 0.9268, + "step": 9998 + }, + { + "epoch": 1.734732824427481, + "grad_norm": 1.1215684413909912, + "learning_rate": 2.6523032533861236e-06, + "loss": 0.701, + "step": 9999 + }, + { + "epoch": 1.734906315058987, + "grad_norm": 1.8708440065383911, + "learning_rate": 2.648913724875981e-06, + "loss": 0.5916, + "step": 10000 + }, + { + "epoch": 1.7350798056904928, + "grad_norm": 1.4966100454330444, + "learning_rate": 2.6455262099796233e-06, + "loss": 0.62, + "step": 10001 + }, + { + "epoch": 1.7352532963219987, + "grad_norm": 0.8049783110618591, + "learning_rate": 2.6421407090901707e-06, + "loss": 0.7059, + "step": 10002 + }, + { + "epoch": 1.7354267869535045, + "grad_norm": 1.0651133060455322, + "learning_rate": 2.6387572226005143e-06, + "loss": 0.6848, + "step": 10003 + }, + { + "epoch": 1.7356002775850103, + "grad_norm": 0.9141005873680115, + "learning_rate": 2.635375750903306e-06, + "loss": 0.6467, + "step": 10004 + }, + { + "epoch": 1.7357737682165162, + "grad_norm": 0.8565404415130615, + "learning_rate": 2.631996294390986e-06, + "loss": 0.6718, + "step": 10005 + }, + { + "epoch": 1.7359472588480223, + "grad_norm": 0.7290942668914795, + "learning_rate": 2.628618853455727e-06, + "loss": 0.6781, + "step": 10006 + }, + { + "epoch": 1.736120749479528, + "grad_norm": 1.3213671445846558, + "learning_rate": 2.625243428489492e-06, + "loss": 0.5763, + "step": 10007 + }, + { + "epoch": 1.7362942401110342, + "grad_norm": 0.9209448099136353, + "learning_rate": 2.621870019884005e-06, + "loss": 0.644, + "step": 10008 + }, + { + "epoch": 1.73646773074254, + "grad_norm": 0.7260803580284119, + "learning_rate": 2.6184986280307525e-06, + "loss": 0.8083, + "step": 10009 + }, + { + "epoch": 1.7366412213740459, + "grad_norm": 1.8522229194641113, + "learning_rate": 2.6151292533209826e-06, + "loss": 0.5807, + "step": 10010 + }, + { + "epoch": 1.7368147120055517, + "grad_norm": 0.8864854574203491, + "learning_rate": 2.6117618961457235e-06, + "loss": 0.5649, + "step": 10011 + }, + { + "epoch": 1.7369882026370576, + "grad_norm": 0.8865764141082764, + "learning_rate": 2.6083965568957603e-06, + "loss": 0.5544, + "step": 10012 + }, + { + "epoch": 1.7371616932685634, + "grad_norm": 0.6846692562103271, + "learning_rate": 2.6050332359616403e-06, + "loss": 0.6832, + "step": 10013 + }, + { + "epoch": 1.7373351839000692, + "grad_norm": 0.9279193878173828, + "learning_rate": 2.601671933733678e-06, + "loss": 0.5861, + "step": 10014 + }, + { + "epoch": 1.7375086745315753, + "grad_norm": 1.1136049032211304, + "learning_rate": 2.598312650601964e-06, + "loss": 0.6128, + "step": 10015 + }, + { + "epoch": 1.7376821651630812, + "grad_norm": 0.8539490699768066, + "learning_rate": 2.594955386956346e-06, + "loss": 0.6036, + "step": 10016 + }, + { + "epoch": 1.7378556557945872, + "grad_norm": 1.0316108465194702, + "learning_rate": 2.5916001431864346e-06, + "loss": 0.637, + "step": 10017 + }, + { + "epoch": 1.738029146426093, + "grad_norm": 1.778124451637268, + "learning_rate": 2.588246919681614e-06, + "loss": 0.7671, + "step": 10018 + }, + { + "epoch": 1.738202637057599, + "grad_norm": 0.8391453623771667, + "learning_rate": 2.5848957168310195e-06, + "loss": 0.6959, + "step": 10019 + }, + { + "epoch": 1.7383761276891048, + "grad_norm": 0.8868352174758911, + "learning_rate": 2.5815465350235756e-06, + "loss": 0.7786, + "step": 10020 + }, + { + "epoch": 1.7385496183206106, + "grad_norm": 1.0937113761901855, + "learning_rate": 2.5781993746479537e-06, + "loss": 0.6406, + "step": 10021 + }, + { + "epoch": 1.7387231089521165, + "grad_norm": 1.1111633777618408, + "learning_rate": 2.5748542360925944e-06, + "loss": 0.6215, + "step": 10022 + }, + { + "epoch": 1.7388965995836225, + "grad_norm": 0.758586049079895, + "learning_rate": 2.5715111197457e-06, + "loss": 0.7325, + "step": 10023 + }, + { + "epoch": 1.7390700902151284, + "grad_norm": 0.919439435005188, + "learning_rate": 2.568170025995258e-06, + "loss": 0.6273, + "step": 10024 + }, + { + "epoch": 1.7392435808466344, + "grad_norm": 0.7109060287475586, + "learning_rate": 2.5648309552289875e-06, + "loss": 0.8004, + "step": 10025 + }, + { + "epoch": 1.7394170714781403, + "grad_norm": 0.9039729833602905, + "learning_rate": 2.561493907834405e-06, + "loss": 0.6616, + "step": 10026 + }, + { + "epoch": 1.7395905621096461, + "grad_norm": 0.9394204020500183, + "learning_rate": 2.5581588841987693e-06, + "loss": 0.5883, + "step": 10027 + }, + { + "epoch": 1.739764052741152, + "grad_norm": 0.7675293684005737, + "learning_rate": 2.5548258847091266e-06, + "loss": 0.6572, + "step": 10028 + }, + { + "epoch": 1.7399375433726578, + "grad_norm": 0.8112495541572571, + "learning_rate": 2.551494909752261e-06, + "loss": 0.6346, + "step": 10029 + }, + { + "epoch": 1.7401110340041637, + "grad_norm": 0.7785564064979553, + "learning_rate": 2.548165959714748e-06, + "loss": 0.7491, + "step": 10030 + }, + { + "epoch": 1.7402845246356695, + "grad_norm": 0.8864448070526123, + "learning_rate": 2.544839034982909e-06, + "loss": 0.5762, + "step": 10031 + }, + { + "epoch": 1.7404580152671756, + "grad_norm": 0.8243436217308044, + "learning_rate": 2.54151413594284e-06, + "loss": 0.6476, + "step": 10032 + }, + { + "epoch": 1.7406315058986814, + "grad_norm": 0.803466260433197, + "learning_rate": 2.538191262980394e-06, + "loss": 0.7795, + "step": 10033 + }, + { + "epoch": 1.7408049965301875, + "grad_norm": 1.1588993072509766, + "learning_rate": 2.534870416481208e-06, + "loss": 0.6901, + "step": 10034 + }, + { + "epoch": 1.7409784871616933, + "grad_norm": 0.8145420551300049, + "learning_rate": 2.5315515968306503e-06, + "loss": 0.7677, + "step": 10035 + }, + { + "epoch": 1.7411519777931992, + "grad_norm": 0.8939618468284607, + "learning_rate": 2.5282348044138915e-06, + "loss": 0.7111, + "step": 10036 + }, + { + "epoch": 1.741325468424705, + "grad_norm": 0.8025327324867249, + "learning_rate": 2.5249200396158414e-06, + "loss": 0.6835, + "step": 10037 + }, + { + "epoch": 1.7414989590562109, + "grad_norm": 1.1101155281066895, + "learning_rate": 2.521607302821183e-06, + "loss": 0.8267, + "step": 10038 + }, + { + "epoch": 1.7416724496877167, + "grad_norm": 0.6615278720855713, + "learning_rate": 2.51829659441436e-06, + "loss": 0.8311, + "step": 10039 + }, + { + "epoch": 1.7418459403192228, + "grad_norm": 0.8874963521957397, + "learning_rate": 2.514987914779592e-06, + "loss": 0.6998, + "step": 10040 + }, + { + "epoch": 1.7420194309507286, + "grad_norm": 2.5481045246124268, + "learning_rate": 2.5116812643008494e-06, + "loss": 0.6556, + "step": 10041 + }, + { + "epoch": 1.7421929215822347, + "grad_norm": 1.2444828748703003, + "learning_rate": 2.5083766433618695e-06, + "loss": 0.7277, + "step": 10042 + }, + { + "epoch": 1.7423664122137406, + "grad_norm": 1.1740219593048096, + "learning_rate": 2.5050740523461682e-06, + "loss": 0.7474, + "step": 10043 + }, + { + "epoch": 1.7425399028452464, + "grad_norm": 0.7892354726791382, + "learning_rate": 2.5017734916370073e-06, + "loss": 0.6743, + "step": 10044 + }, + { + "epoch": 1.7427133934767522, + "grad_norm": 0.8138338327407837, + "learning_rate": 2.498474961617421e-06, + "loss": 0.8254, + "step": 10045 + }, + { + "epoch": 1.742886884108258, + "grad_norm": 0.9641185998916626, + "learning_rate": 2.495178462670207e-06, + "loss": 0.6685, + "step": 10046 + }, + { + "epoch": 1.743060374739764, + "grad_norm": 0.66000896692276, + "learning_rate": 2.4918839951779374e-06, + "loss": 0.7793, + "step": 10047 + }, + { + "epoch": 1.7432338653712698, + "grad_norm": 0.8115548491477966, + "learning_rate": 2.4885915595229215e-06, + "loss": 0.649, + "step": 10048 + }, + { + "epoch": 1.7434073560027759, + "grad_norm": 1.0994879007339478, + "learning_rate": 2.4853011560872653e-06, + "loss": 0.6492, + "step": 10049 + }, + { + "epoch": 1.7435808466342817, + "grad_norm": 0.945711076259613, + "learning_rate": 2.4820127852528163e-06, + "loss": 0.6091, + "step": 10050 + }, + { + "epoch": 1.7437543372657878, + "grad_norm": 1.039608359336853, + "learning_rate": 2.4787264474011984e-06, + "loss": 0.5619, + "step": 10051 + }, + { + "epoch": 1.7439278278972936, + "grad_norm": 0.8751401305198669, + "learning_rate": 2.4754421429137887e-06, + "loss": 0.7878, + "step": 10052 + }, + { + "epoch": 1.7441013185287995, + "grad_norm": 1.784661054611206, + "learning_rate": 2.4721598721717465e-06, + "loss": 0.6768, + "step": 10053 + }, + { + "epoch": 1.7442748091603053, + "grad_norm": 0.7760918736457825, + "learning_rate": 2.468879635555965e-06, + "loss": 0.7231, + "step": 10054 + }, + { + "epoch": 1.7444482997918112, + "grad_norm": 1.009694218635559, + "learning_rate": 2.4656014334471357e-06, + "loss": 0.6704, + "step": 10055 + }, + { + "epoch": 1.744621790423317, + "grad_norm": 0.9278287291526794, + "learning_rate": 2.462325266225687e-06, + "loss": 0.6056, + "step": 10056 + }, + { + "epoch": 1.744795281054823, + "grad_norm": 1.0481798648834229, + "learning_rate": 2.4590511342718348e-06, + "loss": 0.6936, + "step": 10057 + }, + { + "epoch": 1.744968771686329, + "grad_norm": 0.7728239893913269, + "learning_rate": 2.455779037965529e-06, + "loss": 0.8513, + "step": 10058 + }, + { + "epoch": 1.745142262317835, + "grad_norm": 0.807590126991272, + "learning_rate": 2.452508977686514e-06, + "loss": 0.7395, + "step": 10059 + }, + { + "epoch": 1.7453157529493408, + "grad_norm": 0.9785268902778625, + "learning_rate": 2.4492409538142803e-06, + "loss": 0.7183, + "step": 10060 + }, + { + "epoch": 1.7454892435808467, + "grad_norm": 0.7123043537139893, + "learning_rate": 2.445974966728082e-06, + "loss": 0.7706, + "step": 10061 + }, + { + "epoch": 1.7456627342123525, + "grad_norm": 0.9071764349937439, + "learning_rate": 2.44271101680694e-06, + "loss": 0.6499, + "step": 10062 + }, + { + "epoch": 1.7458362248438584, + "grad_norm": 0.9755744338035583, + "learning_rate": 2.4394491044296474e-06, + "loss": 0.6802, + "step": 10063 + }, + { + "epoch": 1.7460097154753642, + "grad_norm": 1.0857245922088623, + "learning_rate": 2.436189229974748e-06, + "loss": 0.7134, + "step": 10064 + }, + { + "epoch": 1.7461832061068703, + "grad_norm": 1.3389973640441895, + "learning_rate": 2.4329313938205546e-06, + "loss": 0.6477, + "step": 10065 + }, + { + "epoch": 1.7463566967383761, + "grad_norm": 2.403944969177246, + "learning_rate": 2.4296755963451424e-06, + "loss": 0.5452, + "step": 10066 + }, + { + "epoch": 1.7465301873698822, + "grad_norm": 1.9238064289093018, + "learning_rate": 2.426421837926345e-06, + "loss": 0.6653, + "step": 10067 + }, + { + "epoch": 1.746703678001388, + "grad_norm": 0.976687490940094, + "learning_rate": 2.423170118941778e-06, + "loss": 0.5559, + "step": 10068 + }, + { + "epoch": 1.7468771686328939, + "grad_norm": 1.0631587505340576, + "learning_rate": 2.4199204397687968e-06, + "loss": 0.8378, + "step": 10069 + }, + { + "epoch": 1.7470506592643997, + "grad_norm": 1.5392581224441528, + "learning_rate": 2.4166728007845364e-06, + "loss": 0.8298, + "step": 10070 + }, + { + "epoch": 1.7472241498959056, + "grad_norm": 0.8457003831863403, + "learning_rate": 2.413427202365879e-06, + "loss": 0.8066, + "step": 10071 + }, + { + "epoch": 1.7473976405274114, + "grad_norm": 0.739046037197113, + "learning_rate": 2.4101836448894924e-06, + "loss": 0.825, + "step": 10072 + }, + { + "epoch": 1.7475711311589173, + "grad_norm": 0.951927125453949, + "learning_rate": 2.406942128731791e-06, + "loss": 0.5774, + "step": 10073 + }, + { + "epoch": 1.7477446217904233, + "grad_norm": 0.8650696277618408, + "learning_rate": 2.4037026542689555e-06, + "loss": 0.6688, + "step": 10074 + }, + { + "epoch": 1.7479181124219292, + "grad_norm": 1.844819188117981, + "learning_rate": 2.4004652218769244e-06, + "loss": 0.7229, + "step": 10075 + }, + { + "epoch": 1.7480916030534353, + "grad_norm": 0.9074751734733582, + "learning_rate": 2.3972298319314224e-06, + "loss": 0.6342, + "step": 10076 + }, + { + "epoch": 1.748265093684941, + "grad_norm": 1.0470564365386963, + "learning_rate": 2.393996484807901e-06, + "loss": 0.5819, + "step": 10077 + }, + { + "epoch": 1.748438584316447, + "grad_norm": 1.9862645864486694, + "learning_rate": 2.3907651808816067e-06, + "loss": 0.6281, + "step": 10078 + }, + { + "epoch": 1.7486120749479528, + "grad_norm": 0.875230610370636, + "learning_rate": 2.3875359205275307e-06, + "loss": 0.7119, + "step": 10079 + }, + { + "epoch": 1.7487855655794586, + "grad_norm": 0.8090860247612, + "learning_rate": 2.384308704120435e-06, + "loss": 0.7383, + "step": 10080 + }, + { + "epoch": 1.7489590562109645, + "grad_norm": 0.8229871392250061, + "learning_rate": 2.381083532034836e-06, + "loss": 0.6523, + "step": 10081 + }, + { + "epoch": 1.7491325468424705, + "grad_norm": 1.7116403579711914, + "learning_rate": 2.3778604046450313e-06, + "loss": 0.5737, + "step": 10082 + }, + { + "epoch": 1.7493060374739764, + "grad_norm": 0.8902257680892944, + "learning_rate": 2.374639322325052e-06, + "loss": 0.7759, + "step": 10083 + }, + { + "epoch": 1.7494795281054825, + "grad_norm": 0.7618767023086548, + "learning_rate": 2.371420285448722e-06, + "loss": 0.7207, + "step": 10084 + }, + { + "epoch": 1.7496530187369883, + "grad_norm": 1.0132050514221191, + "learning_rate": 2.368203294389606e-06, + "loss": 0.7157, + "step": 10085 + }, + { + "epoch": 1.7498265093684942, + "grad_norm": 1.929311990737915, + "learning_rate": 2.364988349521049e-06, + "loss": 0.7363, + "step": 10086 + }, + { + "epoch": 1.75, + "grad_norm": 1.3024741411209106, + "learning_rate": 2.3617754512161353e-06, + "loss": 0.7656, + "step": 10087 + }, + { + "epoch": 1.7501734906315058, + "grad_norm": 1.3915730714797974, + "learning_rate": 2.358564599847737e-06, + "loss": 0.6711, + "step": 10088 + }, + { + "epoch": 1.7503469812630117, + "grad_norm": 2.051024913787842, + "learning_rate": 2.3553557957884744e-06, + "loss": 0.7185, + "step": 10089 + }, + { + "epoch": 1.7505204718945175, + "grad_norm": 1.035677433013916, + "learning_rate": 2.352149039410727e-06, + "loss": 0.707, + "step": 10090 + }, + { + "epoch": 1.7506939625260236, + "grad_norm": 0.844759464263916, + "learning_rate": 2.348944331086651e-06, + "loss": 0.8152, + "step": 10091 + }, + { + "epoch": 1.7508674531575295, + "grad_norm": 1.0118043422698975, + "learning_rate": 2.345741671188153e-06, + "loss": 0.6836, + "step": 10092 + }, + { + "epoch": 1.7510409437890355, + "grad_norm": 1.0157314538955688, + "learning_rate": 2.342541060086907e-06, + "loss": 0.7723, + "step": 10093 + }, + { + "epoch": 1.7512144344205414, + "grad_norm": 1.0959937572479248, + "learning_rate": 2.33934249815434e-06, + "loss": 0.7822, + "step": 10094 + }, + { + "epoch": 1.7513879250520472, + "grad_norm": 1.2039154767990112, + "learning_rate": 2.336145985761664e-06, + "loss": 0.6279, + "step": 10095 + }, + { + "epoch": 1.751561415683553, + "grad_norm": 1.0510423183441162, + "learning_rate": 2.3329515232798207e-06, + "loss": 0.5988, + "step": 10096 + }, + { + "epoch": 1.751734906315059, + "grad_norm": 0.7698186635971069, + "learning_rate": 2.3297591110795437e-06, + "loss": 0.7188, + "step": 10097 + }, + { + "epoch": 1.7519083969465647, + "grad_norm": 1.466143012046814, + "learning_rate": 2.3265687495313106e-06, + "loss": 0.6067, + "step": 10098 + }, + { + "epoch": 1.7520818875780708, + "grad_norm": 1.1487051248550415, + "learning_rate": 2.323380439005367e-06, + "loss": 0.7689, + "step": 10099 + }, + { + "epoch": 1.7522553782095767, + "grad_norm": 1.056879997253418, + "learning_rate": 2.3201941798717176e-06, + "loss": 0.6919, + "step": 10100 + }, + { + "epoch": 1.7524288688410827, + "grad_norm": 0.8055669665336609, + "learning_rate": 2.3170099725001393e-06, + "loss": 0.738, + "step": 10101 + }, + { + "epoch": 1.7526023594725886, + "grad_norm": 1.1570075750350952, + "learning_rate": 2.313827817260159e-06, + "loss": 0.6833, + "step": 10102 + }, + { + "epoch": 1.7527758501040944, + "grad_norm": 1.247590184211731, + "learning_rate": 2.310647714521068e-06, + "loss": 0.7363, + "step": 10103 + }, + { + "epoch": 1.7529493407356003, + "grad_norm": 0.8471783995628357, + "learning_rate": 2.307469664651918e-06, + "loss": 0.6354, + "step": 10104 + }, + { + "epoch": 1.7531228313671061, + "grad_norm": 1.4561901092529297, + "learning_rate": 2.304293668021538e-06, + "loss": 0.6411, + "step": 10105 + }, + { + "epoch": 1.753296321998612, + "grad_norm": 0.9610464572906494, + "learning_rate": 2.3011197249984886e-06, + "loss": 0.6808, + "step": 10106 + }, + { + "epoch": 1.7534698126301178, + "grad_norm": 1.2187659740447998, + "learning_rate": 2.2979478359511244e-06, + "loss": 0.686, + "step": 10107 + }, + { + "epoch": 1.7536433032616239, + "grad_norm": 0.6572092175483704, + "learning_rate": 2.2947780012475396e-06, + "loss": 0.7993, + "step": 10108 + }, + { + "epoch": 1.7538167938931297, + "grad_norm": 0.748252809047699, + "learning_rate": 2.291610221255598e-06, + "loss": 0.7294, + "step": 10109 + }, + { + "epoch": 1.7539902845246358, + "grad_norm": 0.7410050630569458, + "learning_rate": 2.2884444963429188e-06, + "loss": 0.8065, + "step": 10110 + }, + { + "epoch": 1.7541637751561416, + "grad_norm": 1.0766429901123047, + "learning_rate": 2.285280826876901e-06, + "loss": 0.6742, + "step": 10111 + }, + { + "epoch": 1.7543372657876475, + "grad_norm": 1.103111743927002, + "learning_rate": 2.282119213224683e-06, + "loss": 0.812, + "step": 10112 + }, + { + "epoch": 1.7545107564191533, + "grad_norm": 2.1235623359680176, + "learning_rate": 2.2789596557531766e-06, + "loss": 0.6383, + "step": 10113 + }, + { + "epoch": 1.7546842470506592, + "grad_norm": 0.8916382193565369, + "learning_rate": 2.2758021548290478e-06, + "loss": 0.5751, + "step": 10114 + }, + { + "epoch": 1.754857737682165, + "grad_norm": 0.9292827844619751, + "learning_rate": 2.2726467108187335e-06, + "loss": 0.5696, + "step": 10115 + }, + { + "epoch": 1.755031228313671, + "grad_norm": 1.0329029560089111, + "learning_rate": 2.2694933240884277e-06, + "loss": 0.5944, + "step": 10116 + }, + { + "epoch": 1.755204718945177, + "grad_norm": 0.7868707776069641, + "learning_rate": 2.26634199500408e-06, + "loss": 0.6425, + "step": 10117 + }, + { + "epoch": 1.755378209576683, + "grad_norm": 1.0913541316986084, + "learning_rate": 2.263192723931409e-06, + "loss": 0.7249, + "step": 10118 + }, + { + "epoch": 1.7555517002081888, + "grad_norm": 1.560375690460205, + "learning_rate": 2.2600455112358843e-06, + "loss": 0.6691, + "step": 10119 + }, + { + "epoch": 1.7557251908396947, + "grad_norm": 0.875105619430542, + "learning_rate": 2.2569003572827543e-06, + "loss": 0.6295, + "step": 10120 + }, + { + "epoch": 1.7558986814712005, + "grad_norm": 0.8231083750724792, + "learning_rate": 2.2537572624370107e-06, + "loss": 0.6547, + "step": 10121 + }, + { + "epoch": 1.7560721721027064, + "grad_norm": 0.9825895428657532, + "learning_rate": 2.250616227063418e-06, + "loss": 0.7112, + "step": 10122 + }, + { + "epoch": 1.7562456627342122, + "grad_norm": 1.7779111862182617, + "learning_rate": 2.247477251526489e-06, + "loss": 0.5535, + "step": 10123 + }, + { + "epoch": 1.7564191533657183, + "grad_norm": 0.9441196918487549, + "learning_rate": 2.244340336190518e-06, + "loss": 0.7402, + "step": 10124 + }, + { + "epoch": 1.7565926439972241, + "grad_norm": 0.8721840977668762, + "learning_rate": 2.2412054814195326e-06, + "loss": 0.8503, + "step": 10125 + }, + { + "epoch": 1.7567661346287302, + "grad_norm": 1.6151373386383057, + "learning_rate": 2.2380726875773507e-06, + "loss": 0.6262, + "step": 10126 + }, + { + "epoch": 1.756939625260236, + "grad_norm": 0.7322667241096497, + "learning_rate": 2.2349419550275275e-06, + "loss": 0.6752, + "step": 10127 + }, + { + "epoch": 1.757113115891742, + "grad_norm": 0.8871001601219177, + "learning_rate": 2.2318132841333906e-06, + "loss": 0.8829, + "step": 10128 + }, + { + "epoch": 1.7572866065232478, + "grad_norm": 0.9849100708961487, + "learning_rate": 2.228686675258025e-06, + "loss": 0.6868, + "step": 10129 + }, + { + "epoch": 1.7574600971547536, + "grad_norm": 0.7656390070915222, + "learning_rate": 2.2255621287642805e-06, + "loss": 0.6193, + "step": 10130 + }, + { + "epoch": 1.7576335877862594, + "grad_norm": 1.09891939163208, + "learning_rate": 2.2224396450147623e-06, + "loss": 0.6724, + "step": 10131 + }, + { + "epoch": 1.7578070784177653, + "grad_norm": 5.602595329284668, + "learning_rate": 2.2193192243718385e-06, + "loss": 0.7201, + "step": 10132 + }, + { + "epoch": 1.7579805690492714, + "grad_norm": 0.7473915219306946, + "learning_rate": 2.216200867197633e-06, + "loss": 0.8169, + "step": 10133 + }, + { + "epoch": 1.7581540596807772, + "grad_norm": 3.8220787048339844, + "learning_rate": 2.2130845738540475e-06, + "loss": 0.7043, + "step": 10134 + }, + { + "epoch": 1.7583275503122833, + "grad_norm": 0.8837077617645264, + "learning_rate": 2.2099703447027142e-06, + "loss": 0.7036, + "step": 10135 + }, + { + "epoch": 1.7585010409437891, + "grad_norm": 0.8299365043640137, + "learning_rate": 2.2068581801050557e-06, + "loss": 0.5602, + "step": 10136 + }, + { + "epoch": 1.758674531575295, + "grad_norm": 0.8114748597145081, + "learning_rate": 2.203748080422239e-06, + "loss": 0.7094, + "step": 10137 + }, + { + "epoch": 1.7588480222068008, + "grad_norm": 0.8125333786010742, + "learning_rate": 2.2006400460151923e-06, + "loss": 0.6724, + "step": 10138 + }, + { + "epoch": 1.7590215128383067, + "grad_norm": 2.6065940856933594, + "learning_rate": 2.1975340772446095e-06, + "loss": 0.7257, + "step": 10139 + }, + { + "epoch": 1.7591950034698125, + "grad_norm": 0.8492751121520996, + "learning_rate": 2.1944301744709428e-06, + "loss": 0.645, + "step": 10140 + }, + { + "epoch": 1.7593684941013186, + "grad_norm": 1.1139445304870605, + "learning_rate": 2.1913283380544013e-06, + "loss": 0.5922, + "step": 10141 + }, + { + "epoch": 1.7595419847328244, + "grad_norm": 1.050893783569336, + "learning_rate": 2.1882285683549555e-06, + "loss": 0.5563, + "step": 10142 + }, + { + "epoch": 1.7597154753643305, + "grad_norm": 1.0920135974884033, + "learning_rate": 2.185130865732341e-06, + "loss": 0.5986, + "step": 10143 + }, + { + "epoch": 1.7598889659958363, + "grad_norm": 0.92472243309021, + "learning_rate": 2.1820352305460492e-06, + "loss": 0.5687, + "step": 10144 + }, + { + "epoch": 1.7600624566273422, + "grad_norm": 0.9259141683578491, + "learning_rate": 2.1789416631553294e-06, + "loss": 0.5763, + "step": 10145 + }, + { + "epoch": 1.760235947258848, + "grad_norm": 1.1057170629501343, + "learning_rate": 2.1758501639191908e-06, + "loss": 0.5815, + "step": 10146 + }, + { + "epoch": 1.7604094378903539, + "grad_norm": 1.1872144937515259, + "learning_rate": 2.1727607331964197e-06, + "loss": 0.6479, + "step": 10147 + }, + { + "epoch": 1.7605829285218597, + "grad_norm": 0.9312294125556946, + "learning_rate": 2.169673371345531e-06, + "loss": 0.6793, + "step": 10148 + }, + { + "epoch": 1.7607564191533656, + "grad_norm": 0.7282106876373291, + "learning_rate": 2.166588078724827e-06, + "loss": 0.6102, + "step": 10149 + }, + { + "epoch": 1.7609299097848716, + "grad_norm": 1.0911482572555542, + "learning_rate": 2.1635048556923555e-06, + "loss": 0.6567, + "step": 10150 + }, + { + "epoch": 1.7611034004163775, + "grad_norm": 1.1570355892181396, + "learning_rate": 2.1604237026059296e-06, + "loss": 0.6025, + "step": 10151 + }, + { + "epoch": 1.7612768910478835, + "grad_norm": 0.8215580582618713, + "learning_rate": 2.1573446198231185e-06, + "loss": 0.7885, + "step": 10152 + }, + { + "epoch": 1.7614503816793894, + "grad_norm": 0.7510644793510437, + "learning_rate": 2.154267607701259e-06, + "loss": 0.7512, + "step": 10153 + }, + { + "epoch": 1.7616238723108952, + "grad_norm": 0.8680253028869629, + "learning_rate": 2.1511926665974324e-06, + "loss": 0.62, + "step": 10154 + }, + { + "epoch": 1.761797362942401, + "grad_norm": 1.02955961227417, + "learning_rate": 2.1481197968684998e-06, + "loss": 0.6095, + "step": 10155 + }, + { + "epoch": 1.761970853573907, + "grad_norm": 0.8987807035446167, + "learning_rate": 2.1450489988710644e-06, + "loss": 0.6255, + "step": 10156 + }, + { + "epoch": 1.7621443442054128, + "grad_norm": 0.8493519425392151, + "learning_rate": 2.1419802729614993e-06, + "loss": 0.5851, + "step": 10157 + }, + { + "epoch": 1.7623178348369188, + "grad_norm": 0.9304413795471191, + "learning_rate": 2.138913619495928e-06, + "loss": 0.6772, + "step": 10158 + }, + { + "epoch": 1.7624913254684247, + "grad_norm": 0.8239910006523132, + "learning_rate": 2.1358490388302466e-06, + "loss": 0.75, + "step": 10159 + }, + { + "epoch": 1.7626648160999308, + "grad_norm": 1.0170409679412842, + "learning_rate": 2.1327865313201015e-06, + "loss": 0.7625, + "step": 10160 + }, + { + "epoch": 1.7628383067314366, + "grad_norm": 0.9234309792518616, + "learning_rate": 2.1297260973208987e-06, + "loss": 0.7098, + "step": 10161 + }, + { + "epoch": 1.7630117973629424, + "grad_norm": 0.7453468441963196, + "learning_rate": 2.1266677371877996e-06, + "loss": 0.6656, + "step": 10162 + }, + { + "epoch": 1.7631852879944483, + "grad_norm": 0.8940064907073975, + "learning_rate": 2.1236114512757423e-06, + "loss": 0.7493, + "step": 10163 + }, + { + "epoch": 1.7633587786259541, + "grad_norm": 1.5311079025268555, + "learning_rate": 2.120557239939405e-06, + "loss": 0.6836, + "step": 10164 + }, + { + "epoch": 1.76353226925746, + "grad_norm": 0.8410761952400208, + "learning_rate": 2.1175051035332285e-06, + "loss": 0.759, + "step": 10165 + }, + { + "epoch": 1.7637057598889658, + "grad_norm": 0.7131624817848206, + "learning_rate": 2.114455042411432e-06, + "loss": 0.8372, + "step": 10166 + }, + { + "epoch": 1.763879250520472, + "grad_norm": 0.8571066856384277, + "learning_rate": 2.111407056927959e-06, + "loss": 0.6692, + "step": 10167 + }, + { + "epoch": 1.7640527411519777, + "grad_norm": 0.8227852582931519, + "learning_rate": 2.108361147436546e-06, + "loss": 0.5503, + "step": 10168 + }, + { + "epoch": 1.7642262317834838, + "grad_norm": 0.8280117511749268, + "learning_rate": 2.105317314290671e-06, + "loss": 0.6316, + "step": 10169 + }, + { + "epoch": 1.7643997224149897, + "grad_norm": 0.7863892316818237, + "learning_rate": 2.1022755578435715e-06, + "loss": 0.6962, + "step": 10170 + }, + { + "epoch": 1.7645732130464955, + "grad_norm": 0.7971781492233276, + "learning_rate": 2.099235878448247e-06, + "loss": 0.7238, + "step": 10171 + }, + { + "epoch": 1.7647467036780013, + "grad_norm": 0.8319457173347473, + "learning_rate": 2.0961982764574597e-06, + "loss": 0.6987, + "step": 10172 + }, + { + "epoch": 1.7649201943095072, + "grad_norm": 1.2112164497375488, + "learning_rate": 2.093162752223723e-06, + "loss": 0.7666, + "step": 10173 + }, + { + "epoch": 1.765093684941013, + "grad_norm": 0.8126006722450256, + "learning_rate": 2.0901293060993154e-06, + "loss": 0.6221, + "step": 10174 + }, + { + "epoch": 1.765267175572519, + "grad_norm": 1.0374011993408203, + "learning_rate": 2.087097938436269e-06, + "loss": 0.592, + "step": 10175 + }, + { + "epoch": 1.765440666204025, + "grad_norm": 0.8511900305747986, + "learning_rate": 2.0840686495863837e-06, + "loss": 0.7588, + "step": 10176 + }, + { + "epoch": 1.765614156835531, + "grad_norm": 1.1334911584854126, + "learning_rate": 2.0810414399012034e-06, + "loss": 0.7717, + "step": 10177 + }, + { + "epoch": 1.7657876474670369, + "grad_norm": 0.9748876094818115, + "learning_rate": 2.078016309732047e-06, + "loss": 0.7285, + "step": 10178 + }, + { + "epoch": 1.7659611380985427, + "grad_norm": 2.7077372074127197, + "learning_rate": 2.0749932594299804e-06, + "loss": 0.7231, + "step": 10179 + }, + { + "epoch": 1.7661346287300486, + "grad_norm": 0.8038955330848694, + "learning_rate": 2.0719722893458317e-06, + "loss": 0.6372, + "step": 10180 + }, + { + "epoch": 1.7663081193615544, + "grad_norm": 0.776221513748169, + "learning_rate": 2.0689533998301868e-06, + "loss": 0.6962, + "step": 10181 + }, + { + "epoch": 1.7664816099930603, + "grad_norm": 1.04355788230896, + "learning_rate": 2.0659365912333972e-06, + "loss": 0.6383, + "step": 10182 + }, + { + "epoch": 1.7666551006245663, + "grad_norm": 1.0569730997085571, + "learning_rate": 2.0629218639055625e-06, + "loss": 0.7054, + "step": 10183 + }, + { + "epoch": 1.7668285912560722, + "grad_norm": 1.1978683471679688, + "learning_rate": 2.0599092181965474e-06, + "loss": 0.6801, + "step": 10184 + }, + { + "epoch": 1.7670020818875782, + "grad_norm": 1.410083293914795, + "learning_rate": 2.05689865445597e-06, + "loss": 0.6349, + "step": 10185 + }, + { + "epoch": 1.767175572519084, + "grad_norm": 2.739997148513794, + "learning_rate": 2.0538901730332128e-06, + "loss": 0.7288, + "step": 10186 + }, + { + "epoch": 1.76734906315059, + "grad_norm": 0.6860550045967102, + "learning_rate": 2.0508837742774125e-06, + "loss": 0.5773, + "step": 10187 + }, + { + "epoch": 1.7675225537820958, + "grad_norm": 0.8310195207595825, + "learning_rate": 2.047879458537465e-06, + "loss": 0.6599, + "step": 10188 + }, + { + "epoch": 1.7676960444136016, + "grad_norm": 1.0886787176132202, + "learning_rate": 2.0448772261620254e-06, + "loss": 0.5984, + "step": 10189 + }, + { + "epoch": 1.7678695350451075, + "grad_norm": 1.165220856666565, + "learning_rate": 2.0418770774995034e-06, + "loss": 0.5848, + "step": 10190 + }, + { + "epoch": 1.7680430256766133, + "grad_norm": 1.2029523849487305, + "learning_rate": 2.038879012898074e-06, + "loss": 0.6443, + "step": 10191 + }, + { + "epoch": 1.7682165163081194, + "grad_norm": 0.8949983716011047, + "learning_rate": 2.0358830327056633e-06, + "loss": 0.6826, + "step": 10192 + }, + { + "epoch": 1.7683900069396252, + "grad_norm": 1.1299594640731812, + "learning_rate": 2.03288913726996e-06, + "loss": 0.8027, + "step": 10193 + }, + { + "epoch": 1.7685634975711313, + "grad_norm": 1.0621434450149536, + "learning_rate": 2.0298973269384037e-06, + "loss": 0.5776, + "step": 10194 + }, + { + "epoch": 1.7687369882026371, + "grad_norm": 0.7805692553520203, + "learning_rate": 2.02690760205821e-06, + "loss": 0.7896, + "step": 10195 + }, + { + "epoch": 1.768910478834143, + "grad_norm": 0.9448472261428833, + "learning_rate": 2.023919962976324e-06, + "loss": 0.5831, + "step": 10196 + }, + { + "epoch": 1.7690839694656488, + "grad_norm": 0.8353790044784546, + "learning_rate": 2.020934410039477e-06, + "loss": 0.6555, + "step": 10197 + }, + { + "epoch": 1.7692574600971547, + "grad_norm": 0.9083075523376465, + "learning_rate": 2.0179509435941403e-06, + "loss": 0.6953, + "step": 10198 + }, + { + "epoch": 1.7694309507286605, + "grad_norm": 0.9630043506622314, + "learning_rate": 2.0149695639865507e-06, + "loss": 0.563, + "step": 10199 + }, + { + "epoch": 1.7696044413601666, + "grad_norm": 0.9786599278450012, + "learning_rate": 2.011990271562696e-06, + "loss": 0.5964, + "step": 10200 + }, + { + "epoch": 1.7697779319916724, + "grad_norm": 0.9990949630737305, + "learning_rate": 2.0090130666683347e-06, + "loss": 0.578, + "step": 10201 + }, + { + "epoch": 1.7699514226231785, + "grad_norm": 1.099523663520813, + "learning_rate": 2.006037949648971e-06, + "loss": 0.6273, + "step": 10202 + }, + { + "epoch": 1.7701249132546844, + "grad_norm": 1.128749132156372, + "learning_rate": 2.0030649208498685e-06, + "loss": 0.7983, + "step": 10203 + }, + { + "epoch": 1.7702984038861902, + "grad_norm": 1.0770974159240723, + "learning_rate": 2.000093980616051e-06, + "loss": 0.8218, + "step": 10204 + }, + { + "epoch": 1.770471894517696, + "grad_norm": 1.0681637525558472, + "learning_rate": 1.9971251292923076e-06, + "loss": 0.6516, + "step": 10205 + }, + { + "epoch": 1.770645385149202, + "grad_norm": 1.0759773254394531, + "learning_rate": 1.9941583672231624e-06, + "loss": 0.7071, + "step": 10206 + }, + { + "epoch": 1.7708188757807077, + "grad_norm": 0.7766039967536926, + "learning_rate": 1.991193694752924e-06, + "loss": 0.7692, + "step": 10207 + }, + { + "epoch": 1.7709923664122136, + "grad_norm": 0.7336828708648682, + "learning_rate": 1.9882311122256425e-06, + "loss": 0.7563, + "step": 10208 + }, + { + "epoch": 1.7711658570437196, + "grad_norm": 0.7387003302574158, + "learning_rate": 1.985270619985127e-06, + "loss": 0.7615, + "step": 10209 + }, + { + "epoch": 1.7713393476752255, + "grad_norm": 1.3266499042510986, + "learning_rate": 1.9823122183749443e-06, + "loss": 0.6151, + "step": 10210 + }, + { + "epoch": 1.7715128383067316, + "grad_norm": 1.3403393030166626, + "learning_rate": 1.979355907738427e-06, + "loss": 0.6499, + "step": 10211 + }, + { + "epoch": 1.7716863289382374, + "grad_norm": 1.4350847005844116, + "learning_rate": 1.9764016884186545e-06, + "loss": 0.6602, + "step": 10212 + }, + { + "epoch": 1.7718598195697433, + "grad_norm": 0.9719976186752319, + "learning_rate": 1.973449560758465e-06, + "loss": 0.6864, + "step": 10213 + }, + { + "epoch": 1.772033310201249, + "grad_norm": 0.801980197429657, + "learning_rate": 1.9704995251004622e-06, + "loss": 0.7776, + "step": 10214 + }, + { + "epoch": 1.772206800832755, + "grad_norm": 1.114893913269043, + "learning_rate": 1.9675515817869974e-06, + "loss": 0.5863, + "step": 10215 + }, + { + "epoch": 1.7723802914642608, + "grad_norm": 1.03114914894104, + "learning_rate": 1.9646057311601853e-06, + "loss": 0.699, + "step": 10216 + }, + { + "epoch": 1.7725537820957669, + "grad_norm": 1.82334303855896, + "learning_rate": 1.961661973561888e-06, + "loss": 0.5552, + "step": 10217 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 1.0841279029846191, + "learning_rate": 1.958720309333746e-06, + "loss": 0.6908, + "step": 10218 + }, + { + "epoch": 1.7729007633587788, + "grad_norm": 1.0015020370483398, + "learning_rate": 1.9557807388171257e-06, + "loss": 0.681, + "step": 10219 + }, + { + "epoch": 1.7730742539902846, + "grad_norm": 0.9661129713058472, + "learning_rate": 1.952843262353181e-06, + "loss": 0.7091, + "step": 10220 + }, + { + "epoch": 1.7732477446217905, + "grad_norm": 0.7839742302894592, + "learning_rate": 1.9499078802828044e-06, + "loss": 0.5808, + "step": 10221 + }, + { + "epoch": 1.7734212352532963, + "grad_norm": 0.8212881684303284, + "learning_rate": 1.946974592946651e-06, + "loss": 0.6921, + "step": 10222 + }, + { + "epoch": 1.7735947258848022, + "grad_norm": 0.8151078224182129, + "learning_rate": 1.9440434006851296e-06, + "loss": 0.6967, + "step": 10223 + }, + { + "epoch": 1.773768216516308, + "grad_norm": 1.2097114324569702, + "learning_rate": 1.9411143038384163e-06, + "loss": 0.5829, + "step": 10224 + }, + { + "epoch": 1.773941707147814, + "grad_norm": 1.1186792850494385, + "learning_rate": 1.938187302746424e-06, + "loss": 0.6226, + "step": 10225 + }, + { + "epoch": 1.77411519777932, + "grad_norm": 1.1043469905853271, + "learning_rate": 1.935262397748845e-06, + "loss": 0.6587, + "step": 10226 + }, + { + "epoch": 1.7742886884108258, + "grad_norm": 1.294297695159912, + "learning_rate": 1.932339589185115e-06, + "loss": 0.7139, + "step": 10227 + }, + { + "epoch": 1.7744621790423318, + "grad_norm": 0.9850130677223206, + "learning_rate": 1.929418877394429e-06, + "loss": 0.566, + "step": 10228 + }, + { + "epoch": 1.7746356696738377, + "grad_norm": 0.7916104793548584, + "learning_rate": 1.9265002627157335e-06, + "loss": 0.7415, + "step": 10229 + }, + { + "epoch": 1.7748091603053435, + "grad_norm": 1.1208515167236328, + "learning_rate": 1.923583745487747e-06, + "loss": 0.7478, + "step": 10230 + }, + { + "epoch": 1.7749826509368494, + "grad_norm": 2.2471415996551514, + "learning_rate": 1.920669326048932e-06, + "loss": 0.714, + "step": 10231 + }, + { + "epoch": 1.7751561415683552, + "grad_norm": 0.9209645986557007, + "learning_rate": 1.917757004737506e-06, + "loss": 0.6908, + "step": 10232 + }, + { + "epoch": 1.775329632199861, + "grad_norm": 0.7613286375999451, + "learning_rate": 1.914846781891444e-06, + "loss": 0.73, + "step": 10233 + }, + { + "epoch": 1.7755031228313671, + "grad_norm": 0.8059459328651428, + "learning_rate": 1.9119386578484934e-06, + "loss": 0.6848, + "step": 10234 + }, + { + "epoch": 1.775676613462873, + "grad_norm": 0.8612025380134583, + "learning_rate": 1.909032632946137e-06, + "loss": 0.5981, + "step": 10235 + }, + { + "epoch": 1.775850104094379, + "grad_norm": 0.8923397660255432, + "learning_rate": 1.906128707521624e-06, + "loss": 0.7111, + "step": 10236 + }, + { + "epoch": 1.776023594725885, + "grad_norm": 0.7700812220573425, + "learning_rate": 1.903226881911957e-06, + "loss": 0.5999, + "step": 10237 + }, + { + "epoch": 1.7761970853573907, + "grad_norm": 0.8319199085235596, + "learning_rate": 1.900327156453896e-06, + "loss": 0.7068, + "step": 10238 + }, + { + "epoch": 1.7763705759888966, + "grad_norm": 1.5347031354904175, + "learning_rate": 1.8974295314839609e-06, + "loss": 0.5765, + "step": 10239 + }, + { + "epoch": 1.7765440666204024, + "grad_norm": 0.9503218531608582, + "learning_rate": 1.894534007338422e-06, + "loss": 0.5664, + "step": 10240 + }, + { + "epoch": 1.7767175572519083, + "grad_norm": 0.7843663692474365, + "learning_rate": 1.8916405843533092e-06, + "loss": 0.7009, + "step": 10241 + }, + { + "epoch": 1.7768910478834143, + "grad_norm": 0.8792386054992676, + "learning_rate": 1.8887492628644022e-06, + "loss": 0.6267, + "step": 10242 + }, + { + "epoch": 1.7770645385149202, + "grad_norm": 0.9171980619430542, + "learning_rate": 1.8858600432072527e-06, + "loss": 0.604, + "step": 10243 + }, + { + "epoch": 1.7772380291464263, + "grad_norm": 0.9401341080665588, + "learning_rate": 1.8829729257171503e-06, + "loss": 0.7356, + "step": 10244 + }, + { + "epoch": 1.777411519777932, + "grad_norm": 0.9681640863418579, + "learning_rate": 1.8800879107291537e-06, + "loss": 0.7021, + "step": 10245 + }, + { + "epoch": 1.777585010409438, + "grad_norm": 1.1219325065612793, + "learning_rate": 1.8772049985780616e-06, + "loss": 0.7859, + "step": 10246 + }, + { + "epoch": 1.7777585010409438, + "grad_norm": 0.8624433279037476, + "learning_rate": 1.8743241895984554e-06, + "loss": 0.7349, + "step": 10247 + }, + { + "epoch": 1.7779319916724496, + "grad_norm": 1.0535106658935547, + "learning_rate": 1.871445484124641e-06, + "loss": 0.5782, + "step": 10248 + }, + { + "epoch": 1.7781054823039555, + "grad_norm": 0.9833988547325134, + "learning_rate": 1.8685688824907044e-06, + "loss": 0.6108, + "step": 10249 + }, + { + "epoch": 1.7782789729354613, + "grad_norm": 0.7674720287322998, + "learning_rate": 1.8656943850304765e-06, + "loss": 0.7463, + "step": 10250 + }, + { + "epoch": 1.7784524635669674, + "grad_norm": 0.791776716709137, + "learning_rate": 1.8628219920775481e-06, + "loss": 0.613, + "step": 10251 + }, + { + "epoch": 1.7786259541984732, + "grad_norm": 1.6227202415466309, + "learning_rate": 1.8599517039652548e-06, + "loss": 0.7438, + "step": 10252 + }, + { + "epoch": 1.7787994448299793, + "grad_norm": 1.699987530708313, + "learning_rate": 1.8570835210267125e-06, + "loss": 0.6273, + "step": 10253 + }, + { + "epoch": 1.7789729354614852, + "grad_norm": 1.9262768030166626, + "learning_rate": 1.8542174435947614e-06, + "loss": 0.6896, + "step": 10254 + }, + { + "epoch": 1.779146426092991, + "grad_norm": 0.8870375156402588, + "learning_rate": 1.851353472002022e-06, + "loss": 0.6805, + "step": 10255 + }, + { + "epoch": 1.7793199167244969, + "grad_norm": 0.9949630498886108, + "learning_rate": 1.8484916065808622e-06, + "loss": 0.625, + "step": 10256 + }, + { + "epoch": 1.7794934073560027, + "grad_norm": 1.4359320402145386, + "learning_rate": 1.8456318476634006e-06, + "loss": 0.6409, + "step": 10257 + }, + { + "epoch": 1.7796668979875085, + "grad_norm": 0.8975815176963806, + "learning_rate": 1.8427741955815138e-06, + "loss": 0.6074, + "step": 10258 + }, + { + "epoch": 1.7798403886190146, + "grad_norm": 0.7698134183883667, + "learning_rate": 1.839918650666841e-06, + "loss": 0.7583, + "step": 10259 + }, + { + "epoch": 1.7800138792505205, + "grad_norm": 1.1313923597335815, + "learning_rate": 1.8370652132507705e-06, + "loss": 0.6985, + "step": 10260 + }, + { + "epoch": 1.7801873698820265, + "grad_norm": 0.799109935760498, + "learning_rate": 1.8342138836644419e-06, + "loss": 0.7539, + "step": 10261 + }, + { + "epoch": 1.7803608605135324, + "grad_norm": 0.9591084718704224, + "learning_rate": 1.8313646622387639e-06, + "loss": 0.7294, + "step": 10262 + }, + { + "epoch": 1.7805343511450382, + "grad_norm": 1.0636893510818481, + "learning_rate": 1.8285175493043893e-06, + "loss": 0.5614, + "step": 10263 + }, + { + "epoch": 1.780707841776544, + "grad_norm": 0.7896313071250916, + "learning_rate": 1.8256725451917233e-06, + "loss": 0.7899, + "step": 10264 + }, + { + "epoch": 1.78088133240805, + "grad_norm": 1.1041220426559448, + "learning_rate": 1.822829650230935e-06, + "loss": 0.6201, + "step": 10265 + }, + { + "epoch": 1.7810548230395558, + "grad_norm": 1.0067826509475708, + "learning_rate": 1.8199888647519537e-06, + "loss": 0.7559, + "step": 10266 + }, + { + "epoch": 1.7812283136710616, + "grad_norm": 1.6000429391860962, + "learning_rate": 1.81715018908444e-06, + "loss": 0.7451, + "step": 10267 + }, + { + "epoch": 1.7814018043025677, + "grad_norm": 2.5493710041046143, + "learning_rate": 1.8143136235578374e-06, + "loss": 0.7627, + "step": 10268 + }, + { + "epoch": 1.7815752949340735, + "grad_norm": 1.0039705038070679, + "learning_rate": 1.811479168501329e-06, + "loss": 0.6138, + "step": 10269 + }, + { + "epoch": 1.7817487855655796, + "grad_norm": 0.9817229509353638, + "learning_rate": 1.8086468242438582e-06, + "loss": 0.7581, + "step": 10270 + }, + { + "epoch": 1.7819222761970854, + "grad_norm": 0.7940771579742432, + "learning_rate": 1.8058165911141179e-06, + "loss": 0.6119, + "step": 10271 + }, + { + "epoch": 1.7820957668285913, + "grad_norm": 2.44180965423584, + "learning_rate": 1.8029884694405631e-06, + "loss": 0.6802, + "step": 10272 + }, + { + "epoch": 1.7822692574600971, + "grad_norm": 1.2973943948745728, + "learning_rate": 1.8001624595514022e-06, + "loss": 0.6429, + "step": 10273 + }, + { + "epoch": 1.782442748091603, + "grad_norm": 3.7027602195739746, + "learning_rate": 1.7973385617745953e-06, + "loss": 0.6824, + "step": 10274 + }, + { + "epoch": 1.7826162387231088, + "grad_norm": 0.7475103735923767, + "learning_rate": 1.7945167764378536e-06, + "loss": 0.6591, + "step": 10275 + }, + { + "epoch": 1.7827897293546149, + "grad_norm": 0.9845368266105652, + "learning_rate": 1.7916971038686614e-06, + "loss": 0.7012, + "step": 10276 + }, + { + "epoch": 1.7829632199861207, + "grad_norm": 0.8473227620124817, + "learning_rate": 1.7888795443942308e-06, + "loss": 0.8076, + "step": 10277 + }, + { + "epoch": 1.7831367106176268, + "grad_norm": 1.0862728357315063, + "learning_rate": 1.7860640983415533e-06, + "loss": 0.6565, + "step": 10278 + }, + { + "epoch": 1.7833102012491326, + "grad_norm": 1.191525936126709, + "learning_rate": 1.7832507660373589e-06, + "loss": 0.7825, + "step": 10279 + }, + { + "epoch": 1.7834836918806385, + "grad_norm": 1.0793123245239258, + "learning_rate": 1.7804395478081416e-06, + "loss": 0.7747, + "step": 10280 + }, + { + "epoch": 1.7836571825121443, + "grad_norm": 1.6569416522979736, + "learning_rate": 1.7776304439801384e-06, + "loss": 0.7024, + "step": 10281 + }, + { + "epoch": 1.7838306731436502, + "grad_norm": 2.076094388961792, + "learning_rate": 1.774823454879362e-06, + "loss": 0.6437, + "step": 10282 + }, + { + "epoch": 1.784004163775156, + "grad_norm": 0.8846420049667358, + "learning_rate": 1.7720185808315583e-06, + "loss": 0.6929, + "step": 10283 + }, + { + "epoch": 1.784177654406662, + "grad_norm": 0.8875096440315247, + "learning_rate": 1.7692158221622379e-06, + "loss": 0.8015, + "step": 10284 + }, + { + "epoch": 1.784351145038168, + "grad_norm": 1.1163994073867798, + "learning_rate": 1.7664151791966654e-06, + "loss": 0.5494, + "step": 10285 + }, + { + "epoch": 1.7845246356696738, + "grad_norm": 1.1979444026947021, + "learning_rate": 1.763616652259854e-06, + "loss": 0.6978, + "step": 10286 + }, + { + "epoch": 1.7846981263011799, + "grad_norm": 0.8116319179534912, + "learning_rate": 1.760820241676584e-06, + "loss": 0.6865, + "step": 10287 + }, + { + "epoch": 1.7848716169326857, + "grad_norm": 0.9498181343078613, + "learning_rate": 1.758025947771378e-06, + "loss": 0.621, + "step": 10288 + }, + { + "epoch": 1.7850451075641915, + "grad_norm": 0.9460271596908569, + "learning_rate": 1.7552337708685163e-06, + "loss": 0.8428, + "step": 10289 + }, + { + "epoch": 1.7852185981956974, + "grad_norm": 0.5787179470062256, + "learning_rate": 1.752443711292029e-06, + "loss": 0.7488, + "step": 10290 + }, + { + "epoch": 1.7853920888272032, + "grad_norm": 0.7906292080879211, + "learning_rate": 1.749655769365719e-06, + "loss": 0.7671, + "step": 10291 + }, + { + "epoch": 1.785565579458709, + "grad_norm": 0.9821382761001587, + "learning_rate": 1.7468699454131211e-06, + "loss": 0.6708, + "step": 10292 + }, + { + "epoch": 1.7857390700902152, + "grad_norm": 0.8666329979896545, + "learning_rate": 1.7440862397575343e-06, + "loss": 0.6849, + "step": 10293 + }, + { + "epoch": 1.785912560721721, + "grad_norm": 1.0590832233428955, + "learning_rate": 1.741304652722009e-06, + "loss": 0.6333, + "step": 10294 + }, + { + "epoch": 1.786086051353227, + "grad_norm": 1.4201442003250122, + "learning_rate": 1.7385251846293606e-06, + "loss": 0.8115, + "step": 10295 + }, + { + "epoch": 1.786259541984733, + "grad_norm": 0.948606014251709, + "learning_rate": 1.7357478358021374e-06, + "loss": 0.6724, + "step": 10296 + }, + { + "epoch": 1.7864330326162388, + "grad_norm": 0.7361804842948914, + "learning_rate": 1.732972606562664e-06, + "loss": 0.7261, + "step": 10297 + }, + { + "epoch": 1.7866065232477446, + "grad_norm": 1.0159456729888916, + "learning_rate": 1.7301994972330028e-06, + "loss": 0.5839, + "step": 10298 + }, + { + "epoch": 1.7867800138792505, + "grad_norm": 0.9316654205322266, + "learning_rate": 1.7274285081349807e-06, + "loss": 0.6204, + "step": 10299 + }, + { + "epoch": 1.7869535045107563, + "grad_norm": 0.9995400309562683, + "learning_rate": 1.724659639590167e-06, + "loss": 0.8333, + "step": 10300 + }, + { + "epoch": 1.7871269951422624, + "grad_norm": 0.9075520634651184, + "learning_rate": 1.7218928919199008e-06, + "loss": 0.7053, + "step": 10301 + }, + { + "epoch": 1.7873004857737682, + "grad_norm": 0.9698948264122009, + "learning_rate": 1.7191282654452646e-06, + "loss": 0.7976, + "step": 10302 + }, + { + "epoch": 1.7874739764052743, + "grad_norm": 0.777226984500885, + "learning_rate": 1.7163657604870932e-06, + "loss": 0.8098, + "step": 10303 + }, + { + "epoch": 1.7876474670367801, + "grad_norm": 0.8209930658340454, + "learning_rate": 1.7136053773659766e-06, + "loss": 0.802, + "step": 10304 + }, + { + "epoch": 1.787820957668286, + "grad_norm": 1.3651156425476074, + "learning_rate": 1.71084711640227e-06, + "loss": 0.7267, + "step": 10305 + }, + { + "epoch": 1.7879944482997918, + "grad_norm": 1.1961561441421509, + "learning_rate": 1.7080909779160615e-06, + "loss": 0.6661, + "step": 10306 + }, + { + "epoch": 1.7881679389312977, + "grad_norm": 1.699177861213684, + "learning_rate": 1.705336962227211e-06, + "loss": 0.7645, + "step": 10307 + }, + { + "epoch": 1.7883414295628035, + "grad_norm": 0.7439855933189392, + "learning_rate": 1.7025850696553248e-06, + "loss": 0.6987, + "step": 10308 + }, + { + "epoch": 1.7885149201943094, + "grad_norm": 0.8450948596000671, + "learning_rate": 1.6998353005197565e-06, + "loss": 0.7891, + "step": 10309 + }, + { + "epoch": 1.7886884108258154, + "grad_norm": 0.6491101384162903, + "learning_rate": 1.6970876551396309e-06, + "loss": 0.7224, + "step": 10310 + }, + { + "epoch": 1.7888619014573213, + "grad_norm": 2.102976083755493, + "learning_rate": 1.6943421338338085e-06, + "loss": 0.8008, + "step": 10311 + }, + { + "epoch": 1.7890353920888273, + "grad_norm": 1.192093014717102, + "learning_rate": 1.6915987369209142e-06, + "loss": 0.6099, + "step": 10312 + }, + { + "epoch": 1.7892088827203332, + "grad_norm": 1.0719733238220215, + "learning_rate": 1.6888574647193157e-06, + "loss": 0.6613, + "step": 10313 + }, + { + "epoch": 1.789382373351839, + "grad_norm": 1.124106526374817, + "learning_rate": 1.6861183175471495e-06, + "loss": 0.6561, + "step": 10314 + }, + { + "epoch": 1.7895558639833449, + "grad_norm": 1.0222985744476318, + "learning_rate": 1.6833812957222884e-06, + "loss": 0.7722, + "step": 10315 + }, + { + "epoch": 1.7897293546148507, + "grad_norm": 0.8422915935516357, + "learning_rate": 1.6806463995623735e-06, + "loss": 0.6384, + "step": 10316 + }, + { + "epoch": 1.7899028452463566, + "grad_norm": 0.8054456114768982, + "learning_rate": 1.6779136293847864e-06, + "loss": 0.6824, + "step": 10317 + }, + { + "epoch": 1.7900763358778626, + "grad_norm": 0.7852348685264587, + "learning_rate": 1.6751829855066804e-06, + "loss": 0.5779, + "step": 10318 + }, + { + "epoch": 1.7902498265093685, + "grad_norm": 0.8924658298492432, + "learning_rate": 1.6724544682449328e-06, + "loss": 0.675, + "step": 10319 + }, + { + "epoch": 1.7904233171408745, + "grad_norm": 1.726906180381775, + "learning_rate": 1.669728077916206e-06, + "loss": 0.5985, + "step": 10320 + }, + { + "epoch": 1.7905968077723804, + "grad_norm": 1.6645441055297852, + "learning_rate": 1.6670038148368916e-06, + "loss": 0.5371, + "step": 10321 + }, + { + "epoch": 1.7907702984038862, + "grad_norm": 0.748140275478363, + "learning_rate": 1.6642816793231499e-06, + "loss": 0.7683, + "step": 10322 + }, + { + "epoch": 1.790943789035392, + "grad_norm": 0.9654271006584167, + "learning_rate": 1.661561671690879e-06, + "loss": 0.7078, + "step": 10323 + }, + { + "epoch": 1.791117279666898, + "grad_norm": 0.7980874180793762, + "learning_rate": 1.6588437922557533e-06, + "loss": 0.6404, + "step": 10324 + }, + { + "epoch": 1.7912907702984038, + "grad_norm": 0.8470470905303955, + "learning_rate": 1.6561280413331672e-06, + "loss": 0.7002, + "step": 10325 + }, + { + "epoch": 1.7914642609299096, + "grad_norm": 0.8063391447067261, + "learning_rate": 1.6534144192383038e-06, + "loss": 0.7322, + "step": 10326 + }, + { + "epoch": 1.7916377515614157, + "grad_norm": 1.0206921100616455, + "learning_rate": 1.6507029262860718e-06, + "loss": 0.7371, + "step": 10327 + }, + { + "epoch": 1.7918112421929215, + "grad_norm": 1.0121952295303345, + "learning_rate": 1.6479935627911481e-06, + "loss": 0.5863, + "step": 10328 + }, + { + "epoch": 1.7919847328244276, + "grad_norm": 1.1041522026062012, + "learning_rate": 1.6452863290679522e-06, + "loss": 0.5696, + "step": 10329 + }, + { + "epoch": 1.7921582234559335, + "grad_norm": 0.8530250787734985, + "learning_rate": 1.6425812254306707e-06, + "loss": 0.8098, + "step": 10330 + }, + { + "epoch": 1.7923317140874393, + "grad_norm": 1.185184121131897, + "learning_rate": 1.6398782521932254e-06, + "loss": 0.7632, + "step": 10331 + }, + { + "epoch": 1.7925052047189451, + "grad_norm": 0.9128183722496033, + "learning_rate": 1.637177409669304e-06, + "loss": 0.6392, + "step": 10332 + }, + { + "epoch": 1.792678695350451, + "grad_norm": 0.8771811723709106, + "learning_rate": 1.6344786981723371e-06, + "loss": 0.6859, + "step": 10333 + }, + { + "epoch": 1.7928521859819568, + "grad_norm": 0.8656076788902283, + "learning_rate": 1.6317821180155214e-06, + "loss": 0.6732, + "step": 10334 + }, + { + "epoch": 1.793025676613463, + "grad_norm": 1.0798834562301636, + "learning_rate": 1.6290876695117951e-06, + "loss": 0.5457, + "step": 10335 + }, + { + "epoch": 1.7931991672449688, + "grad_norm": 0.7644118070602417, + "learning_rate": 1.6263953529738464e-06, + "loss": 0.7456, + "step": 10336 + }, + { + "epoch": 1.7933726578764748, + "grad_norm": 0.8017712831497192, + "learning_rate": 1.6237051687141336e-06, + "loss": 0.8003, + "step": 10337 + }, + { + "epoch": 1.7935461485079807, + "grad_norm": 0.8778343796730042, + "learning_rate": 1.621017117044843e-06, + "loss": 0.6559, + "step": 10338 + }, + { + "epoch": 1.7937196391394865, + "grad_norm": 0.8420104384422302, + "learning_rate": 1.6183311982779337e-06, + "loss": 0.6703, + "step": 10339 + }, + { + "epoch": 1.7938931297709924, + "grad_norm": 0.7487144470214844, + "learning_rate": 1.6156474127251077e-06, + "loss": 0.7006, + "step": 10340 + }, + { + "epoch": 1.7940666204024982, + "grad_norm": 0.916608989238739, + "learning_rate": 1.6129657606978221e-06, + "loss": 0.7234, + "step": 10341 + }, + { + "epoch": 1.794240111034004, + "grad_norm": 0.7786522507667542, + "learning_rate": 1.6102862425072818e-06, + "loss": 0.7139, + "step": 10342 + }, + { + "epoch": 1.7944136016655101, + "grad_norm": 0.7419846653938293, + "learning_rate": 1.6076088584644534e-06, + "loss": 0.8126, + "step": 10343 + }, + { + "epoch": 1.794587092297016, + "grad_norm": 1.2480489015579224, + "learning_rate": 1.6049336088800505e-06, + "loss": 0.6595, + "step": 10344 + }, + { + "epoch": 1.7947605829285218, + "grad_norm": 0.9021449685096741, + "learning_rate": 1.6022604940645337e-06, + "loss": 0.6469, + "step": 10345 + }, + { + "epoch": 1.7949340735600279, + "grad_norm": 0.8655927181243896, + "learning_rate": 1.5995895143281236e-06, + "loss": 0.5936, + "step": 10346 + }, + { + "epoch": 1.7951075641915337, + "grad_norm": 0.7709633708000183, + "learning_rate": 1.596920669980797e-06, + "loss": 0.6818, + "step": 10347 + }, + { + "epoch": 1.7952810548230396, + "grad_norm": 0.7353090643882751, + "learning_rate": 1.5942539613322638e-06, + "loss": 0.6921, + "step": 10348 + }, + { + "epoch": 1.7954545454545454, + "grad_norm": 1.2317283153533936, + "learning_rate": 1.5915893886920098e-06, + "loss": 0.7241, + "step": 10349 + }, + { + "epoch": 1.7956280360860513, + "grad_norm": 0.824559211730957, + "learning_rate": 1.5889269523692541e-06, + "loss": 0.6808, + "step": 10350 + }, + { + "epoch": 1.795801526717557, + "grad_norm": 0.8244564533233643, + "learning_rate": 1.586266652672981e-06, + "loss": 0.6722, + "step": 10351 + }, + { + "epoch": 1.7959750173490632, + "grad_norm": 1.1650018692016602, + "learning_rate": 1.5836084899119165e-06, + "loss": 0.6282, + "step": 10352 + }, + { + "epoch": 1.796148507980569, + "grad_norm": 0.8130766749382019, + "learning_rate": 1.5809524643945472e-06, + "loss": 0.7249, + "step": 10353 + }, + { + "epoch": 1.796321998612075, + "grad_norm": 1.0941648483276367, + "learning_rate": 1.5782985764291091e-06, + "loss": 0.6765, + "step": 10354 + }, + { + "epoch": 1.796495489243581, + "grad_norm": 0.8080827593803406, + "learning_rate": 1.575646826323587e-06, + "loss": 0.7178, + "step": 10355 + }, + { + "epoch": 1.7966689798750868, + "grad_norm": 0.7170622944831848, + "learning_rate": 1.5729972143857164e-06, + "loss": 0.8202, + "step": 10356 + }, + { + "epoch": 1.7968424705065926, + "grad_norm": 0.8765414357185364, + "learning_rate": 1.5703497409229896e-06, + "loss": 0.6306, + "step": 10357 + }, + { + "epoch": 1.7970159611380985, + "grad_norm": 1.0133540630340576, + "learning_rate": 1.567704406242654e-06, + "loss": 0.6676, + "step": 10358 + }, + { + "epoch": 1.7971894517696043, + "grad_norm": 0.667919933795929, + "learning_rate": 1.5650612106516993e-06, + "loss": 0.7323, + "step": 10359 + }, + { + "epoch": 1.7973629424011104, + "grad_norm": 1.0993776321411133, + "learning_rate": 1.5624201544568717e-06, + "loss": 0.6392, + "step": 10360 + }, + { + "epoch": 1.7975364330326162, + "grad_norm": 0.7513256669044495, + "learning_rate": 1.559781237964666e-06, + "loss": 0.7717, + "step": 10361 + }, + { + "epoch": 1.7977099236641223, + "grad_norm": 0.899480938911438, + "learning_rate": 1.557144461481337e-06, + "loss": 0.6752, + "step": 10362 + }, + { + "epoch": 1.7978834142956281, + "grad_norm": 0.9709644317626953, + "learning_rate": 1.5545098253128843e-06, + "loss": 0.5583, + "step": 10363 + }, + { + "epoch": 1.798056904927134, + "grad_norm": 1.159766435623169, + "learning_rate": 1.5518773297650613e-06, + "loss": 0.7949, + "step": 10364 + }, + { + "epoch": 1.7982303955586398, + "grad_norm": 0.7780696749687195, + "learning_rate": 1.5492469751433658e-06, + "loss": 0.6313, + "step": 10365 + }, + { + "epoch": 1.7984038861901457, + "grad_norm": 0.9188662767410278, + "learning_rate": 1.5466187617530647e-06, + "loss": 0.6631, + "step": 10366 + }, + { + "epoch": 1.7985773768216515, + "grad_norm": 1.0568931102752686, + "learning_rate": 1.543992689899152e-06, + "loss": 0.5935, + "step": 10367 + }, + { + "epoch": 1.7987508674531574, + "grad_norm": 0.8312101364135742, + "learning_rate": 1.541368759886397e-06, + "loss": 0.7308, + "step": 10368 + }, + { + "epoch": 1.7989243580846634, + "grad_norm": 1.1675119400024414, + "learning_rate": 1.5387469720193048e-06, + "loss": 0.7644, + "step": 10369 + }, + { + "epoch": 1.7990978487161693, + "grad_norm": 1.196729063987732, + "learning_rate": 1.5361273266021392e-06, + "loss": 0.58, + "step": 10370 + }, + { + "epoch": 1.7992713393476754, + "grad_norm": 1.0965790748596191, + "learning_rate": 1.5335098239389102e-06, + "loss": 0.7905, + "step": 10371 + }, + { + "epoch": 1.7994448299791812, + "grad_norm": 1.0815366506576538, + "learning_rate": 1.5308944643333857e-06, + "loss": 0.6451, + "step": 10372 + }, + { + "epoch": 1.799618320610687, + "grad_norm": 0.7983653545379639, + "learning_rate": 1.5282812480890784e-06, + "loss": 0.6685, + "step": 10373 + }, + { + "epoch": 1.799791811242193, + "grad_norm": 1.0535144805908203, + "learning_rate": 1.5256701755092574e-06, + "loss": 0.6934, + "step": 10374 + }, + { + "epoch": 1.7999653018736987, + "grad_norm": 0.8767684102058411, + "learning_rate": 1.5230612468969352e-06, + "loss": 0.7888, + "step": 10375 + }, + { + "epoch": 1.8001387925052046, + "grad_norm": 0.891264796257019, + "learning_rate": 1.5204544625548922e-06, + "loss": 0.5162, + "step": 10376 + }, + { + "epoch": 1.8003122831367107, + "grad_norm": 0.8264020681381226, + "learning_rate": 1.5178498227856353e-06, + "loss": 0.7834, + "step": 10377 + }, + { + "epoch": 1.8004857737682165, + "grad_norm": 1.0552148818969727, + "learning_rate": 1.5152473278914447e-06, + "loss": 0.6831, + "step": 10378 + }, + { + "epoch": 1.8006592643997226, + "grad_norm": 0.8448721766471863, + "learning_rate": 1.5126469781743436e-06, + "loss": 0.6995, + "step": 10379 + }, + { + "epoch": 1.8008327550312284, + "grad_norm": 0.8552347421646118, + "learning_rate": 1.5100487739360993e-06, + "loss": 0.6085, + "step": 10380 + }, + { + "epoch": 1.8010062456627343, + "grad_norm": 1.4961739778518677, + "learning_rate": 1.5074527154782393e-06, + "loss": 0.66, + "step": 10381 + }, + { + "epoch": 1.80117973629424, + "grad_norm": 0.7845373153686523, + "learning_rate": 1.5048588031020405e-06, + "loss": 0.6237, + "step": 10382 + }, + { + "epoch": 1.801353226925746, + "grad_norm": 1.0494145154953003, + "learning_rate": 1.5022670371085314e-06, + "loss": 0.5764, + "step": 10383 + }, + { + "epoch": 1.8015267175572518, + "grad_norm": 1.6511248350143433, + "learning_rate": 1.4996774177984818e-06, + "loss": 0.553, + "step": 10384 + }, + { + "epoch": 1.8017002081887576, + "grad_norm": 2.0794832706451416, + "learning_rate": 1.4970899454724319e-06, + "loss": 0.5811, + "step": 10385 + }, + { + "epoch": 1.8018736988202637, + "grad_norm": 1.10226309299469, + "learning_rate": 1.494504620430648e-06, + "loss": 0.6853, + "step": 10386 + }, + { + "epoch": 1.8020471894517696, + "grad_norm": 0.7803844213485718, + "learning_rate": 1.4919214429731677e-06, + "loss": 0.7495, + "step": 10387 + }, + { + "epoch": 1.8022206800832756, + "grad_norm": 0.8485408425331116, + "learning_rate": 1.489340413399769e-06, + "loss": 0.6375, + "step": 10388 + }, + { + "epoch": 1.8023941707147815, + "grad_norm": 0.8034685850143433, + "learning_rate": 1.4867615320099904e-06, + "loss": 0.8455, + "step": 10389 + }, + { + "epoch": 1.8025676613462873, + "grad_norm": 0.9845173358917236, + "learning_rate": 1.484184799103101e-06, + "loss": 0.6455, + "step": 10390 + }, + { + "epoch": 1.8027411519777932, + "grad_norm": 1.4602437019348145, + "learning_rate": 1.4816102149781442e-06, + "loss": 0.6449, + "step": 10391 + }, + { + "epoch": 1.802914642609299, + "grad_norm": 1.7372161149978638, + "learning_rate": 1.4790377799339007e-06, + "loss": 0.6147, + "step": 10392 + }, + { + "epoch": 1.8030881332408049, + "grad_norm": 0.836021363735199, + "learning_rate": 1.476467494268905e-06, + "loss": 0.6449, + "step": 10393 + }, + { + "epoch": 1.803261623872311, + "grad_norm": 2.170283555984497, + "learning_rate": 1.4738993582814343e-06, + "loss": 0.6444, + "step": 10394 + }, + { + "epoch": 1.8034351145038168, + "grad_norm": 1.0143884420394897, + "learning_rate": 1.471333372269539e-06, + "loss": 0.7634, + "step": 10395 + }, + { + "epoch": 1.8036086051353228, + "grad_norm": 1.9257310628890991, + "learning_rate": 1.4687695365309895e-06, + "loss": 0.7125, + "step": 10396 + }, + { + "epoch": 1.8037820957668287, + "grad_norm": 0.9180889129638672, + "learning_rate": 1.4662078513633327e-06, + "loss": 0.6034, + "step": 10397 + }, + { + "epoch": 1.8039555863983345, + "grad_norm": 0.9589160680770874, + "learning_rate": 1.4636483170638505e-06, + "loss": 0.7141, + "step": 10398 + }, + { + "epoch": 1.8041290770298404, + "grad_norm": 1.0118221044540405, + "learning_rate": 1.4610909339295788e-06, + "loss": 0.6442, + "step": 10399 + }, + { + "epoch": 1.8043025676613462, + "grad_norm": 1.5050517320632935, + "learning_rate": 1.4585357022573043e-06, + "loss": 0.7825, + "step": 10400 + }, + { + "epoch": 1.804476058292852, + "grad_norm": 0.9964337944984436, + "learning_rate": 1.455982622343568e-06, + "loss": 0.6321, + "step": 10401 + }, + { + "epoch": 1.8046495489243581, + "grad_norm": 1.004628300666809, + "learning_rate": 1.4534316944846595e-06, + "loss": 0.6838, + "step": 10402 + }, + { + "epoch": 1.804823039555864, + "grad_norm": 1.3396317958831787, + "learning_rate": 1.4508829189766104e-06, + "loss": 0.558, + "step": 10403 + }, + { + "epoch": 1.8049965301873698, + "grad_norm": 1.548414707183838, + "learning_rate": 1.4483362961152114e-06, + "loss": 0.6868, + "step": 10404 + }, + { + "epoch": 1.805170020818876, + "grad_norm": 0.8822525143623352, + "learning_rate": 1.4457918261960057e-06, + "loss": 0.7244, + "step": 10405 + }, + { + "epoch": 1.8053435114503817, + "grad_norm": 1.2911570072174072, + "learning_rate": 1.4432495095142796e-06, + "loss": 0.6624, + "step": 10406 + }, + { + "epoch": 1.8055170020818876, + "grad_norm": 1.2486634254455566, + "learning_rate": 1.4407093463650679e-06, + "loss": 0.7712, + "step": 10407 + }, + { + "epoch": 1.8056904927133934, + "grad_norm": 1.089964509010315, + "learning_rate": 1.438171337043164e-06, + "loss": 0.6759, + "step": 10408 + }, + { + "epoch": 1.8058639833448993, + "grad_norm": 1.5116668939590454, + "learning_rate": 1.4356354818431028e-06, + "loss": 0.6124, + "step": 10409 + }, + { + "epoch": 1.8060374739764051, + "grad_norm": 2.0111656188964844, + "learning_rate": 1.4331017810591764e-06, + "loss": 0.6232, + "step": 10410 + }, + { + "epoch": 1.8062109646079112, + "grad_norm": 1.0119322538375854, + "learning_rate": 1.4305702349854245e-06, + "loss": 0.6556, + "step": 10411 + }, + { + "epoch": 1.806384455239417, + "grad_norm": 1.0209075212478638, + "learning_rate": 1.4280408439156369e-06, + "loss": 0.6476, + "step": 10412 + }, + { + "epoch": 1.806557945870923, + "grad_norm": 0.8813801407814026, + "learning_rate": 1.425513608143343e-06, + "loss": 0.7112, + "step": 10413 + }, + { + "epoch": 1.806731436502429, + "grad_norm": 0.7556938529014587, + "learning_rate": 1.4229885279618461e-06, + "loss": 0.7653, + "step": 10414 + }, + { + "epoch": 1.8069049271339348, + "grad_norm": 1.4264389276504517, + "learning_rate": 1.4204656036641717e-06, + "loss": 0.7307, + "step": 10415 + }, + { + "epoch": 1.8070784177654406, + "grad_norm": 0.7085569500923157, + "learning_rate": 1.4179448355431168e-06, + "loss": 0.854, + "step": 10416 + }, + { + "epoch": 1.8072519083969465, + "grad_norm": 0.696517288684845, + "learning_rate": 1.415426223891212e-06, + "loss": 0.8542, + "step": 10417 + }, + { + "epoch": 1.8074253990284523, + "grad_norm": 1.5400617122650146, + "learning_rate": 1.4129097690007543e-06, + "loss": 0.6241, + "step": 10418 + }, + { + "epoch": 1.8075988896599584, + "grad_norm": 0.913309633731842, + "learning_rate": 1.4103954711637724e-06, + "loss": 0.7493, + "step": 10419 + }, + { + "epoch": 1.8077723802914643, + "grad_norm": 0.8171799778938293, + "learning_rate": 1.4078833306720573e-06, + "loss": 0.6152, + "step": 10420 + }, + { + "epoch": 1.8079458709229703, + "grad_norm": 0.7422547340393066, + "learning_rate": 1.4053733478171493e-06, + "loss": 0.7319, + "step": 10421 + }, + { + "epoch": 1.8081193615544762, + "grad_norm": 0.6749753952026367, + "learning_rate": 1.4028655228903286e-06, + "loss": 0.7134, + "step": 10422 + }, + { + "epoch": 1.808292852185982, + "grad_norm": 0.6997758150100708, + "learning_rate": 1.4003598561826337e-06, + "loss": 0.6394, + "step": 10423 + }, + { + "epoch": 1.8084663428174879, + "grad_norm": 0.7608610391616821, + "learning_rate": 1.3978563479848538e-06, + "loss": 0.7617, + "step": 10424 + }, + { + "epoch": 1.8086398334489937, + "grad_norm": 1.311803936958313, + "learning_rate": 1.3953549985875148e-06, + "loss": 0.6018, + "step": 10425 + }, + { + "epoch": 1.8088133240804996, + "grad_norm": 0.7831500172615051, + "learning_rate": 1.3928558082809107e-06, + "loss": 0.6781, + "step": 10426 + }, + { + "epoch": 1.8089868147120054, + "grad_norm": 1.4155802726745605, + "learning_rate": 1.390358777355072e-06, + "loss": 0.6191, + "step": 10427 + }, + { + "epoch": 1.8091603053435115, + "grad_norm": 0.9566314220428467, + "learning_rate": 1.3878639060997822e-06, + "loss": 0.6504, + "step": 10428 + }, + { + "epoch": 1.8093337959750173, + "grad_norm": 1.8753807544708252, + "learning_rate": 1.3853711948045678e-06, + "loss": 0.668, + "step": 10429 + }, + { + "epoch": 1.8095072866065234, + "grad_norm": 0.8693917393684387, + "learning_rate": 1.3828806437587216e-06, + "loss": 0.7234, + "step": 10430 + }, + { + "epoch": 1.8096807772380292, + "grad_norm": 1.153550148010254, + "learning_rate": 1.3803922532512703e-06, + "loss": 0.6479, + "step": 10431 + }, + { + "epoch": 1.809854267869535, + "grad_norm": 0.73427414894104, + "learning_rate": 1.3779060235709918e-06, + "loss": 0.7418, + "step": 10432 + }, + { + "epoch": 1.810027758501041, + "grad_norm": 1.4947834014892578, + "learning_rate": 1.3754219550064196e-06, + "loss": 0.7209, + "step": 10433 + }, + { + "epoch": 1.8102012491325468, + "grad_norm": 0.8151593804359436, + "learning_rate": 1.3729400478458322e-06, + "loss": 0.5547, + "step": 10434 + }, + { + "epoch": 1.8103747397640526, + "grad_norm": 0.7802128791809082, + "learning_rate": 1.3704603023772567e-06, + "loss": 0.6611, + "step": 10435 + }, + { + "epoch": 1.8105482303955587, + "grad_norm": 1.2369894981384277, + "learning_rate": 1.3679827188884675e-06, + "loss": 0.6803, + "step": 10436 + }, + { + "epoch": 1.8107217210270645, + "grad_norm": 1.239542841911316, + "learning_rate": 1.3655072976670014e-06, + "loss": 0.6193, + "step": 10437 + }, + { + "epoch": 1.8108952116585706, + "grad_norm": 1.3903056383132935, + "learning_rate": 1.3630340390001195e-06, + "loss": 0.5491, + "step": 10438 + }, + { + "epoch": 1.8110687022900764, + "grad_norm": 0.8192280530929565, + "learning_rate": 1.360562943174859e-06, + "loss": 0.672, + "step": 10439 + }, + { + "epoch": 1.8112421929215823, + "grad_norm": 0.9243263602256775, + "learning_rate": 1.358094010477986e-06, + "loss": 0.5593, + "step": 10440 + }, + { + "epoch": 1.8114156835530881, + "grad_norm": 0.755497932434082, + "learning_rate": 1.355627241196027e-06, + "loss": 0.759, + "step": 10441 + }, + { + "epoch": 1.811589174184594, + "grad_norm": 1.077436923980713, + "learning_rate": 1.353162635615246e-06, + "loss": 0.7177, + "step": 10442 + }, + { + "epoch": 1.8117626648160998, + "grad_norm": 0.9135211706161499, + "learning_rate": 1.3507001940216767e-06, + "loss": 0.69, + "step": 10443 + }, + { + "epoch": 1.8119361554476057, + "grad_norm": 0.9033061265945435, + "learning_rate": 1.3482399167010752e-06, + "loss": 0.6429, + "step": 10444 + }, + { + "epoch": 1.8121096460791117, + "grad_norm": 5.1854658126831055, + "learning_rate": 1.345781803938968e-06, + "loss": 0.6506, + "step": 10445 + }, + { + "epoch": 1.8122831367106176, + "grad_norm": 1.3311586380004883, + "learning_rate": 1.3433258560206165e-06, + "loss": 0.6138, + "step": 10446 + }, + { + "epoch": 1.8124566273421236, + "grad_norm": 0.9090029001235962, + "learning_rate": 1.3408720732310432e-06, + "loss": 0.78, + "step": 10447 + }, + { + "epoch": 1.8126301179736295, + "grad_norm": 1.835009217262268, + "learning_rate": 1.3384204558550028e-06, + "loss": 0.7233, + "step": 10448 + }, + { + "epoch": 1.8128036086051353, + "grad_norm": 0.8987499475479126, + "learning_rate": 1.3359710041770147e-06, + "loss": 0.6537, + "step": 10449 + }, + { + "epoch": 1.8129770992366412, + "grad_norm": 0.8948591947555542, + "learning_rate": 1.333523718481342e-06, + "loss": 0.714, + "step": 10450 + }, + { + "epoch": 1.813150589868147, + "grad_norm": 1.0137816667556763, + "learning_rate": 1.3310785990519913e-06, + "loss": 0.7666, + "step": 10451 + }, + { + "epoch": 1.8133240804996529, + "grad_norm": 0.925883948802948, + "learning_rate": 1.3286356461727202e-06, + "loss": 0.8044, + "step": 10452 + }, + { + "epoch": 1.813497571131159, + "grad_norm": 0.8427232503890991, + "learning_rate": 1.3261948601270413e-06, + "loss": 0.7166, + "step": 10453 + }, + { + "epoch": 1.8136710617626648, + "grad_norm": 0.9277542233467102, + "learning_rate": 1.3237562411982086e-06, + "loss": 0.6382, + "step": 10454 + }, + { + "epoch": 1.8138445523941709, + "grad_norm": 3.2410616874694824, + "learning_rate": 1.3213197896692264e-06, + "loss": 0.6926, + "step": 10455 + }, + { + "epoch": 1.8140180430256767, + "grad_norm": 1.4359015226364136, + "learning_rate": 1.3188855058228468e-06, + "loss": 0.6898, + "step": 10456 + }, + { + "epoch": 1.8141915336571826, + "grad_norm": 0.6948106288909912, + "learning_rate": 1.31645338994157e-06, + "loss": 0.7625, + "step": 10457 + }, + { + "epoch": 1.8143650242886884, + "grad_norm": 0.8375139832496643, + "learning_rate": 1.3140234423076504e-06, + "loss": 0.6836, + "step": 10458 + }, + { + "epoch": 1.8145385149201942, + "grad_norm": 1.1694769859313965, + "learning_rate": 1.3115956632030845e-06, + "loss": 0.6765, + "step": 10459 + }, + { + "epoch": 1.8147120055517, + "grad_norm": 1.6982983350753784, + "learning_rate": 1.3091700529096186e-06, + "loss": 0.6956, + "step": 10460 + }, + { + "epoch": 1.8148854961832062, + "grad_norm": 1.5148093700408936, + "learning_rate": 1.3067466117087424e-06, + "loss": 0.6168, + "step": 10461 + }, + { + "epoch": 1.815058986814712, + "grad_norm": 4.021671295166016, + "learning_rate": 1.304325339881709e-06, + "loss": 0.6321, + "step": 10462 + }, + { + "epoch": 1.8152324774462179, + "grad_norm": 0.9338770508766174, + "learning_rate": 1.3019062377095026e-06, + "loss": 0.7561, + "step": 10463 + }, + { + "epoch": 1.815405968077724, + "grad_norm": 0.8266600370407104, + "learning_rate": 1.2994893054728653e-06, + "loss": 0.7554, + "step": 10464 + }, + { + "epoch": 1.8155794587092298, + "grad_norm": 1.0455262660980225, + "learning_rate": 1.2970745434522835e-06, + "loss": 0.7546, + "step": 10465 + }, + { + "epoch": 1.8157529493407356, + "grad_norm": 1.0044349431991577, + "learning_rate": 1.2946619519279979e-06, + "loss": 0.6029, + "step": 10466 + }, + { + "epoch": 1.8159264399722415, + "grad_norm": 0.8505327701568604, + "learning_rate": 1.2922515311799843e-06, + "loss": 0.7047, + "step": 10467 + }, + { + "epoch": 1.8160999306037473, + "grad_norm": 0.8839706778526306, + "learning_rate": 1.2898432814879813e-06, + "loss": 0.6311, + "step": 10468 + }, + { + "epoch": 1.8162734212352531, + "grad_norm": 1.1349387168884277, + "learning_rate": 1.2874372031314675e-06, + "loss": 0.6832, + "step": 10469 + }, + { + "epoch": 1.8164469118667592, + "grad_norm": 0.8020073175430298, + "learning_rate": 1.2850332963896706e-06, + "loss": 0.7656, + "step": 10470 + }, + { + "epoch": 1.816620402498265, + "grad_norm": 0.8114110827445984, + "learning_rate": 1.2826315615415652e-06, + "loss": 0.7437, + "step": 10471 + }, + { + "epoch": 1.8167938931297711, + "grad_norm": 0.7006446123123169, + "learning_rate": 1.2802319988658818e-06, + "loss": 0.8054, + "step": 10472 + }, + { + "epoch": 1.816967383761277, + "grad_norm": 0.7375672459602356, + "learning_rate": 1.277834608641082e-06, + "loss": 0.8223, + "step": 10473 + }, + { + "epoch": 1.8171408743927828, + "grad_norm": 0.8176854252815247, + "learning_rate": 1.2754393911453944e-06, + "loss": 0.6968, + "step": 10474 + }, + { + "epoch": 1.8173143650242887, + "grad_norm": 1.1905615329742432, + "learning_rate": 1.273046346656781e-06, + "loss": 0.5913, + "step": 10475 + }, + { + "epoch": 1.8174878556557945, + "grad_norm": 0.9247989058494568, + "learning_rate": 1.2706554754529665e-06, + "loss": 0.7057, + "step": 10476 + }, + { + "epoch": 1.8176613462873004, + "grad_norm": 0.9013538360595703, + "learning_rate": 1.2682667778114022e-06, + "loss": 0.7302, + "step": 10477 + }, + { + "epoch": 1.8178348369188064, + "grad_norm": 1.1299220323562622, + "learning_rate": 1.2658802540093084e-06, + "loss": 0.693, + "step": 10478 + }, + { + "epoch": 1.8180083275503123, + "grad_norm": 0.8593624234199524, + "learning_rate": 1.2634959043236395e-06, + "loss": 0.7888, + "step": 10479 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.7509849071502686, + "learning_rate": 1.2611137290311003e-06, + "loss": 0.7476, + "step": 10480 + }, + { + "epoch": 1.8183553088133242, + "grad_norm": 0.808872401714325, + "learning_rate": 1.2587337284081524e-06, + "loss": 0.6298, + "step": 10481 + }, + { + "epoch": 1.81852879944483, + "grad_norm": 1.0017765760421753, + "learning_rate": 1.2563559027309925e-06, + "loss": 0.5743, + "step": 10482 + }, + { + "epoch": 1.8187022900763359, + "grad_norm": 1.0465316772460938, + "learning_rate": 1.2539802522755707e-06, + "loss": 0.6403, + "step": 10483 + }, + { + "epoch": 1.8188757807078417, + "grad_norm": 1.259190559387207, + "learning_rate": 1.2516067773175822e-06, + "loss": 0.6924, + "step": 10484 + }, + { + "epoch": 1.8190492713393476, + "grad_norm": 0.8445786833763123, + "learning_rate": 1.2492354781324778e-06, + "loss": 0.6104, + "step": 10485 + }, + { + "epoch": 1.8192227619708534, + "grad_norm": 0.7929750680923462, + "learning_rate": 1.2468663549954397e-06, + "loss": 0.7307, + "step": 10486 + }, + { + "epoch": 1.8193962526023595, + "grad_norm": 0.8400115370750427, + "learning_rate": 1.2444994081814165e-06, + "loss": 0.5972, + "step": 10487 + }, + { + "epoch": 1.8195697432338653, + "grad_norm": 0.9791523814201355, + "learning_rate": 1.2421346379650868e-06, + "loss": 0.7183, + "step": 10488 + }, + { + "epoch": 1.8197432338653714, + "grad_norm": 0.976411759853363, + "learning_rate": 1.2397720446208972e-06, + "loss": 0.7412, + "step": 10489 + }, + { + "epoch": 1.8199167244968772, + "grad_norm": 1.0662147998809814, + "learning_rate": 1.2374116284230153e-06, + "loss": 0.8318, + "step": 10490 + }, + { + "epoch": 1.820090215128383, + "grad_norm": 1.9037861824035645, + "learning_rate": 1.2350533896453799e-06, + "loss": 0.7563, + "step": 10491 + }, + { + "epoch": 1.820263705759889, + "grad_norm": 0.7577615976333618, + "learning_rate": 1.2326973285616628e-06, + "loss": 0.651, + "step": 10492 + }, + { + "epoch": 1.8204371963913948, + "grad_norm": 0.8601496815681458, + "learning_rate": 1.2303434454452901e-06, + "loss": 0.6278, + "step": 10493 + }, + { + "epoch": 1.8206106870229006, + "grad_norm": 0.7998297810554504, + "learning_rate": 1.2279917405694298e-06, + "loss": 0.6663, + "step": 10494 + }, + { + "epoch": 1.8207841776544067, + "grad_norm": 2.4842052459716797, + "learning_rate": 1.2256422142070057e-06, + "loss": 0.681, + "step": 10495 + }, + { + "epoch": 1.8209576682859125, + "grad_norm": 0.8469060063362122, + "learning_rate": 1.2232948666306732e-06, + "loss": 0.5616, + "step": 10496 + }, + { + "epoch": 1.8211311589174186, + "grad_norm": 0.9011720418930054, + "learning_rate": 1.2209496981128543e-06, + "loss": 0.6658, + "step": 10497 + }, + { + "epoch": 1.8213046495489245, + "grad_norm": 1.0221352577209473, + "learning_rate": 1.218606708925707e-06, + "loss": 0.6729, + "step": 10498 + }, + { + "epoch": 1.8214781401804303, + "grad_norm": 1.322292685508728, + "learning_rate": 1.2162658993411336e-06, + "loss": 0.5695, + "step": 10499 + }, + { + "epoch": 1.8216516308119362, + "grad_norm": 1.0976120233535767, + "learning_rate": 1.2139272696307857e-06, + "loss": 0.6252, + "step": 10500 + }, + { + "epoch": 1.821825121443442, + "grad_norm": 0.8948221802711487, + "learning_rate": 1.2115908200660731e-06, + "loss": 0.5808, + "step": 10501 + }, + { + "epoch": 1.8219986120749478, + "grad_norm": 0.8044713735580444, + "learning_rate": 1.2092565509181386e-06, + "loss": 0.6924, + "step": 10502 + }, + { + "epoch": 1.8221721027064537, + "grad_norm": 0.9807984232902527, + "learning_rate": 1.2069244624578768e-06, + "loss": 0.579, + "step": 10503 + }, + { + "epoch": 1.8223455933379598, + "grad_norm": 1.0402348041534424, + "learning_rate": 1.2045945549559269e-06, + "loss": 0.7545, + "step": 10504 + }, + { + "epoch": 1.8225190839694656, + "grad_norm": 0.8155692219734192, + "learning_rate": 1.202266828682681e-06, + "loss": 0.6548, + "step": 10505 + }, + { + "epoch": 1.8226925746009717, + "grad_norm": 1.3287193775177002, + "learning_rate": 1.1999412839082748e-06, + "loss": 0.6227, + "step": 10506 + }, + { + "epoch": 1.8228660652324775, + "grad_norm": 0.9781575202941895, + "learning_rate": 1.1976179209025896e-06, + "loss": 0.6351, + "step": 10507 + }, + { + "epoch": 1.8230395558639834, + "grad_norm": 3.0555241107940674, + "learning_rate": 1.1952967399352522e-06, + "loss": 0.5614, + "step": 10508 + }, + { + "epoch": 1.8232130464954892, + "grad_norm": 0.9297942519187927, + "learning_rate": 1.1929777412756382e-06, + "loss": 0.6726, + "step": 10509 + }, + { + "epoch": 1.823386537126995, + "grad_norm": 0.8501873016357422, + "learning_rate": 1.1906609251928746e-06, + "loss": 0.6587, + "step": 10510 + }, + { + "epoch": 1.823560027758501, + "grad_norm": 1.0164989233016968, + "learning_rate": 1.1883462919558263e-06, + "loss": 0.6121, + "step": 10511 + }, + { + "epoch": 1.823733518390007, + "grad_norm": 0.7658737897872925, + "learning_rate": 1.186033841833112e-06, + "loss": 0.7446, + "step": 10512 + }, + { + "epoch": 1.8239070090215128, + "grad_norm": 0.908176600933075, + "learning_rate": 1.1837235750930898e-06, + "loss": 0.5809, + "step": 10513 + }, + { + "epoch": 1.8240804996530189, + "grad_norm": 1.0223183631896973, + "learning_rate": 1.1814154920038789e-06, + "loss": 0.5656, + "step": 10514 + }, + { + "epoch": 1.8242539902845247, + "grad_norm": 1.0727120637893677, + "learning_rate": 1.179109592833323e-06, + "loss": 0.7235, + "step": 10515 + }, + { + "epoch": 1.8244274809160306, + "grad_norm": 1.0005948543548584, + "learning_rate": 1.17680587784903e-06, + "loss": 0.6216, + "step": 10516 + }, + { + "epoch": 1.8246009715475364, + "grad_norm": 1.2637439966201782, + "learning_rate": 1.174504347318346e-06, + "loss": 0.7412, + "step": 10517 + }, + { + "epoch": 1.8247744621790423, + "grad_norm": 0.7099981307983398, + "learning_rate": 1.1722050015083752e-06, + "loss": 0.7988, + "step": 10518 + }, + { + "epoch": 1.8249479528105481, + "grad_norm": 0.8606386780738831, + "learning_rate": 1.1699078406859466e-06, + "loss": 0.7372, + "step": 10519 + }, + { + "epoch": 1.8251214434420542, + "grad_norm": 0.6610363125801086, + "learning_rate": 1.1676128651176578e-06, + "loss": 0.6777, + "step": 10520 + }, + { + "epoch": 1.82529493407356, + "grad_norm": 0.9529474973678589, + "learning_rate": 1.1653200750698402e-06, + "loss": 0.7399, + "step": 10521 + }, + { + "epoch": 1.8254684247050659, + "grad_norm": 0.786693274974823, + "learning_rate": 1.163029470808572e-06, + "loss": 0.8464, + "step": 10522 + }, + { + "epoch": 1.825641915336572, + "grad_norm": 0.9877683520317078, + "learning_rate": 1.1607410525996832e-06, + "loss": 0.5881, + "step": 10523 + }, + { + "epoch": 1.8258154059680778, + "grad_norm": 2.2365987300872803, + "learning_rate": 1.1584548207087498e-06, + "loss": 0.583, + "step": 10524 + }, + { + "epoch": 1.8259888965995836, + "grad_norm": 1.1014721393585205, + "learning_rate": 1.1561707754010887e-06, + "loss": 0.7289, + "step": 10525 + }, + { + "epoch": 1.8261623872310895, + "grad_norm": 0.7865198254585266, + "learning_rate": 1.1538889169417654e-06, + "loss": 0.5847, + "step": 10526 + }, + { + "epoch": 1.8263358778625953, + "grad_norm": 1.0256744623184204, + "learning_rate": 1.1516092455955951e-06, + "loss": 0.6292, + "step": 10527 + }, + { + "epoch": 1.8265093684941012, + "grad_norm": 1.4468580484390259, + "learning_rate": 1.1493317616271327e-06, + "loss": 0.7292, + "step": 10528 + }, + { + "epoch": 1.8266828591256072, + "grad_norm": 0.8416298031806946, + "learning_rate": 1.1470564653006844e-06, + "loss": 0.7463, + "step": 10529 + }, + { + "epoch": 1.826856349757113, + "grad_norm": 1.0290980339050293, + "learning_rate": 1.1447833568803036e-06, + "loss": 0.6014, + "step": 10530 + }, + { + "epoch": 1.8270298403886192, + "grad_norm": 0.97081458568573, + "learning_rate": 1.1425124366297858e-06, + "loss": 0.807, + "step": 10531 + }, + { + "epoch": 1.827203331020125, + "grad_norm": 0.6764442920684814, + "learning_rate": 1.140243704812667e-06, + "loss": 0.7852, + "step": 10532 + }, + { + "epoch": 1.8273768216516308, + "grad_norm": 2.476881742477417, + "learning_rate": 1.1379771616922474e-06, + "loss": 0.6851, + "step": 10533 + }, + { + "epoch": 1.8275503122831367, + "grad_norm": 1.7045180797576904, + "learning_rate": 1.1357128075315572e-06, + "loss": 0.7209, + "step": 10534 + }, + { + "epoch": 1.8277238029146425, + "grad_norm": 0.665728747844696, + "learning_rate": 1.1334506425933745e-06, + "loss": 0.78, + "step": 10535 + }, + { + "epoch": 1.8278972935461484, + "grad_norm": 0.8095372915267944, + "learning_rate": 1.1311906671402274e-06, + "loss": 0.7817, + "step": 10536 + }, + { + "epoch": 1.8280707841776545, + "grad_norm": 0.7966381907463074, + "learning_rate": 1.1289328814343969e-06, + "loss": 0.6649, + "step": 10537 + }, + { + "epoch": 1.8282442748091603, + "grad_norm": 0.9591763019561768, + "learning_rate": 1.126677285737887e-06, + "loss": 0.653, + "step": 10538 + }, + { + "epoch": 1.8284177654406664, + "grad_norm": 0.8966863751411438, + "learning_rate": 1.124423880312473e-06, + "loss": 0.7771, + "step": 10539 + }, + { + "epoch": 1.8285912560721722, + "grad_norm": 1.0086137056350708, + "learning_rate": 1.122172665419663e-06, + "loss": 0.7283, + "step": 10540 + }, + { + "epoch": 1.828764746703678, + "grad_norm": 0.880584716796875, + "learning_rate": 1.1199236413207127e-06, + "loss": 0.6427, + "step": 10541 + }, + { + "epoch": 1.828938237335184, + "grad_norm": 1.0431946516036987, + "learning_rate": 1.1176768082766177e-06, + "loss": 0.6641, + "step": 10542 + }, + { + "epoch": 1.8291117279666897, + "grad_norm": 0.6687842011451721, + "learning_rate": 1.1154321665481404e-06, + "loss": 0.7496, + "step": 10543 + }, + { + "epoch": 1.8292852185981956, + "grad_norm": 0.746192216873169, + "learning_rate": 1.1131897163957573e-06, + "loss": 0.7411, + "step": 10544 + }, + { + "epoch": 1.8294587092297014, + "grad_norm": 0.9176620244979858, + "learning_rate": 1.1109494580797175e-06, + "loss": 0.5148, + "step": 10545 + }, + { + "epoch": 1.8296321998612075, + "grad_norm": 3.515878438949585, + "learning_rate": 1.1087113918600023e-06, + "loss": 0.5603, + "step": 10546 + }, + { + "epoch": 1.8298056904927134, + "grad_norm": 0.7978086471557617, + "learning_rate": 1.1064755179963483e-06, + "loss": 0.6393, + "step": 10547 + }, + { + "epoch": 1.8299791811242194, + "grad_norm": 0.8006490468978882, + "learning_rate": 1.1042418367482188e-06, + "loss": 0.6641, + "step": 10548 + }, + { + "epoch": 1.8301526717557253, + "grad_norm": 0.8114250302314758, + "learning_rate": 1.1020103483748469e-06, + "loss": 0.7411, + "step": 10549 + }, + { + "epoch": 1.8303261623872311, + "grad_norm": 0.8507854342460632, + "learning_rate": 1.0997810531351916e-06, + "loss": 0.7002, + "step": 10550 + }, + { + "epoch": 1.830499653018737, + "grad_norm": 1.5086146593093872, + "learning_rate": 1.0975539512879707e-06, + "loss": 0.7153, + "step": 10551 + }, + { + "epoch": 1.8306731436502428, + "grad_norm": 0.7170061469078064, + "learning_rate": 1.0953290430916353e-06, + "loss": 0.6709, + "step": 10552 + }, + { + "epoch": 1.8308466342817487, + "grad_norm": 1.1193071603775024, + "learning_rate": 1.0931063288043963e-06, + "loss": 0.6013, + "step": 10553 + }, + { + "epoch": 1.8310201249132547, + "grad_norm": 0.9027571082115173, + "learning_rate": 1.0908858086841989e-06, + "loss": 0.525, + "step": 10554 + }, + { + "epoch": 1.8311936155447606, + "grad_norm": 2.116704225540161, + "learning_rate": 1.0886674829887323e-06, + "loss": 0.558, + "step": 10555 + }, + { + "epoch": 1.8313671061762666, + "grad_norm": 3.8138468265533447, + "learning_rate": 1.0864513519754484e-06, + "loss": 0.7004, + "step": 10556 + }, + { + "epoch": 1.8315405968077725, + "grad_norm": 1.1234149932861328, + "learning_rate": 1.0842374159015167e-06, + "loss": 0.6017, + "step": 10557 + }, + { + "epoch": 1.8317140874392783, + "grad_norm": 0.9321621656417847, + "learning_rate": 1.082025675023879e-06, + "loss": 0.585, + "step": 10558 + }, + { + "epoch": 1.8318875780707842, + "grad_norm": 0.9628468751907349, + "learning_rate": 1.0798161295992004e-06, + "loss": 0.8025, + "step": 10559 + }, + { + "epoch": 1.83206106870229, + "grad_norm": 0.7707751393318176, + "learning_rate": 1.077608779883912e-06, + "loss": 0.7876, + "step": 10560 + }, + { + "epoch": 1.8322345593337959, + "grad_norm": 1.486331820487976, + "learning_rate": 1.0754036261341683e-06, + "loss": 0.6284, + "step": 10561 + }, + { + "epoch": 1.8324080499653017, + "grad_norm": 1.0722484588623047, + "learning_rate": 1.0732006686058893e-06, + "loss": 0.5544, + "step": 10562 + }, + { + "epoch": 1.8325815405968078, + "grad_norm": 1.8442302942276, + "learning_rate": 1.070999907554726e-06, + "loss": 0.6877, + "step": 10563 + }, + { + "epoch": 1.8327550312283136, + "grad_norm": 0.962524950504303, + "learning_rate": 1.0688013432360811e-06, + "loss": 0.5553, + "step": 10564 + }, + { + "epoch": 1.8329285218598197, + "grad_norm": 0.8323068022727966, + "learning_rate": 1.0666049759050945e-06, + "loss": 0.6266, + "step": 10565 + }, + { + "epoch": 1.8331020124913255, + "grad_norm": 0.8176705837249756, + "learning_rate": 1.0644108058166692e-06, + "loss": 0.7849, + "step": 10566 + }, + { + "epoch": 1.8332755031228314, + "grad_norm": 0.8641448616981506, + "learning_rate": 1.0622188332254302e-06, + "loss": 0.8408, + "step": 10567 + }, + { + "epoch": 1.8334489937543372, + "grad_norm": 0.9448774456977844, + "learning_rate": 1.0600290583857631e-06, + "loss": 0.7909, + "step": 10568 + }, + { + "epoch": 1.833622484385843, + "grad_norm": 0.9694026112556458, + "learning_rate": 1.0578414815517936e-06, + "loss": 0.6487, + "step": 10569 + }, + { + "epoch": 1.833795975017349, + "grad_norm": 1.0116236209869385, + "learning_rate": 1.0556561029773914e-06, + "loss": 0.6071, + "step": 10570 + }, + { + "epoch": 1.833969465648855, + "grad_norm": 1.0293655395507812, + "learning_rate": 1.0534729229161722e-06, + "loss": 0.6693, + "step": 10571 + }, + { + "epoch": 1.8341429562803608, + "grad_norm": 0.7944754362106323, + "learning_rate": 1.0512919416214995e-06, + "loss": 0.6847, + "step": 10572 + }, + { + "epoch": 1.834316446911867, + "grad_norm": 1.7378422021865845, + "learning_rate": 1.0491131593464755e-06, + "loss": 0.6201, + "step": 10573 + }, + { + "epoch": 1.8344899375433728, + "grad_norm": 0.9240221977233887, + "learning_rate": 1.0469365763439532e-06, + "loss": 0.5573, + "step": 10574 + }, + { + "epoch": 1.8346634281748786, + "grad_norm": 0.8766233921051025, + "learning_rate": 1.044762192866522e-06, + "loss": 0.6539, + "step": 10575 + }, + { + "epoch": 1.8348369188063844, + "grad_norm": 0.9922652840614319, + "learning_rate": 1.0425900091665286e-06, + "loss": 0.6288, + "step": 10576 + }, + { + "epoch": 1.8350104094378903, + "grad_norm": 1.0475313663482666, + "learning_rate": 1.040420025496054e-06, + "loss": 0.6901, + "step": 10577 + }, + { + "epoch": 1.8351839000693961, + "grad_norm": 0.9627009630203247, + "learning_rate": 1.0382522421069274e-06, + "loss": 0.6207, + "step": 10578 + }, + { + "epoch": 1.8353573907009022, + "grad_norm": 0.7962549328804016, + "learning_rate": 1.0360866592507236e-06, + "loss": 0.788, + "step": 10579 + }, + { + "epoch": 1.835530881332408, + "grad_norm": 0.8719924688339233, + "learning_rate": 1.033923277178759e-06, + "loss": 0.6417, + "step": 10580 + }, + { + "epoch": 1.835704371963914, + "grad_norm": 0.7588886022567749, + "learning_rate": 1.0317620961420993e-06, + "loss": 0.6641, + "step": 10581 + }, + { + "epoch": 1.83587786259542, + "grad_norm": 0.770853579044342, + "learning_rate": 1.029603116391551e-06, + "loss": 0.7761, + "step": 10582 + }, + { + "epoch": 1.8360513532269258, + "grad_norm": 1.034806728363037, + "learning_rate": 1.0274463381776646e-06, + "loss": 0.6846, + "step": 10583 + }, + { + "epoch": 1.8362248438584317, + "grad_norm": 1.0565052032470703, + "learning_rate": 1.0252917617507374e-06, + "loss": 0.6558, + "step": 10584 + }, + { + "epoch": 1.8363983344899375, + "grad_norm": 0.9282329678535461, + "learning_rate": 1.0231393873608164e-06, + "loss": 0.7554, + "step": 10585 + }, + { + "epoch": 1.8365718251214433, + "grad_norm": 0.8138357996940613, + "learning_rate": 1.020989215257675e-06, + "loss": 0.6855, + "step": 10586 + }, + { + "epoch": 1.8367453157529492, + "grad_norm": 1.3039846420288086, + "learning_rate": 1.0188412456908537e-06, + "loss": 0.7747, + "step": 10587 + }, + { + "epoch": 1.8369188063844553, + "grad_norm": 0.8565714955329895, + "learning_rate": 1.0166954789096194e-06, + "loss": 0.6097, + "step": 10588 + }, + { + "epoch": 1.837092297015961, + "grad_norm": 0.9679251313209534, + "learning_rate": 1.0145519151630023e-06, + "loss": 0.6897, + "step": 10589 + }, + { + "epoch": 1.8372657876474672, + "grad_norm": 1.0613486766815186, + "learning_rate": 1.0124105546997521e-06, + "loss": 0.5885, + "step": 10590 + }, + { + "epoch": 1.837439278278973, + "grad_norm": 0.9461278915405273, + "learning_rate": 1.0102713977683832e-06, + "loss": 0.5652, + "step": 10591 + }, + { + "epoch": 1.8376127689104789, + "grad_norm": 0.879865288734436, + "learning_rate": 1.008134444617146e-06, + "loss": 0.5603, + "step": 10592 + }, + { + "epoch": 1.8377862595419847, + "grad_norm": 0.8482068181037903, + "learning_rate": 1.0059996954940377e-06, + "loss": 0.6207, + "step": 10593 + }, + { + "epoch": 1.8379597501734906, + "grad_norm": 0.8780599236488342, + "learning_rate": 1.0038671506467934e-06, + "loss": 0.6665, + "step": 10594 + }, + { + "epoch": 1.8381332408049964, + "grad_norm": 0.9728347063064575, + "learning_rate": 1.0017368103229086e-06, + "loss": 0.6732, + "step": 10595 + }, + { + "epoch": 1.8383067314365025, + "grad_norm": 0.8536355495452881, + "learning_rate": 9.996086747695966e-07, + "loss": 0.5796, + "step": 10596 + }, + { + "epoch": 1.8384802220680083, + "grad_norm": 1.157546043395996, + "learning_rate": 9.97482744233842e-07, + "loss": 0.6266, + "step": 10597 + }, + { + "epoch": 1.8386537126995144, + "grad_norm": 1.2310494184494019, + "learning_rate": 9.953590189623563e-07, + "loss": 0.6528, + "step": 10598 + }, + { + "epoch": 1.8388272033310202, + "grad_norm": 0.9181835651397705, + "learning_rate": 9.932374992016002e-07, + "loss": 0.7766, + "step": 10599 + }, + { + "epoch": 1.839000693962526, + "grad_norm": 1.6963701248168945, + "learning_rate": 9.911181851977792e-07, + "loss": 0.7098, + "step": 10600 + }, + { + "epoch": 1.839174184594032, + "grad_norm": 0.8048732280731201, + "learning_rate": 9.890010771968428e-07, + "loss": 0.7703, + "step": 10601 + }, + { + "epoch": 1.8393476752255378, + "grad_norm": 0.7889317274093628, + "learning_rate": 9.868861754444858e-07, + "loss": 0.6508, + "step": 10602 + }, + { + "epoch": 1.8395211658570436, + "grad_norm": 2.309152126312256, + "learning_rate": 9.847734801861387e-07, + "loss": 0.6672, + "step": 10603 + }, + { + "epoch": 1.8396946564885495, + "grad_norm": 0.8772554397583008, + "learning_rate": 9.826629916669917e-07, + "loss": 0.649, + "step": 10604 + }, + { + "epoch": 1.8398681471200555, + "grad_norm": 1.0758229494094849, + "learning_rate": 9.805547101319601e-07, + "loss": 0.5942, + "step": 10605 + }, + { + "epoch": 1.8400416377515614, + "grad_norm": 0.846508264541626, + "learning_rate": 9.784486358257194e-07, + "loss": 0.6565, + "step": 10606 + }, + { + "epoch": 1.8402151283830674, + "grad_norm": 1.1726436614990234, + "learning_rate": 9.763447689926763e-07, + "loss": 0.6671, + "step": 10607 + }, + { + "epoch": 1.8403886190145733, + "grad_norm": 1.6465505361557007, + "learning_rate": 9.742431098769933e-07, + "loss": 0.7694, + "step": 10608 + }, + { + "epoch": 1.8405621096460791, + "grad_norm": 1.2208987474441528, + "learning_rate": 9.721436587225618e-07, + "loss": 0.6594, + "step": 10609 + }, + { + "epoch": 1.840735600277585, + "grad_norm": 0.6525108814239502, + "learning_rate": 9.700464157730338e-07, + "loss": 0.7668, + "step": 10610 + }, + { + "epoch": 1.8409090909090908, + "grad_norm": 0.8157848715782166, + "learning_rate": 9.679513812717945e-07, + "loss": 0.7141, + "step": 10611 + }, + { + "epoch": 1.8410825815405967, + "grad_norm": 0.8425846695899963, + "learning_rate": 9.658585554619737e-07, + "loss": 0.6805, + "step": 10612 + }, + { + "epoch": 1.8412560721721027, + "grad_norm": 1.4768023490905762, + "learning_rate": 9.637679385864417e-07, + "loss": 0.714, + "step": 10613 + }, + { + "epoch": 1.8414295628036086, + "grad_norm": 1.4486258029937744, + "learning_rate": 9.616795308878313e-07, + "loss": 0.6619, + "step": 10614 + }, + { + "epoch": 1.8416030534351147, + "grad_norm": 0.7884476184844971, + "learning_rate": 9.59593332608486e-07, + "loss": 0.7168, + "step": 10615 + }, + { + "epoch": 1.8417765440666205, + "grad_norm": 0.8170456290245056, + "learning_rate": 9.575093439905259e-07, + "loss": 0.6753, + "step": 10616 + }, + { + "epoch": 1.8419500346981263, + "grad_norm": 0.8664473295211792, + "learning_rate": 9.554275652757928e-07, + "loss": 0.5586, + "step": 10617 + }, + { + "epoch": 1.8421235253296322, + "grad_norm": 0.9840244054794312, + "learning_rate": 9.533479967058867e-07, + "loss": 0.5946, + "step": 10618 + }, + { + "epoch": 1.842297015961138, + "grad_norm": 1.1888785362243652, + "learning_rate": 9.512706385221348e-07, + "loss": 0.6036, + "step": 10619 + }, + { + "epoch": 1.8424705065926439, + "grad_norm": 1.3109122514724731, + "learning_rate": 9.491954909656242e-07, + "loss": 0.6908, + "step": 10620 + }, + { + "epoch": 1.8426439972241497, + "grad_norm": 1.2498774528503418, + "learning_rate": 9.471225542771734e-07, + "loss": 0.6327, + "step": 10621 + }, + { + "epoch": 1.8428174878556558, + "grad_norm": 0.8019169569015503, + "learning_rate": 9.450518286973542e-07, + "loss": 0.6934, + "step": 10622 + }, + { + "epoch": 1.8429909784871616, + "grad_norm": 1.173684000968933, + "learning_rate": 9.42983314466468e-07, + "loss": 0.7484, + "step": 10623 + }, + { + "epoch": 1.8431644691186677, + "grad_norm": 1.0861363410949707, + "learning_rate": 9.409170118245803e-07, + "loss": 0.5945, + "step": 10624 + }, + { + "epoch": 1.8433379597501736, + "grad_norm": 0.6695698499679565, + "learning_rate": 9.388529210114794e-07, + "loss": 0.8442, + "step": 10625 + }, + { + "epoch": 1.8435114503816794, + "grad_norm": 1.079399585723877, + "learning_rate": 9.36791042266707e-07, + "loss": 0.715, + "step": 10626 + }, + { + "epoch": 1.8436849410131853, + "grad_norm": 1.087933897972107, + "learning_rate": 9.347313758295473e-07, + "loss": 0.7815, + "step": 10627 + }, + { + "epoch": 1.843858431644691, + "grad_norm": 1.3308649063110352, + "learning_rate": 9.326739219390246e-07, + "loss": 0.6492, + "step": 10628 + }, + { + "epoch": 1.844031922276197, + "grad_norm": 1.2196619510650635, + "learning_rate": 9.306186808339146e-07, + "loss": 0.6583, + "step": 10629 + }, + { + "epoch": 1.844205412907703, + "grad_norm": 1.7175997495651245, + "learning_rate": 9.285656527527264e-07, + "loss": 0.6354, + "step": 10630 + }, + { + "epoch": 1.8443789035392089, + "grad_norm": 0.9931996464729309, + "learning_rate": 9.265148379337164e-07, + "loss": 0.5365, + "step": 10631 + }, + { + "epoch": 1.844552394170715, + "grad_norm": 1.0333009958267212, + "learning_rate": 9.244662366148826e-07, + "loss": 0.6005, + "step": 10632 + }, + { + "epoch": 1.8447258848022208, + "grad_norm": 1.2563990354537964, + "learning_rate": 9.224198490339731e-07, + "loss": 0.6763, + "step": 10633 + }, + { + "epoch": 1.8448993754337266, + "grad_norm": 1.0413137674331665, + "learning_rate": 9.203756754284665e-07, + "loss": 0.6976, + "step": 10634 + }, + { + "epoch": 1.8450728660652325, + "grad_norm": 1.287703275680542, + "learning_rate": 9.183337160355976e-07, + "loss": 0.7285, + "step": 10635 + }, + { + "epoch": 1.8452463566967383, + "grad_norm": 0.7746307849884033, + "learning_rate": 9.162939710923324e-07, + "loss": 0.7128, + "step": 10636 + }, + { + "epoch": 1.8454198473282442, + "grad_norm": 1.1535283327102661, + "learning_rate": 9.142564408353949e-07, + "loss": 0.5387, + "step": 10637 + }, + { + "epoch": 1.8455933379597502, + "grad_norm": 1.2212673425674438, + "learning_rate": 9.122211255012292e-07, + "loss": 0.603, + "step": 10638 + }, + { + "epoch": 1.845766828591256, + "grad_norm": 0.9315487146377563, + "learning_rate": 9.101880253260487e-07, + "loss": 0.6936, + "step": 10639 + }, + { + "epoch": 1.845940319222762, + "grad_norm": 0.951657772064209, + "learning_rate": 9.081571405457912e-07, + "loss": 0.6339, + "step": 10640 + }, + { + "epoch": 1.846113809854268, + "grad_norm": 1.4637097120285034, + "learning_rate": 9.061284713961416e-07, + "loss": 0.6539, + "step": 10641 + }, + { + "epoch": 1.8462873004857738, + "grad_norm": 1.1528828144073486, + "learning_rate": 9.041020181125315e-07, + "loss": 0.7389, + "step": 10642 + }, + { + "epoch": 1.8464607911172797, + "grad_norm": 0.7647483348846436, + "learning_rate": 9.020777809301396e-07, + "loss": 0.6874, + "step": 10643 + }, + { + "epoch": 1.8466342817487855, + "grad_norm": 0.9587643146514893, + "learning_rate": 9.000557600838666e-07, + "loss": 0.5923, + "step": 10644 + }, + { + "epoch": 1.8468077723802914, + "grad_norm": 3.6472904682159424, + "learning_rate": 8.980359558083828e-07, + "loss": 0.6176, + "step": 10645 + }, + { + "epoch": 1.8469812630117972, + "grad_norm": 0.8364800214767456, + "learning_rate": 8.960183683380807e-07, + "loss": 0.7352, + "step": 10646 + }, + { + "epoch": 1.8471547536433033, + "grad_norm": 0.9218534231185913, + "learning_rate": 8.940029979071152e-07, + "loss": 0.7479, + "step": 10647 + }, + { + "epoch": 1.8473282442748091, + "grad_norm": 0.9457231163978577, + "learning_rate": 8.919898447493569e-07, + "loss": 0.6235, + "step": 10648 + }, + { + "epoch": 1.8475017349063152, + "grad_norm": 0.8958785533905029, + "learning_rate": 8.899789090984457e-07, + "loss": 0.6567, + "step": 10649 + }, + { + "epoch": 1.847675225537821, + "grad_norm": 0.7873144745826721, + "learning_rate": 8.879701911877503e-07, + "loss": 0.7356, + "step": 10650 + }, + { + "epoch": 1.8478487161693269, + "grad_norm": 0.9543851017951965, + "learning_rate": 8.859636912503822e-07, + "loss": 0.6161, + "step": 10651 + }, + { + "epoch": 1.8480222068008327, + "grad_norm": 0.958545446395874, + "learning_rate": 8.839594095191995e-07, + "loss": 0.7185, + "step": 10652 + }, + { + "epoch": 1.8481956974323386, + "grad_norm": 0.9481476545333862, + "learning_rate": 8.81957346226805e-07, + "loss": 0.7164, + "step": 10653 + }, + { + "epoch": 1.8483691880638444, + "grad_norm": 0.8809033036231995, + "learning_rate": 8.799575016055373e-07, + "loss": 0.8416, + "step": 10654 + }, + { + "epoch": 1.8485426786953505, + "grad_norm": 1.0478092432022095, + "learning_rate": 8.779598758874774e-07, + "loss": 0.6034, + "step": 10655 + }, + { + "epoch": 1.8487161693268563, + "grad_norm": 0.9955368041992188, + "learning_rate": 8.7596446930446e-07, + "loss": 0.7976, + "step": 10656 + }, + { + "epoch": 1.8488896599583624, + "grad_norm": 1.403151512145996, + "learning_rate": 8.739712820880441e-07, + "loss": 0.6666, + "step": 10657 + }, + { + "epoch": 1.8490631505898683, + "grad_norm": 1.139175295829773, + "learning_rate": 8.719803144695516e-07, + "loss": 0.6039, + "step": 10658 + }, + { + "epoch": 1.849236641221374, + "grad_norm": 1.1144832372665405, + "learning_rate": 8.699915666800285e-07, + "loss": 0.7997, + "step": 10659 + }, + { + "epoch": 1.84941013185288, + "grad_norm": 1.207871913909912, + "learning_rate": 8.680050389502814e-07, + "loss": 0.6836, + "step": 10660 + }, + { + "epoch": 1.8495836224843858, + "grad_norm": 0.7715673446655273, + "learning_rate": 8.660207315108371e-07, + "loss": 0.6008, + "step": 10661 + }, + { + "epoch": 1.8497571131158916, + "grad_norm": 1.2986758947372437, + "learning_rate": 8.640386445919847e-07, + "loss": 0.5597, + "step": 10662 + }, + { + "epoch": 1.8499306037473975, + "grad_norm": 0.7569127082824707, + "learning_rate": 8.620587784237444e-07, + "loss": 0.776, + "step": 10663 + }, + { + "epoch": 1.8501040943789036, + "grad_norm": 0.9743692278862, + "learning_rate": 8.600811332358861e-07, + "loss": 0.6591, + "step": 10664 + }, + { + "epoch": 1.8502775850104094, + "grad_norm": 0.9597836136817932, + "learning_rate": 8.581057092579081e-07, + "loss": 0.6665, + "step": 10665 + }, + { + "epoch": 1.8504510756419155, + "grad_norm": 0.8362787365913391, + "learning_rate": 8.561325067190762e-07, + "loss": 0.6423, + "step": 10666 + }, + { + "epoch": 1.8506245662734213, + "grad_norm": 1.0900036096572876, + "learning_rate": 8.541615258483671e-07, + "loss": 0.6584, + "step": 10667 + }, + { + "epoch": 1.8507980569049272, + "grad_norm": 0.7911356687545776, + "learning_rate": 8.521927668745244e-07, + "loss": 0.5905, + "step": 10668 + }, + { + "epoch": 1.850971547536433, + "grad_norm": 1.6886705160140991, + "learning_rate": 8.502262300260234e-07, + "loss": 0.6411, + "step": 10669 + }, + { + "epoch": 1.8511450381679388, + "grad_norm": 0.7890623211860657, + "learning_rate": 8.482619155310812e-07, + "loss": 0.8113, + "step": 10670 + }, + { + "epoch": 1.8513185287994447, + "grad_norm": 0.953999400138855, + "learning_rate": 8.462998236176578e-07, + "loss": 0.7131, + "step": 10671 + }, + { + "epoch": 1.8514920194309508, + "grad_norm": 0.9307856559753418, + "learning_rate": 8.443399545134623e-07, + "loss": 0.6409, + "step": 10672 + }, + { + "epoch": 1.8516655100624566, + "grad_norm": 0.9512802958488464, + "learning_rate": 8.423823084459349e-07, + "loss": 0.7415, + "step": 10673 + }, + { + "epoch": 1.8518390006939627, + "grad_norm": 0.9914522171020508, + "learning_rate": 8.404268856422626e-07, + "loss": 0.5762, + "step": 10674 + }, + { + "epoch": 1.8520124913254685, + "grad_norm": 1.1741764545440674, + "learning_rate": 8.384736863293729e-07, + "loss": 0.6829, + "step": 10675 + }, + { + "epoch": 1.8521859819569744, + "grad_norm": 0.7888959050178528, + "learning_rate": 8.365227107339447e-07, + "loss": 0.8291, + "step": 10676 + }, + { + "epoch": 1.8523594725884802, + "grad_norm": 0.9752930402755737, + "learning_rate": 8.345739590823832e-07, + "loss": 0.8552, + "step": 10677 + }, + { + "epoch": 1.852532963219986, + "grad_norm": 0.9059135913848877, + "learning_rate": 8.326274316008475e-07, + "loss": 0.7412, + "step": 10678 + }, + { + "epoch": 1.852706453851492, + "grad_norm": 0.7352523803710938, + "learning_rate": 8.306831285152328e-07, + "loss": 0.7827, + "step": 10679 + }, + { + "epoch": 1.8528799444829978, + "grad_norm": 0.8872414827346802, + "learning_rate": 8.287410500511739e-07, + "loss": 0.6281, + "step": 10680 + }, + { + "epoch": 1.8530534351145038, + "grad_norm": 2.6055068969726562, + "learning_rate": 8.268011964340595e-07, + "loss": 0.7953, + "step": 10681 + }, + { + "epoch": 1.8532269257460097, + "grad_norm": 1.0666767358779907, + "learning_rate": 8.248635678890049e-07, + "loss": 0.7482, + "step": 10682 + }, + { + "epoch": 1.8534004163775157, + "grad_norm": 0.7684740424156189, + "learning_rate": 8.229281646408793e-07, + "loss": 0.6781, + "step": 10683 + }, + { + "epoch": 1.8535739070090216, + "grad_norm": 0.6014052629470825, + "learning_rate": 8.209949869142808e-07, + "loss": 0.8303, + "step": 10684 + }, + { + "epoch": 1.8537473976405274, + "grad_norm": 0.7419300079345703, + "learning_rate": 8.190640349335699e-07, + "loss": 0.6769, + "step": 10685 + }, + { + "epoch": 1.8539208882720333, + "grad_norm": 0.9170615673065186, + "learning_rate": 8.171353089228206e-07, + "loss": 0.696, + "step": 10686 + }, + { + "epoch": 1.8540943789035391, + "grad_norm": 0.9430468678474426, + "learning_rate": 8.152088091058741e-07, + "loss": 0.6522, + "step": 10687 + }, + { + "epoch": 1.854267869535045, + "grad_norm": 1.801724910736084, + "learning_rate": 8.132845357062979e-07, + "loss": 0.6799, + "step": 10688 + }, + { + "epoch": 1.854441360166551, + "grad_norm": 1.112743854522705, + "learning_rate": 8.113624889474136e-07, + "loss": 0.6492, + "step": 10689 + }, + { + "epoch": 1.8546148507980569, + "grad_norm": 1.1545261144638062, + "learning_rate": 8.094426690522672e-07, + "loss": 0.5605, + "step": 10690 + }, + { + "epoch": 1.854788341429563, + "grad_norm": 0.8276134729385376, + "learning_rate": 8.075250762436626e-07, + "loss": 0.608, + "step": 10691 + }, + { + "epoch": 1.8549618320610688, + "grad_norm": 0.8473713397979736, + "learning_rate": 8.056097107441352e-07, + "loss": 0.8135, + "step": 10692 + }, + { + "epoch": 1.8551353226925746, + "grad_norm": 0.8236359357833862, + "learning_rate": 8.036965727759693e-07, + "loss": 0.6765, + "step": 10693 + }, + { + "epoch": 1.8553088133240805, + "grad_norm": 0.8526064157485962, + "learning_rate": 8.017856625611809e-07, + "loss": 0.679, + "step": 10694 + }, + { + "epoch": 1.8554823039555863, + "grad_norm": 0.8768869042396545, + "learning_rate": 7.998769803215389e-07, + "loss": 0.5248, + "step": 10695 + }, + { + "epoch": 1.8556557945870922, + "grad_norm": 1.0279953479766846, + "learning_rate": 7.979705262785442e-07, + "loss": 0.6622, + "step": 10696 + }, + { + "epoch": 1.8558292852185982, + "grad_norm": 1.047110915184021, + "learning_rate": 7.960663006534464e-07, + "loss": 0.7957, + "step": 10697 + }, + { + "epoch": 1.856002775850104, + "grad_norm": 0.7237328290939331, + "learning_rate": 7.941643036672309e-07, + "loss": 0.7628, + "step": 10698 + }, + { + "epoch": 1.85617626648161, + "grad_norm": 0.8282189965248108, + "learning_rate": 7.922645355406256e-07, + "loss": 0.6656, + "step": 10699 + }, + { + "epoch": 1.856349757113116, + "grad_norm": 0.9780529737472534, + "learning_rate": 7.903669964941052e-07, + "loss": 0.7419, + "step": 10700 + }, + { + "epoch": 1.8565232477446219, + "grad_norm": 1.5499181747436523, + "learning_rate": 7.884716867478782e-07, + "loss": 0.6599, + "step": 10701 + }, + { + "epoch": 1.8566967383761277, + "grad_norm": 0.9494152665138245, + "learning_rate": 7.865786065218973e-07, + "loss": 0.7494, + "step": 10702 + }, + { + "epoch": 1.8568702290076335, + "grad_norm": 0.756226658821106, + "learning_rate": 7.846877560358535e-07, + "loss": 0.7487, + "step": 10703 + }, + { + "epoch": 1.8570437196391394, + "grad_norm": 1.5859893560409546, + "learning_rate": 7.827991355091891e-07, + "loss": 0.658, + "step": 10704 + }, + { + "epoch": 1.8572172102706452, + "grad_norm": 1.006774663925171, + "learning_rate": 7.809127451610776e-07, + "loss": 0.5864, + "step": 10705 + }, + { + "epoch": 1.8573907009021513, + "grad_norm": 1.1608550548553467, + "learning_rate": 7.790285852104373e-07, + "loss": 0.7314, + "step": 10706 + }, + { + "epoch": 1.8575641915336571, + "grad_norm": 1.108494758605957, + "learning_rate": 7.771466558759244e-07, + "loss": 0.578, + "step": 10707 + }, + { + "epoch": 1.8577376821651632, + "grad_norm": 0.8903293609619141, + "learning_rate": 7.752669573759464e-07, + "loss": 0.6354, + "step": 10708 + }, + { + "epoch": 1.857911172796669, + "grad_norm": 1.141116976737976, + "learning_rate": 7.733894899286332e-07, + "loss": 0.6836, + "step": 10709 + }, + { + "epoch": 1.858084663428175, + "grad_norm": 2.1929969787597656, + "learning_rate": 7.715142537518771e-07, + "loss": 0.6274, + "step": 10710 + }, + { + "epoch": 1.8582581540596808, + "grad_norm": 0.7903014421463013, + "learning_rate": 7.696412490632954e-07, + "loss": 0.6804, + "step": 10711 + }, + { + "epoch": 1.8584316446911866, + "grad_norm": 0.6310881972312927, + "learning_rate": 7.677704760802562e-07, + "loss": 0.821, + "step": 10712 + }, + { + "epoch": 1.8586051353226924, + "grad_norm": 0.9507243633270264, + "learning_rate": 7.659019350198593e-07, + "loss": 0.6433, + "step": 10713 + }, + { + "epoch": 1.8587786259541985, + "grad_norm": 0.8409708738327026, + "learning_rate": 7.640356260989601e-07, + "loss": 0.6846, + "step": 10714 + }, + { + "epoch": 1.8589521165857044, + "grad_norm": 0.7247990965843201, + "learning_rate": 7.621715495341364e-07, + "loss": 0.7194, + "step": 10715 + }, + { + "epoch": 1.8591256072172104, + "grad_norm": 1.4619330167770386, + "learning_rate": 7.603097055417242e-07, + "loss": 0.8357, + "step": 10716 + }, + { + "epoch": 1.8592990978487163, + "grad_norm": 0.8959739208221436, + "learning_rate": 7.58450094337786e-07, + "loss": 0.7308, + "step": 10717 + }, + { + "epoch": 1.8594725884802221, + "grad_norm": 0.8576433658599854, + "learning_rate": 7.565927161381403e-07, + "loss": 0.632, + "step": 10718 + }, + { + "epoch": 1.859646079111728, + "grad_norm": 0.9717245697975159, + "learning_rate": 7.547375711583282e-07, + "loss": 0.5404, + "step": 10719 + }, + { + "epoch": 1.8598195697432338, + "grad_norm": 1.8223085403442383, + "learning_rate": 7.528846596136485e-07, + "loss": 0.6522, + "step": 10720 + }, + { + "epoch": 1.8599930603747397, + "grad_norm": 0.7684388160705566, + "learning_rate": 7.510339817191314e-07, + "loss": 0.7732, + "step": 10721 + }, + { + "epoch": 1.8601665510062455, + "grad_norm": 0.7786892652511597, + "learning_rate": 7.491855376895519e-07, + "loss": 0.7168, + "step": 10722 + }, + { + "epoch": 1.8603400416377516, + "grad_norm": 0.840282142162323, + "learning_rate": 7.473393277394181e-07, + "loss": 0.5845, + "step": 10723 + }, + { + "epoch": 1.8605135322692574, + "grad_norm": 0.9064529538154602, + "learning_rate": 7.454953520829899e-07, + "loss": 0.6093, + "step": 10724 + }, + { + "epoch": 1.8606870229007635, + "grad_norm": 0.9621952772140503, + "learning_rate": 7.436536109342651e-07, + "loss": 0.7078, + "step": 10725 + }, + { + "epoch": 1.8608605135322693, + "grad_norm": 1.0373022556304932, + "learning_rate": 7.418141045069727e-07, + "loss": 0.7319, + "step": 10726 + }, + { + "epoch": 1.8610340041637752, + "grad_norm": 0.892292320728302, + "learning_rate": 7.399768330145995e-07, + "loss": 0.657, + "step": 10727 + }, + { + "epoch": 1.861207494795281, + "grad_norm": 1.14809250831604, + "learning_rate": 7.381417966703508e-07, + "loss": 0.657, + "step": 10728 + }, + { + "epoch": 1.8613809854267869, + "grad_norm": 1.070886492729187, + "learning_rate": 7.363089956871961e-07, + "loss": 0.5737, + "step": 10729 + }, + { + "epoch": 1.8615544760582927, + "grad_norm": 0.9313387870788574, + "learning_rate": 7.344784302778274e-07, + "loss": 0.5853, + "step": 10730 + }, + { + "epoch": 1.8617279666897988, + "grad_norm": 1.6781797409057617, + "learning_rate": 7.32650100654686e-07, + "loss": 0.7124, + "step": 10731 + }, + { + "epoch": 1.8619014573213046, + "grad_norm": 0.788580596446991, + "learning_rate": 7.308240070299489e-07, + "loss": 0.7207, + "step": 10732 + }, + { + "epoch": 1.8620749479528107, + "grad_norm": 0.8332198262214661, + "learning_rate": 7.290001496155418e-07, + "loss": 0.6802, + "step": 10733 + }, + { + "epoch": 1.8622484385843165, + "grad_norm": 3.348316192626953, + "learning_rate": 7.271785286231204e-07, + "loss": 0.8318, + "step": 10734 + }, + { + "epoch": 1.8624219292158224, + "grad_norm": 1.8308556079864502, + "learning_rate": 7.253591442640906e-07, + "loss": 0.7299, + "step": 10735 + }, + { + "epoch": 1.8625954198473282, + "grad_norm": 0.9705187082290649, + "learning_rate": 7.235419967495883e-07, + "loss": 0.7183, + "step": 10736 + }, + { + "epoch": 1.862768910478834, + "grad_norm": 0.924876868724823, + "learning_rate": 7.217270862905023e-07, + "loss": 0.7772, + "step": 10737 + }, + { + "epoch": 1.86294240111034, + "grad_norm": 0.9643875956535339, + "learning_rate": 7.199144130974489e-07, + "loss": 0.7574, + "step": 10738 + }, + { + "epoch": 1.8631158917418458, + "grad_norm": 0.9982740879058838, + "learning_rate": 7.181039773807952e-07, + "loss": 0.8096, + "step": 10739 + }, + { + "epoch": 1.8632893823733518, + "grad_norm": 0.8030409812927246, + "learning_rate": 7.16295779350642e-07, + "loss": 0.7197, + "step": 10740 + }, + { + "epoch": 1.8634628730048577, + "grad_norm": 1.1727063655853271, + "learning_rate": 7.144898192168348e-07, + "loss": 0.5797, + "step": 10741 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 1.1272952556610107, + "learning_rate": 7.126860971889527e-07, + "loss": 0.5927, + "step": 10742 + }, + { + "epoch": 1.8638098542678696, + "grad_norm": 0.7350848913192749, + "learning_rate": 7.108846134763258e-07, + "loss": 0.6549, + "step": 10743 + }, + { + "epoch": 1.8639833448993754, + "grad_norm": 0.8996641039848328, + "learning_rate": 7.090853682880161e-07, + "loss": 0.7195, + "step": 10744 + }, + { + "epoch": 1.8641568355308813, + "grad_norm": 1.3703992366790771, + "learning_rate": 7.072883618328275e-07, + "loss": 0.7009, + "step": 10745 + }, + { + "epoch": 1.8643303261623871, + "grad_norm": 0.8136349320411682, + "learning_rate": 7.054935943193042e-07, + "loss": 0.7429, + "step": 10746 + }, + { + "epoch": 1.864503816793893, + "grad_norm": 1.2637317180633545, + "learning_rate": 7.037010659557309e-07, + "loss": 0.6349, + "step": 10747 + }, + { + "epoch": 1.864677307425399, + "grad_norm": 0.8386378884315491, + "learning_rate": 7.019107769501366e-07, + "loss": 0.6687, + "step": 10748 + }, + { + "epoch": 1.864850798056905, + "grad_norm": 0.9934601783752441, + "learning_rate": 7.001227275102818e-07, + "loss": 0.6106, + "step": 10749 + }, + { + "epoch": 1.865024288688411, + "grad_norm": 1.1391266584396362, + "learning_rate": 6.983369178436739e-07, + "loss": 0.5767, + "step": 10750 + }, + { + "epoch": 1.8651977793199168, + "grad_norm": 0.7910225987434387, + "learning_rate": 6.96553348157556e-07, + "loss": 0.6893, + "step": 10751 + }, + { + "epoch": 1.8653712699514227, + "grad_norm": 1.1058967113494873, + "learning_rate": 6.947720186589158e-07, + "loss": 0.6587, + "step": 10752 + }, + { + "epoch": 1.8655447605829285, + "grad_norm": 0.8998697996139526, + "learning_rate": 6.92992929554479e-07, + "loss": 0.5898, + "step": 10753 + }, + { + "epoch": 1.8657182512144344, + "grad_norm": 0.9082543253898621, + "learning_rate": 6.912160810507096e-07, + "loss": 0.631, + "step": 10754 + }, + { + "epoch": 1.8658917418459402, + "grad_norm": 0.7605670094490051, + "learning_rate": 6.894414733538113e-07, + "loss": 0.6837, + "step": 10755 + }, + { + "epoch": 1.8660652324774463, + "grad_norm": 0.7542372345924377, + "learning_rate": 6.876691066697349e-07, + "loss": 0.7383, + "step": 10756 + }, + { + "epoch": 1.8662387231089521, + "grad_norm": 1.0311685800552368, + "learning_rate": 6.858989812041583e-07, + "loss": 0.6583, + "step": 10757 + }, + { + "epoch": 1.8664122137404582, + "grad_norm": 1.2186540365219116, + "learning_rate": 6.841310971625103e-07, + "loss": 0.675, + "step": 10758 + }, + { + "epoch": 1.866585704371964, + "grad_norm": 1.1449110507965088, + "learning_rate": 6.823654547499581e-07, + "loss": 0.6428, + "step": 10759 + }, + { + "epoch": 1.8667591950034699, + "grad_norm": 0.8546732068061829, + "learning_rate": 6.806020541714042e-07, + "loss": 0.7306, + "step": 10760 + }, + { + "epoch": 1.8669326856349757, + "grad_norm": 0.9521517753601074, + "learning_rate": 6.788408956314918e-07, + "loss": 0.5381, + "step": 10761 + }, + { + "epoch": 1.8671061762664816, + "grad_norm": 0.9029073119163513, + "learning_rate": 6.770819793346084e-07, + "loss": 0.5607, + "step": 10762 + }, + { + "epoch": 1.8672796668979874, + "grad_norm": 1.1404337882995605, + "learning_rate": 6.753253054848774e-07, + "loss": 0.6996, + "step": 10763 + }, + { + "epoch": 1.8674531575294933, + "grad_norm": 0.7434732913970947, + "learning_rate": 6.735708742861624e-07, + "loss": 0.8694, + "step": 10764 + }, + { + "epoch": 1.8676266481609993, + "grad_norm": 2.8169846534729004, + "learning_rate": 6.71818685942065e-07, + "loss": 0.791, + "step": 10765 + }, + { + "epoch": 1.8678001387925052, + "grad_norm": 2.6992268562316895, + "learning_rate": 6.700687406559359e-07, + "loss": 0.6863, + "step": 10766 + }, + { + "epoch": 1.8679736294240112, + "grad_norm": 2.34419846534729, + "learning_rate": 6.683210386308481e-07, + "loss": 0.6384, + "step": 10767 + }, + { + "epoch": 1.868147120055517, + "grad_norm": 0.8157147765159607, + "learning_rate": 6.665755800696305e-07, + "loss": 0.7009, + "step": 10768 + }, + { + "epoch": 1.868320610687023, + "grad_norm": 0.7687466740608215, + "learning_rate": 6.648323651748457e-07, + "loss": 0.74, + "step": 10769 + }, + { + "epoch": 1.8684941013185288, + "grad_norm": 0.7950955629348755, + "learning_rate": 6.63091394148796e-07, + "loss": 0.7083, + "step": 10770 + }, + { + "epoch": 1.8686675919500346, + "grad_norm": 0.7772963047027588, + "learning_rate": 6.6135266719352e-07, + "loss": 0.7909, + "step": 10771 + }, + { + "epoch": 1.8688410825815405, + "grad_norm": 0.8227313160896301, + "learning_rate": 6.596161845108006e-07, + "loss": 0.649, + "step": 10772 + }, + { + "epoch": 1.8690145732130465, + "grad_norm": 1.2326328754425049, + "learning_rate": 6.578819463021612e-07, + "loss": 0.6223, + "step": 10773 + }, + { + "epoch": 1.8691880638445524, + "grad_norm": 0.9304020404815674, + "learning_rate": 6.561499527688586e-07, + "loss": 0.71, + "step": 10774 + }, + { + "epoch": 1.8693615544760585, + "grad_norm": 1.1520953178405762, + "learning_rate": 6.544202041118941e-07, + "loss": 0.5605, + "step": 10775 + }, + { + "epoch": 1.8695350451075643, + "grad_norm": 3.0681960582733154, + "learning_rate": 6.526927005320072e-07, + "loss": 0.5574, + "step": 10776 + }, + { + "epoch": 1.8697085357390701, + "grad_norm": 0.9721742272377014, + "learning_rate": 6.509674422296775e-07, + "loss": 0.7021, + "step": 10777 + }, + { + "epoch": 1.869882026370576, + "grad_norm": 0.9125108122825623, + "learning_rate": 6.492444294051204e-07, + "loss": 0.6038, + "step": 10778 + }, + { + "epoch": 1.8700555170020818, + "grad_norm": 0.8161735534667969, + "learning_rate": 6.47523662258298e-07, + "loss": 0.6105, + "step": 10779 + }, + { + "epoch": 1.8702290076335877, + "grad_norm": 0.9277462959289551, + "learning_rate": 6.458051409889021e-07, + "loss": 0.7712, + "step": 10780 + }, + { + "epoch": 1.8704024982650935, + "grad_norm": 0.7397034168243408, + "learning_rate": 6.440888657963729e-07, + "loss": 0.7113, + "step": 10781 + }, + { + "epoch": 1.8705759888965996, + "grad_norm": 1.1588078737258911, + "learning_rate": 6.423748368798843e-07, + "loss": 0.8267, + "step": 10782 + }, + { + "epoch": 1.8707494795281054, + "grad_norm": 0.8607925772666931, + "learning_rate": 6.40663054438353e-07, + "loss": 0.8127, + "step": 10783 + }, + { + "epoch": 1.8709229701596115, + "grad_norm": 1.0273767709732056, + "learning_rate": 6.38953518670431e-07, + "loss": 0.743, + "step": 10784 + }, + { + "epoch": 1.8710964607911174, + "grad_norm": 0.8940081596374512, + "learning_rate": 6.372462297745152e-07, + "loss": 0.7073, + "step": 10785 + }, + { + "epoch": 1.8712699514226232, + "grad_norm": 1.084601879119873, + "learning_rate": 6.355411879487339e-07, + "loss": 0.7928, + "step": 10786 + }, + { + "epoch": 1.871443442054129, + "grad_norm": 2.011991262435913, + "learning_rate": 6.338383933909642e-07, + "loss": 0.5364, + "step": 10787 + }, + { + "epoch": 1.871616932685635, + "grad_norm": 1.3015767335891724, + "learning_rate": 6.321378462988148e-07, + "loss": 0.6542, + "step": 10788 + }, + { + "epoch": 1.8717904233171407, + "grad_norm": 0.732443630695343, + "learning_rate": 6.304395468696345e-07, + "loss": 0.6687, + "step": 10789 + }, + { + "epoch": 1.8719639139486468, + "grad_norm": 1.057572841644287, + "learning_rate": 6.287434953005145e-07, + "loss": 0.5718, + "step": 10790 + }, + { + "epoch": 1.8721374045801527, + "grad_norm": 0.8018630743026733, + "learning_rate": 6.270496917882863e-07, + "loss": 0.7651, + "step": 10791 + }, + { + "epoch": 1.8723108952116587, + "grad_norm": 0.986775815486908, + "learning_rate": 6.253581365295148e-07, + "loss": 0.7068, + "step": 10792 + }, + { + "epoch": 1.8724843858431646, + "grad_norm": 0.8563531041145325, + "learning_rate": 6.236688297205074e-07, + "loss": 0.6193, + "step": 10793 + }, + { + "epoch": 1.8726578764746704, + "grad_norm": 0.9411488771438599, + "learning_rate": 6.219817715573073e-07, + "loss": 0.6467, + "step": 10794 + }, + { + "epoch": 1.8728313671061763, + "grad_norm": 0.7573801875114441, + "learning_rate": 6.202969622357069e-07, + "loss": 0.8164, + "step": 10795 + }, + { + "epoch": 1.873004857737682, + "grad_norm": 1.85628080368042, + "learning_rate": 6.18614401951223e-07, + "loss": 0.6554, + "step": 10796 + }, + { + "epoch": 1.873178348369188, + "grad_norm": 0.9734638929367065, + "learning_rate": 6.16934090899124e-07, + "loss": 0.5338, + "step": 10797 + }, + { + "epoch": 1.8733518390006938, + "grad_norm": 1.1320650577545166, + "learning_rate": 6.152560292744091e-07, + "loss": 0.5852, + "step": 10798 + }, + { + "epoch": 1.8735253296321999, + "grad_norm": 0.9801285266876221, + "learning_rate": 6.135802172718164e-07, + "loss": 0.7231, + "step": 10799 + }, + { + "epoch": 1.8736988202637057, + "grad_norm": 0.8595348596572876, + "learning_rate": 6.119066550858321e-07, + "loss": 0.5995, + "step": 10800 + }, + { + "epoch": 1.8738723108952118, + "grad_norm": 0.884962797164917, + "learning_rate": 6.102353429106722e-07, + "loss": 0.6387, + "step": 10801 + }, + { + "epoch": 1.8740458015267176, + "grad_norm": 0.8122801780700684, + "learning_rate": 6.085662809402926e-07, + "loss": 0.7637, + "step": 10802 + }, + { + "epoch": 1.8742192921582235, + "grad_norm": 1.1002975702285767, + "learning_rate": 6.068994693683916e-07, + "loss": 0.5579, + "step": 10803 + }, + { + "epoch": 1.8743927827897293, + "grad_norm": 0.7361776828765869, + "learning_rate": 6.052349083884057e-07, + "loss": 0.6508, + "step": 10804 + }, + { + "epoch": 1.8745662734212352, + "grad_norm": 0.6972280144691467, + "learning_rate": 6.035725981935092e-07, + "loss": 0.7585, + "step": 10805 + }, + { + "epoch": 1.874739764052741, + "grad_norm": 0.7903702855110168, + "learning_rate": 6.019125389766123e-07, + "loss": 0.6305, + "step": 10806 + }, + { + "epoch": 1.874913254684247, + "grad_norm": 0.8727280497550964, + "learning_rate": 6.002547309303674e-07, + "loss": 0.5953, + "step": 10807 + }, + { + "epoch": 1.875086745315753, + "grad_norm": 1.2746225595474243, + "learning_rate": 5.985991742471698e-07, + "loss": 0.6051, + "step": 10808 + }, + { + "epoch": 1.875260235947259, + "grad_norm": 1.2355235815048218, + "learning_rate": 5.969458691191432e-07, + "loss": 0.6263, + "step": 10809 + }, + { + "epoch": 1.8754337265787648, + "grad_norm": 1.7622233629226685, + "learning_rate": 5.952948157381566e-07, + "loss": 0.7434, + "step": 10810 + }, + { + "epoch": 1.8756072172102707, + "grad_norm": 1.0801303386688232, + "learning_rate": 5.936460142958189e-07, + "loss": 0.6025, + "step": 10811 + }, + { + "epoch": 1.8757807078417765, + "grad_norm": 0.9008930921554565, + "learning_rate": 5.919994649834748e-07, + "loss": 0.7026, + "step": 10812 + }, + { + "epoch": 1.8759541984732824, + "grad_norm": 0.6479740142822266, + "learning_rate": 5.903551679922049e-07, + "loss": 0.7681, + "step": 10813 + }, + { + "epoch": 1.8761276891047882, + "grad_norm": 0.6826635003089905, + "learning_rate": 5.887131235128385e-07, + "loss": 0.7506, + "step": 10814 + }, + { + "epoch": 1.8763011797362943, + "grad_norm": 0.9330180287361145, + "learning_rate": 5.870733317359278e-07, + "loss": 0.7126, + "step": 10815 + }, + { + "epoch": 1.8764746703678001, + "grad_norm": 0.8115153312683105, + "learning_rate": 5.854357928517806e-07, + "loss": 0.6758, + "step": 10816 + }, + { + "epoch": 1.8766481609993062, + "grad_norm": 0.8537460565567017, + "learning_rate": 5.838005070504293e-07, + "loss": 0.717, + "step": 10817 + }, + { + "epoch": 1.876821651630812, + "grad_norm": 0.7774044871330261, + "learning_rate": 5.821674745216599e-07, + "loss": 0.8552, + "step": 10818 + }, + { + "epoch": 1.876995142262318, + "grad_norm": 0.9240066409111023, + "learning_rate": 5.80536695454974e-07, + "loss": 0.5858, + "step": 10819 + }, + { + "epoch": 1.8771686328938237, + "grad_norm": 0.8043524622917175, + "learning_rate": 5.789081700396381e-07, + "loss": 0.8386, + "step": 10820 + }, + { + "epoch": 1.8773421235253296, + "grad_norm": 0.8309668302536011, + "learning_rate": 5.772818984646367e-07, + "loss": 0.677, + "step": 10821 + }, + { + "epoch": 1.8775156141568354, + "grad_norm": 0.8796374201774597, + "learning_rate": 5.756578809187008e-07, + "loss": 0.6473, + "step": 10822 + }, + { + "epoch": 1.8776891047883413, + "grad_norm": 0.7569525837898254, + "learning_rate": 5.740361175903042e-07, + "loss": 0.7576, + "step": 10823 + }, + { + "epoch": 1.8778625954198473, + "grad_norm": 1.963430643081665, + "learning_rate": 5.724166086676542e-07, + "loss": 0.6389, + "step": 10824 + }, + { + "epoch": 1.8780360860513532, + "grad_norm": 0.7304547429084778, + "learning_rate": 5.707993543386914e-07, + "loss": 0.8008, + "step": 10825 + }, + { + "epoch": 1.8782095766828593, + "grad_norm": 0.9248591065406799, + "learning_rate": 5.691843547911013e-07, + "loss": 0.6326, + "step": 10826 + }, + { + "epoch": 1.878383067314365, + "grad_norm": 0.8120111227035522, + "learning_rate": 5.675716102123141e-07, + "loss": 0.5608, + "step": 10827 + }, + { + "epoch": 1.878556557945871, + "grad_norm": 0.817793071269989, + "learning_rate": 5.6596112078948e-07, + "loss": 0.7241, + "step": 10828 + }, + { + "epoch": 1.8787300485773768, + "grad_norm": 0.86298668384552, + "learning_rate": 5.643528867095049e-07, + "loss": 0.6643, + "step": 10829 + }, + { + "epoch": 1.8789035392088826, + "grad_norm": 3.1503469944000244, + "learning_rate": 5.627469081590242e-07, + "loss": 0.8562, + "step": 10830 + }, + { + "epoch": 1.8790770298403885, + "grad_norm": 1.0085705518722534, + "learning_rate": 5.611431853244132e-07, + "loss": 0.8193, + "step": 10831 + }, + { + "epoch": 1.8792505204718946, + "grad_norm": 1.013214111328125, + "learning_rate": 5.595417183917851e-07, + "loss": 0.6187, + "step": 10832 + }, + { + "epoch": 1.8794240111034004, + "grad_norm": 1.8502784967422485, + "learning_rate": 5.579425075469936e-07, + "loss": 0.6735, + "step": 10833 + }, + { + "epoch": 1.8795975017349065, + "grad_norm": 3.202806234359741, + "learning_rate": 5.563455529756301e-07, + "loss": 0.5333, + "step": 10834 + }, + { + "epoch": 1.8797709923664123, + "grad_norm": 0.9397985935211182, + "learning_rate": 5.547508548630221e-07, + "loss": 0.6353, + "step": 10835 + }, + { + "epoch": 1.8799444829979182, + "grad_norm": 0.8320425152778625, + "learning_rate": 5.531584133942325e-07, + "loss": 0.7266, + "step": 10836 + }, + { + "epoch": 1.880117973629424, + "grad_norm": 1.0364868640899658, + "learning_rate": 5.515682287540736e-07, + "loss": 0.5791, + "step": 10837 + }, + { + "epoch": 1.8802914642609299, + "grad_norm": 1.1441694498062134, + "learning_rate": 5.499803011270776e-07, + "loss": 0.6017, + "step": 10838 + }, + { + "epoch": 1.8804649548924357, + "grad_norm": 0.6509250402450562, + "learning_rate": 5.483946306975374e-07, + "loss": 0.8284, + "step": 10839 + }, + { + "epoch": 1.8806384455239415, + "grad_norm": 0.7483503222465515, + "learning_rate": 5.468112176494633e-07, + "loss": 0.8252, + "step": 10840 + }, + { + "epoch": 1.8808119361554476, + "grad_norm": 1.070119857788086, + "learning_rate": 5.452300621666151e-07, + "loss": 0.5472, + "step": 10841 + }, + { + "epoch": 1.8809854267869535, + "grad_norm": 0.7898982167243958, + "learning_rate": 5.43651164432486e-07, + "loss": 0.6777, + "step": 10842 + }, + { + "epoch": 1.8811589174184595, + "grad_norm": 0.7732219696044922, + "learning_rate": 5.420745246303116e-07, + "loss": 0.744, + "step": 10843 + }, + { + "epoch": 1.8813324080499654, + "grad_norm": 1.070477843284607, + "learning_rate": 5.405001429430634e-07, + "loss": 0.6736, + "step": 10844 + }, + { + "epoch": 1.8815058986814712, + "grad_norm": 0.9307084083557129, + "learning_rate": 5.389280195534463e-07, + "loss": 0.6573, + "step": 10845 + }, + { + "epoch": 1.881679389312977, + "grad_norm": 1.2121108770370483, + "learning_rate": 5.373581546439077e-07, + "loss": 0.5792, + "step": 10846 + }, + { + "epoch": 1.881852879944483, + "grad_norm": 0.7798859477043152, + "learning_rate": 5.357905483966375e-07, + "loss": 0.7175, + "step": 10847 + }, + { + "epoch": 1.8820263705759888, + "grad_norm": 0.9041663408279419, + "learning_rate": 5.342252009935522e-07, + "loss": 0.6646, + "step": 10848 + }, + { + "epoch": 1.8821998612074948, + "grad_norm": 0.7720593214035034, + "learning_rate": 5.326621126163157e-07, + "loss": 0.8335, + "step": 10849 + }, + { + "epoch": 1.8823733518390007, + "grad_norm": 0.6593403220176697, + "learning_rate": 5.311012834463247e-07, + "loss": 0.682, + "step": 10850 + }, + { + "epoch": 1.8825468424705067, + "grad_norm": 1.1131250858306885, + "learning_rate": 5.295427136647124e-07, + "loss": 0.5338, + "step": 10851 + }, + { + "epoch": 1.8827203331020126, + "grad_norm": 0.8466019630432129, + "learning_rate": 5.279864034523586e-07, + "loss": 0.6263, + "step": 10852 + }, + { + "epoch": 1.8828938237335184, + "grad_norm": 0.9615622758865356, + "learning_rate": 5.26432352989874e-07, + "loss": 0.6185, + "step": 10853 + }, + { + "epoch": 1.8830673143650243, + "grad_norm": 1.0241212844848633, + "learning_rate": 5.248805624576037e-07, + "loss": 0.6428, + "step": 10854 + }, + { + "epoch": 1.8832408049965301, + "grad_norm": 0.8293564319610596, + "learning_rate": 5.233310320356366e-07, + "loss": 0.7839, + "step": 10855 + }, + { + "epoch": 1.883414295628036, + "grad_norm": 0.8166621327400208, + "learning_rate": 5.217837619038002e-07, + "loss": 0.769, + "step": 10856 + }, + { + "epoch": 1.8835877862595418, + "grad_norm": 0.8767759203910828, + "learning_rate": 5.20238752241653e-07, + "loss": 0.6515, + "step": 10857 + }, + { + "epoch": 1.8837612768910479, + "grad_norm": 0.8956862688064575, + "learning_rate": 5.186960032284983e-07, + "loss": 0.6399, + "step": 10858 + }, + { + "epoch": 1.8839347675225537, + "grad_norm": 0.9802941679954529, + "learning_rate": 5.171555150433705e-07, + "loss": 0.5372, + "step": 10859 + }, + { + "epoch": 1.8841082581540598, + "grad_norm": 0.8557465672492981, + "learning_rate": 5.156172878650489e-07, + "loss": 0.7108, + "step": 10860 + }, + { + "epoch": 1.8842817487855656, + "grad_norm": 1.0377771854400635, + "learning_rate": 5.140813218720442e-07, + "loss": 0.5416, + "step": 10861 + }, + { + "epoch": 1.8844552394170715, + "grad_norm": 1.0870387554168701, + "learning_rate": 5.125476172426092e-07, + "loss": 0.561, + "step": 10862 + }, + { + "epoch": 1.8846287300485773, + "grad_norm": 0.7970936894416809, + "learning_rate": 5.110161741547281e-07, + "loss": 0.7756, + "step": 10863 + }, + { + "epoch": 1.8848022206800832, + "grad_norm": 0.9938191771507263, + "learning_rate": 5.094869927861323e-07, + "loss": 0.6688, + "step": 10864 + }, + { + "epoch": 1.884975711311589, + "grad_norm": 0.7669560313224792, + "learning_rate": 5.079600733142775e-07, + "loss": 0.6863, + "step": 10865 + }, + { + "epoch": 1.885149201943095, + "grad_norm": 1.0452409982681274, + "learning_rate": 5.064354159163754e-07, + "loss": 0.6315, + "step": 10866 + }, + { + "epoch": 1.885322692574601, + "grad_norm": 1.6373646259307861, + "learning_rate": 5.049130207693509e-07, + "loss": 0.6008, + "step": 10867 + }, + { + "epoch": 1.885496183206107, + "grad_norm": 1.0283794403076172, + "learning_rate": 5.033928880498917e-07, + "loss": 0.5769, + "step": 10868 + }, + { + "epoch": 1.8856696738376129, + "grad_norm": 0.8812340497970581, + "learning_rate": 5.018750179344034e-07, + "loss": 0.584, + "step": 10869 + }, + { + "epoch": 1.8858431644691187, + "grad_norm": 1.0787441730499268, + "learning_rate": 5.003594105990384e-07, + "loss": 0.5958, + "step": 10870 + }, + { + "epoch": 1.8860166551006246, + "grad_norm": 1.0526759624481201, + "learning_rate": 4.98846066219687e-07, + "loss": 0.5643, + "step": 10871 + }, + { + "epoch": 1.8861901457321304, + "grad_norm": 0.9646508693695068, + "learning_rate": 4.973349849719733e-07, + "loss": 0.5146, + "step": 10872 + }, + { + "epoch": 1.8863636363636362, + "grad_norm": 1.4801915884017944, + "learning_rate": 4.958261670312591e-07, + "loss": 0.6925, + "step": 10873 + }, + { + "epoch": 1.8865371269951423, + "grad_norm": 0.8416954874992371, + "learning_rate": 4.943196125726446e-07, + "loss": 0.7495, + "step": 10874 + }, + { + "epoch": 1.8867106176266482, + "grad_norm": 0.7753939628601074, + "learning_rate": 4.928153217709674e-07, + "loss": 0.7587, + "step": 10875 + }, + { + "epoch": 1.8868841082581542, + "grad_norm": 0.7284078598022461, + "learning_rate": 4.913132948008037e-07, + "loss": 0.72, + "step": 10876 + }, + { + "epoch": 1.88705759888966, + "grad_norm": 0.9561038017272949, + "learning_rate": 4.89813531836465e-07, + "loss": 0.729, + "step": 10877 + }, + { + "epoch": 1.887231089521166, + "grad_norm": 1.5776721239089966, + "learning_rate": 4.883160330519965e-07, + "loss": 0.6466, + "step": 10878 + }, + { + "epoch": 1.8874045801526718, + "grad_norm": 0.8683754205703735, + "learning_rate": 4.868207986211926e-07, + "loss": 0.6157, + "step": 10879 + }, + { + "epoch": 1.8875780707841776, + "grad_norm": 1.2575308084487915, + "learning_rate": 4.853278287175677e-07, + "loss": 0.7715, + "step": 10880 + }, + { + "epoch": 1.8877515614156835, + "grad_norm": 0.7311221957206726, + "learning_rate": 4.838371235143902e-07, + "loss": 0.7693, + "step": 10881 + }, + { + "epoch": 1.8879250520471893, + "grad_norm": 1.0677968263626099, + "learning_rate": 4.823486831846547e-07, + "loss": 0.6611, + "step": 10882 + }, + { + "epoch": 1.8880985426786954, + "grad_norm": 1.594588041305542, + "learning_rate": 4.808625079010987e-07, + "loss": 0.6096, + "step": 10883 + }, + { + "epoch": 1.8882720333102012, + "grad_norm": 0.8531147837638855, + "learning_rate": 4.793785978361887e-07, + "loss": 0.7253, + "step": 10884 + }, + { + "epoch": 1.8884455239417073, + "grad_norm": 0.9412972927093506, + "learning_rate": 4.778969531621447e-07, + "loss": 0.6462, + "step": 10885 + }, + { + "epoch": 1.8886190145732131, + "grad_norm": 1.0955055952072144, + "learning_rate": 4.764175740509025e-07, + "loss": 0.6439, + "step": 10886 + }, + { + "epoch": 1.888792505204719, + "grad_norm": 0.7871654629707336, + "learning_rate": 4.749404606741514e-07, + "loss": 0.6499, + "step": 10887 + }, + { + "epoch": 1.8889659958362248, + "grad_norm": 0.8725250363349915, + "learning_rate": 4.7346561320330997e-07, + "loss": 0.6165, + "step": 10888 + }, + { + "epoch": 1.8891394864677307, + "grad_norm": 1.1742228269577026, + "learning_rate": 4.7199303180953894e-07, + "loss": 0.6128, + "step": 10889 + }, + { + "epoch": 1.8893129770992365, + "grad_norm": 0.9872661232948303, + "learning_rate": 4.7052271666373053e-07, + "loss": 0.6853, + "step": 10890 + }, + { + "epoch": 1.8894864677307426, + "grad_norm": 0.6326870322227478, + "learning_rate": 4.6905466793651713e-07, + "loss": 0.7859, + "step": 10891 + }, + { + "epoch": 1.8896599583622484, + "grad_norm": 0.7601183652877808, + "learning_rate": 4.675888857982669e-07, + "loss": 0.7004, + "step": 10892 + }, + { + "epoch": 1.8898334489937545, + "grad_norm": 1.4384464025497437, + "learning_rate": 4.6612537041908823e-07, + "loss": 0.6455, + "step": 10893 + }, + { + "epoch": 1.8900069396252603, + "grad_norm": 0.8903255462646484, + "learning_rate": 4.646641219688186e-07, + "loss": 0.7251, + "step": 10894 + }, + { + "epoch": 1.8901804302567662, + "grad_norm": 0.9574300646781921, + "learning_rate": 4.6320514061704236e-07, + "loss": 0.7471, + "step": 10895 + }, + { + "epoch": 1.890353920888272, + "grad_norm": 1.1559239625930786, + "learning_rate": 4.617484265330752e-07, + "loss": 0.5558, + "step": 10896 + }, + { + "epoch": 1.8905274115197779, + "grad_norm": 1.1219408512115479, + "learning_rate": 4.602939798859685e-07, + "loss": 0.6598, + "step": 10897 + }, + { + "epoch": 1.8907009021512837, + "grad_norm": 1.021763801574707, + "learning_rate": 4.588418008445161e-07, + "loss": 0.6019, + "step": 10898 + }, + { + "epoch": 1.8908743927827896, + "grad_norm": 1.113675832748413, + "learning_rate": 4.573918895772389e-07, + "loss": 0.6924, + "step": 10899 + }, + { + "epoch": 1.8910478834142956, + "grad_norm": 0.9383688569068909, + "learning_rate": 4.5594424625240887e-07, + "loss": 0.7412, + "step": 10900 + }, + { + "epoch": 1.8912213740458015, + "grad_norm": 1.3499459028244019, + "learning_rate": 4.544988710380205e-07, + "loss": 0.6886, + "step": 10901 + }, + { + "epoch": 1.8913948646773076, + "grad_norm": 0.9652602672576904, + "learning_rate": 4.5305576410181293e-07, + "loss": 0.5348, + "step": 10902 + }, + { + "epoch": 1.8915683553088134, + "grad_norm": 1.1247798204421997, + "learning_rate": 4.51614925611259e-07, + "loss": 0.5177, + "step": 10903 + }, + { + "epoch": 1.8917418459403192, + "grad_norm": 0.83970707654953, + "learning_rate": 4.5017635573357366e-07, + "loss": 0.6235, + "step": 10904 + }, + { + "epoch": 1.891915336571825, + "grad_norm": 1.294745683670044, + "learning_rate": 4.4874005463570126e-07, + "loss": 0.7634, + "step": 10905 + }, + { + "epoch": 1.892088827203331, + "grad_norm": 0.7530152797698975, + "learning_rate": 4.4730602248432843e-07, + "loss": 0.8254, + "step": 10906 + }, + { + "epoch": 1.8922623178348368, + "grad_norm": 0.811376690864563, + "learning_rate": 4.4587425944587317e-07, + "loss": 0.7473, + "step": 10907 + }, + { + "epoch": 1.8924358084663429, + "grad_norm": 0.7861600518226624, + "learning_rate": 4.444447656864981e-07, + "loss": 0.6512, + "step": 10908 + }, + { + "epoch": 1.8926092990978487, + "grad_norm": 0.8738217353820801, + "learning_rate": 4.4301754137209277e-07, + "loss": 0.5875, + "step": 10909 + }, + { + "epoch": 1.8927827897293548, + "grad_norm": 0.998768150806427, + "learning_rate": 4.4159258666828907e-07, + "loss": 0.6105, + "step": 10910 + }, + { + "epoch": 1.8929562803608606, + "grad_norm": 0.9726875424385071, + "learning_rate": 4.401699017404593e-07, + "loss": 0.7322, + "step": 10911 + }, + { + "epoch": 1.8931297709923665, + "grad_norm": 1.531948208808899, + "learning_rate": 4.3874948675370233e-07, + "loss": 0.7258, + "step": 10912 + }, + { + "epoch": 1.8933032616238723, + "grad_norm": 0.8177889585494995, + "learning_rate": 4.373313418728575e-07, + "loss": 0.6979, + "step": 10913 + }, + { + "epoch": 1.8934767522553781, + "grad_norm": 0.9440672993659973, + "learning_rate": 4.3591546726250877e-07, + "loss": 0.8733, + "step": 10914 + }, + { + "epoch": 1.893650242886884, + "grad_norm": 0.9789631962776184, + "learning_rate": 4.3450186308696685e-07, + "loss": 0.6965, + "step": 10915 + }, + { + "epoch": 1.89382373351839, + "grad_norm": 1.000295639038086, + "learning_rate": 4.3309052951028275e-07, + "loss": 0.5979, + "step": 10916 + }, + { + "epoch": 1.893997224149896, + "grad_norm": 1.3241627216339111, + "learning_rate": 4.31681466696241e-07, + "loss": 0.6497, + "step": 10917 + }, + { + "epoch": 1.8941707147814018, + "grad_norm": 1.147351861000061, + "learning_rate": 4.302746748083664e-07, + "loss": 0.861, + "step": 10918 + }, + { + "epoch": 1.8943442054129078, + "grad_norm": 1.0546404123306274, + "learning_rate": 4.2887015400991937e-07, + "loss": 0.584, + "step": 10919 + }, + { + "epoch": 1.8945176960444137, + "grad_norm": 1.0436419248580933, + "learning_rate": 4.2746790446389853e-07, + "loss": 0.5985, + "step": 10920 + }, + { + "epoch": 1.8946911866759195, + "grad_norm": 0.9343953132629395, + "learning_rate": 4.260679263330314e-07, + "loss": 0.6193, + "step": 10921 + }, + { + "epoch": 1.8948646773074254, + "grad_norm": 1.5770111083984375, + "learning_rate": 4.2467021977978806e-07, + "loss": 0.6689, + "step": 10922 + }, + { + "epoch": 1.8950381679389312, + "grad_norm": 0.981598436832428, + "learning_rate": 4.2327478496637877e-07, + "loss": 0.5886, + "step": 10923 + }, + { + "epoch": 1.895211658570437, + "grad_norm": 0.9529014825820923, + "learning_rate": 4.218816220547406e-07, + "loss": 0.5833, + "step": 10924 + }, + { + "epoch": 1.8953851492019431, + "grad_norm": 0.8382962346076965, + "learning_rate": 4.2049073120655315e-07, + "loss": 0.7427, + "step": 10925 + }, + { + "epoch": 1.895558639833449, + "grad_norm": 0.8542609810829163, + "learning_rate": 4.1910211258322954e-07, + "loss": 0.6619, + "step": 10926 + }, + { + "epoch": 1.895732130464955, + "grad_norm": 1.0623103380203247, + "learning_rate": 4.1771576634592524e-07, + "loss": 0.7509, + "step": 10927 + }, + { + "epoch": 1.8959056210964609, + "grad_norm": 0.8719624280929565, + "learning_rate": 4.1633169265552274e-07, + "loss": 0.7747, + "step": 10928 + }, + { + "epoch": 1.8960791117279667, + "grad_norm": 1.3904948234558105, + "learning_rate": 4.149498916726469e-07, + "loss": 0.7158, + "step": 10929 + }, + { + "epoch": 1.8962526023594726, + "grad_norm": 0.69508957862854, + "learning_rate": 4.13570363557656e-07, + "loss": 0.7837, + "step": 10930 + }, + { + "epoch": 1.8964260929909784, + "grad_norm": 0.8742428421974182, + "learning_rate": 4.1219310847064876e-07, + "loss": 0.7316, + "step": 10931 + }, + { + "epoch": 1.8965995836224843, + "grad_norm": 1.5548062324523926, + "learning_rate": 4.108181265714528e-07, + "loss": 0.6125, + "step": 10932 + }, + { + "epoch": 1.8967730742539903, + "grad_norm": 2.6371004581451416, + "learning_rate": 4.0944541801964275e-07, + "loss": 0.6401, + "step": 10933 + }, + { + "epoch": 1.8969465648854962, + "grad_norm": 0.8974584937095642, + "learning_rate": 4.0807498297451786e-07, + "loss": 0.6661, + "step": 10934 + }, + { + "epoch": 1.8971200555170022, + "grad_norm": 0.9880460500717163, + "learning_rate": 4.06706821595122e-07, + "loss": 0.5947, + "step": 10935 + }, + { + "epoch": 1.897293546148508, + "grad_norm": 0.7794961333274841, + "learning_rate": 4.053409340402259e-07, + "loss": 0.804, + "step": 10936 + }, + { + "epoch": 1.897467036780014, + "grad_norm": 1.1431559324264526, + "learning_rate": 4.039773204683517e-07, + "loss": 0.6371, + "step": 10937 + }, + { + "epoch": 1.8976405274115198, + "grad_norm": 0.8854041695594788, + "learning_rate": 4.026159810377417e-07, + "loss": 0.6039, + "step": 10938 + }, + { + "epoch": 1.8978140180430256, + "grad_norm": 1.2040066719055176, + "learning_rate": 4.012569159063806e-07, + "loss": 0.7468, + "step": 10939 + }, + { + "epoch": 1.8979875086745315, + "grad_norm": 1.1043211221694946, + "learning_rate": 3.999001252319934e-07, + "loss": 0.5897, + "step": 10940 + }, + { + "epoch": 1.8981609993060373, + "grad_norm": 1.040286660194397, + "learning_rate": 3.9854560917203635e-07, + "loss": 0.6941, + "step": 10941 + }, + { + "epoch": 1.8983344899375434, + "grad_norm": 0.9795730710029602, + "learning_rate": 3.971933678836992e-07, + "loss": 0.627, + "step": 10942 + }, + { + "epoch": 1.8985079805690492, + "grad_norm": 0.6969188451766968, + "learning_rate": 3.958434015239143e-07, + "loss": 0.6465, + "step": 10943 + }, + { + "epoch": 1.8986814712005553, + "grad_norm": 0.860477864742279, + "learning_rate": 3.944957102493474e-07, + "loss": 0.6956, + "step": 10944 + }, + { + "epoch": 1.8988549618320612, + "grad_norm": 0.9601736068725586, + "learning_rate": 3.931502942163956e-07, + "loss": 0.7065, + "step": 10945 + }, + { + "epoch": 1.899028452463567, + "grad_norm": 1.1416350603103638, + "learning_rate": 3.918071535812007e-07, + "loss": 0.6089, + "step": 10946 + }, + { + "epoch": 1.8992019430950728, + "grad_norm": 1.2078053951263428, + "learning_rate": 3.904662884996335e-07, + "loss": 0.6538, + "step": 10947 + }, + { + "epoch": 1.8993754337265787, + "grad_norm": 1.127261757850647, + "learning_rate": 3.8912769912730297e-07, + "loss": 0.6641, + "step": 10948 + }, + { + "epoch": 1.8995489243580845, + "grad_norm": 0.8657641410827637, + "learning_rate": 3.8779138561955145e-07, + "loss": 0.6858, + "step": 10949 + }, + { + "epoch": 1.8997224149895906, + "grad_norm": 0.882881224155426, + "learning_rate": 3.864573481314682e-07, + "loss": 0.5789, + "step": 10950 + }, + { + "epoch": 1.8998959056210964, + "grad_norm": 0.6538451910018921, + "learning_rate": 3.8512558681785826e-07, + "loss": 0.8403, + "step": 10951 + }, + { + "epoch": 1.9000693962526025, + "grad_norm": 1.0199337005615234, + "learning_rate": 3.837961018332825e-07, + "loss": 0.5936, + "step": 10952 + }, + { + "epoch": 1.9002428868841084, + "grad_norm": 1.1936914920806885, + "learning_rate": 3.824688933320264e-07, + "loss": 0.6371, + "step": 10953 + }, + { + "epoch": 1.9004163775156142, + "grad_norm": 1.4901388883590698, + "learning_rate": 3.811439614681156e-07, + "loss": 0.6469, + "step": 10954 + }, + { + "epoch": 1.90058986814712, + "grad_norm": 0.7382224202156067, + "learning_rate": 3.798213063953049e-07, + "loss": 0.7565, + "step": 10955 + }, + { + "epoch": 1.900763358778626, + "grad_norm": 2.17087459564209, + "learning_rate": 3.7850092826709817e-07, + "loss": 0.7227, + "step": 10956 + }, + { + "epoch": 1.9009368494101317, + "grad_norm": 1.0187807083129883, + "learning_rate": 3.771828272367195e-07, + "loss": 0.5891, + "step": 10957 + }, + { + "epoch": 1.9011103400416376, + "grad_norm": 0.8456915616989136, + "learning_rate": 3.758670034571399e-07, + "loss": 0.6191, + "step": 10958 + }, + { + "epoch": 1.9012838306731437, + "grad_norm": 0.9001958966255188, + "learning_rate": 3.745534570810616e-07, + "loss": 0.641, + "step": 10959 + }, + { + "epoch": 1.9014573213046495, + "grad_norm": 0.8792054057121277, + "learning_rate": 3.7324218826092053e-07, + "loss": 0.7285, + "step": 10960 + }, + { + "epoch": 1.9016308119361556, + "grad_norm": 1.844840407371521, + "learning_rate": 3.7193319714889487e-07, + "loss": 0.5764, + "step": 10961 + }, + { + "epoch": 1.9018043025676614, + "grad_norm": 0.9573649764060974, + "learning_rate": 3.7062648389689204e-07, + "loss": 0.5248, + "step": 10962 + }, + { + "epoch": 1.9019777931991673, + "grad_norm": 1.146152138710022, + "learning_rate": 3.6932204865655963e-07, + "loss": 0.5748, + "step": 10963 + }, + { + "epoch": 1.9021512838306731, + "grad_norm": 1.4979342222213745, + "learning_rate": 3.680198915792765e-07, + "loss": 0.5773, + "step": 10964 + }, + { + "epoch": 1.902324774462179, + "grad_norm": 0.8519922494888306, + "learning_rate": 3.6672001281616186e-07, + "loss": 0.6688, + "step": 10965 + }, + { + "epoch": 1.9024982650936848, + "grad_norm": 1.0323671102523804, + "learning_rate": 3.654224125180661e-07, + "loss": 0.7009, + "step": 10966 + }, + { + "epoch": 1.9026717557251909, + "grad_norm": 0.8531125783920288, + "learning_rate": 3.6412709083557984e-07, + "loss": 0.5892, + "step": 10967 + }, + { + "epoch": 1.9028452463566967, + "grad_norm": 0.8627721667289734, + "learning_rate": 3.628340479190229e-07, + "loss": 0.7568, + "step": 10968 + }, + { + "epoch": 1.9030187369882028, + "grad_norm": 0.8654274940490723, + "learning_rate": 3.6154328391845963e-07, + "loss": 0.6698, + "step": 10969 + }, + { + "epoch": 1.9031922276197086, + "grad_norm": 0.9468603134155273, + "learning_rate": 3.602547989836769e-07, + "loss": 0.6121, + "step": 10970 + }, + { + "epoch": 1.9033657182512145, + "grad_norm": 0.9426034688949585, + "learning_rate": 3.5896859326421284e-07, + "loss": 0.7732, + "step": 10971 + }, + { + "epoch": 1.9035392088827203, + "grad_norm": 0.8620853424072266, + "learning_rate": 3.5768466690933036e-07, + "loss": 0.657, + "step": 10972 + }, + { + "epoch": 1.9037126995142262, + "grad_norm": 0.9982082843780518, + "learning_rate": 3.564030200680302e-07, + "loss": 0.6677, + "step": 10973 + }, + { + "epoch": 1.903886190145732, + "grad_norm": 0.8966656923294067, + "learning_rate": 3.551236528890445e-07, + "loss": 0.6957, + "step": 10974 + }, + { + "epoch": 1.904059680777238, + "grad_norm": 0.966286838054657, + "learning_rate": 3.538465655208545e-07, + "loss": 0.7859, + "step": 10975 + }, + { + "epoch": 1.904233171408744, + "grad_norm": 1.1686073541641235, + "learning_rate": 3.5257175811166166e-07, + "loss": 0.67, + "step": 10976 + }, + { + "epoch": 1.9044066620402498, + "grad_norm": 0.7935567498207092, + "learning_rate": 3.5129923080940985e-07, + "loss": 0.6622, + "step": 10977 + }, + { + "epoch": 1.9045801526717558, + "grad_norm": 1.0312000513076782, + "learning_rate": 3.500289837617765e-07, + "loss": 0.7242, + "step": 10978 + }, + { + "epoch": 1.9047536433032617, + "grad_norm": 1.259545922279358, + "learning_rate": 3.4876101711617924e-07, + "loss": 0.5566, + "step": 10979 + }, + { + "epoch": 1.9049271339347675, + "grad_norm": 0.8586915135383606, + "learning_rate": 3.474953310197604e-07, + "loss": 0.7499, + "step": 10980 + }, + { + "epoch": 1.9051006245662734, + "grad_norm": 0.8395519256591797, + "learning_rate": 3.462319256194113e-07, + "loss": 0.7676, + "step": 10981 + }, + { + "epoch": 1.9052741151977792, + "grad_norm": 0.9427518248558044, + "learning_rate": 3.4497080106174806e-07, + "loss": 0.5377, + "step": 10982 + }, + { + "epoch": 1.905447605829285, + "grad_norm": 1.2536404132843018, + "learning_rate": 3.437119574931247e-07, + "loss": 0.6343, + "step": 10983 + }, + { + "epoch": 1.9056210964607911, + "grad_norm": 2.172870635986328, + "learning_rate": 3.424553950596332e-07, + "loss": 0.6848, + "step": 10984 + }, + { + "epoch": 1.905794587092297, + "grad_norm": 1.0754740238189697, + "learning_rate": 3.412011139070992e-07, + "loss": 0.6938, + "step": 10985 + }, + { + "epoch": 1.905968077723803, + "grad_norm": 0.959452748298645, + "learning_rate": 3.3994911418108176e-07, + "loss": 0.6467, + "step": 10986 + }, + { + "epoch": 1.906141568355309, + "grad_norm": 1.1357779502868652, + "learning_rate": 3.3869939602687806e-07, + "loss": 0.6091, + "step": 10987 + }, + { + "epoch": 1.9063150589868147, + "grad_norm": 0.8384358286857605, + "learning_rate": 3.374519595895209e-07, + "loss": 0.7955, + "step": 10988 + }, + { + "epoch": 1.9064885496183206, + "grad_norm": 0.8256479501724243, + "learning_rate": 3.362068050137768e-07, + "loss": 0.7832, + "step": 10989 + }, + { + "epoch": 1.9066620402498264, + "grad_norm": 0.85877525806427, + "learning_rate": 3.3496393244414114e-07, + "loss": 0.5868, + "step": 10990 + }, + { + "epoch": 1.9068355308813323, + "grad_norm": 1.0745668411254883, + "learning_rate": 3.3372334202485867e-07, + "loss": 0.582, + "step": 10991 + }, + { + "epoch": 1.9070090215128384, + "grad_norm": 0.8363622426986694, + "learning_rate": 3.324850338998964e-07, + "loss": 0.6594, + "step": 10992 + }, + { + "epoch": 1.9071825121443442, + "grad_norm": 1.2314555644989014, + "learning_rate": 3.312490082129638e-07, + "loss": 0.7908, + "step": 10993 + }, + { + "epoch": 1.9073560027758503, + "grad_norm": 1.0820732116699219, + "learning_rate": 3.300152651075039e-07, + "loss": 0.6334, + "step": 10994 + }, + { + "epoch": 1.9075294934073561, + "grad_norm": 1.0303722620010376, + "learning_rate": 3.2878380472669116e-07, + "loss": 0.6882, + "step": 10995 + }, + { + "epoch": 1.907702984038862, + "grad_norm": 0.9623411893844604, + "learning_rate": 3.2755462721344e-07, + "loss": 0.7446, + "step": 10996 + }, + { + "epoch": 1.9078764746703678, + "grad_norm": 0.8190849423408508, + "learning_rate": 3.2632773271039644e-07, + "loss": 0.6443, + "step": 10997 + }, + { + "epoch": 1.9080499653018737, + "grad_norm": 0.9348361492156982, + "learning_rate": 3.251031213599465e-07, + "loss": 0.5774, + "step": 10998 + }, + { + "epoch": 1.9082234559333795, + "grad_norm": 0.9375532269477844, + "learning_rate": 3.2388079330420095e-07, + "loss": 0.5994, + "step": 10999 + }, + { + "epoch": 1.9083969465648853, + "grad_norm": 1.044010043144226, + "learning_rate": 3.2266074868501976e-07, + "loss": 0.6061, + "step": 11000 + }, + { + "epoch": 1.9085704371963914, + "grad_norm": 0.9697517156600952, + "learning_rate": 3.2144298764398505e-07, + "loss": 0.4883, + "step": 11001 + }, + { + "epoch": 1.9087439278278973, + "grad_norm": 1.457167148590088, + "learning_rate": 3.2022751032242396e-07, + "loss": 0.5566, + "step": 11002 + }, + { + "epoch": 1.9089174184594033, + "grad_norm": 0.8725534081459045, + "learning_rate": 3.190143168613902e-07, + "loss": 0.7051, + "step": 11003 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.611517071723938, + "learning_rate": 3.178034074016778e-07, + "loss": 0.6268, + "step": 11004 + }, + { + "epoch": 1.909264399722415, + "grad_norm": 1.225006341934204, + "learning_rate": 3.1659478208381665e-07, + "loss": 0.6934, + "step": 11005 + }, + { + "epoch": 1.9094378903539209, + "grad_norm": 1.1692110300064087, + "learning_rate": 3.1538844104806343e-07, + "loss": 0.7915, + "step": 11006 + }, + { + "epoch": 1.9096113809854267, + "grad_norm": 0.8730213642120361, + "learning_rate": 3.141843844344195e-07, + "loss": 0.6981, + "step": 11007 + }, + { + "epoch": 1.9097848716169326, + "grad_norm": 3.2300429344177246, + "learning_rate": 3.1298261238261964e-07, + "loss": 0.6505, + "step": 11008 + }, + { + "epoch": 1.9099583622484386, + "grad_norm": 1.4380955696105957, + "learning_rate": 3.1178312503212347e-07, + "loss": 0.7731, + "step": 11009 + }, + { + "epoch": 1.9101318528799445, + "grad_norm": 1.1323953866958618, + "learning_rate": 3.105859225221397e-07, + "loss": 0.6329, + "step": 11010 + }, + { + "epoch": 1.9103053435114505, + "grad_norm": 0.9264726042747498, + "learning_rate": 3.0939100499160155e-07, + "loss": 0.5792, + "step": 11011 + }, + { + "epoch": 1.9104788341429564, + "grad_norm": 1.4041554927825928, + "learning_rate": 3.0819837257918037e-07, + "loss": 0.5396, + "step": 11012 + }, + { + "epoch": 1.9106523247744622, + "grad_norm": 1.0804286003112793, + "learning_rate": 3.0700802542328325e-07, + "loss": 0.6626, + "step": 11013 + }, + { + "epoch": 1.910825815405968, + "grad_norm": 1.0097017288208008, + "learning_rate": 3.058199636620529e-07, + "loss": 0.7852, + "step": 11014 + }, + { + "epoch": 1.910999306037474, + "grad_norm": 0.8995676636695862, + "learning_rate": 3.046341874333636e-07, + "loss": 0.7773, + "step": 11015 + }, + { + "epoch": 1.9111727966689798, + "grad_norm": 0.7420588135719299, + "learning_rate": 3.034506968748274e-07, + "loss": 0.6794, + "step": 11016 + }, + { + "epoch": 1.9113462873004856, + "grad_norm": 0.9195637106895447, + "learning_rate": 3.0226949212378786e-07, + "loss": 0.7325, + "step": 11017 + }, + { + "epoch": 1.9115197779319917, + "grad_norm": 0.8079564571380615, + "learning_rate": 3.010905733173264e-07, + "loss": 0.632, + "step": 11018 + }, + { + "epoch": 1.9116932685634975, + "grad_norm": 0.9728806614875793, + "learning_rate": 2.9991394059225797e-07, + "loss": 0.6536, + "step": 11019 + }, + { + "epoch": 1.9118667591950036, + "grad_norm": 1.2566264867782593, + "learning_rate": 2.987395940851312e-07, + "loss": 0.604, + "step": 11020 + }, + { + "epoch": 1.9120402498265094, + "grad_norm": 0.8244036436080933, + "learning_rate": 2.975675339322326e-07, + "loss": 0.5702, + "step": 11021 + }, + { + "epoch": 1.9122137404580153, + "grad_norm": 0.9689913392066956, + "learning_rate": 2.9639776026957777e-07, + "loss": 0.6017, + "step": 11022 + }, + { + "epoch": 1.9123872310895211, + "grad_norm": 2.340508460998535, + "learning_rate": 2.9523027323292264e-07, + "loss": 0.7935, + "step": 11023 + }, + { + "epoch": 1.912560721721027, + "grad_norm": 1.2765916585922241, + "learning_rate": 2.9406507295775657e-07, + "loss": 0.5273, + "step": 11024 + }, + { + "epoch": 1.9127342123525328, + "grad_norm": 1.2249248027801514, + "learning_rate": 2.9290215957929804e-07, + "loss": 0.776, + "step": 11025 + }, + { + "epoch": 1.912907702984039, + "grad_norm": 0.9371734857559204, + "learning_rate": 2.91741533232508e-07, + "loss": 0.6865, + "step": 11026 + }, + { + "epoch": 1.9130811936155447, + "grad_norm": 0.7626547813415527, + "learning_rate": 2.905831940520809e-07, + "loss": 0.7383, + "step": 11027 + }, + { + "epoch": 1.9132546842470508, + "grad_norm": 1.0338404178619385, + "learning_rate": 2.894271421724359e-07, + "loss": 0.8354, + "step": 11028 + }, + { + "epoch": 1.9134281748785567, + "grad_norm": 0.8850775361061096, + "learning_rate": 2.882733777277391e-07, + "loss": 0.7417, + "step": 11029 + }, + { + "epoch": 1.9136016655100625, + "grad_norm": 2.16418194770813, + "learning_rate": 2.871219008518877e-07, + "loss": 0.576, + "step": 11030 + }, + { + "epoch": 1.9137751561415683, + "grad_norm": 1.128124475479126, + "learning_rate": 2.859727116785083e-07, + "loss": 0.6426, + "step": 11031 + }, + { + "epoch": 1.9139486467730742, + "grad_norm": 0.9847703576087952, + "learning_rate": 2.8482581034096733e-07, + "loss": 0.7903, + "step": 11032 + }, + { + "epoch": 1.91412213740458, + "grad_norm": 0.9652157425880432, + "learning_rate": 2.8368119697236297e-07, + "loss": 0.5894, + "step": 11033 + }, + { + "epoch": 1.914295628036086, + "grad_norm": 0.9112289547920227, + "learning_rate": 2.825388717055311e-07, + "loss": 0.764, + "step": 11034 + }, + { + "epoch": 1.914469118667592, + "grad_norm": 0.8851159811019897, + "learning_rate": 2.8139883467303896e-07, + "loss": 0.8269, + "step": 11035 + }, + { + "epoch": 1.9146426092990978, + "grad_norm": 0.8966286778450012, + "learning_rate": 2.8026108600718746e-07, + "loss": 0.7222, + "step": 11036 + }, + { + "epoch": 1.9148160999306039, + "grad_norm": 0.8576352596282959, + "learning_rate": 2.7912562584001766e-07, + "loss": 0.6685, + "step": 11037 + }, + { + "epoch": 1.9149895905621097, + "grad_norm": 0.7459273338317871, + "learning_rate": 2.7799245430329526e-07, + "loss": 0.8167, + "step": 11038 + }, + { + "epoch": 1.9151630811936156, + "grad_norm": 1.4609493017196655, + "learning_rate": 2.768615715285283e-07, + "loss": 0.7416, + "step": 11039 + }, + { + "epoch": 1.9153365718251214, + "grad_norm": 0.9377540349960327, + "learning_rate": 2.7573297764696085e-07, + "loss": 0.6781, + "step": 11040 + }, + { + "epoch": 1.9155100624566272, + "grad_norm": 0.8794897198677063, + "learning_rate": 2.7460667278956355e-07, + "loss": 0.6884, + "step": 11041 + }, + { + "epoch": 1.915683553088133, + "grad_norm": 1.209489345550537, + "learning_rate": 2.7348265708704745e-07, + "loss": 0.5984, + "step": 11042 + }, + { + "epoch": 1.9158570437196392, + "grad_norm": 1.3438633680343628, + "learning_rate": 2.723609306698527e-07, + "loss": 0.761, + "step": 11043 + }, + { + "epoch": 1.916030534351145, + "grad_norm": 1.2688875198364258, + "learning_rate": 2.7124149366816177e-07, + "loss": 0.7937, + "step": 11044 + }, + { + "epoch": 1.916204024982651, + "grad_norm": 0.8101294636726379, + "learning_rate": 2.701243462118819e-07, + "loss": 0.7761, + "step": 11045 + }, + { + "epoch": 1.916377515614157, + "grad_norm": 0.8130449056625366, + "learning_rate": 2.690094884306649e-07, + "loss": 0.8008, + "step": 11046 + }, + { + "epoch": 1.9165510062456628, + "grad_norm": 0.9481711387634277, + "learning_rate": 2.678969204538828e-07, + "loss": 0.8064, + "step": 11047 + }, + { + "epoch": 1.9167244968771686, + "grad_norm": 1.0104916095733643, + "learning_rate": 2.667866424106591e-07, + "loss": 0.6531, + "step": 11048 + }, + { + "epoch": 1.9168979875086745, + "grad_norm": 1.1295627355575562, + "learning_rate": 2.656786544298373e-07, + "loss": 0.7485, + "step": 11049 + }, + { + "epoch": 1.9170714781401803, + "grad_norm": 0.8118884563446045, + "learning_rate": 2.6457295664000573e-07, + "loss": 0.7761, + "step": 11050 + }, + { + "epoch": 1.9172449687716864, + "grad_norm": 1.445842981338501, + "learning_rate": 2.634695491694772e-07, + "loss": 0.6541, + "step": 11051 + }, + { + "epoch": 1.9174184594031922, + "grad_norm": 0.9168398976325989, + "learning_rate": 2.623684321463049e-07, + "loss": 0.6595, + "step": 11052 + }, + { + "epoch": 1.9175919500346983, + "grad_norm": 0.9881173372268677, + "learning_rate": 2.6126960569827554e-07, + "loss": 0.6897, + "step": 11053 + }, + { + "epoch": 1.9177654406662041, + "grad_norm": 0.8784379363059998, + "learning_rate": 2.6017306995290926e-07, + "loss": 0.678, + "step": 11054 + }, + { + "epoch": 1.91793893129771, + "grad_norm": 0.7436887621879578, + "learning_rate": 2.5907882503745764e-07, + "loss": 0.6233, + "step": 11055 + }, + { + "epoch": 1.9181124219292158, + "grad_norm": 0.8617199063301086, + "learning_rate": 2.579868710789124e-07, + "loss": 0.6055, + "step": 11056 + }, + { + "epoch": 1.9182859125607217, + "grad_norm": 0.7896479964256287, + "learning_rate": 2.5689720820399445e-07, + "loss": 0.7324, + "step": 11057 + }, + { + "epoch": 1.9184594031922275, + "grad_norm": 0.8873956799507141, + "learning_rate": 2.5580983653916035e-07, + "loss": 0.6357, + "step": 11058 + }, + { + "epoch": 1.9186328938237334, + "grad_norm": 3.167161464691162, + "learning_rate": 2.5472475621060255e-07, + "loss": 0.7805, + "step": 11059 + }, + { + "epoch": 1.9188063844552394, + "grad_norm": 0.9144197106361389, + "learning_rate": 2.5364196734424475e-07, + "loss": 0.7113, + "step": 11060 + }, + { + "epoch": 1.9189798750867453, + "grad_norm": 0.81383216381073, + "learning_rate": 2.5256147006574195e-07, + "loss": 0.7007, + "step": 11061 + }, + { + "epoch": 1.9191533657182513, + "grad_norm": 0.7398425936698914, + "learning_rate": 2.514832645004939e-07, + "loss": 0.6681, + "step": 11062 + }, + { + "epoch": 1.9193268563497572, + "grad_norm": 1.1212183237075806, + "learning_rate": 2.504073507736249e-07, + "loss": 0.6948, + "step": 11063 + }, + { + "epoch": 1.919500346981263, + "grad_norm": 1.0546241998672485, + "learning_rate": 2.493337290099973e-07, + "loss": 0.6304, + "step": 11064 + }, + { + "epoch": 1.9196738376127689, + "grad_norm": 0.9328494668006897, + "learning_rate": 2.482623993342004e-07, + "loss": 0.705, + "step": 11065 + }, + { + "epoch": 1.9198473282442747, + "grad_norm": 0.9719135761260986, + "learning_rate": 2.471933618705702e-07, + "loss": 0.5807, + "step": 11066 + }, + { + "epoch": 1.9200208188757806, + "grad_norm": 0.7931241989135742, + "learning_rate": 2.4612661674316527e-07, + "loss": 0.682, + "step": 11067 + }, + { + "epoch": 1.9201943095072866, + "grad_norm": 0.93238765001297, + "learning_rate": 2.4506216407578665e-07, + "loss": 0.7507, + "step": 11068 + }, + { + "epoch": 1.9203678001387925, + "grad_norm": 0.9136152863502502, + "learning_rate": 2.440000039919621e-07, + "loss": 0.7534, + "step": 11069 + }, + { + "epoch": 1.9205412907702986, + "grad_norm": 0.9278675317764282, + "learning_rate": 2.429401366149553e-07, + "loss": 0.7079, + "step": 11070 + }, + { + "epoch": 1.9207147814018044, + "grad_norm": 0.9395513534545898, + "learning_rate": 2.4188256206776785e-07, + "loss": 0.5988, + "step": 11071 + }, + { + "epoch": 1.9208882720333103, + "grad_norm": 0.715532124042511, + "learning_rate": 2.4082728047313487e-07, + "loss": 0.6021, + "step": 11072 + }, + { + "epoch": 1.921061762664816, + "grad_norm": 1.5835410356521606, + "learning_rate": 2.397742919535162e-07, + "loss": 0.6581, + "step": 11073 + }, + { + "epoch": 1.921235253296322, + "grad_norm": 1.1282798051834106, + "learning_rate": 2.3872359663111856e-07, + "loss": 0.5543, + "step": 11074 + }, + { + "epoch": 1.9214087439278278, + "grad_norm": 0.777309238910675, + "learning_rate": 2.3767519462787326e-07, + "loss": 0.6885, + "step": 11075 + }, + { + "epoch": 1.9215822345593336, + "grad_norm": 1.4719706773757935, + "learning_rate": 2.3662908606544964e-07, + "loss": 0.6748, + "step": 11076 + }, + { + "epoch": 1.9217557251908397, + "grad_norm": 0.9490249156951904, + "learning_rate": 2.355852710652484e-07, + "loss": 0.792, + "step": 11077 + }, + { + "epoch": 1.9219292158223455, + "grad_norm": 0.6824055910110474, + "learning_rate": 2.34543749748406e-07, + "loss": 0.7402, + "step": 11078 + }, + { + "epoch": 1.9221027064538516, + "grad_norm": 0.8596665859222412, + "learning_rate": 2.3350452223579678e-07, + "loss": 0.6025, + "step": 11079 + }, + { + "epoch": 1.9222761970853575, + "grad_norm": 0.8673238754272461, + "learning_rate": 2.3246758864801544e-07, + "loss": 0.655, + "step": 11080 + }, + { + "epoch": 1.9224496877168633, + "grad_norm": 0.8125555515289307, + "learning_rate": 2.3143294910540794e-07, + "loss": 0.6985, + "step": 11081 + }, + { + "epoch": 1.9226231783483692, + "grad_norm": 0.9505097270011902, + "learning_rate": 2.304006037280404e-07, + "loss": 0.8093, + "step": 11082 + }, + { + "epoch": 1.922796668979875, + "grad_norm": 1.4891198873519897, + "learning_rate": 2.2937055263571928e-07, + "loss": 0.8889, + "step": 11083 + }, + { + "epoch": 1.9229701596113808, + "grad_norm": 1.4610196352005005, + "learning_rate": 2.2834279594798002e-07, + "loss": 0.6904, + "step": 11084 + }, + { + "epoch": 1.923143650242887, + "grad_norm": 0.9802378416061401, + "learning_rate": 2.2731733378410058e-07, + "loss": 0.6115, + "step": 11085 + }, + { + "epoch": 1.9233171408743928, + "grad_norm": 0.8342397809028625, + "learning_rate": 2.2629416626308353e-07, + "loss": 0.6208, + "step": 11086 + }, + { + "epoch": 1.9234906315058988, + "grad_norm": 0.9816537499427795, + "learning_rate": 2.2527329350367166e-07, + "loss": 0.6094, + "step": 11087 + }, + { + "epoch": 1.9236641221374047, + "grad_norm": 1.9470279216766357, + "learning_rate": 2.2425471562433466e-07, + "loss": 0.647, + "step": 11088 + }, + { + "epoch": 1.9238376127689105, + "grad_norm": 1.360982894897461, + "learning_rate": 2.2323843274327793e-07, + "loss": 0.8019, + "step": 11089 + }, + { + "epoch": 1.9240111034004164, + "grad_norm": 1.1086188554763794, + "learning_rate": 2.222244449784494e-07, + "loss": 0.7124, + "step": 11090 + }, + { + "epoch": 1.9241845940319222, + "grad_norm": 0.8724993467330933, + "learning_rate": 2.2121275244751939e-07, + "loss": 0.549, + "step": 11091 + }, + { + "epoch": 1.924358084663428, + "grad_norm": 0.9862327575683594, + "learning_rate": 2.2020335526789616e-07, + "loss": 0.6432, + "step": 11092 + }, + { + "epoch": 1.9245315752949341, + "grad_norm": 0.8786136507987976, + "learning_rate": 2.1919625355671936e-07, + "loss": 0.7126, + "step": 11093 + }, + { + "epoch": 1.92470506592644, + "grad_norm": 0.8162989616394043, + "learning_rate": 2.1819144743086883e-07, + "loss": 0.7368, + "step": 11094 + }, + { + "epoch": 1.9248785565579458, + "grad_norm": 0.9498807787895203, + "learning_rate": 2.1718893700695132e-07, + "loss": 0.6848, + "step": 11095 + }, + { + "epoch": 1.9250520471894519, + "grad_norm": 0.8277056813240051, + "learning_rate": 2.1618872240130928e-07, + "loss": 0.756, + "step": 11096 + }, + { + "epoch": 1.9252255378209577, + "grad_norm": 1.0503231287002563, + "learning_rate": 2.1519080373001655e-07, + "loss": 0.6182, + "step": 11097 + }, + { + "epoch": 1.9253990284524636, + "grad_norm": 0.8629254698753357, + "learning_rate": 2.1419518110888938e-07, + "loss": 0.6622, + "step": 11098 + }, + { + "epoch": 1.9255725190839694, + "grad_norm": 1.0516691207885742, + "learning_rate": 2.13201854653462e-07, + "loss": 0.5643, + "step": 11099 + }, + { + "epoch": 1.9257460097154753, + "grad_norm": 0.9161678552627563, + "learning_rate": 2.1221082447901774e-07, + "loss": 0.708, + "step": 11100 + }, + { + "epoch": 1.9259195003469811, + "grad_norm": 1.163041114807129, + "learning_rate": 2.1122209070056466e-07, + "loss": 0.6189, + "step": 11101 + }, + { + "epoch": 1.9260929909784872, + "grad_norm": 0.7865609526634216, + "learning_rate": 2.1023565343284425e-07, + "loss": 0.7646, + "step": 11102 + }, + { + "epoch": 1.926266481609993, + "grad_norm": 0.8601689338684082, + "learning_rate": 2.0925151279033828e-07, + "loss": 0.7483, + "step": 11103 + }, + { + "epoch": 1.926439972241499, + "grad_norm": 1.794548749923706, + "learning_rate": 2.082696688872554e-07, + "loss": 0.6156, + "step": 11104 + }, + { + "epoch": 1.926613462873005, + "grad_norm": 1.5173101425170898, + "learning_rate": 2.0729012183753783e-07, + "loss": 0.651, + "step": 11105 + }, + { + "epoch": 1.9267869535045108, + "grad_norm": 1.0618613958358765, + "learning_rate": 2.063128717548657e-07, + "loss": 0.6854, + "step": 11106 + }, + { + "epoch": 1.9269604441360166, + "grad_norm": 0.7097671627998352, + "learning_rate": 2.0533791875264608e-07, + "loss": 0.8022, + "step": 11107 + }, + { + "epoch": 1.9271339347675225, + "grad_norm": 1.1192578077316284, + "learning_rate": 2.043652629440307e-07, + "loss": 0.6962, + "step": 11108 + }, + { + "epoch": 1.9273074253990283, + "grad_norm": 0.6101170182228088, + "learning_rate": 2.0339490444188925e-07, + "loss": 0.6777, + "step": 11109 + }, + { + "epoch": 1.9274809160305344, + "grad_norm": 0.9917431473731995, + "learning_rate": 2.0242684335884056e-07, + "loss": 0.7168, + "step": 11110 + }, + { + "epoch": 1.9276544066620402, + "grad_norm": 0.9180713891983032, + "learning_rate": 2.0146107980722362e-07, + "loss": 0.8147, + "step": 11111 + }, + { + "epoch": 1.9278278972935463, + "grad_norm": 2.303595781326294, + "learning_rate": 2.0049761389911772e-07, + "loss": 0.5562, + "step": 11112 + }, + { + "epoch": 1.9280013879250522, + "grad_norm": 0.8443737626075745, + "learning_rate": 1.9953644574633335e-07, + "loss": 0.6859, + "step": 11113 + }, + { + "epoch": 1.928174878556558, + "grad_norm": 0.7528368830680847, + "learning_rate": 1.9857757546041912e-07, + "loss": 0.7439, + "step": 11114 + }, + { + "epoch": 1.9283483691880638, + "grad_norm": 3.7704319953918457, + "learning_rate": 1.9762100315265043e-07, + "loss": 0.7488, + "step": 11115 + }, + { + "epoch": 1.9285218598195697, + "grad_norm": 0.802143394947052, + "learning_rate": 1.9666672893403627e-07, + "loss": 0.6588, + "step": 11116 + }, + { + "epoch": 1.9286953504510755, + "grad_norm": 1.4275351762771606, + "learning_rate": 1.9571475291532805e-07, + "loss": 0.835, + "step": 11117 + }, + { + "epoch": 1.9288688410825814, + "grad_norm": 1.448490023612976, + "learning_rate": 1.9476507520699518e-07, + "loss": 0.7512, + "step": 11118 + }, + { + "epoch": 1.9290423317140875, + "grad_norm": 0.7882699370384216, + "learning_rate": 1.9381769591925614e-07, + "loss": 0.7852, + "step": 11119 + }, + { + "epoch": 1.9292158223455933, + "grad_norm": 0.8560740351676941, + "learning_rate": 1.928726151620497e-07, + "loss": 0.7679, + "step": 11120 + }, + { + "epoch": 1.9293893129770994, + "grad_norm": 0.786920964717865, + "learning_rate": 1.9192983304505697e-07, + "loss": 0.6595, + "step": 11121 + }, + { + "epoch": 1.9295628036086052, + "grad_norm": 0.7542521357536316, + "learning_rate": 1.9098934967768823e-07, + "loss": 0.7037, + "step": 11122 + }, + { + "epoch": 1.929736294240111, + "grad_norm": 0.9497079849243164, + "learning_rate": 1.9005116516908729e-07, + "loss": 0.5879, + "step": 11123 + }, + { + "epoch": 1.929909784871617, + "grad_norm": 1.0145844221115112, + "learning_rate": 1.891152796281337e-07, + "loss": 0.6851, + "step": 11124 + }, + { + "epoch": 1.9300832755031228, + "grad_norm": 0.9729589223861694, + "learning_rate": 1.8818169316343393e-07, + "loss": 0.6046, + "step": 11125 + }, + { + "epoch": 1.9302567661346286, + "grad_norm": 0.7599793076515198, + "learning_rate": 1.8725040588333466e-07, + "loss": 0.7186, + "step": 11126 + }, + { + "epoch": 1.9304302567661347, + "grad_norm": 1.4682159423828125, + "learning_rate": 1.8632141789591384e-07, + "loss": 0.6501, + "step": 11127 + }, + { + "epoch": 1.9306037473976405, + "grad_norm": 1.2840523719787598, + "learning_rate": 1.853947293089764e-07, + "loss": 0.7346, + "step": 11128 + }, + { + "epoch": 1.9307772380291466, + "grad_norm": 1.6233932971954346, + "learning_rate": 1.8447034023007183e-07, + "loss": 0.7112, + "step": 11129 + }, + { + "epoch": 1.9309507286606524, + "grad_norm": 1.223954677581787, + "learning_rate": 1.8354825076647432e-07, + "loss": 0.6046, + "step": 11130 + }, + { + "epoch": 1.9311242192921583, + "grad_norm": 1.0771539211273193, + "learning_rate": 1.826284610251916e-07, + "loss": 0.6353, + "step": 11131 + }, + { + "epoch": 1.9312977099236641, + "grad_norm": 0.9348137378692627, + "learning_rate": 1.817109711129672e-07, + "loss": 0.5588, + "step": 11132 + }, + { + "epoch": 1.93147120055517, + "grad_norm": 1.181039810180664, + "learning_rate": 1.8079578113627815e-07, + "loss": 0.6486, + "step": 11133 + }, + { + "epoch": 1.9316446911866758, + "grad_norm": 0.9667040705680847, + "learning_rate": 1.7988289120133507e-07, + "loss": 0.7037, + "step": 11134 + }, + { + "epoch": 1.9318181818181817, + "grad_norm": 1.107541799545288, + "learning_rate": 1.789723014140754e-07, + "loss": 0.6442, + "step": 11135 + }, + { + "epoch": 1.9319916724496877, + "grad_norm": 2.510324239730835, + "learning_rate": 1.7806401188017463e-07, + "loss": 0.7791, + "step": 11136 + }, + { + "epoch": 1.9321651630811936, + "grad_norm": 0.7930907607078552, + "learning_rate": 1.77158022705044e-07, + "loss": 0.7754, + "step": 11137 + }, + { + "epoch": 1.9323386537126996, + "grad_norm": 0.7807673215866089, + "learning_rate": 1.7625433399382386e-07, + "loss": 0.707, + "step": 11138 + }, + { + "epoch": 1.9325121443442055, + "grad_norm": 0.91954106092453, + "learning_rate": 1.7535294585138808e-07, + "loss": 0.6914, + "step": 11139 + }, + { + "epoch": 1.9326856349757113, + "grad_norm": 1.0975775718688965, + "learning_rate": 1.7445385838234185e-07, + "loss": 0.5597, + "step": 11140 + }, + { + "epoch": 1.9328591256072172, + "grad_norm": 0.7718478441238403, + "learning_rate": 1.735570716910262e-07, + "loss": 0.6415, + "step": 11141 + }, + { + "epoch": 1.933032616238723, + "grad_norm": 0.8518308997154236, + "learning_rate": 1.7266258588151562e-07, + "loss": 0.6957, + "step": 11142 + }, + { + "epoch": 1.9332061068702289, + "grad_norm": 1.22980535030365, + "learning_rate": 1.71770401057616e-07, + "loss": 0.8315, + "step": 11143 + }, + { + "epoch": 1.933379597501735, + "grad_norm": 0.7239780426025391, + "learning_rate": 1.7088051732286448e-07, + "loss": 0.7809, + "step": 11144 + }, + { + "epoch": 1.9335530881332408, + "grad_norm": 1.619469404220581, + "learning_rate": 1.6999293478053404e-07, + "loss": 0.6377, + "step": 11145 + }, + { + "epoch": 1.9337265787647469, + "grad_norm": 0.868916392326355, + "learning_rate": 1.6910765353363334e-07, + "loss": 0.6954, + "step": 11146 + }, + { + "epoch": 1.9339000693962527, + "grad_norm": 1.0519222021102905, + "learning_rate": 1.682246736848936e-07, + "loss": 0.5508, + "step": 11147 + }, + { + "epoch": 1.9340735600277585, + "grad_norm": 1.2899383306503296, + "learning_rate": 1.6734399533679057e-07, + "loss": 0.653, + "step": 11148 + }, + { + "epoch": 1.9342470506592644, + "grad_norm": 0.833842933177948, + "learning_rate": 1.6646561859152476e-07, + "loss": 0.7167, + "step": 11149 + }, + { + "epoch": 1.9344205412907702, + "grad_norm": 0.8812260031700134, + "learning_rate": 1.6558954355103686e-07, + "loss": 0.718, + "step": 11150 + }, + { + "epoch": 1.934594031922276, + "grad_norm": 1.038547396659851, + "learning_rate": 1.6471577031699214e-07, + "loss": 0.5969, + "step": 11151 + }, + { + "epoch": 1.9347675225537821, + "grad_norm": 0.7969525456428528, + "learning_rate": 1.6384429899079624e-07, + "loss": 0.7312, + "step": 11152 + }, + { + "epoch": 1.934941013185288, + "grad_norm": 1.0421605110168457, + "learning_rate": 1.6297512967358374e-07, + "loss": 0.7723, + "step": 11153 + }, + { + "epoch": 1.9351145038167938, + "grad_norm": 1.0239026546478271, + "learning_rate": 1.6210826246622068e-07, + "loss": 0.7056, + "step": 11154 + }, + { + "epoch": 1.9352879944483, + "grad_norm": 1.9983247518539429, + "learning_rate": 1.6124369746931102e-07, + "loss": 0.7899, + "step": 11155 + }, + { + "epoch": 1.9354614850798058, + "grad_norm": 1.0580968856811523, + "learning_rate": 1.603814347831856e-07, + "loss": 0.627, + "step": 11156 + }, + { + "epoch": 1.9356349757113116, + "grad_norm": 0.8218840956687927, + "learning_rate": 1.595214745079132e-07, + "loss": 0.7439, + "step": 11157 + }, + { + "epoch": 1.9358084663428174, + "grad_norm": 0.6810859441757202, + "learning_rate": 1.586638167432919e-07, + "loss": 0.724, + "step": 11158 + }, + { + "epoch": 1.9359819569743233, + "grad_norm": 0.9835218191146851, + "learning_rate": 1.5780846158885533e-07, + "loss": 0.7521, + "step": 11159 + }, + { + "epoch": 1.9361554476058291, + "grad_norm": 0.8291221261024475, + "learning_rate": 1.5695540914386632e-07, + "loss": 0.6947, + "step": 11160 + }, + { + "epoch": 1.9363289382373352, + "grad_norm": 1.284244179725647, + "learning_rate": 1.5610465950732569e-07, + "loss": 0.6171, + "step": 11161 + }, + { + "epoch": 1.936502428868841, + "grad_norm": 1.080061674118042, + "learning_rate": 1.552562127779611e-07, + "loss": 0.5841, + "step": 11162 + }, + { + "epoch": 1.9366759195003471, + "grad_norm": 1.0596975088119507, + "learning_rate": 1.5441006905423605e-07, + "loss": 0.6672, + "step": 11163 + }, + { + "epoch": 1.936849410131853, + "grad_norm": 0.8039401769638062, + "learning_rate": 1.5356622843434533e-07, + "loss": 0.7393, + "step": 11164 + }, + { + "epoch": 1.9370229007633588, + "grad_norm": 0.8469125032424927, + "learning_rate": 1.527246910162239e-07, + "loss": 0.597, + "step": 11165 + }, + { + "epoch": 1.9371963913948647, + "grad_norm": 1.5944223403930664, + "learning_rate": 1.51885456897527e-07, + "loss": 0.7551, + "step": 11166 + }, + { + "epoch": 1.9373698820263705, + "grad_norm": 0.6963577270507812, + "learning_rate": 1.5104852617565004e-07, + "loss": 0.8083, + "step": 11167 + }, + { + "epoch": 1.9375433726578764, + "grad_norm": 1.0410035848617554, + "learning_rate": 1.5021389894771753e-07, + "loss": 0.6553, + "step": 11168 + }, + { + "epoch": 1.9377168632893824, + "grad_norm": 0.7285270094871521, + "learning_rate": 1.493815753105965e-07, + "loss": 0.5656, + "step": 11169 + }, + { + "epoch": 1.9378903539208883, + "grad_norm": 1.108302354812622, + "learning_rate": 1.4855155536087184e-07, + "loss": 0.6453, + "step": 11170 + }, + { + "epoch": 1.9380638445523943, + "grad_norm": 1.2297085523605347, + "learning_rate": 1.47723839194871e-07, + "loss": 0.6825, + "step": 11171 + }, + { + "epoch": 1.9382373351839002, + "grad_norm": 0.7989052534103394, + "learning_rate": 1.4689842690865042e-07, + "loss": 0.6311, + "step": 11172 + }, + { + "epoch": 1.938410825815406, + "grad_norm": 0.9498565793037415, + "learning_rate": 1.4607531859800238e-07, + "loss": 0.7368, + "step": 11173 + }, + { + "epoch": 1.9385843164469119, + "grad_norm": 0.9489760398864746, + "learning_rate": 1.4525451435844608e-07, + "loss": 0.6495, + "step": 11174 + }, + { + "epoch": 1.9387578070784177, + "grad_norm": 1.1148666143417358, + "learning_rate": 1.444360142852408e-07, + "loss": 0.6464, + "step": 11175 + }, + { + "epoch": 1.9389312977099236, + "grad_norm": 1.739755392074585, + "learning_rate": 1.436198184733706e-07, + "loss": 0.6847, + "step": 11176 + }, + { + "epoch": 1.9391047883414294, + "grad_norm": 0.9644340872764587, + "learning_rate": 1.428059270175597e-07, + "loss": 0.726, + "step": 11177 + }, + { + "epoch": 1.9392782789729355, + "grad_norm": 1.0201078653335571, + "learning_rate": 1.4199434001225697e-07, + "loss": 0.6694, + "step": 11178 + }, + { + "epoch": 1.9394517696044413, + "grad_norm": 1.1699432134628296, + "learning_rate": 1.411850575516538e-07, + "loss": 0.6238, + "step": 11179 + }, + { + "epoch": 1.9396252602359474, + "grad_norm": 0.7325180768966675, + "learning_rate": 1.4037807972966167e-07, + "loss": 0.7754, + "step": 11180 + }, + { + "epoch": 1.9397987508674532, + "grad_norm": 1.2203792333602905, + "learning_rate": 1.3957340663993458e-07, + "loss": 0.6293, + "step": 11181 + }, + { + "epoch": 1.939972241498959, + "grad_norm": 0.9048212766647339, + "learning_rate": 1.387710383758556e-07, + "loss": 0.5074, + "step": 11182 + }, + { + "epoch": 1.940145732130465, + "grad_norm": 0.7406665682792664, + "learning_rate": 1.3797097503054136e-07, + "loss": 0.7185, + "step": 11183 + }, + { + "epoch": 1.9403192227619708, + "grad_norm": 0.9255716800689697, + "learning_rate": 1.3717321669683981e-07, + "loss": 0.7817, + "step": 11184 + }, + { + "epoch": 1.9404927133934766, + "grad_norm": 0.721554696559906, + "learning_rate": 1.3637776346733022e-07, + "loss": 0.7662, + "step": 11185 + }, + { + "epoch": 1.9406662040249827, + "grad_norm": 1.081559181213379, + "learning_rate": 1.3558461543432767e-07, + "loss": 0.6091, + "step": 11186 + }, + { + "epoch": 1.9408396946564885, + "grad_norm": 1.0165269374847412, + "learning_rate": 1.3479377268987626e-07, + "loss": 0.7432, + "step": 11187 + }, + { + "epoch": 1.9410131852879946, + "grad_norm": 1.8331948518753052, + "learning_rate": 1.3400523532575592e-07, + "loss": 0.5695, + "step": 11188 + }, + { + "epoch": 1.9411866759195004, + "grad_norm": 0.6886436939239502, + "learning_rate": 1.332190034334757e-07, + "loss": 0.6847, + "step": 11189 + }, + { + "epoch": 1.9413601665510063, + "grad_norm": 0.6659141778945923, + "learning_rate": 1.324350771042804e-07, + "loss": 0.7883, + "step": 11190 + }, + { + "epoch": 1.9415336571825121, + "grad_norm": 0.9886119365692139, + "learning_rate": 1.3165345642914385e-07, + "loss": 0.6042, + "step": 11191 + }, + { + "epoch": 1.941707147814018, + "grad_norm": 0.7069924473762512, + "learning_rate": 1.3087414149877574e-07, + "loss": 0.7629, + "step": 11192 + }, + { + "epoch": 1.9418806384455238, + "grad_norm": 0.7455651164054871, + "learning_rate": 1.3009713240361488e-07, + "loss": 0.7214, + "step": 11193 + }, + { + "epoch": 1.9420541290770297, + "grad_norm": 1.1055548191070557, + "learning_rate": 1.2932242923383575e-07, + "loss": 0.6638, + "step": 11194 + }, + { + "epoch": 1.9422276197085357, + "grad_norm": 1.4240350723266602, + "learning_rate": 1.2855003207934203e-07, + "loss": 0.6143, + "step": 11195 + }, + { + "epoch": 1.9424011103400416, + "grad_norm": 1.2531490325927734, + "learning_rate": 1.277799410297731e-07, + "loss": 0.6206, + "step": 11196 + }, + { + "epoch": 1.9425746009715477, + "grad_norm": 0.8796167373657227, + "learning_rate": 1.2701215617449526e-07, + "loss": 0.6304, + "step": 11197 + }, + { + "epoch": 1.9427480916030535, + "grad_norm": 1.7107962369918823, + "learning_rate": 1.26246677602615e-07, + "loss": 0.6416, + "step": 11198 + }, + { + "epoch": 1.9429215822345594, + "grad_norm": 0.8338062167167664, + "learning_rate": 1.2548350540296573e-07, + "loss": 0.731, + "step": 11199 + }, + { + "epoch": 1.9430950728660652, + "grad_norm": 0.8706353902816772, + "learning_rate": 1.2472263966411214e-07, + "loss": 0.6102, + "step": 11200 + }, + { + "epoch": 1.943268563497571, + "grad_norm": 1.0365300178527832, + "learning_rate": 1.2396408047435694e-07, + "loss": 0.6516, + "step": 11201 + }, + { + "epoch": 1.943442054129077, + "grad_norm": 1.2207627296447754, + "learning_rate": 1.2320782792173192e-07, + "loss": 0.8098, + "step": 11202 + }, + { + "epoch": 1.943615544760583, + "grad_norm": 1.0090152025222778, + "learning_rate": 1.224538820939958e-07, + "loss": 0.6572, + "step": 11203 + }, + { + "epoch": 1.9437890353920888, + "grad_norm": 0.7623633742332458, + "learning_rate": 1.2170224307865185e-07, + "loss": 0.6649, + "step": 11204 + }, + { + "epoch": 1.9439625260235949, + "grad_norm": 0.8145825862884521, + "learning_rate": 1.2095291096292373e-07, + "loss": 0.8257, + "step": 11205 + }, + { + "epoch": 1.9441360166551007, + "grad_norm": 0.8712314963340759, + "learning_rate": 1.2020588583377513e-07, + "loss": 0.748, + "step": 11206 + }, + { + "epoch": 1.9443095072866066, + "grad_norm": 1.11342453956604, + "learning_rate": 1.1946116777789673e-07, + "loss": 0.5863, + "step": 11207 + }, + { + "epoch": 1.9444829979181124, + "grad_norm": 1.1164273023605347, + "learning_rate": 1.187187568817172e-07, + "loss": 0.6689, + "step": 11208 + }, + { + "epoch": 1.9446564885496183, + "grad_norm": 1.1173028945922852, + "learning_rate": 1.179786532313898e-07, + "loss": 0.656, + "step": 11209 + }, + { + "epoch": 1.944829979181124, + "grad_norm": 1.021632432937622, + "learning_rate": 1.1724085691280806e-07, + "loss": 0.8855, + "step": 11210 + }, + { + "epoch": 1.9450034698126302, + "grad_norm": 1.991043210029602, + "learning_rate": 1.165053680115924e-07, + "loss": 0.5905, + "step": 11211 + }, + { + "epoch": 1.945176960444136, + "grad_norm": 0.8805654644966125, + "learning_rate": 1.1577218661309896e-07, + "loss": 0.6954, + "step": 11212 + }, + { + "epoch": 1.9453504510756419, + "grad_norm": 0.8361454010009766, + "learning_rate": 1.1504131280241083e-07, + "loss": 0.6621, + "step": 11213 + }, + { + "epoch": 1.945523941707148, + "grad_norm": 0.7691875100135803, + "learning_rate": 1.1431274666435121e-07, + "loss": 0.6924, + "step": 11214 + }, + { + "epoch": 1.9456974323386538, + "grad_norm": 0.6810308694839478, + "learning_rate": 1.1358648828346808e-07, + "loss": 0.8594, + "step": 11215 + }, + { + "epoch": 1.9458709229701596, + "grad_norm": 0.8661203384399414, + "learning_rate": 1.1286253774404288e-07, + "loss": 0.5894, + "step": 11216 + }, + { + "epoch": 1.9460444136016655, + "grad_norm": 0.9714053869247437, + "learning_rate": 1.121408951300973e-07, + "loss": 0.7366, + "step": 11217 + }, + { + "epoch": 1.9462179042331713, + "grad_norm": 0.8016998767852783, + "learning_rate": 1.11421560525371e-07, + "loss": 0.636, + "step": 11218 + }, + { + "epoch": 1.9463913948646772, + "grad_norm": 0.8388639092445374, + "learning_rate": 1.1070453401335058e-07, + "loss": 0.566, + "step": 11219 + }, + { + "epoch": 1.9465648854961832, + "grad_norm": 0.9131860136985779, + "learning_rate": 1.0998981567724276e-07, + "loss": 0.6514, + "step": 11220 + }, + { + "epoch": 1.946738376127689, + "grad_norm": 1.1757864952087402, + "learning_rate": 1.0927740559999455e-07, + "loss": 0.6708, + "step": 11221 + }, + { + "epoch": 1.9469118667591951, + "grad_norm": 0.7103977799415588, + "learning_rate": 1.0856730386427983e-07, + "loss": 0.7543, + "step": 11222 + }, + { + "epoch": 1.947085357390701, + "grad_norm": 1.2431491613388062, + "learning_rate": 1.078595105525082e-07, + "loss": 0.5839, + "step": 11223 + }, + { + "epoch": 1.9472588480222068, + "grad_norm": 0.8695160150527954, + "learning_rate": 1.0715402574681843e-07, + "loss": 0.5591, + "step": 11224 + }, + { + "epoch": 1.9474323386537127, + "grad_norm": 0.776345431804657, + "learning_rate": 1.0645084952908502e-07, + "loss": 0.7067, + "step": 11225 + }, + { + "epoch": 1.9476058292852185, + "grad_norm": 1.8331953287124634, + "learning_rate": 1.0574998198090935e-07, + "loss": 0.6772, + "step": 11226 + }, + { + "epoch": 1.9477793199167244, + "grad_norm": 1.2191216945648193, + "learning_rate": 1.0505142318363082e-07, + "loss": 0.6355, + "step": 11227 + }, + { + "epoch": 1.9479528105482304, + "grad_norm": 0.8790909051895142, + "learning_rate": 1.0435517321831568e-07, + "loss": 0.7322, + "step": 11228 + }, + { + "epoch": 1.9481263011797363, + "grad_norm": 1.5109652280807495, + "learning_rate": 1.0366123216576817e-07, + "loss": 0.7336, + "step": 11229 + }, + { + "epoch": 1.9482997918112424, + "grad_norm": 1.0164812803268433, + "learning_rate": 1.0296960010651725e-07, + "loss": 0.6251, + "step": 11230 + }, + { + "epoch": 1.9484732824427482, + "grad_norm": 0.8398014307022095, + "learning_rate": 1.0228027712082755e-07, + "loss": 0.7296, + "step": 11231 + }, + { + "epoch": 1.948646773074254, + "grad_norm": 0.8803063035011292, + "learning_rate": 1.0159326328869734e-07, + "loss": 0.6644, + "step": 11232 + }, + { + "epoch": 1.94882026370576, + "grad_norm": 1.2171355485916138, + "learning_rate": 1.0090855868985616e-07, + "loss": 0.7153, + "step": 11233 + }, + { + "epoch": 1.9489937543372657, + "grad_norm": 0.8672196865081787, + "learning_rate": 1.0022616340376489e-07, + "loss": 0.6653, + "step": 11234 + }, + { + "epoch": 1.9491672449687716, + "grad_norm": 0.8575815558433533, + "learning_rate": 9.954607750961353e-08, + "loss": 0.6376, + "step": 11235 + }, + { + "epoch": 1.9493407356002774, + "grad_norm": 0.7083195447921753, + "learning_rate": 9.886830108632784e-08, + "loss": 0.709, + "step": 11236 + }, + { + "epoch": 1.9495142262317835, + "grad_norm": 1.2237932682037354, + "learning_rate": 9.819283421256709e-08, + "loss": 0.7091, + "step": 11237 + }, + { + "epoch": 1.9496877168632893, + "grad_norm": 0.8088474273681641, + "learning_rate": 9.751967696671749e-08, + "loss": 0.7374, + "step": 11238 + }, + { + "epoch": 1.9498612074947954, + "grad_norm": 1.086775302886963, + "learning_rate": 9.684882942690099e-08, + "loss": 0.6313, + "step": 11239 + }, + { + "epoch": 1.9500346981263013, + "grad_norm": 1.3992745876312256, + "learning_rate": 9.618029167096865e-08, + "loss": 0.6959, + "step": 11240 + }, + { + "epoch": 1.950208188757807, + "grad_norm": 0.8882582187652588, + "learning_rate": 9.551406377650507e-08, + "loss": 0.6803, + "step": 11241 + }, + { + "epoch": 1.950381679389313, + "grad_norm": 0.8257946968078613, + "learning_rate": 9.485014582083063e-08, + "loss": 0.7067, + "step": 11242 + }, + { + "epoch": 1.9505551700208188, + "grad_norm": 1.2682875394821167, + "learning_rate": 9.418853788098814e-08, + "loss": 0.6073, + "step": 11243 + }, + { + "epoch": 1.9507286606523246, + "grad_norm": 1.2994440793991089, + "learning_rate": 9.352924003376285e-08, + "loss": 0.7103, + "step": 11244 + }, + { + "epoch": 1.9509021512838307, + "grad_norm": 0.7726919054985046, + "learning_rate": 9.287225235566244e-08, + "loss": 0.7229, + "step": 11245 + }, + { + "epoch": 1.9510756419153366, + "grad_norm": 0.9362271428108215, + "learning_rate": 9.221757492293704e-08, + "loss": 0.6688, + "step": 11246 + }, + { + "epoch": 1.9512491325468426, + "grad_norm": 0.939733624458313, + "learning_rate": 9.156520781155698e-08, + "loss": 0.6503, + "step": 11247 + }, + { + "epoch": 1.9514226231783485, + "grad_norm": 0.9286201000213623, + "learning_rate": 9.091515109723281e-08, + "loss": 0.5752, + "step": 11248 + }, + { + "epoch": 1.9515961138098543, + "grad_norm": 0.7024868726730347, + "learning_rate": 9.026740485540197e-08, + "loss": 0.6621, + "step": 11249 + }, + { + "epoch": 1.9517696044413602, + "grad_norm": 0.7902873754501343, + "learning_rate": 8.96219691612421e-08, + "loss": 0.7668, + "step": 11250 + }, + { + "epoch": 1.951943095072866, + "grad_norm": 1.0690768957138062, + "learning_rate": 8.897884408964885e-08, + "loss": 0.6459, + "step": 11251 + }, + { + "epoch": 1.9521165857043719, + "grad_norm": 1.4634710550308228, + "learning_rate": 8.833802971526472e-08, + "loss": 0.6306, + "step": 11252 + }, + { + "epoch": 1.9522900763358777, + "grad_norm": 0.9824376106262207, + "learning_rate": 8.769952611245248e-08, + "loss": 0.7585, + "step": 11253 + }, + { + "epoch": 1.9524635669673838, + "grad_norm": 0.7912077307701111, + "learning_rate": 8.706333335531503e-08, + "loss": 0.8572, + "step": 11254 + }, + { + "epoch": 1.9526370575988896, + "grad_norm": 2.212162733078003, + "learning_rate": 8.642945151767779e-08, + "loss": 0.6017, + "step": 11255 + }, + { + "epoch": 1.9528105482303957, + "grad_norm": 0.8447719812393188, + "learning_rate": 8.579788067310858e-08, + "loss": 0.5905, + "step": 11256 + }, + { + "epoch": 1.9529840388619015, + "grad_norm": 0.9966633319854736, + "learning_rate": 8.516862089489986e-08, + "loss": 0.6852, + "step": 11257 + }, + { + "epoch": 1.9531575294934074, + "grad_norm": 1.1559315919876099, + "learning_rate": 8.454167225607768e-08, + "loss": 0.6171, + "step": 11258 + }, + { + "epoch": 1.9533310201249132, + "grad_norm": 0.9916059374809265, + "learning_rate": 8.391703482939939e-08, + "loss": 0.5824, + "step": 11259 + }, + { + "epoch": 1.953504510756419, + "grad_norm": 1.3142231702804565, + "learning_rate": 8.32947086873559e-08, + "loss": 0.6084, + "step": 11260 + }, + { + "epoch": 1.953678001387925, + "grad_norm": 1.3651838302612305, + "learning_rate": 8.267469390217164e-08, + "loss": 0.6067, + "step": 11261 + }, + { + "epoch": 1.953851492019431, + "grad_norm": 0.8117287158966064, + "learning_rate": 8.205699054579575e-08, + "loss": 0.6774, + "step": 11262 + }, + { + "epoch": 1.9540249826509368, + "grad_norm": 2.479529857635498, + "learning_rate": 8.14415986899153e-08, + "loss": 0.6183, + "step": 11263 + }, + { + "epoch": 1.954198473282443, + "grad_norm": 1.1411106586456299, + "learning_rate": 8.082851840594652e-08, + "loss": 0.7378, + "step": 11264 + }, + { + "epoch": 1.9543719639139487, + "grad_norm": 0.7436021566390991, + "learning_rate": 8.021774976503915e-08, + "loss": 0.7255, + "step": 11265 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 0.6883374452590942, + "learning_rate": 7.960929283807429e-08, + "loss": 0.7272, + "step": 11266 + }, + { + "epoch": 1.9547189451769604, + "grad_norm": 0.7161055207252502, + "learning_rate": 7.900314769566208e-08, + "loss": 0.6864, + "step": 11267 + }, + { + "epoch": 1.9548924358084663, + "grad_norm": 0.9513726830482483, + "learning_rate": 7.83993144081463e-08, + "loss": 0.5706, + "step": 11268 + }, + { + "epoch": 1.9550659264399721, + "grad_norm": 1.172609567642212, + "learning_rate": 7.77977930456042e-08, + "loss": 0.5615, + "step": 11269 + }, + { + "epoch": 1.9552394170714782, + "grad_norm": 1.2623965740203857, + "learning_rate": 7.719858367784216e-08, + "loss": 0.5723, + "step": 11270 + }, + { + "epoch": 1.955412907702984, + "grad_norm": 0.6728619337081909, + "learning_rate": 7.66016863743979e-08, + "loss": 0.7561, + "step": 11271 + }, + { + "epoch": 1.9555863983344899, + "grad_norm": 1.0984541177749634, + "learning_rate": 7.600710120454491e-08, + "loss": 0.7136, + "step": 11272 + }, + { + "epoch": 1.955759888965996, + "grad_norm": 0.7620282173156738, + "learning_rate": 7.541482823728352e-08, + "loss": 0.6317, + "step": 11273 + }, + { + "epoch": 1.9559333795975018, + "grad_norm": 1.1569702625274658, + "learning_rate": 7.482486754134765e-08, + "loss": 0.7903, + "step": 11274 + }, + { + "epoch": 1.9561068702290076, + "grad_norm": 0.9147993326187134, + "learning_rate": 7.423721918520477e-08, + "loss": 0.6836, + "step": 11275 + }, + { + "epoch": 1.9562803608605135, + "grad_norm": 1.073434829711914, + "learning_rate": 7.365188323704919e-08, + "loss": 0.7725, + "step": 11276 + }, + { + "epoch": 1.9564538514920193, + "grad_norm": 0.957434356212616, + "learning_rate": 7.306885976481104e-08, + "loss": 0.7283, + "step": 11277 + }, + { + "epoch": 1.9566273421235252, + "grad_norm": 2.1675796508789062, + "learning_rate": 7.248814883615174e-08, + "loss": 0.5673, + "step": 11278 + }, + { + "epoch": 1.9568008327550312, + "grad_norm": 0.9567041993141174, + "learning_rate": 7.190975051846406e-08, + "loss": 0.665, + "step": 11279 + }, + { + "epoch": 1.956974323386537, + "grad_norm": 0.8986241817474365, + "learning_rate": 7.133366487886762e-08, + "loss": 0.7793, + "step": 11280 + }, + { + "epoch": 1.9571478140180432, + "grad_norm": 0.8499613404273987, + "learning_rate": 7.07598919842245e-08, + "loss": 0.7054, + "step": 11281 + }, + { + "epoch": 1.957321304649549, + "grad_norm": 0.9827045798301697, + "learning_rate": 7.018843190111479e-08, + "loss": 0.6329, + "step": 11282 + }, + { + "epoch": 1.9574947952810549, + "grad_norm": 0.9815980792045593, + "learning_rate": 6.961928469586321e-08, + "loss": 0.6714, + "step": 11283 + }, + { + "epoch": 1.9576682859125607, + "grad_norm": 1.0520272254943848, + "learning_rate": 6.90524504345147e-08, + "loss": 0.6737, + "step": 11284 + }, + { + "epoch": 1.9578417765440665, + "grad_norm": 0.9505751729011536, + "learning_rate": 6.848792918285663e-08, + "loss": 0.6782, + "step": 11285 + }, + { + "epoch": 1.9580152671755724, + "grad_norm": 1.1373779773712158, + "learning_rate": 6.792572100639661e-08, + "loss": 0.6416, + "step": 11286 + }, + { + "epoch": 1.9581887578070785, + "grad_norm": 1.3145954608917236, + "learning_rate": 6.736582597038243e-08, + "loss": 0.6344, + "step": 11287 + }, + { + "epoch": 1.9583622484385843, + "grad_norm": 0.913699746131897, + "learning_rate": 6.680824413979103e-08, + "loss": 0.6481, + "step": 11288 + }, + { + "epoch": 1.9585357390700904, + "grad_norm": 1.2434921264648438, + "learning_rate": 6.625297557932842e-08, + "loss": 0.6719, + "step": 11289 + }, + { + "epoch": 1.9587092297015962, + "grad_norm": 0.9850438237190247, + "learning_rate": 6.570002035343636e-08, + "loss": 0.6821, + "step": 11290 + }, + { + "epoch": 1.958882720333102, + "grad_norm": 1.023745059967041, + "learning_rate": 6.514937852628578e-08, + "loss": 0.6533, + "step": 11291 + }, + { + "epoch": 1.959056210964608, + "grad_norm": 1.0044406652450562, + "learning_rate": 6.460105016177887e-08, + "loss": 0.6755, + "step": 11292 + }, + { + "epoch": 1.9592297015961138, + "grad_norm": 0.9554905891418457, + "learning_rate": 6.405503532354695e-08, + "loss": 0.6729, + "step": 11293 + }, + { + "epoch": 1.9594031922276196, + "grad_norm": 0.9639984369277954, + "learning_rate": 6.351133407495936e-08, + "loss": 0.699, + "step": 11294 + }, + { + "epoch": 1.9595766828591255, + "grad_norm": 1.7195582389831543, + "learning_rate": 6.296994647911448e-08, + "loss": 0.6875, + "step": 11295 + }, + { + "epoch": 1.9597501734906315, + "grad_norm": 0.9284408688545227, + "learning_rate": 6.24308725988354e-08, + "loss": 0.7258, + "step": 11296 + }, + { + "epoch": 1.9599236641221374, + "grad_norm": 0.8969119787216187, + "learning_rate": 6.189411249668542e-08, + "loss": 0.5901, + "step": 11297 + }, + { + "epoch": 1.9600971547536434, + "grad_norm": 0.7135695219039917, + "learning_rate": 6.135966623495915e-08, + "loss": 0.6881, + "step": 11298 + }, + { + "epoch": 1.9602706453851493, + "grad_norm": 1.0164573192596436, + "learning_rate": 6.082753387567364e-08, + "loss": 0.5673, + "step": 11299 + }, + { + "epoch": 1.9604441360166551, + "grad_norm": 0.8014167547225952, + "learning_rate": 6.029771548058838e-08, + "loss": 0.692, + "step": 11300 + }, + { + "epoch": 1.960617626648161, + "grad_norm": 1.0487821102142334, + "learning_rate": 5.977021111118752e-08, + "loss": 0.5955, + "step": 11301 + }, + { + "epoch": 1.9607911172796668, + "grad_norm": 0.8324846625328064, + "learning_rate": 5.924502082868655e-08, + "loss": 0.6877, + "step": 11302 + }, + { + "epoch": 1.9609646079111727, + "grad_norm": 0.7180544137954712, + "learning_rate": 5.872214469403892e-08, + "loss": 0.7422, + "step": 11303 + }, + { + "epoch": 1.9611380985426787, + "grad_norm": 1.1027112007141113, + "learning_rate": 5.820158276792054e-08, + "loss": 0.5848, + "step": 11304 + }, + { + "epoch": 1.9613115891741846, + "grad_norm": 1.6352711915969849, + "learning_rate": 5.768333511074753e-08, + "loss": 0.5974, + "step": 11305 + }, + { + "epoch": 1.9614850798056906, + "grad_norm": 1.3029991388320923, + "learning_rate": 5.716740178266067e-08, + "loss": 0.7067, + "step": 11306 + }, + { + "epoch": 1.9616585704371965, + "grad_norm": 1.5053205490112305, + "learning_rate": 5.665378284353207e-08, + "loss": 0.8193, + "step": 11307 + }, + { + "epoch": 1.9618320610687023, + "grad_norm": 0.834446132183075, + "learning_rate": 5.614247835297404e-08, + "loss": 0.8611, + "step": 11308 + }, + { + "epoch": 1.9620055517002082, + "grad_norm": 0.7423368096351624, + "learning_rate": 5.563348837031912e-08, + "loss": 0.6506, + "step": 11309 + }, + { + "epoch": 1.962179042331714, + "grad_norm": 0.8255917429924011, + "learning_rate": 5.512681295463784e-08, + "loss": 0.6355, + "step": 11310 + }, + { + "epoch": 1.9623525329632199, + "grad_norm": 0.8343886733055115, + "learning_rate": 5.462245216472983e-08, + "loss": 0.6831, + "step": 11311 + }, + { + "epoch": 1.9625260235947257, + "grad_norm": 1.000819444656372, + "learning_rate": 5.4120406059128274e-08, + "loss": 0.7207, + "step": 11312 + }, + { + "epoch": 1.9626995142262318, + "grad_norm": 0.9747291207313538, + "learning_rate": 5.362067469609322e-08, + "loss": 0.718, + "step": 11313 + }, + { + "epoch": 1.9628730048577376, + "grad_norm": 1.47141695022583, + "learning_rate": 5.312325813362274e-08, + "loss": 0.7861, + "step": 11314 + }, + { + "epoch": 1.9630464954892437, + "grad_norm": 0.9573979377746582, + "learning_rate": 5.262815642943953e-08, + "loss": 0.6274, + "step": 11315 + }, + { + "epoch": 1.9632199861207495, + "grad_norm": 0.8436204195022583, + "learning_rate": 5.21353696410043e-08, + "loss": 0.7554, + "step": 11316 + }, + { + "epoch": 1.9633934767522554, + "grad_norm": 0.6817724108695984, + "learning_rate": 5.1644897825502416e-08, + "loss": 0.6689, + "step": 11317 + }, + { + "epoch": 1.9635669673837612, + "grad_norm": 1.078385829925537, + "learning_rate": 5.11567410398528e-08, + "loss": 0.5382, + "step": 11318 + }, + { + "epoch": 1.963740458015267, + "grad_norm": 0.754548192024231, + "learning_rate": 5.0670899340710124e-08, + "loss": 0.6769, + "step": 11319 + }, + { + "epoch": 1.963913948646773, + "grad_norm": 1.2282471656799316, + "learning_rate": 5.0187372784453734e-08, + "loss": 0.6238, + "step": 11320 + }, + { + "epoch": 1.964087439278279, + "grad_norm": 0.8329740762710571, + "learning_rate": 4.970616142720097e-08, + "loss": 0.823, + "step": 11321 + }, + { + "epoch": 1.9642609299097848, + "grad_norm": 1.3771153688430786, + "learning_rate": 4.922726532479383e-08, + "loss": 0.5656, + "step": 11322 + }, + { + "epoch": 1.964434420541291, + "grad_norm": 0.8418456315994263, + "learning_rate": 4.8750684532810064e-08, + "loss": 0.5967, + "step": 11323 + }, + { + "epoch": 1.9646079111727968, + "grad_norm": 3.574465036392212, + "learning_rate": 4.827641910655656e-08, + "loss": 0.7687, + "step": 11324 + }, + { + "epoch": 1.9647814018043026, + "grad_norm": 1.018480658531189, + "learning_rate": 4.7804469101073725e-08, + "loss": 0.5878, + "step": 11325 + }, + { + "epoch": 1.9649548924358085, + "grad_norm": 0.8388617038726807, + "learning_rate": 4.7334834571128866e-08, + "loss": 0.7628, + "step": 11326 + }, + { + "epoch": 1.9651283830673143, + "grad_norm": 1.01212477684021, + "learning_rate": 4.6867515571229485e-08, + "loss": 0.6376, + "step": 11327 + }, + { + "epoch": 1.9653018736988201, + "grad_norm": 1.2740428447723389, + "learning_rate": 4.640251215560332e-08, + "loss": 0.6547, + "step": 11328 + }, + { + "epoch": 1.9654753643303262, + "grad_norm": 0.7480271458625793, + "learning_rate": 4.593982437821609e-08, + "loss": 0.7333, + "step": 11329 + }, + { + "epoch": 1.965648854961832, + "grad_norm": 1.0318852663040161, + "learning_rate": 4.547945229276263e-08, + "loss": 0.5514, + "step": 11330 + }, + { + "epoch": 1.965822345593338, + "grad_norm": 0.8829219937324524, + "learning_rate": 4.5021395952671297e-08, + "loss": 0.642, + "step": 11331 + }, + { + "epoch": 1.965995836224844, + "grad_norm": 0.8558671474456787, + "learning_rate": 4.456565541109958e-08, + "loss": 0.6871, + "step": 11332 + }, + { + "epoch": 1.9661693268563498, + "grad_norm": 1.050255298614502, + "learning_rate": 4.411223072093629e-08, + "loss": 0.6051, + "step": 11333 + }, + { + "epoch": 1.9663428174878557, + "grad_norm": 1.0147488117218018, + "learning_rate": 4.366112193480154e-08, + "loss": 0.546, + "step": 11334 + }, + { + "epoch": 1.9665163081193615, + "grad_norm": 0.9437805414199829, + "learning_rate": 4.32123291050468e-08, + "loss": 0.5714, + "step": 11335 + }, + { + "epoch": 1.9666897987508674, + "grad_norm": 0.8739786744117737, + "learning_rate": 4.276585228375485e-08, + "loss": 0.6576, + "step": 11336 + }, + { + "epoch": 1.9668632893823732, + "grad_norm": 1.122430443763733, + "learning_rate": 4.2321691522742013e-08, + "loss": 0.6216, + "step": 11337 + }, + { + "epoch": 1.9670367800138793, + "grad_norm": 1.3906128406524658, + "learning_rate": 4.187984687355151e-08, + "loss": 0.5596, + "step": 11338 + }, + { + "epoch": 1.9672102706453851, + "grad_norm": 0.8969520330429077, + "learning_rate": 4.14403183874601e-08, + "loss": 0.5095, + "step": 11339 + }, + { + "epoch": 1.9673837612768912, + "grad_norm": 1.2116142511367798, + "learning_rate": 4.100310611547809e-08, + "loss": 0.5585, + "step": 11340 + }, + { + "epoch": 1.967557251908397, + "grad_norm": 0.777353823184967, + "learning_rate": 4.056821010834045e-08, + "loss": 0.7456, + "step": 11341 + }, + { + "epoch": 1.9677307425399029, + "grad_norm": 1.5847409963607788, + "learning_rate": 4.013563041652013e-08, + "loss": 0.7158, + "step": 11342 + }, + { + "epoch": 1.9679042331714087, + "grad_norm": 1.3121426105499268, + "learning_rate": 3.9705367090216995e-08, + "loss": 0.7001, + "step": 11343 + }, + { + "epoch": 1.9680777238029146, + "grad_norm": 1.7231873273849487, + "learning_rate": 3.927742017936664e-08, + "loss": 0.6554, + "step": 11344 + }, + { + "epoch": 1.9682512144344204, + "grad_norm": 0.7484825849533081, + "learning_rate": 3.885178973362713e-08, + "loss": 0.671, + "step": 11345 + }, + { + "epoch": 1.9684247050659265, + "grad_norm": 1.2277356386184692, + "learning_rate": 3.8428475802398944e-08, + "loss": 0.6069, + "step": 11346 + }, + { + "epoch": 1.9685981956974323, + "grad_norm": 1.0496498346328735, + "learning_rate": 3.800747843480501e-08, + "loss": 0.5338, + "step": 11347 + }, + { + "epoch": 1.9687716863289384, + "grad_norm": 0.8257871270179749, + "learning_rate": 3.7588797679706245e-08, + "loss": 0.6868, + "step": 11348 + }, + { + "epoch": 1.9689451769604442, + "grad_norm": 0.7789069414138794, + "learning_rate": 3.717243358568379e-08, + "loss": 0.8079, + "step": 11349 + }, + { + "epoch": 1.96911866759195, + "grad_norm": 0.7678925395011902, + "learning_rate": 3.6758386201065645e-08, + "loss": 0.7223, + "step": 11350 + }, + { + "epoch": 1.969292158223456, + "grad_norm": 1.290128469467163, + "learning_rate": 3.6346655573897823e-08, + "loss": 0.7217, + "step": 11351 + }, + { + "epoch": 1.9694656488549618, + "grad_norm": 0.9772946834564209, + "learning_rate": 3.5937241751962115e-08, + "loss": 0.5679, + "step": 11352 + }, + { + "epoch": 1.9696391394864676, + "grad_norm": 0.7821170687675476, + "learning_rate": 3.5530144782771616e-08, + "loss": 0.6323, + "step": 11353 + }, + { + "epoch": 1.9698126301179735, + "grad_norm": 0.7325802445411682, + "learning_rate": 3.5125364713572976e-08, + "loss": 0.7295, + "step": 11354 + }, + { + "epoch": 1.9699861207494795, + "grad_norm": 0.9872525930404663, + "learning_rate": 3.472290159133751e-08, + "loss": 0.7611, + "step": 11355 + }, + { + "epoch": 1.9701596113809854, + "grad_norm": 0.9516803026199341, + "learning_rate": 3.4322755462774525e-08, + "loss": 0.5924, + "step": 11356 + }, + { + "epoch": 1.9703331020124915, + "grad_norm": 1.0397332906723022, + "learning_rate": 3.392492637432021e-08, + "loss": 0.6793, + "step": 11357 + }, + { + "epoch": 1.9705065926439973, + "grad_norm": 1.0926662683486938, + "learning_rate": 3.3529414372142074e-08, + "loss": 0.7019, + "step": 11358 + }, + { + "epoch": 1.9706800832755031, + "grad_norm": 0.7211560606956482, + "learning_rate": 3.3136219502143406e-08, + "loss": 0.7556, + "step": 11359 + }, + { + "epoch": 1.970853573907009, + "grad_norm": 0.7700601816177368, + "learning_rate": 3.2745341809949923e-08, + "loss": 0.6802, + "step": 11360 + }, + { + "epoch": 1.9710270645385148, + "grad_norm": 0.963165819644928, + "learning_rate": 3.235678134092757e-08, + "loss": 0.7305, + "step": 11361 + }, + { + "epoch": 1.9712005551700207, + "grad_norm": 0.8139641880989075, + "learning_rate": 3.1970538140166927e-08, + "loss": 0.6893, + "step": 11362 + }, + { + "epoch": 1.9713740458015268, + "grad_norm": 0.763871431350708, + "learning_rate": 3.158661225249437e-08, + "loss": 0.682, + "step": 11363 + }, + { + "epoch": 1.9715475364330326, + "grad_norm": 1.0033416748046875, + "learning_rate": 3.1205003722460935e-08, + "loss": 0.728, + "step": 11364 + }, + { + "epoch": 1.9717210270645387, + "grad_norm": 1.1802377700805664, + "learning_rate": 3.082571259435563e-08, + "loss": 0.7349, + "step": 11365 + }, + { + "epoch": 1.9718945176960445, + "grad_norm": 1.4222465753555298, + "learning_rate": 3.0448738912196574e-08, + "loss": 0.6456, + "step": 11366 + }, + { + "epoch": 1.9720680083275504, + "grad_norm": 0.9753279089927673, + "learning_rate": 3.007408271972878e-08, + "loss": 0.5549, + "step": 11367 + }, + { + "epoch": 1.9722414989590562, + "grad_norm": 1.0662072896957397, + "learning_rate": 2.9701744060435246e-08, + "loss": 0.6083, + "step": 11368 + }, + { + "epoch": 1.972414989590562, + "grad_norm": 0.8894070386886597, + "learning_rate": 2.9331722977523625e-08, + "loss": 0.679, + "step": 11369 + }, + { + "epoch": 1.972588480222068, + "grad_norm": 1.247391700744629, + "learning_rate": 2.8964019513935126e-08, + "loss": 0.6399, + "step": 11370 + }, + { + "epoch": 1.9727619708535737, + "grad_norm": 0.9878469109535217, + "learning_rate": 2.8598633712342283e-08, + "loss": 0.6636, + "step": 11371 + }, + { + "epoch": 1.9729354614850798, + "grad_norm": 1.034587025642395, + "learning_rate": 2.8235565615151172e-08, + "loss": 0.6797, + "step": 11372 + }, + { + "epoch": 1.9731089521165857, + "grad_norm": 0.7358211874961853, + "learning_rate": 2.787481526449476e-08, + "loss": 0.7452, + "step": 11373 + }, + { + "epoch": 1.9732824427480917, + "grad_norm": 1.347483515739441, + "learning_rate": 2.7516382702235112e-08, + "loss": 0.6229, + "step": 11374 + }, + { + "epoch": 1.9734559333795976, + "grad_norm": 1.394002914428711, + "learning_rate": 2.7160267969974508e-08, + "loss": 0.7986, + "step": 11375 + }, + { + "epoch": 1.9736294240111034, + "grad_norm": 1.035143256187439, + "learning_rate": 2.6806471109037668e-08, + "loss": 0.7303, + "step": 11376 + }, + { + "epoch": 1.9738029146426093, + "grad_norm": 1.003517985343933, + "learning_rate": 2.645499216048286e-08, + "loss": 0.7976, + "step": 11377 + }, + { + "epoch": 1.973976405274115, + "grad_norm": 2.255449056625366, + "learning_rate": 2.6105831165099683e-08, + "loss": 0.7068, + "step": 11378 + }, + { + "epoch": 1.974149895905621, + "grad_norm": 1.3118568658828735, + "learning_rate": 2.575898816340905e-08, + "loss": 0.6475, + "step": 11379 + }, + { + "epoch": 1.974323386537127, + "grad_norm": 0.8316441178321838, + "learning_rate": 2.541446319566321e-08, + "loss": 0.6311, + "step": 11380 + }, + { + "epoch": 1.9744968771686329, + "grad_norm": 0.752608597278595, + "learning_rate": 2.5072256301843513e-08, + "loss": 0.705, + "step": 11381 + }, + { + "epoch": 1.974670367800139, + "grad_norm": 0.9498623013496399, + "learning_rate": 2.473236752166264e-08, + "loss": 0.7842, + "step": 11382 + }, + { + "epoch": 1.9748438584316448, + "grad_norm": 0.7753711938858032, + "learning_rate": 2.4394796894566807e-08, + "loss": 0.6886, + "step": 11383 + }, + { + "epoch": 1.9750173490631506, + "grad_norm": 0.8349775075912476, + "learning_rate": 2.4059544459731356e-08, + "loss": 0.712, + "step": 11384 + }, + { + "epoch": 1.9751908396946565, + "grad_norm": 0.9283580780029297, + "learning_rate": 2.3726610256062933e-08, + "loss": 0.6791, + "step": 11385 + }, + { + "epoch": 1.9753643303261623, + "grad_norm": 1.271332025527954, + "learning_rate": 2.3395994322199522e-08, + "loss": 0.6543, + "step": 11386 + }, + { + "epoch": 1.9755378209576682, + "grad_norm": 0.7398504018783569, + "learning_rate": 2.3067696696505992e-08, + "loss": 0.6531, + "step": 11387 + }, + { + "epoch": 1.9757113115891742, + "grad_norm": 0.9361887574195862, + "learning_rate": 2.2741717417085196e-08, + "loss": 0.7676, + "step": 11388 + }, + { + "epoch": 1.97588480222068, + "grad_norm": 0.9758285880088806, + "learning_rate": 2.2418056521764653e-08, + "loss": 0.7585, + "step": 11389 + }, + { + "epoch": 1.976058292852186, + "grad_norm": 0.9545868039131165, + "learning_rate": 2.2096714048109867e-08, + "loss": 0.6528, + "step": 11390 + }, + { + "epoch": 1.976231783483692, + "grad_norm": 0.9781880974769592, + "learning_rate": 2.1777690033408795e-08, + "loss": 0.5704, + "step": 11391 + }, + { + "epoch": 1.9764052741151978, + "grad_norm": 0.9435639381408691, + "learning_rate": 2.1460984514685145e-08, + "loss": 0.6993, + "step": 11392 + }, + { + "epoch": 1.9765787647467037, + "grad_norm": 1.7647173404693604, + "learning_rate": 2.114659752869619e-08, + "loss": 0.6951, + "step": 11393 + }, + { + "epoch": 1.9767522553782095, + "grad_norm": 1.0819681882858276, + "learning_rate": 2.083452911192163e-08, + "loss": 0.6433, + "step": 11394 + }, + { + "epoch": 1.9769257460097154, + "grad_norm": 1.1406724452972412, + "learning_rate": 2.0524779300581386e-08, + "loss": 0.6312, + "step": 11395 + }, + { + "epoch": 1.9770992366412212, + "grad_norm": 0.882234513759613, + "learning_rate": 2.021734813062226e-08, + "loss": 0.6521, + "step": 11396 + }, + { + "epoch": 1.9772727272727273, + "grad_norm": 0.6937500238418579, + "learning_rate": 1.9912235637720156e-08, + "loss": 0.7213, + "step": 11397 + }, + { + "epoch": 1.9774462179042331, + "grad_norm": 0.9769173264503479, + "learning_rate": 1.960944185728675e-08, + "loss": 0.7587, + "step": 11398 + }, + { + "epoch": 1.9776197085357392, + "grad_norm": 0.9350139498710632, + "learning_rate": 1.9308966824456154e-08, + "loss": 0.6533, + "step": 11399 + }, + { + "epoch": 1.977793199167245, + "grad_norm": 0.9373158812522888, + "learning_rate": 1.9010810574102702e-08, + "loss": 0.645, + "step": 11400 + }, + { + "epoch": 1.977966689798751, + "grad_norm": 1.376627802848816, + "learning_rate": 1.8714973140827596e-08, + "loss": 0.5919, + "step": 11401 + }, + { + "epoch": 1.9781401804302567, + "grad_norm": 0.8120253682136536, + "learning_rate": 1.842145455896338e-08, + "loss": 0.6932, + "step": 11402 + }, + { + "epoch": 1.9783136710617626, + "grad_norm": 0.818089485168457, + "learning_rate": 1.8130254862571693e-08, + "loss": 0.7491, + "step": 11403 + }, + { + "epoch": 1.9784871616932684, + "grad_norm": 0.7754546999931335, + "learning_rate": 1.7841374085447728e-08, + "loss": 0.6342, + "step": 11404 + }, + { + "epoch": 1.9786606523247745, + "grad_norm": 0.8400373458862305, + "learning_rate": 1.7554812261117994e-08, + "loss": 0.6322, + "step": 11405 + }, + { + "epoch": 1.9788341429562804, + "grad_norm": 0.8949489593505859, + "learning_rate": 1.727056942283367e-08, + "loss": 0.7424, + "step": 11406 + }, + { + "epoch": 1.9790076335877864, + "grad_norm": 1.7255115509033203, + "learning_rate": 1.6988645603586153e-08, + "loss": 0.7996, + "step": 11407 + }, + { + "epoch": 1.9791811242192923, + "grad_norm": 0.825623631477356, + "learning_rate": 1.6709040836089262e-08, + "loss": 0.7405, + "step": 11408 + }, + { + "epoch": 1.979354614850798, + "grad_norm": 0.888810932636261, + "learning_rate": 1.6431755152794827e-08, + "loss": 0.7482, + "step": 11409 + }, + { + "epoch": 1.979528105482304, + "grad_norm": 1.3725157976150513, + "learning_rate": 1.6156788585879325e-08, + "loss": 0.7262, + "step": 11410 + }, + { + "epoch": 1.9797015961138098, + "grad_norm": 1.0040464401245117, + "learning_rate": 1.5884141167255007e-08, + "loss": 0.5785, + "step": 11411 + }, + { + "epoch": 1.9798750867453156, + "grad_norm": 1.074227213859558, + "learning_rate": 1.5613812928563233e-08, + "loss": 0.6708, + "step": 11412 + }, + { + "epoch": 1.9800485773768215, + "grad_norm": 0.8242344260215759, + "learning_rate": 1.534580390117446e-08, + "loss": 0.7812, + "step": 11413 + }, + { + "epoch": 1.9802220680083276, + "grad_norm": 0.8488274812698364, + "learning_rate": 1.508011411619048e-08, + "loss": 0.6515, + "step": 11414 + }, + { + "epoch": 1.9803955586398334, + "grad_norm": 0.786887526512146, + "learning_rate": 1.4816743604448846e-08, + "loss": 0.7063, + "step": 11415 + }, + { + "epoch": 1.9805690492713395, + "grad_norm": 0.7623203992843628, + "learning_rate": 1.4555692396509557e-08, + "loss": 0.6401, + "step": 11416 + }, + { + "epoch": 1.9807425399028453, + "grad_norm": 0.9382438659667969, + "learning_rate": 1.42969605226706e-08, + "loss": 0.5709, + "step": 11417 + }, + { + "epoch": 1.9809160305343512, + "grad_norm": 1.1883031129837036, + "learning_rate": 1.4040548012956844e-08, + "loss": 0.7292, + "step": 11418 + }, + { + "epoch": 1.981089521165857, + "grad_norm": 0.8100271821022034, + "learning_rate": 1.378645489712449e-08, + "loss": 0.7231, + "step": 11419 + }, + { + "epoch": 1.9812630117973629, + "grad_norm": 1.2733454704284668, + "learning_rate": 1.3534681204665502e-08, + "loss": 0.7256, + "step": 11420 + }, + { + "epoch": 1.9814365024288687, + "grad_norm": 0.9601986408233643, + "learning_rate": 1.3285226964794284e-08, + "loss": 0.63, + "step": 11421 + }, + { + "epoch": 1.9816099930603748, + "grad_norm": 1.536433219909668, + "learning_rate": 1.3038092206461017e-08, + "loss": 0.5768, + "step": 11422 + }, + { + "epoch": 1.9817834836918806, + "grad_norm": 0.8105570673942566, + "learning_rate": 1.2793276958347201e-08, + "loss": 0.6566, + "step": 11423 + }, + { + "epoch": 1.9819569743233867, + "grad_norm": 0.6208687424659729, + "learning_rate": 1.2550781248863442e-08, + "loss": 0.8484, + "step": 11424 + }, + { + "epoch": 1.9821304649548925, + "grad_norm": 0.7152541875839233, + "learning_rate": 1.2310605106149454e-08, + "loss": 0.7507, + "step": 11425 + }, + { + "epoch": 1.9823039555863984, + "grad_norm": 0.6525411009788513, + "learning_rate": 1.2072748558082936e-08, + "loss": 0.7678, + "step": 11426 + }, + { + "epoch": 1.9824774462179042, + "grad_norm": 1.0771974325180054, + "learning_rate": 1.1837211632264034e-08, + "loss": 0.6525, + "step": 11427 + }, + { + "epoch": 1.98265093684941, + "grad_norm": 0.9834551811218262, + "learning_rate": 1.1603994356026437e-08, + "loss": 0.6489, + "step": 11428 + }, + { + "epoch": 1.982824427480916, + "grad_norm": 1.7787619829177856, + "learning_rate": 1.1373096756437385e-08, + "loss": 0.7126, + "step": 11429 + }, + { + "epoch": 1.9829979181124218, + "grad_norm": 0.9776009917259216, + "learning_rate": 1.1144518860290998e-08, + "loss": 0.6675, + "step": 11430 + }, + { + "epoch": 1.9831714087439278, + "grad_norm": 1.5030549764633179, + "learning_rate": 1.0918260694114947e-08, + "loss": 0.7695, + "step": 11431 + }, + { + "epoch": 1.9833448993754337, + "grad_norm": 1.3775079250335693, + "learning_rate": 1.0694322284166003e-08, + "loss": 0.7588, + "step": 11432 + }, + { + "epoch": 1.9835183900069397, + "grad_norm": 0.8132615685462952, + "learning_rate": 1.047270365643449e-08, + "loss": 0.6154, + "step": 11433 + }, + { + "epoch": 1.9836918806384456, + "grad_norm": 1.0472773313522339, + "learning_rate": 1.0253404836637615e-08, + "loss": 0.6873, + "step": 11434 + }, + { + "epoch": 1.9838653712699514, + "grad_norm": 0.7125112414360046, + "learning_rate": 1.0036425850226129e-08, + "loss": 0.7821, + "step": 11435 + }, + { + "epoch": 1.9840388619014573, + "grad_norm": 1.073898434638977, + "learning_rate": 9.821766722379888e-09, + "loss": 0.7074, + "step": 11436 + }, + { + "epoch": 1.9842123525329631, + "grad_norm": 0.9853663444519043, + "learning_rate": 9.609427478010081e-09, + "loss": 0.7222, + "step": 11437 + }, + { + "epoch": 1.984385843164469, + "grad_norm": 0.834174633026123, + "learning_rate": 9.399408141761434e-09, + "loss": 0.6754, + "step": 11438 + }, + { + "epoch": 1.984559333795975, + "grad_norm": 0.8476788997650146, + "learning_rate": 9.191708738003346e-09, + "loss": 0.7025, + "step": 11439 + }, + { + "epoch": 1.984732824427481, + "grad_norm": 0.6038647890090942, + "learning_rate": 8.9863292908432e-09, + "loss": 0.6923, + "step": 11440 + }, + { + "epoch": 1.984906315058987, + "grad_norm": 0.7101566195487976, + "learning_rate": 8.78326982411304e-09, + "loss": 0.7031, + "step": 11441 + }, + { + "epoch": 1.9850798056904928, + "grad_norm": 0.7617210149765015, + "learning_rate": 8.58253036137846e-09, + "loss": 0.7402, + "step": 11442 + }, + { + "epoch": 1.9852532963219987, + "grad_norm": 1.274199366569519, + "learning_rate": 8.384110925936384e-09, + "loss": 0.8099, + "step": 11443 + }, + { + "epoch": 1.9854267869535045, + "grad_norm": 1.0001728534698486, + "learning_rate": 8.188011540812834e-09, + "loss": 0.6266, + "step": 11444 + }, + { + "epoch": 1.9856002775850103, + "grad_norm": 0.7003872394561768, + "learning_rate": 7.994232228765164e-09, + "loss": 0.7278, + "step": 11445 + }, + { + "epoch": 1.9857737682165162, + "grad_norm": 0.6320180892944336, + "learning_rate": 7.80277301228205e-09, + "loss": 0.7341, + "step": 11446 + }, + { + "epoch": 1.9859472588480223, + "grad_norm": 0.8625776767730713, + "learning_rate": 7.613633913583495e-09, + "loss": 0.7008, + "step": 11447 + }, + { + "epoch": 1.986120749479528, + "grad_norm": 0.9104230999946594, + "learning_rate": 7.426814954618611e-09, + "loss": 0.7368, + "step": 11448 + }, + { + "epoch": 1.9862942401110342, + "grad_norm": 1.3555822372436523, + "learning_rate": 7.24231615706561e-09, + "loss": 0.7603, + "step": 11449 + }, + { + "epoch": 1.98646773074254, + "grad_norm": 1.067388653755188, + "learning_rate": 7.060137542340695e-09, + "loss": 0.5797, + "step": 11450 + }, + { + "epoch": 1.9866412213740459, + "grad_norm": 0.960496187210083, + "learning_rate": 6.8802791315825125e-09, + "loss": 0.6372, + "step": 11451 + }, + { + "epoch": 1.9868147120055517, + "grad_norm": 2.055540084838867, + "learning_rate": 6.702740945663256e-09, + "loss": 0.5966, + "step": 11452 + }, + { + "epoch": 1.9869882026370576, + "grad_norm": 1.1390448808670044, + "learning_rate": 6.527523005188663e-09, + "loss": 0.668, + "step": 11453 + }, + { + "epoch": 1.9871616932685634, + "grad_norm": 0.7323452830314636, + "learning_rate": 6.35462533049358e-09, + "loss": 0.7263, + "step": 11454 + }, + { + "epoch": 1.9873351839000692, + "grad_norm": 1.219815731048584, + "learning_rate": 6.184047941639737e-09, + "loss": 0.6207, + "step": 11455 + }, + { + "epoch": 1.9875086745315753, + "grad_norm": 0.9574404954910278, + "learning_rate": 6.0157908584246305e-09, + "loss": 0.7283, + "step": 11456 + }, + { + "epoch": 1.9876821651630812, + "grad_norm": 2.4544594287872314, + "learning_rate": 5.8498541003748634e-09, + "loss": 0.6393, + "step": 11457 + }, + { + "epoch": 1.9878556557945872, + "grad_norm": 1.037307620048523, + "learning_rate": 5.686237686746143e-09, + "loss": 0.6166, + "step": 11458 + }, + { + "epoch": 1.988029146426093, + "grad_norm": 0.8615810871124268, + "learning_rate": 5.5249416365299455e-09, + "loss": 0.7888, + "step": 11459 + }, + { + "epoch": 1.988202637057599, + "grad_norm": 0.6398669481277466, + "learning_rate": 5.36596596844019e-09, + "loss": 0.8174, + "step": 11460 + }, + { + "epoch": 1.9883761276891048, + "grad_norm": 0.8886589407920837, + "learning_rate": 5.209310700931003e-09, + "loss": 0.7809, + "step": 11461 + }, + { + "epoch": 1.9885496183206106, + "grad_norm": 0.7659891843795776, + "learning_rate": 5.054975852176736e-09, + "loss": 0.8457, + "step": 11462 + }, + { + "epoch": 1.9887231089521165, + "grad_norm": 0.6497374773025513, + "learning_rate": 4.9029614400941675e-09, + "loss": 0.6779, + "step": 11463 + }, + { + "epoch": 1.9888965995836225, + "grad_norm": 1.0003374814987183, + "learning_rate": 4.7532674823203e-09, + "loss": 0.6748, + "step": 11464 + }, + { + "epoch": 1.9890700902151284, + "grad_norm": 0.7250728607177734, + "learning_rate": 4.605893996227906e-09, + "loss": 0.7734, + "step": 11465 + }, + { + "epoch": 1.9892435808466344, + "grad_norm": 0.9680227041244507, + "learning_rate": 4.4608409989232995e-09, + "loss": 0.6283, + "step": 11466 + }, + { + "epoch": 1.9894170714781403, + "grad_norm": 0.9504625201225281, + "learning_rate": 4.318108507235241e-09, + "loss": 0.7051, + "step": 11467 + }, + { + "epoch": 1.9895905621096461, + "grad_norm": 0.6649958491325378, + "learning_rate": 4.17769653773048e-09, + "loss": 0.8142, + "step": 11468 + }, + { + "epoch": 1.989764052741152, + "grad_norm": 1.003364086151123, + "learning_rate": 4.039605106704869e-09, + "loss": 0.7122, + "step": 11469 + }, + { + "epoch": 1.9899375433726578, + "grad_norm": 1.039623737335205, + "learning_rate": 3.903834230183368e-09, + "loss": 0.7128, + "step": 11470 + }, + { + "epoch": 1.9901110340041637, + "grad_norm": 0.9628387689590454, + "learning_rate": 3.77038392392004e-09, + "loss": 0.6191, + "step": 11471 + }, + { + "epoch": 1.9902845246356695, + "grad_norm": 0.7939297556877136, + "learning_rate": 3.639254203406939e-09, + "loss": 0.6212, + "step": 11472 + }, + { + "epoch": 1.9904580152671756, + "grad_norm": 1.0657528638839722, + "learning_rate": 3.5104450838563396e-09, + "loss": 0.6572, + "step": 11473 + }, + { + "epoch": 1.9906315058986814, + "grad_norm": 1.595118522644043, + "learning_rate": 3.383956580218506e-09, + "loss": 0.7366, + "step": 11474 + }, + { + "epoch": 1.9908049965301875, + "grad_norm": 0.7961582541465759, + "learning_rate": 3.2597887071750266e-09, + "loss": 0.7964, + "step": 11475 + }, + { + "epoch": 1.9909784871616933, + "grad_norm": 1.0065538883209229, + "learning_rate": 3.1379414791343766e-09, + "loss": 0.7937, + "step": 11476 + }, + { + "epoch": 1.9911519777931992, + "grad_norm": 0.9499013423919678, + "learning_rate": 3.0184149102341354e-09, + "loss": 0.5555, + "step": 11477 + }, + { + "epoch": 1.991325468424705, + "grad_norm": 0.8192165493965149, + "learning_rate": 2.9012090143498704e-09, + "loss": 0.7443, + "step": 11478 + }, + { + "epoch": 1.9914989590562109, + "grad_norm": 0.8742952346801758, + "learning_rate": 2.786323805081814e-09, + "loss": 0.7937, + "step": 11479 + }, + { + "epoch": 1.9916724496877167, + "grad_norm": 0.8763142824172974, + "learning_rate": 2.6737592957615243e-09, + "loss": 0.7966, + "step": 11480 + }, + { + "epoch": 1.9918459403192228, + "grad_norm": 0.998501181602478, + "learning_rate": 2.563515499451885e-09, + "loss": 0.6434, + "step": 11481 + }, + { + "epoch": 1.9920194309507286, + "grad_norm": 1.1935738325119019, + "learning_rate": 2.4555924289493272e-09, + "loss": 0.6924, + "step": 11482 + }, + { + "epoch": 1.9921929215822347, + "grad_norm": 0.7513913512229919, + "learning_rate": 2.349990096777166e-09, + "loss": 0.735, + "step": 11483 + }, + { + "epoch": 1.9923664122137406, + "grad_norm": 0.9050767421722412, + "learning_rate": 2.2467085151900436e-09, + "loss": 0.6195, + "step": 11484 + }, + { + "epoch": 1.9925399028452464, + "grad_norm": 0.7191981077194214, + "learning_rate": 2.145747696173928e-09, + "loss": 0.7112, + "step": 11485 + }, + { + "epoch": 1.9927133934767522, + "grad_norm": 0.9039629101753235, + "learning_rate": 2.047107651446112e-09, + "loss": 0.6719, + "step": 11486 + }, + { + "epoch": 1.992886884108258, + "grad_norm": 1.6635719537734985, + "learning_rate": 1.950788392455216e-09, + "loss": 0.5947, + "step": 11487 + }, + { + "epoch": 1.993060374739764, + "grad_norm": 1.140831470489502, + "learning_rate": 1.8567899303767457e-09, + "loss": 0.673, + "step": 11488 + }, + { + "epoch": 1.9932338653712698, + "grad_norm": 0.6705654859542847, + "learning_rate": 1.7651122761197515e-09, + "loss": 0.8032, + "step": 11489 + }, + { + "epoch": 1.9934073560027759, + "grad_norm": 0.6758986115455627, + "learning_rate": 1.6757554403223907e-09, + "loss": 0.7915, + "step": 11490 + }, + { + "epoch": 1.9935808466342817, + "grad_norm": 0.9140558242797852, + "learning_rate": 1.5887194333585876e-09, + "loss": 0.6852, + "step": 11491 + }, + { + "epoch": 1.9937543372657878, + "grad_norm": 1.2173523902893066, + "learning_rate": 1.5040042653269304e-09, + "loss": 0.6573, + "step": 11492 + }, + { + "epoch": 1.9939278278972936, + "grad_norm": 0.8844655752182007, + "learning_rate": 1.421609946057334e-09, + "loss": 0.6335, + "step": 11493 + }, + { + "epoch": 1.9941013185287995, + "grad_norm": 1.4640592336654663, + "learning_rate": 1.3415364851132595e-09, + "loss": 0.6051, + "step": 11494 + }, + { + "epoch": 1.9942748091603053, + "grad_norm": 1.0677266120910645, + "learning_rate": 1.2637838917872735e-09, + "loss": 0.5703, + "step": 11495 + }, + { + "epoch": 1.9944482997918112, + "grad_norm": 1.1336660385131836, + "learning_rate": 1.188352175103269e-09, + "loss": 0.5815, + "step": 11496 + }, + { + "epoch": 1.994621790423317, + "grad_norm": 2.3836207389831543, + "learning_rate": 1.1152413438120235e-09, + "loss": 0.6704, + "step": 11497 + }, + { + "epoch": 1.994795281054823, + "grad_norm": 0.684938907623291, + "learning_rate": 1.0444514064023025e-09, + "loss": 0.6892, + "step": 11498 + }, + { + "epoch": 1.994968771686329, + "grad_norm": 1.7382558584213257, + "learning_rate": 9.759823710853155e-10, + "loss": 0.7791, + "step": 11499 + }, + { + "epoch": 1.995142262317835, + "grad_norm": 1.3906954526901245, + "learning_rate": 9.098342458102593e-10, + "loss": 0.5786, + "step": 11500 + }, + { + "epoch": 1.9953157529493408, + "grad_norm": 0.9615573287010193, + "learning_rate": 8.460070382532159e-10, + "loss": 0.6227, + "step": 11501 + }, + { + "epoch": 1.9954892435808467, + "grad_norm": 0.7600582242012024, + "learning_rate": 7.845007558193729e-10, + "loss": 0.7739, + "step": 11502 + }, + { + "epoch": 1.9956627342123525, + "grad_norm": 0.8383855819702148, + "learning_rate": 7.253154056474643e-10, + "loss": 0.6523, + "step": 11503 + }, + { + "epoch": 1.9958362248438584, + "grad_norm": 0.9320817589759827, + "learning_rate": 6.684509946075501e-10, + "loss": 0.6239, + "step": 11504 + }, + { + "epoch": 1.9960097154753642, + "grad_norm": 0.7621611952781677, + "learning_rate": 6.139075292987962e-10, + "loss": 0.6794, + "step": 11505 + }, + { + "epoch": 1.9961832061068703, + "grad_norm": 0.694157063961029, + "learning_rate": 5.616850160494736e-10, + "loss": 0.7399, + "step": 11506 + }, + { + "epoch": 1.9963566967383761, + "grad_norm": 1.0741289854049683, + "learning_rate": 5.117834609191796e-10, + "loss": 0.6368, + "step": 11507 + }, + { + "epoch": 1.9965301873698822, + "grad_norm": 1.4087622165679932, + "learning_rate": 4.642028697010581e-10, + "loss": 0.5499, + "step": 11508 + }, + { + "epoch": 1.996703678001388, + "grad_norm": 1.9275933504104614, + "learning_rate": 4.189432479173583e-10, + "loss": 0.5758, + "step": 11509 + }, + { + "epoch": 1.9968771686328939, + "grad_norm": 0.9132659435272217, + "learning_rate": 3.760046008172147e-10, + "loss": 0.8018, + "step": 11510 + }, + { + "epoch": 1.9970506592643997, + "grad_norm": 1.0022917985916138, + "learning_rate": 3.353869333877491e-10, + "loss": 0.6606, + "step": 11511 + }, + { + "epoch": 1.9972241498959056, + "grad_norm": 0.9113234281539917, + "learning_rate": 2.97090250340748e-10, + "loss": 0.6199, + "step": 11512 + }, + { + "epoch": 1.9973976405274114, + "grad_norm": 1.101978063583374, + "learning_rate": 2.611145561215445e-10, + "loss": 0.7379, + "step": 11513 + }, + { + "epoch": 1.9975711311589173, + "grad_norm": 0.9614459872245789, + "learning_rate": 2.2745985490235656e-10, + "loss": 0.644, + "step": 11514 + }, + { + "epoch": 1.9977446217904233, + "grad_norm": 0.9026958346366882, + "learning_rate": 1.9612615059116935e-10, + "loss": 0.6117, + "step": 11515 + }, + { + "epoch": 1.9979181124219292, + "grad_norm": 1.3231223821640015, + "learning_rate": 1.6711344682507346e-10, + "loss": 0.6329, + "step": 11516 + }, + { + "epoch": 1.9980916030534353, + "grad_norm": 0.9204657673835754, + "learning_rate": 1.4042174696804467e-10, + "loss": 0.6289, + "step": 11517 + }, + { + "epoch": 1.998265093684941, + "grad_norm": 0.8320528268814087, + "learning_rate": 1.160510541220461e-10, + "loss": 0.728, + "step": 11518 + }, + { + "epoch": 1.998438584316447, + "grad_norm": 0.8677964806556702, + "learning_rate": 9.400137111148511e-11, + "loss": 0.7567, + "step": 11519 + }, + { + "epoch": 1.9986120749479528, + "grad_norm": 1.3626582622528076, + "learning_rate": 7.427270049653601e-11, + "loss": 0.6613, + "step": 11520 + }, + { + "epoch": 1.9987855655794586, + "grad_norm": 0.8933671712875366, + "learning_rate": 5.6865044566478676e-11, + "loss": 0.7163, + "step": 11521 + }, + { + "epoch": 1.9989590562109645, + "grad_norm": 0.9347165822982788, + "learning_rate": 4.1778405341919016e-11, + "loss": 0.7162, + "step": 11522 + }, + { + "epoch": 1.9991325468424705, + "grad_norm": 0.8607229590415955, + "learning_rate": 2.9012784574788954e-11, + "loss": 0.6416, + "step": 11523 + }, + { + "epoch": 1.9993060374739764, + "grad_norm": 0.8122918605804443, + "learning_rate": 1.8568183743905566e-11, + "loss": 0.5851, + "step": 11524 + }, + { + "epoch": 1.9994795281054825, + "grad_norm": 0.93300461769104, + "learning_rate": 1.0444604063852837e-11, + "loss": 0.6156, + "step": 11525 + }, + { + "epoch": 1.9996530187369883, + "grad_norm": 0.8774994015693665, + "learning_rate": 4.6420464738794465e-12, + "loss": 0.7311, + "step": 11526 + }, + { + "epoch": 1.9998265093684942, + "grad_norm": 0.6349413990974426, + "learning_rate": 1.160511651221441e-12, + "loss": 0.6724, + "step": 11527 + }, + { + "epoch": 2.0, + "grad_norm": 0.8440874814987183, + "learning_rate": 0.0, + "loss": 0.6902, + "step": 11528 + }, + { + "epoch": 2.0, + "step": 11528, "total_flos": 0.0, "train_loss": 0.0, - "train_runtime": 1.8736, - "train_samples_per_second": 93546.646, - "train_steps_per_second": 2922.699 + "train_runtime": 1.902, + "train_samples_per_second": 193956.072, + "train_steps_per_second": 6060.93 } ], "logging_steps": 1.0, - "max_steps": 5476, + "max_steps": 11528, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, @@ -38368,7 +80732,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 1, + "train_batch_size": 2, "trial_name": null, "trial_params": null }