diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,38374 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998174182946868, + "eval_steps": 500, + "global_step": 5476, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003651634106262553, + "grad_norm": 858.7462158203125, + "learning_rate": 3.6496350364963505e-08, + "loss": 4.4937, + "step": 1 + }, + { + "epoch": 0.0007303268212525105, + "grad_norm": 6394.86376953125, + "learning_rate": 7.299270072992701e-08, + "loss": 5.9233, + "step": 2 + }, + { + "epoch": 0.0010954902318787657, + "grad_norm": 3305.432861328125, + "learning_rate": 1.0948905109489053e-07, + "loss": 5.4385, + "step": 3 + }, + { + "epoch": 0.001460653642505021, + "grad_norm": 1724.0235595703125, + "learning_rate": 1.4598540145985402e-07, + "loss": 4.5796, + "step": 4 + }, + { + "epoch": 0.0018258170531312763, + "grad_norm": 18848.08203125, + "learning_rate": 1.8248175182481753e-07, + "loss": 7.1191, + "step": 5 + }, + { + "epoch": 0.0021909804637575314, + "grad_norm": 5539.15234375, + "learning_rate": 2.1897810218978106e-07, + "loss": 6.7446, + "step": 6 + }, + { + "epoch": 0.0025561438743837866, + "grad_norm": 2666.61962890625, + "learning_rate": 2.5547445255474454e-07, + "loss": 8.3125, + "step": 7 + }, + { + "epoch": 0.002921307285010042, + "grad_norm": 3787.522705078125, + "learning_rate": 2.9197080291970804e-07, + "loss": 5.8618, + "step": 8 + }, + { + "epoch": 0.0032864706956362974, + "grad_norm": 72566.375, + "learning_rate": 3.284671532846716e-07, + "loss": 6.7295, + "step": 9 + }, + { + "epoch": 0.0036516341062625525, + "grad_norm": 30215.857421875, + "learning_rate": 3.6496350364963505e-07, + "loss": 7.3945, + "step": 10 + }, + { + "epoch": 0.004016797516888808, + "grad_norm": 26900.92578125, + "learning_rate": 4.0145985401459856e-07, + "loss": 7.6553, + "step": 11 + }, + { + "epoch": 0.004381960927515063, + "grad_norm": 3489.6513671875, + "learning_rate": 4.379562043795621e-07, + "loss": 8.3828, + "step": 12 + }, + { + "epoch": 0.004747124338141318, + "grad_norm": 808.1627197265625, + "learning_rate": 4.7445255474452557e-07, + "loss": 5.2695, + "step": 13 + }, + { + "epoch": 0.005112287748767573, + "grad_norm": 5560.9794921875, + "learning_rate": 5.109489051094891e-07, + "loss": 4.6694, + "step": 14 + }, + { + "epoch": 0.005477451159393828, + "grad_norm": 3059.473388671875, + "learning_rate": 5.474452554744526e-07, + "loss": 6.4053, + "step": 15 + }, + { + "epoch": 0.005842614570020084, + "grad_norm": 44171.578125, + "learning_rate": 5.839416058394161e-07, + "loss": 6.0503, + "step": 16 + }, + { + "epoch": 0.0062077779806463396, + "grad_norm": 6308.86474609375, + "learning_rate": 6.204379562043796e-07, + "loss": 5.3037, + "step": 17 + }, + { + "epoch": 0.006572941391272595, + "grad_norm": 1026.9315185546875, + "learning_rate": 6.569343065693432e-07, + "loss": 6.7432, + "step": 18 + }, + { + "epoch": 0.00693810480189885, + "grad_norm": 7379.29296875, + "learning_rate": 6.934306569343066e-07, + "loss": 5.8125, + "step": 19 + }, + { + "epoch": 0.007303268212525105, + "grad_norm": 1578.2115478515625, + "learning_rate": 7.299270072992701e-07, + "loss": 7.7974, + "step": 20 + }, + { + "epoch": 0.00766843162315136, + "grad_norm": 1759.99365234375, + "learning_rate": 7.664233576642337e-07, + "loss": 5.6411, + "step": 21 + }, + { + "epoch": 0.008033595033777615, + "grad_norm": 6159.49169921875, + "learning_rate": 8.029197080291971e-07, + "loss": 6.9321, + "step": 22 + }, + { + "epoch": 0.00839875844440387, + "grad_norm": 2101.160888671875, + "learning_rate": 8.394160583941606e-07, + "loss": 6.7178, + "step": 23 + }, + { + "epoch": 0.008763921855030126, + "grad_norm": 31658.865234375, + "learning_rate": 8.759124087591242e-07, + "loss": 5.3345, + "step": 24 + }, + { + "epoch": 0.009129085265656381, + "grad_norm": 7087.1318359375, + "learning_rate": 9.124087591240876e-07, + "loss": 6.7251, + "step": 25 + }, + { + "epoch": 0.009494248676282636, + "grad_norm": 18886.189453125, + "learning_rate": 9.489051094890511e-07, + "loss": 6.3369, + "step": 26 + }, + { + "epoch": 0.009859412086908891, + "grad_norm": 5172.552734375, + "learning_rate": 9.854014598540146e-07, + "loss": 6.625, + "step": 27 + }, + { + "epoch": 0.010224575497535146, + "grad_norm": 3320.644775390625, + "learning_rate": 1.0218978102189781e-06, + "loss": 6.4692, + "step": 28 + }, + { + "epoch": 0.010589738908161402, + "grad_norm": 5261.62646484375, + "learning_rate": 1.0583941605839416e-06, + "loss": 7.0078, + "step": 29 + }, + { + "epoch": 0.010954902318787657, + "grad_norm": 14572.537109375, + "learning_rate": 1.0948905109489052e-06, + "loss": 6.2578, + "step": 30 + }, + { + "epoch": 0.011320065729413912, + "grad_norm": 2529.656982421875, + "learning_rate": 1.1313868613138687e-06, + "loss": 6.7266, + "step": 31 + }, + { + "epoch": 0.011685229140040169, + "grad_norm": 6337.70458984375, + "learning_rate": 1.1678832116788322e-06, + "loss": 5.8765, + "step": 32 + }, + { + "epoch": 0.012050392550666424, + "grad_norm": 1270.5396728515625, + "learning_rate": 1.2043795620437959e-06, + "loss": 6.3232, + "step": 33 + }, + { + "epoch": 0.012415555961292679, + "grad_norm": 2410.99853515625, + "learning_rate": 1.2408759124087592e-06, + "loss": 6.2515, + "step": 34 + }, + { + "epoch": 0.012780719371918934, + "grad_norm": 1112.746826171875, + "learning_rate": 1.2773722627737229e-06, + "loss": 7.2603, + "step": 35 + }, + { + "epoch": 0.01314588278254519, + "grad_norm": 2323.461669921875, + "learning_rate": 1.3138686131386864e-06, + "loss": 6.1316, + "step": 36 + }, + { + "epoch": 0.013511046193171445, + "grad_norm": 6492.537109375, + "learning_rate": 1.3503649635036497e-06, + "loss": 5.7026, + "step": 37 + }, + { + "epoch": 0.0138762096037977, + "grad_norm": 1916.9757080078125, + "learning_rate": 1.3868613138686132e-06, + "loss": 6.5596, + "step": 38 + }, + { + "epoch": 0.014241373014423955, + "grad_norm": 980.5525512695312, + "learning_rate": 1.4233576642335767e-06, + "loss": 4.6865, + "step": 39 + }, + { + "epoch": 0.01460653642505021, + "grad_norm": 3851.416259765625, + "learning_rate": 1.4598540145985402e-06, + "loss": 5.9668, + "step": 40 + }, + { + "epoch": 0.014971699835676465, + "grad_norm": 1468.562744140625, + "learning_rate": 1.496350364963504e-06, + "loss": 6.6636, + "step": 41 + }, + { + "epoch": 0.01533686324630272, + "grad_norm": 2646.891357421875, + "learning_rate": 1.5328467153284674e-06, + "loss": 5.6963, + "step": 42 + }, + { + "epoch": 0.015702026656928977, + "grad_norm": 10316.486328125, + "learning_rate": 1.5693430656934307e-06, + "loss": 6.1025, + "step": 43 + }, + { + "epoch": 0.01606719006755523, + "grad_norm": 1718.9337158203125, + "learning_rate": 1.6058394160583942e-06, + "loss": 6.4917, + "step": 44 + }, + { + "epoch": 0.016432353478181488, + "grad_norm": 1402.13623046875, + "learning_rate": 1.6423357664233577e-06, + "loss": 4.6167, + "step": 45 + }, + { + "epoch": 0.01679751688880774, + "grad_norm": 514.71923828125, + "learning_rate": 1.6788321167883212e-06, + "loss": 5.6113, + "step": 46 + }, + { + "epoch": 0.017162680299433998, + "grad_norm": 7258.53076171875, + "learning_rate": 1.715328467153285e-06, + "loss": 5.1318, + "step": 47 + }, + { + "epoch": 0.01752784371006025, + "grad_norm": 53655.25, + "learning_rate": 1.7518248175182485e-06, + "loss": 6.5986, + "step": 48 + }, + { + "epoch": 0.01789300712068651, + "grad_norm": 6116.984375, + "learning_rate": 1.788321167883212e-06, + "loss": 5.8286, + "step": 49 + }, + { + "epoch": 0.018258170531312762, + "grad_norm": 1429.343017578125, + "learning_rate": 1.8248175182481753e-06, + "loss": 6.2109, + "step": 50 + }, + { + "epoch": 0.01862333394193902, + "grad_norm": 3730.193115234375, + "learning_rate": 1.8613138686131388e-06, + "loss": 6.8057, + "step": 51 + }, + { + "epoch": 0.018988497352565272, + "grad_norm": 42216.3984375, + "learning_rate": 1.8978102189781023e-06, + "loss": 4.1436, + "step": 52 + }, + { + "epoch": 0.01935366076319153, + "grad_norm": 3568.0703125, + "learning_rate": 1.934306569343066e-06, + "loss": 6.5127, + "step": 53 + }, + { + "epoch": 0.019718824173817782, + "grad_norm": 242.3411407470703, + "learning_rate": 1.9708029197080293e-06, + "loss": 4.0176, + "step": 54 + }, + { + "epoch": 0.02008398758444404, + "grad_norm": 7677.52392578125, + "learning_rate": 2.007299270072993e-06, + "loss": 6.2373, + "step": 55 + }, + { + "epoch": 0.020449150995070293, + "grad_norm": 2871.701904296875, + "learning_rate": 2.0437956204379563e-06, + "loss": 5.6187, + "step": 56 + }, + { + "epoch": 0.02081431440569655, + "grad_norm": 1617.5478515625, + "learning_rate": 2.08029197080292e-06, + "loss": 6.1074, + "step": 57 + }, + { + "epoch": 0.021179477816322803, + "grad_norm": 2647.792724609375, + "learning_rate": 2.1167883211678833e-06, + "loss": 5.9727, + "step": 58 + }, + { + "epoch": 0.02154464122694906, + "grad_norm": 2567.927490234375, + "learning_rate": 2.1532846715328466e-06, + "loss": 6.4531, + "step": 59 + }, + { + "epoch": 0.021909804637575313, + "grad_norm": 549.2553100585938, + "learning_rate": 2.1897810218978103e-06, + "loss": 5.0977, + "step": 60 + }, + { + "epoch": 0.02227496804820157, + "grad_norm": 7288.61474609375, + "learning_rate": 2.226277372262774e-06, + "loss": 6.0161, + "step": 61 + }, + { + "epoch": 0.022640131458827824, + "grad_norm": 18750.427734375, + "learning_rate": 2.2627737226277373e-06, + "loss": 5.6802, + "step": 62 + }, + { + "epoch": 0.02300529486945408, + "grad_norm": 1172.6300048828125, + "learning_rate": 2.299270072992701e-06, + "loss": 6.1104, + "step": 63 + }, + { + "epoch": 0.023370458280080338, + "grad_norm": 2294.61328125, + "learning_rate": 2.3357664233576643e-06, + "loss": 5.3442, + "step": 64 + }, + { + "epoch": 0.02373562169070659, + "grad_norm": 10097.84375, + "learning_rate": 2.3722627737226276e-06, + "loss": 5.3574, + "step": 65 + }, + { + "epoch": 0.024100785101332848, + "grad_norm": 6148.13916015625, + "learning_rate": 2.4087591240875918e-06, + "loss": 5.4424, + "step": 66 + }, + { + "epoch": 0.0244659485119591, + "grad_norm": 1161.4757080078125, + "learning_rate": 2.445255474452555e-06, + "loss": 5.6011, + "step": 67 + }, + { + "epoch": 0.024831111922585358, + "grad_norm": 2529.951904296875, + "learning_rate": 2.4817518248175183e-06, + "loss": 5.1353, + "step": 68 + }, + { + "epoch": 0.02519627533321161, + "grad_norm": 5371.56982421875, + "learning_rate": 2.518248175182482e-06, + "loss": 3.9297, + "step": 69 + }, + { + "epoch": 0.02556143874383787, + "grad_norm": 3334.050537109375, + "learning_rate": 2.5547445255474458e-06, + "loss": 4.1289, + "step": 70 + }, + { + "epoch": 0.025926602154464122, + "grad_norm": 8525.9794921875, + "learning_rate": 2.591240875912409e-06, + "loss": 5.3735, + "step": 71 + }, + { + "epoch": 0.02629176556509038, + "grad_norm": 4757.6689453125, + "learning_rate": 2.627737226277373e-06, + "loss": 5.7793, + "step": 72 + }, + { + "epoch": 0.026656928975716632, + "grad_norm": 6516.1943359375, + "learning_rate": 2.664233576642336e-06, + "loss": 5.9304, + "step": 73 + }, + { + "epoch": 0.02702209238634289, + "grad_norm": 7296.853515625, + "learning_rate": 2.7007299270072994e-06, + "loss": 4.9775, + "step": 74 + }, + { + "epoch": 0.027387255796969143, + "grad_norm": 816.5313720703125, + "learning_rate": 2.737226277372263e-06, + "loss": 4.5146, + "step": 75 + }, + { + "epoch": 0.0277524192075954, + "grad_norm": 1695.573974609375, + "learning_rate": 2.7737226277372264e-06, + "loss": 5.3511, + "step": 76 + }, + { + "epoch": 0.028117582618221653, + "grad_norm": 1111.10009765625, + "learning_rate": 2.81021897810219e-06, + "loss": 4.2915, + "step": 77 + }, + { + "epoch": 0.02848274602884791, + "grad_norm": 6980.4052734375, + "learning_rate": 2.8467153284671534e-06, + "loss": 3.7622, + "step": 78 + }, + { + "epoch": 0.028847909439474163, + "grad_norm": 2465.1806640625, + "learning_rate": 2.8832116788321167e-06, + "loss": 6.0605, + "step": 79 + }, + { + "epoch": 0.02921307285010042, + "grad_norm": 2196.582763671875, + "learning_rate": 2.9197080291970804e-06, + "loss": 5.7114, + "step": 80 + }, + { + "epoch": 0.029578236260726674, + "grad_norm": 893.1383666992188, + "learning_rate": 2.956204379562044e-06, + "loss": 5.2002, + "step": 81 + }, + { + "epoch": 0.02994339967135293, + "grad_norm": 3686.89306640625, + "learning_rate": 2.992700729927008e-06, + "loss": 5.29, + "step": 82 + }, + { + "epoch": 0.030308563081979184, + "grad_norm": 9257.904296875, + "learning_rate": 3.029197080291971e-06, + "loss": 4.4707, + "step": 83 + }, + { + "epoch": 0.03067372649260544, + "grad_norm": 4997.966796875, + "learning_rate": 3.065693430656935e-06, + "loss": 5.4282, + "step": 84 + }, + { + "epoch": 0.031038889903231698, + "grad_norm": 2574.991943359375, + "learning_rate": 3.102189781021898e-06, + "loss": 4.5264, + "step": 85 + }, + { + "epoch": 0.031404053313857955, + "grad_norm": 1940.85546875, + "learning_rate": 3.1386861313868614e-06, + "loss": 4.8984, + "step": 86 + }, + { + "epoch": 0.031769216724484205, + "grad_norm": 1634.5382080078125, + "learning_rate": 3.175182481751825e-06, + "loss": 4.0796, + "step": 87 + }, + { + "epoch": 0.03213438013511046, + "grad_norm": 4987.83154296875, + "learning_rate": 3.2116788321167884e-06, + "loss": 4.8687, + "step": 88 + }, + { + "epoch": 0.03249954354573672, + "grad_norm": 32678.6171875, + "learning_rate": 3.248175182481752e-06, + "loss": 5.0845, + "step": 89 + }, + { + "epoch": 0.032864706956362975, + "grad_norm": 941.7997436523438, + "learning_rate": 3.2846715328467155e-06, + "loss": 4.0479, + "step": 90 + }, + { + "epoch": 0.033229870366989225, + "grad_norm": 6600.39501953125, + "learning_rate": 3.3211678832116788e-06, + "loss": 4.0234, + "step": 91 + }, + { + "epoch": 0.03359503377761548, + "grad_norm": 195.313232421875, + "learning_rate": 3.3576642335766425e-06, + "loss": 4.543, + "step": 92 + }, + { + "epoch": 0.03396019718824174, + "grad_norm": 2158.8779296875, + "learning_rate": 3.3941605839416058e-06, + "loss": 4.282, + "step": 93 + }, + { + "epoch": 0.034325360598867996, + "grad_norm": 2006.3287353515625, + "learning_rate": 3.43065693430657e-06, + "loss": 4.5869, + "step": 94 + }, + { + "epoch": 0.034690524009494246, + "grad_norm": 5730.62646484375, + "learning_rate": 3.467153284671533e-06, + "loss": 4.3423, + "step": 95 + }, + { + "epoch": 0.0350556874201205, + "grad_norm": 6067.22216796875, + "learning_rate": 3.503649635036497e-06, + "loss": 4.561, + "step": 96 + }, + { + "epoch": 0.03542085083074676, + "grad_norm": 7373.00537109375, + "learning_rate": 3.54014598540146e-06, + "loss": 4.3584, + "step": 97 + }, + { + "epoch": 0.03578601424137302, + "grad_norm": 1633.809814453125, + "learning_rate": 3.576642335766424e-06, + "loss": 4.9429, + "step": 98 + }, + { + "epoch": 0.03615117765199927, + "grad_norm": 1958.9189453125, + "learning_rate": 3.6131386861313872e-06, + "loss": 3.9888, + "step": 99 + }, + { + "epoch": 0.036516341062625524, + "grad_norm": 587.3916625976562, + "learning_rate": 3.6496350364963505e-06, + "loss": 3.7556, + "step": 100 + }, + { + "epoch": 0.03688150447325178, + "grad_norm": 1480.251220703125, + "learning_rate": 3.6861313868613142e-06, + "loss": 4.6995, + "step": 101 + }, + { + "epoch": 0.03724666788387804, + "grad_norm": 339.09271240234375, + "learning_rate": 3.7226277372262775e-06, + "loss": 4.5959, + "step": 102 + }, + { + "epoch": 0.03761183129450429, + "grad_norm": 370.29736328125, + "learning_rate": 3.7591240875912412e-06, + "loss": 3.8401, + "step": 103 + }, + { + "epoch": 0.037976994705130544, + "grad_norm": 2126.548583984375, + "learning_rate": 3.7956204379562045e-06, + "loss": 4.8696, + "step": 104 + }, + { + "epoch": 0.0383421581157568, + "grad_norm": 357.5569763183594, + "learning_rate": 3.832116788321168e-06, + "loss": 4.5408, + "step": 105 + }, + { + "epoch": 0.03870732152638306, + "grad_norm": 375.277099609375, + "learning_rate": 3.868613138686132e-06, + "loss": 4.5688, + "step": 106 + }, + { + "epoch": 0.039072484937009315, + "grad_norm": 387.4185485839844, + "learning_rate": 3.905109489051096e-06, + "loss": 3.8911, + "step": 107 + }, + { + "epoch": 0.039437648347635565, + "grad_norm": 732.7127075195312, + "learning_rate": 3.9416058394160585e-06, + "loss": 4.6123, + "step": 108 + }, + { + "epoch": 0.03980281175826182, + "grad_norm": 3045.466064453125, + "learning_rate": 3.978102189781022e-06, + "loss": 3.9941, + "step": 109 + }, + { + "epoch": 0.04016797516888808, + "grad_norm": 6941.19677734375, + "learning_rate": 4.014598540145986e-06, + "loss": 3.981, + "step": 110 + }, + { + "epoch": 0.040533138579514336, + "grad_norm": 1226.5081787109375, + "learning_rate": 4.05109489051095e-06, + "loss": 3.7681, + "step": 111 + }, + { + "epoch": 0.040898301990140586, + "grad_norm": 4508.16748046875, + "learning_rate": 4.0875912408759126e-06, + "loss": 3.689, + "step": 112 + }, + { + "epoch": 0.04126346540076684, + "grad_norm": 8658.3134765625, + "learning_rate": 4.124087591240876e-06, + "loss": 3.3323, + "step": 113 + }, + { + "epoch": 0.0416286288113931, + "grad_norm": 732.7115478515625, + "learning_rate": 4.16058394160584e-06, + "loss": 3.7373, + "step": 114 + }, + { + "epoch": 0.041993792222019356, + "grad_norm": 3595.828369140625, + "learning_rate": 4.197080291970803e-06, + "loss": 3.7979, + "step": 115 + }, + { + "epoch": 0.042358955632645606, + "grad_norm": 558.604736328125, + "learning_rate": 4.233576642335767e-06, + "loss": 3.6646, + "step": 116 + }, + { + "epoch": 0.04272411904327186, + "grad_norm": 1674.7657470703125, + "learning_rate": 4.27007299270073e-06, + "loss": 4.2749, + "step": 117 + }, + { + "epoch": 0.04308928245389812, + "grad_norm": 1014.714599609375, + "learning_rate": 4.306569343065693e-06, + "loss": 3.411, + "step": 118 + }, + { + "epoch": 0.04345444586452438, + "grad_norm": 2597.962158203125, + "learning_rate": 4.343065693430658e-06, + "loss": 3.8379, + "step": 119 + }, + { + "epoch": 0.04381960927515063, + "grad_norm": 3630.7177734375, + "learning_rate": 4.379562043795621e-06, + "loss": 4.8774, + "step": 120 + }, + { + "epoch": 0.044184772685776884, + "grad_norm": 175.34295654296875, + "learning_rate": 4.416058394160584e-06, + "loss": 3.1763, + "step": 121 + }, + { + "epoch": 0.04454993609640314, + "grad_norm": 3908.3720703125, + "learning_rate": 4.452554744525548e-06, + "loss": 3.3569, + "step": 122 + }, + { + "epoch": 0.0449150995070294, + "grad_norm": 4444.5869140625, + "learning_rate": 4.489051094890512e-06, + "loss": 3.8149, + "step": 123 + }, + { + "epoch": 0.04528026291765565, + "grad_norm": 207.31356811523438, + "learning_rate": 4.525547445255475e-06, + "loss": 2.6179, + "step": 124 + }, + { + "epoch": 0.045645426328281904, + "grad_norm": 516.9945068359375, + "learning_rate": 4.562043795620438e-06, + "loss": 4.0015, + "step": 125 + }, + { + "epoch": 0.04601058973890816, + "grad_norm": 4471.39404296875, + "learning_rate": 4.598540145985402e-06, + "loss": 4.3091, + "step": 126 + }, + { + "epoch": 0.04637575314953442, + "grad_norm": 10257.8251953125, + "learning_rate": 4.635036496350365e-06, + "loss": 3.6528, + "step": 127 + }, + { + "epoch": 0.046740916560160675, + "grad_norm": 117.92402648925781, + "learning_rate": 4.671532846715329e-06, + "loss": 3.4553, + "step": 128 + }, + { + "epoch": 0.047106079970786925, + "grad_norm": 3125.854248046875, + "learning_rate": 4.708029197080292e-06, + "loss": 2.8669, + "step": 129 + }, + { + "epoch": 0.04747124338141318, + "grad_norm": 199.3535614013672, + "learning_rate": 4.744525547445255e-06, + "loss": 2.9316, + "step": 130 + }, + { + "epoch": 0.04783640679203944, + "grad_norm": 3418.947265625, + "learning_rate": 4.78102189781022e-06, + "loss": 4.1367, + "step": 131 + }, + { + "epoch": 0.048201570202665696, + "grad_norm": 653.3822631835938, + "learning_rate": 4.8175182481751835e-06, + "loss": 3.3528, + "step": 132 + }, + { + "epoch": 0.048566733613291946, + "grad_norm": 99.99283599853516, + "learning_rate": 4.854014598540146e-06, + "loss": 2.6455, + "step": 133 + }, + { + "epoch": 0.0489318970239182, + "grad_norm": 433.3922424316406, + "learning_rate": 4.89051094890511e-06, + "loss": 4.5752, + "step": 134 + }, + { + "epoch": 0.04929706043454446, + "grad_norm": 746.265380859375, + "learning_rate": 4.927007299270074e-06, + "loss": 3.9375, + "step": 135 + }, + { + "epoch": 0.049662223845170717, + "grad_norm": 750.6600952148438, + "learning_rate": 4.963503649635037e-06, + "loss": 3.1877, + "step": 136 + }, + { + "epoch": 0.050027387255796966, + "grad_norm": 1102.3570556640625, + "learning_rate": 5e-06, + "loss": 3.3564, + "step": 137 + }, + { + "epoch": 0.05039255066642322, + "grad_norm": 2158.283203125, + "learning_rate": 5.036496350364964e-06, + "loss": 3.2808, + "step": 138 + }, + { + "epoch": 0.05075771407704948, + "grad_norm": 308.4197998046875, + "learning_rate": 5.072992700729927e-06, + "loss": 2.7959, + "step": 139 + }, + { + "epoch": 0.05112287748767574, + "grad_norm": 478.9572448730469, + "learning_rate": 5.1094890510948916e-06, + "loss": 3.1453, + "step": 140 + }, + { + "epoch": 0.05148804089830199, + "grad_norm": 494.27197265625, + "learning_rate": 5.1459854014598544e-06, + "loss": 3.3398, + "step": 141 + }, + { + "epoch": 0.051853204308928244, + "grad_norm": 1912.637451171875, + "learning_rate": 5.182481751824818e-06, + "loss": 3.259, + "step": 142 + }, + { + "epoch": 0.0522183677195545, + "grad_norm": 466.3883361816406, + "learning_rate": 5.218978102189781e-06, + "loss": 3.5239, + "step": 143 + }, + { + "epoch": 0.05258353113018076, + "grad_norm": 2344.509765625, + "learning_rate": 5.255474452554746e-06, + "loss": 3.2029, + "step": 144 + }, + { + "epoch": 0.05294869454080701, + "grad_norm": 310.60968017578125, + "learning_rate": 5.2919708029197084e-06, + "loss": 3.3289, + "step": 145 + }, + { + "epoch": 0.053313857951433265, + "grad_norm": 1433.7972412109375, + "learning_rate": 5.328467153284672e-06, + "loss": 3.1902, + "step": 146 + }, + { + "epoch": 0.05367902136205952, + "grad_norm": 3818.810791015625, + "learning_rate": 5.364963503649635e-06, + "loss": 3.5647, + "step": 147 + }, + { + "epoch": 0.05404418477268578, + "grad_norm": 580.8521118164062, + "learning_rate": 5.401459854014599e-06, + "loss": 3.002, + "step": 148 + }, + { + "epoch": 0.054409348183312035, + "grad_norm": 1983.5205078125, + "learning_rate": 5.437956204379562e-06, + "loss": 3.064, + "step": 149 + }, + { + "epoch": 0.054774511593938285, + "grad_norm": 105.86579132080078, + "learning_rate": 5.474452554744526e-06, + "loss": 2.6968, + "step": 150 + }, + { + "epoch": 0.05513967500456454, + "grad_norm": 3389.255615234375, + "learning_rate": 5.51094890510949e-06, + "loss": 3.416, + "step": 151 + }, + { + "epoch": 0.0555048384151908, + "grad_norm": 594.318115234375, + "learning_rate": 5.547445255474453e-06, + "loss": 2.9917, + "step": 152 + }, + { + "epoch": 0.055870001825817056, + "grad_norm": 309.3802185058594, + "learning_rate": 5.5839416058394165e-06, + "loss": 3.2024, + "step": 153 + }, + { + "epoch": 0.056235165236443306, + "grad_norm": 15191.001953125, + "learning_rate": 5.62043795620438e-06, + "loss": 3.1177, + "step": 154 + }, + { + "epoch": 0.05660032864706956, + "grad_norm": 617.5128784179688, + "learning_rate": 5.656934306569344e-06, + "loss": 2.7318, + "step": 155 + }, + { + "epoch": 0.05696549205769582, + "grad_norm": 2553.682861328125, + "learning_rate": 5.693430656934307e-06, + "loss": 2.7981, + "step": 156 + }, + { + "epoch": 0.05733065546832208, + "grad_norm": 434.7556457519531, + "learning_rate": 5.7299270072992705e-06, + "loss": 3.3635, + "step": 157 + }, + { + "epoch": 0.05769581887894833, + "grad_norm": 760.1705932617188, + "learning_rate": 5.766423357664233e-06, + "loss": 3.2495, + "step": 158 + }, + { + "epoch": 0.058060982289574584, + "grad_norm": 1057.6082763671875, + "learning_rate": 5.802919708029198e-06, + "loss": 2.9773, + "step": 159 + }, + { + "epoch": 0.05842614570020084, + "grad_norm": 645.4927368164062, + "learning_rate": 5.839416058394161e-06, + "loss": 2.7349, + "step": 160 + }, + { + "epoch": 0.0587913091108271, + "grad_norm": 1277.3204345703125, + "learning_rate": 5.8759124087591245e-06, + "loss": 2.5996, + "step": 161 + }, + { + "epoch": 0.05915647252145335, + "grad_norm": 52.24332809448242, + "learning_rate": 5.912408759124088e-06, + "loss": 2.1094, + "step": 162 + }, + { + "epoch": 0.059521635932079604, + "grad_norm": 2303.613037109375, + "learning_rate": 5.948905109489051e-06, + "loss": 2.9954, + "step": 163 + }, + { + "epoch": 0.05988679934270586, + "grad_norm": 906.8886108398438, + "learning_rate": 5.985401459854016e-06, + "loss": 2.9514, + "step": 164 + }, + { + "epoch": 0.06025196275333212, + "grad_norm": 430.33221435546875, + "learning_rate": 6.0218978102189786e-06, + "loss": 2.9661, + "step": 165 + }, + { + "epoch": 0.06061712616395837, + "grad_norm": 459.38922119140625, + "learning_rate": 6.058394160583942e-06, + "loss": 3.4292, + "step": 166 + }, + { + "epoch": 0.060982289574584625, + "grad_norm": 453.9920349121094, + "learning_rate": 6.094890510948905e-06, + "loss": 3.5171, + "step": 167 + }, + { + "epoch": 0.06134745298521088, + "grad_norm": 381.0705261230469, + "learning_rate": 6.13138686131387e-06, + "loss": 2.5144, + "step": 168 + }, + { + "epoch": 0.06171261639583714, + "grad_norm": 126.2828598022461, + "learning_rate": 6.1678832116788326e-06, + "loss": 2.073, + "step": 169 + }, + { + "epoch": 0.062077779806463396, + "grad_norm": 211.9131622314453, + "learning_rate": 6.204379562043796e-06, + "loss": 2.4634, + "step": 170 + }, + { + "epoch": 0.062442943217089646, + "grad_norm": 1790.0267333984375, + "learning_rate": 6.240875912408759e-06, + "loss": 2.6631, + "step": 171 + }, + { + "epoch": 0.06280810662771591, + "grad_norm": 6149.90673828125, + "learning_rate": 6.277372262773723e-06, + "loss": 2.8137, + "step": 172 + }, + { + "epoch": 0.06317327003834215, + "grad_norm": 430.6115417480469, + "learning_rate": 6.313868613138686e-06, + "loss": 2.9197, + "step": 173 + }, + { + "epoch": 0.06353843344896841, + "grad_norm": 2553.958740234375, + "learning_rate": 6.35036496350365e-06, + "loss": 2.7192, + "step": 174 + }, + { + "epoch": 0.06390359685959467, + "grad_norm": 653.2872924804688, + "learning_rate": 6.386861313868614e-06, + "loss": 2.3171, + "step": 175 + }, + { + "epoch": 0.06426876027022092, + "grad_norm": 709.2286987304688, + "learning_rate": 6.423357664233577e-06, + "loss": 2.6941, + "step": 176 + }, + { + "epoch": 0.06463392368084718, + "grad_norm": 6361.58740234375, + "learning_rate": 6.4598540145985415e-06, + "loss": 3.2048, + "step": 177 + }, + { + "epoch": 0.06499908709147344, + "grad_norm": 511.78814697265625, + "learning_rate": 6.496350364963504e-06, + "loss": 2.3953, + "step": 178 + }, + { + "epoch": 0.0653642505020997, + "grad_norm": 2117.15576171875, + "learning_rate": 6.532846715328468e-06, + "loss": 2.8042, + "step": 179 + }, + { + "epoch": 0.06572941391272595, + "grad_norm": 448.7147521972656, + "learning_rate": 6.569343065693431e-06, + "loss": 2.3252, + "step": 180 + }, + { + "epoch": 0.0660945773233522, + "grad_norm": 373.40496826171875, + "learning_rate": 6.605839416058395e-06, + "loss": 2.2825, + "step": 181 + }, + { + "epoch": 0.06645974073397845, + "grad_norm": 2423.2509765625, + "learning_rate": 6.6423357664233575e-06, + "loss": 2.665, + "step": 182 + }, + { + "epoch": 0.06682490414460471, + "grad_norm": 810.9012451171875, + "learning_rate": 6.678832116788322e-06, + "loss": 2.3025, + "step": 183 + }, + { + "epoch": 0.06719006755523096, + "grad_norm": 237.67294311523438, + "learning_rate": 6.715328467153285e-06, + "loss": 2.5515, + "step": 184 + }, + { + "epoch": 0.06755523096585722, + "grad_norm": 2281.54345703125, + "learning_rate": 6.751824817518249e-06, + "loss": 2.478, + "step": 185 + }, + { + "epoch": 0.06792039437648348, + "grad_norm": 8467.810546875, + "learning_rate": 6.7883211678832115e-06, + "loss": 2.3552, + "step": 186 + }, + { + "epoch": 0.06828555778710974, + "grad_norm": 262.3053283691406, + "learning_rate": 6.824817518248176e-06, + "loss": 2.1997, + "step": 187 + }, + { + "epoch": 0.06865072119773599, + "grad_norm": 36.84046173095703, + "learning_rate": 6.86131386861314e-06, + "loss": 1.7383, + "step": 188 + }, + { + "epoch": 0.06901588460836225, + "grad_norm": 715.3591918945312, + "learning_rate": 6.897810218978103e-06, + "loss": 2.1892, + "step": 189 + }, + { + "epoch": 0.06938104801898849, + "grad_norm": 1096.4197998046875, + "learning_rate": 6.934306569343066e-06, + "loss": 2.3547, + "step": 190 + }, + { + "epoch": 0.06974621142961475, + "grad_norm": 3342.317138671875, + "learning_rate": 6.970802919708029e-06, + "loss": 2.7402, + "step": 191 + }, + { + "epoch": 0.070111374840241, + "grad_norm": 579.5726318359375, + "learning_rate": 7.007299270072994e-06, + "loss": 2.5376, + "step": 192 + }, + { + "epoch": 0.07047653825086726, + "grad_norm": 4662.072265625, + "learning_rate": 7.043795620437957e-06, + "loss": 2.4165, + "step": 193 + }, + { + "epoch": 0.07084170166149352, + "grad_norm": 74.3611068725586, + "learning_rate": 7.08029197080292e-06, + "loss": 1.8992, + "step": 194 + }, + { + "epoch": 0.07120686507211978, + "grad_norm": 873.2982788085938, + "learning_rate": 7.116788321167883e-06, + "loss": 2.2776, + "step": 195 + }, + { + "epoch": 0.07157202848274603, + "grad_norm": 104.76461791992188, + "learning_rate": 7.153284671532848e-06, + "loss": 2.1392, + "step": 196 + }, + { + "epoch": 0.07193719189337229, + "grad_norm": 13084.2880859375, + "learning_rate": 7.189781021897811e-06, + "loss": 2.0376, + "step": 197 + }, + { + "epoch": 0.07230235530399853, + "grad_norm": 1207.53173828125, + "learning_rate": 7.2262773722627744e-06, + "loss": 2.4053, + "step": 198 + }, + { + "epoch": 0.07266751871462479, + "grad_norm": 453.8113708496094, + "learning_rate": 7.262773722627737e-06, + "loss": 2.1384, + "step": 199 + }, + { + "epoch": 0.07303268212525105, + "grad_norm": 80.14839935302734, + "learning_rate": 7.299270072992701e-06, + "loss": 1.9971, + "step": 200 + }, + { + "epoch": 0.0733978455358773, + "grad_norm": 101.94886016845703, + "learning_rate": 7.335766423357666e-06, + "loss": 1.7673, + "step": 201 + }, + { + "epoch": 0.07376300894650356, + "grad_norm": 333.60888671875, + "learning_rate": 7.3722627737226285e-06, + "loss": 1.9768, + "step": 202 + }, + { + "epoch": 0.07412817235712982, + "grad_norm": 759.494873046875, + "learning_rate": 7.408759124087592e-06, + "loss": 2.28, + "step": 203 + }, + { + "epoch": 0.07449333576775607, + "grad_norm": 364.55377197265625, + "learning_rate": 7.445255474452555e-06, + "loss": 1.7869, + "step": 204 + }, + { + "epoch": 0.07485849917838233, + "grad_norm": 45.348201751708984, + "learning_rate": 7.481751824817519e-06, + "loss": 1.7795, + "step": 205 + }, + { + "epoch": 0.07522366258900857, + "grad_norm": 94.23865509033203, + "learning_rate": 7.5182481751824825e-06, + "loss": 1.8171, + "step": 206 + }, + { + "epoch": 0.07558882599963483, + "grad_norm": 20.518783569335938, + "learning_rate": 7.554744525547446e-06, + "loss": 1.5852, + "step": 207 + }, + { + "epoch": 0.07595398941026109, + "grad_norm": 39.20490646362305, + "learning_rate": 7.591240875912409e-06, + "loss": 1.7422, + "step": 208 + }, + { + "epoch": 0.07631915282088735, + "grad_norm": 27.00390625, + "learning_rate": 7.627737226277373e-06, + "loss": 1.6777, + "step": 209 + }, + { + "epoch": 0.0766843162315136, + "grad_norm": 83.09174346923828, + "learning_rate": 7.664233576642336e-06, + "loss": 1.5854, + "step": 210 + }, + { + "epoch": 0.07704947964213986, + "grad_norm": 34.0961799621582, + "learning_rate": 7.7007299270073e-06, + "loss": 1.7612, + "step": 211 + }, + { + "epoch": 0.07741464305276612, + "grad_norm": 16.32282829284668, + "learning_rate": 7.737226277372264e-06, + "loss": 1.4873, + "step": 212 + }, + { + "epoch": 0.07777980646339237, + "grad_norm": 23.388710021972656, + "learning_rate": 7.773722627737227e-06, + "loss": 1.6042, + "step": 213 + }, + { + "epoch": 0.07814496987401863, + "grad_norm": 14.058931350708008, + "learning_rate": 7.810218978102191e-06, + "loss": 1.4866, + "step": 214 + }, + { + "epoch": 0.07851013328464487, + "grad_norm": 44.53443908691406, + "learning_rate": 7.846715328467154e-06, + "loss": 1.5852, + "step": 215 + }, + { + "epoch": 0.07887529669527113, + "grad_norm": 19.07469940185547, + "learning_rate": 7.883211678832117e-06, + "loss": 1.5669, + "step": 216 + }, + { + "epoch": 0.07924046010589739, + "grad_norm": 25.231599807739258, + "learning_rate": 7.91970802919708e-06, + "loss": 1.6279, + "step": 217 + }, + { + "epoch": 0.07960562351652364, + "grad_norm": 39.810150146484375, + "learning_rate": 7.956204379562045e-06, + "loss": 1.6566, + "step": 218 + }, + { + "epoch": 0.0799707869271499, + "grad_norm": 29.961633682250977, + "learning_rate": 7.992700729927007e-06, + "loss": 1.6455, + "step": 219 + }, + { + "epoch": 0.08033595033777616, + "grad_norm": 28.15206527709961, + "learning_rate": 8.029197080291972e-06, + "loss": 1.595, + "step": 220 + }, + { + "epoch": 0.08070111374840241, + "grad_norm": 19.63481903076172, + "learning_rate": 8.065693430656935e-06, + "loss": 1.4966, + "step": 221 + }, + { + "epoch": 0.08106627715902867, + "grad_norm": 16.381669998168945, + "learning_rate": 8.1021897810219e-06, + "loss": 1.5696, + "step": 222 + }, + { + "epoch": 0.08143144056965491, + "grad_norm": 17.529966354370117, + "learning_rate": 8.138686131386862e-06, + "loss": 1.5007, + "step": 223 + }, + { + "epoch": 0.08179660398028117, + "grad_norm": 14.417710304260254, + "learning_rate": 8.175182481751825e-06, + "loss": 1.4875, + "step": 224 + }, + { + "epoch": 0.08216176739090743, + "grad_norm": 12.957341194152832, + "learning_rate": 8.21167883211679e-06, + "loss": 1.5208, + "step": 225 + }, + { + "epoch": 0.08252693080153368, + "grad_norm": 57.61381912231445, + "learning_rate": 8.248175182481753e-06, + "loss": 1.4958, + "step": 226 + }, + { + "epoch": 0.08289209421215994, + "grad_norm": 11.83564281463623, + "learning_rate": 8.284671532846717e-06, + "loss": 1.4526, + "step": 227 + }, + { + "epoch": 0.0832572576227862, + "grad_norm": 11.580336570739746, + "learning_rate": 8.32116788321168e-06, + "loss": 1.4099, + "step": 228 + }, + { + "epoch": 0.08362242103341246, + "grad_norm": 13.171910285949707, + "learning_rate": 8.357664233576643e-06, + "loss": 1.4248, + "step": 229 + }, + { + "epoch": 0.08398758444403871, + "grad_norm": 19.589122772216797, + "learning_rate": 8.394160583941606e-06, + "loss": 1.541, + "step": 230 + }, + { + "epoch": 0.08435274785466497, + "grad_norm": 18.963899612426758, + "learning_rate": 8.43065693430657e-06, + "loss": 1.4572, + "step": 231 + }, + { + "epoch": 0.08471791126529121, + "grad_norm": 27.862354278564453, + "learning_rate": 8.467153284671533e-06, + "loss": 1.5496, + "step": 232 + }, + { + "epoch": 0.08508307467591747, + "grad_norm": 13.316584587097168, + "learning_rate": 8.503649635036498e-06, + "loss": 1.418, + "step": 233 + }, + { + "epoch": 0.08544823808654373, + "grad_norm": 20.581497192382812, + "learning_rate": 8.54014598540146e-06, + "loss": 1.4546, + "step": 234 + }, + { + "epoch": 0.08581340149716998, + "grad_norm": 16.90022087097168, + "learning_rate": 8.576642335766423e-06, + "loss": 1.4329, + "step": 235 + }, + { + "epoch": 0.08617856490779624, + "grad_norm": 6.15281343460083, + "learning_rate": 8.613138686131386e-06, + "loss": 1.3525, + "step": 236 + }, + { + "epoch": 0.0865437283184225, + "grad_norm": 10.208907127380371, + "learning_rate": 8.649635036496351e-06, + "loss": 1.3865, + "step": 237 + }, + { + "epoch": 0.08690889172904875, + "grad_norm": 12.612737655639648, + "learning_rate": 8.686131386861315e-06, + "loss": 1.3468, + "step": 238 + }, + { + "epoch": 0.08727405513967501, + "grad_norm": 7.821756362915039, + "learning_rate": 8.722627737226278e-06, + "loss": 1.4331, + "step": 239 + }, + { + "epoch": 0.08763921855030125, + "grad_norm": 12.956961631774902, + "learning_rate": 8.759124087591241e-06, + "loss": 1.4187, + "step": 240 + }, + { + "epoch": 0.08800438196092751, + "grad_norm": 11.274295806884766, + "learning_rate": 8.795620437956204e-06, + "loss": 1.4182, + "step": 241 + }, + { + "epoch": 0.08836954537155377, + "grad_norm": 10.392095565795898, + "learning_rate": 8.832116788321169e-06, + "loss": 1.4207, + "step": 242 + }, + { + "epoch": 0.08873470878218002, + "grad_norm": 6.782304286956787, + "learning_rate": 8.868613138686132e-06, + "loss": 1.3278, + "step": 243 + }, + { + "epoch": 0.08909987219280628, + "grad_norm": 5.543254852294922, + "learning_rate": 8.905109489051096e-06, + "loss": 1.353, + "step": 244 + }, + { + "epoch": 0.08946503560343254, + "grad_norm": 4.561470031738281, + "learning_rate": 8.941605839416059e-06, + "loss": 1.3635, + "step": 245 + }, + { + "epoch": 0.0898301990140588, + "grad_norm": 23.20775604248047, + "learning_rate": 8.978102189781024e-06, + "loss": 1.467, + "step": 246 + }, + { + "epoch": 0.09019536242468505, + "grad_norm": 4.162003040313721, + "learning_rate": 9.014598540145986e-06, + "loss": 1.3699, + "step": 247 + }, + { + "epoch": 0.0905605258353113, + "grad_norm": 8.721848487854004, + "learning_rate": 9.05109489051095e-06, + "loss": 1.4294, + "step": 248 + }, + { + "epoch": 0.09092568924593755, + "grad_norm": 14.426276206970215, + "learning_rate": 9.087591240875912e-06, + "loss": 1.3287, + "step": 249 + }, + { + "epoch": 0.09129085265656381, + "grad_norm": 10.680879592895508, + "learning_rate": 9.124087591240877e-06, + "loss": 1.3496, + "step": 250 + }, + { + "epoch": 0.09165601606719007, + "grad_norm": 13.638522148132324, + "learning_rate": 9.160583941605841e-06, + "loss": 1.3894, + "step": 251 + }, + { + "epoch": 0.09202117947781632, + "grad_norm": 5.008195877075195, + "learning_rate": 9.197080291970804e-06, + "loss": 1.3525, + "step": 252 + }, + { + "epoch": 0.09238634288844258, + "grad_norm": 9.899373054504395, + "learning_rate": 9.233576642335767e-06, + "loss": 1.3335, + "step": 253 + }, + { + "epoch": 0.09275150629906884, + "grad_norm": 4.918022155761719, + "learning_rate": 9.27007299270073e-06, + "loss": 1.3196, + "step": 254 + }, + { + "epoch": 0.0931166697096951, + "grad_norm": 4.3167595863342285, + "learning_rate": 9.306569343065694e-06, + "loss": 1.3231, + "step": 255 + }, + { + "epoch": 0.09348183312032135, + "grad_norm": 3.6625378131866455, + "learning_rate": 9.343065693430657e-06, + "loss": 1.3268, + "step": 256 + }, + { + "epoch": 0.0938469965309476, + "grad_norm": 9.615107536315918, + "learning_rate": 9.379562043795622e-06, + "loss": 1.3452, + "step": 257 + }, + { + "epoch": 0.09421215994157385, + "grad_norm": 4.574172019958496, + "learning_rate": 9.416058394160585e-06, + "loss": 1.3042, + "step": 258 + }, + { + "epoch": 0.09457732335220011, + "grad_norm": 6.877755641937256, + "learning_rate": 9.452554744525548e-06, + "loss": 1.3171, + "step": 259 + }, + { + "epoch": 0.09494248676282636, + "grad_norm": 4.867911338806152, + "learning_rate": 9.48905109489051e-06, + "loss": 1.3306, + "step": 260 + }, + { + "epoch": 0.09530765017345262, + "grad_norm": 6.512596607208252, + "learning_rate": 9.525547445255475e-06, + "loss": 1.2874, + "step": 261 + }, + { + "epoch": 0.09567281358407888, + "grad_norm": 3.648726463317871, + "learning_rate": 9.56204379562044e-06, + "loss": 1.3174, + "step": 262 + }, + { + "epoch": 0.09603797699470513, + "grad_norm": 5.357989311218262, + "learning_rate": 9.598540145985402e-06, + "loss": 1.3412, + "step": 263 + }, + { + "epoch": 0.09640314040533139, + "grad_norm": 10.861003875732422, + "learning_rate": 9.635036496350367e-06, + "loss": 1.296, + "step": 264 + }, + { + "epoch": 0.09676830381595763, + "grad_norm": 12.365477561950684, + "learning_rate": 9.67153284671533e-06, + "loss": 1.3271, + "step": 265 + }, + { + "epoch": 0.09713346722658389, + "grad_norm": 7.079929351806641, + "learning_rate": 9.708029197080293e-06, + "loss": 1.2974, + "step": 266 + }, + { + "epoch": 0.09749863063721015, + "grad_norm": 4.8558430671691895, + "learning_rate": 9.744525547445256e-06, + "loss": 1.2659, + "step": 267 + }, + { + "epoch": 0.0978637940478364, + "grad_norm": 3.611086130142212, + "learning_rate": 9.78102189781022e-06, + "loss": 1.3125, + "step": 268 + }, + { + "epoch": 0.09822895745846266, + "grad_norm": 4.1291584968566895, + "learning_rate": 9.817518248175183e-06, + "loss": 1.3298, + "step": 269 + }, + { + "epoch": 0.09859412086908892, + "grad_norm": 6.496428966522217, + "learning_rate": 9.854014598540148e-06, + "loss": 1.262, + "step": 270 + }, + { + "epoch": 0.09895928427971518, + "grad_norm": 5.551502227783203, + "learning_rate": 9.89051094890511e-06, + "loss": 1.2892, + "step": 271 + }, + { + "epoch": 0.09932444769034143, + "grad_norm": 2.4452531337738037, + "learning_rate": 9.927007299270073e-06, + "loss": 1.3044, + "step": 272 + }, + { + "epoch": 0.09968961110096769, + "grad_norm": 8.159255981445312, + "learning_rate": 9.963503649635036e-06, + "loss": 1.314, + "step": 273 + }, + { + "epoch": 0.10005477451159393, + "grad_norm": 3.99063777923584, + "learning_rate": 1e-05, + "loss": 1.2576, + "step": 274 + }, + { + "epoch": 0.10041993792222019, + "grad_norm": 1.5462865829467773, + "learning_rate": 1.0036496350364964e-05, + "loss": 1.2864, + "step": 275 + }, + { + "epoch": 0.10078510133284645, + "grad_norm": 3.204332113265991, + "learning_rate": 1.0072992700729928e-05, + "loss": 1.2542, + "step": 276 + }, + { + "epoch": 0.1011502647434727, + "grad_norm": 2.300490140914917, + "learning_rate": 1.0109489051094891e-05, + "loss": 1.3162, + "step": 277 + }, + { + "epoch": 0.10151542815409896, + "grad_norm": 4.768906593322754, + "learning_rate": 1.0145985401459854e-05, + "loss": 1.2646, + "step": 278 + }, + { + "epoch": 0.10188059156472522, + "grad_norm": 4.017237186431885, + "learning_rate": 1.0182481751824817e-05, + "loss": 1.2429, + "step": 279 + }, + { + "epoch": 0.10224575497535147, + "grad_norm": 4.3871917724609375, + "learning_rate": 1.0218978102189783e-05, + "loss": 1.2776, + "step": 280 + }, + { + "epoch": 0.10261091838597773, + "grad_norm": 3.5851235389709473, + "learning_rate": 1.0255474452554746e-05, + "loss": 1.3145, + "step": 281 + }, + { + "epoch": 0.10297608179660397, + "grad_norm": 2.6969120502471924, + "learning_rate": 1.0291970802919709e-05, + "loss": 1.2322, + "step": 282 + }, + { + "epoch": 0.10334124520723023, + "grad_norm": 2.2774250507354736, + "learning_rate": 1.0328467153284672e-05, + "loss": 1.2477, + "step": 283 + }, + { + "epoch": 0.10370640861785649, + "grad_norm": 2.5099644660949707, + "learning_rate": 1.0364963503649636e-05, + "loss": 1.2175, + "step": 284 + }, + { + "epoch": 0.10407157202848275, + "grad_norm": 5.502248764038086, + "learning_rate": 1.04014598540146e-05, + "loss": 1.2748, + "step": 285 + }, + { + "epoch": 0.104436735439109, + "grad_norm": 2.743541717529297, + "learning_rate": 1.0437956204379562e-05, + "loss": 1.2744, + "step": 286 + }, + { + "epoch": 0.10480189884973526, + "grad_norm": 3.1978044509887695, + "learning_rate": 1.0474452554744528e-05, + "loss": 1.2834, + "step": 287 + }, + { + "epoch": 0.10516706226036152, + "grad_norm": 1.869937539100647, + "learning_rate": 1.0510948905109491e-05, + "loss": 1.3057, + "step": 288 + }, + { + "epoch": 0.10553222567098777, + "grad_norm": 2.7147998809814453, + "learning_rate": 1.0547445255474454e-05, + "loss": 1.2305, + "step": 289 + }, + { + "epoch": 0.10589738908161402, + "grad_norm": 1.5068933963775635, + "learning_rate": 1.0583941605839417e-05, + "loss": 1.2383, + "step": 290 + }, + { + "epoch": 0.10626255249224027, + "grad_norm": 2.4968197345733643, + "learning_rate": 1.0620437956204381e-05, + "loss": 1.2522, + "step": 291 + }, + { + "epoch": 0.10662771590286653, + "grad_norm": 2.359579086303711, + "learning_rate": 1.0656934306569344e-05, + "loss": 1.2708, + "step": 292 + }, + { + "epoch": 0.10699287931349279, + "grad_norm": 1.5981340408325195, + "learning_rate": 1.0693430656934307e-05, + "loss": 1.2152, + "step": 293 + }, + { + "epoch": 0.10735804272411904, + "grad_norm": 2.902353048324585, + "learning_rate": 1.072992700729927e-05, + "loss": 1.2087, + "step": 294 + }, + { + "epoch": 0.1077232061347453, + "grad_norm": 1.5850776433944702, + "learning_rate": 1.0766423357664235e-05, + "loss": 1.2211, + "step": 295 + }, + { + "epoch": 0.10808836954537156, + "grad_norm": 1.5473744869232178, + "learning_rate": 1.0802919708029198e-05, + "loss": 1.2305, + "step": 296 + }, + { + "epoch": 0.10845353295599781, + "grad_norm": 2.7115235328674316, + "learning_rate": 1.083941605839416e-05, + "loss": 1.1915, + "step": 297 + }, + { + "epoch": 0.10881869636662407, + "grad_norm": 1.5588639974594116, + "learning_rate": 1.0875912408759123e-05, + "loss": 1.1577, + "step": 298 + }, + { + "epoch": 0.10918385977725031, + "grad_norm": 1.6646748781204224, + "learning_rate": 1.091240875912409e-05, + "loss": 1.1941, + "step": 299 + }, + { + "epoch": 0.10954902318787657, + "grad_norm": 1.1305725574493408, + "learning_rate": 1.0948905109489052e-05, + "loss": 1.2269, + "step": 300 + }, + { + "epoch": 0.10991418659850283, + "grad_norm": 2.397463083267212, + "learning_rate": 1.0985401459854015e-05, + "loss": 1.2149, + "step": 301 + }, + { + "epoch": 0.11027935000912908, + "grad_norm": 1.4366761445999146, + "learning_rate": 1.102189781021898e-05, + "loss": 1.1731, + "step": 302 + }, + { + "epoch": 0.11064451341975534, + "grad_norm": 1.9387191534042358, + "learning_rate": 1.1058394160583943e-05, + "loss": 1.2324, + "step": 303 + }, + { + "epoch": 0.1110096768303816, + "grad_norm": 1.9468061923980713, + "learning_rate": 1.1094890510948906e-05, + "loss": 1.2415, + "step": 304 + }, + { + "epoch": 0.11137484024100786, + "grad_norm": 1.364353895187378, + "learning_rate": 1.1131386861313868e-05, + "loss": 1.2037, + "step": 305 + }, + { + "epoch": 0.11174000365163411, + "grad_norm": 1.462504267692566, + "learning_rate": 1.1167883211678833e-05, + "loss": 1.1635, + "step": 306 + }, + { + "epoch": 0.11210516706226036, + "grad_norm": 2.742152214050293, + "learning_rate": 1.1204379562043798e-05, + "loss": 1.209, + "step": 307 + }, + { + "epoch": 0.11247033047288661, + "grad_norm": 1.291460633277893, + "learning_rate": 1.124087591240876e-05, + "loss": 1.2004, + "step": 308 + }, + { + "epoch": 0.11283549388351287, + "grad_norm": 1.4753674268722534, + "learning_rate": 1.1277372262773723e-05, + "loss": 1.1941, + "step": 309 + }, + { + "epoch": 0.11320065729413913, + "grad_norm": 1.2697490453720093, + "learning_rate": 1.1313868613138688e-05, + "loss": 1.1705, + "step": 310 + }, + { + "epoch": 0.11356582070476538, + "grad_norm": 1.685782790184021, + "learning_rate": 1.135036496350365e-05, + "loss": 1.1661, + "step": 311 + }, + { + "epoch": 0.11393098411539164, + "grad_norm": 1.2983890771865845, + "learning_rate": 1.1386861313868614e-05, + "loss": 1.1893, + "step": 312 + }, + { + "epoch": 0.1142961475260179, + "grad_norm": 1.7487248182296753, + "learning_rate": 1.1423357664233578e-05, + "loss": 1.1729, + "step": 313 + }, + { + "epoch": 0.11466131093664415, + "grad_norm": 1.314300775527954, + "learning_rate": 1.1459854014598541e-05, + "loss": 1.1914, + "step": 314 + }, + { + "epoch": 0.11502647434727041, + "grad_norm": 1.6196850538253784, + "learning_rate": 1.1496350364963504e-05, + "loss": 1.1984, + "step": 315 + }, + { + "epoch": 0.11539163775789665, + "grad_norm": 1.3013982772827148, + "learning_rate": 1.1532846715328467e-05, + "loss": 1.1758, + "step": 316 + }, + { + "epoch": 0.11575680116852291, + "grad_norm": 1.7583945989608765, + "learning_rate": 1.1569343065693433e-05, + "loss": 1.1852, + "step": 317 + }, + { + "epoch": 0.11612196457914917, + "grad_norm": 1.5669349431991577, + "learning_rate": 1.1605839416058396e-05, + "loss": 1.1742, + "step": 318 + }, + { + "epoch": 0.11648712798977542, + "grad_norm": 1.3974933624267578, + "learning_rate": 1.1642335766423359e-05, + "loss": 1.1686, + "step": 319 + }, + { + "epoch": 0.11685229140040168, + "grad_norm": 1.367573618888855, + "learning_rate": 1.1678832116788322e-05, + "loss": 1.2064, + "step": 320 + }, + { + "epoch": 0.11721745481102794, + "grad_norm": 1.2772947549819946, + "learning_rate": 1.1715328467153286e-05, + "loss": 1.1616, + "step": 321 + }, + { + "epoch": 0.1175826182216542, + "grad_norm": 1.2221980094909668, + "learning_rate": 1.1751824817518249e-05, + "loss": 1.1732, + "step": 322 + }, + { + "epoch": 0.11794778163228045, + "grad_norm": 1.249281883239746, + "learning_rate": 1.1788321167883212e-05, + "loss": 1.1816, + "step": 323 + }, + { + "epoch": 0.1183129450429067, + "grad_norm": 1.674682378768921, + "learning_rate": 1.1824817518248176e-05, + "loss": 1.1648, + "step": 324 + }, + { + "epoch": 0.11867810845353295, + "grad_norm": 1.277409315109253, + "learning_rate": 1.186131386861314e-05, + "loss": 1.1906, + "step": 325 + }, + { + "epoch": 0.11904327186415921, + "grad_norm": 1.164598822593689, + "learning_rate": 1.1897810218978102e-05, + "loss": 1.1603, + "step": 326 + }, + { + "epoch": 0.11940843527478547, + "grad_norm": 1.5280879735946655, + "learning_rate": 1.1934306569343067e-05, + "loss": 1.1841, + "step": 327 + }, + { + "epoch": 0.11977359868541172, + "grad_norm": 1.3797391653060913, + "learning_rate": 1.1970802919708031e-05, + "loss": 1.1871, + "step": 328 + }, + { + "epoch": 0.12013876209603798, + "grad_norm": 1.0593525171279907, + "learning_rate": 1.2007299270072994e-05, + "loss": 1.1467, + "step": 329 + }, + { + "epoch": 0.12050392550666424, + "grad_norm": 1.0836522579193115, + "learning_rate": 1.2043795620437957e-05, + "loss": 1.1177, + "step": 330 + }, + { + "epoch": 0.1208690889172905, + "grad_norm": 0.9108336567878723, + "learning_rate": 1.208029197080292e-05, + "loss": 1.1685, + "step": 331 + }, + { + "epoch": 0.12123425232791674, + "grad_norm": 1.3441603183746338, + "learning_rate": 1.2116788321167885e-05, + "loss": 1.1779, + "step": 332 + }, + { + "epoch": 0.12159941573854299, + "grad_norm": 1.3265718221664429, + "learning_rate": 1.2153284671532847e-05, + "loss": 1.1953, + "step": 333 + }, + { + "epoch": 0.12196457914916925, + "grad_norm": 1.2688337564468384, + "learning_rate": 1.218978102189781e-05, + "loss": 1.165, + "step": 334 + }, + { + "epoch": 0.1223297425597955, + "grad_norm": 1.2843410968780518, + "learning_rate": 1.2226277372262773e-05, + "loss": 1.1614, + "step": 335 + }, + { + "epoch": 0.12269490597042176, + "grad_norm": 1.04667329788208, + "learning_rate": 1.226277372262774e-05, + "loss": 1.1503, + "step": 336 + }, + { + "epoch": 0.12306006938104802, + "grad_norm": 1.1412992477416992, + "learning_rate": 1.2299270072992702e-05, + "loss": 1.1467, + "step": 337 + }, + { + "epoch": 0.12342523279167428, + "grad_norm": 1.4156609773635864, + "learning_rate": 1.2335766423357665e-05, + "loss": 1.1447, + "step": 338 + }, + { + "epoch": 0.12379039620230053, + "grad_norm": 1.8062715530395508, + "learning_rate": 1.237226277372263e-05, + "loss": 1.1702, + "step": 339 + }, + { + "epoch": 0.12415555961292679, + "grad_norm": 1.2953407764434814, + "learning_rate": 1.2408759124087593e-05, + "loss": 1.1246, + "step": 340 + }, + { + "epoch": 0.12452072302355303, + "grad_norm": 1.685192346572876, + "learning_rate": 1.2445255474452555e-05, + "loss": 1.1724, + "step": 341 + }, + { + "epoch": 0.12488588643417929, + "grad_norm": 0.9900994896888733, + "learning_rate": 1.2481751824817518e-05, + "loss": 1.127, + "step": 342 + }, + { + "epoch": 0.12525104984480556, + "grad_norm": 1.2946479320526123, + "learning_rate": 1.2518248175182483e-05, + "loss": 1.1466, + "step": 343 + }, + { + "epoch": 0.12561621325543182, + "grad_norm": 1.1135441064834595, + "learning_rate": 1.2554744525547446e-05, + "loss": 1.1188, + "step": 344 + }, + { + "epoch": 0.12598137666605805, + "grad_norm": 1.2259373664855957, + "learning_rate": 1.2591240875912409e-05, + "loss": 1.1257, + "step": 345 + }, + { + "epoch": 0.1263465400766843, + "grad_norm": 1.0781917572021484, + "learning_rate": 1.2627737226277371e-05, + "loss": 1.1167, + "step": 346 + }, + { + "epoch": 0.12671170348731056, + "grad_norm": 1.1509932279586792, + "learning_rate": 1.2664233576642338e-05, + "loss": 1.1675, + "step": 347 + }, + { + "epoch": 0.12707686689793682, + "grad_norm": 1.7564834356307983, + "learning_rate": 1.27007299270073e-05, + "loss": 1.1327, + "step": 348 + }, + { + "epoch": 0.12744203030856308, + "grad_norm": 1.068116307258606, + "learning_rate": 1.2737226277372263e-05, + "loss": 1.1166, + "step": 349 + }, + { + "epoch": 0.12780719371918933, + "grad_norm": 1.2166095972061157, + "learning_rate": 1.2773722627737228e-05, + "loss": 1.1189, + "step": 350 + }, + { + "epoch": 0.1281723571298156, + "grad_norm": 1.385166049003601, + "learning_rate": 1.2810218978102191e-05, + "loss": 1.1252, + "step": 351 + }, + { + "epoch": 0.12853752054044185, + "grad_norm": 0.9813831448554993, + "learning_rate": 1.2846715328467154e-05, + "loss": 1.1289, + "step": 352 + }, + { + "epoch": 0.1289026839510681, + "grad_norm": 1.0689971446990967, + "learning_rate": 1.2883211678832117e-05, + "loss": 1.0798, + "step": 353 + }, + { + "epoch": 0.12926784736169436, + "grad_norm": 1.1747773885726929, + "learning_rate": 1.2919708029197083e-05, + "loss": 1.1304, + "step": 354 + }, + { + "epoch": 0.12963301077232062, + "grad_norm": 0.9878880381584167, + "learning_rate": 1.2956204379562046e-05, + "loss": 1.12, + "step": 355 + }, + { + "epoch": 0.12999817418294687, + "grad_norm": 1.2414008378982544, + "learning_rate": 1.2992700729927009e-05, + "loss": 1.1359, + "step": 356 + }, + { + "epoch": 0.13036333759357313, + "grad_norm": 1.0068449974060059, + "learning_rate": 1.3029197080291972e-05, + "loss": 1.1094, + "step": 357 + }, + { + "epoch": 0.1307285010041994, + "grad_norm": 1.1552882194519043, + "learning_rate": 1.3065693430656936e-05, + "loss": 1.1625, + "step": 358 + }, + { + "epoch": 0.13109366441482564, + "grad_norm": 1.1237143278121948, + "learning_rate": 1.3102189781021899e-05, + "loss": 1.1239, + "step": 359 + }, + { + "epoch": 0.1314588278254519, + "grad_norm": 1.3414310216903687, + "learning_rate": 1.3138686131386862e-05, + "loss": 1.1241, + "step": 360 + }, + { + "epoch": 0.13182399123607816, + "grad_norm": 1.0340030193328857, + "learning_rate": 1.3175182481751825e-05, + "loss": 1.1146, + "step": 361 + }, + { + "epoch": 0.1321891546467044, + "grad_norm": 1.00321626663208, + "learning_rate": 1.321167883211679e-05, + "loss": 1.1484, + "step": 362 + }, + { + "epoch": 0.13255431805733064, + "grad_norm": 1.6498593091964722, + "learning_rate": 1.3248175182481752e-05, + "loss": 1.1301, + "step": 363 + }, + { + "epoch": 0.1329194814679569, + "grad_norm": 1.1106163263320923, + "learning_rate": 1.3284671532846715e-05, + "loss": 1.1046, + "step": 364 + }, + { + "epoch": 0.13328464487858316, + "grad_norm": 1.888108253479004, + "learning_rate": 1.3321167883211681e-05, + "loss": 1.0774, + "step": 365 + }, + { + "epoch": 0.13364980828920942, + "grad_norm": 1.613932728767395, + "learning_rate": 1.3357664233576644e-05, + "loss": 1.1235, + "step": 366 + }, + { + "epoch": 0.13401497169983567, + "grad_norm": 1.3553868532180786, + "learning_rate": 1.3394160583941607e-05, + "loss": 1.1141, + "step": 367 + }, + { + "epoch": 0.13438013511046193, + "grad_norm": 1.1510350704193115, + "learning_rate": 1.343065693430657e-05, + "loss": 1.0662, + "step": 368 + }, + { + "epoch": 0.13474529852108819, + "grad_norm": 1.0214452743530273, + "learning_rate": 1.3467153284671534e-05, + "loss": 1.1061, + "step": 369 + }, + { + "epoch": 0.13511046193171444, + "grad_norm": 1.2430360317230225, + "learning_rate": 1.3503649635036497e-05, + "loss": 1.0872, + "step": 370 + }, + { + "epoch": 0.1354756253423407, + "grad_norm": 0.8896999359130859, + "learning_rate": 1.354014598540146e-05, + "loss": 1.0865, + "step": 371 + }, + { + "epoch": 0.13584078875296696, + "grad_norm": 1.4086633920669556, + "learning_rate": 1.3576642335766423e-05, + "loss": 1.1355, + "step": 372 + }, + { + "epoch": 0.1362059521635932, + "grad_norm": 1.149206519126892, + "learning_rate": 1.361313868613139e-05, + "loss": 1.0841, + "step": 373 + }, + { + "epoch": 0.13657111557421947, + "grad_norm": 1.3880298137664795, + "learning_rate": 1.3649635036496352e-05, + "loss": 1.1204, + "step": 374 + }, + { + "epoch": 0.13693627898484573, + "grad_norm": 1.1958924531936646, + "learning_rate": 1.3686131386861315e-05, + "loss": 1.0629, + "step": 375 + }, + { + "epoch": 0.13730144239547198, + "grad_norm": 1.2898317575454712, + "learning_rate": 1.372262773722628e-05, + "loss": 1.1003, + "step": 376 + }, + { + "epoch": 0.13766660580609824, + "grad_norm": 1.1109548807144165, + "learning_rate": 1.3759124087591242e-05, + "loss": 1.1147, + "step": 377 + }, + { + "epoch": 0.1380317692167245, + "grad_norm": 1.3594517707824707, + "learning_rate": 1.3795620437956205e-05, + "loss": 1.1285, + "step": 378 + }, + { + "epoch": 0.13839693262735073, + "grad_norm": 0.9975214004516602, + "learning_rate": 1.3832116788321168e-05, + "loss": 1.0709, + "step": 379 + }, + { + "epoch": 0.13876209603797698, + "grad_norm": 1.0834999084472656, + "learning_rate": 1.3868613138686133e-05, + "loss": 1.0236, + "step": 380 + }, + { + "epoch": 0.13912725944860324, + "grad_norm": 1.2681293487548828, + "learning_rate": 1.3905109489051096e-05, + "loss": 1.0898, + "step": 381 + }, + { + "epoch": 0.1394924228592295, + "grad_norm": 1.375718593597412, + "learning_rate": 1.3941605839416059e-05, + "loss": 1.1216, + "step": 382 + }, + { + "epoch": 0.13985758626985575, + "grad_norm": 0.9331570863723755, + "learning_rate": 1.3978102189781021e-05, + "loss": 1.0582, + "step": 383 + }, + { + "epoch": 0.140222749680482, + "grad_norm": 1.582811713218689, + "learning_rate": 1.4014598540145988e-05, + "loss": 1.0657, + "step": 384 + }, + { + "epoch": 0.14058791309110827, + "grad_norm": 1.0264136791229248, + "learning_rate": 1.405109489051095e-05, + "loss": 1.0863, + "step": 385 + }, + { + "epoch": 0.14095307650173453, + "grad_norm": 0.8130198121070862, + "learning_rate": 1.4087591240875913e-05, + "loss": 1.0668, + "step": 386 + }, + { + "epoch": 0.14131823991236078, + "grad_norm": 1.1384094953536987, + "learning_rate": 1.4124087591240878e-05, + "loss": 1.12, + "step": 387 + }, + { + "epoch": 0.14168340332298704, + "grad_norm": 0.7725839018821716, + "learning_rate": 1.416058394160584e-05, + "loss": 1.0846, + "step": 388 + }, + { + "epoch": 0.1420485667336133, + "grad_norm": 0.966789186000824, + "learning_rate": 1.4197080291970804e-05, + "loss": 1.0905, + "step": 389 + }, + { + "epoch": 0.14241373014423955, + "grad_norm": 2.1811580657958984, + "learning_rate": 1.4233576642335767e-05, + "loss": 1.129, + "step": 390 + }, + { + "epoch": 0.1427788935548658, + "grad_norm": 1.425172209739685, + "learning_rate": 1.4270072992700733e-05, + "loss": 1.0992, + "step": 391 + }, + { + "epoch": 0.14314405696549207, + "grad_norm": 0.9881778359413147, + "learning_rate": 1.4306569343065696e-05, + "loss": 1.0516, + "step": 392 + }, + { + "epoch": 0.14350922037611832, + "grad_norm": 0.9002101421356201, + "learning_rate": 1.4343065693430659e-05, + "loss": 1.0531, + "step": 393 + }, + { + "epoch": 0.14387438378674458, + "grad_norm": 1.1981589794158936, + "learning_rate": 1.4379562043795621e-05, + "loss": 1.0902, + "step": 394 + }, + { + "epoch": 0.1442395471973708, + "grad_norm": 1.5116416215896606, + "learning_rate": 1.4416058394160586e-05, + "loss": 1.1179, + "step": 395 + }, + { + "epoch": 0.14460471060799707, + "grad_norm": 0.9920293688774109, + "learning_rate": 1.4452554744525549e-05, + "loss": 1.0645, + "step": 396 + }, + { + "epoch": 0.14496987401862332, + "grad_norm": 1.0004264116287231, + "learning_rate": 1.4489051094890512e-05, + "loss": 1.0924, + "step": 397 + }, + { + "epoch": 0.14533503742924958, + "grad_norm": 0.9262351393699646, + "learning_rate": 1.4525547445255475e-05, + "loss": 1.0723, + "step": 398 + }, + { + "epoch": 0.14570020083987584, + "grad_norm": 1.1073780059814453, + "learning_rate": 1.456204379562044e-05, + "loss": 1.048, + "step": 399 + }, + { + "epoch": 0.1460653642505021, + "grad_norm": 1.540772557258606, + "learning_rate": 1.4598540145985402e-05, + "loss": 1.0638, + "step": 400 + }, + { + "epoch": 0.14643052766112835, + "grad_norm": 1.0480707883834839, + "learning_rate": 1.4635036496350365e-05, + "loss": 1.0188, + "step": 401 + }, + { + "epoch": 0.1467956910717546, + "grad_norm": 0.9333097338676453, + "learning_rate": 1.4671532846715331e-05, + "loss": 1.0465, + "step": 402 + }, + { + "epoch": 0.14716085448238087, + "grad_norm": 1.100213646888733, + "learning_rate": 1.4708029197080294e-05, + "loss": 1.0583, + "step": 403 + }, + { + "epoch": 0.14752601789300712, + "grad_norm": 1.2130063772201538, + "learning_rate": 1.4744525547445257e-05, + "loss": 1.0928, + "step": 404 + }, + { + "epoch": 0.14789118130363338, + "grad_norm": 1.29470956325531, + "learning_rate": 1.478102189781022e-05, + "loss": 1.0491, + "step": 405 + }, + { + "epoch": 0.14825634471425964, + "grad_norm": 1.0126339197158813, + "learning_rate": 1.4817518248175184e-05, + "loss": 1.08, + "step": 406 + }, + { + "epoch": 0.1486215081248859, + "grad_norm": 1.1530481576919556, + "learning_rate": 1.4854014598540147e-05, + "loss": 1.0546, + "step": 407 + }, + { + "epoch": 0.14898667153551215, + "grad_norm": 0.9986507892608643, + "learning_rate": 1.489051094890511e-05, + "loss": 1.0809, + "step": 408 + }, + { + "epoch": 0.1493518349461384, + "grad_norm": 0.8909332156181335, + "learning_rate": 1.4927007299270073e-05, + "loss": 1.073, + "step": 409 + }, + { + "epoch": 0.14971699835676466, + "grad_norm": 1.0050809383392334, + "learning_rate": 1.4963503649635038e-05, + "loss": 1.0765, + "step": 410 + }, + { + "epoch": 0.15008216176739092, + "grad_norm": 1.157792329788208, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.1008, + "step": 411 + }, + { + "epoch": 0.15044732517801715, + "grad_norm": 0.9719278812408447, + "learning_rate": 1.5036496350364965e-05, + "loss": 1.0743, + "step": 412 + }, + { + "epoch": 0.1508124885886434, + "grad_norm": 1.02153742313385, + "learning_rate": 1.507299270072993e-05, + "loss": 1.0713, + "step": 413 + }, + { + "epoch": 0.15117765199926966, + "grad_norm": 1.0733529329299927, + "learning_rate": 1.5109489051094892e-05, + "loss": 1.0425, + "step": 414 + }, + { + "epoch": 0.15154281540989592, + "grad_norm": 1.4467898607254028, + "learning_rate": 1.5145985401459855e-05, + "loss": 1.0861, + "step": 415 + }, + { + "epoch": 0.15190797882052218, + "grad_norm": 1.0643540620803833, + "learning_rate": 1.5182481751824818e-05, + "loss": 1.0444, + "step": 416 + }, + { + "epoch": 0.15227314223114843, + "grad_norm": 0.9217526912689209, + "learning_rate": 1.5218978102189783e-05, + "loss": 1.0559, + "step": 417 + }, + { + "epoch": 0.1526383056417747, + "grad_norm": 0.9161515831947327, + "learning_rate": 1.5255474452554746e-05, + "loss": 1.0404, + "step": 418 + }, + { + "epoch": 0.15300346905240095, + "grad_norm": 1.1920078992843628, + "learning_rate": 1.529197080291971e-05, + "loss": 1.0243, + "step": 419 + }, + { + "epoch": 0.1533686324630272, + "grad_norm": 1.1917927265167236, + "learning_rate": 1.5328467153284673e-05, + "loss": 1.0822, + "step": 420 + }, + { + "epoch": 0.15373379587365346, + "grad_norm": 0.8970569968223572, + "learning_rate": 1.5364963503649638e-05, + "loss": 1.0408, + "step": 421 + }, + { + "epoch": 0.15409895928427972, + "grad_norm": 1.2498112916946411, + "learning_rate": 1.54014598540146e-05, + "loss": 1.0618, + "step": 422 + }, + { + "epoch": 0.15446412269490598, + "grad_norm": 0.9885013699531555, + "learning_rate": 1.5437956204379563e-05, + "loss": 1.0234, + "step": 423 + }, + { + "epoch": 0.15482928610553223, + "grad_norm": 1.7543072700500488, + "learning_rate": 1.5474452554744528e-05, + "loss": 1.0657, + "step": 424 + }, + { + "epoch": 0.1551944495161585, + "grad_norm": 1.1781865358352661, + "learning_rate": 1.5510948905109492e-05, + "loss": 1.066, + "step": 425 + }, + { + "epoch": 0.15555961292678475, + "grad_norm": 1.1695396900177002, + "learning_rate": 1.5547445255474454e-05, + "loss": 1.0443, + "step": 426 + }, + { + "epoch": 0.155924776337411, + "grad_norm": 1.2203136682510376, + "learning_rate": 1.5583941605839418e-05, + "loss": 1.0295, + "step": 427 + }, + { + "epoch": 0.15628993974803726, + "grad_norm": 1.3869448900222778, + "learning_rate": 1.5620437956204383e-05, + "loss": 1.0327, + "step": 428 + }, + { + "epoch": 0.1566551031586635, + "grad_norm": 0.9070996642112732, + "learning_rate": 1.5656934306569344e-05, + "loss": 1.0391, + "step": 429 + }, + { + "epoch": 0.15702026656928975, + "grad_norm": 1.054938554763794, + "learning_rate": 1.569343065693431e-05, + "loss": 1.0491, + "step": 430 + }, + { + "epoch": 0.157385429979916, + "grad_norm": 1.1217361688613892, + "learning_rate": 1.572992700729927e-05, + "loss": 1.0445, + "step": 431 + }, + { + "epoch": 0.15775059339054226, + "grad_norm": 0.9165348410606384, + "learning_rate": 1.5766423357664234e-05, + "loss": 1.0095, + "step": 432 + }, + { + "epoch": 0.15811575680116852, + "grad_norm": 0.9862052798271179, + "learning_rate": 1.58029197080292e-05, + "loss": 1.0411, + "step": 433 + }, + { + "epoch": 0.15848092021179477, + "grad_norm": 1.22489595413208, + "learning_rate": 1.583941605839416e-05, + "loss": 0.9929, + "step": 434 + }, + { + "epoch": 0.15884608362242103, + "grad_norm": 1.3611830472946167, + "learning_rate": 1.5875912408759125e-05, + "loss": 1.0294, + "step": 435 + }, + { + "epoch": 0.1592112470330473, + "grad_norm": 1.1147099733352661, + "learning_rate": 1.591240875912409e-05, + "loss": 1.026, + "step": 436 + }, + { + "epoch": 0.15957641044367354, + "grad_norm": 1.1320207118988037, + "learning_rate": 1.5948905109489054e-05, + "loss": 1.0403, + "step": 437 + }, + { + "epoch": 0.1599415738542998, + "grad_norm": 1.1242649555206299, + "learning_rate": 1.5985401459854015e-05, + "loss": 1.011, + "step": 438 + }, + { + "epoch": 0.16030673726492606, + "grad_norm": 1.227778673171997, + "learning_rate": 1.602189781021898e-05, + "loss": 1.0278, + "step": 439 + }, + { + "epoch": 0.16067190067555231, + "grad_norm": 1.0648647546768188, + "learning_rate": 1.6058394160583944e-05, + "loss": 1.0559, + "step": 440 + }, + { + "epoch": 0.16103706408617857, + "grad_norm": 0.7608394622802734, + "learning_rate": 1.6094890510948905e-05, + "loss": 0.9849, + "step": 441 + }, + { + "epoch": 0.16140222749680483, + "grad_norm": 1.2371690273284912, + "learning_rate": 1.613138686131387e-05, + "loss": 1.0323, + "step": 442 + }, + { + "epoch": 0.16176739090743109, + "grad_norm": 1.255880355834961, + "learning_rate": 1.6167883211678834e-05, + "loss": 1.0261, + "step": 443 + }, + { + "epoch": 0.16213255431805734, + "grad_norm": 1.570586919784546, + "learning_rate": 1.62043795620438e-05, + "loss": 1.0492, + "step": 444 + }, + { + "epoch": 0.1624977177286836, + "grad_norm": 1.0332871675491333, + "learning_rate": 1.624087591240876e-05, + "loss": 1.0372, + "step": 445 + }, + { + "epoch": 0.16286288113930983, + "grad_norm": 0.8999532461166382, + "learning_rate": 1.6277372262773725e-05, + "loss": 1.0211, + "step": 446 + }, + { + "epoch": 0.16322804454993609, + "grad_norm": 1.821873664855957, + "learning_rate": 1.631386861313869e-05, + "loss": 1.0408, + "step": 447 + }, + { + "epoch": 0.16359320796056234, + "grad_norm": 0.7166489958763123, + "learning_rate": 1.635036496350365e-05, + "loss": 1.0221, + "step": 448 + }, + { + "epoch": 0.1639583713711886, + "grad_norm": 1.1623930931091309, + "learning_rate": 1.6386861313868615e-05, + "loss": 1.007, + "step": 449 + }, + { + "epoch": 0.16432353478181486, + "grad_norm": 0.8305017352104187, + "learning_rate": 1.642335766423358e-05, + "loss": 0.9558, + "step": 450 + }, + { + "epoch": 0.1646886981924411, + "grad_norm": 1.2439148426055908, + "learning_rate": 1.645985401459854e-05, + "loss": 1.0458, + "step": 451 + }, + { + "epoch": 0.16505386160306737, + "grad_norm": 1.6258620023727417, + "learning_rate": 1.6496350364963505e-05, + "loss": 1.0782, + "step": 452 + }, + { + "epoch": 0.16541902501369363, + "grad_norm": 2.255648374557495, + "learning_rate": 1.6532846715328466e-05, + "loss": 1.0084, + "step": 453 + }, + { + "epoch": 0.16578418842431988, + "grad_norm": 1.1743186712265015, + "learning_rate": 1.6569343065693434e-05, + "loss": 0.9736, + "step": 454 + }, + { + "epoch": 0.16614935183494614, + "grad_norm": 1.2281731367111206, + "learning_rate": 1.6605839416058395e-05, + "loss": 0.9695, + "step": 455 + }, + { + "epoch": 0.1665145152455724, + "grad_norm": 1.1789402961730957, + "learning_rate": 1.664233576642336e-05, + "loss": 1.0486, + "step": 456 + }, + { + "epoch": 0.16687967865619865, + "grad_norm": 0.9462199211120605, + "learning_rate": 1.667883211678832e-05, + "loss": 0.9995, + "step": 457 + }, + { + "epoch": 0.1672448420668249, + "grad_norm": 0.989033579826355, + "learning_rate": 1.6715328467153286e-05, + "loss": 1.0385, + "step": 458 + }, + { + "epoch": 0.16761000547745117, + "grad_norm": 1.0912344455718994, + "learning_rate": 1.675182481751825e-05, + "loss": 0.9836, + "step": 459 + }, + { + "epoch": 0.16797516888807743, + "grad_norm": 1.019728660583496, + "learning_rate": 1.678832116788321e-05, + "loss": 1.0382, + "step": 460 + }, + { + "epoch": 0.16834033229870368, + "grad_norm": 1.0688890218734741, + "learning_rate": 1.6824817518248176e-05, + "loss": 0.9849, + "step": 461 + }, + { + "epoch": 0.16870549570932994, + "grad_norm": 0.9562466144561768, + "learning_rate": 1.686131386861314e-05, + "loss": 1.0332, + "step": 462 + }, + { + "epoch": 0.16907065911995617, + "grad_norm": 0.9949936270713806, + "learning_rate": 1.6897810218978102e-05, + "loss": 1.0527, + "step": 463 + }, + { + "epoch": 0.16943582253058243, + "grad_norm": 1.2476692199707031, + "learning_rate": 1.6934306569343066e-05, + "loss": 1.0553, + "step": 464 + }, + { + "epoch": 0.16980098594120868, + "grad_norm": 1.048659324645996, + "learning_rate": 1.697080291970803e-05, + "loss": 1.0164, + "step": 465 + }, + { + "epoch": 0.17016614935183494, + "grad_norm": 0.9506887793540955, + "learning_rate": 1.7007299270072995e-05, + "loss": 1.022, + "step": 466 + }, + { + "epoch": 0.1705313127624612, + "grad_norm": 0.8908308148384094, + "learning_rate": 1.7043795620437957e-05, + "loss": 0.9797, + "step": 467 + }, + { + "epoch": 0.17089647617308745, + "grad_norm": 1.17626953125, + "learning_rate": 1.708029197080292e-05, + "loss": 0.9773, + "step": 468 + }, + { + "epoch": 0.1712616395837137, + "grad_norm": 1.0561892986297607, + "learning_rate": 1.7116788321167886e-05, + "loss": 0.9957, + "step": 469 + }, + { + "epoch": 0.17162680299433997, + "grad_norm": 1.4765760898590088, + "learning_rate": 1.7153284671532847e-05, + "loss": 1.0071, + "step": 470 + }, + { + "epoch": 0.17199196640496622, + "grad_norm": 1.372825264930725, + "learning_rate": 1.718978102189781e-05, + "loss": 1.0347, + "step": 471 + }, + { + "epoch": 0.17235712981559248, + "grad_norm": 1.0499844551086426, + "learning_rate": 1.7226277372262773e-05, + "loss": 0.9867, + "step": 472 + }, + { + "epoch": 0.17272229322621874, + "grad_norm": 1.3431679010391235, + "learning_rate": 1.726277372262774e-05, + "loss": 1.0115, + "step": 473 + }, + { + "epoch": 0.173087456636845, + "grad_norm": 0.8785029649734497, + "learning_rate": 1.7299270072992702e-05, + "loss": 1.0327, + "step": 474 + }, + { + "epoch": 0.17345262004747125, + "grad_norm": 1.3840019702911377, + "learning_rate": 1.7335766423357666e-05, + "loss": 0.9919, + "step": 475 + }, + { + "epoch": 0.1738177834580975, + "grad_norm": 0.955368161201477, + "learning_rate": 1.737226277372263e-05, + "loss": 0.9966, + "step": 476 + }, + { + "epoch": 0.17418294686872376, + "grad_norm": 1.3200948238372803, + "learning_rate": 1.7408759124087592e-05, + "loss": 1.0052, + "step": 477 + }, + { + "epoch": 0.17454811027935002, + "grad_norm": 0.9315057396888733, + "learning_rate": 1.7445255474452557e-05, + "loss": 0.9813, + "step": 478 + }, + { + "epoch": 0.17491327368997625, + "grad_norm": 1.0480930805206299, + "learning_rate": 1.7481751824817518e-05, + "loss": 0.9749, + "step": 479 + }, + { + "epoch": 0.1752784371006025, + "grad_norm": 1.000348687171936, + "learning_rate": 1.7518248175182482e-05, + "loss": 1.0067, + "step": 480 + }, + { + "epoch": 0.17564360051122876, + "grad_norm": 1.0230541229248047, + "learning_rate": 1.7554744525547447e-05, + "loss": 0.994, + "step": 481 + }, + { + "epoch": 0.17600876392185502, + "grad_norm": 1.2165991067886353, + "learning_rate": 1.7591240875912408e-05, + "loss": 1.0295, + "step": 482 + }, + { + "epoch": 0.17637392733248128, + "grad_norm": 1.2207070589065552, + "learning_rate": 1.7627737226277373e-05, + "loss": 0.9905, + "step": 483 + }, + { + "epoch": 0.17673909074310754, + "grad_norm": 1.2231794595718384, + "learning_rate": 1.7664233576642337e-05, + "loss": 1.0133, + "step": 484 + }, + { + "epoch": 0.1771042541537338, + "grad_norm": 1.200690746307373, + "learning_rate": 1.7700729927007302e-05, + "loss": 0.983, + "step": 485 + }, + { + "epoch": 0.17746941756436005, + "grad_norm": 1.118323564529419, + "learning_rate": 1.7737226277372263e-05, + "loss": 1.0201, + "step": 486 + }, + { + "epoch": 0.1778345809749863, + "grad_norm": 1.030706524848938, + "learning_rate": 1.7773722627737228e-05, + "loss": 1.0175, + "step": 487 + }, + { + "epoch": 0.17819974438561256, + "grad_norm": 0.7909834980964661, + "learning_rate": 1.7810218978102192e-05, + "loss": 0.9784, + "step": 488 + }, + { + "epoch": 0.17856490779623882, + "grad_norm": 1.075653076171875, + "learning_rate": 1.7846715328467153e-05, + "loss": 0.9595, + "step": 489 + }, + { + "epoch": 0.17893007120686508, + "grad_norm": 1.1155372858047485, + "learning_rate": 1.7883211678832118e-05, + "loss": 0.9739, + "step": 490 + }, + { + "epoch": 0.17929523461749133, + "grad_norm": 1.4763505458831787, + "learning_rate": 1.7919708029197082e-05, + "loss": 0.9624, + "step": 491 + }, + { + "epoch": 0.1796603980281176, + "grad_norm": 1.2181990146636963, + "learning_rate": 1.7956204379562047e-05, + "loss": 0.9927, + "step": 492 + }, + { + "epoch": 0.18002556143874385, + "grad_norm": 1.0169252157211304, + "learning_rate": 1.7992700729927008e-05, + "loss": 1.007, + "step": 493 + }, + { + "epoch": 0.1803907248493701, + "grad_norm": 1.4996839761734009, + "learning_rate": 1.8029197080291973e-05, + "loss": 0.998, + "step": 494 + }, + { + "epoch": 0.18075588825999636, + "grad_norm": 0.8153586387634277, + "learning_rate": 1.8065693430656937e-05, + "loss": 0.9808, + "step": 495 + }, + { + "epoch": 0.1811210516706226, + "grad_norm": 1.0656777620315552, + "learning_rate": 1.81021897810219e-05, + "loss": 0.9603, + "step": 496 + }, + { + "epoch": 0.18148621508124885, + "grad_norm": 1.1432812213897705, + "learning_rate": 1.8138686131386863e-05, + "loss": 0.9727, + "step": 497 + }, + { + "epoch": 0.1818513784918751, + "grad_norm": 0.8203110694885254, + "learning_rate": 1.8175182481751824e-05, + "loss": 1.0028, + "step": 498 + }, + { + "epoch": 0.18221654190250136, + "grad_norm": 0.9081830978393555, + "learning_rate": 1.821167883211679e-05, + "loss": 0.9829, + "step": 499 + }, + { + "epoch": 0.18258170531312762, + "grad_norm": 1.2304878234863281, + "learning_rate": 1.8248175182481753e-05, + "loss": 1.0103, + "step": 500 + }, + { + "epoch": 0.18294686872375387, + "grad_norm": 1.2801042795181274, + "learning_rate": 1.8284671532846715e-05, + "loss": 1.0187, + "step": 501 + }, + { + "epoch": 0.18331203213438013, + "grad_norm": 1.001322865486145, + "learning_rate": 1.8321167883211683e-05, + "loss": 0.959, + "step": 502 + }, + { + "epoch": 0.1836771955450064, + "grad_norm": 1.000063419342041, + "learning_rate": 1.8357664233576644e-05, + "loss": 0.993, + "step": 503 + }, + { + "epoch": 0.18404235895563265, + "grad_norm": 1.2404290437698364, + "learning_rate": 1.8394160583941608e-05, + "loss": 0.9767, + "step": 504 + }, + { + "epoch": 0.1844075223662589, + "grad_norm": 0.9571080803871155, + "learning_rate": 1.843065693430657e-05, + "loss": 0.9707, + "step": 505 + }, + { + "epoch": 0.18477268577688516, + "grad_norm": 0.9870139360427856, + "learning_rate": 1.8467153284671534e-05, + "loss": 0.9684, + "step": 506 + }, + { + "epoch": 0.18513784918751142, + "grad_norm": 0.9874576330184937, + "learning_rate": 1.85036496350365e-05, + "loss": 0.9814, + "step": 507 + }, + { + "epoch": 0.18550301259813767, + "grad_norm": 1.3045789003372192, + "learning_rate": 1.854014598540146e-05, + "loss": 0.979, + "step": 508 + }, + { + "epoch": 0.18586817600876393, + "grad_norm": 1.4654738903045654, + "learning_rate": 1.8576642335766424e-05, + "loss": 0.9468, + "step": 509 + }, + { + "epoch": 0.1862333394193902, + "grad_norm": 1.3370999097824097, + "learning_rate": 1.861313868613139e-05, + "loss": 0.9634, + "step": 510 + }, + { + "epoch": 0.18659850283001644, + "grad_norm": 0.9240619540214539, + "learning_rate": 1.8649635036496353e-05, + "loss": 0.9556, + "step": 511 + }, + { + "epoch": 0.1869636662406427, + "grad_norm": 0.9380460977554321, + "learning_rate": 1.8686131386861315e-05, + "loss": 0.9839, + "step": 512 + }, + { + "epoch": 0.18732882965126893, + "grad_norm": 0.990183413028717, + "learning_rate": 1.872262773722628e-05, + "loss": 0.9829, + "step": 513 + }, + { + "epoch": 0.1876939930618952, + "grad_norm": 0.9844232201576233, + "learning_rate": 1.8759124087591244e-05, + "loss": 0.9624, + "step": 514 + }, + { + "epoch": 0.18805915647252144, + "grad_norm": 1.3360226154327393, + "learning_rate": 1.8795620437956205e-05, + "loss": 0.9667, + "step": 515 + }, + { + "epoch": 0.1884243198831477, + "grad_norm": 1.0527745485305786, + "learning_rate": 1.883211678832117e-05, + "loss": 0.9873, + "step": 516 + }, + { + "epoch": 0.18878948329377396, + "grad_norm": 0.7709869742393494, + "learning_rate": 1.8868613138686134e-05, + "loss": 0.9957, + "step": 517 + }, + { + "epoch": 0.18915464670440021, + "grad_norm": 0.9885143637657166, + "learning_rate": 1.8905109489051095e-05, + "loss": 0.9785, + "step": 518 + }, + { + "epoch": 0.18951981011502647, + "grad_norm": 0.8128122091293335, + "learning_rate": 1.894160583941606e-05, + "loss": 0.9491, + "step": 519 + }, + { + "epoch": 0.18988497352565273, + "grad_norm": 0.9787136912345886, + "learning_rate": 1.897810218978102e-05, + "loss": 0.936, + "step": 520 + }, + { + "epoch": 0.19025013693627899, + "grad_norm": 1.0835014581680298, + "learning_rate": 1.901459854014599e-05, + "loss": 0.9873, + "step": 521 + }, + { + "epoch": 0.19061530034690524, + "grad_norm": 1.2721729278564453, + "learning_rate": 1.905109489051095e-05, + "loss": 0.9996, + "step": 522 + }, + { + "epoch": 0.1909804637575315, + "grad_norm": 1.0898629426956177, + "learning_rate": 1.9087591240875915e-05, + "loss": 0.9877, + "step": 523 + }, + { + "epoch": 0.19134562716815776, + "grad_norm": 1.5092837810516357, + "learning_rate": 1.912408759124088e-05, + "loss": 0.9923, + "step": 524 + }, + { + "epoch": 0.191710790578784, + "grad_norm": 1.787390947341919, + "learning_rate": 1.916058394160584e-05, + "loss": 0.9877, + "step": 525 + }, + { + "epoch": 0.19207595398941027, + "grad_norm": 1.2582017183303833, + "learning_rate": 1.9197080291970805e-05, + "loss": 0.9637, + "step": 526 + }, + { + "epoch": 0.19244111740003653, + "grad_norm": 1.2805448770523071, + "learning_rate": 1.9233576642335766e-05, + "loss": 0.9714, + "step": 527 + }, + { + "epoch": 0.19280628081066278, + "grad_norm": 1.203843116760254, + "learning_rate": 1.9270072992700734e-05, + "loss": 0.9453, + "step": 528 + }, + { + "epoch": 0.19317144422128904, + "grad_norm": 1.221866250038147, + "learning_rate": 1.9306569343065695e-05, + "loss": 0.9307, + "step": 529 + }, + { + "epoch": 0.19353660763191527, + "grad_norm": 1.4659773111343384, + "learning_rate": 1.934306569343066e-05, + "loss": 0.9092, + "step": 530 + }, + { + "epoch": 0.19390177104254153, + "grad_norm": 0.8587760329246521, + "learning_rate": 1.937956204379562e-05, + "loss": 0.9265, + "step": 531 + }, + { + "epoch": 0.19426693445316778, + "grad_norm": 0.7869857549667358, + "learning_rate": 1.9416058394160586e-05, + "loss": 0.9355, + "step": 532 + }, + { + "epoch": 0.19463209786379404, + "grad_norm": 0.9829152226448059, + "learning_rate": 1.945255474452555e-05, + "loss": 0.9971, + "step": 533 + }, + { + "epoch": 0.1949972612744203, + "grad_norm": 0.9914855360984802, + "learning_rate": 1.948905109489051e-05, + "loss": 0.9432, + "step": 534 + }, + { + "epoch": 0.19536242468504655, + "grad_norm": 1.4708786010742188, + "learning_rate": 1.9525547445255476e-05, + "loss": 0.9668, + "step": 535 + }, + { + "epoch": 0.1957275880956728, + "grad_norm": 1.1222752332687378, + "learning_rate": 1.956204379562044e-05, + "loss": 0.9619, + "step": 536 + }, + { + "epoch": 0.19609275150629907, + "grad_norm": 1.9274342060089111, + "learning_rate": 1.95985401459854e-05, + "loss": 1.0125, + "step": 537 + }, + { + "epoch": 0.19645791491692532, + "grad_norm": 0.9744441509246826, + "learning_rate": 1.9635036496350366e-05, + "loss": 0.9624, + "step": 538 + }, + { + "epoch": 0.19682307832755158, + "grad_norm": 1.1587960720062256, + "learning_rate": 1.967153284671533e-05, + "loss": 0.9648, + "step": 539 + }, + { + "epoch": 0.19718824173817784, + "grad_norm": 1.7533305883407593, + "learning_rate": 1.9708029197080295e-05, + "loss": 0.9557, + "step": 540 + }, + { + "epoch": 0.1975534051488041, + "grad_norm": 0.9296097159385681, + "learning_rate": 1.9744525547445256e-05, + "loss": 0.9346, + "step": 541 + }, + { + "epoch": 0.19791856855943035, + "grad_norm": 1.1713056564331055, + "learning_rate": 1.978102189781022e-05, + "loss": 0.9485, + "step": 542 + }, + { + "epoch": 0.1982837319700566, + "grad_norm": 1.076978325843811, + "learning_rate": 1.9817518248175186e-05, + "loss": 0.9531, + "step": 543 + }, + { + "epoch": 0.19864889538068287, + "grad_norm": 1.4528071880340576, + "learning_rate": 1.9854014598540147e-05, + "loss": 0.9745, + "step": 544 + }, + { + "epoch": 0.19901405879130912, + "grad_norm": 1.236617922782898, + "learning_rate": 1.989051094890511e-05, + "loss": 0.931, + "step": 545 + }, + { + "epoch": 0.19937922220193538, + "grad_norm": 0.9910083413124084, + "learning_rate": 1.9927007299270073e-05, + "loss": 0.994, + "step": 546 + }, + { + "epoch": 0.1997443856125616, + "grad_norm": 0.9027132987976074, + "learning_rate": 1.9963503649635037e-05, + "loss": 0.9836, + "step": 547 + }, + { + "epoch": 0.20010954902318787, + "grad_norm": 1.0734672546386719, + "learning_rate": 2e-05, + "loss": 0.9387, + "step": 548 + }, + { + "epoch": 0.20047471243381412, + "grad_norm": 1.1241096258163452, + "learning_rate": 2.0036496350364966e-05, + "loss": 1.0135, + "step": 549 + }, + { + "epoch": 0.20083987584444038, + "grad_norm": 0.925300121307373, + "learning_rate": 2.0072992700729927e-05, + "loss": 0.9351, + "step": 550 + }, + { + "epoch": 0.20120503925506664, + "grad_norm": 0.923231303691864, + "learning_rate": 2.0109489051094892e-05, + "loss": 0.9557, + "step": 551 + }, + { + "epoch": 0.2015702026656929, + "grad_norm": 1.1165447235107422, + "learning_rate": 2.0145985401459857e-05, + "loss": 0.9553, + "step": 552 + }, + { + "epoch": 0.20193536607631915, + "grad_norm": 0.8924594521522522, + "learning_rate": 2.0182481751824818e-05, + "loss": 0.9332, + "step": 553 + }, + { + "epoch": 0.2023005294869454, + "grad_norm": 1.1063874959945679, + "learning_rate": 2.0218978102189782e-05, + "loss": 0.9163, + "step": 554 + }, + { + "epoch": 0.20266569289757166, + "grad_norm": 1.6629401445388794, + "learning_rate": 2.0255474452554743e-05, + "loss": 0.97, + "step": 555 + }, + { + "epoch": 0.20303085630819792, + "grad_norm": 1.107111930847168, + "learning_rate": 2.0291970802919708e-05, + "loss": 0.9749, + "step": 556 + }, + { + "epoch": 0.20339601971882418, + "grad_norm": 1.0518925189971924, + "learning_rate": 2.0328467153284676e-05, + "loss": 0.923, + "step": 557 + }, + { + "epoch": 0.20376118312945043, + "grad_norm": 1.3342797756195068, + "learning_rate": 2.0364963503649634e-05, + "loss": 0.9469, + "step": 558 + }, + { + "epoch": 0.2041263465400767, + "grad_norm": 0.731576681137085, + "learning_rate": 2.04014598540146e-05, + "loss": 0.9586, + "step": 559 + }, + { + "epoch": 0.20449150995070295, + "grad_norm": 1.0917059183120728, + "learning_rate": 2.0437956204379566e-05, + "loss": 0.9136, + "step": 560 + }, + { + "epoch": 0.2048566733613292, + "grad_norm": 1.082904577255249, + "learning_rate": 2.0474452554744527e-05, + "loss": 0.9521, + "step": 561 + }, + { + "epoch": 0.20522183677195546, + "grad_norm": 1.0768007040023804, + "learning_rate": 2.0510948905109492e-05, + "loss": 0.9315, + "step": 562 + }, + { + "epoch": 0.2055870001825817, + "grad_norm": 0.7562431693077087, + "learning_rate": 2.0547445255474457e-05, + "loss": 0.9064, + "step": 563 + }, + { + "epoch": 0.20595216359320795, + "grad_norm": 1.1797900199890137, + "learning_rate": 2.0583941605839418e-05, + "loss": 0.999, + "step": 564 + }, + { + "epoch": 0.2063173270038342, + "grad_norm": 0.9053616523742676, + "learning_rate": 2.0620437956204382e-05, + "loss": 0.9337, + "step": 565 + }, + { + "epoch": 0.20668249041446046, + "grad_norm": 1.3852813243865967, + "learning_rate": 2.0656934306569343e-05, + "loss": 0.941, + "step": 566 + }, + { + "epoch": 0.20704765382508672, + "grad_norm": 0.6356310844421387, + "learning_rate": 2.0693430656934308e-05, + "loss": 0.9259, + "step": 567 + }, + { + "epoch": 0.20741281723571298, + "grad_norm": 0.9954673647880554, + "learning_rate": 2.0729927007299273e-05, + "loss": 0.9258, + "step": 568 + }, + { + "epoch": 0.20777798064633923, + "grad_norm": 1.2683064937591553, + "learning_rate": 2.0766423357664234e-05, + "loss": 0.975, + "step": 569 + }, + { + "epoch": 0.2081431440569655, + "grad_norm": 0.8792716264724731, + "learning_rate": 2.08029197080292e-05, + "loss": 0.9497, + "step": 570 + }, + { + "epoch": 0.20850830746759175, + "grad_norm": 0.9924480319023132, + "learning_rate": 2.0839416058394163e-05, + "loss": 0.9425, + "step": 571 + }, + { + "epoch": 0.208873470878218, + "grad_norm": 0.9855673909187317, + "learning_rate": 2.0875912408759124e-05, + "loss": 0.9583, + "step": 572 + }, + { + "epoch": 0.20923863428884426, + "grad_norm": 0.9010187983512878, + "learning_rate": 2.091240875912409e-05, + "loss": 0.9147, + "step": 573 + }, + { + "epoch": 0.20960379769947052, + "grad_norm": 1.3942712545394897, + "learning_rate": 2.0948905109489057e-05, + "loss": 0.9437, + "step": 574 + }, + { + "epoch": 0.20996896111009677, + "grad_norm": 1.0104106664657593, + "learning_rate": 2.0985401459854014e-05, + "loss": 0.9528, + "step": 575 + }, + { + "epoch": 0.21033412452072303, + "grad_norm": 0.8935278654098511, + "learning_rate": 2.1021897810218982e-05, + "loss": 0.9368, + "step": 576 + }, + { + "epoch": 0.2106992879313493, + "grad_norm": 1.1053450107574463, + "learning_rate": 2.105839416058394e-05, + "loss": 0.9342, + "step": 577 + }, + { + "epoch": 0.21106445134197555, + "grad_norm": 0.8636622428894043, + "learning_rate": 2.1094890510948908e-05, + "loss": 0.9568, + "step": 578 + }, + { + "epoch": 0.2114296147526018, + "grad_norm": 1.6515229940414429, + "learning_rate": 2.1131386861313873e-05, + "loss": 0.9174, + "step": 579 + }, + { + "epoch": 0.21179477816322803, + "grad_norm": 0.9530323147773743, + "learning_rate": 2.1167883211678834e-05, + "loss": 0.9052, + "step": 580 + }, + { + "epoch": 0.2121599415738543, + "grad_norm": 0.7677631974220276, + "learning_rate": 2.12043795620438e-05, + "loss": 0.9427, + "step": 581 + }, + { + "epoch": 0.21252510498448055, + "grad_norm": 1.109664797782898, + "learning_rate": 2.1240875912408763e-05, + "loss": 0.938, + "step": 582 + }, + { + "epoch": 0.2128902683951068, + "grad_norm": 0.9017308950424194, + "learning_rate": 2.1277372262773724e-05, + "loss": 0.9496, + "step": 583 + }, + { + "epoch": 0.21325543180573306, + "grad_norm": 0.8534948825836182, + "learning_rate": 2.131386861313869e-05, + "loss": 0.8927, + "step": 584 + }, + { + "epoch": 0.21362059521635932, + "grad_norm": 0.9706099629402161, + "learning_rate": 2.135036496350365e-05, + "loss": 0.9166, + "step": 585 + }, + { + "epoch": 0.21398575862698557, + "grad_norm": 1.0126210451126099, + "learning_rate": 2.1386861313868614e-05, + "loss": 0.9696, + "step": 586 + }, + { + "epoch": 0.21435092203761183, + "grad_norm": 1.4508851766586304, + "learning_rate": 2.142335766423358e-05, + "loss": 0.9596, + "step": 587 + }, + { + "epoch": 0.2147160854482381, + "grad_norm": 0.8295338153839111, + "learning_rate": 2.145985401459854e-05, + "loss": 0.897, + "step": 588 + }, + { + "epoch": 0.21508124885886434, + "grad_norm": 1.030462384223938, + "learning_rate": 2.1496350364963505e-05, + "loss": 0.9136, + "step": 589 + }, + { + "epoch": 0.2154464122694906, + "grad_norm": 1.3747875690460205, + "learning_rate": 2.153284671532847e-05, + "loss": 0.9393, + "step": 590 + }, + { + "epoch": 0.21581157568011686, + "grad_norm": 0.7962733507156372, + "learning_rate": 2.156934306569343e-05, + "loss": 0.9292, + "step": 591 + }, + { + "epoch": 0.21617673909074311, + "grad_norm": 1.0487091541290283, + "learning_rate": 2.1605839416058395e-05, + "loss": 0.9493, + "step": 592 + }, + { + "epoch": 0.21654190250136937, + "grad_norm": 0.9827638864517212, + "learning_rate": 2.1642335766423363e-05, + "loss": 0.9614, + "step": 593 + }, + { + "epoch": 0.21690706591199563, + "grad_norm": 0.9397901296615601, + "learning_rate": 2.167883211678832e-05, + "loss": 0.9199, + "step": 594 + }, + { + "epoch": 0.21727222932262188, + "grad_norm": 1.07180655002594, + "learning_rate": 2.171532846715329e-05, + "loss": 0.8998, + "step": 595 + }, + { + "epoch": 0.21763739273324814, + "grad_norm": 0.9420148134231567, + "learning_rate": 2.1751824817518246e-05, + "loss": 0.9264, + "step": 596 + }, + { + "epoch": 0.21800255614387437, + "grad_norm": 0.8558598756790161, + "learning_rate": 2.1788321167883214e-05, + "loss": 0.9282, + "step": 597 + }, + { + "epoch": 0.21836771955450063, + "grad_norm": 0.8148871660232544, + "learning_rate": 2.182481751824818e-05, + "loss": 0.9066, + "step": 598 + }, + { + "epoch": 0.21873288296512688, + "grad_norm": 1.2567858695983887, + "learning_rate": 2.186131386861314e-05, + "loss": 0.9285, + "step": 599 + }, + { + "epoch": 0.21909804637575314, + "grad_norm": 1.0364936590194702, + "learning_rate": 2.1897810218978105e-05, + "loss": 0.9396, + "step": 600 + }, + { + "epoch": 0.2194632097863794, + "grad_norm": 1.0940117835998535, + "learning_rate": 2.193430656934307e-05, + "loss": 0.9309, + "step": 601 + }, + { + "epoch": 0.21982837319700566, + "grad_norm": 1.0114707946777344, + "learning_rate": 2.197080291970803e-05, + "loss": 0.927, + "step": 602 + }, + { + "epoch": 0.2201935366076319, + "grad_norm": 1.1807681322097778, + "learning_rate": 2.2007299270072995e-05, + "loss": 0.9542, + "step": 603 + }, + { + "epoch": 0.22055870001825817, + "grad_norm": 1.4283320903778076, + "learning_rate": 2.204379562043796e-05, + "loss": 0.9482, + "step": 604 + }, + { + "epoch": 0.22092386342888443, + "grad_norm": 1.064704418182373, + "learning_rate": 2.208029197080292e-05, + "loss": 0.8962, + "step": 605 + }, + { + "epoch": 0.22128902683951068, + "grad_norm": 1.1571245193481445, + "learning_rate": 2.2116788321167885e-05, + "loss": 0.976, + "step": 606 + }, + { + "epoch": 0.22165419025013694, + "grad_norm": 1.1410144567489624, + "learning_rate": 2.2153284671532847e-05, + "loss": 0.918, + "step": 607 + }, + { + "epoch": 0.2220193536607632, + "grad_norm": 1.0465383529663086, + "learning_rate": 2.218978102189781e-05, + "loss": 0.923, + "step": 608 + }, + { + "epoch": 0.22238451707138945, + "grad_norm": 1.131148338317871, + "learning_rate": 2.2226277372262776e-05, + "loss": 0.9437, + "step": 609 + }, + { + "epoch": 0.2227496804820157, + "grad_norm": 0.7514604926109314, + "learning_rate": 2.2262773722627737e-05, + "loss": 0.9127, + "step": 610 + }, + { + "epoch": 0.22311484389264197, + "grad_norm": 1.0829296112060547, + "learning_rate": 2.22992700729927e-05, + "loss": 0.9159, + "step": 611 + }, + { + "epoch": 0.22348000730326822, + "grad_norm": 1.2224297523498535, + "learning_rate": 2.2335766423357666e-05, + "loss": 0.9083, + "step": 612 + }, + { + "epoch": 0.22384517071389448, + "grad_norm": 1.065311312675476, + "learning_rate": 2.2372262773722627e-05, + "loss": 0.9169, + "step": 613 + }, + { + "epoch": 0.2242103341245207, + "grad_norm": 0.8168937563896179, + "learning_rate": 2.2408759124087595e-05, + "loss": 0.8895, + "step": 614 + }, + { + "epoch": 0.22457549753514697, + "grad_norm": 1.3119653463363647, + "learning_rate": 2.244525547445256e-05, + "loss": 0.9395, + "step": 615 + }, + { + "epoch": 0.22494066094577322, + "grad_norm": 0.9576928615570068, + "learning_rate": 2.248175182481752e-05, + "loss": 0.95, + "step": 616 + }, + { + "epoch": 0.22530582435639948, + "grad_norm": 1.0545897483825684, + "learning_rate": 2.2518248175182485e-05, + "loss": 0.9564, + "step": 617 + }, + { + "epoch": 0.22567098776702574, + "grad_norm": 1.0971013307571411, + "learning_rate": 2.2554744525547447e-05, + "loss": 0.9091, + "step": 618 + }, + { + "epoch": 0.226036151177652, + "grad_norm": 1.3408724069595337, + "learning_rate": 2.259124087591241e-05, + "loss": 0.9359, + "step": 619 + }, + { + "epoch": 0.22640131458827825, + "grad_norm": 0.7940633893013, + "learning_rate": 2.2627737226277376e-05, + "loss": 0.9159, + "step": 620 + }, + { + "epoch": 0.2267664779989045, + "grad_norm": 0.7596388459205627, + "learning_rate": 2.2664233576642337e-05, + "loss": 0.9062, + "step": 621 + }, + { + "epoch": 0.22713164140953077, + "grad_norm": 1.1658217906951904, + "learning_rate": 2.27007299270073e-05, + "loss": 0.933, + "step": 622 + }, + { + "epoch": 0.22749680482015702, + "grad_norm": 1.2590558528900146, + "learning_rate": 2.2737226277372266e-05, + "loss": 0.9424, + "step": 623 + }, + { + "epoch": 0.22786196823078328, + "grad_norm": 0.8723631501197815, + "learning_rate": 2.2773722627737227e-05, + "loss": 0.9197, + "step": 624 + }, + { + "epoch": 0.22822713164140954, + "grad_norm": 0.9985167980194092, + "learning_rate": 2.2810218978102192e-05, + "loss": 0.9287, + "step": 625 + }, + { + "epoch": 0.2285922950520358, + "grad_norm": 1.1019034385681152, + "learning_rate": 2.2846715328467156e-05, + "loss": 0.911, + "step": 626 + }, + { + "epoch": 0.22895745846266205, + "grad_norm": 1.0437474250793457, + "learning_rate": 2.2883211678832117e-05, + "loss": 0.9119, + "step": 627 + }, + { + "epoch": 0.2293226218732883, + "grad_norm": 0.864844799041748, + "learning_rate": 2.2919708029197082e-05, + "loss": 0.915, + "step": 628 + }, + { + "epoch": 0.22968778528391456, + "grad_norm": 0.829508900642395, + "learning_rate": 2.2956204379562043e-05, + "loss": 0.9077, + "step": 629 + }, + { + "epoch": 0.23005294869454082, + "grad_norm": 0.850080668926239, + "learning_rate": 2.2992700729927008e-05, + "loss": 0.9023, + "step": 630 + }, + { + "epoch": 0.23041811210516705, + "grad_norm": 0.8959830403327942, + "learning_rate": 2.3029197080291972e-05, + "loss": 0.9038, + "step": 631 + }, + { + "epoch": 0.2307832755157933, + "grad_norm": 1.3409028053283691, + "learning_rate": 2.3065693430656934e-05, + "loss": 0.9269, + "step": 632 + }, + { + "epoch": 0.23114843892641956, + "grad_norm": 1.0184214115142822, + "learning_rate": 2.31021897810219e-05, + "loss": 0.9413, + "step": 633 + }, + { + "epoch": 0.23151360233704582, + "grad_norm": 1.1121234893798828, + "learning_rate": 2.3138686131386866e-05, + "loss": 0.9196, + "step": 634 + }, + { + "epoch": 0.23187876574767208, + "grad_norm": 0.9996039271354675, + "learning_rate": 2.3175182481751827e-05, + "loss": 0.9335, + "step": 635 + }, + { + "epoch": 0.23224392915829833, + "grad_norm": 1.0234628915786743, + "learning_rate": 2.3211678832116792e-05, + "loss": 0.9364, + "step": 636 + }, + { + "epoch": 0.2326090925689246, + "grad_norm": 1.0570037364959717, + "learning_rate": 2.3248175182481756e-05, + "loss": 0.8986, + "step": 637 + }, + { + "epoch": 0.23297425597955085, + "grad_norm": 0.6851513981819153, + "learning_rate": 2.3284671532846718e-05, + "loss": 0.8804, + "step": 638 + }, + { + "epoch": 0.2333394193901771, + "grad_norm": 1.0824224948883057, + "learning_rate": 2.3321167883211682e-05, + "loss": 0.8817, + "step": 639 + }, + { + "epoch": 0.23370458280080336, + "grad_norm": 1.2433457374572754, + "learning_rate": 2.3357664233576643e-05, + "loss": 0.921, + "step": 640 + }, + { + "epoch": 0.23406974621142962, + "grad_norm": 0.9732486605644226, + "learning_rate": 2.3394160583941608e-05, + "loss": 0.8934, + "step": 641 + }, + { + "epoch": 0.23443490962205588, + "grad_norm": 1.0137306451797485, + "learning_rate": 2.3430656934306572e-05, + "loss": 0.9434, + "step": 642 + }, + { + "epoch": 0.23480007303268213, + "grad_norm": 0.9960076808929443, + "learning_rate": 2.3467153284671534e-05, + "loss": 0.8903, + "step": 643 + }, + { + "epoch": 0.2351652364433084, + "grad_norm": 0.8179641962051392, + "learning_rate": 2.3503649635036498e-05, + "loss": 0.9148, + "step": 644 + }, + { + "epoch": 0.23553039985393465, + "grad_norm": 1.8424533605575562, + "learning_rate": 2.3540145985401463e-05, + "loss": 0.9077, + "step": 645 + }, + { + "epoch": 0.2358955632645609, + "grad_norm": 1.0837489366531372, + "learning_rate": 2.3576642335766424e-05, + "loss": 0.9108, + "step": 646 + }, + { + "epoch": 0.23626072667518713, + "grad_norm": 0.9237055778503418, + "learning_rate": 2.361313868613139e-05, + "loss": 0.9159, + "step": 647 + }, + { + "epoch": 0.2366258900858134, + "grad_norm": 1.215301513671875, + "learning_rate": 2.3649635036496353e-05, + "loss": 0.9172, + "step": 648 + }, + { + "epoch": 0.23699105349643965, + "grad_norm": 1.474677562713623, + "learning_rate": 2.3686131386861314e-05, + "loss": 0.9196, + "step": 649 + }, + { + "epoch": 0.2373562169070659, + "grad_norm": 0.8014124035835266, + "learning_rate": 2.372262773722628e-05, + "loss": 0.9036, + "step": 650 + }, + { + "epoch": 0.23772138031769216, + "grad_norm": 0.9400027394294739, + "learning_rate": 2.375912408759124e-05, + "loss": 0.8754, + "step": 651 + }, + { + "epoch": 0.23808654372831842, + "grad_norm": 1.0393285751342773, + "learning_rate": 2.3795620437956204e-05, + "loss": 0.9025, + "step": 652 + }, + { + "epoch": 0.23845170713894467, + "grad_norm": 1.153727650642395, + "learning_rate": 2.3832116788321172e-05, + "loss": 0.9031, + "step": 653 + }, + { + "epoch": 0.23881687054957093, + "grad_norm": 1.0982905626296997, + "learning_rate": 2.3868613138686134e-05, + "loss": 0.8793, + "step": 654 + }, + { + "epoch": 0.2391820339601972, + "grad_norm": 0.9338880181312561, + "learning_rate": 2.3905109489051098e-05, + "loss": 0.8768, + "step": 655 + }, + { + "epoch": 0.23954719737082344, + "grad_norm": 0.9995929598808289, + "learning_rate": 2.3941605839416063e-05, + "loss": 0.8976, + "step": 656 + }, + { + "epoch": 0.2399123607814497, + "grad_norm": 1.0906516313552856, + "learning_rate": 2.3978102189781024e-05, + "loss": 0.9115, + "step": 657 + }, + { + "epoch": 0.24027752419207596, + "grad_norm": 1.2404183149337769, + "learning_rate": 2.401459854014599e-05, + "loss": 0.8925, + "step": 658 + }, + { + "epoch": 0.24064268760270222, + "grad_norm": 0.8696293234825134, + "learning_rate": 2.405109489051095e-05, + "loss": 0.8966, + "step": 659 + }, + { + "epoch": 0.24100785101332847, + "grad_norm": 0.8563190698623657, + "learning_rate": 2.4087591240875914e-05, + "loss": 0.892, + "step": 660 + }, + { + "epoch": 0.24137301442395473, + "grad_norm": 0.8124135732650757, + "learning_rate": 2.412408759124088e-05, + "loss": 0.9028, + "step": 661 + }, + { + "epoch": 0.241738177834581, + "grad_norm": 0.9765145778656006, + "learning_rate": 2.416058394160584e-05, + "loss": 0.9186, + "step": 662 + }, + { + "epoch": 0.24210334124520724, + "grad_norm": 1.0169612169265747, + "learning_rate": 2.4197080291970805e-05, + "loss": 0.8901, + "step": 663 + }, + { + "epoch": 0.24246850465583347, + "grad_norm": 0.8080570697784424, + "learning_rate": 2.423357664233577e-05, + "loss": 0.8871, + "step": 664 + }, + { + "epoch": 0.24283366806645973, + "grad_norm": 1.1883140802383423, + "learning_rate": 2.427007299270073e-05, + "loss": 0.9015, + "step": 665 + }, + { + "epoch": 0.24319883147708599, + "grad_norm": 0.9028333425521851, + "learning_rate": 2.4306569343065695e-05, + "loss": 0.8638, + "step": 666 + }, + { + "epoch": 0.24356399488771224, + "grad_norm": 0.7207622528076172, + "learning_rate": 2.434306569343066e-05, + "loss": 0.8868, + "step": 667 + }, + { + "epoch": 0.2439291582983385, + "grad_norm": 1.3190016746520996, + "learning_rate": 2.437956204379562e-05, + "loss": 0.9341, + "step": 668 + }, + { + "epoch": 0.24429432170896476, + "grad_norm": 1.3500337600708008, + "learning_rate": 2.4416058394160585e-05, + "loss": 0.9249, + "step": 669 + }, + { + "epoch": 0.244659485119591, + "grad_norm": 1.042157769203186, + "learning_rate": 2.4452554744525546e-05, + "loss": 0.8414, + "step": 670 + }, + { + "epoch": 0.24502464853021727, + "grad_norm": 1.0846127271652222, + "learning_rate": 2.448905109489051e-05, + "loss": 0.8883, + "step": 671 + }, + { + "epoch": 0.24538981194084353, + "grad_norm": 1.4956647157669067, + "learning_rate": 2.452554744525548e-05, + "loss": 0.8802, + "step": 672 + }, + { + "epoch": 0.24575497535146978, + "grad_norm": 1.2371827363967896, + "learning_rate": 2.4562043795620437e-05, + "loss": 0.91, + "step": 673 + }, + { + "epoch": 0.24612013876209604, + "grad_norm": 1.2039473056793213, + "learning_rate": 2.4598540145985405e-05, + "loss": 0.8591, + "step": 674 + }, + { + "epoch": 0.2464853021727223, + "grad_norm": 0.9505608081817627, + "learning_rate": 2.463503649635037e-05, + "loss": 0.9138, + "step": 675 + }, + { + "epoch": 0.24685046558334856, + "grad_norm": 1.2837300300598145, + "learning_rate": 2.467153284671533e-05, + "loss": 0.9059, + "step": 676 + }, + { + "epoch": 0.2472156289939748, + "grad_norm": 1.084855079650879, + "learning_rate": 2.4708029197080295e-05, + "loss": 0.9113, + "step": 677 + }, + { + "epoch": 0.24758079240460107, + "grad_norm": 0.7835924029350281, + "learning_rate": 2.474452554744526e-05, + "loss": 0.8977, + "step": 678 + }, + { + "epoch": 0.24794595581522733, + "grad_norm": 0.8942123651504517, + "learning_rate": 2.478102189781022e-05, + "loss": 0.8754, + "step": 679 + }, + { + "epoch": 0.24831111922585358, + "grad_norm": 0.9250959753990173, + "learning_rate": 2.4817518248175185e-05, + "loss": 0.8692, + "step": 680 + }, + { + "epoch": 0.2486762826364798, + "grad_norm": 1.7833359241485596, + "learning_rate": 2.4854014598540146e-05, + "loss": 0.9188, + "step": 681 + }, + { + "epoch": 0.24904144604710607, + "grad_norm": 1.1274373531341553, + "learning_rate": 2.489051094890511e-05, + "loss": 0.8969, + "step": 682 + }, + { + "epoch": 0.24940660945773233, + "grad_norm": 0.9775779247283936, + "learning_rate": 2.4927007299270075e-05, + "loss": 0.8766, + "step": 683 + }, + { + "epoch": 0.24977177286835858, + "grad_norm": 0.8695776462554932, + "learning_rate": 2.4963503649635037e-05, + "loss": 0.9164, + "step": 684 + }, + { + "epoch": 0.25013693627898487, + "grad_norm": 0.6799452900886536, + "learning_rate": 2.5e-05, + "loss": 0.8776, + "step": 685 + }, + { + "epoch": 0.2505020996896111, + "grad_norm": 0.9650082588195801, + "learning_rate": 2.5036496350364966e-05, + "loss": 0.9207, + "step": 686 + }, + { + "epoch": 0.2508672631002374, + "grad_norm": 1.0292118787765503, + "learning_rate": 2.5072992700729927e-05, + "loss": 0.9307, + "step": 687 + }, + { + "epoch": 0.25123242651086364, + "grad_norm": 1.9373090267181396, + "learning_rate": 2.510948905109489e-05, + "loss": 0.8939, + "step": 688 + }, + { + "epoch": 0.25159758992148984, + "grad_norm": 1.2657979726791382, + "learning_rate": 2.514598540145986e-05, + "loss": 0.8967, + "step": 689 + }, + { + "epoch": 0.2519627533321161, + "grad_norm": 0.9416601657867432, + "learning_rate": 2.5182481751824817e-05, + "loss": 0.8701, + "step": 690 + }, + { + "epoch": 0.25232791674274235, + "grad_norm": 1.0587490797042847, + "learning_rate": 2.5218978102189785e-05, + "loss": 0.9169, + "step": 691 + }, + { + "epoch": 0.2526930801533686, + "grad_norm": 0.8510803580284119, + "learning_rate": 2.5255474452554743e-05, + "loss": 0.9108, + "step": 692 + }, + { + "epoch": 0.25305824356399487, + "grad_norm": 0.8927858471870422, + "learning_rate": 2.529197080291971e-05, + "loss": 0.8817, + "step": 693 + }, + { + "epoch": 0.2534234069746211, + "grad_norm": 1.1475845575332642, + "learning_rate": 2.5328467153284675e-05, + "loss": 0.8895, + "step": 694 + }, + { + "epoch": 0.2537885703852474, + "grad_norm": 1.0652403831481934, + "learning_rate": 2.5364963503649637e-05, + "loss": 0.8901, + "step": 695 + }, + { + "epoch": 0.25415373379587364, + "grad_norm": 0.7411143779754639, + "learning_rate": 2.54014598540146e-05, + "loss": 0.8696, + "step": 696 + }, + { + "epoch": 0.2545188972064999, + "grad_norm": 0.870008111000061, + "learning_rate": 2.5437956204379566e-05, + "loss": 0.8942, + "step": 697 + }, + { + "epoch": 0.25488406061712615, + "grad_norm": 0.9908129572868347, + "learning_rate": 2.5474452554744527e-05, + "loss": 0.8719, + "step": 698 + }, + { + "epoch": 0.2552492240277524, + "grad_norm": 2.048208475112915, + "learning_rate": 2.551094890510949e-05, + "loss": 0.9009, + "step": 699 + }, + { + "epoch": 0.25561438743837867, + "grad_norm": 0.8549103140830994, + "learning_rate": 2.5547445255474456e-05, + "loss": 0.8794, + "step": 700 + }, + { + "epoch": 0.2559795508490049, + "grad_norm": 1.0178098678588867, + "learning_rate": 2.5583941605839417e-05, + "loss": 0.8398, + "step": 701 + }, + { + "epoch": 0.2563447142596312, + "grad_norm": 0.9158284068107605, + "learning_rate": 2.5620437956204382e-05, + "loss": 0.8771, + "step": 702 + }, + { + "epoch": 0.25670987767025744, + "grad_norm": 0.9023451805114746, + "learning_rate": 2.5656934306569343e-05, + "loss": 0.907, + "step": 703 + }, + { + "epoch": 0.2570750410808837, + "grad_norm": 1.163403868675232, + "learning_rate": 2.5693430656934308e-05, + "loss": 0.8936, + "step": 704 + }, + { + "epoch": 0.25744020449150995, + "grad_norm": 1.1924737691879272, + "learning_rate": 2.5729927007299272e-05, + "loss": 0.9314, + "step": 705 + }, + { + "epoch": 0.2578053679021362, + "grad_norm": 1.0279207229614258, + "learning_rate": 2.5766423357664233e-05, + "loss": 0.8962, + "step": 706 + }, + { + "epoch": 0.25817053131276246, + "grad_norm": 1.5878379344940186, + "learning_rate": 2.5802919708029198e-05, + "loss": 0.9159, + "step": 707 + }, + { + "epoch": 0.2585356947233887, + "grad_norm": 0.899364709854126, + "learning_rate": 2.5839416058394166e-05, + "loss": 0.8892, + "step": 708 + }, + { + "epoch": 0.258900858134015, + "grad_norm": 1.147362470626831, + "learning_rate": 2.5875912408759124e-05, + "loss": 0.8831, + "step": 709 + }, + { + "epoch": 0.25926602154464123, + "grad_norm": 1.191892385482788, + "learning_rate": 2.591240875912409e-05, + "loss": 0.9058, + "step": 710 + }, + { + "epoch": 0.2596311849552675, + "grad_norm": 0.8700737357139587, + "learning_rate": 2.5948905109489056e-05, + "loss": 0.8944, + "step": 711 + }, + { + "epoch": 0.25999634836589375, + "grad_norm": 0.8057035803794861, + "learning_rate": 2.5985401459854017e-05, + "loss": 0.9044, + "step": 712 + }, + { + "epoch": 0.26036151177652, + "grad_norm": 1.1704809665679932, + "learning_rate": 2.6021897810218982e-05, + "loss": 0.8702, + "step": 713 + }, + { + "epoch": 0.26072667518714626, + "grad_norm": 1.2193748950958252, + "learning_rate": 2.6058394160583943e-05, + "loss": 0.8966, + "step": 714 + }, + { + "epoch": 0.2610918385977725, + "grad_norm": 1.676171898841858, + "learning_rate": 2.6094890510948908e-05, + "loss": 0.8789, + "step": 715 + }, + { + "epoch": 0.2614570020083988, + "grad_norm": 1.167014241218567, + "learning_rate": 2.6131386861313872e-05, + "loss": 0.9003, + "step": 716 + }, + { + "epoch": 0.26182216541902503, + "grad_norm": 1.1899787187576294, + "learning_rate": 2.6167883211678833e-05, + "loss": 0.8865, + "step": 717 + }, + { + "epoch": 0.2621873288296513, + "grad_norm": 1.2309730052947998, + "learning_rate": 2.6204379562043798e-05, + "loss": 0.8909, + "step": 718 + }, + { + "epoch": 0.26255249224027755, + "grad_norm": 1.043740153312683, + "learning_rate": 2.6240875912408762e-05, + "loss": 0.8604, + "step": 719 + }, + { + "epoch": 0.2629176556509038, + "grad_norm": 0.9882286190986633, + "learning_rate": 2.6277372262773724e-05, + "loss": 0.8354, + "step": 720 + }, + { + "epoch": 0.26328281906153006, + "grad_norm": 1.0266289710998535, + "learning_rate": 2.6313868613138688e-05, + "loss": 0.8959, + "step": 721 + }, + { + "epoch": 0.2636479824721563, + "grad_norm": 1.2218525409698486, + "learning_rate": 2.635036496350365e-05, + "loss": 0.9207, + "step": 722 + }, + { + "epoch": 0.2640131458827825, + "grad_norm": 1.0779708623886108, + "learning_rate": 2.6386861313868614e-05, + "loss": 0.8759, + "step": 723 + }, + { + "epoch": 0.2643783092934088, + "grad_norm": 1.7302335500717163, + "learning_rate": 2.642335766423358e-05, + "loss": 0.8741, + "step": 724 + }, + { + "epoch": 0.26474347270403503, + "grad_norm": 1.0769745111465454, + "learning_rate": 2.645985401459854e-05, + "loss": 0.865, + "step": 725 + }, + { + "epoch": 0.2651086361146613, + "grad_norm": 1.138060212135315, + "learning_rate": 2.6496350364963504e-05, + "loss": 0.8702, + "step": 726 + }, + { + "epoch": 0.26547379952528755, + "grad_norm": 0.9084373116493225, + "learning_rate": 2.6532846715328472e-05, + "loss": 0.8986, + "step": 727 + }, + { + "epoch": 0.2658389629359138, + "grad_norm": 0.7882066369056702, + "learning_rate": 2.656934306569343e-05, + "loss": 0.8773, + "step": 728 + }, + { + "epoch": 0.26620412634654006, + "grad_norm": 0.9498522281646729, + "learning_rate": 2.6605839416058398e-05, + "loss": 0.8704, + "step": 729 + }, + { + "epoch": 0.2665692897571663, + "grad_norm": 0.8272795081138611, + "learning_rate": 2.6642335766423363e-05, + "loss": 0.8909, + "step": 730 + }, + { + "epoch": 0.2669344531677926, + "grad_norm": 0.9073951244354248, + "learning_rate": 2.6678832116788324e-05, + "loss": 0.8344, + "step": 731 + }, + { + "epoch": 0.26729961657841883, + "grad_norm": 1.3860666751861572, + "learning_rate": 2.6715328467153288e-05, + "loss": 0.8937, + "step": 732 + }, + { + "epoch": 0.2676647799890451, + "grad_norm": 0.8217914700508118, + "learning_rate": 2.675182481751825e-05, + "loss": 0.8906, + "step": 733 + }, + { + "epoch": 0.26802994339967134, + "grad_norm": 0.9651821851730347, + "learning_rate": 2.6788321167883214e-05, + "loss": 0.9131, + "step": 734 + }, + { + "epoch": 0.2683951068102976, + "grad_norm": 0.8943136930465698, + "learning_rate": 2.682481751824818e-05, + "loss": 0.887, + "step": 735 + }, + { + "epoch": 0.26876027022092386, + "grad_norm": 0.8883320093154907, + "learning_rate": 2.686131386861314e-05, + "loss": 0.8606, + "step": 736 + }, + { + "epoch": 0.2691254336315501, + "grad_norm": 1.6274850368499756, + "learning_rate": 2.6897810218978104e-05, + "loss": 0.8727, + "step": 737 + }, + { + "epoch": 0.26949059704217637, + "grad_norm": 0.9518415927886963, + "learning_rate": 2.693430656934307e-05, + "loss": 0.8876, + "step": 738 + }, + { + "epoch": 0.26985576045280263, + "grad_norm": 0.7219151854515076, + "learning_rate": 2.697080291970803e-05, + "loss": 0.8413, + "step": 739 + }, + { + "epoch": 0.2702209238634289, + "grad_norm": 0.9340730905532837, + "learning_rate": 2.7007299270072995e-05, + "loss": 0.8448, + "step": 740 + }, + { + "epoch": 0.27058608727405514, + "grad_norm": 0.9046608209609985, + "learning_rate": 2.704379562043796e-05, + "loss": 0.8837, + "step": 741 + }, + { + "epoch": 0.2709512506846814, + "grad_norm": 1.0275894403457642, + "learning_rate": 2.708029197080292e-05, + "loss": 0.8754, + "step": 742 + }, + { + "epoch": 0.27131641409530766, + "grad_norm": 0.8216059803962708, + "learning_rate": 2.7116788321167885e-05, + "loss": 0.8639, + "step": 743 + }, + { + "epoch": 0.2716815775059339, + "grad_norm": 1.0811870098114014, + "learning_rate": 2.7153284671532846e-05, + "loss": 0.8712, + "step": 744 + }, + { + "epoch": 0.27204674091656017, + "grad_norm": 0.9865444302558899, + "learning_rate": 2.718978102189781e-05, + "loss": 0.8939, + "step": 745 + }, + { + "epoch": 0.2724119043271864, + "grad_norm": 1.233981966972351, + "learning_rate": 2.722627737226278e-05, + "loss": 0.8718, + "step": 746 + }, + { + "epoch": 0.2727770677378127, + "grad_norm": 0.8024433851242065, + "learning_rate": 2.7262773722627736e-05, + "loss": 0.8561, + "step": 747 + }, + { + "epoch": 0.27314223114843894, + "grad_norm": 0.9161458611488342, + "learning_rate": 2.7299270072992704e-05, + "loss": 0.8787, + "step": 748 + }, + { + "epoch": 0.2735073945590652, + "grad_norm": 0.9415838122367859, + "learning_rate": 2.733576642335767e-05, + "loss": 0.8699, + "step": 749 + }, + { + "epoch": 0.27387255796969145, + "grad_norm": 1.0449683666229248, + "learning_rate": 2.737226277372263e-05, + "loss": 0.859, + "step": 750 + }, + { + "epoch": 0.2742377213803177, + "grad_norm": 0.8127017617225647, + "learning_rate": 2.7408759124087595e-05, + "loss": 0.8369, + "step": 751 + }, + { + "epoch": 0.27460288479094397, + "grad_norm": 0.7522885203361511, + "learning_rate": 2.744525547445256e-05, + "loss": 0.8561, + "step": 752 + }, + { + "epoch": 0.2749680482015702, + "grad_norm": 1.016823410987854, + "learning_rate": 2.748175182481752e-05, + "loss": 0.8719, + "step": 753 + }, + { + "epoch": 0.2753332116121965, + "grad_norm": 1.4573849439620972, + "learning_rate": 2.7518248175182485e-05, + "loss": 0.8424, + "step": 754 + }, + { + "epoch": 0.27569837502282274, + "grad_norm": 0.8680856227874756, + "learning_rate": 2.7554744525547446e-05, + "loss": 0.8884, + "step": 755 + }, + { + "epoch": 0.276063538433449, + "grad_norm": 1.0039207935333252, + "learning_rate": 2.759124087591241e-05, + "loss": 0.8804, + "step": 756 + }, + { + "epoch": 0.2764287018440752, + "grad_norm": 1.1460931301116943, + "learning_rate": 2.7627737226277375e-05, + "loss": 0.8829, + "step": 757 + }, + { + "epoch": 0.27679386525470145, + "grad_norm": 0.9026143550872803, + "learning_rate": 2.7664233576642336e-05, + "loss": 0.8467, + "step": 758 + }, + { + "epoch": 0.2771590286653277, + "grad_norm": 1.1995723247528076, + "learning_rate": 2.77007299270073e-05, + "loss": 0.8937, + "step": 759 + }, + { + "epoch": 0.27752419207595397, + "grad_norm": 0.8786799311637878, + "learning_rate": 2.7737226277372266e-05, + "loss": 0.8345, + "step": 760 + }, + { + "epoch": 0.2778893554865802, + "grad_norm": 1.2147480249404907, + "learning_rate": 2.7773722627737227e-05, + "loss": 0.8653, + "step": 761 + }, + { + "epoch": 0.2782545188972065, + "grad_norm": 1.0637128353118896, + "learning_rate": 2.781021897810219e-05, + "loss": 0.8877, + "step": 762 + }, + { + "epoch": 0.27861968230783274, + "grad_norm": 1.7768861055374146, + "learning_rate": 2.784671532846716e-05, + "loss": 0.8447, + "step": 763 + }, + { + "epoch": 0.278984845718459, + "grad_norm": 0.7212632894515991, + "learning_rate": 2.7883211678832117e-05, + "loss": 0.8678, + "step": 764 + }, + { + "epoch": 0.27935000912908525, + "grad_norm": 1.1126927137374878, + "learning_rate": 2.7919708029197085e-05, + "loss": 0.9049, + "step": 765 + }, + { + "epoch": 0.2797151725397115, + "grad_norm": 0.9064791798591614, + "learning_rate": 2.7956204379562043e-05, + "loss": 0.8501, + "step": 766 + }, + { + "epoch": 0.28008033595033777, + "grad_norm": 0.9776251912117004, + "learning_rate": 2.799270072992701e-05, + "loss": 0.8862, + "step": 767 + }, + { + "epoch": 0.280445499360964, + "grad_norm": 1.2621984481811523, + "learning_rate": 2.8029197080291975e-05, + "loss": 0.8602, + "step": 768 + }, + { + "epoch": 0.2808106627715903, + "grad_norm": 0.816348671913147, + "learning_rate": 2.8065693430656936e-05, + "loss": 0.8169, + "step": 769 + }, + { + "epoch": 0.28117582618221654, + "grad_norm": 1.1793899536132812, + "learning_rate": 2.81021897810219e-05, + "loss": 0.8347, + "step": 770 + }, + { + "epoch": 0.2815409895928428, + "grad_norm": 0.8727941513061523, + "learning_rate": 2.8138686131386866e-05, + "loss": 0.8544, + "step": 771 + }, + { + "epoch": 0.28190615300346905, + "grad_norm": 1.2146086692810059, + "learning_rate": 2.8175182481751827e-05, + "loss": 0.8943, + "step": 772 + }, + { + "epoch": 0.2822713164140953, + "grad_norm": 0.974018931388855, + "learning_rate": 2.821167883211679e-05, + "loss": 0.8328, + "step": 773 + }, + { + "epoch": 0.28263647982472156, + "grad_norm": 1.0131561756134033, + "learning_rate": 2.8248175182481756e-05, + "loss": 0.8574, + "step": 774 + }, + { + "epoch": 0.2830016432353478, + "grad_norm": 1.2922457456588745, + "learning_rate": 2.8284671532846717e-05, + "loss": 0.8866, + "step": 775 + }, + { + "epoch": 0.2833668066459741, + "grad_norm": 1.044386863708496, + "learning_rate": 2.832116788321168e-05, + "loss": 0.8892, + "step": 776 + }, + { + "epoch": 0.28373197005660034, + "grad_norm": 0.9163618087768555, + "learning_rate": 2.8357664233576643e-05, + "loss": 0.8468, + "step": 777 + }, + { + "epoch": 0.2840971334672266, + "grad_norm": 0.8878577947616577, + "learning_rate": 2.8394160583941607e-05, + "loss": 0.8496, + "step": 778 + }, + { + "epoch": 0.28446229687785285, + "grad_norm": 1.042579174041748, + "learning_rate": 2.8430656934306572e-05, + "loss": 0.8405, + "step": 779 + }, + { + "epoch": 0.2848274602884791, + "grad_norm": 1.7790285348892212, + "learning_rate": 2.8467153284671533e-05, + "loss": 0.8541, + "step": 780 + }, + { + "epoch": 0.28519262369910536, + "grad_norm": 1.053520679473877, + "learning_rate": 2.8503649635036498e-05, + "loss": 0.8567, + "step": 781 + }, + { + "epoch": 0.2855577871097316, + "grad_norm": 0.8160289525985718, + "learning_rate": 2.8540145985401466e-05, + "loss": 0.8691, + "step": 782 + }, + { + "epoch": 0.2859229505203579, + "grad_norm": 0.8448201417922974, + "learning_rate": 2.8576642335766423e-05, + "loss": 0.8516, + "step": 783 + }, + { + "epoch": 0.28628811393098413, + "grad_norm": 1.0939788818359375, + "learning_rate": 2.861313868613139e-05, + "loss": 0.8488, + "step": 784 + }, + { + "epoch": 0.2866532773416104, + "grad_norm": 0.9668917059898376, + "learning_rate": 2.8649635036496356e-05, + "loss": 0.8357, + "step": 785 + }, + { + "epoch": 0.28701844075223665, + "grad_norm": 1.2287737131118774, + "learning_rate": 2.8686131386861317e-05, + "loss": 0.8245, + "step": 786 + }, + { + "epoch": 0.2873836041628629, + "grad_norm": 0.9395943284034729, + "learning_rate": 2.872262773722628e-05, + "loss": 0.8193, + "step": 787 + }, + { + "epoch": 0.28774876757348916, + "grad_norm": 0.8268449902534485, + "learning_rate": 2.8759124087591243e-05, + "loss": 0.8689, + "step": 788 + }, + { + "epoch": 0.2881139309841154, + "grad_norm": 1.159216284751892, + "learning_rate": 2.8795620437956207e-05, + "loss": 0.8751, + "step": 789 + }, + { + "epoch": 0.2884790943947416, + "grad_norm": 0.7923019528388977, + "learning_rate": 2.8832116788321172e-05, + "loss": 0.873, + "step": 790 + }, + { + "epoch": 0.2888442578053679, + "grad_norm": 0.8604425191879272, + "learning_rate": 2.8868613138686133e-05, + "loss": 0.8672, + "step": 791 + }, + { + "epoch": 0.28920942121599413, + "grad_norm": 1.0001811981201172, + "learning_rate": 2.8905109489051098e-05, + "loss": 0.827, + "step": 792 + }, + { + "epoch": 0.2895745846266204, + "grad_norm": 0.8650938272476196, + "learning_rate": 2.8941605839416062e-05, + "loss": 0.8575, + "step": 793 + }, + { + "epoch": 0.28993974803724665, + "grad_norm": 0.9869515895843506, + "learning_rate": 2.8978102189781023e-05, + "loss": 0.8517, + "step": 794 + }, + { + "epoch": 0.2903049114478729, + "grad_norm": 1.1096405982971191, + "learning_rate": 2.9014598540145988e-05, + "loss": 0.8355, + "step": 795 + }, + { + "epoch": 0.29067007485849916, + "grad_norm": 1.2804101705551147, + "learning_rate": 2.905109489051095e-05, + "loss": 0.8984, + "step": 796 + }, + { + "epoch": 0.2910352382691254, + "grad_norm": 1.031632423400879, + "learning_rate": 2.9087591240875914e-05, + "loss": 0.8362, + "step": 797 + }, + { + "epoch": 0.2914004016797517, + "grad_norm": 1.4014887809753418, + "learning_rate": 2.912408759124088e-05, + "loss": 0.8417, + "step": 798 + }, + { + "epoch": 0.29176556509037793, + "grad_norm": 0.8653863072395325, + "learning_rate": 2.916058394160584e-05, + "loss": 0.8671, + "step": 799 + }, + { + "epoch": 0.2921307285010042, + "grad_norm": 1.3845256567001343, + "learning_rate": 2.9197080291970804e-05, + "loss": 0.8916, + "step": 800 + }, + { + "epoch": 0.29249589191163045, + "grad_norm": 0.8273415565490723, + "learning_rate": 2.923357664233577e-05, + "loss": 0.8458, + "step": 801 + }, + { + "epoch": 0.2928610553222567, + "grad_norm": 0.8840750455856323, + "learning_rate": 2.927007299270073e-05, + "loss": 0.8491, + "step": 802 + }, + { + "epoch": 0.29322621873288296, + "grad_norm": 0.972186267375946, + "learning_rate": 2.9306569343065698e-05, + "loss": 0.83, + "step": 803 + }, + { + "epoch": 0.2935913821435092, + "grad_norm": 1.119389533996582, + "learning_rate": 2.9343065693430662e-05, + "loss": 0.8501, + "step": 804 + }, + { + "epoch": 0.2939565455541355, + "grad_norm": 0.8941442966461182, + "learning_rate": 2.9379562043795624e-05, + "loss": 0.8246, + "step": 805 + }, + { + "epoch": 0.29432170896476173, + "grad_norm": 0.853405237197876, + "learning_rate": 2.9416058394160588e-05, + "loss": 0.8748, + "step": 806 + }, + { + "epoch": 0.294686872375388, + "grad_norm": 1.1405614614486694, + "learning_rate": 2.945255474452555e-05, + "loss": 0.8303, + "step": 807 + }, + { + "epoch": 0.29505203578601424, + "grad_norm": 1.0696759223937988, + "learning_rate": 2.9489051094890514e-05, + "loss": 0.8627, + "step": 808 + }, + { + "epoch": 0.2954171991966405, + "grad_norm": 1.1660170555114746, + "learning_rate": 2.952554744525548e-05, + "loss": 0.8439, + "step": 809 + }, + { + "epoch": 0.29578236260726676, + "grad_norm": 1.073717713356018, + "learning_rate": 2.956204379562044e-05, + "loss": 0.8474, + "step": 810 + }, + { + "epoch": 0.296147526017893, + "grad_norm": 0.9319285750389099, + "learning_rate": 2.9598540145985404e-05, + "loss": 0.844, + "step": 811 + }, + { + "epoch": 0.29651268942851927, + "grad_norm": 1.0393621921539307, + "learning_rate": 2.963503649635037e-05, + "loss": 0.853, + "step": 812 + }, + { + "epoch": 0.29687785283914553, + "grad_norm": 1.3637595176696777, + "learning_rate": 2.967153284671533e-05, + "loss": 0.9049, + "step": 813 + }, + { + "epoch": 0.2972430162497718, + "grad_norm": 1.4779207706451416, + "learning_rate": 2.9708029197080294e-05, + "loss": 0.826, + "step": 814 + }, + { + "epoch": 0.29760817966039804, + "grad_norm": 0.811496913433075, + "learning_rate": 2.974452554744526e-05, + "loss": 0.8352, + "step": 815 + }, + { + "epoch": 0.2979733430710243, + "grad_norm": 1.0108301639556885, + "learning_rate": 2.978102189781022e-05, + "loss": 0.851, + "step": 816 + }, + { + "epoch": 0.29833850648165056, + "grad_norm": 1.167640209197998, + "learning_rate": 2.9817518248175185e-05, + "loss": 0.8551, + "step": 817 + }, + { + "epoch": 0.2987036698922768, + "grad_norm": 1.0643702745437622, + "learning_rate": 2.9854014598540146e-05, + "loss": 0.8496, + "step": 818 + }, + { + "epoch": 0.29906883330290307, + "grad_norm": 0.986891508102417, + "learning_rate": 2.989051094890511e-05, + "loss": 0.8589, + "step": 819 + }, + { + "epoch": 0.2994339967135293, + "grad_norm": 1.0008232593536377, + "learning_rate": 2.9927007299270075e-05, + "loss": 0.8278, + "step": 820 + }, + { + "epoch": 0.2997991601241556, + "grad_norm": 1.130242109298706, + "learning_rate": 2.9963503649635036e-05, + "loss": 0.8522, + "step": 821 + }, + { + "epoch": 0.30016432353478184, + "grad_norm": 1.1431291103363037, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.8372, + "step": 822 + }, + { + "epoch": 0.3005294869454081, + "grad_norm": 1.3566477298736572, + "learning_rate": 3.003649635036497e-05, + "loss": 0.8336, + "step": 823 + }, + { + "epoch": 0.3008946503560343, + "grad_norm": 0.8847786784172058, + "learning_rate": 3.007299270072993e-05, + "loss": 0.8261, + "step": 824 + }, + { + "epoch": 0.30125981376666056, + "grad_norm": 1.1674613952636719, + "learning_rate": 3.0109489051094894e-05, + "loss": 0.8665, + "step": 825 + }, + { + "epoch": 0.3016249771772868, + "grad_norm": 1.2328945398330688, + "learning_rate": 3.014598540145986e-05, + "loss": 0.8342, + "step": 826 + }, + { + "epoch": 0.30199014058791307, + "grad_norm": 1.004921317100525, + "learning_rate": 3.018248175182482e-05, + "loss": 0.8542, + "step": 827 + }, + { + "epoch": 0.3023553039985393, + "grad_norm": 0.9154124855995178, + "learning_rate": 3.0218978102189785e-05, + "loss": 0.8376, + "step": 828 + }, + { + "epoch": 0.3027204674091656, + "grad_norm": 0.9534822106361389, + "learning_rate": 3.0255474452554746e-05, + "loss": 0.8693, + "step": 829 + }, + { + "epoch": 0.30308563081979184, + "grad_norm": 0.9566801190376282, + "learning_rate": 3.029197080291971e-05, + "loss": 0.8574, + "step": 830 + }, + { + "epoch": 0.3034507942304181, + "grad_norm": 0.9584648013114929, + "learning_rate": 3.0328467153284675e-05, + "loss": 0.8534, + "step": 831 + }, + { + "epoch": 0.30381595764104435, + "grad_norm": 1.7552622556686401, + "learning_rate": 3.0364963503649636e-05, + "loss": 0.8157, + "step": 832 + }, + { + "epoch": 0.3041811210516706, + "grad_norm": 0.8994453549385071, + "learning_rate": 3.04014598540146e-05, + "loss": 0.8274, + "step": 833 + }, + { + "epoch": 0.30454628446229687, + "grad_norm": 0.6974275708198547, + "learning_rate": 3.0437956204379565e-05, + "loss": 0.804, + "step": 834 + }, + { + "epoch": 0.3049114478729231, + "grad_norm": 0.868697464466095, + "learning_rate": 3.0474452554744527e-05, + "loss": 0.8411, + "step": 835 + }, + { + "epoch": 0.3052766112835494, + "grad_norm": 0.9820177555084229, + "learning_rate": 3.051094890510949e-05, + "loss": 0.8455, + "step": 836 + }, + { + "epoch": 0.30564177469417564, + "grad_norm": 0.7563433647155762, + "learning_rate": 3.0547445255474456e-05, + "loss": 0.8085, + "step": 837 + }, + { + "epoch": 0.3060069381048019, + "grad_norm": 1.0138208866119385, + "learning_rate": 3.058394160583942e-05, + "loss": 0.8531, + "step": 838 + }, + { + "epoch": 0.30637210151542815, + "grad_norm": 0.8455662727355957, + "learning_rate": 3.0620437956204385e-05, + "loss": 0.849, + "step": 839 + }, + { + "epoch": 0.3067372649260544, + "grad_norm": 1.0412776470184326, + "learning_rate": 3.0656934306569346e-05, + "loss": 0.817, + "step": 840 + }, + { + "epoch": 0.30710242833668067, + "grad_norm": 0.8895344734191895, + "learning_rate": 3.069343065693431e-05, + "loss": 0.8472, + "step": 841 + }, + { + "epoch": 0.3074675917473069, + "grad_norm": 1.1234147548675537, + "learning_rate": 3.0729927007299275e-05, + "loss": 0.819, + "step": 842 + }, + { + "epoch": 0.3078327551579332, + "grad_norm": 1.1289489269256592, + "learning_rate": 3.0766423357664236e-05, + "loss": 0.8502, + "step": 843 + }, + { + "epoch": 0.30819791856855944, + "grad_norm": 0.849317729473114, + "learning_rate": 3.08029197080292e-05, + "loss": 0.8062, + "step": 844 + }, + { + "epoch": 0.3085630819791857, + "grad_norm": 0.9962231516838074, + "learning_rate": 3.0839416058394165e-05, + "loss": 0.8438, + "step": 845 + }, + { + "epoch": 0.30892824538981195, + "grad_norm": 1.1971230506896973, + "learning_rate": 3.0875912408759127e-05, + "loss": 0.8685, + "step": 846 + }, + { + "epoch": 0.3092934088004382, + "grad_norm": 1.1716772317886353, + "learning_rate": 3.091240875912409e-05, + "loss": 0.8779, + "step": 847 + }, + { + "epoch": 0.30965857221106446, + "grad_norm": 0.777720034122467, + "learning_rate": 3.0948905109489056e-05, + "loss": 0.8011, + "step": 848 + }, + { + "epoch": 0.3100237356216907, + "grad_norm": 1.3869441747665405, + "learning_rate": 3.098540145985402e-05, + "loss": 0.8271, + "step": 849 + }, + { + "epoch": 0.310388899032317, + "grad_norm": 1.0939347743988037, + "learning_rate": 3.1021897810218985e-05, + "loss": 0.8478, + "step": 850 + }, + { + "epoch": 0.31075406244294324, + "grad_norm": 0.7109103202819824, + "learning_rate": 3.105839416058394e-05, + "loss": 0.8158, + "step": 851 + }, + { + "epoch": 0.3111192258535695, + "grad_norm": 1.3487030267715454, + "learning_rate": 3.109489051094891e-05, + "loss": 0.8251, + "step": 852 + }, + { + "epoch": 0.31148438926419575, + "grad_norm": 1.0756982564926147, + "learning_rate": 3.1131386861313875e-05, + "loss": 0.8522, + "step": 853 + }, + { + "epoch": 0.311849552674822, + "grad_norm": 1.2139660120010376, + "learning_rate": 3.1167883211678836e-05, + "loss": 0.8373, + "step": 854 + }, + { + "epoch": 0.31221471608544826, + "grad_norm": 1.1184643507003784, + "learning_rate": 3.12043795620438e-05, + "loss": 0.8976, + "step": 855 + }, + { + "epoch": 0.3125798794960745, + "grad_norm": 0.9936229586601257, + "learning_rate": 3.1240875912408765e-05, + "loss": 0.8284, + "step": 856 + }, + { + "epoch": 0.3129450429067007, + "grad_norm": 0.8024868369102478, + "learning_rate": 3.127737226277373e-05, + "loss": 0.8251, + "step": 857 + }, + { + "epoch": 0.313310206317327, + "grad_norm": 0.8595249652862549, + "learning_rate": 3.131386861313869e-05, + "loss": 0.7964, + "step": 858 + }, + { + "epoch": 0.31367536972795323, + "grad_norm": 0.880524218082428, + "learning_rate": 3.135036496350365e-05, + "loss": 0.8198, + "step": 859 + }, + { + "epoch": 0.3140405331385795, + "grad_norm": 1.0284209251403809, + "learning_rate": 3.138686131386862e-05, + "loss": 0.8179, + "step": 860 + }, + { + "epoch": 0.31440569654920575, + "grad_norm": 1.0328218936920166, + "learning_rate": 3.142335766423358e-05, + "loss": 0.8708, + "step": 861 + }, + { + "epoch": 0.314770859959832, + "grad_norm": 0.9691762328147888, + "learning_rate": 3.145985401459854e-05, + "loss": 0.7798, + "step": 862 + }, + { + "epoch": 0.31513602337045826, + "grad_norm": 1.3719781637191772, + "learning_rate": 3.149635036496351e-05, + "loss": 0.7811, + "step": 863 + }, + { + "epoch": 0.3155011867810845, + "grad_norm": 0.784748911857605, + "learning_rate": 3.153284671532847e-05, + "loss": 0.8466, + "step": 864 + }, + { + "epoch": 0.3158663501917108, + "grad_norm": 1.022679090499878, + "learning_rate": 3.156934306569343e-05, + "loss": 0.8276, + "step": 865 + }, + { + "epoch": 0.31623151360233703, + "grad_norm": 0.9829046726226807, + "learning_rate": 3.16058394160584e-05, + "loss": 0.8263, + "step": 866 + }, + { + "epoch": 0.3165966770129633, + "grad_norm": 0.8736450672149658, + "learning_rate": 3.1642335766423365e-05, + "loss": 0.8392, + "step": 867 + }, + { + "epoch": 0.31696184042358955, + "grad_norm": 0.9354644417762756, + "learning_rate": 3.167883211678832e-05, + "loss": 0.8011, + "step": 868 + }, + { + "epoch": 0.3173270038342158, + "grad_norm": 1.0501866340637207, + "learning_rate": 3.171532846715329e-05, + "loss": 0.8572, + "step": 869 + }, + { + "epoch": 0.31769216724484206, + "grad_norm": 0.9882240295410156, + "learning_rate": 3.175182481751825e-05, + "loss": 0.8113, + "step": 870 + }, + { + "epoch": 0.3180573306554683, + "grad_norm": 0.8495408296585083, + "learning_rate": 3.178832116788322e-05, + "loss": 0.8181, + "step": 871 + }, + { + "epoch": 0.3184224940660946, + "grad_norm": 1.4235453605651855, + "learning_rate": 3.182481751824818e-05, + "loss": 0.8707, + "step": 872 + }, + { + "epoch": 0.31878765747672083, + "grad_norm": 1.308279037475586, + "learning_rate": 3.186131386861314e-05, + "loss": 0.8494, + "step": 873 + }, + { + "epoch": 0.3191528208873471, + "grad_norm": 1.1783095598220825, + "learning_rate": 3.189781021897811e-05, + "loss": 0.8304, + "step": 874 + }, + { + "epoch": 0.31951798429797335, + "grad_norm": 0.6439776420593262, + "learning_rate": 3.193430656934307e-05, + "loss": 0.8052, + "step": 875 + }, + { + "epoch": 0.3198831477085996, + "grad_norm": 0.74483323097229, + "learning_rate": 3.197080291970803e-05, + "loss": 0.8425, + "step": 876 + }, + { + "epoch": 0.32024831111922586, + "grad_norm": 1.0718963146209717, + "learning_rate": 3.2007299270073e-05, + "loss": 0.8292, + "step": 877 + }, + { + "epoch": 0.3206134745298521, + "grad_norm": 1.2955678701400757, + "learning_rate": 3.204379562043796e-05, + "loss": 0.8169, + "step": 878 + }, + { + "epoch": 0.3209786379404784, + "grad_norm": 1.3649272918701172, + "learning_rate": 3.208029197080292e-05, + "loss": 0.8435, + "step": 879 + }, + { + "epoch": 0.32134380135110463, + "grad_norm": 0.8959698677062988, + "learning_rate": 3.211678832116789e-05, + "loss": 0.8048, + "step": 880 + }, + { + "epoch": 0.3217089647617309, + "grad_norm": 1.2295029163360596, + "learning_rate": 3.215328467153285e-05, + "loss": 0.833, + "step": 881 + }, + { + "epoch": 0.32207412817235714, + "grad_norm": 1.0183993577957153, + "learning_rate": 3.218978102189781e-05, + "loss": 0.8398, + "step": 882 + }, + { + "epoch": 0.3224392915829834, + "grad_norm": 0.7791547179222107, + "learning_rate": 3.222627737226278e-05, + "loss": 0.8392, + "step": 883 + }, + { + "epoch": 0.32280445499360966, + "grad_norm": 2.3649449348449707, + "learning_rate": 3.226277372262774e-05, + "loss": 0.8372, + "step": 884 + }, + { + "epoch": 0.3231696184042359, + "grad_norm": 1.179885983467102, + "learning_rate": 3.22992700729927e-05, + "loss": 0.8252, + "step": 885 + }, + { + "epoch": 0.32353478181486217, + "grad_norm": 1.0518529415130615, + "learning_rate": 3.233576642335767e-05, + "loss": 0.8553, + "step": 886 + }, + { + "epoch": 0.32389994522548843, + "grad_norm": 0.9633598923683167, + "learning_rate": 3.237226277372263e-05, + "loss": 0.8253, + "step": 887 + }, + { + "epoch": 0.3242651086361147, + "grad_norm": 1.2298461198806763, + "learning_rate": 3.24087591240876e-05, + "loss": 0.8173, + "step": 888 + }, + { + "epoch": 0.32463027204674094, + "grad_norm": 0.9368358254432678, + "learning_rate": 3.244525547445256e-05, + "loss": 0.8084, + "step": 889 + }, + { + "epoch": 0.3249954354573672, + "grad_norm": 0.8894047141075134, + "learning_rate": 3.248175182481752e-05, + "loss": 0.8243, + "step": 890 + }, + { + "epoch": 0.3253605988679934, + "grad_norm": 0.9334021806716919, + "learning_rate": 3.251824817518249e-05, + "loss": 0.8069, + "step": 891 + }, + { + "epoch": 0.32572576227861966, + "grad_norm": 1.008568525314331, + "learning_rate": 3.255474452554745e-05, + "loss": 0.8408, + "step": 892 + }, + { + "epoch": 0.3260909256892459, + "grad_norm": 1.0324256420135498, + "learning_rate": 3.259124087591241e-05, + "loss": 0.8558, + "step": 893 + }, + { + "epoch": 0.32645608909987217, + "grad_norm": 2.0214602947235107, + "learning_rate": 3.262773722627738e-05, + "loss": 0.7908, + "step": 894 + }, + { + "epoch": 0.32682125251049843, + "grad_norm": 0.6830151677131653, + "learning_rate": 3.266423357664234e-05, + "loss": 0.8373, + "step": 895 + }, + { + "epoch": 0.3271864159211247, + "grad_norm": 1.074393391609192, + "learning_rate": 3.27007299270073e-05, + "loss": 0.8124, + "step": 896 + }, + { + "epoch": 0.32755157933175094, + "grad_norm": 0.845180869102478, + "learning_rate": 3.273722627737227e-05, + "loss": 0.832, + "step": 897 + }, + { + "epoch": 0.3279167427423772, + "grad_norm": 1.0447711944580078, + "learning_rate": 3.277372262773723e-05, + "loss": 0.8257, + "step": 898 + }, + { + "epoch": 0.32828190615300346, + "grad_norm": 0.8940461277961731, + "learning_rate": 3.281021897810219e-05, + "loss": 0.8353, + "step": 899 + }, + { + "epoch": 0.3286470695636297, + "grad_norm": 1.0841431617736816, + "learning_rate": 3.284671532846716e-05, + "loss": 0.8226, + "step": 900 + }, + { + "epoch": 0.32901223297425597, + "grad_norm": 0.9496214389801025, + "learning_rate": 3.288321167883212e-05, + "loss": 0.8202, + "step": 901 + }, + { + "epoch": 0.3293773963848822, + "grad_norm": 0.8948093056678772, + "learning_rate": 3.291970802919708e-05, + "loss": 0.8251, + "step": 902 + }, + { + "epoch": 0.3297425597955085, + "grad_norm": 1.312648892402649, + "learning_rate": 3.295620437956204e-05, + "loss": 0.8773, + "step": 903 + }, + { + "epoch": 0.33010772320613474, + "grad_norm": 1.1946377754211426, + "learning_rate": 3.299270072992701e-05, + "loss": 0.8223, + "step": 904 + }, + { + "epoch": 0.330472886616761, + "grad_norm": 0.9710689187049866, + "learning_rate": 3.302919708029197e-05, + "loss": 0.8355, + "step": 905 + }, + { + "epoch": 0.33083805002738725, + "grad_norm": 0.8350407481193542, + "learning_rate": 3.306569343065693e-05, + "loss": 0.8265, + "step": 906 + }, + { + "epoch": 0.3312032134380135, + "grad_norm": 1.2623652219772339, + "learning_rate": 3.31021897810219e-05, + "loss": 0.8221, + "step": 907 + }, + { + "epoch": 0.33156837684863977, + "grad_norm": 0.8959325551986694, + "learning_rate": 3.313868613138687e-05, + "loss": 0.8311, + "step": 908 + }, + { + "epoch": 0.331933540259266, + "grad_norm": 0.9143708944320679, + "learning_rate": 3.317518248175183e-05, + "loss": 0.8245, + "step": 909 + }, + { + "epoch": 0.3322987036698923, + "grad_norm": 1.5401510000228882, + "learning_rate": 3.321167883211679e-05, + "loss": 0.8101, + "step": 910 + }, + { + "epoch": 0.33266386708051854, + "grad_norm": 0.6910380721092224, + "learning_rate": 3.324817518248176e-05, + "loss": 0.8, + "step": 911 + }, + { + "epoch": 0.3330290304911448, + "grad_norm": 2.472301483154297, + "learning_rate": 3.328467153284672e-05, + "loss": 0.8077, + "step": 912 + }, + { + "epoch": 0.33339419390177105, + "grad_norm": 1.2135626077651978, + "learning_rate": 3.332116788321168e-05, + "loss": 0.8113, + "step": 913 + }, + { + "epoch": 0.3337593573123973, + "grad_norm": 1.107142448425293, + "learning_rate": 3.335766423357664e-05, + "loss": 0.843, + "step": 914 + }, + { + "epoch": 0.33412452072302357, + "grad_norm": 1.1342405080795288, + "learning_rate": 3.339416058394161e-05, + "loss": 0.8278, + "step": 915 + }, + { + "epoch": 0.3344896841336498, + "grad_norm": 0.953411877155304, + "learning_rate": 3.343065693430657e-05, + "loss": 0.8277, + "step": 916 + }, + { + "epoch": 0.3348548475442761, + "grad_norm": 0.9323768019676208, + "learning_rate": 3.346715328467153e-05, + "loss": 0.8081, + "step": 917 + }, + { + "epoch": 0.33522001095490234, + "grad_norm": 0.9772979021072388, + "learning_rate": 3.35036496350365e-05, + "loss": 0.8453, + "step": 918 + }, + { + "epoch": 0.3355851743655286, + "grad_norm": 1.1281118392944336, + "learning_rate": 3.354014598540146e-05, + "loss": 0.8509, + "step": 919 + }, + { + "epoch": 0.33595033777615485, + "grad_norm": 0.9218109846115112, + "learning_rate": 3.357664233576642e-05, + "loss": 0.8181, + "step": 920 + }, + { + "epoch": 0.3363155011867811, + "grad_norm": 1.230379581451416, + "learning_rate": 3.361313868613139e-05, + "loss": 0.8436, + "step": 921 + }, + { + "epoch": 0.33668066459740736, + "grad_norm": 1.3046516180038452, + "learning_rate": 3.364963503649635e-05, + "loss": 0.8668, + "step": 922 + }, + { + "epoch": 0.3370458280080336, + "grad_norm": 0.9936146140098572, + "learning_rate": 3.368613138686131e-05, + "loss": 0.8121, + "step": 923 + }, + { + "epoch": 0.3374109914186599, + "grad_norm": 0.7893944382667542, + "learning_rate": 3.372262773722628e-05, + "loss": 0.8416, + "step": 924 + }, + { + "epoch": 0.3377761548292861, + "grad_norm": 0.9127933382987976, + "learning_rate": 3.375912408759124e-05, + "loss": 0.8004, + "step": 925 + }, + { + "epoch": 0.33814131823991234, + "grad_norm": 0.8317862749099731, + "learning_rate": 3.3795620437956204e-05, + "loss": 0.8025, + "step": 926 + }, + { + "epoch": 0.3385064816505386, + "grad_norm": 0.8821642994880676, + "learning_rate": 3.383211678832117e-05, + "loss": 0.8265, + "step": 927 + }, + { + "epoch": 0.33887164506116485, + "grad_norm": 1.2264710664749146, + "learning_rate": 3.386861313868613e-05, + "loss": 0.8625, + "step": 928 + }, + { + "epoch": 0.3392368084717911, + "grad_norm": 0.8349049687385559, + "learning_rate": 3.39051094890511e-05, + "loss": 0.8036, + "step": 929 + }, + { + "epoch": 0.33960197188241736, + "grad_norm": 0.8627322316169739, + "learning_rate": 3.394160583941606e-05, + "loss": 0.8652, + "step": 930 + }, + { + "epoch": 0.3399671352930436, + "grad_norm": 1.1609947681427002, + "learning_rate": 3.397810218978102e-05, + "loss": 0.8444, + "step": 931 + }, + { + "epoch": 0.3403322987036699, + "grad_norm": 1.0138678550720215, + "learning_rate": 3.401459854014599e-05, + "loss": 0.833, + "step": 932 + }, + { + "epoch": 0.34069746211429613, + "grad_norm": 1.2408621311187744, + "learning_rate": 3.405109489051095e-05, + "loss": 0.833, + "step": 933 + }, + { + "epoch": 0.3410626255249224, + "grad_norm": 0.9205710291862488, + "learning_rate": 3.408759124087591e-05, + "loss": 0.8199, + "step": 934 + }, + { + "epoch": 0.34142778893554865, + "grad_norm": 1.0878050327301025, + "learning_rate": 3.412408759124088e-05, + "loss": 0.8083, + "step": 935 + }, + { + "epoch": 0.3417929523461749, + "grad_norm": 0.8882326483726501, + "learning_rate": 3.416058394160584e-05, + "loss": 0.8295, + "step": 936 + }, + { + "epoch": 0.34215811575680116, + "grad_norm": 1.1191176176071167, + "learning_rate": 3.4197080291970804e-05, + "loss": 0.811, + "step": 937 + }, + { + "epoch": 0.3425232791674274, + "grad_norm": 0.829477846622467, + "learning_rate": 3.423357664233577e-05, + "loss": 0.8364, + "step": 938 + }, + { + "epoch": 0.3428884425780537, + "grad_norm": 0.9906582236289978, + "learning_rate": 3.427007299270073e-05, + "loss": 0.8206, + "step": 939 + }, + { + "epoch": 0.34325360598867993, + "grad_norm": 1.1381598711013794, + "learning_rate": 3.4306569343065694e-05, + "loss": 0.8435, + "step": 940 + }, + { + "epoch": 0.3436187693993062, + "grad_norm": 0.9499815702438354, + "learning_rate": 3.434306569343066e-05, + "loss": 0.7904, + "step": 941 + }, + { + "epoch": 0.34398393280993245, + "grad_norm": 1.2010533809661865, + "learning_rate": 3.437956204379562e-05, + "loss": 0.8521, + "step": 942 + }, + { + "epoch": 0.3443490962205587, + "grad_norm": 0.9433616399765015, + "learning_rate": 3.4416058394160584e-05, + "loss": 0.8716, + "step": 943 + }, + { + "epoch": 0.34471425963118496, + "grad_norm": 0.9508865475654602, + "learning_rate": 3.4452554744525545e-05, + "loss": 0.8239, + "step": 944 + }, + { + "epoch": 0.3450794230418112, + "grad_norm": 0.9482638239860535, + "learning_rate": 3.448905109489051e-05, + "loss": 0.839, + "step": 945 + }, + { + "epoch": 0.3454445864524375, + "grad_norm": 0.8817934393882751, + "learning_rate": 3.452554744525548e-05, + "loss": 0.8252, + "step": 946 + }, + { + "epoch": 0.34580974986306373, + "grad_norm": 1.4641749858856201, + "learning_rate": 3.4562043795620436e-05, + "loss": 0.8228, + "step": 947 + }, + { + "epoch": 0.34617491327369, + "grad_norm": 1.001059889793396, + "learning_rate": 3.4598540145985404e-05, + "loss": 0.8246, + "step": 948 + }, + { + "epoch": 0.34654007668431625, + "grad_norm": 1.1823747158050537, + "learning_rate": 3.463503649635037e-05, + "loss": 0.8475, + "step": 949 + }, + { + "epoch": 0.3469052400949425, + "grad_norm": 0.9262009263038635, + "learning_rate": 3.467153284671533e-05, + "loss": 0.8162, + "step": 950 + }, + { + "epoch": 0.34727040350556876, + "grad_norm": 0.6685298681259155, + "learning_rate": 3.4708029197080294e-05, + "loss": 0.8395, + "step": 951 + }, + { + "epoch": 0.347635566916195, + "grad_norm": 1.6057016849517822, + "learning_rate": 3.474452554744526e-05, + "loss": 0.8174, + "step": 952 + }, + { + "epoch": 0.3480007303268213, + "grad_norm": 0.9343808889389038, + "learning_rate": 3.478102189781022e-05, + "loss": 0.8411, + "step": 953 + }, + { + "epoch": 0.34836589373744753, + "grad_norm": 0.9032874703407288, + "learning_rate": 3.4817518248175184e-05, + "loss": 0.801, + "step": 954 + }, + { + "epoch": 0.3487310571480738, + "grad_norm": 0.9119698405265808, + "learning_rate": 3.4854014598540145e-05, + "loss": 0.8065, + "step": 955 + }, + { + "epoch": 0.34909622055870004, + "grad_norm": 1.0663105249404907, + "learning_rate": 3.4890510948905113e-05, + "loss": 0.7919, + "step": 956 + }, + { + "epoch": 0.3494613839693263, + "grad_norm": 1.0270241498947144, + "learning_rate": 3.4927007299270075e-05, + "loss": 0.8098, + "step": 957 + }, + { + "epoch": 0.3498265473799525, + "grad_norm": 1.1155675649642944, + "learning_rate": 3.4963503649635036e-05, + "loss": 0.8612, + "step": 958 + }, + { + "epoch": 0.35019171079057876, + "grad_norm": 1.2419427633285522, + "learning_rate": 3.5000000000000004e-05, + "loss": 0.8745, + "step": 959 + }, + { + "epoch": 0.350556874201205, + "grad_norm": 0.8625814914703369, + "learning_rate": 3.5036496350364965e-05, + "loss": 0.8186, + "step": 960 + }, + { + "epoch": 0.3509220376118313, + "grad_norm": 0.964387834072113, + "learning_rate": 3.5072992700729926e-05, + "loss": 0.8287, + "step": 961 + }, + { + "epoch": 0.35128720102245753, + "grad_norm": 0.7243093252182007, + "learning_rate": 3.5109489051094894e-05, + "loss": 0.7982, + "step": 962 + }, + { + "epoch": 0.3516523644330838, + "grad_norm": 1.322411060333252, + "learning_rate": 3.514598540145986e-05, + "loss": 0.8702, + "step": 963 + }, + { + "epoch": 0.35201752784371004, + "grad_norm": 0.9760127663612366, + "learning_rate": 3.5182481751824816e-05, + "loss": 0.8479, + "step": 964 + }, + { + "epoch": 0.3523826912543363, + "grad_norm": 1.1912221908569336, + "learning_rate": 3.5218978102189784e-05, + "loss": 0.8054, + "step": 965 + }, + { + "epoch": 0.35274785466496256, + "grad_norm": 1.126135230064392, + "learning_rate": 3.5255474452554745e-05, + "loss": 0.8484, + "step": 966 + }, + { + "epoch": 0.3531130180755888, + "grad_norm": 1.1270838975906372, + "learning_rate": 3.5291970802919713e-05, + "loss": 0.8258, + "step": 967 + }, + { + "epoch": 0.35347818148621507, + "grad_norm": 0.8599242568016052, + "learning_rate": 3.5328467153284675e-05, + "loss": 0.8273, + "step": 968 + }, + { + "epoch": 0.3538433448968413, + "grad_norm": 1.205792784690857, + "learning_rate": 3.5364963503649636e-05, + "loss": 0.8451, + "step": 969 + }, + { + "epoch": 0.3542085083074676, + "grad_norm": 1.0273261070251465, + "learning_rate": 3.5401459854014604e-05, + "loss": 0.8134, + "step": 970 + }, + { + "epoch": 0.35457367171809384, + "grad_norm": 1.1758073568344116, + "learning_rate": 3.5437956204379565e-05, + "loss": 0.8381, + "step": 971 + }, + { + "epoch": 0.3549388351287201, + "grad_norm": 1.1069955825805664, + "learning_rate": 3.5474452554744526e-05, + "loss": 0.8257, + "step": 972 + }, + { + "epoch": 0.35530399853934636, + "grad_norm": 1.0383074283599854, + "learning_rate": 3.5510948905109494e-05, + "loss": 0.8691, + "step": 973 + }, + { + "epoch": 0.3556691619499726, + "grad_norm": 1.0610301494598389, + "learning_rate": 3.5547445255474455e-05, + "loss": 0.8293, + "step": 974 + }, + { + "epoch": 0.35603432536059887, + "grad_norm": 1.1219865083694458, + "learning_rate": 3.5583941605839416e-05, + "loss": 0.8011, + "step": 975 + }, + { + "epoch": 0.3563994887712251, + "grad_norm": 1.0254887342453003, + "learning_rate": 3.5620437956204384e-05, + "loss": 0.8262, + "step": 976 + }, + { + "epoch": 0.3567646521818514, + "grad_norm": 1.057141900062561, + "learning_rate": 3.5656934306569346e-05, + "loss": 0.8442, + "step": 977 + }, + { + "epoch": 0.35712981559247764, + "grad_norm": 0.9983533024787903, + "learning_rate": 3.569343065693431e-05, + "loss": 0.809, + "step": 978 + }, + { + "epoch": 0.3574949790031039, + "grad_norm": 1.0989717245101929, + "learning_rate": 3.5729927007299275e-05, + "loss": 0.8024, + "step": 979 + }, + { + "epoch": 0.35786014241373015, + "grad_norm": 0.8879622220993042, + "learning_rate": 3.5766423357664236e-05, + "loss": 0.8134, + "step": 980 + }, + { + "epoch": 0.3582253058243564, + "grad_norm": 0.9703028798103333, + "learning_rate": 3.58029197080292e-05, + "loss": 0.8459, + "step": 981 + }, + { + "epoch": 0.35859046923498267, + "grad_norm": 0.9390744566917419, + "learning_rate": 3.5839416058394165e-05, + "loss": 0.8263, + "step": 982 + }, + { + "epoch": 0.3589556326456089, + "grad_norm": 0.9232451915740967, + "learning_rate": 3.5875912408759126e-05, + "loss": 0.7977, + "step": 983 + }, + { + "epoch": 0.3593207960562352, + "grad_norm": 0.9906948208808899, + "learning_rate": 3.5912408759124094e-05, + "loss": 0.7958, + "step": 984 + }, + { + "epoch": 0.35968595946686144, + "grad_norm": 1.122028112411499, + "learning_rate": 3.5948905109489055e-05, + "loss": 0.8314, + "step": 985 + }, + { + "epoch": 0.3600511228774877, + "grad_norm": 1.034443736076355, + "learning_rate": 3.5985401459854016e-05, + "loss": 0.7949, + "step": 986 + }, + { + "epoch": 0.36041628628811395, + "grad_norm": 1.1605156660079956, + "learning_rate": 3.6021897810218984e-05, + "loss": 0.804, + "step": 987 + }, + { + "epoch": 0.3607814496987402, + "grad_norm": 0.9331724047660828, + "learning_rate": 3.6058394160583946e-05, + "loss": 0.7858, + "step": 988 + }, + { + "epoch": 0.36114661310936647, + "grad_norm": 0.9008059501647949, + "learning_rate": 3.609489051094891e-05, + "loss": 0.8015, + "step": 989 + }, + { + "epoch": 0.3615117765199927, + "grad_norm": 1.032978892326355, + "learning_rate": 3.6131386861313875e-05, + "loss": 0.8217, + "step": 990 + }, + { + "epoch": 0.361876939930619, + "grad_norm": 0.9546392560005188, + "learning_rate": 3.6167883211678836e-05, + "loss": 0.8102, + "step": 991 + }, + { + "epoch": 0.3622421033412452, + "grad_norm": 1.1435415744781494, + "learning_rate": 3.62043795620438e-05, + "loss": 0.8077, + "step": 992 + }, + { + "epoch": 0.36260726675187144, + "grad_norm": 1.009759545326233, + "learning_rate": 3.6240875912408765e-05, + "loss": 0.802, + "step": 993 + }, + { + "epoch": 0.3629724301624977, + "grad_norm": 0.9189592599868774, + "learning_rate": 3.6277372262773726e-05, + "loss": 0.8082, + "step": 994 + }, + { + "epoch": 0.36333759357312395, + "grad_norm": 0.9083291292190552, + "learning_rate": 3.631386861313869e-05, + "loss": 0.8265, + "step": 995 + }, + { + "epoch": 0.3637027569837502, + "grad_norm": 1.0146288871765137, + "learning_rate": 3.635036496350365e-05, + "loss": 0.7642, + "step": 996 + }, + { + "epoch": 0.36406792039437647, + "grad_norm": 1.2165806293487549, + "learning_rate": 3.6386861313868616e-05, + "loss": 0.8153, + "step": 997 + }, + { + "epoch": 0.3644330838050027, + "grad_norm": 1.1724755764007568, + "learning_rate": 3.642335766423358e-05, + "loss": 0.78, + "step": 998 + }, + { + "epoch": 0.364798247215629, + "grad_norm": 1.15825355052948, + "learning_rate": 3.645985401459854e-05, + "loss": 0.8104, + "step": 999 + }, + { + "epoch": 0.36516341062625524, + "grad_norm": 1.1032559871673584, + "learning_rate": 3.649635036496351e-05, + "loss": 0.8246, + "step": 1000 + }, + { + "epoch": 0.3655285740368815, + "grad_norm": 0.9770479798316956, + "learning_rate": 3.6532846715328475e-05, + "loss": 0.7734, + "step": 1001 + }, + { + "epoch": 0.36589373744750775, + "grad_norm": 1.4258418083190918, + "learning_rate": 3.656934306569343e-05, + "loss": 0.7863, + "step": 1002 + }, + { + "epoch": 0.366258900858134, + "grad_norm": 1.1266810894012451, + "learning_rate": 3.66058394160584e-05, + "loss": 0.8093, + "step": 1003 + }, + { + "epoch": 0.36662406426876026, + "grad_norm": 1.1982899904251099, + "learning_rate": 3.6642335766423365e-05, + "loss": 0.8171, + "step": 1004 + }, + { + "epoch": 0.3669892276793865, + "grad_norm": 1.5895270109176636, + "learning_rate": 3.6678832116788326e-05, + "loss": 0.8324, + "step": 1005 + }, + { + "epoch": 0.3673543910900128, + "grad_norm": 1.1550204753875732, + "learning_rate": 3.671532846715329e-05, + "loss": 0.8187, + "step": 1006 + }, + { + "epoch": 0.36771955450063903, + "grad_norm": 3.356076955795288, + "learning_rate": 3.675182481751825e-05, + "loss": 0.7982, + "step": 1007 + }, + { + "epoch": 0.3680847179112653, + "grad_norm": 1.1151169538497925, + "learning_rate": 3.6788321167883217e-05, + "loss": 0.8, + "step": 1008 + }, + { + "epoch": 0.36844988132189155, + "grad_norm": 1.7109216451644897, + "learning_rate": 3.682481751824818e-05, + "loss": 0.8066, + "step": 1009 + }, + { + "epoch": 0.3688150447325178, + "grad_norm": 1.1339987516403198, + "learning_rate": 3.686131386861314e-05, + "loss": 0.8025, + "step": 1010 + }, + { + "epoch": 0.36918020814314406, + "grad_norm": 1.3794639110565186, + "learning_rate": 3.689781021897811e-05, + "loss": 0.7701, + "step": 1011 + }, + { + "epoch": 0.3695453715537703, + "grad_norm": 2.926950454711914, + "learning_rate": 3.693430656934307e-05, + "loss": 0.8243, + "step": 1012 + }, + { + "epoch": 0.3699105349643966, + "grad_norm": 1.6819735765457153, + "learning_rate": 3.697080291970803e-05, + "loss": 0.8391, + "step": 1013 + }, + { + "epoch": 0.37027569837502283, + "grad_norm": 1.31744384765625, + "learning_rate": 3.7007299270073e-05, + "loss": 0.7911, + "step": 1014 + }, + { + "epoch": 0.3706408617856491, + "grad_norm": 1.478202223777771, + "learning_rate": 3.704379562043796e-05, + "loss": 0.8079, + "step": 1015 + }, + { + "epoch": 0.37100602519627535, + "grad_norm": 1.2199921607971191, + "learning_rate": 3.708029197080292e-05, + "loss": 0.8317, + "step": 1016 + }, + { + "epoch": 0.3713711886069016, + "grad_norm": 2.6387546062469482, + "learning_rate": 3.711678832116789e-05, + "loss": 0.7961, + "step": 1017 + }, + { + "epoch": 0.37173635201752786, + "grad_norm": 1.2078938484191895, + "learning_rate": 3.715328467153285e-05, + "loss": 0.7894, + "step": 1018 + }, + { + "epoch": 0.3721015154281541, + "grad_norm": 1.1073907613754272, + "learning_rate": 3.718978102189781e-05, + "loss": 0.8289, + "step": 1019 + }, + { + "epoch": 0.3724666788387804, + "grad_norm": 1.514863133430481, + "learning_rate": 3.722627737226278e-05, + "loss": 0.7914, + "step": 1020 + }, + { + "epoch": 0.37283184224940663, + "grad_norm": 1.3350275754928589, + "learning_rate": 3.726277372262774e-05, + "loss": 0.7986, + "step": 1021 + }, + { + "epoch": 0.3731970056600329, + "grad_norm": 1.2805231809616089, + "learning_rate": 3.729927007299271e-05, + "loss": 0.8132, + "step": 1022 + }, + { + "epoch": 0.37356216907065914, + "grad_norm": 1.068396806716919, + "learning_rate": 3.733576642335767e-05, + "loss": 0.8007, + "step": 1023 + }, + { + "epoch": 0.3739273324812854, + "grad_norm": 0.9894810914993286, + "learning_rate": 3.737226277372263e-05, + "loss": 0.8082, + "step": 1024 + }, + { + "epoch": 0.3742924958919116, + "grad_norm": 1.133439302444458, + "learning_rate": 3.74087591240876e-05, + "loss": 0.7847, + "step": 1025 + }, + { + "epoch": 0.37465765930253786, + "grad_norm": 0.7983406782150269, + "learning_rate": 3.744525547445256e-05, + "loss": 0.7599, + "step": 1026 + }, + { + "epoch": 0.3750228227131641, + "grad_norm": 0.8845236301422119, + "learning_rate": 3.748175182481752e-05, + "loss": 0.7507, + "step": 1027 + }, + { + "epoch": 0.3753879861237904, + "grad_norm": 1.125713586807251, + "learning_rate": 3.751824817518249e-05, + "loss": 0.7647, + "step": 1028 + }, + { + "epoch": 0.37575314953441663, + "grad_norm": 1.0719150304794312, + "learning_rate": 3.755474452554745e-05, + "loss": 0.8115, + "step": 1029 + }, + { + "epoch": 0.3761183129450429, + "grad_norm": 0.8788213133811951, + "learning_rate": 3.759124087591241e-05, + "loss": 0.7986, + "step": 1030 + }, + { + "epoch": 0.37648347635566914, + "grad_norm": 0.8796796798706055, + "learning_rate": 3.762773722627738e-05, + "loss": 0.8007, + "step": 1031 + }, + { + "epoch": 0.3768486397662954, + "grad_norm": 1.013904333114624, + "learning_rate": 3.766423357664234e-05, + "loss": 0.8134, + "step": 1032 + }, + { + "epoch": 0.37721380317692166, + "grad_norm": 1.1152280569076538, + "learning_rate": 3.77007299270073e-05, + "loss": 0.7745, + "step": 1033 + }, + { + "epoch": 0.3775789665875479, + "grad_norm": 0.913651168346405, + "learning_rate": 3.773722627737227e-05, + "loss": 0.8302, + "step": 1034 + }, + { + "epoch": 0.37794412999817417, + "grad_norm": 1.1180498600006104, + "learning_rate": 3.777372262773723e-05, + "loss": 0.796, + "step": 1035 + }, + { + "epoch": 0.37830929340880043, + "grad_norm": 1.4855592250823975, + "learning_rate": 3.781021897810219e-05, + "loss": 0.8248, + "step": 1036 + }, + { + "epoch": 0.3786744568194267, + "grad_norm": 0.6914875507354736, + "learning_rate": 3.784671532846716e-05, + "loss": 0.807, + "step": 1037 + }, + { + "epoch": 0.37903962023005294, + "grad_norm": 1.0839126110076904, + "learning_rate": 3.788321167883212e-05, + "loss": 0.828, + "step": 1038 + }, + { + "epoch": 0.3794047836406792, + "grad_norm": 1.0782427787780762, + "learning_rate": 3.791970802919709e-05, + "loss": 0.772, + "step": 1039 + }, + { + "epoch": 0.37976994705130546, + "grad_norm": 1.0255452394485474, + "learning_rate": 3.795620437956204e-05, + "loss": 0.8265, + "step": 1040 + }, + { + "epoch": 0.3801351104619317, + "grad_norm": 0.9261432886123657, + "learning_rate": 3.799270072992701e-05, + "loss": 0.812, + "step": 1041 + }, + { + "epoch": 0.38050027387255797, + "grad_norm": 1.079363465309143, + "learning_rate": 3.802919708029198e-05, + "loss": 0.7696, + "step": 1042 + }, + { + "epoch": 0.3808654372831842, + "grad_norm": 0.9970255494117737, + "learning_rate": 3.806569343065694e-05, + "loss": 0.7926, + "step": 1043 + }, + { + "epoch": 0.3812306006938105, + "grad_norm": 1.0042568445205688, + "learning_rate": 3.81021897810219e-05, + "loss": 0.7717, + "step": 1044 + }, + { + "epoch": 0.38159576410443674, + "grad_norm": 0.8755733966827393, + "learning_rate": 3.813868613138687e-05, + "loss": 0.8062, + "step": 1045 + }, + { + "epoch": 0.381960927515063, + "grad_norm": 0.9101842045783997, + "learning_rate": 3.817518248175183e-05, + "loss": 0.8138, + "step": 1046 + }, + { + "epoch": 0.38232609092568925, + "grad_norm": 0.9083014726638794, + "learning_rate": 3.821167883211679e-05, + "loss": 0.8264, + "step": 1047 + }, + { + "epoch": 0.3826912543363155, + "grad_norm": 0.8410999774932861, + "learning_rate": 3.824817518248176e-05, + "loss": 0.8135, + "step": 1048 + }, + { + "epoch": 0.38305641774694177, + "grad_norm": 0.8920347094535828, + "learning_rate": 3.828467153284672e-05, + "loss": 0.7825, + "step": 1049 + }, + { + "epoch": 0.383421581157568, + "grad_norm": 1.1922773122787476, + "learning_rate": 3.832116788321168e-05, + "loss": 0.8097, + "step": 1050 + }, + { + "epoch": 0.3837867445681943, + "grad_norm": 1.257304072380066, + "learning_rate": 3.835766423357664e-05, + "loss": 0.8099, + "step": 1051 + }, + { + "epoch": 0.38415190797882054, + "grad_norm": 1.5688021183013916, + "learning_rate": 3.839416058394161e-05, + "loss": 0.8091, + "step": 1052 + }, + { + "epoch": 0.3845170713894468, + "grad_norm": 1.091172695159912, + "learning_rate": 3.843065693430657e-05, + "loss": 0.8044, + "step": 1053 + }, + { + "epoch": 0.38488223480007305, + "grad_norm": 0.9299972057342529, + "learning_rate": 3.846715328467153e-05, + "loss": 0.8152, + "step": 1054 + }, + { + "epoch": 0.3852473982106993, + "grad_norm": 0.9777531027793884, + "learning_rate": 3.85036496350365e-05, + "loss": 0.7755, + "step": 1055 + }, + { + "epoch": 0.38561256162132557, + "grad_norm": 0.9919531345367432, + "learning_rate": 3.854014598540147e-05, + "loss": 0.8351, + "step": 1056 + }, + { + "epoch": 0.3859777250319518, + "grad_norm": 0.8127278685569763, + "learning_rate": 3.857664233576642e-05, + "loss": 0.7995, + "step": 1057 + }, + { + "epoch": 0.3863428884425781, + "grad_norm": 0.7582124471664429, + "learning_rate": 3.861313868613139e-05, + "loss": 0.8107, + "step": 1058 + }, + { + "epoch": 0.3867080518532043, + "grad_norm": 1.0248321294784546, + "learning_rate": 3.864963503649636e-05, + "loss": 0.7926, + "step": 1059 + }, + { + "epoch": 0.38707321526383054, + "grad_norm": 0.9556481242179871, + "learning_rate": 3.868613138686132e-05, + "loss": 0.801, + "step": 1060 + }, + { + "epoch": 0.3874383786744568, + "grad_norm": 0.9816288352012634, + "learning_rate": 3.872262773722628e-05, + "loss": 0.8016, + "step": 1061 + }, + { + "epoch": 0.38780354208508305, + "grad_norm": 0.8992881178855896, + "learning_rate": 3.875912408759124e-05, + "loss": 0.7817, + "step": 1062 + }, + { + "epoch": 0.3881687054957093, + "grad_norm": 1.2685658931732178, + "learning_rate": 3.879562043795621e-05, + "loss": 0.8052, + "step": 1063 + }, + { + "epoch": 0.38853386890633557, + "grad_norm": 1.1701579093933105, + "learning_rate": 3.883211678832117e-05, + "loss": 0.7946, + "step": 1064 + }, + { + "epoch": 0.3888990323169618, + "grad_norm": 1.1810351610183716, + "learning_rate": 3.886861313868613e-05, + "loss": 0.8519, + "step": 1065 + }, + { + "epoch": 0.3892641957275881, + "grad_norm": 0.8648379445075989, + "learning_rate": 3.89051094890511e-05, + "loss": 0.8398, + "step": 1066 + }, + { + "epoch": 0.38962935913821434, + "grad_norm": 0.9467642903327942, + "learning_rate": 3.894160583941606e-05, + "loss": 0.7784, + "step": 1067 + }, + { + "epoch": 0.3899945225488406, + "grad_norm": 0.9347090721130371, + "learning_rate": 3.897810218978102e-05, + "loss": 0.7626, + "step": 1068 + }, + { + "epoch": 0.39035968595946685, + "grad_norm": 1.0562098026275635, + "learning_rate": 3.901459854014599e-05, + "loss": 0.7798, + "step": 1069 + }, + { + "epoch": 0.3907248493700931, + "grad_norm": 0.8048838973045349, + "learning_rate": 3.905109489051095e-05, + "loss": 0.7616, + "step": 1070 + }, + { + "epoch": 0.39109001278071936, + "grad_norm": 1.2834361791610718, + "learning_rate": 3.908759124087591e-05, + "loss": 0.7972, + "step": 1071 + }, + { + "epoch": 0.3914551761913456, + "grad_norm": 0.891139030456543, + "learning_rate": 3.912408759124088e-05, + "loss": 0.7899, + "step": 1072 + }, + { + "epoch": 0.3918203396019719, + "grad_norm": 1.0749610662460327, + "learning_rate": 3.916058394160584e-05, + "loss": 0.8253, + "step": 1073 + }, + { + "epoch": 0.39218550301259814, + "grad_norm": 0.7955366969108582, + "learning_rate": 3.91970802919708e-05, + "loss": 0.7832, + "step": 1074 + }, + { + "epoch": 0.3925506664232244, + "grad_norm": 1.152219533920288, + "learning_rate": 3.923357664233577e-05, + "loss": 0.7932, + "step": 1075 + }, + { + "epoch": 0.39291582983385065, + "grad_norm": 1.2308257818222046, + "learning_rate": 3.927007299270073e-05, + "loss": 0.7938, + "step": 1076 + }, + { + "epoch": 0.3932809932444769, + "grad_norm": 0.9965977072715759, + "learning_rate": 3.93065693430657e-05, + "loss": 0.8392, + "step": 1077 + }, + { + "epoch": 0.39364615665510316, + "grad_norm": 0.9761756062507629, + "learning_rate": 3.934306569343066e-05, + "loss": 0.7936, + "step": 1078 + }, + { + "epoch": 0.3940113200657294, + "grad_norm": 1.0288221836090088, + "learning_rate": 3.937956204379562e-05, + "loss": 0.816, + "step": 1079 + }, + { + "epoch": 0.3943764834763557, + "grad_norm": 1.1133888959884644, + "learning_rate": 3.941605839416059e-05, + "loss": 0.822, + "step": 1080 + }, + { + "epoch": 0.39474164688698193, + "grad_norm": 0.9675469994544983, + "learning_rate": 3.945255474452555e-05, + "loss": 0.7877, + "step": 1081 + }, + { + "epoch": 0.3951068102976082, + "grad_norm": 1.2215927839279175, + "learning_rate": 3.948905109489051e-05, + "loss": 0.8262, + "step": 1082 + }, + { + "epoch": 0.39547197370823445, + "grad_norm": 1.0631349086761475, + "learning_rate": 3.952554744525548e-05, + "loss": 0.7987, + "step": 1083 + }, + { + "epoch": 0.3958371371188607, + "grad_norm": 1.1623072624206543, + "learning_rate": 3.956204379562044e-05, + "loss": 0.7977, + "step": 1084 + }, + { + "epoch": 0.39620230052948696, + "grad_norm": 0.8911407589912415, + "learning_rate": 3.95985401459854e-05, + "loss": 0.7855, + "step": 1085 + }, + { + "epoch": 0.3965674639401132, + "grad_norm": 1.10833740234375, + "learning_rate": 3.963503649635037e-05, + "loss": 0.8246, + "step": 1086 + }, + { + "epoch": 0.3969326273507395, + "grad_norm": 1.0649516582489014, + "learning_rate": 3.967153284671533e-05, + "loss": 0.7784, + "step": 1087 + }, + { + "epoch": 0.39729779076136573, + "grad_norm": 0.9672961235046387, + "learning_rate": 3.9708029197080294e-05, + "loss": 0.8223, + "step": 1088 + }, + { + "epoch": 0.397662954171992, + "grad_norm": 1.1473028659820557, + "learning_rate": 3.974452554744526e-05, + "loss": 0.7705, + "step": 1089 + }, + { + "epoch": 0.39802811758261825, + "grad_norm": 0.9718714952468872, + "learning_rate": 3.978102189781022e-05, + "loss": 0.8162, + "step": 1090 + }, + { + "epoch": 0.3983932809932445, + "grad_norm": 1.0650386810302734, + "learning_rate": 3.9817518248175184e-05, + "loss": 0.7981, + "step": 1091 + }, + { + "epoch": 0.39875844440387076, + "grad_norm": 1.0443297624588013, + "learning_rate": 3.9854014598540145e-05, + "loss": 0.8149, + "step": 1092 + }, + { + "epoch": 0.39912360781449696, + "grad_norm": 1.0627777576446533, + "learning_rate": 3.989051094890511e-05, + "loss": 0.8289, + "step": 1093 + }, + { + "epoch": 0.3994887712251232, + "grad_norm": 1.1238811016082764, + "learning_rate": 3.9927007299270074e-05, + "loss": 0.7831, + "step": 1094 + }, + { + "epoch": 0.3998539346357495, + "grad_norm": 0.9035513401031494, + "learning_rate": 3.9963503649635035e-05, + "loss": 0.771, + "step": 1095 + }, + { + "epoch": 0.40021909804637573, + "grad_norm": 1.0562816858291626, + "learning_rate": 4e-05, + "loss": 0.7944, + "step": 1096 + }, + { + "epoch": 0.400584261457002, + "grad_norm": 0.821844220161438, + "learning_rate": 3.999999485540128e-05, + "loss": 0.8033, + "step": 1097 + }, + { + "epoch": 0.40094942486762825, + "grad_norm": 0.959084689617157, + "learning_rate": 3.999997942160775e-05, + "loss": 0.7932, + "step": 1098 + }, + { + "epoch": 0.4013145882782545, + "grad_norm": 0.7904175519943237, + "learning_rate": 3.9999953698627355e-05, + "loss": 0.787, + "step": 1099 + }, + { + "epoch": 0.40167975168888076, + "grad_norm": 0.7241582870483398, + "learning_rate": 3.9999917686473335e-05, + "loss": 0.7588, + "step": 1100 + }, + { + "epoch": 0.402044915099507, + "grad_norm": 0.9488881826400757, + "learning_rate": 3.9999871385164215e-05, + "loss": 0.7584, + "step": 1101 + }, + { + "epoch": 0.4024100785101333, + "grad_norm": 0.968736469745636, + "learning_rate": 3.9999814794723805e-05, + "loss": 0.8066, + "step": 1102 + }, + { + "epoch": 0.40277524192075953, + "grad_norm": 0.7286505103111267, + "learning_rate": 3.999974791518123e-05, + "loss": 0.8229, + "step": 1103 + }, + { + "epoch": 0.4031404053313858, + "grad_norm": 1.0029511451721191, + "learning_rate": 3.999967074657089e-05, + "loss": 0.7956, + "step": 1104 + }, + { + "epoch": 0.40350556874201204, + "grad_norm": 1.0827014446258545, + "learning_rate": 3.9999583288932495e-05, + "loss": 0.764, + "step": 1105 + }, + { + "epoch": 0.4038707321526383, + "grad_norm": 0.9567440152168274, + "learning_rate": 3.999948554231102e-05, + "loss": 0.7845, + "step": 1106 + }, + { + "epoch": 0.40423589556326456, + "grad_norm": 0.6938320994377136, + "learning_rate": 3.9999377506756765e-05, + "loss": 0.7732, + "step": 1107 + }, + { + "epoch": 0.4046010589738908, + "grad_norm": 0.9644039869308472, + "learning_rate": 3.9999259182325315e-05, + "loss": 0.7891, + "step": 1108 + }, + { + "epoch": 0.40496622238451707, + "grad_norm": 1.138069748878479, + "learning_rate": 3.999913056907753e-05, + "loss": 0.7866, + "step": 1109 + }, + { + "epoch": 0.40533138579514333, + "grad_norm": 0.8856624364852905, + "learning_rate": 3.999899166707959e-05, + "loss": 0.7845, + "step": 1110 + }, + { + "epoch": 0.4056965492057696, + "grad_norm": 1.0707448720932007, + "learning_rate": 3.999884247640293e-05, + "loss": 0.7736, + "step": 1111 + }, + { + "epoch": 0.40606171261639584, + "grad_norm": 1.0259274244308472, + "learning_rate": 3.999868299712434e-05, + "loss": 0.8112, + "step": 1112 + }, + { + "epoch": 0.4064268760270221, + "grad_norm": 1.0142202377319336, + "learning_rate": 3.999851322932583e-05, + "loss": 0.7892, + "step": 1113 + }, + { + "epoch": 0.40679203943764836, + "grad_norm": 0.7966458797454834, + "learning_rate": 3.9998333173094764e-05, + "loss": 0.797, + "step": 1114 + }, + { + "epoch": 0.4071572028482746, + "grad_norm": 1.1583501100540161, + "learning_rate": 3.999814282852375e-05, + "loss": 0.7563, + "step": 1115 + }, + { + "epoch": 0.40752236625890087, + "grad_norm": 1.2791121006011963, + "learning_rate": 3.9997942195710744e-05, + "loss": 0.7943, + "step": 1116 + }, + { + "epoch": 0.4078875296695271, + "grad_norm": 0.9490485191345215, + "learning_rate": 3.999773127475894e-05, + "loss": 0.7856, + "step": 1117 + }, + { + "epoch": 0.4082526930801534, + "grad_norm": 1.0569360256195068, + "learning_rate": 3.9997510065776843e-05, + "loss": 0.792, + "step": 1118 + }, + { + "epoch": 0.40861785649077964, + "grad_norm": 1.1657289266586304, + "learning_rate": 3.9997278568878275e-05, + "loss": 0.755, + "step": 1119 + }, + { + "epoch": 0.4089830199014059, + "grad_norm": 0.8284951448440552, + "learning_rate": 3.9997036784182325e-05, + "loss": 0.7788, + "step": 1120 + }, + { + "epoch": 0.40934818331203215, + "grad_norm": 0.819286048412323, + "learning_rate": 3.999678471181338e-05, + "loss": 0.7683, + "step": 1121 + }, + { + "epoch": 0.4097133467226584, + "grad_norm": 1.0198330879211426, + "learning_rate": 3.999652235190112e-05, + "loss": 0.802, + "step": 1122 + }, + { + "epoch": 0.41007851013328467, + "grad_norm": 1.0188407897949219, + "learning_rate": 3.999624970458053e-05, + "loss": 0.7964, + "step": 1123 + }, + { + "epoch": 0.4104436735439109, + "grad_norm": 0.8926658034324646, + "learning_rate": 3.999596676999185e-05, + "loss": 0.785, + "step": 1124 + }, + { + "epoch": 0.4108088369545372, + "grad_norm": 1.0475916862487793, + "learning_rate": 3.999567354828067e-05, + "loss": 0.8413, + "step": 1125 + }, + { + "epoch": 0.4111740003651634, + "grad_norm": 1.21036958694458, + "learning_rate": 3.9995370039597826e-05, + "loss": 0.7864, + "step": 1126 + }, + { + "epoch": 0.41153916377578964, + "grad_norm": 0.8281807899475098, + "learning_rate": 3.9995056244099444e-05, + "loss": 0.7778, + "step": 1127 + }, + { + "epoch": 0.4119043271864159, + "grad_norm": 1.1479681730270386, + "learning_rate": 3.9994732161946986e-05, + "loss": 0.7848, + "step": 1128 + }, + { + "epoch": 0.41226949059704215, + "grad_norm": 0.6831890940666199, + "learning_rate": 3.9994397793307175e-05, + "loss": 0.7786, + "step": 1129 + }, + { + "epoch": 0.4126346540076684, + "grad_norm": 0.9135376214981079, + "learning_rate": 3.999405313835202e-05, + "loss": 0.7648, + "step": 1130 + }, + { + "epoch": 0.41299981741829467, + "grad_norm": 0.7735843658447266, + "learning_rate": 3.999369819725884e-05, + "loss": 0.793, + "step": 1131 + }, + { + "epoch": 0.4133649808289209, + "grad_norm": 0.8167039752006531, + "learning_rate": 3.999333297021023e-05, + "loss": 0.7635, + "step": 1132 + }, + { + "epoch": 0.4137301442395472, + "grad_norm": 0.9002150297164917, + "learning_rate": 3.999295745739409e-05, + "loss": 0.7849, + "step": 1133 + }, + { + "epoch": 0.41409530765017344, + "grad_norm": 1.089339017868042, + "learning_rate": 3.999257165900361e-05, + "loss": 0.7899, + "step": 1134 + }, + { + "epoch": 0.4144604710607997, + "grad_norm": 1.2341511249542236, + "learning_rate": 3.999217557523725e-05, + "loss": 0.7859, + "step": 1135 + }, + { + "epoch": 0.41482563447142595, + "grad_norm": 1.0575354099273682, + "learning_rate": 3.9991769206298805e-05, + "loss": 0.787, + "step": 1136 + }, + { + "epoch": 0.4151907978820522, + "grad_norm": 1.286300539970398, + "learning_rate": 3.999135255239732e-05, + "loss": 0.8298, + "step": 1137 + }, + { + "epoch": 0.41555596129267847, + "grad_norm": 0.8028886318206787, + "learning_rate": 3.999092561374715e-05, + "loss": 0.7725, + "step": 1138 + }, + { + "epoch": 0.4159211247033047, + "grad_norm": 1.2141560316085815, + "learning_rate": 3.999048839056793e-05, + "loss": 0.83, + "step": 1139 + }, + { + "epoch": 0.416286288113931, + "grad_norm": 0.7901906371116638, + "learning_rate": 3.999004088308462e-05, + "loss": 0.8021, + "step": 1140 + }, + { + "epoch": 0.41665145152455724, + "grad_norm": 1.0362855195999146, + "learning_rate": 3.998958309152741e-05, + "loss": 0.7839, + "step": 1141 + }, + { + "epoch": 0.4170166149351835, + "grad_norm": 1.0257155895233154, + "learning_rate": 3.998911501613184e-05, + "loss": 0.8022, + "step": 1142 + }, + { + "epoch": 0.41738177834580975, + "grad_norm": 0.9644251465797424, + "learning_rate": 3.99886366571387e-05, + "loss": 0.7898, + "step": 1143 + }, + { + "epoch": 0.417746941756436, + "grad_norm": 0.9577879905700684, + "learning_rate": 3.99881480147941e-05, + "loss": 0.765, + "step": 1144 + }, + { + "epoch": 0.41811210516706226, + "grad_norm": 0.9823921918869019, + "learning_rate": 3.998764908934942e-05, + "loss": 0.8112, + "step": 1145 + }, + { + "epoch": 0.4184772685776885, + "grad_norm": 1.0017061233520508, + "learning_rate": 3.998713988106134e-05, + "loss": 0.8097, + "step": 1146 + }, + { + "epoch": 0.4188424319883148, + "grad_norm": 0.8555564880371094, + "learning_rate": 3.9986620390191815e-05, + "loss": 0.7759, + "step": 1147 + }, + { + "epoch": 0.41920759539894104, + "grad_norm": 1.3428617715835571, + "learning_rate": 3.998609061700812e-05, + "loss": 0.7601, + "step": 1148 + }, + { + "epoch": 0.4195727588095673, + "grad_norm": 1.0684926509857178, + "learning_rate": 3.998555056178279e-05, + "loss": 0.8042, + "step": 1149 + }, + { + "epoch": 0.41993792222019355, + "grad_norm": 0.8180867433547974, + "learning_rate": 3.998500022479367e-05, + "loss": 0.7991, + "step": 1150 + }, + { + "epoch": 0.4203030856308198, + "grad_norm": 0.9196853041648865, + "learning_rate": 3.998443960632388e-05, + "loss": 0.7787, + "step": 1151 + }, + { + "epoch": 0.42066824904144606, + "grad_norm": 0.7535399794578552, + "learning_rate": 3.998386870666185e-05, + "loss": 0.793, + "step": 1152 + }, + { + "epoch": 0.4210334124520723, + "grad_norm": 0.9972962141036987, + "learning_rate": 3.9983287526101256e-05, + "loss": 0.7958, + "step": 1153 + }, + { + "epoch": 0.4213985758626986, + "grad_norm": 1.0572669506072998, + "learning_rate": 3.9982696064941116e-05, + "loss": 0.7844, + "step": 1154 + }, + { + "epoch": 0.42176373927332483, + "grad_norm": 1.0186238288879395, + "learning_rate": 3.9982094323485706e-05, + "loss": 0.7692, + "step": 1155 + }, + { + "epoch": 0.4221289026839511, + "grad_norm": 0.851263165473938, + "learning_rate": 3.9981482302044604e-05, + "loss": 0.7744, + "step": 1156 + }, + { + "epoch": 0.42249406609457735, + "grad_norm": 1.0665268898010254, + "learning_rate": 3.998086000093266e-05, + "loss": 0.7941, + "step": 1157 + }, + { + "epoch": 0.4228592295052036, + "grad_norm": 0.9266101717948914, + "learning_rate": 3.998022742047002e-05, + "loss": 0.802, + "step": 1158 + }, + { + "epoch": 0.42322439291582986, + "grad_norm": 0.9939461946487427, + "learning_rate": 3.9979584560982144e-05, + "loss": 0.793, + "step": 1159 + }, + { + "epoch": 0.42358955632645606, + "grad_norm": 1.3110156059265137, + "learning_rate": 3.997893142279973e-05, + "loss": 0.8345, + "step": 1160 + }, + { + "epoch": 0.4239547197370823, + "grad_norm": 1.0854302644729614, + "learning_rate": 3.997826800625881e-05, + "loss": 0.8214, + "step": 1161 + }, + { + "epoch": 0.4243198831477086, + "grad_norm": 0.5960488319396973, + "learning_rate": 3.9977594311700676e-05, + "loss": 0.7914, + "step": 1162 + }, + { + "epoch": 0.42468504655833483, + "grad_norm": 1.0404022932052612, + "learning_rate": 3.9976910339471914e-05, + "loss": 0.7772, + "step": 1163 + }, + { + "epoch": 0.4250502099689611, + "grad_norm": 1.0241742134094238, + "learning_rate": 3.9976216089924415e-05, + "loss": 0.7376, + "step": 1164 + }, + { + "epoch": 0.42541537337958735, + "grad_norm": 1.1148185729980469, + "learning_rate": 3.9975511563415336e-05, + "loss": 0.8058, + "step": 1165 + }, + { + "epoch": 0.4257805367902136, + "grad_norm": 0.7152006030082703, + "learning_rate": 3.997479676030711e-05, + "loss": 0.7782, + "step": 1166 + }, + { + "epoch": 0.42614570020083986, + "grad_norm": 0.9840139746665955, + "learning_rate": 3.9974071680967504e-05, + "loss": 0.8091, + "step": 1167 + }, + { + "epoch": 0.4265108636114661, + "grad_norm": 0.9908798336982727, + "learning_rate": 3.9973336325769526e-05, + "loss": 0.7751, + "step": 1168 + }, + { + "epoch": 0.4268760270220924, + "grad_norm": 1.0988410711288452, + "learning_rate": 3.9972590695091476e-05, + "loss": 0.7939, + "step": 1169 + }, + { + "epoch": 0.42724119043271863, + "grad_norm": 0.8988221883773804, + "learning_rate": 3.997183478931698e-05, + "loss": 0.7952, + "step": 1170 + }, + { + "epoch": 0.4276063538433449, + "grad_norm": 0.8373317718505859, + "learning_rate": 3.9971068608834895e-05, + "loss": 0.7942, + "step": 1171 + }, + { + "epoch": 0.42797151725397115, + "grad_norm": 1.4707140922546387, + "learning_rate": 3.9970292154039396e-05, + "loss": 0.7861, + "step": 1172 + }, + { + "epoch": 0.4283366806645974, + "grad_norm": 1.840456485748291, + "learning_rate": 3.9969505425329955e-05, + "loss": 0.7849, + "step": 1173 + }, + { + "epoch": 0.42870184407522366, + "grad_norm": 0.8191938400268555, + "learning_rate": 3.996870842311129e-05, + "loss": 0.816, + "step": 1174 + }, + { + "epoch": 0.4290670074858499, + "grad_norm": 1.8431330919265747, + "learning_rate": 3.9967901147793436e-05, + "loss": 0.7844, + "step": 1175 + }, + { + "epoch": 0.4294321708964762, + "grad_norm": 0.8188057541847229, + "learning_rate": 3.99670835997917e-05, + "loss": 0.7913, + "step": 1176 + }, + { + "epoch": 0.42979733430710243, + "grad_norm": 0.8033564686775208, + "learning_rate": 3.996625577952669e-05, + "loss": 0.7655, + "step": 1177 + }, + { + "epoch": 0.4301624977177287, + "grad_norm": 1.0804041624069214, + "learning_rate": 3.9965417687424274e-05, + "loss": 0.8119, + "step": 1178 + }, + { + "epoch": 0.43052766112835494, + "grad_norm": 0.9008678793907166, + "learning_rate": 3.996456932391562e-05, + "loss": 0.8007, + "step": 1179 + }, + { + "epoch": 0.4308928245389812, + "grad_norm": 0.7237406373023987, + "learning_rate": 3.9963710689437174e-05, + "loss": 0.7888, + "step": 1180 + }, + { + "epoch": 0.43125798794960746, + "grad_norm": 0.8602893948554993, + "learning_rate": 3.996284178443068e-05, + "loss": 0.7904, + "step": 1181 + }, + { + "epoch": 0.4316231513602337, + "grad_norm": 1.1926499605178833, + "learning_rate": 3.996196260934314e-05, + "loss": 0.7903, + "step": 1182 + }, + { + "epoch": 0.43198831477085997, + "grad_norm": 1.187950849533081, + "learning_rate": 3.996107316462686e-05, + "loss": 0.7792, + "step": 1183 + }, + { + "epoch": 0.43235347818148623, + "grad_norm": 0.7057816386222839, + "learning_rate": 3.9960173450739425e-05, + "loss": 0.7725, + "step": 1184 + }, + { + "epoch": 0.4327186415921125, + "grad_norm": 1.08821702003479, + "learning_rate": 3.9959263468143706e-05, + "loss": 0.7986, + "step": 1185 + }, + { + "epoch": 0.43308380500273874, + "grad_norm": 0.9124771952629089, + "learning_rate": 3.995834321730785e-05, + "loss": 0.8041, + "step": 1186 + }, + { + "epoch": 0.433448968413365, + "grad_norm": 0.9151847958564758, + "learning_rate": 3.995741269870528e-05, + "loss": 0.8016, + "step": 1187 + }, + { + "epoch": 0.43381413182399126, + "grad_norm": 1.1104234457015991, + "learning_rate": 3.9956471912814715e-05, + "loss": 0.7729, + "step": 1188 + }, + { + "epoch": 0.4341792952346175, + "grad_norm": 0.7967671155929565, + "learning_rate": 3.9955520860120164e-05, + "loss": 0.7914, + "step": 1189 + }, + { + "epoch": 0.43454445864524377, + "grad_norm": 0.9998862147331238, + "learning_rate": 3.995455954111089e-05, + "loss": 0.7782, + "step": 1190 + }, + { + "epoch": 0.43490962205587, + "grad_norm": 0.8886136412620544, + "learning_rate": 3.995358795628146e-05, + "loss": 0.7969, + "step": 1191 + }, + { + "epoch": 0.4352747854664963, + "grad_norm": 0.7756946086883545, + "learning_rate": 3.995260610613172e-05, + "loss": 0.7875, + "step": 1192 + }, + { + "epoch": 0.4356399488771225, + "grad_norm": 0.9421478509902954, + "learning_rate": 3.995161399116678e-05, + "loss": 0.7557, + "step": 1193 + }, + { + "epoch": 0.43600511228774874, + "grad_norm": 0.9283790588378906, + "learning_rate": 3.9950611611897055e-05, + "loss": 0.7792, + "step": 1194 + }, + { + "epoch": 0.436370275698375, + "grad_norm": 0.7054854035377502, + "learning_rate": 3.994959896883821e-05, + "loss": 0.7354, + "step": 1195 + }, + { + "epoch": 0.43673543910900126, + "grad_norm": 0.9545449614524841, + "learning_rate": 3.994857606251124e-05, + "loss": 0.7786, + "step": 1196 + }, + { + "epoch": 0.4371006025196275, + "grad_norm": 0.8932951092720032, + "learning_rate": 3.994754289344236e-05, + "loss": 0.795, + "step": 1197 + }, + { + "epoch": 0.43746576593025377, + "grad_norm": 0.7554992437362671, + "learning_rate": 3.9946499462163116e-05, + "loss": 0.7732, + "step": 1198 + }, + { + "epoch": 0.43783092934088, + "grad_norm": 1.0914173126220703, + "learning_rate": 3.99454457692103e-05, + "loss": 0.795, + "step": 1199 + }, + { + "epoch": 0.4381960927515063, + "grad_norm": 0.92984938621521, + "learning_rate": 3.9944381815125987e-05, + "loss": 0.7977, + "step": 1200 + }, + { + "epoch": 0.43856125616213254, + "grad_norm": 0.8287634253501892, + "learning_rate": 3.9943307600457563e-05, + "loss": 0.7723, + "step": 1201 + }, + { + "epoch": 0.4389264195727588, + "grad_norm": 1.0759422779083252, + "learning_rate": 3.994222312575764e-05, + "loss": 0.7911, + "step": 1202 + }, + { + "epoch": 0.43929158298338505, + "grad_norm": 0.7861177921295166, + "learning_rate": 3.994112839158416e-05, + "loss": 0.7568, + "step": 1203 + }, + { + "epoch": 0.4396567463940113, + "grad_norm": 0.8400720357894897, + "learning_rate": 3.99400233985003e-05, + "loss": 0.7809, + "step": 1204 + }, + { + "epoch": 0.44002190980463757, + "grad_norm": 1.083005666732788, + "learning_rate": 3.993890814707455e-05, + "loss": 0.7705, + "step": 1205 + }, + { + "epoch": 0.4403870732152638, + "grad_norm": 1.2543330192565918, + "learning_rate": 3.9937782637880665e-05, + "loss": 0.8156, + "step": 1206 + }, + { + "epoch": 0.4407522366258901, + "grad_norm": 0.8394210338592529, + "learning_rate": 3.9936646871497656e-05, + "loss": 0.7837, + "step": 1207 + }, + { + "epoch": 0.44111740003651634, + "grad_norm": 0.8094637393951416, + "learning_rate": 3.9935500848509845e-05, + "loss": 0.7788, + "step": 1208 + }, + { + "epoch": 0.4414825634471426, + "grad_norm": 0.898333728313446, + "learning_rate": 3.993434456950681e-05, + "loss": 0.7497, + "step": 1209 + }, + { + "epoch": 0.44184772685776885, + "grad_norm": 0.8007587790489197, + "learning_rate": 3.9933178035083406e-05, + "loss": 0.7402, + "step": 1210 + }, + { + "epoch": 0.4422128902683951, + "grad_norm": 0.8324313759803772, + "learning_rate": 3.993200124583977e-05, + "loss": 0.7768, + "step": 1211 + }, + { + "epoch": 0.44257805367902137, + "grad_norm": 1.1105968952178955, + "learning_rate": 3.993081420238132e-05, + "loss": 0.7723, + "step": 1212 + }, + { + "epoch": 0.4429432170896476, + "grad_norm": 0.8521186709403992, + "learning_rate": 3.992961690531873e-05, + "loss": 0.7604, + "step": 1213 + }, + { + "epoch": 0.4433083805002739, + "grad_norm": 0.850679337978363, + "learning_rate": 3.992840935526797e-05, + "loss": 0.7555, + "step": 1214 + }, + { + "epoch": 0.44367354391090014, + "grad_norm": 0.8649604916572571, + "learning_rate": 3.992719155285028e-05, + "loss": 0.7933, + "step": 1215 + }, + { + "epoch": 0.4440387073215264, + "grad_norm": 0.7320381999015808, + "learning_rate": 3.992596349869216e-05, + "loss": 0.777, + "step": 1216 + }, + { + "epoch": 0.44440387073215265, + "grad_norm": 1.2951277494430542, + "learning_rate": 3.99247251934254e-05, + "loss": 0.8055, + "step": 1217 + }, + { + "epoch": 0.4447690341427789, + "grad_norm": 0.9105983972549438, + "learning_rate": 3.992347663768705e-05, + "loss": 0.7768, + "step": 1218 + }, + { + "epoch": 0.44513419755340516, + "grad_norm": 0.9981197118759155, + "learning_rate": 3.9922217832119464e-05, + "loss": 0.7576, + "step": 1219 + }, + { + "epoch": 0.4454993609640314, + "grad_norm": 1.0948731899261475, + "learning_rate": 3.992094877737022e-05, + "loss": 0.8055, + "step": 1220 + }, + { + "epoch": 0.4458645243746577, + "grad_norm": 1.0124883651733398, + "learning_rate": 3.991966947409221e-05, + "loss": 0.7642, + "step": 1221 + }, + { + "epoch": 0.44622968778528393, + "grad_norm": 0.8435676693916321, + "learning_rate": 3.991837992294358e-05, + "loss": 0.7785, + "step": 1222 + }, + { + "epoch": 0.4465948511959102, + "grad_norm": 1.0190502405166626, + "learning_rate": 3.991708012458777e-05, + "loss": 0.7943, + "step": 1223 + }, + { + "epoch": 0.44696001460653645, + "grad_norm": 1.195723056793213, + "learning_rate": 3.991577007969344e-05, + "loss": 0.8112, + "step": 1224 + }, + { + "epoch": 0.4473251780171627, + "grad_norm": 1.138889193534851, + "learning_rate": 3.9914449788934584e-05, + "loss": 0.7964, + "step": 1225 + }, + { + "epoch": 0.44769034142778896, + "grad_norm": 0.848869264125824, + "learning_rate": 3.991311925299042e-05, + "loss": 0.7592, + "step": 1226 + }, + { + "epoch": 0.44805550483841516, + "grad_norm": 1.003063440322876, + "learning_rate": 3.991177847254547e-05, + "loss": 0.7567, + "step": 1227 + }, + { + "epoch": 0.4484206682490414, + "grad_norm": 1.5992316007614136, + "learning_rate": 3.991042744828951e-05, + "loss": 0.781, + "step": 1228 + }, + { + "epoch": 0.4487858316596677, + "grad_norm": 0.8211202621459961, + "learning_rate": 3.990906618091758e-05, + "loss": 0.7533, + "step": 1229 + }, + { + "epoch": 0.44915099507029393, + "grad_norm": 0.915510356426239, + "learning_rate": 3.9907694671129996e-05, + "loss": 0.7424, + "step": 1230 + }, + { + "epoch": 0.4495161584809202, + "grad_norm": 0.9703736305236816, + "learning_rate": 3.990631291963236e-05, + "loss": 0.7519, + "step": 1231 + }, + { + "epoch": 0.44988132189154645, + "grad_norm": 1.0902239084243774, + "learning_rate": 3.9904920927135504e-05, + "loss": 0.8147, + "step": 1232 + }, + { + "epoch": 0.4502464853021727, + "grad_norm": 0.7888534069061279, + "learning_rate": 3.9903518694355575e-05, + "loss": 0.772, + "step": 1233 + }, + { + "epoch": 0.45061164871279896, + "grad_norm": 0.9694254994392395, + "learning_rate": 3.990210622201396e-05, + "loss": 0.7618, + "step": 1234 + }, + { + "epoch": 0.4509768121234252, + "grad_norm": 0.8534974455833435, + "learning_rate": 3.9900683510837306e-05, + "loss": 0.7538, + "step": 1235 + }, + { + "epoch": 0.4513419755340515, + "grad_norm": 0.6274154782295227, + "learning_rate": 3.989925056155756e-05, + "loss": 0.7614, + "step": 1236 + }, + { + "epoch": 0.45170713894467773, + "grad_norm": 0.985271692276001, + "learning_rate": 3.9897807374911895e-05, + "loss": 0.7385, + "step": 1237 + }, + { + "epoch": 0.452072302355304, + "grad_norm": 0.7203537821769714, + "learning_rate": 3.9896353951642795e-05, + "loss": 0.7429, + "step": 1238 + }, + { + "epoch": 0.45243746576593025, + "grad_norm": 0.9454881548881531, + "learning_rate": 3.989489029249797e-05, + "loss": 0.7766, + "step": 1239 + }, + { + "epoch": 0.4528026291765565, + "grad_norm": 0.8015196323394775, + "learning_rate": 3.989341639823042e-05, + "loss": 0.7528, + "step": 1240 + }, + { + "epoch": 0.45316779258718276, + "grad_norm": 0.8929771184921265, + "learning_rate": 3.9891932269598414e-05, + "loss": 0.7491, + "step": 1241 + }, + { + "epoch": 0.453532955997809, + "grad_norm": 0.7127432823181152, + "learning_rate": 3.989043790736547e-05, + "loss": 0.7749, + "step": 1242 + }, + { + "epoch": 0.4538981194084353, + "grad_norm": 1.1709378957748413, + "learning_rate": 3.988893331230038e-05, + "loss": 0.8119, + "step": 1243 + }, + { + "epoch": 0.45426328281906153, + "grad_norm": 0.9406178593635559, + "learning_rate": 3.9887418485177175e-05, + "loss": 0.7858, + "step": 1244 + }, + { + "epoch": 0.4546284462296878, + "grad_norm": 1.2341252565383911, + "learning_rate": 3.9885893426775204e-05, + "loss": 0.7908, + "step": 1245 + }, + { + "epoch": 0.45499360964031405, + "grad_norm": 0.9134424328804016, + "learning_rate": 3.988435813787904e-05, + "loss": 0.7761, + "step": 1246 + }, + { + "epoch": 0.4553587730509403, + "grad_norm": 1.2409087419509888, + "learning_rate": 3.988281261927852e-05, + "loss": 0.7548, + "step": 1247 + }, + { + "epoch": 0.45572393646156656, + "grad_norm": 0.9813235402107239, + "learning_rate": 3.9881256871768756e-05, + "loss": 0.8041, + "step": 1248 + }, + { + "epoch": 0.4560890998721928, + "grad_norm": 0.9483786225318909, + "learning_rate": 3.9879690896150114e-05, + "loss": 0.7639, + "step": 1249 + }, + { + "epoch": 0.4564542632828191, + "grad_norm": 0.9267379641532898, + "learning_rate": 3.9878114693228236e-05, + "loss": 0.7598, + "step": 1250 + }, + { + "epoch": 0.45681942669344533, + "grad_norm": 1.0630885362625122, + "learning_rate": 3.9876528263813995e-05, + "loss": 0.7939, + "step": 1251 + }, + { + "epoch": 0.4571845901040716, + "grad_norm": 0.8597736954689026, + "learning_rate": 3.9874931608723566e-05, + "loss": 0.7781, + "step": 1252 + }, + { + "epoch": 0.45754975351469784, + "grad_norm": 0.8005722761154175, + "learning_rate": 3.9873324728778354e-05, + "loss": 0.7593, + "step": 1253 + }, + { + "epoch": 0.4579149169253241, + "grad_norm": 0.9881361126899719, + "learning_rate": 3.9871707624805037e-05, + "loss": 0.7797, + "step": 1254 + }, + { + "epoch": 0.45828008033595036, + "grad_norm": 0.9582185745239258, + "learning_rate": 3.987008029763555e-05, + "loss": 0.7805, + "step": 1255 + }, + { + "epoch": 0.4586452437465766, + "grad_norm": 0.8981064558029175, + "learning_rate": 3.9868442748107076e-05, + "loss": 0.7714, + "step": 1256 + }, + { + "epoch": 0.45901040715720287, + "grad_norm": 1.3101154565811157, + "learning_rate": 3.9866794977062086e-05, + "loss": 0.7825, + "step": 1257 + }, + { + "epoch": 0.45937557056782913, + "grad_norm": 1.0810472965240479, + "learning_rate": 3.986513698534829e-05, + "loss": 0.7424, + "step": 1258 + }, + { + "epoch": 0.4597407339784554, + "grad_norm": 0.9976481795310974, + "learning_rate": 3.9863468773818646e-05, + "loss": 0.7919, + "step": 1259 + }, + { + "epoch": 0.46010589738908164, + "grad_norm": 1.0877742767333984, + "learning_rate": 3.986179034333139e-05, + "loss": 0.733, + "step": 1260 + }, + { + "epoch": 0.46047106079970784, + "grad_norm": 1.0023266077041626, + "learning_rate": 3.986010169475002e-05, + "loss": 0.7323, + "step": 1261 + }, + { + "epoch": 0.4608362242103341, + "grad_norm": 0.9857776761054993, + "learning_rate": 3.985840282894325e-05, + "loss": 0.7279, + "step": 1262 + }, + { + "epoch": 0.46120138762096036, + "grad_norm": 1.1565684080123901, + "learning_rate": 3.9856693746785095e-05, + "loss": 0.7472, + "step": 1263 + }, + { + "epoch": 0.4615665510315866, + "grad_norm": 1.14054536819458, + "learning_rate": 3.9854974449154805e-05, + "loss": 0.7546, + "step": 1264 + }, + { + "epoch": 0.46193171444221287, + "grad_norm": 1.011195421218872, + "learning_rate": 3.985324493693689e-05, + "loss": 0.776, + "step": 1265 + }, + { + "epoch": 0.4622968778528391, + "grad_norm": 0.7427286505699158, + "learning_rate": 3.985150521102113e-05, + "loss": 0.7865, + "step": 1266 + }, + { + "epoch": 0.4626620412634654, + "grad_norm": 1.1418955326080322, + "learning_rate": 3.9849755272302515e-05, + "loss": 0.7852, + "step": 1267 + }, + { + "epoch": 0.46302720467409164, + "grad_norm": 0.8222767114639282, + "learning_rate": 3.984799512168134e-05, + "loss": 0.7686, + "step": 1268 + }, + { + "epoch": 0.4633923680847179, + "grad_norm": 0.7613946199417114, + "learning_rate": 3.9846224760063125e-05, + "loss": 0.7412, + "step": 1269 + }, + { + "epoch": 0.46375753149534416, + "grad_norm": 0.8206880688667297, + "learning_rate": 3.984444418835865e-05, + "loss": 0.7761, + "step": 1270 + }, + { + "epoch": 0.4641226949059704, + "grad_norm": 0.8956536054611206, + "learning_rate": 3.984265340748395e-05, + "loss": 0.7845, + "step": 1271 + }, + { + "epoch": 0.46448785831659667, + "grad_norm": 1.3047102689743042, + "learning_rate": 3.98408524183603e-05, + "loss": 0.7668, + "step": 1272 + }, + { + "epoch": 0.4648530217272229, + "grad_norm": 1.1105831861495972, + "learning_rate": 3.983904122191425e-05, + "loss": 0.7783, + "step": 1273 + }, + { + "epoch": 0.4652181851378492, + "grad_norm": 0.9006888270378113, + "learning_rate": 3.9837219819077584e-05, + "loss": 0.7582, + "step": 1274 + }, + { + "epoch": 0.46558334854847544, + "grad_norm": 0.736251175403595, + "learning_rate": 3.983538821078734e-05, + "loss": 0.7581, + "step": 1275 + }, + { + "epoch": 0.4659485119591017, + "grad_norm": 1.315656065940857, + "learning_rate": 3.98335463979858e-05, + "loss": 0.7577, + "step": 1276 + }, + { + "epoch": 0.46631367536972795, + "grad_norm": 1.501815676689148, + "learning_rate": 3.9831694381620513e-05, + "loss": 0.7814, + "step": 1277 + }, + { + "epoch": 0.4666788387803542, + "grad_norm": 0.8803642988204956, + "learning_rate": 3.982983216264427e-05, + "loss": 0.736, + "step": 1278 + }, + { + "epoch": 0.46704400219098047, + "grad_norm": 0.748648464679718, + "learning_rate": 3.982795974201509e-05, + "loss": 0.7902, + "step": 1279 + }, + { + "epoch": 0.4674091656016067, + "grad_norm": 0.9274646043777466, + "learning_rate": 3.982607712069627e-05, + "loss": 0.7483, + "step": 1280 + }, + { + "epoch": 0.467774329012233, + "grad_norm": 0.9979590773582458, + "learning_rate": 3.982418429965635e-05, + "loss": 0.7606, + "step": 1281 + }, + { + "epoch": 0.46813949242285924, + "grad_norm": 0.6248117685317993, + "learning_rate": 3.982228127986909e-05, + "loss": 0.7252, + "step": 1282 + }, + { + "epoch": 0.4685046558334855, + "grad_norm": 1.1311684846878052, + "learning_rate": 3.9820368062313546e-05, + "loss": 0.7732, + "step": 1283 + }, + { + "epoch": 0.46886981924411175, + "grad_norm": 1.0071941614151, + "learning_rate": 3.981844464797397e-05, + "loss": 0.7675, + "step": 1284 + }, + { + "epoch": 0.469234982654738, + "grad_norm": 1.47381591796875, + "learning_rate": 3.981651103783988e-05, + "loss": 0.7737, + "step": 1285 + }, + { + "epoch": 0.46960014606536427, + "grad_norm": 1.114651083946228, + "learning_rate": 3.9814567232906054e-05, + "loss": 0.7441, + "step": 1286 + }, + { + "epoch": 0.4699653094759905, + "grad_norm": 0.6732168793678284, + "learning_rate": 3.98126132341725e-05, + "loss": 0.7352, + "step": 1287 + }, + { + "epoch": 0.4703304728866168, + "grad_norm": 0.9219527840614319, + "learning_rate": 3.981064904264446e-05, + "loss": 0.7621, + "step": 1288 + }, + { + "epoch": 0.47069563629724304, + "grad_norm": 1.1353275775909424, + "learning_rate": 3.9808674659332445e-05, + "loss": 0.7874, + "step": 1289 + }, + { + "epoch": 0.4710607997078693, + "grad_norm": 0.9650176167488098, + "learning_rate": 3.9806690085252184e-05, + "loss": 0.792, + "step": 1290 + }, + { + "epoch": 0.47142596311849555, + "grad_norm": 0.731926441192627, + "learning_rate": 3.980469532142467e-05, + "loss": 0.7357, + "step": 1291 + }, + { + "epoch": 0.4717911265291218, + "grad_norm": 1.2001556158065796, + "learning_rate": 3.980269036887613e-05, + "loss": 0.7553, + "step": 1292 + }, + { + "epoch": 0.47215628993974806, + "grad_norm": 1.1558605432510376, + "learning_rate": 3.980067522863802e-05, + "loss": 0.7749, + "step": 1293 + }, + { + "epoch": 0.47252145335037427, + "grad_norm": 0.8277838230133057, + "learning_rate": 3.9798649901747064e-05, + "loss": 0.778, + "step": 1294 + }, + { + "epoch": 0.4728866167610005, + "grad_norm": 1.0746656656265259, + "learning_rate": 3.9796614389245205e-05, + "loss": 0.797, + "step": 1295 + }, + { + "epoch": 0.4732517801716268, + "grad_norm": 0.9039919972419739, + "learning_rate": 3.979456869217962e-05, + "loss": 0.7693, + "step": 1296 + }, + { + "epoch": 0.47361694358225304, + "grad_norm": 1.7959991693496704, + "learning_rate": 3.979251281160277e-05, + "loss": 0.7856, + "step": 1297 + }, + { + "epoch": 0.4739821069928793, + "grad_norm": 0.8877509236335754, + "learning_rate": 3.979044674857228e-05, + "loss": 0.7679, + "step": 1298 + }, + { + "epoch": 0.47434727040350555, + "grad_norm": 3.5038464069366455, + "learning_rate": 3.978837050415109e-05, + "loss": 0.7628, + "step": 1299 + }, + { + "epoch": 0.4747124338141318, + "grad_norm": 0.9983282685279846, + "learning_rate": 3.9786284079407325e-05, + "loss": 0.7321, + "step": 1300 + }, + { + "epoch": 0.47507759722475806, + "grad_norm": 1.103953242301941, + "learning_rate": 3.978418747541438e-05, + "loss": 0.7543, + "step": 1301 + }, + { + "epoch": 0.4754427606353843, + "grad_norm": 1.1223280429840088, + "learning_rate": 3.9782080693250875e-05, + "loss": 0.7411, + "step": 1302 + }, + { + "epoch": 0.4758079240460106, + "grad_norm": 1.0355007648468018, + "learning_rate": 3.977996373400066e-05, + "loss": 0.7828, + "step": 1303 + }, + { + "epoch": 0.47617308745663683, + "grad_norm": 0.9945892691612244, + "learning_rate": 3.9777836598752814e-05, + "loss": 0.7616, + "step": 1304 + }, + { + "epoch": 0.4765382508672631, + "grad_norm": 1.0359525680541992, + "learning_rate": 3.977569928860168e-05, + "loss": 0.7593, + "step": 1305 + }, + { + "epoch": 0.47690341427788935, + "grad_norm": 1.0656094551086426, + "learning_rate": 3.977355180464681e-05, + "loss": 0.7476, + "step": 1306 + }, + { + "epoch": 0.4772685776885156, + "grad_norm": 1.0946440696716309, + "learning_rate": 3.9771394147993e-05, + "loss": 0.7534, + "step": 1307 + }, + { + "epoch": 0.47763374109914186, + "grad_norm": 1.5356392860412598, + "learning_rate": 3.976922631975028e-05, + "loss": 0.806, + "step": 1308 + }, + { + "epoch": 0.4779989045097681, + "grad_norm": 0.8914726376533508, + "learning_rate": 3.97670483210339e-05, + "loss": 0.7457, + "step": 1309 + }, + { + "epoch": 0.4783640679203944, + "grad_norm": 0.9732979536056519, + "learning_rate": 3.9764860152964365e-05, + "loss": 0.782, + "step": 1310 + }, + { + "epoch": 0.47872923133102063, + "grad_norm": 0.9373230934143066, + "learning_rate": 3.9762661816667404e-05, + "loss": 0.7806, + "step": 1311 + }, + { + "epoch": 0.4790943947416469, + "grad_norm": 1.190435528755188, + "learning_rate": 3.9760453313273954e-05, + "loss": 0.7563, + "step": 1312 + }, + { + "epoch": 0.47945955815227315, + "grad_norm": 1.0832659006118774, + "learning_rate": 3.9758234643920214e-05, + "loss": 0.7213, + "step": 1313 + }, + { + "epoch": 0.4798247215628994, + "grad_norm": 0.9625949263572693, + "learning_rate": 3.9756005809747604e-05, + "loss": 0.7792, + "step": 1314 + }, + { + "epoch": 0.48018988497352566, + "grad_norm": 1.1885652542114258, + "learning_rate": 3.9753766811902756e-05, + "loss": 0.7377, + "step": 1315 + }, + { + "epoch": 0.4805550483841519, + "grad_norm": 1.1306437253952026, + "learning_rate": 3.975151765153756e-05, + "loss": 0.7626, + "step": 1316 + }, + { + "epoch": 0.4809202117947782, + "grad_norm": 1.4518258571624756, + "learning_rate": 3.9749258329809104e-05, + "loss": 0.7622, + "step": 1317 + }, + { + "epoch": 0.48128537520540443, + "grad_norm": 1.0818719863891602, + "learning_rate": 3.974698884787973e-05, + "loss": 0.7561, + "step": 1318 + }, + { + "epoch": 0.4816505386160307, + "grad_norm": 0.9876286387443542, + "learning_rate": 3.974470920691699e-05, + "loss": 0.7512, + "step": 1319 + }, + { + "epoch": 0.48201570202665694, + "grad_norm": 2.4548518657684326, + "learning_rate": 3.974241940809367e-05, + "loss": 0.7738, + "step": 1320 + }, + { + "epoch": 0.4823808654372832, + "grad_norm": 0.925119936466217, + "learning_rate": 3.9740119452587784e-05, + "loss": 0.7352, + "step": 1321 + }, + { + "epoch": 0.48274602884790946, + "grad_norm": 0.841683566570282, + "learning_rate": 3.9737809341582545e-05, + "loss": 0.7432, + "step": 1322 + }, + { + "epoch": 0.4831111922585357, + "grad_norm": 1.1326254606246948, + "learning_rate": 3.973548907626644e-05, + "loss": 0.7402, + "step": 1323 + }, + { + "epoch": 0.483476355669162, + "grad_norm": 0.839130699634552, + "learning_rate": 3.973315865783314e-05, + "loss": 0.7539, + "step": 1324 + }, + { + "epoch": 0.48384151907978823, + "grad_norm": 1.288762092590332, + "learning_rate": 3.9730818087481554e-05, + "loss": 0.7786, + "step": 1325 + }, + { + "epoch": 0.4842066824904145, + "grad_norm": 0.9070988893508911, + "learning_rate": 3.9728467366415815e-05, + "loss": 0.7703, + "step": 1326 + }, + { + "epoch": 0.48457184590104074, + "grad_norm": 0.861164391040802, + "learning_rate": 3.972610649584526e-05, + "loss": 0.7523, + "step": 1327 + }, + { + "epoch": 0.48493700931166694, + "grad_norm": 1.061942458152771, + "learning_rate": 3.972373547698448e-05, + "loss": 0.7517, + "step": 1328 + }, + { + "epoch": 0.4853021727222932, + "grad_norm": 1.091475009918213, + "learning_rate": 3.9721354311053256e-05, + "loss": 0.7523, + "step": 1329 + }, + { + "epoch": 0.48566733613291946, + "grad_norm": 0.9926595687866211, + "learning_rate": 3.971896299927661e-05, + "loss": 0.7897, + "step": 1330 + }, + { + "epoch": 0.4860324995435457, + "grad_norm": 1.066672444343567, + "learning_rate": 3.971656154288477e-05, + "loss": 0.7565, + "step": 1331 + }, + { + "epoch": 0.48639766295417197, + "grad_norm": 1.009468674659729, + "learning_rate": 3.97141499431132e-05, + "loss": 0.7789, + "step": 1332 + }, + { + "epoch": 0.48676282636479823, + "grad_norm": 1.015899419784546, + "learning_rate": 3.971172820120256e-05, + "loss": 0.7394, + "step": 1333 + }, + { + "epoch": 0.4871279897754245, + "grad_norm": 0.8125002980232239, + "learning_rate": 3.970929631839874e-05, + "loss": 0.755, + "step": 1334 + }, + { + "epoch": 0.48749315318605074, + "grad_norm": 1.096103549003601, + "learning_rate": 3.9706854295952856e-05, + "loss": 0.726, + "step": 1335 + }, + { + "epoch": 0.487858316596677, + "grad_norm": 0.8944382667541504, + "learning_rate": 3.9704402135121214e-05, + "loss": 0.7747, + "step": 1336 + }, + { + "epoch": 0.48822348000730326, + "grad_norm": 1.324345588684082, + "learning_rate": 3.970193983716537e-05, + "loss": 0.809, + "step": 1337 + }, + { + "epoch": 0.4885886434179295, + "grad_norm": 0.9750800728797913, + "learning_rate": 3.9699467403352066e-05, + "loss": 0.792, + "step": 1338 + }, + { + "epoch": 0.48895380682855577, + "grad_norm": 0.7252906560897827, + "learning_rate": 3.9696984834953274e-05, + "loss": 0.7506, + "step": 1339 + }, + { + "epoch": 0.489318970239182, + "grad_norm": 0.9124673008918762, + "learning_rate": 3.969449213324617e-05, + "loss": 0.7542, + "step": 1340 + }, + { + "epoch": 0.4896841336498083, + "grad_norm": 0.9339630603790283, + "learning_rate": 3.969198929951316e-05, + "loss": 0.7512, + "step": 1341 + }, + { + "epoch": 0.49004929706043454, + "grad_norm": 1.2227842807769775, + "learning_rate": 3.9689476335041844e-05, + "loss": 0.7772, + "step": 1342 + }, + { + "epoch": 0.4904144604710608, + "grad_norm": 0.8912896513938904, + "learning_rate": 3.9686953241125045e-05, + "loss": 0.762, + "step": 1343 + }, + { + "epoch": 0.49077962388168705, + "grad_norm": 0.9073852300643921, + "learning_rate": 3.96844200190608e-05, + "loss": 0.7357, + "step": 1344 + }, + { + "epoch": 0.4911447872923133, + "grad_norm": 0.837660014629364, + "learning_rate": 3.968187667015233e-05, + "loss": 0.7352, + "step": 1345 + }, + { + "epoch": 0.49150995070293957, + "grad_norm": 0.9073949456214905, + "learning_rate": 3.9679323195708095e-05, + "loss": 0.7387, + "step": 1346 + }, + { + "epoch": 0.4918751141135658, + "grad_norm": 0.7531459331512451, + "learning_rate": 3.9676759597041765e-05, + "loss": 0.7518, + "step": 1347 + }, + { + "epoch": 0.4922402775241921, + "grad_norm": 1.2089084386825562, + "learning_rate": 3.96741858754722e-05, + "loss": 0.7792, + "step": 1348 + }, + { + "epoch": 0.49260544093481834, + "grad_norm": 0.8173801302909851, + "learning_rate": 3.9671602032323475e-05, + "loss": 0.7516, + "step": 1349 + }, + { + "epoch": 0.4929706043454446, + "grad_norm": 0.9236201643943787, + "learning_rate": 3.9669008068924885e-05, + "loss": 0.726, + "step": 1350 + }, + { + "epoch": 0.49333576775607085, + "grad_norm": 0.8511644601821899, + "learning_rate": 3.9666403986610904e-05, + "loss": 0.7853, + "step": 1351 + }, + { + "epoch": 0.4937009311666971, + "grad_norm": 1.0044788122177124, + "learning_rate": 3.9663789786721235e-05, + "loss": 0.7512, + "step": 1352 + }, + { + "epoch": 0.49406609457732337, + "grad_norm": 0.9926662445068359, + "learning_rate": 3.966116547060078e-05, + "loss": 0.7505, + "step": 1353 + }, + { + "epoch": 0.4944312579879496, + "grad_norm": 0.8975332975387573, + "learning_rate": 3.965853103959965e-05, + "loss": 0.7635, + "step": 1354 + }, + { + "epoch": 0.4947964213985759, + "grad_norm": 1.0068562030792236, + "learning_rate": 3.965588649507314e-05, + "loss": 0.7328, + "step": 1355 + }, + { + "epoch": 0.49516158480920214, + "grad_norm": 1.42154860496521, + "learning_rate": 3.965323183838177e-05, + "loss": 0.8043, + "step": 1356 + }, + { + "epoch": 0.4955267482198284, + "grad_norm": 1.7872182130813599, + "learning_rate": 3.9650567070891256e-05, + "loss": 0.7196, + "step": 1357 + }, + { + "epoch": 0.49589191163045465, + "grad_norm": 1.0129311084747314, + "learning_rate": 3.964789219397252e-05, + "loss": 0.7634, + "step": 1358 + }, + { + "epoch": 0.4962570750410809, + "grad_norm": 1.1521090269088745, + "learning_rate": 3.964520720900167e-05, + "loss": 0.7689, + "step": 1359 + }, + { + "epoch": 0.49662223845170717, + "grad_norm": 1.0223087072372437, + "learning_rate": 3.964251211736002e-05, + "loss": 0.76, + "step": 1360 + }, + { + "epoch": 0.49698740186233337, + "grad_norm": 1.1527354717254639, + "learning_rate": 3.963980692043408e-05, + "loss": 0.7615, + "step": 1361 + }, + { + "epoch": 0.4973525652729596, + "grad_norm": 1.0977435111999512, + "learning_rate": 3.963709161961559e-05, + "loss": 0.7582, + "step": 1362 + }, + { + "epoch": 0.4977177286835859, + "grad_norm": 1.1074302196502686, + "learning_rate": 3.9634366216301445e-05, + "loss": 0.761, + "step": 1363 + }, + { + "epoch": 0.49808289209421214, + "grad_norm": 0.6354876160621643, + "learning_rate": 3.963163071189376e-05, + "loss": 0.7312, + "step": 1364 + }, + { + "epoch": 0.4984480555048384, + "grad_norm": 0.8839811086654663, + "learning_rate": 3.962888510779984e-05, + "loss": 0.7709, + "step": 1365 + }, + { + "epoch": 0.49881321891546465, + "grad_norm": 1.2112983465194702, + "learning_rate": 3.962612940543219e-05, + "loss": 0.7876, + "step": 1366 + }, + { + "epoch": 0.4991783823260909, + "grad_norm": 1.0086325407028198, + "learning_rate": 3.962336360620851e-05, + "loss": 0.7424, + "step": 1367 + }, + { + "epoch": 0.49954354573671716, + "grad_norm": 0.9076545834541321, + "learning_rate": 3.962058771155169e-05, + "loss": 0.7189, + "step": 1368 + }, + { + "epoch": 0.4999087091473434, + "grad_norm": 0.8234534859657288, + "learning_rate": 3.9617801722889815e-05, + "loss": 0.7291, + "step": 1369 + }, + { + "epoch": 0.5002738725579697, + "grad_norm": 1.1717032194137573, + "learning_rate": 3.9615005641656175e-05, + "loss": 0.7498, + "step": 1370 + }, + { + "epoch": 0.5006390359685959, + "grad_norm": 1.1757103204727173, + "learning_rate": 3.961219946928923e-05, + "loss": 0.7739, + "step": 1371 + }, + { + "epoch": 0.5010041993792222, + "grad_norm": 0.7470300197601318, + "learning_rate": 3.960938320723265e-05, + "loss": 0.7401, + "step": 1372 + }, + { + "epoch": 0.5013693627898484, + "grad_norm": 1.7062366008758545, + "learning_rate": 3.960655685693528e-05, + "loss": 0.7811, + "step": 1373 + }, + { + "epoch": 0.5017345262004748, + "grad_norm": 0.990139365196228, + "learning_rate": 3.960372041985117e-05, + "loss": 0.7343, + "step": 1374 + }, + { + "epoch": 0.502099689611101, + "grad_norm": 1.0472018718719482, + "learning_rate": 3.960087389743955e-05, + "loss": 0.7577, + "step": 1375 + }, + { + "epoch": 0.5024648530217273, + "grad_norm": 0.8663569688796997, + "learning_rate": 3.959801729116485e-05, + "loss": 0.7581, + "step": 1376 + }, + { + "epoch": 0.5028300164323535, + "grad_norm": 1.1145840883255005, + "learning_rate": 3.959515060249666e-05, + "loss": 0.724, + "step": 1377 + }, + { + "epoch": 0.5031951798429797, + "grad_norm": 0.9530529975891113, + "learning_rate": 3.959227383290981e-05, + "loss": 0.7391, + "step": 1378 + }, + { + "epoch": 0.503560343253606, + "grad_norm": 1.080680251121521, + "learning_rate": 3.9589386983884245e-05, + "loss": 0.7557, + "step": 1379 + }, + { + "epoch": 0.5039255066642322, + "grad_norm": 1.1287989616394043, + "learning_rate": 3.9586490056905155e-05, + "loss": 0.7414, + "step": 1380 + }, + { + "epoch": 0.5042906700748585, + "grad_norm": 0.7276008129119873, + "learning_rate": 3.958358305346289e-05, + "loss": 0.7595, + "step": 1381 + }, + { + "epoch": 0.5046558334854847, + "grad_norm": 1.0778876543045044, + "learning_rate": 3.958066597505299e-05, + "loss": 0.7636, + "step": 1382 + }, + { + "epoch": 0.505020996896111, + "grad_norm": 1.0213855504989624, + "learning_rate": 3.957773882317615e-05, + "loss": 0.7161, + "step": 1383 + }, + { + "epoch": 0.5053861603067372, + "grad_norm": 1.043364405632019, + "learning_rate": 3.957480159933831e-05, + "loss": 0.7676, + "step": 1384 + }, + { + "epoch": 0.5057513237173635, + "grad_norm": 1.728406548500061, + "learning_rate": 3.957185430505052e-05, + "loss": 0.7615, + "step": 1385 + }, + { + "epoch": 0.5061164871279897, + "grad_norm": 1.02801513671875, + "learning_rate": 3.9568896941829076e-05, + "loss": 0.6919, + "step": 1386 + }, + { + "epoch": 0.506481650538616, + "grad_norm": 0.7651058435440063, + "learning_rate": 3.9565929511195395e-05, + "loss": 0.7399, + "step": 1387 + }, + { + "epoch": 0.5068468139492422, + "grad_norm": 0.8804731965065002, + "learning_rate": 3.9562952014676116e-05, + "loss": 0.7604, + "step": 1388 + }, + { + "epoch": 0.5072119773598686, + "grad_norm": 1.0950775146484375, + "learning_rate": 3.955996445380303e-05, + "loss": 0.7845, + "step": 1389 + }, + { + "epoch": 0.5075771407704948, + "grad_norm": 0.9987009167671204, + "learning_rate": 3.955696683011314e-05, + "loss": 0.7397, + "step": 1390 + }, + { + "epoch": 0.5079423041811211, + "grad_norm": 0.7001053094863892, + "learning_rate": 3.9553959145148585e-05, + "loss": 0.7282, + "step": 1391 + }, + { + "epoch": 0.5083074675917473, + "grad_norm": 1.055344581604004, + "learning_rate": 3.955094140045669e-05, + "loss": 0.7471, + "step": 1392 + }, + { + "epoch": 0.5086726310023736, + "grad_norm": 1.044423222541809, + "learning_rate": 3.954791359758998e-05, + "loss": 0.7891, + "step": 1393 + }, + { + "epoch": 0.5090377944129998, + "grad_norm": 0.9530107975006104, + "learning_rate": 3.9544875738106136e-05, + "loss": 0.7474, + "step": 1394 + }, + { + "epoch": 0.5094029578236261, + "grad_norm": 1.2218527793884277, + "learning_rate": 3.9541827823568016e-05, + "loss": 0.7604, + "step": 1395 + }, + { + "epoch": 0.5097681212342523, + "grad_norm": 1.285076379776001, + "learning_rate": 3.953876985554364e-05, + "loss": 0.7841, + "step": 1396 + }, + { + "epoch": 0.5101332846448786, + "grad_norm": 0.9849181175231934, + "learning_rate": 3.953570183560621e-05, + "loss": 0.7719, + "step": 1397 + }, + { + "epoch": 0.5104984480555048, + "grad_norm": 1.0370405912399292, + "learning_rate": 3.953262376533412e-05, + "loss": 0.7454, + "step": 1398 + }, + { + "epoch": 0.5108636114661311, + "grad_norm": 0.9289689064025879, + "learning_rate": 3.9529535646310876e-05, + "loss": 0.7605, + "step": 1399 + }, + { + "epoch": 0.5112287748767573, + "grad_norm": 1.2506901025772095, + "learning_rate": 3.9526437480125227e-05, + "loss": 0.7914, + "step": 1400 + }, + { + "epoch": 0.5115939382873836, + "grad_norm": 0.8542978167533875, + "learning_rate": 3.952332926837105e-05, + "loss": 0.7587, + "step": 1401 + }, + { + "epoch": 0.5119591016980098, + "grad_norm": 1.0645527839660645, + "learning_rate": 3.9520211012647366e-05, + "loss": 0.7507, + "step": 1402 + }, + { + "epoch": 0.5123242651086362, + "grad_norm": 1.0783454179763794, + "learning_rate": 3.951708271455843e-05, + "loss": 0.7578, + "step": 1403 + }, + { + "epoch": 0.5126894285192624, + "grad_norm": 0.8975682258605957, + "learning_rate": 3.95139443757136e-05, + "loss": 0.7546, + "step": 1404 + }, + { + "epoch": 0.5130545919298887, + "grad_norm": 1.2350717782974243, + "learning_rate": 3.951079599772744e-05, + "loss": 0.7856, + "step": 1405 + }, + { + "epoch": 0.5134197553405149, + "grad_norm": 1.0456730127334595, + "learning_rate": 3.950763758221966e-05, + "loss": 0.7516, + "step": 1406 + }, + { + "epoch": 0.5137849187511412, + "grad_norm": 0.9096729755401611, + "learning_rate": 3.950446913081513e-05, + "loss": 0.7571, + "step": 1407 + }, + { + "epoch": 0.5141500821617674, + "grad_norm": 0.8735365271568298, + "learning_rate": 3.9501290645143905e-05, + "loss": 0.7809, + "step": 1408 + }, + { + "epoch": 0.5145152455723937, + "grad_norm": 1.090287446975708, + "learning_rate": 3.949810212684117e-05, + "loss": 0.7471, + "step": 1409 + }, + { + "epoch": 0.5148804089830199, + "grad_norm": 0.8822610378265381, + "learning_rate": 3.949490357754731e-05, + "loss": 0.7285, + "step": 1410 + }, + { + "epoch": 0.5152455723936462, + "grad_norm": 0.9613785743713379, + "learning_rate": 3.9491694998907835e-05, + "loss": 0.7302, + "step": 1411 + }, + { + "epoch": 0.5156107358042724, + "grad_norm": 0.9969671368598938, + "learning_rate": 3.948847639257344e-05, + "loss": 0.7629, + "step": 1412 + }, + { + "epoch": 0.5159758992148986, + "grad_norm": 1.1526439189910889, + "learning_rate": 3.948524776019997e-05, + "loss": 0.765, + "step": 1413 + }, + { + "epoch": 0.5163410626255249, + "grad_norm": 0.8454238176345825, + "learning_rate": 3.9482009103448415e-05, + "loss": 0.7268, + "step": 1414 + }, + { + "epoch": 0.5167062260361511, + "grad_norm": 0.8136591911315918, + "learning_rate": 3.947876042398494e-05, + "loss": 0.7429, + "step": 1415 + }, + { + "epoch": 0.5170713894467774, + "grad_norm": 1.1326630115509033, + "learning_rate": 3.947550172348087e-05, + "loss": 0.7675, + "step": 1416 + }, + { + "epoch": 0.5174365528574036, + "grad_norm": 0.917960524559021, + "learning_rate": 3.947223300361265e-05, + "loss": 0.7319, + "step": 1417 + }, + { + "epoch": 0.51780171626803, + "grad_norm": 0.93192058801651, + "learning_rate": 3.946895426606194e-05, + "loss": 0.7578, + "step": 1418 + }, + { + "epoch": 0.5181668796786562, + "grad_norm": 0.9007557034492493, + "learning_rate": 3.946566551251549e-05, + "loss": 0.7427, + "step": 1419 + }, + { + "epoch": 0.5185320430892825, + "grad_norm": 1.0617369413375854, + "learning_rate": 3.946236674466524e-05, + "loss": 0.7753, + "step": 1420 + }, + { + "epoch": 0.5188972064999087, + "grad_norm": 0.7483951449394226, + "learning_rate": 3.945905796420828e-05, + "loss": 0.7314, + "step": 1421 + }, + { + "epoch": 0.519262369910535, + "grad_norm": 0.8350456357002258, + "learning_rate": 3.945573917284685e-05, + "loss": 0.7306, + "step": 1422 + }, + { + "epoch": 0.5196275333211612, + "grad_norm": 0.791534960269928, + "learning_rate": 3.945241037228831e-05, + "loss": 0.7414, + "step": 1423 + }, + { + "epoch": 0.5199926967317875, + "grad_norm": 0.9981648921966553, + "learning_rate": 3.944907156424522e-05, + "loss": 0.7371, + "step": 1424 + }, + { + "epoch": 0.5203578601424137, + "grad_norm": 0.7343462705612183, + "learning_rate": 3.9445722750435244e-05, + "loss": 0.7511, + "step": 1425 + }, + { + "epoch": 0.52072302355304, + "grad_norm": 0.775215208530426, + "learning_rate": 3.944236393258123e-05, + "loss": 0.7365, + "step": 1426 + }, + { + "epoch": 0.5210881869636662, + "grad_norm": 1.1900813579559326, + "learning_rate": 3.9438995112411144e-05, + "loss": 0.7915, + "step": 1427 + }, + { + "epoch": 0.5214533503742925, + "grad_norm": 0.951445460319519, + "learning_rate": 3.943561629165811e-05, + "loss": 0.7274, + "step": 1428 + }, + { + "epoch": 0.5218185137849187, + "grad_norm": 0.9051117897033691, + "learning_rate": 3.94322274720604e-05, + "loss": 0.7634, + "step": 1429 + }, + { + "epoch": 0.522183677195545, + "grad_norm": 0.8373913764953613, + "learning_rate": 3.942882865536142e-05, + "loss": 0.7477, + "step": 1430 + }, + { + "epoch": 0.5225488406061712, + "grad_norm": 1.136154294013977, + "learning_rate": 3.942541984330972e-05, + "loss": 0.7323, + "step": 1431 + }, + { + "epoch": 0.5229140040167976, + "grad_norm": 0.8140801191329956, + "learning_rate": 3.942200103765901e-05, + "loss": 0.7441, + "step": 1432 + }, + { + "epoch": 0.5232791674274238, + "grad_norm": 0.7399319410324097, + "learning_rate": 3.941857224016812e-05, + "loss": 0.7267, + "step": 1433 + }, + { + "epoch": 0.5236443308380501, + "grad_norm": 0.8513920307159424, + "learning_rate": 3.941513345260104e-05, + "loss": 0.7428, + "step": 1434 + }, + { + "epoch": 0.5240094942486763, + "grad_norm": 0.9583787322044373, + "learning_rate": 3.941168467672687e-05, + "loss": 0.7773, + "step": 1435 + }, + { + "epoch": 0.5243746576593026, + "grad_norm": 1.1312326192855835, + "learning_rate": 3.940822591431988e-05, + "loss": 0.7528, + "step": 1436 + }, + { + "epoch": 0.5247398210699288, + "grad_norm": 1.0751779079437256, + "learning_rate": 3.940475716715946e-05, + "loss": 0.7648, + "step": 1437 + }, + { + "epoch": 0.5251049844805551, + "grad_norm": 0.7694438695907593, + "learning_rate": 3.9401278437030144e-05, + "loss": 0.7466, + "step": 1438 + }, + { + "epoch": 0.5254701478911813, + "grad_norm": 1.1514031887054443, + "learning_rate": 3.9397789725721594e-05, + "loss": 0.7599, + "step": 1439 + }, + { + "epoch": 0.5258353113018076, + "grad_norm": 1.1192198991775513, + "learning_rate": 3.939429103502862e-05, + "loss": 0.7544, + "step": 1440 + }, + { + "epoch": 0.5262004747124338, + "grad_norm": 0.6443295478820801, + "learning_rate": 3.939078236675115e-05, + "loss": 0.7329, + "step": 1441 + }, + { + "epoch": 0.5265656381230601, + "grad_norm": 0.7252869606018066, + "learning_rate": 3.938726372269425e-05, + "loss": 0.7239, + "step": 1442 + }, + { + "epoch": 0.5269308015336863, + "grad_norm": 0.8113353848457336, + "learning_rate": 3.9383735104668135e-05, + "loss": 0.7357, + "step": 1443 + }, + { + "epoch": 0.5272959649443126, + "grad_norm": 1.05000638961792, + "learning_rate": 3.9380196514488126e-05, + "loss": 0.7494, + "step": 1444 + }, + { + "epoch": 0.5276611283549388, + "grad_norm": 1.1334178447723389, + "learning_rate": 3.937664795397469e-05, + "loss": 0.7516, + "step": 1445 + }, + { + "epoch": 0.528026291765565, + "grad_norm": 1.180598258972168, + "learning_rate": 3.937308942495342e-05, + "loss": 0.7391, + "step": 1446 + }, + { + "epoch": 0.5283914551761913, + "grad_norm": 0.9825596213340759, + "learning_rate": 3.936952092925503e-05, + "loss": 0.7528, + "step": 1447 + }, + { + "epoch": 0.5287566185868176, + "grad_norm": 1.298930287361145, + "learning_rate": 3.9365942468715375e-05, + "loss": 0.735, + "step": 1448 + }, + { + "epoch": 0.5291217819974439, + "grad_norm": 1.0083543062210083, + "learning_rate": 3.936235404517543e-05, + "loss": 0.731, + "step": 1449 + }, + { + "epoch": 0.5294869454080701, + "grad_norm": 0.8948465585708618, + "learning_rate": 3.935875566048129e-05, + "loss": 0.7324, + "step": 1450 + }, + { + "epoch": 0.5298521088186964, + "grad_norm": 1.0688496828079224, + "learning_rate": 3.935514731648418e-05, + "loss": 0.7656, + "step": 1451 + }, + { + "epoch": 0.5302172722293226, + "grad_norm": 1.411491870880127, + "learning_rate": 3.935152901504045e-05, + "loss": 0.7747, + "step": 1452 + }, + { + "epoch": 0.5305824356399489, + "grad_norm": 0.8527326583862305, + "learning_rate": 3.934790075801156e-05, + "loss": 0.7478, + "step": 1453 + }, + { + "epoch": 0.5309475990505751, + "grad_norm": 0.7693621516227722, + "learning_rate": 3.934426254726413e-05, + "loss": 0.7347, + "step": 1454 + }, + { + "epoch": 0.5313127624612014, + "grad_norm": 0.9457259774208069, + "learning_rate": 3.934061438466985e-05, + "loss": 0.7649, + "step": 1455 + }, + { + "epoch": 0.5316779258718276, + "grad_norm": 0.6615961194038391, + "learning_rate": 3.933695627210555e-05, + "loss": 0.7629, + "step": 1456 + }, + { + "epoch": 0.5320430892824539, + "grad_norm": 0.5765036344528198, + "learning_rate": 3.93332882114532e-05, + "loss": 0.7267, + "step": 1457 + }, + { + "epoch": 0.5324082526930801, + "grad_norm": 0.880248486995697, + "learning_rate": 3.9329610204599864e-05, + "loss": 0.7397, + "step": 1458 + }, + { + "epoch": 0.5327734161037064, + "grad_norm": 0.7687374353408813, + "learning_rate": 3.932592225343772e-05, + "loss": 0.7455, + "step": 1459 + }, + { + "epoch": 0.5331385795143326, + "grad_norm": 0.8867073059082031, + "learning_rate": 3.932222435986408e-05, + "loss": 0.748, + "step": 1460 + }, + { + "epoch": 0.533503742924959, + "grad_norm": 1.0743588209152222, + "learning_rate": 3.931851652578137e-05, + "loss": 0.764, + "step": 1461 + }, + { + "epoch": 0.5338689063355851, + "grad_norm": 0.8158539533615112, + "learning_rate": 3.93147987530971e-05, + "loss": 0.7408, + "step": 1462 + }, + { + "epoch": 0.5342340697462115, + "grad_norm": 0.9781192541122437, + "learning_rate": 3.9311071043723927e-05, + "loss": 0.7259, + "step": 1463 + }, + { + "epoch": 0.5345992331568377, + "grad_norm": 1.0258907079696655, + "learning_rate": 3.930733339957961e-05, + "loss": 0.7621, + "step": 1464 + }, + { + "epoch": 0.534964396567464, + "grad_norm": 1.2095290422439575, + "learning_rate": 3.9303585822587014e-05, + "loss": 0.748, + "step": 1465 + }, + { + "epoch": 0.5353295599780902, + "grad_norm": 1.3170123100280762, + "learning_rate": 3.929982831467412e-05, + "loss": 0.7411, + "step": 1466 + }, + { + "epoch": 0.5356947233887165, + "grad_norm": 0.941737949848175, + "learning_rate": 3.9296060877774004e-05, + "loss": 0.7373, + "step": 1467 + }, + { + "epoch": 0.5360598867993427, + "grad_norm": 1.0546380281448364, + "learning_rate": 3.9292283513824873e-05, + "loss": 0.7261, + "step": 1468 + }, + { + "epoch": 0.536425050209969, + "grad_norm": 0.8461571931838989, + "learning_rate": 3.928849622477002e-05, + "loss": 0.7213, + "step": 1469 + }, + { + "epoch": 0.5367902136205952, + "grad_norm": 0.902228832244873, + "learning_rate": 3.928469901255787e-05, + "loss": 0.7261, + "step": 1470 + }, + { + "epoch": 0.5371553770312215, + "grad_norm": 0.6545456051826477, + "learning_rate": 3.928089187914192e-05, + "loss": 0.7326, + "step": 1471 + }, + { + "epoch": 0.5375205404418477, + "grad_norm": 0.6541972160339355, + "learning_rate": 3.927707482648079e-05, + "loss": 0.7266, + "step": 1472 + }, + { + "epoch": 0.537885703852474, + "grad_norm": 0.862709641456604, + "learning_rate": 3.92732478565382e-05, + "loss": 0.7932, + "step": 1473 + }, + { + "epoch": 0.5382508672631002, + "grad_norm": 1.082711935043335, + "learning_rate": 3.926941097128298e-05, + "loss": 0.7378, + "step": 1474 + }, + { + "epoch": 0.5386160306737265, + "grad_norm": 1.1237872838974, + "learning_rate": 3.9265564172689046e-05, + "loss": 0.7468, + "step": 1475 + }, + { + "epoch": 0.5389811940843527, + "grad_norm": 0.8410653471946716, + "learning_rate": 3.926170746273543e-05, + "loss": 0.7385, + "step": 1476 + }, + { + "epoch": 0.5393463574949791, + "grad_norm": 0.7173160910606384, + "learning_rate": 3.925784084340624e-05, + "loss": 0.7298, + "step": 1477 + }, + { + "epoch": 0.5397115209056053, + "grad_norm": 0.9982025027275085, + "learning_rate": 3.9253964316690707e-05, + "loss": 0.7587, + "step": 1478 + }, + { + "epoch": 0.5400766843162315, + "grad_norm": 0.919310986995697, + "learning_rate": 3.925007788458315e-05, + "loss": 0.7244, + "step": 1479 + }, + { + "epoch": 0.5404418477268578, + "grad_norm": 0.8118817806243896, + "learning_rate": 3.924618154908298e-05, + "loss": 0.7427, + "step": 1480 + }, + { + "epoch": 0.540807011137484, + "grad_norm": 0.8505826592445374, + "learning_rate": 3.9242275312194694e-05, + "loss": 0.7567, + "step": 1481 + }, + { + "epoch": 0.5411721745481103, + "grad_norm": 0.8501365780830383, + "learning_rate": 3.923835917592792e-05, + "loss": 0.7379, + "step": 1482 + }, + { + "epoch": 0.5415373379587365, + "grad_norm": 0.6969407796859741, + "learning_rate": 3.923443314229732e-05, + "loss": 0.7416, + "step": 1483 + }, + { + "epoch": 0.5419025013693628, + "grad_norm": 0.9816417694091797, + "learning_rate": 3.9230497213322715e-05, + "loss": 0.7499, + "step": 1484 + }, + { + "epoch": 0.542267664779989, + "grad_norm": 0.8559290170669556, + "learning_rate": 3.922655139102895e-05, + "loss": 0.7424, + "step": 1485 + }, + { + "epoch": 0.5426328281906153, + "grad_norm": 0.8443220257759094, + "learning_rate": 3.922259567744602e-05, + "loss": 0.7454, + "step": 1486 + }, + { + "epoch": 0.5429979916012415, + "grad_norm": 1.1401548385620117, + "learning_rate": 3.9218630074608966e-05, + "loss": 0.735, + "step": 1487 + }, + { + "epoch": 0.5433631550118678, + "grad_norm": 0.926574170589447, + "learning_rate": 3.921465458455793e-05, + "loss": 0.7263, + "step": 1488 + }, + { + "epoch": 0.543728318422494, + "grad_norm": 0.9735384583473206, + "learning_rate": 3.9210669209338144e-05, + "loss": 0.735, + "step": 1489 + }, + { + "epoch": 0.5440934818331203, + "grad_norm": 0.8426558375358582, + "learning_rate": 3.920667395099993e-05, + "loss": 0.73, + "step": 1490 + }, + { + "epoch": 0.5444586452437465, + "grad_norm": 0.9400650262832642, + "learning_rate": 3.920266881159869e-05, + "loss": 0.7649, + "step": 1491 + }, + { + "epoch": 0.5448238086543729, + "grad_norm": 0.765070915222168, + "learning_rate": 3.9198653793194896e-05, + "loss": 0.7173, + "step": 1492 + }, + { + "epoch": 0.5451889720649991, + "grad_norm": 0.9452735185623169, + "learning_rate": 3.919462889785412e-05, + "loss": 0.7412, + "step": 1493 + }, + { + "epoch": 0.5455541354756254, + "grad_norm": 1.000473976135254, + "learning_rate": 3.9190594127647005e-05, + "loss": 0.7458, + "step": 1494 + }, + { + "epoch": 0.5459192988862516, + "grad_norm": 0.6995605826377869, + "learning_rate": 3.918654948464928e-05, + "loss": 0.7232, + "step": 1495 + }, + { + "epoch": 0.5462844622968779, + "grad_norm": 0.7358512878417969, + "learning_rate": 3.918249497094176e-05, + "loss": 0.7213, + "step": 1496 + }, + { + "epoch": 0.5466496257075041, + "grad_norm": 0.8587970733642578, + "learning_rate": 3.917843058861032e-05, + "loss": 0.7213, + "step": 1497 + }, + { + "epoch": 0.5470147891181304, + "grad_norm": 1.0094513893127441, + "learning_rate": 3.9174356339745933e-05, + "loss": 0.7267, + "step": 1498 + }, + { + "epoch": 0.5473799525287566, + "grad_norm": 1.1970059871673584, + "learning_rate": 3.917027222644462e-05, + "loss": 0.7336, + "step": 1499 + }, + { + "epoch": 0.5477451159393829, + "grad_norm": 0.9547330737113953, + "learning_rate": 3.9166178250807504e-05, + "loss": 0.7314, + "step": 1500 + }, + { + "epoch": 0.5481102793500091, + "grad_norm": 0.8042092323303223, + "learning_rate": 3.9162074414940764e-05, + "loss": 0.674, + "step": 1501 + }, + { + "epoch": 0.5484754427606354, + "grad_norm": 0.9922061562538147, + "learning_rate": 3.915796072095567e-05, + "loss": 0.7426, + "step": 1502 + }, + { + "epoch": 0.5488406061712616, + "grad_norm": 0.9946082234382629, + "learning_rate": 3.9153837170968544e-05, + "loss": 0.7634, + "step": 1503 + }, + { + "epoch": 0.5492057695818879, + "grad_norm": 0.8678110241889954, + "learning_rate": 3.914970376710079e-05, + "loss": 0.7316, + "step": 1504 + }, + { + "epoch": 0.5495709329925141, + "grad_norm": 0.9004887342453003, + "learning_rate": 3.914556051147887e-05, + "loss": 0.7429, + "step": 1505 + }, + { + "epoch": 0.5499360964031405, + "grad_norm": 0.8457182049751282, + "learning_rate": 3.914140740623434e-05, + "loss": 0.7302, + "step": 1506 + }, + { + "epoch": 0.5503012598137667, + "grad_norm": 0.6601680517196655, + "learning_rate": 3.9137244453503794e-05, + "loss": 0.7152, + "step": 1507 + }, + { + "epoch": 0.550666423224393, + "grad_norm": 0.9470217823982239, + "learning_rate": 3.9133071655428904e-05, + "loss": 0.7292, + "step": 1508 + }, + { + "epoch": 0.5510315866350192, + "grad_norm": 1.0125720500946045, + "learning_rate": 3.9128889014156415e-05, + "loss": 0.7549, + "step": 1509 + }, + { + "epoch": 0.5513967500456455, + "grad_norm": 0.8530757427215576, + "learning_rate": 3.9124696531838114e-05, + "loss": 0.7444, + "step": 1510 + }, + { + "epoch": 0.5517619134562717, + "grad_norm": 0.9094074368476868, + "learning_rate": 3.9120494210630886e-05, + "loss": 0.7416, + "step": 1511 + }, + { + "epoch": 0.552127076866898, + "grad_norm": 0.9099465012550354, + "learning_rate": 3.911628205269663e-05, + "loss": 0.7158, + "step": 1512 + }, + { + "epoch": 0.5524922402775242, + "grad_norm": 0.9260149598121643, + "learning_rate": 3.911206006020235e-05, + "loss": 0.7482, + "step": 1513 + }, + { + "epoch": 0.5528574036881504, + "grad_norm": 1.0228321552276611, + "learning_rate": 3.910782823532009e-05, + "loss": 0.7572, + "step": 1514 + }, + { + "epoch": 0.5532225670987767, + "grad_norm": 0.957773745059967, + "learning_rate": 3.910358658022696e-05, + "loss": 0.7161, + "step": 1515 + }, + { + "epoch": 0.5535877305094029, + "grad_norm": 1.198459506034851, + "learning_rate": 3.909933509710511e-05, + "loss": 0.736, + "step": 1516 + }, + { + "epoch": 0.5539528939200292, + "grad_norm": 1.1073055267333984, + "learning_rate": 3.909507378814175e-05, + "loss": 0.7547, + "step": 1517 + }, + { + "epoch": 0.5543180573306554, + "grad_norm": 0.9325324296951294, + "learning_rate": 3.909080265552918e-05, + "loss": 0.7434, + "step": 1518 + }, + { + "epoch": 0.5546832207412817, + "grad_norm": 0.7317970395088196, + "learning_rate": 3.90865217014647e-05, + "loss": 0.7222, + "step": 1519 + }, + { + "epoch": 0.5550483841519079, + "grad_norm": 1.3255256414413452, + "learning_rate": 3.90822309281507e-05, + "loss": 0.7048, + "step": 1520 + }, + { + "epoch": 0.5554135475625342, + "grad_norm": 0.7833756804466248, + "learning_rate": 3.9077930337794614e-05, + "loss": 0.7237, + "step": 1521 + }, + { + "epoch": 0.5557787109731605, + "grad_norm": 0.8257026672363281, + "learning_rate": 3.907361993260891e-05, + "loss": 0.7181, + "step": 1522 + }, + { + "epoch": 0.5561438743837868, + "grad_norm": 0.9362462162971497, + "learning_rate": 3.906929971481114e-05, + "loss": 0.7233, + "step": 1523 + }, + { + "epoch": 0.556509037794413, + "grad_norm": 1.0759377479553223, + "learning_rate": 3.906496968662386e-05, + "loss": 0.7137, + "step": 1524 + }, + { + "epoch": 0.5568742012050393, + "grad_norm": 0.9503883123397827, + "learning_rate": 3.906062985027471e-05, + "loss": 0.7472, + "step": 1525 + }, + { + "epoch": 0.5572393646156655, + "grad_norm": 0.9840250611305237, + "learning_rate": 3.905628020799636e-05, + "loss": 0.7241, + "step": 1526 + }, + { + "epoch": 0.5576045280262918, + "grad_norm": 0.9575956463813782, + "learning_rate": 3.905192076202652e-05, + "loss": 0.7015, + "step": 1527 + }, + { + "epoch": 0.557969691436918, + "grad_norm": 0.8100535273551941, + "learning_rate": 3.904755151460795e-05, + "loss": 0.7181, + "step": 1528 + }, + { + "epoch": 0.5583348548475443, + "grad_norm": 1.001095175743103, + "learning_rate": 3.9043172467988464e-05, + "loss": 0.7142, + "step": 1529 + }, + { + "epoch": 0.5587000182581705, + "grad_norm": 1.096186637878418, + "learning_rate": 3.9038783624420894e-05, + "loss": 0.749, + "step": 1530 + }, + { + "epoch": 0.5590651816687968, + "grad_norm": 3.8753530979156494, + "learning_rate": 3.9034384986163126e-05, + "loss": 0.7357, + "step": 1531 + }, + { + "epoch": 0.559430345079423, + "grad_norm": 0.928016185760498, + "learning_rate": 3.902997655547809e-05, + "loss": 0.7152, + "step": 1532 + }, + { + "epoch": 0.5597955084900493, + "grad_norm": 0.7244706749916077, + "learning_rate": 3.9025558334633735e-05, + "loss": 0.7244, + "step": 1533 + }, + { + "epoch": 0.5601606719006755, + "grad_norm": 0.8810442686080933, + "learning_rate": 3.9021130325903076e-05, + "loss": 0.6888, + "step": 1534 + }, + { + "epoch": 0.5605258353113018, + "grad_norm": 1.0424643754959106, + "learning_rate": 3.9016692531564125e-05, + "loss": 0.7013, + "step": 1535 + }, + { + "epoch": 0.560890998721928, + "grad_norm": 0.968743085861206, + "learning_rate": 3.901224495389996e-05, + "loss": 0.749, + "step": 1536 + }, + { + "epoch": 0.5612561621325544, + "grad_norm": 0.9253996014595032, + "learning_rate": 3.900778759519868e-05, + "loss": 0.7476, + "step": 1537 + }, + { + "epoch": 0.5616213255431806, + "grad_norm": 0.746687114238739, + "learning_rate": 3.9003320457753425e-05, + "loss": 0.7317, + "step": 1538 + }, + { + "epoch": 0.5619864889538069, + "grad_norm": 1.5913606882095337, + "learning_rate": 3.8998843543862347e-05, + "loss": 0.7412, + "step": 1539 + }, + { + "epoch": 0.5623516523644331, + "grad_norm": 1.0188117027282715, + "learning_rate": 3.899435685582864e-05, + "loss": 0.7125, + "step": 1540 + }, + { + "epoch": 0.5627168157750594, + "grad_norm": 1.0400840044021606, + "learning_rate": 3.898986039596052e-05, + "loss": 0.7058, + "step": 1541 + }, + { + "epoch": 0.5630819791856856, + "grad_norm": 0.8724197745323181, + "learning_rate": 3.898535416657125e-05, + "loss": 0.7518, + "step": 1542 + }, + { + "epoch": 0.5634471425963119, + "grad_norm": 0.9361122846603394, + "learning_rate": 3.89808381699791e-05, + "loss": 0.738, + "step": 1543 + }, + { + "epoch": 0.5638123060069381, + "grad_norm": 0.8224091529846191, + "learning_rate": 3.8976312408507356e-05, + "loss": 0.7749, + "step": 1544 + }, + { + "epoch": 0.5641774694175644, + "grad_norm": 0.7766799926757812, + "learning_rate": 3.897177688448435e-05, + "loss": 0.7444, + "step": 1545 + }, + { + "epoch": 0.5645426328281906, + "grad_norm": 2.588979959487915, + "learning_rate": 3.8967231600243434e-05, + "loss": 0.7341, + "step": 1546 + }, + { + "epoch": 0.5649077962388168, + "grad_norm": 0.7987133860588074, + "learning_rate": 3.8962676558122965e-05, + "loss": 0.7125, + "step": 1547 + }, + { + "epoch": 0.5652729596494431, + "grad_norm": 0.8955987095832825, + "learning_rate": 3.895811176046633e-05, + "loss": 0.7365, + "step": 1548 + }, + { + "epoch": 0.5656381230600693, + "grad_norm": 0.7170660495758057, + "learning_rate": 3.895353720962193e-05, + "loss": 0.7212, + "step": 1549 + }, + { + "epoch": 0.5660032864706956, + "grad_norm": 1.0151087045669556, + "learning_rate": 3.8948952907943206e-05, + "loss": 0.7556, + "step": 1550 + }, + { + "epoch": 0.5663684498813218, + "grad_norm": 1.3191031217575073, + "learning_rate": 3.8944358857788576e-05, + "loss": 0.77, + "step": 1551 + }, + { + "epoch": 0.5667336132919482, + "grad_norm": 1.0472934246063232, + "learning_rate": 3.893975506152151e-05, + "loss": 0.7274, + "step": 1552 + }, + { + "epoch": 0.5670987767025744, + "grad_norm": 0.9767879247665405, + "learning_rate": 3.8935141521510466e-05, + "loss": 0.7597, + "step": 1553 + }, + { + "epoch": 0.5674639401132007, + "grad_norm": 0.8284198045730591, + "learning_rate": 3.8930518240128926e-05, + "loss": 0.7382, + "step": 1554 + }, + { + "epoch": 0.5678291035238269, + "grad_norm": 1.0305380821228027, + "learning_rate": 3.892588521975539e-05, + "loss": 0.75, + "step": 1555 + }, + { + "epoch": 0.5681942669344532, + "grad_norm": 1.0287115573883057, + "learning_rate": 3.892124246277336e-05, + "loss": 0.7576, + "step": 1556 + }, + { + "epoch": 0.5685594303450794, + "grad_norm": 0.7308568358421326, + "learning_rate": 3.891658997157134e-05, + "loss": 0.7251, + "step": 1557 + }, + { + "epoch": 0.5689245937557057, + "grad_norm": 0.9927620887756348, + "learning_rate": 3.891192774854285e-05, + "loss": 0.724, + "step": 1558 + }, + { + "epoch": 0.5692897571663319, + "grad_norm": 1.1937997341156006, + "learning_rate": 3.890725579608643e-05, + "loss": 0.7468, + "step": 1559 + }, + { + "epoch": 0.5696549205769582, + "grad_norm": 0.9900869727134705, + "learning_rate": 3.89025741166056e-05, + "loss": 0.7531, + "step": 1560 + }, + { + "epoch": 0.5700200839875844, + "grad_norm": 0.9900295734405518, + "learning_rate": 3.8897882712508906e-05, + "loss": 0.7347, + "step": 1561 + }, + { + "epoch": 0.5703852473982107, + "grad_norm": 0.9560722708702087, + "learning_rate": 3.8893181586209883e-05, + "loss": 0.759, + "step": 1562 + }, + { + "epoch": 0.5707504108088369, + "grad_norm": 0.7202712297439575, + "learning_rate": 3.888847074012706e-05, + "loss": 0.7329, + "step": 1563 + }, + { + "epoch": 0.5711155742194632, + "grad_norm": 1.0042426586151123, + "learning_rate": 3.8883750176684e-05, + "loss": 0.7676, + "step": 1564 + }, + { + "epoch": 0.5714807376300894, + "grad_norm": 1.1066584587097168, + "learning_rate": 3.8879019898309215e-05, + "loss": 0.7616, + "step": 1565 + }, + { + "epoch": 0.5718459010407158, + "grad_norm": 1.1834222078323364, + "learning_rate": 3.8874279907436274e-05, + "loss": 0.7126, + "step": 1566 + }, + { + "epoch": 0.572211064451342, + "grad_norm": 0.8603668808937073, + "learning_rate": 3.886953020650369e-05, + "loss": 0.7305, + "step": 1567 + }, + { + "epoch": 0.5725762278619683, + "grad_norm": 0.7139928340911865, + "learning_rate": 3.8864770797955e-05, + "loss": 0.7516, + "step": 1568 + }, + { + "epoch": 0.5729413912725945, + "grad_norm": 0.8934731483459473, + "learning_rate": 3.8860001684238744e-05, + "loss": 0.7286, + "step": 1569 + }, + { + "epoch": 0.5733065546832208, + "grad_norm": 0.8853731155395508, + "learning_rate": 3.885522286780842e-05, + "loss": 0.7301, + "step": 1570 + }, + { + "epoch": 0.573671718093847, + "grad_norm": 0.9740007519721985, + "learning_rate": 3.8850434351122536e-05, + "loss": 0.7448, + "step": 1571 + }, + { + "epoch": 0.5740368815044733, + "grad_norm": 1.2016581296920776, + "learning_rate": 3.884563613664461e-05, + "loss": 0.7446, + "step": 1572 + }, + { + "epoch": 0.5744020449150995, + "grad_norm": 1.046797513961792, + "learning_rate": 3.8840828226843113e-05, + "loss": 0.7207, + "step": 1573 + }, + { + "epoch": 0.5747672083257258, + "grad_norm": 0.9799802899360657, + "learning_rate": 3.8836010624191535e-05, + "loss": 0.734, + "step": 1574 + }, + { + "epoch": 0.575132371736352, + "grad_norm": 1.2142255306243896, + "learning_rate": 3.883118333116833e-05, + "loss": 0.7166, + "step": 1575 + }, + { + "epoch": 0.5754975351469783, + "grad_norm": 1.001091480255127, + "learning_rate": 3.8826346350256943e-05, + "loss": 0.7248, + "step": 1576 + }, + { + "epoch": 0.5758626985576045, + "grad_norm": 1.177527666091919, + "learning_rate": 3.882149968394582e-05, + "loss": 0.7349, + "step": 1577 + }, + { + "epoch": 0.5762278619682308, + "grad_norm": 1.2621814012527466, + "learning_rate": 3.881664333472837e-05, + "loss": 0.7143, + "step": 1578 + }, + { + "epoch": 0.576593025378857, + "grad_norm": 0.8858799338340759, + "learning_rate": 3.8811777305102986e-05, + "loss": 0.7115, + "step": 1579 + }, + { + "epoch": 0.5769581887894832, + "grad_norm": 0.8787485361099243, + "learning_rate": 3.880690159757305e-05, + "loss": 0.7211, + "step": 1580 + }, + { + "epoch": 0.5773233522001096, + "grad_norm": 0.9342215061187744, + "learning_rate": 3.880201621464691e-05, + "loss": 0.7169, + "step": 1581 + }, + { + "epoch": 0.5776885156107358, + "grad_norm": 1.0358704328536987, + "learning_rate": 3.8797121158837914e-05, + "loss": 0.7211, + "step": 1582 + }, + { + "epoch": 0.5780536790213621, + "grad_norm": 1.1126457452774048, + "learning_rate": 3.8792216432664356e-05, + "loss": 0.7659, + "step": 1583 + }, + { + "epoch": 0.5784188424319883, + "grad_norm": 0.8449137210845947, + "learning_rate": 3.878730203864954e-05, + "loss": 0.7141, + "step": 1584 + }, + { + "epoch": 0.5787840058426146, + "grad_norm": 0.9435628056526184, + "learning_rate": 3.87823779793217e-05, + "loss": 0.7293, + "step": 1585 + }, + { + "epoch": 0.5791491692532408, + "grad_norm": 1.09213125705719, + "learning_rate": 3.877744425721408e-05, + "loss": 0.7275, + "step": 1586 + }, + { + "epoch": 0.5795143326638671, + "grad_norm": 1.1995439529418945, + "learning_rate": 3.8772500874864886e-05, + "loss": 0.739, + "step": 1587 + }, + { + "epoch": 0.5798794960744933, + "grad_norm": 0.8401466012001038, + "learning_rate": 3.876754783481729e-05, + "loss": 0.7004, + "step": 1588 + }, + { + "epoch": 0.5802446594851196, + "grad_norm": 1.449797511100769, + "learning_rate": 3.8762585139619415e-05, + "loss": 0.7337, + "step": 1589 + }, + { + "epoch": 0.5806098228957458, + "grad_norm": 1.1313889026641846, + "learning_rate": 3.875761279182439e-05, + "loss": 0.6908, + "step": 1590 + }, + { + "epoch": 0.5809749863063721, + "grad_norm": 0.8749697804450989, + "learning_rate": 3.875263079399028e-05, + "loss": 0.749, + "step": 1591 + }, + { + "epoch": 0.5813401497169983, + "grad_norm": 0.8486689925193787, + "learning_rate": 3.874763914868013e-05, + "loss": 0.7147, + "step": 1592 + }, + { + "epoch": 0.5817053131276246, + "grad_norm": 1.2051169872283936, + "learning_rate": 3.874263785846192e-05, + "loss": 0.7404, + "step": 1593 + }, + { + "epoch": 0.5820704765382508, + "grad_norm": 1.1990227699279785, + "learning_rate": 3.873762692590863e-05, + "loss": 0.7632, + "step": 1594 + }, + { + "epoch": 0.5824356399488771, + "grad_norm": 0.8588769435882568, + "learning_rate": 3.8732606353598185e-05, + "loss": 0.7407, + "step": 1595 + }, + { + "epoch": 0.5828008033595033, + "grad_norm": 0.9404013752937317, + "learning_rate": 3.872757614411346e-05, + "loss": 0.729, + "step": 1596 + }, + { + "epoch": 0.5831659667701297, + "grad_norm": 1.1304678916931152, + "learning_rate": 3.8722536300042305e-05, + "loss": 0.781, + "step": 1597 + }, + { + "epoch": 0.5835311301807559, + "grad_norm": 1.142740249633789, + "learning_rate": 3.871748682397751e-05, + "loss": 0.7141, + "step": 1598 + }, + { + "epoch": 0.5838962935913822, + "grad_norm": 1.2788124084472656, + "learning_rate": 3.871242771851683e-05, + "loss": 0.7212, + "step": 1599 + }, + { + "epoch": 0.5842614570020084, + "grad_norm": 0.8648350238800049, + "learning_rate": 3.870735898626297e-05, + "loss": 0.7303, + "step": 1600 + }, + { + "epoch": 0.5846266204126347, + "grad_norm": 0.9944093227386475, + "learning_rate": 3.8702280629823595e-05, + "loss": 0.7536, + "step": 1601 + }, + { + "epoch": 0.5849917838232609, + "grad_norm": 0.7696415185928345, + "learning_rate": 3.8697192651811305e-05, + "loss": 0.7068, + "step": 1602 + }, + { + "epoch": 0.5853569472338872, + "grad_norm": 1.4209320545196533, + "learning_rate": 3.869209505484367e-05, + "loss": 0.7426, + "step": 1603 + }, + { + "epoch": 0.5857221106445134, + "grad_norm": 0.8690351843833923, + "learning_rate": 3.86869878415432e-05, + "loss": 0.7363, + "step": 1604 + }, + { + "epoch": 0.5860872740551397, + "grad_norm": 1.0473322868347168, + "learning_rate": 3.868187101453734e-05, + "loss": 0.7094, + "step": 1605 + }, + { + "epoch": 0.5864524374657659, + "grad_norm": 1.0611331462860107, + "learning_rate": 3.867674457645851e-05, + "loss": 0.7642, + "step": 1606 + }, + { + "epoch": 0.5868176008763922, + "grad_norm": 0.8476665616035461, + "learning_rate": 3.8671608529944035e-05, + "loss": 0.7202, + "step": 1607 + }, + { + "epoch": 0.5871827642870184, + "grad_norm": 1.0705032348632812, + "learning_rate": 3.866646287763622e-05, + "loss": 0.7548, + "step": 1608 + }, + { + "epoch": 0.5875479276976447, + "grad_norm": 0.7043454051017761, + "learning_rate": 3.86613076221823e-05, + "loss": 0.6984, + "step": 1609 + }, + { + "epoch": 0.587913091108271, + "grad_norm": 1.0177797079086304, + "learning_rate": 3.865614276623443e-05, + "loss": 0.7512, + "step": 1610 + }, + { + "epoch": 0.5882782545188973, + "grad_norm": 1.0479447841644287, + "learning_rate": 3.8650968312449745e-05, + "loss": 0.723, + "step": 1611 + }, + { + "epoch": 0.5886434179295235, + "grad_norm": 0.7406904101371765, + "learning_rate": 3.864578426349027e-05, + "loss": 0.7261, + "step": 1612 + }, + { + "epoch": 0.5890085813401497, + "grad_norm": 1.4430965185165405, + "learning_rate": 3.8640590622023005e-05, + "loss": 0.7601, + "step": 1613 + }, + { + "epoch": 0.589373744750776, + "grad_norm": 0.7748079299926758, + "learning_rate": 3.863538739071986e-05, + "loss": 0.7094, + "step": 1614 + }, + { + "epoch": 0.5897389081614022, + "grad_norm": 2.9764978885650635, + "learning_rate": 3.86301745722577e-05, + "loss": 0.7087, + "step": 1615 + }, + { + "epoch": 0.5901040715720285, + "grad_norm": 0.768072783946991, + "learning_rate": 3.86249521693183e-05, + "loss": 0.7376, + "step": 1616 + }, + { + "epoch": 0.5904692349826547, + "grad_norm": 0.743763267993927, + "learning_rate": 3.861972018458838e-05, + "loss": 0.713, + "step": 1617 + }, + { + "epoch": 0.590834398393281, + "grad_norm": 0.926245927810669, + "learning_rate": 3.861447862075959e-05, + "loss": 0.6924, + "step": 1618 + }, + { + "epoch": 0.5911995618039072, + "grad_norm": 1.0158087015151978, + "learning_rate": 3.86092274805285e-05, + "loss": 0.7286, + "step": 1619 + }, + { + "epoch": 0.5915647252145335, + "grad_norm": 0.8567479848861694, + "learning_rate": 3.8603966766596624e-05, + "loss": 0.6996, + "step": 1620 + }, + { + "epoch": 0.5919298886251597, + "grad_norm": 0.9987544417381287, + "learning_rate": 3.8598696481670364e-05, + "loss": 0.7487, + "step": 1621 + }, + { + "epoch": 0.592295052035786, + "grad_norm": 0.9453714489936829, + "learning_rate": 3.859341662846109e-05, + "loss": 0.6998, + "step": 1622 + }, + { + "epoch": 0.5926602154464122, + "grad_norm": 0.8023524880409241, + "learning_rate": 3.858812720968507e-05, + "loss": 0.7198, + "step": 1623 + }, + { + "epoch": 0.5930253788570385, + "grad_norm": 1.01613450050354, + "learning_rate": 3.858282822806349e-05, + "loss": 0.7234, + "step": 1624 + }, + { + "epoch": 0.5933905422676647, + "grad_norm": 1.1630140542984009, + "learning_rate": 3.857751968632247e-05, + "loss": 0.7289, + "step": 1625 + }, + { + "epoch": 0.5937557056782911, + "grad_norm": 1.1454566717147827, + "learning_rate": 3.857220158719305e-05, + "loss": 0.7469, + "step": 1626 + }, + { + "epoch": 0.5941208690889173, + "grad_norm": 1.7499412298202515, + "learning_rate": 3.8566873933411156e-05, + "loss": 0.7163, + "step": 1627 + }, + { + "epoch": 0.5944860324995436, + "grad_norm": 1.0432333946228027, + "learning_rate": 3.856153672771767e-05, + "loss": 0.7432, + "step": 1628 + }, + { + "epoch": 0.5948511959101698, + "grad_norm": 0.8729361891746521, + "learning_rate": 3.855618997285837e-05, + "loss": 0.7484, + "step": 1629 + }, + { + "epoch": 0.5952163593207961, + "grad_norm": 0.9249089956283569, + "learning_rate": 3.855083367158394e-05, + "loss": 0.7289, + "step": 1630 + }, + { + "epoch": 0.5955815227314223, + "grad_norm": 1.153282642364502, + "learning_rate": 3.854546782664998e-05, + "loss": 0.7185, + "step": 1631 + }, + { + "epoch": 0.5959466861420486, + "grad_norm": 1.1534794569015503, + "learning_rate": 3.854009244081701e-05, + "loss": 0.7081, + "step": 1632 + }, + { + "epoch": 0.5963118495526748, + "grad_norm": 0.6747300028800964, + "learning_rate": 3.8534707516850446e-05, + "loss": 0.7314, + "step": 1633 + }, + { + "epoch": 0.5966770129633011, + "grad_norm": 1.1119229793548584, + "learning_rate": 3.852931305752062e-05, + "loss": 0.7388, + "step": 1634 + }, + { + "epoch": 0.5970421763739273, + "grad_norm": 1.1446058750152588, + "learning_rate": 3.852390906560276e-05, + "loss": 0.7186, + "step": 1635 + }, + { + "epoch": 0.5974073397845536, + "grad_norm": 1.0130422115325928, + "learning_rate": 3.8518495543877e-05, + "loss": 0.7465, + "step": 1636 + }, + { + "epoch": 0.5977725031951798, + "grad_norm": 1.2683770656585693, + "learning_rate": 3.8513072495128385e-05, + "loss": 0.7214, + "step": 1637 + }, + { + "epoch": 0.5981376666058061, + "grad_norm": 0.9249864816665649, + "learning_rate": 3.850763992214686e-05, + "loss": 0.7571, + "step": 1638 + }, + { + "epoch": 0.5985028300164323, + "grad_norm": 0.9933344721794128, + "learning_rate": 3.8502197827727254e-05, + "loss": 0.738, + "step": 1639 + }, + { + "epoch": 0.5988679934270587, + "grad_norm": 1.3973751068115234, + "learning_rate": 3.849674621466931e-05, + "loss": 0.7045, + "step": 1640 + }, + { + "epoch": 0.5992331568376849, + "grad_norm": 1.2577351331710815, + "learning_rate": 3.849128508577767e-05, + "loss": 0.7463, + "step": 1641 + }, + { + "epoch": 0.5995983202483112, + "grad_norm": 1.1957579851150513, + "learning_rate": 3.848581444386187e-05, + "loss": 0.7256, + "step": 1642 + }, + { + "epoch": 0.5999634836589374, + "grad_norm": 1.1578030586242676, + "learning_rate": 3.848033429173632e-05, + "loss": 0.7491, + "step": 1643 + }, + { + "epoch": 0.6003286470695637, + "grad_norm": 0.8890886306762695, + "learning_rate": 3.847484463222035e-05, + "loss": 0.6758, + "step": 1644 + }, + { + "epoch": 0.6006938104801899, + "grad_norm": 0.6987606883049011, + "learning_rate": 3.846934546813816e-05, + "loss": 0.7345, + "step": 1645 + }, + { + "epoch": 0.6010589738908162, + "grad_norm": 0.7313883304595947, + "learning_rate": 3.8463836802318865e-05, + "loss": 0.7071, + "step": 1646 + }, + { + "epoch": 0.6014241373014424, + "grad_norm": 1.2004591226577759, + "learning_rate": 3.8458318637596434e-05, + "loss": 0.7462, + "step": 1647 + }, + { + "epoch": 0.6017893007120686, + "grad_norm": 1.004254698753357, + "learning_rate": 3.845279097680975e-05, + "loss": 0.7172, + "step": 1648 + }, + { + "epoch": 0.6021544641226949, + "grad_norm": 1.0235750675201416, + "learning_rate": 3.844725382280258e-05, + "loss": 0.743, + "step": 1649 + }, + { + "epoch": 0.6025196275333211, + "grad_norm": 0.9419151544570923, + "learning_rate": 3.8441707178423554e-05, + "loss": 0.7057, + "step": 1650 + }, + { + "epoch": 0.6028847909439474, + "grad_norm": 0.799010157585144, + "learning_rate": 3.843615104652621e-05, + "loss": 0.7042, + "step": 1651 + }, + { + "epoch": 0.6032499543545736, + "grad_norm": 1.102571964263916, + "learning_rate": 3.843058542996895e-05, + "loss": 0.7648, + "step": 1652 + }, + { + "epoch": 0.6036151177651999, + "grad_norm": 0.751781165599823, + "learning_rate": 3.842501033161505e-05, + "loss": 0.7295, + "step": 1653 + }, + { + "epoch": 0.6039802811758261, + "grad_norm": 0.8896080851554871, + "learning_rate": 3.8419425754332694e-05, + "loss": 0.7025, + "step": 1654 + }, + { + "epoch": 0.6043454445864525, + "grad_norm": 1.2246979475021362, + "learning_rate": 3.8413831700994905e-05, + "loss": 0.7541, + "step": 1655 + }, + { + "epoch": 0.6047106079970787, + "grad_norm": 1.4045801162719727, + "learning_rate": 3.840822817447961e-05, + "loss": 0.7764, + "step": 1656 + }, + { + "epoch": 0.605075771407705, + "grad_norm": 1.0411593914031982, + "learning_rate": 3.8402615177669604e-05, + "loss": 0.7406, + "step": 1657 + }, + { + "epoch": 0.6054409348183312, + "grad_norm": 1.3511064052581787, + "learning_rate": 3.839699271345253e-05, + "loss": 0.7415, + "step": 1658 + }, + { + "epoch": 0.6058060982289575, + "grad_norm": 0.8141111135482788, + "learning_rate": 3.839136078472093e-05, + "loss": 0.7292, + "step": 1659 + }, + { + "epoch": 0.6061712616395837, + "grad_norm": 0.959466278553009, + "learning_rate": 3.838571939437221e-05, + "loss": 0.7207, + "step": 1660 + }, + { + "epoch": 0.60653642505021, + "grad_norm": 1.0917295217514038, + "learning_rate": 3.838006854530863e-05, + "loss": 0.7007, + "step": 1661 + }, + { + "epoch": 0.6069015884608362, + "grad_norm": 0.9539211392402649, + "learning_rate": 3.837440824043734e-05, + "loss": 0.7369, + "step": 1662 + }, + { + "epoch": 0.6072667518714625, + "grad_norm": 0.9652641415596008, + "learning_rate": 3.8368738482670315e-05, + "loss": 0.7266, + "step": 1663 + }, + { + "epoch": 0.6076319152820887, + "grad_norm": 1.208406686782837, + "learning_rate": 3.8363059274924445e-05, + "loss": 0.7199, + "step": 1664 + }, + { + "epoch": 0.607997078692715, + "grad_norm": 1.1405893564224243, + "learning_rate": 3.835737062012143e-05, + "loss": 0.7505, + "step": 1665 + }, + { + "epoch": 0.6083622421033412, + "grad_norm": 1.0682623386383057, + "learning_rate": 3.8351672521187874e-05, + "loss": 0.7272, + "step": 1666 + }, + { + "epoch": 0.6087274055139675, + "grad_norm": 0.9367862343788147, + "learning_rate": 3.834596498105521e-05, + "loss": 0.7195, + "step": 1667 + }, + { + "epoch": 0.6090925689245937, + "grad_norm": 0.6378055214881897, + "learning_rate": 3.8340248002659745e-05, + "loss": 0.7269, + "step": 1668 + }, + { + "epoch": 0.60945773233522, + "grad_norm": 1.028985619544983, + "learning_rate": 3.8334521588942626e-05, + "loss": 0.7161, + "step": 1669 + }, + { + "epoch": 0.6098228957458462, + "grad_norm": 1.2288942337036133, + "learning_rate": 3.832878574284988e-05, + "loss": 0.7233, + "step": 1670 + }, + { + "epoch": 0.6101880591564726, + "grad_norm": 0.8505779504776001, + "learning_rate": 3.8323040467332344e-05, + "loss": 0.7125, + "step": 1671 + }, + { + "epoch": 0.6105532225670988, + "grad_norm": 0.9262564778327942, + "learning_rate": 3.8317285765345746e-05, + "loss": 0.7317, + "step": 1672 + }, + { + "epoch": 0.6109183859777251, + "grad_norm": 0.9521602392196655, + "learning_rate": 3.831152163985065e-05, + "loss": 0.7239, + "step": 1673 + }, + { + "epoch": 0.6112835493883513, + "grad_norm": 0.8419995903968811, + "learning_rate": 3.830574809381247e-05, + "loss": 0.6981, + "step": 1674 + }, + { + "epoch": 0.6116487127989776, + "grad_norm": 0.906650185585022, + "learning_rate": 3.829996513020146e-05, + "loss": 0.7193, + "step": 1675 + }, + { + "epoch": 0.6120138762096038, + "grad_norm": 1.6719812154769897, + "learning_rate": 3.829417275199272e-05, + "loss": 0.7359, + "step": 1676 + }, + { + "epoch": 0.6123790396202301, + "grad_norm": 0.9106917977333069, + "learning_rate": 3.8288370962166194e-05, + "loss": 0.741, + "step": 1677 + }, + { + "epoch": 0.6127442030308563, + "grad_norm": 0.7947406768798828, + "learning_rate": 3.828255976370668e-05, + "loss": 0.7325, + "step": 1678 + }, + { + "epoch": 0.6131093664414826, + "grad_norm": 1.6704083681106567, + "learning_rate": 3.8276739159603795e-05, + "loss": 0.7078, + "step": 1679 + }, + { + "epoch": 0.6134745298521088, + "grad_norm": 1.1100096702575684, + "learning_rate": 3.827090915285202e-05, + "loss": 0.7399, + "step": 1680 + }, + { + "epoch": 0.613839693262735, + "grad_norm": 0.9204789996147156, + "learning_rate": 3.826506974645065e-05, + "loss": 0.7225, + "step": 1681 + }, + { + "epoch": 0.6142048566733613, + "grad_norm": 0.7737852334976196, + "learning_rate": 3.8259220943403825e-05, + "loss": 0.7071, + "step": 1682 + }, + { + "epoch": 0.6145700200839875, + "grad_norm": 1.0756714344024658, + "learning_rate": 3.825336274672053e-05, + "loss": 0.7123, + "step": 1683 + }, + { + "epoch": 0.6149351834946138, + "grad_norm": 1.1082093715667725, + "learning_rate": 3.824749515941455e-05, + "loss": 0.7314, + "step": 1684 + }, + { + "epoch": 0.61530034690524, + "grad_norm": 0.9082583785057068, + "learning_rate": 3.824161818450454e-05, + "loss": 0.7487, + "step": 1685 + }, + { + "epoch": 0.6156655103158664, + "grad_norm": 0.7466766834259033, + "learning_rate": 3.823573182501397e-05, + "loss": 0.7064, + "step": 1686 + }, + { + "epoch": 0.6160306737264926, + "grad_norm": 0.8748775124549866, + "learning_rate": 3.822983608397113e-05, + "loss": 0.7094, + "step": 1687 + }, + { + "epoch": 0.6163958371371189, + "grad_norm": 0.8320901989936829, + "learning_rate": 3.8223930964409136e-05, + "loss": 0.7298, + "step": 1688 + }, + { + "epoch": 0.6167610005477451, + "grad_norm": 0.9852972626686096, + "learning_rate": 3.821801646936595e-05, + "loss": 0.7457, + "step": 1689 + }, + { + "epoch": 0.6171261639583714, + "grad_norm": 0.7243798971176147, + "learning_rate": 3.821209260188433e-05, + "loss": 0.7272, + "step": 1690 + }, + { + "epoch": 0.6174913273689976, + "grad_norm": 0.8015673756599426, + "learning_rate": 3.8206159365011875e-05, + "loss": 0.7139, + "step": 1691 + }, + { + "epoch": 0.6178564907796239, + "grad_norm": 0.7652367949485779, + "learning_rate": 3.8200216761800986e-05, + "loss": 0.7321, + "step": 1692 + }, + { + "epoch": 0.6182216541902501, + "grad_norm": 1.6092801094055176, + "learning_rate": 3.819426479530891e-05, + "loss": 0.7166, + "step": 1693 + }, + { + "epoch": 0.6185868176008764, + "grad_norm": 1.0409109592437744, + "learning_rate": 3.8188303468597684e-05, + "loss": 0.7145, + "step": 1694 + }, + { + "epoch": 0.6189519810115026, + "grad_norm": 1.179543137550354, + "learning_rate": 3.818233278473417e-05, + "loss": 0.7096, + "step": 1695 + }, + { + "epoch": 0.6193171444221289, + "grad_norm": 0.6408907175064087, + "learning_rate": 3.817635274679006e-05, + "loss": 0.7106, + "step": 1696 + }, + { + "epoch": 0.6196823078327551, + "grad_norm": 0.9521438479423523, + "learning_rate": 3.817036335784183e-05, + "loss": 0.7045, + "step": 1697 + }, + { + "epoch": 0.6200474712433814, + "grad_norm": 0.9148648977279663, + "learning_rate": 3.816436462097079e-05, + "loss": 0.7177, + "step": 1698 + }, + { + "epoch": 0.6204126346540076, + "grad_norm": 0.8411362767219543, + "learning_rate": 3.815835653926303e-05, + "loss": 0.7051, + "step": 1699 + }, + { + "epoch": 0.620777798064634, + "grad_norm": 0.799384355545044, + "learning_rate": 3.8152339115809486e-05, + "loss": 0.6688, + "step": 1700 + }, + { + "epoch": 0.6211429614752602, + "grad_norm": 0.9203513264656067, + "learning_rate": 3.814631235370587e-05, + "loss": 0.7238, + "step": 1701 + }, + { + "epoch": 0.6215081248858865, + "grad_norm": 0.7966442108154297, + "learning_rate": 3.814027625605272e-05, + "loss": 0.6946, + "step": 1702 + }, + { + "epoch": 0.6218732882965127, + "grad_norm": 0.865376353263855, + "learning_rate": 3.8134230825955366e-05, + "loss": 0.7072, + "step": 1703 + }, + { + "epoch": 0.622238451707139, + "grad_norm": 0.8998957276344299, + "learning_rate": 3.812817606652392e-05, + "loss": 0.7385, + "step": 1704 + }, + { + "epoch": 0.6226036151177652, + "grad_norm": 0.8941604495048523, + "learning_rate": 3.812211198087333e-05, + "loss": 0.7095, + "step": 1705 + }, + { + "epoch": 0.6229687785283915, + "grad_norm": 0.8669780492782593, + "learning_rate": 3.8116038572123325e-05, + "loss": 0.7166, + "step": 1706 + }, + { + "epoch": 0.6233339419390177, + "grad_norm": 1.0933927297592163, + "learning_rate": 3.810995584339843e-05, + "loss": 0.744, + "step": 1707 + }, + { + "epoch": 0.623699105349644, + "grad_norm": 0.6955661773681641, + "learning_rate": 3.8103863797827955e-05, + "loss": 0.7256, + "step": 1708 + }, + { + "epoch": 0.6240642687602702, + "grad_norm": 1.25681471824646, + "learning_rate": 3.809776243854602e-05, + "loss": 0.7371, + "step": 1709 + }, + { + "epoch": 0.6244294321708965, + "grad_norm": 1.0992910861968994, + "learning_rate": 3.8091651768691526e-05, + "loss": 0.6931, + "step": 1710 + }, + { + "epoch": 0.6247945955815227, + "grad_norm": 1.6440374851226807, + "learning_rate": 3.808553179140817e-05, + "loss": 0.704, + "step": 1711 + }, + { + "epoch": 0.625159758992149, + "grad_norm": 1.3347744941711426, + "learning_rate": 3.807940250984444e-05, + "loss": 0.6899, + "step": 1712 + }, + { + "epoch": 0.6255249224027752, + "grad_norm": 1.079863429069519, + "learning_rate": 3.807326392715359e-05, + "loss": 0.7344, + "step": 1713 + }, + { + "epoch": 0.6258900858134014, + "grad_norm": 0.8595447540283203, + "learning_rate": 3.806711604649369e-05, + "loss": 0.7137, + "step": 1714 + }, + { + "epoch": 0.6262552492240278, + "grad_norm": 0.827134907245636, + "learning_rate": 3.806095887102757e-05, + "loss": 0.6954, + "step": 1715 + }, + { + "epoch": 0.626620412634654, + "grad_norm": 1.483556866645813, + "learning_rate": 3.805479240392286e-05, + "loss": 0.7368, + "step": 1716 + }, + { + "epoch": 0.6269855760452803, + "grad_norm": 1.2429213523864746, + "learning_rate": 3.804861664835195e-05, + "loss": 0.7454, + "step": 1717 + }, + { + "epoch": 0.6273507394559065, + "grad_norm": 0.8351743817329407, + "learning_rate": 3.8042431607492015e-05, + "loss": 0.7059, + "step": 1718 + }, + { + "epoch": 0.6277159028665328, + "grad_norm": 0.8552349209785461, + "learning_rate": 3.8036237284525016e-05, + "loss": 0.7266, + "step": 1719 + }, + { + "epoch": 0.628081066277159, + "grad_norm": 1.1395808458328247, + "learning_rate": 3.8030033682637686e-05, + "loss": 0.7244, + "step": 1720 + }, + { + "epoch": 0.6284462296877853, + "grad_norm": 1.2721116542816162, + "learning_rate": 3.8023820805021524e-05, + "loss": 0.722, + "step": 1721 + }, + { + "epoch": 0.6288113930984115, + "grad_norm": 0.9128320813179016, + "learning_rate": 3.801759865487281e-05, + "loss": 0.7298, + "step": 1722 + }, + { + "epoch": 0.6291765565090378, + "grad_norm": 1.0605303049087524, + "learning_rate": 3.801136723539259e-05, + "loss": 0.7627, + "step": 1723 + }, + { + "epoch": 0.629541719919664, + "grad_norm": 0.8743678331375122, + "learning_rate": 3.8005126549786674e-05, + "loss": 0.7209, + "step": 1724 + }, + { + "epoch": 0.6299068833302903, + "grad_norm": 0.8217443227767944, + "learning_rate": 3.7998876601265654e-05, + "loss": 0.7207, + "step": 1725 + }, + { + "epoch": 0.6302720467409165, + "grad_norm": 1.1639951467514038, + "learning_rate": 3.799261739304487e-05, + "loss": 0.683, + "step": 1726 + }, + { + "epoch": 0.6306372101515428, + "grad_norm": 0.8091967105865479, + "learning_rate": 3.798634892834444e-05, + "loss": 0.718, + "step": 1727 + }, + { + "epoch": 0.631002373562169, + "grad_norm": 0.9289019703865051, + "learning_rate": 3.798007121038923e-05, + "loss": 0.7153, + "step": 1728 + }, + { + "epoch": 0.6313675369727954, + "grad_norm": 1.0279673337936401, + "learning_rate": 3.797378424240888e-05, + "loss": 0.7405, + "step": 1729 + }, + { + "epoch": 0.6317327003834216, + "grad_norm": 0.9682115912437439, + "learning_rate": 3.7967488027637776e-05, + "loss": 0.7032, + "step": 1730 + }, + { + "epoch": 0.6320978637940479, + "grad_norm": 1.1122698783874512, + "learning_rate": 3.796118256931507e-05, + "loss": 0.7122, + "step": 1731 + }, + { + "epoch": 0.6324630272046741, + "grad_norm": 1.057283639907837, + "learning_rate": 3.7954867870684677e-05, + "loss": 0.7413, + "step": 1732 + }, + { + "epoch": 0.6328281906153004, + "grad_norm": 0.8642354011535645, + "learning_rate": 3.794854393499525e-05, + "loss": 0.6952, + "step": 1733 + }, + { + "epoch": 0.6331933540259266, + "grad_norm": 0.9934948086738586, + "learning_rate": 3.7942210765500197e-05, + "loss": 0.7555, + "step": 1734 + }, + { + "epoch": 0.6335585174365529, + "grad_norm": 0.855786144733429, + "learning_rate": 3.7935868365457674e-05, + "loss": 0.7121, + "step": 1735 + }, + { + "epoch": 0.6339236808471791, + "grad_norm": 1.0481681823730469, + "learning_rate": 3.7929516738130606e-05, + "loss": 0.6837, + "step": 1736 + }, + { + "epoch": 0.6342888442578054, + "grad_norm": 0.8370383381843567, + "learning_rate": 3.7923155886786636e-05, + "loss": 0.7074, + "step": 1737 + }, + { + "epoch": 0.6346540076684316, + "grad_norm": 0.963932454586029, + "learning_rate": 3.791678581469818e-05, + "loss": 0.7545, + "step": 1738 + }, + { + "epoch": 0.6350191710790579, + "grad_norm": 0.819789707660675, + "learning_rate": 3.7910406525142374e-05, + "loss": 0.7273, + "step": 1739 + }, + { + "epoch": 0.6353843344896841, + "grad_norm": 0.7305337190628052, + "learning_rate": 3.790401802140111e-05, + "loss": 0.7001, + "step": 1740 + }, + { + "epoch": 0.6357494979003104, + "grad_norm": 0.7566940188407898, + "learning_rate": 3.789762030676103e-05, + "loss": 0.7009, + "step": 1741 + }, + { + "epoch": 0.6361146613109366, + "grad_norm": 0.914252758026123, + "learning_rate": 3.7891213384513476e-05, + "loss": 0.7362, + "step": 1742 + }, + { + "epoch": 0.636479824721563, + "grad_norm": 1.0864005088806152, + "learning_rate": 3.7884797257954565e-05, + "loss": 0.6954, + "step": 1743 + }, + { + "epoch": 0.6368449881321891, + "grad_norm": 0.9217675924301147, + "learning_rate": 3.7878371930385144e-05, + "loss": 0.7169, + "step": 1744 + }, + { + "epoch": 0.6372101515428155, + "grad_norm": 0.9754314422607422, + "learning_rate": 3.787193740511077e-05, + "loss": 0.7153, + "step": 1745 + }, + { + "epoch": 0.6375753149534417, + "grad_norm": 1.1855268478393555, + "learning_rate": 3.786549368544177e-05, + "loss": 0.7084, + "step": 1746 + }, + { + "epoch": 0.637940478364068, + "grad_norm": 0.854855477809906, + "learning_rate": 3.7859040774693156e-05, + "loss": 0.7084, + "step": 1747 + }, + { + "epoch": 0.6383056417746942, + "grad_norm": 0.9696609973907471, + "learning_rate": 3.7852578676184705e-05, + "loss": 0.7118, + "step": 1748 + }, + { + "epoch": 0.6386708051853204, + "grad_norm": 0.8385400176048279, + "learning_rate": 3.784610739324091e-05, + "loss": 0.7386, + "step": 1749 + }, + { + "epoch": 0.6390359685959467, + "grad_norm": 1.112640142440796, + "learning_rate": 3.7839626929190976e-05, + "loss": 0.7151, + "step": 1750 + }, + { + "epoch": 0.6394011320065729, + "grad_norm": 1.1101984977722168, + "learning_rate": 3.783313728736884e-05, + "loss": 0.7083, + "step": 1751 + }, + { + "epoch": 0.6397662954171992, + "grad_norm": 0.865172803401947, + "learning_rate": 3.782663847111318e-05, + "loss": 0.705, + "step": 1752 + }, + { + "epoch": 0.6401314588278254, + "grad_norm": 1.0278229713439941, + "learning_rate": 3.782013048376736e-05, + "loss": 0.7145, + "step": 1753 + }, + { + "epoch": 0.6404966222384517, + "grad_norm": 0.9916568398475647, + "learning_rate": 3.781361332867948e-05, + "loss": 0.7386, + "step": 1754 + }, + { + "epoch": 0.6408617856490779, + "grad_norm": 1.145369529724121, + "learning_rate": 3.7807087009202366e-05, + "loss": 0.7388, + "step": 1755 + }, + { + "epoch": 0.6412269490597042, + "grad_norm": 0.8243128061294556, + "learning_rate": 3.780055152869354e-05, + "loss": 0.7056, + "step": 1756 + }, + { + "epoch": 0.6415921124703304, + "grad_norm": 1.1970804929733276, + "learning_rate": 3.7794006890515235e-05, + "loss": 0.7265, + "step": 1757 + }, + { + "epoch": 0.6419572758809567, + "grad_norm": 0.7989738583564758, + "learning_rate": 3.778745309803442e-05, + "loss": 0.7139, + "step": 1758 + }, + { + "epoch": 0.642322439291583, + "grad_norm": 0.8789658546447754, + "learning_rate": 3.778089015462275e-05, + "loss": 0.713, + "step": 1759 + }, + { + "epoch": 0.6426876027022093, + "grad_norm": 0.6590216159820557, + "learning_rate": 3.77743180636566e-05, + "loss": 0.7172, + "step": 1760 + }, + { + "epoch": 0.6430527661128355, + "grad_norm": 0.92206871509552, + "learning_rate": 3.776773682851705e-05, + "loss": 0.7135, + "step": 1761 + }, + { + "epoch": 0.6434179295234618, + "grad_norm": 1.0673507452011108, + "learning_rate": 3.776114645258987e-05, + "loss": 0.7426, + "step": 1762 + }, + { + "epoch": 0.643783092934088, + "grad_norm": 0.7360542416572571, + "learning_rate": 3.775454693926554e-05, + "loss": 0.7217, + "step": 1763 + }, + { + "epoch": 0.6441482563447143, + "grad_norm": 1.0386346578598022, + "learning_rate": 3.774793829193927e-05, + "loss": 0.6968, + "step": 1764 + }, + { + "epoch": 0.6445134197553405, + "grad_norm": 1.0472970008850098, + "learning_rate": 3.774132051401093e-05, + "loss": 0.7058, + "step": 1765 + }, + { + "epoch": 0.6448785831659668, + "grad_norm": 0.9268572330474854, + "learning_rate": 3.77346936088851e-05, + "loss": 0.7255, + "step": 1766 + }, + { + "epoch": 0.645243746576593, + "grad_norm": 0.9543839693069458, + "learning_rate": 3.772805757997105e-05, + "loss": 0.6696, + "step": 1767 + }, + { + "epoch": 0.6456089099872193, + "grad_norm": 1.1249008178710938, + "learning_rate": 3.7721412430682766e-05, + "loss": 0.725, + "step": 1768 + }, + { + "epoch": 0.6459740733978455, + "grad_norm": 1.248288631439209, + "learning_rate": 3.7714758164438896e-05, + "loss": 0.7472, + "step": 1769 + }, + { + "epoch": 0.6463392368084718, + "grad_norm": 1.2799824476242065, + "learning_rate": 3.7708094784662804e-05, + "loss": 0.7057, + "step": 1770 + }, + { + "epoch": 0.646704400219098, + "grad_norm": 1.0398547649383545, + "learning_rate": 3.7701422294782514e-05, + "loss": 0.7277, + "step": 1771 + }, + { + "epoch": 0.6470695636297243, + "grad_norm": 0.8737127184867859, + "learning_rate": 3.769474069823078e-05, + "loss": 0.7192, + "step": 1772 + }, + { + "epoch": 0.6474347270403505, + "grad_norm": 0.949883222579956, + "learning_rate": 3.7688049998445e-05, + "loss": 0.7462, + "step": 1773 + }, + { + "epoch": 0.6477998904509769, + "grad_norm": 0.878869354724884, + "learning_rate": 3.7681350198867274e-05, + "loss": 0.7163, + "step": 1774 + }, + { + "epoch": 0.6481650538616031, + "grad_norm": 0.7875892519950867, + "learning_rate": 3.767464130294438e-05, + "loss": 0.7289, + "step": 1775 + }, + { + "epoch": 0.6485302172722294, + "grad_norm": 0.9135648608207703, + "learning_rate": 3.7667923314127774e-05, + "loss": 0.7115, + "step": 1776 + }, + { + "epoch": 0.6488953806828556, + "grad_norm": 0.7511104345321655, + "learning_rate": 3.766119623587359e-05, + "loss": 0.7015, + "step": 1777 + }, + { + "epoch": 0.6492605440934819, + "grad_norm": 1.0761756896972656, + "learning_rate": 3.765446007164264e-05, + "loss": 0.6735, + "step": 1778 + }, + { + "epoch": 0.6496257075041081, + "grad_norm": 0.8264586925506592, + "learning_rate": 3.764771482490042e-05, + "loss": 0.7061, + "step": 1779 + }, + { + "epoch": 0.6499908709147344, + "grad_norm": 1.6037218570709229, + "learning_rate": 3.7640960499117076e-05, + "loss": 0.702, + "step": 1780 + }, + { + "epoch": 0.6503560343253606, + "grad_norm": 0.7423888444900513, + "learning_rate": 3.763419709776744e-05, + "loss": 0.7206, + "step": 1781 + }, + { + "epoch": 0.6507211977359868, + "grad_norm": 0.7805701494216919, + "learning_rate": 3.762742462433102e-05, + "loss": 0.6962, + "step": 1782 + }, + { + "epoch": 0.6510863611466131, + "grad_norm": 0.6788707971572876, + "learning_rate": 3.7620643082291976e-05, + "loss": 0.681, + "step": 1783 + }, + { + "epoch": 0.6514515245572393, + "grad_norm": 1.0078482627868652, + "learning_rate": 3.761385247513913e-05, + "loss": 0.7579, + "step": 1784 + }, + { + "epoch": 0.6518166879678656, + "grad_norm": 1.1813578605651855, + "learning_rate": 3.760705280636599e-05, + "loss": 0.7061, + "step": 1785 + }, + { + "epoch": 0.6521818513784918, + "grad_norm": 1.0172258615493774, + "learning_rate": 3.760024407947072e-05, + "loss": 0.6906, + "step": 1786 + }, + { + "epoch": 0.6525470147891181, + "grad_norm": 1.272959589958191, + "learning_rate": 3.759342629795611e-05, + "loss": 0.7523, + "step": 1787 + }, + { + "epoch": 0.6529121781997443, + "grad_norm": 1.2016862630844116, + "learning_rate": 3.758659946532965e-05, + "loss": 0.715, + "step": 1788 + }, + { + "epoch": 0.6532773416103707, + "grad_norm": 0.8221237063407898, + "learning_rate": 3.757976358510348e-05, + "loss": 0.7262, + "step": 1789 + }, + { + "epoch": 0.6536425050209969, + "grad_norm": 0.9917743802070618, + "learning_rate": 3.757291866079437e-05, + "loss": 0.6689, + "step": 1790 + }, + { + "epoch": 0.6540076684316232, + "grad_norm": 1.1483347415924072, + "learning_rate": 3.756606469592377e-05, + "loss": 0.6982, + "step": 1791 + }, + { + "epoch": 0.6543728318422494, + "grad_norm": 0.7733443975448608, + "learning_rate": 3.755920169401777e-05, + "loss": 0.7169, + "step": 1792 + }, + { + "epoch": 0.6547379952528757, + "grad_norm": 1.0676130056381226, + "learning_rate": 3.7552329658607096e-05, + "loss": 0.751, + "step": 1793 + }, + { + "epoch": 0.6551031586635019, + "grad_norm": 1.0856797695159912, + "learning_rate": 3.754544859322715e-05, + "loss": 0.7233, + "step": 1794 + }, + { + "epoch": 0.6554683220741282, + "grad_norm": 0.9236862659454346, + "learning_rate": 3.753855850141795e-05, + "loss": 0.7167, + "step": 1795 + }, + { + "epoch": 0.6558334854847544, + "grad_norm": 0.7494192123413086, + "learning_rate": 3.7531659386724195e-05, + "loss": 0.727, + "step": 1796 + }, + { + "epoch": 0.6561986488953807, + "grad_norm": 0.850273847579956, + "learning_rate": 3.752475125269517e-05, + "loss": 0.7317, + "step": 1797 + }, + { + "epoch": 0.6565638123060069, + "grad_norm": 1.0726951360702515, + "learning_rate": 3.7517834102884865e-05, + "loss": 0.7631, + "step": 1798 + }, + { + "epoch": 0.6569289757166332, + "grad_norm": 0.916655421257019, + "learning_rate": 3.751090794085185e-05, + "loss": 0.6746, + "step": 1799 + }, + { + "epoch": 0.6572941391272594, + "grad_norm": 1.0388219356536865, + "learning_rate": 3.750397277015937e-05, + "loss": 0.6825, + "step": 1800 + }, + { + "epoch": 0.6576593025378857, + "grad_norm": 1.2060333490371704, + "learning_rate": 3.74970285943753e-05, + "loss": 0.6987, + "step": 1801 + }, + { + "epoch": 0.6580244659485119, + "grad_norm": 1.0017542839050293, + "learning_rate": 3.749007541707212e-05, + "loss": 0.7143, + "step": 1802 + }, + { + "epoch": 0.6583896293591383, + "grad_norm": 1.0995457172393799, + "learning_rate": 3.7483113241826974e-05, + "loss": 0.7184, + "step": 1803 + }, + { + "epoch": 0.6587547927697645, + "grad_norm": 0.8721252679824829, + "learning_rate": 3.747614207222162e-05, + "loss": 0.6864, + "step": 1804 + }, + { + "epoch": 0.6591199561803908, + "grad_norm": 0.9801641702651978, + "learning_rate": 3.7469161911842444e-05, + "loss": 0.7024, + "step": 1805 + }, + { + "epoch": 0.659485119591017, + "grad_norm": 1.2977049350738525, + "learning_rate": 3.7462172764280456e-05, + "loss": 0.6916, + "step": 1806 + }, + { + "epoch": 0.6598502830016433, + "grad_norm": 1.3299815654754639, + "learning_rate": 3.745517463313129e-05, + "loss": 0.7258, + "step": 1807 + }, + { + "epoch": 0.6602154464122695, + "grad_norm": 1.1427744626998901, + "learning_rate": 3.7448167521995216e-05, + "loss": 0.743, + "step": 1808 + }, + { + "epoch": 0.6605806098228958, + "grad_norm": 0.9667388200759888, + "learning_rate": 3.7441151434477096e-05, + "loss": 0.7051, + "step": 1809 + }, + { + "epoch": 0.660945773233522, + "grad_norm": 0.8338121771812439, + "learning_rate": 3.743412637418644e-05, + "loss": 0.7105, + "step": 1810 + }, + { + "epoch": 0.6613109366441483, + "grad_norm": 1.0420618057250977, + "learning_rate": 3.742709234473735e-05, + "loss": 0.7135, + "step": 1811 + }, + { + "epoch": 0.6616761000547745, + "grad_norm": 1.397230625152588, + "learning_rate": 3.7420049349748555e-05, + "loss": 0.6804, + "step": 1812 + }, + { + "epoch": 0.6620412634654008, + "grad_norm": 0.7304776310920715, + "learning_rate": 3.7412997392843385e-05, + "loss": 0.6901, + "step": 1813 + }, + { + "epoch": 0.662406426876027, + "grad_norm": 0.9349809288978577, + "learning_rate": 3.7405936477649806e-05, + "loss": 0.7291, + "step": 1814 + }, + { + "epoch": 0.6627715902866532, + "grad_norm": 0.813755214214325, + "learning_rate": 3.739886660780037e-05, + "loss": 0.7118, + "step": 1815 + }, + { + "epoch": 0.6631367536972795, + "grad_norm": 0.8275956511497498, + "learning_rate": 3.739178778693222e-05, + "loss": 0.7351, + "step": 1816 + }, + { + "epoch": 0.6635019171079057, + "grad_norm": 0.8273405432701111, + "learning_rate": 3.7384700018687154e-05, + "loss": 0.7005, + "step": 1817 + }, + { + "epoch": 0.663867080518532, + "grad_norm": 0.8671954870223999, + "learning_rate": 3.737760330671153e-05, + "loss": 0.715, + "step": 1818 + }, + { + "epoch": 0.6642322439291583, + "grad_norm": 1.006998896598816, + "learning_rate": 3.737049765465633e-05, + "loss": 0.6922, + "step": 1819 + }, + { + "epoch": 0.6645974073397846, + "grad_norm": 1.0910762548446655, + "learning_rate": 3.736338306617712e-05, + "loss": 0.7117, + "step": 1820 + }, + { + "epoch": 0.6649625707504108, + "grad_norm": 0.9704879522323608, + "learning_rate": 3.735625954493406e-05, + "loss": 0.724, + "step": 1821 + }, + { + "epoch": 0.6653277341610371, + "grad_norm": 1.0446968078613281, + "learning_rate": 3.734912709459194e-05, + "loss": 0.7117, + "step": 1822 + }, + { + "epoch": 0.6656928975716633, + "grad_norm": 1.0730209350585938, + "learning_rate": 3.7341985718820106e-05, + "loss": 0.6985, + "step": 1823 + }, + { + "epoch": 0.6660580609822896, + "grad_norm": 0.7722004652023315, + "learning_rate": 3.733483542129251e-05, + "loss": 0.7067, + "step": 1824 + }, + { + "epoch": 0.6664232243929158, + "grad_norm": 0.9696376323699951, + "learning_rate": 3.732767620568769e-05, + "loss": 0.6758, + "step": 1825 + }, + { + "epoch": 0.6667883878035421, + "grad_norm": 0.747405469417572, + "learning_rate": 3.732050807568878e-05, + "loss": 0.6826, + "step": 1826 + }, + { + "epoch": 0.6671535512141683, + "grad_norm": 1.2408199310302734, + "learning_rate": 3.731333103498349e-05, + "loss": 0.7484, + "step": 1827 + }, + { + "epoch": 0.6675187146247946, + "grad_norm": 1.3128819465637207, + "learning_rate": 3.730614508726413e-05, + "loss": 0.7032, + "step": 1828 + }, + { + "epoch": 0.6678838780354208, + "grad_norm": 1.001400113105774, + "learning_rate": 3.729895023622756e-05, + "loss": 0.7014, + "step": 1829 + }, + { + "epoch": 0.6682490414460471, + "grad_norm": 0.7678777575492859, + "learning_rate": 3.729174648557528e-05, + "loss": 0.6633, + "step": 1830 + }, + { + "epoch": 0.6686142048566733, + "grad_norm": 0.8901016712188721, + "learning_rate": 3.728453383901329e-05, + "loss": 0.7197, + "step": 1831 + }, + { + "epoch": 0.6689793682672996, + "grad_norm": 0.8703147172927856, + "learning_rate": 3.727731230025224e-05, + "loss": 0.7244, + "step": 1832 + }, + { + "epoch": 0.6693445316779258, + "grad_norm": 1.0041730403900146, + "learning_rate": 3.727008187300729e-05, + "loss": 0.7062, + "step": 1833 + }, + { + "epoch": 0.6697096950885522, + "grad_norm": 1.2102035284042358, + "learning_rate": 3.726284256099823e-05, + "loss": 0.7057, + "step": 1834 + }, + { + "epoch": 0.6700748584991784, + "grad_norm": 0.9731162786483765, + "learning_rate": 3.725559436794939e-05, + "loss": 0.6986, + "step": 1835 + }, + { + "epoch": 0.6704400219098047, + "grad_norm": 0.7118486166000366, + "learning_rate": 3.7248337297589666e-05, + "loss": 0.7186, + "step": 1836 + }, + { + "epoch": 0.6708051853204309, + "grad_norm": 0.725632905960083, + "learning_rate": 3.724107135365254e-05, + "loss": 0.7206, + "step": 1837 + }, + { + "epoch": 0.6711703487310572, + "grad_norm": 1.1240463256835938, + "learning_rate": 3.723379653987604e-05, + "loss": 0.7195, + "step": 1838 + }, + { + "epoch": 0.6715355121416834, + "grad_norm": 0.9441981315612793, + "learning_rate": 3.722651286000277e-05, + "loss": 0.6832, + "step": 1839 + }, + { + "epoch": 0.6719006755523097, + "grad_norm": 2.023256540298462, + "learning_rate": 3.7219220317779886e-05, + "loss": 0.7089, + "step": 1840 + }, + { + "epoch": 0.6722658389629359, + "grad_norm": 0.8320139050483704, + "learning_rate": 3.721191891695912e-05, + "loss": 0.6735, + "step": 1841 + }, + { + "epoch": 0.6726310023735622, + "grad_norm": 0.827463686466217, + "learning_rate": 3.720460866129674e-05, + "loss": 0.6902, + "step": 1842 + }, + { + "epoch": 0.6729961657841884, + "grad_norm": 0.6844232082366943, + "learning_rate": 3.719728955455359e-05, + "loss": 0.7166, + "step": 1843 + }, + { + "epoch": 0.6733613291948147, + "grad_norm": 1.0527371168136597, + "learning_rate": 3.718996160049504e-05, + "loss": 0.7418, + "step": 1844 + }, + { + "epoch": 0.6737264926054409, + "grad_norm": 0.9820262789726257, + "learning_rate": 3.718262480289103e-05, + "loss": 0.7046, + "step": 1845 + }, + { + "epoch": 0.6740916560160672, + "grad_norm": 0.8342486023902893, + "learning_rate": 3.7175279165516064e-05, + "loss": 0.6899, + "step": 1846 + }, + { + "epoch": 0.6744568194266934, + "grad_norm": 1.2259405851364136, + "learning_rate": 3.7167924692149164e-05, + "loss": 0.7542, + "step": 1847 + }, + { + "epoch": 0.6748219828373198, + "grad_norm": 0.8796677589416504, + "learning_rate": 3.7160561386573916e-05, + "loss": 0.7241, + "step": 1848 + }, + { + "epoch": 0.675187146247946, + "grad_norm": 0.9300341606140137, + "learning_rate": 3.7153189252578454e-05, + "loss": 0.7022, + "step": 1849 + }, + { + "epoch": 0.6755523096585722, + "grad_norm": 0.7318381071090698, + "learning_rate": 3.7145808293955427e-05, + "loss": 0.7015, + "step": 1850 + }, + { + "epoch": 0.6759174730691985, + "grad_norm": 1.1048039197921753, + "learning_rate": 3.7138418514502055e-05, + "loss": 0.7115, + "step": 1851 + }, + { + "epoch": 0.6762826364798247, + "grad_norm": 0.9916225671768188, + "learning_rate": 3.7131019918020074e-05, + "loss": 0.7175, + "step": 1852 + }, + { + "epoch": 0.676647799890451, + "grad_norm": 1.0461235046386719, + "learning_rate": 3.712361250831578e-05, + "loss": 0.6848, + "step": 1853 + }, + { + "epoch": 0.6770129633010772, + "grad_norm": 1.782787561416626, + "learning_rate": 3.711619628919997e-05, + "loss": 0.6769, + "step": 1854 + }, + { + "epoch": 0.6773781267117035, + "grad_norm": 1.2121411561965942, + "learning_rate": 3.7108771264488e-05, + "loss": 0.707, + "step": 1855 + }, + { + "epoch": 0.6777432901223297, + "grad_norm": 0.8399494886398315, + "learning_rate": 3.7101337437999746e-05, + "loss": 0.6915, + "step": 1856 + }, + { + "epoch": 0.678108453532956, + "grad_norm": 0.8495551943778992, + "learning_rate": 3.709389481355962e-05, + "loss": 0.719, + "step": 1857 + }, + { + "epoch": 0.6784736169435822, + "grad_norm": 0.7249609231948853, + "learning_rate": 3.708644339499654e-05, + "loss": 0.7111, + "step": 1858 + }, + { + "epoch": 0.6788387803542085, + "grad_norm": 0.7661548256874084, + "learning_rate": 3.7078983186143976e-05, + "loss": 0.6877, + "step": 1859 + }, + { + "epoch": 0.6792039437648347, + "grad_norm": 0.9450551867485046, + "learning_rate": 3.7071514190839895e-05, + "loss": 0.7233, + "step": 1860 + }, + { + "epoch": 0.679569107175461, + "grad_norm": 0.8381118774414062, + "learning_rate": 3.706403641292681e-05, + "loss": 0.6978, + "step": 1861 + }, + { + "epoch": 0.6799342705860872, + "grad_norm": 0.859592616558075, + "learning_rate": 3.705654985625171e-05, + "loss": 0.7018, + "step": 1862 + }, + { + "epoch": 0.6802994339967136, + "grad_norm": 0.7822218537330627, + "learning_rate": 3.704905452466616e-05, + "loss": 0.7034, + "step": 1863 + }, + { + "epoch": 0.6806645974073398, + "grad_norm": 0.9308319687843323, + "learning_rate": 3.704155042202619e-05, + "loss": 0.7197, + "step": 1864 + }, + { + "epoch": 0.6810297608179661, + "grad_norm": 0.8447214365005493, + "learning_rate": 3.703403755219236e-05, + "loss": 0.7, + "step": 1865 + }, + { + "epoch": 0.6813949242285923, + "grad_norm": 0.8934256434440613, + "learning_rate": 3.702651591902974e-05, + "loss": 0.7039, + "step": 1866 + }, + { + "epoch": 0.6817600876392186, + "grad_norm": 0.8217077255249023, + "learning_rate": 3.701898552640792e-05, + "loss": 0.7259, + "step": 1867 + }, + { + "epoch": 0.6821252510498448, + "grad_norm": 1.7904006242752075, + "learning_rate": 3.7011446378200965e-05, + "loss": 0.6941, + "step": 1868 + }, + { + "epoch": 0.6824904144604711, + "grad_norm": 1.0466225147247314, + "learning_rate": 3.700389847828749e-05, + "loss": 0.6877, + "step": 1869 + }, + { + "epoch": 0.6828555778710973, + "grad_norm": 0.8028392791748047, + "learning_rate": 3.699634183055056e-05, + "loss": 0.6982, + "step": 1870 + }, + { + "epoch": 0.6832207412817236, + "grad_norm": 0.9090667366981506, + "learning_rate": 3.6988776438877784e-05, + "loss": 0.7179, + "step": 1871 + }, + { + "epoch": 0.6835859046923498, + "grad_norm": 0.7476855516433716, + "learning_rate": 3.698120230716124e-05, + "loss": 0.6968, + "step": 1872 + }, + { + "epoch": 0.6839510681029761, + "grad_norm": 1.658331274986267, + "learning_rate": 3.697361943929753e-05, + "loss": 0.7238, + "step": 1873 + }, + { + "epoch": 0.6843162315136023, + "grad_norm": 0.8094294667243958, + "learning_rate": 3.696602783918773e-05, + "loss": 0.6913, + "step": 1874 + }, + { + "epoch": 0.6846813949242286, + "grad_norm": 1.531867265701294, + "learning_rate": 3.69584275107374e-05, + "loss": 0.7319, + "step": 1875 + }, + { + "epoch": 0.6850465583348548, + "grad_norm": 0.8356072306632996, + "learning_rate": 3.695081845785663e-05, + "loss": 0.6787, + "step": 1876 + }, + { + "epoch": 0.6854117217454812, + "grad_norm": 0.961771547794342, + "learning_rate": 3.6943200684459944e-05, + "loss": 0.671, + "step": 1877 + }, + { + "epoch": 0.6857768851561074, + "grad_norm": 0.7966907024383545, + "learning_rate": 3.69355741944664e-05, + "loss": 0.6841, + "step": 1878 + }, + { + "epoch": 0.6861420485667337, + "grad_norm": 0.9229552745819092, + "learning_rate": 3.692793899179951e-05, + "loss": 0.7295, + "step": 1879 + }, + { + "epoch": 0.6865072119773599, + "grad_norm": 0.9129878878593445, + "learning_rate": 3.6920295080387295e-05, + "loss": 0.7352, + "step": 1880 + }, + { + "epoch": 0.6868723753879862, + "grad_norm": 0.9927907586097717, + "learning_rate": 3.691264246416222e-05, + "loss": 0.6975, + "step": 1881 + }, + { + "epoch": 0.6872375387986124, + "grad_norm": 0.7063228487968445, + "learning_rate": 3.6904981147061265e-05, + "loss": 0.7197, + "step": 1882 + }, + { + "epoch": 0.6876027022092386, + "grad_norm": 0.8225526809692383, + "learning_rate": 3.689731113302587e-05, + "loss": 0.702, + "step": 1883 + }, + { + "epoch": 0.6879678656198649, + "grad_norm": 0.9241859912872314, + "learning_rate": 3.688963242600193e-05, + "loss": 0.7156, + "step": 1884 + }, + { + "epoch": 0.6883330290304911, + "grad_norm": 0.970552384853363, + "learning_rate": 3.688194502993985e-05, + "loss": 0.7197, + "step": 1885 + }, + { + "epoch": 0.6886981924411174, + "grad_norm": 0.9685255289077759, + "learning_rate": 3.6874248948794494e-05, + "loss": 0.7068, + "step": 1886 + }, + { + "epoch": 0.6890633558517436, + "grad_norm": 0.8250409364700317, + "learning_rate": 3.6866544186525156e-05, + "loss": 0.691, + "step": 1887 + }, + { + "epoch": 0.6894285192623699, + "grad_norm": 1.1441587209701538, + "learning_rate": 3.685883074709566e-05, + "loss": 0.7062, + "step": 1888 + }, + { + "epoch": 0.6897936826729961, + "grad_norm": 1.142409086227417, + "learning_rate": 3.685110863447424e-05, + "loss": 0.7308, + "step": 1889 + }, + { + "epoch": 0.6901588460836224, + "grad_norm": 0.834210991859436, + "learning_rate": 3.684337785263363e-05, + "loss": 0.7133, + "step": 1890 + }, + { + "epoch": 0.6905240094942486, + "grad_norm": 1.0087356567382812, + "learning_rate": 3.6835638405550994e-05, + "loss": 0.7025, + "step": 1891 + }, + { + "epoch": 0.690889172904875, + "grad_norm": 0.8793954849243164, + "learning_rate": 3.6827890297207964e-05, + "loss": 0.6901, + "step": 1892 + }, + { + "epoch": 0.6912543363155011, + "grad_norm": 1.214178204536438, + "learning_rate": 3.682013353159065e-05, + "loss": 0.701, + "step": 1893 + }, + { + "epoch": 0.6916194997261275, + "grad_norm": 1.5995608568191528, + "learning_rate": 3.681236811268957e-05, + "loss": 0.723, + "step": 1894 + }, + { + "epoch": 0.6919846631367537, + "grad_norm": 0.9377299547195435, + "learning_rate": 3.680459404449974e-05, + "loss": 0.7001, + "step": 1895 + }, + { + "epoch": 0.69234982654738, + "grad_norm": 0.7437262535095215, + "learning_rate": 3.67968113310206e-05, + "loss": 0.7089, + "step": 1896 + }, + { + "epoch": 0.6927149899580062, + "grad_norm": 0.8654033541679382, + "learning_rate": 3.6789019976256045e-05, + "loss": 0.7159, + "step": 1897 + }, + { + "epoch": 0.6930801533686325, + "grad_norm": 1.072096347808838, + "learning_rate": 3.678121998421441e-05, + "loss": 0.7229, + "step": 1898 + }, + { + "epoch": 0.6934453167792587, + "grad_norm": 0.7390715479850769, + "learning_rate": 3.6773411358908486e-05, + "loss": 0.6949, + "step": 1899 + }, + { + "epoch": 0.693810480189885, + "grad_norm": 0.9722973704338074, + "learning_rate": 3.676559410435549e-05, + "loss": 0.7297, + "step": 1900 + }, + { + "epoch": 0.6941756436005112, + "grad_norm": 1.1977391242980957, + "learning_rate": 3.6757768224577086e-05, + "loss": 0.6913, + "step": 1901 + }, + { + "epoch": 0.6945408070111375, + "grad_norm": 0.7919796109199524, + "learning_rate": 3.6749933723599385e-05, + "loss": 0.6944, + "step": 1902 + }, + { + "epoch": 0.6949059704217637, + "grad_norm": 0.9592905044555664, + "learning_rate": 3.674209060545291e-05, + "loss": 0.7401, + "step": 1903 + }, + { + "epoch": 0.69527113383239, + "grad_norm": 0.7853878140449524, + "learning_rate": 3.6734238874172644e-05, + "loss": 0.7006, + "step": 1904 + }, + { + "epoch": 0.6956362972430162, + "grad_norm": 0.7083317041397095, + "learning_rate": 3.6726378533797976e-05, + "loss": 0.6706, + "step": 1905 + }, + { + "epoch": 0.6960014606536425, + "grad_norm": 0.6648118495941162, + "learning_rate": 3.6718509588372737e-05, + "loss": 0.6747, + "step": 1906 + }, + { + "epoch": 0.6963666240642687, + "grad_norm": 1.0176913738250732, + "learning_rate": 3.6710632041945195e-05, + "loss": 0.6844, + "step": 1907 + }, + { + "epoch": 0.6967317874748951, + "grad_norm": 0.7009652853012085, + "learning_rate": 3.670274589856802e-05, + "loss": 0.7198, + "step": 1908 + }, + { + "epoch": 0.6970969508855213, + "grad_norm": 1.0065317153930664, + "learning_rate": 3.6694851162298315e-05, + "loss": 0.71, + "step": 1909 + }, + { + "epoch": 0.6974621142961476, + "grad_norm": 0.9386461973190308, + "learning_rate": 3.668694783719762e-05, + "loss": 0.6891, + "step": 1910 + }, + { + "epoch": 0.6978272777067738, + "grad_norm": 1.139102578163147, + "learning_rate": 3.6679035927331855e-05, + "loss": 0.6519, + "step": 1911 + }, + { + "epoch": 0.6981924411174001, + "grad_norm": 0.8256600499153137, + "learning_rate": 3.6671115436771404e-05, + "loss": 0.7115, + "step": 1912 + }, + { + "epoch": 0.6985576045280263, + "grad_norm": 0.9241785407066345, + "learning_rate": 3.666318636959102e-05, + "loss": 0.6836, + "step": 1913 + }, + { + "epoch": 0.6989227679386526, + "grad_norm": 1.092182993888855, + "learning_rate": 3.665524872986991e-05, + "loss": 0.7064, + "step": 1914 + }, + { + "epoch": 0.6992879313492788, + "grad_norm": 1.0015578269958496, + "learning_rate": 3.664730252169166e-05, + "loss": 0.6837, + "step": 1915 + }, + { + "epoch": 0.699653094759905, + "grad_norm": 0.8351472020149231, + "learning_rate": 3.663934774914428e-05, + "loss": 0.6622, + "step": 1916 + }, + { + "epoch": 0.7000182581705313, + "grad_norm": 1.0238572359085083, + "learning_rate": 3.6631384416320176e-05, + "loss": 0.665, + "step": 1917 + }, + { + "epoch": 0.7003834215811575, + "grad_norm": 0.8432892560958862, + "learning_rate": 3.662341252731616e-05, + "loss": 0.7377, + "step": 1918 + }, + { + "epoch": 0.7007485849917838, + "grad_norm": 0.8084632754325867, + "learning_rate": 3.6615432086233466e-05, + "loss": 0.6875, + "step": 1919 + }, + { + "epoch": 0.70111374840241, + "grad_norm": 0.8538629412651062, + "learning_rate": 3.660744309717769e-05, + "loss": 0.7096, + "step": 1920 + }, + { + "epoch": 0.7014789118130363, + "grad_norm": 0.7508218884468079, + "learning_rate": 3.6599445564258855e-05, + "loss": 0.7041, + "step": 1921 + }, + { + "epoch": 0.7018440752236625, + "grad_norm": 0.8841976523399353, + "learning_rate": 3.659143949159138e-05, + "loss": 0.7489, + "step": 1922 + }, + { + "epoch": 0.7022092386342889, + "grad_norm": 0.6620965003967285, + "learning_rate": 3.6583424883294053e-05, + "loss": 0.7083, + "step": 1923 + }, + { + "epoch": 0.7025744020449151, + "grad_norm": 1.078153371810913, + "learning_rate": 3.657540174349007e-05, + "loss": 0.703, + "step": 1924 + }, + { + "epoch": 0.7029395654555414, + "grad_norm": 0.7211769223213196, + "learning_rate": 3.656737007630703e-05, + "loss": 0.7263, + "step": 1925 + }, + { + "epoch": 0.7033047288661676, + "grad_norm": 1.0884227752685547, + "learning_rate": 3.6559329885876896e-05, + "loss": 0.7, + "step": 1926 + }, + { + "epoch": 0.7036698922767939, + "grad_norm": 3.626338243484497, + "learning_rate": 3.6551281176336015e-05, + "loss": 0.7142, + "step": 1927 + }, + { + "epoch": 0.7040350556874201, + "grad_norm": 0.6280802488327026, + "learning_rate": 3.654322395182512e-05, + "loss": 0.687, + "step": 1928 + }, + { + "epoch": 0.7044002190980464, + "grad_norm": 0.8909886479377747, + "learning_rate": 3.653515821648936e-05, + "loss": 0.7003, + "step": 1929 + }, + { + "epoch": 0.7047653825086726, + "grad_norm": 1.0985990762710571, + "learning_rate": 3.6527083974478193e-05, + "loss": 0.6906, + "step": 1930 + }, + { + "epoch": 0.7051305459192989, + "grad_norm": 0.9323737025260925, + "learning_rate": 3.651900122994552e-05, + "loss": 0.7028, + "step": 1931 + }, + { + "epoch": 0.7054957093299251, + "grad_norm": 1.0363316535949707, + "learning_rate": 3.651090998704958e-05, + "loss": 0.7089, + "step": 1932 + }, + { + "epoch": 0.7058608727405514, + "grad_norm": 0.8372681736946106, + "learning_rate": 3.650281024995299e-05, + "loss": 0.722, + "step": 1933 + }, + { + "epoch": 0.7062260361511776, + "grad_norm": 0.9375368356704712, + "learning_rate": 3.649470202282275e-05, + "loss": 0.6845, + "step": 1934 + }, + { + "epoch": 0.7065911995618039, + "grad_norm": 0.8614623546600342, + "learning_rate": 3.648658530983021e-05, + "loss": 0.7002, + "step": 1935 + }, + { + "epoch": 0.7069563629724301, + "grad_norm": 0.7346733212471008, + "learning_rate": 3.6478460115151084e-05, + "loss": 0.6824, + "step": 1936 + }, + { + "epoch": 0.7073215263830565, + "grad_norm": 0.7642632126808167, + "learning_rate": 3.6470326442965475e-05, + "loss": 0.7172, + "step": 1937 + }, + { + "epoch": 0.7076866897936827, + "grad_norm": 0.8814178705215454, + "learning_rate": 3.6462184297457826e-05, + "loss": 0.6914, + "step": 1938 + }, + { + "epoch": 0.708051853204309, + "grad_norm": 0.9905987977981567, + "learning_rate": 3.6454033682816946e-05, + "loss": 0.7083, + "step": 1939 + }, + { + "epoch": 0.7084170166149352, + "grad_norm": 0.930086076259613, + "learning_rate": 3.6445874603235986e-05, + "loss": 0.7191, + "step": 1940 + }, + { + "epoch": 0.7087821800255615, + "grad_norm": 0.8876291513442993, + "learning_rate": 3.643770706291248e-05, + "loss": 0.6849, + "step": 1941 + }, + { + "epoch": 0.7091473434361877, + "grad_norm": 0.9349446892738342, + "learning_rate": 3.642953106604829e-05, + "loss": 0.6666, + "step": 1942 + }, + { + "epoch": 0.709512506846814, + "grad_norm": 0.8735674619674683, + "learning_rate": 3.6421346616849645e-05, + "loss": 0.719, + "step": 1943 + }, + { + "epoch": 0.7098776702574402, + "grad_norm": 0.7789167761802673, + "learning_rate": 3.641315371952711e-05, + "loss": 0.719, + "step": 1944 + }, + { + "epoch": 0.7102428336680665, + "grad_norm": 0.9366868138313293, + "learning_rate": 3.640495237829561e-05, + "loss": 0.7332, + "step": 1945 + }, + { + "epoch": 0.7106079970786927, + "grad_norm": 0.8824470043182373, + "learning_rate": 3.63967425973744e-05, + "loss": 0.694, + "step": 1946 + }, + { + "epoch": 0.710973160489319, + "grad_norm": 0.8090077638626099, + "learning_rate": 3.638852438098708e-05, + "loss": 0.6626, + "step": 1947 + }, + { + "epoch": 0.7113383238999452, + "grad_norm": 0.7595183849334717, + "learning_rate": 3.63802977333616e-05, + "loss": 0.6954, + "step": 1948 + }, + { + "epoch": 0.7117034873105714, + "grad_norm": 0.6301791071891785, + "learning_rate": 3.637206265873024e-05, + "loss": 0.7201, + "step": 1949 + }, + { + "epoch": 0.7120686507211977, + "grad_norm": 1.1005266904830933, + "learning_rate": 3.6363819161329606e-05, + "loss": 0.7225, + "step": 1950 + }, + { + "epoch": 0.7124338141318239, + "grad_norm": 0.7750469446182251, + "learning_rate": 3.6355567245400655e-05, + "loss": 0.6826, + "step": 1951 + }, + { + "epoch": 0.7127989775424503, + "grad_norm": 0.9254831671714783, + "learning_rate": 3.634730691518866e-05, + "loss": 0.7193, + "step": 1952 + }, + { + "epoch": 0.7131641409530765, + "grad_norm": 1.1714247465133667, + "learning_rate": 3.633903817494324e-05, + "loss": 0.7113, + "step": 1953 + }, + { + "epoch": 0.7135293043637028, + "grad_norm": 1.0585075616836548, + "learning_rate": 3.633076102891832e-05, + "loss": 0.7272, + "step": 1954 + }, + { + "epoch": 0.713894467774329, + "grad_norm": 0.7965645790100098, + "learning_rate": 3.632247548137217e-05, + "loss": 0.6878, + "step": 1955 + }, + { + "epoch": 0.7142596311849553, + "grad_norm": 0.6610022187232971, + "learning_rate": 3.631418153656736e-05, + "loss": 0.6795, + "step": 1956 + }, + { + "epoch": 0.7146247945955815, + "grad_norm": 1.0526721477508545, + "learning_rate": 3.630587919877079e-05, + "loss": 0.7363, + "step": 1957 + }, + { + "epoch": 0.7149899580062078, + "grad_norm": 0.8518906831741333, + "learning_rate": 3.6297568472253694e-05, + "loss": 0.7076, + "step": 1958 + }, + { + "epoch": 0.715355121416834, + "grad_norm": 0.9094949960708618, + "learning_rate": 3.628924936129161e-05, + "loss": 0.6866, + "step": 1959 + }, + { + "epoch": 0.7157202848274603, + "grad_norm": 0.8456898331642151, + "learning_rate": 3.628092187016436e-05, + "loss": 0.7202, + "step": 1960 + }, + { + "epoch": 0.7160854482380865, + "grad_norm": 1.3246712684631348, + "learning_rate": 3.627258600315612e-05, + "loss": 0.7158, + "step": 1961 + }, + { + "epoch": 0.7164506116487128, + "grad_norm": 0.8540048599243164, + "learning_rate": 3.626424176455537e-05, + "loss": 0.6913, + "step": 1962 + }, + { + "epoch": 0.716815775059339, + "grad_norm": 0.7801914811134338, + "learning_rate": 3.625588915865487e-05, + "loss": 0.6888, + "step": 1963 + }, + { + "epoch": 0.7171809384699653, + "grad_norm": 0.9037594199180603, + "learning_rate": 3.624752818975171e-05, + "loss": 0.6992, + "step": 1964 + }, + { + "epoch": 0.7175461018805915, + "grad_norm": 1.1611043214797974, + "learning_rate": 3.623915886214726e-05, + "loss": 0.687, + "step": 1965 + }, + { + "epoch": 0.7179112652912178, + "grad_norm": 0.8219496011734009, + "learning_rate": 3.6230781180147225e-05, + "loss": 0.7015, + "step": 1966 + }, + { + "epoch": 0.718276428701844, + "grad_norm": 0.8429041504859924, + "learning_rate": 3.622239514806157e-05, + "loss": 0.738, + "step": 1967 + }, + { + "epoch": 0.7186415921124704, + "grad_norm": 0.7997975945472717, + "learning_rate": 3.621400077020457e-05, + "loss": 0.6908, + "step": 1968 + }, + { + "epoch": 0.7190067555230966, + "grad_norm": 0.9351162314414978, + "learning_rate": 3.62055980508948e-05, + "loss": 0.6897, + "step": 1969 + }, + { + "epoch": 0.7193719189337229, + "grad_norm": 0.7457789778709412, + "learning_rate": 3.619718699445513e-05, + "loss": 0.665, + "step": 1970 + }, + { + "epoch": 0.7197370823443491, + "grad_norm": 0.8901992440223694, + "learning_rate": 3.61887676052127e-05, + "loss": 0.6707, + "step": 1971 + }, + { + "epoch": 0.7201022457549754, + "grad_norm": 0.6807724833488464, + "learning_rate": 3.6180339887498953e-05, + "loss": 0.6902, + "step": 1972 + }, + { + "epoch": 0.7204674091656016, + "grad_norm": 0.8766950964927673, + "learning_rate": 3.617190384564961e-05, + "loss": 0.6949, + "step": 1973 + }, + { + "epoch": 0.7208325725762279, + "grad_norm": 0.9084246754646301, + "learning_rate": 3.616345948400468e-05, + "loss": 0.7058, + "step": 1974 + }, + { + "epoch": 0.7211977359868541, + "grad_norm": 1.0412424802780151, + "learning_rate": 3.615500680690843e-05, + "loss": 0.6998, + "step": 1975 + }, + { + "epoch": 0.7215628993974804, + "grad_norm": 0.8140221238136292, + "learning_rate": 3.614654581870945e-05, + "loss": 0.6876, + "step": 1976 + }, + { + "epoch": 0.7219280628081066, + "grad_norm": 0.6115467548370361, + "learning_rate": 3.613807652376057e-05, + "loss": 0.6809, + "step": 1977 + }, + { + "epoch": 0.7222932262187329, + "grad_norm": 0.7723403573036194, + "learning_rate": 3.6129598926418896e-05, + "loss": 0.6692, + "step": 1978 + }, + { + "epoch": 0.7226583896293591, + "grad_norm": 0.732070803642273, + "learning_rate": 3.6121113031045815e-05, + "loss": 0.6859, + "step": 1979 + }, + { + "epoch": 0.7230235530399854, + "grad_norm": 0.9471202492713928, + "learning_rate": 3.611261884200698e-05, + "loss": 0.7017, + "step": 1980 + }, + { + "epoch": 0.7233887164506116, + "grad_norm": 0.7620534896850586, + "learning_rate": 3.6104116363672304e-05, + "loss": 0.6991, + "step": 1981 + }, + { + "epoch": 0.723753879861238, + "grad_norm": 0.8713348507881165, + "learning_rate": 3.6095605600415985e-05, + "loss": 0.6714, + "step": 1982 + }, + { + "epoch": 0.7241190432718642, + "grad_norm": 0.8815789818763733, + "learning_rate": 3.6087086556616457e-05, + "loss": 0.6954, + "step": 1983 + }, + { + "epoch": 0.7244842066824904, + "grad_norm": 1.0424094200134277, + "learning_rate": 3.607855923665643e-05, + "loss": 0.6781, + "step": 1984 + }, + { + "epoch": 0.7248493700931167, + "grad_norm": 0.8389812111854553, + "learning_rate": 3.607002364492287e-05, + "loss": 0.6414, + "step": 1985 + }, + { + "epoch": 0.7252145335037429, + "grad_norm": 0.8990022540092468, + "learning_rate": 3.6061479785806996e-05, + "loss": 0.6917, + "step": 1986 + }, + { + "epoch": 0.7255796969143692, + "grad_norm": 1.0698925256729126, + "learning_rate": 3.6052927663704276e-05, + "loss": 0.7104, + "step": 1987 + }, + { + "epoch": 0.7259448603249954, + "grad_norm": 0.9811051487922668, + "learning_rate": 3.604436728301443e-05, + "loss": 0.7292, + "step": 1988 + }, + { + "epoch": 0.7263100237356217, + "grad_norm": 0.7819287776947021, + "learning_rate": 3.603579864814145e-05, + "loss": 0.7457, + "step": 1989 + }, + { + "epoch": 0.7266751871462479, + "grad_norm": 0.7379570603370667, + "learning_rate": 3.6027221763493534e-05, + "loss": 0.7169, + "step": 1990 + }, + { + "epoch": 0.7270403505568742, + "grad_norm": 1.0845247507095337, + "learning_rate": 3.6018636633483154e-05, + "loss": 0.6904, + "step": 1991 + }, + { + "epoch": 0.7274055139675004, + "grad_norm": 1.1536235809326172, + "learning_rate": 3.601004326252702e-05, + "loss": 0.6995, + "step": 1992 + }, + { + "epoch": 0.7277706773781267, + "grad_norm": 0.9554802179336548, + "learning_rate": 3.600144165504607e-05, + "loss": 0.6824, + "step": 1993 + }, + { + "epoch": 0.7281358407887529, + "grad_norm": 1.0458979606628418, + "learning_rate": 3.5992831815465476e-05, + "loss": 0.7018, + "step": 1994 + }, + { + "epoch": 0.7285010041993792, + "grad_norm": 0.8219031095504761, + "learning_rate": 3.598421374821468e-05, + "loss": 0.6919, + "step": 1995 + }, + { + "epoch": 0.7288661676100054, + "grad_norm": 0.7273349761962891, + "learning_rate": 3.59755874577273e-05, + "loss": 0.6747, + "step": 1996 + }, + { + "epoch": 0.7292313310206318, + "grad_norm": 0.8271771669387817, + "learning_rate": 3.596695294844124e-05, + "loss": 0.6915, + "step": 1997 + }, + { + "epoch": 0.729596494431258, + "grad_norm": 0.9043617248535156, + "learning_rate": 3.5958310224798605e-05, + "loss": 0.7148, + "step": 1998 + }, + { + "epoch": 0.7299616578418843, + "grad_norm": 0.9108407497406006, + "learning_rate": 3.5949659291245727e-05, + "loss": 0.699, + "step": 1999 + }, + { + "epoch": 0.7303268212525105, + "grad_norm": 0.9600886106491089, + "learning_rate": 3.5941000152233166e-05, + "loss": 0.6747, + "step": 2000 + }, + { + "epoch": 0.7306919846631368, + "grad_norm": 2.9126079082489014, + "learning_rate": 3.5932332812215694e-05, + "loss": 0.6936, + "step": 2001 + }, + { + "epoch": 0.731057148073763, + "grad_norm": 0.8958035111427307, + "learning_rate": 3.5923657275652316e-05, + "loss": 0.6804, + "step": 2002 + }, + { + "epoch": 0.7314223114843893, + "grad_norm": 1.0890897512435913, + "learning_rate": 3.5914973547006244e-05, + "loss": 0.7119, + "step": 2003 + }, + { + "epoch": 0.7317874748950155, + "grad_norm": 0.9848058223724365, + "learning_rate": 3.5906281630744914e-05, + "loss": 0.7091, + "step": 2004 + }, + { + "epoch": 0.7321526383056418, + "grad_norm": 1.0069018602371216, + "learning_rate": 3.589758153133996e-05, + "loss": 0.7079, + "step": 2005 + }, + { + "epoch": 0.732517801716268, + "grad_norm": 0.8909217119216919, + "learning_rate": 3.588887325326725e-05, + "loss": 0.7091, + "step": 2006 + }, + { + "epoch": 0.7328829651268943, + "grad_norm": 0.9334930181503296, + "learning_rate": 3.5880156801006826e-05, + "loss": 0.6807, + "step": 2007 + }, + { + "epoch": 0.7332481285375205, + "grad_norm": 0.8287849426269531, + "learning_rate": 3.587143217904295e-05, + "loss": 0.6672, + "step": 2008 + }, + { + "epoch": 0.7336132919481468, + "grad_norm": 1.1333422660827637, + "learning_rate": 3.58626993918641e-05, + "loss": 0.6932, + "step": 2009 + }, + { + "epoch": 0.733978455358773, + "grad_norm": 1.2113679647445679, + "learning_rate": 3.585395844396295e-05, + "loss": 0.6713, + "step": 2010 + }, + { + "epoch": 0.7343436187693994, + "grad_norm": 0.7048681378364563, + "learning_rate": 3.584520933983636e-05, + "loss": 0.6991, + "step": 2011 + }, + { + "epoch": 0.7347087821800256, + "grad_norm": 0.9552350044250488, + "learning_rate": 3.5836452083985394e-05, + "loss": 0.7224, + "step": 2012 + }, + { + "epoch": 0.7350739455906519, + "grad_norm": 1.0799976587295532, + "learning_rate": 3.58276866809153e-05, + "loss": 0.6576, + "step": 2013 + }, + { + "epoch": 0.7354391090012781, + "grad_norm": 2.1734097003936768, + "learning_rate": 3.581891313513555e-05, + "loss": 0.7068, + "step": 2014 + }, + { + "epoch": 0.7358042724119044, + "grad_norm": 0.9834950566291809, + "learning_rate": 3.581013145115975e-05, + "loss": 0.6852, + "step": 2015 + }, + { + "epoch": 0.7361694358225306, + "grad_norm": 0.8883312940597534, + "learning_rate": 3.580134163350575e-05, + "loss": 0.6896, + "step": 2016 + }, + { + "epoch": 0.7365345992331568, + "grad_norm": 0.9405683279037476, + "learning_rate": 3.5792543686695544e-05, + "loss": 0.6899, + "step": 2017 + }, + { + "epoch": 0.7368997626437831, + "grad_norm": 0.8160017728805542, + "learning_rate": 3.5783737615255326e-05, + "loss": 0.7014, + "step": 2018 + }, + { + "epoch": 0.7372649260544093, + "grad_norm": 0.8430017828941345, + "learning_rate": 3.5774923423715464e-05, + "loss": 0.6951, + "step": 2019 + }, + { + "epoch": 0.7376300894650356, + "grad_norm": 1.1991838216781616, + "learning_rate": 3.576610111661051e-05, + "loss": 0.6572, + "step": 2020 + }, + { + "epoch": 0.7379952528756618, + "grad_norm": 0.6597726941108704, + "learning_rate": 3.5757270698479186e-05, + "loss": 0.6934, + "step": 2021 + }, + { + "epoch": 0.7383604162862881, + "grad_norm": 1.0859240293502808, + "learning_rate": 3.5748432173864394e-05, + "loss": 0.7139, + "step": 2022 + }, + { + "epoch": 0.7387255796969143, + "grad_norm": 0.8668961524963379, + "learning_rate": 3.573958554731319e-05, + "loss": 0.6971, + "step": 2023 + }, + { + "epoch": 0.7390907431075406, + "grad_norm": 0.9613633155822754, + "learning_rate": 3.573073082337681e-05, + "loss": 0.7083, + "step": 2024 + }, + { + "epoch": 0.7394559065181668, + "grad_norm": 0.6835755109786987, + "learning_rate": 3.572186800661065e-05, + "loss": 0.7037, + "step": 2025 + }, + { + "epoch": 0.7398210699287932, + "grad_norm": 0.9177113771438599, + "learning_rate": 3.571299710157429e-05, + "loss": 0.6993, + "step": 2026 + }, + { + "epoch": 0.7401862333394194, + "grad_norm": 0.9868367314338684, + "learning_rate": 3.570411811283144e-05, + "loss": 0.7054, + "step": 2027 + }, + { + "epoch": 0.7405513967500457, + "grad_norm": 1.4232624769210815, + "learning_rate": 3.569523104494999e-05, + "loss": 0.6859, + "step": 2028 + }, + { + "epoch": 0.7409165601606719, + "grad_norm": 1.0719857215881348, + "learning_rate": 3.568633590250198e-05, + "loss": 0.7003, + "step": 2029 + }, + { + "epoch": 0.7412817235712982, + "grad_norm": 0.705696702003479, + "learning_rate": 3.56774326900636e-05, + "loss": 0.6842, + "step": 2030 + }, + { + "epoch": 0.7416468869819244, + "grad_norm": 1.0737361907958984, + "learning_rate": 3.5668521412215194e-05, + "loss": 0.7257, + "step": 2031 + }, + { + "epoch": 0.7420120503925507, + "grad_norm": 0.9407960772514343, + "learning_rate": 3.5659602073541256e-05, + "loss": 0.7186, + "step": 2032 + }, + { + "epoch": 0.7423772138031769, + "grad_norm": 0.8402684330940247, + "learning_rate": 3.565067467863044e-05, + "loss": 0.7084, + "step": 2033 + }, + { + "epoch": 0.7427423772138032, + "grad_norm": 1.0480576753616333, + "learning_rate": 3.564173923207553e-05, + "loss": 0.696, + "step": 2034 + }, + { + "epoch": 0.7431075406244294, + "grad_norm": 1.7584154605865479, + "learning_rate": 3.563279573847344e-05, + "loss": 0.7129, + "step": 2035 + }, + { + "epoch": 0.7434727040350557, + "grad_norm": 1.2279152870178223, + "learning_rate": 3.5623844202425245e-05, + "loss": 0.7009, + "step": 2036 + }, + { + "epoch": 0.7438378674456819, + "grad_norm": 0.8674067258834839, + "learning_rate": 3.5614884628536156e-05, + "loss": 0.7026, + "step": 2037 + }, + { + "epoch": 0.7442030308563082, + "grad_norm": 0.8557435274124146, + "learning_rate": 3.560591702141552e-05, + "loss": 0.6932, + "step": 2038 + }, + { + "epoch": 0.7445681942669344, + "grad_norm": 0.8920378088951111, + "learning_rate": 3.559694138567679e-05, + "loss": 0.6843, + "step": 2039 + }, + { + "epoch": 0.7449333576775607, + "grad_norm": 1.034100890159607, + "learning_rate": 3.558795772593759e-05, + "loss": 0.7097, + "step": 2040 + }, + { + "epoch": 0.745298521088187, + "grad_norm": 0.7674341201782227, + "learning_rate": 3.5578966046819644e-05, + "loss": 0.6938, + "step": 2041 + }, + { + "epoch": 0.7456636844988133, + "grad_norm": 0.9016082286834717, + "learning_rate": 3.556996635294881e-05, + "loss": 0.6827, + "step": 2042 + }, + { + "epoch": 0.7460288479094395, + "grad_norm": 0.782569408416748, + "learning_rate": 3.556095864895508e-05, + "loss": 0.6498, + "step": 2043 + }, + { + "epoch": 0.7463940113200658, + "grad_norm": 0.9244042038917542, + "learning_rate": 3.555194293947254e-05, + "loss": 0.698, + "step": 2044 + }, + { + "epoch": 0.746759174730692, + "grad_norm": 0.8407185673713684, + "learning_rate": 3.554291922913942e-05, + "loss": 0.7162, + "step": 2045 + }, + { + "epoch": 0.7471243381413183, + "grad_norm": 1.2670552730560303, + "learning_rate": 3.553388752259806e-05, + "loss": 0.6927, + "step": 2046 + }, + { + "epoch": 0.7474895015519445, + "grad_norm": 0.8736078143119812, + "learning_rate": 3.5524847824494896e-05, + "loss": 0.7036, + "step": 2047 + }, + { + "epoch": 0.7478546649625708, + "grad_norm": 0.7163779735565186, + "learning_rate": 3.5515800139480505e-05, + "loss": 0.7062, + "step": 2048 + }, + { + "epoch": 0.748219828373197, + "grad_norm": 0.5790550112724304, + "learning_rate": 3.5506744472209556e-05, + "loss": 0.6837, + "step": 2049 + }, + { + "epoch": 0.7485849917838232, + "grad_norm": 1.0888993740081787, + "learning_rate": 3.5497680827340816e-05, + "loss": 0.7061, + "step": 2050 + }, + { + "epoch": 0.7489501551944495, + "grad_norm": 0.7524549961090088, + "learning_rate": 3.5488609209537176e-05, + "loss": 0.6826, + "step": 2051 + }, + { + "epoch": 0.7493153186050757, + "grad_norm": 1.128859281539917, + "learning_rate": 3.547952962346562e-05, + "loss": 0.7017, + "step": 2052 + }, + { + "epoch": 0.749680482015702, + "grad_norm": 0.9107731580734253, + "learning_rate": 3.5470442073797224e-05, + "loss": 0.6801, + "step": 2053 + }, + { + "epoch": 0.7500456454263282, + "grad_norm": 0.8646042346954346, + "learning_rate": 3.5461346565207174e-05, + "loss": 0.6924, + "step": 2054 + }, + { + "epoch": 0.7504108088369545, + "grad_norm": 0.8979294896125793, + "learning_rate": 3.5452243102374737e-05, + "loss": 0.6661, + "step": 2055 + }, + { + "epoch": 0.7507759722475807, + "grad_norm": 1.0486830472946167, + "learning_rate": 3.5443131689983285e-05, + "loss": 0.6636, + "step": 2056 + }, + { + "epoch": 0.7511411356582071, + "grad_norm": 0.8791133761405945, + "learning_rate": 3.543401233272028e-05, + "loss": 0.7052, + "step": 2057 + }, + { + "epoch": 0.7515062990688333, + "grad_norm": 0.8093293905258179, + "learning_rate": 3.5424885035277255e-05, + "loss": 0.6693, + "step": 2058 + }, + { + "epoch": 0.7518714624794596, + "grad_norm": 0.7917428612709045, + "learning_rate": 3.541574980234983e-05, + "loss": 0.6754, + "step": 2059 + }, + { + "epoch": 0.7522366258900858, + "grad_norm": 0.9093384742736816, + "learning_rate": 3.540660663863774e-05, + "loss": 0.7178, + "step": 2060 + }, + { + "epoch": 0.7526017893007121, + "grad_norm": 0.7561656832695007, + "learning_rate": 3.539745554884476e-05, + "loss": 0.6684, + "step": 2061 + }, + { + "epoch": 0.7529669527113383, + "grad_norm": 0.9925743937492371, + "learning_rate": 3.5388296537678765e-05, + "loss": 0.7123, + "step": 2062 + }, + { + "epoch": 0.7533321161219646, + "grad_norm": 0.7667102813720703, + "learning_rate": 3.537912960985169e-05, + "loss": 0.6581, + "step": 2063 + }, + { + "epoch": 0.7536972795325908, + "grad_norm": 1.0372451543807983, + "learning_rate": 3.536995477007955e-05, + "loss": 0.6929, + "step": 2064 + }, + { + "epoch": 0.7540624429432171, + "grad_norm": 1.0563349723815918, + "learning_rate": 3.5360772023082446e-05, + "loss": 0.7054, + "step": 2065 + }, + { + "epoch": 0.7544276063538433, + "grad_norm": 1.1406291723251343, + "learning_rate": 3.535158137358453e-05, + "loss": 0.7117, + "step": 2066 + }, + { + "epoch": 0.7547927697644696, + "grad_norm": 0.67351233959198, + "learning_rate": 3.534238282631401e-05, + "loss": 0.703, + "step": 2067 + }, + { + "epoch": 0.7551579331750958, + "grad_norm": 0.8350493311882019, + "learning_rate": 3.533317638600319e-05, + "loss": 0.6952, + "step": 2068 + }, + { + "epoch": 0.7555230965857221, + "grad_norm": 0.8542644381523132, + "learning_rate": 3.532396205738839e-05, + "loss": 0.6871, + "step": 2069 + }, + { + "epoch": 0.7558882599963483, + "grad_norm": 0.8126198649406433, + "learning_rate": 3.5314739845210027e-05, + "loss": 0.6661, + "step": 2070 + }, + { + "epoch": 0.7562534234069747, + "grad_norm": 1.0363764762878418, + "learning_rate": 3.530550975421255e-05, + "loss": 0.7021, + "step": 2071 + }, + { + "epoch": 0.7566185868176009, + "grad_norm": 0.8353523015975952, + "learning_rate": 3.529627178914448e-05, + "loss": 0.7183, + "step": 2072 + }, + { + "epoch": 0.7569837502282272, + "grad_norm": 0.7634402513504028, + "learning_rate": 3.5287025954758385e-05, + "loss": 0.696, + "step": 2073 + }, + { + "epoch": 0.7573489136388534, + "grad_norm": 1.201458215713501, + "learning_rate": 3.5277772255810855e-05, + "loss": 0.6416, + "step": 2074 + }, + { + "epoch": 0.7577140770494797, + "grad_norm": 0.7530384659767151, + "learning_rate": 3.526851069706256e-05, + "loss": 0.6996, + "step": 2075 + }, + { + "epoch": 0.7580792404601059, + "grad_norm": 0.8554724454879761, + "learning_rate": 3.5259241283278204e-05, + "loss": 0.6973, + "step": 2076 + }, + { + "epoch": 0.7584444038707322, + "grad_norm": 1.2510430812835693, + "learning_rate": 3.5249964019226514e-05, + "loss": 0.7042, + "step": 2077 + }, + { + "epoch": 0.7588095672813584, + "grad_norm": 0.9825018048286438, + "learning_rate": 3.524067890968029e-05, + "loss": 0.6988, + "step": 2078 + }, + { + "epoch": 0.7591747306919847, + "grad_norm": 0.7476685643196106, + "learning_rate": 3.523138595941633e-05, + "loss": 0.6993, + "step": 2079 + }, + { + "epoch": 0.7595398941026109, + "grad_norm": 0.8344430923461914, + "learning_rate": 3.5222085173215495e-05, + "loss": 0.6678, + "step": 2080 + }, + { + "epoch": 0.7599050575132372, + "grad_norm": 0.9352977275848389, + "learning_rate": 3.521277655586266e-05, + "loss": 0.7004, + "step": 2081 + }, + { + "epoch": 0.7602702209238634, + "grad_norm": 1.1583284139633179, + "learning_rate": 3.520346011214674e-05, + "loss": 0.6855, + "step": 2082 + }, + { + "epoch": 0.7606353843344897, + "grad_norm": 0.9419161081314087, + "learning_rate": 3.519413584686067e-05, + "loss": 0.687, + "step": 2083 + }, + { + "epoch": 0.7610005477451159, + "grad_norm": 1.1509552001953125, + "learning_rate": 3.518480376480141e-05, + "loss": 0.7069, + "step": 2084 + }, + { + "epoch": 0.7613657111557421, + "grad_norm": 0.9876951575279236, + "learning_rate": 3.5175463870769935e-05, + "loss": 0.6963, + "step": 2085 + }, + { + "epoch": 0.7617308745663685, + "grad_norm": 1.2705250978469849, + "learning_rate": 3.516611616957125e-05, + "loss": 0.7005, + "step": 2086 + }, + { + "epoch": 0.7620960379769947, + "grad_norm": 0.8080708980560303, + "learning_rate": 3.515676066601438e-05, + "loss": 0.6875, + "step": 2087 + }, + { + "epoch": 0.762461201387621, + "grad_norm": 0.7331348657608032, + "learning_rate": 3.514739736491235e-05, + "loss": 0.6668, + "step": 2088 + }, + { + "epoch": 0.7628263647982472, + "grad_norm": 0.698076069355011, + "learning_rate": 3.51380262710822e-05, + "loss": 0.6801, + "step": 2089 + }, + { + "epoch": 0.7631915282088735, + "grad_norm": 0.7155959010124207, + "learning_rate": 3.512864738934499e-05, + "loss": 0.6964, + "step": 2090 + }, + { + "epoch": 0.7635566916194997, + "grad_norm": 0.8522117733955383, + "learning_rate": 3.511926072452578e-05, + "loss": 0.7069, + "step": 2091 + }, + { + "epoch": 0.763921855030126, + "grad_norm": 0.8499336838722229, + "learning_rate": 3.5109866281453606e-05, + "loss": 0.6711, + "step": 2092 + }, + { + "epoch": 0.7642870184407522, + "grad_norm": 0.7255849242210388, + "learning_rate": 3.510046406496157e-05, + "loss": 0.6665, + "step": 2093 + }, + { + "epoch": 0.7646521818513785, + "grad_norm": 1.1719845533370972, + "learning_rate": 3.50910540798867e-05, + "loss": 0.666, + "step": 2094 + }, + { + "epoch": 0.7650173452620047, + "grad_norm": 0.9627401232719421, + "learning_rate": 3.508163633107008e-05, + "loss": 0.7095, + "step": 2095 + }, + { + "epoch": 0.765382508672631, + "grad_norm": 0.8184304237365723, + "learning_rate": 3.507221082335676e-05, + "loss": 0.6829, + "step": 2096 + }, + { + "epoch": 0.7657476720832572, + "grad_norm": 0.9856240153312683, + "learning_rate": 3.5062777561595776e-05, + "loss": 0.7136, + "step": 2097 + }, + { + "epoch": 0.7661128354938835, + "grad_norm": 1.0894156694412231, + "learning_rate": 3.505333655064017e-05, + "loss": 0.6995, + "step": 2098 + }, + { + "epoch": 0.7664779989045097, + "grad_norm": 1.1927454471588135, + "learning_rate": 3.5043887795346966e-05, + "loss": 0.7103, + "step": 2099 + }, + { + "epoch": 0.766843162315136, + "grad_norm": 0.9579023718833923, + "learning_rate": 3.503443130057715e-05, + "loss": 0.6916, + "step": 2100 + }, + { + "epoch": 0.7672083257257623, + "grad_norm": 1.1369895935058594, + "learning_rate": 3.5024967071195736e-05, + "loss": 0.7256, + "step": 2101 + }, + { + "epoch": 0.7675734891363886, + "grad_norm": 0.7931919097900391, + "learning_rate": 3.501549511207168e-05, + "loss": 0.7003, + "step": 2102 + }, + { + "epoch": 0.7679386525470148, + "grad_norm": 1.0025581121444702, + "learning_rate": 3.500601542807792e-05, + "loss": 0.6724, + "step": 2103 + }, + { + "epoch": 0.7683038159576411, + "grad_norm": 0.9976342916488647, + "learning_rate": 3.499652802409137e-05, + "loss": 0.6695, + "step": 2104 + }, + { + "epoch": 0.7686689793682673, + "grad_norm": 0.6871845722198486, + "learning_rate": 3.4987032904992935e-05, + "loss": 0.6914, + "step": 2105 + }, + { + "epoch": 0.7690341427788936, + "grad_norm": 1.060022234916687, + "learning_rate": 3.497753007566746e-05, + "loss": 0.6821, + "step": 2106 + }, + { + "epoch": 0.7693993061895198, + "grad_norm": 1.0832650661468506, + "learning_rate": 3.4968019541003765e-05, + "loss": 0.6801, + "step": 2107 + }, + { + "epoch": 0.7697644696001461, + "grad_norm": 0.8646666407585144, + "learning_rate": 3.495850130589465e-05, + "loss": 0.6887, + "step": 2108 + }, + { + "epoch": 0.7701296330107723, + "grad_norm": 0.6750617623329163, + "learning_rate": 3.494897537523686e-05, + "loss": 0.6979, + "step": 2109 + }, + { + "epoch": 0.7704947964213986, + "grad_norm": 1.157203197479248, + "learning_rate": 3.493944175393111e-05, + "loss": 0.6683, + "step": 2110 + }, + { + "epoch": 0.7708599598320248, + "grad_norm": 0.9452328085899353, + "learning_rate": 3.492990044688205e-05, + "loss": 0.6818, + "step": 2111 + }, + { + "epoch": 0.7712251232426511, + "grad_norm": 0.7917987704277039, + "learning_rate": 3.4920351458998316e-05, + "loss": 0.6987, + "step": 2112 + }, + { + "epoch": 0.7715902866532773, + "grad_norm": 0.9351409077644348, + "learning_rate": 3.491079479519248e-05, + "loss": 0.6997, + "step": 2113 + }, + { + "epoch": 0.7719554500639036, + "grad_norm": 0.8905813097953796, + "learning_rate": 3.490123046038104e-05, + "loss": 0.6809, + "step": 2114 + }, + { + "epoch": 0.7723206134745298, + "grad_norm": 0.8470001220703125, + "learning_rate": 3.489165845948448e-05, + "loss": 0.6904, + "step": 2115 + }, + { + "epoch": 0.7726857768851562, + "grad_norm": 1.801979660987854, + "learning_rate": 3.488207879742722e-05, + "loss": 0.6968, + "step": 2116 + }, + { + "epoch": 0.7730509402957824, + "grad_norm": 0.9084315299987793, + "learning_rate": 3.487249147913759e-05, + "loss": 0.6625, + "step": 2117 + }, + { + "epoch": 0.7734161037064086, + "grad_norm": 1.1759934425354004, + "learning_rate": 3.4862896509547886e-05, + "loss": 0.6912, + "step": 2118 + }, + { + "epoch": 0.7737812671170349, + "grad_norm": 0.7581272125244141, + "learning_rate": 3.485329389359434e-05, + "loss": 0.7298, + "step": 2119 + }, + { + "epoch": 0.7741464305276611, + "grad_norm": 0.9350868463516235, + "learning_rate": 3.484368363621712e-05, + "loss": 0.6889, + "step": 2120 + }, + { + "epoch": 0.7745115939382874, + "grad_norm": 1.0078556537628174, + "learning_rate": 3.48340657423603e-05, + "loss": 0.718, + "step": 2121 + }, + { + "epoch": 0.7748767573489136, + "grad_norm": 0.9204308986663818, + "learning_rate": 3.482444021697192e-05, + "loss": 0.686, + "step": 2122 + }, + { + "epoch": 0.7752419207595399, + "grad_norm": 0.7891234755516052, + "learning_rate": 3.481480706500391e-05, + "loss": 0.6902, + "step": 2123 + }, + { + "epoch": 0.7756070841701661, + "grad_norm": 0.8696504831314087, + "learning_rate": 3.480516629141214e-05, + "loss": 0.7117, + "step": 2124 + }, + { + "epoch": 0.7759722475807924, + "grad_norm": 0.9487229585647583, + "learning_rate": 3.479551790115642e-05, + "loss": 0.6794, + "step": 2125 + }, + { + "epoch": 0.7763374109914186, + "grad_norm": 0.8154385089874268, + "learning_rate": 3.4785861899200434e-05, + "loss": 0.6934, + "step": 2126 + }, + { + "epoch": 0.7767025744020449, + "grad_norm": 1.1768670082092285, + "learning_rate": 3.477619829051183e-05, + "loss": 0.6923, + "step": 2127 + }, + { + "epoch": 0.7770677378126711, + "grad_norm": 0.9852544069290161, + "learning_rate": 3.476652708006214e-05, + "loss": 0.7153, + "step": 2128 + }, + { + "epoch": 0.7774329012232974, + "grad_norm": 0.7022532820701599, + "learning_rate": 3.4756848272826795e-05, + "loss": 0.6879, + "step": 2129 + }, + { + "epoch": 0.7777980646339236, + "grad_norm": 1.0566223859786987, + "learning_rate": 3.474716187378518e-05, + "loss": 0.6808, + "step": 2130 + }, + { + "epoch": 0.77816322804455, + "grad_norm": 0.8685449361801147, + "learning_rate": 3.4737467887920556e-05, + "loss": 0.6829, + "step": 2131 + }, + { + "epoch": 0.7785283914551762, + "grad_norm": 0.8262388706207275, + "learning_rate": 3.4727766320220064e-05, + "loss": 0.6729, + "step": 2132 + }, + { + "epoch": 0.7788935548658025, + "grad_norm": 0.6778239607810974, + "learning_rate": 3.47180571756748e-05, + "loss": 0.6755, + "step": 2133 + }, + { + "epoch": 0.7792587182764287, + "grad_norm": 0.7228227853775024, + "learning_rate": 3.470834045927971e-05, + "loss": 0.7108, + "step": 2134 + }, + { + "epoch": 0.779623881687055, + "grad_norm": 0.8152421116828918, + "learning_rate": 3.469861617603367e-05, + "loss": 0.6943, + "step": 2135 + }, + { + "epoch": 0.7799890450976812, + "grad_norm": 0.9563167095184326, + "learning_rate": 3.468888433093943e-05, + "loss": 0.6711, + "step": 2136 + }, + { + "epoch": 0.7803542085083075, + "grad_norm": 0.8263843059539795, + "learning_rate": 3.4679144929003624e-05, + "loss": 0.6996, + "step": 2137 + }, + { + "epoch": 0.7807193719189337, + "grad_norm": 0.9184187650680542, + "learning_rate": 3.466939797523679e-05, + "loss": 0.6791, + "step": 2138 + }, + { + "epoch": 0.78108453532956, + "grad_norm": 1.192243218421936, + "learning_rate": 3.465964347465334e-05, + "loss": 0.6846, + "step": 2139 + }, + { + "epoch": 0.7814496987401862, + "grad_norm": 1.0710678100585938, + "learning_rate": 3.464988143227158e-05, + "loss": 0.7174, + "step": 2140 + }, + { + "epoch": 0.7818148621508125, + "grad_norm": 0.8729625940322876, + "learning_rate": 3.4640111853113686e-05, + "loss": 0.6285, + "step": 2141 + }, + { + "epoch": 0.7821800255614387, + "grad_norm": 1.0296928882598877, + "learning_rate": 3.463033474220572e-05, + "loss": 0.6873, + "step": 2142 + }, + { + "epoch": 0.782545188972065, + "grad_norm": 1.0441279411315918, + "learning_rate": 3.46205501045776e-05, + "loss": 0.6527, + "step": 2143 + }, + { + "epoch": 0.7829103523826912, + "grad_norm": 0.9956523180007935, + "learning_rate": 3.461075794526314e-05, + "loss": 0.6841, + "step": 2144 + }, + { + "epoch": 0.7832755157933176, + "grad_norm": 0.896945059299469, + "learning_rate": 3.460095826930001e-05, + "loss": 0.7008, + "step": 2145 + }, + { + "epoch": 0.7836406792039438, + "grad_norm": 0.9649824500083923, + "learning_rate": 3.4591151081729756e-05, + "loss": 0.6786, + "step": 2146 + }, + { + "epoch": 0.7840058426145701, + "grad_norm": 0.975921630859375, + "learning_rate": 3.458133638759777e-05, + "loss": 0.696, + "step": 2147 + }, + { + "epoch": 0.7843710060251963, + "grad_norm": 0.9992843270301819, + "learning_rate": 3.457151419195332e-05, + "loss": 0.7158, + "step": 2148 + }, + { + "epoch": 0.7847361694358226, + "grad_norm": 1.2938470840454102, + "learning_rate": 3.456168449984955e-05, + "loss": 0.657, + "step": 2149 + }, + { + "epoch": 0.7851013328464488, + "grad_norm": 0.8014959096908569, + "learning_rate": 3.4551847316343426e-05, + "loss": 0.7029, + "step": 2150 + }, + { + "epoch": 0.785466496257075, + "grad_norm": 1.136978030204773, + "learning_rate": 3.4542002646495784e-05, + "loss": 0.7255, + "step": 2151 + }, + { + "epoch": 0.7858316596677013, + "grad_norm": 0.9775698184967041, + "learning_rate": 3.453215049537131e-05, + "loss": 0.6677, + "step": 2152 + }, + { + "epoch": 0.7861968230783275, + "grad_norm": 0.7203195095062256, + "learning_rate": 3.452229086803856e-05, + "loss": 0.6564, + "step": 2153 + }, + { + "epoch": 0.7865619864889538, + "grad_norm": 1.0495548248291016, + "learning_rate": 3.451242376956988e-05, + "loss": 0.7411, + "step": 2154 + }, + { + "epoch": 0.78692714989958, + "grad_norm": 1.1219384670257568, + "learning_rate": 3.4502549205041534e-05, + "loss": 0.6976, + "step": 2155 + }, + { + "epoch": 0.7872923133102063, + "grad_norm": 0.7579952478408813, + "learning_rate": 3.449266717953357e-05, + "loss": 0.6864, + "step": 2156 + }, + { + "epoch": 0.7876574767208325, + "grad_norm": 0.9256203174591064, + "learning_rate": 3.44827776981299e-05, + "loss": 0.7102, + "step": 2157 + }, + { + "epoch": 0.7880226401314588, + "grad_norm": 1.4947396516799927, + "learning_rate": 3.447288076591825e-05, + "loss": 0.7104, + "step": 2158 + }, + { + "epoch": 0.788387803542085, + "grad_norm": 1.0526587963104248, + "learning_rate": 3.446297638799022e-05, + "loss": 0.6885, + "step": 2159 + }, + { + "epoch": 0.7887529669527114, + "grad_norm": 1.2474300861358643, + "learning_rate": 3.445306456944119e-05, + "loss": 0.6862, + "step": 2160 + }, + { + "epoch": 0.7891181303633376, + "grad_norm": 0.6906046867370605, + "learning_rate": 3.444314531537041e-05, + "loss": 0.6976, + "step": 2161 + }, + { + "epoch": 0.7894832937739639, + "grad_norm": 1.101379632949829, + "learning_rate": 3.443321863088093e-05, + "loss": 0.675, + "step": 2162 + }, + { + "epoch": 0.7898484571845901, + "grad_norm": 0.7387844324111938, + "learning_rate": 3.4423284521079635e-05, + "loss": 0.6929, + "step": 2163 + }, + { + "epoch": 0.7902136205952164, + "grad_norm": 1.1480523347854614, + "learning_rate": 3.441334299107722e-05, + "loss": 0.6868, + "step": 2164 + }, + { + "epoch": 0.7905787840058426, + "grad_norm": 0.8580271601676941, + "learning_rate": 3.440339404598822e-05, + "loss": 0.6732, + "step": 2165 + }, + { + "epoch": 0.7909439474164689, + "grad_norm": 1.411083459854126, + "learning_rate": 3.4393437690930944e-05, + "loss": 0.7169, + "step": 2166 + }, + { + "epoch": 0.7913091108270951, + "grad_norm": 1.0092064142227173, + "learning_rate": 3.438347393102755e-05, + "loss": 0.645, + "step": 2167 + }, + { + "epoch": 0.7916742742377214, + "grad_norm": 0.6816556453704834, + "learning_rate": 3.4373502771403995e-05, + "loss": 0.6823, + "step": 2168 + }, + { + "epoch": 0.7920394376483476, + "grad_norm": 0.9781081080436707, + "learning_rate": 3.436352421719004e-05, + "loss": 0.7077, + "step": 2169 + }, + { + "epoch": 0.7924046010589739, + "grad_norm": 0.9609267115592957, + "learning_rate": 3.4353538273519244e-05, + "loss": 0.6879, + "step": 2170 + }, + { + "epoch": 0.7927697644696001, + "grad_norm": 0.986900806427002, + "learning_rate": 3.4343544945528975e-05, + "loss": 0.6833, + "step": 2171 + }, + { + "epoch": 0.7931349278802264, + "grad_norm": 0.7960975766181946, + "learning_rate": 3.4333544238360404e-05, + "loss": 0.6609, + "step": 2172 + }, + { + "epoch": 0.7935000912908526, + "grad_norm": 0.7304735779762268, + "learning_rate": 3.432353615715849e-05, + "loss": 0.6893, + "step": 2173 + }, + { + "epoch": 0.793865254701479, + "grad_norm": 0.8587673902511597, + "learning_rate": 3.431352070707199e-05, + "loss": 0.6996, + "step": 2174 + }, + { + "epoch": 0.7942304181121052, + "grad_norm": 1.1969038248062134, + "learning_rate": 3.430349789325346e-05, + "loss": 0.685, + "step": 2175 + }, + { + "epoch": 0.7945955815227315, + "grad_norm": 0.762546718120575, + "learning_rate": 3.429346772085923e-05, + "loss": 0.6876, + "step": 2176 + }, + { + "epoch": 0.7949607449333577, + "grad_norm": 1.1056230068206787, + "learning_rate": 3.42834301950494e-05, + "loss": 0.7068, + "step": 2177 + }, + { + "epoch": 0.795325908343984, + "grad_norm": 0.8168462514877319, + "learning_rate": 3.427338532098791e-05, + "loss": 0.6572, + "step": 2178 + }, + { + "epoch": 0.7956910717546102, + "grad_norm": 1.0522792339324951, + "learning_rate": 3.4263333103842415e-05, + "loss": 0.7001, + "step": 2179 + }, + { + "epoch": 0.7960562351652365, + "grad_norm": 0.9588885307312012, + "learning_rate": 3.42532735487844e-05, + "loss": 0.6782, + "step": 2180 + }, + { + "epoch": 0.7964213985758627, + "grad_norm": 0.9216328859329224, + "learning_rate": 3.424320666098909e-05, + "loss": 0.6941, + "step": 2181 + }, + { + "epoch": 0.796786561986489, + "grad_norm": 0.9530096650123596, + "learning_rate": 3.4233132445635496e-05, + "loss": 0.6492, + "step": 2182 + }, + { + "epoch": 0.7971517253971152, + "grad_norm": 1.0291130542755127, + "learning_rate": 3.42230509079064e-05, + "loss": 0.7058, + "step": 2183 + }, + { + "epoch": 0.7975168888077415, + "grad_norm": 0.9647759199142456, + "learning_rate": 3.421296205298835e-05, + "loss": 0.6777, + "step": 2184 + }, + { + "epoch": 0.7978820522183677, + "grad_norm": 1.066928744316101, + "learning_rate": 3.420286588607165e-05, + "loss": 0.683, + "step": 2185 + }, + { + "epoch": 0.7982472156289939, + "grad_norm": 1.107342004776001, + "learning_rate": 3.4192762412350375e-05, + "loss": 0.6785, + "step": 2186 + }, + { + "epoch": 0.7986123790396202, + "grad_norm": 0.8849497437477112, + "learning_rate": 3.418265163702236e-05, + "loss": 0.6871, + "step": 2187 + }, + { + "epoch": 0.7989775424502464, + "grad_norm": 0.997079074382782, + "learning_rate": 3.41725335652892e-05, + "loss": 0.6908, + "step": 2188 + }, + { + "epoch": 0.7993427058608727, + "grad_norm": 1.009993553161621, + "learning_rate": 3.4162408202356224e-05, + "loss": 0.7019, + "step": 2189 + }, + { + "epoch": 0.799707869271499, + "grad_norm": 0.8812515735626221, + "learning_rate": 3.4152275553432524e-05, + "loss": 0.663, + "step": 2190 + }, + { + "epoch": 0.8000730326821253, + "grad_norm": 0.9019457697868347, + "learning_rate": 3.4142135623730954e-05, + "loss": 0.6851, + "step": 2191 + }, + { + "epoch": 0.8004381960927515, + "grad_norm": 0.6627798676490784, + "learning_rate": 3.413198841846809e-05, + "loss": 0.688, + "step": 2192 + }, + { + "epoch": 0.8008033595033778, + "grad_norm": 0.8205757141113281, + "learning_rate": 3.412183394286427e-05, + "loss": 0.6604, + "step": 2193 + }, + { + "epoch": 0.801168522914004, + "grad_norm": 0.9085656404495239, + "learning_rate": 3.411167220214356e-05, + "loss": 0.7146, + "step": 2194 + }, + { + "epoch": 0.8015336863246303, + "grad_norm": 0.7780555486679077, + "learning_rate": 3.410150320153377e-05, + "loss": 0.6484, + "step": 2195 + }, + { + "epoch": 0.8018988497352565, + "grad_norm": 0.7425792813301086, + "learning_rate": 3.409132694626643e-05, + "loss": 0.6873, + "step": 2196 + }, + { + "epoch": 0.8022640131458828, + "grad_norm": 0.7671288251876831, + "learning_rate": 3.408114344157684e-05, + "loss": 0.6995, + "step": 2197 + }, + { + "epoch": 0.802629176556509, + "grad_norm": 1.4739527702331543, + "learning_rate": 3.407095269270398e-05, + "loss": 0.6724, + "step": 2198 + }, + { + "epoch": 0.8029943399671353, + "grad_norm": 0.9535475373268127, + "learning_rate": 3.40607547048906e-05, + "loss": 0.6924, + "step": 2199 + }, + { + "epoch": 0.8033595033777615, + "grad_norm": 0.8332328796386719, + "learning_rate": 3.405054948338314e-05, + "loss": 0.6776, + "step": 2200 + }, + { + "epoch": 0.8037246667883878, + "grad_norm": 1.0553945302963257, + "learning_rate": 3.404033703343179e-05, + "loss": 0.6918, + "step": 2201 + }, + { + "epoch": 0.804089830199014, + "grad_norm": 0.998306930065155, + "learning_rate": 3.4030117360290436e-05, + "loss": 0.6989, + "step": 2202 + }, + { + "epoch": 0.8044549936096403, + "grad_norm": 0.9074344038963318, + "learning_rate": 3.40198904692167e-05, + "loss": 0.688, + "step": 2203 + }, + { + "epoch": 0.8048201570202665, + "grad_norm": 1.1983717679977417, + "learning_rate": 3.4009656365471895e-05, + "loss": 0.7019, + "step": 2204 + }, + { + "epoch": 0.8051853204308929, + "grad_norm": 1.010562777519226, + "learning_rate": 3.399941505432106e-05, + "loss": 0.7024, + "step": 2205 + }, + { + "epoch": 0.8055504838415191, + "grad_norm": 1.1205374002456665, + "learning_rate": 3.398916654103294e-05, + "loss": 0.6799, + "step": 2206 + }, + { + "epoch": 0.8059156472521454, + "grad_norm": 0.6950740218162537, + "learning_rate": 3.397891083088e-05, + "loss": 0.6813, + "step": 2207 + }, + { + "epoch": 0.8062808106627716, + "grad_norm": 0.9909610152244568, + "learning_rate": 3.396864792913836e-05, + "loss": 0.665, + "step": 2208 + }, + { + "epoch": 0.8066459740733979, + "grad_norm": 1.141450047492981, + "learning_rate": 3.3958377841087894e-05, + "loss": 0.6785, + "step": 2209 + }, + { + "epoch": 0.8070111374840241, + "grad_norm": 0.7786381840705872, + "learning_rate": 3.3948100572012145e-05, + "loss": 0.668, + "step": 2210 + }, + { + "epoch": 0.8073763008946504, + "grad_norm": 2.428544521331787, + "learning_rate": 3.393781612719835e-05, + "loss": 0.6535, + "step": 2211 + }, + { + "epoch": 0.8077414643052766, + "grad_norm": 1.2587729692459106, + "learning_rate": 3.3927524511937446e-05, + "loss": 0.6436, + "step": 2212 + }, + { + "epoch": 0.8081066277159029, + "grad_norm": 0.8259543776512146, + "learning_rate": 3.391722573152406e-05, + "loss": 0.7026, + "step": 2213 + }, + { + "epoch": 0.8084717911265291, + "grad_norm": 0.7508890628814697, + "learning_rate": 3.39069197912565e-05, + "loss": 0.6814, + "step": 2214 + }, + { + "epoch": 0.8088369545371554, + "grad_norm": 1.0003023147583008, + "learning_rate": 3.389660669643676e-05, + "loss": 0.7075, + "step": 2215 + }, + { + "epoch": 0.8092021179477816, + "grad_norm": 1.233620047569275, + "learning_rate": 3.3886286452370505e-05, + "loss": 0.7073, + "step": 2216 + }, + { + "epoch": 0.8095672813584079, + "grad_norm": 0.9432274699211121, + "learning_rate": 3.387595906436709e-05, + "loss": 0.6842, + "step": 2217 + }, + { + "epoch": 0.8099324447690341, + "grad_norm": 0.9820045232772827, + "learning_rate": 3.386562453773955e-05, + "loss": 0.6941, + "step": 2218 + }, + { + "epoch": 0.8102976081796603, + "grad_norm": 0.9450540542602539, + "learning_rate": 3.3855282877804575e-05, + "loss": 0.707, + "step": 2219 + }, + { + "epoch": 0.8106627715902867, + "grad_norm": 0.8697330951690674, + "learning_rate": 3.384493408988254e-05, + "loss": 0.7067, + "step": 2220 + }, + { + "epoch": 0.8110279350009129, + "grad_norm": 1.2985196113586426, + "learning_rate": 3.3834578179297484e-05, + "loss": 0.705, + "step": 2221 + }, + { + "epoch": 0.8113930984115392, + "grad_norm": 0.7868382334709167, + "learning_rate": 3.3824215151377095e-05, + "loss": 0.6523, + "step": 2222 + }, + { + "epoch": 0.8117582618221654, + "grad_norm": 0.9879230260848999, + "learning_rate": 3.381384501145274e-05, + "loss": 0.6889, + "step": 2223 + }, + { + "epoch": 0.8121234252327917, + "grad_norm": 0.7710115909576416, + "learning_rate": 3.380346776485944e-05, + "loss": 0.6675, + "step": 2224 + }, + { + "epoch": 0.8124885886434179, + "grad_norm": 1.0650300979614258, + "learning_rate": 3.379308341693588e-05, + "loss": 0.6707, + "step": 2225 + }, + { + "epoch": 0.8128537520540442, + "grad_norm": 1.146630048751831, + "learning_rate": 3.378269197302438e-05, + "loss": 0.6971, + "step": 2226 + }, + { + "epoch": 0.8132189154646704, + "grad_norm": 0.9396244287490845, + "learning_rate": 3.3772293438470924e-05, + "loss": 0.6747, + "step": 2227 + }, + { + "epoch": 0.8135840788752967, + "grad_norm": 0.7806180715560913, + "learning_rate": 3.376188781862515e-05, + "loss": 0.6843, + "step": 2228 + }, + { + "epoch": 0.8139492422859229, + "grad_norm": 1.0326999425888062, + "learning_rate": 3.375147511884032e-05, + "loss": 0.7123, + "step": 2229 + }, + { + "epoch": 0.8143144056965492, + "grad_norm": 0.7732445597648621, + "learning_rate": 3.374105534447334e-05, + "loss": 0.6622, + "step": 2230 + }, + { + "epoch": 0.8146795691071754, + "grad_norm": 1.0355019569396973, + "learning_rate": 3.37306285008848e-05, + "loss": 0.6843, + "step": 2231 + }, + { + "epoch": 0.8150447325178017, + "grad_norm": 0.9251493215560913, + "learning_rate": 3.372019459343886e-05, + "loss": 0.6624, + "step": 2232 + }, + { + "epoch": 0.8154098959284279, + "grad_norm": 1.0084545612335205, + "learning_rate": 3.370975362750335e-05, + "loss": 0.6965, + "step": 2233 + }, + { + "epoch": 0.8157750593390543, + "grad_norm": 0.7215114235877991, + "learning_rate": 3.369930560844975e-05, + "loss": 0.6849, + "step": 2234 + }, + { + "epoch": 0.8161402227496805, + "grad_norm": 0.6900755167007446, + "learning_rate": 3.368885054165314e-05, + "loss": 0.6823, + "step": 2235 + }, + { + "epoch": 0.8165053861603068, + "grad_norm": 1.0220706462860107, + "learning_rate": 3.367838843249222e-05, + "loss": 0.675, + "step": 2236 + }, + { + "epoch": 0.816870549570933, + "grad_norm": 0.7549731135368347, + "learning_rate": 3.366791928634932e-05, + "loss": 0.6699, + "step": 2237 + }, + { + "epoch": 0.8172357129815593, + "grad_norm": 1.0896284580230713, + "learning_rate": 3.365744310861041e-05, + "loss": 0.6907, + "step": 2238 + }, + { + "epoch": 0.8176008763921855, + "grad_norm": 1.1018774509429932, + "learning_rate": 3.364695990466507e-05, + "loss": 0.6816, + "step": 2239 + }, + { + "epoch": 0.8179660398028118, + "grad_norm": 1.1370127201080322, + "learning_rate": 3.363646967990647e-05, + "loss": 0.6642, + "step": 2240 + }, + { + "epoch": 0.818331203213438, + "grad_norm": 1.2615329027175903, + "learning_rate": 3.3625972439731425e-05, + "loss": 0.7133, + "step": 2241 + }, + { + "epoch": 0.8186963666240643, + "grad_norm": 0.8279294967651367, + "learning_rate": 3.361546818954033e-05, + "loss": 0.6337, + "step": 2242 + }, + { + "epoch": 0.8190615300346905, + "grad_norm": 0.9214347004890442, + "learning_rate": 3.3604956934737206e-05, + "loss": 0.6723, + "step": 2243 + }, + { + "epoch": 0.8194266934453168, + "grad_norm": 0.9047579169273376, + "learning_rate": 3.359443868072967e-05, + "loss": 0.6589, + "step": 2244 + }, + { + "epoch": 0.819791856855943, + "grad_norm": 1.0666414499282837, + "learning_rate": 3.3583913432928945e-05, + "loss": 0.6718, + "step": 2245 + }, + { + "epoch": 0.8201570202665693, + "grad_norm": 0.9169327020645142, + "learning_rate": 3.357338119674985e-05, + "loss": 0.675, + "step": 2246 + }, + { + "epoch": 0.8205221836771955, + "grad_norm": 0.9106442332267761, + "learning_rate": 3.3562841977610796e-05, + "loss": 0.6606, + "step": 2247 + }, + { + "epoch": 0.8208873470878219, + "grad_norm": 0.6304537057876587, + "learning_rate": 3.355229578093378e-05, + "loss": 0.6827, + "step": 2248 + }, + { + "epoch": 0.821252510498448, + "grad_norm": 0.9223216772079468, + "learning_rate": 3.354174261214441e-05, + "loss": 0.6802, + "step": 2249 + }, + { + "epoch": 0.8216176739090744, + "grad_norm": 1.1750116348266602, + "learning_rate": 3.353118247667186e-05, + "loss": 0.6851, + "step": 2250 + }, + { + "epoch": 0.8219828373197006, + "grad_norm": 1.092160940170288, + "learning_rate": 3.35206153799489e-05, + "loss": 0.6767, + "step": 2251 + }, + { + "epoch": 0.8223480007303268, + "grad_norm": 0.8775092363357544, + "learning_rate": 3.351004132741188e-05, + "loss": 0.6913, + "step": 2252 + }, + { + "epoch": 0.8227131641409531, + "grad_norm": 1.434619665145874, + "learning_rate": 3.349946032450071e-05, + "loss": 0.7225, + "step": 2253 + }, + { + "epoch": 0.8230783275515793, + "grad_norm": 0.6954415440559387, + "learning_rate": 3.348887237665891e-05, + "loss": 0.6674, + "step": 2254 + }, + { + "epoch": 0.8234434909622056, + "grad_norm": 0.7986772656440735, + "learning_rate": 3.3478277489333554e-05, + "loss": 0.681, + "step": 2255 + }, + { + "epoch": 0.8238086543728318, + "grad_norm": 1.0696231126785278, + "learning_rate": 3.346767566797527e-05, + "loss": 0.6731, + "step": 2256 + }, + { + "epoch": 0.8241738177834581, + "grad_norm": 1.0807222127914429, + "learning_rate": 3.345706691803828e-05, + "loss": 0.7114, + "step": 2257 + }, + { + "epoch": 0.8245389811940843, + "grad_norm": 0.8983349800109863, + "learning_rate": 3.344645124498036e-05, + "loss": 0.7061, + "step": 2258 + }, + { + "epoch": 0.8249041446047106, + "grad_norm": 1.091439127922058, + "learning_rate": 3.3435828654262844e-05, + "loss": 0.712, + "step": 2259 + }, + { + "epoch": 0.8252693080153368, + "grad_norm": 0.816214382648468, + "learning_rate": 3.3425199151350636e-05, + "loss": 0.6725, + "step": 2260 + }, + { + "epoch": 0.8256344714259631, + "grad_norm": 0.6547426581382751, + "learning_rate": 3.341456274171218e-05, + "loss": 0.6605, + "step": 2261 + }, + { + "epoch": 0.8259996348365893, + "grad_norm": 0.8819223046302795, + "learning_rate": 3.340391943081949e-05, + "loss": 0.6647, + "step": 2262 + }, + { + "epoch": 0.8263647982472156, + "grad_norm": 0.8542426228523254, + "learning_rate": 3.339326922414812e-05, + "loss": 0.6961, + "step": 2263 + }, + { + "epoch": 0.8267299616578418, + "grad_norm": 1.0729966163635254, + "learning_rate": 3.3382612127177166e-05, + "loss": 0.6595, + "step": 2264 + }, + { + "epoch": 0.8270951250684682, + "grad_norm": 1.184866189956665, + "learning_rate": 3.337194814538929e-05, + "loss": 0.6446, + "step": 2265 + }, + { + "epoch": 0.8274602884790944, + "grad_norm": 0.8264432549476624, + "learning_rate": 3.336127728427067e-05, + "loss": 0.6735, + "step": 2266 + }, + { + "epoch": 0.8278254518897207, + "grad_norm": 0.7857865691184998, + "learning_rate": 3.335059954931105e-05, + "loss": 0.6556, + "step": 2267 + }, + { + "epoch": 0.8281906153003469, + "grad_norm": 0.828025221824646, + "learning_rate": 3.333991494600368e-05, + "loss": 0.6747, + "step": 2268 + }, + { + "epoch": 0.8285557787109732, + "grad_norm": 1.0558624267578125, + "learning_rate": 3.332922347984537e-05, + "loss": 0.7017, + "step": 2269 + }, + { + "epoch": 0.8289209421215994, + "grad_norm": 0.8370004296302795, + "learning_rate": 3.331852515633645e-05, + "loss": 0.6711, + "step": 2270 + }, + { + "epoch": 0.8292861055322257, + "grad_norm": 0.8168753981590271, + "learning_rate": 3.330781998098078e-05, + "loss": 0.6727, + "step": 2271 + }, + { + "epoch": 0.8296512689428519, + "grad_norm": 0.7985580563545227, + "learning_rate": 3.3297107959285734e-05, + "loss": 0.6577, + "step": 2272 + }, + { + "epoch": 0.8300164323534782, + "grad_norm": 1.1410428285598755, + "learning_rate": 3.328638909676222e-05, + "loss": 0.6677, + "step": 2273 + }, + { + "epoch": 0.8303815957641044, + "grad_norm": 1.0028425455093384, + "learning_rate": 3.327566339892467e-05, + "loss": 0.6589, + "step": 2274 + }, + { + "epoch": 0.8307467591747307, + "grad_norm": 0.9950466156005859, + "learning_rate": 3.326493087129102e-05, + "loss": 0.6871, + "step": 2275 + }, + { + "epoch": 0.8311119225853569, + "grad_norm": 0.6724287271499634, + "learning_rate": 3.325419151938273e-05, + "loss": 0.6873, + "step": 2276 + }, + { + "epoch": 0.8314770859959832, + "grad_norm": 0.7970866560935974, + "learning_rate": 3.3243445348724756e-05, + "loss": 0.6788, + "step": 2277 + }, + { + "epoch": 0.8318422494066094, + "grad_norm": 1.309187412261963, + "learning_rate": 3.323269236484557e-05, + "loss": 0.6971, + "step": 2278 + }, + { + "epoch": 0.8322074128172358, + "grad_norm": 0.9678725004196167, + "learning_rate": 3.322193257327716e-05, + "loss": 0.6932, + "step": 2279 + }, + { + "epoch": 0.832572576227862, + "grad_norm": 0.7458254098892212, + "learning_rate": 3.321116597955501e-05, + "loss": 0.668, + "step": 2280 + }, + { + "epoch": 0.8329377396384883, + "grad_norm": 2.006092071533203, + "learning_rate": 3.320039258921809e-05, + "loss": 0.652, + "step": 2281 + }, + { + "epoch": 0.8333029030491145, + "grad_norm": 0.9141222238540649, + "learning_rate": 3.318961240780889e-05, + "loss": 0.6888, + "step": 2282 + }, + { + "epoch": 0.8336680664597408, + "grad_norm": 1.0954430103302002, + "learning_rate": 3.317882544087336e-05, + "loss": 0.6824, + "step": 2283 + }, + { + "epoch": 0.834033229870367, + "grad_norm": 0.8255243301391602, + "learning_rate": 3.316803169396098e-05, + "loss": 0.6443, + "step": 2284 + }, + { + "epoch": 0.8343983932809932, + "grad_norm": 1.5680209398269653, + "learning_rate": 3.31572311726247e-05, + "loss": 0.674, + "step": 2285 + }, + { + "epoch": 0.8347635566916195, + "grad_norm": 1.061949610710144, + "learning_rate": 3.3146423882420935e-05, + "loss": 0.717, + "step": 2286 + }, + { + "epoch": 0.8351287201022457, + "grad_norm": 0.993720531463623, + "learning_rate": 3.313560982890963e-05, + "loss": 0.6495, + "step": 2287 + }, + { + "epoch": 0.835493883512872, + "grad_norm": 0.9374502301216125, + "learning_rate": 3.3124789017654154e-05, + "loss": 0.6553, + "step": 2288 + }, + { + "epoch": 0.8358590469234982, + "grad_norm": 0.6694275140762329, + "learning_rate": 3.31139614542214e-05, + "loss": 0.6793, + "step": 2289 + }, + { + "epoch": 0.8362242103341245, + "grad_norm": 1.1059433221817017, + "learning_rate": 3.310312714418171e-05, + "loss": 0.705, + "step": 2290 + }, + { + "epoch": 0.8365893737447507, + "grad_norm": 0.9983911514282227, + "learning_rate": 3.3092286093108894e-05, + "loss": 0.6574, + "step": 2291 + }, + { + "epoch": 0.836954537155377, + "grad_norm": 0.8774459362030029, + "learning_rate": 3.308143830658025e-05, + "loss": 0.673, + "step": 2292 + }, + { + "epoch": 0.8373197005660032, + "grad_norm": 0.8330726027488708, + "learning_rate": 3.307058379017652e-05, + "loss": 0.6395, + "step": 2293 + }, + { + "epoch": 0.8376848639766296, + "grad_norm": 0.9212683439254761, + "learning_rate": 3.305972254948191e-05, + "loss": 0.6776, + "step": 2294 + }, + { + "epoch": 0.8380500273872558, + "grad_norm": 1.1847418546676636, + "learning_rate": 3.304885459008412e-05, + "loss": 0.6734, + "step": 2295 + }, + { + "epoch": 0.8384151907978821, + "grad_norm": 1.0819242000579834, + "learning_rate": 3.303797991757425e-05, + "loss": 0.7296, + "step": 2296 + }, + { + "epoch": 0.8387803542085083, + "grad_norm": 0.9311473369598389, + "learning_rate": 3.3027098537546904e-05, + "loss": 0.6743, + "step": 2297 + }, + { + "epoch": 0.8391455176191346, + "grad_norm": 1.4701728820800781, + "learning_rate": 3.3016210455600094e-05, + "loss": 0.6945, + "step": 2298 + }, + { + "epoch": 0.8395106810297608, + "grad_norm": 0.6400688886642456, + "learning_rate": 3.300531567733532e-05, + "loss": 0.6655, + "step": 2299 + }, + { + "epoch": 0.8398758444403871, + "grad_norm": 1.135380744934082, + "learning_rate": 3.2994414208357496e-05, + "loss": 0.6844, + "step": 2300 + }, + { + "epoch": 0.8402410078510133, + "grad_norm": 1.25801420211792, + "learning_rate": 3.2983506054274995e-05, + "loss": 0.6521, + "step": 2301 + }, + { + "epoch": 0.8406061712616396, + "grad_norm": 1.4099013805389404, + "learning_rate": 3.297259122069963e-05, + "loss": 0.7244, + "step": 2302 + }, + { + "epoch": 0.8409713346722658, + "grad_norm": 0.9134430885314941, + "learning_rate": 3.296166971324664e-05, + "loss": 0.7045, + "step": 2303 + }, + { + "epoch": 0.8413364980828921, + "grad_norm": 0.8753640651702881, + "learning_rate": 3.29507415375347e-05, + "loss": 0.6834, + "step": 2304 + }, + { + "epoch": 0.8417016614935183, + "grad_norm": 1.0376008749008179, + "learning_rate": 3.293980669918592e-05, + "loss": 0.6962, + "step": 2305 + }, + { + "epoch": 0.8420668249041446, + "grad_norm": 0.8525744080543518, + "learning_rate": 3.292886520382583e-05, + "loss": 0.6828, + "step": 2306 + }, + { + "epoch": 0.8424319883147708, + "grad_norm": 1.1410913467407227, + "learning_rate": 3.29179170570834e-05, + "loss": 0.6629, + "step": 2307 + }, + { + "epoch": 0.8427971517253972, + "grad_norm": 0.9425336718559265, + "learning_rate": 3.2906962264591014e-05, + "loss": 0.6633, + "step": 2308 + }, + { + "epoch": 0.8431623151360234, + "grad_norm": 1.1711310148239136, + "learning_rate": 3.2896000831984456e-05, + "loss": 0.6816, + "step": 2309 + }, + { + "epoch": 0.8435274785466497, + "grad_norm": 0.7658752799034119, + "learning_rate": 3.288503276490296e-05, + "loss": 0.7057, + "step": 2310 + }, + { + "epoch": 0.8438926419572759, + "grad_norm": 0.8208125829696655, + "learning_rate": 3.287405806898915e-05, + "loss": 0.6657, + "step": 2311 + }, + { + "epoch": 0.8442578053679022, + "grad_norm": 0.9931808710098267, + "learning_rate": 3.2863076749889064e-05, + "loss": 0.6545, + "step": 2312 + }, + { + "epoch": 0.8446229687785284, + "grad_norm": 1.2441120147705078, + "learning_rate": 3.285208881325216e-05, + "loss": 0.6757, + "step": 2313 + }, + { + "epoch": 0.8449881321891547, + "grad_norm": 1.0434895753860474, + "learning_rate": 3.2841094264731274e-05, + "loss": 0.6835, + "step": 2314 + }, + { + "epoch": 0.8453532955997809, + "grad_norm": 0.935052752494812, + "learning_rate": 3.283009310998268e-05, + "loss": 0.6379, + "step": 2315 + }, + { + "epoch": 0.8457184590104072, + "grad_norm": 1.2280255556106567, + "learning_rate": 3.2819085354666015e-05, + "loss": 0.6865, + "step": 2316 + }, + { + "epoch": 0.8460836224210334, + "grad_norm": 0.9368286728858948, + "learning_rate": 3.280807100444433e-05, + "loss": 0.6586, + "step": 2317 + }, + { + "epoch": 0.8464487858316597, + "grad_norm": 0.6505674719810486, + "learning_rate": 3.279705006498408e-05, + "loss": 0.6982, + "step": 2318 + }, + { + "epoch": 0.8468139492422859, + "grad_norm": 1.2371175289154053, + "learning_rate": 3.278602254195507e-05, + "loss": 0.6849, + "step": 2319 + }, + { + "epoch": 0.8471791126529121, + "grad_norm": 0.9046905040740967, + "learning_rate": 3.277498844103055e-05, + "loss": 0.6556, + "step": 2320 + }, + { + "epoch": 0.8475442760635384, + "grad_norm": 1.8878294229507446, + "learning_rate": 3.276394776788709e-05, + "loss": 0.6597, + "step": 2321 + }, + { + "epoch": 0.8479094394741646, + "grad_norm": 0.9316428899765015, + "learning_rate": 3.27529005282047e-05, + "loss": 0.6913, + "step": 2322 + }, + { + "epoch": 0.848274602884791, + "grad_norm": 1.0353072881698608, + "learning_rate": 3.274184672766673e-05, + "loss": 0.6635, + "step": 2323 + }, + { + "epoch": 0.8486397662954172, + "grad_norm": 0.8999189138412476, + "learning_rate": 3.2730786371959906e-05, + "loss": 0.6754, + "step": 2324 + }, + { + "epoch": 0.8490049297060435, + "grad_norm": 1.006822943687439, + "learning_rate": 3.271971946677436e-05, + "loss": 0.6785, + "step": 2325 + }, + { + "epoch": 0.8493700931166697, + "grad_norm": 0.8299421072006226, + "learning_rate": 3.270864601780355e-05, + "loss": 0.6778, + "step": 2326 + }, + { + "epoch": 0.849735256527296, + "grad_norm": 1.0007952451705933, + "learning_rate": 3.269756603074433e-05, + "loss": 0.6829, + "step": 2327 + }, + { + "epoch": 0.8501004199379222, + "grad_norm": 0.8802875876426697, + "learning_rate": 3.268647951129692e-05, + "loss": 0.6588, + "step": 2328 + }, + { + "epoch": 0.8504655833485485, + "grad_norm": 1.0025578737258911, + "learning_rate": 3.267538646516487e-05, + "loss": 0.65, + "step": 2329 + }, + { + "epoch": 0.8508307467591747, + "grad_norm": 0.8563314080238342, + "learning_rate": 3.266428689805512e-05, + "loss": 0.681, + "step": 2330 + }, + { + "epoch": 0.851195910169801, + "grad_norm": 1.299754023551941, + "learning_rate": 3.265318081567794e-05, + "loss": 0.678, + "step": 2331 + }, + { + "epoch": 0.8515610735804272, + "grad_norm": 0.7509905099868774, + "learning_rate": 3.2642068223746975e-05, + "loss": 0.6325, + "step": 2332 + }, + { + "epoch": 0.8519262369910535, + "grad_norm": 0.9449114203453064, + "learning_rate": 3.2630949127979204e-05, + "loss": 0.6769, + "step": 2333 + }, + { + "epoch": 0.8522914004016797, + "grad_norm": 0.8018974661827087, + "learning_rate": 3.2619823534094956e-05, + "loss": 0.6597, + "step": 2334 + }, + { + "epoch": 0.852656563812306, + "grad_norm": 1.187442421913147, + "learning_rate": 3.26086914478179e-05, + "loss": 0.6749, + "step": 2335 + }, + { + "epoch": 0.8530217272229322, + "grad_norm": 1.0720336437225342, + "learning_rate": 3.259755287487505e-05, + "loss": 0.6584, + "step": 2336 + }, + { + "epoch": 0.8533868906335585, + "grad_norm": 0.6564728617668152, + "learning_rate": 3.258640782099675e-05, + "loss": 0.6418, + "step": 2337 + }, + { + "epoch": 0.8537520540441847, + "grad_norm": 0.9528270363807678, + "learning_rate": 3.257525629191669e-05, + "loss": 0.6414, + "step": 2338 + }, + { + "epoch": 0.8541172174548111, + "grad_norm": 1.947891116142273, + "learning_rate": 3.2564098293371884e-05, + "loss": 0.6577, + "step": 2339 + }, + { + "epoch": 0.8544823808654373, + "grad_norm": 1.029895305633545, + "learning_rate": 3.255293383110267e-05, + "loss": 0.6412, + "step": 2340 + }, + { + "epoch": 0.8548475442760636, + "grad_norm": 0.9232720136642456, + "learning_rate": 3.2541762910852716e-05, + "loss": 0.6858, + "step": 2341 + }, + { + "epoch": 0.8552127076866898, + "grad_norm": 1.1750651597976685, + "learning_rate": 3.253058553836902e-05, + "loss": 0.6769, + "step": 2342 + }, + { + "epoch": 0.8555778710973161, + "grad_norm": 1.049420952796936, + "learning_rate": 3.251940171940188e-05, + "loss": 0.6663, + "step": 2343 + }, + { + "epoch": 0.8559430345079423, + "grad_norm": 0.8713171482086182, + "learning_rate": 3.250821145970493e-05, + "loss": 0.6758, + "step": 2344 + }, + { + "epoch": 0.8563081979185686, + "grad_norm": 1.214188814163208, + "learning_rate": 3.2497014765035105e-05, + "loss": 0.6747, + "step": 2345 + }, + { + "epoch": 0.8566733613291948, + "grad_norm": 1.0018802881240845, + "learning_rate": 3.2485811641152655e-05, + "loss": 0.6509, + "step": 2346 + }, + { + "epoch": 0.8570385247398211, + "grad_norm": 0.8035647869110107, + "learning_rate": 3.2474602093821145e-05, + "loss": 0.6946, + "step": 2347 + }, + { + "epoch": 0.8574036881504473, + "grad_norm": 1.55748450756073, + "learning_rate": 3.246338612880743e-05, + "loss": 0.6693, + "step": 2348 + }, + { + "epoch": 0.8577688515610736, + "grad_norm": 1.0306609869003296, + "learning_rate": 3.245216375188168e-05, + "loss": 0.6867, + "step": 2349 + }, + { + "epoch": 0.8581340149716998, + "grad_norm": 0.909618079662323, + "learning_rate": 3.2440934968817355e-05, + "loss": 0.6755, + "step": 2350 + }, + { + "epoch": 0.8584991783823261, + "grad_norm": 1.1233172416687012, + "learning_rate": 3.2429699785391205e-05, + "loss": 0.6541, + "step": 2351 + }, + { + "epoch": 0.8588643417929523, + "grad_norm": 0.6875085234642029, + "learning_rate": 3.241845820738329e-05, + "loss": 0.6486, + "step": 2352 + }, + { + "epoch": 0.8592295052035785, + "grad_norm": 1.0278838872909546, + "learning_rate": 3.240721024057695e-05, + "loss": 0.6849, + "step": 2353 + }, + { + "epoch": 0.8595946686142049, + "grad_norm": 0.888755202293396, + "learning_rate": 3.239595589075881e-05, + "loss": 0.6599, + "step": 2354 + }, + { + "epoch": 0.8599598320248311, + "grad_norm": 1.1310205459594727, + "learning_rate": 3.238469516371879e-05, + "loss": 0.686, + "step": 2355 + }, + { + "epoch": 0.8603249954354574, + "grad_norm": 1.0039758682250977, + "learning_rate": 3.237342806525007e-05, + "loss": 0.6724, + "step": 2356 + }, + { + "epoch": 0.8606901588460836, + "grad_norm": 0.7611724138259888, + "learning_rate": 3.236215460114913e-05, + "loss": 0.6803, + "step": 2357 + }, + { + "epoch": 0.8610553222567099, + "grad_norm": 1.0401297807693481, + "learning_rate": 3.23508747772157e-05, + "loss": 0.7059, + "step": 2358 + }, + { + "epoch": 0.8614204856673361, + "grad_norm": 0.7942869663238525, + "learning_rate": 3.233958859925282e-05, + "loss": 0.675, + "step": 2359 + }, + { + "epoch": 0.8617856490779624, + "grad_norm": 0.9569483399391174, + "learning_rate": 3.232829607306675e-05, + "loss": 0.6613, + "step": 2360 + }, + { + "epoch": 0.8621508124885886, + "grad_norm": 1.113763689994812, + "learning_rate": 3.231699720446706e-05, + "loss": 0.6732, + "step": 2361 + }, + { + "epoch": 0.8625159758992149, + "grad_norm": 0.8510305285453796, + "learning_rate": 3.230569199926656e-05, + "loss": 0.6957, + "step": 2362 + }, + { + "epoch": 0.8628811393098411, + "grad_norm": 0.9915270805358887, + "learning_rate": 3.2294380463281315e-05, + "loss": 0.6472, + "step": 2363 + }, + { + "epoch": 0.8632463027204674, + "grad_norm": 0.9330098032951355, + "learning_rate": 3.228306260233067e-05, + "loss": 0.6506, + "step": 2364 + }, + { + "epoch": 0.8636114661310936, + "grad_norm": 0.9357939958572388, + "learning_rate": 3.227173842223721e-05, + "loss": 0.7097, + "step": 2365 + }, + { + "epoch": 0.8639766295417199, + "grad_norm": 1.0856742858886719, + "learning_rate": 3.226040792882676e-05, + "loss": 0.6552, + "step": 2366 + }, + { + "epoch": 0.8643417929523461, + "grad_norm": 1.2186148166656494, + "learning_rate": 3.224907112792841e-05, + "loss": 0.6762, + "step": 2367 + }, + { + "epoch": 0.8647069563629725, + "grad_norm": 0.9334109425544739, + "learning_rate": 3.223772802537449e-05, + "loss": 0.7025, + "step": 2368 + }, + { + "epoch": 0.8650721197735987, + "grad_norm": 0.8101790547370911, + "learning_rate": 3.2226378627000574e-05, + "loss": 0.6598, + "step": 2369 + }, + { + "epoch": 0.865437283184225, + "grad_norm": 1.0096237659454346, + "learning_rate": 3.2215022938645465e-05, + "loss": 0.6761, + "step": 2370 + }, + { + "epoch": 0.8658024465948512, + "grad_norm": 0.7132946252822876, + "learning_rate": 3.2203660966151206e-05, + "loss": 0.6661, + "step": 2371 + }, + { + "epoch": 0.8661676100054775, + "grad_norm": 0.8075200319290161, + "learning_rate": 3.219229271536309e-05, + "loss": 0.6654, + "step": 2372 + }, + { + "epoch": 0.8665327734161037, + "grad_norm": 3.128431797027588, + "learning_rate": 3.218091819212962e-05, + "loss": 0.697, + "step": 2373 + }, + { + "epoch": 0.86689793682673, + "grad_norm": 1.0566328763961792, + "learning_rate": 3.2169537402302525e-05, + "loss": 0.6853, + "step": 2374 + }, + { + "epoch": 0.8672631002373562, + "grad_norm": 1.0124942064285278, + "learning_rate": 3.215815035173678e-05, + "loss": 0.6753, + "step": 2375 + }, + { + "epoch": 0.8676282636479825, + "grad_norm": 0.8646689057350159, + "learning_rate": 3.214675704629054e-05, + "loss": 0.6724, + "step": 2376 + }, + { + "epoch": 0.8679934270586087, + "grad_norm": 1.017532467842102, + "learning_rate": 3.213535749182523e-05, + "loss": 0.6837, + "step": 2377 + }, + { + "epoch": 0.868358590469235, + "grad_norm": 0.8565889596939087, + "learning_rate": 3.212395169420544e-05, + "loss": 0.6915, + "step": 2378 + }, + { + "epoch": 0.8687237538798612, + "grad_norm": 0.7826008200645447, + "learning_rate": 3.211253965929902e-05, + "loss": 0.6804, + "step": 2379 + }, + { + "epoch": 0.8690889172904875, + "grad_norm": 1.0141047239303589, + "learning_rate": 3.2101121392976986e-05, + "loss": 0.6706, + "step": 2380 + }, + { + "epoch": 0.8694540807011137, + "grad_norm": 0.8811666965484619, + "learning_rate": 3.2089696901113576e-05, + "loss": 0.6869, + "step": 2381 + }, + { + "epoch": 0.86981924411174, + "grad_norm": 1.180303931236267, + "learning_rate": 3.2078266189586256e-05, + "loss": 0.7188, + "step": 2382 + }, + { + "epoch": 0.8701844075223663, + "grad_norm": 1.0213888883590698, + "learning_rate": 3.2066829264275644e-05, + "loss": 0.6647, + "step": 2383 + }, + { + "epoch": 0.8705495709329926, + "grad_norm": 0.7763113975524902, + "learning_rate": 3.205538613106558e-05, + "loss": 0.6511, + "step": 2384 + }, + { + "epoch": 0.8709147343436188, + "grad_norm": 1.2403777837753296, + "learning_rate": 3.204393679584311e-05, + "loss": 0.6908, + "step": 2385 + }, + { + "epoch": 0.871279897754245, + "grad_norm": 0.8211830854415894, + "learning_rate": 3.203248126449845e-05, + "loss": 0.6514, + "step": 2386 + }, + { + "epoch": 0.8716450611648713, + "grad_norm": 0.8894174098968506, + "learning_rate": 3.2021019542925015e-05, + "loss": 0.6599, + "step": 2387 + }, + { + "epoch": 0.8720102245754975, + "grad_norm": 0.8494507670402527, + "learning_rate": 3.20095516370194e-05, + "loss": 0.6791, + "step": 2388 + }, + { + "epoch": 0.8723753879861238, + "grad_norm": 1.1291923522949219, + "learning_rate": 3.1998077552681387e-05, + "loss": 0.7286, + "step": 2389 + }, + { + "epoch": 0.87274055139675, + "grad_norm": 0.8444809913635254, + "learning_rate": 3.198659729581391e-05, + "loss": 0.6595, + "step": 2390 + }, + { + "epoch": 0.8731057148073763, + "grad_norm": 0.9420198202133179, + "learning_rate": 3.197511087232313e-05, + "loss": 0.6908, + "step": 2391 + }, + { + "epoch": 0.8734708782180025, + "grad_norm": 1.2120648622512817, + "learning_rate": 3.1963618288118334e-05, + "loss": 0.6942, + "step": 2392 + }, + { + "epoch": 0.8738360416286288, + "grad_norm": 0.9157645106315613, + "learning_rate": 3.195211954911199e-05, + "loss": 0.7096, + "step": 2393 + }, + { + "epoch": 0.874201205039255, + "grad_norm": 0.8355812430381775, + "learning_rate": 3.194061466121976e-05, + "loss": 0.6897, + "step": 2394 + }, + { + "epoch": 0.8745663684498813, + "grad_norm": 0.9789050221443176, + "learning_rate": 3.192910363036043e-05, + "loss": 0.6702, + "step": 2395 + }, + { + "epoch": 0.8749315318605075, + "grad_norm": 1.2427998781204224, + "learning_rate": 3.191758646245596e-05, + "loss": 0.6486, + "step": 2396 + }, + { + "epoch": 0.8752966952711339, + "grad_norm": 1.148383378982544, + "learning_rate": 3.1906063163431485e-05, + "loss": 0.6512, + "step": 2397 + }, + { + "epoch": 0.87566185868176, + "grad_norm": 0.9681849479675293, + "learning_rate": 3.189453373921527e-05, + "loss": 0.6417, + "step": 2398 + }, + { + "epoch": 0.8760270220923864, + "grad_norm": 0.7402408123016357, + "learning_rate": 3.1882998195738744e-05, + "loss": 0.6536, + "step": 2399 + }, + { + "epoch": 0.8763921855030126, + "grad_norm": 0.9436914920806885, + "learning_rate": 3.187145653893648e-05, + "loss": 0.6771, + "step": 2400 + }, + { + "epoch": 0.8767573489136389, + "grad_norm": 1.0563963651657104, + "learning_rate": 3.1859908774746205e-05, + "loss": 0.6899, + "step": 2401 + }, + { + "epoch": 0.8771225123242651, + "grad_norm": 0.9120274186134338, + "learning_rate": 3.184835490910877e-05, + "loss": 0.7038, + "step": 2402 + }, + { + "epoch": 0.8774876757348914, + "grad_norm": 0.7715966701507568, + "learning_rate": 3.1836794947968175e-05, + "loss": 0.6648, + "step": 2403 + }, + { + "epoch": 0.8778528391455176, + "grad_norm": 1.047858715057373, + "learning_rate": 3.182522889727157e-05, + "loss": 0.6703, + "step": 2404 + }, + { + "epoch": 0.8782180025561439, + "grad_norm": 0.652629017829895, + "learning_rate": 3.1813656762969206e-05, + "loss": 0.666, + "step": 2405 + }, + { + "epoch": 0.8785831659667701, + "grad_norm": 0.8451604843139648, + "learning_rate": 3.180207855101449e-05, + "loss": 0.7081, + "step": 2406 + }, + { + "epoch": 0.8789483293773964, + "grad_norm": 0.7865225076675415, + "learning_rate": 3.1790494267363954e-05, + "loss": 0.6756, + "step": 2407 + }, + { + "epoch": 0.8793134927880226, + "grad_norm": 1.1484609842300415, + "learning_rate": 3.177890391797724e-05, + "loss": 0.6816, + "step": 2408 + }, + { + "epoch": 0.8796786561986489, + "grad_norm": 0.7304513454437256, + "learning_rate": 3.176730750881711e-05, + "loss": 0.69, + "step": 2409 + }, + { + "epoch": 0.8800438196092751, + "grad_norm": 1.2418588399887085, + "learning_rate": 3.1755705045849465e-05, + "loss": 0.6796, + "step": 2410 + }, + { + "epoch": 0.8804089830199014, + "grad_norm": 0.636386513710022, + "learning_rate": 3.17440965350433e-05, + "loss": 0.6747, + "step": 2411 + }, + { + "epoch": 0.8807741464305276, + "grad_norm": 0.9517824649810791, + "learning_rate": 3.173248198237073e-05, + "loss": 0.673, + "step": 2412 + }, + { + "epoch": 0.881139309841154, + "grad_norm": 1.097337007522583, + "learning_rate": 3.172086139380698e-05, + "loss": 0.6783, + "step": 2413 + }, + { + "epoch": 0.8815044732517802, + "grad_norm": 0.9089690446853638, + "learning_rate": 3.170923477533036e-05, + "loss": 0.6724, + "step": 2414 + }, + { + "epoch": 0.8818696366624065, + "grad_norm": 1.1122137308120728, + "learning_rate": 3.169760213292232e-05, + "loss": 0.6299, + "step": 2415 + }, + { + "epoch": 0.8822348000730327, + "grad_norm": 0.730311930179596, + "learning_rate": 3.168596347256737e-05, + "loss": 0.6903, + "step": 2416 + }, + { + "epoch": 0.882599963483659, + "grad_norm": 0.7182562351226807, + "learning_rate": 3.1674318800253146e-05, + "loss": 0.6815, + "step": 2417 + }, + { + "epoch": 0.8829651268942852, + "grad_norm": 0.8872962594032288, + "learning_rate": 3.166266812197036e-05, + "loss": 0.658, + "step": 2418 + }, + { + "epoch": 0.8833302903049115, + "grad_norm": 1.317947268486023, + "learning_rate": 3.1651011443712825e-05, + "loss": 0.6667, + "step": 2419 + }, + { + "epoch": 0.8836954537155377, + "grad_norm": 0.9207883477210999, + "learning_rate": 3.1639348771477424e-05, + "loss": 0.6953, + "step": 2420 + }, + { + "epoch": 0.8840606171261639, + "grad_norm": 1.3670681715011597, + "learning_rate": 3.1627680111264134e-05, + "loss": 0.6684, + "step": 2421 + }, + { + "epoch": 0.8844257805367902, + "grad_norm": 1.701436161994934, + "learning_rate": 3.161600546907602e-05, + "loss": 0.6449, + "step": 2422 + }, + { + "epoch": 0.8847909439474164, + "grad_norm": 0.8770363330841064, + "learning_rate": 3.160432485091922e-05, + "loss": 0.6526, + "step": 2423 + }, + { + "epoch": 0.8851561073580427, + "grad_norm": 0.7543882131576538, + "learning_rate": 3.1592638262802926e-05, + "loss": 0.6595, + "step": 2424 + }, + { + "epoch": 0.8855212707686689, + "grad_norm": 1.0555789470672607, + "learning_rate": 3.1580945710739435e-05, + "loss": 0.6508, + "step": 2425 + }, + { + "epoch": 0.8858864341792952, + "grad_norm": 0.9207205772399902, + "learning_rate": 3.156924720074408e-05, + "loss": 0.649, + "step": 2426 + }, + { + "epoch": 0.8862515975899214, + "grad_norm": 0.8387048244476318, + "learning_rate": 3.1557542738835295e-05, + "loss": 0.6737, + "step": 2427 + }, + { + "epoch": 0.8866167610005478, + "grad_norm": 0.9562423825263977, + "learning_rate": 3.154583233103455e-05, + "loss": 0.6929, + "step": 2428 + }, + { + "epoch": 0.886981924411174, + "grad_norm": 1.06221342086792, + "learning_rate": 3.153411598336637e-05, + "loss": 0.6701, + "step": 2429 + }, + { + "epoch": 0.8873470878218003, + "grad_norm": 1.1556217670440674, + "learning_rate": 3.1522393701858353e-05, + "loss": 0.6714, + "step": 2430 + }, + { + "epoch": 0.8877122512324265, + "grad_norm": 1.1550284624099731, + "learning_rate": 3.151066549254115e-05, + "loss": 0.6792, + "step": 2431 + }, + { + "epoch": 0.8880774146430528, + "grad_norm": 1.2821412086486816, + "learning_rate": 3.149893136144843e-05, + "loss": 0.6675, + "step": 2432 + }, + { + "epoch": 0.888442578053679, + "grad_norm": 1.109316110610962, + "learning_rate": 3.148719131461695e-05, + "loss": 0.6572, + "step": 2433 + }, + { + "epoch": 0.8888077414643053, + "grad_norm": 1.0211321115493774, + "learning_rate": 3.14754453580865e-05, + "loss": 0.7051, + "step": 2434 + }, + { + "epoch": 0.8891729048749315, + "grad_norm": 0.8004649877548218, + "learning_rate": 3.1463693497899894e-05, + "loss": 0.6631, + "step": 2435 + }, + { + "epoch": 0.8895380682855578, + "grad_norm": 0.8751883506774902, + "learning_rate": 3.145193574010298e-05, + "loss": 0.6337, + "step": 2436 + }, + { + "epoch": 0.889903231696184, + "grad_norm": 1.1010115146636963, + "learning_rate": 3.1440172090744674e-05, + "loss": 0.6954, + "step": 2437 + }, + { + "epoch": 0.8902683951068103, + "grad_norm": 0.675520122051239, + "learning_rate": 3.1428402555876896e-05, + "loss": 0.6704, + "step": 2438 + }, + { + "epoch": 0.8906335585174365, + "grad_norm": 1.0251437425613403, + "learning_rate": 3.1416627141554595e-05, + "loss": 0.6634, + "step": 2439 + }, + { + "epoch": 0.8909987219280628, + "grad_norm": 0.8706310987472534, + "learning_rate": 3.1404845853835744e-05, + "loss": 0.709, + "step": 2440 + }, + { + "epoch": 0.891363885338689, + "grad_norm": 0.9390824437141418, + "learning_rate": 3.139305869878135e-05, + "loss": 0.6915, + "step": 2441 + }, + { + "epoch": 0.8917290487493154, + "grad_norm": 0.9122499227523804, + "learning_rate": 3.1381265682455436e-05, + "loss": 0.6764, + "step": 2442 + }, + { + "epoch": 0.8920942121599416, + "grad_norm": 1.045343279838562, + "learning_rate": 3.136946681092503e-05, + "loss": 0.6748, + "step": 2443 + }, + { + "epoch": 0.8924593755705679, + "grad_norm": 1.004455804824829, + "learning_rate": 3.135766209026017e-05, + "loss": 0.6948, + "step": 2444 + }, + { + "epoch": 0.8928245389811941, + "grad_norm": 0.9815378785133362, + "learning_rate": 3.134585152653393e-05, + "loss": 0.6882, + "step": 2445 + }, + { + "epoch": 0.8931897023918204, + "grad_norm": 0.958922803401947, + "learning_rate": 3.133403512582236e-05, + "loss": 0.6949, + "step": 2446 + }, + { + "epoch": 0.8935548658024466, + "grad_norm": 1.0713224411010742, + "learning_rate": 3.132221289420451e-05, + "loss": 0.6835, + "step": 2447 + }, + { + "epoch": 0.8939200292130729, + "grad_norm": 1.062995433807373, + "learning_rate": 3.131038483776247e-05, + "loss": 0.6827, + "step": 2448 + }, + { + "epoch": 0.8942851926236991, + "grad_norm": 1.309880018234253, + "learning_rate": 3.129855096258129e-05, + "loss": 0.6812, + "step": 2449 + }, + { + "epoch": 0.8946503560343254, + "grad_norm": 0.7982645034790039, + "learning_rate": 3.128671127474902e-05, + "loss": 0.7031, + "step": 2450 + }, + { + "epoch": 0.8950155194449516, + "grad_norm": 0.8103627562522888, + "learning_rate": 3.127486578035671e-05, + "loss": 0.6996, + "step": 2451 + }, + { + "epoch": 0.8953806828555779, + "grad_norm": 1.1655442714691162, + "learning_rate": 3.1263014485498374e-05, + "loss": 0.6641, + "step": 2452 + }, + { + "epoch": 0.8957458462662041, + "grad_norm": 1.0135167837142944, + "learning_rate": 3.1251157396271055e-05, + "loss": 0.6292, + "step": 2453 + }, + { + "epoch": 0.8961110096768303, + "grad_norm": 1.009577751159668, + "learning_rate": 3.123929451877473e-05, + "loss": 0.675, + "step": 2454 + }, + { + "epoch": 0.8964761730874566, + "grad_norm": 1.0211693048477173, + "learning_rate": 3.122742585911238e-05, + "loss": 0.6735, + "step": 2455 + }, + { + "epoch": 0.8968413364980828, + "grad_norm": 1.1773436069488525, + "learning_rate": 3.121555142338996e-05, + "loss": 0.6976, + "step": 2456 + }, + { + "epoch": 0.8972064999087092, + "grad_norm": 0.7748075127601624, + "learning_rate": 3.120367121771638e-05, + "loss": 0.6993, + "step": 2457 + }, + { + "epoch": 0.8975716633193354, + "grad_norm": 0.9061272144317627, + "learning_rate": 3.119178524820354e-05, + "loss": 0.6478, + "step": 2458 + }, + { + "epoch": 0.8979368267299617, + "grad_norm": 0.8899768590927124, + "learning_rate": 3.1179893520966276e-05, + "loss": 0.6632, + "step": 2459 + }, + { + "epoch": 0.8983019901405879, + "grad_norm": 0.9813979864120483, + "learning_rate": 3.1167996042122426e-05, + "loss": 0.668, + "step": 2460 + }, + { + "epoch": 0.8986671535512142, + "grad_norm": 0.9683778285980225, + "learning_rate": 3.1156092817792756e-05, + "loss": 0.646, + "step": 2461 + }, + { + "epoch": 0.8990323169618404, + "grad_norm": 1.1068257093429565, + "learning_rate": 3.1144183854100996e-05, + "loss": 0.6536, + "step": 2462 + }, + { + "epoch": 0.8993974803724667, + "grad_norm": 0.9029096364974976, + "learning_rate": 3.113226915717383e-05, + "loss": 0.6418, + "step": 2463 + }, + { + "epoch": 0.8997626437830929, + "grad_norm": 0.7347602844238281, + "learning_rate": 3.1120348733140896e-05, + "loss": 0.6893, + "step": 2464 + }, + { + "epoch": 0.9001278071937192, + "grad_norm": 0.8413745164871216, + "learning_rate": 3.110842258813477e-05, + "loss": 0.7046, + "step": 2465 + }, + { + "epoch": 0.9004929706043454, + "grad_norm": 2.552323818206787, + "learning_rate": 3.109649072829097e-05, + "loss": 0.6794, + "step": 2466 + }, + { + "epoch": 0.9008581340149717, + "grad_norm": 1.1075729131698608, + "learning_rate": 3.108455315974796e-05, + "loss": 0.6911, + "step": 2467 + }, + { + "epoch": 0.9012232974255979, + "grad_norm": 0.8394867181777954, + "learning_rate": 3.107260988864716e-05, + "loss": 0.6769, + "step": 2468 + }, + { + "epoch": 0.9015884608362242, + "grad_norm": 1.4787451028823853, + "learning_rate": 3.106066092113288e-05, + "loss": 0.6637, + "step": 2469 + }, + { + "epoch": 0.9019536242468504, + "grad_norm": 0.9691876769065857, + "learning_rate": 3.10487062633524e-05, + "loss": 0.6755, + "step": 2470 + }, + { + "epoch": 0.9023187876574768, + "grad_norm": 1.1069458723068237, + "learning_rate": 3.1036745921455895e-05, + "loss": 0.7007, + "step": 2471 + }, + { + "epoch": 0.902683951068103, + "grad_norm": 1.4953409433364868, + "learning_rate": 3.1024779901596496e-05, + "loss": 0.6846, + "step": 2472 + }, + { + "epoch": 0.9030491144787293, + "grad_norm": 0.9708583354949951, + "learning_rate": 3.101280820993023e-05, + "loss": 0.6655, + "step": 2473 + }, + { + "epoch": 0.9034142778893555, + "grad_norm": 0.8601775765419006, + "learning_rate": 3.100083085261606e-05, + "loss": 0.673, + "step": 2474 + }, + { + "epoch": 0.9037794412999818, + "grad_norm": 0.8028748631477356, + "learning_rate": 3.098884783581586e-05, + "loss": 0.6709, + "step": 2475 + }, + { + "epoch": 0.904144604710608, + "grad_norm": 0.9961649775505066, + "learning_rate": 3.097685916569439e-05, + "loss": 0.656, + "step": 2476 + }, + { + "epoch": 0.9045097681212343, + "grad_norm": 0.8735599517822266, + "learning_rate": 3.096486484841935e-05, + "loss": 0.673, + "step": 2477 + }, + { + "epoch": 0.9048749315318605, + "grad_norm": 2.3302524089813232, + "learning_rate": 3.095286489016135e-05, + "loss": 0.6274, + "step": 2478 + }, + { + "epoch": 0.9052400949424868, + "grad_norm": 1.027161717414856, + "learning_rate": 3.0940859297093874e-05, + "loss": 0.6816, + "step": 2479 + }, + { + "epoch": 0.905605258353113, + "grad_norm": 1.2939852476119995, + "learning_rate": 3.092884807539331e-05, + "loss": 0.6826, + "step": 2480 + }, + { + "epoch": 0.9059704217637393, + "grad_norm": 0.9535837173461914, + "learning_rate": 3.091683123123897e-05, + "loss": 0.6702, + "step": 2481 + }, + { + "epoch": 0.9063355851743655, + "grad_norm": 0.6913743019104004, + "learning_rate": 3.0904808770813024e-05, + "loss": 0.6533, + "step": 2482 + }, + { + "epoch": 0.9067007485849918, + "grad_norm": 0.9279155731201172, + "learning_rate": 3.0892780700300544e-05, + "loss": 0.6641, + "step": 2483 + }, + { + "epoch": 0.907065911995618, + "grad_norm": 0.9381618499755859, + "learning_rate": 3.08807470258895e-05, + "loss": 0.6606, + "step": 2484 + }, + { + "epoch": 0.9074310754062443, + "grad_norm": 1.03154718875885, + "learning_rate": 3.086870775377072e-05, + "loss": 0.7056, + "step": 2485 + }, + { + "epoch": 0.9077962388168705, + "grad_norm": 0.9431748390197754, + "learning_rate": 3.085666289013794e-05, + "loss": 0.6774, + "step": 2486 + }, + { + "epoch": 0.9081614022274968, + "grad_norm": 0.8811989426612854, + "learning_rate": 3.0844612441187755e-05, + "loss": 0.6566, + "step": 2487 + }, + { + "epoch": 0.9085265656381231, + "grad_norm": 0.7740017771720886, + "learning_rate": 3.083255641311963e-05, + "loss": 0.6962, + "step": 2488 + }, + { + "epoch": 0.9088917290487493, + "grad_norm": 0.8390200734138489, + "learning_rate": 3.082049481213592e-05, + "loss": 0.6742, + "step": 2489 + }, + { + "epoch": 0.9092568924593756, + "grad_norm": 0.7879806756973267, + "learning_rate": 3.0808427644441825e-05, + "loss": 0.6726, + "step": 2490 + }, + { + "epoch": 0.9096220558700018, + "grad_norm": 0.8950016498565674, + "learning_rate": 3.079635491624542e-05, + "loss": 0.6259, + "step": 2491 + }, + { + "epoch": 0.9099872192806281, + "grad_norm": 0.940693736076355, + "learning_rate": 3.078427663375765e-05, + "loss": 0.6816, + "step": 2492 + }, + { + "epoch": 0.9103523826912543, + "grad_norm": 0.7302183508872986, + "learning_rate": 3.077219280319229e-05, + "loss": 0.6627, + "step": 2493 + }, + { + "epoch": 0.9107175461018806, + "grad_norm": 1.1500719785690308, + "learning_rate": 3.0760103430766e-05, + "loss": 0.6769, + "step": 2494 + }, + { + "epoch": 0.9110827095125068, + "grad_norm": 1.0735278129577637, + "learning_rate": 3.0748008522698265e-05, + "loss": 0.6611, + "step": 2495 + }, + { + "epoch": 0.9114478729231331, + "grad_norm": 1.0470819473266602, + "learning_rate": 3.073590808521144e-05, + "loss": 0.696, + "step": 2496 + }, + { + "epoch": 0.9118130363337593, + "grad_norm": 0.9414476752281189, + "learning_rate": 3.072380212453071e-05, + "loss": 0.6763, + "step": 2497 + }, + { + "epoch": 0.9121781997443856, + "grad_norm": 1.0331350564956665, + "learning_rate": 3.07116906468841e-05, + "loss": 0.6649, + "step": 2498 + }, + { + "epoch": 0.9125433631550118, + "grad_norm": 0.8421180844306946, + "learning_rate": 3.069957365850249e-05, + "loss": 0.6743, + "step": 2499 + }, + { + "epoch": 0.9129085265656381, + "grad_norm": 1.2963193655014038, + "learning_rate": 3.0687451165619586e-05, + "loss": 0.6843, + "step": 2500 + }, + { + "epoch": 0.9132736899762643, + "grad_norm": 0.7593322992324829, + "learning_rate": 3.0675323174471905e-05, + "loss": 0.6536, + "step": 2501 + }, + { + "epoch": 0.9136388533868907, + "grad_norm": 0.9185073375701904, + "learning_rate": 3.0663189691298836e-05, + "loss": 0.684, + "step": 2502 + }, + { + "epoch": 0.9140040167975169, + "grad_norm": 0.7559370994567871, + "learning_rate": 3.0651050722342554e-05, + "loss": 0.6346, + "step": 2503 + }, + { + "epoch": 0.9143691802081432, + "grad_norm": 1.0058975219726562, + "learning_rate": 3.0638906273848075e-05, + "loss": 0.6587, + "step": 2504 + }, + { + "epoch": 0.9147343436187694, + "grad_norm": 0.6809512376785278, + "learning_rate": 3.062675635206323e-05, + "loss": 0.6429, + "step": 2505 + }, + { + "epoch": 0.9150995070293957, + "grad_norm": 0.8577700853347778, + "learning_rate": 3.061460096323867e-05, + "loss": 0.6777, + "step": 2506 + }, + { + "epoch": 0.9154646704400219, + "grad_norm": 0.8358782529830933, + "learning_rate": 3.060244011362785e-05, + "loss": 0.6736, + "step": 2507 + }, + { + "epoch": 0.9158298338506482, + "grad_norm": 0.864080548286438, + "learning_rate": 3.0590273809487037e-05, + "loss": 0.6594, + "step": 2508 + }, + { + "epoch": 0.9161949972612744, + "grad_norm": 0.9189779758453369, + "learning_rate": 3.057810205707532e-05, + "loss": 0.6852, + "step": 2509 + }, + { + "epoch": 0.9165601606719007, + "grad_norm": 1.0136405229568481, + "learning_rate": 3.0565924862654556e-05, + "loss": 0.6826, + "step": 2510 + }, + { + "epoch": 0.9169253240825269, + "grad_norm": 0.7467730045318604, + "learning_rate": 3.055374223248944e-05, + "loss": 0.6686, + "step": 2511 + }, + { + "epoch": 0.9172904874931532, + "grad_norm": 0.9998980164527893, + "learning_rate": 3.054155417284745e-05, + "loss": 0.6429, + "step": 2512 + }, + { + "epoch": 0.9176556509037794, + "grad_norm": 0.713403582572937, + "learning_rate": 3.0529360689998836e-05, + "loss": 0.6859, + "step": 2513 + }, + { + "epoch": 0.9180208143144057, + "grad_norm": 1.016985535621643, + "learning_rate": 3.051716179021666e-05, + "loss": 0.6797, + "step": 2514 + }, + { + "epoch": 0.9183859777250319, + "grad_norm": 0.7532426118850708, + "learning_rate": 3.050495747977677e-05, + "loss": 0.6455, + "step": 2515 + }, + { + "epoch": 0.9187511411356583, + "grad_norm": 0.7963048219680786, + "learning_rate": 3.0492747764957798e-05, + "loss": 0.6453, + "step": 2516 + }, + { + "epoch": 0.9191163045462845, + "grad_norm": 1.199378490447998, + "learning_rate": 3.0480532652041153e-05, + "loss": 0.6631, + "step": 2517 + }, + { + "epoch": 0.9194814679569108, + "grad_norm": 1.2027003765106201, + "learning_rate": 3.0468312147311007e-05, + "loss": 0.683, + "step": 2518 + }, + { + "epoch": 0.919846631367537, + "grad_norm": 0.9782599806785583, + "learning_rate": 3.0456086257054336e-05, + "loss": 0.6481, + "step": 2519 + }, + { + "epoch": 0.9202117947781633, + "grad_norm": 1.3586641550064087, + "learning_rate": 3.0443854987560856e-05, + "loss": 0.6474, + "step": 2520 + }, + { + "epoch": 0.9205769581887895, + "grad_norm": 1.0380148887634277, + "learning_rate": 3.043161834512308e-05, + "loss": 0.673, + "step": 2521 + }, + { + "epoch": 0.9209421215994157, + "grad_norm": 0.9029962420463562, + "learning_rate": 3.0419376336036252e-05, + "loss": 0.6766, + "step": 2522 + }, + { + "epoch": 0.921307285010042, + "grad_norm": 0.9196898341178894, + "learning_rate": 3.04071289665984e-05, + "loss": 0.6318, + "step": 2523 + }, + { + "epoch": 0.9216724484206682, + "grad_norm": 0.8275750875473022, + "learning_rate": 3.0394876243110318e-05, + "loss": 0.6523, + "step": 2524 + }, + { + "epoch": 0.9220376118312945, + "grad_norm": 0.7632416486740112, + "learning_rate": 3.038261817187552e-05, + "loss": 0.6415, + "step": 2525 + }, + { + "epoch": 0.9224027752419207, + "grad_norm": 0.7995944023132324, + "learning_rate": 3.0370354759200307e-05, + "loss": 0.6556, + "step": 2526 + }, + { + "epoch": 0.922767938652547, + "grad_norm": 0.802648663520813, + "learning_rate": 3.03580860113937e-05, + "loss": 0.6705, + "step": 2527 + }, + { + "epoch": 0.9231331020631732, + "grad_norm": 0.8473702073097229, + "learning_rate": 3.034581193476749e-05, + "loss": 0.6671, + "step": 2528 + }, + { + "epoch": 0.9234982654737995, + "grad_norm": 1.3099536895751953, + "learning_rate": 3.0333532535636193e-05, + "loss": 0.6554, + "step": 2529 + }, + { + "epoch": 0.9238634288844257, + "grad_norm": 0.9661362767219543, + "learning_rate": 3.032124782031706e-05, + "loss": 0.6268, + "step": 2530 + }, + { + "epoch": 0.924228592295052, + "grad_norm": 0.8286318182945251, + "learning_rate": 3.0308957795130092e-05, + "loss": 0.657, + "step": 2531 + }, + { + "epoch": 0.9245937557056783, + "grad_norm": 0.8090802431106567, + "learning_rate": 3.0296662466398005e-05, + "loss": 0.6227, + "step": 2532 + }, + { + "epoch": 0.9249589191163046, + "grad_norm": 1.15580415725708, + "learning_rate": 3.028436184044626e-05, + "loss": 0.6479, + "step": 2533 + }, + { + "epoch": 0.9253240825269308, + "grad_norm": 1.3873413801193237, + "learning_rate": 3.0272055923603046e-05, + "loss": 0.6664, + "step": 2534 + }, + { + "epoch": 0.9256892459375571, + "grad_norm": 0.9026642441749573, + "learning_rate": 3.025974472219924e-05, + "loss": 0.7167, + "step": 2535 + }, + { + "epoch": 0.9260544093481833, + "grad_norm": 0.8149577379226685, + "learning_rate": 3.024742824256848e-05, + "loss": 0.703, + "step": 2536 + }, + { + "epoch": 0.9264195727588096, + "grad_norm": 0.8617291450500488, + "learning_rate": 3.0235106491047078e-05, + "loss": 0.6335, + "step": 2537 + }, + { + "epoch": 0.9267847361694358, + "grad_norm": 1.062455654144287, + "learning_rate": 3.022277947397411e-05, + "loss": 0.6685, + "step": 2538 + }, + { + "epoch": 0.9271498995800621, + "grad_norm": 0.8623011708259583, + "learning_rate": 3.0210447197691317e-05, + "loss": 0.6544, + "step": 2539 + }, + { + "epoch": 0.9275150629906883, + "grad_norm": 0.9414846897125244, + "learning_rate": 3.019810966854315e-05, + "loss": 0.676, + "step": 2540 + }, + { + "epoch": 0.9278802264013146, + "grad_norm": 1.4701042175292969, + "learning_rate": 3.018576689287679e-05, + "loss": 0.6616, + "step": 2541 + }, + { + "epoch": 0.9282453898119408, + "grad_norm": 0.8895155787467957, + "learning_rate": 3.0173418877042092e-05, + "loss": 0.6573, + "step": 2542 + }, + { + "epoch": 0.9286105532225671, + "grad_norm": 0.735467791557312, + "learning_rate": 3.0161065627391618e-05, + "loss": 0.6837, + "step": 2543 + }, + { + "epoch": 0.9289757166331933, + "grad_norm": 0.7775281667709351, + "learning_rate": 3.0148707150280613e-05, + "loss": 0.6456, + "step": 2544 + }, + { + "epoch": 0.9293408800438197, + "grad_norm": 0.8725507259368896, + "learning_rate": 3.0136343452067023e-05, + "loss": 0.6616, + "step": 2545 + }, + { + "epoch": 0.9297060434544459, + "grad_norm": 0.7502171397209167, + "learning_rate": 3.012397453911147e-05, + "loss": 0.6427, + "step": 2546 + }, + { + "epoch": 0.9300712068650722, + "grad_norm": 0.8129810690879822, + "learning_rate": 3.011160041777727e-05, + "loss": 0.689, + "step": 2547 + }, + { + "epoch": 0.9304363702756984, + "grad_norm": 0.8770913481712341, + "learning_rate": 3.009922109443041e-05, + "loss": 0.6851, + "step": 2548 + }, + { + "epoch": 0.9308015336863247, + "grad_norm": 1.027323842048645, + "learning_rate": 3.0086836575439554e-05, + "loss": 0.6841, + "step": 2549 + }, + { + "epoch": 0.9311666970969509, + "grad_norm": 1.640295386314392, + "learning_rate": 3.0074446867176035e-05, + "loss": 0.6509, + "step": 2550 + }, + { + "epoch": 0.9315318605075772, + "grad_norm": 0.9461489915847778, + "learning_rate": 3.006205197601387e-05, + "loss": 0.6566, + "step": 2551 + }, + { + "epoch": 0.9318970239182034, + "grad_norm": 0.89789879322052, + "learning_rate": 3.0049651908329724e-05, + "loss": 0.6818, + "step": 2552 + }, + { + "epoch": 0.9322621873288297, + "grad_norm": 1.0178797245025635, + "learning_rate": 3.0037246670502943e-05, + "loss": 0.6646, + "step": 2553 + }, + { + "epoch": 0.9326273507394559, + "grad_norm": 1.3303459882736206, + "learning_rate": 3.002483626891551e-05, + "loss": 0.6968, + "step": 2554 + }, + { + "epoch": 0.9329925141500821, + "grad_norm": 0.9349764585494995, + "learning_rate": 3.00124207099521e-05, + "loss": 0.6819, + "step": 2555 + }, + { + "epoch": 0.9333576775607084, + "grad_norm": 0.8530802726745605, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.634, + "step": 2556 + }, + { + "epoch": 0.9337228409713346, + "grad_norm": 1.2609745264053345, + "learning_rate": 2.998757414544918e-05, + "loss": 0.6855, + "step": 2557 + }, + { + "epoch": 0.9340880043819609, + "grad_norm": 1.2250069379806519, + "learning_rate": 2.9975143152692242e-05, + "loss": 0.6812, + "step": 2558 + }, + { + "epoch": 0.9344531677925871, + "grad_norm": 1.0234918594360352, + "learning_rate": 2.9962707028124417e-05, + "loss": 0.6678, + "step": 2559 + }, + { + "epoch": 0.9348183312032134, + "grad_norm": 1.4435856342315674, + "learning_rate": 2.9950265778143616e-05, + "loss": 0.6772, + "step": 2560 + }, + { + "epoch": 0.9351834946138396, + "grad_norm": 0.6534599661827087, + "learning_rate": 2.9937819409150343e-05, + "loss": 0.6635, + "step": 2561 + }, + { + "epoch": 0.935548658024466, + "grad_norm": 0.7986537218093872, + "learning_rate": 2.9925367927547772e-05, + "loss": 0.6802, + "step": 2562 + }, + { + "epoch": 0.9359138214350922, + "grad_norm": 1.0671440362930298, + "learning_rate": 2.991291133974168e-05, + "loss": 0.6917, + "step": 2563 + }, + { + "epoch": 0.9362789848457185, + "grad_norm": 1.031617522239685, + "learning_rate": 2.990044965214048e-05, + "loss": 0.688, + "step": 2564 + }, + { + "epoch": 0.9366441482563447, + "grad_norm": 0.7745130658149719, + "learning_rate": 2.988798287115522e-05, + "loss": 0.6746, + "step": 2565 + }, + { + "epoch": 0.937009311666971, + "grad_norm": 0.9795826077461243, + "learning_rate": 2.9875511003199547e-05, + "loss": 0.655, + "step": 2566 + }, + { + "epoch": 0.9373744750775972, + "grad_norm": 0.8267850875854492, + "learning_rate": 2.9863034054689744e-05, + "loss": 0.6918, + "step": 2567 + }, + { + "epoch": 0.9377396384882235, + "grad_norm": 0.771300733089447, + "learning_rate": 2.98505520320447e-05, + "loss": 0.6711, + "step": 2568 + }, + { + "epoch": 0.9381048018988497, + "grad_norm": 0.9123200178146362, + "learning_rate": 2.9838064941685914e-05, + "loss": 0.7088, + "step": 2569 + }, + { + "epoch": 0.938469965309476, + "grad_norm": 0.9156253933906555, + "learning_rate": 2.9825572790037497e-05, + "loss": 0.6643, + "step": 2570 + }, + { + "epoch": 0.9388351287201022, + "grad_norm": 1.439208745956421, + "learning_rate": 2.9813075583526146e-05, + "loss": 0.6512, + "step": 2571 + }, + { + "epoch": 0.9392002921307285, + "grad_norm": 3.506324529647827, + "learning_rate": 2.9800573328581187e-05, + "loss": 0.6599, + "step": 2572 + }, + { + "epoch": 0.9395654555413547, + "grad_norm": 0.8508874177932739, + "learning_rate": 2.9788066031634523e-05, + "loss": 0.6328, + "step": 2573 + }, + { + "epoch": 0.939930618951981, + "grad_norm": 1.151938796043396, + "learning_rate": 2.9775553699120654e-05, + "loss": 0.6804, + "step": 2574 + }, + { + "epoch": 0.9402957823626072, + "grad_norm": 0.9399933815002441, + "learning_rate": 2.976303633747668e-05, + "loss": 0.6741, + "step": 2575 + }, + { + "epoch": 0.9406609457732336, + "grad_norm": 0.8826687932014465, + "learning_rate": 2.975051395314227e-05, + "loss": 0.6538, + "step": 2576 + }, + { + "epoch": 0.9410261091838598, + "grad_norm": 1.036671757698059, + "learning_rate": 2.97379865525597e-05, + "loss": 0.6794, + "step": 2577 + }, + { + "epoch": 0.9413912725944861, + "grad_norm": 0.9944286942481995, + "learning_rate": 2.9725454142173805e-05, + "loss": 0.6844, + "step": 2578 + }, + { + "epoch": 0.9417564360051123, + "grad_norm": 0.885906994342804, + "learning_rate": 2.9712916728432016e-05, + "loss": 0.6588, + "step": 2579 + }, + { + "epoch": 0.9421215994157386, + "grad_norm": 0.9254575967788696, + "learning_rate": 2.9700374317784326e-05, + "loss": 0.6608, + "step": 2580 + }, + { + "epoch": 0.9424867628263648, + "grad_norm": 0.7100337743759155, + "learning_rate": 2.9687826916683293e-05, + "loss": 0.6473, + "step": 2581 + }, + { + "epoch": 0.9428519262369911, + "grad_norm": 1.1397892236709595, + "learning_rate": 2.967527453158407e-05, + "loss": 0.681, + "step": 2582 + }, + { + "epoch": 0.9432170896476173, + "grad_norm": 0.9657423496246338, + "learning_rate": 2.9662717168944343e-05, + "loss": 0.6649, + "step": 2583 + }, + { + "epoch": 0.9435822530582436, + "grad_norm": 0.9102057814598083, + "learning_rate": 2.9650154835224373e-05, + "loss": 0.6494, + "step": 2584 + }, + { + "epoch": 0.9439474164688698, + "grad_norm": 0.7469964027404785, + "learning_rate": 2.963758753688697e-05, + "loss": 0.6558, + "step": 2585 + }, + { + "epoch": 0.9443125798794961, + "grad_norm": 0.9794651865959167, + "learning_rate": 2.962501528039752e-05, + "loss": 0.6475, + "step": 2586 + }, + { + "epoch": 0.9446777432901223, + "grad_norm": 0.8790867328643799, + "learning_rate": 2.9612438072223926e-05, + "loss": 0.6753, + "step": 2587 + }, + { + "epoch": 0.9450429067007485, + "grad_norm": 0.8802550435066223, + "learning_rate": 2.9599855918836677e-05, + "loss": 0.6663, + "step": 2588 + }, + { + "epoch": 0.9454080701113748, + "grad_norm": 0.7142858505249023, + "learning_rate": 2.9587268826708774e-05, + "loss": 0.6452, + "step": 2589 + }, + { + "epoch": 0.945773233522001, + "grad_norm": 0.9301719665527344, + "learning_rate": 2.9574676802315775e-05, + "loss": 0.6924, + "step": 2590 + }, + { + "epoch": 0.9461383969326274, + "grad_norm": 0.8197965025901794, + "learning_rate": 2.9562079852135767e-05, + "loss": 0.6494, + "step": 2591 + }, + { + "epoch": 0.9465035603432536, + "grad_norm": 0.8453877568244934, + "learning_rate": 2.9549477982649372e-05, + "loss": 0.6896, + "step": 2592 + }, + { + "epoch": 0.9468687237538799, + "grad_norm": 0.869964599609375, + "learning_rate": 2.9536871200339754e-05, + "loss": 0.5818, + "step": 2593 + }, + { + "epoch": 0.9472338871645061, + "grad_norm": 0.7052625417709351, + "learning_rate": 2.95242595116926e-05, + "loss": 0.6368, + "step": 2594 + }, + { + "epoch": 0.9475990505751324, + "grad_norm": 1.1453596353530884, + "learning_rate": 2.951164292319611e-05, + "loss": 0.6415, + "step": 2595 + }, + { + "epoch": 0.9479642139857586, + "grad_norm": 0.9452725648880005, + "learning_rate": 2.9499021441341012e-05, + "loss": 0.6585, + "step": 2596 + }, + { + "epoch": 0.9483293773963849, + "grad_norm": 0.8761558532714844, + "learning_rate": 2.9486395072620552e-05, + "loss": 0.6634, + "step": 2597 + }, + { + "epoch": 0.9486945408070111, + "grad_norm": 0.9212718605995178, + "learning_rate": 2.947376382353049e-05, + "loss": 0.6774, + "step": 2598 + }, + { + "epoch": 0.9490597042176374, + "grad_norm": 0.8499828577041626, + "learning_rate": 2.946112770056911e-05, + "loss": 0.675, + "step": 2599 + }, + { + "epoch": 0.9494248676282636, + "grad_norm": 1.040263056755066, + "learning_rate": 2.9448486710237173e-05, + "loss": 0.6536, + "step": 2600 + }, + { + "epoch": 0.9497900310388899, + "grad_norm": 1.0057485103607178, + "learning_rate": 2.943584085903797e-05, + "loss": 0.6959, + "step": 2601 + }, + { + "epoch": 0.9501551944495161, + "grad_norm": 0.9337158799171448, + "learning_rate": 2.942319015347728e-05, + "loss": 0.6506, + "step": 2602 + }, + { + "epoch": 0.9505203578601424, + "grad_norm": 0.7997397184371948, + "learning_rate": 2.9410534600063387e-05, + "loss": 0.6647, + "step": 2603 + }, + { + "epoch": 0.9508855212707686, + "grad_norm": 1.0067977905273438, + "learning_rate": 2.939787420530706e-05, + "loss": 0.6667, + "step": 2604 + }, + { + "epoch": 0.951250684681395, + "grad_norm": 0.8577103018760681, + "learning_rate": 2.9385208975721568e-05, + "loss": 0.6488, + "step": 2605 + }, + { + "epoch": 0.9516158480920212, + "grad_norm": 0.7341446280479431, + "learning_rate": 2.9372538917822666e-05, + "loss": 0.6837, + "step": 2606 + }, + { + "epoch": 0.9519810115026475, + "grad_norm": 0.7933617830276489, + "learning_rate": 2.9359864038128586e-05, + "loss": 0.6474, + "step": 2607 + }, + { + "epoch": 0.9523461749132737, + "grad_norm": 1.0580298900604248, + "learning_rate": 2.934718434316005e-05, + "loss": 0.653, + "step": 2608 + }, + { + "epoch": 0.9527113383239, + "grad_norm": 0.959298849105835, + "learning_rate": 2.933449983944024e-05, + "loss": 0.6759, + "step": 2609 + }, + { + "epoch": 0.9530765017345262, + "grad_norm": 0.6825525760650635, + "learning_rate": 2.932181053349484e-05, + "loss": 0.6476, + "step": 2610 + }, + { + "epoch": 0.9534416651451525, + "grad_norm": 1.1617248058319092, + "learning_rate": 2.930911643185198e-05, + "loss": 0.6554, + "step": 2611 + }, + { + "epoch": 0.9538068285557787, + "grad_norm": 0.9940661191940308, + "learning_rate": 2.9296417541042267e-05, + "loss": 0.6627, + "step": 2612 + }, + { + "epoch": 0.954171991966405, + "grad_norm": 1.0652787685394287, + "learning_rate": 2.928371386759877e-05, + "loss": 0.6692, + "step": 2613 + }, + { + "epoch": 0.9545371553770312, + "grad_norm": 0.8765406608581543, + "learning_rate": 2.9271005418057023e-05, + "loss": 0.6378, + "step": 2614 + }, + { + "epoch": 0.9549023187876575, + "grad_norm": 0.9065739512443542, + "learning_rate": 2.9258292198955004e-05, + "loss": 0.6375, + "step": 2615 + }, + { + "epoch": 0.9552674821982837, + "grad_norm": 1.0190774202346802, + "learning_rate": 2.924557421683317e-05, + "loss": 0.6453, + "step": 2616 + }, + { + "epoch": 0.95563264560891, + "grad_norm": 0.9807654023170471, + "learning_rate": 2.9232851478234397e-05, + "loss": 0.624, + "step": 2617 + }, + { + "epoch": 0.9559978090195362, + "grad_norm": 0.9084011316299438, + "learning_rate": 2.9220123989704034e-05, + "loss": 0.6714, + "step": 2618 + }, + { + "epoch": 0.9563629724301626, + "grad_norm": 0.7725682258605957, + "learning_rate": 2.9207391757789857e-05, + "loss": 0.6478, + "step": 2619 + }, + { + "epoch": 0.9567281358407888, + "grad_norm": 0.8699987530708313, + "learning_rate": 2.9194654789042088e-05, + "loss": 0.639, + "step": 2620 + }, + { + "epoch": 0.957093299251415, + "grad_norm": 1.0805641412734985, + "learning_rate": 2.9181913090013386e-05, + "loss": 0.6514, + "step": 2621 + }, + { + "epoch": 0.9574584626620413, + "grad_norm": 1.4972708225250244, + "learning_rate": 2.9169166667258856e-05, + "loss": 0.6708, + "step": 2622 + }, + { + "epoch": 0.9578236260726675, + "grad_norm": 0.7883728742599487, + "learning_rate": 2.9156415527336003e-05, + "loss": 0.6362, + "step": 2623 + }, + { + "epoch": 0.9581887894832938, + "grad_norm": 0.9713582992553711, + "learning_rate": 2.9143659676804788e-05, + "loss": 0.6578, + "step": 2624 + }, + { + "epoch": 0.95855395289392, + "grad_norm": 0.9497501254081726, + "learning_rate": 2.9130899122227583e-05, + "loss": 0.686, + "step": 2625 + }, + { + "epoch": 0.9589191163045463, + "grad_norm": 0.8625170588493347, + "learning_rate": 2.911813387016918e-05, + "loss": 0.6597, + "step": 2626 + }, + { + "epoch": 0.9592842797151725, + "grad_norm": 1.3011974096298218, + "learning_rate": 2.910536392719679e-05, + "loss": 0.6885, + "step": 2627 + }, + { + "epoch": 0.9596494431257988, + "grad_norm": 0.7818294763565063, + "learning_rate": 2.9092589299880028e-05, + "loss": 0.6331, + "step": 2628 + }, + { + "epoch": 0.960014606536425, + "grad_norm": 0.9671834707260132, + "learning_rate": 2.9079809994790937e-05, + "loss": 0.666, + "step": 2629 + }, + { + "epoch": 0.9603797699470513, + "grad_norm": 0.8328306674957275, + "learning_rate": 2.9067026018503956e-05, + "loss": 0.6589, + "step": 2630 + }, + { + "epoch": 0.9607449333576775, + "grad_norm": 1.8160985708236694, + "learning_rate": 2.9054237377595926e-05, + "loss": 0.6638, + "step": 2631 + }, + { + "epoch": 0.9611100967683038, + "grad_norm": 0.9076361060142517, + "learning_rate": 2.904144407864609e-05, + "loss": 0.6498, + "step": 2632 + }, + { + "epoch": 0.96147526017893, + "grad_norm": 0.821333110332489, + "learning_rate": 2.9028646128236083e-05, + "loss": 0.6732, + "step": 2633 + }, + { + "epoch": 0.9618404235895563, + "grad_norm": 0.9395877718925476, + "learning_rate": 2.901584353294994e-05, + "loss": 0.6672, + "step": 2634 + }, + { + "epoch": 0.9622055870001825, + "grad_norm": 1.0115352869033813, + "learning_rate": 2.9003036299374083e-05, + "loss": 0.6884, + "step": 2635 + }, + { + "epoch": 0.9625707504108089, + "grad_norm": 1.0721027851104736, + "learning_rate": 2.899022443409732e-05, + "loss": 0.6583, + "step": 2636 + }, + { + "epoch": 0.9629359138214351, + "grad_norm": 0.67411869764328, + "learning_rate": 2.8977407943710846e-05, + "loss": 0.6597, + "step": 2637 + }, + { + "epoch": 0.9633010772320614, + "grad_norm": 0.8598989844322205, + "learning_rate": 2.8964586834808214e-05, + "loss": 0.6729, + "step": 2638 + }, + { + "epoch": 0.9636662406426876, + "grad_norm": 0.9835753440856934, + "learning_rate": 2.8951761113985393e-05, + "loss": 0.6843, + "step": 2639 + }, + { + "epoch": 0.9640314040533139, + "grad_norm": 0.967534601688385, + "learning_rate": 2.8938930787840683e-05, + "loss": 0.6575, + "step": 2640 + }, + { + "epoch": 0.9643965674639401, + "grad_norm": 0.8860105276107788, + "learning_rate": 2.8926095862974782e-05, + "loss": 0.6855, + "step": 2641 + }, + { + "epoch": 0.9647617308745664, + "grad_norm": 0.7182053923606873, + "learning_rate": 2.8913256345990746e-05, + "loss": 0.673, + "step": 2642 + }, + { + "epoch": 0.9651268942851926, + "grad_norm": 0.788271427154541, + "learning_rate": 2.890041224349398e-05, + "loss": 0.6594, + "step": 2643 + }, + { + "epoch": 0.9654920576958189, + "grad_norm": 0.8986093997955322, + "learning_rate": 2.888756356209227e-05, + "loss": 0.6663, + "step": 2644 + }, + { + "epoch": 0.9658572211064451, + "grad_norm": 0.8097816109657288, + "learning_rate": 2.8874710308395745e-05, + "loss": 0.6356, + "step": 2645 + }, + { + "epoch": 0.9662223845170714, + "grad_norm": 0.7233504056930542, + "learning_rate": 2.8861852489016882e-05, + "loss": 0.6732, + "step": 2646 + }, + { + "epoch": 0.9665875479276976, + "grad_norm": 0.8124412298202515, + "learning_rate": 2.884899011057052e-05, + "loss": 0.6401, + "step": 2647 + }, + { + "epoch": 0.966952711338324, + "grad_norm": 1.0130997896194458, + "learning_rate": 2.8836123179673828e-05, + "loss": 0.6628, + "step": 2648 + }, + { + "epoch": 0.9673178747489501, + "grad_norm": 0.9995191097259521, + "learning_rate": 2.882325170294634e-05, + "loss": 0.6696, + "step": 2649 + }, + { + "epoch": 0.9676830381595765, + "grad_norm": 1.2716678380966187, + "learning_rate": 2.88103756870099e-05, + "loss": 0.669, + "step": 2650 + }, + { + "epoch": 0.9680482015702027, + "grad_norm": 0.9267953038215637, + "learning_rate": 2.879749513848871e-05, + "loss": 0.6901, + "step": 2651 + }, + { + "epoch": 0.968413364980829, + "grad_norm": 0.9453235268592834, + "learning_rate": 2.8784610064009297e-05, + "loss": 0.6676, + "step": 2652 + }, + { + "epoch": 0.9687785283914552, + "grad_norm": 0.8978050947189331, + "learning_rate": 2.87717204702005e-05, + "loss": 0.644, + "step": 2653 + }, + { + "epoch": 0.9691436918020815, + "grad_norm": 0.9811480045318604, + "learning_rate": 2.8758826363693516e-05, + "loss": 0.688, + "step": 2654 + }, + { + "epoch": 0.9695088552127077, + "grad_norm": 1.0782561302185059, + "learning_rate": 2.8745927751121834e-05, + "loss": 0.6663, + "step": 2655 + }, + { + "epoch": 0.9698740186233339, + "grad_norm": 1.0224454402923584, + "learning_rate": 2.8733024639121283e-05, + "loss": 0.647, + "step": 2656 + }, + { + "epoch": 0.9702391820339602, + "grad_norm": 1.114777684211731, + "learning_rate": 2.8720117034329984e-05, + "loss": 0.6792, + "step": 2657 + }, + { + "epoch": 0.9706043454445864, + "grad_norm": 0.8056793212890625, + "learning_rate": 2.8707204943388386e-05, + "loss": 0.637, + "step": 2658 + }, + { + "epoch": 0.9709695088552127, + "grad_norm": 0.9775443077087402, + "learning_rate": 2.8694288372939243e-05, + "loss": 0.6696, + "step": 2659 + }, + { + "epoch": 0.9713346722658389, + "grad_norm": 2.4969546794891357, + "learning_rate": 2.8681367329627617e-05, + "loss": 0.6862, + "step": 2660 + }, + { + "epoch": 0.9716998356764652, + "grad_norm": 0.8990214467048645, + "learning_rate": 2.8668441820100857e-05, + "loss": 0.6729, + "step": 2661 + }, + { + "epoch": 0.9720649990870914, + "grad_norm": 1.258520483970642, + "learning_rate": 2.865551185100863e-05, + "loss": 0.6596, + "step": 2662 + }, + { + "epoch": 0.9724301624977177, + "grad_norm": 0.851305365562439, + "learning_rate": 2.864257742900287e-05, + "loss": 0.6483, + "step": 2663 + }, + { + "epoch": 0.9727953259083439, + "grad_norm": 1.215659499168396, + "learning_rate": 2.8629638560737832e-05, + "loss": 0.6443, + "step": 2664 + }, + { + "epoch": 0.9731604893189703, + "grad_norm": 0.7999595403671265, + "learning_rate": 2.8616695252870044e-05, + "loss": 0.6779, + "step": 2665 + }, + { + "epoch": 0.9735256527295965, + "grad_norm": 0.8783779144287109, + "learning_rate": 2.8603747512058312e-05, + "loss": 0.673, + "step": 2666 + }, + { + "epoch": 0.9738908161402228, + "grad_norm": 0.8891261219978333, + "learning_rate": 2.859079534496373e-05, + "loss": 0.6394, + "step": 2667 + }, + { + "epoch": 0.974255979550849, + "grad_norm": 0.9345478415489197, + "learning_rate": 2.8577838758249674e-05, + "loss": 0.6508, + "step": 2668 + }, + { + "epoch": 0.9746211429614753, + "grad_norm": 1.225130319595337, + "learning_rate": 2.8564877758581784e-05, + "loss": 0.6678, + "step": 2669 + }, + { + "epoch": 0.9749863063721015, + "grad_norm": 1.0890440940856934, + "learning_rate": 2.855191235262797e-05, + "loss": 0.6763, + "step": 2670 + }, + { + "epoch": 0.9753514697827278, + "grad_norm": 1.1266112327575684, + "learning_rate": 2.8538942547058425e-05, + "loss": 0.6585, + "step": 2671 + }, + { + "epoch": 0.975716633193354, + "grad_norm": 0.8562392592430115, + "learning_rate": 2.8525968348545574e-05, + "loss": 0.6486, + "step": 2672 + }, + { + "epoch": 0.9760817966039803, + "grad_norm": 0.8453388810157776, + "learning_rate": 2.8512989763764146e-05, + "loss": 0.6464, + "step": 2673 + }, + { + "epoch": 0.9764469600146065, + "grad_norm": 0.9347715377807617, + "learning_rate": 2.850000679939108e-05, + "loss": 0.6783, + "step": 2674 + }, + { + "epoch": 0.9768121234252328, + "grad_norm": 0.811362624168396, + "learning_rate": 2.8487019462105606e-05, + "loss": 0.6508, + "step": 2675 + }, + { + "epoch": 0.977177286835859, + "grad_norm": 1.0582472085952759, + "learning_rate": 2.847402775858918e-05, + "loss": 0.6731, + "step": 2676 + }, + { + "epoch": 0.9775424502464853, + "grad_norm": 1.1253806352615356, + "learning_rate": 2.846103169552551e-05, + "loss": 0.6492, + "step": 2677 + }, + { + "epoch": 0.9779076136571115, + "grad_norm": 1.0529634952545166, + "learning_rate": 2.844803127960056e-05, + "loss": 0.6732, + "step": 2678 + }, + { + "epoch": 0.9782727770677379, + "grad_norm": 0.7079199552536011, + "learning_rate": 2.843502651750251e-05, + "loss": 0.646, + "step": 2679 + }, + { + "epoch": 0.978637940478364, + "grad_norm": 0.8783506155014038, + "learning_rate": 2.8422017415921793e-05, + "loss": 0.6696, + "step": 2680 + }, + { + "epoch": 0.9790031038889904, + "grad_norm": 0.8645327091217041, + "learning_rate": 2.840900398155107e-05, + "loss": 0.6573, + "step": 2681 + }, + { + "epoch": 0.9793682672996166, + "grad_norm": 0.7897984981536865, + "learning_rate": 2.839598622108523e-05, + "loss": 0.6962, + "step": 2682 + }, + { + "epoch": 0.9797334307102429, + "grad_norm": 0.7942821979522705, + "learning_rate": 2.8382964141221396e-05, + "loss": 0.6689, + "step": 2683 + }, + { + "epoch": 0.9800985941208691, + "grad_norm": 1.1115641593933105, + "learning_rate": 2.8369937748658892e-05, + "loss": 0.645, + "step": 2684 + }, + { + "epoch": 0.9804637575314954, + "grad_norm": 1.2980235815048218, + "learning_rate": 2.8356907050099284e-05, + "loss": 0.6995, + "step": 2685 + }, + { + "epoch": 0.9808289209421216, + "grad_norm": 0.9709264039993286, + "learning_rate": 2.834387205224634e-05, + "loss": 0.6954, + "step": 2686 + }, + { + "epoch": 0.9811940843527479, + "grad_norm": 0.951606810092926, + "learning_rate": 2.833083276180604e-05, + "loss": 0.6944, + "step": 2687 + }, + { + "epoch": 0.9815592477633741, + "grad_norm": 0.9172406792640686, + "learning_rate": 2.8317789185486587e-05, + "loss": 0.6792, + "step": 2688 + }, + { + "epoch": 0.9819244111740003, + "grad_norm": 0.8468958735466003, + "learning_rate": 2.8304741329998364e-05, + "loss": 0.6753, + "step": 2689 + }, + { + "epoch": 0.9822895745846266, + "grad_norm": 0.8902730345726013, + "learning_rate": 2.829168920205398e-05, + "loss": 0.6565, + "step": 2690 + }, + { + "epoch": 0.9826547379952528, + "grad_norm": 0.8718501329421997, + "learning_rate": 2.8278632808368222e-05, + "loss": 0.6646, + "step": 2691 + }, + { + "epoch": 0.9830199014058791, + "grad_norm": 0.9342532753944397, + "learning_rate": 2.826557215565809e-05, + "loss": 0.6182, + "step": 2692 + }, + { + "epoch": 0.9833850648165053, + "grad_norm": 1.0939021110534668, + "learning_rate": 2.825250725064275e-05, + "loss": 0.6551, + "step": 2693 + }, + { + "epoch": 0.9837502282271317, + "grad_norm": 1.0584012269973755, + "learning_rate": 2.8239438100043597e-05, + "loss": 0.6512, + "step": 2694 + }, + { + "epoch": 0.9841153916377579, + "grad_norm": 1.0934642553329468, + "learning_rate": 2.822636471058416e-05, + "loss": 0.6543, + "step": 2695 + }, + { + "epoch": 0.9844805550483842, + "grad_norm": 1.096698522567749, + "learning_rate": 2.8213287088990184e-05, + "loss": 0.6534, + "step": 2696 + }, + { + "epoch": 0.9848457184590104, + "grad_norm": 0.8641692399978638, + "learning_rate": 2.8200205241989583e-05, + "loss": 0.6652, + "step": 2697 + }, + { + "epoch": 0.9852108818696367, + "grad_norm": 1.1894280910491943, + "learning_rate": 2.818711917631243e-05, + "loss": 0.6411, + "step": 2698 + }, + { + "epoch": 0.9855760452802629, + "grad_norm": 1.0246202945709229, + "learning_rate": 2.8174028898690998e-05, + "loss": 0.6628, + "step": 2699 + }, + { + "epoch": 0.9859412086908892, + "grad_norm": 0.8037968277931213, + "learning_rate": 2.81609344158597e-05, + "loss": 0.6401, + "step": 2700 + }, + { + "epoch": 0.9863063721015154, + "grad_norm": 0.7906293272972107, + "learning_rate": 2.8147835734555114e-05, + "loss": 0.6542, + "step": 2701 + }, + { + "epoch": 0.9866715355121417, + "grad_norm": 1.5510473251342773, + "learning_rate": 2.813473286151601e-05, + "loss": 0.6641, + "step": 2702 + }, + { + "epoch": 0.9870366989227679, + "grad_norm": 0.9942931532859802, + "learning_rate": 2.8121625803483264e-05, + "loss": 0.6519, + "step": 2703 + }, + { + "epoch": 0.9874018623333942, + "grad_norm": 0.8478285074234009, + "learning_rate": 2.810851456719995e-05, + "loss": 0.662, + "step": 2704 + }, + { + "epoch": 0.9877670257440204, + "grad_norm": 0.8662696480751038, + "learning_rate": 2.8095399159411258e-05, + "loss": 0.6445, + "step": 2705 + }, + { + "epoch": 0.9881321891546467, + "grad_norm": 0.8632956147193909, + "learning_rate": 2.8082279586864548e-05, + "loss": 0.6529, + "step": 2706 + }, + { + "epoch": 0.9884973525652729, + "grad_norm": 0.6771783232688904, + "learning_rate": 2.8069155856309316e-05, + "loss": 0.6331, + "step": 2707 + }, + { + "epoch": 0.9888625159758992, + "grad_norm": 0.9262771010398865, + "learning_rate": 2.805602797449719e-05, + "loss": 0.6822, + "step": 2708 + }, + { + "epoch": 0.9892276793865254, + "grad_norm": 0.9815447926521301, + "learning_rate": 2.8042895948181944e-05, + "loss": 0.6843, + "step": 2709 + }, + { + "epoch": 0.9895928427971518, + "grad_norm": 0.898939311504364, + "learning_rate": 2.8029759784119465e-05, + "loss": 0.6542, + "step": 2710 + }, + { + "epoch": 0.989958006207778, + "grad_norm": 1.0056416988372803, + "learning_rate": 2.80166194890678e-05, + "loss": 0.6937, + "step": 2711 + }, + { + "epoch": 0.9903231696184043, + "grad_norm": 0.7354155778884888, + "learning_rate": 2.8003475069787084e-05, + "loss": 0.6714, + "step": 2712 + }, + { + "epoch": 0.9906883330290305, + "grad_norm": 0.9542761445045471, + "learning_rate": 2.799032653303961e-05, + "loss": 0.6639, + "step": 2713 + }, + { + "epoch": 0.9910534964396568, + "grad_norm": 1.159591794013977, + "learning_rate": 2.7977173885589768e-05, + "loss": 0.6654, + "step": 2714 + }, + { + "epoch": 0.991418659850283, + "grad_norm": 0.8092490434646606, + "learning_rate": 2.796401713420406e-05, + "loss": 0.6491, + "step": 2715 + }, + { + "epoch": 0.9917838232609093, + "grad_norm": 1.3644760847091675, + "learning_rate": 2.7950856285651124e-05, + "loss": 0.6834, + "step": 2716 + }, + { + "epoch": 0.9921489866715355, + "grad_norm": 0.8467960953712463, + "learning_rate": 2.793769134670167e-05, + "loss": 0.6613, + "step": 2717 + }, + { + "epoch": 0.9925141500821618, + "grad_norm": 1.0781224966049194, + "learning_rate": 2.7924522324128535e-05, + "loss": 0.6527, + "step": 2718 + }, + { + "epoch": 0.992879313492788, + "grad_norm": 0.8228006958961487, + "learning_rate": 2.791134922470666e-05, + "loss": 0.6564, + "step": 2719 + }, + { + "epoch": 0.9932444769034143, + "grad_norm": 0.9506269097328186, + "learning_rate": 2.7898172055213067e-05, + "loss": 0.6997, + "step": 2720 + }, + { + "epoch": 0.9936096403140405, + "grad_norm": 0.931259036064148, + "learning_rate": 2.788499082242689e-05, + "loss": 0.6121, + "step": 2721 + }, + { + "epoch": 0.9939748037246667, + "grad_norm": 0.9871553182601929, + "learning_rate": 2.7871805533129337e-05, + "loss": 0.6499, + "step": 2722 + }, + { + "epoch": 0.994339967135293, + "grad_norm": 1.1735976934432983, + "learning_rate": 2.7858616194103714e-05, + "loss": 0.6495, + "step": 2723 + }, + { + "epoch": 0.9947051305459192, + "grad_norm": 0.8646172285079956, + "learning_rate": 2.7845422812135406e-05, + "loss": 0.6281, + "step": 2724 + }, + { + "epoch": 0.9950702939565456, + "grad_norm": 1.1625986099243164, + "learning_rate": 2.783222539401188e-05, + "loss": 0.6686, + "step": 2725 + }, + { + "epoch": 0.9954354573671718, + "grad_norm": 0.7719307541847229, + "learning_rate": 2.7819023946522678e-05, + "loss": 0.663, + "step": 2726 + }, + { + "epoch": 0.9958006207777981, + "grad_norm": 1.3788267374038696, + "learning_rate": 2.7805818476459404e-05, + "loss": 0.7081, + "step": 2727 + }, + { + "epoch": 0.9961657841884243, + "grad_norm": 0.9009970426559448, + "learning_rate": 2.7792608990615763e-05, + "loss": 0.6738, + "step": 2728 + }, + { + "epoch": 0.9965309475990506, + "grad_norm": 1.0724678039550781, + "learning_rate": 2.777939549578749e-05, + "loss": 0.6417, + "step": 2729 + }, + { + "epoch": 0.9968961110096768, + "grad_norm": 0.9405642747879028, + "learning_rate": 2.77661779987724e-05, + "loss": 0.666, + "step": 2730 + }, + { + "epoch": 0.9972612744203031, + "grad_norm": 0.9860209226608276, + "learning_rate": 2.7752956506370366e-05, + "loss": 0.673, + "step": 2731 + }, + { + "epoch": 0.9976264378309293, + "grad_norm": 0.9241143465042114, + "learning_rate": 2.773973102538332e-05, + "loss": 0.6704, + "step": 2732 + }, + { + "epoch": 0.9979916012415556, + "grad_norm": 1.0373433828353882, + "learning_rate": 2.7726501562615237e-05, + "loss": 0.6631, + "step": 2733 + }, + { + "epoch": 0.9983567646521818, + "grad_norm": 0.9698441028594971, + "learning_rate": 2.7713268124872145e-05, + "loss": 0.6735, + "step": 2734 + }, + { + "epoch": 0.9987219280628081, + "grad_norm": 1.5003585815429688, + "learning_rate": 2.770003071896212e-05, + "loss": 0.6591, + "step": 2735 + }, + { + "epoch": 0.9990870914734343, + "grad_norm": 1.487725853919983, + "learning_rate": 2.768678935169527e-05, + "loss": 0.6617, + "step": 2736 + }, + { + "epoch": 0.9994522548840606, + "grad_norm": 0.8343055248260498, + "learning_rate": 2.7673544029883745e-05, + "loss": 0.6484, + "step": 2737 + }, + { + "epoch": 0.9998174182946868, + "grad_norm": 1.0973925590515137, + "learning_rate": 2.7660294760341744e-05, + "loss": 0.6735, + "step": 2738 + }, + { + "epoch": 1.0001825817053132, + "grad_norm": 1.0450365543365479, + "learning_rate": 2.7647041549885472e-05, + "loss": 0.6652, + "step": 2739 + }, + { + "epoch": 1.0005477451159395, + "grad_norm": 1.9960644245147705, + "learning_rate": 2.7633784405333183e-05, + "loss": 0.6205, + "step": 2740 + }, + { + "epoch": 1.0009129085265656, + "grad_norm": 0.8917062878608704, + "learning_rate": 2.7620523333505142e-05, + "loss": 0.6269, + "step": 2741 + }, + { + "epoch": 1.0012780719371919, + "grad_norm": 0.7969324588775635, + "learning_rate": 2.7607258341223636e-05, + "loss": 0.6, + "step": 2742 + }, + { + "epoch": 1.0016432353478182, + "grad_norm": 0.9513354897499084, + "learning_rate": 2.7593989435312976e-05, + "loss": 0.611, + "step": 2743 + }, + { + "epoch": 1.0020083987584445, + "grad_norm": 0.8164132833480835, + "learning_rate": 2.7580716622599478e-05, + "loss": 0.6138, + "step": 2744 + }, + { + "epoch": 1.0023735621690706, + "grad_norm": 0.9981603026390076, + "learning_rate": 2.756743990991148e-05, + "loss": 0.6108, + "step": 2745 + }, + { + "epoch": 1.002738725579697, + "grad_norm": 0.9675151705741882, + "learning_rate": 2.7554159304079298e-05, + "loss": 0.5851, + "step": 2746 + }, + { + "epoch": 1.0031038889903232, + "grad_norm": 0.8304097056388855, + "learning_rate": 2.7540874811935295e-05, + "loss": 0.5764, + "step": 2747 + }, + { + "epoch": 1.0034690524009495, + "grad_norm": 0.8475216627120972, + "learning_rate": 2.7527586440313786e-05, + "loss": 0.5999, + "step": 2748 + }, + { + "epoch": 1.0038342158115756, + "grad_norm": 1.0157322883605957, + "learning_rate": 2.7514294196051124e-05, + "loss": 0.6093, + "step": 2749 + }, + { + "epoch": 1.004199379222202, + "grad_norm": 0.9846054911613464, + "learning_rate": 2.750099808598563e-05, + "loss": 0.5847, + "step": 2750 + }, + { + "epoch": 1.0045645426328282, + "grad_norm": 0.8925500512123108, + "learning_rate": 2.7487698116957617e-05, + "loss": 0.6193, + "step": 2751 + }, + { + "epoch": 1.0049297060434546, + "grad_norm": 0.8369265198707581, + "learning_rate": 2.747439429580938e-05, + "loss": 0.6191, + "step": 2752 + }, + { + "epoch": 1.0052948694540806, + "grad_norm": 0.9499483108520508, + "learning_rate": 2.746108662938521e-05, + "loss": 0.5982, + "step": 2753 + }, + { + "epoch": 1.005660032864707, + "grad_norm": 1.0188134908676147, + "learning_rate": 2.7447775124531367e-05, + "loss": 0.5984, + "step": 2754 + }, + { + "epoch": 1.0060251962753333, + "grad_norm": 0.9362654089927673, + "learning_rate": 2.7434459788096077e-05, + "loss": 0.6281, + "step": 2755 + }, + { + "epoch": 1.0063903596859594, + "grad_norm": 0.8612992763519287, + "learning_rate": 2.7421140626929556e-05, + "loss": 0.621, + "step": 2756 + }, + { + "epoch": 1.0067555230965857, + "grad_norm": 1.003761649131775, + "learning_rate": 2.7407817647883973e-05, + "loss": 0.6154, + "step": 2757 + }, + { + "epoch": 1.007120686507212, + "grad_norm": 0.8995448350906372, + "learning_rate": 2.7394490857813467e-05, + "loss": 0.6032, + "step": 2758 + }, + { + "epoch": 1.0074858499178383, + "grad_norm": 0.9468995928764343, + "learning_rate": 2.738116026357414e-05, + "loss": 0.604, + "step": 2759 + }, + { + "epoch": 1.0078510133284644, + "grad_norm": 0.8836452960968018, + "learning_rate": 2.7367825872024042e-05, + "loss": 0.6608, + "step": 2760 + }, + { + "epoch": 1.0082161767390907, + "grad_norm": 0.9900194406509399, + "learning_rate": 2.7354487690023178e-05, + "loss": 0.6043, + "step": 2761 + }, + { + "epoch": 1.008581340149717, + "grad_norm": 3.625828981399536, + "learning_rate": 2.7341145724433532e-05, + "loss": 0.6314, + "step": 2762 + }, + { + "epoch": 1.0089465035603433, + "grad_norm": 0.79560786485672, + "learning_rate": 2.732779998211898e-05, + "loss": 0.599, + "step": 2763 + }, + { + "epoch": 1.0093116669709694, + "grad_norm": 0.996148407459259, + "learning_rate": 2.731445046994539e-05, + "loss": 0.6152, + "step": 2764 + }, + { + "epoch": 1.0096768303815957, + "grad_norm": 1.097036600112915, + "learning_rate": 2.7301097194780536e-05, + "loss": 0.6236, + "step": 2765 + }, + { + "epoch": 1.010041993792222, + "grad_norm": 0.758590042591095, + "learning_rate": 2.7287740163494153e-05, + "loss": 0.6147, + "step": 2766 + }, + { + "epoch": 1.0104071572028483, + "grad_norm": 0.7391862273216248, + "learning_rate": 2.7274379382957897e-05, + "loss": 0.5834, + "step": 2767 + }, + { + "epoch": 1.0107723206134744, + "grad_norm": 0.9706049561500549, + "learning_rate": 2.726101486004535e-05, + "loss": 0.5939, + "step": 2768 + }, + { + "epoch": 1.0111374840241008, + "grad_norm": 0.890767514705658, + "learning_rate": 2.724764660163203e-05, + "loss": 0.5643, + "step": 2769 + }, + { + "epoch": 1.011502647434727, + "grad_norm": 1.1014769077301025, + "learning_rate": 2.7234274614595353e-05, + "loss": 0.5817, + "step": 2770 + }, + { + "epoch": 1.0118678108453534, + "grad_norm": 0.8103865385055542, + "learning_rate": 2.722089890581469e-05, + "loss": 0.5939, + "step": 2771 + }, + { + "epoch": 1.0122329742559795, + "grad_norm": 1.0938106775283813, + "learning_rate": 2.7207519482171285e-05, + "loss": 0.6061, + "step": 2772 + }, + { + "epoch": 1.0125981376666058, + "grad_norm": 1.1287803649902344, + "learning_rate": 2.7194136350548332e-05, + "loss": 0.6071, + "step": 2773 + }, + { + "epoch": 1.012963301077232, + "grad_norm": 0.9278669357299805, + "learning_rate": 2.7180749517830912e-05, + "loss": 0.5779, + "step": 2774 + }, + { + "epoch": 1.0133284644878584, + "grad_norm": 0.9545652866363525, + "learning_rate": 2.7167358990906e-05, + "loss": 0.6039, + "step": 2775 + }, + { + "epoch": 1.0136936278984845, + "grad_norm": 0.8851467967033386, + "learning_rate": 2.7153964776662517e-05, + "loss": 0.5648, + "step": 2776 + }, + { + "epoch": 1.0140587913091108, + "grad_norm": 0.9389585852622986, + "learning_rate": 2.7140566881991213e-05, + "loss": 0.6088, + "step": 2777 + }, + { + "epoch": 1.0144239547197371, + "grad_norm": 1.1182056665420532, + "learning_rate": 2.712716531378478e-05, + "loss": 0.5398, + "step": 2778 + }, + { + "epoch": 1.0147891181303634, + "grad_norm": 0.7980071902275085, + "learning_rate": 2.7113760078937787e-05, + "loss": 0.6202, + "step": 2779 + }, + { + "epoch": 1.0151542815409895, + "grad_norm": 1.2338283061981201, + "learning_rate": 2.7100351184346694e-05, + "loss": 0.5673, + "step": 2780 + }, + { + "epoch": 1.0155194449516158, + "grad_norm": 0.8782559633255005, + "learning_rate": 2.708693863690984e-05, + "loss": 0.5843, + "step": 2781 + }, + { + "epoch": 1.0158846083622421, + "grad_norm": 1.2286125421524048, + "learning_rate": 2.707352244352744e-05, + "loss": 0.5917, + "step": 2782 + }, + { + "epoch": 1.0162497717728685, + "grad_norm": 0.7889975905418396, + "learning_rate": 2.7060102611101577e-05, + "loss": 0.6085, + "step": 2783 + }, + { + "epoch": 1.0166149351834946, + "grad_norm": 1.1056513786315918, + "learning_rate": 2.704667914653623e-05, + "loss": 0.6028, + "step": 2784 + }, + { + "epoch": 1.0169800985941209, + "grad_norm": 1.011452317237854, + "learning_rate": 2.7033252056737224e-05, + "loss": 0.6351, + "step": 2785 + }, + { + "epoch": 1.0173452620047472, + "grad_norm": 1.3591222763061523, + "learning_rate": 2.7019821348612265e-05, + "loss": 0.6138, + "step": 2786 + }, + { + "epoch": 1.0177104254153735, + "grad_norm": 1.2160711288452148, + "learning_rate": 2.7006387029070907e-05, + "loss": 0.5875, + "step": 2787 + }, + { + "epoch": 1.0180755888259996, + "grad_norm": 0.9456118941307068, + "learning_rate": 2.699294910502457e-05, + "loss": 0.5653, + "step": 2788 + }, + { + "epoch": 1.018440752236626, + "grad_norm": 0.6976199150085449, + "learning_rate": 2.6979507583386537e-05, + "loss": 0.6295, + "step": 2789 + }, + { + "epoch": 1.0188059156472522, + "grad_norm": 0.8012118935585022, + "learning_rate": 2.6966062471071914e-05, + "loss": 0.5898, + "step": 2790 + }, + { + "epoch": 1.0191710790578783, + "grad_norm": 0.8957707285881042, + "learning_rate": 2.6952613774997683e-05, + "loss": 0.6247, + "step": 2791 + }, + { + "epoch": 1.0195362424685046, + "grad_norm": 1.2239645719528198, + "learning_rate": 2.6939161502082653e-05, + "loss": 0.6144, + "step": 2792 + }, + { + "epoch": 1.019901405879131, + "grad_norm": 0.9017224907875061, + "learning_rate": 2.692570565924749e-05, + "loss": 0.6038, + "step": 2793 + }, + { + "epoch": 1.0202665692897572, + "grad_norm": 0.8203404545783997, + "learning_rate": 2.691224625341467e-05, + "loss": 0.5793, + "step": 2794 + }, + { + "epoch": 1.0206317327003833, + "grad_norm": 0.7525933980941772, + "learning_rate": 2.6898783291508524e-05, + "loss": 0.6202, + "step": 2795 + }, + { + "epoch": 1.0209968961110096, + "grad_norm": 0.9386183619499207, + "learning_rate": 2.6885316780455208e-05, + "loss": 0.6441, + "step": 2796 + }, + { + "epoch": 1.021362059521636, + "grad_norm": 0.9447058439254761, + "learning_rate": 2.6871846727182696e-05, + "loss": 0.6296, + "step": 2797 + }, + { + "epoch": 1.0217272229322623, + "grad_norm": 1.1461701393127441, + "learning_rate": 2.6858373138620794e-05, + "loss": 0.613, + "step": 2798 + }, + { + "epoch": 1.0220923863428883, + "grad_norm": 1.1132742166519165, + "learning_rate": 2.684489602170112e-05, + "loss": 0.5981, + "step": 2799 + }, + { + "epoch": 1.0224575497535147, + "grad_norm": 0.8169803023338318, + "learning_rate": 2.6831415383357113e-05, + "loss": 0.5908, + "step": 2800 + }, + { + "epoch": 1.022822713164141, + "grad_norm": 1.1900393962860107, + "learning_rate": 2.6817931230524016e-05, + "loss": 0.5781, + "step": 2801 + }, + { + "epoch": 1.0231878765747673, + "grad_norm": 0.9564783573150635, + "learning_rate": 2.6804443570138895e-05, + "loss": 0.5901, + "step": 2802 + }, + { + "epoch": 1.0235530399853934, + "grad_norm": 0.8402716517448425, + "learning_rate": 2.6790952409140597e-05, + "loss": 0.6099, + "step": 2803 + }, + { + "epoch": 1.0239182033960197, + "grad_norm": 0.952369749546051, + "learning_rate": 2.6777457754469788e-05, + "loss": 0.6042, + "step": 2804 + }, + { + "epoch": 1.024283366806646, + "grad_norm": 1.0410730838775635, + "learning_rate": 2.6763959613068933e-05, + "loss": 0.5655, + "step": 2805 + }, + { + "epoch": 1.0246485302172723, + "grad_norm": 1.1353759765625, + "learning_rate": 2.675045799188227e-05, + "loss": 0.5638, + "step": 2806 + }, + { + "epoch": 1.0250136936278984, + "grad_norm": 0.8030809164047241, + "learning_rate": 2.6736952897855856e-05, + "loss": 0.5865, + "step": 2807 + }, + { + "epoch": 1.0253788570385247, + "grad_norm": 0.9601109027862549, + "learning_rate": 2.6723444337937502e-05, + "loss": 0.617, + "step": 2808 + }, + { + "epoch": 1.025744020449151, + "grad_norm": 0.7185949087142944, + "learning_rate": 2.670993231907684e-05, + "loss": 0.5891, + "step": 2809 + }, + { + "epoch": 1.0261091838597773, + "grad_norm": 0.6851087212562561, + "learning_rate": 2.6696416848225256e-05, + "loss": 0.5972, + "step": 2810 + }, + { + "epoch": 1.0264743472704034, + "grad_norm": 1.0103535652160645, + "learning_rate": 2.6682897932335907e-05, + "loss": 0.6285, + "step": 2811 + }, + { + "epoch": 1.0268395106810297, + "grad_norm": 0.9711493253707886, + "learning_rate": 2.666937557836374e-05, + "loss": 0.5834, + "step": 2812 + }, + { + "epoch": 1.027204674091656, + "grad_norm": 0.7026539444923401, + "learning_rate": 2.665584979326546e-05, + "loss": 0.5943, + "step": 2813 + }, + { + "epoch": 1.0275698375022824, + "grad_norm": 0.8979662656784058, + "learning_rate": 2.6642320583999556e-05, + "loss": 0.6189, + "step": 2814 + }, + { + "epoch": 1.0279350009129085, + "grad_norm": 0.987153172492981, + "learning_rate": 2.662878795752624e-05, + "loss": 0.5969, + "step": 2815 + }, + { + "epoch": 1.0283001643235348, + "grad_norm": 0.8173993825912476, + "learning_rate": 2.6615251920807527e-05, + "loss": 0.6077, + "step": 2816 + }, + { + "epoch": 1.028665327734161, + "grad_norm": 0.9545841217041016, + "learning_rate": 2.6601712480807145e-05, + "loss": 0.5931, + "step": 2817 + }, + { + "epoch": 1.0290304911447874, + "grad_norm": 1.0173547267913818, + "learning_rate": 2.6588169644490608e-05, + "loss": 0.5833, + "step": 2818 + }, + { + "epoch": 1.0293956545554135, + "grad_norm": 0.948368489742279, + "learning_rate": 2.6574623418825152e-05, + "loss": 0.6158, + "step": 2819 + }, + { + "epoch": 1.0297608179660398, + "grad_norm": 0.8072060346603394, + "learning_rate": 2.656107381077977e-05, + "loss": 0.5652, + "step": 2820 + }, + { + "epoch": 1.0301259813766661, + "grad_norm": 0.9504181146621704, + "learning_rate": 2.6547520827325192e-05, + "loss": 0.5698, + "step": 2821 + }, + { + "epoch": 1.0304911447872924, + "grad_norm": 0.8799892067909241, + "learning_rate": 2.6533964475433886e-05, + "loss": 0.5751, + "step": 2822 + }, + { + "epoch": 1.0308563081979185, + "grad_norm": 1.240471363067627, + "learning_rate": 2.6520404762080048e-05, + "loss": 0.5967, + "step": 2823 + }, + { + "epoch": 1.0312214716085448, + "grad_norm": 0.7833214402198792, + "learning_rate": 2.650684169423961e-05, + "loss": 0.6486, + "step": 2824 + }, + { + "epoch": 1.0315866350191711, + "grad_norm": 1.1240694522857666, + "learning_rate": 2.649327527889022e-05, + "loss": 0.5483, + "step": 2825 + }, + { + "epoch": 1.0319517984297972, + "grad_norm": 0.8658917546272278, + "learning_rate": 2.6479705523011254e-05, + "loss": 0.6028, + "step": 2826 + }, + { + "epoch": 1.0323169618404235, + "grad_norm": 1.2881187200546265, + "learning_rate": 2.646613243358382e-05, + "loss": 0.5525, + "step": 2827 + }, + { + "epoch": 1.0326821252510499, + "grad_norm": 0.8781007528305054, + "learning_rate": 2.645255601759071e-05, + "loss": 0.5805, + "step": 2828 + }, + { + "epoch": 1.0330472886616762, + "grad_norm": 1.0463286638259888, + "learning_rate": 2.6438976282016465e-05, + "loss": 0.6097, + "step": 2829 + }, + { + "epoch": 1.0334124520723023, + "grad_norm": 0.8506337404251099, + "learning_rate": 2.642539323384729e-05, + "loss": 0.6221, + "step": 2830 + }, + { + "epoch": 1.0337776154829286, + "grad_norm": 1.032930612564087, + "learning_rate": 2.641180688007114e-05, + "loss": 0.5947, + "step": 2831 + }, + { + "epoch": 1.0341427788935549, + "grad_norm": 0.9797373414039612, + "learning_rate": 2.6398217227677636e-05, + "loss": 0.5948, + "step": 2832 + }, + { + "epoch": 1.0345079423041812, + "grad_norm": 1.3347671031951904, + "learning_rate": 2.638462428365811e-05, + "loss": 0.5745, + "step": 2833 + }, + { + "epoch": 1.0348731057148073, + "grad_norm": 0.9851678013801575, + "learning_rate": 2.63710280550056e-05, + "loss": 0.5747, + "step": 2834 + }, + { + "epoch": 1.0352382691254336, + "grad_norm": 0.8773434162139893, + "learning_rate": 2.63574285487148e-05, + "loss": 0.613, + "step": 2835 + }, + { + "epoch": 1.03560343253606, + "grad_norm": 0.829378068447113, + "learning_rate": 2.6343825771782125e-05, + "loss": 0.6033, + "step": 2836 + }, + { + "epoch": 1.0359685959466862, + "grad_norm": 0.990553617477417, + "learning_rate": 2.633021973120565e-05, + "loss": 0.549, + "step": 2837 + }, + { + "epoch": 1.0363337593573123, + "grad_norm": 1.1182706356048584, + "learning_rate": 2.6316610433985136e-05, + "loss": 0.6159, + "step": 2838 + }, + { + "epoch": 1.0366989227679386, + "grad_norm": 0.887545108795166, + "learning_rate": 2.6302997887122024e-05, + "loss": 0.5685, + "step": 2839 + }, + { + "epoch": 1.037064086178565, + "grad_norm": 0.770965576171875, + "learning_rate": 2.6289382097619426e-05, + "loss": 0.6278, + "step": 2840 + }, + { + "epoch": 1.0374292495891912, + "grad_norm": 0.8624748587608337, + "learning_rate": 2.6275763072482116e-05, + "loss": 0.5809, + "step": 2841 + }, + { + "epoch": 1.0377944129998173, + "grad_norm": 0.9106437563896179, + "learning_rate": 2.6262140818716537e-05, + "loss": 0.5551, + "step": 2842 + }, + { + "epoch": 1.0381595764104437, + "grad_norm": 0.9461293816566467, + "learning_rate": 2.6248515343330784e-05, + "loss": 0.5948, + "step": 2843 + }, + { + "epoch": 1.03852473982107, + "grad_norm": 0.9623094201087952, + "learning_rate": 2.6234886653334632e-05, + "loss": 0.5931, + "step": 2844 + }, + { + "epoch": 1.0388899032316963, + "grad_norm": 1.062417984008789, + "learning_rate": 2.622125475573948e-05, + "loss": 0.5694, + "step": 2845 + }, + { + "epoch": 1.0392550666423224, + "grad_norm": 0.860431969165802, + "learning_rate": 2.6207619657558404e-05, + "loss": 0.5704, + "step": 2846 + }, + { + "epoch": 1.0396202300529487, + "grad_norm": 0.764029324054718, + "learning_rate": 2.6193981365806108e-05, + "loss": 0.5839, + "step": 2847 + }, + { + "epoch": 1.039985393463575, + "grad_norm": 0.9730731844902039, + "learning_rate": 2.618033988749895e-05, + "loss": 0.6255, + "step": 2848 + }, + { + "epoch": 1.0403505568742013, + "grad_norm": 0.9711384773254395, + "learning_rate": 2.6166695229654923e-05, + "loss": 0.5496, + "step": 2849 + }, + { + "epoch": 1.0407157202848274, + "grad_norm": 1.0858523845672607, + "learning_rate": 2.6153047399293653e-05, + "loss": 0.5913, + "step": 2850 + }, + { + "epoch": 1.0410808836954537, + "grad_norm": 0.7481245994567871, + "learning_rate": 2.6139396403436404e-05, + "loss": 0.6019, + "step": 2851 + }, + { + "epoch": 1.04144604710608, + "grad_norm": 0.7096604704856873, + "learning_rate": 2.612574224910606e-05, + "loss": 0.6191, + "step": 2852 + }, + { + "epoch": 1.0418112105167063, + "grad_norm": 0.9994904398918152, + "learning_rate": 2.6112084943327146e-05, + "loss": 0.5891, + "step": 2853 + }, + { + "epoch": 1.0421763739273324, + "grad_norm": 0.9799776673316956, + "learning_rate": 2.609842449312578e-05, + "loss": 0.6093, + "step": 2854 + }, + { + "epoch": 1.0425415373379587, + "grad_norm": 0.7989087104797363, + "learning_rate": 2.608476090552974e-05, + "loss": 0.6325, + "step": 2855 + }, + { + "epoch": 1.042906700748585, + "grad_norm": 0.8211569786071777, + "learning_rate": 2.6071094187568374e-05, + "loss": 0.5905, + "step": 2856 + }, + { + "epoch": 1.0432718641592111, + "grad_norm": 1.1618964672088623, + "learning_rate": 2.6057424346272667e-05, + "loss": 0.5789, + "step": 2857 + }, + { + "epoch": 1.0436370275698374, + "grad_norm": 0.812633752822876, + "learning_rate": 2.6043751388675205e-05, + "loss": 0.5911, + "step": 2858 + }, + { + "epoch": 1.0440021909804638, + "grad_norm": 1.2491474151611328, + "learning_rate": 2.6030075321810166e-05, + "loss": 0.5936, + "step": 2859 + }, + { + "epoch": 1.04436735439109, + "grad_norm": 1.066473126411438, + "learning_rate": 2.6016396152713355e-05, + "loss": 0.5812, + "step": 2860 + }, + { + "epoch": 1.0447325178017162, + "grad_norm": 0.8640859127044678, + "learning_rate": 2.600271388842214e-05, + "loss": 0.5997, + "step": 2861 + }, + { + "epoch": 1.0450976812123425, + "grad_norm": 1.0497597455978394, + "learning_rate": 2.5989028535975508e-05, + "loss": 0.6148, + "step": 2862 + }, + { + "epoch": 1.0454628446229688, + "grad_norm": 0.8977965116500854, + "learning_rate": 2.5975340102414023e-05, + "loss": 0.6049, + "step": 2863 + }, + { + "epoch": 1.045828008033595, + "grad_norm": 1.0095138549804688, + "learning_rate": 2.5961648594779824e-05, + "loss": 0.5774, + "step": 2864 + }, + { + "epoch": 1.0461931714442212, + "grad_norm": 0.7467365264892578, + "learning_rate": 2.594795402011665e-05, + "loss": 0.6265, + "step": 2865 + }, + { + "epoch": 1.0465583348548475, + "grad_norm": 0.9556071758270264, + "learning_rate": 2.5934256385469807e-05, + "loss": 0.5911, + "step": 2866 + }, + { + "epoch": 1.0469234982654738, + "grad_norm": 0.7638899683952332, + "learning_rate": 2.5920555697886184e-05, + "loss": 0.5836, + "step": 2867 + }, + { + "epoch": 1.0472886616761001, + "grad_norm": 0.7609127759933472, + "learning_rate": 2.590685196441423e-05, + "loss": 0.6416, + "step": 2868 + }, + { + "epoch": 1.0476538250867262, + "grad_norm": 1.2816520929336548, + "learning_rate": 2.589314519210397e-05, + "loss": 0.62, + "step": 2869 + }, + { + "epoch": 1.0480189884973525, + "grad_norm": 0.920642614364624, + "learning_rate": 2.5879435388006986e-05, + "loss": 0.5876, + "step": 2870 + }, + { + "epoch": 1.0483841519079788, + "grad_norm": 1.1667977571487427, + "learning_rate": 2.586572255917642e-05, + "loss": 0.5988, + "step": 2871 + }, + { + "epoch": 1.0487493153186052, + "grad_norm": 1.0526721477508545, + "learning_rate": 2.5852006712666975e-05, + "loss": 0.5389, + "step": 2872 + }, + { + "epoch": 1.0491144787292312, + "grad_norm": 0.8131223917007446, + "learning_rate": 2.5838287855534895e-05, + "loss": 0.5482, + "step": 2873 + }, + { + "epoch": 1.0494796421398576, + "grad_norm": 0.8092973232269287, + "learning_rate": 2.5824565994838e-05, + "loss": 0.5826, + "step": 2874 + }, + { + "epoch": 1.0498448055504839, + "grad_norm": 0.751356303691864, + "learning_rate": 2.581084113763562e-05, + "loss": 0.6341, + "step": 2875 + }, + { + "epoch": 1.0502099689611102, + "grad_norm": 1.6021597385406494, + "learning_rate": 2.5797113290988655e-05, + "loss": 0.6091, + "step": 2876 + }, + { + "epoch": 1.0505751323717363, + "grad_norm": 1.7192530632019043, + "learning_rate": 2.5783382461959523e-05, + "loss": 0.5354, + "step": 2877 + }, + { + "epoch": 1.0509402957823626, + "grad_norm": 1.4987651109695435, + "learning_rate": 2.576964865761218e-05, + "loss": 0.5722, + "step": 2878 + }, + { + "epoch": 1.051305459192989, + "grad_norm": 0.7340034246444702, + "learning_rate": 2.575591188501213e-05, + "loss": 0.5746, + "step": 2879 + }, + { + "epoch": 1.0516706226036152, + "grad_norm": 1.107073426246643, + "learning_rate": 2.574217215122639e-05, + "loss": 0.6007, + "step": 2880 + }, + { + "epoch": 1.0520357860142413, + "grad_norm": 0.7700313329696655, + "learning_rate": 2.5728429463323487e-05, + "loss": 0.5551, + "step": 2881 + }, + { + "epoch": 1.0524009494248676, + "grad_norm": 0.7817539572715759, + "learning_rate": 2.57146838283735e-05, + "loss": 0.6017, + "step": 2882 + }, + { + "epoch": 1.052766112835494, + "grad_norm": 0.7932517528533936, + "learning_rate": 2.5700935253447998e-05, + "loss": 0.5914, + "step": 2883 + }, + { + "epoch": 1.0531312762461202, + "grad_norm": 0.8778212666511536, + "learning_rate": 2.5687183745620078e-05, + "loss": 0.5986, + "step": 2884 + }, + { + "epoch": 1.0534964396567463, + "grad_norm": 0.8803035616874695, + "learning_rate": 2.567342931196432e-05, + "loss": 0.6064, + "step": 2885 + }, + { + "epoch": 1.0538616030673726, + "grad_norm": 0.8971658945083618, + "learning_rate": 2.5659671959556848e-05, + "loss": 0.6267, + "step": 2886 + }, + { + "epoch": 1.054226766477999, + "grad_norm": 0.8893007040023804, + "learning_rate": 2.5645911695475264e-05, + "loss": 0.6143, + "step": 2887 + }, + { + "epoch": 1.054591929888625, + "grad_norm": 0.6497613787651062, + "learning_rate": 2.563214852679867e-05, + "loss": 0.6102, + "step": 2888 + }, + { + "epoch": 1.0549570932992514, + "grad_norm": 0.8695589303970337, + "learning_rate": 2.5618382460607666e-05, + "loss": 0.5969, + "step": 2889 + }, + { + "epoch": 1.0553222567098777, + "grad_norm": 1.3337715864181519, + "learning_rate": 2.5604613503984327e-05, + "loss": 0.6025, + "step": 2890 + }, + { + "epoch": 1.055687420120504, + "grad_norm": 0.9800289273262024, + "learning_rate": 2.559084166401224e-05, + "loss": 0.6266, + "step": 2891 + }, + { + "epoch": 1.05605258353113, + "grad_norm": 0.8625394105911255, + "learning_rate": 2.557706694777647e-05, + "loss": 0.6205, + "step": 2892 + }, + { + "epoch": 1.0564177469417564, + "grad_norm": 1.1771821975708008, + "learning_rate": 2.5563289362363547e-05, + "loss": 0.5863, + "step": 2893 + }, + { + "epoch": 1.0567829103523827, + "grad_norm": 0.7348043918609619, + "learning_rate": 2.554950891486149e-05, + "loss": 0.5781, + "step": 2894 + }, + { + "epoch": 1.057148073763009, + "grad_norm": 0.978421151638031, + "learning_rate": 2.5535725612359778e-05, + "loss": 0.6074, + "step": 2895 + }, + { + "epoch": 1.057513237173635, + "grad_norm": 1.1634827852249146, + "learning_rate": 2.5521939461949384e-05, + "loss": 0.5952, + "step": 2896 + }, + { + "epoch": 1.0578784005842614, + "grad_norm": 0.711153507232666, + "learning_rate": 2.5508150470722708e-05, + "loss": 0.6351, + "step": 2897 + }, + { + "epoch": 1.0582435639948877, + "grad_norm": 0.962415874004364, + "learning_rate": 2.5494358645773646e-05, + "loss": 0.6055, + "step": 2898 + }, + { + "epoch": 1.058608727405514, + "grad_norm": 1.2400012016296387, + "learning_rate": 2.548056399419754e-05, + "loss": 0.6092, + "step": 2899 + }, + { + "epoch": 1.0589738908161401, + "grad_norm": 0.7230969071388245, + "learning_rate": 2.5466766523091172e-05, + "loss": 0.5801, + "step": 2900 + }, + { + "epoch": 1.0593390542267664, + "grad_norm": 0.9523801803588867, + "learning_rate": 2.5452966239552802e-05, + "loss": 0.5948, + "step": 2901 + }, + { + "epoch": 1.0597042176373928, + "grad_norm": 0.8902936577796936, + "learning_rate": 2.543916315068211e-05, + "loss": 0.5399, + "step": 2902 + }, + { + "epoch": 1.060069381048019, + "grad_norm": 1.0887128114700317, + "learning_rate": 2.5425357263580246e-05, + "loss": 0.6105, + "step": 2903 + }, + { + "epoch": 1.0604345444586452, + "grad_norm": 0.8181251287460327, + "learning_rate": 2.5411548585349772e-05, + "loss": 0.5854, + "step": 2904 + }, + { + "epoch": 1.0607997078692715, + "grad_norm": 0.8453181982040405, + "learning_rate": 2.5397737123094697e-05, + "loss": 0.5965, + "step": 2905 + }, + { + "epoch": 1.0611648712798978, + "grad_norm": 1.112425446510315, + "learning_rate": 2.5383922883920476e-05, + "loss": 0.5735, + "step": 2906 + }, + { + "epoch": 1.061530034690524, + "grad_norm": 0.9731928110122681, + "learning_rate": 2.5370105874933972e-05, + "loss": 0.5959, + "step": 2907 + }, + { + "epoch": 1.0618951981011502, + "grad_norm": 0.917127251625061, + "learning_rate": 2.5356286103243485e-05, + "loss": 0.5693, + "step": 2908 + }, + { + "epoch": 1.0622603615117765, + "grad_norm": 1.1035484075546265, + "learning_rate": 2.5342463575958737e-05, + "loss": 0.559, + "step": 2909 + }, + { + "epoch": 1.0626255249224028, + "grad_norm": 1.1218947172164917, + "learning_rate": 2.5328638300190856e-05, + "loss": 0.6578, + "step": 2910 + }, + { + "epoch": 1.0629906883330291, + "grad_norm": 0.6395195722579956, + "learning_rate": 2.531481028305239e-05, + "loss": 0.6437, + "step": 2911 + }, + { + "epoch": 1.0633558517436552, + "grad_norm": 0.935915470123291, + "learning_rate": 2.5300979531657305e-05, + "loss": 0.5709, + "step": 2912 + }, + { + "epoch": 1.0637210151542815, + "grad_norm": 0.835087239742279, + "learning_rate": 2.528714605312097e-05, + "loss": 0.5789, + "step": 2913 + }, + { + "epoch": 1.0640861785649078, + "grad_norm": 2.0276432037353516, + "learning_rate": 2.5273309854560148e-05, + "loss": 0.5714, + "step": 2914 + }, + { + "epoch": 1.0644513419755341, + "grad_norm": 0.7264708876609802, + "learning_rate": 2.525947094309301e-05, + "loss": 0.5891, + "step": 2915 + }, + { + "epoch": 1.0648165053861602, + "grad_norm": 0.9568043351173401, + "learning_rate": 2.5245629325839125e-05, + "loss": 0.569, + "step": 2916 + }, + { + "epoch": 1.0651816687967866, + "grad_norm": 1.0530427694320679, + "learning_rate": 2.5231785009919437e-05, + "loss": 0.6086, + "step": 2917 + }, + { + "epoch": 1.0655468322074129, + "grad_norm": 0.8962780833244324, + "learning_rate": 2.521793800245631e-05, + "loss": 0.5636, + "step": 2918 + }, + { + "epoch": 1.0659119956180392, + "grad_norm": 1.098936676979065, + "learning_rate": 2.5204088310573455e-05, + "loss": 0.5918, + "step": 2919 + }, + { + "epoch": 1.0662771590286653, + "grad_norm": 0.9142637252807617, + "learning_rate": 2.5190235941395996e-05, + "loss": 0.532, + "step": 2920 + }, + { + "epoch": 1.0666423224392916, + "grad_norm": 0.8808488845825195, + "learning_rate": 2.5176380902050418e-05, + "loss": 0.6227, + "step": 2921 + }, + { + "epoch": 1.067007485849918, + "grad_norm": 1.0145164728164673, + "learning_rate": 2.5162523199664583e-05, + "loss": 0.6177, + "step": 2922 + }, + { + "epoch": 1.0673726492605442, + "grad_norm": 0.912358820438385, + "learning_rate": 2.5148662841367718e-05, + "loss": 0.5798, + "step": 2923 + }, + { + "epoch": 1.0677378126711703, + "grad_norm": 0.8597058057785034, + "learning_rate": 2.5134799834290417e-05, + "loss": 0.6071, + "step": 2924 + }, + { + "epoch": 1.0681029760817966, + "grad_norm": 0.7203514575958252, + "learning_rate": 2.512093418556466e-05, + "loss": 0.5608, + "step": 2925 + }, + { + "epoch": 1.068468139492423, + "grad_norm": 1.068557858467102, + "learning_rate": 2.510706590232374e-05, + "loss": 0.5716, + "step": 2926 + }, + { + "epoch": 1.068833302903049, + "grad_norm": 1.191929578781128, + "learning_rate": 2.5093194991702362e-05, + "loss": 0.584, + "step": 2927 + }, + { + "epoch": 1.0691984663136753, + "grad_norm": 0.7341733574867249, + "learning_rate": 2.5079321460836528e-05, + "loss": 0.6083, + "step": 2928 + }, + { + "epoch": 1.0695636297243016, + "grad_norm": 0.8296530842781067, + "learning_rate": 2.5065445316863627e-05, + "loss": 0.5883, + "step": 2929 + }, + { + "epoch": 1.069928793134928, + "grad_norm": 0.8199982047080994, + "learning_rate": 2.5051566566922377e-05, + "loss": 0.6203, + "step": 2930 + }, + { + "epoch": 1.070293956545554, + "grad_norm": 1.316584587097168, + "learning_rate": 2.503768521815283e-05, + "loss": 0.5904, + "step": 2931 + }, + { + "epoch": 1.0706591199561803, + "grad_norm": 0.8966913819313049, + "learning_rate": 2.5023801277696393e-05, + "loss": 0.6328, + "step": 2932 + }, + { + "epoch": 1.0710242833668067, + "grad_norm": 0.8157796859741211, + "learning_rate": 2.5009914752695785e-05, + "loss": 0.6207, + "step": 2933 + }, + { + "epoch": 1.071389446777433, + "grad_norm": 0.7004765868186951, + "learning_rate": 2.4996025650295072e-05, + "loss": 0.6138, + "step": 2934 + }, + { + "epoch": 1.071754610188059, + "grad_norm": 0.9260857701301575, + "learning_rate": 2.4982133977639644e-05, + "loss": 0.6054, + "step": 2935 + }, + { + "epoch": 1.0721197735986854, + "grad_norm": 0.8620515465736389, + "learning_rate": 2.4968239741876205e-05, + "loss": 0.5885, + "step": 2936 + }, + { + "epoch": 1.0724849370093117, + "grad_norm": 0.7664886713027954, + "learning_rate": 2.4954342950152786e-05, + "loss": 0.5781, + "step": 2937 + }, + { + "epoch": 1.072850100419938, + "grad_norm": 1.2562214136123657, + "learning_rate": 2.4940443609618713e-05, + "loss": 0.6151, + "step": 2938 + }, + { + "epoch": 1.073215263830564, + "grad_norm": 1.0961320400238037, + "learning_rate": 2.4926541727424663e-05, + "loss": 0.559, + "step": 2939 + }, + { + "epoch": 1.0735804272411904, + "grad_norm": 0.9252917170524597, + "learning_rate": 2.4912637310722575e-05, + "loss": 0.6067, + "step": 2940 + }, + { + "epoch": 1.0739455906518167, + "grad_norm": 0.8604303002357483, + "learning_rate": 2.4898730366665724e-05, + "loss": 0.5856, + "step": 2941 + }, + { + "epoch": 1.074310754062443, + "grad_norm": 0.792181670665741, + "learning_rate": 2.488482090240868e-05, + "loss": 0.6186, + "step": 2942 + }, + { + "epoch": 1.0746759174730691, + "grad_norm": 1.0725411176681519, + "learning_rate": 2.487090892510729e-05, + "loss": 0.5721, + "step": 2943 + }, + { + "epoch": 1.0750410808836954, + "grad_norm": 0.7707020044326782, + "learning_rate": 2.4856994441918718e-05, + "loss": 0.6124, + "step": 2944 + }, + { + "epoch": 1.0754062442943217, + "grad_norm": 0.9772723913192749, + "learning_rate": 2.4843077460001405e-05, + "loss": 0.5997, + "step": 2945 + }, + { + "epoch": 1.075771407704948, + "grad_norm": 2.125711441040039, + "learning_rate": 2.482915798651507e-05, + "loss": 0.6111, + "step": 2946 + }, + { + "epoch": 1.0761365711155741, + "grad_norm": 0.6215631365776062, + "learning_rate": 2.481523602862075e-05, + "loss": 0.6021, + "step": 2947 + }, + { + "epoch": 1.0765017345262005, + "grad_norm": 0.9591418504714966, + "learning_rate": 2.48013115934807e-05, + "loss": 0.5849, + "step": 2948 + }, + { + "epoch": 1.0768668979368268, + "grad_norm": 1.1631020307540894, + "learning_rate": 2.4787384688258514e-05, + "loss": 0.6129, + "step": 2949 + }, + { + "epoch": 1.077232061347453, + "grad_norm": 0.7211815714836121, + "learning_rate": 2.4773455320119005e-05, + "loss": 0.5873, + "step": 2950 + }, + { + "epoch": 1.0777798064633923, + "grad_norm": 1.2566742897033691, + "learning_rate": 2.475952349622828e-05, + "loss": 0.6654, + "step": 2951 + }, + { + "epoch": 1.0781449698740186, + "grad_norm": 1.202701449394226, + "learning_rate": 2.474558922375371e-05, + "loss": 0.6309, + "step": 2952 + }, + { + "epoch": 1.078510133284645, + "grad_norm": 1.013235092163086, + "learning_rate": 2.4731652509863904e-05, + "loss": 0.6301, + "step": 2953 + }, + { + "epoch": 1.078875296695271, + "grad_norm": 1.1638104915618896, + "learning_rate": 2.471771336172876e-05, + "loss": 0.631, + "step": 2954 + }, + { + "epoch": 1.0792404601058974, + "grad_norm": 3.4340641498565674, + "learning_rate": 2.4703771786519392e-05, + "loss": 0.6566, + "step": 2955 + }, + { + "epoch": 1.0796056235165237, + "grad_norm": 0.9713271260261536, + "learning_rate": 2.4689827791408198e-05, + "loss": 0.65, + "step": 2956 + }, + { + "epoch": 1.07997078692715, + "grad_norm": 1.1270604133605957, + "learning_rate": 2.4675881383568797e-05, + "loss": 0.6987, + "step": 2957 + }, + { + "epoch": 1.080335950337776, + "grad_norm": 0.7504619359970093, + "learning_rate": 2.4661932570176047e-05, + "loss": 0.6166, + "step": 2958 + }, + { + "epoch": 1.0807011137484024, + "grad_norm": 0.7504952549934387, + "learning_rate": 2.464798135840607e-05, + "loss": 0.642, + "step": 2959 + }, + { + "epoch": 1.0810662771590287, + "grad_norm": 0.9410606622695923, + "learning_rate": 2.4634027755436192e-05, + "loss": 0.6299, + "step": 2960 + }, + { + "epoch": 1.081431440569655, + "grad_norm": 0.9131185412406921, + "learning_rate": 2.4620071768444985e-05, + "loss": 0.6688, + "step": 2961 + }, + { + "epoch": 1.081796603980281, + "grad_norm": 0.9587386846542358, + "learning_rate": 2.4606113404612244e-05, + "loss": 0.6307, + "step": 2962 + }, + { + "epoch": 1.0821617673909074, + "grad_norm": 0.9157978296279907, + "learning_rate": 2.4592152671118993e-05, + "loss": 0.6702, + "step": 2963 + }, + { + "epoch": 1.0825269308015337, + "grad_norm": 0.8683468699455261, + "learning_rate": 2.4578189575147465e-05, + "loss": 0.6246, + "step": 2964 + }, + { + "epoch": 1.08289209421216, + "grad_norm": 0.956314206123352, + "learning_rate": 2.4564224123881103e-05, + "loss": 0.6446, + "step": 2965 + }, + { + "epoch": 1.0832572576227861, + "grad_norm": 1.0378687381744385, + "learning_rate": 2.4550256324504594e-05, + "loss": 0.6288, + "step": 2966 + }, + { + "epoch": 1.0836224210334124, + "grad_norm": 0.9109642505645752, + "learning_rate": 2.4536286184203783e-05, + "loss": 0.6291, + "step": 2967 + }, + { + "epoch": 1.0839875844440388, + "grad_norm": 0.8259170055389404, + "learning_rate": 2.4522313710165765e-05, + "loss": 0.6749, + "step": 2968 + }, + { + "epoch": 1.084352747854665, + "grad_norm": 0.7047184705734253, + "learning_rate": 2.4508338909578817e-05, + "loss": 0.6255, + "step": 2969 + }, + { + "epoch": 1.0847179112652912, + "grad_norm": 0.8270784020423889, + "learning_rate": 2.4494361789632405e-05, + "loss": 0.6633, + "step": 2970 + }, + { + "epoch": 1.0850830746759175, + "grad_norm": 0.8580787181854248, + "learning_rate": 2.4480382357517195e-05, + "loss": 0.6518, + "step": 2971 + }, + { + "epoch": 1.0854482380865438, + "grad_norm": 0.7827367782592773, + "learning_rate": 2.4466400620425054e-05, + "loss": 0.6218, + "step": 2972 + }, + { + "epoch": 1.08581340149717, + "grad_norm": 0.8759443163871765, + "learning_rate": 2.4452416585549018e-05, + "loss": 0.627, + "step": 2973 + }, + { + "epoch": 1.0861785649077962, + "grad_norm": 0.9673501253128052, + "learning_rate": 2.443843026008331e-05, + "loss": 0.6233, + "step": 2974 + }, + { + "epoch": 1.0865437283184225, + "grad_norm": 0.932388186454773, + "learning_rate": 2.4424441651223343e-05, + "loss": 0.6277, + "step": 2975 + }, + { + "epoch": 1.0869088917290488, + "grad_norm": 0.7365838289260864, + "learning_rate": 2.4410450766165688e-05, + "loss": 0.5957, + "step": 2976 + }, + { + "epoch": 1.087274055139675, + "grad_norm": 1.4103069305419922, + "learning_rate": 2.439645761210809e-05, + "loss": 0.661, + "step": 2977 + }, + { + "epoch": 1.0876392185503012, + "grad_norm": 0.9331754446029663, + "learning_rate": 2.438246219624947e-05, + "loss": 0.652, + "step": 2978 + }, + { + "epoch": 1.0880043819609275, + "grad_norm": 0.980294406414032, + "learning_rate": 2.4368464525789905e-05, + "loss": 0.645, + "step": 2979 + }, + { + "epoch": 1.0883695453715538, + "grad_norm": 0.8242958784103394, + "learning_rate": 2.435446460793064e-05, + "loss": 0.6616, + "step": 2980 + }, + { + "epoch": 1.08873470878218, + "grad_norm": 0.8177999258041382, + "learning_rate": 2.4340462449874063e-05, + "loss": 0.5975, + "step": 2981 + }, + { + "epoch": 1.0890998721928062, + "grad_norm": 1.0282783508300781, + "learning_rate": 2.4326458058823735e-05, + "loss": 0.6388, + "step": 2982 + }, + { + "epoch": 1.0894650356034326, + "grad_norm": 1.0401636362075806, + "learning_rate": 2.4312451441984344e-05, + "loss": 0.6249, + "step": 2983 + }, + { + "epoch": 1.0898301990140589, + "grad_norm": 0.8768249154090881, + "learning_rate": 2.429844260656173e-05, + "loss": 0.6978, + "step": 2984 + }, + { + "epoch": 1.090195362424685, + "grad_norm": 1.2320053577423096, + "learning_rate": 2.4284431559762888e-05, + "loss": 0.6473, + "step": 2985 + }, + { + "epoch": 1.0905605258353113, + "grad_norm": 1.0827654600143433, + "learning_rate": 2.4270418308795923e-05, + "loss": 0.6978, + "step": 2986 + }, + { + "epoch": 1.0909256892459376, + "grad_norm": 0.8920133113861084, + "learning_rate": 2.4256402860870107e-05, + "loss": 0.6119, + "step": 2987 + }, + { + "epoch": 1.091290852656564, + "grad_norm": 0.7745959758758545, + "learning_rate": 2.424238522319581e-05, + "loss": 0.6356, + "step": 2988 + }, + { + "epoch": 1.09165601606719, + "grad_norm": 0.8089456558227539, + "learning_rate": 2.4228365402984562e-05, + "loss": 0.6567, + "step": 2989 + }, + { + "epoch": 1.0920211794778163, + "grad_norm": 1.0786186456680298, + "learning_rate": 2.4214343407448984e-05, + "loss": 0.6417, + "step": 2990 + }, + { + "epoch": 1.0923863428884426, + "grad_norm": 0.6472024321556091, + "learning_rate": 2.4200319243802826e-05, + "loss": 0.6252, + "step": 2991 + }, + { + "epoch": 1.092751506299069, + "grad_norm": 1.3685519695281982, + "learning_rate": 2.4186292919260975e-05, + "loss": 0.6237, + "step": 2992 + }, + { + "epoch": 1.093116669709695, + "grad_norm": 0.9059715867042542, + "learning_rate": 2.41722644410394e-05, + "loss": 0.5992, + "step": 2993 + }, + { + "epoch": 1.0934818331203213, + "grad_norm": 1.2271138429641724, + "learning_rate": 2.4158233816355185e-05, + "loss": 0.626, + "step": 2994 + }, + { + "epoch": 1.0938469965309476, + "grad_norm": 1.1302318572998047, + "learning_rate": 2.4144201052426543e-05, + "loss": 0.6304, + "step": 2995 + }, + { + "epoch": 1.094212159941574, + "grad_norm": 1.024825930595398, + "learning_rate": 2.413016615647275e-05, + "loss": 0.6524, + "step": 2996 + }, + { + "epoch": 1.0945773233522, + "grad_norm": 0.9111654758453369, + "learning_rate": 2.4116129135714208e-05, + "loss": 0.6465, + "step": 2997 + }, + { + "epoch": 1.0949424867628264, + "grad_norm": 0.8728148341178894, + "learning_rate": 2.410208999737239e-05, + "loss": 0.6064, + "step": 2998 + }, + { + "epoch": 1.0953076501734527, + "grad_norm": 1.0016210079193115, + "learning_rate": 2.408804874866987e-05, + "loss": 0.6364, + "step": 2999 + }, + { + "epoch": 1.095672813584079, + "grad_norm": 0.9198909401893616, + "learning_rate": 2.4074005396830322e-05, + "loss": 0.6321, + "step": 3000 + }, + { + "epoch": 1.096037976994705, + "grad_norm": 1.169684648513794, + "learning_rate": 2.4059959949078467e-05, + "loss": 0.6309, + "step": 3001 + }, + { + "epoch": 1.0964031404053314, + "grad_norm": 0.8687719702720642, + "learning_rate": 2.4045912412640146e-05, + "loss": 0.6444, + "step": 3002 + }, + { + "epoch": 1.0967683038159577, + "grad_norm": 0.9157119393348694, + "learning_rate": 2.4031862794742238e-05, + "loss": 0.6494, + "step": 3003 + }, + { + "epoch": 1.097133467226584, + "grad_norm": 0.973338782787323, + "learning_rate": 2.401781110261271e-05, + "loss": 0.634, + "step": 3004 + }, + { + "epoch": 1.09749863063721, + "grad_norm": 0.8346941471099854, + "learning_rate": 2.400375734348059e-05, + "loss": 0.614, + "step": 3005 + }, + { + "epoch": 1.0978637940478364, + "grad_norm": 0.9992729425430298, + "learning_rate": 2.3989701524575976e-05, + "loss": 0.6757, + "step": 3006 + }, + { + "epoch": 1.0982289574584627, + "grad_norm": 0.917357325553894, + "learning_rate": 2.3975643653130032e-05, + "loss": 0.6708, + "step": 3007 + }, + { + "epoch": 1.0985941208690888, + "grad_norm": 0.9452908635139465, + "learning_rate": 2.3961583736374955e-05, + "loss": 0.6424, + "step": 3008 + }, + { + "epoch": 1.0989592842797151, + "grad_norm": 0.9059060215950012, + "learning_rate": 2.394752178154402e-05, + "loss": 0.6346, + "step": 3009 + }, + { + "epoch": 1.0993244476903414, + "grad_norm": 2.1735620498657227, + "learning_rate": 2.393345779587153e-05, + "loss": 0.6439, + "step": 3010 + }, + { + "epoch": 1.0996896111009677, + "grad_norm": 0.8743827939033508, + "learning_rate": 2.3919391786592842e-05, + "loss": 0.6852, + "step": 3011 + }, + { + "epoch": 1.1000547745115938, + "grad_norm": 0.6446078419685364, + "learning_rate": 2.3905323760944356e-05, + "loss": 0.6317, + "step": 3012 + }, + { + "epoch": 1.1004199379222201, + "grad_norm": 1.1510143280029297, + "learning_rate": 2.3891253726163505e-05, + "loss": 0.6357, + "step": 3013 + }, + { + "epoch": 1.1007851013328465, + "grad_norm": 1.0216180086135864, + "learning_rate": 2.387718168948876e-05, + "loss": 0.6357, + "step": 3014 + }, + { + "epoch": 1.1011502647434728, + "grad_norm": 1.133445143699646, + "learning_rate": 2.3863107658159614e-05, + "loss": 0.7019, + "step": 3015 + }, + { + "epoch": 1.1015154281540989, + "grad_norm": 1.2700881958007812, + "learning_rate": 2.38490316394166e-05, + "loss": 0.6473, + "step": 3016 + }, + { + "epoch": 1.1018805915647252, + "grad_norm": 0.7656792402267456, + "learning_rate": 2.3834953640501262e-05, + "loss": 0.6409, + "step": 3017 + }, + { + "epoch": 1.1022457549753515, + "grad_norm": 0.8528900146484375, + "learning_rate": 2.3820873668656154e-05, + "loss": 0.6619, + "step": 3018 + }, + { + "epoch": 1.1026109183859778, + "grad_norm": 1.3215256929397583, + "learning_rate": 2.380679173112487e-05, + "loss": 0.6511, + "step": 3019 + }, + { + "epoch": 1.102976081796604, + "grad_norm": 2.560433864593506, + "learning_rate": 2.3792707835151995e-05, + "loss": 0.6196, + "step": 3020 + }, + { + "epoch": 1.1033412452072302, + "grad_norm": 0.8991859555244446, + "learning_rate": 2.3778621987983133e-05, + "loss": 0.6516, + "step": 3021 + }, + { + "epoch": 1.1037064086178565, + "grad_norm": 0.9166661500930786, + "learning_rate": 2.3764534196864886e-05, + "loss": 0.6185, + "step": 3022 + }, + { + "epoch": 1.1040715720284828, + "grad_norm": 1.0352705717086792, + "learning_rate": 2.3750444469044856e-05, + "loss": 0.6555, + "step": 3023 + }, + { + "epoch": 1.104436735439109, + "grad_norm": 0.8666485548019409, + "learning_rate": 2.3736352811771647e-05, + "loss": 0.6609, + "step": 3024 + }, + { + "epoch": 1.1048018988497352, + "grad_norm": 0.8922779560089111, + "learning_rate": 2.3722259232294835e-05, + "loss": 0.6678, + "step": 3025 + }, + { + "epoch": 1.1051670622603615, + "grad_norm": 3.4590539932250977, + "learning_rate": 2.370816373786502e-05, + "loss": 0.6838, + "step": 3026 + }, + { + "epoch": 1.1055322256709879, + "grad_norm": 0.7968533635139465, + "learning_rate": 2.3694066335733758e-05, + "loss": 0.6302, + "step": 3027 + }, + { + "epoch": 1.105897389081614, + "grad_norm": 0.7470107674598694, + "learning_rate": 2.3679967033153605e-05, + "loss": 0.6379, + "step": 3028 + }, + { + "epoch": 1.1062625524922403, + "grad_norm": 1.3071082830429077, + "learning_rate": 2.366586583737808e-05, + "loss": 0.6602, + "step": 3029 + }, + { + "epoch": 1.1066277159028666, + "grad_norm": 1.1564059257507324, + "learning_rate": 2.3651762755661683e-05, + "loss": 0.6407, + "step": 3030 + }, + { + "epoch": 1.1069928793134929, + "grad_norm": 0.8905368447303772, + "learning_rate": 2.3637657795259883e-05, + "loss": 0.6196, + "step": 3031 + }, + { + "epoch": 1.107358042724119, + "grad_norm": 0.910971462726593, + "learning_rate": 2.3623550963429117e-05, + "loss": 0.6431, + "step": 3032 + }, + { + "epoch": 1.1077232061347453, + "grad_norm": 1.8840655088424683, + "learning_rate": 2.3609442267426787e-05, + "loss": 0.6377, + "step": 3033 + }, + { + "epoch": 1.1080883695453716, + "grad_norm": 1.1728452444076538, + "learning_rate": 2.3595331714511243e-05, + "loss": 0.6516, + "step": 3034 + }, + { + "epoch": 1.108453532955998, + "grad_norm": 0.8091674447059631, + "learning_rate": 2.3581219311941813e-05, + "loss": 0.6046, + "step": 3035 + }, + { + "epoch": 1.108818696366624, + "grad_norm": 0.9540202617645264, + "learning_rate": 2.3567105066978744e-05, + "loss": 0.6034, + "step": 3036 + }, + { + "epoch": 1.1091838597772503, + "grad_norm": 0.9614107012748718, + "learning_rate": 2.355298898688326e-05, + "loss": 0.6391, + "step": 3037 + }, + { + "epoch": 1.1095490231878766, + "grad_norm": 0.8953191041946411, + "learning_rate": 2.3538871078917514e-05, + "loss": 0.6394, + "step": 3038 + }, + { + "epoch": 1.1099141865985027, + "grad_norm": 0.7751514911651611, + "learning_rate": 2.3524751350344602e-05, + "loss": 0.6622, + "step": 3039 + }, + { + "epoch": 1.110279350009129, + "grad_norm": 0.9415932297706604, + "learning_rate": 2.3510629808428568e-05, + "loss": 0.6224, + "step": 3040 + }, + { + "epoch": 1.1106445134197553, + "grad_norm": 0.9658535122871399, + "learning_rate": 2.349650646043436e-05, + "loss": 0.6577, + "step": 3041 + }, + { + "epoch": 1.1110096768303817, + "grad_norm": 0.9596617221832275, + "learning_rate": 2.3482381313627886e-05, + "loss": 0.6557, + "step": 3042 + }, + { + "epoch": 1.111374840241008, + "grad_norm": 0.7701271176338196, + "learning_rate": 2.3468254375275973e-05, + "loss": 0.6545, + "step": 3043 + }, + { + "epoch": 1.111740003651634, + "grad_norm": 0.6224210262298584, + "learning_rate": 2.3454125652646348e-05, + "loss": 0.6248, + "step": 3044 + }, + { + "epoch": 1.1121051670622604, + "grad_norm": 0.8339946866035461, + "learning_rate": 2.343999515300769e-05, + "loss": 0.6459, + "step": 3045 + }, + { + "epoch": 1.1124703304728867, + "grad_norm": 0.9032711386680603, + "learning_rate": 2.3425862883629553e-05, + "loss": 0.6484, + "step": 3046 + }, + { + "epoch": 1.1128354938835128, + "grad_norm": 0.9951335191726685, + "learning_rate": 2.3411728851782442e-05, + "loss": 0.6472, + "step": 3047 + }, + { + "epoch": 1.113200657294139, + "grad_norm": 0.9903227686882019, + "learning_rate": 2.3397593064737737e-05, + "loss": 0.6292, + "step": 3048 + }, + { + "epoch": 1.1135658207047654, + "grad_norm": 0.7369117140769958, + "learning_rate": 2.338345552976774e-05, + "loss": 0.6253, + "step": 3049 + }, + { + "epoch": 1.1139309841153917, + "grad_norm": 1.0023393630981445, + "learning_rate": 2.3369316254145636e-05, + "loss": 0.6415, + "step": 3050 + }, + { + "epoch": 1.1142961475260178, + "grad_norm": 0.9049243927001953, + "learning_rate": 2.3355175245145526e-05, + "loss": 0.6213, + "step": 3051 + }, + { + "epoch": 1.1146613109366441, + "grad_norm": 0.9037688970565796, + "learning_rate": 2.3341032510042387e-05, + "loss": 0.6517, + "step": 3052 + }, + { + "epoch": 1.1150264743472704, + "grad_norm": 0.8866589069366455, + "learning_rate": 2.3326888056112086e-05, + "loss": 0.6544, + "step": 3053 + }, + { + "epoch": 1.1153916377578967, + "grad_norm": 1.0946450233459473, + "learning_rate": 2.3312741890631383e-05, + "loss": 0.5887, + "step": 3054 + }, + { + "epoch": 1.1157568011685228, + "grad_norm": 0.8456511497497559, + "learning_rate": 2.3298594020877913e-05, + "loss": 0.6501, + "step": 3055 + }, + { + "epoch": 1.1161219645791491, + "grad_norm": 1.074009656906128, + "learning_rate": 2.328444445413018e-05, + "loss": 0.6182, + "step": 3056 + }, + { + "epoch": 1.1164871279897755, + "grad_norm": 0.6773717999458313, + "learning_rate": 2.3270293197667573e-05, + "loss": 0.6216, + "step": 3057 + }, + { + "epoch": 1.1168522914004018, + "grad_norm": 0.8690649271011353, + "learning_rate": 2.325614025877034e-05, + "loss": 0.665, + "step": 3058 + }, + { + "epoch": 1.1172174548110279, + "grad_norm": 0.969440221786499, + "learning_rate": 2.3241985644719603e-05, + "loss": 0.6223, + "step": 3059 + }, + { + "epoch": 1.1175826182216542, + "grad_norm": 0.9241883754730225, + "learning_rate": 2.3227829362797355e-05, + "loss": 0.6335, + "step": 3060 + }, + { + "epoch": 1.1179477816322805, + "grad_norm": 0.9070743322372437, + "learning_rate": 2.3213671420286413e-05, + "loss": 0.6459, + "step": 3061 + }, + { + "epoch": 1.1183129450429068, + "grad_norm": 0.7353122234344482, + "learning_rate": 2.3199511824470487e-05, + "loss": 0.6281, + "step": 3062 + }, + { + "epoch": 1.1186781084535329, + "grad_norm": 0.9907974004745483, + "learning_rate": 2.318535058263412e-05, + "loss": 0.6541, + "step": 3063 + }, + { + "epoch": 1.1190432718641592, + "grad_norm": 1.0479435920715332, + "learning_rate": 2.3171187702062693e-05, + "loss": 0.6409, + "step": 3064 + }, + { + "epoch": 1.1194084352747855, + "grad_norm": 1.9881740808486938, + "learning_rate": 2.3157023190042448e-05, + "loss": 0.6329, + "step": 3065 + }, + { + "epoch": 1.1197735986854118, + "grad_norm": 1.7955461740493774, + "learning_rate": 2.3142857053860454e-05, + "loss": 0.6378, + "step": 3066 + }, + { + "epoch": 1.120138762096038, + "grad_norm": 0.7752870917320251, + "learning_rate": 2.312868930080462e-05, + "loss": 0.6451, + "step": 3067 + }, + { + "epoch": 1.1205039255066642, + "grad_norm": 0.704662561416626, + "learning_rate": 2.3114519938163683e-05, + "loss": 0.6353, + "step": 3068 + }, + { + "epoch": 1.1208690889172905, + "grad_norm": 1.5579748153686523, + "learning_rate": 2.3100348973227224e-05, + "loss": 0.6571, + "step": 3069 + }, + { + "epoch": 1.1212342523279166, + "grad_norm": 1.206935167312622, + "learning_rate": 2.3086176413285627e-05, + "loss": 0.6174, + "step": 3070 + }, + { + "epoch": 1.121599415738543, + "grad_norm": 1.1550734043121338, + "learning_rate": 2.3072002265630102e-05, + "loss": 0.6234, + "step": 3071 + }, + { + "epoch": 1.1219645791491692, + "grad_norm": 0.8067594170570374, + "learning_rate": 2.3057826537552684e-05, + "loss": 0.6423, + "step": 3072 + }, + { + "epoch": 1.1223297425597956, + "grad_norm": 1.1658196449279785, + "learning_rate": 2.3043649236346216e-05, + "loss": 0.6543, + "step": 3073 + }, + { + "epoch": 1.1226949059704219, + "grad_norm": 0.8631314039230347, + "learning_rate": 2.302947036930435e-05, + "loss": 0.6254, + "step": 3074 + }, + { + "epoch": 1.123060069381048, + "grad_norm": 0.8141615986824036, + "learning_rate": 2.301528994372154e-05, + "loss": 0.646, + "step": 3075 + }, + { + "epoch": 1.1234252327916743, + "grad_norm": 1.31234610080719, + "learning_rate": 2.3001107966893054e-05, + "loss": 0.6113, + "step": 3076 + }, + { + "epoch": 1.1237903962023006, + "grad_norm": 0.7371488213539124, + "learning_rate": 2.2986924446114947e-05, + "loss": 0.6483, + "step": 3077 + }, + { + "epoch": 1.1241555596129267, + "grad_norm": 0.7674611806869507, + "learning_rate": 2.2972739388684068e-05, + "loss": 0.6209, + "step": 3078 + }, + { + "epoch": 1.124520723023553, + "grad_norm": 4.130733013153076, + "learning_rate": 2.2958552801898068e-05, + "loss": 0.6488, + "step": 3079 + }, + { + "epoch": 1.1248858864341793, + "grad_norm": 0.8405309915542603, + "learning_rate": 2.294436469305536e-05, + "loss": 0.6263, + "step": 3080 + }, + { + "epoch": 1.1252510498448056, + "grad_norm": 0.8591222167015076, + "learning_rate": 2.2930175069455175e-05, + "loss": 0.653, + "step": 3081 + }, + { + "epoch": 1.125616213255432, + "grad_norm": 0.7199135422706604, + "learning_rate": 2.2915983938397494e-05, + "loss": 0.6292, + "step": 3082 + }, + { + "epoch": 1.125981376666058, + "grad_norm": 1.0843881368637085, + "learning_rate": 2.290179130718309e-05, + "loss": 0.6216, + "step": 3083 + }, + { + "epoch": 1.1263465400766843, + "grad_norm": 0.9486679434776306, + "learning_rate": 2.2887597183113503e-05, + "loss": 0.6062, + "step": 3084 + }, + { + "epoch": 1.1267117034873106, + "grad_norm": 1.3532755374908447, + "learning_rate": 2.2873401573491033e-05, + "loss": 0.6511, + "step": 3085 + }, + { + "epoch": 1.1270768668979367, + "grad_norm": 0.8381106853485107, + "learning_rate": 2.2859204485618758e-05, + "loss": 0.6216, + "step": 3086 + }, + { + "epoch": 1.127442030308563, + "grad_norm": 0.9955036640167236, + "learning_rate": 2.2845005926800502e-05, + "loss": 0.6321, + "step": 3087 + }, + { + "epoch": 1.1278071937191894, + "grad_norm": 0.7981411218643188, + "learning_rate": 2.2830805904340867e-05, + "loss": 0.6302, + "step": 3088 + }, + { + "epoch": 1.1281723571298157, + "grad_norm": 0.6035706400871277, + "learning_rate": 2.281660442554518e-05, + "loss": 0.639, + "step": 3089 + }, + { + "epoch": 1.1285375205404418, + "grad_norm": 0.7041566371917725, + "learning_rate": 2.2802401497719545e-05, + "loss": 0.6393, + "step": 3090 + }, + { + "epoch": 1.128902683951068, + "grad_norm": 0.9806358218193054, + "learning_rate": 2.2788197128170798e-05, + "loss": 0.5987, + "step": 3091 + }, + { + "epoch": 1.1292678473616944, + "grad_norm": 1.0443167686462402, + "learning_rate": 2.27739913242065e-05, + "loss": 0.6513, + "step": 3092 + }, + { + "epoch": 1.1296330107723207, + "grad_norm": 0.8455133438110352, + "learning_rate": 2.2759784093134987e-05, + "loss": 0.6214, + "step": 3093 + }, + { + "epoch": 1.1299981741829468, + "grad_norm": 0.9792264699935913, + "learning_rate": 2.2745575442265297e-05, + "loss": 0.6476, + "step": 3094 + }, + { + "epoch": 1.130363337593573, + "grad_norm": 0.7479596138000488, + "learning_rate": 2.273136537890722e-05, + "loss": 0.6263, + "step": 3095 + }, + { + "epoch": 1.1307285010041994, + "grad_norm": 2.3983466625213623, + "learning_rate": 2.271715391037126e-05, + "loss": 0.6525, + "step": 3096 + }, + { + "epoch": 1.1310936644148257, + "grad_norm": 0.9311935901641846, + "learning_rate": 2.2702941043968635e-05, + "loss": 0.6346, + "step": 3097 + }, + { + "epoch": 1.1314588278254518, + "grad_norm": 0.9133360981941223, + "learning_rate": 2.2688726787011315e-05, + "loss": 0.6202, + "step": 3098 + }, + { + "epoch": 1.1318239912360781, + "grad_norm": 0.6708499193191528, + "learning_rate": 2.267451114681195e-05, + "loss": 0.647, + "step": 3099 + }, + { + "epoch": 1.1321891546467044, + "grad_norm": 0.8047714829444885, + "learning_rate": 2.2660294130683923e-05, + "loss": 0.6539, + "step": 3100 + }, + { + "epoch": 1.1325543180573305, + "grad_norm": 0.9618435502052307, + "learning_rate": 2.2646075745941315e-05, + "loss": 0.6393, + "step": 3101 + }, + { + "epoch": 1.1329194814679568, + "grad_norm": 0.824882984161377, + "learning_rate": 2.2631855999898914e-05, + "loss": 0.6126, + "step": 3102 + }, + { + "epoch": 1.1332846448785832, + "grad_norm": 0.9720734357833862, + "learning_rate": 2.261763489987222e-05, + "loss": 0.5818, + "step": 3103 + }, + { + "epoch": 1.1336498082892095, + "grad_norm": 1.0631283521652222, + "learning_rate": 2.26034124531774e-05, + "loss": 0.6443, + "step": 3104 + }, + { + "epoch": 1.1340149716998358, + "grad_norm": 1.071035623550415, + "learning_rate": 2.2589188667131346e-05, + "loss": 0.6133, + "step": 3105 + }, + { + "epoch": 1.1343801351104619, + "grad_norm": 1.0741324424743652, + "learning_rate": 2.257496354905162e-05, + "loss": 0.5823, + "step": 3106 + }, + { + "epoch": 1.1347452985210882, + "grad_norm": 0.8549277186393738, + "learning_rate": 2.2560737106256472e-05, + "loss": 0.6235, + "step": 3107 + }, + { + "epoch": 1.1351104619317145, + "grad_norm": 0.995807945728302, + "learning_rate": 2.254650934606484e-05, + "loss": 0.6291, + "step": 3108 + }, + { + "epoch": 1.1354756253423406, + "grad_norm": 0.6567954421043396, + "learning_rate": 2.2532280275796333e-05, + "loss": 0.6292, + "step": 3109 + }, + { + "epoch": 1.135840788752967, + "grad_norm": 0.9623636603355408, + "learning_rate": 2.251804990277125e-05, + "loss": 0.6548, + "step": 3110 + }, + { + "epoch": 1.1362059521635932, + "grad_norm": 0.9804652333259583, + "learning_rate": 2.250381823431052e-05, + "loss": 0.6023, + "step": 3111 + }, + { + "epoch": 1.1365711155742195, + "grad_norm": 0.9299272298812866, + "learning_rate": 2.248958527773579e-05, + "loss": 0.6508, + "step": 3112 + }, + { + "epoch": 1.1369362789848458, + "grad_norm": 1.0050239562988281, + "learning_rate": 2.2475351040369327e-05, + "loss": 0.614, + "step": 3113 + }, + { + "epoch": 1.137301442395472, + "grad_norm": 0.9981377720832825, + "learning_rate": 2.2461115529534084e-05, + "loss": 0.6364, + "step": 3114 + }, + { + "epoch": 1.1376666058060982, + "grad_norm": 0.8583781719207764, + "learning_rate": 2.244687875255367e-05, + "loss": 0.6411, + "step": 3115 + }, + { + "epoch": 1.1380317692167246, + "grad_norm": 1.0073418617248535, + "learning_rate": 2.2432640716752316e-05, + "loss": 0.6619, + "step": 3116 + }, + { + "epoch": 1.1383969326273506, + "grad_norm": 0.8421041965484619, + "learning_rate": 2.241840142945494e-05, + "loss": 0.6255, + "step": 3117 + }, + { + "epoch": 1.138762096037977, + "grad_norm": 0.8554766178131104, + "learning_rate": 2.2404160897987056e-05, + "loss": 0.5789, + "step": 3118 + }, + { + "epoch": 1.1391272594486033, + "grad_norm": 0.9527449607849121, + "learning_rate": 2.2389919129674872e-05, + "loss": 0.6251, + "step": 3119 + }, + { + "epoch": 1.1394924228592296, + "grad_norm": 1.126502275466919, + "learning_rate": 2.2375676131845196e-05, + "loss": 0.6461, + "step": 3120 + }, + { + "epoch": 1.1398575862698557, + "grad_norm": 0.7084481120109558, + "learning_rate": 2.236143191182548e-05, + "loss": 0.625, + "step": 3121 + }, + { + "epoch": 1.140222749680482, + "grad_norm": 0.8147100806236267, + "learning_rate": 2.2347186476943805e-05, + "loss": 0.6227, + "step": 3122 + }, + { + "epoch": 1.1405879130911083, + "grad_norm": 1.3247547149658203, + "learning_rate": 2.2332939834528875e-05, + "loss": 0.6486, + "step": 3123 + }, + { + "epoch": 1.1409530765017346, + "grad_norm": 0.6833971738815308, + "learning_rate": 2.2318691991910014e-05, + "loss": 0.6393, + "step": 3124 + }, + { + "epoch": 1.1413182399123607, + "grad_norm": 1.0287319421768188, + "learning_rate": 2.2304442956417164e-05, + "loss": 0.6579, + "step": 3125 + }, + { + "epoch": 1.141683403322987, + "grad_norm": 0.6814072728157043, + "learning_rate": 2.229019273538089e-05, + "loss": 0.6415, + "step": 3126 + }, + { + "epoch": 1.1420485667336133, + "grad_norm": 0.8899560570716858, + "learning_rate": 2.227594133613235e-05, + "loss": 0.6276, + "step": 3127 + }, + { + "epoch": 1.1424137301442396, + "grad_norm": 1.134264349937439, + "learning_rate": 2.2261688766003317e-05, + "loss": 0.6347, + "step": 3128 + }, + { + "epoch": 1.1427788935548657, + "grad_norm": 1.2518939971923828, + "learning_rate": 2.2247435032326178e-05, + "loss": 0.6269, + "step": 3129 + }, + { + "epoch": 1.143144056965492, + "grad_norm": 0.8139986991882324, + "learning_rate": 2.2233180142433894e-05, + "loss": 0.6105, + "step": 3130 + }, + { + "epoch": 1.1435092203761184, + "grad_norm": 0.8457422256469727, + "learning_rate": 2.2218924103660035e-05, + "loss": 0.6326, + "step": 3131 + }, + { + "epoch": 1.1438743837867447, + "grad_norm": 0.9971619844436646, + "learning_rate": 2.2204666923338772e-05, + "loss": 0.6281, + "step": 3132 + }, + { + "epoch": 1.1442395471973708, + "grad_norm": 1.9557191133499146, + "learning_rate": 2.219040860880484e-05, + "loss": 0.6534, + "step": 3133 + }, + { + "epoch": 1.144604710607997, + "grad_norm": 0.7603287100791931, + "learning_rate": 2.217614916739358e-05, + "loss": 0.6303, + "step": 3134 + }, + { + "epoch": 1.1449698740186234, + "grad_norm": 0.8402383327484131, + "learning_rate": 2.2161888606440885e-05, + "loss": 0.639, + "step": 3135 + }, + { + "epoch": 1.1453350374292497, + "grad_norm": 1.0194815397262573, + "learning_rate": 2.2147626933283265e-05, + "loss": 0.6159, + "step": 3136 + }, + { + "epoch": 1.1457002008398758, + "grad_norm": 0.9987945556640625, + "learning_rate": 2.213336415525776e-05, + "loss": 0.5863, + "step": 3137 + }, + { + "epoch": 1.146065364250502, + "grad_norm": 1.2469425201416016, + "learning_rate": 2.2119100279702005e-05, + "loss": 0.6081, + "step": 3138 + }, + { + "epoch": 1.1464305276611284, + "grad_norm": 0.7377803921699524, + "learning_rate": 2.2104835313954193e-05, + "loss": 0.5949, + "step": 3139 + }, + { + "epoch": 1.1467956910717545, + "grad_norm": 0.714694619178772, + "learning_rate": 2.209056926535307e-05, + "loss": 0.6082, + "step": 3140 + }, + { + "epoch": 1.1471608544823808, + "grad_norm": 0.9076855778694153, + "learning_rate": 2.2076302141237953e-05, + "loss": 0.6224, + "step": 3141 + }, + { + "epoch": 1.1475260178930071, + "grad_norm": 0.9985514283180237, + "learning_rate": 2.2062033948948697e-05, + "loss": 0.6256, + "step": 3142 + }, + { + "epoch": 1.1478911813036334, + "grad_norm": 0.9677404761314392, + "learning_rate": 2.2047764695825725e-05, + "loss": 0.6055, + "step": 3143 + }, + { + "epoch": 1.1482563447142597, + "grad_norm": 0.9319841265678406, + "learning_rate": 2.2033494389209988e-05, + "loss": 0.6483, + "step": 3144 + }, + { + "epoch": 1.1486215081248858, + "grad_norm": 0.7451246976852417, + "learning_rate": 2.201922303644298e-05, + "loss": 0.6267, + "step": 3145 + }, + { + "epoch": 1.1489866715355121, + "grad_norm": 0.7892596125602722, + "learning_rate": 2.200495064486675e-05, + "loss": 0.6456, + "step": 3146 + }, + { + "epoch": 1.1493518349461385, + "grad_norm": 0.8904467821121216, + "learning_rate": 2.1990677221823865e-05, + "loss": 0.6443, + "step": 3147 + }, + { + "epoch": 1.1497169983567646, + "grad_norm": 0.7229827046394348, + "learning_rate": 2.1976402774657432e-05, + "loss": 0.6479, + "step": 3148 + }, + { + "epoch": 1.1500821617673909, + "grad_norm": 0.9828261137008667, + "learning_rate": 2.196212731071108e-05, + "loss": 0.6593, + "step": 3149 + }, + { + "epoch": 1.1504473251780172, + "grad_norm": 0.8736459612846375, + "learning_rate": 2.194785083732896e-05, + "loss": 0.6517, + "step": 3150 + }, + { + "epoch": 1.1508124885886435, + "grad_norm": 0.7535048723220825, + "learning_rate": 2.193357336185575e-05, + "loss": 0.6409, + "step": 3151 + }, + { + "epoch": 1.1511776519992696, + "grad_norm": 0.9029123783111572, + "learning_rate": 2.191929489163663e-05, + "loss": 0.6156, + "step": 3152 + }, + { + "epoch": 1.151542815409896, + "grad_norm": 1.1329669952392578, + "learning_rate": 2.1905015434017313e-05, + "loss": 0.6435, + "step": 3153 + }, + { + "epoch": 1.1519079788205222, + "grad_norm": 0.9592320322990417, + "learning_rate": 2.1890734996343985e-05, + "loss": 0.6366, + "step": 3154 + }, + { + "epoch": 1.1522731422311485, + "grad_norm": 0.7683483958244324, + "learning_rate": 2.1876453585963384e-05, + "loss": 0.634, + "step": 3155 + }, + { + "epoch": 1.1526383056417746, + "grad_norm": 0.794584333896637, + "learning_rate": 2.1862171210222708e-05, + "loss": 0.6211, + "step": 3156 + }, + { + "epoch": 1.153003469052401, + "grad_norm": 0.8724021315574646, + "learning_rate": 2.1847887876469666e-05, + "loss": 0.6047, + "step": 3157 + }, + { + "epoch": 1.1533686324630272, + "grad_norm": 1.1390639543533325, + "learning_rate": 2.1833603592052464e-05, + "loss": 0.6597, + "step": 3158 + }, + { + "epoch": 1.1537337958736535, + "grad_norm": 0.7585846185684204, + "learning_rate": 2.181931836431979e-05, + "loss": 0.6213, + "step": 3159 + }, + { + "epoch": 1.1540989592842796, + "grad_norm": 1.4673725366592407, + "learning_rate": 2.1805032200620824e-05, + "loss": 0.6419, + "step": 3160 + }, + { + "epoch": 1.154464122694906, + "grad_norm": 1.0087941884994507, + "learning_rate": 2.1790745108305222e-05, + "loss": 0.6164, + "step": 3161 + }, + { + "epoch": 1.1548292861055323, + "grad_norm": 0.998311460018158, + "learning_rate": 2.1776457094723115e-05, + "loss": 0.6334, + "step": 3162 + }, + { + "epoch": 1.1551944495161586, + "grad_norm": 1.0328058004379272, + "learning_rate": 2.176216816722513e-05, + "loss": 0.6338, + "step": 3163 + }, + { + "epoch": 1.1555596129267847, + "grad_norm": 0.9341315627098083, + "learning_rate": 2.1747878333162326e-05, + "loss": 0.6263, + "step": 3164 + }, + { + "epoch": 1.155924776337411, + "grad_norm": 0.8238332271575928, + "learning_rate": 2.173358759988626e-05, + "loss": 0.6186, + "step": 3165 + }, + { + "epoch": 1.1562899397480373, + "grad_norm": 1.0291343927383423, + "learning_rate": 2.1719295974748934e-05, + "loss": 0.6154, + "step": 3166 + }, + { + "epoch": 1.1566551031586636, + "grad_norm": 0.6935338377952576, + "learning_rate": 2.1705003465102818e-05, + "loss": 0.6333, + "step": 3167 + }, + { + "epoch": 1.1570202665692897, + "grad_norm": 0.8157773613929749, + "learning_rate": 2.1690710078300847e-05, + "loss": 0.628, + "step": 3168 + }, + { + "epoch": 1.157385429979916, + "grad_norm": 0.8037132620811462, + "learning_rate": 2.167641582169637e-05, + "loss": 0.6138, + "step": 3169 + }, + { + "epoch": 1.1577505933905423, + "grad_norm": 0.714047372341156, + "learning_rate": 2.166212070264324e-05, + "loss": 0.6261, + "step": 3170 + }, + { + "epoch": 1.1581157568011684, + "grad_norm": 0.7886608242988586, + "learning_rate": 2.1647824728495696e-05, + "loss": 0.6274, + "step": 3171 + }, + { + "epoch": 1.1584809202117947, + "grad_norm": 0.853618860244751, + "learning_rate": 2.1633527906608457e-05, + "loss": 0.5916, + "step": 3172 + }, + { + "epoch": 1.158846083622421, + "grad_norm": 1.2856926918029785, + "learning_rate": 2.1619230244336652e-05, + "loss": 0.6022, + "step": 3173 + }, + { + "epoch": 1.1592112470330473, + "grad_norm": 0.9644615650177002, + "learning_rate": 2.1604931749035865e-05, + "loss": 0.6204, + "step": 3174 + }, + { + "epoch": 1.1595764104436737, + "grad_norm": 0.8236731886863708, + "learning_rate": 2.1590632428062097e-05, + "loss": 0.6224, + "step": 3175 + }, + { + "epoch": 1.1599415738542997, + "grad_norm": 1.1724156141281128, + "learning_rate": 2.1576332288771776e-05, + "loss": 0.6074, + "step": 3176 + }, + { + "epoch": 1.160306737264926, + "grad_norm": 0.8694778084754944, + "learning_rate": 2.1562031338521745e-05, + "loss": 0.6204, + "step": 3177 + }, + { + "epoch": 1.1606719006755524, + "grad_norm": 0.8083097338676453, + "learning_rate": 2.1547729584669262e-05, + "loss": 0.6483, + "step": 3178 + }, + { + "epoch": 1.1610370640861785, + "grad_norm": 0.6772050857543945, + "learning_rate": 2.1533427034572022e-05, + "loss": 0.6016, + "step": 3179 + }, + { + "epoch": 1.1614022274968048, + "grad_norm": 1.0343115329742432, + "learning_rate": 2.1519123695588106e-05, + "loss": 0.6033, + "step": 3180 + }, + { + "epoch": 1.161767390907431, + "grad_norm": 0.8711481690406799, + "learning_rate": 2.1504819575076e-05, + "loss": 0.6153, + "step": 3181 + }, + { + "epoch": 1.1621325543180574, + "grad_norm": 0.8656068444252014, + "learning_rate": 2.1490514680394616e-05, + "loss": 0.6346, + "step": 3182 + }, + { + "epoch": 1.1624977177286837, + "grad_norm": 1.0679489374160767, + "learning_rate": 2.147620901890324e-05, + "loss": 0.6439, + "step": 3183 + }, + { + "epoch": 1.1628628811393098, + "grad_norm": 0.8058302998542786, + "learning_rate": 2.146190259796155e-05, + "loss": 0.6195, + "step": 3184 + }, + { + "epoch": 1.1632280445499361, + "grad_norm": 0.8528069853782654, + "learning_rate": 2.1447595424929647e-05, + "loss": 0.6425, + "step": 3185 + }, + { + "epoch": 1.1635932079605624, + "grad_norm": 0.697010338306427, + "learning_rate": 2.143328750716798e-05, + "loss": 0.6411, + "step": 3186 + }, + { + "epoch": 1.1639583713711885, + "grad_norm": 0.905610203742981, + "learning_rate": 2.141897885203741e-05, + "loss": 0.6241, + "step": 3187 + }, + { + "epoch": 1.1643235347818148, + "grad_norm": 0.9987830519676208, + "learning_rate": 2.140466946689915e-05, + "loss": 0.5917, + "step": 3188 + }, + { + "epoch": 1.1646886981924411, + "grad_norm": 1.0862075090408325, + "learning_rate": 2.1390359359114826e-05, + "loss": 0.6369, + "step": 3189 + }, + { + "epoch": 1.1650538616030675, + "grad_norm": 1.4116789102554321, + "learning_rate": 2.13760485360464e-05, + "loss": 0.6541, + "step": 3190 + }, + { + "epoch": 1.1654190250136935, + "grad_norm": 1.0040624141693115, + "learning_rate": 2.136173700505622e-05, + "loss": 0.6105, + "step": 3191 + }, + { + "epoch": 1.1657841884243199, + "grad_norm": 0.7681383490562439, + "learning_rate": 2.134742477350699e-05, + "loss": 0.581, + "step": 3192 + }, + { + "epoch": 1.1661493518349462, + "grad_norm": 0.7578985691070557, + "learning_rate": 2.133311184876179e-05, + "loss": 0.5801, + "step": 3193 + }, + { + "epoch": 1.1665145152455725, + "grad_norm": 0.8705235123634338, + "learning_rate": 2.1318798238184036e-05, + "loss": 0.6402, + "step": 3194 + }, + { + "epoch": 1.1668796786561986, + "grad_norm": 0.9274020195007324, + "learning_rate": 2.1304483949137503e-05, + "loss": 0.6138, + "step": 3195 + }, + { + "epoch": 1.1672448420668249, + "grad_norm": 0.9331055879592896, + "learning_rate": 2.1290168988986332e-05, + "loss": 0.655, + "step": 3196 + }, + { + "epoch": 1.1676100054774512, + "grad_norm": 0.8188350796699524, + "learning_rate": 2.127585336509498e-05, + "loss": 0.6006, + "step": 3197 + }, + { + "epoch": 1.1679751688880775, + "grad_norm": 1.0621793270111084, + "learning_rate": 2.1261537084828274e-05, + "loss": 0.6332, + "step": 3198 + }, + { + "epoch": 1.1683403322987036, + "grad_norm": 0.8340405225753784, + "learning_rate": 2.1247220155551357e-05, + "loss": 0.6096, + "step": 3199 + }, + { + "epoch": 1.16870549570933, + "grad_norm": 0.7741566896438599, + "learning_rate": 2.1232902584629716e-05, + "loss": 0.6539, + "step": 3200 + }, + { + "epoch": 1.1690706591199562, + "grad_norm": 0.9330441355705261, + "learning_rate": 2.121858437942917e-05, + "loss": 0.6381, + "step": 3201 + }, + { + "epoch": 1.1694358225305823, + "grad_norm": 1.090240240097046, + "learning_rate": 2.1204265547315862e-05, + "loss": 0.6595, + "step": 3202 + }, + { + "epoch": 1.1698009859412086, + "grad_norm": 0.7946920394897461, + "learning_rate": 2.1189946095656255e-05, + "loss": 0.6296, + "step": 3203 + }, + { + "epoch": 1.170166149351835, + "grad_norm": 0.6620867848396301, + "learning_rate": 2.117562603181713e-05, + "loss": 0.6378, + "step": 3204 + }, + { + "epoch": 1.1705313127624613, + "grad_norm": 0.7557404637336731, + "learning_rate": 2.116130536316558e-05, + "loss": 0.6008, + "step": 3205 + }, + { + "epoch": 1.1708964761730876, + "grad_norm": 0.7486023306846619, + "learning_rate": 2.114698409706903e-05, + "loss": 0.6088, + "step": 3206 + }, + { + "epoch": 1.1712616395837137, + "grad_norm": 0.8459969162940979, + "learning_rate": 2.1132662240895182e-05, + "loss": 0.6185, + "step": 3207 + }, + { + "epoch": 1.17162680299434, + "grad_norm": 1.013345718383789, + "learning_rate": 2.111833980201207e-05, + "loss": 0.6066, + "step": 3208 + }, + { + "epoch": 1.1719919664049663, + "grad_norm": 1.3973004817962646, + "learning_rate": 2.1104016787787994e-05, + "loss": 0.6286, + "step": 3209 + }, + { + "epoch": 1.1723571298155924, + "grad_norm": 0.8168177008628845, + "learning_rate": 2.108969320559159e-05, + "loss": 0.6224, + "step": 3210 + }, + { + "epoch": 1.1727222932262187, + "grad_norm": 0.964625895023346, + "learning_rate": 2.107536906279176e-05, + "loss": 0.6149, + "step": 3211 + }, + { + "epoch": 1.173087456636845, + "grad_norm": 0.956104576587677, + "learning_rate": 2.106104436675769e-05, + "loss": 0.6476, + "step": 3212 + }, + { + "epoch": 1.1734526200474713, + "grad_norm": 1.1153203248977661, + "learning_rate": 2.1046719124858882e-05, + "loss": 0.6167, + "step": 3213 + }, + { + "epoch": 1.1738177834580976, + "grad_norm": 0.7539756298065186, + "learning_rate": 2.1032393344465077e-05, + "loss": 0.6234, + "step": 3214 + }, + { + "epoch": 1.1741829468687237, + "grad_norm": 1.6226251125335693, + "learning_rate": 2.1018067032946327e-05, + "loss": 0.6169, + "step": 3215 + }, + { + "epoch": 1.17454811027935, + "grad_norm": 0.7395288348197937, + "learning_rate": 2.1003740197672946e-05, + "loss": 0.6255, + "step": 3216 + }, + { + "epoch": 1.1749132736899763, + "grad_norm": 0.896364152431488, + "learning_rate": 2.0989412846015504e-05, + "loss": 0.6047, + "step": 3217 + }, + { + "epoch": 1.1752784371006024, + "grad_norm": 0.9521870017051697, + "learning_rate": 2.0975084985344857e-05, + "loss": 0.6346, + "step": 3218 + }, + { + "epoch": 1.1756436005112287, + "grad_norm": 1.2138594388961792, + "learning_rate": 2.0960756623032114e-05, + "loss": 0.6334, + "step": 3219 + }, + { + "epoch": 1.176008763921855, + "grad_norm": 1.0154399871826172, + "learning_rate": 2.0946427766448642e-05, + "loss": 0.6374, + "step": 3220 + }, + { + "epoch": 1.1763739273324814, + "grad_norm": 0.9649248719215393, + "learning_rate": 2.093209842296606e-05, + "loss": 0.5993, + "step": 3221 + }, + { + "epoch": 1.1767390907431075, + "grad_norm": 0.915959358215332, + "learning_rate": 2.0917768599956236e-05, + "loss": 0.6214, + "step": 3222 + }, + { + "epoch": 1.1771042541537338, + "grad_norm": 0.8261663317680359, + "learning_rate": 2.090343830479131e-05, + "loss": 0.6181, + "step": 3223 + }, + { + "epoch": 1.17746941756436, + "grad_norm": 0.8212270736694336, + "learning_rate": 2.0889107544843615e-05, + "loss": 0.6365, + "step": 3224 + }, + { + "epoch": 1.1778345809749864, + "grad_norm": 0.7964750528335571, + "learning_rate": 2.0874776327485777e-05, + "loss": 0.6329, + "step": 3225 + }, + { + "epoch": 1.1781997443856125, + "grad_norm": 0.8948934674263, + "learning_rate": 2.0860444660090612e-05, + "loss": 0.6213, + "step": 3226 + }, + { + "epoch": 1.1785649077962388, + "grad_norm": 0.9234115481376648, + "learning_rate": 2.0846112550031198e-05, + "loss": 0.6049, + "step": 3227 + }, + { + "epoch": 1.178930071206865, + "grad_norm": 1.0438138246536255, + "learning_rate": 2.0831780004680834e-05, + "loss": 0.6008, + "step": 3228 + }, + { + "epoch": 1.1792952346174914, + "grad_norm": 0.8235229253768921, + "learning_rate": 2.081744703141303e-05, + "loss": 0.6046, + "step": 3229 + }, + { + "epoch": 1.1796603980281175, + "grad_norm": 1.143739938735962, + "learning_rate": 2.0803113637601543e-05, + "loss": 0.6107, + "step": 3230 + }, + { + "epoch": 1.1800255614387438, + "grad_norm": 0.7664543390274048, + "learning_rate": 2.07887798306203e-05, + "loss": 0.6403, + "step": 3231 + }, + { + "epoch": 1.1803907248493701, + "grad_norm": 1.0188530683517456, + "learning_rate": 2.0774445617843493e-05, + "loss": 0.6088, + "step": 3232 + }, + { + "epoch": 1.1807558882599964, + "grad_norm": 0.8213171362876892, + "learning_rate": 2.076011100664549e-05, + "loss": 0.6318, + "step": 3233 + }, + { + "epoch": 1.1811210516706225, + "grad_norm": 0.9105982184410095, + "learning_rate": 2.0745776004400876e-05, + "loss": 0.5911, + "step": 3234 + }, + { + "epoch": 1.1814862150812488, + "grad_norm": 0.8999936580657959, + "learning_rate": 2.0731440618484436e-05, + "loss": 0.6133, + "step": 3235 + }, + { + "epoch": 1.1818513784918752, + "grad_norm": 0.6579003930091858, + "learning_rate": 2.0717104856271152e-05, + "loss": 0.6404, + "step": 3236 + }, + { + "epoch": 1.1822165419025015, + "grad_norm": 0.705596923828125, + "learning_rate": 2.0702768725136192e-05, + "loss": 0.6221, + "step": 3237 + }, + { + "epoch": 1.1825817053131276, + "grad_norm": 0.9739445447921753, + "learning_rate": 2.068843223245492e-05, + "loss": 0.5918, + "step": 3238 + }, + { + "epoch": 1.1829468687237539, + "grad_norm": 1.2284148931503296, + "learning_rate": 2.0674095385602885e-05, + "loss": 0.6329, + "step": 3239 + }, + { + "epoch": 1.1833120321343802, + "grad_norm": 1.154626488685608, + "learning_rate": 2.0659758191955833e-05, + "loss": 0.6033, + "step": 3240 + }, + { + "epoch": 1.1836771955450063, + "grad_norm": 0.774357795715332, + "learning_rate": 2.0645420658889662e-05, + "loss": 0.626, + "step": 3241 + }, + { + "epoch": 1.1840423589556326, + "grad_norm": 0.7407965064048767, + "learning_rate": 2.0631082793780464e-05, + "loss": 0.6146, + "step": 3242 + }, + { + "epoch": 1.184407522366259, + "grad_norm": 1.034700870513916, + "learning_rate": 2.0616744604004496e-05, + "loss": 0.6119, + "step": 3243 + }, + { + "epoch": 1.1847726857768852, + "grad_norm": 0.7460023164749146, + "learning_rate": 2.0602406096938168e-05, + "loss": 0.6153, + "step": 3244 + }, + { + "epoch": 1.1851378491875115, + "grad_norm": 0.7647458910942078, + "learning_rate": 2.058806727995808e-05, + "loss": 0.6318, + "step": 3245 + }, + { + "epoch": 1.1855030125981376, + "grad_norm": 1.2149773836135864, + "learning_rate": 2.0573728160440972e-05, + "loss": 0.5822, + "step": 3246 + }, + { + "epoch": 1.185868176008764, + "grad_norm": 0.7748311161994934, + "learning_rate": 2.0559388745763754e-05, + "loss": 0.5909, + "step": 3247 + }, + { + "epoch": 1.1862333394193902, + "grad_norm": 0.9910469055175781, + "learning_rate": 2.0545049043303463e-05, + "loss": 0.5994, + "step": 3248 + }, + { + "epoch": 1.1865985028300163, + "grad_norm": 0.857268750667572, + "learning_rate": 2.0530709060437323e-05, + "loss": 0.6029, + "step": 3249 + }, + { + "epoch": 1.1869636662406426, + "grad_norm": 1.1047351360321045, + "learning_rate": 2.0516368804542662e-05, + "loss": 0.6266, + "step": 3250 + }, + { + "epoch": 1.187328829651269, + "grad_norm": 0.9308289885520935, + "learning_rate": 2.050202828299697e-05, + "loss": 0.6114, + "step": 3251 + }, + { + "epoch": 1.1876939930618953, + "grad_norm": 0.881679356098175, + "learning_rate": 2.0487687503177874e-05, + "loss": 0.5991, + "step": 3252 + }, + { + "epoch": 1.1880591564725214, + "grad_norm": 0.7082407474517822, + "learning_rate": 2.0473346472463125e-05, + "loss": 0.6062, + "step": 3253 + }, + { + "epoch": 1.1884243198831477, + "grad_norm": 1.4497092962265015, + "learning_rate": 2.045900519823061e-05, + "loss": 0.6298, + "step": 3254 + }, + { + "epoch": 1.188789483293774, + "grad_norm": 1.1088813543319702, + "learning_rate": 2.044466368785834e-05, + "loss": 0.6503, + "step": 3255 + }, + { + "epoch": 1.1891546467044003, + "grad_norm": 0.7719933986663818, + "learning_rate": 2.0430321948724447e-05, + "loss": 0.6332, + "step": 3256 + }, + { + "epoch": 1.1895198101150264, + "grad_norm": 0.6202806830406189, + "learning_rate": 2.041597998820718e-05, + "loss": 0.6081, + "step": 3257 + }, + { + "epoch": 1.1898849735256527, + "grad_norm": 0.73605877161026, + "learning_rate": 2.0401637813684897e-05, + "loss": 0.5995, + "step": 3258 + }, + { + "epoch": 1.190250136936279, + "grad_norm": 0.9471248984336853, + "learning_rate": 2.038729543253608e-05, + "loss": 0.6298, + "step": 3259 + }, + { + "epoch": 1.1906153003469053, + "grad_norm": 1.0479601621627808, + "learning_rate": 2.0372952852139297e-05, + "loss": 0.6345, + "step": 3260 + }, + { + "epoch": 1.1909804637575314, + "grad_norm": 0.7535351514816284, + "learning_rate": 2.0358610079873248e-05, + "loss": 0.64, + "step": 3261 + }, + { + "epoch": 1.1913456271681577, + "grad_norm": 1.083878755569458, + "learning_rate": 2.0344267123116697e-05, + "loss": 0.6246, + "step": 3262 + }, + { + "epoch": 1.191710790578784, + "grad_norm": 1.0116360187530518, + "learning_rate": 2.0329923989248525e-05, + "loss": 0.6339, + "step": 3263 + }, + { + "epoch": 1.1920759539894104, + "grad_norm": 0.9353020787239075, + "learning_rate": 2.0315580685647703e-05, + "loss": 0.6057, + "step": 3264 + }, + { + "epoch": 1.1924411174000364, + "grad_norm": 0.7070524096488953, + "learning_rate": 2.0301237219693278e-05, + "loss": 0.6249, + "step": 3265 + }, + { + "epoch": 1.1928062808106628, + "grad_norm": 0.8464820384979248, + "learning_rate": 2.0286893598764393e-05, + "loss": 0.5947, + "step": 3266 + }, + { + "epoch": 1.193171444221289, + "grad_norm": 0.9669051170349121, + "learning_rate": 2.0272549830240265e-05, + "loss": 0.588, + "step": 3267 + }, + { + "epoch": 1.1935366076319154, + "grad_norm": 0.8157206773757935, + "learning_rate": 2.0258205921500183e-05, + "loss": 0.5887, + "step": 3268 + }, + { + "epoch": 1.1939017710425415, + "grad_norm": 1.0829534530639648, + "learning_rate": 2.024386187992352e-05, + "loss": 0.5842, + "step": 3269 + }, + { + "epoch": 1.1942669344531678, + "grad_norm": 0.7082162499427795, + "learning_rate": 2.02295177128897e-05, + "loss": 0.6026, + "step": 3270 + }, + { + "epoch": 1.194632097863794, + "grad_norm": 0.8928137421607971, + "learning_rate": 2.0215173427778234e-05, + "loss": 0.6482, + "step": 3271 + }, + { + "epoch": 1.1949972612744202, + "grad_norm": 0.8363444209098816, + "learning_rate": 2.0200829031968667e-05, + "loss": 0.5997, + "step": 3272 + }, + { + "epoch": 1.1953624246850465, + "grad_norm": 0.9447734355926514, + "learning_rate": 2.018648453284062e-05, + "loss": 0.6057, + "step": 3273 + }, + { + "epoch": 1.1957275880956728, + "grad_norm": 1.049188494682312, + "learning_rate": 2.017213993777377e-05, + "loss": 0.6091, + "step": 3274 + }, + { + "epoch": 1.1960927515062991, + "grad_norm": 1.544588565826416, + "learning_rate": 2.0157795254147826e-05, + "loss": 0.6389, + "step": 3275 + }, + { + "epoch": 1.1964579149169254, + "grad_norm": 0.8541837930679321, + "learning_rate": 2.0143450489342563e-05, + "loss": 0.622, + "step": 3276 + }, + { + "epoch": 1.1968230783275515, + "grad_norm": 0.9997867941856384, + "learning_rate": 2.012910565073777e-05, + "loss": 0.5931, + "step": 3277 + }, + { + "epoch": 1.1971882417381778, + "grad_norm": 0.8798093795776367, + "learning_rate": 2.0114760745713305e-05, + "loss": 0.6131, + "step": 3278 + }, + { + "epoch": 1.1975534051488042, + "grad_norm": 0.7250221371650696, + "learning_rate": 2.010041578164904e-05, + "loss": 0.6046, + "step": 3279 + }, + { + "epoch": 1.1979185685594302, + "grad_norm": 0.967989981174469, + "learning_rate": 2.0086070765924886e-05, + "loss": 0.6251, + "step": 3280 + }, + { + "epoch": 1.1982837319700566, + "grad_norm": 1.0515309572219849, + "learning_rate": 2.0071725705920776e-05, + "loss": 0.5974, + "step": 3281 + }, + { + "epoch": 1.1986488953806829, + "grad_norm": 2.368988037109375, + "learning_rate": 2.0057380609016666e-05, + "loss": 0.6284, + "step": 3282 + }, + { + "epoch": 1.1990140587913092, + "grad_norm": 1.0001410245895386, + "learning_rate": 2.0043035482592543e-05, + "loss": 0.582, + "step": 3283 + }, + { + "epoch": 1.1993792222019355, + "grad_norm": 1.3888506889343262, + "learning_rate": 2.0028690334028384e-05, + "loss": 0.636, + "step": 3284 + }, + { + "epoch": 1.1997443856125616, + "grad_norm": 0.8603945970535278, + "learning_rate": 2.0014345170704202e-05, + "loss": 0.6385, + "step": 3285 + }, + { + "epoch": 1.200109549023188, + "grad_norm": 2.1296420097351074, + "learning_rate": 2e-05, + "loss": 0.6085, + "step": 3286 + }, + { + "epoch": 1.2004747124338142, + "grad_norm": 1.1616238355636597, + "learning_rate": 1.99856548292958e-05, + "loss": 0.6325, + "step": 3287 + }, + { + "epoch": 1.2008398758444403, + "grad_norm": 0.7582462430000305, + "learning_rate": 1.9971309665971623e-05, + "loss": 0.6052, + "step": 3288 + }, + { + "epoch": 1.2012050392550666, + "grad_norm": 0.8241375088691711, + "learning_rate": 1.995696451740746e-05, + "loss": 0.6307, + "step": 3289 + }, + { + "epoch": 1.201570202665693, + "grad_norm": 1.116804599761963, + "learning_rate": 1.9942619390983334e-05, + "loss": 0.5859, + "step": 3290 + }, + { + "epoch": 1.2019353660763192, + "grad_norm": 0.7887852191925049, + "learning_rate": 1.9928274294079227e-05, + "loss": 0.612, + "step": 3291 + }, + { + "epoch": 1.2023005294869453, + "grad_norm": 0.7873060703277588, + "learning_rate": 1.9913929234075117e-05, + "loss": 0.598, + "step": 3292 + }, + { + "epoch": 1.2026656928975716, + "grad_norm": 0.9905545711517334, + "learning_rate": 1.989958421835097e-05, + "loss": 0.6238, + "step": 3293 + }, + { + "epoch": 1.203030856308198, + "grad_norm": 0.8429453372955322, + "learning_rate": 1.9885239254286705e-05, + "loss": 0.642, + "step": 3294 + }, + { + "epoch": 1.2033960197188243, + "grad_norm": 1.3780677318572998, + "learning_rate": 1.987089434926224e-05, + "loss": 0.5864, + "step": 3295 + }, + { + "epoch": 1.2037611831294504, + "grad_norm": 1.0858463048934937, + "learning_rate": 1.9856549510657447e-05, + "loss": 0.6095, + "step": 3296 + }, + { + "epoch": 1.2041263465400767, + "grad_norm": 1.0253275632858276, + "learning_rate": 1.984220474585218e-05, + "loss": 0.6207, + "step": 3297 + }, + { + "epoch": 1.204491509950703, + "grad_norm": 0.932194173336029, + "learning_rate": 1.9827860062226236e-05, + "loss": 0.5787, + "step": 3298 + }, + { + "epoch": 1.2048566733613293, + "grad_norm": 0.9060006737709045, + "learning_rate": 1.9813515467159382e-05, + "loss": 0.619, + "step": 3299 + }, + { + "epoch": 1.2052218367719554, + "grad_norm": 0.8561171889305115, + "learning_rate": 1.979917096803134e-05, + "loss": 0.61, + "step": 3300 + }, + { + "epoch": 1.2055870001825817, + "grad_norm": 0.7535567283630371, + "learning_rate": 1.9784826572221773e-05, + "loss": 0.6067, + "step": 3301 + }, + { + "epoch": 1.205952163593208, + "grad_norm": 1.2940592765808105, + "learning_rate": 1.9770482287110305e-05, + "loss": 0.6376, + "step": 3302 + }, + { + "epoch": 1.206317327003834, + "grad_norm": 0.8710494637489319, + "learning_rate": 1.9756138120076484e-05, + "loss": 0.5845, + "step": 3303 + }, + { + "epoch": 1.2066824904144604, + "grad_norm": 0.9782663583755493, + "learning_rate": 1.974179407849982e-05, + "loss": 0.5915, + "step": 3304 + }, + { + "epoch": 1.2070476538250867, + "grad_norm": 0.6249943375587463, + "learning_rate": 1.9727450169759738e-05, + "loss": 0.6279, + "step": 3305 + }, + { + "epoch": 1.207412817235713, + "grad_norm": 1.0076680183410645, + "learning_rate": 1.971310640123561e-05, + "loss": 0.6116, + "step": 3306 + }, + { + "epoch": 1.2077779806463393, + "grad_norm": 1.613810658454895, + "learning_rate": 1.9698762780306732e-05, + "loss": 0.6313, + "step": 3307 + }, + { + "epoch": 1.2081431440569654, + "grad_norm": 0.8273488879203796, + "learning_rate": 1.9684419314352307e-05, + "loss": 0.6171, + "step": 3308 + }, + { + "epoch": 1.2085083074675917, + "grad_norm": 0.8147680759429932, + "learning_rate": 1.9670076010751478e-05, + "loss": 0.6255, + "step": 3309 + }, + { + "epoch": 1.208873470878218, + "grad_norm": 1.2720317840576172, + "learning_rate": 1.965573287688331e-05, + "loss": 0.6363, + "step": 3310 + }, + { + "epoch": 1.2092386342888441, + "grad_norm": 0.7890989184379578, + "learning_rate": 1.964138992012676e-05, + "loss": 0.5869, + "step": 3311 + }, + { + "epoch": 1.2096037976994705, + "grad_norm": 0.931955873966217, + "learning_rate": 1.9627047147860706e-05, + "loss": 0.6323, + "step": 3312 + }, + { + "epoch": 1.2099689611100968, + "grad_norm": 0.6453506946563721, + "learning_rate": 1.9612704567463926e-05, + "loss": 0.639, + "step": 3313 + }, + { + "epoch": 1.210334124520723, + "grad_norm": 0.8512594103813171, + "learning_rate": 1.959836218631511e-05, + "loss": 0.6142, + "step": 3314 + }, + { + "epoch": 1.2106992879313494, + "grad_norm": 1.2556297779083252, + "learning_rate": 1.9584020011792825e-05, + "loss": 0.6037, + "step": 3315 + }, + { + "epoch": 1.2110644513419755, + "grad_norm": 0.8126077651977539, + "learning_rate": 1.956967805127556e-05, + "loss": 0.6229, + "step": 3316 + }, + { + "epoch": 1.2114296147526018, + "grad_norm": 0.7425557374954224, + "learning_rate": 1.955533631214166e-05, + "loss": 0.5986, + "step": 3317 + }, + { + "epoch": 1.2117947781632281, + "grad_norm": 0.9697204828262329, + "learning_rate": 1.9540994801769392e-05, + "loss": 0.5703, + "step": 3318 + }, + { + "epoch": 1.2121599415738542, + "grad_norm": 0.7134398221969604, + "learning_rate": 1.952665352753688e-05, + "loss": 0.6407, + "step": 3319 + }, + { + "epoch": 1.2125251049844805, + "grad_norm": 1.0264909267425537, + "learning_rate": 1.9512312496822136e-05, + "loss": 0.6099, + "step": 3320 + }, + { + "epoch": 1.2128902683951068, + "grad_norm": 0.814012348651886, + "learning_rate": 1.949797171700304e-05, + "loss": 0.6287, + "step": 3321 + }, + { + "epoch": 1.2132554318057331, + "grad_norm": 0.7063661813735962, + "learning_rate": 1.9483631195457348e-05, + "loss": 0.5926, + "step": 3322 + }, + { + "epoch": 1.2136205952163592, + "grad_norm": 0.959875762462616, + "learning_rate": 1.9469290939562684e-05, + "loss": 0.5814, + "step": 3323 + }, + { + "epoch": 1.2139857586269855, + "grad_norm": 1.2501739263534546, + "learning_rate": 1.945495095669654e-05, + "loss": 0.6395, + "step": 3324 + }, + { + "epoch": 1.2143509220376119, + "grad_norm": 1.0051590204238892, + "learning_rate": 1.9440611254236253e-05, + "loss": 0.6177, + "step": 3325 + }, + { + "epoch": 1.2147160854482382, + "grad_norm": 0.9452838897705078, + "learning_rate": 1.942627183955903e-05, + "loss": 0.5999, + "step": 3326 + }, + { + "epoch": 1.2150812488588643, + "grad_norm": 0.8062846660614014, + "learning_rate": 1.9411932720041926e-05, + "loss": 0.5997, + "step": 3327 + }, + { + "epoch": 1.2154464122694906, + "grad_norm": 1.2123805284500122, + "learning_rate": 1.939759390306184e-05, + "loss": 0.593, + "step": 3328 + }, + { + "epoch": 1.2158115756801169, + "grad_norm": 0.7379060983657837, + "learning_rate": 1.9383255395995514e-05, + "loss": 0.63, + "step": 3329 + }, + { + "epoch": 1.2161767390907432, + "grad_norm": 0.9123709797859192, + "learning_rate": 1.9368917206219536e-05, + "loss": 0.6281, + "step": 3330 + }, + { + "epoch": 1.2165419025013693, + "grad_norm": 0.8074901103973389, + "learning_rate": 1.935457934111034e-05, + "loss": 0.6359, + "step": 3331 + }, + { + "epoch": 1.2169070659119956, + "grad_norm": 0.7655048370361328, + "learning_rate": 1.9340241808044167e-05, + "loss": 0.6021, + "step": 3332 + }, + { + "epoch": 1.217272229322622, + "grad_norm": 0.9054430723190308, + "learning_rate": 1.932590461439712e-05, + "loss": 0.5834, + "step": 3333 + }, + { + "epoch": 1.2176373927332482, + "grad_norm": 0.7461997270584106, + "learning_rate": 1.931156776754509e-05, + "loss": 0.6249, + "step": 3334 + }, + { + "epoch": 1.2180025561438743, + "grad_norm": 0.7512338161468506, + "learning_rate": 1.9297231274863818e-05, + "loss": 0.6102, + "step": 3335 + }, + { + "epoch": 1.2183677195545006, + "grad_norm": 0.8803744912147522, + "learning_rate": 1.9282895143728858e-05, + "loss": 0.6065, + "step": 3336 + }, + { + "epoch": 1.218732882965127, + "grad_norm": 0.9500965476036072, + "learning_rate": 1.9268559381515567e-05, + "loss": 0.5828, + "step": 3337 + }, + { + "epoch": 1.2190980463757533, + "grad_norm": 1.378042221069336, + "learning_rate": 1.925422399559913e-05, + "loss": 0.6112, + "step": 3338 + }, + { + "epoch": 1.2194632097863793, + "grad_norm": 1.0525225400924683, + "learning_rate": 1.9239888993354513e-05, + "loss": 0.5964, + "step": 3339 + }, + { + "epoch": 1.2198283731970057, + "grad_norm": 1.1789579391479492, + "learning_rate": 1.9225554382156514e-05, + "loss": 0.6137, + "step": 3340 + }, + { + "epoch": 1.220193536607632, + "grad_norm": 1.065134048461914, + "learning_rate": 1.9211220169379706e-05, + "loss": 0.6132, + "step": 3341 + }, + { + "epoch": 1.220558700018258, + "grad_norm": 1.0772281885147095, + "learning_rate": 1.9196886362398467e-05, + "loss": 0.6047, + "step": 3342 + }, + { + "epoch": 1.2209238634288844, + "grad_norm": 0.8940218091011047, + "learning_rate": 1.9182552968586973e-05, + "loss": 0.5607, + "step": 3343 + }, + { + "epoch": 1.2212890268395107, + "grad_norm": 1.1024268865585327, + "learning_rate": 1.9168219995319166e-05, + "loss": 0.6625, + "step": 3344 + }, + { + "epoch": 1.221654190250137, + "grad_norm": 0.9297084808349609, + "learning_rate": 1.9153887449968802e-05, + "loss": 0.5984, + "step": 3345 + }, + { + "epoch": 1.2220193536607633, + "grad_norm": 0.9231723546981812, + "learning_rate": 1.9139555339909388e-05, + "loss": 0.6176, + "step": 3346 + }, + { + "epoch": 1.2223845170713894, + "grad_norm": 0.9609585404396057, + "learning_rate": 1.9125223672514233e-05, + "loss": 0.6263, + "step": 3347 + }, + { + "epoch": 1.2227496804820157, + "grad_norm": 0.7409241199493408, + "learning_rate": 1.9110892455156395e-05, + "loss": 0.6207, + "step": 3348 + }, + { + "epoch": 1.223114843892642, + "grad_norm": 0.7970425486564636, + "learning_rate": 1.90965616952087e-05, + "loss": 0.6109, + "step": 3349 + }, + { + "epoch": 1.2234800073032681, + "grad_norm": 0.944929301738739, + "learning_rate": 1.908223140004377e-05, + "loss": 0.6056, + "step": 3350 + }, + { + "epoch": 1.2238451707138944, + "grad_norm": 1.079415202140808, + "learning_rate": 1.906790157703395e-05, + "loss": 0.5833, + "step": 3351 + }, + { + "epoch": 1.2242103341245207, + "grad_norm": 0.7950957417488098, + "learning_rate": 1.9053572233551365e-05, + "loss": 0.6001, + "step": 3352 + }, + { + "epoch": 1.224575497535147, + "grad_norm": 1.1039421558380127, + "learning_rate": 1.9039243376967893e-05, + "loss": 0.6146, + "step": 3353 + }, + { + "epoch": 1.2249406609457731, + "grad_norm": 0.9724546670913696, + "learning_rate": 1.9024915014655146e-05, + "loss": 0.6157, + "step": 3354 + }, + { + "epoch": 1.2253058243563995, + "grad_norm": 0.9508349895477295, + "learning_rate": 1.9010587153984503e-05, + "loss": 0.6254, + "step": 3355 + }, + { + "epoch": 1.2256709877670258, + "grad_norm": 0.9141985774040222, + "learning_rate": 1.899625980232706e-05, + "loss": 0.5978, + "step": 3356 + }, + { + "epoch": 1.226036151177652, + "grad_norm": 1.1930100917816162, + "learning_rate": 1.8981932967053677e-05, + "loss": 0.6092, + "step": 3357 + }, + { + "epoch": 1.2264013145882782, + "grad_norm": 0.7728860974311829, + "learning_rate": 1.8967606655534926e-05, + "loss": 0.6281, + "step": 3358 + }, + { + "epoch": 1.2267664779989045, + "grad_norm": 0.7239397168159485, + "learning_rate": 1.8953280875141125e-05, + "loss": 0.6024, + "step": 3359 + }, + { + "epoch": 1.2271316414095308, + "grad_norm": 0.8932539224624634, + "learning_rate": 1.893895563324232e-05, + "loss": 0.591, + "step": 3360 + }, + { + "epoch": 1.227496804820157, + "grad_norm": 0.9221148490905762, + "learning_rate": 1.892463093720825e-05, + "loss": 0.6439, + "step": 3361 + }, + { + "epoch": 1.2278619682307832, + "grad_norm": 0.7435846328735352, + "learning_rate": 1.891030679440842e-05, + "loss": 0.6223, + "step": 3362 + }, + { + "epoch": 1.2282271316414095, + "grad_norm": 1.098710298538208, + "learning_rate": 1.889598321221201e-05, + "loss": 0.6279, + "step": 3363 + }, + { + "epoch": 1.2285922950520358, + "grad_norm": 1.0755566358566284, + "learning_rate": 1.8881660197987937e-05, + "loss": 0.5842, + "step": 3364 + }, + { + "epoch": 1.2289574584626621, + "grad_norm": 1.1215485334396362, + "learning_rate": 1.886733775910482e-05, + "loss": 0.6005, + "step": 3365 + }, + { + "epoch": 1.2293226218732882, + "grad_norm": 0.8552205562591553, + "learning_rate": 1.8853015902930974e-05, + "loss": 0.6146, + "step": 3366 + }, + { + "epoch": 1.2296877852839145, + "grad_norm": 0.6829099655151367, + "learning_rate": 1.8838694636834423e-05, + "loss": 0.6265, + "step": 3367 + }, + { + "epoch": 1.2300529486945408, + "grad_norm": 1.0725380182266235, + "learning_rate": 1.8824373968182875e-05, + "loss": 0.5992, + "step": 3368 + }, + { + "epoch": 1.2304181121051672, + "grad_norm": 1.746687650680542, + "learning_rate": 1.881005390434375e-05, + "loss": 0.6139, + "step": 3369 + }, + { + "epoch": 1.2307832755157933, + "grad_norm": 1.0292305946350098, + "learning_rate": 1.879573445268414e-05, + "loss": 0.6255, + "step": 3370 + }, + { + "epoch": 1.2311484389264196, + "grad_norm": 1.2852039337158203, + "learning_rate": 1.8781415620570832e-05, + "loss": 0.6349, + "step": 3371 + }, + { + "epoch": 1.2315136023370459, + "grad_norm": 1.2481268644332886, + "learning_rate": 1.8767097415370287e-05, + "loss": 0.605, + "step": 3372 + }, + { + "epoch": 1.231878765747672, + "grad_norm": 0.859007716178894, + "learning_rate": 1.8752779844448653e-05, + "loss": 0.6245, + "step": 3373 + }, + { + "epoch": 1.2322439291582983, + "grad_norm": 1.0601139068603516, + "learning_rate": 1.8738462915171736e-05, + "loss": 0.6321, + "step": 3374 + }, + { + "epoch": 1.2326090925689246, + "grad_norm": 1.0997142791748047, + "learning_rate": 1.8724146634905026e-05, + "loss": 0.5712, + "step": 3375 + }, + { + "epoch": 1.232974255979551, + "grad_norm": 0.720138430595398, + "learning_rate": 1.8709831011013678e-05, + "loss": 0.5951, + "step": 3376 + }, + { + "epoch": 1.2333394193901772, + "grad_norm": 1.0345478057861328, + "learning_rate": 1.8695516050862504e-05, + "loss": 0.5701, + "step": 3377 + }, + { + "epoch": 1.2337045828008033, + "grad_norm": 1.1205919981002808, + "learning_rate": 1.8681201761815974e-05, + "loss": 0.6008, + "step": 3378 + }, + { + "epoch": 1.2340697462114296, + "grad_norm": 1.0405442714691162, + "learning_rate": 1.8666888151238217e-05, + "loss": 0.6039, + "step": 3379 + }, + { + "epoch": 1.234434909622056, + "grad_norm": 0.8944239020347595, + "learning_rate": 1.8652575226493012e-05, + "loss": 0.6328, + "step": 3380 + }, + { + "epoch": 1.234800073032682, + "grad_norm": 0.9236727952957153, + "learning_rate": 1.863826299494379e-05, + "loss": 0.5893, + "step": 3381 + }, + { + "epoch": 1.2351652364433083, + "grad_norm": 0.7447712421417236, + "learning_rate": 1.8623951463953605e-05, + "loss": 0.6182, + "step": 3382 + }, + { + "epoch": 1.2355303998539346, + "grad_norm": 0.8919333219528198, + "learning_rate": 1.8609640640885177e-05, + "loss": 0.6091, + "step": 3383 + }, + { + "epoch": 1.235895563264561, + "grad_norm": 0.9320427179336548, + "learning_rate": 1.859533053310085e-05, + "loss": 0.6072, + "step": 3384 + }, + { + "epoch": 1.236260726675187, + "grad_norm": 0.8821183443069458, + "learning_rate": 1.8581021147962593e-05, + "loss": 0.619, + "step": 3385 + }, + { + "epoch": 1.2366258900858134, + "grad_norm": 1.4499365091323853, + "learning_rate": 1.856671249283202e-05, + "loss": 0.6027, + "step": 3386 + }, + { + "epoch": 1.2369910534964397, + "grad_norm": 1.0097719430923462, + "learning_rate": 1.8552404575070363e-05, + "loss": 0.6203, + "step": 3387 + }, + { + "epoch": 1.237356216907066, + "grad_norm": 0.7847924828529358, + "learning_rate": 1.8538097402038452e-05, + "loss": 0.6049, + "step": 3388 + }, + { + "epoch": 1.237721380317692, + "grad_norm": 3.8774895668029785, + "learning_rate": 1.852379098109677e-05, + "loss": 0.5725, + "step": 3389 + }, + { + "epoch": 1.2380865437283184, + "grad_norm": 0.8568375110626221, + "learning_rate": 1.850948531960539e-05, + "loss": 0.6049, + "step": 3390 + }, + { + "epoch": 1.2384517071389447, + "grad_norm": 0.9716466665267944, + "learning_rate": 1.8495180424924003e-05, + "loss": 0.594, + "step": 3391 + }, + { + "epoch": 1.238816870549571, + "grad_norm": 1.1959747076034546, + "learning_rate": 1.84808763044119e-05, + "loss": 0.5873, + "step": 3392 + }, + { + "epoch": 1.239182033960197, + "grad_norm": 0.9050604104995728, + "learning_rate": 1.8466572965427984e-05, + "loss": 0.5921, + "step": 3393 + }, + { + "epoch": 1.2395471973708234, + "grad_norm": 0.9194764494895935, + "learning_rate": 1.845227041533074e-05, + "loss": 0.587, + "step": 3394 + }, + { + "epoch": 1.2399123607814497, + "grad_norm": 0.8811522126197815, + "learning_rate": 1.8437968661478262e-05, + "loss": 0.6271, + "step": 3395 + }, + { + "epoch": 1.240277524192076, + "grad_norm": 1.0084917545318604, + "learning_rate": 1.842366771122823e-05, + "loss": 0.5811, + "step": 3396 + }, + { + "epoch": 1.2406426876027021, + "grad_norm": 0.7907148003578186, + "learning_rate": 1.8409367571937903e-05, + "loss": 0.6132, + "step": 3397 + }, + { + "epoch": 1.2410078510133284, + "grad_norm": 0.6632513403892517, + "learning_rate": 1.8395068250964138e-05, + "loss": 0.6138, + "step": 3398 + }, + { + "epoch": 1.2413730144239548, + "grad_norm": 0.6575708985328674, + "learning_rate": 1.8380769755663348e-05, + "loss": 0.6143, + "step": 3399 + }, + { + "epoch": 1.241738177834581, + "grad_norm": 0.6493009328842163, + "learning_rate": 1.8366472093391553e-05, + "loss": 0.6385, + "step": 3400 + }, + { + "epoch": 1.2421033412452072, + "grad_norm": 0.8245896697044373, + "learning_rate": 1.8352175271504314e-05, + "loss": 0.5871, + "step": 3401 + }, + { + "epoch": 1.2424685046558335, + "grad_norm": 1.0025814771652222, + "learning_rate": 1.833787929735677e-05, + "loss": 0.597, + "step": 3402 + }, + { + "epoch": 1.2428336680664598, + "grad_norm": 0.953640341758728, + "learning_rate": 1.8323584178303632e-05, + "loss": 0.5999, + "step": 3403 + }, + { + "epoch": 1.2431988314770859, + "grad_norm": 0.8461502194404602, + "learning_rate": 1.8309289921699163e-05, + "loss": 0.5781, + "step": 3404 + }, + { + "epoch": 1.2435639948877122, + "grad_norm": 0.5869039297103882, + "learning_rate": 1.8294996534897185e-05, + "loss": 0.6208, + "step": 3405 + }, + { + "epoch": 1.2439291582983385, + "grad_norm": 1.093299388885498, + "learning_rate": 1.8280704025251076e-05, + "loss": 0.6124, + "step": 3406 + }, + { + "epoch": 1.2442943217089648, + "grad_norm": 1.0694578886032104, + "learning_rate": 1.8266412400113747e-05, + "loss": 0.5899, + "step": 3407 + }, + { + "epoch": 1.2446594851195911, + "grad_norm": 1.018556833267212, + "learning_rate": 1.825212166683768e-05, + "loss": 0.5649, + "step": 3408 + }, + { + "epoch": 1.2450246485302172, + "grad_norm": 1.0864132642745972, + "learning_rate": 1.8237831832774877e-05, + "loss": 0.5956, + "step": 3409 + }, + { + "epoch": 1.2453898119408435, + "grad_norm": 1.1059077978134155, + "learning_rate": 1.8223542905276885e-05, + "loss": 0.5708, + "step": 3410 + }, + { + "epoch": 1.2457549753514698, + "grad_norm": 1.5342904329299927, + "learning_rate": 1.820925489169478e-05, + "loss": 0.5978, + "step": 3411 + }, + { + "epoch": 1.246120138762096, + "grad_norm": 0.9663258790969849, + "learning_rate": 1.819496779937918e-05, + "loss": 0.5566, + "step": 3412 + }, + { + "epoch": 1.2464853021727222, + "grad_norm": 0.8954864740371704, + "learning_rate": 1.818068163568022e-05, + "loss": 0.6285, + "step": 3413 + }, + { + "epoch": 1.2468504655833486, + "grad_norm": 1.2142242193222046, + "learning_rate": 1.8166396407947546e-05, + "loss": 0.5819, + "step": 3414 + }, + { + "epoch": 1.2472156289939749, + "grad_norm": 0.9968467354774475, + "learning_rate": 1.8152112123530345e-05, + "loss": 0.6378, + "step": 3415 + }, + { + "epoch": 1.2475807924046012, + "grad_norm": 0.8761385679244995, + "learning_rate": 1.8137828789777302e-05, + "loss": 0.616, + "step": 3416 + }, + { + "epoch": 1.2479459558152273, + "grad_norm": 0.755793035030365, + "learning_rate": 1.8123546414036623e-05, + "loss": 0.5758, + "step": 3417 + }, + { + "epoch": 1.2483111192258536, + "grad_norm": 1.3821804523468018, + "learning_rate": 1.810926500365602e-05, + "loss": 0.5734, + "step": 3418 + }, + { + "epoch": 1.24867628263648, + "grad_norm": 0.9680818915367126, + "learning_rate": 1.8094984565982697e-05, + "loss": 0.6304, + "step": 3419 + }, + { + "epoch": 1.249041446047106, + "grad_norm": 0.9872563481330872, + "learning_rate": 1.8080705108363376e-05, + "loss": 0.595, + "step": 3420 + }, + { + "epoch": 1.2494066094577323, + "grad_norm": 0.8364111185073853, + "learning_rate": 1.8066426638144253e-05, + "loss": 0.5807, + "step": 3421 + }, + { + "epoch": 1.2497717728683586, + "grad_norm": 1.2990816831588745, + "learning_rate": 1.8052149162671045e-05, + "loss": 0.6285, + "step": 3422 + }, + { + "epoch": 1.250136936278985, + "grad_norm": 0.6941619515419006, + "learning_rate": 1.8037872689288923e-05, + "loss": 0.6104, + "step": 3423 + }, + { + "epoch": 1.2505020996896112, + "grad_norm": 1.0219366550445557, + "learning_rate": 1.802359722534257e-05, + "loss": 0.629, + "step": 3424 + }, + { + "epoch": 1.2508672631002373, + "grad_norm": 0.88230961561203, + "learning_rate": 1.800932277817614e-05, + "loss": 0.6431, + "step": 3425 + }, + { + "epoch": 1.2512324265108636, + "grad_norm": 0.7840175032615662, + "learning_rate": 1.7995049355133254e-05, + "loss": 0.6221, + "step": 3426 + }, + { + "epoch": 1.2515975899214897, + "grad_norm": 1.0608835220336914, + "learning_rate": 1.798077696355703e-05, + "loss": 0.6035, + "step": 3427 + }, + { + "epoch": 1.251962753332116, + "grad_norm": 0.9382961988449097, + "learning_rate": 1.7966505610790022e-05, + "loss": 0.5972, + "step": 3428 + }, + { + "epoch": 1.2523279167427424, + "grad_norm": 0.9159266352653503, + "learning_rate": 1.795223530417428e-05, + "loss": 0.5949, + "step": 3429 + }, + { + "epoch": 1.2526930801533687, + "grad_norm": 0.7968629002571106, + "learning_rate": 1.7937966051051306e-05, + "loss": 0.6394, + "step": 3430 + }, + { + "epoch": 1.253058243563995, + "grad_norm": 0.9000434875488281, + "learning_rate": 1.7923697858762054e-05, + "loss": 0.6027, + "step": 3431 + }, + { + "epoch": 1.253423406974621, + "grad_norm": 1.0707937479019165, + "learning_rate": 1.7909430734646936e-05, + "loss": 0.5795, + "step": 3432 + }, + { + "epoch": 1.2537885703852474, + "grad_norm": 0.9548970460891724, + "learning_rate": 1.789516468604581e-05, + "loss": 0.5969, + "step": 3433 + }, + { + "epoch": 1.2541537337958737, + "grad_norm": 0.6696003079414368, + "learning_rate": 1.7880899720297998e-05, + "loss": 0.6089, + "step": 3434 + }, + { + "epoch": 1.2545188972064998, + "grad_norm": 0.9154466390609741, + "learning_rate": 1.7866635844742243e-05, + "loss": 0.6233, + "step": 3435 + }, + { + "epoch": 1.254884060617126, + "grad_norm": 0.8010258674621582, + "learning_rate": 1.785237306671674e-05, + "loss": 0.6129, + "step": 3436 + }, + { + "epoch": 1.2552492240277524, + "grad_norm": 0.9692965149879456, + "learning_rate": 1.7838111393559115e-05, + "loss": 0.577, + "step": 3437 + }, + { + "epoch": 1.2556143874383787, + "grad_norm": 0.7379612326622009, + "learning_rate": 1.7823850832606425e-05, + "loss": 0.6067, + "step": 3438 + }, + { + "epoch": 1.255979550849005, + "grad_norm": 0.8523130416870117, + "learning_rate": 1.7809591391195162e-05, + "loss": 0.5649, + "step": 3439 + }, + { + "epoch": 1.2563447142596311, + "grad_norm": 0.9611847996711731, + "learning_rate": 1.7795333076661238e-05, + "loss": 0.5861, + "step": 3440 + }, + { + "epoch": 1.2567098776702574, + "grad_norm": 0.828985869884491, + "learning_rate": 1.7781075896339968e-05, + "loss": 0.616, + "step": 3441 + }, + { + "epoch": 1.2570750410808837, + "grad_norm": 1.1675282716751099, + "learning_rate": 1.7766819857566116e-05, + "loss": 0.5979, + "step": 3442 + }, + { + "epoch": 1.2574402044915098, + "grad_norm": 1.1937335729599, + "learning_rate": 1.7752564967673832e-05, + "loss": 0.6245, + "step": 3443 + }, + { + "epoch": 1.2578053679021362, + "grad_norm": 0.7989965081214905, + "learning_rate": 1.7738311233996686e-05, + "loss": 0.6193, + "step": 3444 + }, + { + "epoch": 1.2581705313127625, + "grad_norm": 2.223703384399414, + "learning_rate": 1.7724058663867656e-05, + "loss": 0.6288, + "step": 3445 + }, + { + "epoch": 1.2585356947233888, + "grad_norm": 0.8401123881340027, + "learning_rate": 1.770980726461912e-05, + "loss": 0.618, + "step": 3446 + }, + { + "epoch": 1.258900858134015, + "grad_norm": 1.1787495613098145, + "learning_rate": 1.769555704358284e-05, + "loss": 0.5753, + "step": 3447 + }, + { + "epoch": 1.2592660215446412, + "grad_norm": 1.0285699367523193, + "learning_rate": 1.7681308008089993e-05, + "loss": 0.5934, + "step": 3448 + }, + { + "epoch": 1.2596311849552675, + "grad_norm": 0.884979248046875, + "learning_rate": 1.766706016547113e-05, + "loss": 0.6229, + "step": 3449 + }, + { + "epoch": 1.2599963483658938, + "grad_norm": 0.7082614898681641, + "learning_rate": 1.7652813523056195e-05, + "loss": 0.6307, + "step": 3450 + }, + { + "epoch": 1.26036151177652, + "grad_norm": 1.0871187448501587, + "learning_rate": 1.763856808817452e-05, + "loss": 0.5687, + "step": 3451 + }, + { + "epoch": 1.2607266751871462, + "grad_norm": 1.1111515760421753, + "learning_rate": 1.7624323868154804e-05, + "loss": 0.5856, + "step": 3452 + }, + { + "epoch": 1.2610918385977725, + "grad_norm": 1.0516471862792969, + "learning_rate": 1.7610080870325135e-05, + "loss": 0.5662, + "step": 3453 + }, + { + "epoch": 1.2614570020083988, + "grad_norm": 0.9718304872512817, + "learning_rate": 1.7595839102012954e-05, + "loss": 0.6116, + "step": 3454 + }, + { + "epoch": 1.2618221654190251, + "grad_norm": 0.9986475706100464, + "learning_rate": 1.7581598570545075e-05, + "loss": 0.5891, + "step": 3455 + }, + { + "epoch": 1.2621873288296512, + "grad_norm": 0.9629244804382324, + "learning_rate": 1.756735928324769e-05, + "loss": 0.5981, + "step": 3456 + }, + { + "epoch": 1.2625524922402775, + "grad_norm": 1.0716477632522583, + "learning_rate": 1.7553121247446337e-05, + "loss": 0.5955, + "step": 3457 + }, + { + "epoch": 1.2629176556509039, + "grad_norm": 0.8771053552627563, + "learning_rate": 1.753888447046592e-05, + "loss": 0.579, + "step": 3458 + }, + { + "epoch": 1.26328281906153, + "grad_norm": 1.0961087942123413, + "learning_rate": 1.7524648959630676e-05, + "loss": 0.5978, + "step": 3459 + }, + { + "epoch": 1.2636479824721563, + "grad_norm": 2.062915325164795, + "learning_rate": 1.7510414722264217e-05, + "loss": 0.6278, + "step": 3460 + }, + { + "epoch": 1.2640131458827826, + "grad_norm": 0.8420235514640808, + "learning_rate": 1.7496181765689485e-05, + "loss": 0.5904, + "step": 3461 + }, + { + "epoch": 1.2643783092934089, + "grad_norm": 0.8386824727058411, + "learning_rate": 1.7481950097228757e-05, + "loss": 0.607, + "step": 3462 + }, + { + "epoch": 1.264743472704035, + "grad_norm": 0.8218091726303101, + "learning_rate": 1.7467719724203667e-05, + "loss": 0.5942, + "step": 3463 + }, + { + "epoch": 1.2651086361146613, + "grad_norm": 0.8152195811271667, + "learning_rate": 1.7453490653935162e-05, + "loss": 0.5744, + "step": 3464 + }, + { + "epoch": 1.2654737995252876, + "grad_norm": 0.853538990020752, + "learning_rate": 1.743926289374353e-05, + "loss": 0.6172, + "step": 3465 + }, + { + "epoch": 1.2658389629359137, + "grad_norm": 0.6543163657188416, + "learning_rate": 1.7425036450948383e-05, + "loss": 0.6091, + "step": 3466 + }, + { + "epoch": 1.26620412634654, + "grad_norm": 1.062475323677063, + "learning_rate": 1.7410811332868664e-05, + "loss": 0.586, + "step": 3467 + }, + { + "epoch": 1.2665692897571663, + "grad_norm": 0.7827355861663818, + "learning_rate": 1.739658754682261e-05, + "loss": 0.6071, + "step": 3468 + }, + { + "epoch": 1.2669344531677926, + "grad_norm": 0.6963713765144348, + "learning_rate": 1.738236510012779e-05, + "loss": 0.5623, + "step": 3469 + }, + { + "epoch": 1.267299616578419, + "grad_norm": 0.8926814794540405, + "learning_rate": 1.7368144000101093e-05, + "loss": 0.6138, + "step": 3470 + }, + { + "epoch": 1.267664779989045, + "grad_norm": 0.79139244556427, + "learning_rate": 1.7353924254058695e-05, + "loss": 0.6263, + "step": 3471 + }, + { + "epoch": 1.2680299433996713, + "grad_norm": 0.8635314702987671, + "learning_rate": 1.7339705869316083e-05, + "loss": 0.6343, + "step": 3472 + }, + { + "epoch": 1.2683951068102977, + "grad_norm": 0.8019347786903381, + "learning_rate": 1.732548885318806e-05, + "loss": 0.6206, + "step": 3473 + }, + { + "epoch": 1.2687602702209237, + "grad_norm": 0.8886422514915466, + "learning_rate": 1.7311273212988692e-05, + "loss": 0.6003, + "step": 3474 + }, + { + "epoch": 1.26912543363155, + "grad_norm": 1.609634280204773, + "learning_rate": 1.729705895603137e-05, + "loss": 0.5965, + "step": 3475 + }, + { + "epoch": 1.2694905970421764, + "grad_norm": 1.058031439781189, + "learning_rate": 1.728284608962875e-05, + "loss": 0.5999, + "step": 3476 + }, + { + "epoch": 1.2698557604528027, + "grad_norm": 0.8010134696960449, + "learning_rate": 1.7268634621092786e-05, + "loss": 0.5939, + "step": 3477 + }, + { + "epoch": 1.270220923863429, + "grad_norm": 0.746239185333252, + "learning_rate": 1.7254424557734703e-05, + "loss": 0.5873, + "step": 3478 + }, + { + "epoch": 1.270586087274055, + "grad_norm": 0.766264796257019, + "learning_rate": 1.7240215906865016e-05, + "loss": 0.6213, + "step": 3479 + }, + { + "epoch": 1.2709512506846814, + "grad_norm": 0.900159478187561, + "learning_rate": 1.722600867579351e-05, + "loss": 0.603, + "step": 3480 + }, + { + "epoch": 1.2713164140953077, + "grad_norm": 3.7984745502471924, + "learning_rate": 1.7211802871829216e-05, + "loss": 0.6069, + "step": 3481 + }, + { + "epoch": 1.2716815775059338, + "grad_norm": 0.9797561168670654, + "learning_rate": 1.719759850228046e-05, + "loss": 0.5898, + "step": 3482 + }, + { + "epoch": 1.2720467409165601, + "grad_norm": 0.7241623997688293, + "learning_rate": 1.7183395574454823e-05, + "loss": 0.5902, + "step": 3483 + }, + { + "epoch": 1.2724119043271864, + "grad_norm": 1.2979917526245117, + "learning_rate": 1.716919409565914e-05, + "loss": 0.5622, + "step": 3484 + }, + { + "epoch": 1.2727770677378127, + "grad_norm": 0.8148595690727234, + "learning_rate": 1.71549940731995e-05, + "loss": 0.592, + "step": 3485 + }, + { + "epoch": 1.273142231148439, + "grad_norm": 0.7827801704406738, + "learning_rate": 1.714079551438125e-05, + "loss": 0.6179, + "step": 3486 + }, + { + "epoch": 1.2735073945590651, + "grad_norm": 0.8408051133155823, + "learning_rate": 1.7126598426508974e-05, + "loss": 0.5924, + "step": 3487 + }, + { + "epoch": 1.2738725579696915, + "grad_norm": 0.909966766834259, + "learning_rate": 1.7112402816886504e-05, + "loss": 0.573, + "step": 3488 + }, + { + "epoch": 1.2742377213803178, + "grad_norm": 0.9594547748565674, + "learning_rate": 1.7098208692816915e-05, + "loss": 0.5775, + "step": 3489 + }, + { + "epoch": 1.2746028847909439, + "grad_norm": 0.6666066646575928, + "learning_rate": 1.708401606160251e-05, + "loss": 0.6097, + "step": 3490 + }, + { + "epoch": 1.2749680482015702, + "grad_norm": 0.9995865821838379, + "learning_rate": 1.706982493054483e-05, + "loss": 0.6027, + "step": 3491 + }, + { + "epoch": 1.2753332116121965, + "grad_norm": 1.255068063735962, + "learning_rate": 1.705563530694464e-05, + "loss": 0.5425, + "step": 3492 + }, + { + "epoch": 1.2756983750228228, + "grad_norm": 0.9269176721572876, + "learning_rate": 1.7041447198101946e-05, + "loss": 0.618, + "step": 3493 + }, + { + "epoch": 1.276063538433449, + "grad_norm": 0.8610042929649353, + "learning_rate": 1.7027260611315936e-05, + "loss": 0.6, + "step": 3494 + }, + { + "epoch": 1.2764287018440752, + "grad_norm": 0.9045096039772034, + "learning_rate": 1.7013075553885063e-05, + "loss": 0.5956, + "step": 3495 + }, + { + "epoch": 1.2767938652547015, + "grad_norm": 0.6672929525375366, + "learning_rate": 1.699889203310695e-05, + "loss": 0.5851, + "step": 3496 + }, + { + "epoch": 1.2771590286653276, + "grad_norm": 1.0661526918411255, + "learning_rate": 1.6984710056278462e-05, + "loss": 0.5876, + "step": 3497 + }, + { + "epoch": 1.277524192075954, + "grad_norm": 0.7957583665847778, + "learning_rate": 1.6970529630695656e-05, + "loss": 0.5843, + "step": 3498 + }, + { + "epoch": 1.2778893554865802, + "grad_norm": 0.9293861985206604, + "learning_rate": 1.695635076365379e-05, + "loss": 0.5713, + "step": 3499 + }, + { + "epoch": 1.2782545188972065, + "grad_norm": 0.9029990434646606, + "learning_rate": 1.694217346244732e-05, + "loss": 0.6146, + "step": 3500 + }, + { + "epoch": 1.2786196823078328, + "grad_norm": 1.06729257106781, + "learning_rate": 1.6927997734369904e-05, + "loss": 0.5658, + "step": 3501 + }, + { + "epoch": 1.278984845718459, + "grad_norm": 0.5850042104721069, + "learning_rate": 1.691382358671438e-05, + "loss": 0.6087, + "step": 3502 + }, + { + "epoch": 1.2793500091290853, + "grad_norm": 1.2588943243026733, + "learning_rate": 1.6899651026772776e-05, + "loss": 0.6163, + "step": 3503 + }, + { + "epoch": 1.2797151725397116, + "grad_norm": 1.1200400590896606, + "learning_rate": 1.6885480061836314e-05, + "loss": 0.5824, + "step": 3504 + }, + { + "epoch": 1.2800803359503377, + "grad_norm": 0.8963230848312378, + "learning_rate": 1.687131069919538e-05, + "loss": 0.5888, + "step": 3505 + }, + { + "epoch": 1.280445499360964, + "grad_norm": 0.977247416973114, + "learning_rate": 1.685714294613955e-05, + "loss": 0.5614, + "step": 3506 + }, + { + "epoch": 1.2808106627715903, + "grad_norm": 0.7008729577064514, + "learning_rate": 1.6842976809957562e-05, + "loss": 0.5833, + "step": 3507 + }, + { + "epoch": 1.2811758261822166, + "grad_norm": 1.0665124654769897, + "learning_rate": 1.6828812297937314e-05, + "loss": 0.563, + "step": 3508 + }, + { + "epoch": 1.281540989592843, + "grad_norm": 0.7580817937850952, + "learning_rate": 1.681464941736589e-05, + "loss": 0.5966, + "step": 3509 + }, + { + "epoch": 1.281906153003469, + "grad_norm": 2.03531813621521, + "learning_rate": 1.6800488175529516e-05, + "loss": 0.5923, + "step": 3510 + }, + { + "epoch": 1.2822713164140953, + "grad_norm": 0.7629796266555786, + "learning_rate": 1.6786328579713593e-05, + "loss": 0.5746, + "step": 3511 + }, + { + "epoch": 1.2826364798247216, + "grad_norm": 0.9172356724739075, + "learning_rate": 1.6772170637202655e-05, + "loss": 0.581, + "step": 3512 + }, + { + "epoch": 1.2830016432353477, + "grad_norm": 1.3666951656341553, + "learning_rate": 1.67580143552804e-05, + "loss": 0.5851, + "step": 3513 + }, + { + "epoch": 1.283366806645974, + "grad_norm": 0.8977357149124146, + "learning_rate": 1.6743859741229667e-05, + "loss": 0.615, + "step": 3514 + }, + { + "epoch": 1.2837319700566003, + "grad_norm": 0.7211498022079468, + "learning_rate": 1.6729706802332433e-05, + "loss": 0.5903, + "step": 3515 + }, + { + "epoch": 1.2840971334672266, + "grad_norm": 0.8175953030586243, + "learning_rate": 1.6715555545869827e-05, + "loss": 0.5858, + "step": 3516 + }, + { + "epoch": 1.284462296877853, + "grad_norm": 1.1230109930038452, + "learning_rate": 1.670140597912209e-05, + "loss": 0.5659, + "step": 3517 + }, + { + "epoch": 1.284827460288479, + "grad_norm": 1.0331265926361084, + "learning_rate": 1.6687258109368617e-05, + "loss": 0.5712, + "step": 3518 + }, + { + "epoch": 1.2851926236991054, + "grad_norm": 0.9673627614974976, + "learning_rate": 1.667311194388791e-05, + "loss": 0.597, + "step": 3519 + }, + { + "epoch": 1.2855577871097317, + "grad_norm": 0.77398282289505, + "learning_rate": 1.665896748995762e-05, + "loss": 0.6122, + "step": 3520 + }, + { + "epoch": 1.2859229505203578, + "grad_norm": 0.8281552791595459, + "learning_rate": 1.6644824754854484e-05, + "loss": 0.6006, + "step": 3521 + }, + { + "epoch": 1.286288113930984, + "grad_norm": 0.8036080002784729, + "learning_rate": 1.663068374585437e-05, + "loss": 0.593, + "step": 3522 + }, + { + "epoch": 1.2866532773416104, + "grad_norm": 0.6916384100914001, + "learning_rate": 1.661654447023227e-05, + "loss": 0.5752, + "step": 3523 + }, + { + "epoch": 1.2870184407522367, + "grad_norm": 0.7672572731971741, + "learning_rate": 1.6602406935262273e-05, + "loss": 0.5778, + "step": 3524 + }, + { + "epoch": 1.287383604162863, + "grad_norm": 0.9159708023071289, + "learning_rate": 1.658827114821756e-05, + "loss": 0.5782, + "step": 3525 + }, + { + "epoch": 1.287748767573489, + "grad_norm": 0.6897777915000916, + "learning_rate": 1.657413711637045e-05, + "loss": 0.6111, + "step": 3526 + }, + { + "epoch": 1.2881139309841154, + "grad_norm": 0.8971490859985352, + "learning_rate": 1.656000484699232e-05, + "loss": 0.5769, + "step": 3527 + }, + { + "epoch": 1.2884790943947415, + "grad_norm": 0.7451416850090027, + "learning_rate": 1.6545874347353655e-05, + "loss": 0.6108, + "step": 3528 + }, + { + "epoch": 1.2888442578053678, + "grad_norm": 0.9213032722473145, + "learning_rate": 1.653174562472403e-05, + "loss": 0.6112, + "step": 3529 + }, + { + "epoch": 1.2892094212159941, + "grad_norm": 0.91603684425354, + "learning_rate": 1.6517618686372114e-05, + "loss": 0.5648, + "step": 3530 + }, + { + "epoch": 1.2895745846266204, + "grad_norm": 0.8848807215690613, + "learning_rate": 1.6503493539565642e-05, + "loss": 0.6, + "step": 3531 + }, + { + "epoch": 1.2899397480372468, + "grad_norm": 0.9526199698448181, + "learning_rate": 1.648937019157144e-05, + "loss": 0.5811, + "step": 3532 + }, + { + "epoch": 1.2903049114478728, + "grad_norm": 0.939439058303833, + "learning_rate": 1.6475248649655398e-05, + "loss": 0.5511, + "step": 3533 + }, + { + "epoch": 1.2906700748584992, + "grad_norm": 1.0261359214782715, + "learning_rate": 1.6461128921082496e-05, + "loss": 0.5912, + "step": 3534 + }, + { + "epoch": 1.2910352382691255, + "grad_norm": 0.8147271275520325, + "learning_rate": 1.6447011013116753e-05, + "loss": 0.5732, + "step": 3535 + }, + { + "epoch": 1.2914004016797516, + "grad_norm": 0.9904578924179077, + "learning_rate": 1.6432894933021266e-05, + "loss": 0.5649, + "step": 3536 + }, + { + "epoch": 1.2917655650903779, + "grad_norm": 0.7666850090026855, + "learning_rate": 1.6418780688058197e-05, + "loss": 0.608, + "step": 3537 + }, + { + "epoch": 1.2921307285010042, + "grad_norm": 1.113914966583252, + "learning_rate": 1.6404668285488763e-05, + "loss": 0.5859, + "step": 3538 + }, + { + "epoch": 1.2924958919116305, + "grad_norm": 0.7998248934745789, + "learning_rate": 1.6390557732573217e-05, + "loss": 0.5944, + "step": 3539 + }, + { + "epoch": 1.2928610553222568, + "grad_norm": 0.9326393008232117, + "learning_rate": 1.637644903657089e-05, + "loss": 0.5851, + "step": 3540 + }, + { + "epoch": 1.293226218732883, + "grad_norm": 0.6946517825126648, + "learning_rate": 1.6362342204740124e-05, + "loss": 0.5579, + "step": 3541 + }, + { + "epoch": 1.2935913821435092, + "grad_norm": 1.0338560342788696, + "learning_rate": 1.6348237244338324e-05, + "loss": 0.5815, + "step": 3542 + }, + { + "epoch": 1.2939565455541355, + "grad_norm": 0.8650919198989868, + "learning_rate": 1.6334134162621923e-05, + "loss": 0.5754, + "step": 3543 + }, + { + "epoch": 1.2943217089647616, + "grad_norm": 0.7965771555900574, + "learning_rate": 1.63200329668464e-05, + "loss": 0.6223, + "step": 3544 + }, + { + "epoch": 1.294686872375388, + "grad_norm": 0.8801450133323669, + "learning_rate": 1.6305933664266242e-05, + "loss": 0.5831, + "step": 3545 + }, + { + "epoch": 1.2950520357860142, + "grad_norm": 0.7547784447669983, + "learning_rate": 1.629183626213498e-05, + "loss": 0.6145, + "step": 3546 + }, + { + "epoch": 1.2954171991966406, + "grad_norm": 1.00898277759552, + "learning_rate": 1.627774076770517e-05, + "loss": 0.586, + "step": 3547 + }, + { + "epoch": 1.2957823626072669, + "grad_norm": 0.8119264245033264, + "learning_rate": 1.6263647188228366e-05, + "loss": 0.5754, + "step": 3548 + }, + { + "epoch": 1.296147526017893, + "grad_norm": 0.8790151476860046, + "learning_rate": 1.624955553095515e-05, + "loss": 0.5873, + "step": 3549 + }, + { + "epoch": 1.2965126894285193, + "grad_norm": 0.7947791218757629, + "learning_rate": 1.623546580313512e-05, + "loss": 0.5911, + "step": 3550 + }, + { + "epoch": 1.2968778528391456, + "grad_norm": 0.9465892910957336, + "learning_rate": 1.622137801201687e-05, + "loss": 0.6387, + "step": 3551 + }, + { + "epoch": 1.2972430162497717, + "grad_norm": 0.9719758629798889, + "learning_rate": 1.620729216484801e-05, + "loss": 0.5508, + "step": 3552 + }, + { + "epoch": 1.297608179660398, + "grad_norm": 0.9086902141571045, + "learning_rate": 1.6193208268875133e-05, + "loss": 0.5989, + "step": 3553 + }, + { + "epoch": 1.2979733430710243, + "grad_norm": 0.8419565558433533, + "learning_rate": 1.617912633134385e-05, + "loss": 0.568, + "step": 3554 + }, + { + "epoch": 1.2983385064816506, + "grad_norm": 0.9197947978973389, + "learning_rate": 1.6165046359498748e-05, + "loss": 0.562, + "step": 3555 + }, + { + "epoch": 1.298703669892277, + "grad_norm": 0.9575653672218323, + "learning_rate": 1.6150968360583404e-05, + "loss": 0.5974, + "step": 3556 + }, + { + "epoch": 1.299068833302903, + "grad_norm": 1.4620823860168457, + "learning_rate": 1.6136892341840386e-05, + "loss": 0.6057, + "step": 3557 + }, + { + "epoch": 1.2994339967135293, + "grad_norm": 0.8264151811599731, + "learning_rate": 1.612281831051124e-05, + "loss": 0.5728, + "step": 3558 + }, + { + "epoch": 1.2997991601241556, + "grad_norm": 1.0922088623046875, + "learning_rate": 1.6108746273836495e-05, + "loss": 0.5734, + "step": 3559 + }, + { + "epoch": 1.3001643235347817, + "grad_norm": 0.9147183895111084, + "learning_rate": 1.6094676239055654e-05, + "loss": 0.5759, + "step": 3560 + }, + { + "epoch": 1.300529486945408, + "grad_norm": 0.938529372215271, + "learning_rate": 1.6080608213407164e-05, + "loss": 0.5618, + "step": 3561 + }, + { + "epoch": 1.3008946503560344, + "grad_norm": 0.7035357356071472, + "learning_rate": 1.606654220412848e-05, + "loss": 0.5869, + "step": 3562 + }, + { + "epoch": 1.3012598137666607, + "grad_norm": 0.8934658169746399, + "learning_rate": 1.6052478218455986e-05, + "loss": 0.5558, + "step": 3563 + }, + { + "epoch": 1.3016249771772868, + "grad_norm": 0.8569304943084717, + "learning_rate": 1.603841626362505e-05, + "loss": 0.5476, + "step": 3564 + }, + { + "epoch": 1.301990140587913, + "grad_norm": 0.8856249451637268, + "learning_rate": 1.6024356346869975e-05, + "loss": 0.554, + "step": 3565 + }, + { + "epoch": 1.3023553039985394, + "grad_norm": 0.8537352085113525, + "learning_rate": 1.6010298475424028e-05, + "loss": 0.5645, + "step": 3566 + }, + { + "epoch": 1.3027204674091655, + "grad_norm": 0.9424476027488708, + "learning_rate": 1.5996242656519418e-05, + "loss": 0.6075, + "step": 3567 + }, + { + "epoch": 1.3030856308197918, + "grad_norm": 0.8585494756698608, + "learning_rate": 1.5982188897387296e-05, + "loss": 0.5877, + "step": 3568 + }, + { + "epoch": 1.303450794230418, + "grad_norm": 0.8759137988090515, + "learning_rate": 1.596813720525777e-05, + "loss": 0.5927, + "step": 3569 + }, + { + "epoch": 1.3038159576410444, + "grad_norm": 1.1474668979644775, + "learning_rate": 1.5954087587359857e-05, + "loss": 0.5372, + "step": 3570 + }, + { + "epoch": 1.3041811210516707, + "grad_norm": 0.9488094449043274, + "learning_rate": 1.594004005092153e-05, + "loss": 0.5874, + "step": 3571 + }, + { + "epoch": 1.3045462844622968, + "grad_norm": 0.6511358022689819, + "learning_rate": 1.5925994603169678e-05, + "loss": 0.5779, + "step": 3572 + }, + { + "epoch": 1.3049114478729231, + "grad_norm": 0.8309807777404785, + "learning_rate": 1.5911951251330127e-05, + "loss": 0.5756, + "step": 3573 + }, + { + "epoch": 1.3052766112835494, + "grad_norm": 0.8680068850517273, + "learning_rate": 1.5897910002627625e-05, + "loss": 0.593, + "step": 3574 + }, + { + "epoch": 1.3056417746941755, + "grad_norm": 0.7424835562705994, + "learning_rate": 1.5883870864285806e-05, + "loss": 0.5756, + "step": 3575 + }, + { + "epoch": 1.3060069381048018, + "grad_norm": 0.8280590772628784, + "learning_rate": 1.586983384352726e-05, + "loss": 0.5773, + "step": 3576 + }, + { + "epoch": 1.3063721015154282, + "grad_norm": 1.1244232654571533, + "learning_rate": 1.5855798947573464e-05, + "loss": 0.6116, + "step": 3577 + }, + { + "epoch": 1.3067372649260545, + "grad_norm": 0.8125243186950684, + "learning_rate": 1.584176618364482e-05, + "loss": 0.559, + "step": 3578 + }, + { + "epoch": 1.3071024283366808, + "grad_norm": 0.8949918150901794, + "learning_rate": 1.582773555896061e-05, + "loss": 0.5954, + "step": 3579 + }, + { + "epoch": 1.3074675917473069, + "grad_norm": 1.0998412370681763, + "learning_rate": 1.5813707080739028e-05, + "loss": 0.5421, + "step": 3580 + }, + { + "epoch": 1.3078327551579332, + "grad_norm": 0.9270798563957214, + "learning_rate": 1.5799680756197177e-05, + "loss": 0.6094, + "step": 3581 + }, + { + "epoch": 1.3081979185685595, + "grad_norm": 0.7078328132629395, + "learning_rate": 1.5785656592551022e-05, + "loss": 0.5704, + "step": 3582 + }, + { + "epoch": 1.3085630819791856, + "grad_norm": 0.7362704277038574, + "learning_rate": 1.5771634597015445e-05, + "loss": 0.5872, + "step": 3583 + }, + { + "epoch": 1.308928245389812, + "grad_norm": 1.009394645690918, + "learning_rate": 1.575761477680419e-05, + "loss": 0.5833, + "step": 3584 + }, + { + "epoch": 1.3092934088004382, + "grad_norm": 1.5687860250473022, + "learning_rate": 1.57435971391299e-05, + "loss": 0.5931, + "step": 3585 + }, + { + "epoch": 1.3096585722110645, + "grad_norm": 0.6487178206443787, + "learning_rate": 1.572958169120408e-05, + "loss": 0.5676, + "step": 3586 + }, + { + "epoch": 1.3100237356216908, + "grad_norm": 0.9973515272140503, + "learning_rate": 1.5715568440237122e-05, + "loss": 0.5599, + "step": 3587 + }, + { + "epoch": 1.310388899032317, + "grad_norm": 0.9869074821472168, + "learning_rate": 1.5701557393438277e-05, + "loss": 0.6007, + "step": 3588 + }, + { + "epoch": 1.3107540624429432, + "grad_norm": 0.6144683361053467, + "learning_rate": 1.5687548558015663e-05, + "loss": 0.5912, + "step": 3589 + }, + { + "epoch": 1.3111192258535695, + "grad_norm": 0.8956754803657532, + "learning_rate": 1.567354194117627e-05, + "loss": 0.5551, + "step": 3590 + }, + { + "epoch": 1.3114843892641956, + "grad_norm": 0.9467038512229919, + "learning_rate": 1.565953755012594e-05, + "loss": 0.5912, + "step": 3591 + }, + { + "epoch": 1.311849552674822, + "grad_norm": 0.9374017715454102, + "learning_rate": 1.5645535392069366e-05, + "loss": 0.5633, + "step": 3592 + }, + { + "epoch": 1.3122147160854483, + "grad_norm": 1.8570634126663208, + "learning_rate": 1.56315354742101e-05, + "loss": 0.6488, + "step": 3593 + }, + { + "epoch": 1.3125798794960746, + "grad_norm": 0.7921823263168335, + "learning_rate": 1.5617537803750538e-05, + "loss": 0.5729, + "step": 3594 + }, + { + "epoch": 1.3129450429067007, + "grad_norm": 0.7369295358657837, + "learning_rate": 1.560354238789192e-05, + "loss": 0.5887, + "step": 3595 + }, + { + "epoch": 1.313310206317327, + "grad_norm": 0.7375071048736572, + "learning_rate": 1.558954923383432e-05, + "loss": 0.5477, + "step": 3596 + }, + { + "epoch": 1.3136753697279533, + "grad_norm": 0.6595288515090942, + "learning_rate": 1.5575558348776664e-05, + "loss": 0.5816, + "step": 3597 + }, + { + "epoch": 1.3140405331385794, + "grad_norm": 0.7850028276443481, + "learning_rate": 1.556156973991669e-05, + "loss": 0.5391, + "step": 3598 + }, + { + "epoch": 1.3144056965492057, + "grad_norm": 0.8331600427627563, + "learning_rate": 1.5547583414450985e-05, + "loss": 0.6069, + "step": 3599 + }, + { + "epoch": 1.314770859959832, + "grad_norm": 0.8196750283241272, + "learning_rate": 1.5533599379574956e-05, + "loss": 0.5251, + "step": 3600 + }, + { + "epoch": 1.3151360233704583, + "grad_norm": 0.9406454563140869, + "learning_rate": 1.551961764248281e-05, + "loss": 0.5274, + "step": 3601 + }, + { + "epoch": 1.3155011867810846, + "grad_norm": 0.644262969493866, + "learning_rate": 1.5505638210367605e-05, + "loss": 0.5952, + "step": 3602 + }, + { + "epoch": 1.3158663501917107, + "grad_norm": 0.9494520425796509, + "learning_rate": 1.5491661090421193e-05, + "loss": 0.59, + "step": 3603 + }, + { + "epoch": 1.316231513602337, + "grad_norm": 0.9427124261856079, + "learning_rate": 1.5477686289834238e-05, + "loss": 0.5742, + "step": 3604 + }, + { + "epoch": 1.3165966770129633, + "grad_norm": 0.8440623879432678, + "learning_rate": 1.5463713815796223e-05, + "loss": 0.5854, + "step": 3605 + }, + { + "epoch": 1.3169618404235894, + "grad_norm": 0.7992889881134033, + "learning_rate": 1.5449743675495416e-05, + "loss": 0.5585, + "step": 3606 + }, + { + "epoch": 1.3173270038342157, + "grad_norm": 0.8490431308746338, + "learning_rate": 1.54357758761189e-05, + "loss": 0.6118, + "step": 3607 + }, + { + "epoch": 1.317692167244842, + "grad_norm": 0.8494915962219238, + "learning_rate": 1.5421810424852542e-05, + "loss": 0.567, + "step": 3608 + }, + { + "epoch": 1.3180573306554684, + "grad_norm": 1.1305323839187622, + "learning_rate": 1.5407847328881013e-05, + "loss": 0.5708, + "step": 3609 + }, + { + "epoch": 1.3184224940660947, + "grad_norm": 1.355056643486023, + "learning_rate": 1.5393886595387756e-05, + "loss": 0.5883, + "step": 3610 + }, + { + "epoch": 1.3187876574767208, + "grad_norm": 1.0465811491012573, + "learning_rate": 1.5379928231555014e-05, + "loss": 0.5835, + "step": 3611 + }, + { + "epoch": 1.319152820887347, + "grad_norm": 1.0105823278427124, + "learning_rate": 1.536597224456381e-05, + "loss": 0.564, + "step": 3612 + }, + { + "epoch": 1.3195179842979734, + "grad_norm": 1.0827776193618774, + "learning_rate": 1.5352018641593933e-05, + "loss": 0.5918, + "step": 3613 + }, + { + "epoch": 1.3198831477085995, + "grad_norm": 0.5904019474983215, + "learning_rate": 1.5338067429823956e-05, + "loss": 0.6219, + "step": 3614 + }, + { + "epoch": 1.3202483111192258, + "grad_norm": 0.9047408103942871, + "learning_rate": 1.5324118616431216e-05, + "loss": 0.5801, + "step": 3615 + }, + { + "epoch": 1.3206134745298521, + "grad_norm": 0.8691691756248474, + "learning_rate": 1.531017220859181e-05, + "loss": 0.5646, + "step": 3616 + }, + { + "epoch": 1.3209786379404784, + "grad_norm": 0.9058032631874084, + "learning_rate": 1.5296228213480615e-05, + "loss": 0.5858, + "step": 3617 + }, + { + "epoch": 1.3213438013511047, + "grad_norm": 0.896232008934021, + "learning_rate": 1.5282286638271248e-05, + "loss": 0.5699, + "step": 3618 + }, + { + "epoch": 1.3217089647617308, + "grad_norm": 0.97877436876297, + "learning_rate": 1.5268347490136102e-05, + "loss": 0.574, + "step": 3619 + }, + { + "epoch": 1.3220741281723571, + "grad_norm": 0.7933487296104431, + "learning_rate": 1.5254410776246299e-05, + "loss": 0.5931, + "step": 3620 + }, + { + "epoch": 1.3224392915829835, + "grad_norm": 0.6697726249694824, + "learning_rate": 1.5240476503771726e-05, + "loss": 0.5953, + "step": 3621 + }, + { + "epoch": 1.3228044549936095, + "grad_norm": 0.7056508660316467, + "learning_rate": 1.5226544679881e-05, + "loss": 0.5887, + "step": 3622 + }, + { + "epoch": 1.3231696184042359, + "grad_norm": 0.9026390314102173, + "learning_rate": 1.5212615311741488e-05, + "loss": 0.5659, + "step": 3623 + }, + { + "epoch": 1.3235347818148622, + "grad_norm": 0.9017171859741211, + "learning_rate": 1.5198688406519297e-05, + "loss": 0.5895, + "step": 3624 + }, + { + "epoch": 1.3238999452254885, + "grad_norm": 0.7538767457008362, + "learning_rate": 1.5184763971379255e-05, + "loss": 0.5979, + "step": 3625 + }, + { + "epoch": 1.3242651086361148, + "grad_norm": 1.9563366174697876, + "learning_rate": 1.5170842013484928e-05, + "loss": 0.5507, + "step": 3626 + }, + { + "epoch": 1.3246302720467409, + "grad_norm": 0.705376386642456, + "learning_rate": 1.5156922539998609e-05, + "loss": 0.5605, + "step": 3627 + }, + { + "epoch": 1.3249954354573672, + "grad_norm": 0.6387503743171692, + "learning_rate": 1.5143005558081292e-05, + "loss": 0.5996, + "step": 3628 + }, + { + "epoch": 1.3253605988679933, + "grad_norm": 0.7741349935531616, + "learning_rate": 1.5129091074892721e-05, + "loss": 0.5544, + "step": 3629 + }, + { + "epoch": 1.3257257622786196, + "grad_norm": 0.865826427936554, + "learning_rate": 1.5115179097591331e-05, + "loss": 0.5783, + "step": 3630 + }, + { + "epoch": 1.326090925689246, + "grad_norm": 0.9152030348777771, + "learning_rate": 1.5101269633334284e-05, + "loss": 0.579, + "step": 3631 + }, + { + "epoch": 1.3264560890998722, + "grad_norm": 0.7293338775634766, + "learning_rate": 1.5087362689277431e-05, + "loss": 0.5459, + "step": 3632 + }, + { + "epoch": 1.3268212525104985, + "grad_norm": 0.7804079651832581, + "learning_rate": 1.5073458272575345e-05, + "loss": 0.6124, + "step": 3633 + }, + { + "epoch": 1.3271864159211246, + "grad_norm": 0.7898918986320496, + "learning_rate": 1.5059556390381289e-05, + "loss": 0.569, + "step": 3634 + }, + { + "epoch": 1.327551579331751, + "grad_norm": 0.6254060864448547, + "learning_rate": 1.5045657049847223e-05, + "loss": 0.6038, + "step": 3635 + }, + { + "epoch": 1.3279167427423773, + "grad_norm": 0.7602380514144897, + "learning_rate": 1.50317602581238e-05, + "loss": 0.5654, + "step": 3636 + }, + { + "epoch": 1.3282819061530033, + "grad_norm": 0.8868459463119507, + "learning_rate": 1.5017866022360356e-05, + "loss": 0.5845, + "step": 3637 + }, + { + "epoch": 1.3286470695636297, + "grad_norm": 0.9397568702697754, + "learning_rate": 1.5003974349704931e-05, + "loss": 0.5626, + "step": 3638 + }, + { + "epoch": 1.329012232974256, + "grad_norm": 0.755822479724884, + "learning_rate": 1.4990085247304218e-05, + "loss": 0.5749, + "step": 3639 + }, + { + "epoch": 1.3293773963848823, + "grad_norm": 0.8150222897529602, + "learning_rate": 1.4976198722303619e-05, + "loss": 0.553, + "step": 3640 + }, + { + "epoch": 1.3297425597955086, + "grad_norm": 1.1800682544708252, + "learning_rate": 1.496231478184718e-05, + "loss": 0.585, + "step": 3641 + }, + { + "epoch": 1.3301077232061347, + "grad_norm": 0.8337178826332092, + "learning_rate": 1.4948433433077632e-05, + "loss": 0.5671, + "step": 3642 + }, + { + "epoch": 1.330472886616761, + "grad_norm": 0.7551597952842712, + "learning_rate": 1.4934554683136382e-05, + "loss": 0.5992, + "step": 3643 + }, + { + "epoch": 1.3308380500273873, + "grad_norm": 0.6858837604522705, + "learning_rate": 1.4920678539163479e-05, + "loss": 0.5988, + "step": 3644 + }, + { + "epoch": 1.3312032134380134, + "grad_norm": 1.4082273244857788, + "learning_rate": 1.4906805008297645e-05, + "loss": 0.5713, + "step": 3645 + }, + { + "epoch": 1.3315683768486397, + "grad_norm": 0.8915179967880249, + "learning_rate": 1.4892934097676262e-05, + "loss": 0.5796, + "step": 3646 + }, + { + "epoch": 1.331933540259266, + "grad_norm": 0.8021594285964966, + "learning_rate": 1.4879065814435349e-05, + "loss": 0.5968, + "step": 3647 + }, + { + "epoch": 1.3322987036698923, + "grad_norm": 0.8213039636611938, + "learning_rate": 1.4865200165709588e-05, + "loss": 0.5697, + "step": 3648 + }, + { + "epoch": 1.3326638670805186, + "grad_norm": 0.6211522817611694, + "learning_rate": 1.485133715863229e-05, + "loss": 0.5959, + "step": 3649 + }, + { + "epoch": 1.3330290304911447, + "grad_norm": 0.7799485325813293, + "learning_rate": 1.4837476800335427e-05, + "loss": 0.5653, + "step": 3650 + }, + { + "epoch": 1.333394193901771, + "grad_norm": 0.9671348333358765, + "learning_rate": 1.4823619097949584e-05, + "loss": 0.5603, + "step": 3651 + }, + { + "epoch": 1.3337593573123974, + "grad_norm": 0.9694781303405762, + "learning_rate": 1.4809764058604006e-05, + "loss": 0.5805, + "step": 3652 + }, + { + "epoch": 1.3341245207230235, + "grad_norm": 1.7140861749649048, + "learning_rate": 1.4795911689426543e-05, + "loss": 0.5508, + "step": 3653 + }, + { + "epoch": 1.3344896841336498, + "grad_norm": 0.8197154998779297, + "learning_rate": 1.4782061997543699e-05, + "loss": 0.5751, + "step": 3654 + }, + { + "epoch": 1.334854847544276, + "grad_norm": 0.7604065537452698, + "learning_rate": 1.476821499008057e-05, + "loss": 0.5731, + "step": 3655 + }, + { + "epoch": 1.3352200109549024, + "grad_norm": 0.8634833097457886, + "learning_rate": 1.4754370674160885e-05, + "loss": 0.5941, + "step": 3656 + }, + { + "epoch": 1.3355851743655287, + "grad_norm": 0.946653425693512, + "learning_rate": 1.4740529056906994e-05, + "loss": 0.5927, + "step": 3657 + }, + { + "epoch": 1.3359503377761548, + "grad_norm": 0.8148925304412842, + "learning_rate": 1.4726690145439858e-05, + "loss": 0.5824, + "step": 3658 + }, + { + "epoch": 1.336315501186781, + "grad_norm": 1.1249194145202637, + "learning_rate": 1.4712853946879035e-05, + "loss": 0.5413, + "step": 3659 + }, + { + "epoch": 1.3366806645974074, + "grad_norm": 0.791527271270752, + "learning_rate": 1.46990204683427e-05, + "loss": 0.5885, + "step": 3660 + }, + { + "epoch": 1.3370458280080335, + "grad_norm": 0.8487535119056702, + "learning_rate": 1.4685189716947614e-05, + "loss": 0.5484, + "step": 3661 + }, + { + "epoch": 1.3374109914186598, + "grad_norm": 0.6759431958198547, + "learning_rate": 1.4671361699809153e-05, + "loss": 0.6078, + "step": 3662 + }, + { + "epoch": 1.3377761548292861, + "grad_norm": 1.246487021446228, + "learning_rate": 1.4657536424041268e-05, + "loss": 0.5653, + "step": 3663 + }, + { + "epoch": 1.3381413182399124, + "grad_norm": 0.8097664713859558, + "learning_rate": 1.4643713896756518e-05, + "loss": 0.5686, + "step": 3664 + }, + { + "epoch": 1.3385064816505385, + "grad_norm": 0.6811612844467163, + "learning_rate": 1.4629894125066028e-05, + "loss": 0.5887, + "step": 3665 + }, + { + "epoch": 1.3388716450611649, + "grad_norm": 0.9761230945587158, + "learning_rate": 1.4616077116079524e-05, + "loss": 0.5722, + "step": 3666 + }, + { + "epoch": 1.3392368084717912, + "grad_norm": 0.793403148651123, + "learning_rate": 1.4602262876905306e-05, + "loss": 0.5549, + "step": 3667 + }, + { + "epoch": 1.3396019718824173, + "grad_norm": 0.8240900039672852, + "learning_rate": 1.458845141465024e-05, + "loss": 0.608, + "step": 3668 + }, + { + "epoch": 1.3399671352930436, + "grad_norm": 1.1468547582626343, + "learning_rate": 1.4574642736419763e-05, + "loss": 0.5709, + "step": 3669 + }, + { + "epoch": 1.3403322987036699, + "grad_norm": 0.8393019437789917, + "learning_rate": 1.4560836849317895e-05, + "loss": 0.5826, + "step": 3670 + }, + { + "epoch": 1.3406974621142962, + "grad_norm": 1.20576012134552, + "learning_rate": 1.4547033760447202e-05, + "loss": 0.5608, + "step": 3671 + }, + { + "epoch": 1.3410626255249225, + "grad_norm": 0.8008895516395569, + "learning_rate": 1.4533233476908835e-05, + "loss": 0.5823, + "step": 3672 + }, + { + "epoch": 1.3414277889355486, + "grad_norm": 0.9509861469268799, + "learning_rate": 1.451943600580247e-05, + "loss": 0.5393, + "step": 3673 + }, + { + "epoch": 1.341792952346175, + "grad_norm": 0.9399440288543701, + "learning_rate": 1.450564135422636e-05, + "loss": 0.5958, + "step": 3674 + }, + { + "epoch": 1.3421581157568012, + "grad_norm": 1.1058588027954102, + "learning_rate": 1.4491849529277295e-05, + "loss": 0.556, + "step": 3675 + }, + { + "epoch": 1.3425232791674273, + "grad_norm": 0.7471702098846436, + "learning_rate": 1.4478060538050622e-05, + "loss": 0.6152, + "step": 3676 + }, + { + "epoch": 1.3428884425780536, + "grad_norm": 0.9087870717048645, + "learning_rate": 1.4464274387640224e-05, + "loss": 0.5765, + "step": 3677 + }, + { + "epoch": 1.34325360598868, + "grad_norm": 1.1029573678970337, + "learning_rate": 1.4450491085138514e-05, + "loss": 0.5664, + "step": 3678 + }, + { + "epoch": 1.3436187693993062, + "grad_norm": 0.8007150292396545, + "learning_rate": 1.4436710637636456e-05, + "loss": 0.5482, + "step": 3679 + }, + { + "epoch": 1.3439839328099326, + "grad_norm": 1.6680142879486084, + "learning_rate": 1.442293305222354e-05, + "loss": 0.6091, + "step": 3680 + }, + { + "epoch": 1.3443490962205586, + "grad_norm": 1.0721821784973145, + "learning_rate": 1.4409158335987763e-05, + "loss": 0.6155, + "step": 3681 + }, + { + "epoch": 1.344714259631185, + "grad_norm": 0.790995180606842, + "learning_rate": 1.4395386496015685e-05, + "loss": 0.5885, + "step": 3682 + }, + { + "epoch": 1.3450794230418113, + "grad_norm": 0.9400860667228699, + "learning_rate": 1.4381617539392347e-05, + "loss": 0.5948, + "step": 3683 + }, + { + "epoch": 1.3454445864524374, + "grad_norm": 0.8138934373855591, + "learning_rate": 1.436785147320134e-05, + "loss": 0.5776, + "step": 3684 + }, + { + "epoch": 1.3458097498630637, + "grad_norm": 1.228329062461853, + "learning_rate": 1.4354088304524739e-05, + "loss": 0.5466, + "step": 3685 + }, + { + "epoch": 1.34617491327369, + "grad_norm": 0.8579606413841248, + "learning_rate": 1.4340328040443154e-05, + "loss": 0.5759, + "step": 3686 + }, + { + "epoch": 1.3465400766843163, + "grad_norm": 1.073363184928894, + "learning_rate": 1.4326570688035682e-05, + "loss": 0.5757, + "step": 3687 + }, + { + "epoch": 1.3469052400949426, + "grad_norm": 0.8040978908538818, + "learning_rate": 1.4312816254379928e-05, + "loss": 0.5711, + "step": 3688 + }, + { + "epoch": 1.3472704035055687, + "grad_norm": 0.5264380574226379, + "learning_rate": 1.4299064746552005e-05, + "loss": 0.6295, + "step": 3689 + }, + { + "epoch": 1.347635566916195, + "grad_norm": 1.2140576839447021, + "learning_rate": 1.42853161716265e-05, + "loss": 0.5482, + "step": 3690 + }, + { + "epoch": 1.3480007303268213, + "grad_norm": 0.8451759219169617, + "learning_rate": 1.4271570536676513e-05, + "loss": 0.6004, + "step": 3691 + }, + { + "epoch": 1.3483658937374474, + "grad_norm": 0.6922603845596313, + "learning_rate": 1.4257827848773613e-05, + "loss": 0.5847, + "step": 3692 + }, + { + "epoch": 1.3487310571480737, + "grad_norm": 0.8059636354446411, + "learning_rate": 1.424408811498787e-05, + "loss": 0.5731, + "step": 3693 + }, + { + "epoch": 1.3490962205587, + "grad_norm": 1.1259942054748535, + "learning_rate": 1.4230351342387827e-05, + "loss": 0.5465, + "step": 3694 + }, + { + "epoch": 1.3494613839693264, + "grad_norm": 0.9364913105964661, + "learning_rate": 1.4216617538040488e-05, + "loss": 0.5436, + "step": 3695 + }, + { + "epoch": 1.3498265473799524, + "grad_norm": 0.8418157696723938, + "learning_rate": 1.4202886709011357e-05, + "loss": 0.5953, + "step": 3696 + }, + { + "epoch": 1.3501917107905788, + "grad_norm": 0.9355802536010742, + "learning_rate": 1.4189158862364386e-05, + "loss": 0.6121, + "step": 3697 + }, + { + "epoch": 1.350556874201205, + "grad_norm": 0.7521985769271851, + "learning_rate": 1.417543400516201e-05, + "loss": 0.5795, + "step": 3698 + }, + { + "epoch": 1.3509220376118312, + "grad_norm": 0.953596830368042, + "learning_rate": 1.4161712144465108e-05, + "loss": 0.5792, + "step": 3699 + }, + { + "epoch": 1.3512872010224575, + "grad_norm": 0.640637218952179, + "learning_rate": 1.4147993287333032e-05, + "loss": 0.5792, + "step": 3700 + }, + { + "epoch": 1.3516523644330838, + "grad_norm": 0.8487382531166077, + "learning_rate": 1.413427744082359e-05, + "loss": 0.5794, + "step": 3701 + }, + { + "epoch": 1.35201752784371, + "grad_norm": 0.7688397765159607, + "learning_rate": 1.412056461199302e-05, + "loss": 0.6128, + "step": 3702 + }, + { + "epoch": 1.3523826912543364, + "grad_norm": 0.9199714660644531, + "learning_rate": 1.4106854807896035e-05, + "loss": 0.5534, + "step": 3703 + }, + { + "epoch": 1.3527478546649625, + "grad_norm": 0.8518757820129395, + "learning_rate": 1.4093148035585774e-05, + "loss": 0.5898, + "step": 3704 + }, + { + "epoch": 1.3531130180755888, + "grad_norm": 1.321502923965454, + "learning_rate": 1.4079444302113821e-05, + "loss": 0.5872, + "step": 3705 + }, + { + "epoch": 1.3534781814862151, + "grad_norm": 0.7342141270637512, + "learning_rate": 1.4065743614530193e-05, + "loss": 0.5936, + "step": 3706 + }, + { + "epoch": 1.3538433448968412, + "grad_norm": 0.9883751273155212, + "learning_rate": 1.405204597988336e-05, + "loss": 0.5825, + "step": 3707 + }, + { + "epoch": 1.3542085083074675, + "grad_norm": 0.7839553356170654, + "learning_rate": 1.403835140522019e-05, + "loss": 0.5811, + "step": 3708 + }, + { + "epoch": 1.3545736717180938, + "grad_norm": 1.0226047039031982, + "learning_rate": 1.4024659897585989e-05, + "loss": 0.5851, + "step": 3709 + }, + { + "epoch": 1.3549388351287202, + "grad_norm": 0.95647794008255, + "learning_rate": 1.4010971464024494e-05, + "loss": 0.5844, + "step": 3710 + }, + { + "epoch": 1.3553039985393465, + "grad_norm": 0.9128265976905823, + "learning_rate": 1.3997286111577864e-05, + "loss": 0.6129, + "step": 3711 + }, + { + "epoch": 1.3556691619499726, + "grad_norm": 0.8987192511558533, + "learning_rate": 1.3983603847286648e-05, + "loss": 0.5786, + "step": 3712 + }, + { + "epoch": 1.3560343253605989, + "grad_norm": 0.8189154863357544, + "learning_rate": 1.3969924678189837e-05, + "loss": 0.5656, + "step": 3713 + }, + { + "epoch": 1.3563994887712252, + "grad_norm": 0.8834811449050903, + "learning_rate": 1.3956248611324803e-05, + "loss": 0.5743, + "step": 3714 + }, + { + "epoch": 1.3567646521818513, + "grad_norm": 0.9477565288543701, + "learning_rate": 1.3942575653727341e-05, + "loss": 0.5716, + "step": 3715 + }, + { + "epoch": 1.3571298155924776, + "grad_norm": 0.9446448087692261, + "learning_rate": 1.392890581243163e-05, + "loss": 0.567, + "step": 3716 + }, + { + "epoch": 1.357494979003104, + "grad_norm": 0.7631604075431824, + "learning_rate": 1.3915239094470268e-05, + "loss": 0.5714, + "step": 3717 + }, + { + "epoch": 1.3578601424137302, + "grad_norm": 0.6586843132972717, + "learning_rate": 1.3901575506874218e-05, + "loss": 0.5876, + "step": 3718 + }, + { + "epoch": 1.3582253058243565, + "grad_norm": 0.8762580752372742, + "learning_rate": 1.3887915056672863e-05, + "loss": 0.6044, + "step": 3719 + }, + { + "epoch": 1.3585904692349826, + "grad_norm": 0.9814974069595337, + "learning_rate": 1.387425775089395e-05, + "loss": 0.5732, + "step": 3720 + }, + { + "epoch": 1.358955632645609, + "grad_norm": 0.6957576870918274, + "learning_rate": 1.3860603596563606e-05, + "loss": 0.5784, + "step": 3721 + }, + { + "epoch": 1.3593207960562352, + "grad_norm": 0.841157078742981, + "learning_rate": 1.3846952600706354e-05, + "loss": 0.5605, + "step": 3722 + }, + { + "epoch": 1.3596859594668613, + "grad_norm": 0.8217060565948486, + "learning_rate": 1.3833304770345084e-05, + "loss": 0.5889, + "step": 3723 + }, + { + "epoch": 1.3600511228774876, + "grad_norm": 1.0585644245147705, + "learning_rate": 1.3819660112501054e-05, + "loss": 0.5721, + "step": 3724 + }, + { + "epoch": 1.360416286288114, + "grad_norm": 0.8639219403266907, + "learning_rate": 1.3806018634193899e-05, + "loss": 0.5401, + "step": 3725 + }, + { + "epoch": 1.3607814496987403, + "grad_norm": 0.7209730744361877, + "learning_rate": 1.3792380342441601e-05, + "loss": 0.5725, + "step": 3726 + }, + { + "epoch": 1.3611466131093666, + "grad_norm": 0.782489538192749, + "learning_rate": 1.3778745244260528e-05, + "loss": 0.5829, + "step": 3727 + }, + { + "epoch": 1.3615117765199927, + "grad_norm": 0.9441948533058167, + "learning_rate": 1.3765113346665375e-05, + "loss": 0.5483, + "step": 3728 + }, + { + "epoch": 1.361876939930619, + "grad_norm": 0.7579976320266724, + "learning_rate": 1.3751484656669223e-05, + "loss": 0.5814, + "step": 3729 + }, + { + "epoch": 1.362242103341245, + "grad_norm": 0.8741255402565002, + "learning_rate": 1.3737859181283471e-05, + "loss": 0.5351, + "step": 3730 + }, + { + "epoch": 1.3626072667518714, + "grad_norm": 1.0973438024520874, + "learning_rate": 1.3724236927517887e-05, + "loss": 0.5543, + "step": 3731 + }, + { + "epoch": 1.3629724301624977, + "grad_norm": 0.7737992405891418, + "learning_rate": 1.3710617902380579e-05, + "loss": 0.593, + "step": 3732 + }, + { + "epoch": 1.363337593573124, + "grad_norm": 1.0185577869415283, + "learning_rate": 1.3697002112877975e-05, + "loss": 0.5996, + "step": 3733 + }, + { + "epoch": 1.3637027569837503, + "grad_norm": 1.013541579246521, + "learning_rate": 1.3683389566014871e-05, + "loss": 0.5299, + "step": 3734 + }, + { + "epoch": 1.3640679203943764, + "grad_norm": 0.8510308861732483, + "learning_rate": 1.3669780268794362e-05, + "loss": 0.5641, + "step": 3735 + }, + { + "epoch": 1.3644330838050027, + "grad_norm": 1.2188884019851685, + "learning_rate": 1.3656174228217883e-05, + "loss": 0.5375, + "step": 3736 + }, + { + "epoch": 1.364798247215629, + "grad_norm": 1.3204916715621948, + "learning_rate": 1.3642571451285207e-05, + "loss": 0.5532, + "step": 3737 + }, + { + "epoch": 1.3651634106262551, + "grad_norm": 0.905667781829834, + "learning_rate": 1.3628971944994407e-05, + "loss": 0.5627, + "step": 3738 + }, + { + "epoch": 1.3655285740368814, + "grad_norm": 0.7779754400253296, + "learning_rate": 1.3615375716341893e-05, + "loss": 0.5475, + "step": 3739 + }, + { + "epoch": 1.3658937374475077, + "grad_norm": 1.3370355367660522, + "learning_rate": 1.3601782772322368e-05, + "loss": 0.5469, + "step": 3740 + }, + { + "epoch": 1.366258900858134, + "grad_norm": 0.7945521473884583, + "learning_rate": 1.3588193119928868e-05, + "loss": 0.5854, + "step": 3741 + }, + { + "epoch": 1.3666240642687604, + "grad_norm": 1.4507278203964233, + "learning_rate": 1.3574606766152712e-05, + "loss": 0.5408, + "step": 3742 + }, + { + "epoch": 1.3669892276793865, + "grad_norm": 0.8589543104171753, + "learning_rate": 1.3561023717983541e-05, + "loss": 0.5685, + "step": 3743 + }, + { + "epoch": 1.3673543910900128, + "grad_norm": 0.8070398569107056, + "learning_rate": 1.3547443982409291e-05, + "loss": 0.5871, + "step": 3744 + }, + { + "epoch": 1.367719554500639, + "grad_norm": 0.8978539109230042, + "learning_rate": 1.3533867566416184e-05, + "loss": 0.5471, + "step": 3745 + }, + { + "epoch": 1.3680847179112652, + "grad_norm": 0.7683486342430115, + "learning_rate": 1.3520294476988747e-05, + "loss": 0.5642, + "step": 3746 + }, + { + "epoch": 1.3684498813218915, + "grad_norm": 0.7854225635528564, + "learning_rate": 1.3506724721109792e-05, + "loss": 0.5832, + "step": 3747 + }, + { + "epoch": 1.3688150447325178, + "grad_norm": 0.8262268304824829, + "learning_rate": 1.3493158305760401e-05, + "loss": 0.5329, + "step": 3748 + }, + { + "epoch": 1.3691802081431441, + "grad_norm": 0.8037892580032349, + "learning_rate": 1.3479595237919963e-05, + "loss": 0.5434, + "step": 3749 + }, + { + "epoch": 1.3695453715537704, + "grad_norm": 0.862305223941803, + "learning_rate": 1.346603552456612e-05, + "loss": 0.5833, + "step": 3750 + }, + { + "epoch": 1.3699105349643965, + "grad_norm": 1.2398102283477783, + "learning_rate": 1.3452479172674817e-05, + "loss": 0.5561, + "step": 3751 + }, + { + "epoch": 1.3702756983750228, + "grad_norm": 0.859796404838562, + "learning_rate": 1.3438926189220239e-05, + "loss": 0.5598, + "step": 3752 + }, + { + "epoch": 1.3706408617856491, + "grad_norm": 0.9049490690231323, + "learning_rate": 1.3425376581174855e-05, + "loss": 0.5749, + "step": 3753 + }, + { + "epoch": 1.3710060251962752, + "grad_norm": 0.8754092454910278, + "learning_rate": 1.34118303555094e-05, + "loss": 0.5948, + "step": 3754 + }, + { + "epoch": 1.3713711886069015, + "grad_norm": 1.0824122428894043, + "learning_rate": 1.3398287519192858e-05, + "loss": 0.5371, + "step": 3755 + }, + { + "epoch": 1.3717363520175279, + "grad_norm": 0.76957768201828, + "learning_rate": 1.3384748079192482e-05, + "loss": 0.5688, + "step": 3756 + }, + { + "epoch": 1.3721015154281542, + "grad_norm": 0.7734008431434631, + "learning_rate": 1.337121204247376e-05, + "loss": 0.6054, + "step": 3757 + }, + { + "epoch": 1.3724666788387805, + "grad_norm": 0.9477915167808533, + "learning_rate": 1.335767941600045e-05, + "loss": 0.5388, + "step": 3758 + }, + { + "epoch": 1.3728318422494066, + "grad_norm": 0.7826148867607117, + "learning_rate": 1.3344150206734537e-05, + "loss": 0.5506, + "step": 3759 + }, + { + "epoch": 1.3731970056600329, + "grad_norm": 0.9231326580047607, + "learning_rate": 1.3330624421636265e-05, + "loss": 0.5547, + "step": 3760 + }, + { + "epoch": 1.3735621690706592, + "grad_norm": 0.8588625192642212, + "learning_rate": 1.3317102067664104e-05, + "loss": 0.5501, + "step": 3761 + }, + { + "epoch": 1.3739273324812853, + "grad_norm": 0.9153193235397339, + "learning_rate": 1.3303583151774758e-05, + "loss": 0.5639, + "step": 3762 + }, + { + "epoch": 1.3742924958919116, + "grad_norm": 0.8094452023506165, + "learning_rate": 1.3290067680923169e-05, + "loss": 0.5513, + "step": 3763 + }, + { + "epoch": 1.374657659302538, + "grad_norm": 0.8183193802833557, + "learning_rate": 1.3276555662062503e-05, + "loss": 0.5526, + "step": 3764 + }, + { + "epoch": 1.3750228227131642, + "grad_norm": 0.8727033734321594, + "learning_rate": 1.3263047102144154e-05, + "loss": 0.5302, + "step": 3765 + }, + { + "epoch": 1.3753879861237903, + "grad_norm": 0.8611675500869751, + "learning_rate": 1.3249542008117737e-05, + "loss": 0.5409, + "step": 3766 + }, + { + "epoch": 1.3757531495344166, + "grad_norm": 1.183205485343933, + "learning_rate": 1.3236040386931075e-05, + "loss": 0.5649, + "step": 3767 + }, + { + "epoch": 1.376118312945043, + "grad_norm": 0.7820980548858643, + "learning_rate": 1.3222542245530217e-05, + "loss": 0.5738, + "step": 3768 + }, + { + "epoch": 1.376483476355669, + "grad_norm": 1.0162458419799805, + "learning_rate": 1.3209047590859407e-05, + "loss": 0.5634, + "step": 3769 + }, + { + "epoch": 1.3768486397662953, + "grad_norm": 0.9946308732032776, + "learning_rate": 1.3195556429861112e-05, + "loss": 0.5799, + "step": 3770 + }, + { + "epoch": 1.3772138031769217, + "grad_norm": 0.9185833930969238, + "learning_rate": 1.3182068769475984e-05, + "loss": 0.5247, + "step": 3771 + }, + { + "epoch": 1.377578966587548, + "grad_norm": 0.7531657814979553, + "learning_rate": 1.316858461664289e-05, + "loss": 0.5931, + "step": 3772 + }, + { + "epoch": 1.3779441299981743, + "grad_norm": 0.9958174228668213, + "learning_rate": 1.3155103978298882e-05, + "loss": 0.5481, + "step": 3773 + }, + { + "epoch": 1.3783092934088004, + "grad_norm": 0.9429916739463806, + "learning_rate": 1.3141626861379215e-05, + "loss": 0.557, + "step": 3774 + }, + { + "epoch": 1.3786744568194267, + "grad_norm": 0.6601399779319763, + "learning_rate": 1.3128153272817312e-05, + "loss": 0.6037, + "step": 3775 + }, + { + "epoch": 1.379039620230053, + "grad_norm": 0.8233461380004883, + "learning_rate": 1.31146832195448e-05, + "loss": 0.5806, + "step": 3776 + }, + { + "epoch": 1.379404783640679, + "grad_norm": 0.8602706789970398, + "learning_rate": 1.3101216708491482e-05, + "loss": 0.5287, + "step": 3777 + }, + { + "epoch": 1.3797699470513054, + "grad_norm": 0.8407243490219116, + "learning_rate": 1.308775374658534e-05, + "loss": 0.5904, + "step": 3778 + }, + { + "epoch": 1.3801351104619317, + "grad_norm": 0.896361231803894, + "learning_rate": 1.3074294340752518e-05, + "loss": 0.5925, + "step": 3779 + }, + { + "epoch": 1.380500273872558, + "grad_norm": 0.9036267995834351, + "learning_rate": 1.306083849791735e-05, + "loss": 0.5231, + "step": 3780 + }, + { + "epoch": 1.3808654372831843, + "grad_norm": 0.8295239210128784, + "learning_rate": 1.304738622500232e-05, + "loss": 0.5306, + "step": 3781 + }, + { + "epoch": 1.3812306006938104, + "grad_norm": 1.0325911045074463, + "learning_rate": 1.3033937528928093e-05, + "loss": 0.5435, + "step": 3782 + }, + { + "epoch": 1.3815957641044367, + "grad_norm": 0.9571465849876404, + "learning_rate": 1.3020492416613468e-05, + "loss": 0.5611, + "step": 3783 + }, + { + "epoch": 1.381960927515063, + "grad_norm": 0.8533524870872498, + "learning_rate": 1.3007050894975433e-05, + "loss": 0.5857, + "step": 3784 + }, + { + "epoch": 1.3823260909256891, + "grad_norm": 0.7776716351509094, + "learning_rate": 1.2993612970929097e-05, + "loss": 0.5905, + "step": 3785 + }, + { + "epoch": 1.3826912543363155, + "grad_norm": 0.806546688079834, + "learning_rate": 1.2980178651387738e-05, + "loss": 0.5858, + "step": 3786 + }, + { + "epoch": 1.3830564177469418, + "grad_norm": 0.9101070165634155, + "learning_rate": 1.2966747943262786e-05, + "loss": 0.567, + "step": 3787 + }, + { + "epoch": 1.383421581157568, + "grad_norm": 1.0257490873336792, + "learning_rate": 1.2953320853463782e-05, + "loss": 0.5518, + "step": 3788 + }, + { + "epoch": 1.3837867445681944, + "grad_norm": 0.9479292035102844, + "learning_rate": 1.293989738889843e-05, + "loss": 0.5491, + "step": 3789 + }, + { + "epoch": 1.3841519079788205, + "grad_norm": 0.9954557418823242, + "learning_rate": 1.2926477556472573e-05, + "loss": 0.5419, + "step": 3790 + }, + { + "epoch": 1.3845170713894468, + "grad_norm": 0.9924768209457397, + "learning_rate": 1.2913061363090166e-05, + "loss": 0.5404, + "step": 3791 + }, + { + "epoch": 1.384882234800073, + "grad_norm": 0.8244521021842957, + "learning_rate": 1.289964881565331e-05, + "loss": 0.5723, + "step": 3792 + }, + { + "epoch": 1.3852473982106992, + "grad_norm": 0.9758168458938599, + "learning_rate": 1.2886239921062216e-05, + "loss": 0.5497, + "step": 3793 + }, + { + "epoch": 1.3856125616213255, + "grad_norm": 0.7866721749305725, + "learning_rate": 1.2872834686215227e-05, + "loss": 0.5958, + "step": 3794 + }, + { + "epoch": 1.3859777250319518, + "grad_norm": 0.6681350469589233, + "learning_rate": 1.2859433118008796e-05, + "loss": 0.5775, + "step": 3795 + }, + { + "epoch": 1.3863428884425781, + "grad_norm": 0.7156676054000854, + "learning_rate": 1.284603522333749e-05, + "loss": 0.6006, + "step": 3796 + }, + { + "epoch": 1.3867080518532042, + "grad_norm": 0.9035090208053589, + "learning_rate": 1.2832641009093995e-05, + "loss": 0.5595, + "step": 3797 + }, + { + "epoch": 1.3870732152638305, + "grad_norm": 0.8540573120117188, + "learning_rate": 1.281925048216909e-05, + "loss": 0.5612, + "step": 3798 + }, + { + "epoch": 1.3874383786744569, + "grad_norm": 0.8145067095756531, + "learning_rate": 1.2805863649451671e-05, + "loss": 0.5763, + "step": 3799 + }, + { + "epoch": 1.387803542085083, + "grad_norm": 0.8422971367835999, + "learning_rate": 1.2792480517828714e-05, + "loss": 0.5636, + "step": 3800 + }, + { + "epoch": 1.3881687054957093, + "grad_norm": 1.2129441499710083, + "learning_rate": 1.2779101094185322e-05, + "loss": 0.5776, + "step": 3801 + }, + { + "epoch": 1.3885338689063356, + "grad_norm": 0.9555438756942749, + "learning_rate": 1.2765725385404655e-05, + "loss": 0.5233, + "step": 3802 + }, + { + "epoch": 1.3888990323169619, + "grad_norm": 1.0055880546569824, + "learning_rate": 1.2752353398367982e-05, + "loss": 0.5875, + "step": 3803 + }, + { + "epoch": 1.3892641957275882, + "grad_norm": 0.8483778238296509, + "learning_rate": 1.2738985139954658e-05, + "loss": 0.6154, + "step": 3804 + }, + { + "epoch": 1.3896293591382143, + "grad_norm": 0.9318357110023499, + "learning_rate": 1.272562061704211e-05, + "loss": 0.5445, + "step": 3805 + }, + { + "epoch": 1.3899945225488406, + "grad_norm": 0.9205259084701538, + "learning_rate": 1.2712259836505854e-05, + "loss": 0.4951, + "step": 3806 + }, + { + "epoch": 1.390359685959467, + "grad_norm": 0.8118085265159607, + "learning_rate": 1.269890280521947e-05, + "loss": 0.5425, + "step": 3807 + }, + { + "epoch": 1.390724849370093, + "grad_norm": 0.737595796585083, + "learning_rate": 1.2685549530054617e-05, + "loss": 0.5488, + "step": 3808 + }, + { + "epoch": 1.3910900127807193, + "grad_norm": 1.112452507019043, + "learning_rate": 1.2672200017881027e-05, + "loss": 0.5408, + "step": 3809 + }, + { + "epoch": 1.3914551761913456, + "grad_norm": 0.7649756669998169, + "learning_rate": 1.2658854275566475e-05, + "loss": 0.585, + "step": 3810 + }, + { + "epoch": 1.391820339601972, + "grad_norm": 0.9654197692871094, + "learning_rate": 1.264551230997682e-05, + "loss": 0.5782, + "step": 3811 + }, + { + "epoch": 1.3921855030125982, + "grad_norm": 0.8528624176979065, + "learning_rate": 1.2632174127975963e-05, + "loss": 0.5843, + "step": 3812 + }, + { + "epoch": 1.3925506664232243, + "grad_norm": 0.8950546979904175, + "learning_rate": 1.2618839736425867e-05, + "loss": 0.5574, + "step": 3813 + }, + { + "epoch": 1.3929158298338506, + "grad_norm": 0.9986966252326965, + "learning_rate": 1.2605509142186543e-05, + "loss": 0.5403, + "step": 3814 + }, + { + "epoch": 1.393280993244477, + "grad_norm": 0.8956006765365601, + "learning_rate": 1.2592182352116037e-05, + "loss": 0.598, + "step": 3815 + }, + { + "epoch": 1.393646156655103, + "grad_norm": 1.0453033447265625, + "learning_rate": 1.2578859373070453e-05, + "loss": 0.5408, + "step": 3816 + }, + { + "epoch": 1.3940113200657294, + "grad_norm": 0.9151045680046082, + "learning_rate": 1.2565540211903931e-05, + "loss": 0.578, + "step": 3817 + }, + { + "epoch": 1.3943764834763557, + "grad_norm": 0.9833351969718933, + "learning_rate": 1.2552224875468642e-05, + "loss": 0.5793, + "step": 3818 + }, + { + "epoch": 1.394741646886982, + "grad_norm": 0.8560035228729248, + "learning_rate": 1.2538913370614795e-05, + "loss": 0.558, + "step": 3819 + }, + { + "epoch": 1.3951068102976083, + "grad_norm": 1.030977725982666, + "learning_rate": 1.2525605704190622e-05, + "loss": 0.5409, + "step": 3820 + }, + { + "epoch": 1.3954719737082344, + "grad_norm": 0.9003497958183289, + "learning_rate": 1.251230188304239e-05, + "loss": 0.5715, + "step": 3821 + }, + { + "epoch": 1.3958371371188607, + "grad_norm": 1.1778000593185425, + "learning_rate": 1.2499001914014373e-05, + "loss": 0.5281, + "step": 3822 + }, + { + "epoch": 1.396202300529487, + "grad_norm": 0.8389256000518799, + "learning_rate": 1.2485705803948877e-05, + "loss": 0.5568, + "step": 3823 + }, + { + "epoch": 1.396567463940113, + "grad_norm": 0.9334737658500671, + "learning_rate": 1.2472413559686212e-05, + "loss": 0.5723, + "step": 3824 + }, + { + "epoch": 1.3969326273507394, + "grad_norm": 0.7888615131378174, + "learning_rate": 1.2459125188064713e-05, + "loss": 0.5525, + "step": 3825 + }, + { + "epoch": 1.3972977907613657, + "grad_norm": 0.8109673857688904, + "learning_rate": 1.24458406959207e-05, + "loss": 0.5836, + "step": 3826 + }, + { + "epoch": 1.397662954171992, + "grad_norm": 0.8338742256164551, + "learning_rate": 1.2432560090088533e-05, + "loss": 0.5601, + "step": 3827 + }, + { + "epoch": 1.3980281175826184, + "grad_norm": 0.8548614382743835, + "learning_rate": 1.241928337740053e-05, + "loss": 0.5526, + "step": 3828 + }, + { + "epoch": 1.3983932809932444, + "grad_norm": 1.429435133934021, + "learning_rate": 1.240601056468703e-05, + "loss": 0.563, + "step": 3829 + }, + { + "epoch": 1.3987584444038708, + "grad_norm": 1.2365398406982422, + "learning_rate": 1.2392741658776368e-05, + "loss": 0.5715, + "step": 3830 + }, + { + "epoch": 1.3991236078144969, + "grad_norm": 0.976597249507904, + "learning_rate": 1.2379476666494866e-05, + "loss": 0.5486, + "step": 3831 + }, + { + "epoch": 1.3994887712251232, + "grad_norm": 0.9714939594268799, + "learning_rate": 1.2366215594666822e-05, + "loss": 0.5401, + "step": 3832 + }, + { + "epoch": 1.3998539346357495, + "grad_norm": 0.7221394777297974, + "learning_rate": 1.2352958450114532e-05, + "loss": 0.5636, + "step": 3833 + }, + { + "epoch": 1.4002190980463758, + "grad_norm": 0.7672884464263916, + "learning_rate": 1.2339705239658262e-05, + "loss": 0.5564, + "step": 3834 + }, + { + "epoch": 1.400584261457002, + "grad_norm": 0.6529276967048645, + "learning_rate": 1.232645597011626e-05, + "loss": 0.5907, + "step": 3835 + }, + { + "epoch": 1.4009494248676282, + "grad_norm": 0.9142425656318665, + "learning_rate": 1.2313210648304739e-05, + "loss": 0.5649, + "step": 3836 + }, + { + "epoch": 1.4013145882782545, + "grad_norm": 0.6880323886871338, + "learning_rate": 1.229996928103789e-05, + "loss": 0.5952, + "step": 3837 + }, + { + "epoch": 1.4016797516888808, + "grad_norm": 0.587834894657135, + "learning_rate": 1.2286731875127858e-05, + "loss": 0.5665, + "step": 3838 + }, + { + "epoch": 1.402044915099507, + "grad_norm": 1.0331255197525024, + "learning_rate": 1.2273498437384763e-05, + "loss": 0.5191, + "step": 3839 + }, + { + "epoch": 1.4024100785101332, + "grad_norm": 0.9560843110084534, + "learning_rate": 1.2260268974616683e-05, + "loss": 0.5837, + "step": 3840 + }, + { + "epoch": 1.4027752419207595, + "grad_norm": 0.7214618921279907, + "learning_rate": 1.224704349362964e-05, + "loss": 0.6163, + "step": 3841 + }, + { + "epoch": 1.4031404053313858, + "grad_norm": 0.9066854119300842, + "learning_rate": 1.2233822001227606e-05, + "loss": 0.5465, + "step": 3842 + }, + { + "epoch": 1.4035055687420122, + "grad_norm": 0.9757594466209412, + "learning_rate": 1.2220604504212519e-05, + "loss": 0.5285, + "step": 3843 + }, + { + "epoch": 1.4038707321526382, + "grad_norm": 0.8267287015914917, + "learning_rate": 1.2207391009384244e-05, + "loss": 0.5702, + "step": 3844 + }, + { + "epoch": 1.4042358955632646, + "grad_norm": 0.6605767011642456, + "learning_rate": 1.2194181523540601e-05, + "loss": 0.5706, + "step": 3845 + }, + { + "epoch": 1.4046010589738909, + "grad_norm": 0.9179114103317261, + "learning_rate": 1.2180976053477332e-05, + "loss": 0.5471, + "step": 3846 + }, + { + "epoch": 1.404966222384517, + "grad_norm": 0.9867071509361267, + "learning_rate": 1.2167774605988126e-05, + "loss": 0.525, + "step": 3847 + }, + { + "epoch": 1.4053313857951433, + "grad_norm": 0.8408907651901245, + "learning_rate": 1.2154577187864595e-05, + "loss": 0.5522, + "step": 3848 + }, + { + "epoch": 1.4056965492057696, + "grad_norm": 1.0048277378082275, + "learning_rate": 1.214138380589629e-05, + "loss": 0.555, + "step": 3849 + }, + { + "epoch": 1.406061712616396, + "grad_norm": 1.020862102508545, + "learning_rate": 1.2128194466870666e-05, + "loss": 0.5525, + "step": 3850 + }, + { + "epoch": 1.4064268760270222, + "grad_norm": 0.812157928943634, + "learning_rate": 1.2115009177573112e-05, + "loss": 0.5604, + "step": 3851 + }, + { + "epoch": 1.4067920394376483, + "grad_norm": 0.7309326529502869, + "learning_rate": 1.2101827944786936e-05, + "loss": 0.5843, + "step": 3852 + }, + { + "epoch": 1.4071572028482746, + "grad_norm": 0.7321484088897705, + "learning_rate": 1.2088650775293344e-05, + "loss": 0.5268, + "step": 3853 + }, + { + "epoch": 1.407522366258901, + "grad_norm": 1.0029045343399048, + "learning_rate": 1.2075477675871472e-05, + "loss": 0.5575, + "step": 3854 + }, + { + "epoch": 1.407887529669527, + "grad_norm": 0.7887302041053772, + "learning_rate": 1.2062308653298343e-05, + "loss": 0.5568, + "step": 3855 + }, + { + "epoch": 1.4082526930801533, + "grad_norm": 0.85622638463974, + "learning_rate": 1.2049143714348886e-05, + "loss": 0.5516, + "step": 3856 + }, + { + "epoch": 1.4086178564907796, + "grad_norm": 0.9460580348968506, + "learning_rate": 1.2035982865795944e-05, + "loss": 0.5249, + "step": 3857 + }, + { + "epoch": 1.408983019901406, + "grad_norm": 0.7221261858940125, + "learning_rate": 1.202282611441024e-05, + "loss": 0.5618, + "step": 3858 + }, + { + "epoch": 1.4093481833120323, + "grad_norm": 0.6986485719680786, + "learning_rate": 1.20096734669604e-05, + "loss": 0.5591, + "step": 3859 + }, + { + "epoch": 1.4097133467226584, + "grad_norm": 0.9845605492591858, + "learning_rate": 1.1996524930212921e-05, + "loss": 0.5518, + "step": 3860 + }, + { + "epoch": 1.4100785101332847, + "grad_norm": 1.0616382360458374, + "learning_rate": 1.198338051093221e-05, + "loss": 0.5635, + "step": 3861 + }, + { + "epoch": 1.410443673543911, + "grad_norm": 0.7955450415611267, + "learning_rate": 1.197024021588054e-05, + "loss": 0.5477, + "step": 3862 + }, + { + "epoch": 1.410808836954537, + "grad_norm": 1.042151689529419, + "learning_rate": 1.1957104051818063e-05, + "loss": 0.5869, + "step": 3863 + }, + { + "epoch": 1.4111740003651634, + "grad_norm": 0.8910982608795166, + "learning_rate": 1.1943972025502815e-05, + "loss": 0.5279, + "step": 3864 + }, + { + "epoch": 1.4115391637757897, + "grad_norm": 0.716566264629364, + "learning_rate": 1.1930844143690686e-05, + "loss": 0.5704, + "step": 3865 + }, + { + "epoch": 1.411904327186416, + "grad_norm": 1.04740571975708, + "learning_rate": 1.1917720413135454e-05, + "loss": 0.5475, + "step": 3866 + }, + { + "epoch": 1.412269490597042, + "grad_norm": 0.5876196622848511, + "learning_rate": 1.1904600840588752e-05, + "loss": 0.5816, + "step": 3867 + }, + { + "epoch": 1.4126346540076684, + "grad_norm": 0.8768644332885742, + "learning_rate": 1.189148543280006e-05, + "loss": 0.5486, + "step": 3868 + }, + { + "epoch": 1.4129998174182947, + "grad_norm": 0.6389560103416443, + "learning_rate": 1.1878374196516745e-05, + "loss": 0.5966, + "step": 3869 + }, + { + "epoch": 1.4133649808289208, + "grad_norm": 0.7460922598838806, + "learning_rate": 1.1865267138484e-05, + "loss": 0.5564, + "step": 3870 + }, + { + "epoch": 1.4137301442395471, + "grad_norm": 0.7749945521354675, + "learning_rate": 1.185216426544489e-05, + "loss": 0.5663, + "step": 3871 + }, + { + "epoch": 1.4140953076501734, + "grad_norm": 0.9464659094810486, + "learning_rate": 1.1839065584140308e-05, + "loss": 0.5497, + "step": 3872 + }, + { + "epoch": 1.4144604710607998, + "grad_norm": 0.9089056253433228, + "learning_rate": 1.1825971101309007e-05, + "loss": 0.5338, + "step": 3873 + }, + { + "epoch": 1.414825634471426, + "grad_norm": 0.7740195989608765, + "learning_rate": 1.1812880823687574e-05, + "loss": 0.5854, + "step": 3874 + }, + { + "epoch": 1.4151907978820522, + "grad_norm": 1.0285331010818481, + "learning_rate": 1.1799794758010425e-05, + "loss": 0.5397, + "step": 3875 + }, + { + "epoch": 1.4155559612926785, + "grad_norm": 0.7956613898277283, + "learning_rate": 1.1786712911009821e-05, + "loss": 0.5657, + "step": 3876 + }, + { + "epoch": 1.4159211247033048, + "grad_norm": 1.1669707298278809, + "learning_rate": 1.1773635289415846e-05, + "loss": 0.5629, + "step": 3877 + }, + { + "epoch": 1.4162862881139309, + "grad_norm": 1.1565542221069336, + "learning_rate": 1.1760561899956412e-05, + "loss": 0.6029, + "step": 3878 + }, + { + "epoch": 1.4166514515245572, + "grad_norm": 0.7887046933174133, + "learning_rate": 1.1747492749357248e-05, + "loss": 0.5464, + "step": 3879 + }, + { + "epoch": 1.4170166149351835, + "grad_norm": 0.8616452813148499, + "learning_rate": 1.1734427844341916e-05, + "loss": 0.5793, + "step": 3880 + }, + { + "epoch": 1.4173817783458098, + "grad_norm": 0.9074066877365112, + "learning_rate": 1.1721367191631788e-05, + "loss": 0.549, + "step": 3881 + }, + { + "epoch": 1.4177469417564361, + "grad_norm": 0.9903778433799744, + "learning_rate": 1.1708310797946028e-05, + "loss": 0.5272, + "step": 3882 + }, + { + "epoch": 1.4181121051670622, + "grad_norm": 0.9381337761878967, + "learning_rate": 1.169525867000164e-05, + "loss": 0.561, + "step": 3883 + }, + { + "epoch": 1.4184772685776885, + "grad_norm": 0.9176742434501648, + "learning_rate": 1.1682210814513422e-05, + "loss": 0.5579, + "step": 3884 + }, + { + "epoch": 1.4188424319883148, + "grad_norm": 2.271436929702759, + "learning_rate": 1.1669167238193965e-05, + "loss": 0.5552, + "step": 3885 + }, + { + "epoch": 1.419207595398941, + "grad_norm": 0.9941447973251343, + "learning_rate": 1.1656127947753668e-05, + "loss": 0.5275, + "step": 3886 + }, + { + "epoch": 1.4195727588095672, + "grad_norm": 1.0468906164169312, + "learning_rate": 1.1643092949900721e-05, + "loss": 0.5659, + "step": 3887 + }, + { + "epoch": 1.4199379222201935, + "grad_norm": 0.768421471118927, + "learning_rate": 1.1630062251341108e-05, + "loss": 0.5684, + "step": 3888 + }, + { + "epoch": 1.4203030856308199, + "grad_norm": 0.7507640719413757, + "learning_rate": 1.161703585877861e-05, + "loss": 0.5627, + "step": 3889 + }, + { + "epoch": 1.4206682490414462, + "grad_norm": 0.7245061993598938, + "learning_rate": 1.1604013778914771e-05, + "loss": 0.5872, + "step": 3890 + }, + { + "epoch": 1.4210334124520723, + "grad_norm": 1.1061756610870361, + "learning_rate": 1.159099601844893e-05, + "loss": 0.5411, + "step": 3891 + }, + { + "epoch": 1.4213985758626986, + "grad_norm": 0.8531544804573059, + "learning_rate": 1.1577982584078207e-05, + "loss": 0.5502, + "step": 3892 + }, + { + "epoch": 1.421763739273325, + "grad_norm": 0.7668573260307312, + "learning_rate": 1.156497348249749e-05, + "loss": 0.569, + "step": 3893 + }, + { + "epoch": 1.422128902683951, + "grad_norm": 0.6851369142532349, + "learning_rate": 1.1551968720399444e-05, + "loss": 0.5666, + "step": 3894 + }, + { + "epoch": 1.4224940660945773, + "grad_norm": 0.9241596460342407, + "learning_rate": 1.1538968304474499e-05, + "loss": 0.5724, + "step": 3895 + }, + { + "epoch": 1.4228592295052036, + "grad_norm": 0.6880790591239929, + "learning_rate": 1.1525972241410827e-05, + "loss": 0.5906, + "step": 3896 + }, + { + "epoch": 1.42322439291583, + "grad_norm": 0.8247973918914795, + "learning_rate": 1.15129805378944e-05, + "loss": 0.5439, + "step": 3897 + }, + { + "epoch": 1.423589556326456, + "grad_norm": 0.9598363041877747, + "learning_rate": 1.1499993200608921e-05, + "loss": 0.5766, + "step": 3898 + }, + { + "epoch": 1.4239547197370823, + "grad_norm": 1.1017124652862549, + "learning_rate": 1.1487010236235865e-05, + "loss": 0.5811, + "step": 3899 + }, + { + "epoch": 1.4243198831477086, + "grad_norm": 0.5694159269332886, + "learning_rate": 1.147403165145443e-05, + "loss": 0.6052, + "step": 3900 + }, + { + "epoch": 1.4246850465583347, + "grad_norm": 1.05720055103302, + "learning_rate": 1.1461057452941584e-05, + "loss": 0.5368, + "step": 3901 + }, + { + "epoch": 1.425050209968961, + "grad_norm": 1.0480480194091797, + "learning_rate": 1.1448087647372032e-05, + "loss": 0.514, + "step": 3902 + }, + { + "epoch": 1.4254153733795873, + "grad_norm": 1.006308913230896, + "learning_rate": 1.1435122241418224e-05, + "loss": 0.5287, + "step": 3903 + }, + { + "epoch": 1.4257805367902137, + "grad_norm": 0.7137683033943176, + "learning_rate": 1.142216124175033e-05, + "loss": 0.588, + "step": 3904 + }, + { + "epoch": 1.42614570020084, + "grad_norm": 0.9662609696388245, + "learning_rate": 1.1409204655036272e-05, + "loss": 0.5659, + "step": 3905 + }, + { + "epoch": 1.426510863611466, + "grad_norm": 0.7355804443359375, + "learning_rate": 1.139625248794169e-05, + "loss": 0.5449, + "step": 3906 + }, + { + "epoch": 1.4268760270220924, + "grad_norm": 0.9621326327323914, + "learning_rate": 1.1383304747129964e-05, + "loss": 0.5663, + "step": 3907 + }, + { + "epoch": 1.4272411904327187, + "grad_norm": 0.9017486572265625, + "learning_rate": 1.137036143926217e-05, + "loss": 0.5773, + "step": 3908 + }, + { + "epoch": 1.4276063538433448, + "grad_norm": 0.7580010890960693, + "learning_rate": 1.1357422570997138e-05, + "loss": 0.5631, + "step": 3909 + }, + { + "epoch": 1.427971517253971, + "grad_norm": 0.9669618010520935, + "learning_rate": 1.134448814899138e-05, + "loss": 0.5405, + "step": 3910 + }, + { + "epoch": 1.4283366806645974, + "grad_norm": 0.8277207016944885, + "learning_rate": 1.1331558179899148e-05, + "loss": 0.5593, + "step": 3911 + }, + { + "epoch": 1.4287018440752237, + "grad_norm": 0.7959915399551392, + "learning_rate": 1.1318632670372388e-05, + "loss": 0.5966, + "step": 3912 + }, + { + "epoch": 1.42906700748585, + "grad_norm": 0.9532966017723083, + "learning_rate": 1.1305711627060765e-05, + "loss": 0.5316, + "step": 3913 + }, + { + "epoch": 1.4294321708964761, + "grad_norm": 0.7921164631843567, + "learning_rate": 1.1292795056611621e-05, + "loss": 0.5753, + "step": 3914 + }, + { + "epoch": 1.4297973343071024, + "grad_norm": 0.8244426250457764, + "learning_rate": 1.1279882965670024e-05, + "loss": 0.5699, + "step": 3915 + }, + { + "epoch": 1.4301624977177287, + "grad_norm": 0.9989232420921326, + "learning_rate": 1.1266975360878723e-05, + "loss": 0.5644, + "step": 3916 + }, + { + "epoch": 1.4305276611283548, + "grad_norm": 0.886845588684082, + "learning_rate": 1.1254072248878164e-05, + "loss": 0.5635, + "step": 3917 + }, + { + "epoch": 1.4308928245389811, + "grad_norm": 0.6461564302444458, + "learning_rate": 1.1241173636306488e-05, + "loss": 0.5755, + "step": 3918 + }, + { + "epoch": 1.4312579879496075, + "grad_norm": 0.7689158320426941, + "learning_rate": 1.1228279529799501e-05, + "loss": 0.5815, + "step": 3919 + }, + { + "epoch": 1.4316231513602338, + "grad_norm": 0.874688446521759, + "learning_rate": 1.1215389935990708e-05, + "loss": 0.5505, + "step": 3920 + }, + { + "epoch": 1.43198831477086, + "grad_norm": 1.1357669830322266, + "learning_rate": 1.1202504861511296e-05, + "loss": 0.4918, + "step": 3921 + }, + { + "epoch": 1.4323534781814862, + "grad_norm": 0.6703139543533325, + "learning_rate": 1.1189624312990103e-05, + "loss": 0.585, + "step": 3922 + }, + { + "epoch": 1.4327186415921125, + "grad_norm": 1.2244127988815308, + "learning_rate": 1.1176748297053672e-05, + "loss": 0.5466, + "step": 3923 + }, + { + "epoch": 1.4330838050027388, + "grad_norm": 0.8089661002159119, + "learning_rate": 1.1163876820326179e-05, + "loss": 0.5803, + "step": 3924 + }, + { + "epoch": 1.4334489684133649, + "grad_norm": 0.7841872572898865, + "learning_rate": 1.1151009889429489e-05, + "loss": 0.5667, + "step": 3925 + }, + { + "epoch": 1.4338141318239912, + "grad_norm": 0.8316547870635986, + "learning_rate": 1.1138147510983121e-05, + "loss": 0.5403, + "step": 3926 + }, + { + "epoch": 1.4341792952346175, + "grad_norm": 0.8686932921409607, + "learning_rate": 1.112528969160426e-05, + "loss": 0.581, + "step": 3927 + }, + { + "epoch": 1.4345444586452438, + "grad_norm": 0.9217099547386169, + "learning_rate": 1.1112436437907737e-05, + "loss": 0.5358, + "step": 3928 + }, + { + "epoch": 1.4349096220558701, + "grad_norm": 0.8455994725227356, + "learning_rate": 1.1099587756506022e-05, + "loss": 0.5579, + "step": 3929 + }, + { + "epoch": 1.4352747854664962, + "grad_norm": 0.7332953214645386, + "learning_rate": 1.1086743654009257e-05, + "loss": 0.5773, + "step": 3930 + }, + { + "epoch": 1.4356399488771225, + "grad_norm": 0.8323222994804382, + "learning_rate": 1.1073904137025218e-05, + "loss": 0.5199, + "step": 3931 + }, + { + "epoch": 1.4360051122877486, + "grad_norm": 0.8535659313201904, + "learning_rate": 1.106106921215932e-05, + "loss": 0.5389, + "step": 3932 + }, + { + "epoch": 1.436370275698375, + "grad_norm": 0.6618138551712036, + "learning_rate": 1.1048238886014616e-05, + "loss": 0.5463, + "step": 3933 + }, + { + "epoch": 1.4367354391090013, + "grad_norm": 0.8994616866111755, + "learning_rate": 1.1035413165191792e-05, + "loss": 0.568, + "step": 3934 + }, + { + "epoch": 1.4371006025196276, + "grad_norm": 0.833524763584137, + "learning_rate": 1.1022592056289168e-05, + "loss": 0.5856, + "step": 3935 + }, + { + "epoch": 1.4374657659302539, + "grad_norm": 0.6980143785476685, + "learning_rate": 1.1009775565902686e-05, + "loss": 0.5832, + "step": 3936 + }, + { + "epoch": 1.43783092934088, + "grad_norm": 0.8310022354125977, + "learning_rate": 1.099696370062592e-05, + "loss": 0.5659, + "step": 3937 + }, + { + "epoch": 1.4381960927515063, + "grad_norm": 1.0758943557739258, + "learning_rate": 1.098415646705007e-05, + "loss": 0.5527, + "step": 3938 + }, + { + "epoch": 1.4385612561621326, + "grad_norm": 0.6438793540000916, + "learning_rate": 1.0971353871763925e-05, + "loss": 0.587, + "step": 3939 + }, + { + "epoch": 1.4389264195727587, + "grad_norm": 0.9396200180053711, + "learning_rate": 1.0958555921353918e-05, + "loss": 0.5582, + "step": 3940 + }, + { + "epoch": 1.439291582983385, + "grad_norm": 0.7176880836486816, + "learning_rate": 1.0945762622404078e-05, + "loss": 0.5728, + "step": 3941 + }, + { + "epoch": 1.4396567463940113, + "grad_norm": 0.6318895816802979, + "learning_rate": 1.0932973981496051e-05, + "loss": 0.592, + "step": 3942 + }, + { + "epoch": 1.4400219098046376, + "grad_norm": 0.7599021196365356, + "learning_rate": 1.0920190005209066e-05, + "loss": 0.5544, + "step": 3943 + }, + { + "epoch": 1.440387073215264, + "grad_norm": 1.7072486877441406, + "learning_rate": 1.0907410700119976e-05, + "loss": 0.5623, + "step": 3944 + }, + { + "epoch": 1.44075223662589, + "grad_norm": 0.8058611154556274, + "learning_rate": 1.0894636072803214e-05, + "loss": 0.5729, + "step": 3945 + }, + { + "epoch": 1.4411174000365163, + "grad_norm": 0.8422569632530212, + "learning_rate": 1.0881866129830829e-05, + "loss": 0.5716, + "step": 3946 + }, + { + "epoch": 1.4414825634471427, + "grad_norm": 0.8342012166976929, + "learning_rate": 1.086910087777242e-05, + "loss": 0.5366, + "step": 3947 + }, + { + "epoch": 1.4418477268577687, + "grad_norm": 0.7607089877128601, + "learning_rate": 1.085634032319522e-05, + "loss": 0.5294, + "step": 3948 + }, + { + "epoch": 1.442212890268395, + "grad_norm": 0.8281744718551636, + "learning_rate": 1.0843584472664004e-05, + "loss": 0.5604, + "step": 3949 + }, + { + "epoch": 1.4425780536790214, + "grad_norm": 0.8964477777481079, + "learning_rate": 1.0830833332741154e-05, + "loss": 0.5499, + "step": 3950 + }, + { + "epoch": 1.4429432170896477, + "grad_norm": 0.8936501145362854, + "learning_rate": 1.0818086909986613e-05, + "loss": 0.5402, + "step": 3951 + }, + { + "epoch": 1.443308380500274, + "grad_norm": 0.7300297617912292, + "learning_rate": 1.080534521095792e-05, + "loss": 0.5587, + "step": 3952 + }, + { + "epoch": 1.4436735439109, + "grad_norm": 0.8601420521736145, + "learning_rate": 1.0792608242210151e-05, + "loss": 0.568, + "step": 3953 + }, + { + "epoch": 1.4440387073215264, + "grad_norm": 0.6742613315582275, + "learning_rate": 1.0779876010295971e-05, + "loss": 0.5932, + "step": 3954 + }, + { + "epoch": 1.4444038707321527, + "grad_norm": 1.2348051071166992, + "learning_rate": 1.0767148521765604e-05, + "loss": 0.5711, + "step": 3955 + }, + { + "epoch": 1.4447690341427788, + "grad_norm": 0.7522004842758179, + "learning_rate": 1.0754425783166837e-05, + "loss": 0.5435, + "step": 3956 + }, + { + "epoch": 1.445134197553405, + "grad_norm": 0.8764027953147888, + "learning_rate": 1.0741707801044998e-05, + "loss": 0.5298, + "step": 3957 + }, + { + "epoch": 1.4454993609640314, + "grad_norm": 0.9089523553848267, + "learning_rate": 1.0728994581942982e-05, + "loss": 0.5717, + "step": 3958 + }, + { + "epoch": 1.4458645243746577, + "grad_norm": 0.7274268269538879, + "learning_rate": 1.0716286132401232e-05, + "loss": 0.5364, + "step": 3959 + }, + { + "epoch": 1.446229687785284, + "grad_norm": 0.6975461840629578, + "learning_rate": 1.0703582458957733e-05, + "loss": 0.5533, + "step": 3960 + }, + { + "epoch": 1.4465948511959101, + "grad_norm": 0.8617317080497742, + "learning_rate": 1.0690883568148025e-05, + "loss": 0.5683, + "step": 3961 + }, + { + "epoch": 1.4469600146065364, + "grad_norm": 1.0526492595672607, + "learning_rate": 1.0678189466505172e-05, + "loss": 0.5812, + "step": 3962 + }, + { + "epoch": 1.4473251780171628, + "grad_norm": 0.8932110667228699, + "learning_rate": 1.0665500160559765e-05, + "loss": 0.553, + "step": 3963 + }, + { + "epoch": 1.4476903414277889, + "grad_norm": 0.7149062156677246, + "learning_rate": 1.065281565683996e-05, + "loss": 0.5448, + "step": 3964 + }, + { + "epoch": 1.4480555048384152, + "grad_norm": 0.8641566634178162, + "learning_rate": 1.0640135961871417e-05, + "loss": 0.5274, + "step": 3965 + }, + { + "epoch": 1.4484206682490415, + "grad_norm": 1.0292166471481323, + "learning_rate": 1.0627461082177342e-05, + "loss": 0.5384, + "step": 3966 + }, + { + "epoch": 1.4487858316596678, + "grad_norm": 0.7775523066520691, + "learning_rate": 1.0614791024278437e-05, + "loss": 0.5234, + "step": 3967 + }, + { + "epoch": 1.4491509950702939, + "grad_norm": 0.7780976295471191, + "learning_rate": 1.0602125794692943e-05, + "loss": 0.5346, + "step": 3968 + }, + { + "epoch": 1.4495161584809202, + "grad_norm": 0.8889197707176208, + "learning_rate": 1.0589465399936616e-05, + "loss": 0.5238, + "step": 3969 + }, + { + "epoch": 1.4498813218915465, + "grad_norm": 0.848050594329834, + "learning_rate": 1.0576809846522721e-05, + "loss": 0.5987, + "step": 3970 + }, + { + "epoch": 1.4502464853021726, + "grad_norm": 0.8156909346580505, + "learning_rate": 1.0564159140962036e-05, + "loss": 0.5767, + "step": 3971 + }, + { + "epoch": 1.450611648712799, + "grad_norm": 1.0465294122695923, + "learning_rate": 1.0551513289762832e-05, + "loss": 0.5284, + "step": 3972 + }, + { + "epoch": 1.4509768121234252, + "grad_norm": 0.7149264216423035, + "learning_rate": 1.0538872299430892e-05, + "loss": 0.5391, + "step": 3973 + }, + { + "epoch": 1.4513419755340515, + "grad_norm": 0.5393648743629456, + "learning_rate": 1.052623617646951e-05, + "loss": 0.582, + "step": 3974 + }, + { + "epoch": 1.4517071389446778, + "grad_norm": 0.8497831225395203, + "learning_rate": 1.0513604927379455e-05, + "loss": 0.5229, + "step": 3975 + }, + { + "epoch": 1.452072302355304, + "grad_norm": 0.666197657585144, + "learning_rate": 1.0500978558659001e-05, + "loss": 0.5528, + "step": 3976 + }, + { + "epoch": 1.4524374657659302, + "grad_norm": 0.864449679851532, + "learning_rate": 1.0488357076803903e-05, + "loss": 0.538, + "step": 3977 + }, + { + "epoch": 1.4528026291765566, + "grad_norm": 0.6567715406417847, + "learning_rate": 1.047574048830741e-05, + "loss": 0.5606, + "step": 3978 + }, + { + "epoch": 1.4531677925871826, + "grad_norm": 0.6790978908538818, + "learning_rate": 1.046312879966025e-05, + "loss": 0.5479, + "step": 3979 + }, + { + "epoch": 1.453532955997809, + "grad_norm": 0.7013542652130127, + "learning_rate": 1.045052201735063e-05, + "loss": 0.5931, + "step": 3980 + }, + { + "epoch": 1.4538981194084353, + "grad_norm": 0.908169686794281, + "learning_rate": 1.0437920147864245e-05, + "loss": 0.5725, + "step": 3981 + }, + { + "epoch": 1.4542632828190616, + "grad_norm": 0.7810689210891724, + "learning_rate": 1.0425323197684233e-05, + "loss": 0.5818, + "step": 3982 + }, + { + "epoch": 1.454628446229688, + "grad_norm": 0.9803763031959534, + "learning_rate": 1.0412731173291229e-05, + "loss": 0.5341, + "step": 3983 + }, + { + "epoch": 1.454993609640314, + "grad_norm": 0.7520824074745178, + "learning_rate": 1.0400144081163321e-05, + "loss": 0.5522, + "step": 3984 + }, + { + "epoch": 1.4553587730509403, + "grad_norm": 1.0242899656295776, + "learning_rate": 1.0387561927776075e-05, + "loss": 0.5183, + "step": 3985 + }, + { + "epoch": 1.4557239364615666, + "grad_norm": 0.9487613439559937, + "learning_rate": 1.0374984719602486e-05, + "loss": 0.564, + "step": 3986 + }, + { + "epoch": 1.4560890998721927, + "grad_norm": 0.9238029718399048, + "learning_rate": 1.036241246311303e-05, + "loss": 0.5687, + "step": 3987 + }, + { + "epoch": 1.456454263282819, + "grad_norm": 0.9257331490516663, + "learning_rate": 1.0349845164775639e-05, + "loss": 0.5799, + "step": 3988 + }, + { + "epoch": 1.4568194266934453, + "grad_norm": 1.1217631101608276, + "learning_rate": 1.0337282831055664e-05, + "loss": 0.551, + "step": 3989 + }, + { + "epoch": 1.4571845901040716, + "grad_norm": 0.8550851345062256, + "learning_rate": 1.0324725468415942e-05, + "loss": 0.5538, + "step": 3990 + }, + { + "epoch": 1.457549753514698, + "grad_norm": 0.7631929516792297, + "learning_rate": 1.0312173083316712e-05, + "loss": 0.5674, + "step": 3991 + }, + { + "epoch": 1.457914916925324, + "grad_norm": 0.8212781548500061, + "learning_rate": 1.0299625682215684e-05, + "loss": 0.5521, + "step": 3992 + }, + { + "epoch": 1.4582800803359504, + "grad_norm": 0.8766182065010071, + "learning_rate": 1.028708327156799e-05, + "loss": 0.5665, + "step": 3993 + }, + { + "epoch": 1.4586452437465767, + "grad_norm": 1.0717082023620605, + "learning_rate": 1.0274545857826195e-05, + "loss": 0.5619, + "step": 3994 + }, + { + "epoch": 1.4590104071572028, + "grad_norm": 1.0999945402145386, + "learning_rate": 1.0262013447440311e-05, + "loss": 0.5492, + "step": 3995 + }, + { + "epoch": 1.459375570567829, + "grad_norm": 0.9581376314163208, + "learning_rate": 1.0249486046857735e-05, + "loss": 0.5228, + "step": 3996 + }, + { + "epoch": 1.4597407339784554, + "grad_norm": 0.8297455310821533, + "learning_rate": 1.0236963662523328e-05, + "loss": 0.5572, + "step": 3997 + }, + { + "epoch": 1.4601058973890817, + "grad_norm": 0.9138409495353699, + "learning_rate": 1.0224446300879344e-05, + "loss": 0.5201, + "step": 3998 + }, + { + "epoch": 1.4604710607997078, + "grad_norm": 0.7559996247291565, + "learning_rate": 1.0211933968365484e-05, + "loss": 0.5313, + "step": 3999 + }, + { + "epoch": 1.460836224210334, + "grad_norm": 0.8069090843200684, + "learning_rate": 1.0199426671418818e-05, + "loss": 0.5323, + "step": 4000 + }, + { + "epoch": 1.4612013876209604, + "grad_norm": 0.8363916277885437, + "learning_rate": 1.0186924416473862e-05, + "loss": 0.5317, + "step": 4001 + }, + { + "epoch": 1.4615665510315865, + "grad_norm": 0.877045750617981, + "learning_rate": 1.0174427209962513e-05, + "loss": 0.5178, + "step": 4002 + }, + { + "epoch": 1.4619317144422128, + "grad_norm": 0.961313784122467, + "learning_rate": 1.0161935058314087e-05, + "loss": 0.5294, + "step": 4003 + }, + { + "epoch": 1.4622968778528391, + "grad_norm": 1.438890814781189, + "learning_rate": 1.01494479679553e-05, + "loss": 0.602, + "step": 4004 + }, + { + "epoch": 1.4626620412634654, + "grad_norm": 1.0085265636444092, + "learning_rate": 1.0136965945310262e-05, + "loss": 0.5579, + "step": 4005 + }, + { + "epoch": 1.4630272046740918, + "grad_norm": 0.838114321231842, + "learning_rate": 1.0124488996800456e-05, + "loss": 0.5563, + "step": 4006 + }, + { + "epoch": 1.4633923680847178, + "grad_norm": 0.6769111156463623, + "learning_rate": 1.0112017128844784e-05, + "loss": 0.5522, + "step": 4007 + }, + { + "epoch": 1.4637575314953442, + "grad_norm": 0.7011337280273438, + "learning_rate": 1.0099550347859522e-05, + "loss": 0.572, + "step": 4008 + }, + { + "epoch": 1.4641226949059705, + "grad_norm": 0.8495879173278809, + "learning_rate": 1.008708866025833e-05, + "loss": 0.5612, + "step": 4009 + }, + { + "epoch": 1.4644878583165966, + "grad_norm": 0.8406143188476562, + "learning_rate": 1.0074632072452233e-05, + "loss": 0.5535, + "step": 4010 + }, + { + "epoch": 1.4648530217272229, + "grad_norm": 0.9218531847000122, + "learning_rate": 1.0062180590849655e-05, + "loss": 0.5561, + "step": 4011 + }, + { + "epoch": 1.4652181851378492, + "grad_norm": 0.8530716896057129, + "learning_rate": 1.0049734221856387e-05, + "loss": 0.553, + "step": 4012 + }, + { + "epoch": 1.4655833485484755, + "grad_norm": 0.6426557302474976, + "learning_rate": 1.003729297187558e-05, + "loss": 0.5647, + "step": 4013 + }, + { + "epoch": 1.4659485119591018, + "grad_norm": 2.151454448699951, + "learning_rate": 1.0024856847307766e-05, + "loss": 0.5303, + "step": 4014 + }, + { + "epoch": 1.466313675369728, + "grad_norm": 1.0635218620300293, + "learning_rate": 1.001242585455083e-05, + "loss": 0.4997, + "step": 4015 + }, + { + "epoch": 1.4666788387803542, + "grad_norm": 0.7495030760765076, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.5418, + "step": 4016 + }, + { + "epoch": 1.4670440021909805, + "grad_norm": 0.7568851113319397, + "learning_rate": 9.987579290047906e-06, + "loss": 0.597, + "step": 4017 + }, + { + "epoch": 1.4674091656016066, + "grad_norm": 0.9760790467262268, + "learning_rate": 9.97516373108449e-06, + "loss": 0.5502, + "step": 4018 + }, + { + "epoch": 1.467774329012233, + "grad_norm": 0.9156248569488525, + "learning_rate": 9.962753329497069e-06, + "loss": 0.5353, + "step": 4019 + }, + { + "epoch": 1.4681394924228592, + "grad_norm": 0.5740343332290649, + "learning_rate": 9.950348091670281e-06, + "loss": 0.5579, + "step": 4020 + }, + { + "epoch": 1.4685046558334856, + "grad_norm": 0.9513039588928223, + "learning_rate": 9.937948023986135e-06, + "loss": 0.5347, + "step": 4021 + }, + { + "epoch": 1.4688698192441119, + "grad_norm": 0.9432803988456726, + "learning_rate": 9.925553132823967e-06, + "loss": 0.5591, + "step": 4022 + }, + { + "epoch": 1.469234982654738, + "grad_norm": 0.9032891392707825, + "learning_rate": 9.913163424560446e-06, + "loss": 0.5587, + "step": 4023 + }, + { + "epoch": 1.4696001460653643, + "grad_norm": 0.9759991765022278, + "learning_rate": 9.900778905569592e-06, + "loss": 0.5101, + "step": 4024 + }, + { + "epoch": 1.4699653094759906, + "grad_norm": 0.5861659646034241, + "learning_rate": 9.88839958222273e-06, + "loss": 0.575, + "step": 4025 + }, + { + "epoch": 1.4703304728866167, + "grad_norm": 1.0142534971237183, + "learning_rate": 9.876025460888528e-06, + "loss": 0.5567, + "step": 4026 + }, + { + "epoch": 1.470695636297243, + "grad_norm": 1.1566455364227295, + "learning_rate": 9.863656547932976e-06, + "loss": 0.5341, + "step": 4027 + }, + { + "epoch": 1.4710607997078693, + "grad_norm": 0.8980649709701538, + "learning_rate": 9.851292849719392e-06, + "loss": 0.5689, + "step": 4028 + }, + { + "epoch": 1.4714259631184956, + "grad_norm": 0.6560776829719543, + "learning_rate": 9.838934372608394e-06, + "loss": 0.5469, + "step": 4029 + }, + { + "epoch": 1.471791126529122, + "grad_norm": 1.4901273250579834, + "learning_rate": 9.826581122957915e-06, + "loss": 0.513, + "step": 4030 + }, + { + "epoch": 1.472156289939748, + "grad_norm": 1.2591936588287354, + "learning_rate": 9.814233107123215e-06, + "loss": 0.5422, + "step": 4031 + }, + { + "epoch": 1.4725214533503743, + "grad_norm": 0.7727863788604736, + "learning_rate": 9.801890331456851e-06, + "loss": 0.5754, + "step": 4032 + }, + { + "epoch": 1.4728866167610004, + "grad_norm": 1.1301239728927612, + "learning_rate": 9.789552802308697e-06, + "loss": 0.5648, + "step": 4033 + }, + { + "epoch": 1.4732517801716267, + "grad_norm": 0.7674900889396667, + "learning_rate": 9.777220526025897e-06, + "loss": 0.5556, + "step": 4034 + }, + { + "epoch": 1.473616943582253, + "grad_norm": 1.0524770021438599, + "learning_rate": 9.764893508952924e-06, + "loss": 0.5739, + "step": 4035 + }, + { + "epoch": 1.4739821069928793, + "grad_norm": 1.1383737325668335, + "learning_rate": 9.752571757431528e-06, + "loss": 0.5726, + "step": 4036 + }, + { + "epoch": 1.4743472704035057, + "grad_norm": 1.0157060623168945, + "learning_rate": 9.740255277800761e-06, + "loss": 0.4792, + "step": 4037 + }, + { + "epoch": 1.4747124338141318, + "grad_norm": 0.8764933347702026, + "learning_rate": 9.727944076396962e-06, + "loss": 0.523, + "step": 4038 + }, + { + "epoch": 1.475077597224758, + "grad_norm": 1.1367887258529663, + "learning_rate": 9.715638159553737e-06, + "loss": 0.5216, + "step": 4039 + }, + { + "epoch": 1.4754427606353844, + "grad_norm": 0.9799183011054993, + "learning_rate": 9.703337533601995e-06, + "loss": 0.5226, + "step": 4040 + }, + { + "epoch": 1.4758079240460105, + "grad_norm": 0.8010982275009155, + "learning_rate": 9.691042204869918e-06, + "loss": 0.5846, + "step": 4041 + }, + { + "epoch": 1.4761730874566368, + "grad_norm": 0.9556754231452942, + "learning_rate": 9.678752179682947e-06, + "loss": 0.5359, + "step": 4042 + }, + { + "epoch": 1.476538250867263, + "grad_norm": 0.8769105672836304, + "learning_rate": 9.666467464363822e-06, + "loss": 0.5394, + "step": 4043 + }, + { + "epoch": 1.4769034142778894, + "grad_norm": 1.2128407955169678, + "learning_rate": 9.65418806523252e-06, + "loss": 0.5633, + "step": 4044 + }, + { + "epoch": 1.4772685776885157, + "grad_norm": 1.1116158962249756, + "learning_rate": 9.641913988606304e-06, + "loss": 0.5294, + "step": 4045 + }, + { + "epoch": 1.4776337410991418, + "grad_norm": 2.208587169647217, + "learning_rate": 9.629645240799698e-06, + "loss": 0.5724, + "step": 4046 + }, + { + "epoch": 1.4779989045097681, + "grad_norm": 1.1062947511672974, + "learning_rate": 9.617381828124482e-06, + "loss": 0.5328, + "step": 4047 + }, + { + "epoch": 1.4783640679203944, + "grad_norm": 1.1361697912216187, + "learning_rate": 9.605123756889692e-06, + "loss": 0.541, + "step": 4048 + }, + { + "epoch": 1.4787292313310205, + "grad_norm": 0.9591314792633057, + "learning_rate": 9.5928710334016e-06, + "loss": 0.5646, + "step": 4049 + }, + { + "epoch": 1.4790943947416468, + "grad_norm": 2.141479015350342, + "learning_rate": 9.580623663963753e-06, + "loss": 0.5472, + "step": 4050 + }, + { + "epoch": 1.4794595581522731, + "grad_norm": 0.926795482635498, + "learning_rate": 9.568381654876924e-06, + "loss": 0.4863, + "step": 4051 + }, + { + "epoch": 1.4798247215628995, + "grad_norm": 0.8757331967353821, + "learning_rate": 9.55614501243915e-06, + "loss": 0.5847, + "step": 4052 + }, + { + "epoch": 1.4801898849735258, + "grad_norm": 1.0324450731277466, + "learning_rate": 9.54391374294567e-06, + "loss": 0.5141, + "step": 4053 + }, + { + "epoch": 1.4805550483841519, + "grad_norm": 0.8291592597961426, + "learning_rate": 9.531687852689003e-06, + "loss": 0.5546, + "step": 4054 + }, + { + "epoch": 1.4809202117947782, + "grad_norm": 1.4921578168869019, + "learning_rate": 9.519467347958857e-06, + "loss": 0.5553, + "step": 4055 + }, + { + "epoch": 1.4812853752054045, + "grad_norm": 0.980554461479187, + "learning_rate": 9.507252235042205e-06, + "loss": 0.5258, + "step": 4056 + }, + { + "epoch": 1.4816505386160306, + "grad_norm": 0.7031359672546387, + "learning_rate": 9.495042520223233e-06, + "loss": 0.5606, + "step": 4057 + }, + { + "epoch": 1.482015702026657, + "grad_norm": 0.9737385511398315, + "learning_rate": 9.482838209783351e-06, + "loss": 0.5645, + "step": 4058 + }, + { + "epoch": 1.4823808654372832, + "grad_norm": 0.7892079949378967, + "learning_rate": 9.470639310001176e-06, + "loss": 0.5356, + "step": 4059 + }, + { + "epoch": 1.4827460288479095, + "grad_norm": 0.8065754771232605, + "learning_rate": 9.458445827152558e-06, + "loss": 0.5406, + "step": 4060 + }, + { + "epoch": 1.4831111922585358, + "grad_norm": 1.0150619745254517, + "learning_rate": 9.446257767510559e-06, + "loss": 0.4894, + "step": 4061 + }, + { + "epoch": 1.483476355669162, + "grad_norm": 0.8664150238037109, + "learning_rate": 9.434075137345447e-06, + "loss": 0.5523, + "step": 4062 + }, + { + "epoch": 1.4838415190797882, + "grad_norm": 0.9920085668563843, + "learning_rate": 9.421897942924687e-06, + "loss": 0.5421, + "step": 4063 + }, + { + "epoch": 1.4842066824904145, + "grad_norm": 0.914808988571167, + "learning_rate": 9.409726190512962e-06, + "loss": 0.5664, + "step": 4064 + }, + { + "epoch": 1.4845718459010406, + "grad_norm": 0.8107731342315674, + "learning_rate": 9.397559886372152e-06, + "loss": 0.5396, + "step": 4065 + }, + { + "epoch": 1.484937009311667, + "grad_norm": 0.9913045167922974, + "learning_rate": 9.385399036761329e-06, + "loss": 0.5233, + "step": 4066 + }, + { + "epoch": 1.4853021727222933, + "grad_norm": 0.9688202142715454, + "learning_rate": 9.373243647936773e-06, + "loss": 0.5298, + "step": 4067 + }, + { + "epoch": 1.4856673361329196, + "grad_norm": 1.4763659238815308, + "learning_rate": 9.361093726151935e-06, + "loss": 0.576, + "step": 4068 + }, + { + "epoch": 1.4860324995435457, + "grad_norm": 0.9279058575630188, + "learning_rate": 9.348949277657455e-06, + "loss": 0.5326, + "step": 4069 + }, + { + "epoch": 1.486397662954172, + "grad_norm": 0.8254098296165466, + "learning_rate": 9.33681030870117e-06, + "loss": 0.5876, + "step": 4070 + }, + { + "epoch": 1.4867628263647983, + "grad_norm": 1.0469989776611328, + "learning_rate": 9.324676825528095e-06, + "loss": 0.5327, + "step": 4071 + }, + { + "epoch": 1.4871279897754244, + "grad_norm": 0.7550424337387085, + "learning_rate": 9.312548834380429e-06, + "loss": 0.5599, + "step": 4072 + }, + { + "epoch": 1.4874931531860507, + "grad_norm": 0.8626777529716492, + "learning_rate": 9.300426341497515e-06, + "loss": 0.4987, + "step": 4073 + }, + { + "epoch": 1.487858316596677, + "grad_norm": 0.7991651892662048, + "learning_rate": 9.288309353115903e-06, + "loss": 0.5625, + "step": 4074 + }, + { + "epoch": 1.4882234800073033, + "grad_norm": 1.0298484563827515, + "learning_rate": 9.276197875469298e-06, + "loss": 0.5132, + "step": 4075 + }, + { + "epoch": 1.4885886434179296, + "grad_norm": 0.7250978350639343, + "learning_rate": 9.264091914788572e-06, + "loss": 0.589, + "step": 4076 + }, + { + "epoch": 1.4889538068285557, + "grad_norm": 0.6632958650588989, + "learning_rate": 9.251991477301742e-06, + "loss": 0.5673, + "step": 4077 + }, + { + "epoch": 1.489318970239182, + "grad_norm": 0.8273259997367859, + "learning_rate": 9.239896569234008e-06, + "loss": 0.5549, + "step": 4078 + }, + { + "epoch": 1.4896841336498083, + "grad_norm": 0.7699050307273865, + "learning_rate": 9.227807196807711e-06, + "loss": 0.5649, + "step": 4079 + }, + { + "epoch": 1.4900492970604344, + "grad_norm": 0.9873746037483215, + "learning_rate": 9.215723366242352e-06, + "loss": 0.5673, + "step": 4080 + }, + { + "epoch": 1.4904144604710607, + "grad_norm": 0.8614767789840698, + "learning_rate": 9.203645083754581e-06, + "loss": 0.5473, + "step": 4081 + }, + { + "epoch": 1.490779623881687, + "grad_norm": 0.7521244287490845, + "learning_rate": 9.191572355558187e-06, + "loss": 0.5263, + "step": 4082 + }, + { + "epoch": 1.4911447872923134, + "grad_norm": 1.188138723373413, + "learning_rate": 9.17950518786409e-06, + "loss": 0.5385, + "step": 4083 + }, + { + "epoch": 1.4915099507029397, + "grad_norm": 0.7209674715995789, + "learning_rate": 9.167443586880376e-06, + "loss": 0.5397, + "step": 4084 + }, + { + "epoch": 1.4918751141135658, + "grad_norm": 0.6747141480445862, + "learning_rate": 9.155387558812252e-06, + "loss": 0.5808, + "step": 4085 + }, + { + "epoch": 1.492240277524192, + "grad_norm": 1.232947826385498, + "learning_rate": 9.14333710986207e-06, + "loss": 0.5044, + "step": 4086 + }, + { + "epoch": 1.4926054409348184, + "grad_norm": 0.6864703893661499, + "learning_rate": 9.131292246229286e-06, + "loss": 0.5612, + "step": 4087 + }, + { + "epoch": 1.4929706043454445, + "grad_norm": 0.889289915561676, + "learning_rate": 9.119252974110508e-06, + "loss": 0.5536, + "step": 4088 + }, + { + "epoch": 1.4933357677560708, + "grad_norm": 0.8222779631614685, + "learning_rate": 9.107219299699459e-06, + "loss": 0.5777, + "step": 4089 + }, + { + "epoch": 1.493700931166697, + "grad_norm": 0.8773689270019531, + "learning_rate": 9.095191229186977e-06, + "loss": 0.5237, + "step": 4090 + }, + { + "epoch": 1.4940660945773234, + "grad_norm": 1.2188704013824463, + "learning_rate": 9.083168768761035e-06, + "loss": 0.5388, + "step": 4091 + }, + { + "epoch": 1.4944312579879497, + "grad_norm": 0.9776803255081177, + "learning_rate": 9.071151924606688e-06, + "loss": 0.5234, + "step": 4092 + }, + { + "epoch": 1.4947964213985758, + "grad_norm": 1.0659689903259277, + "learning_rate": 9.059140702906128e-06, + "loss": 0.5246, + "step": 4093 + }, + { + "epoch": 1.4951615848092021, + "grad_norm": 0.952974259853363, + "learning_rate": 9.047135109838654e-06, + "loss": 0.5211, + "step": 4094 + }, + { + "epoch": 1.4955267482198285, + "grad_norm": 1.0041263103485107, + "learning_rate": 9.035135151580649e-06, + "loss": 0.4886, + "step": 4095 + }, + { + "epoch": 1.4958919116304545, + "grad_norm": 0.918709397315979, + "learning_rate": 9.023140834305621e-06, + "loss": 0.5502, + "step": 4096 + }, + { + "epoch": 1.4962570750410809, + "grad_norm": 1.0465898513793945, + "learning_rate": 9.011152164184157e-06, + "loss": 0.5168, + "step": 4097 + }, + { + "epoch": 1.4966222384517072, + "grad_norm": 0.7389519810676575, + "learning_rate": 8.999169147383943e-06, + "loss": 0.5679, + "step": 4098 + }, + { + "epoch": 1.4969874018623335, + "grad_norm": 0.8351882696151733, + "learning_rate": 8.987191790069771e-06, + "loss": 0.5687, + "step": 4099 + }, + { + "epoch": 1.4973525652729596, + "grad_norm": 1.0429890155792236, + "learning_rate": 8.975220098403507e-06, + "loss": 0.5707, + "step": 4100 + }, + { + "epoch": 1.4977177286835859, + "grad_norm": 0.9522386193275452, + "learning_rate": 8.963254078544112e-06, + "loss": 0.5316, + "step": 4101 + }, + { + "epoch": 1.4980828920942122, + "grad_norm": 0.617000162601471, + "learning_rate": 8.951293736647608e-06, + "loss": 0.5643, + "step": 4102 + }, + { + "epoch": 1.4984480555048383, + "grad_norm": 0.8510279059410095, + "learning_rate": 8.93933907886712e-06, + "loss": 0.5594, + "step": 4103 + }, + { + "epoch": 1.4988132189154646, + "grad_norm": 0.9541760683059692, + "learning_rate": 8.92739011135284e-06, + "loss": 0.5601, + "step": 4104 + }, + { + "epoch": 1.499178382326091, + "grad_norm": 0.7965231537818909, + "learning_rate": 8.91544684025204e-06, + "loss": 0.5201, + "step": 4105 + }, + { + "epoch": 1.4995435457367172, + "grad_norm": 0.828439474105835, + "learning_rate": 8.903509271709035e-06, + "loss": 0.5137, + "step": 4106 + }, + { + "epoch": 1.4999087091473435, + "grad_norm": 0.8249757289886475, + "learning_rate": 8.891577411865237e-06, + "loss": 0.5493, + "step": 4107 + }, + { + "epoch": 1.5002738725579698, + "grad_norm": 1.1597923040390015, + "learning_rate": 8.879651266859116e-06, + "loss": 0.5199, + "step": 4108 + }, + { + "epoch": 1.500639035968596, + "grad_norm": 1.1648917198181152, + "learning_rate": 8.867730842826177e-06, + "loss": 0.5038, + "step": 4109 + }, + { + "epoch": 1.5010041993792222, + "grad_norm": 0.7594660520553589, + "learning_rate": 8.855816145899016e-06, + "loss": 0.5369, + "step": 4110 + }, + { + "epoch": 1.5013693627898483, + "grad_norm": 1.0559147596359253, + "learning_rate": 8.843907182207254e-06, + "loss": 0.5389, + "step": 4111 + }, + { + "epoch": 1.5017345262004747, + "grad_norm": 0.7653175592422485, + "learning_rate": 8.832003957877579e-06, + "loss": 0.5305, + "step": 4112 + }, + { + "epoch": 1.502099689611101, + "grad_norm": 0.9201658368110657, + "learning_rate": 8.820106479033725e-06, + "loss": 0.5267, + "step": 4113 + }, + { + "epoch": 1.5024648530217273, + "grad_norm": 0.8921972513198853, + "learning_rate": 8.808214751796467e-06, + "loss": 0.5702, + "step": 4114 + }, + { + "epoch": 1.5028300164323536, + "grad_norm": 0.8630145788192749, + "learning_rate": 8.796328782283627e-06, + "loss": 0.4911, + "step": 4115 + }, + { + "epoch": 1.5031951798429797, + "grad_norm": 0.8433764576911926, + "learning_rate": 8.784448576610045e-06, + "loss": 0.5311, + "step": 4116 + }, + { + "epoch": 1.503560343253606, + "grad_norm": 0.9963394999504089, + "learning_rate": 8.77257414088762e-06, + "loss": 0.5367, + "step": 4117 + }, + { + "epoch": 1.503925506664232, + "grad_norm": 0.8315544128417969, + "learning_rate": 8.760705481225271e-06, + "loss": 0.5399, + "step": 4118 + }, + { + "epoch": 1.5042906700748584, + "grad_norm": 0.6828075051307678, + "learning_rate": 8.748842603728954e-06, + "loss": 0.5844, + "step": 4119 + }, + { + "epoch": 1.5046558334854847, + "grad_norm": 1.0928466320037842, + "learning_rate": 8.736985514501627e-06, + "loss": 0.5097, + "step": 4120 + }, + { + "epoch": 1.505020996896111, + "grad_norm": 0.8490285277366638, + "learning_rate": 8.725134219643307e-06, + "loss": 0.5106, + "step": 4121 + }, + { + "epoch": 1.5053861603067373, + "grad_norm": 0.9667754173278809, + "learning_rate": 8.71328872525099e-06, + "loss": 0.5697, + "step": 4122 + }, + { + "epoch": 1.5057513237173636, + "grad_norm": 0.8549676537513733, + "learning_rate": 8.701449037418717e-06, + "loss": 0.5666, + "step": 4123 + }, + { + "epoch": 1.5061164871279897, + "grad_norm": 0.7192597389221191, + "learning_rate": 8.68961516223753e-06, + "loss": 0.515, + "step": 4124 + }, + { + "epoch": 1.506481650538616, + "grad_norm": 0.8196496367454529, + "learning_rate": 8.677787105795494e-06, + "loss": 0.564, + "step": 4125 + }, + { + "epoch": 1.5068468139492421, + "grad_norm": 0.8854207396507263, + "learning_rate": 8.66596487417765e-06, + "loss": 0.5716, + "step": 4126 + }, + { + "epoch": 1.5072119773598684, + "grad_norm": 0.9815844297409058, + "learning_rate": 8.654148473466075e-06, + "loss": 0.5479, + "step": 4127 + }, + { + "epoch": 1.5075771407704948, + "grad_norm": 0.9503182172775269, + "learning_rate": 8.642337909739826e-06, + "loss": 0.5236, + "step": 4128 + }, + { + "epoch": 1.507942304181121, + "grad_norm": 0.8710759878158569, + "learning_rate": 8.630533189074979e-06, + "loss": 0.5655, + "step": 4129 + }, + { + "epoch": 1.5083074675917474, + "grad_norm": 0.9771408438682556, + "learning_rate": 8.618734317544569e-06, + "loss": 0.5138, + "step": 4130 + }, + { + "epoch": 1.5086726310023737, + "grad_norm": 1.1011977195739746, + "learning_rate": 8.60694130121865e-06, + "loss": 0.5382, + "step": 4131 + }, + { + "epoch": 1.5090377944129998, + "grad_norm": 0.9117840528488159, + "learning_rate": 8.595154146164257e-06, + "loss": 0.5525, + "step": 4132 + }, + { + "epoch": 1.509402957823626, + "grad_norm": 0.9619202017784119, + "learning_rate": 8.583372858445408e-06, + "loss": 0.4971, + "step": 4133 + }, + { + "epoch": 1.5097681212342522, + "grad_norm": 0.99234539270401, + "learning_rate": 8.57159744412311e-06, + "loss": 0.5446, + "step": 4134 + }, + { + "epoch": 1.5101332846448785, + "grad_norm": 0.9445732831954956, + "learning_rate": 8.559827909255333e-06, + "loss": 0.5789, + "step": 4135 + }, + { + "epoch": 1.5104984480555048, + "grad_norm": 0.8710625767707825, + "learning_rate": 8.548064259897024e-06, + "loss": 0.5395, + "step": 4136 + }, + { + "epoch": 1.5108636114661311, + "grad_norm": 0.867486834526062, + "learning_rate": 8.536306502100118e-06, + "loss": 0.5536, + "step": 4137 + }, + { + "epoch": 1.5112287748767574, + "grad_norm": 1.0533684492111206, + "learning_rate": 8.524554641913504e-06, + "loss": 0.5558, + "step": 4138 + }, + { + "epoch": 1.5115939382873838, + "grad_norm": 0.8343387246131897, + "learning_rate": 8.512808685383056e-06, + "loss": 0.5719, + "step": 4139 + }, + { + "epoch": 1.5119591016980098, + "grad_norm": 0.9419680833816528, + "learning_rate": 8.501068638551577e-06, + "loss": 0.5453, + "step": 4140 + }, + { + "epoch": 1.5123242651086362, + "grad_norm": 0.8943369388580322, + "learning_rate": 8.489334507458862e-06, + "loss": 0.5189, + "step": 4141 + }, + { + "epoch": 1.5126894285192622, + "grad_norm": 0.9612909555435181, + "learning_rate": 8.47760629814165e-06, + "loss": 0.5728, + "step": 4142 + }, + { + "epoch": 1.5130545919298886, + "grad_norm": 1.2583495378494263, + "learning_rate": 8.465884016633629e-06, + "loss": 0.5344, + "step": 4143 + }, + { + "epoch": 1.5134197553405149, + "grad_norm": 1.2594666481018066, + "learning_rate": 8.454167668965457e-06, + "loss": 0.5131, + "step": 4144 + }, + { + "epoch": 1.5137849187511412, + "grad_norm": 0.8043733835220337, + "learning_rate": 8.442457261164705e-06, + "loss": 0.5537, + "step": 4145 + }, + { + "epoch": 1.5141500821617675, + "grad_norm": 0.8278462886810303, + "learning_rate": 8.430752799255918e-06, + "loss": 0.5605, + "step": 4146 + }, + { + "epoch": 1.5145152455723938, + "grad_norm": 0.7966932058334351, + "learning_rate": 8.419054289260569e-06, + "loss": 0.5403, + "step": 4147 + }, + { + "epoch": 1.51488040898302, + "grad_norm": 0.7923164367675781, + "learning_rate": 8.407361737197079e-06, + "loss": 0.5519, + "step": 4148 + }, + { + "epoch": 1.5152455723936462, + "grad_norm": 0.9273017048835754, + "learning_rate": 8.395675149080795e-06, + "loss": 0.5249, + "step": 4149 + }, + { + "epoch": 1.5156107358042723, + "grad_norm": 0.9245014190673828, + "learning_rate": 8.383994530923987e-06, + "loss": 0.5533, + "step": 4150 + }, + { + "epoch": 1.5159758992148986, + "grad_norm": 1.4228578805923462, + "learning_rate": 8.372319888735872e-06, + "loss": 0.52, + "step": 4151 + }, + { + "epoch": 1.516341062625525, + "grad_norm": 0.9283208847045898, + "learning_rate": 8.360651228522583e-06, + "loss": 0.5454, + "step": 4152 + }, + { + "epoch": 1.5167062260361512, + "grad_norm": 0.8381152749061584, + "learning_rate": 8.348988556287185e-06, + "loss": 0.5594, + "step": 4153 + }, + { + "epoch": 1.5170713894467776, + "grad_norm": 1.025354027748108, + "learning_rate": 8.337331878029644e-06, + "loss": 0.4973, + "step": 4154 + }, + { + "epoch": 1.5174365528574036, + "grad_norm": 0.7261088490486145, + "learning_rate": 8.325681199746856e-06, + "loss": 0.5406, + "step": 4155 + }, + { + "epoch": 1.51780171626803, + "grad_norm": 0.9314051270484924, + "learning_rate": 8.314036527432631e-06, + "loss": 0.5681, + "step": 4156 + }, + { + "epoch": 1.518166879678656, + "grad_norm": 1.000509262084961, + "learning_rate": 8.302397867077683e-06, + "loss": 0.5456, + "step": 4157 + }, + { + "epoch": 1.5185320430892824, + "grad_norm": 1.0051593780517578, + "learning_rate": 8.290765224669646e-06, + "loss": 0.5569, + "step": 4158 + }, + { + "epoch": 1.5188972064999087, + "grad_norm": 0.9077114462852478, + "learning_rate": 8.27913860619303e-06, + "loss": 0.5591, + "step": 4159 + }, + { + "epoch": 1.519262369910535, + "grad_norm": 0.7231484055519104, + "learning_rate": 8.267518017629272e-06, + "loss": 0.5533, + "step": 4160 + }, + { + "epoch": 1.5196275333211613, + "grad_norm": 0.8238896131515503, + "learning_rate": 8.255903464956707e-06, + "loss": 0.5546, + "step": 4161 + }, + { + "epoch": 1.5199926967317876, + "grad_norm": 0.8406558632850647, + "learning_rate": 8.24429495415054e-06, + "loss": 0.5359, + "step": 4162 + }, + { + "epoch": 1.5203578601424137, + "grad_norm": 0.7404271364212036, + "learning_rate": 8.232692491182898e-06, + "loss": 0.5751, + "step": 4163 + }, + { + "epoch": 1.52072302355304, + "grad_norm": 0.8278073668479919, + "learning_rate": 8.221096082022773e-06, + "loss": 0.5676, + "step": 4164 + }, + { + "epoch": 1.521088186963666, + "grad_norm": 0.9355740547180176, + "learning_rate": 8.209505732636052e-06, + "loss": 0.5716, + "step": 4165 + }, + { + "epoch": 1.5214533503742924, + "grad_norm": 0.9325588345527649, + "learning_rate": 8.197921448985512e-06, + "loss": 0.5132, + "step": 4166 + }, + { + "epoch": 1.5218185137849187, + "grad_norm": 0.8123670220375061, + "learning_rate": 8.186343237030795e-06, + "loss": 0.5347, + "step": 4167 + }, + { + "epoch": 1.522183677195545, + "grad_norm": 0.8848276734352112, + "learning_rate": 8.174771102728438e-06, + "loss": 0.5711, + "step": 4168 + }, + { + "epoch": 1.5225488406061713, + "grad_norm": 0.9114759564399719, + "learning_rate": 8.163205052031827e-06, + "loss": 0.5122, + "step": 4169 + }, + { + "epoch": 1.5229140040167977, + "grad_norm": 0.8145031929016113, + "learning_rate": 8.151645090891234e-06, + "loss": 0.5796, + "step": 4170 + }, + { + "epoch": 1.5232791674274238, + "grad_norm": 0.678084135055542, + "learning_rate": 8.140091225253797e-06, + "loss": 0.5652, + "step": 4171 + }, + { + "epoch": 1.52364433083805, + "grad_norm": 0.8058270812034607, + "learning_rate": 8.128543461063523e-06, + "loss": 0.563, + "step": 4172 + }, + { + "epoch": 1.5240094942486762, + "grad_norm": 0.8949602842330933, + "learning_rate": 8.11700180426126e-06, + "loss": 0.5804, + "step": 4173 + }, + { + "epoch": 1.5243746576593025, + "grad_norm": 0.9329439401626587, + "learning_rate": 8.105466260784733e-06, + "loss": 0.5499, + "step": 4174 + }, + { + "epoch": 1.5247398210699288, + "grad_norm": 1.1913632154464722, + "learning_rate": 8.093936836568523e-06, + "loss": 0.5648, + "step": 4175 + }, + { + "epoch": 1.525104984480555, + "grad_norm": 0.7555683255195618, + "learning_rate": 8.082413537544045e-06, + "loss": 0.5427, + "step": 4176 + }, + { + "epoch": 1.5254701478911814, + "grad_norm": 1.0422959327697754, + "learning_rate": 8.070896369639578e-06, + "loss": 0.5305, + "step": 4177 + }, + { + "epoch": 1.5258353113018077, + "grad_norm": 0.8744900822639465, + "learning_rate": 8.05938533878025e-06, + "loss": 0.5235, + "step": 4178 + }, + { + "epoch": 1.5262004747124338, + "grad_norm": 0.6947089433670044, + "learning_rate": 8.047880450888013e-06, + "loss": 0.5809, + "step": 4179 + }, + { + "epoch": 1.5265656381230601, + "grad_norm": 1.1186779737472534, + "learning_rate": 8.036381711881674e-06, + "loss": 0.5539, + "step": 4180 + }, + { + "epoch": 1.5269308015336862, + "grad_norm": 0.7364650368690491, + "learning_rate": 8.024889127676874e-06, + "loss": 0.5533, + "step": 4181 + }, + { + "epoch": 1.5272959649443125, + "grad_norm": 1.0077247619628906, + "learning_rate": 8.013402704186095e-06, + "loss": 0.5028, + "step": 4182 + }, + { + "epoch": 1.5276611283549388, + "grad_norm": 0.8376016616821289, + "learning_rate": 8.001922447318624e-06, + "loss": 0.5817, + "step": 4183 + }, + { + "epoch": 1.5280262917655651, + "grad_norm": 0.9573248028755188, + "learning_rate": 7.990448362980601e-06, + "loss": 0.5298, + "step": 4184 + }, + { + "epoch": 1.5283914551761915, + "grad_norm": 0.9927940368652344, + "learning_rate": 7.978980457074983e-06, + "loss": 0.5438, + "step": 4185 + }, + { + "epoch": 1.5287566185868176, + "grad_norm": 1.1189831495285034, + "learning_rate": 7.967518735501545e-06, + "loss": 0.4682, + "step": 4186 + }, + { + "epoch": 1.5291217819974439, + "grad_norm": 1.0482594966888428, + "learning_rate": 7.956063204156892e-06, + "loss": 0.527, + "step": 4187 + }, + { + "epoch": 1.52948694540807, + "grad_norm": 1.1917227506637573, + "learning_rate": 7.944613868934428e-06, + "loss": 0.528, + "step": 4188 + }, + { + "epoch": 1.5298521088186963, + "grad_norm": 0.9152950048446655, + "learning_rate": 7.93317073572437e-06, + "loss": 0.5598, + "step": 4189 + }, + { + "epoch": 1.5302172722293226, + "grad_norm": 0.9191230535507202, + "learning_rate": 7.921733810413754e-06, + "loss": 0.4855, + "step": 4190 + }, + { + "epoch": 1.530582435639949, + "grad_norm": 0.8195654153823853, + "learning_rate": 7.910303098886422e-06, + "loss": 0.5565, + "step": 4191 + }, + { + "epoch": 1.5309475990505752, + "grad_norm": 0.7268591523170471, + "learning_rate": 7.898878607023024e-06, + "loss": 0.5603, + "step": 4192 + }, + { + "epoch": 1.5313127624612015, + "grad_norm": 1.0640649795532227, + "learning_rate": 7.887460340700988e-06, + "loss": 0.561, + "step": 4193 + }, + { + "epoch": 1.5316779258718276, + "grad_norm": 0.7788658738136292, + "learning_rate": 7.87604830579456e-06, + "loss": 0.6047, + "step": 4194 + }, + { + "epoch": 1.532043089282454, + "grad_norm": 0.5427097082138062, + "learning_rate": 7.864642508174778e-06, + "loss": 0.586, + "step": 4195 + }, + { + "epoch": 1.53240825269308, + "grad_norm": 0.9323387742042542, + "learning_rate": 7.853242953709467e-06, + "loss": 0.5583, + "step": 4196 + }, + { + "epoch": 1.5327734161037063, + "grad_norm": 0.647345781326294, + "learning_rate": 7.841849648263233e-06, + "loss": 0.5873, + "step": 4197 + }, + { + "epoch": 1.5331385795143326, + "grad_norm": 1.0061805248260498, + "learning_rate": 7.830462597697476e-06, + "loss": 0.5473, + "step": 4198 + }, + { + "epoch": 1.533503742924959, + "grad_norm": 0.8690460324287415, + "learning_rate": 7.819081807870383e-06, + "loss": 0.5322, + "step": 4199 + }, + { + "epoch": 1.5338689063355853, + "grad_norm": 0.814120352268219, + "learning_rate": 7.807707284636906e-06, + "loss": 0.5649, + "step": 4200 + }, + { + "epoch": 1.5342340697462116, + "grad_norm": 0.8226633667945862, + "learning_rate": 7.796339033848797e-06, + "loss": 0.5178, + "step": 4201 + }, + { + "epoch": 1.5345992331568377, + "grad_norm": 0.8266099691390991, + "learning_rate": 7.784977061354548e-06, + "loss": 0.5394, + "step": 4202 + }, + { + "epoch": 1.534964396567464, + "grad_norm": 0.9530923962593079, + "learning_rate": 7.773621372999437e-06, + "loss": 0.5309, + "step": 4203 + }, + { + "epoch": 1.53532955997809, + "grad_norm": 1.0963695049285889, + "learning_rate": 7.762271974625516e-06, + "loss": 0.4514, + "step": 4204 + }, + { + "epoch": 1.5356947233887164, + "grad_norm": 0.8349227905273438, + "learning_rate": 7.750928872071594e-06, + "loss": 0.5683, + "step": 4205 + }, + { + "epoch": 1.5360598867993427, + "grad_norm": 1.0259308815002441, + "learning_rate": 7.73959207117325e-06, + "loss": 0.5078, + "step": 4206 + }, + { + "epoch": 1.536425050209969, + "grad_norm": 0.7931904196739197, + "learning_rate": 7.728261577762798e-06, + "loss": 0.5291, + "step": 4207 + }, + { + "epoch": 1.5367902136205953, + "grad_norm": 0.8693494200706482, + "learning_rate": 7.716937397669333e-06, + "loss": 0.5345, + "step": 4208 + }, + { + "epoch": 1.5371553770312216, + "grad_norm": 0.7820866703987122, + "learning_rate": 7.705619536718685e-06, + "loss": 0.585, + "step": 4209 + }, + { + "epoch": 1.5375205404418477, + "grad_norm": 0.6628980040550232, + "learning_rate": 7.694308000733443e-06, + "loss": 0.5522, + "step": 4210 + }, + { + "epoch": 1.537885703852474, + "grad_norm": 0.8858047127723694, + "learning_rate": 7.683002795532947e-06, + "loss": 0.5928, + "step": 4211 + }, + { + "epoch": 1.5382508672631001, + "grad_norm": 0.9911432266235352, + "learning_rate": 7.671703926933253e-06, + "loss": 0.4933, + "step": 4212 + }, + { + "epoch": 1.5386160306737264, + "grad_norm": 1.3404275178909302, + "learning_rate": 7.660411400747188e-06, + "loss": 0.4949, + "step": 4213 + }, + { + "epoch": 1.5389811940843527, + "grad_norm": 0.7487965226173401, + "learning_rate": 7.649125222784298e-06, + "loss": 0.549, + "step": 4214 + }, + { + "epoch": 1.539346357494979, + "grad_norm": 0.7452327609062195, + "learning_rate": 7.637845398850879e-06, + "loss": 0.548, + "step": 4215 + }, + { + "epoch": 1.5397115209056054, + "grad_norm": 1.9849321842193604, + "learning_rate": 7.6265719347499376e-06, + "loss": 0.5586, + "step": 4216 + }, + { + "epoch": 1.5400766843162315, + "grad_norm": 1.342613935470581, + "learning_rate": 7.6153048362812166e-06, + "loss": 0.5374, + "step": 4217 + }, + { + "epoch": 1.5404418477268578, + "grad_norm": 0.7283867597579956, + "learning_rate": 7.604044109241191e-06, + "loss": 0.5539, + "step": 4218 + }, + { + "epoch": 1.5408070111374839, + "grad_norm": 0.8096727728843689, + "learning_rate": 7.592789759423049e-06, + "loss": 0.579, + "step": 4219 + }, + { + "epoch": 1.5411721745481102, + "grad_norm": 0.7584173679351807, + "learning_rate": 7.581541792616709e-06, + "loss": 0.5505, + "step": 4220 + }, + { + "epoch": 1.5415373379587365, + "grad_norm": 0.7899060249328613, + "learning_rate": 7.570300214608801e-06, + "loss": 0.5756, + "step": 4221 + }, + { + "epoch": 1.5419025013693628, + "grad_norm": 0.9617719054222107, + "learning_rate": 7.559065031182653e-06, + "loss": 0.5525, + "step": 4222 + }, + { + "epoch": 1.5422676647799891, + "grad_norm": 0.7457582354545593, + "learning_rate": 7.547836248118321e-06, + "loss": 0.5548, + "step": 4223 + }, + { + "epoch": 1.5426328281906154, + "grad_norm": 0.7833566665649414, + "learning_rate": 7.536613871192566e-06, + "loss": 0.5673, + "step": 4224 + }, + { + "epoch": 1.5429979916012415, + "grad_norm": 0.9961585402488708, + "learning_rate": 7.525397906178858e-06, + "loss": 0.4791, + "step": 4225 + }, + { + "epoch": 1.5433631550118678, + "grad_norm": 0.8389790058135986, + "learning_rate": 7.514188358847345e-06, + "loss": 0.5434, + "step": 4226 + }, + { + "epoch": 1.543728318422494, + "grad_norm": 0.7960191965103149, + "learning_rate": 7.502985234964897e-06, + "loss": 0.5604, + "step": 4227 + }, + { + "epoch": 1.5440934818331202, + "grad_norm": 0.7586323618888855, + "learning_rate": 7.491788540295077e-06, + "loss": 0.5443, + "step": 4228 + }, + { + "epoch": 1.5444586452437465, + "grad_norm": 1.1452782154083252, + "learning_rate": 7.480598280598126e-06, + "loss": 0.5582, + "step": 4229 + }, + { + "epoch": 1.5448238086543729, + "grad_norm": 0.8075771927833557, + "learning_rate": 7.4694144616309835e-06, + "loss": 0.5435, + "step": 4230 + }, + { + "epoch": 1.5451889720649992, + "grad_norm": 0.8981938362121582, + "learning_rate": 7.458237089147289e-06, + "loss": 0.5468, + "step": 4231 + }, + { + "epoch": 1.5455541354756255, + "grad_norm": 0.9306362271308899, + "learning_rate": 7.447066168897334e-06, + "loss": 0.4987, + "step": 4232 + }, + { + "epoch": 1.5459192988862516, + "grad_norm": 1.1683992147445679, + "learning_rate": 7.435901706628119e-06, + "loss": 0.5735, + "step": 4233 + }, + { + "epoch": 1.5462844622968779, + "grad_norm": 0.7260207533836365, + "learning_rate": 7.424743708083308e-06, + "loss": 0.5517, + "step": 4234 + }, + { + "epoch": 1.546649625707504, + "grad_norm": 2.088953971862793, + "learning_rate": 7.413592179003255e-06, + "loss": 0.5201, + "step": 4235 + }, + { + "epoch": 1.5470147891181303, + "grad_norm": 0.9240289330482483, + "learning_rate": 7.402447125124956e-06, + "loss": 0.5305, + "step": 4236 + }, + { + "epoch": 1.5473799525287566, + "grad_norm": 1.0445443391799927, + "learning_rate": 7.391308552182104e-06, + "loss": 0.4925, + "step": 4237 + }, + { + "epoch": 1.547745115939383, + "grad_norm": 0.9031312465667725, + "learning_rate": 7.380176465905047e-06, + "loss": 0.5323, + "step": 4238 + }, + { + "epoch": 1.5481102793500092, + "grad_norm": 0.9161219000816345, + "learning_rate": 7.369050872020802e-06, + "loss": 0.5013, + "step": 4239 + }, + { + "epoch": 1.5484754427606355, + "grad_norm": 0.9719271659851074, + "learning_rate": 7.357931776253027e-06, + "loss": 0.5147, + "step": 4240 + }, + { + "epoch": 1.5488406061712616, + "grad_norm": 1.3940577507019043, + "learning_rate": 7.346819184322067e-06, + "loss": 0.5789, + "step": 4241 + }, + { + "epoch": 1.549205769581888, + "grad_norm": 0.7420766353607178, + "learning_rate": 7.3357131019448906e-06, + "loss": 0.5715, + "step": 4242 + }, + { + "epoch": 1.549570932992514, + "grad_norm": 0.8596094846725464, + "learning_rate": 7.324613534835134e-06, + "loss": 0.5281, + "step": 4243 + }, + { + "epoch": 1.5499360964031403, + "grad_norm": 0.8126491904258728, + "learning_rate": 7.313520488703083e-06, + "loss": 0.5525, + "step": 4244 + }, + { + "epoch": 1.5503012598137667, + "grad_norm": 0.6757723093032837, + "learning_rate": 7.3024339692556714e-06, + "loss": 0.5642, + "step": 4245 + }, + { + "epoch": 1.550666423224393, + "grad_norm": 0.7929092645645142, + "learning_rate": 7.291353982196454e-06, + "loss": 0.5198, + "step": 4246 + }, + { + "epoch": 1.5510315866350193, + "grad_norm": 0.9600295424461365, + "learning_rate": 7.280280533225648e-06, + "loss": 0.5576, + "step": 4247 + }, + { + "epoch": 1.5513967500456456, + "grad_norm": 0.8763399720191956, + "learning_rate": 7.269213628040095e-06, + "loss": 0.5662, + "step": 4248 + }, + { + "epoch": 1.5517619134562717, + "grad_norm": 1.014392375946045, + "learning_rate": 7.258153272333281e-06, + "loss": 0.5445, + "step": 4249 + }, + { + "epoch": 1.552127076866898, + "grad_norm": 0.8908765912055969, + "learning_rate": 7.247099471795307e-06, + "loss": 0.548, + "step": 4250 + }, + { + "epoch": 1.552492240277524, + "grad_norm": 0.7699535489082336, + "learning_rate": 7.236052232112912e-06, + "loss": 0.5413, + "step": 4251 + }, + { + "epoch": 1.5528574036881504, + "grad_norm": 0.9604656100273132, + "learning_rate": 7.225011558969457e-06, + "loss": 0.5506, + "step": 4252 + }, + { + "epoch": 1.5532225670987767, + "grad_norm": 0.8450736999511719, + "learning_rate": 7.213977458044925e-06, + "loss": 0.5228, + "step": 4253 + }, + { + "epoch": 1.553587730509403, + "grad_norm": 1.0771404504776, + "learning_rate": 7.202949935015928e-06, + "loss": 0.4957, + "step": 4254 + }, + { + "epoch": 1.5539528939200293, + "grad_norm": 0.9505804777145386, + "learning_rate": 7.191928995555677e-06, + "loss": 0.5399, + "step": 4255 + }, + { + "epoch": 1.5543180573306554, + "grad_norm": 0.8374097347259521, + "learning_rate": 7.180914645333994e-06, + "loss": 0.5684, + "step": 4256 + }, + { + "epoch": 1.5546832207412817, + "grad_norm": 0.6433123350143433, + "learning_rate": 7.1699068900173286e-06, + "loss": 0.5648, + "step": 4257 + }, + { + "epoch": 1.5550483841519078, + "grad_norm": 0.89088374376297, + "learning_rate": 7.158905735268728e-06, + "loss": 0.4925, + "step": 4258 + }, + { + "epoch": 1.5554135475625341, + "grad_norm": 0.7068061232566833, + "learning_rate": 7.147911186747853e-06, + "loss": 0.5532, + "step": 4259 + }, + { + "epoch": 1.5557787109731605, + "grad_norm": 0.7148109674453735, + "learning_rate": 7.136923250110943e-06, + "loss": 0.5472, + "step": 4260 + }, + { + "epoch": 1.5561438743837868, + "grad_norm": 1.200268268585205, + "learning_rate": 7.125941931010858e-06, + "loss": 0.5366, + "step": 4261 + }, + { + "epoch": 1.556509037794413, + "grad_norm": 1.0052423477172852, + "learning_rate": 7.114967235097046e-06, + "loss": 0.4856, + "step": 4262 + }, + { + "epoch": 1.5568742012050394, + "grad_norm": 0.6853835582733154, + "learning_rate": 7.103999168015548e-06, + "loss": 0.6011, + "step": 4263 + }, + { + "epoch": 1.5572393646156655, + "grad_norm": 0.8341567516326904, + "learning_rate": 7.093037735408998e-06, + "loss": 0.5544, + "step": 4264 + }, + { + "epoch": 1.5576045280262918, + "grad_norm": 0.8696557283401489, + "learning_rate": 7.082082942916604e-06, + "loss": 0.4904, + "step": 4265 + }, + { + "epoch": 1.5579696914369179, + "grad_norm": 0.7784153819084167, + "learning_rate": 7.071134796174171e-06, + "loss": 0.522, + "step": 4266 + }, + { + "epoch": 1.5583348548475442, + "grad_norm": 0.8596316576004028, + "learning_rate": 7.060193300814085e-06, + "loss": 0.5118, + "step": 4267 + }, + { + "epoch": 1.5587000182581705, + "grad_norm": 0.9530545473098755, + "learning_rate": 7.049258462465307e-06, + "loss": 0.5403, + "step": 4268 + }, + { + "epoch": 1.5590651816687968, + "grad_norm": 1.1124368906021118, + "learning_rate": 7.03833028675337e-06, + "loss": 0.4995, + "step": 4269 + }, + { + "epoch": 1.5594303450794231, + "grad_norm": 0.8794035911560059, + "learning_rate": 7.027408779300375e-06, + "loss": 0.514, + "step": 4270 + }, + { + "epoch": 1.5597955084900494, + "grad_norm": 0.7167550325393677, + "learning_rate": 7.016493945725007e-06, + "loss": 0.5587, + "step": 4271 + }, + { + "epoch": 1.5601606719006755, + "grad_norm": 0.8408735990524292, + "learning_rate": 7.005585791642506e-06, + "loss": 0.524, + "step": 4272 + }, + { + "epoch": 1.5605258353113018, + "grad_norm": 0.8484911918640137, + "learning_rate": 6.994684322664682e-06, + "loss": 0.5269, + "step": 4273 + }, + { + "epoch": 1.560890998721928, + "grad_norm": 0.7512218356132507, + "learning_rate": 6.983789544399911e-06, + "loss": 0.5798, + "step": 4274 + }, + { + "epoch": 1.5612561621325542, + "grad_norm": 1.0278340578079224, + "learning_rate": 6.9729014624531035e-06, + "loss": 0.5275, + "step": 4275 + }, + { + "epoch": 1.5616213255431806, + "grad_norm": 0.7132531404495239, + "learning_rate": 6.962020082425749e-06, + "loss": 0.5517, + "step": 4276 + }, + { + "epoch": 1.5619864889538069, + "grad_norm": 1.00215744972229, + "learning_rate": 6.951145409915881e-06, + "loss": 0.5051, + "step": 4277 + }, + { + "epoch": 1.5623516523644332, + "grad_norm": 0.9334973692893982, + "learning_rate": 6.940277450518089e-06, + "loss": 0.5254, + "step": 4278 + }, + { + "epoch": 1.5627168157750595, + "grad_norm": 0.837821364402771, + "learning_rate": 6.929416209823485e-06, + "loss": 0.5042, + "step": 4279 + }, + { + "epoch": 1.5630819791856856, + "grad_norm": 0.9468808770179749, + "learning_rate": 6.918561693419754e-06, + "loss": 0.5677, + "step": 4280 + }, + { + "epoch": 1.563447142596312, + "grad_norm": 0.9204182028770447, + "learning_rate": 6.907713906891114e-06, + "loss": 0.5432, + "step": 4281 + }, + { + "epoch": 1.563812306006938, + "grad_norm": 0.8597743511199951, + "learning_rate": 6.896872855818298e-06, + "loss": 0.5634, + "step": 4282 + }, + { + "epoch": 1.5641774694175643, + "grad_norm": 0.696017861366272, + "learning_rate": 6.886038545778611e-06, + "loss": 0.5752, + "step": 4283 + }, + { + "epoch": 1.5645426328281906, + "grad_norm": 0.833747386932373, + "learning_rate": 6.875210982345855e-06, + "loss": 0.5622, + "step": 4284 + }, + { + "epoch": 1.564907796238817, + "grad_norm": 0.7575204372406006, + "learning_rate": 6.8643901710903825e-06, + "loss": 0.5383, + "step": 4285 + }, + { + "epoch": 1.5652729596494432, + "grad_norm": 0.9199098944664001, + "learning_rate": 6.8535761175790686e-06, + "loss": 0.5327, + "step": 4286 + }, + { + "epoch": 1.5656381230600693, + "grad_norm": 0.7201038599014282, + "learning_rate": 6.842768827375308e-06, + "loss": 0.5562, + "step": 4287 + }, + { + "epoch": 1.5660032864706956, + "grad_norm": 1.0112850666046143, + "learning_rate": 6.831968306039025e-06, + "loss": 0.5761, + "step": 4288 + }, + { + "epoch": 1.5663684498813217, + "grad_norm": 0.9436196684837341, + "learning_rate": 6.821174559126644e-06, + "loss": 0.5137, + "step": 4289 + }, + { + "epoch": 1.566733613291948, + "grad_norm": 0.8342386484146118, + "learning_rate": 6.8103875921911185e-06, + "loss": 0.5444, + "step": 4290 + }, + { + "epoch": 1.5670987767025744, + "grad_norm": 1.1027247905731201, + "learning_rate": 6.79960741078191e-06, + "loss": 0.5704, + "step": 4291 + }, + { + "epoch": 1.5674639401132007, + "grad_norm": 0.9045964479446411, + "learning_rate": 6.788834020444997e-06, + "loss": 0.5527, + "step": 4292 + }, + { + "epoch": 1.567829103523827, + "grad_norm": 0.7952874898910522, + "learning_rate": 6.778067426722841e-06, + "loss": 0.5579, + "step": 4293 + }, + { + "epoch": 1.5681942669344533, + "grad_norm": 1.68389892578125, + "learning_rate": 6.767307635154432e-06, + "loss": 0.5771, + "step": 4294 + }, + { + "epoch": 1.5685594303450794, + "grad_norm": 0.82811039686203, + "learning_rate": 6.7565546512752575e-06, + "loss": 0.5667, + "step": 4295 + }, + { + "epoch": 1.5689245937557057, + "grad_norm": 0.8454976081848145, + "learning_rate": 6.74580848061728e-06, + "loss": 0.5093, + "step": 4296 + }, + { + "epoch": 1.5692897571663318, + "grad_norm": 1.0146894454956055, + "learning_rate": 6.735069128708984e-06, + "loss": 0.518, + "step": 4297 + }, + { + "epoch": 1.569654920576958, + "grad_norm": 0.9876936674118042, + "learning_rate": 6.724336601075339e-06, + "loss": 0.566, + "step": 4298 + }, + { + "epoch": 1.5700200839875844, + "grad_norm": 0.9768635630607605, + "learning_rate": 6.713610903237784e-06, + "loss": 0.5191, + "step": 4299 + }, + { + "epoch": 1.5703852473982107, + "grad_norm": 0.8869251012802124, + "learning_rate": 6.702892040714273e-06, + "loss": 0.5574, + "step": 4300 + }, + { + "epoch": 1.570750410808837, + "grad_norm": 0.7400929927825928, + "learning_rate": 6.692180019019226e-06, + "loss": 0.5781, + "step": 4301 + }, + { + "epoch": 1.5711155742194634, + "grad_norm": 0.9121541380882263, + "learning_rate": 6.681474843663556e-06, + "loss": 0.5522, + "step": 4302 + }, + { + "epoch": 1.5714807376300894, + "grad_norm": 0.8536775708198547, + "learning_rate": 6.670776520154634e-06, + "loss": 0.554, + "step": 4303 + }, + { + "epoch": 1.5718459010407158, + "grad_norm": 1.0310137271881104, + "learning_rate": 6.6600850539963215e-06, + "loss": 0.5031, + "step": 4304 + }, + { + "epoch": 1.5722110644513418, + "grad_norm": 0.9023722410202026, + "learning_rate": 6.6494004506889545e-06, + "loss": 0.5627, + "step": 4305 + }, + { + "epoch": 1.5725762278619682, + "grad_norm": 0.7008075714111328, + "learning_rate": 6.638722715729327e-06, + "loss": 0.5958, + "step": 4306 + }, + { + "epoch": 1.5729413912725945, + "grad_norm": 0.7555179595947266, + "learning_rate": 6.628051854610715e-06, + "loss": 0.5398, + "step": 4307 + }, + { + "epoch": 1.5733065546832208, + "grad_norm": 0.8288542032241821, + "learning_rate": 6.617387872822842e-06, + "loss": 0.5487, + "step": 4308 + }, + { + "epoch": 1.573671718093847, + "grad_norm": 0.8955180644989014, + "learning_rate": 6.606730775851891e-06, + "loss": 0.5454, + "step": 4309 + }, + { + "epoch": 1.5740368815044734, + "grad_norm": 0.9104514718055725, + "learning_rate": 6.596080569180517e-06, + "loss": 0.5197, + "step": 4310 + }, + { + "epoch": 1.5744020449150995, + "grad_norm": 0.9378561973571777, + "learning_rate": 6.585437258287823e-06, + "loss": 0.5162, + "step": 4311 + }, + { + "epoch": 1.5747672083257258, + "grad_norm": 0.8490752577781677, + "learning_rate": 6.574800848649374e-06, + "loss": 0.5299, + "step": 4312 + }, + { + "epoch": 1.575132371736352, + "grad_norm": 0.866038978099823, + "learning_rate": 6.564171345737163e-06, + "loss": 0.4706, + "step": 4313 + }, + { + "epoch": 1.5754975351469782, + "grad_norm": 0.9095643758773804, + "learning_rate": 6.553548755019648e-06, + "loss": 0.5261, + "step": 4314 + }, + { + "epoch": 1.5758626985576045, + "grad_norm": 1.3830903768539429, + "learning_rate": 6.542933081961724e-06, + "loss": 0.5293, + "step": 4315 + }, + { + "epoch": 1.5762278619682308, + "grad_norm": 1.1060370206832886, + "learning_rate": 6.532324332024733e-06, + "loss": 0.4509, + "step": 4316 + }, + { + "epoch": 1.5765930253788571, + "grad_norm": 1.4499001502990723, + "learning_rate": 6.521722510666457e-06, + "loss": 0.5313, + "step": 4317 + }, + { + "epoch": 1.5769581887894832, + "grad_norm": 0.9172452092170715, + "learning_rate": 6.511127623341091e-06, + "loss": 0.5488, + "step": 4318 + }, + { + "epoch": 1.5773233522001096, + "grad_norm": 0.8071887493133545, + "learning_rate": 6.5005396754992885e-06, + "loss": 0.5442, + "step": 4319 + }, + { + "epoch": 1.5776885156107356, + "grad_norm": 1.0549798011779785, + "learning_rate": 6.4899586725881235e-06, + "loss": 0.5054, + "step": 4320 + }, + { + "epoch": 1.578053679021362, + "grad_norm": 1.1160976886749268, + "learning_rate": 6.479384620051103e-06, + "loss": 0.5472, + "step": 4321 + }, + { + "epoch": 1.5784188424319883, + "grad_norm": 0.7750179171562195, + "learning_rate": 6.468817523328148e-06, + "loss": 0.5519, + "step": 4322 + }, + { + "epoch": 1.5787840058426146, + "grad_norm": 0.7808592915534973, + "learning_rate": 6.4582573878555975e-06, + "loss": 0.5411, + "step": 4323 + }, + { + "epoch": 1.579149169253241, + "grad_norm": 1.0584574937820435, + "learning_rate": 6.447704219066224e-06, + "loss": 0.5027, + "step": 4324 + }, + { + "epoch": 1.5795143326638672, + "grad_norm": 1.161388874053955, + "learning_rate": 6.437158022389212e-06, + "loss": 0.4879, + "step": 4325 + }, + { + "epoch": 1.5798794960744933, + "grad_norm": 0.6971124410629272, + "learning_rate": 6.4266188032501595e-06, + "loss": 0.5498, + "step": 4326 + }, + { + "epoch": 1.5802446594851196, + "grad_norm": 0.9522584676742554, + "learning_rate": 6.4160865670710605e-06, + "loss": 0.5123, + "step": 4327 + }, + { + "epoch": 1.5806098228957457, + "grad_norm": 0.8984469175338745, + "learning_rate": 6.405561319270335e-06, + "loss": 0.4834, + "step": 4328 + }, + { + "epoch": 1.580974986306372, + "grad_norm": 0.7459402680397034, + "learning_rate": 6.395043065262798e-06, + "loss": 0.5795, + "step": 4329 + }, + { + "epoch": 1.5813401497169983, + "grad_norm": 0.7355731129646301, + "learning_rate": 6.384531810459673e-06, + "loss": 0.5507, + "step": 4330 + }, + { + "epoch": 1.5817053131276246, + "grad_norm": 0.8336747884750366, + "learning_rate": 6.3740275602685835e-06, + "loss": 0.4817, + "step": 4331 + }, + { + "epoch": 1.582070476538251, + "grad_norm": 1.0634924173355103, + "learning_rate": 6.363530320093529e-06, + "loss": 0.5401, + "step": 4332 + }, + { + "epoch": 1.5824356399488773, + "grad_norm": 0.9377459287643433, + "learning_rate": 6.353040095334931e-06, + "loss": 0.559, + "step": 4333 + }, + { + "epoch": 1.5828008033595033, + "grad_norm": 0.9540988206863403, + "learning_rate": 6.342556891389582e-06, + "loss": 0.5518, + "step": 4334 + }, + { + "epoch": 1.5831659667701297, + "grad_norm": 0.7296430468559265, + "learning_rate": 6.332080713650684e-06, + "loss": 0.5484, + "step": 4335 + }, + { + "epoch": 1.5835311301807558, + "grad_norm": 1.0052680969238281, + "learning_rate": 6.321611567507795e-06, + "loss": 0.4866, + "step": 4336 + }, + { + "epoch": 1.583896293591382, + "grad_norm": 0.9100461006164551, + "learning_rate": 6.31114945834687e-06, + "loss": 0.513, + "step": 4337 + }, + { + "epoch": 1.5842614570020084, + "grad_norm": 0.7573947906494141, + "learning_rate": 6.3006943915502506e-06, + "loss": 0.5667, + "step": 4338 + }, + { + "epoch": 1.5846266204126347, + "grad_norm": 1.0786585807800293, + "learning_rate": 6.290246372496646e-06, + "loss": 0.5424, + "step": 4339 + }, + { + "epoch": 1.584991783823261, + "grad_norm": 0.7700137495994568, + "learning_rate": 6.279805406561146e-06, + "loss": 0.5446, + "step": 4340 + }, + { + "epoch": 1.5853569472338873, + "grad_norm": 1.2569921016693115, + "learning_rate": 6.269371499115213e-06, + "loss": 0.4965, + "step": 4341 + }, + { + "epoch": 1.5857221106445134, + "grad_norm": 0.8147380352020264, + "learning_rate": 6.258944655526662e-06, + "loss": 0.5451, + "step": 4342 + }, + { + "epoch": 1.5860872740551397, + "grad_norm": 1.0732260942459106, + "learning_rate": 6.24852488115969e-06, + "loss": 0.5136, + "step": 4343 + }, + { + "epoch": 1.5864524374657658, + "grad_norm": 0.9572034478187561, + "learning_rate": 6.238112181374856e-06, + "loss": 0.5621, + "step": 4344 + }, + { + "epoch": 1.5868176008763921, + "grad_norm": 0.8697877526283264, + "learning_rate": 6.227706561529079e-06, + "loss": 0.5513, + "step": 4345 + }, + { + "epoch": 1.5871827642870184, + "grad_norm": 0.8382867574691772, + "learning_rate": 6.217308026975623e-06, + "loss": 0.5509, + "step": 4346 + }, + { + "epoch": 1.5875479276976447, + "grad_norm": 0.6258609294891357, + "learning_rate": 6.206916583064124e-06, + "loss": 0.5468, + "step": 4347 + }, + { + "epoch": 1.587913091108271, + "grad_norm": 0.8393055200576782, + "learning_rate": 6.196532235140564e-06, + "loss": 0.574, + "step": 4348 + }, + { + "epoch": 1.5882782545188974, + "grad_norm": 0.9363372325897217, + "learning_rate": 6.186154988547266e-06, + "loss": 0.5171, + "step": 4349 + }, + { + "epoch": 1.5886434179295235, + "grad_norm": 0.6817806363105774, + "learning_rate": 6.175784848622913e-06, + "loss": 0.5738, + "step": 4350 + }, + { + "epoch": 1.5890085813401496, + "grad_norm": 1.1340863704681396, + "learning_rate": 6.1654218207025285e-06, + "loss": 0.5109, + "step": 4351 + }, + { + "epoch": 1.5893737447507759, + "grad_norm": 0.8553128242492676, + "learning_rate": 6.155065910117464e-06, + "loss": 0.5279, + "step": 4352 + }, + { + "epoch": 1.5897389081614022, + "grad_norm": 0.9129517078399658, + "learning_rate": 6.144717122195425e-06, + "loss": 0.5211, + "step": 4353 + }, + { + "epoch": 1.5901040715720285, + "grad_norm": 0.9420212507247925, + "learning_rate": 6.134375462260449e-06, + "loss": 0.5727, + "step": 4354 + }, + { + "epoch": 1.5904692349826548, + "grad_norm": 0.8823181986808777, + "learning_rate": 6.124040935632913e-06, + "loss": 0.5493, + "step": 4355 + }, + { + "epoch": 1.5908343983932811, + "grad_norm": 0.8685112595558167, + "learning_rate": 6.113713547629501e-06, + "loss": 0.5165, + "step": 4356 + }, + { + "epoch": 1.5911995618039072, + "grad_norm": 0.8980146050453186, + "learning_rate": 6.103393303563245e-06, + "loss": 0.537, + "step": 4357 + }, + { + "epoch": 1.5915647252145335, + "grad_norm": 0.8472083210945129, + "learning_rate": 6.0930802087435005e-06, + "loss": 0.522, + "step": 4358 + }, + { + "epoch": 1.5919298886251596, + "grad_norm": 0.8739496469497681, + "learning_rate": 6.0827742684759375e-06, + "loss": 0.5405, + "step": 4359 + }, + { + "epoch": 1.592295052035786, + "grad_norm": 1.0169410705566406, + "learning_rate": 6.072475488062557e-06, + "loss": 0.5097, + "step": 4360 + }, + { + "epoch": 1.5926602154464122, + "grad_norm": 0.8349485397338867, + "learning_rate": 6.062183872801662e-06, + "loss": 0.5709, + "step": 4361 + }, + { + "epoch": 1.5930253788570385, + "grad_norm": 0.8705358505249023, + "learning_rate": 6.051899427987866e-06, + "loss": 0.5302, + "step": 4362 + }, + { + "epoch": 1.5933905422676649, + "grad_norm": 0.832336962223053, + "learning_rate": 6.041622158912113e-06, + "loss": 0.5589, + "step": 4363 + }, + { + "epoch": 1.5937557056782912, + "grad_norm": 1.2144277095794678, + "learning_rate": 6.031352070861645e-06, + "loss": 0.5204, + "step": 4364 + }, + { + "epoch": 1.5941208690889173, + "grad_norm": 1.0942915678024292, + "learning_rate": 6.021089169120013e-06, + "loss": 0.4832, + "step": 4365 + }, + { + "epoch": 1.5944860324995436, + "grad_norm": 0.941108226776123, + "learning_rate": 6.010833458967063e-06, + "loss": 0.5475, + "step": 4366 + }, + { + "epoch": 1.5948511959101697, + "grad_norm": 0.7738760113716125, + "learning_rate": 6.000584945678944e-06, + "loss": 0.562, + "step": 4367 + }, + { + "epoch": 1.595216359320796, + "grad_norm": 0.9169711470603943, + "learning_rate": 5.99034363452811e-06, + "loss": 0.5416, + "step": 4368 + }, + { + "epoch": 1.5955815227314223, + "grad_norm": 0.9156622886657715, + "learning_rate": 5.980109530783311e-06, + "loss": 0.5087, + "step": 4369 + }, + { + "epoch": 1.5959466861420486, + "grad_norm": 0.8764016628265381, + "learning_rate": 5.9698826397095676e-06, + "loss": 0.4941, + "step": 4370 + }, + { + "epoch": 1.596311849552675, + "grad_norm": 0.6216623783111572, + "learning_rate": 5.959662966568214e-06, + "loss": 0.5913, + "step": 4371 + }, + { + "epoch": 1.5966770129633012, + "grad_norm": 1.1773252487182617, + "learning_rate": 5.949450516616859e-06, + "loss": 0.542, + "step": 4372 + }, + { + "epoch": 1.5970421763739273, + "grad_norm": 1.039921522140503, + "learning_rate": 5.939245295109401e-06, + "loss": 0.5359, + "step": 4373 + }, + { + "epoch": 1.5974073397845536, + "grad_norm": 0.9120054244995117, + "learning_rate": 5.929047307296023e-06, + "loss": 0.5516, + "step": 4374 + }, + { + "epoch": 1.5977725031951797, + "grad_norm": 1.2057701349258423, + "learning_rate": 5.918856558423171e-06, + "loss": 0.531, + "step": 4375 + }, + { + "epoch": 1.598137666605806, + "grad_norm": 0.9319344162940979, + "learning_rate": 5.908673053733573e-06, + "loss": 0.55, + "step": 4376 + }, + { + "epoch": 1.5985028300164323, + "grad_norm": 1.064395785331726, + "learning_rate": 5.89849679846624e-06, + "loss": 0.5352, + "step": 4377 + }, + { + "epoch": 1.5988679934270587, + "grad_norm": 0.9176167845726013, + "learning_rate": 5.8883277978564434e-06, + "loss": 0.5002, + "step": 4378 + }, + { + "epoch": 1.599233156837685, + "grad_norm": 0.9245776534080505, + "learning_rate": 5.878166057135737e-06, + "loss": 0.5517, + "step": 4379 + }, + { + "epoch": 1.5995983202483113, + "grad_norm": 0.9759179353713989, + "learning_rate": 5.868011581531914e-06, + "loss": 0.4983, + "step": 4380 + }, + { + "epoch": 1.5999634836589374, + "grad_norm": 0.9405667185783386, + "learning_rate": 5.857864376269051e-06, + "loss": 0.5289, + "step": 4381 + }, + { + "epoch": 1.6003286470695637, + "grad_norm": 0.8187503814697266, + "learning_rate": 5.847724446567477e-06, + "loss": 0.4938, + "step": 4382 + }, + { + "epoch": 1.6006938104801898, + "grad_norm": 0.6253883242607117, + "learning_rate": 5.83759179764378e-06, + "loss": 0.589, + "step": 4383 + }, + { + "epoch": 1.601058973890816, + "grad_norm": 0.6693951487541199, + "learning_rate": 5.8274664347108086e-06, + "loss": 0.5475, + "step": 4384 + }, + { + "epoch": 1.6014241373014424, + "grad_norm": 1.325948715209961, + "learning_rate": 5.817348362977642e-06, + "loss": 0.487, + "step": 4385 + }, + { + "epoch": 1.6017893007120687, + "grad_norm": 0.8186267018318176, + "learning_rate": 5.807237587649625e-06, + "loss": 0.5538, + "step": 4386 + }, + { + "epoch": 1.602154464122695, + "grad_norm": 0.9162182807922363, + "learning_rate": 5.7971341139283535e-06, + "loss": 0.5602, + "step": 4387 + }, + { + "epoch": 1.6025196275333211, + "grad_norm": 1.035686731338501, + "learning_rate": 5.787037947011658e-06, + "loss": 0.5117, + "step": 4388 + }, + { + "epoch": 1.6028847909439474, + "grad_norm": 0.750034511089325, + "learning_rate": 5.7769490920936085e-06, + "loss": 0.5272, + "step": 4389 + }, + { + "epoch": 1.6032499543545735, + "grad_norm": 0.8791800141334534, + "learning_rate": 5.76686755436451e-06, + "loss": 0.5806, + "step": 4390 + }, + { + "epoch": 1.6036151177651998, + "grad_norm": 0.6303296089172363, + "learning_rate": 5.756793339010915e-06, + "loss": 0.5706, + "step": 4391 + }, + { + "epoch": 1.6039802811758261, + "grad_norm": 0.8920360207557678, + "learning_rate": 5.746726451215603e-06, + "loss": 0.5136, + "step": 4392 + }, + { + "epoch": 1.6043454445864525, + "grad_norm": 1.3762799501419067, + "learning_rate": 5.7366668961575835e-06, + "loss": 0.5153, + "step": 4393 + }, + { + "epoch": 1.6047106079970788, + "grad_norm": 1.5691697597503662, + "learning_rate": 5.726614679012099e-06, + "loss": 0.5258, + "step": 4394 + }, + { + "epoch": 1.605075771407705, + "grad_norm": 0.961821973323822, + "learning_rate": 5.716569804950603e-06, + "loss": 0.5383, + "step": 4395 + }, + { + "epoch": 1.6054409348183312, + "grad_norm": 1.1597667932510376, + "learning_rate": 5.706532279140782e-06, + "loss": 0.502, + "step": 4396 + }, + { + "epoch": 1.6058060982289575, + "grad_norm": 1.0048213005065918, + "learning_rate": 5.696502106746542e-06, + "loss": 0.5543, + "step": 4397 + }, + { + "epoch": 1.6061712616395836, + "grad_norm": 1.2439483404159546, + "learning_rate": 5.686479292928011e-06, + "loss": 0.5152, + "step": 4398 + }, + { + "epoch": 1.6065364250502099, + "grad_norm": 0.9458543062210083, + "learning_rate": 5.676463842841513e-06, + "loss": 0.5099, + "step": 4399 + }, + { + "epoch": 1.6069015884608362, + "grad_norm": 1.0276858806610107, + "learning_rate": 5.666455761639598e-06, + "loss": 0.5485, + "step": 4400 + }, + { + "epoch": 1.6072667518714625, + "grad_norm": 0.9634801149368286, + "learning_rate": 5.656455054471026e-06, + "loss": 0.5549, + "step": 4401 + }, + { + "epoch": 1.6076319152820888, + "grad_norm": 0.8646242022514343, + "learning_rate": 5.646461726480763e-06, + "loss": 0.5114, + "step": 4402 + }, + { + "epoch": 1.6079970786927151, + "grad_norm": 5.977184295654297, + "learning_rate": 5.636475782809972e-06, + "loss": 0.5429, + "step": 4403 + }, + { + "epoch": 1.6083622421033412, + "grad_norm": 0.8848747611045837, + "learning_rate": 5.6264972285960104e-06, + "loss": 0.5088, + "step": 4404 + }, + { + "epoch": 1.6087274055139675, + "grad_norm": 0.8562219142913818, + "learning_rate": 5.616526068972452e-06, + "loss": 0.5502, + "step": 4405 + }, + { + "epoch": 1.6090925689245936, + "grad_norm": 0.5673458576202393, + "learning_rate": 5.606562309069059e-06, + "loss": 0.5854, + "step": 4406 + }, + { + "epoch": 1.60945773233522, + "grad_norm": 1.1740148067474365, + "learning_rate": 5.596605954011785e-06, + "loss": 0.5512, + "step": 4407 + }, + { + "epoch": 1.6098228957458462, + "grad_norm": 0.950234055519104, + "learning_rate": 5.586657008922782e-06, + "loss": 0.4951, + "step": 4408 + }, + { + "epoch": 1.6101880591564726, + "grad_norm": 0.7697010040283203, + "learning_rate": 5.576715478920369e-06, + "loss": 0.5232, + "step": 4409 + }, + { + "epoch": 1.6105532225670989, + "grad_norm": 0.8417774438858032, + "learning_rate": 5.566781369119072e-06, + "loss": 0.5501, + "step": 4410 + }, + { + "epoch": 1.6109183859777252, + "grad_norm": 1.1219253540039062, + "learning_rate": 5.556854684629593e-06, + "loss": 0.5276, + "step": 4411 + }, + { + "epoch": 1.6112835493883513, + "grad_norm": 0.8750378489494324, + "learning_rate": 5.5469354305588175e-06, + "loss": 0.5226, + "step": 4412 + }, + { + "epoch": 1.6116487127989776, + "grad_norm": 0.8155550956726074, + "learning_rate": 5.537023612009791e-06, + "loss": 0.5524, + "step": 4413 + }, + { + "epoch": 1.6120138762096037, + "grad_norm": 1.2941229343414307, + "learning_rate": 5.527119234081752e-06, + "loss": 0.5051, + "step": 4414 + }, + { + "epoch": 1.61237903962023, + "grad_norm": 0.8032121658325195, + "learning_rate": 5.5172223018701135e-06, + "loss": 0.5461, + "step": 4415 + }, + { + "epoch": 1.6127442030308563, + "grad_norm": 0.7761710286140442, + "learning_rate": 5.5073328204664354e-06, + "loss": 0.5779, + "step": 4416 + }, + { + "epoch": 1.6131093664414826, + "grad_norm": 0.9179719686508179, + "learning_rate": 5.49745079495847e-06, + "loss": 0.5251, + "step": 4417 + }, + { + "epoch": 1.613474529852109, + "grad_norm": 1.0164270401000977, + "learning_rate": 5.487576230430123e-06, + "loss": 0.5067, + "step": 4418 + }, + { + "epoch": 1.613839693262735, + "grad_norm": 0.854945719242096, + "learning_rate": 5.477709131961453e-06, + "loss": 0.5369, + "step": 4419 + }, + { + "epoch": 1.6142048566733613, + "grad_norm": 0.7561818361282349, + "learning_rate": 5.46784950462869e-06, + "loss": 0.553, + "step": 4420 + }, + { + "epoch": 1.6145700200839874, + "grad_norm": 0.9647868871688843, + "learning_rate": 5.457997353504221e-06, + "loss": 0.5198, + "step": 4421 + }, + { + "epoch": 1.6149351834946137, + "grad_norm": 1.0278557538986206, + "learning_rate": 5.448152683656582e-06, + "loss": 0.5064, + "step": 4422 + }, + { + "epoch": 1.61530034690524, + "grad_norm": 0.8827544450759888, + "learning_rate": 5.4383155001504525e-06, + "loss": 0.541, + "step": 4423 + }, + { + "epoch": 1.6156655103158664, + "grad_norm": 0.7255877256393433, + "learning_rate": 5.428485808046677e-06, + "loss": 0.5531, + "step": 4424 + }, + { + "epoch": 1.6160306737264927, + "grad_norm": 0.8962209224700928, + "learning_rate": 5.418663612402233e-06, + "loss": 0.542, + "step": 4425 + }, + { + "epoch": 1.616395837137119, + "grad_norm": 0.8296344876289368, + "learning_rate": 5.408848918270246e-06, + "loss": 0.5864, + "step": 4426 + }, + { + "epoch": 1.616761000547745, + "grad_norm": 1.0218677520751953, + "learning_rate": 5.399041730699992e-06, + "loss": 0.5439, + "step": 4427 + }, + { + "epoch": 1.6171261639583714, + "grad_norm": 0.7651405930519104, + "learning_rate": 5.389242054736867e-06, + "loss": 0.5707, + "step": 4428 + }, + { + "epoch": 1.6174913273689975, + "grad_norm": 0.7582070231437683, + "learning_rate": 5.3794498954224085e-06, + "loss": 0.5297, + "step": 4429 + }, + { + "epoch": 1.6178564907796238, + "grad_norm": 0.7480289340019226, + "learning_rate": 5.36966525779429e-06, + "loss": 0.5734, + "step": 4430 + }, + { + "epoch": 1.61822165419025, + "grad_norm": 1.042218804359436, + "learning_rate": 5.359888146886316e-06, + "loss": 0.489, + "step": 4431 + }, + { + "epoch": 1.6185868176008764, + "grad_norm": 0.6962181329727173, + "learning_rate": 5.350118567728429e-06, + "loss": 0.5604, + "step": 4432 + }, + { + "epoch": 1.6189519810115027, + "grad_norm": 1.0039385557174683, + "learning_rate": 5.340356525346666e-06, + "loss": 0.4632, + "step": 4433 + }, + { + "epoch": 1.619317144422129, + "grad_norm": 0.6350936889648438, + "learning_rate": 5.330602024763218e-06, + "loss": 0.572, + "step": 4434 + }, + { + "epoch": 1.6196823078327551, + "grad_norm": 0.8618603348731995, + "learning_rate": 5.320855070996383e-06, + "loss": 0.5159, + "step": 4435 + }, + { + "epoch": 1.6200474712433814, + "grad_norm": 0.776425302028656, + "learning_rate": 5.311115669060576e-06, + "loss": 0.5402, + "step": 4436 + }, + { + "epoch": 1.6204126346540075, + "grad_norm": 0.9595008492469788, + "learning_rate": 5.3013838239663325e-06, + "loss": 0.5249, + "step": 4437 + }, + { + "epoch": 1.6207777980646338, + "grad_norm": 1.2430531978607178, + "learning_rate": 5.291659540720289e-06, + "loss": 0.495, + "step": 4438 + }, + { + "epoch": 1.6211429614752602, + "grad_norm": 0.8708037734031677, + "learning_rate": 5.281942824325204e-06, + "loss": 0.5486, + "step": 4439 + }, + { + "epoch": 1.6215081248858865, + "grad_norm": 0.7520557045936584, + "learning_rate": 5.272233679779934e-06, + "loss": 0.5284, + "step": 4440 + }, + { + "epoch": 1.6218732882965128, + "grad_norm": 0.847500205039978, + "learning_rate": 5.262532112079455e-06, + "loss": 0.5211, + "step": 4441 + }, + { + "epoch": 1.622238451707139, + "grad_norm": 0.9707865715026855, + "learning_rate": 5.252838126214827e-06, + "loss": 0.5611, + "step": 4442 + }, + { + "epoch": 1.6226036151177652, + "grad_norm": 0.8329001069068909, + "learning_rate": 5.2431517271732106e-06, + "loss": 0.5207, + "step": 4443 + }, + { + "epoch": 1.6229687785283915, + "grad_norm": 0.8859490752220154, + "learning_rate": 5.233472919937872e-06, + "loss": 0.5468, + "step": 4444 + }, + { + "epoch": 1.6233339419390176, + "grad_norm": 0.9074608087539673, + "learning_rate": 5.2238017094881765e-06, + "loss": 0.5367, + "step": 4445 + }, + { + "epoch": 1.623699105349644, + "grad_norm": 0.6423451900482178, + "learning_rate": 5.214138100799573e-06, + "loss": 0.5806, + "step": 4446 + }, + { + "epoch": 1.6240642687602702, + "grad_norm": 0.9432792067527771, + "learning_rate": 5.2044820988435906e-06, + "loss": 0.5418, + "step": 4447 + }, + { + "epoch": 1.6244294321708965, + "grad_norm": 1.0792163610458374, + "learning_rate": 5.194833708587863e-06, + "loss": 0.4911, + "step": 4448 + }, + { + "epoch": 1.6247945955815228, + "grad_norm": 1.0764150619506836, + "learning_rate": 5.185192934996097e-06, + "loss": 0.4913, + "step": 4449 + }, + { + "epoch": 1.6251597589921492, + "grad_norm": 0.9995002746582031, + "learning_rate": 5.175559783028084e-06, + "loss": 0.5176, + "step": 4450 + }, + { + "epoch": 1.6255249224027752, + "grad_norm": 0.9153430461883545, + "learning_rate": 5.165934257639702e-06, + "loss": 0.5353, + "step": 4451 + }, + { + "epoch": 1.6258900858134013, + "grad_norm": 0.9202440977096558, + "learning_rate": 5.156316363782885e-06, + "loss": 0.5455, + "step": 4452 + }, + { + "epoch": 1.6262552492240276, + "grad_norm": 0.7982866168022156, + "learning_rate": 5.146706106405657e-06, + "loss": 0.5364, + "step": 4453 + }, + { + "epoch": 1.626620412634654, + "grad_norm": 1.1405035257339478, + "learning_rate": 5.137103490452113e-06, + "loss": 0.4984, + "step": 4454 + }, + { + "epoch": 1.6269855760452803, + "grad_norm": 1.1086018085479736, + "learning_rate": 5.1275085208624185e-06, + "loss": 0.5183, + "step": 4455 + }, + { + "epoch": 1.6273507394559066, + "grad_norm": 0.6876553893089294, + "learning_rate": 5.1179212025727935e-06, + "loss": 0.5603, + "step": 4456 + }, + { + "epoch": 1.627715902866533, + "grad_norm": 0.7829571962356567, + "learning_rate": 5.108341540515522e-06, + "loss": 0.5641, + "step": 4457 + }, + { + "epoch": 1.628081066277159, + "grad_norm": 0.8676231503486633, + "learning_rate": 5.098769539618964e-06, + "loss": 0.5289, + "step": 4458 + }, + { + "epoch": 1.6284462296877853, + "grad_norm": 1.0025681257247925, + "learning_rate": 5.08920520480753e-06, + "loss": 0.5165, + "step": 4459 + }, + { + "epoch": 1.6288113930984114, + "grad_norm": 0.9066442847251892, + "learning_rate": 5.0796485410016825e-06, + "loss": 0.543, + "step": 4460 + }, + { + "epoch": 1.6291765565090377, + "grad_norm": 1.3929529190063477, + "learning_rate": 5.070099553117953e-06, + "loss": 0.5398, + "step": 4461 + }, + { + "epoch": 1.629541719919664, + "grad_norm": 1.1188859939575195, + "learning_rate": 5.060558246068897e-06, + "loss": 0.536, + "step": 4462 + }, + { + "epoch": 1.6299068833302903, + "grad_norm": 0.7350136637687683, + "learning_rate": 5.0510246247631385e-06, + "loss": 0.5649, + "step": 4463 + }, + { + "epoch": 1.6302720467409166, + "grad_norm": 1.0863380432128906, + "learning_rate": 5.041498694105349e-06, + "loss": 0.4687, + "step": 4464 + }, + { + "epoch": 1.630637210151543, + "grad_norm": 0.8181613683700562, + "learning_rate": 5.03198045899624e-06, + "loss": 0.5439, + "step": 4465 + }, + { + "epoch": 1.631002373562169, + "grad_norm": 0.9987273216247559, + "learning_rate": 5.022469924332547e-06, + "loss": 0.5333, + "step": 4466 + }, + { + "epoch": 1.6313675369727954, + "grad_norm": 0.9395421147346497, + "learning_rate": 5.012967095007068e-06, + "loss": 0.5405, + "step": 4467 + }, + { + "epoch": 1.6317327003834214, + "grad_norm": 0.8800456523895264, + "learning_rate": 5.0034719759086335e-06, + "loss": 0.5277, + "step": 4468 + }, + { + "epoch": 1.6320978637940478, + "grad_norm": 0.8373021483421326, + "learning_rate": 4.993984571922086e-06, + "loss": 0.5268, + "step": 4469 + }, + { + "epoch": 1.632463027204674, + "grad_norm": 1.0338077545166016, + "learning_rate": 4.984504887928325e-06, + "loss": 0.5418, + "step": 4470 + }, + { + "epoch": 1.6328281906153004, + "grad_norm": 0.9462342262268066, + "learning_rate": 4.975032928804269e-06, + "loss": 0.5069, + "step": 4471 + }, + { + "epoch": 1.6331933540259267, + "grad_norm": 1.029945731163025, + "learning_rate": 4.965568699422851e-06, + "loss": 0.5525, + "step": 4472 + }, + { + "epoch": 1.633558517436553, + "grad_norm": 0.8257748484611511, + "learning_rate": 4.956112204653043e-06, + "loss": 0.5594, + "step": 4473 + }, + { + "epoch": 1.633923680847179, + "grad_norm": 0.9749363660812378, + "learning_rate": 4.946663449359834e-06, + "loss": 0.4737, + "step": 4474 + }, + { + "epoch": 1.6342888442578054, + "grad_norm": 0.9025813341140747, + "learning_rate": 4.937222438404232e-06, + "loss": 0.5493, + "step": 4475 + }, + { + "epoch": 1.6346540076684315, + "grad_norm": 1.1877425909042358, + "learning_rate": 4.927789176643247e-06, + "loss": 0.5672, + "step": 4476 + }, + { + "epoch": 1.6350191710790578, + "grad_norm": 0.843192458152771, + "learning_rate": 4.918363668929922e-06, + "loss": 0.5676, + "step": 4477 + }, + { + "epoch": 1.6353843344896841, + "grad_norm": 0.840177595615387, + "learning_rate": 4.908945920113299e-06, + "loss": 0.5505, + "step": 4478 + }, + { + "epoch": 1.6357494979003104, + "grad_norm": 0.7071192860603333, + "learning_rate": 4.899535935038436e-06, + "loss": 0.5436, + "step": 4479 + }, + { + "epoch": 1.6361146613109367, + "grad_norm": 0.8813037872314453, + "learning_rate": 4.890133718546395e-06, + "loss": 0.5706, + "step": 4480 + }, + { + "epoch": 1.636479824721563, + "grad_norm": 0.8635373711585999, + "learning_rate": 4.880739275474229e-06, + "loss": 0.5175, + "step": 4481 + }, + { + "epoch": 1.6368449881321891, + "grad_norm": 0.9497634768486023, + "learning_rate": 4.8713526106550134e-06, + "loss": 0.5266, + "step": 4482 + }, + { + "epoch": 1.6372101515428155, + "grad_norm": 0.8595808148384094, + "learning_rate": 4.861973728917799e-06, + "loss": 0.5293, + "step": 4483 + }, + { + "epoch": 1.6375753149534416, + "grad_norm": 1.2146114110946655, + "learning_rate": 4.852602635087651e-06, + "loss": 0.4702, + "step": 4484 + }, + { + "epoch": 1.6379404783640679, + "grad_norm": 0.818753182888031, + "learning_rate": 4.843239333985625e-06, + "loss": 0.5299, + "step": 4485 + }, + { + "epoch": 1.6383056417746942, + "grad_norm": 1.060773491859436, + "learning_rate": 4.833883830428754e-06, + "loss": 0.5176, + "step": 4486 + }, + { + "epoch": 1.6386708051853205, + "grad_norm": 0.8212208151817322, + "learning_rate": 4.8245361292300705e-06, + "loss": 0.5612, + "step": 4487 + }, + { + "epoch": 1.6390359685959468, + "grad_norm": 0.8592978119850159, + "learning_rate": 4.815196235198598e-06, + "loss": 0.5427, + "step": 4488 + }, + { + "epoch": 1.639401132006573, + "grad_norm": 1.0273016691207886, + "learning_rate": 4.805864153139339e-06, + "loss": 0.5248, + "step": 4489 + }, + { + "epoch": 1.6397662954171992, + "grad_norm": 0.8871325254440308, + "learning_rate": 4.796539887853266e-06, + "loss": 0.5452, + "step": 4490 + }, + { + "epoch": 1.6401314588278253, + "grad_norm": 0.8996334075927734, + "learning_rate": 4.7872234441373434e-06, + "loss": 0.5277, + "step": 4491 + }, + { + "epoch": 1.6404966222384516, + "grad_norm": 0.8074132204055786, + "learning_rate": 4.7779148267845065e-06, + "loss": 0.5553, + "step": 4492 + }, + { + "epoch": 1.640861785649078, + "grad_norm": 1.0954314470291138, + "learning_rate": 4.768614040583668e-06, + "loss": 0.5074, + "step": 4493 + }, + { + "epoch": 1.6412269490597042, + "grad_norm": 0.7810633778572083, + "learning_rate": 4.7593210903197155e-06, + "loss": 0.5637, + "step": 4494 + }, + { + "epoch": 1.6415921124703305, + "grad_norm": 1.101777195930481, + "learning_rate": 4.750035980773488e-06, + "loss": 0.5187, + "step": 4495 + }, + { + "epoch": 1.6419572758809569, + "grad_norm": 0.7545532584190369, + "learning_rate": 4.740758716721803e-06, + "loss": 0.5423, + "step": 4496 + }, + { + "epoch": 1.642322439291583, + "grad_norm": 0.9766955375671387, + "learning_rate": 4.731489302937442e-06, + "loss": 0.5436, + "step": 4497 + }, + { + "epoch": 1.6426876027022093, + "grad_norm": 0.7398218512535095, + "learning_rate": 4.722227744189148e-06, + "loss": 0.5809, + "step": 4498 + }, + { + "epoch": 1.6430527661128354, + "grad_norm": 0.7854627966880798, + "learning_rate": 4.712974045241625e-06, + "loss": 0.507, + "step": 4499 + }, + { + "epoch": 1.6434179295234617, + "grad_norm": 1.1670691967010498, + "learning_rate": 4.70372821085552e-06, + "loss": 0.5069, + "step": 4500 + }, + { + "epoch": 1.643783092934088, + "grad_norm": 0.7382610440254211, + "learning_rate": 4.694490245787451e-06, + "loss": 0.5783, + "step": 4501 + }, + { + "epoch": 1.6441482563447143, + "grad_norm": 0.8627193570137024, + "learning_rate": 4.685260154789979e-06, + "loss": 0.53, + "step": 4502 + }, + { + "epoch": 1.6445134197553406, + "grad_norm": 1.0902482271194458, + "learning_rate": 4.676037942611613e-06, + "loss": 0.5189, + "step": 4503 + }, + { + "epoch": 1.644878583165967, + "grad_norm": 0.8628425598144531, + "learning_rate": 4.6668236139968205e-06, + "loss": 0.5364, + "step": 4504 + }, + { + "epoch": 1.645243746576593, + "grad_norm": 0.9813662171363831, + "learning_rate": 4.657617173685989e-06, + "loss": 0.493, + "step": 4505 + }, + { + "epoch": 1.6456089099872193, + "grad_norm": 0.723275899887085, + "learning_rate": 4.648418626415472e-06, + "loss": 0.5497, + "step": 4506 + }, + { + "epoch": 1.6459740733978454, + "grad_norm": 0.9946385622024536, + "learning_rate": 4.63922797691755e-06, + "loss": 0.5436, + "step": 4507 + }, + { + "epoch": 1.6463392368084717, + "grad_norm": 1.0270251035690308, + "learning_rate": 4.63004522992045e-06, + "loss": 0.5001, + "step": 4508 + }, + { + "epoch": 1.646704400219098, + "grad_norm": 0.974694013595581, + "learning_rate": 4.620870390148322e-06, + "loss": 0.5034, + "step": 4509 + }, + { + "epoch": 1.6470695636297243, + "grad_norm": 0.8969122767448425, + "learning_rate": 4.611703462321246e-06, + "loss": 0.5258, + "step": 4510 + }, + { + "epoch": 1.6474347270403507, + "grad_norm": 0.906024694442749, + "learning_rate": 4.602544451155247e-06, + "loss": 0.5309, + "step": 4511 + }, + { + "epoch": 1.647799890450977, + "grad_norm": 0.7742956876754761, + "learning_rate": 4.593393361362264e-06, + "loss": 0.5596, + "step": 4512 + }, + { + "epoch": 1.648165053861603, + "grad_norm": 0.8088904023170471, + "learning_rate": 4.584250197650169e-06, + "loss": 0.5823, + "step": 4513 + }, + { + "epoch": 1.6485302172722294, + "grad_norm": 0.8699324727058411, + "learning_rate": 4.575114964722758e-06, + "loss": 0.5165, + "step": 4514 + }, + { + "epoch": 1.6488953806828555, + "grad_norm": 0.6912913918495178, + "learning_rate": 4.565987667279728e-06, + "loss": 0.5538, + "step": 4515 + }, + { + "epoch": 1.6492605440934818, + "grad_norm": 1.3685637712478638, + "learning_rate": 4.556868310016715e-06, + "loss": 0.5148, + "step": 4516 + }, + { + "epoch": 1.649625707504108, + "grad_norm": 1.088139533996582, + "learning_rate": 4.547756897625264e-06, + "loss": 0.5196, + "step": 4517 + }, + { + "epoch": 1.6499908709147344, + "grad_norm": 0.9013568758964539, + "learning_rate": 4.538653434792833e-06, + "loss": 0.5044, + "step": 4518 + }, + { + "epoch": 1.6503560343253607, + "grad_norm": 0.8035263419151306, + "learning_rate": 4.529557926202781e-06, + "loss": 0.5621, + "step": 4519 + }, + { + "epoch": 1.6507211977359868, + "grad_norm": 0.770159900188446, + "learning_rate": 4.520470376534385e-06, + "loss": 0.5474, + "step": 4520 + }, + { + "epoch": 1.6510863611466131, + "grad_norm": 0.6742453575134277, + "learning_rate": 4.511390790462824e-06, + "loss": 0.5527, + "step": 4521 + }, + { + "epoch": 1.6514515245572392, + "grad_norm": 0.9856719374656677, + "learning_rate": 4.502319172659189e-06, + "loss": 0.5764, + "step": 4522 + }, + { + "epoch": 1.6518166879678655, + "grad_norm": 1.1472326517105103, + "learning_rate": 4.49325552779045e-06, + "loss": 0.5225, + "step": 4523 + }, + { + "epoch": 1.6521818513784918, + "grad_norm": 1.3834545612335205, + "learning_rate": 4.484199860519502e-06, + "loss": 0.5, + "step": 4524 + }, + { + "epoch": 1.6525470147891181, + "grad_norm": 1.005962610244751, + "learning_rate": 4.475152175505108e-06, + "loss": 0.5237, + "step": 4525 + }, + { + "epoch": 1.6529121781997445, + "grad_norm": 0.7523673176765442, + "learning_rate": 4.466112477401949e-06, + "loss": 0.5524, + "step": 4526 + }, + { + "epoch": 1.6532773416103708, + "grad_norm": 0.7982600331306458, + "learning_rate": 4.4570807708605825e-06, + "loss": 0.5645, + "step": 4527 + }, + { + "epoch": 1.6536425050209969, + "grad_norm": 0.8677096366882324, + "learning_rate": 4.448057060527466e-06, + "loss": 0.4809, + "step": 4528 + }, + { + "epoch": 1.6540076684316232, + "grad_norm": 0.8809179663658142, + "learning_rate": 4.439041351044926e-06, + "loss": 0.4971, + "step": 4529 + }, + { + "epoch": 1.6543728318422493, + "grad_norm": 0.8608189225196838, + "learning_rate": 4.430033647051191e-06, + "loss": 0.5663, + "step": 4530 + }, + { + "epoch": 1.6547379952528756, + "grad_norm": 1.1307836771011353, + "learning_rate": 4.421033953180358e-06, + "loss": 0.5461, + "step": 4531 + }, + { + "epoch": 1.6551031586635019, + "grad_norm": 0.8868815302848816, + "learning_rate": 4.412042274062415e-06, + "loss": 0.5142, + "step": 4532 + }, + { + "epoch": 1.6554683220741282, + "grad_norm": 1.160722017288208, + "learning_rate": 4.4030586143232145e-06, + "loss": 0.5339, + "step": 4533 + }, + { + "epoch": 1.6558334854847545, + "grad_norm": 0.7950741052627563, + "learning_rate": 4.394082978584488e-06, + "loss": 0.561, + "step": 4534 + }, + { + "epoch": 1.6561986488953808, + "grad_norm": 0.7816884517669678, + "learning_rate": 4.38511537146385e-06, + "loss": 0.5635, + "step": 4535 + }, + { + "epoch": 1.656563812306007, + "grad_norm": 1.2029130458831787, + "learning_rate": 4.376155797574761e-06, + "loss": 0.5617, + "step": 4536 + }, + { + "epoch": 1.6569289757166332, + "grad_norm": 0.8650860786437988, + "learning_rate": 4.367204261526568e-06, + "loss": 0.4732, + "step": 4537 + }, + { + "epoch": 1.6572941391272593, + "grad_norm": 0.8647130131721497, + "learning_rate": 4.358260767924482e-06, + "loss": 0.4763, + "step": 4538 + }, + { + "epoch": 1.6576593025378856, + "grad_norm": 1.0285489559173584, + "learning_rate": 4.349325321369564e-06, + "loss": 0.4862, + "step": 4539 + }, + { + "epoch": 1.658024465948512, + "grad_norm": 1.072596788406372, + "learning_rate": 4.340397926458744e-06, + "loss": 0.5381, + "step": 4540 + }, + { + "epoch": 1.6583896293591383, + "grad_norm": 0.9929602146148682, + "learning_rate": 4.331478587784809e-06, + "loss": 0.5239, + "step": 4541 + }, + { + "epoch": 1.6587547927697646, + "grad_norm": 0.8808034658432007, + "learning_rate": 4.32256730993641e-06, + "loss": 0.4933, + "step": 4542 + }, + { + "epoch": 1.6591199561803909, + "grad_norm": 0.9162267446517944, + "learning_rate": 4.313664097498027e-06, + "loss": 0.5379, + "step": 4543 + }, + { + "epoch": 1.659485119591017, + "grad_norm": 0.8565149903297424, + "learning_rate": 4.304768955050014e-06, + "loss": 0.5077, + "step": 4544 + }, + { + "epoch": 1.6598502830016433, + "grad_norm": 1.352622389793396, + "learning_rate": 4.29588188716856e-06, + "loss": 0.5193, + "step": 4545 + }, + { + "epoch": 1.6602154464122694, + "grad_norm": 1.0091907978057861, + "learning_rate": 4.287002898425709e-06, + "loss": 0.5691, + "step": 4546 + }, + { + "epoch": 1.6605806098228957, + "grad_norm": 0.9868928790092468, + "learning_rate": 4.2781319933893515e-06, + "loss": 0.5322, + "step": 4547 + }, + { + "epoch": 1.660945773233522, + "grad_norm": 0.8290953636169434, + "learning_rate": 4.269269176623203e-06, + "loss": 0.5705, + "step": 4548 + }, + { + "epoch": 1.6613109366441483, + "grad_norm": 0.9042468667030334, + "learning_rate": 4.260414452686821e-06, + "loss": 0.552, + "step": 4549 + }, + { + "epoch": 1.6616761000547746, + "grad_norm": 0.8311589360237122, + "learning_rate": 4.251567826135614e-06, + "loss": 0.5229, + "step": 4550 + }, + { + "epoch": 1.662041263465401, + "grad_norm": 0.7332230806350708, + "learning_rate": 4.242729301520816e-06, + "loss": 0.539, + "step": 4551 + }, + { + "epoch": 1.662406426876027, + "grad_norm": 0.7751251459121704, + "learning_rate": 4.233898883389496e-06, + "loss": 0.5619, + "step": 4552 + }, + { + "epoch": 1.6627715902866531, + "grad_norm": 0.696730375289917, + "learning_rate": 4.225076576284541e-06, + "loss": 0.5615, + "step": 4553 + }, + { + "epoch": 1.6631367536972794, + "grad_norm": 1.0118292570114136, + "learning_rate": 4.2162623847446806e-06, + "loss": 0.5767, + "step": 4554 + }, + { + "epoch": 1.6635019171079057, + "grad_norm": 0.9420759081840515, + "learning_rate": 4.207456313304461e-06, + "loss": 0.5393, + "step": 4555 + }, + { + "epoch": 1.663867080518532, + "grad_norm": 0.7974475622177124, + "learning_rate": 4.198658366494252e-06, + "loss": 0.5333, + "step": 4556 + }, + { + "epoch": 1.6642322439291584, + "grad_norm": 0.912028968334198, + "learning_rate": 4.189868548840253e-06, + "loss": 0.5154, + "step": 4557 + }, + { + "epoch": 1.6645974073397847, + "grad_norm": 1.3836195468902588, + "learning_rate": 4.181086864864457e-06, + "loss": 0.4968, + "step": 4558 + }, + { + "epoch": 1.6649625707504108, + "grad_norm": 1.2628008127212524, + "learning_rate": 4.172313319084695e-06, + "loss": 0.5497, + "step": 4559 + }, + { + "epoch": 1.665327734161037, + "grad_norm": 0.8643813729286194, + "learning_rate": 4.163547916014605e-06, + "loss": 0.5334, + "step": 4560 + }, + { + "epoch": 1.6656928975716632, + "grad_norm": 0.8692271113395691, + "learning_rate": 4.154790660163641e-06, + "loss": 0.5178, + "step": 4561 + }, + { + "epoch": 1.6660580609822895, + "grad_norm": 0.7350249290466309, + "learning_rate": 4.1460415560370545e-06, + "loss": 0.5498, + "step": 4562 + }, + { + "epoch": 1.6664232243929158, + "grad_norm": 0.8751364350318909, + "learning_rate": 4.137300608135901e-06, + "loss": 0.4749, + "step": 4563 + }, + { + "epoch": 1.666788387803542, + "grad_norm": 0.7478228211402893, + "learning_rate": 4.128567820957054e-06, + "loss": 0.5363, + "step": 4564 + }, + { + "epoch": 1.6671535512141684, + "grad_norm": 1.0465996265411377, + "learning_rate": 4.119843198993185e-06, + "loss": 0.4948, + "step": 4565 + }, + { + "epoch": 1.6675187146247947, + "grad_norm": 1.0394774675369263, + "learning_rate": 4.111126746732756e-06, + "loss": 0.5178, + "step": 4566 + }, + { + "epoch": 1.6678838780354208, + "grad_norm": 0.839904248714447, + "learning_rate": 4.102418468660041e-06, + "loss": 0.5305, + "step": 4567 + }, + { + "epoch": 1.6682490414460471, + "grad_norm": 0.7755999565124512, + "learning_rate": 4.0937183692550885e-06, + "loss": 0.5279, + "step": 4568 + }, + { + "epoch": 1.6686142048566732, + "grad_norm": 1.0791418552398682, + "learning_rate": 4.0850264529937565e-06, + "loss": 0.5492, + "step": 4569 + }, + { + "epoch": 1.6689793682672995, + "grad_norm": 0.7471647262573242, + "learning_rate": 4.076342724347686e-06, + "loss": 0.5443, + "step": 4570 + }, + { + "epoch": 1.6693445316779258, + "grad_norm": 1.0776115655899048, + "learning_rate": 4.067667187784312e-06, + "loss": 0.492, + "step": 4571 + }, + { + "epoch": 1.6697096950885522, + "grad_norm": 0.9209566712379456, + "learning_rate": 4.0589998477668405e-06, + "loss": 0.5322, + "step": 4572 + }, + { + "epoch": 1.6700748584991785, + "grad_norm": 0.9458308219909668, + "learning_rate": 4.050340708754274e-06, + "loss": 0.5481, + "step": 4573 + }, + { + "epoch": 1.6704400219098048, + "grad_norm": 0.8129193782806396, + "learning_rate": 4.041689775201394e-06, + "loss": 0.5698, + "step": 4574 + }, + { + "epoch": 1.6708051853204309, + "grad_norm": 0.7496057748794556, + "learning_rate": 4.03304705155876e-06, + "loss": 0.5632, + "step": 4575 + }, + { + "epoch": 1.6711703487310572, + "grad_norm": 0.9546471238136292, + "learning_rate": 4.024412542272706e-06, + "loss": 0.5239, + "step": 4576 + }, + { + "epoch": 1.6715355121416833, + "grad_norm": 0.9323283433914185, + "learning_rate": 4.015786251785334e-06, + "loss": 0.4836, + "step": 4577 + }, + { + "epoch": 1.6719006755523096, + "grad_norm": 0.8260817527770996, + "learning_rate": 4.007168184534529e-06, + "loss": 0.5256, + "step": 4578 + }, + { + "epoch": 1.672265838962936, + "grad_norm": 0.7987567782402039, + "learning_rate": 3.99855834495394e-06, + "loss": 0.5239, + "step": 4579 + }, + { + "epoch": 1.6726310023735622, + "grad_norm": 0.9163694977760315, + "learning_rate": 3.989956737472984e-06, + "loss": 0.5173, + "step": 4580 + }, + { + "epoch": 1.6729961657841885, + "grad_norm": 0.5946126580238342, + "learning_rate": 3.98136336651685e-06, + "loss": 0.5784, + "step": 4581 + }, + { + "epoch": 1.6733613291948148, + "grad_norm": 0.780558168888092, + "learning_rate": 3.9727782365064695e-06, + "loss": 0.5597, + "step": 4582 + }, + { + "epoch": 1.673726492605441, + "grad_norm": 0.9302628040313721, + "learning_rate": 3.9642013518585564e-06, + "loss": 0.5172, + "step": 4583 + }, + { + "epoch": 1.6740916560160672, + "grad_norm": 0.8205109238624573, + "learning_rate": 3.955632716985569e-06, + "loss": 0.535, + "step": 4584 + }, + { + "epoch": 1.6744568194266933, + "grad_norm": 1.1331835985183716, + "learning_rate": 3.947072336295734e-06, + "loss": 0.5323, + "step": 4585 + }, + { + "epoch": 1.6748219828373196, + "grad_norm": 0.8803338408470154, + "learning_rate": 3.938520214193014e-06, + "loss": 0.5562, + "step": 4586 + }, + { + "epoch": 1.675187146247946, + "grad_norm": 0.8349100351333618, + "learning_rate": 3.929976355077134e-06, + "loss": 0.5216, + "step": 4587 + }, + { + "epoch": 1.6755523096585723, + "grad_norm": 1.0220751762390137, + "learning_rate": 3.921440763343578e-06, + "loss": 0.5576, + "step": 4588 + }, + { + "epoch": 1.6759174730691986, + "grad_norm": 0.9241355657577515, + "learning_rate": 3.91291344338355e-06, + "loss": 0.5306, + "step": 4589 + }, + { + "epoch": 1.6762826364798247, + "grad_norm": 0.9509197473526001, + "learning_rate": 3.90439439958402e-06, + "loss": 0.5306, + "step": 4590 + }, + { + "epoch": 1.676647799890451, + "grad_norm": 0.9647827744483948, + "learning_rate": 3.8958836363277016e-06, + "loss": 0.4928, + "step": 4591 + }, + { + "epoch": 1.677012963301077, + "grad_norm": 0.9714269042015076, + "learning_rate": 3.887381157993029e-06, + "loss": 0.4742, + "step": 4592 + }, + { + "epoch": 1.6773781267117034, + "grad_norm": 0.8928794860839844, + "learning_rate": 3.87888696895419e-06, + "loss": 0.5338, + "step": 4593 + }, + { + "epoch": 1.6777432901223297, + "grad_norm": 0.8356234431266785, + "learning_rate": 3.870401073581107e-06, + "loss": 0.5305, + "step": 4594 + }, + { + "epoch": 1.678108453532956, + "grad_norm": 0.7654070854187012, + "learning_rate": 3.861923476239435e-06, + "loss": 0.5637, + "step": 4595 + }, + { + "epoch": 1.6784736169435823, + "grad_norm": 0.6427393555641174, + "learning_rate": 3.85345418129055e-06, + "loss": 0.5503, + "step": 4596 + }, + { + "epoch": 1.6788387803542086, + "grad_norm": 0.7948646545410156, + "learning_rate": 3.844993193091568e-06, + "loss": 0.525, + "step": 4597 + }, + { + "epoch": 1.6792039437648347, + "grad_norm": 0.9657595753669739, + "learning_rate": 3.8365405159953265e-06, + "loss": 0.5482, + "step": 4598 + }, + { + "epoch": 1.679569107175461, + "grad_norm": 0.7931142449378967, + "learning_rate": 3.828096154350391e-06, + "loss": 0.5335, + "step": 4599 + }, + { + "epoch": 1.6799342705860871, + "grad_norm": 0.8278718590736389, + "learning_rate": 3.819660112501053e-06, + "loss": 0.5414, + "step": 4600 + }, + { + "epoch": 1.6802994339967134, + "grad_norm": 0.8467036485671997, + "learning_rate": 3.811232394787303e-06, + "loss": 0.5532, + "step": 4601 + }, + { + "epoch": 1.6806645974073398, + "grad_norm": 0.992175817489624, + "learning_rate": 3.802813005544879e-06, + "loss": 0.5353, + "step": 4602 + }, + { + "epoch": 1.681029760817966, + "grad_norm": 1.0147136449813843, + "learning_rate": 3.7944019491052043e-06, + "loss": 0.5217, + "step": 4603 + }, + { + "epoch": 1.6813949242285924, + "grad_norm": 0.8257679343223572, + "learning_rate": 3.7859992297954363e-06, + "loss": 0.5546, + "step": 4604 + }, + { + "epoch": 1.6817600876392187, + "grad_norm": 0.876408576965332, + "learning_rate": 3.7776048519384413e-06, + "loss": 0.5532, + "step": 4605 + }, + { + "epoch": 1.6821252510498448, + "grad_norm": 0.9336234331130981, + "learning_rate": 3.7692188198527822e-06, + "loss": 0.5084, + "step": 4606 + }, + { + "epoch": 1.682490414460471, + "grad_norm": 0.8870642185211182, + "learning_rate": 3.76084113785274e-06, + "loss": 0.4638, + "step": 4607 + }, + { + "epoch": 1.6828555778710972, + "grad_norm": 0.7216123938560486, + "learning_rate": 3.7524718102482947e-06, + "loss": 0.5434, + "step": 4608 + }, + { + "epoch": 1.6832207412817235, + "grad_norm": 0.9200725555419922, + "learning_rate": 3.74411084134513e-06, + "loss": 0.5232, + "step": 4609 + }, + { + "epoch": 1.6835859046923498, + "grad_norm": 0.7329199314117432, + "learning_rate": 3.7357582354446352e-06, + "loss": 0.5414, + "step": 4610 + }, + { + "epoch": 1.6839510681029761, + "grad_norm": 1.110066294670105, + "learning_rate": 3.7274139968438782e-06, + "loss": 0.5015, + "step": 4611 + }, + { + "epoch": 1.6843162315136024, + "grad_norm": 0.9416675567626953, + "learning_rate": 3.7190781298356428e-06, + "loss": 0.5309, + "step": 4612 + }, + { + "epoch": 1.6846813949242287, + "grad_norm": 1.218187928199768, + "learning_rate": 3.710750638708398e-06, + "loss": 0.5042, + "step": 4613 + }, + { + "epoch": 1.6850465583348548, + "grad_norm": 1.1069811582565308, + "learning_rate": 3.7024315277463064e-06, + "loss": 0.5276, + "step": 4614 + }, + { + "epoch": 1.6854117217454812, + "grad_norm": 0.7823757529258728, + "learning_rate": 3.694120801229213e-06, + "loss": 0.5285, + "step": 4615 + }, + { + "epoch": 1.6857768851561072, + "grad_norm": 0.7776532173156738, + "learning_rate": 3.685818463432649e-06, + "loss": 0.5238, + "step": 4616 + }, + { + "epoch": 1.6861420485667336, + "grad_norm": 0.7341038584709167, + "learning_rate": 3.6775245186278375e-06, + "loss": 0.5586, + "step": 4617 + }, + { + "epoch": 1.6865072119773599, + "grad_norm": 0.9906373620033264, + "learning_rate": 3.669238971081681e-06, + "loss": 0.568, + "step": 4618 + }, + { + "epoch": 1.6868723753879862, + "grad_norm": 1.0237858295440674, + "learning_rate": 3.6609618250567657e-06, + "loss": 0.5143, + "step": 4619 + }, + { + "epoch": 1.6872375387986125, + "grad_norm": 0.6446452140808105, + "learning_rate": 3.652693084811343e-06, + "loss": 0.5756, + "step": 4620 + }, + { + "epoch": 1.6876027022092386, + "grad_norm": 0.7709906697273254, + "learning_rate": 3.6444327545993497e-06, + "loss": 0.5422, + "step": 4621 + }, + { + "epoch": 1.687967865619865, + "grad_norm": 0.8992085456848145, + "learning_rate": 3.636180838670398e-06, + "loss": 0.5671, + "step": 4622 + }, + { + "epoch": 1.688333029030491, + "grad_norm": 0.9192847013473511, + "learning_rate": 3.627937341269765e-06, + "loss": 0.523, + "step": 4623 + }, + { + "epoch": 1.6886981924411173, + "grad_norm": 0.9154180884361267, + "learning_rate": 3.619702266638405e-06, + "loss": 0.5198, + "step": 4624 + }, + { + "epoch": 1.6890633558517436, + "grad_norm": 0.960006594657898, + "learning_rate": 3.611475619012923e-06, + "loss": 0.5456, + "step": 4625 + }, + { + "epoch": 1.68942851926237, + "grad_norm": 1.1184638738632202, + "learning_rate": 3.603257402625604e-06, + "loss": 0.4909, + "step": 4626 + }, + { + "epoch": 1.6897936826729962, + "grad_norm": 1.1449368000030518, + "learning_rate": 3.5950476217043928e-06, + "loss": 0.5238, + "step": 4627 + }, + { + "epoch": 1.6901588460836225, + "grad_norm": 0.9218161702156067, + "learning_rate": 3.5868462804728933e-06, + "loss": 0.5313, + "step": 4628 + }, + { + "epoch": 1.6905240094942486, + "grad_norm": 0.8735278844833374, + "learning_rate": 3.5786533831503654e-06, + "loss": 0.5478, + "step": 4629 + }, + { + "epoch": 1.690889172904875, + "grad_norm": 0.8266289830207825, + "learning_rate": 3.5704689339517184e-06, + "loss": 0.529, + "step": 4630 + }, + { + "epoch": 1.691254336315501, + "grad_norm": 0.8460071682929993, + "learning_rate": 3.56229293708753e-06, + "loss": 0.5178, + "step": 4631 + }, + { + "epoch": 1.6916194997261274, + "grad_norm": 0.9639946818351746, + "learning_rate": 3.554125396764021e-06, + "loss": 0.508, + "step": 4632 + }, + { + "epoch": 1.6919846631367537, + "grad_norm": 0.9099420309066772, + "learning_rate": 3.5459663171830626e-06, + "loss": 0.5219, + "step": 4633 + }, + { + "epoch": 1.69234982654738, + "grad_norm": 0.6585804224014282, + "learning_rate": 3.53781570254218e-06, + "loss": 0.5629, + "step": 4634 + }, + { + "epoch": 1.6927149899580063, + "grad_norm": 0.8587778210639954, + "learning_rate": 3.5296735570345276e-06, + "loss": 0.5479, + "step": 4635 + }, + { + "epoch": 1.6930801533686326, + "grad_norm": 0.7885758280754089, + "learning_rate": 3.5215398848489167e-06, + "loss": 0.5573, + "step": 4636 + }, + { + "epoch": 1.6934453167792587, + "grad_norm": 0.6936434507369995, + "learning_rate": 3.513414690169794e-06, + "loss": 0.5464, + "step": 4637 + }, + { + "epoch": 1.693810480189885, + "grad_norm": 0.9549039006233215, + "learning_rate": 3.5052979771772555e-06, + "loss": 0.5423, + "step": 4638 + }, + { + "epoch": 1.694175643600511, + "grad_norm": 1.0629690885543823, + "learning_rate": 3.49718975004701e-06, + "loss": 0.4832, + "step": 4639 + }, + { + "epoch": 1.6945408070111374, + "grad_norm": 0.7019412517547607, + "learning_rate": 3.489090012950422e-06, + "loss": 0.5543, + "step": 4640 + }, + { + "epoch": 1.6949059704217637, + "grad_norm": 0.8659648299217224, + "learning_rate": 3.48099877005448e-06, + "loss": 0.5479, + "step": 4641 + }, + { + "epoch": 1.69527113383239, + "grad_norm": 0.7017379403114319, + "learning_rate": 3.4729160255218107e-06, + "loss": 0.5425, + "step": 4642 + }, + { + "epoch": 1.6956362972430163, + "grad_norm": 0.6785612106323242, + "learning_rate": 3.4648417835106507e-06, + "loss": 0.5347, + "step": 4643 + }, + { + "epoch": 1.6960014606536427, + "grad_norm": 0.7757022976875305, + "learning_rate": 3.4567760481748835e-06, + "loss": 0.5406, + "step": 4644 + }, + { + "epoch": 1.6963666240642687, + "grad_norm": 0.9227039217948914, + "learning_rate": 3.4487188236639966e-06, + "loss": 0.5199, + "step": 4645 + }, + { + "epoch": 1.696731787474895, + "grad_norm": 0.693849503993988, + "learning_rate": 3.4406701141231126e-06, + "loss": 0.5845, + "step": 4646 + }, + { + "epoch": 1.6970969508855211, + "grad_norm": 0.734327495098114, + "learning_rate": 3.4326299236929693e-06, + "loss": 0.5612, + "step": 4647 + }, + { + "epoch": 1.6974621142961475, + "grad_norm": 0.9029885530471802, + "learning_rate": 3.42459825650993e-06, + "loss": 0.514, + "step": 4648 + }, + { + "epoch": 1.6978272777067738, + "grad_norm": 0.7814059853553772, + "learning_rate": 3.416575116705951e-06, + "loss": 0.468, + "step": 4649 + }, + { + "epoch": 1.6981924411174, + "grad_norm": 0.7431153655052185, + "learning_rate": 3.408560508408625e-06, + "loss": 0.5677, + "step": 4650 + }, + { + "epoch": 1.6985576045280264, + "grad_norm": 1.0016436576843262, + "learning_rate": 3.4005544357411433e-06, + "loss": 0.5071, + "step": 4651 + }, + { + "epoch": 1.6989227679386527, + "grad_norm": 0.9274985194206238, + "learning_rate": 3.392556902822313e-06, + "loss": 0.4889, + "step": 4652 + }, + { + "epoch": 1.6992879313492788, + "grad_norm": 0.9575678706169128, + "learning_rate": 3.3845679137665434e-06, + "loss": 0.4849, + "step": 4653 + }, + { + "epoch": 1.699653094759905, + "grad_norm": 0.766573429107666, + "learning_rate": 3.376587472683841e-06, + "loss": 0.4888, + "step": 4654 + }, + { + "epoch": 1.7000182581705312, + "grad_norm": 0.9481750726699829, + "learning_rate": 3.368615583679833e-06, + "loss": 0.4871, + "step": 4655 + }, + { + "epoch": 1.7003834215811575, + "grad_norm": 0.9773246049880981, + "learning_rate": 3.360652250855727e-06, + "loss": 0.5846, + "step": 4656 + }, + { + "epoch": 1.7007485849917838, + "grad_norm": 0.8178773522377014, + "learning_rate": 3.352697478308342e-06, + "loss": 0.5229, + "step": 4657 + }, + { + "epoch": 1.7011137484024101, + "grad_norm": 0.8280562162399292, + "learning_rate": 3.3447512701300956e-06, + "loss": 0.5539, + "step": 4658 + }, + { + "epoch": 1.7014789118130365, + "grad_norm": 0.7413144111633301, + "learning_rate": 3.3368136304089815e-06, + "loss": 0.5374, + "step": 4659 + }, + { + "epoch": 1.7018440752236625, + "grad_norm": 0.9765322208404541, + "learning_rate": 3.328884563228605e-06, + "loss": 0.5624, + "step": 4660 + }, + { + "epoch": 1.7022092386342889, + "grad_norm": 0.8073693513870239, + "learning_rate": 3.320964072668147e-06, + "loss": 0.5722, + "step": 4661 + }, + { + "epoch": 1.702574402044915, + "grad_norm": 1.0331017971038818, + "learning_rate": 3.3130521628023926e-06, + "loss": 0.5284, + "step": 4662 + }, + { + "epoch": 1.7029395654555413, + "grad_norm": 0.8730890154838562, + "learning_rate": 3.3051488377016884e-06, + "loss": 0.5876, + "step": 4663 + }, + { + "epoch": 1.7033047288661676, + "grad_norm": 1.1102790832519531, + "learning_rate": 3.297254101431986e-06, + "loss": 0.5334, + "step": 4664 + }, + { + "epoch": 1.7036698922767939, + "grad_norm": 0.9046598672866821, + "learning_rate": 3.2893679580548075e-06, + "loss": 0.5447, + "step": 4665 + }, + { + "epoch": 1.7040350556874202, + "grad_norm": 0.6133385300636292, + "learning_rate": 3.2814904116272595e-06, + "loss": 0.5544, + "step": 4666 + }, + { + "epoch": 1.7044002190980465, + "grad_norm": 0.7287967801094055, + "learning_rate": 3.2736214662020284e-06, + "loss": 0.531, + "step": 4667 + }, + { + "epoch": 1.7047653825086726, + "grad_norm": 1.0205227136611938, + "learning_rate": 3.2657611258273602e-06, + "loss": 0.4849, + "step": 4668 + }, + { + "epoch": 1.705130545919299, + "grad_norm": 0.8684663772583008, + "learning_rate": 3.257909394547092e-06, + "loss": 0.5234, + "step": 4669 + }, + { + "epoch": 1.705495709329925, + "grad_norm": 4.236178398132324, + "learning_rate": 3.250066276400621e-06, + "loss": 0.5249, + "step": 4670 + }, + { + "epoch": 1.7058608727405513, + "grad_norm": 1.5054926872253418, + "learning_rate": 3.242231775422915e-06, + "loss": 0.5444, + "step": 4671 + }, + { + "epoch": 1.7062260361511776, + "grad_norm": 0.7978805303573608, + "learning_rate": 3.234405895644519e-06, + "loss": 0.4977, + "step": 4672 + }, + { + "epoch": 1.706591199561804, + "grad_norm": 0.7267588973045349, + "learning_rate": 3.2265886410915214e-06, + "loss": 0.546, + "step": 4673 + }, + { + "epoch": 1.7069563629724303, + "grad_norm": 1.0223290920257568, + "learning_rate": 3.2187800157855964e-06, + "loss": 0.5465, + "step": 4674 + }, + { + "epoch": 1.7073215263830566, + "grad_norm": 0.8201450109481812, + "learning_rate": 3.2109800237439616e-06, + "loss": 0.5579, + "step": 4675 + }, + { + "epoch": 1.7076866897936827, + "grad_norm": 0.7174233794212341, + "learning_rate": 3.2031886689794044e-06, + "loss": 0.5502, + "step": 4676 + }, + { + "epoch": 1.708051853204309, + "grad_norm": 1.003202199935913, + "learning_rate": 3.1954059555002683e-06, + "loss": 0.5174, + "step": 4677 + }, + { + "epoch": 1.708417016614935, + "grad_norm": 0.89548259973526, + "learning_rate": 3.1876318873104383e-06, + "loss": 0.5293, + "step": 4678 + }, + { + "epoch": 1.7087821800255614, + "grad_norm": 0.8657696843147278, + "learning_rate": 3.1798664684093606e-06, + "loss": 0.5137, + "step": 4679 + }, + { + "epoch": 1.7091473434361877, + "grad_norm": 0.7795391082763672, + "learning_rate": 3.1721097027920367e-06, + "loss": 0.5038, + "step": 4680 + }, + { + "epoch": 1.709512506846814, + "grad_norm": 0.8812219500541687, + "learning_rate": 3.1643615944490147e-06, + "loss": 0.5207, + "step": 4681 + }, + { + "epoch": 1.7098776702574403, + "grad_norm": 0.8421750664710999, + "learning_rate": 3.1566221473663794e-06, + "loss": 0.5429, + "step": 4682 + }, + { + "epoch": 1.7102428336680666, + "grad_norm": 0.750450074672699, + "learning_rate": 3.1488913655257635e-06, + "loss": 0.5621, + "step": 4683 + }, + { + "epoch": 1.7106079970786927, + "grad_norm": 0.8702534437179565, + "learning_rate": 3.1411692529043457e-06, + "loss": 0.5168, + "step": 4684 + }, + { + "epoch": 1.710973160489319, + "grad_norm": 1.2278602123260498, + "learning_rate": 3.133455813474844e-06, + "loss": 0.5337, + "step": 4685 + }, + { + "epoch": 1.7113383238999451, + "grad_norm": 0.8343172073364258, + "learning_rate": 3.1257510512055145e-06, + "loss": 0.5358, + "step": 4686 + }, + { + "epoch": 1.7117034873105714, + "grad_norm": 0.66392982006073, + "learning_rate": 3.1180549700601535e-06, + "loss": 0.5743, + "step": 4687 + }, + { + "epoch": 1.7120686507211977, + "grad_norm": 0.9112119078636169, + "learning_rate": 3.1103675739980745e-06, + "loss": 0.5335, + "step": 4688 + }, + { + "epoch": 1.712433814131824, + "grad_norm": 1.051275610923767, + "learning_rate": 3.1026888669741396e-06, + "loss": 0.5261, + "step": 4689 + }, + { + "epoch": 1.7127989775424504, + "grad_norm": 0.9014411568641663, + "learning_rate": 3.095018852938736e-06, + "loss": 0.5255, + "step": 4690 + }, + { + "epoch": 1.7131641409530765, + "grad_norm": 1.0786256790161133, + "learning_rate": 3.0873575358377826e-06, + "loss": 0.5172, + "step": 4691 + }, + { + "epoch": 1.7135293043637028, + "grad_norm": 0.8149101734161377, + "learning_rate": 3.0797049196127115e-06, + "loss": 0.5845, + "step": 4692 + }, + { + "epoch": 1.7138944677743289, + "grad_norm": 0.71403568983078, + "learning_rate": 3.0720610082004887e-06, + "loss": 0.5392, + "step": 4693 + }, + { + "epoch": 1.7142596311849552, + "grad_norm": 0.6756953001022339, + "learning_rate": 3.0644258055336017e-06, + "loss": 0.5479, + "step": 4694 + }, + { + "epoch": 1.7146247945955815, + "grad_norm": 0.9829939007759094, + "learning_rate": 3.05679931554006e-06, + "loss": 0.5401, + "step": 4695 + }, + { + "epoch": 1.7149899580062078, + "grad_norm": 0.918777346611023, + "learning_rate": 3.0491815421433825e-06, + "loss": 0.5274, + "step": 4696 + }, + { + "epoch": 1.715355121416834, + "grad_norm": 0.759799063205719, + "learning_rate": 3.041572489262603e-06, + "loss": 0.5356, + "step": 4697 + }, + { + "epoch": 1.7157202848274604, + "grad_norm": 0.7517125010490417, + "learning_rate": 3.0339721608122774e-06, + "loss": 0.5537, + "step": 4698 + }, + { + "epoch": 1.7160854482380865, + "grad_norm": 1.0564138889312744, + "learning_rate": 3.026380560702471e-06, + "loss": 0.4696, + "step": 4699 + }, + { + "epoch": 1.7164506116487128, + "grad_norm": 1.2564903497695923, + "learning_rate": 3.0187976928387573e-06, + "loss": 0.5359, + "step": 4700 + }, + { + "epoch": 1.716815775059339, + "grad_norm": 0.8190529346466064, + "learning_rate": 3.011223561122223e-06, + "loss": 0.5407, + "step": 4701 + }, + { + "epoch": 1.7171809384699652, + "grad_norm": 1.0427823066711426, + "learning_rate": 3.0036581694494436e-06, + "loss": 0.5139, + "step": 4702 + }, + { + "epoch": 1.7175461018805915, + "grad_norm": 0.9642361998558044, + "learning_rate": 2.9961015217125155e-06, + "loss": 0.5073, + "step": 4703 + }, + { + "epoch": 1.7179112652912178, + "grad_norm": 0.7839714884757996, + "learning_rate": 2.988553621799033e-06, + "loss": 0.5373, + "step": 4704 + }, + { + "epoch": 1.7182764287018442, + "grad_norm": 0.8481120467185974, + "learning_rate": 2.9810144735920877e-06, + "loss": 0.5574, + "step": 4705 + }, + { + "epoch": 1.7186415921124705, + "grad_norm": 0.7420259118080139, + "learning_rate": 2.9734840809702613e-06, + "loss": 0.5377, + "step": 4706 + }, + { + "epoch": 1.7190067555230966, + "grad_norm": 0.9698185920715332, + "learning_rate": 2.965962447807644e-06, + "loss": 0.5281, + "step": 4707 + }, + { + "epoch": 1.7193719189337229, + "grad_norm": 0.7731484770774841, + "learning_rate": 2.9584495779738144e-06, + "loss": 0.5192, + "step": 4708 + }, + { + "epoch": 1.719737082344349, + "grad_norm": 1.15269136428833, + "learning_rate": 2.950945475333846e-06, + "loss": 0.5162, + "step": 4709 + }, + { + "epoch": 1.7201022457549753, + "grad_norm": 0.6507449746131897, + "learning_rate": 2.94345014374829e-06, + "loss": 0.5638, + "step": 4710 + }, + { + "epoch": 1.7204674091656016, + "grad_norm": 0.7738745212554932, + "learning_rate": 2.9359635870732028e-06, + "loss": 0.5279, + "step": 4711 + }, + { + "epoch": 1.720832572576228, + "grad_norm": 0.7932908535003662, + "learning_rate": 2.928485809160109e-06, + "loss": 0.5308, + "step": 4712 + }, + { + "epoch": 1.7211977359868542, + "grad_norm": 0.9706488847732544, + "learning_rate": 2.921016813856028e-06, + "loss": 0.5123, + "step": 4713 + }, + { + "epoch": 1.7215628993974805, + "grad_norm": 0.9505299925804138, + "learning_rate": 2.91355660500346e-06, + "loss": 0.5431, + "step": 4714 + }, + { + "epoch": 1.7219280628081066, + "grad_norm": 0.6139938235282898, + "learning_rate": 2.906105186440389e-06, + "loss": 0.5527, + "step": 4715 + }, + { + "epoch": 1.722293226218733, + "grad_norm": 1.4412078857421875, + "learning_rate": 2.8986625620002586e-06, + "loss": 0.5151, + "step": 4716 + }, + { + "epoch": 1.722658389629359, + "grad_norm": 0.7758844494819641, + "learning_rate": 2.891228735512004e-06, + "loss": 0.5332, + "step": 4717 + }, + { + "epoch": 1.7230235530399853, + "grad_norm": 0.9113196730613708, + "learning_rate": 2.883803710800035e-06, + "loss": 0.5276, + "step": 4718 + }, + { + "epoch": 1.7233887164506116, + "grad_norm": 0.7024984955787659, + "learning_rate": 2.876387491684225e-06, + "loss": 0.5439, + "step": 4719 + }, + { + "epoch": 1.723753879861238, + "grad_norm": 0.7437683343887329, + "learning_rate": 2.8689800819799286e-06, + "loss": 0.5068, + "step": 4720 + }, + { + "epoch": 1.7241190432718643, + "grad_norm": 1.359061598777771, + "learning_rate": 2.8615814854979507e-06, + "loss": 0.5174, + "step": 4721 + }, + { + "epoch": 1.7244842066824904, + "grad_norm": 0.9796109795570374, + "learning_rate": 2.85419170604458e-06, + "loss": 0.5132, + "step": 4722 + }, + { + "epoch": 1.7248493700931167, + "grad_norm": 0.8437585830688477, + "learning_rate": 2.846810747421553e-06, + "loss": 0.4893, + "step": 4723 + }, + { + "epoch": 1.7252145335037428, + "grad_norm": 0.8218032121658325, + "learning_rate": 2.8394386134260843e-06, + "loss": 0.5335, + "step": 4724 + }, + { + "epoch": 1.725579696914369, + "grad_norm": 0.9758670330047607, + "learning_rate": 2.832075307850841e-06, + "loss": 0.5168, + "step": 4725 + }, + { + "epoch": 1.7259448603249954, + "grad_norm": 0.8763068914413452, + "learning_rate": 2.8247208344839428e-06, + "loss": 0.5391, + "step": 4726 + }, + { + "epoch": 1.7263100237356217, + "grad_norm": 0.8063436150550842, + "learning_rate": 2.8173751971089734e-06, + "loss": 0.5856, + "step": 4727 + }, + { + "epoch": 1.726675187146248, + "grad_norm": 0.7337233424186707, + "learning_rate": 2.8100383995049687e-06, + "loss": 0.58, + "step": 4728 + }, + { + "epoch": 1.7270403505568743, + "grad_norm": 1.227530837059021, + "learning_rate": 2.8027104454464172e-06, + "loss": 0.4644, + "step": 4729 + }, + { + "epoch": 1.7274055139675004, + "grad_norm": 1.321012020111084, + "learning_rate": 2.795391338703264e-06, + "loss": 0.4856, + "step": 4730 + }, + { + "epoch": 1.7277706773781267, + "grad_norm": 0.9268075227737427, + "learning_rate": 2.7880810830408834e-06, + "loss": 0.5202, + "step": 4731 + }, + { + "epoch": 1.7281358407887528, + "grad_norm": 0.896336555480957, + "learning_rate": 2.7807796822201137e-06, + "loss": 0.5237, + "step": 4732 + }, + { + "epoch": 1.7285010041993791, + "grad_norm": 0.802261233329773, + "learning_rate": 2.773487139997233e-06, + "loss": 0.543, + "step": 4733 + }, + { + "epoch": 1.7288661676100054, + "grad_norm": 0.7332318425178528, + "learning_rate": 2.7662034601239664e-06, + "loss": 0.5299, + "step": 4734 + }, + { + "epoch": 1.7292313310206318, + "grad_norm": 0.9159461259841919, + "learning_rate": 2.7589286463474698e-06, + "loss": 0.5172, + "step": 4735 + }, + { + "epoch": 1.729596494431258, + "grad_norm": 0.8737739324569702, + "learning_rate": 2.7516627024103403e-06, + "loss": 0.5149, + "step": 4736 + }, + { + "epoch": 1.7299616578418844, + "grad_norm": 0.8592380881309509, + "learning_rate": 2.7444056320506175e-06, + "loss": 0.5326, + "step": 4737 + }, + { + "epoch": 1.7303268212525105, + "grad_norm": 0.8493632078170776, + "learning_rate": 2.7371574390017742e-06, + "loss": 0.5212, + "step": 4738 + }, + { + "epoch": 1.7306919846631368, + "grad_norm": 0.8829220533370972, + "learning_rate": 2.7299181269927165e-06, + "loss": 0.5368, + "step": 4739 + }, + { + "epoch": 1.7310571480737629, + "grad_norm": 0.8817154169082642, + "learning_rate": 2.7226876997477723e-06, + "loss": 0.5236, + "step": 4740 + }, + { + "epoch": 1.7314223114843892, + "grad_norm": 2.187655210494995, + "learning_rate": 2.7154661609867126e-06, + "loss": 0.5365, + "step": 4741 + }, + { + "epoch": 1.7317874748950155, + "grad_norm": 0.9684646725654602, + "learning_rate": 2.708253514424728e-06, + "loss": 0.4993, + "step": 4742 + }, + { + "epoch": 1.7321526383056418, + "grad_norm": 0.9810671210289001, + "learning_rate": 2.701049763772434e-06, + "loss": 0.5244, + "step": 4743 + }, + { + "epoch": 1.7325178017162681, + "grad_norm": 0.9408937692642212, + "learning_rate": 2.6938549127358803e-06, + "loss": 0.5175, + "step": 4744 + }, + { + "epoch": 1.7328829651268944, + "grad_norm": 0.9165375828742981, + "learning_rate": 2.6866689650165146e-06, + "loss": 0.5063, + "step": 4745 + }, + { + "epoch": 1.7332481285375205, + "grad_norm": 0.8951494097709656, + "learning_rate": 2.679491924311226e-06, + "loss": 0.4961, + "step": 4746 + }, + { + "epoch": 1.7336132919481468, + "grad_norm": 0.9282603859901428, + "learning_rate": 2.672323794312315e-06, + "loss": 0.4906, + "step": 4747 + }, + { + "epoch": 1.733978455358773, + "grad_norm": 0.9112241864204407, + "learning_rate": 2.6651645787075e-06, + "loss": 0.5159, + "step": 4748 + }, + { + "epoch": 1.7343436187693992, + "grad_norm": 0.7655164003372192, + "learning_rate": 2.6580142811799037e-06, + "loss": 0.5637, + "step": 4749 + }, + { + "epoch": 1.7347087821800256, + "grad_norm": 1.001355528831482, + "learning_rate": 2.6508729054080664e-06, + "loss": 0.5419, + "step": 4750 + }, + { + "epoch": 1.7350739455906519, + "grad_norm": 0.9860793352127075, + "learning_rate": 2.6437404550659416e-06, + "loss": 0.5027, + "step": 4751 + }, + { + "epoch": 1.7354391090012782, + "grad_norm": 0.9379138946533203, + "learning_rate": 2.6366169338228885e-06, + "loss": 0.5681, + "step": 4752 + }, + { + "epoch": 1.7358042724119045, + "grad_norm": 0.9619566798210144, + "learning_rate": 2.629502345343675e-06, + "loss": 0.5085, + "step": 4753 + }, + { + "epoch": 1.7361694358225306, + "grad_norm": 0.8003615140914917, + "learning_rate": 2.622396693288474e-06, + "loss": 0.54, + "step": 4754 + }, + { + "epoch": 1.7365345992331567, + "grad_norm": 0.8115968108177185, + "learning_rate": 2.6152999813128487e-06, + "loss": 0.5348, + "step": 4755 + }, + { + "epoch": 1.736899762643783, + "grad_norm": 0.6173460483551025, + "learning_rate": 2.60821221306778e-06, + "loss": 0.5458, + "step": 4756 + }, + { + "epoch": 1.7372649260544093, + "grad_norm": 0.7967430949211121, + "learning_rate": 2.6011333921996397e-06, + "loss": 0.5471, + "step": 4757 + }, + { + "epoch": 1.7376300894650356, + "grad_norm": 1.2879356145858765, + "learning_rate": 2.5940635223501985e-06, + "loss": 0.4722, + "step": 4758 + }, + { + "epoch": 1.737995252875662, + "grad_norm": 0.6266947984695435, + "learning_rate": 2.5870026071566145e-06, + "loss": 0.5734, + "step": 4759 + }, + { + "epoch": 1.7383604162862882, + "grad_norm": 0.8225075006484985, + "learning_rate": 2.5799506502514504e-06, + "loss": 0.5502, + "step": 4760 + }, + { + "epoch": 1.7387255796969143, + "grad_norm": 0.9278892874717712, + "learning_rate": 2.572907655262653e-06, + "loss": 0.5358, + "step": 4761 + }, + { + "epoch": 1.7390907431075406, + "grad_norm": 1.1641044616699219, + "learning_rate": 2.565873625813564e-06, + "loss": 0.5253, + "step": 4762 + }, + { + "epoch": 1.7394559065181667, + "grad_norm": 0.6092157959938049, + "learning_rate": 2.5588485655229046e-06, + "loss": 0.5792, + "step": 4763 + }, + { + "epoch": 1.739821069928793, + "grad_norm": 0.8689990043640137, + "learning_rate": 2.5518324780047922e-06, + "loss": 0.519, + "step": 4764 + }, + { + "epoch": 1.7401862333394194, + "grad_norm": 0.952075719833374, + "learning_rate": 2.544825366868713e-06, + "loss": 0.51, + "step": 4765 + }, + { + "epoch": 1.7405513967500457, + "grad_norm": 0.996828019618988, + "learning_rate": 2.53782723571955e-06, + "loss": 0.5074, + "step": 4766 + }, + { + "epoch": 1.740916560160672, + "grad_norm": 1.044696569442749, + "learning_rate": 2.5308380881575613e-06, + "loss": 0.5215, + "step": 4767 + }, + { + "epoch": 1.7412817235712983, + "grad_norm": 0.796924889087677, + "learning_rate": 2.523857927778388e-06, + "loss": 0.5408, + "step": 4768 + }, + { + "epoch": 1.7416468869819244, + "grad_norm": 0.7988172769546509, + "learning_rate": 2.5168867581730315e-06, + "loss": 0.5623, + "step": 4769 + }, + { + "epoch": 1.7420120503925507, + "grad_norm": 1.13484525680542, + "learning_rate": 2.509924582927883e-06, + "loss": 0.5479, + "step": 4770 + }, + { + "epoch": 1.7423772138031768, + "grad_norm": 0.7947938442230225, + "learning_rate": 2.502971405624706e-06, + "loss": 0.5462, + "step": 4771 + }, + { + "epoch": 1.742742377213803, + "grad_norm": 0.9073025584220886, + "learning_rate": 2.4960272298406276e-06, + "loss": 0.497, + "step": 4772 + }, + { + "epoch": 1.7431075406244294, + "grad_norm": 1.380642056465149, + "learning_rate": 2.4890920591481525e-06, + "loss": 0.5008, + "step": 4773 + }, + { + "epoch": 1.7434727040350557, + "grad_norm": 1.0848580598831177, + "learning_rate": 2.4821658971151406e-06, + "loss": 0.5126, + "step": 4774 + }, + { + "epoch": 1.743837867445682, + "grad_norm": 0.920659601688385, + "learning_rate": 2.4752487473048327e-06, + "loss": 0.5457, + "step": 4775 + }, + { + "epoch": 1.7442030308563083, + "grad_norm": 0.8806758522987366, + "learning_rate": 2.4683406132758147e-06, + "loss": 0.5411, + "step": 4776 + }, + { + "epoch": 1.7445681942669344, + "grad_norm": 0.6796289086341858, + "learning_rate": 2.461441498582049e-06, + "loss": 0.5435, + "step": 4777 + }, + { + "epoch": 1.7449333576775607, + "grad_norm": 1.0951828956604004, + "learning_rate": 2.454551406772858e-06, + "loss": 0.5091, + "step": 4778 + }, + { + "epoch": 1.7452985210881868, + "grad_norm": 0.8472336530685425, + "learning_rate": 2.447670341392909e-06, + "loss": 0.5439, + "step": 4779 + }, + { + "epoch": 1.7456636844988132, + "grad_norm": 0.878318190574646, + "learning_rate": 2.4407983059822394e-06, + "loss": 0.5339, + "step": 4780 + }, + { + "epoch": 1.7460288479094395, + "grad_norm": 1.5507493019104004, + "learning_rate": 2.4339353040762337e-06, + "loss": 0.4882, + "step": 4781 + }, + { + "epoch": 1.7463940113200658, + "grad_norm": 0.8883445262908936, + "learning_rate": 2.427081339205635e-06, + "loss": 0.5267, + "step": 4782 + }, + { + "epoch": 1.746759174730692, + "grad_norm": 1.020946741104126, + "learning_rate": 2.4202364148965262e-06, + "loss": 0.5491, + "step": 4783 + }, + { + "epoch": 1.7471243381413184, + "grad_norm": 1.1917569637298584, + "learning_rate": 2.4134005346703517e-06, + "loss": 0.4731, + "step": 4784 + }, + { + "epoch": 1.7474895015519445, + "grad_norm": 0.8810885548591614, + "learning_rate": 2.406573702043893e-06, + "loss": 0.5244, + "step": 4785 + }, + { + "epoch": 1.7478546649625708, + "grad_norm": 0.9442121386528015, + "learning_rate": 2.3997559205292877e-06, + "loss": 0.5521, + "step": 4786 + }, + { + "epoch": 1.748219828373197, + "grad_norm": 0.6434993147850037, + "learning_rate": 2.3929471936340075e-06, + "loss": 0.5661, + "step": 4787 + }, + { + "epoch": 1.7485849917838232, + "grad_norm": 1.0258697271347046, + "learning_rate": 2.386147524860869e-06, + "loss": 0.5139, + "step": 4788 + }, + { + "epoch": 1.7489501551944495, + "grad_norm": 0.7254213094711304, + "learning_rate": 2.379356917708031e-06, + "loss": 0.5439, + "step": 4789 + }, + { + "epoch": 1.7493153186050758, + "grad_norm": 0.9993529915809631, + "learning_rate": 2.3725753756689816e-06, + "loss": 0.5168, + "step": 4790 + }, + { + "epoch": 1.7496804820157021, + "grad_norm": 1.0882890224456787, + "learning_rate": 2.365802902232559e-06, + "loss": 0.5161, + "step": 4791 + }, + { + "epoch": 1.7500456454263282, + "grad_norm": 1.1464070081710815, + "learning_rate": 2.3590395008829314e-06, + "loss": 0.5489, + "step": 4792 + }, + { + "epoch": 1.7504108088369545, + "grad_norm": 0.8863118290901184, + "learning_rate": 2.352285175099587e-06, + "loss": 0.5322, + "step": 4793 + }, + { + "epoch": 1.7507759722475806, + "grad_norm": 0.8491637110710144, + "learning_rate": 2.345539928357361e-06, + "loss": 0.4818, + "step": 4794 + }, + { + "epoch": 1.751141135658207, + "grad_norm": 0.7878150343894958, + "learning_rate": 2.338803764126414e-06, + "loss": 0.558, + "step": 4795 + }, + { + "epoch": 1.7515062990688333, + "grad_norm": 0.8596755862236023, + "learning_rate": 2.332076685872231e-06, + "loss": 0.5386, + "step": 4796 + }, + { + "epoch": 1.7518714624794596, + "grad_norm": 0.6973880529403687, + "learning_rate": 2.325358697055626e-06, + "loss": 0.5399, + "step": 4797 + }, + { + "epoch": 1.7522366258900859, + "grad_norm": 0.9609445333480835, + "learning_rate": 2.3186498011327286e-06, + "loss": 0.5519, + "step": 4798 + }, + { + "epoch": 1.7526017893007122, + "grad_norm": 0.8189699053764343, + "learning_rate": 2.3119500015550012e-06, + "loss": 0.5047, + "step": 4799 + }, + { + "epoch": 1.7529669527113383, + "grad_norm": 0.820521891117096, + "learning_rate": 2.3052593017692184e-06, + "loss": 0.5433, + "step": 4800 + }, + { + "epoch": 1.7533321161219646, + "grad_norm": 0.7218295335769653, + "learning_rate": 2.298577705217486e-06, + "loss": 0.5228, + "step": 4801 + }, + { + "epoch": 1.7536972795325907, + "grad_norm": 0.9512649178504944, + "learning_rate": 2.291905215337209e-06, + "loss": 0.4886, + "step": 4802 + }, + { + "epoch": 1.754062442943217, + "grad_norm": 1.2566230297088623, + "learning_rate": 2.285241835561112e-06, + "loss": 0.4784, + "step": 4803 + }, + { + "epoch": 1.7544276063538433, + "grad_norm": 1.1414672136306763, + "learning_rate": 2.2785875693172433e-06, + "loss": 0.5043, + "step": 4804 + }, + { + "epoch": 1.7547927697644696, + "grad_norm": 1.2826882600784302, + "learning_rate": 2.271942420028954e-06, + "loss": 0.5812, + "step": 4805 + }, + { + "epoch": 1.755157933175096, + "grad_norm": 0.8251254558563232, + "learning_rate": 2.2653063911149052e-06, + "loss": 0.5404, + "step": 4806 + }, + { + "epoch": 1.7555230965857223, + "grad_norm": 0.7828681468963623, + "learning_rate": 2.258679485989075e-06, + "loss": 0.5413, + "step": 4807 + }, + { + "epoch": 1.7558882599963483, + "grad_norm": 0.8482041358947754, + "learning_rate": 2.252061708060731e-06, + "loss": 0.5275, + "step": 4808 + }, + { + "epoch": 1.7562534234069747, + "grad_norm": 1.0937703847885132, + "learning_rate": 2.245453060734457e-06, + "loss": 0.511, + "step": 4809 + }, + { + "epoch": 1.7566185868176007, + "grad_norm": 0.6529538035392761, + "learning_rate": 2.238853547410136e-06, + "loss": 0.5653, + "step": 4810 + }, + { + "epoch": 1.756983750228227, + "grad_norm": 0.8129596710205078, + "learning_rate": 2.23226317148296e-06, + "loss": 0.561, + "step": 4811 + }, + { + "epoch": 1.7573489136388534, + "grad_norm": 0.8450109362602234, + "learning_rate": 2.2256819363434048e-06, + "loss": 0.4653, + "step": 4812 + }, + { + "epoch": 1.7577140770494797, + "grad_norm": 0.7205243706703186, + "learning_rate": 2.219109845377252e-06, + "loss": 0.5592, + "step": 4813 + }, + { + "epoch": 1.758079240460106, + "grad_norm": 0.8044624924659729, + "learning_rate": 2.212546901965582e-06, + "loss": 0.5601, + "step": 4814 + }, + { + "epoch": 1.7584444038707323, + "grad_norm": 1.0224130153656006, + "learning_rate": 2.2059931094847676e-06, + "loss": 0.511, + "step": 4815 + }, + { + "epoch": 1.7588095672813584, + "grad_norm": 0.8828184604644775, + "learning_rate": 2.199448471306467e-06, + "loss": 0.4883, + "step": 4816 + }, + { + "epoch": 1.7591747306919847, + "grad_norm": 0.6978619694709778, + "learning_rate": 2.19291299079764e-06, + "loss": 0.5757, + "step": 4817 + }, + { + "epoch": 1.7595398941026108, + "grad_norm": 0.8363275527954102, + "learning_rate": 2.186386671320522e-06, + "loss": 0.5259, + "step": 4818 + }, + { + "epoch": 1.7599050575132371, + "grad_norm": 0.7557588219642639, + "learning_rate": 2.1798695162326444e-06, + "loss": 0.5323, + "step": 4819 + }, + { + "epoch": 1.7602702209238634, + "grad_norm": 0.9395306706428528, + "learning_rate": 2.1733615288868236e-06, + "loss": 0.4881, + "step": 4820 + }, + { + "epoch": 1.7606353843344897, + "grad_norm": 0.9875335097312927, + "learning_rate": 2.1668627126311613e-06, + "loss": 0.5258, + "step": 4821 + }, + { + "epoch": 1.761000547745116, + "grad_norm": 0.999130368232727, + "learning_rate": 2.160373070809032e-06, + "loss": 0.5168, + "step": 4822 + }, + { + "epoch": 1.7613657111557421, + "grad_norm": 0.8791902661323547, + "learning_rate": 2.153892606759096e-06, + "loss": 0.5003, + "step": 4823 + }, + { + "epoch": 1.7617308745663685, + "grad_norm": 0.8618801832199097, + "learning_rate": 2.1474213238152954e-06, + "loss": 0.5528, + "step": 4824 + }, + { + "epoch": 1.7620960379769945, + "grad_norm": 0.7623894214630127, + "learning_rate": 2.1409592253068467e-06, + "loss": 0.538, + "step": 4825 + }, + { + "epoch": 1.7624612013876209, + "grad_norm": 0.8928764462471008, + "learning_rate": 2.1345063145582357e-06, + "loss": 0.5246, + "step": 4826 + }, + { + "epoch": 1.7628263647982472, + "grad_norm": 0.6507142782211304, + "learning_rate": 2.128062594889229e-06, + "loss": 0.5662, + "step": 4827 + }, + { + "epoch": 1.7631915282088735, + "grad_norm": 0.6624026894569397, + "learning_rate": 2.1216280696148585e-06, + "loss": 0.5513, + "step": 4828 + }, + { + "epoch": 1.7635566916194998, + "grad_norm": 0.8341464400291443, + "learning_rate": 2.115202742045437e-06, + "loss": 0.5278, + "step": 4829 + }, + { + "epoch": 1.763921855030126, + "grad_norm": 1.1716243028640747, + "learning_rate": 2.108786615486529e-06, + "loss": 0.507, + "step": 4830 + }, + { + "epoch": 1.7642870184407522, + "grad_norm": 0.7278099656105042, + "learning_rate": 2.1023796932389805e-06, + "loss": 0.5435, + "step": 4831 + }, + { + "epoch": 1.7646521818513785, + "grad_norm": 1.058091402053833, + "learning_rate": 2.0959819785988912e-06, + "loss": 0.5063, + "step": 4832 + }, + { + "epoch": 1.7650173452620046, + "grad_norm": 0.9359176754951477, + "learning_rate": 2.0895934748576273e-06, + "loss": 0.5524, + "step": 4833 + }, + { + "epoch": 1.765382508672631, + "grad_norm": 0.8545036911964417, + "learning_rate": 2.0832141853018227e-06, + "loss": 0.5243, + "step": 4834 + }, + { + "epoch": 1.7657476720832572, + "grad_norm": 0.8072575330734253, + "learning_rate": 2.0768441132133676e-06, + "loss": 0.5597, + "step": 4835 + }, + { + "epoch": 1.7661128354938835, + "grad_norm": 0.9316344857215881, + "learning_rate": 2.0704832618694006e-06, + "loss": 0.5033, + "step": 4836 + }, + { + "epoch": 1.7664779989045098, + "grad_norm": 1.037131428718567, + "learning_rate": 2.0641316345423303e-06, + "loss": 0.5213, + "step": 4837 + }, + { + "epoch": 1.7668431623151362, + "grad_norm": 0.9110147356987, + "learning_rate": 2.0577892344998097e-06, + "loss": 0.5194, + "step": 4838 + }, + { + "epoch": 1.7672083257257623, + "grad_norm": 1.0251874923706055, + "learning_rate": 2.051456065004753e-06, + "loss": 0.5171, + "step": 4839 + }, + { + "epoch": 1.7675734891363886, + "grad_norm": 0.7813372015953064, + "learning_rate": 2.045132129315326e-06, + "loss": 0.5572, + "step": 4840 + }, + { + "epoch": 1.7679386525470147, + "grad_norm": 0.8873542547225952, + "learning_rate": 2.0388174306849297e-06, + "loss": 0.5058, + "step": 4841 + }, + { + "epoch": 1.768303815957641, + "grad_norm": 0.9328134655952454, + "learning_rate": 2.03251197236223e-06, + "loss": 0.5251, + "step": 4842 + }, + { + "epoch": 1.7686689793682673, + "grad_norm": 0.6912343502044678, + "learning_rate": 2.026215757591128e-06, + "loss": 0.5766, + "step": 4843 + }, + { + "epoch": 1.7690341427788936, + "grad_norm": 1.0498008728027344, + "learning_rate": 2.0199287896107743e-06, + "loss": 0.4877, + "step": 4844 + }, + { + "epoch": 1.76939930618952, + "grad_norm": 0.9849687814712524, + "learning_rate": 2.013651071655569e-06, + "loss": 0.5003, + "step": 4845 + }, + { + "epoch": 1.7697644696001462, + "grad_norm": 0.7693188190460205, + "learning_rate": 2.007382606955135e-06, + "loss": 0.5307, + "step": 4846 + }, + { + "epoch": 1.7701296330107723, + "grad_norm": 0.6070563197135925, + "learning_rate": 2.00112339873435e-06, + "loss": 0.5847, + "step": 4847 + }, + { + "epoch": 1.7704947964213986, + "grad_norm": 1.0467851161956787, + "learning_rate": 1.9948734502133284e-06, + "loss": 0.4864, + "step": 4848 + }, + { + "epoch": 1.7708599598320247, + "grad_norm": 0.8477541208267212, + "learning_rate": 1.9886327646074143e-06, + "loss": 0.5194, + "step": 4849 + }, + { + "epoch": 1.771225123242651, + "grad_norm": 0.8596732020378113, + "learning_rate": 1.9824013451271964e-06, + "loss": 0.5591, + "step": 4850 + }, + { + "epoch": 1.7715902866532773, + "grad_norm": 0.963614284992218, + "learning_rate": 1.9761791949784827e-06, + "loss": 0.5387, + "step": 4851 + }, + { + "epoch": 1.7719554500639036, + "grad_norm": 0.8316560983657837, + "learning_rate": 1.9699663173623195e-06, + "loss": 0.5331, + "step": 4852 + }, + { + "epoch": 1.77232061347453, + "grad_norm": 0.8175371289253235, + "learning_rate": 1.9637627154749882e-06, + "loss": 0.5364, + "step": 4853 + }, + { + "epoch": 1.7726857768851563, + "grad_norm": 1.0496881008148193, + "learning_rate": 1.9575683925079913e-06, + "loss": 0.5104, + "step": 4854 + }, + { + "epoch": 1.7730509402957824, + "grad_norm": 0.9106097221374512, + "learning_rate": 1.951383351648057e-06, + "loss": 0.4966, + "step": 4855 + }, + { + "epoch": 1.7734161037064085, + "grad_norm": 1.158538818359375, + "learning_rate": 1.945207596077148e-06, + "loss": 0.4819, + "step": 4856 + }, + { + "epoch": 1.7737812671170348, + "grad_norm": 0.7018920183181763, + "learning_rate": 1.93904112897243e-06, + "loss": 0.6078, + "step": 4857 + }, + { + "epoch": 1.774146430527661, + "grad_norm": 0.92885422706604, + "learning_rate": 1.9328839535063125e-06, + "loss": 0.5323, + "step": 4858 + }, + { + "epoch": 1.7745115939382874, + "grad_norm": 1.1306513547897339, + "learning_rate": 1.9267360728464113e-06, + "loss": 0.553, + "step": 4859 + }, + { + "epoch": 1.7748767573489137, + "grad_norm": 0.9900164008140564, + "learning_rate": 1.920597490155569e-06, + "loss": 0.5171, + "step": 4860 + }, + { + "epoch": 1.77524192075954, + "grad_norm": 0.9863156676292419, + "learning_rate": 1.9144682085918354e-06, + "loss": 0.5778, + "step": 4861 + }, + { + "epoch": 1.775607084170166, + "grad_norm": 1.0554563999176025, + "learning_rate": 1.908348231308479e-06, + "loss": 0.5439, + "step": 4862 + }, + { + "epoch": 1.7759722475807924, + "grad_norm": 0.9097983241081238, + "learning_rate": 1.9022375614539857e-06, + "loss": 0.5279, + "step": 4863 + }, + { + "epoch": 1.7763374109914185, + "grad_norm": 0.9413915872573853, + "learning_rate": 1.896136202172052e-06, + "loss": 0.5353, + "step": 4864 + }, + { + "epoch": 1.7767025744020448, + "grad_norm": 1.017560601234436, + "learning_rate": 1.890044156601576e-06, + "loss": 0.5193, + "step": 4865 + }, + { + "epoch": 1.7770677378126711, + "grad_norm": 1.0063955783843994, + "learning_rate": 1.883961427876675e-06, + "loss": 0.5507, + "step": 4866 + }, + { + "epoch": 1.7774329012232974, + "grad_norm": 0.7404409646987915, + "learning_rate": 1.877888019126668e-06, + "loss": 0.5698, + "step": 4867 + }, + { + "epoch": 1.7777980646339238, + "grad_norm": 0.9323861002922058, + "learning_rate": 1.8718239334760824e-06, + "loss": 0.4848, + "step": 4868 + }, + { + "epoch": 1.77816322804455, + "grad_norm": 0.8537315130233765, + "learning_rate": 1.865769174044647e-06, + "loss": 0.5092, + "step": 4869 + }, + { + "epoch": 1.7785283914551762, + "grad_norm": 0.7566486597061157, + "learning_rate": 1.8597237439472837e-06, + "loss": 0.539, + "step": 4870 + }, + { + "epoch": 1.7788935548658025, + "grad_norm": 0.7786027789115906, + "learning_rate": 1.8536876462941311e-06, + "loss": 0.5466, + "step": 4871 + }, + { + "epoch": 1.7792587182764286, + "grad_norm": 0.7033721804618835, + "learning_rate": 1.8476608841905186e-06, + "loss": 0.5851, + "step": 4872 + }, + { + "epoch": 1.7796238816870549, + "grad_norm": 0.6912455558776855, + "learning_rate": 1.841643460736975e-06, + "loss": 0.5567, + "step": 4873 + }, + { + "epoch": 1.7799890450976812, + "grad_norm": 0.8948424458503723, + "learning_rate": 1.8356353790292237e-06, + "loss": 0.5106, + "step": 4874 + }, + { + "epoch": 1.7803542085083075, + "grad_norm": 0.7366495728492737, + "learning_rate": 1.8296366421581747e-06, + "loss": 0.5799, + "step": 4875 + }, + { + "epoch": 1.7807193719189338, + "grad_norm": 1.0875005722045898, + "learning_rate": 1.8236472532099413e-06, + "loss": 0.5114, + "step": 4876 + }, + { + "epoch": 1.7810845353295601, + "grad_norm": 1.5355473756790161, + "learning_rate": 1.817667215265826e-06, + "loss": 0.4878, + "step": 4877 + }, + { + "epoch": 1.7814496987401862, + "grad_norm": 0.9621209502220154, + "learning_rate": 1.8116965314023205e-06, + "loss": 0.5393, + "step": 4878 + }, + { + "epoch": 1.7818148621508125, + "grad_norm": 1.0355948209762573, + "learning_rate": 1.8057352046910948e-06, + "loss": 0.4891, + "step": 4879 + }, + { + "epoch": 1.7821800255614386, + "grad_norm": 0.8816014528274536, + "learning_rate": 1.7997832381990156e-06, + "loss": 0.4552, + "step": 4880 + }, + { + "epoch": 1.782545188972065, + "grad_norm": 0.9060640931129456, + "learning_rate": 1.793840634988131e-06, + "loss": 0.5117, + "step": 4881 + }, + { + "epoch": 1.7829103523826912, + "grad_norm": 1.2189513444900513, + "learning_rate": 1.787907398115676e-06, + "loss": 0.5229, + "step": 4882 + }, + { + "epoch": 1.7832755157933176, + "grad_norm": 1.0151749849319458, + "learning_rate": 1.781983530634055e-06, + "loss": 0.5406, + "step": 4883 + }, + { + "epoch": 1.7836406792039439, + "grad_norm": 0.9411863088607788, + "learning_rate": 1.7760690355908682e-06, + "loss": 0.4972, + "step": 4884 + }, + { + "epoch": 1.7840058426145702, + "grad_norm": 1.045339584350586, + "learning_rate": 1.7701639160288775e-06, + "loss": 0.501, + "step": 4885 + }, + { + "epoch": 1.7843710060251963, + "grad_norm": 0.8763211965560913, + "learning_rate": 1.7642681749860346e-06, + "loss": 0.5034, + "step": 4886 + }, + { + "epoch": 1.7847361694358226, + "grad_norm": 1.0296286344528198, + "learning_rate": 1.7583818154954602e-06, + "loss": 0.4522, + "step": 4887 + }, + { + "epoch": 1.7851013328464487, + "grad_norm": 0.8169348239898682, + "learning_rate": 1.7525048405854562e-06, + "loss": 0.5501, + "step": 4888 + }, + { + "epoch": 1.785466496257075, + "grad_norm": 0.9823426008224487, + "learning_rate": 1.7466372532794818e-06, + "loss": 0.5288, + "step": 4889 + }, + { + "epoch": 1.7858316596677013, + "grad_norm": 0.9222946166992188, + "learning_rate": 1.740779056596178e-06, + "loss": 0.4924, + "step": 4890 + }, + { + "epoch": 1.7861968230783276, + "grad_norm": 0.8936170935630798, + "learning_rate": 1.7349302535493539e-06, + "loss": 0.5334, + "step": 4891 + }, + { + "epoch": 1.786561986488954, + "grad_norm": 1.0143158435821533, + "learning_rate": 1.7290908471479805e-06, + "loss": 0.556, + "step": 4892 + }, + { + "epoch": 1.78692714989958, + "grad_norm": 0.8250148296356201, + "learning_rate": 1.723260840396206e-06, + "loss": 0.5419, + "step": 4893 + }, + { + "epoch": 1.7872923133102063, + "grad_norm": 0.8316763639450073, + "learning_rate": 1.717440236293324e-06, + "loss": 0.5456, + "step": 4894 + }, + { + "epoch": 1.7876574767208324, + "grad_norm": 0.874988317489624, + "learning_rate": 1.7116290378338085e-06, + "loss": 0.5511, + "step": 4895 + }, + { + "epoch": 1.7880226401314587, + "grad_norm": 1.303494930267334, + "learning_rate": 1.7058272480072879e-06, + "loss": 0.5079, + "step": 4896 + }, + { + "epoch": 1.788387803542085, + "grad_norm": 0.9614773988723755, + "learning_rate": 1.7000348697985481e-06, + "loss": 0.4785, + "step": 4897 + }, + { + "epoch": 1.7887529669527114, + "grad_norm": 1.2812979221343994, + "learning_rate": 1.6942519061875361e-06, + "loss": 0.5123, + "step": 4898 + }, + { + "epoch": 1.7891181303633377, + "grad_norm": 0.7624049186706543, + "learning_rate": 1.6884783601493525e-06, + "loss": 0.5734, + "step": 4899 + }, + { + "epoch": 1.789483293773964, + "grad_norm": 1.0612908601760864, + "learning_rate": 1.682714234654259e-06, + "loss": 0.4995, + "step": 4900 + }, + { + "epoch": 1.78984845718459, + "grad_norm": 0.931629478931427, + "learning_rate": 1.6769595326676614e-06, + "loss": 0.5619, + "step": 4901 + }, + { + "epoch": 1.7902136205952164, + "grad_norm": 0.9756954908370972, + "learning_rate": 1.6712142571501289e-06, + "loss": 0.5531, + "step": 4902 + }, + { + "epoch": 1.7905787840058425, + "grad_norm": 0.9437235593795776, + "learning_rate": 1.6654784110573752e-06, + "loss": 0.5197, + "step": 4903 + }, + { + "epoch": 1.7909439474164688, + "grad_norm": 0.9321839213371277, + "learning_rate": 1.6597519973402576e-06, + "loss": 0.539, + "step": 4904 + }, + { + "epoch": 1.791309110827095, + "grad_norm": 0.8487227559089661, + "learning_rate": 1.6540350189447885e-06, + "loss": 0.468, + "step": 4905 + }, + { + "epoch": 1.7916742742377214, + "grad_norm": 0.649875819683075, + "learning_rate": 1.6483274788121239e-06, + "loss": 0.5441, + "step": 4906 + }, + { + "epoch": 1.7920394376483477, + "grad_norm": 0.8302043676376343, + "learning_rate": 1.6426293798785687e-06, + "loss": 0.5531, + "step": 4907 + }, + { + "epoch": 1.792404601058974, + "grad_norm": 0.8178344964981079, + "learning_rate": 1.6369407250755598e-06, + "loss": 0.5449, + "step": 4908 + }, + { + "epoch": 1.7927697644696001, + "grad_norm": 0.9554221034049988, + "learning_rate": 1.6312615173296853e-06, + "loss": 0.4843, + "step": 4909 + }, + { + "epoch": 1.7931349278802264, + "grad_norm": 0.7250683307647705, + "learning_rate": 1.6255917595626681e-06, + "loss": 0.5209, + "step": 4910 + }, + { + "epoch": 1.7935000912908525, + "grad_norm": 0.7468822002410889, + "learning_rate": 1.619931454691368e-06, + "loss": 0.5529, + "step": 4911 + }, + { + "epoch": 1.7938652547014788, + "grad_norm": 0.866430401802063, + "learning_rate": 1.6142806056277937e-06, + "loss": 0.5628, + "step": 4912 + }, + { + "epoch": 1.7942304181121052, + "grad_norm": 1.0588103532791138, + "learning_rate": 1.6086392152790709e-06, + "loss": 0.4854, + "step": 4913 + }, + { + "epoch": 1.7945955815227315, + "grad_norm": 0.8642292022705078, + "learning_rate": 1.6030072865474733e-06, + "loss": 0.5345, + "step": 4914 + }, + { + "epoch": 1.7949607449333578, + "grad_norm": 0.8990249633789062, + "learning_rate": 1.5973848223304012e-06, + "loss": 0.5628, + "step": 4915 + }, + { + "epoch": 1.795325908343984, + "grad_norm": 0.7247717976570129, + "learning_rate": 1.5917718255203873e-06, + "loss": 0.5196, + "step": 4916 + }, + { + "epoch": 1.7956910717546102, + "grad_norm": 1.2262150049209595, + "learning_rate": 1.5861682990050954e-06, + "loss": 0.5317, + "step": 4917 + }, + { + "epoch": 1.7960562351652365, + "grad_norm": 0.8901282548904419, + "learning_rate": 1.5805742456673101e-06, + "loss": 0.524, + "step": 4918 + }, + { + "epoch": 1.7964213985758626, + "grad_norm": 0.8625813126564026, + "learning_rate": 1.5749896683849474e-06, + "loss": 0.5604, + "step": 4919 + }, + { + "epoch": 1.796786561986489, + "grad_norm": 0.7180377244949341, + "learning_rate": 1.5694145700310536e-06, + "loss": 0.5193, + "step": 4920 + }, + { + "epoch": 1.7971517253971152, + "grad_norm": 0.7622803449630737, + "learning_rate": 1.563848953473792e-06, + "loss": 0.5828, + "step": 4921 + }, + { + "epoch": 1.7975168888077415, + "grad_norm": 1.0300092697143555, + "learning_rate": 1.5582928215764481e-06, + "loss": 0.4961, + "step": 4922 + }, + { + "epoch": 1.7978820522183678, + "grad_norm": 0.8336998820304871, + "learning_rate": 1.552746177197424e-06, + "loss": 0.4831, + "step": 4923 + }, + { + "epoch": 1.798247215628994, + "grad_norm": 1.1535495519638062, + "learning_rate": 1.5472090231902504e-06, + "loss": 0.5051, + "step": 4924 + }, + { + "epoch": 1.7986123790396202, + "grad_norm": 0.9202743172645569, + "learning_rate": 1.5416813624035688e-06, + "loss": 0.5196, + "step": 4925 + }, + { + "epoch": 1.7989775424502463, + "grad_norm": 0.8379017114639282, + "learning_rate": 1.5361631976811397e-06, + "loss": 0.5328, + "step": 4926 + }, + { + "epoch": 1.7993427058608726, + "grad_norm": 1.020539402961731, + "learning_rate": 1.5306545318618437e-06, + "loss": 0.5235, + "step": 4927 + }, + { + "epoch": 1.799707869271499, + "grad_norm": 0.7980867028236389, + "learning_rate": 1.525155367779656e-06, + "loss": 0.5013, + "step": 4928 + }, + { + "epoch": 1.8000730326821253, + "grad_norm": 0.8660412430763245, + "learning_rate": 1.5196657082636845e-06, + "loss": 0.5398, + "step": 4929 + }, + { + "epoch": 1.8004381960927516, + "grad_norm": 0.7763341069221497, + "learning_rate": 1.5141855561381347e-06, + "loss": 0.5747, + "step": 4930 + }, + { + "epoch": 1.8008033595033779, + "grad_norm": 0.7591754198074341, + "learning_rate": 1.5087149142223313e-06, + "loss": 0.5247, + "step": 4931 + }, + { + "epoch": 1.801168522914004, + "grad_norm": 0.8435109257698059, + "learning_rate": 1.5032537853306917e-06, + "loss": 0.5799, + "step": 4932 + }, + { + "epoch": 1.8015336863246303, + "grad_norm": 0.8091663122177124, + "learning_rate": 1.4978021722727509e-06, + "loss": 0.5258, + "step": 4933 + }, + { + "epoch": 1.8018988497352564, + "grad_norm": 0.7816550731658936, + "learning_rate": 1.4923600778531456e-06, + "loss": 0.5383, + "step": 4934 + }, + { + "epoch": 1.8022640131458827, + "grad_norm": 0.7945713996887207, + "learning_rate": 1.486927504871616e-06, + "loss": 0.5642, + "step": 4935 + }, + { + "epoch": 1.802629176556509, + "grad_norm": 1.025600552558899, + "learning_rate": 1.481504456123004e-06, + "loss": 0.4561, + "step": 4936 + }, + { + "epoch": 1.8029943399671353, + "grad_norm": 0.870495617389679, + "learning_rate": 1.4760909343972473e-06, + "loss": 0.5342, + "step": 4937 + }, + { + "epoch": 1.8033595033777616, + "grad_norm": 0.7982380390167236, + "learning_rate": 1.4706869424793847e-06, + "loss": 0.5445, + "step": 4938 + }, + { + "epoch": 1.803724666788388, + "grad_norm": 1.338091492652893, + "learning_rate": 1.4652924831495563e-06, + "loss": 0.5234, + "step": 4939 + }, + { + "epoch": 1.804089830199014, + "grad_norm": 1.0072129964828491, + "learning_rate": 1.4599075591829915e-06, + "loss": 0.5588, + "step": 4940 + }, + { + "epoch": 1.8044549936096403, + "grad_norm": 1.072298288345337, + "learning_rate": 1.454532173350025e-06, + "loss": 0.5201, + "step": 4941 + }, + { + "epoch": 1.8048201570202664, + "grad_norm": 1.1159089803695679, + "learning_rate": 1.4491663284160694e-06, + "loss": 0.5026, + "step": 4942 + }, + { + "epoch": 1.8051853204308927, + "grad_norm": 0.9787336587905884, + "learning_rate": 1.4438100271416367e-06, + "loss": 0.5298, + "step": 4943 + }, + { + "epoch": 1.805550483841519, + "grad_norm": 0.9807587265968323, + "learning_rate": 1.4384632722823333e-06, + "loss": 0.5079, + "step": 4944 + }, + { + "epoch": 1.8059156472521454, + "grad_norm": 0.7864483594894409, + "learning_rate": 1.433126066588848e-06, + "loss": 0.561, + "step": 4945 + }, + { + "epoch": 1.8062808106627717, + "grad_norm": 0.9076181054115295, + "learning_rate": 1.4277984128069622e-06, + "loss": 0.5233, + "step": 4946 + }, + { + "epoch": 1.806645974073398, + "grad_norm": 1.0766689777374268, + "learning_rate": 1.4224803136775323e-06, + "loss": 0.4543, + "step": 4947 + }, + { + "epoch": 1.807011137484024, + "grad_norm": 0.8053579330444336, + "learning_rate": 1.417171771936514e-06, + "loss": 0.5475, + "step": 4948 + }, + { + "epoch": 1.8073763008946504, + "grad_norm": 0.856950581073761, + "learning_rate": 1.4118727903149387e-06, + "loss": 0.5093, + "step": 4949 + }, + { + "epoch": 1.8077414643052765, + "grad_norm": 1.2885442972183228, + "learning_rate": 1.4065833715389143e-06, + "loss": 0.433, + "step": 4950 + }, + { + "epoch": 1.8081066277159028, + "grad_norm": 0.9208719730377197, + "learning_rate": 1.401303518329642e-06, + "loss": 0.5619, + "step": 4951 + }, + { + "epoch": 1.8084717911265291, + "grad_norm": 0.6822531223297119, + "learning_rate": 1.3960332334033844e-06, + "loss": 0.5667, + "step": 4952 + }, + { + "epoch": 1.8088369545371554, + "grad_norm": 0.9479916095733643, + "learning_rate": 1.3907725194714994e-06, + "loss": 0.5314, + "step": 4953 + }, + { + "epoch": 1.8092021179477817, + "grad_norm": 0.8681054711341858, + "learning_rate": 1.3855213792404132e-06, + "loss": 0.5623, + "step": 4954 + }, + { + "epoch": 1.809567281358408, + "grad_norm": 0.8470045328140259, + "learning_rate": 1.3802798154116249e-06, + "loss": 0.5168, + "step": 4955 + }, + { + "epoch": 1.8099324447690341, + "grad_norm": 1.1664899587631226, + "learning_rate": 1.3750478306817082e-06, + "loss": 0.4947, + "step": 4956 + }, + { + "epoch": 1.8102976081796602, + "grad_norm": 1.0370721817016602, + "learning_rate": 1.3698254277423083e-06, + "loss": 0.536, + "step": 4957 + }, + { + "epoch": 1.8106627715902865, + "grad_norm": 0.7912999987602234, + "learning_rate": 1.3646126092801425e-06, + "loss": 0.5617, + "step": 4958 + }, + { + "epoch": 1.8110279350009129, + "grad_norm": 1.4642152786254883, + "learning_rate": 1.359409377976999e-06, + "loss": 0.5126, + "step": 4959 + }, + { + "epoch": 1.8113930984115392, + "grad_norm": 0.799483060836792, + "learning_rate": 1.354215736509734e-06, + "loss": 0.5196, + "step": 4960 + }, + { + "epoch": 1.8117582618221655, + "grad_norm": 0.8305796980857849, + "learning_rate": 1.3490316875502597e-06, + "loss": 0.5394, + "step": 4961 + }, + { + "epoch": 1.8121234252327918, + "grad_norm": 0.8141010999679565, + "learning_rate": 1.3438572337655686e-06, + "loss": 0.5229, + "step": 4962 + }, + { + "epoch": 1.8124885886434179, + "grad_norm": 0.9301368594169617, + "learning_rate": 1.338692377817703e-06, + "loss": 0.4989, + "step": 4963 + }, + { + "epoch": 1.8128537520540442, + "grad_norm": 0.9604087471961975, + "learning_rate": 1.3335371223637772e-06, + "loss": 0.5339, + "step": 4964 + }, + { + "epoch": 1.8132189154646703, + "grad_norm": 0.8913207650184631, + "learning_rate": 1.3283914700559675e-06, + "loss": 0.5143, + "step": 4965 + }, + { + "epoch": 1.8135840788752966, + "grad_norm": 0.7561500072479248, + "learning_rate": 1.3232554235414985e-06, + "loss": 0.5594, + "step": 4966 + }, + { + "epoch": 1.813949242285923, + "grad_norm": 1.0631812810897827, + "learning_rate": 1.3181289854626633e-06, + "loss": 0.5388, + "step": 4967 + }, + { + "epoch": 1.8143144056965492, + "grad_norm": 0.9004719853401184, + "learning_rate": 1.3130121584568055e-06, + "loss": 0.5388, + "step": 4968 + }, + { + "epoch": 1.8146795691071755, + "grad_norm": 0.777157723903656, + "learning_rate": 1.3079049451563331e-06, + "loss": 0.5369, + "step": 4969 + }, + { + "epoch": 1.8150447325178019, + "grad_norm": 0.816108763217926, + "learning_rate": 1.3028073481887016e-06, + "loss": 0.5091, + "step": 4970 + }, + { + "epoch": 1.815409895928428, + "grad_norm": 1.2835263013839722, + "learning_rate": 1.2977193701764135e-06, + "loss": 0.515, + "step": 4971 + }, + { + "epoch": 1.8157750593390543, + "grad_norm": 0.7208719253540039, + "learning_rate": 1.2926410137370348e-06, + "loss": 0.5374, + "step": 4972 + }, + { + "epoch": 1.8161402227496803, + "grad_norm": 0.7209171652793884, + "learning_rate": 1.2875722814831737e-06, + "loss": 0.554, + "step": 4973 + }, + { + "epoch": 1.8165053861603067, + "grad_norm": 0.847134530544281, + "learning_rate": 1.2825131760224952e-06, + "loss": 0.5379, + "step": 4974 + }, + { + "epoch": 1.816870549570933, + "grad_norm": 0.874386191368103, + "learning_rate": 1.2774636999576995e-06, + "loss": 0.5429, + "step": 4975 + }, + { + "epoch": 1.8172357129815593, + "grad_norm": 0.86895352602005, + "learning_rate": 1.272423855886542e-06, + "loss": 0.5114, + "step": 4976 + }, + { + "epoch": 1.8176008763921856, + "grad_norm": 0.8646639585494995, + "learning_rate": 1.26739364640182e-06, + "loss": 0.5099, + "step": 4977 + }, + { + "epoch": 1.817966039802812, + "grad_norm": 0.9552761912345886, + "learning_rate": 1.262373074091372e-06, + "loss": 0.477, + "step": 4978 + }, + { + "epoch": 1.818331203213438, + "grad_norm": 0.8524230718612671, + "learning_rate": 1.2573621415380832e-06, + "loss": 0.5237, + "step": 4979 + }, + { + "epoch": 1.8186963666240643, + "grad_norm": 0.9472958445549011, + "learning_rate": 1.2523608513198803e-06, + "loss": 0.492, + "step": 4980 + }, + { + "epoch": 1.8190615300346904, + "grad_norm": 1.036932349205017, + "learning_rate": 1.247369206009721e-06, + "loss": 0.5062, + "step": 4981 + }, + { + "epoch": 1.8194266934453167, + "grad_norm": 1.1887929439544678, + "learning_rate": 1.2423872081756106e-06, + "loss": 0.5118, + "step": 4982 + }, + { + "epoch": 1.819791856855943, + "grad_norm": 1.0288493633270264, + "learning_rate": 1.2374148603805835e-06, + "loss": 0.5062, + "step": 4983 + }, + { + "epoch": 1.8201570202665693, + "grad_norm": 2.3531882762908936, + "learning_rate": 1.2324521651827182e-06, + "loss": 0.5302, + "step": 4984 + }, + { + "epoch": 1.8205221836771956, + "grad_norm": 0.8009319305419922, + "learning_rate": 1.2274991251351166e-06, + "loss": 0.5413, + "step": 4985 + }, + { + "epoch": 1.820887347087822, + "grad_norm": 0.6363532543182373, + "learning_rate": 1.2225557427859203e-06, + "loss": 0.5743, + "step": 4986 + }, + { + "epoch": 1.821252510498448, + "grad_norm": 0.9401905536651611, + "learning_rate": 1.217622020678304e-06, + "loss": 0.5066, + "step": 4987 + }, + { + "epoch": 1.8216176739090744, + "grad_norm": 0.9579744935035706, + "learning_rate": 1.2126979613504664e-06, + "loss": 0.5391, + "step": 4988 + }, + { + "epoch": 1.8219828373197005, + "grad_norm": 0.7629438042640686, + "learning_rate": 1.2077835673356454e-06, + "loss": 0.5311, + "step": 4989 + }, + { + "epoch": 1.8223480007303268, + "grad_norm": 0.8086284399032593, + "learning_rate": 1.202878841162094e-06, + "loss": 0.5348, + "step": 4990 + }, + { + "epoch": 1.822713164140953, + "grad_norm": 1.3739471435546875, + "learning_rate": 1.197983785353094e-06, + "loss": 0.5128, + "step": 4991 + }, + { + "epoch": 1.8230783275515794, + "grad_norm": 0.6952402591705322, + "learning_rate": 1.1930984024269575e-06, + "loss": 0.5634, + "step": 4992 + }, + { + "epoch": 1.8234434909622057, + "grad_norm": 0.8957251906394958, + "learning_rate": 1.1882226948970188e-06, + "loss": 0.531, + "step": 4993 + }, + { + "epoch": 1.8238086543728318, + "grad_norm": 1.115297555923462, + "learning_rate": 1.1833566652716378e-06, + "loss": 0.506, + "step": 4994 + }, + { + "epoch": 1.824173817783458, + "grad_norm": 0.8088821172714233, + "learning_rate": 1.1785003160541852e-06, + "loss": 0.5461, + "step": 4995 + }, + { + "epoch": 1.8245389811940842, + "grad_norm": 0.9462229013442993, + "learning_rate": 1.1736536497430584e-06, + "loss": 0.5522, + "step": 4996 + }, + { + "epoch": 1.8249041446047105, + "grad_norm": 1.1374109983444214, + "learning_rate": 1.168816668831676e-06, + "loss": 0.5076, + "step": 4997 + }, + { + "epoch": 1.8252693080153368, + "grad_norm": 1.011764407157898, + "learning_rate": 1.1639893758084719e-06, + "loss": 0.5021, + "step": 4998 + }, + { + "epoch": 1.8256344714259631, + "grad_norm": 0.6716031432151794, + "learning_rate": 1.1591717731568909e-06, + "loss": 0.5431, + "step": 4999 + }, + { + "epoch": 1.8259996348365894, + "grad_norm": 1.1675262451171875, + "learning_rate": 1.1543638633553945e-06, + "loss": 0.5005, + "step": 5000 + }, + { + "epoch": 1.8263647982472158, + "grad_norm": 1.0584003925323486, + "learning_rate": 1.149565648877462e-06, + "loss": 0.54, + "step": 5001 + }, + { + "epoch": 1.8267299616578418, + "grad_norm": 0.7157919406890869, + "learning_rate": 1.144777132191588e-06, + "loss": 0.5447, + "step": 5002 + }, + { + "epoch": 1.8270951250684682, + "grad_norm": 1.2633295059204102, + "learning_rate": 1.1399983157612616e-06, + "loss": 0.5209, + "step": 5003 + }, + { + "epoch": 1.8274602884790943, + "grad_norm": 0.8516579270362854, + "learning_rate": 1.1352292020449984e-06, + "loss": 0.5413, + "step": 5004 + }, + { + "epoch": 1.8278254518897206, + "grad_norm": 0.7476287484169006, + "learning_rate": 1.130469793496314e-06, + "loss": 0.5039, + "step": 5005 + }, + { + "epoch": 1.8281906153003469, + "grad_norm": 0.7512426376342773, + "learning_rate": 1.1257200925637336e-06, + "loss": 0.5333, + "step": 5006 + }, + { + "epoch": 1.8285557787109732, + "grad_norm": 0.8339123725891113, + "learning_rate": 1.1209801016907872e-06, + "loss": 0.5235, + "step": 5007 + }, + { + "epoch": 1.8289209421215995, + "grad_norm": 0.8975398540496826, + "learning_rate": 1.1162498233160136e-06, + "loss": 0.5236, + "step": 5008 + }, + { + "epoch": 1.8292861055322258, + "grad_norm": 0.7915427684783936, + "learning_rate": 1.1115292598729454e-06, + "loss": 0.5349, + "step": 5009 + }, + { + "epoch": 1.829651268942852, + "grad_norm": 0.797041118144989, + "learning_rate": 1.106818413790125e-06, + "loss": 0.5127, + "step": 5010 + }, + { + "epoch": 1.8300164323534782, + "grad_norm": 0.9191210269927979, + "learning_rate": 1.1021172874910957e-06, + "loss": 0.5061, + "step": 5011 + }, + { + "epoch": 1.8303815957641043, + "grad_norm": 0.8843085765838623, + "learning_rate": 1.0974258833943985e-06, + "loss": 0.5046, + "step": 5012 + }, + { + "epoch": 1.8307467591747306, + "grad_norm": 0.8699705600738525, + "learning_rate": 1.0927442039135717e-06, + "loss": 0.517, + "step": 5013 + }, + { + "epoch": 1.831111922585357, + "grad_norm": 0.602509081363678, + "learning_rate": 1.0880722514571484e-06, + "loss": 0.5811, + "step": 5014 + }, + { + "epoch": 1.8314770859959832, + "grad_norm": 0.8212730884552002, + "learning_rate": 1.0834100284286641e-06, + "loss": 0.5353, + "step": 5015 + }, + { + "epoch": 1.8318422494066096, + "grad_norm": 1.0749436616897583, + "learning_rate": 1.0787575372266467e-06, + "loss": 0.4735, + "step": 5016 + }, + { + "epoch": 1.8322074128172359, + "grad_norm": 0.844359815120697, + "learning_rate": 1.0741147802446128e-06, + "loss": 0.5265, + "step": 5017 + }, + { + "epoch": 1.832572576227862, + "grad_norm": 0.7892604470252991, + "learning_rate": 1.0694817598710782e-06, + "loss": 0.5421, + "step": 5018 + }, + { + "epoch": 1.8329377396384883, + "grad_norm": 0.9948855638504028, + "learning_rate": 1.0648584784895411e-06, + "loss": 0.4838, + "step": 5019 + }, + { + "epoch": 1.8333029030491144, + "grad_norm": 0.8588843941688538, + "learning_rate": 1.0602449384784963e-06, + "loss": 0.5593, + "step": 5020 + }, + { + "epoch": 1.8336680664597407, + "grad_norm": 0.8757759928703308, + "learning_rate": 1.0556411422114254e-06, + "loss": 0.5088, + "step": 5021 + }, + { + "epoch": 1.834033229870367, + "grad_norm": 0.8504748344421387, + "learning_rate": 1.0510470920567983e-06, + "loss": 0.5199, + "step": 5022 + }, + { + "epoch": 1.8343983932809933, + "grad_norm": 0.8831689953804016, + "learning_rate": 1.0464627903780689e-06, + "loss": 0.5128, + "step": 5023 + }, + { + "epoch": 1.8347635566916196, + "grad_norm": 1.101543664932251, + "learning_rate": 1.041888239533675e-06, + "loss": 0.5398, + "step": 5024 + }, + { + "epoch": 1.8351287201022457, + "grad_norm": 1.0294517278671265, + "learning_rate": 1.0373234418770385e-06, + "loss": 0.4874, + "step": 5025 + }, + { + "epoch": 1.835493883512872, + "grad_norm": 0.9753843545913696, + "learning_rate": 1.0327683997565674e-06, + "loss": 0.5183, + "step": 5026 + }, + { + "epoch": 1.835859046923498, + "grad_norm": 0.6896717548370361, + "learning_rate": 1.0282231155156498e-06, + "loss": 0.5707, + "step": 5027 + }, + { + "epoch": 1.8362242103341244, + "grad_norm": 0.9963521957397461, + "learning_rate": 1.0236875914926458e-06, + "loss": 0.5245, + "step": 5028 + }, + { + "epoch": 1.8365893737447507, + "grad_norm": 0.898601770401001, + "learning_rate": 1.0191618300209094e-06, + "loss": 0.4901, + "step": 5029 + }, + { + "epoch": 1.836954537155377, + "grad_norm": 0.8382347822189331, + "learning_rate": 1.0146458334287513e-06, + "loss": 0.5373, + "step": 5030 + }, + { + "epoch": 1.8373197005660034, + "grad_norm": 1.4509795904159546, + "learning_rate": 1.0101396040394795e-06, + "loss": 0.5228, + "step": 5031 + }, + { + "epoch": 1.8376848639766297, + "grad_norm": 0.8937653303146362, + "learning_rate": 1.0056431441713643e-06, + "loss": 0.5381, + "step": 5032 + }, + { + "epoch": 1.8380500273872558, + "grad_norm": 1.0046682357788086, + "learning_rate": 1.0011564561376596e-06, + "loss": 0.4879, + "step": 5033 + }, + { + "epoch": 1.838415190797882, + "grad_norm": 1.0494298934936523, + "learning_rate": 9.966795422465792e-07, + "loss": 0.5415, + "step": 5034 + }, + { + "epoch": 1.8387803542085082, + "grad_norm": 0.838736355304718, + "learning_rate": 9.922124048013183e-07, + "loss": 0.5259, + "step": 5035 + }, + { + "epoch": 1.8391455176191345, + "grad_norm": 1.297999382019043, + "learning_rate": 9.877550461000385e-07, + "loss": 0.5302, + "step": 5036 + }, + { + "epoch": 1.8395106810297608, + "grad_norm": 0.5948071479797363, + "learning_rate": 9.833074684358768e-07, + "loss": 0.5594, + "step": 5037 + }, + { + "epoch": 1.839875844440387, + "grad_norm": 0.86806321144104, + "learning_rate": 9.788696740969295e-07, + "loss": 0.5304, + "step": 5038 + }, + { + "epoch": 1.8402410078510134, + "grad_norm": 1.3624813556671143, + "learning_rate": 9.744416653662636e-07, + "loss": 0.5087, + "step": 5039 + }, + { + "epoch": 1.8406061712616397, + "grad_norm": 1.0623867511749268, + "learning_rate": 9.700234445219126e-07, + "loss": 0.5437, + "step": 5040 + }, + { + "epoch": 1.8409713346722658, + "grad_norm": 0.8289335370063782, + "learning_rate": 9.656150138368758e-07, + "loss": 0.5636, + "step": 5041 + }, + { + "epoch": 1.8413364980828921, + "grad_norm": 0.932661235332489, + "learning_rate": 9.612163755791105e-07, + "loss": 0.5368, + "step": 5042 + }, + { + "epoch": 1.8417016614935182, + "grad_norm": 1.0001920461654663, + "learning_rate": 9.568275320115438e-07, + "loss": 0.5366, + "step": 5043 + }, + { + "epoch": 1.8420668249041445, + "grad_norm": 0.7952094674110413, + "learning_rate": 9.524484853920524e-07, + "loss": 0.5441, + "step": 5044 + }, + { + "epoch": 1.8424319883147708, + "grad_norm": 0.7896952629089355, + "learning_rate": 9.480792379734871e-07, + "loss": 0.5292, + "step": 5045 + }, + { + "epoch": 1.8427971517253972, + "grad_norm": 0.8495169878005981, + "learning_rate": 9.437197920036456e-07, + "loss": 0.4935, + "step": 5046 + }, + { + "epoch": 1.8431623151360235, + "grad_norm": 0.7229299545288086, + "learning_rate": 9.393701497252939e-07, + "loss": 0.5603, + "step": 5047 + }, + { + "epoch": 1.8435274785466498, + "grad_norm": 0.7918698191642761, + "learning_rate": 9.35030313376144e-07, + "loss": 0.5757, + "step": 5048 + }, + { + "epoch": 1.8438926419572759, + "grad_norm": 0.7822374701499939, + "learning_rate": 9.307002851888658e-07, + "loss": 0.5241, + "step": 5049 + }, + { + "epoch": 1.8442578053679022, + "grad_norm": 0.8543692231178284, + "learning_rate": 9.263800673910883e-07, + "loss": 0.5047, + "step": 5050 + }, + { + "epoch": 1.8446229687785283, + "grad_norm": 0.9035658836364746, + "learning_rate": 9.220696622053915e-07, + "loss": 0.5197, + "step": 5051 + }, + { + "epoch": 1.8449881321891546, + "grad_norm": 0.8677619695663452, + "learning_rate": 9.177690718493016e-07, + "loss": 0.5373, + "step": 5052 + }, + { + "epoch": 1.845353295599781, + "grad_norm": 0.9196531772613525, + "learning_rate": 9.134782985353019e-07, + "loss": 0.4809, + "step": 5053 + }, + { + "epoch": 1.8457184590104072, + "grad_norm": 0.7895047068595886, + "learning_rate": 9.091973444708247e-07, + "loss": 0.5539, + "step": 5054 + }, + { + "epoch": 1.8460836224210335, + "grad_norm": 0.9806033372879028, + "learning_rate": 9.049262118582458e-07, + "loss": 0.4776, + "step": 5055 + }, + { + "epoch": 1.8464487858316598, + "grad_norm": 0.6609007716178894, + "learning_rate": 9.006649028948966e-07, + "loss": 0.5833, + "step": 5056 + }, + { + "epoch": 1.846813949242286, + "grad_norm": 1.074385643005371, + "learning_rate": 8.964134197730457e-07, + "loss": 0.5267, + "step": 5057 + }, + { + "epoch": 1.847179112652912, + "grad_norm": 0.837611734867096, + "learning_rate": 8.921717646799077e-07, + "loss": 0.5246, + "step": 5058 + }, + { + "epoch": 1.8475442760635383, + "grad_norm": 0.9103251099586487, + "learning_rate": 8.879399397976484e-07, + "loss": 0.5136, + "step": 5059 + }, + { + "epoch": 1.8479094394741646, + "grad_norm": 0.8506982922554016, + "learning_rate": 8.83717947303373e-07, + "loss": 0.5287, + "step": 5060 + }, + { + "epoch": 1.848274602884791, + "grad_norm": 0.9154410362243652, + "learning_rate": 8.795057893691239e-07, + "loss": 0.494, + "step": 5061 + }, + { + "epoch": 1.8486397662954173, + "grad_norm": 0.8932507634162903, + "learning_rate": 8.753034681618877e-07, + "loss": 0.5208, + "step": 5062 + }, + { + "epoch": 1.8490049297060436, + "grad_norm": 0.7752853631973267, + "learning_rate": 8.711109858435907e-07, + "loss": 0.5314, + "step": 5063 + }, + { + "epoch": 1.8493700931166697, + "grad_norm": 0.837082028388977, + "learning_rate": 8.669283445710985e-07, + "loss": 0.5121, + "step": 5064 + }, + { + "epoch": 1.849735256527296, + "grad_norm": 0.6760733723640442, + "learning_rate": 8.627555464962078e-07, + "loss": 0.5282, + "step": 5065 + }, + { + "epoch": 1.850100419937922, + "grad_norm": 0.8442636728286743, + "learning_rate": 8.585925937656636e-07, + "loss": 0.5213, + "step": 5066 + }, + { + "epoch": 1.8504655833485484, + "grad_norm": 0.9905248880386353, + "learning_rate": 8.544394885211305e-07, + "loss": 0.5267, + "step": 5067 + }, + { + "epoch": 1.8508307467591747, + "grad_norm": 0.6257760524749756, + "learning_rate": 8.502962328992149e-07, + "loss": 0.5668, + "step": 5068 + }, + { + "epoch": 1.851195910169801, + "grad_norm": 0.9485304355621338, + "learning_rate": 8.461628290314605e-07, + "loss": 0.5191, + "step": 5069 + }, + { + "epoch": 1.8515610735804273, + "grad_norm": 0.8224905729293823, + "learning_rate": 8.420392790443332e-07, + "loss": 0.5058, + "step": 5070 + }, + { + "epoch": 1.8519262369910536, + "grad_norm": 1.0173250436782837, + "learning_rate": 8.379255850592404e-07, + "loss": 0.5284, + "step": 5071 + }, + { + "epoch": 1.8522914004016797, + "grad_norm": 0.8787329792976379, + "learning_rate": 8.338217491925027e-07, + "loss": 0.5308, + "step": 5072 + }, + { + "epoch": 1.852656563812306, + "grad_norm": 1.0824297666549683, + "learning_rate": 8.297277735553844e-07, + "loss": 0.5091, + "step": 5073 + }, + { + "epoch": 1.8530217272229321, + "grad_norm": 0.9079974889755249, + "learning_rate": 8.256436602540718e-07, + "loss": 0.5069, + "step": 5074 + }, + { + "epoch": 1.8533868906335584, + "grad_norm": 0.6514511108398438, + "learning_rate": 8.215694113896777e-07, + "loss": 0.5233, + "step": 5075 + }, + { + "epoch": 1.8537520540441847, + "grad_norm": 0.8281962275505066, + "learning_rate": 8.17505029058241e-07, + "loss": 0.4781, + "step": 5076 + }, + { + "epoch": 1.854117217454811, + "grad_norm": 0.8062742352485657, + "learning_rate": 8.134505153507177e-07, + "loss": 0.4791, + "step": 5077 + }, + { + "epoch": 1.8544823808654374, + "grad_norm": 0.782093346118927, + "learning_rate": 8.094058723529974e-07, + "loss": 0.5082, + "step": 5078 + }, + { + "epoch": 1.8548475442760637, + "grad_norm": 0.9898812770843506, + "learning_rate": 8.053711021458843e-07, + "loss": 0.5347, + "step": 5079 + }, + { + "epoch": 1.8552127076866898, + "grad_norm": 0.9271273612976074, + "learning_rate": 8.013462068051092e-07, + "loss": 0.5262, + "step": 5080 + }, + { + "epoch": 1.855577871097316, + "grad_norm": 0.9929766058921814, + "learning_rate": 7.973311884013158e-07, + "loss": 0.4949, + "step": 5081 + }, + { + "epoch": 1.8559430345079422, + "grad_norm": 0.8740081787109375, + "learning_rate": 7.933260490000694e-07, + "loss": 0.5351, + "step": 5082 + }, + { + "epoch": 1.8563081979185685, + "grad_norm": 1.054709553718567, + "learning_rate": 7.893307906618575e-07, + "loss": 0.5242, + "step": 5083 + }, + { + "epoch": 1.8566733613291948, + "grad_norm": 1.1594157218933105, + "learning_rate": 7.853454154420758e-07, + "loss": 0.4914, + "step": 5084 + }, + { + "epoch": 1.8570385247398211, + "grad_norm": 0.7112655639648438, + "learning_rate": 7.813699253910423e-07, + "loss": 0.5694, + "step": 5085 + }, + { + "epoch": 1.8574036881504474, + "grad_norm": 0.8234994411468506, + "learning_rate": 7.774043225539874e-07, + "loss": 0.5347, + "step": 5086 + }, + { + "epoch": 1.8577688515610737, + "grad_norm": 1.0197539329528809, + "learning_rate": 7.734486089710502e-07, + "loss": 0.5295, + "step": 5087 + }, + { + "epoch": 1.8581340149716998, + "grad_norm": 1.031860113143921, + "learning_rate": 7.695027866772919e-07, + "loss": 0.5211, + "step": 5088 + }, + { + "epoch": 1.8584991783823261, + "grad_norm": 0.9370394349098206, + "learning_rate": 7.655668577026798e-07, + "loss": 0.4739, + "step": 5089 + }, + { + "epoch": 1.8588643417929522, + "grad_norm": 0.6947592496871948, + "learning_rate": 7.616408240720896e-07, + "loss": 0.5311, + "step": 5090 + }, + { + "epoch": 1.8592295052035785, + "grad_norm": 0.9864943027496338, + "learning_rate": 7.577246878053057e-07, + "loss": 0.5118, + "step": 5091 + }, + { + "epoch": 1.8595946686142049, + "grad_norm": 0.8007437586784363, + "learning_rate": 7.538184509170276e-07, + "loss": 0.5092, + "step": 5092 + }, + { + "epoch": 1.8599598320248312, + "grad_norm": 0.9444168210029602, + "learning_rate": 7.499221154168545e-07, + "loss": 0.5236, + "step": 5093 + }, + { + "epoch": 1.8603249954354575, + "grad_norm": 0.8404966592788696, + "learning_rate": 7.460356833092963e-07, + "loss": 0.5286, + "step": 5094 + }, + { + "epoch": 1.8606901588460836, + "grad_norm": 0.7893446683883667, + "learning_rate": 7.421591565937647e-07, + "loss": 0.5523, + "step": 5095 + }, + { + "epoch": 1.8610553222567099, + "grad_norm": 0.8035286664962769, + "learning_rate": 7.3829253726458e-07, + "loss": 0.5595, + "step": 5096 + }, + { + "epoch": 1.861420485667336, + "grad_norm": 0.709818422794342, + "learning_rate": 7.344358273109575e-07, + "loss": 0.5464, + "step": 5097 + }, + { + "epoch": 1.8617856490779623, + "grad_norm": 0.8071439862251282, + "learning_rate": 7.305890287170236e-07, + "loss": 0.5075, + "step": 5098 + }, + { + "epoch": 1.8621508124885886, + "grad_norm": 1.1296306848526, + "learning_rate": 7.267521434618018e-07, + "loss": 0.5022, + "step": 5099 + }, + { + "epoch": 1.862515975899215, + "grad_norm": 1.1329855918884277, + "learning_rate": 7.229251735192178e-07, + "loss": 0.5441, + "step": 5100 + }, + { + "epoch": 1.8628811393098412, + "grad_norm": 1.3989837169647217, + "learning_rate": 7.191081208580874e-07, + "loss": 0.498, + "step": 5101 + }, + { + "epoch": 1.8632463027204675, + "grad_norm": 1.5248597860336304, + "learning_rate": 7.153009874421357e-07, + "loss": 0.4777, + "step": 5102 + }, + { + "epoch": 1.8636114661310936, + "grad_norm": 0.9464701414108276, + "learning_rate": 7.11503775229978e-07, + "loss": 0.5377, + "step": 5103 + }, + { + "epoch": 1.86397662954172, + "grad_norm": 0.874332845211029, + "learning_rate": 7.077164861751318e-07, + "loss": 0.4953, + "step": 5104 + }, + { + "epoch": 1.864341792952346, + "grad_norm": 0.9725618362426758, + "learning_rate": 7.039391222260005e-07, + "loss": 0.5128, + "step": 5105 + }, + { + "epoch": 1.8647069563629723, + "grad_norm": 0.7084645628929138, + "learning_rate": 7.001716853258877e-07, + "loss": 0.5797, + "step": 5106 + }, + { + "epoch": 1.8650721197735987, + "grad_norm": 0.9991376399993896, + "learning_rate": 6.964141774129873e-07, + "loss": 0.53, + "step": 5107 + }, + { + "epoch": 1.865437283184225, + "grad_norm": 1.4431263208389282, + "learning_rate": 6.926666004203908e-07, + "loss": 0.5262, + "step": 5108 + }, + { + "epoch": 1.8658024465948513, + "grad_norm": 2.061160087585449, + "learning_rate": 6.889289562760738e-07, + "loss": 0.551, + "step": 5109 + }, + { + "epoch": 1.8661676100054776, + "grad_norm": 0.9618075489997864, + "learning_rate": 6.852012469029046e-07, + "loss": 0.5092, + "step": 5110 + }, + { + "epoch": 1.8665327734161037, + "grad_norm": 1.071441888809204, + "learning_rate": 6.814834742186361e-07, + "loss": 0.5159, + "step": 5111 + }, + { + "epoch": 1.86689793682673, + "grad_norm": 1.1133548021316528, + "learning_rate": 6.777756401359159e-07, + "loss": 0.5017, + "step": 5112 + }, + { + "epoch": 1.867263100237356, + "grad_norm": 1.0975421667099, + "learning_rate": 6.740777465622784e-07, + "loss": 0.5208, + "step": 5113 + }, + { + "epoch": 1.8676282636479824, + "grad_norm": 0.9239022135734558, + "learning_rate": 6.703897954001392e-07, + "loss": 0.5184, + "step": 5114 + }, + { + "epoch": 1.8679934270586087, + "grad_norm": 0.9000834226608276, + "learning_rate": 6.667117885468011e-07, + "loss": 0.5036, + "step": 5115 + }, + { + "epoch": 1.868358590469235, + "grad_norm": 0.8452208638191223, + "learning_rate": 6.630437278944501e-07, + "loss": 0.5719, + "step": 5116 + }, + { + "epoch": 1.8687237538798613, + "grad_norm": 1.1362179517745972, + "learning_rate": 6.59385615330157e-07, + "loss": 0.5406, + "step": 5117 + }, + { + "epoch": 1.8690889172904877, + "grad_norm": 0.7628581523895264, + "learning_rate": 6.557374527358762e-07, + "loss": 0.5428, + "step": 5118 + }, + { + "epoch": 1.8694540807011137, + "grad_norm": 0.836574912071228, + "learning_rate": 6.520992419884398e-07, + "loss": 0.551, + "step": 5119 + }, + { + "epoch": 1.86981924411174, + "grad_norm": 1.2260987758636475, + "learning_rate": 6.484709849595572e-07, + "loss": 0.4869, + "step": 5120 + }, + { + "epoch": 1.8701844075223661, + "grad_norm": 0.948838472366333, + "learning_rate": 6.448526835158264e-07, + "loss": 0.5066, + "step": 5121 + }, + { + "epoch": 1.8705495709329925, + "grad_norm": 0.7823359966278076, + "learning_rate": 6.41244339518714e-07, + "loss": 0.5254, + "step": 5122 + }, + { + "epoch": 1.8709147343436188, + "grad_norm": 1.5459065437316895, + "learning_rate": 6.37645954824575e-07, + "loss": 0.5058, + "step": 5123 + }, + { + "epoch": 1.871279897754245, + "grad_norm": 1.0922521352767944, + "learning_rate": 6.340575312846287e-07, + "loss": 0.5208, + "step": 5124 + }, + { + "epoch": 1.8716450611648714, + "grad_norm": 0.7133572697639465, + "learning_rate": 6.304790707449738e-07, + "loss": 0.5318, + "step": 5125 + }, + { + "epoch": 1.8720102245754975, + "grad_norm": 0.7148061990737915, + "learning_rate": 6.269105750465843e-07, + "loss": 0.5628, + "step": 5126 + }, + { + "epoch": 1.8723753879861238, + "grad_norm": 1.031386137008667, + "learning_rate": 6.233520460253117e-07, + "loss": 0.5663, + "step": 5127 + }, + { + "epoch": 1.8727405513967499, + "grad_norm": 0.8790083527565002, + "learning_rate": 6.198034855118784e-07, + "loss": 0.5196, + "step": 5128 + }, + { + "epoch": 1.8731057148073762, + "grad_norm": 0.8304159641265869, + "learning_rate": 6.162648953318684e-07, + "loss": 0.5443, + "step": 5129 + }, + { + "epoch": 1.8734708782180025, + "grad_norm": 1.0308268070220947, + "learning_rate": 6.1273627730575e-07, + "loss": 0.5205, + "step": 5130 + }, + { + "epoch": 1.8738360416286288, + "grad_norm": 0.9887058734893799, + "learning_rate": 6.092176332488553e-07, + "loss": 0.5578, + "step": 5131 + }, + { + "epoch": 1.8742012050392551, + "grad_norm": 0.7946871519088745, + "learning_rate": 6.057089649713832e-07, + "loss": 0.5652, + "step": 5132 + }, + { + "epoch": 1.8745663684498814, + "grad_norm": 1.2616994380950928, + "learning_rate": 6.022102742784075e-07, + "loss": 0.5202, + "step": 5133 + }, + { + "epoch": 1.8749315318605075, + "grad_norm": 1.0364519357681274, + "learning_rate": 5.987215629698595e-07, + "loss": 0.4841, + "step": 5134 + }, + { + "epoch": 1.8752966952711339, + "grad_norm": 0.867057740688324, + "learning_rate": 5.952428328405413e-07, + "loss": 0.5433, + "step": 5135 + }, + { + "epoch": 1.87566185868176, + "grad_norm": 0.9206724166870117, + "learning_rate": 5.917740856801235e-07, + "loss": 0.4821, + "step": 5136 + }, + { + "epoch": 1.8760270220923863, + "grad_norm": 0.737504243850708, + "learning_rate": 5.88315323273132e-07, + "loss": 0.5388, + "step": 5137 + }, + { + "epoch": 1.8763921855030126, + "grad_norm": 0.9644716382026672, + "learning_rate": 5.848665473989679e-07, + "loss": 0.5231, + "step": 5138 + }, + { + "epoch": 1.8767573489136389, + "grad_norm": 0.8571596741676331, + "learning_rate": 5.814277598318808e-07, + "loss": 0.5346, + "step": 5139 + }, + { + "epoch": 1.8771225123242652, + "grad_norm": 0.8317295908927917, + "learning_rate": 5.779989623409932e-07, + "loss": 0.5537, + "step": 5140 + }, + { + "epoch": 1.8774876757348915, + "grad_norm": 0.9167125225067139, + "learning_rate": 5.745801566902831e-07, + "loss": 0.5292, + "step": 5141 + }, + { + "epoch": 1.8778528391455176, + "grad_norm": 1.2240817546844482, + "learning_rate": 5.71171344638588e-07, + "loss": 0.4914, + "step": 5142 + }, + { + "epoch": 1.878218002556144, + "grad_norm": 0.9029586911201477, + "learning_rate": 5.677725279396096e-07, + "loss": 0.5697, + "step": 5143 + }, + { + "epoch": 1.87858316596677, + "grad_norm": 0.9542390704154968, + "learning_rate": 5.643837083418957e-07, + "loss": 0.5768, + "step": 5144 + }, + { + "epoch": 1.8789483293773963, + "grad_norm": 0.8885607719421387, + "learning_rate": 5.610048875888607e-07, + "loss": 0.5492, + "step": 5145 + }, + { + "epoch": 1.8793134927880226, + "grad_norm": 1.0386658906936646, + "learning_rate": 5.57636067418772e-07, + "loss": 0.5406, + "step": 5146 + }, + { + "epoch": 1.879678656198649, + "grad_norm": 0.7747313380241394, + "learning_rate": 5.542772495647563e-07, + "loss": 0.5815, + "step": 5147 + }, + { + "epoch": 1.8800438196092752, + "grad_norm": 1.0311294794082642, + "learning_rate": 5.509284357547873e-07, + "loss": 0.4989, + "step": 5148 + }, + { + "epoch": 1.8804089830199016, + "grad_norm": 0.6967086791992188, + "learning_rate": 5.475896277116954e-07, + "loss": 0.57, + "step": 5149 + }, + { + "epoch": 1.8807741464305276, + "grad_norm": 0.9049924612045288, + "learning_rate": 5.442608271531602e-07, + "loss": 0.5278, + "step": 5150 + }, + { + "epoch": 1.881139309841154, + "grad_norm": 0.9503490328788757, + "learning_rate": 5.409420357917205e-07, + "loss": 0.5592, + "step": 5151 + }, + { + "epoch": 1.88150447325178, + "grad_norm": 0.7792781591415405, + "learning_rate": 5.376332553347618e-07, + "loss": 0.5201, + "step": 5152 + }, + { + "epoch": 1.8818696366624064, + "grad_norm": 0.9307024478912354, + "learning_rate": 5.34334487484518e-07, + "loss": 0.4548, + "step": 5153 + }, + { + "epoch": 1.8822348000730327, + "grad_norm": 0.7584879398345947, + "learning_rate": 5.310457339380693e-07, + "loss": 0.5837, + "step": 5154 + }, + { + "epoch": 1.882599963483659, + "grad_norm": 0.6833974123001099, + "learning_rate": 5.277669963873489e-07, + "loss": 0.579, + "step": 5155 + }, + { + "epoch": 1.8829651268942853, + "grad_norm": 0.7575476169586182, + "learning_rate": 5.244982765191387e-07, + "loss": 0.5337, + "step": 5156 + }, + { + "epoch": 1.8833302903049116, + "grad_norm": 1.2384319305419922, + "learning_rate": 5.212395760150623e-07, + "loss": 0.5113, + "step": 5157 + }, + { + "epoch": 1.8836954537155377, + "grad_norm": 0.811079204082489, + "learning_rate": 5.1799089655159e-07, + "loss": 0.556, + "step": 5158 + }, + { + "epoch": 1.8840606171261638, + "grad_norm": 0.8482411503791809, + "learning_rate": 5.14752239800036e-07, + "loss": 0.5125, + "step": 5159 + }, + { + "epoch": 1.88442578053679, + "grad_norm": 1.042563557624817, + "learning_rate": 5.115236074265606e-07, + "loss": 0.4683, + "step": 5160 + }, + { + "epoch": 1.8847909439474164, + "grad_norm": 1.1850470304489136, + "learning_rate": 5.083050010921642e-07, + "loss": 0.5096, + "step": 5161 + }, + { + "epoch": 1.8851561073580427, + "grad_norm": 0.8976897597312927, + "learning_rate": 5.050964224526956e-07, + "loss": 0.5477, + "step": 5162 + }, + { + "epoch": 1.885521270768669, + "grad_norm": 1.336876392364502, + "learning_rate": 5.018978731588342e-07, + "loss": 0.4924, + "step": 5163 + }, + { + "epoch": 1.8858864341792954, + "grad_norm": 0.8368218541145325, + "learning_rate": 4.987093548561062e-07, + "loss": 0.506, + "step": 5164 + }, + { + "epoch": 1.8862515975899214, + "grad_norm": 0.8556219935417175, + "learning_rate": 4.95530869184877e-07, + "loss": 0.5411, + "step": 5165 + }, + { + "epoch": 1.8866167610005478, + "grad_norm": 1.0150723457336426, + "learning_rate": 4.923624177803498e-07, + "loss": 0.5634, + "step": 5166 + }, + { + "epoch": 1.8869819244111738, + "grad_norm": 0.8867017030715942, + "learning_rate": 4.892040022725675e-07, + "loss": 0.5142, + "step": 5167 + }, + { + "epoch": 1.8873470878218002, + "grad_norm": 0.9332976937294006, + "learning_rate": 4.860556242864034e-07, + "loss": 0.5117, + "step": 5168 + }, + { + "epoch": 1.8877122512324265, + "grad_norm": 3.3063771724700928, + "learning_rate": 4.829172854415775e-07, + "loss": 0.5136, + "step": 5169 + }, + { + "epoch": 1.8880774146430528, + "grad_norm": 0.8885701298713684, + "learning_rate": 4.79788987352634e-07, + "loss": 0.5219, + "step": 5170 + }, + { + "epoch": 1.888442578053679, + "grad_norm": 1.0955491065979004, + "learning_rate": 4.7667073162896315e-07, + "loss": 0.504, + "step": 5171 + }, + { + "epoch": 1.8888077414643054, + "grad_norm": 1.0627782344818115, + "learning_rate": 4.7356251987477507e-07, + "loss": 0.555, + "step": 5172 + }, + { + "epoch": 1.8891729048749315, + "grad_norm": 0.6565744876861572, + "learning_rate": 4.7046435368912404e-07, + "loss": 0.5494, + "step": 5173 + }, + { + "epoch": 1.8895380682855578, + "grad_norm": 0.8472015857696533, + "learning_rate": 4.6737623466589055e-07, + "loss": 0.5099, + "step": 5174 + }, + { + "epoch": 1.889903231696184, + "grad_norm": 1.0763630867004395, + "learning_rate": 4.642981643937905e-07, + "loss": 0.5105, + "step": 5175 + }, + { + "epoch": 1.8902683951068102, + "grad_norm": 0.6906781196594238, + "learning_rate": 4.6123014445636605e-07, + "loss": 0.5608, + "step": 5176 + }, + { + "epoch": 1.8906335585174365, + "grad_norm": 0.9713335633277893, + "learning_rate": 4.581721764319924e-07, + "loss": 0.515, + "step": 5177 + }, + { + "epoch": 1.8909987219280628, + "grad_norm": 1.1435459852218628, + "learning_rate": 4.5512426189386674e-07, + "loss": 0.5635, + "step": 5178 + }, + { + "epoch": 1.8913638853386892, + "grad_norm": 0.9018896818161011, + "learning_rate": 4.520864024100191e-07, + "loss": 0.5465, + "step": 5179 + }, + { + "epoch": 1.8917290487493155, + "grad_norm": 0.7561548948287964, + "learning_rate": 4.4905859954331057e-07, + "loss": 0.5474, + "step": 5180 + }, + { + "epoch": 1.8920942121599416, + "grad_norm": 1.0169223546981812, + "learning_rate": 4.460408548514239e-07, + "loss": 0.5194, + "step": 5181 + }, + { + "epoch": 1.8924593755705679, + "grad_norm": 1.0248956680297852, + "learning_rate": 4.4303316988686396e-07, + "loss": 0.5539, + "step": 5182 + }, + { + "epoch": 1.892824538981194, + "grad_norm": 0.9140022993087769, + "learning_rate": 4.400355461969663e-07, + "loss": 0.5504, + "step": 5183 + }, + { + "epoch": 1.8931897023918203, + "grad_norm": 0.8542696833610535, + "learning_rate": 4.3704798532388624e-07, + "loss": 0.5561, + "step": 5184 + }, + { + "epoch": 1.8935548658024466, + "grad_norm": 1.0125644207000732, + "learning_rate": 4.3407048880460765e-07, + "loss": 0.5264, + "step": 5185 + }, + { + "epoch": 1.893920029213073, + "grad_norm": 1.2030428647994995, + "learning_rate": 4.311030581709297e-07, + "loss": 0.535, + "step": 5186 + }, + { + "epoch": 1.8942851926236992, + "grad_norm": 2.7859158515930176, + "learning_rate": 4.281456949494778e-07, + "loss": 0.4829, + "step": 5187 + }, + { + "epoch": 1.8946503560343255, + "grad_norm": 0.9519990086555481, + "learning_rate": 4.2519840066169493e-07, + "loss": 0.5758, + "step": 5188 + }, + { + "epoch": 1.8950155194449516, + "grad_norm": 0.8662074208259583, + "learning_rate": 4.222611768238505e-07, + "loss": 0.5784, + "step": 5189 + }, + { + "epoch": 1.895380682855578, + "grad_norm": 0.9285888075828552, + "learning_rate": 4.1933402494702235e-07, + "loss": 0.4911, + "step": 5190 + }, + { + "epoch": 1.895745846266204, + "grad_norm": 1.0755295753479004, + "learning_rate": 4.164169465371148e-07, + "loss": 0.4974, + "step": 5191 + }, + { + "epoch": 1.8961110096768303, + "grad_norm": 0.8906064629554749, + "learning_rate": 4.135099430948475e-07, + "loss": 0.5153, + "step": 5192 + }, + { + "epoch": 1.8964761730874566, + "grad_norm": 1.133794903755188, + "learning_rate": 4.106130161157595e-07, + "loss": 0.5216, + "step": 5193 + }, + { + "epoch": 1.896841336498083, + "grad_norm": 1.1784135103225708, + "learning_rate": 4.077261670901989e-07, + "loss": 0.525, + "step": 5194 + }, + { + "epoch": 1.8972064999087093, + "grad_norm": 0.8882606029510498, + "learning_rate": 4.0484939750333743e-07, + "loss": 0.5609, + "step": 5195 + }, + { + "epoch": 1.8975716633193354, + "grad_norm": 0.8074411153793335, + "learning_rate": 4.01982708835158e-07, + "loss": 0.5096, + "step": 5196 + }, + { + "epoch": 1.8979368267299617, + "grad_norm": 0.8147451281547546, + "learning_rate": 3.991261025604543e-07, + "loss": 0.5411, + "step": 5197 + }, + { + "epoch": 1.8983019901405878, + "grad_norm": 1.0759518146514893, + "learning_rate": 3.9627958014883725e-07, + "loss": 0.5085, + "step": 5198 + }, + { + "epoch": 1.898667153551214, + "grad_norm": 0.7098891139030457, + "learning_rate": 3.9344314306472674e-07, + "loss": 0.5232, + "step": 5199 + }, + { + "epoch": 1.8990323169618404, + "grad_norm": 1.0414230823516846, + "learning_rate": 3.9061679276735986e-07, + "loss": 0.4982, + "step": 5200 + }, + { + "epoch": 1.8993974803724667, + "grad_norm": 0.7669789791107178, + "learning_rate": 3.878005307107735e-07, + "loss": 0.5128, + "step": 5201 + }, + { + "epoch": 1.899762643783093, + "grad_norm": 0.6906653642654419, + "learning_rate": 3.849943583438287e-07, + "loss": 0.5851, + "step": 5202 + }, + { + "epoch": 1.9001278071937193, + "grad_norm": 0.7735735177993774, + "learning_rate": 3.8219827711018397e-07, + "loss": 0.5872, + "step": 5203 + }, + { + "epoch": 1.9004929706043454, + "grad_norm": 0.9029932618141174, + "learning_rate": 3.794122884483131e-07, + "loss": 0.5286, + "step": 5204 + }, + { + "epoch": 1.9008581340149717, + "grad_norm": 1.4787055253982544, + "learning_rate": 3.7663639379149406e-07, + "loss": 0.5121, + "step": 5205 + }, + { + "epoch": 1.9012232974255978, + "grad_norm": 0.8257893919944763, + "learning_rate": 3.738705945678134e-07, + "loss": 0.5512, + "step": 5206 + }, + { + "epoch": 1.9015884608362241, + "grad_norm": 1.0745633840560913, + "learning_rate": 3.7111489220016617e-07, + "loss": 0.4923, + "step": 5207 + }, + { + "epoch": 1.9019536242468504, + "grad_norm": 1.098933458328247, + "learning_rate": 3.6836928810624506e-07, + "loss": 0.5365, + "step": 5208 + }, + { + "epoch": 1.9023187876574768, + "grad_norm": 1.2179603576660156, + "learning_rate": 3.656337836985602e-07, + "loss": 0.5296, + "step": 5209 + }, + { + "epoch": 1.902683951068103, + "grad_norm": 1.182113528251648, + "learning_rate": 3.629083803844147e-07, + "loss": 0.483, + "step": 5210 + }, + { + "epoch": 1.9030491144787294, + "grad_norm": 1.2880009412765503, + "learning_rate": 3.6019307956592034e-07, + "loss": 0.5182, + "step": 5211 + }, + { + "epoch": 1.9034142778893555, + "grad_norm": 0.8011050224304199, + "learning_rate": 3.5748788263998855e-07, + "loss": 0.5498, + "step": 5212 + }, + { + "epoch": 1.9037794412999818, + "grad_norm": 1.1082504987716675, + "learning_rate": 3.547927909983373e-07, + "loss": 0.564, + "step": 5213 + }, + { + "epoch": 1.9041446047106079, + "grad_norm": 0.8854731321334839, + "learning_rate": 3.521078060274841e-07, + "loss": 0.5016, + "step": 5214 + }, + { + "epoch": 1.9045097681212342, + "grad_norm": 0.816261351108551, + "learning_rate": 3.4943292910874173e-07, + "loss": 0.5451, + "step": 5215 + }, + { + "epoch": 1.9048749315318605, + "grad_norm": 0.8715974688529968, + "learning_rate": 3.4676816161822947e-07, + "loss": 0.485, + "step": 5216 + }, + { + "epoch": 1.9052400949424868, + "grad_norm": 0.9326238036155701, + "learning_rate": 3.4411350492686404e-07, + "loss": 0.5318, + "step": 5217 + }, + { + "epoch": 1.9056052583531131, + "grad_norm": 1.1741446256637573, + "learning_rate": 3.4146896040035514e-07, + "loss": 0.4912, + "step": 5218 + }, + { + "epoch": 1.9059704217637394, + "grad_norm": 0.9231522083282471, + "learning_rate": 3.3883452939922123e-07, + "loss": 0.4973, + "step": 5219 + }, + { + "epoch": 1.9063355851743655, + "grad_norm": 0.7504870295524597, + "learning_rate": 3.3621021327876923e-07, + "loss": 0.5522, + "step": 5220 + }, + { + "epoch": 1.9067007485849918, + "grad_norm": 0.8062773942947388, + "learning_rate": 3.3359601338910143e-07, + "loss": 0.5267, + "step": 5221 + }, + { + "epoch": 1.907065911995618, + "grad_norm": 0.8056736588478088, + "learning_rate": 3.3099193107512197e-07, + "loss": 0.5195, + "step": 5222 + }, + { + "epoch": 1.9074310754062442, + "grad_norm": 0.86921226978302, + "learning_rate": 3.283979676765259e-07, + "loss": 0.5514, + "step": 5223 + }, + { + "epoch": 1.9077962388168705, + "grad_norm": 0.8477928042411804, + "learning_rate": 3.258141245278057e-07, + "loss": 0.5361, + "step": 5224 + }, + { + "epoch": 1.9081614022274969, + "grad_norm": 0.6933594346046448, + "learning_rate": 3.2324040295824033e-07, + "loss": 0.556, + "step": 5225 + }, + { + "epoch": 1.9085265656381232, + "grad_norm": 0.7396385073661804, + "learning_rate": 3.2067680429190617e-07, + "loss": 0.6044, + "step": 5226 + }, + { + "epoch": 1.9088917290487493, + "grad_norm": 0.843963623046875, + "learning_rate": 3.181233298476771e-07, + "loss": 0.5346, + "step": 5227 + }, + { + "epoch": 1.9092568924593756, + "grad_norm": 0.7993393540382385, + "learning_rate": 3.1557998093920904e-07, + "loss": 0.5474, + "step": 5228 + }, + { + "epoch": 1.9096220558700017, + "grad_norm": 1.0452121496200562, + "learning_rate": 3.130467588749553e-07, + "loss": 0.5049, + "step": 5229 + }, + { + "epoch": 1.909987219280628, + "grad_norm": 0.920778214931488, + "learning_rate": 3.105236649581556e-07, + "loss": 0.5573, + "step": 5230 + }, + { + "epoch": 1.9103523826912543, + "grad_norm": 0.6959015727043152, + "learning_rate": 3.0801070048684046e-07, + "loss": 0.5451, + "step": 5231 + }, + { + "epoch": 1.9107175461018806, + "grad_norm": 0.9710888862609863, + "learning_rate": 3.055078667538292e-07, + "loss": 0.5067, + "step": 5232 + }, + { + "epoch": 1.911082709512507, + "grad_norm": 0.9164549112319946, + "learning_rate": 3.0301516504672944e-07, + "loss": 0.5367, + "step": 5233 + }, + { + "epoch": 1.9114478729231332, + "grad_norm": 0.9263083338737488, + "learning_rate": 3.0053259664793997e-07, + "loss": 0.5489, + "step": 5234 + }, + { + "epoch": 1.9118130363337593, + "grad_norm": 1.0650849342346191, + "learning_rate": 2.980601628346347e-07, + "loss": 0.5145, + "step": 5235 + }, + { + "epoch": 1.9121781997443856, + "grad_norm": 0.8558089733123779, + "learning_rate": 2.9559786487878716e-07, + "loss": 0.5443, + "step": 5236 + }, + { + "epoch": 1.9125433631550117, + "grad_norm": 0.9622271656990051, + "learning_rate": 2.931457040471508e-07, + "loss": 0.5552, + "step": 5237 + }, + { + "epoch": 1.912908526565638, + "grad_norm": 1.4848085641860962, + "learning_rate": 2.907036816012609e-07, + "loss": 0.5295, + "step": 5238 + }, + { + "epoch": 1.9132736899762643, + "grad_norm": 0.7723660469055176, + "learning_rate": 2.882717987974437e-07, + "loss": 0.5229, + "step": 5239 + }, + { + "epoch": 1.9136388533868907, + "grad_norm": 0.8148236274719238, + "learning_rate": 2.85850056886805e-07, + "loss": 0.5573, + "step": 5240 + }, + { + "epoch": 1.914004016797517, + "grad_norm": 0.6501472592353821, + "learning_rate": 2.834384571152282e-07, + "loss": 0.5251, + "step": 5241 + }, + { + "epoch": 1.9143691802081433, + "grad_norm": 1.1822669506072998, + "learning_rate": 2.8103700072339203e-07, + "loss": 0.5328, + "step": 5242 + }, + { + "epoch": 1.9147343436187694, + "grad_norm": 0.7121161222457886, + "learning_rate": 2.7864568894674593e-07, + "loss": 0.5369, + "step": 5243 + }, + { + "epoch": 1.9150995070293957, + "grad_norm": 0.8092657327651978, + "learning_rate": 2.7626452301552586e-07, + "loss": 0.5597, + "step": 5244 + }, + { + "epoch": 1.9154646704400218, + "grad_norm": 0.7837401032447815, + "learning_rate": 2.7389350415474305e-07, + "loss": 0.5346, + "step": 5245 + }, + { + "epoch": 1.915829833850648, + "grad_norm": 0.9305130243301392, + "learning_rate": 2.715326335841906e-07, + "loss": 0.5218, + "step": 5246 + }, + { + "epoch": 1.9161949972612744, + "grad_norm": 0.9589447975158691, + "learning_rate": 2.691819125184458e-07, + "loss": 0.5131, + "step": 5247 + }, + { + "epoch": 1.9165601606719007, + "grad_norm": 1.0760564804077148, + "learning_rate": 2.668413421668592e-07, + "loss": 0.5446, + "step": 5248 + }, + { + "epoch": 1.916925324082527, + "grad_norm": 0.7688742280006409, + "learning_rate": 2.645109237335608e-07, + "loss": 0.5536, + "step": 5249 + }, + { + "epoch": 1.9172904874931533, + "grad_norm": 0.8162276744842529, + "learning_rate": 2.6219065841745383e-07, + "loss": 0.5089, + "step": 5250 + }, + { + "epoch": 1.9176556509037794, + "grad_norm": 0.7043295502662659, + "learning_rate": 2.5988054741222345e-07, + "loss": 0.5647, + "step": 5251 + }, + { + "epoch": 1.9180208143144057, + "grad_norm": 0.8645861744880676, + "learning_rate": 2.5758059190633233e-07, + "loss": 0.5314, + "step": 5252 + }, + { + "epoch": 1.9183859777250318, + "grad_norm": 0.6989781260490417, + "learning_rate": 2.5529079308301174e-07, + "loss": 0.53, + "step": 5253 + }, + { + "epoch": 1.9187511411356581, + "grad_norm": 0.7858476638793945, + "learning_rate": 2.530111521202727e-07, + "loss": 0.5229, + "step": 5254 + }, + { + "epoch": 1.9191163045462845, + "grad_norm": 0.8875958919525146, + "learning_rate": 2.5074167019089714e-07, + "loss": 0.5294, + "step": 5255 + }, + { + "epoch": 1.9194814679569108, + "grad_norm": 0.9924106001853943, + "learning_rate": 2.484823484624466e-07, + "loss": 0.5052, + "step": 5256 + }, + { + "epoch": 1.919846631367537, + "grad_norm": 0.8700651526451111, + "learning_rate": 2.462331880972468e-07, + "loss": 0.5251, + "step": 5257 + }, + { + "epoch": 1.9202117947781634, + "grad_norm": 1.09417724609375, + "learning_rate": 2.4399419025240344e-07, + "loss": 0.4907, + "step": 5258 + }, + { + "epoch": 1.9205769581887895, + "grad_norm": 0.8320743441581726, + "learning_rate": 2.4176535607978835e-07, + "loss": 0.4963, + "step": 5259 + }, + { + "epoch": 1.9209421215994156, + "grad_norm": 0.9074090123176575, + "learning_rate": 2.3954668672604874e-07, + "loss": 0.528, + "step": 5260 + }, + { + "epoch": 1.9213072850100419, + "grad_norm": 0.9395208954811096, + "learning_rate": 2.373381833326027e-07, + "loss": 0.4746, + "step": 5261 + }, + { + "epoch": 1.9216724484206682, + "grad_norm": 0.9719210267066956, + "learning_rate": 2.3513984703563476e-07, + "loss": 0.533, + "step": 5262 + }, + { + "epoch": 1.9220376118312945, + "grad_norm": 0.9111335277557373, + "learning_rate": 2.3295167896610016e-07, + "loss": 0.5145, + "step": 5263 + }, + { + "epoch": 1.9224027752419208, + "grad_norm": 0.6564895510673523, + "learning_rate": 2.3077368024972514e-07, + "loss": 0.5348, + "step": 5264 + }, + { + "epoch": 1.9227679386525471, + "grad_norm": 0.8111744523048401, + "learning_rate": 2.2860585200700226e-07, + "loss": 0.5407, + "step": 5265 + }, + { + "epoch": 1.9231331020631732, + "grad_norm": 0.8375979661941528, + "learning_rate": 2.2644819535319051e-07, + "loss": 0.5314, + "step": 5266 + }, + { + "epoch": 1.9234982654737995, + "grad_norm": 1.4540085792541504, + "learning_rate": 2.2430071139832198e-07, + "loss": 0.5235, + "step": 5267 + }, + { + "epoch": 1.9238634288844256, + "grad_norm": 0.8702842593193054, + "learning_rate": 2.2216340124718626e-07, + "loss": 0.4963, + "step": 5268 + }, + { + "epoch": 1.924228592295052, + "grad_norm": 0.8858942985534668, + "learning_rate": 2.2003626599934602e-07, + "loss": 0.5234, + "step": 5269 + }, + { + "epoch": 1.9245937557056783, + "grad_norm": 0.7983465194702148, + "learning_rate": 2.1791930674912587e-07, + "loss": 0.4987, + "step": 5270 + }, + { + "epoch": 1.9249589191163046, + "grad_norm": 0.9610999822616577, + "learning_rate": 2.1581252458561684e-07, + "loss": 0.4934, + "step": 5271 + }, + { + "epoch": 1.9253240825269309, + "grad_norm": 0.8956314921379089, + "learning_rate": 2.137159205926742e-07, + "loss": 0.5299, + "step": 5272 + }, + { + "epoch": 1.9256892459375572, + "grad_norm": 0.9054751396179199, + "learning_rate": 2.1162949584891512e-07, + "loss": 0.588, + "step": 5273 + }, + { + "epoch": 1.9260544093481833, + "grad_norm": 0.7674873471260071, + "learning_rate": 2.095532514277232e-07, + "loss": 0.5629, + "step": 5274 + }, + { + "epoch": 1.9264195727588096, + "grad_norm": 0.8046696782112122, + "learning_rate": 2.0748718839724403e-07, + "loss": 0.4931, + "step": 5275 + }, + { + "epoch": 1.9267847361694357, + "grad_norm": 1.0259958505630493, + "learning_rate": 2.0543130782037845e-07, + "loss": 0.5065, + "step": 5276 + }, + { + "epoch": 1.927149899580062, + "grad_norm": 0.9353147745132446, + "learning_rate": 2.0338561075480269e-07, + "loss": 0.5138, + "step": 5277 + }, + { + "epoch": 1.9275150629906883, + "grad_norm": 0.8617286682128906, + "learning_rate": 2.0135009825293928e-07, + "loss": 0.5269, + "step": 5278 + }, + { + "epoch": 1.9278802264013146, + "grad_norm": 0.8524954915046692, + "learning_rate": 1.9932477136197949e-07, + "loss": 0.5413, + "step": 5279 + }, + { + "epoch": 1.928245389811941, + "grad_norm": 0.9233561754226685, + "learning_rate": 1.9730963112387425e-07, + "loss": 0.5186, + "step": 5280 + }, + { + "epoch": 1.9286105532225672, + "grad_norm": 0.7263604402542114, + "learning_rate": 1.9530467857532986e-07, + "loss": 0.5771, + "step": 5281 + }, + { + "epoch": 1.9289757166331933, + "grad_norm": 0.7217230796813965, + "learning_rate": 1.93309914747819e-07, + "loss": 0.5419, + "step": 5282 + }, + { + "epoch": 1.9293408800438197, + "grad_norm": 0.9343338012695312, + "learning_rate": 1.9132534066756304e-07, + "loss": 0.5175, + "step": 5283 + }, + { + "epoch": 1.9297060434544457, + "grad_norm": 0.7075455784797668, + "learning_rate": 1.8935095735554522e-07, + "loss": 0.5212, + "step": 5284 + }, + { + "epoch": 1.930071206865072, + "grad_norm": 0.7148510217666626, + "learning_rate": 1.8738676582750638e-07, + "loss": 0.5789, + "step": 5285 + }, + { + "epoch": 1.9304363702756984, + "grad_norm": 1.1724885702133179, + "learning_rate": 1.854327670939471e-07, + "loss": 0.5462, + "step": 5286 + }, + { + "epoch": 1.9308015336863247, + "grad_norm": 0.9485020637512207, + "learning_rate": 1.8348896216012102e-07, + "loss": 0.509, + "step": 5287 + }, + { + "epoch": 1.931166697096951, + "grad_norm": 1.7501707077026367, + "learning_rate": 1.8155535202603712e-07, + "loss": 0.5125, + "step": 5288 + }, + { + "epoch": 1.9315318605075773, + "grad_norm": 1.040753960609436, + "learning_rate": 1.796319376864597e-07, + "loss": 0.4997, + "step": 5289 + }, + { + "epoch": 1.9318970239182034, + "grad_norm": 1.0983657836914062, + "learning_rate": 1.7771872013090608e-07, + "loss": 0.5547, + "step": 5290 + }, + { + "epoch": 1.9322621873288297, + "grad_norm": 1.2510367631912231, + "learning_rate": 1.7581570034365557e-07, + "loss": 0.5208, + "step": 5291 + }, + { + "epoch": 1.9326273507394558, + "grad_norm": 1.0194956064224243, + "learning_rate": 1.7392287930373175e-07, + "loss": 0.5549, + "step": 5292 + }, + { + "epoch": 1.932992514150082, + "grad_norm": 0.9619592428207397, + "learning_rate": 1.7204025798491342e-07, + "loss": 0.5373, + "step": 5293 + }, + { + "epoch": 1.9333576775607084, + "grad_norm": 0.7508404850959778, + "learning_rate": 1.7016783735573693e-07, + "loss": 0.5258, + "step": 5294 + }, + { + "epoch": 1.9337228409713347, + "grad_norm": 1.070589542388916, + "learning_rate": 1.6830561837948735e-07, + "loss": 0.5208, + "step": 5295 + }, + { + "epoch": 1.934088004381961, + "grad_norm": 0.9107255339622498, + "learning_rate": 1.6645360201420046e-07, + "loss": 0.5433, + "step": 5296 + }, + { + "epoch": 1.9344531677925871, + "grad_norm": 0.9078409075737, + "learning_rate": 1.646117892126653e-07, + "loss": 0.5283, + "step": 5297 + }, + { + "epoch": 1.9348183312032134, + "grad_norm": 0.9286045432090759, + "learning_rate": 1.6278018092241943e-07, + "loss": 0.5377, + "step": 5298 + }, + { + "epoch": 1.9351834946138395, + "grad_norm": 0.6777305603027344, + "learning_rate": 1.6095877808575133e-07, + "loss": 0.5591, + "step": 5299 + }, + { + "epoch": 1.9355486580244659, + "grad_norm": 0.869694173336029, + "learning_rate": 1.5914758163970033e-07, + "loss": 0.5428, + "step": 5300 + }, + { + "epoch": 1.9359138214350922, + "grad_norm": 1.2617483139038086, + "learning_rate": 1.5734659251605666e-07, + "loss": 0.5204, + "step": 5301 + }, + { + "epoch": 1.9362789848457185, + "grad_norm": 0.9180070161819458, + "learning_rate": 1.5555581164135468e-07, + "loss": 0.5474, + "step": 5302 + }, + { + "epoch": 1.9366441482563448, + "grad_norm": 0.7689355611801147, + "learning_rate": 1.537752399368797e-07, + "loss": 0.5398, + "step": 5303 + }, + { + "epoch": 1.937009311666971, + "grad_norm": 0.9898827075958252, + "learning_rate": 1.520048783186634e-07, + "loss": 0.51, + "step": 5304 + }, + { + "epoch": 1.9373744750775972, + "grad_norm": 0.7218833565711975, + "learning_rate": 1.502447276974861e-07, + "loss": 0.5702, + "step": 5305 + }, + { + "epoch": 1.9377396384882235, + "grad_norm": 0.7822741866111755, + "learning_rate": 1.484947889788768e-07, + "loss": 0.5656, + "step": 5306 + }, + { + "epoch": 1.9381048018988496, + "grad_norm": 0.9823201894760132, + "learning_rate": 1.4675506306310873e-07, + "loss": 0.5863, + "step": 5307 + }, + { + "epoch": 1.938469965309476, + "grad_norm": 1.0413130521774292, + "learning_rate": 1.4502555084519698e-07, + "loss": 0.5248, + "step": 5308 + }, + { + "epoch": 1.9388351287201022, + "grad_norm": 1.0553734302520752, + "learning_rate": 1.4330625321490988e-07, + "loss": 0.4869, + "step": 5309 + }, + { + "epoch": 1.9392002921307285, + "grad_norm": 1.0827317237854004, + "learning_rate": 1.4159717105675542e-07, + "loss": 0.5398, + "step": 5310 + }, + { + "epoch": 1.9395654555413548, + "grad_norm": 1.0234954357147217, + "learning_rate": 1.3989830524999025e-07, + "loss": 0.5092, + "step": 5311 + }, + { + "epoch": 1.9399306189519812, + "grad_norm": 1.1130499839782715, + "learning_rate": 1.3820965666860865e-07, + "loss": 0.5089, + "step": 5312 + }, + { + "epoch": 1.9402957823626072, + "grad_norm": 0.9848282337188721, + "learning_rate": 1.3653122618135562e-07, + "loss": 0.5146, + "step": 5313 + }, + { + "epoch": 1.9406609457732336, + "grad_norm": 1.0807373523712158, + "learning_rate": 1.348630146517138e-07, + "loss": 0.527, + "step": 5314 + }, + { + "epoch": 1.9410261091838596, + "grad_norm": 0.9990309476852417, + "learning_rate": 1.3320502293791448e-07, + "loss": 0.5344, + "step": 5315 + }, + { + "epoch": 1.941391272594486, + "grad_norm": 1.0408899784088135, + "learning_rate": 1.3155725189292646e-07, + "loss": 0.5235, + "step": 5316 + }, + { + "epoch": 1.9417564360051123, + "grad_norm": 0.9949876666069031, + "learning_rate": 1.2991970236445828e-07, + "loss": 0.5322, + "step": 5317 + }, + { + "epoch": 1.9421215994157386, + "grad_norm": 0.745749294757843, + "learning_rate": 1.282923751949694e-07, + "loss": 0.5435, + "step": 5318 + }, + { + "epoch": 1.942486762826365, + "grad_norm": 0.7982057929039001, + "learning_rate": 1.2667527122165014e-07, + "loss": 0.5428, + "step": 5319 + }, + { + "epoch": 1.9428519262369912, + "grad_norm": 0.9611827731132507, + "learning_rate": 1.2506839127643943e-07, + "loss": 0.5186, + "step": 5320 + }, + { + "epoch": 1.9432170896476173, + "grad_norm": 0.8420519828796387, + "learning_rate": 1.2347173618600717e-07, + "loss": 0.5284, + "step": 5321 + }, + { + "epoch": 1.9435822530582436, + "grad_norm": 1.0904260873794556, + "learning_rate": 1.218853067717718e-07, + "loss": 0.4985, + "step": 5322 + }, + { + "epoch": 1.9439474164688697, + "grad_norm": 2.987138509750366, + "learning_rate": 1.2030910384988716e-07, + "loss": 0.5245, + "step": 5323 + }, + { + "epoch": 1.944312579879496, + "grad_norm": 0.9676865339279175, + "learning_rate": 1.1874312823124678e-07, + "loss": 0.5267, + "step": 5324 + }, + { + "epoch": 1.9446777432901223, + "grad_norm": 0.6485067009925842, + "learning_rate": 1.1718738072148184e-07, + "loss": 0.5727, + "step": 5325 + }, + { + "epoch": 1.9450429067007486, + "grad_norm": 1.0290840864181519, + "learning_rate": 1.1564186212096317e-07, + "loss": 0.5164, + "step": 5326 + }, + { + "epoch": 1.945408070111375, + "grad_norm": 0.7069109082221985, + "learning_rate": 1.1410657322479479e-07, + "loss": 0.5349, + "step": 5327 + }, + { + "epoch": 1.945773233522001, + "grad_norm": 0.7101110219955444, + "learning_rate": 1.1258151482282265e-07, + "loss": 0.5997, + "step": 5328 + }, + { + "epoch": 1.9461383969326274, + "grad_norm": 0.8071128726005554, + "learning_rate": 1.1106668769963025e-07, + "loss": 0.5282, + "step": 5329 + }, + { + "epoch": 1.9465035603432534, + "grad_norm": 0.7718363404273987, + "learning_rate": 1.0956209263453421e-07, + "loss": 0.5654, + "step": 5330 + }, + { + "epoch": 1.9468687237538798, + "grad_norm": 0.9163565635681152, + "learning_rate": 1.0806773040158647e-07, + "loss": 0.4722, + "step": 5331 + }, + { + "epoch": 1.947233887164506, + "grad_norm": 0.6575088500976562, + "learning_rate": 1.0658360176957871e-07, + "loss": 0.5298, + "step": 5332 + }, + { + "epoch": 1.9475990505751324, + "grad_norm": 0.9368276000022888, + "learning_rate": 1.0510970750203353e-07, + "loss": 0.4665, + "step": 5333 + }, + { + "epoch": 1.9479642139857587, + "grad_norm": 0.8309523463249207, + "learning_rate": 1.0364604835721325e-07, + "loss": 0.5255, + "step": 5334 + }, + { + "epoch": 1.948329377396385, + "grad_norm": 0.9614001512527466, + "learning_rate": 1.021926250881089e-07, + "loss": 0.5179, + "step": 5335 + }, + { + "epoch": 1.948694540807011, + "grad_norm": 0.9124447703361511, + "learning_rate": 1.0074943844245122e-07, + "loss": 0.5408, + "step": 5336 + }, + { + "epoch": 1.9490597042176374, + "grad_norm": 0.887027382850647, + "learning_rate": 9.931648916269965e-08, + "loss": 0.5436, + "step": 5337 + }, + { + "epoch": 1.9494248676282635, + "grad_norm": 0.9393956661224365, + "learning_rate": 9.789377798604894e-08, + "loss": 0.5057, + "step": 5338 + }, + { + "epoch": 1.9497900310388898, + "grad_norm": 0.8401802778244019, + "learning_rate": 9.648130564442915e-08, + "loss": 0.5381, + "step": 5339 + }, + { + "epoch": 1.9501551944495161, + "grad_norm": 1.0334709882736206, + "learning_rate": 9.507907286449903e-08, + "loss": 0.4995, + "step": 5340 + }, + { + "epoch": 1.9505203578601424, + "grad_norm": 0.9041337370872498, + "learning_rate": 9.368708036764818e-08, + "loss": 0.5391, + "step": 5341 + }, + { + "epoch": 1.9508855212707688, + "grad_norm": 1.1579806804656982, + "learning_rate": 9.230532887000598e-08, + "loss": 0.5238, + "step": 5342 + }, + { + "epoch": 1.951250684681395, + "grad_norm": 1.1014970541000366, + "learning_rate": 9.093381908242605e-08, + "loss": 0.5234, + "step": 5343 + }, + { + "epoch": 1.9516158480920212, + "grad_norm": 0.9048789739608765, + "learning_rate": 8.957255171049506e-08, + "loss": 0.5793, + "step": 5344 + }, + { + "epoch": 1.9519810115026475, + "grad_norm": 0.8284463286399841, + "learning_rate": 8.822152745453061e-08, + "loss": 0.5453, + "step": 5345 + }, + { + "epoch": 1.9523461749132736, + "grad_norm": 0.9747271537780762, + "learning_rate": 8.688074700958115e-08, + "loss": 0.4893, + "step": 5346 + }, + { + "epoch": 1.9527113383238999, + "grad_norm": 0.9729907512664795, + "learning_rate": 8.55502110654216e-08, + "loss": 0.5385, + "step": 5347 + }, + { + "epoch": 1.9530765017345262, + "grad_norm": 0.7343084216117859, + "learning_rate": 8.422992030656218e-08, + "loss": 0.5416, + "step": 5348 + }, + { + "epoch": 1.9534416651451525, + "grad_norm": 0.9176368713378906, + "learning_rate": 8.291987541223955e-08, + "loss": 0.5335, + "step": 5349 + }, + { + "epoch": 1.9538068285557788, + "grad_norm": 0.9123088121414185, + "learning_rate": 8.162007705641905e-08, + "loss": 0.5065, + "step": 5350 + }, + { + "epoch": 1.9541719919664051, + "grad_norm": 1.1829543113708496, + "learning_rate": 8.033052590779245e-08, + "loss": 0.5075, + "step": 5351 + }, + { + "epoch": 1.9545371553770312, + "grad_norm": 0.9668950438499451, + "learning_rate": 7.90512226297846e-08, + "loss": 0.4947, + "step": 5352 + }, + { + "epoch": 1.9549023187876575, + "grad_norm": 1.1358628273010254, + "learning_rate": 7.77821678805446e-08, + "loss": 0.5368, + "step": 5353 + }, + { + "epoch": 1.9552674821982836, + "grad_norm": 0.9124475121498108, + "learning_rate": 7.652336231295021e-08, + "loss": 0.4982, + "step": 5354 + }, + { + "epoch": 1.95563264560891, + "grad_norm": 0.8891622424125671, + "learning_rate": 7.527480657460562e-08, + "loss": 0.4711, + "step": 5355 + }, + { + "epoch": 1.9559978090195362, + "grad_norm": 2.195021629333496, + "learning_rate": 7.403650130784368e-08, + "loss": 0.5155, + "step": 5356 + }, + { + "epoch": 1.9563629724301626, + "grad_norm": 0.8266718983650208, + "learning_rate": 7.280844714972368e-08, + "loss": 0.5251, + "step": 5357 + }, + { + "epoch": 1.9567281358407889, + "grad_norm": 0.7959644794464111, + "learning_rate": 7.159064473202914e-08, + "loss": 0.5285, + "step": 5358 + }, + { + "epoch": 1.957093299251415, + "grad_norm": 0.9930187463760376, + "learning_rate": 7.038309468127225e-08, + "loss": 0.4782, + "step": 5359 + }, + { + "epoch": 1.9574584626620413, + "grad_norm": 1.3513890504837036, + "learning_rate": 6.918579761868493e-08, + "loss": 0.4846, + "step": 5360 + }, + { + "epoch": 1.9578236260726674, + "grad_norm": 0.8065437078475952, + "learning_rate": 6.799875416023005e-08, + "loss": 0.51, + "step": 5361 + }, + { + "epoch": 1.9581887894832937, + "grad_norm": 0.9978945851325989, + "learning_rate": 6.682196491659687e-08, + "loss": 0.4882, + "step": 5362 + }, + { + "epoch": 1.95855395289392, + "grad_norm": 0.8857904076576233, + "learning_rate": 6.565543049319445e-08, + "loss": 0.5433, + "step": 5363 + }, + { + "epoch": 1.9589191163045463, + "grad_norm": 0.8729395270347595, + "learning_rate": 6.449915149015828e-08, + "loss": 0.5188, + "step": 5364 + }, + { + "epoch": 1.9592842797151726, + "grad_norm": 1.0837210416793823, + "learning_rate": 6.335312850234365e-08, + "loss": 0.5128, + "step": 5365 + }, + { + "epoch": 1.959649443125799, + "grad_norm": 0.758825421333313, + "learning_rate": 6.221736211933893e-08, + "loss": 0.5235, + "step": 5366 + }, + { + "epoch": 1.960014606536425, + "grad_norm": 0.9562605619430542, + "learning_rate": 6.109185292544784e-08, + "loss": 0.5296, + "step": 5367 + }, + { + "epoch": 1.9603797699470513, + "grad_norm": 1.308916687965393, + "learning_rate": 5.99766014996983e-08, + "loss": 0.5097, + "step": 5368 + }, + { + "epoch": 1.9607449333576774, + "grad_norm": 1.2315113544464111, + "learning_rate": 5.887160841584472e-08, + "loss": 0.521, + "step": 5369 + }, + { + "epoch": 1.9611100967683037, + "grad_norm": 0.6987513899803162, + "learning_rate": 5.777687424236123e-08, + "loss": 0.5208, + "step": 5370 + }, + { + "epoch": 1.96147526017893, + "grad_norm": 0.820620059967041, + "learning_rate": 5.669239954244399e-08, + "loss": 0.5549, + "step": 5371 + }, + { + "epoch": 1.9618404235895563, + "grad_norm": 0.9430913925170898, + "learning_rate": 5.561818487401338e-08, + "loss": 0.5097, + "step": 5372 + }, + { + "epoch": 1.9622055870001827, + "grad_norm": 0.9805968999862671, + "learning_rate": 5.455423078970734e-08, + "loss": 0.5195, + "step": 5373 + }, + { + "epoch": 1.962570750410809, + "grad_norm": 1.0393964052200317, + "learning_rate": 5.350053783689024e-08, + "loss": 0.5257, + "step": 5374 + }, + { + "epoch": 1.962935913821435, + "grad_norm": 0.6415253281593323, + "learning_rate": 5.2457106557641803e-08, + "loss": 0.5567, + "step": 5375 + }, + { + "epoch": 1.9633010772320614, + "grad_norm": 0.987578809261322, + "learning_rate": 5.142393748876595e-08, + "loss": 0.5237, + "step": 5376 + }, + { + "epoch": 1.9636662406426875, + "grad_norm": 1.112883448600769, + "learning_rate": 5.040103116178863e-08, + "loss": 0.5076, + "step": 5377 + }, + { + "epoch": 1.9640314040533138, + "grad_norm": 0.9256656169891357, + "learning_rate": 4.938838810295554e-08, + "loss": 0.5236, + "step": 5378 + }, + { + "epoch": 1.96439656746394, + "grad_norm": 0.9695578217506409, + "learning_rate": 4.8386008833225526e-08, + "loss": 0.5601, + "step": 5379 + }, + { + "epoch": 1.9647617308745664, + "grad_norm": 0.7416371703147888, + "learning_rate": 4.739389386828608e-08, + "loss": 0.5699, + "step": 5380 + }, + { + "epoch": 1.9651268942851927, + "grad_norm": 1.8474271297454834, + "learning_rate": 4.6412043718540024e-08, + "loss": 0.5528, + "step": 5381 + }, + { + "epoch": 1.965492057695819, + "grad_norm": 1.0167008638381958, + "learning_rate": 4.544045888910997e-08, + "loss": 0.5311, + "step": 5382 + }, + { + "epoch": 1.9658572211064451, + "grad_norm": 0.8540982604026794, + "learning_rate": 4.447913987983832e-08, + "loss": 0.5136, + "step": 5383 + }, + { + "epoch": 1.9662223845170714, + "grad_norm": 0.8130719065666199, + "learning_rate": 4.352808718528279e-08, + "loss": 0.5834, + "step": 5384 + }, + { + "epoch": 1.9665875479276975, + "grad_norm": 2.0834383964538574, + "learning_rate": 4.2587301294723105e-08, + "loss": 0.5196, + "step": 5385 + }, + { + "epoch": 1.9669527113383238, + "grad_norm": 1.1500024795532227, + "learning_rate": 4.165678269215656e-08, + "loss": 0.5363, + "step": 5386 + }, + { + "epoch": 1.9673178747489501, + "grad_norm": 1.5207279920578003, + "learning_rate": 4.073653185629578e-08, + "loss": 0.5292, + "step": 5387 + }, + { + "epoch": 1.9676830381595765, + "grad_norm": 0.7735336422920227, + "learning_rate": 3.982654926057539e-08, + "loss": 0.5164, + "step": 5388 + }, + { + "epoch": 1.9680482015702028, + "grad_norm": 0.9119135141372681, + "learning_rate": 3.8926835373143125e-08, + "loss": 0.5292, + "step": 5389 + }, + { + "epoch": 1.968413364980829, + "grad_norm": 0.876446545124054, + "learning_rate": 3.803739065686651e-08, + "loss": 0.5468, + "step": 5390 + }, + { + "epoch": 1.9687785283914552, + "grad_norm": 0.9272382259368896, + "learning_rate": 3.7158215569326194e-08, + "loss": 0.5084, + "step": 5391 + }, + { + "epoch": 1.9691436918020815, + "grad_norm": 0.9327825307846069, + "learning_rate": 3.628931056282703e-08, + "loss": 0.5548, + "step": 5392 + }, + { + "epoch": 1.9695088552127076, + "grad_norm": 1.1887319087982178, + "learning_rate": 3.5430676084384775e-08, + "loss": 0.5042, + "step": 5393 + }, + { + "epoch": 1.9698740186233339, + "grad_norm": 1.0209521055221558, + "learning_rate": 3.4582312575728306e-08, + "loss": 0.4762, + "step": 5394 + }, + { + "epoch": 1.9702391820339602, + "grad_norm": 1.0543705224990845, + "learning_rate": 3.3744220473312937e-08, + "loss": 0.5107, + "step": 5395 + }, + { + "epoch": 1.9706043454445865, + "grad_norm": 0.7981095314025879, + "learning_rate": 3.291640020829823e-08, + "loss": 0.5123, + "step": 5396 + }, + { + "epoch": 1.9709695088552128, + "grad_norm": 0.9885851144790649, + "learning_rate": 3.2098852206567944e-08, + "loss": 0.5234, + "step": 5397 + }, + { + "epoch": 1.971334672265839, + "grad_norm": 0.9228920936584473, + "learning_rate": 3.1291576888714536e-08, + "loss": 0.5839, + "step": 5398 + }, + { + "epoch": 1.9716998356764652, + "grad_norm": 1.0322990417480469, + "learning_rate": 3.0494574670050236e-08, + "loss": 0.5578, + "step": 5399 + }, + { + "epoch": 1.9720649990870913, + "grad_norm": 1.2793372869491577, + "learning_rate": 2.970784596060261e-08, + "loss": 0.4935, + "step": 5400 + }, + { + "epoch": 1.9724301624977176, + "grad_norm": 0.8219887614250183, + "learning_rate": 2.8931391165107902e-08, + "loss": 0.5135, + "step": 5401 + }, + { + "epoch": 1.972795325908344, + "grad_norm": 1.2734088897705078, + "learning_rate": 2.816521068302658e-08, + "loss": 0.5017, + "step": 5402 + }, + { + "epoch": 1.9731604893189703, + "grad_norm": 0.7988722920417786, + "learning_rate": 2.740930490852334e-08, + "loss": 0.5737, + "step": 5403 + }, + { + "epoch": 1.9735256527295966, + "grad_norm": 1.2488044500350952, + "learning_rate": 2.6663674230482663e-08, + "loss": 0.5502, + "step": 5404 + }, + { + "epoch": 1.9738908161402229, + "grad_norm": 0.9395003318786621, + "learning_rate": 2.5928319032499928e-08, + "loss": 0.4857, + "step": 5405 + }, + { + "epoch": 1.974255979550849, + "grad_norm": 0.849014937877655, + "learning_rate": 2.520323969288807e-08, + "loss": 0.5157, + "step": 5406 + }, + { + "epoch": 1.9746211429614753, + "grad_norm": 0.9843391180038452, + "learning_rate": 2.4488436584670928e-08, + "loss": 0.521, + "step": 5407 + }, + { + "epoch": 1.9749863063721014, + "grad_norm": 1.1458913087844849, + "learning_rate": 2.378391007558767e-08, + "loss": 0.5219, + "step": 5408 + }, + { + "epoch": 1.9753514697827277, + "grad_norm": 1.136871099472046, + "learning_rate": 2.3089660528083923e-08, + "loss": 0.475, + "step": 5409 + }, + { + "epoch": 1.975716633193354, + "grad_norm": 0.814587414264679, + "learning_rate": 2.240568829932732e-08, + "loss": 0.5254, + "step": 5410 + }, + { + "epoch": 1.9760817966039803, + "grad_norm": 0.8695433139801025, + "learning_rate": 2.173199374119417e-08, + "loss": 0.5297, + "step": 5411 + }, + { + "epoch": 1.9764469600146066, + "grad_norm": 0.9298883676528931, + "learning_rate": 2.106857720027167e-08, + "loss": 0.5593, + "step": 5412 + }, + { + "epoch": 1.976812123425233, + "grad_norm": 0.7862787842750549, + "learning_rate": 2.041543901786236e-08, + "loss": 0.5323, + "step": 5413 + }, + { + "epoch": 1.977177286835859, + "grad_norm": 0.9040335416793823, + "learning_rate": 1.9772579529977463e-08, + "loss": 0.537, + "step": 5414 + }, + { + "epoch": 1.9775424502464853, + "grad_norm": 0.9744546413421631, + "learning_rate": 1.913999906734354e-08, + "loss": 0.4876, + "step": 5415 + }, + { + "epoch": 1.9779076136571114, + "grad_norm": 1.0703387260437012, + "learning_rate": 1.851769795540026e-08, + "loss": 0.5482, + "step": 5416 + }, + { + "epoch": 1.9782727770677377, + "grad_norm": 0.7839832305908203, + "learning_rate": 1.7905676514293757e-08, + "loss": 0.5452, + "step": 5417 + }, + { + "epoch": 1.978637940478364, + "grad_norm": 0.7852559685707092, + "learning_rate": 1.7303935058885502e-08, + "loss": 0.5621, + "step": 5418 + }, + { + "epoch": 1.9790031038889904, + "grad_norm": 0.9149523973464966, + "learning_rate": 1.6712473898745642e-08, + "loss": 0.5286, + "step": 5419 + }, + { + "epoch": 1.9793682672996167, + "grad_norm": 0.8258424997329712, + "learning_rate": 1.6131293338157438e-08, + "loss": 0.5724, + "step": 5420 + }, + { + "epoch": 1.979733430710243, + "grad_norm": 0.9903010129928589, + "learning_rate": 1.55603936761195e-08, + "loss": 0.5513, + "step": 5421 + }, + { + "epoch": 1.980098594120869, + "grad_norm": 1.2193019390106201, + "learning_rate": 1.4999775206330224e-08, + "loss": 0.4595, + "step": 5422 + }, + { + "epoch": 1.9804637575314954, + "grad_norm": 1.0448379516601562, + "learning_rate": 1.444943821721001e-08, + "loss": 0.5557, + "step": 5423 + }, + { + "epoch": 1.9808289209421215, + "grad_norm": 0.9651296138763428, + "learning_rate": 1.390938299188349e-08, + "loss": 0.5552, + "step": 5424 + }, + { + "epoch": 1.9811940843527478, + "grad_norm": 1.150664210319519, + "learning_rate": 1.337960980818842e-08, + "loss": 0.5406, + "step": 5425 + }, + { + "epoch": 1.981559247763374, + "grad_norm": 0.8297967910766602, + "learning_rate": 1.2860118938669008e-08, + "loss": 0.5714, + "step": 5426 + }, + { + "epoch": 1.9819244111740004, + "grad_norm": 0.8567715287208557, + "learning_rate": 1.2350910650587022e-08, + "loss": 0.5518, + "step": 5427 + }, + { + "epoch": 1.9822895745846267, + "grad_norm": 0.8794555068016052, + "learning_rate": 1.1851985205904026e-08, + "loss": 0.5276, + "step": 5428 + }, + { + "epoch": 1.9826547379952528, + "grad_norm": 0.735395073890686, + "learning_rate": 1.1363342861301363e-08, + "loss": 0.5508, + "step": 5429 + }, + { + "epoch": 1.9830199014058791, + "grad_norm": 0.86039799451828, + "learning_rate": 1.0884983868166832e-08, + "loss": 0.4865, + "step": 5430 + }, + { + "epoch": 1.9833850648165052, + "grad_norm": 1.0929391384124756, + "learning_rate": 1.0416908472592468e-08, + "loss": 0.473, + "step": 5431 + }, + { + "epoch": 1.9837502282271315, + "grad_norm": 0.8757457137107849, + "learning_rate": 9.959116915387868e-09, + "loss": 0.5257, + "step": 5432 + }, + { + "epoch": 1.9841153916377579, + "grad_norm": 1.0023913383483887, + "learning_rate": 9.51160943206686e-09, + "loss": 0.533, + "step": 5433 + }, + { + "epoch": 1.9844805550483842, + "grad_norm": 0.9719913601875305, + "learning_rate": 9.074386252854172e-09, + "loss": 0.4845, + "step": 5434 + }, + { + "epoch": 1.9848457184590105, + "grad_norm": 0.8217945098876953, + "learning_rate": 8.647447602683212e-09, + "loss": 0.5406, + "step": 5435 + }, + { + "epoch": 1.9852108818696368, + "grad_norm": 1.0762484073638916, + "learning_rate": 8.23079370119828e-09, + "loss": 0.4913, + "step": 5436 + }, + { + "epoch": 1.9855760452802629, + "grad_norm": 0.937466561794281, + "learning_rate": 7.824424762750137e-09, + "loss": 0.5121, + "step": 5437 + }, + { + "epoch": 1.9859412086908892, + "grad_norm": 0.7162231802940369, + "learning_rate": 7.428340996400441e-09, + "loss": 0.5219, + "step": 5438 + }, + { + "epoch": 1.9863063721015153, + "grad_norm": 0.8674814105033875, + "learning_rate": 7.042542605915081e-09, + "loss": 0.5443, + "step": 5439 + }, + { + "epoch": 1.9866715355121416, + "grad_norm": 0.9962090849876404, + "learning_rate": 6.667029789775292e-09, + "loss": 0.5136, + "step": 5440 + }, + { + "epoch": 1.987036698922768, + "grad_norm": 0.9544779658317566, + "learning_rate": 6.301802741166541e-09, + "loss": 0.5163, + "step": 5441 + }, + { + "epoch": 1.9874018623333942, + "grad_norm": 1.1398656368255615, + "learning_rate": 5.946861647982971e-09, + "loss": 0.5375, + "step": 5442 + }, + { + "epoch": 1.9877670257440205, + "grad_norm": 0.8499765396118164, + "learning_rate": 5.602206692827405e-09, + "loss": 0.4969, + "step": 5443 + }, + { + "epoch": 1.9881321891546468, + "grad_norm": 0.8699721097946167, + "learning_rate": 5.267838053011343e-09, + "loss": 0.5186, + "step": 5444 + }, + { + "epoch": 1.988497352565273, + "grad_norm": 0.5777058601379395, + "learning_rate": 4.943755900554958e-09, + "loss": 0.5463, + "step": 5445 + }, + { + "epoch": 1.9888625159758992, + "grad_norm": 0.7768506407737732, + "learning_rate": 4.629960402182665e-09, + "loss": 0.5792, + "step": 5446 + }, + { + "epoch": 1.9892276793865253, + "grad_norm": 0.8641973733901978, + "learning_rate": 4.326451719334213e-09, + "loss": 0.5445, + "step": 5447 + }, + { + "epoch": 1.9895928427971517, + "grad_norm": 1.0852634906768799, + "learning_rate": 4.033230008146927e-09, + "loss": 0.5142, + "step": 5448 + }, + { + "epoch": 1.989958006207778, + "grad_norm": 0.8240854740142822, + "learning_rate": 3.750295419475692e-09, + "loss": 0.5847, + "step": 5449 + }, + { + "epoch": 1.9903231696184043, + "grad_norm": 1.0229144096374512, + "learning_rate": 3.477648098879627e-09, + "loss": 0.5779, + "step": 5450 + }, + { + "epoch": 1.9906883330290306, + "grad_norm": 0.9161554574966431, + "learning_rate": 3.2152881866198695e-09, + "loss": 0.5316, + "step": 5451 + }, + { + "epoch": 1.991053496439657, + "grad_norm": 1.0170706510543823, + "learning_rate": 2.9632158176751134e-09, + "loss": 0.5254, + "step": 5452 + }, + { + "epoch": 1.991418659850283, + "grad_norm": 0.9410620927810669, + "learning_rate": 2.72143112172607e-09, + "loss": 0.548, + "step": 5453 + }, + { + "epoch": 1.9917838232609093, + "grad_norm": 1.0087649822235107, + "learning_rate": 2.489934223157686e-09, + "loss": 0.5626, + "step": 5454 + }, + { + "epoch": 1.9921489866715354, + "grad_norm": 0.9506940841674805, + "learning_rate": 2.268725241068026e-09, + "loss": 0.5505, + "step": 5455 + }, + { + "epoch": 1.9925141500821617, + "grad_norm": 0.9841387867927551, + "learning_rate": 2.0578042892616114e-09, + "loss": 0.523, + "step": 5456 + }, + { + "epoch": 1.992879313492788, + "grad_norm": 0.7301874756813049, + "learning_rate": 1.8571714762471993e-09, + "loss": 0.5695, + "step": 5457 + }, + { + "epoch": 1.9932444769034143, + "grad_norm": 1.060308575630188, + "learning_rate": 1.6668269052422248e-09, + "loss": 0.5644, + "step": 5458 + }, + { + "epoch": 1.9936096403140406, + "grad_norm": 0.9705191254615784, + "learning_rate": 1.4867706741727994e-09, + "loss": 0.5026, + "step": 5459 + }, + { + "epoch": 1.9939748037246667, + "grad_norm": 1.1569206714630127, + "learning_rate": 1.3170028756670506e-09, + "loss": 0.5347, + "step": 5460 + }, + { + "epoch": 1.994339967135293, + "grad_norm": 1.3227145671844482, + "learning_rate": 1.157523597068444e-09, + "loss": 0.5019, + "step": 5461 + }, + { + "epoch": 1.9947051305459191, + "grad_norm": 1.1881853342056274, + "learning_rate": 1.0083329204180204e-09, + "loss": 0.5177, + "step": 5462 + }, + { + "epoch": 1.9950702939565454, + "grad_norm": 0.8623504638671875, + "learning_rate": 8.694309224721586e-10, + "loss": 0.5263, + "step": 5463 + }, + { + "epoch": 1.9954354573671718, + "grad_norm": 0.7508077025413513, + "learning_rate": 7.408176746892537e-10, + "loss": 0.5711, + "step": 5464 + }, + { + "epoch": 1.995800620777798, + "grad_norm": 1.054730772972107, + "learning_rate": 6.224932432363773e-10, + "loss": 0.5274, + "step": 5465 + }, + { + "epoch": 1.9961657841884244, + "grad_norm": 0.898906946182251, + "learning_rate": 5.144576889826169e-10, + "loss": 0.5164, + "step": 5466 + }, + { + "epoch": 1.9965309475990507, + "grad_norm": 0.8694043159484863, + "learning_rate": 4.1671106751239866e-10, + "loss": 0.4893, + "step": 5467 + }, + { + "epoch": 1.9968961110096768, + "grad_norm": 0.9791451096534729, + "learning_rate": 3.2925342911216405e-10, + "loss": 0.5417, + "step": 5468 + }, + { + "epoch": 1.997261274420303, + "grad_norm": 0.8434933423995972, + "learning_rate": 2.5208481877259103e-10, + "loss": 0.554, + "step": 5469 + }, + { + "epoch": 1.9976264378309292, + "grad_norm": 0.9651261568069458, + "learning_rate": 1.8520527619747543e-10, + "loss": 0.542, + "step": 5470 + }, + { + "epoch": 1.9979916012415555, + "grad_norm": 0.9461495280265808, + "learning_rate": 1.2861483579040824e-10, + "loss": 0.5278, + "step": 5471 + }, + { + "epoch": 1.9983567646521818, + "grad_norm": 1.4563730955123901, + "learning_rate": 8.231352666587811e-11, + "loss": 0.5299, + "step": 5472 + }, + { + "epoch": 1.9987219280628081, + "grad_norm": 0.9893348217010498, + "learning_rate": 4.630137264483026e-11, + "loss": 0.5204, + "step": 5473 + }, + { + "epoch": 1.9990870914734344, + "grad_norm": 0.9429623484611511, + "learning_rate": 2.0578392252446066e-11, + "loss": 0.5097, + "step": 5474 + }, + { + "epoch": 1.9994522548840608, + "grad_norm": 0.7210927605628967, + "learning_rate": 5.1445987248044394e-12, + "loss": 0.5456, + "step": 5475 + }, + { + "epoch": 1.9998174182946868, + "grad_norm": 1.0174647569656372, + "learning_rate": 0.0, + "loss": 0.4995, + "step": 5476 + }, + { + "epoch": 1.9998174182946868, + "step": 5476, + "total_flos": 0.0, + "train_loss": 0.11873858405169646, + "train_runtime": 35260.5195, + "train_samples_per_second": 4.971, + "train_steps_per_second": 0.155 + } + ], + "logging_steps": 1.0, + "max_steps": 5476, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}