{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 4216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004743833017077799, "grad_norm": 12.117147445678711, "learning_rate": 1.5748031496062994e-07, "loss": 0.6555, "step": 1 }, { "epoch": 0.0009487666034155598, "grad_norm": 12.216944694519043, "learning_rate": 3.149606299212599e-07, "loss": 0.7042, "step": 2 }, { "epoch": 0.0014231499051233396, "grad_norm": 11.745461463928223, "learning_rate": 4.724409448818898e-07, "loss": 0.7974, "step": 3 }, { "epoch": 0.0018975332068311196, "grad_norm": 14.210430145263672, "learning_rate": 6.299212598425198e-07, "loss": 0.6868, "step": 4 }, { "epoch": 0.0023719165085388993, "grad_norm": 13.679463386535645, "learning_rate": 7.874015748031496e-07, "loss": 0.7468, "step": 5 }, { "epoch": 0.0028462998102466793, "grad_norm": 12.984026908874512, "learning_rate": 9.448818897637796e-07, "loss": 0.7443, "step": 6 }, { "epoch": 0.003320683111954459, "grad_norm": 15.170774459838867, "learning_rate": 1.1023622047244096e-06, "loss": 0.7255, "step": 7 }, { "epoch": 0.003795066413662239, "grad_norm": 12.011886596679688, "learning_rate": 1.2598425196850396e-06, "loss": 0.6684, "step": 8 }, { "epoch": 0.004269449715370019, "grad_norm": 29.63150405883789, "learning_rate": 1.4173228346456693e-06, "loss": 0.6935, "step": 9 }, { "epoch": 0.004743833017077799, "grad_norm": 12.307829856872559, "learning_rate": 1.5748031496062992e-06, "loss": 0.5554, "step": 10 }, { "epoch": 0.005218216318785579, "grad_norm": 15.281328201293945, "learning_rate": 1.7322834645669292e-06, "loss": 0.5863, "step": 11 }, { "epoch": 0.0056925996204933585, "grad_norm": 10.656610488891602, "learning_rate": 1.8897637795275591e-06, "loss": 0.4246, "step": 12 }, { "epoch": 0.006166982922201139, "grad_norm": 9.216360092163086, "learning_rate": 2.0472440944881893e-06, "loss": 0.3654, "step": 13 }, { "epoch": 0.006641366223908918, "grad_norm": 7.334237575531006, "learning_rate": 2.2047244094488192e-06, "loss": 0.3389, "step": 14 }, { "epoch": 0.007115749525616698, "grad_norm": 9.264925003051758, "learning_rate": 2.362204724409449e-06, "loss": 0.3582, "step": 15 }, { "epoch": 0.007590132827324478, "grad_norm": 7.431726455688477, "learning_rate": 2.519685039370079e-06, "loss": 0.3778, "step": 16 }, { "epoch": 0.008064516129032258, "grad_norm": 14.502622604370117, "learning_rate": 2.677165354330709e-06, "loss": 0.4209, "step": 17 }, { "epoch": 0.008538899430740038, "grad_norm": 8.857325553894043, "learning_rate": 2.8346456692913386e-06, "loss": 0.3823, "step": 18 }, { "epoch": 0.009013282732447819, "grad_norm": 10.866437911987305, "learning_rate": 2.992125984251969e-06, "loss": 0.4257, "step": 19 }, { "epoch": 0.009487666034155597, "grad_norm": 7.098900318145752, "learning_rate": 3.1496062992125985e-06, "loss": 0.324, "step": 20 }, { "epoch": 0.009962049335863378, "grad_norm": 7.2487616539001465, "learning_rate": 3.307086614173229e-06, "loss": 0.3815, "step": 21 }, { "epoch": 0.010436432637571158, "grad_norm": 6.616808891296387, "learning_rate": 3.4645669291338583e-06, "loss": 0.2939, "step": 22 }, { "epoch": 0.010910815939278937, "grad_norm": 6.747620582580566, "learning_rate": 3.6220472440944887e-06, "loss": 0.2914, "step": 23 }, { "epoch": 0.011385199240986717, "grad_norm": 5.690284729003906, "learning_rate": 3.7795275590551182e-06, "loss": 0.2879, "step": 24 }, { "epoch": 0.011859582542694497, "grad_norm": 6.056181907653809, "learning_rate": 3.937007874015748e-06, "loss": 0.2785, "step": 25 }, { "epoch": 0.012333965844402278, "grad_norm": 7.560486316680908, "learning_rate": 4.0944881889763785e-06, "loss": 0.3008, "step": 26 }, { "epoch": 0.012808349146110056, "grad_norm": 7.066635608673096, "learning_rate": 4.251968503937008e-06, "loss": 0.3298, "step": 27 }, { "epoch": 0.013282732447817837, "grad_norm": 5.725413799285889, "learning_rate": 4.4094488188976384e-06, "loss": 0.2901, "step": 28 }, { "epoch": 0.013757115749525617, "grad_norm": 7.520142555236816, "learning_rate": 4.566929133858268e-06, "loss": 0.3461, "step": 29 }, { "epoch": 0.014231499051233396, "grad_norm": 6.214694976806641, "learning_rate": 4.724409448818898e-06, "loss": 0.3216, "step": 30 }, { "epoch": 0.014705882352941176, "grad_norm": 6.9384870529174805, "learning_rate": 4.881889763779528e-06, "loss": 0.3779, "step": 31 }, { "epoch": 0.015180265654648957, "grad_norm": 6.863812446594238, "learning_rate": 5.039370078740158e-06, "loss": 0.3194, "step": 32 }, { "epoch": 0.015654648956356737, "grad_norm": 7.183291912078857, "learning_rate": 5.196850393700788e-06, "loss": 0.3293, "step": 33 }, { "epoch": 0.016129032258064516, "grad_norm": 6.995275497436523, "learning_rate": 5.354330708661418e-06, "loss": 0.4026, "step": 34 }, { "epoch": 0.016603415559772294, "grad_norm": 6.775660514831543, "learning_rate": 5.511811023622048e-06, "loss": 0.3289, "step": 35 }, { "epoch": 0.017077798861480076, "grad_norm": 6.413484573364258, "learning_rate": 5.669291338582677e-06, "loss": 0.3315, "step": 36 }, { "epoch": 0.017552182163187855, "grad_norm": 7.173760414123535, "learning_rate": 5.8267716535433075e-06, "loss": 0.3633, "step": 37 }, { "epoch": 0.018026565464895637, "grad_norm": 4.803355693817139, "learning_rate": 5.984251968503938e-06, "loss": 0.3607, "step": 38 }, { "epoch": 0.018500948766603416, "grad_norm": 5.451982498168945, "learning_rate": 6.141732283464567e-06, "loss": 0.3503, "step": 39 }, { "epoch": 0.018975332068311195, "grad_norm": 5.767213821411133, "learning_rate": 6.299212598425197e-06, "loss": 0.2794, "step": 40 }, { "epoch": 0.019449715370018977, "grad_norm": 6.687100410461426, "learning_rate": 6.456692913385827e-06, "loss": 0.3668, "step": 41 }, { "epoch": 0.019924098671726755, "grad_norm": 6.374785900115967, "learning_rate": 6.614173228346458e-06, "loss": 0.3366, "step": 42 }, { "epoch": 0.020398481973434534, "grad_norm": 6.196499347686768, "learning_rate": 6.771653543307087e-06, "loss": 0.3421, "step": 43 }, { "epoch": 0.020872865275142316, "grad_norm": 6.357893943786621, "learning_rate": 6.929133858267717e-06, "loss": 0.3663, "step": 44 }, { "epoch": 0.021347248576850095, "grad_norm": 5.325692653656006, "learning_rate": 7.086614173228347e-06, "loss": 0.3341, "step": 45 }, { "epoch": 0.021821631878557873, "grad_norm": 6.0895538330078125, "learning_rate": 7.2440944881889774e-06, "loss": 0.3348, "step": 46 }, { "epoch": 0.022296015180265655, "grad_norm": 5.129356384277344, "learning_rate": 7.401574803149607e-06, "loss": 0.294, "step": 47 }, { "epoch": 0.022770398481973434, "grad_norm": 5.093695163726807, "learning_rate": 7.5590551181102365e-06, "loss": 0.3223, "step": 48 }, { "epoch": 0.023244781783681213, "grad_norm": 5.516068935394287, "learning_rate": 7.716535433070867e-06, "loss": 0.3455, "step": 49 }, { "epoch": 0.023719165085388995, "grad_norm": 5.486409664154053, "learning_rate": 7.874015748031496e-06, "loss": 0.3349, "step": 50 }, { "epoch": 0.024193548387096774, "grad_norm": 5.390769004821777, "learning_rate": 8.031496062992128e-06, "loss": 0.3474, "step": 51 }, { "epoch": 0.024667931688804556, "grad_norm": 5.160117149353027, "learning_rate": 8.188976377952757e-06, "loss": 0.3048, "step": 52 }, { "epoch": 0.025142314990512334, "grad_norm": 7.194066047668457, "learning_rate": 8.346456692913387e-06, "loss": 0.4043, "step": 53 }, { "epoch": 0.025616698292220113, "grad_norm": 5.130771636962891, "learning_rate": 8.503937007874016e-06, "loss": 0.3362, "step": 54 }, { "epoch": 0.026091081593927895, "grad_norm": 5.493372440338135, "learning_rate": 8.661417322834647e-06, "loss": 0.3842, "step": 55 }, { "epoch": 0.026565464895635674, "grad_norm": 6.465633392333984, "learning_rate": 8.818897637795277e-06, "loss": 0.3974, "step": 56 }, { "epoch": 0.027039848197343452, "grad_norm": 7.080130100250244, "learning_rate": 8.976377952755906e-06, "loss": 0.3579, "step": 57 }, { "epoch": 0.027514231499051234, "grad_norm": 5.774561882019043, "learning_rate": 9.133858267716536e-06, "loss": 0.374, "step": 58 }, { "epoch": 0.027988614800759013, "grad_norm": 5.477121353149414, "learning_rate": 9.291338582677165e-06, "loss": 0.3664, "step": 59 }, { "epoch": 0.028462998102466792, "grad_norm": 5.975531101226807, "learning_rate": 9.448818897637797e-06, "loss": 0.4218, "step": 60 }, { "epoch": 0.028937381404174574, "grad_norm": 5.1018967628479, "learning_rate": 9.606299212598426e-06, "loss": 0.3879, "step": 61 }, { "epoch": 0.029411764705882353, "grad_norm": 5.107121467590332, "learning_rate": 9.763779527559056e-06, "loss": 0.3992, "step": 62 }, { "epoch": 0.02988614800759013, "grad_norm": 6.47650671005249, "learning_rate": 9.921259842519685e-06, "loss": 0.4705, "step": 63 }, { "epoch": 0.030360531309297913, "grad_norm": 6.037809371948242, "learning_rate": 1.0078740157480316e-05, "loss": 0.4264, "step": 64 }, { "epoch": 0.030834914611005692, "grad_norm": 5.349067211151123, "learning_rate": 1.0236220472440946e-05, "loss": 0.384, "step": 65 }, { "epoch": 0.031309297912713474, "grad_norm": 5.131612777709961, "learning_rate": 1.0393700787401575e-05, "loss": 0.3905, "step": 66 }, { "epoch": 0.03178368121442125, "grad_norm": 5.476340293884277, "learning_rate": 1.0551181102362205e-05, "loss": 0.427, "step": 67 }, { "epoch": 0.03225806451612903, "grad_norm": 6.4385480880737305, "learning_rate": 1.0708661417322836e-05, "loss": 0.5252, "step": 68 }, { "epoch": 0.03273244781783681, "grad_norm": 5.29050874710083, "learning_rate": 1.0866141732283466e-05, "loss": 0.453, "step": 69 }, { "epoch": 0.03320683111954459, "grad_norm": 4.675810813903809, "learning_rate": 1.1023622047244095e-05, "loss": 0.412, "step": 70 }, { "epoch": 0.033681214421252374, "grad_norm": 5.531217575073242, "learning_rate": 1.1181102362204725e-05, "loss": 0.4553, "step": 71 }, { "epoch": 0.03415559772296015, "grad_norm": 6.203500747680664, "learning_rate": 1.1338582677165354e-05, "loss": 0.4826, "step": 72 }, { "epoch": 0.03462998102466793, "grad_norm": 5.102776050567627, "learning_rate": 1.1496062992125985e-05, "loss": 0.3427, "step": 73 }, { "epoch": 0.03510436432637571, "grad_norm": 4.565568923950195, "learning_rate": 1.1653543307086615e-05, "loss": 0.3734, "step": 74 }, { "epoch": 0.03557874762808349, "grad_norm": 4.548391819000244, "learning_rate": 1.1811023622047245e-05, "loss": 0.3647, "step": 75 }, { "epoch": 0.036053130929791274, "grad_norm": 5.1189398765563965, "learning_rate": 1.1968503937007876e-05, "loss": 0.4211, "step": 76 }, { "epoch": 0.03652751423149905, "grad_norm": 6.630745887756348, "learning_rate": 1.2125984251968505e-05, "loss": 0.5191, "step": 77 }, { "epoch": 0.03700189753320683, "grad_norm": 5.194817066192627, "learning_rate": 1.2283464566929135e-05, "loss": 0.3779, "step": 78 }, { "epoch": 0.03747628083491461, "grad_norm": 5.23333740234375, "learning_rate": 1.2440944881889764e-05, "loss": 0.4235, "step": 79 }, { "epoch": 0.03795066413662239, "grad_norm": 4.812337398529053, "learning_rate": 1.2598425196850394e-05, "loss": 0.3958, "step": 80 }, { "epoch": 0.03842504743833017, "grad_norm": 4.991153240203857, "learning_rate": 1.2755905511811025e-05, "loss": 0.4543, "step": 81 }, { "epoch": 0.03889943074003795, "grad_norm": 5.6834025382995605, "learning_rate": 1.2913385826771655e-05, "loss": 0.4253, "step": 82 }, { "epoch": 0.03937381404174573, "grad_norm": 4.659487247467041, "learning_rate": 1.3070866141732284e-05, "loss": 0.3582, "step": 83 }, { "epoch": 0.03984819734345351, "grad_norm": 6.373027324676514, "learning_rate": 1.3228346456692915e-05, "loss": 0.4211, "step": 84 }, { "epoch": 0.04032258064516129, "grad_norm": 4.993396759033203, "learning_rate": 1.3385826771653545e-05, "loss": 0.417, "step": 85 }, { "epoch": 0.04079696394686907, "grad_norm": 5.25504207611084, "learning_rate": 1.3543307086614174e-05, "loss": 0.4558, "step": 86 }, { "epoch": 0.04127134724857685, "grad_norm": 5.829912185668945, "learning_rate": 1.3700787401574804e-05, "loss": 0.473, "step": 87 }, { "epoch": 0.04174573055028463, "grad_norm": 5.047908782958984, "learning_rate": 1.3858267716535433e-05, "loss": 0.4056, "step": 88 }, { "epoch": 0.04222011385199241, "grad_norm": 5.062896251678467, "learning_rate": 1.4015748031496063e-05, "loss": 0.4264, "step": 89 }, { "epoch": 0.04269449715370019, "grad_norm": 5.646873950958252, "learning_rate": 1.4173228346456694e-05, "loss": 0.4772, "step": 90 }, { "epoch": 0.04316888045540797, "grad_norm": 4.573570728302002, "learning_rate": 1.4330708661417324e-05, "loss": 0.4483, "step": 91 }, { "epoch": 0.04364326375711575, "grad_norm": 5.0274128913879395, "learning_rate": 1.4488188976377955e-05, "loss": 0.4248, "step": 92 }, { "epoch": 0.04411764705882353, "grad_norm": 5.646190643310547, "learning_rate": 1.4645669291338584e-05, "loss": 0.5235, "step": 93 }, { "epoch": 0.04459203036053131, "grad_norm": 5.01361083984375, "learning_rate": 1.4803149606299214e-05, "loss": 0.4591, "step": 94 }, { "epoch": 0.04506641366223909, "grad_norm": 5.54431676864624, "learning_rate": 1.4960629921259843e-05, "loss": 0.4361, "step": 95 }, { "epoch": 0.04554079696394687, "grad_norm": 4.712576389312744, "learning_rate": 1.5118110236220473e-05, "loss": 0.3923, "step": 96 }, { "epoch": 0.04601518026565465, "grad_norm": 4.848332405090332, "learning_rate": 1.5275590551181102e-05, "loss": 0.4677, "step": 97 }, { "epoch": 0.046489563567362426, "grad_norm": 5.39124870300293, "learning_rate": 1.5433070866141734e-05, "loss": 0.4506, "step": 98 }, { "epoch": 0.04696394686907021, "grad_norm": 5.186285972595215, "learning_rate": 1.559055118110236e-05, "loss": 0.4037, "step": 99 }, { "epoch": 0.04743833017077799, "grad_norm": 4.8791913986206055, "learning_rate": 1.5748031496062993e-05, "loss": 0.4467, "step": 100 }, { "epoch": 0.04791271347248577, "grad_norm": 5.618642330169678, "learning_rate": 1.5905511811023624e-05, "loss": 0.4959, "step": 101 }, { "epoch": 0.04838709677419355, "grad_norm": 4.833317279815674, "learning_rate": 1.6062992125984255e-05, "loss": 0.4752, "step": 102 }, { "epoch": 0.048861480075901326, "grad_norm": 4.536538600921631, "learning_rate": 1.6220472440944883e-05, "loss": 0.4186, "step": 103 }, { "epoch": 0.04933586337760911, "grad_norm": 5.135328769683838, "learning_rate": 1.6377952755905514e-05, "loss": 0.4471, "step": 104 }, { "epoch": 0.04981024667931689, "grad_norm": 5.110005855560303, "learning_rate": 1.6535433070866142e-05, "loss": 0.4585, "step": 105 }, { "epoch": 0.05028462998102467, "grad_norm": 5.040478229522705, "learning_rate": 1.6692913385826773e-05, "loss": 0.4656, "step": 106 }, { "epoch": 0.05075901328273245, "grad_norm": 4.914176940917969, "learning_rate": 1.68503937007874e-05, "loss": 0.4783, "step": 107 }, { "epoch": 0.051233396584440226, "grad_norm": 4.72114372253418, "learning_rate": 1.7007874015748032e-05, "loss": 0.4365, "step": 108 }, { "epoch": 0.051707779886148005, "grad_norm": 4.7739362716674805, "learning_rate": 1.7165354330708663e-05, "loss": 0.4766, "step": 109 }, { "epoch": 0.05218216318785579, "grad_norm": 4.787454128265381, "learning_rate": 1.7322834645669295e-05, "loss": 0.4542, "step": 110 }, { "epoch": 0.05265654648956357, "grad_norm": 5.132379055023193, "learning_rate": 1.7480314960629923e-05, "loss": 0.4978, "step": 111 }, { "epoch": 0.05313092979127135, "grad_norm": 6.812975883483887, "learning_rate": 1.7637795275590554e-05, "loss": 0.4559, "step": 112 }, { "epoch": 0.053605313092979126, "grad_norm": 5.09624719619751, "learning_rate": 1.779527559055118e-05, "loss": 0.5481, "step": 113 }, { "epoch": 0.054079696394686905, "grad_norm": 5.495835781097412, "learning_rate": 1.7952755905511813e-05, "loss": 0.5282, "step": 114 }, { "epoch": 0.05455407969639469, "grad_norm": 6.671237468719482, "learning_rate": 1.811023622047244e-05, "loss": 0.5136, "step": 115 }, { "epoch": 0.05502846299810247, "grad_norm": 7.128397464752197, "learning_rate": 1.8267716535433072e-05, "loss": 0.5009, "step": 116 }, { "epoch": 0.05550284629981025, "grad_norm": 4.9932026863098145, "learning_rate": 1.8425196850393703e-05, "loss": 0.4473, "step": 117 }, { "epoch": 0.055977229601518026, "grad_norm": 5.382380485534668, "learning_rate": 1.858267716535433e-05, "loss": 0.5041, "step": 118 }, { "epoch": 0.056451612903225805, "grad_norm": 4.284328937530518, "learning_rate": 1.8740157480314962e-05, "loss": 0.4731, "step": 119 }, { "epoch": 0.056925996204933584, "grad_norm": 6.466545104980469, "learning_rate": 1.8897637795275593e-05, "loss": 0.5566, "step": 120 }, { "epoch": 0.05740037950664137, "grad_norm": 4.7638959884643555, "learning_rate": 1.905511811023622e-05, "loss": 0.5096, "step": 121 }, { "epoch": 0.05787476280834915, "grad_norm": 5.770228862762451, "learning_rate": 1.9212598425196852e-05, "loss": 0.5628, "step": 122 }, { "epoch": 0.058349146110056926, "grad_norm": 5.023980140686035, "learning_rate": 1.937007874015748e-05, "loss": 0.4875, "step": 123 }, { "epoch": 0.058823529411764705, "grad_norm": 5.017863750457764, "learning_rate": 1.952755905511811e-05, "loss": 0.5002, "step": 124 }, { "epoch": 0.059297912713472484, "grad_norm": 5.194025039672852, "learning_rate": 1.9685039370078743e-05, "loss": 0.4668, "step": 125 }, { "epoch": 0.05977229601518026, "grad_norm": 5.176628589630127, "learning_rate": 1.984251968503937e-05, "loss": 0.4949, "step": 126 }, { "epoch": 0.06024667931688805, "grad_norm": 5.499045372009277, "learning_rate": 2e-05, "loss": 0.502, "step": 127 }, { "epoch": 0.06072106261859583, "grad_norm": 4.964819431304932, "learning_rate": 1.999999704854948e-05, "loss": 0.5428, "step": 128 }, { "epoch": 0.061195445920303605, "grad_norm": 4.875455379486084, "learning_rate": 1.9999988194199653e-05, "loss": 0.473, "step": 129 }, { "epoch": 0.061669829222011384, "grad_norm": 4.419773578643799, "learning_rate": 1.999997343695575e-05, "loss": 0.4555, "step": 130 }, { "epoch": 0.06214421252371916, "grad_norm": 5.253462314605713, "learning_rate": 1.999995277682648e-05, "loss": 0.563, "step": 131 }, { "epoch": 0.06261859582542695, "grad_norm": 4.706827640533447, "learning_rate": 1.999992621382404e-05, "loss": 0.5296, "step": 132 }, { "epoch": 0.06309297912713473, "grad_norm": 4.565430641174316, "learning_rate": 1.9999893747964108e-05, "loss": 0.4412, "step": 133 }, { "epoch": 0.0635673624288425, "grad_norm": 4.702053070068359, "learning_rate": 1.9999855379265855e-05, "loss": 0.5513, "step": 134 }, { "epoch": 0.06404174573055028, "grad_norm": 4.838067531585693, "learning_rate": 1.999981110775192e-05, "loss": 0.5562, "step": 135 }, { "epoch": 0.06451612903225806, "grad_norm": 4.499827861785889, "learning_rate": 1.9999760933448443e-05, "loss": 0.549, "step": 136 }, { "epoch": 0.06499051233396584, "grad_norm": 4.42749547958374, "learning_rate": 1.9999704856385037e-05, "loss": 0.5549, "step": 137 }, { "epoch": 0.06546489563567362, "grad_norm": 4.693006992340088, "learning_rate": 1.9999642876594806e-05, "loss": 0.4435, "step": 138 }, { "epoch": 0.0659392789373814, "grad_norm": 4.892571449279785, "learning_rate": 1.9999574994114336e-05, "loss": 0.5113, "step": 139 }, { "epoch": 0.06641366223908918, "grad_norm": 4.446290493011475, "learning_rate": 1.9999501208983692e-05, "loss": 0.5456, "step": 140 }, { "epoch": 0.06688804554079697, "grad_norm": 5.799952507019043, "learning_rate": 1.999942152124644e-05, "loss": 0.517, "step": 141 }, { "epoch": 0.06736242884250475, "grad_norm": 4.448901653289795, "learning_rate": 1.9999335930949612e-05, "loss": 0.4831, "step": 142 }, { "epoch": 0.06783681214421253, "grad_norm": 5.490376949310303, "learning_rate": 1.999924443814373e-05, "loss": 0.5247, "step": 143 }, { "epoch": 0.0683111954459203, "grad_norm": 4.813867568969727, "learning_rate": 1.9999147042882803e-05, "loss": 0.4864, "step": 144 }, { "epoch": 0.06878557874762808, "grad_norm": 4.6280293464660645, "learning_rate": 1.9999043745224324e-05, "loss": 0.5794, "step": 145 }, { "epoch": 0.06925996204933586, "grad_norm": 4.530418872833252, "learning_rate": 1.9998934545229266e-05, "loss": 0.5069, "step": 146 }, { "epoch": 0.06973434535104364, "grad_norm": 5.701735019683838, "learning_rate": 1.9998819442962088e-05, "loss": 0.657, "step": 147 }, { "epoch": 0.07020872865275142, "grad_norm": 5.44976282119751, "learning_rate": 1.999869843849074e-05, "loss": 0.5559, "step": 148 }, { "epoch": 0.0706831119544592, "grad_norm": 4.931896686553955, "learning_rate": 1.999857153188664e-05, "loss": 0.527, "step": 149 }, { "epoch": 0.07115749525616698, "grad_norm": 4.865047931671143, "learning_rate": 1.999843872322471e-05, "loss": 0.5177, "step": 150 }, { "epoch": 0.07163187855787476, "grad_norm": 4.696810245513916, "learning_rate": 1.9998300012583333e-05, "loss": 0.5715, "step": 151 }, { "epoch": 0.07210626185958255, "grad_norm": 4.350542068481445, "learning_rate": 1.99981554000444e-05, "loss": 0.4627, "step": 152 }, { "epoch": 0.07258064516129033, "grad_norm": 5.0633111000061035, "learning_rate": 1.999800488569327e-05, "loss": 0.5614, "step": 153 }, { "epoch": 0.0730550284629981, "grad_norm": 4.741815090179443, "learning_rate": 1.999784846961879e-05, "loss": 0.5093, "step": 154 }, { "epoch": 0.07352941176470588, "grad_norm": 6.608360290527344, "learning_rate": 1.9997686151913297e-05, "loss": 0.497, "step": 155 }, { "epoch": 0.07400379506641366, "grad_norm": 5.2583441734313965, "learning_rate": 1.9997517932672592e-05, "loss": 0.5925, "step": 156 }, { "epoch": 0.07447817836812144, "grad_norm": 4.322223663330078, "learning_rate": 1.9997343811995985e-05, "loss": 0.5369, "step": 157 }, { "epoch": 0.07495256166982922, "grad_norm": 4.527962684631348, "learning_rate": 1.9997163789986255e-05, "loss": 0.5313, "step": 158 }, { "epoch": 0.075426944971537, "grad_norm": 4.953525066375732, "learning_rate": 1.999697786674966e-05, "loss": 0.5229, "step": 159 }, { "epoch": 0.07590132827324478, "grad_norm": 5.049784183502197, "learning_rate": 1.999678604239596e-05, "loss": 0.4546, "step": 160 }, { "epoch": 0.07637571157495256, "grad_norm": 5.497480392456055, "learning_rate": 1.9996588317038382e-05, "loss": 0.5097, "step": 161 }, { "epoch": 0.07685009487666034, "grad_norm": 4.220514297485352, "learning_rate": 1.9996384690793634e-05, "loss": 0.471, "step": 162 }, { "epoch": 0.07732447817836813, "grad_norm": 4.465171813964844, "learning_rate": 1.999617516378193e-05, "loss": 0.4938, "step": 163 }, { "epoch": 0.0777988614800759, "grad_norm": 5.333194255828857, "learning_rate": 1.999595973612694e-05, "loss": 0.5676, "step": 164 }, { "epoch": 0.07827324478178369, "grad_norm": 4.7594780921936035, "learning_rate": 1.999573840795583e-05, "loss": 0.5586, "step": 165 }, { "epoch": 0.07874762808349146, "grad_norm": 4.380119800567627, "learning_rate": 1.9995511179399253e-05, "loss": 0.5206, "step": 166 }, { "epoch": 0.07922201138519924, "grad_norm": 5.4240922927856445, "learning_rate": 1.9995278050591334e-05, "loss": 0.6165, "step": 167 }, { "epoch": 0.07969639468690702, "grad_norm": 5.2482075691223145, "learning_rate": 1.9995039021669692e-05, "loss": 0.5472, "step": 168 }, { "epoch": 0.0801707779886148, "grad_norm": 4.465966701507568, "learning_rate": 1.9994794092775418e-05, "loss": 0.5151, "step": 169 }, { "epoch": 0.08064516129032258, "grad_norm": 4.838151454925537, "learning_rate": 1.9994543264053093e-05, "loss": 0.5031, "step": 170 }, { "epoch": 0.08111954459203036, "grad_norm": 4.365642070770264, "learning_rate": 1.9994286535650782e-05, "loss": 0.4621, "step": 171 }, { "epoch": 0.08159392789373814, "grad_norm": 5.1148152351379395, "learning_rate": 1.9994023907720027e-05, "loss": 0.5432, "step": 172 }, { "epoch": 0.08206831119544591, "grad_norm": 4.51958703994751, "learning_rate": 1.9993755380415854e-05, "loss": 0.526, "step": 173 }, { "epoch": 0.0825426944971537, "grad_norm": 3.9636707305908203, "learning_rate": 1.999348095389677e-05, "loss": 0.4614, "step": 174 }, { "epoch": 0.08301707779886149, "grad_norm": 5.010396480560303, "learning_rate": 1.999320062832477e-05, "loss": 0.5862, "step": 175 }, { "epoch": 0.08349146110056926, "grad_norm": 4.869819164276123, "learning_rate": 1.9992914403865327e-05, "loss": 0.5009, "step": 176 }, { "epoch": 0.08396584440227704, "grad_norm": 5.5586838722229, "learning_rate": 1.9992622280687395e-05, "loss": 0.5472, "step": 177 }, { "epoch": 0.08444022770398482, "grad_norm": 4.700314998626709, "learning_rate": 1.9992324258963414e-05, "loss": 0.5833, "step": 178 }, { "epoch": 0.0849146110056926, "grad_norm": 5.759016990661621, "learning_rate": 1.99920203388693e-05, "loss": 0.6045, "step": 179 }, { "epoch": 0.08538899430740038, "grad_norm": 5.080048084259033, "learning_rate": 1.999171052058445e-05, "loss": 0.5256, "step": 180 }, { "epoch": 0.08586337760910816, "grad_norm": 4.277463912963867, "learning_rate": 1.999139480429176e-05, "loss": 0.5763, "step": 181 }, { "epoch": 0.08633776091081594, "grad_norm": 5.613553524017334, "learning_rate": 1.999107319017758e-05, "loss": 0.5423, "step": 182 }, { "epoch": 0.08681214421252371, "grad_norm": 4.811156272888184, "learning_rate": 1.9990745678431765e-05, "loss": 0.4214, "step": 183 }, { "epoch": 0.0872865275142315, "grad_norm": 4.72564697265625, "learning_rate": 1.9990412269247637e-05, "loss": 0.466, "step": 184 }, { "epoch": 0.08776091081593927, "grad_norm": 7.244020938873291, "learning_rate": 1.999007296282201e-05, "loss": 0.5073, "step": 185 }, { "epoch": 0.08823529411764706, "grad_norm": 4.737941741943359, "learning_rate": 1.9989727759355164e-05, "loss": 0.5523, "step": 186 }, { "epoch": 0.08870967741935484, "grad_norm": 4.7859086990356445, "learning_rate": 1.9989376659050878e-05, "loss": 0.5216, "step": 187 }, { "epoch": 0.08918406072106262, "grad_norm": 4.781891345977783, "learning_rate": 1.9989019662116395e-05, "loss": 0.5328, "step": 188 }, { "epoch": 0.0896584440227704, "grad_norm": 4.595061779022217, "learning_rate": 1.9988656768762455e-05, "loss": 0.5584, "step": 189 }, { "epoch": 0.09013282732447818, "grad_norm": 4.338557720184326, "learning_rate": 1.9988287979203264e-05, "loss": 0.4839, "step": 190 }, { "epoch": 0.09060721062618596, "grad_norm": 4.124628067016602, "learning_rate": 1.9987913293656515e-05, "loss": 0.5612, "step": 191 }, { "epoch": 0.09108159392789374, "grad_norm": 3.821711778640747, "learning_rate": 1.9987532712343388e-05, "loss": 0.4442, "step": 192 }, { "epoch": 0.09155597722960152, "grad_norm": 3.7175636291503906, "learning_rate": 1.9987146235488532e-05, "loss": 0.5021, "step": 193 }, { "epoch": 0.0920303605313093, "grad_norm": 4.476022720336914, "learning_rate": 1.9986753863320077e-05, "loss": 0.524, "step": 194 }, { "epoch": 0.09250474383301707, "grad_norm": 4.444040298461914, "learning_rate": 1.998635559606964e-05, "loss": 0.4565, "step": 195 }, { "epoch": 0.09297912713472485, "grad_norm": 4.800974369049072, "learning_rate": 1.9985951433972313e-05, "loss": 0.5845, "step": 196 }, { "epoch": 0.09345351043643264, "grad_norm": 4.884645462036133, "learning_rate": 1.9985541377266675e-05, "loss": 0.5319, "step": 197 }, { "epoch": 0.09392789373814042, "grad_norm": 4.790307521820068, "learning_rate": 1.998512542619477e-05, "loss": 0.505, "step": 198 }, { "epoch": 0.0944022770398482, "grad_norm": 4.581803321838379, "learning_rate": 1.998470358100213e-05, "loss": 0.5228, "step": 199 }, { "epoch": 0.09487666034155598, "grad_norm": 4.337541580200195, "learning_rate": 1.9984275841937776e-05, "loss": 0.5135, "step": 200 }, { "epoch": 0.09535104364326376, "grad_norm": 4.175999641418457, "learning_rate": 1.9983842209254182e-05, "loss": 0.5219, "step": 201 }, { "epoch": 0.09582542694497154, "grad_norm": 3.8409905433654785, "learning_rate": 1.9983402683207334e-05, "loss": 0.4495, "step": 202 }, { "epoch": 0.09629981024667932, "grad_norm": 3.7136104106903076, "learning_rate": 1.9982957264056667e-05, "loss": 0.5357, "step": 203 }, { "epoch": 0.0967741935483871, "grad_norm": 4.796589374542236, "learning_rate": 1.9982505952065115e-05, "loss": 0.5567, "step": 204 }, { "epoch": 0.09724857685009487, "grad_norm": 5.0165886878967285, "learning_rate": 1.9982048747499082e-05, "loss": 0.5025, "step": 205 }, { "epoch": 0.09772296015180265, "grad_norm": 4.150423049926758, "learning_rate": 1.9981585650628447e-05, "loss": 0.5405, "step": 206 }, { "epoch": 0.09819734345351043, "grad_norm": 3.635175943374634, "learning_rate": 1.9981116661726575e-05, "loss": 0.4784, "step": 207 }, { "epoch": 0.09867172675521822, "grad_norm": 4.339625358581543, "learning_rate": 1.998064178107031e-05, "loss": 0.5831, "step": 208 }, { "epoch": 0.099146110056926, "grad_norm": 4.854864120483398, "learning_rate": 1.9980161008939957e-05, "loss": 0.6303, "step": 209 }, { "epoch": 0.09962049335863378, "grad_norm": 4.287778377532959, "learning_rate": 1.9979674345619322e-05, "loss": 0.467, "step": 210 }, { "epoch": 0.10009487666034156, "grad_norm": 3.7619311809539795, "learning_rate": 1.997918179139567e-05, "loss": 0.5403, "step": 211 }, { "epoch": 0.10056925996204934, "grad_norm": 3.486117124557495, "learning_rate": 1.9978683346559762e-05, "loss": 0.4819, "step": 212 }, { "epoch": 0.10104364326375712, "grad_norm": 4.301611423492432, "learning_rate": 1.9978179011405814e-05, "loss": 0.5293, "step": 213 }, { "epoch": 0.1015180265654649, "grad_norm": 4.88173770904541, "learning_rate": 1.9977668786231536e-05, "loss": 0.5506, "step": 214 }, { "epoch": 0.10199240986717267, "grad_norm": 4.954184532165527, "learning_rate": 1.99771526713381e-05, "loss": 0.5209, "step": 215 }, { "epoch": 0.10246679316888045, "grad_norm": 4.852926254272461, "learning_rate": 1.9976630667030175e-05, "loss": 0.5922, "step": 216 }, { "epoch": 0.10294117647058823, "grad_norm": 5.19386100769043, "learning_rate": 1.9976102773615894e-05, "loss": 0.5261, "step": 217 }, { "epoch": 0.10341555977229601, "grad_norm": 4.068961143493652, "learning_rate": 1.997556899140686e-05, "loss": 0.5104, "step": 218 }, { "epoch": 0.1038899430740038, "grad_norm": 3.790403366088867, "learning_rate": 1.997502932071816e-05, "loss": 0.4915, "step": 219 }, { "epoch": 0.10436432637571158, "grad_norm": 5.95817756652832, "learning_rate": 1.997448376186836e-05, "loss": 0.4801, "step": 220 }, { "epoch": 0.10483870967741936, "grad_norm": 4.333256244659424, "learning_rate": 1.9973932315179502e-05, "loss": 0.5602, "step": 221 }, { "epoch": 0.10531309297912714, "grad_norm": 3.932823419570923, "learning_rate": 1.997337498097709e-05, "loss": 0.5351, "step": 222 }, { "epoch": 0.10578747628083492, "grad_norm": 4.296048164367676, "learning_rate": 1.9972811759590117e-05, "loss": 0.5502, "step": 223 }, { "epoch": 0.1062618595825427, "grad_norm": 4.166548252105713, "learning_rate": 1.997224265135105e-05, "loss": 0.5277, "step": 224 }, { "epoch": 0.10673624288425047, "grad_norm": 5.517498016357422, "learning_rate": 1.9971667656595824e-05, "loss": 0.5458, "step": 225 }, { "epoch": 0.10721062618595825, "grad_norm": 4.525831699371338, "learning_rate": 1.9971086775663856e-05, "loss": 0.5935, "step": 226 }, { "epoch": 0.10768500948766603, "grad_norm": 4.5020012855529785, "learning_rate": 1.997050000889803e-05, "loss": 0.4852, "step": 227 }, { "epoch": 0.10815939278937381, "grad_norm": 4.161679744720459, "learning_rate": 1.9969907356644716e-05, "loss": 0.5776, "step": 228 }, { "epoch": 0.10863377609108159, "grad_norm": 5.031361103057861, "learning_rate": 1.996930881925374e-05, "loss": 0.5912, "step": 229 }, { "epoch": 0.10910815939278938, "grad_norm": 4.38935661315918, "learning_rate": 1.9968704397078422e-05, "loss": 0.5005, "step": 230 }, { "epoch": 0.10958254269449716, "grad_norm": 4.4222412109375, "learning_rate": 1.996809409047554e-05, "loss": 0.5669, "step": 231 }, { "epoch": 0.11005692599620494, "grad_norm": 4.336076736450195, "learning_rate": 1.996747789980536e-05, "loss": 0.5975, "step": 232 }, { "epoch": 0.11053130929791272, "grad_norm": 4.671880722045898, "learning_rate": 1.9966855825431605e-05, "loss": 0.6065, "step": 233 }, { "epoch": 0.1110056925996205, "grad_norm": 3.9240012168884277, "learning_rate": 1.996622786772148e-05, "loss": 0.474, "step": 234 }, { "epoch": 0.11148007590132827, "grad_norm": 4.048487663269043, "learning_rate": 1.9965594027045668e-05, "loss": 0.5361, "step": 235 }, { "epoch": 0.11195445920303605, "grad_norm": 3.9906532764434814, "learning_rate": 1.996495430377831e-05, "loss": 0.6107, "step": 236 }, { "epoch": 0.11242884250474383, "grad_norm": 4.5680084228515625, "learning_rate": 1.996430869829704e-05, "loss": 0.4812, "step": 237 }, { "epoch": 0.11290322580645161, "grad_norm": 4.888277530670166, "learning_rate": 1.9963657210982947e-05, "loss": 0.479, "step": 238 }, { "epoch": 0.11337760910815939, "grad_norm": 3.874725103378296, "learning_rate": 1.9962999842220596e-05, "loss": 0.5418, "step": 239 }, { "epoch": 0.11385199240986717, "grad_norm": 4.379036903381348, "learning_rate": 1.9962336592398027e-05, "loss": 0.4977, "step": 240 }, { "epoch": 0.11432637571157495, "grad_norm": 4.229637145996094, "learning_rate": 1.9961667461906743e-05, "loss": 0.5353, "step": 241 }, { "epoch": 0.11480075901328274, "grad_norm": 4.034006118774414, "learning_rate": 1.9960992451141737e-05, "loss": 0.5121, "step": 242 }, { "epoch": 0.11527514231499052, "grad_norm": 4.3000288009643555, "learning_rate": 1.9960311560501457e-05, "loss": 0.6312, "step": 243 }, { "epoch": 0.1157495256166983, "grad_norm": 4.055355072021484, "learning_rate": 1.995962479038782e-05, "loss": 0.5626, "step": 244 }, { "epoch": 0.11622390891840607, "grad_norm": 4.0171990394592285, "learning_rate": 1.9958932141206224e-05, "loss": 0.4723, "step": 245 }, { "epoch": 0.11669829222011385, "grad_norm": 4.504859924316406, "learning_rate": 1.9958233613365534e-05, "loss": 0.5482, "step": 246 }, { "epoch": 0.11717267552182163, "grad_norm": 4.141243934631348, "learning_rate": 1.9957529207278082e-05, "loss": 0.4875, "step": 247 }, { "epoch": 0.11764705882352941, "grad_norm": 4.3180389404296875, "learning_rate": 1.9956818923359677e-05, "loss": 0.5515, "step": 248 }, { "epoch": 0.11812144212523719, "grad_norm": 3.9524307250976562, "learning_rate": 1.995610276202958e-05, "loss": 0.5466, "step": 249 }, { "epoch": 0.11859582542694497, "grad_norm": 3.8721537590026855, "learning_rate": 1.995538072371055e-05, "loss": 0.6034, "step": 250 }, { "epoch": 0.11907020872865275, "grad_norm": 4.435497283935547, "learning_rate": 1.9954652808828793e-05, "loss": 0.5693, "step": 251 }, { "epoch": 0.11954459203036052, "grad_norm": 3.3445560932159424, "learning_rate": 1.9953919017813985e-05, "loss": 0.3989, "step": 252 }, { "epoch": 0.12001897533206832, "grad_norm": 3.7608346939086914, "learning_rate": 1.9953179351099276e-05, "loss": 0.5034, "step": 253 }, { "epoch": 0.1204933586337761, "grad_norm": 4.122411727905273, "learning_rate": 1.995243380912129e-05, "loss": 0.5092, "step": 254 }, { "epoch": 0.12096774193548387, "grad_norm": 4.1877007484436035, "learning_rate": 1.995168239232011e-05, "loss": 0.5847, "step": 255 }, { "epoch": 0.12144212523719165, "grad_norm": 4.213406562805176, "learning_rate": 1.9950925101139292e-05, "loss": 0.4321, "step": 256 }, { "epoch": 0.12191650853889943, "grad_norm": 3.9307830333709717, "learning_rate": 1.995016193602585e-05, "loss": 0.5166, "step": 257 }, { "epoch": 0.12239089184060721, "grad_norm": 4.194809913635254, "learning_rate": 1.9949392897430283e-05, "loss": 0.5193, "step": 258 }, { "epoch": 0.12286527514231499, "grad_norm": 5.011254787445068, "learning_rate": 1.994861798580654e-05, "loss": 0.6044, "step": 259 }, { "epoch": 0.12333965844402277, "grad_norm": 3.800550937652588, "learning_rate": 1.9947837201612046e-05, "loss": 0.5104, "step": 260 }, { "epoch": 0.12381404174573055, "grad_norm": 4.209667682647705, "learning_rate": 1.9947050545307693e-05, "loss": 0.5558, "step": 261 }, { "epoch": 0.12428842504743833, "grad_norm": 4.35175085067749, "learning_rate": 1.994625801735783e-05, "loss": 0.4245, "step": 262 }, { "epoch": 0.1247628083491461, "grad_norm": 4.295873165130615, "learning_rate": 1.9945459618230282e-05, "loss": 0.5044, "step": 263 }, { "epoch": 0.1252371916508539, "grad_norm": 4.467810153961182, "learning_rate": 1.9944655348396336e-05, "loss": 0.5308, "step": 264 }, { "epoch": 0.12571157495256166, "grad_norm": 3.797988176345825, "learning_rate": 1.9943845208330742e-05, "loss": 0.4543, "step": 265 }, { "epoch": 0.12618595825426945, "grad_norm": 3.5803213119506836, "learning_rate": 1.9943029198511724e-05, "loss": 0.4598, "step": 266 }, { "epoch": 0.12666034155597722, "grad_norm": 3.6839261054992676, "learning_rate": 1.9942207319420962e-05, "loss": 0.5384, "step": 267 }, { "epoch": 0.127134724857685, "grad_norm": 3.6129837036132812, "learning_rate": 1.9941379571543597e-05, "loss": 0.5078, "step": 268 }, { "epoch": 0.12760910815939278, "grad_norm": 4.039849281311035, "learning_rate": 1.9940545955368247e-05, "loss": 0.6595, "step": 269 }, { "epoch": 0.12808349146110057, "grad_norm": 4.292903423309326, "learning_rate": 1.993970647138699e-05, "loss": 0.6002, "step": 270 }, { "epoch": 0.12855787476280836, "grad_norm": 3.4465274810791016, "learning_rate": 1.9938861120095353e-05, "loss": 0.4465, "step": 271 }, { "epoch": 0.12903225806451613, "grad_norm": 5.027604579925537, "learning_rate": 1.993800990199235e-05, "loss": 0.5447, "step": 272 }, { "epoch": 0.12950664136622392, "grad_norm": 4.433981418609619, "learning_rate": 1.993715281758044e-05, "loss": 0.5327, "step": 273 }, { "epoch": 0.12998102466793168, "grad_norm": 4.061509609222412, "learning_rate": 1.9936289867365557e-05, "loss": 0.4561, "step": 274 }, { "epoch": 0.13045540796963948, "grad_norm": 4.023436546325684, "learning_rate": 1.9935421051857088e-05, "loss": 0.5113, "step": 275 }, { "epoch": 0.13092979127134724, "grad_norm": 4.111091613769531, "learning_rate": 1.9934546371567888e-05, "loss": 0.5168, "step": 276 }, { "epoch": 0.13140417457305503, "grad_norm": 4.130946636199951, "learning_rate": 1.9933665827014272e-05, "loss": 0.5131, "step": 277 }, { "epoch": 0.1318785578747628, "grad_norm": 4.0225348472595215, "learning_rate": 1.9932779418716012e-05, "loss": 0.4993, "step": 278 }, { "epoch": 0.1323529411764706, "grad_norm": 4.16855525970459, "learning_rate": 1.9931887147196355e-05, "loss": 0.4827, "step": 279 }, { "epoch": 0.13282732447817835, "grad_norm": 4.637106418609619, "learning_rate": 1.9930989012981992e-05, "loss": 0.4109, "step": 280 }, { "epoch": 0.13330170777988615, "grad_norm": 3.4839680194854736, "learning_rate": 1.993008501660309e-05, "loss": 0.4779, "step": 281 }, { "epoch": 0.13377609108159394, "grad_norm": 4.075927734375, "learning_rate": 1.9929175158593262e-05, "loss": 0.531, "step": 282 }, { "epoch": 0.1342504743833017, "grad_norm": 4.005972385406494, "learning_rate": 1.992825943948959e-05, "loss": 0.5608, "step": 283 }, { "epoch": 0.1347248576850095, "grad_norm": 3.9188387393951416, "learning_rate": 1.9927337859832617e-05, "loss": 0.5646, "step": 284 }, { "epoch": 0.13519924098671726, "grad_norm": 3.8422210216522217, "learning_rate": 1.9926410420166343e-05, "loss": 0.4807, "step": 285 }, { "epoch": 0.13567362428842505, "grad_norm": 4.120503902435303, "learning_rate": 1.9925477121038218e-05, "loss": 0.6104, "step": 286 }, { "epoch": 0.13614800759013282, "grad_norm": 4.473390579223633, "learning_rate": 1.992453796299917e-05, "loss": 0.5511, "step": 287 }, { "epoch": 0.1366223908918406, "grad_norm": 4.302347183227539, "learning_rate": 1.9923592946603573e-05, "loss": 0.4618, "step": 288 }, { "epoch": 0.13709677419354838, "grad_norm": 7.132138729095459, "learning_rate": 1.992264207240925e-05, "loss": 0.5165, "step": 289 }, { "epoch": 0.13757115749525617, "grad_norm": 4.486799716949463, "learning_rate": 1.9921685340977506e-05, "loss": 0.459, "step": 290 }, { "epoch": 0.13804554079696393, "grad_norm": 4.0745038986206055, "learning_rate": 1.992072275287308e-05, "loss": 0.4965, "step": 291 }, { "epoch": 0.13851992409867173, "grad_norm": 4.047938346862793, "learning_rate": 1.991975430866419e-05, "loss": 0.5792, "step": 292 }, { "epoch": 0.13899430740037952, "grad_norm": 4.198129177093506, "learning_rate": 1.9918780008922484e-05, "loss": 0.5546, "step": 293 }, { "epoch": 0.13946869070208728, "grad_norm": 4.39850378036499, "learning_rate": 1.9917799854223093e-05, "loss": 0.4974, "step": 294 }, { "epoch": 0.13994307400379508, "grad_norm": 5.012327194213867, "learning_rate": 1.9916813845144587e-05, "loss": 0.5658, "step": 295 }, { "epoch": 0.14041745730550284, "grad_norm": 4.509042263031006, "learning_rate": 1.9915821982269002e-05, "loss": 0.5774, "step": 296 }, { "epoch": 0.14089184060721063, "grad_norm": 4.060871124267578, "learning_rate": 1.9914824266181818e-05, "loss": 0.4573, "step": 297 }, { "epoch": 0.1413662239089184, "grad_norm": 3.875030040740967, "learning_rate": 1.9913820697471988e-05, "loss": 0.4239, "step": 298 }, { "epoch": 0.1418406072106262, "grad_norm": 4.114848613739014, "learning_rate": 1.9912811276731895e-05, "loss": 0.5666, "step": 299 }, { "epoch": 0.14231499051233396, "grad_norm": 3.5044398307800293, "learning_rate": 1.9911796004557397e-05, "loss": 0.5189, "step": 300 }, { "epoch": 0.14278937381404175, "grad_norm": 3.8690919876098633, "learning_rate": 1.9910774881547803e-05, "loss": 0.5543, "step": 301 }, { "epoch": 0.1432637571157495, "grad_norm": 3.6126976013183594, "learning_rate": 1.9909747908305866e-05, "loss": 0.4635, "step": 302 }, { "epoch": 0.1437381404174573, "grad_norm": 3.76629900932312, "learning_rate": 1.99087150854378e-05, "loss": 0.5394, "step": 303 }, { "epoch": 0.1442125237191651, "grad_norm": 4.235199451446533, "learning_rate": 1.990767641355327e-05, "loss": 0.6441, "step": 304 }, { "epoch": 0.14468690702087286, "grad_norm": 3.634746551513672, "learning_rate": 1.9906631893265393e-05, "loss": 0.5005, "step": 305 }, { "epoch": 0.14516129032258066, "grad_norm": 3.812934160232544, "learning_rate": 1.990558152519074e-05, "loss": 0.4656, "step": 306 }, { "epoch": 0.14563567362428842, "grad_norm": 3.4849369525909424, "learning_rate": 1.9904525309949332e-05, "loss": 0.4462, "step": 307 }, { "epoch": 0.1461100569259962, "grad_norm": 4.69333553314209, "learning_rate": 1.9903463248164643e-05, "loss": 0.5539, "step": 308 }, { "epoch": 0.14658444022770398, "grad_norm": 4.567856788635254, "learning_rate": 1.99023953404636e-05, "loss": 0.5745, "step": 309 }, { "epoch": 0.14705882352941177, "grad_norm": 3.9516940116882324, "learning_rate": 1.9901321587476573e-05, "loss": 0.5849, "step": 310 }, { "epoch": 0.14753320683111953, "grad_norm": 4.662928581237793, "learning_rate": 1.9900241989837395e-05, "loss": 0.57, "step": 311 }, { "epoch": 0.14800759013282733, "grad_norm": 3.7832348346710205, "learning_rate": 1.9899156548183332e-05, "loss": 0.5359, "step": 312 }, { "epoch": 0.1484819734345351, "grad_norm": 3.7341370582580566, "learning_rate": 1.9898065263155118e-05, "loss": 0.5341, "step": 313 }, { "epoch": 0.14895635673624288, "grad_norm": 4.346981048583984, "learning_rate": 1.9896968135396924e-05, "loss": 0.5166, "step": 314 }, { "epoch": 0.14943074003795068, "grad_norm": 4.063964366912842, "learning_rate": 1.9895865165556375e-05, "loss": 0.5726, "step": 315 }, { "epoch": 0.14990512333965844, "grad_norm": 3.6817450523376465, "learning_rate": 1.989475635428454e-05, "loss": 0.4844, "step": 316 }, { "epoch": 0.15037950664136623, "grad_norm": 3.254460096359253, "learning_rate": 1.9893641702235946e-05, "loss": 0.4731, "step": 317 }, { "epoch": 0.150853889943074, "grad_norm": 4.255951404571533, "learning_rate": 1.9892521210068552e-05, "loss": 0.5561, "step": 318 }, { "epoch": 0.1513282732447818, "grad_norm": 4.263412952423096, "learning_rate": 1.9891394878443783e-05, "loss": 0.5309, "step": 319 }, { "epoch": 0.15180265654648956, "grad_norm": 3.499624252319336, "learning_rate": 1.9890262708026497e-05, "loss": 0.5453, "step": 320 }, { "epoch": 0.15227703984819735, "grad_norm": 4.114099502563477, "learning_rate": 1.9889124699485e-05, "loss": 0.55, "step": 321 }, { "epoch": 0.1527514231499051, "grad_norm": 3.8856418132781982, "learning_rate": 1.988798085349105e-05, "loss": 0.4584, "step": 322 }, { "epoch": 0.1532258064516129, "grad_norm": 3.403284788131714, "learning_rate": 1.9886831170719852e-05, "loss": 0.4941, "step": 323 }, { "epoch": 0.15370018975332067, "grad_norm": 3.322474479675293, "learning_rate": 1.9885675651850044e-05, "loss": 0.4564, "step": 324 }, { "epoch": 0.15417457305502846, "grad_norm": 4.788414001464844, "learning_rate": 1.9884514297563722e-05, "loss": 0.6303, "step": 325 }, { "epoch": 0.15464895635673626, "grad_norm": 3.2986533641815186, "learning_rate": 1.9883347108546424e-05, "loss": 0.4885, "step": 326 }, { "epoch": 0.15512333965844402, "grad_norm": 3.582542896270752, "learning_rate": 1.9882174085487125e-05, "loss": 0.4793, "step": 327 }, { "epoch": 0.1555977229601518, "grad_norm": 5.127254486083984, "learning_rate": 1.9880995229078253e-05, "loss": 0.5257, "step": 328 }, { "epoch": 0.15607210626185958, "grad_norm": 3.8393657207489014, "learning_rate": 1.9879810540015674e-05, "loss": 0.4924, "step": 329 }, { "epoch": 0.15654648956356737, "grad_norm": 4.303122043609619, "learning_rate": 1.9878620018998696e-05, "loss": 0.5073, "step": 330 }, { "epoch": 0.15702087286527514, "grad_norm": 3.3921568393707275, "learning_rate": 1.9877423666730075e-05, "loss": 0.4634, "step": 331 }, { "epoch": 0.15749525616698293, "grad_norm": 4.370551109313965, "learning_rate": 1.9876221483916006e-05, "loss": 0.641, "step": 332 }, { "epoch": 0.1579696394686907, "grad_norm": 7.480087757110596, "learning_rate": 1.9875013471266124e-05, "loss": 0.5516, "step": 333 }, { "epoch": 0.15844402277039848, "grad_norm": 3.2443172931671143, "learning_rate": 1.9873799629493507e-05, "loss": 0.4536, "step": 334 }, { "epoch": 0.15891840607210625, "grad_norm": 3.3742051124572754, "learning_rate": 1.9872579959314675e-05, "loss": 0.4506, "step": 335 }, { "epoch": 0.15939278937381404, "grad_norm": 4.108852386474609, "learning_rate": 1.987135446144959e-05, "loss": 0.5022, "step": 336 }, { "epoch": 0.15986717267552183, "grad_norm": 3.776676893234253, "learning_rate": 1.9870123136621638e-05, "loss": 0.4441, "step": 337 }, { "epoch": 0.1603415559772296, "grad_norm": 4.488317966461182, "learning_rate": 1.9868885985557675e-05, "loss": 0.5622, "step": 338 }, { "epoch": 0.1608159392789374, "grad_norm": 4.163279056549072, "learning_rate": 1.986764300898797e-05, "loss": 0.5567, "step": 339 }, { "epoch": 0.16129032258064516, "grad_norm": 3.326791524887085, "learning_rate": 1.986639420764624e-05, "loss": 0.4917, "step": 340 }, { "epoch": 0.16176470588235295, "grad_norm": 4.889130115509033, "learning_rate": 1.9865139582269642e-05, "loss": 0.5238, "step": 341 }, { "epoch": 0.16223908918406071, "grad_norm": 3.6789140701293945, "learning_rate": 1.986387913359877e-05, "loss": 0.6143, "step": 342 }, { "epoch": 0.1627134724857685, "grad_norm": 3.560218572616577, "learning_rate": 1.9862612862377652e-05, "loss": 0.4937, "step": 343 }, { "epoch": 0.16318785578747627, "grad_norm": 3.0971603393554688, "learning_rate": 1.9861340769353753e-05, "loss": 0.4614, "step": 344 }, { "epoch": 0.16366223908918406, "grad_norm": 3.41616153717041, "learning_rate": 1.9860062855277982e-05, "loss": 0.4853, "step": 345 }, { "epoch": 0.16413662239089183, "grad_norm": 3.4327688217163086, "learning_rate": 1.985877912090468e-05, "loss": 0.4745, "step": 346 }, { "epoch": 0.16461100569259962, "grad_norm": 3.024824380874634, "learning_rate": 1.9857489566991614e-05, "loss": 0.5228, "step": 347 }, { "epoch": 0.1650853889943074, "grad_norm": 4.104083061218262, "learning_rate": 1.9856194194300005e-05, "loss": 0.5777, "step": 348 }, { "epoch": 0.16555977229601518, "grad_norm": 4.0848798751831055, "learning_rate": 1.9854893003594492e-05, "loss": 0.5047, "step": 349 }, { "epoch": 0.16603415559772297, "grad_norm": 3.6772310733795166, "learning_rate": 1.9853585995643158e-05, "loss": 0.5242, "step": 350 }, { "epoch": 0.16650853889943074, "grad_norm": 3.716475009918213, "learning_rate": 1.9852273171217518e-05, "loss": 0.4722, "step": 351 }, { "epoch": 0.16698292220113853, "grad_norm": 4.485456466674805, "learning_rate": 1.9850954531092515e-05, "loss": 0.4973, "step": 352 }, { "epoch": 0.1674573055028463, "grad_norm": 3.9249846935272217, "learning_rate": 1.9849630076046536e-05, "loss": 0.506, "step": 353 }, { "epoch": 0.16793168880455409, "grad_norm": 3.6390221118927, "learning_rate": 1.9848299806861385e-05, "loss": 0.4511, "step": 354 }, { "epoch": 0.16840607210626185, "grad_norm": 4.2053937911987305, "learning_rate": 1.984696372432231e-05, "loss": 0.523, "step": 355 }, { "epoch": 0.16888045540796964, "grad_norm": 3.690425157546997, "learning_rate": 1.984562182921799e-05, "loss": 0.5692, "step": 356 }, { "epoch": 0.1693548387096774, "grad_norm": 3.777768611907959, "learning_rate": 1.9844274122340534e-05, "loss": 0.4881, "step": 357 }, { "epoch": 0.1698292220113852, "grad_norm": 3.9081265926361084, "learning_rate": 1.9842920604485474e-05, "loss": 0.5767, "step": 358 }, { "epoch": 0.170303605313093, "grad_norm": 4.124914169311523, "learning_rate": 1.984156127645178e-05, "loss": 0.5526, "step": 359 }, { "epoch": 0.17077798861480076, "grad_norm": 3.433190107345581, "learning_rate": 1.9840196139041853e-05, "loss": 0.385, "step": 360 }, { "epoch": 0.17125237191650855, "grad_norm": 3.6542704105377197, "learning_rate": 1.9838825193061518e-05, "loss": 0.4004, "step": 361 }, { "epoch": 0.17172675521821631, "grad_norm": 4.064217567443848, "learning_rate": 1.9837448439320027e-05, "loss": 0.4996, "step": 362 }, { "epoch": 0.1722011385199241, "grad_norm": 3.451735019683838, "learning_rate": 1.9836065878630074e-05, "loss": 0.4737, "step": 363 }, { "epoch": 0.17267552182163187, "grad_norm": 3.747659206390381, "learning_rate": 1.983467751180776e-05, "loss": 0.5078, "step": 364 }, { "epoch": 0.17314990512333966, "grad_norm": 3.898174524307251, "learning_rate": 1.983328333967263e-05, "loss": 0.5549, "step": 365 }, { "epoch": 0.17362428842504743, "grad_norm": 3.5523483753204346, "learning_rate": 1.983188336304765e-05, "loss": 0.5202, "step": 366 }, { "epoch": 0.17409867172675522, "grad_norm": 3.90126633644104, "learning_rate": 1.9830477582759213e-05, "loss": 0.5718, "step": 367 }, { "epoch": 0.174573055028463, "grad_norm": 4.264011383056641, "learning_rate": 1.9829065999637134e-05, "loss": 0.561, "step": 368 }, { "epoch": 0.17504743833017078, "grad_norm": 3.489223003387451, "learning_rate": 1.982764861451466e-05, "loss": 0.5515, "step": 369 }, { "epoch": 0.17552182163187854, "grad_norm": 3.144134998321533, "learning_rate": 1.9826225428228455e-05, "loss": 0.3927, "step": 370 }, { "epoch": 0.17599620493358634, "grad_norm": 3.7182862758636475, "learning_rate": 1.9824796441618617e-05, "loss": 0.5222, "step": 371 }, { "epoch": 0.17647058823529413, "grad_norm": 3.7761785984039307, "learning_rate": 1.9823361655528658e-05, "loss": 0.5393, "step": 372 }, { "epoch": 0.1769449715370019, "grad_norm": 3.673797369003296, "learning_rate": 1.9821921070805522e-05, "loss": 0.5508, "step": 373 }, { "epoch": 0.1774193548387097, "grad_norm": 5.075558662414551, "learning_rate": 1.9820474688299566e-05, "loss": 0.4923, "step": 374 }, { "epoch": 0.17789373814041745, "grad_norm": 2.8637001514434814, "learning_rate": 1.9819022508864582e-05, "loss": 0.4581, "step": 375 }, { "epoch": 0.17836812144212524, "grad_norm": 3.7829291820526123, "learning_rate": 1.9817564533357775e-05, "loss": 0.4989, "step": 376 }, { "epoch": 0.178842504743833, "grad_norm": 3.2799975872039795, "learning_rate": 1.9816100762639773e-05, "loss": 0.513, "step": 377 }, { "epoch": 0.1793168880455408, "grad_norm": 3.540116310119629, "learning_rate": 1.9814631197574626e-05, "loss": 0.4851, "step": 378 }, { "epoch": 0.17979127134724857, "grad_norm": 3.821983814239502, "learning_rate": 1.98131558390298e-05, "loss": 0.4854, "step": 379 }, { "epoch": 0.18026565464895636, "grad_norm": 7.540534973144531, "learning_rate": 1.981167468787619e-05, "loss": 0.5211, "step": 380 }, { "epoch": 0.18074003795066412, "grad_norm": 3.578125, "learning_rate": 1.98101877449881e-05, "loss": 0.5135, "step": 381 }, { "epoch": 0.18121442125237192, "grad_norm": 3.8135201930999756, "learning_rate": 1.980869501124326e-05, "loss": 0.551, "step": 382 }, { "epoch": 0.1816888045540797, "grad_norm": 3.3921446800231934, "learning_rate": 1.9807196487522818e-05, "loss": 0.5174, "step": 383 }, { "epoch": 0.18216318785578747, "grad_norm": 3.473004102706909, "learning_rate": 1.9805692174711337e-05, "loss": 0.5091, "step": 384 }, { "epoch": 0.18263757115749527, "grad_norm": 3.449249505996704, "learning_rate": 1.9804182073696793e-05, "loss": 0.5027, "step": 385 }, { "epoch": 0.18311195445920303, "grad_norm": 3.633528470993042, "learning_rate": 1.980266618537059e-05, "loss": 0.5025, "step": 386 }, { "epoch": 0.18358633776091082, "grad_norm": 3.325308084487915, "learning_rate": 1.980114451062754e-05, "loss": 0.4739, "step": 387 }, { "epoch": 0.1840607210626186, "grad_norm": 3.829240083694458, "learning_rate": 1.979961705036587e-05, "loss": 0.4921, "step": 388 }, { "epoch": 0.18453510436432638, "grad_norm": 3.1751868724823, "learning_rate": 1.9798083805487228e-05, "loss": 0.4318, "step": 389 }, { "epoch": 0.18500948766603414, "grad_norm": 3.521556854248047, "learning_rate": 1.979654477689667e-05, "loss": 0.4898, "step": 390 }, { "epoch": 0.18548387096774194, "grad_norm": 3.7403318881988525, "learning_rate": 1.979499996550267e-05, "loss": 0.5121, "step": 391 }, { "epoch": 0.1859582542694497, "grad_norm": 4.491858959197998, "learning_rate": 1.9793449372217123e-05, "loss": 0.5375, "step": 392 }, { "epoch": 0.1864326375711575, "grad_norm": 3.4098191261291504, "learning_rate": 1.9791892997955317e-05, "loss": 0.5462, "step": 393 }, { "epoch": 0.1869070208728653, "grad_norm": 3.658325672149658, "learning_rate": 1.9790330843635967e-05, "loss": 0.4567, "step": 394 }, { "epoch": 0.18738140417457305, "grad_norm": 4.077053546905518, "learning_rate": 1.97887629101812e-05, "loss": 0.5194, "step": 395 }, { "epoch": 0.18785578747628084, "grad_norm": 3.680041551589966, "learning_rate": 1.9787189198516553e-05, "loss": 0.4515, "step": 396 }, { "epoch": 0.1883301707779886, "grad_norm": 3.7228457927703857, "learning_rate": 1.9785609709570973e-05, "loss": 0.4504, "step": 397 }, { "epoch": 0.1888045540796964, "grad_norm": 4.01034688949585, "learning_rate": 1.9784024444276812e-05, "loss": 0.4637, "step": 398 }, { "epoch": 0.18927893738140417, "grad_norm": 2.7363219261169434, "learning_rate": 1.9782433403569836e-05, "loss": 0.4619, "step": 399 }, { "epoch": 0.18975332068311196, "grad_norm": 3.8097171783447266, "learning_rate": 1.9780836588389225e-05, "loss": 0.578, "step": 400 }, { "epoch": 0.19022770398481972, "grad_norm": 3.741736650466919, "learning_rate": 1.9779233999677563e-05, "loss": 0.5731, "step": 401 }, { "epoch": 0.19070208728652752, "grad_norm": 3.690082311630249, "learning_rate": 1.9777625638380838e-05, "loss": 0.5538, "step": 402 }, { "epoch": 0.19117647058823528, "grad_norm": 3.4858877658843994, "learning_rate": 1.9776011505448455e-05, "loss": 0.4858, "step": 403 }, { "epoch": 0.19165085388994307, "grad_norm": 3.8378117084503174, "learning_rate": 1.977439160183322e-05, "loss": 0.5408, "step": 404 }, { "epoch": 0.19212523719165087, "grad_norm": 3.4993057250976562, "learning_rate": 1.977276592849134e-05, "loss": 0.4522, "step": 405 }, { "epoch": 0.19259962049335863, "grad_norm": 3.624081611633301, "learning_rate": 1.9771134486382436e-05, "loss": 0.4739, "step": 406 }, { "epoch": 0.19307400379506642, "grad_norm": 3.2720398902893066, "learning_rate": 1.9769497276469538e-05, "loss": 0.5003, "step": 407 }, { "epoch": 0.1935483870967742, "grad_norm": 3.429849624633789, "learning_rate": 1.9767854299719073e-05, "loss": 0.5161, "step": 408 }, { "epoch": 0.19402277039848198, "grad_norm": 3.6602742671966553, "learning_rate": 1.976620555710087e-05, "loss": 0.5546, "step": 409 }, { "epoch": 0.19449715370018975, "grad_norm": 4.340588092803955, "learning_rate": 1.9764551049588165e-05, "loss": 0.5262, "step": 410 }, { "epoch": 0.19497153700189754, "grad_norm": 3.39455509185791, "learning_rate": 1.97628907781576e-05, "loss": 0.4408, "step": 411 }, { "epoch": 0.1954459203036053, "grad_norm": 3.521057605743408, "learning_rate": 1.976122474378922e-05, "loss": 0.4413, "step": 412 }, { "epoch": 0.1959203036053131, "grad_norm": 4.244317531585693, "learning_rate": 1.9759552947466462e-05, "loss": 0.5525, "step": 413 }, { "epoch": 0.19639468690702086, "grad_norm": 3.8756752014160156, "learning_rate": 1.9757875390176177e-05, "loss": 0.5267, "step": 414 }, { "epoch": 0.19686907020872865, "grad_norm": 3.6845438480377197, "learning_rate": 1.9756192072908605e-05, "loss": 0.5128, "step": 415 }, { "epoch": 0.19734345351043645, "grad_norm": 4.302131652832031, "learning_rate": 1.9754502996657395e-05, "loss": 0.5089, "step": 416 }, { "epoch": 0.1978178368121442, "grad_norm": 3.6274499893188477, "learning_rate": 1.975280816241959e-05, "loss": 0.4639, "step": 417 }, { "epoch": 0.198292220113852, "grad_norm": 3.127408742904663, "learning_rate": 1.975110757119564e-05, "loss": 0.4801, "step": 418 }, { "epoch": 0.19876660341555977, "grad_norm": 9.724570274353027, "learning_rate": 1.9749401223989376e-05, "loss": 0.6008, "step": 419 }, { "epoch": 0.19924098671726756, "grad_norm": 3.2324442863464355, "learning_rate": 1.9747689121808045e-05, "loss": 0.4913, "step": 420 }, { "epoch": 0.19971537001897532, "grad_norm": 2.958690881729126, "learning_rate": 1.9745971265662286e-05, "loss": 0.3945, "step": 421 }, { "epoch": 0.20018975332068312, "grad_norm": 3.4928455352783203, "learning_rate": 1.9744247656566125e-05, "loss": 0.4863, "step": 422 }, { "epoch": 0.20066413662239088, "grad_norm": 3.0363330841064453, "learning_rate": 1.9742518295536997e-05, "loss": 0.4847, "step": 423 }, { "epoch": 0.20113851992409867, "grad_norm": 3.126666307449341, "learning_rate": 1.9740783183595726e-05, "loss": 0.4281, "step": 424 }, { "epoch": 0.20161290322580644, "grad_norm": 3.345747232437134, "learning_rate": 1.9739042321766533e-05, "loss": 0.4699, "step": 425 }, { "epoch": 0.20208728652751423, "grad_norm": 3.1891050338745117, "learning_rate": 1.973729571107703e-05, "loss": 0.4362, "step": 426 }, { "epoch": 0.20256166982922202, "grad_norm": 3.56644606590271, "learning_rate": 1.973554335255822e-05, "loss": 0.4374, "step": 427 }, { "epoch": 0.2030360531309298, "grad_norm": 3.70436692237854, "learning_rate": 1.9733785247244506e-05, "loss": 0.4582, "step": 428 }, { "epoch": 0.20351043643263758, "grad_norm": 3.7359845638275146, "learning_rate": 1.9732021396173682e-05, "loss": 0.5247, "step": 429 }, { "epoch": 0.20398481973434535, "grad_norm": 3.910747528076172, "learning_rate": 1.973025180038693e-05, "loss": 0.5534, "step": 430 }, { "epoch": 0.20445920303605314, "grad_norm": 3.243649482727051, "learning_rate": 1.9728476460928828e-05, "loss": 0.53, "step": 431 }, { "epoch": 0.2049335863377609, "grad_norm": 3.5252304077148438, "learning_rate": 1.9726695378847332e-05, "loss": 0.5136, "step": 432 }, { "epoch": 0.2054079696394687, "grad_norm": 3.3857786655426025, "learning_rate": 1.972490855519381e-05, "loss": 0.4836, "step": 433 }, { "epoch": 0.20588235294117646, "grad_norm": 3.3901121616363525, "learning_rate": 1.9723115991022997e-05, "loss": 0.4585, "step": 434 }, { "epoch": 0.20635673624288425, "grad_norm": 3.816561460494995, "learning_rate": 1.972131768739303e-05, "loss": 0.5757, "step": 435 }, { "epoch": 0.20683111954459202, "grad_norm": 3.4764530658721924, "learning_rate": 1.9719513645365426e-05, "loss": 0.4892, "step": 436 }, { "epoch": 0.2073055028462998, "grad_norm": 3.6275835037231445, "learning_rate": 1.97177038660051e-05, "loss": 0.4744, "step": 437 }, { "epoch": 0.2077798861480076, "grad_norm": 3.73422908782959, "learning_rate": 1.971588835038034e-05, "loss": 0.6116, "step": 438 }, { "epoch": 0.20825426944971537, "grad_norm": 3.751572370529175, "learning_rate": 1.971406709956283e-05, "loss": 0.534, "step": 439 }, { "epoch": 0.20872865275142316, "grad_norm": 2.7028439044952393, "learning_rate": 1.9712240114627637e-05, "loss": 0.4065, "step": 440 }, { "epoch": 0.20920303605313093, "grad_norm": 2.995476007461548, "learning_rate": 1.971040739665321e-05, "loss": 0.4091, "step": 441 }, { "epoch": 0.20967741935483872, "grad_norm": 2.9523215293884277, "learning_rate": 1.970856894672139e-05, "loss": 0.465, "step": 442 }, { "epoch": 0.21015180265654648, "grad_norm": 3.5105068683624268, "learning_rate": 1.9706724765917384e-05, "loss": 0.459, "step": 443 }, { "epoch": 0.21062618595825428, "grad_norm": 2.8509361743927, "learning_rate": 1.97048748553298e-05, "loss": 0.4519, "step": 444 }, { "epoch": 0.21110056925996204, "grad_norm": 3.4983131885528564, "learning_rate": 1.9703019216050627e-05, "loss": 0.5233, "step": 445 }, { "epoch": 0.21157495256166983, "grad_norm": 3.1177706718444824, "learning_rate": 1.970115784917523e-05, "loss": 0.4847, "step": 446 }, { "epoch": 0.2120493358633776, "grad_norm": 3.1251273155212402, "learning_rate": 1.9699290755802344e-05, "loss": 0.4727, "step": 447 }, { "epoch": 0.2125237191650854, "grad_norm": 2.4966578483581543, "learning_rate": 1.9697417937034106e-05, "loss": 0.4165, "step": 448 }, { "epoch": 0.21299810246679318, "grad_norm": 3.8288350105285645, "learning_rate": 1.969553939397602e-05, "loss": 0.553, "step": 449 }, { "epoch": 0.21347248576850095, "grad_norm": 3.0275819301605225, "learning_rate": 1.9693655127736974e-05, "loss": 0.3849, "step": 450 }, { "epoch": 0.21394686907020874, "grad_norm": 3.2554712295532227, "learning_rate": 1.9691765139429227e-05, "loss": 0.5852, "step": 451 }, { "epoch": 0.2144212523719165, "grad_norm": 3.404175043106079, "learning_rate": 1.968986943016842e-05, "loss": 0.5296, "step": 452 }, { "epoch": 0.2148956356736243, "grad_norm": 3.853860378265381, "learning_rate": 1.9687968001073575e-05, "loss": 0.6205, "step": 453 }, { "epoch": 0.21537001897533206, "grad_norm": 3.2675604820251465, "learning_rate": 1.9686060853267088e-05, "loss": 0.5094, "step": 454 }, { "epoch": 0.21584440227703985, "grad_norm": 4.281660079956055, "learning_rate": 1.9684147987874725e-05, "loss": 0.514, "step": 455 }, { "epoch": 0.21631878557874762, "grad_norm": 3.5292105674743652, "learning_rate": 1.9682229406025635e-05, "loss": 0.4982, "step": 456 }, { "epoch": 0.2167931688804554, "grad_norm": 3.585477352142334, "learning_rate": 1.9680305108852335e-05, "loss": 0.4612, "step": 457 }, { "epoch": 0.21726755218216318, "grad_norm": 3.200887680053711, "learning_rate": 1.9678375097490717e-05, "loss": 0.5057, "step": 458 }, { "epoch": 0.21774193548387097, "grad_norm": 3.3191189765930176, "learning_rate": 1.9676439373080056e-05, "loss": 0.4977, "step": 459 }, { "epoch": 0.21821631878557876, "grad_norm": 3.9757373332977295, "learning_rate": 1.9674497936762984e-05, "loss": 0.5366, "step": 460 }, { "epoch": 0.21869070208728653, "grad_norm": 3.3760838508605957, "learning_rate": 1.9672550789685512e-05, "loss": 0.4957, "step": 461 }, { "epoch": 0.21916508538899432, "grad_norm": 3.371440887451172, "learning_rate": 1.9670597932997023e-05, "loss": 0.5172, "step": 462 }, { "epoch": 0.21963946869070208, "grad_norm": 5.2982001304626465, "learning_rate": 1.966863936785027e-05, "loss": 0.5267, "step": 463 }, { "epoch": 0.22011385199240988, "grad_norm": 3.9646644592285156, "learning_rate": 1.966667509540137e-05, "loss": 0.4803, "step": 464 }, { "epoch": 0.22058823529411764, "grad_norm": 3.059048891067505, "learning_rate": 1.9664705116809823e-05, "loss": 0.4096, "step": 465 }, { "epoch": 0.22106261859582543, "grad_norm": 3.714001178741455, "learning_rate": 1.9662729433238477e-05, "loss": 0.6027, "step": 466 }, { "epoch": 0.2215370018975332, "grad_norm": 3.6565237045288086, "learning_rate": 1.9660748045853567e-05, "loss": 0.4968, "step": 467 }, { "epoch": 0.222011385199241, "grad_norm": 3.092435836791992, "learning_rate": 1.965876095582468e-05, "loss": 0.4037, "step": 468 }, { "epoch": 0.22248576850094876, "grad_norm": 3.3902924060821533, "learning_rate": 1.965676816432478e-05, "loss": 0.5345, "step": 469 }, { "epoch": 0.22296015180265655, "grad_norm": 3.4829235076904297, "learning_rate": 1.9654769672530186e-05, "loss": 0.504, "step": 470 }, { "epoch": 0.22343453510436434, "grad_norm": 3.3810994625091553, "learning_rate": 1.9652765481620596e-05, "loss": 0.4718, "step": 471 }, { "epoch": 0.2239089184060721, "grad_norm": 4.006764888763428, "learning_rate": 1.965075559277906e-05, "loss": 0.5748, "step": 472 }, { "epoch": 0.2243833017077799, "grad_norm": 3.318103313446045, "learning_rate": 1.9648740007191994e-05, "loss": 0.4585, "step": 473 }, { "epoch": 0.22485768500948766, "grad_norm": 3.561944007873535, "learning_rate": 1.9646718726049187e-05, "loss": 0.5574, "step": 474 }, { "epoch": 0.22533206831119545, "grad_norm": 3.0337014198303223, "learning_rate": 1.964469175054377e-05, "loss": 0.4363, "step": 475 }, { "epoch": 0.22580645161290322, "grad_norm": 2.986750364303589, "learning_rate": 1.964265908187225e-05, "loss": 0.4626, "step": 476 }, { "epoch": 0.226280834914611, "grad_norm": 3.6310508251190186, "learning_rate": 1.9640620721234488e-05, "loss": 0.4709, "step": 477 }, { "epoch": 0.22675521821631878, "grad_norm": 3.44527530670166, "learning_rate": 1.963857666983372e-05, "loss": 0.4663, "step": 478 }, { "epoch": 0.22722960151802657, "grad_norm": 4.240223407745361, "learning_rate": 1.963652692887652e-05, "loss": 0.6681, "step": 479 }, { "epoch": 0.22770398481973433, "grad_norm": 3.381404161453247, "learning_rate": 1.9634471499572826e-05, "loss": 0.4429, "step": 480 }, { "epoch": 0.22817836812144213, "grad_norm": 3.138699769973755, "learning_rate": 1.9632410383135946e-05, "loss": 0.4803, "step": 481 }, { "epoch": 0.2286527514231499, "grad_norm": 3.806920051574707, "learning_rate": 1.9630343580782538e-05, "loss": 0.4943, "step": 482 }, { "epoch": 0.22912713472485768, "grad_norm": 3.408698320388794, "learning_rate": 1.9628271093732605e-05, "loss": 0.5208, "step": 483 }, { "epoch": 0.22960151802656548, "grad_norm": 2.962362051010132, "learning_rate": 1.9626192923209524e-05, "loss": 0.4449, "step": 484 }, { "epoch": 0.23007590132827324, "grad_norm": 3.10517954826355, "learning_rate": 1.9624109070440017e-05, "loss": 0.4865, "step": 485 }, { "epoch": 0.23055028462998103, "grad_norm": 3.3021395206451416, "learning_rate": 1.9622019536654154e-05, "loss": 0.5415, "step": 486 }, { "epoch": 0.2310246679316888, "grad_norm": 3.686612606048584, "learning_rate": 1.961992432308538e-05, "loss": 0.4875, "step": 487 }, { "epoch": 0.2314990512333966, "grad_norm": 3.50701904296875, "learning_rate": 1.961782343097047e-05, "loss": 0.459, "step": 488 }, { "epoch": 0.23197343453510436, "grad_norm": 3.291677713394165, "learning_rate": 1.9615716861549557e-05, "loss": 0.5038, "step": 489 }, { "epoch": 0.23244781783681215, "grad_norm": 3.3820300102233887, "learning_rate": 1.9613604616066137e-05, "loss": 0.4445, "step": 490 }, { "epoch": 0.2329222011385199, "grad_norm": 3.5324206352233887, "learning_rate": 1.9611486695767037e-05, "loss": 0.5252, "step": 491 }, { "epoch": 0.2333965844402277, "grad_norm": 3.8190603256225586, "learning_rate": 1.9609363101902456e-05, "loss": 0.4666, "step": 492 }, { "epoch": 0.23387096774193547, "grad_norm": 4.703067779541016, "learning_rate": 1.960723383572592e-05, "loss": 0.4529, "step": 493 }, { "epoch": 0.23434535104364326, "grad_norm": 3.1753973960876465, "learning_rate": 1.960509889849432e-05, "loss": 0.4731, "step": 494 }, { "epoch": 0.23481973434535106, "grad_norm": 3.4943294525146484, "learning_rate": 1.960295829146789e-05, "loss": 0.5578, "step": 495 }, { "epoch": 0.23529411764705882, "grad_norm": 3.4101996421813965, "learning_rate": 1.9600812015910203e-05, "loss": 0.5255, "step": 496 }, { "epoch": 0.2357685009487666, "grad_norm": 3.5300912857055664, "learning_rate": 1.9598660073088186e-05, "loss": 0.5781, "step": 497 }, { "epoch": 0.23624288425047438, "grad_norm": 3.204566478729248, "learning_rate": 1.959650246427211e-05, "loss": 0.5813, "step": 498 }, { "epoch": 0.23671726755218217, "grad_norm": 3.192633628845215, "learning_rate": 1.9594339190735594e-05, "loss": 0.4062, "step": 499 }, { "epoch": 0.23719165085388993, "grad_norm": 3.062784194946289, "learning_rate": 1.9592170253755594e-05, "loss": 0.4764, "step": 500 }, { "epoch": 0.23766603415559773, "grad_norm": 2.8469433784484863, "learning_rate": 1.9589995654612412e-05, "loss": 0.4091, "step": 501 }, { "epoch": 0.2381404174573055, "grad_norm": 3.5228500366210938, "learning_rate": 1.958781539458969e-05, "loss": 0.51, "step": 502 }, { "epoch": 0.23861480075901328, "grad_norm": 3.872974395751953, "learning_rate": 1.9585629474974413e-05, "loss": 0.5438, "step": 503 }, { "epoch": 0.23908918406072105, "grad_norm": 3.1505889892578125, "learning_rate": 1.9583437897056915e-05, "loss": 0.4181, "step": 504 }, { "epoch": 0.23956356736242884, "grad_norm": 3.113068103790283, "learning_rate": 1.958124066213086e-05, "loss": 0.4512, "step": 505 }, { "epoch": 0.24003795066413663, "grad_norm": 3.0535147190093994, "learning_rate": 1.957903777149325e-05, "loss": 0.3913, "step": 506 }, { "epoch": 0.2405123339658444, "grad_norm": 3.152808427810669, "learning_rate": 1.957682922644443e-05, "loss": 0.4813, "step": 507 }, { "epoch": 0.2409867172675522, "grad_norm": 3.2799031734466553, "learning_rate": 1.957461502828809e-05, "loss": 0.4559, "step": 508 }, { "epoch": 0.24146110056925996, "grad_norm": 3.305764675140381, "learning_rate": 1.957239517833124e-05, "loss": 0.4553, "step": 509 }, { "epoch": 0.24193548387096775, "grad_norm": 2.7665562629699707, "learning_rate": 1.957016967788424e-05, "loss": 0.4145, "step": 510 }, { "epoch": 0.2424098671726755, "grad_norm": 3.1011545658111572, "learning_rate": 1.9567938528260778e-05, "loss": 0.5109, "step": 511 }, { "epoch": 0.2428842504743833, "grad_norm": 3.0363271236419678, "learning_rate": 1.9565701730777883e-05, "loss": 0.4649, "step": 512 }, { "epoch": 0.24335863377609107, "grad_norm": 3.1920406818389893, "learning_rate": 1.9563459286755914e-05, "loss": 0.5797, "step": 513 }, { "epoch": 0.24383301707779886, "grad_norm": 3.131014347076416, "learning_rate": 1.9561211197518564e-05, "loss": 0.4634, "step": 514 }, { "epoch": 0.24430740037950663, "grad_norm": 3.515669107437134, "learning_rate": 1.9558957464392853e-05, "loss": 0.5281, "step": 515 }, { "epoch": 0.24478178368121442, "grad_norm": 3.187274932861328, "learning_rate": 1.9556698088709144e-05, "loss": 0.443, "step": 516 }, { "epoch": 0.2452561669829222, "grad_norm": 3.458089828491211, "learning_rate": 1.9554433071801117e-05, "loss": 0.4375, "step": 517 }, { "epoch": 0.24573055028462998, "grad_norm": 3.4186503887176514, "learning_rate": 1.9552162415005797e-05, "loss": 0.5475, "step": 518 }, { "epoch": 0.24620493358633777, "grad_norm": 3.319234848022461, "learning_rate": 1.954988611966352e-05, "loss": 0.4545, "step": 519 }, { "epoch": 0.24667931688804554, "grad_norm": 3.491206169128418, "learning_rate": 1.9547604187117974e-05, "loss": 0.4993, "step": 520 }, { "epoch": 0.24715370018975333, "grad_norm": 3.1041345596313477, "learning_rate": 1.954531661871615e-05, "loss": 0.4866, "step": 521 }, { "epoch": 0.2476280834914611, "grad_norm": 4.381217956542969, "learning_rate": 1.954302341580838e-05, "loss": 0.5897, "step": 522 }, { "epoch": 0.24810246679316889, "grad_norm": 3.8608601093292236, "learning_rate": 1.9540724579748323e-05, "loss": 0.4814, "step": 523 }, { "epoch": 0.24857685009487665, "grad_norm": 3.7535743713378906, "learning_rate": 1.953842011189295e-05, "loss": 0.5214, "step": 524 }, { "epoch": 0.24905123339658444, "grad_norm": 3.2677531242370605, "learning_rate": 1.9536110013602578e-05, "loss": 0.3946, "step": 525 }, { "epoch": 0.2495256166982922, "grad_norm": 3.734286069869995, "learning_rate": 1.9533794286240828e-05, "loss": 0.4848, "step": 526 }, { "epoch": 0.25, "grad_norm": 3.121314764022827, "learning_rate": 1.953147293117465e-05, "loss": 0.4554, "step": 527 }, { "epoch": 0.2504743833017078, "grad_norm": 4.03399658203125, "learning_rate": 1.9529145949774322e-05, "loss": 0.5657, "step": 528 }, { "epoch": 0.2509487666034156, "grad_norm": 3.444110631942749, "learning_rate": 1.952681334341343e-05, "loss": 0.4767, "step": 529 }, { "epoch": 0.2514231499051233, "grad_norm": 3.272854804992676, "learning_rate": 1.9524475113468897e-05, "loss": 0.5218, "step": 530 }, { "epoch": 0.2518975332068311, "grad_norm": 3.146151542663574, "learning_rate": 1.9522131261320952e-05, "loss": 0.47, "step": 531 }, { "epoch": 0.2523719165085389, "grad_norm": 3.1734254360198975, "learning_rate": 1.9519781788353148e-05, "loss": 0.4846, "step": 532 }, { "epoch": 0.2528462998102467, "grad_norm": 4.805948257446289, "learning_rate": 1.9517426695952358e-05, "loss": 0.4689, "step": 533 }, { "epoch": 0.25332068311195444, "grad_norm": 3.5787010192871094, "learning_rate": 1.9515065985508766e-05, "loss": 0.4505, "step": 534 }, { "epoch": 0.25379506641366223, "grad_norm": 3.29832124710083, "learning_rate": 1.9512699658415882e-05, "loss": 0.4994, "step": 535 }, { "epoch": 0.25426944971537, "grad_norm": 3.159043312072754, "learning_rate": 1.951032771607052e-05, "loss": 0.5383, "step": 536 }, { "epoch": 0.2547438330170778, "grad_norm": 3.565232515335083, "learning_rate": 1.9507950159872814e-05, "loss": 0.4824, "step": 537 }, { "epoch": 0.25521821631878555, "grad_norm": 3.3344407081604004, "learning_rate": 1.9505566991226214e-05, "loss": 0.4797, "step": 538 }, { "epoch": 0.25569259962049334, "grad_norm": 3.4726908206939697, "learning_rate": 1.9503178211537483e-05, "loss": 0.5105, "step": 539 }, { "epoch": 0.25616698292220114, "grad_norm": 3.1351394653320312, "learning_rate": 1.9500783822216693e-05, "loss": 0.4824, "step": 540 }, { "epoch": 0.25664136622390893, "grad_norm": 3.049128532409668, "learning_rate": 1.9498383824677223e-05, "loss": 0.518, "step": 541 }, { "epoch": 0.2571157495256167, "grad_norm": 3.9116055965423584, "learning_rate": 1.9495978220335774e-05, "loss": 0.4997, "step": 542 }, { "epoch": 0.25759013282732446, "grad_norm": 3.9851927757263184, "learning_rate": 1.949356701061235e-05, "loss": 0.4598, "step": 543 }, { "epoch": 0.25806451612903225, "grad_norm": 3.269645929336548, "learning_rate": 1.9491150196930258e-05, "loss": 0.4539, "step": 544 }, { "epoch": 0.25853889943074004, "grad_norm": 3.270271062850952, "learning_rate": 1.9488727780716125e-05, "loss": 0.4371, "step": 545 }, { "epoch": 0.25901328273244784, "grad_norm": 3.121974468231201, "learning_rate": 1.948629976339988e-05, "loss": 0.4952, "step": 546 }, { "epoch": 0.2594876660341556, "grad_norm": 2.9817652702331543, "learning_rate": 1.9483866146414756e-05, "loss": 0.4515, "step": 547 }, { "epoch": 0.25996204933586337, "grad_norm": 3.0989949703216553, "learning_rate": 1.9481426931197293e-05, "loss": 0.4139, "step": 548 }, { "epoch": 0.26043643263757116, "grad_norm": 4.340046405792236, "learning_rate": 1.9478982119187333e-05, "loss": 0.6325, "step": 549 }, { "epoch": 0.26091081593927895, "grad_norm": 3.455524444580078, "learning_rate": 1.9476531711828027e-05, "loss": 0.5801, "step": 550 }, { "epoch": 0.26138519924098674, "grad_norm": 3.185102939605713, "learning_rate": 1.9474075710565825e-05, "loss": 0.5409, "step": 551 }, { "epoch": 0.2618595825426945, "grad_norm": 3.527089834213257, "learning_rate": 1.9471614116850482e-05, "loss": 0.44, "step": 552 }, { "epoch": 0.2623339658444023, "grad_norm": 3.273226499557495, "learning_rate": 1.946914693213505e-05, "loss": 0.4909, "step": 553 }, { "epoch": 0.26280834914611007, "grad_norm": 3.4487481117248535, "learning_rate": 1.946667415787589e-05, "loss": 0.4712, "step": 554 }, { "epoch": 0.26328273244781786, "grad_norm": 3.9401209354400635, "learning_rate": 1.9464195795532648e-05, "loss": 0.4435, "step": 555 }, { "epoch": 0.2637571157495256, "grad_norm": 3.349949836730957, "learning_rate": 1.946171184656828e-05, "loss": 0.4909, "step": 556 }, { "epoch": 0.2642314990512334, "grad_norm": 4.190546989440918, "learning_rate": 1.9459222312449036e-05, "loss": 0.5163, "step": 557 }, { "epoch": 0.2647058823529412, "grad_norm": 3.7838876247406006, "learning_rate": 1.9456727194644465e-05, "loss": 0.5356, "step": 558 }, { "epoch": 0.265180265654649, "grad_norm": 3.253169536590576, "learning_rate": 1.945422649462741e-05, "loss": 0.5241, "step": 559 }, { "epoch": 0.2656546489563567, "grad_norm": 3.7193899154663086, "learning_rate": 1.9451720213874007e-05, "loss": 0.4999, "step": 560 }, { "epoch": 0.2661290322580645, "grad_norm": 3.82112717628479, "learning_rate": 1.9449208353863693e-05, "loss": 0.4958, "step": 561 }, { "epoch": 0.2666034155597723, "grad_norm": 3.6052136421203613, "learning_rate": 1.944669091607919e-05, "loss": 0.5644, "step": 562 }, { "epoch": 0.2670777988614801, "grad_norm": 3.2630624771118164, "learning_rate": 1.9444167902006516e-05, "loss": 0.4038, "step": 563 }, { "epoch": 0.2675521821631879, "grad_norm": 3.3602817058563232, "learning_rate": 1.944163931313499e-05, "loss": 0.4928, "step": 564 }, { "epoch": 0.2680265654648956, "grad_norm": 2.9780235290527344, "learning_rate": 1.94391051509572e-05, "loss": 0.4451, "step": 565 }, { "epoch": 0.2685009487666034, "grad_norm": 3.799266815185547, "learning_rate": 1.9436565416969045e-05, "loss": 0.5159, "step": 566 }, { "epoch": 0.2689753320683112, "grad_norm": 3.299086093902588, "learning_rate": 1.9434020112669706e-05, "loss": 0.4683, "step": 567 }, { "epoch": 0.269449715370019, "grad_norm": 3.3280222415924072, "learning_rate": 1.9431469239561646e-05, "loss": 0.4656, "step": 568 }, { "epoch": 0.26992409867172673, "grad_norm": 3.7164220809936523, "learning_rate": 1.9428912799150624e-05, "loss": 0.4551, "step": 569 }, { "epoch": 0.2703984819734345, "grad_norm": 3.125343084335327, "learning_rate": 1.9426350792945676e-05, "loss": 0.4851, "step": 570 }, { "epoch": 0.2708728652751423, "grad_norm": 2.9615533351898193, "learning_rate": 1.9423783222459135e-05, "loss": 0.4318, "step": 571 }, { "epoch": 0.2713472485768501, "grad_norm": 2.992379903793335, "learning_rate": 1.942121008920661e-05, "loss": 0.4402, "step": 572 }, { "epoch": 0.2718216318785579, "grad_norm": 4.136736869812012, "learning_rate": 1.9418631394706998e-05, "loss": 0.5978, "step": 573 }, { "epoch": 0.27229601518026564, "grad_norm": 3.9404451847076416, "learning_rate": 1.941604714048247e-05, "loss": 0.5606, "step": 574 }, { "epoch": 0.27277039848197343, "grad_norm": 3.232971668243408, "learning_rate": 1.9413457328058495e-05, "loss": 0.5066, "step": 575 }, { "epoch": 0.2732447817836812, "grad_norm": 3.407205820083618, "learning_rate": 1.941086195896381e-05, "loss": 0.5584, "step": 576 }, { "epoch": 0.273719165085389, "grad_norm": 3.10275936126709, "learning_rate": 1.940826103473043e-05, "loss": 0.4627, "step": 577 }, { "epoch": 0.27419354838709675, "grad_norm": 3.237243413925171, "learning_rate": 1.9405654556893667e-05, "loss": 0.4897, "step": 578 }, { "epoch": 0.27466793168880455, "grad_norm": 3.3360302448272705, "learning_rate": 1.9403042526992087e-05, "loss": 0.5143, "step": 579 }, { "epoch": 0.27514231499051234, "grad_norm": 3.802823305130005, "learning_rate": 1.9400424946567552e-05, "loss": 0.492, "step": 580 }, { "epoch": 0.27561669829222013, "grad_norm": 2.7487895488739014, "learning_rate": 1.9397801817165192e-05, "loss": 0.4175, "step": 581 }, { "epoch": 0.27609108159392787, "grad_norm": 3.0970826148986816, "learning_rate": 1.9395173140333413e-05, "loss": 0.4156, "step": 582 }, { "epoch": 0.27656546489563566, "grad_norm": 4.566722393035889, "learning_rate": 1.93925389176239e-05, "loss": 0.5653, "step": 583 }, { "epoch": 0.27703984819734345, "grad_norm": 2.8217339515686035, "learning_rate": 1.9389899150591605e-05, "loss": 0.4451, "step": 584 }, { "epoch": 0.27751423149905124, "grad_norm": 3.9668426513671875, "learning_rate": 1.938725384079476e-05, "loss": 0.5351, "step": 585 }, { "epoch": 0.27798861480075904, "grad_norm": 2.601659059524536, "learning_rate": 1.9384602989794868e-05, "loss": 0.3897, "step": 586 }, { "epoch": 0.2784629981024668, "grad_norm": 4.612437725067139, "learning_rate": 1.938194659915669e-05, "loss": 0.436, "step": 587 }, { "epoch": 0.27893738140417457, "grad_norm": 3.368626356124878, "learning_rate": 1.9379284670448274e-05, "loss": 0.3943, "step": 588 }, { "epoch": 0.27941176470588236, "grad_norm": 3.076282262802124, "learning_rate": 1.937661720524093e-05, "loss": 0.4004, "step": 589 }, { "epoch": 0.27988614800759015, "grad_norm": 2.78485107421875, "learning_rate": 1.9373944205109236e-05, "loss": 0.35, "step": 590 }, { "epoch": 0.2803605313092979, "grad_norm": 3.2779622077941895, "learning_rate": 1.9371265671631038e-05, "loss": 0.4068, "step": 591 }, { "epoch": 0.2808349146110057, "grad_norm": 2.9968180656433105, "learning_rate": 1.9368581606387442e-05, "loss": 0.4667, "step": 592 }, { "epoch": 0.2813092979127135, "grad_norm": 3.2578439712524414, "learning_rate": 1.9365892010962834e-05, "loss": 0.4826, "step": 593 }, { "epoch": 0.28178368121442127, "grad_norm": 3.467496395111084, "learning_rate": 1.936319688694485e-05, "loss": 0.503, "step": 594 }, { "epoch": 0.28225806451612906, "grad_norm": 2.929417848587036, "learning_rate": 1.9360496235924396e-05, "loss": 0.4527, "step": 595 }, { "epoch": 0.2827324478178368, "grad_norm": 3.066240072250366, "learning_rate": 1.935779005949564e-05, "loss": 0.4604, "step": 596 }, { "epoch": 0.2832068311195446, "grad_norm": 2.88321852684021, "learning_rate": 1.935507835925601e-05, "loss": 0.4544, "step": 597 }, { "epoch": 0.2836812144212524, "grad_norm": 2.9748549461364746, "learning_rate": 1.93523611368062e-05, "loss": 0.4272, "step": 598 }, { "epoch": 0.2841555977229602, "grad_norm": 2.744473695755005, "learning_rate": 1.9349638393750156e-05, "loss": 0.4874, "step": 599 }, { "epoch": 0.2846299810246679, "grad_norm": 3.6597301959991455, "learning_rate": 1.9346910131695084e-05, "loss": 0.4996, "step": 600 }, { "epoch": 0.2851043643263757, "grad_norm": 2.5389652252197266, "learning_rate": 1.9344176352251456e-05, "loss": 0.3724, "step": 601 }, { "epoch": 0.2855787476280835, "grad_norm": 3.2770135402679443, "learning_rate": 1.934143705703299e-05, "loss": 0.4549, "step": 602 }, { "epoch": 0.2860531309297913, "grad_norm": 2.7468864917755127, "learning_rate": 1.933869224765667e-05, "loss": 0.4371, "step": 603 }, { "epoch": 0.286527514231499, "grad_norm": 2.662562370300293, "learning_rate": 1.933594192574272e-05, "loss": 0.4146, "step": 604 }, { "epoch": 0.2870018975332068, "grad_norm": 2.997250556945801, "learning_rate": 1.933318609291464e-05, "loss": 0.485, "step": 605 }, { "epoch": 0.2874762808349146, "grad_norm": 2.8720083236694336, "learning_rate": 1.9330424750799165e-05, "loss": 0.4197, "step": 606 }, { "epoch": 0.2879506641366224, "grad_norm": 3.827880620956421, "learning_rate": 1.9327657901026284e-05, "loss": 0.4941, "step": 607 }, { "epoch": 0.2884250474383302, "grad_norm": 3.460559368133545, "learning_rate": 1.9324885545229248e-05, "loss": 0.4842, "step": 608 }, { "epoch": 0.28889943074003793, "grad_norm": 2.8307604789733887, "learning_rate": 1.932210768504455e-05, "loss": 0.4564, "step": 609 }, { "epoch": 0.2893738140417457, "grad_norm": 3.346304416656494, "learning_rate": 1.9319324322111928e-05, "loss": 0.5572, "step": 610 }, { "epoch": 0.2898481973434535, "grad_norm": 3.4799015522003174, "learning_rate": 1.931653545807438e-05, "loss": 0.542, "step": 611 }, { "epoch": 0.2903225806451613, "grad_norm": 2.8453822135925293, "learning_rate": 1.931374109457814e-05, "loss": 0.3665, "step": 612 }, { "epoch": 0.29079696394686905, "grad_norm": 3.6189804077148438, "learning_rate": 1.9310941233272698e-05, "loss": 0.4945, "step": 613 }, { "epoch": 0.29127134724857684, "grad_norm": 3.2037627696990967, "learning_rate": 1.9308135875810778e-05, "loss": 0.4499, "step": 614 }, { "epoch": 0.29174573055028463, "grad_norm": 3.068361520767212, "learning_rate": 1.930532502384836e-05, "loss": 0.4529, "step": 615 }, { "epoch": 0.2922201138519924, "grad_norm": 2.5314414501190186, "learning_rate": 1.9302508679044662e-05, "loss": 0.3866, "step": 616 }, { "epoch": 0.2926944971537002, "grad_norm": 3.3053672313690186, "learning_rate": 1.929968684306214e-05, "loss": 0.5091, "step": 617 }, { "epoch": 0.29316888045540795, "grad_norm": 3.5351502895355225, "learning_rate": 1.9296859517566505e-05, "loss": 0.5416, "step": 618 }, { "epoch": 0.29364326375711575, "grad_norm": 3.617424964904785, "learning_rate": 1.929402670422669e-05, "loss": 0.5325, "step": 619 }, { "epoch": 0.29411764705882354, "grad_norm": 2.799778938293457, "learning_rate": 1.9291188404714876e-05, "loss": 0.514, "step": 620 }, { "epoch": 0.29459203036053133, "grad_norm": 3.1185672283172607, "learning_rate": 1.9288344620706493e-05, "loss": 0.5391, "step": 621 }, { "epoch": 0.29506641366223907, "grad_norm": 3.1492013931274414, "learning_rate": 1.9285495353880187e-05, "loss": 0.4585, "step": 622 }, { "epoch": 0.29554079696394686, "grad_norm": 3.211419105529785, "learning_rate": 1.928264060591786e-05, "loss": 0.4983, "step": 623 }, { "epoch": 0.29601518026565465, "grad_norm": 3.3846914768218994, "learning_rate": 1.927978037850464e-05, "loss": 0.5182, "step": 624 }, { "epoch": 0.29648956356736245, "grad_norm": 2.692244052886963, "learning_rate": 1.927691467332889e-05, "loss": 0.4298, "step": 625 }, { "epoch": 0.2969639468690702, "grad_norm": 2.660977363586426, "learning_rate": 1.9274043492082205e-05, "loss": 0.396, "step": 626 }, { "epoch": 0.297438330170778, "grad_norm": 3.270251989364624, "learning_rate": 1.9271166836459418e-05, "loss": 0.499, "step": 627 }, { "epoch": 0.29791271347248577, "grad_norm": 3.6545050144195557, "learning_rate": 1.926828470815859e-05, "loss": 0.5114, "step": 628 }, { "epoch": 0.29838709677419356, "grad_norm": 3.5217530727386475, "learning_rate": 1.9265397108881015e-05, "loss": 0.4573, "step": 629 }, { "epoch": 0.29886148007590135, "grad_norm": 3.2720742225646973, "learning_rate": 1.9262504040331208e-05, "loss": 0.4423, "step": 630 }, { "epoch": 0.2993358633776091, "grad_norm": 3.0071029663085938, "learning_rate": 1.9259605504216922e-05, "loss": 0.4364, "step": 631 }, { "epoch": 0.2998102466793169, "grad_norm": 4.148307800292969, "learning_rate": 1.925670150224914e-05, "loss": 0.4924, "step": 632 }, { "epoch": 0.3002846299810247, "grad_norm": 3.19143009185791, "learning_rate": 1.9253792036142052e-05, "loss": 0.5972, "step": 633 }, { "epoch": 0.30075901328273247, "grad_norm": 2.9321649074554443, "learning_rate": 1.92508771076131e-05, "loss": 0.3967, "step": 634 }, { "epoch": 0.3012333965844402, "grad_norm": 3.0709781646728516, "learning_rate": 1.9247956718382933e-05, "loss": 0.5369, "step": 635 }, { "epoch": 0.301707779886148, "grad_norm": 2.6908938884735107, "learning_rate": 1.9245030870175427e-05, "loss": 0.4135, "step": 636 }, { "epoch": 0.3021821631878558, "grad_norm": 2.776761054992676, "learning_rate": 1.9242099564717683e-05, "loss": 0.519, "step": 637 }, { "epoch": 0.3026565464895636, "grad_norm": 3.2145349979400635, "learning_rate": 1.9239162803740016e-05, "loss": 0.4982, "step": 638 }, { "epoch": 0.3031309297912713, "grad_norm": 2.7437076568603516, "learning_rate": 1.9236220588975976e-05, "loss": 0.4159, "step": 639 }, { "epoch": 0.3036053130929791, "grad_norm": 3.104196310043335, "learning_rate": 1.9233272922162318e-05, "loss": 0.493, "step": 640 }, { "epoch": 0.3040796963946869, "grad_norm": 3.3658387660980225, "learning_rate": 1.9230319805039022e-05, "loss": 0.5201, "step": 641 }, { "epoch": 0.3045540796963947, "grad_norm": 2.732059955596924, "learning_rate": 1.922736123934928e-05, "loss": 0.4961, "step": 642 }, { "epoch": 0.3050284629981025, "grad_norm": 3.038705825805664, "learning_rate": 1.922439722683951e-05, "loss": 0.4819, "step": 643 }, { "epoch": 0.3055028462998102, "grad_norm": 2.702758312225342, "learning_rate": 1.9221427769259333e-05, "loss": 0.4632, "step": 644 }, { "epoch": 0.305977229601518, "grad_norm": 3.554279327392578, "learning_rate": 1.9218452868361597e-05, "loss": 0.4135, "step": 645 }, { "epoch": 0.3064516129032258, "grad_norm": 3.9966790676116943, "learning_rate": 1.921547252590235e-05, "loss": 0.537, "step": 646 }, { "epoch": 0.3069259962049336, "grad_norm": 3.105196475982666, "learning_rate": 1.9212486743640864e-05, "loss": 0.474, "step": 647 }, { "epoch": 0.30740037950664134, "grad_norm": 3.0190954208374023, "learning_rate": 1.9209495523339614e-05, "loss": 0.4715, "step": 648 }, { "epoch": 0.30787476280834913, "grad_norm": 3.061038017272949, "learning_rate": 1.920649886676429e-05, "loss": 0.5493, "step": 649 }, { "epoch": 0.3083491461100569, "grad_norm": 3.0604310035705566, "learning_rate": 1.9203496775683787e-05, "loss": 0.4818, "step": 650 }, { "epoch": 0.3088235294117647, "grad_norm": 3.209660768508911, "learning_rate": 1.9200489251870207e-05, "loss": 0.3923, "step": 651 }, { "epoch": 0.3092979127134725, "grad_norm": 3.1261727809906006, "learning_rate": 1.9197476297098868e-05, "loss": 0.4662, "step": 652 }, { "epoch": 0.30977229601518025, "grad_norm": 3.9394774436950684, "learning_rate": 1.919445791314828e-05, "loss": 0.4693, "step": 653 }, { "epoch": 0.31024667931688804, "grad_norm": 2.708280086517334, "learning_rate": 1.9191434101800174e-05, "loss": 0.4855, "step": 654 }, { "epoch": 0.31072106261859583, "grad_norm": 3.1161880493164062, "learning_rate": 1.9188404864839465e-05, "loss": 0.5513, "step": 655 }, { "epoch": 0.3111954459203036, "grad_norm": 3.3281567096710205, "learning_rate": 1.918537020405429e-05, "loss": 0.4403, "step": 656 }, { "epoch": 0.31166982922201136, "grad_norm": 3.432615041732788, "learning_rate": 1.9182330121235978e-05, "loss": 0.5654, "step": 657 }, { "epoch": 0.31214421252371916, "grad_norm": 3.0545363426208496, "learning_rate": 1.917928461817906e-05, "loss": 0.4361, "step": 658 }, { "epoch": 0.31261859582542695, "grad_norm": 2.9954404830932617, "learning_rate": 1.917623369668126e-05, "loss": 0.4687, "step": 659 }, { "epoch": 0.31309297912713474, "grad_norm": 3.5407347679138184, "learning_rate": 1.9173177358543512e-05, "loss": 0.5126, "step": 660 }, { "epoch": 0.3135673624288425, "grad_norm": 3.2592506408691406, "learning_rate": 1.9170115605569945e-05, "loss": 0.5198, "step": 661 }, { "epoch": 0.31404174573055027, "grad_norm": 2.86240291595459, "learning_rate": 1.9167048439567876e-05, "loss": 0.4064, "step": 662 }, { "epoch": 0.31451612903225806, "grad_norm": 2.8558406829833984, "learning_rate": 1.9163975862347824e-05, "loss": 0.4397, "step": 663 }, { "epoch": 0.31499051233396586, "grad_norm": 2.9497201442718506, "learning_rate": 1.9160897875723505e-05, "loss": 0.447, "step": 664 }, { "epoch": 0.31546489563567365, "grad_norm": 2.7483532428741455, "learning_rate": 1.915781448151182e-05, "loss": 0.4017, "step": 665 }, { "epoch": 0.3159392789373814, "grad_norm": 3.7263011932373047, "learning_rate": 1.9154725681532867e-05, "loss": 0.5402, "step": 666 }, { "epoch": 0.3164136622390892, "grad_norm": 3.7040700912475586, "learning_rate": 1.9151631477609932e-05, "loss": 0.4836, "step": 667 }, { "epoch": 0.31688804554079697, "grad_norm": 2.6552894115448, "learning_rate": 1.9148531871569496e-05, "loss": 0.4124, "step": 668 }, { "epoch": 0.31736242884250476, "grad_norm": 3.1435623168945312, "learning_rate": 1.9145426865241224e-05, "loss": 0.4807, "step": 669 }, { "epoch": 0.3178368121442125, "grad_norm": 2.9633023738861084, "learning_rate": 1.9142316460457974e-05, "loss": 0.4609, "step": 670 }, { "epoch": 0.3183111954459203, "grad_norm": 3.0346839427948, "learning_rate": 1.9139200659055785e-05, "loss": 0.4028, "step": 671 }, { "epoch": 0.3187855787476281, "grad_norm": 3.140936851501465, "learning_rate": 1.913607946287388e-05, "loss": 0.4631, "step": 672 }, { "epoch": 0.3192599620493359, "grad_norm": 2.8173251152038574, "learning_rate": 1.9132952873754675e-05, "loss": 0.4232, "step": 673 }, { "epoch": 0.31973434535104367, "grad_norm": 2.9139256477355957, "learning_rate": 1.9129820893543766e-05, "loss": 0.4401, "step": 674 }, { "epoch": 0.3202087286527514, "grad_norm": 3.9275453090667725, "learning_rate": 1.912668352408992e-05, "loss": 0.5251, "step": 675 }, { "epoch": 0.3206831119544592, "grad_norm": 2.808434247970581, "learning_rate": 1.9123540767245107e-05, "loss": 0.4207, "step": 676 }, { "epoch": 0.321157495256167, "grad_norm": 2.854140043258667, "learning_rate": 1.912039262486446e-05, "loss": 0.4516, "step": 677 }, { "epoch": 0.3216318785578748, "grad_norm": 3.1188645362854004, "learning_rate": 1.9117239098806296e-05, "loss": 0.473, "step": 678 }, { "epoch": 0.3221062618595825, "grad_norm": 2.9418070316314697, "learning_rate": 1.911408019093211e-05, "loss": 0.4666, "step": 679 }, { "epoch": 0.3225806451612903, "grad_norm": 7.523387908935547, "learning_rate": 1.9110915903106577e-05, "loss": 0.4657, "step": 680 }, { "epoch": 0.3230550284629981, "grad_norm": 3.0443685054779053, "learning_rate": 1.9107746237197542e-05, "loss": 0.5032, "step": 681 }, { "epoch": 0.3235294117647059, "grad_norm": 3.1826181411743164, "learning_rate": 1.910457119507603e-05, "loss": 0.4716, "step": 682 }, { "epoch": 0.32400379506641364, "grad_norm": 2.9390838146209717, "learning_rate": 1.9101390778616232e-05, "loss": 0.3951, "step": 683 }, { "epoch": 0.32447817836812143, "grad_norm": 3.393476963043213, "learning_rate": 1.909820498969552e-05, "loss": 0.5147, "step": 684 }, { "epoch": 0.3249525616698292, "grad_norm": 3.6798181533813477, "learning_rate": 1.9095013830194432e-05, "loss": 0.4758, "step": 685 }, { "epoch": 0.325426944971537, "grad_norm": 3.2658169269561768, "learning_rate": 1.9091817301996684e-05, "loss": 0.4902, "step": 686 }, { "epoch": 0.3259013282732448, "grad_norm": 3.256455898284912, "learning_rate": 1.9088615406989146e-05, "loss": 0.5238, "step": 687 }, { "epoch": 0.32637571157495254, "grad_norm": 3.187185287475586, "learning_rate": 1.908540814706187e-05, "loss": 0.4681, "step": 688 }, { "epoch": 0.32685009487666034, "grad_norm": 2.9497554302215576, "learning_rate": 1.9082195524108068e-05, "loss": 0.4091, "step": 689 }, { "epoch": 0.32732447817836813, "grad_norm": 3.060279607772827, "learning_rate": 1.907897754002412e-05, "loss": 0.504, "step": 690 }, { "epoch": 0.3277988614800759, "grad_norm": 2.83272123336792, "learning_rate": 1.9075754196709574e-05, "loss": 0.4661, "step": 691 }, { "epoch": 0.32827324478178366, "grad_norm": 3.2421908378601074, "learning_rate": 1.9072525496067128e-05, "loss": 0.3761, "step": 692 }, { "epoch": 0.32874762808349145, "grad_norm": 3.3531394004821777, "learning_rate": 1.9069291440002665e-05, "loss": 0.553, "step": 693 }, { "epoch": 0.32922201138519924, "grad_norm": 3.3758304119110107, "learning_rate": 1.9066052030425206e-05, "loss": 0.5256, "step": 694 }, { "epoch": 0.32969639468690703, "grad_norm": 3.2974579334259033, "learning_rate": 1.9062807269246945e-05, "loss": 0.4929, "step": 695 }, { "epoch": 0.3301707779886148, "grad_norm": 3.126060962677002, "learning_rate": 1.9059557158383234e-05, "loss": 0.4732, "step": 696 }, { "epoch": 0.33064516129032256, "grad_norm": 3.069438934326172, "learning_rate": 1.9056301699752578e-05, "loss": 0.4659, "step": 697 }, { "epoch": 0.33111954459203036, "grad_norm": 3.131999969482422, "learning_rate": 1.9053040895276652e-05, "loss": 0.486, "step": 698 }, { "epoch": 0.33159392789373815, "grad_norm": 3.255291700363159, "learning_rate": 1.904977474688026e-05, "loss": 0.4963, "step": 699 }, { "epoch": 0.33206831119544594, "grad_norm": 3.3611719608306885, "learning_rate": 1.9046503256491395e-05, "loss": 0.5017, "step": 700 }, { "epoch": 0.3325426944971537, "grad_norm": 2.538780689239502, "learning_rate": 1.904322642604117e-05, "loss": 0.405, "step": 701 }, { "epoch": 0.33301707779886147, "grad_norm": 2.767956495285034, "learning_rate": 1.9039944257463875e-05, "loss": 0.3662, "step": 702 }, { "epoch": 0.33349146110056926, "grad_norm": 2.7074410915374756, "learning_rate": 1.903665675269694e-05, "loss": 0.4577, "step": 703 }, { "epoch": 0.33396584440227706, "grad_norm": 3.3300936222076416, "learning_rate": 1.9033363913680944e-05, "loss": 0.4979, "step": 704 }, { "epoch": 0.3344402277039848, "grad_norm": 2.8957359790802, "learning_rate": 1.9030065742359618e-05, "loss": 0.359, "step": 705 }, { "epoch": 0.3349146110056926, "grad_norm": 3.4609487056732178, "learning_rate": 1.9026762240679843e-05, "loss": 0.5508, "step": 706 }, { "epoch": 0.3353889943074004, "grad_norm": 3.0877790451049805, "learning_rate": 1.902345341059164e-05, "loss": 0.5167, "step": 707 }, { "epoch": 0.33586337760910817, "grad_norm": 3.2066214084625244, "learning_rate": 1.9020139254048174e-05, "loss": 0.4389, "step": 708 }, { "epoch": 0.33633776091081596, "grad_norm": 3.8554136753082275, "learning_rate": 1.9016819773005774e-05, "loss": 0.5015, "step": 709 }, { "epoch": 0.3368121442125237, "grad_norm": 3.0794146060943604, "learning_rate": 1.901349496942388e-05, "loss": 0.4256, "step": 710 }, { "epoch": 0.3372865275142315, "grad_norm": 3.269186496734619, "learning_rate": 1.9010164845265103e-05, "loss": 0.4864, "step": 711 }, { "epoch": 0.3377609108159393, "grad_norm": 3.30047607421875, "learning_rate": 1.9006829402495174e-05, "loss": 0.5635, "step": 712 }, { "epoch": 0.3382352941176471, "grad_norm": 3.331386089324951, "learning_rate": 1.9003488643082978e-05, "loss": 0.5245, "step": 713 }, { "epoch": 0.3387096774193548, "grad_norm": 3.2732012271881104, "learning_rate": 1.9000142569000524e-05, "loss": 0.4657, "step": 714 }, { "epoch": 0.3391840607210626, "grad_norm": 2.6173768043518066, "learning_rate": 1.8996791182222977e-05, "loss": 0.4839, "step": 715 }, { "epoch": 0.3396584440227704, "grad_norm": 2.7928354740142822, "learning_rate": 1.899343448472862e-05, "loss": 0.5419, "step": 716 }, { "epoch": 0.3401328273244782, "grad_norm": 2.8685014247894287, "learning_rate": 1.899007247849888e-05, "loss": 0.3946, "step": 717 }, { "epoch": 0.340607210626186, "grad_norm": 2.6877050399780273, "learning_rate": 1.8986705165518318e-05, "loss": 0.457, "step": 718 }, { "epoch": 0.3410815939278937, "grad_norm": 3.0826516151428223, "learning_rate": 1.898333254777462e-05, "loss": 0.4636, "step": 719 }, { "epoch": 0.3415559772296015, "grad_norm": 4.633234024047852, "learning_rate": 1.897995462725862e-05, "loss": 0.4727, "step": 720 }, { "epoch": 0.3420303605313093, "grad_norm": 2.6432247161865234, "learning_rate": 1.8976571405964258e-05, "loss": 0.4449, "step": 721 }, { "epoch": 0.3425047438330171, "grad_norm": 5.404833793640137, "learning_rate": 1.8973182885888626e-05, "loss": 0.4411, "step": 722 }, { "epoch": 0.34297912713472484, "grad_norm": 2.931230306625366, "learning_rate": 1.8969789069031927e-05, "loss": 0.4184, "step": 723 }, { "epoch": 0.34345351043643263, "grad_norm": 3.3786165714263916, "learning_rate": 1.8966389957397503e-05, "loss": 0.4311, "step": 724 }, { "epoch": 0.3439278937381404, "grad_norm": 2.9587435722351074, "learning_rate": 1.896298555299181e-05, "loss": 0.4465, "step": 725 }, { "epoch": 0.3444022770398482, "grad_norm": 2.8019826412200928, "learning_rate": 1.895957585782444e-05, "loss": 0.4498, "step": 726 }, { "epoch": 0.34487666034155595, "grad_norm": 3.1993303298950195, "learning_rate": 1.8956160873908097e-05, "loss": 0.4221, "step": 727 }, { "epoch": 0.34535104364326374, "grad_norm": 2.909705877304077, "learning_rate": 1.895274060325862e-05, "loss": 0.4789, "step": 728 }, { "epoch": 0.34582542694497154, "grad_norm": 3.371626138687134, "learning_rate": 1.8949315047894956e-05, "loss": 0.5294, "step": 729 }, { "epoch": 0.34629981024667933, "grad_norm": 3.704002618789673, "learning_rate": 1.8945884209839172e-05, "loss": 0.5202, "step": 730 }, { "epoch": 0.3467741935483871, "grad_norm": 2.856421709060669, "learning_rate": 1.8942448091116464e-05, "loss": 0.4168, "step": 731 }, { "epoch": 0.34724857685009486, "grad_norm": 3.03263783454895, "learning_rate": 1.8939006693755138e-05, "loss": 0.4592, "step": 732 }, { "epoch": 0.34772296015180265, "grad_norm": 3.3531453609466553, "learning_rate": 1.8935560019786618e-05, "loss": 0.4228, "step": 733 }, { "epoch": 0.34819734345351044, "grad_norm": 2.8518269062042236, "learning_rate": 1.8932108071245435e-05, "loss": 0.4481, "step": 734 }, { "epoch": 0.34867172675521824, "grad_norm": 3.01810622215271, "learning_rate": 1.8928650850169246e-05, "loss": 0.5109, "step": 735 }, { "epoch": 0.349146110056926, "grad_norm": 3.7409400939941406, "learning_rate": 1.8925188358598815e-05, "loss": 0.5323, "step": 736 }, { "epoch": 0.34962049335863377, "grad_norm": 2.571120262145996, "learning_rate": 1.892172059857801e-05, "loss": 0.4073, "step": 737 }, { "epoch": 0.35009487666034156, "grad_norm": 3.2061607837677, "learning_rate": 1.8918247572153822e-05, "loss": 0.5185, "step": 738 }, { "epoch": 0.35056925996204935, "grad_norm": 2.933711528778076, "learning_rate": 1.8914769281376345e-05, "loss": 0.3978, "step": 739 }, { "epoch": 0.3510436432637571, "grad_norm": 3.2886109352111816, "learning_rate": 1.8911285728298778e-05, "loss": 0.4505, "step": 740 }, { "epoch": 0.3515180265654649, "grad_norm": 3.886824607849121, "learning_rate": 1.8907796914977422e-05, "loss": 0.5427, "step": 741 }, { "epoch": 0.3519924098671727, "grad_norm": 3.2919833660125732, "learning_rate": 1.8904302843471692e-05, "loss": 0.409, "step": 742 }, { "epoch": 0.35246679316888047, "grad_norm": 3.437626600265503, "learning_rate": 1.8900803515844107e-05, "loss": 0.4457, "step": 743 }, { "epoch": 0.35294117647058826, "grad_norm": 3.2604973316192627, "learning_rate": 1.8897298934160285e-05, "loss": 0.4763, "step": 744 }, { "epoch": 0.353415559772296, "grad_norm": 2.8597803115844727, "learning_rate": 1.8893789100488945e-05, "loss": 0.4451, "step": 745 }, { "epoch": 0.3538899430740038, "grad_norm": 2.9492034912109375, "learning_rate": 1.8890274016901905e-05, "loss": 0.4771, "step": 746 }, { "epoch": 0.3543643263757116, "grad_norm": 3.0947840213775635, "learning_rate": 1.888675368547409e-05, "loss": 0.4063, "step": 747 }, { "epoch": 0.3548387096774194, "grad_norm": 2.7532737255096436, "learning_rate": 1.888322810828351e-05, "loss": 0.461, "step": 748 }, { "epoch": 0.3553130929791271, "grad_norm": 3.891683340072632, "learning_rate": 1.887969728741128e-05, "loss": 0.4789, "step": 749 }, { "epoch": 0.3557874762808349, "grad_norm": 3.2587778568267822, "learning_rate": 1.8876161224941607e-05, "loss": 0.4776, "step": 750 }, { "epoch": 0.3562618595825427, "grad_norm": 2.7931885719299316, "learning_rate": 1.8872619922961802e-05, "loss": 0.4503, "step": 751 }, { "epoch": 0.3567362428842505, "grad_norm": 3.225916862487793, "learning_rate": 1.886907338356225e-05, "loss": 0.4879, "step": 752 }, { "epoch": 0.3572106261859583, "grad_norm": 2.8587422370910645, "learning_rate": 1.8865521608836446e-05, "loss": 0.4243, "step": 753 }, { "epoch": 0.357685009487666, "grad_norm": 3.333045482635498, "learning_rate": 1.8861964600880963e-05, "loss": 0.4953, "step": 754 }, { "epoch": 0.3581593927893738, "grad_norm": 2.7636733055114746, "learning_rate": 1.885840236179547e-05, "loss": 0.4484, "step": 755 }, { "epoch": 0.3586337760910816, "grad_norm": 2.7610294818878174, "learning_rate": 1.8854834893682722e-05, "loss": 0.4055, "step": 756 }, { "epoch": 0.3591081593927894, "grad_norm": 3.245123863220215, "learning_rate": 1.8851262198648555e-05, "loss": 0.5173, "step": 757 }, { "epoch": 0.35958254269449713, "grad_norm": 2.8020386695861816, "learning_rate": 1.88476842788019e-05, "loss": 0.4727, "step": 758 }, { "epoch": 0.3600569259962049, "grad_norm": 3.41428542137146, "learning_rate": 1.8844101136254768e-05, "loss": 0.4924, "step": 759 }, { "epoch": 0.3605313092979127, "grad_norm": 2.779858350753784, "learning_rate": 1.884051277312225e-05, "loss": 0.4455, "step": 760 }, { "epoch": 0.3610056925996205, "grad_norm": 2.965505838394165, "learning_rate": 1.8836919191522526e-05, "loss": 0.5216, "step": 761 }, { "epoch": 0.36148007590132825, "grad_norm": 2.897125005722046, "learning_rate": 1.8833320393576847e-05, "loss": 0.4364, "step": 762 }, { "epoch": 0.36195445920303604, "grad_norm": 3.3312103748321533, "learning_rate": 1.8829716381409545e-05, "loss": 0.541, "step": 763 }, { "epoch": 0.36242884250474383, "grad_norm": 3.1323423385620117, "learning_rate": 1.8826107157148042e-05, "loss": 0.4879, "step": 764 }, { "epoch": 0.3629032258064516, "grad_norm": 2.554779052734375, "learning_rate": 1.882249272292282e-05, "loss": 0.3459, "step": 765 }, { "epoch": 0.3633776091081594, "grad_norm": 2.9526796340942383, "learning_rate": 1.8818873080867445e-05, "loss": 0.4802, "step": 766 }, { "epoch": 0.36385199240986715, "grad_norm": 2.4592695236206055, "learning_rate": 1.8815248233118558e-05, "loss": 0.3779, "step": 767 }, { "epoch": 0.36432637571157495, "grad_norm": 2.9448318481445312, "learning_rate": 1.881161818181587e-05, "loss": 0.4546, "step": 768 }, { "epoch": 0.36480075901328274, "grad_norm": 3.2894225120544434, "learning_rate": 1.8807982929102164e-05, "loss": 0.4668, "step": 769 }, { "epoch": 0.36527514231499053, "grad_norm": 2.9263877868652344, "learning_rate": 1.8804342477123292e-05, "loss": 0.4316, "step": 770 }, { "epoch": 0.36574952561669827, "grad_norm": 2.8721306324005127, "learning_rate": 1.880069682802818e-05, "loss": 0.4152, "step": 771 }, { "epoch": 0.36622390891840606, "grad_norm": 2.6576414108276367, "learning_rate": 1.879704598396882e-05, "loss": 0.3802, "step": 772 }, { "epoch": 0.36669829222011385, "grad_norm": 2.7889490127563477, "learning_rate": 1.879338994710026e-05, "loss": 0.3833, "step": 773 }, { "epoch": 0.36717267552182165, "grad_norm": 3.141726016998291, "learning_rate": 1.8789728719580632e-05, "loss": 0.4202, "step": 774 }, { "epoch": 0.36764705882352944, "grad_norm": 3.9903249740600586, "learning_rate": 1.878606230357112e-05, "loss": 0.5108, "step": 775 }, { "epoch": 0.3681214421252372, "grad_norm": 2.6597206592559814, "learning_rate": 1.878239070123597e-05, "loss": 0.4236, "step": 776 }, { "epoch": 0.36859582542694497, "grad_norm": 3.9152090549468994, "learning_rate": 1.8778713914742494e-05, "loss": 0.5953, "step": 777 }, { "epoch": 0.36907020872865276, "grad_norm": 2.510295867919922, "learning_rate": 1.8775031946261065e-05, "loss": 0.389, "step": 778 }, { "epoch": 0.36954459203036055, "grad_norm": 2.6154842376708984, "learning_rate": 1.877134479796511e-05, "loss": 0.4015, "step": 779 }, { "epoch": 0.3700189753320683, "grad_norm": 3.2904458045959473, "learning_rate": 1.8767652472031118e-05, "loss": 0.4731, "step": 780 }, { "epoch": 0.3704933586337761, "grad_norm": 3.2095184326171875, "learning_rate": 1.8763954970638628e-05, "loss": 0.5158, "step": 781 }, { "epoch": 0.3709677419354839, "grad_norm": 2.783874034881592, "learning_rate": 1.8760252295970245e-05, "loss": 0.4831, "step": 782 }, { "epoch": 0.37144212523719167, "grad_norm": 2.9339380264282227, "learning_rate": 1.8756544450211614e-05, "loss": 0.5171, "step": 783 }, { "epoch": 0.3719165085388994, "grad_norm": 3.8392632007598877, "learning_rate": 1.875283143555145e-05, "loss": 0.4464, "step": 784 }, { "epoch": 0.3723908918406072, "grad_norm": 2.6166434288024902, "learning_rate": 1.8749113254181498e-05, "loss": 0.4343, "step": 785 }, { "epoch": 0.372865275142315, "grad_norm": 2.91298246383667, "learning_rate": 1.874538990829657e-05, "loss": 0.5344, "step": 786 }, { "epoch": 0.3733396584440228, "grad_norm": 2.931877374649048, "learning_rate": 1.874166140009452e-05, "loss": 0.4732, "step": 787 }, { "epoch": 0.3738140417457306, "grad_norm": 2.885974407196045, "learning_rate": 1.8737927731776245e-05, "loss": 0.4594, "step": 788 }, { "epoch": 0.3742884250474383, "grad_norm": 3.037022352218628, "learning_rate": 1.8734188905545697e-05, "loss": 0.4378, "step": 789 }, { "epoch": 0.3747628083491461, "grad_norm": 3.06512713432312, "learning_rate": 1.8730444923609865e-05, "loss": 0.4882, "step": 790 }, { "epoch": 0.3752371916508539, "grad_norm": 3.42930269241333, "learning_rate": 1.872669578817879e-05, "loss": 0.5288, "step": 791 }, { "epoch": 0.3757115749525617, "grad_norm": 2.3780622482299805, "learning_rate": 1.872294150146554e-05, "loss": 0.3665, "step": 792 }, { "epoch": 0.3761859582542694, "grad_norm": 3.4545505046844482, "learning_rate": 1.8719182065686242e-05, "loss": 0.5605, "step": 793 }, { "epoch": 0.3766603415559772, "grad_norm": 2.9837419986724854, "learning_rate": 1.871541748306005e-05, "loss": 0.4979, "step": 794 }, { "epoch": 0.377134724857685, "grad_norm": 2.7663657665252686, "learning_rate": 1.871164775580916e-05, "loss": 0.4332, "step": 795 }, { "epoch": 0.3776091081593928, "grad_norm": 3.1293842792510986, "learning_rate": 1.8707872886158806e-05, "loss": 0.4708, "step": 796 }, { "epoch": 0.3780834914611006, "grad_norm": 2.830396890640259, "learning_rate": 1.870409287633726e-05, "loss": 0.5119, "step": 797 }, { "epoch": 0.37855787476280833, "grad_norm": 2.5124335289001465, "learning_rate": 1.8700307728575813e-05, "loss": 0.3997, "step": 798 }, { "epoch": 0.3790322580645161, "grad_norm": 2.8015406131744385, "learning_rate": 1.8696517445108807e-05, "loss": 0.4406, "step": 799 }, { "epoch": 0.3795066413662239, "grad_norm": 2.6684601306915283, "learning_rate": 1.8692722028173612e-05, "loss": 0.4578, "step": 800 }, { "epoch": 0.3799810246679317, "grad_norm": 2.6726903915405273, "learning_rate": 1.868892148001062e-05, "loss": 0.3832, "step": 801 }, { "epoch": 0.38045540796963945, "grad_norm": 2.7872049808502197, "learning_rate": 1.868511580286326e-05, "loss": 0.4706, "step": 802 }, { "epoch": 0.38092979127134724, "grad_norm": 2.7430388927459717, "learning_rate": 1.8681304998977988e-05, "loss": 0.4383, "step": 803 }, { "epoch": 0.38140417457305503, "grad_norm": 2.5697414875030518, "learning_rate": 1.8677489070604274e-05, "loss": 0.4179, "step": 804 }, { "epoch": 0.3818785578747628, "grad_norm": 2.9383411407470703, "learning_rate": 1.8673668019994632e-05, "loss": 0.4117, "step": 805 }, { "epoch": 0.38235294117647056, "grad_norm": 3.752838373184204, "learning_rate": 1.866984184940459e-05, "loss": 0.5366, "step": 806 }, { "epoch": 0.38282732447817835, "grad_norm": 4.9944024085998535, "learning_rate": 1.866601056109269e-05, "loss": 0.5161, "step": 807 }, { "epoch": 0.38330170777988615, "grad_norm": 2.624593734741211, "learning_rate": 1.8662174157320515e-05, "loss": 0.3766, "step": 808 }, { "epoch": 0.38377609108159394, "grad_norm": 2.987598419189453, "learning_rate": 1.8658332640352647e-05, "loss": 0.5166, "step": 809 }, { "epoch": 0.38425047438330173, "grad_norm": 2.785848379135132, "learning_rate": 1.8654486012456704e-05, "loss": 0.4269, "step": 810 }, { "epoch": 0.38472485768500947, "grad_norm": 2.538492202758789, "learning_rate": 1.8650634275903304e-05, "loss": 0.4338, "step": 811 }, { "epoch": 0.38519924098671726, "grad_norm": 2.889648914337158, "learning_rate": 1.864677743296609e-05, "loss": 0.4407, "step": 812 }, { "epoch": 0.38567362428842505, "grad_norm": 2.803192138671875, "learning_rate": 1.8642915485921726e-05, "loss": 0.4189, "step": 813 }, { "epoch": 0.38614800759013285, "grad_norm": 2.4680614471435547, "learning_rate": 1.8639048437049875e-05, "loss": 0.3655, "step": 814 }, { "epoch": 0.3866223908918406, "grad_norm": 3.2701048851013184, "learning_rate": 1.8635176288633218e-05, "loss": 0.4297, "step": 815 }, { "epoch": 0.3870967741935484, "grad_norm": 3.7797579765319824, "learning_rate": 1.8631299042957448e-05, "loss": 0.5702, "step": 816 }, { "epoch": 0.38757115749525617, "grad_norm": 3.297485113143921, "learning_rate": 1.862741670231126e-05, "loss": 0.5324, "step": 817 }, { "epoch": 0.38804554079696396, "grad_norm": 2.463757276535034, "learning_rate": 1.8623529268986366e-05, "loss": 0.3748, "step": 818 }, { "epoch": 0.38851992409867175, "grad_norm": 2.859805107116699, "learning_rate": 1.861963674527748e-05, "loss": 0.5214, "step": 819 }, { "epoch": 0.3889943074003795, "grad_norm": 3.135437488555908, "learning_rate": 1.8615739133482315e-05, "loss": 0.4381, "step": 820 }, { "epoch": 0.3894686907020873, "grad_norm": 3.0858075618743896, "learning_rate": 1.8611836435901595e-05, "loss": 0.4133, "step": 821 }, { "epoch": 0.3899430740037951, "grad_norm": 2.585782527923584, "learning_rate": 1.860792865483905e-05, "loss": 0.3963, "step": 822 }, { "epoch": 0.39041745730550287, "grad_norm": 3.3522262573242188, "learning_rate": 1.8604015792601395e-05, "loss": 0.478, "step": 823 }, { "epoch": 0.3908918406072106, "grad_norm": 2.940312147140503, "learning_rate": 1.860009785149836e-05, "loss": 0.5005, "step": 824 }, { "epoch": 0.3913662239089184, "grad_norm": 3.0102758407592773, "learning_rate": 1.8596174833842664e-05, "loss": 0.4627, "step": 825 }, { "epoch": 0.3918406072106262, "grad_norm": 2.446089744567871, "learning_rate": 1.8592246741950027e-05, "loss": 0.3507, "step": 826 }, { "epoch": 0.392314990512334, "grad_norm": 2.569380283355713, "learning_rate": 1.858831357813916e-05, "loss": 0.3988, "step": 827 }, { "epoch": 0.3927893738140417, "grad_norm": 3.037994861602783, "learning_rate": 1.8584375344731777e-05, "loss": 0.3629, "step": 828 }, { "epoch": 0.3932637571157495, "grad_norm": 3.2176268100738525, "learning_rate": 1.8580432044052567e-05, "loss": 0.4764, "step": 829 }, { "epoch": 0.3937381404174573, "grad_norm": 2.4146931171417236, "learning_rate": 1.8576483678429234e-05, "loss": 0.3852, "step": 830 }, { "epoch": 0.3942125237191651, "grad_norm": 3.2689833641052246, "learning_rate": 1.8572530250192453e-05, "loss": 0.444, "step": 831 }, { "epoch": 0.3946869070208729, "grad_norm": 3.194239377975464, "learning_rate": 1.8568571761675893e-05, "loss": 0.4141, "step": 832 }, { "epoch": 0.3951612903225806, "grad_norm": 2.9334359169006348, "learning_rate": 1.8564608215216212e-05, "loss": 0.4693, "step": 833 }, { "epoch": 0.3956356736242884, "grad_norm": 2.812944173812866, "learning_rate": 1.8560639613153056e-05, "loss": 0.436, "step": 834 }, { "epoch": 0.3961100569259962, "grad_norm": 2.986809730529785, "learning_rate": 1.855666595782904e-05, "loss": 0.4789, "step": 835 }, { "epoch": 0.396584440227704, "grad_norm": 2.4972949028015137, "learning_rate": 1.8552687251589786e-05, "loss": 0.4695, "step": 836 }, { "epoch": 0.39705882352941174, "grad_norm": 2.675363779067993, "learning_rate": 1.8548703496783877e-05, "loss": 0.4012, "step": 837 }, { "epoch": 0.39753320683111953, "grad_norm": 2.956235885620117, "learning_rate": 1.854471469576289e-05, "loss": 0.503, "step": 838 }, { "epoch": 0.3980075901328273, "grad_norm": 2.4831976890563965, "learning_rate": 1.8540720850881372e-05, "loss": 0.3945, "step": 839 }, { "epoch": 0.3984819734345351, "grad_norm": 2.747809886932373, "learning_rate": 1.8536721964496846e-05, "loss": 0.431, "step": 840 }, { "epoch": 0.3989563567362429, "grad_norm": 3.124937057495117, "learning_rate": 1.853271803896982e-05, "loss": 0.505, "step": 841 }, { "epoch": 0.39943074003795065, "grad_norm": 3.363137722015381, "learning_rate": 1.8528709076663772e-05, "loss": 0.4563, "step": 842 }, { "epoch": 0.39990512333965844, "grad_norm": 2.7867400646209717, "learning_rate": 1.8524695079945154e-05, "loss": 0.4487, "step": 843 }, { "epoch": 0.40037950664136623, "grad_norm": 2.9408624172210693, "learning_rate": 1.8520676051183377e-05, "loss": 0.4014, "step": 844 }, { "epoch": 0.400853889943074, "grad_norm": 3.224708318710327, "learning_rate": 1.851665199275085e-05, "loss": 0.4478, "step": 845 }, { "epoch": 0.40132827324478176, "grad_norm": 2.7404086589813232, "learning_rate": 1.8512622907022924e-05, "loss": 0.4689, "step": 846 }, { "epoch": 0.40180265654648956, "grad_norm": 2.713667154312134, "learning_rate": 1.8508588796377936e-05, "loss": 0.4486, "step": 847 }, { "epoch": 0.40227703984819735, "grad_norm": 3.0665454864501953, "learning_rate": 1.8504549663197175e-05, "loss": 0.4629, "step": 848 }, { "epoch": 0.40275142314990514, "grad_norm": 3.2429921627044678, "learning_rate": 1.8500505509864903e-05, "loss": 0.4945, "step": 849 }, { "epoch": 0.4032258064516129, "grad_norm": 3.3584725856781006, "learning_rate": 1.8496456338768345e-05, "loss": 0.4229, "step": 850 }, { "epoch": 0.40370018975332067, "grad_norm": 2.912458896636963, "learning_rate": 1.8492402152297688e-05, "loss": 0.4418, "step": 851 }, { "epoch": 0.40417457305502846, "grad_norm": 2.935781478881836, "learning_rate": 1.8488342952846074e-05, "loss": 0.3275, "step": 852 }, { "epoch": 0.40464895635673626, "grad_norm": 3.8504891395568848, "learning_rate": 1.848427874280961e-05, "loss": 0.413, "step": 853 }, { "epoch": 0.40512333965844405, "grad_norm": 2.4456560611724854, "learning_rate": 1.8480209524587363e-05, "loss": 0.419, "step": 854 }, { "epoch": 0.4055977229601518, "grad_norm": 3.666505813598633, "learning_rate": 1.8476135300581347e-05, "loss": 0.4197, "step": 855 }, { "epoch": 0.4060721062618596, "grad_norm": 2.4932539463043213, "learning_rate": 1.847205607319654e-05, "loss": 0.3565, "step": 856 }, { "epoch": 0.40654648956356737, "grad_norm": 2.7682738304138184, "learning_rate": 1.8467971844840864e-05, "loss": 0.4519, "step": 857 }, { "epoch": 0.40702087286527516, "grad_norm": 2.8969831466674805, "learning_rate": 1.8463882617925208e-05, "loss": 0.4616, "step": 858 }, { "epoch": 0.4074952561669829, "grad_norm": 3.2404887676239014, "learning_rate": 1.8459788394863388e-05, "loss": 0.5208, "step": 859 }, { "epoch": 0.4079696394686907, "grad_norm": 2.699791669845581, "learning_rate": 1.8455689178072197e-05, "loss": 0.438, "step": 860 }, { "epoch": 0.4084440227703985, "grad_norm": 3.718968152999878, "learning_rate": 1.8451584969971358e-05, "loss": 0.5273, "step": 861 }, { "epoch": 0.4089184060721063, "grad_norm": 2.4565632343292236, "learning_rate": 1.8447475772983542e-05, "loss": 0.3949, "step": 862 }, { "epoch": 0.409392789373814, "grad_norm": 2.7938921451568604, "learning_rate": 1.8443361589534366e-05, "loss": 0.3812, "step": 863 }, { "epoch": 0.4098671726755218, "grad_norm": 2.8506662845611572, "learning_rate": 1.84392424220524e-05, "loss": 0.4237, "step": 864 }, { "epoch": 0.4103415559772296, "grad_norm": 2.958847999572754, "learning_rate": 1.8435118272969135e-05, "loss": 0.5132, "step": 865 }, { "epoch": 0.4108159392789374, "grad_norm": 3.2671656608581543, "learning_rate": 1.8430989144719028e-05, "loss": 0.4776, "step": 866 }, { "epoch": 0.4112903225806452, "grad_norm": 2.673391103744507, "learning_rate": 1.8426855039739454e-05, "loss": 0.4367, "step": 867 }, { "epoch": 0.4117647058823529, "grad_norm": 2.532644271850586, "learning_rate": 1.8422715960470737e-05, "loss": 0.4003, "step": 868 }, { "epoch": 0.4122390891840607, "grad_norm": 2.612393856048584, "learning_rate": 1.8418571909356138e-05, "loss": 0.4027, "step": 869 }, { "epoch": 0.4127134724857685, "grad_norm": 2.484384775161743, "learning_rate": 1.8414422888841844e-05, "loss": 0.4135, "step": 870 }, { "epoch": 0.4131878557874763, "grad_norm": 3.7062361240386963, "learning_rate": 1.8410268901376983e-05, "loss": 0.4552, "step": 871 }, { "epoch": 0.41366223908918404, "grad_norm": 2.7248425483703613, "learning_rate": 1.8406109949413614e-05, "loss": 0.4411, "step": 872 }, { "epoch": 0.41413662239089183, "grad_norm": 2.9587390422821045, "learning_rate": 1.8401946035406723e-05, "loss": 0.5239, "step": 873 }, { "epoch": 0.4146110056925996, "grad_norm": 3.329129457473755, "learning_rate": 1.839777716181423e-05, "loss": 0.4036, "step": 874 }, { "epoch": 0.4150853889943074, "grad_norm": 2.5802624225616455, "learning_rate": 1.8393603331096974e-05, "loss": 0.4146, "step": 875 }, { "epoch": 0.4155597722960152, "grad_norm": 3.270162582397461, "learning_rate": 1.8389424545718733e-05, "loss": 0.4765, "step": 876 }, { "epoch": 0.41603415559772294, "grad_norm": 2.688945770263672, "learning_rate": 1.8385240808146197e-05, "loss": 0.4338, "step": 877 }, { "epoch": 0.41650853889943074, "grad_norm": 2.7994351387023926, "learning_rate": 1.838105212084899e-05, "loss": 0.4464, "step": 878 }, { "epoch": 0.41698292220113853, "grad_norm": 2.3464057445526123, "learning_rate": 1.837685848629965e-05, "loss": 0.3783, "step": 879 }, { "epoch": 0.4174573055028463, "grad_norm": 3.1224400997161865, "learning_rate": 1.837265990697364e-05, "loss": 0.5672, "step": 880 }, { "epoch": 0.41793168880455406, "grad_norm": 3.112441301345825, "learning_rate": 1.8368456385349333e-05, "loss": 0.4204, "step": 881 }, { "epoch": 0.41840607210626185, "grad_norm": 3.453056812286377, "learning_rate": 1.8364247923908033e-05, "loss": 0.4365, "step": 882 }, { "epoch": 0.41888045540796964, "grad_norm": 3.2397301197052, "learning_rate": 1.8360034525133953e-05, "loss": 0.4454, "step": 883 }, { "epoch": 0.41935483870967744, "grad_norm": 2.897355794906616, "learning_rate": 1.8355816191514216e-05, "loss": 0.4647, "step": 884 }, { "epoch": 0.4198292220113852, "grad_norm": 3.6154603958129883, "learning_rate": 1.8351592925538865e-05, "loss": 0.6002, "step": 885 }, { "epoch": 0.42030360531309297, "grad_norm": 2.3313372135162354, "learning_rate": 1.8347364729700857e-05, "loss": 0.4218, "step": 886 }, { "epoch": 0.42077798861480076, "grad_norm": 2.8984670639038086, "learning_rate": 1.8343131606496046e-05, "loss": 0.4025, "step": 887 }, { "epoch": 0.42125237191650855, "grad_norm": 3.124525785446167, "learning_rate": 1.8338893558423207e-05, "loss": 0.441, "step": 888 }, { "epoch": 0.42172675521821634, "grad_norm": 2.574047565460205, "learning_rate": 1.833465058798402e-05, "loss": 0.4107, "step": 889 }, { "epoch": 0.4222011385199241, "grad_norm": 2.7786552906036377, "learning_rate": 1.8330402697683067e-05, "loss": 0.4836, "step": 890 }, { "epoch": 0.42267552182163187, "grad_norm": 2.599820137023926, "learning_rate": 1.832614989002783e-05, "loss": 0.4446, "step": 891 }, { "epoch": 0.42314990512333966, "grad_norm": 2.800710678100586, "learning_rate": 1.8321892167528707e-05, "loss": 0.4315, "step": 892 }, { "epoch": 0.42362428842504746, "grad_norm": 2.935514450073242, "learning_rate": 1.831762953269898e-05, "loss": 0.4062, "step": 893 }, { "epoch": 0.4240986717267552, "grad_norm": 2.597562551498413, "learning_rate": 1.8313361988054853e-05, "loss": 0.3958, "step": 894 }, { "epoch": 0.424573055028463, "grad_norm": 2.7025997638702393, "learning_rate": 1.8309089536115406e-05, "loss": 0.4288, "step": 895 }, { "epoch": 0.4250474383301708, "grad_norm": 2.58357310295105, "learning_rate": 1.8304812179402626e-05, "loss": 0.4734, "step": 896 }, { "epoch": 0.42552182163187857, "grad_norm": 2.8267996311187744, "learning_rate": 1.83005299204414e-05, "loss": 0.4395, "step": 897 }, { "epoch": 0.42599620493358636, "grad_norm": 2.934715986251831, "learning_rate": 1.82962427617595e-05, "loss": 0.4381, "step": 898 }, { "epoch": 0.4264705882352941, "grad_norm": 2.855116128921509, "learning_rate": 1.829195070588759e-05, "loss": 0.4122, "step": 899 }, { "epoch": 0.4269449715370019, "grad_norm": 2.7135651111602783, "learning_rate": 1.8287653755359228e-05, "loss": 0.4812, "step": 900 }, { "epoch": 0.4274193548387097, "grad_norm": 2.9477922916412354, "learning_rate": 1.8283351912710867e-05, "loss": 0.4166, "step": 901 }, { "epoch": 0.4278937381404175, "grad_norm": 3.2937138080596924, "learning_rate": 1.827904518048184e-05, "loss": 0.4541, "step": 902 }, { "epoch": 0.4283681214421252, "grad_norm": 2.7012522220611572, "learning_rate": 1.8274733561214368e-05, "loss": 0.4235, "step": 903 }, { "epoch": 0.428842504743833, "grad_norm": 3.3364474773406982, "learning_rate": 1.8270417057453554e-05, "loss": 0.4433, "step": 904 }, { "epoch": 0.4293168880455408, "grad_norm": 2.75413179397583, "learning_rate": 1.826609567174739e-05, "loss": 0.4105, "step": 905 }, { "epoch": 0.4297912713472486, "grad_norm": 2.27841854095459, "learning_rate": 1.826176940664675e-05, "loss": 0.3656, "step": 906 }, { "epoch": 0.43026565464895633, "grad_norm": 3.102612018585205, "learning_rate": 1.8257438264705382e-05, "loss": 0.4927, "step": 907 }, { "epoch": 0.4307400379506641, "grad_norm": 2.6770787239074707, "learning_rate": 1.825310224847992e-05, "loss": 0.4812, "step": 908 }, { "epoch": 0.4312144212523719, "grad_norm": 3.230632781982422, "learning_rate": 1.8248761360529864e-05, "loss": 0.4326, "step": 909 }, { "epoch": 0.4316888045540797, "grad_norm": 3.148620843887329, "learning_rate": 1.8244415603417603e-05, "loss": 0.4355, "step": 910 }, { "epoch": 0.4321631878557875, "grad_norm": 3.1337029933929443, "learning_rate": 1.8240064979708397e-05, "loss": 0.5506, "step": 911 }, { "epoch": 0.43263757115749524, "grad_norm": 2.7768473625183105, "learning_rate": 1.8235709491970366e-05, "loss": 0.4683, "step": 912 }, { "epoch": 0.43311195445920303, "grad_norm": 2.6969683170318604, "learning_rate": 1.8231349142774525e-05, "loss": 0.4588, "step": 913 }, { "epoch": 0.4335863377609108, "grad_norm": 2.7988765239715576, "learning_rate": 1.8226983934694732e-05, "loss": 0.399, "step": 914 }, { "epoch": 0.4340607210626186, "grad_norm": 3.1512439250946045, "learning_rate": 1.8222613870307735e-05, "loss": 0.5061, "step": 915 }, { "epoch": 0.43453510436432635, "grad_norm": 2.885368824005127, "learning_rate": 1.8218238952193136e-05, "loss": 0.4037, "step": 916 }, { "epoch": 0.43500948766603414, "grad_norm": 2.4949498176574707, "learning_rate": 1.8213859182933407e-05, "loss": 0.3942, "step": 917 }, { "epoch": 0.43548387096774194, "grad_norm": 2.3052561283111572, "learning_rate": 1.820947456511388e-05, "loss": 0.4208, "step": 918 }, { "epoch": 0.43595825426944973, "grad_norm": 2.642266273498535, "learning_rate": 1.8205085101322754e-05, "loss": 0.465, "step": 919 }, { "epoch": 0.4364326375711575, "grad_norm": 3.87248158454895, "learning_rate": 1.8200690794151087e-05, "loss": 0.5536, "step": 920 }, { "epoch": 0.43690702087286526, "grad_norm": 2.5663421154022217, "learning_rate": 1.819629164619279e-05, "loss": 0.4441, "step": 921 }, { "epoch": 0.43738140417457305, "grad_norm": 2.330423593521118, "learning_rate": 1.8191887660044646e-05, "loss": 0.36, "step": 922 }, { "epoch": 0.43785578747628084, "grad_norm": 2.401970386505127, "learning_rate": 1.8187478838306273e-05, "loss": 0.3381, "step": 923 }, { "epoch": 0.43833017077798864, "grad_norm": 3.270221710205078, "learning_rate": 1.818306518358016e-05, "loss": 0.5003, "step": 924 }, { "epoch": 0.4388045540796964, "grad_norm": 2.4524645805358887, "learning_rate": 1.817864669847165e-05, "loss": 0.4074, "step": 925 }, { "epoch": 0.43927893738140417, "grad_norm": 3.223022222518921, "learning_rate": 1.817422338558892e-05, "loss": 0.5338, "step": 926 }, { "epoch": 0.43975332068311196, "grad_norm": 2.6757102012634277, "learning_rate": 1.8169795247543014e-05, "loss": 0.4344, "step": 927 }, { "epoch": 0.44022770398481975, "grad_norm": 3.086233139038086, "learning_rate": 1.8165362286947817e-05, "loss": 0.4607, "step": 928 }, { "epoch": 0.4407020872865275, "grad_norm": 2.754612922668457, "learning_rate": 1.8160924506420064e-05, "loss": 0.3898, "step": 929 }, { "epoch": 0.4411764705882353, "grad_norm": 4.099813461303711, "learning_rate": 1.8156481908579326e-05, "loss": 0.4333, "step": 930 }, { "epoch": 0.4416508538899431, "grad_norm": 2.578651189804077, "learning_rate": 1.815203449604803e-05, "loss": 0.3853, "step": 931 }, { "epoch": 0.44212523719165087, "grad_norm": 2.7206499576568604, "learning_rate": 1.8147582271451443e-05, "loss": 0.4229, "step": 932 }, { "epoch": 0.44259962049335866, "grad_norm": 3.497983694076538, "learning_rate": 1.814312523741766e-05, "loss": 0.5558, "step": 933 }, { "epoch": 0.4430740037950664, "grad_norm": 3.077042579650879, "learning_rate": 1.8138663396577633e-05, "loss": 0.4388, "step": 934 }, { "epoch": 0.4435483870967742, "grad_norm": 2.824192523956299, "learning_rate": 1.813419675156514e-05, "loss": 0.4736, "step": 935 }, { "epoch": 0.444022770398482, "grad_norm": 2.8267297744750977, "learning_rate": 1.8129725305016793e-05, "loss": 0.4546, "step": 936 }, { "epoch": 0.4444971537001898, "grad_norm": 2.9733405113220215, "learning_rate": 1.8125249059572042e-05, "loss": 0.5409, "step": 937 }, { "epoch": 0.4449715370018975, "grad_norm": 3.0438060760498047, "learning_rate": 1.8120768017873178e-05, "loss": 0.4126, "step": 938 }, { "epoch": 0.4454459203036053, "grad_norm": 2.6118757724761963, "learning_rate": 1.8116282182565313e-05, "loss": 0.3854, "step": 939 }, { "epoch": 0.4459203036053131, "grad_norm": 2.909747838973999, "learning_rate": 1.8111791556296386e-05, "loss": 0.5156, "step": 940 }, { "epoch": 0.4463946869070209, "grad_norm": 2.6982431411743164, "learning_rate": 1.8107296141717175e-05, "loss": 0.3597, "step": 941 }, { "epoch": 0.4468690702087287, "grad_norm": 3.628803253173828, "learning_rate": 1.8102795941481277e-05, "loss": 0.5069, "step": 942 }, { "epoch": 0.4473434535104364, "grad_norm": 2.5725972652435303, "learning_rate": 1.8098290958245116e-05, "loss": 0.3675, "step": 943 }, { "epoch": 0.4478178368121442, "grad_norm": 3.47405743598938, "learning_rate": 1.8093781194667935e-05, "loss": 0.4533, "step": 944 }, { "epoch": 0.448292220113852, "grad_norm": 2.7190101146698, "learning_rate": 1.808926665341181e-05, "loss": 0.3759, "step": 945 }, { "epoch": 0.4487666034155598, "grad_norm": 4.446865558624268, "learning_rate": 1.8084747337141622e-05, "loss": 0.3144, "step": 946 }, { "epoch": 0.44924098671726753, "grad_norm": 2.6146459579467773, "learning_rate": 1.8080223248525087e-05, "loss": 0.4736, "step": 947 }, { "epoch": 0.4497153700189753, "grad_norm": 3.1823534965515137, "learning_rate": 1.8075694390232725e-05, "loss": 0.5324, "step": 948 }, { "epoch": 0.4501897533206831, "grad_norm": 3.1742212772369385, "learning_rate": 1.8071160764937875e-05, "loss": 0.4744, "step": 949 }, { "epoch": 0.4506641366223909, "grad_norm": 2.7472152709960938, "learning_rate": 1.8066622375316695e-05, "loss": 0.3879, "step": 950 }, { "epoch": 0.45113851992409865, "grad_norm": 2.5368971824645996, "learning_rate": 1.8062079224048146e-05, "loss": 0.4705, "step": 951 }, { "epoch": 0.45161290322580644, "grad_norm": 2.812864065170288, "learning_rate": 1.805753131381401e-05, "loss": 0.4219, "step": 952 }, { "epoch": 0.45208728652751423, "grad_norm": 2.2089133262634277, "learning_rate": 1.8052978647298873e-05, "loss": 0.387, "step": 953 }, { "epoch": 0.452561669829222, "grad_norm": 2.8962409496307373, "learning_rate": 1.804842122719013e-05, "loss": 0.5197, "step": 954 }, { "epoch": 0.4530360531309298, "grad_norm": 2.630946397781372, "learning_rate": 1.8043859056177976e-05, "loss": 0.3808, "step": 955 }, { "epoch": 0.45351043643263755, "grad_norm": 2.4841771125793457, "learning_rate": 1.803929213695542e-05, "loss": 0.4099, "step": 956 }, { "epoch": 0.45398481973434535, "grad_norm": 2.3919498920440674, "learning_rate": 1.803472047221827e-05, "loss": 0.3677, "step": 957 }, { "epoch": 0.45445920303605314, "grad_norm": 2.786252498626709, "learning_rate": 1.8030144064665127e-05, "loss": 0.4811, "step": 958 }, { "epoch": 0.45493358633776093, "grad_norm": 2.7596981525421143, "learning_rate": 1.802556291699741e-05, "loss": 0.4529, "step": 959 }, { "epoch": 0.45540796963946867, "grad_norm": 2.7790467739105225, "learning_rate": 1.8020977031919315e-05, "loss": 0.4452, "step": 960 }, { "epoch": 0.45588235294117646, "grad_norm": 3.088778018951416, "learning_rate": 1.801638641213785e-05, "loss": 0.4797, "step": 961 }, { "epoch": 0.45635673624288425, "grad_norm": 2.6321218013763428, "learning_rate": 1.801179106036281e-05, "loss": 0.4205, "step": 962 }, { "epoch": 0.45683111954459205, "grad_norm": 2.7974629402160645, "learning_rate": 1.8007190979306793e-05, "loss": 0.4557, "step": 963 }, { "epoch": 0.4573055028462998, "grad_norm": 2.396785259246826, "learning_rate": 1.800258617168517e-05, "loss": 0.4265, "step": 964 }, { "epoch": 0.4577798861480076, "grad_norm": 2.8968255519866943, "learning_rate": 1.799797664021612e-05, "loss": 0.3469, "step": 965 }, { "epoch": 0.45825426944971537, "grad_norm": 2.8797080516815186, "learning_rate": 1.7993362387620602e-05, "loss": 0.4216, "step": 966 }, { "epoch": 0.45872865275142316, "grad_norm": 3.1519381999969482, "learning_rate": 1.798874341662237e-05, "loss": 0.5129, "step": 967 }, { "epoch": 0.45920303605313095, "grad_norm": 2.6369104385375977, "learning_rate": 1.7984119729947944e-05, "loss": 0.4251, "step": 968 }, { "epoch": 0.4596774193548387, "grad_norm": 2.4390084743499756, "learning_rate": 1.797949133032665e-05, "loss": 0.4245, "step": 969 }, { "epoch": 0.4601518026565465, "grad_norm": 2.8855037689208984, "learning_rate": 1.7974858220490586e-05, "loss": 0.4015, "step": 970 }, { "epoch": 0.4606261859582543, "grad_norm": 2.9386088848114014, "learning_rate": 1.7970220403174626e-05, "loss": 0.4222, "step": 971 }, { "epoch": 0.46110056925996207, "grad_norm": 2.18912410736084, "learning_rate": 1.796557788111643e-05, "loss": 0.3247, "step": 972 }, { "epoch": 0.4615749525616698, "grad_norm": 3.183178663253784, "learning_rate": 1.796093065705644e-05, "loss": 0.5049, "step": 973 }, { "epoch": 0.4620493358633776, "grad_norm": 2.8676347732543945, "learning_rate": 1.7956278733737855e-05, "loss": 0.458, "step": 974 }, { "epoch": 0.4625237191650854, "grad_norm": 5.36901330947876, "learning_rate": 1.7951622113906663e-05, "loss": 0.4597, "step": 975 }, { "epoch": 0.4629981024667932, "grad_norm": 2.528803586959839, "learning_rate": 1.7946960800311623e-05, "loss": 0.3229, "step": 976 }, { "epoch": 0.463472485768501, "grad_norm": 2.7239816188812256, "learning_rate": 1.7942294795704265e-05, "loss": 0.4284, "step": 977 }, { "epoch": 0.4639468690702087, "grad_norm": 2.4139273166656494, "learning_rate": 1.7937624102838878e-05, "loss": 0.3585, "step": 978 }, { "epoch": 0.4644212523719165, "grad_norm": 2.762474775314331, "learning_rate": 1.793294872447253e-05, "loss": 0.3938, "step": 979 }, { "epoch": 0.4648956356736243, "grad_norm": 2.676497459411621, "learning_rate": 1.792826866336505e-05, "loss": 0.4188, "step": 980 }, { "epoch": 0.4653700189753321, "grad_norm": 3.0633938312530518, "learning_rate": 1.792358392227903e-05, "loss": 0.3947, "step": 981 }, { "epoch": 0.4658444022770398, "grad_norm": 2.3310940265655518, "learning_rate": 1.791889450397983e-05, "loss": 0.3666, "step": 982 }, { "epoch": 0.4663187855787476, "grad_norm": 3.144381523132324, "learning_rate": 1.7914200411235562e-05, "loss": 0.4274, "step": 983 }, { "epoch": 0.4667931688804554, "grad_norm": 2.8709566593170166, "learning_rate": 1.7909501646817108e-05, "loss": 0.4486, "step": 984 }, { "epoch": 0.4672675521821632, "grad_norm": 2.4950621128082275, "learning_rate": 1.79047982134981e-05, "loss": 0.4079, "step": 985 }, { "epoch": 0.46774193548387094, "grad_norm": 2.85103178024292, "learning_rate": 1.7900090114054925e-05, "loss": 0.3948, "step": 986 }, { "epoch": 0.46821631878557873, "grad_norm": 2.7091755867004395, "learning_rate": 1.7895377351266737e-05, "loss": 0.4352, "step": 987 }, { "epoch": 0.4686907020872865, "grad_norm": 2.969698905944824, "learning_rate": 1.7890659927915418e-05, "loss": 0.4333, "step": 988 }, { "epoch": 0.4691650853889943, "grad_norm": 2.8901190757751465, "learning_rate": 1.7885937846785633e-05, "loss": 0.4314, "step": 989 }, { "epoch": 0.4696394686907021, "grad_norm": 2.5206494331359863, "learning_rate": 1.7881211110664767e-05, "loss": 0.4896, "step": 990 }, { "epoch": 0.47011385199240985, "grad_norm": 2.556368589401245, "learning_rate": 1.7876479722342972e-05, "loss": 0.4165, "step": 991 }, { "epoch": 0.47058823529411764, "grad_norm": 2.626251459121704, "learning_rate": 1.7871743684613137e-05, "loss": 0.5195, "step": 992 }, { "epoch": 0.47106261859582543, "grad_norm": 2.428295373916626, "learning_rate": 1.78670030002709e-05, "loss": 0.4136, "step": 993 }, { "epoch": 0.4715370018975332, "grad_norm": 2.7496650218963623, "learning_rate": 1.786225767211464e-05, "loss": 0.4622, "step": 994 }, { "epoch": 0.47201138519924096, "grad_norm": 2.3996787071228027, "learning_rate": 1.7857507702945472e-05, "loss": 0.3481, "step": 995 }, { "epoch": 0.47248576850094876, "grad_norm": 2.776946544647217, "learning_rate": 1.7852753095567266e-05, "loss": 0.4187, "step": 996 }, { "epoch": 0.47296015180265655, "grad_norm": 2.833527088165283, "learning_rate": 1.7847993852786612e-05, "loss": 0.4635, "step": 997 }, { "epoch": 0.47343453510436434, "grad_norm": 2.589299201965332, "learning_rate": 1.7843229977412844e-05, "loss": 0.4279, "step": 998 }, { "epoch": 0.47390891840607213, "grad_norm": 2.5228946208953857, "learning_rate": 1.7838461472258035e-05, "loss": 0.4273, "step": 999 }, { "epoch": 0.47438330170777987, "grad_norm": 2.441437244415283, "learning_rate": 1.7833688340136982e-05, "loss": 0.3756, "step": 1000 }, { "epoch": 0.47485768500948766, "grad_norm": 3.258878231048584, "learning_rate": 1.782891058386722e-05, "loss": 0.4789, "step": 1001 }, { "epoch": 0.47533206831119545, "grad_norm": 2.807469367980957, "learning_rate": 1.782412820626901e-05, "loss": 0.4335, "step": 1002 }, { "epoch": 0.47580645161290325, "grad_norm": 2.546851873397827, "learning_rate": 1.7819341210165347e-05, "loss": 0.3704, "step": 1003 }, { "epoch": 0.476280834914611, "grad_norm": 2.9630987644195557, "learning_rate": 1.781454959838194e-05, "loss": 0.4367, "step": 1004 }, { "epoch": 0.4767552182163188, "grad_norm": 2.7907612323760986, "learning_rate": 1.7809753373747235e-05, "loss": 0.4591, "step": 1005 }, { "epoch": 0.47722960151802657, "grad_norm": 3.0077595710754395, "learning_rate": 1.7804952539092393e-05, "loss": 0.4258, "step": 1006 }, { "epoch": 0.47770398481973436, "grad_norm": 2.507061243057251, "learning_rate": 1.78001470972513e-05, "loss": 0.4579, "step": 1007 }, { "epoch": 0.4781783681214421, "grad_norm": 2.3923323154449463, "learning_rate": 1.7795337051060562e-05, "loss": 0.3826, "step": 1008 }, { "epoch": 0.4786527514231499, "grad_norm": 2.3353378772735596, "learning_rate": 1.77905224033595e-05, "loss": 0.3645, "step": 1009 }, { "epoch": 0.4791271347248577, "grad_norm": 2.9613916873931885, "learning_rate": 1.7785703156990153e-05, "loss": 0.4481, "step": 1010 }, { "epoch": 0.4796015180265655, "grad_norm": 3.13313889503479, "learning_rate": 1.7780879314797278e-05, "loss": 0.4703, "step": 1011 }, { "epoch": 0.48007590132827327, "grad_norm": 3.504654884338379, "learning_rate": 1.7776050879628338e-05, "loss": 0.4535, "step": 1012 }, { "epoch": 0.480550284629981, "grad_norm": 2.6365973949432373, "learning_rate": 1.777121785433351e-05, "loss": 0.3529, "step": 1013 }, { "epoch": 0.4810246679316888, "grad_norm": 3.4946820735931396, "learning_rate": 1.7766380241765682e-05, "loss": 0.4124, "step": 1014 }, { "epoch": 0.4814990512333966, "grad_norm": 2.5579440593719482, "learning_rate": 1.776153804478045e-05, "loss": 0.4045, "step": 1015 }, { "epoch": 0.4819734345351044, "grad_norm": 2.6704046726226807, "learning_rate": 1.7756691266236114e-05, "loss": 0.4014, "step": 1016 }, { "epoch": 0.4824478178368121, "grad_norm": 2.637950897216797, "learning_rate": 1.7751839908993677e-05, "loss": 0.4664, "step": 1017 }, { "epoch": 0.4829222011385199, "grad_norm": 2.9768013954162598, "learning_rate": 1.7746983975916852e-05, "loss": 0.5027, "step": 1018 }, { "epoch": 0.4833965844402277, "grad_norm": 2.8544719219207764, "learning_rate": 1.7742123469872043e-05, "loss": 0.479, "step": 1019 }, { "epoch": 0.4838709677419355, "grad_norm": 2.228623151779175, "learning_rate": 1.7737258393728363e-05, "loss": 0.3347, "step": 1020 }, { "epoch": 0.4843453510436433, "grad_norm": 2.553727865219116, "learning_rate": 1.7732388750357618e-05, "loss": 0.3965, "step": 1021 }, { "epoch": 0.484819734345351, "grad_norm": 3.6200978755950928, "learning_rate": 1.7727514542634308e-05, "loss": 0.532, "step": 1022 }, { "epoch": 0.4852941176470588, "grad_norm": 2.9352922439575195, "learning_rate": 1.772263577343563e-05, "loss": 0.428, "step": 1023 }, { "epoch": 0.4857685009487666, "grad_norm": 2.7596585750579834, "learning_rate": 1.7717752445641473e-05, "loss": 0.4415, "step": 1024 }, { "epoch": 0.4862428842504744, "grad_norm": 2.538545608520508, "learning_rate": 1.771286456213442e-05, "loss": 0.3995, "step": 1025 }, { "epoch": 0.48671726755218214, "grad_norm": 2.592538833618164, "learning_rate": 1.7707972125799738e-05, "loss": 0.3906, "step": 1026 }, { "epoch": 0.48719165085388993, "grad_norm": 2.813682794570923, "learning_rate": 1.770307513952538e-05, "loss": 0.409, "step": 1027 }, { "epoch": 0.4876660341555977, "grad_norm": 2.8214876651763916, "learning_rate": 1.7698173606201994e-05, "loss": 0.4843, "step": 1028 }, { "epoch": 0.4881404174573055, "grad_norm": 2.7187728881835938, "learning_rate": 1.7693267528722907e-05, "loss": 0.4048, "step": 1029 }, { "epoch": 0.48861480075901326, "grad_norm": 3.217090368270874, "learning_rate": 1.7688356909984125e-05, "loss": 0.5723, "step": 1030 }, { "epoch": 0.48908918406072105, "grad_norm": 2.610750675201416, "learning_rate": 1.7683441752884337e-05, "loss": 0.3835, "step": 1031 }, { "epoch": 0.48956356736242884, "grad_norm": 2.7182836532592773, "learning_rate": 1.7678522060324918e-05, "loss": 0.4672, "step": 1032 }, { "epoch": 0.49003795066413663, "grad_norm": 11.401104927062988, "learning_rate": 1.7673597835209904e-05, "loss": 0.3387, "step": 1033 }, { "epoch": 0.4905123339658444, "grad_norm": 2.622774124145508, "learning_rate": 1.766866908044602e-05, "loss": 0.4282, "step": 1034 }, { "epoch": 0.49098671726755216, "grad_norm": 2.8863613605499268, "learning_rate": 1.7663735798942666e-05, "loss": 0.4643, "step": 1035 }, { "epoch": 0.49146110056925996, "grad_norm": 2.4292235374450684, "learning_rate": 1.7658797993611907e-05, "loss": 0.3813, "step": 1036 }, { "epoch": 0.49193548387096775, "grad_norm": 2.83770751953125, "learning_rate": 1.7653855667368474e-05, "loss": 0.5212, "step": 1037 }, { "epoch": 0.49240986717267554, "grad_norm": 2.406994581222534, "learning_rate": 1.7648908823129788e-05, "loss": 0.429, "step": 1038 }, { "epoch": 0.4928842504743833, "grad_norm": 2.8495097160339355, "learning_rate": 1.7643957463815904e-05, "loss": 0.4773, "step": 1039 }, { "epoch": 0.49335863377609107, "grad_norm": 2.946979522705078, "learning_rate": 1.7639001592349575e-05, "loss": 0.4433, "step": 1040 }, { "epoch": 0.49383301707779886, "grad_norm": 2.636714220046997, "learning_rate": 1.7634041211656193e-05, "loss": 0.3856, "step": 1041 }, { "epoch": 0.49430740037950666, "grad_norm": 2.825824737548828, "learning_rate": 1.7629076324663827e-05, "loss": 0.4985, "step": 1042 }, { "epoch": 0.49478178368121445, "grad_norm": 2.4185476303100586, "learning_rate": 1.7624106934303202e-05, "loss": 0.4149, "step": 1043 }, { "epoch": 0.4952561669829222, "grad_norm": 2.691755533218384, "learning_rate": 1.7619133043507694e-05, "loss": 0.4321, "step": 1044 }, { "epoch": 0.49573055028463, "grad_norm": 2.816511392593384, "learning_rate": 1.7614154655213343e-05, "loss": 0.4777, "step": 1045 }, { "epoch": 0.49620493358633777, "grad_norm": 2.25382924079895, "learning_rate": 1.760917177235885e-05, "loss": 0.4227, "step": 1046 }, { "epoch": 0.49667931688804556, "grad_norm": 3.144374132156372, "learning_rate": 1.7604184397885554e-05, "loss": 0.4284, "step": 1047 }, { "epoch": 0.4971537001897533, "grad_norm": 2.518986940383911, "learning_rate": 1.7599192534737453e-05, "loss": 0.4029, "step": 1048 }, { "epoch": 0.4976280834914611, "grad_norm": 2.7416396141052246, "learning_rate": 1.7594196185861198e-05, "loss": 0.4459, "step": 1049 }, { "epoch": 0.4981024667931689, "grad_norm": 3.26064395904541, "learning_rate": 1.7589195354206082e-05, "loss": 0.3789, "step": 1050 }, { "epoch": 0.4985768500948767, "grad_norm": 3.0012292861938477, "learning_rate": 1.7584190042724047e-05, "loss": 0.5016, "step": 1051 }, { "epoch": 0.4990512333965844, "grad_norm": 3.3961806297302246, "learning_rate": 1.7579180254369674e-05, "loss": 0.4325, "step": 1052 }, { "epoch": 0.4995256166982922, "grad_norm": 2.4713120460510254, "learning_rate": 1.75741659921002e-05, "loss": 0.3976, "step": 1053 }, { "epoch": 0.5, "grad_norm": 2.7608566284179688, "learning_rate": 1.756914725887549e-05, "loss": 0.4458, "step": 1054 }, { "epoch": 0.5004743833017078, "grad_norm": 2.3729867935180664, "learning_rate": 1.7564124057658057e-05, "loss": 0.3763, "step": 1055 }, { "epoch": 0.5009487666034156, "grad_norm": 2.746999979019165, "learning_rate": 1.755909639141304e-05, "loss": 0.4111, "step": 1056 }, { "epoch": 0.5014231499051234, "grad_norm": 2.6905953884124756, "learning_rate": 1.755406426310822e-05, "loss": 0.4423, "step": 1057 }, { "epoch": 0.5018975332068312, "grad_norm": 2.955181837081909, "learning_rate": 1.754902767571402e-05, "loss": 0.33, "step": 1058 }, { "epoch": 0.5023719165085389, "grad_norm": 2.381403684616089, "learning_rate": 1.754398663220348e-05, "loss": 0.4346, "step": 1059 }, { "epoch": 0.5028462998102466, "grad_norm": 2.5319292545318604, "learning_rate": 1.7538941135552286e-05, "loss": 0.4469, "step": 1060 }, { "epoch": 0.5033206831119544, "grad_norm": 2.7511696815490723, "learning_rate": 1.7533891188738738e-05, "loss": 0.4318, "step": 1061 }, { "epoch": 0.5037950664136622, "grad_norm": 2.6034233570098877, "learning_rate": 1.7528836794743776e-05, "loss": 0.3468, "step": 1062 }, { "epoch": 0.50426944971537, "grad_norm": 2.702395439147949, "learning_rate": 1.752377795655095e-05, "loss": 0.4786, "step": 1063 }, { "epoch": 0.5047438330170778, "grad_norm": 2.741300582885742, "learning_rate": 1.751871467714645e-05, "loss": 0.3856, "step": 1064 }, { "epoch": 0.5052182163187856, "grad_norm": 2.5689239501953125, "learning_rate": 1.751364695951908e-05, "loss": 0.3626, "step": 1065 }, { "epoch": 0.5056925996204934, "grad_norm": 2.544797897338867, "learning_rate": 1.7508574806660256e-05, "loss": 0.4253, "step": 1066 }, { "epoch": 0.5061669829222012, "grad_norm": 2.194748640060425, "learning_rate": 1.7503498221564026e-05, "loss": 0.322, "step": 1067 }, { "epoch": 0.5066413662239089, "grad_norm": 2.4700746536254883, "learning_rate": 1.7498417207227046e-05, "loss": 0.3716, "step": 1068 }, { "epoch": 0.5071157495256167, "grad_norm": 3.115464687347412, "learning_rate": 1.7493331766648588e-05, "loss": 0.4376, "step": 1069 }, { "epoch": 0.5075901328273245, "grad_norm": 2.6482186317443848, "learning_rate": 1.748824190283054e-05, "loss": 0.3741, "step": 1070 }, { "epoch": 0.5080645161290323, "grad_norm": 2.6619374752044678, "learning_rate": 1.7483147618777393e-05, "loss": 0.4422, "step": 1071 }, { "epoch": 0.50853889943074, "grad_norm": 2.94618558883667, "learning_rate": 1.7478048917496255e-05, "loss": 0.4338, "step": 1072 }, { "epoch": 0.5090132827324478, "grad_norm": 3.4346418380737305, "learning_rate": 1.7472945801996842e-05, "loss": 0.5294, "step": 1073 }, { "epoch": 0.5094876660341556, "grad_norm": 2.2107574939727783, "learning_rate": 1.7467838275291467e-05, "loss": 0.3451, "step": 1074 }, { "epoch": 0.5099620493358634, "grad_norm": 2.622593641281128, "learning_rate": 1.746272634039506e-05, "loss": 0.4888, "step": 1075 }, { "epoch": 0.5104364326375711, "grad_norm": 2.8133835792541504, "learning_rate": 1.745761000032514e-05, "loss": 0.45, "step": 1076 }, { "epoch": 0.5109108159392789, "grad_norm": 1.8900046348571777, "learning_rate": 1.745248925810183e-05, "loss": 0.3165, "step": 1077 }, { "epoch": 0.5113851992409867, "grad_norm": 2.4775540828704834, "learning_rate": 1.744736411674786e-05, "loss": 0.3891, "step": 1078 }, { "epoch": 0.5118595825426945, "grad_norm": 2.9898617267608643, "learning_rate": 1.7442234579288543e-05, "loss": 0.4028, "step": 1079 }, { "epoch": 0.5123339658444023, "grad_norm": 2.6482555866241455, "learning_rate": 1.74371006487518e-05, "loss": 0.445, "step": 1080 }, { "epoch": 0.5128083491461101, "grad_norm": 3.1871161460876465, "learning_rate": 1.743196232816814e-05, "loss": 0.3826, "step": 1081 }, { "epoch": 0.5132827324478179, "grad_norm": 2.722008466720581, "learning_rate": 1.7426819620570655e-05, "loss": 0.4419, "step": 1082 }, { "epoch": 0.5137571157495257, "grad_norm": 2.5276153087615967, "learning_rate": 1.7421672528995043e-05, "loss": 0.4635, "step": 1083 }, { "epoch": 0.5142314990512334, "grad_norm": 2.950514554977417, "learning_rate": 1.7416521056479577e-05, "loss": 0.3816, "step": 1084 }, { "epoch": 0.5147058823529411, "grad_norm": 2.382774829864502, "learning_rate": 1.741136520606512e-05, "loss": 0.3931, "step": 1085 }, { "epoch": 0.5151802656546489, "grad_norm": 2.6314048767089844, "learning_rate": 1.7406204980795124e-05, "loss": 0.4377, "step": 1086 }, { "epoch": 0.5156546489563567, "grad_norm": 3.256890296936035, "learning_rate": 1.740104038371561e-05, "loss": 0.5105, "step": 1087 }, { "epoch": 0.5161290322580645, "grad_norm": 2.571377992630005, "learning_rate": 1.7395871417875198e-05, "loss": 0.4192, "step": 1088 }, { "epoch": 0.5166034155597723, "grad_norm": 2.945387601852417, "learning_rate": 1.7390698086325072e-05, "loss": 0.3837, "step": 1089 }, { "epoch": 0.5170777988614801, "grad_norm": 3.032877206802368, "learning_rate": 1.7385520392118998e-05, "loss": 0.4706, "step": 1090 }, { "epoch": 0.5175521821631879, "grad_norm": 3.0032753944396973, "learning_rate": 1.7380338338313322e-05, "loss": 0.4447, "step": 1091 }, { "epoch": 0.5180265654648957, "grad_norm": 2.387190103530884, "learning_rate": 1.7375151927966954e-05, "loss": 0.386, "step": 1092 }, { "epoch": 0.5185009487666035, "grad_norm": 3.4106404781341553, "learning_rate": 1.7369961164141383e-05, "loss": 0.5009, "step": 1093 }, { "epoch": 0.5189753320683111, "grad_norm": 2.958599805831909, "learning_rate": 1.736476604990067e-05, "loss": 0.4794, "step": 1094 }, { "epoch": 0.5194497153700189, "grad_norm": 2.795013189315796, "learning_rate": 1.735956658831143e-05, "loss": 0.4323, "step": 1095 }, { "epoch": 0.5199240986717267, "grad_norm": 2.330531120300293, "learning_rate": 1.7354362782442864e-05, "loss": 0.3815, "step": 1096 }, { "epoch": 0.5203984819734345, "grad_norm": 2.7681827545166016, "learning_rate": 1.734915463536672e-05, "loss": 0.4358, "step": 1097 }, { "epoch": 0.5208728652751423, "grad_norm": 2.7465672492980957, "learning_rate": 1.7343942150157315e-05, "loss": 0.4384, "step": 1098 }, { "epoch": 0.5213472485768501, "grad_norm": 2.6306185722351074, "learning_rate": 1.7338725329891532e-05, "loss": 0.4233, "step": 1099 }, { "epoch": 0.5218216318785579, "grad_norm": 2.5246400833129883, "learning_rate": 1.7333504177648806e-05, "loss": 0.3583, "step": 1100 }, { "epoch": 0.5222960151802657, "grad_norm": 2.543013334274292, "learning_rate": 1.7328278696511135e-05, "loss": 0.4036, "step": 1101 }, { "epoch": 0.5227703984819735, "grad_norm": 3.1978096961975098, "learning_rate": 1.7323048889563062e-05, "loss": 0.489, "step": 1102 }, { "epoch": 0.5232447817836812, "grad_norm": 2.8943567276000977, "learning_rate": 1.73178147598917e-05, "loss": 0.5043, "step": 1103 }, { "epoch": 0.523719165085389, "grad_norm": 2.635658025741577, "learning_rate": 1.7312576310586692e-05, "loss": 0.4701, "step": 1104 }, { "epoch": 0.5241935483870968, "grad_norm": 2.4131529331207275, "learning_rate": 1.730733354474025e-05, "loss": 0.4144, "step": 1105 }, { "epoch": 0.5246679316888045, "grad_norm": 2.3183786869049072, "learning_rate": 1.730208646544713e-05, "loss": 0.4114, "step": 1106 }, { "epoch": 0.5251423149905123, "grad_norm": 2.472311496734619, "learning_rate": 1.729683507580462e-05, "loss": 0.3944, "step": 1107 }, { "epoch": 0.5256166982922201, "grad_norm": 2.4466824531555176, "learning_rate": 1.7291579378912574e-05, "loss": 0.4335, "step": 1108 }, { "epoch": 0.5260910815939279, "grad_norm": 2.289536237716675, "learning_rate": 1.7286319377873374e-05, "loss": 0.3746, "step": 1109 }, { "epoch": 0.5265654648956357, "grad_norm": 2.8903002738952637, "learning_rate": 1.7281055075791946e-05, "loss": 0.4114, "step": 1110 }, { "epoch": 0.5270398481973435, "grad_norm": 2.633312225341797, "learning_rate": 1.7275786475775757e-05, "loss": 0.3702, "step": 1111 }, { "epoch": 0.5275142314990512, "grad_norm": 3.3855140209198, "learning_rate": 1.7270513580934805e-05, "loss": 0.4827, "step": 1112 }, { "epoch": 0.527988614800759, "grad_norm": 2.293537139892578, "learning_rate": 1.7265236394381634e-05, "loss": 0.3591, "step": 1113 }, { "epoch": 0.5284629981024668, "grad_norm": 2.2649781703948975, "learning_rate": 1.725995491923131e-05, "loss": 0.3496, "step": 1114 }, { "epoch": 0.5289373814041746, "grad_norm": 2.7540788650512695, "learning_rate": 1.725466915860144e-05, "loss": 0.4094, "step": 1115 }, { "epoch": 0.5294117647058824, "grad_norm": 2.8190548419952393, "learning_rate": 1.7249379115612154e-05, "loss": 0.442, "step": 1116 }, { "epoch": 0.5298861480075902, "grad_norm": 2.757744789123535, "learning_rate": 1.7244084793386108e-05, "loss": 0.3825, "step": 1117 }, { "epoch": 0.530360531309298, "grad_norm": 3.2028768062591553, "learning_rate": 1.7238786195048493e-05, "loss": 0.4394, "step": 1118 }, { "epoch": 0.5308349146110057, "grad_norm": 2.5741443634033203, "learning_rate": 1.7233483323727018e-05, "loss": 0.3955, "step": 1119 }, { "epoch": 0.5313092979127134, "grad_norm": 3.050013303756714, "learning_rate": 1.7228176182551917e-05, "loss": 0.4598, "step": 1120 }, { "epoch": 0.5317836812144212, "grad_norm": 2.7212207317352295, "learning_rate": 1.722286477465594e-05, "loss": 0.4926, "step": 1121 }, { "epoch": 0.532258064516129, "grad_norm": 2.9433910846710205, "learning_rate": 1.721754910317436e-05, "loss": 0.4738, "step": 1122 }, { "epoch": 0.5327324478178368, "grad_norm": 2.578136444091797, "learning_rate": 1.7212229171244966e-05, "loss": 0.3956, "step": 1123 }, { "epoch": 0.5332068311195446, "grad_norm": 2.4337382316589355, "learning_rate": 1.7206904982008058e-05, "loss": 0.3345, "step": 1124 }, { "epoch": 0.5336812144212524, "grad_norm": 2.611060619354248, "learning_rate": 1.7201576538606458e-05, "loss": 0.4081, "step": 1125 }, { "epoch": 0.5341555977229602, "grad_norm": 2.9734725952148438, "learning_rate": 1.7196243844185488e-05, "loss": 0.4356, "step": 1126 }, { "epoch": 0.534629981024668, "grad_norm": 2.1246142387390137, "learning_rate": 1.7190906901892986e-05, "loss": 0.412, "step": 1127 }, { "epoch": 0.5351043643263758, "grad_norm": 2.533374547958374, "learning_rate": 1.7185565714879295e-05, "loss": 0.4051, "step": 1128 }, { "epoch": 0.5355787476280834, "grad_norm": 2.476228713989258, "learning_rate": 1.718022028629727e-05, "loss": 0.4775, "step": 1129 }, { "epoch": 0.5360531309297912, "grad_norm": 2.6208913326263428, "learning_rate": 1.7174870619302263e-05, "loss": 0.3889, "step": 1130 }, { "epoch": 0.536527514231499, "grad_norm": 2.2168285846710205, "learning_rate": 1.7169516717052122e-05, "loss": 0.3205, "step": 1131 }, { "epoch": 0.5370018975332068, "grad_norm": 2.6198856830596924, "learning_rate": 1.7164158582707215e-05, "loss": 0.3413, "step": 1132 }, { "epoch": 0.5374762808349146, "grad_norm": 2.9443211555480957, "learning_rate": 1.715879621943038e-05, "loss": 0.5137, "step": 1133 }, { "epoch": 0.5379506641366224, "grad_norm": 2.5952861309051514, "learning_rate": 1.7153429630386985e-05, "loss": 0.4471, "step": 1134 }, { "epoch": 0.5384250474383302, "grad_norm": 2.5865321159362793, "learning_rate": 1.714805881874486e-05, "loss": 0.3915, "step": 1135 }, { "epoch": 0.538899430740038, "grad_norm": 2.849581003189087, "learning_rate": 1.7142683787674353e-05, "loss": 0.4406, "step": 1136 }, { "epoch": 0.5393738140417458, "grad_norm": 2.4955079555511475, "learning_rate": 1.713730454034828e-05, "loss": 0.4032, "step": 1137 }, { "epoch": 0.5398481973434535, "grad_norm": 2.804607391357422, "learning_rate": 1.7131921079941965e-05, "loss": 0.354, "step": 1138 }, { "epoch": 0.5403225806451613, "grad_norm": 2.3211519718170166, "learning_rate": 1.7126533409633214e-05, "loss": 0.3228, "step": 1139 }, { "epoch": 0.540796963946869, "grad_norm": 2.563812732696533, "learning_rate": 1.7121141532602306e-05, "loss": 0.407, "step": 1140 }, { "epoch": 0.5412713472485768, "grad_norm": 2.5948188304901123, "learning_rate": 1.7115745452032023e-05, "loss": 0.3885, "step": 1141 }, { "epoch": 0.5417457305502846, "grad_norm": 2.253960371017456, "learning_rate": 1.711034517110761e-05, "loss": 0.4199, "step": 1142 }, { "epoch": 0.5422201138519924, "grad_norm": 2.6741251945495605, "learning_rate": 1.7104940693016803e-05, "loss": 0.4642, "step": 1143 }, { "epoch": 0.5426944971537002, "grad_norm": 2.9957103729248047, "learning_rate": 1.709953202094981e-05, "loss": 0.3952, "step": 1144 }, { "epoch": 0.543168880455408, "grad_norm": 2.8041434288024902, "learning_rate": 1.7094119158099318e-05, "loss": 0.3991, "step": 1145 }, { "epoch": 0.5436432637571158, "grad_norm": 2.4647340774536133, "learning_rate": 1.708870210766049e-05, "loss": 0.3622, "step": 1146 }, { "epoch": 0.5441176470588235, "grad_norm": 3.1950418949127197, "learning_rate": 1.708328087283095e-05, "loss": 0.4606, "step": 1147 }, { "epoch": 0.5445920303605313, "grad_norm": 2.3248064517974854, "learning_rate": 1.7077855456810803e-05, "loss": 0.3685, "step": 1148 }, { "epoch": 0.5450664136622391, "grad_norm": 3.027299404144287, "learning_rate": 1.7072425862802618e-05, "loss": 0.4978, "step": 1149 }, { "epoch": 0.5455407969639469, "grad_norm": 3.0360865592956543, "learning_rate": 1.706699209401143e-05, "loss": 0.3587, "step": 1150 }, { "epoch": 0.5460151802656547, "grad_norm": 2.7599403858184814, "learning_rate": 1.7061554153644743e-05, "loss": 0.4213, "step": 1151 }, { "epoch": 0.5464895635673624, "grad_norm": 3.035600423812866, "learning_rate": 1.7056112044912513e-05, "loss": 0.4591, "step": 1152 }, { "epoch": 0.5469639468690702, "grad_norm": 2.4583334922790527, "learning_rate": 1.705066577102717e-05, "loss": 0.4092, "step": 1153 }, { "epoch": 0.547438330170778, "grad_norm": 2.438873529434204, "learning_rate": 1.704521533520359e-05, "loss": 0.4513, "step": 1154 }, { "epoch": 0.5479127134724858, "grad_norm": 2.434335708618164, "learning_rate": 1.703976074065911e-05, "loss": 0.3763, "step": 1155 }, { "epoch": 0.5483870967741935, "grad_norm": 2.7896952629089355, "learning_rate": 1.703430199061353e-05, "loss": 0.4478, "step": 1156 }, { "epoch": 0.5488614800759013, "grad_norm": 3.330437183380127, "learning_rate": 1.7028839088289092e-05, "loss": 0.4884, "step": 1157 }, { "epoch": 0.5493358633776091, "grad_norm": 2.2704224586486816, "learning_rate": 1.702337203691049e-05, "loss": 0.3599, "step": 1158 }, { "epoch": 0.5498102466793169, "grad_norm": 2.6077935695648193, "learning_rate": 1.7017900839704877e-05, "loss": 0.439, "step": 1159 }, { "epoch": 0.5502846299810247, "grad_norm": 2.6503183841705322, "learning_rate": 1.7012425499901842e-05, "loss": 0.443, "step": 1160 }, { "epoch": 0.5507590132827325, "grad_norm": 2.3090269565582275, "learning_rate": 1.7006946020733426e-05, "loss": 0.4096, "step": 1161 }, { "epoch": 0.5512333965844403, "grad_norm": 2.1434450149536133, "learning_rate": 1.700146240543411e-05, "loss": 0.373, "step": 1162 }, { "epoch": 0.551707779886148, "grad_norm": 2.830927610397339, "learning_rate": 1.699597465724082e-05, "loss": 0.4866, "step": 1163 }, { "epoch": 0.5521821631878557, "grad_norm": 2.427565336227417, "learning_rate": 1.6990482779392918e-05, "loss": 0.3733, "step": 1164 }, { "epoch": 0.5526565464895635, "grad_norm": 2.7859416007995605, "learning_rate": 1.6984986775132202e-05, "loss": 0.4309, "step": 1165 }, { "epoch": 0.5531309297912713, "grad_norm": 3.880078077316284, "learning_rate": 1.6979486647702917e-05, "loss": 0.6485, "step": 1166 }, { "epoch": 0.5536053130929791, "grad_norm": 2.7935032844543457, "learning_rate": 1.6973982400351726e-05, "loss": 0.4619, "step": 1167 }, { "epoch": 0.5540796963946869, "grad_norm": 2.788963794708252, "learning_rate": 1.6968474036327733e-05, "loss": 0.3934, "step": 1168 }, { "epoch": 0.5545540796963947, "grad_norm": 2.3804471492767334, "learning_rate": 1.6962961558882476e-05, "loss": 0.4006, "step": 1169 }, { "epoch": 0.5550284629981025, "grad_norm": 2.1218841075897217, "learning_rate": 1.6957444971269907e-05, "loss": 0.3862, "step": 1170 }, { "epoch": 0.5555028462998103, "grad_norm": 2.6136226654052734, "learning_rate": 1.6951924276746425e-05, "loss": 0.4073, "step": 1171 }, { "epoch": 0.5559772296015181, "grad_norm": 2.710028886795044, "learning_rate": 1.694639947857083e-05, "loss": 0.4464, "step": 1172 }, { "epoch": 0.5564516129032258, "grad_norm": 2.524308204650879, "learning_rate": 1.6940870580004364e-05, "loss": 0.4632, "step": 1173 }, { "epoch": 0.5569259962049335, "grad_norm": 2.7076687812805176, "learning_rate": 1.6935337584310674e-05, "loss": 0.4706, "step": 1174 }, { "epoch": 0.5574003795066413, "grad_norm": 2.7183334827423096, "learning_rate": 1.6929800494755836e-05, "loss": 0.4462, "step": 1175 }, { "epoch": 0.5578747628083491, "grad_norm": 2.5758368968963623, "learning_rate": 1.692425931460834e-05, "loss": 0.3369, "step": 1176 }, { "epoch": 0.5583491461100569, "grad_norm": 2.9988672733306885, "learning_rate": 1.691871404713909e-05, "loss": 0.5041, "step": 1177 }, { "epoch": 0.5588235294117647, "grad_norm": 2.278681516647339, "learning_rate": 1.69131646956214e-05, "loss": 0.4227, "step": 1178 }, { "epoch": 0.5592979127134725, "grad_norm": 2.4047083854675293, "learning_rate": 1.6907611263331004e-05, "loss": 0.3633, "step": 1179 }, { "epoch": 0.5597722960151803, "grad_norm": 2.6348397731781006, "learning_rate": 1.6902053753546026e-05, "loss": 0.4034, "step": 1180 }, { "epoch": 0.5602466793168881, "grad_norm": 2.6245360374450684, "learning_rate": 1.6896492169547022e-05, "loss": 0.4203, "step": 1181 }, { "epoch": 0.5607210626185958, "grad_norm": 2.4060757160186768, "learning_rate": 1.6890926514616926e-05, "loss": 0.3561, "step": 1182 }, { "epoch": 0.5611954459203036, "grad_norm": 4.168298244476318, "learning_rate": 1.6885356792041107e-05, "loss": 0.4976, "step": 1183 }, { "epoch": 0.5616698292220114, "grad_norm": 2.4673924446105957, "learning_rate": 1.68797830051073e-05, "loss": 0.4339, "step": 1184 }, { "epoch": 0.5621442125237192, "grad_norm": 2.879330635070801, "learning_rate": 1.6874205157105667e-05, "loss": 0.4091, "step": 1185 }, { "epoch": 0.562618595825427, "grad_norm": 2.9739437103271484, "learning_rate": 1.686862325132875e-05, "loss": 0.379, "step": 1186 }, { "epoch": 0.5630929791271347, "grad_norm": 2.463998317718506, "learning_rate": 1.68630372910715e-05, "loss": 0.394, "step": 1187 }, { "epoch": 0.5635673624288425, "grad_norm": 3.4014320373535156, "learning_rate": 1.685744727963125e-05, "loss": 0.4995, "step": 1188 }, { "epoch": 0.5640417457305503, "grad_norm": 3.1018409729003906, "learning_rate": 1.685185322030772e-05, "loss": 0.3801, "step": 1189 }, { "epoch": 0.5645161290322581, "grad_norm": 3.028447151184082, "learning_rate": 1.6846255116403044e-05, "loss": 0.4118, "step": 1190 }, { "epoch": 0.5649905123339658, "grad_norm": 3.2124314308166504, "learning_rate": 1.6840652971221714e-05, "loss": 0.3829, "step": 1191 }, { "epoch": 0.5654648956356736, "grad_norm": 3.856614351272583, "learning_rate": 1.683504678807063e-05, "loss": 0.4024, "step": 1192 }, { "epoch": 0.5659392789373814, "grad_norm": 2.8994805812835693, "learning_rate": 1.6829436570259064e-05, "loss": 0.469, "step": 1193 }, { "epoch": 0.5664136622390892, "grad_norm": 2.968153953552246, "learning_rate": 1.6823822321098667e-05, "loss": 0.4357, "step": 1194 }, { "epoch": 0.566888045540797, "grad_norm": 2.9536709785461426, "learning_rate": 1.681820404390348e-05, "loss": 0.4572, "step": 1195 }, { "epoch": 0.5673624288425048, "grad_norm": 2.574279546737671, "learning_rate": 1.6812581741989915e-05, "loss": 0.3608, "step": 1196 }, { "epoch": 0.5678368121442126, "grad_norm": 2.5156452655792236, "learning_rate": 1.680695541867676e-05, "loss": 0.402, "step": 1197 }, { "epoch": 0.5683111954459203, "grad_norm": 3.1304829120635986, "learning_rate": 1.680132507728518e-05, "loss": 0.3993, "step": 1198 }, { "epoch": 0.568785578747628, "grad_norm": 3.0311906337738037, "learning_rate": 1.679569072113871e-05, "loss": 0.3883, "step": 1199 }, { "epoch": 0.5692599620493358, "grad_norm": 2.762141227722168, "learning_rate": 1.6790052353563254e-05, "loss": 0.3877, "step": 1200 }, { "epoch": 0.5697343453510436, "grad_norm": 2.4210193157196045, "learning_rate": 1.678440997788708e-05, "loss": 0.3758, "step": 1201 }, { "epoch": 0.5702087286527514, "grad_norm": 2.446246862411499, "learning_rate": 1.6778763597440833e-05, "loss": 0.3929, "step": 1202 }, { "epoch": 0.5706831119544592, "grad_norm": 2.542310953140259, "learning_rate": 1.6773113215557514e-05, "loss": 0.4038, "step": 1203 }, { "epoch": 0.571157495256167, "grad_norm": 2.6096861362457275, "learning_rate": 1.6767458835572487e-05, "loss": 0.421, "step": 1204 }, { "epoch": 0.5716318785578748, "grad_norm": 2.5336506366729736, "learning_rate": 1.6761800460823473e-05, "loss": 0.4722, "step": 1205 }, { "epoch": 0.5721062618595826, "grad_norm": 2.657362461090088, "learning_rate": 1.6756138094650563e-05, "loss": 0.4266, "step": 1206 }, { "epoch": 0.5725806451612904, "grad_norm": 2.6421406269073486, "learning_rate": 1.6750471740396187e-05, "loss": 0.4215, "step": 1207 }, { "epoch": 0.573055028462998, "grad_norm": 2.54229998588562, "learning_rate": 1.6744801401405138e-05, "loss": 0.4456, "step": 1208 }, { "epoch": 0.5735294117647058, "grad_norm": 3.0632123947143555, "learning_rate": 1.6739127081024574e-05, "loss": 0.3799, "step": 1209 }, { "epoch": 0.5740037950664136, "grad_norm": 2.753382921218872, "learning_rate": 1.673344878260397e-05, "loss": 0.4764, "step": 1210 }, { "epoch": 0.5744781783681214, "grad_norm": 2.06011700630188, "learning_rate": 1.6727766509495186e-05, "loss": 0.3336, "step": 1211 }, { "epoch": 0.5749525616698292, "grad_norm": 2.405019760131836, "learning_rate": 1.6722080265052407e-05, "loss": 0.4868, "step": 1212 }, { "epoch": 0.575426944971537, "grad_norm": 2.274955987930298, "learning_rate": 1.671639005263216e-05, "loss": 0.3628, "step": 1213 }, { "epoch": 0.5759013282732448, "grad_norm": 3.2761285305023193, "learning_rate": 1.671069587559333e-05, "loss": 0.4588, "step": 1214 }, { "epoch": 0.5763757115749526, "grad_norm": 3.0697081089019775, "learning_rate": 1.6704997737297134e-05, "loss": 0.43, "step": 1215 }, { "epoch": 0.5768500948766604, "grad_norm": 2.4201908111572266, "learning_rate": 1.6699295641107116e-05, "loss": 0.3929, "step": 1216 }, { "epoch": 0.5773244781783681, "grad_norm": 2.5585038661956787, "learning_rate": 1.6693589590389176e-05, "loss": 0.4479, "step": 1217 }, { "epoch": 0.5777988614800759, "grad_norm": 2.488297939300537, "learning_rate": 1.668787958851153e-05, "loss": 0.4259, "step": 1218 }, { "epoch": 0.5782732447817837, "grad_norm": 2.3524911403656006, "learning_rate": 1.6682165638844753e-05, "loss": 0.4478, "step": 1219 }, { "epoch": 0.5787476280834914, "grad_norm": 2.2819252014160156, "learning_rate": 1.6676447744761715e-05, "loss": 0.3464, "step": 1220 }, { "epoch": 0.5792220113851992, "grad_norm": 2.497574806213379, "learning_rate": 1.6670725909637645e-05, "loss": 0.4008, "step": 1221 }, { "epoch": 0.579696394686907, "grad_norm": 2.4774222373962402, "learning_rate": 1.6665000136850076e-05, "loss": 0.4056, "step": 1222 }, { "epoch": 0.5801707779886148, "grad_norm": 3.251617908477783, "learning_rate": 1.665927042977888e-05, "loss": 0.4725, "step": 1223 }, { "epoch": 0.5806451612903226, "grad_norm": 5.06425666809082, "learning_rate": 1.6653536791806248e-05, "loss": 0.4183, "step": 1224 }, { "epoch": 0.5811195445920304, "grad_norm": 2.323410749435425, "learning_rate": 1.6647799226316684e-05, "loss": 0.3405, "step": 1225 }, { "epoch": 0.5815939278937381, "grad_norm": 2.2724153995513916, "learning_rate": 1.6642057736697023e-05, "loss": 0.3668, "step": 1226 }, { "epoch": 0.5820683111954459, "grad_norm": 2.63533878326416, "learning_rate": 1.6636312326336402e-05, "loss": 0.458, "step": 1227 }, { "epoch": 0.5825426944971537, "grad_norm": 3.030594825744629, "learning_rate": 1.6630562998626287e-05, "loss": 0.3992, "step": 1228 }, { "epoch": 0.5830170777988615, "grad_norm": 2.6020584106445312, "learning_rate": 1.6624809756960445e-05, "loss": 0.4317, "step": 1229 }, { "epoch": 0.5834914611005693, "grad_norm": 2.6600024700164795, "learning_rate": 1.6619052604734958e-05, "loss": 0.3952, "step": 1230 }, { "epoch": 0.5839658444022771, "grad_norm": 2.172214984893799, "learning_rate": 1.661329154534822e-05, "loss": 0.343, "step": 1231 }, { "epoch": 0.5844402277039848, "grad_norm": 2.652944803237915, "learning_rate": 1.6607526582200918e-05, "loss": 0.4041, "step": 1232 }, { "epoch": 0.5849146110056926, "grad_norm": 2.3528411388397217, "learning_rate": 1.6601757718696065e-05, "loss": 0.4106, "step": 1233 }, { "epoch": 0.5853889943074004, "grad_norm": 2.205167770385742, "learning_rate": 1.6595984958238952e-05, "loss": 0.3645, "step": 1234 }, { "epoch": 0.5858633776091081, "grad_norm": 2.9009509086608887, "learning_rate": 1.6590208304237193e-05, "loss": 0.5345, "step": 1235 }, { "epoch": 0.5863377609108159, "grad_norm": 3.4057090282440186, "learning_rate": 1.6584427760100682e-05, "loss": 0.4928, "step": 1236 }, { "epoch": 0.5868121442125237, "grad_norm": 2.6523916721343994, "learning_rate": 1.657864332924162e-05, "loss": 0.4055, "step": 1237 }, { "epoch": 0.5872865275142315, "grad_norm": 2.7750117778778076, "learning_rate": 1.6572855015074502e-05, "loss": 0.4551, "step": 1238 }, { "epoch": 0.5877609108159393, "grad_norm": 2.880966901779175, "learning_rate": 1.656706282101611e-05, "loss": 0.4637, "step": 1239 }, { "epoch": 0.5882352941176471, "grad_norm": 2.484705924987793, "learning_rate": 1.6561266750485517e-05, "loss": 0.37, "step": 1240 }, { "epoch": 0.5887096774193549, "grad_norm": 2.381371259689331, "learning_rate": 1.655546680690409e-05, "loss": 0.4134, "step": 1241 }, { "epoch": 0.5891840607210627, "grad_norm": 2.261758327484131, "learning_rate": 1.654966299369547e-05, "loss": 0.3461, "step": 1242 }, { "epoch": 0.5896584440227703, "grad_norm": 2.374173402786255, "learning_rate": 1.6543855314285598e-05, "loss": 0.4009, "step": 1243 }, { "epoch": 0.5901328273244781, "grad_norm": 2.667855978012085, "learning_rate": 1.6538043772102692e-05, "loss": 0.4612, "step": 1244 }, { "epoch": 0.5906072106261859, "grad_norm": 2.694554328918457, "learning_rate": 1.653222837057724e-05, "loss": 0.4593, "step": 1245 }, { "epoch": 0.5910815939278937, "grad_norm": 2.515317678451538, "learning_rate": 1.6526409113142022e-05, "loss": 0.4236, "step": 1246 }, { "epoch": 0.5915559772296015, "grad_norm": 2.1223080158233643, "learning_rate": 1.652058600323209e-05, "loss": 0.3854, "step": 1247 }, { "epoch": 0.5920303605313093, "grad_norm": 2.624248504638672, "learning_rate": 1.651475904428476e-05, "loss": 0.3865, "step": 1248 }, { "epoch": 0.5925047438330171, "grad_norm": 2.6333069801330566, "learning_rate": 1.6508928239739632e-05, "loss": 0.4257, "step": 1249 }, { "epoch": 0.5929791271347249, "grad_norm": 2.5864479541778564, "learning_rate": 1.6503093593038573e-05, "loss": 0.415, "step": 1250 }, { "epoch": 0.5934535104364327, "grad_norm": 2.4937593936920166, "learning_rate": 1.649725510762572e-05, "loss": 0.4392, "step": 1251 }, { "epoch": 0.5939278937381404, "grad_norm": 2.5235211849212646, "learning_rate": 1.6491412786947468e-05, "loss": 0.3365, "step": 1252 }, { "epoch": 0.5944022770398482, "grad_norm": 2.3775665760040283, "learning_rate": 1.6485566634452483e-05, "loss": 0.3976, "step": 1253 }, { "epoch": 0.594876660341556, "grad_norm": 2.5280470848083496, "learning_rate": 1.6479716653591694e-05, "loss": 0.4174, "step": 1254 }, { "epoch": 0.5953510436432637, "grad_norm": 2.390460968017578, "learning_rate": 1.647386284781828e-05, "loss": 0.3771, "step": 1255 }, { "epoch": 0.5958254269449715, "grad_norm": 2.5851383209228516, "learning_rate": 1.6468005220587687e-05, "loss": 0.4512, "step": 1256 }, { "epoch": 0.5962998102466793, "grad_norm": 2.5305631160736084, "learning_rate": 1.646214377535762e-05, "loss": 0.3618, "step": 1257 }, { "epoch": 0.5967741935483871, "grad_norm": 2.88232159614563, "learning_rate": 1.6456278515588023e-05, "loss": 0.4459, "step": 1258 }, { "epoch": 0.5972485768500949, "grad_norm": 2.761087656021118, "learning_rate": 1.6450409444741112e-05, "loss": 0.4103, "step": 1259 }, { "epoch": 0.5977229601518027, "grad_norm": 2.098811388015747, "learning_rate": 1.6444536566281332e-05, "loss": 0.3358, "step": 1260 }, { "epoch": 0.5981973434535104, "grad_norm": 2.363554000854492, "learning_rate": 1.643865988367539e-05, "loss": 0.3722, "step": 1261 }, { "epoch": 0.5986717267552182, "grad_norm": 2.7632856369018555, "learning_rate": 1.6432779400392232e-05, "loss": 0.4257, "step": 1262 }, { "epoch": 0.599146110056926, "grad_norm": 2.5627601146698, "learning_rate": 1.6426895119903046e-05, "loss": 0.4035, "step": 1263 }, { "epoch": 0.5996204933586338, "grad_norm": 2.7176856994628906, "learning_rate": 1.6421007045681273e-05, "loss": 0.4467, "step": 1264 }, { "epoch": 0.6000948766603416, "grad_norm": 2.368619441986084, "learning_rate": 1.6415115181202576e-05, "loss": 0.3523, "step": 1265 }, { "epoch": 0.6005692599620494, "grad_norm": 2.4514405727386475, "learning_rate": 1.6409219529944866e-05, "loss": 0.3838, "step": 1266 }, { "epoch": 0.6010436432637571, "grad_norm": 2.8423192501068115, "learning_rate": 1.640332009538829e-05, "loss": 0.3943, "step": 1267 }, { "epoch": 0.6015180265654649, "grad_norm": 2.820495843887329, "learning_rate": 1.639741688101523e-05, "loss": 0.4247, "step": 1268 }, { "epoch": 0.6019924098671727, "grad_norm": 2.6144824028015137, "learning_rate": 1.6391509890310285e-05, "loss": 0.43, "step": 1269 }, { "epoch": 0.6024667931688804, "grad_norm": 2.3828113079071045, "learning_rate": 1.63855991267603e-05, "loss": 0.3844, "step": 1270 }, { "epoch": 0.6029411764705882, "grad_norm": 2.611931324005127, "learning_rate": 1.637968459385434e-05, "loss": 0.3651, "step": 1271 }, { "epoch": 0.603415559772296, "grad_norm": 2.4862523078918457, "learning_rate": 1.6373766295083693e-05, "loss": 0.4257, "step": 1272 }, { "epoch": 0.6038899430740038, "grad_norm": 2.119983673095703, "learning_rate": 1.636784423394187e-05, "loss": 0.3721, "step": 1273 }, { "epoch": 0.6043643263757116, "grad_norm": 2.5551223754882812, "learning_rate": 1.6361918413924614e-05, "loss": 0.4151, "step": 1274 }, { "epoch": 0.6048387096774194, "grad_norm": 2.954251527786255, "learning_rate": 1.635598883852987e-05, "loss": 0.4131, "step": 1275 }, { "epoch": 0.6053130929791272, "grad_norm": 2.8312759399414062, "learning_rate": 1.6350055511257806e-05, "loss": 0.3901, "step": 1276 }, { "epoch": 0.605787476280835, "grad_norm": 2.543302536010742, "learning_rate": 1.6344118435610814e-05, "loss": 0.3597, "step": 1277 }, { "epoch": 0.6062618595825426, "grad_norm": 2.338500738143921, "learning_rate": 1.633817761509349e-05, "loss": 0.3255, "step": 1278 }, { "epoch": 0.6067362428842504, "grad_norm": 2.9333157539367676, "learning_rate": 1.6332233053212632e-05, "loss": 0.4025, "step": 1279 }, { "epoch": 0.6072106261859582, "grad_norm": 2.388615608215332, "learning_rate": 1.6326284753477267e-05, "loss": 0.4042, "step": 1280 }, { "epoch": 0.607685009487666, "grad_norm": 2.1982688903808594, "learning_rate": 1.6320332719398612e-05, "loss": 0.341, "step": 1281 }, { "epoch": 0.6081593927893738, "grad_norm": 2.6067326068878174, "learning_rate": 1.6314376954490097e-05, "loss": 0.441, "step": 1282 }, { "epoch": 0.6086337760910816, "grad_norm": 2.442051410675049, "learning_rate": 1.6308417462267348e-05, "loss": 0.4034, "step": 1283 }, { "epoch": 0.6091081593927894, "grad_norm": 2.5747456550598145, "learning_rate": 1.6302454246248195e-05, "loss": 0.4045, "step": 1284 }, { "epoch": 0.6095825426944972, "grad_norm": 3.0773766040802, "learning_rate": 1.6296487309952666e-05, "loss": 0.4241, "step": 1285 }, { "epoch": 0.610056925996205, "grad_norm": 2.966850519180298, "learning_rate": 1.6290516656902985e-05, "loss": 0.4131, "step": 1286 }, { "epoch": 0.6105313092979127, "grad_norm": 2.5385637283325195, "learning_rate": 1.6284542290623568e-05, "loss": 0.3904, "step": 1287 }, { "epoch": 0.6110056925996205, "grad_norm": 2.2452964782714844, "learning_rate": 1.6278564214641027e-05, "loss": 0.3438, "step": 1288 }, { "epoch": 0.6114800759013282, "grad_norm": 3.1382503509521484, "learning_rate": 1.6272582432484155e-05, "loss": 0.4465, "step": 1289 }, { "epoch": 0.611954459203036, "grad_norm": 2.5982370376586914, "learning_rate": 1.6266596947683945e-05, "loss": 0.4197, "step": 1290 }, { "epoch": 0.6124288425047438, "grad_norm": 3.1609013080596924, "learning_rate": 1.626060776377357e-05, "loss": 0.4191, "step": 1291 }, { "epoch": 0.6129032258064516, "grad_norm": 2.7382616996765137, "learning_rate": 1.625461488428838e-05, "loss": 0.4635, "step": 1292 }, { "epoch": 0.6133776091081594, "grad_norm": 2.5566318035125732, "learning_rate": 1.6248618312765918e-05, "loss": 0.4796, "step": 1293 }, { "epoch": 0.6138519924098672, "grad_norm": 2.541482448577881, "learning_rate": 1.62426180527459e-05, "loss": 0.4099, "step": 1294 }, { "epoch": 0.614326375711575, "grad_norm": 3.1109683513641357, "learning_rate": 1.6236614107770216e-05, "loss": 0.4481, "step": 1295 }, { "epoch": 0.6148007590132827, "grad_norm": 2.3709452152252197, "learning_rate": 1.623060648138294e-05, "loss": 0.4273, "step": 1296 }, { "epoch": 0.6152751423149905, "grad_norm": 2.797222375869751, "learning_rate": 1.622459517713031e-05, "loss": 0.3849, "step": 1297 }, { "epoch": 0.6157495256166983, "grad_norm": 2.5726704597473145, "learning_rate": 1.621858019856074e-05, "loss": 0.4335, "step": 1298 }, { "epoch": 0.6162239089184061, "grad_norm": 2.80641508102417, "learning_rate": 1.6212561549224818e-05, "loss": 0.4747, "step": 1299 }, { "epoch": 0.6166982922201139, "grad_norm": 2.7815253734588623, "learning_rate": 1.620653923267529e-05, "loss": 0.4144, "step": 1300 }, { "epoch": 0.6171726755218216, "grad_norm": 2.2950119972229004, "learning_rate": 1.6200513252467068e-05, "loss": 0.4452, "step": 1301 }, { "epoch": 0.6176470588235294, "grad_norm": 2.441072463989258, "learning_rate": 1.6194483612157232e-05, "loss": 0.4684, "step": 1302 }, { "epoch": 0.6181214421252372, "grad_norm": 2.0913050174713135, "learning_rate": 1.6188450315305012e-05, "loss": 0.3706, "step": 1303 }, { "epoch": 0.618595825426945, "grad_norm": 2.4034149646759033, "learning_rate": 1.6182413365471815e-05, "loss": 0.3506, "step": 1304 }, { "epoch": 0.6190702087286527, "grad_norm": 2.8475241661071777, "learning_rate": 1.617637276622118e-05, "loss": 0.4507, "step": 1305 }, { "epoch": 0.6195445920303605, "grad_norm": 2.499732255935669, "learning_rate": 1.617032852111882e-05, "loss": 0.368, "step": 1306 }, { "epoch": 0.6200189753320683, "grad_norm": 2.651540517807007, "learning_rate": 1.6164280633732594e-05, "loss": 0.4742, "step": 1307 }, { "epoch": 0.6204933586337761, "grad_norm": 2.2591097354888916, "learning_rate": 1.6158229107632507e-05, "loss": 0.3422, "step": 1308 }, { "epoch": 0.6209677419354839, "grad_norm": 2.7106447219848633, "learning_rate": 1.6152173946390715e-05, "loss": 0.4175, "step": 1309 }, { "epoch": 0.6214421252371917, "grad_norm": 3.118277072906494, "learning_rate": 1.6146115153581523e-05, "loss": 0.5013, "step": 1310 }, { "epoch": 0.6219165085388995, "grad_norm": 2.603504180908203, "learning_rate": 1.6140052732781373e-05, "loss": 0.3475, "step": 1311 }, { "epoch": 0.6223908918406073, "grad_norm": 2.1159276962280273, "learning_rate": 1.6133986687568854e-05, "loss": 0.3465, "step": 1312 }, { "epoch": 0.622865275142315, "grad_norm": 2.8673932552337646, "learning_rate": 1.612791702152469e-05, "loss": 0.4357, "step": 1313 }, { "epoch": 0.6233396584440227, "grad_norm": 2.5344109535217285, "learning_rate": 1.6121843738231748e-05, "loss": 0.3729, "step": 1314 }, { "epoch": 0.6238140417457305, "grad_norm": 2.3215112686157227, "learning_rate": 1.6115766841275027e-05, "loss": 0.3495, "step": 1315 }, { "epoch": 0.6242884250474383, "grad_norm": 2.229944944381714, "learning_rate": 1.6109686334241655e-05, "loss": 0.3594, "step": 1316 }, { "epoch": 0.6247628083491461, "grad_norm": 3.3155102729797363, "learning_rate": 1.6103602220720897e-05, "loss": 0.3867, "step": 1317 }, { "epoch": 0.6252371916508539, "grad_norm": 2.2139594554901123, "learning_rate": 1.609751450430415e-05, "loss": 0.374, "step": 1318 }, { "epoch": 0.6257115749525617, "grad_norm": 2.4204013347625732, "learning_rate": 1.6091423188584926e-05, "loss": 0.3767, "step": 1319 }, { "epoch": 0.6261859582542695, "grad_norm": 2.2802765369415283, "learning_rate": 1.608532827715887e-05, "loss": 0.3397, "step": 1320 }, { "epoch": 0.6266603415559773, "grad_norm": 2.470691204071045, "learning_rate": 1.607922977362375e-05, "loss": 0.3941, "step": 1321 }, { "epoch": 0.627134724857685, "grad_norm": 2.650733232498169, "learning_rate": 1.607312768157945e-05, "loss": 0.3187, "step": 1322 }, { "epoch": 0.6276091081593927, "grad_norm": 2.694547176361084, "learning_rate": 1.606702200462798e-05, "loss": 0.4531, "step": 1323 }, { "epoch": 0.6280834914611005, "grad_norm": 2.8122143745422363, "learning_rate": 1.6060912746373453e-05, "loss": 0.4155, "step": 1324 }, { "epoch": 0.6285578747628083, "grad_norm": 2.1507198810577393, "learning_rate": 1.6054799910422106e-05, "loss": 0.3287, "step": 1325 }, { "epoch": 0.6290322580645161, "grad_norm": 3.3852505683898926, "learning_rate": 1.604868350038229e-05, "loss": 0.4204, "step": 1326 }, { "epoch": 0.6295066413662239, "grad_norm": 2.315032482147217, "learning_rate": 1.6042563519864452e-05, "loss": 0.3035, "step": 1327 }, { "epoch": 0.6299810246679317, "grad_norm": 2.5337111949920654, "learning_rate": 1.603643997248117e-05, "loss": 0.4118, "step": 1328 }, { "epoch": 0.6304554079696395, "grad_norm": 2.549182653427124, "learning_rate": 1.60303128618471e-05, "loss": 0.3673, "step": 1329 }, { "epoch": 0.6309297912713473, "grad_norm": 2.420851230621338, "learning_rate": 1.6024182191579024e-05, "loss": 0.3595, "step": 1330 }, { "epoch": 0.631404174573055, "grad_norm": 2.3073275089263916, "learning_rate": 1.601804796529581e-05, "loss": 0.3316, "step": 1331 }, { "epoch": 0.6318785578747628, "grad_norm": 2.652170181274414, "learning_rate": 1.6011910186618433e-05, "loss": 0.3742, "step": 1332 }, { "epoch": 0.6323529411764706, "grad_norm": 2.5604469776153564, "learning_rate": 1.6005768859169965e-05, "loss": 0.3906, "step": 1333 }, { "epoch": 0.6328273244781784, "grad_norm": 2.7032854557037354, "learning_rate": 1.5999623986575565e-05, "loss": 0.3651, "step": 1334 }, { "epoch": 0.6333017077798861, "grad_norm": 2.5799543857574463, "learning_rate": 1.5993475572462498e-05, "loss": 0.3985, "step": 1335 }, { "epoch": 0.6337760910815939, "grad_norm": 2.3732707500457764, "learning_rate": 1.5987323620460106e-05, "loss": 0.3936, "step": 1336 }, { "epoch": 0.6342504743833017, "grad_norm": 2.5276899337768555, "learning_rate": 1.598116813419983e-05, "loss": 0.3924, "step": 1337 }, { "epoch": 0.6347248576850095, "grad_norm": 2.2890419960021973, "learning_rate": 1.597500911731519e-05, "loss": 0.4306, "step": 1338 }, { "epoch": 0.6351992409867173, "grad_norm": 2.1135501861572266, "learning_rate": 1.5968846573441794e-05, "loss": 0.281, "step": 1339 }, { "epoch": 0.635673624288425, "grad_norm": 2.5915725231170654, "learning_rate": 1.596268050621733e-05, "loss": 0.4047, "step": 1340 }, { "epoch": 0.6361480075901328, "grad_norm": 2.0635995864868164, "learning_rate": 1.5956510919281564e-05, "loss": 0.3489, "step": 1341 }, { "epoch": 0.6366223908918406, "grad_norm": 2.7874855995178223, "learning_rate": 1.5950337816276347e-05, "loss": 0.3854, "step": 1342 }, { "epoch": 0.6370967741935484, "grad_norm": 2.3711447715759277, "learning_rate": 1.5944161200845595e-05, "loss": 0.3981, "step": 1343 }, { "epoch": 0.6375711574952562, "grad_norm": 2.6661829948425293, "learning_rate": 1.5937981076635313e-05, "loss": 0.4276, "step": 1344 }, { "epoch": 0.638045540796964, "grad_norm": 2.2618393898010254, "learning_rate": 1.5931797447293553e-05, "loss": 0.3637, "step": 1345 }, { "epoch": 0.6385199240986718, "grad_norm": 2.272256851196289, "learning_rate": 1.5925610316470462e-05, "loss": 0.3746, "step": 1346 }, { "epoch": 0.6389943074003795, "grad_norm": 2.7523257732391357, "learning_rate": 1.5919419687818235e-05, "loss": 0.4538, "step": 1347 }, { "epoch": 0.6394686907020873, "grad_norm": 2.0002548694610596, "learning_rate": 1.5913225564991142e-05, "loss": 0.352, "step": 1348 }, { "epoch": 0.639943074003795, "grad_norm": 3.1848556995391846, "learning_rate": 1.590702795164551e-05, "loss": 0.4523, "step": 1349 }, { "epoch": 0.6404174573055028, "grad_norm": 2.4870452880859375, "learning_rate": 1.5900826851439734e-05, "loss": 0.4132, "step": 1350 }, { "epoch": 0.6408918406072106, "grad_norm": 2.299504041671753, "learning_rate": 1.5894622268034258e-05, "loss": 0.3974, "step": 1351 }, { "epoch": 0.6413662239089184, "grad_norm": 2.126984119415283, "learning_rate": 1.588841420509159e-05, "loss": 0.377, "step": 1352 }, { "epoch": 0.6418406072106262, "grad_norm": 2.595367193222046, "learning_rate": 1.588220266627628e-05, "loss": 0.375, "step": 1353 }, { "epoch": 0.642314990512334, "grad_norm": 2.4395711421966553, "learning_rate": 1.5875987655254947e-05, "loss": 0.3947, "step": 1354 }, { "epoch": 0.6427893738140418, "grad_norm": 2.616209030151367, "learning_rate": 1.5869769175696243e-05, "loss": 0.3637, "step": 1355 }, { "epoch": 0.6432637571157496, "grad_norm": 2.404573917388916, "learning_rate": 1.586354723127088e-05, "loss": 0.3973, "step": 1356 }, { "epoch": 0.6437381404174574, "grad_norm": 2.426798105239868, "learning_rate": 1.585732182565161e-05, "loss": 0.3932, "step": 1357 }, { "epoch": 0.644212523719165, "grad_norm": 2.2638189792633057, "learning_rate": 1.5851092962513224e-05, "loss": 0.3432, "step": 1358 }, { "epoch": 0.6446869070208728, "grad_norm": 3.1538186073303223, "learning_rate": 1.5844860645532563e-05, "loss": 0.3715, "step": 1359 }, { "epoch": 0.6451612903225806, "grad_norm": 2.5172009468078613, "learning_rate": 1.5838624878388498e-05, "loss": 0.3756, "step": 1360 }, { "epoch": 0.6456356736242884, "grad_norm": 2.5488150119781494, "learning_rate": 1.5832385664761943e-05, "loss": 0.4415, "step": 1361 }, { "epoch": 0.6461100569259962, "grad_norm": 2.245295763015747, "learning_rate": 1.582614300833585e-05, "loss": 0.3377, "step": 1362 }, { "epoch": 0.646584440227704, "grad_norm": 2.691495180130005, "learning_rate": 1.5819896912795186e-05, "loss": 0.3952, "step": 1363 }, { "epoch": 0.6470588235294118, "grad_norm": 2.450929880142212, "learning_rate": 1.5813647381826968e-05, "loss": 0.436, "step": 1364 }, { "epoch": 0.6475332068311196, "grad_norm": 2.736772298812866, "learning_rate": 1.5807394419120224e-05, "loss": 0.4265, "step": 1365 }, { "epoch": 0.6480075901328273, "grad_norm": 2.9314868450164795, "learning_rate": 1.5801138028366026e-05, "loss": 0.353, "step": 1366 }, { "epoch": 0.6484819734345351, "grad_norm": 3.4517641067504883, "learning_rate": 1.579487821325745e-05, "loss": 0.4646, "step": 1367 }, { "epoch": 0.6489563567362429, "grad_norm": 2.4029881954193115, "learning_rate": 1.5788614977489612e-05, "loss": 0.3996, "step": 1368 }, { "epoch": 0.6494307400379506, "grad_norm": 2.305480718612671, "learning_rate": 1.578234832475963e-05, "loss": 0.3684, "step": 1369 }, { "epoch": 0.6499051233396584, "grad_norm": 2.9141790866851807, "learning_rate": 1.5776078258766654e-05, "loss": 0.4405, "step": 1370 }, { "epoch": 0.6503795066413662, "grad_norm": 2.265993356704712, "learning_rate": 1.5769804783211837e-05, "loss": 0.3726, "step": 1371 }, { "epoch": 0.650853889943074, "grad_norm": 2.6375434398651123, "learning_rate": 1.576352790179835e-05, "loss": 0.3809, "step": 1372 }, { "epoch": 0.6513282732447818, "grad_norm": 2.3192057609558105, "learning_rate": 1.5757247618231378e-05, "loss": 0.4016, "step": 1373 }, { "epoch": 0.6518026565464896, "grad_norm": 2.4729015827178955, "learning_rate": 1.5750963936218104e-05, "loss": 0.4239, "step": 1374 }, { "epoch": 0.6522770398481973, "grad_norm": 2.1907765865325928, "learning_rate": 1.574467685946773e-05, "loss": 0.3452, "step": 1375 }, { "epoch": 0.6527514231499051, "grad_norm": 2.9924540519714355, "learning_rate": 1.573838639169145e-05, "loss": 0.5011, "step": 1376 }, { "epoch": 0.6532258064516129, "grad_norm": 2.7867164611816406, "learning_rate": 1.5732092536602466e-05, "loss": 0.4015, "step": 1377 }, { "epoch": 0.6537001897533207, "grad_norm": 1.9842467308044434, "learning_rate": 1.572579529791598e-05, "loss": 0.2994, "step": 1378 }, { "epoch": 0.6541745730550285, "grad_norm": 2.6319007873535156, "learning_rate": 1.571949467934919e-05, "loss": 0.4522, "step": 1379 }, { "epoch": 0.6546489563567363, "grad_norm": 2.6779913902282715, "learning_rate": 1.5713190684621285e-05, "loss": 0.3868, "step": 1380 }, { "epoch": 0.655123339658444, "grad_norm": 2.5989339351654053, "learning_rate": 1.5706883317453455e-05, "loss": 0.3916, "step": 1381 }, { "epoch": 0.6555977229601518, "grad_norm": 2.8662238121032715, "learning_rate": 1.5700572581568875e-05, "loss": 0.4695, "step": 1382 }, { "epoch": 0.6560721062618596, "grad_norm": 2.4219655990600586, "learning_rate": 1.569425848069271e-05, "loss": 0.4111, "step": 1383 }, { "epoch": 0.6565464895635673, "grad_norm": 2.6834099292755127, "learning_rate": 1.568794101855211e-05, "loss": 0.4125, "step": 1384 }, { "epoch": 0.6570208728652751, "grad_norm": 2.470796585083008, "learning_rate": 1.568162019887621e-05, "loss": 0.4157, "step": 1385 }, { "epoch": 0.6574952561669829, "grad_norm": 2.3338913917541504, "learning_rate": 1.567529602539613e-05, "loss": 0.3847, "step": 1386 }, { "epoch": 0.6579696394686907, "grad_norm": 2.07490873336792, "learning_rate": 1.5668968501844966e-05, "loss": 0.3439, "step": 1387 }, { "epoch": 0.6584440227703985, "grad_norm": 2.4413368701934814, "learning_rate": 1.5662637631957793e-05, "loss": 0.4105, "step": 1388 }, { "epoch": 0.6589184060721063, "grad_norm": 2.3040366172790527, "learning_rate": 1.565630341947166e-05, "loss": 0.3204, "step": 1389 }, { "epoch": 0.6593927893738141, "grad_norm": 2.9284796714782715, "learning_rate": 1.564996586812559e-05, "loss": 0.4296, "step": 1390 }, { "epoch": 0.6598671726755219, "grad_norm": 2.683302164077759, "learning_rate": 1.5643624981660573e-05, "loss": 0.3965, "step": 1391 }, { "epoch": 0.6603415559772297, "grad_norm": 2.779085397720337, "learning_rate": 1.563728076381958e-05, "loss": 0.4836, "step": 1392 }, { "epoch": 0.6608159392789373, "grad_norm": 2.2810940742492676, "learning_rate": 1.5630933218347536e-05, "loss": 0.3765, "step": 1393 }, { "epoch": 0.6612903225806451, "grad_norm": 2.6218605041503906, "learning_rate": 1.5624582348991327e-05, "loss": 0.3954, "step": 1394 }, { "epoch": 0.6617647058823529, "grad_norm": 2.3885746002197266, "learning_rate": 1.5618228159499823e-05, "loss": 0.3817, "step": 1395 }, { "epoch": 0.6622390891840607, "grad_norm": 3.312493085861206, "learning_rate": 1.5611870653623826e-05, "loss": 0.4092, "step": 1396 }, { "epoch": 0.6627134724857685, "grad_norm": 3.737312078475952, "learning_rate": 1.5605509835116115e-05, "loss": 0.4746, "step": 1397 }, { "epoch": 0.6631878557874763, "grad_norm": 2.460756778717041, "learning_rate": 1.5599145707731417e-05, "loss": 0.415, "step": 1398 }, { "epoch": 0.6636622390891841, "grad_norm": 2.247875690460205, "learning_rate": 1.5592778275226413e-05, "loss": 0.3812, "step": 1399 }, { "epoch": 0.6641366223908919, "grad_norm": 2.569827079772949, "learning_rate": 1.558640754135974e-05, "loss": 0.4392, "step": 1400 }, { "epoch": 0.6646110056925996, "grad_norm": 2.2257590293884277, "learning_rate": 1.558003350989197e-05, "loss": 0.3763, "step": 1401 }, { "epoch": 0.6650853889943074, "grad_norm": 2.3620169162750244, "learning_rate": 1.5573656184585643e-05, "loss": 0.4499, "step": 1402 }, { "epoch": 0.6655597722960152, "grad_norm": 2.4123148918151855, "learning_rate": 1.5567275569205216e-05, "loss": 0.3642, "step": 1403 }, { "epoch": 0.6660341555977229, "grad_norm": 2.121957302093506, "learning_rate": 1.556089166751712e-05, "loss": 0.3786, "step": 1404 }, { "epoch": 0.6665085388994307, "grad_norm": 2.452988624572754, "learning_rate": 1.555450448328969e-05, "loss": 0.4058, "step": 1405 }, { "epoch": 0.6669829222011385, "grad_norm": 2.358628988265991, "learning_rate": 1.554811402029323e-05, "loss": 0.4026, "step": 1406 }, { "epoch": 0.6674573055028463, "grad_norm": 2.781123638153076, "learning_rate": 1.554172028229997e-05, "loss": 0.455, "step": 1407 }, { "epoch": 0.6679316888045541, "grad_norm": 3.405427932739258, "learning_rate": 1.5535323273084062e-05, "loss": 0.456, "step": 1408 }, { "epoch": 0.6684060721062619, "grad_norm": 2.7615528106689453, "learning_rate": 1.55289229964216e-05, "loss": 0.3644, "step": 1409 }, { "epoch": 0.6688804554079696, "grad_norm": 3.0365653038024902, "learning_rate": 1.5522519456090604e-05, "loss": 0.5515, "step": 1410 }, { "epoch": 0.6693548387096774, "grad_norm": 2.3187596797943115, "learning_rate": 1.551611265587102e-05, "loss": 0.3391, "step": 1411 }, { "epoch": 0.6698292220113852, "grad_norm": 2.320058584213257, "learning_rate": 1.5509702599544723e-05, "loss": 0.3337, "step": 1412 }, { "epoch": 0.670303605313093, "grad_norm": 2.3811447620391846, "learning_rate": 1.55032892908955e-05, "loss": 0.3742, "step": 1413 }, { "epoch": 0.6707779886148008, "grad_norm": 2.2313289642333984, "learning_rate": 1.5496872733709067e-05, "loss": 0.3389, "step": 1414 }, { "epoch": 0.6712523719165086, "grad_norm": 2.9275193214416504, "learning_rate": 1.5490452931773053e-05, "loss": 0.4493, "step": 1415 }, { "epoch": 0.6717267552182163, "grad_norm": 2.3053927421569824, "learning_rate": 1.5484029888877004e-05, "loss": 0.3616, "step": 1416 }, { "epoch": 0.6722011385199241, "grad_norm": 2.178098201751709, "learning_rate": 1.547760360881238e-05, "loss": 0.3492, "step": 1417 }, { "epoch": 0.6726755218216319, "grad_norm": 2.574260950088501, "learning_rate": 1.547117409537254e-05, "loss": 0.4382, "step": 1418 }, { "epoch": 0.6731499051233396, "grad_norm": 2.7682528495788574, "learning_rate": 1.546474135235278e-05, "loss": 0.3529, "step": 1419 }, { "epoch": 0.6736242884250474, "grad_norm": 5.3996500968933105, "learning_rate": 1.5458305383550275e-05, "loss": 0.3958, "step": 1420 }, { "epoch": 0.6740986717267552, "grad_norm": 2.8296844959259033, "learning_rate": 1.545186619276411e-05, "loss": 0.4987, "step": 1421 }, { "epoch": 0.674573055028463, "grad_norm": 2.638801097869873, "learning_rate": 1.5445423783795283e-05, "loss": 0.4888, "step": 1422 }, { "epoch": 0.6750474383301708, "grad_norm": 2.253319025039673, "learning_rate": 1.5438978160446684e-05, "loss": 0.3512, "step": 1423 }, { "epoch": 0.6755218216318786, "grad_norm": 2.3527333736419678, "learning_rate": 1.543252932652309e-05, "loss": 0.3054, "step": 1424 }, { "epoch": 0.6759962049335864, "grad_norm": 2.6959636211395264, "learning_rate": 1.5426077285831195e-05, "loss": 0.3844, "step": 1425 }, { "epoch": 0.6764705882352942, "grad_norm": 2.884432792663574, "learning_rate": 1.5419622042179575e-05, "loss": 0.4787, "step": 1426 }, { "epoch": 0.676944971537002, "grad_norm": 3.0761313438415527, "learning_rate": 1.541316359937869e-05, "loss": 0.4422, "step": 1427 }, { "epoch": 0.6774193548387096, "grad_norm": 2.4010064601898193, "learning_rate": 1.54067019612409e-05, "loss": 0.4129, "step": 1428 }, { "epoch": 0.6778937381404174, "grad_norm": 2.568399429321289, "learning_rate": 1.5400237131580443e-05, "loss": 0.3751, "step": 1429 }, { "epoch": 0.6783681214421252, "grad_norm": 3.373952627182007, "learning_rate": 1.539376911421344e-05, "loss": 0.404, "step": 1430 }, { "epoch": 0.678842504743833, "grad_norm": 2.3124148845672607, "learning_rate": 1.5387297912957907e-05, "loss": 0.3388, "step": 1431 }, { "epoch": 0.6793168880455408, "grad_norm": 2.5099921226501465, "learning_rate": 1.5380823531633727e-05, "loss": 0.3389, "step": 1432 }, { "epoch": 0.6797912713472486, "grad_norm": 2.23146915435791, "learning_rate": 1.537434597406266e-05, "loss": 0.3419, "step": 1433 }, { "epoch": 0.6802656546489564, "grad_norm": 2.684089183807373, "learning_rate": 1.5367865244068346e-05, "loss": 0.445, "step": 1434 }, { "epoch": 0.6807400379506642, "grad_norm": 2.252448320388794, "learning_rate": 1.53613813454763e-05, "loss": 0.3182, "step": 1435 }, { "epoch": 0.681214421252372, "grad_norm": 2.7252156734466553, "learning_rate": 1.5354894282113892e-05, "loss": 0.4379, "step": 1436 }, { "epoch": 0.6816888045540797, "grad_norm": 2.7150869369506836, "learning_rate": 1.5348404057810383e-05, "loss": 0.405, "step": 1437 }, { "epoch": 0.6821631878557874, "grad_norm": 2.2281618118286133, "learning_rate": 1.534191067639688e-05, "loss": 0.3822, "step": 1438 }, { "epoch": 0.6826375711574952, "grad_norm": 2.0423622131347656, "learning_rate": 1.5335414141706366e-05, "loss": 0.3505, "step": 1439 }, { "epoch": 0.683111954459203, "grad_norm": 2.7963783740997314, "learning_rate": 1.5328914457573683e-05, "loss": 0.4724, "step": 1440 }, { "epoch": 0.6835863377609108, "grad_norm": 2.374962568283081, "learning_rate": 1.5322411627835526e-05, "loss": 0.3721, "step": 1441 }, { "epoch": 0.6840607210626186, "grad_norm": 2.619636058807373, "learning_rate": 1.531590565633045e-05, "loss": 0.4549, "step": 1442 }, { "epoch": 0.6845351043643264, "grad_norm": 2.1424806118011475, "learning_rate": 1.530939654689887e-05, "loss": 0.3219, "step": 1443 }, { "epoch": 0.6850094876660342, "grad_norm": 2.061220407485962, "learning_rate": 1.5302884303383046e-05, "loss": 0.358, "step": 1444 }, { "epoch": 0.6854838709677419, "grad_norm": 3.31339955329895, "learning_rate": 1.5296368929627097e-05, "loss": 0.5536, "step": 1445 }, { "epoch": 0.6859582542694497, "grad_norm": 2.542325973510742, "learning_rate": 1.528985042947697e-05, "loss": 0.4265, "step": 1446 }, { "epoch": 0.6864326375711575, "grad_norm": 2.6769909858703613, "learning_rate": 1.5283328806780488e-05, "loss": 0.4561, "step": 1447 }, { "epoch": 0.6869070208728653, "grad_norm": 2.1639342308044434, "learning_rate": 1.527680406538729e-05, "loss": 0.3672, "step": 1448 }, { "epoch": 0.687381404174573, "grad_norm": 2.8890748023986816, "learning_rate": 1.5270276209148867e-05, "loss": 0.3952, "step": 1449 }, { "epoch": 0.6878557874762808, "grad_norm": 2.939438819885254, "learning_rate": 1.526374524191855e-05, "loss": 0.4082, "step": 1450 }, { "epoch": 0.6883301707779886, "grad_norm": 2.6139535903930664, "learning_rate": 1.52572111675515e-05, "loss": 0.4213, "step": 1451 }, { "epoch": 0.6888045540796964, "grad_norm": 2.3408873081207275, "learning_rate": 1.5250673989904728e-05, "loss": 0.3499, "step": 1452 }, { "epoch": 0.6892789373814042, "grad_norm": 2.349148988723755, "learning_rate": 1.524413371283705e-05, "loss": 0.3571, "step": 1453 }, { "epoch": 0.6897533206831119, "grad_norm": 2.3099985122680664, "learning_rate": 1.5237590340209139e-05, "loss": 0.3955, "step": 1454 }, { "epoch": 0.6902277039848197, "grad_norm": 2.275421380996704, "learning_rate": 1.5231043875883474e-05, "loss": 0.3267, "step": 1455 }, { "epoch": 0.6907020872865275, "grad_norm": 2.5383238792419434, "learning_rate": 1.5224494323724374e-05, "loss": 0.355, "step": 1456 }, { "epoch": 0.6911764705882353, "grad_norm": 2.6643269062042236, "learning_rate": 1.5217941687597976e-05, "loss": 0.4296, "step": 1457 }, { "epoch": 0.6916508538899431, "grad_norm": 2.6603057384490967, "learning_rate": 1.5211385971372233e-05, "loss": 0.3736, "step": 1458 }, { "epoch": 0.6921252371916509, "grad_norm": 2.079129695892334, "learning_rate": 1.5204827178916919e-05, "loss": 0.3354, "step": 1459 }, { "epoch": 0.6925996204933587, "grad_norm": 2.422872304916382, "learning_rate": 1.5198265314103626e-05, "loss": 0.4841, "step": 1460 }, { "epoch": 0.6930740037950665, "grad_norm": 2.4735107421875, "learning_rate": 1.5191700380805754e-05, "loss": 0.3615, "step": 1461 }, { "epoch": 0.6935483870967742, "grad_norm": 2.0199763774871826, "learning_rate": 1.5185132382898524e-05, "loss": 0.2761, "step": 1462 }, { "epoch": 0.6940227703984819, "grad_norm": 2.425189733505249, "learning_rate": 1.5178561324258956e-05, "loss": 0.4204, "step": 1463 }, { "epoch": 0.6944971537001897, "grad_norm": 2.212663173675537, "learning_rate": 1.5171987208765884e-05, "loss": 0.35, "step": 1464 }, { "epoch": 0.6949715370018975, "grad_norm": 3.0007028579711914, "learning_rate": 1.5165410040299942e-05, "loss": 0.424, "step": 1465 }, { "epoch": 0.6954459203036053, "grad_norm": 2.555004835128784, "learning_rate": 1.5158829822743567e-05, "loss": 0.4113, "step": 1466 }, { "epoch": 0.6959203036053131, "grad_norm": 2.6518445014953613, "learning_rate": 1.5152246559980994e-05, "loss": 0.3866, "step": 1467 }, { "epoch": 0.6963946869070209, "grad_norm": 2.7971367835998535, "learning_rate": 1.5145660255898262e-05, "loss": 0.3921, "step": 1468 }, { "epoch": 0.6968690702087287, "grad_norm": 2.154005765914917, "learning_rate": 1.5139070914383198e-05, "loss": 0.334, "step": 1469 }, { "epoch": 0.6973434535104365, "grad_norm": 2.5318751335144043, "learning_rate": 1.5132478539325428e-05, "loss": 0.3787, "step": 1470 }, { "epoch": 0.6978178368121443, "grad_norm": 2.6897404193878174, "learning_rate": 1.5125883134616363e-05, "loss": 0.3783, "step": 1471 }, { "epoch": 0.698292220113852, "grad_norm": 2.307647943496704, "learning_rate": 1.5119284704149211e-05, "loss": 0.3814, "step": 1472 }, { "epoch": 0.6987666034155597, "grad_norm": 2.4573044776916504, "learning_rate": 1.5112683251818952e-05, "loss": 0.42, "step": 1473 }, { "epoch": 0.6992409867172675, "grad_norm": 2.418071985244751, "learning_rate": 1.5106078781522363e-05, "loss": 0.3043, "step": 1474 }, { "epoch": 0.6997153700189753, "grad_norm": 2.5575947761535645, "learning_rate": 1.509947129715799e-05, "loss": 0.4268, "step": 1475 }, { "epoch": 0.7001897533206831, "grad_norm": 2.537424087524414, "learning_rate": 1.5092860802626179e-05, "loss": 0.4167, "step": 1476 }, { "epoch": 0.7006641366223909, "grad_norm": 2.9543752670288086, "learning_rate": 1.5086247301829028e-05, "loss": 0.407, "step": 1477 }, { "epoch": 0.7011385199240987, "grad_norm": 2.10909104347229, "learning_rate": 1.507963079867043e-05, "loss": 0.3587, "step": 1478 }, { "epoch": 0.7016129032258065, "grad_norm": 2.046889543533325, "learning_rate": 1.5073011297056033e-05, "loss": 0.3019, "step": 1479 }, { "epoch": 0.7020872865275142, "grad_norm": 2.1816864013671875, "learning_rate": 1.5066388800893266e-05, "loss": 0.349, "step": 1480 }, { "epoch": 0.702561669829222, "grad_norm": 1.9779382944107056, "learning_rate": 1.5059763314091326e-05, "loss": 0.3138, "step": 1481 }, { "epoch": 0.7030360531309298, "grad_norm": 3.019711971282959, "learning_rate": 1.505313484056117e-05, "loss": 0.466, "step": 1482 }, { "epoch": 0.7035104364326376, "grad_norm": 2.450096607208252, "learning_rate": 1.5046503384215521e-05, "loss": 0.3534, "step": 1483 }, { "epoch": 0.7039848197343453, "grad_norm": 2.3313405513763428, "learning_rate": 1.503986894896886e-05, "loss": 0.4373, "step": 1484 }, { "epoch": 0.7044592030360531, "grad_norm": 2.4880897998809814, "learning_rate": 1.5033231538737432e-05, "loss": 0.4231, "step": 1485 }, { "epoch": 0.7049335863377609, "grad_norm": 2.491865873336792, "learning_rate": 1.502659115743923e-05, "loss": 0.4453, "step": 1486 }, { "epoch": 0.7054079696394687, "grad_norm": 2.641712188720703, "learning_rate": 1.501994780899401e-05, "loss": 0.4031, "step": 1487 }, { "epoch": 0.7058823529411765, "grad_norm": 2.309870719909668, "learning_rate": 1.5013301497323274e-05, "loss": 0.3533, "step": 1488 }, { "epoch": 0.7063567362428842, "grad_norm": 2.5076959133148193, "learning_rate": 1.5006652226350272e-05, "loss": 0.3699, "step": 1489 }, { "epoch": 0.706831119544592, "grad_norm": 2.0265212059020996, "learning_rate": 1.5000000000000002e-05, "loss": 0.3459, "step": 1490 }, { "epoch": 0.7073055028462998, "grad_norm": 2.6799657344818115, "learning_rate": 1.499334482219921e-05, "loss": 0.4195, "step": 1491 }, { "epoch": 0.7077798861480076, "grad_norm": 2.5318443775177, "learning_rate": 1.4986686696876381e-05, "loss": 0.3655, "step": 1492 }, { "epoch": 0.7082542694497154, "grad_norm": 2.371875047683716, "learning_rate": 1.498002562796174e-05, "loss": 0.3385, "step": 1493 }, { "epoch": 0.7087286527514232, "grad_norm": 3.2066407203674316, "learning_rate": 1.497336161938725e-05, "loss": 0.3932, "step": 1494 }, { "epoch": 0.709203036053131, "grad_norm": 2.56129789352417, "learning_rate": 1.4966694675086611e-05, "loss": 0.3833, "step": 1495 }, { "epoch": 0.7096774193548387, "grad_norm": 1.934271216392517, "learning_rate": 1.4960024798995252e-05, "loss": 0.3025, "step": 1496 }, { "epoch": 0.7101518026565465, "grad_norm": 2.200591564178467, "learning_rate": 1.4953351995050336e-05, "loss": 0.3285, "step": 1497 }, { "epoch": 0.7106261859582542, "grad_norm": 2.4421536922454834, "learning_rate": 1.4946676267190751e-05, "loss": 0.3932, "step": 1498 }, { "epoch": 0.711100569259962, "grad_norm": 2.842991590499878, "learning_rate": 1.4939997619357116e-05, "loss": 0.4339, "step": 1499 }, { "epoch": 0.7115749525616698, "grad_norm": 2.2917048931121826, "learning_rate": 1.493331605549177e-05, "loss": 0.3514, "step": 1500 }, { "epoch": 0.7120493358633776, "grad_norm": 2.323843002319336, "learning_rate": 1.4926631579538775e-05, "loss": 0.3419, "step": 1501 }, { "epoch": 0.7125237191650854, "grad_norm": 2.5014545917510986, "learning_rate": 1.4919944195443908e-05, "loss": 0.46, "step": 1502 }, { "epoch": 0.7129981024667932, "grad_norm": 2.414332389831543, "learning_rate": 1.4913253907154666e-05, "loss": 0.3509, "step": 1503 }, { "epoch": 0.713472485768501, "grad_norm": 2.879897117614746, "learning_rate": 1.4906560718620265e-05, "loss": 0.318, "step": 1504 }, { "epoch": 0.7139468690702088, "grad_norm": 2.6578876972198486, "learning_rate": 1.4899864633791623e-05, "loss": 0.3298, "step": 1505 }, { "epoch": 0.7144212523719166, "grad_norm": 2.390251636505127, "learning_rate": 1.4893165656621371e-05, "loss": 0.3778, "step": 1506 }, { "epoch": 0.7148956356736242, "grad_norm": 3.521209478378296, "learning_rate": 1.4886463791063854e-05, "loss": 0.4522, "step": 1507 }, { "epoch": 0.715370018975332, "grad_norm": 2.280897617340088, "learning_rate": 1.4879759041075117e-05, "loss": 0.3888, "step": 1508 }, { "epoch": 0.7158444022770398, "grad_norm": 2.56290340423584, "learning_rate": 1.4873051410612905e-05, "loss": 0.3578, "step": 1509 }, { "epoch": 0.7163187855787476, "grad_norm": 2.607135534286499, "learning_rate": 1.486634090363666e-05, "loss": 0.4305, "step": 1510 }, { "epoch": 0.7167931688804554, "grad_norm": 2.685330629348755, "learning_rate": 1.4859627524107538e-05, "loss": 0.4462, "step": 1511 }, { "epoch": 0.7172675521821632, "grad_norm": 2.119518280029297, "learning_rate": 1.4852911275988379e-05, "loss": 0.3338, "step": 1512 }, { "epoch": 0.717741935483871, "grad_norm": 2.616091251373291, "learning_rate": 1.4846192163243713e-05, "loss": 0.387, "step": 1513 }, { "epoch": 0.7182163187855788, "grad_norm": 2.2375452518463135, "learning_rate": 1.483947018983977e-05, "loss": 0.3961, "step": 1514 }, { "epoch": 0.7186907020872866, "grad_norm": 2.1125147342681885, "learning_rate": 1.483274535974446e-05, "loss": 0.3652, "step": 1515 }, { "epoch": 0.7191650853889943, "grad_norm": 2.8368351459503174, "learning_rate": 1.482601767692739e-05, "loss": 0.4423, "step": 1516 }, { "epoch": 0.719639468690702, "grad_norm": 2.951655864715576, "learning_rate": 1.4819287145359837e-05, "loss": 0.4042, "step": 1517 }, { "epoch": 0.7201138519924098, "grad_norm": 2.661517381668091, "learning_rate": 1.481255376901477e-05, "loss": 0.3976, "step": 1518 }, { "epoch": 0.7205882352941176, "grad_norm": 2.2923014163970947, "learning_rate": 1.4805817551866839e-05, "loss": 0.354, "step": 1519 }, { "epoch": 0.7210626185958254, "grad_norm": 2.025585174560547, "learning_rate": 1.479907849789236e-05, "loss": 0.3351, "step": 1520 }, { "epoch": 0.7215370018975332, "grad_norm": 2.3796160221099854, "learning_rate": 1.4792336611069335e-05, "loss": 0.4202, "step": 1521 }, { "epoch": 0.722011385199241, "grad_norm": 2.3229618072509766, "learning_rate": 1.478559189537743e-05, "loss": 0.3969, "step": 1522 }, { "epoch": 0.7224857685009488, "grad_norm": 2.162994861602783, "learning_rate": 1.4778844354797985e-05, "loss": 0.302, "step": 1523 }, { "epoch": 0.7229601518026565, "grad_norm": 2.562293767929077, "learning_rate": 1.4772093993314005e-05, "loss": 0.3537, "step": 1524 }, { "epoch": 0.7234345351043643, "grad_norm": 2.323808431625366, "learning_rate": 1.4765340814910163e-05, "loss": 0.3961, "step": 1525 }, { "epoch": 0.7239089184060721, "grad_norm": 2.3642516136169434, "learning_rate": 1.4758584823572792e-05, "loss": 0.3491, "step": 1526 }, { "epoch": 0.7243833017077799, "grad_norm": 2.2448556423187256, "learning_rate": 1.4751826023289889e-05, "loss": 0.3941, "step": 1527 }, { "epoch": 0.7248576850094877, "grad_norm": 2.7379980087280273, "learning_rate": 1.4745064418051107e-05, "loss": 0.4637, "step": 1528 }, { "epoch": 0.7253320683111955, "grad_norm": 2.4983396530151367, "learning_rate": 1.4738300011847752e-05, "loss": 0.3884, "step": 1529 }, { "epoch": 0.7258064516129032, "grad_norm": 2.730942964553833, "learning_rate": 1.4731532808672785e-05, "loss": 0.4037, "step": 1530 }, { "epoch": 0.726280834914611, "grad_norm": 2.594155788421631, "learning_rate": 1.4724762812520825e-05, "loss": 0.4064, "step": 1531 }, { "epoch": 0.7267552182163188, "grad_norm": 2.7857630252838135, "learning_rate": 1.4717990027388129e-05, "loss": 0.3294, "step": 1532 }, { "epoch": 0.7272296015180265, "grad_norm": 2.5247364044189453, "learning_rate": 1.4711214457272611e-05, "loss": 0.4019, "step": 1533 }, { "epoch": 0.7277039848197343, "grad_norm": 2.9784631729125977, "learning_rate": 1.4704436106173813e-05, "loss": 0.4528, "step": 1534 }, { "epoch": 0.7281783681214421, "grad_norm": 2.051332950592041, "learning_rate": 1.4697654978092935e-05, "loss": 0.3219, "step": 1535 }, { "epoch": 0.7286527514231499, "grad_norm": 2.4342894554138184, "learning_rate": 1.4690871077032808e-05, "loss": 0.4505, "step": 1536 }, { "epoch": 0.7291271347248577, "grad_norm": 2.270163059234619, "learning_rate": 1.4684084406997903e-05, "loss": 0.3858, "step": 1537 }, { "epoch": 0.7296015180265655, "grad_norm": 2.4693052768707275, "learning_rate": 1.4677294971994325e-05, "loss": 0.3749, "step": 1538 }, { "epoch": 0.7300759013282733, "grad_norm": 2.4099597930908203, "learning_rate": 1.4670502776029804e-05, "loss": 0.328, "step": 1539 }, { "epoch": 0.7305502846299811, "grad_norm": 3.063075304031372, "learning_rate": 1.4663707823113717e-05, "loss": 0.4566, "step": 1540 }, { "epoch": 0.7310246679316889, "grad_norm": 1.9556933641433716, "learning_rate": 1.4656910117257049e-05, "loss": 0.3263, "step": 1541 }, { "epoch": 0.7314990512333965, "grad_norm": 2.0928752422332764, "learning_rate": 1.4650109662472422e-05, "loss": 0.314, "step": 1542 }, { "epoch": 0.7319734345351043, "grad_norm": 2.1672606468200684, "learning_rate": 1.4643306462774071e-05, "loss": 0.2977, "step": 1543 }, { "epoch": 0.7324478178368121, "grad_norm": 2.769050121307373, "learning_rate": 1.4636500522177868e-05, "loss": 0.3666, "step": 1544 }, { "epoch": 0.7329222011385199, "grad_norm": 2.6004836559295654, "learning_rate": 1.4629691844701288e-05, "loss": 0.3975, "step": 1545 }, { "epoch": 0.7333965844402277, "grad_norm": 2.554579019546509, "learning_rate": 1.462288043436342e-05, "loss": 0.4653, "step": 1546 }, { "epoch": 0.7338709677419355, "grad_norm": 2.2475826740264893, "learning_rate": 1.461606629518498e-05, "loss": 0.4302, "step": 1547 }, { "epoch": 0.7343453510436433, "grad_norm": 2.2661335468292236, "learning_rate": 1.460924943118828e-05, "loss": 0.3663, "step": 1548 }, { "epoch": 0.7348197343453511, "grad_norm": 3.354585647583008, "learning_rate": 1.4602429846397254e-05, "loss": 0.385, "step": 1549 }, { "epoch": 0.7352941176470589, "grad_norm": 2.5863516330718994, "learning_rate": 1.4595607544837435e-05, "loss": 0.4031, "step": 1550 }, { "epoch": 0.7357685009487666, "grad_norm": 2.85939359664917, "learning_rate": 1.4588782530535955e-05, "loss": 0.4245, "step": 1551 }, { "epoch": 0.7362428842504743, "grad_norm": 2.234726667404175, "learning_rate": 1.4581954807521555e-05, "loss": 0.314, "step": 1552 }, { "epoch": 0.7367172675521821, "grad_norm": 2.442960262298584, "learning_rate": 1.4575124379824574e-05, "loss": 0.3825, "step": 1553 }, { "epoch": 0.7371916508538899, "grad_norm": 3.2431604862213135, "learning_rate": 1.4568291251476944e-05, "loss": 0.4371, "step": 1554 }, { "epoch": 0.7376660341555977, "grad_norm": 2.6331787109375, "learning_rate": 1.4561455426512192e-05, "loss": 0.4168, "step": 1555 }, { "epoch": 0.7381404174573055, "grad_norm": 2.5787758827209473, "learning_rate": 1.455461690896544e-05, "loss": 0.3975, "step": 1556 }, { "epoch": 0.7386148007590133, "grad_norm": 2.2691023349761963, "learning_rate": 1.45477757028734e-05, "loss": 0.3795, "step": 1557 }, { "epoch": 0.7390891840607211, "grad_norm": 2.3956029415130615, "learning_rate": 1.4540931812274359e-05, "loss": 0.393, "step": 1558 }, { "epoch": 0.7395635673624289, "grad_norm": 2.2934021949768066, "learning_rate": 1.4534085241208206e-05, "loss": 0.3465, "step": 1559 }, { "epoch": 0.7400379506641366, "grad_norm": 2.020655632019043, "learning_rate": 1.4527235993716402e-05, "loss": 0.2692, "step": 1560 }, { "epoch": 0.7405123339658444, "grad_norm": 2.299783945083618, "learning_rate": 1.4520384073841991e-05, "loss": 0.3394, "step": 1561 }, { "epoch": 0.7409867172675522, "grad_norm": 2.876800298690796, "learning_rate": 1.4513529485629591e-05, "loss": 0.3533, "step": 1562 }, { "epoch": 0.74146110056926, "grad_norm": 2.770765542984009, "learning_rate": 1.4506672233125398e-05, "loss": 0.3855, "step": 1563 }, { "epoch": 0.7419354838709677, "grad_norm": 2.479526996612549, "learning_rate": 1.4499812320377182e-05, "loss": 0.4417, "step": 1564 }, { "epoch": 0.7424098671726755, "grad_norm": 2.534569025039673, "learning_rate": 1.4492949751434282e-05, "loss": 0.346, "step": 1565 }, { "epoch": 0.7428842504743833, "grad_norm": 2.4352223873138428, "learning_rate": 1.4486084530347604e-05, "loss": 0.3262, "step": 1566 }, { "epoch": 0.7433586337760911, "grad_norm": 2.7315893173217773, "learning_rate": 1.4479216661169618e-05, "loss": 0.3896, "step": 1567 }, { "epoch": 0.7438330170777988, "grad_norm": 2.1865360736846924, "learning_rate": 1.4472346147954356e-05, "loss": 0.4102, "step": 1568 }, { "epoch": 0.7443074003795066, "grad_norm": 1.9840688705444336, "learning_rate": 1.4465472994757429e-05, "loss": 0.2967, "step": 1569 }, { "epoch": 0.7447817836812144, "grad_norm": 2.338481903076172, "learning_rate": 1.4458597205635973e-05, "loss": 0.3746, "step": 1570 }, { "epoch": 0.7452561669829222, "grad_norm": 2.240323543548584, "learning_rate": 1.445171878464871e-05, "loss": 0.3847, "step": 1571 }, { "epoch": 0.74573055028463, "grad_norm": 2.151249408721924, "learning_rate": 1.4444837735855896e-05, "loss": 0.3707, "step": 1572 }, { "epoch": 0.7462049335863378, "grad_norm": 2.2230353355407715, "learning_rate": 1.4437954063319352e-05, "loss": 0.3636, "step": 1573 }, { "epoch": 0.7466793168880456, "grad_norm": 2.0856575965881348, "learning_rate": 1.4431067771102443e-05, "loss": 0.2867, "step": 1574 }, { "epoch": 0.7471537001897534, "grad_norm": 2.0492308139801025, "learning_rate": 1.442417886327007e-05, "loss": 0.3734, "step": 1575 }, { "epoch": 0.7476280834914611, "grad_norm": 2.09089994430542, "learning_rate": 1.4417287343888698e-05, "loss": 0.3346, "step": 1576 }, { "epoch": 0.7481024667931688, "grad_norm": 2.4338037967681885, "learning_rate": 1.4410393217026317e-05, "loss": 0.3388, "step": 1577 }, { "epoch": 0.7485768500948766, "grad_norm": 2.734534740447998, "learning_rate": 1.4403496486752465e-05, "loss": 0.4177, "step": 1578 }, { "epoch": 0.7490512333965844, "grad_norm": 3.0047781467437744, "learning_rate": 1.4396597157138205e-05, "loss": 0.3927, "step": 1579 }, { "epoch": 0.7495256166982922, "grad_norm": 2.755629062652588, "learning_rate": 1.4389695232256151e-05, "loss": 0.4591, "step": 1580 }, { "epoch": 0.75, "grad_norm": 2.1415135860443115, "learning_rate": 1.4382790716180446e-05, "loss": 0.2856, "step": 1581 }, { "epoch": 0.7504743833017078, "grad_norm": 2.5595362186431885, "learning_rate": 1.4375883612986744e-05, "loss": 0.3146, "step": 1582 }, { "epoch": 0.7509487666034156, "grad_norm": 3.115945339202881, "learning_rate": 1.4368973926752248e-05, "loss": 0.4559, "step": 1583 }, { "epoch": 0.7514231499051234, "grad_norm": 2.1239240169525146, "learning_rate": 1.4362061661555675e-05, "loss": 0.3491, "step": 1584 }, { "epoch": 0.7518975332068312, "grad_norm": 2.0848984718322754, "learning_rate": 1.435514682147727e-05, "loss": 0.3327, "step": 1585 }, { "epoch": 0.7523719165085389, "grad_norm": 2.5240261554718018, "learning_rate": 1.4348229410598791e-05, "loss": 0.3467, "step": 1586 }, { "epoch": 0.7528462998102466, "grad_norm": 2.0627050399780273, "learning_rate": 1.4341309433003518e-05, "loss": 0.3109, "step": 1587 }, { "epoch": 0.7533206831119544, "grad_norm": 2.349839210510254, "learning_rate": 1.4334386892776246e-05, "loss": 0.3552, "step": 1588 }, { "epoch": 0.7537950664136622, "grad_norm": 3.1062917709350586, "learning_rate": 1.4327461794003284e-05, "loss": 0.4663, "step": 1589 }, { "epoch": 0.75426944971537, "grad_norm": 2.2856526374816895, "learning_rate": 1.4320534140772447e-05, "loss": 0.3425, "step": 1590 }, { "epoch": 0.7547438330170778, "grad_norm": 2.193957567214966, "learning_rate": 1.4313603937173058e-05, "loss": 0.3448, "step": 1591 }, { "epoch": 0.7552182163187856, "grad_norm": 2.4013428688049316, "learning_rate": 1.4306671187295948e-05, "loss": 0.3795, "step": 1592 }, { "epoch": 0.7556925996204934, "grad_norm": 2.1507675647735596, "learning_rate": 1.4299735895233457e-05, "loss": 0.307, "step": 1593 }, { "epoch": 0.7561669829222012, "grad_norm": 2.046177864074707, "learning_rate": 1.4292798065079413e-05, "loss": 0.3221, "step": 1594 }, { "epoch": 0.7566413662239089, "grad_norm": 1.7904236316680908, "learning_rate": 1.428585770092915e-05, "loss": 0.3009, "step": 1595 }, { "epoch": 0.7571157495256167, "grad_norm": 2.811075210571289, "learning_rate": 1.4278914806879494e-05, "loss": 0.4872, "step": 1596 }, { "epoch": 0.7575901328273245, "grad_norm": 2.26833176612854, "learning_rate": 1.4271969387028773e-05, "loss": 0.3738, "step": 1597 }, { "epoch": 0.7580645161290323, "grad_norm": 2.7194955348968506, "learning_rate": 1.4265021445476794e-05, "loss": 0.4348, "step": 1598 }, { "epoch": 0.75853889943074, "grad_norm": 2.353123664855957, "learning_rate": 1.4258070986324859e-05, "loss": 0.3861, "step": 1599 }, { "epoch": 0.7590132827324478, "grad_norm": 4.482807636260986, "learning_rate": 1.4251118013675758e-05, "loss": 0.3265, "step": 1600 }, { "epoch": 0.7594876660341556, "grad_norm": 2.567732810974121, "learning_rate": 1.424416253163376e-05, "loss": 0.4312, "step": 1601 }, { "epoch": 0.7599620493358634, "grad_norm": 2.412712335586548, "learning_rate": 1.4237204544304616e-05, "loss": 0.4085, "step": 1602 }, { "epoch": 0.7604364326375711, "grad_norm": 2.1643178462982178, "learning_rate": 1.423024405579556e-05, "loss": 0.3099, "step": 1603 }, { "epoch": 0.7609108159392789, "grad_norm": 2.2639448642730713, "learning_rate": 1.4223281070215297e-05, "loss": 0.3819, "step": 1604 }, { "epoch": 0.7613851992409867, "grad_norm": 2.341059446334839, "learning_rate": 1.421631559167401e-05, "loss": 0.4073, "step": 1605 }, { "epoch": 0.7618595825426945, "grad_norm": 2.276641845703125, "learning_rate": 1.4209347624283352e-05, "loss": 0.4333, "step": 1606 }, { "epoch": 0.7623339658444023, "grad_norm": 2.5862913131713867, "learning_rate": 1.4202377172156443e-05, "loss": 0.3642, "step": 1607 }, { "epoch": 0.7628083491461101, "grad_norm": 2.1797101497650146, "learning_rate": 1.4195404239407873e-05, "loss": 0.35, "step": 1608 }, { "epoch": 0.7632827324478179, "grad_norm": 2.534806251525879, "learning_rate": 1.4188428830153698e-05, "loss": 0.4342, "step": 1609 }, { "epoch": 0.7637571157495257, "grad_norm": 2.4096391201019287, "learning_rate": 1.4181450948511431e-05, "loss": 0.3694, "step": 1610 }, { "epoch": 0.7642314990512334, "grad_norm": 2.1816611289978027, "learning_rate": 1.4174470598600048e-05, "loss": 0.3174, "step": 1611 }, { "epoch": 0.7647058823529411, "grad_norm": 2.178921699523926, "learning_rate": 1.4167487784539973e-05, "loss": 0.3492, "step": 1612 }, { "epoch": 0.7651802656546489, "grad_norm": 2.4309422969818115, "learning_rate": 1.4160502510453103e-05, "loss": 0.3701, "step": 1613 }, { "epoch": 0.7656546489563567, "grad_norm": 2.0590929985046387, "learning_rate": 1.4153514780462767e-05, "loss": 0.305, "step": 1614 }, { "epoch": 0.7661290322580645, "grad_norm": 2.736320972442627, "learning_rate": 1.4146524598693758e-05, "loss": 0.4535, "step": 1615 }, { "epoch": 0.7666034155597723, "grad_norm": 2.418962001800537, "learning_rate": 1.4139531969272313e-05, "loss": 0.3975, "step": 1616 }, { "epoch": 0.7670777988614801, "grad_norm": 2.187948703765869, "learning_rate": 1.4132536896326105e-05, "loss": 0.3178, "step": 1617 }, { "epoch": 0.7675521821631879, "grad_norm": 2.1725316047668457, "learning_rate": 1.4125539383984264e-05, "loss": 0.3138, "step": 1618 }, { "epoch": 0.7680265654648957, "grad_norm": 2.384047508239746, "learning_rate": 1.4118539436377345e-05, "loss": 0.3081, "step": 1619 }, { "epoch": 0.7685009487666035, "grad_norm": 2.3366310596466064, "learning_rate": 1.4111537057637353e-05, "loss": 0.335, "step": 1620 }, { "epoch": 0.7689753320683111, "grad_norm": 2.152684211730957, "learning_rate": 1.4104532251897722e-05, "loss": 0.3145, "step": 1621 }, { "epoch": 0.7694497153700189, "grad_norm": 3.2571208477020264, "learning_rate": 1.409752502329332e-05, "loss": 0.297, "step": 1622 }, { "epoch": 0.7699240986717267, "grad_norm": 2.3170127868652344, "learning_rate": 1.409051537596044e-05, "loss": 0.3196, "step": 1623 }, { "epoch": 0.7703984819734345, "grad_norm": 2.442521810531616, "learning_rate": 1.4083503314036813e-05, "loss": 0.318, "step": 1624 }, { "epoch": 0.7708728652751423, "grad_norm": 3.705756902694702, "learning_rate": 1.4076488841661588e-05, "loss": 0.3793, "step": 1625 }, { "epoch": 0.7713472485768501, "grad_norm": 2.7555148601531982, "learning_rate": 1.406947196297534e-05, "loss": 0.3927, "step": 1626 }, { "epoch": 0.7718216318785579, "grad_norm": 2.6326348781585693, "learning_rate": 1.4062452682120056e-05, "loss": 0.4274, "step": 1627 }, { "epoch": 0.7722960151802657, "grad_norm": 2.276700258255005, "learning_rate": 1.4055431003239156e-05, "loss": 0.3913, "step": 1628 }, { "epoch": 0.7727703984819735, "grad_norm": 2.765024423599243, "learning_rate": 1.4048406930477465e-05, "loss": 0.4786, "step": 1629 }, { "epoch": 0.7732447817836812, "grad_norm": 2.5013036727905273, "learning_rate": 1.4041380467981225e-05, "loss": 0.4325, "step": 1630 }, { "epoch": 0.773719165085389, "grad_norm": 2.752293348312378, "learning_rate": 1.4034351619898088e-05, "loss": 0.3946, "step": 1631 }, { "epoch": 0.7741935483870968, "grad_norm": 2.66115665435791, "learning_rate": 1.4027320390377106e-05, "loss": 0.4486, "step": 1632 }, { "epoch": 0.7746679316888045, "grad_norm": 2.023422956466675, "learning_rate": 1.4020286783568753e-05, "loss": 0.3411, "step": 1633 }, { "epoch": 0.7751423149905123, "grad_norm": 2.1489474773406982, "learning_rate": 1.4013250803624894e-05, "loss": 0.3351, "step": 1634 }, { "epoch": 0.7756166982922201, "grad_norm": 2.9013800621032715, "learning_rate": 1.4006212454698798e-05, "loss": 0.4143, "step": 1635 }, { "epoch": 0.7760910815939279, "grad_norm": 2.3680756092071533, "learning_rate": 1.3999171740945132e-05, "loss": 0.3711, "step": 1636 }, { "epoch": 0.7765654648956357, "grad_norm": 2.0781219005584717, "learning_rate": 1.3992128666519961e-05, "loss": 0.3542, "step": 1637 }, { "epoch": 0.7770398481973435, "grad_norm": 1.984551191329956, "learning_rate": 1.3985083235580743e-05, "loss": 0.315, "step": 1638 }, { "epoch": 0.7775142314990512, "grad_norm": 1.8971257209777832, "learning_rate": 1.3978035452286325e-05, "loss": 0.2914, "step": 1639 }, { "epoch": 0.777988614800759, "grad_norm": 2.8102481365203857, "learning_rate": 1.3970985320796943e-05, "loss": 0.3875, "step": 1640 }, { "epoch": 0.7784629981024668, "grad_norm": 2.2260053157806396, "learning_rate": 1.396393284527422e-05, "loss": 0.3428, "step": 1641 }, { "epoch": 0.7789373814041746, "grad_norm": 2.3798251152038574, "learning_rate": 1.3956878029881167e-05, "loss": 0.3244, "step": 1642 }, { "epoch": 0.7794117647058824, "grad_norm": 2.634979009628296, "learning_rate": 1.3949820878782166e-05, "loss": 0.317, "step": 1643 }, { "epoch": 0.7798861480075902, "grad_norm": 2.1837527751922607, "learning_rate": 1.3942761396142982e-05, "loss": 0.3588, "step": 1644 }, { "epoch": 0.780360531309298, "grad_norm": 2.22882080078125, "learning_rate": 1.3935699586130767e-05, "loss": 0.3373, "step": 1645 }, { "epoch": 0.7808349146110057, "grad_norm": 2.3200697898864746, "learning_rate": 1.3928635452914028e-05, "loss": 0.3661, "step": 1646 }, { "epoch": 0.7813092979127134, "grad_norm": 2.158055543899536, "learning_rate": 1.3921569000662658e-05, "loss": 0.3346, "step": 1647 }, { "epoch": 0.7817836812144212, "grad_norm": 2.5481321811676025, "learning_rate": 1.3914500233547909e-05, "loss": 0.3935, "step": 1648 }, { "epoch": 0.782258064516129, "grad_norm": 2.6886203289031982, "learning_rate": 1.3907429155742414e-05, "loss": 0.4086, "step": 1649 }, { "epoch": 0.7827324478178368, "grad_norm": 2.635003089904785, "learning_rate": 1.3900355771420151e-05, "loss": 0.4449, "step": 1650 }, { "epoch": 0.7832068311195446, "grad_norm": 2.5500478744506836, "learning_rate": 1.3893280084756468e-05, "loss": 0.3725, "step": 1651 }, { "epoch": 0.7836812144212524, "grad_norm": 2.0035715103149414, "learning_rate": 1.3886202099928083e-05, "loss": 0.3206, "step": 1652 }, { "epoch": 0.7841555977229602, "grad_norm": 2.1694624423980713, "learning_rate": 1.387912182111305e-05, "loss": 0.3578, "step": 1653 }, { "epoch": 0.784629981024668, "grad_norm": 2.505841016769409, "learning_rate": 1.3872039252490796e-05, "loss": 0.395, "step": 1654 }, { "epoch": 0.7851043643263758, "grad_norm": 1.9906131029129028, "learning_rate": 1.386495439824208e-05, "loss": 0.3343, "step": 1655 }, { "epoch": 0.7855787476280834, "grad_norm": 2.116349458694458, "learning_rate": 1.3857867262549032e-05, "loss": 0.3224, "step": 1656 }, { "epoch": 0.7860531309297912, "grad_norm": 2.3191020488739014, "learning_rate": 1.3850777849595114e-05, "loss": 0.4305, "step": 1657 }, { "epoch": 0.786527514231499, "grad_norm": 2.867265224456787, "learning_rate": 1.3843686163565138e-05, "loss": 0.4917, "step": 1658 }, { "epoch": 0.7870018975332068, "grad_norm": 2.6565215587615967, "learning_rate": 1.3836592208645252e-05, "loss": 0.3741, "step": 1659 }, { "epoch": 0.7874762808349146, "grad_norm": 2.156074047088623, "learning_rate": 1.382949598902295e-05, "loss": 0.352, "step": 1660 }, { "epoch": 0.7879506641366224, "grad_norm": 2.667464017868042, "learning_rate": 1.3822397508887063e-05, "loss": 0.4249, "step": 1661 }, { "epoch": 0.7884250474383302, "grad_norm": 2.4767210483551025, "learning_rate": 1.381529677242775e-05, "loss": 0.3748, "step": 1662 }, { "epoch": 0.788899430740038, "grad_norm": 2.4702491760253906, "learning_rate": 1.38081937838365e-05, "loss": 0.3526, "step": 1663 }, { "epoch": 0.7893738140417458, "grad_norm": 2.1653172969818115, "learning_rate": 1.3801088547306149e-05, "loss": 0.393, "step": 1664 }, { "epoch": 0.7898481973434535, "grad_norm": 2.3579320907592773, "learning_rate": 1.379398106703084e-05, "loss": 0.4005, "step": 1665 }, { "epoch": 0.7903225806451613, "grad_norm": 1.9658737182617188, "learning_rate": 1.3786871347206053e-05, "loss": 0.2901, "step": 1666 }, { "epoch": 0.790796963946869, "grad_norm": 2.903672695159912, "learning_rate": 1.377975939202858e-05, "loss": 0.4421, "step": 1667 }, { "epoch": 0.7912713472485768, "grad_norm": 3.15793514251709, "learning_rate": 1.377264520569654e-05, "loss": 0.3416, "step": 1668 }, { "epoch": 0.7917457305502846, "grad_norm": 2.4560675621032715, "learning_rate": 1.3765528792409368e-05, "loss": 0.3839, "step": 1669 }, { "epoch": 0.7922201138519924, "grad_norm": 2.24422550201416, "learning_rate": 1.3758410156367812e-05, "loss": 0.33, "step": 1670 }, { "epoch": 0.7926944971537002, "grad_norm": 2.6136462688446045, "learning_rate": 1.375128930177393e-05, "loss": 0.3796, "step": 1671 }, { "epoch": 0.793168880455408, "grad_norm": 2.220956325531006, "learning_rate": 1.3744166232831093e-05, "loss": 0.343, "step": 1672 }, { "epoch": 0.7936432637571158, "grad_norm": 2.2082831859588623, "learning_rate": 1.3737040953743981e-05, "loss": 0.3344, "step": 1673 }, { "epoch": 0.7941176470588235, "grad_norm": 2.327711582183838, "learning_rate": 1.3729913468718574e-05, "loss": 0.3889, "step": 1674 }, { "epoch": 0.7945920303605313, "grad_norm": 2.5623528957366943, "learning_rate": 1.3722783781962155e-05, "loss": 0.3582, "step": 1675 }, { "epoch": 0.7950664136622391, "grad_norm": 2.1435022354125977, "learning_rate": 1.3715651897683306e-05, "loss": 0.3027, "step": 1676 }, { "epoch": 0.7955407969639469, "grad_norm": 2.650327205657959, "learning_rate": 1.3708517820091912e-05, "loss": 0.3932, "step": 1677 }, { "epoch": 0.7960151802656547, "grad_norm": 2.224015235900879, "learning_rate": 1.3701381553399147e-05, "loss": 0.3401, "step": 1678 }, { "epoch": 0.7964895635673624, "grad_norm": 2.3124001026153564, "learning_rate": 1.3694243101817475e-05, "loss": 0.3646, "step": 1679 }, { "epoch": 0.7969639468690702, "grad_norm": 2.3588428497314453, "learning_rate": 1.3687102469560656e-05, "loss": 0.3372, "step": 1680 }, { "epoch": 0.797438330170778, "grad_norm": 2.7870490550994873, "learning_rate": 1.3679959660843736e-05, "loss": 0.47, "step": 1681 }, { "epoch": 0.7979127134724858, "grad_norm": 2.235140562057495, "learning_rate": 1.3672814679883044e-05, "loss": 0.3261, "step": 1682 }, { "epoch": 0.7983870967741935, "grad_norm": 2.4782466888427734, "learning_rate": 1.3665667530896189e-05, "loss": 0.4113, "step": 1683 }, { "epoch": 0.7988614800759013, "grad_norm": 2.6865344047546387, "learning_rate": 1.3658518218102064e-05, "loss": 0.4226, "step": 1684 }, { "epoch": 0.7993358633776091, "grad_norm": 2.3782565593719482, "learning_rate": 1.3651366745720837e-05, "loss": 0.3715, "step": 1685 }, { "epoch": 0.7998102466793169, "grad_norm": 2.305004119873047, "learning_rate": 1.3644213117973954e-05, "loss": 0.4011, "step": 1686 }, { "epoch": 0.8002846299810247, "grad_norm": 2.5552866458892822, "learning_rate": 1.3637057339084125e-05, "loss": 0.3444, "step": 1687 }, { "epoch": 0.8007590132827325, "grad_norm": 2.5058846473693848, "learning_rate": 1.3629899413275342e-05, "loss": 0.3723, "step": 1688 }, { "epoch": 0.8012333965844403, "grad_norm": 2.0107834339141846, "learning_rate": 1.3622739344772853e-05, "loss": 0.2992, "step": 1689 }, { "epoch": 0.801707779886148, "grad_norm": 1.9664660692214966, "learning_rate": 1.361557713780318e-05, "loss": 0.3447, "step": 1690 }, { "epoch": 0.8021821631878557, "grad_norm": 2.476154088973999, "learning_rate": 1.3608412796594096e-05, "loss": 0.4195, "step": 1691 }, { "epoch": 0.8026565464895635, "grad_norm": 2.669809103012085, "learning_rate": 1.360124632537465e-05, "loss": 0.4169, "step": 1692 }, { "epoch": 0.8031309297912713, "grad_norm": 2.68930983543396, "learning_rate": 1.3594077728375129e-05, "loss": 0.418, "step": 1693 }, { "epoch": 0.8036053130929791, "grad_norm": 2.4470651149749756, "learning_rate": 1.3586907009827093e-05, "loss": 0.4022, "step": 1694 }, { "epoch": 0.8040796963946869, "grad_norm": 2.488619565963745, "learning_rate": 1.3579734173963343e-05, "loss": 0.3691, "step": 1695 }, { "epoch": 0.8045540796963947, "grad_norm": 2.0860743522644043, "learning_rate": 1.3572559225017932e-05, "loss": 0.3586, "step": 1696 }, { "epoch": 0.8050284629981025, "grad_norm": 2.9916765689849854, "learning_rate": 1.3565382167226162e-05, "loss": 0.3821, "step": 1697 }, { "epoch": 0.8055028462998103, "grad_norm": 2.3840274810791016, "learning_rate": 1.3558203004824581e-05, "loss": 0.3548, "step": 1698 }, { "epoch": 0.8059772296015181, "grad_norm": 5.072951316833496, "learning_rate": 1.3551021742050974e-05, "loss": 0.3889, "step": 1699 }, { "epoch": 0.8064516129032258, "grad_norm": 1.943625569343567, "learning_rate": 1.3543838383144374e-05, "loss": 0.3432, "step": 1700 }, { "epoch": 0.8069259962049335, "grad_norm": 1.8006517887115479, "learning_rate": 1.3536652932345043e-05, "loss": 0.2821, "step": 1701 }, { "epoch": 0.8074003795066413, "grad_norm": 2.0171427726745605, "learning_rate": 1.3529465393894485e-05, "loss": 0.3287, "step": 1702 }, { "epoch": 0.8078747628083491, "grad_norm": 2.133181095123291, "learning_rate": 1.3522275772035426e-05, "loss": 0.3074, "step": 1703 }, { "epoch": 0.8083491461100569, "grad_norm": 1.9669580459594727, "learning_rate": 1.3515084071011837e-05, "loss": 0.3211, "step": 1704 }, { "epoch": 0.8088235294117647, "grad_norm": 2.351128101348877, "learning_rate": 1.3507890295068902e-05, "loss": 0.3666, "step": 1705 }, { "epoch": 0.8092979127134725, "grad_norm": 2.2970938682556152, "learning_rate": 1.3500694448453038e-05, "loss": 0.3401, "step": 1706 }, { "epoch": 0.8097722960151803, "grad_norm": 2.5522122383117676, "learning_rate": 1.3493496535411885e-05, "loss": 0.3661, "step": 1707 }, { "epoch": 0.8102466793168881, "grad_norm": 3.098694324493408, "learning_rate": 1.3486296560194292e-05, "loss": 0.3709, "step": 1708 }, { "epoch": 0.8107210626185958, "grad_norm": 2.7200305461883545, "learning_rate": 1.347909452705034e-05, "loss": 0.4361, "step": 1709 }, { "epoch": 0.8111954459203036, "grad_norm": 2.124351978302002, "learning_rate": 1.3471890440231319e-05, "loss": 0.411, "step": 1710 }, { "epoch": 0.8116698292220114, "grad_norm": 2.3741860389709473, "learning_rate": 1.3464684303989723e-05, "loss": 0.36, "step": 1711 }, { "epoch": 0.8121442125237192, "grad_norm": 2.1634681224823, "learning_rate": 1.3457476122579268e-05, "loss": 0.3456, "step": 1712 }, { "epoch": 0.812618595825427, "grad_norm": 2.8045156002044678, "learning_rate": 1.3450265900254868e-05, "loss": 0.472, "step": 1713 }, { "epoch": 0.8130929791271347, "grad_norm": 3.158280372619629, "learning_rate": 1.3443053641272656e-05, "loss": 0.4005, "step": 1714 }, { "epoch": 0.8135673624288425, "grad_norm": 2.369152784347534, "learning_rate": 1.3435839349889945e-05, "loss": 0.3169, "step": 1715 }, { "epoch": 0.8140417457305503, "grad_norm": 2.4343760013580322, "learning_rate": 1.3428623030365267e-05, "loss": 0.3438, "step": 1716 }, { "epoch": 0.8145161290322581, "grad_norm": 2.882737636566162, "learning_rate": 1.342140468695834e-05, "loss": 0.3749, "step": 1717 }, { "epoch": 0.8149905123339658, "grad_norm": 2.584279775619507, "learning_rate": 1.3414184323930082e-05, "loss": 0.3517, "step": 1718 }, { "epoch": 0.8154648956356736, "grad_norm": 2.2734382152557373, "learning_rate": 1.3406961945542605e-05, "loss": 0.2909, "step": 1719 }, { "epoch": 0.8159392789373814, "grad_norm": 2.0772268772125244, "learning_rate": 1.3399737556059203e-05, "loss": 0.3331, "step": 1720 }, { "epoch": 0.8164136622390892, "grad_norm": 2.1270837783813477, "learning_rate": 1.3392511159744364e-05, "loss": 0.286, "step": 1721 }, { "epoch": 0.816888045540797, "grad_norm": 2.7782678604125977, "learning_rate": 1.3385282760863758e-05, "loss": 0.3716, "step": 1722 }, { "epoch": 0.8173624288425048, "grad_norm": 2.4752237796783447, "learning_rate": 1.3378052363684238e-05, "loss": 0.3222, "step": 1723 }, { "epoch": 0.8178368121442126, "grad_norm": 2.3941702842712402, "learning_rate": 1.3370819972473832e-05, "loss": 0.3384, "step": 1724 }, { "epoch": 0.8183111954459203, "grad_norm": 2.5002641677856445, "learning_rate": 1.3363585591501751e-05, "loss": 0.3646, "step": 1725 }, { "epoch": 0.818785578747628, "grad_norm": 2.297147750854492, "learning_rate": 1.3356349225038384e-05, "loss": 0.3964, "step": 1726 }, { "epoch": 0.8192599620493358, "grad_norm": 2.366065740585327, "learning_rate": 1.334911087735528e-05, "loss": 0.3453, "step": 1727 }, { "epoch": 0.8197343453510436, "grad_norm": 2.4925296306610107, "learning_rate": 1.3341870552725166e-05, "loss": 0.3305, "step": 1728 }, { "epoch": 0.8202087286527514, "grad_norm": 2.4606728553771973, "learning_rate": 1.3334628255421932e-05, "loss": 0.3727, "step": 1729 }, { "epoch": 0.8206831119544592, "grad_norm": 2.2296173572540283, "learning_rate": 1.3327383989720639e-05, "loss": 0.3335, "step": 1730 }, { "epoch": 0.821157495256167, "grad_norm": 2.3122830390930176, "learning_rate": 1.3320137759897505e-05, "loss": 0.3168, "step": 1731 }, { "epoch": 0.8216318785578748, "grad_norm": 2.9335269927978516, "learning_rate": 1.3312889570229901e-05, "loss": 0.391, "step": 1732 }, { "epoch": 0.8221062618595826, "grad_norm": 1.8690437078475952, "learning_rate": 1.3305639424996369e-05, "loss": 0.3191, "step": 1733 }, { "epoch": 0.8225806451612904, "grad_norm": 2.195049524307251, "learning_rate": 1.3298387328476594e-05, "loss": 0.3354, "step": 1734 }, { "epoch": 0.823055028462998, "grad_norm": 2.337287187576294, "learning_rate": 1.3291133284951418e-05, "loss": 0.3784, "step": 1735 }, { "epoch": 0.8235294117647058, "grad_norm": 2.4185826778411865, "learning_rate": 1.328387729870283e-05, "loss": 0.3549, "step": 1736 }, { "epoch": 0.8240037950664136, "grad_norm": 2.838695526123047, "learning_rate": 1.3276619374013965e-05, "loss": 0.4295, "step": 1737 }, { "epoch": 0.8244781783681214, "grad_norm": 2.432915687561035, "learning_rate": 1.3269359515169113e-05, "loss": 0.2935, "step": 1738 }, { "epoch": 0.8249525616698292, "grad_norm": 3.295152187347412, "learning_rate": 1.326209772645369e-05, "loss": 0.473, "step": 1739 }, { "epoch": 0.825426944971537, "grad_norm": 3.3393542766571045, "learning_rate": 1.3254834012154259e-05, "loss": 0.2849, "step": 1740 }, { "epoch": 0.8259013282732448, "grad_norm": 2.1521685123443604, "learning_rate": 1.3247568376558515e-05, "loss": 0.3756, "step": 1741 }, { "epoch": 0.8263757115749526, "grad_norm": 2.7531650066375732, "learning_rate": 1.3240300823955297e-05, "loss": 0.3388, "step": 1742 }, { "epoch": 0.8268500948766604, "grad_norm": 2.021456241607666, "learning_rate": 1.3233031358634566e-05, "loss": 0.3245, "step": 1743 }, { "epoch": 0.8273244781783681, "grad_norm": 2.3515677452087402, "learning_rate": 1.3225759984887416e-05, "loss": 0.412, "step": 1744 }, { "epoch": 0.8277988614800759, "grad_norm": 1.9360193014144897, "learning_rate": 1.3218486707006069e-05, "loss": 0.3299, "step": 1745 }, { "epoch": 0.8282732447817837, "grad_norm": 1.9744075536727905, "learning_rate": 1.3211211529283867e-05, "loss": 0.3216, "step": 1746 }, { "epoch": 0.8287476280834914, "grad_norm": 2.255706548690796, "learning_rate": 1.3203934456015275e-05, "loss": 0.3296, "step": 1747 }, { "epoch": 0.8292220113851992, "grad_norm": 2.0415632724761963, "learning_rate": 1.3196655491495877e-05, "loss": 0.2996, "step": 1748 }, { "epoch": 0.829696394686907, "grad_norm": 2.2202017307281494, "learning_rate": 1.3189374640022372e-05, "loss": 0.3678, "step": 1749 }, { "epoch": 0.8301707779886148, "grad_norm": 2.691748857498169, "learning_rate": 1.3182091905892581e-05, "loss": 0.4671, "step": 1750 }, { "epoch": 0.8306451612903226, "grad_norm": 2.277745008468628, "learning_rate": 1.3174807293405427e-05, "loss": 0.3606, "step": 1751 }, { "epoch": 0.8311195445920304, "grad_norm": 2.287468910217285, "learning_rate": 1.3167520806860943e-05, "loss": 0.4041, "step": 1752 }, { "epoch": 0.8315939278937381, "grad_norm": 2.397675037384033, "learning_rate": 1.3160232450560268e-05, "loss": 0.4329, "step": 1753 }, { "epoch": 0.8320683111954459, "grad_norm": 2.1399128437042236, "learning_rate": 1.3152942228805651e-05, "loss": 0.3077, "step": 1754 }, { "epoch": 0.8325426944971537, "grad_norm": 2.231016159057617, "learning_rate": 1.314565014590044e-05, "loss": 0.3496, "step": 1755 }, { "epoch": 0.8330170777988615, "grad_norm": 1.9128320217132568, "learning_rate": 1.3138356206149069e-05, "loss": 0.2968, "step": 1756 }, { "epoch": 0.8334914611005693, "grad_norm": 2.082197666168213, "learning_rate": 1.3131060413857087e-05, "loss": 0.323, "step": 1757 }, { "epoch": 0.8339658444022771, "grad_norm": 2.4280385971069336, "learning_rate": 1.3123762773331127e-05, "loss": 0.3908, "step": 1758 }, { "epoch": 0.8344402277039848, "grad_norm": 2.383655071258545, "learning_rate": 1.3116463288878914e-05, "loss": 0.3721, "step": 1759 }, { "epoch": 0.8349146110056926, "grad_norm": 2.2262542247772217, "learning_rate": 1.3109161964809256e-05, "loss": 0.368, "step": 1760 }, { "epoch": 0.8353889943074004, "grad_norm": 2.4055254459381104, "learning_rate": 1.310185880543206e-05, "loss": 0.3321, "step": 1761 }, { "epoch": 0.8358633776091081, "grad_norm": 2.6039791107177734, "learning_rate": 1.3094553815058304e-05, "loss": 0.3922, "step": 1762 }, { "epoch": 0.8363377609108159, "grad_norm": 1.9806139469146729, "learning_rate": 1.3087246998000054e-05, "loss": 0.3628, "step": 1763 }, { "epoch": 0.8368121442125237, "grad_norm": 2.162916660308838, "learning_rate": 1.307993835857045e-05, "loss": 0.3516, "step": 1764 }, { "epoch": 0.8372865275142315, "grad_norm": 2.237438917160034, "learning_rate": 1.307262790108371e-05, "loss": 0.3383, "step": 1765 }, { "epoch": 0.8377609108159393, "grad_norm": 2.004232406616211, "learning_rate": 1.3065315629855124e-05, "loss": 0.3077, "step": 1766 }, { "epoch": 0.8382352941176471, "grad_norm": 2.4037442207336426, "learning_rate": 1.3058001549201056e-05, "loss": 0.3527, "step": 1767 }, { "epoch": 0.8387096774193549, "grad_norm": 2.4244558811187744, "learning_rate": 1.3050685663438931e-05, "loss": 0.3533, "step": 1768 }, { "epoch": 0.8391840607210627, "grad_norm": 2.3665969371795654, "learning_rate": 1.3043367976887251e-05, "loss": 0.3269, "step": 1769 }, { "epoch": 0.8396584440227703, "grad_norm": 2.266602039337158, "learning_rate": 1.3036048493865567e-05, "loss": 0.3345, "step": 1770 }, { "epoch": 0.8401328273244781, "grad_norm": 2.609055757522583, "learning_rate": 1.3028727218694503e-05, "loss": 0.3333, "step": 1771 }, { "epoch": 0.8406072106261859, "grad_norm": 2.1717774868011475, "learning_rate": 1.3021404155695728e-05, "loss": 0.3405, "step": 1772 }, { "epoch": 0.8410815939278937, "grad_norm": 2.0103063583374023, "learning_rate": 1.301407930919198e-05, "loss": 0.2846, "step": 1773 }, { "epoch": 0.8415559772296015, "grad_norm": 2.3277881145477295, "learning_rate": 1.300675268350704e-05, "loss": 0.331, "step": 1774 }, { "epoch": 0.8420303605313093, "grad_norm": 2.5206899642944336, "learning_rate": 1.2999424282965747e-05, "loss": 0.3298, "step": 1775 }, { "epoch": 0.8425047438330171, "grad_norm": 2.3512508869171143, "learning_rate": 1.299209411189398e-05, "loss": 0.3856, "step": 1776 }, { "epoch": 0.8429791271347249, "grad_norm": 2.2688114643096924, "learning_rate": 1.2984762174618664e-05, "loss": 0.3016, "step": 1777 }, { "epoch": 0.8434535104364327, "grad_norm": 2.555619239807129, "learning_rate": 1.2977428475467773e-05, "loss": 0.317, "step": 1778 }, { "epoch": 0.8439278937381404, "grad_norm": 2.4550106525421143, "learning_rate": 1.2970093018770318e-05, "loss": 0.3679, "step": 1779 }, { "epoch": 0.8444022770398482, "grad_norm": 2.240664005279541, "learning_rate": 1.2962755808856341e-05, "loss": 0.3785, "step": 1780 }, { "epoch": 0.844876660341556, "grad_norm": 2.236607313156128, "learning_rate": 1.295541685005693e-05, "loss": 0.3545, "step": 1781 }, { "epoch": 0.8453510436432637, "grad_norm": 2.1329894065856934, "learning_rate": 1.2948076146704195e-05, "loss": 0.3287, "step": 1782 }, { "epoch": 0.8458254269449715, "grad_norm": 2.1090683937072754, "learning_rate": 1.2940733703131287e-05, "loss": 0.3241, "step": 1783 }, { "epoch": 0.8462998102466793, "grad_norm": 2.2232136726379395, "learning_rate": 1.293338952367237e-05, "loss": 0.3764, "step": 1784 }, { "epoch": 0.8467741935483871, "grad_norm": 2.0039751529693604, "learning_rate": 1.2926043612662646e-05, "loss": 0.3344, "step": 1785 }, { "epoch": 0.8472485768500949, "grad_norm": 2.570286273956299, "learning_rate": 1.291869597443833e-05, "loss": 0.3779, "step": 1786 }, { "epoch": 0.8477229601518027, "grad_norm": 2.4321069717407227, "learning_rate": 1.2911346613336666e-05, "loss": 0.4621, "step": 1787 }, { "epoch": 0.8481973434535104, "grad_norm": 2.2266476154327393, "learning_rate": 1.2903995533695904e-05, "loss": 0.3687, "step": 1788 }, { "epoch": 0.8486717267552182, "grad_norm": 1.9736994504928589, "learning_rate": 1.2896642739855311e-05, "loss": 0.3805, "step": 1789 }, { "epoch": 0.849146110056926, "grad_norm": 3.032970666885376, "learning_rate": 1.2889288236155177e-05, "loss": 0.403, "step": 1790 }, { "epoch": 0.8496204933586338, "grad_norm": 2.4022488594055176, "learning_rate": 1.2881932026936785e-05, "loss": 0.405, "step": 1791 }, { "epoch": 0.8500948766603416, "grad_norm": 2.144449234008789, "learning_rate": 1.2874574116542439e-05, "loss": 0.3194, "step": 1792 }, { "epoch": 0.8505692599620494, "grad_norm": 2.102857828140259, "learning_rate": 1.2867214509315434e-05, "loss": 0.339, "step": 1793 }, { "epoch": 0.8510436432637571, "grad_norm": 2.4417030811309814, "learning_rate": 1.2859853209600075e-05, "loss": 0.3896, "step": 1794 }, { "epoch": 0.8515180265654649, "grad_norm": 2.425147533416748, "learning_rate": 1.2852490221741669e-05, "loss": 0.2993, "step": 1795 }, { "epoch": 0.8519924098671727, "grad_norm": 2.761335849761963, "learning_rate": 1.284512555008651e-05, "loss": 0.3798, "step": 1796 }, { "epoch": 0.8524667931688804, "grad_norm": 2.5409250259399414, "learning_rate": 1.2837759198981894e-05, "loss": 0.3064, "step": 1797 }, { "epoch": 0.8529411764705882, "grad_norm": 2.241783380508423, "learning_rate": 1.28303911727761e-05, "loss": 0.3359, "step": 1798 }, { "epoch": 0.853415559772296, "grad_norm": 2.344348907470703, "learning_rate": 1.2823021475818408e-05, "loss": 0.3305, "step": 1799 }, { "epoch": 0.8538899430740038, "grad_norm": 2.3390181064605713, "learning_rate": 1.2815650112459075e-05, "loss": 0.3343, "step": 1800 }, { "epoch": 0.8543643263757116, "grad_norm": 1.938061237335205, "learning_rate": 1.2808277087049338e-05, "loss": 0.2944, "step": 1801 }, { "epoch": 0.8548387096774194, "grad_norm": 2.33855938911438, "learning_rate": 1.2800902403941429e-05, "loss": 0.4172, "step": 1802 }, { "epoch": 0.8553130929791272, "grad_norm": 2.146927833557129, "learning_rate": 1.279352606748855e-05, "loss": 0.3548, "step": 1803 }, { "epoch": 0.855787476280835, "grad_norm": 2.442716598510742, "learning_rate": 1.278614808204487e-05, "loss": 0.3444, "step": 1804 }, { "epoch": 0.8562618595825426, "grad_norm": 2.412533760070801, "learning_rate": 1.277876845196555e-05, "loss": 0.4137, "step": 1805 }, { "epoch": 0.8567362428842504, "grad_norm": 2.463116407394409, "learning_rate": 1.2771387181606709e-05, "loss": 0.4336, "step": 1806 }, { "epoch": 0.8572106261859582, "grad_norm": 2.2430520057678223, "learning_rate": 1.2764004275325443e-05, "loss": 0.2363, "step": 1807 }, { "epoch": 0.857685009487666, "grad_norm": 2.2711448669433594, "learning_rate": 1.2756619737479801e-05, "loss": 0.4087, "step": 1808 }, { "epoch": 0.8581593927893738, "grad_norm": 2.3729405403137207, "learning_rate": 1.2749233572428805e-05, "loss": 0.3526, "step": 1809 }, { "epoch": 0.8586337760910816, "grad_norm": 2.3605003356933594, "learning_rate": 1.2741845784532436e-05, "loss": 0.3231, "step": 1810 }, { "epoch": 0.8591081593927894, "grad_norm": 2.177626848220825, "learning_rate": 1.2734456378151636e-05, "loss": 0.3287, "step": 1811 }, { "epoch": 0.8595825426944972, "grad_norm": 2.164935350418091, "learning_rate": 1.272706535764829e-05, "loss": 0.2948, "step": 1812 }, { "epoch": 0.860056925996205, "grad_norm": 2.3316850662231445, "learning_rate": 1.2719672727385249e-05, "loss": 0.3226, "step": 1813 }, { "epoch": 0.8605313092979127, "grad_norm": 2.731200933456421, "learning_rate": 1.271227849172631e-05, "loss": 0.358, "step": 1814 }, { "epoch": 0.8610056925996205, "grad_norm": 3.0866591930389404, "learning_rate": 1.270488265503622e-05, "loss": 0.4681, "step": 1815 }, { "epoch": 0.8614800759013282, "grad_norm": 2.4756972789764404, "learning_rate": 1.2697485221680663e-05, "loss": 0.3664, "step": 1816 }, { "epoch": 0.861954459203036, "grad_norm": 2.456561803817749, "learning_rate": 1.269008619602627e-05, "loss": 0.334, "step": 1817 }, { "epoch": 0.8624288425047438, "grad_norm": 2.1399877071380615, "learning_rate": 1.2682685582440614e-05, "loss": 0.3091, "step": 1818 }, { "epoch": 0.8629032258064516, "grad_norm": 2.5643470287323, "learning_rate": 1.2675283385292212e-05, "loss": 0.4395, "step": 1819 }, { "epoch": 0.8633776091081594, "grad_norm": 2.543917179107666, "learning_rate": 1.2667879608950494e-05, "loss": 0.3285, "step": 1820 }, { "epoch": 0.8638519924098672, "grad_norm": 2.08056902885437, "learning_rate": 1.2660474257785844e-05, "loss": 0.3604, "step": 1821 }, { "epoch": 0.864326375711575, "grad_norm": 2.5211234092712402, "learning_rate": 1.2653067336169566e-05, "loss": 0.326, "step": 1822 }, { "epoch": 0.8648007590132827, "grad_norm": 1.8964157104492188, "learning_rate": 1.2645658848473894e-05, "loss": 0.2578, "step": 1823 }, { "epoch": 0.8652751423149905, "grad_norm": 1.9530812501907349, "learning_rate": 1.2638248799071985e-05, "loss": 0.3031, "step": 1824 }, { "epoch": 0.8657495256166983, "grad_norm": 2.3713808059692383, "learning_rate": 1.2630837192337913e-05, "loss": 0.3138, "step": 1825 }, { "epoch": 0.8662239089184061, "grad_norm": 2.222252607345581, "learning_rate": 1.2623424032646683e-05, "loss": 0.3455, "step": 1826 }, { "epoch": 0.8666982922201139, "grad_norm": 2.9525506496429443, "learning_rate": 1.2616009324374205e-05, "loss": 0.4101, "step": 1827 }, { "epoch": 0.8671726755218216, "grad_norm": 2.140566825866699, "learning_rate": 1.2608593071897311e-05, "loss": 0.3621, "step": 1828 }, { "epoch": 0.8676470588235294, "grad_norm": 2.7032320499420166, "learning_rate": 1.2601175279593737e-05, "loss": 0.4193, "step": 1829 }, { "epoch": 0.8681214421252372, "grad_norm": 2.723836660385132, "learning_rate": 1.2593755951842134e-05, "loss": 0.4037, "step": 1830 }, { "epoch": 0.868595825426945, "grad_norm": 2.102358102798462, "learning_rate": 1.2586335093022064e-05, "loss": 0.3605, "step": 1831 }, { "epoch": 0.8690702087286527, "grad_norm": 2.0517046451568604, "learning_rate": 1.2578912707513977e-05, "loss": 0.3291, "step": 1832 }, { "epoch": 0.8695445920303605, "grad_norm": 2.466214418411255, "learning_rate": 1.257148879969924e-05, "loss": 0.3382, "step": 1833 }, { "epoch": 0.8700189753320683, "grad_norm": 2.1797330379486084, "learning_rate": 1.2564063373960109e-05, "loss": 0.3483, "step": 1834 }, { "epoch": 0.8704933586337761, "grad_norm": 2.182875394821167, "learning_rate": 1.2556636434679744e-05, "loss": 0.3307, "step": 1835 }, { "epoch": 0.8709677419354839, "grad_norm": 1.9578689336776733, "learning_rate": 1.254920798624219e-05, "loss": 0.284, "step": 1836 }, { "epoch": 0.8714421252371917, "grad_norm": 2.1100683212280273, "learning_rate": 1.2541778033032383e-05, "loss": 0.3402, "step": 1837 }, { "epoch": 0.8719165085388995, "grad_norm": 2.199645757675171, "learning_rate": 1.2534346579436158e-05, "loss": 0.3736, "step": 1838 }, { "epoch": 0.8723908918406073, "grad_norm": 2.371626853942871, "learning_rate": 1.2526913629840228e-05, "loss": 0.4038, "step": 1839 }, { "epoch": 0.872865275142315, "grad_norm": 2.2260429859161377, "learning_rate": 1.2519479188632184e-05, "loss": 0.3058, "step": 1840 }, { "epoch": 0.8733396584440227, "grad_norm": 2.0778400897979736, "learning_rate": 1.2512043260200506e-05, "loss": 0.34, "step": 1841 }, { "epoch": 0.8738140417457305, "grad_norm": 1.6490452289581299, "learning_rate": 1.2504605848934552e-05, "loss": 0.2596, "step": 1842 }, { "epoch": 0.8742884250474383, "grad_norm": 2.222475051879883, "learning_rate": 1.2497166959224546e-05, "loss": 0.2611, "step": 1843 }, { "epoch": 0.8747628083491461, "grad_norm": 2.1947946548461914, "learning_rate": 1.2489726595461598e-05, "loss": 0.3622, "step": 1844 }, { "epoch": 0.8752371916508539, "grad_norm": 2.6763131618499756, "learning_rate": 1.2482284762037675e-05, "loss": 0.3425, "step": 1845 }, { "epoch": 0.8757115749525617, "grad_norm": 1.7346967458724976, "learning_rate": 1.247484146334562e-05, "loss": 0.243, "step": 1846 }, { "epoch": 0.8761859582542695, "grad_norm": 2.4290475845336914, "learning_rate": 1.246739670377914e-05, "loss": 0.4306, "step": 1847 }, { "epoch": 0.8766603415559773, "grad_norm": 2.096940517425537, "learning_rate": 1.2459950487732804e-05, "loss": 0.3498, "step": 1848 }, { "epoch": 0.877134724857685, "grad_norm": 1.9826620817184448, "learning_rate": 1.2452502819602035e-05, "loss": 0.2844, "step": 1849 }, { "epoch": 0.8776091081593927, "grad_norm": 2.250736951828003, "learning_rate": 1.2445053703783118e-05, "loss": 0.3541, "step": 1850 }, { "epoch": 0.8780834914611005, "grad_norm": 2.070547342300415, "learning_rate": 1.2437603144673198e-05, "loss": 0.2969, "step": 1851 }, { "epoch": 0.8785578747628083, "grad_norm": 2.3527519702911377, "learning_rate": 1.2430151146670261e-05, "loss": 0.3719, "step": 1852 }, { "epoch": 0.8790322580645161, "grad_norm": 2.063654661178589, "learning_rate": 1.242269771417315e-05, "loss": 0.315, "step": 1853 }, { "epoch": 0.8795066413662239, "grad_norm": 2.292386531829834, "learning_rate": 1.2415242851581552e-05, "loss": 0.3539, "step": 1854 }, { "epoch": 0.8799810246679317, "grad_norm": 2.2170569896698, "learning_rate": 1.2407786563296e-05, "loss": 0.2982, "step": 1855 }, { "epoch": 0.8804554079696395, "grad_norm": 2.0592644214630127, "learning_rate": 1.2400328853717862e-05, "loss": 0.3065, "step": 1856 }, { "epoch": 0.8809297912713473, "grad_norm": 1.8755029439926147, "learning_rate": 1.2392869727249358e-05, "loss": 0.3065, "step": 1857 }, { "epoch": 0.881404174573055, "grad_norm": 1.9304600954055786, "learning_rate": 1.2385409188293528e-05, "loss": 0.3204, "step": 1858 }, { "epoch": 0.8818785578747628, "grad_norm": 2.4600729942321777, "learning_rate": 1.2377947241254263e-05, "loss": 0.3721, "step": 1859 }, { "epoch": 0.8823529411764706, "grad_norm": 2.047233819961548, "learning_rate": 1.2370483890536271e-05, "loss": 0.3081, "step": 1860 }, { "epoch": 0.8828273244781784, "grad_norm": 1.9848384857177734, "learning_rate": 1.2363019140545096e-05, "loss": 0.2854, "step": 1861 }, { "epoch": 0.8833017077798861, "grad_norm": 2.357391595840454, "learning_rate": 1.2355552995687104e-05, "loss": 0.4203, "step": 1862 }, { "epoch": 0.8837760910815939, "grad_norm": 1.8160470724105835, "learning_rate": 1.2348085460369488e-05, "loss": 0.236, "step": 1863 }, { "epoch": 0.8842504743833017, "grad_norm": 2.142580509185791, "learning_rate": 1.234061653900026e-05, "loss": 0.3978, "step": 1864 }, { "epoch": 0.8847248576850095, "grad_norm": 2.4461870193481445, "learning_rate": 1.2333146235988251e-05, "loss": 0.3631, "step": 1865 }, { "epoch": 0.8851992409867173, "grad_norm": 2.1813673973083496, "learning_rate": 1.2325674555743106e-05, "loss": 0.3192, "step": 1866 }, { "epoch": 0.885673624288425, "grad_norm": 2.124764919281006, "learning_rate": 1.2318201502675285e-05, "loss": 0.3734, "step": 1867 }, { "epoch": 0.8861480075901328, "grad_norm": 1.9253191947937012, "learning_rate": 1.2310727081196054e-05, "loss": 0.3054, "step": 1868 }, { "epoch": 0.8866223908918406, "grad_norm": 1.8508185148239136, "learning_rate": 1.2303251295717495e-05, "loss": 0.2451, "step": 1869 }, { "epoch": 0.8870967741935484, "grad_norm": 2.3445682525634766, "learning_rate": 1.2295774150652486e-05, "loss": 0.3997, "step": 1870 }, { "epoch": 0.8875711574952562, "grad_norm": 2.4285199642181396, "learning_rate": 1.2288295650414716e-05, "loss": 0.3243, "step": 1871 }, { "epoch": 0.888045540796964, "grad_norm": 1.8846771717071533, "learning_rate": 1.2280815799418666e-05, "loss": 0.2579, "step": 1872 }, { "epoch": 0.8885199240986718, "grad_norm": 3.0957839488983154, "learning_rate": 1.2273334602079621e-05, "loss": 0.4514, "step": 1873 }, { "epoch": 0.8889943074003795, "grad_norm": 2.224541425704956, "learning_rate": 1.2265852062813652e-05, "loss": 0.3709, "step": 1874 }, { "epoch": 0.8894686907020873, "grad_norm": 2.923872470855713, "learning_rate": 1.2258368186037638e-05, "loss": 0.4537, "step": 1875 }, { "epoch": 0.889943074003795, "grad_norm": 2.0478696823120117, "learning_rate": 1.2250882976169228e-05, "loss": 0.3222, "step": 1876 }, { "epoch": 0.8904174573055028, "grad_norm": 2.2573540210723877, "learning_rate": 1.2243396437626866e-05, "loss": 0.37, "step": 1877 }, { "epoch": 0.8908918406072106, "grad_norm": 2.3689069747924805, "learning_rate": 1.2235908574829792e-05, "loss": 0.3659, "step": 1878 }, { "epoch": 0.8913662239089184, "grad_norm": 2.4283788204193115, "learning_rate": 1.2228419392198008e-05, "loss": 0.3417, "step": 1879 }, { "epoch": 0.8918406072106262, "grad_norm": 2.9532792568206787, "learning_rate": 1.2220928894152313e-05, "loss": 0.2853, "step": 1880 }, { "epoch": 0.892314990512334, "grad_norm": 2.0899689197540283, "learning_rate": 1.2213437085114263e-05, "loss": 0.3315, "step": 1881 }, { "epoch": 0.8927893738140418, "grad_norm": 2.3636410236358643, "learning_rate": 1.2205943969506207e-05, "loss": 0.4148, "step": 1882 }, { "epoch": 0.8932637571157496, "grad_norm": 2.356285810470581, "learning_rate": 1.2198449551751255e-05, "loss": 0.3218, "step": 1883 }, { "epoch": 0.8937381404174574, "grad_norm": 2.254668951034546, "learning_rate": 1.219095383627329e-05, "loss": 0.3474, "step": 1884 }, { "epoch": 0.894212523719165, "grad_norm": 2.1036298274993896, "learning_rate": 1.2183456827496951e-05, "loss": 0.3559, "step": 1885 }, { "epoch": 0.8946869070208728, "grad_norm": 2.2919163703918457, "learning_rate": 1.2175958529847654e-05, "loss": 0.3481, "step": 1886 }, { "epoch": 0.8951612903225806, "grad_norm": 2.041938066482544, "learning_rate": 1.216845894775157e-05, "loss": 0.2727, "step": 1887 }, { "epoch": 0.8956356736242884, "grad_norm": 2.564687728881836, "learning_rate": 1.2160958085635628e-05, "loss": 0.4146, "step": 1888 }, { "epoch": 0.8961100569259962, "grad_norm": 2.3934693336486816, "learning_rate": 1.2153455947927509e-05, "loss": 0.2986, "step": 1889 }, { "epoch": 0.896584440227704, "grad_norm": 2.548827886581421, "learning_rate": 1.2145952539055654e-05, "loss": 0.4007, "step": 1890 }, { "epoch": 0.8970588235294118, "grad_norm": 2.382176399230957, "learning_rate": 1.213844786344925e-05, "loss": 0.3523, "step": 1891 }, { "epoch": 0.8975332068311196, "grad_norm": 2.39910626411438, "learning_rate": 1.2130941925538237e-05, "loss": 0.3493, "step": 1892 }, { "epoch": 0.8980075901328273, "grad_norm": 2.387073278427124, "learning_rate": 1.2123434729753287e-05, "loss": 0.3532, "step": 1893 }, { "epoch": 0.8984819734345351, "grad_norm": 2.265078544616699, "learning_rate": 1.211592628052583e-05, "loss": 0.3608, "step": 1894 }, { "epoch": 0.8989563567362429, "grad_norm": 2.8354647159576416, "learning_rate": 1.2108416582288027e-05, "loss": 0.3634, "step": 1895 }, { "epoch": 0.8994307400379506, "grad_norm": 2.2116317749023438, "learning_rate": 1.210090563947278e-05, "loss": 0.31, "step": 1896 }, { "epoch": 0.8999051233396584, "grad_norm": 2.5344135761260986, "learning_rate": 1.2093393456513724e-05, "loss": 0.3346, "step": 1897 }, { "epoch": 0.9003795066413662, "grad_norm": 2.046262502670288, "learning_rate": 1.2085880037845223e-05, "loss": 0.3562, "step": 1898 }, { "epoch": 0.900853889943074, "grad_norm": 1.9915024042129517, "learning_rate": 1.2078365387902379e-05, "loss": 0.3242, "step": 1899 }, { "epoch": 0.9013282732447818, "grad_norm": 2.071885108947754, "learning_rate": 1.2070849511121014e-05, "loss": 0.3272, "step": 1900 }, { "epoch": 0.9018026565464896, "grad_norm": 2.1949217319488525, "learning_rate": 1.2063332411937672e-05, "loss": 0.3419, "step": 1901 }, { "epoch": 0.9022770398481973, "grad_norm": 2.311913013458252, "learning_rate": 1.2055814094789625e-05, "loss": 0.3776, "step": 1902 }, { "epoch": 0.9027514231499051, "grad_norm": 2.1613712310791016, "learning_rate": 1.2048294564114859e-05, "loss": 0.3538, "step": 1903 }, { "epoch": 0.9032258064516129, "grad_norm": 2.3244266510009766, "learning_rate": 1.2040773824352088e-05, "loss": 0.366, "step": 1904 }, { "epoch": 0.9037001897533207, "grad_norm": 3.7160706520080566, "learning_rate": 1.2033251879940716e-05, "loss": 0.4384, "step": 1905 }, { "epoch": 0.9041745730550285, "grad_norm": 1.9680372476577759, "learning_rate": 1.2025728735320878e-05, "loss": 0.2941, "step": 1906 }, { "epoch": 0.9046489563567363, "grad_norm": 1.8036983013153076, "learning_rate": 1.2018204394933417e-05, "loss": 0.2192, "step": 1907 }, { "epoch": 0.905123339658444, "grad_norm": 2.109776496887207, "learning_rate": 1.201067886321987e-05, "loss": 0.3297, "step": 1908 }, { "epoch": 0.9055977229601518, "grad_norm": 2.50654935836792, "learning_rate": 1.2003152144622493e-05, "loss": 0.4017, "step": 1909 }, { "epoch": 0.9060721062618596, "grad_norm": 2.427208185195923, "learning_rate": 1.1995624243584219e-05, "loss": 0.3476, "step": 1910 }, { "epoch": 0.9065464895635673, "grad_norm": 3.9568138122558594, "learning_rate": 1.1988095164548706e-05, "loss": 0.3642, "step": 1911 }, { "epoch": 0.9070208728652751, "grad_norm": 2.389507293701172, "learning_rate": 1.198056491196029e-05, "loss": 0.4409, "step": 1912 }, { "epoch": 0.9074952561669829, "grad_norm": 2.369760751724243, "learning_rate": 1.1973033490264e-05, "loss": 0.3742, "step": 1913 }, { "epoch": 0.9079696394686907, "grad_norm": 2.787485361099243, "learning_rate": 1.1965500903905571e-05, "loss": 0.3947, "step": 1914 }, { "epoch": 0.9084440227703985, "grad_norm": 2.2195568084716797, "learning_rate": 1.1957967157331404e-05, "loss": 0.3326, "step": 1915 }, { "epoch": 0.9089184060721063, "grad_norm": 2.2224323749542236, "learning_rate": 1.1950432254988604e-05, "loss": 0.4156, "step": 1916 }, { "epoch": 0.9093927893738141, "grad_norm": 2.9090452194213867, "learning_rate": 1.1942896201324938e-05, "loss": 0.4651, "step": 1917 }, { "epoch": 0.9098671726755219, "grad_norm": 2.5390193462371826, "learning_rate": 1.1935359000788873e-05, "loss": 0.3556, "step": 1918 }, { "epoch": 0.9103415559772297, "grad_norm": 2.103929281234741, "learning_rate": 1.192782065782954e-05, "loss": 0.3225, "step": 1919 }, { "epoch": 0.9108159392789373, "grad_norm": 2.0904929637908936, "learning_rate": 1.1920281176896752e-05, "loss": 0.2711, "step": 1920 }, { "epoch": 0.9112903225806451, "grad_norm": 2.126173257827759, "learning_rate": 1.1912740562440988e-05, "loss": 0.2951, "step": 1921 }, { "epoch": 0.9117647058823529, "grad_norm": 1.8137263059616089, "learning_rate": 1.1905198818913393e-05, "loss": 0.2783, "step": 1922 }, { "epoch": 0.9122390891840607, "grad_norm": 2.573413133621216, "learning_rate": 1.1897655950765789e-05, "loss": 0.3153, "step": 1923 }, { "epoch": 0.9127134724857685, "grad_norm": 1.9147273302078247, "learning_rate": 1.189011196245066e-05, "loss": 0.3062, "step": 1924 }, { "epoch": 0.9131878557874763, "grad_norm": 2.3643062114715576, "learning_rate": 1.1882566858421137e-05, "loss": 0.4424, "step": 1925 }, { "epoch": 0.9136622390891841, "grad_norm": 2.178414821624756, "learning_rate": 1.1875020643131028e-05, "loss": 0.3204, "step": 1926 }, { "epoch": 0.9141366223908919, "grad_norm": 2.598742961883545, "learning_rate": 1.1867473321034786e-05, "loss": 0.4002, "step": 1927 }, { "epoch": 0.9146110056925996, "grad_norm": 2.4741811752319336, "learning_rate": 1.1859924896587528e-05, "loss": 0.3557, "step": 1928 }, { "epoch": 0.9150853889943074, "grad_norm": 2.3986568450927734, "learning_rate": 1.1852375374245003e-05, "loss": 0.3971, "step": 1929 }, { "epoch": 0.9155597722960152, "grad_norm": 1.920324683189392, "learning_rate": 1.1844824758463626e-05, "loss": 0.2705, "step": 1930 }, { "epoch": 0.9160341555977229, "grad_norm": 2.353745937347412, "learning_rate": 1.183727305370045e-05, "loss": 0.3579, "step": 1931 }, { "epoch": 0.9165085388994307, "grad_norm": 2.8409640789031982, "learning_rate": 1.1829720264413169e-05, "loss": 0.3981, "step": 1932 }, { "epoch": 0.9169829222011385, "grad_norm": 2.5938303470611572, "learning_rate": 1.1822166395060124e-05, "loss": 0.3382, "step": 1933 }, { "epoch": 0.9174573055028463, "grad_norm": 2.2392194271087646, "learning_rate": 1.1814611450100286e-05, "loss": 0.3603, "step": 1934 }, { "epoch": 0.9179316888045541, "grad_norm": 2.44313907623291, "learning_rate": 1.1807055433993266e-05, "loss": 0.3935, "step": 1935 }, { "epoch": 0.9184060721062619, "grad_norm": 2.418811559677124, "learning_rate": 1.1799498351199303e-05, "loss": 0.3081, "step": 1936 }, { "epoch": 0.9188804554079696, "grad_norm": 2.3761041164398193, "learning_rate": 1.179194020617927e-05, "loss": 0.3717, "step": 1937 }, { "epoch": 0.9193548387096774, "grad_norm": 2.0884335041046143, "learning_rate": 1.1784381003394669e-05, "loss": 0.3512, "step": 1938 }, { "epoch": 0.9198292220113852, "grad_norm": 1.974050760269165, "learning_rate": 1.1776820747307615e-05, "loss": 0.3303, "step": 1939 }, { "epoch": 0.920303605313093, "grad_norm": 2.162907600402832, "learning_rate": 1.1769259442380862e-05, "loss": 0.3726, "step": 1940 }, { "epoch": 0.9207779886148008, "grad_norm": 2.2310690879821777, "learning_rate": 1.1761697093077762e-05, "loss": 0.3117, "step": 1941 }, { "epoch": 0.9212523719165086, "grad_norm": 2.247523307800293, "learning_rate": 1.1754133703862302e-05, "loss": 0.3046, "step": 1942 }, { "epoch": 0.9217267552182163, "grad_norm": 2.2049553394317627, "learning_rate": 1.1746569279199073e-05, "loss": 0.2433, "step": 1943 }, { "epoch": 0.9222011385199241, "grad_norm": 2.9283275604248047, "learning_rate": 1.1739003823553286e-05, "loss": 0.3686, "step": 1944 }, { "epoch": 0.9226755218216319, "grad_norm": 1.9516762495040894, "learning_rate": 1.1731437341390748e-05, "loss": 0.2654, "step": 1945 }, { "epoch": 0.9231499051233396, "grad_norm": 2.1916255950927734, "learning_rate": 1.172386983717788e-05, "loss": 0.3597, "step": 1946 }, { "epoch": 0.9236242884250474, "grad_norm": 2.2968626022338867, "learning_rate": 1.1716301315381706e-05, "loss": 0.3295, "step": 1947 }, { "epoch": 0.9240986717267552, "grad_norm": 2.341371774673462, "learning_rate": 1.170873178046985e-05, "loss": 0.3365, "step": 1948 }, { "epoch": 0.924573055028463, "grad_norm": 2.3758347034454346, "learning_rate": 1.1701161236910532e-05, "loss": 0.3714, "step": 1949 }, { "epoch": 0.9250474383301708, "grad_norm": 2.347499132156372, "learning_rate": 1.1693589689172566e-05, "loss": 0.3345, "step": 1950 }, { "epoch": 0.9255218216318786, "grad_norm": 2.4397614002227783, "learning_rate": 1.1686017141725367e-05, "loss": 0.295, "step": 1951 }, { "epoch": 0.9259962049335864, "grad_norm": 1.9351602792739868, "learning_rate": 1.167844359903894e-05, "loss": 0.2817, "step": 1952 }, { "epoch": 0.9264705882352942, "grad_norm": 2.63478684425354, "learning_rate": 1.1670869065583856e-05, "loss": 0.4525, "step": 1953 }, { "epoch": 0.926944971537002, "grad_norm": 3.3862996101379395, "learning_rate": 1.1663293545831302e-05, "loss": 0.3677, "step": 1954 }, { "epoch": 0.9274193548387096, "grad_norm": 2.883392333984375, "learning_rate": 1.1655717044253025e-05, "loss": 0.41, "step": 1955 }, { "epoch": 0.9278937381404174, "grad_norm": 2.115570068359375, "learning_rate": 1.164813956532136e-05, "loss": 0.3187, "step": 1956 }, { "epoch": 0.9283681214421252, "grad_norm": 1.8709588050842285, "learning_rate": 1.1640561113509222e-05, "loss": 0.2728, "step": 1957 }, { "epoch": 0.928842504743833, "grad_norm": 2.141753911972046, "learning_rate": 1.1632981693290089e-05, "loss": 0.2978, "step": 1958 }, { "epoch": 0.9293168880455408, "grad_norm": 2.3228015899658203, "learning_rate": 1.1625401309138025e-05, "loss": 0.3375, "step": 1959 }, { "epoch": 0.9297912713472486, "grad_norm": 2.2550671100616455, "learning_rate": 1.161781996552765e-05, "loss": 0.3075, "step": 1960 }, { "epoch": 0.9302656546489564, "grad_norm": 2.196171998977661, "learning_rate": 1.1610237666934158e-05, "loss": 0.3969, "step": 1961 }, { "epoch": 0.9307400379506642, "grad_norm": 3.4995615482330322, "learning_rate": 1.1602654417833305e-05, "loss": 0.349, "step": 1962 }, { "epoch": 0.931214421252372, "grad_norm": 2.3877415657043457, "learning_rate": 1.1595070222701408e-05, "loss": 0.3497, "step": 1963 }, { "epoch": 0.9316888045540797, "grad_norm": 2.260338306427002, "learning_rate": 1.1587485086015346e-05, "loss": 0.2895, "step": 1964 }, { "epoch": 0.9321631878557874, "grad_norm": 2.212602138519287, "learning_rate": 1.1579899012252543e-05, "loss": 0.3157, "step": 1965 }, { "epoch": 0.9326375711574952, "grad_norm": 1.9429450035095215, "learning_rate": 1.1572312005890986e-05, "loss": 0.3059, "step": 1966 }, { "epoch": 0.933111954459203, "grad_norm": 2.017580509185791, "learning_rate": 1.1564724071409213e-05, "loss": 0.2694, "step": 1967 }, { "epoch": 0.9335863377609108, "grad_norm": 3.207867383956909, "learning_rate": 1.1557135213286303e-05, "loss": 0.3742, "step": 1968 }, { "epoch": 0.9340607210626186, "grad_norm": 2.253946542739868, "learning_rate": 1.1549545436001888e-05, "loss": 0.3292, "step": 1969 }, { "epoch": 0.9345351043643264, "grad_norm": 2.34022855758667, "learning_rate": 1.1541954744036131e-05, "loss": 0.388, "step": 1970 }, { "epoch": 0.9350094876660342, "grad_norm": 2.2387092113494873, "learning_rate": 1.153436314186975e-05, "loss": 0.344, "step": 1971 }, { "epoch": 0.9354838709677419, "grad_norm": 2.4496212005615234, "learning_rate": 1.152677063398399e-05, "loss": 0.3724, "step": 1972 }, { "epoch": 0.9359582542694497, "grad_norm": 2.5698015689849854, "learning_rate": 1.1519177224860632e-05, "loss": 0.3821, "step": 1973 }, { "epoch": 0.9364326375711575, "grad_norm": 1.8094062805175781, "learning_rate": 1.151158291898199e-05, "loss": 0.3154, "step": 1974 }, { "epoch": 0.9369070208728653, "grad_norm": 2.1138343811035156, "learning_rate": 1.1503987720830908e-05, "loss": 0.347, "step": 1975 }, { "epoch": 0.937381404174573, "grad_norm": 2.825183153152466, "learning_rate": 1.1496391634890758e-05, "loss": 0.4074, "step": 1976 }, { "epoch": 0.9378557874762808, "grad_norm": 2.231473922729492, "learning_rate": 1.1488794665645434e-05, "loss": 0.3262, "step": 1977 }, { "epoch": 0.9383301707779886, "grad_norm": 2.0554375648498535, "learning_rate": 1.1481196817579352e-05, "loss": 0.2908, "step": 1978 }, { "epoch": 0.9388045540796964, "grad_norm": 2.4389944076538086, "learning_rate": 1.1473598095177443e-05, "loss": 0.2931, "step": 1979 }, { "epoch": 0.9392789373814042, "grad_norm": 3.037283420562744, "learning_rate": 1.1465998502925161e-05, "loss": 0.4337, "step": 1980 }, { "epoch": 0.9397533206831119, "grad_norm": 2.06538987159729, "learning_rate": 1.1458398045308471e-05, "loss": 0.2837, "step": 1981 }, { "epoch": 0.9402277039848197, "grad_norm": 2.118652820587158, "learning_rate": 1.1450796726813844e-05, "loss": 0.3299, "step": 1982 }, { "epoch": 0.9407020872865275, "grad_norm": 2.2635416984558105, "learning_rate": 1.1443194551928267e-05, "loss": 0.3512, "step": 1983 }, { "epoch": 0.9411764705882353, "grad_norm": 2.088435173034668, "learning_rate": 1.1435591525139228e-05, "loss": 0.3404, "step": 1984 }, { "epoch": 0.9416508538899431, "grad_norm": 2.3585917949676514, "learning_rate": 1.1427987650934717e-05, "loss": 0.3526, "step": 1985 }, { "epoch": 0.9421252371916509, "grad_norm": 1.858132243156433, "learning_rate": 1.1420382933803225e-05, "loss": 0.2791, "step": 1986 }, { "epoch": 0.9425996204933587, "grad_norm": 2.3248212337493896, "learning_rate": 1.1412777378233742e-05, "loss": 0.3954, "step": 1987 }, { "epoch": 0.9430740037950665, "grad_norm": 2.187311887741089, "learning_rate": 1.1405170988715752e-05, "loss": 0.3678, "step": 1988 }, { "epoch": 0.9435483870967742, "grad_norm": 2.151273488998413, "learning_rate": 1.1397563769739235e-05, "loss": 0.3197, "step": 1989 }, { "epoch": 0.9440227703984819, "grad_norm": 2.1821095943450928, "learning_rate": 1.1389955725794655e-05, "loss": 0.3604, "step": 1990 }, { "epoch": 0.9444971537001897, "grad_norm": 2.963883876800537, "learning_rate": 1.138234686137296e-05, "loss": 0.3858, "step": 1991 }, { "epoch": 0.9449715370018975, "grad_norm": 2.512237310409546, "learning_rate": 1.1374737180965593e-05, "loss": 0.3751, "step": 1992 }, { "epoch": 0.9454459203036053, "grad_norm": 2.063708543777466, "learning_rate": 1.1367126689064473e-05, "loss": 0.315, "step": 1993 }, { "epoch": 0.9459203036053131, "grad_norm": 2.0706629753112793, "learning_rate": 1.1359515390161996e-05, "loss": 0.3446, "step": 1994 }, { "epoch": 0.9463946869070209, "grad_norm": 2.1223249435424805, "learning_rate": 1.1351903288751038e-05, "loss": 0.3453, "step": 1995 }, { "epoch": 0.9468690702087287, "grad_norm": 2.1654293537139893, "learning_rate": 1.1344290389324949e-05, "loss": 0.2958, "step": 1996 }, { "epoch": 0.9473434535104365, "grad_norm": 2.032048225402832, "learning_rate": 1.1336676696377542e-05, "loss": 0.3034, "step": 1997 }, { "epoch": 0.9478178368121443, "grad_norm": 2.133802890777588, "learning_rate": 1.1329062214403106e-05, "loss": 0.3172, "step": 1998 }, { "epoch": 0.948292220113852, "grad_norm": 2.323798656463623, "learning_rate": 1.13214469478964e-05, "loss": 0.3271, "step": 1999 }, { "epoch": 0.9487666034155597, "grad_norm": 2.347034454345703, "learning_rate": 1.1313830901352634e-05, "loss": 0.3382, "step": 2000 }, { "epoch": 0.9492409867172675, "grad_norm": 2.104706048965454, "learning_rate": 1.130621407926749e-05, "loss": 0.3714, "step": 2001 }, { "epoch": 0.9497153700189753, "grad_norm": 2.216869354248047, "learning_rate": 1.12985964861371e-05, "loss": 0.3052, "step": 2002 }, { "epoch": 0.9501897533206831, "grad_norm": 2.0643813610076904, "learning_rate": 1.1290978126458054e-05, "loss": 0.2865, "step": 2003 }, { "epoch": 0.9506641366223909, "grad_norm": 1.9597926139831543, "learning_rate": 1.1283359004727397e-05, "loss": 0.3136, "step": 2004 }, { "epoch": 0.9511385199240987, "grad_norm": 2.2669873237609863, "learning_rate": 1.1275739125442618e-05, "loss": 0.3544, "step": 2005 }, { "epoch": 0.9516129032258065, "grad_norm": 1.7317811250686646, "learning_rate": 1.1268118493101654e-05, "loss": 0.2356, "step": 2006 }, { "epoch": 0.9520872865275142, "grad_norm": 2.301215887069702, "learning_rate": 1.1260497112202895e-05, "loss": 0.3188, "step": 2007 }, { "epoch": 0.952561669829222, "grad_norm": 1.8703101873397827, "learning_rate": 1.1252874987245163e-05, "loss": 0.2773, "step": 2008 }, { "epoch": 0.9530360531309298, "grad_norm": 2.490936517715454, "learning_rate": 1.1245252122727726e-05, "loss": 0.363, "step": 2009 }, { "epoch": 0.9535104364326376, "grad_norm": 2.491692066192627, "learning_rate": 1.123762852315028e-05, "loss": 0.3159, "step": 2010 }, { "epoch": 0.9539848197343453, "grad_norm": 2.416424036026001, "learning_rate": 1.1230004193012964e-05, "loss": 0.3757, "step": 2011 }, { "epoch": 0.9544592030360531, "grad_norm": 2.583618402481079, "learning_rate": 1.1222379136816347e-05, "loss": 0.3097, "step": 2012 }, { "epoch": 0.9549335863377609, "grad_norm": 2.5698044300079346, "learning_rate": 1.1214753359061418e-05, "loss": 0.3456, "step": 2013 }, { "epoch": 0.9554079696394687, "grad_norm": 2.5504226684570312, "learning_rate": 1.1207126864249604e-05, "loss": 0.3817, "step": 2014 }, { "epoch": 0.9558823529411765, "grad_norm": 2.0663864612579346, "learning_rate": 1.1199499656882747e-05, "loss": 0.2627, "step": 2015 }, { "epoch": 0.9563567362428842, "grad_norm": 2.0620248317718506, "learning_rate": 1.1191871741463112e-05, "loss": 0.3134, "step": 2016 }, { "epoch": 0.956831119544592, "grad_norm": 2.855698585510254, "learning_rate": 1.1184243122493381e-05, "loss": 0.3152, "step": 2017 }, { "epoch": 0.9573055028462998, "grad_norm": 1.9541351795196533, "learning_rate": 1.1176613804476655e-05, "loss": 0.3101, "step": 2018 }, { "epoch": 0.9577798861480076, "grad_norm": 2.0520036220550537, "learning_rate": 1.1168983791916442e-05, "loss": 0.2803, "step": 2019 }, { "epoch": 0.9582542694497154, "grad_norm": 2.2084736824035645, "learning_rate": 1.1161353089316664e-05, "loss": 0.3272, "step": 2020 }, { "epoch": 0.9587286527514232, "grad_norm": 2.3713338375091553, "learning_rate": 1.115372170118165e-05, "loss": 0.3432, "step": 2021 }, { "epoch": 0.959203036053131, "grad_norm": 2.585407257080078, "learning_rate": 1.1146089632016132e-05, "loss": 0.4149, "step": 2022 }, { "epoch": 0.9596774193548387, "grad_norm": 2.2623908519744873, "learning_rate": 1.1138456886325247e-05, "loss": 0.3292, "step": 2023 }, { "epoch": 0.9601518026565465, "grad_norm": 1.627692461013794, "learning_rate": 1.1130823468614525e-05, "loss": 0.2295, "step": 2024 }, { "epoch": 0.9606261859582542, "grad_norm": 2.19572114944458, "learning_rate": 1.1123189383389901e-05, "loss": 0.3843, "step": 2025 }, { "epoch": 0.961100569259962, "grad_norm": 2.066941499710083, "learning_rate": 1.1115554635157698e-05, "loss": 0.3033, "step": 2026 }, { "epoch": 0.9615749525616698, "grad_norm": 2.1408028602600098, "learning_rate": 1.1107919228424631e-05, "loss": 0.3627, "step": 2027 }, { "epoch": 0.9620493358633776, "grad_norm": 2.3943800926208496, "learning_rate": 1.110028316769781e-05, "loss": 0.3059, "step": 2028 }, { "epoch": 0.9625237191650854, "grad_norm": 2.0022356510162354, "learning_rate": 1.1092646457484721e-05, "loss": 0.3247, "step": 2029 }, { "epoch": 0.9629981024667932, "grad_norm": 2.043557643890381, "learning_rate": 1.1085009102293243e-05, "loss": 0.2876, "step": 2030 }, { "epoch": 0.963472485768501, "grad_norm": 2.535658836364746, "learning_rate": 1.1077371106631625e-05, "loss": 0.3627, "step": 2031 }, { "epoch": 0.9639468690702088, "grad_norm": 1.851802945137024, "learning_rate": 1.1069732475008504e-05, "loss": 0.2793, "step": 2032 }, { "epoch": 0.9644212523719166, "grad_norm": 2.064366340637207, "learning_rate": 1.1062093211932891e-05, "loss": 0.3435, "step": 2033 }, { "epoch": 0.9648956356736242, "grad_norm": 1.589677333831787, "learning_rate": 1.1054453321914162e-05, "loss": 0.2602, "step": 2034 }, { "epoch": 0.965370018975332, "grad_norm": 2.559872627258301, "learning_rate": 1.1046812809462073e-05, "loss": 0.3856, "step": 2035 }, { "epoch": 0.9658444022770398, "grad_norm": 2.204385995864868, "learning_rate": 1.1039171679086739e-05, "loss": 0.336, "step": 2036 }, { "epoch": 0.9663187855787476, "grad_norm": 2.258500337600708, "learning_rate": 1.1031529935298651e-05, "loss": 0.3388, "step": 2037 }, { "epoch": 0.9667931688804554, "grad_norm": 2.894205331802368, "learning_rate": 1.1023887582608645e-05, "loss": 0.368, "step": 2038 }, { "epoch": 0.9672675521821632, "grad_norm": 3.6994946002960205, "learning_rate": 1.1016244625527933e-05, "loss": 0.3644, "step": 2039 }, { "epoch": 0.967741935483871, "grad_norm": 2.2392802238464355, "learning_rate": 1.1008601068568074e-05, "loss": 0.3416, "step": 2040 }, { "epoch": 0.9682163187855788, "grad_norm": 2.4265706539154053, "learning_rate": 1.1000956916240985e-05, "loss": 0.3332, "step": 2041 }, { "epoch": 0.9686907020872866, "grad_norm": 1.7920218706130981, "learning_rate": 1.0993312173058934e-05, "loss": 0.2237, "step": 2042 }, { "epoch": 0.9691650853889943, "grad_norm": 2.4036221504211426, "learning_rate": 1.0985666843534534e-05, "loss": 0.3508, "step": 2043 }, { "epoch": 0.969639468690702, "grad_norm": 2.5761871337890625, "learning_rate": 1.097802093218075e-05, "loss": 0.3949, "step": 2044 }, { "epoch": 0.9701138519924098, "grad_norm": 2.6531662940979004, "learning_rate": 1.0970374443510891e-05, "loss": 0.3703, "step": 2045 }, { "epoch": 0.9705882352941176, "grad_norm": 1.9129854440689087, "learning_rate": 1.0962727382038598e-05, "loss": 0.2939, "step": 2046 }, { "epoch": 0.9710626185958254, "grad_norm": 2.6273584365844727, "learning_rate": 1.0955079752277859e-05, "loss": 0.418, "step": 2047 }, { "epoch": 0.9715370018975332, "grad_norm": 2.4543769359588623, "learning_rate": 1.0947431558742991e-05, "loss": 0.2984, "step": 2048 }, { "epoch": 0.972011385199241, "grad_norm": 2.275509834289551, "learning_rate": 1.0939782805948653e-05, "loss": 0.3299, "step": 2049 }, { "epoch": 0.9724857685009488, "grad_norm": 2.039682626724243, "learning_rate": 1.0932133498409821e-05, "loss": 0.3081, "step": 2050 }, { "epoch": 0.9729601518026565, "grad_norm": 2.2457706928253174, "learning_rate": 1.0924483640641808e-05, "loss": 0.3862, "step": 2051 }, { "epoch": 0.9734345351043643, "grad_norm": 2.2409019470214844, "learning_rate": 1.0916833237160252e-05, "loss": 0.3165, "step": 2052 }, { "epoch": 0.9739089184060721, "grad_norm": 1.9504499435424805, "learning_rate": 1.0909182292481108e-05, "loss": 0.3161, "step": 2053 }, { "epoch": 0.9743833017077799, "grad_norm": 2.0921568870544434, "learning_rate": 1.0901530811120655e-05, "loss": 0.3617, "step": 2054 }, { "epoch": 0.9748576850094877, "grad_norm": 1.8210068941116333, "learning_rate": 1.0893878797595484e-05, "loss": 0.2962, "step": 2055 }, { "epoch": 0.9753320683111955, "grad_norm": 1.7719886302947998, "learning_rate": 1.0886226256422501e-05, "loss": 0.2733, "step": 2056 }, { "epoch": 0.9758064516129032, "grad_norm": 2.5448601245880127, "learning_rate": 1.0878573192118933e-05, "loss": 0.2718, "step": 2057 }, { "epoch": 0.976280834914611, "grad_norm": 1.8214176893234253, "learning_rate": 1.0870919609202301e-05, "loss": 0.2664, "step": 2058 }, { "epoch": 0.9767552182163188, "grad_norm": 2.0862033367156982, "learning_rate": 1.0863265512190444e-05, "loss": 0.3164, "step": 2059 }, { "epoch": 0.9772296015180265, "grad_norm": 1.6361185312271118, "learning_rate": 1.0855610905601495e-05, "loss": 0.217, "step": 2060 }, { "epoch": 0.9777039848197343, "grad_norm": 2.2029919624328613, "learning_rate": 1.0847955793953899e-05, "loss": 0.3134, "step": 2061 }, { "epoch": 0.9781783681214421, "grad_norm": 2.551486015319824, "learning_rate": 1.0840300181766383e-05, "loss": 0.4083, "step": 2062 }, { "epoch": 0.9786527514231499, "grad_norm": 2.5572237968444824, "learning_rate": 1.0832644073557987e-05, "loss": 0.4408, "step": 2063 }, { "epoch": 0.9791271347248577, "grad_norm": 2.1839964389801025, "learning_rate": 1.0824987473848037e-05, "loss": 0.362, "step": 2064 }, { "epoch": 0.9796015180265655, "grad_norm": 2.5323362350463867, "learning_rate": 1.0817330387156142e-05, "loss": 0.3513, "step": 2065 }, { "epoch": 0.9800759013282733, "grad_norm": 2.2569549083709717, "learning_rate": 1.0809672818002209e-05, "loss": 0.3858, "step": 2066 }, { "epoch": 0.9805502846299811, "grad_norm": 2.198728561401367, "learning_rate": 1.080201477090642e-05, "loss": 0.36, "step": 2067 }, { "epoch": 0.9810246679316889, "grad_norm": 2.281818389892578, "learning_rate": 1.079435625038925e-05, "loss": 0.3084, "step": 2068 }, { "epoch": 0.9814990512333965, "grad_norm": 2.4596810340881348, "learning_rate": 1.0786697260971449e-05, "loss": 0.3617, "step": 2069 }, { "epoch": 0.9819734345351043, "grad_norm": 2.167909622192383, "learning_rate": 1.0779037807174032e-05, "loss": 0.3021, "step": 2070 }, { "epoch": 0.9824478178368121, "grad_norm": 1.8683514595031738, "learning_rate": 1.0771377893518314e-05, "loss": 0.2457, "step": 2071 }, { "epoch": 0.9829222011385199, "grad_norm": 2.5630767345428467, "learning_rate": 1.0763717524525854e-05, "loss": 0.3483, "step": 2072 }, { "epoch": 0.9833965844402277, "grad_norm": 2.544394016265869, "learning_rate": 1.0756056704718498e-05, "loss": 0.3901, "step": 2073 }, { "epoch": 0.9838709677419355, "grad_norm": 2.037989377975464, "learning_rate": 1.0748395438618353e-05, "loss": 0.3368, "step": 2074 }, { "epoch": 0.9843453510436433, "grad_norm": 1.889345407485962, "learning_rate": 1.074073373074778e-05, "loss": 0.2303, "step": 2075 }, { "epoch": 0.9848197343453511, "grad_norm": 2.1819489002227783, "learning_rate": 1.0733071585629423e-05, "loss": 0.31, "step": 2076 }, { "epoch": 0.9852941176470589, "grad_norm": 2.640437126159668, "learning_rate": 1.0725409007786161e-05, "loss": 0.4647, "step": 2077 }, { "epoch": 0.9857685009487666, "grad_norm": 2.5758280754089355, "learning_rate": 1.0717746001741139e-05, "loss": 0.3326, "step": 2078 }, { "epoch": 0.9862428842504743, "grad_norm": 1.8008846044540405, "learning_rate": 1.0710082572017753e-05, "loss": 0.2491, "step": 2079 }, { "epoch": 0.9867172675521821, "grad_norm": 2.1113157272338867, "learning_rate": 1.0702418723139654e-05, "loss": 0.3475, "step": 2080 }, { "epoch": 0.9871916508538899, "grad_norm": 2.860546827316284, "learning_rate": 1.0694754459630732e-05, "loss": 0.3269, "step": 2081 }, { "epoch": 0.9876660341555977, "grad_norm": 2.7859246730804443, "learning_rate": 1.0687089786015126e-05, "loss": 0.3732, "step": 2082 }, { "epoch": 0.9881404174573055, "grad_norm": 2.2017409801483154, "learning_rate": 1.0679424706817221e-05, "loss": 0.3555, "step": 2083 }, { "epoch": 0.9886148007590133, "grad_norm": 1.9931174516677856, "learning_rate": 1.0671759226561631e-05, "loss": 0.3141, "step": 2084 }, { "epoch": 0.9890891840607211, "grad_norm": 2.228123664855957, "learning_rate": 1.0664093349773222e-05, "loss": 0.3647, "step": 2085 }, { "epoch": 0.9895635673624289, "grad_norm": 2.191070795059204, "learning_rate": 1.0656427080977077e-05, "loss": 0.3575, "step": 2086 }, { "epoch": 0.9900379506641366, "grad_norm": 2.1509921550750732, "learning_rate": 1.0648760424698521e-05, "loss": 0.3169, "step": 2087 }, { "epoch": 0.9905123339658444, "grad_norm": 1.9772475957870483, "learning_rate": 1.0641093385463108e-05, "loss": 0.3271, "step": 2088 }, { "epoch": 0.9909867172675522, "grad_norm": 2.0178329944610596, "learning_rate": 1.0633425967796614e-05, "loss": 0.3091, "step": 2089 }, { "epoch": 0.99146110056926, "grad_norm": 3.558840751647949, "learning_rate": 1.0625758176225038e-05, "loss": 0.3339, "step": 2090 }, { "epoch": 0.9919354838709677, "grad_norm": 2.146780014038086, "learning_rate": 1.0618090015274603e-05, "loss": 0.298, "step": 2091 }, { "epoch": 0.9924098671726755, "grad_norm": 2.24959659576416, "learning_rate": 1.0610421489471748e-05, "loss": 0.383, "step": 2092 }, { "epoch": 0.9928842504743833, "grad_norm": 2.0573537349700928, "learning_rate": 1.0602752603343127e-05, "loss": 0.3116, "step": 2093 }, { "epoch": 0.9933586337760911, "grad_norm": 2.7625508308410645, "learning_rate": 1.059508336141561e-05, "loss": 0.3006, "step": 2094 }, { "epoch": 0.9938330170777988, "grad_norm": 2.205559492111206, "learning_rate": 1.0587413768216273e-05, "loss": 0.3381, "step": 2095 }, { "epoch": 0.9943074003795066, "grad_norm": 2.6509761810302734, "learning_rate": 1.05797438282724e-05, "loss": 0.3785, "step": 2096 }, { "epoch": 0.9947817836812144, "grad_norm": 2.5124902725219727, "learning_rate": 1.0572073546111485e-05, "loss": 0.3773, "step": 2097 }, { "epoch": 0.9952561669829222, "grad_norm": 2.2544424533843994, "learning_rate": 1.0564402926261216e-05, "loss": 0.3657, "step": 2098 }, { "epoch": 0.99573055028463, "grad_norm": 1.9105042219161987, "learning_rate": 1.0556731973249486e-05, "loss": 0.2686, "step": 2099 }, { "epoch": 0.9962049335863378, "grad_norm": 1.9611090421676636, "learning_rate": 1.0549060691604376e-05, "loss": 0.3119, "step": 2100 }, { "epoch": 0.9966793168880456, "grad_norm": 2.110646963119507, "learning_rate": 1.0541389085854177e-05, "loss": 0.3229, "step": 2101 }, { "epoch": 0.9971537001897534, "grad_norm": 2.017954111099243, "learning_rate": 1.0533717160527357e-05, "loss": 0.2829, "step": 2102 }, { "epoch": 0.9976280834914611, "grad_norm": 1.9769190549850464, "learning_rate": 1.0526044920152578e-05, "loss": 0.2776, "step": 2103 }, { "epoch": 0.9981024667931688, "grad_norm": 2.1241579055786133, "learning_rate": 1.051837236925869e-05, "loss": 0.3468, "step": 2104 }, { "epoch": 0.9985768500948766, "grad_norm": 2.512434720993042, "learning_rate": 1.051069951237472e-05, "loss": 0.2764, "step": 2105 }, { "epoch": 0.9990512333965844, "grad_norm": 2.3992347717285156, "learning_rate": 1.0503026354029882e-05, "loss": 0.3268, "step": 2106 }, { "epoch": 0.9995256166982922, "grad_norm": 2.2171542644500732, "learning_rate": 1.0495352898753563e-05, "loss": 0.329, "step": 2107 }, { "epoch": 1.0, "grad_norm": 1.9980618953704834, "learning_rate": 1.0487679151075332e-05, "loss": 0.3509, "step": 2108 }, { "epoch": 1.0004743833017078, "grad_norm": 2.0853278636932373, "learning_rate": 1.0480005115524923e-05, "loss": 0.2764, "step": 2109 }, { "epoch": 1.0009487666034156, "grad_norm": 1.904092788696289, "learning_rate": 1.0472330796632244e-05, "loss": 0.26, "step": 2110 }, { "epoch": 1.0014231499051234, "grad_norm": 2.1483476161956787, "learning_rate": 1.0464656198927373e-05, "loss": 0.2206, "step": 2111 }, { "epoch": 1.0018975332068312, "grad_norm": 1.696155309677124, "learning_rate": 1.0456981326940541e-05, "loss": 0.1763, "step": 2112 }, { "epoch": 1.002371916508539, "grad_norm": 1.9953058958053589, "learning_rate": 1.0449306185202155e-05, "loss": 0.1922, "step": 2113 }, { "epoch": 1.0028462998102468, "grad_norm": 1.8557651042938232, "learning_rate": 1.0441630778242775e-05, "loss": 0.2421, "step": 2114 }, { "epoch": 1.0033206831119545, "grad_norm": 1.7121390104293823, "learning_rate": 1.0433955110593115e-05, "loss": 0.2016, "step": 2115 }, { "epoch": 1.0037950664136623, "grad_norm": 1.799052357673645, "learning_rate": 1.042627918678405e-05, "loss": 0.1668, "step": 2116 }, { "epoch": 1.00426944971537, "grad_norm": 1.8632630109786987, "learning_rate": 1.04186030113466e-05, "loss": 0.2101, "step": 2117 }, { "epoch": 1.0047438330170777, "grad_norm": 2.4004900455474854, "learning_rate": 1.0410926588811931e-05, "loss": 0.2088, "step": 2118 }, { "epoch": 1.0052182163187855, "grad_norm": 1.4884165525436401, "learning_rate": 1.0403249923711365e-05, "loss": 0.1716, "step": 2119 }, { "epoch": 1.0056925996204933, "grad_norm": 1.6962333917617798, "learning_rate": 1.0395573020576357e-05, "loss": 0.1693, "step": 2120 }, { "epoch": 1.006166982922201, "grad_norm": 1.9333925247192383, "learning_rate": 1.038789588393851e-05, "loss": 0.1512, "step": 2121 }, { "epoch": 1.0066413662239089, "grad_norm": 2.6267874240875244, "learning_rate": 1.0380218518329564e-05, "loss": 0.2462, "step": 2122 }, { "epoch": 1.0071157495256167, "grad_norm": 1.9119246006011963, "learning_rate": 1.037254092828139e-05, "loss": 0.1687, "step": 2123 }, { "epoch": 1.0075901328273245, "grad_norm": 2.043225049972534, "learning_rate": 1.0364863118325988e-05, "loss": 0.225, "step": 2124 }, { "epoch": 1.0080645161290323, "grad_norm": 2.4549620151519775, "learning_rate": 1.0357185092995499e-05, "loss": 0.2173, "step": 2125 }, { "epoch": 1.00853889943074, "grad_norm": 1.932741403579712, "learning_rate": 1.0349506856822184e-05, "loss": 0.2033, "step": 2126 }, { "epoch": 1.0090132827324478, "grad_norm": 2.0784912109375, "learning_rate": 1.0341828414338431e-05, "loss": 0.1974, "step": 2127 }, { "epoch": 1.0094876660341556, "grad_norm": 1.9300061464309692, "learning_rate": 1.0334149770076747e-05, "loss": 0.1846, "step": 2128 }, { "epoch": 1.0099620493358634, "grad_norm": 1.8968406915664673, "learning_rate": 1.0326470928569758e-05, "loss": 0.2126, "step": 2129 }, { "epoch": 1.0104364326375712, "grad_norm": 2.385936975479126, "learning_rate": 1.0318791894350217e-05, "loss": 0.2034, "step": 2130 }, { "epoch": 1.010910815939279, "grad_norm": 2.1592063903808594, "learning_rate": 1.0311112671950969e-05, "loss": 0.2947, "step": 2131 }, { "epoch": 1.0113851992409868, "grad_norm": 1.7248077392578125, "learning_rate": 1.030343326590499e-05, "loss": 0.158, "step": 2132 }, { "epoch": 1.0118595825426946, "grad_norm": 2.433410167694092, "learning_rate": 1.0295753680745359e-05, "loss": 0.226, "step": 2133 }, { "epoch": 1.0123339658444024, "grad_norm": 1.5119600296020508, "learning_rate": 1.0288073921005258e-05, "loss": 0.1824, "step": 2134 }, { "epoch": 1.01280834914611, "grad_norm": 1.8853365182876587, "learning_rate": 1.028039399121797e-05, "loss": 0.2013, "step": 2135 }, { "epoch": 1.0132827324478177, "grad_norm": 1.7288734912872314, "learning_rate": 1.0272713895916884e-05, "loss": 0.1868, "step": 2136 }, { "epoch": 1.0137571157495255, "grad_norm": 1.6384727954864502, "learning_rate": 1.0265033639635483e-05, "loss": 0.178, "step": 2137 }, { "epoch": 1.0142314990512333, "grad_norm": 1.747512936592102, "learning_rate": 1.0257353226907349e-05, "loss": 0.1839, "step": 2138 }, { "epoch": 1.0147058823529411, "grad_norm": 2.3721799850463867, "learning_rate": 1.0249672662266148e-05, "loss": 0.1944, "step": 2139 }, { "epoch": 1.015180265654649, "grad_norm": 2.0248069763183594, "learning_rate": 1.0241991950245648e-05, "loss": 0.2256, "step": 2140 }, { "epoch": 1.0156546489563567, "grad_norm": 2.1707992553710938, "learning_rate": 1.0234311095379694e-05, "loss": 0.1732, "step": 2141 }, { "epoch": 1.0161290322580645, "grad_norm": 2.033766746520996, "learning_rate": 1.0226630102202221e-05, "loss": 0.1923, "step": 2142 }, { "epoch": 1.0166034155597723, "grad_norm": 1.76777184009552, "learning_rate": 1.0218948975247238e-05, "loss": 0.1628, "step": 2143 }, { "epoch": 1.01707779886148, "grad_norm": 1.886703372001648, "learning_rate": 1.021126771904884e-05, "loss": 0.1511, "step": 2144 }, { "epoch": 1.0175521821631879, "grad_norm": 1.8403862714767456, "learning_rate": 1.0203586338141202e-05, "loss": 0.1826, "step": 2145 }, { "epoch": 1.0180265654648957, "grad_norm": 2.151848793029785, "learning_rate": 1.0195904837058563e-05, "loss": 0.2328, "step": 2146 }, { "epoch": 1.0185009487666035, "grad_norm": 1.8675678968429565, "learning_rate": 1.0188223220335238e-05, "loss": 0.1969, "step": 2147 }, { "epoch": 1.0189753320683113, "grad_norm": 2.2719991207122803, "learning_rate": 1.0180541492505605e-05, "loss": 0.2024, "step": 2148 }, { "epoch": 1.019449715370019, "grad_norm": 3.2726147174835205, "learning_rate": 1.0172859658104117e-05, "loss": 0.2026, "step": 2149 }, { "epoch": 1.0199240986717268, "grad_norm": 1.7774338722229004, "learning_rate": 1.0165177721665284e-05, "loss": 0.1555, "step": 2150 }, { "epoch": 1.0203984819734346, "grad_norm": 1.7717763185501099, "learning_rate": 1.0157495687723675e-05, "loss": 0.1982, "step": 2151 }, { "epoch": 1.0208728652751424, "grad_norm": 1.9706634283065796, "learning_rate": 1.0149813560813924e-05, "loss": 0.1834, "step": 2152 }, { "epoch": 1.02134724857685, "grad_norm": 1.683834433555603, "learning_rate": 1.0142131345470704e-05, "loss": 0.1814, "step": 2153 }, { "epoch": 1.0218216318785578, "grad_norm": 1.992972731590271, "learning_rate": 1.0134449046228764e-05, "loss": 0.2124, "step": 2154 }, { "epoch": 1.0222960151802656, "grad_norm": 1.861265778541565, "learning_rate": 1.0126766667622878e-05, "loss": 0.2043, "step": 2155 }, { "epoch": 1.0227703984819734, "grad_norm": 2.0793933868408203, "learning_rate": 1.0119084214187882e-05, "loss": 0.2147, "step": 2156 }, { "epoch": 1.0232447817836812, "grad_norm": 1.6746715307235718, "learning_rate": 1.0111401690458655e-05, "loss": 0.2066, "step": 2157 }, { "epoch": 1.023719165085389, "grad_norm": 1.7119837999343872, "learning_rate": 1.0103719100970115e-05, "loss": 0.161, "step": 2158 }, { "epoch": 1.0241935483870968, "grad_norm": 2.014586925506592, "learning_rate": 1.0096036450257214e-05, "loss": 0.1854, "step": 2159 }, { "epoch": 1.0246679316888045, "grad_norm": 1.4165558815002441, "learning_rate": 1.0088353742854943e-05, "loss": 0.1476, "step": 2160 }, { "epoch": 1.0251423149905123, "grad_norm": 1.4397838115692139, "learning_rate": 1.0080670983298335e-05, "loss": 0.1388, "step": 2161 }, { "epoch": 1.0256166982922201, "grad_norm": 2.0614216327667236, "learning_rate": 1.0072988176122445e-05, "loss": 0.2106, "step": 2162 }, { "epoch": 1.026091081593928, "grad_norm": 2.1988351345062256, "learning_rate": 1.0065305325862354e-05, "loss": 0.1489, "step": 2163 }, { "epoch": 1.0265654648956357, "grad_norm": 1.5647263526916504, "learning_rate": 1.0057622437053178e-05, "loss": 0.1348, "step": 2164 }, { "epoch": 1.0270398481973435, "grad_norm": 1.8337029218673706, "learning_rate": 1.0049939514230045e-05, "loss": 0.1559, "step": 2165 }, { "epoch": 1.0275142314990513, "grad_norm": 1.588962435722351, "learning_rate": 1.0042256561928115e-05, "loss": 0.1418, "step": 2166 }, { "epoch": 1.027988614800759, "grad_norm": 1.8072073459625244, "learning_rate": 1.003457358468255e-05, "loss": 0.1858, "step": 2167 }, { "epoch": 1.0284629981024669, "grad_norm": 2.0580496788024902, "learning_rate": 1.0026890587028544e-05, "loss": 0.2082, "step": 2168 }, { "epoch": 1.0289373814041747, "grad_norm": 2.101830244064331, "learning_rate": 1.0019207573501287e-05, "loss": 0.2083, "step": 2169 }, { "epoch": 1.0294117647058822, "grad_norm": 1.8399953842163086, "learning_rate": 1.001152454863599e-05, "loss": 0.1713, "step": 2170 }, { "epoch": 1.02988614800759, "grad_norm": 1.6996338367462158, "learning_rate": 1.000384151696787e-05, "loss": 0.1551, "step": 2171 }, { "epoch": 1.0303605313092978, "grad_norm": 1.8706146478652954, "learning_rate": 9.996158483032137e-06, "loss": 0.2203, "step": 2172 }, { "epoch": 1.0308349146110056, "grad_norm": 2.099688768386841, "learning_rate": 9.988475451364013e-06, "loss": 0.2117, "step": 2173 }, { "epoch": 1.0313092979127134, "grad_norm": 2.176435947418213, "learning_rate": 9.980792426498717e-06, "loss": 0.2196, "step": 2174 }, { "epoch": 1.0317836812144212, "grad_norm": 2.1011524200439453, "learning_rate": 9.97310941297146e-06, "loss": 0.1895, "step": 2175 }, { "epoch": 1.032258064516129, "grad_norm": 1.816577434539795, "learning_rate": 9.965426415317451e-06, "loss": 0.1829, "step": 2176 }, { "epoch": 1.0327324478178368, "grad_norm": 2.274826765060425, "learning_rate": 9.95774343807189e-06, "loss": 0.167, "step": 2177 }, { "epoch": 1.0332068311195446, "grad_norm": 1.8276028633117676, "learning_rate": 9.950060485769958e-06, "loss": 0.1813, "step": 2178 }, { "epoch": 1.0336812144212524, "grad_norm": 1.7824639081954956, "learning_rate": 9.942377562946825e-06, "loss": 0.1577, "step": 2179 }, { "epoch": 1.0341555977229602, "grad_norm": 1.696703314781189, "learning_rate": 9.934694674137648e-06, "loss": 0.1729, "step": 2180 }, { "epoch": 1.034629981024668, "grad_norm": 1.851335048675537, "learning_rate": 9.927011823877559e-06, "loss": 0.2252, "step": 2181 }, { "epoch": 1.0351043643263758, "grad_norm": 1.9400562047958374, "learning_rate": 9.919329016701668e-06, "loss": 0.2241, "step": 2182 }, { "epoch": 1.0355787476280836, "grad_norm": 1.9708709716796875, "learning_rate": 9.91164625714506e-06, "loss": 0.2549, "step": 2183 }, { "epoch": 1.0360531309297913, "grad_norm": 1.7907429933547974, "learning_rate": 9.90396354974279e-06, "loss": 0.1807, "step": 2184 }, { "epoch": 1.0365275142314991, "grad_norm": 1.606693148612976, "learning_rate": 9.896280899029887e-06, "loss": 0.1502, "step": 2185 }, { "epoch": 1.037001897533207, "grad_norm": 2.3423125743865967, "learning_rate": 9.888598309541347e-06, "loss": 0.1645, "step": 2186 }, { "epoch": 1.0374762808349147, "grad_norm": 1.9627526998519897, "learning_rate": 9.88091578581212e-06, "loss": 0.1701, "step": 2187 }, { "epoch": 1.0379506641366223, "grad_norm": 1.9357240200042725, "learning_rate": 9.873233332377125e-06, "loss": 0.2047, "step": 2188 }, { "epoch": 1.03842504743833, "grad_norm": 2.164787530899048, "learning_rate": 9.865550953771237e-06, "loss": 0.2643, "step": 2189 }, { "epoch": 1.0388994307400379, "grad_norm": 2.26877498626709, "learning_rate": 9.857868654529296e-06, "loss": 0.2058, "step": 2190 }, { "epoch": 1.0393738140417457, "grad_norm": 1.9010010957717896, "learning_rate": 9.850186439186083e-06, "loss": 0.2138, "step": 2191 }, { "epoch": 1.0398481973434535, "grad_norm": 1.56143057346344, "learning_rate": 9.842504312276326e-06, "loss": 0.1677, "step": 2192 }, { "epoch": 1.0403225806451613, "grad_norm": 1.7175490856170654, "learning_rate": 9.834822278334718e-06, "loss": 0.2035, "step": 2193 }, { "epoch": 1.040796963946869, "grad_norm": 1.5723693370819092, "learning_rate": 9.827140341895885e-06, "loss": 0.1521, "step": 2194 }, { "epoch": 1.0412713472485768, "grad_norm": 1.6594966650009155, "learning_rate": 9.819458507494395e-06, "loss": 0.2055, "step": 2195 }, { "epoch": 1.0417457305502846, "grad_norm": 2.062875747680664, "learning_rate": 9.811776779664767e-06, "loss": 0.1937, "step": 2196 }, { "epoch": 1.0422201138519924, "grad_norm": 4.114375591278076, "learning_rate": 9.804095162941439e-06, "loss": 0.3022, "step": 2197 }, { "epoch": 1.0426944971537002, "grad_norm": 2.6715476512908936, "learning_rate": 9.7964136618588e-06, "loss": 0.2538, "step": 2198 }, { "epoch": 1.043168880455408, "grad_norm": 2.0732991695404053, "learning_rate": 9.788732280951158e-06, "loss": 0.2196, "step": 2199 }, { "epoch": 1.0436432637571158, "grad_norm": 2.420642375946045, "learning_rate": 9.781051024752762e-06, "loss": 0.2222, "step": 2200 }, { "epoch": 1.0441176470588236, "grad_norm": 1.8388843536376953, "learning_rate": 9.773369897797784e-06, "loss": 0.2209, "step": 2201 }, { "epoch": 1.0445920303605314, "grad_norm": 1.8891000747680664, "learning_rate": 9.76568890462031e-06, "loss": 0.1758, "step": 2202 }, { "epoch": 1.0450664136622392, "grad_norm": 1.7700624465942383, "learning_rate": 9.758008049754353e-06, "loss": 0.1815, "step": 2203 }, { "epoch": 1.045540796963947, "grad_norm": 2.6104068756103516, "learning_rate": 9.750327337733852e-06, "loss": 0.2194, "step": 2204 }, { "epoch": 1.0460151802656545, "grad_norm": 1.9325884580612183, "learning_rate": 9.742646773092656e-06, "loss": 0.1886, "step": 2205 }, { "epoch": 1.0464895635673623, "grad_norm": 2.4706602096557617, "learning_rate": 9.734966360364519e-06, "loss": 0.2789, "step": 2206 }, { "epoch": 1.0469639468690701, "grad_norm": 2.077040195465088, "learning_rate": 9.72728610408312e-06, "loss": 0.2351, "step": 2207 }, { "epoch": 1.047438330170778, "grad_norm": 1.9420323371887207, "learning_rate": 9.719606008782031e-06, "loss": 0.2085, "step": 2208 }, { "epoch": 1.0479127134724857, "grad_norm": 1.9051023721694946, "learning_rate": 9.711926078994744e-06, "loss": 0.2164, "step": 2209 }, { "epoch": 1.0483870967741935, "grad_norm": 1.6898927688598633, "learning_rate": 9.704246319254643e-06, "loss": 0.1834, "step": 2210 }, { "epoch": 1.0488614800759013, "grad_norm": 1.6880271434783936, "learning_rate": 9.696566734095012e-06, "loss": 0.1787, "step": 2211 }, { "epoch": 1.049335863377609, "grad_norm": 1.8380780220031738, "learning_rate": 9.688887328049034e-06, "loss": 0.1836, "step": 2212 }, { "epoch": 1.0498102466793169, "grad_norm": 1.8687678575515747, "learning_rate": 9.681208105649786e-06, "loss": 0.2263, "step": 2213 }, { "epoch": 1.0502846299810247, "grad_norm": 2.063650608062744, "learning_rate": 9.673529071430242e-06, "loss": 0.1881, "step": 2214 }, { "epoch": 1.0507590132827325, "grad_norm": 2.072175979614258, "learning_rate": 9.665850229923258e-06, "loss": 0.1931, "step": 2215 }, { "epoch": 1.0512333965844403, "grad_norm": 2.0182647705078125, "learning_rate": 9.658171585661572e-06, "loss": 0.2125, "step": 2216 }, { "epoch": 1.051707779886148, "grad_norm": 1.7542519569396973, "learning_rate": 9.650493143177817e-06, "loss": 0.1789, "step": 2217 }, { "epoch": 1.0521821631878558, "grad_norm": 1.796041488647461, "learning_rate": 9.642814907004505e-06, "loss": 0.1936, "step": 2218 }, { "epoch": 1.0526565464895636, "grad_norm": 2.394157648086548, "learning_rate": 9.635136881674013e-06, "loss": 0.2391, "step": 2219 }, { "epoch": 1.0531309297912714, "grad_norm": 2.1615726947784424, "learning_rate": 9.627459071718617e-06, "loss": 0.2023, "step": 2220 }, { "epoch": 1.0536053130929792, "grad_norm": 2.2835464477539062, "learning_rate": 9.619781481670437e-06, "loss": 0.2261, "step": 2221 }, { "epoch": 1.054079696394687, "grad_norm": 1.822164535522461, "learning_rate": 9.612104116061491e-06, "loss": 0.2148, "step": 2222 }, { "epoch": 1.0545540796963946, "grad_norm": 1.8478764295578003, "learning_rate": 9.604426979423645e-06, "loss": 0.1656, "step": 2223 }, { "epoch": 1.0550284629981024, "grad_norm": 2.215240001678467, "learning_rate": 9.596750076288642e-06, "loss": 0.2309, "step": 2224 }, { "epoch": 1.0555028462998102, "grad_norm": 1.7701280117034912, "learning_rate": 9.589073411188074e-06, "loss": 0.2403, "step": 2225 }, { "epoch": 1.055977229601518, "grad_norm": 1.9893231391906738, "learning_rate": 9.581396988653404e-06, "loss": 0.2326, "step": 2226 }, { "epoch": 1.0564516129032258, "grad_norm": 1.8492276668548584, "learning_rate": 9.573720813215954e-06, "loss": 0.1864, "step": 2227 }, { "epoch": 1.0569259962049335, "grad_norm": 1.7757240533828735, "learning_rate": 9.566044889406885e-06, "loss": 0.2256, "step": 2228 }, { "epoch": 1.0574003795066413, "grad_norm": 1.9123115539550781, "learning_rate": 9.558369221757229e-06, "loss": 0.2317, "step": 2229 }, { "epoch": 1.0578747628083491, "grad_norm": 2.3356640338897705, "learning_rate": 9.550693814797847e-06, "loss": 0.2255, "step": 2230 }, { "epoch": 1.058349146110057, "grad_norm": 1.911953330039978, "learning_rate": 9.543018673059462e-06, "loss": 0.192, "step": 2231 }, { "epoch": 1.0588235294117647, "grad_norm": 2.0029706954956055, "learning_rate": 9.53534380107263e-06, "loss": 0.2193, "step": 2232 }, { "epoch": 1.0592979127134725, "grad_norm": 1.5258057117462158, "learning_rate": 9.527669203367756e-06, "loss": 0.1815, "step": 2233 }, { "epoch": 1.0597722960151803, "grad_norm": 1.8638172149658203, "learning_rate": 9.51999488447508e-06, "loss": 0.1792, "step": 2234 }, { "epoch": 1.060246679316888, "grad_norm": 2.022976875305176, "learning_rate": 9.512320848924672e-06, "loss": 0.2382, "step": 2235 }, { "epoch": 1.060721062618596, "grad_norm": 1.830183744430542, "learning_rate": 9.504647101246438e-06, "loss": 0.207, "step": 2236 }, { "epoch": 1.0611954459203037, "grad_norm": 1.8549655675888062, "learning_rate": 9.49697364597012e-06, "loss": 0.2103, "step": 2237 }, { "epoch": 1.0616698292220115, "grad_norm": 2.056288719177246, "learning_rate": 9.489300487625283e-06, "loss": 0.2282, "step": 2238 }, { "epoch": 1.0621442125237193, "grad_norm": 1.720819354057312, "learning_rate": 9.481627630741315e-06, "loss": 0.1979, "step": 2239 }, { "epoch": 1.0626185958254268, "grad_norm": 1.9592931270599365, "learning_rate": 9.473955079847426e-06, "loss": 0.2263, "step": 2240 }, { "epoch": 1.0630929791271346, "grad_norm": 1.7493445873260498, "learning_rate": 9.466282839472645e-06, "loss": 0.1799, "step": 2241 }, { "epoch": 1.0635673624288424, "grad_norm": 2.2582602500915527, "learning_rate": 9.458610914145826e-06, "loss": 0.1552, "step": 2242 }, { "epoch": 1.0640417457305502, "grad_norm": 1.6923972368240356, "learning_rate": 9.450939308395629e-06, "loss": 0.1921, "step": 2243 }, { "epoch": 1.064516129032258, "grad_norm": 1.8049591779708862, "learning_rate": 9.443268026750521e-06, "loss": 0.2018, "step": 2244 }, { "epoch": 1.0649905123339658, "grad_norm": 1.8883256912231445, "learning_rate": 9.435597073738787e-06, "loss": 0.1884, "step": 2245 }, { "epoch": 1.0654648956356736, "grad_norm": 2.0094857215881348, "learning_rate": 9.427926453888518e-06, "loss": 0.2109, "step": 2246 }, { "epoch": 1.0659392789373814, "grad_norm": 1.975786566734314, "learning_rate": 9.4202561717276e-06, "loss": 0.1851, "step": 2247 }, { "epoch": 1.0664136622390892, "grad_norm": 1.8108450174331665, "learning_rate": 9.41258623178373e-06, "loss": 0.1696, "step": 2248 }, { "epoch": 1.066888045540797, "grad_norm": 2.1892595291137695, "learning_rate": 9.404916638584394e-06, "loss": 0.1848, "step": 2249 }, { "epoch": 1.0673624288425048, "grad_norm": 2.6319334506988525, "learning_rate": 9.397247396656875e-06, "loss": 0.217, "step": 2250 }, { "epoch": 1.0678368121442126, "grad_norm": 1.9110881090164185, "learning_rate": 9.389578510528256e-06, "loss": 0.1634, "step": 2251 }, { "epoch": 1.0683111954459203, "grad_norm": 1.8657715320587158, "learning_rate": 9.381909984725399e-06, "loss": 0.1685, "step": 2252 }, { "epoch": 1.0687855787476281, "grad_norm": 1.8046596050262451, "learning_rate": 9.374241823774967e-06, "loss": 0.1784, "step": 2253 }, { "epoch": 1.069259962049336, "grad_norm": 1.8186533451080322, "learning_rate": 9.36657403220339e-06, "loss": 0.1822, "step": 2254 }, { "epoch": 1.0697343453510437, "grad_norm": 2.0945181846618652, "learning_rate": 9.358906614536895e-06, "loss": 0.2373, "step": 2255 }, { "epoch": 1.0702087286527515, "grad_norm": 2.1683075428009033, "learning_rate": 9.351239575301479e-06, "loss": 0.2144, "step": 2256 }, { "epoch": 1.0706831119544593, "grad_norm": 1.9169727563858032, "learning_rate": 9.343572919022924e-06, "loss": 0.2189, "step": 2257 }, { "epoch": 1.0711574952561669, "grad_norm": 1.7258111238479614, "learning_rate": 9.335906650226783e-06, "loss": 0.1937, "step": 2258 }, { "epoch": 1.0716318785578747, "grad_norm": 2.077392816543579, "learning_rate": 9.328240773438372e-06, "loss": 0.1898, "step": 2259 }, { "epoch": 1.0721062618595825, "grad_norm": 1.833132028579712, "learning_rate": 9.320575293182782e-06, "loss": 0.2103, "step": 2260 }, { "epoch": 1.0725806451612903, "grad_norm": 1.7154309749603271, "learning_rate": 9.312910213984876e-06, "loss": 0.1733, "step": 2261 }, { "epoch": 1.073055028462998, "grad_norm": 1.8700604438781738, "learning_rate": 9.305245540369273e-06, "loss": 0.2193, "step": 2262 }, { "epoch": 1.0735294117647058, "grad_norm": 1.6208839416503906, "learning_rate": 9.297581276860353e-06, "loss": 0.1841, "step": 2263 }, { "epoch": 1.0740037950664136, "grad_norm": 1.4968012571334839, "learning_rate": 9.28991742798225e-06, "loss": 0.16, "step": 2264 }, { "epoch": 1.0744781783681214, "grad_norm": 1.8390696048736572, "learning_rate": 9.282253998258865e-06, "loss": 0.2012, "step": 2265 }, { "epoch": 1.0749525616698292, "grad_norm": 1.7694889307022095, "learning_rate": 9.274590992213844e-06, "loss": 0.1982, "step": 2266 }, { "epoch": 1.075426944971537, "grad_norm": 1.4749209880828857, "learning_rate": 9.26692841437058e-06, "loss": 0.1581, "step": 2267 }, { "epoch": 1.0759013282732448, "grad_norm": 1.6622833013534546, "learning_rate": 9.259266269252221e-06, "loss": 0.1919, "step": 2268 }, { "epoch": 1.0763757115749526, "grad_norm": 1.8222259283065796, "learning_rate": 9.25160456138165e-06, "loss": 0.1752, "step": 2269 }, { "epoch": 1.0768500948766604, "grad_norm": 2.1095898151397705, "learning_rate": 9.243943295281505e-06, "loss": 0.1706, "step": 2270 }, { "epoch": 1.0773244781783682, "grad_norm": 1.9356722831726074, "learning_rate": 9.236282475474146e-06, "loss": 0.2063, "step": 2271 }, { "epoch": 1.077798861480076, "grad_norm": 2.4067156314849854, "learning_rate": 9.228622106481691e-06, "loss": 0.188, "step": 2272 }, { "epoch": 1.0782732447817838, "grad_norm": 3.1411545276641846, "learning_rate": 9.22096219282597e-06, "loss": 0.2492, "step": 2273 }, { "epoch": 1.0787476280834916, "grad_norm": 1.8042513132095337, "learning_rate": 9.213302739028555e-06, "loss": 0.1861, "step": 2274 }, { "epoch": 1.0792220113851991, "grad_norm": 1.792336106300354, "learning_rate": 9.205643749610751e-06, "loss": 0.1982, "step": 2275 }, { "epoch": 1.079696394686907, "grad_norm": 3.020674228668213, "learning_rate": 9.19798522909358e-06, "loss": 0.1797, "step": 2276 }, { "epoch": 1.0801707779886147, "grad_norm": 1.7047805786132812, "learning_rate": 9.190327181997796e-06, "loss": 0.1772, "step": 2277 }, { "epoch": 1.0806451612903225, "grad_norm": 2.018634080886841, "learning_rate": 9.182669612843861e-06, "loss": 0.1895, "step": 2278 }, { "epoch": 1.0811195445920303, "grad_norm": 1.8824384212493896, "learning_rate": 9.175012526151968e-06, "loss": 0.1718, "step": 2279 }, { "epoch": 1.081593927893738, "grad_norm": 1.511277437210083, "learning_rate": 9.167355926442013e-06, "loss": 0.1604, "step": 2280 }, { "epoch": 1.0820683111954459, "grad_norm": 1.670013427734375, "learning_rate": 9.15969981823362e-06, "loss": 0.1973, "step": 2281 }, { "epoch": 1.0825426944971537, "grad_norm": 1.8747272491455078, "learning_rate": 9.152044206046106e-06, "loss": 0.1461, "step": 2282 }, { "epoch": 1.0830170777988615, "grad_norm": 1.7594618797302246, "learning_rate": 9.144389094398508e-06, "loss": 0.1442, "step": 2283 }, { "epoch": 1.0834914611005693, "grad_norm": 2.149158477783203, "learning_rate": 9.136734487809559e-06, "loss": 0.2489, "step": 2284 }, { "epoch": 1.083965844402277, "grad_norm": 1.746847152709961, "learning_rate": 9.129080390797699e-06, "loss": 0.1697, "step": 2285 }, { "epoch": 1.0844402277039848, "grad_norm": 1.7937593460083008, "learning_rate": 9.12142680788107e-06, "loss": 0.166, "step": 2286 }, { "epoch": 1.0849146110056926, "grad_norm": 2.1846930980682373, "learning_rate": 9.113773743577502e-06, "loss": 0.1949, "step": 2287 }, { "epoch": 1.0853889943074004, "grad_norm": 2.1543169021606445, "learning_rate": 9.106121202404521e-06, "loss": 0.2145, "step": 2288 }, { "epoch": 1.0858633776091082, "grad_norm": 1.6700936555862427, "learning_rate": 9.098469188879348e-06, "loss": 0.164, "step": 2289 }, { "epoch": 1.086337760910816, "grad_norm": 1.5210555791854858, "learning_rate": 9.090817707518893e-06, "loss": 0.1588, "step": 2290 }, { "epoch": 1.0868121442125238, "grad_norm": 1.8209552764892578, "learning_rate": 9.083166762839751e-06, "loss": 0.2126, "step": 2291 }, { "epoch": 1.0872865275142316, "grad_norm": 1.954764485359192, "learning_rate": 9.075516359358195e-06, "loss": 0.1862, "step": 2292 }, { "epoch": 1.0877609108159392, "grad_norm": 1.4143950939178467, "learning_rate": 9.067866501590182e-06, "loss": 0.1454, "step": 2293 }, { "epoch": 1.088235294117647, "grad_norm": 2.105008363723755, "learning_rate": 9.06021719405135e-06, "loss": 0.2585, "step": 2294 }, { "epoch": 1.0887096774193548, "grad_norm": 2.2465155124664307, "learning_rate": 9.05256844125701e-06, "loss": 0.2133, "step": 2295 }, { "epoch": 1.0891840607210626, "grad_norm": 2.175455331802368, "learning_rate": 9.044920247722146e-06, "loss": 0.2389, "step": 2296 }, { "epoch": 1.0896584440227703, "grad_norm": 1.8581993579864502, "learning_rate": 9.037272617961405e-06, "loss": 0.2035, "step": 2297 }, { "epoch": 1.0901328273244781, "grad_norm": 1.7983685731887817, "learning_rate": 9.02962555648911e-06, "loss": 0.2225, "step": 2298 }, { "epoch": 1.090607210626186, "grad_norm": 1.98671293258667, "learning_rate": 9.021979067819252e-06, "loss": 0.2191, "step": 2299 }, { "epoch": 1.0910815939278937, "grad_norm": 1.4009077548980713, "learning_rate": 9.014333156465467e-06, "loss": 0.1482, "step": 2300 }, { "epoch": 1.0915559772296015, "grad_norm": 1.6995691061019897, "learning_rate": 9.00668782694107e-06, "loss": 0.1738, "step": 2301 }, { "epoch": 1.0920303605313093, "grad_norm": 1.6954809427261353, "learning_rate": 8.999043083759016e-06, "loss": 0.1835, "step": 2302 }, { "epoch": 1.092504743833017, "grad_norm": 1.936194896697998, "learning_rate": 8.99139893143193e-06, "loss": 0.2194, "step": 2303 }, { "epoch": 1.092979127134725, "grad_norm": 1.9008138179779053, "learning_rate": 8.983755374472069e-06, "loss": 0.2544, "step": 2304 }, { "epoch": 1.0934535104364327, "grad_norm": 1.8978139162063599, "learning_rate": 8.976112417391358e-06, "loss": 0.2117, "step": 2305 }, { "epoch": 1.0939278937381405, "grad_norm": 1.6767897605895996, "learning_rate": 8.968470064701354e-06, "loss": 0.1708, "step": 2306 }, { "epoch": 1.0944022770398483, "grad_norm": 2.310898780822754, "learning_rate": 8.960828320913263e-06, "loss": 0.2313, "step": 2307 }, { "epoch": 1.094876660341556, "grad_norm": 1.91811203956604, "learning_rate": 8.953187190537929e-06, "loss": 0.1858, "step": 2308 }, { "epoch": 1.0953510436432639, "grad_norm": 1.711073398590088, "learning_rate": 8.945546678085838e-06, "loss": 0.183, "step": 2309 }, { "epoch": 1.0958254269449714, "grad_norm": 1.898019552230835, "learning_rate": 8.937906788067114e-06, "loss": 0.1654, "step": 2310 }, { "epoch": 1.0962998102466792, "grad_norm": 1.8163573741912842, "learning_rate": 8.9302675249915e-06, "loss": 0.1461, "step": 2311 }, { "epoch": 1.096774193548387, "grad_norm": 2.2861268520355225, "learning_rate": 8.922628893368378e-06, "loss": 0.2169, "step": 2312 }, { "epoch": 1.0972485768500948, "grad_norm": 1.6889331340789795, "learning_rate": 8.91499089770676e-06, "loss": 0.1876, "step": 2313 }, { "epoch": 1.0977229601518026, "grad_norm": 1.7860161066055298, "learning_rate": 8.90735354251528e-06, "loss": 0.1808, "step": 2314 }, { "epoch": 1.0981973434535104, "grad_norm": 1.4944920539855957, "learning_rate": 8.899716832302193e-06, "loss": 0.142, "step": 2315 }, { "epoch": 1.0986717267552182, "grad_norm": 2.013535737991333, "learning_rate": 8.89208077157537e-06, "loss": 0.2055, "step": 2316 }, { "epoch": 1.099146110056926, "grad_norm": 1.811508059501648, "learning_rate": 8.884445364842304e-06, "loss": 0.199, "step": 2317 }, { "epoch": 1.0996204933586338, "grad_norm": 2.4114105701446533, "learning_rate": 8.8768106166101e-06, "loss": 0.2022, "step": 2318 }, { "epoch": 1.1000948766603416, "grad_norm": 2.2680139541625977, "learning_rate": 8.869176531385476e-06, "loss": 0.2317, "step": 2319 }, { "epoch": 1.1005692599620494, "grad_norm": 1.6140555143356323, "learning_rate": 8.861543113674758e-06, "loss": 0.1574, "step": 2320 }, { "epoch": 1.1010436432637571, "grad_norm": 1.9578766822814941, "learning_rate": 8.853910367983871e-06, "loss": 0.1904, "step": 2321 }, { "epoch": 1.101518026565465, "grad_norm": 2.459045886993408, "learning_rate": 8.846278298818352e-06, "loss": 0.1852, "step": 2322 }, { "epoch": 1.1019924098671727, "grad_norm": 1.5891822576522827, "learning_rate": 8.838646910683338e-06, "loss": 0.1727, "step": 2323 }, { "epoch": 1.1024667931688805, "grad_norm": 1.923692226409912, "learning_rate": 8.831016208083563e-06, "loss": 0.1874, "step": 2324 }, { "epoch": 1.1029411764705883, "grad_norm": 1.6137306690216064, "learning_rate": 8.82338619552335e-06, "loss": 0.1609, "step": 2325 }, { "epoch": 1.103415559772296, "grad_norm": 2.0651705265045166, "learning_rate": 8.815756877506622e-06, "loss": 0.1954, "step": 2326 }, { "epoch": 1.103889943074004, "grad_norm": 1.705381155014038, "learning_rate": 8.808128258536893e-06, "loss": 0.2241, "step": 2327 }, { "epoch": 1.1043643263757117, "grad_norm": 1.756785273551941, "learning_rate": 8.800500343117255e-06, "loss": 0.1729, "step": 2328 }, { "epoch": 1.1048387096774193, "grad_norm": 2.024177312850952, "learning_rate": 8.7928731357504e-06, "loss": 0.223, "step": 2329 }, { "epoch": 1.105313092979127, "grad_norm": 1.7107657194137573, "learning_rate": 8.785246640938584e-06, "loss": 0.1882, "step": 2330 }, { "epoch": 1.1057874762808348, "grad_norm": 2.2373104095458984, "learning_rate": 8.777620863183658e-06, "loss": 0.2527, "step": 2331 }, { "epoch": 1.1062618595825426, "grad_norm": 1.823649525642395, "learning_rate": 8.769995806987037e-06, "loss": 0.2164, "step": 2332 }, { "epoch": 1.1067362428842504, "grad_norm": 1.7617485523223877, "learning_rate": 8.762371476849722e-06, "loss": 0.1942, "step": 2333 }, { "epoch": 1.1072106261859582, "grad_norm": 1.7329638004302979, "learning_rate": 8.754747877272279e-06, "loss": 0.1631, "step": 2334 }, { "epoch": 1.107685009487666, "grad_norm": 1.7115880250930786, "learning_rate": 8.747125012754839e-06, "loss": 0.174, "step": 2335 }, { "epoch": 1.1081593927893738, "grad_norm": 1.9890222549438477, "learning_rate": 8.739502887797108e-06, "loss": 0.1825, "step": 2336 }, { "epoch": 1.1086337760910816, "grad_norm": 1.9389824867248535, "learning_rate": 8.731881506898348e-06, "loss": 0.1788, "step": 2337 }, { "epoch": 1.1091081593927894, "grad_norm": 1.6447457075119019, "learning_rate": 8.724260874557384e-06, "loss": 0.1562, "step": 2338 }, { "epoch": 1.1095825426944972, "grad_norm": 1.9594886302947998, "learning_rate": 8.716640995272607e-06, "loss": 0.2101, "step": 2339 }, { "epoch": 1.110056925996205, "grad_norm": 1.9339709281921387, "learning_rate": 8.70902187354195e-06, "loss": 0.2179, "step": 2340 }, { "epoch": 1.1105313092979128, "grad_norm": 1.7604994773864746, "learning_rate": 8.701403513862901e-06, "loss": 0.1864, "step": 2341 }, { "epoch": 1.1110056925996206, "grad_norm": 1.8655658960342407, "learning_rate": 8.69378592073251e-06, "loss": 0.1842, "step": 2342 }, { "epoch": 1.1114800759013284, "grad_norm": 1.4417341947555542, "learning_rate": 8.68616909864737e-06, "loss": 0.1523, "step": 2343 }, { "epoch": 1.1119544592030361, "grad_norm": 1.832533836364746, "learning_rate": 8.678553052103605e-06, "loss": 0.1856, "step": 2344 }, { "epoch": 1.1124288425047437, "grad_norm": 2.599240303039551, "learning_rate": 8.670937785596897e-06, "loss": 0.1896, "step": 2345 }, { "epoch": 1.1129032258064515, "grad_norm": 1.5967415571212769, "learning_rate": 8.663323303622462e-06, "loss": 0.1631, "step": 2346 }, { "epoch": 1.1133776091081593, "grad_norm": 1.8945375680923462, "learning_rate": 8.655709610675056e-06, "loss": 0.1659, "step": 2347 }, { "epoch": 1.113851992409867, "grad_norm": 2.1004648208618164, "learning_rate": 8.648096711248967e-06, "loss": 0.2273, "step": 2348 }, { "epoch": 1.114326375711575, "grad_norm": 1.748862624168396, "learning_rate": 8.640484609838007e-06, "loss": 0.1857, "step": 2349 }, { "epoch": 1.1148007590132827, "grad_norm": 1.9971057176589966, "learning_rate": 8.632873310935528e-06, "loss": 0.1852, "step": 2350 }, { "epoch": 1.1152751423149905, "grad_norm": 1.7587655782699585, "learning_rate": 8.625262819034408e-06, "loss": 0.1937, "step": 2351 }, { "epoch": 1.1157495256166983, "grad_norm": 1.5481939315795898, "learning_rate": 8.61765313862704e-06, "loss": 0.1684, "step": 2352 }, { "epoch": 1.116223908918406, "grad_norm": 1.8051375150680542, "learning_rate": 8.610044274205352e-06, "loss": 0.2023, "step": 2353 }, { "epoch": 1.1166982922201139, "grad_norm": 2.1944355964660645, "learning_rate": 8.602436230260768e-06, "loss": 0.1854, "step": 2354 }, { "epoch": 1.1171726755218216, "grad_norm": 2.0533533096313477, "learning_rate": 8.59482901128425e-06, "loss": 0.2236, "step": 2355 }, { "epoch": 1.1176470588235294, "grad_norm": 2.361926555633545, "learning_rate": 8.58722262176626e-06, "loss": 0.2015, "step": 2356 }, { "epoch": 1.1181214421252372, "grad_norm": 1.664730191230774, "learning_rate": 8.579617066196777e-06, "loss": 0.1824, "step": 2357 }, { "epoch": 1.118595825426945, "grad_norm": 1.7999298572540283, "learning_rate": 8.572012349065288e-06, "loss": 0.1814, "step": 2358 }, { "epoch": 1.1190702087286528, "grad_norm": 2.117183208465576, "learning_rate": 8.564408474860774e-06, "loss": 0.2215, "step": 2359 }, { "epoch": 1.1195445920303606, "grad_norm": 1.658808946609497, "learning_rate": 8.556805448071736e-06, "loss": 0.1568, "step": 2360 }, { "epoch": 1.1200189753320684, "grad_norm": 2.2438066005706787, "learning_rate": 8.549203273186156e-06, "loss": 0.1864, "step": 2361 }, { "epoch": 1.1204933586337762, "grad_norm": 1.6112077236175537, "learning_rate": 8.541601954691534e-06, "loss": 0.162, "step": 2362 }, { "epoch": 1.120967741935484, "grad_norm": 1.4597880840301514, "learning_rate": 8.534001497074842e-06, "loss": 0.1589, "step": 2363 }, { "epoch": 1.1214421252371916, "grad_norm": 1.9932173490524292, "learning_rate": 8.52640190482256e-06, "loss": 0.2256, "step": 2364 }, { "epoch": 1.1219165085388993, "grad_norm": 1.9947072267532349, "learning_rate": 8.518803182420651e-06, "loss": 0.2217, "step": 2365 }, { "epoch": 1.1223908918406071, "grad_norm": 1.7771036624908447, "learning_rate": 8.511205334354566e-06, "loss": 0.1938, "step": 2366 }, { "epoch": 1.122865275142315, "grad_norm": 1.7289372682571411, "learning_rate": 8.503608365109247e-06, "loss": 0.1559, "step": 2367 }, { "epoch": 1.1233396584440227, "grad_norm": 1.8934062719345093, "learning_rate": 8.496012279169097e-06, "loss": 0.1868, "step": 2368 }, { "epoch": 1.1238140417457305, "grad_norm": 1.762656569480896, "learning_rate": 8.488417081018015e-06, "loss": 0.1465, "step": 2369 }, { "epoch": 1.1242884250474383, "grad_norm": 1.8933993577957153, "learning_rate": 8.480822775139371e-06, "loss": 0.2391, "step": 2370 }, { "epoch": 1.124762808349146, "grad_norm": 1.5721724033355713, "learning_rate": 8.473229366016014e-06, "loss": 0.1591, "step": 2371 }, { "epoch": 1.125237191650854, "grad_norm": 1.9659096002578735, "learning_rate": 8.465636858130255e-06, "loss": 0.1897, "step": 2372 }, { "epoch": 1.1257115749525617, "grad_norm": 2.4556431770324707, "learning_rate": 8.45804525596387e-06, "loss": 0.2287, "step": 2373 }, { "epoch": 1.1261859582542695, "grad_norm": 1.9868581295013428, "learning_rate": 8.450454563998117e-06, "loss": 0.2112, "step": 2374 }, { "epoch": 1.1266603415559773, "grad_norm": 1.849770188331604, "learning_rate": 8.4428647867137e-06, "loss": 0.1841, "step": 2375 }, { "epoch": 1.127134724857685, "grad_norm": 1.5524128675460815, "learning_rate": 8.435275928590789e-06, "loss": 0.1456, "step": 2376 }, { "epoch": 1.1276091081593929, "grad_norm": 1.8452872037887573, "learning_rate": 8.427687994109017e-06, "loss": 0.1729, "step": 2377 }, { "epoch": 1.1280834914611007, "grad_norm": 2.2624077796936035, "learning_rate": 8.42010098774746e-06, "loss": 0.2093, "step": 2378 }, { "epoch": 1.1285578747628084, "grad_norm": 1.6121922731399536, "learning_rate": 8.412514913984657e-06, "loss": 0.157, "step": 2379 }, { "epoch": 1.129032258064516, "grad_norm": 1.7655012607574463, "learning_rate": 8.404929777298592e-06, "loss": 0.1697, "step": 2380 }, { "epoch": 1.1295066413662238, "grad_norm": 1.811368703842163, "learning_rate": 8.3973455821667e-06, "loss": 0.1537, "step": 2381 }, { "epoch": 1.1299810246679316, "grad_norm": 1.8110166788101196, "learning_rate": 8.389762333065847e-06, "loss": 0.2188, "step": 2382 }, { "epoch": 1.1304554079696394, "grad_norm": 1.7781898975372314, "learning_rate": 8.382180034472353e-06, "loss": 0.2, "step": 2383 }, { "epoch": 1.1309297912713472, "grad_norm": 1.581636905670166, "learning_rate": 8.374598690861978e-06, "loss": 0.1607, "step": 2384 }, { "epoch": 1.131404174573055, "grad_norm": 2.6846189498901367, "learning_rate": 8.367018306709913e-06, "loss": 0.1924, "step": 2385 }, { "epoch": 1.1318785578747628, "grad_norm": 2.4494681358337402, "learning_rate": 8.359438886490783e-06, "loss": 0.2339, "step": 2386 }, { "epoch": 1.1323529411764706, "grad_norm": 1.944273591041565, "learning_rate": 8.351860434678641e-06, "loss": 0.186, "step": 2387 }, { "epoch": 1.1328273244781784, "grad_norm": 1.9705138206481934, "learning_rate": 8.344282955746978e-06, "loss": 0.186, "step": 2388 }, { "epoch": 1.1333017077798861, "grad_norm": 1.6467750072479248, "learning_rate": 8.336706454168701e-06, "loss": 0.1791, "step": 2389 }, { "epoch": 1.133776091081594, "grad_norm": 1.7570585012435913, "learning_rate": 8.329130934416142e-06, "loss": 0.1567, "step": 2390 }, { "epoch": 1.1342504743833017, "grad_norm": 1.4821341037750244, "learning_rate": 8.321556400961067e-06, "loss": 0.1465, "step": 2391 }, { "epoch": 1.1347248576850095, "grad_norm": 2.2223289012908936, "learning_rate": 8.313982858274634e-06, "loss": 0.2004, "step": 2392 }, { "epoch": 1.1351992409867173, "grad_norm": 1.931081771850586, "learning_rate": 8.306410310827435e-06, "loss": 0.1863, "step": 2393 }, { "epoch": 1.135673624288425, "grad_norm": 1.7010200023651123, "learning_rate": 8.298838763089471e-06, "loss": 0.1667, "step": 2394 }, { "epoch": 1.136148007590133, "grad_norm": 2.051765203475952, "learning_rate": 8.291268219530153e-06, "loss": 0.1978, "step": 2395 }, { "epoch": 1.1366223908918407, "grad_norm": 1.797257900238037, "learning_rate": 8.2836986846183e-06, "loss": 0.1652, "step": 2396 }, { "epoch": 1.1370967741935485, "grad_norm": 1.760908603668213, "learning_rate": 8.276130162822124e-06, "loss": 0.2005, "step": 2397 }, { "epoch": 1.1375711574952563, "grad_norm": 1.7077192068099976, "learning_rate": 8.268562658609254e-06, "loss": 0.2053, "step": 2398 }, { "epoch": 1.1380455407969639, "grad_norm": 2.186999797821045, "learning_rate": 8.260996176446716e-06, "loss": 0.2174, "step": 2399 }, { "epoch": 1.1385199240986716, "grad_norm": 1.788298487663269, "learning_rate": 8.253430720800928e-06, "loss": 0.1705, "step": 2400 }, { "epoch": 1.1389943074003794, "grad_norm": 2.1071958541870117, "learning_rate": 8.245866296137701e-06, "loss": 0.1797, "step": 2401 }, { "epoch": 1.1394686907020872, "grad_norm": 1.7165755033493042, "learning_rate": 8.238302906922242e-06, "loss": 0.2184, "step": 2402 }, { "epoch": 1.139943074003795, "grad_norm": 1.7405943870544434, "learning_rate": 8.230740557619142e-06, "loss": 0.1772, "step": 2403 }, { "epoch": 1.1404174573055028, "grad_norm": 2.0307819843292236, "learning_rate": 8.223179252692385e-06, "loss": 0.2016, "step": 2404 }, { "epoch": 1.1408918406072106, "grad_norm": 1.6938095092773438, "learning_rate": 8.215618996605336e-06, "loss": 0.1615, "step": 2405 }, { "epoch": 1.1413662239089184, "grad_norm": 1.7445240020751953, "learning_rate": 8.208059793820731e-06, "loss": 0.2164, "step": 2406 }, { "epoch": 1.1418406072106262, "grad_norm": 2.3769569396972656, "learning_rate": 8.200501648800698e-06, "loss": 0.1711, "step": 2407 }, { "epoch": 1.142314990512334, "grad_norm": 1.518480658531189, "learning_rate": 8.192944566006737e-06, "loss": 0.1561, "step": 2408 }, { "epoch": 1.1427893738140418, "grad_norm": 1.9066368341445923, "learning_rate": 8.185388549899715e-06, "loss": 0.2031, "step": 2409 }, { "epoch": 1.1432637571157496, "grad_norm": 2.3878681659698486, "learning_rate": 8.17783360493988e-06, "loss": 0.2288, "step": 2410 }, { "epoch": 1.1437381404174574, "grad_norm": 2.5900661945343018, "learning_rate": 8.170279735586833e-06, "loss": 0.1964, "step": 2411 }, { "epoch": 1.1442125237191652, "grad_norm": 1.9122105836868286, "learning_rate": 8.162726946299556e-06, "loss": 0.1934, "step": 2412 }, { "epoch": 1.144686907020873, "grad_norm": 2.024298667907715, "learning_rate": 8.155175241536377e-06, "loss": 0.2132, "step": 2413 }, { "epoch": 1.1451612903225807, "grad_norm": 1.898661732673645, "learning_rate": 8.147624625754999e-06, "loss": 0.2355, "step": 2414 }, { "epoch": 1.1456356736242883, "grad_norm": 1.5798265933990479, "learning_rate": 8.140075103412477e-06, "loss": 0.1495, "step": 2415 }, { "epoch": 1.146110056925996, "grad_norm": 1.6848498582839966, "learning_rate": 8.132526678965215e-06, "loss": 0.1671, "step": 2416 }, { "epoch": 1.146584440227704, "grad_norm": 1.576760172843933, "learning_rate": 8.124979356868976e-06, "loss": 0.1631, "step": 2417 }, { "epoch": 1.1470588235294117, "grad_norm": 2.446101427078247, "learning_rate": 8.117433141578865e-06, "loss": 0.253, "step": 2418 }, { "epoch": 1.1475332068311195, "grad_norm": 1.735353946685791, "learning_rate": 8.109888037549346e-06, "loss": 0.1542, "step": 2419 }, { "epoch": 1.1480075901328273, "grad_norm": 1.9680674076080322, "learning_rate": 8.102344049234213e-06, "loss": 0.2229, "step": 2420 }, { "epoch": 1.148481973434535, "grad_norm": 1.939218521118164, "learning_rate": 8.094801181086612e-06, "loss": 0.188, "step": 2421 }, { "epoch": 1.1489563567362429, "grad_norm": 1.6669310331344604, "learning_rate": 8.087259437559017e-06, "loss": 0.1843, "step": 2422 }, { "epoch": 1.1494307400379506, "grad_norm": 1.6978830099105835, "learning_rate": 8.079718823103251e-06, "loss": 0.1643, "step": 2423 }, { "epoch": 1.1499051233396584, "grad_norm": 1.4840154647827148, "learning_rate": 8.072179342170461e-06, "loss": 0.1328, "step": 2424 }, { "epoch": 1.1503795066413662, "grad_norm": 1.5778673887252808, "learning_rate": 8.06464099921113e-06, "loss": 0.1554, "step": 2425 }, { "epoch": 1.150853889943074, "grad_norm": 1.7717269659042358, "learning_rate": 8.057103798675063e-06, "loss": 0.1731, "step": 2426 }, { "epoch": 1.1513282732447818, "grad_norm": 1.752498984336853, "learning_rate": 8.0495677450114e-06, "loss": 0.1794, "step": 2427 }, { "epoch": 1.1518026565464896, "grad_norm": 1.9130120277404785, "learning_rate": 8.042032842668598e-06, "loss": 0.1863, "step": 2428 }, { "epoch": 1.1522770398481974, "grad_norm": 1.86540687084198, "learning_rate": 8.034499096094434e-06, "loss": 0.1823, "step": 2429 }, { "epoch": 1.1527514231499052, "grad_norm": 2.4312217235565186, "learning_rate": 8.026966509736001e-06, "loss": 0.1981, "step": 2430 }, { "epoch": 1.153225806451613, "grad_norm": 1.7719708681106567, "learning_rate": 8.019435088039714e-06, "loss": 0.1733, "step": 2431 }, { "epoch": 1.1537001897533208, "grad_norm": 2.0009617805480957, "learning_rate": 8.011904835451298e-06, "loss": 0.178, "step": 2432 }, { "epoch": 1.1541745730550286, "grad_norm": 2.0233676433563232, "learning_rate": 8.004375756415783e-06, "loss": 0.1689, "step": 2433 }, { "epoch": 1.1546489563567364, "grad_norm": 1.6531426906585693, "learning_rate": 7.996847855377514e-06, "loss": 0.1642, "step": 2434 }, { "epoch": 1.155123339658444, "grad_norm": 2.1204326152801514, "learning_rate": 7.989321136780131e-06, "loss": 0.1987, "step": 2435 }, { "epoch": 1.1555977229601517, "grad_norm": 1.7340539693832397, "learning_rate": 7.981795605066585e-06, "loss": 0.1949, "step": 2436 }, { "epoch": 1.1560721062618595, "grad_norm": 1.8401095867156982, "learning_rate": 7.974271264679122e-06, "loss": 0.1563, "step": 2437 }, { "epoch": 1.1565464895635673, "grad_norm": 1.637509822845459, "learning_rate": 7.966748120059286e-06, "loss": 0.1538, "step": 2438 }, { "epoch": 1.157020872865275, "grad_norm": 2.007936954498291, "learning_rate": 7.959226175647919e-06, "loss": 0.1921, "step": 2439 }, { "epoch": 1.157495256166983, "grad_norm": 1.9487900733947754, "learning_rate": 7.951705435885143e-06, "loss": 0.1798, "step": 2440 }, { "epoch": 1.1579696394686907, "grad_norm": 1.8263933658599854, "learning_rate": 7.944185905210377e-06, "loss": 0.1991, "step": 2441 }, { "epoch": 1.1584440227703985, "grad_norm": 2.152977466583252, "learning_rate": 7.93666758806233e-06, "loss": 0.1981, "step": 2442 }, { "epoch": 1.1589184060721063, "grad_norm": 1.8188046216964722, "learning_rate": 7.929150488878991e-06, "loss": 0.1388, "step": 2443 }, { "epoch": 1.159392789373814, "grad_norm": 1.7171995639801025, "learning_rate": 7.921634612097623e-06, "loss": 0.1689, "step": 2444 }, { "epoch": 1.1598671726755219, "grad_norm": 1.9360337257385254, "learning_rate": 7.914119962154779e-06, "loss": 0.1858, "step": 2445 }, { "epoch": 1.1603415559772297, "grad_norm": 1.8121434450149536, "learning_rate": 7.906606543486278e-06, "loss": 0.169, "step": 2446 }, { "epoch": 1.1608159392789374, "grad_norm": 2.7344167232513428, "learning_rate": 7.89909436052722e-06, "loss": 0.2118, "step": 2447 }, { "epoch": 1.1612903225806452, "grad_norm": 2.18343186378479, "learning_rate": 7.891583417711975e-06, "loss": 0.21, "step": 2448 }, { "epoch": 1.161764705882353, "grad_norm": 1.4035104513168335, "learning_rate": 7.884073719474174e-06, "loss": 0.152, "step": 2449 }, { "epoch": 1.1622390891840606, "grad_norm": 1.8150465488433838, "learning_rate": 7.876565270246715e-06, "loss": 0.1535, "step": 2450 }, { "epoch": 1.1627134724857684, "grad_norm": 1.967027187347412, "learning_rate": 7.869058074461766e-06, "loss": 0.1987, "step": 2451 }, { "epoch": 1.1631878557874762, "grad_norm": 1.906870722770691, "learning_rate": 7.86155213655075e-06, "loss": 0.1772, "step": 2452 }, { "epoch": 1.163662239089184, "grad_norm": 1.788644790649414, "learning_rate": 7.85404746094435e-06, "loss": 0.1803, "step": 2453 }, { "epoch": 1.1641366223908918, "grad_norm": 2.3639960289001465, "learning_rate": 7.846544052072494e-06, "loss": 0.1969, "step": 2454 }, { "epoch": 1.1646110056925996, "grad_norm": 1.8786990642547607, "learning_rate": 7.839041914364375e-06, "loss": 0.2137, "step": 2455 }, { "epoch": 1.1650853889943074, "grad_norm": 1.9279520511627197, "learning_rate": 7.831541052248433e-06, "loss": 0.1631, "step": 2456 }, { "epoch": 1.1655597722960152, "grad_norm": 1.9649664163589478, "learning_rate": 7.824041470152346e-06, "loss": 0.1736, "step": 2457 }, { "epoch": 1.166034155597723, "grad_norm": 2.580247402191162, "learning_rate": 7.816543172503052e-06, "loss": 0.2557, "step": 2458 }, { "epoch": 1.1665085388994307, "grad_norm": 1.8188077211380005, "learning_rate": 7.809046163726715e-06, "loss": 0.1678, "step": 2459 }, { "epoch": 1.1669829222011385, "grad_norm": 1.8238269090652466, "learning_rate": 7.801550448248746e-06, "loss": 0.2021, "step": 2460 }, { "epoch": 1.1674573055028463, "grad_norm": 1.9044618606567383, "learning_rate": 7.794056030493793e-06, "loss": 0.1528, "step": 2461 }, { "epoch": 1.1679316888045541, "grad_norm": 1.5531914234161377, "learning_rate": 7.78656291488574e-06, "loss": 0.1613, "step": 2462 }, { "epoch": 1.168406072106262, "grad_norm": 2.1100800037384033, "learning_rate": 7.779071105847692e-06, "loss": 0.2401, "step": 2463 }, { "epoch": 1.1688804554079697, "grad_norm": 1.6972509622573853, "learning_rate": 7.771580607801994e-06, "loss": 0.2089, "step": 2464 }, { "epoch": 1.1693548387096775, "grad_norm": 1.727967381477356, "learning_rate": 7.76409142517021e-06, "loss": 0.1693, "step": 2465 }, { "epoch": 1.1698292220113853, "grad_norm": 2.0903284549713135, "learning_rate": 7.756603562373134e-06, "loss": 0.1983, "step": 2466 }, { "epoch": 1.170303605313093, "grad_norm": 1.5043290853500366, "learning_rate": 7.749117023830779e-06, "loss": 0.1714, "step": 2467 }, { "epoch": 1.1707779886148009, "grad_norm": 1.7228398323059082, "learning_rate": 7.741631813962367e-06, "loss": 0.1587, "step": 2468 }, { "epoch": 1.1712523719165087, "grad_norm": 1.9864158630371094, "learning_rate": 7.73414793718635e-06, "loss": 0.1873, "step": 2469 }, { "epoch": 1.1717267552182162, "grad_norm": 1.7577637434005737, "learning_rate": 7.72666539792038e-06, "loss": 0.1599, "step": 2470 }, { "epoch": 1.172201138519924, "grad_norm": 1.9289382696151733, "learning_rate": 7.719184200581334e-06, "loss": 0.1762, "step": 2471 }, { "epoch": 1.1726755218216318, "grad_norm": 1.6133570671081543, "learning_rate": 7.711704349585287e-06, "loss": 0.1568, "step": 2472 }, { "epoch": 1.1731499051233396, "grad_norm": 1.882628083229065, "learning_rate": 7.704225849347517e-06, "loss": 0.1661, "step": 2473 }, { "epoch": 1.1736242884250474, "grad_norm": 2.2073872089385986, "learning_rate": 7.696748704282507e-06, "loss": 0.1734, "step": 2474 }, { "epoch": 1.1740986717267552, "grad_norm": 1.9427272081375122, "learning_rate": 7.689272918803946e-06, "loss": 0.1713, "step": 2475 }, { "epoch": 1.174573055028463, "grad_norm": 1.8138381242752075, "learning_rate": 7.681798497324717e-06, "loss": 0.1864, "step": 2476 }, { "epoch": 1.1750474383301708, "grad_norm": 2.078590154647827, "learning_rate": 7.674325444256899e-06, "loss": 0.1737, "step": 2477 }, { "epoch": 1.1755218216318786, "grad_norm": 2.3086998462677, "learning_rate": 7.666853764011752e-06, "loss": 0.2696, "step": 2478 }, { "epoch": 1.1759962049335864, "grad_norm": 2.107063055038452, "learning_rate": 7.659383460999742e-06, "loss": 0.1974, "step": 2479 }, { "epoch": 1.1764705882352942, "grad_norm": 1.7796244621276855, "learning_rate": 7.651914539630515e-06, "loss": 0.194, "step": 2480 }, { "epoch": 1.176944971537002, "grad_norm": 2.3119170665740967, "learning_rate": 7.644447004312903e-06, "loss": 0.2256, "step": 2481 }, { "epoch": 1.1774193548387097, "grad_norm": 1.837838053703308, "learning_rate": 7.63698085945491e-06, "loss": 0.1657, "step": 2482 }, { "epoch": 1.1778937381404175, "grad_norm": 1.628775954246521, "learning_rate": 7.629516109463732e-06, "loss": 0.1785, "step": 2483 }, { "epoch": 1.1783681214421253, "grad_norm": 1.6993663311004639, "learning_rate": 7.622052758745741e-06, "loss": 0.1829, "step": 2484 }, { "epoch": 1.178842504743833, "grad_norm": 2.071762800216675, "learning_rate": 7.614590811706473e-06, "loss": 0.2427, "step": 2485 }, { "epoch": 1.1793168880455407, "grad_norm": 1.9026820659637451, "learning_rate": 7.607130272750647e-06, "loss": 0.2126, "step": 2486 }, { "epoch": 1.1797912713472485, "grad_norm": 1.604719638824463, "learning_rate": 7.59967114628214e-06, "loss": 0.1347, "step": 2487 }, { "epoch": 1.1802656546489563, "grad_norm": 1.5981990098953247, "learning_rate": 7.592213436704004e-06, "loss": 0.1716, "step": 2488 }, { "epoch": 1.180740037950664, "grad_norm": 1.558847188949585, "learning_rate": 7.58475714841845e-06, "loss": 0.1558, "step": 2489 }, { "epoch": 1.1812144212523719, "grad_norm": 1.6636430025100708, "learning_rate": 7.577302285826851e-06, "loss": 0.1693, "step": 2490 }, { "epoch": 1.1816888045540797, "grad_norm": 1.62824285030365, "learning_rate": 7.569848853329742e-06, "loss": 0.1649, "step": 2491 }, { "epoch": 1.1821631878557874, "grad_norm": 1.6984463930130005, "learning_rate": 7.562396855326805e-06, "loss": 0.1345, "step": 2492 }, { "epoch": 1.1826375711574952, "grad_norm": 2.12964129447937, "learning_rate": 7.554946296216884e-06, "loss": 0.1844, "step": 2493 }, { "epoch": 1.183111954459203, "grad_norm": 1.8614979982376099, "learning_rate": 7.547497180397968e-06, "loss": 0.203, "step": 2494 }, { "epoch": 1.1835863377609108, "grad_norm": 1.7780121564865112, "learning_rate": 7.540049512267197e-06, "loss": 0.199, "step": 2495 }, { "epoch": 1.1840607210626186, "grad_norm": 2.061229705810547, "learning_rate": 7.532603296220862e-06, "loss": 0.2002, "step": 2496 }, { "epoch": 1.1845351043643264, "grad_norm": 1.7762293815612793, "learning_rate": 7.525158536654382e-06, "loss": 0.1714, "step": 2497 }, { "epoch": 1.1850094876660342, "grad_norm": 1.525127649307251, "learning_rate": 7.517715237962328e-06, "loss": 0.1557, "step": 2498 }, { "epoch": 1.185483870967742, "grad_norm": 1.9348129034042358, "learning_rate": 7.510273404538404e-06, "loss": 0.1971, "step": 2499 }, { "epoch": 1.1859582542694498, "grad_norm": 1.6467643976211548, "learning_rate": 7.502833040775457e-06, "loss": 0.1551, "step": 2500 }, { "epoch": 1.1864326375711576, "grad_norm": 1.4035449028015137, "learning_rate": 7.4953941510654535e-06, "loss": 0.1521, "step": 2501 }, { "epoch": 1.1869070208728654, "grad_norm": 1.7778912782669067, "learning_rate": 7.487956739799496e-06, "loss": 0.1788, "step": 2502 }, { "epoch": 1.1873814041745732, "grad_norm": 1.626543402671814, "learning_rate": 7.480520811367817e-06, "loss": 0.1382, "step": 2503 }, { "epoch": 1.187855787476281, "grad_norm": 2.147493362426758, "learning_rate": 7.473086370159776e-06, "loss": 0.2429, "step": 2504 }, { "epoch": 1.1883301707779885, "grad_norm": 1.7285178899765015, "learning_rate": 7.465653420563846e-06, "loss": 0.1818, "step": 2505 }, { "epoch": 1.1888045540796963, "grad_norm": 1.7863154411315918, "learning_rate": 7.45822196696762e-06, "loss": 0.1791, "step": 2506 }, { "epoch": 1.189278937381404, "grad_norm": 2.1906211376190186, "learning_rate": 7.4507920137578146e-06, "loss": 0.2307, "step": 2507 }, { "epoch": 1.189753320683112, "grad_norm": 3.06583309173584, "learning_rate": 7.443363565320259e-06, "loss": 0.1962, "step": 2508 }, { "epoch": 1.1902277039848197, "grad_norm": 1.7111692428588867, "learning_rate": 7.435936626039891e-06, "loss": 0.1744, "step": 2509 }, { "epoch": 1.1907020872865275, "grad_norm": 3.7655081748962402, "learning_rate": 7.428511200300765e-06, "loss": 0.2345, "step": 2510 }, { "epoch": 1.1911764705882353, "grad_norm": 2.3031506538391113, "learning_rate": 7.421087292486027e-06, "loss": 0.208, "step": 2511 }, { "epoch": 1.191650853889943, "grad_norm": 2.341597318649292, "learning_rate": 7.41366490697794e-06, "loss": 0.2249, "step": 2512 }, { "epoch": 1.1921252371916509, "grad_norm": 2.380882501602173, "learning_rate": 7.406244048157867e-06, "loss": 0.1982, "step": 2513 }, { "epoch": 1.1925996204933587, "grad_norm": 2.25535249710083, "learning_rate": 7.398824720406265e-06, "loss": 0.2163, "step": 2514 }, { "epoch": 1.1930740037950665, "grad_norm": 1.5992552042007446, "learning_rate": 7.391406928102695e-06, "loss": 0.1696, "step": 2515 }, { "epoch": 1.1935483870967742, "grad_norm": 1.729461908340454, "learning_rate": 7.383990675625797e-06, "loss": 0.1747, "step": 2516 }, { "epoch": 1.194022770398482, "grad_norm": 2.1388728618621826, "learning_rate": 7.37657596735332e-06, "loss": 0.1818, "step": 2517 }, { "epoch": 1.1944971537001898, "grad_norm": 1.8862483501434326, "learning_rate": 7.369162807662087e-06, "loss": 0.1946, "step": 2518 }, { "epoch": 1.1949715370018976, "grad_norm": 1.9275808334350586, "learning_rate": 7.36175120092802e-06, "loss": 0.2447, "step": 2519 }, { "epoch": 1.1954459203036052, "grad_norm": 2.302499294281006, "learning_rate": 7.354341151526107e-06, "loss": 0.1424, "step": 2520 }, { "epoch": 1.195920303605313, "grad_norm": 2.07778263092041, "learning_rate": 7.3469326638304365e-06, "loss": 0.2255, "step": 2521 }, { "epoch": 1.1963946869070208, "grad_norm": 1.843159794807434, "learning_rate": 7.339525742214157e-06, "loss": 0.1705, "step": 2522 }, { "epoch": 1.1968690702087286, "grad_norm": 2.0549871921539307, "learning_rate": 7.332120391049506e-06, "loss": 0.2071, "step": 2523 }, { "epoch": 1.1973434535104364, "grad_norm": 1.5816240310668945, "learning_rate": 7.324716614707794e-06, "loss": 0.1605, "step": 2524 }, { "epoch": 1.1978178368121442, "grad_norm": 1.4971600770950317, "learning_rate": 7.317314417559389e-06, "loss": 0.1392, "step": 2525 }, { "epoch": 1.198292220113852, "grad_norm": 1.5572508573532104, "learning_rate": 7.309913803973734e-06, "loss": 0.1518, "step": 2526 }, { "epoch": 1.1987666034155597, "grad_norm": 2.0520272254943848, "learning_rate": 7.302514778319341e-06, "loss": 0.2069, "step": 2527 }, { "epoch": 1.1992409867172675, "grad_norm": 1.7996068000793457, "learning_rate": 7.295117344963782e-06, "loss": 0.2041, "step": 2528 }, { "epoch": 1.1997153700189753, "grad_norm": 1.6315232515335083, "learning_rate": 7.287721508273691e-06, "loss": 0.1681, "step": 2529 }, { "epoch": 1.2001897533206831, "grad_norm": 1.8578884601593018, "learning_rate": 7.280327272614753e-06, "loss": 0.1671, "step": 2530 }, { "epoch": 1.200664136622391, "grad_norm": 1.7982726097106934, "learning_rate": 7.272934642351712e-06, "loss": 0.1849, "step": 2531 }, { "epoch": 1.2011385199240987, "grad_norm": 2.00294828414917, "learning_rate": 7.265543621848368e-06, "loss": 0.1644, "step": 2532 }, { "epoch": 1.2016129032258065, "grad_norm": 1.6262898445129395, "learning_rate": 7.2581542154675654e-06, "loss": 0.1612, "step": 2533 }, { "epoch": 1.2020872865275143, "grad_norm": 1.8584097623825073, "learning_rate": 7.2507664275712e-06, "loss": 0.1822, "step": 2534 }, { "epoch": 1.202561669829222, "grad_norm": 2.0192692279815674, "learning_rate": 7.243380262520203e-06, "loss": 0.2231, "step": 2535 }, { "epoch": 1.2030360531309299, "grad_norm": 2.2712655067443848, "learning_rate": 7.23599572467456e-06, "loss": 0.2046, "step": 2536 }, { "epoch": 1.2035104364326377, "grad_norm": 2.379418134689331, "learning_rate": 7.228612818393292e-06, "loss": 0.2186, "step": 2537 }, { "epoch": 1.2039848197343455, "grad_norm": 1.9519847631454468, "learning_rate": 7.221231548034451e-06, "loss": 0.1875, "step": 2538 }, { "epoch": 1.2044592030360532, "grad_norm": 1.7326889038085938, "learning_rate": 7.2138519179551335e-06, "loss": 0.153, "step": 2539 }, { "epoch": 1.2049335863377608, "grad_norm": 1.4056307077407837, "learning_rate": 7.206473932511455e-06, "loss": 0.1415, "step": 2540 }, { "epoch": 1.2054079696394686, "grad_norm": 1.8842997550964355, "learning_rate": 7.199097596058573e-06, "loss": 0.1948, "step": 2541 }, { "epoch": 1.2058823529411764, "grad_norm": 1.9680218696594238, "learning_rate": 7.1917229129506626e-06, "loss": 0.1902, "step": 2542 }, { "epoch": 1.2063567362428842, "grad_norm": 6.604353427886963, "learning_rate": 7.1843498875409315e-06, "loss": 0.1912, "step": 2543 }, { "epoch": 1.206831119544592, "grad_norm": 1.8714125156402588, "learning_rate": 7.176978524181595e-06, "loss": 0.1609, "step": 2544 }, { "epoch": 1.2073055028462998, "grad_norm": 1.6578891277313232, "learning_rate": 7.169608827223902e-06, "loss": 0.1785, "step": 2545 }, { "epoch": 1.2077798861480076, "grad_norm": 1.9563171863555908, "learning_rate": 7.16224080101811e-06, "loss": 0.2015, "step": 2546 }, { "epoch": 1.2082542694497154, "grad_norm": 1.5768934488296509, "learning_rate": 7.154874449913492e-06, "loss": 0.1484, "step": 2547 }, { "epoch": 1.2087286527514232, "grad_norm": 1.6119853258132935, "learning_rate": 7.147509778258334e-06, "loss": 0.2121, "step": 2548 }, { "epoch": 1.209203036053131, "grad_norm": 1.5381999015808105, "learning_rate": 7.1401467903999285e-06, "loss": 0.1711, "step": 2549 }, { "epoch": 1.2096774193548387, "grad_norm": 1.6348845958709717, "learning_rate": 7.1327854906845706e-06, "loss": 0.1632, "step": 2550 }, { "epoch": 1.2101518026565465, "grad_norm": 1.7592006921768188, "learning_rate": 7.125425883457564e-06, "loss": 0.1786, "step": 2551 }, { "epoch": 1.2106261859582543, "grad_norm": 1.6525636911392212, "learning_rate": 7.118067973063216e-06, "loss": 0.1704, "step": 2552 }, { "epoch": 1.2111005692599621, "grad_norm": 1.930760383605957, "learning_rate": 7.110711763844826e-06, "loss": 0.1789, "step": 2553 }, { "epoch": 1.21157495256167, "grad_norm": 1.6949514150619507, "learning_rate": 7.10335726014469e-06, "loss": 0.2061, "step": 2554 }, { "epoch": 1.2120493358633775, "grad_norm": 1.494983434677124, "learning_rate": 7.096004466304099e-06, "loss": 0.1317, "step": 2555 }, { "epoch": 1.2125237191650853, "grad_norm": 1.5275888442993164, "learning_rate": 7.088653386663335e-06, "loss": 0.1481, "step": 2556 }, { "epoch": 1.212998102466793, "grad_norm": 2.201962471008301, "learning_rate": 7.081304025561668e-06, "loss": 0.2149, "step": 2557 }, { "epoch": 1.2134724857685009, "grad_norm": 1.78719961643219, "learning_rate": 7.073956387337357e-06, "loss": 0.1635, "step": 2558 }, { "epoch": 1.2139468690702087, "grad_norm": 1.9768500328063965, "learning_rate": 7.066610476327632e-06, "loss": 0.1727, "step": 2559 }, { "epoch": 1.2144212523719164, "grad_norm": 2.0566301345825195, "learning_rate": 7.059266296868715e-06, "loss": 0.2103, "step": 2560 }, { "epoch": 1.2148956356736242, "grad_norm": 1.8030273914337158, "learning_rate": 7.051923853295805e-06, "loss": 0.1743, "step": 2561 }, { "epoch": 1.215370018975332, "grad_norm": 1.7541704177856445, "learning_rate": 7.044583149943076e-06, "loss": 0.191, "step": 2562 }, { "epoch": 1.2158444022770398, "grad_norm": 1.8001971244812012, "learning_rate": 7.037244191143662e-06, "loss": 0.1788, "step": 2563 }, { "epoch": 1.2163187855787476, "grad_norm": 1.620387315750122, "learning_rate": 7.029906981229686e-06, "loss": 0.1421, "step": 2564 }, { "epoch": 1.2167931688804554, "grad_norm": 1.8180466890335083, "learning_rate": 7.022571524532229e-06, "loss": 0.1778, "step": 2565 }, { "epoch": 1.2172675521821632, "grad_norm": 2.036728620529175, "learning_rate": 7.015237825381338e-06, "loss": 0.2076, "step": 2566 }, { "epoch": 1.217741935483871, "grad_norm": 1.8897072076797485, "learning_rate": 7.007905888106026e-06, "loss": 0.1871, "step": 2567 }, { "epoch": 1.2182163187855788, "grad_norm": 1.585066556930542, "learning_rate": 7.000575717034256e-06, "loss": 0.163, "step": 2568 }, { "epoch": 1.2186907020872866, "grad_norm": 1.9996800422668457, "learning_rate": 6.993247316492962e-06, "loss": 0.1774, "step": 2569 }, { "epoch": 1.2191650853889944, "grad_norm": 2.1412925720214844, "learning_rate": 6.985920690808021e-06, "loss": 0.1793, "step": 2570 }, { "epoch": 1.2196394686907022, "grad_norm": 1.8670066595077515, "learning_rate": 6.978595844304272e-06, "loss": 0.1692, "step": 2571 }, { "epoch": 1.22011385199241, "grad_norm": 1.7688610553741455, "learning_rate": 6.971272781305503e-06, "loss": 0.175, "step": 2572 }, { "epoch": 1.2205882352941178, "grad_norm": 1.6328133344650269, "learning_rate": 6.963951506134434e-06, "loss": 0.1546, "step": 2573 }, { "epoch": 1.2210626185958255, "grad_norm": 1.8396352529525757, "learning_rate": 6.956632023112751e-06, "loss": 0.1753, "step": 2574 }, { "epoch": 1.2215370018975331, "grad_norm": 1.9134464263916016, "learning_rate": 6.949314336561068e-06, "loss": 0.1688, "step": 2575 }, { "epoch": 1.222011385199241, "grad_norm": 1.5007470846176147, "learning_rate": 6.941998450798946e-06, "loss": 0.172, "step": 2576 }, { "epoch": 1.2224857685009487, "grad_norm": 1.7491124868392944, "learning_rate": 6.934684370144879e-06, "loss": 0.2002, "step": 2577 }, { "epoch": 1.2229601518026565, "grad_norm": 1.8310550451278687, "learning_rate": 6.927372098916294e-06, "loss": 0.1743, "step": 2578 }, { "epoch": 1.2234345351043643, "grad_norm": 1.72085702419281, "learning_rate": 6.9200616414295525e-06, "loss": 0.1635, "step": 2579 }, { "epoch": 1.223908918406072, "grad_norm": 1.723044514656067, "learning_rate": 6.912753001999948e-06, "loss": 0.1717, "step": 2580 }, { "epoch": 1.2243833017077799, "grad_norm": 1.7026135921478271, "learning_rate": 6.9054461849417e-06, "loss": 0.1621, "step": 2581 }, { "epoch": 1.2248576850094877, "grad_norm": 1.7875248193740845, "learning_rate": 6.898141194567944e-06, "loss": 0.1824, "step": 2582 }, { "epoch": 1.2253320683111955, "grad_norm": 1.8562390804290771, "learning_rate": 6.890838035190747e-06, "loss": 0.1861, "step": 2583 }, { "epoch": 1.2258064516129032, "grad_norm": 1.7400999069213867, "learning_rate": 6.88353671112109e-06, "loss": 0.189, "step": 2584 }, { "epoch": 1.226280834914611, "grad_norm": 1.8128386735916138, "learning_rate": 6.876237226668876e-06, "loss": 0.1825, "step": 2585 }, { "epoch": 1.2267552182163188, "grad_norm": 1.9619581699371338, "learning_rate": 6.868939586142917e-06, "loss": 0.191, "step": 2586 }, { "epoch": 1.2272296015180266, "grad_norm": 1.6164292097091675, "learning_rate": 6.861643793850934e-06, "loss": 0.1718, "step": 2587 }, { "epoch": 1.2277039848197344, "grad_norm": 1.7258304357528687, "learning_rate": 6.854349854099565e-06, "loss": 0.1814, "step": 2588 }, { "epoch": 1.2281783681214422, "grad_norm": 2.029466390609741, "learning_rate": 6.847057771194351e-06, "loss": 0.1718, "step": 2589 }, { "epoch": 1.2286527514231498, "grad_norm": 2.171515464782715, "learning_rate": 6.839767549439733e-06, "loss": 0.1829, "step": 2590 }, { "epoch": 1.2291271347248576, "grad_norm": 1.7947821617126465, "learning_rate": 6.832479193139062e-06, "loss": 0.1668, "step": 2591 }, { "epoch": 1.2296015180265654, "grad_norm": 1.5067108869552612, "learning_rate": 6.8251927065945755e-06, "loss": 0.1532, "step": 2592 }, { "epoch": 1.2300759013282732, "grad_norm": 1.7323521375656128, "learning_rate": 6.81790809410742e-06, "loss": 0.1754, "step": 2593 }, { "epoch": 1.230550284629981, "grad_norm": 1.8438886404037476, "learning_rate": 6.8106253599776275e-06, "loss": 0.173, "step": 2594 }, { "epoch": 1.2310246679316887, "grad_norm": 1.911361813545227, "learning_rate": 6.803344508504124e-06, "loss": 0.1942, "step": 2595 }, { "epoch": 1.2314990512333965, "grad_norm": 2.2166242599487305, "learning_rate": 6.796065543984729e-06, "loss": 0.1846, "step": 2596 }, { "epoch": 1.2319734345351043, "grad_norm": 1.8407729864120483, "learning_rate": 6.788788470716136e-06, "loss": 0.1958, "step": 2597 }, { "epoch": 1.2324478178368121, "grad_norm": 1.4400639533996582, "learning_rate": 6.781513292993933e-06, "loss": 0.1268, "step": 2598 }, { "epoch": 1.23292220113852, "grad_norm": 1.7160131931304932, "learning_rate": 6.774240015112583e-06, "loss": 0.1838, "step": 2599 }, { "epoch": 1.2333965844402277, "grad_norm": 1.9042142629623413, "learning_rate": 6.766968641365437e-06, "loss": 0.1965, "step": 2600 }, { "epoch": 1.2338709677419355, "grad_norm": 2.0009496212005615, "learning_rate": 6.759699176044705e-06, "loss": 0.1894, "step": 2601 }, { "epoch": 1.2343453510436433, "grad_norm": 1.4664678573608398, "learning_rate": 6.752431623441488e-06, "loss": 0.1356, "step": 2602 }, { "epoch": 1.234819734345351, "grad_norm": 2.177931547164917, "learning_rate": 6.745165987845744e-06, "loss": 0.1661, "step": 2603 }, { "epoch": 1.2352941176470589, "grad_norm": 1.4767887592315674, "learning_rate": 6.73790227354631e-06, "loss": 0.161, "step": 2604 }, { "epoch": 1.2357685009487667, "grad_norm": 1.9112201929092407, "learning_rate": 6.73064048483089e-06, "loss": 0.1683, "step": 2605 }, { "epoch": 1.2362428842504745, "grad_norm": 1.6860960721969604, "learning_rate": 6.7233806259860355e-06, "loss": 0.183, "step": 2606 }, { "epoch": 1.2367172675521823, "grad_norm": 1.788388967514038, "learning_rate": 6.716122701297173e-06, "loss": 0.1649, "step": 2607 }, { "epoch": 1.23719165085389, "grad_norm": 1.9071511030197144, "learning_rate": 6.708866715048586e-06, "loss": 0.1961, "step": 2608 }, { "epoch": 1.2376660341555978, "grad_norm": 1.8968346118927002, "learning_rate": 6.70161267152341e-06, "loss": 0.1793, "step": 2609 }, { "epoch": 1.2381404174573054, "grad_norm": 1.6861655712127686, "learning_rate": 6.694360575003637e-06, "loss": 0.145, "step": 2610 }, { "epoch": 1.2386148007590132, "grad_norm": 1.9473601579666138, "learning_rate": 6.687110429770103e-06, "loss": 0.1736, "step": 2611 }, { "epoch": 1.239089184060721, "grad_norm": 1.6562045812606812, "learning_rate": 6.679862240102499e-06, "loss": 0.1866, "step": 2612 }, { "epoch": 1.2395635673624288, "grad_norm": 1.8242155313491821, "learning_rate": 6.672616010279362e-06, "loss": 0.1956, "step": 2613 }, { "epoch": 1.2400379506641366, "grad_norm": 1.7770357131958008, "learning_rate": 6.6653717445780675e-06, "loss": 0.1963, "step": 2614 }, { "epoch": 1.2405123339658444, "grad_norm": 1.7528514862060547, "learning_rate": 6.658129447274838e-06, "loss": 0.176, "step": 2615 }, { "epoch": 1.2409867172675522, "grad_norm": 1.6112436056137085, "learning_rate": 6.650889122644723e-06, "loss": 0.1464, "step": 2616 }, { "epoch": 1.24146110056926, "grad_norm": 1.8748856782913208, "learning_rate": 6.6436507749616195e-06, "loss": 0.1667, "step": 2617 }, { "epoch": 1.2419354838709677, "grad_norm": 1.466968297958374, "learning_rate": 6.636414408498249e-06, "loss": 0.1546, "step": 2618 }, { "epoch": 1.2424098671726755, "grad_norm": 1.6490691900253296, "learning_rate": 6.629180027526174e-06, "loss": 0.1427, "step": 2619 }, { "epoch": 1.2428842504743833, "grad_norm": 1.4727083444595337, "learning_rate": 6.6219476363157685e-06, "loss": 0.145, "step": 2620 }, { "epoch": 1.2433586337760911, "grad_norm": 1.5241156816482544, "learning_rate": 6.614717239136246e-06, "loss": 0.1576, "step": 2621 }, { "epoch": 1.243833017077799, "grad_norm": 1.842430830001831, "learning_rate": 6.6074888402556405e-06, "loss": 0.1815, "step": 2622 }, { "epoch": 1.2443074003795067, "grad_norm": 2.016418933868408, "learning_rate": 6.600262443940799e-06, "loss": 0.227, "step": 2623 }, { "epoch": 1.2447817836812145, "grad_norm": 1.7062424421310425, "learning_rate": 6.593038054457402e-06, "loss": 0.1792, "step": 2624 }, { "epoch": 1.2452561669829223, "grad_norm": 1.7208788394927979, "learning_rate": 6.58581567606992e-06, "loss": 0.1858, "step": 2625 }, { "epoch": 1.2457305502846299, "grad_norm": 1.6579339504241943, "learning_rate": 6.578595313041665e-06, "loss": 0.1828, "step": 2626 }, { "epoch": 1.2462049335863377, "grad_norm": 1.732631802558899, "learning_rate": 6.571376969634738e-06, "loss": 0.1919, "step": 2627 }, { "epoch": 1.2466793168880455, "grad_norm": 1.7477668523788452, "learning_rate": 6.564160650110057e-06, "loss": 0.1894, "step": 2628 }, { "epoch": 1.2471537001897532, "grad_norm": 1.7273080348968506, "learning_rate": 6.556946358727349e-06, "loss": 0.194, "step": 2629 }, { "epoch": 1.247628083491461, "grad_norm": 1.438499927520752, "learning_rate": 6.5497340997451335e-06, "loss": 0.1533, "step": 2630 }, { "epoch": 1.2481024667931688, "grad_norm": 1.645349383354187, "learning_rate": 6.5425238774207345e-06, "loss": 0.1597, "step": 2631 }, { "epoch": 1.2485768500948766, "grad_norm": 2.0088460445404053, "learning_rate": 6.535315696010278e-06, "loss": 0.2085, "step": 2632 }, { "epoch": 1.2490512333965844, "grad_norm": 2.179716110229492, "learning_rate": 6.528109559768685e-06, "loss": 0.1696, "step": 2633 }, { "epoch": 1.2495256166982922, "grad_norm": 1.4562453031539917, "learning_rate": 6.520905472949664e-06, "loss": 0.145, "step": 2634 }, { "epoch": 1.25, "grad_norm": 1.8796703815460205, "learning_rate": 6.51370343980571e-06, "loss": 0.1897, "step": 2635 }, { "epoch": 1.2504743833017078, "grad_norm": 1.4412729740142822, "learning_rate": 6.506503464588119e-06, "loss": 0.1843, "step": 2636 }, { "epoch": 1.2509487666034156, "grad_norm": 1.632536768913269, "learning_rate": 6.499305551546964e-06, "loss": 0.1567, "step": 2637 }, { "epoch": 1.2514231499051234, "grad_norm": 1.5959841012954712, "learning_rate": 6.492109704931101e-06, "loss": 0.1611, "step": 2638 }, { "epoch": 1.2518975332068312, "grad_norm": 1.659533977508545, "learning_rate": 6.484915928988167e-06, "loss": 0.1776, "step": 2639 }, { "epoch": 1.252371916508539, "grad_norm": 2.8352959156036377, "learning_rate": 6.4777242279645754e-06, "loss": 0.1758, "step": 2640 }, { "epoch": 1.2528462998102468, "grad_norm": 1.6581546068191528, "learning_rate": 6.470534606105519e-06, "loss": 0.1767, "step": 2641 }, { "epoch": 1.2533206831119545, "grad_norm": 2.0722496509552, "learning_rate": 6.463347067654959e-06, "loss": 0.1892, "step": 2642 }, { "epoch": 1.2537950664136623, "grad_norm": 1.7481244802474976, "learning_rate": 6.456161616855631e-06, "loss": 0.1726, "step": 2643 }, { "epoch": 1.2542694497153701, "grad_norm": 2.0864098072052, "learning_rate": 6.448978257949028e-06, "loss": 0.2157, "step": 2644 }, { "epoch": 1.254743833017078, "grad_norm": 1.7608370780944824, "learning_rate": 6.441796995175422e-06, "loss": 0.1735, "step": 2645 }, { "epoch": 1.2552182163187855, "grad_norm": 1.9006822109222412, "learning_rate": 6.4346178327738405e-06, "loss": 0.1814, "step": 2646 }, { "epoch": 1.2556925996204933, "grad_norm": 1.6309512853622437, "learning_rate": 6.42744077498207e-06, "loss": 0.1562, "step": 2647 }, { "epoch": 1.256166982922201, "grad_norm": 1.9142560958862305, "learning_rate": 6.420265826036663e-06, "loss": 0.1902, "step": 2648 }, { "epoch": 1.2566413662239089, "grad_norm": 2.259213447570801, "learning_rate": 6.41309299017291e-06, "loss": 0.2249, "step": 2649 }, { "epoch": 1.2571157495256167, "grad_norm": 1.8461946249008179, "learning_rate": 6.405922271624874e-06, "loss": 0.1775, "step": 2650 }, { "epoch": 1.2575901328273245, "grad_norm": 2.4497241973876953, "learning_rate": 6.398753674625353e-06, "loss": 0.1998, "step": 2651 }, { "epoch": 1.2580645161290323, "grad_norm": 1.6050024032592773, "learning_rate": 6.391587203405903e-06, "loss": 0.1655, "step": 2652 }, { "epoch": 1.25853889943074, "grad_norm": 1.7544989585876465, "learning_rate": 6.384422862196824e-06, "loss": 0.1495, "step": 2653 }, { "epoch": 1.2590132827324478, "grad_norm": 1.697287917137146, "learning_rate": 6.37726065522715e-06, "loss": 0.1454, "step": 2654 }, { "epoch": 1.2594876660341556, "grad_norm": 3.1953728199005127, "learning_rate": 6.37010058672466e-06, "loss": 0.2283, "step": 2655 }, { "epoch": 1.2599620493358634, "grad_norm": 1.9299657344818115, "learning_rate": 6.362942660915875e-06, "loss": 0.2082, "step": 2656 }, { "epoch": 1.2604364326375712, "grad_norm": 1.5635534524917603, "learning_rate": 6.3557868820260495e-06, "loss": 0.1593, "step": 2657 }, { "epoch": 1.260910815939279, "grad_norm": 2.0182600021362305, "learning_rate": 6.348633254279166e-06, "loss": 0.1987, "step": 2658 }, { "epoch": 1.2613851992409868, "grad_norm": 1.7452116012573242, "learning_rate": 6.341481781897939e-06, "loss": 0.205, "step": 2659 }, { "epoch": 1.2618595825426944, "grad_norm": 2.041940450668335, "learning_rate": 6.334332469103814e-06, "loss": 0.2476, "step": 2660 }, { "epoch": 1.2623339658444022, "grad_norm": 1.419898509979248, "learning_rate": 6.3271853201169594e-06, "loss": 0.1227, "step": 2661 }, { "epoch": 1.26280834914611, "grad_norm": 1.834674596786499, "learning_rate": 6.320040339156267e-06, "loss": 0.1672, "step": 2662 }, { "epoch": 1.2632827324478177, "grad_norm": 1.685529351234436, "learning_rate": 6.312897530439348e-06, "loss": 0.1822, "step": 2663 }, { "epoch": 1.2637571157495255, "grad_norm": 1.5215164422988892, "learning_rate": 6.305756898182529e-06, "loss": 0.1613, "step": 2664 }, { "epoch": 1.2642314990512333, "grad_norm": 16.828428268432617, "learning_rate": 6.298618446600856e-06, "loss": 0.233, "step": 2665 }, { "epoch": 1.2647058823529411, "grad_norm": 2.161356210708618, "learning_rate": 6.29148217990809e-06, "loss": 0.1777, "step": 2666 }, { "epoch": 1.265180265654649, "grad_norm": 1.8170483112335205, "learning_rate": 6.2843481023166975e-06, "loss": 0.1898, "step": 2667 }, { "epoch": 1.2656546489563567, "grad_norm": 1.768537998199463, "learning_rate": 6.27721621803785e-06, "loss": 0.1702, "step": 2668 }, { "epoch": 1.2661290322580645, "grad_norm": 1.892989993095398, "learning_rate": 6.270086531281428e-06, "loss": 0.205, "step": 2669 }, { "epoch": 1.2666034155597723, "grad_norm": 1.9306186437606812, "learning_rate": 6.262959046256021e-06, "loss": 0.2097, "step": 2670 }, { "epoch": 1.26707779886148, "grad_norm": 1.8183523416519165, "learning_rate": 6.255833767168907e-06, "loss": 0.1883, "step": 2671 }, { "epoch": 1.2675521821631879, "grad_norm": 1.4449431896209717, "learning_rate": 6.248710698226074e-06, "loss": 0.1465, "step": 2672 }, { "epoch": 1.2680265654648957, "grad_norm": 1.5579372644424438, "learning_rate": 6.241589843632192e-06, "loss": 0.1579, "step": 2673 }, { "epoch": 1.2685009487666035, "grad_norm": 1.6067235469818115, "learning_rate": 6.234471207590636e-06, "loss": 0.1597, "step": 2674 }, { "epoch": 1.2689753320683113, "grad_norm": 1.6292030811309814, "learning_rate": 6.227354794303461e-06, "loss": 0.2124, "step": 2675 }, { "epoch": 1.269449715370019, "grad_norm": 1.8330110311508179, "learning_rate": 6.220240607971422e-06, "loss": 0.1691, "step": 2676 }, { "epoch": 1.2699240986717268, "grad_norm": 1.4492367506027222, "learning_rate": 6.213128652793952e-06, "loss": 0.1557, "step": 2677 }, { "epoch": 1.2703984819734346, "grad_norm": 1.5724751949310303, "learning_rate": 6.206018932969162e-06, "loss": 0.142, "step": 2678 }, { "epoch": 1.2708728652751424, "grad_norm": 1.4839202165603638, "learning_rate": 6.1989114526938535e-06, "loss": 0.1237, "step": 2679 }, { "epoch": 1.2713472485768502, "grad_norm": 1.7914515733718872, "learning_rate": 6.1918062161635005e-06, "loss": 0.185, "step": 2680 }, { "epoch": 1.271821631878558, "grad_norm": 1.844212532043457, "learning_rate": 6.184703227572257e-06, "loss": 0.1892, "step": 2681 }, { "epoch": 1.2722960151802656, "grad_norm": 1.94633948802948, "learning_rate": 6.1776024911129414e-06, "loss": 0.1789, "step": 2682 }, { "epoch": 1.2727703984819734, "grad_norm": 1.849686622619629, "learning_rate": 6.170504010977053e-06, "loss": 0.1831, "step": 2683 }, { "epoch": 1.2732447817836812, "grad_norm": 1.323569893836975, "learning_rate": 6.163407791354751e-06, "loss": 0.1487, "step": 2684 }, { "epoch": 1.273719165085389, "grad_norm": 2.2079055309295654, "learning_rate": 6.156313836434864e-06, "loss": 0.1744, "step": 2685 }, { "epoch": 1.2741935483870968, "grad_norm": 1.6516516208648682, "learning_rate": 6.149222150404889e-06, "loss": 0.1772, "step": 2686 }, { "epoch": 1.2746679316888045, "grad_norm": 1.9557101726531982, "learning_rate": 6.142132737450971e-06, "loss": 0.2113, "step": 2687 }, { "epoch": 1.2751423149905123, "grad_norm": 1.9967963695526123, "learning_rate": 6.135045601757921e-06, "loss": 0.1954, "step": 2688 }, { "epoch": 1.2756166982922201, "grad_norm": 1.9493542909622192, "learning_rate": 6.127960747509207e-06, "loss": 0.1692, "step": 2689 }, { "epoch": 1.276091081593928, "grad_norm": 2.279658794403076, "learning_rate": 6.120878178886951e-06, "loss": 0.2131, "step": 2690 }, { "epoch": 1.2765654648956357, "grad_norm": 1.827585220336914, "learning_rate": 6.113797900071923e-06, "loss": 0.1943, "step": 2691 }, { "epoch": 1.2770398481973435, "grad_norm": 1.7768232822418213, "learning_rate": 6.106719915243533e-06, "loss": 0.1943, "step": 2692 }, { "epoch": 1.2775142314990513, "grad_norm": 1.574704885482788, "learning_rate": 6.099644228579852e-06, "loss": 0.1617, "step": 2693 }, { "epoch": 1.277988614800759, "grad_norm": 2.0819928646087646, "learning_rate": 6.092570844257589e-06, "loss": 0.2005, "step": 2694 }, { "epoch": 1.2784629981024667, "grad_norm": 2.360511064529419, "learning_rate": 6.08549976645209e-06, "loss": 0.2153, "step": 2695 }, { "epoch": 1.2789373814041745, "grad_norm": 1.7095462083816528, "learning_rate": 6.078430999337346e-06, "loss": 0.1612, "step": 2696 }, { "epoch": 1.2794117647058822, "grad_norm": 2.0695948600769043, "learning_rate": 6.071364547085974e-06, "loss": 0.1911, "step": 2697 }, { "epoch": 1.27988614800759, "grad_norm": 1.8348363637924194, "learning_rate": 6.064300413869237e-06, "loss": 0.1883, "step": 2698 }, { "epoch": 1.2803605313092978, "grad_norm": 1.8135762214660645, "learning_rate": 6.057238603857018e-06, "loss": 0.1442, "step": 2699 }, { "epoch": 1.2808349146110056, "grad_norm": 2.002474546432495, "learning_rate": 6.050179121217839e-06, "loss": 0.2274, "step": 2700 }, { "epoch": 1.2813092979127134, "grad_norm": 2.7125346660614014, "learning_rate": 6.043121970118837e-06, "loss": 0.1427, "step": 2701 }, { "epoch": 1.2817836812144212, "grad_norm": 1.8385379314422607, "learning_rate": 6.0360671547257825e-06, "loss": 0.1609, "step": 2702 }, { "epoch": 1.282258064516129, "grad_norm": 1.4639102220535278, "learning_rate": 6.029014679203059e-06, "loss": 0.1462, "step": 2703 }, { "epoch": 1.2827324478178368, "grad_norm": 1.8368364572525024, "learning_rate": 6.0219645477136764e-06, "loss": 0.1943, "step": 2704 }, { "epoch": 1.2832068311195446, "grad_norm": 1.7069745063781738, "learning_rate": 6.014916764419261e-06, "loss": 0.1726, "step": 2705 }, { "epoch": 1.2836812144212524, "grad_norm": 1.7310154438018799, "learning_rate": 6.007871333480041e-06, "loss": 0.1615, "step": 2706 }, { "epoch": 1.2841555977229602, "grad_norm": 1.5262956619262695, "learning_rate": 6.000828259054872e-06, "loss": 0.1396, "step": 2707 }, { "epoch": 1.284629981024668, "grad_norm": 1.7818677425384521, "learning_rate": 5.993787545301204e-06, "loss": 0.1845, "step": 2708 }, { "epoch": 1.2851043643263758, "grad_norm": 2.1896657943725586, "learning_rate": 5.986749196375108e-06, "loss": 0.2015, "step": 2709 }, { "epoch": 1.2855787476280836, "grad_norm": 2.082186698913574, "learning_rate": 5.97971321643125e-06, "loss": 0.2172, "step": 2710 }, { "epoch": 1.2860531309297913, "grad_norm": 2.232640504837036, "learning_rate": 5.972679609622897e-06, "loss": 0.2158, "step": 2711 }, { "epoch": 1.2865275142314991, "grad_norm": 1.818824291229248, "learning_rate": 5.965648380101916e-06, "loss": 0.1672, "step": 2712 }, { "epoch": 1.287001897533207, "grad_norm": 2.0493948459625244, "learning_rate": 5.958619532018775e-06, "loss": 0.1805, "step": 2713 }, { "epoch": 1.2874762808349147, "grad_norm": 2.046306610107422, "learning_rate": 5.951593069522535e-06, "loss": 0.1763, "step": 2714 }, { "epoch": 1.2879506641366225, "grad_norm": 1.797914743423462, "learning_rate": 5.944568996760847e-06, "loss": 0.2099, "step": 2715 }, { "epoch": 1.2884250474383303, "grad_norm": 1.5372848510742188, "learning_rate": 5.937547317879946e-06, "loss": 0.1515, "step": 2716 }, { "epoch": 1.2888994307400379, "grad_norm": 1.811116337776184, "learning_rate": 5.930528037024664e-06, "loss": 0.151, "step": 2717 }, { "epoch": 1.2893738140417457, "grad_norm": 1.7281832695007324, "learning_rate": 5.923511158338415e-06, "loss": 0.1729, "step": 2718 }, { "epoch": 1.2898481973434535, "grad_norm": 1.5258969068527222, "learning_rate": 5.916496685963191e-06, "loss": 0.1412, "step": 2719 }, { "epoch": 1.2903225806451613, "grad_norm": 1.6176496744155884, "learning_rate": 5.909484624039563e-06, "loss": 0.1712, "step": 2720 }, { "epoch": 1.290796963946869, "grad_norm": 1.8939071893692017, "learning_rate": 5.9024749767066835e-06, "loss": 0.1806, "step": 2721 }, { "epoch": 1.2912713472485768, "grad_norm": 1.8788526058197021, "learning_rate": 5.89546774810228e-06, "loss": 0.17, "step": 2722 }, { "epoch": 1.2917457305502846, "grad_norm": 2.107088327407837, "learning_rate": 5.888462942362647e-06, "loss": 0.18, "step": 2723 }, { "epoch": 1.2922201138519924, "grad_norm": 1.6402045488357544, "learning_rate": 5.881460563622659e-06, "loss": 0.1535, "step": 2724 }, { "epoch": 1.2926944971537002, "grad_norm": 1.9502211809158325, "learning_rate": 5.87446061601574e-06, "loss": 0.1858, "step": 2725 }, { "epoch": 1.293168880455408, "grad_norm": 1.5377591848373413, "learning_rate": 5.867463103673898e-06, "loss": 0.1621, "step": 2726 }, { "epoch": 1.2936432637571158, "grad_norm": 2.0564045906066895, "learning_rate": 5.8604680307276906e-06, "loss": 0.1743, "step": 2727 }, { "epoch": 1.2941176470588236, "grad_norm": 1.4501993656158447, "learning_rate": 5.853475401306241e-06, "loss": 0.1646, "step": 2728 }, { "epoch": 1.2945920303605314, "grad_norm": 2.006211042404175, "learning_rate": 5.846485219537237e-06, "loss": 0.2069, "step": 2729 }, { "epoch": 1.295066413662239, "grad_norm": 2.675589084625244, "learning_rate": 5.8394974895469015e-06, "loss": 0.2098, "step": 2730 }, { "epoch": 1.2955407969639468, "grad_norm": 1.6392362117767334, "learning_rate": 5.83251221546003e-06, "loss": 0.1624, "step": 2731 }, { "epoch": 1.2960151802656545, "grad_norm": 1.847262978553772, "learning_rate": 5.825529401399956e-06, "loss": 0.2169, "step": 2732 }, { "epoch": 1.2964895635673623, "grad_norm": 1.3665305376052856, "learning_rate": 5.818549051488569e-06, "loss": 0.1345, "step": 2733 }, { "epoch": 1.2969639468690701, "grad_norm": 1.4444960355758667, "learning_rate": 5.811571169846304e-06, "loss": 0.1572, "step": 2734 }, { "epoch": 1.297438330170778, "grad_norm": 1.631394624710083, "learning_rate": 5.804595760592127e-06, "loss": 0.1435, "step": 2735 }, { "epoch": 1.2979127134724857, "grad_norm": 2.4109082221984863, "learning_rate": 5.797622827843561e-06, "loss": 0.2491, "step": 2736 }, { "epoch": 1.2983870967741935, "grad_norm": 1.8289345502853394, "learning_rate": 5.790652375716653e-06, "loss": 0.1797, "step": 2737 }, { "epoch": 1.2988614800759013, "grad_norm": 1.665688157081604, "learning_rate": 5.7836844083259954e-06, "loss": 0.1651, "step": 2738 }, { "epoch": 1.299335863377609, "grad_norm": 1.573738932609558, "learning_rate": 5.776718929784707e-06, "loss": 0.156, "step": 2739 }, { "epoch": 1.2998102466793169, "grad_norm": 1.9333720207214355, "learning_rate": 5.769755944204443e-06, "loss": 0.1613, "step": 2740 }, { "epoch": 1.3002846299810247, "grad_norm": 1.6634092330932617, "learning_rate": 5.762795455695385e-06, "loss": 0.198, "step": 2741 }, { "epoch": 1.3007590132827325, "grad_norm": 1.8421812057495117, "learning_rate": 5.755837468366241e-06, "loss": 0.1715, "step": 2742 }, { "epoch": 1.3012333965844403, "grad_norm": 1.5852497816085815, "learning_rate": 5.748881986324245e-06, "loss": 0.1821, "step": 2743 }, { "epoch": 1.301707779886148, "grad_norm": 1.8755565881729126, "learning_rate": 5.741929013675143e-06, "loss": 0.182, "step": 2744 }, { "epoch": 1.3021821631878558, "grad_norm": 1.483273983001709, "learning_rate": 5.7349785545232115e-06, "loss": 0.1509, "step": 2745 }, { "epoch": 1.3026565464895636, "grad_norm": 1.5257030725479126, "learning_rate": 5.728030612971231e-06, "loss": 0.1342, "step": 2746 }, { "epoch": 1.3031309297912714, "grad_norm": 1.8776187896728516, "learning_rate": 5.721085193120507e-06, "loss": 0.1959, "step": 2747 }, { "epoch": 1.3036053130929792, "grad_norm": 1.5106158256530762, "learning_rate": 5.714142299070856e-06, "loss": 0.1563, "step": 2748 }, { "epoch": 1.304079696394687, "grad_norm": 1.8498852252960205, "learning_rate": 5.7072019349205925e-06, "loss": 0.216, "step": 2749 }, { "epoch": 1.3045540796963948, "grad_norm": 1.8534907102584839, "learning_rate": 5.700264104766547e-06, "loss": 0.1699, "step": 2750 }, { "epoch": 1.3050284629981026, "grad_norm": 1.647162675857544, "learning_rate": 5.6933288127040505e-06, "loss": 0.1455, "step": 2751 }, { "epoch": 1.3055028462998102, "grad_norm": 1.767333745956421, "learning_rate": 5.686396062826946e-06, "loss": 0.1483, "step": 2752 }, { "epoch": 1.305977229601518, "grad_norm": 1.979466438293457, "learning_rate": 5.679465859227561e-06, "loss": 0.1747, "step": 2753 }, { "epoch": 1.3064516129032258, "grad_norm": 1.9015872478485107, "learning_rate": 5.6725382059967205e-06, "loss": 0.2133, "step": 2754 }, { "epoch": 1.3069259962049335, "grad_norm": 1.6040949821472168, "learning_rate": 5.665613107223755e-06, "loss": 0.158, "step": 2755 }, { "epoch": 1.3074003795066413, "grad_norm": 1.9205068349838257, "learning_rate": 5.658690566996483e-06, "loss": 0.1565, "step": 2756 }, { "epoch": 1.3078747628083491, "grad_norm": 1.864329218864441, "learning_rate": 5.651770589401209e-06, "loss": 0.1957, "step": 2757 }, { "epoch": 1.308349146110057, "grad_norm": 2.1258440017700195, "learning_rate": 5.644853178522734e-06, "loss": 0.1696, "step": 2758 }, { "epoch": 1.3088235294117647, "grad_norm": 1.630826711654663, "learning_rate": 5.637938338444325e-06, "loss": 0.1278, "step": 2759 }, { "epoch": 1.3092979127134725, "grad_norm": 1.273641586303711, "learning_rate": 5.631026073247752e-06, "loss": 0.1296, "step": 2760 }, { "epoch": 1.3097722960151803, "grad_norm": 1.7420580387115479, "learning_rate": 5.624116387013259e-06, "loss": 0.1666, "step": 2761 }, { "epoch": 1.310246679316888, "grad_norm": 1.7752727270126343, "learning_rate": 5.617209283819562e-06, "loss": 0.1868, "step": 2762 }, { "epoch": 1.310721062618596, "grad_norm": 1.8602519035339355, "learning_rate": 5.61030476774385e-06, "loss": 0.1966, "step": 2763 }, { "epoch": 1.3111954459203037, "grad_norm": 1.8290077447891235, "learning_rate": 5.603402842861797e-06, "loss": 0.1749, "step": 2764 }, { "epoch": 1.3116698292220113, "grad_norm": 1.944825291633606, "learning_rate": 5.5965035132475395e-06, "loss": 0.2062, "step": 2765 }, { "epoch": 1.312144212523719, "grad_norm": 1.7076294422149658, "learning_rate": 5.589606782973683e-06, "loss": 0.1738, "step": 2766 }, { "epoch": 1.3126185958254268, "grad_norm": 1.9629321098327637, "learning_rate": 5.5827126561113045e-06, "loss": 0.1418, "step": 2767 }, { "epoch": 1.3130929791271346, "grad_norm": 1.9150692224502563, "learning_rate": 5.575821136729929e-06, "loss": 0.1978, "step": 2768 }, { "epoch": 1.3135673624288424, "grad_norm": 1.7870982885360718, "learning_rate": 5.568932228897563e-06, "loss": 0.1764, "step": 2769 }, { "epoch": 1.3140417457305502, "grad_norm": 2.10677170753479, "learning_rate": 5.562045936680649e-06, "loss": 0.2014, "step": 2770 }, { "epoch": 1.314516129032258, "grad_norm": 2.482201337814331, "learning_rate": 5.555162264144105e-06, "loss": 0.2658, "step": 2771 }, { "epoch": 1.3149905123339658, "grad_norm": 1.6711809635162354, "learning_rate": 5.548281215351297e-06, "loss": 0.1548, "step": 2772 }, { "epoch": 1.3154648956356736, "grad_norm": 2.137512683868408, "learning_rate": 5.54140279436403e-06, "loss": 0.2021, "step": 2773 }, { "epoch": 1.3159392789373814, "grad_norm": 1.648645281791687, "learning_rate": 5.534527005242575e-06, "loss": 0.1697, "step": 2774 }, { "epoch": 1.3164136622390892, "grad_norm": 1.3213448524475098, "learning_rate": 5.52765385204564e-06, "loss": 0.1197, "step": 2775 }, { "epoch": 1.316888045540797, "grad_norm": 1.4370523691177368, "learning_rate": 5.520783338830386e-06, "loss": 0.1534, "step": 2776 }, { "epoch": 1.3173624288425048, "grad_norm": 1.695040225982666, "learning_rate": 5.5139154696524025e-06, "loss": 0.1957, "step": 2777 }, { "epoch": 1.3178368121442126, "grad_norm": 1.5011173486709595, "learning_rate": 5.5070502485657216e-06, "loss": 0.157, "step": 2778 }, { "epoch": 1.3183111954459203, "grad_norm": 1.3054955005645752, "learning_rate": 5.500187679622819e-06, "loss": 0.1141, "step": 2779 }, { "epoch": 1.3187855787476281, "grad_norm": 1.8683662414550781, "learning_rate": 5.4933277668746036e-06, "loss": 0.1725, "step": 2780 }, { "epoch": 1.319259962049336, "grad_norm": 1.2149678468704224, "learning_rate": 5.486470514370415e-06, "loss": 0.1239, "step": 2781 }, { "epoch": 1.3197343453510437, "grad_norm": 1.8600114583969116, "learning_rate": 5.479615926158013e-06, "loss": 0.1712, "step": 2782 }, { "epoch": 1.3202087286527515, "grad_norm": 1.914152979850769, "learning_rate": 5.4727640062836e-06, "loss": 0.1836, "step": 2783 }, { "epoch": 1.3206831119544593, "grad_norm": 1.7095372676849365, "learning_rate": 5.465914758791794e-06, "loss": 0.17, "step": 2784 }, { "epoch": 1.321157495256167, "grad_norm": 1.9404311180114746, "learning_rate": 5.459068187725644e-06, "loss": 0.1771, "step": 2785 }, { "epoch": 1.321631878557875, "grad_norm": 1.4658271074295044, "learning_rate": 5.452224297126607e-06, "loss": 0.1525, "step": 2786 }, { "epoch": 1.3221062618595825, "grad_norm": 1.9287587404251099, "learning_rate": 5.445383091034564e-06, "loss": 0.2087, "step": 2787 }, { "epoch": 1.3225806451612903, "grad_norm": 1.8362351655960083, "learning_rate": 5.438544573487811e-06, "loss": 0.1848, "step": 2788 }, { "epoch": 1.323055028462998, "grad_norm": 1.9103182554244995, "learning_rate": 5.431708748523058e-06, "loss": 0.1918, "step": 2789 }, { "epoch": 1.3235294117647058, "grad_norm": 1.722016453742981, "learning_rate": 5.424875620175427e-06, "loss": 0.1852, "step": 2790 }, { "epoch": 1.3240037950664136, "grad_norm": 1.559799313545227, "learning_rate": 5.4180451924784475e-06, "loss": 0.1548, "step": 2791 }, { "epoch": 1.3244781783681214, "grad_norm": 2.0171709060668945, "learning_rate": 5.4112174694640475e-06, "loss": 0.2102, "step": 2792 }, { "epoch": 1.3249525616698292, "grad_norm": 1.49323570728302, "learning_rate": 5.404392455162571e-06, "loss": 0.1358, "step": 2793 }, { "epoch": 1.325426944971537, "grad_norm": 1.6861432790756226, "learning_rate": 5.397570153602747e-06, "loss": 0.1531, "step": 2794 }, { "epoch": 1.3259013282732448, "grad_norm": 2.1216015815734863, "learning_rate": 5.39075056881172e-06, "loss": 0.1996, "step": 2795 }, { "epoch": 1.3263757115749526, "grad_norm": 1.6811705827713013, "learning_rate": 5.383933704815025e-06, "loss": 0.1685, "step": 2796 }, { "epoch": 1.3268500948766604, "grad_norm": 1.721708059310913, "learning_rate": 5.377119565636584e-06, "loss": 0.1591, "step": 2797 }, { "epoch": 1.3273244781783682, "grad_norm": 1.8749873638153076, "learning_rate": 5.370308155298716e-06, "loss": 0.177, "step": 2798 }, { "epoch": 1.327798861480076, "grad_norm": 1.5626192092895508, "learning_rate": 5.363499477822132e-06, "loss": 0.1364, "step": 2799 }, { "epoch": 1.3282732447817835, "grad_norm": 1.7851414680480957, "learning_rate": 5.35669353722593e-06, "loss": 0.1724, "step": 2800 }, { "epoch": 1.3287476280834913, "grad_norm": 1.9391238689422607, "learning_rate": 5.3498903375275815e-06, "loss": 0.161, "step": 2801 }, { "epoch": 1.3292220113851991, "grad_norm": 2.0420241355895996, "learning_rate": 5.3430898827429555e-06, "loss": 0.1735, "step": 2802 }, { "epoch": 1.329696394686907, "grad_norm": 1.4055426120758057, "learning_rate": 5.336292176886287e-06, "loss": 0.1211, "step": 2803 }, { "epoch": 1.3301707779886147, "grad_norm": 1.658674716949463, "learning_rate": 5.329497223970195e-06, "loss": 0.1562, "step": 2804 }, { "epoch": 1.3306451612903225, "grad_norm": 2.3525173664093018, "learning_rate": 5.32270502800568e-06, "loss": 0.2096, "step": 2805 }, { "epoch": 1.3311195445920303, "grad_norm": 1.7249343395233154, "learning_rate": 5.3159155930021e-06, "loss": 0.1569, "step": 2806 }, { "epoch": 1.331593927893738, "grad_norm": 2.1588501930236816, "learning_rate": 5.309128922967194e-06, "loss": 0.2027, "step": 2807 }, { "epoch": 1.3320683111954459, "grad_norm": 2.3709404468536377, "learning_rate": 5.302345021907066e-06, "loss": 0.2227, "step": 2808 }, { "epoch": 1.3325426944971537, "grad_norm": 1.7174904346466064, "learning_rate": 5.295563893826191e-06, "loss": 0.1723, "step": 2809 }, { "epoch": 1.3330170777988615, "grad_norm": 1.9293659925460815, "learning_rate": 5.288785542727397e-06, "loss": 0.1562, "step": 2810 }, { "epoch": 1.3334914611005693, "grad_norm": 1.5283536911010742, "learning_rate": 5.282009972611873e-06, "loss": 0.1329, "step": 2811 }, { "epoch": 1.333965844402277, "grad_norm": 1.8217898607254028, "learning_rate": 5.275237187479176e-06, "loss": 0.1772, "step": 2812 }, { "epoch": 1.3344402277039848, "grad_norm": 2.461566686630249, "learning_rate": 5.268467191327214e-06, "loss": 0.2084, "step": 2813 }, { "epoch": 1.3349146110056926, "grad_norm": 1.7789011001586914, "learning_rate": 5.261699988152249e-06, "loss": 0.1837, "step": 2814 }, { "epoch": 1.3353889943074004, "grad_norm": 1.8147854804992676, "learning_rate": 5.254935581948897e-06, "loss": 0.2358, "step": 2815 }, { "epoch": 1.3358633776091082, "grad_norm": 1.6862436532974243, "learning_rate": 5.248173976710111e-06, "loss": 0.1419, "step": 2816 }, { "epoch": 1.336337760910816, "grad_norm": 1.3814226388931274, "learning_rate": 5.2414151764272116e-06, "loss": 0.1232, "step": 2817 }, { "epoch": 1.3368121442125238, "grad_norm": 1.4221547842025757, "learning_rate": 5.23465918508984e-06, "loss": 0.1346, "step": 2818 }, { "epoch": 1.3372865275142316, "grad_norm": 2.043062686920166, "learning_rate": 5.227906006686001e-06, "loss": 0.1743, "step": 2819 }, { "epoch": 1.3377609108159394, "grad_norm": 2.2925965785980225, "learning_rate": 5.22115564520202e-06, "loss": 0.1989, "step": 2820 }, { "epoch": 1.3382352941176472, "grad_norm": 1.5758484601974487, "learning_rate": 5.214408104622573e-06, "loss": 0.154, "step": 2821 }, { "epoch": 1.3387096774193548, "grad_norm": 1.688869833946228, "learning_rate": 5.207663388930666e-06, "loss": 0.158, "step": 2822 }, { "epoch": 1.3391840607210626, "grad_norm": 1.5997257232666016, "learning_rate": 5.200921502107638e-06, "loss": 0.1363, "step": 2823 }, { "epoch": 1.3396584440227703, "grad_norm": 2.4265429973602295, "learning_rate": 5.194182448133163e-06, "loss": 0.1848, "step": 2824 }, { "epoch": 1.3401328273244781, "grad_norm": 1.8538546562194824, "learning_rate": 5.187446230985229e-06, "loss": 0.1652, "step": 2825 }, { "epoch": 1.340607210626186, "grad_norm": 1.5969922542572021, "learning_rate": 5.180712854640168e-06, "loss": 0.1649, "step": 2826 }, { "epoch": 1.3410815939278937, "grad_norm": 1.623705267906189, "learning_rate": 5.173982323072615e-06, "loss": 0.1676, "step": 2827 }, { "epoch": 1.3415559772296015, "grad_norm": 1.4985768795013428, "learning_rate": 5.167254640255542e-06, "loss": 0.1731, "step": 2828 }, { "epoch": 1.3420303605313093, "grad_norm": 1.6000432968139648, "learning_rate": 5.160529810160235e-06, "loss": 0.15, "step": 2829 }, { "epoch": 1.342504743833017, "grad_norm": 1.7188128232955933, "learning_rate": 5.153807836756288e-06, "loss": 0.151, "step": 2830 }, { "epoch": 1.342979127134725, "grad_norm": 2.0611462593078613, "learning_rate": 5.147088724011622e-06, "loss": 0.2002, "step": 2831 }, { "epoch": 1.3434535104364327, "grad_norm": 1.8624550104141235, "learning_rate": 5.14037247589246e-06, "loss": 0.1607, "step": 2832 }, { "epoch": 1.3439278937381405, "grad_norm": 1.335152506828308, "learning_rate": 5.133659096363341e-06, "loss": 0.118, "step": 2833 }, { "epoch": 1.3444022770398483, "grad_norm": 1.6288925409317017, "learning_rate": 5.126948589387104e-06, "loss": 0.184, "step": 2834 }, { "epoch": 1.3448766603415558, "grad_norm": 2.3706490993499756, "learning_rate": 5.120240958924888e-06, "loss": 0.2334, "step": 2835 }, { "epoch": 1.3453510436432636, "grad_norm": 1.5236918926239014, "learning_rate": 5.113536208936147e-06, "loss": 0.1429, "step": 2836 }, { "epoch": 1.3458254269449714, "grad_norm": 1.7775214910507202, "learning_rate": 5.106834343378629e-06, "loss": 0.1757, "step": 2837 }, { "epoch": 1.3462998102466792, "grad_norm": 1.7919654846191406, "learning_rate": 5.100135366208383e-06, "loss": 0.1604, "step": 2838 }, { "epoch": 1.346774193548387, "grad_norm": 2.141319990158081, "learning_rate": 5.093439281379738e-06, "loss": 0.2079, "step": 2839 }, { "epoch": 1.3472485768500948, "grad_norm": 1.5796895027160645, "learning_rate": 5.086746092845334e-06, "loss": 0.1522, "step": 2840 }, { "epoch": 1.3477229601518026, "grad_norm": 1.6436305046081543, "learning_rate": 5.080055804556097e-06, "loss": 0.15, "step": 2841 }, { "epoch": 1.3481973434535104, "grad_norm": 2.3595945835113525, "learning_rate": 5.073368420461229e-06, "loss": 0.2145, "step": 2842 }, { "epoch": 1.3486717267552182, "grad_norm": 2.0658891201019287, "learning_rate": 5.066683944508235e-06, "loss": 0.1851, "step": 2843 }, { "epoch": 1.349146110056926, "grad_norm": 1.6966054439544678, "learning_rate": 5.060002380642887e-06, "loss": 0.1432, "step": 2844 }, { "epoch": 1.3496204933586338, "grad_norm": 1.6572761535644531, "learning_rate": 5.053323732809252e-06, "loss": 0.1847, "step": 2845 }, { "epoch": 1.3500948766603416, "grad_norm": 1.654388427734375, "learning_rate": 5.046648004949667e-06, "loss": 0.1683, "step": 2846 }, { "epoch": 1.3505692599620494, "grad_norm": 1.6657936573028564, "learning_rate": 5.0399752010047495e-06, "loss": 0.2028, "step": 2847 }, { "epoch": 1.3510436432637571, "grad_norm": 1.6080611944198608, "learning_rate": 5.033305324913392e-06, "loss": 0.1479, "step": 2848 }, { "epoch": 1.351518026565465, "grad_norm": 1.4048763513565063, "learning_rate": 5.0266383806127514e-06, "loss": 0.1322, "step": 2849 }, { "epoch": 1.3519924098671727, "grad_norm": 1.806875467300415, "learning_rate": 5.019974372038265e-06, "loss": 0.2062, "step": 2850 }, { "epoch": 1.3524667931688805, "grad_norm": 1.8327056169509888, "learning_rate": 5.0133133031236215e-06, "loss": 0.1663, "step": 2851 }, { "epoch": 1.3529411764705883, "grad_norm": 1.5724437236785889, "learning_rate": 5.006655177800792e-06, "loss": 0.1578, "step": 2852 }, { "epoch": 1.353415559772296, "grad_norm": 1.6653856039047241, "learning_rate": 5.000000000000003e-06, "loss": 0.1481, "step": 2853 }, { "epoch": 1.353889943074004, "grad_norm": 2.099308729171753, "learning_rate": 4.993347773649732e-06, "loss": 0.2375, "step": 2854 }, { "epoch": 1.3543643263757117, "grad_norm": 1.8074994087219238, "learning_rate": 4.9866985026767276e-06, "loss": 0.1593, "step": 2855 }, { "epoch": 1.3548387096774195, "grad_norm": 1.7318613529205322, "learning_rate": 4.980052191005989e-06, "loss": 0.1456, "step": 2856 }, { "epoch": 1.355313092979127, "grad_norm": 1.5684703588485718, "learning_rate": 4.973408842560772e-06, "loss": 0.1847, "step": 2857 }, { "epoch": 1.3557874762808348, "grad_norm": 2.352754831314087, "learning_rate": 4.966768461262573e-06, "loss": 0.1362, "step": 2858 }, { "epoch": 1.3562618595825426, "grad_norm": 1.6529217958450317, "learning_rate": 4.960131051031143e-06, "loss": 0.1486, "step": 2859 }, { "epoch": 1.3567362428842504, "grad_norm": 1.7900093793869019, "learning_rate": 4.953496615784482e-06, "loss": 0.1411, "step": 2860 }, { "epoch": 1.3572106261859582, "grad_norm": 1.4834645986557007, "learning_rate": 4.94686515943883e-06, "loss": 0.143, "step": 2861 }, { "epoch": 1.357685009487666, "grad_norm": 1.7863253355026245, "learning_rate": 4.940236685908677e-06, "loss": 0.1657, "step": 2862 }, { "epoch": 1.3581593927893738, "grad_norm": 1.8910566568374634, "learning_rate": 4.933611199106736e-06, "loss": 0.1838, "step": 2863 }, { "epoch": 1.3586337760910816, "grad_norm": 1.7281635999679565, "learning_rate": 4.9269887029439686e-06, "loss": 0.1884, "step": 2864 }, { "epoch": 1.3591081593927894, "grad_norm": 1.703221321105957, "learning_rate": 4.920369201329575e-06, "loss": 0.1579, "step": 2865 }, { "epoch": 1.3595825426944972, "grad_norm": 1.3635776042938232, "learning_rate": 4.913752698170972e-06, "loss": 0.137, "step": 2866 }, { "epoch": 1.360056925996205, "grad_norm": 1.9177613258361816, "learning_rate": 4.907139197373827e-06, "loss": 0.1918, "step": 2867 }, { "epoch": 1.3605313092979128, "grad_norm": 1.5520260334014893, "learning_rate": 4.900528702842011e-06, "loss": 0.1742, "step": 2868 }, { "epoch": 1.3610056925996206, "grad_norm": 1.8986750841140747, "learning_rate": 4.893921218477642e-06, "loss": 0.1707, "step": 2869 }, { "epoch": 1.3614800759013281, "grad_norm": 1.954786777496338, "learning_rate": 4.8873167481810516e-06, "loss": 0.1707, "step": 2870 }, { "epoch": 1.361954459203036, "grad_norm": 1.7505497932434082, "learning_rate": 4.880715295850791e-06, "loss": 0.1829, "step": 2871 }, { "epoch": 1.3624288425047437, "grad_norm": 1.406683087348938, "learning_rate": 4.874116865383638e-06, "loss": 0.1356, "step": 2872 }, { "epoch": 1.3629032258064515, "grad_norm": 1.3671265840530396, "learning_rate": 4.867521460674573e-06, "loss": 0.1402, "step": 2873 }, { "epoch": 1.3633776091081593, "grad_norm": 1.9116820096969604, "learning_rate": 4.860929085616804e-06, "loss": 0.1588, "step": 2874 }, { "epoch": 1.363851992409867, "grad_norm": 1.5119657516479492, "learning_rate": 4.85433974410174e-06, "loss": 0.1318, "step": 2875 }, { "epoch": 1.364326375711575, "grad_norm": 1.674631953239441, "learning_rate": 4.8477534400190075e-06, "loss": 0.1778, "step": 2876 }, { "epoch": 1.3648007590132827, "grad_norm": 1.8323768377304077, "learning_rate": 4.841170177256439e-06, "loss": 0.2063, "step": 2877 }, { "epoch": 1.3652751423149905, "grad_norm": 2.152963638305664, "learning_rate": 4.834589959700061e-06, "loss": 0.2062, "step": 2878 }, { "epoch": 1.3657495256166983, "grad_norm": 1.483198881149292, "learning_rate": 4.828012791234117e-06, "loss": 0.1412, "step": 2879 }, { "epoch": 1.366223908918406, "grad_norm": 1.3623414039611816, "learning_rate": 4.821438675741044e-06, "loss": 0.1237, "step": 2880 }, { "epoch": 1.3666982922201139, "grad_norm": 1.7617155313491821, "learning_rate": 4.814867617101479e-06, "loss": 0.1496, "step": 2881 }, { "epoch": 1.3671726755218216, "grad_norm": 1.808595061302185, "learning_rate": 4.808299619194251e-06, "loss": 0.1625, "step": 2882 }, { "epoch": 1.3676470588235294, "grad_norm": 1.750730276107788, "learning_rate": 4.80173468589638e-06, "loss": 0.1531, "step": 2883 }, { "epoch": 1.3681214421252372, "grad_norm": 1.9921399354934692, "learning_rate": 4.795172821083084e-06, "loss": 0.2081, "step": 2884 }, { "epoch": 1.368595825426945, "grad_norm": 1.9898700714111328, "learning_rate": 4.788614028627769e-06, "loss": 0.1826, "step": 2885 }, { "epoch": 1.3690702087286528, "grad_norm": 1.715350866317749, "learning_rate": 4.782058312402027e-06, "loss": 0.1736, "step": 2886 }, { "epoch": 1.3695445920303606, "grad_norm": 1.990676760673523, "learning_rate": 4.7755056762756255e-06, "loss": 0.1491, "step": 2887 }, { "epoch": 1.3700189753320684, "grad_norm": 1.4953407049179077, "learning_rate": 4.768956124116526e-06, "loss": 0.1616, "step": 2888 }, { "epoch": 1.3704933586337762, "grad_norm": 1.76144278049469, "learning_rate": 4.762409659790866e-06, "loss": 0.1715, "step": 2889 }, { "epoch": 1.370967741935484, "grad_norm": 1.379292368888855, "learning_rate": 4.755866287162952e-06, "loss": 0.1385, "step": 2890 }, { "epoch": 1.3714421252371918, "grad_norm": 2.018620014190674, "learning_rate": 4.74932601009528e-06, "loss": 0.1926, "step": 2891 }, { "epoch": 1.3719165085388993, "grad_norm": 2.3270931243896484, "learning_rate": 4.742788832448501e-06, "loss": 0.2674, "step": 2892 }, { "epoch": 1.3723908918406071, "grad_norm": 1.6892091035842896, "learning_rate": 4.736254758081454e-06, "loss": 0.191, "step": 2893 }, { "epoch": 1.372865275142315, "grad_norm": 1.5555603504180908, "learning_rate": 4.729723790851135e-06, "loss": 0.1301, "step": 2894 }, { "epoch": 1.3733396584440227, "grad_norm": 1.7797441482543945, "learning_rate": 4.723195934612711e-06, "loss": 0.1627, "step": 2895 }, { "epoch": 1.3738140417457305, "grad_norm": 2.268540859222412, "learning_rate": 4.7166711932195155e-06, "loss": 0.1985, "step": 2896 }, { "epoch": 1.3742884250474383, "grad_norm": 1.6203911304473877, "learning_rate": 4.7101495705230285e-06, "loss": 0.1565, "step": 2897 }, { "epoch": 1.374762808349146, "grad_norm": 1.598213791847229, "learning_rate": 4.703631070372909e-06, "loss": 0.179, "step": 2898 }, { "epoch": 1.375237191650854, "grad_norm": 1.5291484594345093, "learning_rate": 4.697115696616955e-06, "loss": 0.1816, "step": 2899 }, { "epoch": 1.3757115749525617, "grad_norm": 1.719058871269226, "learning_rate": 4.690603453101134e-06, "loss": 0.1626, "step": 2900 }, { "epoch": 1.3761859582542695, "grad_norm": 1.810861587524414, "learning_rate": 4.684094343669554e-06, "loss": 0.1542, "step": 2901 }, { "epoch": 1.3766603415559773, "grad_norm": 1.7553555965423584, "learning_rate": 4.677588372164479e-06, "loss": 0.1787, "step": 2902 }, { "epoch": 1.377134724857685, "grad_norm": 1.9552857875823975, "learning_rate": 4.6710855424263205e-06, "loss": 0.1996, "step": 2903 }, { "epoch": 1.3776091081593929, "grad_norm": 1.9977998733520508, "learning_rate": 4.6645858582936345e-06, "loss": 0.1963, "step": 2904 }, { "epoch": 1.3780834914611007, "grad_norm": 1.5562477111816406, "learning_rate": 4.658089323603123e-06, "loss": 0.1483, "step": 2905 }, { "epoch": 1.3785578747628082, "grad_norm": 1.7609916925430298, "learning_rate": 4.651595942189624e-06, "loss": 0.2095, "step": 2906 }, { "epoch": 1.379032258064516, "grad_norm": 1.431341528892517, "learning_rate": 4.645105717886112e-06, "loss": 0.1309, "step": 2907 }, { "epoch": 1.3795066413662238, "grad_norm": 1.580350399017334, "learning_rate": 4.638618654523705e-06, "loss": 0.1537, "step": 2908 }, { "epoch": 1.3799810246679316, "grad_norm": 1.886465072631836, "learning_rate": 4.632134755931653e-06, "loss": 0.1535, "step": 2909 }, { "epoch": 1.3804554079696394, "grad_norm": 1.5633918046951294, "learning_rate": 4.625654025937342e-06, "loss": 0.1684, "step": 2910 }, { "epoch": 1.3809297912713472, "grad_norm": 1.7328919172286987, "learning_rate": 4.619176468366274e-06, "loss": 0.1585, "step": 2911 }, { "epoch": 1.381404174573055, "grad_norm": 1.3697620630264282, "learning_rate": 4.612702087042091e-06, "loss": 0.1405, "step": 2912 }, { "epoch": 1.3818785578747628, "grad_norm": 2.1742119789123535, "learning_rate": 4.606230885786557e-06, "loss": 0.1519, "step": 2913 }, { "epoch": 1.3823529411764706, "grad_norm": 1.6288695335388184, "learning_rate": 4.599762868419561e-06, "loss": 0.1662, "step": 2914 }, { "epoch": 1.3828273244781784, "grad_norm": 1.7532272338867188, "learning_rate": 4.5932980387591054e-06, "loss": 0.1819, "step": 2915 }, { "epoch": 1.3833017077798861, "grad_norm": 1.4690742492675781, "learning_rate": 4.586836400621313e-06, "loss": 0.1624, "step": 2916 }, { "epoch": 1.383776091081594, "grad_norm": 1.6084843873977661, "learning_rate": 4.580377957820427e-06, "loss": 0.1653, "step": 2917 }, { "epoch": 1.3842504743833017, "grad_norm": 1.5763942003250122, "learning_rate": 4.573922714168804e-06, "loss": 0.1519, "step": 2918 }, { "epoch": 1.3847248576850095, "grad_norm": 2.008824586868286, "learning_rate": 4.567470673476912e-06, "loss": 0.1915, "step": 2919 }, { "epoch": 1.3851992409867173, "grad_norm": 1.5423541069030762, "learning_rate": 4.561021839553323e-06, "loss": 0.1571, "step": 2920 }, { "epoch": 1.385673624288425, "grad_norm": 1.8469271659851074, "learning_rate": 4.554576216204718e-06, "loss": 0.1971, "step": 2921 }, { "epoch": 1.386148007590133, "grad_norm": 2.0073492527008057, "learning_rate": 4.548133807235893e-06, "loss": 0.1826, "step": 2922 }, { "epoch": 1.3866223908918407, "grad_norm": 1.9065463542938232, "learning_rate": 4.541694616449729e-06, "loss": 0.1774, "step": 2923 }, { "epoch": 1.3870967741935485, "grad_norm": 1.4257770776748657, "learning_rate": 4.535258647647225e-06, "loss": 0.1407, "step": 2924 }, { "epoch": 1.3875711574952563, "grad_norm": 1.6374725103378296, "learning_rate": 4.5288259046274605e-06, "loss": 0.1694, "step": 2925 }, { "epoch": 1.388045540796964, "grad_norm": 1.9593760967254639, "learning_rate": 4.5223963911876265e-06, "loss": 0.1788, "step": 2926 }, { "epoch": 1.3885199240986719, "grad_norm": 1.881184697151184, "learning_rate": 4.5159701111229995e-06, "loss": 0.1875, "step": 2927 }, { "epoch": 1.3889943074003794, "grad_norm": 1.786327838897705, "learning_rate": 4.509547068226947e-06, "loss": 0.171, "step": 2928 }, { "epoch": 1.3894686907020872, "grad_norm": 1.864014983177185, "learning_rate": 4.503127266290935e-06, "loss": 0.1636, "step": 2929 }, { "epoch": 1.389943074003795, "grad_norm": 1.5407509803771973, "learning_rate": 4.496710709104504e-06, "loss": 0.1757, "step": 2930 }, { "epoch": 1.3904174573055028, "grad_norm": 1.7357945442199707, "learning_rate": 4.49029740045528e-06, "loss": 0.1297, "step": 2931 }, { "epoch": 1.3908918406072106, "grad_norm": 1.6357675790786743, "learning_rate": 4.48388734412898e-06, "loss": 0.1654, "step": 2932 }, { "epoch": 1.3913662239089184, "grad_norm": 1.856079339981079, "learning_rate": 4.477480543909396e-06, "loss": 0.1842, "step": 2933 }, { "epoch": 1.3918406072106262, "grad_norm": 2.02909779548645, "learning_rate": 4.471077003578403e-06, "loss": 0.1907, "step": 2934 }, { "epoch": 1.392314990512334, "grad_norm": 1.560585379600525, "learning_rate": 4.464676726915939e-06, "loss": 0.1355, "step": 2935 }, { "epoch": 1.3927893738140418, "grad_norm": 1.7451188564300537, "learning_rate": 4.458279717700031e-06, "loss": 0.172, "step": 2936 }, { "epoch": 1.3932637571157496, "grad_norm": 1.3806904554367065, "learning_rate": 4.451885979706767e-06, "loss": 0.1345, "step": 2937 }, { "epoch": 1.3937381404174574, "grad_norm": 2.2464559078216553, "learning_rate": 4.445495516710312e-06, "loss": 0.1972, "step": 2938 }, { "epoch": 1.3942125237191652, "grad_norm": 1.8251832723617554, "learning_rate": 4.439108332482889e-06, "loss": 0.1987, "step": 2939 }, { "epoch": 1.394686907020873, "grad_norm": 1.6302183866500854, "learning_rate": 4.432724430794786e-06, "loss": 0.1673, "step": 2940 }, { "epoch": 1.3951612903225805, "grad_norm": 1.733634114265442, "learning_rate": 4.426343815414361e-06, "loss": 0.1736, "step": 2941 }, { "epoch": 1.3956356736242883, "grad_norm": 1.841150164604187, "learning_rate": 4.419966490108028e-06, "loss": 0.1898, "step": 2942 }, { "epoch": 1.396110056925996, "grad_norm": 1.559160828590393, "learning_rate": 4.413592458640264e-06, "loss": 0.1301, "step": 2943 }, { "epoch": 1.396584440227704, "grad_norm": 1.6111403703689575, "learning_rate": 4.407221724773587e-06, "loss": 0.1824, "step": 2944 }, { "epoch": 1.3970588235294117, "grad_norm": 1.6043332815170288, "learning_rate": 4.4008542922685834e-06, "loss": 0.1555, "step": 2945 }, { "epoch": 1.3975332068311195, "grad_norm": 1.6422481536865234, "learning_rate": 4.3944901648838885e-06, "loss": 0.155, "step": 2946 }, { "epoch": 1.3980075901328273, "grad_norm": 1.5227971076965332, "learning_rate": 4.388129346376177e-06, "loss": 0.1495, "step": 2947 }, { "epoch": 1.398481973434535, "grad_norm": 1.7671695947647095, "learning_rate": 4.3817718405001844e-06, "loss": 0.1753, "step": 2948 }, { "epoch": 1.3989563567362429, "grad_norm": 1.6482768058776855, "learning_rate": 4.3754176510086756e-06, "loss": 0.156, "step": 2949 }, { "epoch": 1.3994307400379506, "grad_norm": 1.7106208801269531, "learning_rate": 4.369066781652469e-06, "loss": 0.1681, "step": 2950 }, { "epoch": 1.3999051233396584, "grad_norm": 1.4155083894729614, "learning_rate": 4.362719236180422e-06, "loss": 0.1351, "step": 2951 }, { "epoch": 1.4003795066413662, "grad_norm": 1.725624918937683, "learning_rate": 4.356375018339426e-06, "loss": 0.1601, "step": 2952 }, { "epoch": 1.400853889943074, "grad_norm": 1.8480092287063599, "learning_rate": 4.350034131874414e-06, "loss": 0.1942, "step": 2953 }, { "epoch": 1.4013282732447818, "grad_norm": 1.3656977415084839, "learning_rate": 4.343696580528343e-06, "loss": 0.1348, "step": 2954 }, { "epoch": 1.4018026565464896, "grad_norm": 1.9449461698532104, "learning_rate": 4.33736236804221e-06, "loss": 0.1573, "step": 2955 }, { "epoch": 1.4022770398481974, "grad_norm": 1.8900002241134644, "learning_rate": 4.331031498155035e-06, "loss": 0.1748, "step": 2956 }, { "epoch": 1.4027514231499052, "grad_norm": 1.8122249841690063, "learning_rate": 4.324703974603873e-06, "loss": 0.1863, "step": 2957 }, { "epoch": 1.403225806451613, "grad_norm": 1.5694611072540283, "learning_rate": 4.318379801123792e-06, "loss": 0.1249, "step": 2958 }, { "epoch": 1.4037001897533208, "grad_norm": 1.4751542806625366, "learning_rate": 4.312058981447893e-06, "loss": 0.1364, "step": 2959 }, { "epoch": 1.4041745730550286, "grad_norm": 2.7020621299743652, "learning_rate": 4.305741519307291e-06, "loss": 0.1851, "step": 2960 }, { "epoch": 1.4046489563567364, "grad_norm": 1.3586440086364746, "learning_rate": 4.2994274184311245e-06, "loss": 0.1246, "step": 2961 }, { "epoch": 1.4051233396584442, "grad_norm": 1.8559433221817017, "learning_rate": 4.293116682546546e-06, "loss": 0.154, "step": 2962 }, { "epoch": 1.4055977229601517, "grad_norm": 1.997769832611084, "learning_rate": 4.28680931537872e-06, "loss": 0.1688, "step": 2963 }, { "epoch": 1.4060721062618595, "grad_norm": 1.8411399126052856, "learning_rate": 4.280505320650814e-06, "loss": 0.1979, "step": 2964 }, { "epoch": 1.4065464895635673, "grad_norm": 1.8652249574661255, "learning_rate": 4.2742047020840214e-06, "loss": 0.1975, "step": 2965 }, { "epoch": 1.407020872865275, "grad_norm": 1.6203731298446655, "learning_rate": 4.2679074633975345e-06, "loss": 0.149, "step": 2966 }, { "epoch": 1.407495256166983, "grad_norm": 1.7112479209899902, "learning_rate": 4.261613608308553e-06, "loss": 0.1555, "step": 2967 }, { "epoch": 1.4079696394686907, "grad_norm": 1.9960196018218994, "learning_rate": 4.255323140532272e-06, "loss": 0.1732, "step": 2968 }, { "epoch": 1.4084440227703985, "grad_norm": 1.8311996459960938, "learning_rate": 4.2490360637818965e-06, "loss": 0.158, "step": 2969 }, { "epoch": 1.4089184060721063, "grad_norm": 1.7324796915054321, "learning_rate": 4.242752381768626e-06, "loss": 0.1525, "step": 2970 }, { "epoch": 1.409392789373814, "grad_norm": 1.6953574419021606, "learning_rate": 4.236472098201651e-06, "loss": 0.1494, "step": 2971 }, { "epoch": 1.4098671726755219, "grad_norm": 1.5616384744644165, "learning_rate": 4.230195216788168e-06, "loss": 0.1523, "step": 2972 }, { "epoch": 1.4103415559772297, "grad_norm": 1.4512946605682373, "learning_rate": 4.223921741233349e-06, "loss": 0.1483, "step": 2973 }, { "epoch": 1.4108159392789374, "grad_norm": 1.525667667388916, "learning_rate": 4.217651675240371e-06, "loss": 0.1236, "step": 2974 }, { "epoch": 1.4112903225806452, "grad_norm": 1.9730256795883179, "learning_rate": 4.211385022510389e-06, "loss": 0.175, "step": 2975 }, { "epoch": 1.4117647058823528, "grad_norm": 1.4136416912078857, "learning_rate": 4.205121786742552e-06, "loss": 0.1287, "step": 2976 }, { "epoch": 1.4122390891840606, "grad_norm": 1.5880212783813477, "learning_rate": 4.198861971633977e-06, "loss": 0.1426, "step": 2977 }, { "epoch": 1.4127134724857684, "grad_norm": 1.5319970846176147, "learning_rate": 4.1926055808797765e-06, "loss": 0.1749, "step": 2978 }, { "epoch": 1.4131878557874762, "grad_norm": 1.677323579788208, "learning_rate": 4.186352618173037e-06, "loss": 0.1692, "step": 2979 }, { "epoch": 1.413662239089184, "grad_norm": 1.9051989316940308, "learning_rate": 4.180103087204817e-06, "loss": 0.1202, "step": 2980 }, { "epoch": 1.4141366223908918, "grad_norm": 1.7331345081329346, "learning_rate": 4.1738569916641555e-06, "loss": 0.1551, "step": 2981 }, { "epoch": 1.4146110056925996, "grad_norm": 1.4977072477340698, "learning_rate": 4.167614335238058e-06, "loss": 0.1707, "step": 2982 }, { "epoch": 1.4150853889943074, "grad_norm": 2.1823627948760986, "learning_rate": 4.161375121611504e-06, "loss": 0.1984, "step": 2983 }, { "epoch": 1.4155597722960152, "grad_norm": 1.9357411861419678, "learning_rate": 4.155139354467439e-06, "loss": 0.1967, "step": 2984 }, { "epoch": 1.416034155597723, "grad_norm": 1.4335790872573853, "learning_rate": 4.1489070374867765e-06, "loss": 0.1393, "step": 2985 }, { "epoch": 1.4165085388994307, "grad_norm": 1.459723949432373, "learning_rate": 4.142678174348395e-06, "loss": 0.1365, "step": 2986 }, { "epoch": 1.4169829222011385, "grad_norm": 1.581613302230835, "learning_rate": 4.136452768729126e-06, "loss": 0.1507, "step": 2987 }, { "epoch": 1.4174573055028463, "grad_norm": 1.6380817890167236, "learning_rate": 4.130230824303761e-06, "loss": 0.1611, "step": 2988 }, { "epoch": 1.4179316888045541, "grad_norm": 1.8408558368682861, "learning_rate": 4.1240123447450575e-06, "loss": 0.1885, "step": 2989 }, { "epoch": 1.418406072106262, "grad_norm": 1.6198720932006836, "learning_rate": 4.117797333723721e-06, "loss": 0.1538, "step": 2990 }, { "epoch": 1.4188804554079697, "grad_norm": 1.727500319480896, "learning_rate": 4.1115857949084145e-06, "loss": 0.1834, "step": 2991 }, { "epoch": 1.4193548387096775, "grad_norm": 2.777367353439331, "learning_rate": 4.105377731965743e-06, "loss": 0.1844, "step": 2992 }, { "epoch": 1.4198292220113853, "grad_norm": 1.618923306465149, "learning_rate": 4.099173148560265e-06, "loss": 0.1926, "step": 2993 }, { "epoch": 1.420303605313093, "grad_norm": 1.8688693046569824, "learning_rate": 4.092972048354491e-06, "loss": 0.1688, "step": 2994 }, { "epoch": 1.4207779886148009, "grad_norm": 1.7127189636230469, "learning_rate": 4.08677443500886e-06, "loss": 0.1505, "step": 2995 }, { "epoch": 1.4212523719165087, "grad_norm": 1.893202543258667, "learning_rate": 4.08058031218177e-06, "loss": 0.1643, "step": 2996 }, { "epoch": 1.4217267552182165, "grad_norm": 1.7235002517700195, "learning_rate": 4.074389683529542e-06, "loss": 0.1644, "step": 2997 }, { "epoch": 1.422201138519924, "grad_norm": 1.580626368522644, "learning_rate": 4.0682025527064486e-06, "loss": 0.1579, "step": 2998 }, { "epoch": 1.4226755218216318, "grad_norm": 1.6993354558944702, "learning_rate": 4.06201892336469e-06, "loss": 0.1617, "step": 2999 }, { "epoch": 1.4231499051233396, "grad_norm": 1.7193957567214966, "learning_rate": 4.055838799154406e-06, "loss": 0.1584, "step": 3000 }, { "epoch": 1.4236242884250474, "grad_norm": 1.6037389039993286, "learning_rate": 4.049662183723655e-06, "loss": 0.1507, "step": 3001 }, { "epoch": 1.4240986717267552, "grad_norm": 1.4931392669677734, "learning_rate": 4.043489080718437e-06, "loss": 0.1296, "step": 3002 }, { "epoch": 1.424573055028463, "grad_norm": 1.6914727687835693, "learning_rate": 4.037319493782674e-06, "loss": 0.1486, "step": 3003 }, { "epoch": 1.4250474383301708, "grad_norm": 1.8355352878570557, "learning_rate": 4.031153426558209e-06, "loss": 0.1778, "step": 3004 }, { "epoch": 1.4255218216318786, "grad_norm": 1.8768749237060547, "learning_rate": 4.024990882684815e-06, "loss": 0.1356, "step": 3005 }, { "epoch": 1.4259962049335864, "grad_norm": 1.700304388999939, "learning_rate": 4.018831865800174e-06, "loss": 0.157, "step": 3006 }, { "epoch": 1.4264705882352942, "grad_norm": 2.2423648834228516, "learning_rate": 4.012676379539896e-06, "loss": 0.1578, "step": 3007 }, { "epoch": 1.426944971537002, "grad_norm": 1.7538981437683105, "learning_rate": 4.006524427537504e-06, "loss": 0.1863, "step": 3008 }, { "epoch": 1.4274193548387097, "grad_norm": 2.0132992267608643, "learning_rate": 4.0003760134244355e-06, "loss": 0.1697, "step": 3009 }, { "epoch": 1.4278937381404175, "grad_norm": 1.686045527458191, "learning_rate": 3.9942311408300395e-06, "loss": 0.1412, "step": 3010 }, { "epoch": 1.428368121442125, "grad_norm": 1.5621978044509888, "learning_rate": 3.9880898133815724e-06, "loss": 0.1428, "step": 3011 }, { "epoch": 1.428842504743833, "grad_norm": 1.520124912261963, "learning_rate": 3.981952034704194e-06, "loss": 0.1541, "step": 3012 }, { "epoch": 1.4293168880455407, "grad_norm": 1.930535912513733, "learning_rate": 3.975817808420978e-06, "loss": 0.1873, "step": 3013 }, { "epoch": 1.4297912713472485, "grad_norm": 1.5178654193878174, "learning_rate": 3.969687138152899e-06, "loss": 0.1246, "step": 3014 }, { "epoch": 1.4302656546489563, "grad_norm": 1.252669095993042, "learning_rate": 3.9635600275188335e-06, "loss": 0.1106, "step": 3015 }, { "epoch": 1.430740037950664, "grad_norm": 1.629313349723816, "learning_rate": 3.957436480135547e-06, "loss": 0.1768, "step": 3016 }, { "epoch": 1.4312144212523719, "grad_norm": 1.6184428930282593, "learning_rate": 3.951316499617711e-06, "loss": 0.1696, "step": 3017 }, { "epoch": 1.4316888045540797, "grad_norm": 1.6965394020080566, "learning_rate": 3.9452000895778964e-06, "loss": 0.1764, "step": 3018 }, { "epoch": 1.4321631878557874, "grad_norm": 1.5129637718200684, "learning_rate": 3.93908725362655e-06, "loss": 0.1325, "step": 3019 }, { "epoch": 1.4326375711574952, "grad_norm": 1.4176459312438965, "learning_rate": 3.932977995372025e-06, "loss": 0.1385, "step": 3020 }, { "epoch": 1.433111954459203, "grad_norm": 1.6068745851516724, "learning_rate": 3.926872318420551e-06, "loss": 0.1298, "step": 3021 }, { "epoch": 1.4335863377609108, "grad_norm": 1.753598928451538, "learning_rate": 3.920770226376251e-06, "loss": 0.1772, "step": 3022 }, { "epoch": 1.4340607210626186, "grad_norm": 1.7061421871185303, "learning_rate": 3.9146717228411305e-06, "loss": 0.1549, "step": 3023 }, { "epoch": 1.4345351043643264, "grad_norm": 1.6681634187698364, "learning_rate": 3.908576811415078e-06, "loss": 0.1601, "step": 3024 }, { "epoch": 1.4350094876660342, "grad_norm": 1.7460962533950806, "learning_rate": 3.902485495695853e-06, "loss": 0.1574, "step": 3025 }, { "epoch": 1.435483870967742, "grad_norm": 1.7344130277633667, "learning_rate": 3.896397779279102e-06, "loss": 0.1805, "step": 3026 }, { "epoch": 1.4359582542694498, "grad_norm": 1.7097632884979248, "learning_rate": 3.890313665758348e-06, "loss": 0.1652, "step": 3027 }, { "epoch": 1.4364326375711576, "grad_norm": 1.7743875980377197, "learning_rate": 3.884233158724976e-06, "loss": 0.1655, "step": 3028 }, { "epoch": 1.4369070208728654, "grad_norm": 1.5984655618667603, "learning_rate": 3.8781562617682555e-06, "loss": 0.1579, "step": 3029 }, { "epoch": 1.4373814041745732, "grad_norm": 1.6356760263442993, "learning_rate": 3.872082978475312e-06, "loss": 0.1618, "step": 3030 }, { "epoch": 1.437855787476281, "grad_norm": 1.7311320304870605, "learning_rate": 3.866013312431148e-06, "loss": 0.1827, "step": 3031 }, { "epoch": 1.4383301707779887, "grad_norm": 1.62131929397583, "learning_rate": 3.859947267218627e-06, "loss": 0.1412, "step": 3032 }, { "epoch": 1.4388045540796963, "grad_norm": 1.5495834350585938, "learning_rate": 3.8538848464184766e-06, "loss": 0.1318, "step": 3033 }, { "epoch": 1.439278937381404, "grad_norm": 2.08760404586792, "learning_rate": 3.847826053609286e-06, "loss": 0.1718, "step": 3034 }, { "epoch": 1.439753320683112, "grad_norm": 1.9067436456680298, "learning_rate": 3.841770892367497e-06, "loss": 0.1776, "step": 3035 }, { "epoch": 1.4402277039848197, "grad_norm": 1.3851908445358276, "learning_rate": 3.83571936626741e-06, "loss": 0.1305, "step": 3036 }, { "epoch": 1.4407020872865275, "grad_norm": 1.6880590915679932, "learning_rate": 3.82967147888118e-06, "loss": 0.157, "step": 3037 }, { "epoch": 1.4411764705882353, "grad_norm": 1.8479284048080444, "learning_rate": 3.823627233778824e-06, "loss": 0.1465, "step": 3038 }, { "epoch": 1.441650853889943, "grad_norm": 1.678410291671753, "learning_rate": 3.8175866345281895e-06, "loss": 0.1558, "step": 3039 }, { "epoch": 1.4421252371916509, "grad_norm": 1.8292676210403442, "learning_rate": 3.8115496846949885e-06, "loss": 0.1587, "step": 3040 }, { "epoch": 1.4425996204933587, "grad_norm": 1.7389070987701416, "learning_rate": 3.8055163878427703e-06, "loss": 0.1605, "step": 3041 }, { "epoch": 1.4430740037950665, "grad_norm": 1.9918736219406128, "learning_rate": 3.7994867475329346e-06, "loss": 0.1665, "step": 3042 }, { "epoch": 1.4435483870967742, "grad_norm": 1.6637475490570068, "learning_rate": 3.7934607673247116e-06, "loss": 0.1636, "step": 3043 }, { "epoch": 1.444022770398482, "grad_norm": 1.8318939208984375, "learning_rate": 3.787438450775185e-06, "loss": 0.1563, "step": 3044 }, { "epoch": 1.4444971537001898, "grad_norm": 1.7426759004592896, "learning_rate": 3.781419801439261e-06, "loss": 0.1709, "step": 3045 }, { "epoch": 1.4449715370018974, "grad_norm": 1.7280033826828003, "learning_rate": 3.775404822869694e-06, "loss": 0.1878, "step": 3046 }, { "epoch": 1.4454459203036052, "grad_norm": 1.8315585851669312, "learning_rate": 3.7693935186170638e-06, "loss": 0.1904, "step": 3047 }, { "epoch": 1.445920303605313, "grad_norm": 1.3828792572021484, "learning_rate": 3.7633858922297885e-06, "loss": 0.1232, "step": 3048 }, { "epoch": 1.4463946869070208, "grad_norm": 1.6559396982192993, "learning_rate": 3.757381947254104e-06, "loss": 0.1733, "step": 3049 }, { "epoch": 1.4468690702087286, "grad_norm": 1.8938626050949097, "learning_rate": 3.7513816872340826e-06, "loss": 0.1362, "step": 3050 }, { "epoch": 1.4473434535104364, "grad_norm": 2.071521282196045, "learning_rate": 3.745385115711623e-06, "loss": 0.1934, "step": 3051 }, { "epoch": 1.4478178368121442, "grad_norm": 1.8912732601165771, "learning_rate": 3.739392236226432e-06, "loss": 0.1588, "step": 3052 }, { "epoch": 1.448292220113852, "grad_norm": 1.723662257194519, "learning_rate": 3.7334030523160582e-06, "loss": 0.1656, "step": 3053 }, { "epoch": 1.4487666034155597, "grad_norm": 1.6426823139190674, "learning_rate": 3.7274175675158477e-06, "loss": 0.1391, "step": 3054 }, { "epoch": 1.4492409867172675, "grad_norm": 1.779732346534729, "learning_rate": 3.7214357853589765e-06, "loss": 0.1712, "step": 3055 }, { "epoch": 1.4497153700189753, "grad_norm": 1.4211159944534302, "learning_rate": 3.7154577093764334e-06, "loss": 0.1523, "step": 3056 }, { "epoch": 1.4501897533206831, "grad_norm": 2.087869167327881, "learning_rate": 3.7094833430970188e-06, "loss": 0.2148, "step": 3057 }, { "epoch": 1.450664136622391, "grad_norm": 1.6574203968048096, "learning_rate": 3.7035126900473363e-06, "loss": 0.1594, "step": 3058 }, { "epoch": 1.4511385199240987, "grad_norm": 1.7676061391830444, "learning_rate": 3.69754575375181e-06, "loss": 0.161, "step": 3059 }, { "epoch": 1.4516129032258065, "grad_norm": 1.7256070375442505, "learning_rate": 3.691582537732655e-06, "loss": 0.1791, "step": 3060 }, { "epoch": 1.4520872865275143, "grad_norm": 1.9738215208053589, "learning_rate": 3.6856230455099053e-06, "loss": 0.1804, "step": 3061 }, { "epoch": 1.452561669829222, "grad_norm": 1.850088357925415, "learning_rate": 3.6796672806013913e-06, "loss": 0.173, "step": 3062 }, { "epoch": 1.4530360531309299, "grad_norm": 1.7364604473114014, "learning_rate": 3.6737152465227355e-06, "loss": 0.1691, "step": 3063 }, { "epoch": 1.4535104364326377, "grad_norm": 1.4408142566680908, "learning_rate": 3.667766946787369e-06, "loss": 0.1409, "step": 3064 }, { "epoch": 1.4539848197343455, "grad_norm": 1.6814855337142944, "learning_rate": 3.6618223849065126e-06, "loss": 0.1652, "step": 3065 }, { "epoch": 1.4544592030360532, "grad_norm": 1.748642921447754, "learning_rate": 3.655881564389184e-06, "loss": 0.1565, "step": 3066 }, { "epoch": 1.454933586337761, "grad_norm": 1.5492746829986572, "learning_rate": 3.649944488742194e-06, "loss": 0.1557, "step": 3067 }, { "epoch": 1.4554079696394686, "grad_norm": 1.3638722896575928, "learning_rate": 3.644011161470136e-06, "loss": 0.1359, "step": 3068 }, { "epoch": 1.4558823529411764, "grad_norm": 1.5125813484191895, "learning_rate": 3.6380815860753904e-06, "loss": 0.1492, "step": 3069 }, { "epoch": 1.4563567362428842, "grad_norm": 1.7100872993469238, "learning_rate": 3.632155766058131e-06, "loss": 0.1598, "step": 3070 }, { "epoch": 1.456831119544592, "grad_norm": 1.9752591848373413, "learning_rate": 3.6262337049163088e-06, "loss": 0.1696, "step": 3071 }, { "epoch": 1.4573055028462998, "grad_norm": 1.8669570684432983, "learning_rate": 3.6203154061456648e-06, "loss": 0.1718, "step": 3072 }, { "epoch": 1.4577798861480076, "grad_norm": 1.8326796293258667, "learning_rate": 3.614400873239703e-06, "loss": 0.203, "step": 3073 }, { "epoch": 1.4582542694497154, "grad_norm": 1.6635417938232422, "learning_rate": 3.6084901096897163e-06, "loss": 0.1458, "step": 3074 }, { "epoch": 1.4587286527514232, "grad_norm": 1.6087132692337036, "learning_rate": 3.602583118984776e-06, "loss": 0.1431, "step": 3075 }, { "epoch": 1.459203036053131, "grad_norm": 1.7828035354614258, "learning_rate": 3.596679904611715e-06, "loss": 0.1735, "step": 3076 }, { "epoch": 1.4596774193548387, "grad_norm": 1.9378790855407715, "learning_rate": 3.5907804700551385e-06, "loss": 0.1872, "step": 3077 }, { "epoch": 1.4601518026565465, "grad_norm": 1.23013174533844, "learning_rate": 3.5848848187974294e-06, "loss": 0.114, "step": 3078 }, { "epoch": 1.4606261859582543, "grad_norm": 1.618354320526123, "learning_rate": 3.5789929543187317e-06, "loss": 0.147, "step": 3079 }, { "epoch": 1.4611005692599621, "grad_norm": 2.332402229309082, "learning_rate": 3.5731048800969536e-06, "loss": 0.1879, "step": 3080 }, { "epoch": 1.4615749525616697, "grad_norm": 1.8381061553955078, "learning_rate": 3.5672205996077726e-06, "loss": 0.1654, "step": 3081 }, { "epoch": 1.4620493358633775, "grad_norm": 1.7428001165390015, "learning_rate": 3.5613401163246118e-06, "loss": 0.1625, "step": 3082 }, { "epoch": 1.4625237191650853, "grad_norm": 1.8108954429626465, "learning_rate": 3.555463433718671e-06, "loss": 0.1954, "step": 3083 }, { "epoch": 1.462998102466793, "grad_norm": 1.7431238889694214, "learning_rate": 3.549590555258892e-06, "loss": 0.1849, "step": 3084 }, { "epoch": 1.4634724857685009, "grad_norm": 1.3837710618972778, "learning_rate": 3.543721484411976e-06, "loss": 0.1412, "step": 3085 }, { "epoch": 1.4639468690702087, "grad_norm": 1.585083246231079, "learning_rate": 3.537856224642385e-06, "loss": 0.1452, "step": 3086 }, { "epoch": 1.4644212523719164, "grad_norm": 1.6242449283599854, "learning_rate": 3.5319947794123153e-06, "loss": 0.157, "step": 3087 }, { "epoch": 1.4648956356736242, "grad_norm": 1.61245596408844, "learning_rate": 3.5261371521817247e-06, "loss": 0.1665, "step": 3088 }, { "epoch": 1.465370018975332, "grad_norm": 1.7214654684066772, "learning_rate": 3.5202833464083096e-06, "loss": 0.1626, "step": 3089 }, { "epoch": 1.4658444022770398, "grad_norm": 1.7213759422302246, "learning_rate": 3.514433365547517e-06, "loss": 0.1907, "step": 3090 }, { "epoch": 1.4663187855787476, "grad_norm": 1.6483070850372314, "learning_rate": 3.5085872130525345e-06, "loss": 0.1634, "step": 3091 }, { "epoch": 1.4667931688804554, "grad_norm": 1.7148456573486328, "learning_rate": 3.5027448923742845e-06, "loss": 0.1782, "step": 3092 }, { "epoch": 1.4672675521821632, "grad_norm": 1.4626739025115967, "learning_rate": 3.496906406961428e-06, "loss": 0.1386, "step": 3093 }, { "epoch": 1.467741935483871, "grad_norm": 1.7289766073226929, "learning_rate": 3.491071760260368e-06, "loss": 0.1474, "step": 3094 }, { "epoch": 1.4682163187855788, "grad_norm": 1.4813730716705322, "learning_rate": 3.4852409557152432e-06, "loss": 0.1572, "step": 3095 }, { "epoch": 1.4686907020872866, "grad_norm": 1.7407090663909912, "learning_rate": 3.479413996767913e-06, "loss": 0.1711, "step": 3096 }, { "epoch": 1.4691650853889944, "grad_norm": 1.5457485914230347, "learning_rate": 3.473590886857977e-06, "loss": 0.1327, "step": 3097 }, { "epoch": 1.4696394686907022, "grad_norm": 1.932664394378662, "learning_rate": 3.4677716294227583e-06, "loss": 0.1631, "step": 3098 }, { "epoch": 1.47011385199241, "grad_norm": 1.4631233215332031, "learning_rate": 3.4619562278973105e-06, "loss": 0.1312, "step": 3099 }, { "epoch": 1.4705882352941178, "grad_norm": 1.3594050407409668, "learning_rate": 3.4561446857144054e-06, "loss": 0.1175, "step": 3100 }, { "epoch": 1.4710626185958255, "grad_norm": 1.4718384742736816, "learning_rate": 3.4503370063045338e-06, "loss": 0.1438, "step": 3101 }, { "epoch": 1.4715370018975333, "grad_norm": 1.6673498153686523, "learning_rate": 3.444533193095917e-06, "loss": 0.1628, "step": 3102 }, { "epoch": 1.472011385199241, "grad_norm": 1.9447062015533447, "learning_rate": 3.4387332495144866e-06, "loss": 0.2165, "step": 3103 }, { "epoch": 1.4724857685009487, "grad_norm": 1.9136441946029663, "learning_rate": 3.4329371789838916e-06, "loss": 0.1972, "step": 3104 }, { "epoch": 1.4729601518026565, "grad_norm": 2.037196636199951, "learning_rate": 3.4271449849255003e-06, "loss": 0.1881, "step": 3105 }, { "epoch": 1.4734345351043643, "grad_norm": 1.9500868320465088, "learning_rate": 3.42135667075838e-06, "loss": 0.1469, "step": 3106 }, { "epoch": 1.473908918406072, "grad_norm": 1.8555630445480347, "learning_rate": 3.4155722398993175e-06, "loss": 0.1753, "step": 3107 }, { "epoch": 1.4743833017077799, "grad_norm": 1.6183701753616333, "learning_rate": 3.4097916957628108e-06, "loss": 0.1391, "step": 3108 }, { "epoch": 1.4748576850094877, "grad_norm": 1.8170623779296875, "learning_rate": 3.4040150417610483e-06, "loss": 0.1789, "step": 3109 }, { "epoch": 1.4753320683111955, "grad_norm": 1.6441960334777832, "learning_rate": 3.3982422813039407e-06, "loss": 0.1682, "step": 3110 }, { "epoch": 1.4758064516129032, "grad_norm": 1.511757254600525, "learning_rate": 3.3924734177990847e-06, "loss": 0.1337, "step": 3111 }, { "epoch": 1.476280834914611, "grad_norm": 1.9087467193603516, "learning_rate": 3.3867084546517847e-06, "loss": 0.1792, "step": 3112 }, { "epoch": 1.4767552182163188, "grad_norm": 1.632246732711792, "learning_rate": 3.3809473952650427e-06, "loss": 0.1387, "step": 3113 }, { "epoch": 1.4772296015180266, "grad_norm": 1.7560805082321167, "learning_rate": 3.3751902430395558e-06, "loss": 0.1753, "step": 3114 }, { "epoch": 1.4777039848197344, "grad_norm": 1.700164556503296, "learning_rate": 3.3694370013737153e-06, "loss": 0.1838, "step": 3115 }, { "epoch": 1.478178368121442, "grad_norm": 1.6236701011657715, "learning_rate": 3.3636876736636013e-06, "loss": 0.1711, "step": 3116 }, { "epoch": 1.4786527514231498, "grad_norm": 1.9994237422943115, "learning_rate": 3.3579422633029813e-06, "loss": 0.1642, "step": 3117 }, { "epoch": 1.4791271347248576, "grad_norm": 1.9831045866012573, "learning_rate": 3.352200773683317e-06, "loss": 0.1701, "step": 3118 }, { "epoch": 1.4796015180265654, "grad_norm": 1.4581242799758911, "learning_rate": 3.3464632081937567e-06, "loss": 0.1479, "step": 3119 }, { "epoch": 1.4800759013282732, "grad_norm": 1.944639801979065, "learning_rate": 3.3407295702211217e-06, "loss": 0.1544, "step": 3120 }, { "epoch": 1.480550284629981, "grad_norm": 1.6618354320526123, "learning_rate": 3.3349998631499247e-06, "loss": 0.1325, "step": 3121 }, { "epoch": 1.4810246679316887, "grad_norm": 2.091897964477539, "learning_rate": 3.3292740903623567e-06, "loss": 0.1895, "step": 3122 }, { "epoch": 1.4814990512333965, "grad_norm": 1.445960283279419, "learning_rate": 3.323552255238286e-06, "loss": 0.1448, "step": 3123 }, { "epoch": 1.4819734345351043, "grad_norm": 1.5881474018096924, "learning_rate": 3.317834361155252e-06, "loss": 0.136, "step": 3124 }, { "epoch": 1.4824478178368121, "grad_norm": 1.9209330081939697, "learning_rate": 3.3121204114884696e-06, "loss": 0.1705, "step": 3125 }, { "epoch": 1.48292220113852, "grad_norm": 1.59219491481781, "learning_rate": 3.3064104096108287e-06, "loss": 0.1403, "step": 3126 }, { "epoch": 1.4833965844402277, "grad_norm": 1.7049628496170044, "learning_rate": 3.3007043588928866e-06, "loss": 0.1602, "step": 3127 }, { "epoch": 1.4838709677419355, "grad_norm": 1.7211378812789917, "learning_rate": 3.295002262702869e-06, "loss": 0.1711, "step": 3128 }, { "epoch": 1.4843453510436433, "grad_norm": 1.8670153617858887, "learning_rate": 3.2893041244066704e-06, "loss": 0.1732, "step": 3129 }, { "epoch": 1.484819734345351, "grad_norm": 1.462051510810852, "learning_rate": 3.2836099473678384e-06, "loss": 0.1409, "step": 3130 }, { "epoch": 1.4852941176470589, "grad_norm": 1.5482720136642456, "learning_rate": 3.2779197349475933e-06, "loss": 0.1419, "step": 3131 }, { "epoch": 1.4857685009487667, "grad_norm": 2.3937394618988037, "learning_rate": 3.2722334905048146e-06, "loss": 0.1878, "step": 3132 }, { "epoch": 1.4862428842504745, "grad_norm": 1.6191223859786987, "learning_rate": 3.266551217396029e-06, "loss": 0.1706, "step": 3133 }, { "epoch": 1.4867172675521823, "grad_norm": 1.8945509195327759, "learning_rate": 3.260872918975432e-06, "loss": 0.1589, "step": 3134 }, { "epoch": 1.48719165085389, "grad_norm": 1.668412685394287, "learning_rate": 3.255198598594862e-06, "loss": 0.1757, "step": 3135 }, { "epoch": 1.4876660341555978, "grad_norm": 1.9364664554595947, "learning_rate": 3.2495282596038156e-06, "loss": 0.1189, "step": 3136 }, { "epoch": 1.4881404174573056, "grad_norm": 1.442252278327942, "learning_rate": 3.243861905349439e-06, "loss": 0.1294, "step": 3137 }, { "epoch": 1.4886148007590132, "grad_norm": 1.5097479820251465, "learning_rate": 3.2381995391765288e-06, "loss": 0.1502, "step": 3138 }, { "epoch": 1.489089184060721, "grad_norm": 1.8824775218963623, "learning_rate": 3.2325411644275164e-06, "loss": 0.1816, "step": 3139 }, { "epoch": 1.4895635673624288, "grad_norm": 1.5267640352249146, "learning_rate": 3.22688678444249e-06, "loss": 0.1391, "step": 3140 }, { "epoch": 1.4900379506641366, "grad_norm": 1.582728385925293, "learning_rate": 3.221236402559169e-06, "loss": 0.168, "step": 3141 }, { "epoch": 1.4905123339658444, "grad_norm": 1.3848261833190918, "learning_rate": 3.215590022112921e-06, "loss": 0.1533, "step": 3142 }, { "epoch": 1.4909867172675522, "grad_norm": 1.7733631134033203, "learning_rate": 3.209947646436752e-06, "loss": 0.1689, "step": 3143 }, { "epoch": 1.49146110056926, "grad_norm": 1.8000030517578125, "learning_rate": 3.204309278861294e-06, "loss": 0.1392, "step": 3144 }, { "epoch": 1.4919354838709677, "grad_norm": 1.5423595905303955, "learning_rate": 3.1986749227148215e-06, "loss": 0.1553, "step": 3145 }, { "epoch": 1.4924098671726755, "grad_norm": 1.7993470430374146, "learning_rate": 3.19304458132324e-06, "loss": 0.1598, "step": 3146 }, { "epoch": 1.4928842504743833, "grad_norm": 1.7650487422943115, "learning_rate": 3.1874182580100874e-06, "loss": 0.1995, "step": 3147 }, { "epoch": 1.4933586337760911, "grad_norm": 2.178863525390625, "learning_rate": 3.181795956096522e-06, "loss": 0.1757, "step": 3148 }, { "epoch": 1.493833017077799, "grad_norm": 1.823729395866394, "learning_rate": 3.1761776789013365e-06, "loss": 0.1441, "step": 3149 }, { "epoch": 1.4943074003795067, "grad_norm": 1.6294647455215454, "learning_rate": 3.1705634297409404e-06, "loss": 0.1846, "step": 3150 }, { "epoch": 1.4947817836812145, "grad_norm": 1.7616844177246094, "learning_rate": 3.1649532119293713e-06, "loss": 0.169, "step": 3151 }, { "epoch": 1.495256166982922, "grad_norm": 1.9334604740142822, "learning_rate": 3.1593470287782847e-06, "loss": 0.1989, "step": 3152 }, { "epoch": 1.4957305502846299, "grad_norm": 1.5212819576263428, "learning_rate": 3.15374488359696e-06, "loss": 0.1652, "step": 3153 }, { "epoch": 1.4962049335863377, "grad_norm": 1.8870230913162231, "learning_rate": 3.1481467796922804e-06, "loss": 0.1775, "step": 3154 }, { "epoch": 1.4966793168880455, "grad_norm": 1.5474931001663208, "learning_rate": 3.1425527203687543e-06, "loss": 0.152, "step": 3155 }, { "epoch": 1.4971537001897532, "grad_norm": 1.711269497871399, "learning_rate": 3.1369627089285036e-06, "loss": 0.1527, "step": 3156 }, { "epoch": 1.497628083491461, "grad_norm": 1.6961112022399902, "learning_rate": 3.131376748671253e-06, "loss": 0.1761, "step": 3157 }, { "epoch": 1.4981024667931688, "grad_norm": 1.4259470701217651, "learning_rate": 3.1257948428943375e-06, "loss": 0.1497, "step": 3158 }, { "epoch": 1.4985768500948766, "grad_norm": 1.8194340467453003, "learning_rate": 3.120216994892702e-06, "loss": 0.1785, "step": 3159 }, { "epoch": 1.4990512333965844, "grad_norm": 1.514940619468689, "learning_rate": 3.1146432079588963e-06, "loss": 0.1585, "step": 3160 }, { "epoch": 1.4995256166982922, "grad_norm": 1.442490577697754, "learning_rate": 3.1090734853830718e-06, "loss": 0.1337, "step": 3161 }, { "epoch": 1.5, "grad_norm": 1.7581517696380615, "learning_rate": 3.103507830452982e-06, "loss": 0.1771, "step": 3162 }, { "epoch": 1.5004743833017078, "grad_norm": 2.3333330154418945, "learning_rate": 3.0979462464539744e-06, "loss": 0.1982, "step": 3163 }, { "epoch": 1.5009487666034156, "grad_norm": 1.7055302858352661, "learning_rate": 3.092388736669002e-06, "loss": 0.1643, "step": 3164 }, { "epoch": 1.5014231499051234, "grad_norm": 1.49795401096344, "learning_rate": 3.0868353043786004e-06, "loss": 0.1416, "step": 3165 }, { "epoch": 1.5018975332068312, "grad_norm": 1.4336013793945312, "learning_rate": 3.0812859528609106e-06, "loss": 0.1412, "step": 3166 }, { "epoch": 1.502371916508539, "grad_norm": 1.6154780387878418, "learning_rate": 3.0757406853916627e-06, "loss": 0.1667, "step": 3167 }, { "epoch": 1.5028462998102468, "grad_norm": 1.670455813407898, "learning_rate": 3.0701995052441658e-06, "loss": 0.1467, "step": 3168 }, { "epoch": 1.5033206831119545, "grad_norm": 1.86698317527771, "learning_rate": 3.064662415689328e-06, "loss": 0.2076, "step": 3169 }, { "epoch": 1.5037950664136623, "grad_norm": 1.5714526176452637, "learning_rate": 3.059129419995638e-06, "loss": 0.155, "step": 3170 }, { "epoch": 1.5042694497153701, "grad_norm": 1.567219853401184, "learning_rate": 3.053600521429172e-06, "loss": 0.1538, "step": 3171 }, { "epoch": 1.504743833017078, "grad_norm": 1.410211205482483, "learning_rate": 3.0480757232535773e-06, "loss": 0.1173, "step": 3172 }, { "epoch": 1.5052182163187857, "grad_norm": 1.7881250381469727, "learning_rate": 3.0425550287300943e-06, "loss": 0.1827, "step": 3173 }, { "epoch": 1.5056925996204935, "grad_norm": 1.595481514930725, "learning_rate": 3.037038441117528e-06, "loss": 0.1373, "step": 3174 }, { "epoch": 1.5061669829222013, "grad_norm": 1.6253283023834229, "learning_rate": 3.031525963672267e-06, "loss": 0.1438, "step": 3175 }, { "epoch": 1.5066413662239089, "grad_norm": 1.9808628559112549, "learning_rate": 3.0260175996482787e-06, "loss": 0.1479, "step": 3176 }, { "epoch": 1.5071157495256167, "grad_norm": 2.177093744277954, "learning_rate": 3.0205133522970865e-06, "loss": 0.2158, "step": 3177 }, { "epoch": 1.5075901328273245, "grad_norm": 1.7438995838165283, "learning_rate": 3.0150132248677976e-06, "loss": 0.1905, "step": 3178 }, { "epoch": 1.5080645161290323, "grad_norm": 1.5714632272720337, "learning_rate": 3.0095172206070833e-06, "loss": 0.1403, "step": 3179 }, { "epoch": 1.50853889943074, "grad_norm": 1.7036011219024658, "learning_rate": 3.0040253427591827e-06, "loss": 0.139, "step": 3180 }, { "epoch": 1.5090132827324478, "grad_norm": 1.967517375946045, "learning_rate": 2.9985375945658934e-06, "loss": 0.1712, "step": 3181 }, { "epoch": 1.5094876660341556, "grad_norm": 1.4856235980987549, "learning_rate": 2.9930539792665767e-06, "loss": 0.1472, "step": 3182 }, { "epoch": 1.5099620493358634, "grad_norm": 1.9669393301010132, "learning_rate": 2.9875745000981603e-06, "loss": 0.1836, "step": 3183 }, { "epoch": 1.510436432637571, "grad_norm": 1.9737735986709595, "learning_rate": 2.9820991602951255e-06, "loss": 0.1955, "step": 3184 }, { "epoch": 1.5109108159392788, "grad_norm": 1.8585445880889893, "learning_rate": 2.97662796308951e-06, "loss": 0.1881, "step": 3185 }, { "epoch": 1.5113851992409866, "grad_norm": 1.8154902458190918, "learning_rate": 2.971160911710913e-06, "loss": 0.1274, "step": 3186 }, { "epoch": 1.5118595825426944, "grad_norm": 1.5582470893859863, "learning_rate": 2.965698009386473e-06, "loss": 0.1438, "step": 3187 }, { "epoch": 1.5123339658444022, "grad_norm": 2.245789051055908, "learning_rate": 2.9602392593408933e-06, "loss": 0.2352, "step": 3188 }, { "epoch": 1.51280834914611, "grad_norm": 1.8019410371780396, "learning_rate": 2.954784664796414e-06, "loss": 0.181, "step": 3189 }, { "epoch": 1.5132827324478177, "grad_norm": 1.7983249425888062, "learning_rate": 2.9493342289728334e-06, "loss": 0.1187, "step": 3190 }, { "epoch": 1.5137571157495255, "grad_norm": 1.7953065633773804, "learning_rate": 2.94388795508749e-06, "loss": 0.1749, "step": 3191 }, { "epoch": 1.5142314990512333, "grad_norm": 1.9157280921936035, "learning_rate": 2.93844584635526e-06, "loss": 0.2026, "step": 3192 }, { "epoch": 1.5147058823529411, "grad_norm": 1.7939811944961548, "learning_rate": 2.9330079059885708e-06, "loss": 0.1734, "step": 3193 }, { "epoch": 1.515180265654649, "grad_norm": 1.3667829036712646, "learning_rate": 2.927574137197383e-06, "loss": 0.1348, "step": 3194 }, { "epoch": 1.5156546489563567, "grad_norm": 1.706744909286499, "learning_rate": 2.9221445431892003e-06, "loss": 0.1574, "step": 3195 }, { "epoch": 1.5161290322580645, "grad_norm": 1.9317013025283813, "learning_rate": 2.916719127169053e-06, "loss": 0.1782, "step": 3196 }, { "epoch": 1.5166034155597723, "grad_norm": 1.5074424743652344, "learning_rate": 2.911297892339516e-06, "loss": 0.1355, "step": 3197 }, { "epoch": 1.51707779886148, "grad_norm": 1.852922797203064, "learning_rate": 2.9058808419006834e-06, "loss": 0.1614, "step": 3198 }, { "epoch": 1.5175521821631879, "grad_norm": 1.6857560873031616, "learning_rate": 2.9004679790501922e-06, "loss": 0.1859, "step": 3199 }, { "epoch": 1.5180265654648957, "grad_norm": 1.6432700157165527, "learning_rate": 2.895059306983201e-06, "loss": 0.1501, "step": 3200 }, { "epoch": 1.5185009487666035, "grad_norm": 2.039566993713379, "learning_rate": 2.889654828892393e-06, "loss": 0.1988, "step": 3201 }, { "epoch": 1.5189753320683113, "grad_norm": 1.716520071029663, "learning_rate": 2.8842545479679796e-06, "loss": 0.1636, "step": 3202 }, { "epoch": 1.519449715370019, "grad_norm": 1.249931812286377, "learning_rate": 2.878858467397693e-06, "loss": 0.1183, "step": 3203 }, { "epoch": 1.5199240986717268, "grad_norm": 1.6915541887283325, "learning_rate": 2.8734665903667892e-06, "loss": 0.155, "step": 3204 }, { "epoch": 1.5203984819734346, "grad_norm": 1.4063469171524048, "learning_rate": 2.8680789200580373e-06, "loss": 0.1344, "step": 3205 }, { "epoch": 1.5208728652751424, "grad_norm": 1.345933198928833, "learning_rate": 2.862695459651722e-06, "loss": 0.1286, "step": 3206 }, { "epoch": 1.5213472485768502, "grad_norm": 1.5287388563156128, "learning_rate": 2.8573162123256504e-06, "loss": 0.1617, "step": 3207 }, { "epoch": 1.521821631878558, "grad_norm": 1.7381535768508911, "learning_rate": 2.851941181255139e-06, "loss": 0.1772, "step": 3208 }, { "epoch": 1.5222960151802658, "grad_norm": 1.8976391553878784, "learning_rate": 2.8465703696130142e-06, "loss": 0.1806, "step": 3209 }, { "epoch": 1.5227703984819736, "grad_norm": 1.5034171342849731, "learning_rate": 2.841203780569618e-06, "loss": 0.1625, "step": 3210 }, { "epoch": 1.5232447817836812, "grad_norm": 1.4791839122772217, "learning_rate": 2.835841417292788e-06, "loss": 0.1411, "step": 3211 }, { "epoch": 1.523719165085389, "grad_norm": 1.7350883483886719, "learning_rate": 2.8304832829478802e-06, "loss": 0.1606, "step": 3212 }, { "epoch": 1.5241935483870968, "grad_norm": 1.4422385692596436, "learning_rate": 2.825129380697741e-06, "loss": 0.1368, "step": 3213 }, { "epoch": 1.5246679316888045, "grad_norm": 1.8611582517623901, "learning_rate": 2.8197797137027338e-06, "loss": 0.1495, "step": 3214 }, { "epoch": 1.5251423149905123, "grad_norm": 2.3095667362213135, "learning_rate": 2.8144342851207076e-06, "loss": 0.2357, "step": 3215 }, { "epoch": 1.5256166982922201, "grad_norm": 1.809204339981079, "learning_rate": 2.8090930981070176e-06, "loss": 0.1656, "step": 3216 }, { "epoch": 1.526091081593928, "grad_norm": 1.5061122179031372, "learning_rate": 2.8037561558145154e-06, "loss": 0.1525, "step": 3217 }, { "epoch": 1.5265654648956357, "grad_norm": 1.6025121212005615, "learning_rate": 2.7984234613935434e-06, "loss": 0.137, "step": 3218 }, { "epoch": 1.5270398481973435, "grad_norm": 1.6906027793884277, "learning_rate": 2.7930950179919438e-06, "loss": 0.1781, "step": 3219 }, { "epoch": 1.527514231499051, "grad_norm": 1.538794994354248, "learning_rate": 2.7877708287550366e-06, "loss": 0.145, "step": 3220 }, { "epoch": 1.5279886148007589, "grad_norm": 1.517501711845398, "learning_rate": 2.7824508968256435e-06, "loss": 0.1444, "step": 3221 }, { "epoch": 1.5284629981024667, "grad_norm": 1.737318515777588, "learning_rate": 2.777135225344063e-06, "loss": 0.1274, "step": 3222 }, { "epoch": 1.5289373814041745, "grad_norm": 1.742997407913208, "learning_rate": 2.771823817448085e-06, "loss": 0.1682, "step": 3223 }, { "epoch": 1.5294117647058822, "grad_norm": 1.806666612625122, "learning_rate": 2.7665166762729856e-06, "loss": 0.1467, "step": 3224 }, { "epoch": 1.52988614800759, "grad_norm": 2.0361621379852295, "learning_rate": 2.7612138049515102e-06, "loss": 0.1591, "step": 3225 }, { "epoch": 1.5303605313092978, "grad_norm": 1.6584888696670532, "learning_rate": 2.755915206613895e-06, "loss": 0.1443, "step": 3226 }, { "epoch": 1.5308349146110056, "grad_norm": 1.9782226085662842, "learning_rate": 2.75062088438785e-06, "loss": 0.1732, "step": 3227 }, { "epoch": 1.5313092979127134, "grad_norm": 1.7193796634674072, "learning_rate": 2.7453308413985635e-06, "loss": 0.1448, "step": 3228 }, { "epoch": 1.5317836812144212, "grad_norm": 1.6462289094924927, "learning_rate": 2.740045080768694e-06, "loss": 0.1444, "step": 3229 }, { "epoch": 1.532258064516129, "grad_norm": 1.397916316986084, "learning_rate": 2.73476360561837e-06, "loss": 0.1503, "step": 3230 }, { "epoch": 1.5327324478178368, "grad_norm": 2.2624073028564453, "learning_rate": 2.7294864190651972e-06, "loss": 0.2172, "step": 3231 }, { "epoch": 1.5332068311195446, "grad_norm": 1.890386939048767, "learning_rate": 2.724213524224246e-06, "loss": 0.1538, "step": 3232 }, { "epoch": 1.5336812144212524, "grad_norm": 2.1407437324523926, "learning_rate": 2.7189449242080557e-06, "loss": 0.1784, "step": 3233 }, { "epoch": 1.5341555977229602, "grad_norm": 1.7359429597854614, "learning_rate": 2.7136806221266286e-06, "loss": 0.1851, "step": 3234 }, { "epoch": 1.534629981024668, "grad_norm": 1.662856101989746, "learning_rate": 2.7084206210874277e-06, "loss": 0.1352, "step": 3235 }, { "epoch": 1.5351043643263758, "grad_norm": 1.425294280052185, "learning_rate": 2.7031649241953826e-06, "loss": 0.134, "step": 3236 }, { "epoch": 1.5355787476280836, "grad_norm": 1.6383399963378906, "learning_rate": 2.697913534552875e-06, "loss": 0.1595, "step": 3237 }, { "epoch": 1.5360531309297913, "grad_norm": 1.8296263217926025, "learning_rate": 2.6926664552597537e-06, "loss": 0.1928, "step": 3238 }, { "epoch": 1.5365275142314991, "grad_norm": 1.6858452558517456, "learning_rate": 2.687423689413312e-06, "loss": 0.1304, "step": 3239 }, { "epoch": 1.537001897533207, "grad_norm": 1.7636797428131104, "learning_rate": 2.6821852401083048e-06, "loss": 0.167, "step": 3240 }, { "epoch": 1.5374762808349147, "grad_norm": 1.6068103313446045, "learning_rate": 2.6769511104369384e-06, "loss": 0.1536, "step": 3241 }, { "epoch": 1.5379506641366225, "grad_norm": 1.8157026767730713, "learning_rate": 2.6717213034888656e-06, "loss": 0.1601, "step": 3242 }, { "epoch": 1.5384250474383303, "grad_norm": 1.6704944372177124, "learning_rate": 2.6664958223511948e-06, "loss": 0.165, "step": 3243 }, { "epoch": 1.538899430740038, "grad_norm": 1.5348975658416748, "learning_rate": 2.661274670108469e-06, "loss": 0.1695, "step": 3244 }, { "epoch": 1.539373814041746, "grad_norm": 1.7109907865524292, "learning_rate": 2.6560578498426883e-06, "loss": 0.151, "step": 3245 }, { "epoch": 1.5398481973434535, "grad_norm": 1.4413361549377441, "learning_rate": 2.6508453646332845e-06, "loss": 0.1458, "step": 3246 }, { "epoch": 1.5403225806451613, "grad_norm": 1.3164259195327759, "learning_rate": 2.645637217557139e-06, "loss": 0.1393, "step": 3247 }, { "epoch": 1.540796963946869, "grad_norm": 1.6673977375030518, "learning_rate": 2.640433411688572e-06, "loss": 0.1613, "step": 3248 }, { "epoch": 1.5412713472485768, "grad_norm": 1.5010228157043457, "learning_rate": 2.635233950099334e-06, "loss": 0.1433, "step": 3249 }, { "epoch": 1.5417457305502846, "grad_norm": 1.5055230855941772, "learning_rate": 2.630038835858617e-06, "loss": 0.146, "step": 3250 }, { "epoch": 1.5422201138519924, "grad_norm": 1.621572494506836, "learning_rate": 2.624848072033046e-06, "loss": 0.1707, "step": 3251 }, { "epoch": 1.5426944971537002, "grad_norm": 2.2330636978149414, "learning_rate": 2.6196616616866822e-06, "loss": 0.1665, "step": 3252 }, { "epoch": 1.543168880455408, "grad_norm": 1.9100672006607056, "learning_rate": 2.6144796078810065e-06, "loss": 0.1728, "step": 3253 }, { "epoch": 1.5436432637571158, "grad_norm": 1.8044493198394775, "learning_rate": 2.609301913674933e-06, "loss": 0.1741, "step": 3254 }, { "epoch": 1.5441176470588234, "grad_norm": 1.4403407573699951, "learning_rate": 2.6041285821248064e-06, "loss": 0.1417, "step": 3255 }, { "epoch": 1.5445920303605312, "grad_norm": 1.5496286153793335, "learning_rate": 2.598959616284391e-06, "loss": 0.1356, "step": 3256 }, { "epoch": 1.545066413662239, "grad_norm": 2.311690092086792, "learning_rate": 2.5937950192048823e-06, "loss": 0.1931, "step": 3257 }, { "epoch": 1.5455407969639468, "grad_norm": 2.143083095550537, "learning_rate": 2.588634793934882e-06, "loss": 0.1902, "step": 3258 }, { "epoch": 1.5460151802656545, "grad_norm": 2.0627899169921875, "learning_rate": 2.5834789435204245e-06, "loss": 0.1883, "step": 3259 }, { "epoch": 1.5464895635673623, "grad_norm": 2.2060842514038086, "learning_rate": 2.57832747100496e-06, "loss": 0.2299, "step": 3260 }, { "epoch": 1.5469639468690701, "grad_norm": 1.7852225303649902, "learning_rate": 2.5731803794293465e-06, "loss": 0.1833, "step": 3261 }, { "epoch": 1.547438330170778, "grad_norm": 1.6334515810012817, "learning_rate": 2.5680376718318657e-06, "loss": 0.1617, "step": 3262 }, { "epoch": 1.5479127134724857, "grad_norm": 1.4657764434814453, "learning_rate": 2.5628993512482013e-06, "loss": 0.15, "step": 3263 }, { "epoch": 1.5483870967741935, "grad_norm": 1.775926947593689, "learning_rate": 2.557765420711458e-06, "loss": 0.1747, "step": 3264 }, { "epoch": 1.5488614800759013, "grad_norm": 1.7715890407562256, "learning_rate": 2.5526358832521424e-06, "loss": 0.1488, "step": 3265 }, { "epoch": 1.549335863377609, "grad_norm": 1.91201651096344, "learning_rate": 2.5475107418981692e-06, "loss": 0.1736, "step": 3266 }, { "epoch": 1.5498102466793169, "grad_norm": 1.8647923469543457, "learning_rate": 2.5423899996748636e-06, "loss": 0.1741, "step": 3267 }, { "epoch": 1.5502846299810247, "grad_norm": 1.4479807615280151, "learning_rate": 2.5372736596049417e-06, "loss": 0.1228, "step": 3268 }, { "epoch": 1.5507590132827325, "grad_norm": 1.4096297025680542, "learning_rate": 2.532161724708534e-06, "loss": 0.1368, "step": 3269 }, { "epoch": 1.5512333965844403, "grad_norm": 1.8053456544876099, "learning_rate": 2.5270541980031603e-06, "loss": 0.1674, "step": 3270 }, { "epoch": 1.551707779886148, "grad_norm": 1.699059009552002, "learning_rate": 2.521951082503746e-06, "loss": 0.1305, "step": 3271 }, { "epoch": 1.5521821631878558, "grad_norm": 1.85928213596344, "learning_rate": 2.516852381222612e-06, "loss": 0.1696, "step": 3272 }, { "epoch": 1.5526565464895636, "grad_norm": 1.6062089204788208, "learning_rate": 2.5117580971694644e-06, "loss": 0.1237, "step": 3273 }, { "epoch": 1.5531309297912714, "grad_norm": 1.8484961986541748, "learning_rate": 2.5066682333514136e-06, "loss": 0.1724, "step": 3274 }, { "epoch": 1.5536053130929792, "grad_norm": 1.7327786684036255, "learning_rate": 2.5015827927729554e-06, "loss": 0.1314, "step": 3275 }, { "epoch": 1.554079696394687, "grad_norm": 1.7604756355285645, "learning_rate": 2.496501778435977e-06, "loss": 0.1538, "step": 3276 }, { "epoch": 1.5545540796963948, "grad_norm": 1.8276807069778442, "learning_rate": 2.491425193339748e-06, "loss": 0.1595, "step": 3277 }, { "epoch": 1.5550284629981026, "grad_norm": 1.5220667123794556, "learning_rate": 2.4863530404809253e-06, "loss": 0.1231, "step": 3278 }, { "epoch": 1.5555028462998104, "grad_norm": 2.1537914276123047, "learning_rate": 2.4812853228535515e-06, "loss": 0.1959, "step": 3279 }, { "epoch": 1.5559772296015182, "grad_norm": 1.3721140623092651, "learning_rate": 2.4762220434490504e-06, "loss": 0.1232, "step": 3280 }, { "epoch": 1.5564516129032258, "grad_norm": 1.8437730073928833, "learning_rate": 2.4711632052562283e-06, "loss": 0.1598, "step": 3281 }, { "epoch": 1.5569259962049335, "grad_norm": 1.4485067129135132, "learning_rate": 2.466108811261263e-06, "loss": 0.161, "step": 3282 }, { "epoch": 1.5574003795066413, "grad_norm": 1.9926724433898926, "learning_rate": 2.461058864447716e-06, "loss": 0.1368, "step": 3283 }, { "epoch": 1.5578747628083491, "grad_norm": 3.5024831295013428, "learning_rate": 2.456013367796519e-06, "loss": 0.1822, "step": 3284 }, { "epoch": 1.558349146110057, "grad_norm": 1.5812478065490723, "learning_rate": 2.450972324285984e-06, "loss": 0.1446, "step": 3285 }, { "epoch": 1.5588235294117647, "grad_norm": 1.8127206563949585, "learning_rate": 2.445935736891785e-06, "loss": 0.1643, "step": 3286 }, { "epoch": 1.5592979127134725, "grad_norm": 1.8025965690612793, "learning_rate": 2.4409036085869665e-06, "loss": 0.1455, "step": 3287 }, { "epoch": 1.5597722960151803, "grad_norm": 1.6995849609375, "learning_rate": 2.4358759423419476e-06, "loss": 0.1461, "step": 3288 }, { "epoch": 1.560246679316888, "grad_norm": 1.7094231843948364, "learning_rate": 2.43085274112451e-06, "loss": 0.1504, "step": 3289 }, { "epoch": 1.5607210626185957, "grad_norm": 1.381062626838684, "learning_rate": 2.425834007899799e-06, "loss": 0.1279, "step": 3290 }, { "epoch": 1.5611954459203035, "grad_norm": 1.921175241470337, "learning_rate": 2.420819745630326e-06, "loss": 0.1932, "step": 3291 }, { "epoch": 1.5616698292220113, "grad_norm": 1.5312777757644653, "learning_rate": 2.4158099572759564e-06, "loss": 0.1378, "step": 3292 }, { "epoch": 1.562144212523719, "grad_norm": 2.1276259422302246, "learning_rate": 2.4108046457939215e-06, "loss": 0.1814, "step": 3293 }, { "epoch": 1.5626185958254268, "grad_norm": 1.3188360929489136, "learning_rate": 2.405803814138804e-06, "loss": 0.1254, "step": 3294 }, { "epoch": 1.5630929791271346, "grad_norm": 1.5824406147003174, "learning_rate": 2.40080746526255e-06, "loss": 0.1318, "step": 3295 }, { "epoch": 1.5635673624288424, "grad_norm": 1.6440813541412354, "learning_rate": 2.3958156021144495e-06, "loss": 0.1395, "step": 3296 }, { "epoch": 1.5640417457305502, "grad_norm": 1.85137140750885, "learning_rate": 2.390828227641152e-06, "loss": 0.171, "step": 3297 }, { "epoch": 1.564516129032258, "grad_norm": 1.7361198663711548, "learning_rate": 2.385845344786656e-06, "loss": 0.1485, "step": 3298 }, { "epoch": 1.5649905123339658, "grad_norm": 1.779136061668396, "learning_rate": 2.380866956492307e-06, "loss": 0.1833, "step": 3299 }, { "epoch": 1.5654648956356736, "grad_norm": 1.568875789642334, "learning_rate": 2.3758930656968025e-06, "loss": 0.144, "step": 3300 }, { "epoch": 1.5659392789373814, "grad_norm": 1.6992019414901733, "learning_rate": 2.3709236753361777e-06, "loss": 0.1576, "step": 3301 }, { "epoch": 1.5664136622390892, "grad_norm": 1.7569388151168823, "learning_rate": 2.3659587883438106e-06, "loss": 0.1876, "step": 3302 }, { "epoch": 1.566888045540797, "grad_norm": 1.6685066223144531, "learning_rate": 2.36099840765043e-06, "loss": 0.1738, "step": 3303 }, { "epoch": 1.5673624288425048, "grad_norm": 1.763706922531128, "learning_rate": 2.3560425361840976e-06, "loss": 0.1597, "step": 3304 }, { "epoch": 1.5678368121442126, "grad_norm": 1.954229712486267, "learning_rate": 2.3510911768702184e-06, "loss": 0.1906, "step": 3305 }, { "epoch": 1.5683111954459203, "grad_norm": 1.7269539833068848, "learning_rate": 2.346144332631526e-06, "loss": 0.1543, "step": 3306 }, { "epoch": 1.5687855787476281, "grad_norm": 1.9077547788619995, "learning_rate": 2.3412020063880957e-06, "loss": 0.1752, "step": 3307 }, { "epoch": 1.569259962049336, "grad_norm": 1.3996086120605469, "learning_rate": 2.336264201057333e-06, "loss": 0.1241, "step": 3308 }, { "epoch": 1.5697343453510437, "grad_norm": 1.5195519924163818, "learning_rate": 2.331330919553981e-06, "loss": 0.1282, "step": 3309 }, { "epoch": 1.5702087286527515, "grad_norm": 1.725934624671936, "learning_rate": 2.3264021647901014e-06, "loss": 0.1716, "step": 3310 }, { "epoch": 1.5706831119544593, "grad_norm": 1.739709734916687, "learning_rate": 2.3214779396750885e-06, "loss": 0.149, "step": 3311 }, { "epoch": 1.571157495256167, "grad_norm": 1.6437108516693115, "learning_rate": 2.3165582471156643e-06, "loss": 0.1608, "step": 3312 }, { "epoch": 1.571631878557875, "grad_norm": 2.1839261054992676, "learning_rate": 2.311643090015877e-06, "loss": 0.1525, "step": 3313 }, { "epoch": 1.5721062618595827, "grad_norm": 1.8370964527130127, "learning_rate": 2.3067324712770967e-06, "loss": 0.1484, "step": 3314 }, { "epoch": 1.5725806451612905, "grad_norm": 1.8021410703659058, "learning_rate": 2.301826393798008e-06, "loss": 0.1618, "step": 3315 }, { "epoch": 1.573055028462998, "grad_norm": 1.7016773223876953, "learning_rate": 2.296924860474621e-06, "loss": 0.137, "step": 3316 }, { "epoch": 1.5735294117647058, "grad_norm": 1.573151707649231, "learning_rate": 2.2920278742002677e-06, "loss": 0.1449, "step": 3317 }, { "epoch": 1.5740037950664136, "grad_norm": 1.4972699880599976, "learning_rate": 2.287135437865583e-06, "loss": 0.1369, "step": 3318 }, { "epoch": 1.5744781783681214, "grad_norm": 1.9173699617385864, "learning_rate": 2.282247554358531e-06, "loss": 0.1329, "step": 3319 }, { "epoch": 1.5749525616698292, "grad_norm": 1.7459148168563843, "learning_rate": 2.2773642265643734e-06, "loss": 0.1782, "step": 3320 }, { "epoch": 1.575426944971537, "grad_norm": 1.479753851890564, "learning_rate": 2.272485457365695e-06, "loss": 0.1241, "step": 3321 }, { "epoch": 1.5759013282732448, "grad_norm": 1.529350996017456, "learning_rate": 2.267611249642383e-06, "loss": 0.1352, "step": 3322 }, { "epoch": 1.5763757115749526, "grad_norm": 2.032287120819092, "learning_rate": 2.2627416062716366e-06, "loss": 0.1704, "step": 3323 }, { "epoch": 1.5768500948766604, "grad_norm": 1.399459719657898, "learning_rate": 2.257876530127958e-06, "loss": 0.1306, "step": 3324 }, { "epoch": 1.577324478178368, "grad_norm": 1.9230834245681763, "learning_rate": 2.25301602408315e-06, "loss": 0.2053, "step": 3325 }, { "epoch": 1.5777988614800758, "grad_norm": 1.5427366495132446, "learning_rate": 2.248160091006326e-06, "loss": 0.137, "step": 3326 }, { "epoch": 1.5782732447817835, "grad_norm": 1.7453058958053589, "learning_rate": 2.243308733763889e-06, "loss": 0.1422, "step": 3327 }, { "epoch": 1.5787476280834913, "grad_norm": 1.7460287809371948, "learning_rate": 2.2384619552195518e-06, "loss": 0.1742, "step": 3328 }, { "epoch": 1.5792220113851991, "grad_norm": 1.7111773490905762, "learning_rate": 2.233619758234321e-06, "loss": 0.1552, "step": 3329 }, { "epoch": 1.579696394686907, "grad_norm": 1.5118764638900757, "learning_rate": 2.2287821456664926e-06, "loss": 0.1266, "step": 3330 }, { "epoch": 1.5801707779886147, "grad_norm": 1.605214238166809, "learning_rate": 2.2239491203716644e-06, "loss": 0.1548, "step": 3331 }, { "epoch": 1.5806451612903225, "grad_norm": 1.5546764135360718, "learning_rate": 2.2191206852027225e-06, "loss": 0.1369, "step": 3332 }, { "epoch": 1.5811195445920303, "grad_norm": 1.529728889465332, "learning_rate": 2.214296843009848e-06, "loss": 0.1205, "step": 3333 }, { "epoch": 1.581593927893738, "grad_norm": 1.7725614309310913, "learning_rate": 2.2094775966405045e-06, "loss": 0.1509, "step": 3334 }, { "epoch": 1.5820683111954459, "grad_norm": 1.8418389558792114, "learning_rate": 2.2046629489394422e-06, "loss": 0.1339, "step": 3335 }, { "epoch": 1.5825426944971537, "grad_norm": 1.8874013423919678, "learning_rate": 2.199852902748704e-06, "loss": 0.1873, "step": 3336 }, { "epoch": 1.5830170777988615, "grad_norm": 1.5304529666900635, "learning_rate": 2.19504746090761e-06, "loss": 0.1311, "step": 3337 }, { "epoch": 1.5834914611005693, "grad_norm": 2.1148695945739746, "learning_rate": 2.19024662625277e-06, "loss": 0.1458, "step": 3338 }, { "epoch": 1.583965844402277, "grad_norm": 1.3242015838623047, "learning_rate": 2.185450401618062e-06, "loss": 0.1238, "step": 3339 }, { "epoch": 1.5844402277039848, "grad_norm": 2.047175645828247, "learning_rate": 2.1806587898346553e-06, "loss": 0.1712, "step": 3340 }, { "epoch": 1.5849146110056926, "grad_norm": 1.6021418571472168, "learning_rate": 2.17587179373099e-06, "loss": 0.1615, "step": 3341 }, { "epoch": 1.5853889943074004, "grad_norm": 1.975977897644043, "learning_rate": 2.1710894161327813e-06, "loss": 0.1399, "step": 3342 }, { "epoch": 1.5858633776091082, "grad_norm": 1.3944038152694702, "learning_rate": 2.1663116598630207e-06, "loss": 0.1451, "step": 3343 }, { "epoch": 1.586337760910816, "grad_norm": 1.7378476858139038, "learning_rate": 2.1615385277419687e-06, "loss": 0.1514, "step": 3344 }, { "epoch": 1.5868121442125238, "grad_norm": 1.55166494846344, "learning_rate": 2.156770022587157e-06, "loss": 0.1545, "step": 3345 }, { "epoch": 1.5872865275142316, "grad_norm": 2.1811869144439697, "learning_rate": 2.1520061472133903e-06, "loss": 0.1508, "step": 3346 }, { "epoch": 1.5877609108159394, "grad_norm": 1.4423187971115112, "learning_rate": 2.147246904432735e-06, "loss": 0.1401, "step": 3347 }, { "epoch": 1.5882352941176472, "grad_norm": 1.590010166168213, "learning_rate": 2.1424922970545283e-06, "loss": 0.1466, "step": 3348 }, { "epoch": 1.588709677419355, "grad_norm": 1.5334136486053467, "learning_rate": 2.1377423278853627e-06, "loss": 0.1193, "step": 3349 }, { "epoch": 1.5891840607210628, "grad_norm": 1.605485200881958, "learning_rate": 2.1329969997291035e-06, "loss": 0.1445, "step": 3350 }, { "epoch": 1.5896584440227703, "grad_norm": 1.5612473487854004, "learning_rate": 2.128256315386865e-06, "loss": 0.1313, "step": 3351 }, { "epoch": 1.5901328273244781, "grad_norm": 1.702793002128601, "learning_rate": 2.1235202776570297e-06, "loss": 0.158, "step": 3352 }, { "epoch": 1.590607210626186, "grad_norm": 1.6103143692016602, "learning_rate": 2.118788889335236e-06, "loss": 0.1707, "step": 3353 }, { "epoch": 1.5910815939278937, "grad_norm": 1.7173638343811035, "learning_rate": 2.11406215321437e-06, "loss": 0.1728, "step": 3354 }, { "epoch": 1.5915559772296015, "grad_norm": 1.4429662227630615, "learning_rate": 2.1093400720845813e-06, "loss": 0.1154, "step": 3355 }, { "epoch": 1.5920303605313093, "grad_norm": 1.5256520509719849, "learning_rate": 2.1046226487332655e-06, "loss": 0.1471, "step": 3356 }, { "epoch": 1.592504743833017, "grad_norm": 1.6516417264938354, "learning_rate": 2.099909885945075e-06, "loss": 0.1527, "step": 3357 }, { "epoch": 1.592979127134725, "grad_norm": 1.8592281341552734, "learning_rate": 2.0952017865019036e-06, "loss": 0.1506, "step": 3358 }, { "epoch": 1.5934535104364327, "grad_norm": 1.5824545621871948, "learning_rate": 2.0904983531828947e-06, "loss": 0.1468, "step": 3359 }, { "epoch": 1.5939278937381403, "grad_norm": 1.5531175136566162, "learning_rate": 2.085799588764439e-06, "loss": 0.1386, "step": 3360 }, { "epoch": 1.594402277039848, "grad_norm": 1.6820399761199951, "learning_rate": 2.081105496020173e-06, "loss": 0.157, "step": 3361 }, { "epoch": 1.5948766603415558, "grad_norm": 1.3576366901397705, "learning_rate": 2.076416077720973e-06, "loss": 0.1285, "step": 3362 }, { "epoch": 1.5953510436432636, "grad_norm": 1.8949618339538574, "learning_rate": 2.0717313366349534e-06, "loss": 0.1585, "step": 3363 }, { "epoch": 1.5958254269449714, "grad_norm": 1.572543978691101, "learning_rate": 2.067051275527472e-06, "loss": 0.1509, "step": 3364 }, { "epoch": 1.5962998102466792, "grad_norm": 1.6558380126953125, "learning_rate": 2.0623758971611252e-06, "loss": 0.1409, "step": 3365 }, { "epoch": 1.596774193548387, "grad_norm": 1.892311930656433, "learning_rate": 2.0577052042957378e-06, "loss": 0.189, "step": 3366 }, { "epoch": 1.5972485768500948, "grad_norm": 1.8912239074707031, "learning_rate": 2.0530391996883782e-06, "loss": 0.15, "step": 3367 }, { "epoch": 1.5977229601518026, "grad_norm": 1.8770484924316406, "learning_rate": 2.0483778860933377e-06, "loss": 0.1802, "step": 3368 }, { "epoch": 1.5981973434535104, "grad_norm": 1.7015761137008667, "learning_rate": 2.0437212662621477e-06, "loss": 0.1437, "step": 3369 }, { "epoch": 1.5986717267552182, "grad_norm": 1.6395004987716675, "learning_rate": 2.0390693429435626e-06, "loss": 0.1361, "step": 3370 }, { "epoch": 1.599146110056926, "grad_norm": 1.7334997653961182, "learning_rate": 2.0344221188835667e-06, "loss": 0.1372, "step": 3371 }, { "epoch": 1.5996204933586338, "grad_norm": 1.7484663724899292, "learning_rate": 2.0297795968253753e-06, "loss": 0.173, "step": 3372 }, { "epoch": 1.6000948766603416, "grad_norm": 1.5254170894622803, "learning_rate": 2.0251417795094166e-06, "loss": 0.1516, "step": 3373 }, { "epoch": 1.6005692599620494, "grad_norm": 1.8212133646011353, "learning_rate": 2.020508669673352e-06, "loss": 0.151, "step": 3374 }, { "epoch": 1.6010436432637571, "grad_norm": 1.369913101196289, "learning_rate": 2.0158802700520576e-06, "loss": 0.1338, "step": 3375 }, { "epoch": 1.601518026565465, "grad_norm": 1.2662460803985596, "learning_rate": 2.0112565833776364e-06, "loss": 0.1228, "step": 3376 }, { "epoch": 1.6019924098671727, "grad_norm": 1.7709726095199585, "learning_rate": 2.0066376123793984e-06, "loss": 0.1726, "step": 3377 }, { "epoch": 1.6024667931688805, "grad_norm": 1.5085747241973877, "learning_rate": 2.0020233597838813e-06, "loss": 0.1034, "step": 3378 }, { "epoch": 1.6029411764705883, "grad_norm": 1.3893810510635376, "learning_rate": 1.99741382831483e-06, "loss": 0.131, "step": 3379 }, { "epoch": 1.603415559772296, "grad_norm": 2.0741636753082275, "learning_rate": 1.9928090206932083e-06, "loss": 0.2064, "step": 3380 }, { "epoch": 1.603889943074004, "grad_norm": 1.5764415264129639, "learning_rate": 1.9882089396371896e-06, "loss": 0.1558, "step": 3381 }, { "epoch": 1.6043643263757117, "grad_norm": 1.7845758199691772, "learning_rate": 1.983613587862153e-06, "loss": 0.1599, "step": 3382 }, { "epoch": 1.6048387096774195, "grad_norm": 1.610253095626831, "learning_rate": 1.9790229680806883e-06, "loss": 0.1682, "step": 3383 }, { "epoch": 1.6053130929791273, "grad_norm": 1.7443877458572388, "learning_rate": 1.9744370830025937e-06, "loss": 0.155, "step": 3384 }, { "epoch": 1.605787476280835, "grad_norm": 1.7004719972610474, "learning_rate": 1.9698559353348735e-06, "loss": 0.1436, "step": 3385 }, { "epoch": 1.6062618595825426, "grad_norm": 1.878106713294983, "learning_rate": 1.9652795277817348e-06, "loss": 0.181, "step": 3386 }, { "epoch": 1.6067362428842504, "grad_norm": 1.6228288412094116, "learning_rate": 1.960707863044582e-06, "loss": 0.1511, "step": 3387 }, { "epoch": 1.6072106261859582, "grad_norm": 1.7127445936203003, "learning_rate": 1.9561409438220245e-06, "loss": 0.1445, "step": 3388 }, { "epoch": 1.607685009487666, "grad_norm": 1.5435030460357666, "learning_rate": 1.9515787728098733e-06, "loss": 0.1298, "step": 3389 }, { "epoch": 1.6081593927893738, "grad_norm": 1.4933137893676758, "learning_rate": 1.9470213527011282e-06, "loss": 0.1415, "step": 3390 }, { "epoch": 1.6086337760910816, "grad_norm": 1.9454411268234253, "learning_rate": 1.9424686861859933e-06, "loss": 0.174, "step": 3391 }, { "epoch": 1.6091081593927894, "grad_norm": 1.5008386373519897, "learning_rate": 1.937920775951857e-06, "loss": 0.1465, "step": 3392 }, { "epoch": 1.6095825426944972, "grad_norm": 1.5946168899536133, "learning_rate": 1.9333776246833092e-06, "loss": 0.1379, "step": 3393 }, { "epoch": 1.610056925996205, "grad_norm": 1.4043335914611816, "learning_rate": 1.9288392350621275e-06, "loss": 0.1265, "step": 3394 }, { "epoch": 1.6105313092979125, "grad_norm": 1.7210297584533691, "learning_rate": 1.9243056097672796e-06, "loss": 0.1668, "step": 3395 }, { "epoch": 1.6110056925996203, "grad_norm": 1.7458949089050293, "learning_rate": 1.9197767514749156e-06, "loss": 0.16, "step": 3396 }, { "epoch": 1.6114800759013281, "grad_norm": 1.8597474098205566, "learning_rate": 1.915252662858378e-06, "loss": 0.1811, "step": 3397 }, { "epoch": 1.611954459203036, "grad_norm": 1.412137508392334, "learning_rate": 1.910733346588194e-06, "loss": 0.1204, "step": 3398 }, { "epoch": 1.6124288425047437, "grad_norm": 1.6285221576690674, "learning_rate": 1.9062188053320663e-06, "loss": 0.1567, "step": 3399 }, { "epoch": 1.6129032258064515, "grad_norm": 2.3471789360046387, "learning_rate": 1.901709041754889e-06, "loss": 0.1562, "step": 3400 }, { "epoch": 1.6133776091081593, "grad_norm": 1.5829018354415894, "learning_rate": 1.8972040585187256e-06, "loss": 0.1545, "step": 3401 }, { "epoch": 1.613851992409867, "grad_norm": 1.5271830558776855, "learning_rate": 1.8927038582828261e-06, "loss": 0.1294, "step": 3402 }, { "epoch": 1.614326375711575, "grad_norm": 1.8951029777526855, "learning_rate": 1.8882084437036142e-06, "loss": 0.1618, "step": 3403 }, { "epoch": 1.6148007590132827, "grad_norm": 1.707849383354187, "learning_rate": 1.8837178174346882e-06, "loss": 0.1544, "step": 3404 }, { "epoch": 1.6152751423149905, "grad_norm": 1.8298168182373047, "learning_rate": 1.8792319821268223e-06, "loss": 0.1755, "step": 3405 }, { "epoch": 1.6157495256166983, "grad_norm": 1.6558297872543335, "learning_rate": 1.8747509404279595e-06, "loss": 0.1473, "step": 3406 }, { "epoch": 1.616223908918406, "grad_norm": 1.5755760669708252, "learning_rate": 1.8702746949832117e-06, "loss": 0.1447, "step": 3407 }, { "epoch": 1.6166982922201139, "grad_norm": 1.9705390930175781, "learning_rate": 1.8658032484348632e-06, "loss": 0.1738, "step": 3408 }, { "epoch": 1.6171726755218216, "grad_norm": 2.1337902545928955, "learning_rate": 1.8613366034223668e-06, "loss": 0.2303, "step": 3409 }, { "epoch": 1.6176470588235294, "grad_norm": 1.5434858798980713, "learning_rate": 1.8568747625823403e-06, "loss": 0.1602, "step": 3410 }, { "epoch": 1.6181214421252372, "grad_norm": 2.1457488536834717, "learning_rate": 1.8524177285485588e-06, "loss": 0.1905, "step": 3411 }, { "epoch": 1.618595825426945, "grad_norm": 1.500415563583374, "learning_rate": 1.8479655039519683e-06, "loss": 0.1368, "step": 3412 }, { "epoch": 1.6190702087286528, "grad_norm": 1.6900932788848877, "learning_rate": 1.8435180914206763e-06, "loss": 0.1541, "step": 3413 }, { "epoch": 1.6195445920303606, "grad_norm": 1.5415065288543701, "learning_rate": 1.8390754935799404e-06, "loss": 0.1665, "step": 3414 }, { "epoch": 1.6200189753320684, "grad_norm": 1.7073814868927002, "learning_rate": 1.8346377130521864e-06, "loss": 0.1549, "step": 3415 }, { "epoch": 1.6204933586337762, "grad_norm": 1.7535865306854248, "learning_rate": 1.8302047524569888e-06, "loss": 0.1497, "step": 3416 }, { "epoch": 1.620967741935484, "grad_norm": 1.7559723854064941, "learning_rate": 1.8257766144110823e-06, "loss": 0.1579, "step": 3417 }, { "epoch": 1.6214421252371918, "grad_norm": 1.74555242061615, "learning_rate": 1.8213533015283524e-06, "loss": 0.1379, "step": 3418 }, { "epoch": 1.6219165085388996, "grad_norm": 2.0303471088409424, "learning_rate": 1.8169348164198408e-06, "loss": 0.1545, "step": 3419 }, { "epoch": 1.6223908918406074, "grad_norm": 1.6806904077529907, "learning_rate": 1.8125211616937289e-06, "loss": 0.1171, "step": 3420 }, { "epoch": 1.6228652751423152, "grad_norm": 1.7993685007095337, "learning_rate": 1.8081123399553569e-06, "loss": 0.1789, "step": 3421 }, { "epoch": 1.6233396584440227, "grad_norm": 1.8345065116882324, "learning_rate": 1.8037083538072109e-06, "loss": 0.16, "step": 3422 }, { "epoch": 1.6238140417457305, "grad_norm": 1.875479817390442, "learning_rate": 1.7993092058489158e-06, "loss": 0.1429, "step": 3423 }, { "epoch": 1.6242884250474383, "grad_norm": 1.7342585325241089, "learning_rate": 1.794914898677249e-06, "loss": 0.1624, "step": 3424 }, { "epoch": 1.624762808349146, "grad_norm": 1.4515552520751953, "learning_rate": 1.7905254348861235e-06, "loss": 0.1347, "step": 3425 }, { "epoch": 1.625237191650854, "grad_norm": 1.9419041872024536, "learning_rate": 1.7861408170665961e-06, "loss": 0.1561, "step": 3426 }, { "epoch": 1.6257115749525617, "grad_norm": 2.1055121421813965, "learning_rate": 1.7817610478068659e-06, "loss": 0.1867, "step": 3427 }, { "epoch": 1.6261859582542695, "grad_norm": 1.6690064668655396, "learning_rate": 1.7773861296922657e-06, "loss": 0.1776, "step": 3428 }, { "epoch": 1.6266603415559773, "grad_norm": 2.282166004180908, "learning_rate": 1.7730160653052685e-06, "loss": 0.205, "step": 3429 }, { "epoch": 1.6271347248576848, "grad_norm": 1.7570334672927856, "learning_rate": 1.7686508572254802e-06, "loss": 0.1546, "step": 3430 }, { "epoch": 1.6276091081593926, "grad_norm": 1.8625390529632568, "learning_rate": 1.7642905080296346e-06, "loss": 0.1668, "step": 3431 }, { "epoch": 1.6280834914611004, "grad_norm": 1.4064631462097168, "learning_rate": 1.7599350202916066e-06, "loss": 0.1402, "step": 3432 }, { "epoch": 1.6285578747628082, "grad_norm": 1.3597134351730347, "learning_rate": 1.7555843965823992e-06, "loss": 0.1314, "step": 3433 }, { "epoch": 1.629032258064516, "grad_norm": 1.76325523853302, "learning_rate": 1.7512386394701386e-06, "loss": 0.1729, "step": 3434 }, { "epoch": 1.6295066413662238, "grad_norm": 1.6423853635787964, "learning_rate": 1.7468977515200835e-06, "loss": 0.1287, "step": 3435 }, { "epoch": 1.6299810246679316, "grad_norm": 1.6672308444976807, "learning_rate": 1.7425617352946178e-06, "loss": 0.1701, "step": 3436 }, { "epoch": 1.6304554079696394, "grad_norm": 1.6599950790405273, "learning_rate": 1.7382305933532494e-06, "loss": 0.1618, "step": 3437 }, { "epoch": 1.6309297912713472, "grad_norm": 1.5585594177246094, "learning_rate": 1.7339043282526103e-06, "loss": 0.149, "step": 3438 }, { "epoch": 1.631404174573055, "grad_norm": 1.8305972814559937, "learning_rate": 1.7295829425464494e-06, "loss": 0.1789, "step": 3439 }, { "epoch": 1.6318785578747628, "grad_norm": 1.763118028640747, "learning_rate": 1.7252664387856367e-06, "loss": 0.1734, "step": 3440 }, { "epoch": 1.6323529411764706, "grad_norm": 1.743260383605957, "learning_rate": 1.7209548195181625e-06, "loss": 0.1609, "step": 3441 }, { "epoch": 1.6328273244781784, "grad_norm": 1.892848014831543, "learning_rate": 1.7166480872891333e-06, "loss": 0.1606, "step": 3442 }, { "epoch": 1.6333017077798861, "grad_norm": 1.9057481288909912, "learning_rate": 1.7123462446407746e-06, "loss": 0.1659, "step": 3443 }, { "epoch": 1.633776091081594, "grad_norm": 1.7405368089675903, "learning_rate": 1.7080492941124139e-06, "loss": 0.1593, "step": 3444 }, { "epoch": 1.6342504743833017, "grad_norm": 1.713452696800232, "learning_rate": 1.7037572382405031e-06, "loss": 0.1283, "step": 3445 }, { "epoch": 1.6347248576850095, "grad_norm": 1.595138430595398, "learning_rate": 1.6994700795586027e-06, "loss": 0.1319, "step": 3446 }, { "epoch": 1.6351992409867173, "grad_norm": 1.756667137145996, "learning_rate": 1.6951878205973738e-06, "loss": 0.1677, "step": 3447 }, { "epoch": 1.635673624288425, "grad_norm": 1.6868460178375244, "learning_rate": 1.6909104638845986e-06, "loss": 0.1546, "step": 3448 }, { "epoch": 1.636148007590133, "grad_norm": 1.5798834562301636, "learning_rate": 1.686638011945151e-06, "loss": 0.1525, "step": 3449 }, { "epoch": 1.6366223908918407, "grad_norm": 1.3165028095245361, "learning_rate": 1.682370467301021e-06, "loss": 0.1139, "step": 3450 }, { "epoch": 1.6370967741935485, "grad_norm": 1.6068346500396729, "learning_rate": 1.6781078324712973e-06, "loss": 0.1436, "step": 3451 }, { "epoch": 1.6375711574952563, "grad_norm": 1.4109083414077759, "learning_rate": 1.6738501099721737e-06, "loss": 0.1366, "step": 3452 }, { "epoch": 1.638045540796964, "grad_norm": 1.7669192552566528, "learning_rate": 1.6695973023169375e-06, "loss": 0.1751, "step": 3453 }, { "epoch": 1.6385199240986719, "grad_norm": 1.5503531694412231, "learning_rate": 1.6653494120159842e-06, "loss": 0.1462, "step": 3454 }, { "epoch": 1.6389943074003797, "grad_norm": 1.9344103336334229, "learning_rate": 1.6611064415767941e-06, "loss": 0.2062, "step": 3455 }, { "epoch": 1.6394686907020875, "grad_norm": 1.5301547050476074, "learning_rate": 1.6568683935039554e-06, "loss": 0.1375, "step": 3456 }, { "epoch": 1.639943074003795, "grad_norm": 1.9043666124343872, "learning_rate": 1.6526352702991478e-06, "loss": 0.199, "step": 3457 }, { "epoch": 1.6404174573055028, "grad_norm": 1.684772253036499, "learning_rate": 1.6484070744611358e-06, "loss": 0.1337, "step": 3458 }, { "epoch": 1.6408918406072106, "grad_norm": 1.598937749862671, "learning_rate": 1.6441838084857863e-06, "loss": 0.1523, "step": 3459 }, { "epoch": 1.6413662239089184, "grad_norm": 1.815584421157837, "learning_rate": 1.6399654748660498e-06, "loss": 0.1498, "step": 3460 }, { "epoch": 1.6418406072106262, "grad_norm": 1.7786113023757935, "learning_rate": 1.6357520760919675e-06, "loss": 0.1683, "step": 3461 }, { "epoch": 1.642314990512334, "grad_norm": 1.3854840993881226, "learning_rate": 1.6315436146506702e-06, "loss": 0.1122, "step": 3462 }, { "epoch": 1.6427893738140418, "grad_norm": 1.40729558467865, "learning_rate": 1.6273400930263672e-06, "loss": 0.1333, "step": 3463 }, { "epoch": 1.6432637571157496, "grad_norm": 1.844829797744751, "learning_rate": 1.6231415137003536e-06, "loss": 0.145, "step": 3464 }, { "epoch": 1.6437381404174574, "grad_norm": 2.1833529472351074, "learning_rate": 1.6189478791510116e-06, "loss": 0.189, "step": 3465 }, { "epoch": 1.644212523719165, "grad_norm": 1.6294044256210327, "learning_rate": 1.614759191853803e-06, "loss": 0.1264, "step": 3466 }, { "epoch": 1.6446869070208727, "grad_norm": 1.6471272706985474, "learning_rate": 1.6105754542812702e-06, "loss": 0.1396, "step": 3467 }, { "epoch": 1.6451612903225805, "grad_norm": 1.920088768005371, "learning_rate": 1.6063966689030275e-06, "loss": 0.1704, "step": 3468 }, { "epoch": 1.6456356736242883, "grad_norm": 1.8987756967544556, "learning_rate": 1.6022228381857729e-06, "loss": 0.1184, "step": 3469 }, { "epoch": 1.646110056925996, "grad_norm": 1.6714553833007812, "learning_rate": 1.5980539645932802e-06, "loss": 0.1568, "step": 3470 }, { "epoch": 1.646584440227704, "grad_norm": 1.6025445461273193, "learning_rate": 1.5938900505863886e-06, "loss": 0.143, "step": 3471 }, { "epoch": 1.6470588235294117, "grad_norm": 1.8814153671264648, "learning_rate": 1.5897310986230196e-06, "loss": 0.1493, "step": 3472 }, { "epoch": 1.6475332068311195, "grad_norm": 1.7132648229599, "learning_rate": 1.5855771111581586e-06, "loss": 0.1695, "step": 3473 }, { "epoch": 1.6480075901328273, "grad_norm": 1.6808983087539673, "learning_rate": 1.5814280906438639e-06, "loss": 0.1711, "step": 3474 }, { "epoch": 1.648481973434535, "grad_norm": 2.184779405593872, "learning_rate": 1.5772840395292632e-06, "loss": 0.1722, "step": 3475 }, { "epoch": 1.6489563567362429, "grad_norm": 1.5792295932769775, "learning_rate": 1.5731449602605487e-06, "loss": 0.152, "step": 3476 }, { "epoch": 1.6494307400379506, "grad_norm": 1.4124369621276855, "learning_rate": 1.5690108552809746e-06, "loss": 0.1496, "step": 3477 }, { "epoch": 1.6499051233396584, "grad_norm": 1.8069303035736084, "learning_rate": 1.5648817270308648e-06, "loss": 0.1382, "step": 3478 }, { "epoch": 1.6503795066413662, "grad_norm": 1.832484245300293, "learning_rate": 1.5607575779476047e-06, "loss": 0.148, "step": 3479 }, { "epoch": 1.650853889943074, "grad_norm": 2.0292952060699463, "learning_rate": 1.556638410465635e-06, "loss": 0.1711, "step": 3480 }, { "epoch": 1.6513282732447818, "grad_norm": 1.6780633926391602, "learning_rate": 1.552524227016462e-06, "loss": 0.1237, "step": 3481 }, { "epoch": 1.6518026565464896, "grad_norm": 1.6623648405075073, "learning_rate": 1.548415030028645e-06, "loss": 0.1589, "step": 3482 }, { "epoch": 1.6522770398481974, "grad_norm": 1.652055263519287, "learning_rate": 1.5443108219278036e-06, "loss": 0.1615, "step": 3483 }, { "epoch": 1.6527514231499052, "grad_norm": 1.6934820413589478, "learning_rate": 1.5402116051366111e-06, "loss": 0.1585, "step": 3484 }, { "epoch": 1.653225806451613, "grad_norm": 1.5433070659637451, "learning_rate": 1.5361173820747942e-06, "loss": 0.1395, "step": 3485 }, { "epoch": 1.6537001897533208, "grad_norm": 1.812530755996704, "learning_rate": 1.5320281551591366e-06, "loss": 0.1592, "step": 3486 }, { "epoch": 1.6541745730550286, "grad_norm": 2.096843957901001, "learning_rate": 1.5279439268034634e-06, "loss": 0.1543, "step": 3487 }, { "epoch": 1.6546489563567364, "grad_norm": 1.825169563293457, "learning_rate": 1.5238646994186546e-06, "loss": 0.153, "step": 3488 }, { "epoch": 1.6551233396584442, "grad_norm": 1.7507545948028564, "learning_rate": 1.519790475412638e-06, "loss": 0.1509, "step": 3489 }, { "epoch": 1.655597722960152, "grad_norm": 1.6845128536224365, "learning_rate": 1.515721257190389e-06, "loss": 0.1752, "step": 3490 }, { "epoch": 1.6560721062618597, "grad_norm": 1.793799638748169, "learning_rate": 1.5116570471539294e-06, "loss": 0.1689, "step": 3491 }, { "epoch": 1.6565464895635673, "grad_norm": 1.5899111032485962, "learning_rate": 1.5075978477023156e-06, "loss": 0.1291, "step": 3492 }, { "epoch": 1.657020872865275, "grad_norm": 1.5500413179397583, "learning_rate": 1.5035436612316567e-06, "loss": 0.1548, "step": 3493 }, { "epoch": 1.657495256166983, "grad_norm": 1.3642367124557495, "learning_rate": 1.4994944901351006e-06, "loss": 0.1452, "step": 3494 }, { "epoch": 1.6579696394686907, "grad_norm": 1.7189817428588867, "learning_rate": 1.4954503368028305e-06, "loss": 0.1784, "step": 3495 }, { "epoch": 1.6584440227703985, "grad_norm": 1.5453788042068481, "learning_rate": 1.4914112036220696e-06, "loss": 0.1572, "step": 3496 }, { "epoch": 1.6589184060721063, "grad_norm": 1.3796250820159912, "learning_rate": 1.4873770929770782e-06, "loss": 0.1385, "step": 3497 }, { "epoch": 1.659392789373814, "grad_norm": 1.3950660228729248, "learning_rate": 1.4833480072491524e-06, "loss": 0.1264, "step": 3498 }, { "epoch": 1.6598671726755219, "grad_norm": 1.7917977571487427, "learning_rate": 1.4793239488166222e-06, "loss": 0.1483, "step": 3499 }, { "epoch": 1.6603415559772297, "grad_norm": 1.921996831893921, "learning_rate": 1.4753049200548519e-06, "loss": 0.2267, "step": 3500 }, { "epoch": 1.6608159392789372, "grad_norm": 1.6829748153686523, "learning_rate": 1.4712909233362304e-06, "loss": 0.1554, "step": 3501 }, { "epoch": 1.661290322580645, "grad_norm": 1.645816445350647, "learning_rate": 1.4672819610301802e-06, "loss": 0.1642, "step": 3502 }, { "epoch": 1.6617647058823528, "grad_norm": 1.2264525890350342, "learning_rate": 1.4632780355031573e-06, "loss": 0.1073, "step": 3503 }, { "epoch": 1.6622390891840606, "grad_norm": 1.5682717561721802, "learning_rate": 1.459279149118632e-06, "loss": 0.1596, "step": 3504 }, { "epoch": 1.6627134724857684, "grad_norm": 1.7140616178512573, "learning_rate": 1.455285304237114e-06, "loss": 0.1499, "step": 3505 }, { "epoch": 1.6631878557874762, "grad_norm": 2.096163749694824, "learning_rate": 1.4512965032161242e-06, "loss": 0.1381, "step": 3506 }, { "epoch": 1.663662239089184, "grad_norm": 1.4052388668060303, "learning_rate": 1.4473127484102157e-06, "loss": 0.1269, "step": 3507 }, { "epoch": 1.6641366223908918, "grad_norm": 1.667314052581787, "learning_rate": 1.4433340421709597e-06, "loss": 0.1574, "step": 3508 }, { "epoch": 1.6646110056925996, "grad_norm": 1.8331180810928345, "learning_rate": 1.4393603868469464e-06, "loss": 0.1621, "step": 3509 }, { "epoch": 1.6650853889943074, "grad_norm": 1.3572697639465332, "learning_rate": 1.4353917847837883e-06, "loss": 0.1127, "step": 3510 }, { "epoch": 1.6655597722960152, "grad_norm": 1.5188183784484863, "learning_rate": 1.4314282383241097e-06, "loss": 0.1292, "step": 3511 }, { "epoch": 1.666034155597723, "grad_norm": 1.5787996053695679, "learning_rate": 1.4274697498075495e-06, "loss": 0.1352, "step": 3512 }, { "epoch": 1.6665085388994307, "grad_norm": 1.8923653364181519, "learning_rate": 1.423516321570767e-06, "loss": 0.1811, "step": 3513 }, { "epoch": 1.6669829222011385, "grad_norm": 1.3783034086227417, "learning_rate": 1.419567955947434e-06, "loss": 0.1151, "step": 3514 }, { "epoch": 1.6674573055028463, "grad_norm": 1.7358620166778564, "learning_rate": 1.4156246552682274e-06, "loss": 0.178, "step": 3515 }, { "epoch": 1.6679316888045541, "grad_norm": 1.7342180013656616, "learning_rate": 1.4116864218608416e-06, "loss": 0.1696, "step": 3516 }, { "epoch": 1.668406072106262, "grad_norm": 1.8265262842178345, "learning_rate": 1.4077532580499753e-06, "loss": 0.1312, "step": 3517 }, { "epoch": 1.6688804554079697, "grad_norm": 1.7043464183807373, "learning_rate": 1.4038251661573387e-06, "loss": 0.1454, "step": 3518 }, { "epoch": 1.6693548387096775, "grad_norm": 1.6283822059631348, "learning_rate": 1.3999021485016429e-06, "loss": 0.1431, "step": 3519 }, { "epoch": 1.6698292220113853, "grad_norm": 1.8391207456588745, "learning_rate": 1.3959842073986085e-06, "loss": 0.1677, "step": 3520 }, { "epoch": 1.670303605313093, "grad_norm": 1.350468635559082, "learning_rate": 1.3920713451609535e-06, "loss": 0.1179, "step": 3521 }, { "epoch": 1.6707779886148009, "grad_norm": 2.141695976257324, "learning_rate": 1.3881635640984048e-06, "loss": 0.1947, "step": 3522 }, { "epoch": 1.6712523719165087, "grad_norm": 1.6811950206756592, "learning_rate": 1.384260866517686e-06, "loss": 0.1595, "step": 3523 }, { "epoch": 1.6717267552182165, "grad_norm": 1.6273670196533203, "learning_rate": 1.3803632547225242e-06, "loss": 0.1674, "step": 3524 }, { "epoch": 1.6722011385199242, "grad_norm": 1.6808335781097412, "learning_rate": 1.376470731013636e-06, "loss": 0.1413, "step": 3525 }, { "epoch": 1.672675521821632, "grad_norm": 1.9226058721542358, "learning_rate": 1.372583297688741e-06, "loss": 0.1372, "step": 3526 }, { "epoch": 1.6731499051233396, "grad_norm": 1.719659686088562, "learning_rate": 1.368700957042557e-06, "loss": 0.1393, "step": 3527 }, { "epoch": 1.6736242884250474, "grad_norm": 1.4148011207580566, "learning_rate": 1.3648237113667839e-06, "loss": 0.144, "step": 3528 }, { "epoch": 1.6740986717267552, "grad_norm": 1.5885848999023438, "learning_rate": 1.3609515629501279e-06, "loss": 0.1342, "step": 3529 }, { "epoch": 1.674573055028463, "grad_norm": 1.828921914100647, "learning_rate": 1.3570845140782752e-06, "loss": 0.1664, "step": 3530 }, { "epoch": 1.6750474383301708, "grad_norm": 1.8899797201156616, "learning_rate": 1.3532225670339095e-06, "loss": 0.1795, "step": 3531 }, { "epoch": 1.6755218216318786, "grad_norm": 1.7065174579620361, "learning_rate": 1.3493657240966974e-06, "loss": 0.1541, "step": 3532 }, { "epoch": 1.6759962049335864, "grad_norm": 1.6612739562988281, "learning_rate": 1.3455139875433e-06, "loss": 0.1717, "step": 3533 }, { "epoch": 1.6764705882352942, "grad_norm": 2.0222203731536865, "learning_rate": 1.3416673596473528e-06, "loss": 0.1909, "step": 3534 }, { "epoch": 1.676944971537002, "grad_norm": 1.3110690116882324, "learning_rate": 1.3378258426794888e-06, "loss": 0.1174, "step": 3535 }, { "epoch": 1.6774193548387095, "grad_norm": 1.338375449180603, "learning_rate": 1.3339894389073104e-06, "loss": 0.1189, "step": 3536 }, { "epoch": 1.6778937381404173, "grad_norm": 1.7583532333374023, "learning_rate": 1.3301581505954131e-06, "loss": 0.174, "step": 3537 }, { "epoch": 1.678368121442125, "grad_norm": 1.7339435815811157, "learning_rate": 1.3263319800053698e-06, "loss": 0.1568, "step": 3538 }, { "epoch": 1.678842504743833, "grad_norm": 1.5173304080963135, "learning_rate": 1.3225109293957272e-06, "loss": 0.1379, "step": 3539 }, { "epoch": 1.6793168880455407, "grad_norm": 1.5127456188201904, "learning_rate": 1.3186950010220156e-06, "loss": 0.1355, "step": 3540 }, { "epoch": 1.6797912713472485, "grad_norm": 1.6670554876327515, "learning_rate": 1.3148841971367387e-06, "loss": 0.1832, "step": 3541 }, { "epoch": 1.6802656546489563, "grad_norm": 1.6429007053375244, "learning_rate": 1.3110785199893806e-06, "loss": 0.1324, "step": 3542 }, { "epoch": 1.680740037950664, "grad_norm": 1.5100871324539185, "learning_rate": 1.3072779718263884e-06, "loss": 0.1413, "step": 3543 }, { "epoch": 1.6812144212523719, "grad_norm": 1.6783661842346191, "learning_rate": 1.3034825548911944e-06, "loss": 0.1182, "step": 3544 }, { "epoch": 1.6816888045540797, "grad_norm": 1.708293080329895, "learning_rate": 1.29969227142419e-06, "loss": 0.1654, "step": 3545 }, { "epoch": 1.6821631878557874, "grad_norm": 1.9683902263641357, "learning_rate": 1.295907123662744e-06, "loss": 0.1813, "step": 3546 }, { "epoch": 1.6826375711574952, "grad_norm": 1.7525516748428345, "learning_rate": 1.2921271138411927e-06, "loss": 0.1453, "step": 3547 }, { "epoch": 1.683111954459203, "grad_norm": 1.5966471433639526, "learning_rate": 1.2883522441908403e-06, "loss": 0.123, "step": 3548 }, { "epoch": 1.6835863377609108, "grad_norm": 1.6996803283691406, "learning_rate": 1.2845825169399506e-06, "loss": 0.1626, "step": 3549 }, { "epoch": 1.6840607210626186, "grad_norm": 1.50482177734375, "learning_rate": 1.2808179343137583e-06, "loss": 0.1239, "step": 3550 }, { "epoch": 1.6845351043643264, "grad_norm": 1.7944713830947876, "learning_rate": 1.2770584985344613e-06, "loss": 0.1743, "step": 3551 }, { "epoch": 1.6850094876660342, "grad_norm": 1.602755069732666, "learning_rate": 1.2733042118212157e-06, "loss": 0.1602, "step": 3552 }, { "epoch": 1.685483870967742, "grad_norm": 1.5536739826202393, "learning_rate": 1.2695550763901376e-06, "loss": 0.1252, "step": 3553 }, { "epoch": 1.6859582542694498, "grad_norm": 1.7096506357192993, "learning_rate": 1.2658110944543055e-06, "loss": 0.1481, "step": 3554 }, { "epoch": 1.6864326375711576, "grad_norm": 1.5002373456954956, "learning_rate": 1.2620722682237575e-06, "loss": 0.1237, "step": 3555 }, { "epoch": 1.6869070208728654, "grad_norm": 1.692552924156189, "learning_rate": 1.258338599905482e-06, "loss": 0.176, "step": 3556 }, { "epoch": 1.6873814041745732, "grad_norm": 1.6304970979690552, "learning_rate": 1.2546100917034322e-06, "loss": 0.1477, "step": 3557 }, { "epoch": 1.687855787476281, "grad_norm": 1.4163672924041748, "learning_rate": 1.2508867458185037e-06, "loss": 0.111, "step": 3558 }, { "epoch": 1.6883301707779887, "grad_norm": 1.4730753898620605, "learning_rate": 1.2471685644485543e-06, "loss": 0.1274, "step": 3559 }, { "epoch": 1.6888045540796965, "grad_norm": 1.3261274099349976, "learning_rate": 1.2434555497883872e-06, "loss": 0.1217, "step": 3560 }, { "epoch": 1.6892789373814043, "grad_norm": 1.5221738815307617, "learning_rate": 1.239747704029758e-06, "loss": 0.1276, "step": 3561 }, { "epoch": 1.689753320683112, "grad_norm": 1.6323662996292114, "learning_rate": 1.2360450293613757e-06, "loss": 0.1552, "step": 3562 }, { "epoch": 1.6902277039848197, "grad_norm": 1.506781816482544, "learning_rate": 1.2323475279688869e-06, "loss": 0.1304, "step": 3563 }, { "epoch": 1.6907020872865275, "grad_norm": 1.473595380783081, "learning_rate": 1.228655202034893e-06, "loss": 0.1362, "step": 3564 }, { "epoch": 1.6911764705882353, "grad_norm": 2.067612409591675, "learning_rate": 1.2249680537389375e-06, "loss": 0.1776, "step": 3565 }, { "epoch": 1.691650853889943, "grad_norm": 2.5935635566711426, "learning_rate": 1.2212860852575093e-06, "loss": 0.1547, "step": 3566 }, { "epoch": 1.6921252371916509, "grad_norm": 1.8970015048980713, "learning_rate": 1.217609298764033e-06, "loss": 0.1734, "step": 3567 }, { "epoch": 1.6925996204933587, "grad_norm": 1.6494086980819702, "learning_rate": 1.213937696428885e-06, "loss": 0.1458, "step": 3568 }, { "epoch": 1.6930740037950665, "grad_norm": 2.007638454437256, "learning_rate": 1.2102712804193705e-06, "loss": 0.2278, "step": 3569 }, { "epoch": 1.6935483870967742, "grad_norm": 1.6985396146774292, "learning_rate": 1.2066100528997415e-06, "loss": 0.169, "step": 3570 }, { "epoch": 1.6940227703984818, "grad_norm": 2.0554983615875244, "learning_rate": 1.2029540160311859e-06, "loss": 0.1328, "step": 3571 }, { "epoch": 1.6944971537001896, "grad_norm": 1.8407206535339355, "learning_rate": 1.1993031719718217e-06, "loss": 0.1437, "step": 3572 }, { "epoch": 1.6949715370018974, "grad_norm": 1.4634854793548584, "learning_rate": 1.1956575228767087e-06, "loss": 0.1477, "step": 3573 }, { "epoch": 1.6954459203036052, "grad_norm": 1.489542007446289, "learning_rate": 1.1920170708978374e-06, "loss": 0.1353, "step": 3574 }, { "epoch": 1.695920303605313, "grad_norm": 1.7846124172210693, "learning_rate": 1.1883818181841323e-06, "loss": 0.1611, "step": 3575 }, { "epoch": 1.6963946869070208, "grad_norm": 1.4704738855361938, "learning_rate": 1.1847517668814456e-06, "loss": 0.1405, "step": 3576 }, { "epoch": 1.6968690702087286, "grad_norm": 1.5131449699401855, "learning_rate": 1.181126919132557e-06, "loss": 0.1282, "step": 3577 }, { "epoch": 1.6973434535104364, "grad_norm": 1.477185845375061, "learning_rate": 1.1775072770771833e-06, "loss": 0.1477, "step": 3578 }, { "epoch": 1.6978178368121442, "grad_norm": 1.6830008029937744, "learning_rate": 1.1738928428519603e-06, "loss": 0.1419, "step": 3579 }, { "epoch": 1.698292220113852, "grad_norm": 1.5827606916427612, "learning_rate": 1.1702836185904543e-06, "loss": 0.1486, "step": 3580 }, { "epoch": 1.6987666034155597, "grad_norm": 1.622440218925476, "learning_rate": 1.1666796064231566e-06, "loss": 0.1392, "step": 3581 }, { "epoch": 1.6992409867172675, "grad_norm": 1.690509557723999, "learning_rate": 1.1630808084774758e-06, "loss": 0.1346, "step": 3582 }, { "epoch": 1.6997153700189753, "grad_norm": 2.058816432952881, "learning_rate": 1.1594872268777513e-06, "loss": 0.2435, "step": 3583 }, { "epoch": 1.7001897533206831, "grad_norm": 1.905711054801941, "learning_rate": 1.155898863745234e-06, "loss": 0.1724, "step": 3584 }, { "epoch": 1.700664136622391, "grad_norm": 1.4442384243011475, "learning_rate": 1.1523157211981006e-06, "loss": 0.1424, "step": 3585 }, { "epoch": 1.7011385199240987, "grad_norm": 1.9293603897094727, "learning_rate": 1.1487378013514483e-06, "loss": 0.1443, "step": 3586 }, { "epoch": 1.7016129032258065, "grad_norm": 1.8342009782791138, "learning_rate": 1.145165106317282e-06, "loss": 0.195, "step": 3587 }, { "epoch": 1.7020872865275143, "grad_norm": 1.8110581636428833, "learning_rate": 1.1415976382045313e-06, "loss": 0.1736, "step": 3588 }, { "epoch": 1.702561669829222, "grad_norm": 1.952837347984314, "learning_rate": 1.1380353991190373e-06, "loss": 0.1521, "step": 3589 }, { "epoch": 1.7030360531309299, "grad_norm": 2.2737579345703125, "learning_rate": 1.1344783911635538e-06, "loss": 0.1737, "step": 3590 }, { "epoch": 1.7035104364326377, "grad_norm": 1.83051335811615, "learning_rate": 1.130926616437751e-06, "loss": 0.1311, "step": 3591 }, { "epoch": 1.7039848197343455, "grad_norm": 1.8009090423583984, "learning_rate": 1.1273800770382027e-06, "loss": 0.1798, "step": 3592 }, { "epoch": 1.7044592030360532, "grad_norm": 1.824048399925232, "learning_rate": 1.1238387750583945e-06, "loss": 0.1588, "step": 3593 }, { "epoch": 1.704933586337761, "grad_norm": 1.5769997835159302, "learning_rate": 1.1203027125887235e-06, "loss": 0.1491, "step": 3594 }, { "epoch": 1.7054079696394688, "grad_norm": 1.6240620613098145, "learning_rate": 1.1167718917164961e-06, "loss": 0.1457, "step": 3595 }, { "epoch": 1.7058823529411766, "grad_norm": 1.4127658605575562, "learning_rate": 1.1132463145259144e-06, "loss": 0.124, "step": 3596 }, { "epoch": 1.7063567362428842, "grad_norm": 1.866065502166748, "learning_rate": 1.1097259830980956e-06, "loss": 0.1289, "step": 3597 }, { "epoch": 1.706831119544592, "grad_norm": 2.1026611328125, "learning_rate": 1.1062108995110566e-06, "loss": 0.182, "step": 3598 }, { "epoch": 1.7073055028462998, "grad_norm": 1.580645203590393, "learning_rate": 1.1027010658397175e-06, "loss": 0.1433, "step": 3599 }, { "epoch": 1.7077798861480076, "grad_norm": 1.5717014074325562, "learning_rate": 1.0991964841558955e-06, "loss": 0.151, "step": 3600 }, { "epoch": 1.7082542694497154, "grad_norm": 1.5343966484069824, "learning_rate": 1.0956971565283114e-06, "loss": 0.1533, "step": 3601 }, { "epoch": 1.7087286527514232, "grad_norm": 1.4576107263565063, "learning_rate": 1.092203085022583e-06, "loss": 0.1411, "step": 3602 }, { "epoch": 1.709203036053131, "grad_norm": 1.7388747930526733, "learning_rate": 1.0887142717012266e-06, "loss": 0.1765, "step": 3603 }, { "epoch": 1.7096774193548387, "grad_norm": 1.668588638305664, "learning_rate": 1.0852307186236554e-06, "loss": 0.1562, "step": 3604 }, { "epoch": 1.7101518026565465, "grad_norm": 1.5516544580459595, "learning_rate": 1.0817524278461777e-06, "loss": 0.1373, "step": 3605 }, { "epoch": 1.710626185958254, "grad_norm": 1.614884614944458, "learning_rate": 1.07827940142199e-06, "loss": 0.1378, "step": 3606 }, { "epoch": 1.711100569259962, "grad_norm": 1.7359789609909058, "learning_rate": 1.074811641401189e-06, "loss": 0.1462, "step": 3607 }, { "epoch": 1.7115749525616697, "grad_norm": 1.587064504623413, "learning_rate": 1.071349149830756e-06, "loss": 0.1173, "step": 3608 }, { "epoch": 1.7120493358633775, "grad_norm": 1.8435542583465576, "learning_rate": 1.0678919287545663e-06, "loss": 0.1849, "step": 3609 }, { "epoch": 1.7125237191650853, "grad_norm": 1.614603042602539, "learning_rate": 1.0644399802133876e-06, "loss": 0.1441, "step": 3610 }, { "epoch": 1.712998102466793, "grad_norm": 1.7255760431289673, "learning_rate": 1.060993306244864e-06, "loss": 0.1506, "step": 3611 }, { "epoch": 1.7134724857685009, "grad_norm": 2.0267391204833984, "learning_rate": 1.0575519088835374e-06, "loss": 0.1878, "step": 3612 }, { "epoch": 1.7139468690702087, "grad_norm": 1.741645336151123, "learning_rate": 1.054115790160829e-06, "loss": 0.1536, "step": 3613 }, { "epoch": 1.7144212523719164, "grad_norm": 1.8468014001846313, "learning_rate": 1.0506849521050478e-06, "loss": 0.1775, "step": 3614 }, { "epoch": 1.7148956356736242, "grad_norm": 2.1939375400543213, "learning_rate": 1.0472593967413813e-06, "loss": 0.2245, "step": 3615 }, { "epoch": 1.715370018975332, "grad_norm": 1.7620271444320679, "learning_rate": 1.0438391260919034e-06, "loss": 0.1369, "step": 3616 }, { "epoch": 1.7158444022770398, "grad_norm": 1.5060416460037231, "learning_rate": 1.0404241421755623e-06, "loss": 0.1334, "step": 3617 }, { "epoch": 1.7163187855787476, "grad_norm": 1.5071996450424194, "learning_rate": 1.037014447008191e-06, "loss": 0.1385, "step": 3618 }, { "epoch": 1.7167931688804554, "grad_norm": 1.741341471672058, "learning_rate": 1.0336100426025008e-06, "loss": 0.1588, "step": 3619 }, { "epoch": 1.7172675521821632, "grad_norm": 1.5700851678848267, "learning_rate": 1.0302109309680752e-06, "loss": 0.1558, "step": 3620 }, { "epoch": 1.717741935483871, "grad_norm": 1.5904638767242432, "learning_rate": 1.0268171141113769e-06, "loss": 0.1772, "step": 3621 }, { "epoch": 1.7182163187855788, "grad_norm": 1.7748823165893555, "learning_rate": 1.0234285940357424e-06, "loss": 0.1702, "step": 3622 }, { "epoch": 1.7186907020872866, "grad_norm": 1.6879323720932007, "learning_rate": 1.020045372741384e-06, "loss": 0.1319, "step": 3623 }, { "epoch": 1.7191650853889944, "grad_norm": 1.3660778999328613, "learning_rate": 1.0166674522253817e-06, "loss": 0.1377, "step": 3624 }, { "epoch": 1.7196394686907022, "grad_norm": 1.907179594039917, "learning_rate": 1.0132948344816863e-06, "loss": 0.1655, "step": 3625 }, { "epoch": 1.72011385199241, "grad_norm": 1.814266562461853, "learning_rate": 1.0099275215011227e-06, "loss": 0.1645, "step": 3626 }, { "epoch": 1.7205882352941178, "grad_norm": 1.505477786064148, "learning_rate": 1.0065655152713828e-06, "loss": 0.1253, "step": 3627 }, { "epoch": 1.7210626185958255, "grad_norm": 2.314615249633789, "learning_rate": 1.003208817777025e-06, "loss": 0.1994, "step": 3628 }, { "epoch": 1.7215370018975333, "grad_norm": 1.3844680786132812, "learning_rate": 9.99857430999478e-07, "loss": 0.1154, "step": 3629 }, { "epoch": 1.7220113851992411, "grad_norm": 1.7867406606674194, "learning_rate": 9.965113569170258e-07, "loss": 0.1278, "step": 3630 }, { "epoch": 1.722485768500949, "grad_norm": 1.7978324890136719, "learning_rate": 9.931705975048279e-07, "loss": 0.1922, "step": 3631 }, { "epoch": 1.7229601518026565, "grad_norm": 1.2707836627960205, "learning_rate": 9.898351547349005e-07, "loss": 0.1291, "step": 3632 }, { "epoch": 1.7234345351043643, "grad_norm": 1.5729351043701172, "learning_rate": 9.865050305761226e-07, "loss": 0.1654, "step": 3633 }, { "epoch": 1.723908918406072, "grad_norm": 1.5746902227401733, "learning_rate": 9.831802269942304e-07, "loss": 0.1423, "step": 3634 }, { "epoch": 1.7243833017077799, "grad_norm": 1.864458680152893, "learning_rate": 9.79860745951825e-07, "loss": 0.1778, "step": 3635 }, { "epoch": 1.7248576850094877, "grad_norm": 1.7163097858428955, "learning_rate": 9.765465894083637e-07, "loss": 0.1507, "step": 3636 }, { "epoch": 1.7253320683111955, "grad_norm": 1.3414804935455322, "learning_rate": 9.73237759320159e-07, "loss": 0.1321, "step": 3637 }, { "epoch": 1.7258064516129032, "grad_norm": 2.13075852394104, "learning_rate": 9.699342576403847e-07, "loss": 0.1707, "step": 3638 }, { "epoch": 1.726280834914611, "grad_norm": 1.949949860572815, "learning_rate": 9.666360863190583e-07, "loss": 0.1592, "step": 3639 }, { "epoch": 1.7267552182163188, "grad_norm": 9.197535514831543, "learning_rate": 9.633432473030635e-07, "loss": 0.1525, "step": 3640 }, { "epoch": 1.7272296015180264, "grad_norm": 1.6272482872009277, "learning_rate": 9.600557425361269e-07, "loss": 0.1586, "step": 3641 }, { "epoch": 1.7277039848197342, "grad_norm": 1.5397499799728394, "learning_rate": 9.567735739588314e-07, "loss": 0.1292, "step": 3642 }, { "epoch": 1.728178368121442, "grad_norm": 1.5450812578201294, "learning_rate": 9.534967435086095e-07, "loss": 0.1174, "step": 3643 }, { "epoch": 1.7286527514231498, "grad_norm": 1.3916116952896118, "learning_rate": 9.502252531197398e-07, "loss": 0.1326, "step": 3644 }, { "epoch": 1.7291271347248576, "grad_norm": 1.4877862930297852, "learning_rate": 9.469591047233517e-07, "loss": 0.1328, "step": 3645 }, { "epoch": 1.7296015180265654, "grad_norm": 1.4843299388885498, "learning_rate": 9.436983002474209e-07, "loss": 0.1508, "step": 3646 }, { "epoch": 1.7300759013282732, "grad_norm": 1.532395601272583, "learning_rate": 9.404428416167688e-07, "loss": 0.1403, "step": 3647 }, { "epoch": 1.730550284629981, "grad_norm": 2.052851915359497, "learning_rate": 9.371927307530593e-07, "loss": 0.1483, "step": 3648 }, { "epoch": 1.7310246679316887, "grad_norm": 1.7346041202545166, "learning_rate": 9.339479695747988e-07, "loss": 0.1573, "step": 3649 }, { "epoch": 1.7314990512333965, "grad_norm": 1.8105442523956299, "learning_rate": 9.307085599973387e-07, "loss": 0.1597, "step": 3650 }, { "epoch": 1.7319734345351043, "grad_norm": 2.083927631378174, "learning_rate": 9.274745039328725e-07, "loss": 0.164, "step": 3651 }, { "epoch": 1.7324478178368121, "grad_norm": 1.740408182144165, "learning_rate": 9.242458032904311e-07, "loss": 0.1351, "step": 3652 }, { "epoch": 1.73292220113852, "grad_norm": 1.4160617589950562, "learning_rate": 9.210224599758811e-07, "loss": 0.1228, "step": 3653 }, { "epoch": 1.7333965844402277, "grad_norm": 1.3447182178497314, "learning_rate": 9.178044758919336e-07, "loss": 0.1153, "step": 3654 }, { "epoch": 1.7338709677419355, "grad_norm": 2.0321731567382812, "learning_rate": 9.145918529381314e-07, "loss": 0.1511, "step": 3655 }, { "epoch": 1.7343453510436433, "grad_norm": 1.9544363021850586, "learning_rate": 9.113845930108567e-07, "loss": 0.1599, "step": 3656 }, { "epoch": 1.734819734345351, "grad_norm": 1.5864086151123047, "learning_rate": 9.081826980033215e-07, "loss": 0.1562, "step": 3657 }, { "epoch": 1.7352941176470589, "grad_norm": 1.6874994039535522, "learning_rate": 9.049861698055696e-07, "loss": 0.1619, "step": 3658 }, { "epoch": 1.7357685009487667, "grad_norm": 1.6411479711532593, "learning_rate": 9.017950103044826e-07, "loss": 0.1461, "step": 3659 }, { "epoch": 1.7362428842504745, "grad_norm": 1.355292797088623, "learning_rate": 8.986092213837705e-07, "loss": 0.0838, "step": 3660 }, { "epoch": 1.7367172675521823, "grad_norm": 2.7029168605804443, "learning_rate": 8.954288049239734e-07, "loss": 0.2356, "step": 3661 }, { "epoch": 1.73719165085389, "grad_norm": 1.5269354581832886, "learning_rate": 8.922537628024608e-07, "loss": 0.1512, "step": 3662 }, { "epoch": 1.7376660341555978, "grad_norm": 1.495399832725525, "learning_rate": 8.890840968934244e-07, "loss": 0.1317, "step": 3663 }, { "epoch": 1.7381404174573056, "grad_norm": 1.3357588052749634, "learning_rate": 8.859198090678923e-07, "loss": 0.1108, "step": 3664 }, { "epoch": 1.7386148007590134, "grad_norm": 1.5755971670150757, "learning_rate": 8.827609011937066e-07, "loss": 0.1393, "step": 3665 }, { "epoch": 1.7390891840607212, "grad_norm": 1.437064528465271, "learning_rate": 8.796073751355417e-07, "loss": 0.117, "step": 3666 }, { "epoch": 1.739563567362429, "grad_norm": 1.7386492490768433, "learning_rate": 8.764592327548948e-07, "loss": 0.16, "step": 3667 }, { "epoch": 1.7400379506641366, "grad_norm": 1.099024772644043, "learning_rate": 8.733164759100809e-07, "loss": 0.0981, "step": 3668 }, { "epoch": 1.7405123339658444, "grad_norm": 1.9566211700439453, "learning_rate": 8.701791064562382e-07, "loss": 0.157, "step": 3669 }, { "epoch": 1.7409867172675522, "grad_norm": 1.458828091621399, "learning_rate": 8.670471262453251e-07, "loss": 0.1359, "step": 3670 }, { "epoch": 1.74146110056926, "grad_norm": 1.4478492736816406, "learning_rate": 8.639205371261217e-07, "loss": 0.1664, "step": 3671 }, { "epoch": 1.7419354838709677, "grad_norm": 1.6490832567214966, "learning_rate": 8.607993409442173e-07, "loss": 0.1427, "step": 3672 }, { "epoch": 1.7424098671726755, "grad_norm": 3.0351619720458984, "learning_rate": 8.57683539542028e-07, "loss": 0.1973, "step": 3673 }, { "epoch": 1.7428842504743833, "grad_norm": 1.4072052240371704, "learning_rate": 8.54573134758776e-07, "loss": 0.1351, "step": 3674 }, { "epoch": 1.7433586337760911, "grad_norm": 1.6349687576293945, "learning_rate": 8.514681284305048e-07, "loss": 0.1405, "step": 3675 }, { "epoch": 1.7438330170777987, "grad_norm": 1.8339117765426636, "learning_rate": 8.483685223900706e-07, "loss": 0.1787, "step": 3676 }, { "epoch": 1.7443074003795065, "grad_norm": 1.4211890697479248, "learning_rate": 8.452743184671363e-07, "loss": 0.1482, "step": 3677 }, { "epoch": 1.7447817836812143, "grad_norm": 1.6451423168182373, "learning_rate": 8.421855184881822e-07, "loss": 0.1287, "step": 3678 }, { "epoch": 1.745256166982922, "grad_norm": 1.9902081489562988, "learning_rate": 8.391021242764962e-07, "loss": 0.1923, "step": 3679 }, { "epoch": 1.7457305502846299, "grad_norm": 1.723975658416748, "learning_rate": 8.360241376521772e-07, "loss": 0.1539, "step": 3680 }, { "epoch": 1.7462049335863377, "grad_norm": 1.7645349502563477, "learning_rate": 8.329515604321281e-07, "loss": 0.15, "step": 3681 }, { "epoch": 1.7466793168880455, "grad_norm": 1.3427985906600952, "learning_rate": 8.298843944300583e-07, "loss": 0.113, "step": 3682 }, { "epoch": 1.7471537001897532, "grad_norm": 1.576675295829773, "learning_rate": 8.268226414564895e-07, "loss": 0.1492, "step": 3683 }, { "epoch": 1.747628083491461, "grad_norm": 1.534294605255127, "learning_rate": 8.237663033187426e-07, "loss": 0.135, "step": 3684 }, { "epoch": 1.7481024667931688, "grad_norm": 1.7654008865356445, "learning_rate": 8.207153818209446e-07, "loss": 0.1498, "step": 3685 }, { "epoch": 1.7485768500948766, "grad_norm": 1.6576253175735474, "learning_rate": 8.176698787640247e-07, "loss": 0.1322, "step": 3686 }, { "epoch": 1.7490512333965844, "grad_norm": 1.7810031175613403, "learning_rate": 8.146297959457116e-07, "loss": 0.1576, "step": 3687 }, { "epoch": 1.7495256166982922, "grad_norm": 1.609955906867981, "learning_rate": 8.115951351605378e-07, "loss": 0.1367, "step": 3688 }, { "epoch": 1.75, "grad_norm": 1.7572312355041504, "learning_rate": 8.085658981998312e-07, "loss": 0.1378, "step": 3689 }, { "epoch": 1.7504743833017078, "grad_norm": 1.3481042385101318, "learning_rate": 8.055420868517227e-07, "loss": 0.1182, "step": 3690 }, { "epoch": 1.7509487666034156, "grad_norm": 2.354755401611328, "learning_rate": 8.025237029011368e-07, "loss": 0.201, "step": 3691 }, { "epoch": 1.7514231499051234, "grad_norm": 1.7035918235778809, "learning_rate": 7.995107481297948e-07, "loss": 0.1286, "step": 3692 }, { "epoch": 1.7518975332068312, "grad_norm": 1.6000009775161743, "learning_rate": 7.965032243162163e-07, "loss": 0.1307, "step": 3693 }, { "epoch": 1.752371916508539, "grad_norm": 1.5351053476333618, "learning_rate": 7.935011332357113e-07, "loss": 0.1476, "step": 3694 }, { "epoch": 1.7528462998102468, "grad_norm": 1.4815095663070679, "learning_rate": 7.905044766603876e-07, "loss": 0.1445, "step": 3695 }, { "epoch": 1.7533206831119545, "grad_norm": 1.7617803812026978, "learning_rate": 7.875132563591382e-07, "loss": 0.153, "step": 3696 }, { "epoch": 1.7537950664136623, "grad_norm": 1.4343780279159546, "learning_rate": 7.845274740976527e-07, "loss": 0.135, "step": 3697 }, { "epoch": 1.7542694497153701, "grad_norm": 1.5575170516967773, "learning_rate": 7.815471316384071e-07, "loss": 0.1198, "step": 3698 }, { "epoch": 1.754743833017078, "grad_norm": 1.9399551153182983, "learning_rate": 7.785722307406685e-07, "loss": 0.1777, "step": 3699 }, { "epoch": 1.7552182163187857, "grad_norm": 1.8243434429168701, "learning_rate": 7.756027731604943e-07, "loss": 0.1456, "step": 3700 }, { "epoch": 1.7556925996204935, "grad_norm": 1.7485800981521606, "learning_rate": 7.726387606507224e-07, "loss": 0.1182, "step": 3701 }, { "epoch": 1.7561669829222013, "grad_norm": 1.8186720609664917, "learning_rate": 7.696801949609811e-07, "loss": 0.1679, "step": 3702 }, { "epoch": 1.7566413662239089, "grad_norm": 2.1172068119049072, "learning_rate": 7.667270778376834e-07, "loss": 0.2089, "step": 3703 }, { "epoch": 1.7571157495256167, "grad_norm": 1.8532425165176392, "learning_rate": 7.637794110240259e-07, "loss": 0.1866, "step": 3704 }, { "epoch": 1.7575901328273245, "grad_norm": 1.3793625831604004, "learning_rate": 7.608371962599847e-07, "loss": 0.1241, "step": 3705 }, { "epoch": 1.7580645161290323, "grad_norm": 1.5093717575073242, "learning_rate": 7.579004352823205e-07, "loss": 0.1404, "step": 3706 }, { "epoch": 1.75853889943074, "grad_norm": 2.2254135608673096, "learning_rate": 7.549691298245754e-07, "loss": 0.1306, "step": 3707 }, { "epoch": 1.7590132827324478, "grad_norm": 1.539953351020813, "learning_rate": 7.520432816170686e-07, "loss": 0.1244, "step": 3708 }, { "epoch": 1.7594876660341556, "grad_norm": 1.3199529647827148, "learning_rate": 7.491228923868999e-07, "loss": 0.1135, "step": 3709 }, { "epoch": 1.7599620493358634, "grad_norm": 1.6439963579177856, "learning_rate": 7.462079638579489e-07, "loss": 0.124, "step": 3710 }, { "epoch": 1.760436432637571, "grad_norm": 1.6355787515640259, "learning_rate": 7.432984977508639e-07, "loss": 0.1471, "step": 3711 }, { "epoch": 1.7609108159392788, "grad_norm": 1.7077136039733887, "learning_rate": 7.4039449578308e-07, "loss": 0.1542, "step": 3712 }, { "epoch": 1.7613851992409866, "grad_norm": 1.7139383554458618, "learning_rate": 7.374959596687948e-07, "loss": 0.1534, "step": 3713 }, { "epoch": 1.7618595825426944, "grad_norm": 1.5415153503417969, "learning_rate": 7.346028911189895e-07, "loss": 0.1537, "step": 3714 }, { "epoch": 1.7623339658444022, "grad_norm": 1.2754297256469727, "learning_rate": 7.317152918414116e-07, "loss": 0.1296, "step": 3715 }, { "epoch": 1.76280834914611, "grad_norm": 1.7097855806350708, "learning_rate": 7.288331635405832e-07, "loss": 0.1495, "step": 3716 }, { "epoch": 1.7632827324478177, "grad_norm": 1.5699681043624878, "learning_rate": 7.259565079177966e-07, "loss": 0.1435, "step": 3717 }, { "epoch": 1.7637571157495255, "grad_norm": 1.5883891582489014, "learning_rate": 7.230853266711124e-07, "loss": 0.142, "step": 3718 }, { "epoch": 1.7642314990512333, "grad_norm": 1.6266002655029297, "learning_rate": 7.202196214953616e-07, "loss": 0.1283, "step": 3719 }, { "epoch": 1.7647058823529411, "grad_norm": 1.50413978099823, "learning_rate": 7.173593940821411e-07, "loss": 0.1272, "step": 3720 }, { "epoch": 1.765180265654649, "grad_norm": 1.3868768215179443, "learning_rate": 7.145046461198146e-07, "loss": 0.1253, "step": 3721 }, { "epoch": 1.7656546489563567, "grad_norm": 1.3816770315170288, "learning_rate": 7.11655379293511e-07, "loss": 0.1432, "step": 3722 }, { "epoch": 1.7661290322580645, "grad_norm": 1.8432587385177612, "learning_rate": 7.088115952851238e-07, "loss": 0.1617, "step": 3723 }, { "epoch": 1.7666034155597723, "grad_norm": 1.6249598264694214, "learning_rate": 7.059732957733145e-07, "loss": 0.1462, "step": 3724 }, { "epoch": 1.76707779886148, "grad_norm": 1.4073657989501953, "learning_rate": 7.031404824334986e-07, "loss": 0.126, "step": 3725 }, { "epoch": 1.7675521821631879, "grad_norm": 1.3608930110931396, "learning_rate": 7.003131569378586e-07, "loss": 0.1174, "step": 3726 }, { "epoch": 1.7680265654648957, "grad_norm": 1.6498671770095825, "learning_rate": 6.97491320955338e-07, "loss": 0.1563, "step": 3727 }, { "epoch": 1.7685009487666035, "grad_norm": 1.5158390998840332, "learning_rate": 6.946749761516402e-07, "loss": 0.1455, "step": 3728 }, { "epoch": 1.7689753320683113, "grad_norm": 1.5832974910736084, "learning_rate": 6.918641241892243e-07, "loss": 0.1263, "step": 3729 }, { "epoch": 1.769449715370019, "grad_norm": 1.628330945968628, "learning_rate": 6.890587667273064e-07, "loss": 0.1484, "step": 3730 }, { "epoch": 1.7699240986717268, "grad_norm": 1.4703549146652222, "learning_rate": 6.862589054218616e-07, "loss": 0.1334, "step": 3731 }, { "epoch": 1.7703984819734346, "grad_norm": 1.8686482906341553, "learning_rate": 6.834645419256225e-07, "loss": 0.1633, "step": 3732 }, { "epoch": 1.7708728652751424, "grad_norm": 1.9761866331100464, "learning_rate": 6.806756778880752e-07, "loss": 0.1436, "step": 3733 }, { "epoch": 1.7713472485768502, "grad_norm": 1.775627851486206, "learning_rate": 6.77892314955454e-07, "loss": 0.1542, "step": 3734 }, { "epoch": 1.771821631878558, "grad_norm": 1.536481261253357, "learning_rate": 6.751144547707533e-07, "loss": 0.1278, "step": 3735 }, { "epoch": 1.7722960151802658, "grad_norm": 2.131673812866211, "learning_rate": 6.723420989737184e-07, "loss": 0.1947, "step": 3736 }, { "epoch": 1.7727703984819736, "grad_norm": 1.6633423566818237, "learning_rate": 6.695752492008389e-07, "loss": 0.1661, "step": 3737 }, { "epoch": 1.7732447817836812, "grad_norm": 1.718865990638733, "learning_rate": 6.66813907085363e-07, "loss": 0.1533, "step": 3738 }, { "epoch": 1.773719165085389, "grad_norm": 1.7374719381332397, "learning_rate": 6.64058074257281e-07, "loss": 0.1821, "step": 3739 }, { "epoch": 1.7741935483870968, "grad_norm": 1.5633835792541504, "learning_rate": 6.613077523433342e-07, "loss": 0.1299, "step": 3740 }, { "epoch": 1.7746679316888045, "grad_norm": 1.8587796688079834, "learning_rate": 6.585629429670115e-07, "loss": 0.1701, "step": 3741 }, { "epoch": 1.7751423149905123, "grad_norm": 1.9442617893218994, "learning_rate": 6.558236477485458e-07, "loss": 0.1211, "step": 3742 }, { "epoch": 1.7756166982922201, "grad_norm": 1.8159979581832886, "learning_rate": 6.53089868304917e-07, "loss": 0.1365, "step": 3743 }, { "epoch": 1.776091081593928, "grad_norm": 1.4900180101394653, "learning_rate": 6.503616062498464e-07, "loss": 0.1381, "step": 3744 }, { "epoch": 1.7765654648956357, "grad_norm": 1.4936360120773315, "learning_rate": 6.476388631938024e-07, "loss": 0.1397, "step": 3745 }, { "epoch": 1.7770398481973435, "grad_norm": 1.368524432182312, "learning_rate": 6.449216407439906e-07, "loss": 0.1033, "step": 3746 }, { "epoch": 1.777514231499051, "grad_norm": 1.771744728088379, "learning_rate": 6.422099405043613e-07, "loss": 0.1609, "step": 3747 }, { "epoch": 1.7779886148007589, "grad_norm": 1.7894208431243896, "learning_rate": 6.395037640756074e-07, "loss": 0.1453, "step": 3748 }, { "epoch": 1.7784629981024667, "grad_norm": 1.6382660865783691, "learning_rate": 6.368031130551533e-07, "loss": 0.1232, "step": 3749 }, { "epoch": 1.7789373814041745, "grad_norm": 1.9367256164550781, "learning_rate": 6.341079890371682e-07, "loss": 0.183, "step": 3750 }, { "epoch": 1.7794117647058822, "grad_norm": 1.570556640625, "learning_rate": 6.314183936125584e-07, "loss": 0.1278, "step": 3751 }, { "epoch": 1.77988614800759, "grad_norm": 1.5493059158325195, "learning_rate": 6.287343283689662e-07, "loss": 0.1329, "step": 3752 }, { "epoch": 1.7803605313092978, "grad_norm": 1.3096294403076172, "learning_rate": 6.260557948907664e-07, "loss": 0.1195, "step": 3753 }, { "epoch": 1.7808349146110056, "grad_norm": 2.0561397075653076, "learning_rate": 6.233827947590709e-07, "loss": 0.1499, "step": 3754 }, { "epoch": 1.7813092979127134, "grad_norm": 1.4813035726547241, "learning_rate": 6.207153295517265e-07, "loss": 0.1271, "step": 3755 }, { "epoch": 1.7817836812144212, "grad_norm": 1.5294699668884277, "learning_rate": 6.180534008433114e-07, "loss": 0.1312, "step": 3756 }, { "epoch": 1.782258064516129, "grad_norm": 1.5780595541000366, "learning_rate": 6.153970102051354e-07, "loss": 0.1269, "step": 3757 }, { "epoch": 1.7827324478178368, "grad_norm": 1.5694869756698608, "learning_rate": 6.127461592052397e-07, "loss": 0.1167, "step": 3758 }, { "epoch": 1.7832068311195446, "grad_norm": 1.5196350812911987, "learning_rate": 6.101008494083948e-07, "loss": 0.1269, "step": 3759 }, { "epoch": 1.7836812144212524, "grad_norm": 1.7971737384796143, "learning_rate": 6.074610823761029e-07, "loss": 0.1443, "step": 3760 }, { "epoch": 1.7841555977229602, "grad_norm": 1.4723081588745117, "learning_rate": 6.048268596665896e-07, "loss": 0.1194, "step": 3761 }, { "epoch": 1.784629981024668, "grad_norm": 1.2657811641693115, "learning_rate": 6.021981828348123e-07, "loss": 0.1006, "step": 3762 }, { "epoch": 1.7851043643263758, "grad_norm": 1.2274057865142822, "learning_rate": 5.995750534324518e-07, "loss": 0.1081, "step": 3763 }, { "epoch": 1.7855787476280836, "grad_norm": 1.5427498817443848, "learning_rate": 5.969574730079164e-07, "loss": 0.1491, "step": 3764 }, { "epoch": 1.7860531309297913, "grad_norm": 1.7528079748153687, "learning_rate": 5.943454431063367e-07, "loss": 0.1405, "step": 3765 }, { "epoch": 1.7865275142314991, "grad_norm": 1.619002342224121, "learning_rate": 5.917389652695693e-07, "loss": 0.1516, "step": 3766 }, { "epoch": 1.787001897533207, "grad_norm": 2.0210466384887695, "learning_rate": 5.891380410361947e-07, "loss": 0.1638, "step": 3767 }, { "epoch": 1.7874762808349147, "grad_norm": 1.4195687770843506, "learning_rate": 5.865426719415068e-07, "loss": 0.1199, "step": 3768 }, { "epoch": 1.7879506641366225, "grad_norm": 1.5800368785858154, "learning_rate": 5.839528595175314e-07, "loss": 0.1301, "step": 3769 }, { "epoch": 1.7884250474383303, "grad_norm": 1.5409759283065796, "learning_rate": 5.813686052930068e-07, "loss": 0.1405, "step": 3770 }, { "epoch": 1.788899430740038, "grad_norm": 1.754284143447876, "learning_rate": 5.787899107933936e-07, "loss": 0.1502, "step": 3771 }, { "epoch": 1.789373814041746, "grad_norm": 1.586987853050232, "learning_rate": 5.762167775408678e-07, "loss": 0.1244, "step": 3772 }, { "epoch": 1.7898481973434535, "grad_norm": 1.611585021018982, "learning_rate": 5.736492070543265e-07, "loss": 0.1423, "step": 3773 }, { "epoch": 1.7903225806451613, "grad_norm": 1.8729734420776367, "learning_rate": 5.710872008493795e-07, "loss": 0.1478, "step": 3774 }, { "epoch": 1.790796963946869, "grad_norm": 1.65080988407135, "learning_rate": 5.685307604383561e-07, "loss": 0.1352, "step": 3775 }, { "epoch": 1.7912713472485768, "grad_norm": 1.5004171133041382, "learning_rate": 5.659798873302968e-07, "loss": 0.1132, "step": 3776 }, { "epoch": 1.7917457305502846, "grad_norm": 1.4567018747329712, "learning_rate": 5.634345830309563e-07, "loss": 0.1126, "step": 3777 }, { "epoch": 1.7922201138519924, "grad_norm": 1.566493034362793, "learning_rate": 5.608948490428023e-07, "loss": 0.1527, "step": 3778 }, { "epoch": 1.7926944971537002, "grad_norm": 1.4656519889831543, "learning_rate": 5.583606868650138e-07, "loss": 0.1614, "step": 3779 }, { "epoch": 1.793168880455408, "grad_norm": 1.7221026420593262, "learning_rate": 5.558320979934839e-07, "loss": 0.1515, "step": 3780 }, { "epoch": 1.7936432637571158, "grad_norm": 1.484522819519043, "learning_rate": 5.533090839208133e-07, "loss": 0.1394, "step": 3781 }, { "epoch": 1.7941176470588234, "grad_norm": 1.717825174331665, "learning_rate": 5.507916461363094e-07, "loss": 0.1472, "step": 3782 }, { "epoch": 1.7945920303605312, "grad_norm": 1.826720952987671, "learning_rate": 5.482797861259937e-07, "loss": 0.1465, "step": 3783 }, { "epoch": 1.795066413662239, "grad_norm": 1.4461770057678223, "learning_rate": 5.45773505372591e-07, "loss": 0.1204, "step": 3784 }, { "epoch": 1.7955407969639468, "grad_norm": 1.7469446659088135, "learning_rate": 5.43272805355537e-07, "loss": 0.1501, "step": 3785 }, { "epoch": 1.7960151802656545, "grad_norm": 1.5683156251907349, "learning_rate": 5.407776875509663e-07, "loss": 0.1272, "step": 3786 }, { "epoch": 1.7964895635673623, "grad_norm": 1.7831333875656128, "learning_rate": 5.382881534317231e-07, "loss": 0.1567, "step": 3787 }, { "epoch": 1.7969639468690701, "grad_norm": 1.6136335134506226, "learning_rate": 5.35804204467355e-07, "loss": 0.1244, "step": 3788 }, { "epoch": 1.797438330170778, "grad_norm": 2.210510015487671, "learning_rate": 5.333258421241127e-07, "loss": 0.1729, "step": 3789 }, { "epoch": 1.7979127134724857, "grad_norm": 1.5324962139129639, "learning_rate": 5.308530678649504e-07, "loss": 0.1455, "step": 3790 }, { "epoch": 1.7983870967741935, "grad_norm": 2.263535737991333, "learning_rate": 5.283858831495192e-07, "loss": 0.2427, "step": 3791 }, { "epoch": 1.7988614800759013, "grad_norm": 1.561944603919983, "learning_rate": 5.259242894341765e-07, "loss": 0.1292, "step": 3792 }, { "epoch": 1.799335863377609, "grad_norm": 1.5755988359451294, "learning_rate": 5.234682881719766e-07, "loss": 0.1342, "step": 3793 }, { "epoch": 1.7998102466793169, "grad_norm": 1.7627415657043457, "learning_rate": 5.210178808126698e-07, "loss": 0.1818, "step": 3794 }, { "epoch": 1.8002846299810247, "grad_norm": 1.6085638999938965, "learning_rate": 5.185730688027124e-07, "loss": 0.166, "step": 3795 }, { "epoch": 1.8007590132827325, "grad_norm": 1.764938473701477, "learning_rate": 5.161338535852467e-07, "loss": 0.204, "step": 3796 }, { "epoch": 1.8012333965844403, "grad_norm": 1.4336494207382202, "learning_rate": 5.137002366001209e-07, "loss": 0.1188, "step": 3797 }, { "epoch": 1.801707779886148, "grad_norm": 1.5212832689285278, "learning_rate": 5.112722192838748e-07, "loss": 0.1357, "step": 3798 }, { "epoch": 1.8021821631878558, "grad_norm": 1.5787197351455688, "learning_rate": 5.08849803069743e-07, "loss": 0.1273, "step": 3799 }, { "epoch": 1.8026565464895636, "grad_norm": 1.7871791124343872, "learning_rate": 5.064329893876541e-07, "loss": 0.1423, "step": 3800 }, { "epoch": 1.8031309297912714, "grad_norm": 1.4555021524429321, "learning_rate": 5.0402177966423e-07, "loss": 0.1356, "step": 3801 }, { "epoch": 1.8036053130929792, "grad_norm": 1.40778648853302, "learning_rate": 5.016161753227799e-07, "loss": 0.1366, "step": 3802 }, { "epoch": 1.804079696394687, "grad_norm": 1.7542588710784912, "learning_rate": 4.992161777833116e-07, "loss": 0.1631, "step": 3803 }, { "epoch": 1.8045540796963948, "grad_norm": 1.4451662302017212, "learning_rate": 4.968217884625182e-07, "loss": 0.1096, "step": 3804 }, { "epoch": 1.8050284629981026, "grad_norm": 1.571986198425293, "learning_rate": 4.944330087737881e-07, "loss": 0.1449, "step": 3805 }, { "epoch": 1.8055028462998104, "grad_norm": 1.793839693069458, "learning_rate": 4.92049840127189e-07, "loss": 0.1494, "step": 3806 }, { "epoch": 1.8059772296015182, "grad_norm": 1.783685564994812, "learning_rate": 4.896722839294843e-07, "loss": 0.1737, "step": 3807 }, { "epoch": 1.8064516129032258, "grad_norm": 1.5244357585906982, "learning_rate": 4.873003415841215e-07, "loss": 0.1348, "step": 3808 }, { "epoch": 1.8069259962049335, "grad_norm": 1.776548147201538, "learning_rate": 4.849340144912363e-07, "loss": 0.1773, "step": 3809 }, { "epoch": 1.8074003795066413, "grad_norm": 1.6930649280548096, "learning_rate": 4.825733040476465e-07, "loss": 0.1131, "step": 3810 }, { "epoch": 1.8078747628083491, "grad_norm": 1.6838048696517944, "learning_rate": 4.802182116468556e-07, "loss": 0.1474, "step": 3811 }, { "epoch": 1.808349146110057, "grad_norm": 1.8660080432891846, "learning_rate": 4.778687386790515e-07, "loss": 0.1772, "step": 3812 }, { "epoch": 1.8088235294117647, "grad_norm": 1.328919768333435, "learning_rate": 4.7552488653110575e-07, "loss": 0.1042, "step": 3813 }, { "epoch": 1.8092979127134725, "grad_norm": 1.7662233114242554, "learning_rate": 4.731866565865717e-07, "loss": 0.1594, "step": 3814 }, { "epoch": 1.8097722960151803, "grad_norm": 1.4320173263549805, "learning_rate": 4.7085405022568196e-07, "loss": 0.1277, "step": 3815 }, { "epoch": 1.810246679316888, "grad_norm": 1.8004564046859741, "learning_rate": 4.685270688253507e-07, "loss": 0.1586, "step": 3816 }, { "epoch": 1.8107210626185957, "grad_norm": 1.5289167165756226, "learning_rate": 4.6620571375917356e-07, "loss": 0.1296, "step": 3817 }, { "epoch": 1.8111954459203035, "grad_norm": 1.3791871070861816, "learning_rate": 4.638899863974222e-07, "loss": 0.1142, "step": 3818 }, { "epoch": 1.8116698292220113, "grad_norm": 1.5164226293563843, "learning_rate": 4.615798881070499e-07, "loss": 0.157, "step": 3819 }, { "epoch": 1.812144212523719, "grad_norm": 1.3034436702728271, "learning_rate": 4.5927542025168025e-07, "loss": 0.1208, "step": 3820 }, { "epoch": 1.8126185958254268, "grad_norm": 1.3577231168746948, "learning_rate": 4.5697658419162183e-07, "loss": 0.1204, "step": 3821 }, { "epoch": 1.8130929791271346, "grad_norm": 1.599459171295166, "learning_rate": 4.5468338128385247e-07, "loss": 0.1329, "step": 3822 }, { "epoch": 1.8135673624288424, "grad_norm": 1.78299081325531, "learning_rate": 4.523958128820283e-07, "loss": 0.1555, "step": 3823 }, { "epoch": 1.8140417457305502, "grad_norm": 2.62516450881958, "learning_rate": 4.501138803364802e-07, "loss": 0.2332, "step": 3824 }, { "epoch": 1.814516129032258, "grad_norm": 1.7529395818710327, "learning_rate": 4.478375849942063e-07, "loss": 0.1568, "step": 3825 }, { "epoch": 1.8149905123339658, "grad_norm": 1.5792466402053833, "learning_rate": 4.4556692819888504e-07, "loss": 0.1675, "step": 3826 }, { "epoch": 1.8154648956356736, "grad_norm": 1.464310646057129, "learning_rate": 4.4330191129085873e-07, "loss": 0.1283, "step": 3827 }, { "epoch": 1.8159392789373814, "grad_norm": 1.2922571897506714, "learning_rate": 4.4104253560714794e-07, "loss": 0.1136, "step": 3828 }, { "epoch": 1.8164136622390892, "grad_norm": 1.4315319061279297, "learning_rate": 4.3878880248143904e-07, "loss": 0.1296, "step": 3829 }, { "epoch": 1.816888045540797, "grad_norm": 1.5491856336593628, "learning_rate": 4.3654071324408685e-07, "loss": 0.1488, "step": 3830 }, { "epoch": 1.8173624288425048, "grad_norm": 1.3317991495132446, "learning_rate": 4.342982692221165e-07, "loss": 0.1164, "step": 3831 }, { "epoch": 1.8178368121442126, "grad_norm": 1.8910882472991943, "learning_rate": 4.3206147173922133e-07, "loss": 0.176, "step": 3832 }, { "epoch": 1.8183111954459203, "grad_norm": 1.317647933959961, "learning_rate": 4.298303221157618e-07, "loss": 0.1262, "step": 3833 }, { "epoch": 1.8187855787476281, "grad_norm": 1.5027046203613281, "learning_rate": 4.276048216687634e-07, "loss": 0.1289, "step": 3834 }, { "epoch": 1.819259962049336, "grad_norm": 1.5207945108413696, "learning_rate": 4.25384971711913e-07, "loss": 0.1304, "step": 3835 }, { "epoch": 1.8197343453510437, "grad_norm": 2.8607802391052246, "learning_rate": 4.231707735555701e-07, "loss": 0.1902, "step": 3836 }, { "epoch": 1.8202087286527515, "grad_norm": 1.8455995321273804, "learning_rate": 4.209622285067516e-07, "loss": 0.1518, "step": 3837 }, { "epoch": 1.8206831119544593, "grad_norm": 1.3427563905715942, "learning_rate": 4.187593378691435e-07, "loss": 0.1137, "step": 3838 }, { "epoch": 1.821157495256167, "grad_norm": 1.9709796905517578, "learning_rate": 4.165621029430855e-07, "loss": 0.1531, "step": 3839 }, { "epoch": 1.821631878557875, "grad_norm": 2.085360527038574, "learning_rate": 4.1437052502558693e-07, "loss": 0.1924, "step": 3840 }, { "epoch": 1.8221062618595827, "grad_norm": 1.684356927871704, "learning_rate": 4.1218460541031404e-07, "loss": 0.1503, "step": 3841 }, { "epoch": 1.8225806451612905, "grad_norm": 1.6647909879684448, "learning_rate": 4.1000434538759235e-07, "loss": 0.1498, "step": 3842 }, { "epoch": 1.823055028462998, "grad_norm": 1.9033278226852417, "learning_rate": 4.078297462444092e-07, "loss": 0.1553, "step": 3843 }, { "epoch": 1.8235294117647058, "grad_norm": 1.4398269653320312, "learning_rate": 4.0566080926440765e-07, "loss": 0.128, "step": 3844 }, { "epoch": 1.8240037950664136, "grad_norm": 1.634743571281433, "learning_rate": 4.034975357278903e-07, "loss": 0.1454, "step": 3845 }, { "epoch": 1.8244781783681214, "grad_norm": 1.7548015117645264, "learning_rate": 4.013399269118157e-07, "loss": 0.1535, "step": 3846 }, { "epoch": 1.8249525616698292, "grad_norm": 1.925663948059082, "learning_rate": 3.991879840897994e-07, "loss": 0.1483, "step": 3847 }, { "epoch": 1.825426944971537, "grad_norm": 1.2624423503875732, "learning_rate": 3.970417085321143e-07, "loss": 0.1018, "step": 3848 }, { "epoch": 1.8259013282732448, "grad_norm": 1.4971909523010254, "learning_rate": 3.949011015056803e-07, "loss": 0.1474, "step": 3849 }, { "epoch": 1.8263757115749526, "grad_norm": 1.7054779529571533, "learning_rate": 3.9276616427408207e-07, "loss": 0.1422, "step": 3850 }, { "epoch": 1.8268500948766604, "grad_norm": 1.506289005279541, "learning_rate": 3.9063689809754837e-07, "loss": 0.1337, "step": 3851 }, { "epoch": 1.827324478178368, "grad_norm": 1.4159045219421387, "learning_rate": 3.8851330423296476e-07, "loss": 0.1259, "step": 3852 }, { "epoch": 1.8277988614800758, "grad_norm": 1.5308058261871338, "learning_rate": 3.8639538393386854e-07, "loss": 0.1109, "step": 3853 }, { "epoch": 1.8282732447817835, "grad_norm": 1.6169154644012451, "learning_rate": 3.842831384504453e-07, "loss": 0.1385, "step": 3854 }, { "epoch": 1.8287476280834913, "grad_norm": 1.4959263801574707, "learning_rate": 3.821765690295343e-07, "loss": 0.1364, "step": 3855 }, { "epoch": 1.8292220113851991, "grad_norm": 1.550232172012329, "learning_rate": 3.8007567691462187e-07, "loss": 0.1415, "step": 3856 }, { "epoch": 1.829696394686907, "grad_norm": 1.8426754474639893, "learning_rate": 3.77980463345845e-07, "loss": 0.1443, "step": 3857 }, { "epoch": 1.8301707779886147, "grad_norm": 1.846116065979004, "learning_rate": 3.758909295599877e-07, "loss": 0.1743, "step": 3858 }, { "epoch": 1.8306451612903225, "grad_norm": 2.2521839141845703, "learning_rate": 3.738070767904778e-07, "loss": 0.1485, "step": 3859 }, { "epoch": 1.8311195445920303, "grad_norm": 1.8919899463653564, "learning_rate": 3.7172890626739566e-07, "loss": 0.1651, "step": 3860 }, { "epoch": 1.831593927893738, "grad_norm": 1.477987289428711, "learning_rate": 3.696564192174645e-07, "loss": 0.1399, "step": 3861 }, { "epoch": 1.8320683111954459, "grad_norm": 1.4697281122207642, "learning_rate": 3.675896168640536e-07, "loss": 0.114, "step": 3862 }, { "epoch": 1.8325426944971537, "grad_norm": 1.2176628112792969, "learning_rate": 3.655285004271747e-07, "loss": 0.1069, "step": 3863 }, { "epoch": 1.8330170777988615, "grad_norm": 1.5733720064163208, "learning_rate": 3.634730711234835e-07, "loss": 0.1506, "step": 3864 }, { "epoch": 1.8334914611005693, "grad_norm": 2.247699737548828, "learning_rate": 3.6142333016628286e-07, "loss": 0.1617, "step": 3865 }, { "epoch": 1.833965844402277, "grad_norm": 1.3761510848999023, "learning_rate": 3.593792787655115e-07, "loss": 0.1083, "step": 3866 }, { "epoch": 1.8344402277039848, "grad_norm": 1.6789445877075195, "learning_rate": 3.573409181277554e-07, "loss": 0.1403, "step": 3867 }, { "epoch": 1.8349146110056926, "grad_norm": 1.8942904472351074, "learning_rate": 3.553082494562354e-07, "loss": 0.1702, "step": 3868 }, { "epoch": 1.8353889943074004, "grad_norm": 1.7401469945907593, "learning_rate": 3.5328127395081736e-07, "loss": 0.1493, "step": 3869 }, { "epoch": 1.8358633776091082, "grad_norm": 1.4459508657455444, "learning_rate": 3.5125999280800517e-07, "loss": 0.1332, "step": 3870 }, { "epoch": 1.836337760910816, "grad_norm": 1.4247829914093018, "learning_rate": 3.492444072209411e-07, "loss": 0.1384, "step": 3871 }, { "epoch": 1.8368121442125238, "grad_norm": 1.431828260421753, "learning_rate": 3.472345183794046e-07, "loss": 0.1207, "step": 3872 }, { "epoch": 1.8372865275142316, "grad_norm": 1.6826831102371216, "learning_rate": 3.4523032746981434e-07, "loss": 0.1252, "step": 3873 }, { "epoch": 1.8377609108159394, "grad_norm": 1.5859261751174927, "learning_rate": 3.43231835675224e-07, "loss": 0.1273, "step": 3874 }, { "epoch": 1.8382352941176472, "grad_norm": 1.906586766242981, "learning_rate": 3.4123904417532325e-07, "loss": 0.1509, "step": 3875 }, { "epoch": 1.838709677419355, "grad_norm": 1.7677314281463623, "learning_rate": 3.3925195414643677e-07, "loss": 0.1258, "step": 3876 }, { "epoch": 1.8391840607210628, "grad_norm": 1.9575536251068115, "learning_rate": 3.37270566761525e-07, "loss": 0.1903, "step": 3877 }, { "epoch": 1.8396584440227703, "grad_norm": 1.5801265239715576, "learning_rate": 3.3529488319017924e-07, "loss": 0.123, "step": 3878 }, { "epoch": 1.8401328273244781, "grad_norm": 1.6547151803970337, "learning_rate": 3.3332490459862865e-07, "loss": 0.1577, "step": 3879 }, { "epoch": 1.840607210626186, "grad_norm": 1.688603401184082, "learning_rate": 3.313606321497309e-07, "loss": 0.1416, "step": 3880 }, { "epoch": 1.8410815939278937, "grad_norm": 1.4524204730987549, "learning_rate": 3.294020670029785e-07, "loss": 0.1121, "step": 3881 }, { "epoch": 1.8415559772296015, "grad_norm": 1.886781930923462, "learning_rate": 3.2744921031448997e-07, "loss": 0.1375, "step": 3882 }, { "epoch": 1.8420303605313093, "grad_norm": 1.9067200422286987, "learning_rate": 3.255020632370176e-07, "loss": 0.182, "step": 3883 }, { "epoch": 1.842504743833017, "grad_norm": 1.91090989112854, "learning_rate": 3.235606269199454e-07, "loss": 0.1711, "step": 3884 }, { "epoch": 1.842979127134725, "grad_norm": 2.2755870819091797, "learning_rate": 3.2162490250928103e-07, "loss": 0.1389, "step": 3885 }, { "epoch": 1.8434535104364327, "grad_norm": 1.7041386365890503, "learning_rate": 3.1969489114766715e-07, "loss": 0.1558, "step": 3886 }, { "epoch": 1.8439278937381403, "grad_norm": 1.5372099876403809, "learning_rate": 3.1777059397436693e-07, "loss": 0.1196, "step": 3887 }, { "epoch": 1.844402277039848, "grad_norm": 1.4192637205123901, "learning_rate": 3.1585201212527507e-07, "loss": 0.1458, "step": 3888 }, { "epoch": 1.8448766603415558, "grad_norm": 2.2096190452575684, "learning_rate": 3.1393914673291335e-07, "loss": 0.1621, "step": 3889 }, { "epoch": 1.8453510436432636, "grad_norm": 1.54441499710083, "learning_rate": 3.120319989264242e-07, "loss": 0.1255, "step": 3890 }, { "epoch": 1.8458254269449714, "grad_norm": 1.6018781661987305, "learning_rate": 3.101305698315815e-07, "loss": 0.1354, "step": 3891 }, { "epoch": 1.8462998102466792, "grad_norm": 2.1039795875549316, "learning_rate": 3.082348605707752e-07, "loss": 0.1501, "step": 3892 }, { "epoch": 1.846774193548387, "grad_norm": 1.5671883821487427, "learning_rate": 3.06344872263028e-07, "loss": 0.1483, "step": 3893 }, { "epoch": 1.8472485768500948, "grad_norm": 1.5464131832122803, "learning_rate": 3.0446060602397965e-07, "loss": 0.1455, "step": 3894 }, { "epoch": 1.8477229601518026, "grad_norm": 1.674782156944275, "learning_rate": 3.0258206296589487e-07, "loss": 0.158, "step": 3895 }, { "epoch": 1.8481973434535104, "grad_norm": 1.4829376935958862, "learning_rate": 3.007092441976567e-07, "loss": 0.1428, "step": 3896 }, { "epoch": 1.8486717267552182, "grad_norm": 1.6832263469696045, "learning_rate": 2.988421508247741e-07, "loss": 0.1594, "step": 3897 }, { "epoch": 1.849146110056926, "grad_norm": 1.5657440423965454, "learning_rate": 2.9698078394937325e-07, "loss": 0.1129, "step": 3898 }, { "epoch": 1.8496204933586338, "grad_norm": 1.9195455312728882, "learning_rate": 2.951251446701997e-07, "loss": 0.1411, "step": 3899 }, { "epoch": 1.8500948766603416, "grad_norm": 1.7796403169631958, "learning_rate": 2.932752340826195e-07, "loss": 0.1521, "step": 3900 }, { "epoch": 1.8505692599620494, "grad_norm": 1.448384404182434, "learning_rate": 2.914310532786158e-07, "loss": 0.1057, "step": 3901 }, { "epoch": 1.8510436432637571, "grad_norm": 1.8354464769363403, "learning_rate": 2.8959260334679107e-07, "loss": 0.1109, "step": 3902 }, { "epoch": 1.851518026565465, "grad_norm": 1.9993449449539185, "learning_rate": 2.87759885372364e-07, "loss": 0.194, "step": 3903 }, { "epoch": 1.8519924098671727, "grad_norm": 1.7258479595184326, "learning_rate": 2.859329004371703e-07, "loss": 0.1546, "step": 3904 }, { "epoch": 1.8524667931688805, "grad_norm": 1.6797391176223755, "learning_rate": 2.8411164961966164e-07, "loss": 0.1433, "step": 3905 }, { "epoch": 1.8529411764705883, "grad_norm": 1.4090880155563354, "learning_rate": 2.8229613399490265e-07, "loss": 0.1238, "step": 3906 }, { "epoch": 1.853415559772296, "grad_norm": 1.9143239259719849, "learning_rate": 2.8048635463457485e-07, "loss": 0.1373, "step": 3907 }, { "epoch": 1.853889943074004, "grad_norm": 1.5854467153549194, "learning_rate": 2.7868231260697267e-07, "loss": 0.1538, "step": 3908 }, { "epoch": 1.8543643263757117, "grad_norm": 1.3684797286987305, "learning_rate": 2.768840089770053e-07, "loss": 0.0993, "step": 3909 }, { "epoch": 1.8548387096774195, "grad_norm": 1.3780193328857422, "learning_rate": 2.750914448061925e-07, "loss": 0.0877, "step": 3910 }, { "epoch": 1.8553130929791273, "grad_norm": 1.7516002655029297, "learning_rate": 2.73304621152668e-07, "loss": 0.1584, "step": 3911 }, { "epoch": 1.855787476280835, "grad_norm": 2.2483954429626465, "learning_rate": 2.7152353907117566e-07, "loss": 0.2224, "step": 3912 }, { "epoch": 1.8562618595825426, "grad_norm": 1.5955204963684082, "learning_rate": 2.697481996130713e-07, "loss": 0.127, "step": 3913 }, { "epoch": 1.8567362428842504, "grad_norm": 1.3224315643310547, "learning_rate": 2.6797860382631993e-07, "loss": 0.1119, "step": 3914 }, { "epoch": 1.8572106261859582, "grad_norm": 1.8325411081314087, "learning_rate": 2.6621475275549593e-07, "loss": 0.1764, "step": 3915 }, { "epoch": 1.857685009487666, "grad_norm": 1.426345705986023, "learning_rate": 2.644566474417831e-07, "loss": 0.1312, "step": 3916 }, { "epoch": 1.8581593927893738, "grad_norm": 1.5736240148544312, "learning_rate": 2.627042889229736e-07, "loss": 0.1433, "step": 3917 }, { "epoch": 1.8586337760910816, "grad_norm": 1.5317918062210083, "learning_rate": 2.609576782334688e-07, "loss": 0.1323, "step": 3918 }, { "epoch": 1.8591081593927894, "grad_norm": 1.8912248611450195, "learning_rate": 2.592168164042741e-07, "loss": 0.14, "step": 3919 }, { "epoch": 1.8595825426944972, "grad_norm": 1.658992052078247, "learning_rate": 2.57481704463004e-07, "loss": 0.1498, "step": 3920 }, { "epoch": 1.860056925996205, "grad_norm": 1.9872993230819702, "learning_rate": 2.5575234343387603e-07, "loss": 0.1534, "step": 3921 }, { "epoch": 1.8605313092979125, "grad_norm": 1.5195602178573608, "learning_rate": 2.5402873433771793e-07, "loss": 0.1353, "step": 3922 }, { "epoch": 1.8610056925996203, "grad_norm": 1.7144728899002075, "learning_rate": 2.52310878191957e-07, "loss": 0.1466, "step": 3923 }, { "epoch": 1.8614800759013281, "grad_norm": 1.6180931329727173, "learning_rate": 2.5059877601062655e-07, "loss": 0.1519, "step": 3924 }, { "epoch": 1.861954459203036, "grad_norm": 1.5871514081954956, "learning_rate": 2.488924288043648e-07, "loss": 0.1451, "step": 3925 }, { "epoch": 1.8624288425047437, "grad_norm": 1.9041343927383423, "learning_rate": 2.4719183758041056e-07, "loss": 0.1313, "step": 3926 }, { "epoch": 1.8629032258064515, "grad_norm": 1.3394379615783691, "learning_rate": 2.454970033426052e-07, "loss": 0.1156, "step": 3927 }, { "epoch": 1.8633776091081593, "grad_norm": 1.8027927875518799, "learning_rate": 2.4380792709139513e-07, "loss": 0.1709, "step": 3928 }, { "epoch": 1.863851992409867, "grad_norm": 1.628211498260498, "learning_rate": 2.4212460982382503e-07, "loss": 0.1529, "step": 3929 }, { "epoch": 1.864326375711575, "grad_norm": 1.6816056966781616, "learning_rate": 2.4044705253353897e-07, "loss": 0.171, "step": 3930 }, { "epoch": 1.8648007590132827, "grad_norm": 1.5056520700454712, "learning_rate": 2.387752562107826e-07, "loss": 0.1233, "step": 3931 }, { "epoch": 1.8652751423149905, "grad_norm": 1.4784669876098633, "learning_rate": 2.3710922184239983e-07, "loss": 0.103, "step": 3932 }, { "epoch": 1.8657495256166983, "grad_norm": 1.5319194793701172, "learning_rate": 2.3544895041183736e-07, "loss": 0.131, "step": 3933 }, { "epoch": 1.866223908918406, "grad_norm": 1.5678507089614868, "learning_rate": 2.3379444289913344e-07, "loss": 0.1255, "step": 3934 }, { "epoch": 1.8666982922201139, "grad_norm": 1.686842441558838, "learning_rate": 2.321457002809302e-07, "loss": 0.1496, "step": 3935 }, { "epoch": 1.8671726755218216, "grad_norm": 1.3267841339111328, "learning_rate": 2.3050272353046244e-07, "loss": 0.0997, "step": 3936 }, { "epoch": 1.8676470588235294, "grad_norm": 1.3671629428863525, "learning_rate": 2.2886551361756326e-07, "loss": 0.1393, "step": 3937 }, { "epoch": 1.8681214421252372, "grad_norm": 1.4104747772216797, "learning_rate": 2.2723407150866295e-07, "loss": 0.117, "step": 3938 }, { "epoch": 1.868595825426945, "grad_norm": 1.6997783184051514, "learning_rate": 2.2560839816678447e-07, "loss": 0.1638, "step": 3939 }, { "epoch": 1.8690702087286528, "grad_norm": 1.3959341049194336, "learning_rate": 2.2398849455154693e-07, "loss": 0.1404, "step": 3940 }, { "epoch": 1.8695445920303606, "grad_norm": 1.8219751119613647, "learning_rate": 2.2237436161916204e-07, "loss": 0.163, "step": 3941 }, { "epoch": 1.8700189753320684, "grad_norm": 1.9857854843139648, "learning_rate": 2.2076600032243766e-07, "loss": 0.1881, "step": 3942 }, { "epoch": 1.8704933586337762, "grad_norm": 1.7564761638641357, "learning_rate": 2.1916341161077547e-07, "loss": 0.1525, "step": 3943 }, { "epoch": 1.870967741935484, "grad_norm": 1.8521637916564941, "learning_rate": 2.175665964301643e-07, "loss": 0.16, "step": 3944 }, { "epoch": 1.8714421252371918, "grad_norm": 1.7699942588806152, "learning_rate": 2.1597555572319017e-07, "loss": 0.1384, "step": 3945 }, { "epoch": 1.8719165085388996, "grad_norm": 1.621044635772705, "learning_rate": 2.143902904290296e-07, "loss": 0.1489, "step": 3946 }, { "epoch": 1.8723908918406074, "grad_norm": 1.7144032716751099, "learning_rate": 2.1281080148344734e-07, "loss": 0.1488, "step": 3947 }, { "epoch": 1.8728652751423152, "grad_norm": 1.5608563423156738, "learning_rate": 2.1123708981880097e-07, "loss": 0.133, "step": 3948 }, { "epoch": 1.8733396584440227, "grad_norm": 1.6698534488677979, "learning_rate": 2.0966915636403518e-07, "loss": 0.1631, "step": 3949 }, { "epoch": 1.8738140417457305, "grad_norm": 1.595363974571228, "learning_rate": 2.0810700204468737e-07, "loss": 0.1502, "step": 3950 }, { "epoch": 1.8742884250474383, "grad_norm": 1.279978632926941, "learning_rate": 2.0655062778288103e-07, "loss": 0.1074, "step": 3951 }, { "epoch": 1.874762808349146, "grad_norm": 1.4724317789077759, "learning_rate": 2.050000344973302e-07, "loss": 0.1373, "step": 3952 }, { "epoch": 1.875237191650854, "grad_norm": 1.8594779968261719, "learning_rate": 2.0345522310333154e-07, "loss": 0.1505, "step": 3953 }, { "epoch": 1.8757115749525617, "grad_norm": 1.8657863140106201, "learning_rate": 2.0191619451277568e-07, "loss": 0.1591, "step": 3954 }, { "epoch": 1.8761859582542695, "grad_norm": 1.5500346422195435, "learning_rate": 2.0038294963413251e-07, "loss": 0.1185, "step": 3955 }, { "epoch": 1.8766603415559773, "grad_norm": 2.1276473999023438, "learning_rate": 1.9885548937246259e-07, "loss": 0.1951, "step": 3956 }, { "epoch": 1.8771347248576848, "grad_norm": 1.9857920408248901, "learning_rate": 1.9733381462941237e-07, "loss": 0.1628, "step": 3957 }, { "epoch": 1.8776091081593926, "grad_norm": 1.5573140382766724, "learning_rate": 1.9581792630320784e-07, "loss": 0.142, "step": 3958 }, { "epoch": 1.8780834914611004, "grad_norm": 1.2747137546539307, "learning_rate": 1.9430782528866655e-07, "loss": 0.0852, "step": 3959 }, { "epoch": 1.8785578747628082, "grad_norm": 1.549103856086731, "learning_rate": 1.928035124771832e-07, "loss": 0.1298, "step": 3960 }, { "epoch": 1.879032258064516, "grad_norm": 1.432792067527771, "learning_rate": 1.9130498875673975e-07, "loss": 0.1194, "step": 3961 }, { "epoch": 1.8795066413662238, "grad_norm": 1.63291335105896, "learning_rate": 1.8981225501190193e-07, "loss": 0.1418, "step": 3962 }, { "epoch": 1.8799810246679316, "grad_norm": 1.7714699506759644, "learning_rate": 1.8832531212381378e-07, "loss": 0.1696, "step": 3963 }, { "epoch": 1.8804554079696394, "grad_norm": 1.4121677875518799, "learning_rate": 1.8684416097020318e-07, "loss": 0.1029, "step": 3964 }, { "epoch": 1.8809297912713472, "grad_norm": 2.2035553455352783, "learning_rate": 1.853688024253786e-07, "loss": 0.1971, "step": 3965 }, { "epoch": 1.881404174573055, "grad_norm": 1.4989498853683472, "learning_rate": 1.8389923736022886e-07, "loss": 0.1317, "step": 3966 }, { "epoch": 1.8818785578747628, "grad_norm": 1.5162180662155151, "learning_rate": 1.824354666422268e-07, "loss": 0.1308, "step": 3967 }, { "epoch": 1.8823529411764706, "grad_norm": 1.9501628875732422, "learning_rate": 1.8097749113541897e-07, "loss": 0.1492, "step": 3968 }, { "epoch": 1.8828273244781784, "grad_norm": 1.5917279720306396, "learning_rate": 1.7952531170043474e-07, "loss": 0.1331, "step": 3969 }, { "epoch": 1.8833017077798861, "grad_norm": 1.2856277227401733, "learning_rate": 1.7807892919448178e-07, "loss": 0.1212, "step": 3970 }, { "epoch": 1.883776091081594, "grad_norm": 1.876271367073059, "learning_rate": 1.7663834447134488e-07, "loss": 0.184, "step": 3971 }, { "epoch": 1.8842504743833017, "grad_norm": 2.1068382263183594, "learning_rate": 1.7520355838138603e-07, "loss": 0.1388, "step": 3972 }, { "epoch": 1.8847248576850095, "grad_norm": 1.8254120349884033, "learning_rate": 1.7377457177154554e-07, "loss": 0.1451, "step": 3973 }, { "epoch": 1.8851992409867173, "grad_norm": 2.037038564682007, "learning_rate": 1.7235138548534202e-07, "loss": 0.1183, "step": 3974 }, { "epoch": 1.885673624288425, "grad_norm": 1.3412225246429443, "learning_rate": 1.7093400036286567e-07, "loss": 0.1199, "step": 3975 }, { "epoch": 1.886148007590133, "grad_norm": 1.9562183618545532, "learning_rate": 1.6952241724078723e-07, "loss": 0.1583, "step": 3976 }, { "epoch": 1.8866223908918407, "grad_norm": 1.518302321434021, "learning_rate": 1.6811663695234903e-07, "loss": 0.1554, "step": 3977 }, { "epoch": 1.8870967741935485, "grad_norm": 1.9183396100997925, "learning_rate": 1.6671666032736845e-07, "loss": 0.1279, "step": 3978 }, { "epoch": 1.8875711574952563, "grad_norm": 2.229943037033081, "learning_rate": 1.6532248819223995e-07, "loss": 0.1979, "step": 3979 }, { "epoch": 1.888045540796964, "grad_norm": 1.8653277158737183, "learning_rate": 1.639341213699286e-07, "loss": 0.1692, "step": 3980 }, { "epoch": 1.8885199240986719, "grad_norm": 1.746902346611023, "learning_rate": 1.6255156067997325e-07, "loss": 0.1305, "step": 3981 }, { "epoch": 1.8889943074003797, "grad_norm": 1.417380928993225, "learning_rate": 1.6117480693848442e-07, "loss": 0.1378, "step": 3982 }, { "epoch": 1.8894686907020875, "grad_norm": 1.7626910209655762, "learning_rate": 1.598038609581487e-07, "loss": 0.1556, "step": 3983 }, { "epoch": 1.889943074003795, "grad_norm": 1.6275900602340698, "learning_rate": 1.5843872354822099e-07, "loss": 0.1346, "step": 3984 }, { "epoch": 1.8904174573055028, "grad_norm": 2.031325340270996, "learning_rate": 1.5707939551452778e-07, "loss": 0.1854, "step": 3985 }, { "epoch": 1.8908918406072106, "grad_norm": 1.9907748699188232, "learning_rate": 1.5572587765946833e-07, "loss": 0.1645, "step": 3986 }, { "epoch": 1.8913662239089184, "grad_norm": 1.80055570602417, "learning_rate": 1.5437817078201024e-07, "loss": 0.1441, "step": 3987 }, { "epoch": 1.8918406072106262, "grad_norm": 1.7448289394378662, "learning_rate": 1.5303627567769043e-07, "loss": 0.1543, "step": 3988 }, { "epoch": 1.892314990512334, "grad_norm": 1.6348083019256592, "learning_rate": 1.517001931386175e-07, "loss": 0.1384, "step": 3989 }, { "epoch": 1.8927893738140418, "grad_norm": 1.6425923109054565, "learning_rate": 1.5036992395346838e-07, "loss": 0.1273, "step": 3990 }, { "epoch": 1.8932637571157496, "grad_norm": 1.3642024993896484, "learning_rate": 1.4904546890748606e-07, "loss": 0.123, "step": 3991 }, { "epoch": 1.8937381404174574, "grad_norm": 2.2443578243255615, "learning_rate": 1.477268287824829e-07, "loss": 0.1664, "step": 3992 }, { "epoch": 1.894212523719165, "grad_norm": 1.5792819261550903, "learning_rate": 1.4641400435684184e-07, "loss": 0.1412, "step": 3993 }, { "epoch": 1.8946869070208727, "grad_norm": 1.4068413972854614, "learning_rate": 1.4510699640550852e-07, "loss": 0.1264, "step": 3994 }, { "epoch": 1.8951612903225805, "grad_norm": 1.7615771293640137, "learning_rate": 1.438058056999969e-07, "loss": 0.1467, "step": 3995 }, { "epoch": 1.8956356736242883, "grad_norm": 1.797843098640442, "learning_rate": 1.4251043300838706e-07, "loss": 0.1566, "step": 3996 }, { "epoch": 1.896110056925996, "grad_norm": 1.5007370710372925, "learning_rate": 1.412208790953229e-07, "loss": 0.1244, "step": 3997 }, { "epoch": 1.896584440227704, "grad_norm": 1.8236267566680908, "learning_rate": 1.399371447220188e-07, "loss": 0.1535, "step": 3998 }, { "epoch": 1.8970588235294117, "grad_norm": 1.9256770610809326, "learning_rate": 1.3865923064624753e-07, "loss": 0.1759, "step": 3999 }, { "epoch": 1.8975332068311195, "grad_norm": 1.9802489280700684, "learning_rate": 1.3738713762235124e-07, "loss": 0.1589, "step": 4000 }, { "epoch": 1.8980075901328273, "grad_norm": 1.7271060943603516, "learning_rate": 1.3612086640123257e-07, "loss": 0.1405, "step": 4001 }, { "epoch": 1.898481973434535, "grad_norm": 1.7396737337112427, "learning_rate": 1.348604177303592e-07, "loss": 0.1848, "step": 4002 }, { "epoch": 1.8989563567362429, "grad_norm": 1.5223190784454346, "learning_rate": 1.3360579235376148e-07, "loss": 0.1409, "step": 4003 }, { "epoch": 1.8994307400379506, "grad_norm": 1.359562635421753, "learning_rate": 1.323569910120326e-07, "loss": 0.1323, "step": 4004 }, { "epoch": 1.8999051233396584, "grad_norm": 1.640760064125061, "learning_rate": 1.311140144423273e-07, "loss": 0.1185, "step": 4005 }, { "epoch": 1.9003795066413662, "grad_norm": 1.469146966934204, "learning_rate": 1.2987686337836202e-07, "loss": 0.1387, "step": 4006 }, { "epoch": 1.900853889943074, "grad_norm": 1.6434742212295532, "learning_rate": 1.2864553855041484e-07, "loss": 0.159, "step": 4007 }, { "epoch": 1.9013282732447818, "grad_norm": 1.5248254537582397, "learning_rate": 1.2742004068532544e-07, "loss": 0.1352, "step": 4008 }, { "epoch": 1.9018026565464896, "grad_norm": 1.5884958505630493, "learning_rate": 1.2620037050649404e-07, "loss": 0.1388, "step": 4009 }, { "epoch": 1.9022770398481974, "grad_norm": 1.4101170301437378, "learning_rate": 1.2498652873387696e-07, "loss": 0.1303, "step": 4010 }, { "epoch": 1.9027514231499052, "grad_norm": 1.5898159742355347, "learning_rate": 1.237785160839955e-07, "loss": 0.1243, "step": 4011 }, { "epoch": 1.903225806451613, "grad_norm": 1.8250586986541748, "learning_rate": 1.225763332699259e-07, "loss": 0.1687, "step": 4012 }, { "epoch": 1.9037001897533208, "grad_norm": 1.5457115173339844, "learning_rate": 1.21379981001305e-07, "loss": 0.137, "step": 4013 }, { "epoch": 1.9041745730550286, "grad_norm": 2.082979917526245, "learning_rate": 1.2018945998433007e-07, "loss": 0.1687, "step": 4014 }, { "epoch": 1.9046489563567364, "grad_norm": 1.4070978164672852, "learning_rate": 1.190047709217501e-07, "loss": 0.0983, "step": 4015 }, { "epoch": 1.9051233396584442, "grad_norm": 1.6972084045410156, "learning_rate": 1.1782591451287795e-07, "loss": 0.1345, "step": 4016 }, { "epoch": 1.905597722960152, "grad_norm": 1.9089219570159912, "learning_rate": 1.1665289145357916e-07, "loss": 0.1535, "step": 4017 }, { "epoch": 1.9060721062618597, "grad_norm": 1.5392357110977173, "learning_rate": 1.1548570243627988e-07, "loss": 0.1201, "step": 4018 }, { "epoch": 1.9065464895635673, "grad_norm": 1.7641630172729492, "learning_rate": 1.1432434814995897e-07, "loss": 0.1616, "step": 4019 }, { "epoch": 1.907020872865275, "grad_norm": 1.4591128826141357, "learning_rate": 1.131688292801525e-07, "loss": 0.125, "step": 4020 }, { "epoch": 1.907495256166983, "grad_norm": 1.7952362298965454, "learning_rate": 1.1201914650895152e-07, "loss": 0.1476, "step": 4021 }, { "epoch": 1.9079696394686907, "grad_norm": 1.4461826086044312, "learning_rate": 1.1087530051500206e-07, "loss": 0.1369, "step": 4022 }, { "epoch": 1.9084440227703985, "grad_norm": 1.7202457189559937, "learning_rate": 1.0973729197350514e-07, "loss": 0.1535, "step": 4023 }, { "epoch": 1.9089184060721063, "grad_norm": 1.419438362121582, "learning_rate": 1.0860512155621783e-07, "loss": 0.1482, "step": 4024 }, { "epoch": 1.909392789373814, "grad_norm": 1.293198585510254, "learning_rate": 1.0747878993144667e-07, "loss": 0.1156, "step": 4025 }, { "epoch": 1.9098671726755219, "grad_norm": 1.2855961322784424, "learning_rate": 1.0635829776405537e-07, "loss": 0.1263, "step": 4026 }, { "epoch": 1.9103415559772297, "grad_norm": 1.5536890029907227, "learning_rate": 1.0524364571546042e-07, "loss": 0.1277, "step": 4027 }, { "epoch": 1.9108159392789372, "grad_norm": 1.8725106716156006, "learning_rate": 1.041348344436277e-07, "loss": 0.1618, "step": 4028 }, { "epoch": 1.911290322580645, "grad_norm": 1.922491431236267, "learning_rate": 1.0303186460307813e-07, "loss": 0.1475, "step": 4029 }, { "epoch": 1.9117647058823528, "grad_norm": 1.6635925769805908, "learning_rate": 1.019347368448842e-07, "loss": 0.1678, "step": 4030 }, { "epoch": 1.9122390891840606, "grad_norm": 1.6013275384902954, "learning_rate": 1.0084345181666899e-07, "loss": 0.1227, "step": 4031 }, { "epoch": 1.9127134724857684, "grad_norm": 1.4993422031402588, "learning_rate": 9.975801016260834e-08, "loss": 0.1227, "step": 4032 }, { "epoch": 1.9131878557874762, "grad_norm": 1.8088953495025635, "learning_rate": 9.867841252342747e-08, "loss": 0.1511, "step": 4033 }, { "epoch": 1.913662239089184, "grad_norm": 1.366092562675476, "learning_rate": 9.76046595364022e-08, "loss": 0.1118, "step": 4034 }, { "epoch": 1.9141366223908918, "grad_norm": 1.6480141878128052, "learning_rate": 9.65367518353577e-08, "loss": 0.1707, "step": 4035 }, { "epoch": 1.9146110056925996, "grad_norm": 1.374222755432129, "learning_rate": 9.547469005066979e-08, "loss": 0.1033, "step": 4036 }, { "epoch": 1.9150853889943074, "grad_norm": 1.432600736618042, "learning_rate": 9.441847480926247e-08, "loss": 0.1277, "step": 4037 }, { "epoch": 1.9155597722960152, "grad_norm": 1.8599258661270142, "learning_rate": 9.336810673460928e-08, "loss": 0.1714, "step": 4038 }, { "epoch": 1.916034155597723, "grad_norm": 1.7460694313049316, "learning_rate": 9.232358644673311e-08, "loss": 0.141, "step": 4039 }, { "epoch": 1.9165085388994307, "grad_norm": 1.5455800294876099, "learning_rate": 9.12849145622019e-08, "loss": 0.1159, "step": 4040 }, { "epoch": 1.9169829222011385, "grad_norm": 1.9061877727508545, "learning_rate": 9.025209169413629e-08, "loss": 0.1449, "step": 4041 }, { "epoch": 1.9174573055028463, "grad_norm": 1.8176791667938232, "learning_rate": 8.922511845219972e-08, "loss": 0.1377, "step": 4042 }, { "epoch": 1.9179316888045541, "grad_norm": 1.5344781875610352, "learning_rate": 8.820399544260283e-08, "loss": 0.1394, "step": 4043 }, { "epoch": 1.918406072106262, "grad_norm": 1.6512079238891602, "learning_rate": 8.71887232681079e-08, "loss": 0.1129, "step": 4044 }, { "epoch": 1.9188804554079697, "grad_norm": 2.159686803817749, "learning_rate": 8.617930252801665e-08, "loss": 0.1642, "step": 4045 }, { "epoch": 1.9193548387096775, "grad_norm": 1.8301466703414917, "learning_rate": 8.517573381818245e-08, "loss": 0.1814, "step": 4046 }, { "epoch": 1.9198292220113853, "grad_norm": 1.6383099555969238, "learning_rate": 8.417801773100032e-08, "loss": 0.1308, "step": 4047 }, { "epoch": 1.920303605313093, "grad_norm": 1.4430229663848877, "learning_rate": 8.31861548554147e-08, "loss": 0.1266, "step": 4048 }, { "epoch": 1.9207779886148009, "grad_norm": 1.6547898054122925, "learning_rate": 8.220014577690949e-08, "loss": 0.1496, "step": 4049 }, { "epoch": 1.9212523719165087, "grad_norm": 1.4231595993041992, "learning_rate": 8.1219991077518e-08, "loss": 0.1162, "step": 4050 }, { "epoch": 1.9217267552182165, "grad_norm": 1.9546478986740112, "learning_rate": 8.024569133581517e-08, "loss": 0.1404, "step": 4051 }, { "epoch": 1.9222011385199242, "grad_norm": 1.541332483291626, "learning_rate": 7.927724712692098e-08, "loss": 0.1339, "step": 4052 }, { "epoch": 1.922675521821632, "grad_norm": 2.084918260574341, "learning_rate": 7.831465902249701e-08, "loss": 0.1533, "step": 4053 }, { "epoch": 1.9231499051233396, "grad_norm": 1.7567437887191772, "learning_rate": 7.7357927590751e-08, "loss": 0.1356, "step": 4054 }, { "epoch": 1.9236242884250474, "grad_norm": 1.616256833076477, "learning_rate": 7.640705339643118e-08, "loss": 0.1404, "step": 4055 }, { "epoch": 1.9240986717267552, "grad_norm": 1.6991674900054932, "learning_rate": 7.546203700082966e-08, "loss": 0.1553, "step": 4056 }, { "epoch": 1.924573055028463, "grad_norm": 1.4188369512557983, "learning_rate": 7.452287896178134e-08, "loss": 0.1166, "step": 4057 }, { "epoch": 1.9250474383301708, "grad_norm": 1.779510498046875, "learning_rate": 7.358957983365944e-08, "loss": 0.1478, "step": 4058 }, { "epoch": 1.9255218216318786, "grad_norm": 1.4632707834243774, "learning_rate": 7.266214016738326e-08, "loss": 0.1639, "step": 4059 }, { "epoch": 1.9259962049335864, "grad_norm": 2.640392303466797, "learning_rate": 7.174056051041045e-08, "loss": 0.1873, "step": 4060 }, { "epoch": 1.9264705882352942, "grad_norm": 2.0957512855529785, "learning_rate": 7.082484140674029e-08, "loss": 0.1609, "step": 4061 }, { "epoch": 1.926944971537002, "grad_norm": 1.6926616430282593, "learning_rate": 6.99149833969126e-08, "loss": 0.1288, "step": 4062 }, { "epoch": 1.9274193548387095, "grad_norm": 1.697201132774353, "learning_rate": 6.901098701800779e-08, "loss": 0.1569, "step": 4063 }, { "epoch": 1.9278937381404173, "grad_norm": 1.5255944728851318, "learning_rate": 6.811285280364677e-08, "loss": 0.1255, "step": 4064 }, { "epoch": 1.928368121442125, "grad_norm": 1.38019859790802, "learning_rate": 6.722058128398768e-08, "loss": 0.1255, "step": 4065 }, { "epoch": 1.928842504743833, "grad_norm": 1.9084728956222534, "learning_rate": 6.633417298573142e-08, "loss": 0.1617, "step": 4066 }, { "epoch": 1.9293168880455407, "grad_norm": 1.2430599927902222, "learning_rate": 6.54536284321139e-08, "loss": 0.1083, "step": 4067 }, { "epoch": 1.9297912713472485, "grad_norm": 1.3795536756515503, "learning_rate": 6.457894814291376e-08, "loss": 0.1128, "step": 4068 }, { "epoch": 1.9302656546489563, "grad_norm": 1.5916107892990112, "learning_rate": 6.371013263444469e-08, "loss": 0.1367, "step": 4069 }, { "epoch": 1.930740037950664, "grad_norm": 1.4485095739364624, "learning_rate": 6.284718241956089e-08, "loss": 0.1246, "step": 4070 }, { "epoch": 1.9312144212523719, "grad_norm": 1.8069593906402588, "learning_rate": 6.199009800765265e-08, "loss": 0.1763, "step": 4071 }, { "epoch": 1.9316888045540797, "grad_norm": 1.608431339263916, "learning_rate": 6.113887990464862e-08, "loss": 0.1233, "step": 4072 }, { "epoch": 1.9321631878557874, "grad_norm": 1.4641162157058716, "learning_rate": 6.029352861301462e-08, "loss": 0.127, "step": 4073 }, { "epoch": 1.9326375711574952, "grad_norm": 1.5734367370605469, "learning_rate": 5.945404463175375e-08, "loss": 0.1413, "step": 4074 }, { "epoch": 1.933111954459203, "grad_norm": 1.354025959968567, "learning_rate": 5.862042845640403e-08, "loss": 0.1094, "step": 4075 }, { "epoch": 1.9335863377609108, "grad_norm": 1.6101434230804443, "learning_rate": 5.779268057904186e-08, "loss": 0.1551, "step": 4076 }, { "epoch": 1.9340607210626186, "grad_norm": 1.3197808265686035, "learning_rate": 5.6970801488276385e-08, "loss": 0.1253, "step": 4077 }, { "epoch": 1.9345351043643264, "grad_norm": 1.7028498649597168, "learning_rate": 5.61547916692573e-08, "loss": 0.1485, "step": 4078 }, { "epoch": 1.9350094876660342, "grad_norm": 1.6578190326690674, "learning_rate": 5.534465160366598e-08, "loss": 0.1275, "step": 4079 }, { "epoch": 1.935483870967742, "grad_norm": 1.70431649684906, "learning_rate": 5.454038176971987e-08, "loss": 0.1387, "step": 4080 }, { "epoch": 1.9359582542694498, "grad_norm": 1.3666795492172241, "learning_rate": 5.3741982642173675e-08, "loss": 0.1215, "step": 4081 }, { "epoch": 1.9364326375711576, "grad_norm": 1.2814855575561523, "learning_rate": 5.294945469231039e-08, "loss": 0.1162, "step": 4082 }, { "epoch": 1.9369070208728654, "grad_norm": 1.7028858661651611, "learning_rate": 5.2162798387954686e-08, "loss": 0.1455, "step": 4083 }, { "epoch": 1.9373814041745732, "grad_norm": 1.8575924634933472, "learning_rate": 5.1382014193461783e-08, "loss": 0.1607, "step": 4084 }, { "epoch": 1.937855787476281, "grad_norm": 1.5589218139648438, "learning_rate": 5.0607102569718566e-08, "loss": 0.1597, "step": 4085 }, { "epoch": 1.9383301707779887, "grad_norm": 1.469462275505066, "learning_rate": 4.9838063974150255e-08, "loss": 0.1403, "step": 4086 }, { "epoch": 1.9388045540796965, "grad_norm": 1.900517225265503, "learning_rate": 4.9074898860711485e-08, "loss": 0.1851, "step": 4087 }, { "epoch": 1.9392789373814043, "grad_norm": 1.6202280521392822, "learning_rate": 4.83176076798908e-08, "loss": 0.1457, "step": 4088 }, { "epoch": 1.939753320683112, "grad_norm": 1.608129620552063, "learning_rate": 4.7566190878710615e-08, "loss": 0.1075, "step": 4089 }, { "epoch": 1.9402277039848197, "grad_norm": 1.531738519668579, "learning_rate": 4.6820648900725016e-08, "loss": 0.1195, "step": 4090 }, { "epoch": 1.9407020872865275, "grad_norm": 1.6448765993118286, "learning_rate": 4.608098218601864e-08, "loss": 0.1508, "step": 4091 }, { "epoch": 1.9411764705882353, "grad_norm": 1.4417756795883179, "learning_rate": 4.5347191171211114e-08, "loss": 0.1439, "step": 4092 }, { "epoch": 1.941650853889943, "grad_norm": 1.6759575605392456, "learning_rate": 4.4619276289450394e-08, "loss": 0.1487, "step": 4093 }, { "epoch": 1.9421252371916509, "grad_norm": 1.6265183687210083, "learning_rate": 4.3897237970418336e-08, "loss": 0.1338, "step": 4094 }, { "epoch": 1.9425996204933587, "grad_norm": 1.5453293323516846, "learning_rate": 4.318107664032622e-08, "loss": 0.1366, "step": 4095 }, { "epoch": 1.9430740037950665, "grad_norm": 1.4695607423782349, "learning_rate": 4.247079272191812e-08, "loss": 0.1329, "step": 4096 }, { "epoch": 1.9435483870967742, "grad_norm": 1.945720911026001, "learning_rate": 4.1766386634467523e-08, "loss": 0.169, "step": 4097 }, { "epoch": 1.9440227703984818, "grad_norm": 1.592432975769043, "learning_rate": 4.1067858793777394e-08, "loss": 0.1179, "step": 4098 }, { "epoch": 1.9444971537001896, "grad_norm": 1.6521458625793457, "learning_rate": 4.037520961218233e-08, "loss": 0.1522, "step": 4099 }, { "epoch": 1.9449715370018974, "grad_norm": 1.886146903038025, "learning_rate": 3.96884394985475e-08, "loss": 0.1636, "step": 4100 }, { "epoch": 1.9454459203036052, "grad_norm": 1.8324472904205322, "learning_rate": 3.900754885826419e-08, "loss": 0.1596, "step": 4101 }, { "epoch": 1.945920303605313, "grad_norm": 1.935429573059082, "learning_rate": 3.833253809325643e-08, "loss": 0.171, "step": 4102 }, { "epoch": 1.9463946869070208, "grad_norm": 1.5601022243499756, "learning_rate": 3.766340760197662e-08, "loss": 0.1629, "step": 4103 }, { "epoch": 1.9468690702087286, "grad_norm": 1.2988762855529785, "learning_rate": 3.700015777940547e-08, "loss": 0.1098, "step": 4104 }, { "epoch": 1.9473434535104364, "grad_norm": 1.6322436332702637, "learning_rate": 3.634278901705424e-08, "loss": 0.167, "step": 4105 }, { "epoch": 1.9478178368121442, "grad_norm": 1.707542061805725, "learning_rate": 3.56913017029592e-08, "loss": 0.1659, "step": 4106 }, { "epoch": 1.948292220113852, "grad_norm": 1.5153127908706665, "learning_rate": 3.50456962216883e-08, "loss": 0.1431, "step": 4107 }, { "epoch": 1.9487666034155597, "grad_norm": 1.735724687576294, "learning_rate": 3.4405972954334454e-08, "loss": 0.1478, "step": 4108 }, { "epoch": 1.9492409867172675, "grad_norm": 1.4995523691177368, "learning_rate": 3.3772132278522276e-08, "loss": 0.1218, "step": 4109 }, { "epoch": 1.9497153700189753, "grad_norm": 1.6257517337799072, "learning_rate": 3.3144174568399135e-08, "loss": 0.1306, "step": 4110 }, { "epoch": 1.9501897533206831, "grad_norm": 1.814677357673645, "learning_rate": 3.252210019464408e-08, "loss": 0.1423, "step": 4111 }, { "epoch": 1.950664136622391, "grad_norm": 1.7298712730407715, "learning_rate": 3.190590952446115e-08, "loss": 0.1366, "step": 4112 }, { "epoch": 1.9511385199240987, "grad_norm": 1.5716830492019653, "learning_rate": 3.129560292158051e-08, "loss": 0.1091, "step": 4113 }, { "epoch": 1.9516129032258065, "grad_norm": 1.3613415956497192, "learning_rate": 3.069118074626176e-08, "loss": 0.1238, "step": 4114 }, { "epoch": 1.9520872865275143, "grad_norm": 1.6825555562973022, "learning_rate": 3.0092643355287274e-08, "loss": 0.1441, "step": 4115 }, { "epoch": 1.952561669829222, "grad_norm": 1.650367021560669, "learning_rate": 2.9499991101969995e-08, "loss": 0.156, "step": 4116 }, { "epoch": 1.9530360531309299, "grad_norm": 1.4215662479400635, "learning_rate": 2.8913224336145628e-08, "loss": 0.1157, "step": 4117 }, { "epoch": 1.9535104364326377, "grad_norm": 1.7951689958572388, "learning_rate": 2.8332343404177122e-08, "loss": 0.1527, "step": 4118 }, { "epoch": 1.9539848197343455, "grad_norm": 1.3944616317749023, "learning_rate": 2.7757348648951298e-08, "loss": 0.1123, "step": 4119 }, { "epoch": 1.9544592030360532, "grad_norm": 1.5503982305526733, "learning_rate": 2.7188240409883325e-08, "loss": 0.1412, "step": 4120 }, { "epoch": 1.954933586337761, "grad_norm": 1.899158000946045, "learning_rate": 2.6625019022912256e-08, "loss": 0.1543, "step": 4121 }, { "epoch": 1.9554079696394688, "grad_norm": 1.7116118669509888, "learning_rate": 2.606768482050215e-08, "loss": 0.1359, "step": 4122 }, { "epoch": 1.9558823529411766, "grad_norm": 1.7955915927886963, "learning_rate": 2.5516238131640945e-08, "loss": 0.1608, "step": 4123 }, { "epoch": 1.9563567362428842, "grad_norm": 1.3384437561035156, "learning_rate": 2.4970679281842715e-08, "loss": 0.1349, "step": 4124 }, { "epoch": 1.956831119544592, "grad_norm": 1.5795972347259521, "learning_rate": 2.443100859314429e-08, "loss": 0.131, "step": 4125 }, { "epoch": 1.9573055028462998, "grad_norm": 1.428765892982483, "learning_rate": 2.389722638410974e-08, "loss": 0.1195, "step": 4126 }, { "epoch": 1.9577798861480076, "grad_norm": 1.754357933998108, "learning_rate": 2.3369332969824798e-08, "loss": 0.1282, "step": 4127 }, { "epoch": 1.9582542694497154, "grad_norm": 1.8691462278366089, "learning_rate": 2.2847328661900203e-08, "loss": 0.1735, "step": 4128 }, { "epoch": 1.9587286527514232, "grad_norm": 1.630744218826294, "learning_rate": 2.2331213768468363e-08, "loss": 0.119, "step": 4129 }, { "epoch": 1.959203036053131, "grad_norm": 1.8132566213607788, "learning_rate": 2.1820988594187796e-08, "loss": 0.1588, "step": 4130 }, { "epoch": 1.9596774193548387, "grad_norm": 1.4721542596817017, "learning_rate": 2.131665344023981e-08, "loss": 0.1301, "step": 4131 }, { "epoch": 1.9601518026565465, "grad_norm": 1.846588373184204, "learning_rate": 2.0818208604328482e-08, "loss": 0.1776, "step": 4132 }, { "epoch": 1.960626185958254, "grad_norm": 1.6939235925674438, "learning_rate": 2.032565438067957e-08, "loss": 0.163, "step": 4133 }, { "epoch": 1.961100569259962, "grad_norm": 1.7470312118530273, "learning_rate": 1.9838991060043833e-08, "loss": 0.1481, "step": 4134 }, { "epoch": 1.9615749525616697, "grad_norm": 1.7479218244552612, "learning_rate": 1.9358218929693695e-08, "loss": 0.1545, "step": 4135 }, { "epoch": 1.9620493358633775, "grad_norm": 1.5178728103637695, "learning_rate": 1.8883338273425478e-08, "loss": 0.134, "step": 4136 }, { "epoch": 1.9625237191650853, "grad_norm": 1.7611669301986694, "learning_rate": 1.8414349371553842e-08, "loss": 0.1345, "step": 4137 }, { "epoch": 1.962998102466793, "grad_norm": 1.5717504024505615, "learning_rate": 1.7951252500920668e-08, "loss": 0.1209, "step": 4138 }, { "epoch": 1.9634724857685009, "grad_norm": 1.52717924118042, "learning_rate": 1.7494047934885073e-08, "loss": 0.1201, "step": 4139 }, { "epoch": 1.9639468690702087, "grad_norm": 1.8689799308776855, "learning_rate": 1.7042735943333388e-08, "loss": 0.1422, "step": 4140 }, { "epoch": 1.9644212523719164, "grad_norm": 1.4372001886367798, "learning_rate": 1.659731679266807e-08, "loss": 0.123, "step": 4141 }, { "epoch": 1.9648956356736242, "grad_norm": 1.98843514919281, "learning_rate": 1.6157790745817692e-08, "loss": 0.1896, "step": 4142 }, { "epoch": 1.965370018975332, "grad_norm": 1.4792766571044922, "learning_rate": 1.5724158062228046e-08, "loss": 0.1424, "step": 4143 }, { "epoch": 1.9658444022770398, "grad_norm": 1.795911192893982, "learning_rate": 1.5296418997869932e-08, "loss": 0.1412, "step": 4144 }, { "epoch": 1.9663187855787476, "grad_norm": 1.8548011779785156, "learning_rate": 1.4874573805232495e-08, "loss": 0.1624, "step": 4145 }, { "epoch": 1.9667931688804554, "grad_norm": 2.306521415710449, "learning_rate": 1.4458622733327654e-08, "loss": 0.1668, "step": 4146 }, { "epoch": 1.9672675521821632, "grad_norm": 2.005544900894165, "learning_rate": 1.4048566027685673e-08, "loss": 0.1667, "step": 4147 }, { "epoch": 1.967741935483871, "grad_norm": 2.2230448722839355, "learning_rate": 1.3644403930360706e-08, "loss": 0.1654, "step": 4148 }, { "epoch": 1.9682163187855788, "grad_norm": 1.799142837524414, "learning_rate": 1.3246136679925249e-08, "loss": 0.138, "step": 4149 }, { "epoch": 1.9686907020872866, "grad_norm": 1.5519241094589233, "learning_rate": 1.2853764511471245e-08, "loss": 0.114, "step": 4150 }, { "epoch": 1.9691650853889944, "grad_norm": 1.872344732284546, "learning_rate": 1.2467287656613425e-08, "loss": 0.1696, "step": 4151 }, { "epoch": 1.9696394686907022, "grad_norm": 2.0996015071868896, "learning_rate": 1.2086706343484855e-08, "loss": 0.1841, "step": 4152 }, { "epoch": 1.97011385199241, "grad_norm": 1.880236268043518, "learning_rate": 1.1712020796738056e-08, "loss": 0.1657, "step": 4153 }, { "epoch": 1.9705882352941178, "grad_norm": 1.773377537727356, "learning_rate": 1.1343231237548324e-08, "loss": 0.1686, "step": 4154 }, { "epoch": 1.9710626185958255, "grad_norm": 2.0221047401428223, "learning_rate": 1.0980337883605973e-08, "loss": 0.1789, "step": 4155 }, { "epoch": 1.9715370018975333, "grad_norm": 1.4797416925430298, "learning_rate": 1.0623340949125204e-08, "loss": 0.1164, "step": 4156 }, { "epoch": 1.9720113851992411, "grad_norm": 1.4807063341140747, "learning_rate": 1.027224064483745e-08, "loss": 0.1294, "step": 4157 }, { "epoch": 1.972485768500949, "grad_norm": 1.8948291540145874, "learning_rate": 9.927037177993593e-09, "loss": 0.1705, "step": 4158 }, { "epoch": 1.9729601518026565, "grad_norm": 1.3457918167114258, "learning_rate": 9.587730752362855e-09, "loss": 0.1067, "step": 4159 }, { "epoch": 1.9734345351043643, "grad_norm": 1.3780224323272705, "learning_rate": 9.254321568236135e-09, "loss": 0.1239, "step": 4160 }, { "epoch": 1.973908918406072, "grad_norm": 1.7696219682693481, "learning_rate": 8.926809822420446e-09, "loss": 0.1581, "step": 4161 }, { "epoch": 1.9743833017077799, "grad_norm": 1.508495807647705, "learning_rate": 8.605195708242254e-09, "loss": 0.1172, "step": 4162 }, { "epoch": 1.9748576850094877, "grad_norm": 1.7179173231124878, "learning_rate": 8.289479415548585e-09, "loss": 0.1668, "step": 4163 }, { "epoch": 1.9753320683111955, "grad_norm": 1.8635634183883667, "learning_rate": 7.979661130703697e-09, "loss": 0.148, "step": 4164 }, { "epoch": 1.9758064516129032, "grad_norm": 1.8939173221588135, "learning_rate": 7.67574103658797e-09, "loss": 0.1754, "step": 4165 }, { "epoch": 1.976280834914611, "grad_norm": 1.3412449359893799, "learning_rate": 7.377719312605669e-09, "loss": 0.1145, "step": 4166 }, { "epoch": 1.9767552182163188, "grad_norm": 1.482282280921936, "learning_rate": 7.085596134673855e-09, "loss": 0.1171, "step": 4167 }, { "epoch": 1.9772296015180264, "grad_norm": 1.7333260774612427, "learning_rate": 6.799371675230149e-09, "loss": 0.154, "step": 4168 }, { "epoch": 1.9777039848197342, "grad_norm": 2.1031572818756104, "learning_rate": 6.5190461032305085e-09, "loss": 0.1674, "step": 4169 }, { "epoch": 1.978178368121442, "grad_norm": 2.2297379970550537, "learning_rate": 6.244619584148126e-09, "loss": 0.1709, "step": 4170 }, { "epoch": 1.9786527514231498, "grad_norm": 1.7986024618148804, "learning_rate": 5.976092279974533e-09, "loss": 0.1359, "step": 4171 }, { "epoch": 1.9791271347248576, "grad_norm": 1.8482614755630493, "learning_rate": 5.713464349218489e-09, "loss": 0.1645, "step": 4172 }, { "epoch": 1.9796015180265654, "grad_norm": 1.9954028129577637, "learning_rate": 5.456735946907099e-09, "loss": 0.15, "step": 4173 }, { "epoch": 1.9800759013282732, "grad_norm": 1.6475908756256104, "learning_rate": 5.205907224583584e-09, "loss": 0.1412, "step": 4174 }, { "epoch": 1.980550284629981, "grad_norm": 1.6490331888198853, "learning_rate": 4.960978330310618e-09, "loss": 0.1739, "step": 4175 }, { "epoch": 1.9810246679316887, "grad_norm": 1.4961657524108887, "learning_rate": 4.721949408666993e-09, "loss": 0.1371, "step": 4176 }, { "epoch": 1.9814990512333965, "grad_norm": 2.557554006576538, "learning_rate": 4.488820600749844e-09, "loss": 0.2002, "step": 4177 }, { "epoch": 1.9819734345351043, "grad_norm": 1.2743830680847168, "learning_rate": 4.261592044171314e-09, "loss": 0.1035, "step": 4178 }, { "epoch": 1.9824478178368121, "grad_norm": 2.122196674346924, "learning_rate": 4.040263873063e-09, "loss": 0.1985, "step": 4179 }, { "epoch": 1.98292220113852, "grad_norm": 2.3841426372528076, "learning_rate": 3.824836218072614e-09, "loss": 0.1923, "step": 4180 }, { "epoch": 1.9833965844402277, "grad_norm": 1.5701267719268799, "learning_rate": 3.615309206365103e-09, "loss": 0.12, "step": 4181 }, { "epoch": 1.9838709677419355, "grad_norm": 1.7213834524154663, "learning_rate": 3.411682961621532e-09, "loss": 0.1528, "step": 4182 }, { "epoch": 1.9843453510436433, "grad_norm": 1.8331454992294312, "learning_rate": 3.2139576040413067e-09, "loss": 0.1536, "step": 4183 }, { "epoch": 1.984819734345351, "grad_norm": 1.744813084602356, "learning_rate": 3.0221332503399534e-09, "loss": 0.1609, "step": 4184 }, { "epoch": 1.9852941176470589, "grad_norm": 1.6270570755004883, "learning_rate": 2.8362100137491187e-09, "loss": 0.1473, "step": 4185 }, { "epoch": 1.9857685009487667, "grad_norm": 1.7102569341659546, "learning_rate": 2.656188004016569e-09, "loss": 0.1371, "step": 4186 }, { "epoch": 1.9862428842504745, "grad_norm": 1.3755574226379395, "learning_rate": 2.4820673274095207e-09, "loss": 0.1176, "step": 4187 }, { "epoch": 1.9867172675521823, "grad_norm": 1.6135125160217285, "learning_rate": 2.3138480867079814e-09, "loss": 0.1593, "step": 4188 }, { "epoch": 1.98719165085389, "grad_norm": 1.431528925895691, "learning_rate": 2.1515303812091883e-09, "loss": 0.1384, "step": 4189 }, { "epoch": 1.9876660341555978, "grad_norm": 1.3943769931793213, "learning_rate": 1.9951143067309385e-09, "loss": 0.1049, "step": 4190 }, { "epoch": 1.9881404174573056, "grad_norm": 1.5564050674438477, "learning_rate": 1.8445999556016003e-09, "loss": 0.1207, "step": 4191 }, { "epoch": 1.9886148007590134, "grad_norm": 1.4701181650161743, "learning_rate": 1.6999874166678809e-09, "loss": 0.1306, "step": 4192 }, { "epoch": 1.9890891840607212, "grad_norm": 2.03861141204834, "learning_rate": 1.561276775295939e-09, "loss": 0.1344, "step": 4193 }, { "epoch": 1.989563567362429, "grad_norm": 1.9190205335617065, "learning_rate": 1.4284681133625022e-09, "loss": 0.1536, "step": 4194 }, { "epoch": 1.9900379506641366, "grad_norm": 1.4114086627960205, "learning_rate": 1.301561509263749e-09, "loss": 0.1194, "step": 4195 }, { "epoch": 1.9905123339658444, "grad_norm": 1.5192102193832397, "learning_rate": 1.1805570379130882e-09, "loss": 0.1374, "step": 4196 }, { "epoch": 1.9909867172675522, "grad_norm": 1.6114188432693481, "learning_rate": 1.0654547707367179e-09, "loss": 0.1747, "step": 4197 }, { "epoch": 1.99146110056926, "grad_norm": 1.4619154930114746, "learning_rate": 9.562547756780672e-10, "loss": 0.1304, "step": 4198 }, { "epoch": 1.9919354838709677, "grad_norm": 1.5712562799453735, "learning_rate": 8.529571171977946e-10, "loss": 0.1691, "step": 4199 }, { "epoch": 1.9924098671726755, "grad_norm": 1.8165565729141235, "learning_rate": 7.555618562715695e-10, "loss": 0.1524, "step": 4200 }, { "epoch": 1.9928842504743833, "grad_norm": 1.9051848649978638, "learning_rate": 6.640690503889601e-10, "loss": 0.1687, "step": 4201 }, { "epoch": 1.9933586337760911, "grad_norm": 1.6580159664154053, "learning_rate": 5.784787535600966e-10, "loss": 0.1748, "step": 4202 }, { "epoch": 1.9938330170777987, "grad_norm": 1.8379294872283936, "learning_rate": 4.987910163067878e-10, "loss": 0.127, "step": 4203 }, { "epoch": 1.9943074003795065, "grad_norm": 1.5118616819381714, "learning_rate": 4.2500588566696254e-10, "loss": 0.1409, "step": 4204 }, { "epoch": 1.9947817836812143, "grad_norm": 1.5703513622283936, "learning_rate": 3.5712340519689083e-10, "loss": 0.1385, "step": 4205 }, { "epoch": 1.995256166982922, "grad_norm": 1.7153023481369019, "learning_rate": 2.9514361496563173e-10, "loss": 0.1148, "step": 4206 }, { "epoch": 1.9957305502846299, "grad_norm": 1.4218897819519043, "learning_rate": 2.39066551560585e-10, "loss": 0.1276, "step": 4207 }, { "epoch": 1.9962049335863377, "grad_norm": 1.6402268409729004, "learning_rate": 1.8889224808193995e-10, "loss": 0.1318, "step": 4208 }, { "epoch": 1.9966793168880455, "grad_norm": 1.7963615655899048, "learning_rate": 1.446207341482264e-10, "loss": 0.1481, "step": 4209 }, { "epoch": 1.9971537001897532, "grad_norm": 1.9880362749099731, "learning_rate": 1.0625203589187394e-10, "loss": 0.1913, "step": 4210 }, { "epoch": 1.997628083491461, "grad_norm": 1.616383671760559, "learning_rate": 7.378617596143223e-11, "loss": 0.1222, "step": 4211 }, { "epoch": 1.9981024667931688, "grad_norm": 1.5262385606765747, "learning_rate": 4.722317352157113e-11, "loss": 0.1253, "step": 4212 }, { "epoch": 1.9985768500948766, "grad_norm": 1.7264385223388672, "learning_rate": 2.656304425308065e-11, "loss": 0.1395, "step": 4213 }, { "epoch": 1.9990512333965844, "grad_norm": 1.826885461807251, "learning_rate": 1.1805800349540264e-11, "loss": 0.1372, "step": 4214 }, { "epoch": 1.9995256166982922, "grad_norm": 1.5402823686599731, "learning_rate": 2.9514505228700473e-12, "loss": 0.1458, "step": 4215 }, { "epoch": 2.0, "grad_norm": 1.4708012342453003, "learning_rate": 0.0, "loss": 0.1223, "step": 4216 } ], "logging_steps": 1.0, "max_steps": 4216, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 8.236846139047936e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }