{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2499897072748981,
"eval_steps": 759,
"global_step": 759,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003293672032607353,
"grad_norm": 3.2412221431732178,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.3662,
"step": 1
},
{
"epoch": 0.0003293672032607353,
"eval_loss": 3.95210599899292,
"eval_runtime": 812.7321,
"eval_samples_per_second": 3.146,
"eval_steps_per_second": 1.574,
"step": 1
},
{
"epoch": 0.0006587344065214706,
"grad_norm": 3.8850836753845215,
"learning_rate": 4.000000000000001e-06,
"loss": 3.2364,
"step": 2
},
{
"epoch": 0.000988101609782206,
"grad_norm": 3.7942073345184326,
"learning_rate": 6e-06,
"loss": 3.3337,
"step": 3
},
{
"epoch": 0.0013174688130429412,
"grad_norm": 4.045947074890137,
"learning_rate": 8.000000000000001e-06,
"loss": 3.523,
"step": 4
},
{
"epoch": 0.0016468360163036766,
"grad_norm": 3.6181905269622803,
"learning_rate": 1e-05,
"loss": 3.1772,
"step": 5
},
{
"epoch": 0.001976203219564412,
"grad_norm": 4.814149379730225,
"learning_rate": 1.2e-05,
"loss": 3.871,
"step": 6
},
{
"epoch": 0.002305570422825147,
"grad_norm": 4.0820112228393555,
"learning_rate": 1.4000000000000001e-05,
"loss": 3.7199,
"step": 7
},
{
"epoch": 0.0026349376260858823,
"grad_norm": 4.483249187469482,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.5852,
"step": 8
},
{
"epoch": 0.002964304829346618,
"grad_norm": 5.03192138671875,
"learning_rate": 1.8e-05,
"loss": 3.5165,
"step": 9
},
{
"epoch": 0.003293672032607353,
"grad_norm": 4.41070556640625,
"learning_rate": 2e-05,
"loss": 3.1522,
"step": 10
},
{
"epoch": 0.0036230392358680883,
"grad_norm": 4.54289436340332,
"learning_rate": 2.2000000000000003e-05,
"loss": 3.113,
"step": 11
},
{
"epoch": 0.003952406439128824,
"grad_norm": 5.175803184509277,
"learning_rate": 2.4e-05,
"loss": 3.0833,
"step": 12
},
{
"epoch": 0.004281773642389559,
"grad_norm": 5.162367820739746,
"learning_rate": 2.6000000000000002e-05,
"loss": 3.2495,
"step": 13
},
{
"epoch": 0.004611140845650294,
"grad_norm": 5.13967227935791,
"learning_rate": 2.8000000000000003e-05,
"loss": 3.1866,
"step": 14
},
{
"epoch": 0.0049405080489110294,
"grad_norm": 4.819941520690918,
"learning_rate": 3e-05,
"loss": 2.8483,
"step": 15
},
{
"epoch": 0.005269875252171765,
"grad_norm": 4.200347900390625,
"learning_rate": 3.2000000000000005e-05,
"loss": 3.0112,
"step": 16
},
{
"epoch": 0.005599242455432501,
"grad_norm": 3.748765230178833,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.4482,
"step": 17
},
{
"epoch": 0.005928609658693236,
"grad_norm": 4.706761360168457,
"learning_rate": 3.6e-05,
"loss": 2.7336,
"step": 18
},
{
"epoch": 0.006257976861953971,
"grad_norm": 4.122259616851807,
"learning_rate": 3.8e-05,
"loss": 2.2935,
"step": 19
},
{
"epoch": 0.006587344065214706,
"grad_norm": 3.4756200313568115,
"learning_rate": 4e-05,
"loss": 2.4558,
"step": 20
},
{
"epoch": 0.006916711268475441,
"grad_norm": 4.478485107421875,
"learning_rate": 4.2e-05,
"loss": 2.7799,
"step": 21
},
{
"epoch": 0.007246078471736177,
"grad_norm": 3.8455657958984375,
"learning_rate": 4.4000000000000006e-05,
"loss": 2.3758,
"step": 22
},
{
"epoch": 0.007575445674996912,
"grad_norm": 4.439505100250244,
"learning_rate": 4.600000000000001e-05,
"loss": 2.4312,
"step": 23
},
{
"epoch": 0.007904812878257648,
"grad_norm": 5.45050048828125,
"learning_rate": 4.8e-05,
"loss": 2.3483,
"step": 24
},
{
"epoch": 0.008234180081518383,
"grad_norm": 6.179609298706055,
"learning_rate": 5e-05,
"loss": 2.1845,
"step": 25
},
{
"epoch": 0.008563547284779118,
"grad_norm": 3.980731964111328,
"learning_rate": 5.2000000000000004e-05,
"loss": 2.7766,
"step": 26
},
{
"epoch": 0.008892914488039853,
"grad_norm": 3.381983995437622,
"learning_rate": 5.4000000000000005e-05,
"loss": 2.5084,
"step": 27
},
{
"epoch": 0.009222281691300589,
"grad_norm": 3.6727781295776367,
"learning_rate": 5.6000000000000006e-05,
"loss": 2.8683,
"step": 28
},
{
"epoch": 0.009551648894561324,
"grad_norm": 3.6398210525512695,
"learning_rate": 5.8e-05,
"loss": 2.4222,
"step": 29
},
{
"epoch": 0.009881016097822059,
"grad_norm": 3.005326986312866,
"learning_rate": 6e-05,
"loss": 2.5306,
"step": 30
},
{
"epoch": 0.010210383301082794,
"grad_norm": 3.8456501960754395,
"learning_rate": 6.2e-05,
"loss": 2.7113,
"step": 31
},
{
"epoch": 0.01053975050434353,
"grad_norm": 2.880403518676758,
"learning_rate": 6.400000000000001e-05,
"loss": 2.3046,
"step": 32
},
{
"epoch": 0.010869117707604266,
"grad_norm": 3.180576801300049,
"learning_rate": 6.6e-05,
"loss": 2.5024,
"step": 33
},
{
"epoch": 0.011198484910865001,
"grad_norm": 3.2660038471221924,
"learning_rate": 6.800000000000001e-05,
"loss": 2.6287,
"step": 34
},
{
"epoch": 0.011527852114125737,
"grad_norm": 2.8470723628997803,
"learning_rate": 7e-05,
"loss": 2.3198,
"step": 35
},
{
"epoch": 0.011857219317386472,
"grad_norm": 3.0491487979888916,
"learning_rate": 7.2e-05,
"loss": 2.3468,
"step": 36
},
{
"epoch": 0.012186586520647207,
"grad_norm": 3.558762788772583,
"learning_rate": 7.4e-05,
"loss": 2.5952,
"step": 37
},
{
"epoch": 0.012515953723907942,
"grad_norm": 3.0420889854431152,
"learning_rate": 7.6e-05,
"loss": 2.2754,
"step": 38
},
{
"epoch": 0.012845320927168677,
"grad_norm": 2.9328525066375732,
"learning_rate": 7.800000000000001e-05,
"loss": 1.9192,
"step": 39
},
{
"epoch": 0.013174688130429412,
"grad_norm": 2.9751060009002686,
"learning_rate": 8e-05,
"loss": 2.4325,
"step": 40
},
{
"epoch": 0.013504055333690148,
"grad_norm": 2.924984931945801,
"learning_rate": 8.2e-05,
"loss": 2.376,
"step": 41
},
{
"epoch": 0.013833422536950883,
"grad_norm": 2.9344444274902344,
"learning_rate": 8.4e-05,
"loss": 2.3072,
"step": 42
},
{
"epoch": 0.014162789740211618,
"grad_norm": 3.2519760131835938,
"learning_rate": 8.6e-05,
"loss": 2.3017,
"step": 43
},
{
"epoch": 0.014492156943472353,
"grad_norm": 2.9699273109436035,
"learning_rate": 8.800000000000001e-05,
"loss": 2.2801,
"step": 44
},
{
"epoch": 0.014821524146733088,
"grad_norm": 3.125183343887329,
"learning_rate": 9e-05,
"loss": 2.1891,
"step": 45
},
{
"epoch": 0.015150891349993824,
"grad_norm": 4.185096740722656,
"learning_rate": 9.200000000000001e-05,
"loss": 2.3958,
"step": 46
},
{
"epoch": 0.01548025855325456,
"grad_norm": 4.01360559463501,
"learning_rate": 9.4e-05,
"loss": 2.3923,
"step": 47
},
{
"epoch": 0.015809625756515296,
"grad_norm": 3.355820417404175,
"learning_rate": 9.6e-05,
"loss": 2.0905,
"step": 48
},
{
"epoch": 0.01613899295977603,
"grad_norm": 4.308192729949951,
"learning_rate": 9.8e-05,
"loss": 2.0585,
"step": 49
},
{
"epoch": 0.016468360163036766,
"grad_norm": 5.098552227020264,
"learning_rate": 0.0001,
"loss": 2.7164,
"step": 50
},
{
"epoch": 0.0167977273662975,
"grad_norm": 2.8389906883239746,
"learning_rate": 9.999997232675378e-05,
"loss": 2.7739,
"step": 51
},
{
"epoch": 0.017127094569558236,
"grad_norm": 3.0663294792175293,
"learning_rate": 9.999988930704576e-05,
"loss": 2.7123,
"step": 52
},
{
"epoch": 0.01745646177281897,
"grad_norm": 3.3522698879241943,
"learning_rate": 9.99997509409678e-05,
"loss": 2.4191,
"step": 53
},
{
"epoch": 0.017785828976079707,
"grad_norm": 2.815250873565674,
"learning_rate": 9.999955722867312e-05,
"loss": 2.6329,
"step": 54
},
{
"epoch": 0.018115196179340442,
"grad_norm": 2.9915289878845215,
"learning_rate": 9.999930817037609e-05,
"loss": 2.4786,
"step": 55
},
{
"epoch": 0.018444563382601177,
"grad_norm": 3.284151554107666,
"learning_rate": 9.999900376635243e-05,
"loss": 2.4249,
"step": 56
},
{
"epoch": 0.018773930585861912,
"grad_norm": 2.4534027576446533,
"learning_rate": 9.999864401693908e-05,
"loss": 2.356,
"step": 57
},
{
"epoch": 0.019103297789122647,
"grad_norm": 2.522468328475952,
"learning_rate": 9.999822892253428e-05,
"loss": 2.3852,
"step": 58
},
{
"epoch": 0.019432664992383383,
"grad_norm": 2.9176650047302246,
"learning_rate": 9.999775848359749e-05,
"loss": 2.2109,
"step": 59
},
{
"epoch": 0.019762032195644118,
"grad_norm": 3.167917013168335,
"learning_rate": 9.999723270064945e-05,
"loss": 2.3511,
"step": 60
},
{
"epoch": 0.020091399398904853,
"grad_norm": 2.6770145893096924,
"learning_rate": 9.999665157427218e-05,
"loss": 2.6865,
"step": 61
},
{
"epoch": 0.020420766602165588,
"grad_norm": 2.729907989501953,
"learning_rate": 9.999601510510895e-05,
"loss": 2.3012,
"step": 62
},
{
"epoch": 0.020750133805426323,
"grad_norm": 2.7215871810913086,
"learning_rate": 9.999532329386425e-05,
"loss": 2.0514,
"step": 63
},
{
"epoch": 0.02107950100868706,
"grad_norm": 2.7751474380493164,
"learning_rate": 9.999457614130391e-05,
"loss": 2.4867,
"step": 64
},
{
"epoch": 0.021408868211947794,
"grad_norm": 2.4375815391540527,
"learning_rate": 9.999377364825496e-05,
"loss": 2.2029,
"step": 65
},
{
"epoch": 0.021738235415208532,
"grad_norm": 2.7066309452056885,
"learning_rate": 9.99929158156057e-05,
"loss": 2.1274,
"step": 66
},
{
"epoch": 0.022067602618469268,
"grad_norm": 3.297457695007324,
"learning_rate": 9.999200264430569e-05,
"loss": 2.5863,
"step": 67
},
{
"epoch": 0.022396969821730003,
"grad_norm": 3.2771711349487305,
"learning_rate": 9.999103413536574e-05,
"loss": 2.334,
"step": 68
},
{
"epoch": 0.022726337024990738,
"grad_norm": 2.9507834911346436,
"learning_rate": 9.999001028985795e-05,
"loss": 2.3783,
"step": 69
},
{
"epoch": 0.023055704228251473,
"grad_norm": 2.6401610374450684,
"learning_rate": 9.998893110891562e-05,
"loss": 1.9256,
"step": 70
},
{
"epoch": 0.023385071431512208,
"grad_norm": 3.0901267528533936,
"learning_rate": 9.998779659373332e-05,
"loss": 2.1796,
"step": 71
},
{
"epoch": 0.023714438634772943,
"grad_norm": 3.132500171661377,
"learning_rate": 9.99866067455669e-05,
"loss": 2.2996,
"step": 72
},
{
"epoch": 0.02404380583803368,
"grad_norm": 3.1304476261138916,
"learning_rate": 9.998536156573343e-05,
"loss": 2.1264,
"step": 73
},
{
"epoch": 0.024373173041294414,
"grad_norm": 3.315488576889038,
"learning_rate": 9.998406105561125e-05,
"loss": 1.984,
"step": 74
},
{
"epoch": 0.02470254024455515,
"grad_norm": 7.0925774574279785,
"learning_rate": 9.99827052166399e-05,
"loss": 2.533,
"step": 75
},
{
"epoch": 0.025031907447815884,
"grad_norm": 2.485670328140259,
"learning_rate": 9.998129405032022e-05,
"loss": 2.4111,
"step": 76
},
{
"epoch": 0.02536127465107662,
"grad_norm": 2.9346237182617188,
"learning_rate": 9.997982755821428e-05,
"loss": 2.4954,
"step": 77
},
{
"epoch": 0.025690641854337355,
"grad_norm": 2.6853690147399902,
"learning_rate": 9.997830574194538e-05,
"loss": 2.7849,
"step": 78
},
{
"epoch": 0.02602000905759809,
"grad_norm": 2.614497661590576,
"learning_rate": 9.997672860319804e-05,
"loss": 2.7054,
"step": 79
},
{
"epoch": 0.026349376260858825,
"grad_norm": 2.476630687713623,
"learning_rate": 9.997509614371807e-05,
"loss": 2.3267,
"step": 80
},
{
"epoch": 0.02667874346411956,
"grad_norm": 2.7469708919525146,
"learning_rate": 9.997340836531249e-05,
"loss": 2.3416,
"step": 81
},
{
"epoch": 0.027008110667380295,
"grad_norm": 2.3859927654266357,
"learning_rate": 9.997166526984954e-05,
"loss": 2.1941,
"step": 82
},
{
"epoch": 0.02733747787064103,
"grad_norm": 2.3550686836242676,
"learning_rate": 9.996986685925868e-05,
"loss": 2.2731,
"step": 83
},
{
"epoch": 0.027666845073901766,
"grad_norm": 2.6245596408843994,
"learning_rate": 9.996801313553068e-05,
"loss": 2.1934,
"step": 84
},
{
"epoch": 0.0279962122771625,
"grad_norm": 2.337979793548584,
"learning_rate": 9.996610410071742e-05,
"loss": 2.0485,
"step": 85
},
{
"epoch": 0.028325579480423236,
"grad_norm": 2.2919275760650635,
"learning_rate": 9.996413975693214e-05,
"loss": 2.1097,
"step": 86
},
{
"epoch": 0.02865494668368397,
"grad_norm": 2.4434707164764404,
"learning_rate": 9.996212010634917e-05,
"loss": 2.1181,
"step": 87
},
{
"epoch": 0.028984313886944706,
"grad_norm": 2.2074220180511475,
"learning_rate": 9.996004515120414e-05,
"loss": 2.1341,
"step": 88
},
{
"epoch": 0.02931368109020544,
"grad_norm": 2.670886993408203,
"learning_rate": 9.995791489379388e-05,
"loss": 2.2701,
"step": 89
},
{
"epoch": 0.029643048293466177,
"grad_norm": 3.0483765602111816,
"learning_rate": 9.995572933647643e-05,
"loss": 2.2896,
"step": 90
},
{
"epoch": 0.029972415496726912,
"grad_norm": 2.74815034866333,
"learning_rate": 9.995348848167107e-05,
"loss": 2.3094,
"step": 91
},
{
"epoch": 0.030301782699987647,
"grad_norm": 3.188274621963501,
"learning_rate": 9.995119233185825e-05,
"loss": 2.3274,
"step": 92
},
{
"epoch": 0.030631149903248386,
"grad_norm": 2.603101968765259,
"learning_rate": 9.994884088957966e-05,
"loss": 2.0735,
"step": 93
},
{
"epoch": 0.03096051710650912,
"grad_norm": 2.7947723865509033,
"learning_rate": 9.994643415743817e-05,
"loss": 2.571,
"step": 94
},
{
"epoch": 0.03128988430976985,
"grad_norm": 2.8341128826141357,
"learning_rate": 9.994397213809786e-05,
"loss": 2.4703,
"step": 95
},
{
"epoch": 0.03161925151303059,
"grad_norm": 2.8162853717803955,
"learning_rate": 9.994145483428403e-05,
"loss": 2.2555,
"step": 96
},
{
"epoch": 0.03194861871629132,
"grad_norm": 2.9116039276123047,
"learning_rate": 9.993888224878313e-05,
"loss": 2.0821,
"step": 97
},
{
"epoch": 0.03227798591955206,
"grad_norm": 3.1359169483184814,
"learning_rate": 9.993625438444287e-05,
"loss": 1.8411,
"step": 98
},
{
"epoch": 0.03260735312281279,
"grad_norm": 3.9783849716186523,
"learning_rate": 9.993357124417209e-05,
"loss": 2.3985,
"step": 99
},
{
"epoch": 0.03293672032607353,
"grad_norm": 3.3506076335906982,
"learning_rate": 9.993083283094084e-05,
"loss": 2.2946,
"step": 100
},
{
"epoch": 0.033266087529334264,
"grad_norm": 2.503321409225464,
"learning_rate": 9.992803914778034e-05,
"loss": 2.6129,
"step": 101
},
{
"epoch": 0.033595454732595,
"grad_norm": 3.6384737491607666,
"learning_rate": 9.992519019778301e-05,
"loss": 2.6609,
"step": 102
},
{
"epoch": 0.033924821935855734,
"grad_norm": 2.3811521530151367,
"learning_rate": 9.992228598410244e-05,
"loss": 2.3603,
"step": 103
},
{
"epoch": 0.03425418913911647,
"grad_norm": 2.4915661811828613,
"learning_rate": 9.991932650995341e-05,
"loss": 2.2988,
"step": 104
},
{
"epoch": 0.034583556342377204,
"grad_norm": 2.7037079334259033,
"learning_rate": 9.991631177861182e-05,
"loss": 2.4169,
"step": 105
},
{
"epoch": 0.03491292354563794,
"grad_norm": 2.6235880851745605,
"learning_rate": 9.991324179341478e-05,
"loss": 2.2463,
"step": 106
},
{
"epoch": 0.03524229074889868,
"grad_norm": 2.3382320404052734,
"learning_rate": 9.991011655776055e-05,
"loss": 2.2194,
"step": 107
},
{
"epoch": 0.03557165795215941,
"grad_norm": 2.669034957885742,
"learning_rate": 9.990693607510853e-05,
"loss": 2.4065,
"step": 108
},
{
"epoch": 0.03590102515542015,
"grad_norm": 2.219449281692505,
"learning_rate": 9.990370034897931e-05,
"loss": 2.0945,
"step": 109
},
{
"epoch": 0.036230392358680884,
"grad_norm": 3.07869291305542,
"learning_rate": 9.99004093829546e-05,
"loss": 2.3285,
"step": 110
},
{
"epoch": 0.03655975956194162,
"grad_norm": 2.4732134342193604,
"learning_rate": 9.989706318067728e-05,
"loss": 2.0391,
"step": 111
},
{
"epoch": 0.036889126765202354,
"grad_norm": 2.9737696647644043,
"learning_rate": 9.989366174585135e-05,
"loss": 2.0482,
"step": 112
},
{
"epoch": 0.03721849396846309,
"grad_norm": 2.8496570587158203,
"learning_rate": 9.989020508224197e-05,
"loss": 2.3701,
"step": 113
},
{
"epoch": 0.037547861171723824,
"grad_norm": 3.108743190765381,
"learning_rate": 9.98866931936754e-05,
"loss": 2.4347,
"step": 114
},
{
"epoch": 0.03787722837498456,
"grad_norm": 2.642160177230835,
"learning_rate": 9.988312608403909e-05,
"loss": 2.3828,
"step": 115
},
{
"epoch": 0.038206595578245295,
"grad_norm": 2.7258148193359375,
"learning_rate": 9.987950375728157e-05,
"loss": 2.0533,
"step": 116
},
{
"epoch": 0.038535962781506033,
"grad_norm": 2.6295080184936523,
"learning_rate": 9.98758262174125e-05,
"loss": 2.3752,
"step": 117
},
{
"epoch": 0.038865329984766765,
"grad_norm": 2.905092477798462,
"learning_rate": 9.987209346850263e-05,
"loss": 2.031,
"step": 118
},
{
"epoch": 0.039194697188027504,
"grad_norm": 2.780714988708496,
"learning_rate": 9.986830551468388e-05,
"loss": 2.0643,
"step": 119
},
{
"epoch": 0.039524064391288236,
"grad_norm": 2.858346939086914,
"learning_rate": 9.986446236014925e-05,
"loss": 2.3293,
"step": 120
},
{
"epoch": 0.039853431594548974,
"grad_norm": 3.3452649116516113,
"learning_rate": 9.986056400915284e-05,
"loss": 2.4871,
"step": 121
},
{
"epoch": 0.040182798797809706,
"grad_norm": 2.921239137649536,
"learning_rate": 9.985661046600984e-05,
"loss": 2.3667,
"step": 122
},
{
"epoch": 0.040512166001070445,
"grad_norm": 3.3637163639068604,
"learning_rate": 9.985260173509656e-05,
"loss": 2.2286,
"step": 123
},
{
"epoch": 0.040841533204331176,
"grad_norm": 2.8042445182800293,
"learning_rate": 9.984853782085035e-05,
"loss": 1.9868,
"step": 124
},
{
"epoch": 0.041170900407591915,
"grad_norm": 3.1596014499664307,
"learning_rate": 9.984441872776973e-05,
"loss": 1.9366,
"step": 125
},
{
"epoch": 0.04150026761085265,
"grad_norm": 2.2009477615356445,
"learning_rate": 9.984024446041423e-05,
"loss": 2.7554,
"step": 126
},
{
"epoch": 0.041829634814113385,
"grad_norm": 2.5528178215026855,
"learning_rate": 9.983601502340443e-05,
"loss": 2.4577,
"step": 127
},
{
"epoch": 0.04215900201737412,
"grad_norm": 2.445089101791382,
"learning_rate": 9.983173042142208e-05,
"loss": 2.3525,
"step": 128
},
{
"epoch": 0.042488369220634856,
"grad_norm": 2.8235416412353516,
"learning_rate": 9.98273906592099e-05,
"loss": 2.6086,
"step": 129
},
{
"epoch": 0.04281773642389559,
"grad_norm": 2.569711446762085,
"learning_rate": 9.98229957415717e-05,
"loss": 2.4181,
"step": 130
},
{
"epoch": 0.043147103627156326,
"grad_norm": 2.627051591873169,
"learning_rate": 9.981854567337237e-05,
"loss": 2.307,
"step": 131
},
{
"epoch": 0.043476470830417065,
"grad_norm": 2.5236432552337646,
"learning_rate": 9.98140404595378e-05,
"loss": 2.5002,
"step": 132
},
{
"epoch": 0.043805838033677796,
"grad_norm": 2.83263897895813,
"learning_rate": 9.980948010505493e-05,
"loss": 2.6664,
"step": 133
},
{
"epoch": 0.044135205236938535,
"grad_norm": 2.404651641845703,
"learning_rate": 9.98048646149718e-05,
"loss": 2.4746,
"step": 134
},
{
"epoch": 0.04446457244019927,
"grad_norm": 2.768198013305664,
"learning_rate": 9.980019399439741e-05,
"loss": 2.3371,
"step": 135
},
{
"epoch": 0.044793939643460005,
"grad_norm": 3.2502036094665527,
"learning_rate": 9.97954682485018e-05,
"loss": 2.7268,
"step": 136
},
{
"epoch": 0.04512330684672074,
"grad_norm": 2.356938362121582,
"learning_rate": 9.979068738251605e-05,
"loss": 2.271,
"step": 137
},
{
"epoch": 0.045452674049981476,
"grad_norm": 2.790741205215454,
"learning_rate": 9.978585140173225e-05,
"loss": 2.5781,
"step": 138
},
{
"epoch": 0.04578204125324221,
"grad_norm": 3.1573290824890137,
"learning_rate": 9.978096031150346e-05,
"loss": 2.3545,
"step": 139
},
{
"epoch": 0.046111408456502946,
"grad_norm": 2.6217617988586426,
"learning_rate": 9.977601411724382e-05,
"loss": 2.2068,
"step": 140
},
{
"epoch": 0.04644077565976368,
"grad_norm": 3.0412278175354004,
"learning_rate": 9.977101282442839e-05,
"loss": 2.386,
"step": 141
},
{
"epoch": 0.046770142863024416,
"grad_norm": 3.4128334522247314,
"learning_rate": 9.976595643859326e-05,
"loss": 2.5365,
"step": 142
},
{
"epoch": 0.04709951006628515,
"grad_norm": 3.5312652587890625,
"learning_rate": 9.976084496533547e-05,
"loss": 2.2243,
"step": 143
},
{
"epoch": 0.04742887726954589,
"grad_norm": 2.8533828258514404,
"learning_rate": 9.97556784103131e-05,
"loss": 2.2667,
"step": 144
},
{
"epoch": 0.04775824447280662,
"grad_norm": 3.081562042236328,
"learning_rate": 9.975045677924515e-05,
"loss": 2.3508,
"step": 145
},
{
"epoch": 0.04808761167606736,
"grad_norm": 3.2083470821380615,
"learning_rate": 9.97451800779116e-05,
"loss": 2.4371,
"step": 146
},
{
"epoch": 0.04841697887932809,
"grad_norm": 3.0021450519561768,
"learning_rate": 9.973984831215337e-05,
"loss": 1.9932,
"step": 147
},
{
"epoch": 0.04874634608258883,
"grad_norm": 3.146559953689575,
"learning_rate": 9.973446148787238e-05,
"loss": 2.0892,
"step": 148
},
{
"epoch": 0.04907571328584956,
"grad_norm": 3.2698886394500732,
"learning_rate": 9.972901961103145e-05,
"loss": 1.9643,
"step": 149
},
{
"epoch": 0.0494050804891103,
"grad_norm": 3.640223503112793,
"learning_rate": 9.972352268765434e-05,
"loss": 2.0784,
"step": 150
},
{
"epoch": 0.04973444769237103,
"grad_norm": 2.1278653144836426,
"learning_rate": 9.971797072382579e-05,
"loss": 2.4746,
"step": 151
},
{
"epoch": 0.05006381489563177,
"grad_norm": 2.588524341583252,
"learning_rate": 9.971236372569142e-05,
"loss": 2.2406,
"step": 152
},
{
"epoch": 0.0503931820988925,
"grad_norm": 2.5750515460968018,
"learning_rate": 9.97067016994578e-05,
"loss": 2.254,
"step": 153
},
{
"epoch": 0.05072254930215324,
"grad_norm": 2.47708797454834,
"learning_rate": 9.970098465139236e-05,
"loss": 2.2676,
"step": 154
},
{
"epoch": 0.05105191650541397,
"grad_norm": 2.496344566345215,
"learning_rate": 9.969521258782351e-05,
"loss": 2.3978,
"step": 155
},
{
"epoch": 0.05138128370867471,
"grad_norm": 2.462228775024414,
"learning_rate": 9.968938551514048e-05,
"loss": 2.3217,
"step": 156
},
{
"epoch": 0.05171065091193544,
"grad_norm": 2.3593785762786865,
"learning_rate": 9.968350343979346e-05,
"loss": 2.0463,
"step": 157
},
{
"epoch": 0.05204001811519618,
"grad_norm": 2.4684934616088867,
"learning_rate": 9.967756636829348e-05,
"loss": 2.3118,
"step": 158
},
{
"epoch": 0.05236938531845692,
"grad_norm": 3.1452877521514893,
"learning_rate": 9.967157430721248e-05,
"loss": 2.3831,
"step": 159
},
{
"epoch": 0.05269875252171765,
"grad_norm": 2.745805025100708,
"learning_rate": 9.966552726318323e-05,
"loss": 2.3436,
"step": 160
},
{
"epoch": 0.05302811972497839,
"grad_norm": 2.490478038787842,
"learning_rate": 9.965942524289941e-05,
"loss": 2.2698,
"step": 161
},
{
"epoch": 0.05335748692823912,
"grad_norm": 2.3748586177825928,
"learning_rate": 9.96532682531155e-05,
"loss": 2.4236,
"step": 162
},
{
"epoch": 0.05368685413149986,
"grad_norm": 2.378679037094116,
"learning_rate": 9.964705630064686e-05,
"loss": 2.1829,
"step": 163
},
{
"epoch": 0.05401622133476059,
"grad_norm": 2.8342976570129395,
"learning_rate": 9.964078939236971e-05,
"loss": 2.3079,
"step": 164
},
{
"epoch": 0.05434558853802133,
"grad_norm": 2.9072232246398926,
"learning_rate": 9.963446753522104e-05,
"loss": 2.3423,
"step": 165
},
{
"epoch": 0.05467495574128206,
"grad_norm": 2.8593156337738037,
"learning_rate": 9.962809073619875e-05,
"loss": 2.2235,
"step": 166
},
{
"epoch": 0.0550043229445428,
"grad_norm": 2.832493543624878,
"learning_rate": 9.962165900236146e-05,
"loss": 2.2889,
"step": 167
},
{
"epoch": 0.05533369014780353,
"grad_norm": 2.806488037109375,
"learning_rate": 9.961517234082866e-05,
"loss": 2.1615,
"step": 168
},
{
"epoch": 0.05566305735106427,
"grad_norm": 3.301941394805908,
"learning_rate": 9.960863075878067e-05,
"loss": 2.2195,
"step": 169
},
{
"epoch": 0.055992424554325,
"grad_norm": 2.458503484725952,
"learning_rate": 9.960203426345851e-05,
"loss": 2.1645,
"step": 170
},
{
"epoch": 0.05632179175758574,
"grad_norm": 2.415736675262451,
"learning_rate": 9.959538286216408e-05,
"loss": 2.012,
"step": 171
},
{
"epoch": 0.05665115896084647,
"grad_norm": 2.7939391136169434,
"learning_rate": 9.958867656225997e-05,
"loss": 2.3091,
"step": 172
},
{
"epoch": 0.05698052616410721,
"grad_norm": 2.971738576889038,
"learning_rate": 9.958191537116963e-05,
"loss": 2.1566,
"step": 173
},
{
"epoch": 0.05730989336736794,
"grad_norm": 3.0159671306610107,
"learning_rate": 9.957509929637719e-05,
"loss": 2.0143,
"step": 174
},
{
"epoch": 0.05763926057062868,
"grad_norm": 3.120633125305176,
"learning_rate": 9.956822834542759e-05,
"loss": 1.8494,
"step": 175
},
{
"epoch": 0.05796862777388941,
"grad_norm": 2.3217828273773193,
"learning_rate": 9.956130252592646e-05,
"loss": 2.6393,
"step": 176
},
{
"epoch": 0.05829799497715015,
"grad_norm": 2.3948323726654053,
"learning_rate": 9.955432184554024e-05,
"loss": 2.6342,
"step": 177
},
{
"epoch": 0.05862736218041088,
"grad_norm": 2.1750946044921875,
"learning_rate": 9.9547286311996e-05,
"loss": 2.3683,
"step": 178
},
{
"epoch": 0.05895672938367162,
"grad_norm": 2.6148295402526855,
"learning_rate": 9.954019593308163e-05,
"loss": 2.5178,
"step": 179
},
{
"epoch": 0.05928609658693235,
"grad_norm": 2.6671082973480225,
"learning_rate": 9.953305071664566e-05,
"loss": 2.3501,
"step": 180
},
{
"epoch": 0.05961546379019309,
"grad_norm": 2.740058422088623,
"learning_rate": 9.952585067059734e-05,
"loss": 2.3677,
"step": 181
},
{
"epoch": 0.059944830993453824,
"grad_norm": 2.2972073554992676,
"learning_rate": 9.951859580290664e-05,
"loss": 2.5598,
"step": 182
},
{
"epoch": 0.06027419819671456,
"grad_norm": 2.5012662410736084,
"learning_rate": 9.951128612160417e-05,
"loss": 2.247,
"step": 183
},
{
"epoch": 0.060603565399975294,
"grad_norm": 2.5461273193359375,
"learning_rate": 9.950392163478121e-05,
"loss": 2.4683,
"step": 184
},
{
"epoch": 0.06093293260323603,
"grad_norm": 3.3869452476501465,
"learning_rate": 9.949650235058978e-05,
"loss": 2.5158,
"step": 185
},
{
"epoch": 0.06126229980649677,
"grad_norm": 2.2931103706359863,
"learning_rate": 9.948902827724248e-05,
"loss": 2.2837,
"step": 186
},
{
"epoch": 0.0615916670097575,
"grad_norm": 2.4342219829559326,
"learning_rate": 9.94814994230126e-05,
"loss": 2.2988,
"step": 187
},
{
"epoch": 0.06192103421301824,
"grad_norm": 2.924483299255371,
"learning_rate": 9.947391579623401e-05,
"loss": 2.4679,
"step": 188
},
{
"epoch": 0.06225040141627897,
"grad_norm": 2.381253480911255,
"learning_rate": 9.946627740530131e-05,
"loss": 2.0651,
"step": 189
},
{
"epoch": 0.0625797686195397,
"grad_norm": 2.3202030658721924,
"learning_rate": 9.945858425866962e-05,
"loss": 2.0079,
"step": 190
},
{
"epoch": 0.06290913582280044,
"grad_norm": 2.717766523361206,
"learning_rate": 9.945083636485476e-05,
"loss": 2.6815,
"step": 191
},
{
"epoch": 0.06323850302606118,
"grad_norm": 3.0983986854553223,
"learning_rate": 9.944303373243307e-05,
"loss": 2.4154,
"step": 192
},
{
"epoch": 0.06356787022932192,
"grad_norm": 2.5674819946289062,
"learning_rate": 9.943517637004151e-05,
"loss": 1.8935,
"step": 193
},
{
"epoch": 0.06389723743258265,
"grad_norm": 2.8902697563171387,
"learning_rate": 9.942726428637765e-05,
"loss": 2.19,
"step": 194
},
{
"epoch": 0.06422660463584338,
"grad_norm": 2.824228525161743,
"learning_rate": 9.941929749019961e-05,
"loss": 1.7962,
"step": 195
},
{
"epoch": 0.06455597183910412,
"grad_norm": 2.9178125858306885,
"learning_rate": 9.941127599032605e-05,
"loss": 1.9707,
"step": 196
},
{
"epoch": 0.06488533904236486,
"grad_norm": 3.112900495529175,
"learning_rate": 9.940319979563624e-05,
"loss": 2.1085,
"step": 197
},
{
"epoch": 0.06521470624562559,
"grad_norm": 2.4243252277374268,
"learning_rate": 9.939506891506993e-05,
"loss": 1.6683,
"step": 198
},
{
"epoch": 0.06554407344888633,
"grad_norm": 3.3545095920562744,
"learning_rate": 9.938688335762747e-05,
"loss": 1.9903,
"step": 199
},
{
"epoch": 0.06587344065214706,
"grad_norm": 4.020653247833252,
"learning_rate": 9.937864313236968e-05,
"loss": 1.9782,
"step": 200
},
{
"epoch": 0.0662028078554078,
"grad_norm": 2.1895785331726074,
"learning_rate": 9.93703482484179e-05,
"loss": 2.4232,
"step": 201
},
{
"epoch": 0.06653217505866853,
"grad_norm": 2.779294967651367,
"learning_rate": 9.9361998714954e-05,
"loss": 2.4474,
"step": 202
},
{
"epoch": 0.06686154226192927,
"grad_norm": 2.666214942932129,
"learning_rate": 9.935359454122033e-05,
"loss": 2.3747,
"step": 203
},
{
"epoch": 0.06719090946519,
"grad_norm": 2.4927260875701904,
"learning_rate": 9.93451357365197e-05,
"loss": 2.3229,
"step": 204
},
{
"epoch": 0.06752027666845074,
"grad_norm": 2.6281189918518066,
"learning_rate": 9.933662231021543e-05,
"loss": 2.3106,
"step": 205
},
{
"epoch": 0.06784964387171147,
"grad_norm": 2.487201452255249,
"learning_rate": 9.932805427173128e-05,
"loss": 2.1396,
"step": 206
},
{
"epoch": 0.0681790110749722,
"grad_norm": 2.6833136081695557,
"learning_rate": 9.931943163055148e-05,
"loss": 2.6855,
"step": 207
},
{
"epoch": 0.06850837827823295,
"grad_norm": 2.670117139816284,
"learning_rate": 9.931075439622069e-05,
"loss": 2.0407,
"step": 208
},
{
"epoch": 0.06883774548149368,
"grad_norm": 2.8142549991607666,
"learning_rate": 9.930202257834397e-05,
"loss": 2.5156,
"step": 209
},
{
"epoch": 0.06916711268475441,
"grad_norm": 2.5977020263671875,
"learning_rate": 9.929323618658686e-05,
"loss": 2.2659,
"step": 210
},
{
"epoch": 0.06949647988801515,
"grad_norm": 2.188446521759033,
"learning_rate": 9.928439523067526e-05,
"loss": 1.853,
"step": 211
},
{
"epoch": 0.06982584709127589,
"grad_norm": 2.691819906234741,
"learning_rate": 9.92754997203955e-05,
"loss": 2.0574,
"step": 212
},
{
"epoch": 0.07015521429453662,
"grad_norm": 2.599579334259033,
"learning_rate": 9.926654966559427e-05,
"loss": 2.1189,
"step": 213
},
{
"epoch": 0.07048458149779736,
"grad_norm": 3.0365447998046875,
"learning_rate": 9.925754507617868e-05,
"loss": 2.2052,
"step": 214
},
{
"epoch": 0.07081394870105809,
"grad_norm": 3.086376190185547,
"learning_rate": 9.924848596211618e-05,
"loss": 2.5058,
"step": 215
},
{
"epoch": 0.07114331590431883,
"grad_norm": 3.420269012451172,
"learning_rate": 9.923937233343453e-05,
"loss": 2.0489,
"step": 216
},
{
"epoch": 0.07147268310757957,
"grad_norm": 2.6292338371276855,
"learning_rate": 9.923020420022191e-05,
"loss": 2.4083,
"step": 217
},
{
"epoch": 0.0718020503108403,
"grad_norm": 3.046985387802124,
"learning_rate": 9.92209815726268e-05,
"loss": 2.2224,
"step": 218
},
{
"epoch": 0.07213141751410103,
"grad_norm": 3.1213648319244385,
"learning_rate": 9.921170446085798e-05,
"loss": 2.0798,
"step": 219
},
{
"epoch": 0.07246078471736177,
"grad_norm": 2.71501088142395,
"learning_rate": 9.920237287518462e-05,
"loss": 2.3788,
"step": 220
},
{
"epoch": 0.0727901519206225,
"grad_norm": 2.7265591621398926,
"learning_rate": 9.919298682593605e-05,
"loss": 2.0445,
"step": 221
},
{
"epoch": 0.07311951912388324,
"grad_norm": 3.3277218341827393,
"learning_rate": 9.918354632350202e-05,
"loss": 2.1541,
"step": 222
},
{
"epoch": 0.07344888632714397,
"grad_norm": 3.288353443145752,
"learning_rate": 9.917405137833249e-05,
"loss": 2.2666,
"step": 223
},
{
"epoch": 0.07377825353040471,
"grad_norm": 3.049253463745117,
"learning_rate": 9.916450200093771e-05,
"loss": 1.8277,
"step": 224
},
{
"epoch": 0.07410762073366545,
"grad_norm": 3.3985233306884766,
"learning_rate": 9.915489820188814e-05,
"loss": 1.9754,
"step": 225
},
{
"epoch": 0.07443698793692619,
"grad_norm": 2.7046058177948,
"learning_rate": 9.914523999181456e-05,
"loss": 2.3886,
"step": 226
},
{
"epoch": 0.07476635514018691,
"grad_norm": 2.472142219543457,
"learning_rate": 9.91355273814079e-05,
"loss": 2.4031,
"step": 227
},
{
"epoch": 0.07509572234344765,
"grad_norm": 2.1829640865325928,
"learning_rate": 9.912576038141933e-05,
"loss": 2.34,
"step": 228
},
{
"epoch": 0.07542508954670839,
"grad_norm": 2.4181277751922607,
"learning_rate": 9.911593900266026e-05,
"loss": 2.1865,
"step": 229
},
{
"epoch": 0.07575445674996913,
"grad_norm": 2.6987533569335938,
"learning_rate": 9.910606325600223e-05,
"loss": 2.3855,
"step": 230
},
{
"epoch": 0.07608382395322985,
"grad_norm": 2.5802574157714844,
"learning_rate": 9.909613315237702e-05,
"loss": 2.3325,
"step": 231
},
{
"epoch": 0.07641319115649059,
"grad_norm": 2.356382369995117,
"learning_rate": 9.90861487027766e-05,
"loss": 2.1446,
"step": 232
},
{
"epoch": 0.07674255835975133,
"grad_norm": 2.3594143390655518,
"learning_rate": 9.907610991825298e-05,
"loss": 2.204,
"step": 233
},
{
"epoch": 0.07707192556301207,
"grad_norm": 2.47929310798645,
"learning_rate": 9.906601680991842e-05,
"loss": 2.2276,
"step": 234
},
{
"epoch": 0.07740129276627279,
"grad_norm": 3.100281000137329,
"learning_rate": 9.905586938894531e-05,
"loss": 2.3447,
"step": 235
},
{
"epoch": 0.07773065996953353,
"grad_norm": 2.3578147888183594,
"learning_rate": 9.904566766656612e-05,
"loss": 2.1765,
"step": 236
},
{
"epoch": 0.07806002717279427,
"grad_norm": 2.6452767848968506,
"learning_rate": 9.903541165407341e-05,
"loss": 2.2725,
"step": 237
},
{
"epoch": 0.07838939437605501,
"grad_norm": 2.8530819416046143,
"learning_rate": 9.902510136281989e-05,
"loss": 2.1286,
"step": 238
},
{
"epoch": 0.07871876157931575,
"grad_norm": 3.382469892501831,
"learning_rate": 9.901473680421833e-05,
"loss": 2.5109,
"step": 239
},
{
"epoch": 0.07904812878257647,
"grad_norm": 2.7474164962768555,
"learning_rate": 9.900431798974158e-05,
"loss": 2.0808,
"step": 240
},
{
"epoch": 0.07937749598583721,
"grad_norm": 2.889378070831299,
"learning_rate": 9.899384493092252e-05,
"loss": 2.6418,
"step": 241
},
{
"epoch": 0.07970686318909795,
"grad_norm": 2.788848876953125,
"learning_rate": 9.89833176393541e-05,
"loss": 2.4061,
"step": 242
},
{
"epoch": 0.08003623039235869,
"grad_norm": 2.540822982788086,
"learning_rate": 9.897273612668927e-05,
"loss": 1.9808,
"step": 243
},
{
"epoch": 0.08036559759561941,
"grad_norm": 2.4531843662261963,
"learning_rate": 9.896210040464105e-05,
"loss": 1.9014,
"step": 244
},
{
"epoch": 0.08069496479888015,
"grad_norm": 3.2541840076446533,
"learning_rate": 9.895141048498244e-05,
"loss": 2.5161,
"step": 245
},
{
"epoch": 0.08102433200214089,
"grad_norm": 2.396268367767334,
"learning_rate": 9.89406663795464e-05,
"loss": 1.944,
"step": 246
},
{
"epoch": 0.08135369920540163,
"grad_norm": 3.345994710922241,
"learning_rate": 9.892986810022594e-05,
"loss": 2.4834,
"step": 247
},
{
"epoch": 0.08168306640866235,
"grad_norm": 2.90889048576355,
"learning_rate": 9.891901565897397e-05,
"loss": 2.041,
"step": 248
},
{
"epoch": 0.08201243361192309,
"grad_norm": 3.424887180328369,
"learning_rate": 9.89081090678034e-05,
"loss": 2.1515,
"step": 249
},
{
"epoch": 0.08234180081518383,
"grad_norm": 3.129890203475952,
"learning_rate": 9.889714833878705e-05,
"loss": 1.5253,
"step": 250
},
{
"epoch": 0.08267116801844457,
"grad_norm": 2.033660411834717,
"learning_rate": 9.888613348405766e-05,
"loss": 2.4284,
"step": 251
},
{
"epoch": 0.0830005352217053,
"grad_norm": 2.726742744445801,
"learning_rate": 9.887506451580794e-05,
"loss": 2.5739,
"step": 252
},
{
"epoch": 0.08332990242496603,
"grad_norm": 2.1317944526672363,
"learning_rate": 9.886394144629044e-05,
"loss": 2.3195,
"step": 253
},
{
"epoch": 0.08365926962822677,
"grad_norm": 2.2404332160949707,
"learning_rate": 9.885276428781763e-05,
"loss": 2.6881,
"step": 254
},
{
"epoch": 0.08398863683148751,
"grad_norm": 2.376636028289795,
"learning_rate": 9.88415330527618e-05,
"loss": 2.2499,
"step": 255
},
{
"epoch": 0.08431800403474823,
"grad_norm": 2.24052095413208,
"learning_rate": 9.88302477535552e-05,
"loss": 2.4867,
"step": 256
},
{
"epoch": 0.08464737123800897,
"grad_norm": 2.6284337043762207,
"learning_rate": 9.881890840268981e-05,
"loss": 2.5267,
"step": 257
},
{
"epoch": 0.08497673844126971,
"grad_norm": 2.5630033016204834,
"learning_rate": 9.880751501271755e-05,
"loss": 2.3627,
"step": 258
},
{
"epoch": 0.08530610564453045,
"grad_norm": 2.3372888565063477,
"learning_rate": 9.879606759625004e-05,
"loss": 2.4922,
"step": 259
},
{
"epoch": 0.08563547284779117,
"grad_norm": 2.3916890621185303,
"learning_rate": 9.878456616595882e-05,
"loss": 2.1065,
"step": 260
},
{
"epoch": 0.08596484005105191,
"grad_norm": 2.6594431400299072,
"learning_rate": 9.877301073457515e-05,
"loss": 2.034,
"step": 261
},
{
"epoch": 0.08629420725431265,
"grad_norm": 2.3828513622283936,
"learning_rate": 9.876140131489008e-05,
"loss": 1.9415,
"step": 262
},
{
"epoch": 0.08662357445757339,
"grad_norm": 2.558377265930176,
"learning_rate": 9.874973791975442e-05,
"loss": 2.1253,
"step": 263
},
{
"epoch": 0.08695294166083413,
"grad_norm": 2.41732120513916,
"learning_rate": 9.873802056207872e-05,
"loss": 2.4188,
"step": 264
},
{
"epoch": 0.08728230886409485,
"grad_norm": 2.653940439224243,
"learning_rate": 9.872624925483329e-05,
"loss": 2.1664,
"step": 265
},
{
"epoch": 0.08761167606735559,
"grad_norm": 2.6138205528259277,
"learning_rate": 9.871442401104816e-05,
"loss": 1.9422,
"step": 266
},
{
"epoch": 0.08794104327061633,
"grad_norm": 2.4396393299102783,
"learning_rate": 9.870254484381299e-05,
"loss": 2.2988,
"step": 267
},
{
"epoch": 0.08827041047387707,
"grad_norm": 3.269818067550659,
"learning_rate": 9.869061176627724e-05,
"loss": 2.4621,
"step": 268
},
{
"epoch": 0.0885997776771378,
"grad_norm": 2.802405595779419,
"learning_rate": 9.867862479164996e-05,
"loss": 2.0724,
"step": 269
},
{
"epoch": 0.08892914488039853,
"grad_norm": 3.132948160171509,
"learning_rate": 9.866658393319988e-05,
"loss": 2.3857,
"step": 270
},
{
"epoch": 0.08925851208365927,
"grad_norm": 3.446258544921875,
"learning_rate": 9.865448920425541e-05,
"loss": 2.1038,
"step": 271
},
{
"epoch": 0.08958787928692001,
"grad_norm": 3.5878865718841553,
"learning_rate": 9.864234061820458e-05,
"loss": 2.4794,
"step": 272
},
{
"epoch": 0.08991724649018074,
"grad_norm": 2.7170724868774414,
"learning_rate": 9.863013818849499e-05,
"loss": 2.0187,
"step": 273
},
{
"epoch": 0.09024661369344147,
"grad_norm": 3.4046213626861572,
"learning_rate": 9.861788192863388e-05,
"loss": 1.9167,
"step": 274
},
{
"epoch": 0.09057598089670221,
"grad_norm": 4.091611385345459,
"learning_rate": 9.860557185218808e-05,
"loss": 2.203,
"step": 275
},
{
"epoch": 0.09090534809996295,
"grad_norm": 1.9869258403778076,
"learning_rate": 9.859320797278397e-05,
"loss": 2.4434,
"step": 276
},
{
"epoch": 0.09123471530322368,
"grad_norm": 2.5179264545440674,
"learning_rate": 9.85807903041075e-05,
"loss": 2.2607,
"step": 277
},
{
"epoch": 0.09156408250648441,
"grad_norm": 2.7460920810699463,
"learning_rate": 9.856831885990416e-05,
"loss": 2.5456,
"step": 278
},
{
"epoch": 0.09189344970974515,
"grad_norm": 2.803849458694458,
"learning_rate": 9.855579365397898e-05,
"loss": 2.5656,
"step": 279
},
{
"epoch": 0.09222281691300589,
"grad_norm": 2.4078571796417236,
"learning_rate": 9.854321470019646e-05,
"loss": 2.4701,
"step": 280
},
{
"epoch": 0.09255218411626662,
"grad_norm": 2.735297679901123,
"learning_rate": 9.853058201248063e-05,
"loss": 2.4882,
"step": 281
},
{
"epoch": 0.09288155131952736,
"grad_norm": 2.441884756088257,
"learning_rate": 9.851789560481499e-05,
"loss": 2.2741,
"step": 282
},
{
"epoch": 0.0932109185227881,
"grad_norm": 2.480804681777954,
"learning_rate": 9.85051554912425e-05,
"loss": 2.2021,
"step": 283
},
{
"epoch": 0.09354028572604883,
"grad_norm": 2.59104585647583,
"learning_rate": 9.849236168586558e-05,
"loss": 2.7452,
"step": 284
},
{
"epoch": 0.09386965292930956,
"grad_norm": 2.392718553543091,
"learning_rate": 9.847951420284605e-05,
"loss": 2.231,
"step": 285
},
{
"epoch": 0.0941990201325703,
"grad_norm": 2.880892753601074,
"learning_rate": 9.84666130564052e-05,
"loss": 2.5514,
"step": 286
},
{
"epoch": 0.09452838733583104,
"grad_norm": 2.457305431365967,
"learning_rate": 9.845365826082368e-05,
"loss": 2.261,
"step": 287
},
{
"epoch": 0.09485775453909177,
"grad_norm": 2.8255691528320312,
"learning_rate": 9.844064983044157e-05,
"loss": 2.4296,
"step": 288
},
{
"epoch": 0.0951871217423525,
"grad_norm": 2.97965407371521,
"learning_rate": 9.842758777965824e-05,
"loss": 2.3662,
"step": 289
},
{
"epoch": 0.09551648894561324,
"grad_norm": 2.454676389694214,
"learning_rate": 9.841447212293249e-05,
"loss": 2.1213,
"step": 290
},
{
"epoch": 0.09584585614887398,
"grad_norm": 3.022413492202759,
"learning_rate": 9.840130287478245e-05,
"loss": 2.4408,
"step": 291
},
{
"epoch": 0.09617522335213471,
"grad_norm": 3.0308666229248047,
"learning_rate": 9.83880800497855e-05,
"loss": 2.5066,
"step": 292
},
{
"epoch": 0.09650459055539545,
"grad_norm": 2.674705743789673,
"learning_rate": 9.837480366257844e-05,
"loss": 2.2084,
"step": 293
},
{
"epoch": 0.09683395775865618,
"grad_norm": 2.988152503967285,
"learning_rate": 9.836147372785726e-05,
"loss": 2.2093,
"step": 294
},
{
"epoch": 0.09716332496191692,
"grad_norm": 2.502009868621826,
"learning_rate": 9.834809026037728e-05,
"loss": 2.259,
"step": 295
},
{
"epoch": 0.09749269216517766,
"grad_norm": 2.590487241744995,
"learning_rate": 9.833465327495306e-05,
"loss": 2.1714,
"step": 296
},
{
"epoch": 0.0978220593684384,
"grad_norm": 2.34224534034729,
"learning_rate": 9.83211627864584e-05,
"loss": 1.7337,
"step": 297
},
{
"epoch": 0.09815142657169912,
"grad_norm": 2.985863447189331,
"learning_rate": 9.83076188098263e-05,
"loss": 2.0128,
"step": 298
},
{
"epoch": 0.09848079377495986,
"grad_norm": 2.8273167610168457,
"learning_rate": 9.829402136004904e-05,
"loss": 2.0277,
"step": 299
},
{
"epoch": 0.0988101609782206,
"grad_norm": 3.533780813217163,
"learning_rate": 9.8280370452178e-05,
"loss": 2.0994,
"step": 300
},
{
"epoch": 0.09913952818148133,
"grad_norm": 2.0469958782196045,
"learning_rate": 9.82666661013238e-05,
"loss": 2.4583,
"step": 301
},
{
"epoch": 0.09946889538474206,
"grad_norm": 2.3586039543151855,
"learning_rate": 9.825290832265617e-05,
"loss": 2.4677,
"step": 302
},
{
"epoch": 0.0997982625880028,
"grad_norm": 2.269946813583374,
"learning_rate": 9.823909713140404e-05,
"loss": 2.3393,
"step": 303
},
{
"epoch": 0.10012762979126354,
"grad_norm": 2.338125705718994,
"learning_rate": 9.82252325428554e-05,
"loss": 2.4787,
"step": 304
},
{
"epoch": 0.10045699699452428,
"grad_norm": 2.1585426330566406,
"learning_rate": 9.821131457235739e-05,
"loss": 2.5393,
"step": 305
},
{
"epoch": 0.100786364197785,
"grad_norm": 2.6568026542663574,
"learning_rate": 9.819734323531624e-05,
"loss": 2.5194,
"step": 306
},
{
"epoch": 0.10111573140104574,
"grad_norm": 2.206839084625244,
"learning_rate": 9.818331854719722e-05,
"loss": 2.4154,
"step": 307
},
{
"epoch": 0.10144509860430648,
"grad_norm": 2.444082498550415,
"learning_rate": 9.816924052352468e-05,
"loss": 2.2583,
"step": 308
},
{
"epoch": 0.10177446580756722,
"grad_norm": 2.4031789302825928,
"learning_rate": 9.815510917988202e-05,
"loss": 2.5014,
"step": 309
},
{
"epoch": 0.10210383301082794,
"grad_norm": 2.008598566055298,
"learning_rate": 9.814092453191163e-05,
"loss": 2.1755,
"step": 310
},
{
"epoch": 0.10243320021408868,
"grad_norm": 2.6430673599243164,
"learning_rate": 9.812668659531492e-05,
"loss": 2.4391,
"step": 311
},
{
"epoch": 0.10276256741734942,
"grad_norm": 2.4818711280822754,
"learning_rate": 9.811239538585229e-05,
"loss": 2.2518,
"step": 312
},
{
"epoch": 0.10309193462061016,
"grad_norm": 2.733666181564331,
"learning_rate": 9.80980509193431e-05,
"loss": 2.3118,
"step": 313
},
{
"epoch": 0.10342130182387088,
"grad_norm": 2.3446598052978516,
"learning_rate": 9.808365321166568e-05,
"loss": 2.3457,
"step": 314
},
{
"epoch": 0.10375066902713162,
"grad_norm": 2.2961266040802,
"learning_rate": 9.806920227875729e-05,
"loss": 1.9795,
"step": 315
},
{
"epoch": 0.10408003623039236,
"grad_norm": 2.767897844314575,
"learning_rate": 9.805469813661408e-05,
"loss": 2.2274,
"step": 316
},
{
"epoch": 0.1044094034336531,
"grad_norm": 2.9815187454223633,
"learning_rate": 9.804014080129111e-05,
"loss": 2.4279,
"step": 317
},
{
"epoch": 0.10473877063691384,
"grad_norm": 2.3294548988342285,
"learning_rate": 9.802553028890237e-05,
"loss": 1.9681,
"step": 318
},
{
"epoch": 0.10506813784017456,
"grad_norm": 3.110809564590454,
"learning_rate": 9.801086661562062e-05,
"loss": 2.3353,
"step": 319
},
{
"epoch": 0.1053975050434353,
"grad_norm": 2.6092398166656494,
"learning_rate": 9.799614979767757e-05,
"loss": 2.1682,
"step": 320
},
{
"epoch": 0.10572687224669604,
"grad_norm": 3.033212184906006,
"learning_rate": 9.798137985136367e-05,
"loss": 1.9523,
"step": 321
},
{
"epoch": 0.10605623944995678,
"grad_norm": 2.9443624019622803,
"learning_rate": 9.79665567930282e-05,
"loss": 2.1126,
"step": 322
},
{
"epoch": 0.1063856066532175,
"grad_norm": 3.2337043285369873,
"learning_rate": 9.795168063907929e-05,
"loss": 2.1455,
"step": 323
},
{
"epoch": 0.10671497385647824,
"grad_norm": 2.5773916244506836,
"learning_rate": 9.793675140598377e-05,
"loss": 1.7072,
"step": 324
},
{
"epoch": 0.10704434105973898,
"grad_norm": 3.643908977508545,
"learning_rate": 9.792176911026727e-05,
"loss": 2.2543,
"step": 325
},
{
"epoch": 0.10737370826299972,
"grad_norm": 1.8910375833511353,
"learning_rate": 9.790673376851414e-05,
"loss": 2.5806,
"step": 326
},
{
"epoch": 0.10770307546626044,
"grad_norm": 2.3565833568573,
"learning_rate": 9.789164539736741e-05,
"loss": 2.4109,
"step": 327
},
{
"epoch": 0.10803244266952118,
"grad_norm": 2.3572747707366943,
"learning_rate": 9.78765040135289e-05,
"loss": 2.3321,
"step": 328
},
{
"epoch": 0.10836180987278192,
"grad_norm": 2.759070873260498,
"learning_rate": 9.786130963375904e-05,
"loss": 2.4526,
"step": 329
},
{
"epoch": 0.10869117707604266,
"grad_norm": 2.419929027557373,
"learning_rate": 9.784606227487693e-05,
"loss": 2.1635,
"step": 330
},
{
"epoch": 0.10902054427930338,
"grad_norm": 2.4334778785705566,
"learning_rate": 9.783076195376036e-05,
"loss": 2.5785,
"step": 331
},
{
"epoch": 0.10934991148256412,
"grad_norm": 2.6062378883361816,
"learning_rate": 9.781540868734567e-05,
"loss": 2.4172,
"step": 332
},
{
"epoch": 0.10967927868582486,
"grad_norm": 2.5069472789764404,
"learning_rate": 9.780000249262787e-05,
"loss": 2.3606,
"step": 333
},
{
"epoch": 0.1100086458890856,
"grad_norm": 2.3731164932250977,
"learning_rate": 9.778454338666053e-05,
"loss": 2.3024,
"step": 334
},
{
"epoch": 0.11033801309234632,
"grad_norm": 2.8083693981170654,
"learning_rate": 9.776903138655581e-05,
"loss": 2.426,
"step": 335
},
{
"epoch": 0.11066738029560706,
"grad_norm": 2.4966108798980713,
"learning_rate": 9.775346650948439e-05,
"loss": 2.2338,
"step": 336
},
{
"epoch": 0.1109967474988678,
"grad_norm": 2.9775607585906982,
"learning_rate": 9.77378487726755e-05,
"loss": 2.7963,
"step": 337
},
{
"epoch": 0.11132611470212854,
"grad_norm": 2.692918062210083,
"learning_rate": 9.77221781934169e-05,
"loss": 2.4771,
"step": 338
},
{
"epoch": 0.11165548190538926,
"grad_norm": 2.5929603576660156,
"learning_rate": 9.770645478905481e-05,
"loss": 2.4814,
"step": 339
},
{
"epoch": 0.11198484910865,
"grad_norm": 2.6808626651763916,
"learning_rate": 9.76906785769939e-05,
"loss": 2.1842,
"step": 340
},
{
"epoch": 0.11231421631191074,
"grad_norm": 2.846548080444336,
"learning_rate": 9.767484957469739e-05,
"loss": 2.1864,
"step": 341
},
{
"epoch": 0.11264358351517148,
"grad_norm": 2.9507384300231934,
"learning_rate": 9.765896779968685e-05,
"loss": 2.3369,
"step": 342
},
{
"epoch": 0.11297295071843222,
"grad_norm": 3.360044002532959,
"learning_rate": 9.764303326954226e-05,
"loss": 2.2568,
"step": 343
},
{
"epoch": 0.11330231792169294,
"grad_norm": 2.5054543018341064,
"learning_rate": 9.762704600190207e-05,
"loss": 1.8953,
"step": 344
},
{
"epoch": 0.11363168512495368,
"grad_norm": 3.037480592727661,
"learning_rate": 9.761100601446304e-05,
"loss": 2.4476,
"step": 345
},
{
"epoch": 0.11396105232821442,
"grad_norm": 3.0009777545928955,
"learning_rate": 9.759491332498032e-05,
"loss": 2.3728,
"step": 346
},
{
"epoch": 0.11429041953147516,
"grad_norm": 2.856393814086914,
"learning_rate": 9.757876795126739e-05,
"loss": 1.9786,
"step": 347
},
{
"epoch": 0.11461978673473588,
"grad_norm": 3.3026115894317627,
"learning_rate": 9.756256991119603e-05,
"loss": 2.6015,
"step": 348
},
{
"epoch": 0.11494915393799662,
"grad_norm": 3.5809285640716553,
"learning_rate": 9.754631922269636e-05,
"loss": 2.307,
"step": 349
},
{
"epoch": 0.11527852114125736,
"grad_norm": 3.11942195892334,
"learning_rate": 9.753001590375674e-05,
"loss": 2.0157,
"step": 350
},
{
"epoch": 0.1156078883445181,
"grad_norm": 2.313547372817993,
"learning_rate": 9.75136599724238e-05,
"loss": 2.4783,
"step": 351
},
{
"epoch": 0.11593725554777883,
"grad_norm": 2.4683542251586914,
"learning_rate": 9.74972514468024e-05,
"loss": 2.4294,
"step": 352
},
{
"epoch": 0.11626662275103956,
"grad_norm": 2.895655632019043,
"learning_rate": 9.748079034505565e-05,
"loss": 2.6217,
"step": 353
},
{
"epoch": 0.1165959899543003,
"grad_norm": 2.646327495574951,
"learning_rate": 9.746427668540481e-05,
"loss": 2.5583,
"step": 354
},
{
"epoch": 0.11692535715756104,
"grad_norm": 2.4664547443389893,
"learning_rate": 9.744771048612935e-05,
"loss": 2.4272,
"step": 355
},
{
"epoch": 0.11725472436082177,
"grad_norm": 2.5208280086517334,
"learning_rate": 9.743109176556689e-05,
"loss": 2.7008,
"step": 356
},
{
"epoch": 0.1175840915640825,
"grad_norm": 2.5895395278930664,
"learning_rate": 9.741442054211319e-05,
"loss": 2.4581,
"step": 357
},
{
"epoch": 0.11791345876734324,
"grad_norm": 2.3770651817321777,
"learning_rate": 9.739769683422214e-05,
"loss": 2.2697,
"step": 358
},
{
"epoch": 0.11824282597060398,
"grad_norm": 2.263289451599121,
"learning_rate": 9.738092066040568e-05,
"loss": 2.1457,
"step": 359
},
{
"epoch": 0.1185721931738647,
"grad_norm": 3.5201501846313477,
"learning_rate": 9.736409203923388e-05,
"loss": 2.4604,
"step": 360
},
{
"epoch": 0.11890156037712545,
"grad_norm": 2.3830199241638184,
"learning_rate": 9.734721098933484e-05,
"loss": 2.0263,
"step": 361
},
{
"epoch": 0.11923092758038618,
"grad_norm": 2.431260347366333,
"learning_rate": 9.73302775293947e-05,
"loss": 2.2848,
"step": 362
},
{
"epoch": 0.11956029478364692,
"grad_norm": 2.4053165912628174,
"learning_rate": 9.73132916781576e-05,
"loss": 2.0885,
"step": 363
},
{
"epoch": 0.11988966198690765,
"grad_norm": 2.584679126739502,
"learning_rate": 9.72962534544257e-05,
"loss": 2.2592,
"step": 364
},
{
"epoch": 0.12021902919016839,
"grad_norm": 2.6707746982574463,
"learning_rate": 9.727916287705912e-05,
"loss": 2.135,
"step": 365
},
{
"epoch": 0.12054839639342912,
"grad_norm": 2.802182197570801,
"learning_rate": 9.72620199649759e-05,
"loss": 2.6641,
"step": 366
},
{
"epoch": 0.12087776359668986,
"grad_norm": 3.923982620239258,
"learning_rate": 9.724482473715207e-05,
"loss": 2.3298,
"step": 367
},
{
"epoch": 0.12120713079995059,
"grad_norm": 2.7437329292297363,
"learning_rate": 9.722757721262154e-05,
"loss": 2.2673,
"step": 368
},
{
"epoch": 0.12153649800321133,
"grad_norm": 2.640639543533325,
"learning_rate": 9.72102774104761e-05,
"loss": 2.2149,
"step": 369
},
{
"epoch": 0.12186586520647207,
"grad_norm": 3.37756609916687,
"learning_rate": 9.719292534986543e-05,
"loss": 2.0474,
"step": 370
},
{
"epoch": 0.1221952324097328,
"grad_norm": 2.524691581726074,
"learning_rate": 9.717552104999703e-05,
"loss": 2.0271,
"step": 371
},
{
"epoch": 0.12252459961299354,
"grad_norm": 2.769646406173706,
"learning_rate": 9.715806453013625e-05,
"loss": 2.0452,
"step": 372
},
{
"epoch": 0.12285396681625427,
"grad_norm": 2.9961395263671875,
"learning_rate": 9.714055580960622e-05,
"loss": 1.9754,
"step": 373
},
{
"epoch": 0.123183334019515,
"grad_norm": 3.112914800643921,
"learning_rate": 9.712299490778786e-05,
"loss": 2.1425,
"step": 374
},
{
"epoch": 0.12351270122277574,
"grad_norm": 3.4256157875061035,
"learning_rate": 9.710538184411991e-05,
"loss": 1.9117,
"step": 375
},
{
"epoch": 0.12384206842603648,
"grad_norm": 1.9335092306137085,
"learning_rate": 9.708771663809872e-05,
"loss": 2.4769,
"step": 376
},
{
"epoch": 0.12417143562929721,
"grad_norm": 2.1666107177734375,
"learning_rate": 9.706999930927848e-05,
"loss": 2.2265,
"step": 377
},
{
"epoch": 0.12450080283255795,
"grad_norm": 2.2735953330993652,
"learning_rate": 9.7052229877271e-05,
"loss": 2.4031,
"step": 378
},
{
"epoch": 0.12483017003581869,
"grad_norm": 2.531383514404297,
"learning_rate": 9.703440836174583e-05,
"loss": 2.4251,
"step": 379
},
{
"epoch": 0.1251595372390794,
"grad_norm": 2.608523368835449,
"learning_rate": 9.701653478243013e-05,
"loss": 2.4985,
"step": 380
},
{
"epoch": 0.12548890444234015,
"grad_norm": 2.4094133377075195,
"learning_rate": 9.699860915910868e-05,
"loss": 1.9842,
"step": 381
},
{
"epoch": 0.1258182716456009,
"grad_norm": 2.278822183609009,
"learning_rate": 9.698063151162389e-05,
"loss": 2.0549,
"step": 382
},
{
"epoch": 0.12614763884886163,
"grad_norm": 2.65533709526062,
"learning_rate": 9.696260185987576e-05,
"loss": 2.4869,
"step": 383
},
{
"epoch": 0.12647700605212236,
"grad_norm": 2.8089699745178223,
"learning_rate": 9.694452022382186e-05,
"loss": 2.3468,
"step": 384
},
{
"epoch": 0.1268063732553831,
"grad_norm": 2.4362573623657227,
"learning_rate": 9.692638662347728e-05,
"loss": 2.5076,
"step": 385
},
{
"epoch": 0.12713574045864384,
"grad_norm": 3.138957977294922,
"learning_rate": 9.690820107891466e-05,
"loss": 2.3684,
"step": 386
},
{
"epoch": 0.12746510766190455,
"grad_norm": 3.3768868446350098,
"learning_rate": 9.68899636102641e-05,
"loss": 2.7286,
"step": 387
},
{
"epoch": 0.1277944748651653,
"grad_norm": 2.5961859226226807,
"learning_rate": 9.68716742377132e-05,
"loss": 2.1429,
"step": 388
},
{
"epoch": 0.12812384206842603,
"grad_norm": 2.7435123920440674,
"learning_rate": 9.685333298150702e-05,
"loss": 2.2378,
"step": 389
},
{
"epoch": 0.12845320927168677,
"grad_norm": 2.207853317260742,
"learning_rate": 9.683493986194808e-05,
"loss": 2.0057,
"step": 390
},
{
"epoch": 0.1287825764749475,
"grad_norm": 3.450223922729492,
"learning_rate": 9.681649489939619e-05,
"loss": 2.5243,
"step": 391
},
{
"epoch": 0.12911194367820825,
"grad_norm": 2.479057788848877,
"learning_rate": 9.67979981142687e-05,
"loss": 1.9977,
"step": 392
},
{
"epoch": 0.12944131088146899,
"grad_norm": 2.4462480545043945,
"learning_rate": 9.677944952704023e-05,
"loss": 1.9085,
"step": 393
},
{
"epoch": 0.12977067808472972,
"grad_norm": 3.183197021484375,
"learning_rate": 9.676084915824276e-05,
"loss": 2.6043,
"step": 394
},
{
"epoch": 0.13010004528799043,
"grad_norm": 2.802366256713867,
"learning_rate": 9.674219702846561e-05,
"loss": 2.3849,
"step": 395
},
{
"epoch": 0.13042941249125117,
"grad_norm": 3.248178482055664,
"learning_rate": 9.672349315835535e-05,
"loss": 2.3824,
"step": 396
},
{
"epoch": 0.1307587796945119,
"grad_norm": 2.677258253097534,
"learning_rate": 9.670473756861588e-05,
"loss": 1.8382,
"step": 397
},
{
"epoch": 0.13108814689777265,
"grad_norm": 2.8905255794525146,
"learning_rate": 9.668593028000831e-05,
"loss": 1.9997,
"step": 398
},
{
"epoch": 0.1314175141010334,
"grad_norm": 2.8859400749206543,
"learning_rate": 9.6667071313351e-05,
"loss": 1.904,
"step": 399
},
{
"epoch": 0.13174688130429413,
"grad_norm": 3.3241395950317383,
"learning_rate": 9.664816068951947e-05,
"loss": 1.7025,
"step": 400
},
{
"epoch": 0.13207624850755487,
"grad_norm": 2.0233919620513916,
"learning_rate": 9.662919842944651e-05,
"loss": 2.3296,
"step": 401
},
{
"epoch": 0.1324056157108156,
"grad_norm": 2.4507946968078613,
"learning_rate": 9.661018455412197e-05,
"loss": 2.431,
"step": 402
},
{
"epoch": 0.13273498291407634,
"grad_norm": 2.341461658477783,
"learning_rate": 9.659111908459288e-05,
"loss": 2.5369,
"step": 403
},
{
"epoch": 0.13306435011733705,
"grad_norm": 2.3247382640838623,
"learning_rate": 9.657200204196337e-05,
"loss": 2.3336,
"step": 404
},
{
"epoch": 0.1333937173205978,
"grad_norm": 2.6917145252227783,
"learning_rate": 9.65528334473947e-05,
"loss": 2.5245,
"step": 405
},
{
"epoch": 0.13372308452385853,
"grad_norm": 2.5759096145629883,
"learning_rate": 9.653361332210513e-05,
"loss": 2.5367,
"step": 406
},
{
"epoch": 0.13405245172711927,
"grad_norm": 3.123220682144165,
"learning_rate": 9.651434168737e-05,
"loss": 2.3026,
"step": 407
},
{
"epoch": 0.13438181893038,
"grad_norm": 1.9661033153533936,
"learning_rate": 9.649501856452165e-05,
"loss": 1.9149,
"step": 408
},
{
"epoch": 0.13471118613364075,
"grad_norm": 2.2201578617095947,
"learning_rate": 9.647564397494944e-05,
"loss": 2.4501,
"step": 409
},
{
"epoch": 0.1350405533369015,
"grad_norm": 2.485238790512085,
"learning_rate": 9.645621794009967e-05,
"loss": 2.3879,
"step": 410
},
{
"epoch": 0.13536992054016223,
"grad_norm": 2.429931163787842,
"learning_rate": 9.643674048147558e-05,
"loss": 2.3842,
"step": 411
},
{
"epoch": 0.13569928774342294,
"grad_norm": 2.674360513687134,
"learning_rate": 9.641721162063739e-05,
"loss": 2.2783,
"step": 412
},
{
"epoch": 0.13602865494668367,
"grad_norm": 2.56316876411438,
"learning_rate": 9.639763137920214e-05,
"loss": 2.2338,
"step": 413
},
{
"epoch": 0.1363580221499444,
"grad_norm": 2.6116116046905518,
"learning_rate": 9.637799977884381e-05,
"loss": 2.2164,
"step": 414
},
{
"epoch": 0.13668738935320515,
"grad_norm": 2.379546880722046,
"learning_rate": 9.635831684129318e-05,
"loss": 1.8579,
"step": 415
},
{
"epoch": 0.1370167565564659,
"grad_norm": 2.250623941421509,
"learning_rate": 9.63385825883379e-05,
"loss": 1.9759,
"step": 416
},
{
"epoch": 0.13734612375972663,
"grad_norm": 2.5346574783325195,
"learning_rate": 9.63187970418224e-05,
"loss": 2.1965,
"step": 417
},
{
"epoch": 0.13767549096298737,
"grad_norm": 2.7046220302581787,
"learning_rate": 9.62989602236479e-05,
"loss": 2.207,
"step": 418
},
{
"epoch": 0.1380048581662481,
"grad_norm": 2.8171615600585938,
"learning_rate": 9.627907215577236e-05,
"loss": 2.1648,
"step": 419
},
{
"epoch": 0.13833422536950882,
"grad_norm": 2.6681857109069824,
"learning_rate": 9.625913286021046e-05,
"loss": 2.2677,
"step": 420
},
{
"epoch": 0.13866359257276956,
"grad_norm": 2.8763694763183594,
"learning_rate": 9.623914235903362e-05,
"loss": 1.9661,
"step": 421
},
{
"epoch": 0.1389929597760303,
"grad_norm": 2.8630616664886475,
"learning_rate": 9.621910067436992e-05,
"loss": 2.1464,
"step": 422
},
{
"epoch": 0.13932232697929103,
"grad_norm": 3.0975148677825928,
"learning_rate": 9.61990078284041e-05,
"loss": 2.1959,
"step": 423
},
{
"epoch": 0.13965169418255177,
"grad_norm": 3.8912901878356934,
"learning_rate": 9.617886384337751e-05,
"loss": 2.1552,
"step": 424
},
{
"epoch": 0.1399810613858125,
"grad_norm": 3.1637001037597656,
"learning_rate": 9.615866874158816e-05,
"loss": 1.8698,
"step": 425
},
{
"epoch": 0.14031042858907325,
"grad_norm": 2.0410444736480713,
"learning_rate": 9.613842254539058e-05,
"loss": 2.6313,
"step": 426
},
{
"epoch": 0.140639795792334,
"grad_norm": 2.081902503967285,
"learning_rate": 9.611812527719593e-05,
"loss": 2.444,
"step": 427
},
{
"epoch": 0.14096916299559473,
"grad_norm": 2.232922077178955,
"learning_rate": 9.609777695947182e-05,
"loss": 2.1034,
"step": 428
},
{
"epoch": 0.14129853019885544,
"grad_norm": 2.2757458686828613,
"learning_rate": 9.607737761474242e-05,
"loss": 2.5483,
"step": 429
},
{
"epoch": 0.14162789740211618,
"grad_norm": 2.3721635341644287,
"learning_rate": 9.60569272655884e-05,
"loss": 2.6453,
"step": 430
},
{
"epoch": 0.14195726460537691,
"grad_norm": 2.366621971130371,
"learning_rate": 9.603642593464683e-05,
"loss": 2.3196,
"step": 431
},
{
"epoch": 0.14228663180863765,
"grad_norm": 2.4534189701080322,
"learning_rate": 9.601587364461127e-05,
"loss": 2.4592,
"step": 432
},
{
"epoch": 0.1426159990118984,
"grad_norm": 2.350576639175415,
"learning_rate": 9.599527041823164e-05,
"loss": 2.0809,
"step": 433
},
{
"epoch": 0.14294536621515913,
"grad_norm": 2.2983639240264893,
"learning_rate": 9.59746162783143e-05,
"loss": 2.1881,
"step": 434
},
{
"epoch": 0.14327473341841987,
"grad_norm": 2.5199313163757324,
"learning_rate": 9.595391124772189e-05,
"loss": 2.2941,
"step": 435
},
{
"epoch": 0.1436041006216806,
"grad_norm": 2.4015860557556152,
"learning_rate": 9.593315534937345e-05,
"loss": 2.2748,
"step": 436
},
{
"epoch": 0.14393346782494132,
"grad_norm": 2.675955057144165,
"learning_rate": 9.591234860624431e-05,
"loss": 2.1505,
"step": 437
},
{
"epoch": 0.14426283502820206,
"grad_norm": 2.4210708141326904,
"learning_rate": 9.589149104136605e-05,
"loss": 2.2871,
"step": 438
},
{
"epoch": 0.1445922022314628,
"grad_norm": 2.417851686477661,
"learning_rate": 9.587058267782656e-05,
"loss": 2.1308,
"step": 439
},
{
"epoch": 0.14492156943472354,
"grad_norm": 2.456392526626587,
"learning_rate": 9.584962353876992e-05,
"loss": 1.8146,
"step": 440
},
{
"epoch": 0.14525093663798427,
"grad_norm": 2.4803600311279297,
"learning_rate": 9.582861364739642e-05,
"loss": 2.2325,
"step": 441
},
{
"epoch": 0.145580303841245,
"grad_norm": 2.4525747299194336,
"learning_rate": 9.580755302696256e-05,
"loss": 2.0824,
"step": 442
},
{
"epoch": 0.14590967104450575,
"grad_norm": 3.1618423461914062,
"learning_rate": 9.578644170078093e-05,
"loss": 2.3309,
"step": 443
},
{
"epoch": 0.1462390382477665,
"grad_norm": 3.0864148139953613,
"learning_rate": 9.576527969222031e-05,
"loss": 2.2085,
"step": 444
},
{
"epoch": 0.1465684054510272,
"grad_norm": 2.7343194484710693,
"learning_rate": 9.574406702470558e-05,
"loss": 2.2462,
"step": 445
},
{
"epoch": 0.14689777265428794,
"grad_norm": 2.57436466217041,
"learning_rate": 9.572280372171763e-05,
"loss": 2.0448,
"step": 446
},
{
"epoch": 0.14722713985754868,
"grad_norm": 2.608705759048462,
"learning_rate": 9.570148980679347e-05,
"loss": 1.9546,
"step": 447
},
{
"epoch": 0.14755650706080942,
"grad_norm": 2.5100154876708984,
"learning_rate": 9.56801253035261e-05,
"loss": 1.6897,
"step": 448
},
{
"epoch": 0.14788587426407016,
"grad_norm": 3.806117296218872,
"learning_rate": 9.565871023556455e-05,
"loss": 1.8645,
"step": 449
},
{
"epoch": 0.1482152414673309,
"grad_norm": 3.5076465606689453,
"learning_rate": 9.563724462661376e-05,
"loss": 2.2529,
"step": 450
},
{
"epoch": 0.14854460867059163,
"grad_norm": 2.111990451812744,
"learning_rate": 9.561572850043467e-05,
"loss": 2.5226,
"step": 451
},
{
"epoch": 0.14887397587385237,
"grad_norm": 2.713508367538452,
"learning_rate": 9.559416188084416e-05,
"loss": 2.548,
"step": 452
},
{
"epoch": 0.1492033430771131,
"grad_norm": 2.7908146381378174,
"learning_rate": 9.557254479171489e-05,
"loss": 2.8369,
"step": 453
},
{
"epoch": 0.14953271028037382,
"grad_norm": 2.3136017322540283,
"learning_rate": 9.555087725697554e-05,
"loss": 2.3652,
"step": 454
},
{
"epoch": 0.14986207748363456,
"grad_norm": 2.1749308109283447,
"learning_rate": 9.552915930061048e-05,
"loss": 2.2855,
"step": 455
},
{
"epoch": 0.1501914446868953,
"grad_norm": 2.302049398422241,
"learning_rate": 9.550739094666002e-05,
"loss": 2.3987,
"step": 456
},
{
"epoch": 0.15052081189015604,
"grad_norm": 2.3191065788269043,
"learning_rate": 9.548557221922017e-05,
"loss": 2.4523,
"step": 457
},
{
"epoch": 0.15085017909341678,
"grad_norm": 2.958138942718506,
"learning_rate": 9.546370314244273e-05,
"loss": 2.2964,
"step": 458
},
{
"epoch": 0.1511795462966775,
"grad_norm": 2.8905577659606934,
"learning_rate": 9.544178374053524e-05,
"loss": 2.6665,
"step": 459
},
{
"epoch": 0.15150891349993825,
"grad_norm": 2.6026418209075928,
"learning_rate": 9.541981403776095e-05,
"loss": 2.5692,
"step": 460
},
{
"epoch": 0.151838280703199,
"grad_norm": 2.7903120517730713,
"learning_rate": 9.539779405843876e-05,
"loss": 2.439,
"step": 461
},
{
"epoch": 0.1521676479064597,
"grad_norm": 3.0123376846313477,
"learning_rate": 9.537572382694328e-05,
"loss": 2.592,
"step": 462
},
{
"epoch": 0.15249701510972044,
"grad_norm": 2.3931968212127686,
"learning_rate": 9.535360336770467e-05,
"loss": 2.2706,
"step": 463
},
{
"epoch": 0.15282638231298118,
"grad_norm": 2.501070022583008,
"learning_rate": 9.533143270520873e-05,
"loss": 1.8545,
"step": 464
},
{
"epoch": 0.15315574951624192,
"grad_norm": 2.836297035217285,
"learning_rate": 9.530921186399684e-05,
"loss": 2.3613,
"step": 465
},
{
"epoch": 0.15348511671950266,
"grad_norm": 2.518571138381958,
"learning_rate": 9.528694086866592e-05,
"loss": 2.1096,
"step": 466
},
{
"epoch": 0.1538144839227634,
"grad_norm": 2.5794365406036377,
"learning_rate": 9.526461974386838e-05,
"loss": 2.0714,
"step": 467
},
{
"epoch": 0.15414385112602413,
"grad_norm": 2.955522060394287,
"learning_rate": 9.524224851431214e-05,
"loss": 2.1713,
"step": 468
},
{
"epoch": 0.15447321832928487,
"grad_norm": 3.465235948562622,
"learning_rate": 9.521982720476062e-05,
"loss": 2.2217,
"step": 469
},
{
"epoch": 0.15480258553254558,
"grad_norm": 2.497987985610962,
"learning_rate": 9.519735584003257e-05,
"loss": 1.9994,
"step": 470
},
{
"epoch": 0.15513195273580632,
"grad_norm": 2.911043643951416,
"learning_rate": 9.517483444500228e-05,
"loss": 1.9883,
"step": 471
},
{
"epoch": 0.15546131993906706,
"grad_norm": 2.7313320636749268,
"learning_rate": 9.51522630445993e-05,
"loss": 2.1227,
"step": 472
},
{
"epoch": 0.1557906871423278,
"grad_norm": 3.4482412338256836,
"learning_rate": 9.512964166380864e-05,
"loss": 2.2148,
"step": 473
},
{
"epoch": 0.15612005434558854,
"grad_norm": 2.664477825164795,
"learning_rate": 9.510697032767053e-05,
"loss": 1.7443,
"step": 474
},
{
"epoch": 0.15644942154884928,
"grad_norm": 2.8564321994781494,
"learning_rate": 9.508424906128058e-05,
"loss": 1.9044,
"step": 475
},
{
"epoch": 0.15677878875211002,
"grad_norm": 1.97623872756958,
"learning_rate": 9.506147788978965e-05,
"loss": 2.3548,
"step": 476
},
{
"epoch": 0.15710815595537075,
"grad_norm": 2.3602867126464844,
"learning_rate": 9.503865683840378e-05,
"loss": 2.5651,
"step": 477
},
{
"epoch": 0.1574375231586315,
"grad_norm": 2.1931259632110596,
"learning_rate": 9.501578593238432e-05,
"loss": 2.1644,
"step": 478
},
{
"epoch": 0.1577668903618922,
"grad_norm": 2.284613609313965,
"learning_rate": 9.499286519704773e-05,
"loss": 2.2849,
"step": 479
},
{
"epoch": 0.15809625756515294,
"grad_norm": 2.498391628265381,
"learning_rate": 9.49698946577657e-05,
"loss": 2.5516,
"step": 480
},
{
"epoch": 0.15842562476841368,
"grad_norm": 2.4198145866394043,
"learning_rate": 9.494687433996493e-05,
"loss": 2.194,
"step": 481
},
{
"epoch": 0.15875499197167442,
"grad_norm": 2.404466390609741,
"learning_rate": 9.492380426912737e-05,
"loss": 2.3409,
"step": 482
},
{
"epoch": 0.15908435917493516,
"grad_norm": 2.326627254486084,
"learning_rate": 9.490068447078992e-05,
"loss": 2.5327,
"step": 483
},
{
"epoch": 0.1594137263781959,
"grad_norm": 2.2029290199279785,
"learning_rate": 9.487751497054461e-05,
"loss": 2.3595,
"step": 484
},
{
"epoch": 0.15974309358145664,
"grad_norm": 2.798891544342041,
"learning_rate": 9.485429579403843e-05,
"loss": 2.5281,
"step": 485
},
{
"epoch": 0.16007246078471737,
"grad_norm": 2.421884775161743,
"learning_rate": 9.483102696697339e-05,
"loss": 2.3762,
"step": 486
},
{
"epoch": 0.16040182798797809,
"grad_norm": 2.618962049484253,
"learning_rate": 9.480770851510644e-05,
"loss": 2.5659,
"step": 487
},
{
"epoch": 0.16073119519123882,
"grad_norm": 2.394176721572876,
"learning_rate": 9.478434046424948e-05,
"loss": 2.0389,
"step": 488
},
{
"epoch": 0.16106056239449956,
"grad_norm": 2.6122007369995117,
"learning_rate": 9.47609228402693e-05,
"loss": 2.2712,
"step": 489
},
{
"epoch": 0.1613899295977603,
"grad_norm": 3.2165706157684326,
"learning_rate": 9.473745566908756e-05,
"loss": 2.2492,
"step": 490
},
{
"epoch": 0.16171929680102104,
"grad_norm": 2.527129650115967,
"learning_rate": 9.471393897668078e-05,
"loss": 2.4401,
"step": 491
},
{
"epoch": 0.16204866400428178,
"grad_norm": 2.758704900741577,
"learning_rate": 9.469037278908029e-05,
"loss": 2.2011,
"step": 492
},
{
"epoch": 0.16237803120754252,
"grad_norm": 2.8025074005126953,
"learning_rate": 9.46667571323722e-05,
"loss": 2.0435,
"step": 493
},
{
"epoch": 0.16270739841080326,
"grad_norm": 2.798211097717285,
"learning_rate": 9.464309203269739e-05,
"loss": 2.2279,
"step": 494
},
{
"epoch": 0.16303676561406397,
"grad_norm": 2.942800521850586,
"learning_rate": 9.461937751625145e-05,
"loss": 2.082,
"step": 495
},
{
"epoch": 0.1633661328173247,
"grad_norm": 2.7005648612976074,
"learning_rate": 9.459561360928472e-05,
"loss": 2.2156,
"step": 496
},
{
"epoch": 0.16369550002058544,
"grad_norm": 2.9613757133483887,
"learning_rate": 9.457180033810216e-05,
"loss": 2.3678,
"step": 497
},
{
"epoch": 0.16402486722384618,
"grad_norm": 2.888354539871216,
"learning_rate": 9.454793772906336e-05,
"loss": 1.9226,
"step": 498
},
{
"epoch": 0.16435423442710692,
"grad_norm": 3.021557331085205,
"learning_rate": 9.452402580858261e-05,
"loss": 2.1693,
"step": 499
},
{
"epoch": 0.16468360163036766,
"grad_norm": 3.1927671432495117,
"learning_rate": 9.45000646031287e-05,
"loss": 1.6268,
"step": 500
},
{
"epoch": 0.1650129688336284,
"grad_norm": 1.9104129076004028,
"learning_rate": 9.447605413922499e-05,
"loss": 2.6226,
"step": 501
},
{
"epoch": 0.16534233603688914,
"grad_norm": 1.981658935546875,
"learning_rate": 9.44519944434494e-05,
"loss": 2.288,
"step": 502
},
{
"epoch": 0.16567170324014988,
"grad_norm": 2.278552532196045,
"learning_rate": 9.442788554243431e-05,
"loss": 2.4093,
"step": 503
},
{
"epoch": 0.1660010704434106,
"grad_norm": 2.9332423210144043,
"learning_rate": 9.440372746286661e-05,
"loss": 2.5302,
"step": 504
},
{
"epoch": 0.16633043764667133,
"grad_norm": 2.4313466548919678,
"learning_rate": 9.437952023148757e-05,
"loss": 2.52,
"step": 505
},
{
"epoch": 0.16665980484993206,
"grad_norm": 2.3095078468322754,
"learning_rate": 9.43552638750929e-05,
"loss": 2.424,
"step": 506
},
{
"epoch": 0.1669891720531928,
"grad_norm": 2.3015639781951904,
"learning_rate": 9.433095842053272e-05,
"loss": 2.5717,
"step": 507
},
{
"epoch": 0.16731853925645354,
"grad_norm": 2.5541129112243652,
"learning_rate": 9.43066038947114e-05,
"loss": 2.5831,
"step": 508
},
{
"epoch": 0.16764790645971428,
"grad_norm": 2.421571969985962,
"learning_rate": 9.428220032458776e-05,
"loss": 2.4438,
"step": 509
},
{
"epoch": 0.16797727366297502,
"grad_norm": 2.5124785900115967,
"learning_rate": 9.425774773717479e-05,
"loss": 2.2709,
"step": 510
},
{
"epoch": 0.16830664086623576,
"grad_norm": 2.592477560043335,
"learning_rate": 9.423324615953982e-05,
"loss": 2.3089,
"step": 511
},
{
"epoch": 0.16863600806949647,
"grad_norm": 2.2433090209960938,
"learning_rate": 9.420869561880434e-05,
"loss": 2.318,
"step": 512
},
{
"epoch": 0.1689653752727572,
"grad_norm": 2.6666271686553955,
"learning_rate": 9.418409614214412e-05,
"loss": 2.3501,
"step": 513
},
{
"epoch": 0.16929474247601795,
"grad_norm": 2.3065149784088135,
"learning_rate": 9.415944775678902e-05,
"loss": 2.0432,
"step": 514
},
{
"epoch": 0.16962410967927868,
"grad_norm": 3.0598034858703613,
"learning_rate": 9.41347504900231e-05,
"loss": 2.6206,
"step": 515
},
{
"epoch": 0.16995347688253942,
"grad_norm": 2.051896095275879,
"learning_rate": 9.411000436918449e-05,
"loss": 1.9563,
"step": 516
},
{
"epoch": 0.17028284408580016,
"grad_norm": 3.1122515201568604,
"learning_rate": 9.408520942166541e-05,
"loss": 2.2816,
"step": 517
},
{
"epoch": 0.1706122112890609,
"grad_norm": 2.379194974899292,
"learning_rate": 9.406036567491213e-05,
"loss": 2.4151,
"step": 518
},
{
"epoch": 0.17094157849232164,
"grad_norm": 2.4957690238952637,
"learning_rate": 9.403547315642493e-05,
"loss": 1.8377,
"step": 519
},
{
"epoch": 0.17127094569558235,
"grad_norm": 2.6822221279144287,
"learning_rate": 9.401053189375809e-05,
"loss": 2.0926,
"step": 520
},
{
"epoch": 0.1716003128988431,
"grad_norm": 2.902961492538452,
"learning_rate": 9.398554191451983e-05,
"loss": 2.1816,
"step": 521
},
{
"epoch": 0.17192968010210383,
"grad_norm": 3.225004196166992,
"learning_rate": 9.396050324637228e-05,
"loss": 2.5233,
"step": 522
},
{
"epoch": 0.17225904730536457,
"grad_norm": 2.8753762245178223,
"learning_rate": 9.393541591703156e-05,
"loss": 2.3371,
"step": 523
},
{
"epoch": 0.1725884145086253,
"grad_norm": 3.5578978061676025,
"learning_rate": 9.39102799542675e-05,
"loss": 2.3127,
"step": 524
},
{
"epoch": 0.17291778171188604,
"grad_norm": 3.2395496368408203,
"learning_rate": 9.388509538590391e-05,
"loss": 2.0262,
"step": 525
},
{
"epoch": 0.17324714891514678,
"grad_norm": 2.063525438308716,
"learning_rate": 9.385986223981833e-05,
"loss": 2.5577,
"step": 526
},
{
"epoch": 0.17357651611840752,
"grad_norm": 2.227280616760254,
"learning_rate": 9.383458054394206e-05,
"loss": 2.5892,
"step": 527
},
{
"epoch": 0.17390588332166826,
"grad_norm": 2.1608548164367676,
"learning_rate": 9.380925032626015e-05,
"loss": 2.5988,
"step": 528
},
{
"epoch": 0.17423525052492897,
"grad_norm": 2.333763360977173,
"learning_rate": 9.378387161481142e-05,
"loss": 2.4371,
"step": 529
},
{
"epoch": 0.1745646177281897,
"grad_norm": 2.1032464504241943,
"learning_rate": 9.375844443768829e-05,
"loss": 2.1269,
"step": 530
},
{
"epoch": 0.17489398493145045,
"grad_norm": 2.7457637786865234,
"learning_rate": 9.373296882303688e-05,
"loss": 2.4893,
"step": 531
},
{
"epoch": 0.17522335213471119,
"grad_norm": 2.414435625076294,
"learning_rate": 9.37074447990569e-05,
"loss": 2.4437,
"step": 532
},
{
"epoch": 0.17555271933797192,
"grad_norm": 2.7596664428710938,
"learning_rate": 9.368187239400166e-05,
"loss": 2.3113,
"step": 533
},
{
"epoch": 0.17588208654123266,
"grad_norm": 2.7227630615234375,
"learning_rate": 9.3656251636178e-05,
"loss": 2.2728,
"step": 534
},
{
"epoch": 0.1762114537444934,
"grad_norm": 2.8053038120269775,
"learning_rate": 9.363058255394632e-05,
"loss": 2.3559,
"step": 535
},
{
"epoch": 0.17654082094775414,
"grad_norm": 2.6018600463867188,
"learning_rate": 9.360486517572049e-05,
"loss": 2.2176,
"step": 536
},
{
"epoch": 0.17687018815101485,
"grad_norm": 2.3456811904907227,
"learning_rate": 9.357909952996784e-05,
"loss": 2.1538,
"step": 537
},
{
"epoch": 0.1771995553542756,
"grad_norm": 2.6372413635253906,
"learning_rate": 9.355328564520914e-05,
"loss": 2.1687,
"step": 538
},
{
"epoch": 0.17752892255753633,
"grad_norm": 2.4839794635772705,
"learning_rate": 9.352742355001853e-05,
"loss": 2.2029,
"step": 539
},
{
"epoch": 0.17785828976079707,
"grad_norm": 2.5911648273468018,
"learning_rate": 9.350151327302356e-05,
"loss": 2.0988,
"step": 540
},
{
"epoch": 0.1781876569640578,
"grad_norm": 2.3182106018066406,
"learning_rate": 9.347555484290507e-05,
"loss": 1.9714,
"step": 541
},
{
"epoch": 0.17851702416731854,
"grad_norm": 3.564013957977295,
"learning_rate": 9.344954828839722e-05,
"loss": 2.3427,
"step": 542
},
{
"epoch": 0.17884639137057928,
"grad_norm": 2.816439628601074,
"learning_rate": 9.342349363828748e-05,
"loss": 2.198,
"step": 543
},
{
"epoch": 0.17917575857384002,
"grad_norm": 2.91499924659729,
"learning_rate": 9.339739092141647e-05,
"loss": 2.2565,
"step": 544
},
{
"epoch": 0.17950512577710073,
"grad_norm": 3.363369941711426,
"learning_rate": 9.337124016667809e-05,
"loss": 2.1877,
"step": 545
},
{
"epoch": 0.17983449298036147,
"grad_norm": 3.335421085357666,
"learning_rate": 9.334504140301938e-05,
"loss": 2.2754,
"step": 546
},
{
"epoch": 0.1801638601836222,
"grad_norm": 3.2244529724121094,
"learning_rate": 9.331879465944056e-05,
"loss": 2.0735,
"step": 547
},
{
"epoch": 0.18049322738688295,
"grad_norm": 2.5784671306610107,
"learning_rate": 9.32924999649949e-05,
"loss": 2.0593,
"step": 548
},
{
"epoch": 0.1808225945901437,
"grad_norm": 3.826298236846924,
"learning_rate": 9.326615734878878e-05,
"loss": 2.4232,
"step": 549
},
{
"epoch": 0.18115196179340443,
"grad_norm": 3.9787750244140625,
"learning_rate": 9.323976683998168e-05,
"loss": 1.9951,
"step": 550
},
{
"epoch": 0.18148132899666516,
"grad_norm": 2.1491682529449463,
"learning_rate": 9.321332846778599e-05,
"loss": 2.497,
"step": 551
},
{
"epoch": 0.1818106961999259,
"grad_norm": 2.5805752277374268,
"learning_rate": 9.318684226146714e-05,
"loss": 2.343,
"step": 552
},
{
"epoch": 0.18214006340318661,
"grad_norm": 2.671565055847168,
"learning_rate": 9.316030825034354e-05,
"loss": 2.314,
"step": 553
},
{
"epoch": 0.18246943060644735,
"grad_norm": 2.459749221801758,
"learning_rate": 9.313372646378643e-05,
"loss": 2.3503,
"step": 554
},
{
"epoch": 0.1827987978097081,
"grad_norm": 2.3150362968444824,
"learning_rate": 9.310709693122002e-05,
"loss": 1.9586,
"step": 555
},
{
"epoch": 0.18312816501296883,
"grad_norm": 2.813565254211426,
"learning_rate": 9.308041968212131e-05,
"loss": 2.467,
"step": 556
},
{
"epoch": 0.18345753221622957,
"grad_norm": 2.5931906700134277,
"learning_rate": 9.305369474602015e-05,
"loss": 2.245,
"step": 557
},
{
"epoch": 0.1837868994194903,
"grad_norm": 2.7060670852661133,
"learning_rate": 9.302692215249918e-05,
"loss": 2.7003,
"step": 558
},
{
"epoch": 0.18411626662275105,
"grad_norm": 2.543288230895996,
"learning_rate": 9.300010193119376e-05,
"loss": 2.2948,
"step": 559
},
{
"epoch": 0.18444563382601178,
"grad_norm": 2.492933750152588,
"learning_rate": 9.297323411179202e-05,
"loss": 2.2784,
"step": 560
},
{
"epoch": 0.18477500102927252,
"grad_norm": 2.2065117359161377,
"learning_rate": 9.294631872403474e-05,
"loss": 2.2058,
"step": 561
},
{
"epoch": 0.18510436823253323,
"grad_norm": 3.046983003616333,
"learning_rate": 9.291935579771536e-05,
"loss": 2.3053,
"step": 562
},
{
"epoch": 0.18543373543579397,
"grad_norm": 2.53873610496521,
"learning_rate": 9.289234536267996e-05,
"loss": 2.4844,
"step": 563
},
{
"epoch": 0.1857631026390547,
"grad_norm": 2.3958847522735596,
"learning_rate": 9.286528744882719e-05,
"loss": 2.0553,
"step": 564
},
{
"epoch": 0.18609246984231545,
"grad_norm": 3.1099889278411865,
"learning_rate": 9.283818208610826e-05,
"loss": 2.7911,
"step": 565
},
{
"epoch": 0.1864218370455762,
"grad_norm": 2.7768149375915527,
"learning_rate": 9.28110293045269e-05,
"loss": 2.3027,
"step": 566
},
{
"epoch": 0.18675120424883693,
"grad_norm": 2.551649570465088,
"learning_rate": 9.278382913413935e-05,
"loss": 1.9763,
"step": 567
},
{
"epoch": 0.18708057145209767,
"grad_norm": 2.6487247943878174,
"learning_rate": 9.27565816050543e-05,
"loss": 2.2145,
"step": 568
},
{
"epoch": 0.1874099386553584,
"grad_norm": 3.50011944770813,
"learning_rate": 9.272928674743282e-05,
"loss": 2.0824,
"step": 569
},
{
"epoch": 0.18773930585861912,
"grad_norm": 2.5027475357055664,
"learning_rate": 9.270194459148841e-05,
"loss": 2.0633,
"step": 570
},
{
"epoch": 0.18806867306187985,
"grad_norm": 2.7622134685516357,
"learning_rate": 9.267455516748693e-05,
"loss": 2.4109,
"step": 571
},
{
"epoch": 0.1883980402651406,
"grad_norm": 2.5253777503967285,
"learning_rate": 9.264711850574657e-05,
"loss": 1.8391,
"step": 572
},
{
"epoch": 0.18872740746840133,
"grad_norm": 3.9980525970458984,
"learning_rate": 9.261963463663775e-05,
"loss": 2.4288,
"step": 573
},
{
"epoch": 0.18905677467166207,
"grad_norm": 3.201934814453125,
"learning_rate": 9.25921035905832e-05,
"loss": 2.1041,
"step": 574
},
{
"epoch": 0.1893861418749228,
"grad_norm": 3.45932674407959,
"learning_rate": 9.256452539805787e-05,
"loss": 1.9162,
"step": 575
},
{
"epoch": 0.18971550907818355,
"grad_norm": 2.3997511863708496,
"learning_rate": 9.253690008958886e-05,
"loss": 2.7296,
"step": 576
},
{
"epoch": 0.19004487628144429,
"grad_norm": 2.100877523422241,
"learning_rate": 9.250922769575548e-05,
"loss": 2.2391,
"step": 577
},
{
"epoch": 0.190374243484705,
"grad_norm": 2.0351099967956543,
"learning_rate": 9.248150824718911e-05,
"loss": 2.3191,
"step": 578
},
{
"epoch": 0.19070361068796574,
"grad_norm": 2.368957757949829,
"learning_rate": 9.245374177457323e-05,
"loss": 2.2142,
"step": 579
},
{
"epoch": 0.19103297789122647,
"grad_norm": 2.49444580078125,
"learning_rate": 9.242592830864339e-05,
"loss": 2.3158,
"step": 580
},
{
"epoch": 0.1913623450944872,
"grad_norm": 2.4949872493743896,
"learning_rate": 9.239806788018714e-05,
"loss": 2.3183,
"step": 581
},
{
"epoch": 0.19169171229774795,
"grad_norm": 2.6433401107788086,
"learning_rate": 9.2370160520044e-05,
"loss": 2.4043,
"step": 582
},
{
"epoch": 0.1920210795010087,
"grad_norm": 2.3650400638580322,
"learning_rate": 9.23422062591055e-05,
"loss": 2.2305,
"step": 583
},
{
"epoch": 0.19235044670426943,
"grad_norm": 2.623833179473877,
"learning_rate": 9.231420512831501e-05,
"loss": 2.2516,
"step": 584
},
{
"epoch": 0.19267981390753017,
"grad_norm": 2.3704848289489746,
"learning_rate": 9.228615715866785e-05,
"loss": 2.1916,
"step": 585
},
{
"epoch": 0.1930091811107909,
"grad_norm": 2.6421825885772705,
"learning_rate": 9.225806238121113e-05,
"loss": 2.1705,
"step": 586
},
{
"epoch": 0.19333854831405162,
"grad_norm": 2.507702589035034,
"learning_rate": 9.222992082704381e-05,
"loss": 2.2798,
"step": 587
},
{
"epoch": 0.19366791551731236,
"grad_norm": 2.977654457092285,
"learning_rate": 9.22017325273166e-05,
"loss": 2.3934,
"step": 588
},
{
"epoch": 0.1939972827205731,
"grad_norm": 2.6255760192871094,
"learning_rate": 9.217349751323199e-05,
"loss": 2.168,
"step": 589
},
{
"epoch": 0.19432664992383383,
"grad_norm": 2.767040252685547,
"learning_rate": 9.214521581604415e-05,
"loss": 2.3227,
"step": 590
},
{
"epoch": 0.19465601712709457,
"grad_norm": 2.74783992767334,
"learning_rate": 9.211688746705894e-05,
"loss": 2.5598,
"step": 591
},
{
"epoch": 0.1949853843303553,
"grad_norm": 3.419125556945801,
"learning_rate": 9.208851249763385e-05,
"loss": 2.1441,
"step": 592
},
{
"epoch": 0.19531475153361605,
"grad_norm": 2.531890392303467,
"learning_rate": 9.206009093917798e-05,
"loss": 2.2535,
"step": 593
},
{
"epoch": 0.1956441187368768,
"grad_norm": 2.928637742996216,
"learning_rate": 9.203162282315201e-05,
"loss": 1.9981,
"step": 594
},
{
"epoch": 0.1959734859401375,
"grad_norm": 2.740192174911499,
"learning_rate": 9.200310818106813e-05,
"loss": 2.1965,
"step": 595
},
{
"epoch": 0.19630285314339824,
"grad_norm": 2.5833513736724854,
"learning_rate": 9.197454704449007e-05,
"loss": 1.9684,
"step": 596
},
{
"epoch": 0.19663222034665898,
"grad_norm": 2.816075563430786,
"learning_rate": 9.194593944503298e-05,
"loss": 1.9281,
"step": 597
},
{
"epoch": 0.19696158754991971,
"grad_norm": 2.4949111938476562,
"learning_rate": 9.19172854143635e-05,
"loss": 1.8233,
"step": 598
},
{
"epoch": 0.19729095475318045,
"grad_norm": 3.0978989601135254,
"learning_rate": 9.18885849841996e-05,
"loss": 1.9407,
"step": 599
},
{
"epoch": 0.1976203219564412,
"grad_norm": 3.714794397354126,
"learning_rate": 9.185983818631066e-05,
"loss": 2.082,
"step": 600
},
{
"epoch": 0.19794968915970193,
"grad_norm": 2.1515870094299316,
"learning_rate": 9.183104505251735e-05,
"loss": 2.5559,
"step": 601
},
{
"epoch": 0.19827905636296267,
"grad_norm": 2.2169995307922363,
"learning_rate": 9.180220561469167e-05,
"loss": 2.2574,
"step": 602
},
{
"epoch": 0.19860842356622338,
"grad_norm": 2.327986717224121,
"learning_rate": 9.177331990475685e-05,
"loss": 2.4616,
"step": 603
},
{
"epoch": 0.19893779076948412,
"grad_norm": 3.0551161766052246,
"learning_rate": 9.174438795468734e-05,
"loss": 2.602,
"step": 604
},
{
"epoch": 0.19926715797274486,
"grad_norm": 2.404407024383545,
"learning_rate": 9.171540979650879e-05,
"loss": 2.4922,
"step": 605
},
{
"epoch": 0.1995965251760056,
"grad_norm": 2.458751916885376,
"learning_rate": 9.168638546229796e-05,
"loss": 2.3752,
"step": 606
},
{
"epoch": 0.19992589237926633,
"grad_norm": 3.044050693511963,
"learning_rate": 9.165731498418277e-05,
"loss": 2.6778,
"step": 607
},
{
"epoch": 0.20025525958252707,
"grad_norm": 2.5065767765045166,
"learning_rate": 9.162819839434223e-05,
"loss": 2.3761,
"step": 608
},
{
"epoch": 0.2005846267857878,
"grad_norm": 2.5272152423858643,
"learning_rate": 9.15990357250063e-05,
"loss": 2.2685,
"step": 609
},
{
"epoch": 0.20091399398904855,
"grad_norm": 3.1523914337158203,
"learning_rate": 9.156982700845606e-05,
"loss": 2.516,
"step": 610
},
{
"epoch": 0.2012433611923093,
"grad_norm": 2.7383854389190674,
"learning_rate": 9.154057227702348e-05,
"loss": 2.4093,
"step": 611
},
{
"epoch": 0.20157272839557,
"grad_norm": 3.010639190673828,
"learning_rate": 9.151127156309151e-05,
"loss": 2.5929,
"step": 612
},
{
"epoch": 0.20190209559883074,
"grad_norm": 3.279294967651367,
"learning_rate": 9.1481924899094e-05,
"loss": 2.266,
"step": 613
},
{
"epoch": 0.20223146280209148,
"grad_norm": 3.147496223449707,
"learning_rate": 9.145253231751563e-05,
"loss": 2.1337,
"step": 614
},
{
"epoch": 0.20256083000535222,
"grad_norm": 2.710343599319458,
"learning_rate": 9.142309385089191e-05,
"loss": 2.0808,
"step": 615
},
{
"epoch": 0.20289019720861295,
"grad_norm": 2.6864521503448486,
"learning_rate": 9.139360953180918e-05,
"loss": 2.1494,
"step": 616
},
{
"epoch": 0.2032195644118737,
"grad_norm": 2.7441837787628174,
"learning_rate": 9.136407939290451e-05,
"loss": 2.5828,
"step": 617
},
{
"epoch": 0.20354893161513443,
"grad_norm": 3.46586537361145,
"learning_rate": 9.13345034668657e-05,
"loss": 2.6775,
"step": 618
},
{
"epoch": 0.20387829881839517,
"grad_norm": 2.7498793601989746,
"learning_rate": 9.130488178643119e-05,
"loss": 2.0033,
"step": 619
},
{
"epoch": 0.20420766602165588,
"grad_norm": 3.1091184616088867,
"learning_rate": 9.127521438439015e-05,
"loss": 2.4799,
"step": 620
},
{
"epoch": 0.20453703322491662,
"grad_norm": 2.3706719875335693,
"learning_rate": 9.124550129358227e-05,
"loss": 1.8221,
"step": 621
},
{
"epoch": 0.20486640042817736,
"grad_norm": 2.63212251663208,
"learning_rate": 9.121574254689788e-05,
"loss": 2.0923,
"step": 622
},
{
"epoch": 0.2051957676314381,
"grad_norm": 3.275517225265503,
"learning_rate": 9.118593817727782e-05,
"loss": 2.1223,
"step": 623
},
{
"epoch": 0.20552513483469884,
"grad_norm": 3.2868645191192627,
"learning_rate": 9.115608821771347e-05,
"loss": 2.1894,
"step": 624
},
{
"epoch": 0.20585450203795957,
"grad_norm": 3.2911734580993652,
"learning_rate": 9.112619270124658e-05,
"loss": 2.0714,
"step": 625
},
{
"epoch": 0.2061838692412203,
"grad_norm": 1.8351259231567383,
"learning_rate": 9.109625166096942e-05,
"loss": 2.5735,
"step": 626
},
{
"epoch": 0.20651323644448105,
"grad_norm": 2.289583206176758,
"learning_rate": 9.106626513002464e-05,
"loss": 2.2809,
"step": 627
},
{
"epoch": 0.20684260364774176,
"grad_norm": 2.3926987648010254,
"learning_rate": 9.103623314160518e-05,
"loss": 2.4775,
"step": 628
},
{
"epoch": 0.2071719708510025,
"grad_norm": 2.8485803604125977,
"learning_rate": 9.100615572895439e-05,
"loss": 2.5643,
"step": 629
},
{
"epoch": 0.20750133805426324,
"grad_norm": 2.601393461227417,
"learning_rate": 9.097603292536583e-05,
"loss": 2.1331,
"step": 630
},
{
"epoch": 0.20783070525752398,
"grad_norm": 2.364741086959839,
"learning_rate": 9.094586476418335e-05,
"loss": 2.2653,
"step": 631
},
{
"epoch": 0.20816007246078472,
"grad_norm": 2.307363510131836,
"learning_rate": 9.091565127880096e-05,
"loss": 2.8223,
"step": 632
},
{
"epoch": 0.20848943966404546,
"grad_norm": 2.207406997680664,
"learning_rate": 9.088539250266287e-05,
"loss": 2.2839,
"step": 633
},
{
"epoch": 0.2088188068673062,
"grad_norm": 2.271679401397705,
"learning_rate": 9.085508846926345e-05,
"loss": 2.1932,
"step": 634
},
{
"epoch": 0.20914817407056693,
"grad_norm": 2.566964864730835,
"learning_rate": 9.082473921214714e-05,
"loss": 2.2745,
"step": 635
},
{
"epoch": 0.20947754127382767,
"grad_norm": 2.6665945053100586,
"learning_rate": 9.07943447649084e-05,
"loss": 2.2285,
"step": 636
},
{
"epoch": 0.20980690847708838,
"grad_norm": 2.232455253601074,
"learning_rate": 9.07639051611918e-05,
"loss": 2.0741,
"step": 637
},
{
"epoch": 0.21013627568034912,
"grad_norm": 2.365143299102783,
"learning_rate": 9.07334204346918e-05,
"loss": 1.9137,
"step": 638
},
{
"epoch": 0.21046564288360986,
"grad_norm": 2.6072494983673096,
"learning_rate": 9.070289061915289e-05,
"loss": 2.235,
"step": 639
},
{
"epoch": 0.2107950100868706,
"grad_norm": 2.8605432510375977,
"learning_rate": 9.06723157483694e-05,
"loss": 2.2824,
"step": 640
},
{
"epoch": 0.21112437729013134,
"grad_norm": 2.70151424407959,
"learning_rate": 9.064169585618561e-05,
"loss": 2.202,
"step": 641
},
{
"epoch": 0.21145374449339208,
"grad_norm": 2.540966272354126,
"learning_rate": 9.061103097649554e-05,
"loss": 2.0908,
"step": 642
},
{
"epoch": 0.21178311169665281,
"grad_norm": 2.4950785636901855,
"learning_rate": 9.05803211432431e-05,
"loss": 1.9015,
"step": 643
},
{
"epoch": 0.21211247889991355,
"grad_norm": 2.925896406173706,
"learning_rate": 9.054956639042194e-05,
"loss": 2.2201,
"step": 644
},
{
"epoch": 0.21244184610317426,
"grad_norm": 3.0082502365112305,
"learning_rate": 9.051876675207535e-05,
"loss": 2.3677,
"step": 645
},
{
"epoch": 0.212771213306435,
"grad_norm": 2.554163694381714,
"learning_rate": 9.048792226229642e-05,
"loss": 1.9325,
"step": 646
},
{
"epoch": 0.21310058050969574,
"grad_norm": 2.7593939304351807,
"learning_rate": 9.04570329552278e-05,
"loss": 2.2431,
"step": 647
},
{
"epoch": 0.21342994771295648,
"grad_norm": 3.1389267444610596,
"learning_rate": 9.042609886506183e-05,
"loss": 2.2714,
"step": 648
},
{
"epoch": 0.21375931491621722,
"grad_norm": 2.6484322547912598,
"learning_rate": 9.039512002604034e-05,
"loss": 2.0055,
"step": 649
},
{
"epoch": 0.21408868211947796,
"grad_norm": 3.6394948959350586,
"learning_rate": 9.036409647245474e-05,
"loss": 2.2731,
"step": 650
},
{
"epoch": 0.2144180493227387,
"grad_norm": 1.968092679977417,
"learning_rate": 9.033302823864595e-05,
"loss": 2.3822,
"step": 651
},
{
"epoch": 0.21474741652599943,
"grad_norm": 2.4255237579345703,
"learning_rate": 9.03019153590043e-05,
"loss": 2.5761,
"step": 652
},
{
"epoch": 0.21507678372926015,
"grad_norm": 2.2358973026275635,
"learning_rate": 9.027075786796957e-05,
"loss": 2.2964,
"step": 653
},
{
"epoch": 0.21540615093252088,
"grad_norm": 3.0591135025024414,
"learning_rate": 9.023955580003092e-05,
"loss": 2.609,
"step": 654
},
{
"epoch": 0.21573551813578162,
"grad_norm": 2.4640660285949707,
"learning_rate": 9.020830918972684e-05,
"loss": 2.4714,
"step": 655
},
{
"epoch": 0.21606488533904236,
"grad_norm": 2.639633893966675,
"learning_rate": 9.017701807164516e-05,
"loss": 1.9724,
"step": 656
},
{
"epoch": 0.2163942525423031,
"grad_norm": 2.35853910446167,
"learning_rate": 9.014568248042292e-05,
"loss": 2.1988,
"step": 657
},
{
"epoch": 0.21672361974556384,
"grad_norm": 2.6157610416412354,
"learning_rate": 9.011430245074645e-05,
"loss": 2.4365,
"step": 658
},
{
"epoch": 0.21705298694882458,
"grad_norm": 2.606468439102173,
"learning_rate": 9.008287801735124e-05,
"loss": 2.1511,
"step": 659
},
{
"epoch": 0.21738235415208532,
"grad_norm": 2.8526015281677246,
"learning_rate": 9.005140921502193e-05,
"loss": 2.366,
"step": 660
},
{
"epoch": 0.21771172135534606,
"grad_norm": 2.663635492324829,
"learning_rate": 9.001989607859226e-05,
"loss": 2.418,
"step": 661
},
{
"epoch": 0.21804108855860677,
"grad_norm": 2.6935718059539795,
"learning_rate": 8.998833864294507e-05,
"loss": 2.327,
"step": 662
},
{
"epoch": 0.2183704557618675,
"grad_norm": 2.840543031692505,
"learning_rate": 8.995673694301223e-05,
"loss": 2.106,
"step": 663
},
{
"epoch": 0.21869982296512824,
"grad_norm": 2.640491485595703,
"learning_rate": 8.99250910137746e-05,
"loss": 1.9047,
"step": 664
},
{
"epoch": 0.21902919016838898,
"grad_norm": 2.4656693935394287,
"learning_rate": 8.989340089026203e-05,
"loss": 1.9766,
"step": 665
},
{
"epoch": 0.21935855737164972,
"grad_norm": 3.2263641357421875,
"learning_rate": 8.986166660755321e-05,
"loss": 2.4087,
"step": 666
},
{
"epoch": 0.21968792457491046,
"grad_norm": 2.758763313293457,
"learning_rate": 8.982988820077582e-05,
"loss": 2.2525,
"step": 667
},
{
"epoch": 0.2200172917781712,
"grad_norm": 2.882479429244995,
"learning_rate": 8.979806570510631e-05,
"loss": 2.3654,
"step": 668
},
{
"epoch": 0.22034665898143194,
"grad_norm": 2.9043984413146973,
"learning_rate": 8.976619915576994e-05,
"loss": 1.992,
"step": 669
},
{
"epoch": 0.22067602618469265,
"grad_norm": 2.918984889984131,
"learning_rate": 8.973428858804073e-05,
"loss": 2.2139,
"step": 670
},
{
"epoch": 0.22100539338795339,
"grad_norm": 2.9728167057037354,
"learning_rate": 8.970233403724146e-05,
"loss": 2.0774,
"step": 671
},
{
"epoch": 0.22133476059121412,
"grad_norm": 2.9786489009857178,
"learning_rate": 8.96703355387436e-05,
"loss": 2.1803,
"step": 672
},
{
"epoch": 0.22166412779447486,
"grad_norm": 3.05253267288208,
"learning_rate": 8.963829312796718e-05,
"loss": 2.1476,
"step": 673
},
{
"epoch": 0.2219934949977356,
"grad_norm": 2.640706777572632,
"learning_rate": 8.960620684038097e-05,
"loss": 2.1194,
"step": 674
},
{
"epoch": 0.22232286220099634,
"grad_norm": 3.7444612979888916,
"learning_rate": 8.95740767115022e-05,
"loss": 2.1602,
"step": 675
},
{
"epoch": 0.22265222940425708,
"grad_norm": 1.9276431798934937,
"learning_rate": 8.95419027768967e-05,
"loss": 2.1702,
"step": 676
},
{
"epoch": 0.22298159660751782,
"grad_norm": 2.062800168991089,
"learning_rate": 8.95096850721787e-05,
"loss": 2.2015,
"step": 677
},
{
"epoch": 0.22331096381077853,
"grad_norm": 2.143061637878418,
"learning_rate": 8.947742363301098e-05,
"loss": 2.4098,
"step": 678
},
{
"epoch": 0.22364033101403927,
"grad_norm": 2.5208868980407715,
"learning_rate": 8.944511849510469e-05,
"loss": 2.5313,
"step": 679
},
{
"epoch": 0.2239696982173,
"grad_norm": 2.5163257122039795,
"learning_rate": 8.941276969421935e-05,
"loss": 2.4371,
"step": 680
},
{
"epoch": 0.22429906542056074,
"grad_norm": 2.5549163818359375,
"learning_rate": 8.938037726616281e-05,
"loss": 2.3066,
"step": 681
},
{
"epoch": 0.22462843262382148,
"grad_norm": 2.5244922637939453,
"learning_rate": 8.934794124679121e-05,
"loss": 2.4772,
"step": 682
},
{
"epoch": 0.22495779982708222,
"grad_norm": 2.399825096130371,
"learning_rate": 8.931546167200895e-05,
"loss": 2.5803,
"step": 683
},
{
"epoch": 0.22528716703034296,
"grad_norm": 2.410311460494995,
"learning_rate": 8.928293857776866e-05,
"loss": 2.0751,
"step": 684
},
{
"epoch": 0.2256165342336037,
"grad_norm": 2.29736590385437,
"learning_rate": 8.925037200007109e-05,
"loss": 2.2536,
"step": 685
},
{
"epoch": 0.22594590143686444,
"grad_norm": 2.3026459217071533,
"learning_rate": 8.921776197496518e-05,
"loss": 2.0917,
"step": 686
},
{
"epoch": 0.22627526864012515,
"grad_norm": 2.490049362182617,
"learning_rate": 8.918510853854794e-05,
"loss": 2.0983,
"step": 687
},
{
"epoch": 0.2266046358433859,
"grad_norm": 3.218235969543457,
"learning_rate": 8.915241172696441e-05,
"loss": 2.3049,
"step": 688
},
{
"epoch": 0.22693400304664663,
"grad_norm": 2.5197603702545166,
"learning_rate": 8.911967157640771e-05,
"loss": 2.1897,
"step": 689
},
{
"epoch": 0.22726337024990736,
"grad_norm": 2.4397006034851074,
"learning_rate": 8.908688812311884e-05,
"loss": 1.9715,
"step": 690
},
{
"epoch": 0.2275927374531681,
"grad_norm": 2.9002199172973633,
"learning_rate": 8.905406140338683e-05,
"loss": 2.2867,
"step": 691
},
{
"epoch": 0.22792210465642884,
"grad_norm": 2.6698174476623535,
"learning_rate": 8.902119145354852e-05,
"loss": 2.4758,
"step": 692
},
{
"epoch": 0.22825147185968958,
"grad_norm": 3.1235620975494385,
"learning_rate": 8.898827830998864e-05,
"loss": 2.1978,
"step": 693
},
{
"epoch": 0.22858083906295032,
"grad_norm": 2.7362828254699707,
"learning_rate": 8.895532200913976e-05,
"loss": 2.326,
"step": 694
},
{
"epoch": 0.22891020626621103,
"grad_norm": 2.6940107345581055,
"learning_rate": 8.892232258748217e-05,
"loss": 2.0774,
"step": 695
},
{
"epoch": 0.22923957346947177,
"grad_norm": 2.963489294052124,
"learning_rate": 8.888928008154393e-05,
"loss": 2.2016,
"step": 696
},
{
"epoch": 0.2295689406727325,
"grad_norm": 2.9156124591827393,
"learning_rate": 8.885619452790078e-05,
"loss": 2.4717,
"step": 697
},
{
"epoch": 0.22989830787599325,
"grad_norm": 3.306898355484009,
"learning_rate": 8.882306596317606e-05,
"loss": 2.2373,
"step": 698
},
{
"epoch": 0.23022767507925399,
"grad_norm": 2.8223652839660645,
"learning_rate": 8.878989442404082e-05,
"loss": 1.9008,
"step": 699
},
{
"epoch": 0.23055704228251472,
"grad_norm": 2.8709568977355957,
"learning_rate": 8.87566799472136e-05,
"loss": 1.9383,
"step": 700
},
{
"epoch": 0.23088640948577546,
"grad_norm": 2.175177574157715,
"learning_rate": 8.872342256946051e-05,
"loss": 2.3998,
"step": 701
},
{
"epoch": 0.2312157766890362,
"grad_norm": 2.408965587615967,
"learning_rate": 8.869012232759512e-05,
"loss": 2.6233,
"step": 702
},
{
"epoch": 0.2315451438922969,
"grad_norm": 2.2874746322631836,
"learning_rate": 8.865677925847848e-05,
"loss": 2.0586,
"step": 703
},
{
"epoch": 0.23187451109555765,
"grad_norm": 2.436617374420166,
"learning_rate": 8.862339339901902e-05,
"loss": 2.324,
"step": 704
},
{
"epoch": 0.2322038782988184,
"grad_norm": 2.8110032081604004,
"learning_rate": 8.858996478617253e-05,
"loss": 2.4255,
"step": 705
},
{
"epoch": 0.23253324550207913,
"grad_norm": 2.681027889251709,
"learning_rate": 8.855649345694216e-05,
"loss": 2.5485,
"step": 706
},
{
"epoch": 0.23286261270533987,
"grad_norm": 2.6605820655822754,
"learning_rate": 8.852297944837831e-05,
"loss": 2.5063,
"step": 707
},
{
"epoch": 0.2331919799086006,
"grad_norm": 2.8735132217407227,
"learning_rate": 8.848942279757864e-05,
"loss": 2.1953,
"step": 708
},
{
"epoch": 0.23352134711186134,
"grad_norm": 2.5370914936065674,
"learning_rate": 8.845582354168802e-05,
"loss": 1.9245,
"step": 709
},
{
"epoch": 0.23385071431512208,
"grad_norm": 2.733022928237915,
"learning_rate": 8.842218171789846e-05,
"loss": 2.109,
"step": 710
},
{
"epoch": 0.2341800815183828,
"grad_norm": 3.819556951522827,
"learning_rate": 8.838849736344909e-05,
"loss": 2.405,
"step": 711
},
{
"epoch": 0.23450944872164353,
"grad_norm": 2.486302137374878,
"learning_rate": 8.835477051562613e-05,
"loss": 2.1244,
"step": 712
},
{
"epoch": 0.23483881592490427,
"grad_norm": 3.0748279094696045,
"learning_rate": 8.832100121176285e-05,
"loss": 2.2095,
"step": 713
},
{
"epoch": 0.235168183128165,
"grad_norm": 3.035090208053589,
"learning_rate": 8.828718948923949e-05,
"loss": 1.9369,
"step": 714
},
{
"epoch": 0.23549755033142575,
"grad_norm": 2.62367582321167,
"learning_rate": 8.825333538548326e-05,
"loss": 2.1745,
"step": 715
},
{
"epoch": 0.2358269175346865,
"grad_norm": 2.8850131034851074,
"learning_rate": 8.821943893796826e-05,
"loss": 2.3818,
"step": 716
},
{
"epoch": 0.23615628473794723,
"grad_norm": 3.0183825492858887,
"learning_rate": 8.81855001842155e-05,
"loss": 2.2054,
"step": 717
},
{
"epoch": 0.23648565194120796,
"grad_norm": 2.698676586151123,
"learning_rate": 8.81515191617928e-05,
"loss": 2.24,
"step": 718
},
{
"epoch": 0.2368150191444687,
"grad_norm": 2.8108513355255127,
"learning_rate": 8.811749590831475e-05,
"loss": 2.1469,
"step": 719
},
{
"epoch": 0.2371443863477294,
"grad_norm": 2.8320016860961914,
"learning_rate": 8.808343046144271e-05,
"loss": 2.1924,
"step": 720
},
{
"epoch": 0.23747375355099015,
"grad_norm": 3.3814282417297363,
"learning_rate": 8.804932285888477e-05,
"loss": 2.3818,
"step": 721
},
{
"epoch": 0.2378031207542509,
"grad_norm": 3.158623695373535,
"learning_rate": 8.80151731383956e-05,
"loss": 2.3962,
"step": 722
},
{
"epoch": 0.23813248795751163,
"grad_norm": 2.743208646774292,
"learning_rate": 8.798098133777659e-05,
"loss": 1.779,
"step": 723
},
{
"epoch": 0.23846185516077237,
"grad_norm": 3.1935389041900635,
"learning_rate": 8.794674749487565e-05,
"loss": 2.1095,
"step": 724
},
{
"epoch": 0.2387912223640331,
"grad_norm": 3.515878438949585,
"learning_rate": 8.791247164758722e-05,
"loss": 1.9777,
"step": 725
},
{
"epoch": 0.23912058956729385,
"grad_norm": 2.7194595336914062,
"learning_rate": 8.78781538338523e-05,
"loss": 2.3815,
"step": 726
},
{
"epoch": 0.23944995677055458,
"grad_norm": 2.3084821701049805,
"learning_rate": 8.784379409165828e-05,
"loss": 2.2721,
"step": 727
},
{
"epoch": 0.2397793239738153,
"grad_norm": 2.4673047065734863,
"learning_rate": 8.780939245903898e-05,
"loss": 2.1049,
"step": 728
},
{
"epoch": 0.24010869117707603,
"grad_norm": 2.5129013061523438,
"learning_rate": 8.77749489740746e-05,
"loss": 2.3995,
"step": 729
},
{
"epoch": 0.24043805838033677,
"grad_norm": 2.559809923171997,
"learning_rate": 8.774046367489166e-05,
"loss": 2.3422,
"step": 730
},
{
"epoch": 0.2407674255835975,
"grad_norm": 2.3660426139831543,
"learning_rate": 8.770593659966298e-05,
"loss": 2.0759,
"step": 731
},
{
"epoch": 0.24109679278685825,
"grad_norm": 2.502711057662964,
"learning_rate": 8.767136778660759e-05,
"loss": 2.2224,
"step": 732
},
{
"epoch": 0.241426159990119,
"grad_norm": 2.396199941635132,
"learning_rate": 8.763675727399075e-05,
"loss": 2.4539,
"step": 733
},
{
"epoch": 0.24175552719337973,
"grad_norm": 2.830613851547241,
"learning_rate": 8.760210510012387e-05,
"loss": 2.4058,
"step": 734
},
{
"epoch": 0.24208489439664047,
"grad_norm": 2.6569724082946777,
"learning_rate": 8.756741130336448e-05,
"loss": 2.4416,
"step": 735
},
{
"epoch": 0.24241426159990118,
"grad_norm": 3.4117109775543213,
"learning_rate": 8.753267592211616e-05,
"loss": 2.483,
"step": 736
},
{
"epoch": 0.24274362880316191,
"grad_norm": 2.8380532264709473,
"learning_rate": 8.749789899482856e-05,
"loss": 2.4988,
"step": 737
},
{
"epoch": 0.24307299600642265,
"grad_norm": 2.325429916381836,
"learning_rate": 8.74630805599973e-05,
"loss": 1.8716,
"step": 738
},
{
"epoch": 0.2434023632096834,
"grad_norm": 2.5336060523986816,
"learning_rate": 8.742822065616393e-05,
"loss": 1.8746,
"step": 739
},
{
"epoch": 0.24373173041294413,
"grad_norm": 2.6919002532958984,
"learning_rate": 8.739331932191592e-05,
"loss": 2.2107,
"step": 740
},
{
"epoch": 0.24406109761620487,
"grad_norm": 3.1390397548675537,
"learning_rate": 8.735837659588661e-05,
"loss": 2.5521,
"step": 741
},
{
"epoch": 0.2443904648194656,
"grad_norm": 2.4885716438293457,
"learning_rate": 8.732339251675516e-05,
"loss": 2.1718,
"step": 742
},
{
"epoch": 0.24471983202272635,
"grad_norm": 2.5541298389434814,
"learning_rate": 8.728836712324646e-05,
"loss": 2.1153,
"step": 743
},
{
"epoch": 0.24504919922598709,
"grad_norm": 2.7616307735443115,
"learning_rate": 8.725330045413117e-05,
"loss": 2.0635,
"step": 744
},
{
"epoch": 0.2453785664292478,
"grad_norm": 2.8875701427459717,
"learning_rate": 8.721819254822565e-05,
"loss": 2.3036,
"step": 745
},
{
"epoch": 0.24570793363250854,
"grad_norm": 3.5547261238098145,
"learning_rate": 8.718304344439186e-05,
"loss": 2.5285,
"step": 746
},
{
"epoch": 0.24603730083576927,
"grad_norm": 3.126332998275757,
"learning_rate": 8.714785318153742e-05,
"loss": 2.1983,
"step": 747
},
{
"epoch": 0.24636666803903,
"grad_norm": 2.997291088104248,
"learning_rate": 8.711262179861547e-05,
"loss": 2.2518,
"step": 748
},
{
"epoch": 0.24669603524229075,
"grad_norm": 2.786508083343506,
"learning_rate": 8.70773493346247e-05,
"loss": 1.8481,
"step": 749
},
{
"epoch": 0.2470254024455515,
"grad_norm": 3.188028573989868,
"learning_rate": 8.704203582860922e-05,
"loss": 1.929,
"step": 750
},
{
"epoch": 0.24735476964881223,
"grad_norm": 1.924673318862915,
"learning_rate": 8.700668131965861e-05,
"loss": 2.319,
"step": 751
},
{
"epoch": 0.24768413685207297,
"grad_norm": 2.163625955581665,
"learning_rate": 8.697128584690785e-05,
"loss": 2.2707,
"step": 752
},
{
"epoch": 0.24801350405533368,
"grad_norm": 2.0983192920684814,
"learning_rate": 8.693584944953723e-05,
"loss": 2.4019,
"step": 753
},
{
"epoch": 0.24834287125859442,
"grad_norm": 2.2360665798187256,
"learning_rate": 8.690037216677236e-05,
"loss": 2.5004,
"step": 754
},
{
"epoch": 0.24867223846185516,
"grad_norm": 2.301961660385132,
"learning_rate": 8.686485403788411e-05,
"loss": 2.2912,
"step": 755
},
{
"epoch": 0.2490016056651159,
"grad_norm": 2.593334436416626,
"learning_rate": 8.682929510218855e-05,
"loss": 2.7432,
"step": 756
},
{
"epoch": 0.24933097286837663,
"grad_norm": 2.3250508308410645,
"learning_rate": 8.679369539904693e-05,
"loss": 2.525,
"step": 757
},
{
"epoch": 0.24966034007163737,
"grad_norm": 2.4877517223358154,
"learning_rate": 8.675805496786563e-05,
"loss": 2.2531,
"step": 758
},
{
"epoch": 0.2499897072748981,
"grad_norm": 2.423072099685669,
"learning_rate": 8.672237384809609e-05,
"loss": 2.3282,
"step": 759
},
{
"epoch": 0.2499897072748981,
"eval_loss": 2.2632205486297607,
"eval_runtime": 795.2953,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 1.608,
"step": 759
}
],
"logging_steps": 1,
"max_steps": 3036,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 759,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.164145589886124e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}