gemma3_4b_it_x01 / trainer_state.json
sumuks's picture
Upload checkpoint from outputs/gemma3_4b_it_x01/checkpoint-2954
8606e91 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 296,
"global_step": 2954,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003386171721233413,
"grad_norm": 29.75,
"learning_rate": 0.0,
"loss": 0.5027,
"step": 1
},
{
"epoch": 0.0006772343442466826,
"grad_norm": 35.25,
"learning_rate": 1.3513513513513515e-07,
"loss": 0.5494,
"step": 2
},
{
"epoch": 0.001015851516370024,
"grad_norm": 28.5,
"learning_rate": 2.702702702702703e-07,
"loss": 0.4482,
"step": 3
},
{
"epoch": 0.0013544686884933651,
"grad_norm": 33.5,
"learning_rate": 4.0540540540540546e-07,
"loss": 0.5661,
"step": 4
},
{
"epoch": 0.0016930858606167066,
"grad_norm": 30.25,
"learning_rate": 5.405405405405406e-07,
"loss": 0.4796,
"step": 5
},
{
"epoch": 0.002031703032740048,
"grad_norm": 34.25,
"learning_rate": 6.756756756756758e-07,
"loss": 0.515,
"step": 6
},
{
"epoch": 0.0023703202048633893,
"grad_norm": 31.875,
"learning_rate": 8.108108108108109e-07,
"loss": 0.4466,
"step": 7
},
{
"epoch": 0.0027089373769867303,
"grad_norm": 30.0,
"learning_rate": 9.459459459459461e-07,
"loss": 0.4354,
"step": 8
},
{
"epoch": 0.0030475545491100717,
"grad_norm": 28.0,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.4231,
"step": 9
},
{
"epoch": 0.003386171721233413,
"grad_norm": 19.375,
"learning_rate": 1.2162162162162164e-06,
"loss": 0.3424,
"step": 10
},
{
"epoch": 0.003724788893356754,
"grad_norm": 21.875,
"learning_rate": 1.3513513513513515e-06,
"loss": 0.37,
"step": 11
},
{
"epoch": 0.004063406065480096,
"grad_norm": 19.0,
"learning_rate": 1.4864864864864868e-06,
"loss": 0.3439,
"step": 12
},
{
"epoch": 0.004402023237603437,
"grad_norm": 15.0625,
"learning_rate": 1.6216216216216219e-06,
"loss": 0.2812,
"step": 13
},
{
"epoch": 0.0047406404097267785,
"grad_norm": 16.0,
"learning_rate": 1.756756756756757e-06,
"loss": 0.335,
"step": 14
},
{
"epoch": 0.0050792575818501195,
"grad_norm": 12.5,
"learning_rate": 1.8918918918918922e-06,
"loss": 0.2898,
"step": 15
},
{
"epoch": 0.0054178747539734605,
"grad_norm": 11.6875,
"learning_rate": 2.0270270270270273e-06,
"loss": 0.2427,
"step": 16
},
{
"epoch": 0.005756491926096802,
"grad_norm": 9.625,
"learning_rate": 2.1621621621621623e-06,
"loss": 0.2346,
"step": 17
},
{
"epoch": 0.006095109098220143,
"grad_norm": 8.0625,
"learning_rate": 2.297297297297298e-06,
"loss": 0.2305,
"step": 18
},
{
"epoch": 0.006433726270343484,
"grad_norm": 6.46875,
"learning_rate": 2.432432432432433e-06,
"loss": 0.1882,
"step": 19
},
{
"epoch": 0.006772343442466826,
"grad_norm": 5.125,
"learning_rate": 2.5675675675675675e-06,
"loss": 0.1815,
"step": 20
},
{
"epoch": 0.007110960614590167,
"grad_norm": 4.15625,
"learning_rate": 2.702702702702703e-06,
"loss": 0.1453,
"step": 21
},
{
"epoch": 0.007449577786713508,
"grad_norm": 4.3125,
"learning_rate": 2.837837837837838e-06,
"loss": 0.1595,
"step": 22
},
{
"epoch": 0.00778819495883685,
"grad_norm": 4.4375,
"learning_rate": 2.9729729729729736e-06,
"loss": 0.1914,
"step": 23
},
{
"epoch": 0.008126812130960191,
"grad_norm": 3.703125,
"learning_rate": 3.1081081081081082e-06,
"loss": 0.1724,
"step": 24
},
{
"epoch": 0.008465429303083532,
"grad_norm": 2.65625,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.1183,
"step": 25
},
{
"epoch": 0.008804046475206873,
"grad_norm": 3.09375,
"learning_rate": 3.3783783783783788e-06,
"loss": 0.165,
"step": 26
},
{
"epoch": 0.009142663647330216,
"grad_norm": 2.546875,
"learning_rate": 3.513513513513514e-06,
"loss": 0.1775,
"step": 27
},
{
"epoch": 0.009481280819453557,
"grad_norm": 1.71875,
"learning_rate": 3.648648648648649e-06,
"loss": 0.1276,
"step": 28
},
{
"epoch": 0.009819897991576898,
"grad_norm": 1.953125,
"learning_rate": 3.7837837837837844e-06,
"loss": 0.1605,
"step": 29
},
{
"epoch": 0.010158515163700239,
"grad_norm": 1.703125,
"learning_rate": 3.918918918918919e-06,
"loss": 0.1486,
"step": 30
},
{
"epoch": 0.01049713233582358,
"grad_norm": 1.90625,
"learning_rate": 4.0540540540540545e-06,
"loss": 0.1535,
"step": 31
},
{
"epoch": 0.010835749507946921,
"grad_norm": 1.5625,
"learning_rate": 4.189189189189189e-06,
"loss": 0.1189,
"step": 32
},
{
"epoch": 0.011174366680070264,
"grad_norm": 1.828125,
"learning_rate": 4.324324324324325e-06,
"loss": 0.1359,
"step": 33
},
{
"epoch": 0.011512983852193605,
"grad_norm": 1.328125,
"learning_rate": 4.45945945945946e-06,
"loss": 0.1289,
"step": 34
},
{
"epoch": 0.011851601024316946,
"grad_norm": 1.09375,
"learning_rate": 4.594594594594596e-06,
"loss": 0.1033,
"step": 35
},
{
"epoch": 0.012190218196440287,
"grad_norm": 1.3671875,
"learning_rate": 4.72972972972973e-06,
"loss": 0.149,
"step": 36
},
{
"epoch": 0.012528835368563628,
"grad_norm": 1.03125,
"learning_rate": 4.864864864864866e-06,
"loss": 0.0975,
"step": 37
},
{
"epoch": 0.012867452540686969,
"grad_norm": 1.0859375,
"learning_rate": 5e-06,
"loss": 0.0938,
"step": 38
},
{
"epoch": 0.013206069712810312,
"grad_norm": 1.0625,
"learning_rate": 5.135135135135135e-06,
"loss": 0.1031,
"step": 39
},
{
"epoch": 0.013544686884933653,
"grad_norm": 1.1796875,
"learning_rate": 5.2702702702702705e-06,
"loss": 0.1088,
"step": 40
},
{
"epoch": 0.013883304057056994,
"grad_norm": 1.2421875,
"learning_rate": 5.405405405405406e-06,
"loss": 0.1207,
"step": 41
},
{
"epoch": 0.014221921229180335,
"grad_norm": 1.703125,
"learning_rate": 5.540540540540541e-06,
"loss": 0.2261,
"step": 42
},
{
"epoch": 0.014560538401303676,
"grad_norm": 1.4765625,
"learning_rate": 5.675675675675676e-06,
"loss": 0.12,
"step": 43
},
{
"epoch": 0.014899155573427017,
"grad_norm": 1.21875,
"learning_rate": 5.810810810810811e-06,
"loss": 0.1243,
"step": 44
},
{
"epoch": 0.01523777274555036,
"grad_norm": 1.140625,
"learning_rate": 5.945945945945947e-06,
"loss": 0.1028,
"step": 45
},
{
"epoch": 0.0155763899176737,
"grad_norm": 1.1171875,
"learning_rate": 6.081081081081082e-06,
"loss": 0.1044,
"step": 46
},
{
"epoch": 0.01591500708979704,
"grad_norm": 1.25,
"learning_rate": 6.2162162162162164e-06,
"loss": 0.1267,
"step": 47
},
{
"epoch": 0.016253624261920382,
"grad_norm": 0.90625,
"learning_rate": 6.351351351351351e-06,
"loss": 0.0914,
"step": 48
},
{
"epoch": 0.016592241434043725,
"grad_norm": 1.0546875,
"learning_rate": 6.486486486486487e-06,
"loss": 0.1085,
"step": 49
},
{
"epoch": 0.016930858606167064,
"grad_norm": 0.87109375,
"learning_rate": 6.621621621621622e-06,
"loss": 0.0949,
"step": 50
},
{
"epoch": 0.017269475778290407,
"grad_norm": 0.8671875,
"learning_rate": 6.7567567567567575e-06,
"loss": 0.0934,
"step": 51
},
{
"epoch": 0.017608092950413747,
"grad_norm": 1.2734375,
"learning_rate": 6.891891891891892e-06,
"loss": 0.1231,
"step": 52
},
{
"epoch": 0.01794671012253709,
"grad_norm": 0.84375,
"learning_rate": 7.027027027027028e-06,
"loss": 0.0946,
"step": 53
},
{
"epoch": 0.018285327294660432,
"grad_norm": 1.3671875,
"learning_rate": 7.162162162162163e-06,
"loss": 0.1202,
"step": 54
},
{
"epoch": 0.01862394446678377,
"grad_norm": 0.79296875,
"learning_rate": 7.297297297297298e-06,
"loss": 0.0802,
"step": 55
},
{
"epoch": 0.018962561638907114,
"grad_norm": 1.2109375,
"learning_rate": 7.4324324324324324e-06,
"loss": 0.1052,
"step": 56
},
{
"epoch": 0.019301178811030453,
"grad_norm": 0.91796875,
"learning_rate": 7.567567567567569e-06,
"loss": 0.0898,
"step": 57
},
{
"epoch": 0.019639795983153796,
"grad_norm": 0.9921875,
"learning_rate": 7.702702702702704e-06,
"loss": 0.1046,
"step": 58
},
{
"epoch": 0.019978413155277135,
"grad_norm": 0.87890625,
"learning_rate": 7.837837837837838e-06,
"loss": 0.0983,
"step": 59
},
{
"epoch": 0.020317030327400478,
"grad_norm": 0.97265625,
"learning_rate": 7.972972972972974e-06,
"loss": 0.0832,
"step": 60
},
{
"epoch": 0.02065564749952382,
"grad_norm": 1.0,
"learning_rate": 8.108108108108109e-06,
"loss": 0.1012,
"step": 61
},
{
"epoch": 0.02099426467164716,
"grad_norm": 1.0,
"learning_rate": 8.243243243243245e-06,
"loss": 0.1124,
"step": 62
},
{
"epoch": 0.021332881843770503,
"grad_norm": 0.73828125,
"learning_rate": 8.378378378378378e-06,
"loss": 0.1053,
"step": 63
},
{
"epoch": 0.021671499015893842,
"grad_norm": 0.828125,
"learning_rate": 8.513513513513514e-06,
"loss": 0.0782,
"step": 64
},
{
"epoch": 0.022010116188017185,
"grad_norm": 0.74609375,
"learning_rate": 8.64864864864865e-06,
"loss": 0.0739,
"step": 65
},
{
"epoch": 0.022348733360140528,
"grad_norm": 0.89453125,
"learning_rate": 8.783783783783785e-06,
"loss": 0.0715,
"step": 66
},
{
"epoch": 0.022687350532263867,
"grad_norm": 0.9765625,
"learning_rate": 8.91891891891892e-06,
"loss": 0.0913,
"step": 67
},
{
"epoch": 0.02302596770438721,
"grad_norm": 0.8359375,
"learning_rate": 9.054054054054054e-06,
"loss": 0.0844,
"step": 68
},
{
"epoch": 0.02336458487651055,
"grad_norm": 0.82421875,
"learning_rate": 9.189189189189191e-06,
"loss": 0.0825,
"step": 69
},
{
"epoch": 0.02370320204863389,
"grad_norm": 1.0234375,
"learning_rate": 9.324324324324325e-06,
"loss": 0.084,
"step": 70
},
{
"epoch": 0.02404181922075723,
"grad_norm": 0.9375,
"learning_rate": 9.45945945945946e-06,
"loss": 0.0933,
"step": 71
},
{
"epoch": 0.024380436392880574,
"grad_norm": 0.6796875,
"learning_rate": 9.594594594594594e-06,
"loss": 0.0835,
"step": 72
},
{
"epoch": 0.024719053565003916,
"grad_norm": 0.8203125,
"learning_rate": 9.729729729729732e-06,
"loss": 0.0814,
"step": 73
},
{
"epoch": 0.025057670737127256,
"grad_norm": 0.796875,
"learning_rate": 9.864864864864865e-06,
"loss": 0.079,
"step": 74
},
{
"epoch": 0.0253962879092506,
"grad_norm": 1.046875,
"learning_rate": 1e-05,
"loss": 0.0984,
"step": 75
},
{
"epoch": 0.025734905081373938,
"grad_norm": 0.73828125,
"learning_rate": 1.0135135135135136e-05,
"loss": 0.079,
"step": 76
},
{
"epoch": 0.02607352225349728,
"grad_norm": 0.765625,
"learning_rate": 1.027027027027027e-05,
"loss": 0.0811,
"step": 77
},
{
"epoch": 0.026412139425620623,
"grad_norm": 0.796875,
"learning_rate": 1.0405405405405407e-05,
"loss": 0.0819,
"step": 78
},
{
"epoch": 0.026750756597743963,
"grad_norm": 0.84765625,
"learning_rate": 1.0540540540540541e-05,
"loss": 0.0828,
"step": 79
},
{
"epoch": 0.027089373769867305,
"grad_norm": 0.86328125,
"learning_rate": 1.0675675675675677e-05,
"loss": 0.1061,
"step": 80
},
{
"epoch": 0.027427990941990645,
"grad_norm": 0.74609375,
"learning_rate": 1.0810810810810812e-05,
"loss": 0.0912,
"step": 81
},
{
"epoch": 0.027766608114113987,
"grad_norm": 0.78515625,
"learning_rate": 1.0945945945945946e-05,
"loss": 0.0702,
"step": 82
},
{
"epoch": 0.02810522528623733,
"grad_norm": 0.80078125,
"learning_rate": 1.1081081081081081e-05,
"loss": 0.0769,
"step": 83
},
{
"epoch": 0.02844384245836067,
"grad_norm": 0.73046875,
"learning_rate": 1.1216216216216219e-05,
"loss": 0.0786,
"step": 84
},
{
"epoch": 0.028782459630484012,
"grad_norm": 0.75,
"learning_rate": 1.1351351351351352e-05,
"loss": 0.0837,
"step": 85
},
{
"epoch": 0.02912107680260735,
"grad_norm": 0.75390625,
"learning_rate": 1.1486486486486488e-05,
"loss": 0.0993,
"step": 86
},
{
"epoch": 0.029459693974730694,
"grad_norm": 0.72265625,
"learning_rate": 1.1621621621621622e-05,
"loss": 0.0806,
"step": 87
},
{
"epoch": 0.029798311146854033,
"grad_norm": 0.8046875,
"learning_rate": 1.1756756756756757e-05,
"loss": 0.0988,
"step": 88
},
{
"epoch": 0.030136928318977376,
"grad_norm": 0.78515625,
"learning_rate": 1.1891891891891894e-05,
"loss": 0.0947,
"step": 89
},
{
"epoch": 0.03047554549110072,
"grad_norm": 0.859375,
"learning_rate": 1.2027027027027028e-05,
"loss": 0.0726,
"step": 90
},
{
"epoch": 0.030814162663224058,
"grad_norm": 0.90625,
"learning_rate": 1.2162162162162164e-05,
"loss": 0.0958,
"step": 91
},
{
"epoch": 0.0311527798353474,
"grad_norm": 0.65234375,
"learning_rate": 1.2297297297297299e-05,
"loss": 0.0637,
"step": 92
},
{
"epoch": 0.031491397007470744,
"grad_norm": 0.765625,
"learning_rate": 1.2432432432432433e-05,
"loss": 0.0935,
"step": 93
},
{
"epoch": 0.03183001417959408,
"grad_norm": 0.67578125,
"learning_rate": 1.2567567567567568e-05,
"loss": 0.0661,
"step": 94
},
{
"epoch": 0.03216863135171742,
"grad_norm": 0.8671875,
"learning_rate": 1.2702702702702702e-05,
"loss": 0.0871,
"step": 95
},
{
"epoch": 0.032507248523840765,
"grad_norm": 1.5234375,
"learning_rate": 1.283783783783784e-05,
"loss": 0.1231,
"step": 96
},
{
"epoch": 0.03284586569596411,
"grad_norm": 0.91015625,
"learning_rate": 1.2972972972972975e-05,
"loss": 0.0777,
"step": 97
},
{
"epoch": 0.03318448286808745,
"grad_norm": 1.09375,
"learning_rate": 1.3108108108108109e-05,
"loss": 0.1009,
"step": 98
},
{
"epoch": 0.033523100040210786,
"grad_norm": 0.78515625,
"learning_rate": 1.3243243243243244e-05,
"loss": 0.0944,
"step": 99
},
{
"epoch": 0.03386171721233413,
"grad_norm": 0.734375,
"learning_rate": 1.3378378378378381e-05,
"loss": 0.0649,
"step": 100
},
{
"epoch": 0.03420033438445747,
"grad_norm": 0.71875,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.0813,
"step": 101
},
{
"epoch": 0.034538951556580814,
"grad_norm": 0.76171875,
"learning_rate": 1.364864864864865e-05,
"loss": 0.0726,
"step": 102
},
{
"epoch": 0.03487756872870416,
"grad_norm": 0.75,
"learning_rate": 1.3783783783783784e-05,
"loss": 0.0783,
"step": 103
},
{
"epoch": 0.03521618590082749,
"grad_norm": 0.7109375,
"learning_rate": 1.391891891891892e-05,
"loss": 0.0656,
"step": 104
},
{
"epoch": 0.035554803072950836,
"grad_norm": 0.73046875,
"learning_rate": 1.4054054054054055e-05,
"loss": 0.0719,
"step": 105
},
{
"epoch": 0.03589342024507418,
"grad_norm": 0.79296875,
"learning_rate": 1.4189189189189189e-05,
"loss": 0.0821,
"step": 106
},
{
"epoch": 0.03623203741719752,
"grad_norm": 0.609375,
"learning_rate": 1.4324324324324326e-05,
"loss": 0.075,
"step": 107
},
{
"epoch": 0.036570654589320864,
"grad_norm": 0.7109375,
"learning_rate": 1.4459459459459462e-05,
"loss": 0.0753,
"step": 108
},
{
"epoch": 0.0369092717614442,
"grad_norm": 0.796875,
"learning_rate": 1.4594594594594596e-05,
"loss": 0.0976,
"step": 109
},
{
"epoch": 0.03724788893356754,
"grad_norm": 0.6953125,
"learning_rate": 1.4729729729729731e-05,
"loss": 0.0781,
"step": 110
},
{
"epoch": 0.037586506105690885,
"grad_norm": 0.74609375,
"learning_rate": 1.4864864864864865e-05,
"loss": 0.0808,
"step": 111
},
{
"epoch": 0.03792512327781423,
"grad_norm": 0.90625,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0816,
"step": 112
},
{
"epoch": 0.03826374044993757,
"grad_norm": 0.828125,
"learning_rate": 1.5135135135135138e-05,
"loss": 0.082,
"step": 113
},
{
"epoch": 0.03860235762206091,
"grad_norm": 0.703125,
"learning_rate": 1.527027027027027e-05,
"loss": 0.0811,
"step": 114
},
{
"epoch": 0.03894097479418425,
"grad_norm": 0.73046875,
"learning_rate": 1.540540540540541e-05,
"loss": 0.0754,
"step": 115
},
{
"epoch": 0.03927959196630759,
"grad_norm": 0.76171875,
"learning_rate": 1.554054054054054e-05,
"loss": 0.0891,
"step": 116
},
{
"epoch": 0.039618209138430935,
"grad_norm": 0.77734375,
"learning_rate": 1.5675675675675676e-05,
"loss": 0.1031,
"step": 117
},
{
"epoch": 0.03995682631055427,
"grad_norm": 0.71875,
"learning_rate": 1.581081081081081e-05,
"loss": 0.0718,
"step": 118
},
{
"epoch": 0.04029544348267761,
"grad_norm": 0.64453125,
"learning_rate": 1.5945945945945947e-05,
"loss": 0.0749,
"step": 119
},
{
"epoch": 0.040634060654800956,
"grad_norm": 0.73046875,
"learning_rate": 1.6081081081081083e-05,
"loss": 0.0815,
"step": 120
},
{
"epoch": 0.0409726778269243,
"grad_norm": 0.74609375,
"learning_rate": 1.6216216216216218e-05,
"loss": 0.0832,
"step": 121
},
{
"epoch": 0.04131129499904764,
"grad_norm": 0.6875,
"learning_rate": 1.6351351351351354e-05,
"loss": 0.0703,
"step": 122
},
{
"epoch": 0.04164991217117098,
"grad_norm": 0.81640625,
"learning_rate": 1.648648648648649e-05,
"loss": 0.0965,
"step": 123
},
{
"epoch": 0.04198852934329432,
"grad_norm": 0.8671875,
"learning_rate": 1.662162162162162e-05,
"loss": 0.0811,
"step": 124
},
{
"epoch": 0.04232714651541766,
"grad_norm": 0.578125,
"learning_rate": 1.6756756756756757e-05,
"loss": 0.0647,
"step": 125
},
{
"epoch": 0.042665763687541006,
"grad_norm": 0.64453125,
"learning_rate": 1.6891891891891896e-05,
"loss": 0.0717,
"step": 126
},
{
"epoch": 0.04300438085966435,
"grad_norm": 1.125,
"learning_rate": 1.7027027027027028e-05,
"loss": 0.1835,
"step": 127
},
{
"epoch": 0.043342998031787684,
"grad_norm": 0.66015625,
"learning_rate": 1.7162162162162163e-05,
"loss": 0.0627,
"step": 128
},
{
"epoch": 0.04368161520391103,
"grad_norm": 0.59375,
"learning_rate": 1.72972972972973e-05,
"loss": 0.0673,
"step": 129
},
{
"epoch": 0.04402023237603437,
"grad_norm": 0.7421875,
"learning_rate": 1.7432432432432434e-05,
"loss": 0.0917,
"step": 130
},
{
"epoch": 0.04435884954815771,
"grad_norm": 0.828125,
"learning_rate": 1.756756756756757e-05,
"loss": 0.094,
"step": 131
},
{
"epoch": 0.044697466720281055,
"grad_norm": 0.86328125,
"learning_rate": 1.7702702702702702e-05,
"loss": 0.0912,
"step": 132
},
{
"epoch": 0.04503608389240439,
"grad_norm": 0.9140625,
"learning_rate": 1.783783783783784e-05,
"loss": 0.1102,
"step": 133
},
{
"epoch": 0.045374701064527734,
"grad_norm": 0.6953125,
"learning_rate": 1.7972972972972976e-05,
"loss": 0.0581,
"step": 134
},
{
"epoch": 0.045713318236651077,
"grad_norm": 0.66015625,
"learning_rate": 1.8108108108108108e-05,
"loss": 0.0591,
"step": 135
},
{
"epoch": 0.04605193540877442,
"grad_norm": 0.765625,
"learning_rate": 1.8243243243243244e-05,
"loss": 0.0896,
"step": 136
},
{
"epoch": 0.04639055258089776,
"grad_norm": 0.83203125,
"learning_rate": 1.8378378378378383e-05,
"loss": 0.0733,
"step": 137
},
{
"epoch": 0.0467291697530211,
"grad_norm": 1.234375,
"learning_rate": 1.8513513513513515e-05,
"loss": 0.0883,
"step": 138
},
{
"epoch": 0.04706778692514444,
"grad_norm": 0.84765625,
"learning_rate": 1.864864864864865e-05,
"loss": 0.0886,
"step": 139
},
{
"epoch": 0.04740640409726778,
"grad_norm": 0.85546875,
"learning_rate": 1.8783783783783786e-05,
"loss": 0.1031,
"step": 140
},
{
"epoch": 0.047745021269391126,
"grad_norm": 0.86328125,
"learning_rate": 1.891891891891892e-05,
"loss": 0.0936,
"step": 141
},
{
"epoch": 0.04808363844151446,
"grad_norm": 0.640625,
"learning_rate": 1.9054054054054057e-05,
"loss": 0.0706,
"step": 142
},
{
"epoch": 0.048422255613637805,
"grad_norm": 0.84375,
"learning_rate": 1.918918918918919e-05,
"loss": 0.1006,
"step": 143
},
{
"epoch": 0.04876087278576115,
"grad_norm": 0.62109375,
"learning_rate": 1.9324324324324328e-05,
"loss": 0.0715,
"step": 144
},
{
"epoch": 0.04909948995788449,
"grad_norm": 0.7421875,
"learning_rate": 1.9459459459459463e-05,
"loss": 0.0856,
"step": 145
},
{
"epoch": 0.04943810713000783,
"grad_norm": 0.83984375,
"learning_rate": 1.9594594594594595e-05,
"loss": 0.0923,
"step": 146
},
{
"epoch": 0.04977672430213117,
"grad_norm": 0.75,
"learning_rate": 1.972972972972973e-05,
"loss": 0.0825,
"step": 147
},
{
"epoch": 0.05011534147425451,
"grad_norm": 0.62890625,
"learning_rate": 1.9864864864864866e-05,
"loss": 0.0588,
"step": 148
},
{
"epoch": 0.050453958646377854,
"grad_norm": 0.81640625,
"learning_rate": 2e-05,
"loss": 0.1127,
"step": 149
},
{
"epoch": 0.0507925758185012,
"grad_norm": 0.5703125,
"learning_rate": 1.9999993732499594e-05,
"loss": 0.0628,
"step": 150
},
{
"epoch": 0.05113119299062454,
"grad_norm": 0.64453125,
"learning_rate": 1.9999974930006222e-05,
"loss": 0.0637,
"step": 151
},
{
"epoch": 0.051469810162747875,
"grad_norm": 0.6875,
"learning_rate": 1.999994359254346e-05,
"loss": 0.0706,
"step": 152
},
{
"epoch": 0.05180842733487122,
"grad_norm": 0.71875,
"learning_rate": 1.999989972015058e-05,
"loss": 0.0876,
"step": 153
},
{
"epoch": 0.05214704450699456,
"grad_norm": 0.59765625,
"learning_rate": 1.9999843312882592e-05,
"loss": 0.0708,
"step": 154
},
{
"epoch": 0.052485661679117904,
"grad_norm": 0.59765625,
"learning_rate": 1.9999774370810187e-05,
"loss": 0.0571,
"step": 155
},
{
"epoch": 0.052824278851241246,
"grad_norm": 0.8203125,
"learning_rate": 1.9999692894019792e-05,
"loss": 0.0943,
"step": 156
},
{
"epoch": 0.05316289602336458,
"grad_norm": 0.5546875,
"learning_rate": 1.9999598882613537e-05,
"loss": 0.0712,
"step": 157
},
{
"epoch": 0.053501513195487925,
"grad_norm": 0.69921875,
"learning_rate": 1.9999492336709263e-05,
"loss": 0.0836,
"step": 158
},
{
"epoch": 0.05384013036761127,
"grad_norm": 0.6328125,
"learning_rate": 1.999937325644053e-05,
"loss": 0.0626,
"step": 159
},
{
"epoch": 0.05417874753973461,
"grad_norm": 1.890625,
"learning_rate": 1.99992416419566e-05,
"loss": 0.0855,
"step": 160
},
{
"epoch": 0.05451736471185795,
"grad_norm": 0.75,
"learning_rate": 1.9999097493422453e-05,
"loss": 0.082,
"step": 161
},
{
"epoch": 0.05485598188398129,
"grad_norm": 0.6640625,
"learning_rate": 1.9998940811018782e-05,
"loss": 0.0769,
"step": 162
},
{
"epoch": 0.05519459905610463,
"grad_norm": 0.55078125,
"learning_rate": 1.9998771594941983e-05,
"loss": 0.0691,
"step": 163
},
{
"epoch": 0.055533216228227975,
"grad_norm": 0.68359375,
"learning_rate": 1.9998589845404176e-05,
"loss": 0.0687,
"step": 164
},
{
"epoch": 0.05587183340035132,
"grad_norm": 0.703125,
"learning_rate": 1.9998395562633176e-05,
"loss": 0.0738,
"step": 165
},
{
"epoch": 0.05621045057247466,
"grad_norm": 1.5234375,
"learning_rate": 1.9998188746872523e-05,
"loss": 0.0891,
"step": 166
},
{
"epoch": 0.056549067744597996,
"grad_norm": 0.62890625,
"learning_rate": 1.9997969398381454e-05,
"loss": 0.0911,
"step": 167
},
{
"epoch": 0.05688768491672134,
"grad_norm": 0.76171875,
"learning_rate": 1.9997737517434932e-05,
"loss": 0.0826,
"step": 168
},
{
"epoch": 0.05722630208884468,
"grad_norm": 0.81640625,
"learning_rate": 1.9997493104323607e-05,
"loss": 0.0955,
"step": 169
},
{
"epoch": 0.057564919260968024,
"grad_norm": 0.640625,
"learning_rate": 1.9997236159353864e-05,
"loss": 0.0804,
"step": 170
},
{
"epoch": 0.05790353643309136,
"grad_norm": 0.67578125,
"learning_rate": 1.9996966682847776e-05,
"loss": 0.0777,
"step": 171
},
{
"epoch": 0.0582421536052147,
"grad_norm": 0.703125,
"learning_rate": 1.9996684675143132e-05,
"loss": 0.0823,
"step": 172
},
{
"epoch": 0.058580770777338045,
"grad_norm": 0.640625,
"learning_rate": 1.999639013659343e-05,
"loss": 0.09,
"step": 173
},
{
"epoch": 0.05891938794946139,
"grad_norm": 0.984375,
"learning_rate": 1.9996083067567876e-05,
"loss": 0.108,
"step": 174
},
{
"epoch": 0.05925800512158473,
"grad_norm": 0.78515625,
"learning_rate": 1.9995763468451376e-05,
"loss": 0.0663,
"step": 175
},
{
"epoch": 0.05959662229370807,
"grad_norm": 0.64453125,
"learning_rate": 1.9995431339644552e-05,
"loss": 0.0856,
"step": 176
},
{
"epoch": 0.05993523946583141,
"grad_norm": 0.76171875,
"learning_rate": 1.9995086681563725e-05,
"loss": 0.0879,
"step": 177
},
{
"epoch": 0.06027385663795475,
"grad_norm": 0.671875,
"learning_rate": 1.999472949464093e-05,
"loss": 0.0747,
"step": 178
},
{
"epoch": 0.060612473810078095,
"grad_norm": 0.8359375,
"learning_rate": 1.9994359779323892e-05,
"loss": 0.1313,
"step": 179
},
{
"epoch": 0.06095109098220144,
"grad_norm": 0.66796875,
"learning_rate": 1.9993977536076052e-05,
"loss": 0.0781,
"step": 180
},
{
"epoch": 0.061289708154324773,
"grad_norm": 0.68359375,
"learning_rate": 1.999358276537655e-05,
"loss": 0.0731,
"step": 181
},
{
"epoch": 0.061628325326448116,
"grad_norm": 0.67578125,
"learning_rate": 1.9993175467720242e-05,
"loss": 0.0756,
"step": 182
},
{
"epoch": 0.06196694249857146,
"grad_norm": 0.61328125,
"learning_rate": 1.9992755643617663e-05,
"loss": 0.0743,
"step": 183
},
{
"epoch": 0.0623055596706948,
"grad_norm": 0.62109375,
"learning_rate": 1.9992323293595065e-05,
"loss": 0.0743,
"step": 184
},
{
"epoch": 0.06264417684281814,
"grad_norm": 0.69921875,
"learning_rate": 1.9991878418194407e-05,
"loss": 0.0809,
"step": 185
},
{
"epoch": 0.06298279401494149,
"grad_norm": 0.66015625,
"learning_rate": 1.9991421017973328e-05,
"loss": 0.0751,
"step": 186
},
{
"epoch": 0.06332141118706483,
"grad_norm": 0.765625,
"learning_rate": 1.999095109350519e-05,
"loss": 0.0868,
"step": 187
},
{
"epoch": 0.06366002835918816,
"grad_norm": 0.58203125,
"learning_rate": 1.9990468645379038e-05,
"loss": 0.0625,
"step": 188
},
{
"epoch": 0.0639986455313115,
"grad_norm": 0.7421875,
"learning_rate": 1.998997367419962e-05,
"loss": 0.0829,
"step": 189
},
{
"epoch": 0.06433726270343484,
"grad_norm": 0.65234375,
"learning_rate": 1.9989466180587386e-05,
"loss": 0.0729,
"step": 190
},
{
"epoch": 0.06467587987555819,
"grad_norm": 0.84375,
"learning_rate": 1.998894616517848e-05,
"loss": 0.1061,
"step": 191
},
{
"epoch": 0.06501449704768153,
"grad_norm": 0.62109375,
"learning_rate": 1.998841362862473e-05,
"loss": 0.0675,
"step": 192
},
{
"epoch": 0.06535311421980487,
"grad_norm": 0.54296875,
"learning_rate": 1.998786857159369e-05,
"loss": 0.0581,
"step": 193
},
{
"epoch": 0.06569173139192822,
"grad_norm": 0.640625,
"learning_rate": 1.9987310994768573e-05,
"loss": 0.0743,
"step": 194
},
{
"epoch": 0.06603034856405156,
"grad_norm": 0.57421875,
"learning_rate": 1.9986740898848306e-05,
"loss": 0.0655,
"step": 195
},
{
"epoch": 0.0663689657361749,
"grad_norm": 0.73046875,
"learning_rate": 1.998615828454751e-05,
"loss": 0.0885,
"step": 196
},
{
"epoch": 0.06670758290829824,
"grad_norm": 0.62890625,
"learning_rate": 1.998556315259648e-05,
"loss": 0.0657,
"step": 197
},
{
"epoch": 0.06704620008042157,
"grad_norm": 0.65234375,
"learning_rate": 1.9984955503741227e-05,
"loss": 0.0742,
"step": 198
},
{
"epoch": 0.06738481725254492,
"grad_norm": 0.73046875,
"learning_rate": 1.998433533874343e-05,
"loss": 0.0862,
"step": 199
},
{
"epoch": 0.06772343442466826,
"grad_norm": 0.77734375,
"learning_rate": 1.9983702658380474e-05,
"loss": 0.1001,
"step": 200
},
{
"epoch": 0.0680620515967916,
"grad_norm": 0.58984375,
"learning_rate": 1.9983057463445415e-05,
"loss": 0.0688,
"step": 201
},
{
"epoch": 0.06840066876891494,
"grad_norm": 0.6953125,
"learning_rate": 1.998239975474701e-05,
"loss": 0.0882,
"step": 202
},
{
"epoch": 0.06873928594103829,
"grad_norm": 0.64453125,
"learning_rate": 1.9981729533109694e-05,
"loss": 0.064,
"step": 203
},
{
"epoch": 0.06907790311316163,
"grad_norm": 0.5703125,
"learning_rate": 1.9981046799373595e-05,
"loss": 0.0665,
"step": 204
},
{
"epoch": 0.06941652028528497,
"grad_norm": 0.66796875,
"learning_rate": 1.9980351554394514e-05,
"loss": 0.0861,
"step": 205
},
{
"epoch": 0.06975513745740831,
"grad_norm": 0.57421875,
"learning_rate": 1.9979643799043945e-05,
"loss": 0.0691,
"step": 206
},
{
"epoch": 0.07009375462953164,
"grad_norm": 0.85546875,
"learning_rate": 1.9978923534209052e-05,
"loss": 0.1439,
"step": 207
},
{
"epoch": 0.07043237180165499,
"grad_norm": 0.59765625,
"learning_rate": 1.9978190760792698e-05,
"loss": 0.0725,
"step": 208
},
{
"epoch": 0.07077098897377833,
"grad_norm": 0.55859375,
"learning_rate": 1.997744547971341e-05,
"loss": 0.0732,
"step": 209
},
{
"epoch": 0.07110960614590167,
"grad_norm": 0.60546875,
"learning_rate": 1.9976687691905394e-05,
"loss": 0.0718,
"step": 210
},
{
"epoch": 0.07144822331802501,
"grad_norm": 0.78125,
"learning_rate": 1.997591739831854e-05,
"loss": 0.0774,
"step": 211
},
{
"epoch": 0.07178684049014836,
"grad_norm": 0.57421875,
"learning_rate": 1.9975134599918414e-05,
"loss": 0.0717,
"step": 212
},
{
"epoch": 0.0721254576622717,
"grad_norm": 0.478515625,
"learning_rate": 1.9974339297686246e-05,
"loss": 0.0601,
"step": 213
},
{
"epoch": 0.07246407483439504,
"grad_norm": 0.6484375,
"learning_rate": 1.9973531492618956e-05,
"loss": 0.0813,
"step": 214
},
{
"epoch": 0.07280269200651839,
"grad_norm": 0.578125,
"learning_rate": 1.9972711185729124e-05,
"loss": 0.0679,
"step": 215
},
{
"epoch": 0.07314130917864173,
"grad_norm": 0.6484375,
"learning_rate": 1.9971878378045005e-05,
"loss": 0.0735,
"step": 216
},
{
"epoch": 0.07347992635076506,
"grad_norm": 0.640625,
"learning_rate": 1.997103307061052e-05,
"loss": 0.0756,
"step": 217
},
{
"epoch": 0.0738185435228884,
"grad_norm": 0.55078125,
"learning_rate": 1.9970175264485268e-05,
"loss": 0.0679,
"step": 218
},
{
"epoch": 0.07415716069501174,
"grad_norm": 0.5390625,
"learning_rate": 1.9969304960744508e-05,
"loss": 0.0684,
"step": 219
},
{
"epoch": 0.07449577786713509,
"grad_norm": 0.73828125,
"learning_rate": 1.996842216047916e-05,
"loss": 0.0952,
"step": 220
},
{
"epoch": 0.07483439503925843,
"grad_norm": 0.6328125,
"learning_rate": 1.996752686479582e-05,
"loss": 0.0888,
"step": 221
},
{
"epoch": 0.07517301221138177,
"grad_norm": 0.49609375,
"learning_rate": 1.996661907481674e-05,
"loss": 0.0633,
"step": 222
},
{
"epoch": 0.07551162938350511,
"grad_norm": 0.82421875,
"learning_rate": 1.9965698791679834e-05,
"loss": 0.0985,
"step": 223
},
{
"epoch": 0.07585024655562846,
"grad_norm": 0.69921875,
"learning_rate": 1.996476601653868e-05,
"loss": 0.0888,
"step": 224
},
{
"epoch": 0.0761888637277518,
"grad_norm": 0.76171875,
"learning_rate": 1.9963820750562506e-05,
"loss": 0.0905,
"step": 225
},
{
"epoch": 0.07652748089987514,
"grad_norm": 0.67578125,
"learning_rate": 1.9962862994936207e-05,
"loss": 0.0802,
"step": 226
},
{
"epoch": 0.07686609807199847,
"grad_norm": 0.625,
"learning_rate": 1.996189275086033e-05,
"loss": 0.0809,
"step": 227
},
{
"epoch": 0.07720471524412181,
"grad_norm": 0.60546875,
"learning_rate": 1.9960910019551073e-05,
"loss": 0.0823,
"step": 228
},
{
"epoch": 0.07754333241624516,
"grad_norm": 0.6171875,
"learning_rate": 1.9959914802240293e-05,
"loss": 0.0713,
"step": 229
},
{
"epoch": 0.0778819495883685,
"grad_norm": 0.59375,
"learning_rate": 1.9958907100175492e-05,
"loss": 0.0754,
"step": 230
},
{
"epoch": 0.07822056676049184,
"grad_norm": 0.796875,
"learning_rate": 1.9957886914619826e-05,
"loss": 0.0887,
"step": 231
},
{
"epoch": 0.07855918393261518,
"grad_norm": 0.5546875,
"learning_rate": 1.99568542468521e-05,
"loss": 0.0667,
"step": 232
},
{
"epoch": 0.07889780110473853,
"grad_norm": 0.5234375,
"learning_rate": 1.995580909816676e-05,
"loss": 0.0622,
"step": 233
},
{
"epoch": 0.07923641827686187,
"grad_norm": 0.62109375,
"learning_rate": 1.99547514698739e-05,
"loss": 0.0765,
"step": 234
},
{
"epoch": 0.07957503544898521,
"grad_norm": 0.68359375,
"learning_rate": 1.9953681363299258e-05,
"loss": 0.0936,
"step": 235
},
{
"epoch": 0.07991365262110854,
"grad_norm": 0.48046875,
"learning_rate": 1.9952598779784214e-05,
"loss": 0.0574,
"step": 236
},
{
"epoch": 0.08025226979323188,
"grad_norm": 0.59765625,
"learning_rate": 1.9951503720685784e-05,
"loss": 0.0716,
"step": 237
},
{
"epoch": 0.08059088696535523,
"grad_norm": 0.5859375,
"learning_rate": 1.9950396187376628e-05,
"loss": 0.0781,
"step": 238
},
{
"epoch": 0.08092950413747857,
"grad_norm": 0.58203125,
"learning_rate": 1.9949276181245037e-05,
"loss": 0.0779,
"step": 239
},
{
"epoch": 0.08126812130960191,
"grad_norm": 0.703125,
"learning_rate": 1.994814370369494e-05,
"loss": 0.1049,
"step": 240
},
{
"epoch": 0.08160673848172526,
"grad_norm": 0.734375,
"learning_rate": 1.9946998756145894e-05,
"loss": 0.0857,
"step": 241
},
{
"epoch": 0.0819453556538486,
"grad_norm": 0.59765625,
"learning_rate": 1.9945841340033093e-05,
"loss": 0.0722,
"step": 242
},
{
"epoch": 0.08228397282597194,
"grad_norm": 0.6640625,
"learning_rate": 1.994467145680736e-05,
"loss": 0.083,
"step": 243
},
{
"epoch": 0.08262258999809528,
"grad_norm": 0.8203125,
"learning_rate": 1.994348910793514e-05,
"loss": 0.0852,
"step": 244
},
{
"epoch": 0.08296120717021863,
"grad_norm": 0.55078125,
"learning_rate": 1.9942294294898513e-05,
"loss": 0.063,
"step": 245
},
{
"epoch": 0.08329982434234195,
"grad_norm": 0.54296875,
"learning_rate": 1.994108701919517e-05,
"loss": 0.0701,
"step": 246
},
{
"epoch": 0.0836384415144653,
"grad_norm": 0.6953125,
"learning_rate": 1.993986728233844e-05,
"loss": 0.0869,
"step": 247
},
{
"epoch": 0.08397705868658864,
"grad_norm": 0.52734375,
"learning_rate": 1.9938635085857257e-05,
"loss": 0.0619,
"step": 248
},
{
"epoch": 0.08431567585871198,
"grad_norm": 0.578125,
"learning_rate": 1.993739043129618e-05,
"loss": 0.0643,
"step": 249
},
{
"epoch": 0.08465429303083533,
"grad_norm": 0.6484375,
"learning_rate": 1.9936133320215385e-05,
"loss": 0.0753,
"step": 250
},
{
"epoch": 0.08499291020295867,
"grad_norm": 0.6484375,
"learning_rate": 1.9934863754190662e-05,
"loss": 0.0535,
"step": 251
},
{
"epoch": 0.08533152737508201,
"grad_norm": 0.671875,
"learning_rate": 1.9933581734813404e-05,
"loss": 0.0759,
"step": 252
},
{
"epoch": 0.08567014454720535,
"grad_norm": 0.67578125,
"learning_rate": 1.9932287263690637e-05,
"loss": 0.0852,
"step": 253
},
{
"epoch": 0.0860087617193287,
"grad_norm": 0.64453125,
"learning_rate": 1.9930980342444966e-05,
"loss": 0.07,
"step": 254
},
{
"epoch": 0.08634737889145204,
"grad_norm": 0.81640625,
"learning_rate": 1.9929660972714626e-05,
"loss": 0.1072,
"step": 255
},
{
"epoch": 0.08668599606357537,
"grad_norm": 2.0,
"learning_rate": 1.9928329156153444e-05,
"loss": 0.0896,
"step": 256
},
{
"epoch": 0.08702461323569871,
"grad_norm": 0.63671875,
"learning_rate": 1.992698489443085e-05,
"loss": 0.0809,
"step": 257
},
{
"epoch": 0.08736323040782205,
"grad_norm": 0.91015625,
"learning_rate": 1.9925628189231885e-05,
"loss": 0.1163,
"step": 258
},
{
"epoch": 0.0877018475799454,
"grad_norm": 0.64453125,
"learning_rate": 1.992425904225717e-05,
"loss": 0.0763,
"step": 259
},
{
"epoch": 0.08804046475206874,
"grad_norm": 0.8046875,
"learning_rate": 1.9922877455222932e-05,
"loss": 0.069,
"step": 260
},
{
"epoch": 0.08837908192419208,
"grad_norm": 0.55078125,
"learning_rate": 1.992148342986099e-05,
"loss": 0.0689,
"step": 261
},
{
"epoch": 0.08871769909631542,
"grad_norm": 0.625,
"learning_rate": 1.9920076967918762e-05,
"loss": 0.0634,
"step": 262
},
{
"epoch": 0.08905631626843877,
"grad_norm": 0.72265625,
"learning_rate": 1.9918658071159243e-05,
"loss": 0.0745,
"step": 263
},
{
"epoch": 0.08939493344056211,
"grad_norm": 0.68359375,
"learning_rate": 1.9917226741361014e-05,
"loss": 0.0769,
"step": 264
},
{
"epoch": 0.08973355061268544,
"grad_norm": 0.671875,
"learning_rate": 1.991578298031826e-05,
"loss": 0.0794,
"step": 265
},
{
"epoch": 0.09007216778480878,
"grad_norm": 0.6640625,
"learning_rate": 1.9914326789840728e-05,
"loss": 0.0771,
"step": 266
},
{
"epoch": 0.09041078495693212,
"grad_norm": 0.90234375,
"learning_rate": 1.991285817175375e-05,
"loss": 0.0841,
"step": 267
},
{
"epoch": 0.09074940212905547,
"grad_norm": 0.64453125,
"learning_rate": 1.991137712789825e-05,
"loss": 0.0692,
"step": 268
},
{
"epoch": 0.09108801930117881,
"grad_norm": 0.81640625,
"learning_rate": 1.9909883660130703e-05,
"loss": 0.0986,
"step": 269
},
{
"epoch": 0.09142663647330215,
"grad_norm": 0.8203125,
"learning_rate": 1.9908377770323178e-05,
"loss": 0.0774,
"step": 270
},
{
"epoch": 0.0917652536454255,
"grad_norm": 0.5859375,
"learning_rate": 1.9906859460363307e-05,
"loss": 0.0663,
"step": 271
},
{
"epoch": 0.09210387081754884,
"grad_norm": 0.69921875,
"learning_rate": 1.9905328732154294e-05,
"loss": 0.0812,
"step": 272
},
{
"epoch": 0.09244248798967218,
"grad_norm": 0.69140625,
"learning_rate": 1.9903785587614907e-05,
"loss": 0.0833,
"step": 273
},
{
"epoch": 0.09278110516179552,
"grad_norm": 0.63671875,
"learning_rate": 1.990223002867947e-05,
"loss": 0.0854,
"step": 274
},
{
"epoch": 0.09311972233391885,
"grad_norm": 1.375,
"learning_rate": 1.9900662057297886e-05,
"loss": 0.0741,
"step": 275
},
{
"epoch": 0.0934583395060422,
"grad_norm": 0.69921875,
"learning_rate": 1.9899081675435604e-05,
"loss": 0.0868,
"step": 276
},
{
"epoch": 0.09379695667816554,
"grad_norm": 0.56640625,
"learning_rate": 1.989748888507363e-05,
"loss": 0.0632,
"step": 277
},
{
"epoch": 0.09413557385028888,
"grad_norm": 0.6015625,
"learning_rate": 1.9895883688208527e-05,
"loss": 0.0696,
"step": 278
},
{
"epoch": 0.09447419102241222,
"grad_norm": 0.5546875,
"learning_rate": 1.9894266086852414e-05,
"loss": 0.0657,
"step": 279
},
{
"epoch": 0.09481280819453557,
"grad_norm": 0.6484375,
"learning_rate": 1.989263608303295e-05,
"loss": 0.0742,
"step": 280
},
{
"epoch": 0.09515142536665891,
"grad_norm": 0.734375,
"learning_rate": 1.989099367879335e-05,
"loss": 0.0787,
"step": 281
},
{
"epoch": 0.09549004253878225,
"grad_norm": 0.5625,
"learning_rate": 1.9889338876192365e-05,
"loss": 0.0675,
"step": 282
},
{
"epoch": 0.0958286597109056,
"grad_norm": 0.5859375,
"learning_rate": 1.9887671677304285e-05,
"loss": 0.073,
"step": 283
},
{
"epoch": 0.09616727688302892,
"grad_norm": 0.6171875,
"learning_rate": 1.9885992084218948e-05,
"loss": 0.0574,
"step": 284
},
{
"epoch": 0.09650589405515227,
"grad_norm": 0.8671875,
"learning_rate": 1.9884300099041728e-05,
"loss": 0.1312,
"step": 285
},
{
"epoch": 0.09684451122727561,
"grad_norm": 0.69140625,
"learning_rate": 1.9882595723893525e-05,
"loss": 0.0594,
"step": 286
},
{
"epoch": 0.09718312839939895,
"grad_norm": 0.51171875,
"learning_rate": 1.9880878960910772e-05,
"loss": 0.0683,
"step": 287
},
{
"epoch": 0.0975217455715223,
"grad_norm": 0.6484375,
"learning_rate": 1.9879149812245434e-05,
"loss": 0.0887,
"step": 288
},
{
"epoch": 0.09786036274364564,
"grad_norm": 0.57421875,
"learning_rate": 1.9877408280065e-05,
"loss": 0.0788,
"step": 289
},
{
"epoch": 0.09819897991576898,
"grad_norm": 0.578125,
"learning_rate": 1.9875654366552476e-05,
"loss": 0.0716,
"step": 290
},
{
"epoch": 0.09853759708789232,
"grad_norm": 0.67578125,
"learning_rate": 1.9873888073906396e-05,
"loss": 0.0902,
"step": 291
},
{
"epoch": 0.09887621426001567,
"grad_norm": 3.21875,
"learning_rate": 1.987210940434081e-05,
"loss": 0.0776,
"step": 292
},
{
"epoch": 0.09921483143213901,
"grad_norm": 0.6171875,
"learning_rate": 1.9870318360085277e-05,
"loss": 0.0828,
"step": 293
},
{
"epoch": 0.09955344860426234,
"grad_norm": 0.5703125,
"learning_rate": 1.9868514943384872e-05,
"loss": 0.0727,
"step": 294
},
{
"epoch": 0.09989206577638568,
"grad_norm": 0.6484375,
"learning_rate": 1.9866699156500177e-05,
"loss": 0.0831,
"step": 295
},
{
"epoch": 0.10023068294850902,
"grad_norm": 0.73828125,
"learning_rate": 1.986487100170728e-05,
"loss": 0.0895,
"step": 296
},
{
"epoch": 0.10023068294850902,
"eval_loss": 0.0773262232542038,
"eval_runtime": 833.2157,
"eval_samples_per_second": 11.939,
"eval_steps_per_second": 2.985,
"step": 296
},
{
"epoch": 0.10056930012063237,
"grad_norm": 0.60546875,
"learning_rate": 1.986303048129778e-05,
"loss": 0.0779,
"step": 297
},
{
"epoch": 0.10090791729275571,
"grad_norm": 0.8515625,
"learning_rate": 1.9861177597578765e-05,
"loss": 0.0699,
"step": 298
},
{
"epoch": 0.10124653446487905,
"grad_norm": 0.60546875,
"learning_rate": 1.9859312352872822e-05,
"loss": 0.0706,
"step": 299
},
{
"epoch": 0.1015851516370024,
"grad_norm": 0.76953125,
"learning_rate": 1.985743474951804e-05,
"loss": 0.0926,
"step": 300
},
{
"epoch": 0.10192376880912574,
"grad_norm": 0.58984375,
"learning_rate": 1.985554478986799e-05,
"loss": 0.0685,
"step": 301
},
{
"epoch": 0.10226238598124908,
"grad_norm": 0.58984375,
"learning_rate": 1.9853642476291743e-05,
"loss": 0.0623,
"step": 302
},
{
"epoch": 0.10260100315337242,
"grad_norm": 0.5625,
"learning_rate": 1.9851727811173844e-05,
"loss": 0.0708,
"step": 303
},
{
"epoch": 0.10293962032549575,
"grad_norm": 0.71484375,
"learning_rate": 1.984980079691433e-05,
"loss": 0.0816,
"step": 304
},
{
"epoch": 0.1032782374976191,
"grad_norm": 0.609375,
"learning_rate": 1.9847861435928708e-05,
"loss": 0.0685,
"step": 305
},
{
"epoch": 0.10361685466974244,
"grad_norm": 0.66015625,
"learning_rate": 1.984590973064797e-05,
"loss": 0.0951,
"step": 306
},
{
"epoch": 0.10395547184186578,
"grad_norm": 0.62890625,
"learning_rate": 1.984394568351858e-05,
"loss": 0.0931,
"step": 307
},
{
"epoch": 0.10429408901398912,
"grad_norm": 0.6015625,
"learning_rate": 1.9841969297002473e-05,
"loss": 0.0701,
"step": 308
},
{
"epoch": 0.10463270618611246,
"grad_norm": 0.6484375,
"learning_rate": 1.9839980573577046e-05,
"loss": 0.0865,
"step": 309
},
{
"epoch": 0.10497132335823581,
"grad_norm": 0.55078125,
"learning_rate": 1.9837979515735168e-05,
"loss": 0.0716,
"step": 310
},
{
"epoch": 0.10530994053035915,
"grad_norm": 0.80859375,
"learning_rate": 1.9835966125985155e-05,
"loss": 0.0832,
"step": 311
},
{
"epoch": 0.10564855770248249,
"grad_norm": 0.62890625,
"learning_rate": 1.9833940406850805e-05,
"loss": 0.0777,
"step": 312
},
{
"epoch": 0.10598717487460582,
"grad_norm": 0.6015625,
"learning_rate": 1.9831902360871344e-05,
"loss": 0.0747,
"step": 313
},
{
"epoch": 0.10632579204672916,
"grad_norm": 0.59375,
"learning_rate": 1.9829851990601475e-05,
"loss": 0.0761,
"step": 314
},
{
"epoch": 0.10666440921885251,
"grad_norm": 0.484375,
"learning_rate": 1.982778929861133e-05,
"loss": 0.0632,
"step": 315
},
{
"epoch": 0.10700302639097585,
"grad_norm": 0.6796875,
"learning_rate": 1.9825714287486493e-05,
"loss": 0.0886,
"step": 316
},
{
"epoch": 0.10734164356309919,
"grad_norm": 0.4921875,
"learning_rate": 1.9823626959827997e-05,
"loss": 0.0639,
"step": 317
},
{
"epoch": 0.10768026073522254,
"grad_norm": 0.671875,
"learning_rate": 1.98215273182523e-05,
"loss": 0.0787,
"step": 318
},
{
"epoch": 0.10801887790734588,
"grad_norm": 0.73046875,
"learning_rate": 1.9819415365391307e-05,
"loss": 0.1027,
"step": 319
},
{
"epoch": 0.10835749507946922,
"grad_norm": 0.6328125,
"learning_rate": 1.9817291103892348e-05,
"loss": 0.0793,
"step": 320
},
{
"epoch": 0.10869611225159256,
"grad_norm": 0.6796875,
"learning_rate": 1.981515453641819e-05,
"loss": 0.0799,
"step": 321
},
{
"epoch": 0.1090347294237159,
"grad_norm": 0.81640625,
"learning_rate": 1.9813005665647017e-05,
"loss": 0.1096,
"step": 322
},
{
"epoch": 0.10937334659583924,
"grad_norm": 0.48828125,
"learning_rate": 1.981084449427244e-05,
"loss": 0.0687,
"step": 323
},
{
"epoch": 0.10971196376796258,
"grad_norm": 0.65234375,
"learning_rate": 1.9808671025003487e-05,
"loss": 0.0751,
"step": 324
},
{
"epoch": 0.11005058094008592,
"grad_norm": 0.64453125,
"learning_rate": 1.9806485260564597e-05,
"loss": 0.0686,
"step": 325
},
{
"epoch": 0.11038919811220926,
"grad_norm": 0.57421875,
"learning_rate": 1.9804287203695636e-05,
"loss": 0.0608,
"step": 326
},
{
"epoch": 0.1107278152843326,
"grad_norm": 0.67578125,
"learning_rate": 1.9802076857151863e-05,
"loss": 0.1027,
"step": 327
},
{
"epoch": 0.11106643245645595,
"grad_norm": 0.640625,
"learning_rate": 1.9799854223703943e-05,
"loss": 0.0796,
"step": 328
},
{
"epoch": 0.11140504962857929,
"grad_norm": 0.71484375,
"learning_rate": 1.9797619306137958e-05,
"loss": 0.0817,
"step": 329
},
{
"epoch": 0.11174366680070263,
"grad_norm": 0.56640625,
"learning_rate": 1.9795372107255368e-05,
"loss": 0.0582,
"step": 330
},
{
"epoch": 0.11208228397282598,
"grad_norm": 0.703125,
"learning_rate": 1.979311262987304e-05,
"loss": 0.0996,
"step": 331
},
{
"epoch": 0.11242090114494932,
"grad_norm": 0.494140625,
"learning_rate": 1.979084087682323e-05,
"loss": 0.0598,
"step": 332
},
{
"epoch": 0.11275951831707265,
"grad_norm": 0.5078125,
"learning_rate": 1.978855685095358e-05,
"loss": 0.0623,
"step": 333
},
{
"epoch": 0.11309813548919599,
"grad_norm": 0.490234375,
"learning_rate": 1.9786260555127116e-05,
"loss": 0.0582,
"step": 334
},
{
"epoch": 0.11343675266131933,
"grad_norm": 0.71875,
"learning_rate": 1.9783951992222246e-05,
"loss": 0.091,
"step": 335
},
{
"epoch": 0.11377536983344268,
"grad_norm": 0.5859375,
"learning_rate": 1.9781631165132755e-05,
"loss": 0.0793,
"step": 336
},
{
"epoch": 0.11411398700556602,
"grad_norm": 0.51953125,
"learning_rate": 1.9779298076767795e-05,
"loss": 0.0565,
"step": 337
},
{
"epoch": 0.11445260417768936,
"grad_norm": 0.59765625,
"learning_rate": 1.9776952730051896e-05,
"loss": 0.0736,
"step": 338
},
{
"epoch": 0.1147912213498127,
"grad_norm": 0.67578125,
"learning_rate": 1.9774595127924955e-05,
"loss": 0.0834,
"step": 339
},
{
"epoch": 0.11512983852193605,
"grad_norm": 0.5390625,
"learning_rate": 1.9772225273342216e-05,
"loss": 0.0604,
"step": 340
},
{
"epoch": 0.11546845569405939,
"grad_norm": 0.65234375,
"learning_rate": 1.97698431692743e-05,
"loss": 0.0674,
"step": 341
},
{
"epoch": 0.11580707286618272,
"grad_norm": 0.68359375,
"learning_rate": 1.976744881870717e-05,
"loss": 0.0729,
"step": 342
},
{
"epoch": 0.11614569003830606,
"grad_norm": 0.64453125,
"learning_rate": 1.9765042224642146e-05,
"loss": 0.0758,
"step": 343
},
{
"epoch": 0.1164843072104294,
"grad_norm": 0.5859375,
"learning_rate": 1.9762623390095897e-05,
"loss": 0.0778,
"step": 344
},
{
"epoch": 0.11682292438255275,
"grad_norm": 0.6875,
"learning_rate": 1.976019231810043e-05,
"loss": 0.079,
"step": 345
},
{
"epoch": 0.11716154155467609,
"grad_norm": 0.5703125,
"learning_rate": 1.9757749011703095e-05,
"loss": 0.0729,
"step": 346
},
{
"epoch": 0.11750015872679943,
"grad_norm": 0.625,
"learning_rate": 1.9755293473966574e-05,
"loss": 0.069,
"step": 347
},
{
"epoch": 0.11783877589892278,
"grad_norm": 0.60546875,
"learning_rate": 1.9752825707968884e-05,
"loss": 0.0707,
"step": 348
},
{
"epoch": 0.11817739307104612,
"grad_norm": 0.546875,
"learning_rate": 1.975034571680337e-05,
"loss": 0.0582,
"step": 349
},
{
"epoch": 0.11851601024316946,
"grad_norm": 0.65625,
"learning_rate": 1.9747853503578708e-05,
"loss": 0.073,
"step": 350
},
{
"epoch": 0.1188546274152928,
"grad_norm": 0.515625,
"learning_rate": 1.9745349071418877e-05,
"loss": 0.0576,
"step": 351
},
{
"epoch": 0.11919324458741613,
"grad_norm": 0.68359375,
"learning_rate": 1.974283242346319e-05,
"loss": 0.0855,
"step": 352
},
{
"epoch": 0.11953186175953948,
"grad_norm": 0.7890625,
"learning_rate": 1.974030356286626e-05,
"loss": 0.0982,
"step": 353
},
{
"epoch": 0.11987047893166282,
"grad_norm": 0.546875,
"learning_rate": 1.9737762492798018e-05,
"loss": 0.0632,
"step": 354
},
{
"epoch": 0.12020909610378616,
"grad_norm": 0.53125,
"learning_rate": 1.97352092164437e-05,
"loss": 0.0637,
"step": 355
},
{
"epoch": 0.1205477132759095,
"grad_norm": 0.6328125,
"learning_rate": 1.9732643737003827e-05,
"loss": 0.0851,
"step": 356
},
{
"epoch": 0.12088633044803285,
"grad_norm": 0.62109375,
"learning_rate": 1.9730066057694236e-05,
"loss": 0.0726,
"step": 357
},
{
"epoch": 0.12122494762015619,
"grad_norm": 0.64453125,
"learning_rate": 1.9727476181746045e-05,
"loss": 0.0977,
"step": 358
},
{
"epoch": 0.12156356479227953,
"grad_norm": 0.70703125,
"learning_rate": 1.9724874112405663e-05,
"loss": 0.0807,
"step": 359
},
{
"epoch": 0.12190218196440288,
"grad_norm": 0.51171875,
"learning_rate": 1.9722259852934785e-05,
"loss": 0.0616,
"step": 360
},
{
"epoch": 0.1222407991365262,
"grad_norm": 0.5546875,
"learning_rate": 1.971963340661039e-05,
"loss": 0.0783,
"step": 361
},
{
"epoch": 0.12257941630864955,
"grad_norm": 0.58984375,
"learning_rate": 1.971699477672472e-05,
"loss": 0.0684,
"step": 362
},
{
"epoch": 0.12291803348077289,
"grad_norm": 0.5390625,
"learning_rate": 1.9714343966585308e-05,
"loss": 0.0636,
"step": 363
},
{
"epoch": 0.12325665065289623,
"grad_norm": 0.51171875,
"learning_rate": 1.9711680979514936e-05,
"loss": 0.0655,
"step": 364
},
{
"epoch": 0.12359526782501958,
"grad_norm": 0.60546875,
"learning_rate": 1.970900581885166e-05,
"loss": 0.0897,
"step": 365
},
{
"epoch": 0.12393388499714292,
"grad_norm": 0.5234375,
"learning_rate": 1.97063184879488e-05,
"loss": 0.0594,
"step": 366
},
{
"epoch": 0.12427250216926626,
"grad_norm": 0.54296875,
"learning_rate": 1.9703618990174917e-05,
"loss": 0.0733,
"step": 367
},
{
"epoch": 0.1246111193413896,
"grad_norm": 0.671875,
"learning_rate": 1.970090732891384e-05,
"loss": 0.1027,
"step": 368
},
{
"epoch": 0.12494973651351295,
"grad_norm": 0.56640625,
"learning_rate": 1.9698183507564626e-05,
"loss": 0.0769,
"step": 369
},
{
"epoch": 0.1252883536856363,
"grad_norm": 0.515625,
"learning_rate": 1.96954475295416e-05,
"loss": 0.0639,
"step": 370
},
{
"epoch": 0.12562697085775962,
"grad_norm": 0.498046875,
"learning_rate": 1.9692699398274298e-05,
"loss": 0.0608,
"step": 371
},
{
"epoch": 0.12596558802988297,
"grad_norm": 0.53515625,
"learning_rate": 1.968993911720751e-05,
"loss": 0.0676,
"step": 372
},
{
"epoch": 0.1263042052020063,
"grad_norm": 0.5078125,
"learning_rate": 1.9687166689801244e-05,
"loss": 0.065,
"step": 373
},
{
"epoch": 0.12664282237412966,
"grad_norm": 0.76171875,
"learning_rate": 1.968438211953074e-05,
"loss": 0.1108,
"step": 374
},
{
"epoch": 0.126981439546253,
"grad_norm": 0.6171875,
"learning_rate": 1.9681585409886454e-05,
"loss": 0.0755,
"step": 375
},
{
"epoch": 0.12732005671837632,
"grad_norm": 0.5,
"learning_rate": 1.9678776564374068e-05,
"loss": 0.0649,
"step": 376
},
{
"epoch": 0.12765867389049967,
"grad_norm": 0.68359375,
"learning_rate": 1.967595558651447e-05,
"loss": 0.081,
"step": 377
},
{
"epoch": 0.127997291062623,
"grad_norm": 0.5703125,
"learning_rate": 1.9673122479843748e-05,
"loss": 0.0675,
"step": 378
},
{
"epoch": 0.12833590823474636,
"grad_norm": 0.640625,
"learning_rate": 1.9670277247913205e-05,
"loss": 0.0803,
"step": 379
},
{
"epoch": 0.1286745254068697,
"grad_norm": 0.58203125,
"learning_rate": 1.9667419894289345e-05,
"loss": 0.0778,
"step": 380
},
{
"epoch": 0.12901314257899305,
"grad_norm": 0.48828125,
"learning_rate": 1.9664550422553852e-05,
"loss": 0.0565,
"step": 381
},
{
"epoch": 0.12935175975111637,
"grad_norm": 0.66015625,
"learning_rate": 1.966166883630362e-05,
"loss": 0.0802,
"step": 382
},
{
"epoch": 0.12969037692323973,
"grad_norm": 0.486328125,
"learning_rate": 1.9658775139150705e-05,
"loss": 0.0626,
"step": 383
},
{
"epoch": 0.13002899409536306,
"grad_norm": 0.546875,
"learning_rate": 1.9655869334722363e-05,
"loss": 0.0667,
"step": 384
},
{
"epoch": 0.13036761126748642,
"grad_norm": 0.53515625,
"learning_rate": 1.9652951426661025e-05,
"loss": 0.0552,
"step": 385
},
{
"epoch": 0.13070622843960975,
"grad_norm": 0.66015625,
"learning_rate": 1.965002141862428e-05,
"loss": 0.06,
"step": 386
},
{
"epoch": 0.13104484561173307,
"grad_norm": 0.5625,
"learning_rate": 1.9647079314284897e-05,
"loss": 0.0681,
"step": 387
},
{
"epoch": 0.13138346278385643,
"grad_norm": 0.625,
"learning_rate": 1.9644125117330806e-05,
"loss": 0.0949,
"step": 388
},
{
"epoch": 0.13172207995597976,
"grad_norm": 0.578125,
"learning_rate": 1.964115883146509e-05,
"loss": 0.0591,
"step": 389
},
{
"epoch": 0.13206069712810312,
"grad_norm": 0.63671875,
"learning_rate": 1.9638180460405995e-05,
"loss": 0.0798,
"step": 390
},
{
"epoch": 0.13239931430022644,
"grad_norm": 0.6484375,
"learning_rate": 1.96351900078869e-05,
"loss": 0.0617,
"step": 391
},
{
"epoch": 0.1327379314723498,
"grad_norm": 0.53515625,
"learning_rate": 1.9632187477656342e-05,
"loss": 0.0765,
"step": 392
},
{
"epoch": 0.13307654864447313,
"grad_norm": 0.58984375,
"learning_rate": 1.9629172873477995e-05,
"loss": 0.0888,
"step": 393
},
{
"epoch": 0.1334151658165965,
"grad_norm": 0.56640625,
"learning_rate": 1.9626146199130664e-05,
"loss": 0.0678,
"step": 394
},
{
"epoch": 0.13375378298871982,
"grad_norm": 0.640625,
"learning_rate": 1.962310745840828e-05,
"loss": 0.0658,
"step": 395
},
{
"epoch": 0.13409240016084314,
"grad_norm": 0.4921875,
"learning_rate": 1.962005665511991e-05,
"loss": 0.0577,
"step": 396
},
{
"epoch": 0.1344310173329665,
"grad_norm": 0.59765625,
"learning_rate": 1.961699379308974e-05,
"loss": 0.0693,
"step": 397
},
{
"epoch": 0.13476963450508983,
"grad_norm": 0.58203125,
"learning_rate": 1.9613918876157062e-05,
"loss": 0.0795,
"step": 398
},
{
"epoch": 0.1351082516772132,
"grad_norm": 0.65234375,
"learning_rate": 1.9610831908176285e-05,
"loss": 0.0647,
"step": 399
},
{
"epoch": 0.13544686884933652,
"grad_norm": 0.546875,
"learning_rate": 1.9607732893016926e-05,
"loss": 0.0737,
"step": 400
},
{
"epoch": 0.13578548602145987,
"grad_norm": 0.59765625,
"learning_rate": 1.9604621834563602e-05,
"loss": 0.0687,
"step": 401
},
{
"epoch": 0.1361241031935832,
"grad_norm": 0.486328125,
"learning_rate": 1.960149873671602e-05,
"loss": 0.0583,
"step": 402
},
{
"epoch": 0.13646272036570656,
"grad_norm": 0.5234375,
"learning_rate": 1.9598363603388986e-05,
"loss": 0.0702,
"step": 403
},
{
"epoch": 0.1368013375378299,
"grad_norm": 0.64453125,
"learning_rate": 1.959521643851239e-05,
"loss": 0.0803,
"step": 404
},
{
"epoch": 0.13713995470995322,
"grad_norm": 0.8515625,
"learning_rate": 1.9592057246031203e-05,
"loss": 0.0896,
"step": 405
},
{
"epoch": 0.13747857188207657,
"grad_norm": 0.8046875,
"learning_rate": 1.9588886029905474e-05,
"loss": 0.0889,
"step": 406
},
{
"epoch": 0.1378171890541999,
"grad_norm": 0.53515625,
"learning_rate": 1.9585702794110322e-05,
"loss": 0.0684,
"step": 407
},
{
"epoch": 0.13815580622632326,
"grad_norm": 0.65625,
"learning_rate": 1.9582507542635933e-05,
"loss": 0.0822,
"step": 408
},
{
"epoch": 0.1384944233984466,
"grad_norm": 0.4765625,
"learning_rate": 1.9579300279487558e-05,
"loss": 0.0572,
"step": 409
},
{
"epoch": 0.13883304057056994,
"grad_norm": 0.609375,
"learning_rate": 1.9576081008685495e-05,
"loss": 0.0897,
"step": 410
},
{
"epoch": 0.13917165774269327,
"grad_norm": 0.52734375,
"learning_rate": 1.9572849734265107e-05,
"loss": 0.0655,
"step": 411
},
{
"epoch": 0.13951027491481663,
"grad_norm": 0.63671875,
"learning_rate": 1.956960646027679e-05,
"loss": 0.0831,
"step": 412
},
{
"epoch": 0.13984889208693996,
"grad_norm": 0.765625,
"learning_rate": 1.9566351190785998e-05,
"loss": 0.0684,
"step": 413
},
{
"epoch": 0.1401875092590633,
"grad_norm": 0.52734375,
"learning_rate": 1.9563083929873202e-05,
"loss": 0.0739,
"step": 414
},
{
"epoch": 0.14052612643118664,
"grad_norm": 0.46875,
"learning_rate": 1.9559804681633918e-05,
"loss": 0.0624,
"step": 415
},
{
"epoch": 0.14086474360330997,
"grad_norm": 0.58984375,
"learning_rate": 1.9556513450178683e-05,
"loss": 0.0775,
"step": 416
},
{
"epoch": 0.14120336077543333,
"grad_norm": 0.671875,
"learning_rate": 1.955321023963306e-05,
"loss": 0.1071,
"step": 417
},
{
"epoch": 0.14154197794755666,
"grad_norm": 0.61328125,
"learning_rate": 1.9549895054137616e-05,
"loss": 0.0705,
"step": 418
},
{
"epoch": 0.14188059511968001,
"grad_norm": 0.70703125,
"learning_rate": 1.954656789784794e-05,
"loss": 0.0901,
"step": 419
},
{
"epoch": 0.14221921229180334,
"grad_norm": 0.68359375,
"learning_rate": 1.9543228774934627e-05,
"loss": 0.0946,
"step": 420
},
{
"epoch": 0.1425578294639267,
"grad_norm": 0.484375,
"learning_rate": 1.953987768958326e-05,
"loss": 0.0663,
"step": 421
},
{
"epoch": 0.14289644663605003,
"grad_norm": 0.51953125,
"learning_rate": 1.953651464599443e-05,
"loss": 0.0736,
"step": 422
},
{
"epoch": 0.14323506380817339,
"grad_norm": 0.71484375,
"learning_rate": 1.9533139648383712e-05,
"loss": 0.0952,
"step": 423
},
{
"epoch": 0.14357368098029671,
"grad_norm": 0.5390625,
"learning_rate": 1.9529752700981664e-05,
"loss": 0.0701,
"step": 424
},
{
"epoch": 0.14391229815242004,
"grad_norm": 0.60546875,
"learning_rate": 1.9526353808033827e-05,
"loss": 0.0776,
"step": 425
},
{
"epoch": 0.1442509153245434,
"grad_norm": 0.46484375,
"learning_rate": 1.9522942973800712e-05,
"loss": 0.0644,
"step": 426
},
{
"epoch": 0.14458953249666673,
"grad_norm": 0.69140625,
"learning_rate": 1.95195202025578e-05,
"loss": 0.0784,
"step": 427
},
{
"epoch": 0.14492814966879008,
"grad_norm": 0.490234375,
"learning_rate": 1.9516085498595533e-05,
"loss": 0.0623,
"step": 428
},
{
"epoch": 0.1452667668409134,
"grad_norm": 0.447265625,
"learning_rate": 1.951263886621932e-05,
"loss": 0.0529,
"step": 429
},
{
"epoch": 0.14560538401303677,
"grad_norm": 0.61328125,
"learning_rate": 1.9509180309749505e-05,
"loss": 0.0828,
"step": 430
},
{
"epoch": 0.1459440011851601,
"grad_norm": 0.4921875,
"learning_rate": 1.9505709833521396e-05,
"loss": 0.0572,
"step": 431
},
{
"epoch": 0.14628261835728346,
"grad_norm": 0.6328125,
"learning_rate": 1.9502227441885232e-05,
"loss": 0.0668,
"step": 432
},
{
"epoch": 0.14662123552940678,
"grad_norm": 0.60546875,
"learning_rate": 1.9498733139206193e-05,
"loss": 0.0878,
"step": 433
},
{
"epoch": 0.1469598527015301,
"grad_norm": 0.5625,
"learning_rate": 1.9495226929864384e-05,
"loss": 0.0672,
"step": 434
},
{
"epoch": 0.14729846987365347,
"grad_norm": 0.62109375,
"learning_rate": 1.9491708818254847e-05,
"loss": 0.078,
"step": 435
},
{
"epoch": 0.1476370870457768,
"grad_norm": 0.66796875,
"learning_rate": 1.9488178808787527e-05,
"loss": 0.0633,
"step": 436
},
{
"epoch": 0.14797570421790016,
"grad_norm": 0.76953125,
"learning_rate": 1.94846369058873e-05,
"loss": 0.1021,
"step": 437
},
{
"epoch": 0.14831432139002348,
"grad_norm": 0.55859375,
"learning_rate": 1.9481083113993927e-05,
"loss": 0.0724,
"step": 438
},
{
"epoch": 0.14865293856214684,
"grad_norm": 0.6484375,
"learning_rate": 1.9477517437562097e-05,
"loss": 0.0714,
"step": 439
},
{
"epoch": 0.14899155573427017,
"grad_norm": 0.6484375,
"learning_rate": 1.9473939881061385e-05,
"loss": 0.0857,
"step": 440
},
{
"epoch": 0.14933017290639353,
"grad_norm": 0.6171875,
"learning_rate": 1.9470350448976257e-05,
"loss": 0.0953,
"step": 441
},
{
"epoch": 0.14966879007851686,
"grad_norm": 0.859375,
"learning_rate": 1.9466749145806065e-05,
"loss": 0.0685,
"step": 442
},
{
"epoch": 0.15000740725064018,
"grad_norm": 0.671875,
"learning_rate": 1.9463135976065043e-05,
"loss": 0.0905,
"step": 443
},
{
"epoch": 0.15034602442276354,
"grad_norm": 0.6015625,
"learning_rate": 1.9459510944282307e-05,
"loss": 0.0771,
"step": 444
},
{
"epoch": 0.15068464159488687,
"grad_norm": 0.57421875,
"learning_rate": 1.9455874055001824e-05,
"loss": 0.0682,
"step": 445
},
{
"epoch": 0.15102325876701023,
"grad_norm": 0.451171875,
"learning_rate": 1.945222531278244e-05,
"loss": 0.0599,
"step": 446
},
{
"epoch": 0.15136187593913356,
"grad_norm": 0.546875,
"learning_rate": 1.9448564722197855e-05,
"loss": 0.0668,
"step": 447
},
{
"epoch": 0.1517004931112569,
"grad_norm": 0.53515625,
"learning_rate": 1.9444892287836614e-05,
"loss": 0.0712,
"step": 448
},
{
"epoch": 0.15203911028338024,
"grad_norm": 0.5390625,
"learning_rate": 1.944120801430212e-05,
"loss": 0.0727,
"step": 449
},
{
"epoch": 0.1523777274555036,
"grad_norm": 0.546875,
"learning_rate": 1.9437511906212607e-05,
"loss": 0.0698,
"step": 450
},
{
"epoch": 0.15271634462762693,
"grad_norm": 0.69140625,
"learning_rate": 1.9433803968201148e-05,
"loss": 0.0945,
"step": 451
},
{
"epoch": 0.15305496179975028,
"grad_norm": 0.57421875,
"learning_rate": 1.9430084204915642e-05,
"loss": 0.069,
"step": 452
},
{
"epoch": 0.1533935789718736,
"grad_norm": 0.58984375,
"learning_rate": 1.9426352621018817e-05,
"loss": 0.071,
"step": 453
},
{
"epoch": 0.15373219614399694,
"grad_norm": 0.609375,
"learning_rate": 1.9422609221188208e-05,
"loss": 0.0809,
"step": 454
},
{
"epoch": 0.1540708133161203,
"grad_norm": 0.6484375,
"learning_rate": 1.9418854010116168e-05,
"loss": 0.085,
"step": 455
},
{
"epoch": 0.15440943048824363,
"grad_norm": 0.609375,
"learning_rate": 1.9415086992509858e-05,
"loss": 0.0916,
"step": 456
},
{
"epoch": 0.15474804766036698,
"grad_norm": 0.59765625,
"learning_rate": 1.941130817309123e-05,
"loss": 0.0544,
"step": 457
},
{
"epoch": 0.1550866648324903,
"grad_norm": 0.58984375,
"learning_rate": 1.940751755659704e-05,
"loss": 0.073,
"step": 458
},
{
"epoch": 0.15542528200461367,
"grad_norm": 0.61328125,
"learning_rate": 1.9403715147778822e-05,
"loss": 0.0854,
"step": 459
},
{
"epoch": 0.155763899176737,
"grad_norm": 0.765625,
"learning_rate": 1.9399900951402897e-05,
"loss": 0.0612,
"step": 460
},
{
"epoch": 0.15610251634886035,
"grad_norm": 0.58984375,
"learning_rate": 1.939607497225036e-05,
"loss": 0.079,
"step": 461
},
{
"epoch": 0.15644113352098368,
"grad_norm": 0.46484375,
"learning_rate": 1.9392237215117076e-05,
"loss": 0.0544,
"step": 462
},
{
"epoch": 0.156779750693107,
"grad_norm": 0.466796875,
"learning_rate": 1.9388387684813676e-05,
"loss": 0.0535,
"step": 463
},
{
"epoch": 0.15711836786523037,
"grad_norm": 0.546875,
"learning_rate": 1.9384526386165548e-05,
"loss": 0.081,
"step": 464
},
{
"epoch": 0.1574569850373537,
"grad_norm": 0.66015625,
"learning_rate": 1.938065332401282e-05,
"loss": 0.0916,
"step": 465
},
{
"epoch": 0.15779560220947705,
"grad_norm": 0.453125,
"learning_rate": 1.9376768503210388e-05,
"loss": 0.0584,
"step": 466
},
{
"epoch": 0.15813421938160038,
"grad_norm": 0.65234375,
"learning_rate": 1.937287192862787e-05,
"loss": 0.0942,
"step": 467
},
{
"epoch": 0.15847283655372374,
"grad_norm": 0.6015625,
"learning_rate": 1.9368963605149624e-05,
"loss": 0.0744,
"step": 468
},
{
"epoch": 0.15881145372584707,
"grad_norm": 0.416015625,
"learning_rate": 1.936504353767473e-05,
"loss": 0.0531,
"step": 469
},
{
"epoch": 0.15915007089797042,
"grad_norm": 0.5546875,
"learning_rate": 1.9361111731116993e-05,
"loss": 0.0792,
"step": 470
},
{
"epoch": 0.15948868807009375,
"grad_norm": 0.67578125,
"learning_rate": 1.9357168190404937e-05,
"loss": 0.0809,
"step": 471
},
{
"epoch": 0.15982730524221708,
"grad_norm": 0.51953125,
"learning_rate": 1.9353212920481792e-05,
"loss": 0.0707,
"step": 472
},
{
"epoch": 0.16016592241434044,
"grad_norm": 0.58984375,
"learning_rate": 1.934924592630548e-05,
"loss": 0.0847,
"step": 473
},
{
"epoch": 0.16050453958646377,
"grad_norm": 0.515625,
"learning_rate": 1.9345267212848638e-05,
"loss": 0.0683,
"step": 474
},
{
"epoch": 0.16084315675858712,
"grad_norm": 0.59765625,
"learning_rate": 1.9341276785098584e-05,
"loss": 0.081,
"step": 475
},
{
"epoch": 0.16118177393071045,
"grad_norm": 0.6640625,
"learning_rate": 1.9337274648057313e-05,
"loss": 0.0843,
"step": 476
},
{
"epoch": 0.1615203911028338,
"grad_norm": 0.50390625,
"learning_rate": 1.93332608067415e-05,
"loss": 0.0621,
"step": 477
},
{
"epoch": 0.16185900827495714,
"grad_norm": 0.5078125,
"learning_rate": 1.932923526618251e-05,
"loss": 0.0625,
"step": 478
},
{
"epoch": 0.1621976254470805,
"grad_norm": 0.63671875,
"learning_rate": 1.932519803142635e-05,
"loss": 0.0812,
"step": 479
},
{
"epoch": 0.16253624261920382,
"grad_norm": 0.470703125,
"learning_rate": 1.9321149107533693e-05,
"loss": 0.0565,
"step": 480
},
{
"epoch": 0.16287485979132718,
"grad_norm": 0.6875,
"learning_rate": 1.931708849957987e-05,
"loss": 0.0996,
"step": 481
},
{
"epoch": 0.1632134769634505,
"grad_norm": 0.455078125,
"learning_rate": 1.9313016212654845e-05,
"loss": 0.0621,
"step": 482
},
{
"epoch": 0.16355209413557384,
"grad_norm": 0.6015625,
"learning_rate": 1.9308932251863243e-05,
"loss": 0.0792,
"step": 483
},
{
"epoch": 0.1638907113076972,
"grad_norm": 0.57421875,
"learning_rate": 1.9304836622324295e-05,
"loss": 0.0705,
"step": 484
},
{
"epoch": 0.16422932847982052,
"grad_norm": 0.65625,
"learning_rate": 1.930072932917188e-05,
"loss": 0.0944,
"step": 485
},
{
"epoch": 0.16456794565194388,
"grad_norm": 0.609375,
"learning_rate": 1.9296610377554496e-05,
"loss": 0.0713,
"step": 486
},
{
"epoch": 0.1649065628240672,
"grad_norm": 0.51953125,
"learning_rate": 1.9292479772635236e-05,
"loss": 0.0654,
"step": 487
},
{
"epoch": 0.16524517999619057,
"grad_norm": 0.458984375,
"learning_rate": 1.9288337519591827e-05,
"loss": 0.0673,
"step": 488
},
{
"epoch": 0.1655837971683139,
"grad_norm": 0.71484375,
"learning_rate": 1.9284183623616573e-05,
"loss": 0.0963,
"step": 489
},
{
"epoch": 0.16592241434043725,
"grad_norm": 0.8125,
"learning_rate": 1.9280018089916387e-05,
"loss": 0.083,
"step": 490
},
{
"epoch": 0.16626103151256058,
"grad_norm": 0.5703125,
"learning_rate": 1.927584092371277e-05,
"loss": 0.0769,
"step": 491
},
{
"epoch": 0.1665996486846839,
"grad_norm": 0.62109375,
"learning_rate": 1.9271652130241794e-05,
"loss": 0.0801,
"step": 492
},
{
"epoch": 0.16693826585680727,
"grad_norm": 0.5703125,
"learning_rate": 1.9267451714754113e-05,
"loss": 0.0599,
"step": 493
},
{
"epoch": 0.1672768830289306,
"grad_norm": 0.54296875,
"learning_rate": 1.9263239682514953e-05,
"loss": 0.0793,
"step": 494
},
{
"epoch": 0.16761550020105395,
"grad_norm": 0.578125,
"learning_rate": 1.925901603880409e-05,
"loss": 0.0641,
"step": 495
},
{
"epoch": 0.16795411737317728,
"grad_norm": 0.71484375,
"learning_rate": 1.9254780788915865e-05,
"loss": 0.0641,
"step": 496
},
{
"epoch": 0.16829273454530064,
"grad_norm": 0.490234375,
"learning_rate": 1.9250533938159166e-05,
"loss": 0.0575,
"step": 497
},
{
"epoch": 0.16863135171742397,
"grad_norm": 0.5546875,
"learning_rate": 1.9246275491857417e-05,
"loss": 0.0695,
"step": 498
},
{
"epoch": 0.16896996888954732,
"grad_norm": 0.58203125,
"learning_rate": 1.9242005455348582e-05,
"loss": 0.0702,
"step": 499
},
{
"epoch": 0.16930858606167065,
"grad_norm": 0.57421875,
"learning_rate": 1.9237723833985154e-05,
"loss": 0.0819,
"step": 500
},
{
"epoch": 0.16964720323379398,
"grad_norm": 0.58984375,
"learning_rate": 1.9233430633134146e-05,
"loss": 0.0699,
"step": 501
},
{
"epoch": 0.16998582040591734,
"grad_norm": 0.51953125,
"learning_rate": 1.922912585817708e-05,
"loss": 0.0708,
"step": 502
},
{
"epoch": 0.17032443757804067,
"grad_norm": 0.5546875,
"learning_rate": 1.9224809514509998e-05,
"loss": 0.0752,
"step": 503
},
{
"epoch": 0.17066305475016402,
"grad_norm": 0.58203125,
"learning_rate": 1.9220481607543436e-05,
"loss": 0.0789,
"step": 504
},
{
"epoch": 0.17100167192228735,
"grad_norm": 0.55859375,
"learning_rate": 1.9216142142702424e-05,
"loss": 0.0735,
"step": 505
},
{
"epoch": 0.1713402890944107,
"grad_norm": 0.6015625,
"learning_rate": 1.921179112542648e-05,
"loss": 0.0773,
"step": 506
},
{
"epoch": 0.17167890626653404,
"grad_norm": 0.5234375,
"learning_rate": 1.920742856116961e-05,
"loss": 0.0579,
"step": 507
},
{
"epoch": 0.1720175234386574,
"grad_norm": 0.50390625,
"learning_rate": 1.920305445540028e-05,
"loss": 0.0578,
"step": 508
},
{
"epoch": 0.17235614061078072,
"grad_norm": 0.53515625,
"learning_rate": 1.9198668813601443e-05,
"loss": 0.0664,
"step": 509
},
{
"epoch": 0.17269475778290408,
"grad_norm": 0.57421875,
"learning_rate": 1.919427164127049e-05,
"loss": 0.0685,
"step": 510
},
{
"epoch": 0.1730333749550274,
"grad_norm": 0.66015625,
"learning_rate": 1.918986294391929e-05,
"loss": 0.0815,
"step": 511
},
{
"epoch": 0.17337199212715074,
"grad_norm": 0.76171875,
"learning_rate": 1.918544272707413e-05,
"loss": 0.0878,
"step": 512
},
{
"epoch": 0.1737106092992741,
"grad_norm": 0.578125,
"learning_rate": 1.9181010996275767e-05,
"loss": 0.0727,
"step": 513
},
{
"epoch": 0.17404922647139742,
"grad_norm": 0.47265625,
"learning_rate": 1.9176567757079368e-05,
"loss": 0.0583,
"step": 514
},
{
"epoch": 0.17438784364352078,
"grad_norm": 0.54296875,
"learning_rate": 1.917211301505453e-05,
"loss": 0.073,
"step": 515
},
{
"epoch": 0.1747264608156441,
"grad_norm": 0.65234375,
"learning_rate": 1.916764677578528e-05,
"loss": 0.0841,
"step": 516
},
{
"epoch": 0.17506507798776746,
"grad_norm": 0.412109375,
"learning_rate": 1.916316904487005e-05,
"loss": 0.0486,
"step": 517
},
{
"epoch": 0.1754036951598908,
"grad_norm": 0.64453125,
"learning_rate": 1.9158679827921667e-05,
"loss": 0.088,
"step": 518
},
{
"epoch": 0.17574231233201415,
"grad_norm": 0.484375,
"learning_rate": 1.9154179130567374e-05,
"loss": 0.0673,
"step": 519
},
{
"epoch": 0.17608092950413748,
"grad_norm": 0.54296875,
"learning_rate": 1.9149666958448792e-05,
"loss": 0.0723,
"step": 520
},
{
"epoch": 0.1764195466762608,
"grad_norm": 0.86328125,
"learning_rate": 1.9145143317221925e-05,
"loss": 0.0824,
"step": 521
},
{
"epoch": 0.17675816384838416,
"grad_norm": 0.53125,
"learning_rate": 1.9140608212557165e-05,
"loss": 0.0802,
"step": 522
},
{
"epoch": 0.1770967810205075,
"grad_norm": 1.9375,
"learning_rate": 1.9136061650139262e-05,
"loss": 0.0781,
"step": 523
},
{
"epoch": 0.17743539819263085,
"grad_norm": 0.57421875,
"learning_rate": 1.9131503635667337e-05,
"loss": 0.0737,
"step": 524
},
{
"epoch": 0.17777401536475418,
"grad_norm": 0.5390625,
"learning_rate": 1.9126934174854856e-05,
"loss": 0.0691,
"step": 525
},
{
"epoch": 0.17811263253687754,
"grad_norm": 0.66796875,
"learning_rate": 1.9122353273429635e-05,
"loss": 0.0804,
"step": 526
},
{
"epoch": 0.17845124970900086,
"grad_norm": 0.6484375,
"learning_rate": 1.9117760937133843e-05,
"loss": 0.0839,
"step": 527
},
{
"epoch": 0.17878986688112422,
"grad_norm": 0.515625,
"learning_rate": 1.911315717172397e-05,
"loss": 0.0671,
"step": 528
},
{
"epoch": 0.17912848405324755,
"grad_norm": 0.458984375,
"learning_rate": 1.910854198297084e-05,
"loss": 0.061,
"step": 529
},
{
"epoch": 0.17946710122537088,
"grad_norm": 0.515625,
"learning_rate": 1.9103915376659583e-05,
"loss": 0.0598,
"step": 530
},
{
"epoch": 0.17980571839749424,
"grad_norm": 0.48046875,
"learning_rate": 1.909927735858966e-05,
"loss": 0.0592,
"step": 531
},
{
"epoch": 0.18014433556961756,
"grad_norm": 0.5859375,
"learning_rate": 1.9094627934574825e-05,
"loss": 0.0601,
"step": 532
},
{
"epoch": 0.18048295274174092,
"grad_norm": 0.828125,
"learning_rate": 1.9089967110443127e-05,
"loss": 0.0756,
"step": 533
},
{
"epoch": 0.18082156991386425,
"grad_norm": 0.53515625,
"learning_rate": 1.9085294892036914e-05,
"loss": 0.0741,
"step": 534
},
{
"epoch": 0.1811601870859876,
"grad_norm": 0.5625,
"learning_rate": 1.908061128521281e-05,
"loss": 0.0654,
"step": 535
},
{
"epoch": 0.18149880425811094,
"grad_norm": 0.5390625,
"learning_rate": 1.907591629584172e-05,
"loss": 0.0712,
"step": 536
},
{
"epoch": 0.1818374214302343,
"grad_norm": 0.51953125,
"learning_rate": 1.9071209929808808e-05,
"loss": 0.0643,
"step": 537
},
{
"epoch": 0.18217603860235762,
"grad_norm": 0.6171875,
"learning_rate": 1.9066492193013505e-05,
"loss": 0.0861,
"step": 538
},
{
"epoch": 0.18251465577448098,
"grad_norm": 0.52734375,
"learning_rate": 1.9061763091369498e-05,
"loss": 0.0656,
"step": 539
},
{
"epoch": 0.1828532729466043,
"grad_norm": 0.48828125,
"learning_rate": 1.9057022630804715e-05,
"loss": 0.0592,
"step": 540
},
{
"epoch": 0.18319189011872763,
"grad_norm": 0.6640625,
"learning_rate": 1.9052270817261323e-05,
"loss": 0.0877,
"step": 541
},
{
"epoch": 0.183530507290851,
"grad_norm": 0.51171875,
"learning_rate": 1.9047507656695722e-05,
"loss": 0.0686,
"step": 542
},
{
"epoch": 0.18386912446297432,
"grad_norm": 0.53515625,
"learning_rate": 1.9042733155078536e-05,
"loss": 0.0651,
"step": 543
},
{
"epoch": 0.18420774163509768,
"grad_norm": 0.46484375,
"learning_rate": 1.9037947318394594e-05,
"loss": 0.0624,
"step": 544
},
{
"epoch": 0.184546358807221,
"grad_norm": 0.609375,
"learning_rate": 1.9033150152642953e-05,
"loss": 0.073,
"step": 545
},
{
"epoch": 0.18488497597934436,
"grad_norm": 0.419921875,
"learning_rate": 1.9028341663836855e-05,
"loss": 0.0587,
"step": 546
},
{
"epoch": 0.1852235931514677,
"grad_norm": 0.53125,
"learning_rate": 1.9023521858003744e-05,
"loss": 0.0741,
"step": 547
},
{
"epoch": 0.18556221032359105,
"grad_norm": 0.6328125,
"learning_rate": 1.9018690741185244e-05,
"loss": 0.0801,
"step": 548
},
{
"epoch": 0.18590082749571438,
"grad_norm": 0.55859375,
"learning_rate": 1.9013848319437163e-05,
"loss": 0.0627,
"step": 549
},
{
"epoch": 0.1862394446678377,
"grad_norm": 0.92578125,
"learning_rate": 1.900899459882948e-05,
"loss": 0.0828,
"step": 550
},
{
"epoch": 0.18657806183996106,
"grad_norm": 0.5390625,
"learning_rate": 1.9004129585446326e-05,
"loss": 0.0765,
"step": 551
},
{
"epoch": 0.1869166790120844,
"grad_norm": 0.50390625,
"learning_rate": 1.8999253285386e-05,
"loss": 0.0803,
"step": 552
},
{
"epoch": 0.18725529618420775,
"grad_norm": 0.60546875,
"learning_rate": 1.8994365704760946e-05,
"loss": 0.0785,
"step": 553
},
{
"epoch": 0.18759391335633108,
"grad_norm": 0.66015625,
"learning_rate": 1.8989466849697745e-05,
"loss": 0.0949,
"step": 554
},
{
"epoch": 0.18793253052845443,
"grad_norm": 0.486328125,
"learning_rate": 1.8984556726337113e-05,
"loss": 0.062,
"step": 555
},
{
"epoch": 0.18827114770057776,
"grad_norm": 0.56640625,
"learning_rate": 1.8979635340833887e-05,
"loss": 0.0739,
"step": 556
},
{
"epoch": 0.18860976487270112,
"grad_norm": 0.455078125,
"learning_rate": 1.897470269935703e-05,
"loss": 0.0574,
"step": 557
},
{
"epoch": 0.18894838204482445,
"grad_norm": 0.5625,
"learning_rate": 1.8969758808089602e-05,
"loss": 0.0689,
"step": 558
},
{
"epoch": 0.18928699921694778,
"grad_norm": 0.59375,
"learning_rate": 1.8964803673228776e-05,
"loss": 0.0719,
"step": 559
},
{
"epoch": 0.18962561638907113,
"grad_norm": 0.58203125,
"learning_rate": 1.895983730098581e-05,
"loss": 0.0746,
"step": 560
},
{
"epoch": 0.18996423356119446,
"grad_norm": 0.5234375,
"learning_rate": 1.8954859697586057e-05,
"loss": 0.07,
"step": 561
},
{
"epoch": 0.19030285073331782,
"grad_norm": 0.55078125,
"learning_rate": 1.8949870869268942e-05,
"loss": 0.076,
"step": 562
},
{
"epoch": 0.19064146790544115,
"grad_norm": 0.462890625,
"learning_rate": 1.8944870822287957e-05,
"loss": 0.0698,
"step": 563
},
{
"epoch": 0.1909800850775645,
"grad_norm": 0.466796875,
"learning_rate": 1.893985956291067e-05,
"loss": 0.0552,
"step": 564
},
{
"epoch": 0.19131870224968783,
"grad_norm": 0.5390625,
"learning_rate": 1.893483709741868e-05,
"loss": 0.0708,
"step": 565
},
{
"epoch": 0.1916573194218112,
"grad_norm": 0.62109375,
"learning_rate": 1.8929803432107662e-05,
"loss": 0.0855,
"step": 566
},
{
"epoch": 0.19199593659393452,
"grad_norm": 0.5390625,
"learning_rate": 1.8924758573287315e-05,
"loss": 0.0745,
"step": 567
},
{
"epoch": 0.19233455376605785,
"grad_norm": 0.51171875,
"learning_rate": 1.891970252728136e-05,
"loss": 0.07,
"step": 568
},
{
"epoch": 0.1926731709381812,
"grad_norm": 0.56640625,
"learning_rate": 1.8914635300427563e-05,
"loss": 0.0778,
"step": 569
},
{
"epoch": 0.19301178811030453,
"grad_norm": 0.46484375,
"learning_rate": 1.8909556899077683e-05,
"loss": 0.0545,
"step": 570
},
{
"epoch": 0.1933504052824279,
"grad_norm": 0.435546875,
"learning_rate": 1.8904467329597503e-05,
"loss": 0.0503,
"step": 571
},
{
"epoch": 0.19368902245455122,
"grad_norm": 0.7734375,
"learning_rate": 1.8899366598366796e-05,
"loss": 0.0593,
"step": 572
},
{
"epoch": 0.19402763962667458,
"grad_norm": 0.6640625,
"learning_rate": 1.8894254711779333e-05,
"loss": 0.1005,
"step": 573
},
{
"epoch": 0.1943662567987979,
"grad_norm": 0.51953125,
"learning_rate": 1.8889131676242858e-05,
"loss": 0.0604,
"step": 574
},
{
"epoch": 0.19470487397092126,
"grad_norm": 1.5859375,
"learning_rate": 1.8883997498179103e-05,
"loss": 0.0908,
"step": 575
},
{
"epoch": 0.1950434911430446,
"grad_norm": 0.58984375,
"learning_rate": 1.8878852184023754e-05,
"loss": 0.0736,
"step": 576
},
{
"epoch": 0.19538210831516795,
"grad_norm": 0.52734375,
"learning_rate": 1.8873695740226468e-05,
"loss": 0.0734,
"step": 577
},
{
"epoch": 0.19572072548729127,
"grad_norm": 0.53125,
"learning_rate": 1.8868528173250846e-05,
"loss": 0.0574,
"step": 578
},
{
"epoch": 0.1960593426594146,
"grad_norm": 0.63671875,
"learning_rate": 1.886334948957443e-05,
"loss": 0.0719,
"step": 579
},
{
"epoch": 0.19639795983153796,
"grad_norm": 0.462890625,
"learning_rate": 1.8858159695688708e-05,
"loss": 0.0642,
"step": 580
},
{
"epoch": 0.1967365770036613,
"grad_norm": 0.62109375,
"learning_rate": 1.885295879809908e-05,
"loss": 0.0721,
"step": 581
},
{
"epoch": 0.19707519417578465,
"grad_norm": 0.5625,
"learning_rate": 1.884774680332487e-05,
"loss": 0.0739,
"step": 582
},
{
"epoch": 0.19741381134790797,
"grad_norm": 0.5859375,
"learning_rate": 1.8842523717899326e-05,
"loss": 0.0791,
"step": 583
},
{
"epoch": 0.19775242852003133,
"grad_norm": 0.5,
"learning_rate": 1.8837289548369574e-05,
"loss": 0.0719,
"step": 584
},
{
"epoch": 0.19809104569215466,
"grad_norm": 0.515625,
"learning_rate": 1.8832044301296652e-05,
"loss": 0.0706,
"step": 585
},
{
"epoch": 0.19842966286427802,
"grad_norm": 0.60546875,
"learning_rate": 1.8826787983255474e-05,
"loss": 0.0736,
"step": 586
},
{
"epoch": 0.19876828003640135,
"grad_norm": 0.63671875,
"learning_rate": 1.882152060083484e-05,
"loss": 0.0699,
"step": 587
},
{
"epoch": 0.19910689720852467,
"grad_norm": 0.388671875,
"learning_rate": 1.881624216063741e-05,
"loss": 0.0479,
"step": 588
},
{
"epoch": 0.19944551438064803,
"grad_norm": 0.49609375,
"learning_rate": 1.8810952669279707e-05,
"loss": 0.0669,
"step": 589
},
{
"epoch": 0.19978413155277136,
"grad_norm": 0.71484375,
"learning_rate": 1.8805652133392115e-05,
"loss": 0.0875,
"step": 590
},
{
"epoch": 0.20012274872489472,
"grad_norm": 0.494140625,
"learning_rate": 1.8800340559618855e-05,
"loss": 0.0666,
"step": 591
},
{
"epoch": 0.20046136589701805,
"grad_norm": 0.62109375,
"learning_rate": 1.8795017954617982e-05,
"loss": 0.0774,
"step": 592
},
{
"epoch": 0.20046136589701805,
"eval_loss": 0.07367200404405594,
"eval_runtime": 815.492,
"eval_samples_per_second": 12.199,
"eval_steps_per_second": 3.05,
"step": 592
},
{
"epoch": 0.2007999830691414,
"grad_norm": 0.58984375,
"learning_rate": 1.8789684325061382e-05,
"loss": 0.0784,
"step": 593
},
{
"epoch": 0.20113860024126473,
"grad_norm": 0.6796875,
"learning_rate": 1.8784339677634763e-05,
"loss": 0.0774,
"step": 594
},
{
"epoch": 0.2014772174133881,
"grad_norm": 0.56640625,
"learning_rate": 1.8778984019037642e-05,
"loss": 0.0737,
"step": 595
},
{
"epoch": 0.20181583458551142,
"grad_norm": 0.62109375,
"learning_rate": 1.8773617355983332e-05,
"loss": 0.0823,
"step": 596
},
{
"epoch": 0.20215445175763475,
"grad_norm": 0.5078125,
"learning_rate": 1.8768239695198945e-05,
"loss": 0.0601,
"step": 597
},
{
"epoch": 0.2024930689297581,
"grad_norm": 0.48828125,
"learning_rate": 1.876285104342539e-05,
"loss": 0.0671,
"step": 598
},
{
"epoch": 0.20283168610188143,
"grad_norm": 0.5703125,
"learning_rate": 1.8757451407417332e-05,
"loss": 0.0679,
"step": 599
},
{
"epoch": 0.2031703032740048,
"grad_norm": 0.6796875,
"learning_rate": 1.8752040793943215e-05,
"loss": 0.0969,
"step": 600
},
{
"epoch": 0.20350892044612812,
"grad_norm": 0.5546875,
"learning_rate": 1.8746619209785253e-05,
"loss": 0.0729,
"step": 601
},
{
"epoch": 0.20384753761825147,
"grad_norm": 0.73828125,
"learning_rate": 1.874118666173939e-05,
"loss": 0.1034,
"step": 602
},
{
"epoch": 0.2041861547903748,
"grad_norm": 0.52734375,
"learning_rate": 1.8735743156615337e-05,
"loss": 0.0666,
"step": 603
},
{
"epoch": 0.20452477196249816,
"grad_norm": 0.474609375,
"learning_rate": 1.873028870123652e-05,
"loss": 0.0677,
"step": 604
},
{
"epoch": 0.2048633891346215,
"grad_norm": 0.66015625,
"learning_rate": 1.87248233024401e-05,
"loss": 0.0931,
"step": 605
},
{
"epoch": 0.20520200630674484,
"grad_norm": 0.54296875,
"learning_rate": 1.871934696707696e-05,
"loss": 0.0632,
"step": 606
},
{
"epoch": 0.20554062347886817,
"grad_norm": 0.56640625,
"learning_rate": 1.871385970201168e-05,
"loss": 0.0587,
"step": 607
},
{
"epoch": 0.2058792406509915,
"grad_norm": 0.60546875,
"learning_rate": 1.870836151412255e-05,
"loss": 0.0776,
"step": 608
},
{
"epoch": 0.20621785782311486,
"grad_norm": 0.52734375,
"learning_rate": 1.8702852410301556e-05,
"loss": 0.0557,
"step": 609
},
{
"epoch": 0.2065564749952382,
"grad_norm": 0.6328125,
"learning_rate": 1.869733239745435e-05,
"loss": 0.0801,
"step": 610
},
{
"epoch": 0.20689509216736154,
"grad_norm": 0.498046875,
"learning_rate": 1.869180148250027e-05,
"loss": 0.0632,
"step": 611
},
{
"epoch": 0.20723370933948487,
"grad_norm": 0.474609375,
"learning_rate": 1.8686259672372323e-05,
"loss": 0.0592,
"step": 612
},
{
"epoch": 0.20757232651160823,
"grad_norm": 0.50390625,
"learning_rate": 1.8680706974017164e-05,
"loss": 0.0714,
"step": 613
},
{
"epoch": 0.20791094368373156,
"grad_norm": 0.5625,
"learning_rate": 1.8675143394395106e-05,
"loss": 0.066,
"step": 614
},
{
"epoch": 0.20824956085585491,
"grad_norm": 1.15625,
"learning_rate": 1.8669568940480093e-05,
"loss": 0.0525,
"step": 615
},
{
"epoch": 0.20858817802797824,
"grad_norm": 0.478515625,
"learning_rate": 1.86639836192597e-05,
"loss": 0.0637,
"step": 616
},
{
"epoch": 0.20892679520010157,
"grad_norm": 0.494140625,
"learning_rate": 1.8658387437735137e-05,
"loss": 0.0581,
"step": 617
},
{
"epoch": 0.20926541237222493,
"grad_norm": 0.4296875,
"learning_rate": 1.865278040292121e-05,
"loss": 0.0503,
"step": 618
},
{
"epoch": 0.20960402954434826,
"grad_norm": 0.455078125,
"learning_rate": 1.864716252184634e-05,
"loss": 0.0602,
"step": 619
},
{
"epoch": 0.20994264671647161,
"grad_norm": 0.62109375,
"learning_rate": 1.864153380155254e-05,
"loss": 0.0762,
"step": 620
},
{
"epoch": 0.21028126388859494,
"grad_norm": 0.458984375,
"learning_rate": 1.863589424909541e-05,
"loss": 0.0517,
"step": 621
},
{
"epoch": 0.2106198810607183,
"grad_norm": 0.4921875,
"learning_rate": 1.863024387154414e-05,
"loss": 0.0551,
"step": 622
},
{
"epoch": 0.21095849823284163,
"grad_norm": 0.71875,
"learning_rate": 1.8624582675981466e-05,
"loss": 0.0596,
"step": 623
},
{
"epoch": 0.21129711540496499,
"grad_norm": 0.48828125,
"learning_rate": 1.8618910669503704e-05,
"loss": 0.0525,
"step": 624
},
{
"epoch": 0.21163573257708831,
"grad_norm": 0.515625,
"learning_rate": 1.861322785922071e-05,
"loss": 0.0619,
"step": 625
},
{
"epoch": 0.21197434974921164,
"grad_norm": 0.578125,
"learning_rate": 1.8607534252255896e-05,
"loss": 0.0728,
"step": 626
},
{
"epoch": 0.212312966921335,
"grad_norm": 0.435546875,
"learning_rate": 1.8601829855746187e-05,
"loss": 0.0583,
"step": 627
},
{
"epoch": 0.21265158409345833,
"grad_norm": 0.478515625,
"learning_rate": 1.8596114676842054e-05,
"loss": 0.0659,
"step": 628
},
{
"epoch": 0.21299020126558169,
"grad_norm": 0.56640625,
"learning_rate": 1.8590388722707465e-05,
"loss": 0.08,
"step": 629
},
{
"epoch": 0.21332881843770501,
"grad_norm": 0.60546875,
"learning_rate": 1.8584652000519913e-05,
"loss": 0.0701,
"step": 630
},
{
"epoch": 0.21366743560982837,
"grad_norm": 0.65625,
"learning_rate": 1.8578904517470375e-05,
"loss": 0.0718,
"step": 631
},
{
"epoch": 0.2140060527819517,
"grad_norm": 0.69921875,
"learning_rate": 1.8573146280763327e-05,
"loss": 0.1271,
"step": 632
},
{
"epoch": 0.21434466995407506,
"grad_norm": 0.59375,
"learning_rate": 1.856737729761671e-05,
"loss": 0.0677,
"step": 633
},
{
"epoch": 0.21468328712619839,
"grad_norm": 0.6953125,
"learning_rate": 1.856159757526195e-05,
"loss": 0.0763,
"step": 634
},
{
"epoch": 0.21502190429832174,
"grad_norm": 0.478515625,
"learning_rate": 1.8555807120943927e-05,
"loss": 0.0588,
"step": 635
},
{
"epoch": 0.21536052147044507,
"grad_norm": 0.78125,
"learning_rate": 1.8550005941920984e-05,
"loss": 0.1482,
"step": 636
},
{
"epoch": 0.2156991386425684,
"grad_norm": 0.6640625,
"learning_rate": 1.8544194045464888e-05,
"loss": 0.0953,
"step": 637
},
{
"epoch": 0.21603775581469176,
"grad_norm": 0.60546875,
"learning_rate": 1.8538371438860858e-05,
"loss": 0.0685,
"step": 638
},
{
"epoch": 0.21637637298681509,
"grad_norm": 0.51171875,
"learning_rate": 1.8532538129407532e-05,
"loss": 0.0665,
"step": 639
},
{
"epoch": 0.21671499015893844,
"grad_norm": 0.45703125,
"learning_rate": 1.8526694124416963e-05,
"loss": 0.049,
"step": 640
},
{
"epoch": 0.21705360733106177,
"grad_norm": 0.78125,
"learning_rate": 1.852083943121461e-05,
"loss": 0.086,
"step": 641
},
{
"epoch": 0.21739222450318513,
"grad_norm": 0.5390625,
"learning_rate": 1.8514974057139335e-05,
"loss": 0.0779,
"step": 642
},
{
"epoch": 0.21773084167530846,
"grad_norm": 0.50390625,
"learning_rate": 1.8509098009543378e-05,
"loss": 0.0581,
"step": 643
},
{
"epoch": 0.2180694588474318,
"grad_norm": 0.5625,
"learning_rate": 1.8503211295792375e-05,
"loss": 0.0687,
"step": 644
},
{
"epoch": 0.21840807601955514,
"grad_norm": 0.8125,
"learning_rate": 1.8497313923265315e-05,
"loss": 0.1094,
"step": 645
},
{
"epoch": 0.21874669319167847,
"grad_norm": 0.63671875,
"learning_rate": 1.8491405899354556e-05,
"loss": 0.0887,
"step": 646
},
{
"epoch": 0.21908531036380183,
"grad_norm": 0.54296875,
"learning_rate": 1.848548723146581e-05,
"loss": 0.0721,
"step": 647
},
{
"epoch": 0.21942392753592516,
"grad_norm": 0.51171875,
"learning_rate": 1.8479557927018127e-05,
"loss": 0.0684,
"step": 648
},
{
"epoch": 0.2197625447080485,
"grad_norm": 0.5078125,
"learning_rate": 1.8473617993443885e-05,
"loss": 0.0704,
"step": 649
},
{
"epoch": 0.22010116188017184,
"grad_norm": 0.52734375,
"learning_rate": 1.8467667438188794e-05,
"loss": 0.0695,
"step": 650
},
{
"epoch": 0.2204397790522952,
"grad_norm": 0.54296875,
"learning_rate": 1.8461706268711878e-05,
"loss": 0.0717,
"step": 651
},
{
"epoch": 0.22077839622441853,
"grad_norm": 0.4375,
"learning_rate": 1.8455734492485464e-05,
"loss": 0.0598,
"step": 652
},
{
"epoch": 0.22111701339654188,
"grad_norm": 0.486328125,
"learning_rate": 1.844975211699517e-05,
"loss": 0.0602,
"step": 653
},
{
"epoch": 0.2214556305686652,
"grad_norm": 0.55078125,
"learning_rate": 1.8443759149739906e-05,
"loss": 0.0675,
"step": 654
},
{
"epoch": 0.22179424774078854,
"grad_norm": 0.48828125,
"learning_rate": 1.8437755598231857e-05,
"loss": 0.0677,
"step": 655
},
{
"epoch": 0.2221328649129119,
"grad_norm": 0.57421875,
"learning_rate": 1.8431741469996475e-05,
"loss": 0.0745,
"step": 656
},
{
"epoch": 0.22247148208503523,
"grad_norm": 0.515625,
"learning_rate": 1.8425716772572472e-05,
"loss": 0.0688,
"step": 657
},
{
"epoch": 0.22281009925715858,
"grad_norm": 0.85546875,
"learning_rate": 1.8419681513511807e-05,
"loss": 0.0683,
"step": 658
},
{
"epoch": 0.2231487164292819,
"grad_norm": 1.0,
"learning_rate": 1.8413635700379674e-05,
"loss": 0.0793,
"step": 659
},
{
"epoch": 0.22348733360140527,
"grad_norm": 0.515625,
"learning_rate": 1.84075793407545e-05,
"loss": 0.0661,
"step": 660
},
{
"epoch": 0.2238259507735286,
"grad_norm": 0.66015625,
"learning_rate": 1.840151244222794e-05,
"loss": 0.087,
"step": 661
},
{
"epoch": 0.22416456794565195,
"grad_norm": 0.43359375,
"learning_rate": 1.8395435012404837e-05,
"loss": 0.0571,
"step": 662
},
{
"epoch": 0.22450318511777528,
"grad_norm": 0.49609375,
"learning_rate": 1.838934705890327e-05,
"loss": 0.0709,
"step": 663
},
{
"epoch": 0.22484180228989864,
"grad_norm": 0.48828125,
"learning_rate": 1.838324858935447e-05,
"loss": 0.0707,
"step": 664
},
{
"epoch": 0.22518041946202197,
"grad_norm": 0.51171875,
"learning_rate": 1.8377139611402883e-05,
"loss": 0.0706,
"step": 665
},
{
"epoch": 0.2255190366341453,
"grad_norm": 0.43359375,
"learning_rate": 1.8371020132706104e-05,
"loss": 0.0537,
"step": 666
},
{
"epoch": 0.22585765380626865,
"grad_norm": 0.7265625,
"learning_rate": 1.8364890160934905e-05,
"loss": 0.0909,
"step": 667
},
{
"epoch": 0.22619627097839198,
"grad_norm": 0.45703125,
"learning_rate": 1.8358749703773206e-05,
"loss": 0.0635,
"step": 668
},
{
"epoch": 0.22653488815051534,
"grad_norm": 0.671875,
"learning_rate": 1.835259876891807e-05,
"loss": 0.0939,
"step": 669
},
{
"epoch": 0.22687350532263867,
"grad_norm": 0.56640625,
"learning_rate": 1.8346437364079693e-05,
"loss": 0.0852,
"step": 670
},
{
"epoch": 0.22721212249476203,
"grad_norm": 0.70703125,
"learning_rate": 1.8340265496981395e-05,
"loss": 0.0639,
"step": 671
},
{
"epoch": 0.22755073966688535,
"grad_norm": 0.48828125,
"learning_rate": 1.8334083175359616e-05,
"loss": 0.0598,
"step": 672
},
{
"epoch": 0.2278893568390087,
"grad_norm": 0.6484375,
"learning_rate": 1.8327890406963895e-05,
"loss": 0.0872,
"step": 673
},
{
"epoch": 0.22822797401113204,
"grad_norm": 0.55859375,
"learning_rate": 1.8321687199556872e-05,
"loss": 0.0835,
"step": 674
},
{
"epoch": 0.22856659118325537,
"grad_norm": 0.478515625,
"learning_rate": 1.8315473560914258e-05,
"loss": 0.0586,
"step": 675
},
{
"epoch": 0.22890520835537873,
"grad_norm": 0.40234375,
"learning_rate": 1.8309249498824853e-05,
"loss": 0.0586,
"step": 676
},
{
"epoch": 0.22924382552750205,
"grad_norm": 0.474609375,
"learning_rate": 1.8303015021090526e-05,
"loss": 0.0627,
"step": 677
},
{
"epoch": 0.2295824426996254,
"grad_norm": 0.640625,
"learning_rate": 1.829677013552619e-05,
"loss": 0.0771,
"step": 678
},
{
"epoch": 0.22992105987174874,
"grad_norm": 0.55859375,
"learning_rate": 1.829051484995981e-05,
"loss": 0.0759,
"step": 679
},
{
"epoch": 0.2302596770438721,
"grad_norm": 0.546875,
"learning_rate": 1.828424917223239e-05,
"loss": 0.0696,
"step": 680
},
{
"epoch": 0.23059829421599543,
"grad_norm": 0.5859375,
"learning_rate": 1.827797311019795e-05,
"loss": 0.078,
"step": 681
},
{
"epoch": 0.23093691138811878,
"grad_norm": 0.5234375,
"learning_rate": 1.8271686671723543e-05,
"loss": 0.0612,
"step": 682
},
{
"epoch": 0.2312755285602421,
"grad_norm": 0.62109375,
"learning_rate": 1.8265389864689213e-05,
"loss": 0.0886,
"step": 683
},
{
"epoch": 0.23161414573236544,
"grad_norm": 0.58203125,
"learning_rate": 1.8259082696988013e-05,
"loss": 0.0824,
"step": 684
},
{
"epoch": 0.2319527629044888,
"grad_norm": 0.5859375,
"learning_rate": 1.8252765176525976e-05,
"loss": 0.0776,
"step": 685
},
{
"epoch": 0.23229138007661213,
"grad_norm": 0.64453125,
"learning_rate": 1.8246437311222117e-05,
"loss": 0.0831,
"step": 686
},
{
"epoch": 0.23262999724873548,
"grad_norm": 2.171875,
"learning_rate": 1.8240099109008413e-05,
"loss": 0.0765,
"step": 687
},
{
"epoch": 0.2329686144208588,
"grad_norm": 0.5390625,
"learning_rate": 1.82337505778298e-05,
"loss": 0.0721,
"step": 688
},
{
"epoch": 0.23330723159298217,
"grad_norm": 0.578125,
"learning_rate": 1.8227391725644167e-05,
"loss": 0.0893,
"step": 689
},
{
"epoch": 0.2336458487651055,
"grad_norm": 0.451171875,
"learning_rate": 1.822102256042233e-05,
"loss": 0.0597,
"step": 690
},
{
"epoch": 0.23398446593722885,
"grad_norm": 0.73046875,
"learning_rate": 1.8214643090148044e-05,
"loss": 0.0805,
"step": 691
},
{
"epoch": 0.23432308310935218,
"grad_norm": 0.55859375,
"learning_rate": 1.820825332281797e-05,
"loss": 0.0646,
"step": 692
},
{
"epoch": 0.23466170028147554,
"grad_norm": 0.5703125,
"learning_rate": 1.820185326644169e-05,
"loss": 0.0747,
"step": 693
},
{
"epoch": 0.23500031745359887,
"grad_norm": 0.5234375,
"learning_rate": 1.819544292904166e-05,
"loss": 0.0687,
"step": 694
},
{
"epoch": 0.2353389346257222,
"grad_norm": 0.486328125,
"learning_rate": 1.8189022318653254e-05,
"loss": 0.0573,
"step": 695
},
{
"epoch": 0.23567755179784555,
"grad_norm": 0.427734375,
"learning_rate": 1.81825914433247e-05,
"loss": 0.0576,
"step": 696
},
{
"epoch": 0.23601616896996888,
"grad_norm": 0.671875,
"learning_rate": 1.8176150311117103e-05,
"loss": 0.0783,
"step": 697
},
{
"epoch": 0.23635478614209224,
"grad_norm": 0.55078125,
"learning_rate": 1.816969893010442e-05,
"loss": 0.063,
"step": 698
},
{
"epoch": 0.23669340331421557,
"grad_norm": 0.59375,
"learning_rate": 1.8163237308373465e-05,
"loss": 0.084,
"step": 699
},
{
"epoch": 0.23703202048633892,
"grad_norm": 0.44921875,
"learning_rate": 1.8156765454023873e-05,
"loss": 0.0549,
"step": 700
},
{
"epoch": 0.23737063765846225,
"grad_norm": 0.61328125,
"learning_rate": 1.8150283375168112e-05,
"loss": 0.0821,
"step": 701
},
{
"epoch": 0.2377092548305856,
"grad_norm": 0.4921875,
"learning_rate": 1.814379107993148e-05,
"loss": 0.0675,
"step": 702
},
{
"epoch": 0.23804787200270894,
"grad_norm": 0.578125,
"learning_rate": 1.8137288576452064e-05,
"loss": 0.0852,
"step": 703
},
{
"epoch": 0.23838648917483227,
"grad_norm": 0.478515625,
"learning_rate": 1.8130775872880748e-05,
"loss": 0.0743,
"step": 704
},
{
"epoch": 0.23872510634695562,
"grad_norm": 0.56640625,
"learning_rate": 1.812425297738121e-05,
"loss": 0.0767,
"step": 705
},
{
"epoch": 0.23906372351907895,
"grad_norm": 0.671875,
"learning_rate": 1.81177198981299e-05,
"loss": 0.0984,
"step": 706
},
{
"epoch": 0.2394023406912023,
"grad_norm": 0.435546875,
"learning_rate": 1.811117664331604e-05,
"loss": 0.0517,
"step": 707
},
{
"epoch": 0.23974095786332564,
"grad_norm": 0.451171875,
"learning_rate": 1.810462322114159e-05,
"loss": 0.0606,
"step": 708
},
{
"epoch": 0.240079575035449,
"grad_norm": 0.462890625,
"learning_rate": 1.8098059639821265e-05,
"loss": 0.0588,
"step": 709
},
{
"epoch": 0.24041819220757232,
"grad_norm": 0.546875,
"learning_rate": 1.809148590758252e-05,
"loss": 0.0773,
"step": 710
},
{
"epoch": 0.24075680937969568,
"grad_norm": 0.439453125,
"learning_rate": 1.8084902032665533e-05,
"loss": 0.0607,
"step": 711
},
{
"epoch": 0.241095426551819,
"grad_norm": 0.5625,
"learning_rate": 1.8078308023323186e-05,
"loss": 0.0862,
"step": 712
},
{
"epoch": 0.24143404372394234,
"grad_norm": 0.62109375,
"learning_rate": 1.8071703887821067e-05,
"loss": 0.0735,
"step": 713
},
{
"epoch": 0.2417726608960657,
"grad_norm": 0.50390625,
"learning_rate": 1.8065089634437467e-05,
"loss": 0.0684,
"step": 714
},
{
"epoch": 0.24211127806818902,
"grad_norm": 0.53515625,
"learning_rate": 1.805846527146335e-05,
"loss": 0.0843,
"step": 715
},
{
"epoch": 0.24244989524031238,
"grad_norm": 0.53125,
"learning_rate": 1.8051830807202355e-05,
"loss": 0.0703,
"step": 716
},
{
"epoch": 0.2427885124124357,
"grad_norm": 0.52734375,
"learning_rate": 1.8045186249970786e-05,
"loss": 0.0828,
"step": 717
},
{
"epoch": 0.24312712958455907,
"grad_norm": 0.80859375,
"learning_rate": 1.8038531608097592e-05,
"loss": 0.1078,
"step": 718
},
{
"epoch": 0.2434657467566824,
"grad_norm": 0.55078125,
"learning_rate": 1.803186688992437e-05,
"loss": 0.0585,
"step": 719
},
{
"epoch": 0.24380436392880575,
"grad_norm": 0.5546875,
"learning_rate": 1.8025192103805348e-05,
"loss": 0.0646,
"step": 720
},
{
"epoch": 0.24414298110092908,
"grad_norm": 0.7109375,
"learning_rate": 1.8018507258107364e-05,
"loss": 0.0928,
"step": 721
},
{
"epoch": 0.2444815982730524,
"grad_norm": 0.546875,
"learning_rate": 1.801181236120988e-05,
"loss": 0.0747,
"step": 722
},
{
"epoch": 0.24482021544517577,
"grad_norm": 0.47265625,
"learning_rate": 1.800510742150494e-05,
"loss": 0.057,
"step": 723
},
{
"epoch": 0.2451588326172991,
"grad_norm": 0.5078125,
"learning_rate": 1.7998392447397197e-05,
"loss": 0.0711,
"step": 724
},
{
"epoch": 0.24549744978942245,
"grad_norm": 0.56640625,
"learning_rate": 1.7991667447303865e-05,
"loss": 0.0806,
"step": 725
},
{
"epoch": 0.24583606696154578,
"grad_norm": 0.498046875,
"learning_rate": 1.7984932429654734e-05,
"loss": 0.0787,
"step": 726
},
{
"epoch": 0.24617468413366914,
"grad_norm": 0.53125,
"learning_rate": 1.7978187402892148e-05,
"loss": 0.0801,
"step": 727
},
{
"epoch": 0.24651330130579246,
"grad_norm": 0.5,
"learning_rate": 1.7971432375471e-05,
"loss": 0.0636,
"step": 728
},
{
"epoch": 0.24685191847791582,
"grad_norm": 0.51171875,
"learning_rate": 1.7964667355858718e-05,
"loss": 0.0683,
"step": 729
},
{
"epoch": 0.24719053565003915,
"grad_norm": 0.55078125,
"learning_rate": 1.7957892352535253e-05,
"loss": 0.0845,
"step": 730
},
{
"epoch": 0.2475291528221625,
"grad_norm": 0.89453125,
"learning_rate": 1.7951107373993074e-05,
"loss": 0.1793,
"step": 731
},
{
"epoch": 0.24786776999428584,
"grad_norm": 0.515625,
"learning_rate": 1.7944312428737154e-05,
"loss": 0.0657,
"step": 732
},
{
"epoch": 0.24820638716640916,
"grad_norm": 0.53515625,
"learning_rate": 1.793750752528495e-05,
"loss": 0.0679,
"step": 733
},
{
"epoch": 0.24854500433853252,
"grad_norm": 0.56640625,
"learning_rate": 1.7930692672166416e-05,
"loss": 0.0831,
"step": 734
},
{
"epoch": 0.24888362151065585,
"grad_norm": 0.59375,
"learning_rate": 1.7923867877923967e-05,
"loss": 0.0764,
"step": 735
},
{
"epoch": 0.2492222386827792,
"grad_norm": 0.515625,
"learning_rate": 1.791703315111249e-05,
"loss": 0.0583,
"step": 736
},
{
"epoch": 0.24956085585490254,
"grad_norm": 0.49609375,
"learning_rate": 1.7910188500299303e-05,
"loss": 0.0661,
"step": 737
},
{
"epoch": 0.2498994730270259,
"grad_norm": 2.953125,
"learning_rate": 1.7903333934064185e-05,
"loss": 0.0654,
"step": 738
},
{
"epoch": 0.2502380901991492,
"grad_norm": 0.482421875,
"learning_rate": 1.789646946099934e-05,
"loss": 0.0655,
"step": 739
},
{
"epoch": 0.2505767073712726,
"grad_norm": 0.55078125,
"learning_rate": 1.7889595089709377e-05,
"loss": 0.074,
"step": 740
},
{
"epoch": 0.25091532454339593,
"grad_norm": 0.51171875,
"learning_rate": 1.7882710828811322e-05,
"loss": 0.0676,
"step": 741
},
{
"epoch": 0.25125394171551924,
"grad_norm": 0.44921875,
"learning_rate": 1.7875816686934596e-05,
"loss": 0.053,
"step": 742
},
{
"epoch": 0.2515925588876426,
"grad_norm": 0.490234375,
"learning_rate": 1.7868912672721014e-05,
"loss": 0.0705,
"step": 743
},
{
"epoch": 0.25193117605976595,
"grad_norm": 0.431640625,
"learning_rate": 1.7861998794824747e-05,
"loss": 0.0544,
"step": 744
},
{
"epoch": 0.25226979323188925,
"grad_norm": 0.458984375,
"learning_rate": 1.785507506191235e-05,
"loss": 0.063,
"step": 745
},
{
"epoch": 0.2526084104040126,
"grad_norm": 0.921875,
"learning_rate": 1.7848141482662726e-05,
"loss": 0.065,
"step": 746
},
{
"epoch": 0.25294702757613596,
"grad_norm": 0.515625,
"learning_rate": 1.7841198065767107e-05,
"loss": 0.0687,
"step": 747
},
{
"epoch": 0.2532856447482593,
"grad_norm": 0.5,
"learning_rate": 1.783424481992907e-05,
"loss": 0.0679,
"step": 748
},
{
"epoch": 0.2536242619203826,
"grad_norm": 0.4921875,
"learning_rate": 1.782728175386451e-05,
"loss": 0.0764,
"step": 749
},
{
"epoch": 0.253962879092506,
"grad_norm": 0.494140625,
"learning_rate": 1.7820308876301633e-05,
"loss": 0.0632,
"step": 750
},
{
"epoch": 0.25430149626462933,
"grad_norm": 0.52734375,
"learning_rate": 1.781332619598094e-05,
"loss": 0.0694,
"step": 751
},
{
"epoch": 0.25464011343675264,
"grad_norm": 0.546875,
"learning_rate": 1.780633372165522e-05,
"loss": 0.0661,
"step": 752
},
{
"epoch": 0.254978730608876,
"grad_norm": 0.41796875,
"learning_rate": 1.7799331462089543e-05,
"loss": 0.0546,
"step": 753
},
{
"epoch": 0.25531734778099935,
"grad_norm": 0.44140625,
"learning_rate": 1.7792319426061236e-05,
"loss": 0.0567,
"step": 754
},
{
"epoch": 0.2556559649531227,
"grad_norm": 0.58984375,
"learning_rate": 1.7785297622359893e-05,
"loss": 0.0569,
"step": 755
},
{
"epoch": 0.255994582125246,
"grad_norm": 0.5859375,
"learning_rate": 1.7778266059787345e-05,
"loss": 0.0831,
"step": 756
},
{
"epoch": 0.25633319929736936,
"grad_norm": 0.7734375,
"learning_rate": 1.7771224747157655e-05,
"loss": 0.0997,
"step": 757
},
{
"epoch": 0.2566718164694927,
"grad_norm": 6.3125,
"learning_rate": 1.7764173693297106e-05,
"loss": 0.0747,
"step": 758
},
{
"epoch": 0.2570104336416161,
"grad_norm": 0.5625,
"learning_rate": 1.77571129070442e-05,
"loss": 0.0697,
"step": 759
},
{
"epoch": 0.2573490508137394,
"grad_norm": 0.57421875,
"learning_rate": 1.775004239724963e-05,
"loss": 0.0685,
"step": 760
},
{
"epoch": 0.25768766798586273,
"grad_norm": 0.46875,
"learning_rate": 1.774296217277628e-05,
"loss": 0.0603,
"step": 761
},
{
"epoch": 0.2580262851579861,
"grad_norm": 0.474609375,
"learning_rate": 1.773587224249921e-05,
"loss": 0.0621,
"step": 762
},
{
"epoch": 0.2583649023301094,
"grad_norm": 0.5625,
"learning_rate": 1.7728772615305657e-05,
"loss": 0.061,
"step": 763
},
{
"epoch": 0.25870351950223275,
"grad_norm": 0.56640625,
"learning_rate": 1.7721663300094997e-05,
"loss": 0.0644,
"step": 764
},
{
"epoch": 0.2590421366743561,
"grad_norm": 0.447265625,
"learning_rate": 1.7714544305778757e-05,
"loss": 0.0581,
"step": 765
},
{
"epoch": 0.25938075384647946,
"grad_norm": 0.5546875,
"learning_rate": 1.7707415641280598e-05,
"loss": 0.0812,
"step": 766
},
{
"epoch": 0.25971937101860276,
"grad_norm": 0.5,
"learning_rate": 1.7700277315536305e-05,
"loss": 0.0754,
"step": 767
},
{
"epoch": 0.2600579881907261,
"grad_norm": 0.494140625,
"learning_rate": 1.7693129337493764e-05,
"loss": 0.059,
"step": 768
},
{
"epoch": 0.2603966053628495,
"grad_norm": 0.6953125,
"learning_rate": 1.768597171611297e-05,
"loss": 0.0896,
"step": 769
},
{
"epoch": 0.26073522253497283,
"grad_norm": 0.65234375,
"learning_rate": 1.7678804460366e-05,
"loss": 0.0651,
"step": 770
},
{
"epoch": 0.26107383970709613,
"grad_norm": 0.953125,
"learning_rate": 1.7671627579237016e-05,
"loss": 0.0634,
"step": 771
},
{
"epoch": 0.2614124568792195,
"grad_norm": 0.5234375,
"learning_rate": 1.766444108172223e-05,
"loss": 0.079,
"step": 772
},
{
"epoch": 0.26175107405134285,
"grad_norm": 0.55859375,
"learning_rate": 1.765724497682992e-05,
"loss": 0.0723,
"step": 773
},
{
"epoch": 0.26208969122346615,
"grad_norm": 0.75,
"learning_rate": 1.7650039273580406e-05,
"loss": 0.0871,
"step": 774
},
{
"epoch": 0.2624283083955895,
"grad_norm": 0.45703125,
"learning_rate": 1.7642823981006037e-05,
"loss": 0.065,
"step": 775
},
{
"epoch": 0.26276692556771286,
"grad_norm": 0.5078125,
"learning_rate": 1.763559910815118e-05,
"loss": 0.072,
"step": 776
},
{
"epoch": 0.2631055427398362,
"grad_norm": 0.7109375,
"learning_rate": 1.7628364664072218e-05,
"loss": 0.0845,
"step": 777
},
{
"epoch": 0.2634441599119595,
"grad_norm": 0.5625,
"learning_rate": 1.7621120657837528e-05,
"loss": 0.0778,
"step": 778
},
{
"epoch": 0.2637827770840829,
"grad_norm": 0.48046875,
"learning_rate": 1.761386709852747e-05,
"loss": 0.0744,
"step": 779
},
{
"epoch": 0.26412139425620623,
"grad_norm": 0.6640625,
"learning_rate": 1.760660399523438e-05,
"loss": 0.0994,
"step": 780
},
{
"epoch": 0.26446001142832953,
"grad_norm": 0.54296875,
"learning_rate": 1.759933135706256e-05,
"loss": 0.0787,
"step": 781
},
{
"epoch": 0.2647986286004529,
"grad_norm": 0.451171875,
"learning_rate": 1.759204919312826e-05,
"loss": 0.0614,
"step": 782
},
{
"epoch": 0.26513724577257625,
"grad_norm": 0.58203125,
"learning_rate": 1.7584757512559674e-05,
"loss": 0.0776,
"step": 783
},
{
"epoch": 0.2654758629446996,
"grad_norm": 0.50390625,
"learning_rate": 1.757745632449693e-05,
"loss": 0.0646,
"step": 784
},
{
"epoch": 0.2658144801168229,
"grad_norm": 0.6796875,
"learning_rate": 1.757014563809206e-05,
"loss": 0.0648,
"step": 785
},
{
"epoch": 0.26615309728894626,
"grad_norm": 0.447265625,
"learning_rate": 1.7562825462509018e-05,
"loss": 0.0566,
"step": 786
},
{
"epoch": 0.2664917144610696,
"grad_norm": 0.62109375,
"learning_rate": 1.7555495806923635e-05,
"loss": 0.0736,
"step": 787
},
{
"epoch": 0.266830331633193,
"grad_norm": 0.56640625,
"learning_rate": 1.754815668052364e-05,
"loss": 0.0691,
"step": 788
},
{
"epoch": 0.2671689488053163,
"grad_norm": 0.546875,
"learning_rate": 1.754080809250863e-05,
"loss": 0.069,
"step": 789
},
{
"epoch": 0.26750756597743963,
"grad_norm": 0.55859375,
"learning_rate": 1.753345005209006e-05,
"loss": 0.0768,
"step": 790
},
{
"epoch": 0.267846183149563,
"grad_norm": 0.64453125,
"learning_rate": 1.7526082568491233e-05,
"loss": 0.0748,
"step": 791
},
{
"epoch": 0.2681848003216863,
"grad_norm": 0.5625,
"learning_rate": 1.7518705650947292e-05,
"loss": 0.07,
"step": 792
},
{
"epoch": 0.26852341749380965,
"grad_norm": 0.59765625,
"learning_rate": 1.7511319308705198e-05,
"loss": 0.075,
"step": 793
},
{
"epoch": 0.268862034665933,
"grad_norm": 0.54296875,
"learning_rate": 1.750392355102374e-05,
"loss": 0.0648,
"step": 794
},
{
"epoch": 0.26920065183805636,
"grad_norm": 0.55859375,
"learning_rate": 1.74965183871735e-05,
"loss": 0.0848,
"step": 795
},
{
"epoch": 0.26953926901017966,
"grad_norm": 0.48828125,
"learning_rate": 1.7489103826436843e-05,
"loss": 0.067,
"step": 796
},
{
"epoch": 0.269877886182303,
"grad_norm": 0.431640625,
"learning_rate": 1.7481679878107928e-05,
"loss": 0.0608,
"step": 797
},
{
"epoch": 0.2702165033544264,
"grad_norm": 0.55078125,
"learning_rate": 1.7474246551492674e-05,
"loss": 0.0584,
"step": 798
},
{
"epoch": 0.2705551205265497,
"grad_norm": 0.54296875,
"learning_rate": 1.7466803855908753e-05,
"loss": 0.0558,
"step": 799
},
{
"epoch": 0.27089373769867303,
"grad_norm": 0.423828125,
"learning_rate": 1.745935180068559e-05,
"loss": 0.057,
"step": 800
},
{
"epoch": 0.2712323548707964,
"grad_norm": 0.69140625,
"learning_rate": 1.745189039516434e-05,
"loss": 0.0831,
"step": 801
},
{
"epoch": 0.27157097204291974,
"grad_norm": 0.671875,
"learning_rate": 1.7444419648697866e-05,
"loss": 0.0964,
"step": 802
},
{
"epoch": 0.27190958921504305,
"grad_norm": 0.53125,
"learning_rate": 1.7436939570650754e-05,
"loss": 0.0753,
"step": 803
},
{
"epoch": 0.2722482063871664,
"grad_norm": 0.5546875,
"learning_rate": 1.7429450170399278e-05,
"loss": 0.0524,
"step": 804
},
{
"epoch": 0.27258682355928976,
"grad_norm": 0.4921875,
"learning_rate": 1.742195145733141e-05,
"loss": 0.0638,
"step": 805
},
{
"epoch": 0.2729254407314131,
"grad_norm": 0.62109375,
"learning_rate": 1.741444344084678e-05,
"loss": 0.0813,
"step": 806
},
{
"epoch": 0.2732640579035364,
"grad_norm": 0.94921875,
"learning_rate": 1.7406926130356692e-05,
"loss": 0.0662,
"step": 807
},
{
"epoch": 0.2736026750756598,
"grad_norm": 0.53515625,
"learning_rate": 1.7399399535284093e-05,
"loss": 0.0566,
"step": 808
},
{
"epoch": 0.27394129224778313,
"grad_norm": 0.5546875,
"learning_rate": 1.7391863665063572e-05,
"loss": 0.0858,
"step": 809
},
{
"epoch": 0.27427990941990643,
"grad_norm": 0.734375,
"learning_rate": 1.738431852914134e-05,
"loss": 0.0827,
"step": 810
},
{
"epoch": 0.2746185265920298,
"grad_norm": 0.478515625,
"learning_rate": 1.737676413697523e-05,
"loss": 0.0644,
"step": 811
},
{
"epoch": 0.27495714376415314,
"grad_norm": 0.640625,
"learning_rate": 1.736920049803467e-05,
"loss": 0.0772,
"step": 812
},
{
"epoch": 0.2752957609362765,
"grad_norm": 0.44921875,
"learning_rate": 1.7361627621800683e-05,
"loss": 0.0561,
"step": 813
},
{
"epoch": 0.2756343781083998,
"grad_norm": 0.734375,
"learning_rate": 1.735404551776587e-05,
"loss": 0.0853,
"step": 814
},
{
"epoch": 0.27597299528052316,
"grad_norm": 0.486328125,
"learning_rate": 1.73464541954344e-05,
"loss": 0.0618,
"step": 815
},
{
"epoch": 0.2763116124526465,
"grad_norm": 0.69921875,
"learning_rate": 1.7338853664321993e-05,
"loss": 0.079,
"step": 816
},
{
"epoch": 0.27665022962476987,
"grad_norm": 0.62890625,
"learning_rate": 1.7331243933955918e-05,
"loss": 0.0579,
"step": 817
},
{
"epoch": 0.2769888467968932,
"grad_norm": 0.53125,
"learning_rate": 1.7323625013874972e-05,
"loss": 0.0667,
"step": 818
},
{
"epoch": 0.27732746396901653,
"grad_norm": 0.5390625,
"learning_rate": 1.731599691362947e-05,
"loss": 0.0661,
"step": 819
},
{
"epoch": 0.2776660811411399,
"grad_norm": 0.68359375,
"learning_rate": 1.730835964278124e-05,
"loss": 0.117,
"step": 820
},
{
"epoch": 0.2780046983132632,
"grad_norm": 0.5234375,
"learning_rate": 1.7300713210903605e-05,
"loss": 0.0619,
"step": 821
},
{
"epoch": 0.27834331548538654,
"grad_norm": 0.466796875,
"learning_rate": 1.7293057627581355e-05,
"loss": 0.0645,
"step": 822
},
{
"epoch": 0.2786819326575099,
"grad_norm": 0.48828125,
"learning_rate": 1.7285392902410776e-05,
"loss": 0.0636,
"step": 823
},
{
"epoch": 0.27902054982963326,
"grad_norm": 0.47265625,
"learning_rate": 1.7277719044999595e-05,
"loss": 0.0543,
"step": 824
},
{
"epoch": 0.27935916700175656,
"grad_norm": 0.423828125,
"learning_rate": 1.7270036064967e-05,
"loss": 0.06,
"step": 825
},
{
"epoch": 0.2796977841738799,
"grad_norm": 0.455078125,
"learning_rate": 1.7262343971943602e-05,
"loss": 0.0598,
"step": 826
},
{
"epoch": 0.28003640134600327,
"grad_norm": 0.486328125,
"learning_rate": 1.725464277557144e-05,
"loss": 0.0667,
"step": 827
},
{
"epoch": 0.2803750185181266,
"grad_norm": 0.439453125,
"learning_rate": 1.7246932485503964e-05,
"loss": 0.0554,
"step": 828
},
{
"epoch": 0.28071363569024993,
"grad_norm": 0.4609375,
"learning_rate": 1.7239213111406027e-05,
"loss": 0.0648,
"step": 829
},
{
"epoch": 0.2810522528623733,
"grad_norm": 0.55078125,
"learning_rate": 1.7231484662953862e-05,
"loss": 0.063,
"step": 830
},
{
"epoch": 0.28139087003449664,
"grad_norm": 0.63671875,
"learning_rate": 1.7223747149835078e-05,
"loss": 0.0752,
"step": 831
},
{
"epoch": 0.28172948720661994,
"grad_norm": 0.52734375,
"learning_rate": 1.7216000581748655e-05,
"loss": 0.0745,
"step": 832
},
{
"epoch": 0.2820681043787433,
"grad_norm": 0.59765625,
"learning_rate": 1.7208244968404904e-05,
"loss": 0.0526,
"step": 833
},
{
"epoch": 0.28240672155086666,
"grad_norm": 0.47265625,
"learning_rate": 1.7200480319525505e-05,
"loss": 0.0644,
"step": 834
},
{
"epoch": 0.28274533872299,
"grad_norm": 0.55859375,
"learning_rate": 1.719270664484343e-05,
"loss": 0.0788,
"step": 835
},
{
"epoch": 0.2830839558951133,
"grad_norm": 0.66796875,
"learning_rate": 1.7184923954102992e-05,
"loss": 0.0718,
"step": 836
},
{
"epoch": 0.28342257306723667,
"grad_norm": 0.490234375,
"learning_rate": 1.7177132257059788e-05,
"loss": 0.0729,
"step": 837
},
{
"epoch": 0.28376119023936003,
"grad_norm": 0.546875,
"learning_rate": 1.7169331563480713e-05,
"loss": 0.0568,
"step": 838
},
{
"epoch": 0.28409980741148333,
"grad_norm": 0.59375,
"learning_rate": 1.7161521883143936e-05,
"loss": 0.0644,
"step": 839
},
{
"epoch": 0.2844384245836067,
"grad_norm": 0.6484375,
"learning_rate": 1.7153703225838892e-05,
"loss": 0.0567,
"step": 840
},
{
"epoch": 0.28477704175573004,
"grad_norm": 0.62109375,
"learning_rate": 1.714587560136627e-05,
"loss": 0.0855,
"step": 841
},
{
"epoch": 0.2851156589278534,
"grad_norm": 0.515625,
"learning_rate": 1.7138039019538e-05,
"loss": 0.0765,
"step": 842
},
{
"epoch": 0.2854542760999767,
"grad_norm": 0.6484375,
"learning_rate": 1.713019349017723e-05,
"loss": 0.0771,
"step": 843
},
{
"epoch": 0.28579289327210006,
"grad_norm": 0.52734375,
"learning_rate": 1.7122339023118338e-05,
"loss": 0.0754,
"step": 844
},
{
"epoch": 0.2861315104442234,
"grad_norm": 0.40234375,
"learning_rate": 1.7114475628206897e-05,
"loss": 0.0429,
"step": 845
},
{
"epoch": 0.28647012761634677,
"grad_norm": 0.5390625,
"learning_rate": 1.7106603315299674e-05,
"loss": 0.0673,
"step": 846
},
{
"epoch": 0.28680874478847007,
"grad_norm": 0.640625,
"learning_rate": 1.7098722094264616e-05,
"loss": 0.0862,
"step": 847
},
{
"epoch": 0.28714736196059343,
"grad_norm": 0.51953125,
"learning_rate": 1.7090831974980832e-05,
"loss": 0.0655,
"step": 848
},
{
"epoch": 0.2874859791327168,
"grad_norm": 0.53515625,
"learning_rate": 1.7082932967338588e-05,
"loss": 0.0658,
"step": 849
},
{
"epoch": 0.2878245963048401,
"grad_norm": 0.69921875,
"learning_rate": 1.7075025081239286e-05,
"loss": 0.0895,
"step": 850
},
{
"epoch": 0.28816321347696344,
"grad_norm": 0.53515625,
"learning_rate": 1.706710832659547e-05,
"loss": 0.0735,
"step": 851
},
{
"epoch": 0.2885018306490868,
"grad_norm": 0.50390625,
"learning_rate": 1.7059182713330787e-05,
"loss": 0.0594,
"step": 852
},
{
"epoch": 0.28884044782121016,
"grad_norm": 0.4375,
"learning_rate": 1.7051248251379997e-05,
"loss": 0.0557,
"step": 853
},
{
"epoch": 0.28917906499333346,
"grad_norm": 0.50390625,
"learning_rate": 1.7043304950688947e-05,
"loss": 0.065,
"step": 854
},
{
"epoch": 0.2895176821654568,
"grad_norm": 0.60546875,
"learning_rate": 1.703535282121456e-05,
"loss": 0.0784,
"step": 855
},
{
"epoch": 0.28985629933758017,
"grad_norm": 0.4453125,
"learning_rate": 1.702739187292484e-05,
"loss": 0.0567,
"step": 856
},
{
"epoch": 0.29019491650970347,
"grad_norm": 0.9765625,
"learning_rate": 1.7019422115798835e-05,
"loss": 0.073,
"step": 857
},
{
"epoch": 0.2905335336818268,
"grad_norm": 0.55078125,
"learning_rate": 1.7011443559826632e-05,
"loss": 0.079,
"step": 858
},
{
"epoch": 0.2908721508539502,
"grad_norm": 0.46484375,
"learning_rate": 1.700345621500935e-05,
"loss": 0.0667,
"step": 859
},
{
"epoch": 0.29121076802607354,
"grad_norm": 0.640625,
"learning_rate": 1.699546009135913e-05,
"loss": 0.1011,
"step": 860
},
{
"epoch": 0.29154938519819684,
"grad_norm": 0.46875,
"learning_rate": 1.6987455198899118e-05,
"loss": 0.0662,
"step": 861
},
{
"epoch": 0.2918880023703202,
"grad_norm": 0.53515625,
"learning_rate": 1.6979441547663434e-05,
"loss": 0.0675,
"step": 862
},
{
"epoch": 0.29222661954244356,
"grad_norm": 0.5078125,
"learning_rate": 1.6971419147697206e-05,
"loss": 0.0607,
"step": 863
},
{
"epoch": 0.2925652367145669,
"grad_norm": 0.578125,
"learning_rate": 1.6963388009056505e-05,
"loss": 0.075,
"step": 864
},
{
"epoch": 0.2929038538866902,
"grad_norm": 0.431640625,
"learning_rate": 1.6955348141808367e-05,
"loss": 0.0512,
"step": 865
},
{
"epoch": 0.29324247105881357,
"grad_norm": 0.53125,
"learning_rate": 1.694729955603076e-05,
"loss": 0.0719,
"step": 866
},
{
"epoch": 0.2935810882309369,
"grad_norm": 3.921875,
"learning_rate": 1.6939242261812592e-05,
"loss": 0.0996,
"step": 867
},
{
"epoch": 0.2939197054030602,
"grad_norm": 0.515625,
"learning_rate": 1.693117626925368e-05,
"loss": 0.066,
"step": 868
},
{
"epoch": 0.2942583225751836,
"grad_norm": 1.0703125,
"learning_rate": 1.6923101588464753e-05,
"loss": 0.0758,
"step": 869
},
{
"epoch": 0.29459693974730694,
"grad_norm": 0.578125,
"learning_rate": 1.6915018229567412e-05,
"loss": 0.0675,
"step": 870
},
{
"epoch": 0.2949355569194303,
"grad_norm": 1.0546875,
"learning_rate": 1.6906926202694158e-05,
"loss": 0.2418,
"step": 871
},
{
"epoch": 0.2952741740915536,
"grad_norm": 0.71875,
"learning_rate": 1.6898825517988342e-05,
"loss": 0.1013,
"step": 872
},
{
"epoch": 0.29561279126367696,
"grad_norm": 0.40625,
"learning_rate": 1.6890716185604178e-05,
"loss": 0.053,
"step": 873
},
{
"epoch": 0.2959514084358003,
"grad_norm": 0.365234375,
"learning_rate": 1.688259821570671e-05,
"loss": 0.0407,
"step": 874
},
{
"epoch": 0.29629002560792367,
"grad_norm": 0.74609375,
"learning_rate": 1.6874471618471813e-05,
"loss": 0.1096,
"step": 875
},
{
"epoch": 0.29662864278004697,
"grad_norm": 0.42578125,
"learning_rate": 1.6866336404086185e-05,
"loss": 0.0607,
"step": 876
},
{
"epoch": 0.2969672599521703,
"grad_norm": 0.5078125,
"learning_rate": 1.6858192582747306e-05,
"loss": 0.0732,
"step": 877
},
{
"epoch": 0.2973058771242937,
"grad_norm": 0.54296875,
"learning_rate": 1.685004016466347e-05,
"loss": 0.0591,
"step": 878
},
{
"epoch": 0.297644494296417,
"grad_norm": 0.455078125,
"learning_rate": 1.6841879160053724e-05,
"loss": 0.0513,
"step": 879
},
{
"epoch": 0.29798311146854034,
"grad_norm": 0.427734375,
"learning_rate": 1.683370957914789e-05,
"loss": 0.056,
"step": 880
},
{
"epoch": 0.2983217286406637,
"grad_norm": 0.494140625,
"learning_rate": 1.6825531432186545e-05,
"loss": 0.0679,
"step": 881
},
{
"epoch": 0.29866034581278705,
"grad_norm": 0.90625,
"learning_rate": 1.6817344729420985e-05,
"loss": 0.0666,
"step": 882
},
{
"epoch": 0.29899896298491035,
"grad_norm": 0.52734375,
"learning_rate": 1.6809149481113252e-05,
"loss": 0.0664,
"step": 883
},
{
"epoch": 0.2993375801570337,
"grad_norm": 0.625,
"learning_rate": 1.6800945697536088e-05,
"loss": 0.081,
"step": 884
},
{
"epoch": 0.29967619732915707,
"grad_norm": 0.482421875,
"learning_rate": 1.679273338897293e-05,
"loss": 0.0667,
"step": 885
},
{
"epoch": 0.30001481450128037,
"grad_norm": 0.5078125,
"learning_rate": 1.678451256571792e-05,
"loss": 0.0587,
"step": 886
},
{
"epoch": 0.3003534316734037,
"grad_norm": 0.55078125,
"learning_rate": 1.6776283238075853e-05,
"loss": 0.0736,
"step": 887
},
{
"epoch": 0.3006920488455271,
"grad_norm": 0.66015625,
"learning_rate": 1.6768045416362192e-05,
"loss": 0.0947,
"step": 888
},
{
"epoch": 0.3006920488455271,
"eval_loss": 0.07113409042358398,
"eval_runtime": 815.5638,
"eval_samples_per_second": 12.198,
"eval_steps_per_second": 3.049,
"step": 888
},
{
"epoch": 0.30103066601765044,
"grad_norm": 0.46875,
"learning_rate": 1.6759799110903046e-05,
"loss": 0.0615,
"step": 889
},
{
"epoch": 0.30136928318977374,
"grad_norm": 0.52734375,
"learning_rate": 1.6751544332035164e-05,
"loss": 0.0656,
"step": 890
},
{
"epoch": 0.3017079003618971,
"grad_norm": 0.57421875,
"learning_rate": 1.674328109010591e-05,
"loss": 0.0781,
"step": 891
},
{
"epoch": 0.30204651753402045,
"grad_norm": 0.54296875,
"learning_rate": 1.6735009395473252e-05,
"loss": 0.0776,
"step": 892
},
{
"epoch": 0.3023851347061438,
"grad_norm": 0.5625,
"learning_rate": 1.672672925850577e-05,
"loss": 0.079,
"step": 893
},
{
"epoch": 0.3027237518782671,
"grad_norm": 0.416015625,
"learning_rate": 1.6718440689582613e-05,
"loss": 0.0536,
"step": 894
},
{
"epoch": 0.30306236905039047,
"grad_norm": 0.58203125,
"learning_rate": 1.67101436990935e-05,
"loss": 0.0796,
"step": 895
},
{
"epoch": 0.3034009862225138,
"grad_norm": 0.474609375,
"learning_rate": 1.6701838297438713e-05,
"loss": 0.0662,
"step": 896
},
{
"epoch": 0.3037396033946371,
"grad_norm": 0.486328125,
"learning_rate": 1.669352449502907e-05,
"loss": 0.0663,
"step": 897
},
{
"epoch": 0.3040782205667605,
"grad_norm": 0.474609375,
"learning_rate": 1.6685202302285926e-05,
"loss": 0.0546,
"step": 898
},
{
"epoch": 0.30441683773888384,
"grad_norm": 0.486328125,
"learning_rate": 1.667687172964115e-05,
"loss": 0.0613,
"step": 899
},
{
"epoch": 0.3047554549110072,
"grad_norm": 0.55859375,
"learning_rate": 1.6668532787537115e-05,
"loss": 0.077,
"step": 900
},
{
"epoch": 0.3050940720831305,
"grad_norm": 0.478515625,
"learning_rate": 1.6660185486426684e-05,
"loss": 0.0601,
"step": 901
},
{
"epoch": 0.30543268925525385,
"grad_norm": 0.46875,
"learning_rate": 1.66518298367732e-05,
"loss": 0.0648,
"step": 902
},
{
"epoch": 0.3057713064273772,
"grad_norm": 0.42578125,
"learning_rate": 1.6643465849050473e-05,
"loss": 0.0603,
"step": 903
},
{
"epoch": 0.30610992359950057,
"grad_norm": 0.5859375,
"learning_rate": 1.6635093533742762e-05,
"loss": 0.0758,
"step": 904
},
{
"epoch": 0.30644854077162387,
"grad_norm": 0.5,
"learning_rate": 1.662671290134476e-05,
"loss": 0.0569,
"step": 905
},
{
"epoch": 0.3067871579437472,
"grad_norm": 0.54296875,
"learning_rate": 1.6618323962361595e-05,
"loss": 0.0667,
"step": 906
},
{
"epoch": 0.3071257751158706,
"grad_norm": 0.5078125,
"learning_rate": 1.6609926727308804e-05,
"loss": 0.0652,
"step": 907
},
{
"epoch": 0.3074643922879939,
"grad_norm": 0.59765625,
"learning_rate": 1.660152120671232e-05,
"loss": 0.0823,
"step": 908
},
{
"epoch": 0.30780300946011724,
"grad_norm": 0.72265625,
"learning_rate": 1.6593107411108462e-05,
"loss": 0.0695,
"step": 909
},
{
"epoch": 0.3081416266322406,
"grad_norm": 0.61328125,
"learning_rate": 1.6584685351043924e-05,
"loss": 0.0791,
"step": 910
},
{
"epoch": 0.30848024380436395,
"grad_norm": 0.54296875,
"learning_rate": 1.657625503707576e-05,
"loss": 0.0716,
"step": 911
},
{
"epoch": 0.30881886097648725,
"grad_norm": 0.55859375,
"learning_rate": 1.6567816479771372e-05,
"loss": 0.0772,
"step": 912
},
{
"epoch": 0.3091574781486106,
"grad_norm": 0.55859375,
"learning_rate": 1.655936968970848e-05,
"loss": 0.0743,
"step": 913
},
{
"epoch": 0.30949609532073397,
"grad_norm": 0.61328125,
"learning_rate": 1.6550914677475155e-05,
"loss": 0.0842,
"step": 914
},
{
"epoch": 0.30983471249285727,
"grad_norm": 0.57421875,
"learning_rate": 1.654245145366974e-05,
"loss": 0.0685,
"step": 915
},
{
"epoch": 0.3101733296649806,
"grad_norm": 0.63671875,
"learning_rate": 1.6533980028900896e-05,
"loss": 0.0904,
"step": 916
},
{
"epoch": 0.310511946837104,
"grad_norm": 0.671875,
"learning_rate": 1.6525500413787554e-05,
"loss": 0.0768,
"step": 917
},
{
"epoch": 0.31085056400922734,
"grad_norm": 0.58203125,
"learning_rate": 1.6517012618958905e-05,
"loss": 0.0794,
"step": 918
},
{
"epoch": 0.31118918118135064,
"grad_norm": 0.58203125,
"learning_rate": 1.6508516655054404e-05,
"loss": 0.0797,
"step": 919
},
{
"epoch": 0.311527798353474,
"grad_norm": 0.55859375,
"learning_rate": 1.6500012532723748e-05,
"loss": 0.0678,
"step": 920
},
{
"epoch": 0.31186641552559735,
"grad_norm": 0.51953125,
"learning_rate": 1.6491500262626847e-05,
"loss": 0.0588,
"step": 921
},
{
"epoch": 0.3122050326977207,
"grad_norm": 0.45703125,
"learning_rate": 1.6482979855433837e-05,
"loss": 0.0647,
"step": 922
},
{
"epoch": 0.312543649869844,
"grad_norm": 0.609375,
"learning_rate": 1.6474451321825048e-05,
"loss": 0.0774,
"step": 923
},
{
"epoch": 0.31288226704196737,
"grad_norm": 0.478515625,
"learning_rate": 1.6465914672491e-05,
"loss": 0.062,
"step": 924
},
{
"epoch": 0.3132208842140907,
"grad_norm": 0.75,
"learning_rate": 1.6457369918132376e-05,
"loss": 0.0995,
"step": 925
},
{
"epoch": 0.313559501386214,
"grad_norm": 0.56640625,
"learning_rate": 1.6448817069460033e-05,
"loss": 0.0756,
"step": 926
},
{
"epoch": 0.3138981185583374,
"grad_norm": 0.5234375,
"learning_rate": 1.6440256137194965e-05,
"loss": 0.0769,
"step": 927
},
{
"epoch": 0.31423673573046074,
"grad_norm": 0.56640625,
"learning_rate": 1.6431687132068305e-05,
"loss": 0.0754,
"step": 928
},
{
"epoch": 0.3145753529025841,
"grad_norm": 0.5625,
"learning_rate": 1.6423110064821296e-05,
"loss": 0.0838,
"step": 929
},
{
"epoch": 0.3149139700747074,
"grad_norm": 0.5703125,
"learning_rate": 1.64145249462053e-05,
"loss": 0.0665,
"step": 930
},
{
"epoch": 0.31525258724683075,
"grad_norm": 0.640625,
"learning_rate": 1.6405931786981753e-05,
"loss": 0.0765,
"step": 931
},
{
"epoch": 0.3155912044189541,
"grad_norm": 0.5234375,
"learning_rate": 1.639733059792219e-05,
"loss": 0.0623,
"step": 932
},
{
"epoch": 0.31592982159107746,
"grad_norm": 0.6953125,
"learning_rate": 1.63887213898082e-05,
"loss": 0.0968,
"step": 933
},
{
"epoch": 0.31626843876320077,
"grad_norm": 0.439453125,
"learning_rate": 1.6380104173431423e-05,
"loss": 0.0638,
"step": 934
},
{
"epoch": 0.3166070559353241,
"grad_norm": 0.51953125,
"learning_rate": 1.6371478959593543e-05,
"loss": 0.0689,
"step": 935
},
{
"epoch": 0.3169456731074475,
"grad_norm": 0.63671875,
"learning_rate": 1.6362845759106267e-05,
"loss": 0.0748,
"step": 936
},
{
"epoch": 0.3172842902795708,
"grad_norm": 0.58984375,
"learning_rate": 1.635420458279131e-05,
"loss": 0.0638,
"step": 937
},
{
"epoch": 0.31762290745169414,
"grad_norm": 0.5078125,
"learning_rate": 1.634555544148039e-05,
"loss": 0.0695,
"step": 938
},
{
"epoch": 0.3179615246238175,
"grad_norm": 0.4609375,
"learning_rate": 1.6336898346015202e-05,
"loss": 0.0657,
"step": 939
},
{
"epoch": 0.31830014179594085,
"grad_norm": 0.54296875,
"learning_rate": 1.6328233307247426e-05,
"loss": 0.0808,
"step": 940
},
{
"epoch": 0.31863875896806415,
"grad_norm": 0.51953125,
"learning_rate": 1.6319560336038678e-05,
"loss": 0.0642,
"step": 941
},
{
"epoch": 0.3189773761401875,
"grad_norm": 0.4609375,
"learning_rate": 1.631087944326053e-05,
"loss": 0.0618,
"step": 942
},
{
"epoch": 0.31931599331231086,
"grad_norm": 0.46875,
"learning_rate": 1.6302190639794486e-05,
"loss": 0.0638,
"step": 943
},
{
"epoch": 0.31965461048443417,
"grad_norm": 0.49609375,
"learning_rate": 1.6293493936531956e-05,
"loss": 0.0731,
"step": 944
},
{
"epoch": 0.3199932276565575,
"grad_norm": 0.55078125,
"learning_rate": 1.6284789344374266e-05,
"loss": 0.0728,
"step": 945
},
{
"epoch": 0.3203318448286809,
"grad_norm": 0.52734375,
"learning_rate": 1.6276076874232614e-05,
"loss": 0.0491,
"step": 946
},
{
"epoch": 0.32067046200080424,
"grad_norm": 0.453125,
"learning_rate": 1.626735653702809e-05,
"loss": 0.059,
"step": 947
},
{
"epoch": 0.32100907917292754,
"grad_norm": 0.458984375,
"learning_rate": 1.6258628343691635e-05,
"loss": 0.0598,
"step": 948
},
{
"epoch": 0.3213476963450509,
"grad_norm": 0.47265625,
"learning_rate": 1.6249892305164036e-05,
"loss": 0.0661,
"step": 949
},
{
"epoch": 0.32168631351717425,
"grad_norm": 0.578125,
"learning_rate": 1.624114843239592e-05,
"loss": 0.0594,
"step": 950
},
{
"epoch": 0.3220249306892976,
"grad_norm": 0.46875,
"learning_rate": 1.6232396736347736e-05,
"loss": 0.0663,
"step": 951
},
{
"epoch": 0.3223635478614209,
"grad_norm": 0.53125,
"learning_rate": 1.6223637227989736e-05,
"loss": 0.059,
"step": 952
},
{
"epoch": 0.32270216503354426,
"grad_norm": 0.44921875,
"learning_rate": 1.621486991830196e-05,
"loss": 0.0622,
"step": 953
},
{
"epoch": 0.3230407822056676,
"grad_norm": 0.58984375,
"learning_rate": 1.6206094818274228e-05,
"loss": 0.0461,
"step": 954
},
{
"epoch": 0.3233793993777909,
"grad_norm": 0.4375,
"learning_rate": 1.619731193890614e-05,
"loss": 0.0483,
"step": 955
},
{
"epoch": 0.3237180165499143,
"grad_norm": 0.5390625,
"learning_rate": 1.6188521291207027e-05,
"loss": 0.0747,
"step": 956
},
{
"epoch": 0.32405663372203763,
"grad_norm": 0.57421875,
"learning_rate": 1.6179722886195967e-05,
"loss": 0.0733,
"step": 957
},
{
"epoch": 0.324395250894161,
"grad_norm": 0.5234375,
"learning_rate": 1.6170916734901765e-05,
"loss": 0.0702,
"step": 958
},
{
"epoch": 0.3247338680662843,
"grad_norm": 0.48828125,
"learning_rate": 1.6162102848362932e-05,
"loss": 0.0655,
"step": 959
},
{
"epoch": 0.32507248523840765,
"grad_norm": 0.49609375,
"learning_rate": 1.6153281237627675e-05,
"loss": 0.0657,
"step": 960
},
{
"epoch": 0.325411102410531,
"grad_norm": 0.546875,
"learning_rate": 1.6144451913753882e-05,
"loss": 0.0588,
"step": 961
},
{
"epoch": 0.32574971958265436,
"grad_norm": 0.51171875,
"learning_rate": 1.6135614887809113e-05,
"loss": 0.0687,
"step": 962
},
{
"epoch": 0.32608833675477766,
"grad_norm": 0.828125,
"learning_rate": 1.612677017087058e-05,
"loss": 0.0764,
"step": 963
},
{
"epoch": 0.326426953926901,
"grad_norm": 0.443359375,
"learning_rate": 1.6117917774025138e-05,
"loss": 0.0539,
"step": 964
},
{
"epoch": 0.3267655710990244,
"grad_norm": 0.6171875,
"learning_rate": 1.6109057708369263e-05,
"loss": 0.0853,
"step": 965
},
{
"epoch": 0.3271041882711477,
"grad_norm": 0.72265625,
"learning_rate": 1.6100189985009053e-05,
"loss": 0.0523,
"step": 966
},
{
"epoch": 0.32744280544327103,
"grad_norm": 0.5625,
"learning_rate": 1.6091314615060196e-05,
"loss": 0.0599,
"step": 967
},
{
"epoch": 0.3277814226153944,
"grad_norm": 0.4609375,
"learning_rate": 1.608243160964797e-05,
"loss": 0.0625,
"step": 968
},
{
"epoch": 0.32812003978751775,
"grad_norm": 0.42578125,
"learning_rate": 1.6073540979907227e-05,
"loss": 0.0561,
"step": 969
},
{
"epoch": 0.32845865695964105,
"grad_norm": 0.65625,
"learning_rate": 1.6064642736982368e-05,
"loss": 0.08,
"step": 970
},
{
"epoch": 0.3287972741317644,
"grad_norm": 0.51171875,
"learning_rate": 1.6055736892027342e-05,
"loss": 0.0691,
"step": 971
},
{
"epoch": 0.32913589130388776,
"grad_norm": 0.53515625,
"learning_rate": 1.6046823456205623e-05,
"loss": 0.0728,
"step": 972
},
{
"epoch": 0.32947450847601106,
"grad_norm": 0.482421875,
"learning_rate": 1.6037902440690212e-05,
"loss": 0.0604,
"step": 973
},
{
"epoch": 0.3298131256481344,
"grad_norm": 0.51171875,
"learning_rate": 1.6028973856663595e-05,
"loss": 0.0723,
"step": 974
},
{
"epoch": 0.3301517428202578,
"grad_norm": 0.486328125,
"learning_rate": 1.6020037715317756e-05,
"loss": 0.0836,
"step": 975
},
{
"epoch": 0.33049035999238113,
"grad_norm": 0.54296875,
"learning_rate": 1.6011094027854147e-05,
"loss": 0.0603,
"step": 976
},
{
"epoch": 0.33082897716450443,
"grad_norm": 0.4921875,
"learning_rate": 1.6002142805483686e-05,
"loss": 0.0717,
"step": 977
},
{
"epoch": 0.3311675943366278,
"grad_norm": 0.7421875,
"learning_rate": 1.5993184059426725e-05,
"loss": 0.068,
"step": 978
},
{
"epoch": 0.33150621150875115,
"grad_norm": 0.52734375,
"learning_rate": 1.5984217800913052e-05,
"loss": 0.0738,
"step": 979
},
{
"epoch": 0.3318448286808745,
"grad_norm": 0.4765625,
"learning_rate": 1.5975244041181877e-05,
"loss": 0.0627,
"step": 980
},
{
"epoch": 0.3321834458529978,
"grad_norm": 0.4765625,
"learning_rate": 1.5966262791481812e-05,
"loss": 0.0633,
"step": 981
},
{
"epoch": 0.33252206302512116,
"grad_norm": 0.453125,
"learning_rate": 1.5957274063070845e-05,
"loss": 0.0576,
"step": 982
},
{
"epoch": 0.3328606801972445,
"grad_norm": 0.58203125,
"learning_rate": 1.5948277867216355e-05,
"loss": 0.0802,
"step": 983
},
{
"epoch": 0.3331992973693678,
"grad_norm": 0.4765625,
"learning_rate": 1.5939274215195074e-05,
"loss": 0.0643,
"step": 984
},
{
"epoch": 0.3335379145414912,
"grad_norm": 1.6796875,
"learning_rate": 1.5930263118293075e-05,
"loss": 0.0746,
"step": 985
},
{
"epoch": 0.33387653171361453,
"grad_norm": 0.61328125,
"learning_rate": 1.5921244587805774e-05,
"loss": 0.0764,
"step": 986
},
{
"epoch": 0.3342151488857379,
"grad_norm": 0.48046875,
"learning_rate": 1.5912218635037896e-05,
"loss": 0.0583,
"step": 987
},
{
"epoch": 0.3345537660578612,
"grad_norm": 0.6171875,
"learning_rate": 1.5903185271303477e-05,
"loss": 0.0803,
"step": 988
},
{
"epoch": 0.33489238322998455,
"grad_norm": 0.54296875,
"learning_rate": 1.5894144507925836e-05,
"loss": 0.077,
"step": 989
},
{
"epoch": 0.3352310004021079,
"grad_norm": 0.462890625,
"learning_rate": 1.5885096356237572e-05,
"loss": 0.0668,
"step": 990
},
{
"epoch": 0.33556961757423126,
"grad_norm": 0.55078125,
"learning_rate": 1.5876040827580545e-05,
"loss": 0.0635,
"step": 991
},
{
"epoch": 0.33590823474635456,
"grad_norm": 0.69921875,
"learning_rate": 1.586697793330586e-05,
"loss": 0.058,
"step": 992
},
{
"epoch": 0.3362468519184779,
"grad_norm": 0.515625,
"learning_rate": 1.5857907684773858e-05,
"loss": 0.0653,
"step": 993
},
{
"epoch": 0.3365854690906013,
"grad_norm": 0.5234375,
"learning_rate": 1.584883009335409e-05,
"loss": 0.0474,
"step": 994
},
{
"epoch": 0.3369240862627246,
"grad_norm": 0.490234375,
"learning_rate": 1.5839745170425326e-05,
"loss": 0.0607,
"step": 995
},
{
"epoch": 0.33726270343484793,
"grad_norm": 0.55078125,
"learning_rate": 1.5830652927375506e-05,
"loss": 0.0584,
"step": 996
},
{
"epoch": 0.3376013206069713,
"grad_norm": 0.53125,
"learning_rate": 1.582155337560177e-05,
"loss": 0.0642,
"step": 997
},
{
"epoch": 0.33793993777909465,
"grad_norm": 0.51953125,
"learning_rate": 1.58124465265104e-05,
"loss": 0.0747,
"step": 998
},
{
"epoch": 0.33827855495121795,
"grad_norm": 0.68359375,
"learning_rate": 1.5803332391516832e-05,
"loss": 0.0634,
"step": 999
},
{
"epoch": 0.3386171721233413,
"grad_norm": 0.51953125,
"learning_rate": 1.5794210982045638e-05,
"loss": 0.0548,
"step": 1000
},
{
"epoch": 0.33895578929546466,
"grad_norm": 0.50390625,
"learning_rate": 1.5785082309530504e-05,
"loss": 0.0659,
"step": 1001
},
{
"epoch": 0.33929440646758796,
"grad_norm": 0.6171875,
"learning_rate": 1.577594638541422e-05,
"loss": 0.0745,
"step": 1002
},
{
"epoch": 0.3396330236397113,
"grad_norm": 0.5390625,
"learning_rate": 1.5766803221148676e-05,
"loss": 0.0689,
"step": 1003
},
{
"epoch": 0.3399716408118347,
"grad_norm": 0.55859375,
"learning_rate": 1.5757652828194815e-05,
"loss": 0.0874,
"step": 1004
},
{
"epoch": 0.34031025798395803,
"grad_norm": 0.439453125,
"learning_rate": 1.5748495218022665e-05,
"loss": 0.0583,
"step": 1005
},
{
"epoch": 0.34064887515608133,
"grad_norm": 0.443359375,
"learning_rate": 1.573933040211129e-05,
"loss": 0.0639,
"step": 1006
},
{
"epoch": 0.3409874923282047,
"grad_norm": 0.435546875,
"learning_rate": 1.5730158391948785e-05,
"loss": 0.0566,
"step": 1007
},
{
"epoch": 0.34132610950032805,
"grad_norm": 0.66796875,
"learning_rate": 1.5720979199032268e-05,
"loss": 0.0807,
"step": 1008
},
{
"epoch": 0.3416647266724514,
"grad_norm": 0.419921875,
"learning_rate": 1.5711792834867856e-05,
"loss": 0.0633,
"step": 1009
},
{
"epoch": 0.3420033438445747,
"grad_norm": 0.5078125,
"learning_rate": 1.570259931097066e-05,
"loss": 0.0722,
"step": 1010
},
{
"epoch": 0.34234196101669806,
"grad_norm": 0.421875,
"learning_rate": 1.569339863886476e-05,
"loss": 0.0596,
"step": 1011
},
{
"epoch": 0.3426805781888214,
"grad_norm": 0.423828125,
"learning_rate": 1.56841908300832e-05,
"loss": 0.0618,
"step": 1012
},
{
"epoch": 0.3430191953609447,
"grad_norm": 0.458984375,
"learning_rate": 1.567497589616797e-05,
"loss": 0.0626,
"step": 1013
},
{
"epoch": 0.3433578125330681,
"grad_norm": 0.55078125,
"learning_rate": 1.5665753848669987e-05,
"loss": 0.0731,
"step": 1014
},
{
"epoch": 0.34369642970519143,
"grad_norm": 0.5390625,
"learning_rate": 1.5656524699149096e-05,
"loss": 0.0723,
"step": 1015
},
{
"epoch": 0.3440350468773148,
"grad_norm": 0.62109375,
"learning_rate": 1.5647288459174032e-05,
"loss": 0.0949,
"step": 1016
},
{
"epoch": 0.3443736640494381,
"grad_norm": 0.431640625,
"learning_rate": 1.563804514032242e-05,
"loss": 0.0491,
"step": 1017
},
{
"epoch": 0.34471228122156145,
"grad_norm": 0.68359375,
"learning_rate": 1.5628794754180764e-05,
"loss": 0.0886,
"step": 1018
},
{
"epoch": 0.3450508983936848,
"grad_norm": 0.6015625,
"learning_rate": 1.5619537312344422e-05,
"loss": 0.0659,
"step": 1019
},
{
"epoch": 0.34538951556580816,
"grad_norm": 0.455078125,
"learning_rate": 1.56102728264176e-05,
"loss": 0.0632,
"step": 1020
},
{
"epoch": 0.34572813273793146,
"grad_norm": 1.1953125,
"learning_rate": 1.560100130801333e-05,
"loss": 0.0618,
"step": 1021
},
{
"epoch": 0.3460667499100548,
"grad_norm": 0.6015625,
"learning_rate": 1.5591722768753464e-05,
"loss": 0.0721,
"step": 1022
},
{
"epoch": 0.3464053670821782,
"grad_norm": 0.408203125,
"learning_rate": 1.5582437220268648e-05,
"loss": 0.0518,
"step": 1023
},
{
"epoch": 0.3467439842543015,
"grad_norm": 0.48828125,
"learning_rate": 1.5573144674198323e-05,
"loss": 0.0599,
"step": 1024
},
{
"epoch": 0.34708260142642483,
"grad_norm": 0.453125,
"learning_rate": 1.5563845142190687e-05,
"loss": 0.0601,
"step": 1025
},
{
"epoch": 0.3474212185985482,
"grad_norm": 0.486328125,
"learning_rate": 1.555453863590272e-05,
"loss": 0.0596,
"step": 1026
},
{
"epoch": 0.34775983577067154,
"grad_norm": 0.41015625,
"learning_rate": 1.554522516700011e-05,
"loss": 0.0496,
"step": 1027
},
{
"epoch": 0.34809845294279484,
"grad_norm": 0.5703125,
"learning_rate": 1.5535904747157303e-05,
"loss": 0.0744,
"step": 1028
},
{
"epoch": 0.3484370701149182,
"grad_norm": 0.43359375,
"learning_rate": 1.5526577388057444e-05,
"loss": 0.0532,
"step": 1029
},
{
"epoch": 0.34877568728704156,
"grad_norm": 0.63671875,
"learning_rate": 1.5517243101392373e-05,
"loss": 0.08,
"step": 1030
},
{
"epoch": 0.34911430445916486,
"grad_norm": 0.4921875,
"learning_rate": 1.5507901898862623e-05,
"loss": 0.0623,
"step": 1031
},
{
"epoch": 0.3494529216312882,
"grad_norm": 0.484375,
"learning_rate": 1.5498553792177395e-05,
"loss": 0.0582,
"step": 1032
},
{
"epoch": 0.3497915388034116,
"grad_norm": 0.59765625,
"learning_rate": 1.5489198793054535e-05,
"loss": 0.0806,
"step": 1033
},
{
"epoch": 0.35013015597553493,
"grad_norm": 0.4765625,
"learning_rate": 1.5479836913220544e-05,
"loss": 0.0691,
"step": 1034
},
{
"epoch": 0.35046877314765823,
"grad_norm": 0.47265625,
"learning_rate": 1.547046816441053e-05,
"loss": 0.0724,
"step": 1035
},
{
"epoch": 0.3508073903197816,
"grad_norm": 0.470703125,
"learning_rate": 1.5461092558368223e-05,
"loss": 0.0604,
"step": 1036
},
{
"epoch": 0.35114600749190494,
"grad_norm": 0.625,
"learning_rate": 1.5451710106845953e-05,
"loss": 0.0606,
"step": 1037
},
{
"epoch": 0.3514846246640283,
"grad_norm": 0.52734375,
"learning_rate": 1.5442320821604616e-05,
"loss": 0.0774,
"step": 1038
},
{
"epoch": 0.3518232418361516,
"grad_norm": 0.486328125,
"learning_rate": 1.5432924714413685e-05,
"loss": 0.0642,
"step": 1039
},
{
"epoch": 0.35216185900827496,
"grad_norm": 0.74609375,
"learning_rate": 1.5423521797051176e-05,
"loss": 0.1061,
"step": 1040
},
{
"epoch": 0.3525004761803983,
"grad_norm": 0.447265625,
"learning_rate": 1.541411208130365e-05,
"loss": 0.0601,
"step": 1041
},
{
"epoch": 0.3528390933525216,
"grad_norm": 0.5859375,
"learning_rate": 1.540469557896619e-05,
"loss": 0.0713,
"step": 1042
},
{
"epoch": 0.35317771052464497,
"grad_norm": 0.51953125,
"learning_rate": 1.539527230184238e-05,
"loss": 0.0591,
"step": 1043
},
{
"epoch": 0.35351632769676833,
"grad_norm": 0.52734375,
"learning_rate": 1.5385842261744296e-05,
"loss": 0.0738,
"step": 1044
},
{
"epoch": 0.3538549448688917,
"grad_norm": 0.56640625,
"learning_rate": 1.5376405470492502e-05,
"loss": 0.0825,
"step": 1045
},
{
"epoch": 0.354193562041015,
"grad_norm": 0.39453125,
"learning_rate": 1.536696193991601e-05,
"loss": 0.0502,
"step": 1046
},
{
"epoch": 0.35453217921313834,
"grad_norm": 0.56640625,
"learning_rate": 1.535751168185228e-05,
"loss": 0.0713,
"step": 1047
},
{
"epoch": 0.3548707963852617,
"grad_norm": 0.53515625,
"learning_rate": 1.5348054708147225e-05,
"loss": 0.0786,
"step": 1048
},
{
"epoch": 0.35520941355738506,
"grad_norm": 0.5078125,
"learning_rate": 1.5338591030655154e-05,
"loss": 0.0689,
"step": 1049
},
{
"epoch": 0.35554803072950836,
"grad_norm": 0.4921875,
"learning_rate": 1.5329120661238788e-05,
"loss": 0.0645,
"step": 1050
},
{
"epoch": 0.3558866479016317,
"grad_norm": 0.5234375,
"learning_rate": 1.5319643611769237e-05,
"loss": 0.0825,
"step": 1051
},
{
"epoch": 0.35622526507375507,
"grad_norm": 0.458984375,
"learning_rate": 1.5310159894125986e-05,
"loss": 0.0608,
"step": 1052
},
{
"epoch": 0.35656388224587837,
"grad_norm": 0.515625,
"learning_rate": 1.530066952019687e-05,
"loss": 0.0677,
"step": 1053
},
{
"epoch": 0.35690249941800173,
"grad_norm": 0.5234375,
"learning_rate": 1.529117250187808e-05,
"loss": 0.078,
"step": 1054
},
{
"epoch": 0.3572411165901251,
"grad_norm": 0.4921875,
"learning_rate": 1.5281668851074123e-05,
"loss": 0.0656,
"step": 1055
},
{
"epoch": 0.35757973376224844,
"grad_norm": 0.4609375,
"learning_rate": 1.527215857969783e-05,
"loss": 0.0576,
"step": 1056
},
{
"epoch": 0.35791835093437174,
"grad_norm": 0.484375,
"learning_rate": 1.526264169967033e-05,
"loss": 0.0565,
"step": 1057
},
{
"epoch": 0.3582569681064951,
"grad_norm": 0.609375,
"learning_rate": 1.5253118222921024e-05,
"loss": 0.0859,
"step": 1058
},
{
"epoch": 0.35859558527861846,
"grad_norm": 0.458984375,
"learning_rate": 1.5243588161387596e-05,
"loss": 0.0602,
"step": 1059
},
{
"epoch": 0.35893420245074176,
"grad_norm": 0.51953125,
"learning_rate": 1.5234051527015983e-05,
"loss": 0.0625,
"step": 1060
},
{
"epoch": 0.3592728196228651,
"grad_norm": 0.443359375,
"learning_rate": 1.522450833176035e-05,
"loss": 0.0579,
"step": 1061
},
{
"epoch": 0.35961143679498847,
"grad_norm": 0.87890625,
"learning_rate": 1.5214958587583092e-05,
"loss": 0.0713,
"step": 1062
},
{
"epoch": 0.3599500539671118,
"grad_norm": 0.58984375,
"learning_rate": 1.5205402306454823e-05,
"loss": 0.0696,
"step": 1063
},
{
"epoch": 0.36028867113923513,
"grad_norm": 0.515625,
"learning_rate": 1.5195839500354337e-05,
"loss": 0.0755,
"step": 1064
},
{
"epoch": 0.3606272883113585,
"grad_norm": 0.439453125,
"learning_rate": 1.5186270181268612e-05,
"loss": 0.0575,
"step": 1065
},
{
"epoch": 0.36096590548348184,
"grad_norm": 0.59765625,
"learning_rate": 1.5176694361192787e-05,
"loss": 0.0814,
"step": 1066
},
{
"epoch": 0.3613045226556052,
"grad_norm": 0.56640625,
"learning_rate": 1.516711205213016e-05,
"loss": 0.0819,
"step": 1067
},
{
"epoch": 0.3616431398277285,
"grad_norm": 0.56640625,
"learning_rate": 1.5157523266092153e-05,
"loss": 0.0777,
"step": 1068
},
{
"epoch": 0.36198175699985186,
"grad_norm": 0.419921875,
"learning_rate": 1.5147928015098309e-05,
"loss": 0.0584,
"step": 1069
},
{
"epoch": 0.3623203741719752,
"grad_norm": 0.52734375,
"learning_rate": 1.5138326311176278e-05,
"loss": 0.0745,
"step": 1070
},
{
"epoch": 0.3626589913440985,
"grad_norm": 0.490234375,
"learning_rate": 1.5128718166361793e-05,
"loss": 0.0671,
"step": 1071
},
{
"epoch": 0.36299760851622187,
"grad_norm": 0.423828125,
"learning_rate": 1.511910359269867e-05,
"loss": 0.0531,
"step": 1072
},
{
"epoch": 0.3633362256883452,
"grad_norm": 0.51171875,
"learning_rate": 1.5109482602238773e-05,
"loss": 0.0624,
"step": 1073
},
{
"epoch": 0.3636748428604686,
"grad_norm": 0.66015625,
"learning_rate": 1.5099855207042016e-05,
"loss": 0.0907,
"step": 1074
},
{
"epoch": 0.3640134600325919,
"grad_norm": 0.53125,
"learning_rate": 1.509022141917634e-05,
"loss": 0.0673,
"step": 1075
},
{
"epoch": 0.36435207720471524,
"grad_norm": 0.54296875,
"learning_rate": 1.5080581250717699e-05,
"loss": 0.0671,
"step": 1076
},
{
"epoch": 0.3646906943768386,
"grad_norm": 0.6796875,
"learning_rate": 1.5070934713750043e-05,
"loss": 0.0854,
"step": 1077
},
{
"epoch": 0.36502931154896195,
"grad_norm": 0.6171875,
"learning_rate": 1.5061281820365308e-05,
"loss": 0.0592,
"step": 1078
},
{
"epoch": 0.36536792872108526,
"grad_norm": 0.50390625,
"learning_rate": 1.50516225826634e-05,
"loss": 0.0659,
"step": 1079
},
{
"epoch": 0.3657065458932086,
"grad_norm": 0.4921875,
"learning_rate": 1.5041957012752173e-05,
"loss": 0.0522,
"step": 1080
},
{
"epoch": 0.36604516306533197,
"grad_norm": 0.60546875,
"learning_rate": 1.5032285122747414e-05,
"loss": 0.0703,
"step": 1081
},
{
"epoch": 0.36638378023745527,
"grad_norm": 0.4375,
"learning_rate": 1.5022606924772842e-05,
"loss": 0.0595,
"step": 1082
},
{
"epoch": 0.3667223974095786,
"grad_norm": 0.54296875,
"learning_rate": 1.5012922430960082e-05,
"loss": 0.059,
"step": 1083
},
{
"epoch": 0.367061014581702,
"grad_norm": 0.453125,
"learning_rate": 1.5003231653448645e-05,
"loss": 0.0567,
"step": 1084
},
{
"epoch": 0.36739963175382534,
"grad_norm": 0.462890625,
"learning_rate": 1.4993534604385917e-05,
"loss": 0.0622,
"step": 1085
},
{
"epoch": 0.36773824892594864,
"grad_norm": 0.625,
"learning_rate": 1.4983831295927154e-05,
"loss": 0.0874,
"step": 1086
},
{
"epoch": 0.368076866098072,
"grad_norm": 0.51171875,
"learning_rate": 1.4974121740235457e-05,
"loss": 0.0616,
"step": 1087
},
{
"epoch": 0.36841548327019535,
"grad_norm": 0.5234375,
"learning_rate": 1.496440594948175e-05,
"loss": 0.0776,
"step": 1088
},
{
"epoch": 0.36875410044231866,
"grad_norm": 0.5234375,
"learning_rate": 1.495468393584478e-05,
"loss": 0.0603,
"step": 1089
},
{
"epoch": 0.369092717614442,
"grad_norm": 0.412109375,
"learning_rate": 1.4944955711511091e-05,
"loss": 0.0549,
"step": 1090
},
{
"epoch": 0.36943133478656537,
"grad_norm": 0.60546875,
"learning_rate": 1.4935221288675013e-05,
"loss": 0.0634,
"step": 1091
},
{
"epoch": 0.3697699519586887,
"grad_norm": 0.49609375,
"learning_rate": 1.4925480679538646e-05,
"loss": 0.0627,
"step": 1092
},
{
"epoch": 0.370108569130812,
"grad_norm": 0.408203125,
"learning_rate": 1.4915733896311844e-05,
"loss": 0.052,
"step": 1093
},
{
"epoch": 0.3704471863029354,
"grad_norm": 0.53125,
"learning_rate": 1.49059809512122e-05,
"loss": 0.0806,
"step": 1094
},
{
"epoch": 0.37078580347505874,
"grad_norm": 0.58203125,
"learning_rate": 1.4896221856465034e-05,
"loss": 0.0737,
"step": 1095
},
{
"epoch": 0.3711244206471821,
"grad_norm": 0.63671875,
"learning_rate": 1.4886456624303369e-05,
"loss": 0.0788,
"step": 1096
},
{
"epoch": 0.3714630378193054,
"grad_norm": 0.5390625,
"learning_rate": 1.4876685266967926e-05,
"loss": 0.0535,
"step": 1097
},
{
"epoch": 0.37180165499142875,
"grad_norm": 0.44140625,
"learning_rate": 1.4866907796707102e-05,
"loss": 0.0557,
"step": 1098
},
{
"epoch": 0.3721402721635521,
"grad_norm": 0.65625,
"learning_rate": 1.4857124225776955e-05,
"loss": 0.0696,
"step": 1099
},
{
"epoch": 0.3724788893356754,
"grad_norm": 0.54296875,
"learning_rate": 1.4847334566441199e-05,
"loss": 0.0639,
"step": 1100
},
{
"epoch": 0.37281750650779877,
"grad_norm": 0.71875,
"learning_rate": 1.4837538830971162e-05,
"loss": 0.0792,
"step": 1101
},
{
"epoch": 0.3731561236799221,
"grad_norm": 0.453125,
"learning_rate": 1.4827737031645808e-05,
"loss": 0.0613,
"step": 1102
},
{
"epoch": 0.3734947408520455,
"grad_norm": 0.490234375,
"learning_rate": 1.481792918075169e-05,
"loss": 0.0628,
"step": 1103
},
{
"epoch": 0.3738333580241688,
"grad_norm": 0.58203125,
"learning_rate": 1.4808115290582947e-05,
"loss": 0.0682,
"step": 1104
},
{
"epoch": 0.37417197519629214,
"grad_norm": 0.54296875,
"learning_rate": 1.4798295373441293e-05,
"loss": 0.0762,
"step": 1105
},
{
"epoch": 0.3745105923684155,
"grad_norm": 0.451171875,
"learning_rate": 1.4788469441635997e-05,
"loss": 0.0504,
"step": 1106
},
{
"epoch": 0.3748492095405388,
"grad_norm": 0.42578125,
"learning_rate": 1.4778637507483867e-05,
"loss": 0.0547,
"step": 1107
},
{
"epoch": 0.37518782671266215,
"grad_norm": 0.44140625,
"learning_rate": 1.4768799583309228e-05,
"loss": 0.0602,
"step": 1108
},
{
"epoch": 0.3755264438847855,
"grad_norm": 0.466796875,
"learning_rate": 1.475895568144392e-05,
"loss": 0.0622,
"step": 1109
},
{
"epoch": 0.37586506105690887,
"grad_norm": 0.44140625,
"learning_rate": 1.4749105814227278e-05,
"loss": 0.0564,
"step": 1110
},
{
"epoch": 0.37620367822903217,
"grad_norm": 0.5546875,
"learning_rate": 1.4739249994006111e-05,
"loss": 0.0762,
"step": 1111
},
{
"epoch": 0.3765422954011555,
"grad_norm": 0.44140625,
"learning_rate": 1.4729388233134684e-05,
"loss": 0.059,
"step": 1112
},
{
"epoch": 0.3768809125732789,
"grad_norm": 0.48828125,
"learning_rate": 1.4719520543974723e-05,
"loss": 0.0712,
"step": 1113
},
{
"epoch": 0.37721952974540224,
"grad_norm": 0.41796875,
"learning_rate": 1.4709646938895374e-05,
"loss": 0.0532,
"step": 1114
},
{
"epoch": 0.37755814691752554,
"grad_norm": 0.515625,
"learning_rate": 1.4699767430273202e-05,
"loss": 0.0628,
"step": 1115
},
{
"epoch": 0.3778967640896489,
"grad_norm": 0.55078125,
"learning_rate": 1.468988203049217e-05,
"loss": 0.0845,
"step": 1116
},
{
"epoch": 0.37823538126177225,
"grad_norm": 0.54296875,
"learning_rate": 1.4679990751943632e-05,
"loss": 0.0704,
"step": 1117
},
{
"epoch": 0.37857399843389555,
"grad_norm": 0.4453125,
"learning_rate": 1.4670093607026302e-05,
"loss": 0.05,
"step": 1118
},
{
"epoch": 0.3789126156060189,
"grad_norm": 0.65234375,
"learning_rate": 1.4660190608146253e-05,
"loss": 0.0856,
"step": 1119
},
{
"epoch": 0.37925123277814227,
"grad_norm": 0.484375,
"learning_rate": 1.4650281767716895e-05,
"loss": 0.0504,
"step": 1120
},
{
"epoch": 0.3795898499502656,
"grad_norm": 0.4765625,
"learning_rate": 1.4640367098158961e-05,
"loss": 0.0704,
"step": 1121
},
{
"epoch": 0.3799284671223889,
"grad_norm": 0.5234375,
"learning_rate": 1.4630446611900493e-05,
"loss": 0.0601,
"step": 1122
},
{
"epoch": 0.3802670842945123,
"grad_norm": 0.451171875,
"learning_rate": 1.4620520321376814e-05,
"loss": 0.0665,
"step": 1123
},
{
"epoch": 0.38060570146663564,
"grad_norm": 0.59375,
"learning_rate": 1.4610588239030537e-05,
"loss": 0.0776,
"step": 1124
},
{
"epoch": 0.380944318638759,
"grad_norm": 0.474609375,
"learning_rate": 1.4600650377311523e-05,
"loss": 0.0622,
"step": 1125
},
{
"epoch": 0.3812829358108823,
"grad_norm": 0.515625,
"learning_rate": 1.4590706748676886e-05,
"loss": 0.0618,
"step": 1126
},
{
"epoch": 0.38162155298300565,
"grad_norm": 0.484375,
"learning_rate": 1.4580757365590965e-05,
"loss": 0.0694,
"step": 1127
},
{
"epoch": 0.381960170155129,
"grad_norm": 0.486328125,
"learning_rate": 1.4570802240525309e-05,
"loss": 0.0619,
"step": 1128
},
{
"epoch": 0.3822987873272523,
"grad_norm": 0.609375,
"learning_rate": 1.456084138595867e-05,
"loss": 0.0651,
"step": 1129
},
{
"epoch": 0.38263740449937567,
"grad_norm": 0.53125,
"learning_rate": 1.4550874814376983e-05,
"loss": 0.0668,
"step": 1130
},
{
"epoch": 0.382976021671499,
"grad_norm": 0.51171875,
"learning_rate": 1.4540902538273343e-05,
"loss": 0.0639,
"step": 1131
},
{
"epoch": 0.3833146388436224,
"grad_norm": 0.51171875,
"learning_rate": 1.4530924570147998e-05,
"loss": 0.0626,
"step": 1132
},
{
"epoch": 0.3836532560157457,
"grad_norm": 0.50390625,
"learning_rate": 1.452094092250834e-05,
"loss": 0.0724,
"step": 1133
},
{
"epoch": 0.38399187318786904,
"grad_norm": 0.64453125,
"learning_rate": 1.451095160786886e-05,
"loss": 0.1049,
"step": 1134
},
{
"epoch": 0.3843304903599924,
"grad_norm": 0.921875,
"learning_rate": 1.450095663875117e-05,
"loss": 0.0733,
"step": 1135
},
{
"epoch": 0.3846691075321157,
"grad_norm": 0.7734375,
"learning_rate": 1.449095602768397e-05,
"loss": 0.077,
"step": 1136
},
{
"epoch": 0.38500772470423905,
"grad_norm": 0.6640625,
"learning_rate": 1.4480949787203015e-05,
"loss": 0.0811,
"step": 1137
},
{
"epoch": 0.3853463418763624,
"grad_norm": 0.82421875,
"learning_rate": 1.4470937929851142e-05,
"loss": 0.0932,
"step": 1138
},
{
"epoch": 0.38568495904848576,
"grad_norm": 0.515625,
"learning_rate": 1.4460920468178204e-05,
"loss": 0.0565,
"step": 1139
},
{
"epoch": 0.38602357622060907,
"grad_norm": 0.6796875,
"learning_rate": 1.4450897414741095e-05,
"loss": 0.0705,
"step": 1140
},
{
"epoch": 0.3863621933927324,
"grad_norm": 0.5,
"learning_rate": 1.4440868782103711e-05,
"loss": 0.064,
"step": 1141
},
{
"epoch": 0.3867008105648558,
"grad_norm": 0.5078125,
"learning_rate": 1.443083458283695e-05,
"loss": 0.055,
"step": 1142
},
{
"epoch": 0.38703942773697914,
"grad_norm": 0.40234375,
"learning_rate": 1.4420794829518674e-05,
"loss": 0.0502,
"step": 1143
},
{
"epoch": 0.38737804490910244,
"grad_norm": 0.466796875,
"learning_rate": 1.4410749534733719e-05,
"loss": 0.0673,
"step": 1144
},
{
"epoch": 0.3877166620812258,
"grad_norm": 0.75,
"learning_rate": 1.440069871107386e-05,
"loss": 0.0594,
"step": 1145
},
{
"epoch": 0.38805527925334915,
"grad_norm": 0.48046875,
"learning_rate": 1.4390642371137807e-05,
"loss": 0.059,
"step": 1146
},
{
"epoch": 0.38839389642547245,
"grad_norm": 2.171875,
"learning_rate": 1.438058052753118e-05,
"loss": 0.099,
"step": 1147
},
{
"epoch": 0.3887325135975958,
"grad_norm": 0.439453125,
"learning_rate": 1.4370513192866507e-05,
"loss": 0.0603,
"step": 1148
},
{
"epoch": 0.38907113076971916,
"grad_norm": 0.46875,
"learning_rate": 1.4360440379763187e-05,
"loss": 0.059,
"step": 1149
},
{
"epoch": 0.3894097479418425,
"grad_norm": 0.482421875,
"learning_rate": 1.4350362100847495e-05,
"loss": 0.0636,
"step": 1150
},
{
"epoch": 0.3897483651139658,
"grad_norm": 0.6171875,
"learning_rate": 1.4340278368752553e-05,
"loss": 0.0746,
"step": 1151
},
{
"epoch": 0.3900869822860892,
"grad_norm": 0.50390625,
"learning_rate": 1.4330189196118323e-05,
"loss": 0.0642,
"step": 1152
},
{
"epoch": 0.39042559945821254,
"grad_norm": 0.68359375,
"learning_rate": 1.4320094595591578e-05,
"loss": 0.0929,
"step": 1153
},
{
"epoch": 0.3907642166303359,
"grad_norm": 0.462890625,
"learning_rate": 1.4309994579825908e-05,
"loss": 0.0529,
"step": 1154
},
{
"epoch": 0.3911028338024592,
"grad_norm": 0.54296875,
"learning_rate": 1.4299889161481676e-05,
"loss": 0.0702,
"step": 1155
},
{
"epoch": 0.39144145097458255,
"grad_norm": 0.62890625,
"learning_rate": 1.4289778353226032e-05,
"loss": 0.0706,
"step": 1156
},
{
"epoch": 0.3917800681467059,
"grad_norm": 0.515625,
"learning_rate": 1.4279662167732869e-05,
"loss": 0.0684,
"step": 1157
},
{
"epoch": 0.3921186853188292,
"grad_norm": 0.4765625,
"learning_rate": 1.4269540617682826e-05,
"loss": 0.0706,
"step": 1158
},
{
"epoch": 0.39245730249095256,
"grad_norm": 0.66015625,
"learning_rate": 1.4259413715763276e-05,
"loss": 0.0789,
"step": 1159
},
{
"epoch": 0.3927959196630759,
"grad_norm": 0.55859375,
"learning_rate": 1.4249281474668279e-05,
"loss": 0.07,
"step": 1160
},
{
"epoch": 0.3931345368351993,
"grad_norm": 0.671875,
"learning_rate": 1.423914390709861e-05,
"loss": 0.0906,
"step": 1161
},
{
"epoch": 0.3934731540073226,
"grad_norm": 0.44921875,
"learning_rate": 1.4229001025761704e-05,
"loss": 0.0536,
"step": 1162
},
{
"epoch": 0.39381177117944594,
"grad_norm": 0.466796875,
"learning_rate": 1.4218852843371665e-05,
"loss": 0.0557,
"step": 1163
},
{
"epoch": 0.3941503883515693,
"grad_norm": 0.49609375,
"learning_rate": 1.4208699372649244e-05,
"loss": 0.0668,
"step": 1164
},
{
"epoch": 0.3944890055236926,
"grad_norm": 0.47265625,
"learning_rate": 1.4198540626321817e-05,
"loss": 0.0609,
"step": 1165
},
{
"epoch": 0.39482762269581595,
"grad_norm": 0.453125,
"learning_rate": 1.4188376617123368e-05,
"loss": 0.0655,
"step": 1166
},
{
"epoch": 0.3951662398679393,
"grad_norm": 0.484375,
"learning_rate": 1.4178207357794486e-05,
"loss": 0.0662,
"step": 1167
},
{
"epoch": 0.39550485704006266,
"grad_norm": 0.49609375,
"learning_rate": 1.4168032861082344e-05,
"loss": 0.0721,
"step": 1168
},
{
"epoch": 0.39584347421218596,
"grad_norm": 0.458984375,
"learning_rate": 1.4157853139740665e-05,
"loss": 0.0676,
"step": 1169
},
{
"epoch": 0.3961820913843093,
"grad_norm": 0.6015625,
"learning_rate": 1.4147668206529737e-05,
"loss": 0.0768,
"step": 1170
},
{
"epoch": 0.3965207085564327,
"grad_norm": 0.486328125,
"learning_rate": 1.413747807421637e-05,
"loss": 0.0727,
"step": 1171
},
{
"epoch": 0.39685932572855603,
"grad_norm": 0.435546875,
"learning_rate": 1.4127282755573903e-05,
"loss": 0.062,
"step": 1172
},
{
"epoch": 0.39719794290067933,
"grad_norm": 0.5234375,
"learning_rate": 1.4117082263382162e-05,
"loss": 0.0696,
"step": 1173
},
{
"epoch": 0.3975365600728027,
"grad_norm": 0.45703125,
"learning_rate": 1.4106876610427466e-05,
"loss": 0.0592,
"step": 1174
},
{
"epoch": 0.39787517724492605,
"grad_norm": 0.484375,
"learning_rate": 1.4096665809502607e-05,
"loss": 0.0607,
"step": 1175
},
{
"epoch": 0.39821379441704935,
"grad_norm": 0.49609375,
"learning_rate": 1.408644987340682e-05,
"loss": 0.0619,
"step": 1176
},
{
"epoch": 0.3985524115891727,
"grad_norm": 0.50390625,
"learning_rate": 1.4076228814945778e-05,
"loss": 0.0537,
"step": 1177
},
{
"epoch": 0.39889102876129606,
"grad_norm": 0.443359375,
"learning_rate": 1.4066002646931587e-05,
"loss": 0.0623,
"step": 1178
},
{
"epoch": 0.3992296459334194,
"grad_norm": 0.49609375,
"learning_rate": 1.4055771382182744e-05,
"loss": 0.0812,
"step": 1179
},
{
"epoch": 0.3995682631055427,
"grad_norm": 0.5078125,
"learning_rate": 1.404553503352414e-05,
"loss": 0.0687,
"step": 1180
},
{
"epoch": 0.3999068802776661,
"grad_norm": 0.45703125,
"learning_rate": 1.4035293613787042e-05,
"loss": 0.0612,
"step": 1181
},
{
"epoch": 0.40024549744978943,
"grad_norm": 0.55078125,
"learning_rate": 1.4025047135809069e-05,
"loss": 0.0767,
"step": 1182
},
{
"epoch": 0.4005841146219128,
"grad_norm": 0.57421875,
"learning_rate": 1.4014795612434182e-05,
"loss": 0.085,
"step": 1183
},
{
"epoch": 0.4009227317940361,
"grad_norm": 0.50390625,
"learning_rate": 1.4004539056512667e-05,
"loss": 0.0683,
"step": 1184
},
{
"epoch": 0.4009227317940361,
"eval_loss": 0.06940508633852005,
"eval_runtime": 816.0342,
"eval_samples_per_second": 12.191,
"eval_steps_per_second": 3.048,
"step": 1184
},
{
"epoch": 0.40126134896615945,
"grad_norm": 0.494140625,
"learning_rate": 1.3994277480901116e-05,
"loss": 0.0512,
"step": 1185
},
{
"epoch": 0.4015999661382828,
"grad_norm": 0.56640625,
"learning_rate": 1.3984010898462417e-05,
"loss": 0.0746,
"step": 1186
},
{
"epoch": 0.4019385833104061,
"grad_norm": 0.515625,
"learning_rate": 1.397373932206573e-05,
"loss": 0.0604,
"step": 1187
},
{
"epoch": 0.40227720048252946,
"grad_norm": 0.53515625,
"learning_rate": 1.3963462764586479e-05,
"loss": 0.0658,
"step": 1188
},
{
"epoch": 0.4026158176546528,
"grad_norm": 0.49609375,
"learning_rate": 1.3953181238906326e-05,
"loss": 0.0674,
"step": 1189
},
{
"epoch": 0.4029544348267762,
"grad_norm": 0.47265625,
"learning_rate": 1.3942894757913169e-05,
"loss": 0.0628,
"step": 1190
},
{
"epoch": 0.4032930519988995,
"grad_norm": 0.494140625,
"learning_rate": 1.3932603334501106e-05,
"loss": 0.0657,
"step": 1191
},
{
"epoch": 0.40363166917102283,
"grad_norm": 0.44921875,
"learning_rate": 1.3922306981570447e-05,
"loss": 0.0588,
"step": 1192
},
{
"epoch": 0.4039702863431462,
"grad_norm": 0.404296875,
"learning_rate": 1.3912005712027661e-05,
"loss": 0.0558,
"step": 1193
},
{
"epoch": 0.4043089035152695,
"grad_norm": 0.375,
"learning_rate": 1.3901699538785398e-05,
"loss": 0.0519,
"step": 1194
},
{
"epoch": 0.40464752068739285,
"grad_norm": 0.671875,
"learning_rate": 1.3891388474762444e-05,
"loss": 0.0755,
"step": 1195
},
{
"epoch": 0.4049861378595162,
"grad_norm": 0.41796875,
"learning_rate": 1.388107253288372e-05,
"loss": 0.0614,
"step": 1196
},
{
"epoch": 0.40532475503163956,
"grad_norm": 0.7109375,
"learning_rate": 1.3870751726080256e-05,
"loss": 0.1036,
"step": 1197
},
{
"epoch": 0.40566337220376286,
"grad_norm": 0.349609375,
"learning_rate": 1.3860426067289185e-05,
"loss": 0.0498,
"step": 1198
},
{
"epoch": 0.4060019893758862,
"grad_norm": 0.43359375,
"learning_rate": 1.3850095569453728e-05,
"loss": 0.0602,
"step": 1199
},
{
"epoch": 0.4063406065480096,
"grad_norm": 0.59765625,
"learning_rate": 1.3839760245523155e-05,
"loss": 0.0664,
"step": 1200
},
{
"epoch": 0.40667922372013293,
"grad_norm": 0.6953125,
"learning_rate": 1.38294201084528e-05,
"loss": 0.089,
"step": 1201
},
{
"epoch": 0.40701784089225623,
"grad_norm": 0.7265625,
"learning_rate": 1.3819075171204028e-05,
"loss": 0.0595,
"step": 1202
},
{
"epoch": 0.4073564580643796,
"grad_norm": 0.765625,
"learning_rate": 1.3808725446744218e-05,
"loss": 0.0571,
"step": 1203
},
{
"epoch": 0.40769507523650295,
"grad_norm": 0.431640625,
"learning_rate": 1.3798370948046747e-05,
"loss": 0.0537,
"step": 1204
},
{
"epoch": 0.40803369240862625,
"grad_norm": 0.50390625,
"learning_rate": 1.3788011688090978e-05,
"loss": 0.0609,
"step": 1205
},
{
"epoch": 0.4083723095807496,
"grad_norm": 0.48828125,
"learning_rate": 1.3777647679862254e-05,
"loss": 0.0642,
"step": 1206
},
{
"epoch": 0.40871092675287296,
"grad_norm": 0.625,
"learning_rate": 1.3767278936351853e-05,
"loss": 0.0814,
"step": 1207
},
{
"epoch": 0.4090495439249963,
"grad_norm": 0.57421875,
"learning_rate": 1.3756905470556996e-05,
"loss": 0.099,
"step": 1208
},
{
"epoch": 0.4093881610971196,
"grad_norm": 0.4375,
"learning_rate": 1.3746527295480825e-05,
"loss": 0.0597,
"step": 1209
},
{
"epoch": 0.409726778269243,
"grad_norm": 0.50390625,
"learning_rate": 1.3736144424132383e-05,
"loss": 0.0622,
"step": 1210
},
{
"epoch": 0.41006539544136633,
"grad_norm": 0.6328125,
"learning_rate": 1.3725756869526598e-05,
"loss": 0.0794,
"step": 1211
},
{
"epoch": 0.4104040126134897,
"grad_norm": 0.48828125,
"learning_rate": 1.3715364644684273e-05,
"loss": 0.0525,
"step": 1212
},
{
"epoch": 0.410742629785613,
"grad_norm": 0.51171875,
"learning_rate": 1.370496776263206e-05,
"loss": 0.064,
"step": 1213
},
{
"epoch": 0.41108124695773635,
"grad_norm": 0.828125,
"learning_rate": 1.3694566236402458e-05,
"loss": 0.0815,
"step": 1214
},
{
"epoch": 0.4114198641298597,
"grad_norm": 0.48046875,
"learning_rate": 1.3684160079033772e-05,
"loss": 0.0638,
"step": 1215
},
{
"epoch": 0.411758481301983,
"grad_norm": 0.56640625,
"learning_rate": 1.3673749303570127e-05,
"loss": 0.0801,
"step": 1216
},
{
"epoch": 0.41209709847410636,
"grad_norm": 0.5703125,
"learning_rate": 1.366333392306143e-05,
"loss": 0.0628,
"step": 1217
},
{
"epoch": 0.4124357156462297,
"grad_norm": 0.46875,
"learning_rate": 1.3652913950563362e-05,
"loss": 0.062,
"step": 1218
},
{
"epoch": 0.4127743328183531,
"grad_norm": 0.7265625,
"learning_rate": 1.3642489399137358e-05,
"loss": 0.079,
"step": 1219
},
{
"epoch": 0.4131129499904764,
"grad_norm": 0.50390625,
"learning_rate": 1.3632060281850593e-05,
"loss": 0.0634,
"step": 1220
},
{
"epoch": 0.41345156716259973,
"grad_norm": 0.3984375,
"learning_rate": 1.3621626611775966e-05,
"loss": 0.0488,
"step": 1221
},
{
"epoch": 0.4137901843347231,
"grad_norm": 0.5625,
"learning_rate": 1.3611188401992087e-05,
"loss": 0.0813,
"step": 1222
},
{
"epoch": 0.4141288015068464,
"grad_norm": 0.91015625,
"learning_rate": 1.360074566558325e-05,
"loss": 0.0672,
"step": 1223
},
{
"epoch": 0.41446741867896975,
"grad_norm": 0.57421875,
"learning_rate": 1.3590298415639427e-05,
"loss": 0.0753,
"step": 1224
},
{
"epoch": 0.4148060358510931,
"grad_norm": 0.462890625,
"learning_rate": 1.3579846665256244e-05,
"loss": 0.0703,
"step": 1225
},
{
"epoch": 0.41514465302321646,
"grad_norm": 0.72265625,
"learning_rate": 1.3569390427534976e-05,
"loss": 0.0963,
"step": 1226
},
{
"epoch": 0.41548327019533976,
"grad_norm": 0.5859375,
"learning_rate": 1.3558929715582517e-05,
"loss": 0.07,
"step": 1227
},
{
"epoch": 0.4158218873674631,
"grad_norm": 0.47265625,
"learning_rate": 1.3548464542511364e-05,
"loss": 0.0673,
"step": 1228
},
{
"epoch": 0.4161605045395865,
"grad_norm": 0.57421875,
"learning_rate": 1.353799492143962e-05,
"loss": 0.0684,
"step": 1229
},
{
"epoch": 0.41649912171170983,
"grad_norm": 0.609375,
"learning_rate": 1.352752086549095e-05,
"loss": 0.0755,
"step": 1230
},
{
"epoch": 0.41683773888383313,
"grad_norm": 0.53125,
"learning_rate": 1.3517042387794585e-05,
"loss": 0.0698,
"step": 1231
},
{
"epoch": 0.4171763560559565,
"grad_norm": 0.5,
"learning_rate": 1.3506559501485304e-05,
"loss": 0.058,
"step": 1232
},
{
"epoch": 0.41751497322807984,
"grad_norm": 0.640625,
"learning_rate": 1.3496072219703399e-05,
"loss": 0.0792,
"step": 1233
},
{
"epoch": 0.41785359040020315,
"grad_norm": 0.4296875,
"learning_rate": 1.3485580555594679e-05,
"loss": 0.0639,
"step": 1234
},
{
"epoch": 0.4181922075723265,
"grad_norm": 0.66796875,
"learning_rate": 1.3475084522310451e-05,
"loss": 0.0783,
"step": 1235
},
{
"epoch": 0.41853082474444986,
"grad_norm": 0.64453125,
"learning_rate": 1.3464584133007486e-05,
"loss": 0.0711,
"step": 1236
},
{
"epoch": 0.4188694419165732,
"grad_norm": 0.58203125,
"learning_rate": 1.3454079400848029e-05,
"loss": 0.0688,
"step": 1237
},
{
"epoch": 0.4192080590886965,
"grad_norm": 0.486328125,
"learning_rate": 1.3443570338999759e-05,
"loss": 0.0748,
"step": 1238
},
{
"epoch": 0.4195466762608199,
"grad_norm": 0.56640625,
"learning_rate": 1.3433056960635788e-05,
"loss": 0.0767,
"step": 1239
},
{
"epoch": 0.41988529343294323,
"grad_norm": 0.5546875,
"learning_rate": 1.3422539278934637e-05,
"loss": 0.0543,
"step": 1240
},
{
"epoch": 0.4202239106050666,
"grad_norm": 0.81640625,
"learning_rate": 1.341201730708022e-05,
"loss": 0.0847,
"step": 1241
},
{
"epoch": 0.4205625277771899,
"grad_norm": 0.58984375,
"learning_rate": 1.3401491058261829e-05,
"loss": 0.0803,
"step": 1242
},
{
"epoch": 0.42090114494931324,
"grad_norm": 0.435546875,
"learning_rate": 1.3390960545674117e-05,
"loss": 0.058,
"step": 1243
},
{
"epoch": 0.4212397621214366,
"grad_norm": 0.53515625,
"learning_rate": 1.3380425782517084e-05,
"loss": 0.0666,
"step": 1244
},
{
"epoch": 0.4215783792935599,
"grad_norm": 0.63671875,
"learning_rate": 1.3369886781996056e-05,
"loss": 0.0741,
"step": 1245
},
{
"epoch": 0.42191699646568326,
"grad_norm": 0.453125,
"learning_rate": 1.335934355732167e-05,
"loss": 0.052,
"step": 1246
},
{
"epoch": 0.4222556136378066,
"grad_norm": 0.52734375,
"learning_rate": 1.3348796121709862e-05,
"loss": 0.0629,
"step": 1247
},
{
"epoch": 0.42259423080992997,
"grad_norm": 0.43359375,
"learning_rate": 1.3338244488381843e-05,
"loss": 0.0573,
"step": 1248
},
{
"epoch": 0.4229328479820533,
"grad_norm": 0.40625,
"learning_rate": 1.332768867056408e-05,
"loss": 0.0526,
"step": 1249
},
{
"epoch": 0.42327146515417663,
"grad_norm": 0.470703125,
"learning_rate": 1.3317128681488301e-05,
"loss": 0.0641,
"step": 1250
},
{
"epoch": 0.4236100823263,
"grad_norm": 0.5078125,
"learning_rate": 1.3306564534391447e-05,
"loss": 0.0617,
"step": 1251
},
{
"epoch": 0.4239486994984233,
"grad_norm": 0.91796875,
"learning_rate": 1.3295996242515679e-05,
"loss": 0.0626,
"step": 1252
},
{
"epoch": 0.42428731667054664,
"grad_norm": 0.46875,
"learning_rate": 1.3285423819108349e-05,
"loss": 0.0596,
"step": 1253
},
{
"epoch": 0.42462593384267,
"grad_norm": 0.375,
"learning_rate": 1.3274847277421997e-05,
"loss": 0.0488,
"step": 1254
},
{
"epoch": 0.42496455101479336,
"grad_norm": 0.42578125,
"learning_rate": 1.3264266630714308e-05,
"loss": 0.0614,
"step": 1255
},
{
"epoch": 0.42530316818691666,
"grad_norm": 0.455078125,
"learning_rate": 1.3253681892248136e-05,
"loss": 0.0623,
"step": 1256
},
{
"epoch": 0.42564178535904,
"grad_norm": 1.234375,
"learning_rate": 1.3243093075291444e-05,
"loss": 0.0729,
"step": 1257
},
{
"epoch": 0.42598040253116337,
"grad_norm": 0.4609375,
"learning_rate": 1.3232500193117318e-05,
"loss": 0.0576,
"step": 1258
},
{
"epoch": 0.42631901970328673,
"grad_norm": 0.58984375,
"learning_rate": 1.3221903259003935e-05,
"loss": 0.0782,
"step": 1259
},
{
"epoch": 0.42665763687541003,
"grad_norm": 0.462890625,
"learning_rate": 1.3211302286234553e-05,
"loss": 0.0638,
"step": 1260
},
{
"epoch": 0.4269962540475334,
"grad_norm": 0.431640625,
"learning_rate": 1.3200697288097492e-05,
"loss": 0.0536,
"step": 1261
},
{
"epoch": 0.42733487121965674,
"grad_norm": 0.55078125,
"learning_rate": 1.3190088277886119e-05,
"loss": 0.073,
"step": 1262
},
{
"epoch": 0.42767348839178004,
"grad_norm": 0.416015625,
"learning_rate": 1.3179475268898828e-05,
"loss": 0.0467,
"step": 1263
},
{
"epoch": 0.4280121055639034,
"grad_norm": 0.361328125,
"learning_rate": 1.316885827443903e-05,
"loss": 0.0436,
"step": 1264
},
{
"epoch": 0.42835072273602676,
"grad_norm": 0.55078125,
"learning_rate": 1.3158237307815122e-05,
"loss": 0.0731,
"step": 1265
},
{
"epoch": 0.4286893399081501,
"grad_norm": 0.54296875,
"learning_rate": 1.3147612382340493e-05,
"loss": 0.0736,
"step": 1266
},
{
"epoch": 0.4290279570802734,
"grad_norm": 0.59375,
"learning_rate": 1.3136983511333483e-05,
"loss": 0.0812,
"step": 1267
},
{
"epoch": 0.42936657425239677,
"grad_norm": 0.51171875,
"learning_rate": 1.3126350708117387e-05,
"loss": 0.0731,
"step": 1268
},
{
"epoch": 0.4297051914245201,
"grad_norm": 0.5078125,
"learning_rate": 1.3115713986020421e-05,
"loss": 0.0622,
"step": 1269
},
{
"epoch": 0.4300438085966435,
"grad_norm": 0.388671875,
"learning_rate": 1.3105073358375719e-05,
"loss": 0.0519,
"step": 1270
},
{
"epoch": 0.4303824257687668,
"grad_norm": 0.75390625,
"learning_rate": 1.309442883852131e-05,
"loss": 0.0535,
"step": 1271
},
{
"epoch": 0.43072104294089014,
"grad_norm": 0.53515625,
"learning_rate": 1.30837804398001e-05,
"loss": 0.0714,
"step": 1272
},
{
"epoch": 0.4310596601130135,
"grad_norm": 0.51953125,
"learning_rate": 1.3073128175559852e-05,
"loss": 0.0685,
"step": 1273
},
{
"epoch": 0.4313982772851368,
"grad_norm": 0.5390625,
"learning_rate": 1.3062472059153185e-05,
"loss": 0.0976,
"step": 1274
},
{
"epoch": 0.43173689445726016,
"grad_norm": 0.6171875,
"learning_rate": 1.3051812103937545e-05,
"loss": 0.0825,
"step": 1275
},
{
"epoch": 0.4320755116293835,
"grad_norm": 0.44921875,
"learning_rate": 1.3041148323275182e-05,
"loss": 0.0525,
"step": 1276
},
{
"epoch": 0.43241412880150687,
"grad_norm": 0.4609375,
"learning_rate": 1.3030480730533146e-05,
"loss": 0.0686,
"step": 1277
},
{
"epoch": 0.43275274597363017,
"grad_norm": 0.59375,
"learning_rate": 1.3019809339083262e-05,
"loss": 0.0872,
"step": 1278
},
{
"epoch": 0.4330913631457535,
"grad_norm": 0.62109375,
"learning_rate": 1.3009134162302131e-05,
"loss": 0.0991,
"step": 1279
},
{
"epoch": 0.4334299803178769,
"grad_norm": 0.4921875,
"learning_rate": 1.299845521357108e-05,
"loss": 0.0583,
"step": 1280
},
{
"epoch": 0.4337685974900002,
"grad_norm": 0.5703125,
"learning_rate": 1.2987772506276173e-05,
"loss": 0.0625,
"step": 1281
},
{
"epoch": 0.43410721466212354,
"grad_norm": 0.4765625,
"learning_rate": 1.2977086053808183e-05,
"loss": 0.0614,
"step": 1282
},
{
"epoch": 0.4344458318342469,
"grad_norm": 0.55078125,
"learning_rate": 1.2966395869562582e-05,
"loss": 0.0513,
"step": 1283
},
{
"epoch": 0.43478444900637025,
"grad_norm": 0.486328125,
"learning_rate": 1.2955701966939517e-05,
"loss": 0.0637,
"step": 1284
},
{
"epoch": 0.43512306617849356,
"grad_norm": 0.453125,
"learning_rate": 1.2945004359343794e-05,
"loss": 0.0661,
"step": 1285
},
{
"epoch": 0.4354616833506169,
"grad_norm": 0.58984375,
"learning_rate": 1.2934303060184865e-05,
"loss": 0.0694,
"step": 1286
},
{
"epoch": 0.43580030052274027,
"grad_norm": 0.40625,
"learning_rate": 1.2923598082876811e-05,
"loss": 0.0542,
"step": 1287
},
{
"epoch": 0.4361389176948636,
"grad_norm": 0.419921875,
"learning_rate": 1.291288944083832e-05,
"loss": 0.0567,
"step": 1288
},
{
"epoch": 0.4364775348669869,
"grad_norm": 0.494140625,
"learning_rate": 1.2902177147492677e-05,
"loss": 0.0662,
"step": 1289
},
{
"epoch": 0.4368161520391103,
"grad_norm": 0.51953125,
"learning_rate": 1.2891461216267742e-05,
"loss": 0.0785,
"step": 1290
},
{
"epoch": 0.43715476921123364,
"grad_norm": 0.37890625,
"learning_rate": 1.2880741660595936e-05,
"loss": 0.0521,
"step": 1291
},
{
"epoch": 0.43749338638335694,
"grad_norm": 0.412109375,
"learning_rate": 1.2870018493914227e-05,
"loss": 0.0479,
"step": 1292
},
{
"epoch": 0.4378320035554803,
"grad_norm": 0.4765625,
"learning_rate": 1.2859291729664094e-05,
"loss": 0.0694,
"step": 1293
},
{
"epoch": 0.43817062072760365,
"grad_norm": 0.76953125,
"learning_rate": 1.2848561381291547e-05,
"loss": 0.0655,
"step": 1294
},
{
"epoch": 0.438509237899727,
"grad_norm": 0.609375,
"learning_rate": 1.2837827462247077e-05,
"loss": 0.0711,
"step": 1295
},
{
"epoch": 0.4388478550718503,
"grad_norm": 0.72265625,
"learning_rate": 1.2827089985985647e-05,
"loss": 0.1055,
"step": 1296
},
{
"epoch": 0.43918647224397367,
"grad_norm": 0.55859375,
"learning_rate": 1.2816348965966693e-05,
"loss": 0.053,
"step": 1297
},
{
"epoch": 0.439525089416097,
"grad_norm": 0.46875,
"learning_rate": 1.2805604415654076e-05,
"loss": 0.0567,
"step": 1298
},
{
"epoch": 0.4398637065882204,
"grad_norm": 0.57421875,
"learning_rate": 1.2794856348516095e-05,
"loss": 0.0856,
"step": 1299
},
{
"epoch": 0.4402023237603437,
"grad_norm": 0.69140625,
"learning_rate": 1.2784104778025455e-05,
"loss": 0.0913,
"step": 1300
},
{
"epoch": 0.44054094093246704,
"grad_norm": 0.46875,
"learning_rate": 1.2773349717659245e-05,
"loss": 0.0607,
"step": 1301
},
{
"epoch": 0.4408795581045904,
"grad_norm": 0.76953125,
"learning_rate": 1.2762591180898938e-05,
"loss": 0.1013,
"step": 1302
},
{
"epoch": 0.4412181752767137,
"grad_norm": 0.5390625,
"learning_rate": 1.2751829181230364e-05,
"loss": 0.0565,
"step": 1303
},
{
"epoch": 0.44155679244883705,
"grad_norm": 0.46875,
"learning_rate": 1.274106373214368e-05,
"loss": 0.0702,
"step": 1304
},
{
"epoch": 0.4418954096209604,
"grad_norm": 0.490234375,
"learning_rate": 1.2730294847133386e-05,
"loss": 0.0666,
"step": 1305
},
{
"epoch": 0.44223402679308377,
"grad_norm": 1.0546875,
"learning_rate": 1.2719522539698277e-05,
"loss": 0.0646,
"step": 1306
},
{
"epoch": 0.44257264396520707,
"grad_norm": 0.58203125,
"learning_rate": 1.2708746823341444e-05,
"loss": 0.0871,
"step": 1307
},
{
"epoch": 0.4429112611373304,
"grad_norm": 0.5390625,
"learning_rate": 1.2697967711570243e-05,
"loss": 0.0495,
"step": 1308
},
{
"epoch": 0.4432498783094538,
"grad_norm": 0.5234375,
"learning_rate": 1.2687185217896297e-05,
"loss": 0.0733,
"step": 1309
},
{
"epoch": 0.4435884954815771,
"grad_norm": 0.5625,
"learning_rate": 1.267639935583546e-05,
"loss": 0.072,
"step": 1310
},
{
"epoch": 0.44392711265370044,
"grad_norm": 0.55078125,
"learning_rate": 1.2665610138907813e-05,
"loss": 0.0742,
"step": 1311
},
{
"epoch": 0.4442657298258238,
"grad_norm": 0.73046875,
"learning_rate": 1.2654817580637637e-05,
"loss": 0.1116,
"step": 1312
},
{
"epoch": 0.44460434699794715,
"grad_norm": 0.43359375,
"learning_rate": 1.264402169455341e-05,
"loss": 0.0562,
"step": 1313
},
{
"epoch": 0.44494296417007045,
"grad_norm": 0.408203125,
"learning_rate": 1.263322249418777e-05,
"loss": 0.052,
"step": 1314
},
{
"epoch": 0.4452815813421938,
"grad_norm": 0.56640625,
"learning_rate": 1.2622419993077518e-05,
"loss": 0.0801,
"step": 1315
},
{
"epoch": 0.44562019851431717,
"grad_norm": 0.46484375,
"learning_rate": 1.2611614204763587e-05,
"loss": 0.0588,
"step": 1316
},
{
"epoch": 0.4459588156864405,
"grad_norm": 0.515625,
"learning_rate": 1.2600805142791042e-05,
"loss": 0.0619,
"step": 1317
},
{
"epoch": 0.4462974328585638,
"grad_norm": 0.54296875,
"learning_rate": 1.2589992820709033e-05,
"loss": 0.0616,
"step": 1318
},
{
"epoch": 0.4466360500306872,
"grad_norm": 0.78515625,
"learning_rate": 1.2579177252070815e-05,
"loss": 0.0718,
"step": 1319
},
{
"epoch": 0.44697466720281054,
"grad_norm": 0.439453125,
"learning_rate": 1.2568358450433698e-05,
"loss": 0.0587,
"step": 1320
},
{
"epoch": 0.44731328437493384,
"grad_norm": 0.408203125,
"learning_rate": 1.2557536429359054e-05,
"loss": 0.0561,
"step": 1321
},
{
"epoch": 0.4476519015470572,
"grad_norm": 0.474609375,
"learning_rate": 1.2546711202412287e-05,
"loss": 0.0559,
"step": 1322
},
{
"epoch": 0.44799051871918055,
"grad_norm": 0.41015625,
"learning_rate": 1.2535882783162823e-05,
"loss": 0.0528,
"step": 1323
},
{
"epoch": 0.4483291358913039,
"grad_norm": 0.38671875,
"learning_rate": 1.2525051185184078e-05,
"loss": 0.0451,
"step": 1324
},
{
"epoch": 0.4486677530634272,
"grad_norm": 0.421875,
"learning_rate": 1.2514216422053468e-05,
"loss": 0.0545,
"step": 1325
},
{
"epoch": 0.44900637023555057,
"grad_norm": 0.48828125,
"learning_rate": 1.2503378507352365e-05,
"loss": 0.071,
"step": 1326
},
{
"epoch": 0.4493449874076739,
"grad_norm": 0.87890625,
"learning_rate": 1.24925374546661e-05,
"loss": 0.0632,
"step": 1327
},
{
"epoch": 0.4496836045797973,
"grad_norm": 0.58203125,
"learning_rate": 1.2481693277583932e-05,
"loss": 0.0858,
"step": 1328
},
{
"epoch": 0.4500222217519206,
"grad_norm": 0.5546875,
"learning_rate": 1.2470845989699036e-05,
"loss": 0.0668,
"step": 1329
},
{
"epoch": 0.45036083892404394,
"grad_norm": 0.47265625,
"learning_rate": 1.2459995604608493e-05,
"loss": 0.066,
"step": 1330
},
{
"epoch": 0.4506994560961673,
"grad_norm": 0.56640625,
"learning_rate": 1.2449142135913254e-05,
"loss": 0.0731,
"step": 1331
},
{
"epoch": 0.4510380732682906,
"grad_norm": 0.46484375,
"learning_rate": 1.243828559721815e-05,
"loss": 0.0638,
"step": 1332
},
{
"epoch": 0.45137669044041395,
"grad_norm": 0.6171875,
"learning_rate": 1.2427426002131848e-05,
"loss": 0.0645,
"step": 1333
},
{
"epoch": 0.4517153076125373,
"grad_norm": 0.640625,
"learning_rate": 1.2416563364266859e-05,
"loss": 0.0873,
"step": 1334
},
{
"epoch": 0.45205392478466067,
"grad_norm": 0.53515625,
"learning_rate": 1.240569769723949e-05,
"loss": 0.0606,
"step": 1335
},
{
"epoch": 0.45239254195678397,
"grad_norm": 0.58984375,
"learning_rate": 1.2394829014669863e-05,
"loss": 0.0785,
"step": 1336
},
{
"epoch": 0.4527311591289073,
"grad_norm": 0.5,
"learning_rate": 1.238395733018187e-05,
"loss": 0.0489,
"step": 1337
},
{
"epoch": 0.4530697763010307,
"grad_norm": 0.44921875,
"learning_rate": 1.2373082657403168e-05,
"loss": 0.0622,
"step": 1338
},
{
"epoch": 0.453408393473154,
"grad_norm": 0.5390625,
"learning_rate": 1.236220500996516e-05,
"loss": 0.0688,
"step": 1339
},
{
"epoch": 0.45374701064527734,
"grad_norm": 0.4765625,
"learning_rate": 1.235132440150298e-05,
"loss": 0.0579,
"step": 1340
},
{
"epoch": 0.4540856278174007,
"grad_norm": 0.494140625,
"learning_rate": 1.234044084565547e-05,
"loss": 0.0592,
"step": 1341
},
{
"epoch": 0.45442424498952405,
"grad_norm": 0.46875,
"learning_rate": 1.232955435606517e-05,
"loss": 0.0614,
"step": 1342
},
{
"epoch": 0.45476286216164735,
"grad_norm": 0.671875,
"learning_rate": 1.2318664946378292e-05,
"loss": 0.0752,
"step": 1343
},
{
"epoch": 0.4551014793337707,
"grad_norm": 0.53515625,
"learning_rate": 1.2307772630244715e-05,
"loss": 0.0526,
"step": 1344
},
{
"epoch": 0.45544009650589407,
"grad_norm": 0.55859375,
"learning_rate": 1.2296877421317958e-05,
"loss": 0.0691,
"step": 1345
},
{
"epoch": 0.4557787136780174,
"grad_norm": 0.59765625,
"learning_rate": 1.2285979333255165e-05,
"loss": 0.0796,
"step": 1346
},
{
"epoch": 0.4561173308501407,
"grad_norm": 0.447265625,
"learning_rate": 1.227507837971709e-05,
"loss": 0.0538,
"step": 1347
},
{
"epoch": 0.4564559480222641,
"grad_norm": 0.515625,
"learning_rate": 1.2264174574368079e-05,
"loss": 0.0668,
"step": 1348
},
{
"epoch": 0.45679456519438744,
"grad_norm": 0.46484375,
"learning_rate": 1.2253267930876056e-05,
"loss": 0.0635,
"step": 1349
},
{
"epoch": 0.45713318236651074,
"grad_norm": 0.63671875,
"learning_rate": 1.2242358462912496e-05,
"loss": 0.0826,
"step": 1350
},
{
"epoch": 0.4574717995386341,
"grad_norm": 0.458984375,
"learning_rate": 1.2231446184152419e-05,
"loss": 0.0538,
"step": 1351
},
{
"epoch": 0.45781041671075745,
"grad_norm": 0.42578125,
"learning_rate": 1.2220531108274367e-05,
"loss": 0.0596,
"step": 1352
},
{
"epoch": 0.4581490338828808,
"grad_norm": 0.53125,
"learning_rate": 1.220961324896039e-05,
"loss": 0.0715,
"step": 1353
},
{
"epoch": 0.4584876510550041,
"grad_norm": 0.58203125,
"learning_rate": 1.2198692619896026e-05,
"loss": 0.0625,
"step": 1354
},
{
"epoch": 0.45882626822712747,
"grad_norm": 0.453125,
"learning_rate": 1.218776923477028e-05,
"loss": 0.0606,
"step": 1355
},
{
"epoch": 0.4591648853992508,
"grad_norm": 0.388671875,
"learning_rate": 1.2176843107275624e-05,
"loss": 0.0471,
"step": 1356
},
{
"epoch": 0.4595035025713742,
"grad_norm": 0.83984375,
"learning_rate": 1.2165914251107953e-05,
"loss": 0.0775,
"step": 1357
},
{
"epoch": 0.4598421197434975,
"grad_norm": 0.54296875,
"learning_rate": 1.215498267996659e-05,
"loss": 0.0786,
"step": 1358
},
{
"epoch": 0.46018073691562084,
"grad_norm": 0.51171875,
"learning_rate": 1.214404840755426e-05,
"loss": 0.0682,
"step": 1359
},
{
"epoch": 0.4605193540877442,
"grad_norm": 0.494140625,
"learning_rate": 1.2133111447577077e-05,
"loss": 0.0661,
"step": 1360
},
{
"epoch": 0.4608579712598675,
"grad_norm": 0.62109375,
"learning_rate": 1.2122171813744522e-05,
"loss": 0.0905,
"step": 1361
},
{
"epoch": 0.46119658843199085,
"grad_norm": 0.4375,
"learning_rate": 1.2111229519769421e-05,
"loss": 0.0615,
"step": 1362
},
{
"epoch": 0.4615352056041142,
"grad_norm": 0.5234375,
"learning_rate": 1.2100284579367947e-05,
"loss": 0.0636,
"step": 1363
},
{
"epoch": 0.46187382277623756,
"grad_norm": 0.44140625,
"learning_rate": 1.2089337006259581e-05,
"loss": 0.0617,
"step": 1364
},
{
"epoch": 0.46221243994836086,
"grad_norm": 0.51953125,
"learning_rate": 1.2078386814167106e-05,
"loss": 0.0522,
"step": 1365
},
{
"epoch": 0.4625510571204842,
"grad_norm": 0.59765625,
"learning_rate": 1.2067434016816591e-05,
"loss": 0.0824,
"step": 1366
},
{
"epoch": 0.4628896742926076,
"grad_norm": 0.578125,
"learning_rate": 1.2056478627937364e-05,
"loss": 0.0736,
"step": 1367
},
{
"epoch": 0.4632282914647309,
"grad_norm": 0.47265625,
"learning_rate": 1.2045520661262011e-05,
"loss": 0.0579,
"step": 1368
},
{
"epoch": 0.46356690863685424,
"grad_norm": 0.765625,
"learning_rate": 1.2034560130526341e-05,
"loss": 0.0649,
"step": 1369
},
{
"epoch": 0.4639055258089776,
"grad_norm": 0.5234375,
"learning_rate": 1.2023597049469378e-05,
"loss": 0.0666,
"step": 1370
},
{
"epoch": 0.46424414298110095,
"grad_norm": 0.47265625,
"learning_rate": 1.201263143183335e-05,
"loss": 0.0632,
"step": 1371
},
{
"epoch": 0.46458276015322425,
"grad_norm": 0.5078125,
"learning_rate": 1.2001663291363661e-05,
"loss": 0.0655,
"step": 1372
},
{
"epoch": 0.4649213773253476,
"grad_norm": 0.451171875,
"learning_rate": 1.199069264180887e-05,
"loss": 0.0583,
"step": 1373
},
{
"epoch": 0.46525999449747096,
"grad_norm": 0.5625,
"learning_rate": 1.1979719496920686e-05,
"loss": 0.0851,
"step": 1374
},
{
"epoch": 0.4655986116695943,
"grad_norm": 0.67578125,
"learning_rate": 1.1968743870453956e-05,
"loss": 0.0895,
"step": 1375
},
{
"epoch": 0.4659372288417176,
"grad_norm": 0.392578125,
"learning_rate": 1.195776577616662e-05,
"loss": 0.0533,
"step": 1376
},
{
"epoch": 0.466275846013841,
"grad_norm": 0.52734375,
"learning_rate": 1.1946785227819726e-05,
"loss": 0.0661,
"step": 1377
},
{
"epoch": 0.46661446318596433,
"grad_norm": 0.5,
"learning_rate": 1.1935802239177387e-05,
"loss": 0.0636,
"step": 1378
},
{
"epoch": 0.46695308035808764,
"grad_norm": 0.435546875,
"learning_rate": 1.1924816824006787e-05,
"loss": 0.0596,
"step": 1379
},
{
"epoch": 0.467291697530211,
"grad_norm": 0.443359375,
"learning_rate": 1.1913828996078136e-05,
"loss": 0.054,
"step": 1380
},
{
"epoch": 0.46763031470233435,
"grad_norm": 0.494140625,
"learning_rate": 1.1902838769164685e-05,
"loss": 0.0634,
"step": 1381
},
{
"epoch": 0.4679689318744577,
"grad_norm": 0.63671875,
"learning_rate": 1.1891846157042678e-05,
"loss": 0.0675,
"step": 1382
},
{
"epoch": 0.468307549046581,
"grad_norm": 0.59765625,
"learning_rate": 1.1880851173491361e-05,
"loss": 0.0691,
"step": 1383
},
{
"epoch": 0.46864616621870436,
"grad_norm": 0.71484375,
"learning_rate": 1.1869853832292944e-05,
"loss": 0.1164,
"step": 1384
},
{
"epoch": 0.4689847833908277,
"grad_norm": 0.74609375,
"learning_rate": 1.1858854147232595e-05,
"loss": 0.0892,
"step": 1385
},
{
"epoch": 0.4693234005629511,
"grad_norm": 0.5703125,
"learning_rate": 1.184785213209842e-05,
"loss": 0.0762,
"step": 1386
},
{
"epoch": 0.4696620177350744,
"grad_norm": 0.515625,
"learning_rate": 1.1836847800681443e-05,
"loss": 0.0613,
"step": 1387
},
{
"epoch": 0.47000063490719773,
"grad_norm": 1.1796875,
"learning_rate": 1.1825841166775605e-05,
"loss": 0.0655,
"step": 1388
},
{
"epoch": 0.4703392520793211,
"grad_norm": 0.42578125,
"learning_rate": 1.181483224417771e-05,
"loss": 0.052,
"step": 1389
},
{
"epoch": 0.4706778692514444,
"grad_norm": 0.515625,
"learning_rate": 1.180382104668745e-05,
"loss": 0.045,
"step": 1390
},
{
"epoch": 0.47101648642356775,
"grad_norm": 0.41796875,
"learning_rate": 1.1792807588107358e-05,
"loss": 0.0532,
"step": 1391
},
{
"epoch": 0.4713551035956911,
"grad_norm": 0.890625,
"learning_rate": 1.1781791882242811e-05,
"loss": 0.0719,
"step": 1392
},
{
"epoch": 0.47169372076781446,
"grad_norm": 0.55078125,
"learning_rate": 1.177077394290199e-05,
"loss": 0.0679,
"step": 1393
},
{
"epoch": 0.47203233793993776,
"grad_norm": 0.56640625,
"learning_rate": 1.175975378389589e-05,
"loss": 0.0782,
"step": 1394
},
{
"epoch": 0.4723709551120611,
"grad_norm": 0.419921875,
"learning_rate": 1.1748731419038278e-05,
"loss": 0.0547,
"step": 1395
},
{
"epoch": 0.4727095722841845,
"grad_norm": 0.609375,
"learning_rate": 1.1737706862145688e-05,
"loss": 0.0719,
"step": 1396
},
{
"epoch": 0.4730481894563078,
"grad_norm": 0.431640625,
"learning_rate": 1.1726680127037403e-05,
"loss": 0.063,
"step": 1397
},
{
"epoch": 0.47338680662843113,
"grad_norm": 0.490234375,
"learning_rate": 1.1715651227535441e-05,
"loss": 0.0681,
"step": 1398
},
{
"epoch": 0.4737254238005545,
"grad_norm": 0.494140625,
"learning_rate": 1.170462017746452e-05,
"loss": 0.0679,
"step": 1399
},
{
"epoch": 0.47406404097267785,
"grad_norm": 0.56640625,
"learning_rate": 1.169358699065207e-05,
"loss": 0.0749,
"step": 1400
},
{
"epoch": 0.47440265814480115,
"grad_norm": 0.51171875,
"learning_rate": 1.1682551680928189e-05,
"loss": 0.0639,
"step": 1401
},
{
"epoch": 0.4747412753169245,
"grad_norm": 0.52734375,
"learning_rate": 1.1671514262125638e-05,
"loss": 0.07,
"step": 1402
},
{
"epoch": 0.47507989248904786,
"grad_norm": 0.458984375,
"learning_rate": 1.1660474748079823e-05,
"loss": 0.0539,
"step": 1403
},
{
"epoch": 0.4754185096611712,
"grad_norm": 0.53515625,
"learning_rate": 1.1649433152628775e-05,
"loss": 0.0699,
"step": 1404
},
{
"epoch": 0.4757571268332945,
"grad_norm": 0.60546875,
"learning_rate": 1.1638389489613133e-05,
"loss": 0.0785,
"step": 1405
},
{
"epoch": 0.4760957440054179,
"grad_norm": 0.5234375,
"learning_rate": 1.1627343772876133e-05,
"loss": 0.0577,
"step": 1406
},
{
"epoch": 0.47643436117754123,
"grad_norm": 0.48828125,
"learning_rate": 1.1616296016263581e-05,
"loss": 0.0617,
"step": 1407
},
{
"epoch": 0.47677297834966453,
"grad_norm": 0.498046875,
"learning_rate": 1.1605246233623843e-05,
"loss": 0.0687,
"step": 1408
},
{
"epoch": 0.4771115955217879,
"grad_norm": 0.546875,
"learning_rate": 1.1594194438807817e-05,
"loss": 0.0702,
"step": 1409
},
{
"epoch": 0.47745021269391125,
"grad_norm": 0.515625,
"learning_rate": 1.1583140645668933e-05,
"loss": 0.0706,
"step": 1410
},
{
"epoch": 0.4777888298660346,
"grad_norm": 0.546875,
"learning_rate": 1.157208486806312e-05,
"loss": 0.0633,
"step": 1411
},
{
"epoch": 0.4781274470381579,
"grad_norm": 0.419921875,
"learning_rate": 1.1561027119848793e-05,
"loss": 0.0517,
"step": 1412
},
{
"epoch": 0.47846606421028126,
"grad_norm": 0.5078125,
"learning_rate": 1.1549967414886847e-05,
"loss": 0.073,
"step": 1413
},
{
"epoch": 0.4788046813824046,
"grad_norm": 0.578125,
"learning_rate": 1.153890576704062e-05,
"loss": 0.0749,
"step": 1414
},
{
"epoch": 0.4791432985545279,
"grad_norm": 0.498046875,
"learning_rate": 1.1527842190175886e-05,
"loss": 0.0569,
"step": 1415
},
{
"epoch": 0.4794819157266513,
"grad_norm": 0.5703125,
"learning_rate": 1.1516776698160841e-05,
"loss": 0.0752,
"step": 1416
},
{
"epoch": 0.47982053289877463,
"grad_norm": 0.515625,
"learning_rate": 1.1505709304866084e-05,
"loss": 0.0677,
"step": 1417
},
{
"epoch": 0.480159150070898,
"grad_norm": 0.43359375,
"learning_rate": 1.1494640024164587e-05,
"loss": 0.0518,
"step": 1418
},
{
"epoch": 0.4804977672430213,
"grad_norm": 0.4296875,
"learning_rate": 1.14835688699317e-05,
"loss": 0.055,
"step": 1419
},
{
"epoch": 0.48083638441514465,
"grad_norm": 0.5625,
"learning_rate": 1.1472495856045112e-05,
"loss": 0.073,
"step": 1420
},
{
"epoch": 0.481175001587268,
"grad_norm": 0.51953125,
"learning_rate": 1.1461420996384849e-05,
"loss": 0.0762,
"step": 1421
},
{
"epoch": 0.48151361875939136,
"grad_norm": 0.470703125,
"learning_rate": 1.1450344304833248e-05,
"loss": 0.0513,
"step": 1422
},
{
"epoch": 0.48185223593151466,
"grad_norm": 0.6015625,
"learning_rate": 1.1439265795274941e-05,
"loss": 0.0863,
"step": 1423
},
{
"epoch": 0.482190853103638,
"grad_norm": 0.45703125,
"learning_rate": 1.142818548159684e-05,
"loss": 0.0618,
"step": 1424
},
{
"epoch": 0.4825294702757614,
"grad_norm": 0.59765625,
"learning_rate": 1.1417103377688121e-05,
"loss": 0.0715,
"step": 1425
},
{
"epoch": 0.4828680874478847,
"grad_norm": 0.47265625,
"learning_rate": 1.1406019497440206e-05,
"loss": 0.0583,
"step": 1426
},
{
"epoch": 0.48320670462000803,
"grad_norm": 0.578125,
"learning_rate": 1.1394933854746733e-05,
"loss": 0.078,
"step": 1427
},
{
"epoch": 0.4835453217921314,
"grad_norm": 0.5,
"learning_rate": 1.1383846463503558e-05,
"loss": 0.0681,
"step": 1428
},
{
"epoch": 0.48388393896425475,
"grad_norm": 0.58984375,
"learning_rate": 1.1372757337608732e-05,
"loss": 0.0879,
"step": 1429
},
{
"epoch": 0.48422255613637805,
"grad_norm": 0.53125,
"learning_rate": 1.1361666490962468e-05,
"loss": 0.0716,
"step": 1430
},
{
"epoch": 0.4845611733085014,
"grad_norm": 0.5546875,
"learning_rate": 1.1350573937467147e-05,
"loss": 0.0754,
"step": 1431
},
{
"epoch": 0.48489979048062476,
"grad_norm": 0.439453125,
"learning_rate": 1.1339479691027284e-05,
"loss": 0.0527,
"step": 1432
},
{
"epoch": 0.4852384076527481,
"grad_norm": 0.40234375,
"learning_rate": 1.132838376554952e-05,
"loss": 0.0522,
"step": 1433
},
{
"epoch": 0.4855770248248714,
"grad_norm": 0.46484375,
"learning_rate": 1.1317286174942596e-05,
"loss": 0.0715,
"step": 1434
},
{
"epoch": 0.4859156419969948,
"grad_norm": 0.48828125,
"learning_rate": 1.1306186933117343e-05,
"loss": 0.0668,
"step": 1435
},
{
"epoch": 0.48625425916911813,
"grad_norm": 0.44921875,
"learning_rate": 1.1295086053986664e-05,
"loss": 0.0657,
"step": 1436
},
{
"epoch": 0.48659287634124143,
"grad_norm": 0.47265625,
"learning_rate": 1.1283983551465512e-05,
"loss": 0.058,
"step": 1437
},
{
"epoch": 0.4869314935133648,
"grad_norm": 0.490234375,
"learning_rate": 1.127287943947087e-05,
"loss": 0.0614,
"step": 1438
},
{
"epoch": 0.48727011068548814,
"grad_norm": 0.62109375,
"learning_rate": 1.1261773731921746e-05,
"loss": 0.0736,
"step": 1439
},
{
"epoch": 0.4876087278576115,
"grad_norm": 0.46484375,
"learning_rate": 1.1250666442739149e-05,
"loss": 0.0513,
"step": 1440
},
{
"epoch": 0.4879473450297348,
"grad_norm": 0.640625,
"learning_rate": 1.1239557585846066e-05,
"loss": 0.0689,
"step": 1441
},
{
"epoch": 0.48828596220185816,
"grad_norm": 0.6171875,
"learning_rate": 1.1228447175167443e-05,
"loss": 0.065,
"step": 1442
},
{
"epoch": 0.4886245793739815,
"grad_norm": 0.470703125,
"learning_rate": 1.1217335224630186e-05,
"loss": 0.054,
"step": 1443
},
{
"epoch": 0.4889631965461048,
"grad_norm": 0.53515625,
"learning_rate": 1.1206221748163127e-05,
"loss": 0.0709,
"step": 1444
},
{
"epoch": 0.4893018137182282,
"grad_norm": 2.390625,
"learning_rate": 1.1195106759697005e-05,
"loss": 0.0699,
"step": 1445
},
{
"epoch": 0.48964043089035153,
"grad_norm": 0.53125,
"learning_rate": 1.1183990273164464e-05,
"loss": 0.0593,
"step": 1446
},
{
"epoch": 0.4899790480624749,
"grad_norm": 0.4609375,
"learning_rate": 1.1172872302500017e-05,
"loss": 0.0554,
"step": 1447
},
{
"epoch": 0.4903176652345982,
"grad_norm": 0.50390625,
"learning_rate": 1.1161752861640046e-05,
"loss": 0.0639,
"step": 1448
},
{
"epoch": 0.49065628240672154,
"grad_norm": 0.5703125,
"learning_rate": 1.1150631964522767e-05,
"loss": 0.0659,
"step": 1449
},
{
"epoch": 0.4909948995788449,
"grad_norm": 0.5234375,
"learning_rate": 1.1139509625088225e-05,
"loss": 0.0659,
"step": 1450
},
{
"epoch": 0.49133351675096826,
"grad_norm": 0.494140625,
"learning_rate": 1.1128385857278274e-05,
"loss": 0.0584,
"step": 1451
},
{
"epoch": 0.49167213392309156,
"grad_norm": 0.54296875,
"learning_rate": 1.1117260675036563e-05,
"loss": 0.0791,
"step": 1452
},
{
"epoch": 0.4920107510952149,
"grad_norm": 0.482421875,
"learning_rate": 1.1106134092308502e-05,
"loss": 0.0626,
"step": 1453
},
{
"epoch": 0.49234936826733827,
"grad_norm": 0.5390625,
"learning_rate": 1.1095006123041262e-05,
"loss": 0.0627,
"step": 1454
},
{
"epoch": 0.4926879854394616,
"grad_norm": 0.73046875,
"learning_rate": 1.1083876781183762e-05,
"loss": 0.047,
"step": 1455
},
{
"epoch": 0.49302660261158493,
"grad_norm": 1.1328125,
"learning_rate": 1.1072746080686628e-05,
"loss": 0.0471,
"step": 1456
},
{
"epoch": 0.4933652197837083,
"grad_norm": 0.625,
"learning_rate": 1.1061614035502193e-05,
"loss": 0.0858,
"step": 1457
},
{
"epoch": 0.49370383695583164,
"grad_norm": 0.462890625,
"learning_rate": 1.1050480659584475e-05,
"loss": 0.0583,
"step": 1458
},
{
"epoch": 0.49404245412795494,
"grad_norm": 0.41796875,
"learning_rate": 1.1039345966889167e-05,
"loss": 0.0484,
"step": 1459
},
{
"epoch": 0.4943810713000783,
"grad_norm": 0.59765625,
"learning_rate": 1.1028209971373605e-05,
"loss": 0.0672,
"step": 1460
},
{
"epoch": 0.49471968847220166,
"grad_norm": 0.4921875,
"learning_rate": 1.101707268699676e-05,
"loss": 0.063,
"step": 1461
},
{
"epoch": 0.495058305644325,
"grad_norm": 0.4609375,
"learning_rate": 1.1005934127719218e-05,
"loss": 0.0549,
"step": 1462
},
{
"epoch": 0.4953969228164483,
"grad_norm": 0.62109375,
"learning_rate": 1.0994794307503162e-05,
"loss": 0.0881,
"step": 1463
},
{
"epoch": 0.49573553998857167,
"grad_norm": 0.5078125,
"learning_rate": 1.0983653240312364e-05,
"loss": 0.0701,
"step": 1464
},
{
"epoch": 0.49607415716069503,
"grad_norm": 0.5078125,
"learning_rate": 1.0972510940112149e-05,
"loss": 0.0641,
"step": 1465
},
{
"epoch": 0.49641277433281833,
"grad_norm": 0.466796875,
"learning_rate": 1.0961367420869387e-05,
"loss": 0.0599,
"step": 1466
},
{
"epoch": 0.4967513915049417,
"grad_norm": 0.5546875,
"learning_rate": 1.0950222696552487e-05,
"loss": 0.0651,
"step": 1467
},
{
"epoch": 0.49709000867706504,
"grad_norm": 0.51953125,
"learning_rate": 1.0939076781131357e-05,
"loss": 0.0631,
"step": 1468
},
{
"epoch": 0.4974286258491884,
"grad_norm": 0.5546875,
"learning_rate": 1.0927929688577408e-05,
"loss": 0.0606,
"step": 1469
},
{
"epoch": 0.4977672430213117,
"grad_norm": 0.5859375,
"learning_rate": 1.0916781432863514e-05,
"loss": 0.064,
"step": 1470
},
{
"epoch": 0.49810586019343506,
"grad_norm": 0.40625,
"learning_rate": 1.0905632027964024e-05,
"loss": 0.0527,
"step": 1471
},
{
"epoch": 0.4984444773655584,
"grad_norm": 0.359375,
"learning_rate": 1.0894481487854711e-05,
"loss": 0.0429,
"step": 1472
},
{
"epoch": 0.4987830945376817,
"grad_norm": 0.60546875,
"learning_rate": 1.0883329826512779e-05,
"loss": 0.0731,
"step": 1473
},
{
"epoch": 0.49912171170980507,
"grad_norm": 0.65625,
"learning_rate": 1.087217705791684e-05,
"loss": 0.088,
"step": 1474
},
{
"epoch": 0.49946032888192843,
"grad_norm": 0.390625,
"learning_rate": 1.0861023196046885e-05,
"loss": 0.0539,
"step": 1475
},
{
"epoch": 0.4997989460540518,
"grad_norm": 0.431640625,
"learning_rate": 1.0849868254884284e-05,
"loss": 0.0572,
"step": 1476
},
{
"epoch": 0.5001375632261751,
"grad_norm": 0.427734375,
"learning_rate": 1.0838712248411754e-05,
"loss": 0.0495,
"step": 1477
},
{
"epoch": 0.5004761803982984,
"grad_norm": 0.455078125,
"learning_rate": 1.0827555190613353e-05,
"loss": 0.0592,
"step": 1478
},
{
"epoch": 0.5008147975704218,
"grad_norm": 0.57421875,
"learning_rate": 1.0816397095474454e-05,
"loss": 0.0719,
"step": 1479
},
{
"epoch": 0.5011534147425452,
"grad_norm": 0.408203125,
"learning_rate": 1.0805237976981729e-05,
"loss": 0.0547,
"step": 1480
},
{
"epoch": 0.5011534147425452,
"eval_loss": 0.06768392771482468,
"eval_runtime": 815.5247,
"eval_samples_per_second": 12.198,
"eval_steps_per_second": 3.05,
"step": 1480
},
{
"epoch": 0.5014920319146685,
"grad_norm": 0.482421875,
"learning_rate": 1.0794077849123134e-05,
"loss": 0.0581,
"step": 1481
},
{
"epoch": 0.5018306490867919,
"grad_norm": 0.5234375,
"learning_rate": 1.0782916725887888e-05,
"loss": 0.0647,
"step": 1482
},
{
"epoch": 0.5021692662589151,
"grad_norm": 0.4765625,
"learning_rate": 1.0771754621266466e-05,
"loss": 0.0725,
"step": 1483
},
{
"epoch": 0.5025078834310385,
"grad_norm": 0.494140625,
"learning_rate": 1.0760591549250561e-05,
"loss": 0.0648,
"step": 1484
},
{
"epoch": 0.5028465006031618,
"grad_norm": 0.5859375,
"learning_rate": 1.0749427523833084e-05,
"loss": 0.0707,
"step": 1485
},
{
"epoch": 0.5031851177752852,
"grad_norm": 0.5546875,
"learning_rate": 1.0738262559008148e-05,
"loss": 0.0649,
"step": 1486
},
{
"epoch": 0.5035237349474085,
"grad_norm": 0.39453125,
"learning_rate": 1.0727096668771035e-05,
"loss": 0.0522,
"step": 1487
},
{
"epoch": 0.5038623521195319,
"grad_norm": 0.92578125,
"learning_rate": 1.0715929867118187e-05,
"loss": 0.0691,
"step": 1488
},
{
"epoch": 0.5042009692916553,
"grad_norm": 0.447265625,
"learning_rate": 1.0704762168047189e-05,
"loss": 0.0571,
"step": 1489
},
{
"epoch": 0.5045395864637785,
"grad_norm": 0.5,
"learning_rate": 1.069359358555676e-05,
"loss": 0.0701,
"step": 1490
},
{
"epoch": 0.5048782036359019,
"grad_norm": 0.5390625,
"learning_rate": 1.0682424133646712e-05,
"loss": 0.0739,
"step": 1491
},
{
"epoch": 0.5052168208080252,
"grad_norm": 0.443359375,
"learning_rate": 1.0671253826317957e-05,
"loss": 0.0613,
"step": 1492
},
{
"epoch": 0.5055554379801486,
"grad_norm": 0.56640625,
"learning_rate": 1.0660082677572474e-05,
"loss": 0.0781,
"step": 1493
},
{
"epoch": 0.5058940551522719,
"grad_norm": 0.5625,
"learning_rate": 1.0648910701413306e-05,
"loss": 0.0718,
"step": 1494
},
{
"epoch": 0.5062326723243953,
"grad_norm": 0.5546875,
"learning_rate": 1.0637737911844516e-05,
"loss": 0.0781,
"step": 1495
},
{
"epoch": 0.5065712894965186,
"grad_norm": 0.57421875,
"learning_rate": 1.0626564322871205e-05,
"loss": 0.09,
"step": 1496
},
{
"epoch": 0.5069099066686419,
"grad_norm": 0.39453125,
"learning_rate": 1.061538994849946e-05,
"loss": 0.0554,
"step": 1497
},
{
"epoch": 0.5072485238407652,
"grad_norm": 0.52734375,
"learning_rate": 1.0604214802736366e-05,
"loss": 0.0735,
"step": 1498
},
{
"epoch": 0.5075871410128886,
"grad_norm": 0.462890625,
"learning_rate": 1.0593038899589968e-05,
"loss": 0.0592,
"step": 1499
},
{
"epoch": 0.507925758185012,
"grad_norm": 0.38671875,
"learning_rate": 1.0581862253069262e-05,
"loss": 0.0484,
"step": 1500
},
{
"epoch": 0.5082643753571353,
"grad_norm": 0.62109375,
"learning_rate": 1.0570684877184169e-05,
"loss": 0.0938,
"step": 1501
},
{
"epoch": 0.5086029925292587,
"grad_norm": 0.640625,
"learning_rate": 1.0559506785945538e-05,
"loss": 0.0768,
"step": 1502
},
{
"epoch": 0.508941609701382,
"grad_norm": 0.48828125,
"learning_rate": 1.0548327993365108e-05,
"loss": 0.0552,
"step": 1503
},
{
"epoch": 0.5092802268735053,
"grad_norm": 0.412109375,
"learning_rate": 1.0537148513455493e-05,
"loss": 0.0519,
"step": 1504
},
{
"epoch": 0.5096188440456286,
"grad_norm": 0.51171875,
"learning_rate": 1.0525968360230173e-05,
"loss": 0.0869,
"step": 1505
},
{
"epoch": 0.509957461217752,
"grad_norm": 0.59375,
"learning_rate": 1.0514787547703466e-05,
"loss": 0.0748,
"step": 1506
},
{
"epoch": 0.5102960783898753,
"grad_norm": 0.5546875,
"learning_rate": 1.050360608989053e-05,
"loss": 0.0689,
"step": 1507
},
{
"epoch": 0.5106346955619987,
"grad_norm": 0.490234375,
"learning_rate": 1.0492424000807316e-05,
"loss": 0.0596,
"step": 1508
},
{
"epoch": 0.510973312734122,
"grad_norm": 0.361328125,
"learning_rate": 1.0481241294470578e-05,
"loss": 0.0427,
"step": 1509
},
{
"epoch": 0.5113119299062454,
"grad_norm": 0.5390625,
"learning_rate": 1.047005798489784e-05,
"loss": 0.0608,
"step": 1510
},
{
"epoch": 0.5116505470783688,
"grad_norm": 0.474609375,
"learning_rate": 1.0458874086107379e-05,
"loss": 0.0565,
"step": 1511
},
{
"epoch": 0.511989164250492,
"grad_norm": 0.44140625,
"learning_rate": 1.0447689612118208e-05,
"loss": 0.0595,
"step": 1512
},
{
"epoch": 0.5123277814226154,
"grad_norm": 1.046875,
"learning_rate": 1.0436504576950077e-05,
"loss": 0.05,
"step": 1513
},
{
"epoch": 0.5126663985947387,
"grad_norm": 0.421875,
"learning_rate": 1.0425318994623423e-05,
"loss": 0.0583,
"step": 1514
},
{
"epoch": 0.5130050157668621,
"grad_norm": 0.56640625,
"learning_rate": 1.0414132879159375e-05,
"loss": 0.0612,
"step": 1515
},
{
"epoch": 0.5133436329389854,
"grad_norm": 0.7421875,
"learning_rate": 1.0402946244579726e-05,
"loss": 0.1383,
"step": 1516
},
{
"epoch": 0.5136822501111088,
"grad_norm": 0.4140625,
"learning_rate": 1.0391759104906928e-05,
"loss": 0.0571,
"step": 1517
},
{
"epoch": 0.5140208672832322,
"grad_norm": 0.52734375,
"learning_rate": 1.038057147416406e-05,
"loss": 0.0564,
"step": 1518
},
{
"epoch": 0.5143594844553554,
"grad_norm": 0.43359375,
"learning_rate": 1.0369383366374819e-05,
"loss": 0.0551,
"step": 1519
},
{
"epoch": 0.5146981016274788,
"grad_norm": 0.466796875,
"learning_rate": 1.0358194795563497e-05,
"loss": 0.0617,
"step": 1520
},
{
"epoch": 0.5150367187996021,
"grad_norm": 0.53515625,
"learning_rate": 1.0347005775754969e-05,
"loss": 0.0756,
"step": 1521
},
{
"epoch": 0.5153753359717255,
"grad_norm": 0.46875,
"learning_rate": 1.0335816320974672e-05,
"loss": 0.0606,
"step": 1522
},
{
"epoch": 0.5157139531438488,
"grad_norm": 0.41796875,
"learning_rate": 1.0324626445248592e-05,
"loss": 0.0454,
"step": 1523
},
{
"epoch": 0.5160525703159722,
"grad_norm": 0.53125,
"learning_rate": 1.0313436162603231e-05,
"loss": 0.0752,
"step": 1524
},
{
"epoch": 0.5163911874880955,
"grad_norm": 0.546875,
"learning_rate": 1.0302245487065621e-05,
"loss": 0.0705,
"step": 1525
},
{
"epoch": 0.5167298046602188,
"grad_norm": 0.498046875,
"learning_rate": 1.0291054432663267e-05,
"loss": 0.0666,
"step": 1526
},
{
"epoch": 0.5170684218323421,
"grad_norm": 0.58984375,
"learning_rate": 1.0279863013424154e-05,
"loss": 0.0596,
"step": 1527
},
{
"epoch": 0.5174070390044655,
"grad_norm": 0.578125,
"learning_rate": 1.0268671243376733e-05,
"loss": 0.0686,
"step": 1528
},
{
"epoch": 0.5177456561765889,
"grad_norm": 0.53125,
"learning_rate": 1.0257479136549889e-05,
"loss": 0.0569,
"step": 1529
},
{
"epoch": 0.5180842733487122,
"grad_norm": 0.458984375,
"learning_rate": 1.0246286706972923e-05,
"loss": 0.0582,
"step": 1530
},
{
"epoch": 0.5184228905208356,
"grad_norm": 0.5078125,
"learning_rate": 1.023509396867555e-05,
"loss": 0.072,
"step": 1531
},
{
"epoch": 0.5187615076929589,
"grad_norm": 1.078125,
"learning_rate": 1.0223900935687866e-05,
"loss": 0.076,
"step": 1532
},
{
"epoch": 0.5191001248650822,
"grad_norm": 0.45703125,
"learning_rate": 1.0212707622040345e-05,
"loss": 0.0651,
"step": 1533
},
{
"epoch": 0.5194387420372055,
"grad_norm": 0.51953125,
"learning_rate": 1.02015140417638e-05,
"loss": 0.0795,
"step": 1534
},
{
"epoch": 0.5197773592093289,
"grad_norm": 0.3984375,
"learning_rate": 1.0190320208889388e-05,
"loss": 0.0507,
"step": 1535
},
{
"epoch": 0.5201159763814522,
"grad_norm": 0.5,
"learning_rate": 1.0179126137448577e-05,
"loss": 0.0691,
"step": 1536
},
{
"epoch": 0.5204545935535756,
"grad_norm": 0.462890625,
"learning_rate": 1.0167931841473143e-05,
"loss": 0.0529,
"step": 1537
},
{
"epoch": 0.520793210725699,
"grad_norm": 0.51953125,
"learning_rate": 1.0156737334995129e-05,
"loss": 0.0722,
"step": 1538
},
{
"epoch": 0.5211318278978223,
"grad_norm": 0.4140625,
"learning_rate": 1.014554263204685e-05,
"loss": 0.0621,
"step": 1539
},
{
"epoch": 0.5214704450699457,
"grad_norm": 0.474609375,
"learning_rate": 1.013434774666087e-05,
"loss": 0.0425,
"step": 1540
},
{
"epoch": 0.5218090622420689,
"grad_norm": 0.47265625,
"learning_rate": 1.0123152692869981e-05,
"loss": 0.056,
"step": 1541
},
{
"epoch": 0.5221476794141923,
"grad_norm": 0.451171875,
"learning_rate": 1.0111957484707182e-05,
"loss": 0.0616,
"step": 1542
},
{
"epoch": 0.5224862965863156,
"grad_norm": 0.482421875,
"learning_rate": 1.0100762136205664e-05,
"loss": 0.0521,
"step": 1543
},
{
"epoch": 0.522824913758439,
"grad_norm": 0.61328125,
"learning_rate": 1.0089566661398802e-05,
"loss": 0.0845,
"step": 1544
},
{
"epoch": 0.5231635309305623,
"grad_norm": 0.53125,
"learning_rate": 1.0078371074320123e-05,
"loss": 0.0735,
"step": 1545
},
{
"epoch": 0.5235021481026857,
"grad_norm": 0.53125,
"learning_rate": 1.0067175389003297e-05,
"loss": 0.0699,
"step": 1546
},
{
"epoch": 0.523840765274809,
"grad_norm": 0.63671875,
"learning_rate": 1.0055979619482112e-05,
"loss": 0.0785,
"step": 1547
},
{
"epoch": 0.5241793824469323,
"grad_norm": 0.55078125,
"learning_rate": 1.0044783779790472e-05,
"loss": 0.0614,
"step": 1548
},
{
"epoch": 0.5245179996190557,
"grad_norm": 0.60546875,
"learning_rate": 1.0033587883962362e-05,
"loss": 0.0635,
"step": 1549
},
{
"epoch": 0.524856616791179,
"grad_norm": 0.478515625,
"learning_rate": 1.0022391946031832e-05,
"loss": 0.0542,
"step": 1550
},
{
"epoch": 0.5251952339633024,
"grad_norm": 0.53125,
"learning_rate": 1.0011195980032996e-05,
"loss": 0.067,
"step": 1551
},
{
"epoch": 0.5255338511354257,
"grad_norm": 0.53125,
"learning_rate": 1e-05,
"loss": 0.0592,
"step": 1552
},
{
"epoch": 0.5258724683075491,
"grad_norm": 0.578125,
"learning_rate": 9.988804019967005e-06,
"loss": 0.0721,
"step": 1553
},
{
"epoch": 0.5262110854796724,
"grad_norm": 0.48046875,
"learning_rate": 9.977608053968172e-06,
"loss": 0.064,
"step": 1554
},
{
"epoch": 0.5265497026517957,
"grad_norm": 0.6640625,
"learning_rate": 9.966412116037643e-06,
"loss": 0.0672,
"step": 1555
},
{
"epoch": 0.526888319823919,
"grad_norm": 0.423828125,
"learning_rate": 9.95521622020953e-06,
"loss": 0.0526,
"step": 1556
},
{
"epoch": 0.5272269369960424,
"grad_norm": 0.63671875,
"learning_rate": 9.94402038051789e-06,
"loss": 0.0733,
"step": 1557
},
{
"epoch": 0.5275655541681658,
"grad_norm": 0.4609375,
"learning_rate": 9.932824610996706e-06,
"loss": 0.0561,
"step": 1558
},
{
"epoch": 0.5279041713402891,
"grad_norm": 0.5546875,
"learning_rate": 9.921628925679877e-06,
"loss": 0.072,
"step": 1559
},
{
"epoch": 0.5282427885124125,
"grad_norm": 0.44921875,
"learning_rate": 9.910433338601198e-06,
"loss": 0.0561,
"step": 1560
},
{
"epoch": 0.5285814056845358,
"grad_norm": 0.62890625,
"learning_rate": 9.899237863794336e-06,
"loss": 0.0679,
"step": 1561
},
{
"epoch": 0.5289200228566591,
"grad_norm": 0.427734375,
"learning_rate": 9.888042515292821e-06,
"loss": 0.0552,
"step": 1562
},
{
"epoch": 0.5292586400287824,
"grad_norm": 0.5859375,
"learning_rate": 9.876847307130024e-06,
"loss": 0.0788,
"step": 1563
},
{
"epoch": 0.5295972572009058,
"grad_norm": 0.6171875,
"learning_rate": 9.865652253339133e-06,
"loss": 0.0774,
"step": 1564
},
{
"epoch": 0.5299358743730291,
"grad_norm": 0.427734375,
"learning_rate": 9.854457367953155e-06,
"loss": 0.0599,
"step": 1565
},
{
"epoch": 0.5302744915451525,
"grad_norm": 0.498046875,
"learning_rate": 9.843262665004876e-06,
"loss": 0.062,
"step": 1566
},
{
"epoch": 0.5306131087172758,
"grad_norm": 0.57421875,
"learning_rate": 9.832068158526862e-06,
"loss": 0.0831,
"step": 1567
},
{
"epoch": 0.5309517258893992,
"grad_norm": 0.404296875,
"learning_rate": 9.820873862551425e-06,
"loss": 0.053,
"step": 1568
},
{
"epoch": 0.5312903430615225,
"grad_norm": 1.28125,
"learning_rate": 9.809679791110615e-06,
"loss": 0.0688,
"step": 1569
},
{
"epoch": 0.5316289602336458,
"grad_norm": 0.58203125,
"learning_rate": 9.798485958236203e-06,
"loss": 0.0557,
"step": 1570
},
{
"epoch": 0.5319675774057692,
"grad_norm": 0.51953125,
"learning_rate": 9.787292377959659e-06,
"loss": 0.0671,
"step": 1571
},
{
"epoch": 0.5323061945778925,
"grad_norm": 0.546875,
"learning_rate": 9.776099064312135e-06,
"loss": 0.0679,
"step": 1572
},
{
"epoch": 0.5326448117500159,
"grad_norm": 0.7265625,
"learning_rate": 9.764906031324454e-06,
"loss": 0.0996,
"step": 1573
},
{
"epoch": 0.5329834289221392,
"grad_norm": 0.56640625,
"learning_rate": 9.75371329302708e-06,
"loss": 0.0634,
"step": 1574
},
{
"epoch": 0.5333220460942626,
"grad_norm": 0.41796875,
"learning_rate": 9.742520863450116e-06,
"loss": 0.054,
"step": 1575
},
{
"epoch": 0.533660663266386,
"grad_norm": 0.451171875,
"learning_rate": 9.731328756623269e-06,
"loss": 0.059,
"step": 1576
},
{
"epoch": 0.5339992804385092,
"grad_norm": 0.5,
"learning_rate": 9.720136986575849e-06,
"loss": 0.0614,
"step": 1577
},
{
"epoch": 0.5343378976106326,
"grad_norm": 0.4140625,
"learning_rate": 9.708945567336736e-06,
"loss": 0.0475,
"step": 1578
},
{
"epoch": 0.5346765147827559,
"grad_norm": 0.5234375,
"learning_rate": 9.69775451293438e-06,
"loss": 0.0628,
"step": 1579
},
{
"epoch": 0.5350151319548793,
"grad_norm": 0.455078125,
"learning_rate": 9.686563837396769e-06,
"loss": 0.0635,
"step": 1580
},
{
"epoch": 0.5353537491270026,
"grad_norm": 0.7890625,
"learning_rate": 9.675373554751412e-06,
"loss": 0.0987,
"step": 1581
},
{
"epoch": 0.535692366299126,
"grad_norm": 0.453125,
"learning_rate": 9.664183679025327e-06,
"loss": 0.061,
"step": 1582
},
{
"epoch": 0.5360309834712493,
"grad_norm": 0.5546875,
"learning_rate": 9.652994224245033e-06,
"loss": 0.0729,
"step": 1583
},
{
"epoch": 0.5363696006433726,
"grad_norm": 0.462890625,
"learning_rate": 9.641805204436508e-06,
"loss": 0.0598,
"step": 1584
},
{
"epoch": 0.5367082178154959,
"grad_norm": 0.5703125,
"learning_rate": 9.630616633625186e-06,
"loss": 0.0672,
"step": 1585
},
{
"epoch": 0.5370468349876193,
"grad_norm": 0.466796875,
"learning_rate": 9.619428525835944e-06,
"loss": 0.0625,
"step": 1586
},
{
"epoch": 0.5373854521597426,
"grad_norm": 0.64453125,
"learning_rate": 9.608240895093077e-06,
"loss": 0.0487,
"step": 1587
},
{
"epoch": 0.537724069331866,
"grad_norm": 0.498046875,
"learning_rate": 9.597053755420277e-06,
"loss": 0.0708,
"step": 1588
},
{
"epoch": 0.5380626865039894,
"grad_norm": 0.52734375,
"learning_rate": 9.58586712084063e-06,
"loss": 0.0683,
"step": 1589
},
{
"epoch": 0.5384013036761127,
"grad_norm": 0.427734375,
"learning_rate": 9.57468100537658e-06,
"loss": 0.0504,
"step": 1590
},
{
"epoch": 0.538739920848236,
"grad_norm": 0.453125,
"learning_rate": 9.563495423049925e-06,
"loss": 0.0582,
"step": 1591
},
{
"epoch": 0.5390785380203593,
"grad_norm": 0.50390625,
"learning_rate": 9.552310387881793e-06,
"loss": 0.0629,
"step": 1592
},
{
"epoch": 0.5394171551924827,
"grad_norm": 0.6796875,
"learning_rate": 9.541125913892625e-06,
"loss": 0.0937,
"step": 1593
},
{
"epoch": 0.539755772364606,
"grad_norm": 0.5625,
"learning_rate": 9.529942015102164e-06,
"loss": 0.079,
"step": 1594
},
{
"epoch": 0.5400943895367294,
"grad_norm": 0.5078125,
"learning_rate": 9.518758705529423e-06,
"loss": 0.0697,
"step": 1595
},
{
"epoch": 0.5404330067088527,
"grad_norm": 0.4375,
"learning_rate": 9.507575999192686e-06,
"loss": 0.0548,
"step": 1596
},
{
"epoch": 0.5407716238809761,
"grad_norm": 0.384765625,
"learning_rate": 9.496393910109473e-06,
"loss": 0.0503,
"step": 1597
},
{
"epoch": 0.5411102410530994,
"grad_norm": 0.5859375,
"learning_rate": 9.485212452296535e-06,
"loss": 0.0829,
"step": 1598
},
{
"epoch": 0.5414488582252227,
"grad_norm": 0.498046875,
"learning_rate": 9.474031639769832e-06,
"loss": 0.058,
"step": 1599
},
{
"epoch": 0.5417874753973461,
"grad_norm": 0.61328125,
"learning_rate": 9.46285148654451e-06,
"loss": 0.0779,
"step": 1600
},
{
"epoch": 0.5421260925694694,
"grad_norm": 0.498046875,
"learning_rate": 9.451672006634892e-06,
"loss": 0.0568,
"step": 1601
},
{
"epoch": 0.5424647097415928,
"grad_norm": 0.478515625,
"learning_rate": 9.44049321405446e-06,
"loss": 0.0697,
"step": 1602
},
{
"epoch": 0.5428033269137161,
"grad_norm": 0.6015625,
"learning_rate": 9.429315122815831e-06,
"loss": 0.0661,
"step": 1603
},
{
"epoch": 0.5431419440858395,
"grad_norm": 0.396484375,
"learning_rate": 9.418137746930743e-06,
"loss": 0.0526,
"step": 1604
},
{
"epoch": 0.5434805612579628,
"grad_norm": 0.546875,
"learning_rate": 9.406961100410033e-06,
"loss": 0.0715,
"step": 1605
},
{
"epoch": 0.5438191784300861,
"grad_norm": 0.64453125,
"learning_rate": 9.395785197263638e-06,
"loss": 0.0763,
"step": 1606
},
{
"epoch": 0.5441577956022094,
"grad_norm": 0.56640625,
"learning_rate": 9.384610051500546e-06,
"loss": 0.0883,
"step": 1607
},
{
"epoch": 0.5444964127743328,
"grad_norm": 0.470703125,
"learning_rate": 9.3734356771288e-06,
"loss": 0.0611,
"step": 1608
},
{
"epoch": 0.5448350299464562,
"grad_norm": 0.4140625,
"learning_rate": 9.362262088155487e-06,
"loss": 0.0593,
"step": 1609
},
{
"epoch": 0.5451736471185795,
"grad_norm": 0.408203125,
"learning_rate": 9.351089298586699e-06,
"loss": 0.0573,
"step": 1610
},
{
"epoch": 0.5455122642907029,
"grad_norm": 0.59375,
"learning_rate": 9.339917322427528e-06,
"loss": 0.0757,
"step": 1611
},
{
"epoch": 0.5458508814628262,
"grad_norm": 0.51171875,
"learning_rate": 9.328746173682046e-06,
"loss": 0.0641,
"step": 1612
},
{
"epoch": 0.5461894986349495,
"grad_norm": 0.50390625,
"learning_rate": 9.317575866353293e-06,
"loss": 0.0635,
"step": 1613
},
{
"epoch": 0.5465281158070728,
"grad_norm": 0.51953125,
"learning_rate": 9.306406414443246e-06,
"loss": 0.073,
"step": 1614
},
{
"epoch": 0.5468667329791962,
"grad_norm": 0.578125,
"learning_rate": 9.295237831952815e-06,
"loss": 0.0737,
"step": 1615
},
{
"epoch": 0.5472053501513195,
"grad_norm": 0.546875,
"learning_rate": 9.284070132881817e-06,
"loss": 0.0773,
"step": 1616
},
{
"epoch": 0.5475439673234429,
"grad_norm": 0.54296875,
"learning_rate": 9.272903331228968e-06,
"loss": 0.0576,
"step": 1617
},
{
"epoch": 0.5478825844955663,
"grad_norm": 0.498046875,
"learning_rate": 9.261737440991854e-06,
"loss": 0.0701,
"step": 1618
},
{
"epoch": 0.5482212016676896,
"grad_norm": 0.484375,
"learning_rate": 9.250572476166918e-06,
"loss": 0.0601,
"step": 1619
},
{
"epoch": 0.5485598188398129,
"grad_norm": 1.1640625,
"learning_rate": 9.239408450749442e-06,
"loss": 0.0674,
"step": 1620
},
{
"epoch": 0.5488984360119362,
"grad_norm": 0.43359375,
"learning_rate": 9.228245378733537e-06,
"loss": 0.0615,
"step": 1621
},
{
"epoch": 0.5492370531840596,
"grad_norm": 0.458984375,
"learning_rate": 9.217083274112114e-06,
"loss": 0.061,
"step": 1622
},
{
"epoch": 0.5495756703561829,
"grad_norm": 0.486328125,
"learning_rate": 9.20592215087687e-06,
"loss": 0.0649,
"step": 1623
},
{
"epoch": 0.5499142875283063,
"grad_norm": 0.60546875,
"learning_rate": 9.194762023018271e-06,
"loss": 0.0715,
"step": 1624
},
{
"epoch": 0.5502529047004296,
"grad_norm": 0.427734375,
"learning_rate": 9.183602904525546e-06,
"loss": 0.0529,
"step": 1625
},
{
"epoch": 0.550591521872553,
"grad_norm": 0.55078125,
"learning_rate": 9.172444809386647e-06,
"loss": 0.0841,
"step": 1626
},
{
"epoch": 0.5509301390446762,
"grad_norm": 0.578125,
"learning_rate": 9.161287751588249e-06,
"loss": 0.0757,
"step": 1627
},
{
"epoch": 0.5512687562167996,
"grad_norm": 0.447265625,
"learning_rate": 9.150131745115721e-06,
"loss": 0.0556,
"step": 1628
},
{
"epoch": 0.551607373388923,
"grad_norm": 0.435546875,
"learning_rate": 9.138976803953122e-06,
"loss": 0.0578,
"step": 1629
},
{
"epoch": 0.5519459905610463,
"grad_norm": 0.49609375,
"learning_rate": 9.127822942083167e-06,
"loss": 0.064,
"step": 1630
},
{
"epoch": 0.5522846077331697,
"grad_norm": 0.47265625,
"learning_rate": 9.116670173487223e-06,
"loss": 0.059,
"step": 1631
},
{
"epoch": 0.552623224905293,
"grad_norm": 0.9921875,
"learning_rate": 9.105518512145292e-06,
"loss": 0.1812,
"step": 1632
},
{
"epoch": 0.5529618420774164,
"grad_norm": 0.67578125,
"learning_rate": 9.09436797203598e-06,
"loss": 0.0754,
"step": 1633
},
{
"epoch": 0.5533004592495397,
"grad_norm": 0.59765625,
"learning_rate": 9.083218567136487e-06,
"loss": 0.0926,
"step": 1634
},
{
"epoch": 0.553639076421663,
"grad_norm": 0.42578125,
"learning_rate": 9.072070311422595e-06,
"loss": 0.0527,
"step": 1635
},
{
"epoch": 0.5539776935937863,
"grad_norm": 0.51171875,
"learning_rate": 9.060923218868644e-06,
"loss": 0.0603,
"step": 1636
},
{
"epoch": 0.5543163107659097,
"grad_norm": 0.5546875,
"learning_rate": 9.049777303447517e-06,
"loss": 0.0782,
"step": 1637
},
{
"epoch": 0.5546549279380331,
"grad_norm": 0.5859375,
"learning_rate": 9.038632579130617e-06,
"loss": 0.0807,
"step": 1638
},
{
"epoch": 0.5549935451101564,
"grad_norm": 0.625,
"learning_rate": 9.027489059887855e-06,
"loss": 0.071,
"step": 1639
},
{
"epoch": 0.5553321622822798,
"grad_norm": 0.466796875,
"learning_rate": 9.01634675968764e-06,
"loss": 0.0492,
"step": 1640
},
{
"epoch": 0.5556707794544031,
"grad_norm": 0.5078125,
"learning_rate": 9.00520569249684e-06,
"loss": 0.0619,
"step": 1641
},
{
"epoch": 0.5560093966265264,
"grad_norm": 0.412109375,
"learning_rate": 8.994065872280785e-06,
"loss": 0.0572,
"step": 1642
},
{
"epoch": 0.5563480137986497,
"grad_norm": 0.46484375,
"learning_rate": 8.982927313003242e-06,
"loss": 0.069,
"step": 1643
},
{
"epoch": 0.5566866309707731,
"grad_norm": 0.52734375,
"learning_rate": 8.971790028626395e-06,
"loss": 0.0644,
"step": 1644
},
{
"epoch": 0.5570252481428964,
"grad_norm": 0.46484375,
"learning_rate": 8.960654033110834e-06,
"loss": 0.0668,
"step": 1645
},
{
"epoch": 0.5573638653150198,
"grad_norm": 0.427734375,
"learning_rate": 8.949519340415526e-06,
"loss": 0.0586,
"step": 1646
},
{
"epoch": 0.5577024824871432,
"grad_norm": 0.375,
"learning_rate": 8.938385964497807e-06,
"loss": 0.0506,
"step": 1647
},
{
"epoch": 0.5580410996592665,
"grad_norm": 0.65625,
"learning_rate": 8.927253919313377e-06,
"loss": 0.0757,
"step": 1648
},
{
"epoch": 0.5583797168313898,
"grad_norm": 0.5390625,
"learning_rate": 8.916123218816243e-06,
"loss": 0.0689,
"step": 1649
},
{
"epoch": 0.5587183340035131,
"grad_norm": 0.53125,
"learning_rate": 8.90499387695874e-06,
"loss": 0.078,
"step": 1650
},
{
"epoch": 0.5590569511756365,
"grad_norm": 0.40625,
"learning_rate": 8.893865907691503e-06,
"loss": 0.0516,
"step": 1651
},
{
"epoch": 0.5593955683477598,
"grad_norm": 0.61328125,
"learning_rate": 8.882739324963442e-06,
"loss": 0.0698,
"step": 1652
},
{
"epoch": 0.5597341855198832,
"grad_norm": 0.443359375,
"learning_rate": 8.871614142721728e-06,
"loss": 0.0616,
"step": 1653
},
{
"epoch": 0.5600728026920065,
"grad_norm": 0.66015625,
"learning_rate": 8.860490374911777e-06,
"loss": 0.0799,
"step": 1654
},
{
"epoch": 0.5604114198641299,
"grad_norm": 0.83203125,
"learning_rate": 8.849368035477236e-06,
"loss": 0.0669,
"step": 1655
},
{
"epoch": 0.5607500370362531,
"grad_norm": 0.6953125,
"learning_rate": 8.838247138359957e-06,
"loss": 0.1207,
"step": 1656
},
{
"epoch": 0.5610886542083765,
"grad_norm": 0.494140625,
"learning_rate": 8.827127697499985e-06,
"loss": 0.0637,
"step": 1657
},
{
"epoch": 0.5614272713804999,
"grad_norm": 0.4296875,
"learning_rate": 8.816009726835538e-06,
"loss": 0.0543,
"step": 1658
},
{
"epoch": 0.5617658885526232,
"grad_norm": 0.46484375,
"learning_rate": 8.804893240302997e-06,
"loss": 0.0566,
"step": 1659
},
{
"epoch": 0.5621045057247466,
"grad_norm": 0.431640625,
"learning_rate": 8.793778251836878e-06,
"loss": 0.0618,
"step": 1660
},
{
"epoch": 0.5624431228968699,
"grad_norm": 0.59765625,
"learning_rate": 8.782664775369818e-06,
"loss": 0.0639,
"step": 1661
},
{
"epoch": 0.5627817400689933,
"grad_norm": 0.443359375,
"learning_rate": 8.771552824832559e-06,
"loss": 0.0619,
"step": 1662
},
{
"epoch": 0.5631203572411166,
"grad_norm": 0.609375,
"learning_rate": 8.760442414153937e-06,
"loss": 0.0627,
"step": 1663
},
{
"epoch": 0.5634589744132399,
"grad_norm": 0.62109375,
"learning_rate": 8.749333557260851e-06,
"loss": 0.0621,
"step": 1664
},
{
"epoch": 0.5637975915853632,
"grad_norm": 0.52734375,
"learning_rate": 8.738226268078254e-06,
"loss": 0.0725,
"step": 1665
},
{
"epoch": 0.5641362087574866,
"grad_norm": 0.67578125,
"learning_rate": 8.72712056052913e-06,
"loss": 0.0559,
"step": 1666
},
{
"epoch": 0.56447482592961,
"grad_norm": 0.515625,
"learning_rate": 8.71601644853449e-06,
"loss": 0.0639,
"step": 1667
},
{
"epoch": 0.5648134431017333,
"grad_norm": 0.53125,
"learning_rate": 8.704913946013337e-06,
"loss": 0.0652,
"step": 1668
},
{
"epoch": 0.5651520602738567,
"grad_norm": 0.55078125,
"learning_rate": 8.69381306688266e-06,
"loss": 0.0588,
"step": 1669
},
{
"epoch": 0.56549067744598,
"grad_norm": 0.703125,
"learning_rate": 8.682713825057409e-06,
"loss": 0.0987,
"step": 1670
},
{
"epoch": 0.5658292946181033,
"grad_norm": 0.55859375,
"learning_rate": 8.671616234450486e-06,
"loss": 0.0794,
"step": 1671
},
{
"epoch": 0.5661679117902266,
"grad_norm": 0.390625,
"learning_rate": 8.660520308972722e-06,
"loss": 0.0537,
"step": 1672
},
{
"epoch": 0.56650652896235,
"grad_norm": 0.416015625,
"learning_rate": 8.649426062532858e-06,
"loss": 0.0569,
"step": 1673
},
{
"epoch": 0.5668451461344733,
"grad_norm": 0.45703125,
"learning_rate": 8.638333509037537e-06,
"loss": 0.0588,
"step": 1674
},
{
"epoch": 0.5671837633065967,
"grad_norm": 0.42578125,
"learning_rate": 8.627242662391273e-06,
"loss": 0.0688,
"step": 1675
},
{
"epoch": 0.5675223804787201,
"grad_norm": 0.482421875,
"learning_rate": 8.616153536496444e-06,
"loss": 0.0627,
"step": 1676
},
{
"epoch": 0.5678609976508434,
"grad_norm": 0.45703125,
"learning_rate": 8.605066145253269e-06,
"loss": 0.0622,
"step": 1677
},
{
"epoch": 0.5681996148229667,
"grad_norm": 1.1484375,
"learning_rate": 8.593980502559797e-06,
"loss": 0.1008,
"step": 1678
},
{
"epoch": 0.56853823199509,
"grad_norm": 0.5078125,
"learning_rate": 8.58289662231188e-06,
"loss": 0.0611,
"step": 1679
},
{
"epoch": 0.5688768491672134,
"grad_norm": 0.43359375,
"learning_rate": 8.571814518403162e-06,
"loss": 0.0609,
"step": 1680
},
{
"epoch": 0.5692154663393367,
"grad_norm": 0.466796875,
"learning_rate": 8.560734204725064e-06,
"loss": 0.0711,
"step": 1681
},
{
"epoch": 0.5695540835114601,
"grad_norm": 0.46484375,
"learning_rate": 8.549655695166756e-06,
"loss": 0.0548,
"step": 1682
},
{
"epoch": 0.5698927006835834,
"grad_norm": 0.43359375,
"learning_rate": 8.538579003615154e-06,
"loss": 0.0634,
"step": 1683
},
{
"epoch": 0.5702313178557068,
"grad_norm": 0.50390625,
"learning_rate": 8.52750414395489e-06,
"loss": 0.059,
"step": 1684
},
{
"epoch": 0.57056993502783,
"grad_norm": 0.37890625,
"learning_rate": 8.516431130068303e-06,
"loss": 0.0496,
"step": 1685
},
{
"epoch": 0.5709085521999534,
"grad_norm": 0.5078125,
"learning_rate": 8.505359975835413e-06,
"loss": 0.0686,
"step": 1686
},
{
"epoch": 0.5712471693720768,
"grad_norm": 0.53515625,
"learning_rate": 8.494290695133918e-06,
"loss": 0.0561,
"step": 1687
},
{
"epoch": 0.5715857865442001,
"grad_norm": 0.4296875,
"learning_rate": 8.483223301839159e-06,
"loss": 0.0549,
"step": 1688
},
{
"epoch": 0.5719244037163235,
"grad_norm": 0.447265625,
"learning_rate": 8.472157809824115e-06,
"loss": 0.0581,
"step": 1689
},
{
"epoch": 0.5722630208884468,
"grad_norm": 0.5,
"learning_rate": 8.461094232959381e-06,
"loss": 0.0655,
"step": 1690
},
{
"epoch": 0.5726016380605702,
"grad_norm": 0.44921875,
"learning_rate": 8.450032585113156e-06,
"loss": 0.0554,
"step": 1691
},
{
"epoch": 0.5729402552326935,
"grad_norm": 0.45703125,
"learning_rate": 8.438972880151209e-06,
"loss": 0.0606,
"step": 1692
},
{
"epoch": 0.5732788724048168,
"grad_norm": 0.5390625,
"learning_rate": 8.427915131936885e-06,
"loss": 0.0702,
"step": 1693
},
{
"epoch": 0.5736174895769401,
"grad_norm": 0.52734375,
"learning_rate": 8.416859354331072e-06,
"loss": 0.0659,
"step": 1694
},
{
"epoch": 0.5739561067490635,
"grad_norm": 0.435546875,
"learning_rate": 8.405805561192188e-06,
"loss": 0.0478,
"step": 1695
},
{
"epoch": 0.5742947239211869,
"grad_norm": 0.48046875,
"learning_rate": 8.39475376637616e-06,
"loss": 0.0667,
"step": 1696
},
{
"epoch": 0.5746333410933102,
"grad_norm": 0.486328125,
"learning_rate": 8.38370398373642e-06,
"loss": 0.0618,
"step": 1697
},
{
"epoch": 0.5749719582654336,
"grad_norm": 0.5078125,
"learning_rate": 8.372656227123868e-06,
"loss": 0.0609,
"step": 1698
},
{
"epoch": 0.5753105754375569,
"grad_norm": 0.494140625,
"learning_rate": 8.36161051038687e-06,
"loss": 0.0585,
"step": 1699
},
{
"epoch": 0.5756491926096802,
"grad_norm": 0.515625,
"learning_rate": 8.350566847371228e-06,
"loss": 0.0544,
"step": 1700
},
{
"epoch": 0.5759878097818035,
"grad_norm": 0.5390625,
"learning_rate": 8.33952525192018e-06,
"loss": 0.0759,
"step": 1701
},
{
"epoch": 0.5763264269539269,
"grad_norm": 0.796875,
"learning_rate": 8.328485737874365e-06,
"loss": 0.0673,
"step": 1702
},
{
"epoch": 0.5766650441260502,
"grad_norm": 0.51953125,
"learning_rate": 8.317448319071815e-06,
"loss": 0.0662,
"step": 1703
},
{
"epoch": 0.5770036612981736,
"grad_norm": 0.60546875,
"learning_rate": 8.306413009347933e-06,
"loss": 0.0805,
"step": 1704
},
{
"epoch": 0.577342278470297,
"grad_norm": 0.45703125,
"learning_rate": 8.295379822535482e-06,
"loss": 0.0578,
"step": 1705
},
{
"epoch": 0.5776808956424203,
"grad_norm": 0.458984375,
"learning_rate": 8.284348772464564e-06,
"loss": 0.0575,
"step": 1706
},
{
"epoch": 0.5780195128145436,
"grad_norm": 0.50390625,
"learning_rate": 8.273319872962599e-06,
"loss": 0.0549,
"step": 1707
},
{
"epoch": 0.5783581299866669,
"grad_norm": 0.6015625,
"learning_rate": 8.262293137854315e-06,
"loss": 0.0598,
"step": 1708
},
{
"epoch": 0.5786967471587903,
"grad_norm": 0.5703125,
"learning_rate": 8.251268580961724e-06,
"loss": 0.0611,
"step": 1709
},
{
"epoch": 0.5790353643309136,
"grad_norm": 0.44921875,
"learning_rate": 8.24024621610411e-06,
"loss": 0.0528,
"step": 1710
},
{
"epoch": 0.579373981503037,
"grad_norm": 0.52734375,
"learning_rate": 8.229226057098012e-06,
"loss": 0.0724,
"step": 1711
},
{
"epoch": 0.5797125986751603,
"grad_norm": 0.4609375,
"learning_rate": 8.218208117757194e-06,
"loss": 0.0598,
"step": 1712
},
{
"epoch": 0.5800512158472837,
"grad_norm": 0.58984375,
"learning_rate": 8.207192411892645e-06,
"loss": 0.0767,
"step": 1713
},
{
"epoch": 0.5803898330194069,
"grad_norm": 0.369140625,
"learning_rate": 8.196178953312557e-06,
"loss": 0.0515,
"step": 1714
},
{
"epoch": 0.5807284501915303,
"grad_norm": 0.484375,
"learning_rate": 8.185167755822294e-06,
"loss": 0.0664,
"step": 1715
},
{
"epoch": 0.5810670673636537,
"grad_norm": 0.474609375,
"learning_rate": 8.1741588332244e-06,
"loss": 0.06,
"step": 1716
},
{
"epoch": 0.581405684535777,
"grad_norm": 0.5546875,
"learning_rate": 8.163152199318559e-06,
"loss": 0.0656,
"step": 1717
},
{
"epoch": 0.5817443017079004,
"grad_norm": 0.49609375,
"learning_rate": 8.152147867901586e-06,
"loss": 0.059,
"step": 1718
},
{
"epoch": 0.5820829188800237,
"grad_norm": 0.470703125,
"learning_rate": 8.141145852767408e-06,
"loss": 0.0609,
"step": 1719
},
{
"epoch": 0.5824215360521471,
"grad_norm": 0.46484375,
"learning_rate": 8.13014616770706e-06,
"loss": 0.0649,
"step": 1720
},
{
"epoch": 0.5827601532242704,
"grad_norm": 0.41796875,
"learning_rate": 8.119148826508642e-06,
"loss": 0.055,
"step": 1721
},
{
"epoch": 0.5830987703963937,
"grad_norm": 0.58984375,
"learning_rate": 8.108153842957324e-06,
"loss": 0.0735,
"step": 1722
},
{
"epoch": 0.583437387568517,
"grad_norm": 0.51171875,
"learning_rate": 8.09716123083532e-06,
"loss": 0.06,
"step": 1723
},
{
"epoch": 0.5837760047406404,
"grad_norm": 0.8046875,
"learning_rate": 8.086171003921865e-06,
"loss": 0.0472,
"step": 1724
},
{
"epoch": 0.5841146219127638,
"grad_norm": 0.515625,
"learning_rate": 8.075183175993218e-06,
"loss": 0.0706,
"step": 1725
},
{
"epoch": 0.5844532390848871,
"grad_norm": 0.578125,
"learning_rate": 8.064197760822615e-06,
"loss": 0.0567,
"step": 1726
},
{
"epoch": 0.5847918562570105,
"grad_norm": 0.609375,
"learning_rate": 8.053214772180277e-06,
"loss": 0.0868,
"step": 1727
},
{
"epoch": 0.5851304734291338,
"grad_norm": 0.59765625,
"learning_rate": 8.042234223833381e-06,
"loss": 0.0503,
"step": 1728
},
{
"epoch": 0.5854690906012571,
"grad_norm": 0.439453125,
"learning_rate": 8.031256129546046e-06,
"loss": 0.0617,
"step": 1729
},
{
"epoch": 0.5858077077733804,
"grad_norm": 0.3671875,
"learning_rate": 8.020280503079314e-06,
"loss": 0.0443,
"step": 1730
},
{
"epoch": 0.5861463249455038,
"grad_norm": 0.48046875,
"learning_rate": 8.009307358191133e-06,
"loss": 0.0642,
"step": 1731
},
{
"epoch": 0.5864849421176271,
"grad_norm": 0.50390625,
"learning_rate": 7.99833670863634e-06,
"loss": 0.0677,
"step": 1732
},
{
"epoch": 0.5868235592897505,
"grad_norm": 0.52734375,
"learning_rate": 7.987368568166653e-06,
"loss": 0.0724,
"step": 1733
},
{
"epoch": 0.5871621764618739,
"grad_norm": 0.421875,
"learning_rate": 7.976402950530623e-06,
"loss": 0.0529,
"step": 1734
},
{
"epoch": 0.5875007936339972,
"grad_norm": 0.5,
"learning_rate": 7.965439869473664e-06,
"loss": 0.067,
"step": 1735
},
{
"epoch": 0.5878394108061205,
"grad_norm": 0.421875,
"learning_rate": 7.954479338737995e-06,
"loss": 0.0582,
"step": 1736
},
{
"epoch": 0.5881780279782438,
"grad_norm": 0.609375,
"learning_rate": 7.943521372062641e-06,
"loss": 0.0765,
"step": 1737
},
{
"epoch": 0.5885166451503672,
"grad_norm": 0.59375,
"learning_rate": 7.932565983183416e-06,
"loss": 0.0745,
"step": 1738
},
{
"epoch": 0.5888552623224905,
"grad_norm": 0.45703125,
"learning_rate": 7.921613185832897e-06,
"loss": 0.0624,
"step": 1739
},
{
"epoch": 0.5891938794946139,
"grad_norm": 0.455078125,
"learning_rate": 7.910662993740422e-06,
"loss": 0.0571,
"step": 1740
},
{
"epoch": 0.5895324966667372,
"grad_norm": 0.5078125,
"learning_rate": 7.899715420632056e-06,
"loss": 0.0677,
"step": 1741
},
{
"epoch": 0.5898711138388606,
"grad_norm": 0.44140625,
"learning_rate": 7.888770480230582e-06,
"loss": 0.0539,
"step": 1742
},
{
"epoch": 0.5902097310109838,
"grad_norm": 0.5703125,
"learning_rate": 7.87782818625548e-06,
"loss": 0.0742,
"step": 1743
},
{
"epoch": 0.5905483481831072,
"grad_norm": 0.51953125,
"learning_rate": 7.866888552422924e-06,
"loss": 0.0653,
"step": 1744
},
{
"epoch": 0.5908869653552306,
"grad_norm": 0.4609375,
"learning_rate": 7.855951592445743e-06,
"loss": 0.0559,
"step": 1745
},
{
"epoch": 0.5912255825273539,
"grad_norm": 0.470703125,
"learning_rate": 7.845017320033415e-06,
"loss": 0.0502,
"step": 1746
},
{
"epoch": 0.5915641996994773,
"grad_norm": 0.53125,
"learning_rate": 7.834085748892052e-06,
"loss": 0.064,
"step": 1747
},
{
"epoch": 0.5919028168716006,
"grad_norm": 0.4921875,
"learning_rate": 7.823156892724379e-06,
"loss": 0.0646,
"step": 1748
},
{
"epoch": 0.592241434043724,
"grad_norm": 0.7109375,
"learning_rate": 7.81223076522972e-06,
"loss": 0.0848,
"step": 1749
},
{
"epoch": 0.5925800512158473,
"grad_norm": 0.546875,
"learning_rate": 7.801307380103977e-06,
"loss": 0.0657,
"step": 1750
},
{
"epoch": 0.5929186683879706,
"grad_norm": 0.458984375,
"learning_rate": 7.790386751039609e-06,
"loss": 0.0562,
"step": 1751
},
{
"epoch": 0.5932572855600939,
"grad_norm": 0.59375,
"learning_rate": 7.779468891725633e-06,
"loss": 0.0803,
"step": 1752
},
{
"epoch": 0.5935959027322173,
"grad_norm": 0.515625,
"learning_rate": 7.768553815847583e-06,
"loss": 0.0589,
"step": 1753
},
{
"epoch": 0.5939345199043407,
"grad_norm": 0.5859375,
"learning_rate": 7.757641537087509e-06,
"loss": 0.0716,
"step": 1754
},
{
"epoch": 0.594273137076464,
"grad_norm": 0.52734375,
"learning_rate": 7.74673206912395e-06,
"loss": 0.0715,
"step": 1755
},
{
"epoch": 0.5946117542485874,
"grad_norm": 0.50390625,
"learning_rate": 7.735825425631926e-06,
"loss": 0.0671,
"step": 1756
},
{
"epoch": 0.5949503714207107,
"grad_norm": 0.396484375,
"learning_rate": 7.724921620282917e-06,
"loss": 0.0529,
"step": 1757
},
{
"epoch": 0.595288988592834,
"grad_norm": 0.453125,
"learning_rate": 7.71402066674484e-06,
"loss": 0.0607,
"step": 1758
},
{
"epoch": 0.5956276057649573,
"grad_norm": 0.5390625,
"learning_rate": 7.703122578682047e-06,
"loss": 0.0687,
"step": 1759
},
{
"epoch": 0.5959662229370807,
"grad_norm": 0.390625,
"learning_rate": 7.69222736975529e-06,
"loss": 0.0491,
"step": 1760
},
{
"epoch": 0.596304840109204,
"grad_norm": 0.64453125,
"learning_rate": 7.681335053621712e-06,
"loss": 0.0563,
"step": 1761
},
{
"epoch": 0.5966434572813274,
"grad_norm": 0.43359375,
"learning_rate": 7.670445643934833e-06,
"loss": 0.0574,
"step": 1762
},
{
"epoch": 0.5969820744534508,
"grad_norm": 0.5390625,
"learning_rate": 7.659559154344533e-06,
"loss": 0.0558,
"step": 1763
},
{
"epoch": 0.5973206916255741,
"grad_norm": 0.59765625,
"learning_rate": 7.648675598497023e-06,
"loss": 0.0637,
"step": 1764
},
{
"epoch": 0.5976593087976974,
"grad_norm": 0.69140625,
"learning_rate": 7.637794990034843e-06,
"loss": 0.0891,
"step": 1765
},
{
"epoch": 0.5979979259698207,
"grad_norm": 0.51953125,
"learning_rate": 7.626917342596833e-06,
"loss": 0.0642,
"step": 1766
},
{
"epoch": 0.5983365431419441,
"grad_norm": 0.546875,
"learning_rate": 7.616042669818133e-06,
"loss": 0.0673,
"step": 1767
},
{
"epoch": 0.5986751603140674,
"grad_norm": 0.6328125,
"learning_rate": 7.605170985330139e-06,
"loss": 0.0731,
"step": 1768
},
{
"epoch": 0.5990137774861908,
"grad_norm": 0.462890625,
"learning_rate": 7.594302302760512e-06,
"loss": 0.0545,
"step": 1769
},
{
"epoch": 0.5993523946583141,
"grad_norm": 0.53515625,
"learning_rate": 7.5834366357331436e-06,
"loss": 0.0648,
"step": 1770
},
{
"epoch": 0.5996910118304375,
"grad_norm": 0.466796875,
"learning_rate": 7.572573997868151e-06,
"loss": 0.0583,
"step": 1771
},
{
"epoch": 0.6000296290025607,
"grad_norm": 0.66015625,
"learning_rate": 7.5617144027818515e-06,
"loss": 0.1069,
"step": 1772
},
{
"epoch": 0.6003682461746841,
"grad_norm": 0.474609375,
"learning_rate": 7.550857864086747e-06,
"loss": 0.0693,
"step": 1773
},
{
"epoch": 0.6007068633468075,
"grad_norm": 0.427734375,
"learning_rate": 7.540004395391509e-06,
"loss": 0.0567,
"step": 1774
},
{
"epoch": 0.6010454805189308,
"grad_norm": 0.4140625,
"learning_rate": 7.529154010300963e-06,
"loss": 0.0502,
"step": 1775
},
{
"epoch": 0.6013840976910542,
"grad_norm": 0.470703125,
"learning_rate": 7.518306722416074e-06,
"loss": 0.0619,
"step": 1776
},
{
"epoch": 0.6013840976910542,
"eval_loss": 0.06656693667173386,
"eval_runtime": 815.1123,
"eval_samples_per_second": 12.204,
"eval_steps_per_second": 3.051,
"step": 1776
},
{
"epoch": 0.6017227148631775,
"grad_norm": 0.486328125,
"learning_rate": 7.5074625453339034e-06,
"loss": 0.0615,
"step": 1777
},
{
"epoch": 0.6020613320353009,
"grad_norm": 0.51953125,
"learning_rate": 7.496621492647638e-06,
"loss": 0.0651,
"step": 1778
},
{
"epoch": 0.6023999492074242,
"grad_norm": 0.62109375,
"learning_rate": 7.485783577946537e-06,
"loss": 0.0694,
"step": 1779
},
{
"epoch": 0.6027385663795475,
"grad_norm": 0.46484375,
"learning_rate": 7.474948814815927e-06,
"loss": 0.0644,
"step": 1780
},
{
"epoch": 0.6030771835516708,
"grad_norm": 0.408203125,
"learning_rate": 7.464117216837181e-06,
"loss": 0.055,
"step": 1781
},
{
"epoch": 0.6034158007237942,
"grad_norm": 0.5078125,
"learning_rate": 7.453288797587714e-06,
"loss": 0.0585,
"step": 1782
},
{
"epoch": 0.6037544178959176,
"grad_norm": 0.45703125,
"learning_rate": 7.442463570640947e-06,
"loss": 0.0593,
"step": 1783
},
{
"epoch": 0.6040930350680409,
"grad_norm": 0.412109375,
"learning_rate": 7.431641549566304e-06,
"loss": 0.0542,
"step": 1784
},
{
"epoch": 0.6044316522401643,
"grad_norm": 0.54296875,
"learning_rate": 7.420822747929187e-06,
"loss": 0.0711,
"step": 1785
},
{
"epoch": 0.6047702694122876,
"grad_norm": 0.98046875,
"learning_rate": 7.410007179290968e-06,
"loss": 0.0832,
"step": 1786
},
{
"epoch": 0.6051088865844109,
"grad_norm": 0.49609375,
"learning_rate": 7.399194857208962e-06,
"loss": 0.0673,
"step": 1787
},
{
"epoch": 0.6054475037565342,
"grad_norm": 0.51171875,
"learning_rate": 7.388385795236415e-06,
"loss": 0.0717,
"step": 1788
},
{
"epoch": 0.6057861209286576,
"grad_norm": 0.53515625,
"learning_rate": 7.377580006922486e-06,
"loss": 0.0606,
"step": 1789
},
{
"epoch": 0.6061247381007809,
"grad_norm": 0.50390625,
"learning_rate": 7.366777505812234e-06,
"loss": 0.0667,
"step": 1790
},
{
"epoch": 0.6064633552729043,
"grad_norm": 0.421875,
"learning_rate": 7.355978305446594e-06,
"loss": 0.0498,
"step": 1791
},
{
"epoch": 0.6068019724450276,
"grad_norm": 0.7265625,
"learning_rate": 7.345182419362364e-06,
"loss": 0.1045,
"step": 1792
},
{
"epoch": 0.607140589617151,
"grad_norm": 2.0,
"learning_rate": 7.334389861092187e-06,
"loss": 0.0706,
"step": 1793
},
{
"epoch": 0.6074792067892743,
"grad_norm": 0.44140625,
"learning_rate": 7.323600644164539e-06,
"loss": 0.0613,
"step": 1794
},
{
"epoch": 0.6078178239613976,
"grad_norm": 0.66015625,
"learning_rate": 7.312814782103703e-06,
"loss": 0.0837,
"step": 1795
},
{
"epoch": 0.608156441133521,
"grad_norm": 0.48828125,
"learning_rate": 7.3020322884297565e-06,
"loss": 0.073,
"step": 1796
},
{
"epoch": 0.6084950583056443,
"grad_norm": 0.353515625,
"learning_rate": 7.291253176658562e-06,
"loss": 0.046,
"step": 1797
},
{
"epoch": 0.6088336754777677,
"grad_norm": 0.5546875,
"learning_rate": 7.280477460301727e-06,
"loss": 0.0621,
"step": 1798
},
{
"epoch": 0.609172292649891,
"grad_norm": 0.546875,
"learning_rate": 7.26970515286662e-06,
"loss": 0.0646,
"step": 1799
},
{
"epoch": 0.6095109098220144,
"grad_norm": 0.427734375,
"learning_rate": 7.258936267856323e-06,
"loss": 0.0622,
"step": 1800
},
{
"epoch": 0.6098495269941376,
"grad_norm": 0.435546875,
"learning_rate": 7.248170818769642e-06,
"loss": 0.0426,
"step": 1801
},
{
"epoch": 0.610188144166261,
"grad_norm": 0.51171875,
"learning_rate": 7.237408819101064e-06,
"loss": 0.0586,
"step": 1802
},
{
"epoch": 0.6105267613383843,
"grad_norm": 0.64453125,
"learning_rate": 7.2266502823407584e-06,
"loss": 0.0624,
"step": 1803
},
{
"epoch": 0.6108653785105077,
"grad_norm": 0.474609375,
"learning_rate": 7.215895221974548e-06,
"loss": 0.062,
"step": 1804
},
{
"epoch": 0.6112039956826311,
"grad_norm": 0.51953125,
"learning_rate": 7.2051436514839064e-06,
"loss": 0.0654,
"step": 1805
},
{
"epoch": 0.6115426128547544,
"grad_norm": 0.47265625,
"learning_rate": 7.194395584345927e-06,
"loss": 0.0551,
"step": 1806
},
{
"epoch": 0.6118812300268778,
"grad_norm": 0.55078125,
"learning_rate": 7.1836510340333125e-06,
"loss": 0.0641,
"step": 1807
},
{
"epoch": 0.6122198471990011,
"grad_norm": 0.39453125,
"learning_rate": 7.1729100140143535e-06,
"loss": 0.0479,
"step": 1808
},
{
"epoch": 0.6125584643711244,
"grad_norm": 0.6328125,
"learning_rate": 7.162172537752927e-06,
"loss": 0.0776,
"step": 1809
},
{
"epoch": 0.6128970815432477,
"grad_norm": 0.486328125,
"learning_rate": 7.151438618708455e-06,
"loss": 0.065,
"step": 1810
},
{
"epoch": 0.6132356987153711,
"grad_norm": 0.58984375,
"learning_rate": 7.1407082703359085e-06,
"loss": 0.0835,
"step": 1811
},
{
"epoch": 0.6135743158874944,
"grad_norm": 0.54296875,
"learning_rate": 7.129981506085777e-06,
"loss": 0.0665,
"step": 1812
},
{
"epoch": 0.6139129330596178,
"grad_norm": 0.48046875,
"learning_rate": 7.119258339404065e-06,
"loss": 0.0605,
"step": 1813
},
{
"epoch": 0.6142515502317412,
"grad_norm": 0.39453125,
"learning_rate": 7.1085387837322595e-06,
"loss": 0.0528,
"step": 1814
},
{
"epoch": 0.6145901674038645,
"grad_norm": 0.65625,
"learning_rate": 7.097822852507325e-06,
"loss": 0.0892,
"step": 1815
},
{
"epoch": 0.6149287845759878,
"grad_norm": 0.5625,
"learning_rate": 7.087110559161681e-06,
"loss": 0.079,
"step": 1816
},
{
"epoch": 0.6152674017481111,
"grad_norm": 0.671875,
"learning_rate": 7.0764019171231906e-06,
"loss": 0.0519,
"step": 1817
},
{
"epoch": 0.6156060189202345,
"grad_norm": 0.53125,
"learning_rate": 7.06569693981514e-06,
"loss": 0.0608,
"step": 1818
},
{
"epoch": 0.6159446360923578,
"grad_norm": 0.5546875,
"learning_rate": 7.0549956406562105e-06,
"loss": 0.0784,
"step": 1819
},
{
"epoch": 0.6162832532644812,
"grad_norm": 0.47265625,
"learning_rate": 7.044298033060487e-06,
"loss": 0.0667,
"step": 1820
},
{
"epoch": 0.6166218704366045,
"grad_norm": 0.447265625,
"learning_rate": 7.033604130437422e-06,
"loss": 0.0612,
"step": 1821
},
{
"epoch": 0.6169604876087279,
"grad_norm": 0.61328125,
"learning_rate": 7.022913946191821e-06,
"loss": 0.0698,
"step": 1822
},
{
"epoch": 0.6172991047808511,
"grad_norm": 0.5625,
"learning_rate": 7.012227493723831e-06,
"loss": 0.0673,
"step": 1823
},
{
"epoch": 0.6176377219529745,
"grad_norm": 0.5546875,
"learning_rate": 7.001544786428924e-06,
"loss": 0.0601,
"step": 1824
},
{
"epoch": 0.6179763391250979,
"grad_norm": 0.5390625,
"learning_rate": 6.990865837697872e-06,
"loss": 0.0562,
"step": 1825
},
{
"epoch": 0.6183149562972212,
"grad_norm": 0.46875,
"learning_rate": 6.980190660916739e-06,
"loss": 0.0658,
"step": 1826
},
{
"epoch": 0.6186535734693446,
"grad_norm": 0.42578125,
"learning_rate": 6.969519269466858e-06,
"loss": 0.055,
"step": 1827
},
{
"epoch": 0.6189921906414679,
"grad_norm": 0.490234375,
"learning_rate": 6.958851676724823e-06,
"loss": 0.0652,
"step": 1828
},
{
"epoch": 0.6193308078135913,
"grad_norm": 0.578125,
"learning_rate": 6.9481878960624585e-06,
"loss": 0.0715,
"step": 1829
},
{
"epoch": 0.6196694249857145,
"grad_norm": 0.74609375,
"learning_rate": 6.937527940846816e-06,
"loss": 0.1297,
"step": 1830
},
{
"epoch": 0.6200080421578379,
"grad_norm": 0.43359375,
"learning_rate": 6.926871824440149e-06,
"loss": 0.0607,
"step": 1831
},
{
"epoch": 0.6203466593299612,
"grad_norm": 0.470703125,
"learning_rate": 6.916219560199904e-06,
"loss": 0.0621,
"step": 1832
},
{
"epoch": 0.6206852765020846,
"grad_norm": 0.46484375,
"learning_rate": 6.905571161478692e-06,
"loss": 0.0516,
"step": 1833
},
{
"epoch": 0.621023893674208,
"grad_norm": 0.5546875,
"learning_rate": 6.894926641624282e-06,
"loss": 0.0806,
"step": 1834
},
{
"epoch": 0.6213625108463313,
"grad_norm": 0.453125,
"learning_rate": 6.8842860139795795e-06,
"loss": 0.0625,
"step": 1835
},
{
"epoch": 0.6217011280184547,
"grad_norm": 0.490234375,
"learning_rate": 6.873649291882613e-06,
"loss": 0.0609,
"step": 1836
},
{
"epoch": 0.622039745190578,
"grad_norm": 0.478515625,
"learning_rate": 6.8630164886665165e-06,
"loss": 0.0683,
"step": 1837
},
{
"epoch": 0.6223783623627013,
"grad_norm": 0.43359375,
"learning_rate": 6.8523876176595084e-06,
"loss": 0.0567,
"step": 1838
},
{
"epoch": 0.6227169795348246,
"grad_norm": 0.41796875,
"learning_rate": 6.841762692184881e-06,
"loss": 0.0535,
"step": 1839
},
{
"epoch": 0.623055596706948,
"grad_norm": 0.57421875,
"learning_rate": 6.831141725560975e-06,
"loss": 0.0775,
"step": 1840
},
{
"epoch": 0.6233942138790713,
"grad_norm": 0.5,
"learning_rate": 6.820524731101176e-06,
"loss": 0.0621,
"step": 1841
},
{
"epoch": 0.6237328310511947,
"grad_norm": 0.490234375,
"learning_rate": 6.809911722113884e-06,
"loss": 0.0549,
"step": 1842
},
{
"epoch": 0.6240714482233181,
"grad_norm": 0.54296875,
"learning_rate": 6.7993027119025115e-06,
"loss": 0.0683,
"step": 1843
},
{
"epoch": 0.6244100653954414,
"grad_norm": 0.4453125,
"learning_rate": 6.7886977137654505e-06,
"loss": 0.0644,
"step": 1844
},
{
"epoch": 0.6247486825675647,
"grad_norm": 0.52734375,
"learning_rate": 6.778096740996069e-06,
"loss": 0.0677,
"step": 1845
},
{
"epoch": 0.625087299739688,
"grad_norm": 1.734375,
"learning_rate": 6.767499806882685e-06,
"loss": 0.0645,
"step": 1846
},
{
"epoch": 0.6254259169118114,
"grad_norm": 0.486328125,
"learning_rate": 6.756906924708558e-06,
"loss": 0.069,
"step": 1847
},
{
"epoch": 0.6257645340839347,
"grad_norm": 0.4765625,
"learning_rate": 6.746318107751867e-06,
"loss": 0.064,
"step": 1848
},
{
"epoch": 0.6261031512560581,
"grad_norm": 0.48828125,
"learning_rate": 6.735733369285694e-06,
"loss": 0.0662,
"step": 1849
},
{
"epoch": 0.6264417684281814,
"grad_norm": 0.55078125,
"learning_rate": 6.7251527225780075e-06,
"loss": 0.0766,
"step": 1850
},
{
"epoch": 0.6267803856003048,
"grad_norm": 0.51171875,
"learning_rate": 6.714576180891653e-06,
"loss": 0.0681,
"step": 1851
},
{
"epoch": 0.627119002772428,
"grad_norm": 0.52734375,
"learning_rate": 6.7040037574843255e-06,
"loss": 0.0711,
"step": 1852
},
{
"epoch": 0.6274576199445514,
"grad_norm": 0.58203125,
"learning_rate": 6.693435465608556e-06,
"loss": 0.0742,
"step": 1853
},
{
"epoch": 0.6277962371166748,
"grad_norm": 0.36328125,
"learning_rate": 6.682871318511702e-06,
"loss": 0.0477,
"step": 1854
},
{
"epoch": 0.6281348542887981,
"grad_norm": 0.498046875,
"learning_rate": 6.672311329435919e-06,
"loss": 0.0624,
"step": 1855
},
{
"epoch": 0.6284734714609215,
"grad_norm": 0.5078125,
"learning_rate": 6.66175551161816e-06,
"loss": 0.0648,
"step": 1856
},
{
"epoch": 0.6288120886330448,
"grad_norm": 0.53125,
"learning_rate": 6.651203878290139e-06,
"loss": 0.0629,
"step": 1857
},
{
"epoch": 0.6291507058051682,
"grad_norm": 0.4375,
"learning_rate": 6.64065644267833e-06,
"loss": 0.059,
"step": 1858
},
{
"epoch": 0.6294893229772914,
"grad_norm": 0.5390625,
"learning_rate": 6.630113218003944e-06,
"loss": 0.0675,
"step": 1859
},
{
"epoch": 0.6298279401494148,
"grad_norm": 0.59375,
"learning_rate": 6.619574217482918e-06,
"loss": 0.0798,
"step": 1860
},
{
"epoch": 0.6301665573215381,
"grad_norm": 0.51171875,
"learning_rate": 6.609039454325887e-06,
"loss": 0.0715,
"step": 1861
},
{
"epoch": 0.6305051744936615,
"grad_norm": 0.59375,
"learning_rate": 6.598508941738176e-06,
"loss": 0.0674,
"step": 1862
},
{
"epoch": 0.6308437916657849,
"grad_norm": 0.427734375,
"learning_rate": 6.587982692919785e-06,
"loss": 0.045,
"step": 1863
},
{
"epoch": 0.6311824088379082,
"grad_norm": 0.53515625,
"learning_rate": 6.5774607210653675e-06,
"loss": 0.0666,
"step": 1864
},
{
"epoch": 0.6315210260100316,
"grad_norm": 0.5859375,
"learning_rate": 6.566943039364215e-06,
"loss": 0.0581,
"step": 1865
},
{
"epoch": 0.6318596431821549,
"grad_norm": 0.5546875,
"learning_rate": 6.556429661000244e-06,
"loss": 0.0726,
"step": 1866
},
{
"epoch": 0.6321982603542782,
"grad_norm": 1.3203125,
"learning_rate": 6.545920599151976e-06,
"loss": 0.0555,
"step": 1867
},
{
"epoch": 0.6325368775264015,
"grad_norm": 0.51171875,
"learning_rate": 6.535415866992518e-06,
"loss": 0.0702,
"step": 1868
},
{
"epoch": 0.6328754946985249,
"grad_norm": 0.51171875,
"learning_rate": 6.524915477689553e-06,
"loss": 0.0601,
"step": 1869
},
{
"epoch": 0.6332141118706482,
"grad_norm": 0.384765625,
"learning_rate": 6.5144194444053235e-06,
"loss": 0.0561,
"step": 1870
},
{
"epoch": 0.6335527290427716,
"grad_norm": 0.59765625,
"learning_rate": 6.503927780296605e-06,
"loss": 0.0833,
"step": 1871
},
{
"epoch": 0.633891346214895,
"grad_norm": 0.48828125,
"learning_rate": 6.4934404985147e-06,
"loss": 0.0658,
"step": 1872
},
{
"epoch": 0.6342299633870183,
"grad_norm": 0.44140625,
"learning_rate": 6.482957612205416e-06,
"loss": 0.0476,
"step": 1873
},
{
"epoch": 0.6345685805591416,
"grad_norm": 1.03125,
"learning_rate": 6.472479134509052e-06,
"loss": 0.094,
"step": 1874
},
{
"epoch": 0.6349071977312649,
"grad_norm": 0.6484375,
"learning_rate": 6.4620050785603836e-06,
"loss": 0.0744,
"step": 1875
},
{
"epoch": 0.6352458149033883,
"grad_norm": 0.462890625,
"learning_rate": 6.451535457488638e-06,
"loss": 0.0597,
"step": 1876
},
{
"epoch": 0.6355844320755116,
"grad_norm": 0.42578125,
"learning_rate": 6.4410702844174875e-06,
"loss": 0.0638,
"step": 1877
},
{
"epoch": 0.635923049247635,
"grad_norm": 0.4765625,
"learning_rate": 6.430609572465024e-06,
"loss": 0.0623,
"step": 1878
},
{
"epoch": 0.6362616664197583,
"grad_norm": 0.4375,
"learning_rate": 6.420153334743755e-06,
"loss": 0.0562,
"step": 1879
},
{
"epoch": 0.6366002835918817,
"grad_norm": 1.125,
"learning_rate": 6.409701584360575e-06,
"loss": 0.0465,
"step": 1880
},
{
"epoch": 0.636938900764005,
"grad_norm": 0.416015625,
"learning_rate": 6.399254334416752e-06,
"loss": 0.0492,
"step": 1881
},
{
"epoch": 0.6372775179361283,
"grad_norm": 0.45703125,
"learning_rate": 6.388811598007918e-06,
"loss": 0.0583,
"step": 1882
},
{
"epoch": 0.6376161351082517,
"grad_norm": 0.53125,
"learning_rate": 6.378373388224039e-06,
"loss": 0.0709,
"step": 1883
},
{
"epoch": 0.637954752280375,
"grad_norm": 0.46484375,
"learning_rate": 6.3679397181494115e-06,
"loss": 0.0598,
"step": 1884
},
{
"epoch": 0.6382933694524984,
"grad_norm": 0.41796875,
"learning_rate": 6.357510600862646e-06,
"loss": 0.0561,
"step": 1885
},
{
"epoch": 0.6386319866246217,
"grad_norm": 0.5703125,
"learning_rate": 6.3470860494366415e-06,
"loss": 0.0637,
"step": 1886
},
{
"epoch": 0.6389706037967451,
"grad_norm": 0.5390625,
"learning_rate": 6.336666076938573e-06,
"loss": 0.0627,
"step": 1887
},
{
"epoch": 0.6393092209688683,
"grad_norm": 0.5703125,
"learning_rate": 6.326250696429877e-06,
"loss": 0.0742,
"step": 1888
},
{
"epoch": 0.6396478381409917,
"grad_norm": 0.474609375,
"learning_rate": 6.315839920966229e-06,
"loss": 0.0568,
"step": 1889
},
{
"epoch": 0.639986455313115,
"grad_norm": 0.51953125,
"learning_rate": 6.305433763597546e-06,
"loss": 0.0758,
"step": 1890
},
{
"epoch": 0.6403250724852384,
"grad_norm": 0.462890625,
"learning_rate": 6.295032237367942e-06,
"loss": 0.0552,
"step": 1891
},
{
"epoch": 0.6406636896573618,
"grad_norm": 0.470703125,
"learning_rate": 6.284635355315731e-06,
"loss": 0.0675,
"step": 1892
},
{
"epoch": 0.6410023068294851,
"grad_norm": 0.52734375,
"learning_rate": 6.274243130473405e-06,
"loss": 0.063,
"step": 1893
},
{
"epoch": 0.6413409240016085,
"grad_norm": 0.40625,
"learning_rate": 6.2638555758676215e-06,
"loss": 0.0549,
"step": 1894
},
{
"epoch": 0.6416795411737318,
"grad_norm": 0.478515625,
"learning_rate": 6.253472704519179e-06,
"loss": 0.0589,
"step": 1895
},
{
"epoch": 0.6420181583458551,
"grad_norm": 0.62109375,
"learning_rate": 6.243094529443008e-06,
"loss": 0.0856,
"step": 1896
},
{
"epoch": 0.6423567755179784,
"grad_norm": 0.380859375,
"learning_rate": 6.232721063648148e-06,
"loss": 0.0506,
"step": 1897
},
{
"epoch": 0.6426953926901018,
"grad_norm": 0.55859375,
"learning_rate": 6.222352320137748e-06,
"loss": 0.0758,
"step": 1898
},
{
"epoch": 0.6430340098622251,
"grad_norm": 0.427734375,
"learning_rate": 6.211988311909021e-06,
"loss": 0.0528,
"step": 1899
},
{
"epoch": 0.6433726270343485,
"grad_norm": 0.4296875,
"learning_rate": 6.201629051953257e-06,
"loss": 0.0562,
"step": 1900
},
{
"epoch": 0.6437112442064719,
"grad_norm": 0.44140625,
"learning_rate": 6.1912745532557834e-06,
"loss": 0.0583,
"step": 1901
},
{
"epoch": 0.6440498613785952,
"grad_norm": 0.56640625,
"learning_rate": 6.180924828795972e-06,
"loss": 0.0687,
"step": 1902
},
{
"epoch": 0.6443884785507185,
"grad_norm": 0.53125,
"learning_rate": 6.170579891547202e-06,
"loss": 0.0623,
"step": 1903
},
{
"epoch": 0.6447270957228418,
"grad_norm": 0.4921875,
"learning_rate": 6.160239754476849e-06,
"loss": 0.0695,
"step": 1904
},
{
"epoch": 0.6450657128949652,
"grad_norm": 0.46484375,
"learning_rate": 6.149904430546278e-06,
"loss": 0.0585,
"step": 1905
},
{
"epoch": 0.6454043300670885,
"grad_norm": 0.44921875,
"learning_rate": 6.1395739327108185e-06,
"loss": 0.0655,
"step": 1906
},
{
"epoch": 0.6457429472392119,
"grad_norm": 0.56640625,
"learning_rate": 6.12924827391975e-06,
"loss": 0.0727,
"step": 1907
},
{
"epoch": 0.6460815644113352,
"grad_norm": 0.46484375,
"learning_rate": 6.118927467116285e-06,
"loss": 0.0523,
"step": 1908
},
{
"epoch": 0.6464201815834586,
"grad_norm": 0.625,
"learning_rate": 6.1086115252375585e-06,
"loss": 0.0855,
"step": 1909
},
{
"epoch": 0.6467587987555818,
"grad_norm": 0.6328125,
"learning_rate": 6.098300461214605e-06,
"loss": 0.0866,
"step": 1910
},
{
"epoch": 0.6470974159277052,
"grad_norm": 0.4765625,
"learning_rate": 6.087994287972341e-06,
"loss": 0.0627,
"step": 1911
},
{
"epoch": 0.6474360330998286,
"grad_norm": 0.7109375,
"learning_rate": 6.077693018429556e-06,
"loss": 0.0702,
"step": 1912
},
{
"epoch": 0.6477746502719519,
"grad_norm": 0.578125,
"learning_rate": 6.0673966654988946e-06,
"loss": 0.0679,
"step": 1913
},
{
"epoch": 0.6481132674440753,
"grad_norm": 0.51953125,
"learning_rate": 6.057105242086836e-06,
"loss": 0.0615,
"step": 1914
},
{
"epoch": 0.6484518846161986,
"grad_norm": 0.462890625,
"learning_rate": 6.046818761093678e-06,
"loss": 0.0506,
"step": 1915
},
{
"epoch": 0.648790501788322,
"grad_norm": 0.76171875,
"learning_rate": 6.036537235413524e-06,
"loss": 0.1215,
"step": 1916
},
{
"epoch": 0.6491291189604452,
"grad_norm": 0.4921875,
"learning_rate": 6.026260677934273e-06,
"loss": 0.0574,
"step": 1917
},
{
"epoch": 0.6494677361325686,
"grad_norm": 0.43359375,
"learning_rate": 6.015989101537586e-06,
"loss": 0.0573,
"step": 1918
},
{
"epoch": 0.6498063533046919,
"grad_norm": 0.384765625,
"learning_rate": 6.005722519098887e-06,
"loss": 0.0501,
"step": 1919
},
{
"epoch": 0.6501449704768153,
"grad_norm": 0.46875,
"learning_rate": 5.995460943487334e-06,
"loss": 0.0666,
"step": 1920
},
{
"epoch": 0.6504835876489387,
"grad_norm": 0.50390625,
"learning_rate": 5.9852043875658195e-06,
"loss": 0.0678,
"step": 1921
},
{
"epoch": 0.650822204821062,
"grad_norm": 0.58203125,
"learning_rate": 5.974952864190933e-06,
"loss": 0.0809,
"step": 1922
},
{
"epoch": 0.6511608219931854,
"grad_norm": 0.453125,
"learning_rate": 5.964706386212959e-06,
"loss": 0.0657,
"step": 1923
},
{
"epoch": 0.6514994391653087,
"grad_norm": 0.443359375,
"learning_rate": 5.95446496647586e-06,
"loss": 0.0508,
"step": 1924
},
{
"epoch": 0.651838056337432,
"grad_norm": 0.427734375,
"learning_rate": 5.944228617817263e-06,
"loss": 0.0598,
"step": 1925
},
{
"epoch": 0.6521766735095553,
"grad_norm": 0.470703125,
"learning_rate": 5.933997353068419e-06,
"loss": 0.0699,
"step": 1926
},
{
"epoch": 0.6525152906816787,
"grad_norm": 0.609375,
"learning_rate": 5.923771185054224e-06,
"loss": 0.0726,
"step": 1927
},
{
"epoch": 0.652853907853802,
"grad_norm": 0.5078125,
"learning_rate": 5.913550126593186e-06,
"loss": 0.0721,
"step": 1928
},
{
"epoch": 0.6531925250259254,
"grad_norm": 0.37890625,
"learning_rate": 5.903334190497396e-06,
"loss": 0.0483,
"step": 1929
},
{
"epoch": 0.6535311421980488,
"grad_norm": 0.462890625,
"learning_rate": 5.8931233895725345e-06,
"loss": 0.0528,
"step": 1930
},
{
"epoch": 0.6538697593701721,
"grad_norm": 0.482421875,
"learning_rate": 5.882917736617839e-06,
"loss": 0.0751,
"step": 1931
},
{
"epoch": 0.6542083765422954,
"grad_norm": 0.4296875,
"learning_rate": 5.872717244426099e-06,
"loss": 0.0562,
"step": 1932
},
{
"epoch": 0.6545469937144187,
"grad_norm": 0.498046875,
"learning_rate": 5.862521925783631e-06,
"loss": 0.0628,
"step": 1933
},
{
"epoch": 0.6548856108865421,
"grad_norm": 0.44140625,
"learning_rate": 5.852331793470267e-06,
"loss": 0.0523,
"step": 1934
},
{
"epoch": 0.6552242280586654,
"grad_norm": 0.5390625,
"learning_rate": 5.842146860259337e-06,
"loss": 0.0563,
"step": 1935
},
{
"epoch": 0.6555628452307888,
"grad_norm": 0.41796875,
"learning_rate": 5.8319671389176605e-06,
"loss": 0.0523,
"step": 1936
},
{
"epoch": 0.6559014624029121,
"grad_norm": 0.439453125,
"learning_rate": 5.821792642205512e-06,
"loss": 0.0534,
"step": 1937
},
{
"epoch": 0.6562400795750355,
"grad_norm": 0.8984375,
"learning_rate": 5.811623382876636e-06,
"loss": 0.0865,
"step": 1938
},
{
"epoch": 0.6565786967471587,
"grad_norm": 0.7734375,
"learning_rate": 5.8014593736781864e-06,
"loss": 0.0701,
"step": 1939
},
{
"epoch": 0.6569173139192821,
"grad_norm": 0.458984375,
"learning_rate": 5.791300627350759e-06,
"loss": 0.052,
"step": 1940
},
{
"epoch": 0.6572559310914055,
"grad_norm": 0.51953125,
"learning_rate": 5.781147156628336e-06,
"loss": 0.0633,
"step": 1941
},
{
"epoch": 0.6575945482635288,
"grad_norm": 0.54296875,
"learning_rate": 5.770998974238298e-06,
"loss": 0.0629,
"step": 1942
},
{
"epoch": 0.6579331654356522,
"grad_norm": 0.48828125,
"learning_rate": 5.760856092901394e-06,
"loss": 0.0605,
"step": 1943
},
{
"epoch": 0.6582717826077755,
"grad_norm": 0.51953125,
"learning_rate": 5.750718525331722e-06,
"loss": 0.0576,
"step": 1944
},
{
"epoch": 0.6586103997798989,
"grad_norm": 0.373046875,
"learning_rate": 5.740586284236724e-06,
"loss": 0.0499,
"step": 1945
},
{
"epoch": 0.6589490169520221,
"grad_norm": 0.5078125,
"learning_rate": 5.730459382317177e-06,
"loss": 0.0711,
"step": 1946
},
{
"epoch": 0.6592876341241455,
"grad_norm": 0.412109375,
"learning_rate": 5.720337832267136e-06,
"loss": 0.06,
"step": 1947
},
{
"epoch": 0.6596262512962688,
"grad_norm": 0.52734375,
"learning_rate": 5.710221646773971e-06,
"loss": 0.0605,
"step": 1948
},
{
"epoch": 0.6599648684683922,
"grad_norm": 0.48046875,
"learning_rate": 5.700110838518327e-06,
"loss": 0.0567,
"step": 1949
},
{
"epoch": 0.6603034856405156,
"grad_norm": 0.376953125,
"learning_rate": 5.690005420174095e-06,
"loss": 0.0477,
"step": 1950
},
{
"epoch": 0.6606421028126389,
"grad_norm": 0.486328125,
"learning_rate": 5.679905404408426e-06,
"loss": 0.0622,
"step": 1951
},
{
"epoch": 0.6609807199847623,
"grad_norm": 0.6015625,
"learning_rate": 5.6698108038816815e-06,
"loss": 0.0638,
"step": 1952
},
{
"epoch": 0.6613193371568856,
"grad_norm": 0.400390625,
"learning_rate": 5.6597216312474476e-06,
"loss": 0.054,
"step": 1953
},
{
"epoch": 0.6616579543290089,
"grad_norm": 0.453125,
"learning_rate": 5.649637899152509e-06,
"loss": 0.0533,
"step": 1954
},
{
"epoch": 0.6619965715011322,
"grad_norm": 0.455078125,
"learning_rate": 5.639559620236815e-06,
"loss": 0.0573,
"step": 1955
},
{
"epoch": 0.6623351886732556,
"grad_norm": 0.5,
"learning_rate": 5.629486807133495e-06,
"loss": 0.0699,
"step": 1956
},
{
"epoch": 0.6626738058453789,
"grad_norm": 0.44921875,
"learning_rate": 5.619419472468824e-06,
"loss": 0.0509,
"step": 1957
},
{
"epoch": 0.6630124230175023,
"grad_norm": 0.625,
"learning_rate": 5.609357628862197e-06,
"loss": 0.0755,
"step": 1958
},
{
"epoch": 0.6633510401896257,
"grad_norm": 0.451171875,
"learning_rate": 5.599301288926145e-06,
"loss": 0.0509,
"step": 1959
},
{
"epoch": 0.663689657361749,
"grad_norm": 0.478515625,
"learning_rate": 5.5892504652662845e-06,
"loss": 0.0623,
"step": 1960
},
{
"epoch": 0.6640282745338723,
"grad_norm": 0.478515625,
"learning_rate": 5.579205170481328e-06,
"loss": 0.0578,
"step": 1961
},
{
"epoch": 0.6643668917059956,
"grad_norm": 0.498046875,
"learning_rate": 5.569165417163054e-06,
"loss": 0.0685,
"step": 1962
},
{
"epoch": 0.664705508878119,
"grad_norm": 0.515625,
"learning_rate": 5.559131217896288e-06,
"loss": 0.0699,
"step": 1963
},
{
"epoch": 0.6650441260502423,
"grad_norm": 0.470703125,
"learning_rate": 5.549102585258904e-06,
"loss": 0.0572,
"step": 1964
},
{
"epoch": 0.6653827432223657,
"grad_norm": 0.48046875,
"learning_rate": 5.539079531821799e-06,
"loss": 0.0532,
"step": 1965
},
{
"epoch": 0.665721360394489,
"grad_norm": 0.47265625,
"learning_rate": 5.529062070148859e-06,
"loss": 0.0597,
"step": 1966
},
{
"epoch": 0.6660599775666124,
"grad_norm": 0.470703125,
"learning_rate": 5.519050212796986e-06,
"loss": 0.0668,
"step": 1967
},
{
"epoch": 0.6663985947387356,
"grad_norm": 0.498046875,
"learning_rate": 5.509043972316037e-06,
"loss": 0.0614,
"step": 1968
},
{
"epoch": 0.666737211910859,
"grad_norm": 0.56640625,
"learning_rate": 5.499043361248832e-06,
"loss": 0.0561,
"step": 1969
},
{
"epoch": 0.6670758290829824,
"grad_norm": 0.51953125,
"learning_rate": 5.489048392131147e-06,
"loss": 0.0859,
"step": 1970
},
{
"epoch": 0.6674144462551057,
"grad_norm": 0.46875,
"learning_rate": 5.4790590774916665e-06,
"loss": 0.0537,
"step": 1971
},
{
"epoch": 0.6677530634272291,
"grad_norm": 0.435546875,
"learning_rate": 5.469075429852002e-06,
"loss": 0.0555,
"step": 1972
},
{
"epoch": 0.6680916805993524,
"grad_norm": 0.51953125,
"learning_rate": 5.459097461726661e-06,
"loss": 0.0719,
"step": 1973
},
{
"epoch": 0.6684302977714758,
"grad_norm": 0.55078125,
"learning_rate": 5.44912518562302e-06,
"loss": 0.0772,
"step": 1974
},
{
"epoch": 0.668768914943599,
"grad_norm": 0.60546875,
"learning_rate": 5.439158614041331e-06,
"loss": 0.06,
"step": 1975
},
{
"epoch": 0.6691075321157224,
"grad_norm": 0.3984375,
"learning_rate": 5.4291977594746955e-06,
"loss": 0.0536,
"step": 1976
},
{
"epoch": 0.6694461492878457,
"grad_norm": 0.5859375,
"learning_rate": 5.419242634409039e-06,
"loss": 0.113,
"step": 1977
},
{
"epoch": 0.6697847664599691,
"grad_norm": 0.6484375,
"learning_rate": 5.409293251323119e-06,
"loss": 0.0825,
"step": 1978
},
{
"epoch": 0.6701233836320925,
"grad_norm": 0.5390625,
"learning_rate": 5.399349622688479e-06,
"loss": 0.0676,
"step": 1979
},
{
"epoch": 0.6704620008042158,
"grad_norm": 0.609375,
"learning_rate": 5.3894117609694655e-06,
"loss": 0.0731,
"step": 1980
},
{
"epoch": 0.6708006179763392,
"grad_norm": 0.4765625,
"learning_rate": 5.379479678623189e-06,
"loss": 0.0647,
"step": 1981
},
{
"epoch": 0.6711392351484625,
"grad_norm": 0.5078125,
"learning_rate": 5.3695533880995096e-06,
"loss": 0.0873,
"step": 1982
},
{
"epoch": 0.6714778523205858,
"grad_norm": 0.478515625,
"learning_rate": 5.359632901841038e-06,
"loss": 0.0594,
"step": 1983
},
{
"epoch": 0.6718164694927091,
"grad_norm": 0.404296875,
"learning_rate": 5.349718232283106e-06,
"loss": 0.0601,
"step": 1984
},
{
"epoch": 0.6721550866648325,
"grad_norm": 0.63671875,
"learning_rate": 5.339809391853747e-06,
"loss": 0.0798,
"step": 1985
},
{
"epoch": 0.6724937038369558,
"grad_norm": 0.494140625,
"learning_rate": 5.3299063929737015e-06,
"loss": 0.0687,
"step": 1986
},
{
"epoch": 0.6728323210090792,
"grad_norm": 0.4140625,
"learning_rate": 5.3200092480563704e-06,
"loss": 0.0536,
"step": 1987
},
{
"epoch": 0.6731709381812025,
"grad_norm": 0.609375,
"learning_rate": 5.310117969507833e-06,
"loss": 0.0457,
"step": 1988
},
{
"epoch": 0.6735095553533259,
"grad_norm": 0.51953125,
"learning_rate": 5.300232569726805e-06,
"loss": 0.0617,
"step": 1989
},
{
"epoch": 0.6738481725254492,
"grad_norm": 0.490234375,
"learning_rate": 5.29035306110463e-06,
"loss": 0.0634,
"step": 1990
},
{
"epoch": 0.6741867896975725,
"grad_norm": 0.53515625,
"learning_rate": 5.2804794560252785e-06,
"loss": 0.0601,
"step": 1991
},
{
"epoch": 0.6745254068696959,
"grad_norm": 0.61328125,
"learning_rate": 5.270611766865319e-06,
"loss": 0.0957,
"step": 1992
},
{
"epoch": 0.6748640240418192,
"grad_norm": 0.6328125,
"learning_rate": 5.2607500059938935e-06,
"loss": 0.1005,
"step": 1993
},
{
"epoch": 0.6752026412139426,
"grad_norm": 0.458984375,
"learning_rate": 5.250894185772724e-06,
"loss": 0.0555,
"step": 1994
},
{
"epoch": 0.6755412583860659,
"grad_norm": 0.44921875,
"learning_rate": 5.241044318556083e-06,
"loss": 0.0605,
"step": 1995
},
{
"epoch": 0.6758798755581893,
"grad_norm": 0.490234375,
"learning_rate": 5.231200416690775e-06,
"loss": 0.0753,
"step": 1996
},
{
"epoch": 0.6762184927303125,
"grad_norm": 0.515625,
"learning_rate": 5.221362492516139e-06,
"loss": 0.0718,
"step": 1997
},
{
"epoch": 0.6765571099024359,
"grad_norm": 0.50390625,
"learning_rate": 5.211530558364005e-06,
"loss": 0.0645,
"step": 1998
},
{
"epoch": 0.6768957270745593,
"grad_norm": 0.5,
"learning_rate": 5.201704626558708e-06,
"loss": 0.0597,
"step": 1999
},
{
"epoch": 0.6772343442466826,
"grad_norm": 1.109375,
"learning_rate": 5.191884709417058e-06,
"loss": 0.0725,
"step": 2000
},
{
"epoch": 0.677572961418806,
"grad_norm": 0.45703125,
"learning_rate": 5.1820708192483145e-06,
"loss": 0.0579,
"step": 2001
},
{
"epoch": 0.6779115785909293,
"grad_norm": 0.71484375,
"learning_rate": 5.172262968354198e-06,
"loss": 0.087,
"step": 2002
},
{
"epoch": 0.6782501957630527,
"grad_norm": 0.404296875,
"learning_rate": 5.162461169028841e-06,
"loss": 0.0513,
"step": 2003
},
{
"epoch": 0.6785888129351759,
"grad_norm": 0.58984375,
"learning_rate": 5.152665433558803e-06,
"loss": 0.0824,
"step": 2004
},
{
"epoch": 0.6789274301072993,
"grad_norm": 0.5,
"learning_rate": 5.1428757742230466e-06,
"loss": 0.0706,
"step": 2005
},
{
"epoch": 0.6792660472794226,
"grad_norm": 0.5078125,
"learning_rate": 5.1330922032928996e-06,
"loss": 0.0718,
"step": 2006
},
{
"epoch": 0.679604664451546,
"grad_norm": 0.59765625,
"learning_rate": 5.123314733032074e-06,
"loss": 0.0998,
"step": 2007
},
{
"epoch": 0.6799432816236693,
"grad_norm": 0.37890625,
"learning_rate": 5.113543375696633e-06,
"loss": 0.052,
"step": 2008
},
{
"epoch": 0.6802818987957927,
"grad_norm": 0.46484375,
"learning_rate": 5.1037781435349676e-06,
"loss": 0.065,
"step": 2009
},
{
"epoch": 0.6806205159679161,
"grad_norm": 0.3984375,
"learning_rate": 5.094019048787802e-06,
"loss": 0.051,
"step": 2010
},
{
"epoch": 0.6809591331400394,
"grad_norm": 0.60546875,
"learning_rate": 5.084266103688161e-06,
"loss": 0.0822,
"step": 2011
},
{
"epoch": 0.6812977503121627,
"grad_norm": 0.482421875,
"learning_rate": 5.074519320461358e-06,
"loss": 0.0605,
"step": 2012
},
{
"epoch": 0.681636367484286,
"grad_norm": 0.484375,
"learning_rate": 5.064778711324989e-06,
"loss": 0.0494,
"step": 2013
},
{
"epoch": 0.6819749846564094,
"grad_norm": 0.88671875,
"learning_rate": 5.055044288488913e-06,
"loss": 0.0791,
"step": 2014
},
{
"epoch": 0.6823136018285327,
"grad_norm": 0.41015625,
"learning_rate": 5.045316064155221e-06,
"loss": 0.054,
"step": 2015
},
{
"epoch": 0.6826522190006561,
"grad_norm": 0.4453125,
"learning_rate": 5.035594050518254e-06,
"loss": 0.0535,
"step": 2016
},
{
"epoch": 0.6829908361727794,
"grad_norm": 0.5078125,
"learning_rate": 5.025878259764545e-06,
"loss": 0.0676,
"step": 2017
},
{
"epoch": 0.6833294533449028,
"grad_norm": 0.427734375,
"learning_rate": 5.016168704072846e-06,
"loss": 0.0536,
"step": 2018
},
{
"epoch": 0.683668070517026,
"grad_norm": 0.4453125,
"learning_rate": 5.006465395614086e-06,
"loss": 0.0603,
"step": 2019
},
{
"epoch": 0.6840066876891494,
"grad_norm": 0.498046875,
"learning_rate": 4.9967683465513595e-06,
"loss": 0.0726,
"step": 2020
},
{
"epoch": 0.6843453048612728,
"grad_norm": 0.453125,
"learning_rate": 4.987077569039922e-06,
"loss": 0.0645,
"step": 2021
},
{
"epoch": 0.6846839220333961,
"grad_norm": 0.453125,
"learning_rate": 4.977393075227159e-06,
"loss": 0.0542,
"step": 2022
},
{
"epoch": 0.6850225392055195,
"grad_norm": 0.4453125,
"learning_rate": 4.967714877252587e-06,
"loss": 0.0515,
"step": 2023
},
{
"epoch": 0.6853611563776428,
"grad_norm": 0.458984375,
"learning_rate": 4.958042987247832e-06,
"loss": 0.0684,
"step": 2024
},
{
"epoch": 0.6856997735497662,
"grad_norm": 0.490234375,
"learning_rate": 4.9483774173366e-06,
"loss": 0.0718,
"step": 2025
},
{
"epoch": 0.6860383907218894,
"grad_norm": 0.41796875,
"learning_rate": 4.938718179634689e-06,
"loss": 0.0558,
"step": 2026
},
{
"epoch": 0.6863770078940128,
"grad_norm": 0.52734375,
"learning_rate": 4.929065286249959e-06,
"loss": 0.0724,
"step": 2027
},
{
"epoch": 0.6867156250661361,
"grad_norm": 0.859375,
"learning_rate": 4.919418749282302e-06,
"loss": 0.1876,
"step": 2028
},
{
"epoch": 0.6870542422382595,
"grad_norm": 0.38671875,
"learning_rate": 4.909778580823663e-06,
"loss": 0.0502,
"step": 2029
},
{
"epoch": 0.6873928594103829,
"grad_norm": 0.65234375,
"learning_rate": 4.9001447929579855e-06,
"loss": 0.0814,
"step": 2030
},
{
"epoch": 0.6877314765825062,
"grad_norm": 0.5703125,
"learning_rate": 4.890517397761232e-06,
"loss": 0.0727,
"step": 2031
},
{
"epoch": 0.6880700937546296,
"grad_norm": 0.482421875,
"learning_rate": 4.880896407301333e-06,
"loss": 0.064,
"step": 2032
},
{
"epoch": 0.6884087109267528,
"grad_norm": 0.75,
"learning_rate": 4.8712818336382104e-06,
"loss": 0.0589,
"step": 2033
},
{
"epoch": 0.6887473280988762,
"grad_norm": 0.5703125,
"learning_rate": 4.861673688823726e-06,
"loss": 0.0676,
"step": 2034
},
{
"epoch": 0.6890859452709995,
"grad_norm": 0.46484375,
"learning_rate": 4.852071984901696e-06,
"loss": 0.0677,
"step": 2035
},
{
"epoch": 0.6894245624431229,
"grad_norm": 0.67578125,
"learning_rate": 4.842476733907851e-06,
"loss": 0.0656,
"step": 2036
},
{
"epoch": 0.6897631796152462,
"grad_norm": 0.3984375,
"learning_rate": 4.832887947869841e-06,
"loss": 0.0561,
"step": 2037
},
{
"epoch": 0.6901017967873696,
"grad_norm": 0.46875,
"learning_rate": 4.823305638807215e-06,
"loss": 0.0559,
"step": 2038
},
{
"epoch": 0.690440413959493,
"grad_norm": 0.70703125,
"learning_rate": 4.813729818731391e-06,
"loss": 0.0806,
"step": 2039
},
{
"epoch": 0.6907790311316163,
"grad_norm": 0.55859375,
"learning_rate": 4.804160499645667e-06,
"loss": 0.0692,
"step": 2040
},
{
"epoch": 0.6911176483037396,
"grad_norm": 0.546875,
"learning_rate": 4.794597693545179e-06,
"loss": 0.0532,
"step": 2041
},
{
"epoch": 0.6914562654758629,
"grad_norm": 0.435546875,
"learning_rate": 4.785041412416906e-06,
"loss": 0.0625,
"step": 2042
},
{
"epoch": 0.6917948826479863,
"grad_norm": 0.49609375,
"learning_rate": 4.7754916682396545e-06,
"loss": 0.0646,
"step": 2043
},
{
"epoch": 0.6921334998201096,
"grad_norm": 0.38671875,
"learning_rate": 4.76594847298402e-06,
"loss": 0.0437,
"step": 2044
},
{
"epoch": 0.692472116992233,
"grad_norm": 0.494140625,
"learning_rate": 4.756411838612402e-06,
"loss": 0.0618,
"step": 2045
},
{
"epoch": 0.6928107341643563,
"grad_norm": 0.515625,
"learning_rate": 4.746881777078979e-06,
"loss": 0.0675,
"step": 2046
},
{
"epoch": 0.6931493513364797,
"grad_norm": 0.63671875,
"learning_rate": 4.737358300329673e-06,
"loss": 0.0711,
"step": 2047
},
{
"epoch": 0.693487968508603,
"grad_norm": 0.443359375,
"learning_rate": 4.727841420302172e-06,
"loss": 0.0549,
"step": 2048
},
{
"epoch": 0.6938265856807263,
"grad_norm": 0.412109375,
"learning_rate": 4.7183311489258774e-06,
"loss": 0.0567,
"step": 2049
},
{
"epoch": 0.6941652028528497,
"grad_norm": 0.482421875,
"learning_rate": 4.70882749812192e-06,
"loss": 0.058,
"step": 2050
},
{
"epoch": 0.694503820024973,
"grad_norm": 0.48046875,
"learning_rate": 4.699330479803131e-06,
"loss": 0.0677,
"step": 2051
},
{
"epoch": 0.6948424371970964,
"grad_norm": 0.388671875,
"learning_rate": 4.68984010587402e-06,
"loss": 0.0471,
"step": 2052
},
{
"epoch": 0.6951810543692197,
"grad_norm": 0.47265625,
"learning_rate": 4.6803563882307655e-06,
"loss": 0.06,
"step": 2053
},
{
"epoch": 0.6955196715413431,
"grad_norm": 0.482421875,
"learning_rate": 4.670879338761218e-06,
"loss": 0.0604,
"step": 2054
},
{
"epoch": 0.6958582887134663,
"grad_norm": 0.51171875,
"learning_rate": 4.6614089693448515e-06,
"loss": 0.0571,
"step": 2055
},
{
"epoch": 0.6961969058855897,
"grad_norm": 0.50390625,
"learning_rate": 4.651945291852779e-06,
"loss": 0.0746,
"step": 2056
},
{
"epoch": 0.696535523057713,
"grad_norm": 0.60546875,
"learning_rate": 4.642488318147723e-06,
"loss": 0.0734,
"step": 2057
},
{
"epoch": 0.6968741402298364,
"grad_norm": 0.384765625,
"learning_rate": 4.633038060083996e-06,
"loss": 0.0513,
"step": 2058
},
{
"epoch": 0.6972127574019598,
"grad_norm": 0.5859375,
"learning_rate": 4.623594529507503e-06,
"loss": 0.0631,
"step": 2059
},
{
"epoch": 0.6975513745740831,
"grad_norm": 0.5859375,
"learning_rate": 4.6141577382557044e-06,
"loss": 0.0805,
"step": 2060
},
{
"epoch": 0.6978899917462065,
"grad_norm": 0.55859375,
"learning_rate": 4.604727698157621e-06,
"loss": 0.0771,
"step": 2061
},
{
"epoch": 0.6982286089183297,
"grad_norm": 0.46484375,
"learning_rate": 4.5953044210338116e-06,
"loss": 0.0578,
"step": 2062
},
{
"epoch": 0.6985672260904531,
"grad_norm": 0.474609375,
"learning_rate": 4.58588791869635e-06,
"loss": 0.0691,
"step": 2063
},
{
"epoch": 0.6989058432625764,
"grad_norm": 0.439453125,
"learning_rate": 4.576478202948826e-06,
"loss": 0.0568,
"step": 2064
},
{
"epoch": 0.6992444604346998,
"grad_norm": 0.427734375,
"learning_rate": 4.567075285586321e-06,
"loss": 0.0491,
"step": 2065
},
{
"epoch": 0.6995830776068231,
"grad_norm": 0.88671875,
"learning_rate": 4.557679178395387e-06,
"loss": 0.0596,
"step": 2066
},
{
"epoch": 0.6999216947789465,
"grad_norm": 0.458984375,
"learning_rate": 4.5482898931540505e-06,
"loss": 0.0626,
"step": 2067
},
{
"epoch": 0.7002603119510699,
"grad_norm": 0.52734375,
"learning_rate": 4.538907441631776e-06,
"loss": 0.0592,
"step": 2068
},
{
"epoch": 0.7005989291231932,
"grad_norm": 0.43359375,
"learning_rate": 4.5295318355894705e-06,
"loss": 0.0555,
"step": 2069
},
{
"epoch": 0.7009375462953165,
"grad_norm": 0.4296875,
"learning_rate": 4.52016308677946e-06,
"loss": 0.0569,
"step": 2070
},
{
"epoch": 0.7012761634674398,
"grad_norm": 0.55859375,
"learning_rate": 4.5108012069454645e-06,
"loss": 0.066,
"step": 2071
},
{
"epoch": 0.7016147806395632,
"grad_norm": 0.46484375,
"learning_rate": 4.5014462078226064e-06,
"loss": 0.0519,
"step": 2072
},
{
"epoch": 0.7016147806395632,
"eval_loss": 0.06592338532209396,
"eval_runtime": 815.5749,
"eval_samples_per_second": 12.198,
"eval_steps_per_second": 3.049,
"step": 2072
},
{
"epoch": 0.7019533978116865,
"grad_norm": 0.41796875,
"learning_rate": 4.492098101137382e-06,
"loss": 0.0626,
"step": 2073
},
{
"epoch": 0.7022920149838099,
"grad_norm": 0.52734375,
"learning_rate": 4.482756898607633e-06,
"loss": 0.072,
"step": 2074
},
{
"epoch": 0.7026306321559332,
"grad_norm": 0.44140625,
"learning_rate": 4.4734226119425615e-06,
"loss": 0.0637,
"step": 2075
},
{
"epoch": 0.7029692493280566,
"grad_norm": 0.42578125,
"learning_rate": 4.464095252842703e-06,
"loss": 0.0534,
"step": 2076
},
{
"epoch": 0.7033078665001798,
"grad_norm": 0.37890625,
"learning_rate": 4.454774832999893e-06,
"loss": 0.0472,
"step": 2077
},
{
"epoch": 0.7036464836723032,
"grad_norm": 0.640625,
"learning_rate": 4.445461364097288e-06,
"loss": 0.0701,
"step": 2078
},
{
"epoch": 0.7039851008444266,
"grad_norm": 0.78125,
"learning_rate": 4.436154857809314e-06,
"loss": 0.0544,
"step": 2079
},
{
"epoch": 0.7043237180165499,
"grad_norm": 0.6171875,
"learning_rate": 4.42685532580168e-06,
"loss": 0.0479,
"step": 2080
},
{
"epoch": 0.7046623351886733,
"grad_norm": 0.60546875,
"learning_rate": 4.417562779731355e-06,
"loss": 0.0743,
"step": 2081
},
{
"epoch": 0.7050009523607966,
"grad_norm": 0.41796875,
"learning_rate": 4.408277231246539e-06,
"loss": 0.0463,
"step": 2082
},
{
"epoch": 0.70533956953292,
"grad_norm": 0.439453125,
"learning_rate": 4.3989986919866716e-06,
"loss": 0.0552,
"step": 2083
},
{
"epoch": 0.7056781867050432,
"grad_norm": 0.478515625,
"learning_rate": 4.3897271735824045e-06,
"loss": 0.0654,
"step": 2084
},
{
"epoch": 0.7060168038771666,
"grad_norm": 0.404296875,
"learning_rate": 4.380462687655581e-06,
"loss": 0.053,
"step": 2085
},
{
"epoch": 0.7063554210492899,
"grad_norm": 0.474609375,
"learning_rate": 4.371205245819241e-06,
"loss": 0.0636,
"step": 2086
},
{
"epoch": 0.7066940382214133,
"grad_norm": 0.46875,
"learning_rate": 4.361954859677584e-06,
"loss": 0.0645,
"step": 2087
},
{
"epoch": 0.7070326553935367,
"grad_norm": 0.609375,
"learning_rate": 4.35271154082597e-06,
"loss": 0.0726,
"step": 2088
},
{
"epoch": 0.70737127256566,
"grad_norm": 0.578125,
"learning_rate": 4.343475300850907e-06,
"loss": 0.0656,
"step": 2089
},
{
"epoch": 0.7077098897377834,
"grad_norm": 0.55859375,
"learning_rate": 4.334246151330012e-06,
"loss": 0.0644,
"step": 2090
},
{
"epoch": 0.7080485069099066,
"grad_norm": 0.515625,
"learning_rate": 4.32502410383203e-06,
"loss": 0.0722,
"step": 2091
},
{
"epoch": 0.70838712408203,
"grad_norm": 0.4375,
"learning_rate": 4.315809169916802e-06,
"loss": 0.0505,
"step": 2092
},
{
"epoch": 0.7087257412541533,
"grad_norm": 0.4140625,
"learning_rate": 4.306601361135241e-06,
"loss": 0.0484,
"step": 2093
},
{
"epoch": 0.7090643584262767,
"grad_norm": 0.42578125,
"learning_rate": 4.297400689029344e-06,
"loss": 0.0606,
"step": 2094
},
{
"epoch": 0.7094029755984,
"grad_norm": 0.6484375,
"learning_rate": 4.2882071651321485e-06,
"loss": 0.0702,
"step": 2095
},
{
"epoch": 0.7097415927705234,
"grad_norm": 0.43359375,
"learning_rate": 4.279020800967736e-06,
"loss": 0.0488,
"step": 2096
},
{
"epoch": 0.7100802099426468,
"grad_norm": 0.4140625,
"learning_rate": 4.2698416080512204e-06,
"loss": 0.0486,
"step": 2097
},
{
"epoch": 0.7104188271147701,
"grad_norm": 0.462890625,
"learning_rate": 4.260669597888715e-06,
"loss": 0.0501,
"step": 2098
},
{
"epoch": 0.7107574442868934,
"grad_norm": 0.62890625,
"learning_rate": 4.251504781977337e-06,
"loss": 0.0779,
"step": 2099
},
{
"epoch": 0.7110960614590167,
"grad_norm": 0.4453125,
"learning_rate": 4.24234717180519e-06,
"loss": 0.0576,
"step": 2100
},
{
"epoch": 0.7114346786311401,
"grad_norm": 0.49609375,
"learning_rate": 4.2331967788513295e-06,
"loss": 0.0669,
"step": 2101
},
{
"epoch": 0.7117732958032634,
"grad_norm": 0.451171875,
"learning_rate": 4.224053614585779e-06,
"loss": 0.0635,
"step": 2102
},
{
"epoch": 0.7121119129753868,
"grad_norm": 0.482421875,
"learning_rate": 4.214917690469499e-06,
"loss": 0.0612,
"step": 2103
},
{
"epoch": 0.7124505301475101,
"grad_norm": 0.55078125,
"learning_rate": 4.205789017954364e-06,
"loss": 0.0592,
"step": 2104
},
{
"epoch": 0.7127891473196335,
"grad_norm": 0.52734375,
"learning_rate": 4.1966676084831715e-06,
"loss": 0.065,
"step": 2105
},
{
"epoch": 0.7131277644917567,
"grad_norm": 0.380859375,
"learning_rate": 4.187553473489604e-06,
"loss": 0.0527,
"step": 2106
},
{
"epoch": 0.7134663816638801,
"grad_norm": 0.4296875,
"learning_rate": 4.178446624398233e-06,
"loss": 0.0521,
"step": 2107
},
{
"epoch": 0.7138049988360035,
"grad_norm": 0.486328125,
"learning_rate": 4.169347072624497e-06,
"loss": 0.071,
"step": 2108
},
{
"epoch": 0.7141436160081268,
"grad_norm": 0.392578125,
"learning_rate": 4.160254829574679e-06,
"loss": 0.0548,
"step": 2109
},
{
"epoch": 0.7144822331802502,
"grad_norm": 0.423828125,
"learning_rate": 4.15116990664591e-06,
"loss": 0.0579,
"step": 2110
},
{
"epoch": 0.7148208503523735,
"grad_norm": 0.400390625,
"learning_rate": 4.142092315226146e-06,
"loss": 0.0456,
"step": 2111
},
{
"epoch": 0.7151594675244969,
"grad_norm": 0.62890625,
"learning_rate": 4.13302206669414e-06,
"loss": 0.0613,
"step": 2112
},
{
"epoch": 0.7154980846966201,
"grad_norm": 0.46875,
"learning_rate": 4.123959172419456e-06,
"loss": 0.0577,
"step": 2113
},
{
"epoch": 0.7158367018687435,
"grad_norm": 0.421875,
"learning_rate": 4.114903643762428e-06,
"loss": 0.0641,
"step": 2114
},
{
"epoch": 0.7161753190408668,
"grad_norm": 0.64453125,
"learning_rate": 4.1058554920741635e-06,
"loss": 0.079,
"step": 2115
},
{
"epoch": 0.7165139362129902,
"grad_norm": 0.42578125,
"learning_rate": 4.096814728696529e-06,
"loss": 0.0563,
"step": 2116
},
{
"epoch": 0.7168525533851136,
"grad_norm": 0.4921875,
"learning_rate": 4.087781364962108e-06,
"loss": 0.0606,
"step": 2117
},
{
"epoch": 0.7171911705572369,
"grad_norm": 0.5078125,
"learning_rate": 4.078755412194228e-06,
"loss": 0.0593,
"step": 2118
},
{
"epoch": 0.7175297877293603,
"grad_norm": 0.51171875,
"learning_rate": 4.069736881706929e-06,
"loss": 0.0645,
"step": 2119
},
{
"epoch": 0.7178684049014835,
"grad_norm": 0.455078125,
"learning_rate": 4.06072578480493e-06,
"loss": 0.048,
"step": 2120
},
{
"epoch": 0.7182070220736069,
"grad_norm": 0.46875,
"learning_rate": 4.051722132783644e-06,
"loss": 0.0683,
"step": 2121
},
{
"epoch": 0.7185456392457302,
"grad_norm": 0.39453125,
"learning_rate": 4.042725936929157e-06,
"loss": 0.0465,
"step": 2122
},
{
"epoch": 0.7188842564178536,
"grad_norm": 0.50390625,
"learning_rate": 4.0337372085181905e-06,
"loss": 0.0717,
"step": 2123
},
{
"epoch": 0.7192228735899769,
"grad_norm": 0.55859375,
"learning_rate": 4.024755958818125e-06,
"loss": 0.0725,
"step": 2124
},
{
"epoch": 0.7195614907621003,
"grad_norm": 0.478515625,
"learning_rate": 4.0157821990869505e-06,
"loss": 0.0528,
"step": 2125
},
{
"epoch": 0.7199001079342237,
"grad_norm": 0.498046875,
"learning_rate": 4.006815940573279e-06,
"loss": 0.0793,
"step": 2126
},
{
"epoch": 0.720238725106347,
"grad_norm": 0.625,
"learning_rate": 3.997857194516319e-06,
"loss": 0.0728,
"step": 2127
},
{
"epoch": 0.7205773422784703,
"grad_norm": 0.5546875,
"learning_rate": 3.988905972145854e-06,
"loss": 0.0728,
"step": 2128
},
{
"epoch": 0.7209159594505936,
"grad_norm": 0.5234375,
"learning_rate": 3.979962284682245e-06,
"loss": 0.0724,
"step": 2129
},
{
"epoch": 0.721254576622717,
"grad_norm": 0.51953125,
"learning_rate": 3.971026143336409e-06,
"loss": 0.0748,
"step": 2130
},
{
"epoch": 0.7215931937948403,
"grad_norm": 0.3984375,
"learning_rate": 3.96209755930979e-06,
"loss": 0.0569,
"step": 2131
},
{
"epoch": 0.7219318109669637,
"grad_norm": 0.51953125,
"learning_rate": 3.953176543794378e-06,
"loss": 0.0673,
"step": 2132
},
{
"epoch": 0.722270428139087,
"grad_norm": 0.458984375,
"learning_rate": 3.94426310797266e-06,
"loss": 0.0556,
"step": 2133
},
{
"epoch": 0.7226090453112104,
"grad_norm": 0.447265625,
"learning_rate": 3.935357263017633e-06,
"loss": 0.0616,
"step": 2134
},
{
"epoch": 0.7229476624833336,
"grad_norm": 0.51171875,
"learning_rate": 3.926459020092774e-06,
"loss": 0.066,
"step": 2135
},
{
"epoch": 0.723286279655457,
"grad_norm": 1.4296875,
"learning_rate": 3.917568390352029e-06,
"loss": 0.0712,
"step": 2136
},
{
"epoch": 0.7236248968275804,
"grad_norm": 0.55859375,
"learning_rate": 3.908685384939807e-06,
"loss": 0.0741,
"step": 2137
},
{
"epoch": 0.7239635139997037,
"grad_norm": 0.51953125,
"learning_rate": 3.899810014990953e-06,
"loss": 0.0728,
"step": 2138
},
{
"epoch": 0.7243021311718271,
"grad_norm": 0.52734375,
"learning_rate": 3.890942291630739e-06,
"loss": 0.0746,
"step": 2139
},
{
"epoch": 0.7246407483439504,
"grad_norm": 0.484375,
"learning_rate": 3.8820822259748645e-06,
"loss": 0.0595,
"step": 2140
},
{
"epoch": 0.7249793655160738,
"grad_norm": 0.455078125,
"learning_rate": 3.873229829129423e-06,
"loss": 0.053,
"step": 2141
},
{
"epoch": 0.725317982688197,
"grad_norm": 0.41015625,
"learning_rate": 3.864385112190889e-06,
"loss": 0.0526,
"step": 2142
},
{
"epoch": 0.7256565998603204,
"grad_norm": 0.6171875,
"learning_rate": 3.8555480862461214e-06,
"loss": 0.0773,
"step": 2143
},
{
"epoch": 0.7259952170324437,
"grad_norm": 0.5078125,
"learning_rate": 3.846718762372328e-06,
"loss": 0.0595,
"step": 2144
},
{
"epoch": 0.7263338342045671,
"grad_norm": 0.58203125,
"learning_rate": 3.837897151637069e-06,
"loss": 0.073,
"step": 2145
},
{
"epoch": 0.7266724513766905,
"grad_norm": 0.4140625,
"learning_rate": 3.829083265098236e-06,
"loss": 0.0546,
"step": 2146
},
{
"epoch": 0.7270110685488138,
"grad_norm": 0.4921875,
"learning_rate": 3.820277113804034e-06,
"loss": 0.0585,
"step": 2147
},
{
"epoch": 0.7273496857209372,
"grad_norm": 0.67578125,
"learning_rate": 3.811478708792975e-06,
"loss": 0.0918,
"step": 2148
},
{
"epoch": 0.7276883028930604,
"grad_norm": 0.43359375,
"learning_rate": 3.802688061093864e-06,
"loss": 0.0533,
"step": 2149
},
{
"epoch": 0.7280269200651838,
"grad_norm": 0.466796875,
"learning_rate": 3.793905181725772e-06,
"loss": 0.0574,
"step": 2150
},
{
"epoch": 0.7283655372373071,
"grad_norm": 0.640625,
"learning_rate": 3.785130081698045e-06,
"loss": 0.0713,
"step": 2151
},
{
"epoch": 0.7287041544094305,
"grad_norm": 0.47265625,
"learning_rate": 3.776362772010267e-06,
"loss": 0.0664,
"step": 2152
},
{
"epoch": 0.7290427715815538,
"grad_norm": 0.515625,
"learning_rate": 3.767603263652263e-06,
"loss": 0.0622,
"step": 2153
},
{
"epoch": 0.7293813887536772,
"grad_norm": 0.50390625,
"learning_rate": 3.7588515676040805e-06,
"loss": 0.0673,
"step": 2154
},
{
"epoch": 0.7297200059258006,
"grad_norm": 0.4765625,
"learning_rate": 3.750107694835966e-06,
"loss": 0.0663,
"step": 2155
},
{
"epoch": 0.7300586230979239,
"grad_norm": 0.54296875,
"learning_rate": 3.7413716563083704e-06,
"loss": 0.0625,
"step": 2156
},
{
"epoch": 0.7303972402700472,
"grad_norm": 0.42578125,
"learning_rate": 3.7326434629719122e-06,
"loss": 0.0558,
"step": 2157
},
{
"epoch": 0.7307358574421705,
"grad_norm": 0.546875,
"learning_rate": 3.723923125767389e-06,
"loss": 0.0678,
"step": 2158
},
{
"epoch": 0.7310744746142939,
"grad_norm": 0.447265625,
"learning_rate": 3.715210655625738e-06,
"loss": 0.0477,
"step": 2159
},
{
"epoch": 0.7314130917864172,
"grad_norm": 0.474609375,
"learning_rate": 3.7065060634680485e-06,
"loss": 0.0604,
"step": 2160
},
{
"epoch": 0.7317517089585406,
"grad_norm": 0.67578125,
"learning_rate": 3.6978093602055186e-06,
"loss": 0.0876,
"step": 2161
},
{
"epoch": 0.7320903261306639,
"grad_norm": 0.54296875,
"learning_rate": 3.689120556739475e-06,
"loss": 0.073,
"step": 2162
},
{
"epoch": 0.7324289433027873,
"grad_norm": 0.455078125,
"learning_rate": 3.6804396639613273e-06,
"loss": 0.0456,
"step": 2163
},
{
"epoch": 0.7327675604749105,
"grad_norm": 0.734375,
"learning_rate": 3.6717666927525765e-06,
"loss": 0.1512,
"step": 2164
},
{
"epoch": 0.7331061776470339,
"grad_norm": 0.51953125,
"learning_rate": 3.6631016539847987e-06,
"loss": 0.0597,
"step": 2165
},
{
"epoch": 0.7334447948191573,
"grad_norm": 0.5,
"learning_rate": 3.654444558519612e-06,
"loss": 0.059,
"step": 2166
},
{
"epoch": 0.7337834119912806,
"grad_norm": 0.546875,
"learning_rate": 3.6457954172086895e-06,
"loss": 0.0734,
"step": 2167
},
{
"epoch": 0.734122029163404,
"grad_norm": 0.53125,
"learning_rate": 3.6371542408937355e-06,
"loss": 0.0575,
"step": 2168
},
{
"epoch": 0.7344606463355273,
"grad_norm": 0.4140625,
"learning_rate": 3.6285210404064587e-06,
"loss": 0.0573,
"step": 2169
},
{
"epoch": 0.7347992635076507,
"grad_norm": 0.431640625,
"learning_rate": 3.619895826568581e-06,
"loss": 0.0489,
"step": 2170
},
{
"epoch": 0.7351378806797739,
"grad_norm": 0.48828125,
"learning_rate": 3.611278610191804e-06,
"loss": 0.0538,
"step": 2171
},
{
"epoch": 0.7354764978518973,
"grad_norm": 0.55859375,
"learning_rate": 3.602669402077811e-06,
"loss": 0.0678,
"step": 2172
},
{
"epoch": 0.7358151150240206,
"grad_norm": 0.54296875,
"learning_rate": 3.594068213018249e-06,
"loss": 0.052,
"step": 2173
},
{
"epoch": 0.736153732196144,
"grad_norm": 0.43359375,
"learning_rate": 3.5854750537947035e-06,
"loss": 0.0622,
"step": 2174
},
{
"epoch": 0.7364923493682674,
"grad_norm": 0.6953125,
"learning_rate": 3.5768899351787066e-06,
"loss": 0.0634,
"step": 2175
},
{
"epoch": 0.7368309665403907,
"grad_norm": 0.59765625,
"learning_rate": 3.568312867931697e-06,
"loss": 0.1019,
"step": 2176
},
{
"epoch": 0.7371695837125141,
"grad_norm": 0.5078125,
"learning_rate": 3.559743862805034e-06,
"loss": 0.0662,
"step": 2177
},
{
"epoch": 0.7375082008846373,
"grad_norm": 0.51953125,
"learning_rate": 3.551182930539969e-06,
"loss": 0.0743,
"step": 2178
},
{
"epoch": 0.7378468180567607,
"grad_norm": 0.5703125,
"learning_rate": 3.5426300818676264e-06,
"loss": 0.072,
"step": 2179
},
{
"epoch": 0.738185435228884,
"grad_norm": 0.466796875,
"learning_rate": 3.534085327509006e-06,
"loss": 0.0677,
"step": 2180
},
{
"epoch": 0.7385240524010074,
"grad_norm": 0.4609375,
"learning_rate": 3.525548678174957e-06,
"loss": 0.0604,
"step": 2181
},
{
"epoch": 0.7388626695731307,
"grad_norm": 0.5078125,
"learning_rate": 3.5170201445661655e-06,
"loss": 0.0628,
"step": 2182
},
{
"epoch": 0.7392012867452541,
"grad_norm": 0.38671875,
"learning_rate": 3.5084997373731546e-06,
"loss": 0.0482,
"step": 2183
},
{
"epoch": 0.7395399039173775,
"grad_norm": 0.498046875,
"learning_rate": 3.4999874672762567e-06,
"loss": 0.0587,
"step": 2184
},
{
"epoch": 0.7398785210895007,
"grad_norm": 2.1875,
"learning_rate": 3.4914833449455963e-06,
"loss": 0.0638,
"step": 2185
},
{
"epoch": 0.740217138261624,
"grad_norm": 0.5546875,
"learning_rate": 3.482987381041096e-06,
"loss": 0.0692,
"step": 2186
},
{
"epoch": 0.7405557554337474,
"grad_norm": 0.4140625,
"learning_rate": 3.4744995862124498e-06,
"loss": 0.0501,
"step": 2187
},
{
"epoch": 0.7408943726058708,
"grad_norm": 0.48046875,
"learning_rate": 3.4660199710991038e-06,
"loss": 0.0731,
"step": 2188
},
{
"epoch": 0.7412329897779941,
"grad_norm": 0.62890625,
"learning_rate": 3.4575485463302603e-06,
"loss": 0.1051,
"step": 2189
},
{
"epoch": 0.7415716069501175,
"grad_norm": 0.44140625,
"learning_rate": 3.449085322524848e-06,
"loss": 0.0553,
"step": 2190
},
{
"epoch": 0.7419102241222408,
"grad_norm": 0.55078125,
"learning_rate": 3.440630310291517e-06,
"loss": 0.0543,
"step": 2191
},
{
"epoch": 0.7422488412943642,
"grad_norm": 0.51953125,
"learning_rate": 3.432183520228635e-06,
"loss": 0.0701,
"step": 2192
},
{
"epoch": 0.7425874584664874,
"grad_norm": 0.53125,
"learning_rate": 3.4237449629242427e-06,
"loss": 0.0757,
"step": 2193
},
{
"epoch": 0.7429260756386108,
"grad_norm": 0.42578125,
"learning_rate": 3.4153146489560807e-06,
"loss": 0.0497,
"step": 2194
},
{
"epoch": 0.7432646928107342,
"grad_norm": 0.5625,
"learning_rate": 3.4068925888915417e-06,
"loss": 0.0708,
"step": 2195
},
{
"epoch": 0.7436033099828575,
"grad_norm": 0.4765625,
"learning_rate": 3.398478793287682e-06,
"loss": 0.0616,
"step": 2196
},
{
"epoch": 0.7439419271549809,
"grad_norm": 0.44140625,
"learning_rate": 3.390073272691198e-06,
"loss": 0.0545,
"step": 2197
},
{
"epoch": 0.7442805443271042,
"grad_norm": 0.396484375,
"learning_rate": 3.381676037638404e-06,
"loss": 0.0548,
"step": 2198
},
{
"epoch": 0.7446191614992276,
"grad_norm": 0.50390625,
"learning_rate": 3.3732870986552392e-06,
"loss": 0.0593,
"step": 2199
},
{
"epoch": 0.7449577786713508,
"grad_norm": 0.6953125,
"learning_rate": 3.3649064662572406e-06,
"loss": 0.0843,
"step": 2200
},
{
"epoch": 0.7452963958434742,
"grad_norm": 0.578125,
"learning_rate": 3.35653415094953e-06,
"loss": 0.0733,
"step": 2201
},
{
"epoch": 0.7456350130155975,
"grad_norm": 0.56640625,
"learning_rate": 3.3481701632268014e-06,
"loss": 0.0623,
"step": 2202
},
{
"epoch": 0.7459736301877209,
"grad_norm": 0.439453125,
"learning_rate": 3.339814513573321e-06,
"loss": 0.059,
"step": 2203
},
{
"epoch": 0.7463122473598442,
"grad_norm": 0.37890625,
"learning_rate": 3.3314672124628877e-06,
"loss": 0.0464,
"step": 2204
},
{
"epoch": 0.7466508645319676,
"grad_norm": 0.55078125,
"learning_rate": 3.323128270358851e-06,
"loss": 0.0573,
"step": 2205
},
{
"epoch": 0.746989481704091,
"grad_norm": 0.51171875,
"learning_rate": 3.3147976977140763e-06,
"loss": 0.0692,
"step": 2206
},
{
"epoch": 0.7473280988762142,
"grad_norm": 0.423828125,
"learning_rate": 3.3064755049709307e-06,
"loss": 0.0537,
"step": 2207
},
{
"epoch": 0.7476667160483376,
"grad_norm": 0.4375,
"learning_rate": 3.2981617025612913e-06,
"loss": 0.0586,
"step": 2208
},
{
"epoch": 0.7480053332204609,
"grad_norm": 0.58203125,
"learning_rate": 3.289856300906502e-06,
"loss": 0.0716,
"step": 2209
},
{
"epoch": 0.7483439503925843,
"grad_norm": 0.515625,
"learning_rate": 3.2815593104173882e-06,
"loss": 0.0656,
"step": 2210
},
{
"epoch": 0.7486825675647076,
"grad_norm": 0.41796875,
"learning_rate": 3.273270741494232e-06,
"loss": 0.055,
"step": 2211
},
{
"epoch": 0.749021184736831,
"grad_norm": 0.462890625,
"learning_rate": 3.264990604526749e-06,
"loss": 0.0665,
"step": 2212
},
{
"epoch": 0.7493598019089543,
"grad_norm": 0.48046875,
"learning_rate": 3.2567189098940966e-06,
"loss": 0.0582,
"step": 2213
},
{
"epoch": 0.7496984190810776,
"grad_norm": 0.5703125,
"learning_rate": 3.2484556679648393e-06,
"loss": 0.084,
"step": 2214
},
{
"epoch": 0.750037036253201,
"grad_norm": 0.51171875,
"learning_rate": 3.240200889096955e-06,
"loss": 0.0749,
"step": 2215
},
{
"epoch": 0.7503756534253243,
"grad_norm": 0.4921875,
"learning_rate": 3.231954583637812e-06,
"loss": 0.0605,
"step": 2216
},
{
"epoch": 0.7507142705974477,
"grad_norm": 0.4609375,
"learning_rate": 3.2237167619241492e-06,
"loss": 0.0609,
"step": 2217
},
{
"epoch": 0.751052887769571,
"grad_norm": 0.44921875,
"learning_rate": 3.2154874342820797e-06,
"loss": 0.0622,
"step": 2218
},
{
"epoch": 0.7513915049416944,
"grad_norm": 0.3828125,
"learning_rate": 3.207266611027069e-06,
"loss": 0.0463,
"step": 2219
},
{
"epoch": 0.7517301221138177,
"grad_norm": 0.5078125,
"learning_rate": 3.199054302463914e-06,
"loss": 0.0745,
"step": 2220
},
{
"epoch": 0.7520687392859411,
"grad_norm": 0.453125,
"learning_rate": 3.1908505188867513e-06,
"loss": 0.0582,
"step": 2221
},
{
"epoch": 0.7524073564580643,
"grad_norm": 0.5,
"learning_rate": 3.1826552705790192e-06,
"loss": 0.0577,
"step": 2222
},
{
"epoch": 0.7527459736301877,
"grad_norm": 0.486328125,
"learning_rate": 3.174468567813461e-06,
"loss": 0.0602,
"step": 2223
},
{
"epoch": 0.753084590802311,
"grad_norm": 0.51171875,
"learning_rate": 3.166290420852114e-06,
"loss": 0.0631,
"step": 2224
},
{
"epoch": 0.7534232079744344,
"grad_norm": 1.5078125,
"learning_rate": 3.1581208399462804e-06,
"loss": 0.0609,
"step": 2225
},
{
"epoch": 0.7537618251465578,
"grad_norm": 0.369140625,
"learning_rate": 3.1499598353365334e-06,
"loss": 0.0447,
"step": 2226
},
{
"epoch": 0.7541004423186811,
"grad_norm": 0.47265625,
"learning_rate": 3.141807417252697e-06,
"loss": 0.0606,
"step": 2227
},
{
"epoch": 0.7544390594908045,
"grad_norm": 0.396484375,
"learning_rate": 3.1336635959138197e-06,
"loss": 0.054,
"step": 2228
},
{
"epoch": 0.7547776766629277,
"grad_norm": 0.453125,
"learning_rate": 3.1255283815281876e-06,
"loss": 0.0674,
"step": 2229
},
{
"epoch": 0.7551162938350511,
"grad_norm": 0.466796875,
"learning_rate": 3.1174017842932946e-06,
"loss": 0.0645,
"step": 2230
},
{
"epoch": 0.7554549110071744,
"grad_norm": 0.51953125,
"learning_rate": 3.109283814395825e-06,
"loss": 0.0789,
"step": 2231
},
{
"epoch": 0.7557935281792978,
"grad_norm": 0.71875,
"learning_rate": 3.1011744820116607e-06,
"loss": 0.1046,
"step": 2232
},
{
"epoch": 0.7561321453514211,
"grad_norm": 0.56640625,
"learning_rate": 3.0930737973058443e-06,
"loss": 0.076,
"step": 2233
},
{
"epoch": 0.7564707625235445,
"grad_norm": 0.48828125,
"learning_rate": 3.084981770432588e-06,
"loss": 0.0705,
"step": 2234
},
{
"epoch": 0.7568093796956679,
"grad_norm": 0.62109375,
"learning_rate": 3.076898411535252e-06,
"loss": 0.0654,
"step": 2235
},
{
"epoch": 0.7571479968677911,
"grad_norm": 0.4609375,
"learning_rate": 3.06882373074632e-06,
"loss": 0.0583,
"step": 2236
},
{
"epoch": 0.7574866140399145,
"grad_norm": 0.484375,
"learning_rate": 3.0607577381874088e-06,
"loss": 0.0562,
"step": 2237
},
{
"epoch": 0.7578252312120378,
"grad_norm": 0.53125,
"learning_rate": 3.0527004439692433e-06,
"loss": 0.0584,
"step": 2238
},
{
"epoch": 0.7581638483841612,
"grad_norm": 0.5,
"learning_rate": 3.044651858191636e-06,
"loss": 0.0621,
"step": 2239
},
{
"epoch": 0.7585024655562845,
"grad_norm": 0.80078125,
"learning_rate": 3.0366119909434977e-06,
"loss": 0.0545,
"step": 2240
},
{
"epoch": 0.7588410827284079,
"grad_norm": 0.470703125,
"learning_rate": 3.0285808523027936e-06,
"loss": 0.0627,
"step": 2241
},
{
"epoch": 0.7591796999005312,
"grad_norm": 0.482421875,
"learning_rate": 3.0205584523365626e-06,
"loss": 0.0729,
"step": 2242
},
{
"epoch": 0.7595183170726545,
"grad_norm": 0.453125,
"learning_rate": 3.0125448011008894e-06,
"loss": 0.0605,
"step": 2243
},
{
"epoch": 0.7598569342447778,
"grad_norm": 0.59765625,
"learning_rate": 3.004539908640872e-06,
"loss": 0.0855,
"step": 2244
},
{
"epoch": 0.7601955514169012,
"grad_norm": 0.466796875,
"learning_rate": 2.996543784990653e-06,
"loss": 0.0587,
"step": 2245
},
{
"epoch": 0.7605341685890246,
"grad_norm": 0.412109375,
"learning_rate": 2.9885564401733745e-06,
"loss": 0.0519,
"step": 2246
},
{
"epoch": 0.7608727857611479,
"grad_norm": 0.5,
"learning_rate": 2.980577884201169e-06,
"loss": 0.0668,
"step": 2247
},
{
"epoch": 0.7612114029332713,
"grad_norm": 0.46484375,
"learning_rate": 2.9726081270751594e-06,
"loss": 0.0552,
"step": 2248
},
{
"epoch": 0.7615500201053946,
"grad_norm": 0.443359375,
"learning_rate": 2.9646471787854416e-06,
"loss": 0.0611,
"step": 2249
},
{
"epoch": 0.761888637277518,
"grad_norm": 0.50390625,
"learning_rate": 2.956695049311057e-06,
"loss": 0.0522,
"step": 2250
},
{
"epoch": 0.7622272544496412,
"grad_norm": 0.462890625,
"learning_rate": 2.948751748620007e-06,
"loss": 0.0615,
"step": 2251
},
{
"epoch": 0.7625658716217646,
"grad_norm": 0.322265625,
"learning_rate": 2.940817286669214e-06,
"loss": 0.0447,
"step": 2252
},
{
"epoch": 0.762904488793888,
"grad_norm": 0.5,
"learning_rate": 2.93289167340453e-06,
"loss": 0.0689,
"step": 2253
},
{
"epoch": 0.7632431059660113,
"grad_norm": 0.51171875,
"learning_rate": 2.9249749187607146e-06,
"loss": 0.0608,
"step": 2254
},
{
"epoch": 0.7635817231381347,
"grad_norm": 0.62109375,
"learning_rate": 2.917067032661415e-06,
"loss": 0.0734,
"step": 2255
},
{
"epoch": 0.763920340310258,
"grad_norm": 0.50390625,
"learning_rate": 2.909168025019168e-06,
"loss": 0.071,
"step": 2256
},
{
"epoch": 0.7642589574823814,
"grad_norm": 0.51953125,
"learning_rate": 2.901277905735386e-06,
"loss": 0.0604,
"step": 2257
},
{
"epoch": 0.7645975746545046,
"grad_norm": 0.6484375,
"learning_rate": 2.893396684700326e-06,
"loss": 0.0887,
"step": 2258
},
{
"epoch": 0.764936191826628,
"grad_norm": 0.50390625,
"learning_rate": 2.885524371793106e-06,
"loss": 0.0768,
"step": 2259
},
{
"epoch": 0.7652748089987513,
"grad_norm": 0.353515625,
"learning_rate": 2.8776609768816655e-06,
"loss": 0.0521,
"step": 2260
},
{
"epoch": 0.7656134261708747,
"grad_norm": 0.474609375,
"learning_rate": 2.8698065098227725e-06,
"loss": 0.0669,
"step": 2261
},
{
"epoch": 0.765952043342998,
"grad_norm": 0.4765625,
"learning_rate": 2.8619609804620063e-06,
"loss": 0.0602,
"step": 2262
},
{
"epoch": 0.7662906605151214,
"grad_norm": 0.44140625,
"learning_rate": 2.854124398633732e-06,
"loss": 0.0546,
"step": 2263
},
{
"epoch": 0.7666292776872448,
"grad_norm": 0.451171875,
"learning_rate": 2.846296774161108e-06,
"loss": 0.0598,
"step": 2264
},
{
"epoch": 0.766967894859368,
"grad_norm": 0.421875,
"learning_rate": 2.8384781168560693e-06,
"loss": 0.06,
"step": 2265
},
{
"epoch": 0.7673065120314914,
"grad_norm": 0.4921875,
"learning_rate": 2.8306684365192915e-06,
"loss": 0.0638,
"step": 2266
},
{
"epoch": 0.7676451292036147,
"grad_norm": 0.5,
"learning_rate": 2.822867742940214e-06,
"loss": 0.0603,
"step": 2267
},
{
"epoch": 0.7679837463757381,
"grad_norm": 0.431640625,
"learning_rate": 2.8150760458970115e-06,
"loss": 0.0627,
"step": 2268
},
{
"epoch": 0.7683223635478614,
"grad_norm": 0.5546875,
"learning_rate": 2.8072933551565706e-06,
"loss": 0.0726,
"step": 2269
},
{
"epoch": 0.7686609807199848,
"grad_norm": 0.56640625,
"learning_rate": 2.7995196804745005e-06,
"loss": 0.0813,
"step": 2270
},
{
"epoch": 0.7689995978921081,
"grad_norm": 0.5703125,
"learning_rate": 2.791755031595096e-06,
"loss": 0.072,
"step": 2271
},
{
"epoch": 0.7693382150642314,
"grad_norm": 0.66796875,
"learning_rate": 2.7839994182513496e-06,
"loss": 0.0566,
"step": 2272
},
{
"epoch": 0.7696768322363547,
"grad_norm": 0.58203125,
"learning_rate": 2.7762528501649256e-06,
"loss": 0.0811,
"step": 2273
},
{
"epoch": 0.7700154494084781,
"grad_norm": 0.46484375,
"learning_rate": 2.7685153370461424e-06,
"loss": 0.0523,
"step": 2274
},
{
"epoch": 0.7703540665806015,
"grad_norm": 0.40234375,
"learning_rate": 2.760786888593975e-06,
"loss": 0.0529,
"step": 2275
},
{
"epoch": 0.7706926837527248,
"grad_norm": 0.359375,
"learning_rate": 2.7530675144960382e-06,
"loss": 0.0425,
"step": 2276
},
{
"epoch": 0.7710313009248482,
"grad_norm": 0.7578125,
"learning_rate": 2.745357224428563e-06,
"loss": 0.0685,
"step": 2277
},
{
"epoch": 0.7713699180969715,
"grad_norm": 1.9296875,
"learning_rate": 2.7376560280564025e-06,
"loss": 0.0716,
"step": 2278
},
{
"epoch": 0.7717085352690949,
"grad_norm": 0.6015625,
"learning_rate": 2.729963935033002e-06,
"loss": 0.0661,
"step": 2279
},
{
"epoch": 0.7720471524412181,
"grad_norm": 0.57421875,
"learning_rate": 2.722280955000404e-06,
"loss": 0.0642,
"step": 2280
},
{
"epoch": 0.7723857696133415,
"grad_norm": 0.625,
"learning_rate": 2.714607097589226e-06,
"loss": 0.1111,
"step": 2281
},
{
"epoch": 0.7727243867854648,
"grad_norm": 0.59375,
"learning_rate": 2.706942372418645e-06,
"loss": 0.0496,
"step": 2282
},
{
"epoch": 0.7730630039575882,
"grad_norm": 0.55859375,
"learning_rate": 2.699286789096397e-06,
"loss": 0.0549,
"step": 2283
},
{
"epoch": 0.7734016211297116,
"grad_norm": 0.5,
"learning_rate": 2.691640357218759e-06,
"loss": 0.0606,
"step": 2284
},
{
"epoch": 0.7737402383018349,
"grad_norm": 0.5234375,
"learning_rate": 2.684003086370528e-06,
"loss": 0.0644,
"step": 2285
},
{
"epoch": 0.7740788554739583,
"grad_norm": 0.52734375,
"learning_rate": 2.6763749861250297e-06,
"loss": 0.0697,
"step": 2286
},
{
"epoch": 0.7744174726460815,
"grad_norm": 0.52734375,
"learning_rate": 2.6687560660440858e-06,
"loss": 0.0635,
"step": 2287
},
{
"epoch": 0.7747560898182049,
"grad_norm": 0.41796875,
"learning_rate": 2.66114633567801e-06,
"loss": 0.0387,
"step": 2288
},
{
"epoch": 0.7750947069903282,
"grad_norm": 0.478515625,
"learning_rate": 2.653545804565606e-06,
"loss": 0.071,
"step": 2289
},
{
"epoch": 0.7754333241624516,
"grad_norm": 0.55078125,
"learning_rate": 2.645954482234133e-06,
"loss": 0.0768,
"step": 2290
},
{
"epoch": 0.7757719413345749,
"grad_norm": 0.53515625,
"learning_rate": 2.6383723781993187e-06,
"loss": 0.0642,
"step": 2291
},
{
"epoch": 0.7761105585066983,
"grad_norm": 0.484375,
"learning_rate": 2.630799501965333e-06,
"loss": 0.0548,
"step": 2292
},
{
"epoch": 0.7764491756788217,
"grad_norm": 0.451171875,
"learning_rate": 2.6232358630247722e-06,
"loss": 0.0597,
"step": 2293
},
{
"epoch": 0.7767877928509449,
"grad_norm": 0.5078125,
"learning_rate": 2.61568147085866e-06,
"loss": 0.0518,
"step": 2294
},
{
"epoch": 0.7771264100230683,
"grad_norm": 0.53515625,
"learning_rate": 2.6081363349364317e-06,
"loss": 0.0703,
"step": 2295
},
{
"epoch": 0.7774650271951916,
"grad_norm": 0.443359375,
"learning_rate": 2.600600464715909e-06,
"loss": 0.0616,
"step": 2296
},
{
"epoch": 0.777803644367315,
"grad_norm": 0.474609375,
"learning_rate": 2.5930738696433124e-06,
"loss": 0.0609,
"step": 2297
},
{
"epoch": 0.7781422615394383,
"grad_norm": 0.423828125,
"learning_rate": 2.5855565591532227e-06,
"loss": 0.0559,
"step": 2298
},
{
"epoch": 0.7784808787115617,
"grad_norm": 0.474609375,
"learning_rate": 2.578048542668593e-06,
"loss": 0.058,
"step": 2299
},
{
"epoch": 0.778819495883685,
"grad_norm": 0.6015625,
"learning_rate": 2.5705498296007247e-06,
"loss": 0.0668,
"step": 2300
},
{
"epoch": 0.7791581130558083,
"grad_norm": 0.48046875,
"learning_rate": 2.56306042934925e-06,
"loss": 0.0635,
"step": 2301
},
{
"epoch": 0.7794967302279316,
"grad_norm": 0.6171875,
"learning_rate": 2.5555803513021393e-06,
"loss": 0.0563,
"step": 2302
},
{
"epoch": 0.779835347400055,
"grad_norm": 0.58203125,
"learning_rate": 2.5481096048356636e-06,
"loss": 0.0667,
"step": 2303
},
{
"epoch": 0.7801739645721784,
"grad_norm": 0.451171875,
"learning_rate": 2.5406481993144084e-06,
"loss": 0.0589,
"step": 2304
},
{
"epoch": 0.7805125817443017,
"grad_norm": 0.515625,
"learning_rate": 2.5331961440912476e-06,
"loss": 0.0657,
"step": 2305
},
{
"epoch": 0.7808511989164251,
"grad_norm": 0.5,
"learning_rate": 2.525753448507329e-06,
"loss": 0.0664,
"step": 2306
},
{
"epoch": 0.7811898160885484,
"grad_norm": 0.462890625,
"learning_rate": 2.518320121892076e-06,
"loss": 0.0628,
"step": 2307
},
{
"epoch": 0.7815284332606718,
"grad_norm": 0.5546875,
"learning_rate": 2.5108961735631634e-06,
"loss": 0.0737,
"step": 2308
},
{
"epoch": 0.781867050432795,
"grad_norm": 0.67578125,
"learning_rate": 2.503481612826506e-06,
"loss": 0.0841,
"step": 2309
},
{
"epoch": 0.7822056676049184,
"grad_norm": 0.5234375,
"learning_rate": 2.496076448976261e-06,
"loss": 0.0647,
"step": 2310
},
{
"epoch": 0.7825442847770417,
"grad_norm": 0.50390625,
"learning_rate": 2.4886806912948034e-06,
"loss": 0.0586,
"step": 2311
},
{
"epoch": 0.7828829019491651,
"grad_norm": 0.447265625,
"learning_rate": 2.481294349052711e-06,
"loss": 0.0621,
"step": 2312
},
{
"epoch": 0.7832215191212885,
"grad_norm": 0.455078125,
"learning_rate": 2.4739174315087678e-06,
"loss": 0.0668,
"step": 2313
},
{
"epoch": 0.7835601362934118,
"grad_norm": 0.5703125,
"learning_rate": 2.466549947909942e-06,
"loss": 0.0792,
"step": 2314
},
{
"epoch": 0.7838987534655352,
"grad_norm": 0.439453125,
"learning_rate": 2.4591919074913707e-06,
"loss": 0.06,
"step": 2315
},
{
"epoch": 0.7842373706376584,
"grad_norm": 0.4765625,
"learning_rate": 2.4518433194763625e-06,
"loss": 0.0653,
"step": 2316
},
{
"epoch": 0.7845759878097818,
"grad_norm": 0.482421875,
"learning_rate": 2.444504193076368e-06,
"loss": 0.0655,
"step": 2317
},
{
"epoch": 0.7849146049819051,
"grad_norm": 0.5,
"learning_rate": 2.437174537490985e-06,
"loss": 0.0754,
"step": 2318
},
{
"epoch": 0.7852532221540285,
"grad_norm": 0.453125,
"learning_rate": 2.429854361907942e-06,
"loss": 0.0545,
"step": 2319
},
{
"epoch": 0.7855918393261518,
"grad_norm": 0.68359375,
"learning_rate": 2.4225436755030717e-06,
"loss": 0.0695,
"step": 2320
},
{
"epoch": 0.7859304564982752,
"grad_norm": 0.390625,
"learning_rate": 2.415242487440328e-06,
"loss": 0.0421,
"step": 2321
},
{
"epoch": 0.7862690736703986,
"grad_norm": 0.53515625,
"learning_rate": 2.4079508068717427e-06,
"loss": 0.0666,
"step": 2322
},
{
"epoch": 0.7866076908425218,
"grad_norm": 0.365234375,
"learning_rate": 2.4006686429374437e-06,
"loss": 0.0405,
"step": 2323
},
{
"epoch": 0.7869463080146452,
"grad_norm": 0.462890625,
"learning_rate": 2.3933960047656235e-06,
"loss": 0.0497,
"step": 2324
},
{
"epoch": 0.7872849251867685,
"grad_norm": 0.419921875,
"learning_rate": 2.386132901472532e-06,
"loss": 0.0493,
"step": 2325
},
{
"epoch": 0.7876235423588919,
"grad_norm": 0.5390625,
"learning_rate": 2.378879342162471e-06,
"loss": 0.0699,
"step": 2326
},
{
"epoch": 0.7879621595310152,
"grad_norm": 0.451171875,
"learning_rate": 2.371635335927781e-06,
"loss": 0.0606,
"step": 2327
},
{
"epoch": 0.7883007767031386,
"grad_norm": 0.61328125,
"learning_rate": 2.3644008918488216e-06,
"loss": 0.0653,
"step": 2328
},
{
"epoch": 0.7886393938752619,
"grad_norm": 0.53125,
"learning_rate": 2.357176018993966e-06,
"loss": 0.0768,
"step": 2329
},
{
"epoch": 0.7889780110473852,
"grad_norm": 0.44140625,
"learning_rate": 2.349960726419599e-06,
"loss": 0.0589,
"step": 2330
},
{
"epoch": 0.7893166282195085,
"grad_norm": 0.52734375,
"learning_rate": 2.3427550231700836e-06,
"loss": 0.0645,
"step": 2331
},
{
"epoch": 0.7896552453916319,
"grad_norm": 0.443359375,
"learning_rate": 2.335558918277774e-06,
"loss": 0.0513,
"step": 2332
},
{
"epoch": 0.7899938625637553,
"grad_norm": 0.490234375,
"learning_rate": 2.3283724207629886e-06,
"loss": 0.0674,
"step": 2333
},
{
"epoch": 0.7903324797358786,
"grad_norm": 0.44140625,
"learning_rate": 2.3211955396340003e-06,
"loss": 0.0536,
"step": 2334
},
{
"epoch": 0.790671096908002,
"grad_norm": 0.63671875,
"learning_rate": 2.3140282838870332e-06,
"loss": 0.074,
"step": 2335
},
{
"epoch": 0.7910097140801253,
"grad_norm": 0.625,
"learning_rate": 2.3068706625062385e-06,
"loss": 0.0729,
"step": 2336
},
{
"epoch": 0.7913483312522487,
"grad_norm": 0.439453125,
"learning_rate": 2.299722684463698e-06,
"loss": 0.0594,
"step": 2337
},
{
"epoch": 0.7916869484243719,
"grad_norm": 0.40234375,
"learning_rate": 2.2925843587194042e-06,
"loss": 0.0524,
"step": 2338
},
{
"epoch": 0.7920255655964953,
"grad_norm": 0.44921875,
"learning_rate": 2.285455694221246e-06,
"loss": 0.0622,
"step": 2339
},
{
"epoch": 0.7923641827686186,
"grad_norm": 0.41015625,
"learning_rate": 2.2783366999050074e-06,
"loss": 0.0543,
"step": 2340
},
{
"epoch": 0.792702799940742,
"grad_norm": 0.5859375,
"learning_rate": 2.2712273846943457e-06,
"loss": 0.0729,
"step": 2341
},
{
"epoch": 0.7930414171128654,
"grad_norm": 0.515625,
"learning_rate": 2.264127757500789e-06,
"loss": 0.0587,
"step": 2342
},
{
"epoch": 0.7933800342849887,
"grad_norm": 0.37109375,
"learning_rate": 2.2570378272237237e-06,
"loss": 0.046,
"step": 2343
},
{
"epoch": 0.7937186514571121,
"grad_norm": 0.49609375,
"learning_rate": 2.2499576027503723e-06,
"loss": 0.0689,
"step": 2344
},
{
"epoch": 0.7940572686292353,
"grad_norm": 0.443359375,
"learning_rate": 2.2428870929558012e-06,
"loss": 0.0569,
"step": 2345
},
{
"epoch": 0.7943958858013587,
"grad_norm": 0.484375,
"learning_rate": 2.2358263067028952e-06,
"loss": 0.0631,
"step": 2346
},
{
"epoch": 0.794734502973482,
"grad_norm": 0.4609375,
"learning_rate": 2.228775252842347e-06,
"loss": 0.064,
"step": 2347
},
{
"epoch": 0.7950731201456054,
"grad_norm": 0.470703125,
"learning_rate": 2.221733940212657e-06,
"loss": 0.0628,
"step": 2348
},
{
"epoch": 0.7954117373177287,
"grad_norm": 0.49609375,
"learning_rate": 2.2147023776401077e-06,
"loss": 0.0629,
"step": 2349
},
{
"epoch": 0.7957503544898521,
"grad_norm": 0.49609375,
"learning_rate": 2.2076805739387664e-06,
"loss": 0.0649,
"step": 2350
},
{
"epoch": 0.7960889716619755,
"grad_norm": 0.40625,
"learning_rate": 2.200668537910461e-06,
"loss": 0.054,
"step": 2351
},
{
"epoch": 0.7964275888340987,
"grad_norm": 0.4140625,
"learning_rate": 2.1936662783447836e-06,
"loss": 0.0542,
"step": 2352
},
{
"epoch": 0.796766206006222,
"grad_norm": 0.44140625,
"learning_rate": 2.1866738040190638e-06,
"loss": 0.0587,
"step": 2353
},
{
"epoch": 0.7971048231783454,
"grad_norm": 1.265625,
"learning_rate": 2.1796911236983708e-06,
"loss": 0.0656,
"step": 2354
},
{
"epoch": 0.7974434403504688,
"grad_norm": 0.498046875,
"learning_rate": 2.172718246135492e-06,
"loss": 0.0612,
"step": 2355
},
{
"epoch": 0.7977820575225921,
"grad_norm": 0.431640625,
"learning_rate": 2.165755180070932e-06,
"loss": 0.0623,
"step": 2356
},
{
"epoch": 0.7981206746947155,
"grad_norm": 0.59375,
"learning_rate": 2.158801934232897e-06,
"loss": 0.0772,
"step": 2357
},
{
"epoch": 0.7984592918668388,
"grad_norm": 0.5078125,
"learning_rate": 2.1518585173372774e-06,
"loss": 0.0662,
"step": 2358
},
{
"epoch": 0.7987979090389621,
"grad_norm": 0.53125,
"learning_rate": 2.14492493808765e-06,
"loss": 0.0785,
"step": 2359
},
{
"epoch": 0.7991365262110854,
"grad_norm": 0.55859375,
"learning_rate": 2.138001205175253e-06,
"loss": 0.0744,
"step": 2360
},
{
"epoch": 0.7994751433832088,
"grad_norm": 0.5390625,
"learning_rate": 2.1310873272789878e-06,
"loss": 0.0661,
"step": 2361
},
{
"epoch": 0.7998137605553322,
"grad_norm": 0.52734375,
"learning_rate": 2.1241833130654056e-06,
"loss": 0.0665,
"step": 2362
},
{
"epoch": 0.8001523777274555,
"grad_norm": 0.5625,
"learning_rate": 2.117289171188681e-06,
"loss": 0.0626,
"step": 2363
},
{
"epoch": 0.8004909948995789,
"grad_norm": 0.51171875,
"learning_rate": 2.1104049102906254e-06,
"loss": 0.0716,
"step": 2364
},
{
"epoch": 0.8008296120717022,
"grad_norm": 0.40234375,
"learning_rate": 2.103530539000662e-06,
"loss": 0.0494,
"step": 2365
},
{
"epoch": 0.8011682292438256,
"grad_norm": 0.56640625,
"learning_rate": 2.096666065935813e-06,
"loss": 0.0471,
"step": 2366
},
{
"epoch": 0.8015068464159488,
"grad_norm": 0.51953125,
"learning_rate": 2.089811499700699e-06,
"loss": 0.0715,
"step": 2367
},
{
"epoch": 0.8018454635880722,
"grad_norm": 0.458984375,
"learning_rate": 2.082966848887514e-06,
"loss": 0.0612,
"step": 2368
},
{
"epoch": 0.8018454635880722,
"eval_loss": 0.06570233404636383,
"eval_runtime": 815.4749,
"eval_samples_per_second": 12.199,
"eval_steps_per_second": 3.05,
"step": 2368
},
{
"epoch": 0.8021840807601955,
"grad_norm": 0.435546875,
"learning_rate": 2.0761321220760324e-06,
"loss": 0.0557,
"step": 2369
},
{
"epoch": 0.8025226979323189,
"grad_norm": 0.40625,
"learning_rate": 2.069307327833586e-06,
"loss": 0.0535,
"step": 2370
},
{
"epoch": 0.8028613151044423,
"grad_norm": 0.431640625,
"learning_rate": 2.062492474715053e-06,
"loss": 0.0586,
"step": 2371
},
{
"epoch": 0.8031999322765656,
"grad_norm": 0.451171875,
"learning_rate": 2.05568757126285e-06,
"loss": 0.058,
"step": 2372
},
{
"epoch": 0.803538549448689,
"grad_norm": 0.423828125,
"learning_rate": 2.0488926260069284e-06,
"loss": 0.0479,
"step": 2373
},
{
"epoch": 0.8038771666208122,
"grad_norm": 0.546875,
"learning_rate": 2.042107647464748e-06,
"loss": 0.0671,
"step": 2374
},
{
"epoch": 0.8042157837929356,
"grad_norm": 0.57421875,
"learning_rate": 2.0353326441412835e-06,
"loss": 0.0799,
"step": 2375
},
{
"epoch": 0.8045544009650589,
"grad_norm": 0.6640625,
"learning_rate": 2.0285676245290032e-06,
"loss": 0.0755,
"step": 2376
},
{
"epoch": 0.8048930181371823,
"grad_norm": 0.48046875,
"learning_rate": 2.021812597107855e-06,
"loss": 0.0677,
"step": 2377
},
{
"epoch": 0.8052316353093056,
"grad_norm": 0.51953125,
"learning_rate": 2.0150675703452717e-06,
"loss": 0.0649,
"step": 2378
},
{
"epoch": 0.805570252481429,
"grad_norm": 0.466796875,
"learning_rate": 2.0083325526961394e-06,
"loss": 0.0591,
"step": 2379
},
{
"epoch": 0.8059088696535524,
"grad_norm": 0.474609375,
"learning_rate": 2.0016075526028066e-06,
"loss": 0.0642,
"step": 2380
},
{
"epoch": 0.8062474868256756,
"grad_norm": 0.3984375,
"learning_rate": 1.9948925784950625e-06,
"loss": 0.0564,
"step": 2381
},
{
"epoch": 0.806586103997799,
"grad_norm": 0.43359375,
"learning_rate": 1.9881876387901243e-06,
"loss": 0.0556,
"step": 2382
},
{
"epoch": 0.8069247211699223,
"grad_norm": 0.5,
"learning_rate": 1.9814927418926366e-06,
"loss": 0.0643,
"step": 2383
},
{
"epoch": 0.8072633383420457,
"grad_norm": 0.546875,
"learning_rate": 1.974807896194655e-06,
"loss": 0.0687,
"step": 2384
},
{
"epoch": 0.807601955514169,
"grad_norm": 0.58984375,
"learning_rate": 1.9681331100756298e-06,
"loss": 0.073,
"step": 2385
},
{
"epoch": 0.8079405726862924,
"grad_norm": 0.55859375,
"learning_rate": 1.9614683919024103e-06,
"loss": 0.0685,
"step": 2386
},
{
"epoch": 0.8082791898584157,
"grad_norm": 0.4296875,
"learning_rate": 1.9548137500292163e-06,
"loss": 0.059,
"step": 2387
},
{
"epoch": 0.808617807030539,
"grad_norm": 0.48046875,
"learning_rate": 1.9481691927976453e-06,
"loss": 0.0592,
"step": 2388
},
{
"epoch": 0.8089564242026623,
"grad_norm": 0.546875,
"learning_rate": 1.9415347285366527e-06,
"loss": 0.0778,
"step": 2389
},
{
"epoch": 0.8092950413747857,
"grad_norm": 0.671875,
"learning_rate": 1.9349103655625346e-06,
"loss": 0.0807,
"step": 2390
},
{
"epoch": 0.809633658546909,
"grad_norm": 0.65625,
"learning_rate": 1.9282961121789324e-06,
"loss": 0.0714,
"step": 2391
},
{
"epoch": 0.8099722757190324,
"grad_norm": 0.46484375,
"learning_rate": 1.9216919766768194e-06,
"loss": 0.0507,
"step": 2392
},
{
"epoch": 0.8103108928911558,
"grad_norm": 0.59375,
"learning_rate": 1.915097967334469e-06,
"loss": 0.0808,
"step": 2393
},
{
"epoch": 0.8106495100632791,
"grad_norm": 0.60546875,
"learning_rate": 1.9085140924174783e-06,
"loss": 0.0563,
"step": 2394
},
{
"epoch": 0.8109881272354025,
"grad_norm": 0.48046875,
"learning_rate": 1.9019403601787377e-06,
"loss": 0.0615,
"step": 2395
},
{
"epoch": 0.8113267444075257,
"grad_norm": 0.51953125,
"learning_rate": 1.8953767788584155e-06,
"loss": 0.0671,
"step": 2396
},
{
"epoch": 0.8116653615796491,
"grad_norm": 0.466796875,
"learning_rate": 1.8888233566839654e-06,
"loss": 0.0672,
"step": 2397
},
{
"epoch": 0.8120039787517724,
"grad_norm": 0.4765625,
"learning_rate": 1.8822801018700999e-06,
"loss": 0.0612,
"step": 2398
},
{
"epoch": 0.8123425959238958,
"grad_norm": 0.6796875,
"learning_rate": 1.8757470226187902e-06,
"loss": 0.0745,
"step": 2399
},
{
"epoch": 0.8126812130960192,
"grad_norm": 0.40625,
"learning_rate": 1.8692241271192557e-06,
"loss": 0.0505,
"step": 2400
},
{
"epoch": 0.8130198302681425,
"grad_norm": 0.4609375,
"learning_rate": 1.8627114235479393e-06,
"loss": 0.0616,
"step": 2401
},
{
"epoch": 0.8133584474402659,
"grad_norm": 0.431640625,
"learning_rate": 1.8562089200685195e-06,
"loss": 0.0558,
"step": 2402
},
{
"epoch": 0.8136970646123891,
"grad_norm": 0.474609375,
"learning_rate": 1.8497166248318876e-06,
"loss": 0.0664,
"step": 2403
},
{
"epoch": 0.8140356817845125,
"grad_norm": 0.57421875,
"learning_rate": 1.8432345459761303e-06,
"loss": 0.0728,
"step": 2404
},
{
"epoch": 0.8143742989566358,
"grad_norm": 0.5546875,
"learning_rate": 1.8367626916265401e-06,
"loss": 0.0737,
"step": 2405
},
{
"epoch": 0.8147129161287592,
"grad_norm": 0.63671875,
"learning_rate": 1.8303010698955803e-06,
"loss": 0.0819,
"step": 2406
},
{
"epoch": 0.8150515333008825,
"grad_norm": 0.3828125,
"learning_rate": 1.8238496888828983e-06,
"loss": 0.0435,
"step": 2407
},
{
"epoch": 0.8153901504730059,
"grad_norm": 0.609375,
"learning_rate": 1.817408556675302e-06,
"loss": 0.0885,
"step": 2408
},
{
"epoch": 0.8157287676451292,
"grad_norm": 0.6015625,
"learning_rate": 1.8109776813467473e-06,
"loss": 0.0799,
"step": 2409
},
{
"epoch": 0.8160673848172525,
"grad_norm": 0.4296875,
"learning_rate": 1.8045570709583394e-06,
"loss": 0.0674,
"step": 2410
},
{
"epoch": 0.8164060019893759,
"grad_norm": 0.46875,
"learning_rate": 1.7981467335583158e-06,
"loss": 0.0574,
"step": 2411
},
{
"epoch": 0.8167446191614992,
"grad_norm": 0.46875,
"learning_rate": 1.7917466771820303e-06,
"loss": 0.064,
"step": 2412
},
{
"epoch": 0.8170832363336226,
"grad_norm": 0.55078125,
"learning_rate": 1.7853569098519586e-06,
"loss": 0.0681,
"step": 2413
},
{
"epoch": 0.8174218535057459,
"grad_norm": 0.412109375,
"learning_rate": 1.7789774395776716e-06,
"loss": 0.053,
"step": 2414
},
{
"epoch": 0.8177604706778693,
"grad_norm": 0.56640625,
"learning_rate": 1.7726082743558349e-06,
"loss": 0.0633,
"step": 2415
},
{
"epoch": 0.8180990878499926,
"grad_norm": 0.482421875,
"learning_rate": 1.766249422170202e-06,
"loss": 0.0689,
"step": 2416
},
{
"epoch": 0.8184377050221159,
"grad_norm": 0.6015625,
"learning_rate": 1.7599008909915894e-06,
"loss": 0.0711,
"step": 2417
},
{
"epoch": 0.8187763221942392,
"grad_norm": 0.70703125,
"learning_rate": 1.7535626887778846e-06,
"loss": 0.0525,
"step": 2418
},
{
"epoch": 0.8191149393663626,
"grad_norm": 0.4140625,
"learning_rate": 1.7472348234740255e-06,
"loss": 0.0505,
"step": 2419
},
{
"epoch": 0.819453556538486,
"grad_norm": 0.53125,
"learning_rate": 1.7409173030119886e-06,
"loss": 0.0696,
"step": 2420
},
{
"epoch": 0.8197921737106093,
"grad_norm": 0.5703125,
"learning_rate": 1.734610135310788e-06,
"loss": 0.0605,
"step": 2421
},
{
"epoch": 0.8201307908827327,
"grad_norm": 0.5234375,
"learning_rate": 1.7283133282764609e-06,
"loss": 0.0601,
"step": 2422
},
{
"epoch": 0.820469408054856,
"grad_norm": 0.76953125,
"learning_rate": 1.722026889802052e-06,
"loss": 0.0807,
"step": 2423
},
{
"epoch": 0.8208080252269794,
"grad_norm": 0.458984375,
"learning_rate": 1.715750827767615e-06,
"loss": 0.0635,
"step": 2424
},
{
"epoch": 0.8211466423991026,
"grad_norm": 0.49609375,
"learning_rate": 1.7094851500401922e-06,
"loss": 0.0611,
"step": 2425
},
{
"epoch": 0.821485259571226,
"grad_norm": 0.5859375,
"learning_rate": 1.703229864473811e-06,
"loss": 0.0772,
"step": 2426
},
{
"epoch": 0.8218238767433493,
"grad_norm": 0.408203125,
"learning_rate": 1.6969849789094762e-06,
"loss": 0.0508,
"step": 2427
},
{
"epoch": 0.8221624939154727,
"grad_norm": 0.51171875,
"learning_rate": 1.6907505011751468e-06,
"loss": 0.0776,
"step": 2428
},
{
"epoch": 0.822501111087596,
"grad_norm": 0.5859375,
"learning_rate": 1.684526439085744e-06,
"loss": 0.073,
"step": 2429
},
{
"epoch": 0.8228397282597194,
"grad_norm": 0.51171875,
"learning_rate": 1.6783128004431326e-06,
"loss": 0.0656,
"step": 2430
},
{
"epoch": 0.8231783454318428,
"grad_norm": 0.66015625,
"learning_rate": 1.6721095930361042e-06,
"loss": 0.0916,
"step": 2431
},
{
"epoch": 0.823516962603966,
"grad_norm": 0.50390625,
"learning_rate": 1.6659168246403855e-06,
"loss": 0.0689,
"step": 2432
},
{
"epoch": 0.8238555797760894,
"grad_norm": 0.455078125,
"learning_rate": 1.6597345030186052e-06,
"loss": 0.0594,
"step": 2433
},
{
"epoch": 0.8241941969482127,
"grad_norm": 0.546875,
"learning_rate": 1.6535626359203083e-06,
"loss": 0.0555,
"step": 2434
},
{
"epoch": 0.8245328141203361,
"grad_norm": 0.875,
"learning_rate": 1.6474012310819354e-06,
"loss": 0.0823,
"step": 2435
},
{
"epoch": 0.8248714312924594,
"grad_norm": 0.40234375,
"learning_rate": 1.6412502962267973e-06,
"loss": 0.0519,
"step": 2436
},
{
"epoch": 0.8252100484645828,
"grad_norm": 0.447265625,
"learning_rate": 1.6351098390650966e-06,
"loss": 0.0576,
"step": 2437
},
{
"epoch": 0.8255486656367061,
"grad_norm": 0.54296875,
"learning_rate": 1.6289798672938994e-06,
"loss": 0.0627,
"step": 2438
},
{
"epoch": 0.8258872828088294,
"grad_norm": 0.515625,
"learning_rate": 1.6228603885971206e-06,
"loss": 0.0665,
"step": 2439
},
{
"epoch": 0.8262258999809527,
"grad_norm": 3.921875,
"learning_rate": 1.6167514106455306e-06,
"loss": 0.0593,
"step": 2440
},
{
"epoch": 0.8265645171530761,
"grad_norm": 0.466796875,
"learning_rate": 1.6106529410967354e-06,
"loss": 0.0556,
"step": 2441
},
{
"epoch": 0.8269031343251995,
"grad_norm": 0.6015625,
"learning_rate": 1.604564987595162e-06,
"loss": 0.059,
"step": 2442
},
{
"epoch": 0.8272417514973228,
"grad_norm": 0.41796875,
"learning_rate": 1.598487557772066e-06,
"loss": 0.0548,
"step": 2443
},
{
"epoch": 0.8275803686694462,
"grad_norm": 0.373046875,
"learning_rate": 1.5924206592455016e-06,
"loss": 0.0506,
"step": 2444
},
{
"epoch": 0.8279189858415695,
"grad_norm": 0.578125,
"learning_rate": 1.5863642996203288e-06,
"loss": 0.0797,
"step": 2445
},
{
"epoch": 0.8282576030136928,
"grad_norm": 0.4453125,
"learning_rate": 1.580318486488197e-06,
"loss": 0.0668,
"step": 2446
},
{
"epoch": 0.8285962201858161,
"grad_norm": 0.44921875,
"learning_rate": 1.5742832274275288e-06,
"loss": 0.0522,
"step": 2447
},
{
"epoch": 0.8289348373579395,
"grad_norm": 0.359375,
"learning_rate": 1.5682585300035237e-06,
"loss": 0.0387,
"step": 2448
},
{
"epoch": 0.8292734545300628,
"grad_norm": 0.57421875,
"learning_rate": 1.5622444017681438e-06,
"loss": 0.0626,
"step": 2449
},
{
"epoch": 0.8296120717021862,
"grad_norm": 0.44140625,
"learning_rate": 1.5562408502600946e-06,
"loss": 0.0529,
"step": 2450
},
{
"epoch": 0.8299506888743096,
"grad_norm": 0.4296875,
"learning_rate": 1.550247883004833e-06,
"loss": 0.0554,
"step": 2451
},
{
"epoch": 0.8302893060464329,
"grad_norm": 0.50390625,
"learning_rate": 1.5442655075145375e-06,
"loss": 0.059,
"step": 2452
},
{
"epoch": 0.8306279232185563,
"grad_norm": 0.486328125,
"learning_rate": 1.5382937312881208e-06,
"loss": 0.0573,
"step": 2453
},
{
"epoch": 0.8309665403906795,
"grad_norm": 0.515625,
"learning_rate": 1.5323325618112072e-06,
"loss": 0.0677,
"step": 2454
},
{
"epoch": 0.8313051575628029,
"grad_norm": 0.408203125,
"learning_rate": 1.5263820065561174e-06,
"loss": 0.0532,
"step": 2455
},
{
"epoch": 0.8316437747349262,
"grad_norm": 0.5546875,
"learning_rate": 1.520442072981877e-06,
"loss": 0.0751,
"step": 2456
},
{
"epoch": 0.8319823919070496,
"grad_norm": 0.51953125,
"learning_rate": 1.5145127685341932e-06,
"loss": 0.0475,
"step": 2457
},
{
"epoch": 0.832321009079173,
"grad_norm": 0.609375,
"learning_rate": 1.5085941006454453e-06,
"loss": 0.0856,
"step": 2458
},
{
"epoch": 0.8326596262512963,
"grad_norm": 0.4140625,
"learning_rate": 1.5026860767346862e-06,
"loss": 0.0559,
"step": 2459
},
{
"epoch": 0.8329982434234197,
"grad_norm": 0.4609375,
"learning_rate": 1.4967887042076278e-06,
"loss": 0.0658,
"step": 2460
},
{
"epoch": 0.8333368605955429,
"grad_norm": 0.54296875,
"learning_rate": 1.4909019904566223e-06,
"loss": 0.0674,
"step": 2461
},
{
"epoch": 0.8336754777676663,
"grad_norm": 0.515625,
"learning_rate": 1.4850259428606707e-06,
"loss": 0.0607,
"step": 2462
},
{
"epoch": 0.8340140949397896,
"grad_norm": 0.40625,
"learning_rate": 1.4791605687853927e-06,
"loss": 0.0569,
"step": 2463
},
{
"epoch": 0.834352712111913,
"grad_norm": 0.50390625,
"learning_rate": 1.4733058755830399e-06,
"loss": 0.0584,
"step": 2464
},
{
"epoch": 0.8346913292840363,
"grad_norm": 0.46484375,
"learning_rate": 1.4674618705924715e-06,
"loss": 0.0637,
"step": 2465
},
{
"epoch": 0.8350299464561597,
"grad_norm": 0.41015625,
"learning_rate": 1.4616285611391445e-06,
"loss": 0.0515,
"step": 2466
},
{
"epoch": 0.835368563628283,
"grad_norm": 0.58203125,
"learning_rate": 1.4558059545351144e-06,
"loss": 0.0595,
"step": 2467
},
{
"epoch": 0.8357071808004063,
"grad_norm": 1.4453125,
"learning_rate": 1.4499940580790207e-06,
"loss": 0.0883,
"step": 2468
},
{
"epoch": 0.8360457979725296,
"grad_norm": 0.419921875,
"learning_rate": 1.4441928790560733e-06,
"loss": 0.059,
"step": 2469
},
{
"epoch": 0.836384415144653,
"grad_norm": 0.69921875,
"learning_rate": 1.4384024247380534e-06,
"loss": 0.0896,
"step": 2470
},
{
"epoch": 0.8367230323167764,
"grad_norm": 0.61328125,
"learning_rate": 1.4326227023832928e-06,
"loss": 0.0609,
"step": 2471
},
{
"epoch": 0.8370616494888997,
"grad_norm": 0.486328125,
"learning_rate": 1.426853719236676e-06,
"loss": 0.0617,
"step": 2472
},
{
"epoch": 0.8374002666610231,
"grad_norm": 0.6875,
"learning_rate": 1.4210954825296253e-06,
"loss": 0.0609,
"step": 2473
},
{
"epoch": 0.8377388838331464,
"grad_norm": 0.58203125,
"learning_rate": 1.4153479994800868e-06,
"loss": 0.0683,
"step": 2474
},
{
"epoch": 0.8380775010052697,
"grad_norm": 0.41796875,
"learning_rate": 1.4096112772925353e-06,
"loss": 0.0581,
"step": 2475
},
{
"epoch": 0.838416118177393,
"grad_norm": 0.478515625,
"learning_rate": 1.4038853231579486e-06,
"loss": 0.0599,
"step": 2476
},
{
"epoch": 0.8387547353495164,
"grad_norm": 0.6953125,
"learning_rate": 1.3981701442538155e-06,
"loss": 0.1391,
"step": 2477
},
{
"epoch": 0.8390933525216397,
"grad_norm": 0.5234375,
"learning_rate": 1.3924657477441072e-06,
"loss": 0.0586,
"step": 2478
},
{
"epoch": 0.8394319696937631,
"grad_norm": 0.45703125,
"learning_rate": 1.38677214077929e-06,
"loss": 0.0528,
"step": 2479
},
{
"epoch": 0.8397705868658865,
"grad_norm": 0.83984375,
"learning_rate": 1.381089330496297e-06,
"loss": 0.0666,
"step": 2480
},
{
"epoch": 0.8401092040380098,
"grad_norm": 0.66015625,
"learning_rate": 1.3754173240185364e-06,
"loss": 0.0938,
"step": 2481
},
{
"epoch": 0.8404478212101332,
"grad_norm": 0.3984375,
"learning_rate": 1.3697561284558624e-06,
"loss": 0.0529,
"step": 2482
},
{
"epoch": 0.8407864383822564,
"grad_norm": 0.451171875,
"learning_rate": 1.3641057509045885e-06,
"loss": 0.0661,
"step": 2483
},
{
"epoch": 0.8411250555543798,
"grad_norm": 0.421875,
"learning_rate": 1.3584661984474634e-06,
"loss": 0.0582,
"step": 2484
},
{
"epoch": 0.8414636727265031,
"grad_norm": 0.515625,
"learning_rate": 1.3528374781536634e-06,
"loss": 0.0646,
"step": 2485
},
{
"epoch": 0.8418022898986265,
"grad_norm": 0.6640625,
"learning_rate": 1.3472195970787927e-06,
"loss": 0.0813,
"step": 2486
},
{
"epoch": 0.8421409070707498,
"grad_norm": 0.423828125,
"learning_rate": 1.3416125622648668e-06,
"loss": 0.0589,
"step": 2487
},
{
"epoch": 0.8424795242428732,
"grad_norm": 0.39453125,
"learning_rate": 1.3360163807403004e-06,
"loss": 0.0494,
"step": 2488
},
{
"epoch": 0.8428181414149966,
"grad_norm": 0.6640625,
"learning_rate": 1.3304310595199121e-06,
"loss": 0.0604,
"step": 2489
},
{
"epoch": 0.8431567585871198,
"grad_norm": 0.54296875,
"learning_rate": 1.3248566056048972e-06,
"loss": 0.0608,
"step": 2490
},
{
"epoch": 0.8434953757592432,
"grad_norm": 0.47265625,
"learning_rate": 1.3192930259828363e-06,
"loss": 0.0544,
"step": 2491
},
{
"epoch": 0.8438339929313665,
"grad_norm": 0.55859375,
"learning_rate": 1.3137403276276805e-06,
"loss": 0.0719,
"step": 2492
},
{
"epoch": 0.8441726101034899,
"grad_norm": 0.41796875,
"learning_rate": 1.3081985174997325e-06,
"loss": 0.0548,
"step": 2493
},
{
"epoch": 0.8445112272756132,
"grad_norm": 0.57421875,
"learning_rate": 1.3026676025456553e-06,
"loss": 0.0585,
"step": 2494
},
{
"epoch": 0.8448498444477366,
"grad_norm": 0.4765625,
"learning_rate": 1.2971475896984475e-06,
"loss": 0.0612,
"step": 2495
},
{
"epoch": 0.8451884616198599,
"grad_norm": 0.392578125,
"learning_rate": 1.2916384858774488e-06,
"loss": 0.0516,
"step": 2496
},
{
"epoch": 0.8455270787919832,
"grad_norm": 0.48046875,
"learning_rate": 1.2861402979883231e-06,
"loss": 0.0614,
"step": 2497
},
{
"epoch": 0.8458656959641065,
"grad_norm": 0.474609375,
"learning_rate": 1.280653032923046e-06,
"loss": 0.0543,
"step": 2498
},
{
"epoch": 0.8462043131362299,
"grad_norm": 0.455078125,
"learning_rate": 1.2751766975599033e-06,
"loss": 0.0527,
"step": 2499
},
{
"epoch": 0.8465429303083533,
"grad_norm": 0.56640625,
"learning_rate": 1.2697112987634852e-06,
"loss": 0.0711,
"step": 2500
},
{
"epoch": 0.8468815474804766,
"grad_norm": 0.470703125,
"learning_rate": 1.264256843384668e-06,
"loss": 0.0619,
"step": 2501
},
{
"epoch": 0.8472201646526,
"grad_norm": 0.48828125,
"learning_rate": 1.2588133382606105e-06,
"loss": 0.0684,
"step": 2502
},
{
"epoch": 0.8475587818247233,
"grad_norm": 0.466796875,
"learning_rate": 1.2533807902147522e-06,
"loss": 0.062,
"step": 2503
},
{
"epoch": 0.8478973989968466,
"grad_norm": 0.447265625,
"learning_rate": 1.2479592060567857e-06,
"loss": 0.0526,
"step": 2504
},
{
"epoch": 0.8482360161689699,
"grad_norm": 0.486328125,
"learning_rate": 1.2425485925826708e-06,
"loss": 0.0605,
"step": 2505
},
{
"epoch": 0.8485746333410933,
"grad_norm": 0.48828125,
"learning_rate": 1.2371489565746141e-06,
"loss": 0.0743,
"step": 2506
},
{
"epoch": 0.8489132505132166,
"grad_norm": 0.494140625,
"learning_rate": 1.231760304801054e-06,
"loss": 0.0679,
"step": 2507
},
{
"epoch": 0.84925186768534,
"grad_norm": 0.466796875,
"learning_rate": 1.2263826440166725e-06,
"loss": 0.0572,
"step": 2508
},
{
"epoch": 0.8495904848574634,
"grad_norm": 0.53515625,
"learning_rate": 1.2210159809623622e-06,
"loss": 0.0659,
"step": 2509
},
{
"epoch": 0.8499291020295867,
"grad_norm": 0.47265625,
"learning_rate": 1.2156603223652376e-06,
"loss": 0.0493,
"step": 2510
},
{
"epoch": 0.8502677192017101,
"grad_norm": 0.458984375,
"learning_rate": 1.2103156749386192e-06,
"loss": 0.0545,
"step": 2511
},
{
"epoch": 0.8506063363738333,
"grad_norm": 0.62109375,
"learning_rate": 1.2049820453820194e-06,
"loss": 0.0697,
"step": 2512
},
{
"epoch": 0.8509449535459567,
"grad_norm": 0.578125,
"learning_rate": 1.1996594403811478e-06,
"loss": 0.0584,
"step": 2513
},
{
"epoch": 0.85128357071808,
"grad_norm": 0.55859375,
"learning_rate": 1.1943478666078856e-06,
"loss": 0.0706,
"step": 2514
},
{
"epoch": 0.8516221878902034,
"grad_norm": 0.498046875,
"learning_rate": 1.1890473307202922e-06,
"loss": 0.0624,
"step": 2515
},
{
"epoch": 0.8519608050623267,
"grad_norm": 0.5234375,
"learning_rate": 1.1837578393625937e-06,
"loss": 0.0638,
"step": 2516
},
{
"epoch": 0.8522994222344501,
"grad_norm": 0.5546875,
"learning_rate": 1.1784793991651623e-06,
"loss": 0.0716,
"step": 2517
},
{
"epoch": 0.8526380394065735,
"grad_norm": 0.384765625,
"learning_rate": 1.1732120167445248e-06,
"loss": 0.0467,
"step": 2518
},
{
"epoch": 0.8529766565786967,
"grad_norm": 0.48046875,
"learning_rate": 1.1679556987033492e-06,
"loss": 0.0635,
"step": 2519
},
{
"epoch": 0.8533152737508201,
"grad_norm": 0.396484375,
"learning_rate": 1.1627104516304278e-06,
"loss": 0.0492,
"step": 2520
},
{
"epoch": 0.8536538909229434,
"grad_norm": 0.443359375,
"learning_rate": 1.157476282100677e-06,
"loss": 0.0493,
"step": 2521
},
{
"epoch": 0.8539925080950668,
"grad_norm": 0.515625,
"learning_rate": 1.1522531966751304e-06,
"loss": 0.0641,
"step": 2522
},
{
"epoch": 0.8543311252671901,
"grad_norm": 0.62890625,
"learning_rate": 1.1470412019009246e-06,
"loss": 0.0554,
"step": 2523
},
{
"epoch": 0.8546697424393135,
"grad_norm": 0.58984375,
"learning_rate": 1.141840304311298e-06,
"loss": 0.0642,
"step": 2524
},
{
"epoch": 0.8550083596114368,
"grad_norm": 0.7109375,
"learning_rate": 1.1366505104255732e-06,
"loss": 0.0578,
"step": 2525
},
{
"epoch": 0.8553469767835601,
"grad_norm": 0.435546875,
"learning_rate": 1.1314718267491587e-06,
"loss": 0.0582,
"step": 2526
},
{
"epoch": 0.8556855939556834,
"grad_norm": 0.546875,
"learning_rate": 1.1263042597735363e-06,
"loss": 0.0638,
"step": 2527
},
{
"epoch": 0.8560242111278068,
"grad_norm": 0.451171875,
"learning_rate": 1.121147815976248e-06,
"loss": 0.0628,
"step": 2528
},
{
"epoch": 0.8563628282999302,
"grad_norm": 0.52734375,
"learning_rate": 1.1160025018208997e-06,
"loss": 0.056,
"step": 2529
},
{
"epoch": 0.8567014454720535,
"grad_norm": 0.478515625,
"learning_rate": 1.110868323757144e-06,
"loss": 0.0598,
"step": 2530
},
{
"epoch": 0.8570400626441769,
"grad_norm": 0.41015625,
"learning_rate": 1.1057452882206688e-06,
"loss": 0.0579,
"step": 2531
},
{
"epoch": 0.8573786798163002,
"grad_norm": 0.52734375,
"learning_rate": 1.1006334016332054e-06,
"loss": 0.072,
"step": 2532
},
{
"epoch": 0.8577172969884235,
"grad_norm": 0.4453125,
"learning_rate": 1.0955326704024983e-06,
"loss": 0.0652,
"step": 2533
},
{
"epoch": 0.8580559141605468,
"grad_norm": 0.423828125,
"learning_rate": 1.090443100922317e-06,
"loss": 0.059,
"step": 2534
},
{
"epoch": 0.8583945313326702,
"grad_norm": 0.453125,
"learning_rate": 1.085364699572441e-06,
"loss": 0.0618,
"step": 2535
},
{
"epoch": 0.8587331485047935,
"grad_norm": 0.52734375,
"learning_rate": 1.08029747271864e-06,
"loss": 0.0659,
"step": 2536
},
{
"epoch": 0.8590717656769169,
"grad_norm": 0.54296875,
"learning_rate": 1.0752414267126876e-06,
"loss": 0.074,
"step": 2537
},
{
"epoch": 0.8594103828490403,
"grad_norm": 0.490234375,
"learning_rate": 1.0701965678923387e-06,
"loss": 0.0665,
"step": 2538
},
{
"epoch": 0.8597490000211636,
"grad_norm": 0.482421875,
"learning_rate": 1.0651629025813203e-06,
"loss": 0.0722,
"step": 2539
},
{
"epoch": 0.860087617193287,
"grad_norm": 0.412109375,
"learning_rate": 1.0601404370893364e-06,
"loss": 0.0595,
"step": 2540
},
{
"epoch": 0.8604262343654102,
"grad_norm": 0.53515625,
"learning_rate": 1.0551291777120465e-06,
"loss": 0.0733,
"step": 2541
},
{
"epoch": 0.8607648515375336,
"grad_norm": 0.474609375,
"learning_rate": 1.0501291307310613e-06,
"loss": 0.0652,
"step": 2542
},
{
"epoch": 0.8611034687096569,
"grad_norm": 0.53515625,
"learning_rate": 1.045140302413945e-06,
"loss": 0.0781,
"step": 2543
},
{
"epoch": 0.8614420858817803,
"grad_norm": 0.48046875,
"learning_rate": 1.040162699014191e-06,
"loss": 0.0637,
"step": 2544
},
{
"epoch": 0.8617807030539036,
"grad_norm": 0.5390625,
"learning_rate": 1.0351963267712261e-06,
"loss": 0.0651,
"step": 2545
},
{
"epoch": 0.862119320226027,
"grad_norm": 0.5078125,
"learning_rate": 1.0302411919104005e-06,
"loss": 0.0532,
"step": 2546
},
{
"epoch": 0.8624579373981504,
"grad_norm": 0.58984375,
"learning_rate": 1.0252973006429733e-06,
"loss": 0.0686,
"step": 2547
},
{
"epoch": 0.8627965545702736,
"grad_norm": 0.46484375,
"learning_rate": 1.0203646591661142e-06,
"loss": 0.059,
"step": 2548
},
{
"epoch": 0.863135171742397,
"grad_norm": 0.5859375,
"learning_rate": 1.0154432736628916e-06,
"loss": 0.09,
"step": 2549
},
{
"epoch": 0.8634737889145203,
"grad_norm": 0.447265625,
"learning_rate": 1.0105331503022574e-06,
"loss": 0.0472,
"step": 2550
},
{
"epoch": 0.8638124060866437,
"grad_norm": 0.427734375,
"learning_rate": 1.0056342952390574e-06,
"loss": 0.0524,
"step": 2551
},
{
"epoch": 0.864151023258767,
"grad_norm": 0.482421875,
"learning_rate": 1.0007467146140026e-06,
"loss": 0.061,
"step": 2552
},
{
"epoch": 0.8644896404308904,
"grad_norm": 0.58203125,
"learning_rate": 9.958704145536767e-07,
"loss": 0.0603,
"step": 2553
},
{
"epoch": 0.8648282576030137,
"grad_norm": 0.498046875,
"learning_rate": 9.91005401170524e-07,
"loss": 0.0645,
"step": 2554
},
{
"epoch": 0.865166874775137,
"grad_norm": 0.71875,
"learning_rate": 9.86151680562837e-07,
"loss": 0.0801,
"step": 2555
},
{
"epoch": 0.8655054919472603,
"grad_norm": 0.55078125,
"learning_rate": 9.813092588147554e-07,
"loss": 0.0743,
"step": 2556
},
{
"epoch": 0.8658441091193837,
"grad_norm": 0.45703125,
"learning_rate": 9.764781419962576e-07,
"loss": 0.066,
"step": 2557
},
{
"epoch": 0.866182726291507,
"grad_norm": 0.390625,
"learning_rate": 9.71658336163146e-07,
"loss": 0.051,
"step": 2558
},
{
"epoch": 0.8665213434636304,
"grad_norm": 0.54296875,
"learning_rate": 9.668498473570499e-07,
"loss": 0.0704,
"step": 2559
},
{
"epoch": 0.8668599606357538,
"grad_norm": 0.55859375,
"learning_rate": 9.620526816054065e-07,
"loss": 0.0629,
"step": 2560
},
{
"epoch": 0.8671985778078771,
"grad_norm": 0.482421875,
"learning_rate": 9.572668449214672e-07,
"loss": 0.0703,
"step": 2561
},
{
"epoch": 0.8675371949800004,
"grad_norm": 0.56640625,
"learning_rate": 9.52492343304281e-07,
"loss": 0.0715,
"step": 2562
},
{
"epoch": 0.8678758121521237,
"grad_norm": 0.53515625,
"learning_rate": 9.477291827386781e-07,
"loss": 0.0736,
"step": 2563
},
{
"epoch": 0.8682144293242471,
"grad_norm": 0.51171875,
"learning_rate": 9.42977369195286e-07,
"loss": 0.0627,
"step": 2564
},
{
"epoch": 0.8685530464963704,
"grad_norm": 0.53515625,
"learning_rate": 9.382369086305043e-07,
"loss": 0.0759,
"step": 2565
},
{
"epoch": 0.8688916636684938,
"grad_norm": 0.44140625,
"learning_rate": 9.335078069864967e-07,
"loss": 0.0654,
"step": 2566
},
{
"epoch": 0.8692302808406172,
"grad_norm": 0.4375,
"learning_rate": 9.287900701911945e-07,
"loss": 0.0488,
"step": 2567
},
{
"epoch": 0.8695688980127405,
"grad_norm": 0.44140625,
"learning_rate": 9.240837041582839e-07,
"loss": 0.0575,
"step": 2568
},
{
"epoch": 0.8699075151848639,
"grad_norm": 0.51171875,
"learning_rate": 9.193887147871905e-07,
"loss": 0.0628,
"step": 2569
},
{
"epoch": 0.8702461323569871,
"grad_norm": 0.474609375,
"learning_rate": 9.147051079630886e-07,
"loss": 0.0584,
"step": 2570
},
{
"epoch": 0.8705847495291105,
"grad_norm": 0.453125,
"learning_rate": 9.100328895568745e-07,
"loss": 0.0583,
"step": 2571
},
{
"epoch": 0.8709233667012338,
"grad_norm": 0.6015625,
"learning_rate": 9.053720654251774e-07,
"loss": 0.0696,
"step": 2572
},
{
"epoch": 0.8712619838733572,
"grad_norm": 0.65234375,
"learning_rate": 9.00722641410342e-07,
"loss": 0.0825,
"step": 2573
},
{
"epoch": 0.8716006010454805,
"grad_norm": 0.447265625,
"learning_rate": 8.960846233404175e-07,
"loss": 0.0647,
"step": 2574
},
{
"epoch": 0.8719392182176039,
"grad_norm": 0.83984375,
"learning_rate": 8.914580170291632e-07,
"loss": 0.1008,
"step": 2575
},
{
"epoch": 0.8722778353897273,
"grad_norm": 0.5,
"learning_rate": 8.86842828276031e-07,
"loss": 0.0718,
"step": 2576
},
{
"epoch": 0.8726164525618505,
"grad_norm": 0.462890625,
"learning_rate": 8.822390628661581e-07,
"loss": 0.0611,
"step": 2577
},
{
"epoch": 0.8729550697339739,
"grad_norm": 0.53515625,
"learning_rate": 8.77646726570367e-07,
"loss": 0.0616,
"step": 2578
},
{
"epoch": 0.8732936869060972,
"grad_norm": 0.421875,
"learning_rate": 8.730658251451485e-07,
"loss": 0.0514,
"step": 2579
},
{
"epoch": 0.8736323040782206,
"grad_norm": 0.3828125,
"learning_rate": 8.68496364332665e-07,
"loss": 0.0523,
"step": 2580
},
{
"epoch": 0.8739709212503439,
"grad_norm": 0.59375,
"learning_rate": 8.639383498607379e-07,
"loss": 0.0746,
"step": 2581
},
{
"epoch": 0.8743095384224673,
"grad_norm": 0.41796875,
"learning_rate": 8.593917874428348e-07,
"loss": 0.0556,
"step": 2582
},
{
"epoch": 0.8746481555945906,
"grad_norm": 0.41015625,
"learning_rate": 8.548566827780747e-07,
"loss": 0.0481,
"step": 2583
},
{
"epoch": 0.8749867727667139,
"grad_norm": 0.63671875,
"learning_rate": 8.503330415512123e-07,
"loss": 0.072,
"step": 2584
},
{
"epoch": 0.8753253899388372,
"grad_norm": 0.431640625,
"learning_rate": 8.458208694326287e-07,
"loss": 0.054,
"step": 2585
},
{
"epoch": 0.8756640071109606,
"grad_norm": 0.734375,
"learning_rate": 8.413201720783337e-07,
"loss": 0.1025,
"step": 2586
},
{
"epoch": 0.876002624283084,
"grad_norm": 0.58984375,
"learning_rate": 8.368309551299536e-07,
"loss": 0.0694,
"step": 2587
},
{
"epoch": 0.8763412414552073,
"grad_norm": 0.515625,
"learning_rate": 8.323532242147203e-07,
"loss": 0.067,
"step": 2588
},
{
"epoch": 0.8766798586273307,
"grad_norm": 0.48828125,
"learning_rate": 8.278869849454718e-07,
"loss": 0.0664,
"step": 2589
},
{
"epoch": 0.877018475799454,
"grad_norm": 0.578125,
"learning_rate": 8.234322429206354e-07,
"loss": 0.0697,
"step": 2590
},
{
"epoch": 0.8773570929715773,
"grad_norm": 0.376953125,
"learning_rate": 8.189890037242343e-07,
"loss": 0.0443,
"step": 2591
},
{
"epoch": 0.8776957101437006,
"grad_norm": 0.6015625,
"learning_rate": 8.145572729258689e-07,
"loss": 0.0524,
"step": 2592
},
{
"epoch": 0.878034327315824,
"grad_norm": 0.380859375,
"learning_rate": 8.101370560807132e-07,
"loss": 0.0537,
"step": 2593
},
{
"epoch": 0.8783729444879473,
"grad_norm": 0.439453125,
"learning_rate": 8.057283587295084e-07,
"loss": 0.0652,
"step": 2594
},
{
"epoch": 0.8787115616600707,
"grad_norm": 0.447265625,
"learning_rate": 8.013311863985596e-07,
"loss": 0.0605,
"step": 2595
},
{
"epoch": 0.879050178832194,
"grad_norm": 0.59375,
"learning_rate": 7.969455445997198e-07,
"loss": 0.0819,
"step": 2596
},
{
"epoch": 0.8793887960043174,
"grad_norm": 0.47265625,
"learning_rate": 7.92571438830394e-07,
"loss": 0.0681,
"step": 2597
},
{
"epoch": 0.8797274131764408,
"grad_norm": 0.478515625,
"learning_rate": 7.882088745735217e-07,
"loss": 0.0554,
"step": 2598
},
{
"epoch": 0.880066030348564,
"grad_norm": 0.69140625,
"learning_rate": 7.838578572975786e-07,
"loss": 0.0827,
"step": 2599
},
{
"epoch": 0.8804046475206874,
"grad_norm": 0.48046875,
"learning_rate": 7.795183924565675e-07,
"loss": 0.0565,
"step": 2600
},
{
"epoch": 0.8807432646928107,
"grad_norm": 0.48828125,
"learning_rate": 7.751904854900027e-07,
"loss": 0.0599,
"step": 2601
},
{
"epoch": 0.8810818818649341,
"grad_norm": 0.498046875,
"learning_rate": 7.708741418229215e-07,
"loss": 0.0602,
"step": 2602
},
{
"epoch": 0.8814204990370574,
"grad_norm": 0.4609375,
"learning_rate": 7.665693668658569e-07,
"loss": 0.0624,
"step": 2603
},
{
"epoch": 0.8817591162091808,
"grad_norm": 0.6015625,
"learning_rate": 7.62276166014847e-07,
"loss": 0.066,
"step": 2604
},
{
"epoch": 0.8820977333813041,
"grad_norm": 0.412109375,
"learning_rate": 7.579945446514192e-07,
"loss": 0.0527,
"step": 2605
},
{
"epoch": 0.8824363505534274,
"grad_norm": 0.474609375,
"learning_rate": 7.53724508142587e-07,
"loss": 0.0625,
"step": 2606
},
{
"epoch": 0.8827749677255508,
"grad_norm": 0.46875,
"learning_rate": 7.494660618408379e-07,
"loss": 0.0601,
"step": 2607
},
{
"epoch": 0.8831135848976741,
"grad_norm": 0.51171875,
"learning_rate": 7.452192110841383e-07,
"loss": 0.0722,
"step": 2608
},
{
"epoch": 0.8834522020697975,
"grad_norm": 0.447265625,
"learning_rate": 7.409839611959136e-07,
"loss": 0.0582,
"step": 2609
},
{
"epoch": 0.8837908192419208,
"grad_norm": 0.515625,
"learning_rate": 7.367603174850502e-07,
"loss": 0.0681,
"step": 2610
},
{
"epoch": 0.8841294364140442,
"grad_norm": 0.43359375,
"learning_rate": 7.325482852458887e-07,
"loss": 0.0551,
"step": 2611
},
{
"epoch": 0.8844680535861675,
"grad_norm": 0.6796875,
"learning_rate": 7.283478697582091e-07,
"loss": 0.0829,
"step": 2612
},
{
"epoch": 0.8848066707582908,
"grad_norm": 0.48828125,
"learning_rate": 7.241590762872319e-07,
"loss": 0.062,
"step": 2613
},
{
"epoch": 0.8851452879304141,
"grad_norm": 0.5078125,
"learning_rate": 7.199819100836136e-07,
"loss": 0.0635,
"step": 2614
},
{
"epoch": 0.8854839051025375,
"grad_norm": 0.6171875,
"learning_rate": 7.158163763834292e-07,
"loss": 0.0689,
"step": 2615
},
{
"epoch": 0.8858225222746609,
"grad_norm": 0.408203125,
"learning_rate": 7.116624804081773e-07,
"loss": 0.052,
"step": 2616
},
{
"epoch": 0.8861611394467842,
"grad_norm": 0.4296875,
"learning_rate": 7.075202273647652e-07,
"loss": 0.0523,
"step": 2617
},
{
"epoch": 0.8864997566189076,
"grad_norm": 0.5234375,
"learning_rate": 7.033896224455072e-07,
"loss": 0.0745,
"step": 2618
},
{
"epoch": 0.8868383737910309,
"grad_norm": 0.416015625,
"learning_rate": 6.992706708281205e-07,
"loss": 0.0497,
"step": 2619
},
{
"epoch": 0.8871769909631542,
"grad_norm": 0.427734375,
"learning_rate": 6.951633776757071e-07,
"loss": 0.0559,
"step": 2620
},
{
"epoch": 0.8875156081352775,
"grad_norm": 0.470703125,
"learning_rate": 6.910677481367623e-07,
"loss": 0.0584,
"step": 2621
},
{
"epoch": 0.8878542253074009,
"grad_norm": 0.392578125,
"learning_rate": 6.869837873451557e-07,
"loss": 0.0469,
"step": 2622
},
{
"epoch": 0.8881928424795242,
"grad_norm": 0.5703125,
"learning_rate": 6.829115004201325e-07,
"loss": 0.0613,
"step": 2623
},
{
"epoch": 0.8885314596516476,
"grad_norm": 0.484375,
"learning_rate": 6.788508924663084e-07,
"loss": 0.0785,
"step": 2624
},
{
"epoch": 0.888870076823771,
"grad_norm": 0.71875,
"learning_rate": 6.748019685736507e-07,
"loss": 0.0639,
"step": 2625
},
{
"epoch": 0.8892086939958943,
"grad_norm": 0.48046875,
"learning_rate": 6.707647338174905e-07,
"loss": 0.0623,
"step": 2626
},
{
"epoch": 0.8895473111680177,
"grad_norm": 0.63671875,
"learning_rate": 6.667391932584999e-07,
"loss": 0.047,
"step": 2627
},
{
"epoch": 0.8898859283401409,
"grad_norm": 0.609375,
"learning_rate": 6.627253519426913e-07,
"loss": 0.0521,
"step": 2628
},
{
"epoch": 0.8902245455122643,
"grad_norm": 0.455078125,
"learning_rate": 6.587232149014189e-07,
"loss": 0.0527,
"step": 2629
},
{
"epoch": 0.8905631626843876,
"grad_norm": 0.4296875,
"learning_rate": 6.54732787151362e-07,
"loss": 0.0537,
"step": 2630
},
{
"epoch": 0.890901779856511,
"grad_norm": 0.76953125,
"learning_rate": 6.507540736945195e-07,
"loss": 0.0994,
"step": 2631
},
{
"epoch": 0.8912403970286343,
"grad_norm": 0.58203125,
"learning_rate": 6.467870795182108e-07,
"loss": 0.0807,
"step": 2632
},
{
"epoch": 0.8915790142007577,
"grad_norm": 0.447265625,
"learning_rate": 6.428318095950648e-07,
"loss": 0.0555,
"step": 2633
},
{
"epoch": 0.891917631372881,
"grad_norm": 0.53515625,
"learning_rate": 6.388882688830089e-07,
"loss": 0.0607,
"step": 2634
},
{
"epoch": 0.8922562485450043,
"grad_norm": 0.5859375,
"learning_rate": 6.349564623252746e-07,
"loss": 0.0594,
"step": 2635
},
{
"epoch": 0.8925948657171277,
"grad_norm": 0.4140625,
"learning_rate": 6.310363948503806e-07,
"loss": 0.0569,
"step": 2636
},
{
"epoch": 0.892933482889251,
"grad_norm": 0.484375,
"learning_rate": 6.271280713721317e-07,
"loss": 0.0663,
"step": 2637
},
{
"epoch": 0.8932721000613744,
"grad_norm": 0.45703125,
"learning_rate": 6.232314967896136e-07,
"loss": 0.0572,
"step": 2638
},
{
"epoch": 0.8936107172334977,
"grad_norm": 0.443359375,
"learning_rate": 6.193466759871792e-07,
"loss": 0.0492,
"step": 2639
},
{
"epoch": 0.8939493344056211,
"grad_norm": 0.5546875,
"learning_rate": 6.154736138344564e-07,
"loss": 0.0611,
"step": 2640
},
{
"epoch": 0.8942879515777444,
"grad_norm": 0.53125,
"learning_rate": 6.11612315186324e-07,
"loss": 0.0669,
"step": 2641
},
{
"epoch": 0.8946265687498677,
"grad_norm": 0.45703125,
"learning_rate": 6.077627848829238e-07,
"loss": 0.0627,
"step": 2642
},
{
"epoch": 0.894965185921991,
"grad_norm": 0.443359375,
"learning_rate": 6.039250277496411e-07,
"loss": 0.0535,
"step": 2643
},
{
"epoch": 0.8953038030941144,
"grad_norm": 0.55078125,
"learning_rate": 6.000990485971048e-07,
"loss": 0.0703,
"step": 2644
},
{
"epoch": 0.8956424202662377,
"grad_norm": 0.47265625,
"learning_rate": 5.962848522211784e-07,
"loss": 0.0602,
"step": 2645
},
{
"epoch": 0.8959810374383611,
"grad_norm": 0.6640625,
"learning_rate": 5.924824434029619e-07,
"loss": 0.0655,
"step": 2646
},
{
"epoch": 0.8963196546104845,
"grad_norm": 0.466796875,
"learning_rate": 5.886918269087716e-07,
"loss": 0.0624,
"step": 2647
},
{
"epoch": 0.8966582717826078,
"grad_norm": 0.48046875,
"learning_rate": 5.849130074901444e-07,
"loss": 0.0661,
"step": 2648
},
{
"epoch": 0.8969968889547311,
"grad_norm": 0.67578125,
"learning_rate": 5.811459898838345e-07,
"loss": 0.0686,
"step": 2649
},
{
"epoch": 0.8973355061268544,
"grad_norm": 0.38671875,
"learning_rate": 5.77390778811796e-07,
"loss": 0.0509,
"step": 2650
},
{
"epoch": 0.8976741232989778,
"grad_norm": 0.5703125,
"learning_rate": 5.736473789811858e-07,
"loss": 0.0724,
"step": 2651
},
{
"epoch": 0.8980127404711011,
"grad_norm": 0.62109375,
"learning_rate": 5.699157950843592e-07,
"loss": 0.0741,
"step": 2652
},
{
"epoch": 0.8983513576432245,
"grad_norm": 0.38671875,
"learning_rate": 5.661960317988535e-07,
"loss": 0.05,
"step": 2653
},
{
"epoch": 0.8986899748153478,
"grad_norm": 0.54296875,
"learning_rate": 5.624880937873956e-07,
"loss": 0.063,
"step": 2654
},
{
"epoch": 0.8990285919874712,
"grad_norm": 0.46875,
"learning_rate": 5.587919856978819e-07,
"loss": 0.053,
"step": 2655
},
{
"epoch": 0.8993672091595946,
"grad_norm": 0.474609375,
"learning_rate": 5.551077121633875e-07,
"loss": 0.0588,
"step": 2656
},
{
"epoch": 0.8997058263317178,
"grad_norm": 0.6953125,
"learning_rate": 5.514352778021492e-07,
"loss": 0.0755,
"step": 2657
},
{
"epoch": 0.9000444435038412,
"grad_norm": 0.57421875,
"learning_rate": 5.477746872175615e-07,
"loss": 0.0687,
"step": 2658
},
{
"epoch": 0.9003830606759645,
"grad_norm": 0.470703125,
"learning_rate": 5.441259449981795e-07,
"loss": 0.0619,
"step": 2659
},
{
"epoch": 0.9007216778480879,
"grad_norm": 0.45703125,
"learning_rate": 5.404890557176967e-07,
"loss": 0.0589,
"step": 2660
},
{
"epoch": 0.9010602950202112,
"grad_norm": 0.53515625,
"learning_rate": 5.368640239349554e-07,
"loss": 0.0579,
"step": 2661
},
{
"epoch": 0.9013989121923346,
"grad_norm": 0.515625,
"learning_rate": 5.332508541939374e-07,
"loss": 0.0655,
"step": 2662
},
{
"epoch": 0.901737529364458,
"grad_norm": 0.515625,
"learning_rate": 5.296495510237453e-07,
"loss": 0.0809,
"step": 2663
},
{
"epoch": 0.9020761465365812,
"grad_norm": 0.65625,
"learning_rate": 5.26060118938616e-07,
"loss": 0.0621,
"step": 2664
},
{
"epoch": 0.9020761465365812,
"eval_loss": 0.06565158814191818,
"eval_runtime": 818.5534,
"eval_samples_per_second": 12.153,
"eval_steps_per_second": 3.038,
"step": 2664
},
{
"epoch": 0.9024147637087045,
"grad_norm": 0.640625,
"learning_rate": 5.224825624379048e-07,
"loss": 0.0672,
"step": 2665
},
{
"epoch": 0.9027533808808279,
"grad_norm": 0.51953125,
"learning_rate": 5.189168860060756e-07,
"loss": 0.0757,
"step": 2666
},
{
"epoch": 0.9030919980529513,
"grad_norm": 0.52734375,
"learning_rate": 5.153630941127063e-07,
"loss": 0.0584,
"step": 2667
},
{
"epoch": 0.9034306152250746,
"grad_norm": 0.84765625,
"learning_rate": 5.118211912124726e-07,
"loss": 0.0992,
"step": 2668
},
{
"epoch": 0.903769232397198,
"grad_norm": 0.5390625,
"learning_rate": 5.082911817451541e-07,
"loss": 0.0662,
"step": 2669
},
{
"epoch": 0.9041078495693213,
"grad_norm": 0.47265625,
"learning_rate": 5.047730701356146e-07,
"loss": 0.0505,
"step": 2670
},
{
"epoch": 0.9044464667414446,
"grad_norm": 0.6875,
"learning_rate": 5.012668607938087e-07,
"loss": 0.0694,
"step": 2671
},
{
"epoch": 0.9047850839135679,
"grad_norm": 0.578125,
"learning_rate": 4.977725581147697e-07,
"loss": 0.0784,
"step": 2672
},
{
"epoch": 0.9051237010856913,
"grad_norm": 0.453125,
"learning_rate": 4.942901664786071e-07,
"loss": 0.0589,
"step": 2673
},
{
"epoch": 0.9054623182578146,
"grad_norm": 0.482421875,
"learning_rate": 4.90819690250497e-07,
"loss": 0.0609,
"step": 2674
},
{
"epoch": 0.905800935429938,
"grad_norm": 0.455078125,
"learning_rate": 4.873611337806838e-07,
"loss": 0.0631,
"step": 2675
},
{
"epoch": 0.9061395526020614,
"grad_norm": 0.53125,
"learning_rate": 4.839145014044688e-07,
"loss": 0.0775,
"step": 2676
},
{
"epoch": 0.9064781697741847,
"grad_norm": 0.515625,
"learning_rate": 4.804797974422026e-07,
"loss": 0.0666,
"step": 2677
},
{
"epoch": 0.906816786946308,
"grad_norm": 0.486328125,
"learning_rate": 4.770570261992913e-07,
"loss": 0.0547,
"step": 2678
},
{
"epoch": 0.9071554041184313,
"grad_norm": 0.408203125,
"learning_rate": 4.73646191966175e-07,
"loss": 0.0487,
"step": 2679
},
{
"epoch": 0.9074940212905547,
"grad_norm": 0.48828125,
"learning_rate": 4.70247299018336e-07,
"loss": 0.0698,
"step": 2680
},
{
"epoch": 0.907832638462678,
"grad_norm": 0.427734375,
"learning_rate": 4.668603516162895e-07,
"loss": 0.0562,
"step": 2681
},
{
"epoch": 0.9081712556348014,
"grad_norm": 0.5078125,
"learning_rate": 4.634853540055706e-07,
"loss": 0.0682,
"step": 2682
},
{
"epoch": 0.9085098728069247,
"grad_norm": 0.65234375,
"learning_rate": 4.601223104167407e-07,
"loss": 0.0755,
"step": 2683
},
{
"epoch": 0.9088484899790481,
"grad_norm": 0.490234375,
"learning_rate": 4.567712250653755e-07,
"loss": 0.0657,
"step": 2684
},
{
"epoch": 0.9091871071511715,
"grad_norm": 0.9375,
"learning_rate": 4.5343210215206047e-07,
"loss": 0.0488,
"step": 2685
},
{
"epoch": 0.9095257243232947,
"grad_norm": 0.57421875,
"learning_rate": 4.501049458623863e-07,
"loss": 0.0672,
"step": 2686
},
{
"epoch": 0.9098643414954181,
"grad_norm": 0.423828125,
"learning_rate": 4.4678976036694354e-07,
"loss": 0.0471,
"step": 2687
},
{
"epoch": 0.9102029586675414,
"grad_norm": 0.447265625,
"learning_rate": 4.43486549821317e-07,
"loss": 0.0542,
"step": 2688
},
{
"epoch": 0.9105415758396648,
"grad_norm": 0.57421875,
"learning_rate": 4.401953183660834e-07,
"loss": 0.0772,
"step": 2689
},
{
"epoch": 0.9108801930117881,
"grad_norm": 0.52734375,
"learning_rate": 4.369160701268016e-07,
"loss": 0.0738,
"step": 2690
},
{
"epoch": 0.9112188101839115,
"grad_norm": 0.86328125,
"learning_rate": 4.3364880921400567e-07,
"loss": 0.1494,
"step": 2691
},
{
"epoch": 0.9115574273560348,
"grad_norm": 0.578125,
"learning_rate": 4.303935397232117e-07,
"loss": 0.081,
"step": 2692
},
{
"epoch": 0.9118960445281581,
"grad_norm": 0.59765625,
"learning_rate": 4.271502657348969e-07,
"loss": 0.0663,
"step": 2693
},
{
"epoch": 0.9122346617002814,
"grad_norm": 0.435546875,
"learning_rate": 4.23918991314507e-07,
"loss": 0.0502,
"step": 2694
},
{
"epoch": 0.9125732788724048,
"grad_norm": 0.431640625,
"learning_rate": 4.2069972051244635e-07,
"loss": 0.055,
"step": 2695
},
{
"epoch": 0.9129118960445282,
"grad_norm": 0.671875,
"learning_rate": 4.174924573640682e-07,
"loss": 0.0958,
"step": 2696
},
{
"epoch": 0.9132505132166515,
"grad_norm": 0.52734375,
"learning_rate": 4.14297205889681e-07,
"loss": 0.0754,
"step": 2697
},
{
"epoch": 0.9135891303887749,
"grad_norm": 0.37890625,
"learning_rate": 4.111139700945277e-07,
"loss": 0.052,
"step": 2698
},
{
"epoch": 0.9139277475608982,
"grad_norm": 0.5,
"learning_rate": 4.0794275396879856e-07,
"loss": 0.0686,
"step": 2699
},
{
"epoch": 0.9142663647330215,
"grad_norm": 0.53515625,
"learning_rate": 4.047835614876128e-07,
"loss": 0.0685,
"step": 2700
},
{
"epoch": 0.9146049819051448,
"grad_norm": 0.59765625,
"learning_rate": 4.0163639661101594e-07,
"loss": 0.0812,
"step": 2701
},
{
"epoch": 0.9149435990772682,
"grad_norm": 0.5234375,
"learning_rate": 3.985012632839824e-07,
"loss": 0.0745,
"step": 2702
},
{
"epoch": 0.9152822162493915,
"grad_norm": 0.47265625,
"learning_rate": 3.9537816543640085e-07,
"loss": 0.0658,
"step": 2703
},
{
"epoch": 0.9156208334215149,
"grad_norm": 0.427734375,
"learning_rate": 3.9226710698307416e-07,
"loss": 0.0452,
"step": 2704
},
{
"epoch": 0.9159594505936383,
"grad_norm": 0.609375,
"learning_rate": 3.891680918237151e-07,
"loss": 0.0815,
"step": 2705
},
{
"epoch": 0.9162980677657616,
"grad_norm": 0.384765625,
"learning_rate": 3.8608112384293963e-07,
"loss": 0.0468,
"step": 2706
},
{
"epoch": 0.9166366849378849,
"grad_norm": 0.45703125,
"learning_rate": 3.8300620691026024e-07,
"loss": 0.0508,
"step": 2707
},
{
"epoch": 0.9169753021100082,
"grad_norm": 0.54296875,
"learning_rate": 3.799433448800893e-07,
"loss": 0.0618,
"step": 2708
},
{
"epoch": 0.9173139192821316,
"grad_norm": 0.5390625,
"learning_rate": 3.7689254159172127e-07,
"loss": 0.0641,
"step": 2709
},
{
"epoch": 0.9176525364542549,
"grad_norm": 0.7578125,
"learning_rate": 3.738538008693393e-07,
"loss": 0.0743,
"step": 2710
},
{
"epoch": 0.9179911536263783,
"grad_norm": 0.4140625,
"learning_rate": 3.708271265220087e-07,
"loss": 0.0496,
"step": 2711
},
{
"epoch": 0.9183297707985016,
"grad_norm": 0.4375,
"learning_rate": 3.6781252234365905e-07,
"loss": 0.058,
"step": 2712
},
{
"epoch": 0.918668387970625,
"grad_norm": 0.50390625,
"learning_rate": 3.64809992113101e-07,
"loss": 0.0674,
"step": 2713
},
{
"epoch": 0.9190070051427484,
"grad_norm": 0.53125,
"learning_rate": 3.618195395940083e-07,
"loss": 0.0584,
"step": 2714
},
{
"epoch": 0.9193456223148716,
"grad_norm": 0.59375,
"learning_rate": 3.5884116853490915e-07,
"loss": 0.0713,
"step": 2715
},
{
"epoch": 0.919684239486995,
"grad_norm": 0.416015625,
"learning_rate": 3.558748826691949e-07,
"loss": 0.0544,
"step": 2716
},
{
"epoch": 0.9200228566591183,
"grad_norm": 0.51171875,
"learning_rate": 3.529206857151035e-07,
"loss": 0.0735,
"step": 2717
},
{
"epoch": 0.9203614738312417,
"grad_norm": 0.51171875,
"learning_rate": 3.4997858137572174e-07,
"loss": 0.0596,
"step": 2718
},
{
"epoch": 0.920700091003365,
"grad_norm": 0.453125,
"learning_rate": 3.4704857333897834e-07,
"loss": 0.0601,
"step": 2719
},
{
"epoch": 0.9210387081754884,
"grad_norm": 0.6015625,
"learning_rate": 3.4413066527763774e-07,
"loss": 0.0785,
"step": 2720
},
{
"epoch": 0.9213773253476117,
"grad_norm": 0.42578125,
"learning_rate": 3.412248608492974e-07,
"loss": 0.0552,
"step": 2721
},
{
"epoch": 0.921715942519735,
"grad_norm": 0.462890625,
"learning_rate": 3.38331163696386e-07,
"loss": 0.0571,
"step": 2722
},
{
"epoch": 0.9220545596918583,
"grad_norm": 0.48046875,
"learning_rate": 3.354495774461497e-07,
"loss": 0.063,
"step": 2723
},
{
"epoch": 0.9223931768639817,
"grad_norm": 0.5625,
"learning_rate": 3.3258010571065925e-07,
"loss": 0.0796,
"step": 2724
},
{
"epoch": 0.9227317940361051,
"grad_norm": 0.61328125,
"learning_rate": 3.2972275208679625e-07,
"loss": 0.0554,
"step": 2725
},
{
"epoch": 0.9230704112082284,
"grad_norm": 0.46484375,
"learning_rate": 3.2687752015625574e-07,
"loss": 0.0585,
"step": 2726
},
{
"epoch": 0.9234090283803518,
"grad_norm": 0.53125,
"learning_rate": 3.2404441348553475e-07,
"loss": 0.0628,
"step": 2727
},
{
"epoch": 0.9237476455524751,
"grad_norm": 0.65625,
"learning_rate": 3.212234356259325e-07,
"loss": 0.0557,
"step": 2728
},
{
"epoch": 0.9240862627245984,
"grad_norm": 0.5078125,
"learning_rate": 3.18414590113546e-07,
"loss": 0.0617,
"step": 2729
},
{
"epoch": 0.9244248798967217,
"grad_norm": 0.44921875,
"learning_rate": 3.1561788046926335e-07,
"loss": 0.0522,
"step": 2730
},
{
"epoch": 0.9247634970688451,
"grad_norm": 0.5625,
"learning_rate": 3.1283331019875905e-07,
"loss": 0.0849,
"step": 2731
},
{
"epoch": 0.9251021142409684,
"grad_norm": 0.486328125,
"learning_rate": 3.100608827924934e-07,
"loss": 0.063,
"step": 2732
},
{
"epoch": 0.9254407314130918,
"grad_norm": 0.494140625,
"learning_rate": 3.0730060172570407e-07,
"loss": 0.0636,
"step": 2733
},
{
"epoch": 0.9257793485852152,
"grad_norm": 0.5703125,
"learning_rate": 3.045524704584024e-07,
"loss": 0.0786,
"step": 2734
},
{
"epoch": 0.9261179657573385,
"grad_norm": 0.4375,
"learning_rate": 3.018164924353739e-07,
"loss": 0.0595,
"step": 2735
},
{
"epoch": 0.9264565829294618,
"grad_norm": 0.515625,
"learning_rate": 2.990926710861641e-07,
"loss": 0.0659,
"step": 2736
},
{
"epoch": 0.9267952001015851,
"grad_norm": 0.6484375,
"learning_rate": 2.963810098250841e-07,
"loss": 0.0725,
"step": 2737
},
{
"epoch": 0.9271338172737085,
"grad_norm": 0.5390625,
"learning_rate": 2.936815120512038e-07,
"loss": 0.0688,
"step": 2738
},
{
"epoch": 0.9274724344458318,
"grad_norm": 0.546875,
"learning_rate": 2.909941811483408e-07,
"loss": 0.0729,
"step": 2739
},
{
"epoch": 0.9278110516179552,
"grad_norm": 0.44921875,
"learning_rate": 2.883190204850661e-07,
"loss": 0.0586,
"step": 2740
},
{
"epoch": 0.9281496687900785,
"grad_norm": 0.62109375,
"learning_rate": 2.8565603341469514e-07,
"loss": 0.0993,
"step": 2741
},
{
"epoch": 0.9284882859622019,
"grad_norm": 0.5,
"learning_rate": 2.8300522327528e-07,
"loss": 0.0586,
"step": 2742
},
{
"epoch": 0.9288269031343253,
"grad_norm": 0.58984375,
"learning_rate": 2.803665933896127e-07,
"loss": 0.0638,
"step": 2743
},
{
"epoch": 0.9291655203064485,
"grad_norm": 0.41796875,
"learning_rate": 2.7774014706521524e-07,
"loss": 0.0539,
"step": 2744
},
{
"epoch": 0.9295041374785719,
"grad_norm": 0.40625,
"learning_rate": 2.7512588759433857e-07,
"loss": 0.0481,
"step": 2745
},
{
"epoch": 0.9298427546506952,
"grad_norm": 0.5625,
"learning_rate": 2.7252381825395804e-07,
"loss": 0.0726,
"step": 2746
},
{
"epoch": 0.9301813718228186,
"grad_norm": 0.423828125,
"learning_rate": 2.6993394230576676e-07,
"loss": 0.0488,
"step": 2747
},
{
"epoch": 0.9305199889949419,
"grad_norm": 0.3828125,
"learning_rate": 2.6735626299617456e-07,
"loss": 0.0516,
"step": 2748
},
{
"epoch": 0.9308586061670653,
"grad_norm": 0.56640625,
"learning_rate": 2.647907835563035e-07,
"loss": 0.077,
"step": 2749
},
{
"epoch": 0.9311972233391886,
"grad_norm": 5.59375,
"learning_rate": 2.6223750720198115e-07,
"loss": 0.079,
"step": 2750
},
{
"epoch": 0.9315358405113119,
"grad_norm": 0.59765625,
"learning_rate": 2.596964371337418e-07,
"loss": 0.0828,
"step": 2751
},
{
"epoch": 0.9318744576834352,
"grad_norm": 0.47265625,
"learning_rate": 2.5716757653681313e-07,
"loss": 0.0627,
"step": 2752
},
{
"epoch": 0.9322130748555586,
"grad_norm": 0.478515625,
"learning_rate": 2.5465092858112495e-07,
"loss": 0.0546,
"step": 2753
},
{
"epoch": 0.932551692027682,
"grad_norm": 0.453125,
"learning_rate": 2.521464964212972e-07,
"loss": 0.0481,
"step": 2754
},
{
"epoch": 0.9328903091998053,
"grad_norm": 0.55078125,
"learning_rate": 2.4965428319663085e-07,
"loss": 0.0664,
"step": 2755
},
{
"epoch": 0.9332289263719287,
"grad_norm": 0.58984375,
"learning_rate": 2.471742920311193e-07,
"loss": 0.0811,
"step": 2756
},
{
"epoch": 0.933567543544052,
"grad_norm": 0.48828125,
"learning_rate": 2.4470652603343024e-07,
"loss": 0.0636,
"step": 2757
},
{
"epoch": 0.9339061607161753,
"grad_norm": 0.5234375,
"learning_rate": 2.422509882969093e-07,
"loss": 0.0657,
"step": 2758
},
{
"epoch": 0.9342447778882986,
"grad_norm": 0.439453125,
"learning_rate": 2.3980768189957205e-07,
"loss": 0.0632,
"step": 2759
},
{
"epoch": 0.934583395060422,
"grad_norm": 1.0546875,
"learning_rate": 2.3737660990410415e-07,
"loss": 0.0615,
"step": 2760
},
{
"epoch": 0.9349220122325453,
"grad_norm": 0.40234375,
"learning_rate": 2.349577753578547e-07,
"loss": 0.043,
"step": 2761
},
{
"epoch": 0.9352606294046687,
"grad_norm": 0.55078125,
"learning_rate": 2.325511812928327e-07,
"loss": 0.0684,
"step": 2762
},
{
"epoch": 0.935599246576792,
"grad_norm": 0.4296875,
"learning_rate": 2.3015683072570406e-07,
"loss": 0.0581,
"step": 2763
},
{
"epoch": 0.9359378637489154,
"grad_norm": 0.5078125,
"learning_rate": 2.2777472665778678e-07,
"loss": 0.0654,
"step": 2764
},
{
"epoch": 0.9362764809210387,
"grad_norm": 0.51171875,
"learning_rate": 2.2540487207505012e-07,
"loss": 0.0574,
"step": 2765
},
{
"epoch": 0.936615098093162,
"grad_norm": 0.4375,
"learning_rate": 2.2304726994810454e-07,
"loss": 0.0502,
"step": 2766
},
{
"epoch": 0.9369537152652854,
"grad_norm": 0.765625,
"learning_rate": 2.2070192323220606e-07,
"loss": 0.0884,
"step": 2767
},
{
"epoch": 0.9372923324374087,
"grad_norm": 0.453125,
"learning_rate": 2.1836883486724857e-07,
"loss": 0.0496,
"step": 2768
},
{
"epoch": 0.9376309496095321,
"grad_norm": 0.46875,
"learning_rate": 2.1604800777775492e-07,
"loss": 0.0553,
"step": 2769
},
{
"epoch": 0.9379695667816554,
"grad_norm": 0.400390625,
"learning_rate": 2.1373944487288577e-07,
"loss": 0.0578,
"step": 2770
},
{
"epoch": 0.9383081839537788,
"grad_norm": 0.62109375,
"learning_rate": 2.1144314904642194e-07,
"loss": 0.0569,
"step": 2771
},
{
"epoch": 0.9386468011259022,
"grad_norm": 0.423828125,
"learning_rate": 2.091591231767709e-07,
"loss": 0.0565,
"step": 2772
},
{
"epoch": 0.9389854182980254,
"grad_norm": 0.49609375,
"learning_rate": 2.0688737012696136e-07,
"loss": 0.0561,
"step": 2773
},
{
"epoch": 0.9393240354701488,
"grad_norm": 0.51171875,
"learning_rate": 2.0462789274463323e-07,
"loss": 0.069,
"step": 2774
},
{
"epoch": 0.9396626526422721,
"grad_norm": 0.55859375,
"learning_rate": 2.023806938620443e-07,
"loss": 0.0771,
"step": 2775
},
{
"epoch": 0.9400012698143955,
"grad_norm": 0.44921875,
"learning_rate": 2.0014577629605681e-07,
"loss": 0.0607,
"step": 2776
},
{
"epoch": 0.9403398869865188,
"grad_norm": 0.5234375,
"learning_rate": 1.9792314284813984e-07,
"loss": 0.0706,
"step": 2777
},
{
"epoch": 0.9406785041586422,
"grad_norm": 0.734375,
"learning_rate": 1.957127963043648e-07,
"loss": 0.0555,
"step": 2778
},
{
"epoch": 0.9410171213307655,
"grad_norm": 0.408203125,
"learning_rate": 1.93514739435402e-07,
"loss": 0.0445,
"step": 2779
},
{
"epoch": 0.9413557385028888,
"grad_norm": 0.53515625,
"learning_rate": 1.9132897499651636e-07,
"loss": 0.0706,
"step": 2780
},
{
"epoch": 0.9416943556750121,
"grad_norm": 0.4765625,
"learning_rate": 1.8915550572756293e-07,
"loss": 0.0628,
"step": 2781
},
{
"epoch": 0.9420329728471355,
"grad_norm": 0.78125,
"learning_rate": 1.8699433435298452e-07,
"loss": 0.0882,
"step": 2782
},
{
"epoch": 0.9423715900192589,
"grad_norm": 0.48828125,
"learning_rate": 1.848454635818109e-07,
"loss": 0.0704,
"step": 2783
},
{
"epoch": 0.9427102071913822,
"grad_norm": 0.6171875,
"learning_rate": 1.8270889610765285e-07,
"loss": 0.0689,
"step": 2784
},
{
"epoch": 0.9430488243635056,
"grad_norm": 0.671875,
"learning_rate": 1.8058463460869478e-07,
"loss": 0.0878,
"step": 2785
},
{
"epoch": 0.9433874415356289,
"grad_norm": 0.5234375,
"learning_rate": 1.7847268174770226e-07,
"loss": 0.0701,
"step": 2786
},
{
"epoch": 0.9437260587077522,
"grad_norm": 0.6640625,
"learning_rate": 1.763730401720065e-07,
"loss": 0.0942,
"step": 2787
},
{
"epoch": 0.9440646758798755,
"grad_norm": 0.54296875,
"learning_rate": 1.7428571251350779e-07,
"loss": 0.0697,
"step": 2788
},
{
"epoch": 0.9444032930519989,
"grad_norm": 0.51171875,
"learning_rate": 1.7221070138867312e-07,
"loss": 0.0664,
"step": 2789
},
{
"epoch": 0.9447419102241222,
"grad_norm": 0.5,
"learning_rate": 1.701480093985275e-07,
"loss": 0.0477,
"step": 2790
},
{
"epoch": 0.9450805273962456,
"grad_norm": 0.390625,
"learning_rate": 1.6809763912865596e-07,
"loss": 0.0492,
"step": 2791
},
{
"epoch": 0.945419144568369,
"grad_norm": 0.515625,
"learning_rate": 1.660595931491993e-07,
"loss": 0.063,
"step": 2792
},
{
"epoch": 0.9457577617404923,
"grad_norm": 0.5625,
"learning_rate": 1.6403387401484506e-07,
"loss": 0.076,
"step": 2793
},
{
"epoch": 0.9460963789126156,
"grad_norm": 0.54296875,
"learning_rate": 1.6202048426483652e-07,
"loss": 0.067,
"step": 2794
},
{
"epoch": 0.9464349960847389,
"grad_norm": 0.55078125,
"learning_rate": 1.6001942642295487e-07,
"loss": 0.0734,
"step": 2795
},
{
"epoch": 0.9467736132568623,
"grad_norm": 0.419921875,
"learning_rate": 1.580307029975281e-07,
"loss": 0.0465,
"step": 2796
},
{
"epoch": 0.9471122304289856,
"grad_norm": 0.435546875,
"learning_rate": 1.5605431648141878e-07,
"loss": 0.0428,
"step": 2797
},
{
"epoch": 0.947450847601109,
"grad_norm": 0.58203125,
"learning_rate": 1.5409026935203075e-07,
"loss": 0.0825,
"step": 2798
},
{
"epoch": 0.9477894647732323,
"grad_norm": 0.49609375,
"learning_rate": 1.5213856407129467e-07,
"loss": 0.0632,
"step": 2799
},
{
"epoch": 0.9481280819453557,
"grad_norm": 0.33203125,
"learning_rate": 1.501992030856736e-07,
"loss": 0.0405,
"step": 2800
},
{
"epoch": 0.9484666991174789,
"grad_norm": 0.64453125,
"learning_rate": 1.4827218882615847e-07,
"loss": 0.0767,
"step": 2801
},
{
"epoch": 0.9488053162896023,
"grad_norm": 0.61328125,
"learning_rate": 1.463575237082593e-07,
"loss": 0.1085,
"step": 2802
},
{
"epoch": 0.9491439334617257,
"grad_norm": 0.427734375,
"learning_rate": 1.444552101320107e-07,
"loss": 0.0514,
"step": 2803
},
{
"epoch": 0.949482550633849,
"grad_norm": 0.5234375,
"learning_rate": 1.42565250481963e-07,
"loss": 0.068,
"step": 2804
},
{
"epoch": 0.9498211678059724,
"grad_norm": 0.458984375,
"learning_rate": 1.4068764712717897e-07,
"loss": 0.0566,
"step": 2805
},
{
"epoch": 0.9501597849780957,
"grad_norm": 0.4609375,
"learning_rate": 1.3882240242123811e-07,
"loss": 0.0567,
"step": 2806
},
{
"epoch": 0.9504984021502191,
"grad_norm": 0.5234375,
"learning_rate": 1.3696951870222018e-07,
"loss": 0.0671,
"step": 2807
},
{
"epoch": 0.9508370193223424,
"grad_norm": 0.51953125,
"learning_rate": 1.3512899829271954e-07,
"loss": 0.0617,
"step": 2808
},
{
"epoch": 0.9511756364944657,
"grad_norm": 0.46875,
"learning_rate": 1.3330084349982509e-07,
"loss": 0.0632,
"step": 2809
},
{
"epoch": 0.951514253666589,
"grad_norm": 1.1171875,
"learning_rate": 1.3148505661513045e-07,
"loss": 0.0709,
"step": 2810
},
{
"epoch": 0.9518528708387124,
"grad_norm": 0.50390625,
"learning_rate": 1.2968163991472493e-07,
"loss": 0.0774,
"step": 2811
},
{
"epoch": 0.9521914880108358,
"grad_norm": 0.392578125,
"learning_rate": 1.2789059565919138e-07,
"loss": 0.0542,
"step": 2812
},
{
"epoch": 0.9525301051829591,
"grad_norm": 0.484375,
"learning_rate": 1.261119260936039e-07,
"loss": 0.0662,
"step": 2813
},
{
"epoch": 0.9528687223550825,
"grad_norm": 0.455078125,
"learning_rate": 1.243456334475246e-07,
"loss": 0.0544,
"step": 2814
},
{
"epoch": 0.9532073395272058,
"grad_norm": 0.47265625,
"learning_rate": 1.225917199350013e-07,
"loss": 0.0614,
"step": 2815
},
{
"epoch": 0.9535459566993291,
"grad_norm": 0.50390625,
"learning_rate": 1.2085018775456648e-07,
"loss": 0.0535,
"step": 2816
},
{
"epoch": 0.9538845738714524,
"grad_norm": 0.375,
"learning_rate": 1.1912103908922945e-07,
"loss": 0.0468,
"step": 2817
},
{
"epoch": 0.9542231910435758,
"grad_norm": 0.6328125,
"learning_rate": 1.1740427610647643e-07,
"loss": 0.0739,
"step": 2818
},
{
"epoch": 0.9545618082156991,
"grad_norm": 0.423828125,
"learning_rate": 1.1569990095827378e-07,
"loss": 0.0539,
"step": 2819
},
{
"epoch": 0.9549004253878225,
"grad_norm": 0.55078125,
"learning_rate": 1.1400791578105253e-07,
"loss": 0.0921,
"step": 2820
},
{
"epoch": 0.9552390425599459,
"grad_norm": 0.94140625,
"learning_rate": 1.1232832269571725e-07,
"loss": 0.1924,
"step": 2821
},
{
"epoch": 0.9555776597320692,
"grad_norm": 0.703125,
"learning_rate": 1.1066112380763939e-07,
"loss": 0.0674,
"step": 2822
},
{
"epoch": 0.9559162769041925,
"grad_norm": 0.5,
"learning_rate": 1.0900632120665166e-07,
"loss": 0.0646,
"step": 2823
},
{
"epoch": 0.9562548940763158,
"grad_norm": 0.5625,
"learning_rate": 1.073639169670504e-07,
"loss": 0.0756,
"step": 2824
},
{
"epoch": 0.9565935112484392,
"grad_norm": 0.58203125,
"learning_rate": 1.0573391314758652e-07,
"loss": 0.0681,
"step": 2825
},
{
"epoch": 0.9569321284205625,
"grad_norm": 0.55859375,
"learning_rate": 1.0411631179147342e-07,
"loss": 0.0694,
"step": 2826
},
{
"epoch": 0.9572707455926859,
"grad_norm": 0.55078125,
"learning_rate": 1.0251111492637245e-07,
"loss": 0.0779,
"step": 2827
},
{
"epoch": 0.9576093627648092,
"grad_norm": 0.451171875,
"learning_rate": 1.0091832456439854e-07,
"loss": 0.0551,
"step": 2828
},
{
"epoch": 0.9579479799369326,
"grad_norm": 0.494140625,
"learning_rate": 9.933794270211461e-08,
"loss": 0.0679,
"step": 2829
},
{
"epoch": 0.9582865971090558,
"grad_norm": 0.51171875,
"learning_rate": 9.776997132052935e-08,
"loss": 0.0604,
"step": 2830
},
{
"epoch": 0.9586252142811792,
"grad_norm": 0.44140625,
"learning_rate": 9.621441238509611e-08,
"loss": 0.0617,
"step": 2831
},
{
"epoch": 0.9589638314533026,
"grad_norm": 0.53515625,
"learning_rate": 9.467126784570623e-08,
"loss": 0.0703,
"step": 2832
},
{
"epoch": 0.9593024486254259,
"grad_norm": 0.6171875,
"learning_rate": 9.314053963669245e-08,
"loss": 0.0632,
"step": 2833
},
{
"epoch": 0.9596410657975493,
"grad_norm": 0.431640625,
"learning_rate": 9.162222967682322e-08,
"loss": 0.0564,
"step": 2834
},
{
"epoch": 0.9599796829696726,
"grad_norm": 0.50390625,
"learning_rate": 9.011633986929947e-08,
"loss": 0.0722,
"step": 2835
},
{
"epoch": 0.960318300141796,
"grad_norm": 0.490234375,
"learning_rate": 8.862287210175347e-08,
"loss": 0.0665,
"step": 2836
},
{
"epoch": 0.9606569173139193,
"grad_norm": 0.494140625,
"learning_rate": 8.714182824624883e-08,
"loss": 0.0717,
"step": 2837
},
{
"epoch": 0.9609955344860426,
"grad_norm": 0.46875,
"learning_rate": 8.567321015927387e-08,
"loss": 0.0603,
"step": 2838
},
{
"epoch": 0.9613341516581659,
"grad_norm": 0.5703125,
"learning_rate": 8.421701968174156e-08,
"loss": 0.0772,
"step": 2839
},
{
"epoch": 0.9616727688302893,
"grad_norm": 0.5234375,
"learning_rate": 8.27732586389851e-08,
"loss": 0.0668,
"step": 2840
},
{
"epoch": 0.9620113860024126,
"grad_norm": 0.52734375,
"learning_rate": 8.134192884076131e-08,
"loss": 0.0734,
"step": 2841
},
{
"epoch": 0.962350003174536,
"grad_norm": 0.423828125,
"learning_rate": 7.992303208123941e-08,
"loss": 0.0504,
"step": 2842
},
{
"epoch": 0.9626886203466594,
"grad_norm": 0.51171875,
"learning_rate": 7.851657013901003e-08,
"loss": 0.0743,
"step": 2843
},
{
"epoch": 0.9630272375187827,
"grad_norm": 0.478515625,
"learning_rate": 7.712254477707071e-08,
"loss": 0.0614,
"step": 2844
},
{
"epoch": 0.963365854690906,
"grad_norm": 0.470703125,
"learning_rate": 7.574095774283363e-08,
"loss": 0.0666,
"step": 2845
},
{
"epoch": 0.9637044718630293,
"grad_norm": 0.48046875,
"learning_rate": 7.437181076811794e-08,
"loss": 0.06,
"step": 2846
},
{
"epoch": 0.9640430890351527,
"grad_norm": 0.427734375,
"learning_rate": 7.301510556914859e-08,
"loss": 0.0535,
"step": 2847
},
{
"epoch": 0.964381706207276,
"grad_norm": 0.70703125,
"learning_rate": 7.167084384655742e-08,
"loss": 0.0815,
"step": 2848
},
{
"epoch": 0.9647203233793994,
"grad_norm": 0.439453125,
"learning_rate": 7.033902728537546e-08,
"loss": 0.0635,
"step": 2849
},
{
"epoch": 0.9650589405515227,
"grad_norm": 0.443359375,
"learning_rate": 6.901965755503503e-08,
"loss": 0.0566,
"step": 2850
},
{
"epoch": 0.9653975577236461,
"grad_norm": 0.51171875,
"learning_rate": 6.77127363093666e-08,
"loss": 0.0646,
"step": 2851
},
{
"epoch": 0.9657361748957694,
"grad_norm": 0.41796875,
"learning_rate": 6.641826518659633e-08,
"loss": 0.0575,
"step": 2852
},
{
"epoch": 0.9660747920678927,
"grad_norm": 0.43359375,
"learning_rate": 6.513624580934186e-08,
"loss": 0.0448,
"step": 2853
},
{
"epoch": 0.9664134092400161,
"grad_norm": 0.462890625,
"learning_rate": 6.386667978461658e-08,
"loss": 0.0509,
"step": 2854
},
{
"epoch": 0.9667520264121394,
"grad_norm": 0.55078125,
"learning_rate": 6.260956870382196e-08,
"loss": 0.0709,
"step": 2855
},
{
"epoch": 0.9670906435842628,
"grad_norm": 0.55078125,
"learning_rate": 6.136491414274415e-08,
"loss": 0.0599,
"step": 2856
},
{
"epoch": 0.9674292607563861,
"grad_norm": 0.478515625,
"learning_rate": 6.01327176615607e-08,
"loss": 0.0645,
"step": 2857
},
{
"epoch": 0.9677678779285095,
"grad_norm": 0.494140625,
"learning_rate": 5.891298080482943e-08,
"loss": 0.0707,
"step": 2858
},
{
"epoch": 0.9681064951006327,
"grad_norm": 0.71875,
"learning_rate": 5.770570510148954e-08,
"loss": 0.0616,
"step": 2859
},
{
"epoch": 0.9684451122727561,
"grad_norm": 0.59375,
"learning_rate": 5.65108920648616e-08,
"loss": 0.087,
"step": 2860
},
{
"epoch": 0.9687837294448794,
"grad_norm": 0.625,
"learning_rate": 5.5328543192643134e-08,
"loss": 0.0885,
"step": 2861
},
{
"epoch": 0.9691223466170028,
"grad_norm": 0.50390625,
"learning_rate": 5.4158659966909724e-08,
"loss": 0.0677,
"step": 2862
},
{
"epoch": 0.9694609637891262,
"grad_norm": 0.498046875,
"learning_rate": 5.300124385410943e-08,
"loss": 0.0629,
"step": 2863
},
{
"epoch": 0.9697995809612495,
"grad_norm": 0.5390625,
"learning_rate": 5.1856296305063945e-08,
"loss": 0.0759,
"step": 2864
},
{
"epoch": 0.9701381981333729,
"grad_norm": 0.451171875,
"learning_rate": 5.072381875496524e-08,
"loss": 0.065,
"step": 2865
},
{
"epoch": 0.9704768153054962,
"grad_norm": 0.447265625,
"learning_rate": 4.960381262337333e-08,
"loss": 0.0499,
"step": 2866
},
{
"epoch": 0.9708154324776195,
"grad_norm": 0.431640625,
"learning_rate": 4.84962793142163e-08,
"loss": 0.0571,
"step": 2867
},
{
"epoch": 0.9711540496497428,
"grad_norm": 0.5859375,
"learning_rate": 4.740122021578808e-08,
"loss": 0.0695,
"step": 2868
},
{
"epoch": 0.9714926668218662,
"grad_norm": 0.435546875,
"learning_rate": 4.6318636700743994e-08,
"loss": 0.0598,
"step": 2869
},
{
"epoch": 0.9718312839939895,
"grad_norm": 0.412109375,
"learning_rate": 4.5248530126102976e-08,
"loss": 0.0446,
"step": 2870
},
{
"epoch": 0.9721699011661129,
"grad_norm": 0.515625,
"learning_rate": 4.419090183324315e-08,
"loss": 0.0603,
"step": 2871
},
{
"epoch": 0.9725085183382363,
"grad_norm": 0.5546875,
"learning_rate": 4.314575314790292e-08,
"loss": 0.074,
"step": 2872
},
{
"epoch": 0.9728471355103596,
"grad_norm": 0.53515625,
"learning_rate": 4.2113085380176556e-08,
"loss": 0.0664,
"step": 2873
},
{
"epoch": 0.9731857526824829,
"grad_norm": 0.515625,
"learning_rate": 4.109289982451081e-08,
"loss": 0.0668,
"step": 2874
},
{
"epoch": 0.9735243698546062,
"grad_norm": 0.52734375,
"learning_rate": 4.008519775971054e-08,
"loss": 0.0689,
"step": 2875
},
{
"epoch": 0.9738629870267296,
"grad_norm": 0.376953125,
"learning_rate": 3.908998044892975e-08,
"loss": 0.0483,
"step": 2876
},
{
"epoch": 0.9742016041988529,
"grad_norm": 0.53125,
"learning_rate": 3.810724913967278e-08,
"loss": 0.0719,
"step": 2877
},
{
"epoch": 0.9745402213709763,
"grad_norm": 0.4609375,
"learning_rate": 3.713700506379536e-08,
"loss": 0.0687,
"step": 2878
},
{
"epoch": 0.9748788385430996,
"grad_norm": 0.58984375,
"learning_rate": 3.617924943749573e-08,
"loss": 0.0788,
"step": 2879
},
{
"epoch": 0.975217455715223,
"grad_norm": 0.609375,
"learning_rate": 3.5233983461322453e-08,
"loss": 0.0752,
"step": 2880
},
{
"epoch": 0.9755560728873462,
"grad_norm": 0.5234375,
"learning_rate": 3.430120832016659e-08,
"loss": 0.0585,
"step": 2881
},
{
"epoch": 0.9758946900594696,
"grad_norm": 0.58203125,
"learning_rate": 3.338092518326064e-08,
"loss": 0.0634,
"step": 2882
},
{
"epoch": 0.976233307231593,
"grad_norm": 0.474609375,
"learning_rate": 3.2473135204180715e-08,
"loss": 0.0662,
"step": 2883
},
{
"epoch": 0.9765719244037163,
"grad_norm": 0.58203125,
"learning_rate": 3.1577839520841034e-08,
"loss": 0.0625,
"step": 2884
},
{
"epoch": 0.9769105415758397,
"grad_norm": 0.5,
"learning_rate": 3.0695039255494995e-08,
"loss": 0.0629,
"step": 2885
},
{
"epoch": 0.977249158747963,
"grad_norm": 0.578125,
"learning_rate": 2.982473551473297e-08,
"loss": 0.0783,
"step": 2886
},
{
"epoch": 0.9775877759200864,
"grad_norm": 0.435546875,
"learning_rate": 2.8966929389481202e-08,
"loss": 0.0542,
"step": 2887
},
{
"epoch": 0.9779263930922096,
"grad_norm": 0.515625,
"learning_rate": 2.8121621954998457e-08,
"loss": 0.0662,
"step": 2888
},
{
"epoch": 0.978265010264333,
"grad_norm": 0.5390625,
"learning_rate": 2.7288814270878262e-08,
"loss": 0.0559,
"step": 2889
},
{
"epoch": 0.9786036274364563,
"grad_norm": 0.80078125,
"learning_rate": 2.6468507381045562e-08,
"loss": 0.0583,
"step": 2890
},
{
"epoch": 0.9789422446085797,
"grad_norm": 0.7109375,
"learning_rate": 2.5660702313754505e-08,
"loss": 0.0789,
"step": 2891
},
{
"epoch": 0.9792808617807031,
"grad_norm": 1.1015625,
"learning_rate": 2.4865400081589552e-08,
"loss": 0.0491,
"step": 2892
},
{
"epoch": 0.9796194789528264,
"grad_norm": 0.375,
"learning_rate": 2.4082601681461038e-08,
"loss": 0.0448,
"step": 2893
},
{
"epoch": 0.9799580961249498,
"grad_norm": 0.5546875,
"learning_rate": 2.3312308094607382e-08,
"loss": 0.063,
"step": 2894
},
{
"epoch": 0.9802967132970731,
"grad_norm": 0.67578125,
"learning_rate": 2.2554520286592885e-08,
"loss": 0.0595,
"step": 2895
},
{
"epoch": 0.9806353304691964,
"grad_norm": 0.455078125,
"learning_rate": 2.180923920730216e-08,
"loss": 0.0575,
"step": 2896
},
{
"epoch": 0.9809739476413197,
"grad_norm": 0.546875,
"learning_rate": 2.10764657909468e-08,
"loss": 0.0802,
"step": 2897
},
{
"epoch": 0.9813125648134431,
"grad_norm": 0.451171875,
"learning_rate": 2.0356200956058725e-08,
"loss": 0.0639,
"step": 2898
},
{
"epoch": 0.9816511819855664,
"grad_norm": 0.373046875,
"learning_rate": 1.9648445605487954e-08,
"loss": 0.044,
"step": 2899
},
{
"epoch": 0.9819897991576898,
"grad_norm": 0.6484375,
"learning_rate": 1.8953200626408153e-08,
"loss": 0.0908,
"step": 2900
},
{
"epoch": 0.9823284163298132,
"grad_norm": 0.5703125,
"learning_rate": 1.827046689030665e-08,
"loss": 0.0594,
"step": 2901
},
{
"epoch": 0.9826670335019365,
"grad_norm": 0.54296875,
"learning_rate": 1.76002452529922e-08,
"loss": 0.051,
"step": 2902
},
{
"epoch": 0.9830056506740598,
"grad_norm": 0.59765625,
"learning_rate": 1.6942536554587218e-08,
"loss": 0.0653,
"step": 2903
},
{
"epoch": 0.9833442678461831,
"grad_norm": 0.41796875,
"learning_rate": 1.6297341619528894e-08,
"loss": 0.0472,
"step": 2904
},
{
"epoch": 0.9836828850183065,
"grad_norm": 0.53515625,
"learning_rate": 1.566466125656918e-08,
"loss": 0.0653,
"step": 2905
},
{
"epoch": 0.9840215021904298,
"grad_norm": 0.5546875,
"learning_rate": 1.50444962587748e-08,
"loss": 0.079,
"step": 2906
},
{
"epoch": 0.9843601193625532,
"grad_norm": 0.578125,
"learning_rate": 1.4436847403519471e-08,
"loss": 0.0816,
"step": 2907
},
{
"epoch": 0.9846987365346765,
"grad_norm": 0.57421875,
"learning_rate": 1.3841715452493908e-08,
"loss": 0.0587,
"step": 2908
},
{
"epoch": 0.9850373537067999,
"grad_norm": 1.1875,
"learning_rate": 1.325910115169471e-08,
"loss": 0.0831,
"step": 2909
},
{
"epoch": 0.9853759708789231,
"grad_norm": 0.455078125,
"learning_rate": 1.2689005231429907e-08,
"loss": 0.0584,
"step": 2910
},
{
"epoch": 0.9857145880510465,
"grad_norm": 0.44140625,
"learning_rate": 1.2131428406313428e-08,
"loss": 0.0553,
"step": 2911
},
{
"epoch": 0.9860532052231699,
"grad_norm": 0.58984375,
"learning_rate": 1.1586371375268413e-08,
"loss": 0.0721,
"step": 2912
},
{
"epoch": 0.9863918223952932,
"grad_norm": 0.4140625,
"learning_rate": 1.105383482152389e-08,
"loss": 0.0446,
"step": 2913
},
{
"epoch": 0.9867304395674166,
"grad_norm": 0.51953125,
"learning_rate": 1.0533819412614776e-08,
"loss": 0.0634,
"step": 2914
},
{
"epoch": 0.9870690567395399,
"grad_norm": 0.66015625,
"learning_rate": 1.0026325800380766e-08,
"loss": 0.0712,
"step": 2915
},
{
"epoch": 0.9874076739116633,
"grad_norm": 0.443359375,
"learning_rate": 9.531354620964107e-09,
"loss": 0.0529,
"step": 2916
},
{
"epoch": 0.9877462910837865,
"grad_norm": 0.609375,
"learning_rate": 9.048906494811826e-09,
"loss": 0.0603,
"step": 2917
},
{
"epoch": 0.9880849082559099,
"grad_norm": 0.486328125,
"learning_rate": 8.5789820266724e-09,
"loss": 0.0652,
"step": 2918
},
{
"epoch": 0.9884235254280332,
"grad_norm": 0.515625,
"learning_rate": 8.121581805596857e-09,
"loss": 0.0653,
"step": 2919
},
{
"epoch": 0.9887621426001566,
"grad_norm": 0.423828125,
"learning_rate": 7.676706404935453e-09,
"loss": 0.0558,
"step": 2920
},
{
"epoch": 0.98910075977228,
"grad_norm": 0.490234375,
"learning_rate": 7.24435638233989e-09,
"loss": 0.066,
"step": 2921
},
{
"epoch": 0.9894393769444033,
"grad_norm": 0.4140625,
"learning_rate": 6.824532279761098e-09,
"loss": 0.0527,
"step": 2922
},
{
"epoch": 0.9897779941165267,
"grad_norm": 0.42578125,
"learning_rate": 6.417234623449231e-09,
"loss": 0.0532,
"step": 2923
},
{
"epoch": 0.99011661128865,
"grad_norm": 0.4375,
"learning_rate": 6.02246392395145e-09,
"loss": 0.0528,
"step": 2924
},
{
"epoch": 0.9904552284607733,
"grad_norm": 0.421875,
"learning_rate": 5.6402206761119185e-09,
"loss": 0.0547,
"step": 2925
},
{
"epoch": 0.9907938456328966,
"grad_norm": 0.671875,
"learning_rate": 5.27050535907403e-09,
"loss": 0.0707,
"step": 2926
},
{
"epoch": 0.99113246280502,
"grad_norm": 0.49609375,
"learning_rate": 4.91331843627485e-09,
"loss": 0.0661,
"step": 2927
},
{
"epoch": 0.9914710799771433,
"grad_norm": 0.4765625,
"learning_rate": 4.568660355448451e-09,
"loss": 0.0526,
"step": 2928
},
{
"epoch": 0.9918096971492667,
"grad_norm": 0.490234375,
"learning_rate": 4.2365315486248e-09,
"loss": 0.0686,
"step": 2929
},
{
"epoch": 0.9921483143213901,
"grad_norm": 0.41015625,
"learning_rate": 3.91693243212643e-09,
"loss": 0.0554,
"step": 2930
},
{
"epoch": 0.9924869314935134,
"grad_norm": 0.419921875,
"learning_rate": 3.609863406570657e-09,
"loss": 0.0495,
"step": 2931
},
{
"epoch": 0.9928255486656367,
"grad_norm": 0.5078125,
"learning_rate": 3.315324856869584e-09,
"loss": 0.0686,
"step": 2932
},
{
"epoch": 0.99316416583776,
"grad_norm": 0.423828125,
"learning_rate": 3.0333171522256568e-09,
"loss": 0.0586,
"step": 2933
},
{
"epoch": 0.9935027830098834,
"grad_norm": 0.462890625,
"learning_rate": 2.7638406461372167e-09,
"loss": 0.065,
"step": 2934
},
{
"epoch": 0.9938414001820067,
"grad_norm": 0.62109375,
"learning_rate": 2.5068956763918405e-09,
"loss": 0.0688,
"step": 2935
},
{
"epoch": 0.9941800173541301,
"grad_norm": 0.66796875,
"learning_rate": 2.262482565070778e-09,
"loss": 0.0933,
"step": 2936
},
{
"epoch": 0.9945186345262534,
"grad_norm": 0.83984375,
"learning_rate": 2.0306016185456243e-09,
"loss": 0.0699,
"step": 2937
},
{
"epoch": 0.9948572516983768,
"grad_norm": 0.4375,
"learning_rate": 1.8112531274794287e-09,
"loss": 0.0583,
"step": 2938
},
{
"epoch": 0.9951958688705,
"grad_norm": 0.5,
"learning_rate": 1.6044373668255841e-09,
"loss": 0.0712,
"step": 2939
},
{
"epoch": 0.9955344860426234,
"grad_norm": 0.640625,
"learning_rate": 1.4101545958267183e-09,
"loss": 0.0991,
"step": 2940
},
{
"epoch": 0.9958731032147468,
"grad_norm": 0.53125,
"learning_rate": 1.228405058018023e-09,
"loss": 0.0565,
"step": 2941
},
{
"epoch": 0.9962117203868701,
"grad_norm": 0.50390625,
"learning_rate": 1.0591889812205934e-09,
"loss": 0.067,
"step": 2942
},
{
"epoch": 0.9965503375589935,
"grad_norm": 0.380859375,
"learning_rate": 9.025065775492003e-10,
"loss": 0.0483,
"step": 2943
},
{
"epoch": 0.9968889547311168,
"grad_norm": 0.431640625,
"learning_rate": 7.583580434022963e-10,
"loss": 0.064,
"step": 2944
},
{
"epoch": 0.9972275719032402,
"grad_norm": 0.494140625,
"learning_rate": 6.267435594720095e-10,
"loss": 0.0585,
"step": 2945
},
{
"epoch": 0.9975661890753634,
"grad_norm": 0.451171875,
"learning_rate": 5.076632907374812e-10,
"loss": 0.0568,
"step": 2946
},
{
"epoch": 0.9979048062474868,
"grad_norm": 0.486328125,
"learning_rate": 4.011173864637563e-10,
"loss": 0.0697,
"step": 2947
},
{
"epoch": 0.9982434234196101,
"grad_norm": 0.5234375,
"learning_rate": 3.0710598020844416e-10,
"loss": 0.0631,
"step": 2948
},
{
"epoch": 0.9985820405917335,
"grad_norm": 0.44921875,
"learning_rate": 2.2562918981394732e-10,
"loss": 0.0585,
"step": 2949
},
{
"epoch": 0.9989206577638569,
"grad_norm": 0.52734375,
"learning_rate": 1.5668711741079202e-10,
"loss": 0.0752,
"step": 2950
},
{
"epoch": 0.9992592749359802,
"grad_norm": 0.48046875,
"learning_rate": 1.0027984941873847e-10,
"loss": 0.0674,
"step": 2951
},
{
"epoch": 0.9995978921081036,
"grad_norm": 0.609375,
"learning_rate": 5.640745654345026e-11,
"loss": 0.0774,
"step": 2952
},
{
"epoch": 0.9999365092802269,
"grad_norm": 0.515625,
"learning_rate": 2.5069993779824887e-11,
"loss": 0.0685,
"step": 2953
},
{
"epoch": 1.0,
"grad_norm": 0.98046875,
"learning_rate": 6.267500408663196e-12,
"loss": 0.0576,
"step": 2954
}
],
"logging_steps": 1.0,
"max_steps": 2954,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 296,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.726211900679385e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}