baseline-Llama-3-8B-Instruct-sft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9977761304670127,
"eval_steps": 500,
"global_step": 1011,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029651593773165306,
"grad_norm": 11.262527465820312,
"learning_rate": 1.9607843137254904e-07,
"loss": 0.9138,
"step": 1
},
{
"epoch": 0.005930318754633061,
"grad_norm": 10.613697052001953,
"learning_rate": 3.921568627450981e-07,
"loss": 0.9147,
"step": 2
},
{
"epoch": 0.008895478131949592,
"grad_norm": 11.271403312683105,
"learning_rate": 5.882352941176471e-07,
"loss": 0.9118,
"step": 3
},
{
"epoch": 0.011860637509266123,
"grad_norm": 11.093155860900879,
"learning_rate": 7.843137254901962e-07,
"loss": 0.9137,
"step": 4
},
{
"epoch": 0.014825796886582653,
"grad_norm": 11.175023078918457,
"learning_rate": 9.80392156862745e-07,
"loss": 0.8879,
"step": 5
},
{
"epoch": 0.017790956263899184,
"grad_norm": 10.275071144104004,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.864,
"step": 6
},
{
"epoch": 0.020756115641215715,
"grad_norm": 8.285501480102539,
"learning_rate": 1.3725490196078434e-06,
"loss": 0.8612,
"step": 7
},
{
"epoch": 0.023721275018532245,
"grad_norm": 6.519635200500488,
"learning_rate": 1.5686274509803923e-06,
"loss": 0.8372,
"step": 8
},
{
"epoch": 0.026686434395848776,
"grad_norm": 6.018601894378662,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.8244,
"step": 9
},
{
"epoch": 0.029651593773165306,
"grad_norm": 5.061045169830322,
"learning_rate": 1.96078431372549e-06,
"loss": 0.8057,
"step": 10
},
{
"epoch": 0.03261675315048184,
"grad_norm": 5.859638214111328,
"learning_rate": 2.1568627450980393e-06,
"loss": 0.7734,
"step": 11
},
{
"epoch": 0.03558191252779837,
"grad_norm": 5.410571098327637,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.7635,
"step": 12
},
{
"epoch": 0.0385470719051149,
"grad_norm": 3.8421123027801514,
"learning_rate": 2.549019607843137e-06,
"loss": 0.7373,
"step": 13
},
{
"epoch": 0.04151223128243143,
"grad_norm": 2.3517632484436035,
"learning_rate": 2.7450980392156867e-06,
"loss": 0.7035,
"step": 14
},
{
"epoch": 0.04447739065974796,
"grad_norm": 2.1120362281799316,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.6795,
"step": 15
},
{
"epoch": 0.04744255003706449,
"grad_norm": 2.042616605758667,
"learning_rate": 3.1372549019607846e-06,
"loss": 0.6596,
"step": 16
},
{
"epoch": 0.050407709414381024,
"grad_norm": 1.781117558479309,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6325,
"step": 17
},
{
"epoch": 0.05337286879169755,
"grad_norm": 1.464235782623291,
"learning_rate": 3.529411764705883e-06,
"loss": 0.6265,
"step": 18
},
{
"epoch": 0.056338028169014086,
"grad_norm": 1.1197887659072876,
"learning_rate": 3.7254901960784316e-06,
"loss": 0.6251,
"step": 19
},
{
"epoch": 0.05930318754633061,
"grad_norm": 1.1305307149887085,
"learning_rate": 3.92156862745098e-06,
"loss": 0.6271,
"step": 20
},
{
"epoch": 0.06226834692364715,
"grad_norm": 1.1442177295684814,
"learning_rate": 4.11764705882353e-06,
"loss": 0.605,
"step": 21
},
{
"epoch": 0.06523350630096368,
"grad_norm": 0.8627598881721497,
"learning_rate": 4.313725490196079e-06,
"loss": 0.5979,
"step": 22
},
{
"epoch": 0.0681986656782802,
"grad_norm": 0.9222763776779175,
"learning_rate": 4.509803921568628e-06,
"loss": 0.6027,
"step": 23
},
{
"epoch": 0.07116382505559674,
"grad_norm": 0.787282407283783,
"learning_rate": 4.705882352941177e-06,
"loss": 0.5928,
"step": 24
},
{
"epoch": 0.07412898443291327,
"grad_norm": 0.8055775165557861,
"learning_rate": 4.901960784313726e-06,
"loss": 0.5842,
"step": 25
},
{
"epoch": 0.0770941438102298,
"grad_norm": 0.713017463684082,
"learning_rate": 5.098039215686274e-06,
"loss": 0.5694,
"step": 26
},
{
"epoch": 0.08005930318754632,
"grad_norm": 0.7474880814552307,
"learning_rate": 5.294117647058824e-06,
"loss": 0.5558,
"step": 27
},
{
"epoch": 0.08302446256486286,
"grad_norm": 0.7316311001777649,
"learning_rate": 5.4901960784313735e-06,
"loss": 0.5629,
"step": 28
},
{
"epoch": 0.08598962194217939,
"grad_norm": 0.760550856590271,
"learning_rate": 5.686274509803922e-06,
"loss": 0.5574,
"step": 29
},
{
"epoch": 0.08895478131949593,
"grad_norm": 0.7376196384429932,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5562,
"step": 30
},
{
"epoch": 0.09191994069681246,
"grad_norm": 0.7215123176574707,
"learning_rate": 6.07843137254902e-06,
"loss": 0.5438,
"step": 31
},
{
"epoch": 0.09488510007412898,
"grad_norm": 0.7079214453697205,
"learning_rate": 6.274509803921569e-06,
"loss": 0.5418,
"step": 32
},
{
"epoch": 0.09785025945144551,
"grad_norm": 0.6675574779510498,
"learning_rate": 6.470588235294119e-06,
"loss": 0.5402,
"step": 33
},
{
"epoch": 0.10081541882876205,
"grad_norm": 0.6604759693145752,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5344,
"step": 34
},
{
"epoch": 0.10378057820607858,
"grad_norm": 0.7341721057891846,
"learning_rate": 6.862745098039216e-06,
"loss": 0.5336,
"step": 35
},
{
"epoch": 0.1067457375833951,
"grad_norm": 0.6764810681343079,
"learning_rate": 7.058823529411766e-06,
"loss": 0.5327,
"step": 36
},
{
"epoch": 0.10971089696071164,
"grad_norm": 0.6292859315872192,
"learning_rate": 7.2549019607843145e-06,
"loss": 0.5275,
"step": 37
},
{
"epoch": 0.11267605633802817,
"grad_norm": 0.7222408652305603,
"learning_rate": 7.450980392156863e-06,
"loss": 0.5207,
"step": 38
},
{
"epoch": 0.1156412157153447,
"grad_norm": 0.592737078666687,
"learning_rate": 7.647058823529411e-06,
"loss": 0.5202,
"step": 39
},
{
"epoch": 0.11860637509266123,
"grad_norm": 0.7391071915626526,
"learning_rate": 7.84313725490196e-06,
"loss": 0.5088,
"step": 40
},
{
"epoch": 0.12157153446997776,
"grad_norm": 0.5978769659996033,
"learning_rate": 8.03921568627451e-06,
"loss": 0.5059,
"step": 41
},
{
"epoch": 0.1245366938472943,
"grad_norm": 0.7067713737487793,
"learning_rate": 8.23529411764706e-06,
"loss": 0.5079,
"step": 42
},
{
"epoch": 0.12750185322461083,
"grad_norm": 0.6121165752410889,
"learning_rate": 8.43137254901961e-06,
"loss": 0.4998,
"step": 43
},
{
"epoch": 0.13046701260192736,
"grad_norm": 0.7495785355567932,
"learning_rate": 8.627450980392157e-06,
"loss": 0.4877,
"step": 44
},
{
"epoch": 0.1334321719792439,
"grad_norm": 0.6476943492889404,
"learning_rate": 8.823529411764707e-06,
"loss": 0.4971,
"step": 45
},
{
"epoch": 0.1363973313565604,
"grad_norm": 0.7655041813850403,
"learning_rate": 9.019607843137256e-06,
"loss": 0.5002,
"step": 46
},
{
"epoch": 0.13936249073387694,
"grad_norm": 0.6622442007064819,
"learning_rate": 9.215686274509804e-06,
"loss": 0.484,
"step": 47
},
{
"epoch": 0.14232765011119347,
"grad_norm": 0.7732651233673096,
"learning_rate": 9.411764705882354e-06,
"loss": 0.4922,
"step": 48
},
{
"epoch": 0.14529280948851,
"grad_norm": 0.6692637205123901,
"learning_rate": 9.607843137254903e-06,
"loss": 0.4733,
"step": 49
},
{
"epoch": 0.14825796886582654,
"grad_norm": 0.705590546131134,
"learning_rate": 9.803921568627451e-06,
"loss": 0.4734,
"step": 50
},
{
"epoch": 0.15122312824314307,
"grad_norm": 0.6731917858123779,
"learning_rate": 1e-05,
"loss": 0.4651,
"step": 51
},
{
"epoch": 0.1541882876204596,
"grad_norm": 0.6704531908035278,
"learning_rate": 1.0196078431372549e-05,
"loss": 0.4689,
"step": 52
},
{
"epoch": 0.15715344699777614,
"grad_norm": 0.6448220610618591,
"learning_rate": 1.03921568627451e-05,
"loss": 0.4675,
"step": 53
},
{
"epoch": 0.16011860637509265,
"grad_norm": 0.6441836953163147,
"learning_rate": 1.0588235294117648e-05,
"loss": 0.4557,
"step": 54
},
{
"epoch": 0.16308376575240918,
"grad_norm": 0.7347533106803894,
"learning_rate": 1.0784313725490196e-05,
"loss": 0.4622,
"step": 55
},
{
"epoch": 0.16604892512972572,
"grad_norm": 0.6999682784080505,
"learning_rate": 1.0980392156862747e-05,
"loss": 0.4446,
"step": 56
},
{
"epoch": 0.16901408450704225,
"grad_norm": 0.6985459327697754,
"learning_rate": 1.1176470588235295e-05,
"loss": 0.4471,
"step": 57
},
{
"epoch": 0.17197924388435878,
"grad_norm": 0.7167170643806458,
"learning_rate": 1.1372549019607844e-05,
"loss": 0.4465,
"step": 58
},
{
"epoch": 0.17494440326167532,
"grad_norm": 0.6770612001419067,
"learning_rate": 1.1568627450980394e-05,
"loss": 0.4374,
"step": 59
},
{
"epoch": 0.17790956263899185,
"grad_norm": 0.7454700469970703,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.4346,
"step": 60
},
{
"epoch": 0.1808747220163084,
"grad_norm": 0.726898193359375,
"learning_rate": 1.1960784313725491e-05,
"loss": 0.4287,
"step": 61
},
{
"epoch": 0.18383988139362492,
"grad_norm": 0.7026724219322205,
"learning_rate": 1.215686274509804e-05,
"loss": 0.4242,
"step": 62
},
{
"epoch": 0.18680504077094143,
"grad_norm": 1.0427573919296265,
"learning_rate": 1.235294117647059e-05,
"loss": 0.4301,
"step": 63
},
{
"epoch": 0.18977020014825796,
"grad_norm": 0.9116256833076477,
"learning_rate": 1.2549019607843138e-05,
"loss": 0.4131,
"step": 64
},
{
"epoch": 0.1927353595255745,
"grad_norm": 0.7025630474090576,
"learning_rate": 1.2745098039215686e-05,
"loss": 0.4175,
"step": 65
},
{
"epoch": 0.19570051890289103,
"grad_norm": 1.24030339717865,
"learning_rate": 1.2941176470588238e-05,
"loss": 0.4166,
"step": 66
},
{
"epoch": 0.19866567828020756,
"grad_norm": 0.7674146294593811,
"learning_rate": 1.3137254901960785e-05,
"loss": 0.4042,
"step": 67
},
{
"epoch": 0.2016308376575241,
"grad_norm": 0.7968058586120605,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.4015,
"step": 68
},
{
"epoch": 0.20459599703484063,
"grad_norm": 0.9057684540748596,
"learning_rate": 1.3529411764705885e-05,
"loss": 0.3992,
"step": 69
},
{
"epoch": 0.20756115641215717,
"grad_norm": 0.8404118418693542,
"learning_rate": 1.3725490196078432e-05,
"loss": 0.3974,
"step": 70
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.8619468212127686,
"learning_rate": 1.392156862745098e-05,
"loss": 0.4023,
"step": 71
},
{
"epoch": 0.2134914751667902,
"grad_norm": 0.745784342288971,
"learning_rate": 1.4117647058823532e-05,
"loss": 0.3929,
"step": 72
},
{
"epoch": 0.21645663454410674,
"grad_norm": 0.8499307632446289,
"learning_rate": 1.431372549019608e-05,
"loss": 0.3827,
"step": 73
},
{
"epoch": 0.21942179392142327,
"grad_norm": 0.8255784511566162,
"learning_rate": 1.4509803921568629e-05,
"loss": 0.3831,
"step": 74
},
{
"epoch": 0.2223869532987398,
"grad_norm": 0.8738009333610535,
"learning_rate": 1.4705882352941179e-05,
"loss": 0.377,
"step": 75
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.8723142147064209,
"learning_rate": 1.4901960784313726e-05,
"loss": 0.3685,
"step": 76
},
{
"epoch": 0.22831727205337288,
"grad_norm": 0.8929502964019775,
"learning_rate": 1.5098039215686276e-05,
"loss": 0.3787,
"step": 77
},
{
"epoch": 0.2312824314306894,
"grad_norm": 1.0882786512374878,
"learning_rate": 1.5294117647058822e-05,
"loss": 0.3652,
"step": 78
},
{
"epoch": 0.23424759080800592,
"grad_norm": 0.9075109362602234,
"learning_rate": 1.5490196078431373e-05,
"loss": 0.3674,
"step": 79
},
{
"epoch": 0.23721275018532245,
"grad_norm": 1.1592175960540771,
"learning_rate": 1.568627450980392e-05,
"loss": 0.3644,
"step": 80
},
{
"epoch": 0.24017790956263899,
"grad_norm": 0.8505756258964539,
"learning_rate": 1.5882352941176473e-05,
"loss": 0.3642,
"step": 81
},
{
"epoch": 0.24314306893995552,
"grad_norm": 0.9724293947219849,
"learning_rate": 1.607843137254902e-05,
"loss": 0.3467,
"step": 82
},
{
"epoch": 0.24610822831727205,
"grad_norm": 1.0010569095611572,
"learning_rate": 1.627450980392157e-05,
"loss": 0.3582,
"step": 83
},
{
"epoch": 0.2490733876945886,
"grad_norm": 0.9776509404182434,
"learning_rate": 1.647058823529412e-05,
"loss": 0.3494,
"step": 84
},
{
"epoch": 0.2520385470719051,
"grad_norm": 0.9763832688331604,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.3487,
"step": 85
},
{
"epoch": 0.25500370644922166,
"grad_norm": 0.8749181628227234,
"learning_rate": 1.686274509803922e-05,
"loss": 0.3425,
"step": 86
},
{
"epoch": 0.25796886582653816,
"grad_norm": 0.922757089138031,
"learning_rate": 1.7058823529411767e-05,
"loss": 0.3431,
"step": 87
},
{
"epoch": 0.2609340252038547,
"grad_norm": 0.8772656321525574,
"learning_rate": 1.7254901960784314e-05,
"loss": 0.3424,
"step": 88
},
{
"epoch": 0.26389918458117123,
"grad_norm": 0.8626474738121033,
"learning_rate": 1.7450980392156866e-05,
"loss": 0.3351,
"step": 89
},
{
"epoch": 0.2668643439584878,
"grad_norm": 0.8123406767845154,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.3274,
"step": 90
},
{
"epoch": 0.2698295033358043,
"grad_norm": 0.8629675507545471,
"learning_rate": 1.7843137254901965e-05,
"loss": 0.3332,
"step": 91
},
{
"epoch": 0.2727946627131208,
"grad_norm": 0.7453241944313049,
"learning_rate": 1.8039215686274513e-05,
"loss": 0.3264,
"step": 92
},
{
"epoch": 0.27575982209043737,
"grad_norm": 0.8055425882339478,
"learning_rate": 1.823529411764706e-05,
"loss": 0.3196,
"step": 93
},
{
"epoch": 0.2787249814677539,
"grad_norm": 0.8176495432853699,
"learning_rate": 1.843137254901961e-05,
"loss": 0.3167,
"step": 94
},
{
"epoch": 0.28169014084507044,
"grad_norm": 0.7777736186981201,
"learning_rate": 1.862745098039216e-05,
"loss": 0.318,
"step": 95
},
{
"epoch": 0.28465530022238694,
"grad_norm": 0.8604575395584106,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.3231,
"step": 96
},
{
"epoch": 0.2876204595997035,
"grad_norm": 0.821183979511261,
"learning_rate": 1.9019607843137255e-05,
"loss": 0.3176,
"step": 97
},
{
"epoch": 0.29058561897702,
"grad_norm": 0.8958712816238403,
"learning_rate": 1.9215686274509807e-05,
"loss": 0.3155,
"step": 98
},
{
"epoch": 0.2935507783543366,
"grad_norm": 0.9813326001167297,
"learning_rate": 1.9411764705882355e-05,
"loss": 0.3182,
"step": 99
},
{
"epoch": 0.2965159377316531,
"grad_norm": 0.9215829968452454,
"learning_rate": 1.9607843137254903e-05,
"loss": 0.3084,
"step": 100
},
{
"epoch": 0.2994810971089696,
"grad_norm": 0.8247601389884949,
"learning_rate": 1.9803921568627454e-05,
"loss": 0.3032,
"step": 101
},
{
"epoch": 0.30244625648628615,
"grad_norm": 0.8188148736953735,
"learning_rate": 2e-05,
"loss": 0.3059,
"step": 102
},
{
"epoch": 0.30541141586360265,
"grad_norm": 0.8999500870704651,
"learning_rate": 1.9999940277008807e-05,
"loss": 0.3086,
"step": 103
},
{
"epoch": 0.3083765752409192,
"grad_norm": 0.8770850300788879,
"learning_rate": 1.99997611087486e-05,
"loss": 0.299,
"step": 104
},
{
"epoch": 0.3113417346182357,
"grad_norm": 0.8018732070922852,
"learning_rate": 1.9999462497359468e-05,
"loss": 0.3034,
"step": 105
},
{
"epoch": 0.3143068939955523,
"grad_norm": 0.8308204412460327,
"learning_rate": 1.9999044446408203e-05,
"loss": 0.3009,
"step": 106
},
{
"epoch": 0.3172720533728688,
"grad_norm": 1.1407383680343628,
"learning_rate": 1.9998506960888258e-05,
"loss": 0.2982,
"step": 107
},
{
"epoch": 0.3202372127501853,
"grad_norm": 0.8638990521430969,
"learning_rate": 1.999785004721968e-05,
"loss": 0.2957,
"step": 108
},
{
"epoch": 0.32320237212750186,
"grad_norm": 0.8289093971252441,
"learning_rate": 1.999707371324904e-05,
"loss": 0.3004,
"step": 109
},
{
"epoch": 0.32616753150481836,
"grad_norm": 1.0886658430099487,
"learning_rate": 1.9996177968249336e-05,
"loss": 0.2953,
"step": 110
},
{
"epoch": 0.3291326908821349,
"grad_norm": 0.7261621356010437,
"learning_rate": 1.999516282291988e-05,
"loss": 0.2945,
"step": 111
},
{
"epoch": 0.33209785025945143,
"grad_norm": 0.9758589267730713,
"learning_rate": 1.999402828938618e-05,
"loss": 0.2946,
"step": 112
},
{
"epoch": 0.335063009636768,
"grad_norm": 0.7505866885185242,
"learning_rate": 1.999277438119978e-05,
"loss": 0.2997,
"step": 113
},
{
"epoch": 0.3380281690140845,
"grad_norm": 0.801395833492279,
"learning_rate": 1.9991401113338103e-05,
"loss": 0.2885,
"step": 114
},
{
"epoch": 0.34099332839140106,
"grad_norm": 0.7377513647079468,
"learning_rate": 1.9989908502204295e-05,
"loss": 0.2863,
"step": 115
},
{
"epoch": 0.34395848776871757,
"grad_norm": 0.728659987449646,
"learning_rate": 1.9988296565626988e-05,
"loss": 0.2863,
"step": 116
},
{
"epoch": 0.3469236471460341,
"grad_norm": 0.7147101759910583,
"learning_rate": 1.9986565322860117e-05,
"loss": 0.2813,
"step": 117
},
{
"epoch": 0.34988880652335064,
"grad_norm": 0.7080392837524414,
"learning_rate": 1.9984714794582682e-05,
"loss": 0.281,
"step": 118
},
{
"epoch": 0.35285396590066714,
"grad_norm": 0.7131238579750061,
"learning_rate": 1.99827450028985e-05,
"loss": 0.2776,
"step": 119
},
{
"epoch": 0.3558191252779837,
"grad_norm": 0.6940627694129944,
"learning_rate": 1.9980655971335944e-05,
"loss": 0.2814,
"step": 120
},
{
"epoch": 0.3587842846553002,
"grad_norm": 0.655299186706543,
"learning_rate": 1.9978447724847655e-05,
"loss": 0.2752,
"step": 121
},
{
"epoch": 0.3617494440326168,
"grad_norm": 0.676629900932312,
"learning_rate": 1.9976120289810247e-05,
"loss": 0.2818,
"step": 122
},
{
"epoch": 0.3647146034099333,
"grad_norm": 0.6595851182937622,
"learning_rate": 1.9973673694024002e-05,
"loss": 0.2801,
"step": 123
},
{
"epoch": 0.36767976278724984,
"grad_norm": 0.6573948860168457,
"learning_rate": 1.9971107966712518e-05,
"loss": 0.2829,
"step": 124
},
{
"epoch": 0.37064492216456635,
"grad_norm": 0.6650752425193787,
"learning_rate": 1.9968423138522382e-05,
"loss": 0.2774,
"step": 125
},
{
"epoch": 0.37361008154188285,
"grad_norm": 0.6870672106742859,
"learning_rate": 1.996561924152278e-05,
"loss": 0.279,
"step": 126
},
{
"epoch": 0.3765752409191994,
"grad_norm": 0.6355287432670593,
"learning_rate": 1.9962696309205146e-05,
"loss": 0.2745,
"step": 127
},
{
"epoch": 0.3795404002965159,
"grad_norm": 0.6908348798751831,
"learning_rate": 1.995965437648273e-05,
"loss": 0.2735,
"step": 128
},
{
"epoch": 0.3825055596738325,
"grad_norm": 0.6098238229751587,
"learning_rate": 1.995649347969019e-05,
"loss": 0.2658,
"step": 129
},
{
"epoch": 0.385470719051149,
"grad_norm": 0.6651309728622437,
"learning_rate": 1.995321365658317e-05,
"loss": 0.2696,
"step": 130
},
{
"epoch": 0.38843587842846555,
"grad_norm": 0.6579580903053284,
"learning_rate": 1.994981494633784e-05,
"loss": 0.2639,
"step": 131
},
{
"epoch": 0.39140103780578206,
"grad_norm": 0.650133490562439,
"learning_rate": 1.9946297389550433e-05,
"loss": 0.2664,
"step": 132
},
{
"epoch": 0.39436619718309857,
"grad_norm": 0.6148021221160889,
"learning_rate": 1.9942661028236746e-05,
"loss": 0.2691,
"step": 133
},
{
"epoch": 0.3973313565604151,
"grad_norm": 0.6839851140975952,
"learning_rate": 1.9938905905831657e-05,
"loss": 0.2619,
"step": 134
},
{
"epoch": 0.40029651593773163,
"grad_norm": 0.6269260048866272,
"learning_rate": 1.993503206718859e-05,
"loss": 0.2679,
"step": 135
},
{
"epoch": 0.4032616753150482,
"grad_norm": 0.6698904633522034,
"learning_rate": 1.9931039558578997e-05,
"loss": 0.2737,
"step": 136
},
{
"epoch": 0.4062268346923647,
"grad_norm": 0.6404738426208496,
"learning_rate": 1.9926928427691788e-05,
"loss": 0.2702,
"step": 137
},
{
"epoch": 0.40919199406968126,
"grad_norm": 0.6118369698524475,
"learning_rate": 1.992269872363277e-05,
"loss": 0.2644,
"step": 138
},
{
"epoch": 0.41215715344699777,
"grad_norm": 0.6277154684066772,
"learning_rate": 1.991835049692405e-05,
"loss": 0.2657,
"step": 139
},
{
"epoch": 0.41512231282431433,
"grad_norm": 0.5819361805915833,
"learning_rate": 1.991388379950346e-05,
"loss": 0.252,
"step": 140
},
{
"epoch": 0.41808747220163084,
"grad_norm": 0.672166109085083,
"learning_rate": 1.9909298684723905e-05,
"loss": 0.2606,
"step": 141
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.5884442925453186,
"learning_rate": 1.9904595207352736e-05,
"loss": 0.2557,
"step": 142
},
{
"epoch": 0.4240177909562639,
"grad_norm": 0.5893815755844116,
"learning_rate": 1.9899773423571102e-05,
"loss": 0.2595,
"step": 143
},
{
"epoch": 0.4269829503335804,
"grad_norm": 0.6969826221466064,
"learning_rate": 1.9894833390973266e-05,
"loss": 0.2595,
"step": 144
},
{
"epoch": 0.429948109710897,
"grad_norm": 0.5909337997436523,
"learning_rate": 1.9889775168565942e-05,
"loss": 0.2522,
"step": 145
},
{
"epoch": 0.4329132690882135,
"grad_norm": 0.5902915000915527,
"learning_rate": 1.9884598816767563e-05,
"loss": 0.2532,
"step": 146
},
{
"epoch": 0.43587842846553004,
"grad_norm": 0.6261239647865295,
"learning_rate": 1.987930439740757e-05,
"loss": 0.2566,
"step": 147
},
{
"epoch": 0.43884358784284655,
"grad_norm": 0.579250156879425,
"learning_rate": 1.9873891973725673e-05,
"loss": 0.2587,
"step": 148
},
{
"epoch": 0.44180874722016306,
"grad_norm": 0.5678402185440063,
"learning_rate": 1.98683616103711e-05,
"loss": 0.2494,
"step": 149
},
{
"epoch": 0.4447739065974796,
"grad_norm": 0.6142150163650513,
"learning_rate": 1.986271337340182e-05,
"loss": 0.2507,
"step": 150
},
{
"epoch": 0.4477390659747961,
"grad_norm": 0.6132687926292419,
"learning_rate": 1.9856947330283752e-05,
"loss": 0.2547,
"step": 151
},
{
"epoch": 0.4507042253521127,
"grad_norm": 0.5993427038192749,
"learning_rate": 1.985106354988997e-05,
"loss": 0.2478,
"step": 152
},
{
"epoch": 0.4536693847294292,
"grad_norm": 0.6638728380203247,
"learning_rate": 1.984506210249986e-05,
"loss": 0.2547,
"step": 153
},
{
"epoch": 0.45663454410674575,
"grad_norm": 0.6074317097663879,
"learning_rate": 1.9838943059798305e-05,
"loss": 0.2521,
"step": 154
},
{
"epoch": 0.45959970348406226,
"grad_norm": 0.6486067175865173,
"learning_rate": 1.9832706494874812e-05,
"loss": 0.2562,
"step": 155
},
{
"epoch": 0.4625648628613788,
"grad_norm": 0.6348186135292053,
"learning_rate": 1.982635248222264e-05,
"loss": 0.2528,
"step": 156
},
{
"epoch": 0.46553002223869533,
"grad_norm": 0.5568612217903137,
"learning_rate": 1.9819881097737917e-05,
"loss": 0.2471,
"step": 157
},
{
"epoch": 0.46849518161601184,
"grad_norm": 0.5930222272872925,
"learning_rate": 1.9813292418718734e-05,
"loss": 0.2434,
"step": 158
},
{
"epoch": 0.4714603409933284,
"grad_norm": 0.6412246823310852,
"learning_rate": 1.9806586523864212e-05,
"loss": 0.2482,
"step": 159
},
{
"epoch": 0.4744255003706449,
"grad_norm": 0.5488153696060181,
"learning_rate": 1.9799763493273572e-05,
"loss": 0.2416,
"step": 160
},
{
"epoch": 0.47739065974796147,
"grad_norm": 0.6217798590660095,
"learning_rate": 1.9792823408445173e-05,
"loss": 0.2508,
"step": 161
},
{
"epoch": 0.48035581912527797,
"grad_norm": 0.5728364586830139,
"learning_rate": 1.978576635227554e-05,
"loss": 0.2488,
"step": 162
},
{
"epoch": 0.48332097850259453,
"grad_norm": 0.6427583694458008,
"learning_rate": 1.9778592409058376e-05,
"loss": 0.2483,
"step": 163
},
{
"epoch": 0.48628613787991104,
"grad_norm": 0.6554081439971924,
"learning_rate": 1.9771301664483548e-05,
"loss": 0.2426,
"step": 164
},
{
"epoch": 0.4892512972572276,
"grad_norm": 0.5885781049728394,
"learning_rate": 1.976389420563607e-05,
"loss": 0.2551,
"step": 165
},
{
"epoch": 0.4922164566345441,
"grad_norm": 0.5944969058036804,
"learning_rate": 1.975637012099507e-05,
"loss": 0.2466,
"step": 166
},
{
"epoch": 0.4951816160118606,
"grad_norm": 0.6138084530830383,
"learning_rate": 1.97487295004327e-05,
"loss": 0.2414,
"step": 167
},
{
"epoch": 0.4981467753891772,
"grad_norm": 0.5585880279541016,
"learning_rate": 1.9740972435213114e-05,
"loss": 0.2352,
"step": 168
},
{
"epoch": 0.5011119347664937,
"grad_norm": 0.6357447504997253,
"learning_rate": 1.9733099017991342e-05,
"loss": 0.2454,
"step": 169
},
{
"epoch": 0.5040770941438102,
"grad_norm": 0.5774321556091309,
"learning_rate": 1.972510934281218e-05,
"loss": 0.2424,
"step": 170
},
{
"epoch": 0.5070422535211268,
"grad_norm": 0.6422623991966248,
"learning_rate": 1.9717003505109097e-05,
"loss": 0.2361,
"step": 171
},
{
"epoch": 0.5100074128984433,
"grad_norm": 0.5912754535675049,
"learning_rate": 1.9708781601703066e-05,
"loss": 0.243,
"step": 172
},
{
"epoch": 0.5129725722757599,
"grad_norm": 0.5881178379058838,
"learning_rate": 1.9700443730801412e-05,
"loss": 0.2394,
"step": 173
},
{
"epoch": 0.5159377316530763,
"grad_norm": 0.6363380551338196,
"learning_rate": 1.9691989991996663e-05,
"loss": 0.2407,
"step": 174
},
{
"epoch": 0.5189028910303929,
"grad_norm": 0.55989670753479,
"learning_rate": 1.9683420486265328e-05,
"loss": 0.2438,
"step": 175
},
{
"epoch": 0.5218680504077094,
"grad_norm": 0.6781154274940491,
"learning_rate": 1.967473531596671e-05,
"loss": 0.2424,
"step": 176
},
{
"epoch": 0.5248332097850259,
"grad_norm": 0.5050660967826843,
"learning_rate": 1.966593458484168e-05,
"loss": 0.2341,
"step": 177
},
{
"epoch": 0.5277983691623425,
"grad_norm": 0.6881943345069885,
"learning_rate": 1.9657018398011435e-05,
"loss": 0.2433,
"step": 178
},
{
"epoch": 0.530763528539659,
"grad_norm": 0.553970992565155,
"learning_rate": 1.9647986861976246e-05,
"loss": 0.237,
"step": 179
},
{
"epoch": 0.5337286879169756,
"grad_norm": 0.6539415121078491,
"learning_rate": 1.9638840084614182e-05,
"loss": 0.238,
"step": 180
},
{
"epoch": 0.536693847294292,
"grad_norm": 0.5665425658226013,
"learning_rate": 1.9629578175179823e-05,
"loss": 0.2399,
"step": 181
},
{
"epoch": 0.5396590066716086,
"grad_norm": 0.6046749949455261,
"learning_rate": 1.9620201244302952e-05,
"loss": 0.2359,
"step": 182
},
{
"epoch": 0.5426241660489252,
"grad_norm": 0.6772344708442688,
"learning_rate": 1.9610709403987248e-05,
"loss": 0.2382,
"step": 183
},
{
"epoch": 0.5455893254262416,
"grad_norm": 0.473206490278244,
"learning_rate": 1.9601102767608924e-05,
"loss": 0.2321,
"step": 184
},
{
"epoch": 0.5485544848035582,
"grad_norm": 0.6189218163490295,
"learning_rate": 1.95913814499154e-05,
"loss": 0.2356,
"step": 185
},
{
"epoch": 0.5515196441808747,
"grad_norm": 0.5345617532730103,
"learning_rate": 1.95815455670239e-05,
"loss": 0.2394,
"step": 186
},
{
"epoch": 0.5544848035581913,
"grad_norm": 0.5871132016181946,
"learning_rate": 1.9571595236420103e-05,
"loss": 0.2359,
"step": 187
},
{
"epoch": 0.5574499629355077,
"grad_norm": 0.5409566760063171,
"learning_rate": 1.9561530576956703e-05,
"loss": 0.2396,
"step": 188
},
{
"epoch": 0.5604151223128243,
"grad_norm": 0.5904874205589294,
"learning_rate": 1.955135170885202e-05,
"loss": 0.2361,
"step": 189
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.5407031178474426,
"learning_rate": 1.9541058753688538e-05,
"loss": 0.2368,
"step": 190
},
{
"epoch": 0.5663454410674573,
"grad_norm": 0.5759615302085876,
"learning_rate": 1.9530651834411477e-05,
"loss": 0.2358,
"step": 191
},
{
"epoch": 0.5693106004447739,
"grad_norm": 0.6436863541603088,
"learning_rate": 1.95201310753273e-05,
"loss": 0.2299,
"step": 192
},
{
"epoch": 0.5722757598220904,
"grad_norm": 0.5067325830459595,
"learning_rate": 1.9509496602102253e-05,
"loss": 0.2275,
"step": 193
},
{
"epoch": 0.575240919199407,
"grad_norm": 0.5916472673416138,
"learning_rate": 1.9498748541760845e-05,
"loss": 0.229,
"step": 194
},
{
"epoch": 0.5782060785767235,
"grad_norm": 0.49817144870758057,
"learning_rate": 1.9487887022684336e-05,
"loss": 0.2277,
"step": 195
},
{
"epoch": 0.58117123795404,
"grad_norm": 0.6111854910850525,
"learning_rate": 1.947691217460921e-05,
"loss": 0.2395,
"step": 196
},
{
"epoch": 0.5841363973313566,
"grad_norm": 0.524508535861969,
"learning_rate": 1.946582412862562e-05,
"loss": 0.2372,
"step": 197
},
{
"epoch": 0.5871015567086731,
"grad_norm": 0.5496771335601807,
"learning_rate": 1.9454623017175814e-05,
"loss": 0.2338,
"step": 198
},
{
"epoch": 0.5900667160859896,
"grad_norm": 0.5417652726173401,
"learning_rate": 1.9443308974052574e-05,
"loss": 0.2328,
"step": 199
},
{
"epoch": 0.5930318754633062,
"grad_norm": 0.49683743715286255,
"learning_rate": 1.9431882134397596e-05,
"loss": 0.2289,
"step": 200
},
{
"epoch": 0.5959970348406227,
"grad_norm": 0.5067436099052429,
"learning_rate": 1.9420342634699893e-05,
"loss": 0.2303,
"step": 201
},
{
"epoch": 0.5989621942179392,
"grad_norm": 0.532744288444519,
"learning_rate": 1.9408690612794146e-05,
"loss": 0.2219,
"step": 202
},
{
"epoch": 0.6019273535952557,
"grad_norm": 0.5270218253135681,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.2324,
"step": 203
},
{
"epoch": 0.6048925129725723,
"grad_norm": 0.4947966933250427,
"learning_rate": 1.9385049560415794e-05,
"loss": 0.2282,
"step": 204
},
{
"epoch": 0.6078576723498889,
"grad_norm": 0.5205817222595215,
"learning_rate": 1.9373060812326053e-05,
"loss": 0.2279,
"step": 205
},
{
"epoch": 0.6108228317272053,
"grad_norm": 0.5304152369499207,
"learning_rate": 1.9360960106790645e-05,
"loss": 0.2288,
"step": 206
},
{
"epoch": 0.6137879911045219,
"grad_norm": 0.49558138847351074,
"learning_rate": 1.9348747588347637e-05,
"loss": 0.2284,
"step": 207
},
{
"epoch": 0.6167531504818384,
"grad_norm": 0.48547008633613586,
"learning_rate": 1.9336423402870655e-05,
"loss": 0.2297,
"step": 208
},
{
"epoch": 0.6197183098591549,
"grad_norm": 0.5189692974090576,
"learning_rate": 1.932398769756714e-05,
"loss": 0.2293,
"step": 209
},
{
"epoch": 0.6226834692364714,
"grad_norm": 0.5088484287261963,
"learning_rate": 1.9311440620976597e-05,
"loss": 0.2311,
"step": 210
},
{
"epoch": 0.625648628613788,
"grad_norm": 0.5324704051017761,
"learning_rate": 1.9298782322968817e-05,
"loss": 0.2377,
"step": 211
},
{
"epoch": 0.6286137879911046,
"grad_norm": 0.5019773840904236,
"learning_rate": 1.9286012954742078e-05,
"loss": 0.2256,
"step": 212
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.5624535083770752,
"learning_rate": 1.9273132668821363e-05,
"loss": 0.2291,
"step": 213
},
{
"epoch": 0.6345441067457376,
"grad_norm": 0.5227831602096558,
"learning_rate": 1.9260141619056507e-05,
"loss": 0.2268,
"step": 214
},
{
"epoch": 0.6375092661230541,
"grad_norm": 0.5904820561408997,
"learning_rate": 1.924703996062038e-05,
"loss": 0.227,
"step": 215
},
{
"epoch": 0.6404744255003706,
"grad_norm": 0.561266303062439,
"learning_rate": 1.9233827850007028e-05,
"loss": 0.2294,
"step": 216
},
{
"epoch": 0.6434395848776872,
"grad_norm": 0.5293812155723572,
"learning_rate": 1.9220505445029803e-05,
"loss": 0.2228,
"step": 217
},
{
"epoch": 0.6464047442550037,
"grad_norm": 0.5227711200714111,
"learning_rate": 1.9207072904819484e-05,
"loss": 0.2261,
"step": 218
},
{
"epoch": 0.6493699036323203,
"grad_norm": 0.5241237282752991,
"learning_rate": 1.9193530389822364e-05,
"loss": 0.2247,
"step": 219
},
{
"epoch": 0.6523350630096367,
"grad_norm": 0.5190705060958862,
"learning_rate": 1.9179878061798347e-05,
"loss": 0.2266,
"step": 220
},
{
"epoch": 0.6553002223869533,
"grad_norm": 0.4801787734031677,
"learning_rate": 1.9166116083819002e-05,
"loss": 0.2211,
"step": 221
},
{
"epoch": 0.6582653817642699,
"grad_norm": 0.5298479795455933,
"learning_rate": 1.915224462026563e-05,
"loss": 0.2145,
"step": 222
},
{
"epoch": 0.6612305411415864,
"grad_norm": 0.5878245830535889,
"learning_rate": 1.913826383682729e-05,
"loss": 0.2249,
"step": 223
},
{
"epoch": 0.6641957005189029,
"grad_norm": 0.4641963839530945,
"learning_rate": 1.912417390049882e-05,
"loss": 0.2195,
"step": 224
},
{
"epoch": 0.6671608598962194,
"grad_norm": 0.4989553391933441,
"learning_rate": 1.9109974979578852e-05,
"loss": 0.2306,
"step": 225
},
{
"epoch": 0.670126019273536,
"grad_norm": 0.5732155442237854,
"learning_rate": 1.909566724366779e-05,
"loss": 0.2246,
"step": 226
},
{
"epoch": 0.6730911786508524,
"grad_norm": 0.5080471038818359,
"learning_rate": 1.9081250863665794e-05,
"loss": 0.2253,
"step": 227
},
{
"epoch": 0.676056338028169,
"grad_norm": 0.5161991119384766,
"learning_rate": 1.9066726011770725e-05,
"loss": 0.2248,
"step": 228
},
{
"epoch": 0.6790214974054856,
"grad_norm": 0.5189105868339539,
"learning_rate": 1.905209286147611e-05,
"loss": 0.227,
"step": 229
},
{
"epoch": 0.6819866567828021,
"grad_norm": 0.5306798219680786,
"learning_rate": 1.903735158756905e-05,
"loss": 0.2253,
"step": 230
},
{
"epoch": 0.6849518161601186,
"grad_norm": 0.523923933506012,
"learning_rate": 1.9022502366128136e-05,
"loss": 0.2295,
"step": 231
},
{
"epoch": 0.6879169755374351,
"grad_norm": 0.5236137509346008,
"learning_rate": 1.9007545374521354e-05,
"loss": 0.222,
"step": 232
},
{
"epoch": 0.6908821349147517,
"grad_norm": 0.5138505697250366,
"learning_rate": 1.8992480791403957e-05,
"loss": 0.2143,
"step": 233
},
{
"epoch": 0.6938472942920682,
"grad_norm": 0.5385280251502991,
"learning_rate": 1.897730879671634e-05,
"loss": 0.2227,
"step": 234
},
{
"epoch": 0.6968124536693847,
"grad_norm": 0.5067414045333862,
"learning_rate": 1.8962029571681887e-05,
"loss": 0.2223,
"step": 235
},
{
"epoch": 0.6997776130467013,
"grad_norm": 0.4815332591533661,
"learning_rate": 1.8946643298804794e-05,
"loss": 0.2188,
"step": 236
},
{
"epoch": 0.7027427724240178,
"grad_norm": 0.4668591618537903,
"learning_rate": 1.8931150161867917e-05,
"loss": 0.2206,
"step": 237
},
{
"epoch": 0.7057079318013343,
"grad_norm": 0.5026832222938538,
"learning_rate": 1.891555034593055e-05,
"loss": 0.2228,
"step": 238
},
{
"epoch": 0.7086730911786508,
"grad_norm": 0.5014287233352661,
"learning_rate": 1.8899844037326227e-05,
"loss": 0.216,
"step": 239
},
{
"epoch": 0.7116382505559674,
"grad_norm": 0.4586634933948517,
"learning_rate": 1.8884031423660492e-05,
"loss": 0.2206,
"step": 240
},
{
"epoch": 0.7146034099332839,
"grad_norm": 0.500434398651123,
"learning_rate": 1.8868112693808664e-05,
"loss": 0.2163,
"step": 241
},
{
"epoch": 0.7175685693106004,
"grad_norm": 0.46279287338256836,
"learning_rate": 1.8852088037913577e-05,
"loss": 0.2161,
"step": 242
},
{
"epoch": 0.720533728687917,
"grad_norm": 0.5185891389846802,
"learning_rate": 1.8835957647383304e-05,
"loss": 0.2221,
"step": 243
},
{
"epoch": 0.7234988880652335,
"grad_norm": 0.48801976442337036,
"learning_rate": 1.8819721714888878e-05,
"loss": 0.225,
"step": 244
},
{
"epoch": 0.72646404744255,
"grad_norm": 0.4899084270000458,
"learning_rate": 1.8803380434362e-05,
"loss": 0.2169,
"step": 245
},
{
"epoch": 0.7294292068198666,
"grad_norm": 0.5264920592308044,
"learning_rate": 1.878693400099269e-05,
"loss": 0.2207,
"step": 246
},
{
"epoch": 0.7323943661971831,
"grad_norm": 0.48303139209747314,
"learning_rate": 1.877038261122699e-05,
"loss": 0.2244,
"step": 247
},
{
"epoch": 0.7353595255744997,
"grad_norm": 0.46109214425086975,
"learning_rate": 1.87537264627646e-05,
"loss": 0.216,
"step": 248
},
{
"epoch": 0.7383246849518161,
"grad_norm": 0.4971975088119507,
"learning_rate": 1.8736965754556527e-05,
"loss": 0.2235,
"step": 249
},
{
"epoch": 0.7412898443291327,
"grad_norm": 0.4700891077518463,
"learning_rate": 1.8720100686802693e-05,
"loss": 0.2175,
"step": 250
},
{
"epoch": 0.7442550037064493,
"grad_norm": 0.45833539962768555,
"learning_rate": 1.8703131460949555e-05,
"loss": 0.216,
"step": 251
},
{
"epoch": 0.7472201630837657,
"grad_norm": 0.47551876306533813,
"learning_rate": 1.86860582796877e-05,
"loss": 0.2222,
"step": 252
},
{
"epoch": 0.7501853224610823,
"grad_norm": 0.4569433629512787,
"learning_rate": 1.866888134694942e-05,
"loss": 0.2165,
"step": 253
},
{
"epoch": 0.7531504818383988,
"grad_norm": 0.43670737743377686,
"learning_rate": 1.865160086790627e-05,
"loss": 0.2128,
"step": 254
},
{
"epoch": 0.7561156412157154,
"grad_norm": 0.517746090888977,
"learning_rate": 1.8634217048966638e-05,
"loss": 0.2149,
"step": 255
},
{
"epoch": 0.7590808005930318,
"grad_norm": 0.46699458360671997,
"learning_rate": 1.861673009777325e-05,
"loss": 0.2187,
"step": 256
},
{
"epoch": 0.7620459599703484,
"grad_norm": 0.46238595247268677,
"learning_rate": 1.8599140223200716e-05,
"loss": 0.2137,
"step": 257
},
{
"epoch": 0.765011119347665,
"grad_norm": 0.47764065861701965,
"learning_rate": 1.858144763535302e-05,
"loss": 0.2221,
"step": 258
},
{
"epoch": 0.7679762787249814,
"grad_norm": 0.4717821180820465,
"learning_rate": 1.8563652545561014e-05,
"loss": 0.2188,
"step": 259
},
{
"epoch": 0.770941438102298,
"grad_norm": 0.4471701383590698,
"learning_rate": 1.8545755166379898e-05,
"loss": 0.2171,
"step": 260
},
{
"epoch": 0.7739065974796145,
"grad_norm": 0.49311378598213196,
"learning_rate": 1.852775571158668e-05,
"loss": 0.2157,
"step": 261
},
{
"epoch": 0.7768717568569311,
"grad_norm": 0.4882054924964905,
"learning_rate": 1.850965439617761e-05,
"loss": 0.2167,
"step": 262
},
{
"epoch": 0.7798369162342476,
"grad_norm": 0.45021718740463257,
"learning_rate": 1.8491451436365628e-05,
"loss": 0.2191,
"step": 263
},
{
"epoch": 0.7828020756115641,
"grad_norm": 0.5516721606254578,
"learning_rate": 1.8473147049577777e-05,
"loss": 0.2152,
"step": 264
},
{
"epoch": 0.7857672349888807,
"grad_norm": 0.4654419422149658,
"learning_rate": 1.8454741454452604e-05,
"loss": 0.2216,
"step": 265
},
{
"epoch": 0.7887323943661971,
"grad_norm": 0.4703727066516876,
"learning_rate": 1.843623487083755e-05,
"loss": 0.2164,
"step": 266
},
{
"epoch": 0.7916975537435137,
"grad_norm": 0.479714959859848,
"learning_rate": 1.8417627519786317e-05,
"loss": 0.2152,
"step": 267
},
{
"epoch": 0.7946627131208303,
"grad_norm": 0.4948756992816925,
"learning_rate": 1.839891962355624e-05,
"loss": 0.2219,
"step": 268
},
{
"epoch": 0.7976278724981468,
"grad_norm": 0.45587557554244995,
"learning_rate": 1.838011140560562e-05,
"loss": 0.2157,
"step": 269
},
{
"epoch": 0.8005930318754633,
"grad_norm": 0.46080151200294495,
"learning_rate": 1.836120309059107e-05,
"loss": 0.2122,
"step": 270
},
{
"epoch": 0.8035581912527798,
"grad_norm": 0.4493560492992401,
"learning_rate": 1.8342194904364815e-05,
"loss": 0.2163,
"step": 271
},
{
"epoch": 0.8065233506300964,
"grad_norm": 0.4825652539730072,
"learning_rate": 1.8323087073971996e-05,
"loss": 0.2116,
"step": 272
},
{
"epoch": 0.809488510007413,
"grad_norm": 0.4308413863182068,
"learning_rate": 1.8303879827647977e-05,
"loss": 0.2172,
"step": 273
},
{
"epoch": 0.8124536693847294,
"grad_norm": 0.508596658706665,
"learning_rate": 1.8284573394815596e-05,
"loss": 0.2186,
"step": 274
},
{
"epoch": 0.815418828762046,
"grad_norm": 0.4650067090988159,
"learning_rate": 1.826516800608244e-05,
"loss": 0.2069,
"step": 275
},
{
"epoch": 0.8183839881393625,
"grad_norm": 0.42739060521125793,
"learning_rate": 1.8245663893238075e-05,
"loss": 0.2102,
"step": 276
},
{
"epoch": 0.821349147516679,
"grad_norm": 0.46640655398368835,
"learning_rate": 1.8226061289251297e-05,
"loss": 0.2145,
"step": 277
},
{
"epoch": 0.8243143068939955,
"grad_norm": 0.4410681426525116,
"learning_rate": 1.8206360428267332e-05,
"loss": 0.2131,
"step": 278
},
{
"epoch": 0.8272794662713121,
"grad_norm": 0.44091495871543884,
"learning_rate": 1.8186561545605055e-05,
"loss": 0.2122,
"step": 279
},
{
"epoch": 0.8302446256486287,
"grad_norm": 0.4652099311351776,
"learning_rate": 1.816666487775416e-05,
"loss": 0.2179,
"step": 280
},
{
"epoch": 0.8332097850259451,
"grad_norm": 0.4468926787376404,
"learning_rate": 1.8146670662372353e-05,
"loss": 0.219,
"step": 281
},
{
"epoch": 0.8361749444032617,
"grad_norm": 0.4693123400211334,
"learning_rate": 1.8126579138282502e-05,
"loss": 0.2145,
"step": 282
},
{
"epoch": 0.8391401037805782,
"grad_norm": 0.43998247385025024,
"learning_rate": 1.8106390545469797e-05,
"loss": 0.212,
"step": 283
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.4576677978038788,
"learning_rate": 1.8086105125078858e-05,
"loss": 0.2141,
"step": 284
},
{
"epoch": 0.8450704225352113,
"grad_norm": 0.42104509472846985,
"learning_rate": 1.8065723119410885e-05,
"loss": 0.2126,
"step": 285
},
{
"epoch": 0.8480355819125278,
"grad_norm": 0.4544185996055603,
"learning_rate": 1.804524477192075e-05,
"loss": 0.2122,
"step": 286
},
{
"epoch": 0.8510007412898444,
"grad_norm": 0.4285774528980255,
"learning_rate": 1.8024670327214084e-05,
"loss": 0.211,
"step": 287
},
{
"epoch": 0.8539659006671608,
"grad_norm": 0.43197640776634216,
"learning_rate": 1.8004000031044363e-05,
"loss": 0.2103,
"step": 288
},
{
"epoch": 0.8569310600444774,
"grad_norm": 0.4368259906768799,
"learning_rate": 1.798323413030997e-05,
"loss": 0.2134,
"step": 289
},
{
"epoch": 0.859896219421794,
"grad_norm": 0.4898151159286499,
"learning_rate": 1.796237287305125e-05,
"loss": 0.2137,
"step": 290
},
{
"epoch": 0.8628613787991104,
"grad_norm": 0.42249011993408203,
"learning_rate": 1.7941416508447537e-05,
"loss": 0.2052,
"step": 291
},
{
"epoch": 0.865826538176427,
"grad_norm": 0.45801860094070435,
"learning_rate": 1.792036528681418e-05,
"loss": 0.2146,
"step": 292
},
{
"epoch": 0.8687916975537435,
"grad_norm": 0.44352859258651733,
"learning_rate": 1.789921945959958e-05,
"loss": 0.2053,
"step": 293
},
{
"epoch": 0.8717568569310601,
"grad_norm": 0.4158633351325989,
"learning_rate": 1.7877979279382135e-05,
"loss": 0.2137,
"step": 294
},
{
"epoch": 0.8747220163083765,
"grad_norm": 0.41102075576782227,
"learning_rate": 1.7856644999867264e-05,
"loss": 0.2109,
"step": 295
},
{
"epoch": 0.8776871756856931,
"grad_norm": 0.41784408688545227,
"learning_rate": 1.783521687588437e-05,
"loss": 0.2128,
"step": 296
},
{
"epoch": 0.8806523350630097,
"grad_norm": 0.4097442626953125,
"learning_rate": 1.781369516338378e-05,
"loss": 0.2116,
"step": 297
},
{
"epoch": 0.8836174944403261,
"grad_norm": 0.4172267019748688,
"learning_rate": 1.779208011943371e-05,
"loss": 0.2096,
"step": 298
},
{
"epoch": 0.8865826538176427,
"grad_norm": 0.4201764464378357,
"learning_rate": 1.777037200221717e-05,
"loss": 0.2144,
"step": 299
},
{
"epoch": 0.8895478131949592,
"grad_norm": 0.4283645451068878,
"learning_rate": 1.77485710710289e-05,
"loss": 0.2159,
"step": 300
},
{
"epoch": 0.8925129725722758,
"grad_norm": 0.4021233022212982,
"learning_rate": 1.7726677586272263e-05,
"loss": 0.2147,
"step": 301
},
{
"epoch": 0.8954781319495922,
"grad_norm": 0.4146812856197357,
"learning_rate": 1.7704691809456142e-05,
"loss": 0.2136,
"step": 302
},
{
"epoch": 0.8984432913269088,
"grad_norm": 0.41466352343559265,
"learning_rate": 1.7682614003191807e-05,
"loss": 0.2117,
"step": 303
},
{
"epoch": 0.9014084507042254,
"grad_norm": 0.45098355412483215,
"learning_rate": 1.766044443118978e-05,
"loss": 0.2141,
"step": 304
},
{
"epoch": 0.9043736100815419,
"grad_norm": 0.39802679419517517,
"learning_rate": 1.76381833582567e-05,
"loss": 0.2119,
"step": 305
},
{
"epoch": 0.9073387694588584,
"grad_norm": 0.4417196214199066,
"learning_rate": 1.761583105029213e-05,
"loss": 0.2148,
"step": 306
},
{
"epoch": 0.910303928836175,
"grad_norm": 0.4523768723011017,
"learning_rate": 1.7593387774285412e-05,
"loss": 0.2116,
"step": 307
},
{
"epoch": 0.9132690882134915,
"grad_norm": 0.42361876368522644,
"learning_rate": 1.7570853798312462e-05,
"loss": 0.2091,
"step": 308
},
{
"epoch": 0.916234247590808,
"grad_norm": 0.44734466075897217,
"learning_rate": 1.7548229391532572e-05,
"loss": 0.2098,
"step": 309
},
{
"epoch": 0.9191994069681245,
"grad_norm": 0.4427475333213806,
"learning_rate": 1.7525514824185187e-05,
"loss": 0.2159,
"step": 310
},
{
"epoch": 0.9221645663454411,
"grad_norm": 0.4229927659034729,
"learning_rate": 1.750271036758669e-05,
"loss": 0.2104,
"step": 311
},
{
"epoch": 0.9251297257227576,
"grad_norm": 0.4121291935443878,
"learning_rate": 1.747981629412715e-05,
"loss": 0.2076,
"step": 312
},
{
"epoch": 0.9280948851000741,
"grad_norm": 0.45084404945373535,
"learning_rate": 1.7456832877267083e-05,
"loss": 0.215,
"step": 313
},
{
"epoch": 0.9310600444773907,
"grad_norm": 0.423123836517334,
"learning_rate": 1.7433760391534166e-05,
"loss": 0.2082,
"step": 314
},
{
"epoch": 0.9340252038547072,
"grad_norm": 0.4547256827354431,
"learning_rate": 1.741059911251997e-05,
"loss": 0.2089,
"step": 315
},
{
"epoch": 0.9369903632320237,
"grad_norm": 0.4248969852924347,
"learning_rate": 1.7387349316876668e-05,
"loss": 0.2039,
"step": 316
},
{
"epoch": 0.9399555226093402,
"grad_norm": 0.46414193511009216,
"learning_rate": 1.7364011282313732e-05,
"loss": 0.2081,
"step": 317
},
{
"epoch": 0.9429206819866568,
"grad_norm": 0.4844679534435272,
"learning_rate": 1.7340585287594605e-05,
"loss": 0.2142,
"step": 318
},
{
"epoch": 0.9458858413639734,
"grad_norm": 0.4147413372993469,
"learning_rate": 1.731707161253338e-05,
"loss": 0.2128,
"step": 319
},
{
"epoch": 0.9488510007412898,
"grad_norm": 0.4431176781654358,
"learning_rate": 1.7293470537991463e-05,
"loss": 0.2104,
"step": 320
},
{
"epoch": 0.9518161601186064,
"grad_norm": 0.45323607325553894,
"learning_rate": 1.7269782345874204e-05,
"loss": 0.2083,
"step": 321
},
{
"epoch": 0.9547813194959229,
"grad_norm": 0.4210136830806732,
"learning_rate": 1.7246007319127547e-05,
"loss": 0.2069,
"step": 322
},
{
"epoch": 0.9577464788732394,
"grad_norm": 0.440244197845459,
"learning_rate": 1.7222145741734625e-05,
"loss": 0.2021,
"step": 323
},
{
"epoch": 0.9607116382505559,
"grad_norm": 0.41491949558258057,
"learning_rate": 1.7198197898712402e-05,
"loss": 0.2086,
"step": 324
},
{
"epoch": 0.9636767976278725,
"grad_norm": 0.4270980954170227,
"learning_rate": 1.717416407610824e-05,
"loss": 0.2063,
"step": 325
},
{
"epoch": 0.9666419570051891,
"grad_norm": 0.436722993850708,
"learning_rate": 1.7150044560996488e-05,
"loss": 0.2095,
"step": 326
},
{
"epoch": 0.9696071163825055,
"grad_norm": 0.42856717109680176,
"learning_rate": 1.7125839641475074e-05,
"loss": 0.2151,
"step": 327
},
{
"epoch": 0.9725722757598221,
"grad_norm": 0.4263397753238678,
"learning_rate": 1.7101549606662025e-05,
"loss": 0.21,
"step": 328
},
{
"epoch": 0.9755374351371386,
"grad_norm": 0.43046820163726807,
"learning_rate": 1.7077174746692054e-05,
"loss": 0.211,
"step": 329
},
{
"epoch": 0.9785025945144552,
"grad_norm": 0.4144728481769562,
"learning_rate": 1.7052715352713076e-05,
"loss": 0.2069,
"step": 330
},
{
"epoch": 0.9814677538917717,
"grad_norm": 0.4112738072872162,
"learning_rate": 1.7028171716882714e-05,
"loss": 0.209,
"step": 331
},
{
"epoch": 0.9844329132690882,
"grad_norm": 0.4484747052192688,
"learning_rate": 1.7003544132364847e-05,
"loss": 0.2118,
"step": 332
},
{
"epoch": 0.9873980726464048,
"grad_norm": 0.4388020634651184,
"learning_rate": 1.6978832893326074e-05,
"loss": 0.2069,
"step": 333
},
{
"epoch": 0.9903632320237212,
"grad_norm": 0.45029163360595703,
"learning_rate": 1.6954038294932215e-05,
"loss": 0.2153,
"step": 334
},
{
"epoch": 0.9933283914010378,
"grad_norm": 0.4059215486049652,
"learning_rate": 1.692916063334479e-05,
"loss": 0.1999,
"step": 335
},
{
"epoch": 0.9962935507783544,
"grad_norm": 0.430908739566803,
"learning_rate": 1.690420020571747e-05,
"loss": 0.2101,
"step": 336
},
{
"epoch": 0.9992587101556709,
"grad_norm": 0.4230971336364746,
"learning_rate": 1.6879157310192537e-05,
"loss": 0.2033,
"step": 337
},
{
"epoch": 1.0022238695329875,
"grad_norm": 0.37717196345329285,
"learning_rate": 1.685403224589731e-05,
"loss": 0.1831,
"step": 338
},
{
"epoch": 1.005189028910304,
"grad_norm": 0.4386158287525177,
"learning_rate": 1.6828825312940594e-05,
"loss": 0.1782,
"step": 339
},
{
"epoch": 1.0081541882876204,
"grad_norm": 0.3862016201019287,
"learning_rate": 1.6803536812409077e-05,
"loss": 0.1779,
"step": 340
},
{
"epoch": 1.011119347664937,
"grad_norm": 0.4159914553165436,
"learning_rate": 1.6778167046363735e-05,
"loss": 0.1699,
"step": 341
},
{
"epoch": 1.0140845070422535,
"grad_norm": 0.5072054266929626,
"learning_rate": 1.675271631783623e-05,
"loss": 0.1738,
"step": 342
},
{
"epoch": 1.01704966641957,
"grad_norm": 0.41934165358543396,
"learning_rate": 1.672718493082529e-05,
"loss": 0.1722,
"step": 343
},
{
"epoch": 1.0200148257968866,
"grad_norm": 0.4099801480770111,
"learning_rate": 1.6701573190293076e-05,
"loss": 0.1713,
"step": 344
},
{
"epoch": 1.0229799851742032,
"grad_norm": 0.44231241941452026,
"learning_rate": 1.667588140216154e-05,
"loss": 0.1675,
"step": 345
},
{
"epoch": 1.0259451445515197,
"grad_norm": 0.4088985323905945,
"learning_rate": 1.6650109873308763e-05,
"loss": 0.1736,
"step": 346
},
{
"epoch": 1.028910303928836,
"grad_norm": 0.4394180476665497,
"learning_rate": 1.6624258911565312e-05,
"loss": 0.1727,
"step": 347
},
{
"epoch": 1.0318754633061527,
"grad_norm": 0.4399167001247406,
"learning_rate": 1.6598328825710536e-05,
"loss": 0.1732,
"step": 348
},
{
"epoch": 1.0348406226834692,
"grad_norm": 0.46241313219070435,
"learning_rate": 1.6572319925468892e-05,
"loss": 0.1759,
"step": 349
},
{
"epoch": 1.0378057820607858,
"grad_norm": 0.40860143303871155,
"learning_rate": 1.654623252150624e-05,
"loss": 0.1711,
"step": 350
},
{
"epoch": 1.0407709414381023,
"grad_norm": 0.4109824597835541,
"learning_rate": 1.6520066925426146e-05,
"loss": 0.1799,
"step": 351
},
{
"epoch": 1.043736100815419,
"grad_norm": 0.40983447432518005,
"learning_rate": 1.6493823449766137e-05,
"loss": 0.1752,
"step": 352
},
{
"epoch": 1.0467012601927355,
"grad_norm": 0.4187794029712677,
"learning_rate": 1.6467502407993995e-05,
"loss": 0.1753,
"step": 353
},
{
"epoch": 1.0496664195700518,
"grad_norm": 0.40739187598228455,
"learning_rate": 1.644110411450398e-05,
"loss": 0.1771,
"step": 354
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.41065889596939087,
"learning_rate": 1.6414628884613106e-05,
"loss": 0.1711,
"step": 355
},
{
"epoch": 1.055596738324685,
"grad_norm": 0.43635791540145874,
"learning_rate": 1.6388077034557355e-05,
"loss": 0.175,
"step": 356
},
{
"epoch": 1.0585618977020015,
"grad_norm": 0.432016521692276,
"learning_rate": 1.6361448881487913e-05,
"loss": 0.1754,
"step": 357
},
{
"epoch": 1.061527057079318,
"grad_norm": 0.43051794171333313,
"learning_rate": 1.6334744743467366e-05,
"loss": 0.177,
"step": 358
},
{
"epoch": 1.0644922164566346,
"grad_norm": 0.39719873666763306,
"learning_rate": 1.6307964939465914e-05,
"loss": 0.1732,
"step": 359
},
{
"epoch": 1.0674573758339512,
"grad_norm": 0.40763285756111145,
"learning_rate": 1.628110978935756e-05,
"loss": 0.1744,
"step": 360
},
{
"epoch": 1.0704225352112675,
"grad_norm": 0.40124091506004333,
"learning_rate": 1.625417961391628e-05,
"loss": 0.1759,
"step": 361
},
{
"epoch": 1.073387694588584,
"grad_norm": 0.41654643416404724,
"learning_rate": 1.62271747348122e-05,
"loss": 0.1751,
"step": 362
},
{
"epoch": 1.0763528539659006,
"grad_norm": 0.39688020944595337,
"learning_rate": 1.6200095474607753e-05,
"loss": 0.1704,
"step": 363
},
{
"epoch": 1.0793180133432172,
"grad_norm": 0.3920522928237915,
"learning_rate": 1.6172942156753822e-05,
"loss": 0.168,
"step": 364
},
{
"epoch": 1.0822831727205338,
"grad_norm": 0.4264538586139679,
"learning_rate": 1.614571510558588e-05,
"loss": 0.174,
"step": 365
},
{
"epoch": 1.0852483320978503,
"grad_norm": 0.3995387554168701,
"learning_rate": 1.6118414646320115e-05,
"loss": 0.1718,
"step": 366
},
{
"epoch": 1.0882134914751669,
"grad_norm": 0.36994609236717224,
"learning_rate": 1.6091041105049542e-05,
"loss": 0.1726,
"step": 367
},
{
"epoch": 1.0911786508524832,
"grad_norm": 0.3809909224510193,
"learning_rate": 1.6063594808740112e-05,
"loss": 0.1741,
"step": 368
},
{
"epoch": 1.0941438102297998,
"grad_norm": 0.4052869975566864,
"learning_rate": 1.6036076085226813e-05,
"loss": 0.1728,
"step": 369
},
{
"epoch": 1.0971089696071163,
"grad_norm": 0.38783711194992065,
"learning_rate": 1.6008485263209742e-05,
"loss": 0.1701,
"step": 370
},
{
"epoch": 1.100074128984433,
"grad_norm": 0.4025594890117645,
"learning_rate": 1.598082267225018e-05,
"loss": 0.1743,
"step": 371
},
{
"epoch": 1.1030392883617495,
"grad_norm": 0.4071436822414398,
"learning_rate": 1.595308864276666e-05,
"loss": 0.1726,
"step": 372
},
{
"epoch": 1.106004447739066,
"grad_norm": 0.446532279253006,
"learning_rate": 1.592528350603103e-05,
"loss": 0.1708,
"step": 373
},
{
"epoch": 1.1089696071163826,
"grad_norm": 0.3993205726146698,
"learning_rate": 1.5897407594164468e-05,
"loss": 0.1805,
"step": 374
},
{
"epoch": 1.111934766493699,
"grad_norm": 0.42292505502700806,
"learning_rate": 1.586946124013354e-05,
"loss": 0.1823,
"step": 375
},
{
"epoch": 1.1148999258710155,
"grad_norm": 0.41676023602485657,
"learning_rate": 1.5841444777746232e-05,
"loss": 0.1756,
"step": 376
},
{
"epoch": 1.117865085248332,
"grad_norm": 0.3944017291069031,
"learning_rate": 1.5813358541647915e-05,
"loss": 0.1734,
"step": 377
},
{
"epoch": 1.1208302446256486,
"grad_norm": 0.38493022322654724,
"learning_rate": 1.578520286731741e-05,
"loss": 0.1772,
"step": 378
},
{
"epoch": 1.1237954040029652,
"grad_norm": 0.4245246350765228,
"learning_rate": 1.575697809106292e-05,
"loss": 0.1743,
"step": 379
},
{
"epoch": 1.1267605633802817,
"grad_norm": 0.3895925283432007,
"learning_rate": 1.5728684550018066e-05,
"loss": 0.1704,
"step": 380
},
{
"epoch": 1.1297257227575983,
"grad_norm": 0.3827330768108368,
"learning_rate": 1.570032258213783e-05,
"loss": 0.1746,
"step": 381
},
{
"epoch": 1.1326908821349146,
"grad_norm": 0.3874651789665222,
"learning_rate": 1.5671892526194515e-05,
"loss": 0.1751,
"step": 382
},
{
"epoch": 1.1356560415122312,
"grad_norm": 0.4029993712902069,
"learning_rate": 1.564339472177373e-05,
"loss": 0.1771,
"step": 383
},
{
"epoch": 1.1386212008895478,
"grad_norm": 0.3838706314563751,
"learning_rate": 1.561482950927029e-05,
"loss": 0.1732,
"step": 384
},
{
"epoch": 1.1415863602668643,
"grad_norm": 0.3896842896938324,
"learning_rate": 1.5586197229884185e-05,
"loss": 0.1737,
"step": 385
},
{
"epoch": 1.144551519644181,
"grad_norm": 0.4098159372806549,
"learning_rate": 1.5557498225616488e-05,
"loss": 0.1769,
"step": 386
},
{
"epoch": 1.1475166790214975,
"grad_norm": 0.4123744070529938,
"learning_rate": 1.5528732839265272e-05,
"loss": 0.177,
"step": 387
},
{
"epoch": 1.150481838398814,
"grad_norm": 0.3826339542865753,
"learning_rate": 1.549990141442153e-05,
"loss": 0.1708,
"step": 388
},
{
"epoch": 1.1534469977761304,
"grad_norm": 0.38323384523391724,
"learning_rate": 1.5471004295465034e-05,
"loss": 0.1759,
"step": 389
},
{
"epoch": 1.156412157153447,
"grad_norm": 0.3751480281352997,
"learning_rate": 1.5442041827560274e-05,
"loss": 0.1742,
"step": 390
},
{
"epoch": 1.1593773165307635,
"grad_norm": 0.42600059509277344,
"learning_rate": 1.5413014356652287e-05,
"loss": 0.1726,
"step": 391
},
{
"epoch": 1.16234247590808,
"grad_norm": 0.4077330529689789,
"learning_rate": 1.538392222946255e-05,
"loss": 0.1708,
"step": 392
},
{
"epoch": 1.1653076352853966,
"grad_norm": 0.39985400438308716,
"learning_rate": 1.5354765793484834e-05,
"loss": 0.1753,
"step": 393
},
{
"epoch": 1.1682727946627132,
"grad_norm": 0.4099324941635132,
"learning_rate": 1.5325545396981053e-05,
"loss": 0.172,
"step": 394
},
{
"epoch": 1.1712379540400297,
"grad_norm": 0.39008331298828125,
"learning_rate": 1.5296261388977107e-05,
"loss": 0.172,
"step": 395
},
{
"epoch": 1.174203113417346,
"grad_norm": 0.36513862013816833,
"learning_rate": 1.52669141192587e-05,
"loss": 0.1699,
"step": 396
},
{
"epoch": 1.1771682727946626,
"grad_norm": 0.43505406379699707,
"learning_rate": 1.5237503938367186e-05,
"loss": 0.1766,
"step": 397
},
{
"epoch": 1.1801334321719792,
"grad_norm": 0.4039159417152405,
"learning_rate": 1.5208031197595357e-05,
"loss": 0.1744,
"step": 398
},
{
"epoch": 1.1830985915492958,
"grad_norm": 0.3673771619796753,
"learning_rate": 1.5178496248983254e-05,
"loss": 0.1719,
"step": 399
},
{
"epoch": 1.1860637509266123,
"grad_norm": 0.3980352580547333,
"learning_rate": 1.5148899445313983e-05,
"loss": 0.1722,
"step": 400
},
{
"epoch": 1.1890289103039289,
"grad_norm": 0.39053529500961304,
"learning_rate": 1.5119241140109466e-05,
"loss": 0.1743,
"step": 401
},
{
"epoch": 1.1919940696812454,
"grad_norm": 0.3899192214012146,
"learning_rate": 1.5089521687626243e-05,
"loss": 0.1723,
"step": 402
},
{
"epoch": 1.1949592290585618,
"grad_norm": 0.4070497453212738,
"learning_rate": 1.505974144285124e-05,
"loss": 0.1692,
"step": 403
},
{
"epoch": 1.1979243884358783,
"grad_norm": 0.3976007103919983,
"learning_rate": 1.5029900761497507e-05,
"loss": 0.1781,
"step": 404
},
{
"epoch": 1.200889547813195,
"grad_norm": 0.41118377447128296,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1746,
"step": 405
},
{
"epoch": 1.2038547071905115,
"grad_norm": 0.41726142168045044,
"learning_rate": 1.4970039515511303e-05,
"loss": 0.179,
"step": 406
},
{
"epoch": 1.206819866567828,
"grad_norm": 0.3854449391365051,
"learning_rate": 1.4940019665897363e-05,
"loss": 0.1737,
"step": 407
},
{
"epoch": 1.2097850259451446,
"grad_norm": 0.45727819204330444,
"learning_rate": 1.4909940809733223e-05,
"loss": 0.1723,
"step": 408
},
{
"epoch": 1.2127501853224611,
"grad_norm": 0.3889809250831604,
"learning_rate": 1.4879803306298736e-05,
"loss": 0.1714,
"step": 409
},
{
"epoch": 1.2157153446997775,
"grad_norm": 0.4237361550331116,
"learning_rate": 1.4849607515574276e-05,
"loss": 0.1724,
"step": 410
},
{
"epoch": 1.218680504077094,
"grad_norm": 0.4138452112674713,
"learning_rate": 1.4819353798236427e-05,
"loss": 0.1725,
"step": 411
},
{
"epoch": 1.2216456634544106,
"grad_norm": 0.4682404100894928,
"learning_rate": 1.4789042515653687e-05,
"loss": 0.1727,
"step": 412
},
{
"epoch": 1.2246108228317272,
"grad_norm": 0.38663214445114136,
"learning_rate": 1.4758674029882152e-05,
"loss": 0.176,
"step": 413
},
{
"epoch": 1.2275759822090437,
"grad_norm": 0.391353577375412,
"learning_rate": 1.4728248703661183e-05,
"loss": 0.1775,
"step": 414
},
{
"epoch": 1.2305411415863603,
"grad_norm": 0.4257277846336365,
"learning_rate": 1.4697766900409076e-05,
"loss": 0.1773,
"step": 415
},
{
"epoch": 1.2335063009636769,
"grad_norm": 0.38307616114616394,
"learning_rate": 1.466722898421873e-05,
"loss": 0.1739,
"step": 416
},
{
"epoch": 1.2364714603409934,
"grad_norm": 0.3973027467727661,
"learning_rate": 1.4636635319853274e-05,
"loss": 0.1738,
"step": 417
},
{
"epoch": 1.2394366197183098,
"grad_norm": 0.4155060052871704,
"learning_rate": 1.4605986272741748e-05,
"loss": 0.1737,
"step": 418
},
{
"epoch": 1.2424017790956263,
"grad_norm": 0.40221065282821655,
"learning_rate": 1.4575282208974704e-05,
"loss": 0.1718,
"step": 419
},
{
"epoch": 1.2453669384729429,
"grad_norm": 0.41945594549179077,
"learning_rate": 1.4544523495299843e-05,
"loss": 0.1772,
"step": 420
},
{
"epoch": 1.2483320978502594,
"grad_norm": 0.4217647612094879,
"learning_rate": 1.4513710499117648e-05,
"loss": 0.1816,
"step": 421
},
{
"epoch": 1.251297257227576,
"grad_norm": 0.4151117205619812,
"learning_rate": 1.4482843588476976e-05,
"loss": 0.1718,
"step": 422
},
{
"epoch": 1.2542624166048926,
"grad_norm": 0.38060155510902405,
"learning_rate": 1.445192313207067e-05,
"loss": 0.1725,
"step": 423
},
{
"epoch": 1.257227575982209,
"grad_norm": 0.4043025076389313,
"learning_rate": 1.4420949499231172e-05,
"loss": 0.1735,
"step": 424
},
{
"epoch": 1.2601927353595257,
"grad_norm": 0.40334248542785645,
"learning_rate": 1.4389923059926064e-05,
"loss": 0.1748,
"step": 425
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.3861962854862213,
"learning_rate": 1.4358844184753713e-05,
"loss": 0.1751,
"step": 426
},
{
"epoch": 1.2661230541141586,
"grad_norm": 0.3862569034099579,
"learning_rate": 1.432771324493879e-05,
"loss": 0.1766,
"step": 427
},
{
"epoch": 1.2690882134914752,
"grad_norm": 0.3655155897140503,
"learning_rate": 1.4296530612327864e-05,
"loss": 0.1738,
"step": 428
},
{
"epoch": 1.2720533728687917,
"grad_norm": 0.45015332102775574,
"learning_rate": 1.4265296659384956e-05,
"loss": 0.1758,
"step": 429
},
{
"epoch": 1.2750185322461083,
"grad_norm": 0.40792006254196167,
"learning_rate": 1.4234011759187084e-05,
"loss": 0.1753,
"step": 430
},
{
"epoch": 1.2779836916234246,
"grad_norm": 0.3909926116466522,
"learning_rate": 1.4202676285419811e-05,
"loss": 0.1775,
"step": 431
},
{
"epoch": 1.2809488510007414,
"grad_norm": 0.38805529475212097,
"learning_rate": 1.4171290612372781e-05,
"loss": 0.1772,
"step": 432
},
{
"epoch": 1.2839140103780577,
"grad_norm": 0.3860710859298706,
"learning_rate": 1.4139855114935253e-05,
"loss": 0.17,
"step": 433
},
{
"epoch": 1.2868791697553743,
"grad_norm": 0.42617350816726685,
"learning_rate": 1.410837016859161e-05,
"loss": 0.1743,
"step": 434
},
{
"epoch": 1.2898443291326909,
"grad_norm": 0.3832889795303345,
"learning_rate": 1.4076836149416889e-05,
"loss": 0.1698,
"step": 435
},
{
"epoch": 1.2928094885100074,
"grad_norm": 0.4039870500564575,
"learning_rate": 1.4045253434072278e-05,
"loss": 0.1752,
"step": 436
},
{
"epoch": 1.295774647887324,
"grad_norm": 0.38493219017982483,
"learning_rate": 1.4013622399800628e-05,
"loss": 0.1737,
"step": 437
},
{
"epoch": 1.2987398072646406,
"grad_norm": 0.4500020146369934,
"learning_rate": 1.3981943424421932e-05,
"loss": 0.1704,
"step": 438
},
{
"epoch": 1.3017049666419571,
"grad_norm": 0.4027196764945984,
"learning_rate": 1.3950216886328818e-05,
"loss": 0.1699,
"step": 439
},
{
"epoch": 1.3046701260192735,
"grad_norm": 0.37555673718452454,
"learning_rate": 1.3918443164482048e-05,
"loss": 0.1733,
"step": 440
},
{
"epoch": 1.30763528539659,
"grad_norm": 0.3900480568408966,
"learning_rate": 1.3886622638405953e-05,
"loss": 0.168,
"step": 441
},
{
"epoch": 1.3106004447739066,
"grad_norm": 0.40044647455215454,
"learning_rate": 1.3854755688183941e-05,
"loss": 0.1681,
"step": 442
},
{
"epoch": 1.3135656041512231,
"grad_norm": 0.39409545063972473,
"learning_rate": 1.3822842694453923e-05,
"loss": 0.1731,
"step": 443
},
{
"epoch": 1.3165307635285397,
"grad_norm": 0.37648630142211914,
"learning_rate": 1.3790884038403796e-05,
"loss": 0.1711,
"step": 444
},
{
"epoch": 1.3194959229058563,
"grad_norm": 0.3983948826789856,
"learning_rate": 1.375888010176686e-05,
"loss": 0.1782,
"step": 445
},
{
"epoch": 1.3224610822831728,
"grad_norm": 0.42869091033935547,
"learning_rate": 1.3726831266817278e-05,
"loss": 0.1714,
"step": 446
},
{
"epoch": 1.3254262416604892,
"grad_norm": 0.43148529529571533,
"learning_rate": 1.3694737916365517e-05,
"loss": 0.1734,
"step": 447
},
{
"epoch": 1.3283914010378057,
"grad_norm": 0.37700700759887695,
"learning_rate": 1.3662600433753746e-05,
"loss": 0.1732,
"step": 448
},
{
"epoch": 1.3313565604151223,
"grad_norm": 0.3717349171638489,
"learning_rate": 1.3630419202851287e-05,
"loss": 0.1722,
"step": 449
},
{
"epoch": 1.3343217197924389,
"grad_norm": 0.401803195476532,
"learning_rate": 1.3598194608050011e-05,
"loss": 0.1727,
"step": 450
},
{
"epoch": 1.3372868791697554,
"grad_norm": 0.373855322599411,
"learning_rate": 1.3565927034259757e-05,
"loss": 0.1724,
"step": 451
},
{
"epoch": 1.340252038547072,
"grad_norm": 0.40752193331718445,
"learning_rate": 1.3533616866903736e-05,
"loss": 0.1741,
"step": 452
},
{
"epoch": 1.3432171979243885,
"grad_norm": 0.37844231724739075,
"learning_rate": 1.3501264491913909e-05,
"loss": 0.1759,
"step": 453
},
{
"epoch": 1.3461823573017049,
"grad_norm": 0.37028035521507263,
"learning_rate": 1.3468870295726399e-05,
"loss": 0.1743,
"step": 454
},
{
"epoch": 1.3491475166790214,
"grad_norm": 0.3744882047176361,
"learning_rate": 1.3436434665276865e-05,
"loss": 0.176,
"step": 455
},
{
"epoch": 1.352112676056338,
"grad_norm": 0.34571152925491333,
"learning_rate": 1.3403957987995884e-05,
"loss": 0.1725,
"step": 456
},
{
"epoch": 1.3550778354336546,
"grad_norm": 0.3648885488510132,
"learning_rate": 1.3371440651804313e-05,
"loss": 0.1752,
"step": 457
},
{
"epoch": 1.3580429948109711,
"grad_norm": 0.37405288219451904,
"learning_rate": 1.3338883045108674e-05,
"loss": 0.1716,
"step": 458
},
{
"epoch": 1.3610081541882877,
"grad_norm": 0.3600881099700928,
"learning_rate": 1.3306285556796494e-05,
"loss": 0.166,
"step": 459
},
{
"epoch": 1.3639733135656043,
"grad_norm": 0.38361856341362,
"learning_rate": 1.327364857623168e-05,
"loss": 0.1686,
"step": 460
},
{
"epoch": 1.3669384729429206,
"grad_norm": 0.4009436070919037,
"learning_rate": 1.3240972493249846e-05,
"loss": 0.1765,
"step": 461
},
{
"epoch": 1.3699036323202372,
"grad_norm": 0.3752938508987427,
"learning_rate": 1.3208257698153677e-05,
"loss": 0.1673,
"step": 462
},
{
"epoch": 1.3728687916975537,
"grad_norm": 0.3697980046272278,
"learning_rate": 1.3175504581708261e-05,
"loss": 0.1696,
"step": 463
},
{
"epoch": 1.3758339510748703,
"grad_norm": 0.4123381972312927,
"learning_rate": 1.3142713535136413e-05,
"loss": 0.1751,
"step": 464
},
{
"epoch": 1.3787991104521868,
"grad_norm": 0.3773389458656311,
"learning_rate": 1.3109884950114007e-05,
"loss": 0.175,
"step": 465
},
{
"epoch": 1.3817642698295034,
"grad_norm": 0.37522801756858826,
"learning_rate": 1.3077019218765306e-05,
"loss": 0.1721,
"step": 466
},
{
"epoch": 1.38472942920682,
"grad_norm": 0.3822220265865326,
"learning_rate": 1.3044116733658261e-05,
"loss": 0.1741,
"step": 467
},
{
"epoch": 1.3876945885841363,
"grad_norm": 0.33929958939552307,
"learning_rate": 1.3011177887799846e-05,
"loss": 0.1669,
"step": 468
},
{
"epoch": 1.3906597479614529,
"grad_norm": 0.3751008212566376,
"learning_rate": 1.2978203074631335e-05,
"loss": 0.173,
"step": 469
},
{
"epoch": 1.3936249073387694,
"grad_norm": 0.3586931526660919,
"learning_rate": 1.2945192688023625e-05,
"loss": 0.1707,
"step": 470
},
{
"epoch": 1.396590066716086,
"grad_norm": 0.3598410189151764,
"learning_rate": 1.2912147122272523e-05,
"loss": 0.1673,
"step": 471
},
{
"epoch": 1.3995552260934025,
"grad_norm": 0.37330952286720276,
"learning_rate": 1.287906677209403e-05,
"loss": 0.1705,
"step": 472
},
{
"epoch": 1.402520385470719,
"grad_norm": 0.3800138533115387,
"learning_rate": 1.2845952032619651e-05,
"loss": 0.1707,
"step": 473
},
{
"epoch": 1.4054855448480357,
"grad_norm": 0.34873542189598083,
"learning_rate": 1.2812803299391629e-05,
"loss": 0.1716,
"step": 474
},
{
"epoch": 1.408450704225352,
"grad_norm": 0.39961710572242737,
"learning_rate": 1.2779620968358276e-05,
"loss": 0.1713,
"step": 475
},
{
"epoch": 1.4114158636026686,
"grad_norm": 0.37982645630836487,
"learning_rate": 1.2746405435869198e-05,
"loss": 0.1713,
"step": 476
},
{
"epoch": 1.4143810229799851,
"grad_norm": 0.3620937764644623,
"learning_rate": 1.271315709867059e-05,
"loss": 0.1712,
"step": 477
},
{
"epoch": 1.4173461823573017,
"grad_norm": 0.36581623554229736,
"learning_rate": 1.2679876353900482e-05,
"loss": 0.1725,
"step": 478
},
{
"epoch": 1.4203113417346183,
"grad_norm": 0.36710691452026367,
"learning_rate": 1.2646563599083997e-05,
"loss": 0.1706,
"step": 479
},
{
"epoch": 1.4232765011119348,
"grad_norm": 0.3968733847141266,
"learning_rate": 1.2613219232128608e-05,
"loss": 0.1704,
"step": 480
},
{
"epoch": 1.4262416604892514,
"grad_norm": 0.38720619678497314,
"learning_rate": 1.2579843651319382e-05,
"loss": 0.1714,
"step": 481
},
{
"epoch": 1.4292068198665677,
"grad_norm": 0.36827707290649414,
"learning_rate": 1.2546437255314223e-05,
"loss": 0.1715,
"step": 482
},
{
"epoch": 1.4321719792438843,
"grad_norm": 0.37707608938217163,
"learning_rate": 1.2513000443139112e-05,
"loss": 0.1735,
"step": 483
},
{
"epoch": 1.4351371386212008,
"grad_norm": 0.40368345379829407,
"learning_rate": 1.2479533614183334e-05,
"loss": 0.1726,
"step": 484
},
{
"epoch": 1.4381022979985174,
"grad_norm": 0.3910945951938629,
"learning_rate": 1.2446037168194716e-05,
"loss": 0.1755,
"step": 485
},
{
"epoch": 1.441067457375834,
"grad_norm": 0.37151867151260376,
"learning_rate": 1.2412511505274845e-05,
"loss": 0.1771,
"step": 486
},
{
"epoch": 1.4440326167531505,
"grad_norm": 0.35527053475379944,
"learning_rate": 1.23789570258743e-05,
"loss": 0.1677,
"step": 487
},
{
"epoch": 1.446997776130467,
"grad_norm": 0.3575199544429779,
"learning_rate": 1.2345374130787855e-05,
"loss": 0.1715,
"step": 488
},
{
"epoch": 1.4499629355077834,
"grad_norm": 0.35391053557395935,
"learning_rate": 1.23117632211497e-05,
"loss": 0.1716,
"step": 489
},
{
"epoch": 1.4529280948851,
"grad_norm": 0.3692530691623688,
"learning_rate": 1.2278124698428643e-05,
"loss": 0.1689,
"step": 490
},
{
"epoch": 1.4558932542624166,
"grad_norm": 0.35716333985328674,
"learning_rate": 1.2244458964423328e-05,
"loss": 0.1682,
"step": 491
},
{
"epoch": 1.4588584136397331,
"grad_norm": 0.3617175221443176,
"learning_rate": 1.221076642125742e-05,
"loss": 0.1749,
"step": 492
},
{
"epoch": 1.4618235730170497,
"grad_norm": 0.3705756366252899,
"learning_rate": 1.2177047471374808e-05,
"loss": 0.1706,
"step": 493
},
{
"epoch": 1.4647887323943662,
"grad_norm": 0.35617804527282715,
"learning_rate": 1.214330251753481e-05,
"loss": 0.1704,
"step": 494
},
{
"epoch": 1.4677538917716828,
"grad_norm": 0.3682483732700348,
"learning_rate": 1.2109531962807333e-05,
"loss": 0.1688,
"step": 495
},
{
"epoch": 1.4707190511489991,
"grad_norm": 0.3654380738735199,
"learning_rate": 1.207573621056809e-05,
"loss": 0.1654,
"step": 496
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.39695996046066284,
"learning_rate": 1.2041915664493763e-05,
"loss": 0.1705,
"step": 497
},
{
"epoch": 1.4766493699036323,
"grad_norm": 0.3834567964076996,
"learning_rate": 1.2008070728557186e-05,
"loss": 0.1737,
"step": 498
},
{
"epoch": 1.4796145292809488,
"grad_norm": 0.3756810426712036,
"learning_rate": 1.1974201807022525e-05,
"loss": 0.1744,
"step": 499
},
{
"epoch": 1.4825796886582654,
"grad_norm": 0.43872207403182983,
"learning_rate": 1.1940309304440434e-05,
"loss": 0.1725,
"step": 500
},
{
"epoch": 1.485544848035582,
"grad_norm": 0.4155595004558563,
"learning_rate": 1.1906393625643244e-05,
"loss": 0.167,
"step": 501
},
{
"epoch": 1.4885100074128985,
"grad_norm": 0.37012434005737305,
"learning_rate": 1.1872455175740111e-05,
"loss": 0.1714,
"step": 502
},
{
"epoch": 1.4914751667902149,
"grad_norm": 0.4194466173648834,
"learning_rate": 1.1838494360112185e-05,
"loss": 0.1731,
"step": 503
},
{
"epoch": 1.4944403261675316,
"grad_norm": 0.38535988330841064,
"learning_rate": 1.1804511584407763e-05,
"loss": 0.1719,
"step": 504
},
{
"epoch": 1.497405485544848,
"grad_norm": 0.3790641725063324,
"learning_rate": 1.1770507254537454e-05,
"loss": 0.1715,
"step": 505
},
{
"epoch": 1.5003706449221645,
"grad_norm": 0.40725064277648926,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.1672,
"step": 506
},
{
"epoch": 1.503335804299481,
"grad_norm": 0.3657318949699402,
"learning_rate": 1.1702435557223988e-05,
"loss": 0.1701,
"step": 507
},
{
"epoch": 1.5063009636767977,
"grad_norm": 0.41225719451904297,
"learning_rate": 1.1668369002869912e-05,
"loss": 0.1703,
"step": 508
},
{
"epoch": 1.5092661230541142,
"grad_norm": 0.38106808066368103,
"learning_rate": 1.1634282520518382e-05,
"loss": 0.1705,
"step": 509
},
{
"epoch": 1.5122312824314306,
"grad_norm": 0.43504565954208374,
"learning_rate": 1.1600176517318742e-05,
"loss": 0.1712,
"step": 510
},
{
"epoch": 1.5151964418087474,
"grad_norm": 0.37367385625839233,
"learning_rate": 1.1566051400653486e-05,
"loss": 0.1708,
"step": 511
},
{
"epoch": 1.5181616011860637,
"grad_norm": 0.3934025168418884,
"learning_rate": 1.153190757813343e-05,
"loss": 0.1723,
"step": 512
},
{
"epoch": 1.5211267605633803,
"grad_norm": 0.35954198241233826,
"learning_rate": 1.1497745457592817e-05,
"loss": 0.1686,
"step": 513
},
{
"epoch": 1.5240919199406968,
"grad_norm": 0.3657681345939636,
"learning_rate": 1.1463565447084446e-05,
"loss": 0.1715,
"step": 514
},
{
"epoch": 1.5270570793180134,
"grad_norm": 0.3832554817199707,
"learning_rate": 1.142936795487482e-05,
"loss": 0.1725,
"step": 515
},
{
"epoch": 1.53002223869533,
"grad_norm": 0.36780476570129395,
"learning_rate": 1.1395153389439232e-05,
"loss": 0.1686,
"step": 516
},
{
"epoch": 1.5329873980726463,
"grad_norm": 0.37948641180992126,
"learning_rate": 1.1360922159456929e-05,
"loss": 0.169,
"step": 517
},
{
"epoch": 1.535952557449963,
"grad_norm": 0.37667399644851685,
"learning_rate": 1.1326674673806195e-05,
"loss": 0.1694,
"step": 518
},
{
"epoch": 1.5389177168272794,
"grad_norm": 0.3817925751209259,
"learning_rate": 1.129241134155949e-05,
"loss": 0.1684,
"step": 519
},
{
"epoch": 1.541882876204596,
"grad_norm": 0.3880022168159485,
"learning_rate": 1.1258132571978555e-05,
"loss": 0.1681,
"step": 520
},
{
"epoch": 1.5448480355819125,
"grad_norm": 0.39235079288482666,
"learning_rate": 1.1223838774509515e-05,
"loss": 0.1724,
"step": 521
},
{
"epoch": 1.547813194959229,
"grad_norm": 0.3959818184375763,
"learning_rate": 1.1189530358778005e-05,
"loss": 0.1653,
"step": 522
},
{
"epoch": 1.5507783543365457,
"grad_norm": 0.3723091185092926,
"learning_rate": 1.1155207734584264e-05,
"loss": 0.1715,
"step": 523
},
{
"epoch": 1.553743513713862,
"grad_norm": 0.3744927644729614,
"learning_rate": 1.1120871311898254e-05,
"loss": 0.1709,
"step": 524
},
{
"epoch": 1.5567086730911788,
"grad_norm": 0.37305641174316406,
"learning_rate": 1.1086521500854746e-05,
"loss": 0.1705,
"step": 525
},
{
"epoch": 1.5596738324684951,
"grad_norm": 0.3628908693790436,
"learning_rate": 1.1052158711748435e-05,
"loss": 0.1703,
"step": 526
},
{
"epoch": 1.5626389918458117,
"grad_norm": 0.3602434992790222,
"learning_rate": 1.1017783355029027e-05,
"loss": 0.1733,
"step": 527
},
{
"epoch": 1.5656041512231282,
"grad_norm": 0.3662010133266449,
"learning_rate": 1.0983395841296349e-05,
"loss": 0.1722,
"step": 528
},
{
"epoch": 1.5685693106004448,
"grad_norm": 0.38595232367515564,
"learning_rate": 1.0948996581295437e-05,
"loss": 0.1722,
"step": 529
},
{
"epoch": 1.5715344699777614,
"grad_norm": 0.3809836804866791,
"learning_rate": 1.0914585985911632e-05,
"loss": 0.1704,
"step": 530
},
{
"epoch": 1.5744996293550777,
"grad_norm": 0.3592289686203003,
"learning_rate": 1.0880164466165675e-05,
"loss": 0.1732,
"step": 531
},
{
"epoch": 1.5774647887323945,
"grad_norm": 0.3625737428665161,
"learning_rate": 1.084573243320878e-05,
"loss": 0.1743,
"step": 532
},
{
"epoch": 1.5804299481097108,
"grad_norm": 0.3582081198692322,
"learning_rate": 1.0811290298317755e-05,
"loss": 0.171,
"step": 533
},
{
"epoch": 1.5833951074870274,
"grad_norm": 0.3777657449245453,
"learning_rate": 1.0776838472890065e-05,
"loss": 0.1711,
"step": 534
},
{
"epoch": 1.586360266864344,
"grad_norm": 0.34954240918159485,
"learning_rate": 1.0742377368438915e-05,
"loss": 0.1685,
"step": 535
},
{
"epoch": 1.5893254262416605,
"grad_norm": 0.3632443845272064,
"learning_rate": 1.0707907396588362e-05,
"loss": 0.1689,
"step": 536
},
{
"epoch": 1.592290585618977,
"grad_norm": 0.35810449719429016,
"learning_rate": 1.0673428969068365e-05,
"loss": 0.1714,
"step": 537
},
{
"epoch": 1.5952557449962934,
"grad_norm": 0.36739829182624817,
"learning_rate": 1.063894249770989e-05,
"loss": 0.17,
"step": 538
},
{
"epoch": 1.5982209043736102,
"grad_norm": 0.35011234879493713,
"learning_rate": 1.0604448394439983e-05,
"loss": 0.1661,
"step": 539
},
{
"epoch": 1.6011860637509265,
"grad_norm": 0.37619051337242126,
"learning_rate": 1.0569947071276847e-05,
"loss": 0.1708,
"step": 540
},
{
"epoch": 1.604151223128243,
"grad_norm": 0.36766669154167175,
"learning_rate": 1.053543894032493e-05,
"loss": 0.1699,
"step": 541
},
{
"epoch": 1.6071163825055597,
"grad_norm": 0.3799968361854553,
"learning_rate": 1.0500924413769988e-05,
"loss": 0.175,
"step": 542
},
{
"epoch": 1.6100815418828762,
"grad_norm": 0.35972005128860474,
"learning_rate": 1.0466403903874176e-05,
"loss": 0.1709,
"step": 543
},
{
"epoch": 1.6130467012601928,
"grad_norm": 0.38818514347076416,
"learning_rate": 1.0431877822971118e-05,
"loss": 0.1729,
"step": 544
},
{
"epoch": 1.6160118606375091,
"grad_norm": 0.35318616032600403,
"learning_rate": 1.0397346583460972e-05,
"loss": 0.1708,
"step": 545
},
{
"epoch": 1.618977020014826,
"grad_norm": 0.34682103991508484,
"learning_rate": 1.0362810597805526e-05,
"loss": 0.172,
"step": 546
},
{
"epoch": 1.6219421793921422,
"grad_norm": 0.37605708837509155,
"learning_rate": 1.0328270278523256e-05,
"loss": 0.1733,
"step": 547
},
{
"epoch": 1.6249073387694588,
"grad_norm": 0.3474465608596802,
"learning_rate": 1.0293726038184393e-05,
"loss": 0.1659,
"step": 548
},
{
"epoch": 1.6278724981467754,
"grad_norm": 0.3567797839641571,
"learning_rate": 1.0259178289406011e-05,
"loss": 0.1692,
"step": 549
},
{
"epoch": 1.630837657524092,
"grad_norm": 0.35859590768814087,
"learning_rate": 1.022462744484709e-05,
"loss": 0.1725,
"step": 550
},
{
"epoch": 1.6338028169014085,
"grad_norm": 0.4004250168800354,
"learning_rate": 1.019007391720359e-05,
"loss": 0.171,
"step": 551
},
{
"epoch": 1.6367679762787248,
"grad_norm": 0.3502226769924164,
"learning_rate": 1.0155518119203511e-05,
"loss": 0.1669,
"step": 552
},
{
"epoch": 1.6397331356560416,
"grad_norm": 0.35019659996032715,
"learning_rate": 1.0120960463601977e-05,
"loss": 0.162,
"step": 553
},
{
"epoch": 1.642698295033358,
"grad_norm": 0.3413262963294983,
"learning_rate": 1.0086401363176306e-05,
"loss": 0.1671,
"step": 554
},
{
"epoch": 1.6456634544106745,
"grad_norm": 0.3686580955982208,
"learning_rate": 1.0051841230721065e-05,
"loss": 0.1723,
"step": 555
},
{
"epoch": 1.648628613787991,
"grad_norm": 0.4102790355682373,
"learning_rate": 1.0017280479043148e-05,
"loss": 0.1737,
"step": 556
},
{
"epoch": 1.6515937731653076,
"grad_norm": 0.3648839592933655,
"learning_rate": 9.982719520956856e-06,
"loss": 0.1701,
"step": 557
},
{
"epoch": 1.6545589325426242,
"grad_norm": 0.35376548767089844,
"learning_rate": 9.948158769278939e-06,
"loss": 0.1665,
"step": 558
},
{
"epoch": 1.6575240919199405,
"grad_norm": 0.34262967109680176,
"learning_rate": 9.913598636823694e-06,
"loss": 0.1637,
"step": 559
},
{
"epoch": 1.6604892512972573,
"grad_norm": 0.3623892068862915,
"learning_rate": 9.879039536398023e-06,
"loss": 0.1688,
"step": 560
},
{
"epoch": 1.6634544106745737,
"grad_norm": 0.36795225739479065,
"learning_rate": 9.844481880796492e-06,
"loss": 0.1716,
"step": 561
},
{
"epoch": 1.6664195700518905,
"grad_norm": 0.3584054112434387,
"learning_rate": 9.809926082796415e-06,
"loss": 0.1717,
"step": 562
},
{
"epoch": 1.6693847294292068,
"grad_norm": 0.3560091555118561,
"learning_rate": 9.775372555152912e-06,
"loss": 0.1685,
"step": 563
},
{
"epoch": 1.6723498888065234,
"grad_norm": 0.36741241812705994,
"learning_rate": 9.740821710593989e-06,
"loss": 0.1685,
"step": 564
},
{
"epoch": 1.67531504818384,
"grad_norm": 0.3397235870361328,
"learning_rate": 9.70627396181561e-06,
"loss": 0.1613,
"step": 565
},
{
"epoch": 1.6782802075611563,
"grad_norm": 0.3634246289730072,
"learning_rate": 9.671729721476747e-06,
"loss": 0.1681,
"step": 566
},
{
"epoch": 1.681245366938473,
"grad_norm": 0.3582555949687958,
"learning_rate": 9.637189402194477e-06,
"loss": 0.1687,
"step": 567
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.34005481004714966,
"learning_rate": 9.602653416539031e-06,
"loss": 0.1689,
"step": 568
},
{
"epoch": 1.6871756856931062,
"grad_norm": 0.3448920249938965,
"learning_rate": 9.568122177028884e-06,
"loss": 0.1688,
"step": 569
},
{
"epoch": 1.6901408450704225,
"grad_norm": 0.3394884169101715,
"learning_rate": 9.533596096125826e-06,
"loss": 0.163,
"step": 570
},
{
"epoch": 1.693106004447739,
"grad_norm": 0.35604503750801086,
"learning_rate": 9.499075586230014e-06,
"loss": 0.1709,
"step": 571
},
{
"epoch": 1.6960711638250556,
"grad_norm": 0.34704917669296265,
"learning_rate": 9.464561059675073e-06,
"loss": 0.1686,
"step": 572
},
{
"epoch": 1.699036323202372,
"grad_norm": 0.3488229811191559,
"learning_rate": 9.430052928723153e-06,
"loss": 0.1705,
"step": 573
},
{
"epoch": 1.7020014825796888,
"grad_norm": 0.349729984998703,
"learning_rate": 9.395551605560018e-06,
"loss": 0.1656,
"step": 574
},
{
"epoch": 1.704966641957005,
"grad_norm": 0.3426892161369324,
"learning_rate": 9.361057502290112e-06,
"loss": 0.1652,
"step": 575
},
{
"epoch": 1.7079318013343219,
"grad_norm": 0.3359294533729553,
"learning_rate": 9.326571030931636e-06,
"loss": 0.1668,
"step": 576
},
{
"epoch": 1.7108969607116382,
"grad_norm": 0.32818013429641724,
"learning_rate": 9.292092603411642e-06,
"loss": 0.1641,
"step": 577
},
{
"epoch": 1.7138621200889548,
"grad_norm": 0.3587988317012787,
"learning_rate": 9.257622631561085e-06,
"loss": 0.1692,
"step": 578
},
{
"epoch": 1.7168272794662713,
"grad_norm": 0.3606449365615845,
"learning_rate": 9.223161527109938e-06,
"loss": 0.1732,
"step": 579
},
{
"epoch": 1.7197924388435877,
"grad_norm": 0.33454060554504395,
"learning_rate": 9.188709701682246e-06,
"loss": 0.1707,
"step": 580
},
{
"epoch": 1.7227575982209045,
"grad_norm": 0.3533168435096741,
"learning_rate": 9.154267566791224e-06,
"loss": 0.1647,
"step": 581
},
{
"epoch": 1.7257227575982208,
"grad_norm": 0.3588050901889801,
"learning_rate": 9.119835533834332e-06,
"loss": 0.1709,
"step": 582
},
{
"epoch": 1.7286879169755376,
"grad_norm": 0.35869184136390686,
"learning_rate": 9.085414014088368e-06,
"loss": 0.1721,
"step": 583
},
{
"epoch": 1.731653076352854,
"grad_norm": 0.33058810234069824,
"learning_rate": 9.051003418704566e-06,
"loss": 0.1687,
"step": 584
},
{
"epoch": 1.7346182357301705,
"grad_norm": 0.35373157262802124,
"learning_rate": 9.016604158703654e-06,
"loss": 0.1685,
"step": 585
},
{
"epoch": 1.737583395107487,
"grad_norm": 0.3870552182197571,
"learning_rate": 8.982216644970978e-06,
"loss": 0.1698,
"step": 586
},
{
"epoch": 1.7405485544848036,
"grad_norm": 0.35172680020332336,
"learning_rate": 8.947841288251568e-06,
"loss": 0.167,
"step": 587
},
{
"epoch": 1.7435137138621202,
"grad_norm": 0.3640024960041046,
"learning_rate": 8.913478499145255e-06,
"loss": 0.1659,
"step": 588
},
{
"epoch": 1.7464788732394365,
"grad_norm": 0.36789610981941223,
"learning_rate": 8.879128688101749e-06,
"loss": 0.1708,
"step": 589
},
{
"epoch": 1.7494440326167533,
"grad_norm": 0.3513200283050537,
"learning_rate": 8.844792265415738e-06,
"loss": 0.1652,
"step": 590
},
{
"epoch": 1.7524091919940696,
"grad_norm": 0.3880747854709625,
"learning_rate": 8.810469641222001e-06,
"loss": 0.1699,
"step": 591
},
{
"epoch": 1.7553743513713862,
"grad_norm": 0.33801934123039246,
"learning_rate": 8.776161225490488e-06,
"loss": 0.1675,
"step": 592
},
{
"epoch": 1.7583395107487028,
"grad_norm": 0.3653337359428406,
"learning_rate": 8.741867428021447e-06,
"loss": 0.1648,
"step": 593
},
{
"epoch": 1.7613046701260193,
"grad_norm": 0.36136823892593384,
"learning_rate": 8.707588658440511e-06,
"loss": 0.1696,
"step": 594
},
{
"epoch": 1.7642698295033359,
"grad_norm": 0.33816996216773987,
"learning_rate": 8.673325326193806e-06,
"loss": 0.1665,
"step": 595
},
{
"epoch": 1.7672349888806522,
"grad_norm": 0.33847707509994507,
"learning_rate": 8.639077840543078e-06,
"loss": 0.1678,
"step": 596
},
{
"epoch": 1.770200148257969,
"grad_norm": 0.3402957022190094,
"learning_rate": 8.604846610560771e-06,
"loss": 0.1643,
"step": 597
},
{
"epoch": 1.7731653076352853,
"grad_norm": 0.37062397599220276,
"learning_rate": 8.570632045125185e-06,
"loss": 0.1679,
"step": 598
},
{
"epoch": 1.776130467012602,
"grad_norm": 0.34380587935447693,
"learning_rate": 8.536434552915555e-06,
"loss": 0.1652,
"step": 599
},
{
"epoch": 1.7790956263899185,
"grad_norm": 0.33917438983917236,
"learning_rate": 8.502254542407186e-06,
"loss": 0.1652,
"step": 600
},
{
"epoch": 1.782060785767235,
"grad_norm": 0.3372032940387726,
"learning_rate": 8.468092421866575e-06,
"loss": 0.1629,
"step": 601
},
{
"epoch": 1.7850259451445516,
"grad_norm": 0.34099259972572327,
"learning_rate": 8.433948599346516e-06,
"loss": 0.1678,
"step": 602
},
{
"epoch": 1.787991104521868,
"grad_norm": 0.370136559009552,
"learning_rate": 8.399823482681263e-06,
"loss": 0.1671,
"step": 603
},
{
"epoch": 1.7909562638991847,
"grad_norm": 0.3444167375564575,
"learning_rate": 8.36571747948162e-06,
"loss": 0.1652,
"step": 604
},
{
"epoch": 1.793921423276501,
"grad_norm": 0.3237707018852234,
"learning_rate": 8.331630997130091e-06,
"loss": 0.1647,
"step": 605
},
{
"epoch": 1.7968865826538176,
"grad_norm": 0.3346817195415497,
"learning_rate": 8.297564442776014e-06,
"loss": 0.168,
"step": 606
},
{
"epoch": 1.7998517420311342,
"grad_norm": 0.3474122881889343,
"learning_rate": 8.263518223330698e-06,
"loss": 0.1665,
"step": 607
},
{
"epoch": 1.8028169014084507,
"grad_norm": 0.37336310744285583,
"learning_rate": 8.229492745462551e-06,
"loss": 0.1628,
"step": 608
},
{
"epoch": 1.8057820607857673,
"grad_norm": 0.3516935706138611,
"learning_rate": 8.195488415592238e-06,
"loss": 0.1669,
"step": 609
},
{
"epoch": 1.8087472201630836,
"grad_norm": 0.3758098781108856,
"learning_rate": 8.161505639887818e-06,
"loss": 0.1709,
"step": 610
},
{
"epoch": 1.8117123795404004,
"grad_norm": 0.34178832173347473,
"learning_rate": 8.12754482425989e-06,
"loss": 0.1659,
"step": 611
},
{
"epoch": 1.8146775389177168,
"grad_norm": 0.3256490230560303,
"learning_rate": 8.09360637435676e-06,
"loss": 0.1649,
"step": 612
},
{
"epoch": 1.8176426982950333,
"grad_norm": 0.3661201596260071,
"learning_rate": 8.05969069555957e-06,
"loss": 0.167,
"step": 613
},
{
"epoch": 1.82060785767235,
"grad_norm": 0.34554868936538696,
"learning_rate": 8.025798192977482e-06,
"loss": 0.1685,
"step": 614
},
{
"epoch": 1.8235730170496665,
"grad_norm": 0.3409639298915863,
"learning_rate": 7.991929271442817e-06,
"loss": 0.1674,
"step": 615
},
{
"epoch": 1.826538176426983,
"grad_norm": 0.36160513758659363,
"learning_rate": 7.958084335506239e-06,
"loss": 0.167,
"step": 616
},
{
"epoch": 1.8295033358042994,
"grad_norm": 0.3661399483680725,
"learning_rate": 7.924263789431913e-06,
"loss": 0.1658,
"step": 617
},
{
"epoch": 1.8324684951816161,
"grad_norm": 0.3356715738773346,
"learning_rate": 7.89046803719267e-06,
"loss": 0.1709,
"step": 618
},
{
"epoch": 1.8354336545589325,
"grad_norm": 0.3546086549758911,
"learning_rate": 7.856697482465195e-06,
"loss": 0.1626,
"step": 619
},
{
"epoch": 1.838398813936249,
"grad_norm": 0.3332943320274353,
"learning_rate": 7.822952528625192e-06,
"loss": 0.1678,
"step": 620
},
{
"epoch": 1.8413639733135656,
"grad_norm": 0.34793728590011597,
"learning_rate": 7.789233578742583e-06,
"loss": 0.1659,
"step": 621
},
{
"epoch": 1.8443291326908822,
"grad_norm": 0.33829519152641296,
"learning_rate": 7.755541035576677e-06,
"loss": 0.1647,
"step": 622
},
{
"epoch": 1.8472942920681987,
"grad_norm": 0.33728060126304626,
"learning_rate": 7.721875301571359e-06,
"loss": 0.169,
"step": 623
},
{
"epoch": 1.850259451445515,
"grad_norm": 0.34719350934028625,
"learning_rate": 7.688236778850307e-06,
"loss": 0.1694,
"step": 624
},
{
"epoch": 1.8532246108228319,
"grad_norm": 0.328850656747818,
"learning_rate": 7.654625869212147e-06,
"loss": 0.1627,
"step": 625
},
{
"epoch": 1.8561897702001482,
"grad_norm": 0.34174007177352905,
"learning_rate": 7.621042974125701e-06,
"loss": 0.1693,
"step": 626
},
{
"epoch": 1.8591549295774648,
"grad_norm": 0.3343605101108551,
"learning_rate": 7.587488494725157e-06,
"loss": 0.1639,
"step": 627
},
{
"epoch": 1.8621200889547813,
"grad_norm": 0.3540162742137909,
"learning_rate": 7.553962831805291e-06,
"loss": 0.1667,
"step": 628
},
{
"epoch": 1.8650852483320979,
"grad_norm": 0.32035985589027405,
"learning_rate": 7.520466385816672e-06,
"loss": 0.1632,
"step": 629
},
{
"epoch": 1.8680504077094144,
"grad_norm": 0.3598351776599884,
"learning_rate": 7.48699955686089e-06,
"loss": 0.1674,
"step": 630
},
{
"epoch": 1.8710155670867308,
"grad_norm": 0.35652288794517517,
"learning_rate": 7.453562744685779e-06,
"loss": 0.1661,
"step": 631
},
{
"epoch": 1.8739807264640476,
"grad_norm": 0.32800406217575073,
"learning_rate": 7.420156348680621e-06,
"loss": 0.1617,
"step": 632
},
{
"epoch": 1.876945885841364,
"grad_norm": 0.3622135818004608,
"learning_rate": 7.3867807678713965e-06,
"loss": 0.1656,
"step": 633
},
{
"epoch": 1.8799110452186805,
"grad_norm": 0.34809359908103943,
"learning_rate": 7.353436400916006e-06,
"loss": 0.1677,
"step": 634
},
{
"epoch": 1.882876204595997,
"grad_norm": 0.31972047686576843,
"learning_rate": 7.32012364609952e-06,
"loss": 0.1614,
"step": 635
},
{
"epoch": 1.8858413639733136,
"grad_norm": 0.3630245327949524,
"learning_rate": 7.286842901329413e-06,
"loss": 0.1674,
"step": 636
},
{
"epoch": 1.8888065233506302,
"grad_norm": 0.337985098361969,
"learning_rate": 7.253594564130804e-06,
"loss": 0.1661,
"step": 637
},
{
"epoch": 1.8917716827279465,
"grad_norm": 0.35212552547454834,
"learning_rate": 7.22037903164173e-06,
"loss": 0.1667,
"step": 638
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.3282301127910614,
"learning_rate": 7.187196700608373e-06,
"loss": 0.1642,
"step": 639
},
{
"epoch": 1.8977020014825796,
"grad_norm": 0.3764040470123291,
"learning_rate": 7.154047967380353e-06,
"loss": 0.1664,
"step": 640
},
{
"epoch": 1.9006671608598962,
"grad_norm": 0.35521620512008667,
"learning_rate": 7.120933227905971e-06,
"loss": 0.1674,
"step": 641
},
{
"epoch": 1.9036323202372127,
"grad_norm": 0.34446388483047485,
"learning_rate": 7.0878528777274814e-06,
"loss": 0.167,
"step": 642
},
{
"epoch": 1.9065974796145293,
"grad_norm": 0.3320685029029846,
"learning_rate": 7.05480731197638e-06,
"loss": 0.1617,
"step": 643
},
{
"epoch": 1.9095626389918459,
"grad_norm": 0.3581525385379791,
"learning_rate": 7.021796925368667e-06,
"loss": 0.1672,
"step": 644
},
{
"epoch": 1.9125277983691622,
"grad_norm": 0.3463064730167389,
"learning_rate": 6.988822112200157e-06,
"loss": 0.1616,
"step": 645
},
{
"epoch": 1.915492957746479,
"grad_norm": 0.3198210597038269,
"learning_rate": 6.955883266341741e-06,
"loss": 0.1644,
"step": 646
},
{
"epoch": 1.9184581171237953,
"grad_norm": 0.3308843672275543,
"learning_rate": 6.9229807812346985e-06,
"loss": 0.166,
"step": 647
},
{
"epoch": 1.9214232765011119,
"grad_norm": 0.357516884803772,
"learning_rate": 6.890115049885995e-06,
"loss": 0.1653,
"step": 648
},
{
"epoch": 1.9243884358784284,
"grad_norm": 0.3382551074028015,
"learning_rate": 6.85728646486359e-06,
"loss": 0.1643,
"step": 649
},
{
"epoch": 1.927353595255745,
"grad_norm": 0.34474846720695496,
"learning_rate": 6.824495418291741e-06,
"loss": 0.1664,
"step": 650
},
{
"epoch": 1.9303187546330616,
"grad_norm": 0.3264622390270233,
"learning_rate": 6.791742301846325e-06,
"loss": 0.1651,
"step": 651
},
{
"epoch": 1.933283914010378,
"grad_norm": 0.34936872124671936,
"learning_rate": 6.759027506750159e-06,
"loss": 0.1664,
"step": 652
},
{
"epoch": 1.9362490733876947,
"grad_norm": 0.34215009212493896,
"learning_rate": 6.726351423768323e-06,
"loss": 0.1671,
"step": 653
},
{
"epoch": 1.939214232765011,
"grad_norm": 0.33212119340896606,
"learning_rate": 6.693714443203507e-06,
"loss": 0.1642,
"step": 654
},
{
"epoch": 1.9421793921423276,
"grad_norm": 0.3262803256511688,
"learning_rate": 6.661116954891329e-06,
"loss": 0.1618,
"step": 655
},
{
"epoch": 1.9451445515196442,
"grad_norm": 0.3281456530094147,
"learning_rate": 6.62855934819569e-06,
"loss": 0.1661,
"step": 656
},
{
"epoch": 1.9481097108969607,
"grad_norm": 0.3286809027194977,
"learning_rate": 6.59604201200412e-06,
"loss": 0.1639,
"step": 657
},
{
"epoch": 1.9510748702742773,
"grad_norm": 0.33260786533355713,
"learning_rate": 6.563565334723134e-06,
"loss": 0.1623,
"step": 658
},
{
"epoch": 1.9540400296515936,
"grad_norm": 0.34506213665008545,
"learning_rate": 6.5311297042736046e-06,
"loss": 0.168,
"step": 659
},
{
"epoch": 1.9570051890289104,
"grad_norm": 0.3355524241924286,
"learning_rate": 6.498735508086094e-06,
"loss": 0.1659,
"step": 660
},
{
"epoch": 1.9599703484062267,
"grad_norm": 0.3466867506504059,
"learning_rate": 6.466383133096268e-06,
"loss": 0.1658,
"step": 661
},
{
"epoch": 1.9629355077835435,
"grad_norm": 0.32791653275489807,
"learning_rate": 6.4340729657402424e-06,
"loss": 0.1656,
"step": 662
},
{
"epoch": 1.9659006671608599,
"grad_norm": 0.3234347403049469,
"learning_rate": 6.40180539194999e-06,
"loss": 0.1626,
"step": 663
},
{
"epoch": 1.9688658265381764,
"grad_norm": 0.33013296127319336,
"learning_rate": 6.3695807971487175e-06,
"loss": 0.1666,
"step": 664
},
{
"epoch": 1.971830985915493,
"grad_norm": 0.3211277425289154,
"learning_rate": 6.337399566246257e-06,
"loss": 0.1631,
"step": 665
},
{
"epoch": 1.9747961452928093,
"grad_norm": 0.32406309247016907,
"learning_rate": 6.305262083634488e-06,
"loss": 0.1648,
"step": 666
},
{
"epoch": 1.9777613046701261,
"grad_norm": 0.31896913051605225,
"learning_rate": 6.2731687331827214e-06,
"loss": 0.1639,
"step": 667
},
{
"epoch": 1.9807264640474425,
"grad_norm": 0.32637953758239746,
"learning_rate": 6.2411198982331435e-06,
"loss": 0.1616,
"step": 668
},
{
"epoch": 1.9836916234247592,
"grad_norm": 0.3404463231563568,
"learning_rate": 6.209115961596208e-06,
"loss": 0.1639,
"step": 669
},
{
"epoch": 1.9866567828020756,
"grad_norm": 0.33349186182022095,
"learning_rate": 6.177157305546077e-06,
"loss": 0.1656,
"step": 670
},
{
"epoch": 1.9896219421793921,
"grad_norm": 0.33086422085762024,
"learning_rate": 6.145244311816063e-06,
"loss": 0.1659,
"step": 671
},
{
"epoch": 1.9925871015567087,
"grad_norm": 0.325844407081604,
"learning_rate": 6.113377361594048e-06,
"loss": 0.1627,
"step": 672
},
{
"epoch": 1.995552260934025,
"grad_norm": 0.3200128376483917,
"learning_rate": 6.081556835517955e-06,
"loss": 0.1648,
"step": 673
},
{
"epoch": 1.9985174203113418,
"grad_norm": 0.3372708261013031,
"learning_rate": 6.049783113671184e-06,
"loss": 0.1647,
"step": 674
},
{
"epoch": 2.001482579688658,
"grad_norm": 0.3364209830760956,
"learning_rate": 6.018056575578075e-06,
"loss": 0.1502,
"step": 675
},
{
"epoch": 2.004447739065975,
"grad_norm": 0.48207002878189087,
"learning_rate": 5.986377600199371e-06,
"loss": 0.135,
"step": 676
},
{
"epoch": 2.0074128984432913,
"grad_norm": 0.4162459671497345,
"learning_rate": 5.9547465659277215e-06,
"loss": 0.1348,
"step": 677
},
{
"epoch": 2.010378057820608,
"grad_norm": 0.3513208329677582,
"learning_rate": 5.923163850583114e-06,
"loss": 0.1341,
"step": 678
},
{
"epoch": 2.0133432171979244,
"grad_norm": 0.3508232831954956,
"learning_rate": 5.891629831408392e-06,
"loss": 0.1293,
"step": 679
},
{
"epoch": 2.0163083765752408,
"grad_norm": 0.3477492034435272,
"learning_rate": 5.8601448850647515e-06,
"loss": 0.1301,
"step": 680
},
{
"epoch": 2.0192735359525575,
"grad_norm": 0.4429793357849121,
"learning_rate": 5.828709387627219e-06,
"loss": 0.1342,
"step": 681
},
{
"epoch": 2.022238695329874,
"grad_norm": 0.45886561274528503,
"learning_rate": 5.797323714580192e-06,
"loss": 0.1315,
"step": 682
},
{
"epoch": 2.0252038547071907,
"grad_norm": 0.4540359377861023,
"learning_rate": 5.7659882408129204e-06,
"loss": 0.1331,
"step": 683
},
{
"epoch": 2.028169014084507,
"grad_norm": 0.3975986838340759,
"learning_rate": 5.7347033406150494e-06,
"loss": 0.1336,
"step": 684
},
{
"epoch": 2.031134173461824,
"grad_norm": 0.40435880422592163,
"learning_rate": 5.703469387672138e-06,
"loss": 0.1314,
"step": 685
},
{
"epoch": 2.03409933283914,
"grad_norm": 0.36956506967544556,
"learning_rate": 5.672286755061212e-06,
"loss": 0.1292,
"step": 686
},
{
"epoch": 2.0370644922164565,
"grad_norm": 0.3341839909553528,
"learning_rate": 5.64115581524629e-06,
"loss": 0.1284,
"step": 687
},
{
"epoch": 2.0400296515937733,
"grad_norm": 0.3514721989631653,
"learning_rate": 5.610076940073939e-06,
"loss": 0.1293,
"step": 688
},
{
"epoch": 2.0429948109710896,
"grad_norm": 0.3784818947315216,
"learning_rate": 5.579050500768837e-06,
"loss": 0.1317,
"step": 689
},
{
"epoch": 2.0459599703484064,
"grad_norm": 0.37682050466537476,
"learning_rate": 5.548076867929331e-06,
"loss": 0.1299,
"step": 690
},
{
"epoch": 2.0489251297257227,
"grad_norm": 0.3408771753311157,
"learning_rate": 5.517156411523026e-06,
"loss": 0.1294,
"step": 691
},
{
"epoch": 2.0518902891030395,
"grad_norm": 0.3548658788204193,
"learning_rate": 5.486289500882355e-06,
"loss": 0.1319,
"step": 692
},
{
"epoch": 2.054855448480356,
"grad_norm": 0.3722321391105652,
"learning_rate": 5.455476504700161e-06,
"loss": 0.1317,
"step": 693
},
{
"epoch": 2.057820607857672,
"grad_norm": 0.3734947144985199,
"learning_rate": 5.424717791025302e-06,
"loss": 0.1333,
"step": 694
},
{
"epoch": 2.060785767234989,
"grad_norm": 0.3377910554409027,
"learning_rate": 5.3940137272582534e-06,
"loss": 0.1298,
"step": 695
},
{
"epoch": 2.0637509266123053,
"grad_norm": 0.371706485748291,
"learning_rate": 5.3633646801467255e-06,
"loss": 0.1315,
"step": 696
},
{
"epoch": 2.066716085989622,
"grad_norm": 0.37259435653686523,
"learning_rate": 5.332771015781275e-06,
"loss": 0.1312,
"step": 697
},
{
"epoch": 2.0696812453669384,
"grad_norm": 0.357020765542984,
"learning_rate": 5.302233099590928e-06,
"loss": 0.1298,
"step": 698
},
{
"epoch": 2.072646404744255,
"grad_norm": 0.3563533425331116,
"learning_rate": 5.271751296338823e-06,
"loss": 0.13,
"step": 699
},
{
"epoch": 2.0756115641215716,
"grad_norm": 0.35207492113113403,
"learning_rate": 5.241325970117851e-06,
"loss": 0.1331,
"step": 700
},
{
"epoch": 2.078576723498888,
"grad_norm": 0.3274974524974823,
"learning_rate": 5.210957484346314e-06,
"loss": 0.1277,
"step": 701
},
{
"epoch": 2.0815418828762047,
"grad_norm": 0.36036819219589233,
"learning_rate": 5.1806462017635775e-06,
"loss": 0.13,
"step": 702
},
{
"epoch": 2.084507042253521,
"grad_norm": 0.36002618074417114,
"learning_rate": 5.150392484425728e-06,
"loss": 0.1319,
"step": 703
},
{
"epoch": 2.087472201630838,
"grad_norm": 0.3353933095932007,
"learning_rate": 5.120196693701267e-06,
"loss": 0.1312,
"step": 704
},
{
"epoch": 2.090437361008154,
"grad_norm": 0.36249250173568726,
"learning_rate": 5.090059190266779e-06,
"loss": 0.1331,
"step": 705
},
{
"epoch": 2.093402520385471,
"grad_norm": 0.3565695583820343,
"learning_rate": 5.059980334102637e-06,
"loss": 0.1306,
"step": 706
},
{
"epoch": 2.0963676797627873,
"grad_norm": 0.35264307260513306,
"learning_rate": 5.0299604844886985e-06,
"loss": 0.1288,
"step": 707
},
{
"epoch": 2.0993328391401036,
"grad_norm": 0.34676074981689453,
"learning_rate": 5.000000000000003e-06,
"loss": 0.1289,
"step": 708
},
{
"epoch": 2.1022979985174204,
"grad_norm": 0.34547021985054016,
"learning_rate": 4.970099238502494e-06,
"loss": 0.13,
"step": 709
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.3479922413825989,
"learning_rate": 4.940258557148765e-06,
"loss": 0.1289,
"step": 710
},
{
"epoch": 2.1082283172720535,
"grad_norm": 0.34653687477111816,
"learning_rate": 4.910478312373757e-06,
"loss": 0.1274,
"step": 711
},
{
"epoch": 2.11119347664937,
"grad_norm": 0.35768458247184753,
"learning_rate": 4.8807588598905364e-06,
"loss": 0.1263,
"step": 712
},
{
"epoch": 2.1141586360266866,
"grad_norm": 0.3689504563808441,
"learning_rate": 4.8511005546860214e-06,
"loss": 0.131,
"step": 713
},
{
"epoch": 2.117123795404003,
"grad_norm": 0.36393120884895325,
"learning_rate": 4.821503751016746e-06,
"loss": 0.132,
"step": 714
},
{
"epoch": 2.1200889547813193,
"grad_norm": 0.331999272108078,
"learning_rate": 4.791968802404648e-06,
"loss": 0.1315,
"step": 715
},
{
"epoch": 2.123054114158636,
"grad_norm": 0.3519760072231293,
"learning_rate": 4.762496061632814e-06,
"loss": 0.1283,
"step": 716
},
{
"epoch": 2.1260192735359524,
"grad_norm": 0.3459528088569641,
"learning_rate": 4.733085880741301e-06,
"loss": 0.1281,
"step": 717
},
{
"epoch": 2.128984432913269,
"grad_norm": 0.3726547956466675,
"learning_rate": 4.703738611022899e-06,
"loss": 0.1304,
"step": 718
},
{
"epoch": 2.1319495922905856,
"grad_norm": 0.33399152755737305,
"learning_rate": 4.674454603018949e-06,
"loss": 0.1281,
"step": 719
},
{
"epoch": 2.1349147516679023,
"grad_norm": 0.3605293333530426,
"learning_rate": 4.645234206515171e-06,
"loss": 0.1276,
"step": 720
},
{
"epoch": 2.1378799110452187,
"grad_norm": 0.34493979811668396,
"learning_rate": 4.616077770537453e-06,
"loss": 0.1297,
"step": 721
},
{
"epoch": 2.140845070422535,
"grad_norm": 0.34895074367523193,
"learning_rate": 4.586985643347716e-06,
"loss": 0.1325,
"step": 722
},
{
"epoch": 2.143810229799852,
"grad_norm": 0.3834850788116455,
"learning_rate": 4.557958172439726e-06,
"loss": 0.1307,
"step": 723
},
{
"epoch": 2.146775389177168,
"grad_norm": 0.36474162340164185,
"learning_rate": 4.5289957045349655e-06,
"loss": 0.1319,
"step": 724
},
{
"epoch": 2.149740548554485,
"grad_norm": 0.36195108294487,
"learning_rate": 4.500098585578475e-06,
"loss": 0.1291,
"step": 725
},
{
"epoch": 2.1527057079318013,
"grad_norm": 0.3537023663520813,
"learning_rate": 4.471267160734731e-06,
"loss": 0.1301,
"step": 726
},
{
"epoch": 2.155670867309118,
"grad_norm": 0.33723926544189453,
"learning_rate": 4.4425017743835155e-06,
"loss": 0.1305,
"step": 727
},
{
"epoch": 2.1586360266864344,
"grad_norm": 0.3564864993095398,
"learning_rate": 4.413802770115816e-06,
"loss": 0.127,
"step": 728
},
{
"epoch": 2.1616011860637507,
"grad_norm": 0.35434702038764954,
"learning_rate": 4.385170490729712e-06,
"loss": 0.13,
"step": 729
},
{
"epoch": 2.1645663454410675,
"grad_norm": 0.35794782638549805,
"learning_rate": 4.356605278226274e-06,
"loss": 0.1312,
"step": 730
},
{
"epoch": 2.167531504818384,
"grad_norm": 0.3571893870830536,
"learning_rate": 4.328107473805487e-06,
"loss": 0.1275,
"step": 731
},
{
"epoch": 2.1704966641957006,
"grad_norm": 0.34572115540504456,
"learning_rate": 4.299677417862174e-06,
"loss": 0.1319,
"step": 732
},
{
"epoch": 2.173461823573017,
"grad_norm": 0.33964014053344727,
"learning_rate": 4.2713154499819345e-06,
"loss": 0.1304,
"step": 733
},
{
"epoch": 2.1764269829503338,
"grad_norm": 0.34920188784599304,
"learning_rate": 4.243021908937083e-06,
"loss": 0.1294,
"step": 734
},
{
"epoch": 2.17939214232765,
"grad_norm": 0.33510822057724,
"learning_rate": 4.214797132682597e-06,
"loss": 0.129,
"step": 735
},
{
"epoch": 2.1823573017049664,
"grad_norm": 0.33283138275146484,
"learning_rate": 4.186641458352088e-06,
"loss": 0.1271,
"step": 736
},
{
"epoch": 2.1853224610822832,
"grad_norm": 0.3561367988586426,
"learning_rate": 4.158555222253772e-06,
"loss": 0.1311,
"step": 737
},
{
"epoch": 2.1882876204595996,
"grad_norm": 0.36620989441871643,
"learning_rate": 4.130538759866457e-06,
"loss": 0.1317,
"step": 738
},
{
"epoch": 2.1912527798369164,
"grad_norm": 0.337467759847641,
"learning_rate": 4.102592405835536e-06,
"loss": 0.1297,
"step": 739
},
{
"epoch": 2.1942179392142327,
"grad_norm": 0.3357710540294647,
"learning_rate": 4.074716493968976e-06,
"loss": 0.1314,
"step": 740
},
{
"epoch": 2.1971830985915495,
"grad_norm": 0.35487931966781616,
"learning_rate": 4.046911357233343e-06,
"loss": 0.1299,
"step": 741
},
{
"epoch": 2.200148257968866,
"grad_norm": 0.34735655784606934,
"learning_rate": 4.019177327749822e-06,
"loss": 0.1324,
"step": 742
},
{
"epoch": 2.203113417346182,
"grad_norm": 0.3381595313549042,
"learning_rate": 3.991514736790259e-06,
"loss": 0.1325,
"step": 743
},
{
"epoch": 2.206078576723499,
"grad_norm": 0.33680617809295654,
"learning_rate": 3.9639239147731865e-06,
"loss": 0.1299,
"step": 744
},
{
"epoch": 2.2090437361008153,
"grad_norm": 0.33319249749183655,
"learning_rate": 3.936405191259891e-06,
"loss": 0.1291,
"step": 745
},
{
"epoch": 2.212008895478132,
"grad_norm": 0.32937324047088623,
"learning_rate": 3.908958894950465e-06,
"loss": 0.1306,
"step": 746
},
{
"epoch": 2.2149740548554484,
"grad_norm": 0.3424176573753357,
"learning_rate": 3.881585353679891e-06,
"loss": 0.1294,
"step": 747
},
{
"epoch": 2.217939214232765,
"grad_norm": 0.33413031697273254,
"learning_rate": 3.854284894414122e-06,
"loss": 0.1293,
"step": 748
},
{
"epoch": 2.2209043736100815,
"grad_norm": 0.33275535702705383,
"learning_rate": 3.827057843246181e-06,
"loss": 0.131,
"step": 749
},
{
"epoch": 2.223869532987398,
"grad_norm": 0.33580246567726135,
"learning_rate": 3.799904525392251e-06,
"loss": 0.1305,
"step": 750
},
{
"epoch": 2.2268346923647147,
"grad_norm": 0.34102970361709595,
"learning_rate": 3.7728252651878018e-06,
"loss": 0.1304,
"step": 751
},
{
"epoch": 2.229799851742031,
"grad_norm": 0.3335299491882324,
"learning_rate": 3.745820386083724e-06,
"loss": 0.1301,
"step": 752
},
{
"epoch": 2.2327650111193478,
"grad_norm": 0.33244848251342773,
"learning_rate": 3.718890210642442e-06,
"loss": 0.1289,
"step": 753
},
{
"epoch": 2.235730170496664,
"grad_norm": 0.3514939248561859,
"learning_rate": 3.6920350605340883e-06,
"loss": 0.1292,
"step": 754
},
{
"epoch": 2.238695329873981,
"grad_norm": 0.34593474864959717,
"learning_rate": 3.6652552565326382e-06,
"loss": 0.1308,
"step": 755
},
{
"epoch": 2.2416604892512972,
"grad_norm": 0.3418528139591217,
"learning_rate": 3.638551118512089e-06,
"loss": 0.13,
"step": 756
},
{
"epoch": 2.244625648628614,
"grad_norm": 0.35408085584640503,
"learning_rate": 3.611922965442648e-06,
"loss": 0.1278,
"step": 757
},
{
"epoch": 2.2475908080059304,
"grad_norm": 0.33797547221183777,
"learning_rate": 3.5853711153868962e-06,
"loss": 0.1296,
"step": 758
},
{
"epoch": 2.2505559673832467,
"grad_norm": 0.3515971302986145,
"learning_rate": 3.558895885496023e-06,
"loss": 0.1311,
"step": 759
},
{
"epoch": 2.2535211267605635,
"grad_norm": 0.34564974904060364,
"learning_rate": 3.53249759200601e-06,
"loss": 0.1271,
"step": 760
},
{
"epoch": 2.25648628613788,
"grad_norm": 0.3417358696460724,
"learning_rate": 3.506176550233863e-06,
"loss": 0.1323,
"step": 761
},
{
"epoch": 2.2594514455151966,
"grad_norm": 0.342887282371521,
"learning_rate": 3.479933074573858e-06,
"loss": 0.1305,
"step": 762
},
{
"epoch": 2.262416604892513,
"grad_norm": 0.3478313386440277,
"learning_rate": 3.453767478493761e-06,
"loss": 0.1303,
"step": 763
},
{
"epoch": 2.2653817642698293,
"grad_norm": 0.33934473991394043,
"learning_rate": 3.4276800745311135e-06,
"loss": 0.1288,
"step": 764
},
{
"epoch": 2.268346923647146,
"grad_norm": 0.31560465693473816,
"learning_rate": 3.401671174289469e-06,
"loss": 0.128,
"step": 765
},
{
"epoch": 2.2713120830244624,
"grad_norm": 0.34385186433792114,
"learning_rate": 3.37574108843469e-06,
"loss": 0.127,
"step": 766
},
{
"epoch": 2.274277242401779,
"grad_norm": 0.32480913400650024,
"learning_rate": 3.3498901266912397e-06,
"loss": 0.128,
"step": 767
},
{
"epoch": 2.2772424017790955,
"grad_norm": 0.3512122631072998,
"learning_rate": 3.3241185978384636e-06,
"loss": 0.1276,
"step": 768
},
{
"epoch": 2.2802075611564123,
"grad_norm": 0.34826409816741943,
"learning_rate": 3.2984268097069284e-06,
"loss": 0.13,
"step": 769
},
{
"epoch": 2.2831727205337287,
"grad_norm": 0.34755197167396545,
"learning_rate": 3.2728150691747117e-06,
"loss": 0.1314,
"step": 770
},
{
"epoch": 2.2861378799110454,
"grad_norm": 0.3306916654109955,
"learning_rate": 3.2472836821637744e-06,
"loss": 0.1314,
"step": 771
},
{
"epoch": 2.289103039288362,
"grad_norm": 0.3324066698551178,
"learning_rate": 3.22183295363627e-06,
"loss": 0.1304,
"step": 772
},
{
"epoch": 2.292068198665678,
"grad_norm": 0.34940484166145325,
"learning_rate": 3.196463187590929e-06,
"loss": 0.1351,
"step": 773
},
{
"epoch": 2.295033358042995,
"grad_norm": 0.34311389923095703,
"learning_rate": 3.1711746870594083e-06,
"loss": 0.1299,
"step": 774
},
{
"epoch": 2.2979985174203112,
"grad_norm": 0.3504948318004608,
"learning_rate": 3.145967754102691e-06,
"loss": 0.1314,
"step": 775
},
{
"epoch": 2.300963676797628,
"grad_norm": 0.33524277806282043,
"learning_rate": 3.1208426898074685e-06,
"loss": 0.1326,
"step": 776
},
{
"epoch": 2.3039288361749444,
"grad_norm": 0.3370322287082672,
"learning_rate": 3.0957997942825337e-06,
"loss": 0.1301,
"step": 777
},
{
"epoch": 2.3068939955522607,
"grad_norm": 0.33946508169174194,
"learning_rate": 3.070839366655215e-06,
"loss": 0.1297,
"step": 778
},
{
"epoch": 2.3098591549295775,
"grad_norm": 0.343334436416626,
"learning_rate": 3.045961705067787e-06,
"loss": 0.1279,
"step": 779
},
{
"epoch": 2.312824314306894,
"grad_norm": 0.34265804290771484,
"learning_rate": 3.021167106673928e-06,
"loss": 0.1314,
"step": 780
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.33246049284935,
"learning_rate": 2.996455867635155e-06,
"loss": 0.1306,
"step": 781
},
{
"epoch": 2.318754633061527,
"grad_norm": 0.34323611855506897,
"learning_rate": 2.9718282831172885e-06,
"loss": 0.1318,
"step": 782
},
{
"epoch": 2.3217197924388437,
"grad_norm": 0.34068265557289124,
"learning_rate": 2.94728464728693e-06,
"loss": 0.1292,
"step": 783
},
{
"epoch": 2.32468495181616,
"grad_norm": 0.3370424807071686,
"learning_rate": 2.922825253307947e-06,
"loss": 0.129,
"step": 784
},
{
"epoch": 2.327650111193477,
"grad_norm": 0.3519260883331299,
"learning_rate": 2.898450393337977e-06,
"loss": 0.1292,
"step": 785
},
{
"epoch": 2.330615270570793,
"grad_norm": 0.33347323536872864,
"learning_rate": 2.8741603585249312e-06,
"loss": 0.1261,
"step": 786
},
{
"epoch": 2.3335804299481095,
"grad_norm": 0.3215949833393097,
"learning_rate": 2.8499554390035144e-06,
"loss": 0.1294,
"step": 787
},
{
"epoch": 2.3365455893254263,
"grad_norm": 0.32965582609176636,
"learning_rate": 2.8258359238917665e-06,
"loss": 0.1281,
"step": 788
},
{
"epoch": 2.3395107487027427,
"grad_norm": 0.33794647455215454,
"learning_rate": 2.8018021012875994e-06,
"loss": 0.1285,
"step": 789
},
{
"epoch": 2.3424759080800595,
"grad_norm": 0.32937586307525635,
"learning_rate": 2.7778542582653746e-06,
"loss": 0.128,
"step": 790
},
{
"epoch": 2.345441067457376,
"grad_norm": 0.3328467607498169,
"learning_rate": 2.753992680872457e-06,
"loss": 0.129,
"step": 791
},
{
"epoch": 2.348406226834692,
"grad_norm": 0.32725760340690613,
"learning_rate": 2.7302176541257984e-06,
"loss": 0.1294,
"step": 792
},
{
"epoch": 2.351371386212009,
"grad_norm": 0.3363383114337921,
"learning_rate": 2.7065294620085425e-06,
"loss": 0.129,
"step": 793
},
{
"epoch": 2.3543365455893253,
"grad_norm": 0.33696410059928894,
"learning_rate": 2.6829283874666236e-06,
"loss": 0.1285,
"step": 794
},
{
"epoch": 2.357301704966642,
"grad_norm": 0.33244121074676514,
"learning_rate": 2.6594147124053983e-06,
"loss": 0.1277,
"step": 795
},
{
"epoch": 2.3602668643439584,
"grad_norm": 0.3353787958621979,
"learning_rate": 2.635988717686272e-06,
"loss": 0.1304,
"step": 796
},
{
"epoch": 2.363232023721275,
"grad_norm": 0.34110620617866516,
"learning_rate": 2.6126506831233343e-06,
"loss": 0.1306,
"step": 797
},
{
"epoch": 2.3661971830985915,
"grad_norm": 0.33648866415023804,
"learning_rate": 2.5894008874800323e-06,
"loss": 0.1286,
"step": 798
},
{
"epoch": 2.3691623424759083,
"grad_norm": 0.34967437386512756,
"learning_rate": 2.5662396084658383e-06,
"loss": 0.133,
"step": 799
},
{
"epoch": 2.3721275018532246,
"grad_norm": 0.33198100328445435,
"learning_rate": 2.543167122732918e-06,
"loss": 0.1277,
"step": 800
},
{
"epoch": 2.375092661230541,
"grad_norm": 0.34363454580307007,
"learning_rate": 2.5201837058728506e-06,
"loss": 0.1277,
"step": 801
},
{
"epoch": 2.3780578206078578,
"grad_norm": 0.3609948456287384,
"learning_rate": 2.4972896324133143e-06,
"loss": 0.1295,
"step": 802
},
{
"epoch": 2.381022979985174,
"grad_norm": 0.34460243582725525,
"learning_rate": 2.474485175814816e-06,
"loss": 0.1319,
"step": 803
},
{
"epoch": 2.383988139362491,
"grad_norm": 0.3403383493423462,
"learning_rate": 2.451770608467432e-06,
"loss": 0.1284,
"step": 804
},
{
"epoch": 2.386953298739807,
"grad_norm": 0.333807110786438,
"learning_rate": 2.429146201687538e-06,
"loss": 0.1257,
"step": 805
},
{
"epoch": 2.3899184581171236,
"grad_norm": 0.33072689175605774,
"learning_rate": 2.4066122257145898e-06,
"loss": 0.1309,
"step": 806
},
{
"epoch": 2.3928836174944403,
"grad_norm": 0.32463690638542175,
"learning_rate": 2.3841689497078746e-06,
"loss": 0.1289,
"step": 807
},
{
"epoch": 2.3958487768717567,
"grad_norm": 0.34213897585868835,
"learning_rate": 2.361816641743303e-06,
"loss": 0.1286,
"step": 808
},
{
"epoch": 2.3988139362490735,
"grad_norm": 0.3414537310600281,
"learning_rate": 2.339555568810221e-06,
"loss": 0.126,
"step": 809
},
{
"epoch": 2.40177909562639,
"grad_norm": 0.32957902550697327,
"learning_rate": 2.317385996808195e-06,
"loss": 0.1302,
"step": 810
},
{
"epoch": 2.4047442550037066,
"grad_norm": 0.3390369713306427,
"learning_rate": 2.295308190543859e-06,
"loss": 0.132,
"step": 811
},
{
"epoch": 2.407709414381023,
"grad_norm": 0.3288882076740265,
"learning_rate": 2.2733224137277366e-06,
"loss": 0.1271,
"step": 812
},
{
"epoch": 2.4106745737583397,
"grad_norm": 0.3289991021156311,
"learning_rate": 2.251428928971102e-06,
"loss": 0.1304,
"step": 813
},
{
"epoch": 2.413639733135656,
"grad_norm": 0.33164265751838684,
"learning_rate": 2.229627997782834e-06,
"loss": 0.1296,
"step": 814
},
{
"epoch": 2.4166048925129724,
"grad_norm": 0.33751052618026733,
"learning_rate": 2.2079198805662917e-06,
"loss": 0.1282,
"step": 815
},
{
"epoch": 2.419570051890289,
"grad_norm": 0.3279024362564087,
"learning_rate": 2.186304836616221e-06,
"loss": 0.1295,
"step": 816
},
{
"epoch": 2.4225352112676055,
"grad_norm": 0.3452274203300476,
"learning_rate": 2.1647831241156304e-06,
"loss": 0.1299,
"step": 817
},
{
"epoch": 2.4255003706449223,
"grad_norm": 0.3305584788322449,
"learning_rate": 2.1433550001327376e-06,
"loss": 0.1285,
"step": 818
},
{
"epoch": 2.4284655300222386,
"grad_norm": 0.33620432019233704,
"learning_rate": 2.122020720617869e-06,
"loss": 0.1304,
"step": 819
},
{
"epoch": 2.431430689399555,
"grad_norm": 0.3142911493778229,
"learning_rate": 2.1007805404004247e-06,
"loss": 0.125,
"step": 820
},
{
"epoch": 2.4343958487768718,
"grad_norm": 0.3442496657371521,
"learning_rate": 2.0796347131858187e-06,
"loss": 0.1286,
"step": 821
},
{
"epoch": 2.437361008154188,
"grad_norm": 0.34949377179145813,
"learning_rate": 2.058583491552465e-06,
"loss": 0.1284,
"step": 822
},
{
"epoch": 2.440326167531505,
"grad_norm": 0.36079153418540955,
"learning_rate": 2.037627126948751e-06,
"loss": 0.1303,
"step": 823
},
{
"epoch": 2.4432913269088212,
"grad_norm": 0.32977890968322754,
"learning_rate": 2.0167658696900317e-06,
"loss": 0.1279,
"step": 824
},
{
"epoch": 2.446256486286138,
"grad_norm": 0.3395943343639374,
"learning_rate": 1.9959999689556407e-06,
"loss": 0.1295,
"step": 825
},
{
"epoch": 2.4492216456634543,
"grad_norm": 0.3250430226325989,
"learning_rate": 1.9753296727859195e-06,
"loss": 0.1287,
"step": 826
},
{
"epoch": 2.452186805040771,
"grad_norm": 0.3329125642776489,
"learning_rate": 1.9547552280792528e-06,
"loss": 0.1278,
"step": 827
},
{
"epoch": 2.4551519644180875,
"grad_norm": 0.31633639335632324,
"learning_rate": 1.9342768805891176e-06,
"loss": 0.1291,
"step": 828
},
{
"epoch": 2.458117123795404,
"grad_norm": 0.3292962610721588,
"learning_rate": 1.9138948749211473e-06,
"loss": 0.1297,
"step": 829
},
{
"epoch": 2.4610822831727206,
"grad_norm": 0.34126242995262146,
"learning_rate": 1.8936094545302098e-06,
"loss": 0.1293,
"step": 830
},
{
"epoch": 2.464047442550037,
"grad_norm": 0.3327971398830414,
"learning_rate": 1.8734208617174986e-06,
"loss": 0.1284,
"step": 831
},
{
"epoch": 2.4670126019273537,
"grad_norm": 0.340774804353714,
"learning_rate": 1.8533293376276473e-06,
"loss": 0.129,
"step": 832
},
{
"epoch": 2.46997776130467,
"grad_norm": 0.3464578688144684,
"learning_rate": 1.8333351222458407e-06,
"loss": 0.1277,
"step": 833
},
{
"epoch": 2.472942920681987,
"grad_norm": 0.340108722448349,
"learning_rate": 1.813438454394948e-06,
"loss": 0.1304,
"step": 834
},
{
"epoch": 2.475908080059303,
"grad_norm": 0.36126676201820374,
"learning_rate": 1.7936395717326705e-06,
"loss": 0.1275,
"step": 835
},
{
"epoch": 2.4788732394366195,
"grad_norm": 0.3317781388759613,
"learning_rate": 1.773938710748706e-06,
"loss": 0.1301,
"step": 836
},
{
"epoch": 2.4818383988139363,
"grad_norm": 0.34120678901672363,
"learning_rate": 1.7543361067619269e-06,
"loss": 0.1287,
"step": 837
},
{
"epoch": 2.4848035581912526,
"grad_norm": 0.3353835642337799,
"learning_rate": 1.734831993917564e-06,
"loss": 0.1296,
"step": 838
},
{
"epoch": 2.4877687175685694,
"grad_norm": 0.34985971450805664,
"learning_rate": 1.715426605184407e-06,
"loss": 0.129,
"step": 839
},
{
"epoch": 2.4907338769458858,
"grad_norm": 0.3302218019962311,
"learning_rate": 1.6961201723520248e-06,
"loss": 0.131,
"step": 840
},
{
"epoch": 2.4936990363232026,
"grad_norm": 0.34012821316719055,
"learning_rate": 1.676912926028007e-06,
"loss": 0.1301,
"step": 841
},
{
"epoch": 2.496664195700519,
"grad_norm": 0.3237687945365906,
"learning_rate": 1.6578050956351887e-06,
"loss": 0.1257,
"step": 842
},
{
"epoch": 2.4996293550778352,
"grad_norm": 0.3470035791397095,
"learning_rate": 1.6387969094089318e-06,
"loss": 0.1287,
"step": 843
},
{
"epoch": 2.502594514455152,
"grad_norm": 0.35050496459007263,
"learning_rate": 1.619888594394382e-06,
"loss": 0.1314,
"step": 844
},
{
"epoch": 2.5055596738324684,
"grad_norm": 0.3287401795387268,
"learning_rate": 1.6010803764437633e-06,
"loss": 0.1285,
"step": 845
},
{
"epoch": 2.508524833209785,
"grad_norm": 0.34805530309677124,
"learning_rate": 1.5823724802136863e-06,
"loss": 0.1313,
"step": 846
},
{
"epoch": 2.5114899925871015,
"grad_norm": 0.33040815591812134,
"learning_rate": 1.5637651291624522e-06,
"loss": 0.1284,
"step": 847
},
{
"epoch": 2.514455151964418,
"grad_norm": 0.340082049369812,
"learning_rate": 1.545258545547398e-06,
"loss": 0.1258,
"step": 848
},
{
"epoch": 2.5174203113417346,
"grad_norm": 0.3319970965385437,
"learning_rate": 1.5268529504222262e-06,
"loss": 0.1278,
"step": 849
},
{
"epoch": 2.5203854707190514,
"grad_norm": 0.327903151512146,
"learning_rate": 1.5085485636343755e-06,
"loss": 0.1272,
"step": 850
},
{
"epoch": 2.5233506300963677,
"grad_norm": 0.3466844856739044,
"learning_rate": 1.4903456038223941e-06,
"loss": 0.131,
"step": 851
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.3274025619029999,
"learning_rate": 1.4722442884133214e-06,
"loss": 0.127,
"step": 852
},
{
"epoch": 2.529280948851001,
"grad_norm": 0.32809337973594666,
"learning_rate": 1.4542448336201021e-06,
"loss": 0.1265,
"step": 853
},
{
"epoch": 2.532246108228317,
"grad_norm": 0.3453335165977478,
"learning_rate": 1.4363474544389876e-06,
"loss": 0.1295,
"step": 854
},
{
"epoch": 2.535211267605634,
"grad_norm": 0.3447280824184418,
"learning_rate": 1.4185523646469822e-06,
"loss": 0.1312,
"step": 855
},
{
"epoch": 2.5381764269829503,
"grad_norm": 0.33509477972984314,
"learning_rate": 1.4008597767992872e-06,
"loss": 0.1283,
"step": 856
},
{
"epoch": 2.5411415863602667,
"grad_norm": 0.3374352753162384,
"learning_rate": 1.3832699022267516e-06,
"loss": 0.1277,
"step": 857
},
{
"epoch": 2.5441067457375834,
"grad_norm": 0.3189197778701782,
"learning_rate": 1.3657829510333653e-06,
"loss": 0.1261,
"step": 858
},
{
"epoch": 2.5470719051149,
"grad_norm": 0.34467366337776184,
"learning_rate": 1.3483991320937307e-06,
"loss": 0.1295,
"step": 859
},
{
"epoch": 2.5500370644922166,
"grad_norm": 0.33318278193473816,
"learning_rate": 1.3311186530505838e-06,
"loss": 0.1271,
"step": 860
},
{
"epoch": 2.553002223869533,
"grad_norm": 0.3337114453315735,
"learning_rate": 1.313941720312303e-06,
"loss": 0.133,
"step": 861
},
{
"epoch": 2.5559673832468492,
"grad_norm": 0.33227020502090454,
"learning_rate": 1.2968685390504465e-06,
"loss": 0.1277,
"step": 862
},
{
"epoch": 2.558932542624166,
"grad_norm": 0.3402811288833618,
"learning_rate": 1.2798993131973093e-06,
"loss": 0.128,
"step": 863
},
{
"epoch": 2.561897702001483,
"grad_norm": 0.32487955689430237,
"learning_rate": 1.263034245443473e-06,
"loss": 0.1296,
"step": 864
},
{
"epoch": 2.564862861378799,
"grad_norm": 0.3243284523487091,
"learning_rate": 1.2462735372353996e-06,
"loss": 0.1262,
"step": 865
},
{
"epoch": 2.5678280207561155,
"grad_norm": 0.33498314023017883,
"learning_rate": 1.2296173887730122e-06,
"loss": 0.1311,
"step": 866
},
{
"epoch": 2.5707931801334323,
"grad_norm": 0.32444214820861816,
"learning_rate": 1.2130659990073146e-06,
"loss": 0.1251,
"step": 867
},
{
"epoch": 2.5737583395107486,
"grad_norm": 0.3283936083316803,
"learning_rate": 1.196619565638003e-06,
"loss": 0.1266,
"step": 868
},
{
"epoch": 2.5767234988880654,
"grad_norm": 0.33177807927131653,
"learning_rate": 1.1802782851111206e-06,
"loss": 0.1277,
"step": 869
},
{
"epoch": 2.5796886582653817,
"grad_norm": 0.327374130487442,
"learning_rate": 1.1640423526166987e-06,
"loss": 0.1273,
"step": 870
},
{
"epoch": 2.582653817642698,
"grad_norm": 0.3298618495464325,
"learning_rate": 1.1479119620864277e-06,
"loss": 0.1278,
"step": 871
},
{
"epoch": 2.585618977020015,
"grad_norm": 0.34262576699256897,
"learning_rate": 1.1318873061913405e-06,
"loss": 0.1253,
"step": 872
},
{
"epoch": 2.588584136397331,
"grad_norm": 0.33369916677474976,
"learning_rate": 1.1159685763395113e-06,
"loss": 0.1277,
"step": 873
},
{
"epoch": 2.591549295774648,
"grad_norm": 0.32637131214141846,
"learning_rate": 1.1001559626737757e-06,
"loss": 0.1285,
"step": 874
},
{
"epoch": 2.5945144551519643,
"grad_norm": 0.33180394768714905,
"learning_rate": 1.0844496540694515e-06,
"loss": 0.1294,
"step": 875
},
{
"epoch": 2.597479614529281,
"grad_norm": 0.36661967635154724,
"learning_rate": 1.0688498381320855e-06,
"loss": 0.127,
"step": 876
},
{
"epoch": 2.6004447739065975,
"grad_norm": 0.32528406381607056,
"learning_rate": 1.0533567011952094e-06,
"loss": 0.1253,
"step": 877
},
{
"epoch": 2.6034099332839142,
"grad_norm": 0.33627548813819885,
"learning_rate": 1.037970428318118e-06,
"loss": 0.1258,
"step": 878
},
{
"epoch": 2.6063750926612306,
"grad_norm": 0.329609215259552,
"learning_rate": 1.022691203283661e-06,
"loss": 0.1268,
"step": 879
},
{
"epoch": 2.609340252038547,
"grad_norm": 0.3270719647407532,
"learning_rate": 1.0075192085960451e-06,
"loss": 0.1282,
"step": 880
},
{
"epoch": 2.6123054114158637,
"grad_norm": 0.3354145586490631,
"learning_rate": 9.924546254786493e-07,
"loss": 0.1285,
"step": 881
},
{
"epoch": 2.61527057079318,
"grad_norm": 0.32381850481033325,
"learning_rate": 9.77497633871868e-07,
"loss": 0.1294,
"step": 882
},
{
"epoch": 2.618235730170497,
"grad_norm": 0.32297268509864807,
"learning_rate": 9.62648412430951e-07,
"loss": 0.1268,
"step": 883
},
{
"epoch": 2.621200889547813,
"grad_norm": 0.3353489339351654,
"learning_rate": 9.479071385238892e-07,
"loss": 0.1263,
"step": 884
},
{
"epoch": 2.6241660489251295,
"grad_norm": 0.331815630197525,
"learning_rate": 9.332739882292752e-07,
"loss": 0.128,
"step": 885
},
{
"epoch": 2.6271312083024463,
"grad_norm": 0.33713892102241516,
"learning_rate": 9.187491363342094e-07,
"loss": 0.1269,
"step": 886
},
{
"epoch": 2.6300963676797626,
"grad_norm": 0.3313647508621216,
"learning_rate": 9.043327563322113e-07,
"loss": 0.1305,
"step": 887
},
{
"epoch": 2.6330615270570794,
"grad_norm": 0.32444262504577637,
"learning_rate": 8.900250204211513e-07,
"loss": 0.1291,
"step": 888
},
{
"epoch": 2.6360266864343957,
"grad_norm": 0.34167933464050293,
"learning_rate": 8.758260995011825e-07,
"loss": 0.1263,
"step": 889
},
{
"epoch": 2.6389918458117125,
"grad_norm": 0.3300521671772003,
"learning_rate": 8.617361631727139e-07,
"loss": 0.1258,
"step": 890
},
{
"epoch": 2.641957005189029,
"grad_norm": 0.3591514527797699,
"learning_rate": 8.477553797343729e-07,
"loss": 0.1268,
"step": 891
},
{
"epoch": 2.6449221645663457,
"grad_norm": 0.3284503221511841,
"learning_rate": 8.338839161809997e-07,
"loss": 0.127,
"step": 892
},
{
"epoch": 2.647887323943662,
"grad_norm": 0.3253602981567383,
"learning_rate": 8.201219382016556e-07,
"loss": 0.1259,
"step": 893
},
{
"epoch": 2.6508524833209783,
"grad_norm": 0.3226112723350525,
"learning_rate": 8.06469610177636e-07,
"loss": 0.1264,
"step": 894
},
{
"epoch": 2.653817642698295,
"grad_norm": 0.3329734206199646,
"learning_rate": 7.92927095180518e-07,
"loss": 0.1277,
"step": 895
},
{
"epoch": 2.6567828020756115,
"grad_norm": 0.36240342259407043,
"learning_rate": 7.794945549701993e-07,
"loss": 0.1286,
"step": 896
},
{
"epoch": 2.6597479614529282,
"grad_norm": 0.3200359642505646,
"learning_rate": 7.661721499929753e-07,
"loss": 0.1277,
"step": 897
},
{
"epoch": 2.6627131208302446,
"grad_norm": 0.33148688077926636,
"learning_rate": 7.529600393796232e-07,
"loss": 0.1277,
"step": 898
},
{
"epoch": 2.665678280207561,
"grad_norm": 0.32987260818481445,
"learning_rate": 7.398583809434944e-07,
"loss": 0.128,
"step": 899
},
{
"epoch": 2.6686434395848777,
"grad_norm": 0.33015844225883484,
"learning_rate": 7.268673311786378e-07,
"loss": 0.1307,
"step": 900
},
{
"epoch": 2.6716085989621945,
"grad_norm": 0.32374393939971924,
"learning_rate": 7.1398704525792e-07,
"loss": 0.1277,
"step": 901
},
{
"epoch": 2.674573758339511,
"grad_norm": 0.318718284368515,
"learning_rate": 7.012176770311863e-07,
"loss": 0.1266,
"step": 902
},
{
"epoch": 2.677538917716827,
"grad_norm": 0.3262283205986023,
"learning_rate": 6.885593790234057e-07,
"loss": 0.1251,
"step": 903
},
{
"epoch": 2.680504077094144,
"grad_norm": 0.3396647274494171,
"learning_rate": 6.760123024328624e-07,
"loss": 0.1303,
"step": 904
},
{
"epoch": 2.6834692364714603,
"grad_norm": 0.3207716643810272,
"learning_rate": 6.635765971293484e-07,
"loss": 0.1274,
"step": 905
},
{
"epoch": 2.686434395848777,
"grad_norm": 0.32596075534820557,
"learning_rate": 6.512524116523633e-07,
"loss": 0.1257,
"step": 906
},
{
"epoch": 2.6893995552260934,
"grad_norm": 0.322693407535553,
"learning_rate": 6.390398932093555e-07,
"loss": 0.1248,
"step": 907
},
{
"epoch": 2.6923647146034098,
"grad_norm": 0.3405155837535858,
"learning_rate": 6.269391876739494e-07,
"loss": 0.1291,
"step": 908
},
{
"epoch": 2.6953298739807265,
"grad_norm": 0.32777202129364014,
"learning_rate": 6.149504395842087e-07,
"loss": 0.1288,
"step": 909
},
{
"epoch": 2.698295033358043,
"grad_norm": 0.3245905935764313,
"learning_rate": 6.030737921409169e-07,
"loss": 0.1261,
"step": 910
},
{
"epoch": 2.7012601927353597,
"grad_norm": 0.33581435680389404,
"learning_rate": 5.913093872058528e-07,
"loss": 0.1302,
"step": 911
},
{
"epoch": 2.704225352112676,
"grad_norm": 0.3299258053302765,
"learning_rate": 5.796573653001091e-07,
"loss": 0.1264,
"step": 912
},
{
"epoch": 2.7071905114899923,
"grad_norm": 0.3272840082645416,
"learning_rate": 5.681178656024055e-07,
"loss": 0.1269,
"step": 913
},
{
"epoch": 2.710155670867309,
"grad_norm": 0.34318456053733826,
"learning_rate": 5.56691025947429e-07,
"loss": 0.1267,
"step": 914
},
{
"epoch": 2.713120830244626,
"grad_norm": 0.32936856150627136,
"learning_rate": 5.453769828241872e-07,
"loss": 0.1259,
"step": 915
},
{
"epoch": 2.7160859896219423,
"grad_norm": 0.32494810223579407,
"learning_rate": 5.341758713743828e-07,
"loss": 0.1281,
"step": 916
},
{
"epoch": 2.7190511489992586,
"grad_norm": 0.3184977173805237,
"learning_rate": 5.230878253907911e-07,
"loss": 0.1271,
"step": 917
},
{
"epoch": 2.7220163083765754,
"grad_norm": 0.3371221125125885,
"learning_rate": 5.121129773156663e-07,
"loss": 0.1322,
"step": 918
},
{
"epoch": 2.7249814677538917,
"grad_norm": 0.32113394141197205,
"learning_rate": 5.012514582391592e-07,
"loss": 0.1272,
"step": 919
},
{
"epoch": 2.7279466271312085,
"grad_norm": 0.34688618779182434,
"learning_rate": 4.905033978977492e-07,
"loss": 0.1265,
"step": 920
},
{
"epoch": 2.730911786508525,
"grad_norm": 0.3413783311843872,
"learning_rate": 4.798689246727006e-07,
"loss": 0.1307,
"step": 921
},
{
"epoch": 2.733876945885841,
"grad_norm": 0.33555951714515686,
"learning_rate": 4.693481655885257e-07,
"loss": 0.1269,
"step": 922
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.3346193730831146,
"learning_rate": 4.58941246311464e-07,
"loss": 0.1268,
"step": 923
},
{
"epoch": 2.7398072646404743,
"grad_norm": 0.3286806344985962,
"learning_rate": 4.4864829114798394e-07,
"loss": 0.1288,
"step": 924
},
{
"epoch": 2.742772424017791,
"grad_norm": 0.33568400144577026,
"learning_rate": 4.384694230432984e-07,
"loss": 0.1269,
"step": 925
},
{
"epoch": 2.7457375833951074,
"grad_norm": 0.3334142565727234,
"learning_rate": 4.2840476357989825e-07,
"loss": 0.1272,
"step": 926
},
{
"epoch": 2.7487027427724238,
"grad_norm": 0.32712700963020325,
"learning_rate": 4.184544329761009e-07,
"loss": 0.1271,
"step": 927
},
{
"epoch": 2.7516679021497406,
"grad_norm": 0.3435976803302765,
"learning_rate": 4.0861855008460403e-07,
"loss": 0.1286,
"step": 928
},
{
"epoch": 2.7546330615270573,
"grad_norm": 0.3265362083911896,
"learning_rate": 3.988972323910778e-07,
"loss": 0.1281,
"step": 929
},
{
"epoch": 2.7575982209043737,
"grad_norm": 0.32593265175819397,
"learning_rate": 3.8929059601275463e-07,
"loss": 0.1273,
"step": 930
},
{
"epoch": 2.76056338028169,
"grad_norm": 0.3315712511539459,
"learning_rate": 3.797987556970495e-07,
"loss": 0.1296,
"step": 931
},
{
"epoch": 2.763528539659007,
"grad_norm": 0.32149094343185425,
"learning_rate": 3.7042182482018074e-07,
"loss": 0.1284,
"step": 932
},
{
"epoch": 2.766493699036323,
"grad_norm": 0.3222528100013733,
"learning_rate": 3.611599153858214e-07,
"loss": 0.1259,
"step": 933
},
{
"epoch": 2.76945885841364,
"grad_norm": 0.32701918482780457,
"learning_rate": 3.520131380237546e-07,
"loss": 0.1287,
"step": 934
},
{
"epoch": 2.7724240177909563,
"grad_norm": 0.32082316279411316,
"learning_rate": 3.429816019885657e-07,
"loss": 0.1279,
"step": 935
},
{
"epoch": 2.7753891771682726,
"grad_norm": 0.32540014386177063,
"learning_rate": 3.3406541515832e-07,
"loss": 0.1273,
"step": 936
},
{
"epoch": 2.7783543365455894,
"grad_norm": 0.32940953969955444,
"learning_rate": 3.252646840332918e-07,
"loss": 0.1264,
"step": 937
},
{
"epoch": 2.7813194959229057,
"grad_norm": 0.32957059144973755,
"learning_rate": 3.16579513734675e-07,
"loss": 0.128,
"step": 938
},
{
"epoch": 2.7842846553002225,
"grad_norm": 0.31844544410705566,
"learning_rate": 3.080100080033388e-07,
"loss": 0.1268,
"step": 939
},
{
"epoch": 2.787249814677539,
"grad_norm": 0.3339882493019104,
"learning_rate": 2.995562691985898e-07,
"loss": 0.1259,
"step": 940
},
{
"epoch": 2.790214974054855,
"grad_norm": 0.3345246911048889,
"learning_rate": 2.9121839829693857e-07,
"loss": 0.1284,
"step": 941
},
{
"epoch": 2.793180133432172,
"grad_norm": 0.34839048981666565,
"learning_rate": 2.829964948909048e-07,
"loss": 0.1263,
"step": 942
},
{
"epoch": 2.7961452928094888,
"grad_norm": 0.3264479637145996,
"learning_rate": 2.748906571878207e-07,
"loss": 0.1253,
"step": 943
},
{
"epoch": 2.799110452186805,
"grad_norm": 0.3153613805770874,
"learning_rate": 2.6690098200866097e-07,
"loss": 0.1242,
"step": 944
},
{
"epoch": 2.8020756115641214,
"grad_norm": 0.3332012891769409,
"learning_rate": 2.5902756478688674e-07,
"loss": 0.1271,
"step": 945
},
{
"epoch": 2.805040770941438,
"grad_norm": 0.3120848536491394,
"learning_rate": 2.5127049956730207e-07,
"loss": 0.128,
"step": 946
},
{
"epoch": 2.8080059303187546,
"grad_norm": 0.32999473810195923,
"learning_rate": 2.436298790049363e-07,
"loss": 0.126,
"step": 947
},
{
"epoch": 2.8109710896960713,
"grad_norm": 0.32779738306999207,
"learning_rate": 2.3610579436392999e-07,
"loss": 0.1259,
"step": 948
},
{
"epoch": 2.8139362490733877,
"grad_norm": 0.31936830282211304,
"learning_rate": 2.2869833551645293e-07,
"loss": 0.1241,
"step": 949
},
{
"epoch": 2.816901408450704,
"grad_norm": 0.33030980825424194,
"learning_rate": 2.2140759094162468e-07,
"loss": 0.1274,
"step": 950
},
{
"epoch": 2.819866567828021,
"grad_norm": 0.34059956669807434,
"learning_rate": 2.1423364772445886e-07,
"loss": 0.1277,
"step": 951
},
{
"epoch": 2.822831727205337,
"grad_norm": 0.3205658793449402,
"learning_rate": 2.071765915548274e-07,
"loss": 0.1306,
"step": 952
},
{
"epoch": 2.825796886582654,
"grad_norm": 0.34191787242889404,
"learning_rate": 2.002365067264289e-07,
"loss": 0.1269,
"step": 953
},
{
"epoch": 2.8287620459599703,
"grad_norm": 0.3269594609737396,
"learning_rate": 1.9341347613579086e-07,
"loss": 0.1283,
"step": 954
},
{
"epoch": 2.8317272053372866,
"grad_norm": 0.33148735761642456,
"learning_rate": 1.867075812812691e-07,
"loss": 0.1298,
"step": 955
},
{
"epoch": 2.8346923647146034,
"grad_norm": 0.3186003267765045,
"learning_rate": 1.8011890226208527e-07,
"loss": 0.1274,
"step": 956
},
{
"epoch": 2.83765752409192,
"grad_norm": 0.32092559337615967,
"learning_rate": 1.7364751777736334e-07,
"loss": 0.1242,
"step": 957
},
{
"epoch": 2.8406226834692365,
"grad_norm": 0.36179545521736145,
"learning_rate": 1.6729350512519006e-07,
"loss": 0.129,
"step": 958
},
{
"epoch": 2.843587842846553,
"grad_norm": 0.33006298542022705,
"learning_rate": 1.6105694020169594e-07,
"loss": 0.1258,
"step": 959
},
{
"epoch": 2.8465530022238696,
"grad_norm": 0.31537604331970215,
"learning_rate": 1.5493789750014032e-07,
"loss": 0.1283,
"step": 960
},
{
"epoch": 2.849518161601186,
"grad_norm": 0.33820608258247375,
"learning_rate": 1.489364501100332e-07,
"loss": 0.1275,
"step": 961
},
{
"epoch": 2.8524833209785028,
"grad_norm": 0.3154459297657013,
"learning_rate": 1.430526697162482e-07,
"loss": 0.1258,
"step": 962
},
{
"epoch": 2.855448480355819,
"grad_norm": 0.31913918256759644,
"learning_rate": 1.3728662659818205e-07,
"loss": 0.1253,
"step": 963
},
{
"epoch": 2.8584136397331354,
"grad_norm": 0.32766804099082947,
"learning_rate": 1.3163838962890196e-07,
"loss": 0.129,
"step": 964
},
{
"epoch": 2.8613787991104522,
"grad_norm": 0.3298415541648865,
"learning_rate": 1.2610802627432972e-07,
"loss": 0.1278,
"step": 965
},
{
"epoch": 2.8643439584877686,
"grad_norm": 0.32275769114494324,
"learning_rate": 1.206956025924333e-07,
"loss": 0.126,
"step": 966
},
{
"epoch": 2.8673091178650854,
"grad_norm": 0.3340933918952942,
"learning_rate": 1.1540118323243866e-07,
"loss": 0.1272,
"step": 967
},
{
"epoch": 2.8702742772424017,
"grad_norm": 0.33475035429000854,
"learning_rate": 1.1022483143405705e-07,
"loss": 0.1265,
"step": 968
},
{
"epoch": 2.873239436619718,
"grad_norm": 0.32354483008384705,
"learning_rate": 1.0516660902673448e-07,
"loss": 0.1255,
"step": 969
},
{
"epoch": 2.876204595997035,
"grad_norm": 0.3189190924167633,
"learning_rate": 1.0022657642890232e-07,
"loss": 0.1254,
"step": 970
},
{
"epoch": 2.8791697553743516,
"grad_norm": 0.3238016366958618,
"learning_rate": 9.540479264726676e-08,
"loss": 0.1274,
"step": 971
},
{
"epoch": 2.882134914751668,
"grad_norm": 0.3224412798881531,
"learning_rate": 9.070131527609604e-08,
"loss": 0.1271,
"step": 972
},
{
"epoch": 2.8851000741289843,
"grad_norm": 0.34490659832954407,
"learning_rate": 8.61162004965388e-08,
"loss": 0.1277,
"step": 973
},
{
"epoch": 2.888065233506301,
"grad_norm": 0.3256824016571045,
"learning_rate": 8.16495030759501e-08,
"loss": 0.1304,
"step": 974
},
{
"epoch": 2.8910303928836174,
"grad_norm": 0.326412171125412,
"learning_rate": 7.730127636723539e-08,
"loss": 0.1271,
"step": 975
},
{
"epoch": 2.893995552260934,
"grad_norm": 0.32723942399024963,
"learning_rate": 7.307157230821426e-08,
"loss": 0.1291,
"step": 976
},
{
"epoch": 2.8969607116382505,
"grad_norm": 0.33483996987342834,
"learning_rate": 6.896044142100433e-08,
"loss": 0.1271,
"step": 977
},
{
"epoch": 2.899925871015567,
"grad_norm": 0.3145699203014374,
"learning_rate": 6.496793281141056e-08,
"loss": 0.1257,
"step": 978
},
{
"epoch": 2.9028910303928837,
"grad_norm": 0.3338087201118469,
"learning_rate": 6.109409416834689e-08,
"loss": 0.1272,
"step": 979
},
{
"epoch": 2.9058561897702,
"grad_norm": 0.3297833502292633,
"learning_rate": 5.7338971763256646e-08,
"loss": 0.1263,
"step": 980
},
{
"epoch": 2.9088213491475168,
"grad_norm": 0.32836630940437317,
"learning_rate": 5.37026104495697e-08,
"loss": 0.1264,
"step": 981
},
{
"epoch": 2.911786508524833,
"grad_norm": 0.32583150267601013,
"learning_rate": 5.0185053662161756e-08,
"loss": 0.1265,
"step": 982
},
{
"epoch": 2.91475166790215,
"grad_norm": 0.32299482822418213,
"learning_rate": 4.678634341683252e-08,
"loss": 0.1253,
"step": 983
},
{
"epoch": 2.9177168272794662,
"grad_norm": 0.32840579748153687,
"learning_rate": 4.350652030981395e-08,
"loss": 0.1286,
"step": 984
},
{
"epoch": 2.920681986656783,
"grad_norm": 0.3269804120063782,
"learning_rate": 4.0345623517273894e-08,
"loss": 0.1284,
"step": 985
},
{
"epoch": 2.9236471460340994,
"grad_norm": 0.31736278533935547,
"learning_rate": 3.7303690794854296e-08,
"loss": 0.1246,
"step": 986
},
{
"epoch": 2.9266123054114157,
"grad_norm": 0.3197997212409973,
"learning_rate": 3.438075847721933e-08,
"loss": 0.1247,
"step": 987
},
{
"epoch": 2.9295774647887325,
"grad_norm": 0.3263581395149231,
"learning_rate": 3.157686147762129e-08,
"loss": 0.1273,
"step": 988
},
{
"epoch": 2.932542624166049,
"grad_norm": 0.32051053643226624,
"learning_rate": 2.8892033287484245e-08,
"loss": 0.1265,
"step": 989
},
{
"epoch": 2.9355077835433656,
"grad_norm": 0.33849623799324036,
"learning_rate": 2.6326305976001054e-08,
"loss": 0.1287,
"step": 990
},
{
"epoch": 2.938472942920682,
"grad_norm": 0.3170969486236572,
"learning_rate": 2.3879710189753657e-08,
"loss": 0.1252,
"step": 991
},
{
"epoch": 2.9414381022979983,
"grad_norm": 0.32798030972480774,
"learning_rate": 2.1552275152346702e-08,
"loss": 0.1282,
"step": 992
},
{
"epoch": 2.944403261675315,
"grad_norm": 0.3274080157279968,
"learning_rate": 1.9344028664056715e-08,
"loss": 0.1249,
"step": 993
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.3368877172470093,
"learning_rate": 1.7254997101500137e-08,
"loss": 0.1287,
"step": 994
},
{
"epoch": 2.950333580429948,
"grad_norm": 0.32225024700164795,
"learning_rate": 1.528520541731915e-08,
"loss": 0.1259,
"step": 995
},
{
"epoch": 2.9532987398072645,
"grad_norm": 0.33008435368537903,
"learning_rate": 1.3434677139885222e-08,
"loss": 0.1262,
"step": 996
},
{
"epoch": 2.9562638991845813,
"grad_norm": 0.3370579183101654,
"learning_rate": 1.170343437301491e-08,
"loss": 0.126,
"step": 997
},
{
"epoch": 2.9592290585618977,
"grad_norm": 0.31601622700691223,
"learning_rate": 1.0091497795706728e-08,
"loss": 0.1269,
"step": 998
},
{
"epoch": 2.9621942179392144,
"grad_norm": 0.3216618299484253,
"learning_rate": 8.59888666189579e-09,
"loss": 0.126,
"step": 999
},
{
"epoch": 2.965159377316531,
"grad_norm": 0.3355175852775574,
"learning_rate": 7.225618800222878e-09,
"loss": 0.1278,
"step": 1000
},
{
"epoch": 2.968124536693847,
"grad_norm": 0.32904869318008423,
"learning_rate": 5.971710613821291e-09,
"loss": 0.1284,
"step": 1001
},
{
"epoch": 2.971089696071164,
"grad_norm": 0.351557195186615,
"learning_rate": 4.837177080119215e-09,
"loss": 0.1265,
"step": 1002
},
{
"epoch": 2.9740548554484803,
"grad_norm": 0.32986804842948914,
"learning_rate": 3.8220317506654226e-09,
"loss": 0.1269,
"step": 1003
},
{
"epoch": 2.977020014825797,
"grad_norm": 0.3295051157474518,
"learning_rate": 2.9262867509605164e-09,
"loss": 0.1261,
"step": 1004
},
{
"epoch": 2.9799851742031134,
"grad_norm": 0.3266933858394623,
"learning_rate": 2.149952780321485e-09,
"loss": 0.1248,
"step": 1005
},
{
"epoch": 2.9829503335804297,
"grad_norm": 0.32243990898132324,
"learning_rate": 1.4930391117451427e-09,
"loss": 0.1262,
"step": 1006
},
{
"epoch": 2.9859154929577465,
"grad_norm": 0.34273526072502136,
"learning_rate": 9.555535917993297e-10,
"loss": 0.129,
"step": 1007
},
{
"epoch": 2.9888806523350633,
"grad_norm": 0.3207569718360901,
"learning_rate": 5.375026405352035e-10,
"loss": 0.1259,
"step": 1008
},
{
"epoch": 2.9918458117123796,
"grad_norm": 0.3252420723438263,
"learning_rate": 2.388912514017516e-10,
"loss": 0.1273,
"step": 1009
},
{
"epoch": 2.994810971089696,
"grad_norm": 0.3213896155357361,
"learning_rate": 5.972299119250124e-11,
"loss": 0.1258,
"step": 1010
},
{
"epoch": 2.9977761304670127,
"grad_norm": 0.3191134035587311,
"learning_rate": 0.0,
"loss": 0.1258,
"step": 1011
},
{
"epoch": 2.9977761304670127,
"step": 1011,
"total_flos": 1.1934072323532915e+19,
"train_loss": 0.20480146514054692,
"train_runtime": 12349.0973,
"train_samples_per_second": 10.482,
"train_steps_per_second": 0.082
}
],
"logging_steps": 1,
"max_steps": 1011,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1934072323532915e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}