{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7509627727856225,
"eval_steps": 500,
"global_step": 1365,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012836970474967907,
"grad_norm": 0.9255548715591431,
"learning_rate": 5.000000000000001e-07,
"loss": 2.8985,
"step": 1
},
{
"epoch": 0.0025673940949935813,
"grad_norm": 0.7692601680755615,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.9774,
"step": 2
},
{
"epoch": 0.0038510911424903724,
"grad_norm": 0.7884671092033386,
"learning_rate": 1.5e-06,
"loss": 2.9898,
"step": 3
},
{
"epoch": 0.005134788189987163,
"grad_norm": 0.8319393396377563,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.9573,
"step": 4
},
{
"epoch": 0.006418485237483954,
"grad_norm": 0.822285532951355,
"learning_rate": 2.5e-06,
"loss": 2.9316,
"step": 5
},
{
"epoch": 0.007702182284980745,
"grad_norm": 0.7564650774002075,
"learning_rate": 3e-06,
"loss": 2.9537,
"step": 6
},
{
"epoch": 0.008985879332477536,
"grad_norm": 0.9220781922340393,
"learning_rate": 3.5000000000000004e-06,
"loss": 2.9826,
"step": 7
},
{
"epoch": 0.010269576379974325,
"grad_norm": 0.7977064847946167,
"learning_rate": 4.000000000000001e-06,
"loss": 2.8548,
"step": 8
},
{
"epoch": 0.011553273427471117,
"grad_norm": 0.6889916658401489,
"learning_rate": 4.5e-06,
"loss": 2.9936,
"step": 9
},
{
"epoch": 0.012836970474967908,
"grad_norm": 0.9777728915214539,
"learning_rate": 5e-06,
"loss": 2.9578,
"step": 10
},
{
"epoch": 0.014120667522464698,
"grad_norm": 0.8187949061393738,
"learning_rate": 5.500000000000001e-06,
"loss": 2.9442,
"step": 11
},
{
"epoch": 0.01540436456996149,
"grad_norm": 0.7016908526420593,
"learning_rate": 6e-06,
"loss": 2.9299,
"step": 12
},
{
"epoch": 0.01668806161745828,
"grad_norm": 0.6974747180938721,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.9699,
"step": 13
},
{
"epoch": 0.01797175866495507,
"grad_norm": 0.8423139452934265,
"learning_rate": 7.000000000000001e-06,
"loss": 2.8706,
"step": 14
},
{
"epoch": 0.019255455712451863,
"grad_norm": 0.8017705082893372,
"learning_rate": 7.5e-06,
"loss": 2.8443,
"step": 15
},
{
"epoch": 0.02053915275994865,
"grad_norm": 0.9362208247184753,
"learning_rate": 8.000000000000001e-06,
"loss": 2.872,
"step": 16
},
{
"epoch": 0.021822849807445442,
"grad_norm": 1.013128638267517,
"learning_rate": 8.500000000000002e-06,
"loss": 2.797,
"step": 17
},
{
"epoch": 0.023106546854942234,
"grad_norm": 1.255325436592102,
"learning_rate": 9e-06,
"loss": 2.8109,
"step": 18
},
{
"epoch": 0.024390243902439025,
"grad_norm": 1.1081339120864868,
"learning_rate": 9.5e-06,
"loss": 2.6742,
"step": 19
},
{
"epoch": 0.025673940949935817,
"grad_norm": 0.8878622651100159,
"learning_rate": 1e-05,
"loss": 2.7251,
"step": 20
},
{
"epoch": 0.026957637997432605,
"grad_norm": 0.894791305065155,
"learning_rate": 1.05e-05,
"loss": 2.7127,
"step": 21
},
{
"epoch": 0.028241335044929396,
"grad_norm": 0.6742448806762695,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.6842,
"step": 22
},
{
"epoch": 0.029525032092426188,
"grad_norm": 0.6250098943710327,
"learning_rate": 1.1500000000000002e-05,
"loss": 2.6476,
"step": 23
},
{
"epoch": 0.03080872913992298,
"grad_norm": 0.6331678032875061,
"learning_rate": 1.2e-05,
"loss": 2.6339,
"step": 24
},
{
"epoch": 0.03209242618741977,
"grad_norm": 0.4726584255695343,
"learning_rate": 1.25e-05,
"loss": 2.5994,
"step": 25
},
{
"epoch": 0.03337612323491656,
"grad_norm": 0.46077489852905273,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.5545,
"step": 26
},
{
"epoch": 0.03465982028241335,
"grad_norm": 0.5746111273765564,
"learning_rate": 1.3500000000000001e-05,
"loss": 2.5542,
"step": 27
},
{
"epoch": 0.03594351732991014,
"grad_norm": 0.47136253118515015,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.6138,
"step": 28
},
{
"epoch": 0.037227214377406934,
"grad_norm": 0.5951219797134399,
"learning_rate": 1.45e-05,
"loss": 2.5941,
"step": 29
},
{
"epoch": 0.038510911424903725,
"grad_norm": 0.48593708872795105,
"learning_rate": 1.5e-05,
"loss": 2.5751,
"step": 30
},
{
"epoch": 0.03979460847240052,
"grad_norm": 0.482264906167984,
"learning_rate": 1.55e-05,
"loss": 2.5689,
"step": 31
},
{
"epoch": 0.0410783055198973,
"grad_norm": 0.4937015175819397,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.5559,
"step": 32
},
{
"epoch": 0.04236200256739409,
"grad_norm": 0.5136978030204773,
"learning_rate": 1.65e-05,
"loss": 2.5274,
"step": 33
},
{
"epoch": 0.043645699614890884,
"grad_norm": 0.40434661507606506,
"learning_rate": 1.7000000000000003e-05,
"loss": 2.4391,
"step": 34
},
{
"epoch": 0.044929396662387676,
"grad_norm": 0.49542951583862305,
"learning_rate": 1.75e-05,
"loss": 2.5738,
"step": 35
},
{
"epoch": 0.04621309370988447,
"grad_norm": 0.4381186366081238,
"learning_rate": 1.8e-05,
"loss": 2.5095,
"step": 36
},
{
"epoch": 0.04749679075738126,
"grad_norm": 0.486103892326355,
"learning_rate": 1.85e-05,
"loss": 2.6173,
"step": 37
},
{
"epoch": 0.04878048780487805,
"grad_norm": 0.4313197135925293,
"learning_rate": 1.9e-05,
"loss": 2.4951,
"step": 38
},
{
"epoch": 0.05006418485237484,
"grad_norm": 0.4211004078388214,
"learning_rate": 1.9500000000000003e-05,
"loss": 2.5469,
"step": 39
},
{
"epoch": 0.051347881899871634,
"grad_norm": 0.44780072569847107,
"learning_rate": 2e-05,
"loss": 2.4437,
"step": 40
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.44611668586730957,
"learning_rate": 2.05e-05,
"loss": 2.4795,
"step": 41
},
{
"epoch": 0.05391527599486521,
"grad_norm": 0.4598286747932434,
"learning_rate": 2.1e-05,
"loss": 2.4786,
"step": 42
},
{
"epoch": 0.055198973042362,
"grad_norm": 0.4416978061199188,
"learning_rate": 2.15e-05,
"loss": 2.453,
"step": 43
},
{
"epoch": 0.05648267008985879,
"grad_norm": 0.4136359989643097,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.4975,
"step": 44
},
{
"epoch": 0.057766367137355584,
"grad_norm": 0.44032955169677734,
"learning_rate": 2.25e-05,
"loss": 2.411,
"step": 45
},
{
"epoch": 0.059050064184852376,
"grad_norm": 0.49505728483200073,
"learning_rate": 2.3000000000000003e-05,
"loss": 2.5049,
"step": 46
},
{
"epoch": 0.06033376123234917,
"grad_norm": 0.43698814511299133,
"learning_rate": 2.35e-05,
"loss": 2.5828,
"step": 47
},
{
"epoch": 0.06161745827984596,
"grad_norm": 0.44550350308418274,
"learning_rate": 2.4e-05,
"loss": 2.4326,
"step": 48
},
{
"epoch": 0.06290115532734275,
"grad_norm": 0.38959425687789917,
"learning_rate": 2.45e-05,
"loss": 2.4684,
"step": 49
},
{
"epoch": 0.06418485237483953,
"grad_norm": 0.4324244260787964,
"learning_rate": 2.5e-05,
"loss": 2.4496,
"step": 50
},
{
"epoch": 0.06546854942233633,
"grad_norm": 0.4213118553161621,
"learning_rate": 2.5500000000000003e-05,
"loss": 2.4742,
"step": 51
},
{
"epoch": 0.06675224646983312,
"grad_norm": 0.5279268622398376,
"learning_rate": 2.6000000000000002e-05,
"loss": 2.4222,
"step": 52
},
{
"epoch": 0.06803594351732992,
"grad_norm": 0.40476322174072266,
"learning_rate": 2.6500000000000004e-05,
"loss": 2.4224,
"step": 53
},
{
"epoch": 0.0693196405648267,
"grad_norm": 0.6938806176185608,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.4194,
"step": 54
},
{
"epoch": 0.07060333761232349,
"grad_norm": 0.43899399042129517,
"learning_rate": 2.7500000000000004e-05,
"loss": 2.4288,
"step": 55
},
{
"epoch": 0.07188703465982028,
"grad_norm": 0.3968575596809387,
"learning_rate": 2.8000000000000003e-05,
"loss": 2.3968,
"step": 56
},
{
"epoch": 0.07317073170731707,
"grad_norm": 0.6113290786743164,
"learning_rate": 2.8499999999999998e-05,
"loss": 2.3924,
"step": 57
},
{
"epoch": 0.07445442875481387,
"grad_norm": 0.35704493522644043,
"learning_rate": 2.9e-05,
"loss": 2.4004,
"step": 58
},
{
"epoch": 0.07573812580231065,
"grad_norm": 0.3809000551700592,
"learning_rate": 2.95e-05,
"loss": 2.4303,
"step": 59
},
{
"epoch": 0.07702182284980745,
"grad_norm": 0.4394189715385437,
"learning_rate": 3e-05,
"loss": 2.4716,
"step": 60
},
{
"epoch": 0.07830551989730423,
"grad_norm": 0.4325893521308899,
"learning_rate": 3.05e-05,
"loss": 2.4715,
"step": 61
},
{
"epoch": 0.07958921694480103,
"grad_norm": 0.3560517728328705,
"learning_rate": 3.1e-05,
"loss": 2.4745,
"step": 62
},
{
"epoch": 0.08087291399229782,
"grad_norm": 0.37922918796539307,
"learning_rate": 3.15e-05,
"loss": 2.4789,
"step": 63
},
{
"epoch": 0.0821566110397946,
"grad_norm": 0.3665093779563904,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.4829,
"step": 64
},
{
"epoch": 0.0834403080872914,
"grad_norm": 0.38106483221054077,
"learning_rate": 3.2500000000000004e-05,
"loss": 2.4203,
"step": 65
},
{
"epoch": 0.08472400513478819,
"grad_norm": 0.5111002326011658,
"learning_rate": 3.3e-05,
"loss": 2.3302,
"step": 66
},
{
"epoch": 0.08600770218228498,
"grad_norm": 0.37738922238349915,
"learning_rate": 3.35e-05,
"loss": 2.4991,
"step": 67
},
{
"epoch": 0.08729139922978177,
"grad_norm": 0.34703031182289124,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.4224,
"step": 68
},
{
"epoch": 0.08857509627727857,
"grad_norm": 0.48427700996398926,
"learning_rate": 3.45e-05,
"loss": 2.3946,
"step": 69
},
{
"epoch": 0.08985879332477535,
"grad_norm": 0.360221266746521,
"learning_rate": 3.5e-05,
"loss": 2.4853,
"step": 70
},
{
"epoch": 0.09114249037227215,
"grad_norm": 0.4006412625312805,
"learning_rate": 3.55e-05,
"loss": 2.4654,
"step": 71
},
{
"epoch": 0.09242618741976893,
"grad_norm": 0.3662618398666382,
"learning_rate": 3.6e-05,
"loss": 2.434,
"step": 72
},
{
"epoch": 0.09370988446726572,
"grad_norm": 0.3694933354854584,
"learning_rate": 3.65e-05,
"loss": 2.4194,
"step": 73
},
{
"epoch": 0.09499358151476252,
"grad_norm": 0.34268808364868164,
"learning_rate": 3.7e-05,
"loss": 2.4922,
"step": 74
},
{
"epoch": 0.0962772785622593,
"grad_norm": 0.3664718270301819,
"learning_rate": 3.7500000000000003e-05,
"loss": 2.4083,
"step": 75
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.3909706473350525,
"learning_rate": 3.8e-05,
"loss": 2.4546,
"step": 76
},
{
"epoch": 0.09884467265725289,
"grad_norm": 0.36276674270629883,
"learning_rate": 3.85e-05,
"loss": 2.5313,
"step": 77
},
{
"epoch": 0.10012836970474968,
"grad_norm": 0.34822535514831543,
"learning_rate": 3.9000000000000006e-05,
"loss": 2.4833,
"step": 78
},
{
"epoch": 0.10141206675224647,
"grad_norm": 0.37480583786964417,
"learning_rate": 3.9500000000000005e-05,
"loss": 2.449,
"step": 79
},
{
"epoch": 0.10269576379974327,
"grad_norm": 0.3415388762950897,
"learning_rate": 4e-05,
"loss": 2.3953,
"step": 80
},
{
"epoch": 0.10397946084724005,
"grad_norm": 0.3487205505371094,
"learning_rate": 4.05e-05,
"loss": 2.4469,
"step": 81
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.5083756446838379,
"learning_rate": 4.1e-05,
"loss": 2.3423,
"step": 82
},
{
"epoch": 0.10654685494223363,
"grad_norm": 0.3747817575931549,
"learning_rate": 4.15e-05,
"loss": 2.3919,
"step": 83
},
{
"epoch": 0.10783055198973042,
"grad_norm": 0.39472389221191406,
"learning_rate": 4.2e-05,
"loss": 2.5431,
"step": 84
},
{
"epoch": 0.10911424903722722,
"grad_norm": 0.36542952060699463,
"learning_rate": 4.25e-05,
"loss": 2.3858,
"step": 85
},
{
"epoch": 0.110397946084724,
"grad_norm": 0.34629878401756287,
"learning_rate": 4.3e-05,
"loss": 2.3528,
"step": 86
},
{
"epoch": 0.1116816431322208,
"grad_norm": 0.36755290627479553,
"learning_rate": 4.35e-05,
"loss": 2.5329,
"step": 87
},
{
"epoch": 0.11296534017971759,
"grad_norm": 0.3599033057689667,
"learning_rate": 4.4000000000000006e-05,
"loss": 2.4971,
"step": 88
},
{
"epoch": 0.11424903722721438,
"grad_norm": 0.3730204701423645,
"learning_rate": 4.4500000000000004e-05,
"loss": 2.4082,
"step": 89
},
{
"epoch": 0.11553273427471117,
"grad_norm": 0.3773551881313324,
"learning_rate": 4.5e-05,
"loss": 2.3393,
"step": 90
},
{
"epoch": 0.11681643132220795,
"grad_norm": 0.36052408814430237,
"learning_rate": 4.55e-05,
"loss": 2.3799,
"step": 91
},
{
"epoch": 0.11810012836970475,
"grad_norm": 0.32685768604278564,
"learning_rate": 4.600000000000001e-05,
"loss": 2.4284,
"step": 92
},
{
"epoch": 0.11938382541720154,
"grad_norm": 0.36063292622566223,
"learning_rate": 4.6500000000000005e-05,
"loss": 2.3903,
"step": 93
},
{
"epoch": 0.12066752246469833,
"grad_norm": 0.3656150698661804,
"learning_rate": 4.7e-05,
"loss": 2.4378,
"step": 94
},
{
"epoch": 0.12195121951219512,
"grad_norm": 0.3563483655452728,
"learning_rate": 4.75e-05,
"loss": 2.4082,
"step": 95
},
{
"epoch": 0.12323491655969192,
"grad_norm": 0.35744163393974304,
"learning_rate": 4.8e-05,
"loss": 2.3979,
"step": 96
},
{
"epoch": 0.1245186136071887,
"grad_norm": 0.3400294780731201,
"learning_rate": 4.85e-05,
"loss": 2.4419,
"step": 97
},
{
"epoch": 0.1258023106546855,
"grad_norm": 0.3691268265247345,
"learning_rate": 4.9e-05,
"loss": 2.4671,
"step": 98
},
{
"epoch": 0.12708600770218229,
"grad_norm": 0.3483717739582062,
"learning_rate": 4.9500000000000004e-05,
"loss": 2.3947,
"step": 99
},
{
"epoch": 0.12836970474967907,
"grad_norm": 0.3494178354740143,
"learning_rate": 5e-05,
"loss": 2.4063,
"step": 100
},
{
"epoch": 0.12965340179717585,
"grad_norm": 0.3814634680747986,
"learning_rate": 4.9999986437272225e-05,
"loss": 2.4832,
"step": 101
},
{
"epoch": 0.13093709884467267,
"grad_norm": 0.378907710313797,
"learning_rate": 4.999994574910364e-05,
"loss": 2.3272,
"step": 102
},
{
"epoch": 0.13222079589216945,
"grad_norm": 0.3730032444000244,
"learning_rate": 4.999987793553836e-05,
"loss": 2.3965,
"step": 103
},
{
"epoch": 0.13350449293966624,
"grad_norm": 0.3670955002307892,
"learning_rate": 4.9999782996649994e-05,
"loss": 2.4565,
"step": 104
},
{
"epoch": 0.13478818998716302,
"grad_norm": 0.36450427770614624,
"learning_rate": 4.999966093254153e-05,
"loss": 2.5079,
"step": 105
},
{
"epoch": 0.13607188703465983,
"grad_norm": 0.38658779859542847,
"learning_rate": 4.9999511743345426e-05,
"loss": 2.4775,
"step": 106
},
{
"epoch": 0.13735558408215662,
"grad_norm": 0.33551573753356934,
"learning_rate": 4.999933542922354e-05,
"loss": 2.461,
"step": 107
},
{
"epoch": 0.1386392811296534,
"grad_norm": 0.3854399025440216,
"learning_rate": 4.999913199036719e-05,
"loss": 2.4075,
"step": 108
},
{
"epoch": 0.1399229781771502,
"grad_norm": 0.3932352364063263,
"learning_rate": 4.9998901426997104e-05,
"loss": 2.4311,
"step": 109
},
{
"epoch": 0.14120667522464697,
"grad_norm": 0.3379668593406677,
"learning_rate": 4.999864373936345e-05,
"loss": 2.5016,
"step": 110
},
{
"epoch": 0.14249037227214378,
"grad_norm": 0.32840296626091003,
"learning_rate": 4.9998358927745826e-05,
"loss": 2.3176,
"step": 111
},
{
"epoch": 0.14377406931964057,
"grad_norm": 0.3191027343273163,
"learning_rate": 4.999804699245325e-05,
"loss": 2.4007,
"step": 112
},
{
"epoch": 0.14505776636713735,
"grad_norm": 0.3358600437641144,
"learning_rate": 4.999770793382418e-05,
"loss": 2.3724,
"step": 113
},
{
"epoch": 0.14634146341463414,
"grad_norm": 0.335860937833786,
"learning_rate": 4.99973417522265e-05,
"loss": 2.3479,
"step": 114
},
{
"epoch": 0.14762516046213095,
"grad_norm": 0.34540730714797974,
"learning_rate": 4.999694844805753e-05,
"loss": 2.3675,
"step": 115
},
{
"epoch": 0.14890885750962773,
"grad_norm": 0.3298475742340088,
"learning_rate": 4.999652802174402e-05,
"loss": 2.3948,
"step": 116
},
{
"epoch": 0.15019255455712452,
"grad_norm": 0.3612127900123596,
"learning_rate": 4.999608047374211e-05,
"loss": 2.3855,
"step": 117
},
{
"epoch": 0.1514762516046213,
"grad_norm": 0.4185655415058136,
"learning_rate": 4.9995605804537426e-05,
"loss": 2.4015,
"step": 118
},
{
"epoch": 0.1527599486521181,
"grad_norm": 0.3759553134441376,
"learning_rate": 4.9995104014644986e-05,
"loss": 2.4483,
"step": 119
},
{
"epoch": 0.1540436456996149,
"grad_norm": 0.35989564657211304,
"learning_rate": 4.999457510460923e-05,
"loss": 2.4974,
"step": 120
},
{
"epoch": 0.15532734274711169,
"grad_norm": 0.3161202073097229,
"learning_rate": 4.999401907500405e-05,
"loss": 2.3712,
"step": 121
},
{
"epoch": 0.15661103979460847,
"grad_norm": 0.3105814456939697,
"learning_rate": 4.999343592643274e-05,
"loss": 2.4311,
"step": 122
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.3236968517303467,
"learning_rate": 4.9992825659528024e-05,
"loss": 2.5536,
"step": 123
},
{
"epoch": 0.15917843388960207,
"grad_norm": 0.3107609748840332,
"learning_rate": 4.9992188274952064e-05,
"loss": 2.3922,
"step": 124
},
{
"epoch": 0.16046213093709885,
"grad_norm": 0.39889928698539734,
"learning_rate": 4.999152377339642e-05,
"loss": 2.3488,
"step": 125
},
{
"epoch": 0.16174582798459564,
"grad_norm": 0.379323273897171,
"learning_rate": 4.99908321555821e-05,
"loss": 2.5278,
"step": 126
},
{
"epoch": 0.16302952503209242,
"grad_norm": 0.35020819306373596,
"learning_rate": 4.999011342225952e-05,
"loss": 2.3139,
"step": 127
},
{
"epoch": 0.1643132220795892,
"grad_norm": 0.34851884841918945,
"learning_rate": 4.998936757420851e-05,
"loss": 2.3495,
"step": 128
},
{
"epoch": 0.16559691912708602,
"grad_norm": 0.3453572392463684,
"learning_rate": 4.9988594612238336e-05,
"loss": 2.4128,
"step": 129
},
{
"epoch": 0.1668806161745828,
"grad_norm": 0.34227538108825684,
"learning_rate": 4.998779453718768e-05,
"loss": 2.3419,
"step": 130
},
{
"epoch": 0.1681643132220796,
"grad_norm": 0.3947238028049469,
"learning_rate": 4.998696734992462e-05,
"loss": 2.3941,
"step": 131
},
{
"epoch": 0.16944801026957637,
"grad_norm": 0.306533545255661,
"learning_rate": 4.998611305134669e-05,
"loss": 2.4645,
"step": 132
},
{
"epoch": 0.17073170731707318,
"grad_norm": 0.35172709822654724,
"learning_rate": 4.998523164238082e-05,
"loss": 2.4407,
"step": 133
},
{
"epoch": 0.17201540436456997,
"grad_norm": 0.34511688351631165,
"learning_rate": 4.9984323123983334e-05,
"loss": 2.3815,
"step": 134
},
{
"epoch": 0.17329910141206675,
"grad_norm": 0.33132505416870117,
"learning_rate": 4.9983387497140006e-05,
"loss": 2.3548,
"step": 135
},
{
"epoch": 0.17458279845956354,
"grad_norm": 0.32891082763671875,
"learning_rate": 4.998242476286601e-05,
"loss": 2.5308,
"step": 136
},
{
"epoch": 0.17586649550706032,
"grad_norm": 0.331152081489563,
"learning_rate": 4.998143492220592e-05,
"loss": 2.3858,
"step": 137
},
{
"epoch": 0.17715019255455713,
"grad_norm": 0.31813687086105347,
"learning_rate": 4.9980417976233735e-05,
"loss": 2.3136,
"step": 138
},
{
"epoch": 0.17843388960205392,
"grad_norm": 0.3268696069717407,
"learning_rate": 4.9979373926052865e-05,
"loss": 2.3133,
"step": 139
},
{
"epoch": 0.1797175866495507,
"grad_norm": 0.3389696180820465,
"learning_rate": 4.997830277279612e-05,
"loss": 2.3983,
"step": 140
},
{
"epoch": 0.1810012836970475,
"grad_norm": 0.3515508770942688,
"learning_rate": 4.997720451762572e-05,
"loss": 2.4848,
"step": 141
},
{
"epoch": 0.1822849807445443,
"grad_norm": 0.3302924335002899,
"learning_rate": 4.997607916173329e-05,
"loss": 2.3037,
"step": 142
},
{
"epoch": 0.18356867779204109,
"grad_norm": 0.3332863450050354,
"learning_rate": 4.997492670633987e-05,
"loss": 2.3563,
"step": 143
},
{
"epoch": 0.18485237483953787,
"grad_norm": 0.3398495614528656,
"learning_rate": 4.997374715269589e-05,
"loss": 2.4056,
"step": 144
},
{
"epoch": 0.18613607188703465,
"grad_norm": 0.3376169502735138,
"learning_rate": 4.9972540502081184e-05,
"loss": 2.3751,
"step": 145
},
{
"epoch": 0.18741976893453144,
"grad_norm": 0.33639830350875854,
"learning_rate": 4.9971306755804995e-05,
"loss": 2.432,
"step": 146
},
{
"epoch": 0.18870346598202825,
"grad_norm": 0.410265177488327,
"learning_rate": 4.9970045915205954e-05,
"loss": 2.3647,
"step": 147
},
{
"epoch": 0.18998716302952504,
"grad_norm": 0.31853362917900085,
"learning_rate": 4.99687579816521e-05,
"loss": 2.4224,
"step": 148
},
{
"epoch": 0.19127086007702182,
"grad_norm": 0.3495614230632782,
"learning_rate": 4.9967442956540863e-05,
"loss": 2.3961,
"step": 149
},
{
"epoch": 0.1925545571245186,
"grad_norm": 0.33301132917404175,
"learning_rate": 4.996610084129908e-05,
"loss": 2.359,
"step": 150
},
{
"epoch": 0.19383825417201542,
"grad_norm": 0.3186255395412445,
"learning_rate": 4.996473163738295e-05,
"loss": 2.4488,
"step": 151
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.3374113142490387,
"learning_rate": 4.996333534627809e-05,
"loss": 2.4019,
"step": 152
},
{
"epoch": 0.196405648267009,
"grad_norm": 0.33905014395713806,
"learning_rate": 4.996191196949952e-05,
"loss": 2.3272,
"step": 153
},
{
"epoch": 0.19768934531450577,
"grad_norm": 0.3546963036060333,
"learning_rate": 4.996046150859161e-05,
"loss": 2.4338,
"step": 154
},
{
"epoch": 0.19897304236200256,
"grad_norm": 0.35965755581855774,
"learning_rate": 4.9958983965128145e-05,
"loss": 2.3634,
"step": 155
},
{
"epoch": 0.20025673940949937,
"grad_norm": 0.3362196385860443,
"learning_rate": 4.995747934071229e-05,
"loss": 2.4457,
"step": 156
},
{
"epoch": 0.20154043645699615,
"grad_norm": 0.37214434146881104,
"learning_rate": 4.995594763697657e-05,
"loss": 2.3714,
"step": 157
},
{
"epoch": 0.20282413350449294,
"grad_norm": 0.3538600206375122,
"learning_rate": 4.995438885558294e-05,
"loss": 2.4338,
"step": 158
},
{
"epoch": 0.20410783055198972,
"grad_norm": 0.3853035569190979,
"learning_rate": 4.995280299822268e-05,
"loss": 2.4036,
"step": 159
},
{
"epoch": 0.20539152759948653,
"grad_norm": 0.35037294030189514,
"learning_rate": 4.9951190066616495e-05,
"loss": 2.3846,
"step": 160
},
{
"epoch": 0.20667522464698332,
"grad_norm": 0.3986567556858063,
"learning_rate": 4.994955006251443e-05,
"loss": 2.4405,
"step": 161
},
{
"epoch": 0.2079589216944801,
"grad_norm": 0.385637491941452,
"learning_rate": 4.994788298769593e-05,
"loss": 2.3237,
"step": 162
},
{
"epoch": 0.2092426187419769,
"grad_norm": 0.34437429904937744,
"learning_rate": 4.994618884396979e-05,
"loss": 2.3477,
"step": 163
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.34156107902526855,
"learning_rate": 4.99444676331742e-05,
"loss": 2.3576,
"step": 164
},
{
"epoch": 0.21181001283697048,
"grad_norm": 0.33888015151023865,
"learning_rate": 4.99427193571767e-05,
"loss": 2.311,
"step": 165
},
{
"epoch": 0.21309370988446727,
"grad_norm": 0.34711146354675293,
"learning_rate": 4.99409440178742e-05,
"loss": 2.293,
"step": 166
},
{
"epoch": 0.21437740693196405,
"grad_norm": 0.30586686730384827,
"learning_rate": 4.993914161719297e-05,
"loss": 2.479,
"step": 167
},
{
"epoch": 0.21566110397946084,
"grad_norm": 0.4145469069480896,
"learning_rate": 4.993731215708866e-05,
"loss": 2.2754,
"step": 168
},
{
"epoch": 0.21694480102695765,
"grad_norm": 0.6243994235992432,
"learning_rate": 4.993545563954626e-05,
"loss": 2.3751,
"step": 169
},
{
"epoch": 0.21822849807445444,
"grad_norm": 0.3301653265953064,
"learning_rate": 4.993357206658011e-05,
"loss": 2.6124,
"step": 170
},
{
"epoch": 0.21951219512195122,
"grad_norm": 0.337147980928421,
"learning_rate": 4.993166144023396e-05,
"loss": 2.3946,
"step": 171
},
{
"epoch": 0.220795892169448,
"grad_norm": 0.3796294033527374,
"learning_rate": 4.9929723762580835e-05,
"loss": 2.3314,
"step": 172
},
{
"epoch": 0.2220795892169448,
"grad_norm": 0.4021480679512024,
"learning_rate": 4.9927759035723175e-05,
"loss": 2.3035,
"step": 173
},
{
"epoch": 0.2233632862644416,
"grad_norm": 0.3196207582950592,
"learning_rate": 4.992576726179274e-05,
"loss": 2.4047,
"step": 174
},
{
"epoch": 0.2246469833119384,
"grad_norm": 0.31279096007347107,
"learning_rate": 4.992374844295064e-05,
"loss": 2.4409,
"step": 175
},
{
"epoch": 0.22593068035943517,
"grad_norm": 0.3068607747554779,
"learning_rate": 4.992170258138732e-05,
"loss": 2.4109,
"step": 176
},
{
"epoch": 0.22721437740693196,
"grad_norm": 0.35363098978996277,
"learning_rate": 4.991962967932258e-05,
"loss": 2.4083,
"step": 177
},
{
"epoch": 0.22849807445442877,
"grad_norm": 0.33908936381340027,
"learning_rate": 4.9917529739005574e-05,
"loss": 2.4129,
"step": 178
},
{
"epoch": 0.22978177150192555,
"grad_norm": 0.32510805130004883,
"learning_rate": 4.991540276271476e-05,
"loss": 2.3163,
"step": 179
},
{
"epoch": 0.23106546854942234,
"grad_norm": 0.32639047503471375,
"learning_rate": 4.991324875275794e-05,
"loss": 2.3567,
"step": 180
},
{
"epoch": 0.23234916559691912,
"grad_norm": 0.3410406708717346,
"learning_rate": 4.991106771147227e-05,
"loss": 2.4137,
"step": 181
},
{
"epoch": 0.2336328626444159,
"grad_norm": 0.3334997892379761,
"learning_rate": 4.990885964122421e-05,
"loss": 2.3994,
"step": 182
},
{
"epoch": 0.23491655969191272,
"grad_norm": 0.3203848600387573,
"learning_rate": 4.990662454440956e-05,
"loss": 2.4259,
"step": 183
},
{
"epoch": 0.2362002567394095,
"grad_norm": 0.34999069571495056,
"learning_rate": 4.9904362423453446e-05,
"loss": 2.413,
"step": 184
},
{
"epoch": 0.2374839537869063,
"grad_norm": 0.3413456678390503,
"learning_rate": 4.990207328081029e-05,
"loss": 2.3749,
"step": 185
},
{
"epoch": 0.23876765083440307,
"grad_norm": 0.3702191114425659,
"learning_rate": 4.989975711896388e-05,
"loss": 2.4309,
"step": 186
},
{
"epoch": 0.24005134788189988,
"grad_norm": 0.3365226089954376,
"learning_rate": 4.989741394042727e-05,
"loss": 2.3708,
"step": 187
},
{
"epoch": 0.24133504492939667,
"grad_norm": 0.36596447229385376,
"learning_rate": 4.989504374774288e-05,
"loss": 2.4387,
"step": 188
},
{
"epoch": 0.24261874197689345,
"grad_norm": 0.3280293941497803,
"learning_rate": 4.9892646543482377e-05,
"loss": 2.498,
"step": 189
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.35989147424697876,
"learning_rate": 4.989022233024681e-05,
"loss": 2.4993,
"step": 190
},
{
"epoch": 0.24518613607188702,
"grad_norm": 0.3124522268772125,
"learning_rate": 4.988777111066646e-05,
"loss": 2.3186,
"step": 191
},
{
"epoch": 0.24646983311938384,
"grad_norm": 0.32323285937309265,
"learning_rate": 4.988529288740096e-05,
"loss": 2.3859,
"step": 192
},
{
"epoch": 0.24775353016688062,
"grad_norm": 0.33156561851501465,
"learning_rate": 4.988278766313922e-05,
"loss": 2.3271,
"step": 193
},
{
"epoch": 0.2490372272143774,
"grad_norm": 0.32494014501571655,
"learning_rate": 4.9880255440599476e-05,
"loss": 2.4023,
"step": 194
},
{
"epoch": 0.2503209242618742,
"grad_norm": 0.3906209468841553,
"learning_rate": 4.987769622252921e-05,
"loss": 2.4245,
"step": 195
},
{
"epoch": 0.251604621309371,
"grad_norm": 0.3735322952270508,
"learning_rate": 4.987511001170523e-05,
"loss": 2.4883,
"step": 196
},
{
"epoch": 0.25288831835686776,
"grad_norm": 0.30935361981391907,
"learning_rate": 4.987249681093362e-05,
"loss": 2.5118,
"step": 197
},
{
"epoch": 0.25417201540436457,
"grad_norm": 0.34933099150657654,
"learning_rate": 4.986985662304976e-05,
"loss": 2.3138,
"step": 198
},
{
"epoch": 0.2554557124518614,
"grad_norm": 0.3328215777873993,
"learning_rate": 4.9867189450918294e-05,
"loss": 2.4475,
"step": 199
},
{
"epoch": 0.25673940949935814,
"grad_norm": 0.3271300494670868,
"learning_rate": 4.986449529743314e-05,
"loss": 2.4156,
"step": 200
},
{
"epoch": 0.25802310654685495,
"grad_norm": 0.296889990568161,
"learning_rate": 4.9861774165517536e-05,
"loss": 2.4117,
"step": 201
},
{
"epoch": 0.2593068035943517,
"grad_norm": 0.31897205114364624,
"learning_rate": 4.9859026058123925e-05,
"loss": 2.4637,
"step": 202
},
{
"epoch": 0.2605905006418485,
"grad_norm": 0.320688933134079,
"learning_rate": 4.985625097823408e-05,
"loss": 2.3865,
"step": 203
},
{
"epoch": 0.26187419768934533,
"grad_norm": 0.29705896973609924,
"learning_rate": 4.985344892885899e-05,
"loss": 2.365,
"step": 204
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.32413363456726074,
"learning_rate": 4.985061991303895e-05,
"loss": 2.4379,
"step": 205
},
{
"epoch": 0.2644415917843389,
"grad_norm": 0.38111138343811035,
"learning_rate": 4.984776393384348e-05,
"loss": 2.4303,
"step": 206
},
{
"epoch": 0.26572528883183566,
"grad_norm": 0.34943342208862305,
"learning_rate": 4.984488099437138e-05,
"loss": 2.3508,
"step": 207
},
{
"epoch": 0.26700898587933247,
"grad_norm": 0.34694743156433105,
"learning_rate": 4.984197109775068e-05,
"loss": 2.4488,
"step": 208
},
{
"epoch": 0.2682926829268293,
"grad_norm": 0.3251892626285553,
"learning_rate": 4.983903424713868e-05,
"loss": 2.4187,
"step": 209
},
{
"epoch": 0.26957637997432604,
"grad_norm": 0.34391969442367554,
"learning_rate": 4.9836070445721924e-05,
"loss": 2.4723,
"step": 210
},
{
"epoch": 0.27086007702182285,
"grad_norm": 0.32182154059410095,
"learning_rate": 4.983307969671617e-05,
"loss": 2.4282,
"step": 211
},
{
"epoch": 0.27214377406931967,
"grad_norm": 0.3486088514328003,
"learning_rate": 4.983006200336645e-05,
"loss": 2.4411,
"step": 212
},
{
"epoch": 0.2734274711168164,
"grad_norm": 0.3152271807193756,
"learning_rate": 4.9827017368947e-05,
"loss": 2.3362,
"step": 213
},
{
"epoch": 0.27471116816431324,
"grad_norm": 0.30361658334732056,
"learning_rate": 4.982394579676133e-05,
"loss": 2.34,
"step": 214
},
{
"epoch": 0.27599486521181,
"grad_norm": 0.3163071274757385,
"learning_rate": 4.9820847290142135e-05,
"loss": 2.3996,
"step": 215
},
{
"epoch": 0.2772785622593068,
"grad_norm": 0.319831907749176,
"learning_rate": 4.981772185245135e-05,
"loss": 2.352,
"step": 216
},
{
"epoch": 0.2785622593068036,
"grad_norm": 0.34143316745758057,
"learning_rate": 4.981456948708014e-05,
"loss": 2.2837,
"step": 217
},
{
"epoch": 0.2798459563543004,
"grad_norm": 0.3066380023956299,
"learning_rate": 4.981139019744887e-05,
"loss": 2.3848,
"step": 218
},
{
"epoch": 0.2811296534017972,
"grad_norm": 0.32719743251800537,
"learning_rate": 4.9808183987007136e-05,
"loss": 2.3845,
"step": 219
},
{
"epoch": 0.28241335044929394,
"grad_norm": 0.384798526763916,
"learning_rate": 4.980495085923372e-05,
"loss": 2.3767,
"step": 220
},
{
"epoch": 0.28369704749679076,
"grad_norm": 0.4263782799243927,
"learning_rate": 4.980169081763665e-05,
"loss": 2.3342,
"step": 221
},
{
"epoch": 0.28498074454428757,
"grad_norm": 0.3271712064743042,
"learning_rate": 4.979840386575311e-05,
"loss": 2.4539,
"step": 222
},
{
"epoch": 0.2862644415917843,
"grad_norm": 0.35880088806152344,
"learning_rate": 4.97950900071495e-05,
"loss": 2.3488,
"step": 223
},
{
"epoch": 0.28754813863928114,
"grad_norm": 0.30669230222702026,
"learning_rate": 4.9791749245421434e-05,
"loss": 2.4084,
"step": 224
},
{
"epoch": 0.2888318356867779,
"grad_norm": 0.34398922324180603,
"learning_rate": 4.9788381584193684e-05,
"loss": 2.3226,
"step": 225
},
{
"epoch": 0.2901155327342747,
"grad_norm": 0.3699093163013458,
"learning_rate": 4.9784987027120236e-05,
"loss": 2.3608,
"step": 226
},
{
"epoch": 0.2913992297817715,
"grad_norm": 0.3445926010608673,
"learning_rate": 4.978156557788424e-05,
"loss": 2.3771,
"step": 227
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.33999142050743103,
"learning_rate": 4.977811724019802e-05,
"loss": 2.3541,
"step": 228
},
{
"epoch": 0.2939666238767651,
"grad_norm": 0.3054107427597046,
"learning_rate": 4.9774642017803106e-05,
"loss": 2.4764,
"step": 229
},
{
"epoch": 0.2952503209242619,
"grad_norm": 0.3072315454483032,
"learning_rate": 4.977113991447017e-05,
"loss": 2.3055,
"step": 230
},
{
"epoch": 0.29653401797175866,
"grad_norm": 0.3254982531070709,
"learning_rate": 4.9767610933999055e-05,
"loss": 2.4659,
"step": 231
},
{
"epoch": 0.29781771501925547,
"grad_norm": 0.2930049002170563,
"learning_rate": 4.976405508021877e-05,
"loss": 2.3313,
"step": 232
},
{
"epoch": 0.2991014120667522,
"grad_norm": 0.30119720101356506,
"learning_rate": 4.976047235698747e-05,
"loss": 2.2981,
"step": 233
},
{
"epoch": 0.30038510911424904,
"grad_norm": 0.30038225650787354,
"learning_rate": 4.9756862768192504e-05,
"loss": 2.2864,
"step": 234
},
{
"epoch": 0.30166880616174585,
"grad_norm": 0.29573819041252136,
"learning_rate": 4.975322631775032e-05,
"loss": 2.3508,
"step": 235
},
{
"epoch": 0.3029525032092426,
"grad_norm": 0.316169798374176,
"learning_rate": 4.9749563009606534e-05,
"loss": 2.3648,
"step": 236
},
{
"epoch": 0.3042362002567394,
"grad_norm": 0.3109273910522461,
"learning_rate": 4.9745872847735894e-05,
"loss": 2.5141,
"step": 237
},
{
"epoch": 0.3055198973042362,
"grad_norm": 0.34613820910453796,
"learning_rate": 4.974215583614232e-05,
"loss": 2.333,
"step": 238
},
{
"epoch": 0.306803594351733,
"grad_norm": 0.34924328327178955,
"learning_rate": 4.9738411978858814e-05,
"loss": 2.4158,
"step": 239
},
{
"epoch": 0.3080872913992298,
"grad_norm": 0.2984835207462311,
"learning_rate": 4.9734641279947535e-05,
"loss": 2.3817,
"step": 240
},
{
"epoch": 0.30937098844672656,
"grad_norm": 0.3265876770019531,
"learning_rate": 4.9730843743499764e-05,
"loss": 2.4202,
"step": 241
},
{
"epoch": 0.31065468549422337,
"grad_norm": 0.31205374002456665,
"learning_rate": 4.9727019373635895e-05,
"loss": 2.3492,
"step": 242
},
{
"epoch": 0.3119383825417201,
"grad_norm": 0.3619811534881592,
"learning_rate": 4.972316817450544e-05,
"loss": 2.481,
"step": 243
},
{
"epoch": 0.31322207958921694,
"grad_norm": 0.4126303791999817,
"learning_rate": 4.9719290150287026e-05,
"loss": 2.2388,
"step": 244
},
{
"epoch": 0.31450577663671375,
"grad_norm": 0.34099090099334717,
"learning_rate": 4.971538530518836e-05,
"loss": 2.4,
"step": 245
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.32491570711135864,
"learning_rate": 4.971145364344628e-05,
"loss": 2.3854,
"step": 246
},
{
"epoch": 0.3170731707317073,
"grad_norm": 0.340457022190094,
"learning_rate": 4.970749516932672e-05,
"loss": 2.4011,
"step": 247
},
{
"epoch": 0.31835686777920413,
"grad_norm": 0.33701422810554504,
"learning_rate": 4.97035098871247e-05,
"loss": 2.4188,
"step": 248
},
{
"epoch": 0.3196405648267009,
"grad_norm": 0.3307265341281891,
"learning_rate": 4.969949780116431e-05,
"loss": 2.4553,
"step": 249
},
{
"epoch": 0.3209242618741977,
"grad_norm": 0.38966116309165955,
"learning_rate": 4.969545891579873e-05,
"loss": 2.3657,
"step": 250
},
{
"epoch": 0.32220795892169446,
"grad_norm": 0.36796143651008606,
"learning_rate": 4.969139323541025e-05,
"loss": 2.3044,
"step": 251
},
{
"epoch": 0.32349165596919127,
"grad_norm": 0.32584547996520996,
"learning_rate": 4.968730076441017e-05,
"loss": 2.4064,
"step": 252
},
{
"epoch": 0.3247753530166881,
"grad_norm": 0.3360714316368103,
"learning_rate": 4.968318150723893e-05,
"loss": 2.4243,
"step": 253
},
{
"epoch": 0.32605905006418484,
"grad_norm": 0.3058249056339264,
"learning_rate": 4.9679035468365986e-05,
"loss": 2.3598,
"step": 254
},
{
"epoch": 0.32734274711168165,
"grad_norm": 0.32182779908180237,
"learning_rate": 4.9674862652289865e-05,
"loss": 2.3469,
"step": 255
},
{
"epoch": 0.3286264441591784,
"grad_norm": 0.39011824131011963,
"learning_rate": 4.967066306353816e-05,
"loss": 2.4215,
"step": 256
},
{
"epoch": 0.3299101412066752,
"grad_norm": 0.3215901553630829,
"learning_rate": 4.966643670666748e-05,
"loss": 2.396,
"step": 257
},
{
"epoch": 0.33119383825417203,
"grad_norm": 0.3239218592643738,
"learning_rate": 4.9662183586263514e-05,
"loss": 2.4271,
"step": 258
},
{
"epoch": 0.3324775353016688,
"grad_norm": 0.33229637145996094,
"learning_rate": 4.965790370694097e-05,
"loss": 2.4345,
"step": 259
},
{
"epoch": 0.3337612323491656,
"grad_norm": 0.36766308546066284,
"learning_rate": 4.9653597073343594e-05,
"loss": 2.4257,
"step": 260
},
{
"epoch": 0.33504492939666236,
"grad_norm": 0.33887600898742676,
"learning_rate": 4.964926369014417e-05,
"loss": 2.4391,
"step": 261
},
{
"epoch": 0.3363286264441592,
"grad_norm": 0.3224686086177826,
"learning_rate": 4.964490356204449e-05,
"loss": 2.3664,
"step": 262
},
{
"epoch": 0.337612323491656,
"grad_norm": 0.3371565341949463,
"learning_rate": 4.964051669377538e-05,
"loss": 2.4992,
"step": 263
},
{
"epoch": 0.33889602053915274,
"grad_norm": 0.3373253345489502,
"learning_rate": 4.963610309009665e-05,
"loss": 2.3509,
"step": 264
},
{
"epoch": 0.34017971758664955,
"grad_norm": 0.34519249200820923,
"learning_rate": 4.963166275579717e-05,
"loss": 2.4388,
"step": 265
},
{
"epoch": 0.34146341463414637,
"grad_norm": 0.3187866806983948,
"learning_rate": 4.9627195695694774e-05,
"loss": 2.4655,
"step": 266
},
{
"epoch": 0.3427471116816431,
"grad_norm": 0.31844937801361084,
"learning_rate": 4.962270191463629e-05,
"loss": 2.4097,
"step": 267
},
{
"epoch": 0.34403080872913994,
"grad_norm": 0.359625905752182,
"learning_rate": 4.9618181417497566e-05,
"loss": 2.4364,
"step": 268
},
{
"epoch": 0.3453145057766367,
"grad_norm": 0.3335789740085602,
"learning_rate": 4.961363420918342e-05,
"loss": 2.2316,
"step": 269
},
{
"epoch": 0.3465982028241335,
"grad_norm": 0.3209823668003082,
"learning_rate": 4.960906029462766e-05,
"loss": 2.4698,
"step": 270
},
{
"epoch": 0.3478818998716303,
"grad_norm": 0.3264525532722473,
"learning_rate": 4.960445967879307e-05,
"loss": 2.3222,
"step": 271
},
{
"epoch": 0.3491655969191271,
"grad_norm": 0.29652708768844604,
"learning_rate": 4.959983236667138e-05,
"loss": 2.3258,
"step": 272
},
{
"epoch": 0.3504492939666239,
"grad_norm": 0.35058003664016724,
"learning_rate": 4.959517836328333e-05,
"loss": 2.3009,
"step": 273
},
{
"epoch": 0.35173299101412064,
"grad_norm": 0.29893478751182556,
"learning_rate": 4.959049767367859e-05,
"loss": 2.463,
"step": 274
},
{
"epoch": 0.35301668806161746,
"grad_norm": 0.3114057183265686,
"learning_rate": 4.95857903029358e-05,
"loss": 2.2757,
"step": 275
},
{
"epoch": 0.35430038510911427,
"grad_norm": 0.3837164640426636,
"learning_rate": 4.958105625616253e-05,
"loss": 2.445,
"step": 276
},
{
"epoch": 0.355584082156611,
"grad_norm": 0.3195387125015259,
"learning_rate": 4.957629553849532e-05,
"loss": 2.4823,
"step": 277
},
{
"epoch": 0.35686777920410784,
"grad_norm": 0.37014302611351013,
"learning_rate": 4.957150815509963e-05,
"loss": 2.3322,
"step": 278
},
{
"epoch": 0.3581514762516046,
"grad_norm": 0.28935369849205017,
"learning_rate": 4.9566694111169853e-05,
"loss": 2.4336,
"step": 279
},
{
"epoch": 0.3594351732991014,
"grad_norm": 0.3155868351459503,
"learning_rate": 4.956185341192933e-05,
"loss": 2.4691,
"step": 280
},
{
"epoch": 0.3607188703465982,
"grad_norm": 0.33008038997650146,
"learning_rate": 4.955698606263028e-05,
"loss": 2.3331,
"step": 281
},
{
"epoch": 0.362002567394095,
"grad_norm": 0.3546096086502075,
"learning_rate": 4.95520920685539e-05,
"loss": 2.4626,
"step": 282
},
{
"epoch": 0.3632862644415918,
"grad_norm": 0.3569015562534332,
"learning_rate": 4.954717143501024e-05,
"loss": 2.4165,
"step": 283
},
{
"epoch": 0.3645699614890886,
"grad_norm": 0.2979843318462372,
"learning_rate": 4.954222416733829e-05,
"loss": 2.3322,
"step": 284
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.43857890367507935,
"learning_rate": 4.953725027090591e-05,
"loss": 2.3168,
"step": 285
},
{
"epoch": 0.36713735558408217,
"grad_norm": 0.32624951004981995,
"learning_rate": 4.953224975110988e-05,
"loss": 2.3685,
"step": 286
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.3195008933544159,
"learning_rate": 4.9527222613375855e-05,
"loss": 2.4137,
"step": 287
},
{
"epoch": 0.36970474967907574,
"grad_norm": 0.362567663192749,
"learning_rate": 4.952216886315837e-05,
"loss": 2.2978,
"step": 288
},
{
"epoch": 0.37098844672657255,
"grad_norm": 0.31306540966033936,
"learning_rate": 4.951708850594083e-05,
"loss": 2.4381,
"step": 289
},
{
"epoch": 0.3722721437740693,
"grad_norm": 0.3275150954723358,
"learning_rate": 4.951198154723552e-05,
"loss": 2.3528,
"step": 290
},
{
"epoch": 0.3735558408215661,
"grad_norm": 0.32541584968566895,
"learning_rate": 4.9506847992583586e-05,
"loss": 2.5023,
"step": 291
},
{
"epoch": 0.3748395378690629,
"grad_norm": 0.3278298079967499,
"learning_rate": 4.9501687847555016e-05,
"loss": 2.3745,
"step": 292
},
{
"epoch": 0.3761232349165597,
"grad_norm": 0.3163221776485443,
"learning_rate": 4.949650111774868e-05,
"loss": 2.3451,
"step": 293
},
{
"epoch": 0.3774069319640565,
"grad_norm": 0.30997052788734436,
"learning_rate": 4.9491287808792265e-05,
"loss": 2.3976,
"step": 294
},
{
"epoch": 0.37869062901155326,
"grad_norm": 0.3496086597442627,
"learning_rate": 4.948604792634229e-05,
"loss": 2.4361,
"step": 295
},
{
"epoch": 0.37997432605905007,
"grad_norm": 0.3483693599700928,
"learning_rate": 4.948078147608416e-05,
"loss": 2.4022,
"step": 296
},
{
"epoch": 0.38125802310654683,
"grad_norm": 0.3135087490081787,
"learning_rate": 4.947548846373204e-05,
"loss": 2.3585,
"step": 297
},
{
"epoch": 0.38254172015404364,
"grad_norm": 0.323482871055603,
"learning_rate": 4.947016889502895e-05,
"loss": 2.3958,
"step": 298
},
{
"epoch": 0.38382541720154045,
"grad_norm": 0.3020581901073456,
"learning_rate": 4.946482277574673e-05,
"loss": 2.2809,
"step": 299
},
{
"epoch": 0.3851091142490372,
"grad_norm": 0.31288352608680725,
"learning_rate": 4.9459450111686e-05,
"loss": 2.3526,
"step": 300
},
{
"epoch": 0.386392811296534,
"grad_norm": 0.31495004892349243,
"learning_rate": 4.945405090867621e-05,
"loss": 2.4462,
"step": 301
},
{
"epoch": 0.38767650834403083,
"grad_norm": 0.3513999581336975,
"learning_rate": 4.94486251725756e-05,
"loss": 2.3795,
"step": 302
},
{
"epoch": 0.3889602053915276,
"grad_norm": 0.33559510111808777,
"learning_rate": 4.944317290927117e-05,
"loss": 2.3002,
"step": 303
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.3153408467769623,
"learning_rate": 4.943769412467875e-05,
"loss": 2.437,
"step": 304
},
{
"epoch": 0.39152759948652116,
"grad_norm": 0.3177226781845093,
"learning_rate": 4.943218882474291e-05,
"loss": 2.3447,
"step": 305
},
{
"epoch": 0.392811296534018,
"grad_norm": 0.3815303146839142,
"learning_rate": 4.9426657015436994e-05,
"loss": 2.3584,
"step": 306
},
{
"epoch": 0.3940949935815148,
"grad_norm": 0.34336403012275696,
"learning_rate": 4.9421098702763126e-05,
"loss": 2.2979,
"step": 307
},
{
"epoch": 0.39537869062901154,
"grad_norm": 0.33281055092811584,
"learning_rate": 4.941551389275217e-05,
"loss": 2.4152,
"step": 308
},
{
"epoch": 0.39666238767650835,
"grad_norm": 0.3309512138366699,
"learning_rate": 4.9409902591463756e-05,
"loss": 2.361,
"step": 309
},
{
"epoch": 0.3979460847240051,
"grad_norm": 0.3044840097427368,
"learning_rate": 4.940426480498623e-05,
"loss": 2.3172,
"step": 310
},
{
"epoch": 0.3992297817715019,
"grad_norm": 0.32050418853759766,
"learning_rate": 4.939860053943671e-05,
"loss": 2.374,
"step": 311
},
{
"epoch": 0.40051347881899874,
"grad_norm": 0.36678293347358704,
"learning_rate": 4.939290980096103e-05,
"loss": 2.3529,
"step": 312
},
{
"epoch": 0.4017971758664955,
"grad_norm": 0.32348355650901794,
"learning_rate": 4.9387192595733734e-05,
"loss": 2.4048,
"step": 313
},
{
"epoch": 0.4030808729139923,
"grad_norm": 0.3086166977882385,
"learning_rate": 4.938144892995809e-05,
"loss": 2.4091,
"step": 314
},
{
"epoch": 0.40436456996148906,
"grad_norm": 0.31869277358055115,
"learning_rate": 4.937567880986609e-05,
"loss": 2.3594,
"step": 315
},
{
"epoch": 0.4056482670089859,
"grad_norm": 0.317147821187973,
"learning_rate": 4.936988224171842e-05,
"loss": 2.2666,
"step": 316
},
{
"epoch": 0.4069319640564827,
"grad_norm": 0.32587262988090515,
"learning_rate": 4.936405923180446e-05,
"loss": 2.317,
"step": 317
},
{
"epoch": 0.40821566110397944,
"grad_norm": 0.3181535005569458,
"learning_rate": 4.935820978644228e-05,
"loss": 2.3611,
"step": 318
},
{
"epoch": 0.40949935815147626,
"grad_norm": 0.31699231266975403,
"learning_rate": 4.9352333911978625e-05,
"loss": 2.4676,
"step": 319
},
{
"epoch": 0.41078305519897307,
"grad_norm": 0.35386183857917786,
"learning_rate": 4.9346431614788945e-05,
"loss": 2.3525,
"step": 320
},
{
"epoch": 0.4120667522464698,
"grad_norm": 0.3134053647518158,
"learning_rate": 4.934050290127733e-05,
"loss": 2.4045,
"step": 321
},
{
"epoch": 0.41335044929396664,
"grad_norm": 0.3209103047847748,
"learning_rate": 4.933454777787654e-05,
"loss": 2.3606,
"step": 322
},
{
"epoch": 0.4146341463414634,
"grad_norm": 0.3191809356212616,
"learning_rate": 4.9328566251048e-05,
"loss": 2.4551,
"step": 323
},
{
"epoch": 0.4159178433889602,
"grad_norm": 0.2983877658843994,
"learning_rate": 4.9322558327281773e-05,
"loss": 2.3299,
"step": 324
},
{
"epoch": 0.417201540436457,
"grad_norm": 0.3376917541027069,
"learning_rate": 4.931652401309655e-05,
"loss": 2.3599,
"step": 325
},
{
"epoch": 0.4184852374839538,
"grad_norm": 0.36222752928733826,
"learning_rate": 4.93104633150397e-05,
"loss": 2.4432,
"step": 326
},
{
"epoch": 0.4197689345314506,
"grad_norm": 0.2962850034236908,
"learning_rate": 4.930437623968718e-05,
"loss": 2.3965,
"step": 327
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.32738175988197327,
"learning_rate": 4.929826279364357e-05,
"loss": 2.3276,
"step": 328
},
{
"epoch": 0.42233632862644416,
"grad_norm": 0.3119535446166992,
"learning_rate": 4.929212298354207e-05,
"loss": 2.3639,
"step": 329
},
{
"epoch": 0.42362002567394097,
"grad_norm": 0.3193548321723938,
"learning_rate": 4.9285956816044486e-05,
"loss": 2.4468,
"step": 330
},
{
"epoch": 0.4249037227214377,
"grad_norm": 0.32102808356285095,
"learning_rate": 4.927976429784124e-05,
"loss": 2.422,
"step": 331
},
{
"epoch": 0.42618741976893454,
"grad_norm": 0.3115790784358978,
"learning_rate": 4.92735454356513e-05,
"loss": 2.2948,
"step": 332
},
{
"epoch": 0.4274711168164313,
"grad_norm": 0.320351243019104,
"learning_rate": 4.926730023622227e-05,
"loss": 2.3466,
"step": 333
},
{
"epoch": 0.4287548138639281,
"grad_norm": 0.30213356018066406,
"learning_rate": 4.926102870633029e-05,
"loss": 2.3773,
"step": 334
},
{
"epoch": 0.4300385109114249,
"grad_norm": 0.3080739676952362,
"learning_rate": 4.925473085278008e-05,
"loss": 2.3822,
"step": 335
},
{
"epoch": 0.4313222079589217,
"grad_norm": 0.3327421545982361,
"learning_rate": 4.924840668240495e-05,
"loss": 2.4326,
"step": 336
},
{
"epoch": 0.4326059050064185,
"grad_norm": 0.34913370013237,
"learning_rate": 4.924205620206671e-05,
"loss": 2.4491,
"step": 337
},
{
"epoch": 0.4338896020539153,
"grad_norm": 0.3061310946941376,
"learning_rate": 4.923567941865577e-05,
"loss": 2.3727,
"step": 338
},
{
"epoch": 0.43517329910141206,
"grad_norm": 0.3259632885456085,
"learning_rate": 4.9229276339091034e-05,
"loss": 2.3775,
"step": 339
},
{
"epoch": 0.43645699614890887,
"grad_norm": 0.29085099697113037,
"learning_rate": 4.922284697031999e-05,
"loss": 2.3427,
"step": 340
},
{
"epoch": 0.43774069319640563,
"grad_norm": 0.30840590596199036,
"learning_rate": 4.921639131931859e-05,
"loss": 2.367,
"step": 341
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.32114389538764954,
"learning_rate": 4.920990939309135e-05,
"loss": 2.4625,
"step": 342
},
{
"epoch": 0.44030808729139925,
"grad_norm": 0.3091549873352051,
"learning_rate": 4.920340119867127e-05,
"loss": 2.3571,
"step": 343
},
{
"epoch": 0.441591784338896,
"grad_norm": 0.2978641092777252,
"learning_rate": 4.919686674311987e-05,
"loss": 2.3033,
"step": 344
},
{
"epoch": 0.4428754813863928,
"grad_norm": 0.33106520771980286,
"learning_rate": 4.919030603352715e-05,
"loss": 2.3276,
"step": 345
},
{
"epoch": 0.4441591784338896,
"grad_norm": 0.42283540964126587,
"learning_rate": 4.918371907701159e-05,
"loss": 2.3563,
"step": 346
},
{
"epoch": 0.4454428754813864,
"grad_norm": 0.32727953791618347,
"learning_rate": 4.9177105880720173e-05,
"loss": 2.3928,
"step": 347
},
{
"epoch": 0.4467265725288832,
"grad_norm": 0.3086586892604828,
"learning_rate": 4.9170466451828326e-05,
"loss": 2.2707,
"step": 348
},
{
"epoch": 0.44801026957637996,
"grad_norm": 0.28993648290634155,
"learning_rate": 4.916380079753995e-05,
"loss": 2.3582,
"step": 349
},
{
"epoch": 0.4492939666238768,
"grad_norm": 0.3154331147670746,
"learning_rate": 4.9157108925087405e-05,
"loss": 2.4003,
"step": 350
},
{
"epoch": 0.45057766367137353,
"grad_norm": 0.3312948942184448,
"learning_rate": 4.9150390841731485e-05,
"loss": 2.3955,
"step": 351
},
{
"epoch": 0.45186136071887034,
"grad_norm": 0.30381128191947937,
"learning_rate": 4.914364655476146e-05,
"loss": 2.3737,
"step": 352
},
{
"epoch": 0.45314505776636715,
"grad_norm": 0.30618736147880554,
"learning_rate": 4.9136876071494976e-05,
"loss": 2.3602,
"step": 353
},
{
"epoch": 0.4544287548138639,
"grad_norm": 0.3091343343257904,
"learning_rate": 4.913007939927814e-05,
"loss": 2.4564,
"step": 354
},
{
"epoch": 0.4557124518613607,
"grad_norm": 0.311162531375885,
"learning_rate": 4.912325654548546e-05,
"loss": 2.3133,
"step": 355
},
{
"epoch": 0.45699614890885754,
"grad_norm": 0.33965715765953064,
"learning_rate": 4.911640751751988e-05,
"loss": 2.4312,
"step": 356
},
{
"epoch": 0.4582798459563543,
"grad_norm": 0.3162749111652374,
"learning_rate": 4.910953232281269e-05,
"loss": 2.3438,
"step": 357
},
{
"epoch": 0.4595635430038511,
"grad_norm": 0.32984068989753723,
"learning_rate": 4.910263096882362e-05,
"loss": 2.3823,
"step": 358
},
{
"epoch": 0.46084724005134786,
"grad_norm": 0.30816513299942017,
"learning_rate": 4.909570346304076e-05,
"loss": 2.445,
"step": 359
},
{
"epoch": 0.4621309370988447,
"grad_norm": 0.2928940951824188,
"learning_rate": 4.908874981298057e-05,
"loss": 2.4538,
"step": 360
},
{
"epoch": 0.4634146341463415,
"grad_norm": 0.32563963532447815,
"learning_rate": 4.9081770026187914e-05,
"loss": 2.292,
"step": 361
},
{
"epoch": 0.46469833119383824,
"grad_norm": 0.29584982991218567,
"learning_rate": 4.907476411023596e-05,
"loss": 2.4046,
"step": 362
},
{
"epoch": 0.46598202824133506,
"grad_norm": 0.3291500210762024,
"learning_rate": 4.906773207272626e-05,
"loss": 2.3186,
"step": 363
},
{
"epoch": 0.4672657252888318,
"grad_norm": 0.3066290020942688,
"learning_rate": 4.9060673921288716e-05,
"loss": 2.3266,
"step": 364
},
{
"epoch": 0.4685494223363286,
"grad_norm": 0.3005695939064026,
"learning_rate": 4.905358966358153e-05,
"loss": 2.3712,
"step": 365
},
{
"epoch": 0.46983311938382544,
"grad_norm": 0.32391899824142456,
"learning_rate": 4.904647930729128e-05,
"loss": 2.3514,
"step": 366
},
{
"epoch": 0.4711168164313222,
"grad_norm": 0.2915700078010559,
"learning_rate": 4.903934286013281e-05,
"loss": 2.4646,
"step": 367
},
{
"epoch": 0.472400513478819,
"grad_norm": 0.34920045733451843,
"learning_rate": 4.90321803298493e-05,
"loss": 2.3287,
"step": 368
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.2955242395401001,
"learning_rate": 4.902499172421222e-05,
"loss": 2.3654,
"step": 369
},
{
"epoch": 0.4749679075738126,
"grad_norm": 0.3305751085281372,
"learning_rate": 4.901777705102135e-05,
"loss": 2.4419,
"step": 370
},
{
"epoch": 0.4762516046213094,
"grad_norm": 0.30396509170532227,
"learning_rate": 4.9010536318104734e-05,
"loss": 2.3576,
"step": 371
},
{
"epoch": 0.47753530166880614,
"grad_norm": 0.2961418330669403,
"learning_rate": 4.9003269533318704e-05,
"loss": 2.3039,
"step": 372
},
{
"epoch": 0.47881899871630296,
"grad_norm": 0.35183992981910706,
"learning_rate": 4.899597670454785e-05,
"loss": 2.3333,
"step": 373
},
{
"epoch": 0.48010269576379977,
"grad_norm": 0.297158420085907,
"learning_rate": 4.8988657839705024e-05,
"loss": 2.3514,
"step": 374
},
{
"epoch": 0.4813863928112965,
"grad_norm": 0.3379060626029968,
"learning_rate": 4.8981312946731325e-05,
"loss": 2.2811,
"step": 375
},
{
"epoch": 0.48267008985879334,
"grad_norm": 0.34060969948768616,
"learning_rate": 4.897394203359611e-05,
"loss": 2.4123,
"step": 376
},
{
"epoch": 0.4839537869062901,
"grad_norm": 0.3308861255645752,
"learning_rate": 4.896654510829694e-05,
"loss": 2.4262,
"step": 377
},
{
"epoch": 0.4852374839537869,
"grad_norm": 0.3043263852596283,
"learning_rate": 4.8959122178859616e-05,
"loss": 2.4291,
"step": 378
},
{
"epoch": 0.4865211810012837,
"grad_norm": 0.32594412565231323,
"learning_rate": 4.8951673253338156e-05,
"loss": 2.3679,
"step": 379
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.3090050220489502,
"learning_rate": 4.894419833981478e-05,
"loss": 2.3933,
"step": 380
},
{
"epoch": 0.4890885750962773,
"grad_norm": 0.3234407901763916,
"learning_rate": 4.8936697446399896e-05,
"loss": 2.3156,
"step": 381
},
{
"epoch": 0.49037227214377405,
"grad_norm": 0.3294239938259125,
"learning_rate": 4.892917058123212e-05,
"loss": 2.357,
"step": 382
},
{
"epoch": 0.49165596919127086,
"grad_norm": 0.33215636014938354,
"learning_rate": 4.8921617752478235e-05,
"loss": 2.4386,
"step": 383
},
{
"epoch": 0.49293966623876767,
"grad_norm": 0.30511781573295593,
"learning_rate": 4.89140389683332e-05,
"loss": 2.4283,
"step": 384
},
{
"epoch": 0.4942233632862644,
"grad_norm": 0.31070634722709656,
"learning_rate": 4.890643423702013e-05,
"loss": 2.3565,
"step": 385
},
{
"epoch": 0.49550706033376124,
"grad_norm": 0.3108975291252136,
"learning_rate": 4.8898803566790296e-05,
"loss": 2.3209,
"step": 386
},
{
"epoch": 0.496790757381258,
"grad_norm": 0.31348085403442383,
"learning_rate": 4.889114696592312e-05,
"loss": 2.3707,
"step": 387
},
{
"epoch": 0.4980744544287548,
"grad_norm": 0.3120376467704773,
"learning_rate": 4.8883464442726146e-05,
"loss": 2.3743,
"step": 388
},
{
"epoch": 0.4993581514762516,
"grad_norm": 0.3120869994163513,
"learning_rate": 4.887575600553506e-05,
"loss": 2.3501,
"step": 389
},
{
"epoch": 0.5006418485237484,
"grad_norm": 0.28657200932502747,
"learning_rate": 4.886802166271364e-05,
"loss": 2.365,
"step": 390
},
{
"epoch": 0.5019255455712451,
"grad_norm": 0.3038496971130371,
"learning_rate": 4.886026142265381e-05,
"loss": 2.3851,
"step": 391
},
{
"epoch": 0.503209242618742,
"grad_norm": 0.3245856761932373,
"learning_rate": 4.885247529377557e-05,
"loss": 2.3996,
"step": 392
},
{
"epoch": 0.5044929396662388,
"grad_norm": 0.2905268669128418,
"learning_rate": 4.8844663284526995e-05,
"loss": 2.4106,
"step": 393
},
{
"epoch": 0.5057766367137355,
"grad_norm": 0.3151610195636749,
"learning_rate": 4.883682540338428e-05,
"loss": 2.3665,
"step": 394
},
{
"epoch": 0.5070603337612324,
"grad_norm": 0.3045148253440857,
"learning_rate": 4.8828961658851645e-05,
"loss": 2.3244,
"step": 395
},
{
"epoch": 0.5083440308087291,
"grad_norm": 0.33118733763694763,
"learning_rate": 4.882107205946142e-05,
"loss": 2.3622,
"step": 396
},
{
"epoch": 0.5096277278562259,
"grad_norm": 0.3078731596469879,
"learning_rate": 4.881315661377393e-05,
"loss": 2.325,
"step": 397
},
{
"epoch": 0.5109114249037228,
"grad_norm": 0.33527669310569763,
"learning_rate": 4.880521533037762e-05,
"loss": 2.4234,
"step": 398
},
{
"epoch": 0.5121951219512195,
"grad_norm": 0.32107025384902954,
"learning_rate": 4.879724821788889e-05,
"loss": 2.2833,
"step": 399
},
{
"epoch": 0.5134788189987163,
"grad_norm": 0.31819841265678406,
"learning_rate": 4.878925528495223e-05,
"loss": 2.4178,
"step": 400
},
{
"epoch": 0.5147625160462131,
"grad_norm": 0.31981033086776733,
"learning_rate": 4.8781236540240106e-05,
"loss": 2.4154,
"step": 401
},
{
"epoch": 0.5160462130937099,
"grad_norm": 0.39689508080482483,
"learning_rate": 4.8773191992453e-05,
"loss": 2.3759,
"step": 402
},
{
"epoch": 0.5173299101412067,
"grad_norm": 0.3065798282623291,
"learning_rate": 4.876512165031939e-05,
"loss": 2.4322,
"step": 403
},
{
"epoch": 0.5186136071887034,
"grad_norm": 0.3112392723560333,
"learning_rate": 4.8757025522595756e-05,
"loss": 2.3008,
"step": 404
},
{
"epoch": 0.5198973042362003,
"grad_norm": 0.3170951008796692,
"learning_rate": 4.874890361806654e-05,
"loss": 2.3265,
"step": 405
},
{
"epoch": 0.521181001283697,
"grad_norm": 0.32137608528137207,
"learning_rate": 4.8740755945544156e-05,
"loss": 2.4448,
"step": 406
},
{
"epoch": 0.5224646983311938,
"grad_norm": 0.32462355494499207,
"learning_rate": 4.873258251386897e-05,
"loss": 2.3504,
"step": 407
},
{
"epoch": 0.5237483953786907,
"grad_norm": 0.32057517766952515,
"learning_rate": 4.872438333190931e-05,
"loss": 2.4223,
"step": 408
},
{
"epoch": 0.5250320924261874,
"grad_norm": 0.3375895917415619,
"learning_rate": 4.871615840856144e-05,
"loss": 2.3912,
"step": 409
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.29747098684310913,
"learning_rate": 4.870790775274955e-05,
"loss": 2.3684,
"step": 410
},
{
"epoch": 0.527599486521181,
"grad_norm": 0.31433284282684326,
"learning_rate": 4.8699631373425746e-05,
"loss": 2.4619,
"step": 411
},
{
"epoch": 0.5288831835686778,
"grad_norm": 0.31316566467285156,
"learning_rate": 4.869132927957007e-05,
"loss": 2.45,
"step": 412
},
{
"epoch": 0.5301668806161746,
"grad_norm": 0.29028958082199097,
"learning_rate": 4.8683001480190415e-05,
"loss": 2.3539,
"step": 413
},
{
"epoch": 0.5314505776636713,
"grad_norm": 0.32501766085624695,
"learning_rate": 4.867464798432262e-05,
"loss": 2.4104,
"step": 414
},
{
"epoch": 0.5327342747111682,
"grad_norm": 0.31315121054649353,
"learning_rate": 4.866626880103036e-05,
"loss": 2.4346,
"step": 415
},
{
"epoch": 0.5340179717586649,
"grad_norm": 0.30741140246391296,
"learning_rate": 4.865786393940522e-05,
"loss": 2.3253,
"step": 416
},
{
"epoch": 0.5353016688061617,
"grad_norm": 0.3042595684528351,
"learning_rate": 4.8649433408566626e-05,
"loss": 2.4107,
"step": 417
},
{
"epoch": 0.5365853658536586,
"grad_norm": 0.3254822790622711,
"learning_rate": 4.864097721766184e-05,
"loss": 2.368,
"step": 418
},
{
"epoch": 0.5378690629011553,
"grad_norm": 0.3164546489715576,
"learning_rate": 4.8632495375866004e-05,
"loss": 2.3493,
"step": 419
},
{
"epoch": 0.5391527599486521,
"grad_norm": 0.30588898062705994,
"learning_rate": 4.8623987892382066e-05,
"loss": 2.3751,
"step": 420
},
{
"epoch": 0.540436456996149,
"grad_norm": 0.3062260150909424,
"learning_rate": 4.861545477644079e-05,
"loss": 2.3137,
"step": 421
},
{
"epoch": 0.5417201540436457,
"grad_norm": 0.3363398611545563,
"learning_rate": 4.8606896037300776e-05,
"loss": 2.3326,
"step": 422
},
{
"epoch": 0.5430038510911425,
"grad_norm": 0.33340004086494446,
"learning_rate": 4.8598311684248396e-05,
"loss": 2.3511,
"step": 423
},
{
"epoch": 0.5442875481386393,
"grad_norm": 0.2808108329772949,
"learning_rate": 4.858970172659785e-05,
"loss": 2.4589,
"step": 424
},
{
"epoch": 0.5455712451861361,
"grad_norm": 0.299396276473999,
"learning_rate": 4.8581066173691074e-05,
"loss": 2.3481,
"step": 425
},
{
"epoch": 0.5468549422336328,
"grad_norm": 0.3083891272544861,
"learning_rate": 4.857240503489782e-05,
"loss": 2.2812,
"step": 426
},
{
"epoch": 0.5481386392811296,
"grad_norm": 0.3224524259567261,
"learning_rate": 4.8563718319615584e-05,
"loss": 2.2938,
"step": 427
},
{
"epoch": 0.5494223363286265,
"grad_norm": 0.2835102379322052,
"learning_rate": 4.8555006037269594e-05,
"loss": 2.3401,
"step": 428
},
{
"epoch": 0.5507060333761232,
"grad_norm": 0.28240859508514404,
"learning_rate": 4.854626819731284e-05,
"loss": 2.4238,
"step": 429
},
{
"epoch": 0.55198973042362,
"grad_norm": 0.28541532158851624,
"learning_rate": 4.853750480922604e-05,
"loss": 2.36,
"step": 430
},
{
"epoch": 0.5532734274711169,
"grad_norm": 0.3530977666378021,
"learning_rate": 4.852871588251763e-05,
"loss": 2.3936,
"step": 431
},
{
"epoch": 0.5545571245186136,
"grad_norm": 0.3233661949634552,
"learning_rate": 4.851990142672376e-05,
"loss": 2.3921,
"step": 432
},
{
"epoch": 0.5558408215661104,
"grad_norm": 0.34146299958229065,
"learning_rate": 4.851106145140827e-05,
"loss": 2.4632,
"step": 433
},
{
"epoch": 0.5571245186136072,
"grad_norm": 0.3273311257362366,
"learning_rate": 4.8502195966162694e-05,
"loss": 2.3328,
"step": 434
},
{
"epoch": 0.558408215661104,
"grad_norm": 0.29606807231903076,
"learning_rate": 4.8493304980606245e-05,
"loss": 2.3077,
"step": 435
},
{
"epoch": 0.5596919127086007,
"grad_norm": 0.2943183481693268,
"learning_rate": 4.8484388504385806e-05,
"loss": 2.3786,
"step": 436
},
{
"epoch": 0.5609756097560976,
"grad_norm": 0.29871752858161926,
"learning_rate": 4.847544654717592e-05,
"loss": 2.3262,
"step": 437
},
{
"epoch": 0.5622593068035944,
"grad_norm": 0.3182483911514282,
"learning_rate": 4.8466479118678766e-05,
"loss": 2.3529,
"step": 438
},
{
"epoch": 0.5635430038510911,
"grad_norm": 0.34097424149513245,
"learning_rate": 4.845748622862417e-05,
"loss": 2.3279,
"step": 439
},
{
"epoch": 0.5648267008985879,
"grad_norm": 0.295391321182251,
"learning_rate": 4.8448467886769585e-05,
"loss": 2.3059,
"step": 440
},
{
"epoch": 0.5661103979460848,
"grad_norm": 0.2817283868789673,
"learning_rate": 4.8439424102900064e-05,
"loss": 2.3056,
"step": 441
},
{
"epoch": 0.5673940949935815,
"grad_norm": 0.29399430751800537,
"learning_rate": 4.84303548868283e-05,
"loss": 2.4049,
"step": 442
},
{
"epoch": 0.5686777920410783,
"grad_norm": 0.32070907950401306,
"learning_rate": 4.842126024839453e-05,
"loss": 2.3548,
"step": 443
},
{
"epoch": 0.5699614890885751,
"grad_norm": 0.2972594201564789,
"learning_rate": 4.841214019746663e-05,
"loss": 2.4239,
"step": 444
},
{
"epoch": 0.5712451861360719,
"grad_norm": 0.3162088096141815,
"learning_rate": 4.840299474394e-05,
"loss": 2.4425,
"step": 445
},
{
"epoch": 0.5725288831835686,
"grad_norm": 0.2938075661659241,
"learning_rate": 4.8393823897737634e-05,
"loss": 2.3283,
"step": 446
},
{
"epoch": 0.5738125802310655,
"grad_norm": 0.2970564067363739,
"learning_rate": 4.8384627668810064e-05,
"loss": 2.3184,
"step": 447
},
{
"epoch": 0.5750962772785623,
"grad_norm": 0.2736704647541046,
"learning_rate": 4.837540606713538e-05,
"loss": 2.3115,
"step": 448
},
{
"epoch": 0.576379974326059,
"grad_norm": 0.3252422511577606,
"learning_rate": 4.8366159102719156e-05,
"loss": 2.3955,
"step": 449
},
{
"epoch": 0.5776636713735558,
"grad_norm": 0.3110811114311218,
"learning_rate": 4.8356886785594544e-05,
"loss": 2.3336,
"step": 450
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.2989579141139984,
"learning_rate": 4.834758912582217e-05,
"loss": 2.3001,
"step": 451
},
{
"epoch": 0.5802310654685494,
"grad_norm": 0.3055441379547119,
"learning_rate": 4.833826613349016e-05,
"loss": 2.3683,
"step": 452
},
{
"epoch": 0.5815147625160462,
"grad_norm": 0.30826109647750854,
"learning_rate": 4.832891781871414e-05,
"loss": 2.3054,
"step": 453
},
{
"epoch": 0.582798459563543,
"grad_norm": 0.32700634002685547,
"learning_rate": 4.831954419163719e-05,
"loss": 2.3865,
"step": 454
},
{
"epoch": 0.5840821566110398,
"grad_norm": 0.30788737535476685,
"learning_rate": 4.831014526242987e-05,
"loss": 2.3478,
"step": 455
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.28584685921669006,
"learning_rate": 4.8300721041290194e-05,
"loss": 2.356,
"step": 456
},
{
"epoch": 0.5866495507060334,
"grad_norm": 0.3295878469944,
"learning_rate": 4.829127153844361e-05,
"loss": 2.3768,
"step": 457
},
{
"epoch": 0.5879332477535302,
"grad_norm": 0.31243589520454407,
"learning_rate": 4.8281796764143e-05,
"loss": 2.365,
"step": 458
},
{
"epoch": 0.5892169448010269,
"grad_norm": 0.30623236298561096,
"learning_rate": 4.8272296728668676e-05,
"loss": 2.369,
"step": 459
},
{
"epoch": 0.5905006418485238,
"grad_norm": 0.2986590564250946,
"learning_rate": 4.826277144232834e-05,
"loss": 2.3042,
"step": 460
},
{
"epoch": 0.5917843388960206,
"grad_norm": 0.30335181951522827,
"learning_rate": 4.825322091545709e-05,
"loss": 2.2763,
"step": 461
},
{
"epoch": 0.5930680359435173,
"grad_norm": 0.30525484681129456,
"learning_rate": 4.824364515841745e-05,
"loss": 2.3532,
"step": 462
},
{
"epoch": 0.5943517329910141,
"grad_norm": 0.2878653109073639,
"learning_rate": 4.823404418159927e-05,
"loss": 2.4261,
"step": 463
},
{
"epoch": 0.5956354300385109,
"grad_norm": 0.29711782932281494,
"learning_rate": 4.822441799541979e-05,
"loss": 2.4149,
"step": 464
},
{
"epoch": 0.5969191270860077,
"grad_norm": 0.2690982520580292,
"learning_rate": 4.82147666103236e-05,
"loss": 2.3366,
"step": 465
},
{
"epoch": 0.5982028241335045,
"grad_norm": 0.2946816682815552,
"learning_rate": 4.8205090036782626e-05,
"loss": 2.3942,
"step": 466
},
{
"epoch": 0.5994865211810013,
"grad_norm": 0.3061763346195221,
"learning_rate": 4.819538828529613e-05,
"loss": 2.4006,
"step": 467
},
{
"epoch": 0.6007702182284981,
"grad_norm": 0.32586535811424255,
"learning_rate": 4.8185661366390676e-05,
"loss": 2.3707,
"step": 468
},
{
"epoch": 0.6020539152759948,
"grad_norm": 0.27984949946403503,
"learning_rate": 4.817590929062017e-05,
"loss": 2.4379,
"step": 469
},
{
"epoch": 0.6033376123234917,
"grad_norm": 0.329476922750473,
"learning_rate": 4.816613206856577e-05,
"loss": 2.4531,
"step": 470
},
{
"epoch": 0.6046213093709885,
"grad_norm": 0.31376075744628906,
"learning_rate": 4.815632971083596e-05,
"loss": 2.4197,
"step": 471
},
{
"epoch": 0.6059050064184852,
"grad_norm": 0.28301122784614563,
"learning_rate": 4.814650222806647e-05,
"loss": 2.2975,
"step": 472
},
{
"epoch": 0.6071887034659821,
"grad_norm": 0.30881014466285706,
"learning_rate": 4.813664963092029e-05,
"loss": 2.3507,
"step": 473
},
{
"epoch": 0.6084724005134788,
"grad_norm": 0.30735889077186584,
"learning_rate": 4.8126771930087674e-05,
"loss": 2.2855,
"step": 474
},
{
"epoch": 0.6097560975609756,
"grad_norm": 0.31143245100975037,
"learning_rate": 4.8116869136286106e-05,
"loss": 2.3445,
"step": 475
},
{
"epoch": 0.6110397946084724,
"grad_norm": 0.30091342329978943,
"learning_rate": 4.8106941260260296e-05,
"loss": 2.3587,
"step": 476
},
{
"epoch": 0.6123234916559692,
"grad_norm": 0.29558065533638,
"learning_rate": 4.8096988312782174e-05,
"loss": 2.4079,
"step": 477
},
{
"epoch": 0.613607188703466,
"grad_norm": 0.28896674513816833,
"learning_rate": 4.8087010304650866e-05,
"loss": 2.3141,
"step": 478
},
{
"epoch": 0.6148908857509627,
"grad_norm": 0.39745038747787476,
"learning_rate": 4.8077007246692694e-05,
"loss": 2.3588,
"step": 479
},
{
"epoch": 0.6161745827984596,
"grad_norm": 0.300830602645874,
"learning_rate": 4.806697914976116e-05,
"loss": 2.4217,
"step": 480
},
{
"epoch": 0.6174582798459564,
"grad_norm": 0.3155045211315155,
"learning_rate": 4.805692602473693e-05,
"loss": 2.3939,
"step": 481
},
{
"epoch": 0.6187419768934531,
"grad_norm": 0.3159199357032776,
"learning_rate": 4.8046847882527826e-05,
"loss": 2.4239,
"step": 482
},
{
"epoch": 0.62002567394095,
"grad_norm": 0.299836665391922,
"learning_rate": 4.8036744734068826e-05,
"loss": 2.4192,
"step": 483
},
{
"epoch": 0.6213093709884467,
"grad_norm": 0.3199720084667206,
"learning_rate": 4.802661659032202e-05,
"loss": 2.39,
"step": 484
},
{
"epoch": 0.6225930680359435,
"grad_norm": 0.29899173974990845,
"learning_rate": 4.8016463462276615e-05,
"loss": 2.3525,
"step": 485
},
{
"epoch": 0.6238767650834403,
"grad_norm": 0.2836560904979706,
"learning_rate": 4.8006285360948976e-05,
"loss": 2.3127,
"step": 486
},
{
"epoch": 0.6251604621309371,
"grad_norm": 0.31214481592178345,
"learning_rate": 4.79960822973825e-05,
"loss": 2.4241,
"step": 487
},
{
"epoch": 0.6264441591784339,
"grad_norm": 0.3114881217479706,
"learning_rate": 4.79858542826477e-05,
"loss": 2.2663,
"step": 488
},
{
"epoch": 0.6277278562259306,
"grad_norm": 0.30914467573165894,
"learning_rate": 4.7975601327842176e-05,
"loss": 2.3633,
"step": 489
},
{
"epoch": 0.6290115532734275,
"grad_norm": 0.29763662815093994,
"learning_rate": 4.796532344409055e-05,
"loss": 2.336,
"step": 490
},
{
"epoch": 0.6302952503209243,
"grad_norm": 0.2814677357673645,
"learning_rate": 4.7955020642544515e-05,
"loss": 2.4174,
"step": 491
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.31262046098709106,
"learning_rate": 4.794469293438282e-05,
"loss": 2.3888,
"step": 492
},
{
"epoch": 0.6328626444159179,
"grad_norm": 0.3011872470378876,
"learning_rate": 4.7934340330811186e-05,
"loss": 2.2837,
"step": 493
},
{
"epoch": 0.6341463414634146,
"grad_norm": 0.3168891370296478,
"learning_rate": 4.79239628430624e-05,
"loss": 2.4802,
"step": 494
},
{
"epoch": 0.6354300385109114,
"grad_norm": 0.2910175621509552,
"learning_rate": 4.791356048239622e-05,
"loss": 2.3782,
"step": 495
},
{
"epoch": 0.6367137355584083,
"grad_norm": 0.2881637513637543,
"learning_rate": 4.7903133260099385e-05,
"loss": 2.3472,
"step": 496
},
{
"epoch": 0.637997432605905,
"grad_norm": 0.31721916794776917,
"learning_rate": 4.789268118748564e-05,
"loss": 2.4454,
"step": 497
},
{
"epoch": 0.6392811296534018,
"grad_norm": 0.2995906472206116,
"learning_rate": 4.788220427589566e-05,
"loss": 2.2958,
"step": 498
},
{
"epoch": 0.6405648267008985,
"grad_norm": 0.29439058899879456,
"learning_rate": 4.787170253669708e-05,
"loss": 2.3642,
"step": 499
},
{
"epoch": 0.6418485237483954,
"grad_norm": 0.29495733976364136,
"learning_rate": 4.78611759812845e-05,
"loss": 2.3808,
"step": 500
},
{
"epoch": 0.6431322207958922,
"grad_norm": 0.3079684376716614,
"learning_rate": 4.785062462107941e-05,
"loss": 2.3306,
"step": 501
},
{
"epoch": 0.6444159178433889,
"grad_norm": 0.3264211416244507,
"learning_rate": 4.784004846753023e-05,
"loss": 2.3811,
"step": 502
},
{
"epoch": 0.6456996148908858,
"grad_norm": 0.32243621349334717,
"learning_rate": 4.782944753211228e-05,
"loss": 2.3574,
"step": 503
},
{
"epoch": 0.6469833119383825,
"grad_norm": 0.33553004264831543,
"learning_rate": 4.781882182632776e-05,
"loss": 2.2817,
"step": 504
},
{
"epoch": 0.6482670089858793,
"grad_norm": 0.28932246565818787,
"learning_rate": 4.780817136170578e-05,
"loss": 2.4276,
"step": 505
},
{
"epoch": 0.6495507060333762,
"grad_norm": 0.30343809723854065,
"learning_rate": 4.7797496149802256e-05,
"loss": 2.4455,
"step": 506
},
{
"epoch": 0.6508344030808729,
"grad_norm": 0.3055610954761505,
"learning_rate": 4.77867962022e-05,
"loss": 2.318,
"step": 507
},
{
"epoch": 0.6521181001283697,
"grad_norm": 0.30805933475494385,
"learning_rate": 4.777607153050866e-05,
"loss": 2.4104,
"step": 508
},
{
"epoch": 0.6534017971758665,
"grad_norm": 0.2895534038543701,
"learning_rate": 4.7765322146364686e-05,
"loss": 2.3868,
"step": 509
},
{
"epoch": 0.6546854942233633,
"grad_norm": 0.3000538647174835,
"learning_rate": 4.775454806143137e-05,
"loss": 2.3504,
"step": 510
},
{
"epoch": 0.6559691912708601,
"grad_norm": 0.3021094799041748,
"learning_rate": 4.7743749287398776e-05,
"loss": 2.3791,
"step": 511
},
{
"epoch": 0.6572528883183568,
"grad_norm": 0.2954312860965729,
"learning_rate": 4.7732925835983775e-05,
"loss": 2.3347,
"step": 512
},
{
"epoch": 0.6585365853658537,
"grad_norm": 0.2947414815425873,
"learning_rate": 4.7722077718930014e-05,
"loss": 2.3683,
"step": 513
},
{
"epoch": 0.6598202824133504,
"grad_norm": 0.30434030294418335,
"learning_rate": 4.771120494800789e-05,
"loss": 2.3666,
"step": 514
},
{
"epoch": 0.6611039794608472,
"grad_norm": 0.2817016541957855,
"learning_rate": 4.7700307535014565e-05,
"loss": 2.3736,
"step": 515
},
{
"epoch": 0.6623876765083441,
"grad_norm": 0.2933237552642822,
"learning_rate": 4.768938549177393e-05,
"loss": 2.3464,
"step": 516
},
{
"epoch": 0.6636713735558408,
"grad_norm": 0.3133641481399536,
"learning_rate": 4.7678438830136596e-05,
"loss": 2.3103,
"step": 517
},
{
"epoch": 0.6649550706033376,
"grad_norm": 0.2988991439342499,
"learning_rate": 4.766746756197989e-05,
"loss": 2.4228,
"step": 518
},
{
"epoch": 0.6662387676508345,
"grad_norm": 0.3129512369632721,
"learning_rate": 4.765647169920785e-05,
"loss": 2.3711,
"step": 519
},
{
"epoch": 0.6675224646983312,
"grad_norm": 0.30164363980293274,
"learning_rate": 4.764545125375117e-05,
"loss": 2.314,
"step": 520
},
{
"epoch": 0.668806161745828,
"grad_norm": 0.3078322112560272,
"learning_rate": 4.7634406237567245e-05,
"loss": 2.3583,
"step": 521
},
{
"epoch": 0.6700898587933247,
"grad_norm": 0.29779842495918274,
"learning_rate": 4.7623336662640116e-05,
"loss": 2.3329,
"step": 522
},
{
"epoch": 0.6713735558408216,
"grad_norm": 0.30728939175605774,
"learning_rate": 4.761224254098048e-05,
"loss": 2.3432,
"step": 523
},
{
"epoch": 0.6726572528883183,
"grad_norm": 0.28595444560050964,
"learning_rate": 4.760112388462564e-05,
"loss": 2.3155,
"step": 524
},
{
"epoch": 0.6739409499358151,
"grad_norm": 0.32341310381889343,
"learning_rate": 4.758998070563957e-05,
"loss": 2.3919,
"step": 525
},
{
"epoch": 0.675224646983312,
"grad_norm": 0.33163347840309143,
"learning_rate": 4.75788130161128e-05,
"loss": 2.2703,
"step": 526
},
{
"epoch": 0.6765083440308087,
"grad_norm": 0.2895621061325073,
"learning_rate": 4.7567620828162486e-05,
"loss": 2.3479,
"step": 527
},
{
"epoch": 0.6777920410783055,
"grad_norm": 0.31375667452812195,
"learning_rate": 4.7556404153932356e-05,
"loss": 2.4763,
"step": 528
},
{
"epoch": 0.6790757381258024,
"grad_norm": 0.29855942726135254,
"learning_rate": 4.754516300559271e-05,
"loss": 2.4154,
"step": 529
},
{
"epoch": 0.6803594351732991,
"grad_norm": 0.29022201895713806,
"learning_rate": 4.7533897395340384e-05,
"loss": 2.3614,
"step": 530
},
{
"epoch": 0.6816431322207959,
"grad_norm": 0.280118852853775,
"learning_rate": 4.7522607335398786e-05,
"loss": 2.3306,
"step": 531
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.29326388239860535,
"learning_rate": 4.751129283801782e-05,
"loss": 2.3219,
"step": 532
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.3260889947414398,
"learning_rate": 4.7499953915473935e-05,
"loss": 2.3952,
"step": 533
},
{
"epoch": 0.6854942233632862,
"grad_norm": 0.2982180118560791,
"learning_rate": 4.7488590580070074e-05,
"loss": 2.3165,
"step": 534
},
{
"epoch": 0.686777920410783,
"grad_norm": 0.303310364484787,
"learning_rate": 4.7477202844135646e-05,
"loss": 2.2632,
"step": 535
},
{
"epoch": 0.6880616174582799,
"grad_norm": 0.329369455575943,
"learning_rate": 4.746579072002657e-05,
"loss": 2.328,
"step": 536
},
{
"epoch": 0.6893453145057766,
"grad_norm": 0.3056361973285675,
"learning_rate": 4.74543542201252e-05,
"loss": 2.4139,
"step": 537
},
{
"epoch": 0.6906290115532734,
"grad_norm": 0.31663402915000916,
"learning_rate": 4.744289335684034e-05,
"loss": 2.3929,
"step": 538
},
{
"epoch": 0.6919127086007703,
"grad_norm": 0.3166126012802124,
"learning_rate": 4.7431408142607236e-05,
"loss": 2.3308,
"step": 539
},
{
"epoch": 0.693196405648267,
"grad_norm": 0.3111107349395752,
"learning_rate": 4.7419898589887566e-05,
"loss": 2.3885,
"step": 540
},
{
"epoch": 0.6944801026957638,
"grad_norm": 0.3381554186344147,
"learning_rate": 4.74083647111694e-05,
"loss": 2.2421,
"step": 541
},
{
"epoch": 0.6957637997432606,
"grad_norm": 0.29884985089302063,
"learning_rate": 4.73968065189672e-05,
"loss": 2.3181,
"step": 542
},
{
"epoch": 0.6970474967907574,
"grad_norm": 0.2916213274002075,
"learning_rate": 4.738522402582183e-05,
"loss": 2.3626,
"step": 543
},
{
"epoch": 0.6983311938382541,
"grad_norm": 0.3153024911880493,
"learning_rate": 4.737361724430048e-05,
"loss": 2.4354,
"step": 544
},
{
"epoch": 0.699614890885751,
"grad_norm": 0.2967838644981384,
"learning_rate": 4.736198618699675e-05,
"loss": 2.4529,
"step": 545
},
{
"epoch": 0.7008985879332478,
"grad_norm": 0.29753878712654114,
"learning_rate": 4.7350330866530536e-05,
"loss": 2.3934,
"step": 546
},
{
"epoch": 0.7021822849807445,
"grad_norm": 0.2958550751209259,
"learning_rate": 4.7338651295548065e-05,
"loss": 2.2737,
"step": 547
},
{
"epoch": 0.7034659820282413,
"grad_norm": 0.2889609932899475,
"learning_rate": 4.7326947486721894e-05,
"loss": 2.3989,
"step": 548
},
{
"epoch": 0.7047496790757382,
"grad_norm": 0.2951468229293823,
"learning_rate": 4.731521945275087e-05,
"loss": 2.3909,
"step": 549
},
{
"epoch": 0.7060333761232349,
"grad_norm": 0.30337998270988464,
"learning_rate": 4.730346720636011e-05,
"loss": 2.3878,
"step": 550
},
{
"epoch": 0.7073170731707317,
"grad_norm": 0.2858850359916687,
"learning_rate": 4.7291690760301035e-05,
"loss": 2.3465,
"step": 551
},
{
"epoch": 0.7086007702182285,
"grad_norm": 0.2845821678638458,
"learning_rate": 4.727989012735129e-05,
"loss": 2.385,
"step": 552
},
{
"epoch": 0.7098844672657253,
"grad_norm": 0.2901153564453125,
"learning_rate": 4.726806532031478e-05,
"loss": 2.2939,
"step": 553
},
{
"epoch": 0.711168164313222,
"grad_norm": 0.27658534049987793,
"learning_rate": 4.725621635202164e-05,
"loss": 2.3292,
"step": 554
},
{
"epoch": 0.7124518613607189,
"grad_norm": 0.289438933134079,
"learning_rate": 4.724434323532821e-05,
"loss": 2.3481,
"step": 555
},
{
"epoch": 0.7137355584082157,
"grad_norm": 0.30061691999435425,
"learning_rate": 4.7232445983117045e-05,
"loss": 2.2876,
"step": 556
},
{
"epoch": 0.7150192554557124,
"grad_norm": 0.28911301493644714,
"learning_rate": 4.722052460829687e-05,
"loss": 2.3649,
"step": 557
},
{
"epoch": 0.7163029525032092,
"grad_norm": 0.32217785716056824,
"learning_rate": 4.720857912380261e-05,
"loss": 2.3052,
"step": 558
},
{
"epoch": 0.7175866495507061,
"grad_norm": 0.3333997428417206,
"learning_rate": 4.719660954259532e-05,
"loss": 2.3851,
"step": 559
},
{
"epoch": 0.7188703465982028,
"grad_norm": 0.3150656819343567,
"learning_rate": 4.718461587766221e-05,
"loss": 2.3354,
"step": 560
},
{
"epoch": 0.7201540436456996,
"grad_norm": 0.2966265380382538,
"learning_rate": 4.7172598142016645e-05,
"loss": 2.3681,
"step": 561
},
{
"epoch": 0.7214377406931964,
"grad_norm": 0.3027741014957428,
"learning_rate": 4.716055634869807e-05,
"loss": 2.3369,
"step": 562
},
{
"epoch": 0.7227214377406932,
"grad_norm": 0.32294961810112,
"learning_rate": 4.714849051077205e-05,
"loss": 2.4019,
"step": 563
},
{
"epoch": 0.72400513478819,
"grad_norm": 0.30615222454071045,
"learning_rate": 4.713640064133025e-05,
"loss": 2.4961,
"step": 564
},
{
"epoch": 0.7252888318356868,
"grad_norm": 0.29600149393081665,
"learning_rate": 4.712428675349039e-05,
"loss": 2.3446,
"step": 565
},
{
"epoch": 0.7265725288831836,
"grad_norm": 0.2971326410770416,
"learning_rate": 4.7112148860396265e-05,
"loss": 2.3006,
"step": 566
},
{
"epoch": 0.7278562259306803,
"grad_norm": 0.29449960589408875,
"learning_rate": 4.7099986975217704e-05,
"loss": 2.464,
"step": 567
},
{
"epoch": 0.7291399229781772,
"grad_norm": 0.27724847197532654,
"learning_rate": 4.708780111115057e-05,
"loss": 2.4383,
"step": 568
},
{
"epoch": 0.730423620025674,
"grad_norm": 0.3143709897994995,
"learning_rate": 4.7075591281416765e-05,
"loss": 2.3377,
"step": 569
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.32122036814689636,
"learning_rate": 4.706335749926417e-05,
"loss": 2.2914,
"step": 570
},
{
"epoch": 0.7329910141206675,
"grad_norm": 0.28614115715026855,
"learning_rate": 4.7051099777966645e-05,
"loss": 2.3338,
"step": 571
},
{
"epoch": 0.7342747111681643,
"grad_norm": 0.30070704221725464,
"learning_rate": 4.703881813082406e-05,
"loss": 2.3367,
"step": 572
},
{
"epoch": 0.7355584082156611,
"grad_norm": 0.2799395024776459,
"learning_rate": 4.702651257116222e-05,
"loss": 2.3604,
"step": 573
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.29513630270957947,
"learning_rate": 4.7014183112332874e-05,
"loss": 2.3487,
"step": 574
},
{
"epoch": 0.7381258023106547,
"grad_norm": 0.30071401596069336,
"learning_rate": 4.7001829767713726e-05,
"loss": 2.3422,
"step": 575
},
{
"epoch": 0.7394094993581515,
"grad_norm": 0.3397565186023712,
"learning_rate": 4.698945255070837e-05,
"loss": 2.3802,
"step": 576
},
{
"epoch": 0.7406931964056482,
"grad_norm": 0.32068637013435364,
"learning_rate": 4.697705147474631e-05,
"loss": 2.3498,
"step": 577
},
{
"epoch": 0.7419768934531451,
"grad_norm": 0.5262579917907715,
"learning_rate": 4.696462655328294e-05,
"loss": 2.2523,
"step": 578
},
{
"epoch": 0.7432605905006419,
"grad_norm": 0.3595364987850189,
"learning_rate": 4.695217779979953e-05,
"loss": 2.3578,
"step": 579
},
{
"epoch": 0.7445442875481386,
"grad_norm": 0.3001711964607239,
"learning_rate": 4.69397052278032e-05,
"loss": 2.2685,
"step": 580
},
{
"epoch": 0.7458279845956355,
"grad_norm": 0.3520994782447815,
"learning_rate": 4.6927208850826925e-05,
"loss": 2.364,
"step": 581
},
{
"epoch": 0.7471116816431322,
"grad_norm": 0.3192301094532013,
"learning_rate": 4.6914688682429496e-05,
"loss": 2.3454,
"step": 582
},
{
"epoch": 0.748395378690629,
"grad_norm": 0.28848007321357727,
"learning_rate": 4.690214473619552e-05,
"loss": 2.5347,
"step": 583
},
{
"epoch": 0.7496790757381258,
"grad_norm": 0.3215525448322296,
"learning_rate": 4.688957702573542e-05,
"loss": 2.3881,
"step": 584
},
{
"epoch": 0.7509627727856226,
"grad_norm": 0.32826587557792664,
"learning_rate": 4.687698556468538e-05,
"loss": 2.3555,
"step": 585
},
{
"epoch": 0.7522464698331194,
"grad_norm": 0.2755299210548401,
"learning_rate": 4.6864370366707366e-05,
"loss": 2.3278,
"step": 586
},
{
"epoch": 0.7535301668806161,
"grad_norm": 0.32962703704833984,
"learning_rate": 4.68517314454891e-05,
"loss": 2.3749,
"step": 587
},
{
"epoch": 0.754813863928113,
"grad_norm": 0.29674839973449707,
"learning_rate": 4.683906881474405e-05,
"loss": 2.3683,
"step": 588
},
{
"epoch": 0.7560975609756098,
"grad_norm": 0.31625881791114807,
"learning_rate": 4.682638248821138e-05,
"loss": 2.3088,
"step": 589
},
{
"epoch": 0.7573812580231065,
"grad_norm": 0.31702378392219543,
"learning_rate": 4.6813672479656e-05,
"loss": 2.4639,
"step": 590
},
{
"epoch": 0.7586649550706034,
"grad_norm": 0.3002925515174866,
"learning_rate": 4.6800938802868505e-05,
"loss": 2.3569,
"step": 591
},
{
"epoch": 0.7599486521181001,
"grad_norm": 0.30289632081985474,
"learning_rate": 4.6788181471665155e-05,
"loss": 2.3155,
"step": 592
},
{
"epoch": 0.7612323491655969,
"grad_norm": 0.2899289131164551,
"learning_rate": 4.677540049988789e-05,
"loss": 2.4306,
"step": 593
},
{
"epoch": 0.7625160462130937,
"grad_norm": 0.2906821370124817,
"learning_rate": 4.67625959014043e-05,
"loss": 2.3712,
"step": 594
},
{
"epoch": 0.7637997432605905,
"grad_norm": 0.3128328025341034,
"learning_rate": 4.674976769010761e-05,
"loss": 2.3235,
"step": 595
},
{
"epoch": 0.7650834403080873,
"grad_norm": 0.33928877115249634,
"learning_rate": 4.673691587991667e-05,
"loss": 2.4025,
"step": 596
},
{
"epoch": 0.766367137355584,
"grad_norm": 0.3034507632255554,
"learning_rate": 4.67240404847759e-05,
"loss": 2.2935,
"step": 597
},
{
"epoch": 0.7676508344030809,
"grad_norm": 0.29682376980781555,
"learning_rate": 4.671114151865536e-05,
"loss": 2.3951,
"step": 598
},
{
"epoch": 0.7689345314505777,
"grad_norm": 0.29410845041275024,
"learning_rate": 4.669821899555066e-05,
"loss": 2.3497,
"step": 599
},
{
"epoch": 0.7702182284980744,
"grad_norm": 0.3179001808166504,
"learning_rate": 4.668527292948298e-05,
"loss": 2.3472,
"step": 600
},
{
"epoch": 0.7715019255455713,
"grad_norm": 0.28910136222839355,
"learning_rate": 4.667230333449902e-05,
"loss": 2.335,
"step": 601
},
{
"epoch": 0.772785622593068,
"grad_norm": 0.3009548783302307,
"learning_rate": 4.665931022467105e-05,
"loss": 2.4499,
"step": 602
},
{
"epoch": 0.7740693196405648,
"grad_norm": 0.2947859466075897,
"learning_rate": 4.664629361409681e-05,
"loss": 2.4299,
"step": 603
},
{
"epoch": 0.7753530166880617,
"grad_norm": 0.30695682764053345,
"learning_rate": 4.663325351689956e-05,
"loss": 2.3051,
"step": 604
},
{
"epoch": 0.7766367137355584,
"grad_norm": 0.2979392111301422,
"learning_rate": 4.662018994722806e-05,
"loss": 2.4222,
"step": 605
},
{
"epoch": 0.7779204107830552,
"grad_norm": 0.298109769821167,
"learning_rate": 4.660710291925652e-05,
"loss": 2.4049,
"step": 606
},
{
"epoch": 0.7792041078305519,
"grad_norm": 0.29500192403793335,
"learning_rate": 4.6593992447184586e-05,
"loss": 2.3886,
"step": 607
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.2970205843448639,
"learning_rate": 4.658085854523737e-05,
"loss": 2.3958,
"step": 608
},
{
"epoch": 0.7817715019255456,
"grad_norm": 0.3007825016975403,
"learning_rate": 4.6567701227665394e-05,
"loss": 2.3164,
"step": 609
},
{
"epoch": 0.7830551989730423,
"grad_norm": 0.3098297715187073,
"learning_rate": 4.655452050874459e-05,
"loss": 2.3919,
"step": 610
},
{
"epoch": 0.7843388960205392,
"grad_norm": 0.29705604910850525,
"learning_rate": 4.654131640277627e-05,
"loss": 2.4343,
"step": 611
},
{
"epoch": 0.785622593068036,
"grad_norm": 0.2924937605857849,
"learning_rate": 4.6528088924087134e-05,
"loss": 2.2326,
"step": 612
},
{
"epoch": 0.7869062901155327,
"grad_norm": 0.316272109746933,
"learning_rate": 4.651483808702924e-05,
"loss": 2.3787,
"step": 613
},
{
"epoch": 0.7881899871630296,
"grad_norm": 0.3279688358306885,
"learning_rate": 4.650156390598e-05,
"loss": 2.4019,
"step": 614
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.299129456281662,
"learning_rate": 4.6488266395342105e-05,
"loss": 2.3136,
"step": 615
},
{
"epoch": 0.7907573812580231,
"grad_norm": 0.2993239760398865,
"learning_rate": 4.647494556954363e-05,
"loss": 2.4093,
"step": 616
},
{
"epoch": 0.79204107830552,
"grad_norm": 0.3059884309768677,
"learning_rate": 4.6461601443037905e-05,
"loss": 2.3847,
"step": 617
},
{
"epoch": 0.7933247753530167,
"grad_norm": 0.3300536274909973,
"learning_rate": 4.644823403030355e-05,
"loss": 2.3478,
"step": 618
},
{
"epoch": 0.7946084724005135,
"grad_norm": 0.30517104268074036,
"learning_rate": 4.643484334584445e-05,
"loss": 2.3763,
"step": 619
},
{
"epoch": 0.7958921694480102,
"grad_norm": 0.34745684266090393,
"learning_rate": 4.642142940418973e-05,
"loss": 2.4441,
"step": 620
},
{
"epoch": 0.7971758664955071,
"grad_norm": 0.2973950207233429,
"learning_rate": 4.640799221989378e-05,
"loss": 2.3866,
"step": 621
},
{
"epoch": 0.7984595635430038,
"grad_norm": 0.2934724688529968,
"learning_rate": 4.639453180753619e-05,
"loss": 2.3949,
"step": 622
},
{
"epoch": 0.7997432605905006,
"grad_norm": 0.2799731194972992,
"learning_rate": 4.6381048181721744e-05,
"loss": 2.3475,
"step": 623
},
{
"epoch": 0.8010269576379975,
"grad_norm": 0.30114057660102844,
"learning_rate": 4.636754135708041e-05,
"loss": 2.4434,
"step": 624
},
{
"epoch": 0.8023106546854942,
"grad_norm": 0.3088257610797882,
"learning_rate": 4.635401134826737e-05,
"loss": 2.3977,
"step": 625
},
{
"epoch": 0.803594351732991,
"grad_norm": 0.3008176386356354,
"learning_rate": 4.63404581699629e-05,
"loss": 2.3099,
"step": 626
},
{
"epoch": 0.8048780487804879,
"grad_norm": 0.29447734355926514,
"learning_rate": 4.632688183687246e-05,
"loss": 2.4009,
"step": 627
},
{
"epoch": 0.8061617458279846,
"grad_norm": 0.29392993450164795,
"learning_rate": 4.631328236372662e-05,
"loss": 2.3303,
"step": 628
},
{
"epoch": 0.8074454428754814,
"grad_norm": 0.28862807154655457,
"learning_rate": 4.629965976528104e-05,
"loss": 2.4822,
"step": 629
},
{
"epoch": 0.8087291399229781,
"grad_norm": 0.3060828447341919,
"learning_rate": 4.628601405631652e-05,
"loss": 2.4551,
"step": 630
},
{
"epoch": 0.810012836970475,
"grad_norm": 0.3012582063674927,
"learning_rate": 4.627234525163887e-05,
"loss": 2.3667,
"step": 631
},
{
"epoch": 0.8112965340179717,
"grad_norm": 0.2978503406047821,
"learning_rate": 4.625865336607901e-05,
"loss": 2.3306,
"step": 632
},
{
"epoch": 0.8125802310654685,
"grad_norm": 0.31029176712036133,
"learning_rate": 4.6244938414492875e-05,
"loss": 2.4326,
"step": 633
},
{
"epoch": 0.8138639281129654,
"grad_norm": 0.3450663983821869,
"learning_rate": 4.6231200411761444e-05,
"loss": 2.3047,
"step": 634
},
{
"epoch": 0.8151476251604621,
"grad_norm": 0.3168805241584778,
"learning_rate": 4.62174393727907e-05,
"loss": 2.4397,
"step": 635
},
{
"epoch": 0.8164313222079589,
"grad_norm": 0.31874215602874756,
"learning_rate": 4.6203655312511616e-05,
"loss": 2.5085,
"step": 636
},
{
"epoch": 0.8177150192554558,
"grad_norm": 0.30860742926597595,
"learning_rate": 4.618984824588016e-05,
"loss": 2.2939,
"step": 637
},
{
"epoch": 0.8189987163029525,
"grad_norm": 0.3643187880516052,
"learning_rate": 4.617601818787724e-05,
"loss": 2.4521,
"step": 638
},
{
"epoch": 0.8202824133504493,
"grad_norm": 0.29677483439445496,
"learning_rate": 4.6162165153508724e-05,
"loss": 2.4059,
"step": 639
},
{
"epoch": 0.8215661103979461,
"grad_norm": 0.31648752093315125,
"learning_rate": 4.6148289157805406e-05,
"loss": 2.3753,
"step": 640
},
{
"epoch": 0.8228498074454429,
"grad_norm": 0.30333682894706726,
"learning_rate": 4.6134390215822995e-05,
"loss": 2.367,
"step": 641
},
{
"epoch": 0.8241335044929397,
"grad_norm": 0.321492075920105,
"learning_rate": 4.61204683426421e-05,
"loss": 2.3,
"step": 642
},
{
"epoch": 0.8254172015404364,
"grad_norm": 0.2793954014778137,
"learning_rate": 4.610652355336821e-05,
"loss": 2.3858,
"step": 643
},
{
"epoch": 0.8267008985879333,
"grad_norm": 0.29857027530670166,
"learning_rate": 4.609255586313166e-05,
"loss": 2.3492,
"step": 644
},
{
"epoch": 0.82798459563543,
"grad_norm": 0.28842952847480774,
"learning_rate": 4.607856528708765e-05,
"loss": 2.3433,
"step": 645
},
{
"epoch": 0.8292682926829268,
"grad_norm": 0.3550688326358795,
"learning_rate": 4.606455184041622e-05,
"loss": 2.3817,
"step": 646
},
{
"epoch": 0.8305519897304237,
"grad_norm": 0.32418951392173767,
"learning_rate": 4.6050515538322225e-05,
"loss": 2.4612,
"step": 647
},
{
"epoch": 0.8318356867779204,
"grad_norm": 0.33099403977394104,
"learning_rate": 4.6036456396035294e-05,
"loss": 2.3853,
"step": 648
},
{
"epoch": 0.8331193838254172,
"grad_norm": 0.32067880034446716,
"learning_rate": 4.602237442880985e-05,
"loss": 2.4484,
"step": 649
},
{
"epoch": 0.834403080872914,
"grad_norm": 0.34124529361724854,
"learning_rate": 4.600826965192509e-05,
"loss": 2.3791,
"step": 650
},
{
"epoch": 0.8356867779204108,
"grad_norm": 0.2857443392276764,
"learning_rate": 4.599414208068495e-05,
"loss": 2.355,
"step": 651
},
{
"epoch": 0.8369704749679076,
"grad_norm": 0.324481338262558,
"learning_rate": 4.5979991730418105e-05,
"loss": 2.4303,
"step": 652
},
{
"epoch": 0.8382541720154044,
"grad_norm": 0.32609954476356506,
"learning_rate": 4.596581861647795e-05,
"loss": 2.3638,
"step": 653
},
{
"epoch": 0.8395378690629012,
"grad_norm": 0.303725004196167,
"learning_rate": 4.595162275424255e-05,
"loss": 2.3144,
"step": 654
},
{
"epoch": 0.8408215661103979,
"grad_norm": 0.29211246967315674,
"learning_rate": 4.59374041591147e-05,
"loss": 2.4353,
"step": 655
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.31560078263282776,
"learning_rate": 4.5923162846521824e-05,
"loss": 2.3982,
"step": 656
},
{
"epoch": 0.8433889602053916,
"grad_norm": 0.3038894534111023,
"learning_rate": 4.5908898831916e-05,
"loss": 2.2962,
"step": 657
},
{
"epoch": 0.8446726572528883,
"grad_norm": 0.34458431601524353,
"learning_rate": 4.589461213077395e-05,
"loss": 2.3738,
"step": 658
},
{
"epoch": 0.8459563543003851,
"grad_norm": 0.3022404909133911,
"learning_rate": 4.5880302758597e-05,
"loss": 2.3374,
"step": 659
},
{
"epoch": 0.8472400513478819,
"grad_norm": 0.30100885033607483,
"learning_rate": 4.586597073091109e-05,
"loss": 2.286,
"step": 660
},
{
"epoch": 0.8485237483953787,
"grad_norm": 0.31171244382858276,
"learning_rate": 4.585161606326673e-05,
"loss": 2.3892,
"step": 661
},
{
"epoch": 0.8498074454428755,
"grad_norm": 0.3026835322380066,
"learning_rate": 4.5837238771238975e-05,
"loss": 2.3063,
"step": 662
},
{
"epoch": 0.8510911424903723,
"grad_norm": 0.2819378972053528,
"learning_rate": 4.582283887042748e-05,
"loss": 2.47,
"step": 663
},
{
"epoch": 0.8523748395378691,
"grad_norm": 0.2845015823841095,
"learning_rate": 4.580841637645638e-05,
"loss": 2.4173,
"step": 664
},
{
"epoch": 0.8536585365853658,
"grad_norm": 0.3148723840713501,
"learning_rate": 4.5793971304974344e-05,
"loss": 2.3728,
"step": 665
},
{
"epoch": 0.8549422336328626,
"grad_norm": 0.3028092086315155,
"learning_rate": 4.5779503671654544e-05,
"loss": 2.3707,
"step": 666
},
{
"epoch": 0.8562259306803595,
"grad_norm": 0.2981637418270111,
"learning_rate": 4.576501349219462e-05,
"loss": 2.4751,
"step": 667
},
{
"epoch": 0.8575096277278562,
"grad_norm": 0.33563345670700073,
"learning_rate": 4.5750500782316684e-05,
"loss": 2.3136,
"step": 668
},
{
"epoch": 0.858793324775353,
"grad_norm": 0.2983044385910034,
"learning_rate": 4.57359655577673e-05,
"loss": 2.3992,
"step": 669
},
{
"epoch": 0.8600770218228498,
"grad_norm": 0.3396848440170288,
"learning_rate": 4.5721407834317436e-05,
"loss": 2.3828,
"step": 670
},
{
"epoch": 0.8613607188703466,
"grad_norm": 0.2932271659374237,
"learning_rate": 4.570682762776249e-05,
"loss": 2.3649,
"step": 671
},
{
"epoch": 0.8626444159178434,
"grad_norm": 0.30177557468414307,
"learning_rate": 4.5692224953922266e-05,
"loss": 2.3681,
"step": 672
},
{
"epoch": 0.8639281129653402,
"grad_norm": 0.3091699182987213,
"learning_rate": 4.5677599828640916e-05,
"loss": 2.3726,
"step": 673
},
{
"epoch": 0.865211810012837,
"grad_norm": 0.31127968430519104,
"learning_rate": 4.5662952267786974e-05,
"loss": 2.3379,
"step": 674
},
{
"epoch": 0.8664955070603337,
"grad_norm": 0.28275659680366516,
"learning_rate": 4.56482822872533e-05,
"loss": 2.2449,
"step": 675
},
{
"epoch": 0.8677792041078306,
"grad_norm": 0.31371593475341797,
"learning_rate": 4.563358990295711e-05,
"loss": 2.3419,
"step": 676
},
{
"epoch": 0.8690629011553274,
"grad_norm": 0.31198492646217346,
"learning_rate": 4.561887513083989e-05,
"loss": 2.372,
"step": 677
},
{
"epoch": 0.8703465982028241,
"grad_norm": 0.31090936064720154,
"learning_rate": 4.560413798686745e-05,
"loss": 2.3304,
"step": 678
},
{
"epoch": 0.8716302952503209,
"grad_norm": 0.3031321167945862,
"learning_rate": 4.5589378487029844e-05,
"loss": 2.3631,
"step": 679
},
{
"epoch": 0.8729139922978177,
"grad_norm": 0.27430158853530884,
"learning_rate": 4.557459664734141e-05,
"loss": 2.3155,
"step": 680
},
{
"epoch": 0.8741976893453145,
"grad_norm": 0.32084590196609497,
"learning_rate": 4.555979248384071e-05,
"loss": 2.4118,
"step": 681
},
{
"epoch": 0.8754813863928113,
"grad_norm": 0.31383204460144043,
"learning_rate": 4.554496601259054e-05,
"loss": 2.3323,
"step": 682
},
{
"epoch": 0.8767650834403081,
"grad_norm": 0.3000917434692383,
"learning_rate": 4.5530117249677875e-05,
"loss": 2.3542,
"step": 683
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.3041795492172241,
"learning_rate": 4.5515246211213906e-05,
"loss": 2.3466,
"step": 684
},
{
"epoch": 0.8793324775353016,
"grad_norm": 0.31530001759529114,
"learning_rate": 4.5500352913333974e-05,
"loss": 2.3783,
"step": 685
},
{
"epoch": 0.8806161745827985,
"grad_norm": 0.3116975724697113,
"learning_rate": 4.5485437372197584e-05,
"loss": 2.4377,
"step": 686
},
{
"epoch": 0.8818998716302953,
"grad_norm": 0.32885855436325073,
"learning_rate": 4.5470499603988364e-05,
"loss": 2.4546,
"step": 687
},
{
"epoch": 0.883183568677792,
"grad_norm": 0.30758875608444214,
"learning_rate": 4.5455539624914066e-05,
"loss": 2.3051,
"step": 688
},
{
"epoch": 0.8844672657252889,
"grad_norm": 0.30743828415870667,
"learning_rate": 4.544055745120655e-05,
"loss": 2.2663,
"step": 689
},
{
"epoch": 0.8857509627727856,
"grad_norm": 0.3043730556964874,
"learning_rate": 4.542555309912173e-05,
"loss": 2.3279,
"step": 690
},
{
"epoch": 0.8870346598202824,
"grad_norm": 0.3107464909553528,
"learning_rate": 4.5410526584939614e-05,
"loss": 2.3694,
"step": 691
},
{
"epoch": 0.8883183568677792,
"grad_norm": 0.3012385070323944,
"learning_rate": 4.539547792496424e-05,
"loss": 2.4196,
"step": 692
},
{
"epoch": 0.889602053915276,
"grad_norm": 0.30979493260383606,
"learning_rate": 4.5380407135523675e-05,
"loss": 2.2083,
"step": 693
},
{
"epoch": 0.8908857509627728,
"grad_norm": 0.29600152373313904,
"learning_rate": 4.536531423297001e-05,
"loss": 2.4051,
"step": 694
},
{
"epoch": 0.8921694480102695,
"grad_norm": 0.2803564965724945,
"learning_rate": 4.5350199233679316e-05,
"loss": 2.4025,
"step": 695
},
{
"epoch": 0.8934531450577664,
"grad_norm": 0.29657790064811707,
"learning_rate": 4.5335062154051625e-05,
"loss": 2.4621,
"step": 696
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.2989714443683624,
"learning_rate": 4.531990301051097e-05,
"loss": 2.5027,
"step": 697
},
{
"epoch": 0.8960205391527599,
"grad_norm": 0.28022336959838867,
"learning_rate": 4.530472181950528e-05,
"loss": 2.347,
"step": 698
},
{
"epoch": 0.8973042362002568,
"grad_norm": 0.2950182855129242,
"learning_rate": 4.528951859750643e-05,
"loss": 2.2627,
"step": 699
},
{
"epoch": 0.8985879332477535,
"grad_norm": 0.283989816904068,
"learning_rate": 4.52742933610102e-05,
"loss": 2.3606,
"step": 700
},
{
"epoch": 0.8998716302952503,
"grad_norm": 0.2783466577529907,
"learning_rate": 4.525904612653624e-05,
"loss": 2.3604,
"step": 701
},
{
"epoch": 0.9011553273427471,
"grad_norm": 0.3252420723438263,
"learning_rate": 4.5243776910628076e-05,
"loss": 2.3619,
"step": 702
},
{
"epoch": 0.9024390243902439,
"grad_norm": 0.29868730902671814,
"learning_rate": 4.5228485729853096e-05,
"loss": 2.4099,
"step": 703
},
{
"epoch": 0.9037227214377407,
"grad_norm": 0.30771031975746155,
"learning_rate": 4.52131726008025e-05,
"loss": 2.3787,
"step": 704
},
{
"epoch": 0.9050064184852374,
"grad_norm": 0.28617870807647705,
"learning_rate": 4.519783754009131e-05,
"loss": 2.3759,
"step": 705
},
{
"epoch": 0.9062901155327343,
"grad_norm": 0.2890000641345978,
"learning_rate": 4.5182480564358356e-05,
"loss": 2.3883,
"step": 706
},
{
"epoch": 0.9075738125802311,
"grad_norm": 0.30811789631843567,
"learning_rate": 4.516710169026624e-05,
"loss": 2.349,
"step": 707
},
{
"epoch": 0.9088575096277278,
"grad_norm": 0.29645591974258423,
"learning_rate": 4.515170093450129e-05,
"loss": 2.3139,
"step": 708
},
{
"epoch": 0.9101412066752247,
"grad_norm": 0.3044429421424866,
"learning_rate": 4.513627831377365e-05,
"loss": 2.4196,
"step": 709
},
{
"epoch": 0.9114249037227214,
"grad_norm": 0.30725550651550293,
"learning_rate": 4.5120833844817116e-05,
"loss": 2.2667,
"step": 710
},
{
"epoch": 0.9127086007702182,
"grad_norm": 0.29363924264907837,
"learning_rate": 4.510536754438923e-05,
"loss": 2.3684,
"step": 711
},
{
"epoch": 0.9139922978177151,
"grad_norm": 0.31257641315460205,
"learning_rate": 4.50898794292712e-05,
"loss": 2.4096,
"step": 712
},
{
"epoch": 0.9152759948652118,
"grad_norm": 0.3012520670890808,
"learning_rate": 4.507436951626792e-05,
"loss": 2.3477,
"step": 713
},
{
"epoch": 0.9165596919127086,
"grad_norm": 0.2953280508518219,
"learning_rate": 4.505883782220793e-05,
"loss": 2.4028,
"step": 714
},
{
"epoch": 0.9178433889602053,
"grad_norm": 0.3054860830307007,
"learning_rate": 4.504328436394339e-05,
"loss": 2.3845,
"step": 715
},
{
"epoch": 0.9191270860077022,
"grad_norm": 0.31432589888572693,
"learning_rate": 4.50277091583501e-05,
"loss": 2.3838,
"step": 716
},
{
"epoch": 0.920410783055199,
"grad_norm": 0.34039193391799927,
"learning_rate": 4.5012112222327434e-05,
"loss": 2.3927,
"step": 717
},
{
"epoch": 0.9216944801026957,
"grad_norm": 0.3081609308719635,
"learning_rate": 4.4996493572798356e-05,
"loss": 2.3497,
"step": 718
},
{
"epoch": 0.9229781771501926,
"grad_norm": 0.3027456998825073,
"learning_rate": 4.498085322670938e-05,
"loss": 2.2801,
"step": 719
},
{
"epoch": 0.9242618741976893,
"grad_norm": 0.29661089181900024,
"learning_rate": 4.496519120103057e-05,
"loss": 2.3386,
"step": 720
},
{
"epoch": 0.9255455712451861,
"grad_norm": 0.3082757592201233,
"learning_rate": 4.494950751275551e-05,
"loss": 2.2882,
"step": 721
},
{
"epoch": 0.926829268292683,
"grad_norm": 0.3266795575618744,
"learning_rate": 4.493380217890128e-05,
"loss": 2.4543,
"step": 722
},
{
"epoch": 0.9281129653401797,
"grad_norm": 0.30967482924461365,
"learning_rate": 4.4918075216508456e-05,
"loss": 2.3665,
"step": 723
},
{
"epoch": 0.9293966623876765,
"grad_norm": 0.3447597920894623,
"learning_rate": 4.4902326642641095e-05,
"loss": 2.36,
"step": 724
},
{
"epoch": 0.9306803594351734,
"grad_norm": 0.3120889663696289,
"learning_rate": 4.488655647438667e-05,
"loss": 2.3515,
"step": 725
},
{
"epoch": 0.9319640564826701,
"grad_norm": 0.29375332593917847,
"learning_rate": 4.48707647288561e-05,
"loss": 2.3017,
"step": 726
},
{
"epoch": 0.9332477535301669,
"grad_norm": 0.30576878786087036,
"learning_rate": 4.485495142318372e-05,
"loss": 2.4154,
"step": 727
},
{
"epoch": 0.9345314505776636,
"grad_norm": 0.26723507046699524,
"learning_rate": 4.483911657452726e-05,
"loss": 2.2889,
"step": 728
},
{
"epoch": 0.9358151476251605,
"grad_norm": 0.3043777644634247,
"learning_rate": 4.482326020006782e-05,
"loss": 2.3097,
"step": 729
},
{
"epoch": 0.9370988446726572,
"grad_norm": 0.28958752751350403,
"learning_rate": 4.480738231700985e-05,
"loss": 2.2452,
"step": 730
},
{
"epoch": 0.938382541720154,
"grad_norm": 0.2912410795688629,
"learning_rate": 4.4791482942581134e-05,
"loss": 2.2437,
"step": 731
},
{
"epoch": 0.9396662387676509,
"grad_norm": 0.29347458481788635,
"learning_rate": 4.4775562094032795e-05,
"loss": 2.3602,
"step": 732
},
{
"epoch": 0.9409499358151476,
"grad_norm": 0.29266172647476196,
"learning_rate": 4.4759619788639244e-05,
"loss": 2.3348,
"step": 733
},
{
"epoch": 0.9422336328626444,
"grad_norm": 0.29403918981552124,
"learning_rate": 4.474365604369817e-05,
"loss": 2.3746,
"step": 734
},
{
"epoch": 0.9435173299101413,
"grad_norm": 0.30329957604408264,
"learning_rate": 4.4727670876530524e-05,
"loss": 2.3238,
"step": 735
},
{
"epoch": 0.944801026957638,
"grad_norm": 0.2870703339576721,
"learning_rate": 4.47116643044805e-05,
"loss": 2.3218,
"step": 736
},
{
"epoch": 0.9460847240051348,
"grad_norm": 0.3277876377105713,
"learning_rate": 4.469563634491554e-05,
"loss": 2.2395,
"step": 737
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.31081393361091614,
"learning_rate": 4.4679587015226253e-05,
"loss": 2.3659,
"step": 738
},
{
"epoch": 0.9486521181001284,
"grad_norm": 0.3004544675350189,
"learning_rate": 4.4663516332826466e-05,
"loss": 2.3085,
"step": 739
},
{
"epoch": 0.9499358151476252,
"grad_norm": 0.3048396706581116,
"learning_rate": 4.464742431515315e-05,
"loss": 2.4874,
"step": 740
},
{
"epoch": 0.9512195121951219,
"grad_norm": 0.2936103045940399,
"learning_rate": 4.463131097966644e-05,
"loss": 2.3382,
"step": 741
},
{
"epoch": 0.9525032092426188,
"grad_norm": 0.3437049090862274,
"learning_rate": 4.461517634384961e-05,
"loss": 2.3761,
"step": 742
},
{
"epoch": 0.9537869062901155,
"grad_norm": 0.30605414509773254,
"learning_rate": 4.459902042520903e-05,
"loss": 2.3129,
"step": 743
},
{
"epoch": 0.9550706033376123,
"grad_norm": 0.3019481599330902,
"learning_rate": 4.458284324127415e-05,
"loss": 2.2858,
"step": 744
},
{
"epoch": 0.9563543003851092,
"grad_norm": 0.29308027029037476,
"learning_rate": 4.456664480959752e-05,
"loss": 2.3391,
"step": 745
},
{
"epoch": 0.9576379974326059,
"grad_norm": 0.30717384815216064,
"learning_rate": 4.455042514775475e-05,
"loss": 2.4426,
"step": 746
},
{
"epoch": 0.9589216944801027,
"grad_norm": 0.3070937395095825,
"learning_rate": 4.4534184273344445e-05,
"loss": 2.3566,
"step": 747
},
{
"epoch": 0.9602053915275995,
"grad_norm": 0.3065012991428375,
"learning_rate": 4.451792220398826e-05,
"loss": 2.3023,
"step": 748
},
{
"epoch": 0.9614890885750963,
"grad_norm": 0.32163599133491516,
"learning_rate": 4.450163895733084e-05,
"loss": 2.3961,
"step": 749
},
{
"epoch": 0.962772785622593,
"grad_norm": 0.4026564359664917,
"learning_rate": 4.448533455103979e-05,
"loss": 2.3264,
"step": 750
},
{
"epoch": 0.9640564826700898,
"grad_norm": 0.31173425912857056,
"learning_rate": 4.446900900280571e-05,
"loss": 2.4026,
"step": 751
},
{
"epoch": 0.9653401797175867,
"grad_norm": 0.2894265949726105,
"learning_rate": 4.445266233034209e-05,
"loss": 2.3748,
"step": 752
},
{
"epoch": 0.9666238767650834,
"grad_norm": 0.3028358817100525,
"learning_rate": 4.443629455138539e-05,
"loss": 2.4017,
"step": 753
},
{
"epoch": 0.9679075738125802,
"grad_norm": 0.31508946418762207,
"learning_rate": 4.441990568369494e-05,
"loss": 2.4487,
"step": 754
},
{
"epoch": 0.9691912708600771,
"grad_norm": 0.29983460903167725,
"learning_rate": 4.440349574505296e-05,
"loss": 2.3995,
"step": 755
},
{
"epoch": 0.9704749679075738,
"grad_norm": 0.31416934728622437,
"learning_rate": 4.438706475326453e-05,
"loss": 2.2832,
"step": 756
},
{
"epoch": 0.9717586649550706,
"grad_norm": 0.31629544496536255,
"learning_rate": 4.437061272615759e-05,
"loss": 2.4038,
"step": 757
},
{
"epoch": 0.9730423620025674,
"grad_norm": 0.31430545449256897,
"learning_rate": 4.4354139681582865e-05,
"loss": 2.3577,
"step": 758
},
{
"epoch": 0.9743260590500642,
"grad_norm": 0.3069208860397339,
"learning_rate": 4.4337645637413926e-05,
"loss": 2.3681,
"step": 759
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.3285858631134033,
"learning_rate": 4.4321130611547116e-05,
"loss": 2.3895,
"step": 760
},
{
"epoch": 0.9768934531450578,
"grad_norm": 0.3070060610771179,
"learning_rate": 4.430459462190151e-05,
"loss": 2.297,
"step": 761
},
{
"epoch": 0.9781771501925546,
"grad_norm": 0.2973722219467163,
"learning_rate": 4.4288037686418995e-05,
"loss": 2.3951,
"step": 762
},
{
"epoch": 0.9794608472400513,
"grad_norm": 0.29620617628097534,
"learning_rate": 4.427145982306412e-05,
"loss": 2.4088,
"step": 763
},
{
"epoch": 0.9807445442875481,
"grad_norm": 0.30031436681747437,
"learning_rate": 4.425486104982418e-05,
"loss": 2.3678,
"step": 764
},
{
"epoch": 0.982028241335045,
"grad_norm": 0.312211275100708,
"learning_rate": 4.4238241384709156e-05,
"loss": 2.4162,
"step": 765
},
{
"epoch": 0.9833119383825417,
"grad_norm": 0.289779931306839,
"learning_rate": 4.422160084575167e-05,
"loss": 2.342,
"step": 766
},
{
"epoch": 0.9845956354300385,
"grad_norm": 0.3106563687324524,
"learning_rate": 4.420493945100702e-05,
"loss": 2.3842,
"step": 767
},
{
"epoch": 0.9858793324775353,
"grad_norm": 0.31197184324264526,
"learning_rate": 4.418825721855311e-05,
"loss": 2.353,
"step": 768
},
{
"epoch": 0.9871630295250321,
"grad_norm": 0.32115286588668823,
"learning_rate": 4.417155416649049e-05,
"loss": 2.3355,
"step": 769
},
{
"epoch": 0.9884467265725289,
"grad_norm": 0.3096913695335388,
"learning_rate": 4.415483031294225e-05,
"loss": 2.4134,
"step": 770
},
{
"epoch": 0.9897304236200257,
"grad_norm": 0.28306904435157776,
"learning_rate": 4.41380856760541e-05,
"loss": 2.3534,
"step": 771
},
{
"epoch": 0.9910141206675225,
"grad_norm": 0.2894434630870819,
"learning_rate": 4.412132027399426e-05,
"loss": 2.3527,
"step": 772
},
{
"epoch": 0.9922978177150192,
"grad_norm": 0.31455811858177185,
"learning_rate": 4.41045341249535e-05,
"loss": 2.4085,
"step": 773
},
{
"epoch": 0.993581514762516,
"grad_norm": 0.30663493275642395,
"learning_rate": 4.408772724714509e-05,
"loss": 2.3677,
"step": 774
},
{
"epoch": 0.9948652118100129,
"grad_norm": 0.3203311264514923,
"learning_rate": 4.407089965880482e-05,
"loss": 2.3208,
"step": 775
},
{
"epoch": 0.9961489088575096,
"grad_norm": 0.31689587235450745,
"learning_rate": 4.4054051378190915e-05,
"loss": 2.4329,
"step": 776
},
{
"epoch": 0.9974326059050064,
"grad_norm": 0.3117218017578125,
"learning_rate": 4.403718242358407e-05,
"loss": 2.4214,
"step": 777
},
{
"epoch": 0.9987163029525032,
"grad_norm": 0.3188619017601013,
"learning_rate": 4.40202928132874e-05,
"loss": 2.3655,
"step": 778
},
{
"epoch": 1.0,
"grad_norm": 0.30813828110694885,
"learning_rate": 4.400338256562645e-05,
"loss": 2.4338,
"step": 779
},
{
"epoch": 1.0012836970474968,
"grad_norm": 0.3029801845550537,
"learning_rate": 4.398645169894914e-05,
"loss": 2.3734,
"step": 780
},
{
"epoch": 1.0012836970474968,
"grad_norm": 0.2845710217952728,
"learning_rate": 4.3969500231625774e-05,
"loss": 2.2655,
"step": 781
},
{
"epoch": 1.0025673940949935,
"grad_norm": 0.28304269909858704,
"learning_rate": 4.3952528182049e-05,
"loss": 2.2808,
"step": 782
},
{
"epoch": 1.0038510911424903,
"grad_norm": 0.29494890570640564,
"learning_rate": 4.393553556863381e-05,
"loss": 2.1482,
"step": 783
},
{
"epoch": 1.0051347881899872,
"grad_norm": 0.27522915601730347,
"learning_rate": 4.391852240981749e-05,
"loss": 2.2518,
"step": 784
},
{
"epoch": 1.006418485237484,
"grad_norm": 0.3148289620876312,
"learning_rate": 4.390148872405963e-05,
"loss": 2.2193,
"step": 785
},
{
"epoch": 1.0077021822849808,
"grad_norm": 0.31480056047439575,
"learning_rate": 4.3884434529842076e-05,
"loss": 2.1945,
"step": 786
},
{
"epoch": 1.0089858793324775,
"grad_norm": 0.30653226375579834,
"learning_rate": 4.386735984566896e-05,
"loss": 2.224,
"step": 787
},
{
"epoch": 1.0102695763799743,
"grad_norm": 0.31125813722610474,
"learning_rate": 4.385026469006662e-05,
"loss": 2.3276,
"step": 788
},
{
"epoch": 1.011553273427471,
"grad_norm": 0.2961493730545044,
"learning_rate": 4.3833149081583604e-05,
"loss": 2.2199,
"step": 789
},
{
"epoch": 1.012836970474968,
"grad_norm": 0.298798143863678,
"learning_rate": 4.381601303879066e-05,
"loss": 2.2086,
"step": 790
},
{
"epoch": 1.0141206675224648,
"grad_norm": 0.336722195148468,
"learning_rate": 4.3798856580280715e-05,
"loss": 2.2231,
"step": 791
},
{
"epoch": 1.0154043645699615,
"grad_norm": 0.3271631896495819,
"learning_rate": 4.378167972466884e-05,
"loss": 2.2672,
"step": 792
},
{
"epoch": 1.0166880616174583,
"grad_norm": 0.28419965505599976,
"learning_rate": 4.376448249059222e-05,
"loss": 2.2158,
"step": 793
},
{
"epoch": 1.017971758664955,
"grad_norm": 0.3065083920955658,
"learning_rate": 4.3747264896710185e-05,
"loss": 2.1897,
"step": 794
},
{
"epoch": 1.0192554557124518,
"grad_norm": 0.31983664631843567,
"learning_rate": 4.3730026961704126e-05,
"loss": 2.2348,
"step": 795
},
{
"epoch": 1.0205391527599486,
"grad_norm": 0.3390856981277466,
"learning_rate": 4.371276870427753e-05,
"loss": 2.21,
"step": 796
},
{
"epoch": 1.0218228498074455,
"grad_norm": 0.31926363706588745,
"learning_rate": 4.36954901431559e-05,
"loss": 2.2628,
"step": 797
},
{
"epoch": 1.0231065468549423,
"grad_norm": 0.33794736862182617,
"learning_rate": 4.367819129708682e-05,
"loss": 2.229,
"step": 798
},
{
"epoch": 1.024390243902439,
"grad_norm": 0.33381831645965576,
"learning_rate": 4.3660872184839825e-05,
"loss": 2.2948,
"step": 799
},
{
"epoch": 1.0256739409499358,
"grad_norm": 0.32239073514938354,
"learning_rate": 4.364353282520648e-05,
"loss": 2.2511,
"step": 800
},
{
"epoch": 1.0269576379974326,
"grad_norm": 0.3180272877216339,
"learning_rate": 4.36261732370003e-05,
"loss": 2.1619,
"step": 801
},
{
"epoch": 1.0282413350449293,
"grad_norm": 0.33113232254981995,
"learning_rate": 4.360879343905676e-05,
"loss": 2.2182,
"step": 802
},
{
"epoch": 1.0295250320924263,
"grad_norm": 0.3161705434322357,
"learning_rate": 4.359139345023325e-05,
"loss": 2.1993,
"step": 803
},
{
"epoch": 1.030808729139923,
"grad_norm": 0.3182056248188019,
"learning_rate": 4.3573973289409094e-05,
"loss": 2.2228,
"step": 804
},
{
"epoch": 1.0320924261874198,
"grad_norm": 0.3930760324001312,
"learning_rate": 4.355653297548546e-05,
"loss": 2.2691,
"step": 805
},
{
"epoch": 1.0333761232349166,
"grad_norm": 0.31772395968437195,
"learning_rate": 4.353907252738543e-05,
"loss": 2.2816,
"step": 806
},
{
"epoch": 1.0346598202824133,
"grad_norm": 0.36085692048072815,
"learning_rate": 4.352159196405388e-05,
"loss": 2.2951,
"step": 807
},
{
"epoch": 1.03594351732991,
"grad_norm": 0.3501516580581665,
"learning_rate": 4.350409130445756e-05,
"loss": 2.2143,
"step": 808
},
{
"epoch": 1.0372272143774068,
"grad_norm": 0.3293107748031616,
"learning_rate": 4.3486570567585003e-05,
"loss": 2.2479,
"step": 809
},
{
"epoch": 1.0385109114249038,
"grad_norm": 0.31279006600379944,
"learning_rate": 4.3469029772446525e-05,
"loss": 2.2152,
"step": 810
},
{
"epoch": 1.0397946084724006,
"grad_norm": 0.34608227014541626,
"learning_rate": 4.3451468938074205e-05,
"loss": 2.173,
"step": 811
},
{
"epoch": 1.0410783055198973,
"grad_norm": 0.317304402589798,
"learning_rate": 4.343388808352187e-05,
"loss": 2.2071,
"step": 812
},
{
"epoch": 1.042362002567394,
"grad_norm": 0.3161017596721649,
"learning_rate": 4.3416287227865074e-05,
"loss": 2.2746,
"step": 813
},
{
"epoch": 1.0436456996148908,
"grad_norm": 0.3690197765827179,
"learning_rate": 4.339866639020106e-05,
"loss": 2.3066,
"step": 814
},
{
"epoch": 1.0449293966623876,
"grad_norm": 0.328646183013916,
"learning_rate": 4.338102558964876e-05,
"loss": 2.2019,
"step": 815
},
{
"epoch": 1.0462130937098846,
"grad_norm": 0.34105199575424194,
"learning_rate": 4.3363364845348755e-05,
"loss": 2.2957,
"step": 816
},
{
"epoch": 1.0474967907573813,
"grad_norm": 0.3383546769618988,
"learning_rate": 4.3345684176463286e-05,
"loss": 2.2092,
"step": 817
},
{
"epoch": 1.048780487804878,
"grad_norm": 0.3505076766014099,
"learning_rate": 4.3327983602176196e-05,
"loss": 2.1598,
"step": 818
},
{
"epoch": 1.0500641848523748,
"grad_norm": 0.35122519731521606,
"learning_rate": 4.3310263141692935e-05,
"loss": 2.174,
"step": 819
},
{
"epoch": 1.0513478818998716,
"grad_norm": 0.36835214495658875,
"learning_rate": 4.329252281424052e-05,
"loss": 2.2067,
"step": 820
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.3246287405490875,
"learning_rate": 4.327476263906753e-05,
"loss": 2.2666,
"step": 821
},
{
"epoch": 1.0539152759948651,
"grad_norm": 0.355728417634964,
"learning_rate": 4.325698263544409e-05,
"loss": 2.3094,
"step": 822
},
{
"epoch": 1.055198973042362,
"grad_norm": 0.35884207487106323,
"learning_rate": 4.3239182822661805e-05,
"loss": 2.2344,
"step": 823
},
{
"epoch": 1.0564826700898589,
"grad_norm": 0.37921926379203796,
"learning_rate": 4.3221363220033814e-05,
"loss": 2.2099,
"step": 824
},
{
"epoch": 1.0577663671373556,
"grad_norm": 0.36717137694358826,
"learning_rate": 4.320352384689471e-05,
"loss": 2.2276,
"step": 825
},
{
"epoch": 1.0590500641848524,
"grad_norm": 0.35223424434661865,
"learning_rate": 4.318566472260054e-05,
"loss": 2.2718,
"step": 826
},
{
"epoch": 1.0603337612323491,
"grad_norm": 0.321180135011673,
"learning_rate": 4.316778586652876e-05,
"loss": 2.2637,
"step": 827
},
{
"epoch": 1.0616174582798459,
"grad_norm": 0.36485522985458374,
"learning_rate": 4.3149887298078276e-05,
"loss": 2.2069,
"step": 828
},
{
"epoch": 1.0629011553273426,
"grad_norm": 0.3567776381969452,
"learning_rate": 4.313196903666935e-05,
"loss": 2.2394,
"step": 829
},
{
"epoch": 1.0641848523748396,
"grad_norm": 0.3673252463340759,
"learning_rate": 4.3114031101743625e-05,
"loss": 2.2388,
"step": 830
},
{
"epoch": 1.0654685494223364,
"grad_norm": 0.327640175819397,
"learning_rate": 4.309607351276409e-05,
"loss": 2.2847,
"step": 831
},
{
"epoch": 1.0667522464698331,
"grad_norm": 0.35278892517089844,
"learning_rate": 4.307809628921505e-05,
"loss": 2.1972,
"step": 832
},
{
"epoch": 1.0680359435173299,
"grad_norm": 0.36460593342781067,
"learning_rate": 4.306009945060212e-05,
"loss": 2.1976,
"step": 833
},
{
"epoch": 1.0693196405648266,
"grad_norm": 0.38041952252388,
"learning_rate": 4.3042083016452205e-05,
"loss": 2.2009,
"step": 834
},
{
"epoch": 1.0706033376123234,
"grad_norm": 0.3453562259674072,
"learning_rate": 4.302404700631346e-05,
"loss": 2.2788,
"step": 835
},
{
"epoch": 1.0718870346598204,
"grad_norm": 0.3455585539340973,
"learning_rate": 4.300599143975529e-05,
"loss": 2.2418,
"step": 836
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.3436455726623535,
"learning_rate": 4.298791633636831e-05,
"loss": 2.2121,
"step": 837
},
{
"epoch": 1.074454428754814,
"grad_norm": 0.35627031326293945,
"learning_rate": 4.2969821715764335e-05,
"loss": 2.255,
"step": 838
},
{
"epoch": 1.0757381258023107,
"grad_norm": 0.3669643998146057,
"learning_rate": 4.2951707597576354e-05,
"loss": 2.2815,
"step": 839
},
{
"epoch": 1.0770218228498074,
"grad_norm": 0.40819859504699707,
"learning_rate": 4.2933574001458524e-05,
"loss": 2.2359,
"step": 840
},
{
"epoch": 1.0783055198973042,
"grad_norm": 0.37338659167289734,
"learning_rate": 4.291542094708612e-05,
"loss": 2.1824,
"step": 841
},
{
"epoch": 1.0795892169448011,
"grad_norm": 0.33379244804382324,
"learning_rate": 4.2897248454155544e-05,
"loss": 2.2073,
"step": 842
},
{
"epoch": 1.080872913992298,
"grad_norm": 0.3620593845844269,
"learning_rate": 4.2879056542384274e-05,
"loss": 2.2447,
"step": 843
},
{
"epoch": 1.0821566110397947,
"grad_norm": 0.3987523913383484,
"learning_rate": 4.2860845231510873e-05,
"loss": 2.2799,
"step": 844
},
{
"epoch": 1.0834403080872914,
"grad_norm": 0.3315846621990204,
"learning_rate": 4.2842614541294935e-05,
"loss": 2.2696,
"step": 845
},
{
"epoch": 1.0847240051347882,
"grad_norm": 0.3277621865272522,
"learning_rate": 4.282436449151711e-05,
"loss": 2.201,
"step": 846
},
{
"epoch": 1.086007702182285,
"grad_norm": 0.3634556233882904,
"learning_rate": 4.2806095101979016e-05,
"loss": 2.2588,
"step": 847
},
{
"epoch": 1.0872913992297817,
"grad_norm": 0.3708522915840149,
"learning_rate": 4.278780639250328e-05,
"loss": 2.2732,
"step": 848
},
{
"epoch": 1.0885750962772787,
"grad_norm": 0.34317970275878906,
"learning_rate": 4.2769498382933484e-05,
"loss": 2.2507,
"step": 849
},
{
"epoch": 1.0898587933247754,
"grad_norm": 0.3626539707183838,
"learning_rate": 4.275117109313415e-05,
"loss": 2.2375,
"step": 850
},
{
"epoch": 1.0911424903722722,
"grad_norm": 0.34370511770248413,
"learning_rate": 4.273282454299073e-05,
"loss": 2.3254,
"step": 851
},
{
"epoch": 1.092426187419769,
"grad_norm": 0.3586619198322296,
"learning_rate": 4.271445875240956e-05,
"loss": 2.2233,
"step": 852
},
{
"epoch": 1.0937098844672657,
"grad_norm": 0.37432125210762024,
"learning_rate": 4.2696073741317854e-05,
"loss": 2.265,
"step": 853
},
{
"epoch": 1.0949935815147624,
"grad_norm": 0.3803035616874695,
"learning_rate": 4.267766952966369e-05,
"loss": 2.2271,
"step": 854
},
{
"epoch": 1.0962772785622592,
"grad_norm": 0.3703853189945221,
"learning_rate": 4.265924613741596e-05,
"loss": 2.2264,
"step": 855
},
{
"epoch": 1.0975609756097562,
"grad_norm": 0.32208389043807983,
"learning_rate": 4.264080358456441e-05,
"loss": 2.1602,
"step": 856
},
{
"epoch": 1.098844672657253,
"grad_norm": 0.34176668524742126,
"learning_rate": 4.2622341891119514e-05,
"loss": 2.2398,
"step": 857
},
{
"epoch": 1.1001283697047497,
"grad_norm": 0.3376818895339966,
"learning_rate": 4.260386107711256e-05,
"loss": 2.1697,
"step": 858
},
{
"epoch": 1.1014120667522465,
"grad_norm": 0.33973339200019836,
"learning_rate": 4.258536116259557e-05,
"loss": 2.2835,
"step": 859
},
{
"epoch": 1.1026957637997432,
"grad_norm": 0.3520197570323944,
"learning_rate": 4.256684216764127e-05,
"loss": 2.2117,
"step": 860
},
{
"epoch": 1.10397946084724,
"grad_norm": 0.36753150820732117,
"learning_rate": 4.2548304112343143e-05,
"loss": 2.1906,
"step": 861
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.40462440252304077,
"learning_rate": 4.252974701681528e-05,
"loss": 2.2117,
"step": 862
},
{
"epoch": 1.1065468549422337,
"grad_norm": 0.36666756868362427,
"learning_rate": 4.251117090119249e-05,
"loss": 2.3043,
"step": 863
},
{
"epoch": 1.1078305519897305,
"grad_norm": 0.34417638182640076,
"learning_rate": 4.249257578563019e-05,
"loss": 2.2083,
"step": 864
},
{
"epoch": 1.1091142490372272,
"grad_norm": 0.35194942355155945,
"learning_rate": 4.2473961690304415e-05,
"loss": 2.2115,
"step": 865
},
{
"epoch": 1.110397946084724,
"grad_norm": 0.33601894974708557,
"learning_rate": 4.24553286354118e-05,
"loss": 2.2619,
"step": 866
},
{
"epoch": 1.1116816431322207,
"grad_norm": 0.359961599111557,
"learning_rate": 4.243667664116956e-05,
"loss": 2.2579,
"step": 867
},
{
"epoch": 1.1129653401797175,
"grad_norm": 0.34904807806015015,
"learning_rate": 4.241800572781543e-05,
"loss": 2.2111,
"step": 868
},
{
"epoch": 1.1142490372272145,
"grad_norm": 0.3385773003101349,
"learning_rate": 4.2399315915607705e-05,
"loss": 2.2261,
"step": 869
},
{
"epoch": 1.1155327342747112,
"grad_norm": 0.387919157743454,
"learning_rate": 4.238060722482517e-05,
"loss": 2.2792,
"step": 870
},
{
"epoch": 1.116816431322208,
"grad_norm": 0.34467244148254395,
"learning_rate": 4.2361879675767094e-05,
"loss": 2.2175,
"step": 871
},
{
"epoch": 1.1181001283697047,
"grad_norm": 0.3811795115470886,
"learning_rate": 4.2343133288753194e-05,
"loss": 2.1043,
"step": 872
},
{
"epoch": 1.1193838254172015,
"grad_norm": 0.35866042971611023,
"learning_rate": 4.232436808412367e-05,
"loss": 2.1949,
"step": 873
},
{
"epoch": 1.1206675224646983,
"grad_norm": 0.3329986035823822,
"learning_rate": 4.230558408223909e-05,
"loss": 2.2682,
"step": 874
},
{
"epoch": 1.1219512195121952,
"grad_norm": 0.3897240459918976,
"learning_rate": 4.228678130348045e-05,
"loss": 2.1858,
"step": 875
},
{
"epoch": 1.123234916559692,
"grad_norm": 0.3876970112323761,
"learning_rate": 4.22679597682491e-05,
"loss": 2.2581,
"step": 876
},
{
"epoch": 1.1245186136071887,
"grad_norm": 0.3848673403263092,
"learning_rate": 4.2249119496966746e-05,
"loss": 2.2538,
"step": 877
},
{
"epoch": 1.1258023106546855,
"grad_norm": 0.35312554240226746,
"learning_rate": 4.223026051007544e-05,
"loss": 2.1958,
"step": 878
},
{
"epoch": 1.1270860077021823,
"grad_norm": 0.40287381410598755,
"learning_rate": 4.2211382828037506e-05,
"loss": 2.2334,
"step": 879
},
{
"epoch": 1.128369704749679,
"grad_norm": 0.3706461787223816,
"learning_rate": 4.2192486471335585e-05,
"loss": 2.1225,
"step": 880
},
{
"epoch": 1.1296534017971758,
"grad_norm": 0.36967286467552185,
"learning_rate": 4.2173571460472575e-05,
"loss": 2.2301,
"step": 881
},
{
"epoch": 1.1309370988446728,
"grad_norm": 0.3459497392177582,
"learning_rate": 4.215463781597159e-05,
"loss": 2.2297,
"step": 882
},
{
"epoch": 1.1322207958921695,
"grad_norm": 0.3627232313156128,
"learning_rate": 4.2135685558375994e-05,
"loss": 2.179,
"step": 883
},
{
"epoch": 1.1335044929396663,
"grad_norm": 0.38214111328125,
"learning_rate": 4.211671470824933e-05,
"loss": 2.3036,
"step": 884
},
{
"epoch": 1.134788189987163,
"grad_norm": 0.35911989212036133,
"learning_rate": 4.209772528617531e-05,
"loss": 2.1972,
"step": 885
},
{
"epoch": 1.1360718870346598,
"grad_norm": 0.35253652930259705,
"learning_rate": 4.207871731275781e-05,
"loss": 2.2036,
"step": 886
},
{
"epoch": 1.1373555840821565,
"grad_norm": 0.36945071816444397,
"learning_rate": 4.205969080862082e-05,
"loss": 2.2582,
"step": 887
},
{
"epoch": 1.1386392811296533,
"grad_norm": 0.36074042320251465,
"learning_rate": 4.2040645794408445e-05,
"loss": 2.2153,
"step": 888
},
{
"epoch": 1.1399229781771503,
"grad_norm": 0.34374964237213135,
"learning_rate": 4.202158229078488e-05,
"loss": 2.2856,
"step": 889
},
{
"epoch": 1.141206675224647,
"grad_norm": 0.3783753514289856,
"learning_rate": 4.200250031843436e-05,
"loss": 2.1796,
"step": 890
},
{
"epoch": 1.1424903722721438,
"grad_norm": 0.4086593687534332,
"learning_rate": 4.1983399898061186e-05,
"loss": 2.2755,
"step": 891
},
{
"epoch": 1.1437740693196405,
"grad_norm": 0.3416132926940918,
"learning_rate": 4.196428105038966e-05,
"loss": 2.2646,
"step": 892
},
{
"epoch": 1.1450577663671373,
"grad_norm": 0.359719455242157,
"learning_rate": 4.1945143796164076e-05,
"loss": 2.2193,
"step": 893
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.4014301598072052,
"learning_rate": 4.19259881561487e-05,
"loss": 2.2917,
"step": 894
},
{
"epoch": 1.147625160462131,
"grad_norm": 0.342040091753006,
"learning_rate": 4.190681415112776e-05,
"loss": 2.2541,
"step": 895
},
{
"epoch": 1.1489088575096278,
"grad_norm": 0.3607075810432434,
"learning_rate": 4.1887621801905396e-05,
"loss": 2.2114,
"step": 896
},
{
"epoch": 1.1501925545571245,
"grad_norm": 0.34826815128326416,
"learning_rate": 4.186841112930565e-05,
"loss": 2.1932,
"step": 897
},
{
"epoch": 1.1514762516046213,
"grad_norm": 0.36598455905914307,
"learning_rate": 4.1849182154172475e-05,
"loss": 2.2374,
"step": 898
},
{
"epoch": 1.152759948652118,
"grad_norm": 0.34205228090286255,
"learning_rate": 4.1829934897369625e-05,
"loss": 2.1307,
"step": 899
},
{
"epoch": 1.1540436456996148,
"grad_norm": 0.3688623607158661,
"learning_rate": 4.181066937978075e-05,
"loss": 2.2279,
"step": 900
},
{
"epoch": 1.1553273427471118,
"grad_norm": 0.39433860778808594,
"learning_rate": 4.179138562230928e-05,
"loss": 2.2676,
"step": 901
},
{
"epoch": 1.1566110397946086,
"grad_norm": 0.3450692892074585,
"learning_rate": 4.1772083645878444e-05,
"loss": 2.2221,
"step": 902
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.3846975266933441,
"learning_rate": 4.175276347143124e-05,
"loss": 2.2659,
"step": 903
},
{
"epoch": 1.159178433889602,
"grad_norm": 0.3393380045890808,
"learning_rate": 4.17334251199304e-05,
"loss": 2.2868,
"step": 904
},
{
"epoch": 1.1604621309370988,
"grad_norm": 0.35884889960289,
"learning_rate": 4.17140686123584e-05,
"loss": 2.2106,
"step": 905
},
{
"epoch": 1.1617458279845956,
"grad_norm": 0.3687424063682556,
"learning_rate": 4.169469396971739e-05,
"loss": 2.2195,
"step": 906
},
{
"epoch": 1.1630295250320923,
"grad_norm": 0.358381450176239,
"learning_rate": 4.167530121302923e-05,
"loss": 2.2163,
"step": 907
},
{
"epoch": 1.1643132220795893,
"grad_norm": 0.3487226366996765,
"learning_rate": 4.1655890363335394e-05,
"loss": 2.2896,
"step": 908
},
{
"epoch": 1.165596919127086,
"grad_norm": 0.34116798639297485,
"learning_rate": 4.163646144169702e-05,
"loss": 2.2171,
"step": 909
},
{
"epoch": 1.1668806161745828,
"grad_norm": 0.34055590629577637,
"learning_rate": 4.1617014469194836e-05,
"loss": 2.242,
"step": 910
},
{
"epoch": 1.1681643132220796,
"grad_norm": 0.39334848523139954,
"learning_rate": 4.159754946692916e-05,
"loss": 2.1213,
"step": 911
},
{
"epoch": 1.1694480102695763,
"grad_norm": 0.3823232352733612,
"learning_rate": 4.157806645601988e-05,
"loss": 2.1674,
"step": 912
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.3423396050930023,
"learning_rate": 4.1558565457606415e-05,
"loss": 2.2422,
"step": 913
},
{
"epoch": 1.1720154043645699,
"grad_norm": 0.3720071017742157,
"learning_rate": 4.153904649284771e-05,
"loss": 2.1096,
"step": 914
},
{
"epoch": 1.1732991014120668,
"grad_norm": 0.34880882501602173,
"learning_rate": 4.151950958292219e-05,
"loss": 2.2199,
"step": 915
},
{
"epoch": 1.1745827984595636,
"grad_norm": 0.3538772165775299,
"learning_rate": 4.149995474902776e-05,
"loss": 2.2033,
"step": 916
},
{
"epoch": 1.1758664955070603,
"grad_norm": 0.36488077044487,
"learning_rate": 4.1480382012381766e-05,
"loss": 2.1818,
"step": 917
},
{
"epoch": 1.177150192554557,
"grad_norm": 0.3882789611816406,
"learning_rate": 4.1460791394221e-05,
"loss": 2.2456,
"step": 918
},
{
"epoch": 1.1784338896020539,
"grad_norm": 0.4123387038707733,
"learning_rate": 4.144118291580161e-05,
"loss": 2.2155,
"step": 919
},
{
"epoch": 1.1797175866495506,
"grad_norm": 0.3578166663646698,
"learning_rate": 4.142155659839919e-05,
"loss": 2.3112,
"step": 920
},
{
"epoch": 1.1810012836970474,
"grad_norm": 0.3611357510089874,
"learning_rate": 4.1401912463308625e-05,
"loss": 2.1915,
"step": 921
},
{
"epoch": 1.1822849807445444,
"grad_norm": 0.3884929120540619,
"learning_rate": 4.138225053184417e-05,
"loss": 2.2782,
"step": 922
},
{
"epoch": 1.1835686777920411,
"grad_norm": 0.35503101348876953,
"learning_rate": 4.136257082533938e-05,
"loss": 2.2083,
"step": 923
},
{
"epoch": 1.1848523748395379,
"grad_norm": 0.3408321440219879,
"learning_rate": 4.1342873365147074e-05,
"loss": 2.3104,
"step": 924
},
{
"epoch": 1.1861360718870346,
"grad_norm": 0.3557950556278229,
"learning_rate": 4.1323158172639385e-05,
"loss": 2.1643,
"step": 925
},
{
"epoch": 1.1874197689345314,
"grad_norm": 0.37379875779151917,
"learning_rate": 4.130342526920765e-05,
"loss": 2.2973,
"step": 926
},
{
"epoch": 1.1887034659820284,
"grad_norm": 0.37019041180610657,
"learning_rate": 4.128367467626241e-05,
"loss": 2.1679,
"step": 927
},
{
"epoch": 1.1899871630295251,
"grad_norm": 0.3340079188346863,
"learning_rate": 4.126390641523344e-05,
"loss": 2.2195,
"step": 928
},
{
"epoch": 1.1912708600770219,
"grad_norm": 0.39198464155197144,
"learning_rate": 4.124412050756965e-05,
"loss": 2.2736,
"step": 929
},
{
"epoch": 1.1925545571245186,
"grad_norm": 0.37619346380233765,
"learning_rate": 4.122431697473912e-05,
"loss": 2.2903,
"step": 930
},
{
"epoch": 1.1938382541720154,
"grad_norm": 0.3682747781276703,
"learning_rate": 4.120449583822904e-05,
"loss": 2.1202,
"step": 931
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.37576183676719666,
"learning_rate": 4.118465711954569e-05,
"loss": 2.2263,
"step": 932
},
{
"epoch": 1.196405648267009,
"grad_norm": 0.3458687663078308,
"learning_rate": 4.116480084021447e-05,
"loss": 2.2303,
"step": 933
},
{
"epoch": 1.1976893453145059,
"grad_norm": 0.35645976662635803,
"learning_rate": 4.114492702177978e-05,
"loss": 2.3105,
"step": 934
},
{
"epoch": 1.1989730423620026,
"grad_norm": 0.3832726776599884,
"learning_rate": 4.1125035685805084e-05,
"loss": 2.2087,
"step": 935
},
{
"epoch": 1.2002567394094994,
"grad_norm": 0.3578638732433319,
"learning_rate": 4.1105126853872845e-05,
"loss": 2.2223,
"step": 936
},
{
"epoch": 1.2015404364569962,
"grad_norm": 0.3332241177558899,
"learning_rate": 4.108520054758451e-05,
"loss": 2.2631,
"step": 937
},
{
"epoch": 1.202824133504493,
"grad_norm": 0.35905030369758606,
"learning_rate": 4.106525678856049e-05,
"loss": 2.2027,
"step": 938
},
{
"epoch": 1.2041078305519897,
"grad_norm": 0.3650401532649994,
"learning_rate": 4.1045295598440105e-05,
"loss": 2.2894,
"step": 939
},
{
"epoch": 1.2053915275994864,
"grad_norm": 0.38491788506507874,
"learning_rate": 4.102531699888163e-05,
"loss": 2.2929,
"step": 940
},
{
"epoch": 1.2066752246469834,
"grad_norm": 0.380605012178421,
"learning_rate": 4.1005321011562206e-05,
"loss": 2.2567,
"step": 941
},
{
"epoch": 1.2079589216944802,
"grad_norm": 0.3601861894130707,
"learning_rate": 4.098530765817784e-05,
"loss": 2.0563,
"step": 942
},
{
"epoch": 1.209242618741977,
"grad_norm": 0.3581317663192749,
"learning_rate": 4.0965276960443374e-05,
"loss": 2.2161,
"step": 943
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.3650892972946167,
"learning_rate": 4.094522894009251e-05,
"loss": 2.2823,
"step": 944
},
{
"epoch": 1.2118100128369704,
"grad_norm": 0.3575249910354614,
"learning_rate": 4.0925163618877695e-05,
"loss": 2.2203,
"step": 945
},
{
"epoch": 1.2130937098844672,
"grad_norm": 0.38429054617881775,
"learning_rate": 4.090508101857017e-05,
"loss": 2.2489,
"step": 946
},
{
"epoch": 1.214377406931964,
"grad_norm": 0.3500010073184967,
"learning_rate": 4.088498116095991e-05,
"loss": 2.2303,
"step": 947
},
{
"epoch": 1.215661103979461,
"grad_norm": 0.36866387724876404,
"learning_rate": 4.086486406785566e-05,
"loss": 2.2372,
"step": 948
},
{
"epoch": 1.2169448010269577,
"grad_norm": 0.3696565330028534,
"learning_rate": 4.08447297610848e-05,
"loss": 2.2177,
"step": 949
},
{
"epoch": 1.2182284980744544,
"grad_norm": 0.37652555108070374,
"learning_rate": 4.082457826249343e-05,
"loss": 2.2195,
"step": 950
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.3684236407279968,
"learning_rate": 4.08044095939463e-05,
"loss": 2.1949,
"step": 951
},
{
"epoch": 1.220795892169448,
"grad_norm": 0.41459155082702637,
"learning_rate": 4.078422377732678e-05,
"loss": 2.2588,
"step": 952
},
{
"epoch": 1.2220795892169447,
"grad_norm": 0.3802269995212555,
"learning_rate": 4.0764020834536846e-05,
"loss": 2.1484,
"step": 953
},
{
"epoch": 1.2233632862644417,
"grad_norm": 0.38021379709243774,
"learning_rate": 4.074380078749707e-05,
"loss": 2.2062,
"step": 954
},
{
"epoch": 1.2246469833119384,
"grad_norm": 0.3608596622943878,
"learning_rate": 4.072356365814655e-05,
"loss": 2.1648,
"step": 955
},
{
"epoch": 1.2259306803594352,
"grad_norm": 0.36179718375205994,
"learning_rate": 4.070330946844295e-05,
"loss": 2.2361,
"step": 956
},
{
"epoch": 1.227214377406932,
"grad_norm": 0.39312490820884705,
"learning_rate": 4.068303824036244e-05,
"loss": 2.3051,
"step": 957
},
{
"epoch": 1.2284980744544287,
"grad_norm": 0.3528019189834595,
"learning_rate": 4.0662749995899666e-05,
"loss": 2.2951,
"step": 958
},
{
"epoch": 1.2297817715019255,
"grad_norm": 0.3852112293243408,
"learning_rate": 4.064244475706776e-05,
"loss": 2.3484,
"step": 959
},
{
"epoch": 1.2310654685494224,
"grad_norm": 0.3318076431751251,
"learning_rate": 4.0622122545898246e-05,
"loss": 2.2845,
"step": 960
},
{
"epoch": 1.2323491655969192,
"grad_norm": 0.3624611794948578,
"learning_rate": 4.0601783384441104e-05,
"loss": 2.1966,
"step": 961
},
{
"epoch": 1.233632862644416,
"grad_norm": 0.4128612279891968,
"learning_rate": 4.058142729476471e-05,
"loss": 2.2521,
"step": 962
},
{
"epoch": 1.2349165596919127,
"grad_norm": 0.3708478510379791,
"learning_rate": 4.056105429895578e-05,
"loss": 2.1304,
"step": 963
},
{
"epoch": 1.2362002567394095,
"grad_norm": 0.36903417110443115,
"learning_rate": 4.054066441911939e-05,
"loss": 2.1398,
"step": 964
},
{
"epoch": 1.2374839537869062,
"grad_norm": 0.3842657208442688,
"learning_rate": 4.052025767737893e-05,
"loss": 2.2423,
"step": 965
},
{
"epoch": 1.238767650834403,
"grad_norm": 0.34361815452575684,
"learning_rate": 4.049983409587608e-05,
"loss": 2.201,
"step": 966
},
{
"epoch": 1.2400513478819,
"grad_norm": 0.3871995508670807,
"learning_rate": 4.0479393696770805e-05,
"loss": 2.2816,
"step": 967
},
{
"epoch": 1.2413350449293967,
"grad_norm": 0.4010753333568573,
"learning_rate": 4.045893650224132e-05,
"loss": 2.1793,
"step": 968
},
{
"epoch": 1.2426187419768935,
"grad_norm": 0.364837110042572,
"learning_rate": 4.043846253448403e-05,
"loss": 2.235,
"step": 969
},
{
"epoch": 1.2439024390243902,
"grad_norm": 0.39040201902389526,
"learning_rate": 4.0417971815713584e-05,
"loss": 2.1954,
"step": 970
},
{
"epoch": 1.245186136071887,
"grad_norm": 0.3791297674179077,
"learning_rate": 4.039746436816277e-05,
"loss": 2.258,
"step": 971
},
{
"epoch": 1.2464698331193838,
"grad_norm": 0.37748801708221436,
"learning_rate": 4.037694021408255e-05,
"loss": 2.2057,
"step": 972
},
{
"epoch": 1.2477535301668805,
"grad_norm": 0.348192036151886,
"learning_rate": 4.0356399375742e-05,
"loss": 2.2646,
"step": 973
},
{
"epoch": 1.2490372272143775,
"grad_norm": 0.34801027178764343,
"learning_rate": 4.0335841875428324e-05,
"loss": 2.2689,
"step": 974
},
{
"epoch": 1.2503209242618742,
"grad_norm": 0.3744896352291107,
"learning_rate": 4.031526773544676e-05,
"loss": 2.2559,
"step": 975
},
{
"epoch": 1.251604621309371,
"grad_norm": 0.36844202876091003,
"learning_rate": 4.029467697812064e-05,
"loss": 2.2393,
"step": 976
},
{
"epoch": 1.2528883183568678,
"grad_norm": 0.3810536861419678,
"learning_rate": 4.0274069625791285e-05,
"loss": 2.2695,
"step": 977
},
{
"epoch": 1.2541720154043645,
"grad_norm": 0.3671133816242218,
"learning_rate": 4.0253445700818086e-05,
"loss": 2.1974,
"step": 978
},
{
"epoch": 1.2554557124518615,
"grad_norm": 0.3648216724395752,
"learning_rate": 4.023280522557836e-05,
"loss": 2.1598,
"step": 979
},
{
"epoch": 1.256739409499358,
"grad_norm": 0.3690861463546753,
"learning_rate": 4.021214822246739e-05,
"loss": 2.253,
"step": 980
},
{
"epoch": 1.258023106546855,
"grad_norm": 0.3612317442893982,
"learning_rate": 4.019147471389841e-05,
"loss": 2.1809,
"step": 981
},
{
"epoch": 1.2593068035943518,
"grad_norm": 0.3894643187522888,
"learning_rate": 4.017078472230255e-05,
"loss": 2.2193,
"step": 982
},
{
"epoch": 1.2605905006418485,
"grad_norm": 0.38447320461273193,
"learning_rate": 4.015007827012883e-05,
"loss": 2.2449,
"step": 983
},
{
"epoch": 1.2618741976893453,
"grad_norm": 0.41027238965034485,
"learning_rate": 4.012935537984414e-05,
"loss": 2.1993,
"step": 984
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.3948719799518585,
"learning_rate": 4.0108616073933164e-05,
"loss": 2.2761,
"step": 985
},
{
"epoch": 1.264441591784339,
"grad_norm": 0.3726814091205597,
"learning_rate": 4.0087860374898446e-05,
"loss": 2.2068,
"step": 986
},
{
"epoch": 1.2657252888318355,
"grad_norm": 0.3911172151565552,
"learning_rate": 4.00670883052603e-05,
"loss": 2.1476,
"step": 987
},
{
"epoch": 1.2670089858793325,
"grad_norm": 0.39758938550949097,
"learning_rate": 4.00462998875568e-05,
"loss": 2.1984,
"step": 988
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.3571755588054657,
"learning_rate": 4.002549514434375e-05,
"loss": 2.1852,
"step": 989
},
{
"epoch": 1.269576379974326,
"grad_norm": 0.3718327283859253,
"learning_rate": 4.0004674098194675e-05,
"loss": 2.2064,
"step": 990
},
{
"epoch": 1.2708600770218228,
"grad_norm": 0.35757002234458923,
"learning_rate": 3.99838367717008e-05,
"loss": 2.2122,
"step": 991
},
{
"epoch": 1.2721437740693196,
"grad_norm": 0.3949007987976074,
"learning_rate": 3.9962983187470994e-05,
"loss": 2.2586,
"step": 992
},
{
"epoch": 1.2734274711168165,
"grad_norm": 0.3487754166126251,
"learning_rate": 3.994211336813178e-05,
"loss": 2.2183,
"step": 993
},
{
"epoch": 1.2747111681643133,
"grad_norm": 0.36431553959846497,
"learning_rate": 3.9921227336327293e-05,
"loss": 2.1891,
"step": 994
},
{
"epoch": 1.27599486521181,
"grad_norm": 0.36330950260162354,
"learning_rate": 3.990032511471926e-05,
"loss": 2.2732,
"step": 995
},
{
"epoch": 1.2772785622593068,
"grad_norm": 0.3715732991695404,
"learning_rate": 3.9879406725986965e-05,
"loss": 2.1761,
"step": 996
},
{
"epoch": 1.2785622593068036,
"grad_norm": 0.37331831455230713,
"learning_rate": 3.985847219282725e-05,
"loss": 2.3297,
"step": 997
},
{
"epoch": 1.2798459563543003,
"grad_norm": 0.36636462807655334,
"learning_rate": 3.983752153795445e-05,
"loss": 2.1766,
"step": 998
},
{
"epoch": 1.281129653401797,
"grad_norm": 0.3827669620513916,
"learning_rate": 3.981655478410043e-05,
"loss": 2.2023,
"step": 999
},
{
"epoch": 1.282413350449294,
"grad_norm": 0.35590195655822754,
"learning_rate": 3.979557195401447e-05,
"loss": 2.2212,
"step": 1000
},
{
"epoch": 1.2836970474967908,
"grad_norm": 0.35328447818756104,
"learning_rate": 3.977457307046335e-05,
"loss": 2.2325,
"step": 1001
},
{
"epoch": 1.2849807445442876,
"grad_norm": 0.3608861267566681,
"learning_rate": 3.975355815623124e-05,
"loss": 2.2387,
"step": 1002
},
{
"epoch": 1.2862644415917843,
"grad_norm": 0.3597520887851715,
"learning_rate": 3.973252723411969e-05,
"loss": 2.2482,
"step": 1003
},
{
"epoch": 1.287548138639281,
"grad_norm": 0.35393795371055603,
"learning_rate": 3.9711480326947636e-05,
"loss": 2.1458,
"step": 1004
},
{
"epoch": 1.2888318356867778,
"grad_norm": 0.33810925483703613,
"learning_rate": 3.9690417457551364e-05,
"loss": 2.2283,
"step": 1005
},
{
"epoch": 1.2901155327342746,
"grad_norm": 0.38078486919403076,
"learning_rate": 3.966933864878446e-05,
"loss": 2.3029,
"step": 1006
},
{
"epoch": 1.2913992297817716,
"grad_norm": 0.3521521985530853,
"learning_rate": 3.9648243923517836e-05,
"loss": 2.211,
"step": 1007
},
{
"epoch": 1.2926829268292683,
"grad_norm": 0.37834882736206055,
"learning_rate": 3.962713330463963e-05,
"loss": 2.2147,
"step": 1008
},
{
"epoch": 1.293966623876765,
"grad_norm": 0.3604402244091034,
"learning_rate": 3.960600681505526e-05,
"loss": 2.2126,
"step": 1009
},
{
"epoch": 1.2952503209242618,
"grad_norm": 0.3602832555770874,
"learning_rate": 3.958486447768736e-05,
"loss": 2.2646,
"step": 1010
},
{
"epoch": 1.2965340179717586,
"grad_norm": 0.368082731962204,
"learning_rate": 3.9563706315475726e-05,
"loss": 2.235,
"step": 1011
},
{
"epoch": 1.2978177150192556,
"grad_norm": 0.3699602484703064,
"learning_rate": 3.954253235137737e-05,
"loss": 2.1595,
"step": 1012
},
{
"epoch": 1.2991014120667521,
"grad_norm": 0.3776682913303375,
"learning_rate": 3.9521342608366424e-05,
"loss": 2.1841,
"step": 1013
},
{
"epoch": 1.300385109114249,
"grad_norm": 0.40402621030807495,
"learning_rate": 3.950013710943415e-05,
"loss": 2.3433,
"step": 1014
},
{
"epoch": 1.3016688061617459,
"grad_norm": 0.3616265058517456,
"learning_rate": 3.9478915877588895e-05,
"loss": 2.1792,
"step": 1015
},
{
"epoch": 1.3029525032092426,
"grad_norm": 0.370483934879303,
"learning_rate": 3.945767893585608e-05,
"loss": 2.2353,
"step": 1016
},
{
"epoch": 1.3042362002567394,
"grad_norm": 0.35736027359962463,
"learning_rate": 3.9436426307278175e-05,
"loss": 2.1984,
"step": 1017
},
{
"epoch": 1.3055198973042361,
"grad_norm": 0.35935720801353455,
"learning_rate": 3.941515801491468e-05,
"loss": 2.1947,
"step": 1018
},
{
"epoch": 1.306803594351733,
"grad_norm": 0.37101173400878906,
"learning_rate": 3.9393874081842056e-05,
"loss": 2.2611,
"step": 1019
},
{
"epoch": 1.3080872913992299,
"grad_norm": 0.35435688495635986,
"learning_rate": 3.937257453115378e-05,
"loss": 2.316,
"step": 1020
},
{
"epoch": 1.3093709884467266,
"grad_norm": 0.3522638976573944,
"learning_rate": 3.935125938596026e-05,
"loss": 2.2335,
"step": 1021
},
{
"epoch": 1.3106546854942234,
"grad_norm": 0.36578962206840515,
"learning_rate": 3.932992866938878e-05,
"loss": 2.1806,
"step": 1022
},
{
"epoch": 1.3119383825417201,
"grad_norm": 0.3987962007522583,
"learning_rate": 3.93085824045836e-05,
"loss": 2.2901,
"step": 1023
},
{
"epoch": 1.3132220795892169,
"grad_norm": 0.35971546173095703,
"learning_rate": 3.928722061470577e-05,
"loss": 2.2237,
"step": 1024
},
{
"epoch": 1.3145057766367136,
"grad_norm": 0.3714924454689026,
"learning_rate": 3.9265843322933246e-05,
"loss": 2.1631,
"step": 1025
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.3943617343902588,
"learning_rate": 3.924445055246076e-05,
"loss": 2.192,
"step": 1026
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.3856651186943054,
"learning_rate": 3.9223042326499876e-05,
"loss": 2.2358,
"step": 1027
},
{
"epoch": 1.3183568677792041,
"grad_norm": 0.37776678800582886,
"learning_rate": 3.920161866827889e-05,
"loss": 2.2994,
"step": 1028
},
{
"epoch": 1.319640564826701,
"grad_norm": 0.3672482371330261,
"learning_rate": 3.918017960104289e-05,
"loss": 2.2111,
"step": 1029
},
{
"epoch": 1.3209242618741976,
"grad_norm": 0.3800339698791504,
"learning_rate": 3.9158725148053624e-05,
"loss": 2.2559,
"step": 1030
},
{
"epoch": 1.3222079589216944,
"grad_norm": 0.3875967264175415,
"learning_rate": 3.913725533258958e-05,
"loss": 2.2545,
"step": 1031
},
{
"epoch": 1.3234916559691912,
"grad_norm": 0.35976120829582214,
"learning_rate": 3.91157701779459e-05,
"loss": 2.2463,
"step": 1032
},
{
"epoch": 1.3247753530166881,
"grad_norm": 0.36237773299217224,
"learning_rate": 3.909426970743435e-05,
"loss": 2.1812,
"step": 1033
},
{
"epoch": 1.326059050064185,
"grad_norm": 0.4001487195491791,
"learning_rate": 3.907275394438335e-05,
"loss": 2.2625,
"step": 1034
},
{
"epoch": 1.3273427471116817,
"grad_norm": 0.3896241784095764,
"learning_rate": 3.905122291213791e-05,
"loss": 2.2755,
"step": 1035
},
{
"epoch": 1.3286264441591784,
"grad_norm": 0.35467684268951416,
"learning_rate": 3.902967663405956e-05,
"loss": 2.2866,
"step": 1036
},
{
"epoch": 1.3299101412066752,
"grad_norm": 0.3871800899505615,
"learning_rate": 3.900811513352642e-05,
"loss": 2.2688,
"step": 1037
},
{
"epoch": 1.3311938382541721,
"grad_norm": 0.37341922521591187,
"learning_rate": 3.8986538433933116e-05,
"loss": 2.3456,
"step": 1038
},
{
"epoch": 1.3324775353016687,
"grad_norm": 0.3812583088874817,
"learning_rate": 3.896494655869074e-05,
"loss": 2.2648,
"step": 1039
},
{
"epoch": 1.3337612323491657,
"grad_norm": 0.3752239942550659,
"learning_rate": 3.8943339531226895e-05,
"loss": 2.2425,
"step": 1040
},
{
"epoch": 1.3350449293966624,
"grad_norm": 0.40153515338897705,
"learning_rate": 3.892171737498558e-05,
"loss": 2.2276,
"step": 1041
},
{
"epoch": 1.3363286264441592,
"grad_norm": 0.3683505952358246,
"learning_rate": 3.890008011342725e-05,
"loss": 2.2163,
"step": 1042
},
{
"epoch": 1.337612323491656,
"grad_norm": 0.38523101806640625,
"learning_rate": 3.8878427770028705e-05,
"loss": 2.2731,
"step": 1043
},
{
"epoch": 1.3388960205391527,
"grad_norm": 0.4340222179889679,
"learning_rate": 3.8856760368283143e-05,
"loss": 2.1979,
"step": 1044
},
{
"epoch": 1.3401797175866497,
"grad_norm": 0.3821227550506592,
"learning_rate": 3.8835077931700085e-05,
"loss": 2.193,
"step": 1045
},
{
"epoch": 1.3414634146341464,
"grad_norm": 0.3737218677997589,
"learning_rate": 3.8813380483805374e-05,
"loss": 2.305,
"step": 1046
},
{
"epoch": 1.3427471116816432,
"grad_norm": 0.3844810724258423,
"learning_rate": 3.879166804814114e-05,
"loss": 2.1998,
"step": 1047
},
{
"epoch": 1.34403080872914,
"grad_norm": 0.43873289227485657,
"learning_rate": 3.876994064826576e-05,
"loss": 2.2845,
"step": 1048
},
{
"epoch": 1.3453145057766367,
"grad_norm": 0.3661579489707947,
"learning_rate": 3.8748198307753874e-05,
"loss": 2.2256,
"step": 1049
},
{
"epoch": 1.3465982028241335,
"grad_norm": 0.3813747763633728,
"learning_rate": 3.8726441050196305e-05,
"loss": 2.1725,
"step": 1050
},
{
"epoch": 1.3478818998716302,
"grad_norm": 0.37020549178123474,
"learning_rate": 3.870466889920008e-05,
"loss": 2.2679,
"step": 1051
},
{
"epoch": 1.3491655969191272,
"grad_norm": 0.39001724123954773,
"learning_rate": 3.8682881878388376e-05,
"loss": 2.3152,
"step": 1052
},
{
"epoch": 1.350449293966624,
"grad_norm": 0.3887459635734558,
"learning_rate": 3.866108001140051e-05,
"loss": 2.2882,
"step": 1053
},
{
"epoch": 1.3517329910141207,
"grad_norm": 0.3768569827079773,
"learning_rate": 3.863926332189191e-05,
"loss": 2.2279,
"step": 1054
},
{
"epoch": 1.3530166880616175,
"grad_norm": 0.35536009073257446,
"learning_rate": 3.8617431833534066e-05,
"loss": 2.1596,
"step": 1055
},
{
"epoch": 1.3543003851091142,
"grad_norm": 0.38662397861480713,
"learning_rate": 3.859558557001456e-05,
"loss": 2.294,
"step": 1056
},
{
"epoch": 1.355584082156611,
"grad_norm": 0.37451159954071045,
"learning_rate": 3.857372455503697e-05,
"loss": 2.1217,
"step": 1057
},
{
"epoch": 1.3568677792041077,
"grad_norm": 0.3878231942653656,
"learning_rate": 3.8551848812320904e-05,
"loss": 2.235,
"step": 1058
},
{
"epoch": 1.3581514762516047,
"grad_norm": 0.4239773154258728,
"learning_rate": 3.8529958365601935e-05,
"loss": 2.2152,
"step": 1059
},
{
"epoch": 1.3594351732991015,
"grad_norm": 0.36272433400154114,
"learning_rate": 3.8508053238631614e-05,
"loss": 2.1909,
"step": 1060
},
{
"epoch": 1.3607188703465982,
"grad_norm": 0.3761455714702606,
"learning_rate": 3.8486133455177374e-05,
"loss": 2.2534,
"step": 1061
},
{
"epoch": 1.362002567394095,
"grad_norm": 0.3687390089035034,
"learning_rate": 3.8464199039022605e-05,
"loss": 2.3551,
"step": 1062
},
{
"epoch": 1.3632862644415917,
"grad_norm": 0.3875104784965515,
"learning_rate": 3.844225001396654e-05,
"loss": 2.1771,
"step": 1063
},
{
"epoch": 1.3645699614890887,
"grad_norm": 0.3739263713359833,
"learning_rate": 3.842028640382427e-05,
"loss": 2.2324,
"step": 1064
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.40333956480026245,
"learning_rate": 3.839830823242672e-05,
"loss": 2.2459,
"step": 1065
},
{
"epoch": 1.3671373555840822,
"grad_norm": 0.3819042146205902,
"learning_rate": 3.8376315523620584e-05,
"loss": 2.1794,
"step": 1066
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.3782418668270111,
"learning_rate": 3.835430830126837e-05,
"loss": 2.2324,
"step": 1067
},
{
"epoch": 1.3697047496790757,
"grad_norm": 0.3628568947315216,
"learning_rate": 3.833228658924831e-05,
"loss": 2.2479,
"step": 1068
},
{
"epoch": 1.3709884467265725,
"grad_norm": 0.3767690658569336,
"learning_rate": 3.8310250411454373e-05,
"loss": 2.2547,
"step": 1069
},
{
"epoch": 1.3722721437740693,
"grad_norm": 0.4039711058139801,
"learning_rate": 3.8288199791796195e-05,
"loss": 2.2212,
"step": 1070
},
{
"epoch": 1.3735558408215662,
"grad_norm": 0.3665675222873688,
"learning_rate": 3.8266134754199114e-05,
"loss": 2.2553,
"step": 1071
},
{
"epoch": 1.3748395378690628,
"grad_norm": 0.3713792860507965,
"learning_rate": 3.82440553226041e-05,
"loss": 2.3088,
"step": 1072
},
{
"epoch": 1.3761232349165597,
"grad_norm": 0.3711341619491577,
"learning_rate": 3.822196152096772e-05,
"loss": 2.2099,
"step": 1073
},
{
"epoch": 1.3774069319640565,
"grad_norm": 0.3817347586154938,
"learning_rate": 3.8199853373262176e-05,
"loss": 2.2119,
"step": 1074
},
{
"epoch": 1.3786906290115533,
"grad_norm": 0.3850735127925873,
"learning_rate": 3.817773090347519e-05,
"loss": 2.2058,
"step": 1075
},
{
"epoch": 1.37997432605905,
"grad_norm": 0.3686407804489136,
"learning_rate": 3.8155594135610064e-05,
"loss": 2.2008,
"step": 1076
},
{
"epoch": 1.3812580231065468,
"grad_norm": 0.3951480984687805,
"learning_rate": 3.8133443093685585e-05,
"loss": 2.2465,
"step": 1077
},
{
"epoch": 1.3825417201540438,
"grad_norm": 0.4176129102706909,
"learning_rate": 3.811127780173602e-05,
"loss": 2.3168,
"step": 1078
},
{
"epoch": 1.3838254172015405,
"grad_norm": 0.3943917453289032,
"learning_rate": 3.808909828381115e-05,
"loss": 2.1437,
"step": 1079
},
{
"epoch": 1.3851091142490373,
"grad_norm": 0.37614941596984863,
"learning_rate": 3.806690456397612e-05,
"loss": 2.217,
"step": 1080
},
{
"epoch": 1.386392811296534,
"grad_norm": 0.38424134254455566,
"learning_rate": 3.804469666631155e-05,
"loss": 2.2226,
"step": 1081
},
{
"epoch": 1.3876765083440308,
"grad_norm": 0.36679860949516296,
"learning_rate": 3.802247461491341e-05,
"loss": 2.2228,
"step": 1082
},
{
"epoch": 1.3889602053915275,
"grad_norm": 0.3983418047428131,
"learning_rate": 3.8000238433893e-05,
"loss": 2.1754,
"step": 1083
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.40835443139076233,
"learning_rate": 3.7977988147377006e-05,
"loss": 2.177,
"step": 1084
},
{
"epoch": 1.3915275994865213,
"grad_norm": 0.3770159184932709,
"learning_rate": 3.79557237795074e-05,
"loss": 2.2697,
"step": 1085
},
{
"epoch": 1.392811296534018,
"grad_norm": 0.3879336416721344,
"learning_rate": 3.793344535444142e-05,
"loss": 2.3518,
"step": 1086
},
{
"epoch": 1.3940949935815148,
"grad_norm": 0.3921049237251282,
"learning_rate": 3.791115289635156e-05,
"loss": 2.0823,
"step": 1087
},
{
"epoch": 1.3953786906290115,
"grad_norm": 0.3612160384654999,
"learning_rate": 3.7888846429425546e-05,
"loss": 2.2694,
"step": 1088
},
{
"epoch": 1.3966623876765083,
"grad_norm": 0.36151912808418274,
"learning_rate": 3.7866525977866296e-05,
"loss": 2.2476,
"step": 1089
},
{
"epoch": 1.397946084724005,
"grad_norm": 0.36083683371543884,
"learning_rate": 3.784419156589192e-05,
"loss": 2.2319,
"step": 1090
},
{
"epoch": 1.3992297817715018,
"grad_norm": 0.3636226952075958,
"learning_rate": 3.782184321773564e-05,
"loss": 2.1486,
"step": 1091
},
{
"epoch": 1.4005134788189988,
"grad_norm": 0.36022767424583435,
"learning_rate": 3.779948095764584e-05,
"loss": 2.2666,
"step": 1092
},
{
"epoch": 1.4017971758664955,
"grad_norm": 0.36462947726249695,
"learning_rate": 3.777710480988598e-05,
"loss": 2.165,
"step": 1093
},
{
"epoch": 1.4030808729139923,
"grad_norm": 0.3657393157482147,
"learning_rate": 3.775471479873457e-05,
"loss": 2.2537,
"step": 1094
},
{
"epoch": 1.404364569961489,
"grad_norm": 0.36987847089767456,
"learning_rate": 3.773231094848519e-05,
"loss": 2.1994,
"step": 1095
},
{
"epoch": 1.4056482670089858,
"grad_norm": 0.3848939836025238,
"learning_rate": 3.770989328344645e-05,
"loss": 2.308,
"step": 1096
},
{
"epoch": 1.4069319640564828,
"grad_norm": 0.38896140456199646,
"learning_rate": 3.768746182794187e-05,
"loss": 2.1958,
"step": 1097
},
{
"epoch": 1.4082156611039793,
"grad_norm": 0.3703378736972809,
"learning_rate": 3.766501660631004e-05,
"loss": 2.209,
"step": 1098
},
{
"epoch": 1.4094993581514763,
"grad_norm": 0.373000830411911,
"learning_rate": 3.764255764290442e-05,
"loss": 2.2251,
"step": 1099
},
{
"epoch": 1.410783055198973,
"grad_norm": 0.3429698944091797,
"learning_rate": 3.762008496209338e-05,
"loss": 2.2303,
"step": 1100
},
{
"epoch": 1.4120667522464698,
"grad_norm": 0.39933037757873535,
"learning_rate": 3.7597598588260196e-05,
"loss": 2.1388,
"step": 1101
},
{
"epoch": 1.4133504492939666,
"grad_norm": 0.39360055327415466,
"learning_rate": 3.757509854580299e-05,
"loss": 2.2116,
"step": 1102
},
{
"epoch": 1.4146341463414633,
"grad_norm": 0.39372727274894714,
"learning_rate": 3.755258485913474e-05,
"loss": 2.2814,
"step": 1103
},
{
"epoch": 1.4159178433889603,
"grad_norm": 0.3706322908401489,
"learning_rate": 3.7530057552683175e-05,
"loss": 2.1897,
"step": 1104
},
{
"epoch": 1.417201540436457,
"grad_norm": 0.39840468764305115,
"learning_rate": 3.750751665089085e-05,
"loss": 2.2565,
"step": 1105
},
{
"epoch": 1.4184852374839538,
"grad_norm": 0.377890408039093,
"learning_rate": 3.7484962178215055e-05,
"loss": 2.2281,
"step": 1106
},
{
"epoch": 1.4197689345314506,
"grad_norm": 0.3951006531715393,
"learning_rate": 3.74623941591278e-05,
"loss": 2.1748,
"step": 1107
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.4109231233596802,
"learning_rate": 3.7439812618115795e-05,
"loss": 2.165,
"step": 1108
},
{
"epoch": 1.422336328626444,
"grad_norm": 0.41010499000549316,
"learning_rate": 3.7417217579680426e-05,
"loss": 2.1168,
"step": 1109
},
{
"epoch": 1.4236200256739409,
"grad_norm": 0.38000255823135376,
"learning_rate": 3.739460906833772e-05,
"loss": 2.2188,
"step": 1110
},
{
"epoch": 1.4249037227214378,
"grad_norm": 0.3960665464401245,
"learning_rate": 3.737198710861832e-05,
"loss": 2.2694,
"step": 1111
},
{
"epoch": 1.4261874197689346,
"grad_norm": 0.3778914213180542,
"learning_rate": 3.734935172506747e-05,
"loss": 2.3073,
"step": 1112
},
{
"epoch": 1.4274711168164314,
"grad_norm": 0.3883233368396759,
"learning_rate": 3.732670294224498e-05,
"loss": 2.1895,
"step": 1113
},
{
"epoch": 1.428754813863928,
"grad_norm": 0.38521334528923035,
"learning_rate": 3.730404078472518e-05,
"loss": 2.2923,
"step": 1114
},
{
"epoch": 1.4300385109114249,
"grad_norm": 0.39216870069503784,
"learning_rate": 3.728136527709694e-05,
"loss": 2.2761,
"step": 1115
},
{
"epoch": 1.4313222079589216,
"grad_norm": 0.3601703643798828,
"learning_rate": 3.725867644396358e-05,
"loss": 2.2398,
"step": 1116
},
{
"epoch": 1.4326059050064184,
"grad_norm": 0.400269091129303,
"learning_rate": 3.72359743099429e-05,
"loss": 2.2223,
"step": 1117
},
{
"epoch": 1.4338896020539154,
"grad_norm": 0.3815939724445343,
"learning_rate": 3.7213258899667144e-05,
"loss": 2.291,
"step": 1118
},
{
"epoch": 1.4351732991014121,
"grad_norm": 0.37799543142318726,
"learning_rate": 3.719053023778292e-05,
"loss": 2.2347,
"step": 1119
},
{
"epoch": 1.4364569961489089,
"grad_norm": 0.37301313877105713,
"learning_rate": 3.716778834895127e-05,
"loss": 2.2475,
"step": 1120
},
{
"epoch": 1.4377406931964056,
"grad_norm": 0.3976534307003021,
"learning_rate": 3.714503325784754e-05,
"loss": 2.2651,
"step": 1121
},
{
"epoch": 1.4390243902439024,
"grad_norm": 0.39929476380348206,
"learning_rate": 3.712226498916143e-05,
"loss": 2.2446,
"step": 1122
},
{
"epoch": 1.4403080872913994,
"grad_norm": 0.37493380904197693,
"learning_rate": 3.709948356759691e-05,
"loss": 2.1977,
"step": 1123
},
{
"epoch": 1.441591784338896,
"grad_norm": 0.3848809003829956,
"learning_rate": 3.7076689017872246e-05,
"loss": 2.3382,
"step": 1124
},
{
"epoch": 1.4428754813863929,
"grad_norm": 0.39769625663757324,
"learning_rate": 3.705388136471995e-05,
"loss": 2.3048,
"step": 1125
},
{
"epoch": 1.4441591784338896,
"grad_norm": 0.36819279193878174,
"learning_rate": 3.7031060632886724e-05,
"loss": 2.2417,
"step": 1126
},
{
"epoch": 1.4454428754813864,
"grad_norm": 0.3896733820438385,
"learning_rate": 3.700822684713349e-05,
"loss": 2.2029,
"step": 1127
},
{
"epoch": 1.4467265725288831,
"grad_norm": 0.36140698194503784,
"learning_rate": 3.6985380032235315e-05,
"loss": 2.149,
"step": 1128
},
{
"epoch": 1.44801026957638,
"grad_norm": 0.3695017993450165,
"learning_rate": 3.6962520212981405e-05,
"loss": 2.2265,
"step": 1129
},
{
"epoch": 1.4492939666238769,
"grad_norm": 0.3904394805431366,
"learning_rate": 3.69396474141751e-05,
"loss": 2.2479,
"step": 1130
},
{
"epoch": 1.4505776636713734,
"grad_norm": 0.389116495847702,
"learning_rate": 3.691676166063378e-05,
"loss": 2.2346,
"step": 1131
},
{
"epoch": 1.4518613607188704,
"grad_norm": 0.3761390149593353,
"learning_rate": 3.689386297718892e-05,
"loss": 2.2856,
"step": 1132
},
{
"epoch": 1.4531450577663672,
"grad_norm": 0.4113580882549286,
"learning_rate": 3.687095138868601e-05,
"loss": 2.1943,
"step": 1133
},
{
"epoch": 1.454428754813864,
"grad_norm": 0.358335018157959,
"learning_rate": 3.684802691998452e-05,
"loss": 2.1895,
"step": 1134
},
{
"epoch": 1.4557124518613607,
"grad_norm": 0.47489404678344727,
"learning_rate": 3.682508959595795e-05,
"loss": 2.2097,
"step": 1135
},
{
"epoch": 1.4569961489088574,
"grad_norm": 0.40556371212005615,
"learning_rate": 3.680213944149368e-05,
"loss": 2.2414,
"step": 1136
},
{
"epoch": 1.4582798459563544,
"grad_norm": 0.356882780790329,
"learning_rate": 3.677917648149307e-05,
"loss": 2.262,
"step": 1137
},
{
"epoch": 1.4595635430038512,
"grad_norm": 0.39578723907470703,
"learning_rate": 3.675620074087134e-05,
"loss": 2.1591,
"step": 1138
},
{
"epoch": 1.460847240051348,
"grad_norm": 0.3718084394931793,
"learning_rate": 3.673321224455759e-05,
"loss": 2.3229,
"step": 1139
},
{
"epoch": 1.4621309370988447,
"grad_norm": 0.3779333829879761,
"learning_rate": 3.671021101749476e-05,
"loss": 2.2221,
"step": 1140
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.3797706663608551,
"learning_rate": 3.668719708463959e-05,
"loss": 2.1386,
"step": 1141
},
{
"epoch": 1.4646983311938382,
"grad_norm": 0.3736761212348938,
"learning_rate": 3.666417047096262e-05,
"loss": 2.1575,
"step": 1142
},
{
"epoch": 1.465982028241335,
"grad_norm": 0.6424862742424011,
"learning_rate": 3.664113120144816e-05,
"loss": 2.2285,
"step": 1143
},
{
"epoch": 1.467265725288832,
"grad_norm": 0.36189529299736023,
"learning_rate": 3.6618079301094216e-05,
"loss": 2.1598,
"step": 1144
},
{
"epoch": 1.4685494223363287,
"grad_norm": 0.3950345516204834,
"learning_rate": 3.659501479491253e-05,
"loss": 2.1779,
"step": 1145
},
{
"epoch": 1.4698331193838254,
"grad_norm": 0.37057608366012573,
"learning_rate": 3.6571937707928524e-05,
"loss": 2.1717,
"step": 1146
},
{
"epoch": 1.4711168164313222,
"grad_norm": 0.3707398474216461,
"learning_rate": 3.654884806518123e-05,
"loss": 2.1828,
"step": 1147
},
{
"epoch": 1.472400513478819,
"grad_norm": 0.3933170735836029,
"learning_rate": 3.652574589172335e-05,
"loss": 2.2266,
"step": 1148
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.39545738697052,
"learning_rate": 3.650263121262117e-05,
"loss": 2.3313,
"step": 1149
},
{
"epoch": 1.4749679075738125,
"grad_norm": 0.3947657346725464,
"learning_rate": 3.6479504052954514e-05,
"loss": 2.2543,
"step": 1150
},
{
"epoch": 1.4762516046213094,
"grad_norm": 0.38942018151283264,
"learning_rate": 3.645636443781679e-05,
"loss": 2.1467,
"step": 1151
},
{
"epoch": 1.4775353016688062,
"grad_norm": 0.3504950702190399,
"learning_rate": 3.6433212392314905e-05,
"loss": 2.2263,
"step": 1152
},
{
"epoch": 1.478818998716303,
"grad_norm": 0.37644287943840027,
"learning_rate": 3.6410047941569224e-05,
"loss": 2.3145,
"step": 1153
},
{
"epoch": 1.4801026957637997,
"grad_norm": 0.41499248147010803,
"learning_rate": 3.638687111071363e-05,
"loss": 2.182,
"step": 1154
},
{
"epoch": 1.4813863928112965,
"grad_norm": 0.3680044114589691,
"learning_rate": 3.6363681924895394e-05,
"loss": 2.23,
"step": 1155
},
{
"epoch": 1.4826700898587934,
"grad_norm": 0.3712504804134369,
"learning_rate": 3.63404804092752e-05,
"loss": 2.3055,
"step": 1156
},
{
"epoch": 1.48395378690629,
"grad_norm": 0.38080593943595886,
"learning_rate": 3.6317266589027124e-05,
"loss": 2.3163,
"step": 1157
},
{
"epoch": 1.485237483953787,
"grad_norm": 0.3790014088153839,
"learning_rate": 3.6294040489338575e-05,
"loss": 2.2555,
"step": 1158
},
{
"epoch": 1.4865211810012837,
"grad_norm": 0.3629383444786072,
"learning_rate": 3.62708021354103e-05,
"loss": 2.1125,
"step": 1159
},
{
"epoch": 1.4878048780487805,
"grad_norm": 0.36138129234313965,
"learning_rate": 3.6247551552456336e-05,
"loss": 2.2633,
"step": 1160
},
{
"epoch": 1.4890885750962772,
"grad_norm": 0.35929739475250244,
"learning_rate": 3.622428876570399e-05,
"loss": 2.2202,
"step": 1161
},
{
"epoch": 1.490372272143774,
"grad_norm": 0.3624289333820343,
"learning_rate": 3.620101380039381e-05,
"loss": 2.2102,
"step": 1162
},
{
"epoch": 1.491655969191271,
"grad_norm": 0.379535049200058,
"learning_rate": 3.617772668177956e-05,
"loss": 2.2099,
"step": 1163
},
{
"epoch": 1.4929396662387677,
"grad_norm": 0.3564260005950928,
"learning_rate": 3.615442743512817e-05,
"loss": 2.2622,
"step": 1164
},
{
"epoch": 1.4942233632862645,
"grad_norm": 0.3908945918083191,
"learning_rate": 3.613111608571977e-05,
"loss": 2.2087,
"step": 1165
},
{
"epoch": 1.4955070603337612,
"grad_norm": 0.37525975704193115,
"learning_rate": 3.6107792658847595e-05,
"loss": 2.3016,
"step": 1166
},
{
"epoch": 1.496790757381258,
"grad_norm": 0.35312336683273315,
"learning_rate": 3.608445717981798e-05,
"loss": 2.2335,
"step": 1167
},
{
"epoch": 1.4980744544287548,
"grad_norm": 0.3973201513290405,
"learning_rate": 3.6061109673950334e-05,
"loss": 2.3283,
"step": 1168
},
{
"epoch": 1.4993581514762515,
"grad_norm": 0.44854387640953064,
"learning_rate": 3.603775016657715e-05,
"loss": 2.2669,
"step": 1169
},
{
"epoch": 1.5006418485237485,
"grad_norm": 0.40130218863487244,
"learning_rate": 3.601437868304391e-05,
"loss": 2.2615,
"step": 1170
},
{
"epoch": 1.501925545571245,
"grad_norm": 0.36666733026504517,
"learning_rate": 3.59909952487091e-05,
"loss": 2.1767,
"step": 1171
},
{
"epoch": 1.503209242618742,
"grad_norm": 0.369798481464386,
"learning_rate": 3.596759988894417e-05,
"loss": 2.2134,
"step": 1172
},
{
"epoch": 1.5044929396662388,
"grad_norm": 0.37616270780563354,
"learning_rate": 3.594419262913351e-05,
"loss": 2.2595,
"step": 1173
},
{
"epoch": 1.5057766367137355,
"grad_norm": 0.37049224972724915,
"learning_rate": 3.5920773494674434e-05,
"loss": 2.1782,
"step": 1174
},
{
"epoch": 1.5070603337612325,
"grad_norm": 0.38616225123405457,
"learning_rate": 3.589734251097712e-05,
"loss": 2.1387,
"step": 1175
},
{
"epoch": 1.508344030808729,
"grad_norm": 0.4116593599319458,
"learning_rate": 3.587389970346461e-05,
"loss": 2.2533,
"step": 1176
},
{
"epoch": 1.509627727856226,
"grad_norm": 0.3951702117919922,
"learning_rate": 3.585044509757278e-05,
"loss": 2.2805,
"step": 1177
},
{
"epoch": 1.5109114249037228,
"grad_norm": 0.3869103789329529,
"learning_rate": 3.582697871875031e-05,
"loss": 2.24,
"step": 1178
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.4094953238964081,
"learning_rate": 3.580350059245864e-05,
"loss": 2.2871,
"step": 1179
},
{
"epoch": 1.5134788189987163,
"grad_norm": 0.3869830369949341,
"learning_rate": 3.578001074417198e-05,
"loss": 2.1615,
"step": 1180
},
{
"epoch": 1.514762516046213,
"grad_norm": 0.3672597408294678,
"learning_rate": 3.5756509199377224e-05,
"loss": 2.2374,
"step": 1181
},
{
"epoch": 1.51604621309371,
"grad_norm": 0.36492928862571716,
"learning_rate": 3.573299598357398e-05,
"loss": 2.3031,
"step": 1182
},
{
"epoch": 1.5173299101412066,
"grad_norm": 0.3777792751789093,
"learning_rate": 3.5709471122274534e-05,
"loss": 2.2957,
"step": 1183
},
{
"epoch": 1.5186136071887035,
"grad_norm": 0.38662636280059814,
"learning_rate": 3.568593464100377e-05,
"loss": 2.1457,
"step": 1184
},
{
"epoch": 1.5198973042362003,
"grad_norm": 0.41264063119888306,
"learning_rate": 3.566238656529921e-05,
"loss": 2.2829,
"step": 1185
},
{
"epoch": 1.521181001283697,
"grad_norm": 0.3814603388309479,
"learning_rate": 3.5638826920710935e-05,
"loss": 2.2356,
"step": 1186
},
{
"epoch": 1.5224646983311938,
"grad_norm": 0.3695092797279358,
"learning_rate": 3.56152557328016e-05,
"loss": 2.1976,
"step": 1187
},
{
"epoch": 1.5237483953786906,
"grad_norm": 0.39186903834342957,
"learning_rate": 3.559167302714636e-05,
"loss": 2.2426,
"step": 1188
},
{
"epoch": 1.5250320924261875,
"grad_norm": 0.40045449137687683,
"learning_rate": 3.556807882933289e-05,
"loss": 2.2451,
"step": 1189
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.38327574729919434,
"learning_rate": 3.5544473164961324e-05,
"loss": 2.2787,
"step": 1190
},
{
"epoch": 1.527599486521181,
"grad_norm": 0.36272966861724854,
"learning_rate": 3.552085605964424e-05,
"loss": 2.2206,
"step": 1191
},
{
"epoch": 1.5288831835686778,
"grad_norm": 0.39420098066329956,
"learning_rate": 3.5497227539006614e-05,
"loss": 2.2664,
"step": 1192
},
{
"epoch": 1.5301668806161746,
"grad_norm": 0.38125845789909363,
"learning_rate": 3.547358762868584e-05,
"loss": 2.2152,
"step": 1193
},
{
"epoch": 1.5314505776636713,
"grad_norm": 0.3611045479774475,
"learning_rate": 3.544993635433164e-05,
"loss": 2.1673,
"step": 1194
},
{
"epoch": 1.532734274711168,
"grad_norm": 0.38052603602409363,
"learning_rate": 3.5426273741606086e-05,
"loss": 2.276,
"step": 1195
},
{
"epoch": 1.534017971758665,
"grad_norm": 0.37632811069488525,
"learning_rate": 3.540259981618353e-05,
"loss": 2.2557,
"step": 1196
},
{
"epoch": 1.5353016688061616,
"grad_norm": 0.3829447627067566,
"learning_rate": 3.5378914603750624e-05,
"loss": 2.2649,
"step": 1197
},
{
"epoch": 1.5365853658536586,
"grad_norm": 0.3774879574775696,
"learning_rate": 3.5355218130006255e-05,
"loss": 2.2778,
"step": 1198
},
{
"epoch": 1.5378690629011553,
"grad_norm": 0.39277470111846924,
"learning_rate": 3.5331510420661525e-05,
"loss": 2.31,
"step": 1199
},
{
"epoch": 1.539152759948652,
"grad_norm": 0.3856041133403778,
"learning_rate": 3.530779150143973e-05,
"loss": 2.1995,
"step": 1200
},
{
"epoch": 1.540436456996149,
"grad_norm": 0.40345871448516846,
"learning_rate": 3.528406139807633e-05,
"loss": 2.2882,
"step": 1201
},
{
"epoch": 1.5417201540436456,
"grad_norm": 0.37651973962783813,
"learning_rate": 3.526032013631893e-05,
"loss": 2.3489,
"step": 1202
},
{
"epoch": 1.5430038510911426,
"grad_norm": 0.359997421503067,
"learning_rate": 3.523656774192721e-05,
"loss": 2.2709,
"step": 1203
},
{
"epoch": 1.5442875481386393,
"grad_norm": 0.4158475697040558,
"learning_rate": 3.521280424067296e-05,
"loss": 2.2083,
"step": 1204
},
{
"epoch": 1.545571245186136,
"grad_norm": 0.37753528356552124,
"learning_rate": 3.5189029658340025e-05,
"loss": 2.2691,
"step": 1205
},
{
"epoch": 1.5468549422336328,
"grad_norm": 0.3862055540084839,
"learning_rate": 3.516524402072425e-05,
"loss": 2.3055,
"step": 1206
},
{
"epoch": 1.5481386392811296,
"grad_norm": 0.40717101097106934,
"learning_rate": 3.5141447353633474e-05,
"loss": 2.253,
"step": 1207
},
{
"epoch": 1.5494223363286266,
"grad_norm": 0.4078426957130432,
"learning_rate": 3.5117639682887534e-05,
"loss": 2.1887,
"step": 1208
},
{
"epoch": 1.5507060333761231,
"grad_norm": 0.34787634015083313,
"learning_rate": 3.509382103431819e-05,
"loss": 2.2336,
"step": 1209
},
{
"epoch": 1.55198973042362,
"grad_norm": 0.38061514496803284,
"learning_rate": 3.506999143376908e-05,
"loss": 2.2623,
"step": 1210
},
{
"epoch": 1.5532734274711169,
"grad_norm": 0.39258530735969543,
"learning_rate": 3.5046150907095776e-05,
"loss": 2.1535,
"step": 1211
},
{
"epoch": 1.5545571245186136,
"grad_norm": 0.3953094184398651,
"learning_rate": 3.502229948016568e-05,
"loss": 2.1615,
"step": 1212
},
{
"epoch": 1.5558408215661104,
"grad_norm": 0.3790014684200287,
"learning_rate": 3.4998437178858035e-05,
"loss": 2.2498,
"step": 1213
},
{
"epoch": 1.5571245186136071,
"grad_norm": 0.40729162096977234,
"learning_rate": 3.497456402906385e-05,
"loss": 2.2521,
"step": 1214
},
{
"epoch": 1.558408215661104,
"grad_norm": 0.42329171299934387,
"learning_rate": 3.495068005668595e-05,
"loss": 2.3194,
"step": 1215
},
{
"epoch": 1.5596919127086006,
"grad_norm": 0.3534400463104248,
"learning_rate": 3.492678528763887e-05,
"loss": 2.2785,
"step": 1216
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.4265909492969513,
"learning_rate": 3.490287974784887e-05,
"loss": 2.1803,
"step": 1217
},
{
"epoch": 1.5622593068035944,
"grad_norm": 0.37546563148498535,
"learning_rate": 3.487896346325389e-05,
"loss": 2.2921,
"step": 1218
},
{
"epoch": 1.5635430038510911,
"grad_norm": 0.38932451605796814,
"learning_rate": 3.4855036459803544e-05,
"loss": 2.2686,
"step": 1219
},
{
"epoch": 1.5648267008985879,
"grad_norm": 0.3934752643108368,
"learning_rate": 3.4831098763459066e-05,
"loss": 2.3104,
"step": 1220
},
{
"epoch": 1.5661103979460846,
"grad_norm": 0.37588098645210266,
"learning_rate": 3.480715040019329e-05,
"loss": 2.2768,
"step": 1221
},
{
"epoch": 1.5673940949935816,
"grad_norm": 0.3763798773288727,
"learning_rate": 3.478319139599063e-05,
"loss": 2.2028,
"step": 1222
},
{
"epoch": 1.5686777920410782,
"grad_norm": 0.39880573749542236,
"learning_rate": 3.475922177684704e-05,
"loss": 2.2299,
"step": 1223
},
{
"epoch": 1.5699614890885751,
"grad_norm": 0.3821561336517334,
"learning_rate": 3.473524156876999e-05,
"loss": 2.1963,
"step": 1224
},
{
"epoch": 1.571245186136072,
"grad_norm": 0.39914795756340027,
"learning_rate": 3.4711250797778446e-05,
"loss": 2.2521,
"step": 1225
},
{
"epoch": 1.5725288831835686,
"grad_norm": 0.43123576045036316,
"learning_rate": 3.468724948990283e-05,
"loss": 2.1249,
"step": 1226
},
{
"epoch": 1.5738125802310656,
"grad_norm": 0.38369259238243103,
"learning_rate": 3.466323767118501e-05,
"loss": 2.2478,
"step": 1227
},
{
"epoch": 1.5750962772785622,
"grad_norm": 0.3837836682796478,
"learning_rate": 3.4639215367678225e-05,
"loss": 2.2308,
"step": 1228
},
{
"epoch": 1.5763799743260591,
"grad_norm": 0.3655678331851959,
"learning_rate": 3.461518260544713e-05,
"loss": 2.3227,
"step": 1229
},
{
"epoch": 1.5776636713735557,
"grad_norm": 0.3748762309551239,
"learning_rate": 3.45911394105677e-05,
"loss": 2.1733,
"step": 1230
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.3868464231491089,
"learning_rate": 3.456708580912725e-05,
"loss": 2.2485,
"step": 1231
},
{
"epoch": 1.5802310654685494,
"grad_norm": 0.37276676297187805,
"learning_rate": 3.4543021827224365e-05,
"loss": 2.2844,
"step": 1232
},
{
"epoch": 1.5815147625160462,
"grad_norm": 0.38882049918174744,
"learning_rate": 3.45189474909689e-05,
"loss": 2.2337,
"step": 1233
},
{
"epoch": 1.5827984595635431,
"grad_norm": 0.37841805815696716,
"learning_rate": 3.4494862826481966e-05,
"loss": 2.2276,
"step": 1234
},
{
"epoch": 1.5840821566110397,
"grad_norm": 0.3648984730243683,
"learning_rate": 3.4470767859895846e-05,
"loss": 2.1893,
"step": 1235
},
{
"epoch": 1.5853658536585367,
"grad_norm": 0.40236514806747437,
"learning_rate": 3.4446662617354034e-05,
"loss": 2.2086,
"step": 1236
},
{
"epoch": 1.5866495507060334,
"grad_norm": 0.3805043697357178,
"learning_rate": 3.442254712501115e-05,
"loss": 2.2052,
"step": 1237
},
{
"epoch": 1.5879332477535302,
"grad_norm": 0.3884541690349579,
"learning_rate": 3.439842140903293e-05,
"loss": 2.3047,
"step": 1238
},
{
"epoch": 1.589216944801027,
"grad_norm": 0.3753684461116791,
"learning_rate": 3.437428549559624e-05,
"loss": 2.258,
"step": 1239
},
{
"epoch": 1.5905006418485237,
"grad_norm": 0.3995250165462494,
"learning_rate": 3.4350139410888956e-05,
"loss": 2.2084,
"step": 1240
},
{
"epoch": 1.5917843388960207,
"grad_norm": 0.3964383602142334,
"learning_rate": 3.432598318111005e-05,
"loss": 2.1462,
"step": 1241
},
{
"epoch": 1.5930680359435172,
"grad_norm": 0.3621688783168793,
"learning_rate": 3.4301816832469443e-05,
"loss": 2.2277,
"step": 1242
},
{
"epoch": 1.5943517329910142,
"grad_norm": 0.4131968319416046,
"learning_rate": 3.4277640391188094e-05,
"loss": 2.1845,
"step": 1243
},
{
"epoch": 1.595635430038511,
"grad_norm": 0.41226086020469666,
"learning_rate": 3.425345388349786e-05,
"loss": 2.1758,
"step": 1244
},
{
"epoch": 1.5969191270860077,
"grad_norm": 0.4197986423969269,
"learning_rate": 3.422925733564155e-05,
"loss": 2.2193,
"step": 1245
},
{
"epoch": 1.5982028241335045,
"grad_norm": 0.4071418046951294,
"learning_rate": 3.420505077387286e-05,
"loss": 2.3172,
"step": 1246
},
{
"epoch": 1.5994865211810012,
"grad_norm": 0.37383535504341125,
"learning_rate": 3.418083422445635e-05,
"loss": 2.2835,
"step": 1247
},
{
"epoch": 1.6007702182284982,
"grad_norm": 0.40442097187042236,
"learning_rate": 3.415660771366741e-05,
"loss": 2.2389,
"step": 1248
},
{
"epoch": 1.6020539152759947,
"grad_norm": 0.380212664604187,
"learning_rate": 3.4132371267792264e-05,
"loss": 2.1973,
"step": 1249
},
{
"epoch": 1.6033376123234917,
"grad_norm": 0.3948259949684143,
"learning_rate": 3.4108124913127875e-05,
"loss": 2.254,
"step": 1250
},
{
"epoch": 1.6046213093709885,
"grad_norm": 0.3870978057384491,
"learning_rate": 3.4083868675981996e-05,
"loss": 2.0882,
"step": 1251
},
{
"epoch": 1.6059050064184852,
"grad_norm": 0.38109326362609863,
"learning_rate": 3.405960258267308e-05,
"loss": 2.2989,
"step": 1252
},
{
"epoch": 1.6071887034659822,
"grad_norm": 0.38316014409065247,
"learning_rate": 3.403532665953028e-05,
"loss": 2.2568,
"step": 1253
},
{
"epoch": 1.6084724005134787,
"grad_norm": 0.4252641201019287,
"learning_rate": 3.401104093289341e-05,
"loss": 2.289,
"step": 1254
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.3894971013069153,
"learning_rate": 3.3986745429112936e-05,
"loss": 2.2041,
"step": 1255
},
{
"epoch": 1.6110397946084722,
"grad_norm": 0.38642048835754395,
"learning_rate": 3.3962440174549924e-05,
"loss": 2.2442,
"step": 1256
},
{
"epoch": 1.6123234916559692,
"grad_norm": 0.38057392835617065,
"learning_rate": 3.3938125195576e-05,
"loss": 2.2573,
"step": 1257
},
{
"epoch": 1.613607188703466,
"grad_norm": 0.3963440954685211,
"learning_rate": 3.3913800518573385e-05,
"loss": 2.2125,
"step": 1258
},
{
"epoch": 1.6148908857509627,
"grad_norm": 0.37275323271751404,
"learning_rate": 3.3889466169934774e-05,
"loss": 2.1775,
"step": 1259
},
{
"epoch": 1.6161745827984597,
"grad_norm": 0.3996749818325043,
"learning_rate": 3.386512217606339e-05,
"loss": 2.2006,
"step": 1260
},
{
"epoch": 1.6174582798459562,
"grad_norm": 0.3794258236885071,
"learning_rate": 3.384076856337292e-05,
"loss": 2.2485,
"step": 1261
},
{
"epoch": 1.6187419768934532,
"grad_norm": 0.40182143449783325,
"learning_rate": 3.3816405358287446e-05,
"loss": 2.2404,
"step": 1262
},
{
"epoch": 1.62002567394095,
"grad_norm": 0.4085938036441803,
"learning_rate": 3.379203258724152e-05,
"loss": 2.2732,
"step": 1263
},
{
"epoch": 1.6213093709884467,
"grad_norm": 0.3972185552120209,
"learning_rate": 3.376765027668003e-05,
"loss": 2.2859,
"step": 1264
},
{
"epoch": 1.6225930680359435,
"grad_norm": 0.39743340015411377,
"learning_rate": 3.374325845305822e-05,
"loss": 2.2693,
"step": 1265
},
{
"epoch": 1.6238767650834403,
"grad_norm": 0.4192309081554413,
"learning_rate": 3.371885714284169e-05,
"loss": 2.3041,
"step": 1266
},
{
"epoch": 1.6251604621309372,
"grad_norm": 0.36719846725463867,
"learning_rate": 3.369444637250627e-05,
"loss": 2.2128,
"step": 1267
},
{
"epoch": 1.6264441591784338,
"grad_norm": 0.37783583998680115,
"learning_rate": 3.367002616853812e-05,
"loss": 2.1338,
"step": 1268
},
{
"epoch": 1.6277278562259307,
"grad_norm": 0.39298015832901,
"learning_rate": 3.364559655743359e-05,
"loss": 2.2804,
"step": 1269
},
{
"epoch": 1.6290115532734275,
"grad_norm": 0.39492067694664,
"learning_rate": 3.362115756569926e-05,
"loss": 2.1376,
"step": 1270
},
{
"epoch": 1.6302952503209243,
"grad_norm": 0.4016437828540802,
"learning_rate": 3.359670921985189e-05,
"loss": 2.26,
"step": 1271
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.36699244379997253,
"learning_rate": 3.3572251546418355e-05,
"loss": 2.1975,
"step": 1272
},
{
"epoch": 1.6328626444159178,
"grad_norm": 0.36995914578437805,
"learning_rate": 3.3547784571935695e-05,
"loss": 2.2535,
"step": 1273
},
{
"epoch": 1.6341463414634148,
"grad_norm": 0.4290882349014282,
"learning_rate": 3.3523308322951016e-05,
"loss": 2.2441,
"step": 1274
},
{
"epoch": 1.6354300385109113,
"grad_norm": 0.37746745347976685,
"learning_rate": 3.3498822826021514e-05,
"loss": 2.1364,
"step": 1275
},
{
"epoch": 1.6367137355584083,
"grad_norm": 0.35144737362861633,
"learning_rate": 3.347432810771436e-05,
"loss": 2.3044,
"step": 1276
},
{
"epoch": 1.637997432605905,
"grad_norm": 0.3726184070110321,
"learning_rate": 3.344982419460681e-05,
"loss": 2.197,
"step": 1277
},
{
"epoch": 1.6392811296534018,
"grad_norm": 0.37522029876708984,
"learning_rate": 3.342531111328604e-05,
"loss": 2.1608,
"step": 1278
},
{
"epoch": 1.6405648267008985,
"grad_norm": 0.38890618085861206,
"learning_rate": 3.340078889034918e-05,
"loss": 2.1773,
"step": 1279
},
{
"epoch": 1.6418485237483953,
"grad_norm": 0.3751033544540405,
"learning_rate": 3.3376257552403314e-05,
"loss": 2.2046,
"step": 1280
},
{
"epoch": 1.6431322207958923,
"grad_norm": 0.3892718255519867,
"learning_rate": 3.335171712606535e-05,
"loss": 2.2177,
"step": 1281
},
{
"epoch": 1.6444159178433888,
"grad_norm": 0.37542441487312317,
"learning_rate": 3.3327167637962134e-05,
"loss": 2.2561,
"step": 1282
},
{
"epoch": 1.6456996148908858,
"grad_norm": 0.37376415729522705,
"learning_rate": 3.33026091147303e-05,
"loss": 2.1389,
"step": 1283
},
{
"epoch": 1.6469833119383825,
"grad_norm": 0.41433337330818176,
"learning_rate": 3.327804158301628e-05,
"loss": 2.1289,
"step": 1284
},
{
"epoch": 1.6482670089858793,
"grad_norm": 0.3767523765563965,
"learning_rate": 3.325346506947632e-05,
"loss": 2.2387,
"step": 1285
},
{
"epoch": 1.6495507060333763,
"grad_norm": 0.41720911860466003,
"learning_rate": 3.322887960077635e-05,
"loss": 2.1956,
"step": 1286
},
{
"epoch": 1.6508344030808728,
"grad_norm": 0.3834913372993469,
"learning_rate": 3.3204285203592076e-05,
"loss": 2.2792,
"step": 1287
},
{
"epoch": 1.6521181001283698,
"grad_norm": 0.37035104632377625,
"learning_rate": 3.3179681904608865e-05,
"loss": 2.12,
"step": 1288
},
{
"epoch": 1.6534017971758665,
"grad_norm": 0.38635921478271484,
"learning_rate": 3.3155069730521735e-05,
"loss": 2.1847,
"step": 1289
},
{
"epoch": 1.6546854942233633,
"grad_norm": 0.374063640832901,
"learning_rate": 3.313044870803536e-05,
"loss": 2.2127,
"step": 1290
},
{
"epoch": 1.65596919127086,
"grad_norm": 0.4345405101776123,
"learning_rate": 3.3105818863863966e-05,
"loss": 2.281,
"step": 1291
},
{
"epoch": 1.6572528883183568,
"grad_norm": 0.3770464360713959,
"learning_rate": 3.308118022473142e-05,
"loss": 2.2967,
"step": 1292
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.3714136481285095,
"learning_rate": 3.3056532817371065e-05,
"loss": 2.2356,
"step": 1293
},
{
"epoch": 1.6598202824133503,
"grad_norm": 0.3810267746448517,
"learning_rate": 3.3031876668525814e-05,
"loss": 2.171,
"step": 1294
},
{
"epoch": 1.6611039794608473,
"grad_norm": 0.36009058356285095,
"learning_rate": 3.3007211804948015e-05,
"loss": 2.1459,
"step": 1295
},
{
"epoch": 1.662387676508344,
"grad_norm": 0.417795866727829,
"learning_rate": 3.29825382533995e-05,
"loss": 2.2286,
"step": 1296
},
{
"epoch": 1.6636713735558408,
"grad_norm": 0.40939030051231384,
"learning_rate": 3.2957856040651535e-05,
"loss": 2.2301,
"step": 1297
},
{
"epoch": 1.6649550706033376,
"grad_norm": 0.4906468987464905,
"learning_rate": 3.293316519348475e-05,
"loss": 2.3213,
"step": 1298
},
{
"epoch": 1.6662387676508343,
"grad_norm": 0.4314740002155304,
"learning_rate": 3.290846573868918e-05,
"loss": 2.1506,
"step": 1299
},
{
"epoch": 1.6675224646983313,
"grad_norm": 0.40390533208847046,
"learning_rate": 3.2883757703064174e-05,
"loss": 2.1911,
"step": 1300
},
{
"epoch": 1.6688061617458279,
"grad_norm": 0.46770694851875305,
"learning_rate": 3.2859041113418413e-05,
"loss": 2.2219,
"step": 1301
},
{
"epoch": 1.6700898587933248,
"grad_norm": 0.38452643156051636,
"learning_rate": 3.2834315996569844e-05,
"loss": 2.2139,
"step": 1302
},
{
"epoch": 1.6713735558408216,
"grad_norm": 0.4044853150844574,
"learning_rate": 3.280958237934565e-05,
"loss": 2.2452,
"step": 1303
},
{
"epoch": 1.6726572528883183,
"grad_norm": 0.3696539103984833,
"learning_rate": 3.278484028858228e-05,
"loss": 2.2857,
"step": 1304
},
{
"epoch": 1.673940949935815,
"grad_norm": 0.4123825132846832,
"learning_rate": 3.276008975112534e-05,
"loss": 2.2101,
"step": 1305
},
{
"epoch": 1.6752246469833119,
"grad_norm": 0.4096415340900421,
"learning_rate": 3.273533079382962e-05,
"loss": 2.0861,
"step": 1306
},
{
"epoch": 1.6765083440308088,
"grad_norm": 0.3966728150844574,
"learning_rate": 3.2710563443559045e-05,
"loss": 2.2216,
"step": 1307
},
{
"epoch": 1.6777920410783054,
"grad_norm": 0.3775424659252167,
"learning_rate": 3.268578772718663e-05,
"loss": 2.244,
"step": 1308
},
{
"epoch": 1.6790757381258024,
"grad_norm": 0.411404550075531,
"learning_rate": 3.266100367159448e-05,
"loss": 2.2038,
"step": 1309
},
{
"epoch": 1.680359435173299,
"grad_norm": 0.36207374930381775,
"learning_rate": 3.263621130367375e-05,
"loss": 2.2385,
"step": 1310
},
{
"epoch": 1.6816431322207959,
"grad_norm": 0.39945024251937866,
"learning_rate": 3.2611410650324614e-05,
"loss": 2.2851,
"step": 1311
},
{
"epoch": 1.6829268292682928,
"grad_norm": 0.3958592712879181,
"learning_rate": 3.2586601738456226e-05,
"loss": 2.2264,
"step": 1312
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.386507123708725,
"learning_rate": 3.256178459498671e-05,
"loss": 2.2365,
"step": 1313
},
{
"epoch": 1.6854942233632864,
"grad_norm": 0.4143087565898895,
"learning_rate": 3.253695924684313e-05,
"loss": 2.1712,
"step": 1314
},
{
"epoch": 1.686777920410783,
"grad_norm": 0.3835086226463318,
"learning_rate": 3.251212572096143e-05,
"loss": 2.2057,
"step": 1315
},
{
"epoch": 1.6880616174582799,
"grad_norm": 0.41994401812553406,
"learning_rate": 3.248728404428643e-05,
"loss": 2.2044,
"step": 1316
},
{
"epoch": 1.6893453145057766,
"grad_norm": 0.39442333579063416,
"learning_rate": 3.246243424377182e-05,
"loss": 2.224,
"step": 1317
},
{
"epoch": 1.6906290115532734,
"grad_norm": 0.4174175262451172,
"learning_rate": 3.243757634638008e-05,
"loss": 2.1846,
"step": 1318
},
{
"epoch": 1.6919127086007704,
"grad_norm": 0.40939804911613464,
"learning_rate": 3.241271037908247e-05,
"loss": 2.2313,
"step": 1319
},
{
"epoch": 1.693196405648267,
"grad_norm": 0.4105866849422455,
"learning_rate": 3.238783636885903e-05,
"loss": 2.2061,
"step": 1320
},
{
"epoch": 1.6944801026957639,
"grad_norm": 0.37023091316223145,
"learning_rate": 3.2362954342698515e-05,
"loss": 2.2302,
"step": 1321
},
{
"epoch": 1.6957637997432606,
"grad_norm": 0.3909175992012024,
"learning_rate": 3.233806432759837e-05,
"loss": 2.2544,
"step": 1322
},
{
"epoch": 1.6970474967907574,
"grad_norm": 0.4267197847366333,
"learning_rate": 3.231316635056472e-05,
"loss": 2.2371,
"step": 1323
},
{
"epoch": 1.6983311938382541,
"grad_norm": 0.40300336480140686,
"learning_rate": 3.228826043861232e-05,
"loss": 2.3069,
"step": 1324
},
{
"epoch": 1.699614890885751,
"grad_norm": 0.3710471987724304,
"learning_rate": 3.2263346618764547e-05,
"loss": 2.1944,
"step": 1325
},
{
"epoch": 1.7008985879332479,
"grad_norm": 0.3692680895328522,
"learning_rate": 3.223842491805333e-05,
"loss": 2.1397,
"step": 1326
},
{
"epoch": 1.7021822849807444,
"grad_norm": 0.4052692949771881,
"learning_rate": 3.2213495363519184e-05,
"loss": 2.2253,
"step": 1327
},
{
"epoch": 1.7034659820282414,
"grad_norm": 0.41429218649864197,
"learning_rate": 3.218855798221114e-05,
"loss": 2.2482,
"step": 1328
},
{
"epoch": 1.7047496790757382,
"grad_norm": 0.4166804850101471,
"learning_rate": 3.2163612801186684e-05,
"loss": 2.2101,
"step": 1329
},
{
"epoch": 1.706033376123235,
"grad_norm": 0.38851645588874817,
"learning_rate": 3.2138659847511804e-05,
"loss": 2.1313,
"step": 1330
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.4157164692878723,
"learning_rate": 3.211369914826092e-05,
"loss": 2.2886,
"step": 1331
},
{
"epoch": 1.7086007702182284,
"grad_norm": 0.385335773229599,
"learning_rate": 3.2088730730516835e-05,
"loss": 2.2048,
"step": 1332
},
{
"epoch": 1.7098844672657254,
"grad_norm": 0.387554407119751,
"learning_rate": 3.206375462137074e-05,
"loss": 2.2596,
"step": 1333
},
{
"epoch": 1.711168164313222,
"grad_norm": 0.40404000878334045,
"learning_rate": 3.203877084792216e-05,
"loss": 2.2133,
"step": 1334
},
{
"epoch": 1.712451861360719,
"grad_norm": 0.40059205889701843,
"learning_rate": 3.201377943727896e-05,
"loss": 2.1791,
"step": 1335
},
{
"epoch": 1.7137355584082157,
"grad_norm": 0.3736129105091095,
"learning_rate": 3.198878041655727e-05,
"loss": 2.1687,
"step": 1336
},
{
"epoch": 1.7150192554557124,
"grad_norm": 0.3956989049911499,
"learning_rate": 3.196377381288147e-05,
"loss": 2.2798,
"step": 1337
},
{
"epoch": 1.7163029525032092,
"grad_norm": 0.402681827545166,
"learning_rate": 3.19387596533842e-05,
"loss": 2.1519,
"step": 1338
},
{
"epoch": 1.717586649550706,
"grad_norm": 0.40618735551834106,
"learning_rate": 3.191373796520627e-05,
"loss": 2.2508,
"step": 1339
},
{
"epoch": 1.718870346598203,
"grad_norm": 0.38825419545173645,
"learning_rate": 3.1888708775496665e-05,
"loss": 2.2578,
"step": 1340
},
{
"epoch": 1.7201540436456995,
"grad_norm": 0.39424875378608704,
"learning_rate": 3.186367211141252e-05,
"loss": 2.2689,
"step": 1341
},
{
"epoch": 1.7214377406931964,
"grad_norm": 0.3815070688724518,
"learning_rate": 3.1838628000119066e-05,
"loss": 2.2558,
"step": 1342
},
{
"epoch": 1.7227214377406932,
"grad_norm": 0.401339590549469,
"learning_rate": 3.1813576468789616e-05,
"loss": 2.3363,
"step": 1343
},
{
"epoch": 1.72400513478819,
"grad_norm": 0.39942529797554016,
"learning_rate": 3.178851754460555e-05,
"loss": 2.1929,
"step": 1344
},
{
"epoch": 1.725288831835687,
"grad_norm": 0.4117013216018677,
"learning_rate": 3.1763451254756246e-05,
"loss": 2.3254,
"step": 1345
},
{
"epoch": 1.7265725288831835,
"grad_norm": 0.37010103464126587,
"learning_rate": 3.1738377626439084e-05,
"loss": 2.1709,
"step": 1346
},
{
"epoch": 1.7278562259306804,
"grad_norm": 0.37751153111457825,
"learning_rate": 3.1713296686859426e-05,
"loss": 2.2218,
"step": 1347
},
{
"epoch": 1.7291399229781772,
"grad_norm": 0.3885636031627655,
"learning_rate": 3.168820846323053e-05,
"loss": 2.1923,
"step": 1348
},
{
"epoch": 1.730423620025674,
"grad_norm": 0.41108283400535583,
"learning_rate": 3.1663112982773566e-05,
"loss": 2.1655,
"step": 1349
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.3727162182331085,
"learning_rate": 3.1638010272717624e-05,
"loss": 2.2211,
"step": 1350
},
{
"epoch": 1.7329910141206675,
"grad_norm": 0.38101616501808167,
"learning_rate": 3.161290036029957e-05,
"loss": 2.286,
"step": 1351
},
{
"epoch": 1.7342747111681645,
"grad_norm": 0.395938903093338,
"learning_rate": 3.158778327276413e-05,
"loss": 2.1722,
"step": 1352
},
{
"epoch": 1.735558408215661,
"grad_norm": 0.4109671115875244,
"learning_rate": 3.15626590373638e-05,
"loss": 2.2135,
"step": 1353
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.3670121133327484,
"learning_rate": 3.1537527681358834e-05,
"loss": 2.2307,
"step": 1354
},
{
"epoch": 1.7381258023106547,
"grad_norm": 0.4133658707141876,
"learning_rate": 3.15123892320172e-05,
"loss": 2.1967,
"step": 1355
},
{
"epoch": 1.7394094993581515,
"grad_norm": 0.4093180298805237,
"learning_rate": 3.148724371661459e-05,
"loss": 2.2776,
"step": 1356
},
{
"epoch": 1.7406931964056482,
"grad_norm": 0.37749165296554565,
"learning_rate": 3.1462091162434336e-05,
"loss": 2.2643,
"step": 1357
},
{
"epoch": 1.741976893453145,
"grad_norm": 0.38306891918182373,
"learning_rate": 3.143693159676743e-05,
"loss": 2.2403,
"step": 1358
},
{
"epoch": 1.743260590500642,
"grad_norm": 0.3868613541126251,
"learning_rate": 3.141176504691244e-05,
"loss": 2.2451,
"step": 1359
},
{
"epoch": 1.7445442875481385,
"grad_norm": 0.40217313170433044,
"learning_rate": 3.138659154017554e-05,
"loss": 2.1337,
"step": 1360
},
{
"epoch": 1.7458279845956355,
"grad_norm": 0.45966920256614685,
"learning_rate": 3.1361411103870455e-05,
"loss": 2.1045,
"step": 1361
},
{
"epoch": 1.7471116816431322,
"grad_norm": 0.37922942638397217,
"learning_rate": 3.1336223765318394e-05,
"loss": 2.2366,
"step": 1362
},
{
"epoch": 1.748395378690629,
"grad_norm": 0.38435661792755127,
"learning_rate": 3.1311029551848096e-05,
"loss": 2.3315,
"step": 1363
},
{
"epoch": 1.7496790757381258,
"grad_norm": 0.40681931376457214,
"learning_rate": 3.1285828490795746e-05,
"loss": 2.3369,
"step": 1364
},
{
"epoch": 1.7509627727856225,
"grad_norm": 0.4007415771484375,
"learning_rate": 3.126062060950494e-05,
"loss": 2.2209,
"step": 1365
}
],
"logging_steps": 1,
"max_steps": 3116,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 195,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6567101604264673e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}