{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.283208834037025,
"eval_steps": 500,
"global_step": 2002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.04008953645825386,
"learning_rate": 6.896551724137932e-06,
"loss": 1.1355,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.07225171476602554,
"learning_rate": 1.3793103448275863e-05,
"loss": 1.2113,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.04869953542947769,
"learning_rate": 2.0689655172413793e-05,
"loss": 1.4082,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 0.04640135169029236,
"learning_rate": 2.7586206896551727e-05,
"loss": 1.3191,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 0.03611049801111221,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.1544,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 0.051763903349637985,
"learning_rate": 4.1379310344827587e-05,
"loss": 1.5153,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 0.05204508826136589,
"learning_rate": 4.827586206896552e-05,
"loss": 1.1674,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 0.054778363555669785,
"learning_rate": 5.517241379310345e-05,
"loss": 1.3888,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 0.055702440440654755,
"learning_rate": 6.206896551724138e-05,
"loss": 1.2173,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 0.05100104212760925,
"learning_rate": 6.896551724137931e-05,
"loss": 1.2199,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.06114739924669266,
"learning_rate": 7.586206896551724e-05,
"loss": 1.2196,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 0.07207280397415161,
"learning_rate": 8.275862068965517e-05,
"loss": 1.287,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.061948470771312714,
"learning_rate": 8.96551724137931e-05,
"loss": 1.1897,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 0.05975250154733658,
"learning_rate": 9.655172413793105e-05,
"loss": 1.0783,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 0.0717136338353157,
"learning_rate": 0.00010344827586206898,
"loss": 1.3379,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.07506729662418365,
"learning_rate": 0.0001103448275862069,
"loss": 1.277,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 0.06755080819129944,
"learning_rate": 0.00011724137931034482,
"loss": 1.3304,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 0.08883846551179886,
"learning_rate": 0.00012413793103448277,
"loss": 1.2343,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 0.06401929259300232,
"learning_rate": 0.00013103448275862068,
"loss": 1.2448,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 0.09903116524219513,
"learning_rate": 0.00013793103448275863,
"loss": 1.2738,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.07536918669939041,
"learning_rate": 0.00014482758620689657,
"loss": 1.2626,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 0.07821318507194519,
"learning_rate": 0.00015172413793103449,
"loss": 1.326,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 0.07713277637958527,
"learning_rate": 0.00015862068965517243,
"loss": 1.1252,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 0.09503794461488724,
"learning_rate": 0.00016551724137931035,
"loss": 1.3358,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 0.08219444751739502,
"learning_rate": 0.00017241379310344826,
"loss": 1.2813,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.11026415228843689,
"learning_rate": 0.0001793103448275862,
"loss": 1.4659,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 0.1077384352684021,
"learning_rate": 0.00018620689655172415,
"loss": 1.4567,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 0.09956091642379761,
"learning_rate": 0.0001931034482758621,
"loss": 1.3511,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 0.0894646868109703,
"learning_rate": 0.0002,
"loss": 1.1795,
"step": 29
},
{
"epoch": 0.02,
"grad_norm": 0.07985596358776093,
"learning_rate": 0.00019999994691707293,
"loss": 1.3384,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.10679838061332703,
"learning_rate": 0.00019999978766834805,
"loss": 1.4829,
"step": 31
},
{
"epoch": 0.02,
"grad_norm": 0.08870743215084076,
"learning_rate": 0.0001999995222539944,
"loss": 1.1258,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 0.09697020053863525,
"learning_rate": 0.00019999915067429384,
"loss": 1.3016,
"step": 33
},
{
"epoch": 0.02,
"grad_norm": 0.08852187544107437,
"learning_rate": 0.00019999867292964078,
"loss": 1.3058,
"step": 34
},
{
"epoch": 0.02,
"grad_norm": 0.09163249284029007,
"learning_rate": 0.0001999980890205425,
"loss": 1.1355,
"step": 35
},
{
"epoch": 0.02,
"grad_norm": 0.07654478400945663,
"learning_rate": 0.0001999973989476188,
"loss": 1.1533,
"step": 36
},
{
"epoch": 0.02,
"grad_norm": 0.08472617715597153,
"learning_rate": 0.00019999660271160242,
"loss": 1.1486,
"step": 37
},
{
"epoch": 0.02,
"grad_norm": 0.09144127368927002,
"learning_rate": 0.0001999957003133386,
"loss": 1.17,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 0.08375002443790436,
"learning_rate": 0.00019999469175378543,
"loss": 0.9908,
"step": 39
},
{
"epoch": 0.03,
"grad_norm": 0.0837416797876358,
"learning_rate": 0.00019999357703401365,
"loss": 1.0564,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.11098498106002808,
"learning_rate": 0.00019999235615520672,
"loss": 1.3512,
"step": 41
},
{
"epoch": 0.03,
"grad_norm": 0.10009056329727173,
"learning_rate": 0.00019999102911866077,
"loss": 1.4577,
"step": 42
},
{
"epoch": 0.03,
"grad_norm": 0.08330542594194412,
"learning_rate": 0.00019998959592578466,
"loss": 1.1307,
"step": 43
},
{
"epoch": 0.03,
"grad_norm": 0.10253599286079407,
"learning_rate": 0.00019998805657809996,
"loss": 1.2409,
"step": 44
},
{
"epoch": 0.03,
"grad_norm": 0.08261312544345856,
"learning_rate": 0.00019998641107724097,
"loss": 1.1321,
"step": 45
},
{
"epoch": 0.03,
"grad_norm": 0.0797719806432724,
"learning_rate": 0.00019998465942495458,
"loss": 1.1719,
"step": 46
},
{
"epoch": 0.03,
"grad_norm": 0.08123207092285156,
"learning_rate": 0.0001999828016231005,
"loss": 1.0263,
"step": 47
},
{
"epoch": 0.03,
"grad_norm": 0.07530490309000015,
"learning_rate": 0.00019998083767365103,
"loss": 1.2375,
"step": 48
},
{
"epoch": 0.03,
"grad_norm": 0.077210433781147,
"learning_rate": 0.00019997876757869127,
"loss": 1.1426,
"step": 49
},
{
"epoch": 0.03,
"grad_norm": 0.07056202739477158,
"learning_rate": 0.00019997659134041894,
"loss": 0.9958,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.07623480260372162,
"learning_rate": 0.00019997430896114446,
"loss": 1.0555,
"step": 51
},
{
"epoch": 0.03,
"grad_norm": 0.07929116487503052,
"learning_rate": 0.0001999719204432909,
"loss": 1.0228,
"step": 52
},
{
"epoch": 0.03,
"grad_norm": 0.07111712545156479,
"learning_rate": 0.0001999694257893941,
"loss": 1.2484,
"step": 53
},
{
"epoch": 0.04,
"grad_norm": 0.08941549062728882,
"learning_rate": 0.0001999668250021025,
"loss": 1.2755,
"step": 54
},
{
"epoch": 0.04,
"grad_norm": 0.07243392616510391,
"learning_rate": 0.00019996411808417724,
"loss": 1.2864,
"step": 55
},
{
"epoch": 0.04,
"grad_norm": 0.11266268789768219,
"learning_rate": 0.0001999613050384922,
"loss": 1.1744,
"step": 56
},
{
"epoch": 0.04,
"grad_norm": 0.07067809998989105,
"learning_rate": 0.0001999583858680338,
"loss": 1.0887,
"step": 57
},
{
"epoch": 0.04,
"grad_norm": 0.06493864208459854,
"learning_rate": 0.00019995536057590128,
"loss": 1.2716,
"step": 58
},
{
"epoch": 0.04,
"grad_norm": 0.07525993883609772,
"learning_rate": 0.00019995222916530637,
"loss": 1.2011,
"step": 59
},
{
"epoch": 0.04,
"grad_norm": 0.1059408187866211,
"learning_rate": 0.00019994899163957368,
"loss": 1.2519,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 0.07853084802627563,
"learning_rate": 0.00019994564800214024,
"loss": 1.2919,
"step": 61
},
{
"epoch": 0.04,
"grad_norm": 0.0701480507850647,
"learning_rate": 0.0001999421982565559,
"loss": 1.0858,
"step": 62
},
{
"epoch": 0.04,
"grad_norm": 0.07502806931734085,
"learning_rate": 0.00019993864240648313,
"loss": 1.0008,
"step": 63
},
{
"epoch": 0.04,
"grad_norm": 0.08633171766996384,
"learning_rate": 0.00019993498045569702,
"loss": 1.1728,
"step": 64
},
{
"epoch": 0.04,
"grad_norm": 0.08813374489545822,
"learning_rate": 0.0001999312124080853,
"loss": 1.0864,
"step": 65
},
{
"epoch": 0.04,
"grad_norm": 0.09121447801589966,
"learning_rate": 0.00019992733826764835,
"loss": 1.1113,
"step": 66
},
{
"epoch": 0.04,
"grad_norm": 0.07591919600963593,
"learning_rate": 0.00019992335803849917,
"loss": 1.202,
"step": 67
},
{
"epoch": 0.04,
"grad_norm": 0.07328435033559799,
"learning_rate": 0.00019991927172486346,
"loss": 1.1644,
"step": 68
},
{
"epoch": 0.04,
"grad_norm": 0.08068283647298813,
"learning_rate": 0.0001999150793310794,
"loss": 1.106,
"step": 69
},
{
"epoch": 0.05,
"grad_norm": 0.08356733620166779,
"learning_rate": 0.00019991078086159796,
"loss": 1.1861,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.09514696896076202,
"learning_rate": 0.00019990637632098262,
"loss": 1.344,
"step": 71
},
{
"epoch": 0.05,
"grad_norm": 0.07595695555210114,
"learning_rate": 0.0001999018657139095,
"loss": 1.3151,
"step": 72
},
{
"epoch": 0.05,
"grad_norm": 0.07353411614894867,
"learning_rate": 0.00019989724904516734,
"loss": 1.2144,
"step": 73
},
{
"epoch": 0.05,
"grad_norm": 0.1090092882514,
"learning_rate": 0.00019989252631965742,
"loss": 1.12,
"step": 74
},
{
"epoch": 0.05,
"grad_norm": 0.0808551162481308,
"learning_rate": 0.00019988769754239368,
"loss": 1.0838,
"step": 75
},
{
"epoch": 0.05,
"grad_norm": 0.07115597277879715,
"learning_rate": 0.00019988276271850266,
"loss": 1.3786,
"step": 76
},
{
"epoch": 0.05,
"grad_norm": 0.06909359991550446,
"learning_rate": 0.00019987772185322343,
"loss": 1.0922,
"step": 77
},
{
"epoch": 0.05,
"grad_norm": 0.08590813726186752,
"learning_rate": 0.0001998725749519077,
"loss": 1.4139,
"step": 78
},
{
"epoch": 0.05,
"grad_norm": 0.12101796269416809,
"learning_rate": 0.00019986732202001965,
"loss": 1.2673,
"step": 79
},
{
"epoch": 0.05,
"grad_norm": 0.06808914244174957,
"learning_rate": 0.0001998619630631362,
"loss": 1.2045,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 0.08627394586801529,
"learning_rate": 0.00019985649808694665,
"loss": 1.2819,
"step": 81
},
{
"epoch": 0.05,
"grad_norm": 0.07175503671169281,
"learning_rate": 0.000199850927097253,
"loss": 1.4064,
"step": 82
},
{
"epoch": 0.05,
"grad_norm": 0.08057812601327896,
"learning_rate": 0.0001998452500999697,
"loss": 1.3813,
"step": 83
},
{
"epoch": 0.05,
"grad_norm": 0.18742507696151733,
"learning_rate": 0.00019983946710112378,
"loss": 1.184,
"step": 84
},
{
"epoch": 0.06,
"grad_norm": 0.07114308327436447,
"learning_rate": 0.00019983357810685482,
"loss": 1.1716,
"step": 85
},
{
"epoch": 0.06,
"grad_norm": 0.08055424690246582,
"learning_rate": 0.00019982758312341495,
"loss": 1.2713,
"step": 86
},
{
"epoch": 0.06,
"grad_norm": 0.07629523426294327,
"learning_rate": 0.00019982148215716875,
"loss": 1.1986,
"step": 87
},
{
"epoch": 0.06,
"grad_norm": 0.08360457420349121,
"learning_rate": 0.00019981527521459338,
"loss": 1.0301,
"step": 88
},
{
"epoch": 0.06,
"grad_norm": 0.07372694462537766,
"learning_rate": 0.00019980896230227847,
"loss": 1.0811,
"step": 89
},
{
"epoch": 0.06,
"grad_norm": 0.06955759227275848,
"learning_rate": 0.00019980254342692627,
"loss": 0.8951,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 0.07801992446184158,
"learning_rate": 0.0001997960185953513,
"loss": 1.2038,
"step": 91
},
{
"epoch": 0.06,
"grad_norm": 0.08545531332492828,
"learning_rate": 0.0001997893878144808,
"loss": 1.2636,
"step": 92
},
{
"epoch": 0.06,
"grad_norm": 0.061103224754333496,
"learning_rate": 0.00019978265109135435,
"loss": 1.1628,
"step": 93
},
{
"epoch": 0.06,
"grad_norm": 0.08182393014431,
"learning_rate": 0.0001997758084331241,
"loss": 1.1868,
"step": 94
},
{
"epoch": 0.06,
"grad_norm": 0.06987016648054123,
"learning_rate": 0.00019976885984705452,
"loss": 1.2146,
"step": 95
},
{
"epoch": 0.06,
"grad_norm": 0.08362079411745071,
"learning_rate": 0.00019976180534052274,
"loss": 1.0584,
"step": 96
},
{
"epoch": 0.06,
"grad_norm": 0.07390179485082626,
"learning_rate": 0.00019975464492101819,
"loss": 1.2111,
"step": 97
},
{
"epoch": 0.06,
"grad_norm": 0.0953076109290123,
"learning_rate": 0.00019974737859614278,
"loss": 1.4299,
"step": 98
},
{
"epoch": 0.06,
"grad_norm": 0.07653916627168655,
"learning_rate": 0.00019974000637361088,
"loss": 1.1821,
"step": 99
},
{
"epoch": 0.06,
"grad_norm": 0.08676808327436447,
"learning_rate": 0.0001997325282612493,
"loss": 1.2879,
"step": 100
},
{
"epoch": 0.07,
"grad_norm": 0.0871780589222908,
"learning_rate": 0.00019972494426699717,
"loss": 1.2175,
"step": 101
},
{
"epoch": 0.07,
"grad_norm": 0.08263079077005386,
"learning_rate": 0.00019971725439890618,
"loss": 1.4334,
"step": 102
},
{
"epoch": 0.07,
"grad_norm": 0.07416750490665436,
"learning_rate": 0.0001997094586651403,
"loss": 1.3992,
"step": 103
},
{
"epoch": 0.07,
"grad_norm": 0.07549911737442017,
"learning_rate": 0.00019970155707397594,
"loss": 1.1736,
"step": 104
},
{
"epoch": 0.07,
"grad_norm": 0.07236723601818085,
"learning_rate": 0.00019969354963380193,
"loss": 1.3373,
"step": 105
},
{
"epoch": 0.07,
"grad_norm": 0.07686607539653778,
"learning_rate": 0.0001996854363531194,
"loss": 1.2163,
"step": 106
},
{
"epoch": 0.07,
"grad_norm": 0.08150803297758102,
"learning_rate": 0.00019967721724054185,
"loss": 1.3929,
"step": 107
},
{
"epoch": 0.07,
"grad_norm": 0.08020401746034622,
"learning_rate": 0.00019966889230479525,
"loss": 1.204,
"step": 108
},
{
"epoch": 0.07,
"grad_norm": 0.07023321092128754,
"learning_rate": 0.0001996604615547178,
"loss": 1.1341,
"step": 109
},
{
"epoch": 0.07,
"grad_norm": 0.082249216735363,
"learning_rate": 0.00019965192499926007,
"loss": 1.1813,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 0.08552736788988113,
"learning_rate": 0.000199643282647485,
"loss": 1.0374,
"step": 111
},
{
"epoch": 0.07,
"grad_norm": 0.0845017060637474,
"learning_rate": 0.00019963453450856774,
"loss": 1.4907,
"step": 112
},
{
"epoch": 0.07,
"grad_norm": 0.10722214728593826,
"learning_rate": 0.00019962568059179593,
"loss": 0.8428,
"step": 113
},
{
"epoch": 0.07,
"grad_norm": 0.07669556140899658,
"learning_rate": 0.00019961672090656934,
"loss": 1.1715,
"step": 114
},
{
"epoch": 0.07,
"grad_norm": 0.16831474006175995,
"learning_rate": 0.0001996076554624001,
"loss": 1.0702,
"step": 115
},
{
"epoch": 0.08,
"grad_norm": 0.07218927145004272,
"learning_rate": 0.00019959848426891265,
"loss": 1.1708,
"step": 116
},
{
"epoch": 0.08,
"grad_norm": 0.07960865646600723,
"learning_rate": 0.00019958920733584363,
"loss": 1.2616,
"step": 117
},
{
"epoch": 0.08,
"grad_norm": 0.08504796773195267,
"learning_rate": 0.00019957982467304198,
"loss": 1.1766,
"step": 118
},
{
"epoch": 0.08,
"grad_norm": 0.10497059673070908,
"learning_rate": 0.0001995703362904689,
"loss": 1.0157,
"step": 119
},
{
"epoch": 0.08,
"grad_norm": 0.07764813303947449,
"learning_rate": 0.00019956074219819783,
"loss": 1.1179,
"step": 120
},
{
"epoch": 0.08,
"grad_norm": 0.07821417599916458,
"learning_rate": 0.00019955104240641439,
"loss": 1.3126,
"step": 121
},
{
"epoch": 0.08,
"grad_norm": 0.0767795741558075,
"learning_rate": 0.00019954123692541643,
"loss": 1.247,
"step": 122
},
{
"epoch": 0.08,
"grad_norm": 0.07959768921136856,
"learning_rate": 0.00019953132576561405,
"loss": 1.0639,
"step": 123
},
{
"epoch": 0.08,
"grad_norm": 0.07457169890403748,
"learning_rate": 0.0001995213089375295,
"loss": 1.323,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 0.07672613114118576,
"learning_rate": 0.00019951118645179726,
"loss": 1.2996,
"step": 125
},
{
"epoch": 0.08,
"grad_norm": 0.09855501353740692,
"learning_rate": 0.00019950095831916396,
"loss": 1.1613,
"step": 126
},
{
"epoch": 0.08,
"grad_norm": 0.10536333918571472,
"learning_rate": 0.00019949062455048834,
"loss": 1.5025,
"step": 127
},
{
"epoch": 0.08,
"grad_norm": 0.07425690442323685,
"learning_rate": 0.00019948018515674135,
"loss": 0.8924,
"step": 128
},
{
"epoch": 0.08,
"grad_norm": 0.08064056187868118,
"learning_rate": 0.00019946964014900607,
"loss": 1.1245,
"step": 129
},
{
"epoch": 0.08,
"grad_norm": 0.08193577080965042,
"learning_rate": 0.0001994589895384777,
"loss": 1.3086,
"step": 130
},
{
"epoch": 0.09,
"grad_norm": 0.13796043395996094,
"learning_rate": 0.00019944823333646356,
"loss": 1.2822,
"step": 131
},
{
"epoch": 0.09,
"grad_norm": 0.0710272490978241,
"learning_rate": 0.00019943737155438303,
"loss": 1.1385,
"step": 132
},
{
"epoch": 0.09,
"grad_norm": 0.07783588021993637,
"learning_rate": 0.00019942640420376765,
"loss": 1.1241,
"step": 133
},
{
"epoch": 0.09,
"grad_norm": 0.07037730515003204,
"learning_rate": 0.00019941533129626095,
"loss": 1.1858,
"step": 134
},
{
"epoch": 0.09,
"grad_norm": 0.07877272367477417,
"learning_rate": 0.00019940415284361865,
"loss": 1.1963,
"step": 135
},
{
"epoch": 0.09,
"grad_norm": 0.09204830974340439,
"learning_rate": 0.0001993928688577084,
"loss": 1.3757,
"step": 136
},
{
"epoch": 0.09,
"grad_norm": 0.07677577435970306,
"learning_rate": 0.00019938147935050994,
"loss": 1.1891,
"step": 137
},
{
"epoch": 0.09,
"grad_norm": 0.08180911839008331,
"learning_rate": 0.00019936998433411508,
"loss": 1.4319,
"step": 138
},
{
"epoch": 0.09,
"grad_norm": 0.08735460042953491,
"learning_rate": 0.00019935838382072755,
"loss": 1.2685,
"step": 139
},
{
"epoch": 0.09,
"grad_norm": 0.09186897426843643,
"learning_rate": 0.00019934667782266314,
"loss": 1.118,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 0.07723281532526016,
"learning_rate": 0.00019933486635234963,
"loss": 1.0832,
"step": 141
},
{
"epoch": 0.09,
"grad_norm": 0.07641208916902542,
"learning_rate": 0.00019932294942232682,
"loss": 1.2359,
"step": 142
},
{
"epoch": 0.09,
"grad_norm": 0.07022741436958313,
"learning_rate": 0.00019931092704524636,
"loss": 0.836,
"step": 143
},
{
"epoch": 0.09,
"grad_norm": 0.07212373614311218,
"learning_rate": 0.00019929879923387192,
"loss": 1.2083,
"step": 144
},
{
"epoch": 0.09,
"grad_norm": 0.08122804015874863,
"learning_rate": 0.0001992865660010791,
"loss": 0.9731,
"step": 145
},
{
"epoch": 0.09,
"grad_norm": 0.07840394973754883,
"learning_rate": 0.00019927422735985543,
"loss": 1.31,
"step": 146
},
{
"epoch": 0.1,
"grad_norm": 0.07877013832330704,
"learning_rate": 0.00019926178332330032,
"loss": 1.0037,
"step": 147
},
{
"epoch": 0.1,
"grad_norm": 0.10172662883996964,
"learning_rate": 0.0001992492339046251,
"loss": 1.1724,
"step": 148
},
{
"epoch": 0.1,
"grad_norm": 0.08578412234783173,
"learning_rate": 0.00019923657911715296,
"loss": 1.2727,
"step": 149
},
{
"epoch": 0.1,
"grad_norm": 0.07262091338634491,
"learning_rate": 0.00019922381897431892,
"loss": 1.2786,
"step": 150
},
{
"epoch": 0.1,
"grad_norm": 0.06889116019010544,
"learning_rate": 0.00019921095348966996,
"loss": 1.2328,
"step": 151
},
{
"epoch": 0.1,
"grad_norm": 0.06720105558633804,
"learning_rate": 0.0001991979826768648,
"loss": 1.207,
"step": 152
},
{
"epoch": 0.1,
"grad_norm": 0.12904532253742218,
"learning_rate": 0.00019918490654967404,
"loss": 1.1355,
"step": 153
},
{
"epoch": 0.1,
"grad_norm": 0.07100478559732437,
"learning_rate": 0.00019917172512198,
"loss": 1.1669,
"step": 154
},
{
"epoch": 0.1,
"grad_norm": 0.08603204786777496,
"learning_rate": 0.00019915843840777694,
"loss": 1.3417,
"step": 155
},
{
"epoch": 0.1,
"grad_norm": 0.07161790132522583,
"learning_rate": 0.00019914504642117075,
"loss": 1.1422,
"step": 156
},
{
"epoch": 0.1,
"grad_norm": 0.0849173441529274,
"learning_rate": 0.00019913154917637916,
"loss": 1.1043,
"step": 157
},
{
"epoch": 0.1,
"grad_norm": 0.07790779322385788,
"learning_rate": 0.00019911794668773166,
"loss": 0.983,
"step": 158
},
{
"epoch": 0.1,
"grad_norm": 0.08227023482322693,
"learning_rate": 0.00019910423896966943,
"loss": 1.0398,
"step": 159
},
{
"epoch": 0.1,
"grad_norm": 0.07356717437505722,
"learning_rate": 0.0001990904260367454,
"loss": 1.0846,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 0.06824516505002975,
"learning_rate": 0.00019907650790362415,
"loss": 1.096,
"step": 161
},
{
"epoch": 0.11,
"grad_norm": 0.08319918066263199,
"learning_rate": 0.00019906248458508203,
"loss": 1.166,
"step": 162
},
{
"epoch": 0.11,
"grad_norm": 0.0659998208284378,
"learning_rate": 0.000199048356096007,
"loss": 0.8599,
"step": 163
},
{
"epoch": 0.11,
"grad_norm": 0.08006949722766876,
"learning_rate": 0.00019903412245139866,
"loss": 1.0234,
"step": 164
},
{
"epoch": 0.11,
"grad_norm": 0.07593873143196106,
"learning_rate": 0.00019901978366636833,
"loss": 1.251,
"step": 165
},
{
"epoch": 0.11,
"grad_norm": 0.07520274072885513,
"learning_rate": 0.0001990053397561389,
"loss": 1.1673,
"step": 166
},
{
"epoch": 0.11,
"grad_norm": 0.09776092320680618,
"learning_rate": 0.0001989907907360448,
"loss": 1.3006,
"step": 167
},
{
"epoch": 0.11,
"grad_norm": 0.08538515865802765,
"learning_rate": 0.00019897613662153222,
"loss": 0.9225,
"step": 168
},
{
"epoch": 0.11,
"grad_norm": 0.09008879214525223,
"learning_rate": 0.00019896137742815876,
"loss": 1.3857,
"step": 169
},
{
"epoch": 0.11,
"grad_norm": 0.07463444769382477,
"learning_rate": 0.00019894651317159368,
"loss": 1.3,
"step": 170
},
{
"epoch": 0.11,
"grad_norm": 0.08576199412345886,
"learning_rate": 0.00019893154386761773,
"loss": 0.7807,
"step": 171
},
{
"epoch": 0.11,
"grad_norm": 0.10605023801326752,
"learning_rate": 0.00019891646953212316,
"loss": 1.2742,
"step": 172
},
{
"epoch": 0.11,
"grad_norm": 0.07358279824256897,
"learning_rate": 0.00019890129018111384,
"loss": 1.2885,
"step": 173
},
{
"epoch": 0.11,
"grad_norm": 0.08171935379505157,
"learning_rate": 0.000198886005830705,
"loss": 1.1483,
"step": 174
},
{
"epoch": 0.11,
"grad_norm": 0.0678820013999939,
"learning_rate": 0.00019887061649712345,
"loss": 1.1168,
"step": 175
},
{
"epoch": 0.11,
"grad_norm": 0.07885902374982834,
"learning_rate": 0.00019885512219670735,
"loss": 1.1202,
"step": 176
},
{
"epoch": 0.11,
"grad_norm": 0.0892176702618599,
"learning_rate": 0.0001988395229459064,
"loss": 1.0907,
"step": 177
},
{
"epoch": 0.12,
"grad_norm": 0.07069101929664612,
"learning_rate": 0.0001988238187612817,
"loss": 1.0559,
"step": 178
},
{
"epoch": 0.12,
"grad_norm": 0.07359961420297623,
"learning_rate": 0.00019880800965950567,
"loss": 1.2076,
"step": 179
},
{
"epoch": 0.12,
"grad_norm": 0.07424277067184448,
"learning_rate": 0.00019879209565736218,
"loss": 1.1892,
"step": 180
},
{
"epoch": 0.12,
"grad_norm": 0.0668448731303215,
"learning_rate": 0.00019877607677174652,
"loss": 1.197,
"step": 181
},
{
"epoch": 0.12,
"grad_norm": 0.08037786185741425,
"learning_rate": 0.00019875995301966523,
"loss": 1.0088,
"step": 182
},
{
"epoch": 0.12,
"grad_norm": 0.0884958952665329,
"learning_rate": 0.00019874372441823629,
"loss": 1.0746,
"step": 183
},
{
"epoch": 0.12,
"grad_norm": 0.0994100570678711,
"learning_rate": 0.00019872739098468885,
"loss": 1.2779,
"step": 184
},
{
"epoch": 0.12,
"grad_norm": 0.07933379709720612,
"learning_rate": 0.0001987109527363635,
"loss": 1.0492,
"step": 185
},
{
"epoch": 0.12,
"grad_norm": 0.08432573080062866,
"learning_rate": 0.000198694409690712,
"loss": 1.1103,
"step": 186
},
{
"epoch": 0.12,
"grad_norm": 0.09406879544258118,
"learning_rate": 0.00019867776186529746,
"loss": 1.4604,
"step": 187
},
{
"epoch": 0.12,
"grad_norm": 0.0750434622168541,
"learning_rate": 0.00019866100927779415,
"loss": 1.0356,
"step": 188
},
{
"epoch": 0.12,
"grad_norm": 0.08250833302736282,
"learning_rate": 0.00019864415194598766,
"loss": 1.2232,
"step": 189
},
{
"epoch": 0.12,
"grad_norm": 0.09028942137956619,
"learning_rate": 0.00019862718988777463,
"loss": 1.3559,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 0.10870659351348877,
"learning_rate": 0.00019861012312116306,
"loss": 1.1435,
"step": 191
},
{
"epoch": 0.12,
"grad_norm": 0.07719945907592773,
"learning_rate": 0.00019859295166427198,
"loss": 1.3938,
"step": 192
},
{
"epoch": 0.13,
"grad_norm": 0.10856524854898453,
"learning_rate": 0.0001985756755353316,
"loss": 1.3754,
"step": 193
},
{
"epoch": 0.13,
"grad_norm": 0.07811608165502548,
"learning_rate": 0.0001985582947526833,
"loss": 1.0972,
"step": 194
},
{
"epoch": 0.13,
"grad_norm": 0.078819140791893,
"learning_rate": 0.00019854080933477954,
"loss": 1.1798,
"step": 195
},
{
"epoch": 0.13,
"grad_norm": 0.07991020381450653,
"learning_rate": 0.00019852321930018384,
"loss": 1.0345,
"step": 196
},
{
"epoch": 0.13,
"grad_norm": 0.07344697415828705,
"learning_rate": 0.00019850552466757083,
"loss": 1.1361,
"step": 197
},
{
"epoch": 0.13,
"grad_norm": 0.09107469767332077,
"learning_rate": 0.00019848772545572616,
"loss": 1.3357,
"step": 198
},
{
"epoch": 0.13,
"grad_norm": 0.08043038845062256,
"learning_rate": 0.00019846982168354653,
"loss": 1.1087,
"step": 199
},
{
"epoch": 0.13,
"grad_norm": 0.0794915184378624,
"learning_rate": 0.00019845181337003963,
"loss": 1.0507,
"step": 200
},
{
"epoch": 0.13,
"grad_norm": 0.09108548611402512,
"learning_rate": 0.00019843370053432407,
"loss": 0.9492,
"step": 201
},
{
"epoch": 0.13,
"grad_norm": 0.07458540052175522,
"learning_rate": 0.0001984154831956296,
"loss": 1.2029,
"step": 202
},
{
"epoch": 0.13,
"grad_norm": 0.08279601484537125,
"learning_rate": 0.00019839716137329676,
"loss": 1.3183,
"step": 203
},
{
"epoch": 0.13,
"grad_norm": 0.07415657490491867,
"learning_rate": 0.00019837873508677707,
"loss": 1.0252,
"step": 204
},
{
"epoch": 0.13,
"grad_norm": 0.06688812375068665,
"learning_rate": 0.00019836020435563297,
"loss": 1.2808,
"step": 205
},
{
"epoch": 0.13,
"grad_norm": 0.07196924835443497,
"learning_rate": 0.00019834156919953775,
"loss": 1.2533,
"step": 206
},
{
"epoch": 0.13,
"grad_norm": 0.07465454190969467,
"learning_rate": 0.0001983228296382756,
"loss": 1.3787,
"step": 207
},
{
"epoch": 0.14,
"grad_norm": 0.07878649234771729,
"learning_rate": 0.00019830398569174154,
"loss": 0.9646,
"step": 208
},
{
"epoch": 0.14,
"grad_norm": 0.10482143610715866,
"learning_rate": 0.00019828503737994138,
"loss": 1.4395,
"step": 209
},
{
"epoch": 0.14,
"grad_norm": 0.0868290439248085,
"learning_rate": 0.00019826598472299177,
"loss": 1.2466,
"step": 210
},
{
"epoch": 0.14,
"grad_norm": 0.06954513490200043,
"learning_rate": 0.00019824682774112015,
"loss": 1.0051,
"step": 211
},
{
"epoch": 0.14,
"grad_norm": 0.08132015913724899,
"learning_rate": 0.00019822756645466468,
"loss": 1.1945,
"step": 212
},
{
"epoch": 0.14,
"grad_norm": 0.08712561428546906,
"learning_rate": 0.00019820820088407422,
"loss": 1.3696,
"step": 213
},
{
"epoch": 0.14,
"grad_norm": 0.07907038927078247,
"learning_rate": 0.00019818873104990846,
"loss": 1.1412,
"step": 214
},
{
"epoch": 0.14,
"grad_norm": 0.07620605081319809,
"learning_rate": 0.00019816915697283766,
"loss": 1.2331,
"step": 215
},
{
"epoch": 0.14,
"grad_norm": 0.08560928702354431,
"learning_rate": 0.00019814947867364288,
"loss": 1.1327,
"step": 216
},
{
"epoch": 0.14,
"grad_norm": 0.08849047124385834,
"learning_rate": 0.00019812969617321571,
"loss": 1.4031,
"step": 217
},
{
"epoch": 0.14,
"grad_norm": 0.17463655769824982,
"learning_rate": 0.00019810980949255838,
"loss": 1.3308,
"step": 218
},
{
"epoch": 0.14,
"grad_norm": 0.09303506463766098,
"learning_rate": 0.0001980898186527838,
"loss": 1.4082,
"step": 219
},
{
"epoch": 0.14,
"grad_norm": 0.07691019773483276,
"learning_rate": 0.0001980697236751154,
"loss": 1.3857,
"step": 220
},
{
"epoch": 0.14,
"grad_norm": 0.07158955186605453,
"learning_rate": 0.0001980495245808872,
"loss": 1.0965,
"step": 221
},
{
"epoch": 0.14,
"grad_norm": 0.0742129236459732,
"learning_rate": 0.00019802922139154375,
"loss": 1.2464,
"step": 222
},
{
"epoch": 0.14,
"grad_norm": 0.07991943508386612,
"learning_rate": 0.00019800881412864002,
"loss": 1.2154,
"step": 223
},
{
"epoch": 0.15,
"grad_norm": 0.07743264734745026,
"learning_rate": 0.00019798830281384167,
"loss": 1.2393,
"step": 224
},
{
"epoch": 0.15,
"grad_norm": 0.08122200518846512,
"learning_rate": 0.00019796768746892463,
"loss": 1.2993,
"step": 225
},
{
"epoch": 0.15,
"grad_norm": 0.07908771932125092,
"learning_rate": 0.00019794696811577538,
"loss": 0.9361,
"step": 226
},
{
"epoch": 0.15,
"grad_norm": 0.06955181062221527,
"learning_rate": 0.00019792614477639082,
"loss": 1.0452,
"step": 227
},
{
"epoch": 0.15,
"grad_norm": 0.0668589249253273,
"learning_rate": 0.0001979052174728782,
"loss": 1.1022,
"step": 228
},
{
"epoch": 0.15,
"grad_norm": 0.08599793165922165,
"learning_rate": 0.0001978841862274552,
"loss": 1.2652,
"step": 229
},
{
"epoch": 0.15,
"grad_norm": 0.06755118072032928,
"learning_rate": 0.00019786305106244981,
"loss": 1.0727,
"step": 230
},
{
"epoch": 0.15,
"grad_norm": 0.08138154447078705,
"learning_rate": 0.0001978418120003003,
"loss": 1.343,
"step": 231
},
{
"epoch": 0.15,
"grad_norm": 0.07889607548713684,
"learning_rate": 0.0001978204690635554,
"loss": 1.1579,
"step": 232
},
{
"epoch": 0.15,
"grad_norm": 0.0738462284207344,
"learning_rate": 0.00019779902227487397,
"loss": 1.1958,
"step": 233
},
{
"epoch": 0.15,
"grad_norm": 0.07153750211000443,
"learning_rate": 0.00019777747165702518,
"loss": 0.8827,
"step": 234
},
{
"epoch": 0.15,
"grad_norm": 0.08947383612394333,
"learning_rate": 0.00019775581723288844,
"loss": 1.1964,
"step": 235
},
{
"epoch": 0.15,
"grad_norm": 0.07474005222320557,
"learning_rate": 0.0001977340590254533,
"loss": 1.1022,
"step": 236
},
{
"epoch": 0.15,
"grad_norm": 0.07341979444026947,
"learning_rate": 0.0001977121970578196,
"loss": 1.0886,
"step": 237
},
{
"epoch": 0.15,
"grad_norm": 0.08199238777160645,
"learning_rate": 0.0001976902313531973,
"loss": 1.3859,
"step": 238
},
{
"epoch": 0.16,
"grad_norm": 0.07367771118879318,
"learning_rate": 0.00019766816193490642,
"loss": 1.1585,
"step": 239
},
{
"epoch": 0.16,
"grad_norm": 0.06804653257131577,
"learning_rate": 0.0001976459888263772,
"loss": 1.22,
"step": 240
},
{
"epoch": 0.16,
"grad_norm": 0.07716865837574005,
"learning_rate": 0.00019762371205114988,
"loss": 1.2162,
"step": 241
},
{
"epoch": 0.16,
"grad_norm": 0.08062226325273514,
"learning_rate": 0.00019760133163287483,
"loss": 1.2478,
"step": 242
},
{
"epoch": 0.16,
"grad_norm": 0.0740450844168663,
"learning_rate": 0.00019757884759531233,
"loss": 1.3709,
"step": 243
},
{
"epoch": 0.16,
"grad_norm": 0.07524896413087845,
"learning_rate": 0.00019755625996233285,
"loss": 1.0305,
"step": 244
},
{
"epoch": 0.16,
"grad_norm": 0.08010102808475494,
"learning_rate": 0.00019753356875791664,
"loss": 0.9281,
"step": 245
},
{
"epoch": 0.16,
"grad_norm": 0.08609338104724884,
"learning_rate": 0.0001975107740061541,
"loss": 1.1635,
"step": 246
},
{
"epoch": 0.16,
"grad_norm": 0.07261325418949127,
"learning_rate": 0.0001974878757312454,
"loss": 1.145,
"step": 247
},
{
"epoch": 0.16,
"grad_norm": 0.08195792138576508,
"learning_rate": 0.00019746487395750078,
"loss": 1.2246,
"step": 248
},
{
"epoch": 0.16,
"grad_norm": 0.07104358822107315,
"learning_rate": 0.00019744176870934018,
"loss": 1.0597,
"step": 249
},
{
"epoch": 0.16,
"grad_norm": 0.06879540532827377,
"learning_rate": 0.00019741856001129354,
"loss": 0.9544,
"step": 250
},
{
"epoch": 0.16,
"grad_norm": 0.0912511944770813,
"learning_rate": 0.00019739524788800052,
"loss": 1.3106,
"step": 251
},
{
"epoch": 0.16,
"grad_norm": 0.08150241523981094,
"learning_rate": 0.00019737183236421068,
"loss": 1.1349,
"step": 252
},
{
"epoch": 0.16,
"grad_norm": 0.06917703151702881,
"learning_rate": 0.00019734831346478331,
"loss": 1.0447,
"step": 253
},
{
"epoch": 0.16,
"grad_norm": 0.0651431605219841,
"learning_rate": 0.00019732469121468743,
"loss": 1.033,
"step": 254
},
{
"epoch": 0.17,
"grad_norm": 0.08859477192163467,
"learning_rate": 0.0001973009656390018,
"loss": 1.1675,
"step": 255
},
{
"epoch": 0.17,
"grad_norm": 0.07274781912565231,
"learning_rate": 0.00019727713676291492,
"loss": 1.2356,
"step": 256
},
{
"epoch": 0.17,
"grad_norm": 0.07679511606693268,
"learning_rate": 0.0001972532046117249,
"loss": 1.3373,
"step": 257
},
{
"epoch": 0.17,
"grad_norm": 0.07444384694099426,
"learning_rate": 0.00019722916921083948,
"loss": 1.0004,
"step": 258
},
{
"epoch": 0.17,
"grad_norm": 0.08357756584882736,
"learning_rate": 0.00019720503058577607,
"loss": 1.3211,
"step": 259
},
{
"epoch": 0.17,
"grad_norm": 0.08496179431676865,
"learning_rate": 0.00019718078876216163,
"loss": 1.3928,
"step": 260
},
{
"epoch": 0.17,
"grad_norm": 0.08116108924150467,
"learning_rate": 0.00019715644376573277,
"loss": 1.0351,
"step": 261
},
{
"epoch": 0.17,
"grad_norm": 0.08486796915531158,
"learning_rate": 0.0001971319956223355,
"loss": 1.2676,
"step": 262
},
{
"epoch": 0.17,
"grad_norm": 0.08111484348773956,
"learning_rate": 0.0001971074443579254,
"loss": 1.1039,
"step": 263
},
{
"epoch": 0.17,
"grad_norm": 0.07966972142457962,
"learning_rate": 0.00019708278999856752,
"loss": 1.2049,
"step": 264
},
{
"epoch": 0.17,
"grad_norm": 0.0737084299325943,
"learning_rate": 0.00019705803257043646,
"loss": 1.362,
"step": 265
},
{
"epoch": 0.17,
"grad_norm": 0.09911826252937317,
"learning_rate": 0.00019703317209981603,
"loss": 1.5312,
"step": 266
},
{
"epoch": 0.17,
"grad_norm": 0.07375174760818481,
"learning_rate": 0.00019700820861309962,
"loss": 1.3419,
"step": 267
},
{
"epoch": 0.17,
"grad_norm": 0.07868732511997223,
"learning_rate": 0.00019698314213679,
"loss": 1.1063,
"step": 268
},
{
"epoch": 0.17,
"grad_norm": 0.08604758977890015,
"learning_rate": 0.00019695797269749906,
"loss": 1.2156,
"step": 269
},
{
"epoch": 0.18,
"grad_norm": 0.08033949881792068,
"learning_rate": 0.00019693270032194825,
"loss": 1.3969,
"step": 270
},
{
"epoch": 0.18,
"grad_norm": 0.07744378596544266,
"learning_rate": 0.0001969073250369682,
"loss": 1.0613,
"step": 271
},
{
"epoch": 0.18,
"grad_norm": 0.08643119782209396,
"learning_rate": 0.00019688184686949878,
"loss": 1.4592,
"step": 272
},
{
"epoch": 0.18,
"grad_norm": 0.07818787544965744,
"learning_rate": 0.00019685626584658907,
"loss": 1.1422,
"step": 273
},
{
"epoch": 0.18,
"grad_norm": 0.08434268832206726,
"learning_rate": 0.00019683058199539746,
"loss": 1.1046,
"step": 274
},
{
"epoch": 0.18,
"grad_norm": 0.06768738478422165,
"learning_rate": 0.00019680479534319134,
"loss": 0.972,
"step": 275
},
{
"epoch": 0.18,
"grad_norm": 0.06793482601642609,
"learning_rate": 0.0001967789059173474,
"loss": 1.1469,
"step": 276
},
{
"epoch": 0.18,
"grad_norm": 0.10266832262277603,
"learning_rate": 0.00019675291374535134,
"loss": 1.2476,
"step": 277
},
{
"epoch": 0.18,
"grad_norm": 0.07190185785293579,
"learning_rate": 0.00019672681885479797,
"loss": 0.9282,
"step": 278
},
{
"epoch": 0.18,
"grad_norm": 0.08223461359739304,
"learning_rate": 0.0001967006212733912,
"loss": 1.3062,
"step": 279
},
{
"epoch": 0.18,
"grad_norm": 0.08083578199148178,
"learning_rate": 0.00019667432102894383,
"loss": 0.9372,
"step": 280
},
{
"epoch": 0.18,
"grad_norm": 0.08270005881786346,
"learning_rate": 0.0001966479181493778,
"loss": 1.1984,
"step": 281
},
{
"epoch": 0.18,
"grad_norm": 0.07582583278417587,
"learning_rate": 0.00019662141266272394,
"loss": 1.0484,
"step": 282
},
{
"epoch": 0.18,
"grad_norm": 0.07078398019075394,
"learning_rate": 0.00019659480459712202,
"loss": 1.116,
"step": 283
},
{
"epoch": 0.18,
"grad_norm": 0.09360076487064362,
"learning_rate": 0.00019656809398082073,
"loss": 0.88,
"step": 284
},
{
"epoch": 0.19,
"grad_norm": 0.08306850492954254,
"learning_rate": 0.0001965412808421776,
"loss": 1.2109,
"step": 285
},
{
"epoch": 0.19,
"grad_norm": 0.07452476024627686,
"learning_rate": 0.00019651436520965907,
"loss": 1.0769,
"step": 286
},
{
"epoch": 0.19,
"grad_norm": 0.10528121888637543,
"learning_rate": 0.00019648734711184032,
"loss": 0.8117,
"step": 287
},
{
"epoch": 0.19,
"grad_norm": 0.08129299432039261,
"learning_rate": 0.00019646022657740533,
"loss": 1.3569,
"step": 288
},
{
"epoch": 0.19,
"grad_norm": 0.07164785265922546,
"learning_rate": 0.0001964330036351469,
"loss": 1.1295,
"step": 289
},
{
"epoch": 0.19,
"grad_norm": 0.08173338323831558,
"learning_rate": 0.00019640567831396647,
"loss": 0.6919,
"step": 290
},
{
"epoch": 0.19,
"grad_norm": 0.07579617202281952,
"learning_rate": 0.00019637825064287417,
"loss": 1.1889,
"step": 291
},
{
"epoch": 0.19,
"grad_norm": 0.08440928906202316,
"learning_rate": 0.00019635072065098886,
"loss": 1.2052,
"step": 292
},
{
"epoch": 0.19,
"grad_norm": 0.08670910447835922,
"learning_rate": 0.00019632308836753796,
"loss": 0.9882,
"step": 293
},
{
"epoch": 0.19,
"grad_norm": 0.08135852217674255,
"learning_rate": 0.00019629535382185759,
"loss": 1.1185,
"step": 294
},
{
"epoch": 0.19,
"grad_norm": 0.07762303203344345,
"learning_rate": 0.00019626751704339226,
"loss": 1.0747,
"step": 295
},
{
"epoch": 0.19,
"grad_norm": 0.07294236123561859,
"learning_rate": 0.0001962395780616952,
"loss": 1.223,
"step": 296
},
{
"epoch": 0.19,
"grad_norm": 0.08544450998306274,
"learning_rate": 0.00019621153690642807,
"loss": 1.1139,
"step": 297
},
{
"epoch": 0.19,
"grad_norm": 0.07629240304231644,
"learning_rate": 0.00019618339360736098,
"loss": 1.1728,
"step": 298
},
{
"epoch": 0.19,
"grad_norm": 0.08711448311805725,
"learning_rate": 0.00019615514819437249,
"loss": 1.0552,
"step": 299
},
{
"epoch": 0.19,
"grad_norm": 0.09498873353004456,
"learning_rate": 0.00019612680069744959,
"loss": 0.8618,
"step": 300
},
{
"epoch": 0.2,
"grad_norm": 0.08398744463920593,
"learning_rate": 0.00019609835114668767,
"loss": 1.2068,
"step": 301
},
{
"epoch": 0.2,
"grad_norm": 0.07466951012611389,
"learning_rate": 0.00019606979957229043,
"loss": 1.0717,
"step": 302
},
{
"epoch": 0.2,
"grad_norm": 0.07318463176488876,
"learning_rate": 0.00019604114600456986,
"loss": 1.1175,
"step": 303
},
{
"epoch": 0.2,
"grad_norm": 0.07363720238208771,
"learning_rate": 0.00019601239047394627,
"loss": 1.2193,
"step": 304
},
{
"epoch": 0.2,
"grad_norm": 0.07690288871526718,
"learning_rate": 0.00019598353301094828,
"loss": 1.0208,
"step": 305
},
{
"epoch": 0.2,
"grad_norm": 0.0708373636007309,
"learning_rate": 0.0001959545736462126,
"loss": 1.2279,
"step": 306
},
{
"epoch": 0.2,
"grad_norm": 0.08571004122495651,
"learning_rate": 0.0001959255124104842,
"loss": 1.2219,
"step": 307
},
{
"epoch": 0.2,
"grad_norm": 0.0700921043753624,
"learning_rate": 0.00019589634933461618,
"loss": 1.2477,
"step": 308
},
{
"epoch": 0.2,
"grad_norm": 0.0815667137503624,
"learning_rate": 0.00019586708444956977,
"loss": 1.3261,
"step": 309
},
{
"epoch": 0.2,
"grad_norm": 0.08132670074701309,
"learning_rate": 0.00019583771778641432,
"loss": 1.2293,
"step": 310
},
{
"epoch": 0.2,
"grad_norm": 0.07306857407093048,
"learning_rate": 0.00019580824937632718,
"loss": 1.0716,
"step": 311
},
{
"epoch": 0.2,
"grad_norm": 0.07567529380321503,
"learning_rate": 0.0001957786792505937,
"loss": 1.1178,
"step": 312
},
{
"epoch": 0.2,
"grad_norm": 0.07349325716495514,
"learning_rate": 0.00019574900744060732,
"loss": 1.2462,
"step": 313
},
{
"epoch": 0.2,
"grad_norm": 0.09117168933153152,
"learning_rate": 0.00019571923397786932,
"loss": 1.4343,
"step": 314
},
{
"epoch": 0.2,
"grad_norm": 0.09968636184930801,
"learning_rate": 0.000195689358893989,
"loss": 1.0374,
"step": 315
},
{
"epoch": 0.21,
"grad_norm": 0.06757254153490067,
"learning_rate": 0.0001956593822206834,
"loss": 0.9678,
"step": 316
},
{
"epoch": 0.21,
"grad_norm": 0.06875491142272949,
"learning_rate": 0.00019562930398977764,
"loss": 0.8834,
"step": 317
},
{
"epoch": 0.21,
"grad_norm": 0.07561076432466507,
"learning_rate": 0.00019559912423320445,
"loss": 1.0253,
"step": 318
},
{
"epoch": 0.21,
"grad_norm": 0.08350048214197159,
"learning_rate": 0.00019556884298300447,
"loss": 1.3002,
"step": 319
},
{
"epoch": 0.21,
"grad_norm": 0.08164501190185547,
"learning_rate": 0.00019553846027132597,
"loss": 1.0691,
"step": 320
},
{
"epoch": 0.21,
"grad_norm": 0.06864047795534134,
"learning_rate": 0.00019550797613042512,
"loss": 1.1126,
"step": 321
},
{
"epoch": 0.21,
"grad_norm": 0.09170655161142349,
"learning_rate": 0.00019547739059266563,
"loss": 1.2081,
"step": 322
},
{
"epoch": 0.21,
"grad_norm": 0.07826890796422958,
"learning_rate": 0.00019544670369051886,
"loss": 0.9041,
"step": 323
},
{
"epoch": 0.21,
"grad_norm": 0.08353405445814133,
"learning_rate": 0.00019541591545656382,
"loss": 1.1917,
"step": 324
},
{
"epoch": 0.21,
"grad_norm": 0.0875379666686058,
"learning_rate": 0.0001953850259234872,
"loss": 1.4304,
"step": 325
},
{
"epoch": 0.21,
"grad_norm": 0.0946691557765007,
"learning_rate": 0.00019535403512408302,
"loss": 1.1827,
"step": 326
},
{
"epoch": 0.21,
"grad_norm": 0.08039974421262741,
"learning_rate": 0.00019532294309125296,
"loss": 1.0794,
"step": 327
},
{
"epoch": 0.21,
"grad_norm": 0.08648931235074997,
"learning_rate": 0.0001952917498580062,
"loss": 1.0611,
"step": 328
},
{
"epoch": 0.21,
"grad_norm": 0.07675964385271072,
"learning_rate": 0.0001952604554574592,
"loss": 1.2255,
"step": 329
},
{
"epoch": 0.21,
"grad_norm": 0.08626928925514221,
"learning_rate": 0.00019522905992283603,
"loss": 1.1067,
"step": 330
},
{
"epoch": 0.22,
"grad_norm": 0.07394007593393326,
"learning_rate": 0.000195197563287468,
"loss": 1.273,
"step": 331
},
{
"epoch": 0.22,
"grad_norm": 0.08016256988048553,
"learning_rate": 0.00019516596558479373,
"loss": 1.2918,
"step": 332
},
{
"epoch": 0.22,
"grad_norm": 0.07985451072454453,
"learning_rate": 0.00019513426684835924,
"loss": 1.3254,
"step": 333
},
{
"epoch": 0.22,
"grad_norm": 0.07685229182243347,
"learning_rate": 0.00019510246711181773,
"loss": 1.0379,
"step": 334
},
{
"epoch": 0.22,
"grad_norm": 0.08119890838861465,
"learning_rate": 0.0001950705664089297,
"loss": 1.2837,
"step": 335
},
{
"epoch": 0.22,
"grad_norm": 0.08366896957159042,
"learning_rate": 0.00019503856477356277,
"loss": 1.0833,
"step": 336
},
{
"epoch": 0.22,
"grad_norm": 0.08915286511182785,
"learning_rate": 0.0001950064622396918,
"loss": 1.3782,
"step": 337
},
{
"epoch": 0.22,
"grad_norm": 0.07876568287611008,
"learning_rate": 0.00019497425884139867,
"loss": 1.1196,
"step": 338
},
{
"epoch": 0.22,
"grad_norm": 0.07202233374118805,
"learning_rate": 0.00019494195461287238,
"loss": 1.227,
"step": 339
},
{
"epoch": 0.22,
"grad_norm": 0.0898287370800972,
"learning_rate": 0.00019490954958840907,
"loss": 1.0928,
"step": 340
},
{
"epoch": 0.22,
"grad_norm": 0.07852581143379211,
"learning_rate": 0.00019487704380241171,
"loss": 1.213,
"step": 341
},
{
"epoch": 0.22,
"grad_norm": 0.06838871538639069,
"learning_rate": 0.00019484443728939041,
"loss": 0.8097,
"step": 342
},
{
"epoch": 0.22,
"grad_norm": 0.08563709259033203,
"learning_rate": 0.00019481173008396212,
"loss": 1.211,
"step": 343
},
{
"epoch": 0.22,
"grad_norm": 0.07851383090019226,
"learning_rate": 0.00019477892222085076,
"loss": 1.1013,
"step": 344
},
{
"epoch": 0.22,
"grad_norm": 0.07817473262548447,
"learning_rate": 0.00019474601373488704,
"loss": 1.0356,
"step": 345
},
{
"epoch": 0.22,
"grad_norm": 0.08778605610132217,
"learning_rate": 0.00019471300466100862,
"loss": 1.3028,
"step": 346
},
{
"epoch": 0.23,
"grad_norm": 0.08105292916297913,
"learning_rate": 0.00019467989503425974,
"loss": 1.2975,
"step": 347
},
{
"epoch": 0.23,
"grad_norm": 0.0834394320845604,
"learning_rate": 0.0001946466848897916,
"loss": 1.2351,
"step": 348
},
{
"epoch": 0.23,
"grad_norm": 0.07740943878889084,
"learning_rate": 0.000194613374262862,
"loss": 1.1116,
"step": 349
},
{
"epoch": 0.23,
"grad_norm": 0.07536373287439346,
"learning_rate": 0.00019457996318883547,
"loss": 1.2281,
"step": 350
},
{
"epoch": 0.23,
"grad_norm": 0.08035381883382797,
"learning_rate": 0.00019454645170318316,
"loss": 1.1608,
"step": 351
},
{
"epoch": 0.23,
"grad_norm": 0.08857207000255585,
"learning_rate": 0.00019451283984148277,
"loss": 1.2222,
"step": 352
},
{
"epoch": 0.23,
"grad_norm": 0.09400638937950134,
"learning_rate": 0.00019447912763941873,
"loss": 1.122,
"step": 353
},
{
"epoch": 0.23,
"grad_norm": 0.08267956227064133,
"learning_rate": 0.0001944453151327818,
"loss": 1.3113,
"step": 354
},
{
"epoch": 0.23,
"grad_norm": 0.08970284461975098,
"learning_rate": 0.00019441140235746938,
"loss": 1.219,
"step": 355
},
{
"epoch": 0.23,
"grad_norm": 0.07805363833904266,
"learning_rate": 0.00019437738934948516,
"loss": 1.2004,
"step": 356
},
{
"epoch": 0.23,
"grad_norm": 0.08512768894433975,
"learning_rate": 0.00019434327614493947,
"loss": 1.1079,
"step": 357
},
{
"epoch": 0.23,
"grad_norm": 0.08124563097953796,
"learning_rate": 0.00019430906278004878,
"loss": 1.3318,
"step": 358
},
{
"epoch": 0.23,
"grad_norm": 0.08076978474855423,
"learning_rate": 0.00019427474929113603,
"loss": 1.3367,
"step": 359
},
{
"epoch": 0.23,
"grad_norm": 0.08717597275972366,
"learning_rate": 0.00019424033571463045,
"loss": 1.0656,
"step": 360
},
{
"epoch": 0.23,
"grad_norm": 0.07822860032320023,
"learning_rate": 0.0001942058220870675,
"loss": 1.3459,
"step": 361
},
{
"epoch": 0.24,
"grad_norm": 0.08764126896858215,
"learning_rate": 0.00019417120844508883,
"loss": 1.3596,
"step": 362
},
{
"epoch": 0.24,
"grad_norm": 0.0675569474697113,
"learning_rate": 0.0001941364948254424,
"loss": 0.9151,
"step": 363
},
{
"epoch": 0.24,
"grad_norm": 0.08149274438619614,
"learning_rate": 0.0001941016812649821,
"loss": 1.1278,
"step": 364
},
{
"epoch": 0.24,
"grad_norm": 0.09104771912097931,
"learning_rate": 0.00019406676780066816,
"loss": 1.1458,
"step": 365
},
{
"epoch": 0.24,
"grad_norm": 0.07882712781429291,
"learning_rate": 0.00019403175446956663,
"loss": 1.2976,
"step": 366
},
{
"epoch": 0.24,
"grad_norm": 0.07717309147119522,
"learning_rate": 0.00019399664130884982,
"loss": 1.2499,
"step": 367
},
{
"epoch": 0.24,
"grad_norm": 0.07685019075870514,
"learning_rate": 0.0001939614283557959,
"loss": 0.9801,
"step": 368
},
{
"epoch": 0.24,
"grad_norm": 0.07864333689212799,
"learning_rate": 0.000193926115647789,
"loss": 1.0911,
"step": 369
},
{
"epoch": 0.24,
"grad_norm": 0.08465702831745148,
"learning_rate": 0.00019389070322231908,
"loss": 1.0375,
"step": 370
},
{
"epoch": 0.24,
"grad_norm": 0.07431544363498688,
"learning_rate": 0.00019385519111698215,
"loss": 1.1673,
"step": 371
},
{
"epoch": 0.24,
"grad_norm": 0.08939298987388611,
"learning_rate": 0.00019381957936947988,
"loss": 1.4116,
"step": 372
},
{
"epoch": 0.24,
"grad_norm": 0.07779692113399506,
"learning_rate": 0.00019378386801761983,
"loss": 0.9468,
"step": 373
},
{
"epoch": 0.24,
"grad_norm": 0.07538998126983643,
"learning_rate": 0.0001937480570993152,
"loss": 1.3055,
"step": 374
},
{
"epoch": 0.24,
"grad_norm": 0.11219945549964905,
"learning_rate": 0.00019371214665258498,
"loss": 1.2806,
"step": 375
},
{
"epoch": 0.24,
"grad_norm": 0.08720182627439499,
"learning_rate": 0.00019367613671555386,
"loss": 1.1344,
"step": 376
},
{
"epoch": 0.24,
"grad_norm": 0.08663522452116013,
"learning_rate": 0.00019364002732645202,
"loss": 1.2102,
"step": 377
},
{
"epoch": 0.25,
"grad_norm": 0.08982711285352707,
"learning_rate": 0.00019360381852361535,
"loss": 1.358,
"step": 378
},
{
"epoch": 0.25,
"grad_norm": 0.0780293419957161,
"learning_rate": 0.0001935675103454852,
"loss": 1.2362,
"step": 379
},
{
"epoch": 0.25,
"grad_norm": 0.06800080090761185,
"learning_rate": 0.00019353110283060846,
"loss": 0.9151,
"step": 380
},
{
"epoch": 0.25,
"grad_norm": 0.0782063826918602,
"learning_rate": 0.00019349459601763753,
"loss": 1.23,
"step": 381
},
{
"epoch": 0.25,
"grad_norm": 0.0815950483083725,
"learning_rate": 0.00019345798994533012,
"loss": 1.1367,
"step": 382
},
{
"epoch": 0.25,
"grad_norm": 0.07667894661426544,
"learning_rate": 0.00019342128465254943,
"loss": 1.1646,
"step": 383
},
{
"epoch": 0.25,
"grad_norm": 0.09157751500606537,
"learning_rate": 0.00019338448017826388,
"loss": 1.2779,
"step": 384
},
{
"epoch": 0.25,
"grad_norm": 0.08936314284801483,
"learning_rate": 0.00019334757656154733,
"loss": 1.2413,
"step": 385
},
{
"epoch": 0.25,
"grad_norm": 0.06992750614881516,
"learning_rate": 0.00019331057384157875,
"loss": 0.9875,
"step": 386
},
{
"epoch": 0.25,
"grad_norm": 0.07639253884553909,
"learning_rate": 0.00019327347205764245,
"loss": 0.8025,
"step": 387
},
{
"epoch": 0.25,
"grad_norm": 0.11455408483743668,
"learning_rate": 0.00019323627124912785,
"loss": 1.2565,
"step": 388
},
{
"epoch": 0.25,
"grad_norm": 0.07387693971395493,
"learning_rate": 0.00019319897145552947,
"loss": 1.0765,
"step": 389
},
{
"epoch": 0.25,
"grad_norm": 0.07396475970745087,
"learning_rate": 0.00019316157271644696,
"loss": 1.3985,
"step": 390
},
{
"epoch": 0.25,
"grad_norm": 0.10893306136131287,
"learning_rate": 0.00019312407507158502,
"loss": 1.2244,
"step": 391
},
{
"epoch": 0.25,
"grad_norm": 0.07123012095689774,
"learning_rate": 0.0001930864785607534,
"loss": 1.1026,
"step": 392
},
{
"epoch": 0.26,
"grad_norm": 0.07318564504384995,
"learning_rate": 0.00019304878322386668,
"loss": 1.0512,
"step": 393
},
{
"epoch": 0.26,
"grad_norm": 0.09317158162593842,
"learning_rate": 0.00019301098910094443,
"loss": 1.1896,
"step": 394
},
{
"epoch": 0.26,
"grad_norm": 0.07076855003833771,
"learning_rate": 0.00019297309623211118,
"loss": 1.1452,
"step": 395
},
{
"epoch": 0.26,
"grad_norm": 0.09200429171323776,
"learning_rate": 0.00019293510465759618,
"loss": 1.408,
"step": 396
},
{
"epoch": 0.26,
"grad_norm": 0.08040551096200943,
"learning_rate": 0.00019289701441773348,
"loss": 0.9442,
"step": 397
},
{
"epoch": 0.26,
"grad_norm": 0.08749471604824066,
"learning_rate": 0.00019285882555296192,
"loss": 1.1273,
"step": 398
},
{
"epoch": 0.26,
"grad_norm": 0.07210942357778549,
"learning_rate": 0.0001928205381038251,
"loss": 1.1232,
"step": 399
},
{
"epoch": 0.26,
"grad_norm": 0.09227026253938675,
"learning_rate": 0.00019278215211097113,
"loss": 0.9206,
"step": 400
},
{
"epoch": 0.26,
"grad_norm": 0.07559602707624435,
"learning_rate": 0.00019274366761515288,
"loss": 0.9949,
"step": 401
},
{
"epoch": 0.26,
"grad_norm": 0.08838319033384323,
"learning_rate": 0.0001927050846572277,
"loss": 1.2597,
"step": 402
},
{
"epoch": 0.26,
"grad_norm": 0.08586682379245758,
"learning_rate": 0.00019266640327815756,
"loss": 1.2862,
"step": 403
},
{
"epoch": 0.26,
"grad_norm": 0.08682897686958313,
"learning_rate": 0.00019262762351900884,
"loss": 1.0765,
"step": 404
},
{
"epoch": 0.26,
"grad_norm": 0.08626113086938858,
"learning_rate": 0.00019258874542095244,
"loss": 1.2946,
"step": 405
},
{
"epoch": 0.26,
"grad_norm": 0.08612517267465591,
"learning_rate": 0.0001925497690252636,
"loss": 1.2083,
"step": 406
},
{
"epoch": 0.26,
"grad_norm": 0.08026706427335739,
"learning_rate": 0.00019251069437332196,
"loss": 1.13,
"step": 407
},
{
"epoch": 0.27,
"grad_norm": 0.07586629688739777,
"learning_rate": 0.00019247152150661144,
"loss": 1.3407,
"step": 408
},
{
"epoch": 0.27,
"grad_norm": 0.06716896593570709,
"learning_rate": 0.00019243225046672023,
"loss": 1.1228,
"step": 409
},
{
"epoch": 0.27,
"grad_norm": 0.0866006463766098,
"learning_rate": 0.00019239288129534082,
"loss": 1.2552,
"step": 410
},
{
"epoch": 0.27,
"grad_norm": 0.08225997537374496,
"learning_rate": 0.00019235341403426982,
"loss": 1.3139,
"step": 411
},
{
"epoch": 0.27,
"grad_norm": 0.08211803436279297,
"learning_rate": 0.00019231384872540793,
"loss": 1.3132,
"step": 412
},
{
"epoch": 0.27,
"grad_norm": 0.07636349648237228,
"learning_rate": 0.00019227418541076005,
"loss": 1.0955,
"step": 413
},
{
"epoch": 0.27,
"grad_norm": 0.07598800212144852,
"learning_rate": 0.00019223442413243507,
"loss": 1.0774,
"step": 414
},
{
"epoch": 0.27,
"grad_norm": 0.0703662857413292,
"learning_rate": 0.00019219456493264585,
"loss": 1.0358,
"step": 415
},
{
"epoch": 0.27,
"grad_norm": 0.0865321159362793,
"learning_rate": 0.00019215460785370928,
"loss": 1.1194,
"step": 416
},
{
"epoch": 0.27,
"grad_norm": 0.07862474769353867,
"learning_rate": 0.00019211455293804614,
"loss": 1.1625,
"step": 417
},
{
"epoch": 0.27,
"grad_norm": 0.07098463177680969,
"learning_rate": 0.00019207440022818109,
"loss": 1.0096,
"step": 418
},
{
"epoch": 0.27,
"grad_norm": 0.088544100522995,
"learning_rate": 0.00019203414976674252,
"loss": 1.0971,
"step": 419
},
{
"epoch": 0.27,
"grad_norm": 0.0754735916852951,
"learning_rate": 0.00019199380159646277,
"loss": 1.102,
"step": 420
},
{
"epoch": 0.27,
"grad_norm": 0.08316062390804291,
"learning_rate": 0.00019195335576017777,
"loss": 0.8386,
"step": 421
},
{
"epoch": 0.27,
"grad_norm": 0.08435340225696564,
"learning_rate": 0.00019191281230082722,
"loss": 1.4531,
"step": 422
},
{
"epoch": 0.27,
"grad_norm": 0.0735434889793396,
"learning_rate": 0.00019187217126145437,
"loss": 1.1754,
"step": 423
},
{
"epoch": 0.28,
"grad_norm": 0.07428895682096481,
"learning_rate": 0.0001918314326852062,
"loss": 1.0972,
"step": 424
},
{
"epoch": 0.28,
"grad_norm": 0.08614860475063324,
"learning_rate": 0.0001917905966153331,
"loss": 1.1855,
"step": 425
},
{
"epoch": 0.28,
"grad_norm": 0.08168008923530579,
"learning_rate": 0.00019174966309518906,
"loss": 0.9313,
"step": 426
},
{
"epoch": 0.28,
"grad_norm": 0.07268556207418442,
"learning_rate": 0.00019170863216823154,
"loss": 1.3363,
"step": 427
},
{
"epoch": 0.28,
"grad_norm": 0.08409485965967178,
"learning_rate": 0.0001916675038780213,
"loss": 1.4261,
"step": 428
},
{
"epoch": 0.28,
"grad_norm": 0.07452593743801117,
"learning_rate": 0.0001916262782682226,
"loss": 0.948,
"step": 429
},
{
"epoch": 0.28,
"grad_norm": 0.07678116858005524,
"learning_rate": 0.00019158495538260293,
"loss": 1.1868,
"step": 430
},
{
"epoch": 0.28,
"grad_norm": 0.08513883501291275,
"learning_rate": 0.00019154353526503314,
"loss": 1.4855,
"step": 431
},
{
"epoch": 0.28,
"grad_norm": 0.11117757111787796,
"learning_rate": 0.00019150201795948714,
"loss": 1.3191,
"step": 432
},
{
"epoch": 0.28,
"grad_norm": 0.0691884309053421,
"learning_rate": 0.00019146040351004223,
"loss": 0.8405,
"step": 433
},
{
"epoch": 0.28,
"grad_norm": 0.08929750323295593,
"learning_rate": 0.0001914186919608787,
"loss": 1.1456,
"step": 434
},
{
"epoch": 0.28,
"grad_norm": 0.0731349065899849,
"learning_rate": 0.00019137688335628002,
"loss": 1.1522,
"step": 435
},
{
"epoch": 0.28,
"grad_norm": 0.09056232869625092,
"learning_rate": 0.00019133497774063258,
"loss": 1.376,
"step": 436
},
{
"epoch": 0.28,
"grad_norm": 0.08234091848134995,
"learning_rate": 0.0001912929751584259,
"loss": 1.2767,
"step": 437
},
{
"epoch": 0.28,
"grad_norm": 0.08351151645183563,
"learning_rate": 0.00019125087565425236,
"loss": 1.2058,
"step": 438
},
{
"epoch": 0.29,
"grad_norm": 0.08435379713773727,
"learning_rate": 0.00019120867927280722,
"loss": 1.3188,
"step": 439
},
{
"epoch": 0.29,
"grad_norm": 0.07525074481964111,
"learning_rate": 0.00019116638605888868,
"loss": 1.1302,
"step": 440
},
{
"epoch": 0.29,
"grad_norm": 0.08379056304693222,
"learning_rate": 0.00019112399605739766,
"loss": 1.0837,
"step": 441
},
{
"epoch": 0.29,
"grad_norm": 0.07879301905632019,
"learning_rate": 0.00019108150931333788,
"loss": 1.1719,
"step": 442
},
{
"epoch": 0.29,
"grad_norm": 0.07724307477474213,
"learning_rate": 0.00019103892587181578,
"loss": 1.2476,
"step": 443
},
{
"epoch": 0.29,
"grad_norm": 0.07908225059509277,
"learning_rate": 0.00019099624577804042,
"loss": 1.2337,
"step": 444
},
{
"epoch": 0.29,
"grad_norm": 0.07688111811876297,
"learning_rate": 0.0001909534690773234,
"loss": 1.0977,
"step": 445
},
{
"epoch": 0.29,
"grad_norm": 0.09893277287483215,
"learning_rate": 0.0001909105958150791,
"loss": 1.2359,
"step": 446
},
{
"epoch": 0.29,
"grad_norm": 0.08339065313339233,
"learning_rate": 0.00019086762603682424,
"loss": 1.1732,
"step": 447
},
{
"epoch": 0.29,
"grad_norm": 0.08026134967803955,
"learning_rate": 0.00019082455978817803,
"loss": 1.2408,
"step": 448
},
{
"epoch": 0.29,
"grad_norm": 0.2762611210346222,
"learning_rate": 0.00019078139711486213,
"loss": 1.1373,
"step": 449
},
{
"epoch": 0.29,
"grad_norm": 0.07496568560600281,
"learning_rate": 0.00019073813806270054,
"loss": 1.2368,
"step": 450
},
{
"epoch": 0.29,
"grad_norm": 0.08035270869731903,
"learning_rate": 0.00019069478267761967,
"loss": 1.1902,
"step": 451
},
{
"epoch": 0.29,
"grad_norm": 0.08573023974895477,
"learning_rate": 0.00019065133100564804,
"loss": 1.1078,
"step": 452
},
{
"epoch": 0.29,
"grad_norm": 0.08823239058256149,
"learning_rate": 0.00019060778309291658,
"loss": 1.1754,
"step": 453
},
{
"epoch": 0.29,
"grad_norm": 0.08498603105545044,
"learning_rate": 0.00019056413898565824,
"loss": 1.1661,
"step": 454
},
{
"epoch": 0.3,
"grad_norm": 0.08216524124145508,
"learning_rate": 0.0001905203987302082,
"loss": 0.9903,
"step": 455
},
{
"epoch": 0.3,
"grad_norm": 0.07343865931034088,
"learning_rate": 0.00019047656237300363,
"loss": 1.163,
"step": 456
},
{
"epoch": 0.3,
"grad_norm": 0.07787960767745972,
"learning_rate": 0.00019043262996058384,
"loss": 1.0901,
"step": 457
},
{
"epoch": 0.3,
"grad_norm": 0.08267030864953995,
"learning_rate": 0.00019038860153959,
"loss": 1.1531,
"step": 458
},
{
"epoch": 0.3,
"grad_norm": 0.09128779172897339,
"learning_rate": 0.00019034447715676525,
"loss": 0.9449,
"step": 459
},
{
"epoch": 0.3,
"grad_norm": 0.07797211408615112,
"learning_rate": 0.00019030025685895464,
"loss": 1.2067,
"step": 460
},
{
"epoch": 0.3,
"grad_norm": 0.07567018270492554,
"learning_rate": 0.00019025594069310505,
"loss": 1.134,
"step": 461
},
{
"epoch": 0.3,
"grad_norm": 0.07905828207731247,
"learning_rate": 0.00019021152870626507,
"loss": 1.2258,
"step": 462
},
{
"epoch": 0.3,
"grad_norm": 0.08188856393098831,
"learning_rate": 0.00019016702094558512,
"loss": 1.4545,
"step": 463
},
{
"epoch": 0.3,
"grad_norm": 0.0827045813202858,
"learning_rate": 0.00019012241745831716,
"loss": 1.3086,
"step": 464
},
{
"epoch": 0.3,
"grad_norm": 0.0808766558766365,
"learning_rate": 0.00019007771829181495,
"loss": 1.1352,
"step": 465
},
{
"epoch": 0.3,
"grad_norm": 0.08212947845458984,
"learning_rate": 0.00019003292349353373,
"loss": 1.0694,
"step": 466
},
{
"epoch": 0.3,
"grad_norm": 0.10015026479959488,
"learning_rate": 0.0001899880331110302,
"loss": 0.881,
"step": 467
},
{
"epoch": 0.3,
"grad_norm": 0.07001382857561111,
"learning_rate": 0.0001899430471919627,
"loss": 1.1825,
"step": 468
},
{
"epoch": 0.3,
"grad_norm": 0.08494579046964645,
"learning_rate": 0.0001898979657840909,
"loss": 1.5053,
"step": 469
},
{
"epoch": 0.31,
"grad_norm": 0.07976152747869492,
"learning_rate": 0.00018985278893527582,
"loss": 0.9016,
"step": 470
},
{
"epoch": 0.31,
"grad_norm": 0.07986082881689072,
"learning_rate": 0.00018980751669347992,
"loss": 1.2176,
"step": 471
},
{
"epoch": 0.31,
"grad_norm": 0.0754285380244255,
"learning_rate": 0.00018976214910676679,
"loss": 1.246,
"step": 472
},
{
"epoch": 0.31,
"grad_norm": 0.09687798470258713,
"learning_rate": 0.00018971668622330137,
"loss": 1.1258,
"step": 473
},
{
"epoch": 0.31,
"grad_norm": 0.0783214420080185,
"learning_rate": 0.00018967112809134968,
"loss": 1.2674,
"step": 474
},
{
"epoch": 0.31,
"grad_norm": 0.08914946019649506,
"learning_rate": 0.00018962547475927892,
"loss": 1.2167,
"step": 475
},
{
"epoch": 0.31,
"grad_norm": 0.08197241276502609,
"learning_rate": 0.00018957972627555732,
"loss": 1.0172,
"step": 476
},
{
"epoch": 0.31,
"grad_norm": 0.09656665474176407,
"learning_rate": 0.0001895338826887542,
"loss": 1.3342,
"step": 477
},
{
"epoch": 0.31,
"grad_norm": 0.07986515760421753,
"learning_rate": 0.00018948794404753975,
"loss": 1.3102,
"step": 478
},
{
"epoch": 0.31,
"grad_norm": 0.07593046873807907,
"learning_rate": 0.00018944191040068514,
"loss": 1.0646,
"step": 479
},
{
"epoch": 0.31,
"grad_norm": 0.08059150725603104,
"learning_rate": 0.00018939578179706233,
"loss": 1.0846,
"step": 480
},
{
"epoch": 0.31,
"grad_norm": 0.07469641417264938,
"learning_rate": 0.0001893495582856442,
"loss": 1.1961,
"step": 481
},
{
"epoch": 0.31,
"grad_norm": 0.0868467465043068,
"learning_rate": 0.00018930323991550436,
"loss": 1.1271,
"step": 482
},
{
"epoch": 0.31,
"grad_norm": 0.08924498409032822,
"learning_rate": 0.00018925682673581707,
"loss": 1.0574,
"step": 483
},
{
"epoch": 0.31,
"grad_norm": 0.09144362807273865,
"learning_rate": 0.00018921031879585724,
"loss": 1.3689,
"step": 484
},
{
"epoch": 0.32,
"grad_norm": 0.08528386801481247,
"learning_rate": 0.00018916371614500048,
"loss": 1.2619,
"step": 485
},
{
"epoch": 0.32,
"grad_norm": 0.0836653783917427,
"learning_rate": 0.00018911701883272288,
"loss": 1.0822,
"step": 486
},
{
"epoch": 0.32,
"grad_norm": 0.07838030159473419,
"learning_rate": 0.00018907022690860104,
"loss": 1.1393,
"step": 487
},
{
"epoch": 0.32,
"grad_norm": 0.08181433379650116,
"learning_rate": 0.00018902334042231197,
"loss": 1.0902,
"step": 488
},
{
"epoch": 0.32,
"grad_norm": 0.07703683525323868,
"learning_rate": 0.00018897635942363318,
"loss": 0.8397,
"step": 489
},
{
"epoch": 0.32,
"grad_norm": 0.07494264096021652,
"learning_rate": 0.00018892928396244235,
"loss": 1.0948,
"step": 490
},
{
"epoch": 0.32,
"grad_norm": 0.06938749551773071,
"learning_rate": 0.00018888211408871767,
"loss": 1.2057,
"step": 491
},
{
"epoch": 0.32,
"grad_norm": 0.09794944524765015,
"learning_rate": 0.00018883484985253733,
"loss": 1.3597,
"step": 492
},
{
"epoch": 0.32,
"grad_norm": 0.0744868814945221,
"learning_rate": 0.00018878749130407985,
"loss": 1.2162,
"step": 493
},
{
"epoch": 0.32,
"grad_norm": 0.06919589638710022,
"learning_rate": 0.00018874003849362386,
"loss": 1.1939,
"step": 494
},
{
"epoch": 0.32,
"grad_norm": 0.0835278257727623,
"learning_rate": 0.000188692491471548,
"loss": 1.2375,
"step": 495
},
{
"epoch": 0.32,
"grad_norm": 0.06999599188566208,
"learning_rate": 0.00018864485028833097,
"loss": 1.1542,
"step": 496
},
{
"epoch": 0.32,
"grad_norm": 0.06855190545320511,
"learning_rate": 0.0001885971149945515,
"loss": 1.2928,
"step": 497
},
{
"epoch": 0.32,
"grad_norm": 0.07003825157880783,
"learning_rate": 0.00018854928564088813,
"loss": 1.1247,
"step": 498
},
{
"epoch": 0.32,
"grad_norm": 0.08657265454530716,
"learning_rate": 0.00018850136227811928,
"loss": 1.1983,
"step": 499
},
{
"epoch": 0.32,
"grad_norm": 0.09968368709087372,
"learning_rate": 0.00018845334495712327,
"loss": 1.3784,
"step": 500
},
{
"epoch": 0.33,
"grad_norm": 0.07520922273397446,
"learning_rate": 0.000188405233728878,
"loss": 1.243,
"step": 501
},
{
"epoch": 0.33,
"grad_norm": 0.096616230905056,
"learning_rate": 0.00018835702864446123,
"loss": 1.0152,
"step": 502
},
{
"epoch": 0.33,
"grad_norm": 0.09454113990068436,
"learning_rate": 0.00018830872975505032,
"loss": 1.1541,
"step": 503
},
{
"epoch": 0.33,
"grad_norm": 0.10077288001775742,
"learning_rate": 0.00018826033711192213,
"loss": 1.5214,
"step": 504
},
{
"epoch": 0.33,
"grad_norm": 0.07623240351676941,
"learning_rate": 0.00018821185076645317,
"loss": 1.3555,
"step": 505
},
{
"epoch": 0.33,
"grad_norm": 0.07468952983617783,
"learning_rate": 0.0001881632707701194,
"loss": 1.1606,
"step": 506
},
{
"epoch": 0.33,
"grad_norm": 0.0826793685555458,
"learning_rate": 0.0001881145971744961,
"loss": 1.5361,
"step": 507
},
{
"epoch": 0.33,
"grad_norm": 0.08035515993833542,
"learning_rate": 0.00018806583003125812,
"loss": 0.9415,
"step": 508
},
{
"epoch": 0.33,
"grad_norm": 0.07866019010543823,
"learning_rate": 0.00018801696939217945,
"loss": 1.1575,
"step": 509
},
{
"epoch": 0.33,
"grad_norm": 0.07043889909982681,
"learning_rate": 0.00018796801530913344,
"loss": 1.138,
"step": 510
},
{
"epoch": 0.33,
"grad_norm": 0.07135047763586044,
"learning_rate": 0.00018791896783409254,
"loss": 0.9334,
"step": 511
},
{
"epoch": 0.33,
"grad_norm": 0.13318529725074768,
"learning_rate": 0.00018786982701912849,
"loss": 1.0068,
"step": 512
},
{
"epoch": 0.33,
"grad_norm": 0.0810013934969902,
"learning_rate": 0.00018782059291641204,
"loss": 0.9814,
"step": 513
},
{
"epoch": 0.33,
"grad_norm": 0.08140300214290619,
"learning_rate": 0.000187771265578213,
"loss": 1.2657,
"step": 514
},
{
"epoch": 0.33,
"grad_norm": 0.067005954682827,
"learning_rate": 0.00018772184505690015,
"loss": 1.1169,
"step": 515
},
{
"epoch": 0.34,
"grad_norm": 0.08094844967126846,
"learning_rate": 0.0001876723314049412,
"loss": 0.9574,
"step": 516
},
{
"epoch": 0.34,
"grad_norm": 0.10021191090345383,
"learning_rate": 0.00018762272467490277,
"loss": 1.327,
"step": 517
},
{
"epoch": 0.34,
"grad_norm": 0.086814284324646,
"learning_rate": 0.00018757302491945023,
"loss": 1.2665,
"step": 518
},
{
"epoch": 0.34,
"grad_norm": 0.08578301966190338,
"learning_rate": 0.00018752323219134776,
"loss": 1.2587,
"step": 519
},
{
"epoch": 0.34,
"grad_norm": 0.07434894889593124,
"learning_rate": 0.00018747334654345825,
"loss": 1.202,
"step": 520
},
{
"epoch": 0.34,
"grad_norm": 0.08205292373895645,
"learning_rate": 0.00018742336802874323,
"loss": 1.1721,
"step": 521
},
{
"epoch": 0.34,
"grad_norm": 0.07242529839277267,
"learning_rate": 0.00018737329670026278,
"loss": 1.0442,
"step": 522
},
{
"epoch": 0.34,
"grad_norm": 0.07199004292488098,
"learning_rate": 0.0001873231326111756,
"loss": 0.9265,
"step": 523
},
{
"epoch": 0.34,
"grad_norm": 0.07771551609039307,
"learning_rate": 0.0001872728758147388,
"loss": 1.0775,
"step": 524
},
{
"epoch": 0.34,
"grad_norm": 0.08522839844226837,
"learning_rate": 0.00018722252636430795,
"loss": 1.35,
"step": 525
},
{
"epoch": 0.34,
"grad_norm": 0.08055147528648376,
"learning_rate": 0.00018717208431333698,
"loss": 0.9515,
"step": 526
},
{
"epoch": 0.34,
"grad_norm": 0.07367686182260513,
"learning_rate": 0.00018712154971537806,
"loss": 1.0166,
"step": 527
},
{
"epoch": 0.34,
"grad_norm": 0.08174576610326767,
"learning_rate": 0.00018707092262408174,
"loss": 1.1321,
"step": 528
},
{
"epoch": 0.34,
"grad_norm": 0.08406844735145569,
"learning_rate": 0.00018702020309319673,
"loss": 1.0232,
"step": 529
},
{
"epoch": 0.34,
"grad_norm": 0.07865949720144272,
"learning_rate": 0.00018696939117656975,
"loss": 1.2811,
"step": 530
},
{
"epoch": 0.34,
"grad_norm": 0.08797873556613922,
"learning_rate": 0.0001869184869281458,
"loss": 1.3392,
"step": 531
},
{
"epoch": 0.35,
"grad_norm": 0.07272420823574066,
"learning_rate": 0.0001868674904019678,
"loss": 1.2045,
"step": 532
},
{
"epoch": 0.35,
"grad_norm": 0.0777972862124443,
"learning_rate": 0.0001868164016521766,
"loss": 1.158,
"step": 533
},
{
"epoch": 0.35,
"grad_norm": 0.08992641419172287,
"learning_rate": 0.00018676522073301106,
"loss": 1.416,
"step": 534
},
{
"epoch": 0.35,
"grad_norm": 0.08786187320947647,
"learning_rate": 0.0001867139476988078,
"loss": 1.3691,
"step": 535
},
{
"epoch": 0.35,
"grad_norm": 0.11116636544466019,
"learning_rate": 0.00018666258260400127,
"loss": 0.965,
"step": 536
},
{
"epoch": 0.35,
"grad_norm": 0.08287644386291504,
"learning_rate": 0.0001866111255031237,
"loss": 1.1935,
"step": 537
},
{
"epoch": 0.35,
"grad_norm": 0.07902242243289948,
"learning_rate": 0.00018655957645080494,
"loss": 0.9623,
"step": 538
},
{
"epoch": 0.35,
"grad_norm": 0.10562435537576675,
"learning_rate": 0.0001865079355017725,
"loss": 1.0228,
"step": 539
},
{
"epoch": 0.35,
"grad_norm": 0.07892489433288574,
"learning_rate": 0.00018645620271085135,
"loss": 1.0743,
"step": 540
},
{
"epoch": 0.35,
"grad_norm": 0.07185100018978119,
"learning_rate": 0.00018640437813296416,
"loss": 1.1183,
"step": 541
},
{
"epoch": 0.35,
"grad_norm": 0.08215206861495972,
"learning_rate": 0.0001863524618231309,
"loss": 1.1319,
"step": 542
},
{
"epoch": 0.35,
"grad_norm": 0.08066381514072418,
"learning_rate": 0.00018630045383646895,
"loss": 1.1628,
"step": 543
},
{
"epoch": 0.35,
"grad_norm": 0.07939887791872025,
"learning_rate": 0.00018624835422819305,
"loss": 1.2678,
"step": 544
},
{
"epoch": 0.35,
"grad_norm": 0.07111598551273346,
"learning_rate": 0.00018619616305361517,
"loss": 1.2372,
"step": 545
},
{
"epoch": 0.35,
"grad_norm": 0.07858019322156906,
"learning_rate": 0.0001861438803681445,
"loss": 0.9695,
"step": 546
},
{
"epoch": 0.36,
"grad_norm": 0.06863168627023697,
"learning_rate": 0.00018609150622728748,
"loss": 1.001,
"step": 547
},
{
"epoch": 0.36,
"grad_norm": 0.0846375897526741,
"learning_rate": 0.0001860390406866475,
"loss": 1.0013,
"step": 548
},
{
"epoch": 0.36,
"grad_norm": 0.0752832442522049,
"learning_rate": 0.00018598648380192505,
"loss": 1.2017,
"step": 549
},
{
"epoch": 0.36,
"grad_norm": 0.08154382556676865,
"learning_rate": 0.00018593383562891762,
"loss": 1.3927,
"step": 550
},
{
"epoch": 0.36,
"grad_norm": 0.09067925065755844,
"learning_rate": 0.0001858810962235196,
"loss": 1.4292,
"step": 551
},
{
"epoch": 0.36,
"grad_norm": 0.08958430588245392,
"learning_rate": 0.00018582826564172218,
"loss": 1.2879,
"step": 552
},
{
"epoch": 0.36,
"grad_norm": 0.08029309660196304,
"learning_rate": 0.00018577534393961345,
"loss": 1.0307,
"step": 553
},
{
"epoch": 0.36,
"grad_norm": 0.07621898502111435,
"learning_rate": 0.00018572233117337814,
"loss": 0.9909,
"step": 554
},
{
"epoch": 0.36,
"grad_norm": 0.0793851986527443,
"learning_rate": 0.00018566922739929776,
"loss": 1.0708,
"step": 555
},
{
"epoch": 0.36,
"grad_norm": 0.08344046771526337,
"learning_rate": 0.00018561603267375034,
"loss": 1.2768,
"step": 556
},
{
"epoch": 0.36,
"grad_norm": 0.07884612679481506,
"learning_rate": 0.00018556274705321054,
"loss": 1.1565,
"step": 557
},
{
"epoch": 0.36,
"grad_norm": 0.07826481014490128,
"learning_rate": 0.00018550937059424948,
"loss": 1.0805,
"step": 558
},
{
"epoch": 0.36,
"grad_norm": 0.07335788756608963,
"learning_rate": 0.00018545590335353475,
"loss": 1.0502,
"step": 559
},
{
"epoch": 0.36,
"grad_norm": 0.07203352451324463,
"learning_rate": 0.00018540234538783027,
"loss": 1.1837,
"step": 560
},
{
"epoch": 0.36,
"grad_norm": 0.08611191809177399,
"learning_rate": 0.00018534869675399638,
"loss": 1.101,
"step": 561
},
{
"epoch": 0.37,
"grad_norm": 0.06984131038188934,
"learning_rate": 0.00018529495750898954,
"loss": 1.2826,
"step": 562
},
{
"epoch": 0.37,
"grad_norm": 0.07952100038528442,
"learning_rate": 0.0001852411277098625,
"loss": 1.0585,
"step": 563
},
{
"epoch": 0.37,
"grad_norm": 0.07087212055921555,
"learning_rate": 0.00018518720741376413,
"loss": 1.2027,
"step": 564
},
{
"epoch": 0.37,
"grad_norm": 0.07011663913726807,
"learning_rate": 0.00018513319667793937,
"loss": 1.1453,
"step": 565
},
{
"epoch": 0.37,
"grad_norm": 0.07221578806638718,
"learning_rate": 0.0001850790955597292,
"loss": 1.2527,
"step": 566
},
{
"epoch": 0.37,
"grad_norm": 0.09538937360048294,
"learning_rate": 0.0001850249041165705,
"loss": 1.1497,
"step": 567
},
{
"epoch": 0.37,
"grad_norm": 0.08617708832025528,
"learning_rate": 0.0001849706224059961,
"loss": 0.972,
"step": 568
},
{
"epoch": 0.37,
"grad_norm": 0.08158829808235168,
"learning_rate": 0.00018491625048563462,
"loss": 1.3988,
"step": 569
},
{
"epoch": 0.37,
"grad_norm": 0.08035334199666977,
"learning_rate": 0.00018486178841321054,
"loss": 1.1301,
"step": 570
},
{
"epoch": 0.37,
"grad_norm": 0.07947031408548355,
"learning_rate": 0.00018480723624654391,
"loss": 0.991,
"step": 571
},
{
"epoch": 0.37,
"grad_norm": 0.07135743647813797,
"learning_rate": 0.0001847525940435505,
"loss": 1.4309,
"step": 572
},
{
"epoch": 0.37,
"grad_norm": 0.08830907940864563,
"learning_rate": 0.00018469786186224173,
"loss": 1.4685,
"step": 573
},
{
"epoch": 0.37,
"grad_norm": 0.09551135450601578,
"learning_rate": 0.00018464303976072443,
"loss": 1.4763,
"step": 574
},
{
"epoch": 0.37,
"grad_norm": 0.08997842669487,
"learning_rate": 0.00018458812779720103,
"loss": 1.0674,
"step": 575
},
{
"epoch": 0.37,
"grad_norm": 0.08004486560821533,
"learning_rate": 0.0001845331260299692,
"loss": 1.2204,
"step": 576
},
{
"epoch": 0.37,
"grad_norm": 0.08610889315605164,
"learning_rate": 0.00018447803451742206,
"loss": 0.9799,
"step": 577
},
{
"epoch": 0.38,
"grad_norm": 0.07158122211694717,
"learning_rate": 0.000184422853318048,
"loss": 0.9666,
"step": 578
},
{
"epoch": 0.38,
"grad_norm": 0.08301133662462234,
"learning_rate": 0.00018436758249043062,
"loss": 1.2545,
"step": 579
},
{
"epoch": 0.38,
"grad_norm": 0.07345041632652283,
"learning_rate": 0.00018431222209324867,
"loss": 1.0794,
"step": 580
},
{
"epoch": 0.38,
"grad_norm": 0.08044226467609406,
"learning_rate": 0.00018425677218527592,
"loss": 1.1043,
"step": 581
},
{
"epoch": 0.38,
"grad_norm": 0.08195187151432037,
"learning_rate": 0.00018420123282538136,
"loss": 1.118,
"step": 582
},
{
"epoch": 0.38,
"grad_norm": 0.0803162008523941,
"learning_rate": 0.0001841456040725287,
"loss": 0.9089,
"step": 583
},
{
"epoch": 0.38,
"grad_norm": 0.07588793337345123,
"learning_rate": 0.00018408988598577676,
"loss": 1.3073,
"step": 584
},
{
"epoch": 0.38,
"grad_norm": 0.07819163799285889,
"learning_rate": 0.0001840340786242791,
"loss": 1.3629,
"step": 585
},
{
"epoch": 0.38,
"grad_norm": 0.07606200128793716,
"learning_rate": 0.00018397818204728407,
"loss": 1.3841,
"step": 586
},
{
"epoch": 0.38,
"grad_norm": 0.07669863849878311,
"learning_rate": 0.00018392219631413478,
"loss": 0.9294,
"step": 587
},
{
"epoch": 0.38,
"grad_norm": 0.08028853684663773,
"learning_rate": 0.00018386612148426894,
"loss": 0.9692,
"step": 588
},
{
"epoch": 0.38,
"grad_norm": 0.09896233677864075,
"learning_rate": 0.00018380995761721887,
"loss": 1.1665,
"step": 589
},
{
"epoch": 0.38,
"grad_norm": 0.08026339113712311,
"learning_rate": 0.0001837537047726114,
"loss": 0.9907,
"step": 590
},
{
"epoch": 0.38,
"grad_norm": 0.07541877776384354,
"learning_rate": 0.00018369736301016788,
"loss": 1.0176,
"step": 591
},
{
"epoch": 0.38,
"grad_norm": 0.08331876993179321,
"learning_rate": 0.00018364093238970404,
"loss": 1.1241,
"step": 592
},
{
"epoch": 0.39,
"grad_norm": 0.12105081230401993,
"learning_rate": 0.00018358441297112988,
"loss": 1.1253,
"step": 593
},
{
"epoch": 0.39,
"grad_norm": 0.08892891556024551,
"learning_rate": 0.00018352780481444974,
"loss": 1.396,
"step": 594
},
{
"epoch": 0.39,
"grad_norm": 0.07173669338226318,
"learning_rate": 0.00018347110797976214,
"loss": 0.9646,
"step": 595
},
{
"epoch": 0.39,
"grad_norm": 0.07874932140111923,
"learning_rate": 0.0001834143225272598,
"loss": 1.0602,
"step": 596
},
{
"epoch": 0.39,
"grad_norm": 0.08974017947912216,
"learning_rate": 0.00018335744851722945,
"loss": 1.0953,
"step": 597
},
{
"epoch": 0.39,
"grad_norm": 0.08132424205541611,
"learning_rate": 0.00018330048601005188,
"loss": 1.1689,
"step": 598
},
{
"epoch": 0.39,
"grad_norm": 0.08893271535634995,
"learning_rate": 0.0001832434350662018,
"loss": 1.3207,
"step": 599
},
{
"epoch": 0.39,
"grad_norm": 0.0773831233382225,
"learning_rate": 0.00018318629574624786,
"loss": 1.1802,
"step": 600
},
{
"epoch": 0.39,
"grad_norm": 0.08535367995500565,
"learning_rate": 0.0001831290681108525,
"loss": 1.2335,
"step": 601
},
{
"epoch": 0.39,
"grad_norm": 0.13072720170021057,
"learning_rate": 0.0001830717522207719,
"loss": 1.1243,
"step": 602
},
{
"epoch": 0.39,
"grad_norm": 0.08342819660902023,
"learning_rate": 0.00018301434813685602,
"loss": 1.2226,
"step": 603
},
{
"epoch": 0.39,
"grad_norm": 0.08082102984189987,
"learning_rate": 0.00018295685592004834,
"loss": 1.4093,
"step": 604
},
{
"epoch": 0.39,
"grad_norm": 0.07274606078863144,
"learning_rate": 0.000182899275631386,
"loss": 0.898,
"step": 605
},
{
"epoch": 0.39,
"grad_norm": 0.08023384213447571,
"learning_rate": 0.0001828416073319996,
"loss": 1.3186,
"step": 606
},
{
"epoch": 0.39,
"grad_norm": 0.07277272641658783,
"learning_rate": 0.00018278385108311317,
"loss": 0.9697,
"step": 607
},
{
"epoch": 0.39,
"grad_norm": 0.07433497905731201,
"learning_rate": 0.0001827260069460441,
"loss": 0.8641,
"step": 608
},
{
"epoch": 0.4,
"grad_norm": 0.07678360491991043,
"learning_rate": 0.00018266807498220318,
"loss": 1.1707,
"step": 609
},
{
"epoch": 0.4,
"grad_norm": 0.07920346409082413,
"learning_rate": 0.00018261005525309432,
"loss": 1.0446,
"step": 610
},
{
"epoch": 0.4,
"grad_norm": 0.0783979520201683,
"learning_rate": 0.00018255194782031467,
"loss": 1.2376,
"step": 611
},
{
"epoch": 0.4,
"grad_norm": 0.07843583822250366,
"learning_rate": 0.00018249375274555452,
"loss": 1.033,
"step": 612
},
{
"epoch": 0.4,
"grad_norm": 0.08373366296291351,
"learning_rate": 0.00018243547009059712,
"loss": 1.1094,
"step": 613
},
{
"epoch": 0.4,
"grad_norm": 0.08458531647920609,
"learning_rate": 0.00018237709991731876,
"loss": 1.0061,
"step": 614
},
{
"epoch": 0.4,
"grad_norm": 0.08628588169813156,
"learning_rate": 0.00018231864228768864,
"loss": 1.249,
"step": 615
},
{
"epoch": 0.4,
"grad_norm": 0.08422129601240158,
"learning_rate": 0.00018226009726376882,
"loss": 1.0312,
"step": 616
},
{
"epoch": 0.4,
"grad_norm": 0.08171427249908447,
"learning_rate": 0.00018220146490771408,
"loss": 1.2,
"step": 617
},
{
"epoch": 0.4,
"grad_norm": 0.08195119351148605,
"learning_rate": 0.000182142745281772,
"loss": 1.3238,
"step": 618
},
{
"epoch": 0.4,
"grad_norm": 0.07729899883270264,
"learning_rate": 0.00018208393844828277,
"loss": 1.0531,
"step": 619
},
{
"epoch": 0.4,
"grad_norm": 0.07302694022655487,
"learning_rate": 0.00018202504446967915,
"loss": 1.1746,
"step": 620
},
{
"epoch": 0.4,
"grad_norm": 0.07666127383708954,
"learning_rate": 0.00018196606340848643,
"loss": 1.3917,
"step": 621
},
{
"epoch": 0.4,
"grad_norm": 0.08428138494491577,
"learning_rate": 0.0001819069953273224,
"loss": 1.1402,
"step": 622
},
{
"epoch": 0.4,
"grad_norm": 0.08039674162864685,
"learning_rate": 0.00018184784028889712,
"loss": 1.2403,
"step": 623
},
{
"epoch": 0.41,
"grad_norm": 0.08145111799240112,
"learning_rate": 0.00018178859835601312,
"loss": 1.0017,
"step": 624
},
{
"epoch": 0.41,
"grad_norm": 0.08442309498786926,
"learning_rate": 0.00018172926959156505,
"loss": 0.969,
"step": 625
},
{
"epoch": 0.41,
"grad_norm": 0.07632813602685928,
"learning_rate": 0.0001816698540585398,
"loss": 1.0742,
"step": 626
},
{
"epoch": 0.41,
"grad_norm": 0.09010200947523117,
"learning_rate": 0.00018161035182001642,
"loss": 1.1998,
"step": 627
},
{
"epoch": 0.41,
"grad_norm": 0.09552950412034988,
"learning_rate": 0.00018155076293916594,
"loss": 1.2505,
"step": 628
},
{
"epoch": 0.41,
"grad_norm": 0.08230545371770859,
"learning_rate": 0.00018149108747925142,
"loss": 0.707,
"step": 629
},
{
"epoch": 0.41,
"grad_norm": 0.08570738136768341,
"learning_rate": 0.00018143132550362781,
"loss": 1.0084,
"step": 630
},
{
"epoch": 0.41,
"grad_norm": 0.08343492448329926,
"learning_rate": 0.00018137147707574194,
"loss": 1.1006,
"step": 631
},
{
"epoch": 0.41,
"grad_norm": 0.08732247352600098,
"learning_rate": 0.00018131154225913237,
"loss": 0.8256,
"step": 632
},
{
"epoch": 0.41,
"grad_norm": 0.06638462096452713,
"learning_rate": 0.00018125152111742946,
"loss": 1.0986,
"step": 633
},
{
"epoch": 0.41,
"grad_norm": 0.08032132685184479,
"learning_rate": 0.00018119141371435507,
"loss": 1.1737,
"step": 634
},
{
"epoch": 0.41,
"grad_norm": 0.08700072020292282,
"learning_rate": 0.00018113122011372286,
"loss": 1.0871,
"step": 635
},
{
"epoch": 0.41,
"grad_norm": 0.0786958634853363,
"learning_rate": 0.00018107094037943778,
"loss": 1.4947,
"step": 636
},
{
"epoch": 0.41,
"grad_norm": 0.08628631383180618,
"learning_rate": 0.00018101057457549642,
"loss": 1.1618,
"step": 637
},
{
"epoch": 0.41,
"grad_norm": 0.08307655155658722,
"learning_rate": 0.00018095012276598657,
"loss": 1.3662,
"step": 638
},
{
"epoch": 0.42,
"grad_norm": 0.08931092917919159,
"learning_rate": 0.00018088958501508745,
"loss": 1.1374,
"step": 639
},
{
"epoch": 0.42,
"grad_norm": 0.07548385113477707,
"learning_rate": 0.00018082896138706947,
"loss": 1.0065,
"step": 640
},
{
"epoch": 0.42,
"grad_norm": 0.09044208377599716,
"learning_rate": 0.00018076825194629422,
"loss": 1.2454,
"step": 641
},
{
"epoch": 0.42,
"grad_norm": 0.07289768010377884,
"learning_rate": 0.0001807074567572144,
"loss": 1.118,
"step": 642
},
{
"epoch": 0.42,
"grad_norm": 0.07798143476247787,
"learning_rate": 0.00018064657588437374,
"loss": 1.1542,
"step": 643
},
{
"epoch": 0.42,
"grad_norm": 0.07771704345941544,
"learning_rate": 0.00018058560939240696,
"loss": 1.1157,
"step": 644
},
{
"epoch": 0.42,
"grad_norm": 0.07239065319299698,
"learning_rate": 0.00018052455734603962,
"loss": 1.07,
"step": 645
},
{
"epoch": 0.42,
"grad_norm": 0.09537984430789948,
"learning_rate": 0.00018046341981008815,
"loss": 1.2944,
"step": 646
},
{
"epoch": 0.42,
"grad_norm": 0.07166703790426254,
"learning_rate": 0.0001804021968494598,
"loss": 1.287,
"step": 647
},
{
"epoch": 0.42,
"grad_norm": 0.07786957174539566,
"learning_rate": 0.00018034088852915235,
"loss": 0.9241,
"step": 648
},
{
"epoch": 0.42,
"grad_norm": 0.07787182927131653,
"learning_rate": 0.00018027949491425437,
"loss": 1.1288,
"step": 649
},
{
"epoch": 0.42,
"grad_norm": 0.07539577037096024,
"learning_rate": 0.0001802180160699449,
"loss": 1.329,
"step": 650
},
{
"epoch": 0.42,
"grad_norm": 0.07263702899217606,
"learning_rate": 0.00018015645206149346,
"loss": 1.1269,
"step": 651
},
{
"epoch": 0.42,
"grad_norm": 0.09897763282060623,
"learning_rate": 0.00018009480295426008,
"loss": 1.0364,
"step": 652
},
{
"epoch": 0.42,
"grad_norm": 0.09211482852697372,
"learning_rate": 0.00018003306881369494,
"loss": 1.2633,
"step": 653
},
{
"epoch": 0.42,
"grad_norm": 0.0799705907702446,
"learning_rate": 0.00017997124970533872,
"loss": 1.2391,
"step": 654
},
{
"epoch": 0.43,
"grad_norm": 0.07563236355781555,
"learning_rate": 0.00017990934569482218,
"loss": 0.9491,
"step": 655
},
{
"epoch": 0.43,
"grad_norm": 0.07711002230644226,
"learning_rate": 0.00017984735684786619,
"loss": 1.1638,
"step": 656
},
{
"epoch": 0.43,
"grad_norm": 0.09000904858112335,
"learning_rate": 0.00017978528323028177,
"loss": 1.0891,
"step": 657
},
{
"epoch": 0.43,
"grad_norm": 0.09779634326696396,
"learning_rate": 0.00017972312490796994,
"loss": 0.9214,
"step": 658
},
{
"epoch": 0.43,
"grad_norm": 0.0815175324678421,
"learning_rate": 0.00017966088194692158,
"loss": 1.2561,
"step": 659
},
{
"epoch": 0.43,
"grad_norm": 0.0770617425441742,
"learning_rate": 0.00017959855441321748,
"loss": 1.1198,
"step": 660
},
{
"epoch": 0.43,
"grad_norm": 0.07821808010339737,
"learning_rate": 0.00017953614237302819,
"loss": 1.0745,
"step": 661
},
{
"epoch": 0.43,
"grad_norm": 0.07551560550928116,
"learning_rate": 0.00017947364589261395,
"loss": 1.3212,
"step": 662
},
{
"epoch": 0.43,
"grad_norm": 0.09358943998813629,
"learning_rate": 0.00017941106503832472,
"loss": 1.3439,
"step": 663
},
{
"epoch": 0.43,
"grad_norm": 0.08056586235761642,
"learning_rate": 0.0001793483998766,
"loss": 1.2093,
"step": 664
},
{
"epoch": 0.43,
"grad_norm": 0.08634116500616074,
"learning_rate": 0.00017928565047396883,
"loss": 0.9249,
"step": 665
},
{
"epoch": 0.43,
"grad_norm": 0.07717472314834595,
"learning_rate": 0.00017922281689704956,
"loss": 1.0396,
"step": 666
},
{
"epoch": 0.43,
"grad_norm": 0.09257300198078156,
"learning_rate": 0.00017915989921255007,
"loss": 1.3473,
"step": 667
},
{
"epoch": 0.43,
"grad_norm": 0.07906777411699295,
"learning_rate": 0.0001790968974872674,
"loss": 1.1019,
"step": 668
},
{
"epoch": 0.43,
"grad_norm": 0.07715015113353729,
"learning_rate": 0.00017903381178808791,
"loss": 1.1571,
"step": 669
},
{
"epoch": 0.44,
"grad_norm": 0.08609345555305481,
"learning_rate": 0.00017897064218198706,
"loss": 1.3525,
"step": 670
},
{
"epoch": 0.44,
"grad_norm": 0.08347558975219727,
"learning_rate": 0.00017890738873602938,
"loss": 1.3383,
"step": 671
},
{
"epoch": 0.44,
"grad_norm": 0.08017811924219131,
"learning_rate": 0.00017884405151736847,
"loss": 0.9833,
"step": 672
},
{
"epoch": 0.44,
"grad_norm": 0.08072958886623383,
"learning_rate": 0.0001787806305932468,
"loss": 1.0776,
"step": 673
},
{
"epoch": 0.44,
"grad_norm": 0.07432875782251358,
"learning_rate": 0.00017871712603099578,
"loss": 1.1423,
"step": 674
},
{
"epoch": 0.44,
"grad_norm": 0.08560364693403244,
"learning_rate": 0.00017865353789803552,
"loss": 1.3298,
"step": 675
},
{
"epoch": 0.44,
"grad_norm": 0.08586810529232025,
"learning_rate": 0.00017858986626187492,
"loss": 1.2904,
"step": 676
},
{
"epoch": 0.44,
"grad_norm": 0.1029001995921135,
"learning_rate": 0.0001785261111901115,
"loss": 1.0585,
"step": 677
},
{
"epoch": 0.44,
"grad_norm": 0.10310132056474686,
"learning_rate": 0.00017846227275043143,
"loss": 1.1977,
"step": 678
},
{
"epoch": 0.44,
"grad_norm": 0.08359325677156448,
"learning_rate": 0.00017839835101060927,
"loss": 1.061,
"step": 679
},
{
"epoch": 0.44,
"grad_norm": 0.09460770338773727,
"learning_rate": 0.00017833434603850814,
"loss": 1.1596,
"step": 680
},
{
"epoch": 0.44,
"grad_norm": 0.08996371924877167,
"learning_rate": 0.0001782702579020794,
"loss": 1.2744,
"step": 681
},
{
"epoch": 0.44,
"grad_norm": 0.08824539184570312,
"learning_rate": 0.00017820608666936286,
"loss": 1.1547,
"step": 682
},
{
"epoch": 0.44,
"grad_norm": 0.0827854722738266,
"learning_rate": 0.00017814183240848634,
"loss": 1.2645,
"step": 683
},
{
"epoch": 0.44,
"grad_norm": 0.09070557355880737,
"learning_rate": 0.00017807749518766603,
"loss": 1.1277,
"step": 684
},
{
"epoch": 0.44,
"grad_norm": 0.0831344723701477,
"learning_rate": 0.000178013075075206,
"loss": 1.1437,
"step": 685
},
{
"epoch": 0.45,
"grad_norm": 0.08220485597848892,
"learning_rate": 0.00017794857213949852,
"loss": 1.1862,
"step": 686
},
{
"epoch": 0.45,
"grad_norm": 0.0770675465464592,
"learning_rate": 0.00017788398644902358,
"loss": 1.2713,
"step": 687
},
{
"epoch": 0.45,
"grad_norm": 0.07351796329021454,
"learning_rate": 0.00017781931807234918,
"loss": 1.1178,
"step": 688
},
{
"epoch": 0.45,
"grad_norm": 0.08193398267030716,
"learning_rate": 0.00017775456707813105,
"loss": 1.3023,
"step": 689
},
{
"epoch": 0.45,
"grad_norm": 0.08915859460830688,
"learning_rate": 0.00017768973353511261,
"loss": 1.205,
"step": 690
},
{
"epoch": 0.45,
"grad_norm": 0.08455678075551987,
"learning_rate": 0.000177624817512125,
"loss": 1.0257,
"step": 691
},
{
"epoch": 0.45,
"grad_norm": 0.07098662853240967,
"learning_rate": 0.0001775598190780868,
"loss": 1.0469,
"step": 692
},
{
"epoch": 0.45,
"grad_norm": 0.07370258122682571,
"learning_rate": 0.00017749473830200424,
"loss": 1.0658,
"step": 693
},
{
"epoch": 0.45,
"grad_norm": 0.08448195457458496,
"learning_rate": 0.0001774295752529708,
"loss": 1.081,
"step": 694
},
{
"epoch": 0.45,
"grad_norm": 0.07694752514362335,
"learning_rate": 0.00017736433000016742,
"loss": 1.1418,
"step": 695
},
{
"epoch": 0.45,
"grad_norm": 0.07735437899827957,
"learning_rate": 0.0001772990026128623,
"loss": 1.0491,
"step": 696
},
{
"epoch": 0.45,
"grad_norm": 0.07796520739793777,
"learning_rate": 0.00017723359316041077,
"loss": 0.9885,
"step": 697
},
{
"epoch": 0.45,
"grad_norm": 0.07998763769865036,
"learning_rate": 0.00017716810171225538,
"loss": 1.1065,
"step": 698
},
{
"epoch": 0.45,
"grad_norm": 0.1064494326710701,
"learning_rate": 0.00017710252833792565,
"loss": 1.2058,
"step": 699
},
{
"epoch": 0.45,
"grad_norm": 0.08175428956747055,
"learning_rate": 0.0001770368731070381,
"loss": 1.3656,
"step": 700
},
{
"epoch": 0.46,
"grad_norm": 0.07967953383922577,
"learning_rate": 0.00017697113608929627,
"loss": 1.1731,
"step": 701
},
{
"epoch": 0.46,
"grad_norm": 0.07325685769319534,
"learning_rate": 0.00017690531735449033,
"loss": 1.1922,
"step": 702
},
{
"epoch": 0.46,
"grad_norm": 0.0778280571103096,
"learning_rate": 0.0001768394169724973,
"loss": 0.8566,
"step": 703
},
{
"epoch": 0.46,
"grad_norm": 0.09514844417572021,
"learning_rate": 0.00017677343501328095,
"loss": 1.3389,
"step": 704
},
{
"epoch": 0.46,
"grad_norm": 0.08737710863351822,
"learning_rate": 0.0001767073715468915,
"loss": 1.155,
"step": 705
},
{
"epoch": 0.46,
"grad_norm": 0.07468974590301514,
"learning_rate": 0.00017664122664346588,
"loss": 1.0209,
"step": 706
},
{
"epoch": 0.46,
"grad_norm": 0.0724843367934227,
"learning_rate": 0.00017657500037322735,
"loss": 0.9348,
"step": 707
},
{
"epoch": 0.46,
"grad_norm": 0.08314874768257141,
"learning_rate": 0.00017650869280648557,
"loss": 1.2589,
"step": 708
},
{
"epoch": 0.46,
"grad_norm": 0.08452475070953369,
"learning_rate": 0.00017644230401363657,
"loss": 1.1349,
"step": 709
},
{
"epoch": 0.46,
"grad_norm": 0.0918334349989891,
"learning_rate": 0.00017637583406516258,
"loss": 1.1181,
"step": 710
},
{
"epoch": 0.46,
"grad_norm": 0.07945480197668076,
"learning_rate": 0.00017630928303163202,
"loss": 1.1369,
"step": 711
},
{
"epoch": 0.46,
"grad_norm": 0.10152934491634369,
"learning_rate": 0.00017624265098369928,
"loss": 1.4079,
"step": 712
},
{
"epoch": 0.46,
"grad_norm": 0.07693798094987869,
"learning_rate": 0.0001761759379921049,
"loss": 1.1841,
"step": 713
},
{
"epoch": 0.46,
"grad_norm": 0.07069939374923706,
"learning_rate": 0.0001761091441276753,
"loss": 0.9861,
"step": 714
},
{
"epoch": 0.46,
"grad_norm": 0.08304879069328308,
"learning_rate": 0.00017604226946132273,
"loss": 1.1118,
"step": 715
},
{
"epoch": 0.47,
"grad_norm": 0.0826292335987091,
"learning_rate": 0.00017597531406404526,
"loss": 1.2392,
"step": 716
},
{
"epoch": 0.47,
"grad_norm": 0.08061166107654572,
"learning_rate": 0.00017590827800692665,
"loss": 1.0873,
"step": 717
},
{
"epoch": 0.47,
"grad_norm": 0.07403320074081421,
"learning_rate": 0.00017584116136113633,
"loss": 1.0396,
"step": 718
},
{
"epoch": 0.47,
"grad_norm": 0.100631482899189,
"learning_rate": 0.00017577396419792923,
"loss": 1.3316,
"step": 719
},
{
"epoch": 0.47,
"grad_norm": 0.0870446041226387,
"learning_rate": 0.00017570668658864583,
"loss": 1.2423,
"step": 720
},
{
"epoch": 0.47,
"grad_norm": 0.08019344508647919,
"learning_rate": 0.00017563932860471194,
"loss": 0.9506,
"step": 721
},
{
"epoch": 0.47,
"grad_norm": 0.08049733936786652,
"learning_rate": 0.00017557189031763878,
"loss": 1.2599,
"step": 722
},
{
"epoch": 0.47,
"grad_norm": 0.09278158098459244,
"learning_rate": 0.00017550437179902271,
"loss": 1.2561,
"step": 723
},
{
"epoch": 0.47,
"grad_norm": 0.07649429887533188,
"learning_rate": 0.00017543677312054543,
"loss": 1.1874,
"step": 724
},
{
"epoch": 0.47,
"grad_norm": 0.08516720682382584,
"learning_rate": 0.00017536909435397362,
"loss": 1.3583,
"step": 725
},
{
"epoch": 0.47,
"grad_norm": 0.07457318156957626,
"learning_rate": 0.000175301335571159,
"loss": 1.1077,
"step": 726
},
{
"epoch": 0.47,
"grad_norm": 0.09406981617212296,
"learning_rate": 0.0001752334968440383,
"loss": 1.3628,
"step": 727
},
{
"epoch": 0.47,
"grad_norm": 0.06670159101486206,
"learning_rate": 0.000175165578244633,
"loss": 1.0544,
"step": 728
},
{
"epoch": 0.47,
"grad_norm": 0.07625333219766617,
"learning_rate": 0.0001750975798450496,
"loss": 1.1544,
"step": 729
},
{
"epoch": 0.47,
"grad_norm": 0.0862407311797142,
"learning_rate": 0.00017502950171747905,
"loss": 1.1927,
"step": 730
},
{
"epoch": 0.47,
"grad_norm": 0.07812774926424026,
"learning_rate": 0.00017496134393419713,
"loss": 1.0182,
"step": 731
},
{
"epoch": 0.48,
"grad_norm": 0.07729184627532959,
"learning_rate": 0.00017489310656756412,
"loss": 1.1951,
"step": 732
},
{
"epoch": 0.48,
"grad_norm": 0.08399280905723572,
"learning_rate": 0.00017482478969002484,
"loss": 1.3047,
"step": 733
},
{
"epoch": 0.48,
"grad_norm": 0.08977645635604858,
"learning_rate": 0.00017475639337410847,
"loss": 0.9931,
"step": 734
},
{
"epoch": 0.48,
"grad_norm": 0.08276163786649704,
"learning_rate": 0.00017468791769242853,
"loss": 1.1554,
"step": 735
},
{
"epoch": 0.48,
"grad_norm": 0.08267621695995331,
"learning_rate": 0.0001746193627176828,
"loss": 1.1335,
"step": 736
},
{
"epoch": 0.48,
"grad_norm": 0.08823233842849731,
"learning_rate": 0.0001745507285226533,
"loss": 1.1542,
"step": 737
},
{
"epoch": 0.48,
"grad_norm": 0.07996855676174164,
"learning_rate": 0.00017448201518020602,
"loss": 1.1116,
"step": 738
},
{
"epoch": 0.48,
"grad_norm": 0.0920189619064331,
"learning_rate": 0.00017441322276329118,
"loss": 1.2266,
"step": 739
},
{
"epoch": 0.48,
"grad_norm": 0.08092907816171646,
"learning_rate": 0.00017434435134494277,
"loss": 1.3664,
"step": 740
},
{
"epoch": 0.48,
"grad_norm": 0.08727369457483292,
"learning_rate": 0.00017427540099827874,
"loss": 1.1137,
"step": 741
},
{
"epoch": 0.48,
"grad_norm": 0.12193667143583298,
"learning_rate": 0.0001742063717965008,
"loss": 1.1603,
"step": 742
},
{
"epoch": 0.48,
"grad_norm": 0.08252157270908356,
"learning_rate": 0.00017413726381289443,
"loss": 1.0399,
"step": 743
},
{
"epoch": 0.48,
"grad_norm": 0.0809679627418518,
"learning_rate": 0.00017406807712082865,
"loss": 1.2177,
"step": 744
},
{
"epoch": 0.48,
"grad_norm": 0.08453313261270523,
"learning_rate": 0.00017399881179375613,
"loss": 1.1332,
"step": 745
},
{
"epoch": 0.48,
"grad_norm": 0.09165991842746735,
"learning_rate": 0.000173929467905213,
"loss": 1.5545,
"step": 746
},
{
"epoch": 0.49,
"grad_norm": 0.09684597700834274,
"learning_rate": 0.0001738600455288188,
"loss": 1.1418,
"step": 747
},
{
"epoch": 0.49,
"grad_norm": 0.09542853385210037,
"learning_rate": 0.00017379054473827638,
"loss": 1.0924,
"step": 748
},
{
"epoch": 0.49,
"grad_norm": 0.2980574667453766,
"learning_rate": 0.00017372096560737185,
"loss": 1.1752,
"step": 749
},
{
"epoch": 0.49,
"grad_norm": 0.08750354498624802,
"learning_rate": 0.00017365130820997448,
"loss": 1.4242,
"step": 750
},
{
"epoch": 0.49,
"grad_norm": 0.07665020227432251,
"learning_rate": 0.00017358157262003666,
"loss": 0.8734,
"step": 751
},
{
"epoch": 0.49,
"grad_norm": 0.08522538095712662,
"learning_rate": 0.00017351175891159378,
"loss": 1.2326,
"step": 752
},
{
"epoch": 0.49,
"grad_norm": 0.08328203856945038,
"learning_rate": 0.0001734418671587641,
"loss": 0.9914,
"step": 753
},
{
"epoch": 0.49,
"grad_norm": 0.08143004775047302,
"learning_rate": 0.00017337189743574886,
"loss": 1.1849,
"step": 754
},
{
"epoch": 0.49,
"grad_norm": 0.08154163509607315,
"learning_rate": 0.000173301849816832,
"loss": 1.095,
"step": 755
},
{
"epoch": 0.49,
"grad_norm": 0.09051434695720673,
"learning_rate": 0.00017323172437638012,
"loss": 1.4032,
"step": 756
},
{
"epoch": 0.49,
"grad_norm": 0.07921197265386581,
"learning_rate": 0.00017316152118884262,
"loss": 1.2568,
"step": 757
},
{
"epoch": 0.49,
"grad_norm": 0.1015300527215004,
"learning_rate": 0.00017309124032875118,
"loss": 1.2418,
"step": 758
},
{
"epoch": 0.49,
"grad_norm": 0.08583255857229233,
"learning_rate": 0.00017302088187072013,
"loss": 1.214,
"step": 759
},
{
"epoch": 0.49,
"grad_norm": 0.09258947521448135,
"learning_rate": 0.00017295044588944614,
"loss": 1.2606,
"step": 760
},
{
"epoch": 0.49,
"grad_norm": 0.10023178905248642,
"learning_rate": 0.00017287993245970813,
"loss": 1.204,
"step": 761
},
{
"epoch": 0.49,
"grad_norm": 0.07907801121473312,
"learning_rate": 0.00017280934165636732,
"loss": 0.987,
"step": 762
},
{
"epoch": 0.5,
"grad_norm": 0.08570694178342819,
"learning_rate": 0.00017273867355436706,
"loss": 1.1802,
"step": 763
},
{
"epoch": 0.5,
"grad_norm": 0.08926071971654892,
"learning_rate": 0.0001726679282287327,
"loss": 1.3329,
"step": 764
},
{
"epoch": 0.5,
"grad_norm": 0.10569997131824493,
"learning_rate": 0.00017259710575457163,
"loss": 1.1928,
"step": 765
},
{
"epoch": 0.5,
"grad_norm": 0.07887765020132065,
"learning_rate": 0.00017252620620707317,
"loss": 0.9654,
"step": 766
},
{
"epoch": 0.5,
"grad_norm": 0.08243531733751297,
"learning_rate": 0.00017245522966150833,
"loss": 1.122,
"step": 767
},
{
"epoch": 0.5,
"grad_norm": 0.08016372472047806,
"learning_rate": 0.0001723841761932301,
"loss": 1.1173,
"step": 768
},
{
"epoch": 0.5,
"grad_norm": 0.17971543967723846,
"learning_rate": 0.0001723130458776729,
"loss": 1.2049,
"step": 769
},
{
"epoch": 0.5,
"grad_norm": 0.08999678492546082,
"learning_rate": 0.00017224183879035288,
"loss": 0.9927,
"step": 770
},
{
"epoch": 0.5,
"grad_norm": 0.0874532014131546,
"learning_rate": 0.00017217055500686764,
"loss": 1.2056,
"step": 771
},
{
"epoch": 0.5,
"grad_norm": 0.09283041208982468,
"learning_rate": 0.00017209919460289626,
"loss": 1.4801,
"step": 772
},
{
"epoch": 0.5,
"grad_norm": 0.09434971213340759,
"learning_rate": 0.00017202775765419906,
"loss": 1.2483,
"step": 773
},
{
"epoch": 0.5,
"grad_norm": 0.07374588400125504,
"learning_rate": 0.00017195624423661774,
"loss": 1.069,
"step": 774
},
{
"epoch": 0.5,
"grad_norm": 0.08497139811515808,
"learning_rate": 0.00017188465442607507,
"loss": 1.1224,
"step": 775
},
{
"epoch": 0.5,
"grad_norm": 0.08765482902526855,
"learning_rate": 0.000171812988298575,
"loss": 1.1164,
"step": 776
},
{
"epoch": 0.5,
"grad_norm": 0.07689573615789413,
"learning_rate": 0.00017174124593020255,
"loss": 1.0702,
"step": 777
},
{
"epoch": 0.51,
"grad_norm": 0.10699640214443207,
"learning_rate": 0.00017166942739712355,
"loss": 1.1605,
"step": 778
},
{
"epoch": 0.51,
"grad_norm": 0.07910798490047455,
"learning_rate": 0.0001715975327755848,
"loss": 0.9998,
"step": 779
},
{
"epoch": 0.51,
"grad_norm": 0.08046310395002365,
"learning_rate": 0.0001715255621419138,
"loss": 1.1715,
"step": 780
},
{
"epoch": 0.51,
"grad_norm": 0.08498966693878174,
"learning_rate": 0.0001714535155725188,
"loss": 1.0702,
"step": 781
},
{
"epoch": 0.51,
"grad_norm": 0.0835207849740982,
"learning_rate": 0.0001713813931438887,
"loss": 1.0257,
"step": 782
},
{
"epoch": 0.51,
"grad_norm": 0.07388419657945633,
"learning_rate": 0.00017130919493259282,
"loss": 1.1681,
"step": 783
},
{
"epoch": 0.51,
"grad_norm": 0.08165927976369858,
"learning_rate": 0.00017123692101528106,
"loss": 1.3463,
"step": 784
},
{
"epoch": 0.51,
"grad_norm": 0.08167090266942978,
"learning_rate": 0.00017116457146868364,
"loss": 1.2735,
"step": 785
},
{
"epoch": 0.51,
"grad_norm": 0.09316886961460114,
"learning_rate": 0.00017109214636961103,
"loss": 1.073,
"step": 786
},
{
"epoch": 0.51,
"grad_norm": 0.07424274832010269,
"learning_rate": 0.000171019645794954,
"loss": 1.0646,
"step": 787
},
{
"epoch": 0.51,
"grad_norm": 0.08338771760463715,
"learning_rate": 0.00017094706982168342,
"loss": 1.1479,
"step": 788
},
{
"epoch": 0.51,
"grad_norm": 0.08428940176963806,
"learning_rate": 0.00017087441852685014,
"loss": 1.1824,
"step": 789
},
{
"epoch": 0.51,
"grad_norm": 0.08475558459758759,
"learning_rate": 0.000170801691987585,
"loss": 1.3129,
"step": 790
},
{
"epoch": 0.51,
"grad_norm": 0.08609999716281891,
"learning_rate": 0.00017072889028109885,
"loss": 1.3433,
"step": 791
},
{
"epoch": 0.51,
"grad_norm": 0.08859992027282715,
"learning_rate": 0.0001706560134846822,
"loss": 1.1913,
"step": 792
},
{
"epoch": 0.52,
"grad_norm": 0.07474919408559799,
"learning_rate": 0.0001705830616757053,
"loss": 1.0951,
"step": 793
},
{
"epoch": 0.52,
"grad_norm": 0.08233974874019623,
"learning_rate": 0.00017051003493161808,
"loss": 1.3027,
"step": 794
},
{
"epoch": 0.52,
"grad_norm": 0.08178085833787918,
"learning_rate": 0.00017043693332995002,
"loss": 1.1052,
"step": 795
},
{
"epoch": 0.52,
"grad_norm": 0.07304524630308151,
"learning_rate": 0.00017036375694831,
"loss": 1.36,
"step": 796
},
{
"epoch": 0.52,
"grad_norm": 0.08515062183141708,
"learning_rate": 0.00017029050586438643,
"loss": 0.916,
"step": 797
},
{
"epoch": 0.52,
"grad_norm": 0.07536105811595917,
"learning_rate": 0.00017021718015594695,
"loss": 1.0816,
"step": 798
},
{
"epoch": 0.52,
"grad_norm": 0.10976564884185791,
"learning_rate": 0.00017014377990083834,
"loss": 1.2816,
"step": 799
},
{
"epoch": 0.52,
"grad_norm": 0.0825371965765953,
"learning_rate": 0.00017007030517698666,
"loss": 1.3107,
"step": 800
},
{
"epoch": 0.52,
"grad_norm": 0.07634612917900085,
"learning_rate": 0.000169996756062397,
"loss": 0.9648,
"step": 801
},
{
"epoch": 0.52,
"grad_norm": 0.0780857652425766,
"learning_rate": 0.00016992313263515337,
"loss": 0.7845,
"step": 802
},
{
"epoch": 0.52,
"grad_norm": 0.09171561151742935,
"learning_rate": 0.0001698494349734187,
"loss": 1.4566,
"step": 803
},
{
"epoch": 0.52,
"grad_norm": 0.08053979277610779,
"learning_rate": 0.00016977566315543477,
"loss": 1.2048,
"step": 804
},
{
"epoch": 0.52,
"grad_norm": 0.08218812197446823,
"learning_rate": 0.00016970181725952207,
"loss": 1.1936,
"step": 805
},
{
"epoch": 0.52,
"grad_norm": 0.08368358761072159,
"learning_rate": 0.00016962789736407974,
"loss": 1.1064,
"step": 806
},
{
"epoch": 0.52,
"grad_norm": 0.10067876428365707,
"learning_rate": 0.0001695539035475854,
"loss": 1.0918,
"step": 807
},
{
"epoch": 0.52,
"grad_norm": 0.08832260966300964,
"learning_rate": 0.00016947983588859523,
"loss": 1.1454,
"step": 808
},
{
"epoch": 0.53,
"grad_norm": 0.07921259850263596,
"learning_rate": 0.00016940569446574389,
"loss": 1.3695,
"step": 809
},
{
"epoch": 0.53,
"grad_norm": 0.09819761663675308,
"learning_rate": 0.00016933147935774411,
"loss": 1.2974,
"step": 810
},
{
"epoch": 0.53,
"grad_norm": 0.22696730494499207,
"learning_rate": 0.0001692571906433871,
"loss": 1.1638,
"step": 811
},
{
"epoch": 0.53,
"grad_norm": 0.08201338350772858,
"learning_rate": 0.0001691828284015421,
"loss": 0.8576,
"step": 812
},
{
"epoch": 0.53,
"grad_norm": 0.10405506193637848,
"learning_rate": 0.00016910839271115638,
"loss": 1.0929,
"step": 813
},
{
"epoch": 0.53,
"grad_norm": 0.09434719383716583,
"learning_rate": 0.00016903388365125524,
"loss": 1.0948,
"step": 814
},
{
"epoch": 0.53,
"grad_norm": 0.08209701627492905,
"learning_rate": 0.00016895930130094184,
"loss": 1.2546,
"step": 815
},
{
"epoch": 0.53,
"grad_norm": 0.07896193861961365,
"learning_rate": 0.0001688846457393972,
"loss": 1.106,
"step": 816
},
{
"epoch": 0.53,
"grad_norm": 0.07685357332229614,
"learning_rate": 0.00016880991704588003,
"loss": 1.1522,
"step": 817
},
{
"epoch": 0.53,
"grad_norm": 0.08388163894414902,
"learning_rate": 0.00016873511529972665,
"loss": 1.2062,
"step": 818
},
{
"epoch": 0.53,
"grad_norm": 0.0945209413766861,
"learning_rate": 0.00016866024058035105,
"loss": 1.1801,
"step": 819
},
{
"epoch": 0.53,
"grad_norm": 0.0906093567609787,
"learning_rate": 0.00016858529296724455,
"loss": 1.176,
"step": 820
},
{
"epoch": 0.53,
"grad_norm": 0.07118382304906845,
"learning_rate": 0.00016851027253997596,
"loss": 1.2439,
"step": 821
},
{
"epoch": 0.53,
"grad_norm": 0.06980909407138824,
"learning_rate": 0.00016843517937819132,
"loss": 1.1027,
"step": 822
},
{
"epoch": 0.53,
"grad_norm": 0.09179321676492691,
"learning_rate": 0.00016836001356161396,
"loss": 1.1231,
"step": 823
},
{
"epoch": 0.54,
"grad_norm": 0.08007943630218506,
"learning_rate": 0.00016828477517004427,
"loss": 1.2563,
"step": 824
},
{
"epoch": 0.54,
"grad_norm": 0.10226795822381973,
"learning_rate": 0.00016820946428335978,
"loss": 1.1662,
"step": 825
},
{
"epoch": 0.54,
"grad_norm": 0.08230665326118469,
"learning_rate": 0.00016813408098151488,
"loss": 1.2735,
"step": 826
},
{
"epoch": 0.54,
"grad_norm": 0.07995975017547607,
"learning_rate": 0.000168058625344541,
"loss": 1.1265,
"step": 827
},
{
"epoch": 0.54,
"grad_norm": 0.08252605050802231,
"learning_rate": 0.00016798309745254614,
"loss": 1.0828,
"step": 828
},
{
"epoch": 0.54,
"grad_norm": 0.10495063662528992,
"learning_rate": 0.0001679074973857152,
"loss": 1.326,
"step": 829
},
{
"epoch": 0.54,
"grad_norm": 0.08632712066173553,
"learning_rate": 0.00016783182522430964,
"loss": 1.1424,
"step": 830
},
{
"epoch": 0.54,
"grad_norm": 0.09097401052713394,
"learning_rate": 0.0001677560810486674,
"loss": 1.1875,
"step": 831
},
{
"epoch": 0.54,
"grad_norm": 0.1030447781085968,
"learning_rate": 0.000167680264939203,
"loss": 1.2773,
"step": 832
},
{
"epoch": 0.54,
"grad_norm": 0.07330919802188873,
"learning_rate": 0.00016760437697640722,
"loss": 1.0242,
"step": 833
},
{
"epoch": 0.54,
"grad_norm": 0.12255438417196274,
"learning_rate": 0.00016752841724084714,
"loss": 1.2594,
"step": 834
},
{
"epoch": 0.54,
"grad_norm": 0.08226511627435684,
"learning_rate": 0.00016745238581316612,
"loss": 0.9883,
"step": 835
},
{
"epoch": 0.54,
"grad_norm": 0.08995082974433899,
"learning_rate": 0.00016737628277408356,
"loss": 1.1848,
"step": 836
},
{
"epoch": 0.54,
"grad_norm": 0.08618593215942383,
"learning_rate": 0.00016730010820439488,
"loss": 0.9744,
"step": 837
},
{
"epoch": 0.54,
"grad_norm": 0.07120101898908615,
"learning_rate": 0.00016722386218497146,
"loss": 0.9643,
"step": 838
},
{
"epoch": 0.54,
"grad_norm": 0.09212475270032883,
"learning_rate": 0.00016714754479676058,
"loss": 1.2269,
"step": 839
},
{
"epoch": 0.55,
"grad_norm": 0.08128505945205688,
"learning_rate": 0.0001670711561207852,
"loss": 1.0458,
"step": 840
},
{
"epoch": 0.55,
"grad_norm": 0.08219733834266663,
"learning_rate": 0.00016699469623814402,
"loss": 1.304,
"step": 841
},
{
"epoch": 0.55,
"grad_norm": 0.08931570500135422,
"learning_rate": 0.00016691816523001137,
"loss": 1.267,
"step": 842
},
{
"epoch": 0.55,
"grad_norm": 0.08008779585361481,
"learning_rate": 0.000166841563177637,
"loss": 1.3622,
"step": 843
},
{
"epoch": 0.55,
"grad_norm": 0.08592413365840912,
"learning_rate": 0.00016676489016234611,
"loss": 1.1349,
"step": 844
},
{
"epoch": 0.55,
"grad_norm": 0.0958981066942215,
"learning_rate": 0.00016668814626553935,
"loss": 1.1459,
"step": 845
},
{
"epoch": 0.55,
"grad_norm": 0.07809265702962875,
"learning_rate": 0.00016661133156869245,
"loss": 1.3124,
"step": 846
},
{
"epoch": 0.55,
"grad_norm": 0.1327837109565735,
"learning_rate": 0.00016653444615335645,
"loss": 1.1953,
"step": 847
},
{
"epoch": 0.55,
"grad_norm": 0.10847461968660355,
"learning_rate": 0.00016645749010115734,
"loss": 1.2545,
"step": 848
},
{
"epoch": 0.55,
"grad_norm": 0.09043294191360474,
"learning_rate": 0.00016638046349379626,
"loss": 1.2213,
"step": 849
},
{
"epoch": 0.55,
"grad_norm": 0.09827099740505219,
"learning_rate": 0.00016630336641304907,
"loss": 1.2937,
"step": 850
},
{
"epoch": 0.55,
"grad_norm": 0.07804851233959198,
"learning_rate": 0.00016622619894076661,
"loss": 1.2444,
"step": 851
},
{
"epoch": 0.55,
"grad_norm": 0.07834355533123016,
"learning_rate": 0.00016614896115887438,
"loss": 1.2846,
"step": 852
},
{
"epoch": 0.55,
"grad_norm": 0.08101753145456314,
"learning_rate": 0.0001660716531493725,
"loss": 1.1009,
"step": 853
},
{
"epoch": 0.55,
"grad_norm": 0.11360859125852585,
"learning_rate": 0.0001659942749943357,
"loss": 1.0914,
"step": 854
},
{
"epoch": 0.56,
"grad_norm": 0.07827256619930267,
"learning_rate": 0.00016591682677591314,
"loss": 1.0918,
"step": 855
},
{
"epoch": 0.56,
"grad_norm": 0.0827704444527626,
"learning_rate": 0.0001658393085763284,
"loss": 1.333,
"step": 856
},
{
"epoch": 0.56,
"grad_norm": 0.12776634097099304,
"learning_rate": 0.00016576172047787937,
"loss": 0.8844,
"step": 857
},
{
"epoch": 0.56,
"grad_norm": 0.07633556425571442,
"learning_rate": 0.00016568406256293802,
"loss": 1.3823,
"step": 858
},
{
"epoch": 0.56,
"grad_norm": 0.0876186341047287,
"learning_rate": 0.00016560633491395068,
"loss": 1.1195,
"step": 859
},
{
"epoch": 0.56,
"grad_norm": 0.09859266132116318,
"learning_rate": 0.00016552853761343746,
"loss": 1.306,
"step": 860
},
{
"epoch": 0.56,
"grad_norm": 0.08464020490646362,
"learning_rate": 0.00016545067074399253,
"loss": 1.1206,
"step": 861
},
{
"epoch": 0.56,
"grad_norm": 0.07913007587194443,
"learning_rate": 0.000165372734388284,
"loss": 1.025,
"step": 862
},
{
"epoch": 0.56,
"grad_norm": 0.08341752737760544,
"learning_rate": 0.0001652947286290536,
"loss": 1.3396,
"step": 863
},
{
"epoch": 0.56,
"grad_norm": 0.08605632185935974,
"learning_rate": 0.00016521665354911683,
"loss": 1.2395,
"step": 864
},
{
"epoch": 0.56,
"grad_norm": 0.08409509807825089,
"learning_rate": 0.00016513850923136273,
"loss": 1.0192,
"step": 865
},
{
"epoch": 0.56,
"grad_norm": 0.08476000279188156,
"learning_rate": 0.00016506029575875396,
"loss": 1.0996,
"step": 866
},
{
"epoch": 0.56,
"grad_norm": 0.08864401280879974,
"learning_rate": 0.00016498201321432646,
"loss": 0.9354,
"step": 867
},
{
"epoch": 0.56,
"grad_norm": 0.08183170109987259,
"learning_rate": 0.0001649036616811896,
"loss": 1.0602,
"step": 868
},
{
"epoch": 0.56,
"grad_norm": 0.07102543860673904,
"learning_rate": 0.0001648252412425259,
"loss": 1.1053,
"step": 869
},
{
"epoch": 0.57,
"grad_norm": 0.08999533206224442,
"learning_rate": 0.00016474675198159116,
"loss": 1.1031,
"step": 870
},
{
"epoch": 0.57,
"grad_norm": 0.0800955668091774,
"learning_rate": 0.0001646681939817141,
"loss": 0.9741,
"step": 871
},
{
"epoch": 0.57,
"grad_norm": 0.08678940683603287,
"learning_rate": 0.00016458956732629654,
"loss": 1.2987,
"step": 872
},
{
"epoch": 0.57,
"grad_norm": 0.07563462108373642,
"learning_rate": 0.00016451087209881315,
"loss": 1.1788,
"step": 873
},
{
"epoch": 0.57,
"grad_norm": 0.07887473702430725,
"learning_rate": 0.00016443210838281135,
"loss": 1.0512,
"step": 874
},
{
"epoch": 0.57,
"grad_norm": 0.07923612743616104,
"learning_rate": 0.00016435327626191135,
"loss": 1.1476,
"step": 875
},
{
"epoch": 0.57,
"grad_norm": 0.08602787554264069,
"learning_rate": 0.0001642743758198059,
"loss": 1.2671,
"step": 876
},
{
"epoch": 0.57,
"grad_norm": 0.08977708965539932,
"learning_rate": 0.00016419540714026037,
"loss": 1.406,
"step": 877
},
{
"epoch": 0.57,
"grad_norm": 0.11876388639211655,
"learning_rate": 0.00016411637030711251,
"loss": 0.9359,
"step": 878
},
{
"epoch": 0.57,
"grad_norm": 0.08920851349830627,
"learning_rate": 0.00016403726540427247,
"loss": 1.0716,
"step": 879
},
{
"epoch": 0.57,
"grad_norm": 0.09188678115606308,
"learning_rate": 0.0001639580925157226,
"loss": 1.3255,
"step": 880
},
{
"epoch": 0.57,
"grad_norm": 0.08219437301158905,
"learning_rate": 0.00016387885172551757,
"loss": 1.1677,
"step": 881
},
{
"epoch": 0.57,
"grad_norm": 0.08392970263957977,
"learning_rate": 0.00016379954311778388,
"loss": 1.3106,
"step": 882
},
{
"epoch": 0.57,
"grad_norm": 0.08758262544870377,
"learning_rate": 0.00016372016677672037,
"loss": 1.1562,
"step": 883
},
{
"epoch": 0.57,
"grad_norm": 0.08920885622501373,
"learning_rate": 0.0001636407227865975,
"loss": 1.0487,
"step": 884
},
{
"epoch": 0.57,
"grad_norm": 0.07952671498060226,
"learning_rate": 0.00016356121123175767,
"loss": 1.2011,
"step": 885
},
{
"epoch": 0.58,
"grad_norm": 0.0843694731593132,
"learning_rate": 0.00016348163219661506,
"loss": 1.1841,
"step": 886
},
{
"epoch": 0.58,
"grad_norm": 0.10402873158454895,
"learning_rate": 0.00016340198576565539,
"loss": 1.089,
"step": 887
},
{
"epoch": 0.58,
"grad_norm": 0.09944958984851837,
"learning_rate": 0.00016332227202343595,
"loss": 1.0941,
"step": 888
},
{
"epoch": 0.58,
"grad_norm": 0.08194096386432648,
"learning_rate": 0.00016324249105458555,
"loss": 1.109,
"step": 889
},
{
"epoch": 0.58,
"grad_norm": 0.07979317754507065,
"learning_rate": 0.00016316264294380432,
"loss": 1.0224,
"step": 890
},
{
"epoch": 0.58,
"grad_norm": 0.09668459743261337,
"learning_rate": 0.0001630827277758637,
"loss": 1.1358,
"step": 891
},
{
"epoch": 0.58,
"grad_norm": 0.07681312412023544,
"learning_rate": 0.00016300274563560633,
"loss": 1.1293,
"step": 892
},
{
"epoch": 0.58,
"grad_norm": 0.09155978262424469,
"learning_rate": 0.0001629226966079459,
"loss": 1.3014,
"step": 893
},
{
"epoch": 0.58,
"grad_norm": 0.09289336204528809,
"learning_rate": 0.00016284258077786716,
"loss": 1.0673,
"step": 894
},
{
"epoch": 0.58,
"grad_norm": 0.07735373824834824,
"learning_rate": 0.00016276239823042574,
"loss": 0.9928,
"step": 895
},
{
"epoch": 0.58,
"grad_norm": 0.09608011692762375,
"learning_rate": 0.00016268214905074818,
"loss": 1.1263,
"step": 896
},
{
"epoch": 0.58,
"grad_norm": 0.09346359968185425,
"learning_rate": 0.00016260183332403164,
"loss": 1.3477,
"step": 897
},
{
"epoch": 0.58,
"grad_norm": 0.09623821079730988,
"learning_rate": 0.00016252145113554404,
"loss": 0.9902,
"step": 898
},
{
"epoch": 0.58,
"grad_norm": 0.0833108201622963,
"learning_rate": 0.00016244100257062382,
"loss": 1.1103,
"step": 899
},
{
"epoch": 0.58,
"grad_norm": 0.07932139933109283,
"learning_rate": 0.00016236048771467988,
"loss": 1.1159,
"step": 900
},
{
"epoch": 0.59,
"grad_norm": 0.082742840051651,
"learning_rate": 0.00016227990665319147,
"loss": 0.992,
"step": 901
},
{
"epoch": 0.59,
"grad_norm": 0.07717438787221909,
"learning_rate": 0.00016219925947170822,
"loss": 1.138,
"step": 902
},
{
"epoch": 0.59,
"grad_norm": 0.0860557034611702,
"learning_rate": 0.00016211854625584985,
"loss": 1.262,
"step": 903
},
{
"epoch": 0.59,
"grad_norm": 0.10646398365497589,
"learning_rate": 0.00016203776709130627,
"loss": 1.3783,
"step": 904
},
{
"epoch": 0.59,
"grad_norm": 0.08500290662050247,
"learning_rate": 0.00016195692206383733,
"loss": 1.1034,
"step": 905
},
{
"epoch": 0.59,
"grad_norm": 0.08151032775640488,
"learning_rate": 0.0001618760112592729,
"loss": 0.8408,
"step": 906
},
{
"epoch": 0.59,
"grad_norm": 0.07579167932271957,
"learning_rate": 0.00016179503476351258,
"loss": 0.8924,
"step": 907
},
{
"epoch": 0.59,
"grad_norm": 0.1033424586057663,
"learning_rate": 0.0001617139926625258,
"loss": 1.2191,
"step": 908
},
{
"epoch": 0.59,
"grad_norm": 0.08320926129817963,
"learning_rate": 0.0001616328850423515,
"loss": 1.1108,
"step": 909
},
{
"epoch": 0.59,
"grad_norm": 0.08811642974615097,
"learning_rate": 0.00016155171198909841,
"loss": 1.0175,
"step": 910
},
{
"epoch": 0.59,
"grad_norm": 0.08476217091083527,
"learning_rate": 0.00016147047358894452,
"loss": 1.2875,
"step": 911
},
{
"epoch": 0.59,
"grad_norm": 0.08753612637519836,
"learning_rate": 0.0001613891699281373,
"loss": 1.156,
"step": 912
},
{
"epoch": 0.59,
"grad_norm": 0.09200330823659897,
"learning_rate": 0.00016130780109299345,
"loss": 1.4483,
"step": 913
},
{
"epoch": 0.59,
"grad_norm": 0.08672292530536652,
"learning_rate": 0.0001612263671698989,
"loss": 1.0182,
"step": 914
},
{
"epoch": 0.59,
"grad_norm": 0.09040652215480804,
"learning_rate": 0.00016114486824530869,
"loss": 1.3006,
"step": 915
},
{
"epoch": 0.59,
"grad_norm": 0.07458732277154922,
"learning_rate": 0.0001610633044057468,
"loss": 1.2662,
"step": 916
},
{
"epoch": 0.6,
"grad_norm": 0.07543858140707016,
"learning_rate": 0.00016098167573780624,
"loss": 1.3084,
"step": 917
},
{
"epoch": 0.6,
"grad_norm": 0.09571071714162827,
"learning_rate": 0.00016089998232814875,
"loss": 1.1357,
"step": 918
},
{
"epoch": 0.6,
"grad_norm": 0.08231940865516663,
"learning_rate": 0.00016081822426350484,
"loss": 1.0647,
"step": 919
},
{
"epoch": 0.6,
"grad_norm": 0.08672209829092026,
"learning_rate": 0.00016073640163067362,
"loss": 1.0215,
"step": 920
},
{
"epoch": 0.6,
"grad_norm": 0.08963492512702942,
"learning_rate": 0.00016065451451652285,
"loss": 1.1266,
"step": 921
},
{
"epoch": 0.6,
"grad_norm": 0.08349716663360596,
"learning_rate": 0.00016057256300798867,
"loss": 1.0656,
"step": 922
},
{
"epoch": 0.6,
"grad_norm": 0.08417947590351105,
"learning_rate": 0.00016049054719207554,
"loss": 1.227,
"step": 923
},
{
"epoch": 0.6,
"grad_norm": 0.08216405659914017,
"learning_rate": 0.00016040846715585633,
"loss": 1.0997,
"step": 924
},
{
"epoch": 0.6,
"grad_norm": 0.09813476353883743,
"learning_rate": 0.00016032632298647196,
"loss": 1.2417,
"step": 925
},
{
"epoch": 0.6,
"grad_norm": 0.0971807986497879,
"learning_rate": 0.00016024411477113152,
"loss": 1.1989,
"step": 926
},
{
"epoch": 0.6,
"grad_norm": 0.10932951420545578,
"learning_rate": 0.00016016184259711204,
"loss": 1.2781,
"step": 927
},
{
"epoch": 0.6,
"grad_norm": 0.09671205282211304,
"learning_rate": 0.0001600795065517585,
"loss": 1.299,
"step": 928
},
{
"epoch": 0.6,
"grad_norm": 0.08652383089065552,
"learning_rate": 0.00015999710672248365,
"loss": 1.3306,
"step": 929
},
{
"epoch": 0.6,
"grad_norm": 0.10589951276779175,
"learning_rate": 0.000159914643196768,
"loss": 1.3625,
"step": 930
},
{
"epoch": 0.6,
"grad_norm": 0.08908641338348389,
"learning_rate": 0.00015983211606215958,
"loss": 1.1171,
"step": 931
},
{
"epoch": 0.61,
"grad_norm": 0.13235490024089813,
"learning_rate": 0.00015974952540627412,
"loss": 0.9521,
"step": 932
},
{
"epoch": 0.61,
"grad_norm": 0.0815630853176117,
"learning_rate": 0.00015966687131679463,
"loss": 1.2107,
"step": 933
},
{
"epoch": 0.61,
"grad_norm": 0.08272356539964676,
"learning_rate": 0.00015958415388147155,
"loss": 1.1149,
"step": 934
},
{
"epoch": 0.61,
"grad_norm": 0.08392516523599625,
"learning_rate": 0.0001595013731881226,
"loss": 1.2414,
"step": 935
},
{
"epoch": 0.61,
"grad_norm": 0.08603011816740036,
"learning_rate": 0.00015941852932463256,
"loss": 1.2616,
"step": 936
},
{
"epoch": 0.61,
"grad_norm": 0.07599209994077682,
"learning_rate": 0.0001593356223789533,
"loss": 1.0765,
"step": 937
},
{
"epoch": 0.61,
"grad_norm": 0.08189697563648224,
"learning_rate": 0.00015925265243910372,
"loss": 1.1934,
"step": 938
},
{
"epoch": 0.61,
"grad_norm": 0.08917500823736191,
"learning_rate": 0.00015916961959316957,
"loss": 1.2389,
"step": 939
},
{
"epoch": 0.61,
"grad_norm": 0.07975764572620392,
"learning_rate": 0.0001590865239293034,
"loss": 1.2643,
"step": 940
},
{
"epoch": 0.61,
"grad_norm": 0.07825619727373123,
"learning_rate": 0.0001590033655357244,
"loss": 0.9547,
"step": 941
},
{
"epoch": 0.61,
"grad_norm": 0.08327656984329224,
"learning_rate": 0.00015892014450071836,
"loss": 1.1476,
"step": 942
},
{
"epoch": 0.61,
"grad_norm": 0.08970014750957489,
"learning_rate": 0.00015883686091263768,
"loss": 1.158,
"step": 943
},
{
"epoch": 0.61,
"grad_norm": 0.08594327419996262,
"learning_rate": 0.00015875351485990105,
"loss": 1.0168,
"step": 944
},
{
"epoch": 0.61,
"grad_norm": 0.08538670837879181,
"learning_rate": 0.0001586701064309935,
"loss": 0.8934,
"step": 945
},
{
"epoch": 0.61,
"grad_norm": 0.0771031305193901,
"learning_rate": 0.00015858663571446631,
"loss": 1.2207,
"step": 946
},
{
"epoch": 0.62,
"grad_norm": 0.11030296981334686,
"learning_rate": 0.0001585031027989369,
"loss": 1.3215,
"step": 947
},
{
"epoch": 0.62,
"grad_norm": 0.08234802633523941,
"learning_rate": 0.0001584195077730887,
"loss": 1.0878,
"step": 948
},
{
"epoch": 0.62,
"grad_norm": 0.08790728449821472,
"learning_rate": 0.00015833585072567104,
"loss": 1.0046,
"step": 949
},
{
"epoch": 0.62,
"grad_norm": 0.12733106315135956,
"learning_rate": 0.00015825213174549925,
"loss": 1.4015,
"step": 950
},
{
"epoch": 0.62,
"grad_norm": 0.1594487875699997,
"learning_rate": 0.00015816835092145417,
"loss": 1.0978,
"step": 951
},
{
"epoch": 0.62,
"grad_norm": 0.08943841606378555,
"learning_rate": 0.0001580845083424825,
"loss": 1.2287,
"step": 952
},
{
"epoch": 0.62,
"grad_norm": 0.07976686954498291,
"learning_rate": 0.0001580006040975964,
"loss": 1.2446,
"step": 953
},
{
"epoch": 0.62,
"grad_norm": 0.07864785194396973,
"learning_rate": 0.00015791663827587353,
"loss": 1.1802,
"step": 954
},
{
"epoch": 0.62,
"grad_norm": 0.11824239790439606,
"learning_rate": 0.00015783261096645695,
"loss": 1.0745,
"step": 955
},
{
"epoch": 0.62,
"grad_norm": 0.08143503963947296,
"learning_rate": 0.00015774852225855496,
"loss": 1.1099,
"step": 956
},
{
"epoch": 0.62,
"grad_norm": 0.07825972139835358,
"learning_rate": 0.00015766437224144103,
"loss": 1.3001,
"step": 957
},
{
"epoch": 0.62,
"grad_norm": 0.09076002240180969,
"learning_rate": 0.0001575801610044538,
"loss": 1.2856,
"step": 958
},
{
"epoch": 0.62,
"grad_norm": 0.09442058950662613,
"learning_rate": 0.0001574958886369968,
"loss": 1.2894,
"step": 959
},
{
"epoch": 0.62,
"grad_norm": 0.08304957300424576,
"learning_rate": 0.0001574115552285385,
"loss": 1.1424,
"step": 960
},
{
"epoch": 0.62,
"grad_norm": 0.08791410177946091,
"learning_rate": 0.0001573271608686122,
"loss": 1.1523,
"step": 961
},
{
"epoch": 0.62,
"grad_norm": 0.09252519905567169,
"learning_rate": 0.00015724270564681592,
"loss": 1.3296,
"step": 962
},
{
"epoch": 0.63,
"grad_norm": 0.07583607733249664,
"learning_rate": 0.00015715818965281221,
"loss": 1.1952,
"step": 963
},
{
"epoch": 0.63,
"grad_norm": 0.09245412796735764,
"learning_rate": 0.00015707361297632828,
"loss": 0.9826,
"step": 964
},
{
"epoch": 0.63,
"grad_norm": 0.0894828587770462,
"learning_rate": 0.0001569889757071556,
"loss": 1.1895,
"step": 965
},
{
"epoch": 0.63,
"grad_norm": 0.0824456512928009,
"learning_rate": 0.0001569042779351501,
"loss": 1.0105,
"step": 966
},
{
"epoch": 0.63,
"grad_norm": 0.08300875127315521,
"learning_rate": 0.00015681951975023186,
"loss": 0.9969,
"step": 967
},
{
"epoch": 0.63,
"grad_norm": 0.09487631171941757,
"learning_rate": 0.00015673470124238516,
"loss": 1.2029,
"step": 968
},
{
"epoch": 0.63,
"grad_norm": 0.08492422103881836,
"learning_rate": 0.00015664982250165828,
"loss": 1.1835,
"step": 969
},
{
"epoch": 0.63,
"grad_norm": 0.12439217418432236,
"learning_rate": 0.00015656488361816346,
"loss": 1.1601,
"step": 970
},
{
"epoch": 0.63,
"grad_norm": 0.09230206161737442,
"learning_rate": 0.00015647988468207676,
"loss": 0.9133,
"step": 971
},
{
"epoch": 0.63,
"grad_norm": 0.08204157650470734,
"learning_rate": 0.0001563948257836381,
"loss": 1.0194,
"step": 972
},
{
"epoch": 0.63,
"grad_norm": 0.07965037971735,
"learning_rate": 0.00015630970701315094,
"loss": 1.3518,
"step": 973
},
{
"epoch": 0.63,
"grad_norm": 0.08081319183111191,
"learning_rate": 0.00015622452846098233,
"loss": 1.1105,
"step": 974
},
{
"epoch": 0.63,
"grad_norm": 0.08211128413677216,
"learning_rate": 0.00015613929021756284,
"loss": 1.0984,
"step": 975
},
{
"epoch": 0.63,
"grad_norm": 0.09477461129426956,
"learning_rate": 0.0001560539923733864,
"loss": 1.0591,
"step": 976
},
{
"epoch": 0.63,
"grad_norm": 0.07867059111595154,
"learning_rate": 0.00015596863501901012,
"loss": 1.2916,
"step": 977
},
{
"epoch": 0.64,
"grad_norm": 0.08776961266994476,
"learning_rate": 0.00015588321824505443,
"loss": 1.2977,
"step": 978
},
{
"epoch": 0.64,
"grad_norm": 0.0917336493730545,
"learning_rate": 0.00015579774214220278,
"loss": 1.239,
"step": 979
},
{
"epoch": 0.64,
"grad_norm": 0.09087909013032913,
"learning_rate": 0.00015571220680120153,
"loss": 1.1542,
"step": 980
},
{
"epoch": 0.64,
"grad_norm": 0.07975257188081741,
"learning_rate": 0.0001556266123128601,
"loss": 1.0691,
"step": 981
},
{
"epoch": 0.64,
"grad_norm": 0.08824899792671204,
"learning_rate": 0.00015554095876805057,
"loss": 1.3249,
"step": 982
},
{
"epoch": 0.64,
"grad_norm": 0.11979102343320847,
"learning_rate": 0.0001554552462577077,
"loss": 1.1488,
"step": 983
},
{
"epoch": 0.64,
"grad_norm": 0.0774589404463768,
"learning_rate": 0.00015536947487282903,
"loss": 1.0791,
"step": 984
},
{
"epoch": 0.64,
"grad_norm": 0.09189926832914352,
"learning_rate": 0.00015528364470447436,
"loss": 0.9839,
"step": 985
},
{
"epoch": 0.64,
"grad_norm": 0.08508016914129257,
"learning_rate": 0.00015519775584376611,
"loss": 0.9511,
"step": 986
},
{
"epoch": 0.64,
"grad_norm": 0.08841552585363388,
"learning_rate": 0.0001551118083818889,
"loss": 1.1512,
"step": 987
},
{
"epoch": 0.64,
"grad_norm": 0.08979780226945877,
"learning_rate": 0.00015502580241008956,
"loss": 1.2946,
"step": 988
},
{
"epoch": 0.64,
"grad_norm": 0.0848306193947792,
"learning_rate": 0.0001549397380196771,
"loss": 1.0743,
"step": 989
},
{
"epoch": 0.64,
"grad_norm": 0.08302175253629684,
"learning_rate": 0.00015485361530202248,
"loss": 0.9865,
"step": 990
},
{
"epoch": 0.64,
"grad_norm": 0.08802732825279236,
"learning_rate": 0.00015476743434855866,
"loss": 1.171,
"step": 991
},
{
"epoch": 0.64,
"grad_norm": 0.08969046920537949,
"learning_rate": 0.00015468119525078032,
"loss": 1.1248,
"step": 992
},
{
"epoch": 0.65,
"grad_norm": 0.0782170295715332,
"learning_rate": 0.000154594898100244,
"loss": 0.9323,
"step": 993
},
{
"epoch": 0.65,
"grad_norm": 0.0807466134428978,
"learning_rate": 0.00015450854298856777,
"loss": 0.8486,
"step": 994
},
{
"epoch": 0.65,
"grad_norm": 0.08955421298742294,
"learning_rate": 0.00015442213000743129,
"loss": 0.9727,
"step": 995
},
{
"epoch": 0.65,
"grad_norm": 0.09456660598516464,
"learning_rate": 0.00015433565924857564,
"loss": 1.2232,
"step": 996
},
{
"epoch": 0.65,
"grad_norm": 0.08864837139844894,
"learning_rate": 0.00015424913080380325,
"loss": 1.2478,
"step": 997
},
{
"epoch": 0.65,
"grad_norm": 0.07130644470453262,
"learning_rate": 0.00015416254476497776,
"loss": 1.1128,
"step": 998
},
{
"epoch": 0.65,
"grad_norm": 0.08904992043972015,
"learning_rate": 0.00015407590122402395,
"loss": 1.1202,
"step": 999
},
{
"epoch": 0.65,
"grad_norm": 0.08642040938138962,
"learning_rate": 0.00015398920027292776,
"loss": 1.2332,
"step": 1000
},
{
"epoch": 0.65,
"grad_norm": 0.10026253014802933,
"learning_rate": 0.00015390244200373592,
"loss": 1.2327,
"step": 1001
},
{
"epoch": 0.65,
"grad_norm": 0.0839606449007988,
"learning_rate": 0.00015381562650855612,
"loss": 1.193,
"step": 1002
},
{
"epoch": 0.65,
"grad_norm": 0.0924573689699173,
"learning_rate": 0.00015372875387955677,
"loss": 1.3187,
"step": 1003
},
{
"epoch": 0.65,
"grad_norm": 0.08120997995138168,
"learning_rate": 0.0001536418242089669,
"loss": 1.1482,
"step": 1004
},
{
"epoch": 0.65,
"grad_norm": 0.08533752709627151,
"learning_rate": 0.0001535548375890762,
"loss": 1.4419,
"step": 1005
},
{
"epoch": 0.65,
"grad_norm": 0.08670300245285034,
"learning_rate": 0.00015346779411223472,
"loss": 1.1334,
"step": 1006
},
{
"epoch": 0.65,
"grad_norm": 0.09586023539304733,
"learning_rate": 0.00015338069387085294,
"loss": 1.1792,
"step": 1007
},
{
"epoch": 0.65,
"grad_norm": 0.08470802009105682,
"learning_rate": 0.0001532935369574015,
"loss": 1.1229,
"step": 1008
},
{
"epoch": 0.66,
"grad_norm": 0.09617331624031067,
"learning_rate": 0.00015320632346441142,
"loss": 1.1177,
"step": 1009
},
{
"epoch": 0.66,
"grad_norm": 0.09748532623052597,
"learning_rate": 0.0001531190534844735,
"loss": 1.1242,
"step": 1010
},
{
"epoch": 0.66,
"grad_norm": 0.09671757370233536,
"learning_rate": 0.00015303172711023875,
"loss": 1.4025,
"step": 1011
},
{
"epoch": 0.66,
"grad_norm": 0.08423331379890442,
"learning_rate": 0.00015294434443441794,
"loss": 1.216,
"step": 1012
},
{
"epoch": 0.66,
"grad_norm": 0.0889984667301178,
"learning_rate": 0.00015285690554978163,
"loss": 1.1172,
"step": 1013
},
{
"epoch": 0.66,
"grad_norm": 0.08674996346235275,
"learning_rate": 0.00015276941054916002,
"loss": 0.8588,
"step": 1014
},
{
"epoch": 0.66,
"grad_norm": 0.10601639002561569,
"learning_rate": 0.00015268185952544303,
"loss": 1.1771,
"step": 1015
},
{
"epoch": 0.66,
"grad_norm": 0.09794366359710693,
"learning_rate": 0.00015259425257157987,
"loss": 0.9863,
"step": 1016
},
{
"epoch": 0.66,
"grad_norm": 0.09882765263319016,
"learning_rate": 0.0001525065897805792,
"loss": 1.2867,
"step": 1017
},
{
"epoch": 0.66,
"grad_norm": 0.08232888579368591,
"learning_rate": 0.000152418871245509,
"loss": 1.2046,
"step": 1018
},
{
"epoch": 0.66,
"grad_norm": 0.07535366714000702,
"learning_rate": 0.00015233109705949644,
"loss": 1.0457,
"step": 1019
},
{
"epoch": 0.66,
"grad_norm": 0.09567664563655853,
"learning_rate": 0.00015224326731572764,
"loss": 1.425,
"step": 1020
},
{
"epoch": 0.66,
"grad_norm": 0.07062424719333649,
"learning_rate": 0.0001521553821074479,
"loss": 0.9607,
"step": 1021
},
{
"epoch": 0.66,
"grad_norm": 0.08274392038583755,
"learning_rate": 0.00015206744152796123,
"loss": 1.0936,
"step": 1022
},
{
"epoch": 0.66,
"grad_norm": 0.09635645896196365,
"learning_rate": 0.0001519794456706305,
"loss": 1.3019,
"step": 1023
},
{
"epoch": 0.67,
"grad_norm": 0.0800948366522789,
"learning_rate": 0.00015189139462887732,
"loss": 0.929,
"step": 1024
},
{
"epoch": 0.67,
"grad_norm": 0.09318748116493225,
"learning_rate": 0.0001518032884961818,
"loss": 1.3526,
"step": 1025
},
{
"epoch": 0.67,
"grad_norm": 0.07523728907108307,
"learning_rate": 0.00015171512736608254,
"loss": 1.1982,
"step": 1026
},
{
"epoch": 0.67,
"grad_norm": 0.09203594923019409,
"learning_rate": 0.0001516269113321766,
"loss": 1.3409,
"step": 1027
},
{
"epoch": 0.67,
"grad_norm": 0.08934096992015839,
"learning_rate": 0.00015153864048811925,
"loss": 1.2817,
"step": 1028
},
{
"epoch": 0.67,
"grad_norm": 0.08486980944871902,
"learning_rate": 0.00015145031492762404,
"loss": 0.8791,
"step": 1029
},
{
"epoch": 0.67,
"grad_norm": 0.08579596132040024,
"learning_rate": 0.00015136193474446247,
"loss": 1.1381,
"step": 1030
},
{
"epoch": 0.67,
"grad_norm": 0.08721473067998886,
"learning_rate": 0.00015127350003246422,
"loss": 1.429,
"step": 1031
},
{
"epoch": 0.67,
"grad_norm": 0.09468651562929153,
"learning_rate": 0.00015118501088551666,
"loss": 1.2188,
"step": 1032
},
{
"epoch": 0.67,
"grad_norm": 0.07146283984184265,
"learning_rate": 0.0001510964673975651,
"loss": 1.1188,
"step": 1033
},
{
"epoch": 0.67,
"grad_norm": 0.09246056526899338,
"learning_rate": 0.00015100786966261247,
"loss": 1.2027,
"step": 1034
},
{
"epoch": 0.67,
"grad_norm": 0.08403714001178741,
"learning_rate": 0.00015091921777471936,
"loss": 1.273,
"step": 1035
},
{
"epoch": 0.67,
"grad_norm": 0.0870928019285202,
"learning_rate": 0.00015083051182800372,
"loss": 1.1654,
"step": 1036
},
{
"epoch": 0.67,
"grad_norm": 0.08683963119983673,
"learning_rate": 0.00015074175191664104,
"loss": 1.331,
"step": 1037
},
{
"epoch": 0.67,
"grad_norm": 0.0884169265627861,
"learning_rate": 0.00015065293813486404,
"loss": 1.2137,
"step": 1038
},
{
"epoch": 0.67,
"grad_norm": 0.08322736620903015,
"learning_rate": 0.0001505640705769626,
"loss": 1.122,
"step": 1039
},
{
"epoch": 0.68,
"grad_norm": 0.07375328987836838,
"learning_rate": 0.0001504751493372837,
"loss": 1.0161,
"step": 1040
},
{
"epoch": 0.68,
"grad_norm": 0.12521041929721832,
"learning_rate": 0.00015038617451023143,
"loss": 1.0004,
"step": 1041
},
{
"epoch": 0.68,
"grad_norm": 0.08745912462472916,
"learning_rate": 0.00015029714619026654,
"loss": 1.2979,
"step": 1042
},
{
"epoch": 0.68,
"grad_norm": 0.10064958781003952,
"learning_rate": 0.00015020806447190683,
"loss": 1.2741,
"step": 1043
},
{
"epoch": 0.68,
"grad_norm": 0.08009211719036102,
"learning_rate": 0.0001501189294497266,
"loss": 1.2031,
"step": 1044
},
{
"epoch": 0.68,
"grad_norm": 0.08244958519935608,
"learning_rate": 0.00015002974121835686,
"loss": 1.0796,
"step": 1045
},
{
"epoch": 0.68,
"grad_norm": 0.09753942489624023,
"learning_rate": 0.00014994049987248498,
"loss": 1.0495,
"step": 1046
},
{
"epoch": 0.68,
"grad_norm": 0.08097665756940842,
"learning_rate": 0.00014985120550685483,
"loss": 1.0305,
"step": 1047
},
{
"epoch": 0.68,
"grad_norm": 0.09167876839637756,
"learning_rate": 0.00014976185821626657,
"loss": 1.3199,
"step": 1048
},
{
"epoch": 0.68,
"grad_norm": 0.09040147811174393,
"learning_rate": 0.00014967245809557647,
"loss": 1.142,
"step": 1049
},
{
"epoch": 0.68,
"grad_norm": 0.10372103005647659,
"learning_rate": 0.00014958300523969695,
"loss": 1.0238,
"step": 1050
},
{
"epoch": 0.68,
"grad_norm": 0.7571262717247009,
"learning_rate": 0.0001494934997435964,
"loss": 0.6414,
"step": 1051
},
{
"epoch": 0.68,
"grad_norm": 0.08545485883951187,
"learning_rate": 0.0001494039417022991,
"loss": 1.2469,
"step": 1052
},
{
"epoch": 0.68,
"grad_norm": 0.09645125269889832,
"learning_rate": 0.0001493143312108851,
"loss": 1.2064,
"step": 1053
},
{
"epoch": 0.68,
"grad_norm": 0.10659247636795044,
"learning_rate": 0.00014922466836449013,
"loss": 1.2185,
"step": 1054
},
{
"epoch": 0.69,
"grad_norm": 0.0905073881149292,
"learning_rate": 0.0001491349532583056,
"loss": 1.1531,
"step": 1055
},
{
"epoch": 0.69,
"grad_norm": 0.08443164825439453,
"learning_rate": 0.00014904518598757814,
"loss": 1.0289,
"step": 1056
},
{
"epoch": 0.69,
"grad_norm": 0.10690521448850632,
"learning_rate": 0.00014895536664761013,
"loss": 1.182,
"step": 1057
},
{
"epoch": 0.69,
"grad_norm": 0.09757747501134872,
"learning_rate": 0.00014886549533375896,
"loss": 1.298,
"step": 1058
},
{
"epoch": 0.69,
"grad_norm": 0.11675470322370529,
"learning_rate": 0.00014877557214143728,
"loss": 1.3993,
"step": 1059
},
{
"epoch": 0.69,
"grad_norm": 0.09194236993789673,
"learning_rate": 0.00014868559716611277,
"loss": 1.2173,
"step": 1060
},
{
"epoch": 0.69,
"grad_norm": 0.09742829203605652,
"learning_rate": 0.0001485955705033082,
"loss": 1.0765,
"step": 1061
},
{
"epoch": 0.69,
"grad_norm": 0.13593271374702454,
"learning_rate": 0.00014850549224860112,
"loss": 1.3141,
"step": 1062
},
{
"epoch": 0.69,
"grad_norm": 0.09260403364896774,
"learning_rate": 0.0001484153624976239,
"loss": 1.1365,
"step": 1063
},
{
"epoch": 0.69,
"grad_norm": 0.12835289537906647,
"learning_rate": 0.0001483251813460635,
"loss": 1.206,
"step": 1064
},
{
"epoch": 0.69,
"grad_norm": 0.09844189882278442,
"learning_rate": 0.00014823494888966158,
"loss": 1.3833,
"step": 1065
},
{
"epoch": 0.69,
"grad_norm": 0.08510404080152512,
"learning_rate": 0.00014814466522421416,
"loss": 1.1127,
"step": 1066
},
{
"epoch": 0.69,
"grad_norm": 0.10135819762945175,
"learning_rate": 0.00014805433044557168,
"loss": 1.3373,
"step": 1067
},
{
"epoch": 0.69,
"grad_norm": 0.09199123084545135,
"learning_rate": 0.0001479639446496388,
"loss": 1.1125,
"step": 1068
},
{
"epoch": 0.69,
"grad_norm": 0.09400131553411484,
"learning_rate": 0.0001478735079323744,
"loss": 0.9781,
"step": 1069
},
{
"epoch": 0.7,
"grad_norm": 0.1093631461262703,
"learning_rate": 0.00014778302038979138,
"loss": 1.0756,
"step": 1070
},
{
"epoch": 0.7,
"grad_norm": 0.09117407351732254,
"learning_rate": 0.00014769248211795664,
"loss": 1.1776,
"step": 1071
},
{
"epoch": 0.7,
"grad_norm": 0.10300873219966888,
"learning_rate": 0.00014760189321299087,
"loss": 1.2383,
"step": 1072
},
{
"epoch": 0.7,
"grad_norm": 0.07841359078884125,
"learning_rate": 0.00014751125377106858,
"loss": 1.1613,
"step": 1073
},
{
"epoch": 0.7,
"grad_norm": 0.08929870277643204,
"learning_rate": 0.0001474205638884179,
"loss": 1.2414,
"step": 1074
},
{
"epoch": 0.7,
"grad_norm": 0.10028059780597687,
"learning_rate": 0.00014732982366132054,
"loss": 1.2768,
"step": 1075
},
{
"epoch": 0.7,
"grad_norm": 0.08279004693031311,
"learning_rate": 0.00014723903318611156,
"loss": 0.7972,
"step": 1076
},
{
"epoch": 0.7,
"grad_norm": 0.09201148897409439,
"learning_rate": 0.00014714819255917956,
"loss": 1.2578,
"step": 1077
},
{
"epoch": 0.7,
"grad_norm": 0.10396204143762589,
"learning_rate": 0.00014705730187696619,
"loss": 1.1112,
"step": 1078
},
{
"epoch": 0.7,
"grad_norm": 0.09462717175483704,
"learning_rate": 0.00014696636123596633,
"loss": 1.2364,
"step": 1079
},
{
"epoch": 0.7,
"grad_norm": 0.10165147483348846,
"learning_rate": 0.0001468753707327279,
"loss": 1.0232,
"step": 1080
},
{
"epoch": 0.7,
"grad_norm": 0.08603104948997498,
"learning_rate": 0.00014678433046385174,
"loss": 1.0741,
"step": 1081
},
{
"epoch": 0.7,
"grad_norm": 0.10029155015945435,
"learning_rate": 0.00014669324052599153,
"loss": 1.155,
"step": 1082
},
{
"epoch": 0.7,
"grad_norm": 0.08776471763849258,
"learning_rate": 0.00014660210101585368,
"loss": 1.0608,
"step": 1083
},
{
"epoch": 0.7,
"grad_norm": 0.07470471411943436,
"learning_rate": 0.0001465109120301972,
"loss": 1.1027,
"step": 1084
},
{
"epoch": 0.7,
"grad_norm": 0.07778345793485641,
"learning_rate": 0.0001464196736658337,
"loss": 1.0816,
"step": 1085
},
{
"epoch": 0.71,
"grad_norm": 0.08618229627609253,
"learning_rate": 0.00014632838601962716,
"loss": 1.1412,
"step": 1086
},
{
"epoch": 0.71,
"grad_norm": 0.08929042518138885,
"learning_rate": 0.00014623704918849392,
"loss": 1.3558,
"step": 1087
},
{
"epoch": 0.71,
"grad_norm": 0.08327944576740265,
"learning_rate": 0.00014614566326940243,
"loss": 1.0872,
"step": 1088
},
{
"epoch": 0.71,
"grad_norm": 0.07381748408079147,
"learning_rate": 0.00014605422835937338,
"loss": 1.1229,
"step": 1089
},
{
"epoch": 0.71,
"grad_norm": 0.0991898626089096,
"learning_rate": 0.00014596274455547941,
"loss": 0.9485,
"step": 1090
},
{
"epoch": 0.71,
"grad_norm": 0.08856026828289032,
"learning_rate": 0.00014587121195484511,
"loss": 1.0784,
"step": 1091
},
{
"epoch": 0.71,
"grad_norm": 0.08923971652984619,
"learning_rate": 0.0001457796306546468,
"loss": 1.2214,
"step": 1092
},
{
"epoch": 0.71,
"grad_norm": 0.08865326642990112,
"learning_rate": 0.00014568800075211258,
"loss": 1.0595,
"step": 1093
},
{
"epoch": 0.71,
"grad_norm": 0.09346123039722443,
"learning_rate": 0.00014559632234452212,
"loss": 1.1833,
"step": 1094
},
{
"epoch": 0.71,
"grad_norm": 0.07990922778844833,
"learning_rate": 0.00014550459552920656,
"loss": 0.9171,
"step": 1095
},
{
"epoch": 0.71,
"grad_norm": 0.08237365633249283,
"learning_rate": 0.0001454128204035485,
"loss": 1.2034,
"step": 1096
},
{
"epoch": 0.71,
"grad_norm": 0.08575394004583359,
"learning_rate": 0.00014532099706498174,
"loss": 1.0053,
"step": 1097
},
{
"epoch": 0.71,
"grad_norm": 0.08545620739459991,
"learning_rate": 0.00014522912561099134,
"loss": 1.1055,
"step": 1098
},
{
"epoch": 0.71,
"grad_norm": 0.0953921377658844,
"learning_rate": 0.0001451372061391134,
"loss": 1.0586,
"step": 1099
},
{
"epoch": 0.71,
"grad_norm": 0.1096978709101677,
"learning_rate": 0.00014504523874693501,
"loss": 1.227,
"step": 1100
},
{
"epoch": 0.72,
"grad_norm": 0.09339159727096558,
"learning_rate": 0.00014495322353209414,
"loss": 1.2558,
"step": 1101
},
{
"epoch": 0.72,
"grad_norm": 0.11579679697751999,
"learning_rate": 0.00014486116059227955,
"loss": 1.264,
"step": 1102
},
{
"epoch": 0.72,
"grad_norm": 0.0806502103805542,
"learning_rate": 0.00014476905002523064,
"loss": 1.2278,
"step": 1103
},
{
"epoch": 0.72,
"grad_norm": 0.08331871777772903,
"learning_rate": 0.00014467689192873735,
"loss": 1.0335,
"step": 1104
},
{
"epoch": 0.72,
"grad_norm": 0.1019323468208313,
"learning_rate": 0.00014458468640064014,
"loss": 1.3601,
"step": 1105
},
{
"epoch": 0.72,
"grad_norm": 0.09600196778774261,
"learning_rate": 0.00014449243353882978,
"loss": 1.125,
"step": 1106
},
{
"epoch": 0.72,
"grad_norm": 0.08367899060249329,
"learning_rate": 0.00014440013344124735,
"loss": 1.158,
"step": 1107
},
{
"epoch": 0.72,
"grad_norm": 0.10402350127696991,
"learning_rate": 0.00014430778620588396,
"loss": 1.0836,
"step": 1108
},
{
"epoch": 0.72,
"grad_norm": 0.08525457233190536,
"learning_rate": 0.00014421539193078088,
"loss": 1.0902,
"step": 1109
},
{
"epoch": 0.72,
"grad_norm": 0.09753510355949402,
"learning_rate": 0.00014412295071402934,
"loss": 1.2485,
"step": 1110
},
{
"epoch": 0.72,
"grad_norm": 0.09461357444524765,
"learning_rate": 0.00014403046265377024,
"loss": 1.2085,
"step": 1111
},
{
"epoch": 0.72,
"grad_norm": 0.08117979764938354,
"learning_rate": 0.0001439379278481944,
"loss": 1.0515,
"step": 1112
},
{
"epoch": 0.72,
"grad_norm": 0.1041373535990715,
"learning_rate": 0.00014384534639554216,
"loss": 1.0654,
"step": 1113
},
{
"epoch": 0.72,
"grad_norm": 0.08893406391143799,
"learning_rate": 0.00014375271839410338,
"loss": 0.9929,
"step": 1114
},
{
"epoch": 0.72,
"grad_norm": 0.08799731731414795,
"learning_rate": 0.00014366004394221745,
"loss": 0.9943,
"step": 1115
},
{
"epoch": 0.72,
"grad_norm": 0.11736821383237839,
"learning_rate": 0.00014356732313827288,
"loss": 1.3649,
"step": 1116
},
{
"epoch": 0.73,
"grad_norm": 0.07686550915241241,
"learning_rate": 0.00014347455608070762,
"loss": 0.8763,
"step": 1117
},
{
"epoch": 0.73,
"grad_norm": 0.08737040311098099,
"learning_rate": 0.00014338174286800852,
"loss": 1.0713,
"step": 1118
},
{
"epoch": 0.73,
"grad_norm": 0.09222594648599625,
"learning_rate": 0.00014328888359871157,
"loss": 1.2232,
"step": 1119
},
{
"epoch": 0.73,
"grad_norm": 0.08900738507509232,
"learning_rate": 0.00014319597837140157,
"loss": 1.3012,
"step": 1120
},
{
"epoch": 0.73,
"grad_norm": 0.08774983137845993,
"learning_rate": 0.0001431030272847122,
"loss": 1.232,
"step": 1121
},
{
"epoch": 0.73,
"grad_norm": 0.08808692544698715,
"learning_rate": 0.0001430100304373257,
"loss": 0.8727,
"step": 1122
},
{
"epoch": 0.73,
"grad_norm": 0.09503446519374847,
"learning_rate": 0.00014291698792797306,
"loss": 1.2056,
"step": 1123
},
{
"epoch": 0.73,
"grad_norm": 0.1891634315252304,
"learning_rate": 0.0001428238998554336,
"loss": 1.035,
"step": 1124
},
{
"epoch": 0.73,
"grad_norm": 0.0838143453001976,
"learning_rate": 0.00014273076631853503,
"loss": 1.1841,
"step": 1125
},
{
"epoch": 0.73,
"grad_norm": 0.08770063519477844,
"learning_rate": 0.00014263758741615346,
"loss": 1.2907,
"step": 1126
},
{
"epoch": 0.73,
"grad_norm": 0.08471453934907913,
"learning_rate": 0.00014254436324721297,
"loss": 1.3639,
"step": 1127
},
{
"epoch": 0.73,
"grad_norm": 0.07473208755254745,
"learning_rate": 0.00014245109391068585,
"loss": 1.1097,
"step": 1128
},
{
"epoch": 0.73,
"grad_norm": 0.07981300354003906,
"learning_rate": 0.00014235777950559228,
"loss": 1.3015,
"step": 1129
},
{
"epoch": 0.73,
"grad_norm": 0.09128767997026443,
"learning_rate": 0.00014226442013100035,
"loss": 1.2391,
"step": 1130
},
{
"epoch": 0.73,
"grad_norm": 0.11322212964296341,
"learning_rate": 0.00014217101588602572,
"loss": 1.4983,
"step": 1131
},
{
"epoch": 0.74,
"grad_norm": 0.0780053436756134,
"learning_rate": 0.0001420775668698319,
"loss": 0.9616,
"step": 1132
},
{
"epoch": 0.74,
"grad_norm": 0.0901261419057846,
"learning_rate": 0.00014198407318162976,
"loss": 1.218,
"step": 1133
},
{
"epoch": 0.74,
"grad_norm": 0.08618736267089844,
"learning_rate": 0.00014189053492067775,
"loss": 1.1451,
"step": 1134
},
{
"epoch": 0.74,
"grad_norm": 0.0928853377699852,
"learning_rate": 0.0001417969521862815,
"loss": 1.0681,
"step": 1135
},
{
"epoch": 0.74,
"grad_norm": 0.0962471067905426,
"learning_rate": 0.00014170332507779397,
"loss": 1.2452,
"step": 1136
},
{
"epoch": 0.74,
"grad_norm": 0.10371973365545273,
"learning_rate": 0.00014160965369461515,
"loss": 1.2264,
"step": 1137
},
{
"epoch": 0.74,
"grad_norm": 0.09922056645154953,
"learning_rate": 0.0001415159381361921,
"loss": 1.386,
"step": 1138
},
{
"epoch": 0.74,
"grad_norm": 0.12338308990001678,
"learning_rate": 0.00014142217850201868,
"loss": 1.0467,
"step": 1139
},
{
"epoch": 0.74,
"grad_norm": 0.09021501988172531,
"learning_rate": 0.00014132837489163567,
"loss": 1.1333,
"step": 1140
},
{
"epoch": 0.74,
"grad_norm": 0.08738788217306137,
"learning_rate": 0.00014123452740463042,
"loss": 0.983,
"step": 1141
},
{
"epoch": 0.74,
"grad_norm": 0.08590232580900192,
"learning_rate": 0.00014114063614063696,
"loss": 1.1199,
"step": 1142
},
{
"epoch": 0.74,
"grad_norm": 0.08662337064743042,
"learning_rate": 0.00014104670119933571,
"loss": 1.128,
"step": 1143
},
{
"epoch": 0.74,
"grad_norm": 0.09038158506155014,
"learning_rate": 0.00014095272268045355,
"loss": 1.19,
"step": 1144
},
{
"epoch": 0.74,
"grad_norm": 0.09062106162309647,
"learning_rate": 0.00014085870068376353,
"loss": 1.2092,
"step": 1145
},
{
"epoch": 0.74,
"grad_norm": 0.1351018100976944,
"learning_rate": 0.00014076463530908494,
"loss": 1.2049,
"step": 1146
},
{
"epoch": 0.75,
"grad_norm": 0.10191931575536728,
"learning_rate": 0.00014067052665628308,
"loss": 1.3527,
"step": 1147
},
{
"epoch": 0.75,
"grad_norm": 0.07985897362232208,
"learning_rate": 0.00014057637482526922,
"loss": 1.1659,
"step": 1148
},
{
"epoch": 0.75,
"grad_norm": 0.10022406280040741,
"learning_rate": 0.0001404821799160004,
"loss": 1.2686,
"step": 1149
},
{
"epoch": 0.75,
"grad_norm": 0.08160050213336945,
"learning_rate": 0.00014038794202847954,
"loss": 1.0361,
"step": 1150
},
{
"epoch": 0.75,
"grad_norm": 0.08739824593067169,
"learning_rate": 0.000140293661262755,
"loss": 1.1512,
"step": 1151
},
{
"epoch": 0.75,
"grad_norm": 0.10146929323673248,
"learning_rate": 0.00014019933771892084,
"loss": 1.3,
"step": 1152
},
{
"epoch": 0.75,
"grad_norm": 0.10179619491100311,
"learning_rate": 0.00014010497149711642,
"loss": 1.4097,
"step": 1153
},
{
"epoch": 0.75,
"grad_norm": 0.08360686153173447,
"learning_rate": 0.00014001056269752644,
"loss": 1.1058,
"step": 1154
},
{
"epoch": 0.75,
"grad_norm": 0.0967346802353859,
"learning_rate": 0.0001399161114203808,
"loss": 1.1717,
"step": 1155
},
{
"epoch": 0.75,
"grad_norm": 0.09379181265830994,
"learning_rate": 0.00013982161776595456,
"loss": 1.1818,
"step": 1156
},
{
"epoch": 0.75,
"grad_norm": 0.09329712390899658,
"learning_rate": 0.00013972708183456766,
"loss": 0.8509,
"step": 1157
},
{
"epoch": 0.75,
"grad_norm": 0.08215396851301193,
"learning_rate": 0.000139632503726585,
"loss": 1.1277,
"step": 1158
},
{
"epoch": 0.75,
"grad_norm": 0.09043752402067184,
"learning_rate": 0.00013953788354241622,
"loss": 1.4025,
"step": 1159
},
{
"epoch": 0.75,
"grad_norm": 0.09531474113464355,
"learning_rate": 0.0001394432213825157,
"loss": 1.0859,
"step": 1160
},
{
"epoch": 0.75,
"grad_norm": 0.09292061626911163,
"learning_rate": 0.00013934851734738221,
"loss": 0.9693,
"step": 1161
},
{
"epoch": 0.75,
"grad_norm": 0.08968769013881683,
"learning_rate": 0.00013925377153755925,
"loss": 1.1753,
"step": 1162
},
{
"epoch": 0.76,
"grad_norm": 0.0891139954328537,
"learning_rate": 0.00013915898405363443,
"loss": 1.1893,
"step": 1163
},
{
"epoch": 0.76,
"grad_norm": 0.09048478305339813,
"learning_rate": 0.00013906415499623972,
"loss": 1.0884,
"step": 1164
},
{
"epoch": 0.76,
"grad_norm": 0.09769124537706375,
"learning_rate": 0.00013896928446605113,
"loss": 1.0972,
"step": 1165
},
{
"epoch": 0.76,
"grad_norm": 0.07943381369113922,
"learning_rate": 0.0001388743725637889,
"loss": 1.2175,
"step": 1166
},
{
"epoch": 0.76,
"grad_norm": 0.08798827230930328,
"learning_rate": 0.00013877941939021695,
"loss": 1.3913,
"step": 1167
},
{
"epoch": 0.76,
"grad_norm": 0.09645213931798935,
"learning_rate": 0.00013868442504614317,
"loss": 1.1566,
"step": 1168
},
{
"epoch": 0.76,
"grad_norm": 0.0855674147605896,
"learning_rate": 0.0001385893896324191,
"loss": 1.0702,
"step": 1169
},
{
"epoch": 0.76,
"grad_norm": 0.07673133909702301,
"learning_rate": 0.00013849431324993992,
"loss": 1.2589,
"step": 1170
},
{
"epoch": 0.76,
"grad_norm": 0.0823918953537941,
"learning_rate": 0.0001383991959996443,
"loss": 1.0888,
"step": 1171
},
{
"epoch": 0.76,
"grad_norm": 0.10688403993844986,
"learning_rate": 0.00013830403798251422,
"loss": 1.4245,
"step": 1172
},
{
"epoch": 0.76,
"grad_norm": 0.09957115352153778,
"learning_rate": 0.00013820883929957503,
"loss": 1.2257,
"step": 1173
},
{
"epoch": 0.76,
"grad_norm": 0.09672617167234421,
"learning_rate": 0.00013811360005189525,
"loss": 1.3465,
"step": 1174
},
{
"epoch": 0.76,
"grad_norm": 0.08894725143909454,
"learning_rate": 0.00013801832034058645,
"loss": 1.1642,
"step": 1175
},
{
"epoch": 0.76,
"grad_norm": 0.08050256967544556,
"learning_rate": 0.0001379230002668031,
"loss": 0.989,
"step": 1176
},
{
"epoch": 0.76,
"grad_norm": 0.08343324065208435,
"learning_rate": 0.00013782763993174259,
"loss": 1.3053,
"step": 1177
},
{
"epoch": 0.77,
"grad_norm": 0.08665145933628082,
"learning_rate": 0.00013773223943664505,
"loss": 0.9861,
"step": 1178
},
{
"epoch": 0.77,
"grad_norm": 0.08662264049053192,
"learning_rate": 0.0001376367988827932,
"loss": 1.0238,
"step": 1179
},
{
"epoch": 0.77,
"grad_norm": 0.0902816653251648,
"learning_rate": 0.00013754131837151234,
"loss": 1.1862,
"step": 1180
},
{
"epoch": 0.77,
"grad_norm": 0.08907479047775269,
"learning_rate": 0.00013744579800417016,
"loss": 0.9641,
"step": 1181
},
{
"epoch": 0.77,
"grad_norm": 0.09430835396051407,
"learning_rate": 0.00013735023788217672,
"loss": 1.0027,
"step": 1182
},
{
"epoch": 0.77,
"grad_norm": 0.0847993716597557,
"learning_rate": 0.00013725463810698417,
"loss": 0.9738,
"step": 1183
},
{
"epoch": 0.77,
"grad_norm": 0.0929441973567009,
"learning_rate": 0.00013715899878008687,
"loss": 1.2102,
"step": 1184
},
{
"epoch": 0.77,
"grad_norm": 0.07598242908716202,
"learning_rate": 0.00013706332000302108,
"loss": 1.0273,
"step": 1185
},
{
"epoch": 0.77,
"grad_norm": 0.09799207001924515,
"learning_rate": 0.00013696760187736508,
"loss": 1.0714,
"step": 1186
},
{
"epoch": 0.77,
"grad_norm": 0.10814066231250763,
"learning_rate": 0.00013687184450473876,
"loss": 1.3199,
"step": 1187
},
{
"epoch": 0.77,
"grad_norm": 0.07920951396226883,
"learning_rate": 0.0001367760479868038,
"loss": 1.1849,
"step": 1188
},
{
"epoch": 0.77,
"grad_norm": 0.08429884910583496,
"learning_rate": 0.00013668021242526333,
"loss": 1.3284,
"step": 1189
},
{
"epoch": 0.77,
"grad_norm": 0.09833259880542755,
"learning_rate": 0.00013658433792186205,
"loss": 1.3027,
"step": 1190
},
{
"epoch": 0.77,
"grad_norm": 0.08972413092851639,
"learning_rate": 0.00013648842457838592,
"loss": 1.1809,
"step": 1191
},
{
"epoch": 0.77,
"grad_norm": 0.07582113891839981,
"learning_rate": 0.00013639247249666218,
"loss": 1.1759,
"step": 1192
},
{
"epoch": 0.77,
"grad_norm": 0.07063476741313934,
"learning_rate": 0.00013629648177855916,
"loss": 0.9762,
"step": 1193
},
{
"epoch": 0.78,
"grad_norm": 0.08549682050943375,
"learning_rate": 0.00013620045252598622,
"loss": 1.1447,
"step": 1194
},
{
"epoch": 0.78,
"grad_norm": 0.09319033473730087,
"learning_rate": 0.00013610438484089365,
"loss": 1.1186,
"step": 1195
},
{
"epoch": 0.78,
"grad_norm": 0.08497250080108643,
"learning_rate": 0.00013600827882527254,
"loss": 1.2636,
"step": 1196
},
{
"epoch": 0.78,
"grad_norm": 0.0931629091501236,
"learning_rate": 0.0001359121345811546,
"loss": 1.042,
"step": 1197
},
{
"epoch": 0.78,
"grad_norm": 0.09610223025083542,
"learning_rate": 0.00013581595221061227,
"loss": 1.1181,
"step": 1198
},
{
"epoch": 0.78,
"grad_norm": 0.08825349062681198,
"learning_rate": 0.00013571973181575835,
"loss": 1.0396,
"step": 1199
},
{
"epoch": 0.78,
"grad_norm": 0.08976442366838455,
"learning_rate": 0.00013562347349874604,
"loss": 1.3945,
"step": 1200
},
{
"epoch": 0.78,
"grad_norm": 0.09342648833990097,
"learning_rate": 0.00013552717736176878,
"loss": 1.2207,
"step": 1201
},
{
"epoch": 0.78,
"grad_norm": 0.08452824503183365,
"learning_rate": 0.00013543084350706028,
"loss": 1.0522,
"step": 1202
},
{
"epoch": 0.78,
"grad_norm": 0.08465161174535751,
"learning_rate": 0.00013533447203689409,
"loss": 1.1641,
"step": 1203
},
{
"epoch": 0.78,
"grad_norm": 0.07297605276107788,
"learning_rate": 0.00013523806305358385,
"loss": 0.9522,
"step": 1204
},
{
"epoch": 0.78,
"grad_norm": 0.11933207511901855,
"learning_rate": 0.00013514161665948297,
"loss": 1.1795,
"step": 1205
},
{
"epoch": 0.78,
"grad_norm": 0.0859321653842926,
"learning_rate": 0.00013504513295698462,
"loss": 1.1732,
"step": 1206
},
{
"epoch": 0.78,
"grad_norm": 0.0957307368516922,
"learning_rate": 0.0001349486120485215,
"loss": 1.2965,
"step": 1207
},
{
"epoch": 0.78,
"grad_norm": 0.09841260313987732,
"learning_rate": 0.00013485205403656591,
"loss": 1.2246,
"step": 1208
},
{
"epoch": 0.79,
"grad_norm": 0.07643883675336838,
"learning_rate": 0.00013475545902362943,
"loss": 1.2657,
"step": 1209
},
{
"epoch": 0.79,
"grad_norm": 0.09062516689300537,
"learning_rate": 0.00013465882711226302,
"loss": 1.1998,
"step": 1210
},
{
"epoch": 0.79,
"grad_norm": 0.0803908184170723,
"learning_rate": 0.00013456215840505678,
"loss": 1.0853,
"step": 1211
},
{
"epoch": 0.79,
"grad_norm": 0.07891444116830826,
"learning_rate": 0.00013446545300463986,
"loss": 1.0384,
"step": 1212
},
{
"epoch": 0.79,
"grad_norm": 0.0776372179389,
"learning_rate": 0.00013436871101368033,
"loss": 1.0959,
"step": 1213
},
{
"epoch": 0.79,
"grad_norm": 0.08687058836221695,
"learning_rate": 0.0001342719325348852,
"loss": 1.1569,
"step": 1214
},
{
"epoch": 0.79,
"grad_norm": 0.0908912867307663,
"learning_rate": 0.00013417511767100016,
"loss": 1.2967,
"step": 1215
},
{
"epoch": 0.79,
"grad_norm": 0.086028091609478,
"learning_rate": 0.00013407826652480956,
"loss": 1.2632,
"step": 1216
},
{
"epoch": 0.79,
"grad_norm": 0.0805966779589653,
"learning_rate": 0.00013398137919913618,
"loss": 1.0845,
"step": 1217
},
{
"epoch": 0.79,
"grad_norm": 0.08439763635396957,
"learning_rate": 0.00013388445579684134,
"loss": 1.2389,
"step": 1218
},
{
"epoch": 0.79,
"grad_norm": 0.09502055495977402,
"learning_rate": 0.00013378749642082457,
"loss": 1.4107,
"step": 1219
},
{
"epoch": 0.79,
"grad_norm": 0.08694536983966827,
"learning_rate": 0.00013369050117402362,
"loss": 1.1534,
"step": 1220
},
{
"epoch": 0.79,
"grad_norm": 0.0910310447216034,
"learning_rate": 0.00013359347015941432,
"loss": 1.4174,
"step": 1221
},
{
"epoch": 0.79,
"grad_norm": 0.08172550797462463,
"learning_rate": 0.00013349640348001054,
"loss": 1.2574,
"step": 1222
},
{
"epoch": 0.79,
"grad_norm": 0.07421483844518661,
"learning_rate": 0.00013339930123886382,
"loss": 1.0209,
"step": 1223
},
{
"epoch": 0.8,
"grad_norm": 0.0931035578250885,
"learning_rate": 0.00013330216353906368,
"loss": 1.2249,
"step": 1224
},
{
"epoch": 0.8,
"grad_norm": 0.08503065258264542,
"learning_rate": 0.00013320499048373718,
"loss": 1.1302,
"step": 1225
},
{
"epoch": 0.8,
"grad_norm": 0.09309668093919754,
"learning_rate": 0.00013310778217604888,
"loss": 1.2408,
"step": 1226
},
{
"epoch": 0.8,
"grad_norm": 0.06965488940477371,
"learning_rate": 0.00013301053871920087,
"loss": 1.1801,
"step": 1227
},
{
"epoch": 0.8,
"grad_norm": 0.08761877566576004,
"learning_rate": 0.00013291326021643246,
"loss": 0.8351,
"step": 1228
},
{
"epoch": 0.8,
"grad_norm": 0.09199398010969162,
"learning_rate": 0.0001328159467710202,
"loss": 0.944,
"step": 1229
},
{
"epoch": 0.8,
"grad_norm": 0.08364813774824142,
"learning_rate": 0.00013271859848627772,
"loss": 1.0332,
"step": 1230
},
{
"epoch": 0.8,
"grad_norm": 0.09218093007802963,
"learning_rate": 0.00013262121546555572,
"loss": 1.3183,
"step": 1231
},
{
"epoch": 0.8,
"grad_norm": 0.11060269176959991,
"learning_rate": 0.0001325237978122417,
"loss": 1.1082,
"step": 1232
},
{
"epoch": 0.8,
"grad_norm": 0.09054608643054962,
"learning_rate": 0.0001324263456297599,
"loss": 1.1122,
"step": 1233
},
{
"epoch": 0.8,
"grad_norm": 0.0973866879940033,
"learning_rate": 0.0001323288590215713,
"loss": 0.9585,
"step": 1234
},
{
"epoch": 0.8,
"grad_norm": 0.11306141316890717,
"learning_rate": 0.00013223133809117337,
"loss": 1.186,
"step": 1235
},
{
"epoch": 0.8,
"grad_norm": 0.11251191049814224,
"learning_rate": 0.00013213378294210006,
"loss": 1.2047,
"step": 1236
},
{
"epoch": 0.8,
"grad_norm": 0.09960142523050308,
"learning_rate": 0.00013203619367792158,
"loss": 1.2015,
"step": 1237
},
{
"epoch": 0.8,
"grad_norm": 0.08028863370418549,
"learning_rate": 0.0001319385704022445,
"loss": 1.0966,
"step": 1238
},
{
"epoch": 0.8,
"grad_norm": 0.08503750711679459,
"learning_rate": 0.00013184091321871133,
"loss": 1.2161,
"step": 1239
},
{
"epoch": 0.81,
"grad_norm": 0.09345296025276184,
"learning_rate": 0.0001317432222310006,
"loss": 1.278,
"step": 1240
},
{
"epoch": 0.81,
"grad_norm": 0.08547773957252502,
"learning_rate": 0.00013164549754282693,
"loss": 1.0141,
"step": 1241
},
{
"epoch": 0.81,
"grad_norm": 0.09492174535989761,
"learning_rate": 0.0001315477392579405,
"loss": 1.0201,
"step": 1242
},
{
"epoch": 0.81,
"grad_norm": 0.08412059396505356,
"learning_rate": 0.00013144994748012713,
"loss": 1.3892,
"step": 1243
},
{
"epoch": 0.81,
"grad_norm": 0.08974771201610565,
"learning_rate": 0.00013135212231320847,
"loss": 0.8927,
"step": 1244
},
{
"epoch": 0.81,
"grad_norm": 0.09787634760141373,
"learning_rate": 0.0001312542638610413,
"loss": 1.3731,
"step": 1245
},
{
"epoch": 0.81,
"grad_norm": 0.08702712506055832,
"learning_rate": 0.000131156372227518,
"loss": 1.1564,
"step": 1246
},
{
"epoch": 0.81,
"grad_norm": 0.12134439498186111,
"learning_rate": 0.00013105844751656594,
"loss": 1.2847,
"step": 1247
},
{
"epoch": 0.81,
"grad_norm": 0.08712608367204666,
"learning_rate": 0.0001309604898321478,
"loss": 1.1427,
"step": 1248
},
{
"epoch": 0.81,
"grad_norm": 0.08822109550237656,
"learning_rate": 0.00013086249927826119,
"loss": 1.1056,
"step": 1249
},
{
"epoch": 0.81,
"grad_norm": 0.10544507950544357,
"learning_rate": 0.00013076447595893859,
"loss": 1.5419,
"step": 1250
},
{
"epoch": 0.81,
"grad_norm": 0.0898931622505188,
"learning_rate": 0.00013066641997824734,
"loss": 1.2817,
"step": 1251
},
{
"epoch": 0.81,
"grad_norm": 0.09420999884605408,
"learning_rate": 0.00013056833144028935,
"loss": 1.1529,
"step": 1252
},
{
"epoch": 0.81,
"grad_norm": 0.10036417096853256,
"learning_rate": 0.00013047021044920119,
"loss": 1.3166,
"step": 1253
},
{
"epoch": 0.81,
"grad_norm": 0.11062490195035934,
"learning_rate": 0.00013037205710915382,
"loss": 1.2535,
"step": 1254
},
{
"epoch": 0.82,
"grad_norm": 0.09266883134841919,
"learning_rate": 0.00013027387152435266,
"loss": 1.1405,
"step": 1255
},
{
"epoch": 0.82,
"grad_norm": 0.09206977486610413,
"learning_rate": 0.00013017565379903716,
"loss": 1.2001,
"step": 1256
},
{
"epoch": 0.82,
"grad_norm": 0.09695585072040558,
"learning_rate": 0.00013007740403748108,
"loss": 1.0446,
"step": 1257
},
{
"epoch": 0.82,
"grad_norm": 0.08620952814817429,
"learning_rate": 0.00012997912234399204,
"loss": 1.1169,
"step": 1258
},
{
"epoch": 0.82,
"grad_norm": 0.08978530019521713,
"learning_rate": 0.00012988080882291175,
"loss": 1.3111,
"step": 1259
},
{
"epoch": 0.82,
"grad_norm": 0.10054924339056015,
"learning_rate": 0.00012978246357861552,
"loss": 1.006,
"step": 1260
},
{
"epoch": 0.82,
"grad_norm": 0.09115590155124664,
"learning_rate": 0.00012968408671551246,
"loss": 1.3212,
"step": 1261
},
{
"epoch": 0.82,
"grad_norm": 0.07982508838176727,
"learning_rate": 0.00012958567833804517,
"loss": 0.9031,
"step": 1262
},
{
"epoch": 0.82,
"grad_norm": 0.0962832123041153,
"learning_rate": 0.00012948723855068977,
"loss": 1.2286,
"step": 1263
},
{
"epoch": 0.82,
"grad_norm": 0.0861852616071701,
"learning_rate": 0.0001293887674579557,
"loss": 1.287,
"step": 1264
},
{
"epoch": 0.82,
"grad_norm": 0.08311343193054199,
"learning_rate": 0.00012929026516438562,
"loss": 1.0399,
"step": 1265
},
{
"epoch": 0.82,
"grad_norm": 0.09084224700927734,
"learning_rate": 0.00012919173177455533,
"loss": 1.0108,
"step": 1266
},
{
"epoch": 0.82,
"grad_norm": 0.09911910444498062,
"learning_rate": 0.00012909316739307368,
"loss": 1.2692,
"step": 1267
},
{
"epoch": 0.82,
"grad_norm": 0.08711926639080048,
"learning_rate": 0.00012899457212458233,
"loss": 1.0833,
"step": 1268
},
{
"epoch": 0.82,
"grad_norm": 0.08221635222434998,
"learning_rate": 0.00012889594607375587,
"loss": 1.2175,
"step": 1269
},
{
"epoch": 0.82,
"grad_norm": 0.10001187771558762,
"learning_rate": 0.00012879728934530143,
"loss": 1.2152,
"step": 1270
},
{
"epoch": 0.83,
"grad_norm": 0.09377269446849823,
"learning_rate": 0.00012869860204395877,
"loss": 1.1532,
"step": 1271
},
{
"epoch": 0.83,
"grad_norm": 0.08989045768976212,
"learning_rate": 0.0001285998842745001,
"loss": 1.237,
"step": 1272
},
{
"epoch": 0.83,
"grad_norm": 0.11527752131223679,
"learning_rate": 0.00012850113614173005,
"loss": 1.3364,
"step": 1273
},
{
"epoch": 0.83,
"grad_norm": 0.08604636043310165,
"learning_rate": 0.00012840235775048533,
"loss": 1.0706,
"step": 1274
},
{
"epoch": 0.83,
"grad_norm": 0.07703463733196259,
"learning_rate": 0.00012830354920563493,
"loss": 1.1714,
"step": 1275
},
{
"epoch": 0.83,
"grad_norm": 0.08045919984579086,
"learning_rate": 0.0001282047106120797,
"loss": 1.0775,
"step": 1276
},
{
"epoch": 0.83,
"grad_norm": 0.09717388451099396,
"learning_rate": 0.0001281058420747526,
"loss": 1.2014,
"step": 1277
},
{
"epoch": 0.83,
"grad_norm": 0.09249131381511688,
"learning_rate": 0.00012800694369861817,
"loss": 1.4177,
"step": 1278
},
{
"epoch": 0.83,
"grad_norm": 0.09153535962104797,
"learning_rate": 0.00012790801558867272,
"loss": 1.2693,
"step": 1279
},
{
"epoch": 0.83,
"grad_norm": 0.08211056888103485,
"learning_rate": 0.00012780905784994418,
"loss": 1.0372,
"step": 1280
},
{
"epoch": 0.83,
"grad_norm": 0.08809658139944077,
"learning_rate": 0.00012771007058749183,
"loss": 1.277,
"step": 1281
},
{
"epoch": 0.83,
"grad_norm": 0.0787929818034172,
"learning_rate": 0.0001276110539064063,
"loss": 1.0201,
"step": 1282
},
{
"epoch": 0.83,
"grad_norm": 0.08550479263067245,
"learning_rate": 0.00012751200791180962,
"loss": 0.9998,
"step": 1283
},
{
"epoch": 0.83,
"grad_norm": 0.09406717866659164,
"learning_rate": 0.00012741293270885468,
"loss": 1.2511,
"step": 1284
},
{
"epoch": 0.83,
"grad_norm": 0.0858648419380188,
"learning_rate": 0.00012731382840272564,
"loss": 1.2468,
"step": 1285
},
{
"epoch": 0.84,
"grad_norm": 0.07706280052661896,
"learning_rate": 0.0001272146950986373,
"loss": 1.1974,
"step": 1286
},
{
"epoch": 0.84,
"grad_norm": 0.08311185240745544,
"learning_rate": 0.00012711553290183553,
"loss": 1.0518,
"step": 1287
},
{
"epoch": 0.84,
"grad_norm": 0.08380080759525299,
"learning_rate": 0.0001270163419175966,
"loss": 1.1027,
"step": 1288
},
{
"epoch": 0.84,
"grad_norm": 0.08663756400346756,
"learning_rate": 0.00012691712225122752,
"loss": 1.2496,
"step": 1289
},
{
"epoch": 0.84,
"grad_norm": 0.08722782135009766,
"learning_rate": 0.00012681787400806567,
"loss": 1.3418,
"step": 1290
},
{
"epoch": 0.84,
"grad_norm": 0.08794383704662323,
"learning_rate": 0.00012671859729347884,
"loss": 1.0888,
"step": 1291
},
{
"epoch": 0.84,
"grad_norm": 0.08084482699632645,
"learning_rate": 0.00012661929221286492,
"loss": 1.0297,
"step": 1292
},
{
"epoch": 0.84,
"grad_norm": 0.0863770842552185,
"learning_rate": 0.00012651995887165212,
"loss": 1.2373,
"step": 1293
},
{
"epoch": 0.84,
"grad_norm": 0.08851886540651321,
"learning_rate": 0.00012642059737529842,
"loss": 1.2889,
"step": 1294
},
{
"epoch": 0.84,
"grad_norm": 0.10227511078119278,
"learning_rate": 0.00012632120782929185,
"loss": 1.1617,
"step": 1295
},
{
"epoch": 0.84,
"grad_norm": 0.09257698059082031,
"learning_rate": 0.00012622179033915015,
"loss": 1.2073,
"step": 1296
},
{
"epoch": 0.84,
"grad_norm": 0.10253733396530151,
"learning_rate": 0.00012612234501042078,
"loss": 1.1484,
"step": 1297
},
{
"epoch": 0.84,
"grad_norm": 0.09285663813352585,
"learning_rate": 0.00012602287194868073,
"loss": 1.0529,
"step": 1298
},
{
"epoch": 0.84,
"grad_norm": 0.08632886409759521,
"learning_rate": 0.00012592337125953633,
"loss": 1.1751,
"step": 1299
},
{
"epoch": 0.84,
"grad_norm": 0.0837155357003212,
"learning_rate": 0.00012582384304862346,
"loss": 1.0312,
"step": 1300
},
{
"epoch": 0.85,
"grad_norm": 0.07497237622737885,
"learning_rate": 0.000125724287421607,
"loss": 1.0438,
"step": 1301
},
{
"epoch": 0.85,
"grad_norm": 0.08276520669460297,
"learning_rate": 0.00012562470448418108,
"loss": 0.9125,
"step": 1302
},
{
"epoch": 0.85,
"grad_norm": 0.08902527391910553,
"learning_rate": 0.0001255250943420688,
"loss": 1.2557,
"step": 1303
},
{
"epoch": 0.85,
"grad_norm": 0.10572145134210587,
"learning_rate": 0.00012542545710102207,
"loss": 1.4523,
"step": 1304
},
{
"epoch": 0.85,
"grad_norm": 0.08492980897426605,
"learning_rate": 0.00012532579286682162,
"loss": 1.2162,
"step": 1305
},
{
"epoch": 0.85,
"grad_norm": 0.09150967746973038,
"learning_rate": 0.00012522610174527685,
"loss": 1.0517,
"step": 1306
},
{
"epoch": 0.85,
"grad_norm": 0.10158524662256241,
"learning_rate": 0.00012512638384222573,
"loss": 1.0725,
"step": 1307
},
{
"epoch": 0.85,
"grad_norm": 0.08061851561069489,
"learning_rate": 0.00012502663926353456,
"loss": 0.9771,
"step": 1308
},
{
"epoch": 0.85,
"grad_norm": 0.11557295173406601,
"learning_rate": 0.000124926868115098,
"loss": 1.3486,
"step": 1309
},
{
"epoch": 0.85,
"grad_norm": 0.0924990326166153,
"learning_rate": 0.00012482707050283907,
"loss": 1.1827,
"step": 1310
},
{
"epoch": 0.85,
"grad_norm": 0.08090729266405106,
"learning_rate": 0.00012472724653270862,
"loss": 1.1164,
"step": 1311
},
{
"epoch": 0.85,
"grad_norm": 0.10747341811656952,
"learning_rate": 0.0001246273963106857,
"loss": 1.2252,
"step": 1312
},
{
"epoch": 0.85,
"grad_norm": 0.10453370213508606,
"learning_rate": 0.00012452751994277713,
"loss": 1.2535,
"step": 1313
},
{
"epoch": 0.85,
"grad_norm": 0.09473150223493576,
"learning_rate": 0.0001244276175350175,
"loss": 1.2502,
"step": 1314
},
{
"epoch": 0.85,
"grad_norm": 0.10395807027816772,
"learning_rate": 0.00012432768919346906,
"loss": 1.135,
"step": 1315
},
{
"epoch": 0.85,
"grad_norm": 0.08586447685956955,
"learning_rate": 0.0001242277350242216,
"loss": 1.1256,
"step": 1316
},
{
"epoch": 0.86,
"grad_norm": 0.09255239367485046,
"learning_rate": 0.0001241277551333923,
"loss": 1.1852,
"step": 1317
},
{
"epoch": 0.86,
"grad_norm": 0.10878726840019226,
"learning_rate": 0.0001240277496271257,
"loss": 1.1988,
"step": 1318
},
{
"epoch": 0.86,
"grad_norm": 0.08552098274230957,
"learning_rate": 0.00012392771861159346,
"loss": 1.1542,
"step": 1319
},
{
"epoch": 0.86,
"grad_norm": 0.09766525030136108,
"learning_rate": 0.00012382766219299436,
"loss": 1.0728,
"step": 1320
},
{
"epoch": 0.86,
"grad_norm": 0.09022209048271179,
"learning_rate": 0.00012372758047755414,
"loss": 1.1071,
"step": 1321
},
{
"epoch": 0.86,
"grad_norm": 0.08056436479091644,
"learning_rate": 0.0001236274735715255,
"loss": 1.1397,
"step": 1322
},
{
"epoch": 0.86,
"grad_norm": 0.09278228878974915,
"learning_rate": 0.0001235273415811877,
"loss": 1.2702,
"step": 1323
},
{
"epoch": 0.86,
"grad_norm": 0.08561549335718155,
"learning_rate": 0.00012342718461284672,
"loss": 1.1927,
"step": 1324
},
{
"epoch": 0.86,
"grad_norm": 0.10450749844312668,
"learning_rate": 0.0001233270027728351,
"loss": 1.3358,
"step": 1325
},
{
"epoch": 0.86,
"grad_norm": 0.09876471757888794,
"learning_rate": 0.00012322679616751176,
"loss": 1.3257,
"step": 1326
},
{
"epoch": 0.86,
"grad_norm": 0.09191320091485977,
"learning_rate": 0.00012312656490326188,
"loss": 1.1142,
"step": 1327
},
{
"epoch": 0.86,
"grad_norm": 0.08982070535421371,
"learning_rate": 0.00012302630908649678,
"loss": 1.204,
"step": 1328
},
{
"epoch": 0.86,
"grad_norm": 0.08208096027374268,
"learning_rate": 0.000122926028823654,
"loss": 1.229,
"step": 1329
},
{
"epoch": 0.86,
"grad_norm": 0.10138587653636932,
"learning_rate": 0.0001228257242211969,
"loss": 1.2986,
"step": 1330
},
{
"epoch": 0.86,
"grad_norm": 0.08681602030992508,
"learning_rate": 0.00012272539538561467,
"loss": 1.3729,
"step": 1331
},
{
"epoch": 0.87,
"grad_norm": 0.09762348979711533,
"learning_rate": 0.00012262504242342235,
"loss": 1.0962,
"step": 1332
},
{
"epoch": 0.87,
"grad_norm": 0.07970700412988663,
"learning_rate": 0.00012252466544116048,
"loss": 1.2618,
"step": 1333
},
{
"epoch": 0.87,
"grad_norm": 0.09303802251815796,
"learning_rate": 0.00012242426454539516,
"loss": 1.0158,
"step": 1334
},
{
"epoch": 0.87,
"grad_norm": 0.08835624158382416,
"learning_rate": 0.00012232383984271783,
"loss": 1.2125,
"step": 1335
},
{
"epoch": 0.87,
"grad_norm": 0.10848595201969147,
"learning_rate": 0.00012222339143974523,
"loss": 1.2646,
"step": 1336
},
{
"epoch": 0.87,
"grad_norm": 0.11647682636976242,
"learning_rate": 0.00012212291944311933,
"loss": 1.3383,
"step": 1337
},
{
"epoch": 0.87,
"grad_norm": 0.08612092584371567,
"learning_rate": 0.000122022423959507,
"loss": 1.1868,
"step": 1338
},
{
"epoch": 0.87,
"grad_norm": 0.10143036395311356,
"learning_rate": 0.0001219219050956002,
"loss": 1.2887,
"step": 1339
},
{
"epoch": 0.87,
"grad_norm": 0.09681069850921631,
"learning_rate": 0.00012182136295811557,
"loss": 1.2763,
"step": 1340
},
{
"epoch": 0.87,
"grad_norm": 0.08099117130041122,
"learning_rate": 0.00012172079765379458,
"loss": 1.0453,
"step": 1341
},
{
"epoch": 0.87,
"grad_norm": 0.08035444468259811,
"learning_rate": 0.00012162020928940324,
"loss": 1.1917,
"step": 1342
},
{
"epoch": 0.87,
"grad_norm": 0.08961553871631622,
"learning_rate": 0.00012151959797173205,
"loss": 1.069,
"step": 1343
},
{
"epoch": 0.87,
"grad_norm": 0.1157972514629364,
"learning_rate": 0.00012141896380759581,
"loss": 1.2185,
"step": 1344
},
{
"epoch": 0.87,
"grad_norm": 0.10159555077552795,
"learning_rate": 0.00012131830690383375,
"loss": 1.2967,
"step": 1345
},
{
"epoch": 0.87,
"grad_norm": 0.09573571383953094,
"learning_rate": 0.00012121762736730904,
"loss": 1.2887,
"step": 1346
},
{
"epoch": 0.87,
"grad_norm": 0.0860888808965683,
"learning_rate": 0.00012111692530490899,
"loss": 0.8751,
"step": 1347
},
{
"epoch": 0.88,
"grad_norm": 0.16526953876018524,
"learning_rate": 0.00012101620082354484,
"loss": 0.9683,
"step": 1348
},
{
"epoch": 0.88,
"grad_norm": 0.0962710753083229,
"learning_rate": 0.00012091545403015159,
"loss": 0.9592,
"step": 1349
},
{
"epoch": 0.88,
"grad_norm": 0.0951792299747467,
"learning_rate": 0.0001208146850316879,
"loss": 0.9172,
"step": 1350
},
{
"epoch": 0.88,
"grad_norm": 0.0804206058382988,
"learning_rate": 0.00012071389393513603,
"loss": 1.0321,
"step": 1351
},
{
"epoch": 0.88,
"grad_norm": 0.08747891336679459,
"learning_rate": 0.00012061308084750175,
"loss": 1.022,
"step": 1352
},
{
"epoch": 0.88,
"grad_norm": 0.10773107409477234,
"learning_rate": 0.00012051224587581416,
"loss": 1.2364,
"step": 1353
},
{
"epoch": 0.88,
"grad_norm": 0.08635377138853073,
"learning_rate": 0.00012041138912712546,
"loss": 1.2689,
"step": 1354
},
{
"epoch": 0.88,
"grad_norm": 0.08952300995588303,
"learning_rate": 0.0001203105107085112,
"loss": 0.8106,
"step": 1355
},
{
"epoch": 0.88,
"grad_norm": 0.11334968358278275,
"learning_rate": 0.00012020961072706973,
"loss": 1.1669,
"step": 1356
},
{
"epoch": 0.88,
"grad_norm": 0.10848329216241837,
"learning_rate": 0.00012010868928992243,
"loss": 1.0589,
"step": 1357
},
{
"epoch": 0.88,
"grad_norm": 0.1133638545870781,
"learning_rate": 0.00012000774650421336,
"loss": 1.3081,
"step": 1358
},
{
"epoch": 0.88,
"grad_norm": 0.08815950900316238,
"learning_rate": 0.00011990678247710935,
"loss": 0.9832,
"step": 1359
},
{
"epoch": 0.88,
"grad_norm": 0.08206349611282349,
"learning_rate": 0.00011980579731579966,
"loss": 1.2696,
"step": 1360
},
{
"epoch": 0.88,
"grad_norm": 0.08839456737041473,
"learning_rate": 0.00011970479112749609,
"loss": 1.0613,
"step": 1361
},
{
"epoch": 0.88,
"grad_norm": 0.08356121927499771,
"learning_rate": 0.0001196037640194327,
"loss": 1.1493,
"step": 1362
},
{
"epoch": 0.89,
"grad_norm": 0.07834324240684509,
"learning_rate": 0.0001195027160988658,
"loss": 1.2681,
"step": 1363
},
{
"epoch": 0.89,
"grad_norm": 0.0795789584517479,
"learning_rate": 0.00011940164747307374,
"loss": 1.0486,
"step": 1364
},
{
"epoch": 0.89,
"grad_norm": 0.22570103406906128,
"learning_rate": 0.00011930055824935695,
"loss": 1.1241,
"step": 1365
},
{
"epoch": 0.89,
"grad_norm": 0.08403278887271881,
"learning_rate": 0.00011919944853503764,
"loss": 1.2673,
"step": 1366
},
{
"epoch": 0.89,
"grad_norm": 0.08180595189332962,
"learning_rate": 0.00011909831843745978,
"loss": 1.1177,
"step": 1367
},
{
"epoch": 0.89,
"grad_norm": 0.10031206160783768,
"learning_rate": 0.00011899716806398902,
"loss": 1.0852,
"step": 1368
},
{
"epoch": 0.89,
"grad_norm": 0.0997086688876152,
"learning_rate": 0.00011889599752201255,
"loss": 0.961,
"step": 1369
},
{
"epoch": 0.89,
"grad_norm": 0.10722131282091141,
"learning_rate": 0.00011879480691893887,
"loss": 1.4097,
"step": 1370
},
{
"epoch": 0.89,
"grad_norm": 0.08794771134853363,
"learning_rate": 0.00011869359636219788,
"loss": 1.0357,
"step": 1371
},
{
"epoch": 0.89,
"grad_norm": 0.08072856813669205,
"learning_rate": 0.00011859236595924069,
"loss": 1.2738,
"step": 1372
},
{
"epoch": 0.89,
"grad_norm": 0.08724841475486755,
"learning_rate": 0.00011849111581753932,
"loss": 0.9264,
"step": 1373
},
{
"epoch": 0.89,
"grad_norm": 0.09085755795240402,
"learning_rate": 0.00011838984604458692,
"loss": 0.9303,
"step": 1374
},
{
"epoch": 0.89,
"grad_norm": 0.07917338609695435,
"learning_rate": 0.00011828855674789738,
"loss": 1.1925,
"step": 1375
},
{
"epoch": 0.89,
"grad_norm": 0.10229848325252533,
"learning_rate": 0.00011818724803500539,
"loss": 1.2509,
"step": 1376
},
{
"epoch": 0.89,
"grad_norm": 0.07423543930053711,
"learning_rate": 0.0001180859200134661,
"loss": 0.9187,
"step": 1377
},
{
"epoch": 0.9,
"grad_norm": 0.09093800187110901,
"learning_rate": 0.00011798457279085542,
"loss": 0.9583,
"step": 1378
},
{
"epoch": 0.9,
"grad_norm": 0.09684485197067261,
"learning_rate": 0.00011788320647476938,
"loss": 1.3184,
"step": 1379
},
{
"epoch": 0.9,
"grad_norm": 0.12195535749197006,
"learning_rate": 0.00011778182117282443,
"loss": 1.1374,
"step": 1380
},
{
"epoch": 0.9,
"grad_norm": 0.08829156309366226,
"learning_rate": 0.00011768041699265717,
"loss": 1.0709,
"step": 1381
},
{
"epoch": 0.9,
"grad_norm": 0.0958176925778389,
"learning_rate": 0.0001175789940419242,
"loss": 1.2382,
"step": 1382
},
{
"epoch": 0.9,
"grad_norm": 0.09135644137859344,
"learning_rate": 0.00011747755242830202,
"loss": 1.1228,
"step": 1383
},
{
"epoch": 0.9,
"grad_norm": 0.08884572982788086,
"learning_rate": 0.00011737609225948702,
"loss": 1.0929,
"step": 1384
},
{
"epoch": 0.9,
"grad_norm": 0.09734932333230972,
"learning_rate": 0.00011727461364319527,
"loss": 1.1049,
"step": 1385
},
{
"epoch": 0.9,
"grad_norm": 0.08991330862045288,
"learning_rate": 0.0001171731166871624,
"loss": 1.0455,
"step": 1386
},
{
"epoch": 0.9,
"grad_norm": 0.09676750749349594,
"learning_rate": 0.0001170716014991435,
"loss": 1.2276,
"step": 1387
},
{
"epoch": 0.9,
"grad_norm": 0.12714175879955292,
"learning_rate": 0.00011697006818691305,
"loss": 1.0119,
"step": 1388
},
{
"epoch": 0.9,
"grad_norm": 0.09282530099153519,
"learning_rate": 0.00011686851685826477,
"loss": 1.0845,
"step": 1389
},
{
"epoch": 0.9,
"grad_norm": 0.09789827466011047,
"learning_rate": 0.00011676694762101146,
"loss": 0.986,
"step": 1390
},
{
"epoch": 0.9,
"grad_norm": 0.0813322439789772,
"learning_rate": 0.00011666536058298499,
"loss": 1.0775,
"step": 1391
},
{
"epoch": 0.9,
"grad_norm": 0.1267024725675583,
"learning_rate": 0.00011656375585203614,
"loss": 1.1418,
"step": 1392
},
{
"epoch": 0.9,
"grad_norm": 0.09893757104873657,
"learning_rate": 0.00011646213353603439,
"loss": 1.2688,
"step": 1393
},
{
"epoch": 0.91,
"grad_norm": 0.08630049228668213,
"learning_rate": 0.00011636049374286795,
"loss": 1.1622,
"step": 1394
},
{
"epoch": 0.91,
"grad_norm": 0.0947432890534401,
"learning_rate": 0.00011625883658044359,
"loss": 1.1753,
"step": 1395
},
{
"epoch": 0.91,
"grad_norm": 0.09087851643562317,
"learning_rate": 0.00011615716215668651,
"loss": 0.9615,
"step": 1396
},
{
"epoch": 0.91,
"grad_norm": 0.10404366999864578,
"learning_rate": 0.00011605547057954018,
"loss": 1.2773,
"step": 1397
},
{
"epoch": 0.91,
"grad_norm": 0.09579934924840927,
"learning_rate": 0.00011595376195696641,
"loss": 1.1134,
"step": 1398
},
{
"epoch": 0.91,
"grad_norm": 0.08833558112382889,
"learning_rate": 0.00011585203639694498,
"loss": 1.1027,
"step": 1399
},
{
"epoch": 0.91,
"grad_norm": 0.08974206447601318,
"learning_rate": 0.00011575029400747368,
"loss": 0.9821,
"step": 1400
},
{
"epoch": 0.91,
"grad_norm": 0.11471564322710037,
"learning_rate": 0.00011564853489656824,
"loss": 1.2049,
"step": 1401
},
{
"epoch": 0.91,
"grad_norm": 0.07331151515245438,
"learning_rate": 0.00011554675917226208,
"loss": 0.9396,
"step": 1402
},
{
"epoch": 0.91,
"grad_norm": 0.09297305345535278,
"learning_rate": 0.0001154449669426062,
"loss": 1.2055,
"step": 1403
},
{
"epoch": 0.91,
"grad_norm": 0.09701349586248398,
"learning_rate": 0.00011534315831566926,
"loss": 1.0394,
"step": 1404
},
{
"epoch": 0.91,
"grad_norm": 0.07366570085287094,
"learning_rate": 0.00011524133339953727,
"loss": 1.1705,
"step": 1405
},
{
"epoch": 0.91,
"grad_norm": 0.10137824714183807,
"learning_rate": 0.00011513949230231347,
"loss": 1.1844,
"step": 1406
},
{
"epoch": 0.91,
"grad_norm": 0.1282086968421936,
"learning_rate": 0.00011503763513211834,
"loss": 1.3521,
"step": 1407
},
{
"epoch": 0.91,
"grad_norm": 0.10978394001722336,
"learning_rate": 0.00011493576199708945,
"loss": 1.1331,
"step": 1408
},
{
"epoch": 0.92,
"grad_norm": 0.09129098057746887,
"learning_rate": 0.00011483387300538125,
"loss": 1.1629,
"step": 1409
},
{
"epoch": 0.92,
"grad_norm": 0.10823319852352142,
"learning_rate": 0.00011473196826516504,
"loss": 1.2896,
"step": 1410
},
{
"epoch": 0.92,
"grad_norm": 0.10361718386411667,
"learning_rate": 0.0001146300478846289,
"loss": 1.3865,
"step": 1411
},
{
"epoch": 0.92,
"grad_norm": 0.09458521008491516,
"learning_rate": 0.0001145281119719775,
"loss": 1.0384,
"step": 1412
},
{
"epoch": 0.92,
"grad_norm": 0.08432728797197342,
"learning_rate": 0.00011442616063543188,
"loss": 0.9361,
"step": 1413
},
{
"epoch": 0.92,
"grad_norm": 0.10821312665939331,
"learning_rate": 0.00011432419398322962,
"loss": 1.3136,
"step": 1414
},
{
"epoch": 0.92,
"grad_norm": 0.1019124835729599,
"learning_rate": 0.00011422221212362447,
"loss": 1.1952,
"step": 1415
},
{
"epoch": 0.92,
"grad_norm": 0.08034075051546097,
"learning_rate": 0.00011412021516488634,
"loss": 1.3013,
"step": 1416
},
{
"epoch": 0.92,
"grad_norm": 0.09685003012418747,
"learning_rate": 0.00011401820321530117,
"loss": 1.2334,
"step": 1417
},
{
"epoch": 0.92,
"grad_norm": 0.09500446915626526,
"learning_rate": 0.00011391617638317083,
"loss": 1.0735,
"step": 1418
},
{
"epoch": 0.92,
"grad_norm": 0.33522164821624756,
"learning_rate": 0.00011381413477681292,
"loss": 1.1088,
"step": 1419
},
{
"epoch": 0.92,
"grad_norm": 0.09433950483798981,
"learning_rate": 0.00011371207850456087,
"loss": 1.1735,
"step": 1420
},
{
"epoch": 0.92,
"grad_norm": 0.0943494364619255,
"learning_rate": 0.00011361000767476353,
"loss": 1.0166,
"step": 1421
},
{
"epoch": 0.92,
"grad_norm": 0.08994188159704208,
"learning_rate": 0.00011350792239578533,
"loss": 1.4057,
"step": 1422
},
{
"epoch": 0.92,
"grad_norm": 0.08347003906965256,
"learning_rate": 0.00011340582277600588,
"loss": 1.0144,
"step": 1423
},
{
"epoch": 0.92,
"grad_norm": 0.08465338498353958,
"learning_rate": 0.00011330370892382022,
"loss": 1.1763,
"step": 1424
},
{
"epoch": 0.93,
"grad_norm": 0.0895058661699295,
"learning_rate": 0.00011320158094763833,
"loss": 1.1092,
"step": 1425
},
{
"epoch": 0.93,
"grad_norm": 0.09917373210191727,
"learning_rate": 0.00011309943895588527,
"loss": 1.1238,
"step": 1426
},
{
"epoch": 0.93,
"grad_norm": 0.0842377170920372,
"learning_rate": 0.00011299728305700092,
"loss": 1.0029,
"step": 1427
},
{
"epoch": 0.93,
"grad_norm": 0.08558105677366257,
"learning_rate": 0.00011289511335944005,
"loss": 0.9098,
"step": 1428
},
{
"epoch": 0.93,
"grad_norm": 0.07691995054483414,
"learning_rate": 0.0001127929299716719,
"loss": 1.1513,
"step": 1429
},
{
"epoch": 0.93,
"grad_norm": 0.11070670187473297,
"learning_rate": 0.00011269073300218038,
"loss": 1.0406,
"step": 1430
},
{
"epoch": 0.93,
"grad_norm": 0.09115037322044373,
"learning_rate": 0.00011258852255946377,
"loss": 0.8851,
"step": 1431
},
{
"epoch": 0.93,
"grad_norm": 0.13411705195903778,
"learning_rate": 0.00011248629875203467,
"loss": 0.9826,
"step": 1432
},
{
"epoch": 0.93,
"grad_norm": 0.08688797801733017,
"learning_rate": 0.00011238406168841982,
"loss": 1.1807,
"step": 1433
},
{
"epoch": 0.93,
"grad_norm": 0.08258987963199615,
"learning_rate": 0.00011228181147716013,
"loss": 1.2896,
"step": 1434
},
{
"epoch": 0.93,
"grad_norm": 0.07810018956661224,
"learning_rate": 0.00011217954822681034,
"loss": 0.9262,
"step": 1435
},
{
"epoch": 0.93,
"grad_norm": 0.08800841122865677,
"learning_rate": 0.00011207727204593917,
"loss": 1.0823,
"step": 1436
},
{
"epoch": 0.93,
"grad_norm": 0.0887501984834671,
"learning_rate": 0.00011197498304312896,
"loss": 1.3328,
"step": 1437
},
{
"epoch": 0.93,
"grad_norm": 0.09655321389436722,
"learning_rate": 0.00011187268132697574,
"loss": 1.0623,
"step": 1438
},
{
"epoch": 0.93,
"grad_norm": 0.07697466760873795,
"learning_rate": 0.00011177036700608897,
"loss": 1.1771,
"step": 1439
},
{
"epoch": 0.94,
"grad_norm": 0.08838430047035217,
"learning_rate": 0.00011166804018909152,
"loss": 1.184,
"step": 1440
},
{
"epoch": 0.94,
"grad_norm": 0.0987589955329895,
"learning_rate": 0.00011156570098461953,
"loss": 1.2888,
"step": 1441
},
{
"epoch": 0.94,
"grad_norm": 0.08954241126775742,
"learning_rate": 0.0001114633495013223,
"loss": 1.0998,
"step": 1442
},
{
"epoch": 0.94,
"grad_norm": 0.08703291416168213,
"learning_rate": 0.00011136098584786217,
"loss": 0.9278,
"step": 1443
},
{
"epoch": 0.94,
"grad_norm": 0.08863551914691925,
"learning_rate": 0.00011125861013291439,
"loss": 1.1157,
"step": 1444
},
{
"epoch": 0.94,
"grad_norm": 0.08209817856550217,
"learning_rate": 0.00011115622246516697,
"loss": 1.1279,
"step": 1445
},
{
"epoch": 0.94,
"grad_norm": 0.09977526217699051,
"learning_rate": 0.00011105382295332068,
"loss": 1.1276,
"step": 1446
},
{
"epoch": 0.94,
"grad_norm": 0.09060298651456833,
"learning_rate": 0.00011095141170608882,
"loss": 1.362,
"step": 1447
},
{
"epoch": 0.94,
"grad_norm": 0.08212457597255707,
"learning_rate": 0.00011084898883219723,
"loss": 1.1678,
"step": 1448
},
{
"epoch": 0.94,
"grad_norm": 0.08215862512588501,
"learning_rate": 0.0001107465544403839,
"loss": 0.9225,
"step": 1449
},
{
"epoch": 0.94,
"grad_norm": 0.08164898306131363,
"learning_rate": 0.00011064410863939934,
"loss": 1.1899,
"step": 1450
},
{
"epoch": 0.94,
"grad_norm": 0.08226540684700012,
"learning_rate": 0.00011054165153800589,
"loss": 1.2628,
"step": 1451
},
{
"epoch": 0.94,
"grad_norm": 0.10084257274866104,
"learning_rate": 0.00011043918324497802,
"loss": 1.1758,
"step": 1452
},
{
"epoch": 0.94,
"grad_norm": 0.09931132942438126,
"learning_rate": 0.0001103367038691021,
"loss": 1.0378,
"step": 1453
},
{
"epoch": 0.94,
"grad_norm": 0.0859564021229744,
"learning_rate": 0.00011023421351917626,
"loss": 0.9962,
"step": 1454
},
{
"epoch": 0.95,
"grad_norm": 0.09622316807508469,
"learning_rate": 0.0001101317123040102,
"loss": 0.9946,
"step": 1455
},
{
"epoch": 0.95,
"grad_norm": 0.1029711440205574,
"learning_rate": 0.00011002920033242521,
"loss": 1.0857,
"step": 1456
},
{
"epoch": 0.95,
"grad_norm": 0.09329650551080704,
"learning_rate": 0.00010992667771325405,
"loss": 1.1006,
"step": 1457
},
{
"epoch": 0.95,
"grad_norm": 0.09036722034215927,
"learning_rate": 0.00010982414455534069,
"loss": 1.0406,
"step": 1458
},
{
"epoch": 0.95,
"grad_norm": 0.10984571278095245,
"learning_rate": 0.00010972160096754034,
"loss": 1.1479,
"step": 1459
},
{
"epoch": 0.95,
"grad_norm": 0.09640296548604965,
"learning_rate": 0.0001096190470587193,
"loss": 1.2259,
"step": 1460
},
{
"epoch": 0.95,
"grad_norm": 0.0921231359243393,
"learning_rate": 0.00010951648293775481,
"loss": 1.0285,
"step": 1461
},
{
"epoch": 0.95,
"grad_norm": 0.08795608580112457,
"learning_rate": 0.00010941390871353487,
"loss": 0.9821,
"step": 1462
},
{
"epoch": 0.95,
"grad_norm": 0.08401835709810257,
"learning_rate": 0.00010931132449495835,
"loss": 1.4154,
"step": 1463
},
{
"epoch": 0.95,
"grad_norm": 0.10196711122989655,
"learning_rate": 0.00010920873039093469,
"loss": 1.1526,
"step": 1464
},
{
"epoch": 0.95,
"grad_norm": 0.08550386130809784,
"learning_rate": 0.00010910612651038372,
"loss": 0.8971,
"step": 1465
},
{
"epoch": 0.95,
"grad_norm": 0.09317058324813843,
"learning_rate": 0.00010900351296223577,
"loss": 1.065,
"step": 1466
},
{
"epoch": 0.95,
"grad_norm": 0.09476014226675034,
"learning_rate": 0.00010890088985543137,
"loss": 1.2147,
"step": 1467
},
{
"epoch": 0.95,
"grad_norm": 0.14203734695911407,
"learning_rate": 0.00010879825729892123,
"loss": 0.9751,
"step": 1468
},
{
"epoch": 0.95,
"grad_norm": 0.08843449503183365,
"learning_rate": 0.00010869561540166604,
"loss": 1.0875,
"step": 1469
},
{
"epoch": 0.95,
"grad_norm": 0.08830668777227402,
"learning_rate": 0.00010859296427263654,
"loss": 1.3292,
"step": 1470
},
{
"epoch": 0.96,
"grad_norm": 0.10075783729553223,
"learning_rate": 0.00010849030402081311,
"loss": 1.1229,
"step": 1471
},
{
"epoch": 0.96,
"grad_norm": 0.08239160478115082,
"learning_rate": 0.00010838763475518588,
"loss": 1.2608,
"step": 1472
},
{
"epoch": 0.96,
"grad_norm": 0.09686918556690216,
"learning_rate": 0.00010828495658475457,
"loss": 1.2141,
"step": 1473
},
{
"epoch": 0.96,
"grad_norm": 0.08807504922151566,
"learning_rate": 0.00010818226961852835,
"loss": 1.2415,
"step": 1474
},
{
"epoch": 0.96,
"grad_norm": 0.08916371315717697,
"learning_rate": 0.00010807957396552565,
"loss": 1.337,
"step": 1475
},
{
"epoch": 0.96,
"grad_norm": 0.09403186291456223,
"learning_rate": 0.0001079768697347743,
"loss": 1.3709,
"step": 1476
},
{
"epoch": 0.96,
"grad_norm": 0.09369078278541565,
"learning_rate": 0.00010787415703531106,
"loss": 1.0744,
"step": 1477
},
{
"epoch": 0.96,
"grad_norm": 0.08687245845794678,
"learning_rate": 0.0001077714359761817,
"loss": 0.8305,
"step": 1478
},
{
"epoch": 0.96,
"grad_norm": 0.10874254256486893,
"learning_rate": 0.00010766870666644098,
"loss": 1.1111,
"step": 1479
},
{
"epoch": 0.96,
"grad_norm": 0.08670711517333984,
"learning_rate": 0.00010756596921515234,
"loss": 1.1499,
"step": 1480
},
{
"epoch": 0.96,
"grad_norm": 0.09950361400842667,
"learning_rate": 0.00010746322373138782,
"loss": 1.188,
"step": 1481
},
{
"epoch": 0.96,
"grad_norm": 0.08970309048891068,
"learning_rate": 0.00010736047032422809,
"loss": 1.1061,
"step": 1482
},
{
"epoch": 0.96,
"grad_norm": 0.0889861136674881,
"learning_rate": 0.00010725770910276218,
"loss": 1.1794,
"step": 1483
},
{
"epoch": 0.96,
"grad_norm": 0.07253705710172653,
"learning_rate": 0.00010715494017608743,
"loss": 0.9302,
"step": 1484
},
{
"epoch": 0.96,
"grad_norm": 0.14619792997837067,
"learning_rate": 0.00010705216365330928,
"loss": 1.1516,
"step": 1485
},
{
"epoch": 0.97,
"grad_norm": 0.089421845972538,
"learning_rate": 0.0001069493796435414,
"loss": 0.9597,
"step": 1486
},
{
"epoch": 0.97,
"grad_norm": 0.10465262085199356,
"learning_rate": 0.00010684658825590521,
"loss": 1.389,
"step": 1487
},
{
"epoch": 0.97,
"grad_norm": 0.08181484043598175,
"learning_rate": 0.00010674378959953015,
"loss": 0.9245,
"step": 1488
},
{
"epoch": 0.97,
"grad_norm": 0.09142012894153595,
"learning_rate": 0.00010664098378355325,
"loss": 1.0725,
"step": 1489
},
{
"epoch": 0.97,
"grad_norm": 0.0918259471654892,
"learning_rate": 0.00010653817091711919,
"loss": 1.0588,
"step": 1490
},
{
"epoch": 0.97,
"grad_norm": 0.0977063924074173,
"learning_rate": 0.00010643535110938014,
"loss": 0.8997,
"step": 1491
},
{
"epoch": 0.97,
"grad_norm": 0.09734974801540375,
"learning_rate": 0.00010633252446949562,
"loss": 1.2123,
"step": 1492
},
{
"epoch": 0.97,
"grad_norm": 0.07984396815299988,
"learning_rate": 0.00010622969110663239,
"loss": 1.1921,
"step": 1493
},
{
"epoch": 0.97,
"grad_norm": 0.08105143904685974,
"learning_rate": 0.00010612685112996437,
"loss": 0.8557,
"step": 1494
},
{
"epoch": 0.97,
"grad_norm": 0.0952603816986084,
"learning_rate": 0.00010602400464867255,
"loss": 1.332,
"step": 1495
},
{
"epoch": 0.97,
"grad_norm": 0.0906563252210617,
"learning_rate": 0.00010592115177194471,
"loss": 1.0726,
"step": 1496
},
{
"epoch": 0.97,
"grad_norm": 0.09860397130250931,
"learning_rate": 0.00010581829260897555,
"loss": 1.1553,
"step": 1497
},
{
"epoch": 0.97,
"grad_norm": 0.09034127742052078,
"learning_rate": 0.00010571542726896633,
"loss": 1.141,
"step": 1498
},
{
"epoch": 0.97,
"grad_norm": 0.08600937575101852,
"learning_rate": 0.0001056125558611249,
"loss": 1.3734,
"step": 1499
},
{
"epoch": 0.97,
"grad_norm": 0.09543422609567642,
"learning_rate": 0.00010550967849466564,
"loss": 0.906,
"step": 1500
},
{
"epoch": 0.97,
"grad_norm": 0.08430718630552292,
"learning_rate": 0.00010540679527880914,
"loss": 1.1106,
"step": 1501
},
{
"epoch": 0.98,
"grad_norm": 0.08454833924770355,
"learning_rate": 0.00010530390632278222,
"loss": 0.9461,
"step": 1502
},
{
"epoch": 0.98,
"grad_norm": 0.10491041839122772,
"learning_rate": 0.0001052010117358179,
"loss": 1.039,
"step": 1503
},
{
"epoch": 0.98,
"grad_norm": 0.12160802632570267,
"learning_rate": 0.00010509811162715499,
"loss": 1.4197,
"step": 1504
},
{
"epoch": 0.98,
"grad_norm": 0.0849526971578598,
"learning_rate": 0.00010499520610603834,
"loss": 1.1532,
"step": 1505
},
{
"epoch": 0.98,
"grad_norm": 0.10352155566215515,
"learning_rate": 0.00010489229528171847,
"loss": 1.3926,
"step": 1506
},
{
"epoch": 0.98,
"grad_norm": 0.09881944209337234,
"learning_rate": 0.00010478937926345154,
"loss": 1.0617,
"step": 1507
},
{
"epoch": 0.98,
"grad_norm": 0.0948866605758667,
"learning_rate": 0.00010468645816049918,
"loss": 1.1235,
"step": 1508
},
{
"epoch": 0.98,
"grad_norm": 0.09189804643392563,
"learning_rate": 0.0001045835320821285,
"loss": 1.1182,
"step": 1509
},
{
"epoch": 0.98,
"grad_norm": 0.0979374349117279,
"learning_rate": 0.00010448060113761182,
"loss": 1.0516,
"step": 1510
},
{
"epoch": 0.98,
"grad_norm": 0.08869647234678268,
"learning_rate": 0.00010437766543622669,
"loss": 0.9857,
"step": 1511
},
{
"epoch": 0.98,
"grad_norm": 0.09148126095533371,
"learning_rate": 0.00010427472508725564,
"loss": 1.2069,
"step": 1512
},
{
"epoch": 0.98,
"grad_norm": 0.10273449122905731,
"learning_rate": 0.00010417178019998622,
"loss": 1.1986,
"step": 1513
},
{
"epoch": 0.98,
"grad_norm": 0.08176911622285843,
"learning_rate": 0.00010406883088371069,
"loss": 1.0117,
"step": 1514
},
{
"epoch": 0.98,
"grad_norm": 0.09893279522657394,
"learning_rate": 0.00010396587724772608,
"loss": 1.0838,
"step": 1515
},
{
"epoch": 0.98,
"grad_norm": 0.09672100096940994,
"learning_rate": 0.00010386291940133404,
"loss": 1.3202,
"step": 1516
},
{
"epoch": 0.99,
"grad_norm": 0.08901774883270264,
"learning_rate": 0.00010375995745384064,
"loss": 1.0402,
"step": 1517
},
{
"epoch": 0.99,
"grad_norm": 0.09123262763023376,
"learning_rate": 0.00010365699151455623,
"loss": 1.1215,
"step": 1518
},
{
"epoch": 0.99,
"grad_norm": 0.09209898114204407,
"learning_rate": 0.0001035540216927956,
"loss": 1.1871,
"step": 1519
},
{
"epoch": 0.99,
"grad_norm": 0.09589068591594696,
"learning_rate": 0.00010345104809787747,
"loss": 1.0412,
"step": 1520
},
{
"epoch": 0.99,
"grad_norm": 0.07181321084499359,
"learning_rate": 0.00010334807083912463,
"loss": 1.2467,
"step": 1521
},
{
"epoch": 0.99,
"grad_norm": 0.09226029366254807,
"learning_rate": 0.0001032450900258638,
"loss": 1.1057,
"step": 1522
},
{
"epoch": 0.99,
"grad_norm": 0.08866297453641891,
"learning_rate": 0.00010314210576742544,
"loss": 0.9112,
"step": 1523
},
{
"epoch": 0.99,
"grad_norm": 0.09634629637002945,
"learning_rate": 0.00010303911817314365,
"loss": 0.9939,
"step": 1524
},
{
"epoch": 0.99,
"grad_norm": 0.10096573829650879,
"learning_rate": 0.00010293612735235607,
"loss": 0.8711,
"step": 1525
},
{
"epoch": 0.99,
"grad_norm": 0.09180238097906113,
"learning_rate": 0.00010283313341440382,
"loss": 1.379,
"step": 1526
},
{
"epoch": 0.99,
"grad_norm": 0.08589636534452438,
"learning_rate": 0.0001027301364686313,
"loss": 1.3623,
"step": 1527
},
{
"epoch": 0.99,
"grad_norm": 0.08953863382339478,
"learning_rate": 0.00010262713662438603,
"loss": 1.103,
"step": 1528
},
{
"epoch": 0.99,
"grad_norm": 0.08932410925626755,
"learning_rate": 0.00010252413399101877,
"loss": 1.1554,
"step": 1529
},
{
"epoch": 0.99,
"grad_norm": 0.09286177903413773,
"learning_rate": 0.00010242112867788307,
"loss": 1.0348,
"step": 1530
},
{
"epoch": 0.99,
"grad_norm": 0.0911782830953598,
"learning_rate": 0.00010231812079433542,
"loss": 1.168,
"step": 1531
},
{
"epoch": 1.0,
"grad_norm": 0.08388300240039825,
"learning_rate": 0.00010221511044973506,
"loss": 1.01,
"step": 1532
},
{
"epoch": 1.0,
"grad_norm": 0.07732200622558594,
"learning_rate": 0.00010211209775344377,
"loss": 1.0821,
"step": 1533
},
{
"epoch": 1.0,
"grad_norm": 0.1292658895254135,
"learning_rate": 0.00010200908281482584,
"loss": 1.0545,
"step": 1534
},
{
"epoch": 1.0,
"grad_norm": 0.0958942249417305,
"learning_rate": 0.00010190606574324799,
"loss": 1.484,
"step": 1535
},
{
"epoch": 1.0,
"grad_norm": 0.09653940796852112,
"learning_rate": 0.00010180304664807916,
"loss": 1.5068,
"step": 1536
},
{
"epoch": 1.0,
"grad_norm": 0.08408050984144211,
"learning_rate": 0.00010170002563869044,
"loss": 1.0758,
"step": 1537
},
{
"epoch": 1.0,
"grad_norm": 0.09309494495391846,
"learning_rate": 0.000101597002824455,
"loss": 1.0259,
"step": 1538
},
{
"epoch": 1.0,
"grad_norm": 0.09328551590442657,
"learning_rate": 0.00010149397831474787,
"loss": 1.1898,
"step": 1539
},
{
"epoch": 1.0,
"grad_norm": 0.10631902515888214,
"learning_rate": 0.00010139095221894588,
"loss": 0.9469,
"step": 1540
},
{
"epoch": 1.0,
"grad_norm": 0.08702818304300308,
"learning_rate": 0.0001012879246464276,
"loss": 1.2296,
"step": 1541
},
{
"epoch": 1.0,
"grad_norm": 0.10087259113788605,
"learning_rate": 0.00010118489570657312,
"loss": 1.0818,
"step": 1542
},
{
"epoch": 1.0,
"grad_norm": 0.12534254789352417,
"learning_rate": 0.000101081865508764,
"loss": 1.0344,
"step": 1543
},
{
"epoch": 1.0,
"grad_norm": 0.09693174064159393,
"learning_rate": 0.0001009788341623831,
"loss": 1.006,
"step": 1544
},
{
"epoch": 1.0,
"grad_norm": 0.1262136995792389,
"learning_rate": 0.00010087580177681458,
"loss": 1.1636,
"step": 1545
},
{
"epoch": 1.0,
"grad_norm": 0.08032719045877457,
"learning_rate": 0.00010077276846144358,
"loss": 1.1668,
"step": 1546
},
{
"epoch": 1.0,
"grad_norm": 0.1510113626718521,
"learning_rate": 0.00010066973432565639,
"loss": 1.356,
"step": 1547
},
{
"epoch": 1.01,
"grad_norm": 0.08856373280286789,
"learning_rate": 0.00010056669947883999,
"loss": 1.1864,
"step": 1548
},
{
"epoch": 1.01,
"grad_norm": 0.08751031756401062,
"learning_rate": 0.00010046366403038229,
"loss": 1.0831,
"step": 1549
},
{
"epoch": 1.01,
"grad_norm": 0.4390200972557068,
"learning_rate": 0.00010036062808967168,
"loss": 0.974,
"step": 1550
},
{
"epoch": 1.01,
"grad_norm": 0.09046763181686401,
"learning_rate": 0.0001002575917660972,
"loss": 1.126,
"step": 1551
},
{
"epoch": 1.01,
"grad_norm": 0.08906295150518417,
"learning_rate": 0.00010015455516904819,
"loss": 1.0233,
"step": 1552
},
{
"epoch": 1.01,
"grad_norm": 0.08376338332891464,
"learning_rate": 0.0001000515184079144,
"loss": 0.9976,
"step": 1553
},
{
"epoch": 1.01,
"grad_norm": 0.08813194930553436,
"learning_rate": 9.994848159208561e-05,
"loss": 0.9689,
"step": 1554
},
{
"epoch": 1.01,
"grad_norm": 0.0848928838968277,
"learning_rate": 9.984544483095181e-05,
"loss": 1.0962,
"step": 1555
},
{
"epoch": 1.01,
"grad_norm": 0.08137081563472748,
"learning_rate": 9.974240823390285e-05,
"loss": 1.202,
"step": 1556
},
{
"epoch": 1.01,
"grad_norm": 0.08872334659099579,
"learning_rate": 9.963937191032834e-05,
"loss": 1.1594,
"step": 1557
},
{
"epoch": 1.01,
"grad_norm": 0.08986867964267731,
"learning_rate": 9.953633596961773e-05,
"loss": 1.1293,
"step": 1558
},
{
"epoch": 1.01,
"grad_norm": 0.08683433383703232,
"learning_rate": 9.943330052116001e-05,
"loss": 0.9802,
"step": 1559
},
{
"epoch": 1.01,
"grad_norm": 0.08998879790306091,
"learning_rate": 9.933026567434365e-05,
"loss": 1.0493,
"step": 1560
},
{
"epoch": 1.01,
"grad_norm": 0.08774431049823761,
"learning_rate": 9.922723153855643e-05,
"loss": 1.2231,
"step": 1561
},
{
"epoch": 1.01,
"grad_norm": 0.09754978120326996,
"learning_rate": 9.912419822318545e-05,
"loss": 1.1032,
"step": 1562
},
{
"epoch": 1.02,
"grad_norm": 0.08667407929897308,
"learning_rate": 9.902116583761691e-05,
"loss": 1.0801,
"step": 1563
},
{
"epoch": 1.02,
"grad_norm": 0.14995823800563812,
"learning_rate": 9.891813449123604e-05,
"loss": 1.1507,
"step": 1564
},
{
"epoch": 1.02,
"grad_norm": 0.08841430395841599,
"learning_rate": 9.88151042934269e-05,
"loss": 0.9971,
"step": 1565
},
{
"epoch": 1.02,
"grad_norm": 0.11697284132242203,
"learning_rate": 9.871207535357242e-05,
"loss": 0.9707,
"step": 1566
},
{
"epoch": 1.0,
"grad_norm": 0.084991455078125,
"learning_rate": 9.860904778105413e-05,
"loss": 1.1649,
"step": 1567
},
{
"epoch": 1.0,
"grad_norm": 0.08361074328422546,
"learning_rate": 9.850602168525218e-05,
"loss": 1.2864,
"step": 1568
},
{
"epoch": 1.0,
"grad_norm": 0.08681045472621918,
"learning_rate": 9.840299717554504e-05,
"loss": 1.1166,
"step": 1569
},
{
"epoch": 1.0,
"grad_norm": 0.14112353324890137,
"learning_rate": 9.829997436130959e-05,
"loss": 1.1502,
"step": 1570
},
{
"epoch": 1.0,
"grad_norm": 0.10277639329433441,
"learning_rate": 9.819695335192085e-05,
"loss": 1.2589,
"step": 1571
},
{
"epoch": 1.0,
"grad_norm": 0.0875738337635994,
"learning_rate": 9.809393425675206e-05,
"loss": 1.2676,
"step": 1572
},
{
"epoch": 1.0,
"grad_norm": 0.0787142813205719,
"learning_rate": 9.799091718517418e-05,
"loss": 0.9822,
"step": 1573
},
{
"epoch": 1.01,
"grad_norm": 0.09278953075408936,
"learning_rate": 9.788790224655625e-05,
"loss": 0.9578,
"step": 1574
},
{
"epoch": 1.01,
"grad_norm": 0.10362366586923599,
"learning_rate": 9.778488955026495e-05,
"loss": 1.0691,
"step": 1575
},
{
"epoch": 1.01,
"grad_norm": 0.0893579050898552,
"learning_rate": 9.768187920566459e-05,
"loss": 0.9144,
"step": 1576
},
{
"epoch": 1.01,
"grad_norm": 0.09827324002981186,
"learning_rate": 9.757887132211695e-05,
"loss": 1.0275,
"step": 1577
},
{
"epoch": 1.01,
"grad_norm": 0.08911921083927155,
"learning_rate": 9.747586600898125e-05,
"loss": 1.0292,
"step": 1578
},
{
"epoch": 1.01,
"grad_norm": 0.09204531461000443,
"learning_rate": 9.737286337561398e-05,
"loss": 1.2672,
"step": 1579
},
{
"epoch": 1.01,
"grad_norm": 0.08948075771331787,
"learning_rate": 9.726986353136876e-05,
"loss": 1.219,
"step": 1580
},
{
"epoch": 1.01,
"grad_norm": 0.09748535603284836,
"learning_rate": 9.716686658559621e-05,
"loss": 1.0734,
"step": 1581
},
{
"epoch": 1.01,
"grad_norm": 0.08578736335039139,
"learning_rate": 9.706387264764395e-05,
"loss": 1.0194,
"step": 1582
},
{
"epoch": 1.01,
"grad_norm": 0.08392781019210815,
"learning_rate": 9.696088182685638e-05,
"loss": 1.304,
"step": 1583
},
{
"epoch": 1.01,
"grad_norm": 0.09899682551622391,
"learning_rate": 9.68578942325746e-05,
"loss": 1.14,
"step": 1584
},
{
"epoch": 1.01,
"grad_norm": 0.0899963453412056,
"learning_rate": 9.675490997413622e-05,
"loss": 1.0936,
"step": 1585
},
{
"epoch": 1.01,
"grad_norm": 0.08999631553888321,
"learning_rate": 9.665192916087539e-05,
"loss": 0.907,
"step": 1586
},
{
"epoch": 1.01,
"grad_norm": 0.10483089089393616,
"learning_rate": 9.654895190212253e-05,
"loss": 1.1548,
"step": 1587
},
{
"epoch": 1.01,
"grad_norm": 0.090418741106987,
"learning_rate": 9.644597830720443e-05,
"loss": 1.0094,
"step": 1588
},
{
"epoch": 1.01,
"grad_norm": 0.0993262529373169,
"learning_rate": 9.634300848544379e-05,
"loss": 1.0332,
"step": 1589
},
{
"epoch": 1.02,
"grad_norm": 0.09174484759569168,
"learning_rate": 9.62400425461594e-05,
"loss": 0.9997,
"step": 1590
},
{
"epoch": 1.02,
"grad_norm": 0.09464286267757416,
"learning_rate": 9.613708059866596e-05,
"loss": 1.1863,
"step": 1591
},
{
"epoch": 1.02,
"grad_norm": 0.10829085856676102,
"learning_rate": 9.603412275227396e-05,
"loss": 1.3869,
"step": 1592
},
{
"epoch": 1.02,
"grad_norm": 0.11104355752468109,
"learning_rate": 9.593116911628935e-05,
"loss": 0.9643,
"step": 1593
},
{
"epoch": 1.02,
"grad_norm": 0.1187749058008194,
"learning_rate": 9.58282198000138e-05,
"loss": 1.2274,
"step": 1594
},
{
"epoch": 1.02,
"grad_norm": 0.0800420418381691,
"learning_rate": 9.572527491274437e-05,
"loss": 1.0434,
"step": 1595
},
{
"epoch": 1.02,
"grad_norm": 0.0982484519481659,
"learning_rate": 9.562233456377335e-05,
"loss": 1.2365,
"step": 1596
},
{
"epoch": 1.02,
"grad_norm": 0.11580273509025574,
"learning_rate": 9.55193988623882e-05,
"loss": 1.2816,
"step": 1597
},
{
"epoch": 1.02,
"grad_norm": 0.08642828464508057,
"learning_rate": 9.541646791787152e-05,
"loss": 1.1669,
"step": 1598
},
{
"epoch": 1.02,
"grad_norm": 0.10504204779863358,
"learning_rate": 9.531354183950083e-05,
"loss": 1.06,
"step": 1599
},
{
"epoch": 1.02,
"grad_norm": 0.0852760374546051,
"learning_rate": 9.52106207365485e-05,
"loss": 1.0612,
"step": 1600
},
{
"epoch": 1.02,
"grad_norm": 0.10574441403150558,
"learning_rate": 9.510770471828156e-05,
"loss": 1.2896,
"step": 1601
},
{
"epoch": 1.02,
"grad_norm": 0.12477613985538483,
"learning_rate": 9.500479389396168e-05,
"loss": 1.2855,
"step": 1602
},
{
"epoch": 1.02,
"grad_norm": 0.09092655032873154,
"learning_rate": 9.490188837284503e-05,
"loss": 1.0693,
"step": 1603
},
{
"epoch": 1.02,
"grad_norm": 0.0969330221414566,
"learning_rate": 9.479898826418217e-05,
"loss": 0.9968,
"step": 1604
},
{
"epoch": 1.03,
"grad_norm": 0.10302500426769257,
"learning_rate": 9.469609367721781e-05,
"loss": 1.09,
"step": 1605
},
{
"epoch": 1.03,
"grad_norm": 0.1196729987859726,
"learning_rate": 9.459320472119088e-05,
"loss": 1.1109,
"step": 1606
},
{
"epoch": 1.03,
"grad_norm": 0.093118816614151,
"learning_rate": 9.449032150533437e-05,
"loss": 1.004,
"step": 1607
},
{
"epoch": 1.03,
"grad_norm": 0.09315807372331619,
"learning_rate": 9.438744413887514e-05,
"loss": 1.2652,
"step": 1608
},
{
"epoch": 1.03,
"grad_norm": 0.09159765392541885,
"learning_rate": 9.428457273103371e-05,
"loss": 1.1693,
"step": 1609
},
{
"epoch": 1.03,
"grad_norm": 0.10726916044950485,
"learning_rate": 9.418170739102447e-05,
"loss": 1.3395,
"step": 1610
},
{
"epoch": 1.03,
"grad_norm": 0.16069358587265015,
"learning_rate": 9.407884822805529e-05,
"loss": 1.1842,
"step": 1611
},
{
"epoch": 1.03,
"grad_norm": 0.1112290769815445,
"learning_rate": 9.397599535132749e-05,
"loss": 1.4022,
"step": 1612
},
{
"epoch": 1.03,
"grad_norm": 0.0998128280043602,
"learning_rate": 9.387314887003564e-05,
"loss": 1.2401,
"step": 1613
},
{
"epoch": 1.03,
"grad_norm": 0.0887540802359581,
"learning_rate": 9.377030889336764e-05,
"loss": 1.1546,
"step": 1614
},
{
"epoch": 1.03,
"grad_norm": 0.1123807430267334,
"learning_rate": 9.366747553050441e-05,
"loss": 1.3342,
"step": 1615
},
{
"epoch": 1.03,
"grad_norm": 0.11113794893026352,
"learning_rate": 9.356464889061988e-05,
"loss": 1.1752,
"step": 1616
},
{
"epoch": 1.03,
"grad_norm": 0.11348582804203033,
"learning_rate": 9.346182908288083e-05,
"loss": 1.24,
"step": 1617
},
{
"epoch": 1.03,
"grad_norm": 0.0983775332570076,
"learning_rate": 9.335901621644678e-05,
"loss": 1.3173,
"step": 1618
},
{
"epoch": 1.03,
"grad_norm": 0.12112565338611603,
"learning_rate": 9.325621040046988e-05,
"loss": 1.1178,
"step": 1619
},
{
"epoch": 1.04,
"grad_norm": 0.11085223406553268,
"learning_rate": 9.315341174409477e-05,
"loss": 1.1246,
"step": 1620
},
{
"epoch": 1.04,
"grad_norm": 0.09420628845691681,
"learning_rate": 9.305062035645867e-05,
"loss": 1.011,
"step": 1621
},
{
"epoch": 1.04,
"grad_norm": 0.09570734202861786,
"learning_rate": 9.294783634669076e-05,
"loss": 1.1477,
"step": 1622
},
{
"epoch": 1.04,
"grad_norm": 0.08612988889217377,
"learning_rate": 9.28450598239126e-05,
"loss": 0.9526,
"step": 1623
},
{
"epoch": 1.04,
"grad_norm": 0.08804440498352051,
"learning_rate": 9.274229089723782e-05,
"loss": 1.1188,
"step": 1624
},
{
"epoch": 1.04,
"grad_norm": 0.1118922308087349,
"learning_rate": 9.263952967577194e-05,
"loss": 1.1224,
"step": 1625
},
{
"epoch": 1.04,
"grad_norm": 0.08840730041265488,
"learning_rate": 9.253677626861219e-05,
"loss": 0.9452,
"step": 1626
},
{
"epoch": 1.04,
"grad_norm": 0.08761528879404068,
"learning_rate": 9.24340307848477e-05,
"loss": 1.2771,
"step": 1627
},
{
"epoch": 1.04,
"grad_norm": 0.10330937057733536,
"learning_rate": 9.233129333355902e-05,
"loss": 1.0567,
"step": 1628
},
{
"epoch": 1.04,
"grad_norm": 0.08601588010787964,
"learning_rate": 9.222856402381832e-05,
"loss": 1.0733,
"step": 1629
},
{
"epoch": 1.04,
"grad_norm": 0.13251625001430511,
"learning_rate": 9.212584296468898e-05,
"loss": 1.187,
"step": 1630
},
{
"epoch": 1.04,
"grad_norm": 0.10423807054758072,
"learning_rate": 9.202313026522571e-05,
"loss": 1.2092,
"step": 1631
},
{
"epoch": 1.04,
"grad_norm": 0.10379697382450104,
"learning_rate": 9.192042603447434e-05,
"loss": 1.0908,
"step": 1632
},
{
"epoch": 1.04,
"grad_norm": 0.11194173991680145,
"learning_rate": 9.181773038147168e-05,
"loss": 1.0675,
"step": 1633
},
{
"epoch": 1.04,
"grad_norm": 0.09776081889867783,
"learning_rate": 9.171504341524546e-05,
"loss": 0.9585,
"step": 1634
},
{
"epoch": 1.04,
"grad_norm": 0.08580949157476425,
"learning_rate": 9.161236524481415e-05,
"loss": 1.1807,
"step": 1635
},
{
"epoch": 1.05,
"grad_norm": 0.07787908613681793,
"learning_rate": 9.150969597918691e-05,
"loss": 1.067,
"step": 1636
},
{
"epoch": 1.05,
"grad_norm": 0.09458938241004944,
"learning_rate": 9.14070357273635e-05,
"loss": 1.1411,
"step": 1637
},
{
"epoch": 1.05,
"grad_norm": 0.14569172263145447,
"learning_rate": 9.130438459833397e-05,
"loss": 0.9683,
"step": 1638
},
{
"epoch": 1.05,
"grad_norm": 0.09723403304815292,
"learning_rate": 9.12017427010788e-05,
"loss": 1.1726,
"step": 1639
},
{
"epoch": 1.05,
"grad_norm": 0.10078077763319016,
"learning_rate": 9.109911014456864e-05,
"loss": 1.23,
"step": 1640
},
{
"epoch": 1.05,
"grad_norm": 0.08864934742450714,
"learning_rate": 9.099648703776429e-05,
"loss": 1.0605,
"step": 1641
},
{
"epoch": 1.05,
"grad_norm": 0.1086459532380104,
"learning_rate": 9.08938734896163e-05,
"loss": 1.1975,
"step": 1642
},
{
"epoch": 1.05,
"grad_norm": 0.09757621586322784,
"learning_rate": 9.079126960906532e-05,
"loss": 1.0398,
"step": 1643
},
{
"epoch": 1.05,
"grad_norm": 0.10592546314001083,
"learning_rate": 9.068867550504163e-05,
"loss": 1.036,
"step": 1644
},
{
"epoch": 1.05,
"grad_norm": 0.09622690081596375,
"learning_rate": 9.058609128646515e-05,
"loss": 1.3171,
"step": 1645
},
{
"epoch": 1.05,
"grad_norm": 0.08799094706773758,
"learning_rate": 9.048351706224523e-05,
"loss": 1.1222,
"step": 1646
},
{
"epoch": 1.05,
"grad_norm": 0.09637622535228729,
"learning_rate": 9.038095294128071e-05,
"loss": 1.0675,
"step": 1647
},
{
"epoch": 1.05,
"grad_norm": 0.09280356764793396,
"learning_rate": 9.027839903245965e-05,
"loss": 1.1847,
"step": 1648
},
{
"epoch": 1.05,
"grad_norm": 0.09723308682441711,
"learning_rate": 9.017585544465935e-05,
"loss": 1.1787,
"step": 1649
},
{
"epoch": 1.05,
"grad_norm": 0.10138044506311417,
"learning_rate": 9.007332228674599e-05,
"loss": 1.0679,
"step": 1650
},
{
"epoch": 1.06,
"grad_norm": 0.08530982583761215,
"learning_rate": 8.99707996675748e-05,
"loss": 0.9675,
"step": 1651
},
{
"epoch": 1.06,
"grad_norm": 0.10112479329109192,
"learning_rate": 8.986828769598982e-05,
"loss": 1.068,
"step": 1652
},
{
"epoch": 1.06,
"grad_norm": 0.09918248653411865,
"learning_rate": 8.976578648082378e-05,
"loss": 1.0932,
"step": 1653
},
{
"epoch": 1.06,
"grad_norm": 0.11432183533906937,
"learning_rate": 8.96632961308979e-05,
"loss": 1.2878,
"step": 1654
},
{
"epoch": 1.06,
"grad_norm": 0.10209428519010544,
"learning_rate": 8.956081675502199e-05,
"loss": 0.9603,
"step": 1655
},
{
"epoch": 1.06,
"grad_norm": 0.09456180036067963,
"learning_rate": 8.945834846199412e-05,
"loss": 0.9838,
"step": 1656
},
{
"epoch": 1.06,
"grad_norm": 0.10398413985967636,
"learning_rate": 8.93558913606007e-05,
"loss": 1.2358,
"step": 1657
},
{
"epoch": 1.06,
"grad_norm": 0.09459967911243439,
"learning_rate": 8.92534455596161e-05,
"loss": 0.9755,
"step": 1658
},
{
"epoch": 1.06,
"grad_norm": 0.1017826497554779,
"learning_rate": 8.91510111678028e-05,
"loss": 1.164,
"step": 1659
},
{
"epoch": 1.06,
"grad_norm": 0.10723736882209778,
"learning_rate": 8.904858829391116e-05,
"loss": 1.1384,
"step": 1660
},
{
"epoch": 1.06,
"grad_norm": 0.10102162510156631,
"learning_rate": 8.894617704667937e-05,
"loss": 1.1647,
"step": 1661
},
{
"epoch": 1.06,
"grad_norm": 0.10086066275835037,
"learning_rate": 8.884377753483304e-05,
"loss": 1.0578,
"step": 1662
},
{
"epoch": 1.06,
"grad_norm": 0.0878998413681984,
"learning_rate": 8.874138986708563e-05,
"loss": 1.1418,
"step": 1663
},
{
"epoch": 1.06,
"grad_norm": 0.09546195715665817,
"learning_rate": 8.863901415213784e-05,
"loss": 0.9597,
"step": 1664
},
{
"epoch": 1.06,
"grad_norm": 0.1714036464691162,
"learning_rate": 8.853665049867772e-05,
"loss": 1.1558,
"step": 1665
},
{
"epoch": 1.06,
"grad_norm": 0.09392836689949036,
"learning_rate": 8.843429901538049e-05,
"loss": 1.1609,
"step": 1666
},
{
"epoch": 1.07,
"grad_norm": 0.09497623145580292,
"learning_rate": 8.833195981090852e-05,
"loss": 1.2594,
"step": 1667
},
{
"epoch": 1.07,
"grad_norm": 0.11227823793888092,
"learning_rate": 8.822963299391106e-05,
"loss": 1.039,
"step": 1668
},
{
"epoch": 1.07,
"grad_norm": 0.08898564428091049,
"learning_rate": 8.81273186730243e-05,
"loss": 0.9151,
"step": 1669
},
{
"epoch": 1.07,
"grad_norm": 0.08541446179151535,
"learning_rate": 8.802501695687106e-05,
"loss": 1.0041,
"step": 1670
},
{
"epoch": 1.07,
"grad_norm": 0.10063093900680542,
"learning_rate": 8.792272795406084e-05,
"loss": 1.1596,
"step": 1671
},
{
"epoch": 1.07,
"grad_norm": 0.09659677743911743,
"learning_rate": 8.782045177318965e-05,
"loss": 1.309,
"step": 1672
},
{
"epoch": 1.07,
"grad_norm": 0.08988797664642334,
"learning_rate": 8.771818852283993e-05,
"loss": 0.9076,
"step": 1673
},
{
"epoch": 1.07,
"grad_norm": 0.10435433685779572,
"learning_rate": 8.761593831158022e-05,
"loss": 1.0474,
"step": 1674
},
{
"epoch": 1.07,
"grad_norm": 0.10360485315322876,
"learning_rate": 8.751370124796535e-05,
"loss": 0.9603,
"step": 1675
},
{
"epoch": 1.07,
"grad_norm": 0.0891716480255127,
"learning_rate": 8.741147744053624e-05,
"loss": 1.1006,
"step": 1676
},
{
"epoch": 1.07,
"grad_norm": 0.10455801337957382,
"learning_rate": 8.730926699781967e-05,
"loss": 0.8442,
"step": 1677
},
{
"epoch": 1.07,
"grad_norm": 0.09745992720127106,
"learning_rate": 8.720707002832811e-05,
"loss": 1.1582,
"step": 1678
},
{
"epoch": 1.07,
"grad_norm": 0.09172283858060837,
"learning_rate": 8.710488664055997e-05,
"loss": 1.1524,
"step": 1679
},
{
"epoch": 1.07,
"grad_norm": 0.10276954621076584,
"learning_rate": 8.700271694299907e-05,
"loss": 1.0672,
"step": 1680
},
{
"epoch": 1.07,
"grad_norm": 0.10065661370754242,
"learning_rate": 8.690056104411477e-05,
"loss": 0.9603,
"step": 1681
},
{
"epoch": 1.08,
"grad_norm": 0.0919308066368103,
"learning_rate": 8.679841905236169e-05,
"loss": 1.1662,
"step": 1682
},
{
"epoch": 1.08,
"grad_norm": 0.10107772052288055,
"learning_rate": 8.66962910761798e-05,
"loss": 0.9189,
"step": 1683
},
{
"epoch": 1.08,
"grad_norm": 0.09063085913658142,
"learning_rate": 8.659417722399412e-05,
"loss": 1.2896,
"step": 1684
},
{
"epoch": 1.08,
"grad_norm": 0.08782243728637695,
"learning_rate": 8.649207760421472e-05,
"loss": 1.0118,
"step": 1685
},
{
"epoch": 1.08,
"grad_norm": 0.10642839223146439,
"learning_rate": 8.638999232523648e-05,
"loss": 1.1992,
"step": 1686
},
{
"epoch": 1.08,
"grad_norm": 0.09847152978181839,
"learning_rate": 8.628792149543915e-05,
"loss": 1.1325,
"step": 1687
},
{
"epoch": 1.08,
"grad_norm": 0.1033477932214737,
"learning_rate": 8.618586522318708e-05,
"loss": 0.9818,
"step": 1688
},
{
"epoch": 1.08,
"grad_norm": 0.09962837398052216,
"learning_rate": 8.608382361682923e-05,
"loss": 1.0941,
"step": 1689
},
{
"epoch": 1.08,
"grad_norm": 0.09534700214862823,
"learning_rate": 8.598179678469886e-05,
"loss": 0.9968,
"step": 1690
},
{
"epoch": 1.08,
"grad_norm": 0.09052200615406036,
"learning_rate": 8.587978483511368e-05,
"loss": 0.8451,
"step": 1691
},
{
"epoch": 1.08,
"grad_norm": 0.09424137324094772,
"learning_rate": 8.577778787637553e-05,
"loss": 1.0539,
"step": 1692
},
{
"epoch": 1.08,
"grad_norm": 0.08804541081190109,
"learning_rate": 8.567580601677041e-05,
"loss": 0.9411,
"step": 1693
},
{
"epoch": 1.08,
"grad_norm": 0.08246276527643204,
"learning_rate": 8.557383936456815e-05,
"loss": 0.9724,
"step": 1694
},
{
"epoch": 1.08,
"grad_norm": 0.09954670071601868,
"learning_rate": 8.547188802802253e-05,
"loss": 1.0284,
"step": 1695
},
{
"epoch": 1.08,
"grad_norm": 0.11991100013256073,
"learning_rate": 8.536995211537108e-05,
"loss": 1.2301,
"step": 1696
},
{
"epoch": 1.09,
"grad_norm": 0.1030411571264267,
"learning_rate": 8.5268031734835e-05,
"loss": 1.048,
"step": 1697
},
{
"epoch": 1.09,
"grad_norm": 0.09521787613630295,
"learning_rate": 8.516612699461879e-05,
"loss": 0.9931,
"step": 1698
},
{
"epoch": 1.09,
"grad_norm": 0.11362671852111816,
"learning_rate": 8.506423800291058e-05,
"loss": 1.4209,
"step": 1699
},
{
"epoch": 1.09,
"grad_norm": 0.10384287685155869,
"learning_rate": 8.496236486788167e-05,
"loss": 1.1823,
"step": 1700
},
{
"epoch": 1.09,
"grad_norm": 0.10857536643743515,
"learning_rate": 8.486050769768657e-05,
"loss": 1.1724,
"step": 1701
},
{
"epoch": 1.09,
"grad_norm": 0.10215967148542404,
"learning_rate": 8.475866660046277e-05,
"loss": 1.2666,
"step": 1702
},
{
"epoch": 1.09,
"grad_norm": 0.12411094456911087,
"learning_rate": 8.465684168433075e-05,
"loss": 1.3232,
"step": 1703
},
{
"epoch": 1.09,
"grad_norm": 0.12331940978765488,
"learning_rate": 8.455503305739381e-05,
"loss": 1.2565,
"step": 1704
},
{
"epoch": 1.09,
"grad_norm": 0.09833373129367828,
"learning_rate": 8.445324082773797e-05,
"loss": 1.4012,
"step": 1705
},
{
"epoch": 1.09,
"grad_norm": 0.10029523819684982,
"learning_rate": 8.43514651034318e-05,
"loss": 0.9891,
"step": 1706
},
{
"epoch": 1.09,
"grad_norm": 0.10253671556711197,
"learning_rate": 8.424970599252633e-05,
"loss": 0.9068,
"step": 1707
},
{
"epoch": 1.09,
"grad_norm": 0.10940627008676529,
"learning_rate": 8.414796360305503e-05,
"loss": 1.3442,
"step": 1708
},
{
"epoch": 1.09,
"grad_norm": 0.10060277581214905,
"learning_rate": 8.404623804303364e-05,
"loss": 1.1808,
"step": 1709
},
{
"epoch": 1.09,
"grad_norm": 0.113701231777668,
"learning_rate": 8.394452942045985e-05,
"loss": 1.124,
"step": 1710
},
{
"epoch": 1.09,
"grad_norm": 0.10197798907756805,
"learning_rate": 8.384283784331351e-05,
"loss": 1.1917,
"step": 1711
},
{
"epoch": 1.09,
"grad_norm": 0.11811815947294235,
"learning_rate": 8.374116341955642e-05,
"loss": 1.2467,
"step": 1712
},
{
"epoch": 1.1,
"grad_norm": 0.09793061017990112,
"learning_rate": 8.36395062571321e-05,
"loss": 0.9221,
"step": 1713
},
{
"epoch": 1.1,
"grad_norm": 0.10496238619089127,
"learning_rate": 8.353786646396564e-05,
"loss": 1.0621,
"step": 1714
},
{
"epoch": 1.1,
"grad_norm": 0.09578882902860641,
"learning_rate": 8.343624414796388e-05,
"loss": 1.039,
"step": 1715
},
{
"epoch": 1.1,
"grad_norm": 0.12194564193487167,
"learning_rate": 8.333463941701501e-05,
"loss": 1.1739,
"step": 1716
},
{
"epoch": 1.1,
"grad_norm": 0.09907002747058868,
"learning_rate": 8.323305237898858e-05,
"loss": 1.2384,
"step": 1717
},
{
"epoch": 1.1,
"grad_norm": 0.10670025646686554,
"learning_rate": 8.313148314173527e-05,
"loss": 1.22,
"step": 1718
},
{
"epoch": 1.1,
"grad_norm": 0.10209029912948608,
"learning_rate": 8.302993181308697e-05,
"loss": 1.3299,
"step": 1719
},
{
"epoch": 1.1,
"grad_norm": 0.08859424293041229,
"learning_rate": 8.292839850085652e-05,
"loss": 0.9466,
"step": 1720
},
{
"epoch": 1.1,
"grad_norm": 0.08388126641511917,
"learning_rate": 8.282688331283764e-05,
"loss": 1.1799,
"step": 1721
},
{
"epoch": 1.1,
"grad_norm": 0.08690060675144196,
"learning_rate": 8.272538635680475e-05,
"loss": 0.9033,
"step": 1722
},
{
"epoch": 1.1,
"grad_norm": 0.17803430557250977,
"learning_rate": 8.262390774051299e-05,
"loss": 1.2405,
"step": 1723
},
{
"epoch": 1.1,
"grad_norm": 0.10621819645166397,
"learning_rate": 8.252244757169799e-05,
"loss": 1.4116,
"step": 1724
},
{
"epoch": 1.1,
"grad_norm": 0.10853593796491623,
"learning_rate": 8.242100595807585e-05,
"loss": 1.154,
"step": 1725
},
{
"epoch": 1.1,
"grad_norm": 0.09998656064271927,
"learning_rate": 8.231958300734286e-05,
"loss": 0.9455,
"step": 1726
},
{
"epoch": 1.1,
"grad_norm": 0.09830087423324585,
"learning_rate": 8.221817882717558e-05,
"loss": 1.1683,
"step": 1727
},
{
"epoch": 1.11,
"grad_norm": 0.1087251752614975,
"learning_rate": 8.211679352523062e-05,
"loss": 1.2435,
"step": 1728
},
{
"epoch": 1.11,
"grad_norm": 0.11251130700111389,
"learning_rate": 8.201542720914464e-05,
"loss": 1.2506,
"step": 1729
},
{
"epoch": 1.11,
"grad_norm": 0.09583642333745956,
"learning_rate": 8.191407998653392e-05,
"loss": 1.027,
"step": 1730
},
{
"epoch": 1.11,
"grad_norm": 0.09885202348232269,
"learning_rate": 8.181275196499465e-05,
"loss": 1.0188,
"step": 1731
},
{
"epoch": 1.11,
"grad_norm": 0.09704544395208359,
"learning_rate": 8.171144325210263e-05,
"loss": 1.1882,
"step": 1732
},
{
"epoch": 1.11,
"grad_norm": 0.1069527268409729,
"learning_rate": 8.16101539554131e-05,
"loss": 1.0503,
"step": 1733
},
{
"epoch": 1.11,
"grad_norm": 0.2915920317173004,
"learning_rate": 8.150888418246069e-05,
"loss": 1.1127,
"step": 1734
},
{
"epoch": 1.11,
"grad_norm": 0.10358176380395889,
"learning_rate": 8.140763404075935e-05,
"loss": 1.192,
"step": 1735
},
{
"epoch": 1.11,
"grad_norm": 0.09029901027679443,
"learning_rate": 8.130640363780212e-05,
"loss": 1.0735,
"step": 1736
},
{
"epoch": 1.11,
"grad_norm": 0.12607234716415405,
"learning_rate": 8.120519308106114e-05,
"loss": 1.2728,
"step": 1737
},
{
"epoch": 1.11,
"grad_norm": 0.1002657562494278,
"learning_rate": 8.11040024779875e-05,
"loss": 1.0616,
"step": 1738
},
{
"epoch": 1.11,
"grad_norm": 0.10784013569355011,
"learning_rate": 8.1002831936011e-05,
"loss": 1.1291,
"step": 1739
},
{
"epoch": 1.11,
"grad_norm": 0.10209915041923523,
"learning_rate": 8.090168156254024e-05,
"loss": 1.1188,
"step": 1740
},
{
"epoch": 1.11,
"grad_norm": 0.11990582942962646,
"learning_rate": 8.080055146496237e-05,
"loss": 1.1925,
"step": 1741
},
{
"epoch": 1.11,
"grad_norm": 0.09590277820825577,
"learning_rate": 8.069944175064309e-05,
"loss": 1.0407,
"step": 1742
},
{
"epoch": 1.11,
"grad_norm": 0.11796679347753525,
"learning_rate": 8.059835252692627e-05,
"loss": 1.1229,
"step": 1743
},
{
"epoch": 1.12,
"grad_norm": 0.11333147436380386,
"learning_rate": 8.049728390113422e-05,
"loss": 1.0275,
"step": 1744
},
{
"epoch": 1.12,
"grad_norm": 0.10042162239551544,
"learning_rate": 8.039623598056732e-05,
"loss": 1.0375,
"step": 1745
},
{
"epoch": 1.12,
"grad_norm": 0.10519666224718094,
"learning_rate": 8.029520887250396e-05,
"loss": 1.2391,
"step": 1746
},
{
"epoch": 1.12,
"grad_norm": 0.12084035575389862,
"learning_rate": 8.019420268420035e-05,
"loss": 1.1266,
"step": 1747
},
{
"epoch": 1.12,
"grad_norm": 0.11919303238391876,
"learning_rate": 8.009321752289067e-05,
"loss": 1.4104,
"step": 1748
},
{
"epoch": 1.12,
"grad_norm": 0.08715026825666428,
"learning_rate": 7.999225349578663e-05,
"loss": 1.1007,
"step": 1749
},
{
"epoch": 1.12,
"grad_norm": 0.11018751561641693,
"learning_rate": 7.98913107100776e-05,
"loss": 1.3332,
"step": 1750
},
{
"epoch": 1.12,
"grad_norm": 0.09563156217336655,
"learning_rate": 7.979038927293029e-05,
"loss": 0.8596,
"step": 1751
},
{
"epoch": 1.12,
"grad_norm": 0.10469130426645279,
"learning_rate": 7.968948929148882e-05,
"loss": 0.7849,
"step": 1752
},
{
"epoch": 1.12,
"grad_norm": 0.11450375616550446,
"learning_rate": 7.958861087287455e-05,
"loss": 1.1808,
"step": 1753
},
{
"epoch": 1.12,
"grad_norm": 0.10223834216594696,
"learning_rate": 7.94877541241859e-05,
"loss": 1.0672,
"step": 1754
},
{
"epoch": 1.12,
"grad_norm": 0.09747231006622314,
"learning_rate": 7.938691915249826e-05,
"loss": 1.1788,
"step": 1755
},
{
"epoch": 1.12,
"grad_norm": 0.08669831603765488,
"learning_rate": 7.928610606486399e-05,
"loss": 0.9301,
"step": 1756
},
{
"epoch": 1.12,
"grad_norm": 0.08597883582115173,
"learning_rate": 7.918531496831213e-05,
"loss": 1.0367,
"step": 1757
},
{
"epoch": 1.12,
"grad_norm": 0.10345563292503357,
"learning_rate": 7.908454596984845e-05,
"loss": 0.991,
"step": 1758
},
{
"epoch": 1.13,
"grad_norm": 0.11869390308856964,
"learning_rate": 7.898379917645517e-05,
"loss": 1.0102,
"step": 1759
},
{
"epoch": 1.13,
"grad_norm": 0.11502056568861008,
"learning_rate": 7.888307469509102e-05,
"loss": 1.2568,
"step": 1760
},
{
"epoch": 1.13,
"grad_norm": 0.10966496169567108,
"learning_rate": 7.878237263269097e-05,
"loss": 1.2016,
"step": 1761
},
{
"epoch": 1.13,
"grad_norm": 0.10536835342645645,
"learning_rate": 7.86816930961663e-05,
"loss": 1.1956,
"step": 1762
},
{
"epoch": 1.13,
"grad_norm": 0.10817115008831024,
"learning_rate": 7.858103619240422e-05,
"loss": 1.1229,
"step": 1763
},
{
"epoch": 1.13,
"grad_norm": 0.09182746708393097,
"learning_rate": 7.848040202826797e-05,
"loss": 1.3064,
"step": 1764
},
{
"epoch": 1.13,
"grad_norm": 0.10925756394863129,
"learning_rate": 7.837979071059676e-05,
"loss": 1.2452,
"step": 1765
},
{
"epoch": 1.13,
"grad_norm": 0.09588959068059921,
"learning_rate": 7.827920234620544e-05,
"loss": 0.7358,
"step": 1766
},
{
"epoch": 1.13,
"grad_norm": 0.10715386271476746,
"learning_rate": 7.817863704188444e-05,
"loss": 1.1056,
"step": 1767
},
{
"epoch": 1.13,
"grad_norm": 0.0860549658536911,
"learning_rate": 7.807809490439983e-05,
"loss": 0.9436,
"step": 1768
},
{
"epoch": 1.13,
"grad_norm": 0.08202296495437622,
"learning_rate": 7.7977576040493e-05,
"loss": 1.0748,
"step": 1769
},
{
"epoch": 1.13,
"grad_norm": 0.11034297198057175,
"learning_rate": 7.78770805568807e-05,
"loss": 1.2058,
"step": 1770
},
{
"epoch": 1.13,
"grad_norm": 0.10076629370450974,
"learning_rate": 7.777660856025478e-05,
"loss": 1.167,
"step": 1771
},
{
"epoch": 1.13,
"grad_norm": 0.10119215399026871,
"learning_rate": 7.767616015728219e-05,
"loss": 1.0581,
"step": 1772
},
{
"epoch": 1.13,
"grad_norm": 0.09804743528366089,
"learning_rate": 7.757573545460487e-05,
"loss": 0.9243,
"step": 1773
},
{
"epoch": 1.14,
"grad_norm": 0.11209340393543243,
"learning_rate": 7.747533455883955e-05,
"loss": 0.8687,
"step": 1774
},
{
"epoch": 1.14,
"grad_norm": 0.11314871907234192,
"learning_rate": 7.737495757657768e-05,
"loss": 1.2354,
"step": 1775
},
{
"epoch": 1.14,
"grad_norm": 0.09508980065584183,
"learning_rate": 7.727460461438535e-05,
"loss": 1.196,
"step": 1776
},
{
"epoch": 1.14,
"grad_norm": 0.09334195405244827,
"learning_rate": 7.717427577880312e-05,
"loss": 1.1237,
"step": 1777
},
{
"epoch": 1.14,
"grad_norm": 0.10466314107179642,
"learning_rate": 7.707397117634603e-05,
"loss": 0.8376,
"step": 1778
},
{
"epoch": 1.14,
"grad_norm": 0.0980396568775177,
"learning_rate": 7.697369091350325e-05,
"loss": 0.8361,
"step": 1779
},
{
"epoch": 1.14,
"grad_norm": 0.08817454427480698,
"learning_rate": 7.687343509673816e-05,
"loss": 1.1389,
"step": 1780
},
{
"epoch": 1.14,
"grad_norm": 0.1025627925992012,
"learning_rate": 7.677320383248825e-05,
"loss": 1.1257,
"step": 1781
},
{
"epoch": 1.14,
"grad_norm": 0.0950944647192955,
"learning_rate": 7.667299722716493e-05,
"loss": 1.094,
"step": 1782
},
{
"epoch": 1.14,
"grad_norm": 0.09257030487060547,
"learning_rate": 7.65728153871533e-05,
"loss": 1.0561,
"step": 1783
},
{
"epoch": 1.14,
"grad_norm": 0.11526904255151749,
"learning_rate": 7.647265841881233e-05,
"loss": 1.0956,
"step": 1784
},
{
"epoch": 1.14,
"grad_norm": 0.11672840267419815,
"learning_rate": 7.637252642847452e-05,
"loss": 1.0245,
"step": 1785
},
{
"epoch": 1.14,
"grad_norm": 0.10257334262132645,
"learning_rate": 7.627241952244587e-05,
"loss": 1.0203,
"step": 1786
},
{
"epoch": 1.14,
"grad_norm": 0.09029462188482285,
"learning_rate": 7.617233780700568e-05,
"loss": 1.066,
"step": 1787
},
{
"epoch": 1.14,
"grad_norm": 0.09867659211158752,
"learning_rate": 7.607228138840658e-05,
"loss": 1.0679,
"step": 1788
},
{
"epoch": 1.14,
"grad_norm": 0.09823356568813324,
"learning_rate": 7.597225037287433e-05,
"loss": 1.177,
"step": 1789
},
{
"epoch": 1.15,
"grad_norm": 0.10935312509536743,
"learning_rate": 7.587224486660771e-05,
"loss": 1.0442,
"step": 1790
},
{
"epoch": 1.15,
"grad_norm": 0.10251200199127197,
"learning_rate": 7.577226497577841e-05,
"loss": 0.8364,
"step": 1791
},
{
"epoch": 1.15,
"grad_norm": 0.10783925652503967,
"learning_rate": 7.567231080653096e-05,
"loss": 1.0555,
"step": 1792
},
{
"epoch": 1.15,
"grad_norm": 0.09637187421321869,
"learning_rate": 7.557238246498251e-05,
"loss": 1.1436,
"step": 1793
},
{
"epoch": 1.15,
"grad_norm": 0.09876862168312073,
"learning_rate": 7.547248005722291e-05,
"loss": 0.9666,
"step": 1794
},
{
"epoch": 1.15,
"grad_norm": 0.10127399116754532,
"learning_rate": 7.537260368931434e-05,
"loss": 1.0503,
"step": 1795
},
{
"epoch": 1.15,
"grad_norm": 0.0992555096745491,
"learning_rate": 7.52727534672914e-05,
"loss": 1.172,
"step": 1796
},
{
"epoch": 1.15,
"grad_norm": 0.11890536546707153,
"learning_rate": 7.517292949716095e-05,
"loss": 1.2909,
"step": 1797
},
{
"epoch": 1.15,
"grad_norm": 0.11542758345603943,
"learning_rate": 7.507313188490201e-05,
"loss": 1.0816,
"step": 1798
},
{
"epoch": 1.15,
"grad_norm": 0.09621984511613846,
"learning_rate": 7.497336073646548e-05,
"loss": 1.2692,
"step": 1799
},
{
"epoch": 1.15,
"grad_norm": 0.11845225840806961,
"learning_rate": 7.48736161577743e-05,
"loss": 1.0897,
"step": 1800
},
{
"epoch": 1.15,
"grad_norm": 0.10763073712587357,
"learning_rate": 7.477389825472314e-05,
"loss": 1.2183,
"step": 1801
},
{
"epoch": 1.15,
"grad_norm": 0.111964650452137,
"learning_rate": 7.46742071331784e-05,
"loss": 1.074,
"step": 1802
},
{
"epoch": 1.15,
"grad_norm": 0.12040800601243973,
"learning_rate": 7.457454289897796e-05,
"loss": 1.103,
"step": 1803
},
{
"epoch": 1.15,
"grad_norm": 0.1029847040772438,
"learning_rate": 7.447490565793121e-05,
"loss": 1.0382,
"step": 1804
},
{
"epoch": 1.16,
"grad_norm": 0.1057555302977562,
"learning_rate": 7.43752955158189e-05,
"loss": 1.0769,
"step": 1805
},
{
"epoch": 1.16,
"grad_norm": 0.0991426482796669,
"learning_rate": 7.427571257839302e-05,
"loss": 0.6145,
"step": 1806
},
{
"epoch": 1.16,
"grad_norm": 0.08941187709569931,
"learning_rate": 7.417615695137658e-05,
"loss": 1.083,
"step": 1807
},
{
"epoch": 1.16,
"grad_norm": 0.11119288951158524,
"learning_rate": 7.407662874046368e-05,
"loss": 1.0702,
"step": 1808
},
{
"epoch": 1.16,
"grad_norm": 0.12452839314937592,
"learning_rate": 7.397712805131932e-05,
"loss": 1.1949,
"step": 1809
},
{
"epoch": 1.16,
"grad_norm": 0.08561979234218597,
"learning_rate": 7.387765498957924e-05,
"loss": 1.0292,
"step": 1810
},
{
"epoch": 1.16,
"grad_norm": 0.1040244922041893,
"learning_rate": 7.377820966084986e-05,
"loss": 0.9392,
"step": 1811
},
{
"epoch": 1.16,
"grad_norm": 0.12474718689918518,
"learning_rate": 7.367879217070816e-05,
"loss": 1.1049,
"step": 1812
},
{
"epoch": 1.16,
"grad_norm": 0.09586942195892334,
"learning_rate": 7.357940262470157e-05,
"loss": 1.1812,
"step": 1813
},
{
"epoch": 1.16,
"grad_norm": 0.093437060713768,
"learning_rate": 7.348004112834791e-05,
"loss": 1.1362,
"step": 1814
},
{
"epoch": 1.16,
"grad_norm": 0.11114737391471863,
"learning_rate": 7.338070778713509e-05,
"loss": 1.1167,
"step": 1815
},
{
"epoch": 1.16,
"grad_norm": 0.08943517506122589,
"learning_rate": 7.328140270652117e-05,
"loss": 1.1796,
"step": 1816
},
{
"epoch": 1.16,
"grad_norm": 0.10309294611215591,
"learning_rate": 7.318212599193432e-05,
"loss": 1.2328,
"step": 1817
},
{
"epoch": 1.16,
"grad_norm": 0.0918801948428154,
"learning_rate": 7.308287774877254e-05,
"loss": 1.0734,
"step": 1818
},
{
"epoch": 1.16,
"grad_norm": 0.1139645054936409,
"learning_rate": 7.298365808240342e-05,
"loss": 1.4055,
"step": 1819
},
{
"epoch": 1.16,
"grad_norm": 0.10042014718055725,
"learning_rate": 7.28844670981645e-05,
"loss": 1.1049,
"step": 1820
},
{
"epoch": 1.17,
"grad_norm": 0.0958545058965683,
"learning_rate": 7.278530490136268e-05,
"loss": 1.0721,
"step": 1821
},
{
"epoch": 1.17,
"grad_norm": 0.09955289214849472,
"learning_rate": 7.26861715972744e-05,
"loss": 0.8481,
"step": 1822
},
{
"epoch": 1.17,
"grad_norm": 0.10987333953380585,
"learning_rate": 7.258706729114533e-05,
"loss": 1.0024,
"step": 1823
},
{
"epoch": 1.17,
"grad_norm": 0.10780365765094757,
"learning_rate": 7.248799208819042e-05,
"loss": 0.9831,
"step": 1824
},
{
"epoch": 1.17,
"grad_norm": 0.10277656465768814,
"learning_rate": 7.23889460935937e-05,
"loss": 1.3103,
"step": 1825
},
{
"epoch": 1.17,
"grad_norm": 0.10252419859170914,
"learning_rate": 7.228992941250822e-05,
"loss": 1.1921,
"step": 1826
},
{
"epoch": 1.17,
"grad_norm": 0.11087819933891296,
"learning_rate": 7.219094215005585e-05,
"loss": 1.234,
"step": 1827
},
{
"epoch": 1.17,
"grad_norm": 0.10554228723049164,
"learning_rate": 7.209198441132729e-05,
"loss": 1.1149,
"step": 1828
},
{
"epoch": 1.17,
"grad_norm": 0.09352140873670578,
"learning_rate": 7.199305630138185e-05,
"loss": 1.2931,
"step": 1829
},
{
"epoch": 1.17,
"grad_norm": 0.10124289989471436,
"learning_rate": 7.189415792524742e-05,
"loss": 1.187,
"step": 1830
},
{
"epoch": 1.17,
"grad_norm": 0.10797513276338577,
"learning_rate": 7.179528938792031e-05,
"loss": 1.2304,
"step": 1831
},
{
"epoch": 1.17,
"grad_norm": 0.1394849568605423,
"learning_rate": 7.169645079436509e-05,
"loss": 1.1145,
"step": 1832
},
{
"epoch": 1.17,
"grad_norm": 0.11761580407619476,
"learning_rate": 7.159764224951468e-05,
"loss": 1.2356,
"step": 1833
},
{
"epoch": 1.17,
"grad_norm": 0.10669250786304474,
"learning_rate": 7.149886385827e-05,
"loss": 1.1634,
"step": 1834
},
{
"epoch": 1.17,
"grad_norm": 0.10505758970975876,
"learning_rate": 7.140011572549989e-05,
"loss": 1.205,
"step": 1835
},
{
"epoch": 1.18,
"grad_norm": 0.11049558222293854,
"learning_rate": 7.130139795604125e-05,
"loss": 1.238,
"step": 1836
},
{
"epoch": 1.18,
"grad_norm": 0.11414950340986252,
"learning_rate": 7.12027106546986e-05,
"loss": 0.9981,
"step": 1837
},
{
"epoch": 1.18,
"grad_norm": 0.08454291522502899,
"learning_rate": 7.110405392624416e-05,
"loss": 0.9491,
"step": 1838
},
{
"epoch": 1.18,
"grad_norm": 0.09835947304964066,
"learning_rate": 7.100542787541766e-05,
"loss": 1.0881,
"step": 1839
},
{
"epoch": 1.18,
"grad_norm": 0.09932353347539902,
"learning_rate": 7.090683260692634e-05,
"loss": 1.202,
"step": 1840
},
{
"epoch": 1.18,
"grad_norm": 0.09155373275279999,
"learning_rate": 7.080826822544468e-05,
"loss": 1.0645,
"step": 1841
},
{
"epoch": 1.18,
"grad_norm": 0.10137968510389328,
"learning_rate": 7.070973483561443e-05,
"loss": 1.0198,
"step": 1842
},
{
"epoch": 1.18,
"grad_norm": 0.10164333134889603,
"learning_rate": 7.061123254204434e-05,
"loss": 1.1857,
"step": 1843
},
{
"epoch": 1.18,
"grad_norm": 0.11338160932064056,
"learning_rate": 7.051276144931025e-05,
"loss": 1.0085,
"step": 1844
},
{
"epoch": 1.18,
"grad_norm": 0.11890163272619247,
"learning_rate": 7.041432166195485e-05,
"loss": 1.5122,
"step": 1845
},
{
"epoch": 1.18,
"grad_norm": 0.10228203982114792,
"learning_rate": 7.03159132844876e-05,
"loss": 1.0483,
"step": 1846
},
{
"epoch": 1.18,
"grad_norm": 0.11418533325195312,
"learning_rate": 7.02175364213845e-05,
"loss": 1.2067,
"step": 1847
},
{
"epoch": 1.18,
"grad_norm": 0.11347746104001999,
"learning_rate": 7.011919117708828e-05,
"loss": 1.2764,
"step": 1848
},
{
"epoch": 1.18,
"grad_norm": 0.10556711256504059,
"learning_rate": 7.002087765600794e-05,
"loss": 1.4174,
"step": 1849
},
{
"epoch": 1.18,
"grad_norm": 0.1658787876367569,
"learning_rate": 6.992259596251897e-05,
"loss": 1.1988,
"step": 1850
},
{
"epoch": 1.19,
"grad_norm": 0.10474526137113571,
"learning_rate": 6.982434620096288e-05,
"loss": 1.1092,
"step": 1851
},
{
"epoch": 1.19,
"grad_norm": 0.12340165674686432,
"learning_rate": 6.972612847564736e-05,
"loss": 1.3335,
"step": 1852
},
{
"epoch": 1.19,
"grad_norm": 0.10219371318817139,
"learning_rate": 6.962794289084616e-05,
"loss": 1.3216,
"step": 1853
},
{
"epoch": 1.19,
"grad_norm": 0.08388462662696838,
"learning_rate": 6.952978955079885e-05,
"loss": 1.0066,
"step": 1854
},
{
"epoch": 1.19,
"grad_norm": 0.10680616647005081,
"learning_rate": 6.943166855971066e-05,
"loss": 1.0262,
"step": 1855
},
{
"epoch": 1.19,
"grad_norm": 0.10939647257328033,
"learning_rate": 6.933358002175268e-05,
"loss": 1.0341,
"step": 1856
},
{
"epoch": 1.19,
"grad_norm": 0.11564091593027115,
"learning_rate": 6.923552404106142e-05,
"loss": 1.2827,
"step": 1857
},
{
"epoch": 1.19,
"grad_norm": 0.12150295078754425,
"learning_rate": 6.913750072173884e-05,
"loss": 1.0269,
"step": 1858
},
{
"epoch": 1.19,
"grad_norm": 0.09134622663259506,
"learning_rate": 6.903951016785222e-05,
"loss": 1.0652,
"step": 1859
},
{
"epoch": 1.19,
"grad_norm": 0.11072158068418503,
"learning_rate": 6.894155248343409e-05,
"loss": 0.9662,
"step": 1860
},
{
"epoch": 1.19,
"grad_norm": 0.0969066172838211,
"learning_rate": 6.884362777248206e-05,
"loss": 0.8547,
"step": 1861
},
{
"epoch": 1.19,
"grad_norm": 0.10480539500713348,
"learning_rate": 6.874573613895872e-05,
"loss": 0.9799,
"step": 1862
},
{
"epoch": 1.19,
"grad_norm": 0.10491026937961578,
"learning_rate": 6.864787768679157e-05,
"loss": 1.093,
"step": 1863
},
{
"epoch": 1.19,
"grad_norm": 0.11408980190753937,
"learning_rate": 6.855005251987288e-05,
"loss": 1.2147,
"step": 1864
},
{
"epoch": 1.19,
"grad_norm": 0.1164758950471878,
"learning_rate": 6.845226074205954e-05,
"loss": 1.3426,
"step": 1865
},
{
"epoch": 1.19,
"grad_norm": 0.09609007090330124,
"learning_rate": 6.835450245717308e-05,
"loss": 1.1996,
"step": 1866
},
{
"epoch": 1.2,
"grad_norm": 0.09114314615726471,
"learning_rate": 6.825677776899941e-05,
"loss": 1.0326,
"step": 1867
},
{
"epoch": 1.2,
"grad_norm": 0.10207358747720718,
"learning_rate": 6.815908678128871e-05,
"loss": 1.2204,
"step": 1868
},
{
"epoch": 1.2,
"grad_norm": 0.09515099972486496,
"learning_rate": 6.806142959775552e-05,
"loss": 1.1014,
"step": 1869
},
{
"epoch": 1.2,
"grad_norm": 0.12066389620304108,
"learning_rate": 6.79638063220784e-05,
"loss": 0.9467,
"step": 1870
},
{
"epoch": 1.2,
"grad_norm": 0.10968538373708725,
"learning_rate": 6.786621705789998e-05,
"loss": 0.9573,
"step": 1871
},
{
"epoch": 1.2,
"grad_norm": 0.0835646539926529,
"learning_rate": 6.776866190882665e-05,
"loss": 0.992,
"step": 1872
},
{
"epoch": 1.2,
"grad_norm": 0.11505427211523056,
"learning_rate": 6.767114097842873e-05,
"loss": 1.1258,
"step": 1873
},
{
"epoch": 1.2,
"grad_norm": 0.09720007330179214,
"learning_rate": 6.757365437024011e-05,
"loss": 0.9342,
"step": 1874
},
{
"epoch": 1.2,
"grad_norm": 0.1192919984459877,
"learning_rate": 6.747620218775835e-05,
"loss": 1.1094,
"step": 1875
},
{
"epoch": 1.2,
"grad_norm": 0.10996134579181671,
"learning_rate": 6.737878453444429e-05,
"loss": 1.1841,
"step": 1876
},
{
"epoch": 1.2,
"grad_norm": 0.11342544853687286,
"learning_rate": 6.728140151372229e-05,
"loss": 1.1728,
"step": 1877
},
{
"epoch": 1.2,
"grad_norm": 0.09665987640619278,
"learning_rate": 6.718405322897983e-05,
"loss": 1.1124,
"step": 1878
},
{
"epoch": 1.2,
"grad_norm": 0.10993858426809311,
"learning_rate": 6.708673978356759e-05,
"loss": 0.9636,
"step": 1879
},
{
"epoch": 1.2,
"grad_norm": 0.10078933835029602,
"learning_rate": 6.698946128079917e-05,
"loss": 1.2373,
"step": 1880
},
{
"epoch": 1.2,
"grad_norm": 0.10155204683542252,
"learning_rate": 6.689221782395115e-05,
"loss": 0.9629,
"step": 1881
},
{
"epoch": 1.21,
"grad_norm": 0.1060449406504631,
"learning_rate": 6.679500951626283e-05,
"loss": 0.8571,
"step": 1882
},
{
"epoch": 1.21,
"grad_norm": 0.11292152851819992,
"learning_rate": 6.669783646093635e-05,
"loss": 1.2698,
"step": 1883
},
{
"epoch": 1.21,
"grad_norm": 0.11200407147407532,
"learning_rate": 6.660069876113621e-05,
"loss": 0.9915,
"step": 1884
},
{
"epoch": 1.21,
"grad_norm": 0.11458922177553177,
"learning_rate": 6.65035965199895e-05,
"loss": 1.1818,
"step": 1885
},
{
"epoch": 1.21,
"grad_norm": 0.0975421816110611,
"learning_rate": 6.640652984058566e-05,
"loss": 1.0658,
"step": 1886
},
{
"epoch": 1.21,
"grad_norm": 0.09911008924245834,
"learning_rate": 6.63094988259764e-05,
"loss": 1.2639,
"step": 1887
},
{
"epoch": 1.21,
"grad_norm": 0.11829882115125656,
"learning_rate": 6.621250357917545e-05,
"loss": 1.1303,
"step": 1888
},
{
"epoch": 1.21,
"grad_norm": 0.09364532679319382,
"learning_rate": 6.611554420315868e-05,
"loss": 1.1184,
"step": 1889
},
{
"epoch": 1.21,
"grad_norm": 0.09063845872879028,
"learning_rate": 6.601862080086383e-05,
"loss": 0.9681,
"step": 1890
},
{
"epoch": 1.21,
"grad_norm": 0.12965691089630127,
"learning_rate": 6.592173347519048e-05,
"loss": 1.0785,
"step": 1891
},
{
"epoch": 1.21,
"grad_norm": 0.10060100257396698,
"learning_rate": 6.582488232899986e-05,
"loss": 1.2392,
"step": 1892
},
{
"epoch": 1.21,
"grad_norm": 0.11565978080034256,
"learning_rate": 6.572806746511481e-05,
"loss": 1.2867,
"step": 1893
},
{
"epoch": 1.21,
"grad_norm": 0.09936638921499252,
"learning_rate": 6.563128898631968e-05,
"loss": 1.1509,
"step": 1894
},
{
"epoch": 1.21,
"grad_norm": 0.12283065915107727,
"learning_rate": 6.55345469953602e-05,
"loss": 1.0608,
"step": 1895
},
{
"epoch": 1.21,
"grad_norm": 0.10678443312644958,
"learning_rate": 6.543784159494323e-05,
"loss": 1.0883,
"step": 1896
},
{
"epoch": 1.22,
"grad_norm": 0.11203552782535553,
"learning_rate": 6.534117288773699e-05,
"loss": 1.2262,
"step": 1897
},
{
"epoch": 1.22,
"grad_norm": 0.3979748785495758,
"learning_rate": 6.524454097637057e-05,
"loss": 0.8822,
"step": 1898
},
{
"epoch": 1.22,
"grad_norm": 0.11394743621349335,
"learning_rate": 6.514794596343414e-05,
"loss": 1.2171,
"step": 1899
},
{
"epoch": 1.22,
"grad_norm": 0.09650028496980667,
"learning_rate": 6.505138795147853e-05,
"loss": 1.176,
"step": 1900
},
{
"epoch": 1.22,
"grad_norm": 0.10146728157997131,
"learning_rate": 6.495486704301539e-05,
"loss": 1.0829,
"step": 1901
},
{
"epoch": 1.22,
"grad_norm": 0.09876731038093567,
"learning_rate": 6.485838334051703e-05,
"loss": 0.9313,
"step": 1902
},
{
"epoch": 1.22,
"grad_norm": 0.10412513464689255,
"learning_rate": 6.476193694641619e-05,
"loss": 1.1781,
"step": 1903
},
{
"epoch": 1.22,
"grad_norm": 0.11426477134227753,
"learning_rate": 6.466552796310594e-05,
"loss": 1.1394,
"step": 1904
},
{
"epoch": 1.22,
"grad_norm": 0.1314503699541092,
"learning_rate": 6.456915649293975e-05,
"loss": 1.3437,
"step": 1905
},
{
"epoch": 1.22,
"grad_norm": 0.10625962913036346,
"learning_rate": 6.44728226382312e-05,
"loss": 0.8359,
"step": 1906
},
{
"epoch": 1.22,
"grad_norm": 0.10714507848024368,
"learning_rate": 6.437652650125398e-05,
"loss": 1.1026,
"step": 1907
},
{
"epoch": 1.22,
"grad_norm": 0.11312135308980942,
"learning_rate": 6.428026818424166e-05,
"loss": 1.0505,
"step": 1908
},
{
"epoch": 1.22,
"grad_norm": 0.09673094004392624,
"learning_rate": 6.418404778938773e-05,
"loss": 1.0523,
"step": 1909
},
{
"epoch": 1.22,
"grad_norm": 0.1013663113117218,
"learning_rate": 6.408786541884539e-05,
"loss": 0.8973,
"step": 1910
},
{
"epoch": 1.22,
"grad_norm": 0.11602221429347992,
"learning_rate": 6.399172117472751e-05,
"loss": 1.1212,
"step": 1911
},
{
"epoch": 1.22,
"grad_norm": 0.10436409711837769,
"learning_rate": 6.389561515910638e-05,
"loss": 1.1116,
"step": 1912
},
{
"epoch": 1.23,
"grad_norm": 0.12416583299636841,
"learning_rate": 6.37995474740138e-05,
"loss": 1.2028,
"step": 1913
},
{
"epoch": 1.23,
"grad_norm": 0.12245836108922958,
"learning_rate": 6.370351822144087e-05,
"loss": 1.0978,
"step": 1914
},
{
"epoch": 1.23,
"grad_norm": 0.09418083727359772,
"learning_rate": 6.360752750333785e-05,
"loss": 1.1708,
"step": 1915
},
{
"epoch": 1.23,
"grad_norm": 0.14150846004486084,
"learning_rate": 6.35115754216141e-05,
"loss": 1.1094,
"step": 1916
},
{
"epoch": 1.23,
"grad_norm": 0.10404966026544571,
"learning_rate": 6.341566207813798e-05,
"loss": 1.2364,
"step": 1917
},
{
"epoch": 1.23,
"grad_norm": 0.11477816849946976,
"learning_rate": 6.331978757473666e-05,
"loss": 1.1323,
"step": 1918
},
{
"epoch": 1.23,
"grad_norm": 0.1599774807691574,
"learning_rate": 6.322395201319625e-05,
"loss": 1.3589,
"step": 1919
},
{
"epoch": 1.23,
"grad_norm": 0.10674341768026352,
"learning_rate": 6.312815549526128e-05,
"loss": 0.8621,
"step": 1920
},
{
"epoch": 1.23,
"grad_norm": 0.10059361904859543,
"learning_rate": 6.303239812263493e-05,
"loss": 1.3208,
"step": 1921
},
{
"epoch": 1.23,
"grad_norm": 0.10563770681619644,
"learning_rate": 6.29366799969789e-05,
"loss": 1.2563,
"step": 1922
},
{
"epoch": 1.23,
"grad_norm": 0.11859140545129776,
"learning_rate": 6.284100121991319e-05,
"loss": 1.0299,
"step": 1923
},
{
"epoch": 1.23,
"grad_norm": 0.1109216958284378,
"learning_rate": 6.274536189301585e-05,
"loss": 1.1413,
"step": 1924
},
{
"epoch": 1.23,
"grad_norm": 0.09343459457159042,
"learning_rate": 6.264976211782329e-05,
"loss": 1.1803,
"step": 1925
},
{
"epoch": 1.23,
"grad_norm": 0.10926727950572968,
"learning_rate": 6.255420199582984e-05,
"loss": 1.0362,
"step": 1926
},
{
"epoch": 1.23,
"grad_norm": 0.13089515268802643,
"learning_rate": 6.245868162848769e-05,
"loss": 1.2881,
"step": 1927
},
{
"epoch": 1.24,
"grad_norm": 0.1073448583483696,
"learning_rate": 6.236320111720683e-05,
"loss": 1.1311,
"step": 1928
},
{
"epoch": 1.24,
"grad_norm": 0.11213389784097672,
"learning_rate": 6.226776056335498e-05,
"loss": 1.1203,
"step": 1929
},
{
"epoch": 1.24,
"grad_norm": 0.09957650303840637,
"learning_rate": 6.217236006825742e-05,
"loss": 1.0158,
"step": 1930
},
{
"epoch": 1.24,
"grad_norm": 0.11883754283189774,
"learning_rate": 6.207699973319694e-05,
"loss": 1.0173,
"step": 1931
},
{
"epoch": 1.24,
"grad_norm": 0.12860716879367828,
"learning_rate": 6.198167965941358e-05,
"loss": 0.9863,
"step": 1932
},
{
"epoch": 1.24,
"grad_norm": 0.10693139582872391,
"learning_rate": 6.188639994810476e-05,
"loss": 1.0937,
"step": 1933
},
{
"epoch": 1.24,
"grad_norm": 0.09557534009218216,
"learning_rate": 6.179116070042495e-05,
"loss": 0.9764,
"step": 1934
},
{
"epoch": 1.24,
"grad_norm": 0.12166590243577957,
"learning_rate": 6.169596201748583e-05,
"loss": 1.2856,
"step": 1935
},
{
"epoch": 1.24,
"grad_norm": 0.09950409084558487,
"learning_rate": 6.160080400035575e-05,
"loss": 0.9587,
"step": 1936
},
{
"epoch": 1.24,
"grad_norm": 0.10558495670557022,
"learning_rate": 6.150568675006009e-05,
"loss": 1.1173,
"step": 1937
},
{
"epoch": 1.24,
"grad_norm": 0.10239928215742111,
"learning_rate": 6.141061036758091e-05,
"loss": 1.2519,
"step": 1938
},
{
"epoch": 1.24,
"grad_norm": 0.12042713165283203,
"learning_rate": 6.13155749538569e-05,
"loss": 0.9233,
"step": 1939
},
{
"epoch": 1.24,
"grad_norm": 0.11289887130260468,
"learning_rate": 6.122058060978308e-05,
"loss": 1.1007,
"step": 1940
},
{
"epoch": 1.24,
"grad_norm": 0.11658722907304764,
"learning_rate": 6.112562743621114e-05,
"loss": 1.0938,
"step": 1941
},
{
"epoch": 1.24,
"grad_norm": 0.12887884676456451,
"learning_rate": 6.1030715533948854e-05,
"loss": 1.2651,
"step": 1942
},
{
"epoch": 1.24,
"grad_norm": 0.10279668867588043,
"learning_rate": 6.093584500376033e-05,
"loss": 0.9434,
"step": 1943
},
{
"epoch": 1.25,
"grad_norm": 0.08670208603143692,
"learning_rate": 6.0841015946365595e-05,
"loss": 0.9708,
"step": 1944
},
{
"epoch": 1.25,
"grad_norm": 0.08955523371696472,
"learning_rate": 6.074622846244077e-05,
"loss": 0.975,
"step": 1945
},
{
"epoch": 1.25,
"grad_norm": 0.09682302922010422,
"learning_rate": 6.0651482652617775e-05,
"loss": 1.197,
"step": 1946
},
{
"epoch": 1.25,
"grad_norm": 0.1102338582277298,
"learning_rate": 6.0556778617484365e-05,
"loss": 1.2363,
"step": 1947
},
{
"epoch": 1.25,
"grad_norm": 0.10501591116189957,
"learning_rate": 6.046211645758381e-05,
"loss": 0.968,
"step": 1948
},
{
"epoch": 1.25,
"grad_norm": 0.11749190837144852,
"learning_rate": 6.036749627341502e-05,
"loss": 1.3324,
"step": 1949
},
{
"epoch": 1.25,
"grad_norm": 0.11527553200721741,
"learning_rate": 6.027291816543237e-05,
"loss": 0.9384,
"step": 1950
},
{
"epoch": 1.25,
"grad_norm": 0.10384111851453781,
"learning_rate": 6.017838223404548e-05,
"loss": 1.1258,
"step": 1951
},
{
"epoch": 1.25,
"grad_norm": 0.1329168975353241,
"learning_rate": 6.008388857961922e-05,
"loss": 1.3529,
"step": 1952
},
{
"epoch": 1.25,
"grad_norm": 0.08886339515447617,
"learning_rate": 5.9989437302473595e-05,
"loss": 0.851,
"step": 1953
},
{
"epoch": 1.25,
"grad_norm": 0.12311293929815292,
"learning_rate": 5.9895028502883574e-05,
"loss": 1.394,
"step": 1954
},
{
"epoch": 1.25,
"grad_norm": 0.10424396395683289,
"learning_rate": 5.980066228107919e-05,
"loss": 1.0762,
"step": 1955
},
{
"epoch": 1.25,
"grad_norm": 0.12105081230401993,
"learning_rate": 5.9706338737245014e-05,
"loss": 1.0565,
"step": 1956
},
{
"epoch": 1.25,
"grad_norm": 0.1086994856595993,
"learning_rate": 5.961205797152047e-05,
"loss": 0.7959,
"step": 1957
},
{
"epoch": 1.25,
"grad_norm": 0.1301957368850708,
"learning_rate": 5.951782008399959e-05,
"loss": 1.3297,
"step": 1958
},
{
"epoch": 1.26,
"grad_norm": 0.11359301954507828,
"learning_rate": 5.9423625174730815e-05,
"loss": 1.229,
"step": 1959
},
{
"epoch": 1.26,
"grad_norm": 0.12482339143753052,
"learning_rate": 5.9329473343716925e-05,
"loss": 1.129,
"step": 1960
},
{
"epoch": 1.26,
"grad_norm": 0.08495151996612549,
"learning_rate": 5.9235364690915065e-05,
"loss": 0.954,
"step": 1961
},
{
"epoch": 1.26,
"grad_norm": 0.11496656388044357,
"learning_rate": 5.914129931623648e-05,
"loss": 1.1057,
"step": 1962
},
{
"epoch": 1.26,
"grad_norm": 0.09873928129673004,
"learning_rate": 5.904727731954649e-05,
"loss": 1.1929,
"step": 1963
},
{
"epoch": 1.26,
"grad_norm": 0.09060632437467575,
"learning_rate": 5.8953298800664315e-05,
"loss": 1.177,
"step": 1964
},
{
"epoch": 1.26,
"grad_norm": 0.11073742061853409,
"learning_rate": 5.885936385936307e-05,
"loss": 1.1504,
"step": 1965
},
{
"epoch": 1.26,
"grad_norm": 0.10112930834293365,
"learning_rate": 5.8765472595369594e-05,
"loss": 1.0616,
"step": 1966
},
{
"epoch": 1.26,
"grad_norm": 0.10309349000453949,
"learning_rate": 5.867162510836437e-05,
"loss": 1.2067,
"step": 1967
},
{
"epoch": 1.26,
"grad_norm": 0.11506038159132004,
"learning_rate": 5.857782149798135e-05,
"loss": 1.0837,
"step": 1968
},
{
"epoch": 1.26,
"grad_norm": 0.12611688673496246,
"learning_rate": 5.8484061863807924e-05,
"loss": 1.2255,
"step": 1969
},
{
"epoch": 1.26,
"grad_norm": 0.1064082682132721,
"learning_rate": 5.839034630538482e-05,
"loss": 0.9284,
"step": 1970
},
{
"epoch": 1.26,
"grad_norm": 0.09728314727544785,
"learning_rate": 5.8296674922206026e-05,
"loss": 1.0255,
"step": 1971
},
{
"epoch": 1.26,
"grad_norm": 0.09384610503911972,
"learning_rate": 5.820304781371851e-05,
"loss": 1.1645,
"step": 1972
},
{
"epoch": 1.26,
"grad_norm": 0.10468161106109619,
"learning_rate": 5.8109465079322265e-05,
"loss": 1.0497,
"step": 1973
},
{
"epoch": 1.27,
"grad_norm": 0.11066216975450516,
"learning_rate": 5.801592681837026e-05,
"loss": 1.1672,
"step": 1974
},
{
"epoch": 1.27,
"grad_norm": 0.1013823002576828,
"learning_rate": 5.7922433130168164e-05,
"loss": 1.0503,
"step": 1975
},
{
"epoch": 1.27,
"grad_norm": 0.10544838756322861,
"learning_rate": 5.782898411397431e-05,
"loss": 1.0823,
"step": 1976
},
{
"epoch": 1.27,
"grad_norm": 0.09770195186138153,
"learning_rate": 5.773557986899971e-05,
"loss": 0.8281,
"step": 1977
},
{
"epoch": 1.27,
"grad_norm": 0.11781159043312073,
"learning_rate": 5.764222049440771e-05,
"loss": 1.0883,
"step": 1978
},
{
"epoch": 1.27,
"grad_norm": 0.09178627282381058,
"learning_rate": 5.754890608931416e-05,
"loss": 0.9407,
"step": 1979
},
{
"epoch": 1.27,
"grad_norm": 0.1579505056142807,
"learning_rate": 5.7455636752787065e-05,
"loss": 1.0936,
"step": 1980
},
{
"epoch": 1.27,
"grad_norm": 0.12202049791812897,
"learning_rate": 5.736241258384658e-05,
"loss": 1.1957,
"step": 1981
},
{
"epoch": 1.27,
"grad_norm": 0.10151268541812897,
"learning_rate": 5.726923368146496e-05,
"loss": 1.1368,
"step": 1982
},
{
"epoch": 1.27,
"grad_norm": 0.11667834222316742,
"learning_rate": 5.7176100144566437e-05,
"loss": 1.244,
"step": 1983
},
{
"epoch": 1.27,
"grad_norm": 0.11503670364618301,
"learning_rate": 5.7083012072026974e-05,
"loss": 1.2413,
"step": 1984
},
{
"epoch": 1.27,
"grad_norm": 0.11644425988197327,
"learning_rate": 5.698996956267429e-05,
"loss": 1.2809,
"step": 1985
},
{
"epoch": 1.27,
"grad_norm": 0.10202714800834656,
"learning_rate": 5.689697271528783e-05,
"loss": 1.0215,
"step": 1986
},
{
"epoch": 1.27,
"grad_norm": 0.1217864528298378,
"learning_rate": 5.680402162859843e-05,
"loss": 1.3768,
"step": 1987
},
{
"epoch": 1.27,
"grad_norm": 0.15675802528858185,
"learning_rate": 5.6711116401288456e-05,
"loss": 0.9796,
"step": 1988
},
{
"epoch": 1.27,
"grad_norm": 0.1056087464094162,
"learning_rate": 5.661825713199151e-05,
"loss": 0.9911,
"step": 1989
},
{
"epoch": 1.28,
"grad_norm": 0.12184251844882965,
"learning_rate": 5.6525443919292396e-05,
"loss": 1.5061,
"step": 1990
},
{
"epoch": 1.28,
"grad_norm": 0.12193474918603897,
"learning_rate": 5.643267686172713e-05,
"loss": 1.1606,
"step": 1991
},
{
"epoch": 1.28,
"grad_norm": 0.09978966414928436,
"learning_rate": 5.6339956057782615e-05,
"loss": 0.9979,
"step": 1992
},
{
"epoch": 1.28,
"grad_norm": 0.12334888428449631,
"learning_rate": 5.624728160589663e-05,
"loss": 1.2239,
"step": 1993
},
{
"epoch": 1.28,
"grad_norm": 0.10410087555646896,
"learning_rate": 5.615465360445789e-05,
"loss": 0.8929,
"step": 1994
},
{
"epoch": 1.28,
"grad_norm": 0.12925098836421967,
"learning_rate": 5.6062072151805614e-05,
"loss": 0.9908,
"step": 1995
},
{
"epoch": 1.28,
"grad_norm": 0.09985952824354172,
"learning_rate": 5.596953734622978e-05,
"loss": 1.0413,
"step": 1996
},
{
"epoch": 1.28,
"grad_norm": 0.10836029797792435,
"learning_rate": 5.587704928597072e-05,
"loss": 0.9761,
"step": 1997
},
{
"epoch": 1.28,
"grad_norm": 0.1013987585902214,
"learning_rate": 5.578460806921912e-05,
"loss": 1.0686,
"step": 1998
},
{
"epoch": 1.28,
"grad_norm": 0.10120022296905518,
"learning_rate": 5.5692213794116045e-05,
"loss": 1.3073,
"step": 1999
},
{
"epoch": 1.28,
"grad_norm": 0.11423132568597794,
"learning_rate": 5.559986655875272e-05,
"loss": 1.292,
"step": 2000
},
{
"epoch": 1.28,
"grad_norm": 0.10096541047096252,
"learning_rate": 5.5507566461170236e-05,
"loss": 1.1845,
"step": 2001
},
{
"epoch": 1.28,
"grad_norm": 0.10352316498756409,
"learning_rate": 5.541531359935986e-05,
"loss": 1.1398,
"step": 2002
}
],
"logging_steps": 1,
"max_steps": 3078,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 154,
"total_flos": 1.5226989700536336e+19,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}