Weyaxi's picture
Upload folder using huggingface_hub
d62148f verified
raw
history blame
180 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0017714791851196,
"eval_steps": 500,
"global_step": 1131,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 15.385598322874909,
"learning_rate": 5.000000000000001e-07,
"loss": 0.7627,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 15.42812332406859,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.794,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 13.76934599903778,
"learning_rate": 1.5e-06,
"loss": 0.7894,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 7.9055471186770685,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7346,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 8.624179170790118,
"learning_rate": 2.5e-06,
"loss": 0.7458,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 37.14544394485457,
"learning_rate": 3e-06,
"loss": 0.8249,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 13.413192499879626,
"learning_rate": 3.5e-06,
"loss": 0.7692,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 11.194156755277431,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7724,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 8.569279640169995,
"learning_rate": 4.5e-06,
"loss": 0.7851,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 18.113903622060178,
"learning_rate": 5e-06,
"loss": 0.7874,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 7.486914001687124,
"learning_rate": 4.999997558722919e-06,
"loss": 0.7553,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 7.280219682440894,
"learning_rate": 4.999990234896445e-06,
"loss": 0.7095,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 4.3413734180304155,
"learning_rate": 4.99997802853488e-06,
"loss": 0.6916,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 5.756315245615391,
"learning_rate": 4.999960939662063e-06,
"loss": 0.7407,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 5.090553047874293,
"learning_rate": 4.999938968311371e-06,
"loss": 0.7387,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 5.8370558847287075,
"learning_rate": 4.9999121145257126e-06,
"loss": 0.7051,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 3.986658012877664,
"learning_rate": 4.999880378357535e-06,
"loss": 0.6871,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 4.141716122521651,
"learning_rate": 4.9998437598688195e-06,
"loss": 0.6694,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 4.729722439630604,
"learning_rate": 4.9998022591310815e-06,
"loss": 0.716,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 2.9486336901615497,
"learning_rate": 4.999755876225375e-06,
"loss": 0.6387,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 2.8336874650575745,
"learning_rate": 4.999704611242285e-06,
"loss": 0.6542,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 3.6724374918638905,
"learning_rate": 4.999648464281934e-06,
"loss": 0.6617,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 2.941494127880678,
"learning_rate": 4.999587435453979e-06,
"loss": 0.6687,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 2.6261822206464744,
"learning_rate": 4.999521524877608e-06,
"loss": 0.6634,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 2.8059947014946305,
"learning_rate": 4.999450732681549e-06,
"loss": 0.6901,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 3.131537494217822,
"learning_rate": 4.999375059004058e-06,
"loss": 0.6407,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 2.7893212245465837,
"learning_rate": 4.99929450399293e-06,
"loss": 0.6638,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 2.4411586751746,
"learning_rate": 4.999209067805487e-06,
"loss": 0.6196,
"step": 28
},
{
"epoch": 0.03,
"grad_norm": 2.8807261299944082,
"learning_rate": 4.999118750608591e-06,
"loss": 0.6839,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 2.879993804839069,
"learning_rate": 4.9990235525786326e-06,
"loss": 0.6484,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 2.604360711268946,
"learning_rate": 4.998923473901535e-06,
"loss": 0.6313,
"step": 31
},
{
"epoch": 0.03,
"grad_norm": 2.403225544767816,
"learning_rate": 4.9988185147727544e-06,
"loss": 0.6209,
"step": 32
},
{
"epoch": 0.03,
"grad_norm": 2.669567772543462,
"learning_rate": 4.998708675397278e-06,
"loss": 0.6068,
"step": 33
},
{
"epoch": 0.03,
"grad_norm": 2.443946495915797,
"learning_rate": 4.998593955989626e-06,
"loss": 0.6731,
"step": 34
},
{
"epoch": 0.03,
"grad_norm": 2.2104680876118317,
"learning_rate": 4.998474356773845e-06,
"loss": 0.6243,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 2.3602199264043957,
"learning_rate": 4.9983498779835175e-06,
"loss": 0.6649,
"step": 36
},
{
"epoch": 0.03,
"grad_norm": 2.4676911263240844,
"learning_rate": 4.998220519861752e-06,
"loss": 0.6174,
"step": 37
},
{
"epoch": 0.03,
"grad_norm": 2.3419026099030282,
"learning_rate": 4.998086282661188e-06,
"loss": 0.6123,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 2.14900736954254,
"learning_rate": 4.997947166643993e-06,
"loss": 0.63,
"step": 39
},
{
"epoch": 0.04,
"grad_norm": 2.570907426799795,
"learning_rate": 4.997803172081864e-06,
"loss": 0.6249,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 2.516952735669967,
"learning_rate": 4.997654299256026e-06,
"loss": 0.6727,
"step": 41
},
{
"epoch": 0.04,
"grad_norm": 2.1600457198543874,
"learning_rate": 4.997500548457231e-06,
"loss": 0.6719,
"step": 42
},
{
"epoch": 0.04,
"grad_norm": 2.2177572033934743,
"learning_rate": 4.997341919985756e-06,
"loss": 0.6148,
"step": 43
},
{
"epoch": 0.04,
"grad_norm": 2.397105205209689,
"learning_rate": 4.997178414151409e-06,
"loss": 0.6167,
"step": 44
},
{
"epoch": 0.04,
"grad_norm": 2.1254940534972167,
"learning_rate": 4.997010031273517e-06,
"loss": 0.6446,
"step": 45
},
{
"epoch": 0.04,
"grad_norm": 2.2113023791837194,
"learning_rate": 4.996836771680937e-06,
"loss": 0.6304,
"step": 46
},
{
"epoch": 0.04,
"grad_norm": 2.386446316275664,
"learning_rate": 4.99665863571205e-06,
"loss": 0.6621,
"step": 47
},
{
"epoch": 0.04,
"grad_norm": 2.1838934384314483,
"learning_rate": 4.996475623714756e-06,
"loss": 0.6214,
"step": 48
},
{
"epoch": 0.04,
"grad_norm": 2.2047933657923586,
"learning_rate": 4.996287736046485e-06,
"loss": 0.6478,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 2.208809457983808,
"learning_rate": 4.996094973074183e-06,
"loss": 0.6097,
"step": 50
},
{
"epoch": 0.05,
"grad_norm": 2.1318377198138267,
"learning_rate": 4.995897335174322e-06,
"loss": 0.622,
"step": 51
},
{
"epoch": 0.05,
"grad_norm": 2.0673034122993537,
"learning_rate": 4.995694822732893e-06,
"loss": 0.6036,
"step": 52
},
{
"epoch": 0.05,
"grad_norm": 2.195105312645423,
"learning_rate": 4.9954874361454055e-06,
"loss": 0.6052,
"step": 53
},
{
"epoch": 0.05,
"grad_norm": 2.157855029176061,
"learning_rate": 4.995275175816892e-06,
"loss": 0.6455,
"step": 54
},
{
"epoch": 0.05,
"grad_norm": 2.0500405783991043,
"learning_rate": 4.9950580421619e-06,
"loss": 0.6353,
"step": 55
},
{
"epoch": 0.05,
"grad_norm": 2.199629904296075,
"learning_rate": 4.9948360356044965e-06,
"loss": 0.6122,
"step": 56
},
{
"epoch": 0.05,
"grad_norm": 2.186847580161491,
"learning_rate": 4.994609156578267e-06,
"loss": 0.6073,
"step": 57
},
{
"epoch": 0.05,
"grad_norm": 2.0207512037097835,
"learning_rate": 4.994377405526308e-06,
"loss": 0.61,
"step": 58
},
{
"epoch": 0.05,
"grad_norm": 2.3170193964114976,
"learning_rate": 4.994140782901237e-06,
"loss": 0.6322,
"step": 59
},
{
"epoch": 0.05,
"grad_norm": 2.014785890436746,
"learning_rate": 4.9938992891651825e-06,
"loss": 0.6205,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 1.9538385063221935,
"learning_rate": 4.9936529247897854e-06,
"loss": 0.5992,
"step": 61
},
{
"epoch": 0.05,
"grad_norm": 2.084943826856202,
"learning_rate": 4.993401690256203e-06,
"loss": 0.6148,
"step": 62
},
{
"epoch": 0.06,
"grad_norm": 2.135158856581583,
"learning_rate": 4.9931455860551e-06,
"loss": 0.5937,
"step": 63
},
{
"epoch": 0.06,
"grad_norm": 1.982621418518698,
"learning_rate": 4.992884612686655e-06,
"loss": 0.6091,
"step": 64
},
{
"epoch": 0.06,
"grad_norm": 2.1030931953494956,
"learning_rate": 4.992618770660553e-06,
"loss": 0.6034,
"step": 65
},
{
"epoch": 0.06,
"grad_norm": 2.1994634556563994,
"learning_rate": 4.992348060495989e-06,
"loss": 0.5846,
"step": 66
},
{
"epoch": 0.06,
"grad_norm": 2.410691403277427,
"learning_rate": 4.992072482721669e-06,
"loss": 0.6294,
"step": 67
},
{
"epoch": 0.06,
"grad_norm": 1.9720494401999067,
"learning_rate": 4.991792037875799e-06,
"loss": 0.591,
"step": 68
},
{
"epoch": 0.06,
"grad_norm": 2.147504025949435,
"learning_rate": 4.991506726506094e-06,
"loss": 0.5689,
"step": 69
},
{
"epoch": 0.06,
"grad_norm": 2.1837702519904223,
"learning_rate": 4.991216549169776e-06,
"loss": 0.6422,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 2.0883865330274958,
"learning_rate": 4.9909215064335655e-06,
"loss": 0.6076,
"step": 71
},
{
"epoch": 0.06,
"grad_norm": 2.20727863923846,
"learning_rate": 4.990621598873687e-06,
"loss": 0.5974,
"step": 72
},
{
"epoch": 0.06,
"grad_norm": 2.0735330806418464,
"learning_rate": 4.990316827075868e-06,
"loss": 0.6809,
"step": 73
},
{
"epoch": 0.07,
"grad_norm": 2.0203203347538774,
"learning_rate": 4.990007191635334e-06,
"loss": 0.6107,
"step": 74
},
{
"epoch": 0.07,
"grad_norm": 2.234889365362174,
"learning_rate": 4.989692693156809e-06,
"loss": 0.6218,
"step": 75
},
{
"epoch": 0.07,
"grad_norm": 1.9902503343433904,
"learning_rate": 4.989373332254516e-06,
"loss": 0.6257,
"step": 76
},
{
"epoch": 0.07,
"grad_norm": 2.1041971507252466,
"learning_rate": 4.989049109552173e-06,
"loss": 0.5888,
"step": 77
},
{
"epoch": 0.07,
"grad_norm": 2.1151685783302123,
"learning_rate": 4.988720025682995e-06,
"loss": 0.6333,
"step": 78
},
{
"epoch": 0.07,
"grad_norm": 1.9223819269893592,
"learning_rate": 4.988386081289689e-06,
"loss": 0.6442,
"step": 79
},
{
"epoch": 0.07,
"grad_norm": 2.139676463756265,
"learning_rate": 4.988047277024456e-06,
"loss": 0.5966,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 2.1665820212993068,
"learning_rate": 4.987703613548988e-06,
"loss": 0.603,
"step": 81
},
{
"epoch": 0.07,
"grad_norm": 1.931456975470041,
"learning_rate": 4.987355091534467e-06,
"loss": 0.6122,
"step": 82
},
{
"epoch": 0.07,
"grad_norm": 2.134995092135601,
"learning_rate": 4.987001711661566e-06,
"loss": 0.6213,
"step": 83
},
{
"epoch": 0.07,
"grad_norm": 2.0173352657570818,
"learning_rate": 4.98664347462044e-06,
"loss": 0.5966,
"step": 84
},
{
"epoch": 0.08,
"grad_norm": 2.0816939924571183,
"learning_rate": 4.986280381110737e-06,
"loss": 0.5575,
"step": 85
},
{
"epoch": 0.08,
"grad_norm": 2.0072477771163357,
"learning_rate": 4.985912431841584e-06,
"loss": 0.6225,
"step": 86
},
{
"epoch": 0.08,
"grad_norm": 2.1895945454214507,
"learning_rate": 4.985539627531596e-06,
"loss": 0.6169,
"step": 87
},
{
"epoch": 0.08,
"grad_norm": 2.84518214074801,
"learning_rate": 4.985161968908866e-06,
"loss": 0.6317,
"step": 88
},
{
"epoch": 0.08,
"grad_norm": 2.194209857089938,
"learning_rate": 4.984779456710971e-06,
"loss": 0.6205,
"step": 89
},
{
"epoch": 0.08,
"grad_norm": 2.1604595364123083,
"learning_rate": 4.9843920916849645e-06,
"loss": 0.6176,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 2.039087518829079,
"learning_rate": 4.9839998745873795e-06,
"loss": 0.5842,
"step": 91
},
{
"epoch": 0.08,
"grad_norm": 2.0148570016863334,
"learning_rate": 4.983602806184225e-06,
"loss": 0.5936,
"step": 92
},
{
"epoch": 0.08,
"grad_norm": 2.073137159272384,
"learning_rate": 4.983200887250982e-06,
"loss": 0.6317,
"step": 93
},
{
"epoch": 0.08,
"grad_norm": 2.045469602089007,
"learning_rate": 4.9827941185726095e-06,
"loss": 0.5338,
"step": 94
},
{
"epoch": 0.08,
"grad_norm": 2.1201743116757417,
"learning_rate": 4.982382500943533e-06,
"loss": 0.6133,
"step": 95
},
{
"epoch": 0.09,
"grad_norm": 2.0637214917996363,
"learning_rate": 4.981966035167654e-06,
"loss": 0.6483,
"step": 96
},
{
"epoch": 0.09,
"grad_norm": 2.155574452675582,
"learning_rate": 4.981544722058336e-06,
"loss": 0.6001,
"step": 97
},
{
"epoch": 0.09,
"grad_norm": 1.9347601392775928,
"learning_rate": 4.981118562438414e-06,
"loss": 0.5954,
"step": 98
},
{
"epoch": 0.09,
"grad_norm": 2.3054537863874756,
"learning_rate": 4.980687557140187e-06,
"loss": 0.6338,
"step": 99
},
{
"epoch": 0.09,
"grad_norm": 2.0421104909837338,
"learning_rate": 4.980251707005417e-06,
"loss": 0.6166,
"step": 100
},
{
"epoch": 0.09,
"grad_norm": 2.023167301994367,
"learning_rate": 4.979811012885329e-06,
"loss": 0.5682,
"step": 101
},
{
"epoch": 0.09,
"grad_norm": 2.0583654213007967,
"learning_rate": 4.979365475640609e-06,
"loss": 0.5759,
"step": 102
},
{
"epoch": 0.09,
"grad_norm": 2.008917223929121,
"learning_rate": 4.9789150961414e-06,
"loss": 0.6324,
"step": 103
},
{
"epoch": 0.09,
"grad_norm": 2.1111479338304306,
"learning_rate": 4.978459875267303e-06,
"loss": 0.5821,
"step": 104
},
{
"epoch": 0.09,
"grad_norm": 2.400366962461983,
"learning_rate": 4.977999813907375e-06,
"loss": 0.5699,
"step": 105
},
{
"epoch": 0.09,
"grad_norm": 2.090668061316384,
"learning_rate": 4.977534912960124e-06,
"loss": 0.5754,
"step": 106
},
{
"epoch": 0.09,
"grad_norm": 2.2103419288491466,
"learning_rate": 4.977065173333515e-06,
"loss": 0.6005,
"step": 107
},
{
"epoch": 0.1,
"grad_norm": 2.1332380447628294,
"learning_rate": 4.9765905959449565e-06,
"loss": 0.6178,
"step": 108
},
{
"epoch": 0.1,
"grad_norm": 2.1372224949542464,
"learning_rate": 4.976111181721309e-06,
"loss": 0.6021,
"step": 109
},
{
"epoch": 0.1,
"grad_norm": 2.636052326949506,
"learning_rate": 4.97562693159888e-06,
"loss": 0.6418,
"step": 110
},
{
"epoch": 0.1,
"grad_norm": 2.1234423477493443,
"learning_rate": 4.975137846523419e-06,
"loss": 0.6231,
"step": 111
},
{
"epoch": 0.1,
"grad_norm": 2.2817790529425315,
"learning_rate": 4.974643927450121e-06,
"loss": 0.5681,
"step": 112
},
{
"epoch": 0.1,
"grad_norm": 2.2605060344304713,
"learning_rate": 4.9741451753436205e-06,
"loss": 0.5803,
"step": 113
},
{
"epoch": 0.1,
"grad_norm": 2.0355236974665876,
"learning_rate": 4.973641591177991e-06,
"loss": 0.6003,
"step": 114
},
{
"epoch": 0.1,
"grad_norm": 2.4343221170301415,
"learning_rate": 4.973133175936743e-06,
"loss": 0.5882,
"step": 115
},
{
"epoch": 0.1,
"grad_norm": 2.2135760843199734,
"learning_rate": 4.972619930612822e-06,
"loss": 0.5886,
"step": 116
},
{
"epoch": 0.1,
"grad_norm": 2.161909448676307,
"learning_rate": 4.972101856208609e-06,
"loss": 0.5792,
"step": 117
},
{
"epoch": 0.1,
"grad_norm": 2.0871148781401927,
"learning_rate": 4.9715789537359126e-06,
"loss": 0.6383,
"step": 118
},
{
"epoch": 0.11,
"grad_norm": 2.1159018206478626,
"learning_rate": 4.971051224215973e-06,
"loss": 0.5865,
"step": 119
},
{
"epoch": 0.11,
"grad_norm": 2.2036428070670375,
"learning_rate": 4.970518668679459e-06,
"loss": 0.5905,
"step": 120
},
{
"epoch": 0.11,
"grad_norm": 2.22262007661876,
"learning_rate": 4.969981288166461e-06,
"loss": 0.5951,
"step": 121
},
{
"epoch": 0.11,
"grad_norm": 2.0713458839382786,
"learning_rate": 4.969439083726496e-06,
"loss": 0.6011,
"step": 122
},
{
"epoch": 0.11,
"grad_norm": 2.0686060725186897,
"learning_rate": 4.9688920564185e-06,
"loss": 0.6038,
"step": 123
},
{
"epoch": 0.11,
"grad_norm": 2.1825376161159964,
"learning_rate": 4.968340207310832e-06,
"loss": 0.6098,
"step": 124
},
{
"epoch": 0.11,
"grad_norm": 2.142436541976576,
"learning_rate": 4.967783537481262e-06,
"loss": 0.6119,
"step": 125
},
{
"epoch": 0.11,
"grad_norm": 2.330044622755397,
"learning_rate": 4.967222048016979e-06,
"loss": 0.6057,
"step": 126
},
{
"epoch": 0.11,
"grad_norm": 2.109116942854107,
"learning_rate": 4.966655740014585e-06,
"loss": 0.5958,
"step": 127
},
{
"epoch": 0.11,
"grad_norm": 2.174219068914296,
"learning_rate": 4.9660846145800914e-06,
"loss": 0.6276,
"step": 128
},
{
"epoch": 0.11,
"grad_norm": 2.135736248304593,
"learning_rate": 4.965508672828918e-06,
"loss": 0.6309,
"step": 129
},
{
"epoch": 0.12,
"grad_norm": 2.2339234058672885,
"learning_rate": 4.964927915885893e-06,
"loss": 0.5879,
"step": 130
},
{
"epoch": 0.12,
"grad_norm": 2.0960660335616224,
"learning_rate": 4.9643423448852455e-06,
"loss": 0.6218,
"step": 131
},
{
"epoch": 0.12,
"grad_norm": 1.9468729925472703,
"learning_rate": 4.963751960970609e-06,
"loss": 0.5998,
"step": 132
},
{
"epoch": 0.12,
"grad_norm": 2.1623168252289915,
"learning_rate": 4.9631567652950164e-06,
"loss": 0.6885,
"step": 133
},
{
"epoch": 0.12,
"grad_norm": 2.084420579583794,
"learning_rate": 4.962556759020898e-06,
"loss": 0.5758,
"step": 134
},
{
"epoch": 0.12,
"grad_norm": 2.1082890389844713,
"learning_rate": 4.961951943320078e-06,
"loss": 0.6116,
"step": 135
},
{
"epoch": 0.12,
"grad_norm": 2.006123424806457,
"learning_rate": 4.9613423193737754e-06,
"loss": 0.5708,
"step": 136
},
{
"epoch": 0.12,
"grad_norm": 2.309431970929405,
"learning_rate": 4.960727888372599e-06,
"loss": 0.621,
"step": 137
},
{
"epoch": 0.12,
"grad_norm": 2.226488524758773,
"learning_rate": 4.9601086515165456e-06,
"loss": 0.5896,
"step": 138
},
{
"epoch": 0.12,
"grad_norm": 2.1242070778655253,
"learning_rate": 4.959484610014997e-06,
"loss": 0.624,
"step": 139
},
{
"epoch": 0.12,
"grad_norm": 2.2147491445730516,
"learning_rate": 4.958855765086722e-06,
"loss": 0.6064,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 2.1818004600393,
"learning_rate": 4.958222117959868e-06,
"loss": 0.6252,
"step": 141
},
{
"epoch": 0.13,
"grad_norm": 2.1094535889409696,
"learning_rate": 4.95758366987196e-06,
"loss": 0.5779,
"step": 142
},
{
"epoch": 0.13,
"grad_norm": 2.2043056809252577,
"learning_rate": 4.9569404220699025e-06,
"loss": 0.6156,
"step": 143
},
{
"epoch": 0.13,
"grad_norm": 2.158056342799238,
"learning_rate": 4.956292375809971e-06,
"loss": 0.5662,
"step": 144
},
{
"epoch": 0.13,
"grad_norm": 1.987581635345228,
"learning_rate": 4.955639532357815e-06,
"loss": 0.6148,
"step": 145
},
{
"epoch": 0.13,
"grad_norm": 2.266145451051948,
"learning_rate": 4.954981892988451e-06,
"loss": 0.5867,
"step": 146
},
{
"epoch": 0.13,
"grad_norm": 2.071082600205798,
"learning_rate": 4.954319458986264e-06,
"loss": 0.5976,
"step": 147
},
{
"epoch": 0.13,
"grad_norm": 2.1615342548575374,
"learning_rate": 4.953652231645002e-06,
"loss": 0.5643,
"step": 148
},
{
"epoch": 0.13,
"grad_norm": 2.145126231371731,
"learning_rate": 4.952980212267773e-06,
"loss": 0.5592,
"step": 149
},
{
"epoch": 0.13,
"grad_norm": 1.9161750244434461,
"learning_rate": 4.952303402167047e-06,
"loss": 0.5547,
"step": 150
},
{
"epoch": 0.13,
"grad_norm": 2.234370958372018,
"learning_rate": 4.9516218026646475e-06,
"loss": 0.578,
"step": 151
},
{
"epoch": 0.13,
"grad_norm": 2.149553338429868,
"learning_rate": 4.950935415091753e-06,
"loss": 0.5952,
"step": 152
},
{
"epoch": 0.14,
"grad_norm": 2.1021801657048016,
"learning_rate": 4.950244240788895e-06,
"loss": 0.573,
"step": 153
},
{
"epoch": 0.14,
"grad_norm": 2.488711367210497,
"learning_rate": 4.949548281105951e-06,
"loss": 0.5776,
"step": 154
},
{
"epoch": 0.14,
"grad_norm": 2.0302393290147167,
"learning_rate": 4.948847537402145e-06,
"loss": 0.5685,
"step": 155
},
{
"epoch": 0.14,
"grad_norm": 2.1563261797248043,
"learning_rate": 4.948142011046044e-06,
"loss": 0.6185,
"step": 156
},
{
"epoch": 0.14,
"grad_norm": 2.1308303224609997,
"learning_rate": 4.947431703415558e-06,
"loss": 0.6229,
"step": 157
},
{
"epoch": 0.14,
"grad_norm": 2.0988414912992273,
"learning_rate": 4.946716615897932e-06,
"loss": 0.6167,
"step": 158
},
{
"epoch": 0.14,
"grad_norm": 2.3558302474583095,
"learning_rate": 4.9459967498897485e-06,
"loss": 0.5903,
"step": 159
},
{
"epoch": 0.14,
"grad_norm": 2.1505555405055223,
"learning_rate": 4.945272106796919e-06,
"loss": 0.5709,
"step": 160
},
{
"epoch": 0.14,
"grad_norm": 2.0604140956574635,
"learning_rate": 4.94454268803469e-06,
"loss": 0.635,
"step": 161
},
{
"epoch": 0.14,
"grad_norm": 2.3699836246614696,
"learning_rate": 4.943808495027631e-06,
"loss": 0.581,
"step": 162
},
{
"epoch": 0.14,
"grad_norm": 1.9809907136859368,
"learning_rate": 4.9430695292096365e-06,
"loss": 0.5703,
"step": 163
},
{
"epoch": 0.15,
"grad_norm": 2.213101907296851,
"learning_rate": 4.942325792023922e-06,
"loss": 0.5915,
"step": 164
},
{
"epoch": 0.15,
"grad_norm": 2.3778783149383944,
"learning_rate": 4.941577284923025e-06,
"loss": 0.537,
"step": 165
},
{
"epoch": 0.15,
"grad_norm": 1.9283694807512721,
"learning_rate": 4.9408240093687934e-06,
"loss": 0.579,
"step": 166
},
{
"epoch": 0.15,
"grad_norm": 2.083087334039033,
"learning_rate": 4.940065966832392e-06,
"loss": 0.5612,
"step": 167
},
{
"epoch": 0.15,
"grad_norm": 2.314684793845775,
"learning_rate": 4.939303158794294e-06,
"loss": 0.6001,
"step": 168
},
{
"epoch": 0.15,
"grad_norm": 2.131977461745334,
"learning_rate": 4.93853558674428e-06,
"loss": 0.5809,
"step": 169
},
{
"epoch": 0.15,
"grad_norm": 2.1291924932946755,
"learning_rate": 4.937763252181434e-06,
"loss": 0.6216,
"step": 170
},
{
"epoch": 0.15,
"grad_norm": 1.9366549866764742,
"learning_rate": 4.936986156614144e-06,
"loss": 0.5888,
"step": 171
},
{
"epoch": 0.15,
"grad_norm": 2.231889540095555,
"learning_rate": 4.9362043015600934e-06,
"loss": 0.6437,
"step": 172
},
{
"epoch": 0.15,
"grad_norm": 2.0696023557568233,
"learning_rate": 4.9354176885462626e-06,
"loss": 0.5951,
"step": 173
},
{
"epoch": 0.15,
"grad_norm": 2.10974806039572,
"learning_rate": 4.934626319108923e-06,
"loss": 0.5817,
"step": 174
},
{
"epoch": 0.16,
"grad_norm": 2.0633698321381946,
"learning_rate": 4.933830194793636e-06,
"loss": 0.5692,
"step": 175
},
{
"epoch": 0.16,
"grad_norm": 2.0163693967733423,
"learning_rate": 4.933029317155251e-06,
"loss": 0.5322,
"step": 176
},
{
"epoch": 0.16,
"grad_norm": 2.1118176135699813,
"learning_rate": 4.932223687757899e-06,
"loss": 0.5809,
"step": 177
},
{
"epoch": 0.16,
"grad_norm": 2.181431947183138,
"learning_rate": 4.9314133081749906e-06,
"loss": 0.5444,
"step": 178
},
{
"epoch": 0.16,
"grad_norm": 2.2055197469621386,
"learning_rate": 4.930598179989215e-06,
"loss": 0.6063,
"step": 179
},
{
"epoch": 0.16,
"grad_norm": 2.1103699877035638,
"learning_rate": 4.929778304792537e-06,
"loss": 0.5908,
"step": 180
},
{
"epoch": 0.16,
"grad_norm": 2.01692648335164,
"learning_rate": 4.928953684186189e-06,
"loss": 0.5729,
"step": 181
},
{
"epoch": 0.16,
"grad_norm": 1.990744003423107,
"learning_rate": 4.928124319780673e-06,
"loss": 0.5935,
"step": 182
},
{
"epoch": 0.16,
"grad_norm": 1.9898687560952446,
"learning_rate": 4.9272902131957555e-06,
"loss": 0.6008,
"step": 183
},
{
"epoch": 0.16,
"grad_norm": 1.9499116832570582,
"learning_rate": 4.926451366060465e-06,
"loss": 0.5731,
"step": 184
},
{
"epoch": 0.16,
"grad_norm": 1.8933258467243923,
"learning_rate": 4.925607780013088e-06,
"loss": 0.5822,
"step": 185
},
{
"epoch": 0.16,
"grad_norm": 1.9711936623837691,
"learning_rate": 4.924759456701167e-06,
"loss": 0.5433,
"step": 186
},
{
"epoch": 0.17,
"grad_norm": 1.9981254191144715,
"learning_rate": 4.923906397781495e-06,
"loss": 0.5603,
"step": 187
},
{
"epoch": 0.17,
"grad_norm": 1.9489584101682442,
"learning_rate": 4.923048604920115e-06,
"loss": 0.592,
"step": 188
},
{
"epoch": 0.17,
"grad_norm": 2.14587896098926,
"learning_rate": 4.922186079792315e-06,
"loss": 0.5861,
"step": 189
},
{
"epoch": 0.17,
"grad_norm": 2.093505234897306,
"learning_rate": 4.921318824082625e-06,
"loss": 0.5756,
"step": 190
},
{
"epoch": 0.17,
"grad_norm": 1.9726924068956073,
"learning_rate": 4.920446839484814e-06,
"loss": 0.5954,
"step": 191
},
{
"epoch": 0.17,
"grad_norm": 2.0009011296035886,
"learning_rate": 4.919570127701888e-06,
"loss": 0.5185,
"step": 192
},
{
"epoch": 0.17,
"grad_norm": 2.0801246171281993,
"learning_rate": 4.9186886904460826e-06,
"loss": 0.5788,
"step": 193
},
{
"epoch": 0.17,
"grad_norm": 2.7712602468155096,
"learning_rate": 4.917802529438865e-06,
"loss": 0.6637,
"step": 194
},
{
"epoch": 0.17,
"grad_norm": 1.9721040372060654,
"learning_rate": 4.916911646410926e-06,
"loss": 0.5926,
"step": 195
},
{
"epoch": 0.17,
"grad_norm": 2.1199089061376855,
"learning_rate": 4.91601604310218e-06,
"loss": 0.5854,
"step": 196
},
{
"epoch": 0.17,
"grad_norm": 1.9518281461372036,
"learning_rate": 4.915115721261759e-06,
"loss": 0.5456,
"step": 197
},
{
"epoch": 0.18,
"grad_norm": 2.1537515435847734,
"learning_rate": 4.9142106826480114e-06,
"loss": 0.6152,
"step": 198
},
{
"epoch": 0.18,
"grad_norm": 2.3461320565666344,
"learning_rate": 4.913300929028498e-06,
"loss": 0.617,
"step": 199
},
{
"epoch": 0.18,
"grad_norm": 1.9789785575462193,
"learning_rate": 4.912386462179987e-06,
"loss": 0.5845,
"step": 200
},
{
"epoch": 0.18,
"grad_norm": 2.0705337307209253,
"learning_rate": 4.9114672838884515e-06,
"loss": 0.6062,
"step": 201
},
{
"epoch": 0.18,
"grad_norm": 1.9972918925367322,
"learning_rate": 4.910543395949066e-06,
"loss": 0.6318,
"step": 202
},
{
"epoch": 0.18,
"grad_norm": 2.03173534028091,
"learning_rate": 4.9096148001662055e-06,
"loss": 0.64,
"step": 203
},
{
"epoch": 0.18,
"grad_norm": 2.0861416304602356,
"learning_rate": 4.908681498353436e-06,
"loss": 0.5859,
"step": 204
},
{
"epoch": 0.18,
"grad_norm": 1.932510611788884,
"learning_rate": 4.907743492333517e-06,
"loss": 0.5483,
"step": 205
},
{
"epoch": 0.18,
"grad_norm": 1.9618471764126828,
"learning_rate": 4.906800783938395e-06,
"loss": 0.5767,
"step": 206
},
{
"epoch": 0.18,
"grad_norm": 2.3557796360921786,
"learning_rate": 4.905853375009198e-06,
"loss": 0.5934,
"step": 207
},
{
"epoch": 0.18,
"grad_norm": 2.0993364379712784,
"learning_rate": 4.9049012673962385e-06,
"loss": 0.5879,
"step": 208
},
{
"epoch": 0.19,
"grad_norm": 2.2015612636555155,
"learning_rate": 4.903944462959001e-06,
"loss": 0.5598,
"step": 209
},
{
"epoch": 0.19,
"grad_norm": 2.0374544745406062,
"learning_rate": 4.902982963566147e-06,
"loss": 0.577,
"step": 210
},
{
"epoch": 0.19,
"grad_norm": 2.194866218807,
"learning_rate": 4.902016771095506e-06,
"loss": 0.5848,
"step": 211
},
{
"epoch": 0.19,
"grad_norm": 2.2545375351308614,
"learning_rate": 4.901045887434072e-06,
"loss": 0.5846,
"step": 212
},
{
"epoch": 0.19,
"grad_norm": 2.017012770131601,
"learning_rate": 4.900070314478001e-06,
"loss": 0.5651,
"step": 213
},
{
"epoch": 0.19,
"grad_norm": 2.150900326654639,
"learning_rate": 4.899090054132609e-06,
"loss": 0.568,
"step": 214
},
{
"epoch": 0.19,
"grad_norm": 2.0404886979870454,
"learning_rate": 4.898105108312366e-06,
"loss": 0.5277,
"step": 215
},
{
"epoch": 0.19,
"grad_norm": 2.036614254190257,
"learning_rate": 4.897115478940892e-06,
"loss": 0.5754,
"step": 216
},
{
"epoch": 0.19,
"grad_norm": 2.041133008809928,
"learning_rate": 4.896121167950954e-06,
"loss": 0.6294,
"step": 217
},
{
"epoch": 0.19,
"grad_norm": 2.0029503409054885,
"learning_rate": 4.895122177284465e-06,
"loss": 0.5531,
"step": 218
},
{
"epoch": 0.19,
"grad_norm": 2.0303439698174754,
"learning_rate": 4.894118508892474e-06,
"loss": 0.6008,
"step": 219
},
{
"epoch": 0.19,
"grad_norm": 1.899982778272908,
"learning_rate": 4.893110164735167e-06,
"loss": 0.6076,
"step": 220
},
{
"epoch": 0.2,
"grad_norm": 2.170640326694132,
"learning_rate": 4.892097146781862e-06,
"loss": 0.5806,
"step": 221
},
{
"epoch": 0.2,
"grad_norm": 1.961802557992624,
"learning_rate": 4.8910794570110055e-06,
"loss": 0.5456,
"step": 222
},
{
"epoch": 0.2,
"grad_norm": 2.1149182672715807,
"learning_rate": 4.890057097410167e-06,
"loss": 0.5683,
"step": 223
},
{
"epoch": 0.2,
"grad_norm": 1.9988574008443096,
"learning_rate": 4.889030069976038e-06,
"loss": 0.5603,
"step": 224
},
{
"epoch": 0.2,
"grad_norm": 2.137840782586502,
"learning_rate": 4.887998376714424e-06,
"loss": 0.5713,
"step": 225
},
{
"epoch": 0.2,
"grad_norm": 2.2956357234771634,
"learning_rate": 4.886962019640244e-06,
"loss": 0.5635,
"step": 226
},
{
"epoch": 0.2,
"grad_norm": 2.2175517801056346,
"learning_rate": 4.885921000777528e-06,
"loss": 0.631,
"step": 227
},
{
"epoch": 0.2,
"grad_norm": 2.0861966792656546,
"learning_rate": 4.884875322159407e-06,
"loss": 0.5521,
"step": 228
},
{
"epoch": 0.2,
"grad_norm": 2.170862650134145,
"learning_rate": 4.883824985828114e-06,
"loss": 0.5953,
"step": 229
},
{
"epoch": 0.2,
"grad_norm": 2.016871028914906,
"learning_rate": 4.882769993834978e-06,
"loss": 0.5745,
"step": 230
},
{
"epoch": 0.2,
"grad_norm": 2.4069309610367107,
"learning_rate": 4.8817103482404236e-06,
"loss": 0.5752,
"step": 231
},
{
"epoch": 0.21,
"grad_norm": 1.9834780557891722,
"learning_rate": 4.880646051113959e-06,
"loss": 0.5619,
"step": 232
},
{
"epoch": 0.21,
"grad_norm": 2.1221686040256005,
"learning_rate": 4.87957710453418e-06,
"loss": 0.561,
"step": 233
},
{
"epoch": 0.21,
"grad_norm": 2.1497751964139002,
"learning_rate": 4.878503510588764e-06,
"loss": 0.5754,
"step": 234
},
{
"epoch": 0.21,
"grad_norm": 1.8535318318419167,
"learning_rate": 4.877425271374462e-06,
"loss": 0.5551,
"step": 235
},
{
"epoch": 0.21,
"grad_norm": 2.1537345489224404,
"learning_rate": 4.876342388997099e-06,
"loss": 0.544,
"step": 236
},
{
"epoch": 0.21,
"grad_norm": 1.9695512744073471,
"learning_rate": 4.875254865571567e-06,
"loss": 0.6003,
"step": 237
},
{
"epoch": 0.21,
"grad_norm": 2.2550853928957193,
"learning_rate": 4.874162703221823e-06,
"loss": 0.5968,
"step": 238
},
{
"epoch": 0.21,
"grad_norm": 2.0658630166795917,
"learning_rate": 4.873065904080884e-06,
"loss": 0.5658,
"step": 239
},
{
"epoch": 0.21,
"grad_norm": 2.0821280326495524,
"learning_rate": 4.871964470290823e-06,
"loss": 0.5711,
"step": 240
},
{
"epoch": 0.21,
"grad_norm": 1.9833074137024158,
"learning_rate": 4.8708584040027636e-06,
"loss": 0.5899,
"step": 241
},
{
"epoch": 0.21,
"grad_norm": 2.0288963441502195,
"learning_rate": 4.869747707376877e-06,
"loss": 0.5601,
"step": 242
},
{
"epoch": 0.22,
"grad_norm": 2.0970435875726463,
"learning_rate": 4.868632382582378e-06,
"loss": 0.6381,
"step": 243
},
{
"epoch": 0.22,
"grad_norm": 2.1303280408644194,
"learning_rate": 4.86751243179752e-06,
"loss": 0.5495,
"step": 244
},
{
"epoch": 0.22,
"grad_norm": 2.0851781018580584,
"learning_rate": 4.866387857209591e-06,
"loss": 0.5901,
"step": 245
},
{
"epoch": 0.22,
"grad_norm": 1.8310760160854438,
"learning_rate": 4.86525866101491e-06,
"loss": 0.5513,
"step": 246
},
{
"epoch": 0.22,
"grad_norm": 2.199726167537497,
"learning_rate": 4.8641248454188205e-06,
"loss": 0.5873,
"step": 247
},
{
"epoch": 0.22,
"grad_norm": 1.9776691221978735,
"learning_rate": 4.862986412635691e-06,
"loss": 0.6143,
"step": 248
},
{
"epoch": 0.22,
"grad_norm": 2.0663231641830873,
"learning_rate": 4.8618433648889034e-06,
"loss": 0.5937,
"step": 249
},
{
"epoch": 0.22,
"grad_norm": 2.170520506577784,
"learning_rate": 4.860695704410856e-06,
"loss": 0.5374,
"step": 250
},
{
"epoch": 0.22,
"grad_norm": 1.9685756224067419,
"learning_rate": 4.8595434334429535e-06,
"loss": 0.5139,
"step": 251
},
{
"epoch": 0.22,
"grad_norm": 1.9668205539999677,
"learning_rate": 4.8583865542356065e-06,
"loss": 0.5459,
"step": 252
},
{
"epoch": 0.22,
"grad_norm": 2.0793578279258704,
"learning_rate": 4.857225069048226e-06,
"loss": 0.593,
"step": 253
},
{
"epoch": 0.22,
"grad_norm": 1.9265474492849337,
"learning_rate": 4.8560589801492165e-06,
"loss": 0.5559,
"step": 254
},
{
"epoch": 0.23,
"grad_norm": 2.8555278122830696,
"learning_rate": 4.854888289815976e-06,
"loss": 0.5949,
"step": 255
},
{
"epoch": 0.23,
"grad_norm": 2.063838630196542,
"learning_rate": 4.853713000334887e-06,
"loss": 0.5712,
"step": 256
},
{
"epoch": 0.23,
"grad_norm": 2.168668910730517,
"learning_rate": 4.852533114001316e-06,
"loss": 0.5475,
"step": 257
},
{
"epoch": 0.23,
"grad_norm": 2.064042820960706,
"learning_rate": 4.8513486331196055e-06,
"loss": 0.5616,
"step": 258
},
{
"epoch": 0.23,
"grad_norm": 2.026751060346143,
"learning_rate": 4.850159560003074e-06,
"loss": 0.5997,
"step": 259
},
{
"epoch": 0.23,
"grad_norm": 2.1228129299875254,
"learning_rate": 4.848965896974006e-06,
"loss": 0.5622,
"step": 260
},
{
"epoch": 0.23,
"grad_norm": 1.9418510365881214,
"learning_rate": 4.847767646363652e-06,
"loss": 0.5741,
"step": 261
},
{
"epoch": 0.23,
"grad_norm": 2.070611833895483,
"learning_rate": 4.846564810512221e-06,
"loss": 0.5729,
"step": 262
},
{
"epoch": 0.23,
"grad_norm": 1.8833621440375596,
"learning_rate": 4.845357391768877e-06,
"loss": 0.5503,
"step": 263
},
{
"epoch": 0.23,
"grad_norm": 2.1022924907055387,
"learning_rate": 4.844145392491735e-06,
"loss": 0.6204,
"step": 264
},
{
"epoch": 0.23,
"grad_norm": 2.024625007813473,
"learning_rate": 4.842928815047856e-06,
"loss": 0.5776,
"step": 265
},
{
"epoch": 0.24,
"grad_norm": 1.9123739071371275,
"learning_rate": 4.8417076618132434e-06,
"loss": 0.5417,
"step": 266
},
{
"epoch": 0.24,
"grad_norm": 2.062879186086598,
"learning_rate": 4.8404819351728336e-06,
"loss": 0.5387,
"step": 267
},
{
"epoch": 0.24,
"grad_norm": 1.9944627549250884,
"learning_rate": 4.8392516375204986e-06,
"loss": 0.5731,
"step": 268
},
{
"epoch": 0.24,
"grad_norm": 1.9859912626846585,
"learning_rate": 4.838016771259037e-06,
"loss": 0.5969,
"step": 269
},
{
"epoch": 0.24,
"grad_norm": 2.043069520519082,
"learning_rate": 4.836777338800168e-06,
"loss": 0.6217,
"step": 270
},
{
"epoch": 0.24,
"grad_norm": 1.913212451622778,
"learning_rate": 4.835533342564531e-06,
"loss": 0.5527,
"step": 271
},
{
"epoch": 0.24,
"grad_norm": 1.978858281238778,
"learning_rate": 4.834284784981678e-06,
"loss": 0.5997,
"step": 272
},
{
"epoch": 0.24,
"grad_norm": 2.004628826916504,
"learning_rate": 4.833031668490067e-06,
"loss": 0.551,
"step": 273
},
{
"epoch": 0.24,
"grad_norm": 2.164370107566024,
"learning_rate": 4.8317739955370645e-06,
"loss": 0.5537,
"step": 274
},
{
"epoch": 0.24,
"grad_norm": 1.891772326146366,
"learning_rate": 4.83051176857893e-06,
"loss": 0.6075,
"step": 275
},
{
"epoch": 0.24,
"grad_norm": 2.0553128913886645,
"learning_rate": 4.8292449900808216e-06,
"loss": 0.5854,
"step": 276
},
{
"epoch": 0.25,
"grad_norm": 2.009000622167072,
"learning_rate": 4.827973662516786e-06,
"loss": 0.5503,
"step": 277
},
{
"epoch": 0.25,
"grad_norm": 1.9385043396652537,
"learning_rate": 4.826697788369752e-06,
"loss": 0.5704,
"step": 278
},
{
"epoch": 0.25,
"grad_norm": 2.3263786060073826,
"learning_rate": 4.8254173701315295e-06,
"loss": 0.5604,
"step": 279
},
{
"epoch": 0.25,
"grad_norm": 1.9251504140774536,
"learning_rate": 4.8241324103028055e-06,
"loss": 0.5647,
"step": 280
},
{
"epoch": 0.25,
"grad_norm": 1.9714117964729747,
"learning_rate": 4.822842911393131e-06,
"loss": 0.604,
"step": 281
},
{
"epoch": 0.25,
"grad_norm": 2.034372279161665,
"learning_rate": 4.821548875920927e-06,
"loss": 0.5803,
"step": 282
},
{
"epoch": 0.25,
"grad_norm": 1.9849114644945505,
"learning_rate": 4.8202503064134725e-06,
"loss": 0.5854,
"step": 283
},
{
"epoch": 0.25,
"grad_norm": 2.3435998455971343,
"learning_rate": 4.818947205406902e-06,
"loss": 0.4988,
"step": 284
},
{
"epoch": 0.25,
"grad_norm": 2.0672779732760924,
"learning_rate": 4.8176395754462e-06,
"loss": 0.5734,
"step": 285
},
{
"epoch": 0.25,
"grad_norm": 2.1206384205127544,
"learning_rate": 4.816327419085197e-06,
"loss": 0.563,
"step": 286
},
{
"epoch": 0.25,
"grad_norm": 2.1105254841893095,
"learning_rate": 4.815010738886561e-06,
"loss": 0.5765,
"step": 287
},
{
"epoch": 0.26,
"grad_norm": 2.072546090747287,
"learning_rate": 4.813689537421798e-06,
"loss": 0.6003,
"step": 288
},
{
"epoch": 0.26,
"grad_norm": 2.1131138426394442,
"learning_rate": 4.812363817271243e-06,
"loss": 0.6097,
"step": 289
},
{
"epoch": 0.26,
"grad_norm": 1.9218545344238502,
"learning_rate": 4.811033581024056e-06,
"loss": 0.6272,
"step": 290
},
{
"epoch": 0.26,
"grad_norm": 2.235420687671868,
"learning_rate": 4.809698831278217e-06,
"loss": 0.5519,
"step": 291
},
{
"epoch": 0.26,
"grad_norm": 1.8915062282224397,
"learning_rate": 4.808359570640522e-06,
"loss": 0.5832,
"step": 292
},
{
"epoch": 0.26,
"grad_norm": 1.9185231023206675,
"learning_rate": 4.8070158017265755e-06,
"loss": 0.5854,
"step": 293
},
{
"epoch": 0.26,
"grad_norm": 2.086526046887808,
"learning_rate": 4.805667527160788e-06,
"loss": 0.5314,
"step": 294
},
{
"epoch": 0.26,
"grad_norm": 1.9995370937944454,
"learning_rate": 4.804314749576368e-06,
"loss": 0.5749,
"step": 295
},
{
"epoch": 0.26,
"grad_norm": 2.099313489806141,
"learning_rate": 4.802957471615319e-06,
"loss": 0.5173,
"step": 296
},
{
"epoch": 0.26,
"grad_norm": 2.067736275086448,
"learning_rate": 4.8015956959284346e-06,
"loss": 0.5434,
"step": 297
},
{
"epoch": 0.26,
"grad_norm": 2.005525416579935,
"learning_rate": 4.800229425175294e-06,
"loss": 0.5589,
"step": 298
},
{
"epoch": 0.26,
"grad_norm": 2.172708847484724,
"learning_rate": 4.7988586620242515e-06,
"loss": 0.5919,
"step": 299
},
{
"epoch": 0.27,
"grad_norm": 2.0010542748493823,
"learning_rate": 4.797483409152438e-06,
"loss": 0.5803,
"step": 300
},
{
"epoch": 0.27,
"grad_norm": 2.1169505971764506,
"learning_rate": 4.7961036692457516e-06,
"loss": 0.5763,
"step": 301
},
{
"epoch": 0.27,
"grad_norm": 2.202849419501746,
"learning_rate": 4.794719444998856e-06,
"loss": 0.5691,
"step": 302
},
{
"epoch": 0.27,
"grad_norm": 1.9765013761990564,
"learning_rate": 4.793330739115169e-06,
"loss": 0.5657,
"step": 303
},
{
"epoch": 0.27,
"grad_norm": 2.0404392238791136,
"learning_rate": 4.791937554306863e-06,
"loss": 0.5648,
"step": 304
},
{
"epoch": 0.27,
"grad_norm": 2.0298920886210516,
"learning_rate": 4.790539893294861e-06,
"loss": 0.5353,
"step": 305
},
{
"epoch": 0.27,
"grad_norm": 2.03157486915788,
"learning_rate": 4.789137758808823e-06,
"loss": 0.5716,
"step": 306
},
{
"epoch": 0.27,
"grad_norm": 2.060346338513047,
"learning_rate": 4.787731153587149e-06,
"loss": 0.5502,
"step": 307
},
{
"epoch": 0.27,
"grad_norm": 1.9286831590091769,
"learning_rate": 4.786320080376968e-06,
"loss": 0.5646,
"step": 308
},
{
"epoch": 0.27,
"grad_norm": 2.042346254905274,
"learning_rate": 4.7849045419341376e-06,
"loss": 0.6085,
"step": 309
},
{
"epoch": 0.27,
"grad_norm": 2.0758243469708293,
"learning_rate": 4.7834845410232356e-06,
"loss": 0.5452,
"step": 310
},
{
"epoch": 0.28,
"grad_norm": 2.0454965773706553,
"learning_rate": 4.782060080417553e-06,
"loss": 0.514,
"step": 311
},
{
"epoch": 0.28,
"grad_norm": 2.073931876222572,
"learning_rate": 4.780631162899094e-06,
"loss": 0.5884,
"step": 312
},
{
"epoch": 0.28,
"grad_norm": 1.9699688248650635,
"learning_rate": 4.7791977912585645e-06,
"loss": 0.529,
"step": 313
},
{
"epoch": 0.28,
"grad_norm": 1.9886162974888701,
"learning_rate": 4.7777599682953696e-06,
"loss": 0.5796,
"step": 314
},
{
"epoch": 0.28,
"grad_norm": 1.9579685029739566,
"learning_rate": 4.7763176968176106e-06,
"loss": 0.5553,
"step": 315
},
{
"epoch": 0.28,
"grad_norm": 2.2181861411036086,
"learning_rate": 4.7748709796420735e-06,
"loss": 0.5806,
"step": 316
},
{
"epoch": 0.28,
"grad_norm": 2.0345738930041777,
"learning_rate": 4.773419819594228e-06,
"loss": 0.6059,
"step": 317
},
{
"epoch": 0.28,
"grad_norm": 2.0710385535524902,
"learning_rate": 4.7719642195082224e-06,
"loss": 0.5539,
"step": 318
},
{
"epoch": 0.28,
"grad_norm": 2.1239710444371442,
"learning_rate": 4.770504182226875e-06,
"loss": 0.5655,
"step": 319
},
{
"epoch": 0.28,
"grad_norm": 1.9564631444382952,
"learning_rate": 4.769039710601669e-06,
"loss": 0.5914,
"step": 320
},
{
"epoch": 0.28,
"grad_norm": 1.9969926160116234,
"learning_rate": 4.767570807492752e-06,
"loss": 0.55,
"step": 321
},
{
"epoch": 0.29,
"grad_norm": 1.9650736880864492,
"learning_rate": 4.766097475768919e-06,
"loss": 0.5804,
"step": 322
},
{
"epoch": 0.29,
"grad_norm": 2.1946368157969194,
"learning_rate": 4.7646197183076236e-06,
"loss": 0.5631,
"step": 323
},
{
"epoch": 0.29,
"grad_norm": 1.9834181085585831,
"learning_rate": 4.763137537994955e-06,
"loss": 0.5779,
"step": 324
},
{
"epoch": 0.29,
"grad_norm": 2.1081651164417057,
"learning_rate": 4.7616509377256445e-06,
"loss": 0.5375,
"step": 325
},
{
"epoch": 0.29,
"grad_norm": 1.9972027344990544,
"learning_rate": 4.760159920403055e-06,
"loss": 0.5608,
"step": 326
},
{
"epoch": 0.29,
"grad_norm": 1.9554967826543683,
"learning_rate": 4.758664488939174e-06,
"loss": 0.5613,
"step": 327
},
{
"epoch": 0.29,
"grad_norm": 2.211716512822424,
"learning_rate": 4.757164646254614e-06,
"loss": 0.5863,
"step": 328
},
{
"epoch": 0.29,
"grad_norm": 1.9203184200502181,
"learning_rate": 4.755660395278598e-06,
"loss": 0.5275,
"step": 329
},
{
"epoch": 0.29,
"grad_norm": 2.0355308159742505,
"learning_rate": 4.7541517389489626e-06,
"loss": 0.5304,
"step": 330
},
{
"epoch": 0.29,
"grad_norm": 2.005680103405306,
"learning_rate": 4.752638680212145e-06,
"loss": 0.5782,
"step": 331
},
{
"epoch": 0.29,
"grad_norm": 1.9930094995522492,
"learning_rate": 4.751121222023183e-06,
"loss": 0.5197,
"step": 332
},
{
"epoch": 0.29,
"grad_norm": 2.130907347619711,
"learning_rate": 4.749599367345703e-06,
"loss": 0.5453,
"step": 333
},
{
"epoch": 0.3,
"grad_norm": 2.0380649677356715,
"learning_rate": 4.748073119151923e-06,
"loss": 0.5394,
"step": 334
},
{
"epoch": 0.3,
"grad_norm": 2.02655053696048,
"learning_rate": 4.7465424804226366e-06,
"loss": 0.5359,
"step": 335
},
{
"epoch": 0.3,
"grad_norm": 2.108255877778432,
"learning_rate": 4.745007454147215e-06,
"loss": 0.5262,
"step": 336
},
{
"epoch": 0.3,
"grad_norm": 1.8422966312136684,
"learning_rate": 4.7434680433235986e-06,
"loss": 0.529,
"step": 337
},
{
"epoch": 0.3,
"grad_norm": 2.1387816386921004,
"learning_rate": 4.741924250958289e-06,
"loss": 0.5599,
"step": 338
},
{
"epoch": 0.3,
"grad_norm": 2.2063774820548794,
"learning_rate": 4.740376080066346e-06,
"loss": 0.6014,
"step": 339
},
{
"epoch": 0.3,
"grad_norm": 1.917696303327652,
"learning_rate": 4.738823533671383e-06,
"loss": 0.615,
"step": 340
},
{
"epoch": 0.3,
"grad_norm": 2.0283765999277916,
"learning_rate": 4.737266614805554e-06,
"loss": 0.5802,
"step": 341
},
{
"epoch": 0.3,
"grad_norm": 2.0340264609590437,
"learning_rate": 4.7357053265095575e-06,
"loss": 0.5331,
"step": 342
},
{
"epoch": 0.3,
"grad_norm": 2.102037194450825,
"learning_rate": 4.734139671832622e-06,
"loss": 0.5534,
"step": 343
},
{
"epoch": 0.3,
"grad_norm": 2.4389875670618113,
"learning_rate": 4.732569653832505e-06,
"loss": 0.5637,
"step": 344
},
{
"epoch": 0.31,
"grad_norm": 2.1143521053252012,
"learning_rate": 4.730995275575486e-06,
"loss": 0.6539,
"step": 345
},
{
"epoch": 0.31,
"grad_norm": 2.6240136232872064,
"learning_rate": 4.7294165401363616e-06,
"loss": 0.5515,
"step": 346
},
{
"epoch": 0.31,
"grad_norm": 2.037602072097695,
"learning_rate": 4.727833450598433e-06,
"loss": 0.5609,
"step": 347
},
{
"epoch": 0.31,
"grad_norm": 2.10711733636797,
"learning_rate": 4.72624601005351e-06,
"loss": 0.5719,
"step": 348
},
{
"epoch": 0.31,
"grad_norm": 2.277613433738313,
"learning_rate": 4.724654221601899e-06,
"loss": 0.5815,
"step": 349
},
{
"epoch": 0.31,
"grad_norm": 2.0082624113337824,
"learning_rate": 4.7230580883523955e-06,
"loss": 0.5524,
"step": 350
},
{
"epoch": 0.31,
"grad_norm": 1.8922591374161477,
"learning_rate": 4.721457613422285e-06,
"loss": 0.5981,
"step": 351
},
{
"epoch": 0.31,
"grad_norm": 2.108229047424278,
"learning_rate": 4.7198527999373266e-06,
"loss": 0.57,
"step": 352
},
{
"epoch": 0.31,
"grad_norm": 2.152965480400126,
"learning_rate": 4.718243651031759e-06,
"loss": 0.5996,
"step": 353
},
{
"epoch": 0.31,
"grad_norm": 1.8885994019827148,
"learning_rate": 4.716630169848282e-06,
"loss": 0.5543,
"step": 354
},
{
"epoch": 0.31,
"grad_norm": 2.221396082747074,
"learning_rate": 4.715012359538062e-06,
"loss": 0.5423,
"step": 355
},
{
"epoch": 0.32,
"grad_norm": 2.247525651087526,
"learning_rate": 4.7133902232607145e-06,
"loss": 0.6049,
"step": 356
},
{
"epoch": 0.32,
"grad_norm": 1.905837742487114,
"learning_rate": 4.711763764184309e-06,
"loss": 0.5523,
"step": 357
},
{
"epoch": 0.32,
"grad_norm": 2.117965067814315,
"learning_rate": 4.710132985485355e-06,
"loss": 0.5682,
"step": 358
},
{
"epoch": 0.32,
"grad_norm": 2.1530948606389373,
"learning_rate": 4.7084978903487985e-06,
"loss": 0.5506,
"step": 359
},
{
"epoch": 0.32,
"grad_norm": 1.8738866858316863,
"learning_rate": 4.706858481968017e-06,
"loss": 0.5426,
"step": 360
},
{
"epoch": 0.32,
"grad_norm": 1.9967053512246618,
"learning_rate": 4.705214763544806e-06,
"loss": 0.5555,
"step": 361
},
{
"epoch": 0.32,
"grad_norm": 2.352080896364055,
"learning_rate": 4.703566738289389e-06,
"loss": 0.587,
"step": 362
},
{
"epoch": 0.32,
"grad_norm": 2.031696719881503,
"learning_rate": 4.701914409420392e-06,
"loss": 0.6088,
"step": 363
},
{
"epoch": 0.32,
"grad_norm": 2.140107830595095,
"learning_rate": 4.700257780164849e-06,
"loss": 0.5596,
"step": 364
},
{
"epoch": 0.32,
"grad_norm": 2.125236417141067,
"learning_rate": 4.698596853758194e-06,
"loss": 0.5513,
"step": 365
},
{
"epoch": 0.32,
"grad_norm": 1.8878623518397697,
"learning_rate": 4.696931633444251e-06,
"loss": 0.5557,
"step": 366
},
{
"epoch": 0.33,
"grad_norm": 1.9523463678463824,
"learning_rate": 4.695262122475232e-06,
"loss": 0.5317,
"step": 367
},
{
"epoch": 0.33,
"grad_norm": 2.3748547328434455,
"learning_rate": 4.6935883241117286e-06,
"loss": 0.5733,
"step": 368
},
{
"epoch": 0.33,
"grad_norm": 1.9248854873148575,
"learning_rate": 4.691910241622704e-06,
"loss": 0.5523,
"step": 369
},
{
"epoch": 0.33,
"grad_norm": 2.1731794693383923,
"learning_rate": 4.69022787828549e-06,
"loss": 0.6489,
"step": 370
},
{
"epoch": 0.33,
"grad_norm": 1.996570702327501,
"learning_rate": 4.688541237385781e-06,
"loss": 0.584,
"step": 371
},
{
"epoch": 0.33,
"grad_norm": 2.0272036390008097,
"learning_rate": 4.68685032221762e-06,
"loss": 0.554,
"step": 372
},
{
"epoch": 0.33,
"grad_norm": 1.9986403184037858,
"learning_rate": 4.685155136083401e-06,
"loss": 0.5798,
"step": 373
},
{
"epoch": 0.33,
"grad_norm": 2.24642442330448,
"learning_rate": 4.683455682293863e-06,
"loss": 0.5486,
"step": 374
},
{
"epoch": 0.33,
"grad_norm": 2.916261956844043,
"learning_rate": 4.681751964168071e-06,
"loss": 0.5678,
"step": 375
},
{
"epoch": 0.33,
"grad_norm": 2.1597492287443396,
"learning_rate": 4.680043985033427e-06,
"loss": 0.5801,
"step": 376
},
{
"epoch": 0.33,
"grad_norm": 1.9634034606261326,
"learning_rate": 4.6783317482256506e-06,
"loss": 0.5412,
"step": 377
},
{
"epoch": 0.33,
"grad_norm": 2.0128604293697263,
"learning_rate": 4.676615257088777e-06,
"loss": 0.5538,
"step": 378
},
{
"epoch": 0.34,
"grad_norm": 2.2205659530523976,
"learning_rate": 4.674894514975149e-06,
"loss": 0.494,
"step": 379
},
{
"epoch": 0.34,
"grad_norm": 2.000557085172021,
"learning_rate": 4.673169525245416e-06,
"loss": 0.5459,
"step": 380
},
{
"epoch": 0.34,
"grad_norm": 2.0089256125274826,
"learning_rate": 4.671440291268518e-06,
"loss": 0.5729,
"step": 381
},
{
"epoch": 0.34,
"grad_norm": 2.076112293053539,
"learning_rate": 4.66970681642169e-06,
"loss": 0.5277,
"step": 382
},
{
"epoch": 0.34,
"grad_norm": 1.996445627957894,
"learning_rate": 4.667969104090441e-06,
"loss": 0.5879,
"step": 383
},
{
"epoch": 0.34,
"grad_norm": 2.379165029211644,
"learning_rate": 4.666227157668564e-06,
"loss": 0.5924,
"step": 384
},
{
"epoch": 0.34,
"grad_norm": 2.101190475222136,
"learning_rate": 4.664480980558118e-06,
"loss": 0.6466,
"step": 385
},
{
"epoch": 0.34,
"grad_norm": 2.035159570620747,
"learning_rate": 4.662730576169423e-06,
"loss": 0.5979,
"step": 386
},
{
"epoch": 0.34,
"grad_norm": 2.1034174780447814,
"learning_rate": 4.660975947921058e-06,
"loss": 0.5635,
"step": 387
},
{
"epoch": 0.34,
"grad_norm": 2.131573174129039,
"learning_rate": 4.65921709923985e-06,
"loss": 0.5602,
"step": 388
},
{
"epoch": 0.34,
"grad_norm": 1.9282515780121203,
"learning_rate": 4.657454033560868e-06,
"loss": 0.5292,
"step": 389
},
{
"epoch": 0.35,
"grad_norm": 1.922997066030009,
"learning_rate": 4.655686754327419e-06,
"loss": 0.5475,
"step": 390
},
{
"epoch": 0.35,
"grad_norm": 1.9692624098665525,
"learning_rate": 4.653915264991035e-06,
"loss": 0.5529,
"step": 391
},
{
"epoch": 0.35,
"grad_norm": 1.976011234185068,
"learning_rate": 4.652139569011475e-06,
"loss": 0.5439,
"step": 392
},
{
"epoch": 0.35,
"grad_norm": 1.909657950321316,
"learning_rate": 4.650359669856711e-06,
"loss": 0.5558,
"step": 393
},
{
"epoch": 0.35,
"grad_norm": 1.9134183734362904,
"learning_rate": 4.648575571002926e-06,
"loss": 0.5428,
"step": 394
},
{
"epoch": 0.35,
"grad_norm": 2.067168876792994,
"learning_rate": 4.646787275934501e-06,
"loss": 0.6261,
"step": 395
},
{
"epoch": 0.35,
"grad_norm": 1.9358304010171785,
"learning_rate": 4.644994788144017e-06,
"loss": 0.5698,
"step": 396
},
{
"epoch": 0.35,
"grad_norm": 1.9671634072657547,
"learning_rate": 4.643198111132241e-06,
"loss": 0.5345,
"step": 397
},
{
"epoch": 0.35,
"grad_norm": 2.0176052011599133,
"learning_rate": 4.641397248408122e-06,
"loss": 0.5028,
"step": 398
},
{
"epoch": 0.35,
"grad_norm": 1.9960700483606102,
"learning_rate": 4.639592203488784e-06,
"loss": 0.5253,
"step": 399
},
{
"epoch": 0.35,
"grad_norm": 1.9329472749401087,
"learning_rate": 4.63778297989952e-06,
"loss": 0.615,
"step": 400
},
{
"epoch": 0.36,
"grad_norm": 1.9689526846990402,
"learning_rate": 4.6359695811737805e-06,
"loss": 0.5558,
"step": 401
},
{
"epoch": 0.36,
"grad_norm": 2.043494453339269,
"learning_rate": 4.634152010853175e-06,
"loss": 0.5955,
"step": 402
},
{
"epoch": 0.36,
"grad_norm": 1.9251519214200417,
"learning_rate": 4.632330272487455e-06,
"loss": 0.5587,
"step": 403
},
{
"epoch": 0.36,
"grad_norm": 2.2049650629169495,
"learning_rate": 4.6305043696345175e-06,
"loss": 0.5633,
"step": 404
},
{
"epoch": 0.36,
"grad_norm": 1.8971004366601951,
"learning_rate": 4.628674305860389e-06,
"loss": 0.5147,
"step": 405
},
{
"epoch": 0.36,
"grad_norm": 1.958131978242853,
"learning_rate": 4.626840084739224e-06,
"loss": 0.558,
"step": 406
},
{
"epoch": 0.36,
"grad_norm": 1.8809187299789303,
"learning_rate": 4.625001709853296e-06,
"loss": 0.6029,
"step": 407
},
{
"epoch": 0.36,
"grad_norm": 2.07376704403877,
"learning_rate": 4.623159184792992e-06,
"loss": 0.5985,
"step": 408
},
{
"epoch": 0.36,
"grad_norm": 1.9773215118384355,
"learning_rate": 4.621312513156801e-06,
"loss": 0.5592,
"step": 409
},
{
"epoch": 0.36,
"grad_norm": 2.2454931529711373,
"learning_rate": 4.6194616985513144e-06,
"loss": 0.5265,
"step": 410
},
{
"epoch": 0.36,
"grad_norm": 1.917266484743525,
"learning_rate": 4.617606744591214e-06,
"loss": 0.5579,
"step": 411
},
{
"epoch": 0.36,
"grad_norm": 1.9196448264725143,
"learning_rate": 4.615747654899263e-06,
"loss": 0.5345,
"step": 412
},
{
"epoch": 0.37,
"grad_norm": 1.9733157447209138,
"learning_rate": 4.613884433106306e-06,
"loss": 0.528,
"step": 413
},
{
"epoch": 0.37,
"grad_norm": 1.994664364309963,
"learning_rate": 4.612017082851253e-06,
"loss": 0.5489,
"step": 414
},
{
"epoch": 0.37,
"grad_norm": 1.8266904473141898,
"learning_rate": 4.610145607781081e-06,
"loss": 0.5411,
"step": 415
},
{
"epoch": 0.37,
"grad_norm": 2.0294108873934364,
"learning_rate": 4.608270011550823e-06,
"loss": 0.5963,
"step": 416
},
{
"epoch": 0.37,
"grad_norm": 1.9735002273071562,
"learning_rate": 4.606390297823555e-06,
"loss": 0.5858,
"step": 417
},
{
"epoch": 0.37,
"grad_norm": 1.8987568737188125,
"learning_rate": 4.604506470270403e-06,
"loss": 0.493,
"step": 418
},
{
"epoch": 0.37,
"grad_norm": 1.9371998611194052,
"learning_rate": 4.6026185325705195e-06,
"loss": 0.521,
"step": 419
},
{
"epoch": 0.37,
"grad_norm": 1.8926221916061328,
"learning_rate": 4.60072648841109e-06,
"loss": 0.4922,
"step": 420
},
{
"epoch": 0.37,
"grad_norm": 1.8759546163633927,
"learning_rate": 4.598830341487317e-06,
"loss": 0.5487,
"step": 421
},
{
"epoch": 0.37,
"grad_norm": 1.9425705301229708,
"learning_rate": 4.596930095502416e-06,
"loss": 0.5155,
"step": 422
},
{
"epoch": 0.37,
"grad_norm": 1.8718904454318124,
"learning_rate": 4.59502575416761e-06,
"loss": 0.5372,
"step": 423
},
{
"epoch": 0.38,
"grad_norm": 1.8361742824749525,
"learning_rate": 4.593117321202117e-06,
"loss": 0.556,
"step": 424
},
{
"epoch": 0.38,
"grad_norm": 1.8520540031413573,
"learning_rate": 4.59120480033315e-06,
"loss": 0.6213,
"step": 425
},
{
"epoch": 0.38,
"grad_norm": 1.9670746741442957,
"learning_rate": 4.5892881952959015e-06,
"loss": 0.5685,
"step": 426
},
{
"epoch": 0.38,
"grad_norm": 1.969557039139786,
"learning_rate": 4.587367509833543e-06,
"loss": 0.5472,
"step": 427
},
{
"epoch": 0.38,
"grad_norm": 1.9873217018861624,
"learning_rate": 4.585442747697218e-06,
"loss": 0.5419,
"step": 428
},
{
"epoch": 0.38,
"grad_norm": 1.9508580236237527,
"learning_rate": 4.5835139126460234e-06,
"loss": 0.566,
"step": 429
},
{
"epoch": 0.38,
"grad_norm": 1.8929503262145966,
"learning_rate": 4.58158100844702e-06,
"loss": 0.5526,
"step": 430
},
{
"epoch": 0.38,
"grad_norm": 1.9394545018501204,
"learning_rate": 4.57964403887521e-06,
"loss": 0.5469,
"step": 431
},
{
"epoch": 0.38,
"grad_norm": 2.1045619298179927,
"learning_rate": 4.577703007713538e-06,
"loss": 0.5397,
"step": 432
},
{
"epoch": 0.38,
"grad_norm": 1.8886665443222683,
"learning_rate": 4.575757918752879e-06,
"loss": 0.5174,
"step": 433
},
{
"epoch": 0.38,
"grad_norm": 1.849256286655662,
"learning_rate": 4.573808775792033e-06,
"loss": 0.558,
"step": 434
},
{
"epoch": 0.39,
"grad_norm": 1.89537230772545,
"learning_rate": 4.5718555826377195e-06,
"loss": 0.6155,
"step": 435
},
{
"epoch": 0.39,
"grad_norm": 2.028600611269796,
"learning_rate": 4.569898343104568e-06,
"loss": 0.5639,
"step": 436
},
{
"epoch": 0.39,
"grad_norm": 2.1153787641168273,
"learning_rate": 4.567937061015107e-06,
"loss": 0.5883,
"step": 437
},
{
"epoch": 0.39,
"grad_norm": 2.0217937777574075,
"learning_rate": 4.5659717401997655e-06,
"loss": 0.5936,
"step": 438
},
{
"epoch": 0.39,
"grad_norm": 2.248716610859176,
"learning_rate": 4.564002384496856e-06,
"loss": 0.5539,
"step": 439
},
{
"epoch": 0.39,
"grad_norm": 1.9689879082294663,
"learning_rate": 4.562028997752574e-06,
"loss": 0.5636,
"step": 440
},
{
"epoch": 0.39,
"grad_norm": 1.763292547062648,
"learning_rate": 4.560051583820987e-06,
"loss": 0.5402,
"step": 441
},
{
"epoch": 0.39,
"grad_norm": 2.129235681815295,
"learning_rate": 4.558070146564025e-06,
"loss": 0.5279,
"step": 442
},
{
"epoch": 0.39,
"grad_norm": 1.987329959970642,
"learning_rate": 4.55608468985148e-06,
"loss": 0.5597,
"step": 443
},
{
"epoch": 0.39,
"grad_norm": 1.8223595251951752,
"learning_rate": 4.554095217560991e-06,
"loss": 0.5523,
"step": 444
},
{
"epoch": 0.39,
"grad_norm": 1.8945373677348296,
"learning_rate": 4.55210173357804e-06,
"loss": 0.5611,
"step": 445
},
{
"epoch": 0.4,
"grad_norm": 1.8010628987468362,
"learning_rate": 4.550104241795946e-06,
"loss": 0.5406,
"step": 446
},
{
"epoch": 0.4,
"grad_norm": 1.7680591979019162,
"learning_rate": 4.548102746115852e-06,
"loss": 0.5392,
"step": 447
},
{
"epoch": 0.4,
"grad_norm": 1.9894409183828397,
"learning_rate": 4.546097250446724e-06,
"loss": 0.568,
"step": 448
},
{
"epoch": 0.4,
"grad_norm": 1.9527217933389673,
"learning_rate": 4.544087758705338e-06,
"loss": 0.5616,
"step": 449
},
{
"epoch": 0.4,
"grad_norm": 1.8813970745759399,
"learning_rate": 4.5420742748162735e-06,
"loss": 0.5857,
"step": 450
},
{
"epoch": 0.4,
"grad_norm": 1.9697471415378363,
"learning_rate": 4.540056802711911e-06,
"loss": 0.5563,
"step": 451
},
{
"epoch": 0.4,
"grad_norm": 1.8610261764458738,
"learning_rate": 4.5380353463324135e-06,
"loss": 0.5414,
"step": 452
},
{
"epoch": 0.4,
"grad_norm": 2.0760585222699075,
"learning_rate": 4.536009909625733e-06,
"loss": 0.6113,
"step": 453
},
{
"epoch": 0.4,
"grad_norm": 1.9376608369819073,
"learning_rate": 4.533980496547588e-06,
"loss": 0.5567,
"step": 454
},
{
"epoch": 0.4,
"grad_norm": 1.9360208325717025,
"learning_rate": 4.5319471110614676e-06,
"loss": 0.5637,
"step": 455
},
{
"epoch": 0.4,
"grad_norm": 1.9103146510774847,
"learning_rate": 4.529909757138619e-06,
"loss": 0.5049,
"step": 456
},
{
"epoch": 0.4,
"grad_norm": 1.9645365532954322,
"learning_rate": 4.5278684387580356e-06,
"loss": 0.5424,
"step": 457
},
{
"epoch": 0.41,
"grad_norm": 2.0430691701895065,
"learning_rate": 4.52582315990646e-06,
"loss": 0.547,
"step": 458
},
{
"epoch": 0.41,
"grad_norm": 1.995685349345533,
"learning_rate": 4.523773924578362e-06,
"loss": 0.6005,
"step": 459
},
{
"epoch": 0.41,
"grad_norm": 1.9830544751269077,
"learning_rate": 4.521720736775947e-06,
"loss": 0.5563,
"step": 460
},
{
"epoch": 0.41,
"grad_norm": 1.8473463212841006,
"learning_rate": 4.519663600509131e-06,
"loss": 0.5913,
"step": 461
},
{
"epoch": 0.41,
"grad_norm": 1.8993140839815026,
"learning_rate": 4.5176025197955495e-06,
"loss": 0.5653,
"step": 462
},
{
"epoch": 0.41,
"grad_norm": 1.8179551662772986,
"learning_rate": 4.515537498660535e-06,
"loss": 0.5485,
"step": 463
},
{
"epoch": 0.41,
"grad_norm": 1.9275228062086758,
"learning_rate": 4.51346854113712e-06,
"loss": 0.5248,
"step": 464
},
{
"epoch": 0.41,
"grad_norm": 1.9668428438048349,
"learning_rate": 4.511395651266023e-06,
"loss": 0.5939,
"step": 465
},
{
"epoch": 0.41,
"grad_norm": 1.9602042152930792,
"learning_rate": 4.509318833095642e-06,
"loss": 0.5452,
"step": 466
},
{
"epoch": 0.41,
"grad_norm": 1.8348566721600683,
"learning_rate": 4.507238090682049e-06,
"loss": 0.5514,
"step": 467
},
{
"epoch": 0.41,
"grad_norm": 1.938525142403929,
"learning_rate": 4.505153428088979e-06,
"loss": 0.5822,
"step": 468
},
{
"epoch": 0.42,
"grad_norm": 2.008973560332548,
"learning_rate": 4.503064849387822e-06,
"loss": 0.5765,
"step": 469
},
{
"epoch": 0.42,
"grad_norm": 1.8911779425902009,
"learning_rate": 4.500972358657618e-06,
"loss": 0.5465,
"step": 470
},
{
"epoch": 0.42,
"grad_norm": 1.9224818772820709,
"learning_rate": 4.4988759599850485e-06,
"loss": 0.5897,
"step": 471
},
{
"epoch": 0.42,
"grad_norm": 1.990817812633161,
"learning_rate": 4.496775657464423e-06,
"loss": 0.5505,
"step": 472
},
{
"epoch": 0.42,
"grad_norm": 1.9167562026803746,
"learning_rate": 4.4946714551976795e-06,
"loss": 0.5779,
"step": 473
},
{
"epoch": 0.42,
"grad_norm": 1.9388400892712594,
"learning_rate": 4.492563357294369e-06,
"loss": 0.574,
"step": 474
},
{
"epoch": 0.42,
"grad_norm": 2.0140312788131762,
"learning_rate": 4.490451367871655e-06,
"loss": 0.4928,
"step": 475
},
{
"epoch": 0.42,
"grad_norm": 2.074902721101316,
"learning_rate": 4.488335491054296e-06,
"loss": 0.5366,
"step": 476
},
{
"epoch": 0.42,
"grad_norm": 1.8245504149698855,
"learning_rate": 4.486215730974646e-06,
"loss": 0.581,
"step": 477
},
{
"epoch": 0.42,
"grad_norm": 2.1100306515160656,
"learning_rate": 4.4840920917726425e-06,
"loss": 0.5677,
"step": 478
},
{
"epoch": 0.42,
"grad_norm": 1.9560380000004616,
"learning_rate": 4.4819645775958e-06,
"loss": 0.5426,
"step": 479
},
{
"epoch": 0.43,
"grad_norm": 1.721267171163405,
"learning_rate": 4.479833192599198e-06,
"loss": 0.5868,
"step": 480
},
{
"epoch": 0.43,
"grad_norm": 2.0001169229847124,
"learning_rate": 4.477697940945478e-06,
"loss": 0.5667,
"step": 481
},
{
"epoch": 0.43,
"grad_norm": 2.0111322894409134,
"learning_rate": 4.475558826804833e-06,
"loss": 0.5707,
"step": 482
},
{
"epoch": 0.43,
"grad_norm": 1.8179588699061133,
"learning_rate": 4.473415854355e-06,
"loss": 0.5484,
"step": 483
},
{
"epoch": 0.43,
"grad_norm": 2.0491236128150345,
"learning_rate": 4.47126902778125e-06,
"loss": 0.5575,
"step": 484
},
{
"epoch": 0.43,
"grad_norm": 2.049676347036571,
"learning_rate": 4.469118351276381e-06,
"loss": 0.5807,
"step": 485
},
{
"epoch": 0.43,
"grad_norm": 1.8999028972772445,
"learning_rate": 4.4669638290407115e-06,
"loss": 0.5447,
"step": 486
},
{
"epoch": 0.43,
"grad_norm": 2.0754807768031687,
"learning_rate": 4.464805465282071e-06,
"loss": 0.503,
"step": 487
},
{
"epoch": 0.43,
"grad_norm": 1.9532719169013661,
"learning_rate": 4.462643264215789e-06,
"loss": 0.5304,
"step": 488
},
{
"epoch": 0.43,
"grad_norm": 2.038547881198709,
"learning_rate": 4.460477230064693e-06,
"loss": 0.6116,
"step": 489
},
{
"epoch": 0.43,
"grad_norm": 2.1342568039197136,
"learning_rate": 4.458307367059092e-06,
"loss": 0.5632,
"step": 490
},
{
"epoch": 0.43,
"grad_norm": 1.9267024509918977,
"learning_rate": 4.456133679436778e-06,
"loss": 0.5574,
"step": 491
},
{
"epoch": 0.44,
"grad_norm": 1.795213135692931,
"learning_rate": 4.453956171443008e-06,
"loss": 0.5737,
"step": 492
},
{
"epoch": 0.44,
"grad_norm": 1.9428252328171443,
"learning_rate": 4.451774847330505e-06,
"loss": 0.5685,
"step": 493
},
{
"epoch": 0.44,
"grad_norm": 1.7903749800219122,
"learning_rate": 4.449589711359439e-06,
"loss": 0.5214,
"step": 494
},
{
"epoch": 0.44,
"grad_norm": 2.111615491479605,
"learning_rate": 4.447400767797429e-06,
"loss": 0.5329,
"step": 495
},
{
"epoch": 0.44,
"grad_norm": 1.936578332165912,
"learning_rate": 4.445208020919531e-06,
"loss": 0.543,
"step": 496
},
{
"epoch": 0.44,
"grad_norm": 2.0005145681262473,
"learning_rate": 4.4430114750082246e-06,
"loss": 0.5593,
"step": 497
},
{
"epoch": 0.44,
"grad_norm": 1.9720912009242426,
"learning_rate": 4.4408111343534125e-06,
"loss": 0.5812,
"step": 498
},
{
"epoch": 0.44,
"grad_norm": 2.0486055586452787,
"learning_rate": 4.4386070032524085e-06,
"loss": 0.5563,
"step": 499
},
{
"epoch": 0.44,
"grad_norm": 1.8043262288689983,
"learning_rate": 4.436399086009928e-06,
"loss": 0.4905,
"step": 500
},
{
"epoch": 0.44,
"grad_norm": 1.9608580808640215,
"learning_rate": 4.43418738693808e-06,
"loss": 0.5548,
"step": 501
},
{
"epoch": 0.44,
"grad_norm": 2.008548225584814,
"learning_rate": 4.431971910356363e-06,
"loss": 0.5955,
"step": 502
},
{
"epoch": 0.45,
"grad_norm": 1.8974274240345173,
"learning_rate": 4.429752660591648e-06,
"loss": 0.5742,
"step": 503
},
{
"epoch": 0.45,
"grad_norm": 1.8257689605722616,
"learning_rate": 4.427529641978181e-06,
"loss": 0.6177,
"step": 504
},
{
"epoch": 0.45,
"grad_norm": 2.0327301577551764,
"learning_rate": 4.425302858857563e-06,
"loss": 0.5872,
"step": 505
},
{
"epoch": 0.45,
"grad_norm": 1.9539661576324254,
"learning_rate": 4.42307231557875e-06,
"loss": 0.5728,
"step": 506
},
{
"epoch": 0.45,
"grad_norm": 1.9346302819034207,
"learning_rate": 4.420838016498043e-06,
"loss": 0.6019,
"step": 507
},
{
"epoch": 0.45,
"grad_norm": 2.1255667417446054,
"learning_rate": 4.418599965979074e-06,
"loss": 0.5981,
"step": 508
},
{
"epoch": 0.45,
"grad_norm": 1.8293805714793054,
"learning_rate": 4.416358168392806e-06,
"loss": 0.5497,
"step": 509
},
{
"epoch": 0.45,
"grad_norm": 1.929762647152706,
"learning_rate": 4.414112628117518e-06,
"loss": 0.5655,
"step": 510
},
{
"epoch": 0.45,
"grad_norm": 1.9808758258773635,
"learning_rate": 4.411863349538798e-06,
"loss": 0.5465,
"step": 511
},
{
"epoch": 0.45,
"grad_norm": 2.0413084054198647,
"learning_rate": 4.409610337049537e-06,
"loss": 0.5264,
"step": 512
},
{
"epoch": 0.45,
"grad_norm": 1.9506473664088613,
"learning_rate": 4.4073535950499155e-06,
"loss": 0.5284,
"step": 513
},
{
"epoch": 0.46,
"grad_norm": 1.7875399190820846,
"learning_rate": 4.405093127947402e-06,
"loss": 0.5406,
"step": 514
},
{
"epoch": 0.46,
"grad_norm": 1.9594159192262046,
"learning_rate": 4.402828940156735e-06,
"loss": 0.573,
"step": 515
},
{
"epoch": 0.46,
"grad_norm": 2.025943836966642,
"learning_rate": 4.400561036099924e-06,
"loss": 0.5227,
"step": 516
},
{
"epoch": 0.46,
"grad_norm": 1.9439140060564322,
"learning_rate": 4.398289420206235e-06,
"loss": 0.5802,
"step": 517
},
{
"epoch": 0.46,
"grad_norm": 1.891060025336787,
"learning_rate": 4.396014096912182e-06,
"loss": 0.55,
"step": 518
},
{
"epoch": 0.46,
"grad_norm": 1.9575594944193413,
"learning_rate": 4.393735070661521e-06,
"loss": 0.5213,
"step": 519
},
{
"epoch": 0.46,
"grad_norm": 2.024463679893138,
"learning_rate": 4.391452345905239e-06,
"loss": 0.5354,
"step": 520
},
{
"epoch": 0.46,
"grad_norm": 1.825359223217947,
"learning_rate": 4.389165927101549e-06,
"loss": 0.5506,
"step": 521
},
{
"epoch": 0.46,
"grad_norm": 2.0284690208197484,
"learning_rate": 4.386875818715875e-06,
"loss": 0.5763,
"step": 522
},
{
"epoch": 0.46,
"grad_norm": 1.9021830177238082,
"learning_rate": 4.3845820252208476e-06,
"loss": 0.5596,
"step": 523
},
{
"epoch": 0.46,
"grad_norm": 2.0000504821060203,
"learning_rate": 4.3822845510962966e-06,
"loss": 0.5701,
"step": 524
},
{
"epoch": 0.47,
"grad_norm": 1.7341340075311633,
"learning_rate": 4.379983400829237e-06,
"loss": 0.5315,
"step": 525
},
{
"epoch": 0.47,
"grad_norm": 1.9297447671947465,
"learning_rate": 4.377678578913868e-06,
"loss": 0.5798,
"step": 526
},
{
"epoch": 0.47,
"grad_norm": 1.9233069620366818,
"learning_rate": 4.375370089851554e-06,
"loss": 0.5391,
"step": 527
},
{
"epoch": 0.47,
"grad_norm": 1.976671700063146,
"learning_rate": 4.3730579381508254e-06,
"loss": 0.5674,
"step": 528
},
{
"epoch": 0.47,
"grad_norm": 1.914097057045113,
"learning_rate": 4.3707421283273645e-06,
"loss": 0.5367,
"step": 529
},
{
"epoch": 0.47,
"grad_norm": 1.8477362806445459,
"learning_rate": 4.368422664903997e-06,
"loss": 0.5349,
"step": 530
},
{
"epoch": 0.47,
"grad_norm": 1.9704477099484594,
"learning_rate": 4.366099552410686e-06,
"loss": 0.501,
"step": 531
},
{
"epoch": 0.47,
"grad_norm": 1.9297086500071385,
"learning_rate": 4.363772795384522e-06,
"loss": 0.5352,
"step": 532
},
{
"epoch": 0.47,
"grad_norm": 1.9090996748848685,
"learning_rate": 4.36144239836971e-06,
"loss": 0.5457,
"step": 533
},
{
"epoch": 0.47,
"grad_norm": 1.905870882711107,
"learning_rate": 4.3591083659175655e-06,
"loss": 0.5685,
"step": 534
},
{
"epoch": 0.47,
"grad_norm": 1.968618442539214,
"learning_rate": 4.356770702586506e-06,
"loss": 0.5476,
"step": 535
},
{
"epoch": 0.47,
"grad_norm": 1.9431218136805426,
"learning_rate": 4.354429412942038e-06,
"loss": 0.5719,
"step": 536
},
{
"epoch": 0.48,
"grad_norm": 2.0756451350956215,
"learning_rate": 4.3520845015567495e-06,
"loss": 0.5502,
"step": 537
},
{
"epoch": 0.48,
"grad_norm": 1.8350117686217275,
"learning_rate": 4.349735973010306e-06,
"loss": 0.5417,
"step": 538
},
{
"epoch": 0.48,
"grad_norm": 2.03495920394236,
"learning_rate": 4.3473838318894324e-06,
"loss": 0.545,
"step": 539
},
{
"epoch": 0.48,
"grad_norm": 1.7864245375307775,
"learning_rate": 4.3450280827879125e-06,
"loss": 0.5242,
"step": 540
},
{
"epoch": 0.48,
"grad_norm": 1.9018530036883652,
"learning_rate": 4.342668730306575e-06,
"loss": 0.554,
"step": 541
},
{
"epoch": 0.48,
"grad_norm": 1.8575071370513128,
"learning_rate": 4.340305779053286e-06,
"loss": 0.5287,
"step": 542
},
{
"epoch": 0.48,
"grad_norm": 1.8480049595126469,
"learning_rate": 4.33793923364294e-06,
"loss": 0.5554,
"step": 543
},
{
"epoch": 0.48,
"grad_norm": 2.103039565778625,
"learning_rate": 4.335569098697454e-06,
"loss": 0.5526,
"step": 544
},
{
"epoch": 0.48,
"grad_norm": 1.8712145108160219,
"learning_rate": 4.33319537884575e-06,
"loss": 0.5472,
"step": 545
},
{
"epoch": 0.48,
"grad_norm": 1.9271972466285336,
"learning_rate": 4.330818078723756e-06,
"loss": 0.5827,
"step": 546
},
{
"epoch": 0.48,
"grad_norm": 1.954438973741856,
"learning_rate": 4.328437202974389e-06,
"loss": 0.5433,
"step": 547
},
{
"epoch": 0.49,
"grad_norm": 2.0467264178153726,
"learning_rate": 4.326052756247553e-06,
"loss": 0.5981,
"step": 548
},
{
"epoch": 0.49,
"grad_norm": 1.9418055408636266,
"learning_rate": 4.323664743200123e-06,
"loss": 0.5832,
"step": 549
},
{
"epoch": 0.49,
"grad_norm": 2.444044603553196,
"learning_rate": 4.32127316849594e-06,
"loss": 0.5638,
"step": 550
},
{
"epoch": 0.49,
"grad_norm": 1.8791947879326414,
"learning_rate": 4.318878036805802e-06,
"loss": 0.5864,
"step": 551
},
{
"epoch": 0.49,
"grad_norm": 1.872356245946924,
"learning_rate": 4.3164793528074525e-06,
"loss": 0.5337,
"step": 552
},
{
"epoch": 0.49,
"grad_norm": 2.025493213646544,
"learning_rate": 4.3140771211855725e-06,
"loss": 0.5401,
"step": 553
},
{
"epoch": 0.49,
"grad_norm": 1.9845857759145742,
"learning_rate": 4.3116713466317745e-06,
"loss": 0.5712,
"step": 554
},
{
"epoch": 0.49,
"grad_norm": 1.9091874317608197,
"learning_rate": 4.309262033844587e-06,
"loss": 0.5337,
"step": 555
},
{
"epoch": 0.49,
"grad_norm": 1.926646558220673,
"learning_rate": 4.30684918752945e-06,
"loss": 0.5787,
"step": 556
},
{
"epoch": 0.49,
"grad_norm": 2.0450560123448165,
"learning_rate": 4.304432812398704e-06,
"loss": 0.5704,
"step": 557
},
{
"epoch": 0.49,
"grad_norm": 1.915800332391142,
"learning_rate": 4.302012913171584e-06,
"loss": 0.5194,
"step": 558
},
{
"epoch": 0.5,
"grad_norm": 1.9050588229807015,
"learning_rate": 4.299589494574204e-06,
"loss": 0.5104,
"step": 559
},
{
"epoch": 0.5,
"grad_norm": 1.9241714112001687,
"learning_rate": 4.297162561339554e-06,
"loss": 0.5388,
"step": 560
},
{
"epoch": 0.5,
"grad_norm": 1.8520273210081386,
"learning_rate": 4.294732118207486e-06,
"loss": 0.5363,
"step": 561
},
{
"epoch": 0.5,
"grad_norm": 2.0240180827444205,
"learning_rate": 4.292298169924709e-06,
"loss": 0.5632,
"step": 562
},
{
"epoch": 0.5,
"grad_norm": 1.8385436745856445,
"learning_rate": 4.289860721244776e-06,
"loss": 0.542,
"step": 563
},
{
"epoch": 0.5,
"grad_norm": 1.9260618068482396,
"learning_rate": 4.287419776928078e-06,
"loss": 0.5555,
"step": 564
},
{
"epoch": 0.5,
"grad_norm": 3.155290692386073,
"learning_rate": 4.284975341741833e-06,
"loss": 0.5336,
"step": 565
},
{
"epoch": 0.5,
"grad_norm": 2.461077264148098,
"learning_rate": 4.282527420460073e-06,
"loss": 0.5794,
"step": 566
},
{
"epoch": 0.5,
"grad_norm": 1.8539810703173831,
"learning_rate": 4.280076017863643e-06,
"loss": 0.5298,
"step": 567
},
{
"epoch": 0.5,
"grad_norm": 1.981150552962984,
"learning_rate": 4.277621138740185e-06,
"loss": 0.5862,
"step": 568
},
{
"epoch": 0.5,
"grad_norm": 1.8768796036679432,
"learning_rate": 4.275162787884132e-06,
"loss": 0.5255,
"step": 569
},
{
"epoch": 0.5,
"grad_norm": 2.022795676637582,
"learning_rate": 4.272700970096696e-06,
"loss": 0.5984,
"step": 570
},
{
"epoch": 0.51,
"grad_norm": 1.835618231704385,
"learning_rate": 4.27023569018586e-06,
"loss": 0.5297,
"step": 571
},
{
"epoch": 0.51,
"grad_norm": 1.853495005213679,
"learning_rate": 4.267766952966369e-06,
"loss": 0.5188,
"step": 572
},
{
"epoch": 0.51,
"grad_norm": 1.8841750183665413,
"learning_rate": 4.265294763259721e-06,
"loss": 0.5678,
"step": 573
},
{
"epoch": 0.51,
"grad_norm": 1.8013177249236558,
"learning_rate": 4.262819125894156e-06,
"loss": 0.5286,
"step": 574
},
{
"epoch": 0.51,
"grad_norm": 1.8320928495052518,
"learning_rate": 4.2603400457046476e-06,
"loss": 0.5341,
"step": 575
},
{
"epoch": 0.51,
"grad_norm": 1.8323864124122828,
"learning_rate": 4.257857527532891e-06,
"loss": 0.5283,
"step": 576
},
{
"epoch": 0.51,
"grad_norm": 1.9487038959665601,
"learning_rate": 4.255371576227301e-06,
"loss": 0.5418,
"step": 577
},
{
"epoch": 0.51,
"grad_norm": 1.7875154296015772,
"learning_rate": 4.252882196642993e-06,
"loss": 0.5065,
"step": 578
},
{
"epoch": 0.51,
"grad_norm": 2.089827238376911,
"learning_rate": 4.250389393641778e-06,
"loss": 0.5919,
"step": 579
},
{
"epoch": 0.51,
"grad_norm": 1.9078348658003164,
"learning_rate": 4.247893172092157e-06,
"loss": 0.5212,
"step": 580
},
{
"epoch": 0.51,
"grad_norm": 1.9952457072102052,
"learning_rate": 4.245393536869303e-06,
"loss": 0.5284,
"step": 581
},
{
"epoch": 0.52,
"grad_norm": 2.0728561008210384,
"learning_rate": 4.242890492855056e-06,
"loss": 0.5214,
"step": 582
},
{
"epoch": 0.52,
"grad_norm": 1.97825451090628,
"learning_rate": 4.240384044937919e-06,
"loss": 0.5586,
"step": 583
},
{
"epoch": 0.52,
"grad_norm": 1.85380003580073,
"learning_rate": 4.237874198013037e-06,
"loss": 0.6078,
"step": 584
},
{
"epoch": 0.52,
"grad_norm": 1.8198051628607304,
"learning_rate": 4.235360956982196e-06,
"loss": 0.5677,
"step": 585
},
{
"epoch": 0.52,
"grad_norm": 2.1343351043013183,
"learning_rate": 4.23284432675381e-06,
"loss": 0.5706,
"step": 586
},
{
"epoch": 0.52,
"grad_norm": 2.0294462862804896,
"learning_rate": 4.230324312242911e-06,
"loss": 0.5399,
"step": 587
},
{
"epoch": 0.52,
"grad_norm": 1.9618881336969853,
"learning_rate": 4.227800918371145e-06,
"loss": 0.5292,
"step": 588
},
{
"epoch": 0.52,
"grad_norm": 1.9665398714083597,
"learning_rate": 4.225274150066752e-06,
"loss": 0.5414,
"step": 589
},
{
"epoch": 0.52,
"grad_norm": 2.0976099857689268,
"learning_rate": 4.222744012264567e-06,
"loss": 0.5204,
"step": 590
},
{
"epoch": 0.52,
"grad_norm": 1.968032018982793,
"learning_rate": 4.220210509906002e-06,
"loss": 0.5622,
"step": 591
},
{
"epoch": 0.52,
"grad_norm": 2.0055542027073523,
"learning_rate": 4.217673647939044e-06,
"loss": 0.5723,
"step": 592
},
{
"epoch": 0.53,
"grad_norm": 2.031612125247833,
"learning_rate": 4.215133431318239e-06,
"loss": 0.5727,
"step": 593
},
{
"epoch": 0.53,
"grad_norm": 2.04253552367063,
"learning_rate": 4.212589865004684e-06,
"loss": 0.5676,
"step": 594
},
{
"epoch": 0.53,
"grad_norm": 1.9143447724555291,
"learning_rate": 4.2100429539660205e-06,
"loss": 0.5452,
"step": 595
},
{
"epoch": 0.53,
"grad_norm": 2.1284999811605334,
"learning_rate": 4.20749270317642e-06,
"loss": 0.5679,
"step": 596
},
{
"epoch": 0.53,
"grad_norm": 1.9726237378545723,
"learning_rate": 4.204939117616578e-06,
"loss": 0.5514,
"step": 597
},
{
"epoch": 0.53,
"grad_norm": 2.0537722291479583,
"learning_rate": 4.202382202273702e-06,
"loss": 0.5979,
"step": 598
},
{
"epoch": 0.53,
"grad_norm": 1.9695944675405062,
"learning_rate": 4.1998219621415035e-06,
"loss": 0.5519,
"step": 599
},
{
"epoch": 0.53,
"grad_norm": 2.1175148159531196,
"learning_rate": 4.197258402220187e-06,
"loss": 0.5437,
"step": 600
},
{
"epoch": 0.53,
"grad_norm": 1.9698920488340708,
"learning_rate": 4.19469152751644e-06,
"loss": 0.5765,
"step": 601
},
{
"epoch": 0.53,
"grad_norm": 1.879379971551763,
"learning_rate": 4.192121343043424e-06,
"loss": 0.5219,
"step": 602
},
{
"epoch": 0.53,
"grad_norm": 1.9668215341266202,
"learning_rate": 4.189547853820767e-06,
"loss": 0.4967,
"step": 603
},
{
"epoch": 0.53,
"grad_norm": 2.0264415648360723,
"learning_rate": 4.186971064874547e-06,
"loss": 0.5591,
"step": 604
},
{
"epoch": 0.54,
"grad_norm": 1.9996711001240413,
"learning_rate": 4.18439098123729e-06,
"loss": 0.5909,
"step": 605
},
{
"epoch": 0.54,
"grad_norm": 1.9209919754307736,
"learning_rate": 4.181807607947954e-06,
"loss": 0.5516,
"step": 606
},
{
"epoch": 0.54,
"grad_norm": 1.8120062816345244,
"learning_rate": 4.1792209500519245e-06,
"loss": 0.5112,
"step": 607
},
{
"epoch": 0.54,
"grad_norm": 1.9265993932694714,
"learning_rate": 4.176631012601e-06,
"loss": 0.5716,
"step": 608
},
{
"epoch": 0.54,
"grad_norm": 1.7951063568824173,
"learning_rate": 4.1740378006533835e-06,
"loss": 0.5546,
"step": 609
},
{
"epoch": 0.54,
"grad_norm": 1.9478736935670538,
"learning_rate": 4.1714413192736756e-06,
"loss": 0.5137,
"step": 610
},
{
"epoch": 0.54,
"grad_norm": 1.9166713700159672,
"learning_rate": 4.168841573532859e-06,
"loss": 0.5285,
"step": 611
},
{
"epoch": 0.54,
"grad_norm": 1.903061790874867,
"learning_rate": 4.166238568508294e-06,
"loss": 0.5643,
"step": 612
},
{
"epoch": 0.54,
"grad_norm": 1.8709574261812854,
"learning_rate": 4.1636323092837065e-06,
"loss": 0.5531,
"step": 613
},
{
"epoch": 0.54,
"grad_norm": 1.891374469060374,
"learning_rate": 4.161022800949177e-06,
"loss": 0.5386,
"step": 614
},
{
"epoch": 0.54,
"grad_norm": 1.8621023435008923,
"learning_rate": 4.1584100486011315e-06,
"loss": 0.5472,
"step": 615
},
{
"epoch": 0.55,
"grad_norm": 1.8927480615848256,
"learning_rate": 4.155794057342333e-06,
"loss": 0.567,
"step": 616
},
{
"epoch": 0.55,
"grad_norm": 1.9157957155248084,
"learning_rate": 4.153174832281867e-06,
"loss": 0.5295,
"step": 617
},
{
"epoch": 0.55,
"grad_norm": 1.7900976303440275,
"learning_rate": 4.150552378535138e-06,
"loss": 0.5374,
"step": 618
},
{
"epoch": 0.55,
"grad_norm": 1.9233860209522704,
"learning_rate": 4.1479267012238555e-06,
"loss": 0.5673,
"step": 619
},
{
"epoch": 0.55,
"grad_norm": 1.904244620695313,
"learning_rate": 4.145297805476023e-06,
"loss": 0.5674,
"step": 620
},
{
"epoch": 0.55,
"grad_norm": 1.8633100020518014,
"learning_rate": 4.142665696425932e-06,
"loss": 0.5717,
"step": 621
},
{
"epoch": 0.55,
"grad_norm": 2.0449274851229764,
"learning_rate": 4.140030379214147e-06,
"loss": 0.5382,
"step": 622
},
{
"epoch": 0.55,
"grad_norm": 1.8437126524936716,
"learning_rate": 4.137391858987502e-06,
"loss": 0.5635,
"step": 623
},
{
"epoch": 0.55,
"grad_norm": 1.9476300616110815,
"learning_rate": 4.134750140899082e-06,
"loss": 0.5354,
"step": 624
},
{
"epoch": 0.55,
"grad_norm": 1.8187836169409277,
"learning_rate": 4.132105230108221e-06,
"loss": 0.5678,
"step": 625
},
{
"epoch": 0.55,
"grad_norm": 1.8325255303792565,
"learning_rate": 4.1294571317804854e-06,
"loss": 0.5497,
"step": 626
},
{
"epoch": 0.56,
"grad_norm": 1.947073088948294,
"learning_rate": 4.12680585108767e-06,
"loss": 0.6005,
"step": 627
},
{
"epoch": 0.56,
"grad_norm": 1.9094602677105208,
"learning_rate": 4.1241513932077835e-06,
"loss": 0.5442,
"step": 628
},
{
"epoch": 0.56,
"grad_norm": 1.9308069577521967,
"learning_rate": 4.121493763325039e-06,
"loss": 0.4952,
"step": 629
},
{
"epoch": 0.56,
"grad_norm": 1.955225453108231,
"learning_rate": 4.118832966629847e-06,
"loss": 0.5161,
"step": 630
},
{
"epoch": 0.56,
"grad_norm": 1.8884686835300686,
"learning_rate": 4.116169008318798e-06,
"loss": 0.5834,
"step": 631
},
{
"epoch": 0.56,
"grad_norm": 1.851971220446282,
"learning_rate": 4.113501893594662e-06,
"loss": 0.5762,
"step": 632
},
{
"epoch": 0.56,
"grad_norm": 1.982231343732386,
"learning_rate": 4.110831627666372e-06,
"loss": 0.5043,
"step": 633
},
{
"epoch": 0.56,
"grad_norm": 1.8783480932058496,
"learning_rate": 4.108158215749014e-06,
"loss": 0.5202,
"step": 634
},
{
"epoch": 0.56,
"grad_norm": 1.7472053862830499,
"learning_rate": 4.105481663063821e-06,
"loss": 0.5064,
"step": 635
},
{
"epoch": 0.56,
"grad_norm": 4.71435326799849,
"learning_rate": 4.102801974838158e-06,
"loss": 0.5808,
"step": 636
},
{
"epoch": 0.56,
"grad_norm": 1.9383972995582568,
"learning_rate": 4.100119156305514e-06,
"loss": 0.5268,
"step": 637
},
{
"epoch": 0.57,
"grad_norm": 1.7165619283230378,
"learning_rate": 4.097433212705492e-06,
"loss": 0.5376,
"step": 638
},
{
"epoch": 0.57,
"grad_norm": 1.8524888535442023,
"learning_rate": 4.094744149283796e-06,
"loss": 0.5388,
"step": 639
},
{
"epoch": 0.57,
"grad_norm": 1.958121956311822,
"learning_rate": 4.092051971292228e-06,
"loss": 0.5273,
"step": 640
},
{
"epoch": 0.57,
"grad_norm": 1.8752806971174674,
"learning_rate": 4.089356683988668e-06,
"loss": 0.5283,
"step": 641
},
{
"epoch": 0.57,
"grad_norm": 2.4399117721583465,
"learning_rate": 4.086658292637072e-06,
"loss": 0.5643,
"step": 642
},
{
"epoch": 0.57,
"grad_norm": 1.897865148445396,
"learning_rate": 4.083956802507456e-06,
"loss": 0.5432,
"step": 643
},
{
"epoch": 0.57,
"grad_norm": 2.0947253224544826,
"learning_rate": 4.0812522188758874e-06,
"loss": 0.6738,
"step": 644
},
{
"epoch": 0.57,
"grad_norm": 1.8801252766945993,
"learning_rate": 4.078544547024479e-06,
"loss": 0.5516,
"step": 645
},
{
"epoch": 0.57,
"grad_norm": 1.884681207915535,
"learning_rate": 4.075833792241371e-06,
"loss": 0.5521,
"step": 646
},
{
"epoch": 0.57,
"grad_norm": 1.911314829964074,
"learning_rate": 4.073119959820728e-06,
"loss": 0.5279,
"step": 647
},
{
"epoch": 0.57,
"grad_norm": 1.860637117587055,
"learning_rate": 4.070403055062721e-06,
"loss": 0.5543,
"step": 648
},
{
"epoch": 0.57,
"grad_norm": 2.0453601596603157,
"learning_rate": 4.0676830832735245e-06,
"loss": 0.5757,
"step": 649
},
{
"epoch": 0.58,
"grad_norm": 1.8114060321351384,
"learning_rate": 4.064960049765304e-06,
"loss": 0.5049,
"step": 650
},
{
"epoch": 0.58,
"grad_norm": 1.959305167631277,
"learning_rate": 4.062233959856202e-06,
"loss": 0.5378,
"step": 651
},
{
"epoch": 0.58,
"grad_norm": 1.8509512649844786,
"learning_rate": 4.059504818870332e-06,
"loss": 0.5695,
"step": 652
},
{
"epoch": 0.58,
"grad_norm": 2.0120311393374677,
"learning_rate": 4.056772632137762e-06,
"loss": 0.5548,
"step": 653
},
{
"epoch": 0.58,
"grad_norm": 2.185006431209757,
"learning_rate": 4.054037404994516e-06,
"loss": 0.5796,
"step": 654
},
{
"epoch": 0.58,
"grad_norm": 1.8639659087725635,
"learning_rate": 4.05129914278255e-06,
"loss": 0.503,
"step": 655
},
{
"epoch": 0.58,
"grad_norm": 2.0128366658538726,
"learning_rate": 4.048557850849749e-06,
"loss": 0.5543,
"step": 656
},
{
"epoch": 0.58,
"grad_norm": 2.0493127075126467,
"learning_rate": 4.045813534549917e-06,
"loss": 0.5971,
"step": 657
},
{
"epoch": 0.58,
"grad_norm": 1.8943877873256292,
"learning_rate": 4.043066199242762e-06,
"loss": 0.5512,
"step": 658
},
{
"epoch": 0.58,
"grad_norm": 1.8607643797927613,
"learning_rate": 4.04031585029389e-06,
"loss": 0.5755,
"step": 659
},
{
"epoch": 0.58,
"grad_norm": 1.933467010931308,
"learning_rate": 4.037562493074792e-06,
"loss": 0.546,
"step": 660
},
{
"epoch": 0.59,
"grad_norm": 1.870898209604796,
"learning_rate": 4.034806132962834e-06,
"loss": 0.5101,
"step": 661
},
{
"epoch": 0.59,
"grad_norm": 1.7765005525064146,
"learning_rate": 4.032046775341247e-06,
"loss": 0.535,
"step": 662
},
{
"epoch": 0.59,
"grad_norm": 1.808388020113739,
"learning_rate": 4.029284425599116e-06,
"loss": 0.5532,
"step": 663
},
{
"epoch": 0.59,
"grad_norm": 1.9444426383785842,
"learning_rate": 4.026519089131371e-06,
"loss": 0.5804,
"step": 664
},
{
"epoch": 0.59,
"grad_norm": 1.8810929458792174,
"learning_rate": 4.023750771338774e-06,
"loss": 0.5023,
"step": 665
},
{
"epoch": 0.59,
"grad_norm": 1.7587173598023012,
"learning_rate": 4.020979477627907e-06,
"loss": 0.588,
"step": 666
},
{
"epoch": 0.59,
"grad_norm": 1.8616544736960938,
"learning_rate": 4.018205213411169e-06,
"loss": 0.5604,
"step": 667
},
{
"epoch": 0.59,
"grad_norm": 1.8517363531329913,
"learning_rate": 4.015427984106759e-06,
"loss": 0.5503,
"step": 668
},
{
"epoch": 0.59,
"grad_norm": 1.7164279131663547,
"learning_rate": 4.012647795138664e-06,
"loss": 0.5353,
"step": 669
},
{
"epoch": 0.59,
"grad_norm": 1.8490922932257532,
"learning_rate": 4.009864651936653e-06,
"loss": 0.5527,
"step": 670
},
{
"epoch": 0.59,
"grad_norm": 1.9222471762582807,
"learning_rate": 4.007078559936268e-06,
"loss": 0.5449,
"step": 671
},
{
"epoch": 0.6,
"grad_norm": 1.7126406752680576,
"learning_rate": 4.0042895245788035e-06,
"loss": 0.5102,
"step": 672
},
{
"epoch": 0.6,
"grad_norm": 1.7999692875631594,
"learning_rate": 4.001497551311308e-06,
"loss": 0.514,
"step": 673
},
{
"epoch": 0.6,
"grad_norm": 1.8482521644616647,
"learning_rate": 3.998702645586565e-06,
"loss": 0.546,
"step": 674
},
{
"epoch": 0.6,
"grad_norm": 1.8124842120343776,
"learning_rate": 3.995904812863086e-06,
"loss": 0.5432,
"step": 675
},
{
"epoch": 0.6,
"grad_norm": 1.9053654350943952,
"learning_rate": 3.993104058605099e-06,
"loss": 0.6222,
"step": 676
},
{
"epoch": 0.6,
"grad_norm": 1.851530834120678,
"learning_rate": 3.9903003882825396e-06,
"loss": 0.5069,
"step": 677
},
{
"epoch": 0.6,
"grad_norm": 1.824612938648448,
"learning_rate": 3.987493807371033e-06,
"loss": 0.5279,
"step": 678
},
{
"epoch": 0.6,
"grad_norm": 1.8322983038942529,
"learning_rate": 3.984684321351895e-06,
"loss": 0.504,
"step": 679
},
{
"epoch": 0.6,
"grad_norm": 2.1601679247075105,
"learning_rate": 3.981871935712112e-06,
"loss": 0.5448,
"step": 680
},
{
"epoch": 0.6,
"grad_norm": 1.9324323412240167,
"learning_rate": 3.979056655944335e-06,
"loss": 0.5696,
"step": 681
},
{
"epoch": 0.6,
"grad_norm": 1.8887222870071794,
"learning_rate": 3.9762384875468645e-06,
"loss": 0.5147,
"step": 682
},
{
"epoch": 0.6,
"grad_norm": 1.9025483031058836,
"learning_rate": 3.973417436023646e-06,
"loss": 0.5322,
"step": 683
},
{
"epoch": 0.61,
"grad_norm": 1.944754689874286,
"learning_rate": 3.970593506884254e-06,
"loss": 0.564,
"step": 684
},
{
"epoch": 0.61,
"grad_norm": 1.8782062559948918,
"learning_rate": 3.9677667056438824e-06,
"loss": 0.5179,
"step": 685
},
{
"epoch": 0.61,
"grad_norm": 1.7615090001622373,
"learning_rate": 3.964937037823337e-06,
"loss": 0.52,
"step": 686
},
{
"epoch": 0.61,
"grad_norm": 1.877979446527034,
"learning_rate": 3.962104508949018e-06,
"loss": 0.5611,
"step": 687
},
{
"epoch": 0.61,
"grad_norm": 1.8668900126580097,
"learning_rate": 3.9592691245529174e-06,
"loss": 0.5398,
"step": 688
},
{
"epoch": 0.61,
"grad_norm": 2.0467424748632395,
"learning_rate": 3.9564308901726016e-06,
"loss": 0.5429,
"step": 689
},
{
"epoch": 0.61,
"grad_norm": 1.7523480652481473,
"learning_rate": 3.9535898113512046e-06,
"loss": 0.5456,
"step": 690
},
{
"epoch": 0.61,
"grad_norm": 1.9384307177445268,
"learning_rate": 3.950745893637414e-06,
"loss": 0.5298,
"step": 691
},
{
"epoch": 0.61,
"grad_norm": 2.0200307543606266,
"learning_rate": 3.947899142585464e-06,
"loss": 0.5813,
"step": 692
},
{
"epoch": 0.61,
"grad_norm": 1.8825594318661294,
"learning_rate": 3.945049563755119e-06,
"loss": 0.5843,
"step": 693
},
{
"epoch": 0.61,
"grad_norm": 1.801304483173922,
"learning_rate": 3.94219716271167e-06,
"loss": 0.5332,
"step": 694
},
{
"epoch": 0.62,
"grad_norm": 1.789336412692842,
"learning_rate": 3.939341945025918e-06,
"loss": 0.5712,
"step": 695
},
{
"epoch": 0.62,
"grad_norm": 1.6764596672056864,
"learning_rate": 3.936483916274163e-06,
"loss": 0.5471,
"step": 696
},
{
"epoch": 0.62,
"grad_norm": 1.8160991340297739,
"learning_rate": 3.933623082038199e-06,
"loss": 0.5172,
"step": 697
},
{
"epoch": 0.62,
"grad_norm": 1.9958719154660882,
"learning_rate": 3.930759447905298e-06,
"loss": 0.5243,
"step": 698
},
{
"epoch": 0.62,
"grad_norm": 1.7844190098902166,
"learning_rate": 3.927893019468196e-06,
"loss": 0.5679,
"step": 699
},
{
"epoch": 0.62,
"grad_norm": 1.8231700761644845,
"learning_rate": 3.925023802325094e-06,
"loss": 0.5415,
"step": 700
},
{
"epoch": 0.62,
"grad_norm": 1.8577751348591511,
"learning_rate": 3.922151802079633e-06,
"loss": 0.5451,
"step": 701
},
{
"epoch": 0.62,
"grad_norm": 1.872268020286279,
"learning_rate": 3.919277024340891e-06,
"loss": 0.5805,
"step": 702
},
{
"epoch": 0.62,
"grad_norm": 1.956916033214976,
"learning_rate": 3.916399474723373e-06,
"loss": 0.5142,
"step": 703
},
{
"epoch": 0.62,
"grad_norm": 1.8690696320721123,
"learning_rate": 3.913519158846994e-06,
"loss": 0.5377,
"step": 704
},
{
"epoch": 0.62,
"grad_norm": 1.8932224298053513,
"learning_rate": 3.910636082337076e-06,
"loss": 0.5174,
"step": 705
},
{
"epoch": 0.63,
"grad_norm": 1.7671002724508906,
"learning_rate": 3.907750250824327e-06,
"loss": 0.5227,
"step": 706
},
{
"epoch": 0.63,
"grad_norm": 1.8537234882936333,
"learning_rate": 3.904861669944839e-06,
"loss": 0.5672,
"step": 707
},
{
"epoch": 0.63,
"grad_norm": 1.8993796687475375,
"learning_rate": 3.901970345340075e-06,
"loss": 0.5131,
"step": 708
},
{
"epoch": 0.63,
"grad_norm": 1.8118617206389966,
"learning_rate": 3.899076282656853e-06,
"loss": 0.5243,
"step": 709
},
{
"epoch": 0.63,
"grad_norm": 1.8195324114535576,
"learning_rate": 3.89617948754734e-06,
"loss": 0.5255,
"step": 710
},
{
"epoch": 0.63,
"grad_norm": 1.777076552111516,
"learning_rate": 3.89327996566904e-06,
"loss": 0.5482,
"step": 711
},
{
"epoch": 0.63,
"grad_norm": 1.7960584295638569,
"learning_rate": 3.890377722684782e-06,
"loss": 0.5232,
"step": 712
},
{
"epoch": 0.63,
"grad_norm": 2.0180517293259777,
"learning_rate": 3.887472764262709e-06,
"loss": 0.4988,
"step": 713
},
{
"epoch": 0.63,
"grad_norm": 1.7698597985590767,
"learning_rate": 3.884565096076269e-06,
"loss": 0.4934,
"step": 714
},
{
"epoch": 0.63,
"grad_norm": 1.9593013419554524,
"learning_rate": 3.8816547238042e-06,
"loss": 0.554,
"step": 715
},
{
"epoch": 0.63,
"grad_norm": 1.803176799671639,
"learning_rate": 3.878741653130521e-06,
"loss": 0.5058,
"step": 716
},
{
"epoch": 0.64,
"grad_norm": 1.8739139669777212,
"learning_rate": 3.875825889744525e-06,
"loss": 0.5291,
"step": 717
},
{
"epoch": 0.64,
"grad_norm": 1.7425957572489872,
"learning_rate": 3.872907439340758e-06,
"loss": 0.5132,
"step": 718
},
{
"epoch": 0.64,
"grad_norm": 1.7880023308134785,
"learning_rate": 3.86998630761902e-06,
"loss": 0.5388,
"step": 719
},
{
"epoch": 0.64,
"grad_norm": 2.035324802689225,
"learning_rate": 3.867062500284342e-06,
"loss": 0.5225,
"step": 720
},
{
"epoch": 0.64,
"grad_norm": 1.7720228048563502,
"learning_rate": 3.864136023046984e-06,
"loss": 0.5535,
"step": 721
},
{
"epoch": 0.64,
"grad_norm": 1.893636721431615,
"learning_rate": 3.861206881622419e-06,
"loss": 0.5445,
"step": 722
},
{
"epoch": 0.64,
"grad_norm": 1.9975882991420841,
"learning_rate": 3.8582750817313245e-06,
"loss": 0.498,
"step": 723
},
{
"epoch": 0.64,
"grad_norm": 1.8894358056153195,
"learning_rate": 3.855340629099568e-06,
"loss": 0.5262,
"step": 724
},
{
"epoch": 0.64,
"grad_norm": 1.8226831631189866,
"learning_rate": 3.852403529458199e-06,
"loss": 0.5289,
"step": 725
},
{
"epoch": 0.64,
"grad_norm": 1.9219589460322386,
"learning_rate": 3.84946378854344e-06,
"loss": 0.5828,
"step": 726
},
{
"epoch": 0.64,
"grad_norm": 1.9524000874112546,
"learning_rate": 3.846521412096665e-06,
"loss": 0.5755,
"step": 727
},
{
"epoch": 0.64,
"grad_norm": 1.7855988589662195,
"learning_rate": 3.8435764058643994e-06,
"loss": 0.508,
"step": 728
},
{
"epoch": 0.65,
"grad_norm": 1.7556968697529176,
"learning_rate": 3.840628775598306e-06,
"loss": 0.5038,
"step": 729
},
{
"epoch": 0.65,
"grad_norm": 1.8615629845007688,
"learning_rate": 3.837678527055168e-06,
"loss": 0.5658,
"step": 730
},
{
"epoch": 0.65,
"grad_norm": 3.355106616980178,
"learning_rate": 3.834725665996889e-06,
"loss": 0.6255,
"step": 731
},
{
"epoch": 0.65,
"grad_norm": 2.057901705133853,
"learning_rate": 3.8317701981904655e-06,
"loss": 0.5009,
"step": 732
},
{
"epoch": 0.65,
"grad_norm": 1.8144866213511652,
"learning_rate": 3.828812129407994e-06,
"loss": 0.5378,
"step": 733
},
{
"epoch": 0.65,
"grad_norm": 1.895740992214761,
"learning_rate": 3.825851465426643e-06,
"loss": 0.5414,
"step": 734
},
{
"epoch": 0.65,
"grad_norm": 1.7690202691648218,
"learning_rate": 3.822888212028658e-06,
"loss": 0.5782,
"step": 735
},
{
"epoch": 0.65,
"grad_norm": 1.9910212850942313,
"learning_rate": 3.819922375001334e-06,
"loss": 0.538,
"step": 736
},
{
"epoch": 0.65,
"grad_norm": 2.022977401775343,
"learning_rate": 3.816953960137017e-06,
"loss": 0.5265,
"step": 737
},
{
"epoch": 0.65,
"grad_norm": 2.18942238369997,
"learning_rate": 3.8139829732330833e-06,
"loss": 0.5419,
"step": 738
},
{
"epoch": 0.65,
"grad_norm": 2.0143145051916487,
"learning_rate": 3.8110094200919356e-06,
"loss": 0.5396,
"step": 739
},
{
"epoch": 0.66,
"grad_norm": 1.8684895296380082,
"learning_rate": 3.8080333065209885e-06,
"loss": 0.5285,
"step": 740
},
{
"epoch": 0.66,
"grad_norm": 1.899758991227905,
"learning_rate": 3.8050546383326546e-06,
"loss": 0.5392,
"step": 741
},
{
"epoch": 0.66,
"grad_norm": 1.7830347822365242,
"learning_rate": 3.8020734213443392e-06,
"loss": 0.5395,
"step": 742
},
{
"epoch": 0.66,
"grad_norm": 1.9688219937316351,
"learning_rate": 3.799089661378423e-06,
"loss": 0.5832,
"step": 743
},
{
"epoch": 0.66,
"grad_norm": 1.8380061964557934,
"learning_rate": 3.7961033642622536e-06,
"loss": 0.5182,
"step": 744
},
{
"epoch": 0.66,
"grad_norm": 1.9752769027783192,
"learning_rate": 3.793114535828134e-06,
"loss": 0.5189,
"step": 745
},
{
"epoch": 0.66,
"grad_norm": 1.9908258845677271,
"learning_rate": 3.7901231819133104e-06,
"loss": 0.5863,
"step": 746
},
{
"epoch": 0.66,
"grad_norm": 1.8419144313470388,
"learning_rate": 3.787129308359963e-06,
"loss": 0.5596,
"step": 747
},
{
"epoch": 0.66,
"grad_norm": 1.8578409208981632,
"learning_rate": 3.7841329210151905e-06,
"loss": 0.5757,
"step": 748
},
{
"epoch": 0.66,
"grad_norm": 1.8125362585272666,
"learning_rate": 3.7811340257310036e-06,
"loss": 0.5625,
"step": 749
},
{
"epoch": 0.66,
"grad_norm": 1.8266843142853604,
"learning_rate": 3.778132628364309e-06,
"loss": 0.5121,
"step": 750
},
{
"epoch": 0.67,
"grad_norm": 1.9286747700189457,
"learning_rate": 3.7751287347769006e-06,
"loss": 0.5856,
"step": 751
},
{
"epoch": 0.67,
"grad_norm": 1.8358169963837994,
"learning_rate": 3.772122350835447e-06,
"loss": 0.5363,
"step": 752
},
{
"epoch": 0.67,
"grad_norm": 1.8751145280860322,
"learning_rate": 3.769113482411483e-06,
"loss": 0.5435,
"step": 753
},
{
"epoch": 0.67,
"grad_norm": 1.7372022137266947,
"learning_rate": 3.766102135381393e-06,
"loss": 0.5114,
"step": 754
},
{
"epoch": 0.67,
"grad_norm": 1.848532567966691,
"learning_rate": 3.763088315626402e-06,
"loss": 0.4887,
"step": 755
},
{
"epoch": 0.67,
"grad_norm": 1.8724024281108291,
"learning_rate": 3.7600720290325666e-06,
"loss": 0.5681,
"step": 756
},
{
"epoch": 0.67,
"grad_norm": 1.7564274203136065,
"learning_rate": 3.757053281490759e-06,
"loss": 0.5365,
"step": 757
},
{
"epoch": 0.67,
"grad_norm": 1.7090468035537372,
"learning_rate": 3.75403207889666e-06,
"loss": 0.4976,
"step": 758
},
{
"epoch": 0.67,
"grad_norm": 1.8628034310476902,
"learning_rate": 3.7510084271507417e-06,
"loss": 0.5908,
"step": 759
},
{
"epoch": 0.67,
"grad_norm": 1.8673457440060792,
"learning_rate": 3.7479823321582624e-06,
"loss": 0.5641,
"step": 760
},
{
"epoch": 0.67,
"grad_norm": 1.8378062191959523,
"learning_rate": 3.744953799829252e-06,
"loss": 0.5175,
"step": 761
},
{
"epoch": 0.67,
"grad_norm": 1.779154712157358,
"learning_rate": 3.7419228360784987e-06,
"loss": 0.5539,
"step": 762
},
{
"epoch": 0.68,
"grad_norm": 2.1820639181555315,
"learning_rate": 3.73888944682554e-06,
"loss": 0.5247,
"step": 763
},
{
"epoch": 0.68,
"grad_norm": 1.927216958283792,
"learning_rate": 3.735853637994652e-06,
"loss": 0.5851,
"step": 764
},
{
"epoch": 0.68,
"grad_norm": 1.7670365768745326,
"learning_rate": 3.732815415514834e-06,
"loss": 0.5829,
"step": 765
},
{
"epoch": 0.68,
"grad_norm": 1.825202964363253,
"learning_rate": 3.729774785319801e-06,
"loss": 0.5257,
"step": 766
},
{
"epoch": 0.68,
"grad_norm": 1.8200852022234557,
"learning_rate": 3.72673175334797e-06,
"loss": 0.55,
"step": 767
},
{
"epoch": 0.68,
"grad_norm": 1.9436493930137209,
"learning_rate": 3.723686325542448e-06,
"loss": 0.5583,
"step": 768
},
{
"epoch": 0.68,
"grad_norm": 1.7581670709714554,
"learning_rate": 3.7206385078510204e-06,
"loss": 0.5267,
"step": 769
},
{
"epoch": 0.68,
"grad_norm": 1.9439324051591973,
"learning_rate": 3.717588306226143e-06,
"loss": 0.5686,
"step": 770
},
{
"epoch": 0.68,
"grad_norm": 1.8154349894294908,
"learning_rate": 3.7145357266249248e-06,
"loss": 0.5668,
"step": 771
},
{
"epoch": 0.68,
"grad_norm": 1.829602382975092,
"learning_rate": 3.7114807750091198e-06,
"loss": 0.5096,
"step": 772
},
{
"epoch": 0.68,
"grad_norm": 1.7902487805325054,
"learning_rate": 3.7084234573451145e-06,
"loss": 0.5387,
"step": 773
},
{
"epoch": 0.69,
"grad_norm": 1.7734778927084154,
"learning_rate": 3.7053637796039173e-06,
"loss": 0.5227,
"step": 774
},
{
"epoch": 0.69,
"grad_norm": 1.8359664701196194,
"learning_rate": 3.7023017477611444e-06,
"loss": 0.5183,
"step": 775
},
{
"epoch": 0.69,
"grad_norm": 2.049839823780983,
"learning_rate": 3.699237367797011e-06,
"loss": 0.5158,
"step": 776
},
{
"epoch": 0.69,
"grad_norm": 1.839740383172249,
"learning_rate": 3.6961706456963166e-06,
"loss": 0.509,
"step": 777
},
{
"epoch": 0.69,
"grad_norm": 1.7742492301936488,
"learning_rate": 3.693101587448436e-06,
"loss": 0.547,
"step": 778
},
{
"epoch": 0.69,
"grad_norm": 1.7626686489679533,
"learning_rate": 3.6900301990473074e-06,
"loss": 0.5501,
"step": 779
},
{
"epoch": 0.69,
"grad_norm": 1.81358040457354,
"learning_rate": 3.686956486491419e-06,
"loss": 0.5258,
"step": 780
},
{
"epoch": 0.69,
"grad_norm": 1.8446309626844912,
"learning_rate": 3.6838804557837972e-06,
"loss": 0.5438,
"step": 781
},
{
"epoch": 0.69,
"grad_norm": 1.8020540676799555,
"learning_rate": 3.680802112931996e-06,
"loss": 0.5333,
"step": 782
},
{
"epoch": 0.69,
"grad_norm": 1.8177001575706107,
"learning_rate": 3.677721463948087e-06,
"loss": 0.5194,
"step": 783
},
{
"epoch": 0.69,
"grad_norm": 1.7662648614084315,
"learning_rate": 3.6746385148486437e-06,
"loss": 0.5229,
"step": 784
},
{
"epoch": 0.7,
"grad_norm": 1.7914748738808024,
"learning_rate": 3.6715532716547325e-06,
"loss": 0.5443,
"step": 785
},
{
"epoch": 0.7,
"grad_norm": 1.6582914688424026,
"learning_rate": 3.6684657403919005e-06,
"loss": 0.4672,
"step": 786
},
{
"epoch": 0.7,
"grad_norm": 1.8779379042503213,
"learning_rate": 3.6653759270901634e-06,
"loss": 0.5361,
"step": 787
},
{
"epoch": 0.7,
"grad_norm": 1.843796012903189,
"learning_rate": 3.6622838377839927e-06,
"loss": 0.5903,
"step": 788
},
{
"epoch": 0.7,
"grad_norm": 1.7389903959091482,
"learning_rate": 3.6591894785123065e-06,
"loss": 0.5232,
"step": 789
},
{
"epoch": 0.7,
"grad_norm": 2.1531271375101912,
"learning_rate": 3.6560928553184556e-06,
"loss": 0.5811,
"step": 790
},
{
"epoch": 0.7,
"grad_norm": 1.8744519871212226,
"learning_rate": 3.6529939742502114e-06,
"loss": 0.5094,
"step": 791
},
{
"epoch": 0.7,
"grad_norm": 2.1796693544184405,
"learning_rate": 3.649892841359756e-06,
"loss": 0.5324,
"step": 792
},
{
"epoch": 0.7,
"grad_norm": 1.7983464824305884,
"learning_rate": 3.6467894627036697e-06,
"loss": 0.5406,
"step": 793
},
{
"epoch": 0.7,
"grad_norm": 1.8385213368207254,
"learning_rate": 3.6436838443429177e-06,
"loss": 0.5116,
"step": 794
},
{
"epoch": 0.7,
"grad_norm": 1.8303911353695022,
"learning_rate": 3.64057599234284e-06,
"loss": 0.5032,
"step": 795
},
{
"epoch": 0.71,
"grad_norm": 1.8212476470235475,
"learning_rate": 3.6374659127731394e-06,
"loss": 0.4977,
"step": 796
},
{
"epoch": 0.71,
"grad_norm": 1.788273127421183,
"learning_rate": 3.6343536117078674e-06,
"loss": 0.5132,
"step": 797
},
{
"epoch": 0.71,
"grad_norm": 1.792471501776643,
"learning_rate": 3.631239095225417e-06,
"loss": 0.6034,
"step": 798
},
{
"epoch": 0.71,
"grad_norm": 1.7180614128401976,
"learning_rate": 3.6281223694085055e-06,
"loss": 0.5125,
"step": 799
},
{
"epoch": 0.71,
"grad_norm": 1.968143388774121,
"learning_rate": 3.625003440344166e-06,
"loss": 0.5192,
"step": 800
},
{
"epoch": 0.71,
"grad_norm": 1.78698643398069,
"learning_rate": 3.6218823141237346e-06,
"loss": 0.5389,
"step": 801
},
{
"epoch": 0.71,
"grad_norm": 1.7360516235744345,
"learning_rate": 3.6187589968428388e-06,
"loss": 0.55,
"step": 802
},
{
"epoch": 0.71,
"grad_norm": 1.8926768947040113,
"learning_rate": 3.6156334946013844e-06,
"loss": 0.5402,
"step": 803
},
{
"epoch": 0.71,
"grad_norm": 1.7341073776764506,
"learning_rate": 3.612505813503545e-06,
"loss": 0.5156,
"step": 804
},
{
"epoch": 0.71,
"grad_norm": 1.9037532755321576,
"learning_rate": 3.6093759596577493e-06,
"loss": 0.5035,
"step": 805
},
{
"epoch": 0.71,
"grad_norm": 1.768026916515408,
"learning_rate": 3.60624393917667e-06,
"loss": 0.5317,
"step": 806
},
{
"epoch": 0.71,
"grad_norm": 1.8195441338851683,
"learning_rate": 3.6031097581772123e-06,
"loss": 0.5173,
"step": 807
},
{
"epoch": 0.72,
"grad_norm": 1.7451524294172138,
"learning_rate": 3.599973422780497e-06,
"loss": 0.5447,
"step": 808
},
{
"epoch": 0.72,
"grad_norm": 1.7994216931464604,
"learning_rate": 3.5968349391118573e-06,
"loss": 0.5468,
"step": 809
},
{
"epoch": 0.72,
"grad_norm": 1.7827289419599717,
"learning_rate": 3.5936943133008183e-06,
"loss": 0.5036,
"step": 810
},
{
"epoch": 0.72,
"grad_norm": 1.9123669337349365,
"learning_rate": 3.590551551481091e-06,
"loss": 0.4994,
"step": 811
},
{
"epoch": 0.72,
"grad_norm": 1.909151918011393,
"learning_rate": 3.5874066597905573e-06,
"loss": 0.5437,
"step": 812
},
{
"epoch": 0.72,
"grad_norm": 1.8314316190947115,
"learning_rate": 3.5842596443712586e-06,
"loss": 0.5327,
"step": 813
},
{
"epoch": 0.72,
"grad_norm": 2.025080353968657,
"learning_rate": 3.581110511369384e-06,
"loss": 0.5207,
"step": 814
},
{
"epoch": 0.72,
"grad_norm": 1.719830501688002,
"learning_rate": 3.5779592669352588e-06,
"loss": 0.5043,
"step": 815
},
{
"epoch": 0.72,
"grad_norm": 1.8856042934205883,
"learning_rate": 3.574805917223332e-06,
"loss": 0.534,
"step": 816
},
{
"epoch": 0.72,
"grad_norm": 1.8669902777268896,
"learning_rate": 3.5716504683921626e-06,
"loss": 0.5487,
"step": 817
},
{
"epoch": 0.72,
"grad_norm": 1.8420217203623648,
"learning_rate": 3.568492926604412e-06,
"loss": 0.4655,
"step": 818
},
{
"epoch": 0.73,
"grad_norm": 1.8587298766263622,
"learning_rate": 3.5653332980268267e-06,
"loss": 0.5308,
"step": 819
},
{
"epoch": 0.73,
"grad_norm": 1.8329162913986954,
"learning_rate": 3.562171588830231e-06,
"loss": 0.5061,
"step": 820
},
{
"epoch": 0.73,
"grad_norm": 1.7226245016695787,
"learning_rate": 3.5590078051895105e-06,
"loss": 0.5022,
"step": 821
},
{
"epoch": 0.73,
"grad_norm": 1.7947516408265423,
"learning_rate": 3.555841953283603e-06,
"loss": 0.5059,
"step": 822
},
{
"epoch": 0.73,
"grad_norm": 1.7754650010913384,
"learning_rate": 3.552674039295486e-06,
"loss": 0.5183,
"step": 823
},
{
"epoch": 0.73,
"grad_norm": 2.0058342412884267,
"learning_rate": 3.5495040694121644e-06,
"loss": 0.5717,
"step": 824
},
{
"epoch": 0.73,
"grad_norm": 1.8536876200790606,
"learning_rate": 3.546332049824659e-06,
"loss": 0.5445,
"step": 825
},
{
"epoch": 0.73,
"grad_norm": 1.9446394955278312,
"learning_rate": 3.543157986727991e-06,
"loss": 0.5778,
"step": 826
},
{
"epoch": 0.73,
"grad_norm": 1.7769561446293407,
"learning_rate": 3.5399818863211747e-06,
"loss": 0.5209,
"step": 827
},
{
"epoch": 0.73,
"grad_norm": 1.7847626696288204,
"learning_rate": 3.5368037548072042e-06,
"loss": 0.5684,
"step": 828
},
{
"epoch": 0.73,
"grad_norm": 1.856855628494933,
"learning_rate": 3.5336235983930383e-06,
"loss": 0.5277,
"step": 829
},
{
"epoch": 0.74,
"grad_norm": 1.799135122090622,
"learning_rate": 3.530441423289591e-06,
"loss": 0.53,
"step": 830
},
{
"epoch": 0.74,
"grad_norm": 1.7372348199564838,
"learning_rate": 3.5272572357117208e-06,
"loss": 0.5082,
"step": 831
},
{
"epoch": 0.74,
"grad_norm": 1.7713730143331359,
"learning_rate": 3.5240710418782137e-06,
"loss": 0.5127,
"step": 832
},
{
"epoch": 0.74,
"grad_norm": 1.808116845193293,
"learning_rate": 3.520882848011775e-06,
"loss": 0.5339,
"step": 833
},
{
"epoch": 0.74,
"grad_norm": 1.8168585745209507,
"learning_rate": 3.5176926603390176e-06,
"loss": 0.5773,
"step": 834
},
{
"epoch": 0.74,
"grad_norm": 1.8433472787266432,
"learning_rate": 3.514500485090446e-06,
"loss": 0.5446,
"step": 835
},
{
"epoch": 0.74,
"grad_norm": 1.7473743951502463,
"learning_rate": 3.511306328500449e-06,
"loss": 0.5182,
"step": 836
},
{
"epoch": 0.74,
"grad_norm": 1.9068925551475813,
"learning_rate": 3.5081101968072818e-06,
"loss": 0.5428,
"step": 837
},
{
"epoch": 0.74,
"grad_norm": 1.8621077674572017,
"learning_rate": 3.5049120962530608e-06,
"loss": 0.5783,
"step": 838
},
{
"epoch": 0.74,
"grad_norm": 1.8188442080835585,
"learning_rate": 3.501712033083744e-06,
"loss": 0.559,
"step": 839
},
{
"epoch": 0.74,
"grad_norm": 1.9008658249988244,
"learning_rate": 3.4985100135491245e-06,
"loss": 0.5322,
"step": 840
},
{
"epoch": 0.74,
"grad_norm": 1.8107617898563186,
"learning_rate": 3.495306043902817e-06,
"loss": 0.592,
"step": 841
},
{
"epoch": 0.75,
"grad_norm": 1.8972175021059394,
"learning_rate": 3.4921001304022422e-06,
"loss": 0.527,
"step": 842
},
{
"epoch": 0.75,
"grad_norm": 1.773730752308571,
"learning_rate": 3.4888922793086192e-06,
"loss": 0.5422,
"step": 843
},
{
"epoch": 0.75,
"grad_norm": 1.8207201600566427,
"learning_rate": 3.4856824968869506e-06,
"loss": 0.5463,
"step": 844
},
{
"epoch": 0.75,
"grad_norm": 1.7825701352278942,
"learning_rate": 3.4824707894060108e-06,
"loss": 0.5376,
"step": 845
},
{
"epoch": 0.75,
"grad_norm": 1.8186780308546509,
"learning_rate": 3.4792571631383345e-06,
"loss": 0.5448,
"step": 846
},
{
"epoch": 0.75,
"grad_norm": 1.7196535770637023,
"learning_rate": 3.4760416243602034e-06,
"loss": 0.5719,
"step": 847
},
{
"epoch": 0.75,
"grad_norm": 1.7996950762262636,
"learning_rate": 3.4728241793516345e-06,
"loss": 0.575,
"step": 848
},
{
"epoch": 0.75,
"grad_norm": 1.8460755337411012,
"learning_rate": 3.4696048343963667e-06,
"loss": 0.5303,
"step": 849
},
{
"epoch": 0.75,
"grad_norm": 1.8518850346827596,
"learning_rate": 3.4663835957818515e-06,
"loss": 0.5294,
"step": 850
},
{
"epoch": 0.75,
"grad_norm": 1.761477307422264,
"learning_rate": 3.463160469799237e-06,
"loss": 0.5303,
"step": 851
},
{
"epoch": 0.75,
"grad_norm": 1.8476905525063971,
"learning_rate": 3.459935462743359e-06,
"loss": 0.5365,
"step": 852
},
{
"epoch": 0.76,
"grad_norm": 1.7748738324934357,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.5581,
"step": 853
},
{
"epoch": 0.76,
"grad_norm": 1.69994493873254,
"learning_rate": 3.4534798306095054e-06,
"loss": 0.5142,
"step": 854
},
{
"epoch": 0.76,
"grad_norm": 1.7867273775159276,
"learning_rate": 3.45024921813952e-06,
"loss": 0.5397,
"step": 855
},
{
"epoch": 0.76,
"grad_norm": 1.8894059211718275,
"learning_rate": 3.4470167498122253e-06,
"loss": 0.5327,
"step": 856
},
{
"epoch": 0.76,
"grad_norm": 1.8759154191563252,
"learning_rate": 3.4437824319407003e-06,
"loss": 0.5091,
"step": 857
},
{
"epoch": 0.76,
"grad_norm": 1.7992806971923871,
"learning_rate": 3.4405462708416393e-06,
"loss": 0.5206,
"step": 858
},
{
"epoch": 0.76,
"grad_norm": 1.8238604800708562,
"learning_rate": 3.437308272835335e-06,
"loss": 0.5452,
"step": 859
},
{
"epoch": 0.76,
"grad_norm": 1.8504559231955047,
"learning_rate": 3.4340684442456673e-06,
"loss": 0.4953,
"step": 860
},
{
"epoch": 0.76,
"grad_norm": 1.754272242495459,
"learning_rate": 3.4308267914000915e-06,
"loss": 0.5897,
"step": 861
},
{
"epoch": 0.76,
"grad_norm": 1.8733571713304673,
"learning_rate": 3.427583320629626e-06,
"loss": 0.4897,
"step": 862
},
{
"epoch": 0.76,
"grad_norm": 1.8284259921968489,
"learning_rate": 3.4243380382688395e-06,
"loss": 0.5285,
"step": 863
},
{
"epoch": 0.77,
"grad_norm": 2.4115721951019933,
"learning_rate": 3.4210909506558383e-06,
"loss": 0.5327,
"step": 864
},
{
"epoch": 0.77,
"grad_norm": 1.827035801006768,
"learning_rate": 3.4178420641322564e-06,
"loss": 0.5959,
"step": 865
},
{
"epoch": 0.77,
"grad_norm": 1.7275971455556467,
"learning_rate": 3.414591385043237e-06,
"loss": 0.5378,
"step": 866
},
{
"epoch": 0.77,
"grad_norm": 1.8399392023051784,
"learning_rate": 3.411338919737429e-06,
"loss": 0.4737,
"step": 867
},
{
"epoch": 0.77,
"grad_norm": 1.768301025681768,
"learning_rate": 3.408084674566967e-06,
"loss": 0.5237,
"step": 868
},
{
"epoch": 0.77,
"grad_norm": 1.7940606795442973,
"learning_rate": 3.404828655887462e-06,
"loss": 0.5199,
"step": 869
},
{
"epoch": 0.77,
"grad_norm": 1.736302967715387,
"learning_rate": 3.4015708700579893e-06,
"loss": 0.5103,
"step": 870
},
{
"epoch": 0.77,
"grad_norm": 1.864705554020529,
"learning_rate": 3.398311323441075e-06,
"loss": 0.5456,
"step": 871
},
{
"epoch": 0.77,
"grad_norm": 1.7371337216784375,
"learning_rate": 3.3950500224026838e-06,
"loss": 0.54,
"step": 872
},
{
"epoch": 0.77,
"grad_norm": 1.7936602187941955,
"learning_rate": 3.3917869733122082e-06,
"loss": 0.5079,
"step": 873
},
{
"epoch": 0.77,
"grad_norm": 1.78627252413609,
"learning_rate": 3.3885221825424535e-06,
"loss": 0.5272,
"step": 874
},
{
"epoch": 0.78,
"grad_norm": 2.0255442379828588,
"learning_rate": 3.385255656469627e-06,
"loss": 0.5451,
"step": 875
},
{
"epoch": 0.78,
"grad_norm": 1.8151205951225127,
"learning_rate": 3.3819874014733245e-06,
"loss": 0.545,
"step": 876
},
{
"epoch": 0.78,
"grad_norm": 1.7644602173142565,
"learning_rate": 3.3787174239365183e-06,
"loss": 0.5021,
"step": 877
},
{
"epoch": 0.78,
"grad_norm": 1.88690726704404,
"learning_rate": 3.3754457302455464e-06,
"loss": 0.5518,
"step": 878
},
{
"epoch": 0.78,
"grad_norm": 1.9466161438131033,
"learning_rate": 3.372172326790097e-06,
"loss": 0.5499,
"step": 879
},
{
"epoch": 0.78,
"grad_norm": 1.7759200801637758,
"learning_rate": 3.3688972199631974e-06,
"loss": 0.5165,
"step": 880
},
{
"epoch": 0.78,
"grad_norm": 1.7404813059594972,
"learning_rate": 3.365620416161204e-06,
"loss": 0.4914,
"step": 881
},
{
"epoch": 0.78,
"grad_norm": 1.7186493344503415,
"learning_rate": 3.3623419217837836e-06,
"loss": 0.4742,
"step": 882
},
{
"epoch": 0.78,
"grad_norm": 1.688196680775216,
"learning_rate": 3.3590617432339077e-06,
"loss": 0.4973,
"step": 883
},
{
"epoch": 0.78,
"grad_norm": 1.9998510596311416,
"learning_rate": 3.355779886917836e-06,
"loss": 0.4844,
"step": 884
},
{
"epoch": 0.78,
"grad_norm": 1.9138346820930676,
"learning_rate": 3.3524963592451048e-06,
"loss": 0.5767,
"step": 885
},
{
"epoch": 0.78,
"grad_norm": 1.8240977441306703,
"learning_rate": 3.349211166628515e-06,
"loss": 0.5535,
"step": 886
},
{
"epoch": 0.79,
"grad_norm": 1.866188876988342,
"learning_rate": 3.3459243154841194e-06,
"loss": 0.5293,
"step": 887
},
{
"epoch": 0.79,
"grad_norm": 1.8428560106324356,
"learning_rate": 3.342635812231208e-06,
"loss": 0.5545,
"step": 888
},
{
"epoch": 0.79,
"grad_norm": 1.946339663223573,
"learning_rate": 3.3393456632922997e-06,
"loss": 0.5662,
"step": 889
},
{
"epoch": 0.79,
"grad_norm": 1.7835322668971936,
"learning_rate": 3.3360538750931277e-06,
"loss": 0.5343,
"step": 890
},
{
"epoch": 0.79,
"grad_norm": 1.8985737358987655,
"learning_rate": 3.3327604540626245e-06,
"loss": 0.4882,
"step": 891
},
{
"epoch": 0.79,
"grad_norm": 1.7452799601454962,
"learning_rate": 3.3294654066329125e-06,
"loss": 0.4847,
"step": 892
},
{
"epoch": 0.79,
"grad_norm": 1.8001237054125527,
"learning_rate": 3.3261687392392917e-06,
"loss": 0.5294,
"step": 893
},
{
"epoch": 0.79,
"grad_norm": 1.878202857326882,
"learning_rate": 3.3228704583202244e-06,
"loss": 0.5506,
"step": 894
},
{
"epoch": 0.79,
"grad_norm": 1.9555722164046163,
"learning_rate": 3.319570570317324e-06,
"loss": 0.5675,
"step": 895
},
{
"epoch": 0.79,
"grad_norm": 1.842178231242227,
"learning_rate": 3.316269081675345e-06,
"loss": 0.507,
"step": 896
},
{
"epoch": 0.79,
"grad_norm": 1.7925971037996111,
"learning_rate": 3.3129659988421646e-06,
"loss": 0.544,
"step": 897
},
{
"epoch": 0.8,
"grad_norm": 1.8448861762114805,
"learning_rate": 3.309661328268776e-06,
"loss": 0.5547,
"step": 898
},
{
"epoch": 0.8,
"grad_norm": 1.8798388041152536,
"learning_rate": 3.3063550764092722e-06,
"loss": 0.5535,
"step": 899
},
{
"epoch": 0.8,
"grad_norm": 2.111205651077239,
"learning_rate": 3.3030472497208354e-06,
"loss": 0.5372,
"step": 900
},
{
"epoch": 0.8,
"grad_norm": 1.9023950174091275,
"learning_rate": 3.2997378546637217e-06,
"loss": 0.5183,
"step": 901
},
{
"epoch": 0.8,
"grad_norm": 1.828168427249714,
"learning_rate": 3.296426897701251e-06,
"loss": 0.5139,
"step": 902
},
{
"epoch": 0.8,
"grad_norm": 1.752269482139502,
"learning_rate": 3.293114385299795e-06,
"loss": 0.4977,
"step": 903
},
{
"epoch": 0.8,
"grad_norm": 1.8319951115110833,
"learning_rate": 3.2898003239287626e-06,
"loss": 0.4762,
"step": 904
},
{
"epoch": 0.8,
"grad_norm": 1.9203452380089554,
"learning_rate": 3.2864847200605864e-06,
"loss": 0.5328,
"step": 905
},
{
"epoch": 0.8,
"grad_norm": 1.9603318007718882,
"learning_rate": 3.2831675801707126e-06,
"loss": 0.5114,
"step": 906
},
{
"epoch": 0.8,
"grad_norm": 1.772386222577394,
"learning_rate": 3.2798489107375875e-06,
"loss": 0.5365,
"step": 907
},
{
"epoch": 0.8,
"grad_norm": 1.7664388279000272,
"learning_rate": 3.2765287182426445e-06,
"loss": 0.5218,
"step": 908
},
{
"epoch": 0.81,
"grad_norm": 1.705238499414661,
"learning_rate": 3.2732070091702928e-06,
"loss": 0.515,
"step": 909
},
{
"epoch": 0.81,
"grad_norm": 1.8346490363510246,
"learning_rate": 3.2698837900078995e-06,
"loss": 0.5032,
"step": 910
},
{
"epoch": 0.81,
"grad_norm": 2.1169074366870504,
"learning_rate": 3.2665590672457853e-06,
"loss": 0.5463,
"step": 911
},
{
"epoch": 0.81,
"grad_norm": 1.9794978557420737,
"learning_rate": 3.263232847377205e-06,
"loss": 0.5556,
"step": 912
},
{
"epoch": 0.81,
"grad_norm": 1.8775372141713855,
"learning_rate": 3.2599051368983393e-06,
"loss": 0.5479,
"step": 913
},
{
"epoch": 0.81,
"grad_norm": 1.9608965084656977,
"learning_rate": 3.256575942308278e-06,
"loss": 0.4934,
"step": 914
},
{
"epoch": 0.81,
"grad_norm": 1.9035969324400404,
"learning_rate": 3.2532452701090107e-06,
"loss": 0.494,
"step": 915
},
{
"epoch": 0.81,
"grad_norm": 1.8348725792159002,
"learning_rate": 3.2499131268054114e-06,
"loss": 0.5101,
"step": 916
},
{
"epoch": 0.81,
"grad_norm": 1.837442323872043,
"learning_rate": 3.2465795189052283e-06,
"loss": 0.5028,
"step": 917
},
{
"epoch": 0.81,
"grad_norm": 2.0588580347681114,
"learning_rate": 3.2432444529190714e-06,
"loss": 0.5572,
"step": 918
},
{
"epoch": 0.81,
"grad_norm": 1.800197863385395,
"learning_rate": 3.2399079353603958e-06,
"loss": 0.5456,
"step": 919
},
{
"epoch": 0.81,
"grad_norm": 1.8642409261562531,
"learning_rate": 3.236569972745492e-06,
"loss": 0.4677,
"step": 920
},
{
"epoch": 0.82,
"grad_norm": 1.8605177191737032,
"learning_rate": 3.2332305715934735e-06,
"loss": 0.5086,
"step": 921
},
{
"epoch": 0.82,
"grad_norm": 1.8779408638935786,
"learning_rate": 3.229889738426264e-06,
"loss": 0.4576,
"step": 922
},
{
"epoch": 0.82,
"grad_norm": 1.8069917958596904,
"learning_rate": 3.226547479768582e-06,
"loss": 0.4847,
"step": 923
},
{
"epoch": 0.82,
"grad_norm": 1.949377976689351,
"learning_rate": 3.2232038021479317e-06,
"loss": 0.5095,
"step": 924
},
{
"epoch": 0.82,
"grad_norm": 1.9043326063097796,
"learning_rate": 3.2198587120945878e-06,
"loss": 0.5382,
"step": 925
},
{
"epoch": 0.82,
"grad_norm": 1.8420984644699558,
"learning_rate": 3.2165122161415844e-06,
"loss": 0.5354,
"step": 926
},
{
"epoch": 0.82,
"grad_norm": 1.9159042477860826,
"learning_rate": 3.2131643208246994e-06,
"loss": 0.5676,
"step": 927
},
{
"epoch": 0.82,
"grad_norm": 1.8091292349745058,
"learning_rate": 3.209815032682445e-06,
"loss": 0.5152,
"step": 928
},
{
"epoch": 0.82,
"grad_norm": 1.9172852365194688,
"learning_rate": 3.206464358256054e-06,
"loss": 0.4965,
"step": 929
},
{
"epoch": 0.82,
"grad_norm": 1.8611473653995623,
"learning_rate": 3.2031123040894658e-06,
"loss": 0.5222,
"step": 930
},
{
"epoch": 0.82,
"grad_norm": 2.0718827285873,
"learning_rate": 3.1997588767293146e-06,
"loss": 0.5512,
"step": 931
},
{
"epoch": 0.83,
"grad_norm": 1.8367854431958046,
"learning_rate": 3.196404082724918e-06,
"loss": 0.522,
"step": 932
},
{
"epoch": 0.83,
"grad_norm": 1.9326854247843166,
"learning_rate": 3.19304792862826e-06,
"loss": 0.5262,
"step": 933
},
{
"epoch": 0.83,
"grad_norm": 1.8127395054303974,
"learning_rate": 3.1896904209939827e-06,
"loss": 0.4792,
"step": 934
},
{
"epoch": 0.83,
"grad_norm": 1.7562676297882738,
"learning_rate": 3.1863315663793715e-06,
"loss": 0.5132,
"step": 935
},
{
"epoch": 0.83,
"grad_norm": 2.1115973982625826,
"learning_rate": 3.182971371344342e-06,
"loss": 0.5431,
"step": 936
},
{
"epoch": 0.83,
"grad_norm": 1.9125267865316575,
"learning_rate": 3.179609842451428e-06,
"loss": 0.5049,
"step": 937
},
{
"epoch": 0.83,
"grad_norm": 1.8084301603852846,
"learning_rate": 3.1762469862657673e-06,
"loss": 0.5057,
"step": 938
},
{
"epoch": 0.83,
"grad_norm": 1.979887599791109,
"learning_rate": 3.172882809355092e-06,
"loss": 0.5076,
"step": 939
},
{
"epoch": 0.83,
"grad_norm": 1.8023843851685244,
"learning_rate": 3.1695173182897126e-06,
"loss": 0.507,
"step": 940
},
{
"epoch": 0.83,
"grad_norm": 1.894018453771296,
"learning_rate": 3.166150519642506e-06,
"loss": 0.4892,
"step": 941
},
{
"epoch": 0.83,
"grad_norm": 2.085200027059979,
"learning_rate": 3.162782419988901e-06,
"loss": 0.5109,
"step": 942
},
{
"epoch": 0.84,
"grad_norm": 1.9145317338940404,
"learning_rate": 3.1594130259068723e-06,
"loss": 0.5597,
"step": 943
},
{
"epoch": 0.84,
"grad_norm": 2.6898725390450196,
"learning_rate": 3.1560423439769173e-06,
"loss": 0.5364,
"step": 944
},
{
"epoch": 0.84,
"grad_norm": 1.8953702977370355,
"learning_rate": 3.152670380782052e-06,
"loss": 0.5402,
"step": 945
},
{
"epoch": 0.84,
"grad_norm": 1.8989394358006901,
"learning_rate": 3.1492971429077924e-06,
"loss": 0.499,
"step": 946
},
{
"epoch": 0.84,
"grad_norm": 1.8295299154755171,
"learning_rate": 3.1459226369421465e-06,
"loss": 0.5133,
"step": 947
},
{
"epoch": 0.84,
"grad_norm": 1.9849867895935545,
"learning_rate": 3.1425468694755968e-06,
"loss": 0.5173,
"step": 948
},
{
"epoch": 0.84,
"grad_norm": 1.7806451045050948,
"learning_rate": 3.13916984710109e-06,
"loss": 0.5314,
"step": 949
},
{
"epoch": 0.84,
"grad_norm": 1.8227836319972825,
"learning_rate": 3.1357915764140247e-06,
"loss": 0.5413,
"step": 950
},
{
"epoch": 0.84,
"grad_norm": 1.873012898370893,
"learning_rate": 3.1324120640122362e-06,
"loss": 0.5582,
"step": 951
},
{
"epoch": 0.84,
"grad_norm": 1.7312834865810094,
"learning_rate": 3.129031316495986e-06,
"loss": 0.4969,
"step": 952
},
{
"epoch": 0.84,
"grad_norm": 1.850102780247153,
"learning_rate": 3.1256493404679468e-06,
"loss": 0.4981,
"step": 953
},
{
"epoch": 0.84,
"grad_norm": 1.85121227661343,
"learning_rate": 3.122266142533191e-06,
"loss": 0.4926,
"step": 954
},
{
"epoch": 0.85,
"grad_norm": 1.911516866472808,
"learning_rate": 3.118881729299178e-06,
"loss": 0.5141,
"step": 955
},
{
"epoch": 0.85,
"grad_norm": 1.9562838385609387,
"learning_rate": 3.1154961073757388e-06,
"loss": 0.5119,
"step": 956
},
{
"epoch": 0.85,
"grad_norm": 1.9792813407411627,
"learning_rate": 3.1121092833750684e-06,
"loss": 0.5379,
"step": 957
},
{
"epoch": 0.85,
"grad_norm": 2.02442320634539,
"learning_rate": 3.1087212639117057e-06,
"loss": 0.5516,
"step": 958
},
{
"epoch": 0.85,
"grad_norm": 1.9139240600717167,
"learning_rate": 3.1053320556025272e-06,
"loss": 0.5035,
"step": 959
},
{
"epoch": 0.85,
"grad_norm": 1.6820068229198286,
"learning_rate": 3.10194166506673e-06,
"loss": 0.5082,
"step": 960
},
{
"epoch": 0.85,
"grad_norm": 1.837945615423465,
"learning_rate": 3.098550098925819e-06,
"loss": 0.5301,
"step": 961
},
{
"epoch": 0.85,
"grad_norm": 1.8297516724958631,
"learning_rate": 3.095157363803598e-06,
"loss": 0.531,
"step": 962
},
{
"epoch": 0.85,
"grad_norm": 1.8057255627930757,
"learning_rate": 3.091763466326152e-06,
"loss": 0.4962,
"step": 963
},
{
"epoch": 0.85,
"grad_norm": 1.8568993199742134,
"learning_rate": 3.0883684131218356e-06,
"loss": 0.5555,
"step": 964
},
{
"epoch": 0.85,
"grad_norm": 1.7537389006494144,
"learning_rate": 3.084972210821261e-06,
"loss": 0.4783,
"step": 965
},
{
"epoch": 0.86,
"grad_norm": 1.936835841446932,
"learning_rate": 3.0815748660572856e-06,
"loss": 0.5696,
"step": 966
},
{
"epoch": 0.86,
"grad_norm": 1.818312553754802,
"learning_rate": 3.078176385464997e-06,
"loss": 0.5125,
"step": 967
},
{
"epoch": 0.86,
"grad_norm": 1.9098144545445246,
"learning_rate": 3.074776775681702e-06,
"loss": 0.5472,
"step": 968
},
{
"epoch": 0.86,
"grad_norm": 1.8530900425697827,
"learning_rate": 3.071376043346912e-06,
"loss": 0.5387,
"step": 969
},
{
"epoch": 0.86,
"grad_norm": 1.734080732564932,
"learning_rate": 3.0679741951023302e-06,
"loss": 0.5082,
"step": 970
},
{
"epoch": 0.86,
"grad_norm": 1.7157271380716255,
"learning_rate": 3.06457123759184e-06,
"loss": 0.5057,
"step": 971
},
{
"epoch": 0.86,
"grad_norm": 1.8615941154610314,
"learning_rate": 3.061167177461492e-06,
"loss": 0.5326,
"step": 972
},
{
"epoch": 0.86,
"grad_norm": 1.8820053895933144,
"learning_rate": 3.0577620213594888e-06,
"loss": 0.5446,
"step": 973
},
{
"epoch": 0.86,
"grad_norm": 1.8157963098312144,
"learning_rate": 3.0543557759361735e-06,
"loss": 0.5627,
"step": 974
},
{
"epoch": 0.86,
"grad_norm": 1.7642611841801312,
"learning_rate": 3.0509484478440187e-06,
"loss": 0.5062,
"step": 975
},
{
"epoch": 0.86,
"grad_norm": 1.6839843509551078,
"learning_rate": 3.047540043737609e-06,
"loss": 0.526,
"step": 976
},
{
"epoch": 0.87,
"grad_norm": 1.9004464286881788,
"learning_rate": 3.0441305702736314e-06,
"loss": 0.5617,
"step": 977
},
{
"epoch": 0.87,
"grad_norm": 1.9767954561122347,
"learning_rate": 3.0407200341108618e-06,
"loss": 0.5077,
"step": 978
},
{
"epoch": 0.87,
"grad_norm": 1.825193444039661,
"learning_rate": 3.0373084419101506e-06,
"loss": 0.5097,
"step": 979
},
{
"epoch": 0.87,
"grad_norm": 1.6810496770660706,
"learning_rate": 3.0338958003344115e-06,
"loss": 0.4993,
"step": 980
},
{
"epoch": 0.87,
"grad_norm": 1.7411591022211208,
"learning_rate": 3.0304821160486086e-06,
"loss": 0.4789,
"step": 981
},
{
"epoch": 0.87,
"grad_norm": 1.7580191857406102,
"learning_rate": 3.0270673957197393e-06,
"loss": 0.5225,
"step": 982
},
{
"epoch": 0.87,
"grad_norm": 1.7440391739784626,
"learning_rate": 3.023651646016828e-06,
"loss": 0.5281,
"step": 983
},
{
"epoch": 0.87,
"grad_norm": 1.8458326991098015,
"learning_rate": 3.0202348736109074e-06,
"loss": 0.5419,
"step": 984
},
{
"epoch": 0.87,
"grad_norm": 1.7105130101397825,
"learning_rate": 3.0168170851750077e-06,
"loss": 0.5113,
"step": 985
},
{
"epoch": 0.87,
"grad_norm": 1.74741112552671,
"learning_rate": 3.013398287384144e-06,
"loss": 0.5389,
"step": 986
},
{
"epoch": 0.87,
"grad_norm": 1.7962043049830843,
"learning_rate": 3.009978486915302e-06,
"loss": 0.5212,
"step": 987
},
{
"epoch": 0.88,
"grad_norm": 1.698744627764944,
"learning_rate": 3.006557690447427e-06,
"loss": 0.508,
"step": 988
},
{
"epoch": 0.88,
"grad_norm": 1.852219826000981,
"learning_rate": 3.0031359046614073e-06,
"loss": 0.5491,
"step": 989
},
{
"epoch": 0.88,
"grad_norm": 1.8471065567470235,
"learning_rate": 2.9997131362400666e-06,
"loss": 0.4937,
"step": 990
},
{
"epoch": 0.88,
"grad_norm": 1.7925416653935446,
"learning_rate": 2.996289391868144e-06,
"loss": 0.4691,
"step": 991
},
{
"epoch": 0.88,
"grad_norm": 1.8399091219230026,
"learning_rate": 2.9928646782322875e-06,
"loss": 0.5317,
"step": 992
},
{
"epoch": 0.88,
"grad_norm": 1.7232111222956334,
"learning_rate": 2.989439002021036e-06,
"loss": 0.5152,
"step": 993
},
{
"epoch": 0.88,
"grad_norm": 1.8514924128683583,
"learning_rate": 2.986012369924811e-06,
"loss": 0.573,
"step": 994
},
{
"epoch": 0.88,
"grad_norm": 1.9226274135737127,
"learning_rate": 2.982584788635897e-06,
"loss": 0.5168,
"step": 995
},
{
"epoch": 0.88,
"grad_norm": 1.7726209925124323,
"learning_rate": 2.979156264848437e-06,
"loss": 0.5157,
"step": 996
},
{
"epoch": 0.88,
"grad_norm": 1.9564580777074403,
"learning_rate": 2.9757268052584097e-06,
"loss": 0.5693,
"step": 997
},
{
"epoch": 0.88,
"grad_norm": 1.959067937570625,
"learning_rate": 2.9722964165636263e-06,
"loss": 0.5151,
"step": 998
},
{
"epoch": 0.88,
"grad_norm": 1.9170985685573452,
"learning_rate": 2.9688651054637086e-06,
"loss": 0.5944,
"step": 999
},
{
"epoch": 0.89,
"grad_norm": 1.8773917767941883,
"learning_rate": 2.9654328786600823e-06,
"loss": 0.5128,
"step": 1000
},
{
"epoch": 0.89,
"grad_norm": 1.8184350092212354,
"learning_rate": 2.96199974285596e-06,
"loss": 0.5052,
"step": 1001
},
{
"epoch": 0.89,
"grad_norm": 1.9785501833054495,
"learning_rate": 2.9585657047563314e-06,
"loss": 0.5794,
"step": 1002
},
{
"epoch": 0.89,
"grad_norm": 1.8252999737390432,
"learning_rate": 2.9551307710679467e-06,
"loss": 0.5657,
"step": 1003
},
{
"epoch": 0.89,
"grad_norm": 1.8050415950517775,
"learning_rate": 2.9516949484993055e-06,
"loss": 0.5054,
"step": 1004
},
{
"epoch": 0.89,
"grad_norm": 1.7751399822789855,
"learning_rate": 2.9482582437606445e-06,
"loss": 0.5025,
"step": 1005
},
{
"epoch": 0.89,
"grad_norm": 1.7388276457967873,
"learning_rate": 2.9448206635639213e-06,
"loss": 0.48,
"step": 1006
},
{
"epoch": 0.89,
"grad_norm": 1.9401107131003557,
"learning_rate": 2.941382214622806e-06,
"loss": 0.5503,
"step": 1007
},
{
"epoch": 0.89,
"grad_norm": 1.8055033222058048,
"learning_rate": 2.937942903652663e-06,
"loss": 0.5589,
"step": 1008
},
{
"epoch": 0.89,
"grad_norm": 1.8833337691151302,
"learning_rate": 2.93450273737054e-06,
"loss": 0.5395,
"step": 1009
},
{
"epoch": 0.89,
"grad_norm": 1.875491652961695,
"learning_rate": 2.9310617224951594e-06,
"loss": 0.5316,
"step": 1010
},
{
"epoch": 0.9,
"grad_norm": 1.801842376116382,
"learning_rate": 2.9276198657468947e-06,
"loss": 0.5369,
"step": 1011
},
{
"epoch": 0.9,
"grad_norm": 1.7434378005034878,
"learning_rate": 2.9241771738477686e-06,
"loss": 0.5345,
"step": 1012
},
{
"epoch": 0.9,
"grad_norm": 1.840106192598806,
"learning_rate": 2.920733653521432e-06,
"loss": 0.5391,
"step": 1013
},
{
"epoch": 0.9,
"grad_norm": 1.847462115860291,
"learning_rate": 2.917289311493155e-06,
"loss": 0.5176,
"step": 1014
},
{
"epoch": 0.9,
"grad_norm": 1.7647442625122556,
"learning_rate": 2.9138441544898123e-06,
"loss": 0.502,
"step": 1015
},
{
"epoch": 0.9,
"grad_norm": 1.7981764340288842,
"learning_rate": 2.9103981892398698e-06,
"loss": 0.5422,
"step": 1016
},
{
"epoch": 0.9,
"grad_norm": 1.8491619302175528,
"learning_rate": 2.9069514224733725e-06,
"loss": 0.4993,
"step": 1017
},
{
"epoch": 0.9,
"grad_norm": 1.8345458812848932,
"learning_rate": 2.903503860921931e-06,
"loss": 0.5322,
"step": 1018
},
{
"epoch": 0.9,
"grad_norm": 1.9341637425072102,
"learning_rate": 2.900055511318707e-06,
"loss": 0.5338,
"step": 1019
},
{
"epoch": 0.9,
"grad_norm": 1.8629389822642988,
"learning_rate": 2.896606380398402e-06,
"loss": 0.538,
"step": 1020
},
{
"epoch": 0.9,
"grad_norm": 1.8190212821977385,
"learning_rate": 2.8931564748972446e-06,
"loss": 0.5417,
"step": 1021
},
{
"epoch": 0.91,
"grad_norm": 1.808827329636345,
"learning_rate": 2.8897058015529734e-06,
"loss": 0.5142,
"step": 1022
},
{
"epoch": 0.91,
"grad_norm": 1.8593710361108637,
"learning_rate": 2.8862543671048288e-06,
"loss": 0.5148,
"step": 1023
},
{
"epoch": 0.91,
"grad_norm": 1.9421214945128942,
"learning_rate": 2.882802178293538e-06,
"loss": 0.5375,
"step": 1024
},
{
"epoch": 0.91,
"grad_norm": 1.8337412539689857,
"learning_rate": 2.879349241861299e-06,
"loss": 0.5179,
"step": 1025
},
{
"epoch": 0.91,
"grad_norm": 1.8368160375080673,
"learning_rate": 2.8758955645517724e-06,
"loss": 0.5404,
"step": 1026
},
{
"epoch": 0.91,
"grad_norm": 1.8549078592919745,
"learning_rate": 2.8724411531100642e-06,
"loss": 0.5668,
"step": 1027
},
{
"epoch": 0.91,
"grad_norm": 1.7783870646526379,
"learning_rate": 2.8689860142827153e-06,
"loss": 0.5556,
"step": 1028
},
{
"epoch": 0.91,
"grad_norm": 1.8409533014441846,
"learning_rate": 2.865530154817687e-06,
"loss": 0.4876,
"step": 1029
},
{
"epoch": 0.91,
"grad_norm": 1.786131185447664,
"learning_rate": 2.8620735814643467e-06,
"loss": 0.5503,
"step": 1030
},
{
"epoch": 0.91,
"grad_norm": 1.8105893579918746,
"learning_rate": 2.858616300973458e-06,
"loss": 0.4895,
"step": 1031
},
{
"epoch": 0.91,
"grad_norm": 1.8569751090297868,
"learning_rate": 2.8551583200971638e-06,
"loss": 0.5826,
"step": 1032
},
{
"epoch": 0.91,
"grad_norm": 1.9048981456653071,
"learning_rate": 2.8516996455889763e-06,
"loss": 0.5319,
"step": 1033
},
{
"epoch": 0.92,
"grad_norm": 1.7483385624776147,
"learning_rate": 2.8482402842037615e-06,
"loss": 0.4664,
"step": 1034
},
{
"epoch": 0.92,
"grad_norm": 1.9686504696650498,
"learning_rate": 2.844780242697727e-06,
"loss": 0.5459,
"step": 1035
},
{
"epoch": 0.92,
"grad_norm": 1.7882376390021062,
"learning_rate": 2.8413195278284084e-06,
"loss": 0.5272,
"step": 1036
},
{
"epoch": 0.92,
"grad_norm": 1.9426980086335028,
"learning_rate": 2.8378581463546578e-06,
"loss": 0.4785,
"step": 1037
},
{
"epoch": 0.92,
"grad_norm": 1.7027235807049006,
"learning_rate": 2.8343961050366275e-06,
"loss": 0.5295,
"step": 1038
},
{
"epoch": 0.92,
"grad_norm": 1.7896557139712275,
"learning_rate": 2.8309334106357606e-06,
"loss": 0.4917,
"step": 1039
},
{
"epoch": 0.92,
"grad_norm": 1.9559071265127668,
"learning_rate": 2.827470069914772e-06,
"loss": 0.4813,
"step": 1040
},
{
"epoch": 0.92,
"grad_norm": 1.8624058139739532,
"learning_rate": 2.8240060896376425e-06,
"loss": 0.5173,
"step": 1041
},
{
"epoch": 0.92,
"grad_norm": 1.8517426107017696,
"learning_rate": 2.8205414765696005e-06,
"loss": 0.5022,
"step": 1042
},
{
"epoch": 0.92,
"grad_norm": 1.8669678553165407,
"learning_rate": 2.817076237477111e-06,
"loss": 0.5153,
"step": 1043
},
{
"epoch": 0.92,
"grad_norm": 1.765641529260409,
"learning_rate": 2.8136103791278597e-06,
"loss": 0.5459,
"step": 1044
},
{
"epoch": 0.93,
"grad_norm": 1.7951199125059072,
"learning_rate": 2.8101439082907432e-06,
"loss": 0.5556,
"step": 1045
},
{
"epoch": 0.93,
"grad_norm": 1.809753719361155,
"learning_rate": 2.806676831735855e-06,
"loss": 0.5082,
"step": 1046
},
{
"epoch": 0.93,
"grad_norm": 2.022714382944429,
"learning_rate": 2.8032091562344704e-06,
"loss": 0.5079,
"step": 1047
},
{
"epoch": 0.93,
"grad_norm": 1.7851810141612734,
"learning_rate": 2.7997408885590355e-06,
"loss": 0.5044,
"step": 1048
},
{
"epoch": 0.93,
"grad_norm": 1.6942393347489977,
"learning_rate": 2.7962720354831507e-06,
"loss": 0.4845,
"step": 1049
},
{
"epoch": 0.93,
"grad_norm": 1.8154999007296455,
"learning_rate": 2.792802603781562e-06,
"loss": 0.5039,
"step": 1050
},
{
"epoch": 0.93,
"grad_norm": 1.909875578351421,
"learning_rate": 2.7893326002301446e-06,
"loss": 0.5081,
"step": 1051
},
{
"epoch": 0.93,
"grad_norm": 1.8216639768552991,
"learning_rate": 2.785862031605891e-06,
"loss": 0.5022,
"step": 1052
},
{
"epoch": 0.93,
"grad_norm": 1.968304394575798,
"learning_rate": 2.7823909046868957e-06,
"loss": 0.5217,
"step": 1053
},
{
"epoch": 0.93,
"grad_norm": 2.269471892811331,
"learning_rate": 2.778919226252346e-06,
"loss": 0.5526,
"step": 1054
},
{
"epoch": 0.93,
"grad_norm": 1.8562637541083824,
"learning_rate": 2.775447003082505e-06,
"loss": 0.5686,
"step": 1055
},
{
"epoch": 0.94,
"grad_norm": 2.2263991007598114,
"learning_rate": 2.7719742419586998e-06,
"loss": 0.5402,
"step": 1056
},
{
"epoch": 0.94,
"grad_norm": 1.6883177707586092,
"learning_rate": 2.7685009496633075e-06,
"loss": 0.5033,
"step": 1057
},
{
"epoch": 0.94,
"grad_norm": 1.8528118029727803,
"learning_rate": 2.765027132979743e-06,
"loss": 0.5544,
"step": 1058
},
{
"epoch": 0.94,
"grad_norm": 1.9477833558101318,
"learning_rate": 2.761552798692446e-06,
"loss": 0.5255,
"step": 1059
},
{
"epoch": 0.94,
"grad_norm": 1.8332232845916867,
"learning_rate": 2.7580779535868675e-06,
"loss": 0.5296,
"step": 1060
},
{
"epoch": 0.94,
"grad_norm": 1.7825492914819279,
"learning_rate": 2.754602604449454e-06,
"loss": 0.5071,
"step": 1061
},
{
"epoch": 0.94,
"grad_norm": 1.8629044416803393,
"learning_rate": 2.7511267580676382e-06,
"loss": 0.5242,
"step": 1062
},
{
"epoch": 0.94,
"grad_norm": 1.7488315230717792,
"learning_rate": 2.7476504212298233e-06,
"loss": 0.5252,
"step": 1063
},
{
"epoch": 0.94,
"grad_norm": 1.8087959812729764,
"learning_rate": 2.7441736007253705e-06,
"loss": 0.4935,
"step": 1064
},
{
"epoch": 0.94,
"grad_norm": 1.805794397151574,
"learning_rate": 2.740696303344585e-06,
"loss": 0.5819,
"step": 1065
},
{
"epoch": 0.94,
"grad_norm": 1.7460467388665153,
"learning_rate": 2.737218535878705e-06,
"loss": 0.5411,
"step": 1066
},
{
"epoch": 0.95,
"grad_norm": 1.9047016227341778,
"learning_rate": 2.7337403051198846e-06,
"loss": 0.4755,
"step": 1067
},
{
"epoch": 0.95,
"grad_norm": 1.839222519978661,
"learning_rate": 2.730261617861185e-06,
"loss": 0.4855,
"step": 1068
},
{
"epoch": 0.95,
"grad_norm": 1.862929087519683,
"learning_rate": 2.726782480896557e-06,
"loss": 0.5431,
"step": 1069
},
{
"epoch": 0.95,
"grad_norm": 1.7864340196228758,
"learning_rate": 2.723302901020831e-06,
"loss": 0.5108,
"step": 1070
},
{
"epoch": 0.95,
"grad_norm": 1.827547647278096,
"learning_rate": 2.719822885029701e-06,
"loss": 0.5029,
"step": 1071
},
{
"epoch": 0.95,
"grad_norm": 1.6624898067287452,
"learning_rate": 2.716342439719714e-06,
"loss": 0.4861,
"step": 1072
},
{
"epoch": 0.95,
"grad_norm": 1.8852871442731454,
"learning_rate": 2.7128615718882554e-06,
"loss": 0.5053,
"step": 1073
},
{
"epoch": 0.95,
"grad_norm": 1.9534449028119654,
"learning_rate": 2.7093802883335357e-06,
"loss": 0.5654,
"step": 1074
},
{
"epoch": 0.95,
"grad_norm": 1.7249582061537097,
"learning_rate": 2.7058985958545765e-06,
"loss": 0.5002,
"step": 1075
},
{
"epoch": 0.95,
"grad_norm": 1.7562844672053906,
"learning_rate": 2.702416501251199e-06,
"loss": 0.5436,
"step": 1076
},
{
"epoch": 0.95,
"grad_norm": 1.986901333170154,
"learning_rate": 2.6989340113240087e-06,
"loss": 0.527,
"step": 1077
},
{
"epoch": 0.95,
"grad_norm": 1.7825768030688796,
"learning_rate": 2.695451132874385e-06,
"loss": 0.525,
"step": 1078
},
{
"epoch": 0.96,
"grad_norm": 1.6606555374476397,
"learning_rate": 2.691967872704464e-06,
"loss": 0.476,
"step": 1079
},
{
"epoch": 0.96,
"grad_norm": 1.7825083810087277,
"learning_rate": 2.688484237617129e-06,
"loss": 0.477,
"step": 1080
},
{
"epoch": 0.96,
"grad_norm": 1.9686430333958531,
"learning_rate": 2.6850002344159943e-06,
"loss": 0.5434,
"step": 1081
},
{
"epoch": 0.96,
"grad_norm": 1.717636244450827,
"learning_rate": 2.6815158699053935e-06,
"loss": 0.5794,
"step": 1082
},
{
"epoch": 0.96,
"grad_norm": 1.704099898400831,
"learning_rate": 2.6780311508903673e-06,
"loss": 0.5107,
"step": 1083
},
{
"epoch": 0.96,
"grad_norm": 1.7521786436297433,
"learning_rate": 2.6745460841766456e-06,
"loss": 0.543,
"step": 1084
},
{
"epoch": 0.96,
"grad_norm": 1.8288817857074091,
"learning_rate": 2.67106067657064e-06,
"loss": 0.4888,
"step": 1085
},
{
"epoch": 0.96,
"grad_norm": 1.7606668312978737,
"learning_rate": 2.6675749348794273e-06,
"loss": 0.5438,
"step": 1086
},
{
"epoch": 0.96,
"grad_norm": 1.8974007350921405,
"learning_rate": 2.6640888659107355e-06,
"loss": 0.5103,
"step": 1087
},
{
"epoch": 0.96,
"grad_norm": 1.8366492024047152,
"learning_rate": 2.660602476472935e-06,
"loss": 0.5211,
"step": 1088
},
{
"epoch": 0.96,
"grad_norm": 1.813780419169426,
"learning_rate": 2.657115773375018e-06,
"loss": 0.4786,
"step": 1089
},
{
"epoch": 0.97,
"grad_norm": 1.7751633927736221,
"learning_rate": 2.6536287634265918e-06,
"loss": 0.5456,
"step": 1090
},
{
"epoch": 0.97,
"grad_norm": 1.8112706588151493,
"learning_rate": 2.6501414534378616e-06,
"loss": 0.536,
"step": 1091
},
{
"epoch": 0.97,
"grad_norm": 1.7362256266600447,
"learning_rate": 2.646653850219621e-06,
"loss": 0.5266,
"step": 1092
},
{
"epoch": 0.97,
"grad_norm": 1.7746963075577837,
"learning_rate": 2.643165960583233e-06,
"loss": 0.4845,
"step": 1093
},
{
"epoch": 0.97,
"grad_norm": 1.7798827281249407,
"learning_rate": 2.6396777913406228e-06,
"loss": 0.457,
"step": 1094
},
{
"epoch": 0.97,
"grad_norm": 1.801038100374856,
"learning_rate": 2.6361893493042594e-06,
"loss": 0.5093,
"step": 1095
},
{
"epoch": 0.97,
"grad_norm": 1.70079798226535,
"learning_rate": 2.632700641287147e-06,
"loss": 0.5093,
"step": 1096
},
{
"epoch": 0.97,
"grad_norm": 1.7688921140375633,
"learning_rate": 2.6292116741028073e-06,
"loss": 0.4999,
"step": 1097
},
{
"epoch": 0.97,
"grad_norm": 1.7535762670261703,
"learning_rate": 2.6257224545652688e-06,
"loss": 0.5292,
"step": 1098
},
{
"epoch": 0.97,
"grad_norm": 1.6511600588345345,
"learning_rate": 2.622232989489052e-06,
"loss": 0.5098,
"step": 1099
},
{
"epoch": 0.97,
"grad_norm": 1.8226418118893164,
"learning_rate": 2.6187432856891585e-06,
"loss": 0.4995,
"step": 1100
},
{
"epoch": 0.98,
"grad_norm": 1.7453251697809469,
"learning_rate": 2.6152533499810567e-06,
"loss": 0.5324,
"step": 1101
},
{
"epoch": 0.98,
"grad_norm": 3.854764123671763,
"learning_rate": 2.611763189180665e-06,
"loss": 0.55,
"step": 1102
},
{
"epoch": 0.98,
"grad_norm": 1.8822045822720739,
"learning_rate": 2.608272810104343e-06,
"loss": 0.4948,
"step": 1103
},
{
"epoch": 0.98,
"grad_norm": 1.7655712435117557,
"learning_rate": 2.6047822195688775e-06,
"loss": 0.5361,
"step": 1104
},
{
"epoch": 0.98,
"grad_norm": 1.7601014248725226,
"learning_rate": 2.6012914243914667e-06,
"loss": 0.4455,
"step": 1105
},
{
"epoch": 0.98,
"grad_norm": 1.7832576894792285,
"learning_rate": 2.5978004313897104e-06,
"loss": 0.5356,
"step": 1106
},
{
"epoch": 0.98,
"grad_norm": 1.7379093473470857,
"learning_rate": 2.5943092473815922e-06,
"loss": 0.4881,
"step": 1107
},
{
"epoch": 0.98,
"grad_norm": 1.9126265402481486,
"learning_rate": 2.590817879185471e-06,
"loss": 0.4768,
"step": 1108
},
{
"epoch": 0.98,
"grad_norm": 1.9554559921329315,
"learning_rate": 2.5873263336200636e-06,
"loss": 0.572,
"step": 1109
},
{
"epoch": 0.98,
"grad_norm": 1.7969024529150637,
"learning_rate": 2.5838346175044355e-06,
"loss": 0.4894,
"step": 1110
},
{
"epoch": 0.98,
"grad_norm": 1.7234439211809627,
"learning_rate": 2.5803427376579824e-06,
"loss": 0.4926,
"step": 1111
},
{
"epoch": 0.98,
"grad_norm": 1.9809564443971945,
"learning_rate": 2.5768507009004224e-06,
"loss": 0.5677,
"step": 1112
},
{
"epoch": 0.99,
"grad_norm": 1.7861662896762107,
"learning_rate": 2.573358514051779e-06,
"loss": 0.5283,
"step": 1113
},
{
"epoch": 0.99,
"grad_norm": 1.7627618988801406,
"learning_rate": 2.569866183932368e-06,
"loss": 0.5366,
"step": 1114
},
{
"epoch": 0.99,
"grad_norm": 1.870448428328825,
"learning_rate": 2.5663737173627863e-06,
"loss": 0.4864,
"step": 1115
},
{
"epoch": 0.99,
"grad_norm": 1.7537459242093265,
"learning_rate": 2.5628811211638967e-06,
"loss": 0.5091,
"step": 1116
},
{
"epoch": 0.99,
"grad_norm": 1.7948869258749416,
"learning_rate": 2.5593884021568143e-06,
"loss": 0.4851,
"step": 1117
},
{
"epoch": 0.99,
"grad_norm": 1.7948580243638423,
"learning_rate": 2.5558955671628964e-06,
"loss": 0.5038,
"step": 1118
},
{
"epoch": 0.99,
"grad_norm": 1.7071321162179267,
"learning_rate": 2.552402623003726e-06,
"loss": 0.5172,
"step": 1119
},
{
"epoch": 0.99,
"grad_norm": 1.9189581718497948,
"learning_rate": 2.548909576501096e-06,
"loss": 0.5421,
"step": 1120
},
{
"epoch": 0.99,
"grad_norm": 1.80649128798113,
"learning_rate": 2.5454164344770044e-06,
"loss": 0.5418,
"step": 1121
},
{
"epoch": 0.99,
"grad_norm": 1.7691828394369862,
"learning_rate": 2.5419232037536316e-06,
"loss": 0.5103,
"step": 1122
},
{
"epoch": 0.99,
"grad_norm": 1.7960539293917688,
"learning_rate": 2.5384298911533344e-06,
"loss": 0.5318,
"step": 1123
},
{
"epoch": 1.0,
"grad_norm": 1.8973140103512256,
"learning_rate": 2.5349365034986267e-06,
"loss": 0.5705,
"step": 1124
},
{
"epoch": 1.0,
"grad_norm": 1.821467182784847,
"learning_rate": 2.531443047612171e-06,
"loss": 0.5195,
"step": 1125
},
{
"epoch": 1.0,
"grad_norm": 1.951851802158412,
"learning_rate": 2.527949530316762e-06,
"loss": 0.5033,
"step": 1126
},
{
"epoch": 1.0,
"grad_norm": 1.866099609048635,
"learning_rate": 2.5244559584353146e-06,
"loss": 0.5482,
"step": 1127
},
{
"epoch": 1.0,
"grad_norm": 1.764005332766906,
"learning_rate": 2.520962338790851e-06,
"loss": 0.4973,
"step": 1128
},
{
"epoch": 1.0,
"grad_norm": 1.863408871226844,
"learning_rate": 2.517468678206485e-06,
"loss": 0.5249,
"step": 1129
},
{
"epoch": 1.0,
"grad_norm": 1.7459173200826197,
"learning_rate": 2.5139749835054123e-06,
"loss": 0.4845,
"step": 1130
},
{
"epoch": 1.0,
"grad_norm": 1.9910802418986058,
"learning_rate": 2.5104812615108943e-06,
"loss": 0.5702,
"step": 1131
}
],
"logging_steps": 1,
"max_steps": 2258,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 377,
"total_flos": 532701102735360.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}