QWEN2.5-32B-e2-adapter / trainer_state.json
FINGU-AI's picture
Upload folder using huggingface_hub
b3da1cf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4332965821389196,
"eval_steps": 500,
"global_step": 2600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005512679162072767,
"grad_norm": 12.100004196166992,
"learning_rate": 2.9999999999999997e-05,
"loss": 1.4741,
"step": 1
},
{
"epoch": 0.0011025358324145535,
"grad_norm": 11.976073265075684,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.4512,
"step": 2
},
{
"epoch": 0.0016538037486218302,
"grad_norm": 4.930200576782227,
"learning_rate": 8.999999999999999e-05,
"loss": 1.3853,
"step": 3
},
{
"epoch": 0.002205071664829107,
"grad_norm": 1.8625606298446655,
"learning_rate": 0.00011999999999999999,
"loss": 1.12,
"step": 4
},
{
"epoch": 0.0027563395810363835,
"grad_norm": 1.4577418565750122,
"learning_rate": 0.00015,
"loss": 1.005,
"step": 5
},
{
"epoch": 0.0033076074972436605,
"grad_norm": 1.1385219097137451,
"learning_rate": 0.00017999999999999998,
"loss": 0.8992,
"step": 6
},
{
"epoch": 0.003858875413450937,
"grad_norm": 1.5815627574920654,
"learning_rate": 0.00020999999999999998,
"loss": 0.815,
"step": 7
},
{
"epoch": 0.004410143329658214,
"grad_norm": 0.6205328702926636,
"learning_rate": 0.00023999999999999998,
"loss": 0.7967,
"step": 8
},
{
"epoch": 0.004961411245865491,
"grad_norm": 1.6408820152282715,
"learning_rate": 0.00027,
"loss": 0.7702,
"step": 9
},
{
"epoch": 0.005512679162072767,
"grad_norm": 0.8569570183753967,
"learning_rate": 0.0003,
"loss": 0.7845,
"step": 10
},
{
"epoch": 0.006063947078280044,
"grad_norm": 0.67384272813797,
"learning_rate": 0.0002999170812603648,
"loss": 0.7192,
"step": 11
},
{
"epoch": 0.006615214994487321,
"grad_norm": 2.0132830142974854,
"learning_rate": 0.00029983416252072964,
"loss": 0.7354,
"step": 12
},
{
"epoch": 0.007166482910694598,
"grad_norm": 0.6772907972335815,
"learning_rate": 0.0002997512437810945,
"loss": 0.715,
"step": 13
},
{
"epoch": 0.007717750826901874,
"grad_norm": 0.5798671245574951,
"learning_rate": 0.00029966832504145936,
"loss": 0.7477,
"step": 14
},
{
"epoch": 0.008269018743109152,
"grad_norm": 0.49168965220451355,
"learning_rate": 0.00029958540630182416,
"loss": 0.713,
"step": 15
},
{
"epoch": 0.008820286659316428,
"grad_norm": 0.478697806596756,
"learning_rate": 0.000299502487562189,
"loss": 0.6915,
"step": 16
},
{
"epoch": 0.009371554575523704,
"grad_norm": 0.4884359538555145,
"learning_rate": 0.0002994195688225539,
"loss": 0.7305,
"step": 17
},
{
"epoch": 0.009922822491730982,
"grad_norm": 0.4691940248012543,
"learning_rate": 0.00029933665008291874,
"loss": 0.6646,
"step": 18
},
{
"epoch": 0.010474090407938258,
"grad_norm": 0.4946594834327698,
"learning_rate": 0.00029925373134328354,
"loss": 0.7137,
"step": 19
},
{
"epoch": 0.011025358324145534,
"grad_norm": 0.4412364363670349,
"learning_rate": 0.0002991708126036484,
"loss": 0.7063,
"step": 20
},
{
"epoch": 0.011576626240352812,
"grad_norm": 0.5092226266860962,
"learning_rate": 0.0002990878938640132,
"loss": 0.684,
"step": 21
},
{
"epoch": 0.012127894156560088,
"grad_norm": 0.45330244302749634,
"learning_rate": 0.00029900497512437807,
"loss": 0.6677,
"step": 22
},
{
"epoch": 0.012679162072767364,
"grad_norm": 0.4717816710472107,
"learning_rate": 0.0002989220563847429,
"loss": 0.6898,
"step": 23
},
{
"epoch": 0.013230429988974642,
"grad_norm": 0.41348159313201904,
"learning_rate": 0.0002988391376451078,
"loss": 0.6735,
"step": 24
},
{
"epoch": 0.013781697905181918,
"grad_norm": 0.44471853971481323,
"learning_rate": 0.0002987562189054726,
"loss": 0.6732,
"step": 25
},
{
"epoch": 0.014332965821389196,
"grad_norm": 0.44660595059394836,
"learning_rate": 0.00029867330016583745,
"loss": 0.7058,
"step": 26
},
{
"epoch": 0.014884233737596472,
"grad_norm": 0.3917936086654663,
"learning_rate": 0.0002985903814262023,
"loss": 0.6486,
"step": 27
},
{
"epoch": 0.015435501653803748,
"grad_norm": 0.3844316899776459,
"learning_rate": 0.00029850746268656717,
"loss": 0.6726,
"step": 28
},
{
"epoch": 0.015986769570011026,
"grad_norm": 0.38220199942588806,
"learning_rate": 0.00029842454394693197,
"loss": 0.6835,
"step": 29
},
{
"epoch": 0.016538037486218304,
"grad_norm": 0.3823130428791046,
"learning_rate": 0.00029834162520729683,
"loss": 0.6818,
"step": 30
},
{
"epoch": 0.017089305402425578,
"grad_norm": 0.3354315161705017,
"learning_rate": 0.00029825870646766164,
"loss": 0.6421,
"step": 31
},
{
"epoch": 0.017640573318632856,
"grad_norm": 0.3261851966381073,
"learning_rate": 0.0002981757877280265,
"loss": 0.6254,
"step": 32
},
{
"epoch": 0.018191841234840134,
"grad_norm": 0.3275938928127289,
"learning_rate": 0.00029809286898839135,
"loss": 0.6529,
"step": 33
},
{
"epoch": 0.018743109151047408,
"grad_norm": 0.3375149667263031,
"learning_rate": 0.0002980099502487562,
"loss": 0.664,
"step": 34
},
{
"epoch": 0.019294377067254686,
"grad_norm": 0.33320432901382446,
"learning_rate": 0.000297927031509121,
"loss": 0.6157,
"step": 35
},
{
"epoch": 0.019845644983461964,
"grad_norm": 0.30827271938323975,
"learning_rate": 0.0002978441127694859,
"loss": 0.6418,
"step": 36
},
{
"epoch": 0.020396912899669238,
"grad_norm": 0.3377619683742523,
"learning_rate": 0.00029776119402985074,
"loss": 0.6454,
"step": 37
},
{
"epoch": 0.020948180815876516,
"grad_norm": 0.32735955715179443,
"learning_rate": 0.0002976782752902156,
"loss": 0.632,
"step": 38
},
{
"epoch": 0.021499448732083794,
"grad_norm": 0.37884464859962463,
"learning_rate": 0.0002975953565505804,
"loss": 0.6223,
"step": 39
},
{
"epoch": 0.022050716648291068,
"grad_norm": 0.3301836848258972,
"learning_rate": 0.00029751243781094526,
"loss": 0.6654,
"step": 40
},
{
"epoch": 0.022601984564498346,
"grad_norm": 0.3196747303009033,
"learning_rate": 0.00029742951907131006,
"loss": 0.6445,
"step": 41
},
{
"epoch": 0.023153252480705624,
"grad_norm": 0.3292658030986786,
"learning_rate": 0.0002973466003316749,
"loss": 0.6271,
"step": 42
},
{
"epoch": 0.023704520396912898,
"grad_norm": 0.32541969418525696,
"learning_rate": 0.0002972636815920398,
"loss": 0.6217,
"step": 43
},
{
"epoch": 0.024255788313120176,
"grad_norm": 0.3059806823730469,
"learning_rate": 0.00029718076285240464,
"loss": 0.6029,
"step": 44
},
{
"epoch": 0.024807056229327454,
"grad_norm": 0.3427717983722687,
"learning_rate": 0.00029709784411276945,
"loss": 0.6523,
"step": 45
},
{
"epoch": 0.025358324145534728,
"grad_norm": 0.33184289932250977,
"learning_rate": 0.0002970149253731343,
"loss": 0.6475,
"step": 46
},
{
"epoch": 0.025909592061742006,
"grad_norm": 0.32376739382743835,
"learning_rate": 0.00029693200663349917,
"loss": 0.6588,
"step": 47
},
{
"epoch": 0.026460859977949284,
"grad_norm": 0.30022457242012024,
"learning_rate": 0.000296849087893864,
"loss": 0.6316,
"step": 48
},
{
"epoch": 0.02701212789415656,
"grad_norm": 0.3170008957386017,
"learning_rate": 0.00029676616915422883,
"loss": 0.5847,
"step": 49
},
{
"epoch": 0.027563395810363836,
"grad_norm": 0.3455023765563965,
"learning_rate": 0.0002966832504145937,
"loss": 0.6668,
"step": 50
},
{
"epoch": 0.028114663726571114,
"grad_norm": 0.3004387617111206,
"learning_rate": 0.0002966003316749585,
"loss": 0.6599,
"step": 51
},
{
"epoch": 0.02866593164277839,
"grad_norm": 0.33361348509788513,
"learning_rate": 0.00029651741293532335,
"loss": 0.6502,
"step": 52
},
{
"epoch": 0.029217199558985666,
"grad_norm": 0.34541115164756775,
"learning_rate": 0.0002964344941956882,
"loss": 0.6764,
"step": 53
},
{
"epoch": 0.029768467475192944,
"grad_norm": 0.32801833748817444,
"learning_rate": 0.00029635157545605307,
"loss": 0.6347,
"step": 54
},
{
"epoch": 0.03031973539140022,
"grad_norm": 0.30410563945770264,
"learning_rate": 0.0002962686567164179,
"loss": 0.6117,
"step": 55
},
{
"epoch": 0.030871003307607496,
"grad_norm": 0.31390225887298584,
"learning_rate": 0.00029618573797678274,
"loss": 0.5973,
"step": 56
},
{
"epoch": 0.031422271223814774,
"grad_norm": 0.34744319319725037,
"learning_rate": 0.0002961028192371476,
"loss": 0.6544,
"step": 57
},
{
"epoch": 0.03197353914002205,
"grad_norm": 0.3452775180339813,
"learning_rate": 0.0002960199004975124,
"loss": 0.6234,
"step": 58
},
{
"epoch": 0.03252480705622933,
"grad_norm": 0.34327036142349243,
"learning_rate": 0.00029593698175787726,
"loss": 0.6485,
"step": 59
},
{
"epoch": 0.03307607497243661,
"grad_norm": 0.317579448223114,
"learning_rate": 0.00029585406301824206,
"loss": 0.6182,
"step": 60
},
{
"epoch": 0.03362734288864388,
"grad_norm": 0.3586544692516327,
"learning_rate": 0.0002957711442786069,
"loss": 0.6149,
"step": 61
},
{
"epoch": 0.034178610804851156,
"grad_norm": 0.3077372908592224,
"learning_rate": 0.0002956882255389718,
"loss": 0.5806,
"step": 62
},
{
"epoch": 0.034729878721058434,
"grad_norm": 0.33191806077957153,
"learning_rate": 0.00029560530679933664,
"loss": 0.631,
"step": 63
},
{
"epoch": 0.03528114663726571,
"grad_norm": 0.32726630568504333,
"learning_rate": 0.00029552238805970145,
"loss": 0.6364,
"step": 64
},
{
"epoch": 0.03583241455347299,
"grad_norm": 0.3058015704154968,
"learning_rate": 0.0002954394693200663,
"loss": 0.6193,
"step": 65
},
{
"epoch": 0.03638368246968027,
"grad_norm": 0.30789121985435486,
"learning_rate": 0.00029535655058043116,
"loss": 0.6322,
"step": 66
},
{
"epoch": 0.03693495038588754,
"grad_norm": 0.33515268564224243,
"learning_rate": 0.000295273631840796,
"loss": 0.6581,
"step": 67
},
{
"epoch": 0.037486218302094816,
"grad_norm": 0.3196898400783539,
"learning_rate": 0.00029519071310116083,
"loss": 0.6134,
"step": 68
},
{
"epoch": 0.038037486218302094,
"grad_norm": 0.3255867660045624,
"learning_rate": 0.0002951077943615257,
"loss": 0.6176,
"step": 69
},
{
"epoch": 0.03858875413450937,
"grad_norm": 0.3257988691329956,
"learning_rate": 0.0002950248756218905,
"loss": 0.6214,
"step": 70
},
{
"epoch": 0.03914002205071665,
"grad_norm": 0.29037123918533325,
"learning_rate": 0.00029494195688225535,
"loss": 0.6098,
"step": 71
},
{
"epoch": 0.03969128996692393,
"grad_norm": 0.3127928674221039,
"learning_rate": 0.0002948590381426202,
"loss": 0.6532,
"step": 72
},
{
"epoch": 0.0402425578831312,
"grad_norm": 0.2821784019470215,
"learning_rate": 0.00029477611940298507,
"loss": 0.6101,
"step": 73
},
{
"epoch": 0.040793825799338476,
"grad_norm": 0.2889716923236847,
"learning_rate": 0.0002946932006633499,
"loss": 0.6097,
"step": 74
},
{
"epoch": 0.041345093715545754,
"grad_norm": 0.3002908527851105,
"learning_rate": 0.00029461028192371473,
"loss": 0.626,
"step": 75
},
{
"epoch": 0.04189636163175303,
"grad_norm": 0.2943056523799896,
"learning_rate": 0.0002945273631840796,
"loss": 0.6061,
"step": 76
},
{
"epoch": 0.04244762954796031,
"grad_norm": 0.31590160727500916,
"learning_rate": 0.00029444444444444445,
"loss": 0.6279,
"step": 77
},
{
"epoch": 0.04299889746416759,
"grad_norm": 0.31002211570739746,
"learning_rate": 0.00029436152570480926,
"loss": 0.6066,
"step": 78
},
{
"epoch": 0.043550165380374865,
"grad_norm": 0.27883172035217285,
"learning_rate": 0.0002942786069651741,
"loss": 0.6053,
"step": 79
},
{
"epoch": 0.044101433296582136,
"grad_norm": 0.3098636567592621,
"learning_rate": 0.0002941956882255389,
"loss": 0.6041,
"step": 80
},
{
"epoch": 0.044652701212789414,
"grad_norm": 0.31574317812919617,
"learning_rate": 0.0002941127694859038,
"loss": 0.6132,
"step": 81
},
{
"epoch": 0.04520396912899669,
"grad_norm": 0.2871106266975403,
"learning_rate": 0.00029402985074626864,
"loss": 0.5759,
"step": 82
},
{
"epoch": 0.04575523704520397,
"grad_norm": 0.2808583676815033,
"learning_rate": 0.0002939469320066335,
"loss": 0.583,
"step": 83
},
{
"epoch": 0.04630650496141125,
"grad_norm": 0.29489415884017944,
"learning_rate": 0.0002938640132669983,
"loss": 0.6018,
"step": 84
},
{
"epoch": 0.046857772877618525,
"grad_norm": 0.28468286991119385,
"learning_rate": 0.00029378109452736316,
"loss": 0.602,
"step": 85
},
{
"epoch": 0.047409040793825796,
"grad_norm": 0.28690364956855774,
"learning_rate": 0.000293698175787728,
"loss": 0.5802,
"step": 86
},
{
"epoch": 0.047960308710033074,
"grad_norm": 0.30015993118286133,
"learning_rate": 0.0002936152570480929,
"loss": 0.5889,
"step": 87
},
{
"epoch": 0.04851157662624035,
"grad_norm": 0.3080478310585022,
"learning_rate": 0.0002935323383084577,
"loss": 0.6106,
"step": 88
},
{
"epoch": 0.04906284454244763,
"grad_norm": 0.2852279245853424,
"learning_rate": 0.00029344941956882254,
"loss": 0.5902,
"step": 89
},
{
"epoch": 0.04961411245865491,
"grad_norm": 0.2944631278514862,
"learning_rate": 0.00029336650082918735,
"loss": 0.6222,
"step": 90
},
{
"epoch": 0.050165380374862185,
"grad_norm": 0.29476436972618103,
"learning_rate": 0.0002932835820895522,
"loss": 0.6151,
"step": 91
},
{
"epoch": 0.050716648291069456,
"grad_norm": 0.2786809802055359,
"learning_rate": 0.00029320066334991707,
"loss": 0.5801,
"step": 92
},
{
"epoch": 0.051267916207276734,
"grad_norm": 0.27844133973121643,
"learning_rate": 0.0002931177446102819,
"loss": 0.5708,
"step": 93
},
{
"epoch": 0.05181918412348401,
"grad_norm": 0.2947113811969757,
"learning_rate": 0.00029303482587064673,
"loss": 0.5951,
"step": 94
},
{
"epoch": 0.05237045203969129,
"grad_norm": 0.2926524877548218,
"learning_rate": 0.0002929519071310116,
"loss": 0.6281,
"step": 95
},
{
"epoch": 0.05292171995589857,
"grad_norm": 0.27508488297462463,
"learning_rate": 0.00029286898839137645,
"loss": 0.5769,
"step": 96
},
{
"epoch": 0.053472987872105845,
"grad_norm": 0.2983228862285614,
"learning_rate": 0.0002927860696517413,
"loss": 0.5808,
"step": 97
},
{
"epoch": 0.05402425578831312,
"grad_norm": 0.28955212235450745,
"learning_rate": 0.0002927031509121061,
"loss": 0.6009,
"step": 98
},
{
"epoch": 0.054575523704520394,
"grad_norm": 0.30267390608787537,
"learning_rate": 0.0002926202321724709,
"loss": 0.5938,
"step": 99
},
{
"epoch": 0.05512679162072767,
"grad_norm": 0.2869952917098999,
"learning_rate": 0.0002925373134328358,
"loss": 0.5695,
"step": 100
},
{
"epoch": 0.05567805953693495,
"grad_norm": 0.28908076882362366,
"learning_rate": 0.00029245439469320064,
"loss": 0.5904,
"step": 101
},
{
"epoch": 0.05622932745314223,
"grad_norm": 0.2866143584251404,
"learning_rate": 0.0002923714759535655,
"loss": 0.5945,
"step": 102
},
{
"epoch": 0.056780595369349506,
"grad_norm": 0.2788505554199219,
"learning_rate": 0.0002922885572139303,
"loss": 0.5861,
"step": 103
},
{
"epoch": 0.05733186328555678,
"grad_norm": 0.2852947413921356,
"learning_rate": 0.00029220563847429516,
"loss": 0.6012,
"step": 104
},
{
"epoch": 0.057883131201764054,
"grad_norm": 0.27692896127700806,
"learning_rate": 0.00029212271973466,
"loss": 0.5797,
"step": 105
},
{
"epoch": 0.05843439911797133,
"grad_norm": 0.27395880222320557,
"learning_rate": 0.0002920398009950249,
"loss": 0.5854,
"step": 106
},
{
"epoch": 0.05898566703417861,
"grad_norm": 0.2730069160461426,
"learning_rate": 0.0002919568822553897,
"loss": 0.5882,
"step": 107
},
{
"epoch": 0.05953693495038589,
"grad_norm": 0.2808207869529724,
"learning_rate": 0.00029187396351575454,
"loss": 0.5868,
"step": 108
},
{
"epoch": 0.060088202866593166,
"grad_norm": 0.26693934202194214,
"learning_rate": 0.00029179104477611935,
"loss": 0.5656,
"step": 109
},
{
"epoch": 0.06063947078280044,
"grad_norm": 0.29277607798576355,
"learning_rate": 0.0002917081260364842,
"loss": 0.608,
"step": 110
},
{
"epoch": 0.061190738699007714,
"grad_norm": 0.29922837018966675,
"learning_rate": 0.00029162520729684907,
"loss": 0.5952,
"step": 111
},
{
"epoch": 0.06174200661521499,
"grad_norm": 0.26753753423690796,
"learning_rate": 0.0002915422885572139,
"loss": 0.5964,
"step": 112
},
{
"epoch": 0.06229327453142227,
"grad_norm": 0.2910638451576233,
"learning_rate": 0.00029145936981757873,
"loss": 0.5822,
"step": 113
},
{
"epoch": 0.06284454244762955,
"grad_norm": 0.3202199339866638,
"learning_rate": 0.0002913764510779436,
"loss": 0.5927,
"step": 114
},
{
"epoch": 0.06339581036383682,
"grad_norm": 0.26713207364082336,
"learning_rate": 0.00029129353233830845,
"loss": 0.5698,
"step": 115
},
{
"epoch": 0.0639470782800441,
"grad_norm": 0.3109968304634094,
"learning_rate": 0.0002912106135986733,
"loss": 0.5954,
"step": 116
},
{
"epoch": 0.06449834619625137,
"grad_norm": 0.30233150720596313,
"learning_rate": 0.0002911276948590381,
"loss": 0.5941,
"step": 117
},
{
"epoch": 0.06504961411245866,
"grad_norm": 0.28545138239860535,
"learning_rate": 0.00029104477611940297,
"loss": 0.5773,
"step": 118
},
{
"epoch": 0.06560088202866593,
"grad_norm": 0.29633569717407227,
"learning_rate": 0.0002909618573797678,
"loss": 0.6014,
"step": 119
},
{
"epoch": 0.06615214994487321,
"grad_norm": 0.29278406500816345,
"learning_rate": 0.00029087893864013264,
"loss": 0.6001,
"step": 120
},
{
"epoch": 0.06670341786108049,
"grad_norm": 0.29871347546577454,
"learning_rate": 0.0002907960199004975,
"loss": 0.629,
"step": 121
},
{
"epoch": 0.06725468577728776,
"grad_norm": 0.27272510528564453,
"learning_rate": 0.00029071310116086235,
"loss": 0.5502,
"step": 122
},
{
"epoch": 0.06780595369349504,
"grad_norm": 0.2796414792537689,
"learning_rate": 0.00029063018242122716,
"loss": 0.5712,
"step": 123
},
{
"epoch": 0.06835722160970231,
"grad_norm": 0.277700811624527,
"learning_rate": 0.000290547263681592,
"loss": 0.5654,
"step": 124
},
{
"epoch": 0.0689084895259096,
"grad_norm": 0.2710396647453308,
"learning_rate": 0.0002904643449419569,
"loss": 0.5866,
"step": 125
},
{
"epoch": 0.06945975744211687,
"grad_norm": 0.28910425305366516,
"learning_rate": 0.00029038142620232174,
"loss": 0.5679,
"step": 126
},
{
"epoch": 0.07001102535832414,
"grad_norm": 0.2892954647541046,
"learning_rate": 0.00029029850746268654,
"loss": 0.5915,
"step": 127
},
{
"epoch": 0.07056229327453142,
"grad_norm": 0.3241787552833557,
"learning_rate": 0.0002902155887230514,
"loss": 0.5818,
"step": 128
},
{
"epoch": 0.0711135611907387,
"grad_norm": 0.29878735542297363,
"learning_rate": 0.0002901326699834162,
"loss": 0.5813,
"step": 129
},
{
"epoch": 0.07166482910694598,
"grad_norm": 0.27833399176597595,
"learning_rate": 0.00029004975124378106,
"loss": 0.5865,
"step": 130
},
{
"epoch": 0.07221609702315325,
"grad_norm": 0.3239665627479553,
"learning_rate": 0.0002899668325041459,
"loss": 0.5898,
"step": 131
},
{
"epoch": 0.07276736493936053,
"grad_norm": 0.31001126766204834,
"learning_rate": 0.0002898839137645108,
"loss": 0.577,
"step": 132
},
{
"epoch": 0.0733186328555678,
"grad_norm": 0.2673737704753876,
"learning_rate": 0.0002898009950248756,
"loss": 0.5684,
"step": 133
},
{
"epoch": 0.07386990077177508,
"grad_norm": 0.3218002915382385,
"learning_rate": 0.00028971807628524045,
"loss": 0.5826,
"step": 134
},
{
"epoch": 0.07442116868798236,
"grad_norm": 0.2867553234100342,
"learning_rate": 0.00028963515754560525,
"loss": 0.5679,
"step": 135
},
{
"epoch": 0.07497243660418963,
"grad_norm": 0.2790491282939911,
"learning_rate": 0.00028955223880597017,
"loss": 0.5532,
"step": 136
},
{
"epoch": 0.07552370452039692,
"grad_norm": 0.3101596534252167,
"learning_rate": 0.00028946932006633497,
"loss": 0.616,
"step": 137
},
{
"epoch": 0.07607497243660419,
"grad_norm": 0.2670627534389496,
"learning_rate": 0.00028938640132669983,
"loss": 0.5147,
"step": 138
},
{
"epoch": 0.07662624035281147,
"grad_norm": 0.28873148560523987,
"learning_rate": 0.00028930348258706463,
"loss": 0.5723,
"step": 139
},
{
"epoch": 0.07717750826901874,
"grad_norm": 0.3042322099208832,
"learning_rate": 0.0002892205638474295,
"loss": 0.5483,
"step": 140
},
{
"epoch": 0.07772877618522601,
"grad_norm": 0.30197396874427795,
"learning_rate": 0.00028913764510779435,
"loss": 0.5731,
"step": 141
},
{
"epoch": 0.0782800441014333,
"grad_norm": 0.2676428258419037,
"learning_rate": 0.0002890547263681592,
"loss": 0.5384,
"step": 142
},
{
"epoch": 0.07883131201764057,
"grad_norm": 0.2983885705471039,
"learning_rate": 0.000288971807628524,
"loss": 0.5777,
"step": 143
},
{
"epoch": 0.07938257993384785,
"grad_norm": 0.3119770586490631,
"learning_rate": 0.0002888888888888888,
"loss": 0.5682,
"step": 144
},
{
"epoch": 0.07993384785005513,
"grad_norm": 0.28664880990982056,
"learning_rate": 0.0002888059701492537,
"loss": 0.5875,
"step": 145
},
{
"epoch": 0.0804851157662624,
"grad_norm": 0.2691631615161896,
"learning_rate": 0.00028872305140961854,
"loss": 0.5841,
"step": 146
},
{
"epoch": 0.08103638368246968,
"grad_norm": 0.29469335079193115,
"learning_rate": 0.0002886401326699834,
"loss": 0.6111,
"step": 147
},
{
"epoch": 0.08158765159867695,
"grad_norm": 0.27499398589134216,
"learning_rate": 0.0002885572139303482,
"loss": 0.5984,
"step": 148
},
{
"epoch": 0.08213891951488424,
"grad_norm": 0.2869040369987488,
"learning_rate": 0.00028847429519071306,
"loss": 0.5862,
"step": 149
},
{
"epoch": 0.08269018743109151,
"grad_norm": 0.25979968905448914,
"learning_rate": 0.0002883913764510779,
"loss": 0.5948,
"step": 150
},
{
"epoch": 0.08324145534729879,
"grad_norm": 0.2581140398979187,
"learning_rate": 0.0002883084577114428,
"loss": 0.543,
"step": 151
},
{
"epoch": 0.08379272326350606,
"grad_norm": 0.3241422474384308,
"learning_rate": 0.0002882255389718076,
"loss": 0.5584,
"step": 152
},
{
"epoch": 0.08434399117971333,
"grad_norm": 0.3122616112232208,
"learning_rate": 0.00028814262023217245,
"loss": 0.6101,
"step": 153
},
{
"epoch": 0.08489525909592062,
"grad_norm": 0.28104907274246216,
"learning_rate": 0.00028805970149253725,
"loss": 0.5721,
"step": 154
},
{
"epoch": 0.08544652701212789,
"grad_norm": 0.32965442538261414,
"learning_rate": 0.0002879767827529021,
"loss": 0.5396,
"step": 155
},
{
"epoch": 0.08599779492833518,
"grad_norm": 0.32811254262924194,
"learning_rate": 0.00028789386401326697,
"loss": 0.5819,
"step": 156
},
{
"epoch": 0.08654906284454245,
"grad_norm": 0.3046472668647766,
"learning_rate": 0.00028781094527363183,
"loss": 0.5756,
"step": 157
},
{
"epoch": 0.08710033076074973,
"grad_norm": 0.308413028717041,
"learning_rate": 0.00028772802653399663,
"loss": 0.611,
"step": 158
},
{
"epoch": 0.087651598676957,
"grad_norm": 0.2636229693889618,
"learning_rate": 0.0002876451077943615,
"loss": 0.5608,
"step": 159
},
{
"epoch": 0.08820286659316427,
"grad_norm": 0.29085874557495117,
"learning_rate": 0.00028756218905472635,
"loss": 0.553,
"step": 160
},
{
"epoch": 0.08875413450937156,
"grad_norm": 0.2887280285358429,
"learning_rate": 0.0002874792703150912,
"loss": 0.5958,
"step": 161
},
{
"epoch": 0.08930540242557883,
"grad_norm": 0.26728978753089905,
"learning_rate": 0.000287396351575456,
"loss": 0.5487,
"step": 162
},
{
"epoch": 0.08985667034178611,
"grad_norm": 0.25967663526535034,
"learning_rate": 0.0002873134328358209,
"loss": 0.5657,
"step": 163
},
{
"epoch": 0.09040793825799338,
"grad_norm": 0.2513408064842224,
"learning_rate": 0.0002872305140961857,
"loss": 0.5358,
"step": 164
},
{
"epoch": 0.09095920617420065,
"grad_norm": 0.28536808490753174,
"learning_rate": 0.00028714759535655054,
"loss": 0.6057,
"step": 165
},
{
"epoch": 0.09151047409040794,
"grad_norm": 0.28766608238220215,
"learning_rate": 0.0002870646766169154,
"loss": 0.6108,
"step": 166
},
{
"epoch": 0.09206174200661521,
"grad_norm": 0.25628137588500977,
"learning_rate": 0.00028698175787728026,
"loss": 0.53,
"step": 167
},
{
"epoch": 0.0926130099228225,
"grad_norm": 0.2983819246292114,
"learning_rate": 0.00028689883913764506,
"loss": 0.5997,
"step": 168
},
{
"epoch": 0.09316427783902977,
"grad_norm": 0.27762502431869507,
"learning_rate": 0.0002868159203980099,
"loss": 0.5833,
"step": 169
},
{
"epoch": 0.09371554575523705,
"grad_norm": 0.28496429324150085,
"learning_rate": 0.0002867330016583748,
"loss": 0.5863,
"step": 170
},
{
"epoch": 0.09426681367144432,
"grad_norm": 0.26081910729408264,
"learning_rate": 0.00028665008291873964,
"loss": 0.5943,
"step": 171
},
{
"epoch": 0.09481808158765159,
"grad_norm": 0.27544835209846497,
"learning_rate": 0.00028656716417910444,
"loss": 0.6175,
"step": 172
},
{
"epoch": 0.09536934950385888,
"grad_norm": 0.2690446972846985,
"learning_rate": 0.0002864842454394693,
"loss": 0.5473,
"step": 173
},
{
"epoch": 0.09592061742006615,
"grad_norm": 0.2816300690174103,
"learning_rate": 0.0002864013266998341,
"loss": 0.5908,
"step": 174
},
{
"epoch": 0.09647188533627343,
"grad_norm": 0.26558321714401245,
"learning_rate": 0.00028631840796019897,
"loss": 0.5711,
"step": 175
},
{
"epoch": 0.0970231532524807,
"grad_norm": 0.2692832946777344,
"learning_rate": 0.0002862354892205638,
"loss": 0.5731,
"step": 176
},
{
"epoch": 0.09757442116868799,
"grad_norm": 0.2814270555973053,
"learning_rate": 0.0002861525704809287,
"loss": 0.5353,
"step": 177
},
{
"epoch": 0.09812568908489526,
"grad_norm": 0.26562657952308655,
"learning_rate": 0.0002860696517412935,
"loss": 0.5955,
"step": 178
},
{
"epoch": 0.09867695700110253,
"grad_norm": 0.2592059075832367,
"learning_rate": 0.00028598673300165835,
"loss": 0.5617,
"step": 179
},
{
"epoch": 0.09922822491730982,
"grad_norm": 0.26579222083091736,
"learning_rate": 0.0002859038142620232,
"loss": 0.5725,
"step": 180
},
{
"epoch": 0.09977949283351709,
"grad_norm": 0.2731139063835144,
"learning_rate": 0.00028582089552238807,
"loss": 0.5614,
"step": 181
},
{
"epoch": 0.10033076074972437,
"grad_norm": 0.2470698207616806,
"learning_rate": 0.00028573797678275287,
"loss": 0.5347,
"step": 182
},
{
"epoch": 0.10088202866593164,
"grad_norm": 0.24656972289085388,
"learning_rate": 0.00028565505804311773,
"loss": 0.5481,
"step": 183
},
{
"epoch": 0.10143329658213891,
"grad_norm": 0.2857254445552826,
"learning_rate": 0.00028557213930348254,
"loss": 0.602,
"step": 184
},
{
"epoch": 0.1019845644983462,
"grad_norm": 0.27286651730537415,
"learning_rate": 0.0002854892205638474,
"loss": 0.5585,
"step": 185
},
{
"epoch": 0.10253583241455347,
"grad_norm": 0.2675493359565735,
"learning_rate": 0.00028540630182421225,
"loss": 0.567,
"step": 186
},
{
"epoch": 0.10308710033076075,
"grad_norm": 0.26535746455192566,
"learning_rate": 0.00028532338308457706,
"loss": 0.5696,
"step": 187
},
{
"epoch": 0.10363836824696802,
"grad_norm": 0.2633534371852875,
"learning_rate": 0.0002852404643449419,
"loss": 0.5326,
"step": 188
},
{
"epoch": 0.10418963616317531,
"grad_norm": 0.2724531292915344,
"learning_rate": 0.0002851575456053068,
"loss": 0.5905,
"step": 189
},
{
"epoch": 0.10474090407938258,
"grad_norm": 0.2680416405200958,
"learning_rate": 0.00028507462686567164,
"loss": 0.5924,
"step": 190
},
{
"epoch": 0.10529217199558985,
"grad_norm": 0.28108882904052734,
"learning_rate": 0.00028499170812603644,
"loss": 0.5926,
"step": 191
},
{
"epoch": 0.10584343991179714,
"grad_norm": 0.2787463366985321,
"learning_rate": 0.0002849087893864013,
"loss": 0.5699,
"step": 192
},
{
"epoch": 0.1063947078280044,
"grad_norm": 0.2674010396003723,
"learning_rate": 0.0002848258706467661,
"loss": 0.587,
"step": 193
},
{
"epoch": 0.10694597574421169,
"grad_norm": 0.27142807841300964,
"learning_rate": 0.00028474295190713097,
"loss": 0.5762,
"step": 194
},
{
"epoch": 0.10749724366041896,
"grad_norm": 0.2817786633968353,
"learning_rate": 0.0002846600331674958,
"loss": 0.5672,
"step": 195
},
{
"epoch": 0.10804851157662625,
"grad_norm": 0.250627338886261,
"learning_rate": 0.0002845771144278607,
"loss": 0.5425,
"step": 196
},
{
"epoch": 0.10859977949283352,
"grad_norm": 0.2636951506137848,
"learning_rate": 0.0002844941956882255,
"loss": 0.579,
"step": 197
},
{
"epoch": 0.10915104740904079,
"grad_norm": 0.2613438665866852,
"learning_rate": 0.00028441127694859035,
"loss": 0.5531,
"step": 198
},
{
"epoch": 0.10970231532524807,
"grad_norm": 0.28677162528038025,
"learning_rate": 0.0002843283582089552,
"loss": 0.5875,
"step": 199
},
{
"epoch": 0.11025358324145534,
"grad_norm": 0.2670292258262634,
"learning_rate": 0.00028424543946932007,
"loss": 0.5625,
"step": 200
},
{
"epoch": 0.11080485115766263,
"grad_norm": 0.23815321922302246,
"learning_rate": 0.00028416252072968487,
"loss": 0.5484,
"step": 201
},
{
"epoch": 0.1113561190738699,
"grad_norm": 0.2709272503852844,
"learning_rate": 0.00028407960199004973,
"loss": 0.5387,
"step": 202
},
{
"epoch": 0.11190738699007717,
"grad_norm": 0.25918126106262207,
"learning_rate": 0.00028399668325041453,
"loss": 0.5686,
"step": 203
},
{
"epoch": 0.11245865490628446,
"grad_norm": 0.27118560671806335,
"learning_rate": 0.0002839137645107794,
"loss": 0.5637,
"step": 204
},
{
"epoch": 0.11300992282249173,
"grad_norm": 0.26395100355148315,
"learning_rate": 0.00028383084577114425,
"loss": 0.5499,
"step": 205
},
{
"epoch": 0.11356119073869901,
"grad_norm": 0.272989422082901,
"learning_rate": 0.0002837479270315091,
"loss": 0.5606,
"step": 206
},
{
"epoch": 0.11411245865490628,
"grad_norm": 0.2708880603313446,
"learning_rate": 0.0002836650082918739,
"loss": 0.534,
"step": 207
},
{
"epoch": 0.11466372657111357,
"grad_norm": 0.28653857111930847,
"learning_rate": 0.0002835820895522388,
"loss": 0.5727,
"step": 208
},
{
"epoch": 0.11521499448732084,
"grad_norm": 0.2767845392227173,
"learning_rate": 0.00028349917081260364,
"loss": 0.5664,
"step": 209
},
{
"epoch": 0.11576626240352811,
"grad_norm": 0.27690836787223816,
"learning_rate": 0.0002834162520729685,
"loss": 0.5656,
"step": 210
},
{
"epoch": 0.1163175303197354,
"grad_norm": 0.2831721007823944,
"learning_rate": 0.0002833333333333333,
"loss": 0.596,
"step": 211
},
{
"epoch": 0.11686879823594266,
"grad_norm": 0.3024809658527374,
"learning_rate": 0.00028325041459369816,
"loss": 0.5849,
"step": 212
},
{
"epoch": 0.11742006615214995,
"grad_norm": 0.2787605822086334,
"learning_rate": 0.00028316749585406296,
"loss": 0.5606,
"step": 213
},
{
"epoch": 0.11797133406835722,
"grad_norm": 0.2734401226043701,
"learning_rate": 0.0002830845771144278,
"loss": 0.5524,
"step": 214
},
{
"epoch": 0.1185226019845645,
"grad_norm": 0.2717944085597992,
"learning_rate": 0.0002830016583747927,
"loss": 0.5533,
"step": 215
},
{
"epoch": 0.11907386990077178,
"grad_norm": 0.2634055018424988,
"learning_rate": 0.00028291873963515754,
"loss": 0.5552,
"step": 216
},
{
"epoch": 0.11962513781697905,
"grad_norm": 0.27231520414352417,
"learning_rate": 0.00028283582089552235,
"loss": 0.5608,
"step": 217
},
{
"epoch": 0.12017640573318633,
"grad_norm": 0.2709995210170746,
"learning_rate": 0.0002827529021558872,
"loss": 0.5608,
"step": 218
},
{
"epoch": 0.1207276736493936,
"grad_norm": 0.24507290124893188,
"learning_rate": 0.00028266998341625206,
"loss": 0.5324,
"step": 219
},
{
"epoch": 0.12127894156560089,
"grad_norm": 0.26341697573661804,
"learning_rate": 0.0002825870646766169,
"loss": 0.5686,
"step": 220
},
{
"epoch": 0.12183020948180816,
"grad_norm": 0.2655317783355713,
"learning_rate": 0.00028250414593698173,
"loss": 0.5792,
"step": 221
},
{
"epoch": 0.12238147739801543,
"grad_norm": 0.263235867023468,
"learning_rate": 0.0002824212271973466,
"loss": 0.5633,
"step": 222
},
{
"epoch": 0.12293274531422271,
"grad_norm": 0.28087055683135986,
"learning_rate": 0.0002823383084577114,
"loss": 0.559,
"step": 223
},
{
"epoch": 0.12348401323042998,
"grad_norm": 0.2734236717224121,
"learning_rate": 0.00028225538971807625,
"loss": 0.5772,
"step": 224
},
{
"epoch": 0.12403528114663727,
"grad_norm": 0.2594766318798065,
"learning_rate": 0.0002821724709784411,
"loss": 0.5698,
"step": 225
},
{
"epoch": 0.12458654906284454,
"grad_norm": 0.2490595132112503,
"learning_rate": 0.00028208955223880597,
"loss": 0.5419,
"step": 226
},
{
"epoch": 0.12513781697905182,
"grad_norm": 0.25069767236709595,
"learning_rate": 0.0002820066334991708,
"loss": 0.531,
"step": 227
},
{
"epoch": 0.1256890848952591,
"grad_norm": 0.2518230080604553,
"learning_rate": 0.00028192371475953563,
"loss": 0.5509,
"step": 228
},
{
"epoch": 0.12624035281146637,
"grad_norm": 0.2488110512495041,
"learning_rate": 0.0002818407960199005,
"loss": 0.5341,
"step": 229
},
{
"epoch": 0.12679162072767364,
"grad_norm": 0.26115381717681885,
"learning_rate": 0.00028175787728026535,
"loss": 0.5433,
"step": 230
},
{
"epoch": 0.12734288864388094,
"grad_norm": 0.24792101979255676,
"learning_rate": 0.00028167495854063016,
"loss": 0.5672,
"step": 231
},
{
"epoch": 0.1278941565600882,
"grad_norm": 0.2637925148010254,
"learning_rate": 0.00028159203980099496,
"loss": 0.5868,
"step": 232
},
{
"epoch": 0.12844542447629548,
"grad_norm": 0.2799462676048279,
"learning_rate": 0.0002815091210613598,
"loss": 0.5514,
"step": 233
},
{
"epoch": 0.12899669239250275,
"grad_norm": 0.2809968590736389,
"learning_rate": 0.0002814262023217247,
"loss": 0.5847,
"step": 234
},
{
"epoch": 0.12954796030871002,
"grad_norm": 0.27108708024024963,
"learning_rate": 0.00028134328358208954,
"loss": 0.5718,
"step": 235
},
{
"epoch": 0.13009922822491732,
"grad_norm": 0.2557702660560608,
"learning_rate": 0.00028126036484245434,
"loss": 0.575,
"step": 236
},
{
"epoch": 0.1306504961411246,
"grad_norm": 0.2593226134777069,
"learning_rate": 0.0002811774461028192,
"loss": 0.5534,
"step": 237
},
{
"epoch": 0.13120176405733186,
"grad_norm": 0.2657114565372467,
"learning_rate": 0.00028109452736318406,
"loss": 0.5605,
"step": 238
},
{
"epoch": 0.13175303197353913,
"grad_norm": 0.25616228580474854,
"learning_rate": 0.0002810116086235489,
"loss": 0.5227,
"step": 239
},
{
"epoch": 0.13230429988974643,
"grad_norm": 0.2749009430408478,
"learning_rate": 0.0002809286898839137,
"loss": 0.536,
"step": 240
},
{
"epoch": 0.1328555678059537,
"grad_norm": 0.2617826759815216,
"learning_rate": 0.0002808457711442786,
"loss": 0.5602,
"step": 241
},
{
"epoch": 0.13340683572216097,
"grad_norm": 0.2576202154159546,
"learning_rate": 0.0002807628524046434,
"loss": 0.5205,
"step": 242
},
{
"epoch": 0.13395810363836824,
"grad_norm": 0.2764850854873657,
"learning_rate": 0.00028067993366500825,
"loss": 0.5752,
"step": 243
},
{
"epoch": 0.1345093715545755,
"grad_norm": 0.2652502954006195,
"learning_rate": 0.0002805970149253731,
"loss": 0.5495,
"step": 244
},
{
"epoch": 0.1350606394707828,
"grad_norm": 0.24600890278816223,
"learning_rate": 0.00028051409618573797,
"loss": 0.5146,
"step": 245
},
{
"epoch": 0.13561190738699008,
"grad_norm": 0.253635048866272,
"learning_rate": 0.0002804311774461028,
"loss": 0.5483,
"step": 246
},
{
"epoch": 0.13616317530319735,
"grad_norm": 0.24037104845046997,
"learning_rate": 0.00028034825870646763,
"loss": 0.5624,
"step": 247
},
{
"epoch": 0.13671444321940462,
"grad_norm": 0.24676042795181274,
"learning_rate": 0.0002802653399668325,
"loss": 0.537,
"step": 248
},
{
"epoch": 0.1372657111356119,
"grad_norm": 0.25283971428871155,
"learning_rate": 0.00028018242122719735,
"loss": 0.5705,
"step": 249
},
{
"epoch": 0.1378169790518192,
"grad_norm": 0.2672947347164154,
"learning_rate": 0.00028009950248756216,
"loss": 0.5699,
"step": 250
},
{
"epoch": 0.13836824696802646,
"grad_norm": 0.25930237770080566,
"learning_rate": 0.000280016583747927,
"loss": 0.5581,
"step": 251
},
{
"epoch": 0.13891951488423374,
"grad_norm": 0.24674735963344574,
"learning_rate": 0.0002799336650082918,
"loss": 0.5282,
"step": 252
},
{
"epoch": 0.139470782800441,
"grad_norm": 0.2826119065284729,
"learning_rate": 0.0002798507462686567,
"loss": 0.5261,
"step": 253
},
{
"epoch": 0.14002205071664828,
"grad_norm": 0.290584534406662,
"learning_rate": 0.00027976782752902154,
"loss": 0.5245,
"step": 254
},
{
"epoch": 0.14057331863285558,
"grad_norm": 0.25072574615478516,
"learning_rate": 0.0002796849087893864,
"loss": 0.5264,
"step": 255
},
{
"epoch": 0.14112458654906285,
"grad_norm": 0.24929046630859375,
"learning_rate": 0.0002796019900497512,
"loss": 0.5698,
"step": 256
},
{
"epoch": 0.14167585446527012,
"grad_norm": 0.24978522956371307,
"learning_rate": 0.00027951907131011606,
"loss": 0.5269,
"step": 257
},
{
"epoch": 0.1422271223814774,
"grad_norm": 0.26195666193962097,
"learning_rate": 0.0002794361525704809,
"loss": 0.5801,
"step": 258
},
{
"epoch": 0.1427783902976847,
"grad_norm": 0.27321335673332214,
"learning_rate": 0.0002793532338308458,
"loss": 0.5556,
"step": 259
},
{
"epoch": 0.14332965821389196,
"grad_norm": 0.2694965898990631,
"learning_rate": 0.0002792703150912106,
"loss": 0.5715,
"step": 260
},
{
"epoch": 0.14388092613009923,
"grad_norm": 0.2757553160190582,
"learning_rate": 0.00027918739635157544,
"loss": 0.5645,
"step": 261
},
{
"epoch": 0.1444321940463065,
"grad_norm": 0.2602946162223816,
"learning_rate": 0.00027910447761194025,
"loss": 0.5703,
"step": 262
},
{
"epoch": 0.14498346196251377,
"grad_norm": 0.24068838357925415,
"learning_rate": 0.0002790215588723051,
"loss": 0.5168,
"step": 263
},
{
"epoch": 0.14553472987872107,
"grad_norm": 0.26140162348747253,
"learning_rate": 0.00027893864013266997,
"loss": 0.5271,
"step": 264
},
{
"epoch": 0.14608599779492834,
"grad_norm": 0.26940983533859253,
"learning_rate": 0.0002788557213930348,
"loss": 0.5571,
"step": 265
},
{
"epoch": 0.1466372657111356,
"grad_norm": 0.24524417519569397,
"learning_rate": 0.00027877280265339963,
"loss": 0.5227,
"step": 266
},
{
"epoch": 0.14718853362734288,
"grad_norm": 0.2636984884738922,
"learning_rate": 0.0002786898839137645,
"loss": 0.5335,
"step": 267
},
{
"epoch": 0.14773980154355015,
"grad_norm": 0.24600271880626678,
"learning_rate": 0.00027860696517412935,
"loss": 0.5601,
"step": 268
},
{
"epoch": 0.14829106945975745,
"grad_norm": 0.24977444112300873,
"learning_rate": 0.0002785240464344942,
"loss": 0.5437,
"step": 269
},
{
"epoch": 0.14884233737596472,
"grad_norm": 0.27960002422332764,
"learning_rate": 0.000278441127694859,
"loss": 0.548,
"step": 270
},
{
"epoch": 0.149393605292172,
"grad_norm": 0.2514914870262146,
"learning_rate": 0.00027835820895522387,
"loss": 0.5335,
"step": 271
},
{
"epoch": 0.14994487320837926,
"grad_norm": 0.2503030300140381,
"learning_rate": 0.0002782752902155887,
"loss": 0.5538,
"step": 272
},
{
"epoch": 0.15049614112458654,
"grad_norm": 0.28311678767204285,
"learning_rate": 0.00027819237147595354,
"loss": 0.5649,
"step": 273
},
{
"epoch": 0.15104740904079383,
"grad_norm": 0.27529653906822205,
"learning_rate": 0.0002781094527363184,
"loss": 0.5432,
"step": 274
},
{
"epoch": 0.1515986769570011,
"grad_norm": 0.266111820936203,
"learning_rate": 0.0002780265339966832,
"loss": 0.5475,
"step": 275
},
{
"epoch": 0.15214994487320838,
"grad_norm": 0.2525365352630615,
"learning_rate": 0.00027794361525704806,
"loss": 0.5252,
"step": 276
},
{
"epoch": 0.15270121278941565,
"grad_norm": 0.2655681371688843,
"learning_rate": 0.0002778606965174129,
"loss": 0.5406,
"step": 277
},
{
"epoch": 0.15325248070562295,
"grad_norm": 0.29118314385414124,
"learning_rate": 0.0002777777777777778,
"loss": 0.5324,
"step": 278
},
{
"epoch": 0.15380374862183022,
"grad_norm": 0.2875930070877075,
"learning_rate": 0.0002776948590381426,
"loss": 0.5804,
"step": 279
},
{
"epoch": 0.1543550165380375,
"grad_norm": 0.26764920353889465,
"learning_rate": 0.00027761194029850744,
"loss": 0.5391,
"step": 280
},
{
"epoch": 0.15490628445424476,
"grad_norm": 0.2753891348838806,
"learning_rate": 0.00027752902155887225,
"loss": 0.5573,
"step": 281
},
{
"epoch": 0.15545755237045203,
"grad_norm": 0.26174411177635193,
"learning_rate": 0.0002774461028192371,
"loss": 0.5543,
"step": 282
},
{
"epoch": 0.15600882028665933,
"grad_norm": 0.25004303455352783,
"learning_rate": 0.00027736318407960196,
"loss": 0.5546,
"step": 283
},
{
"epoch": 0.1565600882028666,
"grad_norm": 0.2634401023387909,
"learning_rate": 0.0002772802653399668,
"loss": 0.524,
"step": 284
},
{
"epoch": 0.15711135611907387,
"grad_norm": 0.26751798391342163,
"learning_rate": 0.00027719734660033163,
"loss": 0.574,
"step": 285
},
{
"epoch": 0.15766262403528114,
"grad_norm": 0.2556850016117096,
"learning_rate": 0.0002771144278606965,
"loss": 0.5533,
"step": 286
},
{
"epoch": 0.1582138919514884,
"grad_norm": 0.2557762563228607,
"learning_rate": 0.00027703150912106135,
"loss": 0.546,
"step": 287
},
{
"epoch": 0.1587651598676957,
"grad_norm": 0.25817009806632996,
"learning_rate": 0.0002769485903814262,
"loss": 0.5519,
"step": 288
},
{
"epoch": 0.15931642778390298,
"grad_norm": 0.26580142974853516,
"learning_rate": 0.000276865671641791,
"loss": 0.5438,
"step": 289
},
{
"epoch": 0.15986769570011025,
"grad_norm": 0.25780072808265686,
"learning_rate": 0.00027678275290215587,
"loss": 0.549,
"step": 290
},
{
"epoch": 0.16041896361631752,
"grad_norm": 0.2627890706062317,
"learning_rate": 0.0002766998341625207,
"loss": 0.5565,
"step": 291
},
{
"epoch": 0.1609702315325248,
"grad_norm": 0.26781341433525085,
"learning_rate": 0.00027661691542288553,
"loss": 0.542,
"step": 292
},
{
"epoch": 0.1615214994487321,
"grad_norm": 0.253888338804245,
"learning_rate": 0.0002765339966832504,
"loss": 0.5424,
"step": 293
},
{
"epoch": 0.16207276736493936,
"grad_norm": 0.2835153043270111,
"learning_rate": 0.00027645107794361525,
"loss": 0.5354,
"step": 294
},
{
"epoch": 0.16262403528114663,
"grad_norm": 0.286640465259552,
"learning_rate": 0.00027636815920398006,
"loss": 0.5209,
"step": 295
},
{
"epoch": 0.1631753031973539,
"grad_norm": 0.25742077827453613,
"learning_rate": 0.0002762852404643449,
"loss": 0.5198,
"step": 296
},
{
"epoch": 0.1637265711135612,
"grad_norm": 0.24710626900196075,
"learning_rate": 0.0002762023217247098,
"loss": 0.5189,
"step": 297
},
{
"epoch": 0.16427783902976847,
"grad_norm": 0.28113001585006714,
"learning_rate": 0.00027611940298507464,
"loss": 0.5519,
"step": 298
},
{
"epoch": 0.16482910694597575,
"grad_norm": 0.2573966085910797,
"learning_rate": 0.00027603648424543944,
"loss": 0.5307,
"step": 299
},
{
"epoch": 0.16538037486218302,
"grad_norm": 0.24416916072368622,
"learning_rate": 0.0002759535655058043,
"loss": 0.5519,
"step": 300
},
{
"epoch": 0.1659316427783903,
"grad_norm": 0.25596654415130615,
"learning_rate": 0.0002758706467661691,
"loss": 0.5344,
"step": 301
},
{
"epoch": 0.16648291069459759,
"grad_norm": 0.25158900022506714,
"learning_rate": 0.00027578772802653396,
"loss": 0.5399,
"step": 302
},
{
"epoch": 0.16703417861080486,
"grad_norm": 0.24854016304016113,
"learning_rate": 0.0002757048092868988,
"loss": 0.5389,
"step": 303
},
{
"epoch": 0.16758544652701213,
"grad_norm": 0.2592412233352661,
"learning_rate": 0.0002756218905472637,
"loss": 0.5584,
"step": 304
},
{
"epoch": 0.1681367144432194,
"grad_norm": 0.2527318298816681,
"learning_rate": 0.0002755389718076285,
"loss": 0.5604,
"step": 305
},
{
"epoch": 0.16868798235942667,
"grad_norm": 0.26560983061790466,
"learning_rate": 0.00027545605306799335,
"loss": 0.5561,
"step": 306
},
{
"epoch": 0.16923925027563397,
"grad_norm": 0.2634880542755127,
"learning_rate": 0.0002753731343283582,
"loss": 0.5281,
"step": 307
},
{
"epoch": 0.16979051819184124,
"grad_norm": 0.2732850909233093,
"learning_rate": 0.00027529021558872306,
"loss": 0.5398,
"step": 308
},
{
"epoch": 0.1703417861080485,
"grad_norm": 0.23158006370067596,
"learning_rate": 0.00027520729684908787,
"loss": 0.5325,
"step": 309
},
{
"epoch": 0.17089305402425578,
"grad_norm": 0.24649128317832947,
"learning_rate": 0.00027512437810945273,
"loss": 0.5381,
"step": 310
},
{
"epoch": 0.17144432194046305,
"grad_norm": 0.2770949602127075,
"learning_rate": 0.00027504145936981753,
"loss": 0.5498,
"step": 311
},
{
"epoch": 0.17199558985667035,
"grad_norm": 0.25388598442077637,
"learning_rate": 0.0002749585406301824,
"loss": 0.5389,
"step": 312
},
{
"epoch": 0.17254685777287762,
"grad_norm": 0.2431599199771881,
"learning_rate": 0.00027487562189054725,
"loss": 0.5343,
"step": 313
},
{
"epoch": 0.1730981256890849,
"grad_norm": 0.24289795756340027,
"learning_rate": 0.0002747927031509121,
"loss": 0.5073,
"step": 314
},
{
"epoch": 0.17364939360529216,
"grad_norm": 0.2458408623933792,
"learning_rate": 0.0002747097844112769,
"loss": 0.5278,
"step": 315
},
{
"epoch": 0.17420066152149946,
"grad_norm": 0.24127742648124695,
"learning_rate": 0.0002746268656716418,
"loss": 0.5345,
"step": 316
},
{
"epoch": 0.17475192943770673,
"grad_norm": 0.26737701892852783,
"learning_rate": 0.00027454394693200663,
"loss": 0.5395,
"step": 317
},
{
"epoch": 0.175303197353914,
"grad_norm": 0.26361507177352905,
"learning_rate": 0.0002744610281923715,
"loss": 0.5405,
"step": 318
},
{
"epoch": 0.17585446527012127,
"grad_norm": 0.24210020899772644,
"learning_rate": 0.0002743781094527363,
"loss": 0.5268,
"step": 319
},
{
"epoch": 0.17640573318632854,
"grad_norm": 0.2510232627391815,
"learning_rate": 0.0002742951907131011,
"loss": 0.5373,
"step": 320
},
{
"epoch": 0.17695700110253584,
"grad_norm": 0.23939576745033264,
"learning_rate": 0.00027421227197346596,
"loss": 0.5561,
"step": 321
},
{
"epoch": 0.17750826901874311,
"grad_norm": 0.273258239030838,
"learning_rate": 0.0002741293532338308,
"loss": 0.5507,
"step": 322
},
{
"epoch": 0.17805953693495039,
"grad_norm": 0.23547501862049103,
"learning_rate": 0.0002740464344941957,
"loss": 0.5293,
"step": 323
},
{
"epoch": 0.17861080485115766,
"grad_norm": 0.24796201288700104,
"learning_rate": 0.0002739635157545605,
"loss": 0.5378,
"step": 324
},
{
"epoch": 0.17916207276736493,
"grad_norm": 0.23436011373996735,
"learning_rate": 0.00027388059701492534,
"loss": 0.5432,
"step": 325
},
{
"epoch": 0.17971334068357223,
"grad_norm": 0.22892701625823975,
"learning_rate": 0.0002737976782752902,
"loss": 0.5221,
"step": 326
},
{
"epoch": 0.1802646085997795,
"grad_norm": 0.23817826807498932,
"learning_rate": 0.00027371475953565506,
"loss": 0.5284,
"step": 327
},
{
"epoch": 0.18081587651598677,
"grad_norm": 0.23703162372112274,
"learning_rate": 0.00027363184079601987,
"loss": 0.5223,
"step": 328
},
{
"epoch": 0.18136714443219404,
"grad_norm": 0.24087084829807281,
"learning_rate": 0.0002735489220563847,
"loss": 0.5489,
"step": 329
},
{
"epoch": 0.1819184123484013,
"grad_norm": 0.2529735267162323,
"learning_rate": 0.00027346600331674953,
"loss": 0.5485,
"step": 330
},
{
"epoch": 0.1824696802646086,
"grad_norm": 0.23450088500976562,
"learning_rate": 0.0002733830845771144,
"loss": 0.4971,
"step": 331
},
{
"epoch": 0.18302094818081588,
"grad_norm": 0.23895451426506042,
"learning_rate": 0.00027330016583747925,
"loss": 0.5165,
"step": 332
},
{
"epoch": 0.18357221609702315,
"grad_norm": 0.24417142570018768,
"learning_rate": 0.0002732172470978441,
"loss": 0.5491,
"step": 333
},
{
"epoch": 0.18412348401323042,
"grad_norm": 0.2527695596218109,
"learning_rate": 0.0002731343283582089,
"loss": 0.5255,
"step": 334
},
{
"epoch": 0.18467475192943772,
"grad_norm": 0.24978198111057281,
"learning_rate": 0.00027305140961857377,
"loss": 0.5389,
"step": 335
},
{
"epoch": 0.185226019845645,
"grad_norm": 0.2539977431297302,
"learning_rate": 0.00027296849087893863,
"loss": 0.5392,
"step": 336
},
{
"epoch": 0.18577728776185226,
"grad_norm": 0.24033623933792114,
"learning_rate": 0.0002728855721393035,
"loss": 0.5356,
"step": 337
},
{
"epoch": 0.18632855567805953,
"grad_norm": 0.24697022140026093,
"learning_rate": 0.0002728026533996683,
"loss": 0.5159,
"step": 338
},
{
"epoch": 0.1868798235942668,
"grad_norm": 0.25741416215896606,
"learning_rate": 0.00027271973466003315,
"loss": 0.56,
"step": 339
},
{
"epoch": 0.1874310915104741,
"grad_norm": 0.2324167639017105,
"learning_rate": 0.00027263681592039796,
"loss": 0.5379,
"step": 340
},
{
"epoch": 0.18798235942668137,
"grad_norm": 0.24800144135951996,
"learning_rate": 0.0002725538971807628,
"loss": 0.5129,
"step": 341
},
{
"epoch": 0.18853362734288864,
"grad_norm": 0.26905378699302673,
"learning_rate": 0.0002724709784411277,
"loss": 0.5226,
"step": 342
},
{
"epoch": 0.18908489525909591,
"grad_norm": 0.25401249527931213,
"learning_rate": 0.00027238805970149254,
"loss": 0.5313,
"step": 343
},
{
"epoch": 0.18963616317530319,
"grad_norm": 0.24307483434677124,
"learning_rate": 0.00027230514096185734,
"loss": 0.5427,
"step": 344
},
{
"epoch": 0.19018743109151048,
"grad_norm": 0.25807374715805054,
"learning_rate": 0.0002722222222222222,
"loss": 0.524,
"step": 345
},
{
"epoch": 0.19073869900771775,
"grad_norm": 0.2321993112564087,
"learning_rate": 0.00027213930348258706,
"loss": 0.5314,
"step": 346
},
{
"epoch": 0.19128996692392503,
"grad_norm": 0.23558932542800903,
"learning_rate": 0.0002720563847429519,
"loss": 0.5223,
"step": 347
},
{
"epoch": 0.1918412348401323,
"grad_norm": 0.25960054993629456,
"learning_rate": 0.0002719734660033167,
"loss": 0.5436,
"step": 348
},
{
"epoch": 0.19239250275633957,
"grad_norm": 0.2273932248353958,
"learning_rate": 0.0002718905472636816,
"loss": 0.5048,
"step": 349
},
{
"epoch": 0.19294377067254687,
"grad_norm": 0.2279786467552185,
"learning_rate": 0.0002718076285240464,
"loss": 0.5164,
"step": 350
},
{
"epoch": 0.19349503858875414,
"grad_norm": 0.23833182454109192,
"learning_rate": 0.00027172470978441125,
"loss": 0.5378,
"step": 351
},
{
"epoch": 0.1940463065049614,
"grad_norm": 0.2499193549156189,
"learning_rate": 0.0002716417910447761,
"loss": 0.5494,
"step": 352
},
{
"epoch": 0.19459757442116868,
"grad_norm": 0.2734036147594452,
"learning_rate": 0.00027155887230514097,
"loss": 0.5391,
"step": 353
},
{
"epoch": 0.19514884233737598,
"grad_norm": 0.25754764676094055,
"learning_rate": 0.00027147595356550577,
"loss": 0.5212,
"step": 354
},
{
"epoch": 0.19570011025358325,
"grad_norm": 0.22964167594909668,
"learning_rate": 0.00027139303482587063,
"loss": 0.5301,
"step": 355
},
{
"epoch": 0.19625137816979052,
"grad_norm": 0.24985463917255402,
"learning_rate": 0.0002713101160862355,
"loss": 0.5177,
"step": 356
},
{
"epoch": 0.1968026460859978,
"grad_norm": 0.27296510338783264,
"learning_rate": 0.00027122719734660035,
"loss": 0.5443,
"step": 357
},
{
"epoch": 0.19735391400220506,
"grad_norm": 0.2506982982158661,
"learning_rate": 0.00027114427860696515,
"loss": 0.5419,
"step": 358
},
{
"epoch": 0.19790518191841236,
"grad_norm": 0.2600388526916504,
"learning_rate": 0.00027106135986733,
"loss": 0.5402,
"step": 359
},
{
"epoch": 0.19845644983461963,
"grad_norm": 0.25040823221206665,
"learning_rate": 0.0002709784411276948,
"loss": 0.5463,
"step": 360
},
{
"epoch": 0.1990077177508269,
"grad_norm": 0.25567591190338135,
"learning_rate": 0.0002708955223880597,
"loss": 0.5189,
"step": 361
},
{
"epoch": 0.19955898566703417,
"grad_norm": 0.24336600303649902,
"learning_rate": 0.00027081260364842454,
"loss": 0.5393,
"step": 362
},
{
"epoch": 0.20011025358324144,
"grad_norm": 0.23660831153392792,
"learning_rate": 0.00027072968490878934,
"loss": 0.5121,
"step": 363
},
{
"epoch": 0.20066152149944874,
"grad_norm": 0.23589812219142914,
"learning_rate": 0.0002706467661691542,
"loss": 0.5016,
"step": 364
},
{
"epoch": 0.201212789415656,
"grad_norm": 0.2517778277397156,
"learning_rate": 0.000270563847429519,
"loss": 0.5127,
"step": 365
},
{
"epoch": 0.20176405733186328,
"grad_norm": 0.263662189245224,
"learning_rate": 0.0002704809286898839,
"loss": 0.5518,
"step": 366
},
{
"epoch": 0.20231532524807055,
"grad_norm": 0.25211676955223083,
"learning_rate": 0.0002703980099502487,
"loss": 0.5362,
"step": 367
},
{
"epoch": 0.20286659316427783,
"grad_norm": 0.22718675434589386,
"learning_rate": 0.0002703150912106136,
"loss": 0.5127,
"step": 368
},
{
"epoch": 0.20341786108048512,
"grad_norm": 0.24481582641601562,
"learning_rate": 0.0002702321724709784,
"loss": 0.5084,
"step": 369
},
{
"epoch": 0.2039691289966924,
"grad_norm": 0.2656586766242981,
"learning_rate": 0.00027014925373134325,
"loss": 0.5454,
"step": 370
},
{
"epoch": 0.20452039691289967,
"grad_norm": 0.2491103559732437,
"learning_rate": 0.0002700663349917081,
"loss": 0.5412,
"step": 371
},
{
"epoch": 0.20507166482910694,
"grad_norm": 0.252030611038208,
"learning_rate": 0.00026998341625207296,
"loss": 0.5761,
"step": 372
},
{
"epoch": 0.20562293274531424,
"grad_norm": 0.24894152581691742,
"learning_rate": 0.00026990049751243777,
"loss": 0.5264,
"step": 373
},
{
"epoch": 0.2061742006615215,
"grad_norm": 0.25231489539146423,
"learning_rate": 0.00026981757877280263,
"loss": 0.5295,
"step": 374
},
{
"epoch": 0.20672546857772878,
"grad_norm": 0.25147655606269836,
"learning_rate": 0.00026973466003316743,
"loss": 0.5126,
"step": 375
},
{
"epoch": 0.20727673649393605,
"grad_norm": 0.2379835844039917,
"learning_rate": 0.0002696517412935323,
"loss": 0.4937,
"step": 376
},
{
"epoch": 0.20782800441014332,
"grad_norm": 0.24038439989089966,
"learning_rate": 0.00026956882255389715,
"loss": 0.5426,
"step": 377
},
{
"epoch": 0.20837927232635062,
"grad_norm": 0.24591150879859924,
"learning_rate": 0.000269485903814262,
"loss": 0.5191,
"step": 378
},
{
"epoch": 0.2089305402425579,
"grad_norm": 0.23723675310611725,
"learning_rate": 0.0002694029850746268,
"loss": 0.5247,
"step": 379
},
{
"epoch": 0.20948180815876516,
"grad_norm": 0.2618078887462616,
"learning_rate": 0.0002693200663349917,
"loss": 0.5559,
"step": 380
},
{
"epoch": 0.21003307607497243,
"grad_norm": 0.2556595504283905,
"learning_rate": 0.00026923714759535653,
"loss": 0.544,
"step": 381
},
{
"epoch": 0.2105843439911797,
"grad_norm": 0.24010786414146423,
"learning_rate": 0.0002691542288557214,
"loss": 0.4958,
"step": 382
},
{
"epoch": 0.211135611907387,
"grad_norm": 0.253151535987854,
"learning_rate": 0.0002690713101160862,
"loss": 0.5371,
"step": 383
},
{
"epoch": 0.21168687982359427,
"grad_norm": 0.2715364694595337,
"learning_rate": 0.00026898839137645106,
"loss": 0.5788,
"step": 384
},
{
"epoch": 0.21223814773980154,
"grad_norm": 0.2472977191209793,
"learning_rate": 0.00026890547263681586,
"loss": 0.5359,
"step": 385
},
{
"epoch": 0.2127894156560088,
"grad_norm": 0.2925645411014557,
"learning_rate": 0.0002688225538971807,
"loss": 0.5373,
"step": 386
},
{
"epoch": 0.21334068357221608,
"grad_norm": 0.23534104228019714,
"learning_rate": 0.0002687396351575456,
"loss": 0.5421,
"step": 387
},
{
"epoch": 0.21389195148842338,
"grad_norm": 0.25397318601608276,
"learning_rate": 0.00026865671641791044,
"loss": 0.5538,
"step": 388
},
{
"epoch": 0.21444321940463065,
"grad_norm": 0.26708152890205383,
"learning_rate": 0.00026857379767827524,
"loss": 0.5088,
"step": 389
},
{
"epoch": 0.21499448732083792,
"grad_norm": 0.24131494760513306,
"learning_rate": 0.0002684908789386401,
"loss": 0.5215,
"step": 390
},
{
"epoch": 0.2155457552370452,
"grad_norm": 0.25981369614601135,
"learning_rate": 0.00026840796019900496,
"loss": 0.5481,
"step": 391
},
{
"epoch": 0.2160970231532525,
"grad_norm": 0.25831639766693115,
"learning_rate": 0.0002683250414593698,
"loss": 0.5352,
"step": 392
},
{
"epoch": 0.21664829106945976,
"grad_norm": 0.24388836324214935,
"learning_rate": 0.0002682421227197346,
"loss": 0.5047,
"step": 393
},
{
"epoch": 0.21719955898566704,
"grad_norm": 0.25614237785339355,
"learning_rate": 0.0002681592039800995,
"loss": 0.5236,
"step": 394
},
{
"epoch": 0.2177508269018743,
"grad_norm": 0.23628944158554077,
"learning_rate": 0.0002680762852404643,
"loss": 0.5118,
"step": 395
},
{
"epoch": 0.21830209481808158,
"grad_norm": 0.25390875339508057,
"learning_rate": 0.00026799336650082915,
"loss": 0.5231,
"step": 396
},
{
"epoch": 0.21885336273428888,
"grad_norm": 0.27364251017570496,
"learning_rate": 0.000267910447761194,
"loss": 0.5573,
"step": 397
},
{
"epoch": 0.21940463065049615,
"grad_norm": 0.25110650062561035,
"learning_rate": 0.00026782752902155887,
"loss": 0.5078,
"step": 398
},
{
"epoch": 0.21995589856670342,
"grad_norm": 0.24438323080539703,
"learning_rate": 0.0002677446102819237,
"loss": 0.5026,
"step": 399
},
{
"epoch": 0.2205071664829107,
"grad_norm": 0.23745465278625488,
"learning_rate": 0.00026766169154228853,
"loss": 0.5568,
"step": 400
},
{
"epoch": 0.22105843439911796,
"grad_norm": 0.25559869408607483,
"learning_rate": 0.0002675787728026534,
"loss": 0.5286,
"step": 401
},
{
"epoch": 0.22160970231532526,
"grad_norm": 0.24587516486644745,
"learning_rate": 0.00026749585406301825,
"loss": 0.5258,
"step": 402
},
{
"epoch": 0.22216097023153253,
"grad_norm": 0.26151949167251587,
"learning_rate": 0.00026741293532338306,
"loss": 0.5426,
"step": 403
},
{
"epoch": 0.2227122381477398,
"grad_norm": 0.2910129427909851,
"learning_rate": 0.0002673300165837479,
"loss": 0.5376,
"step": 404
},
{
"epoch": 0.22326350606394707,
"grad_norm": 0.28276947140693665,
"learning_rate": 0.0002672470978441127,
"loss": 0.5271,
"step": 405
},
{
"epoch": 0.22381477398015434,
"grad_norm": 0.25096046924591064,
"learning_rate": 0.0002671641791044776,
"loss": 0.5439,
"step": 406
},
{
"epoch": 0.22436604189636164,
"grad_norm": 0.2461530715227127,
"learning_rate": 0.00026708126036484244,
"loss": 0.5239,
"step": 407
},
{
"epoch": 0.2249173098125689,
"grad_norm": 0.2833070456981659,
"learning_rate": 0.00026699834162520724,
"loss": 0.531,
"step": 408
},
{
"epoch": 0.22546857772877618,
"grad_norm": 0.24600760638713837,
"learning_rate": 0.0002669154228855721,
"loss": 0.5419,
"step": 409
},
{
"epoch": 0.22601984564498345,
"grad_norm": 0.2620793581008911,
"learning_rate": 0.00026683250414593696,
"loss": 0.5033,
"step": 410
},
{
"epoch": 0.22657111356119075,
"grad_norm": 0.27523407340049744,
"learning_rate": 0.0002667495854063018,
"loss": 0.5257,
"step": 411
},
{
"epoch": 0.22712238147739802,
"grad_norm": 0.2630368769168854,
"learning_rate": 0.0002666666666666666,
"loss": 0.5156,
"step": 412
},
{
"epoch": 0.2276736493936053,
"grad_norm": 0.24897338449954987,
"learning_rate": 0.0002665837479270315,
"loss": 0.5301,
"step": 413
},
{
"epoch": 0.22822491730981256,
"grad_norm": 0.26213693618774414,
"learning_rate": 0.0002665008291873963,
"loss": 0.5563,
"step": 414
},
{
"epoch": 0.22877618522601983,
"grad_norm": 0.23822888731956482,
"learning_rate": 0.00026641791044776115,
"loss": 0.5273,
"step": 415
},
{
"epoch": 0.22932745314222713,
"grad_norm": 0.22970083355903625,
"learning_rate": 0.000266334991708126,
"loss": 0.5321,
"step": 416
},
{
"epoch": 0.2298787210584344,
"grad_norm": 0.26430296897888184,
"learning_rate": 0.00026625207296849087,
"loss": 0.5539,
"step": 417
},
{
"epoch": 0.23042998897464168,
"grad_norm": 0.25960785150527954,
"learning_rate": 0.00026616915422885567,
"loss": 0.5357,
"step": 418
},
{
"epoch": 0.23098125689084895,
"grad_norm": 0.23449423909187317,
"learning_rate": 0.00026608623548922053,
"loss": 0.5143,
"step": 419
},
{
"epoch": 0.23153252480705622,
"grad_norm": 0.2795349061489105,
"learning_rate": 0.0002660033167495854,
"loss": 0.5363,
"step": 420
},
{
"epoch": 0.23208379272326352,
"grad_norm": 0.2637255787849426,
"learning_rate": 0.00026592039800995025,
"loss": 0.5607,
"step": 421
},
{
"epoch": 0.2326350606394708,
"grad_norm": 0.23269203305244446,
"learning_rate": 0.00026583747927031505,
"loss": 0.5239,
"step": 422
},
{
"epoch": 0.23318632855567806,
"grad_norm": 0.2501350939273834,
"learning_rate": 0.0002657545605306799,
"loss": 0.5303,
"step": 423
},
{
"epoch": 0.23373759647188533,
"grad_norm": 0.25998207926750183,
"learning_rate": 0.0002656716417910447,
"loss": 0.5258,
"step": 424
},
{
"epoch": 0.2342888643880926,
"grad_norm": 0.25762224197387695,
"learning_rate": 0.0002655887230514096,
"loss": 0.5427,
"step": 425
},
{
"epoch": 0.2348401323042999,
"grad_norm": 0.2542650103569031,
"learning_rate": 0.00026550580431177444,
"loss": 0.5363,
"step": 426
},
{
"epoch": 0.23539140022050717,
"grad_norm": 0.24817922711372375,
"learning_rate": 0.0002654228855721393,
"loss": 0.5294,
"step": 427
},
{
"epoch": 0.23594266813671444,
"grad_norm": 0.23553630709648132,
"learning_rate": 0.0002653399668325041,
"loss": 0.5401,
"step": 428
},
{
"epoch": 0.2364939360529217,
"grad_norm": 0.2774706184864044,
"learning_rate": 0.00026525704809286896,
"loss": 0.5352,
"step": 429
},
{
"epoch": 0.237045203969129,
"grad_norm": 0.2383023351430893,
"learning_rate": 0.0002651741293532338,
"loss": 0.5243,
"step": 430
},
{
"epoch": 0.23759647188533628,
"grad_norm": 0.23838096857070923,
"learning_rate": 0.0002650912106135987,
"loss": 0.5336,
"step": 431
},
{
"epoch": 0.23814773980154355,
"grad_norm": 0.2416170984506607,
"learning_rate": 0.0002650082918739635,
"loss": 0.5044,
"step": 432
},
{
"epoch": 0.23869900771775082,
"grad_norm": 0.24407121539115906,
"learning_rate": 0.00026492537313432834,
"loss": 0.5383,
"step": 433
},
{
"epoch": 0.2392502756339581,
"grad_norm": 0.26349690556526184,
"learning_rate": 0.00026484245439469315,
"loss": 0.5553,
"step": 434
},
{
"epoch": 0.2398015435501654,
"grad_norm": 0.27343693375587463,
"learning_rate": 0.000264759535655058,
"loss": 0.5593,
"step": 435
},
{
"epoch": 0.24035281146637266,
"grad_norm": 0.22751976549625397,
"learning_rate": 0.00026467661691542287,
"loss": 0.5254,
"step": 436
},
{
"epoch": 0.24090407938257993,
"grad_norm": 0.2342759519815445,
"learning_rate": 0.0002645936981757877,
"loss": 0.5076,
"step": 437
},
{
"epoch": 0.2414553472987872,
"grad_norm": 0.25039923191070557,
"learning_rate": 0.00026451077943615253,
"loss": 0.4816,
"step": 438
},
{
"epoch": 0.24200661521499447,
"grad_norm": 0.24585099518299103,
"learning_rate": 0.0002644278606965174,
"loss": 0.5132,
"step": 439
},
{
"epoch": 0.24255788313120177,
"grad_norm": 0.24062813818454742,
"learning_rate": 0.00026434494195688225,
"loss": 0.5152,
"step": 440
},
{
"epoch": 0.24310915104740904,
"grad_norm": 0.23549048602581024,
"learning_rate": 0.0002642620232172471,
"loss": 0.5201,
"step": 441
},
{
"epoch": 0.24366041896361632,
"grad_norm": 0.24712547659873962,
"learning_rate": 0.0002641791044776119,
"loss": 0.5252,
"step": 442
},
{
"epoch": 0.2442116868798236,
"grad_norm": 0.25113359093666077,
"learning_rate": 0.00026409618573797677,
"loss": 0.5593,
"step": 443
},
{
"epoch": 0.24476295479603086,
"grad_norm": 0.24021007120609283,
"learning_rate": 0.0002640132669983416,
"loss": 0.5338,
"step": 444
},
{
"epoch": 0.24531422271223816,
"grad_norm": 0.23334236443042755,
"learning_rate": 0.00026393034825870643,
"loss": 0.4842,
"step": 445
},
{
"epoch": 0.24586549062844543,
"grad_norm": 0.25075432658195496,
"learning_rate": 0.0002638474295190713,
"loss": 0.5498,
"step": 446
},
{
"epoch": 0.2464167585446527,
"grad_norm": 0.23466569185256958,
"learning_rate": 0.00026376451077943615,
"loss": 0.5125,
"step": 447
},
{
"epoch": 0.24696802646085997,
"grad_norm": 0.23975308239459991,
"learning_rate": 0.00026368159203980096,
"loss": 0.5315,
"step": 448
},
{
"epoch": 0.24751929437706727,
"grad_norm": 0.227213054895401,
"learning_rate": 0.0002635986733001658,
"loss": 0.4826,
"step": 449
},
{
"epoch": 0.24807056229327454,
"grad_norm": 0.23588328063488007,
"learning_rate": 0.0002635157545605307,
"loss": 0.4902,
"step": 450
},
{
"epoch": 0.2486218302094818,
"grad_norm": 0.24110263586044312,
"learning_rate": 0.00026343283582089554,
"loss": 0.5152,
"step": 451
},
{
"epoch": 0.24917309812568908,
"grad_norm": 0.24417544901371002,
"learning_rate": 0.00026334991708126034,
"loss": 0.5326,
"step": 452
},
{
"epoch": 0.24972436604189635,
"grad_norm": 0.24150699377059937,
"learning_rate": 0.00026326699834162515,
"loss": 0.547,
"step": 453
},
{
"epoch": 0.25027563395810365,
"grad_norm": 0.26009777188301086,
"learning_rate": 0.00026318407960199,
"loss": 0.5315,
"step": 454
},
{
"epoch": 0.2508269018743109,
"grad_norm": 0.2537683844566345,
"learning_rate": 0.00026310116086235486,
"loss": 0.5304,
"step": 455
},
{
"epoch": 0.2513781697905182,
"grad_norm": 0.2526278495788574,
"learning_rate": 0.0002630182421227197,
"loss": 0.5194,
"step": 456
},
{
"epoch": 0.2519294377067255,
"grad_norm": 0.24355928599834442,
"learning_rate": 0.00026293532338308453,
"loss": 0.5096,
"step": 457
},
{
"epoch": 0.25248070562293273,
"grad_norm": 0.243259459733963,
"learning_rate": 0.0002628524046434494,
"loss": 0.4971,
"step": 458
},
{
"epoch": 0.25303197353914003,
"grad_norm": 0.2597525417804718,
"learning_rate": 0.00026276948590381425,
"loss": 0.5224,
"step": 459
},
{
"epoch": 0.2535832414553473,
"grad_norm": 0.2498249113559723,
"learning_rate": 0.0002626865671641791,
"loss": 0.506,
"step": 460
},
{
"epoch": 0.2541345093715546,
"grad_norm": 0.21408714354038239,
"learning_rate": 0.0002626036484245439,
"loss": 0.5076,
"step": 461
},
{
"epoch": 0.25468577728776187,
"grad_norm": 0.25370824337005615,
"learning_rate": 0.00026252072968490877,
"loss": 0.5065,
"step": 462
},
{
"epoch": 0.2552370452039691,
"grad_norm": 0.25148823857307434,
"learning_rate": 0.0002624378109452736,
"loss": 0.4932,
"step": 463
},
{
"epoch": 0.2557883131201764,
"grad_norm": 0.24903985857963562,
"learning_rate": 0.00026235489220563843,
"loss": 0.5366,
"step": 464
},
{
"epoch": 0.25633958103638366,
"grad_norm": 0.2521916329860687,
"learning_rate": 0.0002622719734660033,
"loss": 0.5392,
"step": 465
},
{
"epoch": 0.25689084895259096,
"grad_norm": 0.24553993344306946,
"learning_rate": 0.00026218905472636815,
"loss": 0.5382,
"step": 466
},
{
"epoch": 0.25744211686879825,
"grad_norm": 0.23382090032100677,
"learning_rate": 0.00026210613598673296,
"loss": 0.523,
"step": 467
},
{
"epoch": 0.2579933847850055,
"grad_norm": 0.25337761640548706,
"learning_rate": 0.0002620232172470978,
"loss": 0.5147,
"step": 468
},
{
"epoch": 0.2585446527012128,
"grad_norm": 0.25433778762817383,
"learning_rate": 0.0002619402985074627,
"loss": 0.5012,
"step": 469
},
{
"epoch": 0.25909592061742004,
"grad_norm": 0.2362672984600067,
"learning_rate": 0.00026185737976782753,
"loss": 0.5328,
"step": 470
},
{
"epoch": 0.25964718853362734,
"grad_norm": 0.241427481174469,
"learning_rate": 0.00026177446102819234,
"loss": 0.5207,
"step": 471
},
{
"epoch": 0.26019845644983464,
"grad_norm": 0.24943798780441284,
"learning_rate": 0.0002616915422885572,
"loss": 0.5607,
"step": 472
},
{
"epoch": 0.2607497243660419,
"grad_norm": 0.21813860535621643,
"learning_rate": 0.000261608623548922,
"loss": 0.5036,
"step": 473
},
{
"epoch": 0.2613009922822492,
"grad_norm": 0.22680509090423584,
"learning_rate": 0.00026152570480928686,
"loss": 0.4765,
"step": 474
},
{
"epoch": 0.2618522601984565,
"grad_norm": 0.23577630519866943,
"learning_rate": 0.0002614427860696517,
"loss": 0.5267,
"step": 475
},
{
"epoch": 0.2624035281146637,
"grad_norm": 0.22560511529445648,
"learning_rate": 0.0002613598673300166,
"loss": 0.5089,
"step": 476
},
{
"epoch": 0.262954796030871,
"grad_norm": 0.2485722452402115,
"learning_rate": 0.0002612769485903814,
"loss": 0.5231,
"step": 477
},
{
"epoch": 0.26350606394707826,
"grad_norm": 0.2396019846200943,
"learning_rate": 0.00026119402985074624,
"loss": 0.515,
"step": 478
},
{
"epoch": 0.26405733186328556,
"grad_norm": 0.24977676570415497,
"learning_rate": 0.0002611111111111111,
"loss": 0.5303,
"step": 479
},
{
"epoch": 0.26460859977949286,
"grad_norm": 0.2788902521133423,
"learning_rate": 0.00026102819237147596,
"loss": 0.5324,
"step": 480
},
{
"epoch": 0.2651598676957001,
"grad_norm": 0.2515452802181244,
"learning_rate": 0.00026094527363184077,
"loss": 0.5373,
"step": 481
},
{
"epoch": 0.2657111356119074,
"grad_norm": 0.2408224493265152,
"learning_rate": 0.0002608623548922056,
"loss": 0.5021,
"step": 482
},
{
"epoch": 0.26626240352811464,
"grad_norm": 0.25597700476646423,
"learning_rate": 0.00026077943615257043,
"loss": 0.5292,
"step": 483
},
{
"epoch": 0.26681367144432194,
"grad_norm": 0.24885378777980804,
"learning_rate": 0.0002606965174129353,
"loss": 0.5047,
"step": 484
},
{
"epoch": 0.26736493936052924,
"grad_norm": 0.24355795979499817,
"learning_rate": 0.00026061359867330015,
"loss": 0.5258,
"step": 485
},
{
"epoch": 0.2679162072767365,
"grad_norm": 0.2580486238002777,
"learning_rate": 0.000260530679933665,
"loss": 0.5533,
"step": 486
},
{
"epoch": 0.2684674751929438,
"grad_norm": 0.27081531286239624,
"learning_rate": 0.0002604477611940298,
"loss": 0.525,
"step": 487
},
{
"epoch": 0.269018743109151,
"grad_norm": 0.2559351325035095,
"learning_rate": 0.0002603648424543947,
"loss": 0.5074,
"step": 488
},
{
"epoch": 0.2695700110253583,
"grad_norm": 0.2617773711681366,
"learning_rate": 0.00026028192371475953,
"loss": 0.5244,
"step": 489
},
{
"epoch": 0.2701212789415656,
"grad_norm": 0.23218858242034912,
"learning_rate": 0.0002601990049751244,
"loss": 0.5048,
"step": 490
},
{
"epoch": 0.27067254685777287,
"grad_norm": 0.24924521148204803,
"learning_rate": 0.0002601160862354892,
"loss": 0.521,
"step": 491
},
{
"epoch": 0.27122381477398017,
"grad_norm": 0.26815906167030334,
"learning_rate": 0.00026003316749585406,
"loss": 0.5574,
"step": 492
},
{
"epoch": 0.2717750826901874,
"grad_norm": 0.240220308303833,
"learning_rate": 0.00025995024875621886,
"loss": 0.483,
"step": 493
},
{
"epoch": 0.2723263506063947,
"grad_norm": 0.24979090690612793,
"learning_rate": 0.0002598673300165837,
"loss": 0.5262,
"step": 494
},
{
"epoch": 0.272877618522602,
"grad_norm": 0.24111522734165192,
"learning_rate": 0.0002597844112769486,
"loss": 0.5068,
"step": 495
},
{
"epoch": 0.27342888643880925,
"grad_norm": 0.2612921893596649,
"learning_rate": 0.0002597014925373134,
"loss": 0.519,
"step": 496
},
{
"epoch": 0.27398015435501655,
"grad_norm": 0.24324454367160797,
"learning_rate": 0.00025961857379767824,
"loss": 0.4826,
"step": 497
},
{
"epoch": 0.2745314222712238,
"grad_norm": 0.2406265288591385,
"learning_rate": 0.0002595356550580431,
"loss": 0.5223,
"step": 498
},
{
"epoch": 0.2750826901874311,
"grad_norm": 0.2597537934780121,
"learning_rate": 0.00025945273631840796,
"loss": 0.535,
"step": 499
},
{
"epoch": 0.2756339581036384,
"grad_norm": 0.2446909099817276,
"learning_rate": 0.00025936981757877277,
"loss": 0.5108,
"step": 500
},
{
"epoch": 0.2756339581036384,
"eval_loss": 0.5157487988471985,
"eval_runtime": 312.0533,
"eval_samples_per_second": 3.733,
"eval_steps_per_second": 0.468,
"step": 500
},
{
"epoch": 0.27618522601984563,
"grad_norm": 0.2623630166053772,
"learning_rate": 0.0002592868988391376,
"loss": 0.5414,
"step": 501
},
{
"epoch": 0.27673649393605293,
"grad_norm": 0.2578775882720947,
"learning_rate": 0.00025920398009950243,
"loss": 0.5121,
"step": 502
},
{
"epoch": 0.2772877618522602,
"grad_norm": 0.23712347447872162,
"learning_rate": 0.0002591210613598673,
"loss": 0.5085,
"step": 503
},
{
"epoch": 0.27783902976846747,
"grad_norm": 0.22108785808086395,
"learning_rate": 0.00025903814262023215,
"loss": 0.5202,
"step": 504
},
{
"epoch": 0.27839029768467477,
"grad_norm": 0.25034549832344055,
"learning_rate": 0.000258955223880597,
"loss": 0.5389,
"step": 505
},
{
"epoch": 0.278941565600882,
"grad_norm": 0.21812468767166138,
"learning_rate": 0.0002588723051409618,
"loss": 0.4994,
"step": 506
},
{
"epoch": 0.2794928335170893,
"grad_norm": 0.22681641578674316,
"learning_rate": 0.00025878938640132667,
"loss": 0.5219,
"step": 507
},
{
"epoch": 0.28004410143329656,
"grad_norm": 0.25568950176239014,
"learning_rate": 0.00025870646766169153,
"loss": 0.5188,
"step": 508
},
{
"epoch": 0.28059536934950385,
"grad_norm": 0.24642765522003174,
"learning_rate": 0.0002586235489220564,
"loss": 0.4978,
"step": 509
},
{
"epoch": 0.28114663726571115,
"grad_norm": 0.22820910811424255,
"learning_rate": 0.0002585406301824212,
"loss": 0.5168,
"step": 510
},
{
"epoch": 0.2816979051819184,
"grad_norm": 0.23360006511211395,
"learning_rate": 0.00025845771144278605,
"loss": 0.5059,
"step": 511
},
{
"epoch": 0.2822491730981257,
"grad_norm": 0.24599935114383698,
"learning_rate": 0.00025837479270315086,
"loss": 0.5293,
"step": 512
},
{
"epoch": 0.282800441014333,
"grad_norm": 0.23006513714790344,
"learning_rate": 0.0002582918739635157,
"loss": 0.5028,
"step": 513
},
{
"epoch": 0.28335170893054024,
"grad_norm": 0.22950898110866547,
"learning_rate": 0.0002582089552238806,
"loss": 0.5064,
"step": 514
},
{
"epoch": 0.28390297684674753,
"grad_norm": 0.23649993538856506,
"learning_rate": 0.00025812603648424544,
"loss": 0.515,
"step": 515
},
{
"epoch": 0.2844542447629548,
"grad_norm": 0.23335647583007812,
"learning_rate": 0.00025804311774461024,
"loss": 0.4977,
"step": 516
},
{
"epoch": 0.2850055126791621,
"grad_norm": 0.21914584934711456,
"learning_rate": 0.0002579601990049751,
"loss": 0.5018,
"step": 517
},
{
"epoch": 0.2855567805953694,
"grad_norm": 0.2474760264158249,
"learning_rate": 0.00025787728026533996,
"loss": 0.542,
"step": 518
},
{
"epoch": 0.2861080485115766,
"grad_norm": 0.24011823534965515,
"learning_rate": 0.0002577943615257048,
"loss": 0.5243,
"step": 519
},
{
"epoch": 0.2866593164277839,
"grad_norm": 0.2619330883026123,
"learning_rate": 0.0002577114427860696,
"loss": 0.5657,
"step": 520
},
{
"epoch": 0.28721058434399116,
"grad_norm": 0.2715679407119751,
"learning_rate": 0.0002576285240464345,
"loss": 0.5506,
"step": 521
},
{
"epoch": 0.28776185226019846,
"grad_norm": 0.26569628715515137,
"learning_rate": 0.0002575456053067993,
"loss": 0.5525,
"step": 522
},
{
"epoch": 0.28831312017640576,
"grad_norm": 0.23253163695335388,
"learning_rate": 0.00025746268656716415,
"loss": 0.5184,
"step": 523
},
{
"epoch": 0.288864388092613,
"grad_norm": 0.2698347866535187,
"learning_rate": 0.000257379767827529,
"loss": 0.5274,
"step": 524
},
{
"epoch": 0.2894156560088203,
"grad_norm": 0.2556426227092743,
"learning_rate": 0.00025729684908789386,
"loss": 0.5032,
"step": 525
},
{
"epoch": 0.28996692392502754,
"grad_norm": 0.252575546503067,
"learning_rate": 0.00025721393034825867,
"loss": 0.525,
"step": 526
},
{
"epoch": 0.29051819184123484,
"grad_norm": 0.26160725951194763,
"learning_rate": 0.00025713101160862353,
"loss": 0.552,
"step": 527
},
{
"epoch": 0.29106945975744214,
"grad_norm": 0.250885546207428,
"learning_rate": 0.0002570480928689884,
"loss": 0.5159,
"step": 528
},
{
"epoch": 0.2916207276736494,
"grad_norm": 0.24888747930526733,
"learning_rate": 0.00025696517412935325,
"loss": 0.5104,
"step": 529
},
{
"epoch": 0.2921719955898567,
"grad_norm": 0.2554168105125427,
"learning_rate": 0.00025688225538971805,
"loss": 0.4867,
"step": 530
},
{
"epoch": 0.2927232635060639,
"grad_norm": 0.24712808430194855,
"learning_rate": 0.0002567993366500829,
"loss": 0.5087,
"step": 531
},
{
"epoch": 0.2932745314222712,
"grad_norm": 0.26169416308403015,
"learning_rate": 0.0002567164179104477,
"loss": 0.5094,
"step": 532
},
{
"epoch": 0.2938257993384785,
"grad_norm": 0.25625213980674744,
"learning_rate": 0.0002566334991708126,
"loss": 0.5264,
"step": 533
},
{
"epoch": 0.29437706725468576,
"grad_norm": 0.22383877635002136,
"learning_rate": 0.00025655058043117743,
"loss": 0.4719,
"step": 534
},
{
"epoch": 0.29492833517089306,
"grad_norm": 0.2579217851161957,
"learning_rate": 0.0002564676616915423,
"loss": 0.5254,
"step": 535
},
{
"epoch": 0.2954796030871003,
"grad_norm": 0.25349318981170654,
"learning_rate": 0.0002563847429519071,
"loss": 0.4932,
"step": 536
},
{
"epoch": 0.2960308710033076,
"grad_norm": 0.25384828448295593,
"learning_rate": 0.00025630182421227196,
"loss": 0.51,
"step": 537
},
{
"epoch": 0.2965821389195149,
"grad_norm": 0.22186040878295898,
"learning_rate": 0.0002562189054726368,
"loss": 0.5074,
"step": 538
},
{
"epoch": 0.29713340683572215,
"grad_norm": 0.2735055685043335,
"learning_rate": 0.0002561359867330017,
"loss": 0.5151,
"step": 539
},
{
"epoch": 0.29768467475192945,
"grad_norm": 0.24992069602012634,
"learning_rate": 0.0002560530679933665,
"loss": 0.4987,
"step": 540
},
{
"epoch": 0.2982359426681367,
"grad_norm": 0.24067966639995575,
"learning_rate": 0.0002559701492537313,
"loss": 0.5434,
"step": 541
},
{
"epoch": 0.298787210584344,
"grad_norm": 0.22907654941082,
"learning_rate": 0.00025588723051409614,
"loss": 0.5091,
"step": 542
},
{
"epoch": 0.2993384785005513,
"grad_norm": 0.21983608603477478,
"learning_rate": 0.000255804311774461,
"loss": 0.5234,
"step": 543
},
{
"epoch": 0.29988974641675853,
"grad_norm": 0.2439606636762619,
"learning_rate": 0.00025572139303482586,
"loss": 0.5271,
"step": 544
},
{
"epoch": 0.30044101433296583,
"grad_norm": 0.25168585777282715,
"learning_rate": 0.00025563847429519067,
"loss": 0.4998,
"step": 545
},
{
"epoch": 0.30099228224917307,
"grad_norm": 0.22324073314666748,
"learning_rate": 0.00025555555555555553,
"loss": 0.5086,
"step": 546
},
{
"epoch": 0.30154355016538037,
"grad_norm": 0.22652758657932281,
"learning_rate": 0.0002554726368159204,
"loss": 0.5044,
"step": 547
},
{
"epoch": 0.30209481808158767,
"grad_norm": 0.2422345131635666,
"learning_rate": 0.00025538971807628525,
"loss": 0.4968,
"step": 548
},
{
"epoch": 0.3026460859977949,
"grad_norm": 0.24840863049030304,
"learning_rate": 0.00025530679933665005,
"loss": 0.5267,
"step": 549
},
{
"epoch": 0.3031973539140022,
"grad_norm": 0.26198020577430725,
"learning_rate": 0.0002552238805970149,
"loss": 0.528,
"step": 550
},
{
"epoch": 0.3037486218302095,
"grad_norm": 0.24763406813144684,
"learning_rate": 0.0002551409618573797,
"loss": 0.5387,
"step": 551
},
{
"epoch": 0.30429988974641675,
"grad_norm": 0.22976034879684448,
"learning_rate": 0.0002550580431177446,
"loss": 0.5171,
"step": 552
},
{
"epoch": 0.30485115766262405,
"grad_norm": 0.26161912083625793,
"learning_rate": 0.00025497512437810943,
"loss": 0.4956,
"step": 553
},
{
"epoch": 0.3054024255788313,
"grad_norm": 0.2695063650608063,
"learning_rate": 0.0002548922056384743,
"loss": 0.5339,
"step": 554
},
{
"epoch": 0.3059536934950386,
"grad_norm": 0.22745662927627563,
"learning_rate": 0.0002548092868988391,
"loss": 0.4769,
"step": 555
},
{
"epoch": 0.3065049614112459,
"grad_norm": 0.2539026439189911,
"learning_rate": 0.00025472636815920396,
"loss": 0.5085,
"step": 556
},
{
"epoch": 0.30705622932745313,
"grad_norm": 0.25683802366256714,
"learning_rate": 0.0002546434494195688,
"loss": 0.4828,
"step": 557
},
{
"epoch": 0.30760749724366043,
"grad_norm": 0.24806293845176697,
"learning_rate": 0.0002545605306799337,
"loss": 0.534,
"step": 558
},
{
"epoch": 0.3081587651598677,
"grad_norm": 0.24956698715686798,
"learning_rate": 0.0002544776119402985,
"loss": 0.4988,
"step": 559
},
{
"epoch": 0.308710033076075,
"grad_norm": 0.2466159611940384,
"learning_rate": 0.00025439469320066334,
"loss": 0.525,
"step": 560
},
{
"epoch": 0.3092613009922823,
"grad_norm": 0.2732326090335846,
"learning_rate": 0.00025431177446102814,
"loss": 0.5096,
"step": 561
},
{
"epoch": 0.3098125689084895,
"grad_norm": 0.257656067609787,
"learning_rate": 0.000254228855721393,
"loss": 0.5241,
"step": 562
},
{
"epoch": 0.3103638368246968,
"grad_norm": 0.2280483990907669,
"learning_rate": 0.00025414593698175786,
"loss": 0.5051,
"step": 563
},
{
"epoch": 0.31091510474090406,
"grad_norm": 0.24017442762851715,
"learning_rate": 0.0002540630182421227,
"loss": 0.4923,
"step": 564
},
{
"epoch": 0.31146637265711136,
"grad_norm": 0.27770093083381653,
"learning_rate": 0.0002539800995024875,
"loss": 0.5068,
"step": 565
},
{
"epoch": 0.31201764057331866,
"grad_norm": 0.2428130954504013,
"learning_rate": 0.0002538971807628524,
"loss": 0.5223,
"step": 566
},
{
"epoch": 0.3125689084895259,
"grad_norm": 0.24798986315727234,
"learning_rate": 0.00025381426202321724,
"loss": 0.5269,
"step": 567
},
{
"epoch": 0.3131201764057332,
"grad_norm": 0.2388242930173874,
"learning_rate": 0.0002537313432835821,
"loss": 0.5328,
"step": 568
},
{
"epoch": 0.31367144432194044,
"grad_norm": 0.24993616342544556,
"learning_rate": 0.0002536484245439469,
"loss": 0.523,
"step": 569
},
{
"epoch": 0.31422271223814774,
"grad_norm": 0.22417233884334564,
"learning_rate": 0.00025356550580431177,
"loss": 0.5162,
"step": 570
},
{
"epoch": 0.31477398015435504,
"grad_norm": 0.25001853704452515,
"learning_rate": 0.00025348258706467657,
"loss": 0.5172,
"step": 571
},
{
"epoch": 0.3153252480705623,
"grad_norm": 0.24982157349586487,
"learning_rate": 0.00025339966832504143,
"loss": 0.516,
"step": 572
},
{
"epoch": 0.3158765159867696,
"grad_norm": 0.23938202857971191,
"learning_rate": 0.0002533167495854063,
"loss": 0.4984,
"step": 573
},
{
"epoch": 0.3164277839029768,
"grad_norm": 0.23941190540790558,
"learning_rate": 0.00025323383084577115,
"loss": 0.5285,
"step": 574
},
{
"epoch": 0.3169790518191841,
"grad_norm": 0.26152345538139343,
"learning_rate": 0.00025315091210613595,
"loss": 0.5354,
"step": 575
},
{
"epoch": 0.3175303197353914,
"grad_norm": 0.2364695519208908,
"learning_rate": 0.0002530679933665008,
"loss": 0.4926,
"step": 576
},
{
"epoch": 0.31808158765159866,
"grad_norm": 0.2498009353876114,
"learning_rate": 0.00025298507462686567,
"loss": 0.4879,
"step": 577
},
{
"epoch": 0.31863285556780596,
"grad_norm": 0.2434455007314682,
"learning_rate": 0.00025290215588723053,
"loss": 0.4941,
"step": 578
},
{
"epoch": 0.3191841234840132,
"grad_norm": 0.2500743269920349,
"learning_rate": 0.00025281923714759534,
"loss": 0.5224,
"step": 579
},
{
"epoch": 0.3197353914002205,
"grad_norm": 0.24151727557182312,
"learning_rate": 0.0002527363184079602,
"loss": 0.5056,
"step": 580
},
{
"epoch": 0.3202866593164278,
"grad_norm": 0.23307417333126068,
"learning_rate": 0.000252653399668325,
"loss": 0.4944,
"step": 581
},
{
"epoch": 0.32083792723263505,
"grad_norm": 0.25184640288352966,
"learning_rate": 0.00025257048092868986,
"loss": 0.5471,
"step": 582
},
{
"epoch": 0.32138919514884234,
"grad_norm": 0.21968768537044525,
"learning_rate": 0.0002524875621890547,
"loss": 0.4773,
"step": 583
},
{
"epoch": 0.3219404630650496,
"grad_norm": 0.22851119935512543,
"learning_rate": 0.0002524046434494195,
"loss": 0.4964,
"step": 584
},
{
"epoch": 0.3224917309812569,
"grad_norm": 0.2595960795879364,
"learning_rate": 0.0002523217247097844,
"loss": 0.5109,
"step": 585
},
{
"epoch": 0.3230429988974642,
"grad_norm": 0.25090447068214417,
"learning_rate": 0.00025223880597014924,
"loss": 0.4932,
"step": 586
},
{
"epoch": 0.3235942668136714,
"grad_norm": 0.24583864212036133,
"learning_rate": 0.0002521558872305141,
"loss": 0.4779,
"step": 587
},
{
"epoch": 0.3241455347298787,
"grad_norm": 0.23779521882534027,
"learning_rate": 0.0002520729684908789,
"loss": 0.4925,
"step": 588
},
{
"epoch": 0.324696802646086,
"grad_norm": 0.2614596486091614,
"learning_rate": 0.00025199004975124377,
"loss": 0.5064,
"step": 589
},
{
"epoch": 0.32524807056229327,
"grad_norm": 0.2449434995651245,
"learning_rate": 0.00025190713101160857,
"loss": 0.4768,
"step": 590
},
{
"epoch": 0.32579933847850057,
"grad_norm": 0.24249720573425293,
"learning_rate": 0.00025182421227197343,
"loss": 0.5183,
"step": 591
},
{
"epoch": 0.3263506063947078,
"grad_norm": 0.2366262972354889,
"learning_rate": 0.0002517412935323383,
"loss": 0.5119,
"step": 592
},
{
"epoch": 0.3269018743109151,
"grad_norm": 0.2465352565050125,
"learning_rate": 0.00025165837479270315,
"loss": 0.5133,
"step": 593
},
{
"epoch": 0.3274531422271224,
"grad_norm": 0.24108771979808807,
"learning_rate": 0.00025157545605306795,
"loss": 0.5139,
"step": 594
},
{
"epoch": 0.32800441014332965,
"grad_norm": 0.25272470712661743,
"learning_rate": 0.0002514925373134328,
"loss": 0.5161,
"step": 595
},
{
"epoch": 0.32855567805953695,
"grad_norm": 0.23254331946372986,
"learning_rate": 0.00025140961857379767,
"loss": 0.5048,
"step": 596
},
{
"epoch": 0.3291069459757442,
"grad_norm": 0.24523723125457764,
"learning_rate": 0.00025132669983416253,
"loss": 0.5234,
"step": 597
},
{
"epoch": 0.3296582138919515,
"grad_norm": 0.2396179735660553,
"learning_rate": 0.00025124378109452733,
"loss": 0.4865,
"step": 598
},
{
"epoch": 0.3302094818081588,
"grad_norm": 0.24812306463718414,
"learning_rate": 0.0002511608623548922,
"loss": 0.5262,
"step": 599
},
{
"epoch": 0.33076074972436603,
"grad_norm": 0.21982058882713318,
"learning_rate": 0.000251077943615257,
"loss": 0.5067,
"step": 600
},
{
"epoch": 0.33131201764057333,
"grad_norm": 0.23328660428524017,
"learning_rate": 0.00025099502487562186,
"loss": 0.5166,
"step": 601
},
{
"epoch": 0.3318632855567806,
"grad_norm": 0.23042722046375275,
"learning_rate": 0.0002509121061359867,
"loss": 0.4754,
"step": 602
},
{
"epoch": 0.3324145534729879,
"grad_norm": 0.2361726462841034,
"learning_rate": 0.0002508291873963516,
"loss": 0.5048,
"step": 603
},
{
"epoch": 0.33296582138919517,
"grad_norm": 0.22569622099399567,
"learning_rate": 0.0002507462686567164,
"loss": 0.5272,
"step": 604
},
{
"epoch": 0.3335170893054024,
"grad_norm": 0.28286513686180115,
"learning_rate": 0.00025066334991708124,
"loss": 0.5316,
"step": 605
},
{
"epoch": 0.3340683572216097,
"grad_norm": 0.2402937114238739,
"learning_rate": 0.0002505804311774461,
"loss": 0.5213,
"step": 606
},
{
"epoch": 0.33461962513781696,
"grad_norm": 0.23157329857349396,
"learning_rate": 0.00025049751243781096,
"loss": 0.5259,
"step": 607
},
{
"epoch": 0.33517089305402425,
"grad_norm": 0.24995861947536469,
"learning_rate": 0.00025041459369817576,
"loss": 0.4986,
"step": 608
},
{
"epoch": 0.33572216097023155,
"grad_norm": 0.2656213939189911,
"learning_rate": 0.0002503316749585406,
"loss": 0.4951,
"step": 609
},
{
"epoch": 0.3362734288864388,
"grad_norm": 0.2361687421798706,
"learning_rate": 0.00025024875621890543,
"loss": 0.4897,
"step": 610
},
{
"epoch": 0.3368246968026461,
"grad_norm": 0.23117870092391968,
"learning_rate": 0.0002501658374792703,
"loss": 0.5115,
"step": 611
},
{
"epoch": 0.33737596471885334,
"grad_norm": 0.2605067491531372,
"learning_rate": 0.00025008291873963515,
"loss": 0.4969,
"step": 612
},
{
"epoch": 0.33792723263506064,
"grad_norm": 0.2486005276441574,
"learning_rate": 0.00025,
"loss": 0.4853,
"step": 613
},
{
"epoch": 0.33847850055126794,
"grad_norm": 0.2559118866920471,
"learning_rate": 0.0002499170812603648,
"loss": 0.5279,
"step": 614
},
{
"epoch": 0.3390297684674752,
"grad_norm": 0.2579089403152466,
"learning_rate": 0.00024983416252072967,
"loss": 0.4942,
"step": 615
},
{
"epoch": 0.3395810363836825,
"grad_norm": 0.24982236325740814,
"learning_rate": 0.0002497512437810945,
"loss": 0.5061,
"step": 616
},
{
"epoch": 0.3401323042998897,
"grad_norm": 0.22861437499523163,
"learning_rate": 0.0002496683250414594,
"loss": 0.4935,
"step": 617
},
{
"epoch": 0.340683572216097,
"grad_norm": 0.26352861523628235,
"learning_rate": 0.0002495854063018242,
"loss": 0.4989,
"step": 618
},
{
"epoch": 0.3412348401323043,
"grad_norm": 0.26364725828170776,
"learning_rate": 0.00024950248756218905,
"loss": 0.5178,
"step": 619
},
{
"epoch": 0.34178610804851156,
"grad_norm": 0.2375265508890152,
"learning_rate": 0.00024941956882255386,
"loss": 0.5081,
"step": 620
},
{
"epoch": 0.34233737596471886,
"grad_norm": 0.24559634923934937,
"learning_rate": 0.0002493366500829187,
"loss": 0.5231,
"step": 621
},
{
"epoch": 0.3428886438809261,
"grad_norm": 0.25992295145988464,
"learning_rate": 0.0002492537313432836,
"loss": 0.4919,
"step": 622
},
{
"epoch": 0.3434399117971334,
"grad_norm": 0.2260003536939621,
"learning_rate": 0.00024917081260364843,
"loss": 0.4798,
"step": 623
},
{
"epoch": 0.3439911797133407,
"grad_norm": 0.24474291503429413,
"learning_rate": 0.00024908789386401324,
"loss": 0.5063,
"step": 624
},
{
"epoch": 0.34454244762954794,
"grad_norm": 0.27368757128715515,
"learning_rate": 0.0002490049751243781,
"loss": 0.5138,
"step": 625
},
{
"epoch": 0.34509371554575524,
"grad_norm": 0.23762589693069458,
"learning_rate": 0.0002489220563847429,
"loss": 0.4739,
"step": 626
},
{
"epoch": 0.34564498346196254,
"grad_norm": 0.26609158515930176,
"learning_rate": 0.00024883913764510776,
"loss": 0.5017,
"step": 627
},
{
"epoch": 0.3461962513781698,
"grad_norm": 0.26183345913887024,
"learning_rate": 0.0002487562189054726,
"loss": 0.5278,
"step": 628
},
{
"epoch": 0.3467475192943771,
"grad_norm": 0.254160076379776,
"learning_rate": 0.0002486733001658374,
"loss": 0.5178,
"step": 629
},
{
"epoch": 0.3472987872105843,
"grad_norm": 0.23745757341384888,
"learning_rate": 0.0002485903814262023,
"loss": 0.5152,
"step": 630
},
{
"epoch": 0.3478500551267916,
"grad_norm": 0.24215815961360931,
"learning_rate": 0.00024850746268656714,
"loss": 0.4821,
"step": 631
},
{
"epoch": 0.3484013230429989,
"grad_norm": 0.2696283459663391,
"learning_rate": 0.000248424543946932,
"loss": 0.4868,
"step": 632
},
{
"epoch": 0.34895259095920617,
"grad_norm": 0.2615061402320862,
"learning_rate": 0.0002483416252072968,
"loss": 0.5066,
"step": 633
},
{
"epoch": 0.34950385887541346,
"grad_norm": 0.2618487775325775,
"learning_rate": 0.00024825870646766167,
"loss": 0.5084,
"step": 634
},
{
"epoch": 0.3500551267916207,
"grad_norm": 0.2500843107700348,
"learning_rate": 0.00024817578772802647,
"loss": 0.5065,
"step": 635
},
{
"epoch": 0.350606394707828,
"grad_norm": 0.2559143304824829,
"learning_rate": 0.00024809286898839133,
"loss": 0.5058,
"step": 636
},
{
"epoch": 0.3511576626240353,
"grad_norm": 0.2498316466808319,
"learning_rate": 0.0002480099502487562,
"loss": 0.5033,
"step": 637
},
{
"epoch": 0.35170893054024255,
"grad_norm": 0.2778237760066986,
"learning_rate": 0.00024792703150912105,
"loss": 0.5319,
"step": 638
},
{
"epoch": 0.35226019845644985,
"grad_norm": 0.22850993275642395,
"learning_rate": 0.00024784411276948585,
"loss": 0.4852,
"step": 639
},
{
"epoch": 0.3528114663726571,
"grad_norm": 0.22482328116893768,
"learning_rate": 0.0002477611940298507,
"loss": 0.5044,
"step": 640
},
{
"epoch": 0.3533627342888644,
"grad_norm": 0.2470054179430008,
"learning_rate": 0.0002476782752902156,
"loss": 0.5119,
"step": 641
},
{
"epoch": 0.3539140022050717,
"grad_norm": 0.26223158836364746,
"learning_rate": 0.00024759535655058043,
"loss": 0.5276,
"step": 642
},
{
"epoch": 0.35446527012127893,
"grad_norm": 0.25175783038139343,
"learning_rate": 0.00024751243781094524,
"loss": 0.4963,
"step": 643
},
{
"epoch": 0.35501653803748623,
"grad_norm": 0.26237010955810547,
"learning_rate": 0.0002474295190713101,
"loss": 0.4989,
"step": 644
},
{
"epoch": 0.35556780595369347,
"grad_norm": 0.23380139470100403,
"learning_rate": 0.0002473466003316749,
"loss": 0.5143,
"step": 645
},
{
"epoch": 0.35611907386990077,
"grad_norm": 0.23414726555347443,
"learning_rate": 0.00024726368159203976,
"loss": 0.4837,
"step": 646
},
{
"epoch": 0.35667034178610807,
"grad_norm": 0.2426154464483261,
"learning_rate": 0.0002471807628524046,
"loss": 0.4953,
"step": 647
},
{
"epoch": 0.3572216097023153,
"grad_norm": 0.25034722685813904,
"learning_rate": 0.0002470978441127695,
"loss": 0.505,
"step": 648
},
{
"epoch": 0.3577728776185226,
"grad_norm": 0.21789918839931488,
"learning_rate": 0.0002470149253731343,
"loss": 0.5121,
"step": 649
},
{
"epoch": 0.35832414553472985,
"grad_norm": 0.2339979112148285,
"learning_rate": 0.00024693200663349914,
"loss": 0.5065,
"step": 650
},
{
"epoch": 0.35887541345093715,
"grad_norm": 0.22365735471248627,
"learning_rate": 0.000246849087893864,
"loss": 0.4952,
"step": 651
},
{
"epoch": 0.35942668136714445,
"grad_norm": 0.2149263620376587,
"learning_rate": 0.00024676616915422886,
"loss": 0.4677,
"step": 652
},
{
"epoch": 0.3599779492833517,
"grad_norm": 0.2143101543188095,
"learning_rate": 0.00024668325041459367,
"loss": 0.4881,
"step": 653
},
{
"epoch": 0.360529217199559,
"grad_norm": 0.23739519715309143,
"learning_rate": 0.0002466003316749585,
"loss": 0.5006,
"step": 654
},
{
"epoch": 0.36108048511576624,
"grad_norm": 0.24234917759895325,
"learning_rate": 0.00024651741293532333,
"loss": 0.5206,
"step": 655
},
{
"epoch": 0.36163175303197354,
"grad_norm": 0.2366551011800766,
"learning_rate": 0.0002464344941956882,
"loss": 0.5075,
"step": 656
},
{
"epoch": 0.36218302094818083,
"grad_norm": 0.2543952465057373,
"learning_rate": 0.00024635157545605305,
"loss": 0.4985,
"step": 657
},
{
"epoch": 0.3627342888643881,
"grad_norm": 0.24470911920070648,
"learning_rate": 0.0002462686567164179,
"loss": 0.5128,
"step": 658
},
{
"epoch": 0.3632855567805954,
"grad_norm": 0.22214102745056152,
"learning_rate": 0.0002461857379767827,
"loss": 0.5125,
"step": 659
},
{
"epoch": 0.3638368246968026,
"grad_norm": 0.24312040209770203,
"learning_rate": 0.00024610281923714757,
"loss": 0.4936,
"step": 660
},
{
"epoch": 0.3643880926130099,
"grad_norm": 0.25986719131469727,
"learning_rate": 0.00024601990049751243,
"loss": 0.5347,
"step": 661
},
{
"epoch": 0.3649393605292172,
"grad_norm": 0.22576284408569336,
"learning_rate": 0.0002459369817578773,
"loss": 0.4747,
"step": 662
},
{
"epoch": 0.36549062844542446,
"grad_norm": 0.257548451423645,
"learning_rate": 0.0002458540630182421,
"loss": 0.5083,
"step": 663
},
{
"epoch": 0.36604189636163176,
"grad_norm": 0.26048266887664795,
"learning_rate": 0.00024577114427860695,
"loss": 0.539,
"step": 664
},
{
"epoch": 0.36659316427783906,
"grad_norm": 0.2594940662384033,
"learning_rate": 0.00024568822553897176,
"loss": 0.5003,
"step": 665
},
{
"epoch": 0.3671444321940463,
"grad_norm": 0.2651066482067108,
"learning_rate": 0.0002456053067993366,
"loss": 0.4979,
"step": 666
},
{
"epoch": 0.3676957001102536,
"grad_norm": 0.2542423903942108,
"learning_rate": 0.0002455223880597015,
"loss": 0.5338,
"step": 667
},
{
"epoch": 0.36824696802646084,
"grad_norm": 0.24032056331634521,
"learning_rate": 0.00024543946932006634,
"loss": 0.5101,
"step": 668
},
{
"epoch": 0.36879823594266814,
"grad_norm": 0.26019784808158875,
"learning_rate": 0.00024535655058043114,
"loss": 0.5217,
"step": 669
},
{
"epoch": 0.36934950385887544,
"grad_norm": 0.24449752271175385,
"learning_rate": 0.000245273631840796,
"loss": 0.5318,
"step": 670
},
{
"epoch": 0.3699007717750827,
"grad_norm": 0.22685208916664124,
"learning_rate": 0.00024519071310116086,
"loss": 0.5186,
"step": 671
},
{
"epoch": 0.37045203969129,
"grad_norm": 0.2340528517961502,
"learning_rate": 0.00024510779436152566,
"loss": 0.4879,
"step": 672
},
{
"epoch": 0.3710033076074972,
"grad_norm": 0.2637344002723694,
"learning_rate": 0.0002450248756218905,
"loss": 0.5225,
"step": 673
},
{
"epoch": 0.3715545755237045,
"grad_norm": 0.2515370845794678,
"learning_rate": 0.00024494195688225533,
"loss": 0.4913,
"step": 674
},
{
"epoch": 0.3721058434399118,
"grad_norm": 0.22438743710517883,
"learning_rate": 0.0002448590381426202,
"loss": 0.4733,
"step": 675
},
{
"epoch": 0.37265711135611906,
"grad_norm": 0.24447986483573914,
"learning_rate": 0.00024477611940298505,
"loss": 0.5138,
"step": 676
},
{
"epoch": 0.37320837927232636,
"grad_norm": 0.2652420699596405,
"learning_rate": 0.0002446932006633499,
"loss": 0.4897,
"step": 677
},
{
"epoch": 0.3737596471885336,
"grad_norm": 0.23273025453090668,
"learning_rate": 0.0002446102819237147,
"loss": 0.4823,
"step": 678
},
{
"epoch": 0.3743109151047409,
"grad_norm": 0.24014912545681,
"learning_rate": 0.00024452736318407957,
"loss": 0.4963,
"step": 679
},
{
"epoch": 0.3748621830209482,
"grad_norm": 0.2454654574394226,
"learning_rate": 0.00024444444444444443,
"loss": 0.5367,
"step": 680
},
{
"epoch": 0.37541345093715545,
"grad_norm": 0.23897579312324524,
"learning_rate": 0.0002443615257048093,
"loss": 0.5038,
"step": 681
},
{
"epoch": 0.37596471885336274,
"grad_norm": 0.25277066230773926,
"learning_rate": 0.0002442786069651741,
"loss": 0.506,
"step": 682
},
{
"epoch": 0.37651598676957,
"grad_norm": 0.22470998764038086,
"learning_rate": 0.00024419568822553895,
"loss": 0.5038,
"step": 683
},
{
"epoch": 0.3770672546857773,
"grad_norm": 0.2490270882844925,
"learning_rate": 0.00024411276948590378,
"loss": 0.5073,
"step": 684
},
{
"epoch": 0.3776185226019846,
"grad_norm": 0.23964819312095642,
"learning_rate": 0.00024402985074626864,
"loss": 0.4932,
"step": 685
},
{
"epoch": 0.37816979051819183,
"grad_norm": 0.2595767676830292,
"learning_rate": 0.00024394693200663348,
"loss": 0.5263,
"step": 686
},
{
"epoch": 0.3787210584343991,
"grad_norm": 0.23740339279174805,
"learning_rate": 0.00024386401326699833,
"loss": 0.5019,
"step": 687
},
{
"epoch": 0.37927232635060637,
"grad_norm": 0.23046371340751648,
"learning_rate": 0.00024378109452736314,
"loss": 0.5071,
"step": 688
},
{
"epoch": 0.37982359426681367,
"grad_norm": 0.24483554065227509,
"learning_rate": 0.000243698175787728,
"loss": 0.4978,
"step": 689
},
{
"epoch": 0.38037486218302097,
"grad_norm": 0.23441949486732483,
"learning_rate": 0.00024361525704809283,
"loss": 0.5217,
"step": 690
},
{
"epoch": 0.3809261300992282,
"grad_norm": 0.23334890604019165,
"learning_rate": 0.0002435323383084577,
"loss": 0.4826,
"step": 691
},
{
"epoch": 0.3814773980154355,
"grad_norm": 0.2869088053703308,
"learning_rate": 0.00024344941956882252,
"loss": 0.5199,
"step": 692
},
{
"epoch": 0.38202866593164275,
"grad_norm": 0.22842839360237122,
"learning_rate": 0.00024336650082918738,
"loss": 0.4586,
"step": 693
},
{
"epoch": 0.38257993384785005,
"grad_norm": 0.23558756709098816,
"learning_rate": 0.0002432835820895522,
"loss": 0.4775,
"step": 694
},
{
"epoch": 0.38313120176405735,
"grad_norm": 0.2528475821018219,
"learning_rate": 0.00024320066334991707,
"loss": 0.5068,
"step": 695
},
{
"epoch": 0.3836824696802646,
"grad_norm": 0.2580317258834839,
"learning_rate": 0.0002431177446102819,
"loss": 0.52,
"step": 696
},
{
"epoch": 0.3842337375964719,
"grad_norm": 0.23449361324310303,
"learning_rate": 0.00024303482587064676,
"loss": 0.4776,
"step": 697
},
{
"epoch": 0.38478500551267913,
"grad_norm": 0.2365398108959198,
"learning_rate": 0.00024295190713101157,
"loss": 0.5063,
"step": 698
},
{
"epoch": 0.38533627342888643,
"grad_norm": 0.24017611145973206,
"learning_rate": 0.00024286898839137643,
"loss": 0.4989,
"step": 699
},
{
"epoch": 0.38588754134509373,
"grad_norm": 0.237211212515831,
"learning_rate": 0.00024278606965174126,
"loss": 0.4942,
"step": 700
},
{
"epoch": 0.386438809261301,
"grad_norm": 0.24133196473121643,
"learning_rate": 0.00024270315091210612,
"loss": 0.4991,
"step": 701
},
{
"epoch": 0.3869900771775083,
"grad_norm": 0.23730522394180298,
"learning_rate": 0.00024262023217247095,
"loss": 0.4847,
"step": 702
},
{
"epoch": 0.3875413450937156,
"grad_norm": 0.23267106711864471,
"learning_rate": 0.0002425373134328358,
"loss": 0.5304,
"step": 703
},
{
"epoch": 0.3880926130099228,
"grad_norm": 0.22734446823596954,
"learning_rate": 0.00024245439469320064,
"loss": 0.4752,
"step": 704
},
{
"epoch": 0.3886438809261301,
"grad_norm": 0.24138008058071136,
"learning_rate": 0.0002423714759535655,
"loss": 0.4831,
"step": 705
},
{
"epoch": 0.38919514884233736,
"grad_norm": 0.24015116691589355,
"learning_rate": 0.00024228855721393033,
"loss": 0.506,
"step": 706
},
{
"epoch": 0.38974641675854466,
"grad_norm": 0.23817308247089386,
"learning_rate": 0.0002422056384742952,
"loss": 0.4868,
"step": 707
},
{
"epoch": 0.39029768467475195,
"grad_norm": 0.21546156704425812,
"learning_rate": 0.00024212271973466,
"loss": 0.5102,
"step": 708
},
{
"epoch": 0.3908489525909592,
"grad_norm": 0.2489834874868393,
"learning_rate": 0.00024203980099502486,
"loss": 0.4985,
"step": 709
},
{
"epoch": 0.3914002205071665,
"grad_norm": 0.23067452013492584,
"learning_rate": 0.0002419568822553897,
"loss": 0.4985,
"step": 710
},
{
"epoch": 0.39195148842337374,
"grad_norm": 0.24763309955596924,
"learning_rate": 0.00024187396351575455,
"loss": 0.5124,
"step": 711
},
{
"epoch": 0.39250275633958104,
"grad_norm": 0.2439269721508026,
"learning_rate": 0.00024179104477611938,
"loss": 0.4939,
"step": 712
},
{
"epoch": 0.39305402425578834,
"grad_norm": 0.23163112998008728,
"learning_rate": 0.00024170812603648424,
"loss": 0.4954,
"step": 713
},
{
"epoch": 0.3936052921719956,
"grad_norm": 0.24170540273189545,
"learning_rate": 0.00024162520729684907,
"loss": 0.4947,
"step": 714
},
{
"epoch": 0.3941565600882029,
"grad_norm": 0.23549963533878326,
"learning_rate": 0.00024154228855721393,
"loss": 0.5132,
"step": 715
},
{
"epoch": 0.3947078280044101,
"grad_norm": 0.2394574135541916,
"learning_rate": 0.00024145936981757876,
"loss": 0.5153,
"step": 716
},
{
"epoch": 0.3952590959206174,
"grad_norm": 0.2615318298339844,
"learning_rate": 0.00024137645107794357,
"loss": 0.4971,
"step": 717
},
{
"epoch": 0.3958103638368247,
"grad_norm": 0.2353423684835434,
"learning_rate": 0.00024129353233830843,
"loss": 0.4966,
"step": 718
},
{
"epoch": 0.39636163175303196,
"grad_norm": 0.22130148112773895,
"learning_rate": 0.00024121061359867326,
"loss": 0.4487,
"step": 719
},
{
"epoch": 0.39691289966923926,
"grad_norm": 0.234688401222229,
"learning_rate": 0.00024112769485903812,
"loss": 0.499,
"step": 720
},
{
"epoch": 0.3974641675854465,
"grad_norm": 0.23247137665748596,
"learning_rate": 0.00024104477611940295,
"loss": 0.4944,
"step": 721
},
{
"epoch": 0.3980154355016538,
"grad_norm": 0.2362777143716812,
"learning_rate": 0.0002409618573797678,
"loss": 0.481,
"step": 722
},
{
"epoch": 0.3985667034178611,
"grad_norm": 0.24181120097637177,
"learning_rate": 0.00024087893864013264,
"loss": 0.5211,
"step": 723
},
{
"epoch": 0.39911797133406834,
"grad_norm": 0.22298705577850342,
"learning_rate": 0.0002407960199004975,
"loss": 0.4888,
"step": 724
},
{
"epoch": 0.39966923925027564,
"grad_norm": 0.2304617017507553,
"learning_rate": 0.00024071310116086233,
"loss": 0.4811,
"step": 725
},
{
"epoch": 0.4002205071664829,
"grad_norm": 0.24691155552864075,
"learning_rate": 0.0002406301824212272,
"loss": 0.5189,
"step": 726
},
{
"epoch": 0.4007717750826902,
"grad_norm": 0.25604429841041565,
"learning_rate": 0.000240547263681592,
"loss": 0.4927,
"step": 727
},
{
"epoch": 0.4013230429988975,
"grad_norm": 0.2280474603176117,
"learning_rate": 0.00024046434494195685,
"loss": 0.4882,
"step": 728
},
{
"epoch": 0.4018743109151047,
"grad_norm": 0.23425596952438354,
"learning_rate": 0.0002403814262023217,
"loss": 0.4875,
"step": 729
},
{
"epoch": 0.402425578831312,
"grad_norm": 0.26156267523765564,
"learning_rate": 0.00024029850746268655,
"loss": 0.5087,
"step": 730
},
{
"epoch": 0.40297684674751927,
"grad_norm": 0.23172809183597565,
"learning_rate": 0.00024021558872305138,
"loss": 0.5024,
"step": 731
},
{
"epoch": 0.40352811466372657,
"grad_norm": 0.23358501493930817,
"learning_rate": 0.00024013266998341624,
"loss": 0.4972,
"step": 732
},
{
"epoch": 0.40407938257993387,
"grad_norm": 0.23836782574653625,
"learning_rate": 0.00024004975124378107,
"loss": 0.5061,
"step": 733
},
{
"epoch": 0.4046306504961411,
"grad_norm": 0.23341165482997894,
"learning_rate": 0.00023996683250414593,
"loss": 0.4927,
"step": 734
},
{
"epoch": 0.4051819184123484,
"grad_norm": 0.2267657369375229,
"learning_rate": 0.00023988391376451076,
"loss": 0.4884,
"step": 735
},
{
"epoch": 0.40573318632855565,
"grad_norm": 0.23333032429218292,
"learning_rate": 0.00023980099502487562,
"loss": 0.4764,
"step": 736
},
{
"epoch": 0.40628445424476295,
"grad_norm": 0.24722862243652344,
"learning_rate": 0.00023971807628524042,
"loss": 0.5168,
"step": 737
},
{
"epoch": 0.40683572216097025,
"grad_norm": 0.24919219315052032,
"learning_rate": 0.00023963515754560528,
"loss": 0.4953,
"step": 738
},
{
"epoch": 0.4073869900771775,
"grad_norm": 0.22673016786575317,
"learning_rate": 0.00023955223880597012,
"loss": 0.4883,
"step": 739
},
{
"epoch": 0.4079382579933848,
"grad_norm": 0.22796331346035004,
"learning_rate": 0.00023946932006633497,
"loss": 0.4683,
"step": 740
},
{
"epoch": 0.4084895259095921,
"grad_norm": 0.23972417414188385,
"learning_rate": 0.0002393864013266998,
"loss": 0.4919,
"step": 741
},
{
"epoch": 0.40904079382579933,
"grad_norm": 0.23933400213718414,
"learning_rate": 0.00023930348258706467,
"loss": 0.5053,
"step": 742
},
{
"epoch": 0.40959206174200663,
"grad_norm": 0.24868054687976837,
"learning_rate": 0.0002392205638474295,
"loss": 0.4854,
"step": 743
},
{
"epoch": 0.4101433296582139,
"grad_norm": 0.23096708953380585,
"learning_rate": 0.00023913764510779436,
"loss": 0.4739,
"step": 744
},
{
"epoch": 0.41069459757442117,
"grad_norm": 0.2553226947784424,
"learning_rate": 0.0002390547263681592,
"loss": 0.4679,
"step": 745
},
{
"epoch": 0.41124586549062847,
"grad_norm": 0.24697932600975037,
"learning_rate": 0.00023897180762852405,
"loss": 0.4858,
"step": 746
},
{
"epoch": 0.4117971334068357,
"grad_norm": 0.2418091893196106,
"learning_rate": 0.00023888888888888885,
"loss": 0.5172,
"step": 747
},
{
"epoch": 0.412348401323043,
"grad_norm": 0.24144020676612854,
"learning_rate": 0.0002388059701492537,
"loss": 0.4711,
"step": 748
},
{
"epoch": 0.41289966923925026,
"grad_norm": 0.24137695133686066,
"learning_rate": 0.00023872305140961854,
"loss": 0.5106,
"step": 749
},
{
"epoch": 0.41345093715545755,
"grad_norm": 0.220285102725029,
"learning_rate": 0.0002386401326699834,
"loss": 0.4704,
"step": 750
},
{
"epoch": 0.41400220507166485,
"grad_norm": 0.24430547654628754,
"learning_rate": 0.00023855721393034824,
"loss": 0.5038,
"step": 751
},
{
"epoch": 0.4145534729878721,
"grad_norm": 0.24019300937652588,
"learning_rate": 0.0002384742951907131,
"loss": 0.4949,
"step": 752
},
{
"epoch": 0.4151047409040794,
"grad_norm": 0.22668643295764923,
"learning_rate": 0.00023839137645107793,
"loss": 0.4718,
"step": 753
},
{
"epoch": 0.41565600882028664,
"grad_norm": 0.2277330756187439,
"learning_rate": 0.00023830845771144279,
"loss": 0.514,
"step": 754
},
{
"epoch": 0.41620727673649394,
"grad_norm": 0.2215653359889984,
"learning_rate": 0.00023822553897180762,
"loss": 0.4873,
"step": 755
},
{
"epoch": 0.41675854465270123,
"grad_norm": 0.22386564314365387,
"learning_rate": 0.00023814262023217248,
"loss": 0.4824,
"step": 756
},
{
"epoch": 0.4173098125689085,
"grad_norm": 0.2562282681465149,
"learning_rate": 0.00023805970149253728,
"loss": 0.5177,
"step": 757
},
{
"epoch": 0.4178610804851158,
"grad_norm": 0.25375691056251526,
"learning_rate": 0.00023797678275290214,
"loss": 0.51,
"step": 758
},
{
"epoch": 0.418412348401323,
"grad_norm": 0.26564472913742065,
"learning_rate": 0.00023789386401326697,
"loss": 0.5048,
"step": 759
},
{
"epoch": 0.4189636163175303,
"grad_norm": 0.24918165802955627,
"learning_rate": 0.00023781094527363183,
"loss": 0.4964,
"step": 760
},
{
"epoch": 0.4195148842337376,
"grad_norm": 0.26909199357032776,
"learning_rate": 0.00023772802653399666,
"loss": 0.4511,
"step": 761
},
{
"epoch": 0.42006615214994486,
"grad_norm": 0.27723434567451477,
"learning_rate": 0.0002376451077943615,
"loss": 0.4994,
"step": 762
},
{
"epoch": 0.42061742006615216,
"grad_norm": 0.23842424154281616,
"learning_rate": 0.00023756218905472636,
"loss": 0.5127,
"step": 763
},
{
"epoch": 0.4211686879823594,
"grad_norm": 0.2599777281284332,
"learning_rate": 0.0002374792703150912,
"loss": 0.5221,
"step": 764
},
{
"epoch": 0.4217199558985667,
"grad_norm": 0.2541678845882416,
"learning_rate": 0.00023739635157545605,
"loss": 0.5086,
"step": 765
},
{
"epoch": 0.422271223814774,
"grad_norm": 0.24489666521549225,
"learning_rate": 0.00023731343283582085,
"loss": 0.5052,
"step": 766
},
{
"epoch": 0.42282249173098124,
"grad_norm": 0.23364123702049255,
"learning_rate": 0.0002372305140961857,
"loss": 0.4815,
"step": 767
},
{
"epoch": 0.42337375964718854,
"grad_norm": 0.24420395493507385,
"learning_rate": 0.00023714759535655054,
"loss": 0.4799,
"step": 768
},
{
"epoch": 0.4239250275633958,
"grad_norm": 0.2559242844581604,
"learning_rate": 0.0002370646766169154,
"loss": 0.5218,
"step": 769
},
{
"epoch": 0.4244762954796031,
"grad_norm": 0.24033527076244354,
"learning_rate": 0.00023698175787728023,
"loss": 0.4951,
"step": 770
},
{
"epoch": 0.4250275633958104,
"grad_norm": 0.2582804262638092,
"learning_rate": 0.0002368988391376451,
"loss": 0.4925,
"step": 771
},
{
"epoch": 0.4255788313120176,
"grad_norm": 0.21231015026569366,
"learning_rate": 0.00023681592039800992,
"loss": 0.4975,
"step": 772
},
{
"epoch": 0.4261300992282249,
"grad_norm": 0.23742909729480743,
"learning_rate": 0.00023673300165837478,
"loss": 0.5115,
"step": 773
},
{
"epoch": 0.42668136714443217,
"grad_norm": 0.23761944472789764,
"learning_rate": 0.00023665008291873962,
"loss": 0.5117,
"step": 774
},
{
"epoch": 0.42723263506063947,
"grad_norm": 0.25065210461616516,
"learning_rate": 0.00023656716417910448,
"loss": 0.5305,
"step": 775
},
{
"epoch": 0.42778390297684676,
"grad_norm": 0.23839645087718964,
"learning_rate": 0.00023648424543946928,
"loss": 0.5245,
"step": 776
},
{
"epoch": 0.428335170893054,
"grad_norm": 0.22241149842739105,
"learning_rate": 0.00023640132669983414,
"loss": 0.5041,
"step": 777
},
{
"epoch": 0.4288864388092613,
"grad_norm": 0.23228657245635986,
"learning_rate": 0.00023631840796019897,
"loss": 0.4955,
"step": 778
},
{
"epoch": 0.4294377067254686,
"grad_norm": 0.24807095527648926,
"learning_rate": 0.00023623548922056383,
"loss": 0.5057,
"step": 779
},
{
"epoch": 0.42998897464167585,
"grad_norm": 0.253288209438324,
"learning_rate": 0.00023615257048092866,
"loss": 0.5179,
"step": 780
},
{
"epoch": 0.43054024255788315,
"grad_norm": 0.2280365228652954,
"learning_rate": 0.00023606965174129352,
"loss": 0.5104,
"step": 781
},
{
"epoch": 0.4310915104740904,
"grad_norm": 0.21497339010238647,
"learning_rate": 0.00023598673300165835,
"loss": 0.479,
"step": 782
},
{
"epoch": 0.4316427783902977,
"grad_norm": 0.25969845056533813,
"learning_rate": 0.0002359038142620232,
"loss": 0.4952,
"step": 783
},
{
"epoch": 0.432194046306505,
"grad_norm": 0.24241061508655548,
"learning_rate": 0.00023582089552238804,
"loss": 0.5147,
"step": 784
},
{
"epoch": 0.43274531422271223,
"grad_norm": 0.23297248780727386,
"learning_rate": 0.0002357379767827529,
"loss": 0.4698,
"step": 785
},
{
"epoch": 0.43329658213891953,
"grad_norm": 0.23766906559467316,
"learning_rate": 0.0002356550580431177,
"loss": 0.5127,
"step": 786
},
{
"epoch": 0.43384785005512677,
"grad_norm": 0.225977823138237,
"learning_rate": 0.00023557213930348257,
"loss": 0.4698,
"step": 787
},
{
"epoch": 0.43439911797133407,
"grad_norm": 0.25361236929893494,
"learning_rate": 0.0002354892205638474,
"loss": 0.4887,
"step": 788
},
{
"epoch": 0.43495038588754137,
"grad_norm": 0.23103906214237213,
"learning_rate": 0.00023540630182421226,
"loss": 0.4831,
"step": 789
},
{
"epoch": 0.4355016538037486,
"grad_norm": 0.23840244114398956,
"learning_rate": 0.0002353233830845771,
"loss": 0.501,
"step": 790
},
{
"epoch": 0.4360529217199559,
"grad_norm": 0.2217642217874527,
"learning_rate": 0.00023524046434494195,
"loss": 0.4792,
"step": 791
},
{
"epoch": 0.43660418963616315,
"grad_norm": 0.23963388800621033,
"learning_rate": 0.00023515754560530678,
"loss": 0.5043,
"step": 792
},
{
"epoch": 0.43715545755237045,
"grad_norm": 0.2423614263534546,
"learning_rate": 0.00023507462686567164,
"loss": 0.4923,
"step": 793
},
{
"epoch": 0.43770672546857775,
"grad_norm": 0.23817111551761627,
"learning_rate": 0.00023499170812603645,
"loss": 0.4836,
"step": 794
},
{
"epoch": 0.438257993384785,
"grad_norm": 0.22162829339504242,
"learning_rate": 0.00023490878938640133,
"loss": 0.4919,
"step": 795
},
{
"epoch": 0.4388092613009923,
"grad_norm": 0.22646528482437134,
"learning_rate": 0.00023482587064676614,
"loss": 0.4727,
"step": 796
},
{
"epoch": 0.43936052921719954,
"grad_norm": 0.2530063986778259,
"learning_rate": 0.000234742951907131,
"loss": 0.4896,
"step": 797
},
{
"epoch": 0.43991179713340683,
"grad_norm": 0.24201619625091553,
"learning_rate": 0.00023466003316749583,
"loss": 0.4664,
"step": 798
},
{
"epoch": 0.44046306504961413,
"grad_norm": 0.22222551703453064,
"learning_rate": 0.0002345771144278607,
"loss": 0.4914,
"step": 799
},
{
"epoch": 0.4410143329658214,
"grad_norm": 0.2384173721075058,
"learning_rate": 0.00023449419568822552,
"loss": 0.5029,
"step": 800
},
{
"epoch": 0.4415656008820287,
"grad_norm": 0.23053288459777832,
"learning_rate": 0.00023441127694859038,
"loss": 0.5011,
"step": 801
},
{
"epoch": 0.4421168687982359,
"grad_norm": 0.2338135987520218,
"learning_rate": 0.0002343283582089552,
"loss": 0.5145,
"step": 802
},
{
"epoch": 0.4426681367144432,
"grad_norm": 0.2439098060131073,
"learning_rate": 0.00023424543946932007,
"loss": 0.5353,
"step": 803
},
{
"epoch": 0.4432194046306505,
"grad_norm": 0.25395849347114563,
"learning_rate": 0.00023416252072968488,
"loss": 0.5287,
"step": 804
},
{
"epoch": 0.44377067254685776,
"grad_norm": 0.24382875859737396,
"learning_rate": 0.0002340796019900497,
"loss": 0.4753,
"step": 805
},
{
"epoch": 0.44432194046306506,
"grad_norm": 0.22943390905857086,
"learning_rate": 0.00023399668325041457,
"loss": 0.4899,
"step": 806
},
{
"epoch": 0.4448732083792723,
"grad_norm": 0.23026274144649506,
"learning_rate": 0.0002339137645107794,
"loss": 0.4776,
"step": 807
},
{
"epoch": 0.4454244762954796,
"grad_norm": 0.263637512922287,
"learning_rate": 0.00023383084577114426,
"loss": 0.5036,
"step": 808
},
{
"epoch": 0.4459757442116869,
"grad_norm": 0.2239854782819748,
"learning_rate": 0.0002337479270315091,
"loss": 0.5074,
"step": 809
},
{
"epoch": 0.44652701212789414,
"grad_norm": 0.24209174513816833,
"learning_rate": 0.00023366500829187395,
"loss": 0.4962,
"step": 810
},
{
"epoch": 0.44707828004410144,
"grad_norm": 0.2574441730976105,
"learning_rate": 0.00023358208955223878,
"loss": 0.4833,
"step": 811
},
{
"epoch": 0.4476295479603087,
"grad_norm": 0.24309788644313812,
"learning_rate": 0.00023349917081260364,
"loss": 0.4971,
"step": 812
},
{
"epoch": 0.448180815876516,
"grad_norm": 0.23553608357906342,
"learning_rate": 0.00023341625207296844,
"loss": 0.4951,
"step": 813
},
{
"epoch": 0.4487320837927233,
"grad_norm": 0.23820781707763672,
"learning_rate": 0.0002333333333333333,
"loss": 0.4974,
"step": 814
},
{
"epoch": 0.4492833517089305,
"grad_norm": 0.26907938718795776,
"learning_rate": 0.00023325041459369814,
"loss": 0.4904,
"step": 815
},
{
"epoch": 0.4498346196251378,
"grad_norm": 0.2529081702232361,
"learning_rate": 0.000233167495854063,
"loss": 0.5047,
"step": 816
},
{
"epoch": 0.4503858875413451,
"grad_norm": 0.2080521285533905,
"learning_rate": 0.00023308457711442783,
"loss": 0.4676,
"step": 817
},
{
"epoch": 0.45093715545755236,
"grad_norm": 0.25028982758522034,
"learning_rate": 0.00023300165837479269,
"loss": 0.5093,
"step": 818
},
{
"epoch": 0.45148842337375966,
"grad_norm": 0.24182821810245514,
"learning_rate": 0.00023291873963515752,
"loss": 0.5082,
"step": 819
},
{
"epoch": 0.4520396912899669,
"grad_norm": 0.23918956518173218,
"learning_rate": 0.00023283582089552238,
"loss": 0.4887,
"step": 820
},
{
"epoch": 0.4525909592061742,
"grad_norm": 0.25016239285469055,
"learning_rate": 0.0002327529021558872,
"loss": 0.4887,
"step": 821
},
{
"epoch": 0.4531422271223815,
"grad_norm": 0.2489538937807083,
"learning_rate": 0.00023266998341625207,
"loss": 0.5089,
"step": 822
},
{
"epoch": 0.45369349503858875,
"grad_norm": 0.2490735650062561,
"learning_rate": 0.00023258706467661687,
"loss": 0.4812,
"step": 823
},
{
"epoch": 0.45424476295479604,
"grad_norm": 0.26727011799812317,
"learning_rate": 0.00023250414593698173,
"loss": 0.4943,
"step": 824
},
{
"epoch": 0.4547960308710033,
"grad_norm": 0.2334149330854416,
"learning_rate": 0.00023242122719734656,
"loss": 0.4743,
"step": 825
},
{
"epoch": 0.4553472987872106,
"grad_norm": 0.24874447286128998,
"learning_rate": 0.00023233830845771142,
"loss": 0.5034,
"step": 826
},
{
"epoch": 0.4558985667034179,
"grad_norm": 0.26186123490333557,
"learning_rate": 0.00023225538971807626,
"loss": 0.4986,
"step": 827
},
{
"epoch": 0.4564498346196251,
"grad_norm": 0.22734478116035461,
"learning_rate": 0.00023217247097844111,
"loss": 0.479,
"step": 828
},
{
"epoch": 0.4570011025358324,
"grad_norm": 0.24908246099948883,
"learning_rate": 0.00023208955223880595,
"loss": 0.5176,
"step": 829
},
{
"epoch": 0.45755237045203967,
"grad_norm": 0.2561740279197693,
"learning_rate": 0.0002320066334991708,
"loss": 0.5181,
"step": 830
},
{
"epoch": 0.45810363836824697,
"grad_norm": 0.24820713698863983,
"learning_rate": 0.00023192371475953564,
"loss": 0.5168,
"step": 831
},
{
"epoch": 0.45865490628445427,
"grad_norm": 0.22865842282772064,
"learning_rate": 0.0002318407960199005,
"loss": 0.5034,
"step": 832
},
{
"epoch": 0.4592061742006615,
"grad_norm": 0.2395135760307312,
"learning_rate": 0.0002317578772802653,
"loss": 0.4956,
"step": 833
},
{
"epoch": 0.4597574421168688,
"grad_norm": 0.2375570386648178,
"learning_rate": 0.00023167495854063016,
"loss": 0.4939,
"step": 834
},
{
"epoch": 0.46030871003307605,
"grad_norm": 0.24207614362239838,
"learning_rate": 0.000231592039800995,
"loss": 0.4998,
"step": 835
},
{
"epoch": 0.46085997794928335,
"grad_norm": 0.231749027967453,
"learning_rate": 0.00023150912106135985,
"loss": 0.5071,
"step": 836
},
{
"epoch": 0.46141124586549065,
"grad_norm": 0.2529800236225128,
"learning_rate": 0.00023142620232172468,
"loss": 0.5152,
"step": 837
},
{
"epoch": 0.4619625137816979,
"grad_norm": 0.24748285114765167,
"learning_rate": 0.00023134328358208954,
"loss": 0.4929,
"step": 838
},
{
"epoch": 0.4625137816979052,
"grad_norm": 0.2481345683336258,
"learning_rate": 0.00023126036484245438,
"loss": 0.5131,
"step": 839
},
{
"epoch": 0.46306504961411243,
"grad_norm": 0.22557318210601807,
"learning_rate": 0.00023117744610281923,
"loss": 0.5111,
"step": 840
},
{
"epoch": 0.46361631753031973,
"grad_norm": 0.24130286276340485,
"learning_rate": 0.00023109452736318407,
"loss": 0.486,
"step": 841
},
{
"epoch": 0.46416758544652703,
"grad_norm": 0.2238035351037979,
"learning_rate": 0.00023101160862354893,
"loss": 0.4836,
"step": 842
},
{
"epoch": 0.4647188533627343,
"grad_norm": 0.23449353873729706,
"learning_rate": 0.00023092868988391373,
"loss": 0.4714,
"step": 843
},
{
"epoch": 0.4652701212789416,
"grad_norm": 0.2284533679485321,
"learning_rate": 0.0002308457711442786,
"loss": 0.4739,
"step": 844
},
{
"epoch": 0.4658213891951488,
"grad_norm": 0.2420201152563095,
"learning_rate": 0.00023076285240464342,
"loss": 0.4797,
"step": 845
},
{
"epoch": 0.4663726571113561,
"grad_norm": 0.2669530212879181,
"learning_rate": 0.00023067993366500828,
"loss": 0.5017,
"step": 846
},
{
"epoch": 0.4669239250275634,
"grad_norm": 0.2415032982826233,
"learning_rate": 0.0002305970149253731,
"loss": 0.5023,
"step": 847
},
{
"epoch": 0.46747519294377066,
"grad_norm": 0.2327703833580017,
"learning_rate": 0.00023051409618573797,
"loss": 0.5089,
"step": 848
},
{
"epoch": 0.46802646085997796,
"grad_norm": 0.24102593958377838,
"learning_rate": 0.0002304311774461028,
"loss": 0.5092,
"step": 849
},
{
"epoch": 0.4685777287761852,
"grad_norm": 0.22270776331424713,
"learning_rate": 0.00023034825870646764,
"loss": 0.4677,
"step": 850
},
{
"epoch": 0.4691289966923925,
"grad_norm": 0.23423947393894196,
"learning_rate": 0.0002302653399668325,
"loss": 0.4909,
"step": 851
},
{
"epoch": 0.4696802646085998,
"grad_norm": 0.24698768556118011,
"learning_rate": 0.0002301824212271973,
"loss": 0.5,
"step": 852
},
{
"epoch": 0.47023153252480704,
"grad_norm": 0.24313125014305115,
"learning_rate": 0.00023009950248756216,
"loss": 0.4908,
"step": 853
},
{
"epoch": 0.47078280044101434,
"grad_norm": 0.2673037648200989,
"learning_rate": 0.000230016583747927,
"loss": 0.4971,
"step": 854
},
{
"epoch": 0.47133406835722164,
"grad_norm": 0.23639419674873352,
"learning_rate": 0.00022993366500829185,
"loss": 0.486,
"step": 855
},
{
"epoch": 0.4718853362734289,
"grad_norm": 0.2316926270723343,
"learning_rate": 0.00022985074626865668,
"loss": 0.5045,
"step": 856
},
{
"epoch": 0.4724366041896362,
"grad_norm": 0.23044279217720032,
"learning_rate": 0.00022976782752902154,
"loss": 0.4752,
"step": 857
},
{
"epoch": 0.4729878721058434,
"grad_norm": 0.2599242329597473,
"learning_rate": 0.00022968490878938637,
"loss": 0.5058,
"step": 858
},
{
"epoch": 0.4735391400220507,
"grad_norm": 0.2420707494020462,
"learning_rate": 0.00022960199004975123,
"loss": 0.4689,
"step": 859
},
{
"epoch": 0.474090407938258,
"grad_norm": 0.26549097895622253,
"learning_rate": 0.00022951907131011607,
"loss": 0.5161,
"step": 860
},
{
"epoch": 0.47464167585446526,
"grad_norm": 0.24539636075496674,
"learning_rate": 0.00022943615257048092,
"loss": 0.4887,
"step": 861
},
{
"epoch": 0.47519294377067256,
"grad_norm": 0.23257140815258026,
"learning_rate": 0.00022935323383084573,
"loss": 0.4841,
"step": 862
},
{
"epoch": 0.4757442116868798,
"grad_norm": 0.27551430463790894,
"learning_rate": 0.0002292703150912106,
"loss": 0.5369,
"step": 863
},
{
"epoch": 0.4762954796030871,
"grad_norm": 0.2414499670267105,
"learning_rate": 0.00022918739635157542,
"loss": 0.5031,
"step": 864
},
{
"epoch": 0.4768467475192944,
"grad_norm": 0.24039071798324585,
"learning_rate": 0.00022910447761194028,
"loss": 0.4958,
"step": 865
},
{
"epoch": 0.47739801543550164,
"grad_norm": 0.23044785857200623,
"learning_rate": 0.0002290215588723051,
"loss": 0.4884,
"step": 866
},
{
"epoch": 0.47794928335170894,
"grad_norm": 0.2677319645881653,
"learning_rate": 0.00022893864013266997,
"loss": 0.5096,
"step": 867
},
{
"epoch": 0.4785005512679162,
"grad_norm": 0.22575704753398895,
"learning_rate": 0.0002288557213930348,
"loss": 0.4968,
"step": 868
},
{
"epoch": 0.4790518191841235,
"grad_norm": 0.24338865280151367,
"learning_rate": 0.00022877280265339966,
"loss": 0.4669,
"step": 869
},
{
"epoch": 0.4796030871003308,
"grad_norm": 0.25083914399147034,
"learning_rate": 0.0002286898839137645,
"loss": 0.5035,
"step": 870
},
{
"epoch": 0.480154355016538,
"grad_norm": 0.24006043374538422,
"learning_rate": 0.00022860696517412935,
"loss": 0.459,
"step": 871
},
{
"epoch": 0.4807056229327453,
"grad_norm": 0.2326238453388214,
"learning_rate": 0.00022852404643449416,
"loss": 0.4599,
"step": 872
},
{
"epoch": 0.48125689084895257,
"grad_norm": 0.24134741723537445,
"learning_rate": 0.00022844112769485902,
"loss": 0.4755,
"step": 873
},
{
"epoch": 0.48180815876515987,
"grad_norm": 0.2148948460817337,
"learning_rate": 0.00022835820895522385,
"loss": 0.4759,
"step": 874
},
{
"epoch": 0.48235942668136716,
"grad_norm": 0.2361116260290146,
"learning_rate": 0.0002282752902155887,
"loss": 0.4771,
"step": 875
},
{
"epoch": 0.4829106945975744,
"grad_norm": 0.24435687065124512,
"learning_rate": 0.00022819237147595354,
"loss": 0.492,
"step": 876
},
{
"epoch": 0.4834619625137817,
"grad_norm": 0.23266686499118805,
"learning_rate": 0.0002281094527363184,
"loss": 0.5269,
"step": 877
},
{
"epoch": 0.48401323042998895,
"grad_norm": 0.2184826284646988,
"learning_rate": 0.00022802653399668323,
"loss": 0.4741,
"step": 878
},
{
"epoch": 0.48456449834619625,
"grad_norm": 0.24351243674755096,
"learning_rate": 0.0002279436152570481,
"loss": 0.5121,
"step": 879
},
{
"epoch": 0.48511576626240355,
"grad_norm": 0.2366686463356018,
"learning_rate": 0.00022786069651741292,
"loss": 0.5002,
"step": 880
},
{
"epoch": 0.4856670341786108,
"grad_norm": 0.23044729232788086,
"learning_rate": 0.00022777777777777778,
"loss": 0.4742,
"step": 881
},
{
"epoch": 0.4862183020948181,
"grad_norm": 0.23718389868736267,
"learning_rate": 0.0002276948590381426,
"loss": 0.4864,
"step": 882
},
{
"epoch": 0.48676957001102533,
"grad_norm": 0.25451889634132385,
"learning_rate": 0.00022761194029850745,
"loss": 0.4809,
"step": 883
},
{
"epoch": 0.48732083792723263,
"grad_norm": 0.22073966264724731,
"learning_rate": 0.00022752902155887228,
"loss": 0.4853,
"step": 884
},
{
"epoch": 0.48787210584343993,
"grad_norm": 0.24639108777046204,
"learning_rate": 0.00022744610281923714,
"loss": 0.4848,
"step": 885
},
{
"epoch": 0.4884233737596472,
"grad_norm": 0.2543313503265381,
"learning_rate": 0.00022736318407960197,
"loss": 0.5109,
"step": 886
},
{
"epoch": 0.48897464167585447,
"grad_norm": 0.24580398201942444,
"learning_rate": 0.00022728026533996683,
"loss": 0.4919,
"step": 887
},
{
"epoch": 0.4895259095920617,
"grad_norm": 0.23678098618984222,
"learning_rate": 0.00022719734660033166,
"loss": 0.48,
"step": 888
},
{
"epoch": 0.490077177508269,
"grad_norm": 0.2219116985797882,
"learning_rate": 0.00022711442786069652,
"loss": 0.4647,
"step": 889
},
{
"epoch": 0.4906284454244763,
"grad_norm": 0.2577376067638397,
"learning_rate": 0.00022703150912106135,
"loss": 0.4729,
"step": 890
},
{
"epoch": 0.49117971334068355,
"grad_norm": 0.2527279853820801,
"learning_rate": 0.0002269485903814262,
"loss": 0.4899,
"step": 891
},
{
"epoch": 0.49173098125689085,
"grad_norm": 0.2718394100666046,
"learning_rate": 0.00022686567164179102,
"loss": 0.5247,
"step": 892
},
{
"epoch": 0.49228224917309815,
"grad_norm": 0.23161333799362183,
"learning_rate": 0.00022678275290215585,
"loss": 0.4786,
"step": 893
},
{
"epoch": 0.4928335170893054,
"grad_norm": 0.22976607084274292,
"learning_rate": 0.0002266998341625207,
"loss": 0.4963,
"step": 894
},
{
"epoch": 0.4933847850055127,
"grad_norm": 0.26446732878685,
"learning_rate": 0.00022661691542288554,
"loss": 0.5076,
"step": 895
},
{
"epoch": 0.49393605292171994,
"grad_norm": 0.2513757348060608,
"learning_rate": 0.0002265339966832504,
"loss": 0.4906,
"step": 896
},
{
"epoch": 0.49448732083792724,
"grad_norm": 0.2355221062898636,
"learning_rate": 0.00022645107794361523,
"loss": 0.5083,
"step": 897
},
{
"epoch": 0.49503858875413453,
"grad_norm": 0.24008940160274506,
"learning_rate": 0.0002263681592039801,
"loss": 0.5075,
"step": 898
},
{
"epoch": 0.4955898566703418,
"grad_norm": 0.23088522255420685,
"learning_rate": 0.00022628524046434492,
"loss": 0.4975,
"step": 899
},
{
"epoch": 0.4961411245865491,
"grad_norm": 0.2754332721233368,
"learning_rate": 0.00022620232172470978,
"loss": 0.5144,
"step": 900
},
{
"epoch": 0.4966923925027563,
"grad_norm": 0.25219646096229553,
"learning_rate": 0.00022611940298507459,
"loss": 0.4854,
"step": 901
},
{
"epoch": 0.4972436604189636,
"grad_norm": 0.2489755004644394,
"learning_rate": 0.00022603648424543944,
"loss": 0.4708,
"step": 902
},
{
"epoch": 0.4977949283351709,
"grad_norm": 0.24141034483909607,
"learning_rate": 0.00022595356550580428,
"loss": 0.4917,
"step": 903
},
{
"epoch": 0.49834619625137816,
"grad_norm": 0.23453152179718018,
"learning_rate": 0.00022587064676616914,
"loss": 0.4754,
"step": 904
},
{
"epoch": 0.49889746416758546,
"grad_norm": 0.25601381063461304,
"learning_rate": 0.00022578772802653397,
"loss": 0.4909,
"step": 905
},
{
"epoch": 0.4994487320837927,
"grad_norm": 0.22102084755897522,
"learning_rate": 0.00022570480928689883,
"loss": 0.4673,
"step": 906
},
{
"epoch": 0.5,
"grad_norm": 0.2369261085987091,
"learning_rate": 0.00022562189054726366,
"loss": 0.4544,
"step": 907
},
{
"epoch": 0.5005512679162073,
"grad_norm": 0.25789421796798706,
"learning_rate": 0.00022553897180762852,
"loss": 0.5032,
"step": 908
},
{
"epoch": 0.5011025358324146,
"grad_norm": 0.2342817783355713,
"learning_rate": 0.00022545605306799335,
"loss": 0.4649,
"step": 909
},
{
"epoch": 0.5016538037486218,
"grad_norm": 0.25317567586898804,
"learning_rate": 0.0002253731343283582,
"loss": 0.4974,
"step": 910
},
{
"epoch": 0.5022050716648291,
"grad_norm": 0.23973771929740906,
"learning_rate": 0.00022529021558872301,
"loss": 0.5093,
"step": 911
},
{
"epoch": 0.5027563395810364,
"grad_norm": 0.24858252704143524,
"learning_rate": 0.00022520729684908787,
"loss": 0.4781,
"step": 912
},
{
"epoch": 0.5033076074972437,
"grad_norm": 0.25571468472480774,
"learning_rate": 0.0002251243781094527,
"loss": 0.4992,
"step": 913
},
{
"epoch": 0.503858875413451,
"grad_norm": 0.2476612776517868,
"learning_rate": 0.00022504145936981756,
"loss": 0.4803,
"step": 914
},
{
"epoch": 0.5044101433296582,
"grad_norm": 0.24917398393154144,
"learning_rate": 0.0002249585406301824,
"loss": 0.5022,
"step": 915
},
{
"epoch": 0.5049614112458655,
"grad_norm": 0.24204300343990326,
"learning_rate": 0.00022487562189054726,
"loss": 0.4919,
"step": 916
},
{
"epoch": 0.5055126791620728,
"grad_norm": 0.23442697525024414,
"learning_rate": 0.0002247927031509121,
"loss": 0.4754,
"step": 917
},
{
"epoch": 0.5060639470782801,
"grad_norm": 0.26630768179893494,
"learning_rate": 0.00022470978441127695,
"loss": 0.5119,
"step": 918
},
{
"epoch": 0.5066152149944874,
"grad_norm": 0.2312323898077011,
"learning_rate": 0.00022462686567164175,
"loss": 0.4735,
"step": 919
},
{
"epoch": 0.5071664829106945,
"grad_norm": 0.23444309830665588,
"learning_rate": 0.0002245439469320066,
"loss": 0.4718,
"step": 920
},
{
"epoch": 0.5077177508269018,
"grad_norm": 0.2260974645614624,
"learning_rate": 0.00022446102819237144,
"loss": 0.48,
"step": 921
},
{
"epoch": 0.5082690187431091,
"grad_norm": 0.2403731793165207,
"learning_rate": 0.0002243781094527363,
"loss": 0.5014,
"step": 922
},
{
"epoch": 0.5088202866593164,
"grad_norm": 0.240118607878685,
"learning_rate": 0.00022429519071310113,
"loss": 0.4669,
"step": 923
},
{
"epoch": 0.5093715545755237,
"grad_norm": 0.2268829345703125,
"learning_rate": 0.000224212271973466,
"loss": 0.4924,
"step": 924
},
{
"epoch": 0.5099228224917309,
"grad_norm": 0.23937518894672394,
"learning_rate": 0.00022412935323383083,
"loss": 0.4743,
"step": 925
},
{
"epoch": 0.5104740904079382,
"grad_norm": 0.25224533677101135,
"learning_rate": 0.00022404643449419568,
"loss": 0.502,
"step": 926
},
{
"epoch": 0.5110253583241455,
"grad_norm": 0.23434899747371674,
"learning_rate": 0.00022396351575456052,
"loss": 0.4825,
"step": 927
},
{
"epoch": 0.5115766262403528,
"grad_norm": 0.249129980802536,
"learning_rate": 0.00022388059701492538,
"loss": 0.4689,
"step": 928
},
{
"epoch": 0.5121278941565601,
"grad_norm": 0.2530542314052582,
"learning_rate": 0.00022379767827529018,
"loss": 0.4726,
"step": 929
},
{
"epoch": 0.5126791620727673,
"grad_norm": 0.2488546073436737,
"learning_rate": 0.00022371475953565504,
"loss": 0.5024,
"step": 930
},
{
"epoch": 0.5132304299889746,
"grad_norm": 0.23048900067806244,
"learning_rate": 0.00022363184079601987,
"loss": 0.4633,
"step": 931
},
{
"epoch": 0.5137816979051819,
"grad_norm": 0.2485697716474533,
"learning_rate": 0.00022354892205638473,
"loss": 0.4955,
"step": 932
},
{
"epoch": 0.5143329658213892,
"grad_norm": 0.23724399507045746,
"learning_rate": 0.00022346600331674956,
"loss": 0.4859,
"step": 933
},
{
"epoch": 0.5148842337375965,
"grad_norm": 0.2424692064523697,
"learning_rate": 0.00022338308457711442,
"loss": 0.5115,
"step": 934
},
{
"epoch": 0.5154355016538037,
"grad_norm": 0.24387586116790771,
"learning_rate": 0.00022330016583747925,
"loss": 0.4969,
"step": 935
},
{
"epoch": 0.515986769570011,
"grad_norm": 0.22749263048171997,
"learning_rate": 0.0002232172470978441,
"loss": 0.5014,
"step": 936
},
{
"epoch": 0.5165380374862183,
"grad_norm": 0.22205640375614166,
"learning_rate": 0.00022313432835820894,
"loss": 0.4912,
"step": 937
},
{
"epoch": 0.5170893054024256,
"grad_norm": 0.23504669964313507,
"learning_rate": 0.00022305140961857375,
"loss": 0.4841,
"step": 938
},
{
"epoch": 0.5176405733186329,
"grad_norm": 0.2282828390598297,
"learning_rate": 0.0002229684908789386,
"loss": 0.463,
"step": 939
},
{
"epoch": 0.5181918412348401,
"grad_norm": 0.23592360317707062,
"learning_rate": 0.00022288557213930344,
"loss": 0.48,
"step": 940
},
{
"epoch": 0.5187431091510474,
"grad_norm": 0.2408529818058014,
"learning_rate": 0.0002228026533996683,
"loss": 0.485,
"step": 941
},
{
"epoch": 0.5192943770672547,
"grad_norm": 0.2507123351097107,
"learning_rate": 0.00022271973466003313,
"loss": 0.4696,
"step": 942
},
{
"epoch": 0.519845644983462,
"grad_norm": 0.21724364161491394,
"learning_rate": 0.000222636815920398,
"loss": 0.4883,
"step": 943
},
{
"epoch": 0.5203969128996693,
"grad_norm": 0.22868378460407257,
"learning_rate": 0.00022255389718076282,
"loss": 0.4852,
"step": 944
},
{
"epoch": 0.5209481808158766,
"grad_norm": 0.23937176167964935,
"learning_rate": 0.00022247097844112768,
"loss": 0.4966,
"step": 945
},
{
"epoch": 0.5214994487320838,
"grad_norm": 0.24673771858215332,
"learning_rate": 0.00022238805970149251,
"loss": 0.5089,
"step": 946
},
{
"epoch": 0.5220507166482911,
"grad_norm": 0.23318541049957275,
"learning_rate": 0.00022230514096185737,
"loss": 0.4847,
"step": 947
},
{
"epoch": 0.5226019845644984,
"grad_norm": 0.2237371951341629,
"learning_rate": 0.00022222222222222218,
"loss": 0.4745,
"step": 948
},
{
"epoch": 0.5231532524807057,
"grad_norm": 0.22587883472442627,
"learning_rate": 0.00022213930348258704,
"loss": 0.502,
"step": 949
},
{
"epoch": 0.523704520396913,
"grad_norm": 0.237474262714386,
"learning_rate": 0.00022205638474295187,
"loss": 0.5003,
"step": 950
},
{
"epoch": 0.5242557883131201,
"grad_norm": 0.2394198328256607,
"learning_rate": 0.00022197346600331673,
"loss": 0.5032,
"step": 951
},
{
"epoch": 0.5248070562293274,
"grad_norm": 0.22187075018882751,
"learning_rate": 0.00022189054726368156,
"loss": 0.4543,
"step": 952
},
{
"epoch": 0.5253583241455347,
"grad_norm": 0.23657891154289246,
"learning_rate": 0.00022180762852404642,
"loss": 0.496,
"step": 953
},
{
"epoch": 0.525909592061742,
"grad_norm": 0.23503652215003967,
"learning_rate": 0.00022172470978441125,
"loss": 0.4724,
"step": 954
},
{
"epoch": 0.5264608599779493,
"grad_norm": 0.2500884532928467,
"learning_rate": 0.0002216417910447761,
"loss": 0.4837,
"step": 955
},
{
"epoch": 0.5270121278941565,
"grad_norm": 0.2291148602962494,
"learning_rate": 0.00022155887230514094,
"loss": 0.4884,
"step": 956
},
{
"epoch": 0.5275633958103638,
"grad_norm": 0.2256416380405426,
"learning_rate": 0.0002214759535655058,
"loss": 0.4743,
"step": 957
},
{
"epoch": 0.5281146637265711,
"grad_norm": 0.23922450840473175,
"learning_rate": 0.0002213930348258706,
"loss": 0.4784,
"step": 958
},
{
"epoch": 0.5286659316427784,
"grad_norm": 0.24849876761436462,
"learning_rate": 0.00022131011608623547,
"loss": 0.498,
"step": 959
},
{
"epoch": 0.5292171995589857,
"grad_norm": 0.2211284190416336,
"learning_rate": 0.0002212271973466003,
"loss": 0.4711,
"step": 960
},
{
"epoch": 0.5297684674751929,
"grad_norm": 0.2296118289232254,
"learning_rate": 0.00022114427860696516,
"loss": 0.49,
"step": 961
},
{
"epoch": 0.5303197353914002,
"grad_norm": 0.22921642661094666,
"learning_rate": 0.00022106135986733,
"loss": 0.4864,
"step": 962
},
{
"epoch": 0.5308710033076075,
"grad_norm": 0.23854584991931915,
"learning_rate": 0.00022097844112769485,
"loss": 0.4976,
"step": 963
},
{
"epoch": 0.5314222712238148,
"grad_norm": 0.22192314267158508,
"learning_rate": 0.00022089552238805968,
"loss": 0.4889,
"step": 964
},
{
"epoch": 0.5319735391400221,
"grad_norm": 0.24450358748435974,
"learning_rate": 0.00022081260364842454,
"loss": 0.4784,
"step": 965
},
{
"epoch": 0.5325248070562293,
"grad_norm": 0.2145015150308609,
"learning_rate": 0.00022072968490878937,
"loss": 0.4543,
"step": 966
},
{
"epoch": 0.5330760749724366,
"grad_norm": 0.22203224897384644,
"learning_rate": 0.00022064676616915423,
"loss": 0.4892,
"step": 967
},
{
"epoch": 0.5336273428886439,
"grad_norm": 0.2423708289861679,
"learning_rate": 0.00022056384742951904,
"loss": 0.4866,
"step": 968
},
{
"epoch": 0.5341786108048512,
"grad_norm": 0.2290901392698288,
"learning_rate": 0.0002204809286898839,
"loss": 0.4809,
"step": 969
},
{
"epoch": 0.5347298787210585,
"grad_norm": 0.22281813621520996,
"learning_rate": 0.00022039800995024873,
"loss": 0.5083,
"step": 970
},
{
"epoch": 0.5352811466372657,
"grad_norm": 0.23863239586353302,
"learning_rate": 0.0002203150912106136,
"loss": 0.4732,
"step": 971
},
{
"epoch": 0.535832414553473,
"grad_norm": 0.2304835319519043,
"learning_rate": 0.00022023217247097842,
"loss": 0.4898,
"step": 972
},
{
"epoch": 0.5363836824696803,
"grad_norm": 0.23452985286712646,
"learning_rate": 0.00022014925373134328,
"loss": 0.5177,
"step": 973
},
{
"epoch": 0.5369349503858876,
"grad_norm": 0.252209335565567,
"learning_rate": 0.0002200663349917081,
"loss": 0.482,
"step": 974
},
{
"epoch": 0.5374862183020949,
"grad_norm": 0.23390796780586243,
"learning_rate": 0.00021998341625207297,
"loss": 0.4913,
"step": 975
},
{
"epoch": 0.538037486218302,
"grad_norm": 0.24304579198360443,
"learning_rate": 0.0002199004975124378,
"loss": 0.4963,
"step": 976
},
{
"epoch": 0.5385887541345094,
"grad_norm": 0.22291411459445953,
"learning_rate": 0.00021981757877280266,
"loss": 0.4835,
"step": 977
},
{
"epoch": 0.5391400220507166,
"grad_norm": 0.23994603753089905,
"learning_rate": 0.00021973466003316746,
"loss": 0.4596,
"step": 978
},
{
"epoch": 0.539691289966924,
"grad_norm": 0.2375342845916748,
"learning_rate": 0.00021965174129353232,
"loss": 0.5138,
"step": 979
},
{
"epoch": 0.5402425578831312,
"grad_norm": 0.22774764895439148,
"learning_rate": 0.00021956882255389716,
"loss": 0.4949,
"step": 980
},
{
"epoch": 0.5407938257993384,
"grad_norm": 0.2277144491672516,
"learning_rate": 0.000219485903814262,
"loss": 0.4843,
"step": 981
},
{
"epoch": 0.5413450937155457,
"grad_norm": 0.23078951239585876,
"learning_rate": 0.00021940298507462685,
"loss": 0.5089,
"step": 982
},
{
"epoch": 0.541896361631753,
"grad_norm": 0.23093165457248688,
"learning_rate": 0.00021932006633499168,
"loss": 0.4913,
"step": 983
},
{
"epoch": 0.5424476295479603,
"grad_norm": 0.22961430251598358,
"learning_rate": 0.00021923714759535654,
"loss": 0.4957,
"step": 984
},
{
"epoch": 0.5429988974641676,
"grad_norm": 0.2303048074245453,
"learning_rate": 0.00021915422885572137,
"loss": 0.4991,
"step": 985
},
{
"epoch": 0.5435501653803748,
"grad_norm": 0.2352553904056549,
"learning_rate": 0.00021907131011608623,
"loss": 0.4838,
"step": 986
},
{
"epoch": 0.5441014332965821,
"grad_norm": 0.2251589596271515,
"learning_rate": 0.00021898839137645103,
"loss": 0.4928,
"step": 987
},
{
"epoch": 0.5446527012127894,
"grad_norm": 0.2577657103538513,
"learning_rate": 0.0002189054726368159,
"loss": 0.4897,
"step": 988
},
{
"epoch": 0.5452039691289967,
"grad_norm": 0.23328843712806702,
"learning_rate": 0.00021882255389718073,
"loss": 0.4949,
"step": 989
},
{
"epoch": 0.545755237045204,
"grad_norm": 0.23206306993961334,
"learning_rate": 0.00021873963515754558,
"loss": 0.4791,
"step": 990
},
{
"epoch": 0.5463065049614112,
"grad_norm": 0.2417128086090088,
"learning_rate": 0.00021865671641791042,
"loss": 0.5161,
"step": 991
},
{
"epoch": 0.5468577728776185,
"grad_norm": 0.2541581988334656,
"learning_rate": 0.00021857379767827528,
"loss": 0.5253,
"step": 992
},
{
"epoch": 0.5474090407938258,
"grad_norm": 0.23152418434619904,
"learning_rate": 0.0002184908789386401,
"loss": 0.4854,
"step": 993
},
{
"epoch": 0.5479603087100331,
"grad_norm": 0.21505197882652283,
"learning_rate": 0.00021840796019900497,
"loss": 0.4664,
"step": 994
},
{
"epoch": 0.5485115766262404,
"grad_norm": 0.23766584694385529,
"learning_rate": 0.0002183250414593698,
"loss": 0.4976,
"step": 995
},
{
"epoch": 0.5490628445424476,
"grad_norm": 0.23223701119422913,
"learning_rate": 0.00021824212271973466,
"loss": 0.4485,
"step": 996
},
{
"epoch": 0.5496141124586549,
"grad_norm": 0.25161734223365784,
"learning_rate": 0.00021815920398009946,
"loss": 0.4818,
"step": 997
},
{
"epoch": 0.5501653803748622,
"grad_norm": 0.23082609474658966,
"learning_rate": 0.00021807628524046432,
"loss": 0.502,
"step": 998
},
{
"epoch": 0.5507166482910695,
"grad_norm": 0.23080939054489136,
"learning_rate": 0.00021799336650082915,
"loss": 0.5005,
"step": 999
},
{
"epoch": 0.5512679162072768,
"grad_norm": 0.22184456884860992,
"learning_rate": 0.00021791044776119401,
"loss": 0.4833,
"step": 1000
},
{
"epoch": 0.5512679162072768,
"eval_loss": 0.48357656598091125,
"eval_runtime": 311.7364,
"eval_samples_per_second": 3.737,
"eval_steps_per_second": 0.468,
"step": 1000
},
{
"epoch": 0.551819184123484,
"grad_norm": 0.25572869181632996,
"learning_rate": 0.00021782752902155885,
"loss": 0.4925,
"step": 1001
},
{
"epoch": 0.5523704520396913,
"grad_norm": 0.2477078139781952,
"learning_rate": 0.0002177446102819237,
"loss": 0.4847,
"step": 1002
},
{
"epoch": 0.5529217199558986,
"grad_norm": 0.23749567568302155,
"learning_rate": 0.00021766169154228854,
"loss": 0.4933,
"step": 1003
},
{
"epoch": 0.5534729878721059,
"grad_norm": 0.22248369455337524,
"learning_rate": 0.0002175787728026534,
"loss": 0.4883,
"step": 1004
},
{
"epoch": 0.5540242557883132,
"grad_norm": 0.23769117891788483,
"learning_rate": 0.00021749585406301823,
"loss": 0.4977,
"step": 1005
},
{
"epoch": 0.5545755237045203,
"grad_norm": 0.22872841358184814,
"learning_rate": 0.0002174129353233831,
"loss": 0.4952,
"step": 1006
},
{
"epoch": 0.5551267916207276,
"grad_norm": 0.23627693951129913,
"learning_rate": 0.0002173300165837479,
"loss": 0.4653,
"step": 1007
},
{
"epoch": 0.5556780595369349,
"grad_norm": 0.24900414049625397,
"learning_rate": 0.00021724709784411275,
"loss": 0.4833,
"step": 1008
},
{
"epoch": 0.5562293274531422,
"grad_norm": 0.2288302332162857,
"learning_rate": 0.00021716417910447758,
"loss": 0.4735,
"step": 1009
},
{
"epoch": 0.5567805953693495,
"grad_norm": 0.2251368761062622,
"learning_rate": 0.00021708126036484244,
"loss": 0.4887,
"step": 1010
},
{
"epoch": 0.5573318632855567,
"grad_norm": 0.2496083676815033,
"learning_rate": 0.00021699834162520727,
"loss": 0.4959,
"step": 1011
},
{
"epoch": 0.557883131201764,
"grad_norm": 0.23241998255252838,
"learning_rate": 0.00021691542288557213,
"loss": 0.462,
"step": 1012
},
{
"epoch": 0.5584343991179713,
"grad_norm": 0.239312544465065,
"learning_rate": 0.00021683250414593697,
"loss": 0.4792,
"step": 1013
},
{
"epoch": 0.5589856670341786,
"grad_norm": 0.22684402763843536,
"learning_rate": 0.00021674958540630182,
"loss": 0.4825,
"step": 1014
},
{
"epoch": 0.5595369349503859,
"grad_norm": 0.23261615633964539,
"learning_rate": 0.00021666666666666666,
"loss": 0.4604,
"step": 1015
},
{
"epoch": 0.5600882028665931,
"grad_norm": 0.26163482666015625,
"learning_rate": 0.00021658374792703152,
"loss": 0.5158,
"step": 1016
},
{
"epoch": 0.5606394707828004,
"grad_norm": 0.2275197058916092,
"learning_rate": 0.00021650082918739632,
"loss": 0.4733,
"step": 1017
},
{
"epoch": 0.5611907386990077,
"grad_norm": 0.2636192739009857,
"learning_rate": 0.00021641791044776118,
"loss": 0.5018,
"step": 1018
},
{
"epoch": 0.561742006615215,
"grad_norm": 0.2224932312965393,
"learning_rate": 0.000216334991708126,
"loss": 0.5064,
"step": 1019
},
{
"epoch": 0.5622932745314223,
"grad_norm": 0.2518375813961029,
"learning_rate": 0.00021625207296849087,
"loss": 0.4874,
"step": 1020
},
{
"epoch": 0.5628445424476296,
"grad_norm": 0.24104849994182587,
"learning_rate": 0.0002161691542288557,
"loss": 0.4864,
"step": 1021
},
{
"epoch": 0.5633958103638368,
"grad_norm": 0.25608646869659424,
"learning_rate": 0.00021608623548922056,
"loss": 0.4752,
"step": 1022
},
{
"epoch": 0.5639470782800441,
"grad_norm": 0.24174031615257263,
"learning_rate": 0.0002160033167495854,
"loss": 0.4986,
"step": 1023
},
{
"epoch": 0.5644983461962514,
"grad_norm": 0.23120078444480896,
"learning_rate": 0.00021592039800995025,
"loss": 0.4615,
"step": 1024
},
{
"epoch": 0.5650496141124587,
"grad_norm": 0.2599080204963684,
"learning_rate": 0.00021583747927031509,
"loss": 0.4994,
"step": 1025
},
{
"epoch": 0.565600882028666,
"grad_norm": 0.23741313815116882,
"learning_rate": 0.0002157545605306799,
"loss": 0.4745,
"step": 1026
},
{
"epoch": 0.5661521499448732,
"grad_norm": 0.24400565028190613,
"learning_rate": 0.00021567164179104475,
"loss": 0.4891,
"step": 1027
},
{
"epoch": 0.5667034178610805,
"grad_norm": 0.2503412663936615,
"learning_rate": 0.00021558872305140958,
"loss": 0.5014,
"step": 1028
},
{
"epoch": 0.5672546857772878,
"grad_norm": 0.23471197485923767,
"learning_rate": 0.00021550580431177444,
"loss": 0.4958,
"step": 1029
},
{
"epoch": 0.5678059536934951,
"grad_norm": 0.2323479950428009,
"learning_rate": 0.00021542288557213927,
"loss": 0.4691,
"step": 1030
},
{
"epoch": 0.5683572216097024,
"grad_norm": 0.23778273165225983,
"learning_rate": 0.00021533996683250413,
"loss": 0.4881,
"step": 1031
},
{
"epoch": 0.5689084895259096,
"grad_norm": 0.21465396881103516,
"learning_rate": 0.00021525704809286896,
"loss": 0.4689,
"step": 1032
},
{
"epoch": 0.5694597574421169,
"grad_norm": 0.2397712767124176,
"learning_rate": 0.00021517412935323382,
"loss": 0.4873,
"step": 1033
},
{
"epoch": 0.5700110253583242,
"grad_norm": 0.2142529934644699,
"learning_rate": 0.00021509121061359863,
"loss": 0.4686,
"step": 1034
},
{
"epoch": 0.5705622932745315,
"grad_norm": 0.24334488809108734,
"learning_rate": 0.00021500829187396351,
"loss": 0.508,
"step": 1035
},
{
"epoch": 0.5711135611907387,
"grad_norm": 0.2391451597213745,
"learning_rate": 0.00021492537313432832,
"loss": 0.5049,
"step": 1036
},
{
"epoch": 0.5716648291069459,
"grad_norm": 0.25972914695739746,
"learning_rate": 0.00021484245439469318,
"loss": 0.5022,
"step": 1037
},
{
"epoch": 0.5722160970231532,
"grad_norm": 0.23072604835033417,
"learning_rate": 0.000214759535655058,
"loss": 0.4888,
"step": 1038
},
{
"epoch": 0.5727673649393605,
"grad_norm": 0.2415681630373001,
"learning_rate": 0.00021467661691542287,
"loss": 0.4787,
"step": 1039
},
{
"epoch": 0.5733186328555678,
"grad_norm": 0.24707187712192535,
"learning_rate": 0.0002145936981757877,
"loss": 0.4877,
"step": 1040
},
{
"epoch": 0.5738699007717751,
"grad_norm": 0.24816669523715973,
"learning_rate": 0.00021451077943615256,
"loss": 0.4704,
"step": 1041
},
{
"epoch": 0.5744211686879823,
"grad_norm": 0.23687899112701416,
"learning_rate": 0.0002144278606965174,
"loss": 0.4757,
"step": 1042
},
{
"epoch": 0.5749724366041896,
"grad_norm": 0.25993046164512634,
"learning_rate": 0.00021434494195688225,
"loss": 0.4919,
"step": 1043
},
{
"epoch": 0.5755237045203969,
"grad_norm": 0.23352675139904022,
"learning_rate": 0.00021426202321724706,
"loss": 0.4762,
"step": 1044
},
{
"epoch": 0.5760749724366042,
"grad_norm": 0.23056983947753906,
"learning_rate": 0.00021417910447761192,
"loss": 0.4638,
"step": 1045
},
{
"epoch": 0.5766262403528115,
"grad_norm": 0.22587046027183533,
"learning_rate": 0.00021409618573797675,
"loss": 0.4777,
"step": 1046
},
{
"epoch": 0.5771775082690187,
"grad_norm": 0.2561855912208557,
"learning_rate": 0.0002140132669983416,
"loss": 0.5056,
"step": 1047
},
{
"epoch": 0.577728776185226,
"grad_norm": 0.24537737667560577,
"learning_rate": 0.00021393034825870644,
"loss": 0.497,
"step": 1048
},
{
"epoch": 0.5782800441014333,
"grad_norm": 0.22903874516487122,
"learning_rate": 0.0002138474295190713,
"loss": 0.4749,
"step": 1049
},
{
"epoch": 0.5788313120176406,
"grad_norm": 0.24069786071777344,
"learning_rate": 0.00021376451077943613,
"loss": 0.4901,
"step": 1050
},
{
"epoch": 0.5793825799338479,
"grad_norm": 0.2355291098356247,
"learning_rate": 0.000213681592039801,
"loss": 0.478,
"step": 1051
},
{
"epoch": 0.5799338478500551,
"grad_norm": 0.24105066061019897,
"learning_rate": 0.00021359867330016582,
"loss": 0.4832,
"step": 1052
},
{
"epoch": 0.5804851157662624,
"grad_norm": 0.22479461133480072,
"learning_rate": 0.00021351575456053068,
"loss": 0.4657,
"step": 1053
},
{
"epoch": 0.5810363836824697,
"grad_norm": 0.24978676438331604,
"learning_rate": 0.00021343283582089549,
"loss": 0.4795,
"step": 1054
},
{
"epoch": 0.581587651598677,
"grad_norm": 0.22877342998981476,
"learning_rate": 0.00021334991708126034,
"loss": 0.476,
"step": 1055
},
{
"epoch": 0.5821389195148843,
"grad_norm": 0.230316624045372,
"learning_rate": 0.00021326699834162518,
"loss": 0.4854,
"step": 1056
},
{
"epoch": 0.5826901874310915,
"grad_norm": 0.2178526371717453,
"learning_rate": 0.00021318407960199004,
"loss": 0.4798,
"step": 1057
},
{
"epoch": 0.5832414553472988,
"grad_norm": 0.23913492262363434,
"learning_rate": 0.00021310116086235487,
"loss": 0.4759,
"step": 1058
},
{
"epoch": 0.5837927232635061,
"grad_norm": 0.23534056544303894,
"learning_rate": 0.00021301824212271973,
"loss": 0.475,
"step": 1059
},
{
"epoch": 0.5843439911797134,
"grad_norm": 0.23057684302330017,
"learning_rate": 0.00021293532338308456,
"loss": 0.4835,
"step": 1060
},
{
"epoch": 0.5848952590959207,
"grad_norm": 0.2420724630355835,
"learning_rate": 0.00021285240464344942,
"loss": 0.4684,
"step": 1061
},
{
"epoch": 0.5854465270121278,
"grad_norm": 0.23270656168460846,
"learning_rate": 0.00021276948590381425,
"loss": 0.4714,
"step": 1062
},
{
"epoch": 0.5859977949283351,
"grad_norm": 0.22105982899665833,
"learning_rate": 0.0002126865671641791,
"loss": 0.4739,
"step": 1063
},
{
"epoch": 0.5865490628445424,
"grad_norm": 0.22896204888820648,
"learning_rate": 0.00021260364842454391,
"loss": 0.4792,
"step": 1064
},
{
"epoch": 0.5871003307607497,
"grad_norm": 0.22883784770965576,
"learning_rate": 0.00021252072968490877,
"loss": 0.4775,
"step": 1065
},
{
"epoch": 0.587651598676957,
"grad_norm": 0.22493380308151245,
"learning_rate": 0.0002124378109452736,
"loss": 0.4565,
"step": 1066
},
{
"epoch": 0.5882028665931642,
"grad_norm": 0.20627589523792267,
"learning_rate": 0.00021235489220563846,
"loss": 0.4421,
"step": 1067
},
{
"epoch": 0.5887541345093715,
"grad_norm": 0.22995707392692566,
"learning_rate": 0.0002122719734660033,
"loss": 0.5007,
"step": 1068
},
{
"epoch": 0.5893054024255788,
"grad_norm": 0.22702358663082123,
"learning_rate": 0.00021218905472636813,
"loss": 0.4848,
"step": 1069
},
{
"epoch": 0.5898566703417861,
"grad_norm": 0.2274836003780365,
"learning_rate": 0.000212106135986733,
"loss": 0.4512,
"step": 1070
},
{
"epoch": 0.5904079382579934,
"grad_norm": 0.25226280093193054,
"learning_rate": 0.00021202321724709782,
"loss": 0.4739,
"step": 1071
},
{
"epoch": 0.5909592061742006,
"grad_norm": 0.21378135681152344,
"learning_rate": 0.00021194029850746268,
"loss": 0.4902,
"step": 1072
},
{
"epoch": 0.5915104740904079,
"grad_norm": 0.2266150563955307,
"learning_rate": 0.00021185737976782748,
"loss": 0.4787,
"step": 1073
},
{
"epoch": 0.5920617420066152,
"grad_norm": 0.24346543848514557,
"learning_rate": 0.00021177446102819234,
"loss": 0.4758,
"step": 1074
},
{
"epoch": 0.5926130099228225,
"grad_norm": 0.23416201770305634,
"learning_rate": 0.00021169154228855718,
"loss": 0.4976,
"step": 1075
},
{
"epoch": 0.5931642778390298,
"grad_norm": 0.22314603626728058,
"learning_rate": 0.00021160862354892203,
"loss": 0.483,
"step": 1076
},
{
"epoch": 0.593715545755237,
"grad_norm": 0.23636144399642944,
"learning_rate": 0.00021152570480928687,
"loss": 0.4883,
"step": 1077
},
{
"epoch": 0.5942668136714443,
"grad_norm": 0.25075021386146545,
"learning_rate": 0.00021144278606965173,
"loss": 0.5093,
"step": 1078
},
{
"epoch": 0.5948180815876516,
"grad_norm": 0.25016966462135315,
"learning_rate": 0.00021135986733001656,
"loss": 0.4901,
"step": 1079
},
{
"epoch": 0.5953693495038589,
"grad_norm": 0.22505664825439453,
"learning_rate": 0.00021127694859038142,
"loss": 0.4982,
"step": 1080
},
{
"epoch": 0.5959206174200662,
"grad_norm": 0.2462112158536911,
"learning_rate": 0.00021119402985074625,
"loss": 0.4925,
"step": 1081
},
{
"epoch": 0.5964718853362734,
"grad_norm": 0.24048367142677307,
"learning_rate": 0.0002111111111111111,
"loss": 0.4711,
"step": 1082
},
{
"epoch": 0.5970231532524807,
"grad_norm": 0.2399929016828537,
"learning_rate": 0.0002110281923714759,
"loss": 0.4534,
"step": 1083
},
{
"epoch": 0.597574421168688,
"grad_norm": 0.22102728486061096,
"learning_rate": 0.00021094527363184077,
"loss": 0.475,
"step": 1084
},
{
"epoch": 0.5981256890848953,
"grad_norm": 0.22623874247074127,
"learning_rate": 0.0002108623548922056,
"loss": 0.4771,
"step": 1085
},
{
"epoch": 0.5986769570011026,
"grad_norm": 0.22739335894584656,
"learning_rate": 0.00021077943615257046,
"loss": 0.4524,
"step": 1086
},
{
"epoch": 0.5992282249173098,
"grad_norm": 0.22587355971336365,
"learning_rate": 0.0002106965174129353,
"loss": 0.481,
"step": 1087
},
{
"epoch": 0.5997794928335171,
"grad_norm": 0.238664448261261,
"learning_rate": 0.00021061359867330015,
"loss": 0.4812,
"step": 1088
},
{
"epoch": 0.6003307607497244,
"grad_norm": 0.2626015245914459,
"learning_rate": 0.00021053067993366499,
"loss": 0.5396,
"step": 1089
},
{
"epoch": 0.6008820286659317,
"grad_norm": 0.23110847175121307,
"learning_rate": 0.00021044776119402985,
"loss": 0.4768,
"step": 1090
},
{
"epoch": 0.601433296582139,
"grad_norm": 0.2324095070362091,
"learning_rate": 0.00021036484245439468,
"loss": 0.4569,
"step": 1091
},
{
"epoch": 0.6019845644983461,
"grad_norm": 0.2298206239938736,
"learning_rate": 0.00021028192371475954,
"loss": 0.4867,
"step": 1092
},
{
"epoch": 0.6025358324145534,
"grad_norm": 0.23651166260242462,
"learning_rate": 0.00021019900497512434,
"loss": 0.5119,
"step": 1093
},
{
"epoch": 0.6030871003307607,
"grad_norm": 0.24213020503520966,
"learning_rate": 0.0002101160862354892,
"loss": 0.4989,
"step": 1094
},
{
"epoch": 0.603638368246968,
"grad_norm": 0.2975553572177887,
"learning_rate": 0.00021003316749585403,
"loss": 0.4937,
"step": 1095
},
{
"epoch": 0.6041896361631753,
"grad_norm": 0.22954276204109192,
"learning_rate": 0.0002099502487562189,
"loss": 0.4569,
"step": 1096
},
{
"epoch": 0.6047409040793826,
"grad_norm": 0.23405365645885468,
"learning_rate": 0.00020986733001658372,
"loss": 0.476,
"step": 1097
},
{
"epoch": 0.6052921719955898,
"grad_norm": 0.22513137757778168,
"learning_rate": 0.00020978441127694858,
"loss": 0.4561,
"step": 1098
},
{
"epoch": 0.6058434399117971,
"grad_norm": 0.2296430617570877,
"learning_rate": 0.00020970149253731341,
"loss": 0.4628,
"step": 1099
},
{
"epoch": 0.6063947078280044,
"grad_norm": 0.24347829818725586,
"learning_rate": 0.00020961857379767827,
"loss": 0.5152,
"step": 1100
},
{
"epoch": 0.6069459757442117,
"grad_norm": 0.2580801546573639,
"learning_rate": 0.0002095356550580431,
"loss": 0.4751,
"step": 1101
},
{
"epoch": 0.607497243660419,
"grad_norm": 0.22813639044761658,
"learning_rate": 0.00020945273631840797,
"loss": 0.4807,
"step": 1102
},
{
"epoch": 0.6080485115766262,
"grad_norm": 0.22047673165798187,
"learning_rate": 0.00020936981757877277,
"loss": 0.4686,
"step": 1103
},
{
"epoch": 0.6085997794928335,
"grad_norm": 0.2241135686635971,
"learning_rate": 0.00020928689883913763,
"loss": 0.4826,
"step": 1104
},
{
"epoch": 0.6091510474090408,
"grad_norm": 0.24011586606502533,
"learning_rate": 0.00020920398009950246,
"loss": 0.4559,
"step": 1105
},
{
"epoch": 0.6097023153252481,
"grad_norm": 0.2351463884115219,
"learning_rate": 0.00020912106135986732,
"loss": 0.4523,
"step": 1106
},
{
"epoch": 0.6102535832414554,
"grad_norm": 0.2268303632736206,
"learning_rate": 0.00020903814262023215,
"loss": 0.486,
"step": 1107
},
{
"epoch": 0.6108048511576626,
"grad_norm": 0.2280043363571167,
"learning_rate": 0.000208955223880597,
"loss": 0.4902,
"step": 1108
},
{
"epoch": 0.6113561190738699,
"grad_norm": 0.21859845519065857,
"learning_rate": 0.00020887230514096184,
"loss": 0.4593,
"step": 1109
},
{
"epoch": 0.6119073869900772,
"grad_norm": 0.23152512311935425,
"learning_rate": 0.0002087893864013267,
"loss": 0.4762,
"step": 1110
},
{
"epoch": 0.6124586549062845,
"grad_norm": 0.23346808552742004,
"learning_rate": 0.00020870646766169153,
"loss": 0.4919,
"step": 1111
},
{
"epoch": 0.6130099228224918,
"grad_norm": 0.2313188761472702,
"learning_rate": 0.0002086235489220564,
"loss": 0.4792,
"step": 1112
},
{
"epoch": 0.613561190738699,
"grad_norm": 0.2261422574520111,
"learning_rate": 0.0002085406301824212,
"loss": 0.5008,
"step": 1113
},
{
"epoch": 0.6141124586549063,
"grad_norm": 0.24444694817066193,
"learning_rate": 0.00020845771144278603,
"loss": 0.503,
"step": 1114
},
{
"epoch": 0.6146637265711136,
"grad_norm": 0.23184862732887268,
"learning_rate": 0.0002083747927031509,
"loss": 0.5024,
"step": 1115
},
{
"epoch": 0.6152149944873209,
"grad_norm": 0.22305606305599213,
"learning_rate": 0.00020829187396351572,
"loss": 0.4815,
"step": 1116
},
{
"epoch": 0.6157662624035282,
"grad_norm": 0.24641431868076324,
"learning_rate": 0.00020820895522388058,
"loss": 0.5079,
"step": 1117
},
{
"epoch": 0.6163175303197354,
"grad_norm": 0.24148327112197876,
"learning_rate": 0.0002081260364842454,
"loss": 0.507,
"step": 1118
},
{
"epoch": 0.6168687982359427,
"grad_norm": 0.23938195407390594,
"learning_rate": 0.00020804311774461027,
"loss": 0.4668,
"step": 1119
},
{
"epoch": 0.61742006615215,
"grad_norm": 0.2462988644838333,
"learning_rate": 0.0002079601990049751,
"loss": 0.4941,
"step": 1120
},
{
"epoch": 0.6179713340683572,
"grad_norm": 0.23903852701187134,
"learning_rate": 0.00020787728026533996,
"loss": 0.4684,
"step": 1121
},
{
"epoch": 0.6185226019845645,
"grad_norm": 0.2402830719947815,
"learning_rate": 0.00020779436152570477,
"loss": 0.4705,
"step": 1122
},
{
"epoch": 0.6190738699007717,
"grad_norm": 0.24639341235160828,
"learning_rate": 0.00020771144278606963,
"loss": 0.4874,
"step": 1123
},
{
"epoch": 0.619625137816979,
"grad_norm": 0.22861522436141968,
"learning_rate": 0.00020762852404643446,
"loss": 0.4696,
"step": 1124
},
{
"epoch": 0.6201764057331863,
"grad_norm": 0.23462949693202972,
"learning_rate": 0.00020754560530679932,
"loss": 0.509,
"step": 1125
},
{
"epoch": 0.6207276736493936,
"grad_norm": 0.24041415750980377,
"learning_rate": 0.00020746268656716415,
"loss": 0.4792,
"step": 1126
},
{
"epoch": 0.6212789415656009,
"grad_norm": 0.23339125514030457,
"learning_rate": 0.000207379767827529,
"loss": 0.4603,
"step": 1127
},
{
"epoch": 0.6218302094818081,
"grad_norm": 0.23568972945213318,
"learning_rate": 0.00020729684908789384,
"loss": 0.4882,
"step": 1128
},
{
"epoch": 0.6223814773980154,
"grad_norm": 0.24162200093269348,
"learning_rate": 0.0002072139303482587,
"loss": 0.4835,
"step": 1129
},
{
"epoch": 0.6229327453142227,
"grad_norm": 0.24957728385925293,
"learning_rate": 0.00020713101160862353,
"loss": 0.4871,
"step": 1130
},
{
"epoch": 0.62348401323043,
"grad_norm": 0.24710482358932495,
"learning_rate": 0.0002070480928689884,
"loss": 0.4604,
"step": 1131
},
{
"epoch": 0.6240352811466373,
"grad_norm": 0.24623054265975952,
"learning_rate": 0.0002069651741293532,
"loss": 0.4986,
"step": 1132
},
{
"epoch": 0.6245865490628445,
"grad_norm": 0.24791941046714783,
"learning_rate": 0.00020688225538971806,
"loss": 0.4665,
"step": 1133
},
{
"epoch": 0.6251378169790518,
"grad_norm": 0.26239630579948425,
"learning_rate": 0.0002067993366500829,
"loss": 0.5193,
"step": 1134
},
{
"epoch": 0.6256890848952591,
"grad_norm": 0.2580834925174713,
"learning_rate": 0.00020671641791044775,
"loss": 0.5162,
"step": 1135
},
{
"epoch": 0.6262403528114664,
"grad_norm": 0.21768338978290558,
"learning_rate": 0.00020663349917081258,
"loss": 0.4626,
"step": 1136
},
{
"epoch": 0.6267916207276737,
"grad_norm": 0.24815984070301056,
"learning_rate": 0.00020655058043117744,
"loss": 0.4943,
"step": 1137
},
{
"epoch": 0.6273428886438809,
"grad_norm": 0.2349233627319336,
"learning_rate": 0.00020646766169154227,
"loss": 0.4819,
"step": 1138
},
{
"epoch": 0.6278941565600882,
"grad_norm": 0.23029837012290955,
"learning_rate": 0.00020638474295190713,
"loss": 0.488,
"step": 1139
},
{
"epoch": 0.6284454244762955,
"grad_norm": 0.23574088513851166,
"learning_rate": 0.00020630182421227196,
"loss": 0.4791,
"step": 1140
},
{
"epoch": 0.6289966923925028,
"grad_norm": 0.23277179896831512,
"learning_rate": 0.00020621890547263682,
"loss": 0.5047,
"step": 1141
},
{
"epoch": 0.6295479603087101,
"grad_norm": 0.2530352473258972,
"learning_rate": 0.00020613598673300163,
"loss": 0.5143,
"step": 1142
},
{
"epoch": 0.6300992282249173,
"grad_norm": 0.2136935591697693,
"learning_rate": 0.00020605306799336649,
"loss": 0.4768,
"step": 1143
},
{
"epoch": 0.6306504961411246,
"grad_norm": 0.23165372014045715,
"learning_rate": 0.00020597014925373132,
"loss": 0.4802,
"step": 1144
},
{
"epoch": 0.6312017640573319,
"grad_norm": 0.23744627833366394,
"learning_rate": 0.00020588723051409618,
"loss": 0.4751,
"step": 1145
},
{
"epoch": 0.6317530319735392,
"grad_norm": 0.2552582323551178,
"learning_rate": 0.000205804311774461,
"loss": 0.4949,
"step": 1146
},
{
"epoch": 0.6323042998897465,
"grad_norm": 0.22193565964698792,
"learning_rate": 0.00020572139303482587,
"loss": 0.4629,
"step": 1147
},
{
"epoch": 0.6328555678059536,
"grad_norm": 0.2249847799539566,
"learning_rate": 0.0002056384742951907,
"loss": 0.46,
"step": 1148
},
{
"epoch": 0.6334068357221609,
"grad_norm": 0.234629824757576,
"learning_rate": 0.00020555555555555556,
"loss": 0.4792,
"step": 1149
},
{
"epoch": 0.6339581036383682,
"grad_norm": 0.23007982969284058,
"learning_rate": 0.0002054726368159204,
"loss": 0.4857,
"step": 1150
},
{
"epoch": 0.6345093715545755,
"grad_norm": 0.24549317359924316,
"learning_rate": 0.00020538971807628525,
"loss": 0.4697,
"step": 1151
},
{
"epoch": 0.6350606394707828,
"grad_norm": 0.26415401697158813,
"learning_rate": 0.00020530679933665005,
"loss": 0.4858,
"step": 1152
},
{
"epoch": 0.63561190738699,
"grad_norm": 0.20789586007595062,
"learning_rate": 0.00020522388059701491,
"loss": 0.4312,
"step": 1153
},
{
"epoch": 0.6361631753031973,
"grad_norm": 0.23789043724536896,
"learning_rate": 0.00020514096185737975,
"loss": 0.4816,
"step": 1154
},
{
"epoch": 0.6367144432194046,
"grad_norm": 0.23785383999347687,
"learning_rate": 0.0002050580431177446,
"loss": 0.4743,
"step": 1155
},
{
"epoch": 0.6372657111356119,
"grad_norm": 0.26521044969558716,
"learning_rate": 0.00020497512437810944,
"loss": 0.4904,
"step": 1156
},
{
"epoch": 0.6378169790518192,
"grad_norm": 0.25412556529045105,
"learning_rate": 0.0002048922056384743,
"loss": 0.5,
"step": 1157
},
{
"epoch": 0.6383682469680264,
"grad_norm": 0.23178859055042267,
"learning_rate": 0.00020480928689883913,
"loss": 0.4791,
"step": 1158
},
{
"epoch": 0.6389195148842337,
"grad_norm": 0.23838523030281067,
"learning_rate": 0.00020472636815920393,
"loss": 0.4539,
"step": 1159
},
{
"epoch": 0.639470782800441,
"grad_norm": 0.23378612101078033,
"learning_rate": 0.0002046434494195688,
"loss": 0.492,
"step": 1160
},
{
"epoch": 0.6400220507166483,
"grad_norm": 0.24227279424667358,
"learning_rate": 0.00020456053067993362,
"loss": 0.474,
"step": 1161
},
{
"epoch": 0.6405733186328556,
"grad_norm": 0.23166267573833466,
"learning_rate": 0.00020447761194029848,
"loss": 0.4684,
"step": 1162
},
{
"epoch": 0.6411245865490628,
"grad_norm": 0.23626738786697388,
"learning_rate": 0.00020439469320066332,
"loss": 0.4744,
"step": 1163
},
{
"epoch": 0.6416758544652701,
"grad_norm": 0.2464771568775177,
"learning_rate": 0.00020431177446102817,
"loss": 0.47,
"step": 1164
},
{
"epoch": 0.6422271223814774,
"grad_norm": 0.23458126187324524,
"learning_rate": 0.000204228855721393,
"loss": 0.4442,
"step": 1165
},
{
"epoch": 0.6427783902976847,
"grad_norm": 0.23561522364616394,
"learning_rate": 0.00020414593698175787,
"loss": 0.4696,
"step": 1166
},
{
"epoch": 0.643329658213892,
"grad_norm": 0.2327614575624466,
"learning_rate": 0.0002040630182421227,
"loss": 0.486,
"step": 1167
},
{
"epoch": 0.6438809261300992,
"grad_norm": 0.22742946445941925,
"learning_rate": 0.00020398009950248756,
"loss": 0.4448,
"step": 1168
},
{
"epoch": 0.6444321940463065,
"grad_norm": 0.22767378389835358,
"learning_rate": 0.00020389718076285236,
"loss": 0.4749,
"step": 1169
},
{
"epoch": 0.6449834619625138,
"grad_norm": 0.21805496513843536,
"learning_rate": 0.00020381426202321722,
"loss": 0.4976,
"step": 1170
},
{
"epoch": 0.6455347298787211,
"grad_norm": 0.23068863153457642,
"learning_rate": 0.00020373134328358205,
"loss": 0.4839,
"step": 1171
},
{
"epoch": 0.6460859977949284,
"grad_norm": 0.24028991162776947,
"learning_rate": 0.0002036484245439469,
"loss": 0.4613,
"step": 1172
},
{
"epoch": 0.6466372657111357,
"grad_norm": 0.2558547854423523,
"learning_rate": 0.00020356550580431174,
"loss": 0.4795,
"step": 1173
},
{
"epoch": 0.6471885336273429,
"grad_norm": 0.2363976091146469,
"learning_rate": 0.0002034825870646766,
"loss": 0.4819,
"step": 1174
},
{
"epoch": 0.6477398015435502,
"grad_norm": 0.23440702259540558,
"learning_rate": 0.00020339966832504144,
"loss": 0.4676,
"step": 1175
},
{
"epoch": 0.6482910694597575,
"grad_norm": 0.23950831592082977,
"learning_rate": 0.0002033167495854063,
"loss": 0.4775,
"step": 1176
},
{
"epoch": 0.6488423373759648,
"grad_norm": 0.23549869656562805,
"learning_rate": 0.00020323383084577113,
"loss": 0.471,
"step": 1177
},
{
"epoch": 0.649393605292172,
"grad_norm": 0.2294132113456726,
"learning_rate": 0.00020315091210613599,
"loss": 0.4584,
"step": 1178
},
{
"epoch": 0.6499448732083792,
"grad_norm": 0.2511732280254364,
"learning_rate": 0.0002030679933665008,
"loss": 0.4886,
"step": 1179
},
{
"epoch": 0.6504961411245865,
"grad_norm": 0.23680317401885986,
"learning_rate": 0.00020298507462686565,
"loss": 0.5026,
"step": 1180
},
{
"epoch": 0.6510474090407938,
"grad_norm": 0.24410556256771088,
"learning_rate": 0.00020290215588723048,
"loss": 0.4862,
"step": 1181
},
{
"epoch": 0.6515986769570011,
"grad_norm": 0.24827975034713745,
"learning_rate": 0.00020281923714759534,
"loss": 0.4734,
"step": 1182
},
{
"epoch": 0.6521499448732084,
"grad_norm": 0.24595201015472412,
"learning_rate": 0.00020273631840796017,
"loss": 0.4754,
"step": 1183
},
{
"epoch": 0.6527012127894156,
"grad_norm": 0.24838019907474518,
"learning_rate": 0.00020265339966832503,
"loss": 0.4923,
"step": 1184
},
{
"epoch": 0.6532524807056229,
"grad_norm": 0.23605762422084808,
"learning_rate": 0.00020257048092868986,
"loss": 0.477,
"step": 1185
},
{
"epoch": 0.6538037486218302,
"grad_norm": 0.24502962827682495,
"learning_rate": 0.00020248756218905472,
"loss": 0.482,
"step": 1186
},
{
"epoch": 0.6543550165380375,
"grad_norm": 0.24489161372184753,
"learning_rate": 0.00020240464344941956,
"loss": 0.4783,
"step": 1187
},
{
"epoch": 0.6549062844542448,
"grad_norm": 0.236792653799057,
"learning_rate": 0.00020232172470978441,
"loss": 0.4899,
"step": 1188
},
{
"epoch": 0.655457552370452,
"grad_norm": 0.2327335923910141,
"learning_rate": 0.00020223880597014922,
"loss": 0.4915,
"step": 1189
},
{
"epoch": 0.6560088202866593,
"grad_norm": 0.21822991967201233,
"learning_rate": 0.00020215588723051408,
"loss": 0.472,
"step": 1190
},
{
"epoch": 0.6565600882028666,
"grad_norm": 0.2524334788322449,
"learning_rate": 0.0002020729684908789,
"loss": 0.4942,
"step": 1191
},
{
"epoch": 0.6571113561190739,
"grad_norm": 0.23585528135299683,
"learning_rate": 0.00020199004975124377,
"loss": 0.5011,
"step": 1192
},
{
"epoch": 0.6576626240352812,
"grad_norm": 0.24948836863040924,
"learning_rate": 0.0002019071310116086,
"loss": 0.4831,
"step": 1193
},
{
"epoch": 0.6582138919514884,
"grad_norm": 0.2369844615459442,
"learning_rate": 0.00020182421227197346,
"loss": 0.4923,
"step": 1194
},
{
"epoch": 0.6587651598676957,
"grad_norm": 0.22455725073814392,
"learning_rate": 0.0002017412935323383,
"loss": 0.4699,
"step": 1195
},
{
"epoch": 0.659316427783903,
"grad_norm": 0.22049696743488312,
"learning_rate": 0.00020165837479270315,
"loss": 0.4569,
"step": 1196
},
{
"epoch": 0.6598676957001103,
"grad_norm": 0.21964412927627563,
"learning_rate": 0.00020157545605306798,
"loss": 0.4818,
"step": 1197
},
{
"epoch": 0.6604189636163176,
"grad_norm": 0.24084921181201935,
"learning_rate": 0.00020149253731343284,
"loss": 0.4834,
"step": 1198
},
{
"epoch": 0.6609702315325248,
"grad_norm": 0.2169031798839569,
"learning_rate": 0.00020140961857379765,
"loss": 0.458,
"step": 1199
},
{
"epoch": 0.6615214994487321,
"grad_norm": 0.2437864989042282,
"learning_rate": 0.0002013266998341625,
"loss": 0.4998,
"step": 1200
},
{
"epoch": 0.6620727673649394,
"grad_norm": 0.2373666912317276,
"learning_rate": 0.00020124378109452734,
"loss": 0.4593,
"step": 1201
},
{
"epoch": 0.6626240352811467,
"grad_norm": 0.2300565093755722,
"learning_rate": 0.00020116086235489217,
"loss": 0.4698,
"step": 1202
},
{
"epoch": 0.663175303197354,
"grad_norm": 0.2500588595867157,
"learning_rate": 0.00020107794361525703,
"loss": 0.4847,
"step": 1203
},
{
"epoch": 0.6637265711135611,
"grad_norm": 0.24038562178611755,
"learning_rate": 0.00020099502487562186,
"loss": 0.4746,
"step": 1204
},
{
"epoch": 0.6642778390297684,
"grad_norm": 0.2691898047924042,
"learning_rate": 0.00020091210613598672,
"loss": 0.4547,
"step": 1205
},
{
"epoch": 0.6648291069459757,
"grad_norm": 0.23530587553977966,
"learning_rate": 0.00020082918739635155,
"loss": 0.4618,
"step": 1206
},
{
"epoch": 0.665380374862183,
"grad_norm": 0.24838554859161377,
"learning_rate": 0.0002007462686567164,
"loss": 0.5093,
"step": 1207
},
{
"epoch": 0.6659316427783903,
"grad_norm": 0.24996088445186615,
"learning_rate": 0.00020066334991708122,
"loss": 0.4703,
"step": 1208
},
{
"epoch": 0.6664829106945975,
"grad_norm": 0.2432130128145218,
"learning_rate": 0.00020058043117744608,
"loss": 0.4651,
"step": 1209
},
{
"epoch": 0.6670341786108048,
"grad_norm": 0.2394338846206665,
"learning_rate": 0.0002004975124378109,
"loss": 0.4679,
"step": 1210
},
{
"epoch": 0.6675854465270121,
"grad_norm": 0.23440587520599365,
"learning_rate": 0.00020041459369817577,
"loss": 0.4552,
"step": 1211
},
{
"epoch": 0.6681367144432194,
"grad_norm": 0.25409042835235596,
"learning_rate": 0.0002003316749585406,
"loss": 0.4879,
"step": 1212
},
{
"epoch": 0.6686879823594267,
"grad_norm": 0.24675914645195007,
"learning_rate": 0.00020024875621890546,
"loss": 0.4935,
"step": 1213
},
{
"epoch": 0.6692392502756339,
"grad_norm": 0.2398385852575302,
"learning_rate": 0.0002001658374792703,
"loss": 0.4588,
"step": 1214
},
{
"epoch": 0.6697905181918412,
"grad_norm": 0.23646225035190582,
"learning_rate": 0.00020008291873963515,
"loss": 0.486,
"step": 1215
},
{
"epoch": 0.6703417861080485,
"grad_norm": 0.2433752566576004,
"learning_rate": 0.00019999999999999998,
"loss": 0.5,
"step": 1216
},
{
"epoch": 0.6708930540242558,
"grad_norm": 0.22759981453418732,
"learning_rate": 0.00019991708126036484,
"loss": 0.482,
"step": 1217
},
{
"epoch": 0.6714443219404631,
"grad_norm": 0.2414034903049469,
"learning_rate": 0.00019983416252072965,
"loss": 0.4754,
"step": 1218
},
{
"epoch": 0.6719955898566703,
"grad_norm": 0.23548895120620728,
"learning_rate": 0.0001997512437810945,
"loss": 0.4793,
"step": 1219
},
{
"epoch": 0.6725468577728776,
"grad_norm": 0.22510850429534912,
"learning_rate": 0.00019966832504145934,
"loss": 0.474,
"step": 1220
},
{
"epoch": 0.6730981256890849,
"grad_norm": 0.21878324449062347,
"learning_rate": 0.0001995854063018242,
"loss": 0.4349,
"step": 1221
},
{
"epoch": 0.6736493936052922,
"grad_norm": 0.234661266207695,
"learning_rate": 0.00019950248756218903,
"loss": 0.4602,
"step": 1222
},
{
"epoch": 0.6742006615214995,
"grad_norm": 0.24233828485012054,
"learning_rate": 0.0001994195688225539,
"loss": 0.4932,
"step": 1223
},
{
"epoch": 0.6747519294377067,
"grad_norm": 0.22866547107696533,
"learning_rate": 0.00019933665008291872,
"loss": 0.4697,
"step": 1224
},
{
"epoch": 0.675303197353914,
"grad_norm": 0.2325911670923233,
"learning_rate": 0.00019925373134328358,
"loss": 0.4631,
"step": 1225
},
{
"epoch": 0.6758544652701213,
"grad_norm": 0.22702381014823914,
"learning_rate": 0.0001991708126036484,
"loss": 0.4631,
"step": 1226
},
{
"epoch": 0.6764057331863286,
"grad_norm": 0.23354612290859222,
"learning_rate": 0.00019908789386401327,
"loss": 0.4687,
"step": 1227
},
{
"epoch": 0.6769570011025359,
"grad_norm": 0.2386290431022644,
"learning_rate": 0.00019900497512437808,
"loss": 0.4777,
"step": 1228
},
{
"epoch": 0.6775082690187431,
"grad_norm": 0.24729053676128387,
"learning_rate": 0.00019892205638474293,
"loss": 0.4785,
"step": 1229
},
{
"epoch": 0.6780595369349504,
"grad_norm": 0.2109660655260086,
"learning_rate": 0.00019883913764510777,
"loss": 0.464,
"step": 1230
},
{
"epoch": 0.6786108048511577,
"grad_norm": 0.24349510669708252,
"learning_rate": 0.00019875621890547263,
"loss": 0.4972,
"step": 1231
},
{
"epoch": 0.679162072767365,
"grad_norm": 0.236436665058136,
"learning_rate": 0.00019867330016583746,
"loss": 0.4655,
"step": 1232
},
{
"epoch": 0.6797133406835723,
"grad_norm": 0.22133763134479523,
"learning_rate": 0.00019859038142620232,
"loss": 0.4856,
"step": 1233
},
{
"epoch": 0.6802646085997794,
"grad_norm": 0.23461799323558807,
"learning_rate": 0.00019850746268656715,
"loss": 0.4974,
"step": 1234
},
{
"epoch": 0.6808158765159867,
"grad_norm": 0.23802213370800018,
"learning_rate": 0.000198424543946932,
"loss": 0.4634,
"step": 1235
},
{
"epoch": 0.681367144432194,
"grad_norm": 0.23866182565689087,
"learning_rate": 0.00019834162520729684,
"loss": 0.4962,
"step": 1236
},
{
"epoch": 0.6819184123484013,
"grad_norm": 0.20461198687553406,
"learning_rate": 0.0001982587064676617,
"loss": 0.479,
"step": 1237
},
{
"epoch": 0.6824696802646086,
"grad_norm": 0.2442476749420166,
"learning_rate": 0.0001981757877280265,
"loss": 0.5007,
"step": 1238
},
{
"epoch": 0.6830209481808158,
"grad_norm": 0.2257671356201172,
"learning_rate": 0.00019809286898839136,
"loss": 0.4899,
"step": 1239
},
{
"epoch": 0.6835722160970231,
"grad_norm": 0.214102640748024,
"learning_rate": 0.0001980099502487562,
"loss": 0.4536,
"step": 1240
},
{
"epoch": 0.6841234840132304,
"grad_norm": 0.21543948352336884,
"learning_rate": 0.00019792703150912105,
"loss": 0.4811,
"step": 1241
},
{
"epoch": 0.6846747519294377,
"grad_norm": 0.25430455803871155,
"learning_rate": 0.00019784411276948589,
"loss": 0.486,
"step": 1242
},
{
"epoch": 0.685226019845645,
"grad_norm": 0.2656538486480713,
"learning_rate": 0.00019776119402985075,
"loss": 0.462,
"step": 1243
},
{
"epoch": 0.6857772877618522,
"grad_norm": 0.23967699706554413,
"learning_rate": 0.00019767827529021558,
"loss": 0.5004,
"step": 1244
},
{
"epoch": 0.6863285556780595,
"grad_norm": 0.22987446188926697,
"learning_rate": 0.00019759535655058044,
"loss": 0.4804,
"step": 1245
},
{
"epoch": 0.6868798235942668,
"grad_norm": 0.20953255891799927,
"learning_rate": 0.00019751243781094527,
"loss": 0.4793,
"step": 1246
},
{
"epoch": 0.6874310915104741,
"grad_norm": 0.256028413772583,
"learning_rate": 0.00019742951907131007,
"loss": 0.4881,
"step": 1247
},
{
"epoch": 0.6879823594266814,
"grad_norm": 0.23885922133922577,
"learning_rate": 0.00019734660033167493,
"loss": 0.508,
"step": 1248
},
{
"epoch": 0.6885336273428887,
"grad_norm": 0.24736814200878143,
"learning_rate": 0.00019726368159203976,
"loss": 0.4935,
"step": 1249
},
{
"epoch": 0.6890848952590959,
"grad_norm": 0.23237743973731995,
"learning_rate": 0.00019718076285240462,
"loss": 0.4775,
"step": 1250
},
{
"epoch": 0.6896361631753032,
"grad_norm": 0.24340516328811646,
"learning_rate": 0.00019709784411276946,
"loss": 0.4987,
"step": 1251
},
{
"epoch": 0.6901874310915105,
"grad_norm": 0.22015541791915894,
"learning_rate": 0.00019701492537313432,
"loss": 0.4524,
"step": 1252
},
{
"epoch": 0.6907386990077178,
"grad_norm": 0.25280436873435974,
"learning_rate": 0.00019693200663349915,
"loss": 0.4953,
"step": 1253
},
{
"epoch": 0.6912899669239251,
"grad_norm": 0.22572125494480133,
"learning_rate": 0.000196849087893864,
"loss": 0.4692,
"step": 1254
},
{
"epoch": 0.6918412348401323,
"grad_norm": 0.2326386719942093,
"learning_rate": 0.00019676616915422884,
"loss": 0.475,
"step": 1255
},
{
"epoch": 0.6923925027563396,
"grad_norm": 0.2248145192861557,
"learning_rate": 0.0001966832504145937,
"loss": 0.4463,
"step": 1256
},
{
"epoch": 0.6929437706725469,
"grad_norm": 0.236514613032341,
"learning_rate": 0.0001966003316749585,
"loss": 0.4502,
"step": 1257
},
{
"epoch": 0.6934950385887542,
"grad_norm": 0.2295265942811966,
"learning_rate": 0.00019651741293532336,
"loss": 0.4559,
"step": 1258
},
{
"epoch": 0.6940463065049615,
"grad_norm": 0.24026772379875183,
"learning_rate": 0.0001964344941956882,
"loss": 0.4642,
"step": 1259
},
{
"epoch": 0.6945975744211687,
"grad_norm": 0.2558375298976898,
"learning_rate": 0.00019635157545605305,
"loss": 0.4864,
"step": 1260
},
{
"epoch": 0.695148842337376,
"grad_norm": 0.2334502935409546,
"learning_rate": 0.00019626865671641788,
"loss": 0.47,
"step": 1261
},
{
"epoch": 0.6957001102535832,
"grad_norm": 0.23098182678222656,
"learning_rate": 0.00019618573797678274,
"loss": 0.4786,
"step": 1262
},
{
"epoch": 0.6962513781697905,
"grad_norm": 0.22288668155670166,
"learning_rate": 0.00019610281923714758,
"loss": 0.4638,
"step": 1263
},
{
"epoch": 0.6968026460859978,
"grad_norm": 0.23454713821411133,
"learning_rate": 0.00019601990049751244,
"loss": 0.4661,
"step": 1264
},
{
"epoch": 0.697353914002205,
"grad_norm": 0.22980453073978424,
"learning_rate": 0.00019593698175787727,
"loss": 0.4681,
"step": 1265
},
{
"epoch": 0.6979051819184123,
"grad_norm": 0.20731012523174286,
"learning_rate": 0.00019585406301824213,
"loss": 0.4439,
"step": 1266
},
{
"epoch": 0.6984564498346196,
"grad_norm": 0.22292488813400269,
"learning_rate": 0.00019577114427860693,
"loss": 0.4663,
"step": 1267
},
{
"epoch": 0.6990077177508269,
"grad_norm": 0.22497937083244324,
"learning_rate": 0.0001956882255389718,
"loss": 0.4751,
"step": 1268
},
{
"epoch": 0.6995589856670342,
"grad_norm": 0.2342757284641266,
"learning_rate": 0.00019560530679933662,
"loss": 0.4544,
"step": 1269
},
{
"epoch": 0.7001102535832414,
"grad_norm": 0.23075568675994873,
"learning_rate": 0.00019552238805970148,
"loss": 0.4634,
"step": 1270
},
{
"epoch": 0.7006615214994487,
"grad_norm": 0.2278735637664795,
"learning_rate": 0.0001954394693200663,
"loss": 0.4895,
"step": 1271
},
{
"epoch": 0.701212789415656,
"grad_norm": 0.25607171654701233,
"learning_rate": 0.00019535655058043117,
"loss": 0.49,
"step": 1272
},
{
"epoch": 0.7017640573318633,
"grad_norm": 0.2315627932548523,
"learning_rate": 0.000195273631840796,
"loss": 0.4522,
"step": 1273
},
{
"epoch": 0.7023153252480706,
"grad_norm": 0.2047976851463318,
"learning_rate": 0.00019519071310116086,
"loss": 0.4356,
"step": 1274
},
{
"epoch": 0.7028665931642778,
"grad_norm": 0.24180057644844055,
"learning_rate": 0.00019510779436152567,
"loss": 0.4749,
"step": 1275
},
{
"epoch": 0.7034178610804851,
"grad_norm": 0.2599826753139496,
"learning_rate": 0.00019502487562189055,
"loss": 0.5082,
"step": 1276
},
{
"epoch": 0.7039691289966924,
"grad_norm": 0.23944783210754395,
"learning_rate": 0.00019494195688225536,
"loss": 0.4828,
"step": 1277
},
{
"epoch": 0.7045203969128997,
"grad_norm": 0.21794094145298004,
"learning_rate": 0.00019485903814262022,
"loss": 0.4691,
"step": 1278
},
{
"epoch": 0.705071664829107,
"grad_norm": 0.23379597067832947,
"learning_rate": 0.00019477611940298505,
"loss": 0.486,
"step": 1279
},
{
"epoch": 0.7056229327453142,
"grad_norm": 0.21778427064418793,
"learning_rate": 0.0001946932006633499,
"loss": 0.4483,
"step": 1280
},
{
"epoch": 0.7061742006615215,
"grad_norm": 0.23941390216350555,
"learning_rate": 0.00019461028192371474,
"loss": 0.4885,
"step": 1281
},
{
"epoch": 0.7067254685777288,
"grad_norm": 0.23993995785713196,
"learning_rate": 0.0001945273631840796,
"loss": 0.5098,
"step": 1282
},
{
"epoch": 0.7072767364939361,
"grad_norm": 0.2523173391819,
"learning_rate": 0.00019444444444444443,
"loss": 0.4752,
"step": 1283
},
{
"epoch": 0.7078280044101434,
"grad_norm": 0.23337773978710175,
"learning_rate": 0.0001943615257048093,
"loss": 0.5198,
"step": 1284
},
{
"epoch": 0.7083792723263506,
"grad_norm": 0.24418905377388,
"learning_rate": 0.0001942786069651741,
"loss": 0.519,
"step": 1285
},
{
"epoch": 0.7089305402425579,
"grad_norm": 0.24214893579483032,
"learning_rate": 0.00019419568822553896,
"loss": 0.4625,
"step": 1286
},
{
"epoch": 0.7094818081587652,
"grad_norm": 0.25616276264190674,
"learning_rate": 0.0001941127694859038,
"loss": 0.483,
"step": 1287
},
{
"epoch": 0.7100330760749725,
"grad_norm": 0.2434643656015396,
"learning_rate": 0.00019402985074626865,
"loss": 0.4834,
"step": 1288
},
{
"epoch": 0.7105843439911798,
"grad_norm": 0.23342913389205933,
"learning_rate": 0.00019394693200663348,
"loss": 0.4577,
"step": 1289
},
{
"epoch": 0.7111356119073869,
"grad_norm": 0.23564305901527405,
"learning_rate": 0.0001938640132669983,
"loss": 0.4731,
"step": 1290
},
{
"epoch": 0.7116868798235942,
"grad_norm": 0.2814309000968933,
"learning_rate": 0.00019378109452736317,
"loss": 0.4845,
"step": 1291
},
{
"epoch": 0.7122381477398015,
"grad_norm": 0.2305363267660141,
"learning_rate": 0.000193698175787728,
"loss": 0.4577,
"step": 1292
},
{
"epoch": 0.7127894156560088,
"grad_norm": 0.2413802593946457,
"learning_rate": 0.00019361525704809286,
"loss": 0.5005,
"step": 1293
},
{
"epoch": 0.7133406835722161,
"grad_norm": 0.22398939728736877,
"learning_rate": 0.00019353233830845767,
"loss": 0.4645,
"step": 1294
},
{
"epoch": 0.7138919514884233,
"grad_norm": 0.2510089874267578,
"learning_rate": 0.00019344941956882253,
"loss": 0.4721,
"step": 1295
},
{
"epoch": 0.7144432194046306,
"grad_norm": 0.23676623404026031,
"learning_rate": 0.00019336650082918736,
"loss": 0.5126,
"step": 1296
},
{
"epoch": 0.7149944873208379,
"grad_norm": 0.22751228511333466,
"learning_rate": 0.00019328358208955222,
"loss": 0.4403,
"step": 1297
},
{
"epoch": 0.7155457552370452,
"grad_norm": 0.23468491435050964,
"learning_rate": 0.00019320066334991705,
"loss": 0.4697,
"step": 1298
},
{
"epoch": 0.7160970231532525,
"grad_norm": 0.2132336050271988,
"learning_rate": 0.0001931177446102819,
"loss": 0.4468,
"step": 1299
},
{
"epoch": 0.7166482910694597,
"grad_norm": 0.22579894959926605,
"learning_rate": 0.00019303482587064674,
"loss": 0.458,
"step": 1300
},
{
"epoch": 0.717199558985667,
"grad_norm": 0.22772036492824554,
"learning_rate": 0.0001929519071310116,
"loss": 0.457,
"step": 1301
},
{
"epoch": 0.7177508269018743,
"grad_norm": 0.2290082722902298,
"learning_rate": 0.00019286898839137643,
"loss": 0.4771,
"step": 1302
},
{
"epoch": 0.7183020948180816,
"grad_norm": 0.2190980762243271,
"learning_rate": 0.0001927860696517413,
"loss": 0.4754,
"step": 1303
},
{
"epoch": 0.7188533627342889,
"grad_norm": 0.2228933423757553,
"learning_rate": 0.0001927031509121061,
"loss": 0.476,
"step": 1304
},
{
"epoch": 0.7194046306504961,
"grad_norm": 0.23896026611328125,
"learning_rate": 0.00019262023217247096,
"loss": 0.5008,
"step": 1305
},
{
"epoch": 0.7199558985667034,
"grad_norm": 0.222875714302063,
"learning_rate": 0.0001925373134328358,
"loss": 0.4526,
"step": 1306
},
{
"epoch": 0.7205071664829107,
"grad_norm": 0.22457565367221832,
"learning_rate": 0.00019245439469320065,
"loss": 0.5019,
"step": 1307
},
{
"epoch": 0.721058434399118,
"grad_norm": 0.24464376270771027,
"learning_rate": 0.00019237147595356548,
"loss": 0.4896,
"step": 1308
},
{
"epoch": 0.7216097023153253,
"grad_norm": 0.22952450811862946,
"learning_rate": 0.00019228855721393034,
"loss": 0.4751,
"step": 1309
},
{
"epoch": 0.7221609702315325,
"grad_norm": 0.22557076811790466,
"learning_rate": 0.00019220563847429517,
"loss": 0.4859,
"step": 1310
},
{
"epoch": 0.7227122381477398,
"grad_norm": 0.2599719762802124,
"learning_rate": 0.00019212271973466003,
"loss": 0.4871,
"step": 1311
},
{
"epoch": 0.7232635060639471,
"grad_norm": 0.2541002333164215,
"learning_rate": 0.00019203980099502486,
"loss": 0.5076,
"step": 1312
},
{
"epoch": 0.7238147739801544,
"grad_norm": 0.234733447432518,
"learning_rate": 0.00019195688225538972,
"loss": 0.471,
"step": 1313
},
{
"epoch": 0.7243660418963617,
"grad_norm": 0.23307423293590546,
"learning_rate": 0.00019187396351575452,
"loss": 0.4758,
"step": 1314
},
{
"epoch": 0.7249173098125689,
"grad_norm": 0.22905585169792175,
"learning_rate": 0.00019179104477611938,
"loss": 0.4674,
"step": 1315
},
{
"epoch": 0.7254685777287762,
"grad_norm": 0.24311380088329315,
"learning_rate": 0.00019170812603648422,
"loss": 0.4838,
"step": 1316
},
{
"epoch": 0.7260198456449835,
"grad_norm": 0.24221283197402954,
"learning_rate": 0.00019162520729684907,
"loss": 0.4671,
"step": 1317
},
{
"epoch": 0.7265711135611908,
"grad_norm": 0.2364143580198288,
"learning_rate": 0.0001915422885572139,
"loss": 0.4496,
"step": 1318
},
{
"epoch": 0.727122381477398,
"grad_norm": 0.2382567673921585,
"learning_rate": 0.00019145936981757877,
"loss": 0.4516,
"step": 1319
},
{
"epoch": 0.7276736493936052,
"grad_norm": 0.281539648771286,
"learning_rate": 0.0001913764510779436,
"loss": 0.4742,
"step": 1320
},
{
"epoch": 0.7282249173098125,
"grad_norm": 0.2738378345966339,
"learning_rate": 0.00019129353233830846,
"loss": 0.5158,
"step": 1321
},
{
"epoch": 0.7287761852260198,
"grad_norm": 0.23668839037418365,
"learning_rate": 0.0001912106135986733,
"loss": 0.4907,
"step": 1322
},
{
"epoch": 0.7293274531422271,
"grad_norm": 0.2443835288286209,
"learning_rate": 0.00019112769485903815,
"loss": 0.4887,
"step": 1323
},
{
"epoch": 0.7298787210584344,
"grad_norm": 0.2538048028945923,
"learning_rate": 0.00019104477611940295,
"loss": 0.4413,
"step": 1324
},
{
"epoch": 0.7304299889746417,
"grad_norm": 0.24266113340854645,
"learning_rate": 0.0001909618573797678,
"loss": 0.4618,
"step": 1325
},
{
"epoch": 0.7309812568908489,
"grad_norm": 0.2522546648979187,
"learning_rate": 0.00019087893864013264,
"loss": 0.493,
"step": 1326
},
{
"epoch": 0.7315325248070562,
"grad_norm": 0.24361646175384521,
"learning_rate": 0.0001907960199004975,
"loss": 0.4552,
"step": 1327
},
{
"epoch": 0.7320837927232635,
"grad_norm": 0.24726730585098267,
"learning_rate": 0.00019071310116086234,
"loss": 0.4899,
"step": 1328
},
{
"epoch": 0.7326350606394708,
"grad_norm": 0.23533383011817932,
"learning_rate": 0.0001906301824212272,
"loss": 0.4674,
"step": 1329
},
{
"epoch": 0.7331863285556781,
"grad_norm": 0.23652805387973785,
"learning_rate": 0.00019054726368159203,
"loss": 0.4734,
"step": 1330
},
{
"epoch": 0.7337375964718853,
"grad_norm": 0.24334965646266937,
"learning_rate": 0.00019046434494195689,
"loss": 0.4897,
"step": 1331
},
{
"epoch": 0.7342888643880926,
"grad_norm": 0.2077738642692566,
"learning_rate": 0.00019038142620232172,
"loss": 0.4516,
"step": 1332
},
{
"epoch": 0.7348401323042999,
"grad_norm": 0.23306086659431458,
"learning_rate": 0.00019029850746268658,
"loss": 0.5076,
"step": 1333
},
{
"epoch": 0.7353914002205072,
"grad_norm": 0.2449159324169159,
"learning_rate": 0.00019021558872305138,
"loss": 0.4618,
"step": 1334
},
{
"epoch": 0.7359426681367145,
"grad_norm": 0.24829532206058502,
"learning_rate": 0.00019013266998341621,
"loss": 0.4614,
"step": 1335
},
{
"epoch": 0.7364939360529217,
"grad_norm": 0.23648925125598907,
"learning_rate": 0.00019004975124378107,
"loss": 0.4616,
"step": 1336
},
{
"epoch": 0.737045203969129,
"grad_norm": 0.23551128804683685,
"learning_rate": 0.0001899668325041459,
"loss": 0.4724,
"step": 1337
},
{
"epoch": 0.7375964718853363,
"grad_norm": 0.23878498375415802,
"learning_rate": 0.00018988391376451076,
"loss": 0.4639,
"step": 1338
},
{
"epoch": 0.7381477398015436,
"grad_norm": 0.24612358212471008,
"learning_rate": 0.0001898009950248756,
"loss": 0.4757,
"step": 1339
},
{
"epoch": 0.7386990077177509,
"grad_norm": 0.2288011610507965,
"learning_rate": 0.00018971807628524046,
"loss": 0.4598,
"step": 1340
},
{
"epoch": 0.7392502756339581,
"grad_norm": 0.2329450398683548,
"learning_rate": 0.0001896351575456053,
"loss": 0.4884,
"step": 1341
},
{
"epoch": 0.7398015435501654,
"grad_norm": 0.23273812234401703,
"learning_rate": 0.00018955223880597015,
"loss": 0.4834,
"step": 1342
},
{
"epoch": 0.7403528114663727,
"grad_norm": 0.24095992743968964,
"learning_rate": 0.00018946932006633495,
"loss": 0.4352,
"step": 1343
},
{
"epoch": 0.74090407938258,
"grad_norm": 0.24149319529533386,
"learning_rate": 0.0001893864013266998,
"loss": 0.4675,
"step": 1344
},
{
"epoch": 0.7414553472987873,
"grad_norm": 0.24013857543468475,
"learning_rate": 0.00018930348258706464,
"loss": 0.4879,
"step": 1345
},
{
"epoch": 0.7420066152149944,
"grad_norm": 0.23142081499099731,
"learning_rate": 0.0001892205638474295,
"loss": 0.4607,
"step": 1346
},
{
"epoch": 0.7425578831312017,
"grad_norm": 0.2619989514350891,
"learning_rate": 0.00018913764510779433,
"loss": 0.4784,
"step": 1347
},
{
"epoch": 0.743109151047409,
"grad_norm": 0.23706799745559692,
"learning_rate": 0.0001890547263681592,
"loss": 0.4716,
"step": 1348
},
{
"epoch": 0.7436604189636163,
"grad_norm": 0.25641632080078125,
"learning_rate": 0.00018897180762852403,
"loss": 0.4951,
"step": 1349
},
{
"epoch": 0.7442116868798236,
"grad_norm": 0.225026935338974,
"learning_rate": 0.00018888888888888888,
"loss": 0.4742,
"step": 1350
},
{
"epoch": 0.7447629547960308,
"grad_norm": 0.21225763857364655,
"learning_rate": 0.00018880597014925372,
"loss": 0.4484,
"step": 1351
},
{
"epoch": 0.7453142227122381,
"grad_norm": 0.2503174841403961,
"learning_rate": 0.00018872305140961858,
"loss": 0.4832,
"step": 1352
},
{
"epoch": 0.7458654906284454,
"grad_norm": 0.25594860315322876,
"learning_rate": 0.00018864013266998338,
"loss": 0.4952,
"step": 1353
},
{
"epoch": 0.7464167585446527,
"grad_norm": 0.23849812150001526,
"learning_rate": 0.00018855721393034824,
"loss": 0.4927,
"step": 1354
},
{
"epoch": 0.74696802646086,
"grad_norm": 0.22114640474319458,
"learning_rate": 0.00018847429519071307,
"loss": 0.4475,
"step": 1355
},
{
"epoch": 0.7475192943770672,
"grad_norm": 0.23791830241680145,
"learning_rate": 0.00018839137645107793,
"loss": 0.4846,
"step": 1356
},
{
"epoch": 0.7480705622932745,
"grad_norm": 0.2577480673789978,
"learning_rate": 0.00018830845771144276,
"loss": 0.4541,
"step": 1357
},
{
"epoch": 0.7486218302094818,
"grad_norm": 0.2754758596420288,
"learning_rate": 0.00018822553897180762,
"loss": 0.4958,
"step": 1358
},
{
"epoch": 0.7491730981256891,
"grad_norm": 0.2309567779302597,
"learning_rate": 0.00018814262023217245,
"loss": 0.4671,
"step": 1359
},
{
"epoch": 0.7497243660418964,
"grad_norm": 0.24164016544818878,
"learning_rate": 0.0001880597014925373,
"loss": 0.4712,
"step": 1360
},
{
"epoch": 0.7502756339581036,
"grad_norm": 0.21853327751159668,
"learning_rate": 0.00018797678275290215,
"loss": 0.503,
"step": 1361
},
{
"epoch": 0.7508269018743109,
"grad_norm": 0.22078783810138702,
"learning_rate": 0.000187893864013267,
"loss": 0.4654,
"step": 1362
},
{
"epoch": 0.7513781697905182,
"grad_norm": 0.23638005554676056,
"learning_rate": 0.0001878109452736318,
"loss": 0.4742,
"step": 1363
},
{
"epoch": 0.7519294377067255,
"grad_norm": 0.23174162209033966,
"learning_rate": 0.00018772802653399667,
"loss": 0.4599,
"step": 1364
},
{
"epoch": 0.7524807056229328,
"grad_norm": 0.23956626653671265,
"learning_rate": 0.0001876451077943615,
"loss": 0.477,
"step": 1365
},
{
"epoch": 0.75303197353914,
"grad_norm": 0.23747730255126953,
"learning_rate": 0.00018756218905472636,
"loss": 0.46,
"step": 1366
},
{
"epoch": 0.7535832414553473,
"grad_norm": 0.22467990219593048,
"learning_rate": 0.0001874792703150912,
"loss": 0.4502,
"step": 1367
},
{
"epoch": 0.7541345093715546,
"grad_norm": 0.230741485953331,
"learning_rate": 0.00018739635157545605,
"loss": 0.4718,
"step": 1368
},
{
"epoch": 0.7546857772877619,
"grad_norm": 0.24028630554676056,
"learning_rate": 0.00018731343283582088,
"loss": 0.4619,
"step": 1369
},
{
"epoch": 0.7552370452039692,
"grad_norm": 0.24253641068935394,
"learning_rate": 0.00018723051409618574,
"loss": 0.4817,
"step": 1370
},
{
"epoch": 0.7557883131201764,
"grad_norm": 0.22565878927707672,
"learning_rate": 0.00018714759535655057,
"loss": 0.4663,
"step": 1371
},
{
"epoch": 0.7563395810363837,
"grad_norm": 0.23143254220485687,
"learning_rate": 0.00018706467661691543,
"loss": 0.4536,
"step": 1372
},
{
"epoch": 0.756890848952591,
"grad_norm": 0.23320366442203522,
"learning_rate": 0.00018698175787728024,
"loss": 0.4304,
"step": 1373
},
{
"epoch": 0.7574421168687983,
"grad_norm": 0.23350325226783752,
"learning_rate": 0.0001868988391376451,
"loss": 0.4649,
"step": 1374
},
{
"epoch": 0.7579933847850056,
"grad_norm": 0.2501453757286072,
"learning_rate": 0.00018681592039800993,
"loss": 0.4696,
"step": 1375
},
{
"epoch": 0.7585446527012127,
"grad_norm": 0.22919632494449615,
"learning_rate": 0.0001867330016583748,
"loss": 0.4751,
"step": 1376
},
{
"epoch": 0.75909592061742,
"grad_norm": 0.2562139332294464,
"learning_rate": 0.00018665008291873962,
"loss": 0.49,
"step": 1377
},
{
"epoch": 0.7596471885336273,
"grad_norm": 0.2472946047782898,
"learning_rate": 0.00018656716417910445,
"loss": 0.4873,
"step": 1378
},
{
"epoch": 0.7601984564498346,
"grad_norm": 0.22273144125938416,
"learning_rate": 0.0001864842454394693,
"loss": 0.4569,
"step": 1379
},
{
"epoch": 0.7607497243660419,
"grad_norm": 0.24337974190711975,
"learning_rate": 0.00018640132669983414,
"loss": 0.4717,
"step": 1380
},
{
"epoch": 0.7613009922822491,
"grad_norm": 0.23919668793678284,
"learning_rate": 0.000186318407960199,
"loss": 0.4966,
"step": 1381
},
{
"epoch": 0.7618522601984564,
"grad_norm": 0.25102800130844116,
"learning_rate": 0.0001862354892205638,
"loss": 0.4551,
"step": 1382
},
{
"epoch": 0.7624035281146637,
"grad_norm": 0.22430755198001862,
"learning_rate": 0.00018615257048092867,
"loss": 0.4628,
"step": 1383
},
{
"epoch": 0.762954796030871,
"grad_norm": 0.2542060613632202,
"learning_rate": 0.0001860696517412935,
"loss": 0.474,
"step": 1384
},
{
"epoch": 0.7635060639470783,
"grad_norm": 0.24267995357513428,
"learning_rate": 0.00018598673300165836,
"loss": 0.4709,
"step": 1385
},
{
"epoch": 0.7640573318632855,
"grad_norm": 0.24730850756168365,
"learning_rate": 0.0001859038142620232,
"loss": 0.4703,
"step": 1386
},
{
"epoch": 0.7646085997794928,
"grad_norm": 0.22491230070590973,
"learning_rate": 0.00018582089552238805,
"loss": 0.4572,
"step": 1387
},
{
"epoch": 0.7651598676957001,
"grad_norm": 0.25823476910591125,
"learning_rate": 0.00018573797678275288,
"loss": 0.4911,
"step": 1388
},
{
"epoch": 0.7657111356119074,
"grad_norm": 0.2442496418952942,
"learning_rate": 0.00018565505804311774,
"loss": 0.4514,
"step": 1389
},
{
"epoch": 0.7662624035281147,
"grad_norm": 0.22842232882976532,
"learning_rate": 0.00018557213930348257,
"loss": 0.459,
"step": 1390
},
{
"epoch": 0.7668136714443219,
"grad_norm": 0.24691414833068848,
"learning_rate": 0.00018548922056384743,
"loss": 0.4958,
"step": 1391
},
{
"epoch": 0.7673649393605292,
"grad_norm": 0.22024598717689514,
"learning_rate": 0.00018540630182421224,
"loss": 0.4621,
"step": 1392
},
{
"epoch": 0.7679162072767365,
"grad_norm": 0.24100075662136078,
"learning_rate": 0.0001853233830845771,
"loss": 0.486,
"step": 1393
},
{
"epoch": 0.7684674751929438,
"grad_norm": 0.2123764157295227,
"learning_rate": 0.00018524046434494193,
"loss": 0.4575,
"step": 1394
},
{
"epoch": 0.7690187431091511,
"grad_norm": 0.239015132188797,
"learning_rate": 0.0001851575456053068,
"loss": 0.4777,
"step": 1395
},
{
"epoch": 0.7695700110253583,
"grad_norm": 0.22858455777168274,
"learning_rate": 0.00018507462686567162,
"loss": 0.438,
"step": 1396
},
{
"epoch": 0.7701212789415656,
"grad_norm": 0.23843710124492645,
"learning_rate": 0.00018499170812603648,
"loss": 0.456,
"step": 1397
},
{
"epoch": 0.7706725468577729,
"grad_norm": 0.23079745471477509,
"learning_rate": 0.0001849087893864013,
"loss": 0.4648,
"step": 1398
},
{
"epoch": 0.7712238147739802,
"grad_norm": 0.23103727400302887,
"learning_rate": 0.00018482587064676617,
"loss": 0.4589,
"step": 1399
},
{
"epoch": 0.7717750826901875,
"grad_norm": 0.2261170893907547,
"learning_rate": 0.00018474295190713097,
"loss": 0.4734,
"step": 1400
},
{
"epoch": 0.7723263506063948,
"grad_norm": 0.2249629944562912,
"learning_rate": 0.00018466003316749586,
"loss": 0.4542,
"step": 1401
},
{
"epoch": 0.772877618522602,
"grad_norm": 0.2366032898426056,
"learning_rate": 0.00018457711442786067,
"loss": 0.458,
"step": 1402
},
{
"epoch": 0.7734288864388092,
"grad_norm": 0.2598401606082916,
"learning_rate": 0.00018449419568822552,
"loss": 0.4557,
"step": 1403
},
{
"epoch": 0.7739801543550165,
"grad_norm": 0.23570790886878967,
"learning_rate": 0.00018441127694859036,
"loss": 0.4656,
"step": 1404
},
{
"epoch": 0.7745314222712238,
"grad_norm": 0.23591196537017822,
"learning_rate": 0.00018432835820895522,
"loss": 0.4689,
"step": 1405
},
{
"epoch": 0.7750826901874311,
"grad_norm": 0.2540998160839081,
"learning_rate": 0.00018424543946932005,
"loss": 0.4977,
"step": 1406
},
{
"epoch": 0.7756339581036383,
"grad_norm": 0.22981034219264984,
"learning_rate": 0.0001841625207296849,
"loss": 0.4718,
"step": 1407
},
{
"epoch": 0.7761852260198456,
"grad_norm": 0.2221202403306961,
"learning_rate": 0.00018407960199004974,
"loss": 0.4784,
"step": 1408
},
{
"epoch": 0.7767364939360529,
"grad_norm": 0.2501460909843445,
"learning_rate": 0.0001839966832504146,
"loss": 0.4806,
"step": 1409
},
{
"epoch": 0.7772877618522602,
"grad_norm": 0.2174586057662964,
"learning_rate": 0.0001839137645107794,
"loss": 0.4833,
"step": 1410
},
{
"epoch": 0.7778390297684675,
"grad_norm": 0.2424350082874298,
"learning_rate": 0.00018383084577114426,
"loss": 0.4902,
"step": 1411
},
{
"epoch": 0.7783902976846747,
"grad_norm": 0.25260457396507263,
"learning_rate": 0.0001837479270315091,
"loss": 0.4843,
"step": 1412
},
{
"epoch": 0.778941565600882,
"grad_norm": 0.27532869577407837,
"learning_rate": 0.00018366500829187395,
"loss": 0.4914,
"step": 1413
},
{
"epoch": 0.7794928335170893,
"grad_norm": 0.24072158336639404,
"learning_rate": 0.00018358208955223879,
"loss": 0.4888,
"step": 1414
},
{
"epoch": 0.7800441014332966,
"grad_norm": 0.24182955920696259,
"learning_rate": 0.00018349917081260364,
"loss": 0.4589,
"step": 1415
},
{
"epoch": 0.7805953693495039,
"grad_norm": 0.25824496150016785,
"learning_rate": 0.00018341625207296848,
"loss": 0.4868,
"step": 1416
},
{
"epoch": 0.7811466372657111,
"grad_norm": 0.2336832731962204,
"learning_rate": 0.00018333333333333334,
"loss": 0.472,
"step": 1417
},
{
"epoch": 0.7816979051819184,
"grad_norm": 0.24849727749824524,
"learning_rate": 0.00018325041459369817,
"loss": 0.4743,
"step": 1418
},
{
"epoch": 0.7822491730981257,
"grad_norm": 0.21890904009342194,
"learning_rate": 0.00018316749585406303,
"loss": 0.465,
"step": 1419
},
{
"epoch": 0.782800441014333,
"grad_norm": 0.2601034343242645,
"learning_rate": 0.00018308457711442783,
"loss": 0.4531,
"step": 1420
},
{
"epoch": 0.7833517089305403,
"grad_norm": 0.2441786229610443,
"learning_rate": 0.0001830016583747927,
"loss": 0.4536,
"step": 1421
},
{
"epoch": 0.7839029768467475,
"grad_norm": 0.2240273654460907,
"learning_rate": 0.00018291873963515752,
"loss": 0.461,
"step": 1422
},
{
"epoch": 0.7844542447629548,
"grad_norm": 0.2334737479686737,
"learning_rate": 0.00018283582089552235,
"loss": 0.4779,
"step": 1423
},
{
"epoch": 0.7850055126791621,
"grad_norm": 0.23395971953868866,
"learning_rate": 0.00018275290215588721,
"loss": 0.4585,
"step": 1424
},
{
"epoch": 0.7855567805953694,
"grad_norm": 0.24163080751895905,
"learning_rate": 0.00018266998341625205,
"loss": 0.4781,
"step": 1425
},
{
"epoch": 0.7861080485115767,
"grad_norm": 0.23681163787841797,
"learning_rate": 0.0001825870646766169,
"loss": 0.4518,
"step": 1426
},
{
"epoch": 0.7866593164277839,
"grad_norm": 0.2450489103794098,
"learning_rate": 0.00018250414593698174,
"loss": 0.4741,
"step": 1427
},
{
"epoch": 0.7872105843439912,
"grad_norm": 0.23335276544094086,
"learning_rate": 0.0001824212271973466,
"loss": 0.4938,
"step": 1428
},
{
"epoch": 0.7877618522601985,
"grad_norm": 0.22969652712345123,
"learning_rate": 0.0001823383084577114,
"loss": 0.4577,
"step": 1429
},
{
"epoch": 0.7883131201764058,
"grad_norm": 0.2162095010280609,
"learning_rate": 0.00018225538971807626,
"loss": 0.4632,
"step": 1430
},
{
"epoch": 0.7888643880926131,
"grad_norm": 0.2445029318332672,
"learning_rate": 0.0001821724709784411,
"loss": 0.4657,
"step": 1431
},
{
"epoch": 0.7894156560088202,
"grad_norm": 0.21864482760429382,
"learning_rate": 0.00018208955223880595,
"loss": 0.4759,
"step": 1432
},
{
"epoch": 0.7899669239250275,
"grad_norm": 0.24577899277210236,
"learning_rate": 0.00018200663349917078,
"loss": 0.4717,
"step": 1433
},
{
"epoch": 0.7905181918412348,
"grad_norm": 0.21177740395069122,
"learning_rate": 0.00018192371475953564,
"loss": 0.4564,
"step": 1434
},
{
"epoch": 0.7910694597574421,
"grad_norm": 0.2460215985774994,
"learning_rate": 0.00018184079601990047,
"loss": 0.4921,
"step": 1435
},
{
"epoch": 0.7916207276736494,
"grad_norm": 0.24731247127056122,
"learning_rate": 0.00018175787728026533,
"loss": 0.4655,
"step": 1436
},
{
"epoch": 0.7921719955898566,
"grad_norm": 0.24188898503780365,
"learning_rate": 0.00018167495854063017,
"loss": 0.4665,
"step": 1437
},
{
"epoch": 0.7927232635060639,
"grad_norm": 0.2347448617219925,
"learning_rate": 0.00018159203980099502,
"loss": 0.4563,
"step": 1438
},
{
"epoch": 0.7932745314222712,
"grad_norm": 0.242751806974411,
"learning_rate": 0.00018150912106135983,
"loss": 0.4622,
"step": 1439
},
{
"epoch": 0.7938257993384785,
"grad_norm": 0.2598075270652771,
"learning_rate": 0.0001814262023217247,
"loss": 0.4679,
"step": 1440
},
{
"epoch": 0.7943770672546858,
"grad_norm": 0.23368312418460846,
"learning_rate": 0.00018134328358208952,
"loss": 0.4627,
"step": 1441
},
{
"epoch": 0.794928335170893,
"grad_norm": 0.24804770946502686,
"learning_rate": 0.00018126036484245438,
"loss": 0.4663,
"step": 1442
},
{
"epoch": 0.7954796030871003,
"grad_norm": 0.22588974237442017,
"learning_rate": 0.0001811774461028192,
"loss": 0.4514,
"step": 1443
},
{
"epoch": 0.7960308710033076,
"grad_norm": 0.22374935448169708,
"learning_rate": 0.00018109452736318407,
"loss": 0.4552,
"step": 1444
},
{
"epoch": 0.7965821389195149,
"grad_norm": 0.24665199220180511,
"learning_rate": 0.0001810116086235489,
"loss": 0.4639,
"step": 1445
},
{
"epoch": 0.7971334068357222,
"grad_norm": 0.25782036781311035,
"learning_rate": 0.00018092868988391376,
"loss": 0.4592,
"step": 1446
},
{
"epoch": 0.7976846747519294,
"grad_norm": 0.21815195679664612,
"learning_rate": 0.0001808457711442786,
"loss": 0.4724,
"step": 1447
},
{
"epoch": 0.7982359426681367,
"grad_norm": 0.24236443638801575,
"learning_rate": 0.00018076285240464345,
"loss": 0.473,
"step": 1448
},
{
"epoch": 0.798787210584344,
"grad_norm": 0.23173320293426514,
"learning_rate": 0.00018067993366500826,
"loss": 0.4771,
"step": 1449
},
{
"epoch": 0.7993384785005513,
"grad_norm": 0.22303089499473572,
"learning_rate": 0.00018059701492537312,
"loss": 0.4545,
"step": 1450
},
{
"epoch": 0.7998897464167586,
"grad_norm": 0.23491422832012177,
"learning_rate": 0.00018051409618573795,
"loss": 0.4807,
"step": 1451
},
{
"epoch": 0.8004410143329658,
"grad_norm": 0.23925326764583588,
"learning_rate": 0.0001804311774461028,
"loss": 0.4705,
"step": 1452
},
{
"epoch": 0.8009922822491731,
"grad_norm": 0.2446267306804657,
"learning_rate": 0.00018034825870646764,
"loss": 0.4514,
"step": 1453
},
{
"epoch": 0.8015435501653804,
"grad_norm": 0.2514120936393738,
"learning_rate": 0.0001802653399668325,
"loss": 0.4823,
"step": 1454
},
{
"epoch": 0.8020948180815877,
"grad_norm": 0.2469882369041443,
"learning_rate": 0.00018018242122719733,
"loss": 0.45,
"step": 1455
},
{
"epoch": 0.802646085997795,
"grad_norm": 0.23653636872768402,
"learning_rate": 0.0001800995024875622,
"loss": 0.4649,
"step": 1456
},
{
"epoch": 0.8031973539140022,
"grad_norm": 0.22585710883140564,
"learning_rate": 0.00018001658374792702,
"loss": 0.4384,
"step": 1457
},
{
"epoch": 0.8037486218302095,
"grad_norm": 0.24817028641700745,
"learning_rate": 0.00017993366500829188,
"loss": 0.4739,
"step": 1458
},
{
"epoch": 0.8042998897464168,
"grad_norm": 0.25585106015205383,
"learning_rate": 0.0001798507462686567,
"loss": 0.4958,
"step": 1459
},
{
"epoch": 0.804851157662624,
"grad_norm": 0.25958600640296936,
"learning_rate": 0.00017976782752902155,
"loss": 0.4673,
"step": 1460
},
{
"epoch": 0.8054024255788313,
"grad_norm": 0.2447502166032791,
"learning_rate": 0.00017968490878938638,
"loss": 0.484,
"step": 1461
},
{
"epoch": 0.8059536934950385,
"grad_norm": 0.22878794372081757,
"learning_rate": 0.00017960199004975124,
"loss": 0.4832,
"step": 1462
},
{
"epoch": 0.8065049614112458,
"grad_norm": 0.24230952560901642,
"learning_rate": 0.00017951907131011607,
"loss": 0.4498,
"step": 1463
},
{
"epoch": 0.8070562293274531,
"grad_norm": 0.2345331311225891,
"learning_rate": 0.00017943615257048093,
"loss": 0.4529,
"step": 1464
},
{
"epoch": 0.8076074972436604,
"grad_norm": 0.2564900815486908,
"learning_rate": 0.00017935323383084576,
"loss": 0.4747,
"step": 1465
},
{
"epoch": 0.8081587651598677,
"grad_norm": 0.2226727157831192,
"learning_rate": 0.00017927031509121062,
"loss": 0.4453,
"step": 1466
},
{
"epoch": 0.8087100330760749,
"grad_norm": 0.26586976647377014,
"learning_rate": 0.00017918739635157545,
"loss": 0.5032,
"step": 1467
},
{
"epoch": 0.8092613009922822,
"grad_norm": 0.23573876917362213,
"learning_rate": 0.00017910447761194026,
"loss": 0.4674,
"step": 1468
},
{
"epoch": 0.8098125689084895,
"grad_norm": 0.24506725370883942,
"learning_rate": 0.00017902155887230512,
"loss": 0.4605,
"step": 1469
},
{
"epoch": 0.8103638368246968,
"grad_norm": 0.2386348396539688,
"learning_rate": 0.00017893864013266995,
"loss": 0.4618,
"step": 1470
},
{
"epoch": 0.8109151047409041,
"grad_norm": 0.24811455607414246,
"learning_rate": 0.0001788557213930348,
"loss": 0.4615,
"step": 1471
},
{
"epoch": 0.8114663726571113,
"grad_norm": 0.2334372103214264,
"learning_rate": 0.00017877280265339964,
"loss": 0.474,
"step": 1472
},
{
"epoch": 0.8120176405733186,
"grad_norm": 0.247808575630188,
"learning_rate": 0.0001786898839137645,
"loss": 0.4504,
"step": 1473
},
{
"epoch": 0.8125689084895259,
"grad_norm": 0.21028272807598114,
"learning_rate": 0.00017860696517412933,
"loss": 0.4425,
"step": 1474
},
{
"epoch": 0.8131201764057332,
"grad_norm": 0.22339411079883575,
"learning_rate": 0.0001785240464344942,
"loss": 0.449,
"step": 1475
},
{
"epoch": 0.8136714443219405,
"grad_norm": 0.23447810113430023,
"learning_rate": 0.00017844112769485902,
"loss": 0.4593,
"step": 1476
},
{
"epoch": 0.8142227122381478,
"grad_norm": 0.22381900250911713,
"learning_rate": 0.00017835820895522388,
"loss": 0.4603,
"step": 1477
},
{
"epoch": 0.814773980154355,
"grad_norm": 0.22677209973335266,
"learning_rate": 0.00017827529021558869,
"loss": 0.4525,
"step": 1478
},
{
"epoch": 0.8153252480705623,
"grad_norm": 0.2385341227054596,
"learning_rate": 0.00017819237147595354,
"loss": 0.49,
"step": 1479
},
{
"epoch": 0.8158765159867696,
"grad_norm": 0.24088934063911438,
"learning_rate": 0.00017810945273631838,
"loss": 0.4984,
"step": 1480
},
{
"epoch": 0.8164277839029769,
"grad_norm": 0.20627839863300323,
"learning_rate": 0.00017802653399668324,
"loss": 0.4597,
"step": 1481
},
{
"epoch": 0.8169790518191842,
"grad_norm": 0.2268056422472,
"learning_rate": 0.00017794361525704807,
"loss": 0.4581,
"step": 1482
},
{
"epoch": 0.8175303197353914,
"grad_norm": 0.24342721700668335,
"learning_rate": 0.00017786069651741293,
"loss": 0.4715,
"step": 1483
},
{
"epoch": 0.8180815876515987,
"grad_norm": 0.23494994640350342,
"learning_rate": 0.00017777777777777776,
"loss": 0.4859,
"step": 1484
},
{
"epoch": 0.818632855567806,
"grad_norm": 0.23297634720802307,
"learning_rate": 0.00017769485903814262,
"loss": 0.4644,
"step": 1485
},
{
"epoch": 0.8191841234840133,
"grad_norm": 0.24424344301223755,
"learning_rate": 0.00017761194029850745,
"loss": 0.456,
"step": 1486
},
{
"epoch": 0.8197353914002206,
"grad_norm": 0.2417961210012436,
"learning_rate": 0.0001775290215588723,
"loss": 0.5005,
"step": 1487
},
{
"epoch": 0.8202866593164277,
"grad_norm": 0.24089650809764862,
"learning_rate": 0.00017744610281923711,
"loss": 0.4953,
"step": 1488
},
{
"epoch": 0.820837927232635,
"grad_norm": 0.22983671724796295,
"learning_rate": 0.00017736318407960197,
"loss": 0.4544,
"step": 1489
},
{
"epoch": 0.8213891951488423,
"grad_norm": 0.20966455340385437,
"learning_rate": 0.0001772802653399668,
"loss": 0.4724,
"step": 1490
},
{
"epoch": 0.8219404630650496,
"grad_norm": 0.24843506515026093,
"learning_rate": 0.00017719734660033166,
"loss": 0.4799,
"step": 1491
},
{
"epoch": 0.8224917309812569,
"grad_norm": 0.22664618492126465,
"learning_rate": 0.0001771144278606965,
"loss": 0.4421,
"step": 1492
},
{
"epoch": 0.8230429988974641,
"grad_norm": 0.22813642024993896,
"learning_rate": 0.00017703150912106136,
"loss": 0.4622,
"step": 1493
},
{
"epoch": 0.8235942668136714,
"grad_norm": 0.2250567078590393,
"learning_rate": 0.0001769485903814262,
"loss": 0.4526,
"step": 1494
},
{
"epoch": 0.8241455347298787,
"grad_norm": 0.2317907065153122,
"learning_rate": 0.00017686567164179105,
"loss": 0.4743,
"step": 1495
},
{
"epoch": 0.824696802646086,
"grad_norm": 0.22760067880153656,
"learning_rate": 0.00017678275290215588,
"loss": 0.4765,
"step": 1496
},
{
"epoch": 0.8252480705622933,
"grad_norm": 0.21815039217472076,
"learning_rate": 0.00017669983416252074,
"loss": 0.4588,
"step": 1497
},
{
"epoch": 0.8257993384785005,
"grad_norm": 0.25006452202796936,
"learning_rate": 0.00017661691542288554,
"loss": 0.451,
"step": 1498
},
{
"epoch": 0.8263506063947078,
"grad_norm": 0.22310319542884827,
"learning_rate": 0.0001765339966832504,
"loss": 0.4754,
"step": 1499
},
{
"epoch": 0.8269018743109151,
"grad_norm": 0.26363706588745117,
"learning_rate": 0.00017645107794361523,
"loss": 0.4834,
"step": 1500
},
{
"epoch": 0.8269018743109151,
"eval_loss": 0.4649047255516052,
"eval_runtime": 312.7946,
"eval_samples_per_second": 3.724,
"eval_steps_per_second": 0.467,
"step": 1500
},
{
"epoch": 0.8274531422271224,
"grad_norm": 0.22052568197250366,
"learning_rate": 0.0001763681592039801,
"loss": 0.4931,
"step": 1501
},
{
"epoch": 0.8280044101433297,
"grad_norm": 0.23108328878879547,
"learning_rate": 0.00017628524046434493,
"loss": 0.4901,
"step": 1502
},
{
"epoch": 0.8285556780595369,
"grad_norm": 0.23075662553310394,
"learning_rate": 0.00017620232172470978,
"loss": 0.4484,
"step": 1503
},
{
"epoch": 0.8291069459757442,
"grad_norm": 0.24602019786834717,
"learning_rate": 0.00017611940298507462,
"loss": 0.4427,
"step": 1504
},
{
"epoch": 0.8296582138919515,
"grad_norm": 0.2438734471797943,
"learning_rate": 0.00017603648424543948,
"loss": 0.4731,
"step": 1505
},
{
"epoch": 0.8302094818081588,
"grad_norm": 0.23441627621650696,
"learning_rate": 0.0001759535655058043,
"loss": 0.4628,
"step": 1506
},
{
"epoch": 0.8307607497243661,
"grad_norm": 0.23310305178165436,
"learning_rate": 0.00017587064676616917,
"loss": 0.4929,
"step": 1507
},
{
"epoch": 0.8313120176405733,
"grad_norm": 0.25448939204216003,
"learning_rate": 0.00017578772802653397,
"loss": 0.4851,
"step": 1508
},
{
"epoch": 0.8318632855567806,
"grad_norm": 0.2438756674528122,
"learning_rate": 0.00017570480928689883,
"loss": 0.4706,
"step": 1509
},
{
"epoch": 0.8324145534729879,
"grad_norm": 0.25436931848526,
"learning_rate": 0.00017562189054726366,
"loss": 0.4869,
"step": 1510
},
{
"epoch": 0.8329658213891952,
"grad_norm": 0.22301998734474182,
"learning_rate": 0.0001755389718076285,
"loss": 0.4593,
"step": 1511
},
{
"epoch": 0.8335170893054025,
"grad_norm": 0.24233976006507874,
"learning_rate": 0.00017545605306799335,
"loss": 0.5016,
"step": 1512
},
{
"epoch": 0.8340683572216097,
"grad_norm": 0.22516629099845886,
"learning_rate": 0.00017537313432835819,
"loss": 0.4732,
"step": 1513
},
{
"epoch": 0.834619625137817,
"grad_norm": 0.22612155973911285,
"learning_rate": 0.00017529021558872305,
"loss": 0.4625,
"step": 1514
},
{
"epoch": 0.8351708930540243,
"grad_norm": 0.23177853226661682,
"learning_rate": 0.00017520729684908785,
"loss": 0.4776,
"step": 1515
},
{
"epoch": 0.8357221609702316,
"grad_norm": 0.24279583990573883,
"learning_rate": 0.00017512437810945274,
"loss": 0.4721,
"step": 1516
},
{
"epoch": 0.8362734288864389,
"grad_norm": 0.23456443846225739,
"learning_rate": 0.00017504145936981754,
"loss": 0.4635,
"step": 1517
},
{
"epoch": 0.836824696802646,
"grad_norm": 0.23287171125411987,
"learning_rate": 0.0001749585406301824,
"loss": 0.4739,
"step": 1518
},
{
"epoch": 0.8373759647188533,
"grad_norm": 0.22415684163570404,
"learning_rate": 0.00017487562189054723,
"loss": 0.4769,
"step": 1519
},
{
"epoch": 0.8379272326350606,
"grad_norm": 0.2180211991071701,
"learning_rate": 0.0001747927031509121,
"loss": 0.4388,
"step": 1520
},
{
"epoch": 0.8384785005512679,
"grad_norm": 0.2260761708021164,
"learning_rate": 0.00017470978441127692,
"loss": 0.4972,
"step": 1521
},
{
"epoch": 0.8390297684674752,
"grad_norm": 0.22887657582759857,
"learning_rate": 0.00017462686567164178,
"loss": 0.4554,
"step": 1522
},
{
"epoch": 0.8395810363836824,
"grad_norm": 0.241640105843544,
"learning_rate": 0.00017454394693200662,
"loss": 0.4732,
"step": 1523
},
{
"epoch": 0.8401323042998897,
"grad_norm": 0.2288465052843094,
"learning_rate": 0.00017446102819237147,
"loss": 0.4527,
"step": 1524
},
{
"epoch": 0.840683572216097,
"grad_norm": 0.23457041382789612,
"learning_rate": 0.00017437810945273628,
"loss": 0.4574,
"step": 1525
},
{
"epoch": 0.8412348401323043,
"grad_norm": 0.25197815895080566,
"learning_rate": 0.00017429519071310114,
"loss": 0.4597,
"step": 1526
},
{
"epoch": 0.8417861080485116,
"grad_norm": 0.2385404258966446,
"learning_rate": 0.00017421227197346597,
"loss": 0.4649,
"step": 1527
},
{
"epoch": 0.8423373759647188,
"grad_norm": 0.23451651632785797,
"learning_rate": 0.00017412935323383083,
"loss": 0.4646,
"step": 1528
},
{
"epoch": 0.8428886438809261,
"grad_norm": 0.2421046793460846,
"learning_rate": 0.00017404643449419566,
"loss": 0.4852,
"step": 1529
},
{
"epoch": 0.8434399117971334,
"grad_norm": 0.25406989455223083,
"learning_rate": 0.00017396351575456052,
"loss": 0.4804,
"step": 1530
},
{
"epoch": 0.8439911797133407,
"grad_norm": 0.24752497673034668,
"learning_rate": 0.00017388059701492535,
"loss": 0.4777,
"step": 1531
},
{
"epoch": 0.844542447629548,
"grad_norm": 0.226281076669693,
"learning_rate": 0.0001737976782752902,
"loss": 0.4747,
"step": 1532
},
{
"epoch": 0.8450937155457552,
"grad_norm": 0.2519485652446747,
"learning_rate": 0.00017371475953565504,
"loss": 0.4639,
"step": 1533
},
{
"epoch": 0.8456449834619625,
"grad_norm": 0.2347985804080963,
"learning_rate": 0.0001736318407960199,
"loss": 0.4715,
"step": 1534
},
{
"epoch": 0.8461962513781698,
"grad_norm": 0.24425053596496582,
"learning_rate": 0.0001735489220563847,
"loss": 0.445,
"step": 1535
},
{
"epoch": 0.8467475192943771,
"grad_norm": 0.2559725046157837,
"learning_rate": 0.00017346600331674957,
"loss": 0.49,
"step": 1536
},
{
"epoch": 0.8472987872105844,
"grad_norm": 0.23750551044940948,
"learning_rate": 0.0001733830845771144,
"loss": 0.4663,
"step": 1537
},
{
"epoch": 0.8478500551267916,
"grad_norm": 0.22861897945404053,
"learning_rate": 0.00017330016583747926,
"loss": 0.45,
"step": 1538
},
{
"epoch": 0.8484013230429989,
"grad_norm": 0.24839669466018677,
"learning_rate": 0.0001732172470978441,
"loss": 0.4856,
"step": 1539
},
{
"epoch": 0.8489525909592062,
"grad_norm": 0.23960521817207336,
"learning_rate": 0.00017313432835820895,
"loss": 0.4933,
"step": 1540
},
{
"epoch": 0.8495038588754135,
"grad_norm": 0.23533576726913452,
"learning_rate": 0.00017305140961857378,
"loss": 0.4698,
"step": 1541
},
{
"epoch": 0.8500551267916208,
"grad_norm": 0.23979732394218445,
"learning_rate": 0.00017296849087893864,
"loss": 0.4953,
"step": 1542
},
{
"epoch": 0.850606394707828,
"grad_norm": 0.24841150641441345,
"learning_rate": 0.00017288557213930347,
"loss": 0.4845,
"step": 1543
},
{
"epoch": 0.8511576626240352,
"grad_norm": 0.22132597863674164,
"learning_rate": 0.00017280265339966833,
"loss": 0.4643,
"step": 1544
},
{
"epoch": 0.8517089305402425,
"grad_norm": 0.22431734204292297,
"learning_rate": 0.00017271973466003314,
"loss": 0.4547,
"step": 1545
},
{
"epoch": 0.8522601984564498,
"grad_norm": 0.22704413533210754,
"learning_rate": 0.000172636815920398,
"loss": 0.4665,
"step": 1546
},
{
"epoch": 0.8528114663726571,
"grad_norm": 0.22971755266189575,
"learning_rate": 0.00017255389718076283,
"loss": 0.4709,
"step": 1547
},
{
"epoch": 0.8533627342888643,
"grad_norm": 0.2435724288225174,
"learning_rate": 0.0001724709784411277,
"loss": 0.4733,
"step": 1548
},
{
"epoch": 0.8539140022050716,
"grad_norm": 0.24051538109779358,
"learning_rate": 0.00017238805970149252,
"loss": 0.4695,
"step": 1549
},
{
"epoch": 0.8544652701212789,
"grad_norm": 0.26592954993247986,
"learning_rate": 0.00017230514096185738,
"loss": 0.4683,
"step": 1550
},
{
"epoch": 0.8550165380374862,
"grad_norm": 0.24452587962150574,
"learning_rate": 0.0001722222222222222,
"loss": 0.4623,
"step": 1551
},
{
"epoch": 0.8555678059536935,
"grad_norm": 0.23351791501045227,
"learning_rate": 0.00017213930348258707,
"loss": 0.4559,
"step": 1552
},
{
"epoch": 0.8561190738699008,
"grad_norm": 0.23652702569961548,
"learning_rate": 0.0001720563847429519,
"loss": 0.4507,
"step": 1553
},
{
"epoch": 0.856670341786108,
"grad_norm": 0.22390702366828918,
"learning_rate": 0.00017197346600331676,
"loss": 0.4521,
"step": 1554
},
{
"epoch": 0.8572216097023153,
"grad_norm": 0.24590735137462616,
"learning_rate": 0.00017189054726368157,
"loss": 0.4712,
"step": 1555
},
{
"epoch": 0.8577728776185226,
"grad_norm": 0.21954110264778137,
"learning_rate": 0.0001718076285240464,
"loss": 0.4447,
"step": 1556
},
{
"epoch": 0.8583241455347299,
"grad_norm": 0.23404909670352936,
"learning_rate": 0.00017172470978441126,
"loss": 0.4699,
"step": 1557
},
{
"epoch": 0.8588754134509372,
"grad_norm": 0.24352899193763733,
"learning_rate": 0.0001716417910447761,
"loss": 0.4904,
"step": 1558
},
{
"epoch": 0.8594266813671444,
"grad_norm": 0.30317431688308716,
"learning_rate": 0.00017155887230514095,
"loss": 0.4606,
"step": 1559
},
{
"epoch": 0.8599779492833517,
"grad_norm": 0.22517681121826172,
"learning_rate": 0.00017147595356550578,
"loss": 0.4892,
"step": 1560
},
{
"epoch": 0.860529217199559,
"grad_norm": 0.23503634333610535,
"learning_rate": 0.00017139303482587064,
"loss": 0.4755,
"step": 1561
},
{
"epoch": 0.8610804851157663,
"grad_norm": 0.22381718456745148,
"learning_rate": 0.00017131011608623547,
"loss": 0.4492,
"step": 1562
},
{
"epoch": 0.8616317530319736,
"grad_norm": 0.24450813233852386,
"learning_rate": 0.00017122719734660033,
"loss": 0.4764,
"step": 1563
},
{
"epoch": 0.8621830209481808,
"grad_norm": 0.2357473075389862,
"learning_rate": 0.00017114427860696513,
"loss": 0.4727,
"step": 1564
},
{
"epoch": 0.8627342888643881,
"grad_norm": 0.22676219046115875,
"learning_rate": 0.00017106135986733,
"loss": 0.454,
"step": 1565
},
{
"epoch": 0.8632855567805954,
"grad_norm": 0.24174387753009796,
"learning_rate": 0.00017097844112769483,
"loss": 0.4451,
"step": 1566
},
{
"epoch": 0.8638368246968027,
"grad_norm": 0.24716874957084656,
"learning_rate": 0.00017089552238805969,
"loss": 0.4639,
"step": 1567
},
{
"epoch": 0.86438809261301,
"grad_norm": 0.24672383069992065,
"learning_rate": 0.00017081260364842452,
"loss": 0.4811,
"step": 1568
},
{
"epoch": 0.8649393605292172,
"grad_norm": 0.2504035234451294,
"learning_rate": 0.00017072968490878938,
"loss": 0.4715,
"step": 1569
},
{
"epoch": 0.8654906284454245,
"grad_norm": 0.2296275794506073,
"learning_rate": 0.0001706467661691542,
"loss": 0.4552,
"step": 1570
},
{
"epoch": 0.8660418963616318,
"grad_norm": 0.24308894574642181,
"learning_rate": 0.00017056384742951907,
"loss": 0.4798,
"step": 1571
},
{
"epoch": 0.8665931642778391,
"grad_norm": 0.25587549805641174,
"learning_rate": 0.0001704809286898839,
"loss": 0.473,
"step": 1572
},
{
"epoch": 0.8671444321940464,
"grad_norm": 0.22006462514400482,
"learning_rate": 0.00017039800995024876,
"loss": 0.4512,
"step": 1573
},
{
"epoch": 0.8676957001102535,
"grad_norm": 0.2469773143529892,
"learning_rate": 0.00017031509121061356,
"loss": 0.4651,
"step": 1574
},
{
"epoch": 0.8682469680264608,
"grad_norm": 0.23426435887813568,
"learning_rate": 0.00017023217247097842,
"loss": 0.4658,
"step": 1575
},
{
"epoch": 0.8687982359426681,
"grad_norm": 0.2696544826030731,
"learning_rate": 0.00017014925373134325,
"loss": 0.4555,
"step": 1576
},
{
"epoch": 0.8693495038588754,
"grad_norm": 0.24263867735862732,
"learning_rate": 0.00017006633499170811,
"loss": 0.4426,
"step": 1577
},
{
"epoch": 0.8699007717750827,
"grad_norm": 0.24693246185779572,
"learning_rate": 0.00016998341625207295,
"loss": 0.4876,
"step": 1578
},
{
"epoch": 0.8704520396912899,
"grad_norm": 0.24460558593273163,
"learning_rate": 0.0001699004975124378,
"loss": 0.4704,
"step": 1579
},
{
"epoch": 0.8710033076074972,
"grad_norm": 0.2212182730436325,
"learning_rate": 0.00016981757877280264,
"loss": 0.4496,
"step": 1580
},
{
"epoch": 0.8715545755237045,
"grad_norm": 0.23751485347747803,
"learning_rate": 0.0001697346600331675,
"loss": 0.4546,
"step": 1581
},
{
"epoch": 0.8721058434399118,
"grad_norm": 0.2521110475063324,
"learning_rate": 0.00016965174129353233,
"loss": 0.4706,
"step": 1582
},
{
"epoch": 0.8726571113561191,
"grad_norm": 0.24147383868694305,
"learning_rate": 0.0001695688225538972,
"loss": 0.4519,
"step": 1583
},
{
"epoch": 0.8732083792723263,
"grad_norm": 0.2279898077249527,
"learning_rate": 0.000169485903814262,
"loss": 0.4648,
"step": 1584
},
{
"epoch": 0.8737596471885336,
"grad_norm": 0.24053026735782623,
"learning_rate": 0.00016940298507462685,
"loss": 0.4747,
"step": 1585
},
{
"epoch": 0.8743109151047409,
"grad_norm": 0.24321089684963226,
"learning_rate": 0.00016932006633499168,
"loss": 0.4562,
"step": 1586
},
{
"epoch": 0.8748621830209482,
"grad_norm": 0.2396124303340912,
"learning_rate": 0.00016923714759535654,
"loss": 0.4631,
"step": 1587
},
{
"epoch": 0.8754134509371555,
"grad_norm": 0.23284991085529327,
"learning_rate": 0.00016915422885572137,
"loss": 0.4452,
"step": 1588
},
{
"epoch": 0.8759647188533627,
"grad_norm": 0.2377912849187851,
"learning_rate": 0.00016907131011608623,
"loss": 0.4471,
"step": 1589
},
{
"epoch": 0.87651598676957,
"grad_norm": 0.23828253149986267,
"learning_rate": 0.00016898839137645107,
"loss": 0.4463,
"step": 1590
},
{
"epoch": 0.8770672546857773,
"grad_norm": 0.24640867114067078,
"learning_rate": 0.00016890547263681593,
"loss": 0.4776,
"step": 1591
},
{
"epoch": 0.8776185226019846,
"grad_norm": 0.24699927866458893,
"learning_rate": 0.00016882255389718076,
"loss": 0.437,
"step": 1592
},
{
"epoch": 0.8781697905181919,
"grad_norm": 0.24521562457084656,
"learning_rate": 0.00016873963515754562,
"loss": 0.4805,
"step": 1593
},
{
"epoch": 0.8787210584343991,
"grad_norm": 0.2375350147485733,
"learning_rate": 0.00016865671641791042,
"loss": 0.4835,
"step": 1594
},
{
"epoch": 0.8792723263506064,
"grad_norm": 0.23784852027893066,
"learning_rate": 0.00016857379767827528,
"loss": 0.49,
"step": 1595
},
{
"epoch": 0.8798235942668137,
"grad_norm": 0.23371200263500214,
"learning_rate": 0.0001684908789386401,
"loss": 0.4701,
"step": 1596
},
{
"epoch": 0.880374862183021,
"grad_norm": 0.23373621702194214,
"learning_rate": 0.00016840796019900497,
"loss": 0.4765,
"step": 1597
},
{
"epoch": 0.8809261300992283,
"grad_norm": 0.25964394211769104,
"learning_rate": 0.0001683250414593698,
"loss": 0.4505,
"step": 1598
},
{
"epoch": 0.8814773980154355,
"grad_norm": 0.2420414835214615,
"learning_rate": 0.00016824212271973464,
"loss": 0.5,
"step": 1599
},
{
"epoch": 0.8820286659316428,
"grad_norm": 0.24534733593463898,
"learning_rate": 0.0001681592039800995,
"loss": 0.4625,
"step": 1600
},
{
"epoch": 0.88257993384785,
"grad_norm": 0.22338466346263885,
"learning_rate": 0.00016807628524046433,
"loss": 0.4383,
"step": 1601
},
{
"epoch": 0.8831312017640573,
"grad_norm": 0.24304436147212982,
"learning_rate": 0.00016799336650082919,
"loss": 0.4717,
"step": 1602
},
{
"epoch": 0.8836824696802646,
"grad_norm": 0.24378708004951477,
"learning_rate": 0.000167910447761194,
"loss": 0.4732,
"step": 1603
},
{
"epoch": 0.8842337375964718,
"grad_norm": 0.22068338096141815,
"learning_rate": 0.00016782752902155885,
"loss": 0.4709,
"step": 1604
},
{
"epoch": 0.8847850055126791,
"grad_norm": 0.25752487778663635,
"learning_rate": 0.00016774461028192368,
"loss": 0.4571,
"step": 1605
},
{
"epoch": 0.8853362734288864,
"grad_norm": 0.21915499866008759,
"learning_rate": 0.00016766169154228854,
"loss": 0.4551,
"step": 1606
},
{
"epoch": 0.8858875413450937,
"grad_norm": 0.220630943775177,
"learning_rate": 0.00016757877280265337,
"loss": 0.4336,
"step": 1607
},
{
"epoch": 0.886438809261301,
"grad_norm": 0.2279721051454544,
"learning_rate": 0.00016749585406301823,
"loss": 0.4546,
"step": 1608
},
{
"epoch": 0.8869900771775082,
"grad_norm": 0.23162703216075897,
"learning_rate": 0.00016741293532338306,
"loss": 0.4596,
"step": 1609
},
{
"epoch": 0.8875413450937155,
"grad_norm": 0.22968967258930206,
"learning_rate": 0.00016733001658374792,
"loss": 0.4457,
"step": 1610
},
{
"epoch": 0.8880926130099228,
"grad_norm": 0.23839277029037476,
"learning_rate": 0.00016724709784411276,
"loss": 0.444,
"step": 1611
},
{
"epoch": 0.8886438809261301,
"grad_norm": 0.2291092872619629,
"learning_rate": 0.00016716417910447761,
"loss": 0.4796,
"step": 1612
},
{
"epoch": 0.8891951488423374,
"grad_norm": 0.2277524322271347,
"learning_rate": 0.00016708126036484242,
"loss": 0.4373,
"step": 1613
},
{
"epoch": 0.8897464167585446,
"grad_norm": 0.24553948640823364,
"learning_rate": 0.00016699834162520728,
"loss": 0.4948,
"step": 1614
},
{
"epoch": 0.8902976846747519,
"grad_norm": 0.21850357949733734,
"learning_rate": 0.0001669154228855721,
"loss": 0.4575,
"step": 1615
},
{
"epoch": 0.8908489525909592,
"grad_norm": 0.23171943426132202,
"learning_rate": 0.00016683250414593697,
"loss": 0.4947,
"step": 1616
},
{
"epoch": 0.8914002205071665,
"grad_norm": 0.22626076638698578,
"learning_rate": 0.0001667495854063018,
"loss": 0.4619,
"step": 1617
},
{
"epoch": 0.8919514884233738,
"grad_norm": 0.23768572509288788,
"learning_rate": 0.00016666666666666666,
"loss": 0.4535,
"step": 1618
},
{
"epoch": 0.892502756339581,
"grad_norm": 0.2264167070388794,
"learning_rate": 0.0001665837479270315,
"loss": 0.467,
"step": 1619
},
{
"epoch": 0.8930540242557883,
"grad_norm": 0.2234300673007965,
"learning_rate": 0.00016650082918739635,
"loss": 0.4331,
"step": 1620
},
{
"epoch": 0.8936052921719956,
"grad_norm": 0.22206327319145203,
"learning_rate": 0.00016641791044776118,
"loss": 0.4442,
"step": 1621
},
{
"epoch": 0.8941565600882029,
"grad_norm": 0.22858171164989471,
"learning_rate": 0.00016633499170812604,
"loss": 0.4611,
"step": 1622
},
{
"epoch": 0.8947078280044102,
"grad_norm": 0.24421337246894836,
"learning_rate": 0.00016625207296849085,
"loss": 0.4551,
"step": 1623
},
{
"epoch": 0.8952590959206174,
"grad_norm": 0.20711436867713928,
"learning_rate": 0.0001661691542288557,
"loss": 0.4555,
"step": 1624
},
{
"epoch": 0.8958103638368247,
"grad_norm": 0.22994433343410492,
"learning_rate": 0.00016608623548922054,
"loss": 0.4745,
"step": 1625
},
{
"epoch": 0.896361631753032,
"grad_norm": 0.22984014451503754,
"learning_rate": 0.0001660033167495854,
"loss": 0.4613,
"step": 1626
},
{
"epoch": 0.8969128996692393,
"grad_norm": 0.2339726984500885,
"learning_rate": 0.00016592039800995023,
"loss": 0.469,
"step": 1627
},
{
"epoch": 0.8974641675854466,
"grad_norm": 0.23884552717208862,
"learning_rate": 0.0001658374792703151,
"loss": 0.4812,
"step": 1628
},
{
"epoch": 0.8980154355016539,
"grad_norm": 0.23677459359169006,
"learning_rate": 0.00016575456053067992,
"loss": 0.471,
"step": 1629
},
{
"epoch": 0.898566703417861,
"grad_norm": 0.22945214807987213,
"learning_rate": 0.00016567164179104478,
"loss": 0.4666,
"step": 1630
},
{
"epoch": 0.8991179713340683,
"grad_norm": 0.231664776802063,
"learning_rate": 0.0001655887230514096,
"loss": 0.4657,
"step": 1631
},
{
"epoch": 0.8996692392502756,
"grad_norm": 0.22424204647541046,
"learning_rate": 0.00016550580431177447,
"loss": 0.4682,
"step": 1632
},
{
"epoch": 0.9002205071664829,
"grad_norm": 0.23469983041286469,
"learning_rate": 0.00016542288557213928,
"loss": 0.4761,
"step": 1633
},
{
"epoch": 0.9007717750826902,
"grad_norm": 0.2397875040769577,
"learning_rate": 0.00016533996683250414,
"loss": 0.4763,
"step": 1634
},
{
"epoch": 0.9013230429988974,
"grad_norm": 0.21035277843475342,
"learning_rate": 0.00016525704809286897,
"loss": 0.4225,
"step": 1635
},
{
"epoch": 0.9018743109151047,
"grad_norm": 0.24221475422382355,
"learning_rate": 0.00016517412935323383,
"loss": 0.4666,
"step": 1636
},
{
"epoch": 0.902425578831312,
"grad_norm": 0.22903227806091309,
"learning_rate": 0.00016509121061359866,
"loss": 0.4699,
"step": 1637
},
{
"epoch": 0.9029768467475193,
"grad_norm": 0.23368406295776367,
"learning_rate": 0.00016500829187396352,
"loss": 0.4763,
"step": 1638
},
{
"epoch": 0.9035281146637266,
"grad_norm": 0.2397768199443817,
"learning_rate": 0.00016492537313432835,
"loss": 0.4552,
"step": 1639
},
{
"epoch": 0.9040793825799338,
"grad_norm": 0.24322962760925293,
"learning_rate": 0.0001648424543946932,
"loss": 0.4441,
"step": 1640
},
{
"epoch": 0.9046306504961411,
"grad_norm": 0.21771124005317688,
"learning_rate": 0.00016475953565505801,
"loss": 0.4635,
"step": 1641
},
{
"epoch": 0.9051819184123484,
"grad_norm": 0.21717268228530884,
"learning_rate": 0.0001646766169154229,
"loss": 0.4459,
"step": 1642
},
{
"epoch": 0.9057331863285557,
"grad_norm": 0.23191964626312256,
"learning_rate": 0.0001645936981757877,
"loss": 0.4605,
"step": 1643
},
{
"epoch": 0.906284454244763,
"grad_norm": 0.24638865888118744,
"learning_rate": 0.00016451077943615254,
"loss": 0.4477,
"step": 1644
},
{
"epoch": 0.9068357221609702,
"grad_norm": 0.24050134420394897,
"learning_rate": 0.0001644278606965174,
"loss": 0.4389,
"step": 1645
},
{
"epoch": 0.9073869900771775,
"grad_norm": 0.23574888706207275,
"learning_rate": 0.00016434494195688223,
"loss": 0.4556,
"step": 1646
},
{
"epoch": 0.9079382579933848,
"grad_norm": 0.23960547149181366,
"learning_rate": 0.0001642620232172471,
"loss": 0.4599,
"step": 1647
},
{
"epoch": 0.9084895259095921,
"grad_norm": 0.22923794388771057,
"learning_rate": 0.00016417910447761192,
"loss": 0.4566,
"step": 1648
},
{
"epoch": 0.9090407938257994,
"grad_norm": 0.23294423520565033,
"learning_rate": 0.00016409618573797678,
"loss": 0.4726,
"step": 1649
},
{
"epoch": 0.9095920617420066,
"grad_norm": 0.24964945018291473,
"learning_rate": 0.00016401326699834158,
"loss": 0.483,
"step": 1650
},
{
"epoch": 0.9101433296582139,
"grad_norm": 0.22729866206645966,
"learning_rate": 0.00016393034825870644,
"loss": 0.4708,
"step": 1651
},
{
"epoch": 0.9106945975744212,
"grad_norm": 0.22324109077453613,
"learning_rate": 0.00016384742951907128,
"loss": 0.4798,
"step": 1652
},
{
"epoch": 0.9112458654906285,
"grad_norm": 0.2301269918680191,
"learning_rate": 0.00016376451077943613,
"loss": 0.4659,
"step": 1653
},
{
"epoch": 0.9117971334068358,
"grad_norm": 0.26973679661750793,
"learning_rate": 0.00016368159203980097,
"loss": 0.4743,
"step": 1654
},
{
"epoch": 0.912348401323043,
"grad_norm": 0.2236243188381195,
"learning_rate": 0.00016359867330016583,
"loss": 0.4464,
"step": 1655
},
{
"epoch": 0.9128996692392503,
"grad_norm": 0.23898382484912872,
"learning_rate": 0.00016351575456053066,
"loss": 0.4715,
"step": 1656
},
{
"epoch": 0.9134509371554576,
"grad_norm": 0.226115882396698,
"learning_rate": 0.00016343283582089552,
"loss": 0.452,
"step": 1657
},
{
"epoch": 0.9140022050716649,
"grad_norm": 0.24120070040225983,
"learning_rate": 0.00016334991708126035,
"loss": 0.4594,
"step": 1658
},
{
"epoch": 0.9145534729878722,
"grad_norm": 0.2507602870464325,
"learning_rate": 0.0001632669983416252,
"loss": 0.4759,
"step": 1659
},
{
"epoch": 0.9151047409040793,
"grad_norm": 0.26350581645965576,
"learning_rate": 0.00016318407960199,
"loss": 0.4553,
"step": 1660
},
{
"epoch": 0.9156560088202866,
"grad_norm": 0.23043513298034668,
"learning_rate": 0.00016310116086235487,
"loss": 0.4754,
"step": 1661
},
{
"epoch": 0.9162072767364939,
"grad_norm": 0.22888733446598053,
"learning_rate": 0.0001630182421227197,
"loss": 0.4602,
"step": 1662
},
{
"epoch": 0.9167585446527012,
"grad_norm": 0.23566976189613342,
"learning_rate": 0.00016293532338308456,
"loss": 0.4492,
"step": 1663
},
{
"epoch": 0.9173098125689085,
"grad_norm": 0.2403411716222763,
"learning_rate": 0.0001628524046434494,
"loss": 0.4529,
"step": 1664
},
{
"epoch": 0.9178610804851157,
"grad_norm": 0.24615786969661713,
"learning_rate": 0.00016276948590381425,
"loss": 0.4688,
"step": 1665
},
{
"epoch": 0.918412348401323,
"grad_norm": 0.2582218647003174,
"learning_rate": 0.0001626865671641791,
"loss": 0.4626,
"step": 1666
},
{
"epoch": 0.9189636163175303,
"grad_norm": 0.2405799925327301,
"learning_rate": 0.00016260364842454395,
"loss": 0.4529,
"step": 1667
},
{
"epoch": 0.9195148842337376,
"grad_norm": 0.2288394719362259,
"learning_rate": 0.00016252072968490878,
"loss": 0.4513,
"step": 1668
},
{
"epoch": 0.9200661521499449,
"grad_norm": 0.22039665281772614,
"learning_rate": 0.00016243781094527364,
"loss": 0.4636,
"step": 1669
},
{
"epoch": 0.9206174200661521,
"grad_norm": 0.2359505444765091,
"learning_rate": 0.00016235489220563844,
"loss": 0.4703,
"step": 1670
},
{
"epoch": 0.9211686879823594,
"grad_norm": 0.25222134590148926,
"learning_rate": 0.0001622719734660033,
"loss": 0.4729,
"step": 1671
},
{
"epoch": 0.9217199558985667,
"grad_norm": 0.24714909493923187,
"learning_rate": 0.00016218905472636813,
"loss": 0.4376,
"step": 1672
},
{
"epoch": 0.922271223814774,
"grad_norm": 0.271454781293869,
"learning_rate": 0.000162106135986733,
"loss": 0.4771,
"step": 1673
},
{
"epoch": 0.9228224917309813,
"grad_norm": 0.2408027946949005,
"learning_rate": 0.00016202321724709782,
"loss": 0.4581,
"step": 1674
},
{
"epoch": 0.9233737596471885,
"grad_norm": 0.25041836500167847,
"learning_rate": 0.00016194029850746268,
"loss": 0.4685,
"step": 1675
},
{
"epoch": 0.9239250275633958,
"grad_norm": 0.2697443664073944,
"learning_rate": 0.00016185737976782752,
"loss": 0.4905,
"step": 1676
},
{
"epoch": 0.9244762954796031,
"grad_norm": 0.261924684047699,
"learning_rate": 0.00016177446102819237,
"loss": 0.5045,
"step": 1677
},
{
"epoch": 0.9250275633958104,
"grad_norm": 0.23671838641166687,
"learning_rate": 0.0001616915422885572,
"loss": 0.4477,
"step": 1678
},
{
"epoch": 0.9255788313120177,
"grad_norm": 0.26420533657073975,
"learning_rate": 0.00016160862354892207,
"loss": 0.4922,
"step": 1679
},
{
"epoch": 0.9261300992282249,
"grad_norm": 0.2353939265012741,
"learning_rate": 0.00016152570480928687,
"loss": 0.4434,
"step": 1680
},
{
"epoch": 0.9266813671444322,
"grad_norm": 0.23843790590763092,
"learning_rate": 0.00016144278606965173,
"loss": 0.4567,
"step": 1681
},
{
"epoch": 0.9272326350606395,
"grad_norm": 0.22744010388851166,
"learning_rate": 0.00016135986733001656,
"loss": 0.4607,
"step": 1682
},
{
"epoch": 0.9277839029768468,
"grad_norm": 0.2599264979362488,
"learning_rate": 0.00016127694859038142,
"loss": 0.4839,
"step": 1683
},
{
"epoch": 0.9283351708930541,
"grad_norm": 0.2337629646062851,
"learning_rate": 0.00016119402985074625,
"loss": 0.4697,
"step": 1684
},
{
"epoch": 0.9288864388092613,
"grad_norm": 0.2365848571062088,
"learning_rate": 0.0001611111111111111,
"loss": 0.4589,
"step": 1685
},
{
"epoch": 0.9294377067254685,
"grad_norm": 0.22954298555850983,
"learning_rate": 0.00016102819237147594,
"loss": 0.4071,
"step": 1686
},
{
"epoch": 0.9299889746416758,
"grad_norm": 0.22945284843444824,
"learning_rate": 0.00016094527363184078,
"loss": 0.4432,
"step": 1687
},
{
"epoch": 0.9305402425578831,
"grad_norm": 0.2274722009897232,
"learning_rate": 0.00016086235489220564,
"loss": 0.4537,
"step": 1688
},
{
"epoch": 0.9310915104740904,
"grad_norm": 0.23572379350662231,
"learning_rate": 0.00016077943615257044,
"loss": 0.4621,
"step": 1689
},
{
"epoch": 0.9316427783902976,
"grad_norm": 0.2582686245441437,
"learning_rate": 0.0001606965174129353,
"loss": 0.4845,
"step": 1690
},
{
"epoch": 0.9321940463065049,
"grad_norm": 0.252638578414917,
"learning_rate": 0.00016061359867330013,
"loss": 0.4583,
"step": 1691
},
{
"epoch": 0.9327453142227122,
"grad_norm": 0.24242907762527466,
"learning_rate": 0.000160530679933665,
"loss": 0.4659,
"step": 1692
},
{
"epoch": 0.9332965821389195,
"grad_norm": 0.25426262617111206,
"learning_rate": 0.00016044776119402982,
"loss": 0.4615,
"step": 1693
},
{
"epoch": 0.9338478500551268,
"grad_norm": 0.2503727972507477,
"learning_rate": 0.00016036484245439468,
"loss": 0.4732,
"step": 1694
},
{
"epoch": 0.934399117971334,
"grad_norm": 0.23591485619544983,
"learning_rate": 0.00016028192371475951,
"loss": 0.4865,
"step": 1695
},
{
"epoch": 0.9349503858875413,
"grad_norm": 0.2307887077331543,
"learning_rate": 0.00016019900497512437,
"loss": 0.4694,
"step": 1696
},
{
"epoch": 0.9355016538037486,
"grad_norm": 0.24209177494049072,
"learning_rate": 0.0001601160862354892,
"loss": 0.4716,
"step": 1697
},
{
"epoch": 0.9360529217199559,
"grad_norm": 0.23071332275867462,
"learning_rate": 0.00016003316749585406,
"loss": 0.4548,
"step": 1698
},
{
"epoch": 0.9366041896361632,
"grad_norm": 0.2404324859380722,
"learning_rate": 0.00015995024875621887,
"loss": 0.4614,
"step": 1699
},
{
"epoch": 0.9371554575523704,
"grad_norm": 0.24288049340248108,
"learning_rate": 0.00015986733001658373,
"loss": 0.477,
"step": 1700
},
{
"epoch": 0.9377067254685777,
"grad_norm": 0.2315543293952942,
"learning_rate": 0.00015978441127694856,
"loss": 0.4294,
"step": 1701
},
{
"epoch": 0.938257993384785,
"grad_norm": 0.24326400458812714,
"learning_rate": 0.00015970149253731342,
"loss": 0.4751,
"step": 1702
},
{
"epoch": 0.9388092613009923,
"grad_norm": 0.23202817142009735,
"learning_rate": 0.00015961857379767825,
"loss": 0.4539,
"step": 1703
},
{
"epoch": 0.9393605292171996,
"grad_norm": 0.24364544451236725,
"learning_rate": 0.0001595356550580431,
"loss": 0.4742,
"step": 1704
},
{
"epoch": 0.9399117971334069,
"grad_norm": 0.24248524010181427,
"learning_rate": 0.00015945273631840794,
"loss": 0.4335,
"step": 1705
},
{
"epoch": 0.9404630650496141,
"grad_norm": 0.2423916757106781,
"learning_rate": 0.0001593698175787728,
"loss": 0.4825,
"step": 1706
},
{
"epoch": 0.9410143329658214,
"grad_norm": 0.22844377160072327,
"learning_rate": 0.00015928689883913763,
"loss": 0.468,
"step": 1707
},
{
"epoch": 0.9415656008820287,
"grad_norm": 0.23481746017932892,
"learning_rate": 0.0001592039800995025,
"loss": 0.459,
"step": 1708
},
{
"epoch": 0.942116868798236,
"grad_norm": 0.23676711320877075,
"learning_rate": 0.0001591210613598673,
"loss": 0.4748,
"step": 1709
},
{
"epoch": 0.9426681367144433,
"grad_norm": 0.23470185697078705,
"learning_rate": 0.00015903814262023216,
"loss": 0.4538,
"step": 1710
},
{
"epoch": 0.9432194046306505,
"grad_norm": 0.26180773973464966,
"learning_rate": 0.000158955223880597,
"loss": 0.4737,
"step": 1711
},
{
"epoch": 0.9437706725468578,
"grad_norm": 0.23656126856803894,
"learning_rate": 0.00015887230514096185,
"loss": 0.4716,
"step": 1712
},
{
"epoch": 0.9443219404630651,
"grad_norm": 0.2338191270828247,
"learning_rate": 0.00015878938640132668,
"loss": 0.4712,
"step": 1713
},
{
"epoch": 0.9448732083792724,
"grad_norm": 0.2348823845386505,
"learning_rate": 0.00015870646766169154,
"loss": 0.4645,
"step": 1714
},
{
"epoch": 0.9454244762954797,
"grad_norm": 0.23620596528053284,
"learning_rate": 0.00015862354892205637,
"loss": 0.4456,
"step": 1715
},
{
"epoch": 0.9459757442116868,
"grad_norm": 0.25021445751190186,
"learning_rate": 0.00015854063018242123,
"loss": 0.4807,
"step": 1716
},
{
"epoch": 0.9465270121278941,
"grad_norm": 0.23087383806705475,
"learning_rate": 0.00015845771144278606,
"loss": 0.4648,
"step": 1717
},
{
"epoch": 0.9470782800441014,
"grad_norm": 0.23474477231502533,
"learning_rate": 0.00015837479270315092,
"loss": 0.4672,
"step": 1718
},
{
"epoch": 0.9476295479603087,
"grad_norm": 0.2543323338031769,
"learning_rate": 0.00015829187396351573,
"loss": 0.473,
"step": 1719
},
{
"epoch": 0.948180815876516,
"grad_norm": 0.2378506064414978,
"learning_rate": 0.00015820895522388059,
"loss": 0.4569,
"step": 1720
},
{
"epoch": 0.9487320837927232,
"grad_norm": 0.23003467917442322,
"learning_rate": 0.00015812603648424542,
"loss": 0.4621,
"step": 1721
},
{
"epoch": 0.9492833517089305,
"grad_norm": 0.24162529408931732,
"learning_rate": 0.00015804311774461028,
"loss": 0.445,
"step": 1722
},
{
"epoch": 0.9498346196251378,
"grad_norm": 0.23978053033351898,
"learning_rate": 0.0001579601990049751,
"loss": 0.4753,
"step": 1723
},
{
"epoch": 0.9503858875413451,
"grad_norm": 0.23133328557014465,
"learning_rate": 0.00015787728026533997,
"loss": 0.4735,
"step": 1724
},
{
"epoch": 0.9509371554575524,
"grad_norm": 0.20942679047584534,
"learning_rate": 0.0001577943615257048,
"loss": 0.4208,
"step": 1725
},
{
"epoch": 0.9514884233737596,
"grad_norm": 0.23965676128864288,
"learning_rate": 0.00015771144278606966,
"loss": 0.4758,
"step": 1726
},
{
"epoch": 0.9520396912899669,
"grad_norm": 0.23537394404411316,
"learning_rate": 0.0001576285240464345,
"loss": 0.4276,
"step": 1727
},
{
"epoch": 0.9525909592061742,
"grad_norm": 0.24360457062721252,
"learning_rate": 0.00015754560530679935,
"loss": 0.4686,
"step": 1728
},
{
"epoch": 0.9531422271223815,
"grad_norm": 0.22790101170539856,
"learning_rate": 0.00015746268656716416,
"loss": 0.4501,
"step": 1729
},
{
"epoch": 0.9536934950385888,
"grad_norm": 0.23862150311470032,
"learning_rate": 0.00015737976782752901,
"loss": 0.4545,
"step": 1730
},
{
"epoch": 0.954244762954796,
"grad_norm": 0.24378471076488495,
"learning_rate": 0.00015729684908789385,
"loss": 0.4912,
"step": 1731
},
{
"epoch": 0.9547960308710033,
"grad_norm": 0.23474174737930298,
"learning_rate": 0.00015721393034825868,
"loss": 0.4692,
"step": 1732
},
{
"epoch": 0.9553472987872106,
"grad_norm": 0.24299736320972443,
"learning_rate": 0.00015713101160862354,
"loss": 0.4582,
"step": 1733
},
{
"epoch": 0.9558985667034179,
"grad_norm": 0.23355722427368164,
"learning_rate": 0.00015704809286898837,
"loss": 0.4579,
"step": 1734
},
{
"epoch": 0.9564498346196252,
"grad_norm": 0.2307385504245758,
"learning_rate": 0.00015696517412935323,
"loss": 0.4276,
"step": 1735
},
{
"epoch": 0.9570011025358324,
"grad_norm": 0.25666573643684387,
"learning_rate": 0.00015688225538971806,
"loss": 0.4488,
"step": 1736
},
{
"epoch": 0.9575523704520397,
"grad_norm": 0.2472536265850067,
"learning_rate": 0.00015679933665008292,
"loss": 0.4635,
"step": 1737
},
{
"epoch": 0.958103638368247,
"grad_norm": 0.23561540246009827,
"learning_rate": 0.00015671641791044772,
"loss": 0.456,
"step": 1738
},
{
"epoch": 0.9586549062844543,
"grad_norm": 0.2695865333080292,
"learning_rate": 0.00015663349917081258,
"loss": 0.4894,
"step": 1739
},
{
"epoch": 0.9592061742006616,
"grad_norm": 0.23878848552703857,
"learning_rate": 0.00015655058043117742,
"loss": 0.4945,
"step": 1740
},
{
"epoch": 0.9597574421168688,
"grad_norm": 0.2417537271976471,
"learning_rate": 0.00015646766169154228,
"loss": 0.4456,
"step": 1741
},
{
"epoch": 0.960308710033076,
"grad_norm": 0.258645623922348,
"learning_rate": 0.0001563847429519071,
"loss": 0.4767,
"step": 1742
},
{
"epoch": 0.9608599779492834,
"grad_norm": 0.23502197861671448,
"learning_rate": 0.00015630182421227197,
"loss": 0.4636,
"step": 1743
},
{
"epoch": 0.9614112458654906,
"grad_norm": 0.22951334714889526,
"learning_rate": 0.0001562189054726368,
"loss": 0.4329,
"step": 1744
},
{
"epoch": 0.961962513781698,
"grad_norm": 0.24502499401569366,
"learning_rate": 0.00015613598673300166,
"loss": 0.4452,
"step": 1745
},
{
"epoch": 0.9625137816979051,
"grad_norm": 0.24659104645252228,
"learning_rate": 0.0001560530679933665,
"loss": 0.4489,
"step": 1746
},
{
"epoch": 0.9630650496141124,
"grad_norm": 0.2458224892616272,
"learning_rate": 0.00015597014925373135,
"loss": 0.4903,
"step": 1747
},
{
"epoch": 0.9636163175303197,
"grad_norm": 0.24105043709278107,
"learning_rate": 0.00015588723051409615,
"loss": 0.4738,
"step": 1748
},
{
"epoch": 0.964167585446527,
"grad_norm": 0.2505391836166382,
"learning_rate": 0.000155804311774461,
"loss": 0.4643,
"step": 1749
},
{
"epoch": 0.9647188533627343,
"grad_norm": 0.23488488793373108,
"learning_rate": 0.00015572139303482584,
"loss": 0.4731,
"step": 1750
},
{
"epoch": 0.9652701212789415,
"grad_norm": 0.2317710667848587,
"learning_rate": 0.0001556384742951907,
"loss": 0.4736,
"step": 1751
},
{
"epoch": 0.9658213891951488,
"grad_norm": 0.23009353876113892,
"learning_rate": 0.00015555555555555554,
"loss": 0.4512,
"step": 1752
},
{
"epoch": 0.9663726571113561,
"grad_norm": 0.24625705182552338,
"learning_rate": 0.0001554726368159204,
"loss": 0.455,
"step": 1753
},
{
"epoch": 0.9669239250275634,
"grad_norm": 0.2400812804698944,
"learning_rate": 0.00015538971807628523,
"loss": 0.4725,
"step": 1754
},
{
"epoch": 0.9674751929437707,
"grad_norm": 0.26011791825294495,
"learning_rate": 0.00015530679933665009,
"loss": 0.4868,
"step": 1755
},
{
"epoch": 0.9680264608599779,
"grad_norm": 0.2298017144203186,
"learning_rate": 0.0001552238805970149,
"loss": 0.4559,
"step": 1756
},
{
"epoch": 0.9685777287761852,
"grad_norm": 0.23378150165081024,
"learning_rate": 0.00015514096185737978,
"loss": 0.4511,
"step": 1757
},
{
"epoch": 0.9691289966923925,
"grad_norm": 0.24460946023464203,
"learning_rate": 0.00015505804311774458,
"loss": 0.4571,
"step": 1758
},
{
"epoch": 0.9696802646085998,
"grad_norm": 0.241620734333992,
"learning_rate": 0.00015497512437810944,
"loss": 0.4743,
"step": 1759
},
{
"epoch": 0.9702315325248071,
"grad_norm": 0.23285698890686035,
"learning_rate": 0.00015489220563847427,
"loss": 0.4619,
"step": 1760
},
{
"epoch": 0.9707828004410143,
"grad_norm": 0.24175579845905304,
"learning_rate": 0.00015480928689883913,
"loss": 0.4544,
"step": 1761
},
{
"epoch": 0.9713340683572216,
"grad_norm": 0.22799162566661835,
"learning_rate": 0.00015472636815920396,
"loss": 0.4679,
"step": 1762
},
{
"epoch": 0.9718853362734289,
"grad_norm": 0.23015514016151428,
"learning_rate": 0.00015464344941956882,
"loss": 0.4867,
"step": 1763
},
{
"epoch": 0.9724366041896362,
"grad_norm": 0.22983665764331818,
"learning_rate": 0.00015456053067993366,
"loss": 0.4608,
"step": 1764
},
{
"epoch": 0.9729878721058435,
"grad_norm": 0.22515413165092468,
"learning_rate": 0.00015447761194029851,
"loss": 0.4578,
"step": 1765
},
{
"epoch": 0.9735391400220507,
"grad_norm": 0.23187264800071716,
"learning_rate": 0.00015439469320066332,
"loss": 0.4253,
"step": 1766
},
{
"epoch": 0.974090407938258,
"grad_norm": 0.23280374705791473,
"learning_rate": 0.00015431177446102818,
"loss": 0.4473,
"step": 1767
},
{
"epoch": 0.9746416758544653,
"grad_norm": 0.2500572204589844,
"learning_rate": 0.000154228855721393,
"loss": 0.4519,
"step": 1768
},
{
"epoch": 0.9751929437706726,
"grad_norm": 0.23001956939697266,
"learning_rate": 0.00015414593698175787,
"loss": 0.4708,
"step": 1769
},
{
"epoch": 0.9757442116868799,
"grad_norm": 0.23875866830348969,
"learning_rate": 0.0001540630182421227,
"loss": 0.4679,
"step": 1770
},
{
"epoch": 0.976295479603087,
"grad_norm": 0.22990469634532928,
"learning_rate": 0.00015398009950248756,
"loss": 0.4632,
"step": 1771
},
{
"epoch": 0.9768467475192943,
"grad_norm": 0.24912653863430023,
"learning_rate": 0.0001538971807628524,
"loss": 0.4569,
"step": 1772
},
{
"epoch": 0.9773980154355016,
"grad_norm": 0.2521923780441284,
"learning_rate": 0.00015381426202321725,
"loss": 0.4696,
"step": 1773
},
{
"epoch": 0.9779492833517089,
"grad_norm": 0.23184111714363098,
"learning_rate": 0.00015373134328358208,
"loss": 0.4518,
"step": 1774
},
{
"epoch": 0.9785005512679162,
"grad_norm": 0.22830599546432495,
"learning_rate": 0.0001536484245439469,
"loss": 0.4511,
"step": 1775
},
{
"epoch": 0.9790518191841234,
"grad_norm": 0.24908460676670074,
"learning_rate": 0.00015356550580431175,
"loss": 0.4556,
"step": 1776
},
{
"epoch": 0.9796030871003307,
"grad_norm": 0.2542704939842224,
"learning_rate": 0.00015348258706467658,
"loss": 0.4876,
"step": 1777
},
{
"epoch": 0.980154355016538,
"grad_norm": 0.23091669380664825,
"learning_rate": 0.00015339966832504144,
"loss": 0.4502,
"step": 1778
},
{
"epoch": 0.9807056229327453,
"grad_norm": 0.24079181253910065,
"learning_rate": 0.00015331674958540627,
"loss": 0.4549,
"step": 1779
},
{
"epoch": 0.9812568908489526,
"grad_norm": 0.224042147397995,
"learning_rate": 0.00015323383084577113,
"loss": 0.4568,
"step": 1780
},
{
"epoch": 0.9818081587651599,
"grad_norm": 0.23204737901687622,
"learning_rate": 0.00015315091210613596,
"loss": 0.4516,
"step": 1781
},
{
"epoch": 0.9823594266813671,
"grad_norm": 0.24899733066558838,
"learning_rate": 0.00015306799336650082,
"loss": 0.4422,
"step": 1782
},
{
"epoch": 0.9829106945975744,
"grad_norm": 0.2473718822002411,
"learning_rate": 0.00015298507462686565,
"loss": 0.4698,
"step": 1783
},
{
"epoch": 0.9834619625137817,
"grad_norm": 0.23376363515853882,
"learning_rate": 0.0001529021558872305,
"loss": 0.4735,
"step": 1784
},
{
"epoch": 0.984013230429989,
"grad_norm": 0.21901825070381165,
"learning_rate": 0.00015281923714759532,
"loss": 0.4055,
"step": 1785
},
{
"epoch": 0.9845644983461963,
"grad_norm": 0.24539053440093994,
"learning_rate": 0.00015273631840796018,
"loss": 0.477,
"step": 1786
},
{
"epoch": 0.9851157662624035,
"grad_norm": 0.2802634537220001,
"learning_rate": 0.000152653399668325,
"loss": 0.4924,
"step": 1787
},
{
"epoch": 0.9856670341786108,
"grad_norm": 0.2387421429157257,
"learning_rate": 0.00015257048092868987,
"loss": 0.4671,
"step": 1788
},
{
"epoch": 0.9862183020948181,
"grad_norm": 0.22999261319637299,
"learning_rate": 0.0001524875621890547,
"loss": 0.4682,
"step": 1789
},
{
"epoch": 0.9867695700110254,
"grad_norm": 0.2567140758037567,
"learning_rate": 0.00015240464344941956,
"loss": 0.4395,
"step": 1790
},
{
"epoch": 0.9873208379272327,
"grad_norm": 0.24533671140670776,
"learning_rate": 0.0001523217247097844,
"loss": 0.4415,
"step": 1791
},
{
"epoch": 0.9878721058434399,
"grad_norm": 0.24147699773311615,
"learning_rate": 0.00015223880597014925,
"loss": 0.4731,
"step": 1792
},
{
"epoch": 0.9884233737596472,
"grad_norm": 0.23697462677955627,
"learning_rate": 0.00015215588723051408,
"loss": 0.451,
"step": 1793
},
{
"epoch": 0.9889746416758545,
"grad_norm": 0.2380775809288025,
"learning_rate": 0.00015207296849087894,
"loss": 0.452,
"step": 1794
},
{
"epoch": 0.9895259095920618,
"grad_norm": 0.24654051661491394,
"learning_rate": 0.00015199004975124375,
"loss": 0.4724,
"step": 1795
},
{
"epoch": 0.9900771775082691,
"grad_norm": 0.2548507750034332,
"learning_rate": 0.0001519071310116086,
"loss": 0.4578,
"step": 1796
},
{
"epoch": 0.9906284454244763,
"grad_norm": 0.23419903218746185,
"learning_rate": 0.00015182421227197344,
"loss": 0.4627,
"step": 1797
},
{
"epoch": 0.9911797133406836,
"grad_norm": 0.2721438705921173,
"learning_rate": 0.0001517412935323383,
"loss": 0.4704,
"step": 1798
},
{
"epoch": 0.9917309812568909,
"grad_norm": 0.22823266685009003,
"learning_rate": 0.00015165837479270313,
"loss": 0.4402,
"step": 1799
},
{
"epoch": 0.9922822491730982,
"grad_norm": 0.3155699372291565,
"learning_rate": 0.000151575456053068,
"loss": 0.4537,
"step": 1800
},
{
"epoch": 0.9928335170893055,
"grad_norm": 0.24750587344169617,
"learning_rate": 0.00015149253731343282,
"loss": 0.4674,
"step": 1801
},
{
"epoch": 0.9933847850055126,
"grad_norm": 0.23167037963867188,
"learning_rate": 0.00015140961857379768,
"loss": 0.4506,
"step": 1802
},
{
"epoch": 0.9939360529217199,
"grad_norm": 0.24583961069583893,
"learning_rate": 0.0001513266998341625,
"loss": 0.4809,
"step": 1803
},
{
"epoch": 0.9944873208379272,
"grad_norm": 0.23894868791103363,
"learning_rate": 0.00015124378109452737,
"loss": 0.4729,
"step": 1804
},
{
"epoch": 0.9950385887541345,
"grad_norm": 0.23357604444026947,
"learning_rate": 0.00015116086235489218,
"loss": 0.4608,
"step": 1805
},
{
"epoch": 0.9955898566703418,
"grad_norm": 0.2364039272069931,
"learning_rate": 0.00015107794361525703,
"loss": 0.4803,
"step": 1806
},
{
"epoch": 0.996141124586549,
"grad_norm": 0.23034816980361938,
"learning_rate": 0.00015099502487562187,
"loss": 0.4687,
"step": 1807
},
{
"epoch": 0.9966923925027563,
"grad_norm": 0.23677074909210205,
"learning_rate": 0.00015091210613598673,
"loss": 0.4591,
"step": 1808
},
{
"epoch": 0.9972436604189636,
"grad_norm": 0.24638359248638153,
"learning_rate": 0.00015082918739635156,
"loss": 0.462,
"step": 1809
},
{
"epoch": 0.9977949283351709,
"grad_norm": 0.23346304893493652,
"learning_rate": 0.00015074626865671642,
"loss": 0.4245,
"step": 1810
},
{
"epoch": 0.9983461962513782,
"grad_norm": 0.2604617774486542,
"learning_rate": 0.00015066334991708125,
"loss": 0.4665,
"step": 1811
},
{
"epoch": 0.9988974641675854,
"grad_norm": 0.22308942675590515,
"learning_rate": 0.0001505804311774461,
"loss": 0.4671,
"step": 1812
},
{
"epoch": 0.9994487320837927,
"grad_norm": 0.2405402511358261,
"learning_rate": 0.00015049751243781094,
"loss": 0.4808,
"step": 1813
},
{
"epoch": 1.0,
"grad_norm": 0.2668411433696747,
"learning_rate": 0.0001504145936981758,
"loss": 0.4683,
"step": 1814
},
{
"epoch": 1.0005512679162072,
"grad_norm": 0.23000217974185944,
"learning_rate": 0.0001503316749585406,
"loss": 0.3736,
"step": 1815
},
{
"epoch": 1.0011025358324146,
"grad_norm": 0.2307773381471634,
"learning_rate": 0.00015024875621890546,
"loss": 0.3834,
"step": 1816
},
{
"epoch": 1.0016538037486218,
"grad_norm": 0.23737002909183502,
"learning_rate": 0.0001501658374792703,
"loss": 0.3863,
"step": 1817
},
{
"epoch": 1.0022050716648292,
"grad_norm": 0.2283601313829422,
"learning_rate": 0.00015008291873963515,
"loss": 0.3837,
"step": 1818
},
{
"epoch": 1.0027563395810364,
"grad_norm": 0.21821331977844238,
"learning_rate": 0.00015,
"loss": 0.4085,
"step": 1819
},
{
"epoch": 1.0033076074972436,
"grad_norm": 0.2391849011182785,
"learning_rate": 0.00014991708126036482,
"loss": 0.4207,
"step": 1820
},
{
"epoch": 1.003858875413451,
"grad_norm": 0.23875446617603302,
"learning_rate": 0.00014983416252072968,
"loss": 0.4013,
"step": 1821
},
{
"epoch": 1.0044101433296582,
"grad_norm": 0.24305221438407898,
"learning_rate": 0.0001497512437810945,
"loss": 0.4106,
"step": 1822
},
{
"epoch": 1.0049614112458656,
"grad_norm": 0.21675904095172882,
"learning_rate": 0.00014966832504145937,
"loss": 0.3744,
"step": 1823
},
{
"epoch": 1.0055126791620728,
"grad_norm": 0.23470553755760193,
"learning_rate": 0.0001495854063018242,
"loss": 0.3997,
"step": 1824
},
{
"epoch": 1.00606394707828,
"grad_norm": 0.2310658097267151,
"learning_rate": 0.00014950248756218903,
"loss": 0.411,
"step": 1825
},
{
"epoch": 1.0066152149944874,
"grad_norm": 0.23178675770759583,
"learning_rate": 0.0001494195688225539,
"loss": 0.3884,
"step": 1826
},
{
"epoch": 1.0071664829106945,
"grad_norm": 0.23985427618026733,
"learning_rate": 0.00014933665008291872,
"loss": 0.4026,
"step": 1827
},
{
"epoch": 1.007717750826902,
"grad_norm": 0.228210911154747,
"learning_rate": 0.00014925373134328358,
"loss": 0.3952,
"step": 1828
},
{
"epoch": 1.0082690187431091,
"grad_norm": 0.22802165150642395,
"learning_rate": 0.00014917081260364842,
"loss": 0.4194,
"step": 1829
},
{
"epoch": 1.0088202866593163,
"grad_norm": 0.2423812299966812,
"learning_rate": 0.00014908789386401325,
"loss": 0.4282,
"step": 1830
},
{
"epoch": 1.0093715545755237,
"grad_norm": 0.23589813709259033,
"learning_rate": 0.0001490049751243781,
"loss": 0.3911,
"step": 1831
},
{
"epoch": 1.009922822491731,
"grad_norm": 0.21917280554771423,
"learning_rate": 0.00014892205638474294,
"loss": 0.3723,
"step": 1832
},
{
"epoch": 1.0104740904079383,
"grad_norm": 0.22650456428527832,
"learning_rate": 0.0001488391376451078,
"loss": 0.3962,
"step": 1833
},
{
"epoch": 1.0110253583241455,
"grad_norm": 0.23731641471385956,
"learning_rate": 0.00014875621890547263,
"loss": 0.4235,
"step": 1834
},
{
"epoch": 1.0115766262403527,
"grad_norm": 0.21167220175266266,
"learning_rate": 0.00014867330016583746,
"loss": 0.3786,
"step": 1835
},
{
"epoch": 1.0121278941565601,
"grad_norm": 0.23506543040275574,
"learning_rate": 0.00014859038142620232,
"loss": 0.4098,
"step": 1836
},
{
"epoch": 1.0126791620727673,
"grad_norm": 0.25581830739974976,
"learning_rate": 0.00014850746268656715,
"loss": 0.4052,
"step": 1837
},
{
"epoch": 1.0132304299889747,
"grad_norm": 0.2236202359199524,
"learning_rate": 0.000148424543946932,
"loss": 0.3975,
"step": 1838
},
{
"epoch": 1.013781697905182,
"grad_norm": 0.21659554541110992,
"learning_rate": 0.00014834162520729684,
"loss": 0.3843,
"step": 1839
},
{
"epoch": 1.014332965821389,
"grad_norm": 0.22564005851745605,
"learning_rate": 0.00014825870646766168,
"loss": 0.4013,
"step": 1840
},
{
"epoch": 1.0148842337375965,
"grad_norm": 0.225655660033226,
"learning_rate": 0.00014817578772802654,
"loss": 0.3976,
"step": 1841
},
{
"epoch": 1.0154355016538037,
"grad_norm": 0.21095581352710724,
"learning_rate": 0.00014809286898839137,
"loss": 0.3812,
"step": 1842
},
{
"epoch": 1.015986769570011,
"grad_norm": 0.23854820430278778,
"learning_rate": 0.0001480099502487562,
"loss": 0.4089,
"step": 1843
},
{
"epoch": 1.0165380374862183,
"grad_norm": 0.22585038840770721,
"learning_rate": 0.00014792703150912103,
"loss": 0.4193,
"step": 1844
},
{
"epoch": 1.0170893054024255,
"grad_norm": 0.2209796905517578,
"learning_rate": 0.0001478441127694859,
"loss": 0.3989,
"step": 1845
},
{
"epoch": 1.017640573318633,
"grad_norm": 0.2113056629896164,
"learning_rate": 0.00014776119402985072,
"loss": 0.4089,
"step": 1846
},
{
"epoch": 1.01819184123484,
"grad_norm": 0.22150270640850067,
"learning_rate": 0.00014767827529021558,
"loss": 0.3946,
"step": 1847
},
{
"epoch": 1.0187431091510475,
"grad_norm": 0.22819051146507263,
"learning_rate": 0.00014759535655058041,
"loss": 0.3914,
"step": 1848
},
{
"epoch": 1.0192943770672547,
"grad_norm": 0.21912482380867004,
"learning_rate": 0.00014751243781094525,
"loss": 0.3621,
"step": 1849
},
{
"epoch": 1.0198456449834619,
"grad_norm": 0.22611315548419952,
"learning_rate": 0.0001474295190713101,
"loss": 0.386,
"step": 1850
},
{
"epoch": 1.0203969128996693,
"grad_norm": 0.225437730550766,
"learning_rate": 0.00014734660033167494,
"loss": 0.4115,
"step": 1851
},
{
"epoch": 1.0209481808158765,
"grad_norm": 0.22555771470069885,
"learning_rate": 0.0001472636815920398,
"loss": 0.4121,
"step": 1852
},
{
"epoch": 1.0214994487320839,
"grad_norm": 0.22996987402439117,
"learning_rate": 0.00014718076285240463,
"loss": 0.3799,
"step": 1853
},
{
"epoch": 1.022050716648291,
"grad_norm": 0.227546826004982,
"learning_rate": 0.00014709784411276946,
"loss": 0.406,
"step": 1854
},
{
"epoch": 1.0226019845644982,
"grad_norm": 0.21384532749652863,
"learning_rate": 0.00014701492537313432,
"loss": 0.393,
"step": 1855
},
{
"epoch": 1.0231532524807057,
"grad_norm": 0.21834981441497803,
"learning_rate": 0.00014693200663349915,
"loss": 0.3737,
"step": 1856
},
{
"epoch": 1.0237045203969128,
"grad_norm": 0.2231069952249527,
"learning_rate": 0.000146849087893864,
"loss": 0.3755,
"step": 1857
},
{
"epoch": 1.0242557883131203,
"grad_norm": 0.22336961328983307,
"learning_rate": 0.00014676616915422884,
"loss": 0.3936,
"step": 1858
},
{
"epoch": 1.0248070562293274,
"grad_norm": 0.22250871360301971,
"learning_rate": 0.00014668325041459367,
"loss": 0.4021,
"step": 1859
},
{
"epoch": 1.0253583241455346,
"grad_norm": 0.21691983938217163,
"learning_rate": 0.00014660033167495853,
"loss": 0.375,
"step": 1860
},
{
"epoch": 1.025909592061742,
"grad_norm": 0.2267792969942093,
"learning_rate": 0.00014651741293532337,
"loss": 0.4089,
"step": 1861
},
{
"epoch": 1.0264608599779492,
"grad_norm": 0.22236919403076172,
"learning_rate": 0.00014643449419568823,
"loss": 0.384,
"step": 1862
},
{
"epoch": 1.0270121278941566,
"grad_norm": 0.2280534952878952,
"learning_rate": 0.00014635157545605306,
"loss": 0.3982,
"step": 1863
},
{
"epoch": 1.0275633958103638,
"grad_norm": 0.23323461413383484,
"learning_rate": 0.0001462686567164179,
"loss": 0.3947,
"step": 1864
},
{
"epoch": 1.028114663726571,
"grad_norm": 0.2187027931213379,
"learning_rate": 0.00014618573797678275,
"loss": 0.3758,
"step": 1865
},
{
"epoch": 1.0286659316427784,
"grad_norm": 0.2233375459909439,
"learning_rate": 0.00014610281923714758,
"loss": 0.3889,
"step": 1866
},
{
"epoch": 1.0292171995589856,
"grad_norm": 0.23430676758289337,
"learning_rate": 0.00014601990049751244,
"loss": 0.3919,
"step": 1867
},
{
"epoch": 1.029768467475193,
"grad_norm": 0.22947613894939423,
"learning_rate": 0.00014593698175787727,
"loss": 0.3886,
"step": 1868
},
{
"epoch": 1.0303197353914002,
"grad_norm": 0.23334287106990814,
"learning_rate": 0.0001458540630182421,
"loss": 0.413,
"step": 1869
},
{
"epoch": 1.0308710033076074,
"grad_norm": 0.2178686261177063,
"learning_rate": 0.00014577114427860696,
"loss": 0.393,
"step": 1870
},
{
"epoch": 1.0314222712238148,
"grad_norm": 0.2510049045085907,
"learning_rate": 0.0001456882255389718,
"loss": 0.413,
"step": 1871
},
{
"epoch": 1.031973539140022,
"grad_norm": 0.23210124671459198,
"learning_rate": 0.00014560530679933665,
"loss": 0.3817,
"step": 1872
},
{
"epoch": 1.0325248070562294,
"grad_norm": 0.23246748745441437,
"learning_rate": 0.00014552238805970149,
"loss": 0.4026,
"step": 1873
},
{
"epoch": 1.0330760749724366,
"grad_norm": 0.22752533853054047,
"learning_rate": 0.00014543946932006632,
"loss": 0.411,
"step": 1874
},
{
"epoch": 1.0336273428886438,
"grad_norm": 0.21562816202640533,
"learning_rate": 0.00014535655058043118,
"loss": 0.3966,
"step": 1875
},
{
"epoch": 1.0341786108048512,
"grad_norm": 0.227711021900177,
"learning_rate": 0.000145273631840796,
"loss": 0.4008,
"step": 1876
},
{
"epoch": 1.0347298787210584,
"grad_norm": 0.22064116597175598,
"learning_rate": 0.00014519071310116087,
"loss": 0.3855,
"step": 1877
},
{
"epoch": 1.0352811466372658,
"grad_norm": 0.22657108306884766,
"learning_rate": 0.0001451077943615257,
"loss": 0.4147,
"step": 1878
},
{
"epoch": 1.035832414553473,
"grad_norm": 0.220686674118042,
"learning_rate": 0.00014502487562189053,
"loss": 0.3953,
"step": 1879
},
{
"epoch": 1.0363836824696802,
"grad_norm": 0.21113237738609314,
"learning_rate": 0.0001449419568822554,
"loss": 0.3908,
"step": 1880
},
{
"epoch": 1.0369349503858876,
"grad_norm": 0.21575047075748444,
"learning_rate": 0.00014485903814262022,
"loss": 0.3917,
"step": 1881
},
{
"epoch": 1.0374862183020948,
"grad_norm": 0.22273024916648865,
"learning_rate": 0.00014477611940298508,
"loss": 0.4007,
"step": 1882
},
{
"epoch": 1.0380374862183022,
"grad_norm": 0.22036762535572052,
"learning_rate": 0.00014469320066334991,
"loss": 0.3797,
"step": 1883
},
{
"epoch": 1.0385887541345094,
"grad_norm": 0.22144779562950134,
"learning_rate": 0.00014461028192371475,
"loss": 0.3911,
"step": 1884
},
{
"epoch": 1.0391400220507165,
"grad_norm": 0.22937916219234467,
"learning_rate": 0.0001445273631840796,
"loss": 0.406,
"step": 1885
},
{
"epoch": 1.039691289966924,
"grad_norm": 0.21770672500133514,
"learning_rate": 0.0001444444444444444,
"loss": 0.389,
"step": 1886
},
{
"epoch": 1.0402425578831311,
"grad_norm": 0.2170240730047226,
"learning_rate": 0.00014436152570480927,
"loss": 0.4225,
"step": 1887
},
{
"epoch": 1.0407938257993385,
"grad_norm": 0.23694483935832977,
"learning_rate": 0.0001442786069651741,
"loss": 0.4124,
"step": 1888
},
{
"epoch": 1.0413450937155457,
"grad_norm": 0.2358977198600769,
"learning_rate": 0.00014419568822553896,
"loss": 0.3932,
"step": 1889
},
{
"epoch": 1.041896361631753,
"grad_norm": 0.2379174828529358,
"learning_rate": 0.0001441127694859038,
"loss": 0.3921,
"step": 1890
},
{
"epoch": 1.0424476295479603,
"grad_norm": 0.22685475647449493,
"learning_rate": 0.00014402985074626863,
"loss": 0.398,
"step": 1891
},
{
"epoch": 1.0429988974641675,
"grad_norm": 0.2381109744310379,
"learning_rate": 0.00014394693200663348,
"loss": 0.4002,
"step": 1892
},
{
"epoch": 1.043550165380375,
"grad_norm": 0.23132000863552094,
"learning_rate": 0.00014386401326699832,
"loss": 0.3917,
"step": 1893
},
{
"epoch": 1.0441014332965821,
"grad_norm": 0.23595485091209412,
"learning_rate": 0.00014378109452736318,
"loss": 0.3811,
"step": 1894
},
{
"epoch": 1.0446527012127893,
"grad_norm": 0.23046362400054932,
"learning_rate": 0.000143698175787728,
"loss": 0.389,
"step": 1895
},
{
"epoch": 1.0452039691289967,
"grad_norm": 0.21979711949825287,
"learning_rate": 0.00014361525704809284,
"loss": 0.4008,
"step": 1896
},
{
"epoch": 1.045755237045204,
"grad_norm": 0.21169352531433105,
"learning_rate": 0.0001435323383084577,
"loss": 0.3767,
"step": 1897
},
{
"epoch": 1.0463065049614113,
"grad_norm": 0.2226918339729309,
"learning_rate": 0.00014344941956882253,
"loss": 0.4059,
"step": 1898
},
{
"epoch": 1.0468577728776185,
"grad_norm": 0.23048485815525055,
"learning_rate": 0.0001433665008291874,
"loss": 0.4013,
"step": 1899
},
{
"epoch": 1.0474090407938257,
"grad_norm": 0.22347117960453033,
"learning_rate": 0.00014328358208955222,
"loss": 0.4042,
"step": 1900
},
{
"epoch": 1.047960308710033,
"grad_norm": 0.2321341335773468,
"learning_rate": 0.00014320066334991705,
"loss": 0.4055,
"step": 1901
},
{
"epoch": 1.0485115766262403,
"grad_norm": 0.22918953001499176,
"learning_rate": 0.0001431177446102819,
"loss": 0.3845,
"step": 1902
},
{
"epoch": 1.0490628445424477,
"grad_norm": 0.21781106293201447,
"learning_rate": 0.00014303482587064675,
"loss": 0.4067,
"step": 1903
},
{
"epoch": 1.0496141124586549,
"grad_norm": 0.21180634200572968,
"learning_rate": 0.0001429519071310116,
"loss": 0.3891,
"step": 1904
},
{
"epoch": 1.0501653803748623,
"grad_norm": 0.2400248795747757,
"learning_rate": 0.00014286898839137644,
"loss": 0.3878,
"step": 1905
},
{
"epoch": 1.0507166482910695,
"grad_norm": 0.22464604675769806,
"learning_rate": 0.00014278606965174127,
"loss": 0.3909,
"step": 1906
},
{
"epoch": 1.0512679162072767,
"grad_norm": 0.23820553719997406,
"learning_rate": 0.00014270315091210613,
"loss": 0.3967,
"step": 1907
},
{
"epoch": 1.051819184123484,
"grad_norm": 0.23168790340423584,
"learning_rate": 0.00014262023217247096,
"loss": 0.4057,
"step": 1908
},
{
"epoch": 1.0523704520396913,
"grad_norm": 0.2253868579864502,
"learning_rate": 0.00014253731343283582,
"loss": 0.3844,
"step": 1909
},
{
"epoch": 1.0529217199558987,
"grad_norm": 0.21465058624744415,
"learning_rate": 0.00014245439469320065,
"loss": 0.3804,
"step": 1910
},
{
"epoch": 1.0534729878721059,
"grad_norm": 0.22617360949516296,
"learning_rate": 0.00014237147595356548,
"loss": 0.3738,
"step": 1911
},
{
"epoch": 1.054024255788313,
"grad_norm": 0.23942868411540985,
"learning_rate": 0.00014228855721393034,
"loss": 0.4044,
"step": 1912
},
{
"epoch": 1.0545755237045205,
"grad_norm": 0.23497670888900757,
"learning_rate": 0.00014220563847429517,
"loss": 0.4138,
"step": 1913
},
{
"epoch": 1.0551267916207276,
"grad_norm": 0.229624941945076,
"learning_rate": 0.00014212271973466003,
"loss": 0.402,
"step": 1914
},
{
"epoch": 1.055678059536935,
"grad_norm": 0.22944937646389008,
"learning_rate": 0.00014203980099502486,
"loss": 0.4016,
"step": 1915
},
{
"epoch": 1.0562293274531422,
"grad_norm": 0.2452874332666397,
"learning_rate": 0.0001419568822553897,
"loss": 0.4149,
"step": 1916
},
{
"epoch": 1.0567805953693494,
"grad_norm": 0.23434410989284515,
"learning_rate": 0.00014187396351575456,
"loss": 0.3818,
"step": 1917
},
{
"epoch": 1.0573318632855568,
"grad_norm": 0.22487396001815796,
"learning_rate": 0.0001417910447761194,
"loss": 0.4071,
"step": 1918
},
{
"epoch": 1.057883131201764,
"grad_norm": 0.2129317820072174,
"learning_rate": 0.00014170812603648425,
"loss": 0.3653,
"step": 1919
},
{
"epoch": 1.0584343991179714,
"grad_norm": 0.21573378145694733,
"learning_rate": 0.00014162520729684908,
"loss": 0.3924,
"step": 1920
},
{
"epoch": 1.0589856670341786,
"grad_norm": 0.23635123670101166,
"learning_rate": 0.0001415422885572139,
"loss": 0.3883,
"step": 1921
},
{
"epoch": 1.0595369349503858,
"grad_norm": 0.23705770075321198,
"learning_rate": 0.00014145936981757877,
"loss": 0.3865,
"step": 1922
},
{
"epoch": 1.0600882028665932,
"grad_norm": 0.22904790937900543,
"learning_rate": 0.0001413764510779436,
"loss": 0.3851,
"step": 1923
},
{
"epoch": 1.0606394707828004,
"grad_norm": 0.21958112716674805,
"learning_rate": 0.00014129353233830846,
"loss": 0.3965,
"step": 1924
},
{
"epoch": 1.0611907386990078,
"grad_norm": 0.232145294547081,
"learning_rate": 0.0001412106135986733,
"loss": 0.4001,
"step": 1925
},
{
"epoch": 1.061742006615215,
"grad_norm": 0.23748160898685455,
"learning_rate": 0.00014112769485903813,
"loss": 0.3809,
"step": 1926
},
{
"epoch": 1.0622932745314222,
"grad_norm": 0.25450122356414795,
"learning_rate": 0.00014104477611940298,
"loss": 0.3986,
"step": 1927
},
{
"epoch": 1.0628445424476296,
"grad_norm": 0.23028801381587982,
"learning_rate": 0.00014096185737976782,
"loss": 0.3905,
"step": 1928
},
{
"epoch": 1.0633958103638368,
"grad_norm": 0.23206226527690887,
"learning_rate": 0.00014087893864013268,
"loss": 0.3757,
"step": 1929
},
{
"epoch": 1.0639470782800442,
"grad_norm": 0.23685060441493988,
"learning_rate": 0.00014079601990049748,
"loss": 0.3844,
"step": 1930
},
{
"epoch": 1.0644983461962514,
"grad_norm": 0.22835825383663177,
"learning_rate": 0.00014071310116086234,
"loss": 0.388,
"step": 1931
},
{
"epoch": 1.0650496141124586,
"grad_norm": 0.2305503487586975,
"learning_rate": 0.00014063018242122717,
"loss": 0.4015,
"step": 1932
},
{
"epoch": 1.065600882028666,
"grad_norm": 0.23914876580238342,
"learning_rate": 0.00014054726368159203,
"loss": 0.3826,
"step": 1933
},
{
"epoch": 1.0661521499448732,
"grad_norm": 0.2508886158466339,
"learning_rate": 0.00014046434494195686,
"loss": 0.3948,
"step": 1934
},
{
"epoch": 1.0667034178610806,
"grad_norm": 0.280200332403183,
"learning_rate": 0.0001403814262023217,
"loss": 0.4042,
"step": 1935
},
{
"epoch": 1.0672546857772878,
"grad_norm": 0.22536714375019073,
"learning_rate": 0.00014029850746268655,
"loss": 0.3948,
"step": 1936
},
{
"epoch": 1.067805953693495,
"grad_norm": 0.24053654074668884,
"learning_rate": 0.0001402155887230514,
"loss": 0.3976,
"step": 1937
},
{
"epoch": 1.0683572216097024,
"grad_norm": 0.2461492270231247,
"learning_rate": 0.00014013266998341625,
"loss": 0.385,
"step": 1938
},
{
"epoch": 1.0689084895259096,
"grad_norm": 0.24768413603305817,
"learning_rate": 0.00014004975124378108,
"loss": 0.3734,
"step": 1939
},
{
"epoch": 1.069459757442117,
"grad_norm": 0.2460828721523285,
"learning_rate": 0.0001399668325041459,
"loss": 0.3924,
"step": 1940
},
{
"epoch": 1.0700110253583242,
"grad_norm": 0.2739814519882202,
"learning_rate": 0.00013988391376451077,
"loss": 0.3779,
"step": 1941
},
{
"epoch": 1.0705622932745313,
"grad_norm": 0.23434729874134064,
"learning_rate": 0.0001398009950248756,
"loss": 0.4186,
"step": 1942
},
{
"epoch": 1.0711135611907387,
"grad_norm": 0.23552288115024567,
"learning_rate": 0.00013971807628524046,
"loss": 0.3951,
"step": 1943
},
{
"epoch": 1.071664829106946,
"grad_norm": 0.2381044626235962,
"learning_rate": 0.0001396351575456053,
"loss": 0.3938,
"step": 1944
},
{
"epoch": 1.0722160970231533,
"grad_norm": 0.25459203124046326,
"learning_rate": 0.00013955223880597012,
"loss": 0.3997,
"step": 1945
},
{
"epoch": 1.0727673649393605,
"grad_norm": 0.2563784718513489,
"learning_rate": 0.00013946932006633498,
"loss": 0.404,
"step": 1946
},
{
"epoch": 1.0733186328555677,
"grad_norm": 0.23130348324775696,
"learning_rate": 0.00013938640132669982,
"loss": 0.3844,
"step": 1947
},
{
"epoch": 1.0738699007717751,
"grad_norm": 0.24562886357307434,
"learning_rate": 0.00013930348258706467,
"loss": 0.4131,
"step": 1948
},
{
"epoch": 1.0744211686879823,
"grad_norm": 0.22779060900211334,
"learning_rate": 0.0001392205638474295,
"loss": 0.4107,
"step": 1949
},
{
"epoch": 1.0749724366041897,
"grad_norm": 0.23528602719306946,
"learning_rate": 0.00013913764510779434,
"loss": 0.4128,
"step": 1950
},
{
"epoch": 1.075523704520397,
"grad_norm": 0.23987142741680145,
"learning_rate": 0.0001390547263681592,
"loss": 0.3987,
"step": 1951
},
{
"epoch": 1.076074972436604,
"grad_norm": 0.2401638627052307,
"learning_rate": 0.00013897180762852403,
"loss": 0.3923,
"step": 1952
},
{
"epoch": 1.0766262403528115,
"grad_norm": 0.24218258261680603,
"learning_rate": 0.0001388888888888889,
"loss": 0.4001,
"step": 1953
},
{
"epoch": 1.0771775082690187,
"grad_norm": 0.23231711983680725,
"learning_rate": 0.00013880597014925372,
"loss": 0.3795,
"step": 1954
},
{
"epoch": 1.0777287761852261,
"grad_norm": 0.2225574404001236,
"learning_rate": 0.00013872305140961855,
"loss": 0.3867,
"step": 1955
},
{
"epoch": 1.0782800441014333,
"grad_norm": 0.22481811046600342,
"learning_rate": 0.0001386401326699834,
"loss": 0.3946,
"step": 1956
},
{
"epoch": 1.0788313120176405,
"grad_norm": 0.22649556398391724,
"learning_rate": 0.00013855721393034824,
"loss": 0.3834,
"step": 1957
},
{
"epoch": 1.079382579933848,
"grad_norm": 0.21780644357204437,
"learning_rate": 0.0001384742951907131,
"loss": 0.3874,
"step": 1958
},
{
"epoch": 1.079933847850055,
"grad_norm": 0.21539410948753357,
"learning_rate": 0.00013839137645107794,
"loss": 0.3788,
"step": 1959
},
{
"epoch": 1.0804851157662625,
"grad_norm": 0.22845754027366638,
"learning_rate": 0.00013830845771144277,
"loss": 0.395,
"step": 1960
},
{
"epoch": 1.0810363836824697,
"grad_norm": 0.23722249269485474,
"learning_rate": 0.00013822553897180763,
"loss": 0.3993,
"step": 1961
},
{
"epoch": 1.0815876515986769,
"grad_norm": 0.2395038902759552,
"learning_rate": 0.00013814262023217246,
"loss": 0.4204,
"step": 1962
},
{
"epoch": 1.0821389195148843,
"grad_norm": 0.2149537056684494,
"learning_rate": 0.00013805970149253732,
"loss": 0.381,
"step": 1963
},
{
"epoch": 1.0826901874310915,
"grad_norm": 0.24547190964221954,
"learning_rate": 0.00013797678275290215,
"loss": 0.404,
"step": 1964
},
{
"epoch": 1.0832414553472989,
"grad_norm": 0.21485422551631927,
"learning_rate": 0.00013789386401326698,
"loss": 0.3756,
"step": 1965
},
{
"epoch": 1.083792723263506,
"grad_norm": 0.2199661284685135,
"learning_rate": 0.00013781094527363184,
"loss": 0.39,
"step": 1966
},
{
"epoch": 1.0843439911797133,
"grad_norm": 0.2321014702320099,
"learning_rate": 0.00013772802653399667,
"loss": 0.3877,
"step": 1967
},
{
"epoch": 1.0848952590959207,
"grad_norm": 0.23033714294433594,
"learning_rate": 0.00013764510779436153,
"loss": 0.4018,
"step": 1968
},
{
"epoch": 1.0854465270121278,
"grad_norm": 0.2251034677028656,
"learning_rate": 0.00013756218905472636,
"loss": 0.3911,
"step": 1969
},
{
"epoch": 1.0859977949283353,
"grad_norm": 0.22630800306797028,
"learning_rate": 0.0001374792703150912,
"loss": 0.397,
"step": 1970
},
{
"epoch": 1.0865490628445424,
"grad_norm": 0.22938160598278046,
"learning_rate": 0.00013739635157545606,
"loss": 0.401,
"step": 1971
},
{
"epoch": 1.0871003307607496,
"grad_norm": 0.24200983345508575,
"learning_rate": 0.0001373134328358209,
"loss": 0.3988,
"step": 1972
},
{
"epoch": 1.087651598676957,
"grad_norm": 0.25386059284210205,
"learning_rate": 0.00013723051409618575,
"loss": 0.4093,
"step": 1973
},
{
"epoch": 1.0882028665931642,
"grad_norm": 0.2258448451757431,
"learning_rate": 0.00013714759535655055,
"loss": 0.386,
"step": 1974
},
{
"epoch": 1.0887541345093716,
"grad_norm": 0.2277601659297943,
"learning_rate": 0.0001370646766169154,
"loss": 0.4041,
"step": 1975
},
{
"epoch": 1.0893054024255788,
"grad_norm": 0.20614218711853027,
"learning_rate": 0.00013698175787728024,
"loss": 0.3784,
"step": 1976
},
{
"epoch": 1.089856670341786,
"grad_norm": 0.22764301300048828,
"learning_rate": 0.0001368988391376451,
"loss": 0.395,
"step": 1977
},
{
"epoch": 1.0904079382579934,
"grad_norm": 0.23423810303211212,
"learning_rate": 0.00013681592039800993,
"loss": 0.4114,
"step": 1978
},
{
"epoch": 1.0909592061742006,
"grad_norm": 0.2042825073003769,
"learning_rate": 0.00013673300165837477,
"loss": 0.3724,
"step": 1979
},
{
"epoch": 1.091510474090408,
"grad_norm": 0.2203364223241806,
"learning_rate": 0.00013665008291873962,
"loss": 0.4084,
"step": 1980
},
{
"epoch": 1.0920617420066152,
"grad_norm": 0.23350727558135986,
"learning_rate": 0.00013656716417910446,
"loss": 0.4041,
"step": 1981
},
{
"epoch": 1.0926130099228224,
"grad_norm": 0.23900878429412842,
"learning_rate": 0.00013648424543946932,
"loss": 0.3976,
"step": 1982
},
{
"epoch": 1.0931642778390298,
"grad_norm": 0.22579023241996765,
"learning_rate": 0.00013640132669983415,
"loss": 0.4019,
"step": 1983
},
{
"epoch": 1.093715545755237,
"grad_norm": 0.23907893896102905,
"learning_rate": 0.00013631840796019898,
"loss": 0.4185,
"step": 1984
},
{
"epoch": 1.0942668136714444,
"grad_norm": 0.22953177988529205,
"learning_rate": 0.00013623548922056384,
"loss": 0.4009,
"step": 1985
},
{
"epoch": 1.0948180815876516,
"grad_norm": 0.22816117107868195,
"learning_rate": 0.00013615257048092867,
"loss": 0.3773,
"step": 1986
},
{
"epoch": 1.0953693495038588,
"grad_norm": 0.2403888702392578,
"learning_rate": 0.00013606965174129353,
"loss": 0.3857,
"step": 1987
},
{
"epoch": 1.0959206174200662,
"grad_norm": 0.2400594800710678,
"learning_rate": 0.00013598673300165836,
"loss": 0.398,
"step": 1988
},
{
"epoch": 1.0964718853362734,
"grad_norm": 0.2451186329126358,
"learning_rate": 0.0001359038142620232,
"loss": 0.4066,
"step": 1989
},
{
"epoch": 1.0970231532524808,
"grad_norm": 0.2371450811624527,
"learning_rate": 0.00013582089552238805,
"loss": 0.3855,
"step": 1990
},
{
"epoch": 1.097574421168688,
"grad_norm": 0.2529587745666504,
"learning_rate": 0.00013573797678275289,
"loss": 0.3851,
"step": 1991
},
{
"epoch": 1.0981256890848952,
"grad_norm": 0.23810137808322906,
"learning_rate": 0.00013565505804311774,
"loss": 0.3644,
"step": 1992
},
{
"epoch": 1.0986769570011026,
"grad_norm": 0.23532289266586304,
"learning_rate": 0.00013557213930348258,
"loss": 0.3813,
"step": 1993
},
{
"epoch": 1.0992282249173098,
"grad_norm": 0.2418917566537857,
"learning_rate": 0.0001354892205638474,
"loss": 0.3775,
"step": 1994
},
{
"epoch": 1.0997794928335172,
"grad_norm": 0.2366194874048233,
"learning_rate": 0.00013540630182421227,
"loss": 0.4047,
"step": 1995
},
{
"epoch": 1.1003307607497244,
"grad_norm": 0.23951660096645355,
"learning_rate": 0.0001353233830845771,
"loss": 0.3956,
"step": 1996
},
{
"epoch": 1.1008820286659315,
"grad_norm": 0.260423481464386,
"learning_rate": 0.00013524046434494196,
"loss": 0.3979,
"step": 1997
},
{
"epoch": 1.101433296582139,
"grad_norm": 0.22453179955482483,
"learning_rate": 0.0001351575456053068,
"loss": 0.3918,
"step": 1998
},
{
"epoch": 1.1019845644983461,
"grad_norm": 0.2185899168252945,
"learning_rate": 0.00013507462686567162,
"loss": 0.38,
"step": 1999
},
{
"epoch": 1.1025358324145536,
"grad_norm": 0.2236957997083664,
"learning_rate": 0.00013499170812603648,
"loss": 0.4007,
"step": 2000
},
{
"epoch": 1.1025358324145536,
"eval_loss": 0.4581758677959442,
"eval_runtime": 312.0177,
"eval_samples_per_second": 3.734,
"eval_steps_per_second": 0.468,
"step": 2000
},
{
"epoch": 1.1030871003307607,
"grad_norm": 0.2543388903141022,
"learning_rate": 0.00013490878938640131,
"loss": 0.39,
"step": 2001
},
{
"epoch": 1.103638368246968,
"grad_norm": 0.22843103110790253,
"learning_rate": 0.00013482587064676615,
"loss": 0.3835,
"step": 2002
},
{
"epoch": 1.1041896361631753,
"grad_norm": 0.226676806807518,
"learning_rate": 0.000134742951907131,
"loss": 0.3907,
"step": 2003
},
{
"epoch": 1.1047409040793825,
"grad_norm": 0.22164440155029297,
"learning_rate": 0.00013466003316749584,
"loss": 0.3727,
"step": 2004
},
{
"epoch": 1.10529217199559,
"grad_norm": 0.2151675671339035,
"learning_rate": 0.0001345771144278607,
"loss": 0.3749,
"step": 2005
},
{
"epoch": 1.1058434399117971,
"grad_norm": 0.23192958533763885,
"learning_rate": 0.00013449419568822553,
"loss": 0.407,
"step": 2006
},
{
"epoch": 1.1063947078280043,
"grad_norm": 0.2130926102399826,
"learning_rate": 0.00013441127694859036,
"loss": 0.3702,
"step": 2007
},
{
"epoch": 1.1069459757442117,
"grad_norm": 0.22862909734249115,
"learning_rate": 0.00013432835820895522,
"loss": 0.3784,
"step": 2008
},
{
"epoch": 1.107497243660419,
"grad_norm": 0.22866345942020416,
"learning_rate": 0.00013424543946932005,
"loss": 0.4035,
"step": 2009
},
{
"epoch": 1.1080485115766263,
"grad_norm": 0.2159378081560135,
"learning_rate": 0.0001341625207296849,
"loss": 0.3996,
"step": 2010
},
{
"epoch": 1.1085997794928335,
"grad_norm": 0.22037655115127563,
"learning_rate": 0.00013407960199004974,
"loss": 0.3873,
"step": 2011
},
{
"epoch": 1.1091510474090407,
"grad_norm": 0.24213933944702148,
"learning_rate": 0.00013399668325041458,
"loss": 0.4144,
"step": 2012
},
{
"epoch": 1.109702315325248,
"grad_norm": 0.2235259711742401,
"learning_rate": 0.00013391376451077943,
"loss": 0.4028,
"step": 2013
},
{
"epoch": 1.1102535832414553,
"grad_norm": 0.2354377955198288,
"learning_rate": 0.00013383084577114427,
"loss": 0.4103,
"step": 2014
},
{
"epoch": 1.1108048511576627,
"grad_norm": 0.22363215684890747,
"learning_rate": 0.00013374792703150913,
"loss": 0.3962,
"step": 2015
},
{
"epoch": 1.1113561190738699,
"grad_norm": 0.22264409065246582,
"learning_rate": 0.00013366500829187396,
"loss": 0.3818,
"step": 2016
},
{
"epoch": 1.111907386990077,
"grad_norm": 0.22731584310531616,
"learning_rate": 0.0001335820895522388,
"loss": 0.4013,
"step": 2017
},
{
"epoch": 1.1124586549062845,
"grad_norm": 0.22340711951255798,
"learning_rate": 0.00013349917081260362,
"loss": 0.3734,
"step": 2018
},
{
"epoch": 1.1130099228224917,
"grad_norm": 0.23701246082782745,
"learning_rate": 0.00013341625207296848,
"loss": 0.3943,
"step": 2019
},
{
"epoch": 1.113561190738699,
"grad_norm": 0.22929784655570984,
"learning_rate": 0.0001333333333333333,
"loss": 0.3848,
"step": 2020
},
{
"epoch": 1.1141124586549063,
"grad_norm": 0.24790272116661072,
"learning_rate": 0.00013325041459369814,
"loss": 0.4047,
"step": 2021
},
{
"epoch": 1.1146637265711137,
"grad_norm": 0.22452253103256226,
"learning_rate": 0.000133167495854063,
"loss": 0.385,
"step": 2022
},
{
"epoch": 1.1152149944873209,
"grad_norm": 0.23337581753730774,
"learning_rate": 0.00013308457711442784,
"loss": 0.3791,
"step": 2023
},
{
"epoch": 1.115766262403528,
"grad_norm": 0.23171287775039673,
"learning_rate": 0.0001330016583747927,
"loss": 0.3885,
"step": 2024
},
{
"epoch": 1.1163175303197355,
"grad_norm": 0.24028973281383514,
"learning_rate": 0.00013291873963515753,
"loss": 0.4071,
"step": 2025
},
{
"epoch": 1.1168687982359427,
"grad_norm": 0.23416177928447723,
"learning_rate": 0.00013283582089552236,
"loss": 0.3815,
"step": 2026
},
{
"epoch": 1.11742006615215,
"grad_norm": 0.2444845736026764,
"learning_rate": 0.00013275290215588722,
"loss": 0.4048,
"step": 2027
},
{
"epoch": 1.1179713340683572,
"grad_norm": 0.23157843947410583,
"learning_rate": 0.00013266998341625205,
"loss": 0.402,
"step": 2028
},
{
"epoch": 1.1185226019845644,
"grad_norm": 0.24158456921577454,
"learning_rate": 0.0001325870646766169,
"loss": 0.3821,
"step": 2029
},
{
"epoch": 1.1190738699007718,
"grad_norm": 0.23520436882972717,
"learning_rate": 0.00013250414593698174,
"loss": 0.3848,
"step": 2030
},
{
"epoch": 1.119625137816979,
"grad_norm": 0.2458154559135437,
"learning_rate": 0.00013242122719734657,
"loss": 0.3926,
"step": 2031
},
{
"epoch": 1.1201764057331864,
"grad_norm": 0.2308206707239151,
"learning_rate": 0.00013233830845771143,
"loss": 0.3982,
"step": 2032
},
{
"epoch": 1.1207276736493936,
"grad_norm": 0.23016606271266937,
"learning_rate": 0.00013225538971807626,
"loss": 0.3936,
"step": 2033
},
{
"epoch": 1.1212789415656008,
"grad_norm": 0.24838510155677795,
"learning_rate": 0.00013217247097844112,
"loss": 0.4081,
"step": 2034
},
{
"epoch": 1.1218302094818082,
"grad_norm": 0.2287745475769043,
"learning_rate": 0.00013208955223880596,
"loss": 0.371,
"step": 2035
},
{
"epoch": 1.1223814773980154,
"grad_norm": 0.23816218972206116,
"learning_rate": 0.0001320066334991708,
"loss": 0.3952,
"step": 2036
},
{
"epoch": 1.1229327453142228,
"grad_norm": 0.2324012964963913,
"learning_rate": 0.00013192371475953565,
"loss": 0.3861,
"step": 2037
},
{
"epoch": 1.12348401323043,
"grad_norm": 0.23907962441444397,
"learning_rate": 0.00013184079601990048,
"loss": 0.3927,
"step": 2038
},
{
"epoch": 1.1240352811466372,
"grad_norm": 0.2464779168367386,
"learning_rate": 0.00013175787728026534,
"loss": 0.4246,
"step": 2039
},
{
"epoch": 1.1245865490628446,
"grad_norm": 0.23501858115196228,
"learning_rate": 0.00013167495854063017,
"loss": 0.3918,
"step": 2040
},
{
"epoch": 1.1251378169790518,
"grad_norm": 0.2514742314815521,
"learning_rate": 0.000131592039800995,
"loss": 0.3828,
"step": 2041
},
{
"epoch": 1.1256890848952592,
"grad_norm": 0.25326284766197205,
"learning_rate": 0.00013150912106135986,
"loss": 0.4042,
"step": 2042
},
{
"epoch": 1.1262403528114664,
"grad_norm": 0.23037280142307281,
"learning_rate": 0.0001314262023217247,
"loss": 0.3919,
"step": 2043
},
{
"epoch": 1.1267916207276736,
"grad_norm": 0.241755872964859,
"learning_rate": 0.00013134328358208955,
"loss": 0.3867,
"step": 2044
},
{
"epoch": 1.127342888643881,
"grad_norm": 0.27031564712524414,
"learning_rate": 0.00013126036484245438,
"loss": 0.3767,
"step": 2045
},
{
"epoch": 1.1278941565600882,
"grad_norm": 0.24623173475265503,
"learning_rate": 0.00013117744610281922,
"loss": 0.4077,
"step": 2046
},
{
"epoch": 1.1284454244762956,
"grad_norm": 0.24347223341464996,
"learning_rate": 0.00013109452736318408,
"loss": 0.3846,
"step": 2047
},
{
"epoch": 1.1289966923925028,
"grad_norm": 0.24663501977920532,
"learning_rate": 0.0001310116086235489,
"loss": 0.3992,
"step": 2048
},
{
"epoch": 1.12954796030871,
"grad_norm": 0.23556159436702728,
"learning_rate": 0.00013092868988391377,
"loss": 0.3949,
"step": 2049
},
{
"epoch": 1.1300992282249174,
"grad_norm": 0.21868300437927246,
"learning_rate": 0.0001308457711442786,
"loss": 0.3824,
"step": 2050
},
{
"epoch": 1.1306504961411246,
"grad_norm": 0.23438437283039093,
"learning_rate": 0.00013076285240464343,
"loss": 0.3801,
"step": 2051
},
{
"epoch": 1.131201764057332,
"grad_norm": 0.22960849106311798,
"learning_rate": 0.0001306799336650083,
"loss": 0.4088,
"step": 2052
},
{
"epoch": 1.1317530319735392,
"grad_norm": 0.240730881690979,
"learning_rate": 0.00013059701492537312,
"loss": 0.3644,
"step": 2053
},
{
"epoch": 1.1323042998897463,
"grad_norm": 0.2219470739364624,
"learning_rate": 0.00013051409618573798,
"loss": 0.3817,
"step": 2054
},
{
"epoch": 1.1328555678059538,
"grad_norm": 0.22481395304203033,
"learning_rate": 0.0001304311774461028,
"loss": 0.3858,
"step": 2055
},
{
"epoch": 1.133406835722161,
"grad_norm": 0.24147982895374298,
"learning_rate": 0.00013034825870646765,
"loss": 0.3977,
"step": 2056
},
{
"epoch": 1.1339581036383684,
"grad_norm": 0.2390933483839035,
"learning_rate": 0.0001302653399668325,
"loss": 0.3985,
"step": 2057
},
{
"epoch": 1.1345093715545755,
"grad_norm": 0.24776338040828705,
"learning_rate": 0.00013018242122719734,
"loss": 0.4026,
"step": 2058
},
{
"epoch": 1.1350606394707827,
"grad_norm": 0.23255294561386108,
"learning_rate": 0.0001300995024875622,
"loss": 0.3975,
"step": 2059
},
{
"epoch": 1.1356119073869901,
"grad_norm": 0.2401493936777115,
"learning_rate": 0.00013001658374792703,
"loss": 0.3924,
"step": 2060
},
{
"epoch": 1.1361631753031973,
"grad_norm": 0.2360658049583435,
"learning_rate": 0.00012993366500829186,
"loss": 0.3835,
"step": 2061
},
{
"epoch": 1.1367144432194047,
"grad_norm": 0.24272675812244415,
"learning_rate": 0.0001298507462686567,
"loss": 0.3816,
"step": 2062
},
{
"epoch": 1.137265711135612,
"grad_norm": 0.2370130568742752,
"learning_rate": 0.00012976782752902155,
"loss": 0.3807,
"step": 2063
},
{
"epoch": 1.137816979051819,
"grad_norm": 0.22449509799480438,
"learning_rate": 0.00012968490878938638,
"loss": 0.3857,
"step": 2064
},
{
"epoch": 1.1383682469680265,
"grad_norm": 0.2332579791545868,
"learning_rate": 0.00012960199004975121,
"loss": 0.3882,
"step": 2065
},
{
"epoch": 1.1389195148842337,
"grad_norm": 0.23922313749790192,
"learning_rate": 0.00012951907131011607,
"loss": 0.3924,
"step": 2066
},
{
"epoch": 1.1394707828004411,
"grad_norm": 0.23937387764453888,
"learning_rate": 0.0001294361525704809,
"loss": 0.3982,
"step": 2067
},
{
"epoch": 1.1400220507166483,
"grad_norm": 0.23198926448822021,
"learning_rate": 0.00012935323383084577,
"loss": 0.3971,
"step": 2068
},
{
"epoch": 1.1405733186328555,
"grad_norm": 0.23774142563343048,
"learning_rate": 0.0001292703150912106,
"loss": 0.419,
"step": 2069
},
{
"epoch": 1.141124586549063,
"grad_norm": 0.23457486927509308,
"learning_rate": 0.00012918739635157543,
"loss": 0.3947,
"step": 2070
},
{
"epoch": 1.14167585446527,
"grad_norm": 0.23662830889225006,
"learning_rate": 0.0001291044776119403,
"loss": 0.3989,
"step": 2071
},
{
"epoch": 1.1422271223814775,
"grad_norm": 0.2307705134153366,
"learning_rate": 0.00012902155887230512,
"loss": 0.3988,
"step": 2072
},
{
"epoch": 1.1427783902976847,
"grad_norm": 0.23430916666984558,
"learning_rate": 0.00012893864013266998,
"loss": 0.3956,
"step": 2073
},
{
"epoch": 1.1433296582138919,
"grad_norm": 0.24138319492340088,
"learning_rate": 0.0001288557213930348,
"loss": 0.4103,
"step": 2074
},
{
"epoch": 1.1438809261300993,
"grad_norm": 0.22443422675132751,
"learning_rate": 0.00012877280265339964,
"loss": 0.3839,
"step": 2075
},
{
"epoch": 1.1444321940463065,
"grad_norm": 0.2313619703054428,
"learning_rate": 0.0001286898839137645,
"loss": 0.4063,
"step": 2076
},
{
"epoch": 1.1449834619625139,
"grad_norm": 0.22947578132152557,
"learning_rate": 0.00012860696517412933,
"loss": 0.3852,
"step": 2077
},
{
"epoch": 1.145534729878721,
"grad_norm": 0.2276720404624939,
"learning_rate": 0.0001285240464344942,
"loss": 0.3968,
"step": 2078
},
{
"epoch": 1.1460859977949283,
"grad_norm": 0.22463871538639069,
"learning_rate": 0.00012844112769485903,
"loss": 0.3904,
"step": 2079
},
{
"epoch": 1.1466372657111357,
"grad_norm": 0.22553198039531708,
"learning_rate": 0.00012835820895522386,
"loss": 0.3902,
"step": 2080
},
{
"epoch": 1.1471885336273429,
"grad_norm": 0.23410287499427795,
"learning_rate": 0.00012827529021558872,
"loss": 0.3952,
"step": 2081
},
{
"epoch": 1.1477398015435503,
"grad_norm": 0.2365550547838211,
"learning_rate": 0.00012819237147595355,
"loss": 0.3907,
"step": 2082
},
{
"epoch": 1.1482910694597575,
"grad_norm": 0.22853030264377594,
"learning_rate": 0.0001281094527363184,
"loss": 0.4041,
"step": 2083
},
{
"epoch": 1.1488423373759646,
"grad_norm": 0.23059257864952087,
"learning_rate": 0.00012802653399668324,
"loss": 0.4047,
"step": 2084
},
{
"epoch": 1.149393605292172,
"grad_norm": 0.23414267599582672,
"learning_rate": 0.00012794361525704807,
"loss": 0.4077,
"step": 2085
},
{
"epoch": 1.1499448732083792,
"grad_norm": 0.23295001685619354,
"learning_rate": 0.00012786069651741293,
"loss": 0.3942,
"step": 2086
},
{
"epoch": 1.1504961411245866,
"grad_norm": 0.23734460771083832,
"learning_rate": 0.00012777777777777776,
"loss": 0.4074,
"step": 2087
},
{
"epoch": 1.1510474090407938,
"grad_norm": 0.21490591764450073,
"learning_rate": 0.00012769485903814262,
"loss": 0.3747,
"step": 2088
},
{
"epoch": 1.151598676957001,
"grad_norm": 0.22734799981117249,
"learning_rate": 0.00012761194029850745,
"loss": 0.3836,
"step": 2089
},
{
"epoch": 1.1521499448732084,
"grad_norm": 0.22835008800029755,
"learning_rate": 0.0001275290215588723,
"loss": 0.3983,
"step": 2090
},
{
"epoch": 1.1527012127894156,
"grad_norm": 0.2260267287492752,
"learning_rate": 0.00012744610281923715,
"loss": 0.3785,
"step": 2091
},
{
"epoch": 1.153252480705623,
"grad_norm": 0.22667206823825836,
"learning_rate": 0.00012736318407960198,
"loss": 0.3945,
"step": 2092
},
{
"epoch": 1.1538037486218302,
"grad_norm": 0.23218148946762085,
"learning_rate": 0.00012728026533996684,
"loss": 0.3967,
"step": 2093
},
{
"epoch": 1.1543550165380374,
"grad_norm": 0.24123932421207428,
"learning_rate": 0.00012719734660033167,
"loss": 0.3994,
"step": 2094
},
{
"epoch": 1.1549062844542448,
"grad_norm": 0.23074567317962646,
"learning_rate": 0.0001271144278606965,
"loss": 0.405,
"step": 2095
},
{
"epoch": 1.155457552370452,
"grad_norm": 0.23828662931919098,
"learning_rate": 0.00012703150912106136,
"loss": 0.3886,
"step": 2096
},
{
"epoch": 1.1560088202866594,
"grad_norm": 0.22315117716789246,
"learning_rate": 0.0001269485903814262,
"loss": 0.3925,
"step": 2097
},
{
"epoch": 1.1565600882028666,
"grad_norm": 0.22071965038776398,
"learning_rate": 0.00012686567164179105,
"loss": 0.3997,
"step": 2098
},
{
"epoch": 1.1571113561190738,
"grad_norm": 0.22145338356494904,
"learning_rate": 0.00012678275290215588,
"loss": 0.3784,
"step": 2099
},
{
"epoch": 1.1576626240352812,
"grad_norm": 0.2308942675590515,
"learning_rate": 0.00012669983416252072,
"loss": 0.3576,
"step": 2100
},
{
"epoch": 1.1582138919514884,
"grad_norm": 0.2193097174167633,
"learning_rate": 0.00012661691542288557,
"loss": 0.3806,
"step": 2101
},
{
"epoch": 1.1587651598676958,
"grad_norm": 0.2277258038520813,
"learning_rate": 0.0001265339966832504,
"loss": 0.389,
"step": 2102
},
{
"epoch": 1.159316427783903,
"grad_norm": 0.22830741107463837,
"learning_rate": 0.00012645107794361527,
"loss": 0.4132,
"step": 2103
},
{
"epoch": 1.1598676957001102,
"grad_norm": 0.22856192290782928,
"learning_rate": 0.0001263681592039801,
"loss": 0.3879,
"step": 2104
},
{
"epoch": 1.1604189636163176,
"grad_norm": 0.23155651986598969,
"learning_rate": 0.00012628524046434493,
"loss": 0.3902,
"step": 2105
},
{
"epoch": 1.1609702315325248,
"grad_norm": 0.22571994364261627,
"learning_rate": 0.00012620232172470976,
"loss": 0.4017,
"step": 2106
},
{
"epoch": 1.1615214994487322,
"grad_norm": 0.2258533239364624,
"learning_rate": 0.00012611940298507462,
"loss": 0.4027,
"step": 2107
},
{
"epoch": 1.1620727673649394,
"grad_norm": 0.24114197492599487,
"learning_rate": 0.00012603648424543945,
"loss": 0.3983,
"step": 2108
},
{
"epoch": 1.1626240352811466,
"grad_norm": 0.22286631166934967,
"learning_rate": 0.00012595356550580429,
"loss": 0.4026,
"step": 2109
},
{
"epoch": 1.163175303197354,
"grad_norm": 0.2404211014509201,
"learning_rate": 0.00012587064676616914,
"loss": 0.4082,
"step": 2110
},
{
"epoch": 1.1637265711135611,
"grad_norm": 0.22578535974025726,
"learning_rate": 0.00012578772802653398,
"loss": 0.3881,
"step": 2111
},
{
"epoch": 1.1642778390297686,
"grad_norm": 0.24066035449504852,
"learning_rate": 0.00012570480928689884,
"loss": 0.4144,
"step": 2112
},
{
"epoch": 1.1648291069459757,
"grad_norm": 0.22703833878040314,
"learning_rate": 0.00012562189054726367,
"loss": 0.3942,
"step": 2113
},
{
"epoch": 1.165380374862183,
"grad_norm": 0.2277577817440033,
"learning_rate": 0.0001255389718076285,
"loss": 0.4116,
"step": 2114
},
{
"epoch": 1.1659316427783903,
"grad_norm": 0.2201533019542694,
"learning_rate": 0.00012545605306799336,
"loss": 0.3961,
"step": 2115
},
{
"epoch": 1.1664829106945975,
"grad_norm": 0.22969132661819458,
"learning_rate": 0.0001253731343283582,
"loss": 0.4146,
"step": 2116
},
{
"epoch": 1.167034178610805,
"grad_norm": 0.2208871990442276,
"learning_rate": 0.00012529021558872305,
"loss": 0.3925,
"step": 2117
},
{
"epoch": 1.1675854465270121,
"grad_norm": 0.24675814807415009,
"learning_rate": 0.00012520729684908788,
"loss": 0.3923,
"step": 2118
},
{
"epoch": 1.1681367144432193,
"grad_norm": 0.25365886092185974,
"learning_rate": 0.00012512437810945271,
"loss": 0.4018,
"step": 2119
},
{
"epoch": 1.1686879823594267,
"grad_norm": 0.2352716475725174,
"learning_rate": 0.00012504145936981757,
"loss": 0.4136,
"step": 2120
},
{
"epoch": 1.169239250275634,
"grad_norm": 0.22656375169754028,
"learning_rate": 0.0001249585406301824,
"loss": 0.3896,
"step": 2121
},
{
"epoch": 1.1697905181918413,
"grad_norm": 0.22290179133415222,
"learning_rate": 0.00012487562189054724,
"loss": 0.4059,
"step": 2122
},
{
"epoch": 1.1703417861080485,
"grad_norm": 0.24139589071273804,
"learning_rate": 0.0001247927031509121,
"loss": 0.3999,
"step": 2123
},
{
"epoch": 1.1708930540242557,
"grad_norm": 0.24391639232635498,
"learning_rate": 0.00012470978441127693,
"loss": 0.3876,
"step": 2124
},
{
"epoch": 1.171444321940463,
"grad_norm": 0.2283831685781479,
"learning_rate": 0.0001246268656716418,
"loss": 0.3988,
"step": 2125
},
{
"epoch": 1.1719955898566703,
"grad_norm": 0.24799783527851105,
"learning_rate": 0.00012454394693200662,
"loss": 0.396,
"step": 2126
},
{
"epoch": 1.1725468577728777,
"grad_norm": 0.22174561023712158,
"learning_rate": 0.00012446102819237145,
"loss": 0.3809,
"step": 2127
},
{
"epoch": 1.173098125689085,
"grad_norm": 0.22951188683509827,
"learning_rate": 0.0001243781094527363,
"loss": 0.3882,
"step": 2128
},
{
"epoch": 1.173649393605292,
"grad_norm": 0.21973788738250732,
"learning_rate": 0.00012429519071310114,
"loss": 0.3872,
"step": 2129
},
{
"epoch": 1.1742006615214995,
"grad_norm": 0.22701437771320343,
"learning_rate": 0.000124212271973466,
"loss": 0.3876,
"step": 2130
},
{
"epoch": 1.1747519294377067,
"grad_norm": 0.22394593060016632,
"learning_rate": 0.00012412935323383083,
"loss": 0.3874,
"step": 2131
},
{
"epoch": 1.175303197353914,
"grad_norm": 0.24040114879608154,
"learning_rate": 0.00012404643449419567,
"loss": 0.3856,
"step": 2132
},
{
"epoch": 1.1758544652701213,
"grad_norm": 0.2295607030391693,
"learning_rate": 0.00012396351575456052,
"loss": 0.3861,
"step": 2133
},
{
"epoch": 1.1764057331863285,
"grad_norm": 0.229506716132164,
"learning_rate": 0.00012388059701492536,
"loss": 0.3877,
"step": 2134
},
{
"epoch": 1.1769570011025359,
"grad_norm": 0.24226558208465576,
"learning_rate": 0.00012379767827529022,
"loss": 0.4051,
"step": 2135
},
{
"epoch": 1.177508269018743,
"grad_norm": 0.23359960317611694,
"learning_rate": 0.00012371475953565505,
"loss": 0.3911,
"step": 2136
},
{
"epoch": 1.1780595369349505,
"grad_norm": 0.24533167481422424,
"learning_rate": 0.00012363184079601988,
"loss": 0.4075,
"step": 2137
},
{
"epoch": 1.1786108048511577,
"grad_norm": 0.22445149719715118,
"learning_rate": 0.00012354892205638474,
"loss": 0.3762,
"step": 2138
},
{
"epoch": 1.1791620727673648,
"grad_norm": 0.2399044781923294,
"learning_rate": 0.00012346600331674957,
"loss": 0.375,
"step": 2139
},
{
"epoch": 1.1797133406835723,
"grad_norm": 0.2472797930240631,
"learning_rate": 0.00012338308457711443,
"loss": 0.4036,
"step": 2140
},
{
"epoch": 1.1802646085997794,
"grad_norm": 0.2297624945640564,
"learning_rate": 0.00012330016583747926,
"loss": 0.4154,
"step": 2141
},
{
"epoch": 1.1808158765159869,
"grad_norm": 0.23524117469787598,
"learning_rate": 0.0001232172470978441,
"loss": 0.3879,
"step": 2142
},
{
"epoch": 1.181367144432194,
"grad_norm": 0.23935049772262573,
"learning_rate": 0.00012313432835820895,
"loss": 0.4107,
"step": 2143
},
{
"epoch": 1.1819184123484012,
"grad_norm": 0.21305608749389648,
"learning_rate": 0.00012305140961857379,
"loss": 0.3964,
"step": 2144
},
{
"epoch": 1.1824696802646086,
"grad_norm": 0.2339240163564682,
"learning_rate": 0.00012296849087893864,
"loss": 0.4185,
"step": 2145
},
{
"epoch": 1.1830209481808158,
"grad_norm": 0.23344539105892181,
"learning_rate": 0.00012288557213930348,
"loss": 0.3934,
"step": 2146
},
{
"epoch": 1.1835722160970232,
"grad_norm": 0.2274356484413147,
"learning_rate": 0.0001228026533996683,
"loss": 0.3854,
"step": 2147
},
{
"epoch": 1.1841234840132304,
"grad_norm": 0.23241972923278809,
"learning_rate": 0.00012271973466003317,
"loss": 0.4106,
"step": 2148
},
{
"epoch": 1.1846747519294376,
"grad_norm": 0.22595259547233582,
"learning_rate": 0.000122636815920398,
"loss": 0.401,
"step": 2149
},
{
"epoch": 1.185226019845645,
"grad_norm": 0.22598454356193542,
"learning_rate": 0.00012255389718076283,
"loss": 0.4041,
"step": 2150
},
{
"epoch": 1.1857772877618522,
"grad_norm": 0.233281672000885,
"learning_rate": 0.00012247097844112766,
"loss": 0.3763,
"step": 2151
},
{
"epoch": 1.1863285556780596,
"grad_norm": 0.22901344299316406,
"learning_rate": 0.00012238805970149252,
"loss": 0.3949,
"step": 2152
},
{
"epoch": 1.1868798235942668,
"grad_norm": 0.24648213386535645,
"learning_rate": 0.00012230514096185736,
"loss": 0.4229,
"step": 2153
},
{
"epoch": 1.187431091510474,
"grad_norm": 0.24580827355384827,
"learning_rate": 0.00012222222222222221,
"loss": 0.4125,
"step": 2154
},
{
"epoch": 1.1879823594266814,
"grad_norm": 0.23127946257591248,
"learning_rate": 0.00012213930348258705,
"loss": 0.3727,
"step": 2155
},
{
"epoch": 1.1885336273428886,
"grad_norm": 0.2267657071352005,
"learning_rate": 0.00012205638474295189,
"loss": 0.3951,
"step": 2156
},
{
"epoch": 1.189084895259096,
"grad_norm": 0.23497919738292694,
"learning_rate": 0.00012197346600331674,
"loss": 0.3721,
"step": 2157
},
{
"epoch": 1.1896361631753032,
"grad_norm": 0.22601653635501862,
"learning_rate": 0.00012189054726368157,
"loss": 0.3945,
"step": 2158
},
{
"epoch": 1.1901874310915104,
"grad_norm": 0.21945270895957947,
"learning_rate": 0.00012180762852404642,
"loss": 0.3574,
"step": 2159
},
{
"epoch": 1.1907386990077178,
"grad_norm": 0.2285127341747284,
"learning_rate": 0.00012172470978441126,
"loss": 0.3891,
"step": 2160
},
{
"epoch": 1.191289966923925,
"grad_norm": 0.23766474425792694,
"learning_rate": 0.0001216417910447761,
"loss": 0.3968,
"step": 2161
},
{
"epoch": 1.1918412348401324,
"grad_norm": 0.23863717913627625,
"learning_rate": 0.00012155887230514095,
"loss": 0.389,
"step": 2162
},
{
"epoch": 1.1923925027563396,
"grad_norm": 0.22550217807292938,
"learning_rate": 0.00012147595356550578,
"loss": 0.3842,
"step": 2163
},
{
"epoch": 1.1929437706725468,
"grad_norm": 0.22460085153579712,
"learning_rate": 0.00012139303482587063,
"loss": 0.3874,
"step": 2164
},
{
"epoch": 1.1934950385887542,
"grad_norm": 0.2168971300125122,
"learning_rate": 0.00012131011608623548,
"loss": 0.3783,
"step": 2165
},
{
"epoch": 1.1940463065049614,
"grad_norm": 0.2768751084804535,
"learning_rate": 0.00012122719734660032,
"loss": 0.4206,
"step": 2166
},
{
"epoch": 1.1945975744211688,
"grad_norm": 0.2357032299041748,
"learning_rate": 0.00012114427860696517,
"loss": 0.3943,
"step": 2167
},
{
"epoch": 1.195148842337376,
"grad_norm": 0.24314233660697937,
"learning_rate": 0.00012106135986733,
"loss": 0.3983,
"step": 2168
},
{
"epoch": 1.1957001102535831,
"grad_norm": 0.2605820596218109,
"learning_rate": 0.00012097844112769484,
"loss": 0.4036,
"step": 2169
},
{
"epoch": 1.1962513781697905,
"grad_norm": 0.22138415277004242,
"learning_rate": 0.00012089552238805969,
"loss": 0.3794,
"step": 2170
},
{
"epoch": 1.1968026460859977,
"grad_norm": 0.2328760325908661,
"learning_rate": 0.00012081260364842454,
"loss": 0.3948,
"step": 2171
},
{
"epoch": 1.1973539140022051,
"grad_norm": 0.22606134414672852,
"learning_rate": 0.00012072968490878938,
"loss": 0.3958,
"step": 2172
},
{
"epoch": 1.1979051819184123,
"grad_norm": 0.25683924555778503,
"learning_rate": 0.00012064676616915421,
"loss": 0.3939,
"step": 2173
},
{
"epoch": 1.1984564498346195,
"grad_norm": 0.22325700521469116,
"learning_rate": 0.00012056384742951906,
"loss": 0.3915,
"step": 2174
},
{
"epoch": 1.199007717750827,
"grad_norm": 0.21337918937206268,
"learning_rate": 0.0001204809286898839,
"loss": 0.3699,
"step": 2175
},
{
"epoch": 1.1995589856670341,
"grad_norm": 0.2343214452266693,
"learning_rate": 0.00012039800995024875,
"loss": 0.4029,
"step": 2176
},
{
"epoch": 1.2001102535832415,
"grad_norm": 0.2408185601234436,
"learning_rate": 0.0001203150912106136,
"loss": 0.3915,
"step": 2177
},
{
"epoch": 1.2006615214994487,
"grad_norm": 0.2592547535896301,
"learning_rate": 0.00012023217247097843,
"loss": 0.409,
"step": 2178
},
{
"epoch": 1.201212789415656,
"grad_norm": 0.2201685607433319,
"learning_rate": 0.00012014925373134327,
"loss": 0.381,
"step": 2179
},
{
"epoch": 1.2017640573318633,
"grad_norm": 0.23619139194488525,
"learning_rate": 0.00012006633499170812,
"loss": 0.3708,
"step": 2180
},
{
"epoch": 1.2023153252480705,
"grad_norm": 0.24719634652137756,
"learning_rate": 0.00011998341625207296,
"loss": 0.3996,
"step": 2181
},
{
"epoch": 1.202866593164278,
"grad_norm": 0.24691031873226166,
"learning_rate": 0.00011990049751243781,
"loss": 0.3897,
"step": 2182
},
{
"epoch": 1.203417861080485,
"grad_norm": 0.2518804967403412,
"learning_rate": 0.00011981757877280264,
"loss": 0.3886,
"step": 2183
},
{
"epoch": 1.2039691289966923,
"grad_norm": 0.2279016375541687,
"learning_rate": 0.00011973466003316749,
"loss": 0.3791,
"step": 2184
},
{
"epoch": 1.2045203969128997,
"grad_norm": 0.24580788612365723,
"learning_rate": 0.00011965174129353233,
"loss": 0.4013,
"step": 2185
},
{
"epoch": 1.2050716648291069,
"grad_norm": 0.2422635406255722,
"learning_rate": 0.00011956882255389718,
"loss": 0.3831,
"step": 2186
},
{
"epoch": 1.2056229327453143,
"grad_norm": 0.24743367731571198,
"learning_rate": 0.00011948590381426202,
"loss": 0.3939,
"step": 2187
},
{
"epoch": 1.2061742006615215,
"grad_norm": 0.24504512548446655,
"learning_rate": 0.00011940298507462686,
"loss": 0.3976,
"step": 2188
},
{
"epoch": 1.2067254685777287,
"grad_norm": 0.2121214121580124,
"learning_rate": 0.0001193200663349917,
"loss": 0.3692,
"step": 2189
},
{
"epoch": 1.207276736493936,
"grad_norm": 0.23639699816703796,
"learning_rate": 0.00011923714759535655,
"loss": 0.3999,
"step": 2190
},
{
"epoch": 1.2078280044101433,
"grad_norm": 0.2503402531147003,
"learning_rate": 0.00011915422885572139,
"loss": 0.3807,
"step": 2191
},
{
"epoch": 1.2083792723263507,
"grad_norm": 0.2412857562303543,
"learning_rate": 0.00011907131011608624,
"loss": 0.397,
"step": 2192
},
{
"epoch": 1.2089305402425579,
"grad_norm": 0.2293364554643631,
"learning_rate": 0.00011898839137645107,
"loss": 0.3752,
"step": 2193
},
{
"epoch": 1.209481808158765,
"grad_norm": 0.23062635958194733,
"learning_rate": 0.00011890547263681592,
"loss": 0.3779,
"step": 2194
},
{
"epoch": 1.2100330760749725,
"grad_norm": 0.23140175640583038,
"learning_rate": 0.00011882255389718075,
"loss": 0.3763,
"step": 2195
},
{
"epoch": 1.2105843439911796,
"grad_norm": 0.23366335034370422,
"learning_rate": 0.0001187396351575456,
"loss": 0.3959,
"step": 2196
},
{
"epoch": 1.211135611907387,
"grad_norm": 0.2382514774799347,
"learning_rate": 0.00011865671641791043,
"loss": 0.3876,
"step": 2197
},
{
"epoch": 1.2116868798235942,
"grad_norm": 0.23558002710342407,
"learning_rate": 0.00011857379767827527,
"loss": 0.4032,
"step": 2198
},
{
"epoch": 1.2122381477398014,
"grad_norm": 0.23793788254261017,
"learning_rate": 0.00011849087893864012,
"loss": 0.3909,
"step": 2199
},
{
"epoch": 1.2127894156560088,
"grad_norm": 0.2181142121553421,
"learning_rate": 0.00011840796019900496,
"loss": 0.3923,
"step": 2200
},
{
"epoch": 1.213340683572216,
"grad_norm": 0.21802657842636108,
"learning_rate": 0.00011832504145936981,
"loss": 0.3795,
"step": 2201
},
{
"epoch": 1.2138919514884234,
"grad_norm": 0.2436913102865219,
"learning_rate": 0.00011824212271973464,
"loss": 0.3985,
"step": 2202
},
{
"epoch": 1.2144432194046306,
"grad_norm": 0.22913113236427307,
"learning_rate": 0.00011815920398009949,
"loss": 0.3872,
"step": 2203
},
{
"epoch": 1.2149944873208378,
"grad_norm": 0.2223367691040039,
"learning_rate": 0.00011807628524046433,
"loss": 0.3905,
"step": 2204
},
{
"epoch": 1.2155457552370452,
"grad_norm": 0.23263731598854065,
"learning_rate": 0.00011799336650082918,
"loss": 0.4048,
"step": 2205
},
{
"epoch": 1.2160970231532524,
"grad_norm": 0.2505498230457306,
"learning_rate": 0.00011791044776119402,
"loss": 0.395,
"step": 2206
},
{
"epoch": 1.2166482910694598,
"grad_norm": 0.2553291916847229,
"learning_rate": 0.00011782752902155885,
"loss": 0.3935,
"step": 2207
},
{
"epoch": 1.217199558985667,
"grad_norm": 0.22239425778388977,
"learning_rate": 0.0001177446102819237,
"loss": 0.381,
"step": 2208
},
{
"epoch": 1.2177508269018742,
"grad_norm": 0.21807150542736053,
"learning_rate": 0.00011766169154228855,
"loss": 0.3878,
"step": 2209
},
{
"epoch": 1.2183020948180816,
"grad_norm": 0.23478740453720093,
"learning_rate": 0.00011757877280265339,
"loss": 0.3815,
"step": 2210
},
{
"epoch": 1.2188533627342888,
"grad_norm": 0.23702913522720337,
"learning_rate": 0.00011749585406301822,
"loss": 0.4001,
"step": 2211
},
{
"epoch": 1.2194046306504962,
"grad_norm": 0.23261341452598572,
"learning_rate": 0.00011741293532338307,
"loss": 0.3935,
"step": 2212
},
{
"epoch": 1.2199558985667034,
"grad_norm": 0.22314967215061188,
"learning_rate": 0.00011733001658374791,
"loss": 0.4048,
"step": 2213
},
{
"epoch": 1.2205071664829106,
"grad_norm": 0.23277883231639862,
"learning_rate": 0.00011724709784411276,
"loss": 0.3739,
"step": 2214
},
{
"epoch": 1.221058434399118,
"grad_norm": 0.24505817890167236,
"learning_rate": 0.0001171641791044776,
"loss": 0.3922,
"step": 2215
},
{
"epoch": 1.2216097023153252,
"grad_norm": 0.24386508762836456,
"learning_rate": 0.00011708126036484244,
"loss": 0.3872,
"step": 2216
},
{
"epoch": 1.2221609702315326,
"grad_norm": 0.2437102198600769,
"learning_rate": 0.00011699834162520728,
"loss": 0.4048,
"step": 2217
},
{
"epoch": 1.2227122381477398,
"grad_norm": 0.22707347571849823,
"learning_rate": 0.00011691542288557213,
"loss": 0.3996,
"step": 2218
},
{
"epoch": 1.223263506063947,
"grad_norm": 0.23951935768127441,
"learning_rate": 0.00011683250414593697,
"loss": 0.399,
"step": 2219
},
{
"epoch": 1.2238147739801544,
"grad_norm": 0.27458345890045166,
"learning_rate": 0.00011674958540630182,
"loss": 0.4093,
"step": 2220
},
{
"epoch": 1.2243660418963616,
"grad_norm": 0.23940932750701904,
"learning_rate": 0.00011666666666666665,
"loss": 0.3915,
"step": 2221
},
{
"epoch": 1.224917309812569,
"grad_norm": 0.24100755155086517,
"learning_rate": 0.0001165837479270315,
"loss": 0.3915,
"step": 2222
},
{
"epoch": 1.2254685777287762,
"grad_norm": 0.2423773556947708,
"learning_rate": 0.00011650082918739634,
"loss": 0.4061,
"step": 2223
},
{
"epoch": 1.2260198456449833,
"grad_norm": 0.2552812099456787,
"learning_rate": 0.00011641791044776119,
"loss": 0.3922,
"step": 2224
},
{
"epoch": 1.2265711135611908,
"grad_norm": 0.24121615290641785,
"learning_rate": 0.00011633499170812603,
"loss": 0.3949,
"step": 2225
},
{
"epoch": 1.227122381477398,
"grad_norm": 0.24254634976387024,
"learning_rate": 0.00011625207296849087,
"loss": 0.3776,
"step": 2226
},
{
"epoch": 1.2276736493936053,
"grad_norm": 0.2757539451122284,
"learning_rate": 0.00011616915422885571,
"loss": 0.4181,
"step": 2227
},
{
"epoch": 1.2282249173098125,
"grad_norm": 0.25508221983909607,
"learning_rate": 0.00011608623548922056,
"loss": 0.4069,
"step": 2228
},
{
"epoch": 1.2287761852260197,
"grad_norm": 0.24166013300418854,
"learning_rate": 0.0001160033167495854,
"loss": 0.3848,
"step": 2229
},
{
"epoch": 1.2293274531422271,
"grad_norm": 0.23408280313014984,
"learning_rate": 0.00011592039800995025,
"loss": 0.3867,
"step": 2230
},
{
"epoch": 1.2298787210584343,
"grad_norm": 0.2366735339164734,
"learning_rate": 0.00011583747927031508,
"loss": 0.407,
"step": 2231
},
{
"epoch": 1.2304299889746417,
"grad_norm": 0.247688889503479,
"learning_rate": 0.00011575456053067993,
"loss": 0.3898,
"step": 2232
},
{
"epoch": 1.230981256890849,
"grad_norm": 0.23416852951049805,
"learning_rate": 0.00011567164179104477,
"loss": 0.3871,
"step": 2233
},
{
"epoch": 1.231532524807056,
"grad_norm": 0.243104949593544,
"learning_rate": 0.00011558872305140962,
"loss": 0.4209,
"step": 2234
},
{
"epoch": 1.2320837927232635,
"grad_norm": 0.23723013699054718,
"learning_rate": 0.00011550580431177446,
"loss": 0.3867,
"step": 2235
},
{
"epoch": 1.2326350606394707,
"grad_norm": 0.2383720874786377,
"learning_rate": 0.0001154228855721393,
"loss": 0.3861,
"step": 2236
},
{
"epoch": 1.2331863285556781,
"grad_norm": 0.25127896666526794,
"learning_rate": 0.00011533996683250414,
"loss": 0.4039,
"step": 2237
},
{
"epoch": 1.2337375964718853,
"grad_norm": 0.23529255390167236,
"learning_rate": 0.00011525704809286899,
"loss": 0.3838,
"step": 2238
},
{
"epoch": 1.2342888643880925,
"grad_norm": 0.2100450098514557,
"learning_rate": 0.00011517412935323382,
"loss": 0.3639,
"step": 2239
},
{
"epoch": 1.2348401323043,
"grad_norm": 0.24556870758533478,
"learning_rate": 0.00011509121061359865,
"loss": 0.3901,
"step": 2240
},
{
"epoch": 1.235391400220507,
"grad_norm": 0.2549160420894623,
"learning_rate": 0.0001150082918739635,
"loss": 0.3871,
"step": 2241
},
{
"epoch": 1.2359426681367145,
"grad_norm": 0.23175586760044098,
"learning_rate": 0.00011492537313432834,
"loss": 0.3886,
"step": 2242
},
{
"epoch": 1.2364939360529217,
"grad_norm": 0.2296617478132248,
"learning_rate": 0.00011484245439469319,
"loss": 0.406,
"step": 2243
},
{
"epoch": 1.237045203969129,
"grad_norm": 0.2378944754600525,
"learning_rate": 0.00011475953565505803,
"loss": 0.3949,
"step": 2244
},
{
"epoch": 1.2375964718853363,
"grad_norm": 0.23094962537288666,
"learning_rate": 0.00011467661691542286,
"loss": 0.3875,
"step": 2245
},
{
"epoch": 1.2381477398015435,
"grad_norm": 0.22399038076400757,
"learning_rate": 0.00011459369817578771,
"loss": 0.4009,
"step": 2246
},
{
"epoch": 1.2386990077177509,
"grad_norm": 0.24871258437633514,
"learning_rate": 0.00011451077943615256,
"loss": 0.3926,
"step": 2247
},
{
"epoch": 1.239250275633958,
"grad_norm": 0.23597979545593262,
"learning_rate": 0.0001144278606965174,
"loss": 0.3803,
"step": 2248
},
{
"epoch": 1.2398015435501655,
"grad_norm": 0.23361554741859436,
"learning_rate": 0.00011434494195688225,
"loss": 0.3994,
"step": 2249
},
{
"epoch": 1.2403528114663727,
"grad_norm": 0.2614096999168396,
"learning_rate": 0.00011426202321724708,
"loss": 0.3946,
"step": 2250
},
{
"epoch": 1.2409040793825798,
"grad_norm": 0.23481406271457672,
"learning_rate": 0.00011417910447761192,
"loss": 0.3981,
"step": 2251
},
{
"epoch": 1.2414553472987873,
"grad_norm": 0.21524877846240997,
"learning_rate": 0.00011409618573797677,
"loss": 0.3725,
"step": 2252
},
{
"epoch": 1.2420066152149944,
"grad_norm": 0.2307668924331665,
"learning_rate": 0.00011401326699834162,
"loss": 0.3829,
"step": 2253
},
{
"epoch": 1.2425578831312019,
"grad_norm": 0.2581194341182709,
"learning_rate": 0.00011393034825870646,
"loss": 0.3901,
"step": 2254
},
{
"epoch": 1.243109151047409,
"grad_norm": 0.235372856259346,
"learning_rate": 0.0001138474295190713,
"loss": 0.3922,
"step": 2255
},
{
"epoch": 1.2436604189636162,
"grad_norm": 0.23432569205760956,
"learning_rate": 0.00011376451077943614,
"loss": 0.3767,
"step": 2256
},
{
"epoch": 1.2442116868798236,
"grad_norm": 0.2407122552394867,
"learning_rate": 0.00011368159203980098,
"loss": 0.4207,
"step": 2257
},
{
"epoch": 1.2447629547960308,
"grad_norm": 0.25739043951034546,
"learning_rate": 0.00011359867330016583,
"loss": 0.3838,
"step": 2258
},
{
"epoch": 1.2453142227122382,
"grad_norm": 0.25240135192871094,
"learning_rate": 0.00011351575456053068,
"loss": 0.3989,
"step": 2259
},
{
"epoch": 1.2458654906284454,
"grad_norm": 0.22552815079689026,
"learning_rate": 0.00011343283582089551,
"loss": 0.3848,
"step": 2260
},
{
"epoch": 1.2464167585446526,
"grad_norm": 0.2320718765258789,
"learning_rate": 0.00011334991708126035,
"loss": 0.382,
"step": 2261
},
{
"epoch": 1.24696802646086,
"grad_norm": 0.23423726856708527,
"learning_rate": 0.0001132669983416252,
"loss": 0.3817,
"step": 2262
},
{
"epoch": 1.2475192943770672,
"grad_norm": 0.22892701625823975,
"learning_rate": 0.00011318407960199004,
"loss": 0.3858,
"step": 2263
},
{
"epoch": 1.2480705622932746,
"grad_norm": 0.23635762929916382,
"learning_rate": 0.00011310116086235489,
"loss": 0.3946,
"step": 2264
},
{
"epoch": 1.2486218302094818,
"grad_norm": 0.23909956216812134,
"learning_rate": 0.00011301824212271972,
"loss": 0.3826,
"step": 2265
},
{
"epoch": 1.249173098125689,
"grad_norm": 0.23733805119991302,
"learning_rate": 0.00011293532338308457,
"loss": 0.4215,
"step": 2266
},
{
"epoch": 1.2497243660418964,
"grad_norm": 0.2257446050643921,
"learning_rate": 0.00011285240464344941,
"loss": 0.3959,
"step": 2267
},
{
"epoch": 1.2502756339581036,
"grad_norm": 0.2394627183675766,
"learning_rate": 0.00011276948590381426,
"loss": 0.398,
"step": 2268
},
{
"epoch": 1.2508269018743108,
"grad_norm": 0.22113938629627228,
"learning_rate": 0.0001126865671641791,
"loss": 0.3837,
"step": 2269
},
{
"epoch": 1.2513781697905182,
"grad_norm": 0.22951479256153107,
"learning_rate": 0.00011260364842454394,
"loss": 0.391,
"step": 2270
},
{
"epoch": 1.2519294377067256,
"grad_norm": 0.22468437254428864,
"learning_rate": 0.00011252072968490878,
"loss": 0.3788,
"step": 2271
},
{
"epoch": 1.2524807056229328,
"grad_norm": 0.21054887771606445,
"learning_rate": 0.00011243781094527363,
"loss": 0.3891,
"step": 2272
},
{
"epoch": 1.25303197353914,
"grad_norm": 0.2274617701768875,
"learning_rate": 0.00011235489220563847,
"loss": 0.3883,
"step": 2273
},
{
"epoch": 1.2535832414553472,
"grad_norm": 0.22995011508464813,
"learning_rate": 0.0001122719734660033,
"loss": 0.3847,
"step": 2274
},
{
"epoch": 1.2541345093715546,
"grad_norm": 0.22627364099025726,
"learning_rate": 0.00011218905472636815,
"loss": 0.3924,
"step": 2275
},
{
"epoch": 1.254685777287762,
"grad_norm": 0.23559615015983582,
"learning_rate": 0.000112106135986733,
"loss": 0.3966,
"step": 2276
},
{
"epoch": 1.2552370452039692,
"grad_norm": 0.21304303407669067,
"learning_rate": 0.00011202321724709784,
"loss": 0.3624,
"step": 2277
},
{
"epoch": 1.2557883131201764,
"grad_norm": 0.241587296128273,
"learning_rate": 0.00011194029850746269,
"loss": 0.3719,
"step": 2278
},
{
"epoch": 1.2563395810363835,
"grad_norm": 0.22992491722106934,
"learning_rate": 0.00011185737976782752,
"loss": 0.4019,
"step": 2279
},
{
"epoch": 1.256890848952591,
"grad_norm": 0.2323186844587326,
"learning_rate": 0.00011177446102819237,
"loss": 0.3725,
"step": 2280
},
{
"epoch": 1.2574421168687984,
"grad_norm": 0.23510509729385376,
"learning_rate": 0.00011169154228855721,
"loss": 0.4176,
"step": 2281
},
{
"epoch": 1.2579933847850056,
"grad_norm": 0.23601877689361572,
"learning_rate": 0.00011160862354892206,
"loss": 0.4036,
"step": 2282
},
{
"epoch": 1.2585446527012127,
"grad_norm": 0.23654739558696747,
"learning_rate": 0.00011152570480928687,
"loss": 0.403,
"step": 2283
},
{
"epoch": 1.25909592061742,
"grad_norm": 0.2428976446390152,
"learning_rate": 0.00011144278606965172,
"loss": 0.3703,
"step": 2284
},
{
"epoch": 1.2596471885336273,
"grad_norm": 0.23753516376018524,
"learning_rate": 0.00011135986733001657,
"loss": 0.3979,
"step": 2285
},
{
"epoch": 1.2601984564498347,
"grad_norm": 0.2367447316646576,
"learning_rate": 0.00011127694859038141,
"loss": 0.3822,
"step": 2286
},
{
"epoch": 1.260749724366042,
"grad_norm": 0.2365788072347641,
"learning_rate": 0.00011119402985074626,
"loss": 0.389,
"step": 2287
},
{
"epoch": 1.2613009922822491,
"grad_norm": 0.22868278622627258,
"learning_rate": 0.00011111111111111109,
"loss": 0.391,
"step": 2288
},
{
"epoch": 1.2618522601984565,
"grad_norm": 0.23099401593208313,
"learning_rate": 0.00011102819237147593,
"loss": 0.3947,
"step": 2289
},
{
"epoch": 1.2624035281146637,
"grad_norm": 0.24031782150268555,
"learning_rate": 0.00011094527363184078,
"loss": 0.3839,
"step": 2290
},
{
"epoch": 1.2629547960308711,
"grad_norm": 0.2490132451057434,
"learning_rate": 0.00011086235489220563,
"loss": 0.3896,
"step": 2291
},
{
"epoch": 1.2635060639470783,
"grad_norm": 0.2366219013929367,
"learning_rate": 0.00011077943615257047,
"loss": 0.3933,
"step": 2292
},
{
"epoch": 1.2640573318632855,
"grad_norm": 0.22578656673431396,
"learning_rate": 0.0001106965174129353,
"loss": 0.3723,
"step": 2293
},
{
"epoch": 1.264608599779493,
"grad_norm": 0.23483921587467194,
"learning_rate": 0.00011061359867330015,
"loss": 0.3895,
"step": 2294
},
{
"epoch": 1.2651598676957,
"grad_norm": 0.2586977481842041,
"learning_rate": 0.000110530679933665,
"loss": 0.4042,
"step": 2295
},
{
"epoch": 1.2657111356119075,
"grad_norm": 0.23051442205905914,
"learning_rate": 0.00011044776119402984,
"loss": 0.3862,
"step": 2296
},
{
"epoch": 1.2662624035281147,
"grad_norm": 0.2358439564704895,
"learning_rate": 0.00011036484245439469,
"loss": 0.3798,
"step": 2297
},
{
"epoch": 1.2668136714443219,
"grad_norm": 0.23679201304912567,
"learning_rate": 0.00011028192371475952,
"loss": 0.4037,
"step": 2298
},
{
"epoch": 1.2673649393605293,
"grad_norm": 0.23940104246139526,
"learning_rate": 0.00011019900497512436,
"loss": 0.3898,
"step": 2299
},
{
"epoch": 1.2679162072767365,
"grad_norm": 0.23662586510181427,
"learning_rate": 0.00011011608623548921,
"loss": 0.4001,
"step": 2300
},
{
"epoch": 1.268467475192944,
"grad_norm": 0.23159541189670563,
"learning_rate": 0.00011003316749585405,
"loss": 0.3919,
"step": 2301
},
{
"epoch": 1.269018743109151,
"grad_norm": 0.21939191222190857,
"learning_rate": 0.0001099502487562189,
"loss": 0.3902,
"step": 2302
},
{
"epoch": 1.2695700110253583,
"grad_norm": 0.24052447080612183,
"learning_rate": 0.00010986733001658373,
"loss": 0.391,
"step": 2303
},
{
"epoch": 1.2701212789415657,
"grad_norm": 0.22359569370746613,
"learning_rate": 0.00010978441127694858,
"loss": 0.3813,
"step": 2304
},
{
"epoch": 1.2706725468577729,
"grad_norm": 0.22367626428604126,
"learning_rate": 0.00010970149253731342,
"loss": 0.3873,
"step": 2305
},
{
"epoch": 1.2712238147739803,
"grad_norm": 0.24156810343265533,
"learning_rate": 0.00010961857379767827,
"loss": 0.3996,
"step": 2306
},
{
"epoch": 1.2717750826901875,
"grad_norm": 0.23700320720672607,
"learning_rate": 0.00010953565505804311,
"loss": 0.3901,
"step": 2307
},
{
"epoch": 1.2723263506063947,
"grad_norm": 0.2303237020969391,
"learning_rate": 0.00010945273631840795,
"loss": 0.4031,
"step": 2308
},
{
"epoch": 1.272877618522602,
"grad_norm": 0.2249428927898407,
"learning_rate": 0.00010936981757877279,
"loss": 0.3942,
"step": 2309
},
{
"epoch": 1.2734288864388092,
"grad_norm": 0.2448328137397766,
"learning_rate": 0.00010928689883913764,
"loss": 0.3941,
"step": 2310
},
{
"epoch": 1.2739801543550167,
"grad_norm": 0.23278410732746124,
"learning_rate": 0.00010920398009950248,
"loss": 0.395,
"step": 2311
},
{
"epoch": 1.2745314222712238,
"grad_norm": 0.24542638659477234,
"learning_rate": 0.00010912106135986733,
"loss": 0.4278,
"step": 2312
},
{
"epoch": 1.275082690187431,
"grad_norm": 0.22305360436439514,
"learning_rate": 0.00010903814262023216,
"loss": 0.3932,
"step": 2313
},
{
"epoch": 1.2756339581036384,
"grad_norm": 0.24365827441215515,
"learning_rate": 0.00010895522388059701,
"loss": 0.3963,
"step": 2314
},
{
"epoch": 1.2761852260198456,
"grad_norm": 0.24421466886997223,
"learning_rate": 0.00010887230514096185,
"loss": 0.3956,
"step": 2315
},
{
"epoch": 1.276736493936053,
"grad_norm": 0.24353346228599548,
"learning_rate": 0.0001087893864013267,
"loss": 0.3837,
"step": 2316
},
{
"epoch": 1.2772877618522602,
"grad_norm": 0.24044160544872284,
"learning_rate": 0.00010870646766169154,
"loss": 0.3964,
"step": 2317
},
{
"epoch": 1.2778390297684674,
"grad_norm": 0.2651362717151642,
"learning_rate": 0.00010862354892205638,
"loss": 0.388,
"step": 2318
},
{
"epoch": 1.2783902976846748,
"grad_norm": 0.23700033128261566,
"learning_rate": 0.00010854063018242122,
"loss": 0.38,
"step": 2319
},
{
"epoch": 1.278941565600882,
"grad_norm": 0.23535655438899994,
"learning_rate": 0.00010845771144278607,
"loss": 0.3934,
"step": 2320
},
{
"epoch": 1.2794928335170894,
"grad_norm": 0.26524481177330017,
"learning_rate": 0.00010837479270315091,
"loss": 0.3875,
"step": 2321
},
{
"epoch": 1.2800441014332966,
"grad_norm": 0.24175146222114563,
"learning_rate": 0.00010829187396351576,
"loss": 0.3634,
"step": 2322
},
{
"epoch": 1.2805953693495038,
"grad_norm": 0.231819286942482,
"learning_rate": 0.00010820895522388059,
"loss": 0.388,
"step": 2323
},
{
"epoch": 1.2811466372657112,
"grad_norm": 0.21814289689064026,
"learning_rate": 0.00010812603648424544,
"loss": 0.3711,
"step": 2324
},
{
"epoch": 1.2816979051819184,
"grad_norm": 0.23096728324890137,
"learning_rate": 0.00010804311774461028,
"loss": 0.3974,
"step": 2325
},
{
"epoch": 1.2822491730981258,
"grad_norm": 0.24553930759429932,
"learning_rate": 0.00010796019900497513,
"loss": 0.3897,
"step": 2326
},
{
"epoch": 1.282800441014333,
"grad_norm": 0.23141168057918549,
"learning_rate": 0.00010787728026533995,
"loss": 0.3898,
"step": 2327
},
{
"epoch": 1.2833517089305402,
"grad_norm": 0.23394468426704407,
"learning_rate": 0.00010779436152570479,
"loss": 0.4049,
"step": 2328
},
{
"epoch": 1.2839029768467476,
"grad_norm": 0.2231445461511612,
"learning_rate": 0.00010771144278606964,
"loss": 0.3911,
"step": 2329
},
{
"epoch": 1.2844542447629548,
"grad_norm": 0.2506980299949646,
"learning_rate": 0.00010762852404643448,
"loss": 0.423,
"step": 2330
},
{
"epoch": 1.2850055126791622,
"grad_norm": 0.23698961734771729,
"learning_rate": 0.00010754560530679931,
"loss": 0.4046,
"step": 2331
},
{
"epoch": 1.2855567805953694,
"grad_norm": 0.24735629558563232,
"learning_rate": 0.00010746268656716416,
"loss": 0.4078,
"step": 2332
},
{
"epoch": 1.2861080485115766,
"grad_norm": 0.25394487380981445,
"learning_rate": 0.000107379767827529,
"loss": 0.4027,
"step": 2333
},
{
"epoch": 1.286659316427784,
"grad_norm": 0.24036946892738342,
"learning_rate": 0.00010729684908789385,
"loss": 0.4042,
"step": 2334
},
{
"epoch": 1.2872105843439912,
"grad_norm": 0.24319007992744446,
"learning_rate": 0.0001072139303482587,
"loss": 0.3901,
"step": 2335
},
{
"epoch": 1.2877618522601986,
"grad_norm": 0.23505842685699463,
"learning_rate": 0.00010713101160862353,
"loss": 0.3914,
"step": 2336
},
{
"epoch": 1.2883131201764058,
"grad_norm": 0.24473319947719574,
"learning_rate": 0.00010704809286898837,
"loss": 0.4098,
"step": 2337
},
{
"epoch": 1.288864388092613,
"grad_norm": 0.24411208927631378,
"learning_rate": 0.00010696517412935322,
"loss": 0.4158,
"step": 2338
},
{
"epoch": 1.2894156560088204,
"grad_norm": 0.2365306317806244,
"learning_rate": 0.00010688225538971807,
"loss": 0.3955,
"step": 2339
},
{
"epoch": 1.2899669239250275,
"grad_norm": 0.23471403121948242,
"learning_rate": 0.00010679933665008291,
"loss": 0.3796,
"step": 2340
},
{
"epoch": 1.290518191841235,
"grad_norm": 0.22727487981319427,
"learning_rate": 0.00010671641791044774,
"loss": 0.4044,
"step": 2341
},
{
"epoch": 1.2910694597574421,
"grad_norm": 0.22571586072444916,
"learning_rate": 0.00010663349917081259,
"loss": 0.3551,
"step": 2342
},
{
"epoch": 1.2916207276736493,
"grad_norm": 0.24545998871326447,
"learning_rate": 0.00010655058043117743,
"loss": 0.4144,
"step": 2343
},
{
"epoch": 1.2921719955898567,
"grad_norm": 0.2357962727546692,
"learning_rate": 0.00010646766169154228,
"loss": 0.391,
"step": 2344
},
{
"epoch": 1.292723263506064,
"grad_norm": 0.23277200758457184,
"learning_rate": 0.00010638474295190713,
"loss": 0.4027,
"step": 2345
},
{
"epoch": 1.2932745314222713,
"grad_norm": 0.2385130524635315,
"learning_rate": 0.00010630182421227196,
"loss": 0.4039,
"step": 2346
},
{
"epoch": 1.2938257993384785,
"grad_norm": 0.21902373433113098,
"learning_rate": 0.0001062189054726368,
"loss": 0.3699,
"step": 2347
},
{
"epoch": 1.2943770672546857,
"grad_norm": 0.23025818169116974,
"learning_rate": 0.00010613598673300165,
"loss": 0.3822,
"step": 2348
},
{
"epoch": 1.2949283351708931,
"grad_norm": 0.2286684513092041,
"learning_rate": 0.0001060530679933665,
"loss": 0.401,
"step": 2349
},
{
"epoch": 1.2954796030871003,
"grad_norm": 0.23381029069423676,
"learning_rate": 0.00010597014925373134,
"loss": 0.3991,
"step": 2350
},
{
"epoch": 1.2960308710033077,
"grad_norm": 0.23572219908237457,
"learning_rate": 0.00010588723051409617,
"loss": 0.3993,
"step": 2351
},
{
"epoch": 1.296582138919515,
"grad_norm": 0.22969138622283936,
"learning_rate": 0.00010580431177446102,
"loss": 0.3859,
"step": 2352
},
{
"epoch": 1.297133406835722,
"grad_norm": 0.24054940044879913,
"learning_rate": 0.00010572139303482586,
"loss": 0.4137,
"step": 2353
},
{
"epoch": 1.2976846747519295,
"grad_norm": 0.235767662525177,
"learning_rate": 0.00010563847429519071,
"loss": 0.377,
"step": 2354
},
{
"epoch": 1.2982359426681367,
"grad_norm": 0.22807767987251282,
"learning_rate": 0.00010555555555555555,
"loss": 0.3974,
"step": 2355
},
{
"epoch": 1.298787210584344,
"grad_norm": 0.22131551802158356,
"learning_rate": 0.00010547263681592039,
"loss": 0.4002,
"step": 2356
},
{
"epoch": 1.2993384785005513,
"grad_norm": 0.24462686479091644,
"learning_rate": 0.00010538971807628523,
"loss": 0.4169,
"step": 2357
},
{
"epoch": 1.2998897464167585,
"grad_norm": 0.24126161634922028,
"learning_rate": 0.00010530679933665008,
"loss": 0.3846,
"step": 2358
},
{
"epoch": 1.3004410143329659,
"grad_norm": 0.2536928951740265,
"learning_rate": 0.00010522388059701492,
"loss": 0.3883,
"step": 2359
},
{
"epoch": 1.300992282249173,
"grad_norm": 0.23638053238391876,
"learning_rate": 0.00010514096185737977,
"loss": 0.3916,
"step": 2360
},
{
"epoch": 1.3015435501653805,
"grad_norm": 0.21713566780090332,
"learning_rate": 0.0001050580431177446,
"loss": 0.382,
"step": 2361
},
{
"epoch": 1.3020948180815877,
"grad_norm": 0.23291055858135223,
"learning_rate": 0.00010497512437810945,
"loss": 0.3831,
"step": 2362
},
{
"epoch": 1.3026460859977949,
"grad_norm": 0.2169044464826584,
"learning_rate": 0.00010489220563847429,
"loss": 0.3705,
"step": 2363
},
{
"epoch": 1.3031973539140023,
"grad_norm": 0.23216962814331055,
"learning_rate": 0.00010480928689883914,
"loss": 0.3691,
"step": 2364
},
{
"epoch": 1.3037486218302095,
"grad_norm": 0.2367962896823883,
"learning_rate": 0.00010472636815920398,
"loss": 0.4011,
"step": 2365
},
{
"epoch": 1.3042998897464169,
"grad_norm": 0.22988784313201904,
"learning_rate": 0.00010464344941956881,
"loss": 0.3904,
"step": 2366
},
{
"epoch": 1.304851157662624,
"grad_norm": 0.21731241047382355,
"learning_rate": 0.00010456053067993366,
"loss": 0.3815,
"step": 2367
},
{
"epoch": 1.3054024255788312,
"grad_norm": 0.25733426213264465,
"learning_rate": 0.0001044776119402985,
"loss": 0.4253,
"step": 2368
},
{
"epoch": 1.3059536934950386,
"grad_norm": 0.23438294231891632,
"learning_rate": 0.00010439469320066335,
"loss": 0.4041,
"step": 2369
},
{
"epoch": 1.3065049614112458,
"grad_norm": 0.22011101245880127,
"learning_rate": 0.0001043117744610282,
"loss": 0.3948,
"step": 2370
},
{
"epoch": 1.3070562293274532,
"grad_norm": 0.2404097616672516,
"learning_rate": 0.00010422885572139302,
"loss": 0.3996,
"step": 2371
},
{
"epoch": 1.3076074972436604,
"grad_norm": 0.23479090631008148,
"learning_rate": 0.00010414593698175786,
"loss": 0.4048,
"step": 2372
},
{
"epoch": 1.3081587651598676,
"grad_norm": 0.22892162203788757,
"learning_rate": 0.0001040630182421227,
"loss": 0.3751,
"step": 2373
},
{
"epoch": 1.308710033076075,
"grad_norm": 0.22712910175323486,
"learning_rate": 0.00010398009950248755,
"loss": 0.3777,
"step": 2374
},
{
"epoch": 1.3092613009922822,
"grad_norm": 0.22894370555877686,
"learning_rate": 0.00010389718076285238,
"loss": 0.3936,
"step": 2375
},
{
"epoch": 1.3098125689084896,
"grad_norm": 0.24097605049610138,
"learning_rate": 0.00010381426202321723,
"loss": 0.3693,
"step": 2376
},
{
"epoch": 1.3103638368246968,
"grad_norm": 0.23055890202522278,
"learning_rate": 0.00010373134328358208,
"loss": 0.3777,
"step": 2377
},
{
"epoch": 1.310915104740904,
"grad_norm": 0.23357531428337097,
"learning_rate": 0.00010364842454394692,
"loss": 0.3945,
"step": 2378
},
{
"epoch": 1.3114663726571114,
"grad_norm": 0.2378157526254654,
"learning_rate": 0.00010356550580431177,
"loss": 0.4077,
"step": 2379
},
{
"epoch": 1.3120176405733186,
"grad_norm": 0.2348390370607376,
"learning_rate": 0.0001034825870646766,
"loss": 0.3905,
"step": 2380
},
{
"epoch": 1.312568908489526,
"grad_norm": 0.24251805245876312,
"learning_rate": 0.00010339966832504144,
"loss": 0.4174,
"step": 2381
},
{
"epoch": 1.3131201764057332,
"grad_norm": 0.23102574050426483,
"learning_rate": 0.00010331674958540629,
"loss": 0.3856,
"step": 2382
},
{
"epoch": 1.3136714443219404,
"grad_norm": 0.2383720427751541,
"learning_rate": 0.00010323383084577114,
"loss": 0.3932,
"step": 2383
},
{
"epoch": 1.3142227122381478,
"grad_norm": 0.22161129117012024,
"learning_rate": 0.00010315091210613598,
"loss": 0.396,
"step": 2384
},
{
"epoch": 1.314773980154355,
"grad_norm": 0.2228018343448639,
"learning_rate": 0.00010306799336650081,
"loss": 0.3862,
"step": 2385
},
{
"epoch": 1.3153252480705624,
"grad_norm": 0.22873203456401825,
"learning_rate": 0.00010298507462686566,
"loss": 0.3513,
"step": 2386
},
{
"epoch": 1.3158765159867696,
"grad_norm": 0.23780828714370728,
"learning_rate": 0.0001029021558872305,
"loss": 0.3888,
"step": 2387
},
{
"epoch": 1.3164277839029768,
"grad_norm": 0.2447124868631363,
"learning_rate": 0.00010281923714759535,
"loss": 0.4046,
"step": 2388
},
{
"epoch": 1.3169790518191842,
"grad_norm": 0.24726513028144836,
"learning_rate": 0.0001027363184079602,
"loss": 0.4086,
"step": 2389
},
{
"epoch": 1.3175303197353914,
"grad_norm": 0.2359735518693924,
"learning_rate": 0.00010265339966832503,
"loss": 0.4015,
"step": 2390
},
{
"epoch": 1.3180815876515988,
"grad_norm": 0.23657964169979095,
"learning_rate": 0.00010257048092868987,
"loss": 0.3859,
"step": 2391
},
{
"epoch": 1.318632855567806,
"grad_norm": 0.23830877244472504,
"learning_rate": 0.00010248756218905472,
"loss": 0.3864,
"step": 2392
},
{
"epoch": 1.3191841234840131,
"grad_norm": 0.2303212434053421,
"learning_rate": 0.00010240464344941956,
"loss": 0.4036,
"step": 2393
},
{
"epoch": 1.3197353914002206,
"grad_norm": 0.2221781462430954,
"learning_rate": 0.0001023217247097844,
"loss": 0.3712,
"step": 2394
},
{
"epoch": 1.3202866593164277,
"grad_norm": 0.22085942327976227,
"learning_rate": 0.00010223880597014924,
"loss": 0.3708,
"step": 2395
},
{
"epoch": 1.3208379272326352,
"grad_norm": 0.24135445058345795,
"learning_rate": 0.00010215588723051409,
"loss": 0.3896,
"step": 2396
},
{
"epoch": 1.3213891951488423,
"grad_norm": 0.24116064608097076,
"learning_rate": 0.00010207296849087893,
"loss": 0.3866,
"step": 2397
},
{
"epoch": 1.3219404630650495,
"grad_norm": 0.26890698075294495,
"learning_rate": 0.00010199004975124378,
"loss": 0.3795,
"step": 2398
},
{
"epoch": 1.322491730981257,
"grad_norm": 0.2322501391172409,
"learning_rate": 0.00010190713101160861,
"loss": 0.3837,
"step": 2399
},
{
"epoch": 1.3230429988974641,
"grad_norm": 0.24631264805793762,
"learning_rate": 0.00010182421227197346,
"loss": 0.3954,
"step": 2400
},
{
"epoch": 1.3235942668136715,
"grad_norm": 0.2258647084236145,
"learning_rate": 0.0001017412935323383,
"loss": 0.3705,
"step": 2401
},
{
"epoch": 1.3241455347298787,
"grad_norm": 0.2519420087337494,
"learning_rate": 0.00010165837479270315,
"loss": 0.3921,
"step": 2402
},
{
"epoch": 1.324696802646086,
"grad_norm": 0.23400020599365234,
"learning_rate": 0.00010157545605306799,
"loss": 0.3702,
"step": 2403
},
{
"epoch": 1.3252480705622933,
"grad_norm": 0.22752946615219116,
"learning_rate": 0.00010149253731343282,
"loss": 0.3756,
"step": 2404
},
{
"epoch": 1.3257993384785005,
"grad_norm": 0.24144931137561798,
"learning_rate": 0.00010140961857379767,
"loss": 0.41,
"step": 2405
},
{
"epoch": 1.326350606394708,
"grad_norm": 0.24649466574192047,
"learning_rate": 0.00010132669983416252,
"loss": 0.4227,
"step": 2406
},
{
"epoch": 1.326901874310915,
"grad_norm": 0.22007010877132416,
"learning_rate": 0.00010124378109452736,
"loss": 0.3802,
"step": 2407
},
{
"epoch": 1.3274531422271223,
"grad_norm": 0.2177124321460724,
"learning_rate": 0.00010116086235489221,
"loss": 0.3733,
"step": 2408
},
{
"epoch": 1.3280044101433297,
"grad_norm": 0.23224158585071564,
"learning_rate": 0.00010107794361525704,
"loss": 0.3774,
"step": 2409
},
{
"epoch": 1.328555678059537,
"grad_norm": 0.24728813767433167,
"learning_rate": 0.00010099502487562188,
"loss": 0.3926,
"step": 2410
},
{
"epoch": 1.3291069459757443,
"grad_norm": 0.22190050780773163,
"learning_rate": 0.00010091210613598673,
"loss": 0.3826,
"step": 2411
},
{
"epoch": 1.3296582138919515,
"grad_norm": 0.23956191539764404,
"learning_rate": 0.00010082918739635158,
"loss": 0.3982,
"step": 2412
},
{
"epoch": 1.3302094818081587,
"grad_norm": 0.23789376020431519,
"learning_rate": 0.00010074626865671642,
"loss": 0.4032,
"step": 2413
},
{
"epoch": 1.330760749724366,
"grad_norm": 0.24080632627010345,
"learning_rate": 0.00010066334991708125,
"loss": 0.3974,
"step": 2414
},
{
"epoch": 1.3313120176405733,
"grad_norm": 0.22118644416332245,
"learning_rate": 0.00010058043117744609,
"loss": 0.3848,
"step": 2415
},
{
"epoch": 1.3318632855567807,
"grad_norm": 0.24440258741378784,
"learning_rate": 0.00010049751243781093,
"loss": 0.3801,
"step": 2416
},
{
"epoch": 1.3324145534729879,
"grad_norm": 0.23864087462425232,
"learning_rate": 0.00010041459369817578,
"loss": 0.4019,
"step": 2417
},
{
"epoch": 1.332965821389195,
"grad_norm": 0.2365901917219162,
"learning_rate": 0.00010033167495854061,
"loss": 0.3827,
"step": 2418
},
{
"epoch": 1.3335170893054025,
"grad_norm": 0.22480501234531403,
"learning_rate": 0.00010024875621890545,
"loss": 0.3696,
"step": 2419
},
{
"epoch": 1.3340683572216097,
"grad_norm": 0.23156774044036865,
"learning_rate": 0.0001001658374792703,
"loss": 0.3803,
"step": 2420
},
{
"epoch": 1.334619625137817,
"grad_norm": 0.22590211033821106,
"learning_rate": 0.00010008291873963515,
"loss": 0.387,
"step": 2421
},
{
"epoch": 1.3351708930540243,
"grad_norm": 0.2270091325044632,
"learning_rate": 9.999999999999999e-05,
"loss": 0.381,
"step": 2422
},
{
"epoch": 1.3357221609702314,
"grad_norm": 0.22601434588432312,
"learning_rate": 9.991708126036482e-05,
"loss": 0.3907,
"step": 2423
},
{
"epoch": 1.3362734288864389,
"grad_norm": 0.2249268740415573,
"learning_rate": 9.983416252072967e-05,
"loss": 0.3794,
"step": 2424
},
{
"epoch": 1.336824696802646,
"grad_norm": 0.2406623363494873,
"learning_rate": 9.975124378109451e-05,
"loss": 0.3912,
"step": 2425
},
{
"epoch": 1.3373759647188534,
"grad_norm": 0.24089276790618896,
"learning_rate": 9.966832504145936e-05,
"loss": 0.3997,
"step": 2426
},
{
"epoch": 1.3379272326350606,
"grad_norm": 0.2207108587026596,
"learning_rate": 9.95854063018242e-05,
"loss": 0.3804,
"step": 2427
},
{
"epoch": 1.3384785005512678,
"grad_norm": 0.21747317910194397,
"learning_rate": 9.950248756218904e-05,
"loss": 0.3808,
"step": 2428
},
{
"epoch": 1.3390297684674752,
"grad_norm": 0.2578473687171936,
"learning_rate": 9.941956882255388e-05,
"loss": 0.4195,
"step": 2429
},
{
"epoch": 1.3395810363836824,
"grad_norm": 0.22663085162639618,
"learning_rate": 9.933665008291873e-05,
"loss": 0.3877,
"step": 2430
},
{
"epoch": 1.3401323042998898,
"grad_norm": 0.24075528979301453,
"learning_rate": 9.925373134328357e-05,
"loss": 0.405,
"step": 2431
},
{
"epoch": 1.340683572216097,
"grad_norm": 0.22877177596092224,
"learning_rate": 9.917081260364842e-05,
"loss": 0.382,
"step": 2432
},
{
"epoch": 1.3412348401323042,
"grad_norm": 0.22892452776432037,
"learning_rate": 9.908789386401325e-05,
"loss": 0.3812,
"step": 2433
},
{
"epoch": 1.3417861080485116,
"grad_norm": 0.24187688529491425,
"learning_rate": 9.90049751243781e-05,
"loss": 0.3825,
"step": 2434
},
{
"epoch": 1.3423373759647188,
"grad_norm": 0.22903688251972198,
"learning_rate": 9.892205638474294e-05,
"loss": 0.3878,
"step": 2435
},
{
"epoch": 1.3428886438809262,
"grad_norm": 0.22924572229385376,
"learning_rate": 9.883913764510779e-05,
"loss": 0.388,
"step": 2436
},
{
"epoch": 1.3434399117971334,
"grad_norm": 0.24021534621715546,
"learning_rate": 9.875621890547263e-05,
"loss": 0.4031,
"step": 2437
},
{
"epoch": 1.3439911797133406,
"grad_norm": 0.23757272958755493,
"learning_rate": 9.867330016583747e-05,
"loss": 0.3934,
"step": 2438
},
{
"epoch": 1.344542447629548,
"grad_norm": 0.2555783987045288,
"learning_rate": 9.859038142620231e-05,
"loss": 0.3988,
"step": 2439
},
{
"epoch": 1.3450937155457552,
"grad_norm": 0.23108243942260742,
"learning_rate": 9.850746268656716e-05,
"loss": 0.379,
"step": 2440
},
{
"epoch": 1.3456449834619626,
"grad_norm": 0.24363455176353455,
"learning_rate": 9.8424543946932e-05,
"loss": 0.3939,
"step": 2441
},
{
"epoch": 1.3461962513781698,
"grad_norm": 0.2295197993516922,
"learning_rate": 9.834162520729685e-05,
"loss": 0.3799,
"step": 2442
},
{
"epoch": 1.346747519294377,
"grad_norm": 0.23563653230667114,
"learning_rate": 9.825870646766168e-05,
"loss": 0.3755,
"step": 2443
},
{
"epoch": 1.3472987872105844,
"grad_norm": 0.2241990715265274,
"learning_rate": 9.817578772802653e-05,
"loss": 0.3794,
"step": 2444
},
{
"epoch": 1.3478500551267916,
"grad_norm": 0.2593122124671936,
"learning_rate": 9.809286898839137e-05,
"loss": 0.3766,
"step": 2445
},
{
"epoch": 1.348401323042999,
"grad_norm": 0.22955520451068878,
"learning_rate": 9.800995024875622e-05,
"loss": 0.3787,
"step": 2446
},
{
"epoch": 1.3489525909592062,
"grad_norm": 0.23866330087184906,
"learning_rate": 9.792703150912106e-05,
"loss": 0.3955,
"step": 2447
},
{
"epoch": 1.3495038588754134,
"grad_norm": 0.24115972220897675,
"learning_rate": 9.78441127694859e-05,
"loss": 0.3811,
"step": 2448
},
{
"epoch": 1.3500551267916208,
"grad_norm": 0.23597833514213562,
"learning_rate": 9.776119402985074e-05,
"loss": 0.3831,
"step": 2449
},
{
"epoch": 1.350606394707828,
"grad_norm": 0.2415011078119278,
"learning_rate": 9.767827529021559e-05,
"loss": 0.3896,
"step": 2450
},
{
"epoch": 1.3511576626240354,
"grad_norm": 0.2416457235813141,
"learning_rate": 9.759535655058043e-05,
"loss": 0.3888,
"step": 2451
},
{
"epoch": 1.3517089305402425,
"grad_norm": 0.23950545489788055,
"learning_rate": 9.751243781094528e-05,
"loss": 0.3942,
"step": 2452
},
{
"epoch": 1.3522601984564497,
"grad_norm": 0.24059046804904938,
"learning_rate": 9.742951907131011e-05,
"loss": 0.4005,
"step": 2453
},
{
"epoch": 1.3528114663726571,
"grad_norm": 0.2414311021566391,
"learning_rate": 9.734660033167496e-05,
"loss": 0.3795,
"step": 2454
},
{
"epoch": 1.3533627342888643,
"grad_norm": 0.23370300233364105,
"learning_rate": 9.72636815920398e-05,
"loss": 0.3728,
"step": 2455
},
{
"epoch": 1.3539140022050717,
"grad_norm": 0.23373939096927643,
"learning_rate": 9.718076285240465e-05,
"loss": 0.3925,
"step": 2456
},
{
"epoch": 1.354465270121279,
"grad_norm": 0.22576579451560974,
"learning_rate": 9.709784411276948e-05,
"loss": 0.3787,
"step": 2457
},
{
"epoch": 1.3550165380374861,
"grad_norm": 0.22904476523399353,
"learning_rate": 9.701492537313432e-05,
"loss": 0.3939,
"step": 2458
},
{
"epoch": 1.3555678059536935,
"grad_norm": 0.24833030998706818,
"learning_rate": 9.693200663349916e-05,
"loss": 0.394,
"step": 2459
},
{
"epoch": 1.3561190738699007,
"grad_norm": 0.22664152085781097,
"learning_rate": 9.6849087893864e-05,
"loss": 0.363,
"step": 2460
},
{
"epoch": 1.3566703417861081,
"grad_norm": 0.23569191992282867,
"learning_rate": 9.676616915422883e-05,
"loss": 0.3823,
"step": 2461
},
{
"epoch": 1.3572216097023153,
"grad_norm": 0.23659692704677582,
"learning_rate": 9.668325041459368e-05,
"loss": 0.3879,
"step": 2462
},
{
"epoch": 1.3577728776185225,
"grad_norm": 0.22711534798145294,
"learning_rate": 9.660033167495852e-05,
"loss": 0.3761,
"step": 2463
},
{
"epoch": 1.35832414553473,
"grad_norm": 0.23172332346439362,
"learning_rate": 9.651741293532337e-05,
"loss": 0.3774,
"step": 2464
},
{
"epoch": 1.358875413450937,
"grad_norm": 0.23141370713710785,
"learning_rate": 9.643449419568822e-05,
"loss": 0.3976,
"step": 2465
},
{
"epoch": 1.3594266813671445,
"grad_norm": 0.24368800222873688,
"learning_rate": 9.635157545605305e-05,
"loss": 0.3843,
"step": 2466
},
{
"epoch": 1.3599779492833517,
"grad_norm": 0.22588768601417542,
"learning_rate": 9.62686567164179e-05,
"loss": 0.3798,
"step": 2467
},
{
"epoch": 1.3605292171995589,
"grad_norm": 0.2269313633441925,
"learning_rate": 9.618573797678274e-05,
"loss": 0.3874,
"step": 2468
},
{
"epoch": 1.3610804851157663,
"grad_norm": 0.23487702012062073,
"learning_rate": 9.610281923714758e-05,
"loss": 0.3888,
"step": 2469
},
{
"epoch": 1.3616317530319735,
"grad_norm": 0.2513071894645691,
"learning_rate": 9.601990049751243e-05,
"loss": 0.4122,
"step": 2470
},
{
"epoch": 1.362183020948181,
"grad_norm": 0.21708211302757263,
"learning_rate": 9.593698175787726e-05,
"loss": 0.3597,
"step": 2471
},
{
"epoch": 1.362734288864388,
"grad_norm": 0.2279457300901413,
"learning_rate": 9.585406301824211e-05,
"loss": 0.3834,
"step": 2472
},
{
"epoch": 1.3632855567805953,
"grad_norm": 0.22766946256160736,
"learning_rate": 9.577114427860695e-05,
"loss": 0.3682,
"step": 2473
},
{
"epoch": 1.3638368246968027,
"grad_norm": 0.22673630714416504,
"learning_rate": 9.56882255389718e-05,
"loss": 0.3823,
"step": 2474
},
{
"epoch": 1.3643880926130099,
"grad_norm": 0.23767007887363434,
"learning_rate": 9.560530679933664e-05,
"loss": 0.3991,
"step": 2475
},
{
"epoch": 1.3649393605292173,
"grad_norm": 0.2326952964067459,
"learning_rate": 9.552238805970148e-05,
"loss": 0.39,
"step": 2476
},
{
"epoch": 1.3654906284454245,
"grad_norm": 0.2336025983095169,
"learning_rate": 9.543946932006632e-05,
"loss": 0.3748,
"step": 2477
},
{
"epoch": 1.3660418963616316,
"grad_norm": 0.23857955634593964,
"learning_rate": 9.535655058043117e-05,
"loss": 0.4077,
"step": 2478
},
{
"epoch": 1.366593164277839,
"grad_norm": 0.22810246050357819,
"learning_rate": 9.527363184079601e-05,
"loss": 0.406,
"step": 2479
},
{
"epoch": 1.3671444321940462,
"grad_norm": 0.23381425440311432,
"learning_rate": 9.519071310116086e-05,
"loss": 0.395,
"step": 2480
},
{
"epoch": 1.3676957001102537,
"grad_norm": 0.21443428099155426,
"learning_rate": 9.510779436152569e-05,
"loss": 0.3772,
"step": 2481
},
{
"epoch": 1.3682469680264608,
"grad_norm": 0.23185119032859802,
"learning_rate": 9.502487562189054e-05,
"loss": 0.3892,
"step": 2482
},
{
"epoch": 1.368798235942668,
"grad_norm": 0.2298753708600998,
"learning_rate": 9.494195688225538e-05,
"loss": 0.3891,
"step": 2483
},
{
"epoch": 1.3693495038588754,
"grad_norm": 0.216232031583786,
"learning_rate": 9.485903814262023e-05,
"loss": 0.382,
"step": 2484
},
{
"epoch": 1.3699007717750826,
"grad_norm": 0.23376402258872986,
"learning_rate": 9.477611940298507e-05,
"loss": 0.3992,
"step": 2485
},
{
"epoch": 1.37045203969129,
"grad_norm": 0.2535459101200104,
"learning_rate": 9.46932006633499e-05,
"loss": 0.3957,
"step": 2486
},
{
"epoch": 1.3710033076074972,
"grad_norm": 0.22214862704277039,
"learning_rate": 9.461028192371475e-05,
"loss": 0.3713,
"step": 2487
},
{
"epoch": 1.3715545755237044,
"grad_norm": 0.23064962029457092,
"learning_rate": 9.45273631840796e-05,
"loss": 0.3821,
"step": 2488
},
{
"epoch": 1.3721058434399118,
"grad_norm": 0.249479740858078,
"learning_rate": 9.444444444444444e-05,
"loss": 0.3837,
"step": 2489
},
{
"epoch": 1.372657111356119,
"grad_norm": 0.22704121470451355,
"learning_rate": 9.436152570480929e-05,
"loss": 0.3931,
"step": 2490
},
{
"epoch": 1.3732083792723264,
"grad_norm": 0.23015405237674713,
"learning_rate": 9.427860696517412e-05,
"loss": 0.4049,
"step": 2491
},
{
"epoch": 1.3737596471885336,
"grad_norm": 0.23387496173381805,
"learning_rate": 9.419568822553897e-05,
"loss": 0.3727,
"step": 2492
},
{
"epoch": 1.3743109151047408,
"grad_norm": 0.21825988590717316,
"learning_rate": 9.411276948590381e-05,
"loss": 0.382,
"step": 2493
},
{
"epoch": 1.3748621830209482,
"grad_norm": 0.2230725735425949,
"learning_rate": 9.402985074626866e-05,
"loss": 0.3935,
"step": 2494
},
{
"epoch": 1.3754134509371554,
"grad_norm": 0.22703075408935547,
"learning_rate": 9.39469320066335e-05,
"loss": 0.3848,
"step": 2495
},
{
"epoch": 1.3759647188533628,
"grad_norm": 0.2219892293214798,
"learning_rate": 9.386401326699833e-05,
"loss": 0.3898,
"step": 2496
},
{
"epoch": 1.37651598676957,
"grad_norm": 0.23172403872013092,
"learning_rate": 9.378109452736318e-05,
"loss": 0.3861,
"step": 2497
},
{
"epoch": 1.3770672546857772,
"grad_norm": 0.23237434029579163,
"learning_rate": 9.369817578772803e-05,
"loss": 0.3705,
"step": 2498
},
{
"epoch": 1.3776185226019846,
"grad_norm": 0.2246798872947693,
"learning_rate": 9.361525704809287e-05,
"loss": 0.3679,
"step": 2499
},
{
"epoch": 1.3781697905181918,
"grad_norm": 0.2427067756652832,
"learning_rate": 9.353233830845772e-05,
"loss": 0.4212,
"step": 2500
},
{
"epoch": 1.3781697905181918,
"eval_loss": 0.4513299763202667,
"eval_runtime": 311.7925,
"eval_samples_per_second": 3.736,
"eval_steps_per_second": 0.468,
"step": 2500
},
{
"epoch": 1.3787210584343992,
"grad_norm": 0.2319420874118805,
"learning_rate": 9.344941956882255e-05,
"loss": 0.3887,
"step": 2501
},
{
"epoch": 1.3792723263506064,
"grad_norm": 0.23304283618927002,
"learning_rate": 9.33665008291874e-05,
"loss": 0.396,
"step": 2502
},
{
"epoch": 1.3798235942668136,
"grad_norm": 0.2571066617965698,
"learning_rate": 9.328358208955223e-05,
"loss": 0.3862,
"step": 2503
},
{
"epoch": 1.380374862183021,
"grad_norm": 0.22332634031772614,
"learning_rate": 9.320066334991707e-05,
"loss": 0.3608,
"step": 2504
},
{
"epoch": 1.3809261300992282,
"grad_norm": 0.2485717236995697,
"learning_rate": 9.31177446102819e-05,
"loss": 0.4238,
"step": 2505
},
{
"epoch": 1.3814773980154356,
"grad_norm": 0.230104461312294,
"learning_rate": 9.303482587064675e-05,
"loss": 0.4036,
"step": 2506
},
{
"epoch": 1.3820286659316428,
"grad_norm": 0.2558598816394806,
"learning_rate": 9.29519071310116e-05,
"loss": 0.3958,
"step": 2507
},
{
"epoch": 1.38257993384785,
"grad_norm": 0.23400071263313293,
"learning_rate": 9.286898839137644e-05,
"loss": 0.3862,
"step": 2508
},
{
"epoch": 1.3831312017640573,
"grad_norm": 0.23237945139408112,
"learning_rate": 9.278606965174129e-05,
"loss": 0.3753,
"step": 2509
},
{
"epoch": 1.3836824696802645,
"grad_norm": 0.2357659935951233,
"learning_rate": 9.270315091210612e-05,
"loss": 0.3826,
"step": 2510
},
{
"epoch": 1.384233737596472,
"grad_norm": 0.2599101960659027,
"learning_rate": 9.262023217247096e-05,
"loss": 0.4028,
"step": 2511
},
{
"epoch": 1.3847850055126791,
"grad_norm": 0.2372962385416031,
"learning_rate": 9.253731343283581e-05,
"loss": 0.4181,
"step": 2512
},
{
"epoch": 1.3853362734288863,
"grad_norm": 0.27277928590774536,
"learning_rate": 9.245439469320065e-05,
"loss": 0.4025,
"step": 2513
},
{
"epoch": 1.3858875413450937,
"grad_norm": 0.22424361109733582,
"learning_rate": 9.237147595356549e-05,
"loss": 0.3735,
"step": 2514
},
{
"epoch": 1.386438809261301,
"grad_norm": 0.2312849462032318,
"learning_rate": 9.228855721393033e-05,
"loss": 0.4009,
"step": 2515
},
{
"epoch": 1.3869900771775083,
"grad_norm": 0.24405118823051453,
"learning_rate": 9.220563847429518e-05,
"loss": 0.4026,
"step": 2516
},
{
"epoch": 1.3875413450937155,
"grad_norm": 0.25049299001693726,
"learning_rate": 9.212271973466002e-05,
"loss": 0.3878,
"step": 2517
},
{
"epoch": 1.3880926130099227,
"grad_norm": 0.23999334871768951,
"learning_rate": 9.203980099502487e-05,
"loss": 0.3758,
"step": 2518
},
{
"epoch": 1.3886438809261301,
"grad_norm": 0.23169536888599396,
"learning_rate": 9.19568822553897e-05,
"loss": 0.3758,
"step": 2519
},
{
"epoch": 1.3891951488423373,
"grad_norm": 0.228010356426239,
"learning_rate": 9.187396351575455e-05,
"loss": 0.3731,
"step": 2520
},
{
"epoch": 1.3897464167585447,
"grad_norm": 0.2497485876083374,
"learning_rate": 9.179104477611939e-05,
"loss": 0.3995,
"step": 2521
},
{
"epoch": 1.390297684674752,
"grad_norm": 0.257614403963089,
"learning_rate": 9.170812603648424e-05,
"loss": 0.3873,
"step": 2522
},
{
"epoch": 1.390848952590959,
"grad_norm": 0.22421546280384064,
"learning_rate": 9.162520729684908e-05,
"loss": 0.3746,
"step": 2523
},
{
"epoch": 1.3914002205071665,
"grad_norm": 0.22990712523460388,
"learning_rate": 9.154228855721392e-05,
"loss": 0.3916,
"step": 2524
},
{
"epoch": 1.3919514884233737,
"grad_norm": 0.24670518934726715,
"learning_rate": 9.145936981757876e-05,
"loss": 0.3983,
"step": 2525
},
{
"epoch": 1.392502756339581,
"grad_norm": 0.23636974394321442,
"learning_rate": 9.137645107794361e-05,
"loss": 0.3776,
"step": 2526
},
{
"epoch": 1.3930540242557883,
"grad_norm": 0.2319977879524231,
"learning_rate": 9.129353233830845e-05,
"loss": 0.3809,
"step": 2527
},
{
"epoch": 1.3936052921719955,
"grad_norm": 0.22971488535404205,
"learning_rate": 9.12106135986733e-05,
"loss": 0.3643,
"step": 2528
},
{
"epoch": 1.3941565600882029,
"grad_norm": 0.24024169147014618,
"learning_rate": 9.112769485903813e-05,
"loss": 0.3915,
"step": 2529
},
{
"epoch": 1.39470782800441,
"grad_norm": 0.22295120358467102,
"learning_rate": 9.104477611940298e-05,
"loss": 0.3702,
"step": 2530
},
{
"epoch": 1.3952590959206175,
"grad_norm": 0.23186278343200684,
"learning_rate": 9.096185737976782e-05,
"loss": 0.3733,
"step": 2531
},
{
"epoch": 1.3958103638368247,
"grad_norm": 0.25662240386009216,
"learning_rate": 9.087893864013267e-05,
"loss": 0.3843,
"step": 2532
},
{
"epoch": 1.3963616317530319,
"grad_norm": 0.24374930560588837,
"learning_rate": 9.079601990049751e-05,
"loss": 0.4025,
"step": 2533
},
{
"epoch": 1.3969128996692393,
"grad_norm": 0.22312727570533752,
"learning_rate": 9.071310116086234e-05,
"loss": 0.3794,
"step": 2534
},
{
"epoch": 1.3974641675854464,
"grad_norm": 0.21616993844509125,
"learning_rate": 9.063018242122719e-05,
"loss": 0.3771,
"step": 2535
},
{
"epoch": 1.3980154355016539,
"grad_norm": 0.24162566661834717,
"learning_rate": 9.054726368159204e-05,
"loss": 0.3797,
"step": 2536
},
{
"epoch": 1.398566703417861,
"grad_norm": 0.24157093465328217,
"learning_rate": 9.046434494195688e-05,
"loss": 0.3815,
"step": 2537
},
{
"epoch": 1.3991179713340682,
"grad_norm": 0.2437802404165268,
"learning_rate": 9.038142620232173e-05,
"loss": 0.3944,
"step": 2538
},
{
"epoch": 1.3996692392502756,
"grad_norm": 0.24138353765010834,
"learning_rate": 9.029850746268656e-05,
"loss": 0.392,
"step": 2539
},
{
"epoch": 1.4002205071664828,
"grad_norm": 0.25548362731933594,
"learning_rate": 9.02155887230514e-05,
"loss": 0.408,
"step": 2540
},
{
"epoch": 1.4007717750826902,
"grad_norm": 0.24517594277858734,
"learning_rate": 9.013266998341625e-05,
"loss": 0.3979,
"step": 2541
},
{
"epoch": 1.4013230429988974,
"grad_norm": 0.24252092838287354,
"learning_rate": 9.00497512437811e-05,
"loss": 0.4122,
"step": 2542
},
{
"epoch": 1.4018743109151046,
"grad_norm": 0.23663447797298431,
"learning_rate": 8.996683250414594e-05,
"loss": 0.3936,
"step": 2543
},
{
"epoch": 1.402425578831312,
"grad_norm": 0.2445666640996933,
"learning_rate": 8.988391376451077e-05,
"loss": 0.3863,
"step": 2544
},
{
"epoch": 1.4029768467475192,
"grad_norm": 0.24747510254383087,
"learning_rate": 8.980099502487562e-05,
"loss": 0.4024,
"step": 2545
},
{
"epoch": 1.4035281146637266,
"grad_norm": 0.22010785341262817,
"learning_rate": 8.971807628524046e-05,
"loss": 0.3765,
"step": 2546
},
{
"epoch": 1.4040793825799338,
"grad_norm": 0.24189656972885132,
"learning_rate": 8.963515754560531e-05,
"loss": 0.3735,
"step": 2547
},
{
"epoch": 1.404630650496141,
"grad_norm": 0.23379263281822205,
"learning_rate": 8.955223880597013e-05,
"loss": 0.3886,
"step": 2548
},
{
"epoch": 1.4051819184123484,
"grad_norm": 0.2319820672273636,
"learning_rate": 8.946932006633497e-05,
"loss": 0.3932,
"step": 2549
},
{
"epoch": 1.4057331863285556,
"grad_norm": 0.2426556944847107,
"learning_rate": 8.938640132669982e-05,
"loss": 0.3579,
"step": 2550
},
{
"epoch": 1.406284454244763,
"grad_norm": 0.23170387744903564,
"learning_rate": 8.930348258706467e-05,
"loss": 0.3657,
"step": 2551
},
{
"epoch": 1.4068357221609702,
"grad_norm": 0.24107246100902557,
"learning_rate": 8.922056384742951e-05,
"loss": 0.4121,
"step": 2552
},
{
"epoch": 1.4073869900771774,
"grad_norm": 0.23268483579158783,
"learning_rate": 8.913764510779434e-05,
"loss": 0.3964,
"step": 2553
},
{
"epoch": 1.4079382579933848,
"grad_norm": 0.24437369406223297,
"learning_rate": 8.905472636815919e-05,
"loss": 0.3886,
"step": 2554
},
{
"epoch": 1.4084895259095922,
"grad_norm": 0.2408677190542221,
"learning_rate": 8.897180762852403e-05,
"loss": 0.4128,
"step": 2555
},
{
"epoch": 1.4090407938257994,
"grad_norm": 0.24828049540519714,
"learning_rate": 8.888888888888888e-05,
"loss": 0.3968,
"step": 2556
},
{
"epoch": 1.4095920617420066,
"grad_norm": 0.25326454639434814,
"learning_rate": 8.880597014925373e-05,
"loss": 0.4163,
"step": 2557
},
{
"epoch": 1.4101433296582138,
"grad_norm": 0.2104220986366272,
"learning_rate": 8.872305140961856e-05,
"loss": 0.3861,
"step": 2558
},
{
"epoch": 1.4106945975744212,
"grad_norm": 0.24456249177455902,
"learning_rate": 8.86401326699834e-05,
"loss": 0.3969,
"step": 2559
},
{
"epoch": 1.4112458654906286,
"grad_norm": 0.23775126039981842,
"learning_rate": 8.855721393034825e-05,
"loss": 0.4024,
"step": 2560
},
{
"epoch": 1.4117971334068358,
"grad_norm": 0.2330765575170517,
"learning_rate": 8.84742951907131e-05,
"loss": 0.3988,
"step": 2561
},
{
"epoch": 1.412348401323043,
"grad_norm": 0.23499152064323425,
"learning_rate": 8.839137645107794e-05,
"loss": 0.4021,
"step": 2562
},
{
"epoch": 1.4128996692392501,
"grad_norm": 0.23784568905830383,
"learning_rate": 8.830845771144277e-05,
"loss": 0.4093,
"step": 2563
},
{
"epoch": 1.4134509371554576,
"grad_norm": 0.25330281257629395,
"learning_rate": 8.822553897180762e-05,
"loss": 0.3896,
"step": 2564
},
{
"epoch": 1.414002205071665,
"grad_norm": 0.2372010052204132,
"learning_rate": 8.814262023217246e-05,
"loss": 0.3887,
"step": 2565
},
{
"epoch": 1.4145534729878722,
"grad_norm": 0.227810338139534,
"learning_rate": 8.805970149253731e-05,
"loss": 0.3727,
"step": 2566
},
{
"epoch": 1.4151047409040793,
"grad_norm": 0.23357363045215607,
"learning_rate": 8.797678275290215e-05,
"loss": 0.3735,
"step": 2567
},
{
"epoch": 1.4156560088202865,
"grad_norm": 0.23767000436782837,
"learning_rate": 8.789386401326699e-05,
"loss": 0.3906,
"step": 2568
},
{
"epoch": 1.416207276736494,
"grad_norm": 0.22021612524986267,
"learning_rate": 8.781094527363183e-05,
"loss": 0.3907,
"step": 2569
},
{
"epoch": 1.4167585446527013,
"grad_norm": 0.22677011787891388,
"learning_rate": 8.772802653399668e-05,
"loss": 0.3568,
"step": 2570
},
{
"epoch": 1.4173098125689085,
"grad_norm": 0.23188649117946625,
"learning_rate": 8.764510779436152e-05,
"loss": 0.3872,
"step": 2571
},
{
"epoch": 1.4178610804851157,
"grad_norm": 0.24772998690605164,
"learning_rate": 8.756218905472637e-05,
"loss": 0.4013,
"step": 2572
},
{
"epoch": 1.418412348401323,
"grad_norm": 0.23278258740901947,
"learning_rate": 8.74792703150912e-05,
"loss": 0.3783,
"step": 2573
},
{
"epoch": 1.4189636163175303,
"grad_norm": 0.24379077553749084,
"learning_rate": 8.739635157545605e-05,
"loss": 0.3929,
"step": 2574
},
{
"epoch": 1.4195148842337377,
"grad_norm": 0.23344534635543823,
"learning_rate": 8.731343283582089e-05,
"loss": 0.3709,
"step": 2575
},
{
"epoch": 1.420066152149945,
"grad_norm": 0.23678019642829895,
"learning_rate": 8.723051409618574e-05,
"loss": 0.3973,
"step": 2576
},
{
"epoch": 1.420617420066152,
"grad_norm": 0.23193979263305664,
"learning_rate": 8.714759535655057e-05,
"loss": 0.3778,
"step": 2577
},
{
"epoch": 1.4211686879823593,
"grad_norm": 0.24555335938930511,
"learning_rate": 8.706467661691541e-05,
"loss": 0.4252,
"step": 2578
},
{
"epoch": 1.4217199558985667,
"grad_norm": 0.22985686361789703,
"learning_rate": 8.698175787728026e-05,
"loss": 0.3896,
"step": 2579
},
{
"epoch": 1.422271223814774,
"grad_norm": 0.24446120858192444,
"learning_rate": 8.68988391376451e-05,
"loss": 0.3969,
"step": 2580
},
{
"epoch": 1.4228224917309813,
"grad_norm": 0.22781571745872498,
"learning_rate": 8.681592039800995e-05,
"loss": 0.3836,
"step": 2581
},
{
"epoch": 1.4233737596471885,
"grad_norm": 0.2543814778327942,
"learning_rate": 8.673300165837478e-05,
"loss": 0.3934,
"step": 2582
},
{
"epoch": 1.4239250275633957,
"grad_norm": 0.2298593968153,
"learning_rate": 8.665008291873963e-05,
"loss": 0.3894,
"step": 2583
},
{
"epoch": 1.424476295479603,
"grad_norm": 0.24680182337760925,
"learning_rate": 8.656716417910447e-05,
"loss": 0.3928,
"step": 2584
},
{
"epoch": 1.4250275633958105,
"grad_norm": 0.2492562234401703,
"learning_rate": 8.648424543946932e-05,
"loss": 0.3793,
"step": 2585
},
{
"epoch": 1.4255788313120177,
"grad_norm": 0.24546745419502258,
"learning_rate": 8.640132669983417e-05,
"loss": 0.3671,
"step": 2586
},
{
"epoch": 1.4261300992282249,
"grad_norm": 0.24431215226650238,
"learning_rate": 8.6318407960199e-05,
"loss": 0.3613,
"step": 2587
},
{
"epoch": 1.426681367144432,
"grad_norm": 0.24530234932899475,
"learning_rate": 8.623548922056384e-05,
"loss": 0.3894,
"step": 2588
},
{
"epoch": 1.4272326350606395,
"grad_norm": 0.2521824240684509,
"learning_rate": 8.615257048092869e-05,
"loss": 0.3938,
"step": 2589
},
{
"epoch": 1.4277839029768469,
"grad_norm": 0.23589465022087097,
"learning_rate": 8.606965174129353e-05,
"loss": 0.377,
"step": 2590
},
{
"epoch": 1.428335170893054,
"grad_norm": 0.22879983484745026,
"learning_rate": 8.598673300165838e-05,
"loss": 0.387,
"step": 2591
},
{
"epoch": 1.4288864388092613,
"grad_norm": 0.2426953762769699,
"learning_rate": 8.59038142620232e-05,
"loss": 0.3921,
"step": 2592
},
{
"epoch": 1.4294377067254687,
"grad_norm": 0.2464035451412201,
"learning_rate": 8.582089552238804e-05,
"loss": 0.3842,
"step": 2593
},
{
"epoch": 1.4299889746416758,
"grad_norm": 0.24871256947517395,
"learning_rate": 8.573797678275289e-05,
"loss": 0.4075,
"step": 2594
},
{
"epoch": 1.4305402425578833,
"grad_norm": 0.22682443261146545,
"learning_rate": 8.565505804311774e-05,
"loss": 0.3538,
"step": 2595
},
{
"epoch": 1.4310915104740904,
"grad_norm": 0.23264093697071075,
"learning_rate": 8.557213930348257e-05,
"loss": 0.3802,
"step": 2596
},
{
"epoch": 1.4316427783902976,
"grad_norm": 0.2368372529745102,
"learning_rate": 8.548922056384741e-05,
"loss": 0.3897,
"step": 2597
},
{
"epoch": 1.432194046306505,
"grad_norm": 0.23906560242176056,
"learning_rate": 8.540630182421226e-05,
"loss": 0.3691,
"step": 2598
},
{
"epoch": 1.4327453142227122,
"grad_norm": 0.22911648452281952,
"learning_rate": 8.53233830845771e-05,
"loss": 0.3829,
"step": 2599
},
{
"epoch": 1.4332965821389196,
"grad_norm": 0.23407630622386932,
"learning_rate": 8.524046434494195e-05,
"loss": 0.3841,
"step": 2600
}
],
"logging_steps": 1,
"max_steps": 3628,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.918731552725244e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}