tod_zero_eiqracdg_supervised / trainer_state.json
Brendan's picture
Upload folder using huggingface_hub
d6eccd2 verified
raw
history blame contribute delete
No virus
103 kB
{
"best_metric": 60.0,
"best_model_checkpoint": "/data/users/bking2/tod_zero/outputs/runs/finetune/starcoder_3b/supervised/eiqracdg/checkpoint-28800",
"epoch": 0.8109888995894369,
"eval_steps": 3200,
"global_step": 32000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.92511785030365,
"learning_rate": 2.5e-06,
"loss": 1.186,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 1.4528826475143433,
"learning_rate": 5e-06,
"loss": 0.287,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 1.285142421722412,
"learning_rate": 4.9999696912850374e-06,
"loss": 0.2339,
"step": 150
},
{
"epoch": 0.01,
"grad_norm": 1.2731701135635376,
"learning_rate": 4.999878765875043e-06,
"loss": 0.2247,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 1.5167465209960938,
"learning_rate": 4.999727225974682e-06,
"loss": 0.2029,
"step": 250
},
{
"epoch": 0.01,
"grad_norm": 1.0947519540786743,
"learning_rate": 4.999515075258341e-06,
"loss": 0.2034,
"step": 300
},
{
"epoch": 0.01,
"grad_norm": 1.1314687728881836,
"learning_rate": 4.999242318870029e-06,
"loss": 0.1977,
"step": 350
},
{
"epoch": 0.01,
"grad_norm": 1.172371506690979,
"learning_rate": 4.998908963423264e-06,
"loss": 0.1924,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 1.348120093345642,
"learning_rate": 4.998515017000907e-06,
"loss": 0.189,
"step": 450
},
{
"epoch": 0.01,
"grad_norm": 1.0465130805969238,
"learning_rate": 4.998060489154965e-06,
"loss": 0.1863,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 1.0949101448059082,
"learning_rate": 4.997545390906362e-06,
"loss": 0.1894,
"step": 550
},
{
"epoch": 0.02,
"grad_norm": 1.2342430353164673,
"learning_rate": 4.996969734744671e-06,
"loss": 0.1842,
"step": 600
},
{
"epoch": 0.02,
"grad_norm": 1.2161908149719238,
"learning_rate": 4.99633353462781e-06,
"loss": 0.1885,
"step": 650
},
{
"epoch": 0.02,
"grad_norm": 1.237592101097107,
"learning_rate": 4.995636805981707e-06,
"loss": 0.188,
"step": 700
},
{
"epoch": 0.02,
"grad_norm": 1.0181180238723755,
"learning_rate": 4.99487956569992e-06,
"loss": 0.1807,
"step": 750
},
{
"epoch": 0.02,
"grad_norm": 1.0869165658950806,
"learning_rate": 4.994061832143235e-06,
"loss": 0.1859,
"step": 800
},
{
"epoch": 0.02,
"grad_norm": 1.115837574005127,
"learning_rate": 4.993183625139212e-06,
"loss": 0.1733,
"step": 850
},
{
"epoch": 0.02,
"grad_norm": 0.9784592986106873,
"learning_rate": 4.992244965981714e-06,
"loss": 0.1781,
"step": 900
},
{
"epoch": 0.02,
"grad_norm": 1.1208012104034424,
"learning_rate": 4.991245877430382e-06,
"loss": 0.1793,
"step": 950
},
{
"epoch": 0.03,
"grad_norm": 1.1315056085586548,
"learning_rate": 4.990186383710089e-06,
"loss": 0.1737,
"step": 1000
},
{
"epoch": 0.03,
"grad_norm": 0.9561248421669006,
"learning_rate": 4.9890665105103484e-06,
"loss": 0.1755,
"step": 1050
},
{
"epoch": 0.03,
"grad_norm": 1.0493712425231934,
"learning_rate": 4.987886284984695e-06,
"loss": 0.1733,
"step": 1100
},
{
"epoch": 0.03,
"grad_norm": 1.0467311143875122,
"learning_rate": 4.986645735750025e-06,
"loss": 0.1742,
"step": 1150
},
{
"epoch": 0.03,
"grad_norm": 0.915791928768158,
"learning_rate": 4.985344892885899e-06,
"loss": 0.1722,
"step": 1200
},
{
"epoch": 0.03,
"grad_norm": 1.0208431482315063,
"learning_rate": 4.98398378793382e-06,
"loss": 0.1722,
"step": 1250
},
{
"epoch": 0.03,
"grad_norm": 1.0767868757247925,
"learning_rate": 4.982562453896458e-06,
"loss": 0.1726,
"step": 1300
},
{
"epoch": 0.03,
"grad_norm": 0.9579061269760132,
"learning_rate": 4.9810809252368615e-06,
"loss": 0.1688,
"step": 1350
},
{
"epoch": 0.04,
"grad_norm": 0.9485280513763428,
"learning_rate": 4.979539237877615e-06,
"loss": 0.1614,
"step": 1400
},
{
"epoch": 0.04,
"grad_norm": 0.9779006838798523,
"learning_rate": 4.977937429199968e-06,
"loss": 0.1694,
"step": 1450
},
{
"epoch": 0.04,
"grad_norm": 0.9930433034896851,
"learning_rate": 4.976275538042932e-06,
"loss": 0.1694,
"step": 1500
},
{
"epoch": 0.04,
"grad_norm": 1.1304377317428589,
"learning_rate": 4.974553604702332e-06,
"loss": 0.1697,
"step": 1550
},
{
"epoch": 0.04,
"grad_norm": 0.9419906139373779,
"learning_rate": 4.972771670929841e-06,
"loss": 0.1678,
"step": 1600
},
{
"epoch": 0.04,
"grad_norm": 0.9315632581710815,
"learning_rate": 4.970929779931955e-06,
"loss": 0.1735,
"step": 1650
},
{
"epoch": 0.04,
"grad_norm": 1.015580415725708,
"learning_rate": 4.969027976368954e-06,
"loss": 0.1706,
"step": 1700
},
{
"epoch": 0.04,
"grad_norm": 1.059506893157959,
"learning_rate": 4.967066306353816e-06,
"loss": 0.1729,
"step": 1750
},
{
"epoch": 0.05,
"grad_norm": 0.8472514152526855,
"learning_rate": 4.9650448174510986e-06,
"loss": 0.1718,
"step": 1800
},
{
"epoch": 0.05,
"grad_norm": 1.6632156372070312,
"learning_rate": 4.9629635586757865e-06,
"loss": 0.1667,
"step": 1850
},
{
"epoch": 0.05,
"grad_norm": 1.0739672183990479,
"learning_rate": 4.960822580492103e-06,
"loss": 0.1654,
"step": 1900
},
{
"epoch": 0.05,
"grad_norm": 0.9874502420425415,
"learning_rate": 4.958621934812286e-06,
"loss": 0.1641,
"step": 1950
},
{
"epoch": 0.05,
"grad_norm": 0.9233792424201965,
"learning_rate": 4.95636167499533e-06,
"loss": 0.1648,
"step": 2000
},
{
"epoch": 0.05,
"grad_norm": 0.8490021228790283,
"learning_rate": 4.9540418558456915e-06,
"loss": 0.1688,
"step": 2050
},
{
"epoch": 0.05,
"grad_norm": 0.9746788144111633,
"learning_rate": 4.951662533611959e-06,
"loss": 0.169,
"step": 2100
},
{
"epoch": 0.05,
"grad_norm": 0.8029181957244873,
"learning_rate": 4.9492237659854946e-06,
"loss": 0.1645,
"step": 2150
},
{
"epoch": 0.06,
"grad_norm": 1.1753889322280884,
"learning_rate": 4.9467256120990255e-06,
"loss": 0.1692,
"step": 2200
},
{
"epoch": 0.06,
"grad_norm": 0.8767470717430115,
"learning_rate": 4.9441681325252215e-06,
"loss": 0.1617,
"step": 2250
},
{
"epoch": 0.06,
"grad_norm": 0.9515625238418579,
"learning_rate": 4.941551389275217e-06,
"loss": 0.1599,
"step": 2300
},
{
"epoch": 0.06,
"grad_norm": 0.9250404238700867,
"learning_rate": 4.938875445797112e-06,
"loss": 0.1678,
"step": 2350
},
{
"epoch": 0.06,
"grad_norm": 0.9468141794204712,
"learning_rate": 4.936140366974434e-06,
"loss": 0.1665,
"step": 2400
},
{
"epoch": 0.06,
"grad_norm": 0.9202454090118408,
"learning_rate": 4.933346219124562e-06,
"loss": 0.1579,
"step": 2450
},
{
"epoch": 0.06,
"grad_norm": 0.9553722739219666,
"learning_rate": 4.93049306999712e-06,
"loss": 0.1706,
"step": 2500
},
{
"epoch": 0.06,
"grad_norm": 0.8615751266479492,
"learning_rate": 4.927580988772336e-06,
"loss": 0.168,
"step": 2550
},
{
"epoch": 0.07,
"grad_norm": 0.9888120293617249,
"learning_rate": 4.9246100460593606e-06,
"loss": 0.1688,
"step": 2600
},
{
"epoch": 0.07,
"grad_norm": 0.915663480758667,
"learning_rate": 4.92158031389456e-06,
"loss": 0.1615,
"step": 2650
},
{
"epoch": 0.07,
"grad_norm": 0.9765244722366333,
"learning_rate": 4.918491865739763e-06,
"loss": 0.1571,
"step": 2700
},
{
"epoch": 0.07,
"grad_norm": 0.847814679145813,
"learning_rate": 4.915344776480487e-06,
"loss": 0.1602,
"step": 2750
},
{
"epoch": 0.07,
"grad_norm": 0.9848460555076599,
"learning_rate": 4.912139122424118e-06,
"loss": 0.1634,
"step": 2800
},
{
"epoch": 0.07,
"grad_norm": 0.8652825355529785,
"learning_rate": 4.908874981298058e-06,
"loss": 0.1594,
"step": 2850
},
{
"epoch": 0.07,
"grad_norm": 0.9553362131118774,
"learning_rate": 4.9055524322478456e-06,
"loss": 0.1681,
"step": 2900
},
{
"epoch": 0.07,
"grad_norm": 0.8901177644729614,
"learning_rate": 4.902171555835236e-06,
"loss": 0.1562,
"step": 2950
},
{
"epoch": 0.08,
"grad_norm": 1.047637701034546,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.1601,
"step": 3000
},
{
"epoch": 0.08,
"grad_norm": 0.8235883116722107,
"learning_rate": 4.895235150239159e-06,
"loss": 0.1666,
"step": 3050
},
{
"epoch": 0.08,
"grad_norm": 1.1097720861434937,
"learning_rate": 4.891679789242524e-06,
"loss": 0.1584,
"step": 3100
},
{
"epoch": 0.08,
"grad_norm": 0.8984447121620178,
"learning_rate": 4.8880664372530765e-06,
"loss": 0.1612,
"step": 3150
},
{
"epoch": 0.08,
"grad_norm": 0.9215020537376404,
"learning_rate": 4.884395181883661e-06,
"loss": 0.1606,
"step": 3200
},
{
"epoch": 0.08,
"grad_norm": 0.8073440790176392,
"learning_rate": 4.880666112151104e-06,
"loss": 0.1624,
"step": 3250
},
{
"epoch": 0.08,
"grad_norm": 0.9907032251358032,
"learning_rate": 4.876879318474056e-06,
"loss": 0.1526,
"step": 3300
},
{
"epoch": 0.08,
"grad_norm": 1.0674902200698853,
"learning_rate": 4.873034892670795e-06,
"loss": 0.1628,
"step": 3350
},
{
"epoch": 0.09,
"grad_norm": 0.886870801448822,
"learning_rate": 4.869132927957007e-06,
"loss": 0.1597,
"step": 3400
},
{
"epoch": 0.09,
"grad_norm": 0.8470545411109924,
"learning_rate": 4.8651735189435205e-06,
"loss": 0.155,
"step": 3450
},
{
"epoch": 0.09,
"grad_norm": 0.8206085562705994,
"learning_rate": 4.861156761634014e-06,
"loss": 0.1607,
"step": 3500
},
{
"epoch": 0.09,
"grad_norm": 0.9522375464439392,
"learning_rate": 4.857082753422691e-06,
"loss": 0.1622,
"step": 3550
},
{
"epoch": 0.09,
"grad_norm": 0.9393265843391418,
"learning_rate": 4.852951593091914e-06,
"loss": 0.1574,
"step": 3600
},
{
"epoch": 0.09,
"grad_norm": 0.89833003282547,
"learning_rate": 4.848763380809811e-06,
"loss": 0.1627,
"step": 3650
},
{
"epoch": 0.09,
"grad_norm": 0.8664096593856812,
"learning_rate": 4.844518218127849e-06,
"loss": 0.1569,
"step": 3700
},
{
"epoch": 0.1,
"grad_norm": 0.8381832838058472,
"learning_rate": 4.840216207978368e-06,
"loss": 0.1585,
"step": 3750
},
{
"epoch": 0.1,
"grad_norm": 0.7903485298156738,
"learning_rate": 4.835857454672087e-06,
"loss": 0.1591,
"step": 3800
},
{
"epoch": 0.1,
"grad_norm": 0.9844456315040588,
"learning_rate": 4.831442063895575e-06,
"loss": 0.1539,
"step": 3850
},
{
"epoch": 0.1,
"grad_norm": 0.8709468245506287,
"learning_rate": 4.8269701427086905e-06,
"loss": 0.1588,
"step": 3900
},
{
"epoch": 0.1,
"grad_norm": 0.9181196093559265,
"learning_rate": 4.822441799541979e-06,
"loss": 0.1569,
"step": 3950
},
{
"epoch": 0.1,
"grad_norm": 0.868691623210907,
"learning_rate": 4.8178571441940515e-06,
"loss": 0.1554,
"step": 4000
},
{
"epoch": 0.1,
"grad_norm": 0.9550264477729797,
"learning_rate": 4.813216287828917e-06,
"loss": 0.1595,
"step": 4050
},
{
"epoch": 0.1,
"grad_norm": 0.9788670539855957,
"learning_rate": 4.808519342973289e-06,
"loss": 0.158,
"step": 4100
},
{
"epoch": 0.11,
"grad_norm": 0.9521192908287048,
"learning_rate": 4.80376642351386e-06,
"loss": 0.1541,
"step": 4150
},
{
"epoch": 0.11,
"grad_norm": 0.9816685318946838,
"learning_rate": 4.798957644694533e-06,
"loss": 0.1555,
"step": 4200
},
{
"epoch": 0.11,
"grad_norm": 0.8248597383499146,
"learning_rate": 4.794093123113635e-06,
"loss": 0.1575,
"step": 4250
},
{
"epoch": 0.11,
"grad_norm": 0.8728579878807068,
"learning_rate": 4.789172976721082e-06,
"loss": 0.1582,
"step": 4300
},
{
"epoch": 0.11,
"grad_norm": 0.8118800520896912,
"learning_rate": 4.7841973248155275e-06,
"loss": 0.1606,
"step": 4350
},
{
"epoch": 0.11,
"grad_norm": 0.8186320066452026,
"learning_rate": 4.779166288041463e-06,
"loss": 0.1515,
"step": 4400
},
{
"epoch": 0.11,
"grad_norm": 0.8599525690078735,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.1604,
"step": 4450
},
{
"epoch": 0.11,
"grad_norm": 0.9356303811073303,
"learning_rate": 4.7689385491773934e-06,
"loss": 0.155,
"step": 4500
},
{
"epoch": 0.12,
"grad_norm": 0.9057179689407349,
"learning_rate": 4.7637420950790855e-06,
"loss": 0.1561,
"step": 4550
},
{
"epoch": 0.12,
"grad_norm": 0.953428328037262,
"learning_rate": 4.75849075208965e-06,
"loss": 0.1624,
"step": 4600
},
{
"epoch": 0.12,
"grad_norm": 0.8495259284973145,
"learning_rate": 4.7531846475382526e-06,
"loss": 0.1583,
"step": 4650
},
{
"epoch": 0.12,
"grad_norm": 0.8086969256401062,
"learning_rate": 4.7478239100818626e-06,
"loss": 0.1514,
"step": 4700
},
{
"epoch": 0.12,
"grad_norm": 0.8309988975524902,
"learning_rate": 4.742408669702131e-06,
"loss": 0.1586,
"step": 4750
},
{
"epoch": 0.12,
"grad_norm": 0.8857250809669495,
"learning_rate": 4.736939057702239e-06,
"loss": 0.1534,
"step": 4800
},
{
"epoch": 0.12,
"grad_norm": 0.9398581981658936,
"learning_rate": 4.731415206703714e-06,
"loss": 0.1522,
"step": 4850
},
{
"epoch": 0.12,
"grad_norm": 0.8459595441818237,
"learning_rate": 4.725837250643218e-06,
"loss": 0.1488,
"step": 4900
},
{
"epoch": 0.13,
"grad_norm": 0.9744529724121094,
"learning_rate": 4.720205324769296e-06,
"loss": 0.153,
"step": 4950
},
{
"epoch": 0.13,
"grad_norm": 0.8761635422706604,
"learning_rate": 4.714519565639095e-06,
"loss": 0.1558,
"step": 5000
},
{
"epoch": 0.13,
"grad_norm": 0.7558119297027588,
"learning_rate": 4.708780111115058e-06,
"loss": 0.1563,
"step": 5050
},
{
"epoch": 0.13,
"grad_norm": 0.9466744661331177,
"learning_rate": 4.702987100361578e-06,
"loss": 0.1578,
"step": 5100
},
{
"epoch": 0.13,
"grad_norm": 0.9422277808189392,
"learning_rate": 4.697140673841624e-06,
"loss": 0.147,
"step": 5150
},
{
"epoch": 0.13,
"grad_norm": 0.7488685846328735,
"learning_rate": 4.6912409733133365e-06,
"loss": 0.1535,
"step": 5200
},
{
"epoch": 0.13,
"grad_norm": 0.8875038027763367,
"learning_rate": 4.685288141826589e-06,
"loss": 0.1491,
"step": 5250
},
{
"epoch": 0.13,
"grad_norm": 1.1153545379638672,
"learning_rate": 4.679282323719519e-06,
"loss": 0.1523,
"step": 5300
},
{
"epoch": 0.14,
"grad_norm": 0.9965418577194214,
"learning_rate": 4.67322366461503e-06,
"loss": 0.1499,
"step": 5350
},
{
"epoch": 0.14,
"grad_norm": 0.7546530961990356,
"learning_rate": 4.66711231141726e-06,
"loss": 0.1517,
"step": 5400
},
{
"epoch": 0.14,
"grad_norm": 0.8970305323600769,
"learning_rate": 4.660948412308018e-06,
"loss": 0.1546,
"step": 5450
},
{
"epoch": 0.14,
"grad_norm": 1.0205148458480835,
"learning_rate": 4.654732116743193e-06,
"loss": 0.1532,
"step": 5500
},
{
"epoch": 0.14,
"grad_norm": 0.8177210092544556,
"learning_rate": 4.64846357544913e-06,
"loss": 0.1437,
"step": 5550
},
{
"epoch": 0.14,
"grad_norm": 1.0768853425979614,
"learning_rate": 4.642142940418973e-06,
"loss": 0.1499,
"step": 5600
},
{
"epoch": 0.14,
"grad_norm": 0.8934694528579712,
"learning_rate": 4.635770364908984e-06,
"loss": 0.1538,
"step": 5650
},
{
"epoch": 0.14,
"grad_norm": 0.9092695713043213,
"learning_rate": 4.629346003434822e-06,
"loss": 0.1507,
"step": 5700
},
{
"epoch": 0.15,
"grad_norm": 0.8804992437362671,
"learning_rate": 4.622870011767798e-06,
"loss": 0.1494,
"step": 5750
},
{
"epoch": 0.15,
"grad_norm": 0.7924209237098694,
"learning_rate": 4.616342546931103e-06,
"loss": 0.155,
"step": 5800
},
{
"epoch": 0.15,
"grad_norm": 0.8898280262947083,
"learning_rate": 4.609763767195991e-06,
"loss": 0.1511,
"step": 5850
},
{
"epoch": 0.15,
"grad_norm": 0.8630085587501526,
"learning_rate": 4.603133832077953e-06,
"loss": 0.1486,
"step": 5900
},
{
"epoch": 0.15,
"grad_norm": 0.7191097140312195,
"learning_rate": 4.596452902332839e-06,
"loss": 0.1497,
"step": 5950
},
{
"epoch": 0.15,
"grad_norm": 0.815141499042511,
"learning_rate": 4.589721139952964e-06,
"loss": 0.1503,
"step": 6000
},
{
"epoch": 0.15,
"grad_norm": 0.8735461831092834,
"learning_rate": 4.582938708163183e-06,
"loss": 0.1532,
"step": 6050
},
{
"epoch": 0.15,
"grad_norm": 0.851352334022522,
"learning_rate": 4.576105771416928e-06,
"loss": 0.1527,
"step": 6100
},
{
"epoch": 0.16,
"grad_norm": 0.7596533298492432,
"learning_rate": 4.569222495392227e-06,
"loss": 0.153,
"step": 6150
},
{
"epoch": 0.16,
"grad_norm": 0.8966355919837952,
"learning_rate": 4.562289046987679e-06,
"loss": 0.1493,
"step": 6200
},
{
"epoch": 0.16,
"grad_norm": 0.8785326480865479,
"learning_rate": 4.555305594318414e-06,
"loss": 0.1458,
"step": 6250
},
{
"epoch": 0.16,
"grad_norm": 0.8890253305435181,
"learning_rate": 4.548272306712013e-06,
"loss": 0.1495,
"step": 6300
},
{
"epoch": 0.16,
"grad_norm": 0.8389493823051453,
"learning_rate": 4.541189354704403e-06,
"loss": 0.1554,
"step": 6350
},
{
"epoch": 0.16,
"grad_norm": 0.8545738458633423,
"learning_rate": 4.534056910035724e-06,
"loss": 0.1512,
"step": 6400
},
{
"epoch": 0.16,
"grad_norm": 0.859368085861206,
"learning_rate": 4.5268751456461605e-06,
"loss": 0.148,
"step": 6450
},
{
"epoch": 0.16,
"grad_norm": 0.9073702692985535,
"learning_rate": 4.5196442356717526e-06,
"loss": 0.1522,
"step": 6500
},
{
"epoch": 0.17,
"grad_norm": 0.8107109069824219,
"learning_rate": 4.512364355440172e-06,
"loss": 0.1506,
"step": 6550
},
{
"epoch": 0.17,
"grad_norm": 0.6515960693359375,
"learning_rate": 4.505035681466472e-06,
"loss": 0.1459,
"step": 6600
},
{
"epoch": 0.17,
"grad_norm": 0.8400324583053589,
"learning_rate": 4.497658391448803e-06,
"loss": 0.1486,
"step": 6650
},
{
"epoch": 0.17,
"grad_norm": 0.8535892367362976,
"learning_rate": 4.49023266426411e-06,
"loss": 0.1533,
"step": 6700
},
{
"epoch": 0.17,
"grad_norm": 0.8867340087890625,
"learning_rate": 4.482758679963792e-06,
"loss": 0.1546,
"step": 6750
},
{
"epoch": 0.17,
"grad_norm": 0.8694542050361633,
"learning_rate": 4.475236619769336e-06,
"loss": 0.1462,
"step": 6800
},
{
"epoch": 0.17,
"grad_norm": 0.8599735498428345,
"learning_rate": 4.4676666660679265e-06,
"loss": 0.1472,
"step": 6850
},
{
"epoch": 0.17,
"grad_norm": 0.8738076686859131,
"learning_rate": 4.460049002408018e-06,
"loss": 0.1521,
"step": 6900
},
{
"epoch": 0.18,
"grad_norm": 0.9029892683029175,
"learning_rate": 4.452383813494887e-06,
"loss": 0.1534,
"step": 6950
},
{
"epoch": 0.18,
"grad_norm": 0.828575849533081,
"learning_rate": 4.444671285186155e-06,
"loss": 0.1487,
"step": 7000
},
{
"epoch": 0.18,
"grad_norm": 0.7742441892623901,
"learning_rate": 4.4369116044872786e-06,
"loss": 0.1449,
"step": 7050
},
{
"epoch": 0.18,
"grad_norm": 0.8644819855690002,
"learning_rate": 4.42910495954702e-06,
"loss": 0.1532,
"step": 7100
},
{
"epoch": 0.18,
"grad_norm": 0.955912172794342,
"learning_rate": 4.421251539652879e-06,
"loss": 0.1479,
"step": 7150
},
{
"epoch": 0.18,
"grad_norm": 0.8100504875183105,
"learning_rate": 4.413351535226507e-06,
"loss": 0.1515,
"step": 7200
},
{
"epoch": 0.18,
"grad_norm": 0.8034056425094604,
"learning_rate": 4.4054051378190915e-06,
"loss": 0.1509,
"step": 7250
},
{
"epoch": 0.19,
"grad_norm": 1.0016452074050903,
"learning_rate": 4.397412540106707e-06,
"loss": 0.1477,
"step": 7300
},
{
"epoch": 0.19,
"grad_norm": 0.7266576290130615,
"learning_rate": 4.3893739358856465e-06,
"loss": 0.1382,
"step": 7350
},
{
"epoch": 0.19,
"grad_norm": 0.8133652210235596,
"learning_rate": 4.38128952006772e-06,
"loss": 0.148,
"step": 7400
},
{
"epoch": 0.19,
"grad_norm": 0.8336594104766846,
"learning_rate": 4.373159488675533e-06,
"loss": 0.1487,
"step": 7450
},
{
"epoch": 0.19,
"grad_norm": 0.7953880429267883,
"learning_rate": 4.364984038837727e-06,
"loss": 0.1471,
"step": 7500
},
{
"epoch": 0.19,
"grad_norm": 0.8772010803222656,
"learning_rate": 4.356763368784207e-06,
"loss": 0.1509,
"step": 7550
},
{
"epoch": 0.19,
"grad_norm": 0.8125504851341248,
"learning_rate": 4.348497677841328e-06,
"loss": 0.1477,
"step": 7600
},
{
"epoch": 0.19,
"grad_norm": 0.8239701390266418,
"learning_rate": 4.340187166427067e-06,
"loss": 0.1439,
"step": 7650
},
{
"epoch": 0.2,
"grad_norm": 0.8961694240570068,
"learning_rate": 4.331832036046162e-06,
"loss": 0.1484,
"step": 7700
},
{
"epoch": 0.2,
"grad_norm": 0.8788717985153198,
"learning_rate": 4.323432489285223e-06,
"loss": 0.1459,
"step": 7750
},
{
"epoch": 0.2,
"grad_norm": 0.8486537337303162,
"learning_rate": 4.3149887298078275e-06,
"loss": 0.1457,
"step": 7800
},
{
"epoch": 0.2,
"grad_norm": 0.914941132068634,
"learning_rate": 4.306500962349573e-06,
"loss": 0.144,
"step": 7850
},
{
"epoch": 0.2,
"grad_norm": 0.8333587050437927,
"learning_rate": 4.2979693927131205e-06,
"loss": 0.1434,
"step": 7900
},
{
"epoch": 0.2,
"grad_norm": 0.8071913719177246,
"learning_rate": 4.289394227763199e-06,
"loss": 0.1498,
"step": 7950
},
{
"epoch": 0.2,
"grad_norm": 0.7681615948677063,
"learning_rate": 4.2807756754215926e-06,
"loss": 0.1461,
"step": 8000
},
{
"epoch": 0.2,
"grad_norm": 0.8993908762931824,
"learning_rate": 4.272113944662099e-06,
"loss": 0.146,
"step": 8050
},
{
"epoch": 0.21,
"grad_norm": 0.8620333671569824,
"learning_rate": 4.263409245505461e-06,
"loss": 0.1444,
"step": 8100
},
{
"epoch": 0.21,
"grad_norm": 0.8208089470863342,
"learning_rate": 4.254661789014274e-06,
"loss": 0.1415,
"step": 8150
},
{
"epoch": 0.21,
"grad_norm": 0.7570805549621582,
"learning_rate": 4.2458717872878715e-06,
"loss": 0.1406,
"step": 8200
},
{
"epoch": 0.21,
"grad_norm": 0.8517532348632812,
"learning_rate": 4.237039453457179e-06,
"loss": 0.1439,
"step": 8250
},
{
"epoch": 0.21,
"grad_norm": 0.7504327893257141,
"learning_rate": 4.228165001679547e-06,
"loss": 0.1408,
"step": 8300
},
{
"epoch": 0.21,
"grad_norm": 0.7522333264350891,
"learning_rate": 4.219248647133559e-06,
"loss": 0.1434,
"step": 8350
},
{
"epoch": 0.21,
"grad_norm": 0.9496443271636963,
"learning_rate": 4.210290606013813e-06,
"loss": 0.1472,
"step": 8400
},
{
"epoch": 0.21,
"grad_norm": 1.0091819763183594,
"learning_rate": 4.2012910955256825e-06,
"loss": 0.1502,
"step": 8450
},
{
"epoch": 0.22,
"grad_norm": 0.9098952412605286,
"learning_rate": 4.192250333880045e-06,
"loss": 0.1488,
"step": 8500
},
{
"epoch": 0.22,
"grad_norm": 0.6374802589416504,
"learning_rate": 4.183168540287995e-06,
"loss": 0.1443,
"step": 8550
},
{
"epoch": 0.22,
"grad_norm": 0.7613967657089233,
"learning_rate": 4.174045934955527e-06,
"loss": 0.149,
"step": 8600
},
{
"epoch": 0.22,
"grad_norm": 0.8770780563354492,
"learning_rate": 4.164882739078197e-06,
"loss": 0.1432,
"step": 8650
},
{
"epoch": 0.22,
"grad_norm": 0.8647637963294983,
"learning_rate": 4.155679174835758e-06,
"loss": 0.1453,
"step": 8700
},
{
"epoch": 0.22,
"grad_norm": 0.8997677564620972,
"learning_rate": 4.146435465386776e-06,
"loss": 0.1429,
"step": 8750
},
{
"epoch": 0.22,
"grad_norm": 0.702021598815918,
"learning_rate": 4.137151834863213e-06,
"loss": 0.1448,
"step": 8800
},
{
"epoch": 0.22,
"grad_norm": 0.8877437710762024,
"learning_rate": 4.1278285083649985e-06,
"loss": 0.1449,
"step": 8850
},
{
"epoch": 0.23,
"grad_norm": 0.8944686055183411,
"learning_rate": 4.11846571195457e-06,
"loss": 0.1522,
"step": 8900
},
{
"epoch": 0.23,
"grad_norm": 0.9450408816337585,
"learning_rate": 4.1090636726513875e-06,
"loss": 0.1453,
"step": 8950
},
{
"epoch": 0.23,
"grad_norm": 0.7170203328132629,
"learning_rate": 4.0996226184264355e-06,
"loss": 0.1469,
"step": 9000
},
{
"epoch": 0.23,
"grad_norm": 0.8969573974609375,
"learning_rate": 4.090142778196692e-06,
"loss": 0.1494,
"step": 9050
},
{
"epoch": 0.23,
"grad_norm": 0.9128451347351074,
"learning_rate": 4.080624381819577e-06,
"loss": 0.1413,
"step": 9100
},
{
"epoch": 0.23,
"grad_norm": 0.8542819023132324,
"learning_rate": 4.071067660087379e-06,
"loss": 0.1418,
"step": 9150
},
{
"epoch": 0.23,
"grad_norm": 0.8860799670219421,
"learning_rate": 4.061472844721664e-06,
"loss": 0.1408,
"step": 9200
},
{
"epoch": 0.23,
"grad_norm": 0.8985223770141602,
"learning_rate": 4.05184016836765e-06,
"loss": 0.1444,
"step": 9250
},
{
"epoch": 0.24,
"grad_norm": 0.7234982252120972,
"learning_rate": 4.042169864588571e-06,
"loss": 0.1469,
"step": 9300
},
{
"epoch": 0.24,
"grad_norm": 0.7840049266815186,
"learning_rate": 4.032462167860012e-06,
"loss": 0.1367,
"step": 9350
},
{
"epoch": 0.24,
"grad_norm": 0.8657572269439697,
"learning_rate": 4.022717313564223e-06,
"loss": 0.1456,
"step": 9400
},
{
"epoch": 0.24,
"grad_norm": 0.8249016404151917,
"learning_rate": 4.012935537984414e-06,
"loss": 0.1398,
"step": 9450
},
{
"epoch": 0.24,
"grad_norm": 0.9602368474006653,
"learning_rate": 4.0031170782990214e-06,
"loss": 0.1465,
"step": 9500
},
{
"epoch": 0.24,
"grad_norm": 0.7444416284561157,
"learning_rate": 3.993262172575962e-06,
"loss": 0.1415,
"step": 9550
},
{
"epoch": 0.24,
"grad_norm": 0.8414493799209595,
"learning_rate": 3.983371059766862e-06,
"loss": 0.1487,
"step": 9600
},
{
"epoch": 0.24,
"grad_norm": 0.8294157385826111,
"learning_rate": 3.973443979701252e-06,
"loss": 0.1406,
"step": 9650
},
{
"epoch": 0.25,
"grad_norm": 0.8009164333343506,
"learning_rate": 3.963481173080768e-06,
"loss": 0.1445,
"step": 9700
},
{
"epoch": 0.25,
"grad_norm": 0.8331242203712463,
"learning_rate": 3.9534828814733e-06,
"loss": 0.1388,
"step": 9750
},
{
"epoch": 0.25,
"grad_norm": 0.8543001413345337,
"learning_rate": 3.943449347307146e-06,
"loss": 0.1399,
"step": 9800
},
{
"epoch": 0.25,
"grad_norm": 0.625985324382782,
"learning_rate": 3.9333808138651265e-06,
"loss": 0.1376,
"step": 9850
},
{
"epoch": 0.25,
"grad_norm": 0.7723961472511292,
"learning_rate": 3.923277525278691e-06,
"loss": 0.1443,
"step": 9900
},
{
"epoch": 0.25,
"grad_norm": 0.8138018846511841,
"learning_rate": 3.913139726521993e-06,
"loss": 0.1358,
"step": 9950
},
{
"epoch": 0.25,
"grad_norm": 0.7179055213928223,
"learning_rate": 3.9029676634059565e-06,
"loss": 0.1376,
"step": 10000
},
{
"epoch": 0.25,
"grad_norm": 0.7832974195480347,
"learning_rate": 3.89276158257231e-06,
"loss": 0.1391,
"step": 10050
},
{
"epoch": 0.26,
"grad_norm": 0.6319305300712585,
"learning_rate": 3.882521731487609e-06,
"loss": 0.1394,
"step": 10100
},
{
"epoch": 0.26,
"grad_norm": 0.8627734184265137,
"learning_rate": 3.872248358437236e-06,
"loss": 0.1437,
"step": 10150
},
{
"epoch": 0.26,
"grad_norm": 0.8356693387031555,
"learning_rate": 3.861941712519379e-06,
"loss": 0.1395,
"step": 10200
},
{
"epoch": 0.26,
"grad_norm": 0.9000275135040283,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.1415,
"step": 10250
},
{
"epoch": 0.26,
"grad_norm": 0.8124093413352966,
"learning_rate": 3.841229602501742e-06,
"loss": 0.1425,
"step": 10300
},
{
"epoch": 0.26,
"grad_norm": 0.7685266137123108,
"learning_rate": 3.8308246406079116e-06,
"loss": 0.1444,
"step": 10350
},
{
"epoch": 0.26,
"grad_norm": 0.858231782913208,
"learning_rate": 3.820387410246324e-06,
"loss": 0.1408,
"step": 10400
},
{
"epoch": 0.26,
"grad_norm": 0.7910386323928833,
"learning_rate": 3.809918164488208e-06,
"loss": 0.1404,
"step": 10450
},
{
"epoch": 0.27,
"grad_norm": 0.7267195582389832,
"learning_rate": 3.7994171571810756e-06,
"loss": 0.1444,
"step": 10500
},
{
"epoch": 0.27,
"grad_norm": 0.9443105459213257,
"learning_rate": 3.788884642942555e-06,
"loss": 0.1427,
"step": 10550
},
{
"epoch": 0.27,
"grad_norm": 0.8015535473823547,
"learning_rate": 3.7783208771542237e-06,
"loss": 0.1386,
"step": 10600
},
{
"epoch": 0.27,
"grad_norm": 0.8959324955940247,
"learning_rate": 3.7677261159554145e-06,
"loss": 0.1428,
"step": 10650
},
{
"epoch": 0.27,
"grad_norm": 0.8010550737380981,
"learning_rate": 3.757100616237006e-06,
"loss": 0.1407,
"step": 10700
},
{
"epoch": 0.27,
"grad_norm": 0.7638524174690247,
"learning_rate": 3.746444635635191e-06,
"loss": 0.1413,
"step": 10750
},
{
"epoch": 0.27,
"grad_norm": 0.8806131482124329,
"learning_rate": 3.735758432525234e-06,
"loss": 0.1392,
"step": 10800
},
{
"epoch": 0.27,
"grad_norm": 0.763507068157196,
"learning_rate": 3.725042266015201e-06,
"loss": 0.1414,
"step": 10850
},
{
"epoch": 0.28,
"grad_norm": 0.7763858437538147,
"learning_rate": 3.7142963959396805e-06,
"loss": 0.142,
"step": 10900
},
{
"epoch": 0.28,
"grad_norm": 0.8945651054382324,
"learning_rate": 3.7035210828534846e-06,
"loss": 0.1444,
"step": 10950
},
{
"epoch": 0.28,
"grad_norm": 0.8169758915901184,
"learning_rate": 3.692716588025327e-06,
"loss": 0.1392,
"step": 11000
},
{
"epoch": 0.28,
"grad_norm": 0.8331303596496582,
"learning_rate": 3.68188317343149e-06,
"loss": 0.1408,
"step": 11050
},
{
"epoch": 0.28,
"grad_norm": 0.7105779051780701,
"learning_rate": 3.671021101749476e-06,
"loss": 0.1369,
"step": 11100
},
{
"epoch": 0.28,
"grad_norm": 0.8562634587287903,
"learning_rate": 3.6601306363516297e-06,
"loss": 0.1476,
"step": 11150
},
{
"epoch": 0.28,
"grad_norm": 0.831794798374176,
"learning_rate": 3.649212041298763e-06,
"loss": 0.141,
"step": 11200
},
{
"epoch": 0.29,
"grad_norm": 0.7753794193267822,
"learning_rate": 3.638265581333742e-06,
"loss": 0.1396,
"step": 11250
},
{
"epoch": 0.29,
"grad_norm": 0.8438231348991394,
"learning_rate": 3.627291521875076e-06,
"loss": 0.1411,
"step": 11300
},
{
"epoch": 0.29,
"grad_norm": 0.8898864388465881,
"learning_rate": 3.616290129010476e-06,
"loss": 0.1363,
"step": 11350
},
{
"epoch": 0.29,
"grad_norm": 0.8584467768669128,
"learning_rate": 3.605261669490407e-06,
"loss": 0.1437,
"step": 11400
},
{
"epoch": 0.29,
"grad_norm": 0.8061217665672302,
"learning_rate": 3.5942064107216183e-06,
"loss": 0.1408,
"step": 11450
},
{
"epoch": 0.29,
"grad_norm": 0.7721576690673828,
"learning_rate": 3.5831246207606597e-06,
"loss": 0.14,
"step": 11500
},
{
"epoch": 0.29,
"grad_norm": 0.7443546056747437,
"learning_rate": 3.57201656830738e-06,
"loss": 0.1416,
"step": 11550
},
{
"epoch": 0.29,
"grad_norm": 0.8907784819602966,
"learning_rate": 3.5608825226984168e-06,
"loss": 0.1409,
"step": 11600
},
{
"epoch": 0.3,
"grad_norm": 0.8339740633964539,
"learning_rate": 3.549722753900662e-06,
"loss": 0.1403,
"step": 11650
},
{
"epoch": 0.3,
"grad_norm": 0.844826340675354,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.135,
"step": 11700
},
{
"epoch": 0.3,
"grad_norm": 0.875929057598114,
"learning_rate": 3.5273271297183302e-06,
"loss": 0.1404,
"step": 11750
},
{
"epoch": 0.3,
"grad_norm": 0.7786159515380859,
"learning_rate": 3.516091817359825e-06,
"loss": 0.1372,
"step": 11800
},
{
"epoch": 0.3,
"grad_norm": 0.9039524793624878,
"learning_rate": 3.5048318678515052e-06,
"loss": 0.1383,
"step": 11850
},
{
"epoch": 0.3,
"grad_norm": 0.8629227876663208,
"learning_rate": 3.493547554213051e-06,
"loss": 0.1365,
"step": 11900
},
{
"epoch": 0.3,
"grad_norm": 0.9458798170089722,
"learning_rate": 3.482239150054898e-06,
"loss": 0.1377,
"step": 11950
},
{
"epoch": 0.3,
"grad_norm": 0.7950788736343384,
"learning_rate": 3.470906929571605e-06,
"loss": 0.1415,
"step": 12000
},
{
"epoch": 0.31,
"grad_norm": 0.9919633865356445,
"learning_rate": 3.459551167535205e-06,
"loss": 0.1375,
"step": 12050
},
{
"epoch": 0.31,
"grad_norm": 0.8692936301231384,
"learning_rate": 3.4481721392885415e-06,
"loss": 0.1406,
"step": 12100
},
{
"epoch": 0.31,
"grad_norm": 0.7293336391448975,
"learning_rate": 3.4367701207385944e-06,
"loss": 0.1379,
"step": 12150
},
{
"epoch": 0.31,
"grad_norm": 0.9827872514724731,
"learning_rate": 3.425345388349787e-06,
"loss": 0.1406,
"step": 12200
},
{
"epoch": 0.31,
"grad_norm": 0.8083829283714294,
"learning_rate": 3.4138982191372838e-06,
"loss": 0.1451,
"step": 12250
},
{
"epoch": 0.31,
"grad_norm": 0.8368284106254578,
"learning_rate": 3.402428890660279e-06,
"loss": 0.1371,
"step": 12300
},
{
"epoch": 0.31,
"grad_norm": 0.7933076024055481,
"learning_rate": 3.390937681015256e-06,
"loss": 0.136,
"step": 12350
},
{
"epoch": 0.31,
"grad_norm": 0.7652096748352051,
"learning_rate": 3.379424868829254e-06,
"loss": 0.1349,
"step": 12400
},
{
"epoch": 0.32,
"grad_norm": 0.9007067084312439,
"learning_rate": 3.367890733253108e-06,
"loss": 0.1457,
"step": 12450
},
{
"epoch": 0.32,
"grad_norm": 0.744931161403656,
"learning_rate": 3.3563355539546795e-06,
"loss": 0.1413,
"step": 12500
},
{
"epoch": 0.32,
"grad_norm": 0.8155565857887268,
"learning_rate": 3.3447596111120767e-06,
"loss": 0.1332,
"step": 12550
},
{
"epoch": 0.32,
"grad_norm": 0.9961397647857666,
"learning_rate": 3.333163185406861e-06,
"loss": 0.1396,
"step": 12600
},
{
"epoch": 0.32,
"grad_norm": 0.7819132208824158,
"learning_rate": 3.321546558017243e-06,
"loss": 0.1355,
"step": 12650
},
{
"epoch": 0.32,
"grad_norm": 0.7664303779602051,
"learning_rate": 3.309910010611259e-06,
"loss": 0.1379,
"step": 12700
},
{
"epoch": 0.32,
"grad_norm": 0.7041974067687988,
"learning_rate": 3.29825382533995e-06,
"loss": 0.1365,
"step": 12750
},
{
"epoch": 0.32,
"grad_norm": 0.8403826355934143,
"learning_rate": 3.286578284830513e-06,
"loss": 0.1435,
"step": 12800
},
{
"epoch": 0.33,
"grad_norm": 0.7580990791320801,
"learning_rate": 3.2748836721794514e-06,
"loss": 0.1364,
"step": 12850
},
{
"epoch": 0.33,
"grad_norm": 0.7609158158302307,
"learning_rate": 3.263170270945709e-06,
"loss": 0.1374,
"step": 12900
},
{
"epoch": 0.33,
"grad_norm": 0.8184811472892761,
"learning_rate": 3.2514383651437987e-06,
"loss": 0.1396,
"step": 12950
},
{
"epoch": 0.33,
"grad_norm": 0.948807954788208,
"learning_rate": 3.239688239236911e-06,
"loss": 0.1339,
"step": 13000
},
{
"epoch": 0.33,
"grad_norm": 0.8595229983329773,
"learning_rate": 3.2279201781300206e-06,
"loss": 0.1374,
"step": 13050
},
{
"epoch": 0.33,
"grad_norm": 0.8162684440612793,
"learning_rate": 3.2161344671629736e-06,
"loss": 0.135,
"step": 13100
},
{
"epoch": 0.33,
"grad_norm": 0.7258612513542175,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.1376,
"step": 13150
},
{
"epoch": 0.33,
"grad_norm": 0.8645867705345154,
"learning_rate": 3.1925112391406534e-06,
"loss": 0.142,
"step": 13200
},
{
"epoch": 0.34,
"grad_norm": 0.7972207069396973,
"learning_rate": 3.1806742948771276e-06,
"loss": 0.1351,
"step": 13250
},
{
"epoch": 0.34,
"grad_norm": 0.9695426225662231,
"learning_rate": 3.168820846323053e-06,
"loss": 0.1403,
"step": 13300
},
{
"epoch": 0.34,
"grad_norm": 0.6566038727760315,
"learning_rate": 3.1569511808886633e-06,
"loss": 0.1342,
"step": 13350
},
{
"epoch": 0.34,
"grad_norm": 0.8785290122032166,
"learning_rate": 3.1450655863774053e-06,
"loss": 0.1391,
"step": 13400
},
{
"epoch": 0.34,
"grad_norm": 0.7746759653091431,
"learning_rate": 3.1331643509789553e-06,
"loss": 0.1385,
"step": 13450
},
{
"epoch": 0.34,
"grad_norm": 0.7730455994606018,
"learning_rate": 3.121247763262235e-06,
"loss": 0.1336,
"step": 13500
},
{
"epoch": 0.34,
"grad_norm": 0.864236056804657,
"learning_rate": 3.1093161121684118e-06,
"loss": 0.1384,
"step": 13550
},
{
"epoch": 0.34,
"grad_norm": 0.9014241099357605,
"learning_rate": 3.097369687003896e-06,
"loss": 0.1302,
"step": 13600
},
{
"epoch": 0.35,
"grad_norm": 0.8704994916915894,
"learning_rate": 3.085408777433323e-06,
"loss": 0.1347,
"step": 13650
},
{
"epoch": 0.35,
"grad_norm": 0.793929934501648,
"learning_rate": 3.0734336734725327e-06,
"loss": 0.1396,
"step": 13700
},
{
"epoch": 0.35,
"grad_norm": 0.7899141907691956,
"learning_rate": 3.0614446654815346e-06,
"loss": 0.1331,
"step": 13750
},
{
"epoch": 0.35,
"grad_norm": 0.7553165555000305,
"learning_rate": 3.049442044157469e-06,
"loss": 0.1326,
"step": 13800
},
{
"epoch": 0.35,
"grad_norm": 0.7848679423332214,
"learning_rate": 3.0374261005275606e-06,
"loss": 0.1379,
"step": 13850
},
{
"epoch": 0.35,
"grad_norm": 0.9563062191009521,
"learning_rate": 3.025397125942056e-06,
"loss": 0.1357,
"step": 13900
},
{
"epoch": 0.35,
"grad_norm": 0.7703486084938049,
"learning_rate": 3.0133554120671653e-06,
"loss": 0.1317,
"step": 13950
},
{
"epoch": 0.35,
"grad_norm": 0.8696340322494507,
"learning_rate": 3.001301250877987e-06,
"loss": 0.1309,
"step": 14000
},
{
"epoch": 0.36,
"grad_norm": 0.7226243615150452,
"learning_rate": 2.9892349346514306e-06,
"loss": 0.1346,
"step": 14050
},
{
"epoch": 0.36,
"grad_norm": 0.8346455097198486,
"learning_rate": 2.977156755959126e-06,
"loss": 0.1421,
"step": 14100
},
{
"epoch": 0.36,
"grad_norm": 0.9874286651611328,
"learning_rate": 2.9650670076603342e-06,
"loss": 0.137,
"step": 14150
},
{
"epoch": 0.36,
"grad_norm": 0.7056658267974854,
"learning_rate": 2.952965982894844e-06,
"loss": 0.1355,
"step": 14200
},
{
"epoch": 0.36,
"grad_norm": 0.8198770880699158,
"learning_rate": 2.9408539750758625e-06,
"loss": 0.1353,
"step": 14250
},
{
"epoch": 0.36,
"grad_norm": 0.8781945109367371,
"learning_rate": 2.9287312778829047e-06,
"loss": 0.1333,
"step": 14300
},
{
"epoch": 0.36,
"grad_norm": 0.8908482193946838,
"learning_rate": 2.9165981852546688e-06,
"loss": 0.14,
"step": 14350
},
{
"epoch": 0.36,
"grad_norm": 0.872043251991272,
"learning_rate": 2.9044549913819125e-06,
"loss": 0.1327,
"step": 14400
},
{
"epoch": 0.37,
"grad_norm": 0.8322109580039978,
"learning_rate": 2.892301990700316e-06,
"loss": 0.1391,
"step": 14450
},
{
"epoch": 0.37,
"grad_norm": 0.8258897662162781,
"learning_rate": 2.8801394778833475e-06,
"loss": 0.1381,
"step": 14500
},
{
"epoch": 0.37,
"grad_norm": 0.9012707471847534,
"learning_rate": 2.8679677478351147e-06,
"loss": 0.1336,
"step": 14550
},
{
"epoch": 0.37,
"grad_norm": 0.9831441044807434,
"learning_rate": 2.8557870956832135e-06,
"loss": 0.135,
"step": 14600
},
{
"epoch": 0.37,
"grad_norm": 0.8136524558067322,
"learning_rate": 2.8435978167715753e-06,
"loss": 0.1367,
"step": 14650
},
{
"epoch": 0.37,
"grad_norm": 0.7816210985183716,
"learning_rate": 2.8314002066533053e-06,
"loss": 0.13,
"step": 14700
},
{
"epoch": 0.37,
"grad_norm": 0.8131623268127441,
"learning_rate": 2.8191945610835138e-06,
"loss": 0.1387,
"step": 14750
},
{
"epoch": 0.38,
"grad_norm": 0.7747004628181458,
"learning_rate": 2.8069811760121463e-06,
"loss": 0.1369,
"step": 14800
},
{
"epoch": 0.38,
"grad_norm": 0.715416669845581,
"learning_rate": 2.794760347576809e-06,
"loss": 0.1353,
"step": 14850
},
{
"epoch": 0.38,
"grad_norm": 0.8204655051231384,
"learning_rate": 2.7825323720955853e-06,
"loss": 0.1377,
"step": 14900
},
{
"epoch": 0.38,
"grad_norm": 0.6974436044692993,
"learning_rate": 2.7702975460598545e-06,
"loss": 0.1352,
"step": 14950
},
{
"epoch": 0.38,
"grad_norm": 0.8297330141067505,
"learning_rate": 2.7580561661271015e-06,
"loss": 0.1385,
"step": 15000
},
{
"epoch": 0.38,
"grad_norm": 0.7207173705101013,
"learning_rate": 2.7458085291137213e-06,
"loss": 0.1352,
"step": 15050
},
{
"epoch": 0.38,
"grad_norm": 0.7374753355979919,
"learning_rate": 2.733554931987825e-06,
"loss": 0.1357,
"step": 15100
},
{
"epoch": 0.38,
"grad_norm": 0.8534835577011108,
"learning_rate": 2.7212956718620404e-06,
"loss": 0.1286,
"step": 15150
},
{
"epoch": 0.39,
"grad_norm": 0.8803167939186096,
"learning_rate": 2.709031045986302e-06,
"loss": 0.1302,
"step": 15200
},
{
"epoch": 0.39,
"grad_norm": 0.7593905925750732,
"learning_rate": 2.6967613517406514e-06,
"loss": 0.1344,
"step": 15250
},
{
"epoch": 0.39,
"grad_norm": 0.8493793606758118,
"learning_rate": 2.68448688662802e-06,
"loss": 0.133,
"step": 15300
},
{
"epoch": 0.39,
"grad_norm": 0.7607067823410034,
"learning_rate": 2.6722079482670196e-06,
"loss": 0.1355,
"step": 15350
},
{
"epoch": 0.39,
"grad_norm": 0.6717010736465454,
"learning_rate": 2.6599248343847243e-06,
"loss": 0.1337,
"step": 15400
},
{
"epoch": 0.39,
"grad_norm": 0.8807181715965271,
"learning_rate": 2.6476378428094523e-06,
"loss": 0.1308,
"step": 15450
},
{
"epoch": 0.39,
"grad_norm": 0.7503638863563538,
"learning_rate": 2.6353472714635443e-06,
"loss": 0.1358,
"step": 15500
},
{
"epoch": 0.39,
"grad_norm": 0.8454340100288391,
"learning_rate": 2.6230534183561385e-06,
"loss": 0.1349,
"step": 15550
},
{
"epoch": 0.4,
"grad_norm": 0.7447136640548706,
"learning_rate": 2.6107565815759473e-06,
"loss": 0.1366,
"step": 15600
},
{
"epoch": 0.4,
"grad_norm": 0.891941249370575,
"learning_rate": 2.598457059284027e-06,
"loss": 0.1387,
"step": 15650
},
{
"epoch": 0.4,
"grad_norm": 0.8272932767868042,
"learning_rate": 2.5861551497065497e-06,
"loss": 0.1324,
"step": 15700
},
{
"epoch": 0.4,
"grad_norm": 0.8277787566184998,
"learning_rate": 2.5738511511275716e-06,
"loss": 0.1344,
"step": 15750
},
{
"epoch": 0.4,
"grad_norm": 0.7462809681892395,
"learning_rate": 2.5615453618818033e-06,
"loss": 0.1289,
"step": 15800
},
{
"epoch": 0.4,
"grad_norm": 0.6402873396873474,
"learning_rate": 2.5492380803473705e-06,
"loss": 0.1296,
"step": 15850
},
{
"epoch": 0.4,
"grad_norm": 0.8029218912124634,
"learning_rate": 2.5369296049385837e-06,
"loss": 0.1354,
"step": 15900
},
{
"epoch": 0.4,
"grad_norm": 0.912257969379425,
"learning_rate": 2.5246202340987004e-06,
"loss": 0.129,
"step": 15950
},
{
"epoch": 0.41,
"grad_norm": 0.8321228623390198,
"learning_rate": 2.5123102662926912e-06,
"loss": 0.1346,
"step": 16000
},
{
"epoch": 0.41,
"grad_norm": 0.7897807955741882,
"learning_rate": 2.5e-06,
"loss": 0.1343,
"step": 16050
},
{
"epoch": 0.41,
"grad_norm": 0.8118634223937988,
"learning_rate": 2.4876897337073105e-06,
"loss": 0.1316,
"step": 16100
},
{
"epoch": 0.41,
"grad_norm": 0.8614838123321533,
"learning_rate": 2.475379765901301e-06,
"loss": 0.1356,
"step": 16150
},
{
"epoch": 0.41,
"grad_norm": 0.8548163175582886,
"learning_rate": 2.4630703950614176e-06,
"loss": 0.1342,
"step": 16200
},
{
"epoch": 0.41,
"grad_norm": 0.8339961171150208,
"learning_rate": 2.45076191965263e-06,
"loss": 0.1253,
"step": 16250
},
{
"epoch": 0.41,
"grad_norm": 0.7520368695259094,
"learning_rate": 2.4384546381181975e-06,
"loss": 0.1281,
"step": 16300
},
{
"epoch": 0.41,
"grad_norm": 0.8519024848937988,
"learning_rate": 2.4261488488724284e-06,
"loss": 0.1339,
"step": 16350
},
{
"epoch": 0.42,
"grad_norm": 0.6523202061653137,
"learning_rate": 2.413844850293451e-06,
"loss": 0.1354,
"step": 16400
},
{
"epoch": 0.42,
"grad_norm": 0.8685447573661804,
"learning_rate": 2.4015429407159746e-06,
"loss": 0.134,
"step": 16450
},
{
"epoch": 0.42,
"grad_norm": 0.7570633292198181,
"learning_rate": 2.3892434184240536e-06,
"loss": 0.131,
"step": 16500
},
{
"epoch": 0.42,
"grad_norm": 0.8787110447883606,
"learning_rate": 2.3769465816438627e-06,
"loss": 0.1305,
"step": 16550
},
{
"epoch": 0.42,
"grad_norm": 0.8211491107940674,
"learning_rate": 2.3646527285364565e-06,
"loss": 0.133,
"step": 16600
},
{
"epoch": 0.42,
"grad_norm": 0.8259795308113098,
"learning_rate": 2.3523621571905485e-06,
"loss": 0.1324,
"step": 16650
},
{
"epoch": 0.42,
"grad_norm": 0.8258258700370789,
"learning_rate": 2.340075165615276e-06,
"loss": 0.1305,
"step": 16700
},
{
"epoch": 0.42,
"grad_norm": 0.8116273283958435,
"learning_rate": 2.3277920517329813e-06,
"loss": 0.1341,
"step": 16750
},
{
"epoch": 0.43,
"grad_norm": 0.8958277702331543,
"learning_rate": 2.315513113371981e-06,
"loss": 0.1274,
"step": 16800
},
{
"epoch": 0.43,
"grad_norm": 0.7367030382156372,
"learning_rate": 2.303238648259349e-06,
"loss": 0.1276,
"step": 16850
},
{
"epoch": 0.43,
"grad_norm": 0.7800854444503784,
"learning_rate": 2.2909689540136986e-06,
"loss": 0.1347,
"step": 16900
},
{
"epoch": 0.43,
"grad_norm": 0.7913332581520081,
"learning_rate": 2.27870432813796e-06,
"loss": 0.1293,
"step": 16950
},
{
"epoch": 0.43,
"grad_norm": 0.7659708261489868,
"learning_rate": 2.2664450680121757e-06,
"loss": 0.1334,
"step": 17000
},
{
"epoch": 0.43,
"grad_norm": 0.7981456518173218,
"learning_rate": 2.254191470886279e-06,
"loss": 0.1356,
"step": 17050
},
{
"epoch": 0.43,
"grad_norm": 0.8289954662322998,
"learning_rate": 2.241943833872899e-06,
"loss": 0.1305,
"step": 17100
},
{
"epoch": 0.43,
"grad_norm": 0.8367072343826294,
"learning_rate": 2.2297024539401463e-06,
"loss": 0.1307,
"step": 17150
},
{
"epoch": 0.44,
"grad_norm": 0.8345316648483276,
"learning_rate": 2.2174676279044155e-06,
"loss": 0.1283,
"step": 17200
},
{
"epoch": 0.44,
"grad_norm": 0.8068302869796753,
"learning_rate": 2.2052396524231924e-06,
"loss": 0.1342,
"step": 17250
},
{
"epoch": 0.44,
"grad_norm": 0.811231791973114,
"learning_rate": 2.193018823987854e-06,
"loss": 0.1374,
"step": 17300
},
{
"epoch": 0.44,
"grad_norm": 0.8022611141204834,
"learning_rate": 2.180805438916487e-06,
"loss": 0.1296,
"step": 17350
},
{
"epoch": 0.44,
"grad_norm": 0.8078568577766418,
"learning_rate": 2.1685997933466947e-06,
"loss": 0.1273,
"step": 17400
},
{
"epoch": 0.44,
"grad_norm": 0.749242901802063,
"learning_rate": 2.1564021832284255e-06,
"loss": 0.1302,
"step": 17450
},
{
"epoch": 0.44,
"grad_norm": 0.8693391680717468,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.1296,
"step": 17500
},
{
"epoch": 0.44,
"grad_norm": 0.9399083852767944,
"learning_rate": 2.1320322521648857e-06,
"loss": 0.127,
"step": 17550
},
{
"epoch": 0.45,
"grad_norm": 0.8048396110534668,
"learning_rate": 2.119860522116653e-06,
"loss": 0.128,
"step": 17600
},
{
"epoch": 0.45,
"grad_norm": 0.7869442701339722,
"learning_rate": 2.1076980092996845e-06,
"loss": 0.1324,
"step": 17650
},
{
"epoch": 0.45,
"grad_norm": 0.8741517066955566,
"learning_rate": 2.0955450086180883e-06,
"loss": 0.1351,
"step": 17700
},
{
"epoch": 0.45,
"grad_norm": 0.9408276081085205,
"learning_rate": 2.083401814745332e-06,
"loss": 0.1331,
"step": 17750
},
{
"epoch": 0.45,
"grad_norm": 0.8211988210678101,
"learning_rate": 2.071268722117096e-06,
"loss": 0.1281,
"step": 17800
},
{
"epoch": 0.45,
"grad_norm": 0.8505455851554871,
"learning_rate": 2.0591460249241383e-06,
"loss": 0.1296,
"step": 17850
},
{
"epoch": 0.45,
"grad_norm": 0.8733687400817871,
"learning_rate": 2.0470340171051567e-06,
"loss": 0.1339,
"step": 17900
},
{
"epoch": 0.45,
"grad_norm": 0.8266494274139404,
"learning_rate": 2.034932992339666e-06,
"loss": 0.1283,
"step": 17950
},
{
"epoch": 0.46,
"grad_norm": 0.7836053967475891,
"learning_rate": 2.022843244040874e-06,
"loss": 0.1243,
"step": 18000
},
{
"epoch": 0.46,
"grad_norm": 0.9171106815338135,
"learning_rate": 2.0107650653485707e-06,
"loss": 0.1341,
"step": 18050
},
{
"epoch": 0.46,
"grad_norm": 0.796025812625885,
"learning_rate": 1.998698749122014e-06,
"loss": 0.1336,
"step": 18100
},
{
"epoch": 0.46,
"grad_norm": 0.8582082986831665,
"learning_rate": 1.986644587932835e-06,
"loss": 0.1311,
"step": 18150
},
{
"epoch": 0.46,
"grad_norm": 0.7605078220367432,
"learning_rate": 1.9746028740579453e-06,
"loss": 0.1288,
"step": 18200
},
{
"epoch": 0.46,
"grad_norm": 0.8088697195053101,
"learning_rate": 1.96257389947244e-06,
"loss": 0.1285,
"step": 18250
},
{
"epoch": 0.46,
"grad_norm": 0.8578440546989441,
"learning_rate": 1.9505579558425315e-06,
"loss": 0.1346,
"step": 18300
},
{
"epoch": 0.47,
"grad_norm": 0.7750033140182495,
"learning_rate": 1.938555334518466e-06,
"loss": 0.1306,
"step": 18350
},
{
"epoch": 0.47,
"grad_norm": 0.8363983631134033,
"learning_rate": 1.926566326527468e-06,
"loss": 0.1313,
"step": 18400
},
{
"epoch": 0.47,
"grad_norm": 0.7722070813179016,
"learning_rate": 1.914591222566678e-06,
"loss": 0.1337,
"step": 18450
},
{
"epoch": 0.47,
"grad_norm": 0.8674912452697754,
"learning_rate": 1.9026303129961049e-06,
"loss": 0.1268,
"step": 18500
},
{
"epoch": 0.47,
"grad_norm": 1.0055360794067383,
"learning_rate": 1.8906838878315886e-06,
"loss": 0.1299,
"step": 18550
},
{
"epoch": 0.47,
"grad_norm": 0.797717273235321,
"learning_rate": 1.878752236737765e-06,
"loss": 0.1261,
"step": 18600
},
{
"epoch": 0.47,
"grad_norm": 0.8092938661575317,
"learning_rate": 1.8668356490210449e-06,
"loss": 0.1362,
"step": 18650
},
{
"epoch": 0.47,
"grad_norm": 0.8348585367202759,
"learning_rate": 1.8549344136225946e-06,
"loss": 0.1276,
"step": 18700
},
{
"epoch": 0.48,
"grad_norm": 0.8768981099128723,
"learning_rate": 1.8430488191113373e-06,
"loss": 0.1332,
"step": 18750
},
{
"epoch": 0.48,
"grad_norm": 0.7733373045921326,
"learning_rate": 1.8311791536769485e-06,
"loss": 0.134,
"step": 18800
},
{
"epoch": 0.48,
"grad_norm": 0.8418094515800476,
"learning_rate": 1.819325705122873e-06,
"loss": 0.133,
"step": 18850
},
{
"epoch": 0.48,
"grad_norm": 0.8411191701889038,
"learning_rate": 1.8074887608593477e-06,
"loss": 0.1304,
"step": 18900
},
{
"epoch": 0.48,
"grad_norm": 0.9010744094848633,
"learning_rate": 1.7956686078964257e-06,
"loss": 0.131,
"step": 18950
},
{
"epoch": 0.48,
"grad_norm": 0.8585550785064697,
"learning_rate": 1.7838655328370268e-06,
"loss": 0.1308,
"step": 19000
},
{
"epoch": 0.48,
"grad_norm": 0.8417059183120728,
"learning_rate": 1.7720798218699798e-06,
"loss": 0.1282,
"step": 19050
},
{
"epoch": 0.48,
"grad_norm": 0.8636181354522705,
"learning_rate": 1.7603117607630892e-06,
"loss": 0.1256,
"step": 19100
},
{
"epoch": 0.49,
"grad_norm": 0.8818506598472595,
"learning_rate": 1.7485616348562023e-06,
"loss": 0.1306,
"step": 19150
},
{
"epoch": 0.49,
"grad_norm": 0.8800364136695862,
"learning_rate": 1.7368297290542918e-06,
"loss": 0.1303,
"step": 19200
},
{
"epoch": 0.49,
"grad_norm": 0.8845418095588684,
"learning_rate": 1.72511632782055e-06,
"loss": 0.131,
"step": 19250
},
{
"epoch": 0.49,
"grad_norm": 0.9156243205070496,
"learning_rate": 1.7134217151694873e-06,
"loss": 0.1269,
"step": 19300
},
{
"epoch": 0.49,
"grad_norm": 0.772602915763855,
"learning_rate": 1.7017461746600506e-06,
"loss": 0.1321,
"step": 19350
},
{
"epoch": 0.49,
"grad_norm": 0.7767558097839355,
"learning_rate": 1.690089989388741e-06,
"loss": 0.1302,
"step": 19400
},
{
"epoch": 0.49,
"grad_norm": 0.816466212272644,
"learning_rate": 1.678453441982758e-06,
"loss": 0.132,
"step": 19450
},
{
"epoch": 0.49,
"grad_norm": 0.746376633644104,
"learning_rate": 1.66683681459314e-06,
"loss": 0.1321,
"step": 19500
},
{
"epoch": 0.5,
"grad_norm": 0.9455410242080688,
"learning_rate": 1.6552403888879243e-06,
"loss": 0.1265,
"step": 19550
},
{
"epoch": 0.5,
"grad_norm": 0.9499367475509644,
"learning_rate": 1.6436644460453218e-06,
"loss": 0.1266,
"step": 19600
},
{
"epoch": 0.5,
"grad_norm": 0.8672183156013489,
"learning_rate": 1.6321092667468926e-06,
"loss": 0.1288,
"step": 19650
},
{
"epoch": 0.5,
"grad_norm": 0.7655101418495178,
"learning_rate": 1.6205751311707463e-06,
"loss": 0.1346,
"step": 19700
},
{
"epoch": 0.5,
"grad_norm": 0.7775772213935852,
"learning_rate": 1.6090623189847443e-06,
"loss": 0.1276,
"step": 19750
},
{
"epoch": 0.5,
"grad_norm": 0.8449127078056335,
"learning_rate": 1.5975711093397223e-06,
"loss": 0.1323,
"step": 19800
},
{
"epoch": 0.5,
"grad_norm": 0.9375335574150085,
"learning_rate": 1.5861017808627167e-06,
"loss": 0.1338,
"step": 19850
},
{
"epoch": 0.5,
"grad_norm": 0.7458611130714417,
"learning_rate": 1.574654611650214e-06,
"loss": 0.1307,
"step": 19900
},
{
"epoch": 0.51,
"grad_norm": 0.749284029006958,
"learning_rate": 1.5632298792614064e-06,
"loss": 0.1216,
"step": 19950
},
{
"epoch": 0.51,
"grad_norm": 0.8348965644836426,
"learning_rate": 1.5518278607114585e-06,
"loss": 0.1253,
"step": 20000
},
{
"epoch": 0.51,
"grad_norm": 0.9566074013710022,
"learning_rate": 1.540448832464796e-06,
"loss": 0.1289,
"step": 20050
},
{
"epoch": 0.51,
"grad_norm": 0.8038182854652405,
"learning_rate": 1.5290930704283953e-06,
"loss": 0.1266,
"step": 20100
},
{
"epoch": 0.51,
"grad_norm": 0.8434205055236816,
"learning_rate": 1.517760849945103e-06,
"loss": 0.1276,
"step": 20150
},
{
"epoch": 0.51,
"grad_norm": 0.7553625702857971,
"learning_rate": 1.5064524457869506e-06,
"loss": 0.128,
"step": 20200
},
{
"epoch": 0.51,
"grad_norm": 0.7524191737174988,
"learning_rate": 1.4951681321484952e-06,
"loss": 0.1313,
"step": 20250
},
{
"epoch": 0.51,
"grad_norm": 0.8314839601516724,
"learning_rate": 1.4839081826401756e-06,
"loss": 0.1255,
"step": 20300
},
{
"epoch": 0.52,
"grad_norm": 0.8954294323921204,
"learning_rate": 1.47267287028167e-06,
"loss": 0.1259,
"step": 20350
},
{
"epoch": 0.52,
"grad_norm": 0.7757198810577393,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.1288,
"step": 20400
},
{
"epoch": 0.52,
"grad_norm": 0.9685442447662354,
"learning_rate": 1.4502772460993387e-06,
"loss": 0.1313,
"step": 20450
},
{
"epoch": 0.52,
"grad_norm": 0.8876562118530273,
"learning_rate": 1.4391174773015836e-06,
"loss": 0.1326,
"step": 20500
},
{
"epoch": 0.52,
"grad_norm": 1.0486761331558228,
"learning_rate": 1.4279834316926217e-06,
"loss": 0.126,
"step": 20550
},
{
"epoch": 0.52,
"grad_norm": 0.806884765625,
"learning_rate": 1.4168753792393413e-06,
"loss": 0.1286,
"step": 20600
},
{
"epoch": 0.52,
"grad_norm": 0.8478713631629944,
"learning_rate": 1.405793589278382e-06,
"loss": 0.1248,
"step": 20650
},
{
"epoch": 0.52,
"grad_norm": 1.0112961530685425,
"learning_rate": 1.394738330509593e-06,
"loss": 0.1265,
"step": 20700
},
{
"epoch": 0.53,
"grad_norm": 0.9463728666305542,
"learning_rate": 1.3837098709895246e-06,
"loss": 0.1274,
"step": 20750
},
{
"epoch": 0.53,
"grad_norm": 0.8236443996429443,
"learning_rate": 1.3727084781249251e-06,
"loss": 0.1287,
"step": 20800
},
{
"epoch": 0.53,
"grad_norm": 0.8493944406509399,
"learning_rate": 1.3617344186662585e-06,
"loss": 0.1252,
"step": 20850
},
{
"epoch": 0.53,
"grad_norm": 0.7850596904754639,
"learning_rate": 1.3507879587012378e-06,
"loss": 0.1199,
"step": 20900
},
{
"epoch": 0.53,
"grad_norm": 1.119826316833496,
"learning_rate": 1.3398693636483707e-06,
"loss": 0.1259,
"step": 20950
},
{
"epoch": 0.53,
"grad_norm": 0.8658459782600403,
"learning_rate": 1.328978898250525e-06,
"loss": 0.1276,
"step": 21000
},
{
"epoch": 0.53,
"grad_norm": 0.8346827030181885,
"learning_rate": 1.31811682656851e-06,
"loss": 0.126,
"step": 21050
},
{
"epoch": 0.53,
"grad_norm": 1.0707381963729858,
"learning_rate": 1.307283411974674e-06,
"loss": 0.1266,
"step": 21100
},
{
"epoch": 0.54,
"grad_norm": 0.8419653177261353,
"learning_rate": 1.2964789171465164e-06,
"loss": 0.1253,
"step": 21150
},
{
"epoch": 0.54,
"grad_norm": 0.6659058332443237,
"learning_rate": 1.2857036040603204e-06,
"loss": 0.1251,
"step": 21200
},
{
"epoch": 0.54,
"grad_norm": 0.9063161015510559,
"learning_rate": 1.2749577339848007e-06,
"loss": 0.1245,
"step": 21250
},
{
"epoch": 0.54,
"grad_norm": 0.9174826741218567,
"learning_rate": 1.2642415674747675e-06,
"loss": 0.1322,
"step": 21300
},
{
"epoch": 0.54,
"grad_norm": 0.8082279562950134,
"learning_rate": 1.25355536436481e-06,
"loss": 0.1294,
"step": 21350
},
{
"epoch": 0.54,
"grad_norm": 0.9118791222572327,
"learning_rate": 1.2428993837629943e-06,
"loss": 0.1277,
"step": 21400
},
{
"epoch": 0.54,
"grad_norm": 0.9657939076423645,
"learning_rate": 1.2322738840445867e-06,
"loss": 0.1281,
"step": 21450
},
{
"epoch": 0.54,
"grad_norm": 0.8594872355461121,
"learning_rate": 1.2216791228457778e-06,
"loss": 0.1193,
"step": 21500
},
{
"epoch": 0.55,
"grad_norm": 1.0133215188980103,
"learning_rate": 1.2111153570574454e-06,
"loss": 0.1275,
"step": 21550
},
{
"epoch": 0.55,
"grad_norm": 0.8617738485336304,
"learning_rate": 1.2005828428189256e-06,
"loss": 0.1269,
"step": 21600
},
{
"epoch": 0.55,
"grad_norm": 0.7881460189819336,
"learning_rate": 1.1900818355117918e-06,
"loss": 0.1251,
"step": 21650
},
{
"epoch": 0.55,
"grad_norm": 0.8519832491874695,
"learning_rate": 1.1796125897536782e-06,
"loss": 0.1311,
"step": 21700
},
{
"epoch": 0.55,
"grad_norm": 0.8465796113014221,
"learning_rate": 1.1691753593920884e-06,
"loss": 0.1293,
"step": 21750
},
{
"epoch": 0.55,
"grad_norm": 0.892995297908783,
"learning_rate": 1.1587703974982583e-06,
"loss": 0.1272,
"step": 21800
},
{
"epoch": 0.55,
"grad_norm": 0.8453019261360168,
"learning_rate": 1.148397956361007e-06,
"loss": 0.1267,
"step": 21850
},
{
"epoch": 0.56,
"grad_norm": 0.7927883863449097,
"learning_rate": 1.1380582874806208e-06,
"loss": 0.1274,
"step": 21900
},
{
"epoch": 0.56,
"grad_norm": 0.848616898059845,
"learning_rate": 1.127751641562765e-06,
"loss": 0.1318,
"step": 21950
},
{
"epoch": 0.56,
"grad_norm": 0.8871294260025024,
"learning_rate": 1.1174782685123919e-06,
"loss": 0.1278,
"step": 22000
},
{
"epoch": 0.56,
"grad_norm": 0.9262788891792297,
"learning_rate": 1.107238417427691e-06,
"loss": 0.1242,
"step": 22050
},
{
"epoch": 0.56,
"grad_norm": 1.0166550874710083,
"learning_rate": 1.0970323365940443e-06,
"loss": 0.1271,
"step": 22100
},
{
"epoch": 0.56,
"grad_norm": 0.9763181805610657,
"learning_rate": 1.0868602734780075e-06,
"loss": 0.1266,
"step": 22150
},
{
"epoch": 0.56,
"grad_norm": 0.8121241927146912,
"learning_rate": 1.0767224747213102e-06,
"loss": 0.1286,
"step": 22200
},
{
"epoch": 0.56,
"grad_norm": 0.7165303230285645,
"learning_rate": 1.0666191861348741e-06,
"loss": 0.1275,
"step": 22250
},
{
"epoch": 0.57,
"grad_norm": 1.0297795534133911,
"learning_rate": 1.0565506526928548e-06,
"loss": 0.1251,
"step": 22300
},
{
"epoch": 0.57,
"grad_norm": 0.8937394022941589,
"learning_rate": 1.0465171185267007e-06,
"loss": 0.1297,
"step": 22350
},
{
"epoch": 0.57,
"grad_norm": 0.9162909388542175,
"learning_rate": 1.036518826919233e-06,
"loss": 0.1325,
"step": 22400
},
{
"epoch": 0.57,
"grad_norm": 0.8973715305328369,
"learning_rate": 1.0265560202987474e-06,
"loss": 0.1296,
"step": 22450
},
{
"epoch": 0.57,
"grad_norm": 0.8906801342964172,
"learning_rate": 1.0166289402331391e-06,
"loss": 0.128,
"step": 22500
},
{
"epoch": 0.57,
"grad_norm": 0.7446568608283997,
"learning_rate": 1.006737827424038e-06,
"loss": 0.1238,
"step": 22550
},
{
"epoch": 0.57,
"grad_norm": 1.0760340690612793,
"learning_rate": 9.9688292170098e-07,
"loss": 0.129,
"step": 22600
},
{
"epoch": 0.57,
"grad_norm": 0.9284375309944153,
"learning_rate": 9.870644620155878e-07,
"loss": 0.125,
"step": 22650
},
{
"epoch": 0.58,
"grad_norm": 0.9257674217224121,
"learning_rate": 9.77282686435777e-07,
"loss": 0.1272,
"step": 22700
},
{
"epoch": 0.58,
"grad_norm": 0.7764928936958313,
"learning_rate": 9.67537832139989e-07,
"loss": 0.1224,
"step": 22750
},
{
"epoch": 0.58,
"grad_norm": 0.7755612730979919,
"learning_rate": 9.578301354114292e-07,
"loss": 0.1279,
"step": 22800
},
{
"epoch": 0.58,
"grad_norm": 0.847436785697937,
"learning_rate": 9.481598316323504e-07,
"loss": 0.1273,
"step": 22850
},
{
"epoch": 0.58,
"grad_norm": 0.8615244030952454,
"learning_rate": 9.385271552783376e-07,
"loss": 0.1267,
"step": 22900
},
{
"epoch": 0.58,
"grad_norm": 0.8775952458381653,
"learning_rate": 9.289323399126216e-07,
"loss": 0.1276,
"step": 22950
},
{
"epoch": 0.58,
"grad_norm": 0.9326611161231995,
"learning_rate": 9.193756181804248e-07,
"loss": 0.1296,
"step": 23000
},
{
"epoch": 0.58,
"grad_norm": 0.6743093132972717,
"learning_rate": 9.098572218033084e-07,
"loss": 0.1279,
"step": 23050
},
{
"epoch": 0.59,
"grad_norm": 0.9707579612731934,
"learning_rate": 9.003773815735644e-07,
"loss": 0.1283,
"step": 23100
},
{
"epoch": 0.59,
"grad_norm": 0.8160597681999207,
"learning_rate": 8.90936327348613e-07,
"loss": 0.127,
"step": 23150
},
{
"epoch": 0.59,
"grad_norm": 1.070728063583374,
"learning_rate": 8.815342880454312e-07,
"loss": 0.1265,
"step": 23200
},
{
"epoch": 0.59,
"grad_norm": 0.945746123790741,
"learning_rate": 8.721714916350019e-07,
"loss": 0.1241,
"step": 23250
},
{
"epoch": 0.59,
"grad_norm": 0.844608724117279,
"learning_rate": 8.628481651367876e-07,
"loss": 0.1271,
"step": 23300
},
{
"epoch": 0.59,
"grad_norm": 0.8871841430664062,
"learning_rate": 8.535645346132246e-07,
"loss": 0.1313,
"step": 23350
},
{
"epoch": 0.59,
"grad_norm": 0.791871964931488,
"learning_rate": 8.443208251642418e-07,
"loss": 0.1242,
"step": 23400
},
{
"epoch": 0.59,
"grad_norm": 0.9160509705543518,
"learning_rate": 8.351172609218033e-07,
"loss": 0.1217,
"step": 23450
},
{
"epoch": 0.6,
"grad_norm": 0.7962935566902161,
"learning_rate": 8.259540650444736e-07,
"loss": 0.128,
"step": 23500
},
{
"epoch": 0.6,
"grad_norm": 0.7934704422950745,
"learning_rate": 8.168314597120059e-07,
"loss": 0.1217,
"step": 23550
},
{
"epoch": 0.6,
"grad_norm": 0.7720403671264648,
"learning_rate": 8.077496661199557e-07,
"loss": 0.1252,
"step": 23600
},
{
"epoch": 0.6,
"grad_norm": 0.806203305721283,
"learning_rate": 7.987089044743182e-07,
"loss": 0.1276,
"step": 23650
},
{
"epoch": 0.6,
"grad_norm": 0.9752463698387146,
"learning_rate": 7.897093939861878e-07,
"loss": 0.1288,
"step": 23700
},
{
"epoch": 0.6,
"grad_norm": 0.8070717453956604,
"learning_rate": 7.807513528664415e-07,
"loss": 0.1272,
"step": 23750
},
{
"epoch": 0.6,
"grad_norm": 0.773583710193634,
"learning_rate": 7.71834998320454e-07,
"loss": 0.1192,
"step": 23800
},
{
"epoch": 0.6,
"grad_norm": 0.7892506122589111,
"learning_rate": 7.629605465428211e-07,
"loss": 0.1217,
"step": 23850
},
{
"epoch": 0.61,
"grad_norm": 0.6606497764587402,
"learning_rate": 7.541282127121291e-07,
"loss": 0.1222,
"step": 23900
},
{
"epoch": 0.61,
"grad_norm": 0.8119127154350281,
"learning_rate": 7.453382109857269e-07,
"loss": 0.1241,
"step": 23950
},
{
"epoch": 0.61,
"grad_norm": 0.7261886596679688,
"learning_rate": 7.365907544945398e-07,
"loss": 0.1294,
"step": 24000
},
{
"epoch": 0.61,
"grad_norm": 0.8544095754623413,
"learning_rate": 7.27886055337902e-07,
"loss": 0.126,
"step": 24050
},
{
"epoch": 0.61,
"grad_norm": 0.9304544925689697,
"learning_rate": 7.192243245784075e-07,
"loss": 0.1296,
"step": 24100
},
{
"epoch": 0.61,
"grad_norm": 0.8993602395057678,
"learning_rate": 7.106057722368012e-07,
"loss": 0.1225,
"step": 24150
},
{
"epoch": 0.61,
"grad_norm": 0.7909960150718689,
"learning_rate": 7.020306072868804e-07,
"loss": 0.1261,
"step": 24200
},
{
"epoch": 0.61,
"grad_norm": 0.8330161571502686,
"learning_rate": 6.934990376504269e-07,
"loss": 0.1242,
"step": 24250
},
{
"epoch": 0.62,
"grad_norm": 0.9058279395103455,
"learning_rate": 6.850112701921735e-07,
"loss": 0.1224,
"step": 24300
},
{
"epoch": 0.62,
"grad_norm": 0.9598106741905212,
"learning_rate": 6.76567510714777e-07,
"loss": 0.1261,
"step": 24350
},
{
"epoch": 0.62,
"grad_norm": 0.8675400018692017,
"learning_rate": 6.681679639538388e-07,
"loss": 0.1275,
"step": 24400
},
{
"epoch": 0.62,
"grad_norm": 0.7243128418922424,
"learning_rate": 6.598128335729332e-07,
"loss": 0.1301,
"step": 24450
},
{
"epoch": 0.62,
"grad_norm": 0.866690456867218,
"learning_rate": 6.515023221586722e-07,
"loss": 0.1276,
"step": 24500
},
{
"epoch": 0.62,
"grad_norm": 0.8871277570724487,
"learning_rate": 6.432366312157933e-07,
"loss": 0.1285,
"step": 24550
},
{
"epoch": 0.62,
"grad_norm": 0.6913353800773621,
"learning_rate": 6.35015961162273e-07,
"loss": 0.1207,
"step": 24600
},
{
"epoch": 0.62,
"grad_norm": 0.8807539939880371,
"learning_rate": 6.268405113244677e-07,
"loss": 0.1227,
"step": 24650
},
{
"epoch": 0.63,
"grad_norm": 0.9127259850502014,
"learning_rate": 6.187104799322805e-07,
"loss": 0.1234,
"step": 24700
},
{
"epoch": 0.63,
"grad_norm": 1.0802431106567383,
"learning_rate": 6.106260641143547e-07,
"loss": 0.1257,
"step": 24750
},
{
"epoch": 0.63,
"grad_norm": 0.8640583157539368,
"learning_rate": 6.025874598932937e-07,
"loss": 0.1195,
"step": 24800
},
{
"epoch": 0.63,
"grad_norm": 0.8064579963684082,
"learning_rate": 5.945948621809092e-07,
"loss": 0.1266,
"step": 24850
},
{
"epoch": 0.63,
"grad_norm": 0.8165306448936462,
"learning_rate": 5.866484647734935e-07,
"loss": 0.1229,
"step": 24900
},
{
"epoch": 0.63,
"grad_norm": 0.885006308555603,
"learning_rate": 5.787484603471221e-07,
"loss": 0.1215,
"step": 24950
},
{
"epoch": 0.63,
"grad_norm": 1.0080711841583252,
"learning_rate": 5.708950404529812e-07,
"loss": 0.1233,
"step": 25000
},
{
"epoch": 0.63,
"grad_norm": 0.8697314858436584,
"learning_rate": 5.630883955127211e-07,
"loss": 0.1215,
"step": 25050
},
{
"epoch": 0.64,
"grad_norm": 0.8583015203475952,
"learning_rate": 5.553287148138462e-07,
"loss": 0.1267,
"step": 25100
},
{
"epoch": 0.64,
"grad_norm": 0.771619439125061,
"learning_rate": 5.47616186505113e-07,
"loss": 0.1185,
"step": 25150
},
{
"epoch": 0.64,
"grad_norm": 0.8931272625923157,
"learning_rate": 5.399509975919828e-07,
"loss": 0.1232,
"step": 25200
},
{
"epoch": 0.64,
"grad_norm": 0.9257284998893738,
"learning_rate": 5.323333339320739e-07,
"loss": 0.126,
"step": 25250
},
{
"epoch": 0.64,
"grad_norm": 0.7422381639480591,
"learning_rate": 5.247633802306637e-07,
"loss": 0.1258,
"step": 25300
},
{
"epoch": 0.64,
"grad_norm": 0.8282724022865295,
"learning_rate": 5.172413200362092e-07,
"loss": 0.1244,
"step": 25350
},
{
"epoch": 0.64,
"grad_norm": 0.9066148996353149,
"learning_rate": 5.097673357358906e-07,
"loss": 0.1227,
"step": 25400
},
{
"epoch": 0.64,
"grad_norm": 1.0274531841278076,
"learning_rate": 5.023416085511976e-07,
"loss": 0.1206,
"step": 25450
},
{
"epoch": 0.65,
"grad_norm": 0.8062939047813416,
"learning_rate": 4.949643185335288e-07,
"loss": 0.121,
"step": 25500
},
{
"epoch": 0.65,
"grad_norm": 0.9104852080345154,
"learning_rate": 4.876356445598279e-07,
"loss": 0.1318,
"step": 25550
},
{
"epoch": 0.65,
"grad_norm": 0.9484885334968567,
"learning_rate": 4.803557643282486e-07,
"loss": 0.1215,
"step": 25600
},
{
"epoch": 0.65,
"grad_norm": 1.0123718976974487,
"learning_rate": 4.731248543538405e-07,
"loss": 0.1232,
"step": 25650
},
{
"epoch": 0.65,
"grad_norm": 0.7490533590316772,
"learning_rate": 4.6594308996427696e-07,
"loss": 0.1226,
"step": 25700
},
{
"epoch": 0.65,
"grad_norm": 0.867254912853241,
"learning_rate": 4.588106452955973e-07,
"loss": 0.1183,
"step": 25750
},
{
"epoch": 0.65,
"grad_norm": 0.889826238155365,
"learning_rate": 4.517276932879877e-07,
"loss": 0.1229,
"step": 25800
},
{
"epoch": 0.66,
"grad_norm": 0.8290361166000366,
"learning_rate": 4.446944056815866e-07,
"loss": 0.123,
"step": 25850
},
{
"epoch": 0.66,
"grad_norm": 0.8148057460784912,
"learning_rate": 4.377109530123216e-07,
"loss": 0.1275,
"step": 25900
},
{
"epoch": 0.66,
"grad_norm": 0.8523454666137695,
"learning_rate": 4.307775046077739e-07,
"loss": 0.1227,
"step": 25950
},
{
"epoch": 0.66,
"grad_norm": 0.8022044897079468,
"learning_rate": 4.2389422858307244e-07,
"loss": 0.1273,
"step": 26000
},
{
"epoch": 0.66,
"grad_norm": 0.7871876955032349,
"learning_rate": 4.1706129183681834e-07,
"loss": 0.1281,
"step": 26050
},
{
"epoch": 0.66,
"grad_norm": 0.8694405555725098,
"learning_rate": 4.10278860047037e-07,
"loss": 0.1202,
"step": 26100
},
{
"epoch": 0.66,
"grad_norm": 0.9443354606628418,
"learning_rate": 4.035470976671621e-07,
"loss": 0.1194,
"step": 26150
},
{
"epoch": 0.66,
"grad_norm": 0.8417434096336365,
"learning_rate": 3.9686616792204677e-07,
"loss": 0.1213,
"step": 26200
},
{
"epoch": 0.67,
"grad_norm": 0.855648398399353,
"learning_rate": 3.902362328040091e-07,
"loss": 0.1272,
"step": 26250
},
{
"epoch": 0.67,
"grad_norm": 0.8079158067703247,
"learning_rate": 3.836574530688983e-07,
"loss": 0.1237,
"step": 26300
},
{
"epoch": 0.67,
"grad_norm": 0.9010571837425232,
"learning_rate": 3.7712998823220243e-07,
"loss": 0.1268,
"step": 26350
},
{
"epoch": 0.67,
"grad_norm": 0.9768967032432556,
"learning_rate": 3.7065399656517955e-07,
"loss": 0.1226,
"step": 26400
},
{
"epoch": 0.67,
"grad_norm": 0.720100462436676,
"learning_rate": 3.6422963509101626e-07,
"loss": 0.1212,
"step": 26450
},
{
"epoch": 0.67,
"grad_norm": 0.8054412603378296,
"learning_rate": 3.578570595810274e-07,
"loss": 0.12,
"step": 26500
},
{
"epoch": 0.67,
"grad_norm": 0.7987850904464722,
"learning_rate": 3.515364245508704e-07,
"loss": 0.1205,
"step": 26550
},
{
"epoch": 0.67,
"grad_norm": 0.7818375825881958,
"learning_rate": 3.452678832568071e-07,
"loss": 0.1301,
"step": 26600
},
{
"epoch": 0.68,
"grad_norm": 0.8323056101799011,
"learning_rate": 3.390515876919831e-07,
"loss": 0.1208,
"step": 26650
},
{
"epoch": 0.68,
"grad_norm": 0.8122744560241699,
"learning_rate": 3.328876885827406e-07,
"loss": 0.121,
"step": 26700
},
{
"epoch": 0.68,
"grad_norm": 0.9000005125999451,
"learning_rate": 3.267763353849704e-07,
"loss": 0.1187,
"step": 26750
},
{
"epoch": 0.68,
"grad_norm": 0.7364488840103149,
"learning_rate": 3.207176762804814e-07,
"loss": 0.1228,
"step": 26800
},
{
"epoch": 0.68,
"grad_norm": 0.9550331234931946,
"learning_rate": 3.1471185817341153e-07,
"loss": 0.1197,
"step": 26850
},
{
"epoch": 0.68,
"grad_norm": 0.8683730959892273,
"learning_rate": 3.0875902668666386e-07,
"loss": 0.1279,
"step": 26900
},
{
"epoch": 0.68,
"grad_norm": 0.8578702807426453,
"learning_rate": 3.0285932615837646e-07,
"loss": 0.1212,
"step": 26950
},
{
"epoch": 0.68,
"grad_norm": 0.9135186076164246,
"learning_rate": 2.970128996384228e-07,
"loss": 0.1219,
"step": 27000
},
{
"epoch": 0.69,
"grad_norm": 0.8612381815910339,
"learning_rate": 2.9121988888494297e-07,
"loss": 0.1274,
"step": 27050
},
{
"epoch": 0.69,
"grad_norm": 0.8324002027511597,
"learning_rate": 2.854804343609058e-07,
"loss": 0.1282,
"step": 27100
},
{
"epoch": 0.69,
"grad_norm": 0.8618267774581909,
"learning_rate": 2.7979467523070484e-07,
"loss": 0.1219,
"step": 27150
},
{
"epoch": 0.69,
"grad_norm": 0.8603401780128479,
"learning_rate": 2.741627493567822e-07,
"loss": 0.1225,
"step": 27200
},
{
"epoch": 0.69,
"grad_norm": 1.0212452411651611,
"learning_rate": 2.685847932962868e-07,
"loss": 0.1225,
"step": 27250
},
{
"epoch": 0.69,
"grad_norm": 0.7207959294319153,
"learning_rate": 2.630609422977623e-07,
"loss": 0.1266,
"step": 27300
},
{
"epoch": 0.69,
"grad_norm": 0.9578468203544617,
"learning_rate": 2.575913302978697e-07,
"loss": 0.1274,
"step": 27350
},
{
"epoch": 0.69,
"grad_norm": 1.033157229423523,
"learning_rate": 2.5217608991813774e-07,
"loss": 0.116,
"step": 27400
},
{
"epoch": 0.7,
"grad_norm": 0.8012121915817261,
"learning_rate": 2.468153524617478e-07,
"loss": 0.121,
"step": 27450
},
{
"epoch": 0.7,
"grad_norm": 0.7251663208007812,
"learning_rate": 2.4150924791035037e-07,
"loss": 0.1213,
"step": 27500
},
{
"epoch": 0.7,
"grad_norm": 0.8772590160369873,
"learning_rate": 2.3625790492091544e-07,
"loss": 0.1205,
"step": 27550
},
{
"epoch": 0.7,
"grad_norm": 0.9037516713142395,
"learning_rate": 2.3106145082260777e-07,
"loss": 0.1234,
"step": 27600
},
{
"epoch": 0.7,
"grad_norm": 0.8488568067550659,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.1203,
"step": 27650
},
{
"epoch": 0.7,
"grad_norm": 0.8486872911453247,
"learning_rate": 2.2083371195853797e-07,
"loss": 0.1309,
"step": 27700
},
{
"epoch": 0.7,
"grad_norm": 0.964940071105957,
"learning_rate": 2.158026751844733e-07,
"loss": 0.1228,
"step": 27750
},
{
"epoch": 0.7,
"grad_norm": 0.8965867161750793,
"learning_rate": 2.1082702327891918e-07,
"loss": 0.13,
"step": 27800
},
{
"epoch": 0.71,
"grad_norm": 0.8563171029090881,
"learning_rate": 2.0590687688636619e-07,
"loss": 0.1187,
"step": 27850
},
{
"epoch": 0.71,
"grad_norm": 1.000481367111206,
"learning_rate": 2.0104235530546745e-07,
"loss": 0.1267,
"step": 27900
},
{
"epoch": 0.71,
"grad_norm": 0.793648898601532,
"learning_rate": 1.9623357648614088e-07,
"loss": 0.1225,
"step": 27950
},
{
"epoch": 0.71,
"grad_norm": 0.8979523181915283,
"learning_rate": 1.914806570267111e-07,
"loss": 0.1234,
"step": 28000
},
{
"epoch": 0.71,
"grad_norm": 0.9331807494163513,
"learning_rate": 1.8678371217108387e-07,
"loss": 0.1197,
"step": 28050
},
{
"epoch": 0.71,
"grad_norm": 0.8871297240257263,
"learning_rate": 1.821428558059493e-07,
"loss": 0.1159,
"step": 28100
},
{
"epoch": 0.71,
"grad_norm": 1.0340665578842163,
"learning_rate": 1.7755820045802146e-07,
"loss": 0.1266,
"step": 28150
},
{
"epoch": 0.71,
"grad_norm": 1.055516004562378,
"learning_rate": 1.7302985729131e-07,
"loss": 0.1255,
"step": 28200
},
{
"epoch": 0.72,
"grad_norm": 0.8390622138977051,
"learning_rate": 1.6855793610442484e-07,
"loss": 0.1256,
"step": 28250
},
{
"epoch": 0.72,
"grad_norm": 0.8009083867073059,
"learning_rate": 1.6414254532791357e-07,
"loss": 0.1215,
"step": 28300
},
{
"epoch": 0.72,
"grad_norm": 1.0313583612442017,
"learning_rate": 1.5978379202163275e-07,
"loss": 0.1274,
"step": 28350
},
{
"epoch": 0.72,
"grad_norm": 0.9737227559089661,
"learning_rate": 1.554817818721513e-07,
"loss": 0.1248,
"step": 28400
},
{
"epoch": 0.72,
"grad_norm": 1.0153248310089111,
"learning_rate": 1.51236619190189e-07,
"loss": 0.1247,
"step": 28450
},
{
"epoch": 0.72,
"grad_norm": 0.8174459338188171,
"learning_rate": 1.4704840690808658e-07,
"loss": 0.122,
"step": 28500
},
{
"epoch": 0.72,
"grad_norm": 0.9065647125244141,
"learning_rate": 1.4291724657730904e-07,
"loss": 0.1245,
"step": 28550
},
{
"epoch": 0.72,
"grad_norm": 0.9131068587303162,
"learning_rate": 1.3884323836598656e-07,
"loss": 0.1199,
"step": 28600
},
{
"epoch": 0.73,
"grad_norm": 0.8409487009048462,
"learning_rate": 1.348264810564809e-07,
"loss": 0.1282,
"step": 28650
},
{
"epoch": 0.73,
"grad_norm": 0.8395945429801941,
"learning_rate": 1.3086707204299415e-07,
"loss": 0.1264,
"step": 28700
},
{
"epoch": 0.73,
"grad_norm": 0.8718541860580444,
"learning_rate": 1.269651073292058e-07,
"loss": 0.1204,
"step": 28750
},
{
"epoch": 0.73,
"grad_norm": 0.8746705055236816,
"learning_rate": 1.2312068152594448e-07,
"loss": 0.1244,
"step": 28800
},
{
"epoch": 0.73,
"grad_norm": 0.8849207758903503,
"learning_rate": 1.1933388784889617e-07,
"loss": 0.1215,
"step": 28850
},
{
"epoch": 0.73,
"grad_norm": 0.9171212315559387,
"learning_rate": 1.1560481811633911e-07,
"loss": 0.1274,
"step": 28900
},
{
"epoch": 0.73,
"grad_norm": 0.9320688247680664,
"learning_rate": 1.1193356274692424e-07,
"loss": 0.1211,
"step": 28950
},
{
"epoch": 0.73,
"grad_norm": 0.7832902073860168,
"learning_rate": 1.0832021075747712e-07,
"loss": 0.1257,
"step": 29000
},
{
"epoch": 0.74,
"grad_norm": 0.8894676566123962,
"learning_rate": 1.047648497608414e-07,
"loss": 0.1268,
"step": 29050
},
{
"epoch": 0.74,
"grad_norm": 0.7472261786460876,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.1222,
"step": 29100
},
{
"epoch": 0.74,
"grad_norm": 0.9430793523788452,
"learning_rate": 9.782844416476423e-08,
"loss": 0.1219,
"step": 29150
},
{
"epoch": 0.74,
"grad_norm": 0.8395842909812927,
"learning_rate": 9.444756775215446e-08,
"loss": 0.1212,
"step": 29200
},
{
"epoch": 0.74,
"grad_norm": 0.8106001019477844,
"learning_rate": 9.112501870194273e-08,
"loss": 0.124,
"step": 29250
},
{
"epoch": 0.74,
"grad_norm": 0.9083417654037476,
"learning_rate": 8.786087757588269e-08,
"loss": 0.1269,
"step": 29300
},
{
"epoch": 0.74,
"grad_norm": 0.9193884134292603,
"learning_rate": 8.465522351951305e-08,
"loss": 0.1207,
"step": 29350
},
{
"epoch": 0.75,
"grad_norm": 0.9699887633323669,
"learning_rate": 8.150813426023752e-08,
"loss": 0.1245,
"step": 29400
},
{
"epoch": 0.75,
"grad_norm": 0.984941303730011,
"learning_rate": 7.841968610544121e-08,
"loss": 0.1228,
"step": 29450
},
{
"epoch": 0.75,
"grad_norm": 1.0322271585464478,
"learning_rate": 7.538995394063996e-08,
"loss": 0.1247,
"step": 29500
},
{
"epoch": 0.75,
"grad_norm": 0.9415401220321655,
"learning_rate": 7.241901122766515e-08,
"loss": 0.1209,
"step": 29550
},
{
"epoch": 0.75,
"grad_norm": 0.8878545165061951,
"learning_rate": 6.950693000288056e-08,
"loss": 0.1268,
"step": 29600
},
{
"epoch": 0.75,
"grad_norm": 0.8881374001502991,
"learning_rate": 6.665378087543889e-08,
"loss": 0.1239,
"step": 29650
},
{
"epoch": 0.75,
"grad_norm": 0.8045355081558228,
"learning_rate": 6.385963302556642e-08,
"loss": 0.1197,
"step": 29700
},
{
"epoch": 0.75,
"grad_norm": 0.8438431620597839,
"learning_rate": 6.112455420288821e-08,
"loss": 0.1242,
"step": 29750
},
{
"epoch": 0.76,
"grad_norm": 0.8916789293289185,
"learning_rate": 5.844861072478336e-08,
"loss": 0.1244,
"step": 29800
},
{
"epoch": 0.76,
"grad_norm": 0.8392596244812012,
"learning_rate": 5.583186747477848e-08,
"loss": 0.1234,
"step": 29850
},
{
"epoch": 0.76,
"grad_norm": 0.8782594203948975,
"learning_rate": 5.32743879009745e-08,
"loss": 0.1225,
"step": 29900
},
{
"epoch": 0.76,
"grad_norm": 0.8895407319068909,
"learning_rate": 5.077623401450599e-08,
"loss": 0.1212,
"step": 29950
},
{
"epoch": 0.76,
"grad_norm": 0.9181334376335144,
"learning_rate": 4.8337466388040935e-08,
"loss": 0.1203,
"step": 30000
},
{
"epoch": 0.76,
"grad_norm": 0.7273566126823425,
"learning_rate": 4.595814415430916e-08,
"loss": 0.1235,
"step": 30050
},
{
"epoch": 0.76,
"grad_norm": 0.8395570516586304,
"learning_rate": 4.3638325004670134e-08,
"loss": 0.1275,
"step": 30100
},
{
"epoch": 0.76,
"grad_norm": 0.9657353758811951,
"learning_rate": 4.1378065187714365e-08,
"loss": 0.1194,
"step": 30150
},
{
"epoch": 0.77,
"grad_norm": 0.8621689081192017,
"learning_rate": 3.917741950789727e-08,
"loss": 0.1219,
"step": 30200
},
{
"epoch": 0.77,
"grad_norm": 0.8802511692047119,
"learning_rate": 3.703644132421386e-08,
"loss": 0.1246,
"step": 30250
},
{
"epoch": 0.77,
"grad_norm": 0.8545549511909485,
"learning_rate": 3.4955182548901956e-08,
"loss": 0.1236,
"step": 30300
},
{
"epoch": 0.77,
"grad_norm": 0.9442453384399414,
"learning_rate": 3.293369364618465e-08,
"loss": 0.1255,
"step": 30350
},
{
"epoch": 0.77,
"grad_norm": 0.8238327503204346,
"learning_rate": 3.097202363104679e-08,
"loss": 0.1223,
"step": 30400
},
{
"epoch": 0.77,
"grad_norm": 0.9696635007858276,
"learning_rate": 2.9070220068045663e-08,
"loss": 0.1203,
"step": 30450
},
{
"epoch": 0.77,
"grad_norm": 0.760059118270874,
"learning_rate": 2.722832907015971e-08,
"loss": 0.1241,
"step": 30500
},
{
"epoch": 0.77,
"grad_norm": 0.9633733034133911,
"learning_rate": 2.544639529766829e-08,
"loss": 0.1282,
"step": 30550
},
{
"epoch": 0.78,
"grad_norm": 0.949769914150238,
"learning_rate": 2.3724461957068955e-08,
"loss": 0.1244,
"step": 30600
},
{
"epoch": 0.78,
"grad_norm": 0.8858135342597961,
"learning_rate": 2.206257080003188e-08,
"loss": 0.1241,
"step": 30650
},
{
"epoch": 0.78,
"grad_norm": 0.8067658543586731,
"learning_rate": 2.0460762122385124e-08,
"loss": 0.1162,
"step": 30700
},
{
"epoch": 0.78,
"grad_norm": 0.874151885509491,
"learning_rate": 1.8919074763138757e-08,
"loss": 0.1246,
"step": 30750
},
{
"epoch": 0.78,
"grad_norm": 0.8034783601760864,
"learning_rate": 1.7437546103542814e-08,
"loss": 0.1246,
"step": 30800
},
{
"epoch": 0.78,
"grad_norm": 0.9454385042190552,
"learning_rate": 1.6016212066181368e-08,
"loss": 0.122,
"step": 30850
},
{
"epoch": 0.78,
"grad_norm": 0.8052737712860107,
"learning_rate": 1.4655107114101008e-08,
"loss": 0.1256,
"step": 30900
},
{
"epoch": 0.78,
"grad_norm": 0.9731265902519226,
"learning_rate": 1.3354264249975379e-08,
"loss": 0.1246,
"step": 30950
},
{
"epoch": 0.79,
"grad_norm": 0.8946937918663025,
"learning_rate": 1.2113715015304728e-08,
"loss": 0.1245,
"step": 31000
},
{
"epoch": 0.79,
"grad_norm": 0.8165032863616943,
"learning_rate": 1.0933489489651783e-08,
"loss": 0.12,
"step": 31050
},
{
"epoch": 0.79,
"grad_norm": 0.8956162333488464,
"learning_rate": 9.81361628991151e-09,
"loss": 0.1226,
"step": 31100
},
{
"epoch": 0.79,
"grad_norm": 0.9210562705993652,
"learning_rate": 8.754122569618329e-09,
"loss": 0.123,
"step": 31150
},
{
"epoch": 0.79,
"grad_norm": 0.9154261946678162,
"learning_rate": 7.755034018286644e-09,
"loss": 0.1242,
"step": 31200
},
{
"epoch": 0.79,
"grad_norm": 0.7360591888427734,
"learning_rate": 6.816374860788566e-09,
"loss": 0.1215,
"step": 31250
},
{
"epoch": 0.79,
"grad_norm": 0.83289635181427,
"learning_rate": 5.938167856766319e-09,
"loss": 0.1224,
"step": 31300
},
{
"epoch": 0.79,
"grad_norm": 0.7892395257949829,
"learning_rate": 5.120434300080745e-09,
"loss": 0.1207,
"step": 31350
},
{
"epoch": 0.8,
"grad_norm": 0.9500446915626526,
"learning_rate": 4.363194018293937e-09,
"loss": 0.13,
"step": 31400
},
{
"epoch": 0.8,
"grad_norm": 0.8159528970718384,
"learning_rate": 3.666465372190453e-09,
"loss": 0.1224,
"step": 31450
},
{
"epoch": 0.8,
"grad_norm": 0.7910547852516174,
"learning_rate": 3.030265255329623e-09,
"loss": 0.1228,
"step": 31500
},
{
"epoch": 0.8,
"grad_norm": 0.8346930742263794,
"learning_rate": 2.4546090936383717e-09,
"loss": 0.1239,
"step": 31550
},
{
"epoch": 0.8,
"grad_norm": 0.8468943238258362,
"learning_rate": 1.9395108450351308e-09,
"loss": 0.119,
"step": 31600
},
{
"epoch": 0.8,
"grad_norm": 0.8092018961906433,
"learning_rate": 1.4849829990931653e-09,
"loss": 0.1221,
"step": 31650
},
{
"epoch": 0.8,
"grad_norm": 0.8936166763305664,
"learning_rate": 1.0910365767358155e-09,
"loss": 0.1218,
"step": 31700
},
{
"epoch": 0.8,
"grad_norm": 1.029935598373413,
"learning_rate": 7.576811299714326e-10,
"loss": 0.1292,
"step": 31750
},
{
"epoch": 0.81,
"grad_norm": 0.8104945421218872,
"learning_rate": 4.849247416599534e-10,
"loss": 0.1189,
"step": 31800
},
{
"epoch": 0.81,
"grad_norm": 0.7662272453308105,
"learning_rate": 2.727740253177791e-10,
"loss": 0.1243,
"step": 31850
},
{
"epoch": 0.81,
"grad_norm": 0.8399808406829834,
"learning_rate": 1.2123412495762543e-10,
"loss": 0.1229,
"step": 31900
},
{
"epoch": 0.81,
"grad_norm": 0.7275273203849792,
"learning_rate": 3.0308714963067644e-11,
"loss": 0.1182,
"step": 31950
},
{
"epoch": 0.81,
"grad_norm": 0.9211012721061707,
"learning_rate": 0.0,
"loss": 0.1208,
"step": 32000
}
],
"logging_steps": 50,
"max_steps": 32000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3200,
"total_flos": 2.2216620291373056e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}